{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 5562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 5.988023952095808e-10, "logits/chosen": -2.0979156494140625, "logits/rejected": -2.3109986782073975, "logps/chosen": -10.958471298217773, "logps/rejected": -10.488727569580078, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.0, "learning_rate": 1.1976047904191617e-09, "logits/chosen": -2.171200752258301, "logits/rejected": -2.172977924346924, "logps/chosen": -11.140979766845703, "logps/rejected": -8.594125747680664, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.0, "learning_rate": 1.7964071856287424e-09, "logits/chosen": -2.2806477546691895, "logits/rejected": -2.1801018714904785, "logps/chosen": -42.361122131347656, "logps/rejected": -9.799006462097168, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": -0.018108749762177467, "rewards/margins": -0.017947768792510033, "rewards/rejected": -0.00016098022751975805, "step": 3 }, { "epoch": 0.0, "learning_rate": 2.3952095808383234e-09, "logits/chosen": -2.19307017326355, "logits/rejected": -2.195385217666626, "logps/chosen": -19.044578552246094, "logps/rejected": -9.655104637145996, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": -0.0034381865989416838, "rewards/margins": -0.0012120245955884457, "rewards/rejected": -0.002226162003353238, "step": 4 }, { "epoch": 0.0, "learning_rate": 2.994011976047904e-09, "logits/chosen": -2.1520895957946777, "logits/rejected": -2.1502363681793213, "logps/chosen": -13.768416404724121, "logps/rejected": -11.32718276977539, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": -0.013616085052490234, "rewards/margins": -0.009621906094253063, "rewards/rejected": -0.003994178958237171, "step": 5 }, { "epoch": 0.0, "learning_rate": 3.592814371257485e-09, "logits/chosen": -2.2668168544769287, "logits/rejected": -2.4108195304870605, "logps/chosen": -8.252432823181152, "logps/rejected": -7.846161365509033, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.003317260881885886, "rewards/margins": 0.0012307644356042147, "rewards/rejected": 0.0020864964462816715, "step": 6 }, { "epoch": 0.0, "learning_rate": 4.191616766467066e-09, "logits/chosen": -2.1220784187316895, "logits/rejected": -2.1156280040740967, "logps/chosen": -20.252761840820312, "logps/rejected": -10.921347618103027, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": -0.00685882568359375, "rewards/margins": -0.012170982547104359, "rewards/rejected": 0.005312156863510609, "step": 7 }, { "epoch": 0.0, "learning_rate": 4.790419161676647e-09, "logits/chosen": -2.1048295497894287, "logits/rejected": -2.348219633102417, "logps/chosen": -9.45234203338623, "logps/rejected": -9.406747817993164, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.01814746856689453, "rewards/margins": 0.018404101952910423, "rewards/rejected": -0.00025663376436568797, "step": 8 }, { "epoch": 0.0, "learning_rate": 5.389221556886227e-09, "logits/chosen": -2.0476434230804443, "logits/rejected": -2.284775733947754, "logps/chosen": -9.717644691467285, "logps/rejected": -9.796012878417969, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": -0.002002239227294922, "rewards/margins": -0.0013316154945641756, "rewards/rejected": -0.0006706237909384072, "step": 9 }, { "epoch": 0.01, "learning_rate": 5.988023952095808e-09, "logits/chosen": -2.136190414428711, "logits/rejected": -2.280541181564331, "logps/chosen": -7.229641914367676, "logps/rejected": -7.1628618240356445, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.003893947694450617, "rewards/margins": -0.003452110104262829, "rewards/rejected": 0.007346057798713446, "step": 10 }, { "epoch": 0.01, "learning_rate": 6.5868263473053894e-09, "logits/chosen": -2.2306816577911377, "logits/rejected": -2.237344980239868, "logps/chosen": -10.665277481079102, "logps/rejected": -7.616218090057373, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": -0.0014238357543945312, "rewards/margins": -0.010281515307724476, "rewards/rejected": 0.008857679553329945, "step": 11 }, { "epoch": 0.01, "learning_rate": 7.18562874251497e-09, "logits/chosen": -2.134218454360962, "logits/rejected": -2.207756519317627, "logps/chosen": -13.980777740478516, "logps/rejected": -26.74662971496582, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 9.31739850784652e-05, "rewards/margins": -0.015865422785282135, "rewards/rejected": 0.015958596020936966, "step": 12 }, { "epoch": 0.01, "learning_rate": 7.78443113772455e-09, "logits/chosen": -2.0931925773620605, "logits/rejected": -2.0991830825805664, "logps/chosen": -11.424118041992188, "logps/rejected": -8.425310134887695, "loss": 0.7007, "rewards/accuracies": 0.0, "rewards/chosen": -0.006567573640495539, "rewards/margins": -0.014998627826571465, "rewards/rejected": 0.008431053720414639, "step": 13 }, { "epoch": 0.01, "learning_rate": 8.383233532934131e-09, "logits/chosen": -2.1964221000671387, "logits/rejected": -2.1975514888763428, "logps/chosen": -13.52840805053711, "logps/rejected": -8.546295166015625, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": -0.010986328125, "rewards/margins": -0.01394796371459961, "rewards/rejected": 0.0029616355895996094, "step": 14 }, { "epoch": 0.01, "learning_rate": 8.982035928143712e-09, "logits/chosen": -2.1469714641571045, "logits/rejected": -2.147407054901123, "logps/chosen": -18.814748764038086, "logps/rejected": -8.163641929626465, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.0036422729026526213, "rewards/margins": 0.004960346035659313, "rewards/rejected": -0.0013180732494220138, "step": 15 }, { "epoch": 0.01, "learning_rate": 9.580838323353294e-09, "logits/chosen": -2.122285842895508, "logits/rejected": -2.299502372741699, "logps/chosen": -9.353716850280762, "logps/rejected": -9.142786026000977, "loss": 0.7, "rewards/accuracies": 0.0, "rewards/chosen": -0.0028641701210290194, "rewards/margins": -0.013737964443862438, "rewards/rejected": 0.010873794555664062, "step": 16 }, { "epoch": 0.01, "learning_rate": 1.0179640718562873e-08, "logits/chosen": -2.283074140548706, "logits/rejected": -2.337242603302002, "logps/chosen": -19.90506362915039, "logps/rejected": -32.37358093261719, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.0046524046920239925, "rewards/margins": 0.018262099474668503, "rewards/rejected": -0.013609695248305798, "step": 17 }, { "epoch": 0.01, "learning_rate": 1.0778443113772454e-08, "logits/chosen": -2.057246208190918, "logits/rejected": -2.0610029697418213, "logps/chosen": -11.633684158325195, "logps/rejected": -12.299459457397461, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.006306266877800226, "rewards/margins": 0.006646347232162952, "rewards/rejected": -0.00034008026705123484, "step": 18 }, { "epoch": 0.01, "learning_rate": 1.1377245508982035e-08, "logits/chosen": -2.1114273071289062, "logits/rejected": -2.1050655841827393, "logps/chosen": -20.327821731567383, "logps/rejected": -10.927478790283203, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": -0.009495354257524014, "rewards/margins": -0.009960556402802467, "rewards/rejected": 0.000465202349005267, "step": 19 }, { "epoch": 0.01, "learning_rate": 1.1976047904191617e-08, "logits/chosen": -2.1790881156921387, "logits/rejected": -2.181143045425415, "logps/chosen": -11.53612995147705, "logps/rejected": -10.330820083618164, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.004594040103256702, "rewards/margins": 0.005547333043068647, "rewards/rejected": -0.000953292881604284, "step": 20 }, { "epoch": 0.01, "learning_rate": 1.2574850299401196e-08, "logits/chosen": -2.063520669937134, "logits/rejected": -2.0564792156219482, "logps/chosen": -48.104251861572266, "logps/rejected": -13.894438743591309, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.0054115294478833675, "rewards/margins": -0.0031163222156465054, "rewards/rejected": 0.008527851663529873, "step": 21 }, { "epoch": 0.01, "learning_rate": 1.3173652694610779e-08, "logits/chosen": -2.1644489765167236, "logits/rejected": -2.175598621368408, "logps/chosen": -16.553998947143555, "logps/rejected": -9.899483680725098, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.013926696963608265, "rewards/margins": 0.008031463250517845, "rewards/rejected": 0.0058952332474291325, "step": 22 }, { "epoch": 0.01, "learning_rate": 1.3772455089820358e-08, "logits/chosen": -2.1691336631774902, "logits/rejected": -2.091620922088623, "logps/chosen": -31.429931640625, "logps/rejected": -9.718562126159668, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": -0.007484817411750555, "rewards/margins": -0.010674667544662952, "rewards/rejected": 0.0031898499000817537, "step": 23 }, { "epoch": 0.01, "learning_rate": 1.437125748502994e-08, "logits/chosen": -2.0553810596466064, "logits/rejected": -2.330698013305664, "logps/chosen": -8.776742935180664, "logps/rejected": -8.597453117370605, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.004621315281838179, "rewards/margins": 0.005370807833969593, "rewards/rejected": -0.0007494926685467362, "step": 24 }, { "epoch": 0.01, "learning_rate": 1.4970059880239517e-08, "logits/chosen": -2.1600959300994873, "logits/rejected": -2.1583011150360107, "logps/chosen": -15.645503997802734, "logps/rejected": -9.447269439697266, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": -0.00328407296910882, "rewards/margins": 0.0020203590393066406, "rewards/rejected": -0.005304432008415461, "step": 25 }, { "epoch": 0.01, "learning_rate": 1.55688622754491e-08, "logits/chosen": -2.1900954246520996, "logits/rejected": -2.1919119358062744, "logps/chosen": -20.618249893188477, "logps/rejected": -18.102550506591797, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.008811759762465954, "rewards/margins": 0.018060874193906784, "rewards/rejected": -0.009249115362763405, "step": 26 }, { "epoch": 0.01, "learning_rate": 1.6167664670658683e-08, "logits/chosen": -2.086261510848999, "logits/rejected": -2.092257261276245, "logps/chosen": -19.009735107421875, "logps/rejected": -9.838574409484863, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": -0.0060249329544603825, "rewards/margins": -0.01949186436831951, "rewards/rejected": 0.013466930948197842, "step": 27 }, { "epoch": 0.02, "learning_rate": 1.6766467065868262e-08, "logits/chosen": -2.1890101432800293, "logits/rejected": -2.358189105987549, "logps/chosen": -20.0346736907959, "logps/rejected": -14.541224479675293, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.0006488800281658769, "rewards/margins": 0.009042453952133656, "rewards/rejected": -0.008393573574721813, "step": 28 }, { "epoch": 0.02, "learning_rate": 1.7365269461077845e-08, "logits/chosen": -2.009934425354004, "logits/rejected": -2.290698528289795, "logps/chosen": -10.93287181854248, "logps/rejected": -10.355958938598633, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.005409050267189741, "rewards/margins": 0.002321911044418812, "rewards/rejected": 0.0030871392227709293, "step": 29 }, { "epoch": 0.02, "learning_rate": 1.7964071856287425e-08, "logits/chosen": -2.07171630859375, "logits/rejected": -2.324683904647827, "logps/chosen": -9.783227920532227, "logps/rejected": -9.261146545410156, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": -0.022924423217773438, "rewards/margins": -0.0217267032712698, "rewards/rejected": -0.0011977195972576737, "step": 30 }, { "epoch": 0.02, "learning_rate": 1.8562874251497004e-08, "logits/chosen": -2.080933094024658, "logits/rejected": -2.323272228240967, "logps/chosen": -9.17174243927002, "logps/rejected": -9.035951614379883, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": -0.0011231423122808337, "rewards/margins": -0.0014688492519780993, "rewards/rejected": 0.0003457069396972656, "step": 31 }, { "epoch": 0.02, "learning_rate": 1.9161676646706587e-08, "logits/chosen": -2.032352924346924, "logits/rejected": -2.034011125564575, "logps/chosen": -12.781051635742188, "logps/rejected": -11.793323516845703, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.0006729126325808465, "rewards/margins": 0.00467681884765625, "rewards/rejected": -0.0040039061568677425, "step": 32 }, { "epoch": 0.02, "learning_rate": 1.9760479041916167e-08, "logits/chosen": -2.217085361480713, "logits/rejected": -2.125012159347534, "logps/chosen": -47.779911041259766, "logps/rejected": -9.501927375793457, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.018179703503847122, "rewards/margins": 0.01724538952112198, "rewards/rejected": 0.0009343147394247353, "step": 33 }, { "epoch": 0.02, "learning_rate": 2.0359281437125746e-08, "logits/chosen": -2.033640146255493, "logits/rejected": -2.3116350173950195, "logps/chosen": -9.438238143920898, "logps/rejected": -9.262679100036621, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": -0.0010282517177984118, "rewards/margins": -0.004901981446892023, "rewards/rejected": 0.003873729845508933, "step": 34 }, { "epoch": 0.02, "learning_rate": 2.0958083832335326e-08, "logits/chosen": -2.0197408199310303, "logits/rejected": -2.253222942352295, "logps/chosen": -9.203672409057617, "logps/rejected": -8.883655548095703, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": -0.0001639366237213835, "rewards/margins": -0.0020609856583178043, "rewards/rejected": 0.0018970490200445056, "step": 35 }, { "epoch": 0.02, "learning_rate": 2.155688622754491e-08, "logits/chosen": -2.0224411487579346, "logits/rejected": -2.3080337047576904, "logps/chosen": -8.753106117248535, "logps/rejected": -8.373930931091309, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.0012947082286700606, "rewards/margins": 0.007284832186996937, "rewards/rejected": -0.005990123841911554, "step": 36 }, { "epoch": 0.02, "learning_rate": 2.215568862275449e-08, "logits/chosen": -2.082209348678589, "logits/rejected": -2.085095167160034, "logps/chosen": -16.871997833251953, "logps/rejected": -8.13684368133545, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.011597061529755592, "rewards/margins": 0.01171731948852539, "rewards/rejected": -0.0001202583298436366, "step": 37 }, { "epoch": 0.02, "learning_rate": 2.275449101796407e-08, "logits/chosen": -2.0975074768066406, "logits/rejected": -2.001901865005493, "logps/chosen": -45.843223571777344, "logps/rejected": -11.109920501708984, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.0034389495849609375, "rewards/margins": -0.005471420474350452, "rewards/rejected": 0.00891037005931139, "step": 38 }, { "epoch": 0.02, "learning_rate": 2.3353293413173654e-08, "logits/chosen": -2.1645593643188477, "logits/rejected": -2.166994333267212, "logps/chosen": -16.8641357421875, "logps/rejected": -13.132423400878906, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.007637596223503351, "rewards/margins": 0.004525947384536266, "rewards/rejected": 0.0031116486061364412, "step": 39 }, { "epoch": 0.02, "learning_rate": 2.3952095808383233e-08, "logits/chosen": -2.1991729736328125, "logits/rejected": -2.3230018615722656, "logps/chosen": -14.382692337036133, "logps/rejected": -15.292593955993652, "loss": 0.7048, "rewards/accuracies": 0.0, "rewards/chosen": -0.011005210690200329, "rewards/margins": -0.023195933550596237, "rewards/rejected": 0.012190723791718483, "step": 40 }, { "epoch": 0.02, "learning_rate": 2.4550898203592813e-08, "logits/chosen": -2.172686815261841, "logits/rejected": -2.181542158126831, "logps/chosen": -15.370450019836426, "logps/rejected": -21.398927688598633, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.002742290496826172, "rewards/margins": -0.0014197351410984993, "rewards/rejected": 0.004162025637924671, "step": 41 }, { "epoch": 0.02, "learning_rate": 2.5149700598802392e-08, "logits/chosen": -2.125290632247925, "logits/rejected": -2.370950222015381, "logps/chosen": -11.453412055969238, "logps/rejected": -15.376302719116211, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": -0.004276180174201727, "rewards/margins": -0.010200118646025658, "rewards/rejected": 0.005923938937485218, "step": 42 }, { "epoch": 0.02, "learning_rate": 2.574850299401197e-08, "logits/chosen": -2.191587448120117, "logits/rejected": -2.191767930984497, "logps/chosen": -17.134016036987305, "logps/rejected": -10.363306999206543, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": -0.0076919556595385075, "rewards/margins": -0.009229660034179688, "rewards/rejected": 0.0015377044910565019, "step": 43 }, { "epoch": 0.02, "learning_rate": 2.6347305389221558e-08, "logits/chosen": -2.2024598121643066, "logits/rejected": -2.2377007007598877, "logps/chosen": -24.98147964477539, "logps/rejected": -19.53157615661621, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.008563041687011719, "rewards/margins": 0.010098266415297985, "rewards/rejected": -0.00153522496111691, "step": 44 }, { "epoch": 0.02, "learning_rate": 2.6946107784431137e-08, "logits/chosen": -2.109586238861084, "logits/rejected": -2.066589117050171, "logps/chosen": -31.452796936035156, "logps/rejected": -10.896369934082031, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.0033535005059093237, "rewards/margins": -0.00015182490460574627, "rewards/rejected": 0.00350532541051507, "step": 45 }, { "epoch": 0.02, "learning_rate": 2.7544910179640717e-08, "logits/chosen": -2.0115320682525635, "logits/rejected": -2.0317440032958984, "logps/chosen": -10.15146541595459, "logps/rejected": -23.04925537109375, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": -0.008557033725082874, "rewards/margins": 0.0022377967834472656, "rewards/rejected": -0.01079483050853014, "step": 46 }, { "epoch": 0.03, "learning_rate": 2.81437125748503e-08, "logits/chosen": -2.2034506797790527, "logits/rejected": -2.347400665283203, "logps/chosen": -13.073119163513184, "logps/rejected": -12.91992473602295, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": -0.0020820617210119963, "rewards/margins": 0.0009079934097826481, "rewards/rejected": -0.0029900551307946444, "step": 47 }, { "epoch": 0.03, "learning_rate": 2.874251497005988e-08, "logits/chosen": -2.036632776260376, "logits/rejected": -2.0319366455078125, "logps/chosen": -11.426937103271484, "logps/rejected": -9.93587589263916, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.0028202056419104338, "rewards/margins": -0.0004382133483886719, "rewards/rejected": 0.0032584189902991056, "step": 48 }, { "epoch": 0.03, "learning_rate": 2.934131736526946e-08, "logits/chosen": -2.2244856357574463, "logits/rejected": -2.221395969390869, "logps/chosen": -19.908634185791016, "logps/rejected": -9.368327140808105, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.0025451660621911287, "rewards/margins": 0.011210250668227673, "rewards/rejected": -0.008665084838867188, "step": 49 }, { "epoch": 0.03, "learning_rate": 2.9940119760479035e-08, "logits/chosen": -2.0372424125671387, "logits/rejected": -2.305026054382324, "logps/chosen": -10.318634986877441, "logps/rejected": -10.13671875, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": -0.002641010330989957, "rewards/margins": 0.0003408431075513363, "rewards/rejected": -0.002981853438541293, "step": 50 }, { "epoch": 0.03, "learning_rate": 3.053892215568862e-08, "logits/chosen": -2.001779317855835, "logits/rejected": -2.000807762145996, "logps/chosen": -10.556661605834961, "logps/rejected": -8.468564987182617, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.009660339914262295, "rewards/margins": 0.019778728485107422, "rewards/rejected": -0.010118389502167702, "step": 51 }, { "epoch": 0.03, "learning_rate": 3.11377245508982e-08, "logits/chosen": -2.112924098968506, "logits/rejected": -2.3733770847320557, "logps/chosen": -11.091135025024414, "logps/rejected": -11.095255851745605, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": -0.0018992424011230469, "rewards/margins": -0.0024177550803869963, "rewards/rejected": 0.0005185127374716103, "step": 52 }, { "epoch": 0.03, "learning_rate": 3.1736526946107786e-08, "logits/chosen": -2.082143783569336, "logits/rejected": -2.3364851474761963, "logps/chosen": -11.325684547424316, "logps/rejected": -10.871271133422852, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.007728863041847944, "rewards/margins": 0.003836059710010886, "rewards/rejected": 0.003892803331837058, "step": 53 }, { "epoch": 0.03, "learning_rate": 3.2335329341317366e-08, "logits/chosen": -2.1302826404571533, "logits/rejected": -2.1175520420074463, "logps/chosen": -27.1384220123291, "logps/rejected": -9.268571853637695, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.013849067501723766, "rewards/margins": 0.012765788473188877, "rewards/rejected": 0.0010832786792889237, "step": 54 }, { "epoch": 0.03, "learning_rate": 3.2934131736526945e-08, "logits/chosen": -1.9913909435272217, "logits/rejected": -2.280839443206787, "logps/chosen": -10.654471397399902, "logps/rejected": -10.156837463378906, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.010746193118393421, "rewards/margins": 0.004476356785744429, "rewards/rejected": 0.0062698363326489925, "step": 55 }, { "epoch": 0.03, "learning_rate": 3.3532934131736525e-08, "logits/chosen": -2.0612940788269043, "logits/rejected": -2.0564703941345215, "logps/chosen": -17.322038650512695, "logps/rejected": -8.598268508911133, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": -0.007116127293556929, "rewards/margins": -0.01813793182373047, "rewards/rejected": 0.011021804995834827, "step": 56 }, { "epoch": 0.03, "learning_rate": 3.413173652694611e-08, "logits/chosen": -2.0037147998809814, "logits/rejected": -2.3608784675598145, "logps/chosen": -9.511384963989258, "logps/rejected": -22.219709396362305, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.00130634312517941, "rewards/margins": 0.008821296505630016, "rewards/rejected": -0.00751495361328125, "step": 57 }, { "epoch": 0.03, "learning_rate": 3.473053892215569e-08, "logits/chosen": -2.2474124431610107, "logits/rejected": -2.375666856765747, "logps/chosen": -13.37817096710205, "logps/rejected": -12.415132522583008, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": -0.007659244816750288, "rewards/margins": -0.014452648349106312, "rewards/rejected": 0.006793403532356024, "step": 58 }, { "epoch": 0.03, "learning_rate": 3.532934131736527e-08, "logits/chosen": -2.153276205062866, "logits/rejected": -2.3152220249176025, "logps/chosen": -9.87206745147705, "logps/rejected": -9.60525131225586, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": -0.0047401427291333675, "rewards/margins": -0.006966304965317249, "rewards/rejected": 0.002226162003353238, "step": 59 }, { "epoch": 0.03, "learning_rate": 3.592814371257485e-08, "logits/chosen": -2.0695087909698486, "logits/rejected": -2.0766024589538574, "logps/chosen": -11.419910430908203, "logps/rejected": -11.081637382507324, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.0017148017650470138, "rewards/margins": 0.007834911346435547, "rewards/rejected": -0.006120109464973211, "step": 60 }, { "epoch": 0.03, "learning_rate": 3.652694610778443e-08, "logits/chosen": -2.0774431228637695, "logits/rejected": -2.2902028560638428, "logps/chosen": -9.625088691711426, "logps/rejected": -9.525925636291504, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.007505321409553289, "rewards/margins": -0.006178856361657381, "rewards/rejected": 0.01368417777121067, "step": 61 }, { "epoch": 0.03, "learning_rate": 3.712574850299401e-08, "logits/chosen": -2.1199288368225098, "logits/rejected": -2.1434409618377686, "logps/chosen": -14.980703353881836, "logps/rejected": -12.910759925842285, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": -0.0037218094803392887, "rewards/margins": -0.004533290863037109, "rewards/rejected": 0.0008114814991131425, "step": 62 }, { "epoch": 0.03, "learning_rate": 3.772455089820359e-08, "logits/chosen": -2.0866425037384033, "logits/rejected": -2.2408361434936523, "logps/chosen": -8.98461627960205, "logps/rejected": -8.835596084594727, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.0059258462861180305, "rewards/margins": 0.0033813477493822575, "rewards/rejected": 0.002544498536735773, "step": 63 }, { "epoch": 0.03, "learning_rate": 3.8323353293413174e-08, "logits/chosen": -2.212338447570801, "logits/rejected": -2.2397077083587646, "logps/chosen": -33.71771240234375, "logps/rejected": -13.987553596496582, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.0088958740234375, "rewards/margins": 0.005745982751250267, "rewards/rejected": 0.0031498910393565893, "step": 64 }, { "epoch": 0.04, "learning_rate": 3.8922155688622754e-08, "logits/chosen": -2.1122384071350098, "logits/rejected": -2.1149110794067383, "logps/chosen": -12.645782470703125, "logps/rejected": -11.289641380310059, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.0032690048683434725, "rewards/margins": 0.0018817902309820056, "rewards/rejected": 0.0013872146373614669, "step": 65 }, { "epoch": 0.04, "learning_rate": 3.952095808383233e-08, "logits/chosen": -2.0071442127227783, "logits/rejected": -2.2863450050354004, "logps/chosen": -9.621089935302734, "logps/rejected": -9.36291217803955, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.009087658487260342, "rewards/margins": 0.006687069311738014, "rewards/rejected": 0.0024005889426916838, "step": 66 }, { "epoch": 0.04, "learning_rate": 4.011976047904191e-08, "logits/chosen": -2.2074148654937744, "logits/rejected": -2.1879940032958984, "logps/chosen": -26.711008071899414, "logps/rejected": -13.275725364685059, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.009914589114487171, "rewards/margins": 0.013732433319091797, "rewards/rejected": -0.0038178444374352694, "step": 67 }, { "epoch": 0.04, "learning_rate": 4.071856287425149e-08, "logits/chosen": -2.1413567066192627, "logits/rejected": -2.138355016708374, "logps/chosen": -21.988557815551758, "logps/rejected": -8.479101181030273, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.0026132583152502775, "rewards/margins": -0.001600551651790738, "rewards/rejected": 0.004213809967041016, "step": 68 }, { "epoch": 0.04, "learning_rate": 4.131736526946107e-08, "logits/chosen": -2.031099319458008, "logits/rejected": -2.0390918254852295, "logps/chosen": -15.345788955688477, "logps/rejected": -9.091521263122559, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.0006248474237509072, "rewards/margins": -0.00384693150408566, "rewards/rejected": 0.004471778869628906, "step": 69 }, { "epoch": 0.04, "learning_rate": 4.191616766467065e-08, "logits/chosen": -2.2766504287719727, "logits/rejected": -2.277428150177002, "logps/chosen": -12.272825241088867, "logps/rejected": -9.114608764648438, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": -0.004258919041603804, "rewards/margins": -0.008103752508759499, "rewards/rejected": 0.003844833467155695, "step": 70 }, { "epoch": 0.04, "learning_rate": 4.251497005988024e-08, "logits/chosen": -2.1497275829315186, "logits/rejected": -2.1518969535827637, "logps/chosen": -10.156100273132324, "logps/rejected": -9.789898872375488, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.006843280978500843, "rewards/margins": -0.007723617367446423, "rewards/rejected": 0.014566898345947266, "step": 71 }, { "epoch": 0.04, "learning_rate": 4.311377245508982e-08, "logits/chosen": -2.182758331298828, "logits/rejected": -2.3216300010681152, "logps/chosen": -8.686869621276855, "logps/rejected": -8.51818561553955, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.0026293755508959293, "rewards/margins": 0.0026380540803074837, "rewards/rejected": -8.678436643094756e-06, "step": 72 }, { "epoch": 0.04, "learning_rate": 4.3712574850299396e-08, "logits/chosen": -2.0872843265533447, "logits/rejected": -2.077965259552002, "logps/chosen": -20.031715393066406, "logps/rejected": -10.427141189575195, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.007603454869240522, "rewards/margins": -0.005949115846306086, "rewards/rejected": 0.013552570715546608, "step": 73 }, { "epoch": 0.04, "learning_rate": 4.431137724550898e-08, "logits/chosen": -2.1391186714172363, "logits/rejected": -2.1409196853637695, "logps/chosen": -10.568460464477539, "logps/rejected": -9.870757102966309, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.014916610904037952, "rewards/margins": 0.013483620248734951, "rewards/rejected": 0.0014329910045489669, "step": 74 }, { "epoch": 0.04, "learning_rate": 4.491017964071856e-08, "logits/chosen": -2.0653891563415527, "logits/rejected": -2.061624765396118, "logps/chosen": -15.005682945251465, "logps/rejected": -9.725150108337402, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.005707073491066694, "rewards/margins": 0.012103939428925514, "rewards/rejected": -0.00639686593785882, "step": 75 }, { "epoch": 0.04, "learning_rate": 4.550898203592814e-08, "logits/chosen": -2.0795552730560303, "logits/rejected": -2.0893592834472656, "logps/chosen": -14.408939361572266, "logps/rejected": -9.1983642578125, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.02653675153851509, "rewards/margins": 0.017840005457401276, "rewards/rejected": 0.00869674701243639, "step": 76 }, { "epoch": 0.04, "learning_rate": 4.610778443113773e-08, "logits/chosen": -2.0806872844696045, "logits/rejected": -2.0777394771575928, "logps/chosen": -22.98017692565918, "logps/rejected": -9.163311004638672, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.011730385012924671, "rewards/margins": -0.0034955982118844986, "rewards/rejected": 0.01522598322480917, "step": 77 }, { "epoch": 0.04, "learning_rate": 4.670658682634731e-08, "logits/chosen": -2.1407816410064697, "logits/rejected": -2.1439805030822754, "logps/chosen": -23.062232971191406, "logps/rejected": -8.349564552307129, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.02420349232852459, "rewards/margins": 0.019690513610839844, "rewards/rejected": 0.004512977786362171, "step": 78 }, { "epoch": 0.04, "learning_rate": 4.7305389221556887e-08, "logits/chosen": -2.1169166564941406, "logits/rejected": -2.110389232635498, "logps/chosen": -21.13638687133789, "logps/rejected": -12.595857620239258, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": -0.010751152411103249, "rewards/margins": -0.022069836035370827, "rewards/rejected": 0.011318683624267578, "step": 79 }, { "epoch": 0.04, "learning_rate": 4.7904191616766466e-08, "logits/chosen": -2.2256357669830322, "logits/rejected": -2.2541983127593994, "logps/chosen": -20.52251434326172, "logps/rejected": -20.54475975036621, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.015552902594208717, "rewards/margins": 0.013078117743134499, "rewards/rejected": 0.0024747848510742188, "step": 80 }, { "epoch": 0.04, "learning_rate": 4.8502994011976046e-08, "logits/chosen": -2.0845398902893066, "logits/rejected": -2.0792174339294434, "logps/chosen": -12.132347106933594, "logps/rejected": -8.778169631958008, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 0.00015106202044989914, "rewards/margins": -0.01831808127462864, "rewards/rejected": 0.01846914365887642, "step": 81 }, { "epoch": 0.04, "learning_rate": 4.9101796407185625e-08, "logits/chosen": -2.168095111846924, "logits/rejected": -2.1729111671447754, "logps/chosen": -15.327953338623047, "logps/rejected": -8.104217529296875, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.013421154581010342, "rewards/margins": 0.0024643903598189354, "rewards/rejected": 0.010956764221191406, "step": 82 }, { "epoch": 0.04, "learning_rate": 4.9700598802395205e-08, "logits/chosen": -2.0609378814697266, "logits/rejected": -2.0658371448516846, "logps/chosen": -10.046298027038574, "logps/rejected": -10.703130722045898, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.04140844568610191, "rewards/margins": 0.037581540644168854, "rewards/rejected": 0.0038269043434411287, "step": 83 }, { "epoch": 0.05, "learning_rate": 5.0299401197604784e-08, "logits/chosen": -2.1340365409851074, "logits/rejected": -2.144913673400879, "logps/chosen": -18.512155532836914, "logps/rejected": -15.42941665649414, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.013103104196488857, "rewards/margins": 0.003489876165986061, "rewards/rejected": 0.009613228030502796, "step": 84 }, { "epoch": 0.05, "learning_rate": 5.089820359281437e-08, "logits/chosen": -2.0215439796447754, "logits/rejected": -2.0172910690307617, "logps/chosen": -23.787403106689453, "logps/rejected": -9.515847206115723, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.01623230054974556, "rewards/margins": 0.008060360327363014, "rewards/rejected": 0.008171940222382545, "step": 85 }, { "epoch": 0.05, "learning_rate": 5.149700598802394e-08, "logits/chosen": -2.1730191707611084, "logits/rejected": -2.1821069717407227, "logps/chosen": -15.827571868896484, "logps/rejected": -11.986709594726562, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.021857071667909622, "rewards/margins": 0.019102001562714577, "rewards/rejected": 0.002755069872364402, "step": 86 }, { "epoch": 0.05, "learning_rate": 5.209580838323353e-08, "logits/chosen": -2.1470115184783936, "logits/rejected": -2.2943525314331055, "logps/chosen": -12.311257362365723, "logps/rejected": -10.605905532836914, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.011141300201416016, "rewards/margins": -0.003794384188950062, "rewards/rejected": 0.014935684390366077, "step": 87 }, { "epoch": 0.05, "learning_rate": 5.2694610778443115e-08, "logits/chosen": -2.000725507736206, "logits/rejected": -2.2563185691833496, "logps/chosen": -9.345174789428711, "logps/rejected": -9.436463356018066, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.01375427283346653, "rewards/margins": -0.00505981408059597, "rewards/rejected": 0.0188140869140625, "step": 88 }, { "epoch": 0.05, "learning_rate": 5.329341317365269e-08, "logits/chosen": -2.070589780807495, "logits/rejected": -2.0784809589385986, "logps/chosen": -11.600725173950195, "logps/rejected": -18.4509334564209, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.014331626705825329, "rewards/margins": 0.013307380490005016, "rewards/rejected": 0.0010242462158203125, "step": 89 }, { "epoch": 0.05, "learning_rate": 5.3892215568862274e-08, "logits/chosen": -2.1653432846069336, "logits/rejected": -2.1581718921661377, "logps/chosen": -20.793819427490234, "logps/rejected": -10.635101318359375, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.016032982617616653, "rewards/margins": 0.0033536916598677635, "rewards/rejected": 0.01267929095774889, "step": 90 }, { "epoch": 0.05, "learning_rate": 5.449101796407185e-08, "logits/chosen": -2.1395792961120605, "logits/rejected": -2.137909412384033, "logps/chosen": -14.751914978027344, "logps/rejected": -9.965921401977539, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.014207459054887295, "rewards/margins": -0.009031771682202816, "rewards/rejected": 0.02323923073709011, "step": 91 }, { "epoch": 0.05, "learning_rate": 5.5089820359281433e-08, "logits/chosen": -2.1835544109344482, "logits/rejected": -2.1880300045013428, "logps/chosen": -12.26913070678711, "logps/rejected": -10.720724105834961, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.02851085737347603, "rewards/margins": 0.004570579156279564, "rewards/rejected": 0.023940278217196465, "step": 92 }, { "epoch": 0.05, "learning_rate": 5.568862275449101e-08, "logits/chosen": -2.216925621032715, "logits/rejected": -2.327428102493286, "logps/chosen": -21.31270408630371, "logps/rejected": -24.035587310791016, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.009770012460649014, "rewards/margins": 0.012777138501405716, "rewards/rejected": -0.0030071258079260588, "step": 93 }, { "epoch": 0.05, "learning_rate": 5.62874251497006e-08, "logits/chosen": -2.222801685333252, "logits/rejected": -2.2197649478912354, "logps/chosen": -24.628292083740234, "logps/rejected": -8.562743186950684, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.030533218756318092, "rewards/margins": 0.022320080548524857, "rewards/rejected": 0.00821313913911581, "step": 94 }, { "epoch": 0.05, "learning_rate": 5.688622754491018e-08, "logits/chosen": -2.054525375366211, "logits/rejected": -2.311052083969116, "logps/chosen": -10.267644882202148, "logps/rejected": -10.257949829101562, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.018663406372070312, "rewards/margins": 0.012148093432188034, "rewards/rejected": 0.006515312474220991, "step": 95 }, { "epoch": 0.05, "learning_rate": 5.748502994011976e-08, "logits/chosen": -2.1148343086242676, "logits/rejected": -2.354222536087036, "logps/chosen": -10.37639331817627, "logps/rejected": -10.701351165771484, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.022355079650878906, "rewards/margins": 0.014456748962402344, "rewards/rejected": 0.007898330688476562, "step": 96 }, { "epoch": 0.05, "learning_rate": 5.8083832335329344e-08, "logits/chosen": -2.03421688079834, "logits/rejected": -2.0342767238616943, "logps/chosen": -14.612006187438965, "logps/rejected": -8.684837341308594, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.012875652872025967, "rewards/margins": 0.004599476233124733, "rewards/rejected": 0.008276176638901234, "step": 97 }, { "epoch": 0.05, "learning_rate": 5.868263473053892e-08, "logits/chosen": -2.1014904975891113, "logits/rejected": -2.008559465408325, "logps/chosen": -53.509864807128906, "logps/rejected": -9.695104598999023, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": -0.018994523212313652, "rewards/margins": -0.04901838302612305, "rewards/rejected": 0.030023861676454544, "step": 98 }, { "epoch": 0.05, "learning_rate": 5.92814371257485e-08, "logits/chosen": -2.1727590560913086, "logits/rejected": -2.0554981231689453, "logps/chosen": -55.83694839477539, "logps/rejected": -9.01846981048584, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.051126863807439804, "rewards/margins": 0.03704138100147247, "rewards/rejected": 0.014085483737289906, "step": 99 }, { "epoch": 0.05, "learning_rate": 5.988023952095807e-08, "logits/chosen": -2.169166088104248, "logits/rejected": -2.365407705307007, "logps/chosen": -9.995257377624512, "logps/rejected": -9.729559898376465, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.007813549600541592, "rewards/margins": -0.011901282705366611, "rewards/rejected": 0.019714832305908203, "step": 100 }, { "epoch": 0.05, "learning_rate": 6.047904191616766e-08, "logits/chosen": -2.1680970191955566, "logits/rejected": -2.159405469894409, "logps/chosen": -24.184803009033203, "logps/rejected": -8.851638793945312, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.03679237514734268, "rewards/margins": 0.010417463257908821, "rewards/rejected": 0.02637491188943386, "step": 101 }, { "epoch": 0.06, "learning_rate": 6.107784431137724e-08, "logits/chosen": -2.1515512466430664, "logits/rejected": -2.1446890830993652, "logps/chosen": -31.457908630371094, "logps/rejected": -10.132776260375977, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.02070179022848606, "rewards/margins": -0.0002575870603322983, "rewards/rejected": 0.02095937728881836, "step": 102 }, { "epoch": 0.06, "learning_rate": 6.167664670658683e-08, "logits/chosen": -2.0354926586151123, "logits/rejected": -2.246004819869995, "logps/chosen": -9.219076156616211, "logps/rejected": -9.252620697021484, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.022976303473114967, "rewards/margins": 0.0032558441162109375, "rewards/rejected": 0.01972045935690403, "step": 103 }, { "epoch": 0.06, "learning_rate": 6.22754491017964e-08, "logits/chosen": -2.173171043395996, "logits/rejected": -2.279550790786743, "logps/chosen": -9.875617980957031, "logps/rejected": -9.361398696899414, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.029628945514559746, "rewards/margins": -2.746470272541046e-05, "rewards/rejected": 0.029656410217285156, "step": 104 }, { "epoch": 0.06, "learning_rate": 6.287425149700599e-08, "logits/chosen": -2.1710093021392822, "logits/rejected": -2.3062379360198975, "logps/chosen": -9.700754165649414, "logps/rejected": -9.816267967224121, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.026854991912841797, "rewards/margins": 0.016991902142763138, "rewards/rejected": 0.009863090701401234, "step": 105 }, { "epoch": 0.06, "learning_rate": 6.347305389221557e-08, "logits/chosen": -2.166234016418457, "logits/rejected": -2.171205997467041, "logps/chosen": -10.914085388183594, "logps/rejected": -8.214065551757812, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.02299661748111248, "rewards/margins": 0.015521050430834293, "rewards/rejected": 0.007475567050278187, "step": 106 }, { "epoch": 0.06, "learning_rate": 6.407185628742515e-08, "logits/chosen": -2.0539705753326416, "logits/rejected": -2.3179354667663574, "logps/chosen": -10.74355697631836, "logps/rejected": -10.711734771728516, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.021503543481230736, "rewards/margins": -0.01626425050199032, "rewards/rejected": 0.037767793983221054, "step": 107 }, { "epoch": 0.06, "learning_rate": 6.467065868263473e-08, "logits/chosen": -2.2176706790924072, "logits/rejected": -2.1417202949523926, "logps/chosen": -52.630638122558594, "logps/rejected": -8.783158302307129, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.01609039306640625, "rewards/margins": 0.0011359211057424545, "rewards/rejected": 0.014954471960663795, "step": 108 }, { "epoch": 0.06, "learning_rate": 6.526946107784432e-08, "logits/chosen": -2.1851367950439453, "logits/rejected": -2.177199125289917, "logps/chosen": -21.327144622802734, "logps/rejected": -8.895694732666016, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.03161163255572319, "rewards/margins": 0.014476297423243523, "rewards/rejected": 0.017135335132479668, "step": 109 }, { "epoch": 0.06, "learning_rate": 6.586826347305389e-08, "logits/chosen": -2.191641330718994, "logits/rejected": -2.185135841369629, "logps/chosen": -13.659268379211426, "logps/rejected": -11.805540084838867, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.024065781384706497, "rewards/margins": -0.01681690290570259, "rewards/rejected": 0.04088268429040909, "step": 110 }, { "epoch": 0.06, "learning_rate": 6.646706586826348e-08, "logits/chosen": -2.2637743949890137, "logits/rejected": -2.2212488651275635, "logps/chosen": -44.500892639160156, "logps/rejected": -11.91779899597168, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.038352202624082565, "rewards/margins": 0.031041622161865234, "rewards/rejected": 0.0073105813935399055, "step": 111 }, { "epoch": 0.06, "learning_rate": 6.706586826347305e-08, "logits/chosen": -2.1043622493743896, "logits/rejected": -2.1147818565368652, "logps/chosen": -11.919310569763184, "logps/rejected": -9.79914665222168, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.022217560559511185, "rewards/margins": 0.00950250681489706, "rewards/rejected": 0.012715053744614124, "step": 112 }, { "epoch": 0.06, "learning_rate": 6.766467065868264e-08, "logits/chosen": -2.1387264728546143, "logits/rejected": -2.2819249629974365, "logps/chosen": -11.030794143676758, "logps/rejected": -10.91385269165039, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.016012191772460938, "rewards/margins": -0.011998940259218216, "rewards/rejected": 0.028011132031679153, "step": 113 }, { "epoch": 0.06, "learning_rate": 6.826347305389222e-08, "logits/chosen": -2.134732961654663, "logits/rejected": -2.3307533264160156, "logps/chosen": -14.647011756896973, "logps/rejected": -14.031876564025879, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.02377195470035076, "rewards/margins": 0.003600694239139557, "rewards/rejected": 0.020171260461211205, "step": 114 }, { "epoch": 0.06, "learning_rate": 6.88622754491018e-08, "logits/chosen": -2.2498886585235596, "logits/rejected": -2.1094186305999756, "logps/chosen": -61.89494323730469, "logps/rejected": -8.810297966003418, "loss": 0.6491, "rewards/accuracies": 1.0, "rewards/chosen": 0.11543159931898117, "rewards/margins": 0.09010200947523117, "rewards/rejected": 0.02532958984375, "step": 115 }, { "epoch": 0.06, "learning_rate": 6.946107784431138e-08, "logits/chosen": -2.2414627075195312, "logits/rejected": -2.269969940185547, "logps/chosen": -9.749083518981934, "logps/rejected": -14.994815826416016, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.03028392791748047, "rewards/margins": 0.02486877515912056, "rewards/rejected": 0.005415153689682484, "step": 116 }, { "epoch": 0.06, "learning_rate": 7.005988023952095e-08, "logits/chosen": -2.0758702754974365, "logits/rejected": -2.325120687484741, "logps/chosen": -16.386791229248047, "logps/rejected": -16.027700424194336, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.04195442423224449, "rewards/margins": -2.6702880859375e-05, "rewards/rejected": 0.04198112711310387, "step": 117 }, { "epoch": 0.06, "learning_rate": 7.065868263473054e-08, "logits/chosen": -2.155989170074463, "logits/rejected": -2.3058135509490967, "logps/chosen": -6.9890007972717285, "logps/rejected": -6.648977279663086, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.02582373656332493, "rewards/margins": -0.006782772019505501, "rewards/rejected": 0.03260650858283043, "step": 118 }, { "epoch": 0.06, "learning_rate": 7.125748502994011e-08, "logits/chosen": -2.175783634185791, "logits/rejected": -2.334841251373291, "logps/chosen": -7.318718433380127, "logps/rejected": -7.009897708892822, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.04105892404913902, "rewards/margins": -0.010015677660703659, "rewards/rejected": 0.05107460170984268, "step": 119 }, { "epoch": 0.06, "learning_rate": 7.18562874251497e-08, "logits/chosen": -2.082822799682617, "logits/rejected": -2.310870885848999, "logps/chosen": -8.447615623474121, "logps/rejected": -8.282356262207031, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.03728818893432617, "rewards/margins": -0.004987049847841263, "rewards/rejected": 0.042275238782167435, "step": 120 }, { "epoch": 0.07, "learning_rate": 7.245508982035929e-08, "logits/chosen": -2.0603508949279785, "logits/rejected": -2.0646450519561768, "logps/chosen": -16.462688446044922, "logps/rejected": -12.070505142211914, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.039700698107481, "rewards/margins": 0.0015697479248046875, "rewards/rejected": 0.038130950182676315, "step": 121 }, { "epoch": 0.07, "learning_rate": 7.305389221556886e-08, "logits/chosen": -2.097029685974121, "logits/rejected": -2.0948429107666016, "logps/chosen": -11.582793235778809, "logps/rejected": -9.097574234008789, "loss": 0.6593, "rewards/accuracies": 1.0, "rewards/chosen": 0.07863960415124893, "rewards/margins": 0.06878995895385742, "rewards/rejected": 0.009849644266068935, "step": 122 }, { "epoch": 0.07, "learning_rate": 7.365269461077844e-08, "logits/chosen": -2.044933319091797, "logits/rejected": -2.3376176357269287, "logps/chosen": -10.825481414794922, "logps/rejected": -10.73626708984375, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.04650993272662163, "rewards/margins": 0.013980671763420105, "rewards/rejected": 0.03252926096320152, "step": 123 }, { "epoch": 0.07, "learning_rate": 7.425149700598802e-08, "logits/chosen": -2.1210601329803467, "logits/rejected": -2.1247360706329346, "logps/chosen": -11.938990592956543, "logps/rejected": -16.094511032104492, "loss": 0.6722, "rewards/accuracies": 1.0, "rewards/chosen": 0.04901885986328125, "rewards/margins": 0.04228382185101509, "rewards/rejected": 0.006735038943588734, "step": 124 }, { "epoch": 0.07, "learning_rate": 7.48502994011976e-08, "logits/chosen": -2.1316730976104736, "logits/rejected": -2.329294204711914, "logps/chosen": -14.667659759521484, "logps/rejected": -9.045886039733887, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 0.0254395492374897, "rewards/margins": -0.0312931053340435, "rewards/rejected": 0.0567326545715332, "step": 125 }, { "epoch": 0.07, "learning_rate": 7.544910179640718e-08, "logits/chosen": -2.0990025997161865, "logits/rejected": -2.1011645793914795, "logps/chosen": -12.885945320129395, "logps/rejected": -11.05099868774414, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.05571804195642471, "rewards/margins": 0.020563315600156784, "rewards/rejected": 0.03515472635626793, "step": 126 }, { "epoch": 0.07, "learning_rate": 7.604790419161676e-08, "logits/chosen": -2.081960916519165, "logits/rejected": -2.0915396213531494, "logps/chosen": -11.931280136108398, "logps/rejected": -6.951378345489502, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.06037301942706108, "rewards/margins": 0.034506749361753464, "rewards/rejected": 0.025866270065307617, "step": 127 }, { "epoch": 0.07, "learning_rate": 7.664670658682635e-08, "logits/chosen": -2.256284475326538, "logits/rejected": -2.226715087890625, "logps/chosen": -42.48335647583008, "logps/rejected": -22.022647857666016, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.04546623304486275, "rewards/margins": 0.03182220458984375, "rewards/rejected": 0.013644027523696423, "step": 128 }, { "epoch": 0.07, "learning_rate": 7.724550898203592e-08, "logits/chosen": -2.1783788204193115, "logits/rejected": -2.1779088973999023, "logps/chosen": -11.680098533630371, "logps/rejected": -8.822985649108887, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 0.06475954502820969, "rewards/margins": 0.039689067751169205, "rewards/rejected": 0.02507047727704048, "step": 129 }, { "epoch": 0.07, "learning_rate": 7.784431137724551e-08, "logits/chosen": -2.2217743396759033, "logits/rejected": -2.3734214305877686, "logps/chosen": -11.440587043762207, "logps/rejected": -14.757913589477539, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.045981504023075104, "rewards/margins": 0.015979671850800514, "rewards/rejected": 0.03000183217227459, "step": 130 }, { "epoch": 0.07, "learning_rate": 7.844311377245508e-08, "logits/chosen": -2.0898261070251465, "logits/rejected": -2.091982126235962, "logps/chosen": -18.391382217407227, "logps/rejected": -10.652045249938965, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.08815822750329971, "rewards/margins": 0.03644314035773277, "rewards/rejected": 0.05171508714556694, "step": 131 }, { "epoch": 0.07, "learning_rate": 7.904191616766467e-08, "logits/chosen": -2.1741416454315186, "logits/rejected": -2.170304298400879, "logps/chosen": -20.838272094726562, "logps/rejected": -8.434471130371094, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.05414943769574165, "rewards/margins": 0.00047969818115234375, "rewards/rejected": 0.05366973951458931, "step": 132 }, { "epoch": 0.07, "learning_rate": 7.964071856287424e-08, "logits/chosen": -2.01043701171875, "logits/rejected": -2.011410713195801, "logps/chosen": -9.470027923583984, "logps/rejected": -9.206988334655762, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.05468320846557617, "rewards/margins": 0.020734023302793503, "rewards/rejected": 0.03394918516278267, "step": 133 }, { "epoch": 0.07, "learning_rate": 8.023952095808383e-08, "logits/chosen": -2.1517410278320312, "logits/rejected": -2.1627399921417236, "logps/chosen": -12.455161094665527, "logps/rejected": -15.629027366638184, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.04838981851935387, "rewards/margins": 0.0066565535962581635, "rewards/rejected": 0.0417332649230957, "step": 134 }, { "epoch": 0.07, "learning_rate": 8.083832335329341e-08, "logits/chosen": -2.152024030685425, "logits/rejected": -2.1369447708129883, "logps/chosen": -26.75043296813965, "logps/rejected": -6.501386642456055, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.10837020725011826, "rewards/margins": 0.0657368153333664, "rewards/rejected": 0.04263339191675186, "step": 135 }, { "epoch": 0.07, "learning_rate": 8.143712574850298e-08, "logits/chosen": -2.2027642726898193, "logits/rejected": -2.196291446685791, "logps/chosen": -18.991580963134766, "logps/rejected": -9.74489974975586, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.03951912000775337, "rewards/margins": -0.0022552497684955597, "rewards/rejected": 0.04177436977624893, "step": 136 }, { "epoch": 0.07, "learning_rate": 8.203592814371257e-08, "logits/chosen": -2.032248020172119, "logits/rejected": -2.0326883792877197, "logps/chosen": -8.71728229522705, "logps/rejected": -7.287235260009766, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.06083860620856285, "rewards/margins": 0.006669379770755768, "rewards/rejected": 0.05416922643780708, "step": 137 }, { "epoch": 0.07, "learning_rate": 8.263473053892214e-08, "logits/chosen": -2.206101417541504, "logits/rejected": -2.338489055633545, "logps/chosen": -8.545544624328613, "logps/rejected": -8.163434982299805, "loss": 0.6977, "rewards/accuracies": 0.0, "rewards/chosen": 0.042513180524110794, "rewards/margins": -0.009169675409793854, "rewards/rejected": 0.05168285593390465, "step": 138 }, { "epoch": 0.07, "learning_rate": 8.323353293413173e-08, "logits/chosen": -2.0271060466766357, "logits/rejected": -2.28347110748291, "logps/chosen": -9.21231460571289, "logps/rejected": -8.940032958984375, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.058467961847782135, "rewards/margins": -0.0076924338936805725, "rewards/rejected": 0.06616039574146271, "step": 139 }, { "epoch": 0.08, "learning_rate": 8.38323353293413e-08, "logits/chosen": -2.14156436920166, "logits/rejected": -2.313347816467285, "logps/chosen": -11.526708602905273, "logps/rejected": -11.302858352661133, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.05373039469122887, "rewards/margins": -0.005643460899591446, "rewards/rejected": 0.05937385559082031, "step": 140 }, { "epoch": 0.08, "learning_rate": 8.443113772455089e-08, "logits/chosen": -2.0846893787384033, "logits/rejected": -2.09494948387146, "logps/chosen": -14.942212104797363, "logps/rejected": -8.84738826751709, "loss": 0.6642, "rewards/accuracies": 1.0, "rewards/chosen": 0.09298162907361984, "rewards/margins": 0.05873747169971466, "rewards/rejected": 0.03424415737390518, "step": 141 }, { "epoch": 0.08, "learning_rate": 8.502994011976047e-08, "logits/chosen": -2.1679775714874268, "logits/rejected": -2.1689209938049316, "logps/chosen": -13.379714012145996, "logps/rejected": -8.191315650939941, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.09326773136854172, "rewards/margins": 0.020836547017097473, "rewards/rejected": 0.07243118435144424, "step": 142 }, { "epoch": 0.08, "learning_rate": 8.562874251497005e-08, "logits/chosen": -1.9822757244110107, "logits/rejected": -1.9898631572723389, "logps/chosen": -10.129500389099121, "logps/rejected": -8.225820541381836, "loss": 0.6655, "rewards/accuracies": 1.0, "rewards/chosen": 0.10942373424768448, "rewards/margins": 0.056182004511356354, "rewards/rejected": 0.053241729736328125, "step": 143 }, { "epoch": 0.08, "learning_rate": 8.622754491017963e-08, "logits/chosen": -2.1870508193969727, "logits/rejected": -2.288431406021118, "logps/chosen": -10.305020332336426, "logps/rejected": -9.1925048828125, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.08512315899133682, "rewards/margins": 0.029216479510068893, "rewards/rejected": 0.05590667948126793, "step": 144 }, { "epoch": 0.08, "learning_rate": 8.682634730538922e-08, "logits/chosen": -2.202906370162964, "logits/rejected": -2.2003421783447266, "logps/chosen": -11.083683013916016, "logps/rejected": -8.208761215209961, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.05009717866778374, "rewards/margins": 0.002966593950986862, "rewards/rejected": 0.047130584716796875, "step": 145 }, { "epoch": 0.08, "learning_rate": 8.742514970059879e-08, "logits/chosen": -2.248631238937378, "logits/rejected": -2.3758420944213867, "logps/chosen": -10.291163444519043, "logps/rejected": -10.185615539550781, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.06792908161878586, "rewards/margins": 0.008155062794685364, "rewards/rejected": 0.059774018824100494, "step": 146 }, { "epoch": 0.08, "learning_rate": 8.802395209580838e-08, "logits/chosen": -2.077632427215576, "logits/rejected": -2.327742576599121, "logps/chosen": -10.12020492553711, "logps/rejected": -10.686904907226562, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.061307527124881744, "rewards/margins": -0.0034301728010177612, "rewards/rejected": 0.0647376999258995, "step": 147 }, { "epoch": 0.08, "learning_rate": 8.862275449101797e-08, "logits/chosen": -2.1063790321350098, "logits/rejected": -2.3640456199645996, "logps/chosen": -9.880207061767578, "logps/rejected": -10.031105041503906, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.08783721923828125, "rewards/margins": -0.004475973546504974, "rewards/rejected": 0.09231319278478622, "step": 148 }, { "epoch": 0.08, "learning_rate": 8.922155688622755e-08, "logits/chosen": -2.1527926921844482, "logits/rejected": -2.1585452556610107, "logps/chosen": -11.425393104553223, "logps/rejected": -8.14310073852539, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.08627519756555557, "rewards/margins": 0.03348741680383682, "rewards/rejected": 0.05278778076171875, "step": 149 }, { "epoch": 0.08, "learning_rate": 8.982035928143712e-08, "logits/chosen": -2.2239878177642822, "logits/rejected": -2.2313570976257324, "logps/chosen": -8.430689811706543, "logps/rejected": -7.693068504333496, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.07420101016759872, "rewards/margins": 0.021687697619199753, "rewards/rejected": 0.05251331254839897, "step": 150 }, { "epoch": 0.08, "learning_rate": 9.041916167664671e-08, "logits/chosen": -2.249321937561035, "logits/rejected": -2.2486352920532227, "logps/chosen": -15.138116836547852, "logps/rejected": -8.787635803222656, "loss": 0.6553, "rewards/accuracies": 1.0, "rewards/chosen": 0.12239513546228409, "rewards/margins": 0.07714233547449112, "rewards/rejected": 0.04525279998779297, "step": 151 }, { "epoch": 0.08, "learning_rate": 9.101796407185628e-08, "logits/chosen": -2.1549232006073, "logits/rejected": -2.152503252029419, "logps/chosen": -19.18927001953125, "logps/rejected": -9.291912078857422, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.09357452392578125, "rewards/margins": 0.031009294092655182, "rewards/rejected": 0.06256522983312607, "step": 152 }, { "epoch": 0.08, "learning_rate": 9.161676646706587e-08, "logits/chosen": -2.103344678878784, "logits/rejected": -2.3614745140075684, "logps/chosen": -18.311717987060547, "logps/rejected": -17.125774383544922, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.07336483150720596, "rewards/margins": -0.010488703846931458, "rewards/rejected": 0.08385353535413742, "step": 153 }, { "epoch": 0.08, "learning_rate": 9.221556886227546e-08, "logits/chosen": -2.1575098037719727, "logits/rejected": -2.3008973598480225, "logps/chosen": -9.447405815124512, "logps/rejected": -9.156623840332031, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.08062639087438583, "rewards/margins": -0.01490812748670578, "rewards/rejected": 0.09553451836109161, "step": 154 }, { "epoch": 0.08, "learning_rate": 9.281437125748503e-08, "logits/chosen": -2.1063547134399414, "logits/rejected": -2.318528413772583, "logps/chosen": -7.851142883300781, "logps/rejected": -7.911672115325928, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.04631691053509712, "rewards/margins": -0.003138303756713867, "rewards/rejected": 0.04945521429181099, "step": 155 }, { "epoch": 0.08, "learning_rate": 9.341317365269461e-08, "logits/chosen": -2.129100799560547, "logits/rejected": -2.1290507316589355, "logps/chosen": -16.188806533813477, "logps/rejected": -9.358833312988281, "loss": 0.642, "rewards/accuracies": 1.0, "rewards/chosen": 0.15730153024196625, "rewards/margins": 0.10495597124099731, "rewards/rejected": 0.05234556272625923, "step": 156 }, { "epoch": 0.08, "learning_rate": 9.401197604790419e-08, "logits/chosen": -2.1303539276123047, "logits/rejected": -2.2643582820892334, "logps/chosen": -12.310507774353027, "logps/rejected": -12.453755378723145, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.10029001533985138, "rewards/margins": -0.009548284113407135, "rewards/rejected": 0.10983829945325851, "step": 157 }, { "epoch": 0.09, "learning_rate": 9.461077844311377e-08, "logits/chosen": -2.1362664699554443, "logits/rejected": -2.14245343208313, "logps/chosen": -14.186534881591797, "logps/rejected": -21.82668113708496, "loss": 0.6639, "rewards/accuracies": 1.0, "rewards/chosen": 0.12171106785535812, "rewards/margins": 0.0592925101518631, "rewards/rejected": 0.062418557703495026, "step": 158 }, { "epoch": 0.09, "learning_rate": 9.520958083832335e-08, "logits/chosen": -2.099060535430908, "logits/rejected": -2.1070165634155273, "logps/chosen": -14.925067901611328, "logps/rejected": -8.055194854736328, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.1128467544913292, "rewards/margins": 0.03741855174303055, "rewards/rejected": 0.07542820274829865, "step": 159 }, { "epoch": 0.09, "learning_rate": 9.580838323353293e-08, "logits/chosen": -2.0945751667022705, "logits/rejected": -2.301788091659546, "logps/chosen": -10.780511856079102, "logps/rejected": -10.572575569152832, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.11332760006189346, "rewards/margins": 0.011789701879024506, "rewards/rejected": 0.10153789818286896, "step": 160 }, { "epoch": 0.09, "learning_rate": 9.640718562874252e-08, "logits/chosen": -2.057724714279175, "logits/rejected": -2.092217445373535, "logps/chosen": -15.43944263458252, "logps/rejected": -14.2000732421875, "loss": 0.6486, "rewards/accuracies": 1.0, "rewards/chosen": 0.12882986664772034, "rewards/margins": 0.09118376672267914, "rewards/rejected": 0.0376461036503315, "step": 161 }, { "epoch": 0.09, "learning_rate": 9.700598802395209e-08, "logits/chosen": -2.0978596210479736, "logits/rejected": -2.292423963546753, "logps/chosen": -8.991631507873535, "logps/rejected": -8.852952003479004, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.08672752231359482, "rewards/margins": -0.0006947517395019531, "rewards/rejected": 0.08742227405309677, "step": 162 }, { "epoch": 0.09, "learning_rate": 9.760479041916168e-08, "logits/chosen": -2.058906316757202, "logits/rejected": -2.066056966781616, "logps/chosen": -9.32284164428711, "logps/rejected": -8.292828559875488, "loss": 0.6607, "rewards/accuracies": 1.0, "rewards/chosen": 0.12386150658130646, "rewards/margins": 0.06605682522058487, "rewards/rejected": 0.05780468136072159, "step": 163 }, { "epoch": 0.09, "learning_rate": 9.820359281437125e-08, "logits/chosen": -2.0965912342071533, "logits/rejected": -2.0947248935699463, "logps/chosen": -10.635886192321777, "logps/rejected": -9.001676559448242, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.10577154159545898, "rewards/margins": 0.026578135788440704, "rewards/rejected": 0.07919340580701828, "step": 164 }, { "epoch": 0.09, "learning_rate": 9.880239520958084e-08, "logits/chosen": -2.1576006412506104, "logits/rejected": -2.3073666095733643, "logps/chosen": -9.321231842041016, "logps/rejected": -9.085103034973145, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.12324104458093643, "rewards/margins": 0.009411051869392395, "rewards/rejected": 0.11382999271154404, "step": 165 }, { "epoch": 0.09, "learning_rate": 9.940119760479041e-08, "logits/chosen": -2.156313896179199, "logits/rejected": -2.1633188724517822, "logps/chosen": -24.860206604003906, "logps/rejected": -7.915232181549072, "loss": 0.6527, "rewards/accuracies": 1.0, "rewards/chosen": 0.15181580185890198, "rewards/margins": 0.08262210339307785, "rewards/rejected": 0.06919369846582413, "step": 166 }, { "epoch": 0.09, "learning_rate": 1e-07, "logits/chosen": -2.083913564682007, "logits/rejected": -2.2652132511138916, "logps/chosen": -7.93259334564209, "logps/rejected": -7.806826591491699, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.08560638874769211, "rewards/margins": 0.015200614929199219, "rewards/rejected": 0.07040577381849289, "step": 167 }, { "epoch": 0.09, "learning_rate": 9.999999152271383e-08, "logits/chosen": -2.236755132675171, "logits/rejected": -2.2444300651550293, "logps/chosen": -14.674066543579102, "logps/rejected": -8.212796211242676, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.1444379836320877, "rewards/margins": 0.03863334655761719, "rewards/rejected": 0.10580463707447052, "step": 168 }, { "epoch": 0.09, "learning_rate": 9.999996609085821e-08, "logits/chosen": -2.050854444503784, "logits/rejected": -2.0504746437072754, "logps/chosen": -17.193763732910156, "logps/rejected": -8.540130615234375, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 0.1378767043352127, "rewards/margins": 0.07764224708080292, "rewards/rejected": 0.06023445352911949, "step": 169 }, { "epoch": 0.09, "learning_rate": 9.999992370444176e-08, "logits/chosen": -2.1369106769561768, "logits/rejected": -2.137510061264038, "logps/chosen": -15.437736511230469, "logps/rejected": -12.11609935760498, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 0.15796279907226562, "rewards/margins": 0.0756431594491005, "rewards/rejected": 0.08231963962316513, "step": 170 }, { "epoch": 0.09, "learning_rate": 9.999986436347885e-08, "logits/chosen": -2.0931618213653564, "logits/rejected": -2.397494316101074, "logps/chosen": -8.329607009887695, "logps/rejected": -8.460824012756348, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.11236371845006943, "rewards/margins": -0.013082705438137054, "rewards/rejected": 0.12544642388820648, "step": 171 }, { "epoch": 0.09, "learning_rate": 9.999978806798958e-08, "logits/chosen": -2.107243776321411, "logits/rejected": -2.1154887676239014, "logps/chosen": -18.211414337158203, "logps/rejected": -13.709915161132812, "loss": 0.6609, "rewards/accuracies": 1.0, "rewards/chosen": 0.17435359954833984, "rewards/margins": 0.06547413021326065, "rewards/rejected": 0.1088794693350792, "step": 172 }, { "epoch": 0.09, "learning_rate": 9.999969481799988e-08, "logits/chosen": -2.239475727081299, "logits/rejected": -2.239922523498535, "logps/chosen": -17.04981231689453, "logps/rejected": -13.062705039978027, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.15963439643383026, "rewards/margins": 0.034113019704818726, "rewards/rejected": 0.12552137672901154, "step": 173 }, { "epoch": 0.09, "learning_rate": 9.999958461354132e-08, "logits/chosen": -2.0948851108551025, "logits/rejected": -2.3454794883728027, "logps/chosen": -8.529027938842773, "logps/rejected": -8.642244338989258, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.1387193650007248, "rewards/margins": 0.026437371969223022, "rewards/rejected": 0.11228199303150177, "step": 174 }, { "epoch": 0.09, "learning_rate": 9.999945745465128e-08, "logits/chosen": -2.1944663524627686, "logits/rejected": -2.335191011428833, "logps/chosen": -12.02459716796875, "logps/rejected": -11.681771278381348, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.12108641117811203, "rewards/margins": 0.001941874623298645, "rewards/rejected": 0.11914453655481339, "step": 175 }, { "epoch": 0.09, "learning_rate": 9.999931334137288e-08, "logits/chosen": -2.125621795654297, "logits/rejected": -2.3018062114715576, "logps/chosen": -13.757869720458984, "logps/rejected": -13.320724487304688, "loss": 0.7038, "rewards/accuracies": 0.0, "rewards/chosen": 0.1172858253121376, "rewards/margins": -0.021256066858768463, "rewards/rejected": 0.13854189217090607, "step": 176 }, { "epoch": 0.1, "learning_rate": 9.9999152273755e-08, "logits/chosen": -2.1820578575134277, "logits/rejected": -2.224766254425049, "logps/chosen": -16.976957321166992, "logps/rejected": -29.103992462158203, "loss": 0.6207, "rewards/accuracies": 1.0, "rewards/chosen": 0.18329505622386932, "rewards/margins": 0.15062636137008667, "rewards/rejected": 0.03266868740320206, "step": 177 }, { "epoch": 0.1, "learning_rate": 9.999897425185224e-08, "logits/chosen": -2.2175393104553223, "logits/rejected": -2.3628852367401123, "logps/chosen": -16.398832321166992, "logps/rejected": -10.158283233642578, "loss": 0.7297, "rewards/accuracies": 0.0, "rewards/chosen": 0.08623657375574112, "rewards/margins": -0.07184333354234695, "rewards/rejected": 0.15807990729808807, "step": 178 }, { "epoch": 0.1, "learning_rate": 9.999877927572497e-08, "logits/chosen": -2.2045276165008545, "logits/rejected": -2.2090938091278076, "logps/chosen": -9.761396408081055, "logps/rejected": -7.512382507324219, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.1600017547607422, "rewards/margins": 0.03832082450389862, "rewards/rejected": 0.12168093025684357, "step": 179 }, { "epoch": 0.1, "learning_rate": 9.999856734543932e-08, "logits/chosen": -2.237004041671753, "logits/rejected": -2.236248731613159, "logps/chosen": -11.357705116271973, "logps/rejected": -9.679445266723633, "loss": 0.6477, "rewards/accuracies": 1.0, "rewards/chosen": 0.19422779977321625, "rewards/margins": 0.09302950650453568, "rewards/rejected": 0.10119829326868057, "step": 180 }, { "epoch": 0.1, "learning_rate": 9.999833846106713e-08, "logits/chosen": -2.1796233654022217, "logits/rejected": -2.1726064682006836, "logps/chosen": -14.554512023925781, "logps/rejected": -12.55143928527832, "loss": 0.6464, "rewards/accuracies": 1.0, "rewards/chosen": 0.19154834747314453, "rewards/margins": 0.09579410403966904, "rewards/rejected": 0.0957542434334755, "step": 181 }, { "epoch": 0.1, "learning_rate": 9.999809262268602e-08, "logits/chosen": -2.025388717651367, "logits/rejected": -2.0088534355163574, "logps/chosen": -24.17774200439453, "logps/rejected": -12.265125274658203, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.13920174539089203, "rewards/margins": -0.01213015615940094, "rewards/rejected": 0.15133190155029297, "step": 182 }, { "epoch": 0.1, "learning_rate": 9.999782983037936e-08, "logits/chosen": -2.043602705001831, "logits/rejected": -2.2926249504089355, "logps/chosen": -7.218709945678711, "logps/rejected": -7.0443854331970215, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.13404951989650726, "rewards/margins": -0.014859244227409363, "rewards/rejected": 0.14890876412391663, "step": 183 }, { "epoch": 0.1, "learning_rate": 9.999755008423626e-08, "logits/chosen": -1.9986425638198853, "logits/rejected": -1.999616026878357, "logps/chosen": -8.506755828857422, "logps/rejected": -6.7361860275268555, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.2014923095703125, "rewards/margins": 0.015917018055915833, "rewards/rejected": 0.18557529151439667, "step": 184 }, { "epoch": 0.1, "learning_rate": 9.999725338435156e-08, "logits/chosen": -2.0920143127441406, "logits/rejected": -2.2707667350769043, "logps/chosen": -8.326848983764648, "logps/rejected": -19.10015869140625, "loss": 0.6575, "rewards/accuracies": 1.0, "rewards/chosen": 0.15364590287208557, "rewards/margins": 0.0725502073764801, "rewards/rejected": 0.08109569549560547, "step": 185 }, { "epoch": 0.1, "learning_rate": 9.99969397308259e-08, "logits/chosen": -2.1107003688812256, "logits/rejected": -2.3113274574279785, "logps/chosen": -10.778020858764648, "logps/rejected": -10.774696350097656, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.14597168564796448, "rewards/margins": -0.00015135109424591064, "rewards/rejected": 0.1461230367422104, "step": 186 }, { "epoch": 0.1, "learning_rate": 9.999660912376561e-08, "logits/chosen": -2.061099052429199, "logits/rejected": -2.2882421016693115, "logps/chosen": -8.664192199707031, "logps/rejected": -8.67059326171875, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.2050189971923828, "rewards/margins": 0.01187075674533844, "rewards/rejected": 0.19314824044704437, "step": 187 }, { "epoch": 0.1, "learning_rate": 9.999626156328282e-08, "logits/chosen": -2.294947624206543, "logits/rejected": -2.3688108921051025, "logps/chosen": -8.969539642333984, "logps/rejected": -9.010223388671875, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.1021307036280632, "rewards/margins": 0.011194422841072083, "rewards/rejected": 0.09093628078699112, "step": 188 }, { "epoch": 0.1, "learning_rate": 9.999589704949537e-08, "logits/chosen": -2.249804973602295, "logits/rejected": -2.2496888637542725, "logps/chosen": -7.300312519073486, "logps/rejected": -10.98077392578125, "loss": 0.6334, "rewards/accuracies": 1.0, "rewards/chosen": 0.22870592772960663, "rewards/margins": 0.12324919551610947, "rewards/rejected": 0.10545673221349716, "step": 189 }, { "epoch": 0.1, "learning_rate": 9.999551558252685e-08, "logits/chosen": -2.1494626998901367, "logits/rejected": -2.2128353118896484, "logps/chosen": -7.525479793548584, "logps/rejected": -25.09557342529297, "loss": 0.5818, "rewards/accuracies": 1.0, "rewards/chosen": 0.24764256179332733, "rewards/margins": 0.23656105995178223, "rewards/rejected": 0.011081504635512829, "step": 190 }, { "epoch": 0.1, "learning_rate": 9.999511716250664e-08, "logits/chosen": -2.074781656265259, "logits/rejected": -2.2676236629486084, "logps/chosen": -8.149369239807129, "logps/rejected": -7.723587989807129, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.17569951713085175, "rewards/margins": -0.007736504077911377, "rewards/rejected": 0.18343602120876312, "step": 191 }, { "epoch": 0.1, "learning_rate": 9.999470178956981e-08, "logits/chosen": -2.132338762283325, "logits/rejected": -2.3055191040039062, "logps/chosen": -7.26273775100708, "logps/rejected": -6.809706687927246, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.19810481369495392, "rewards/margins": 0.005903199315071106, "rewards/rejected": 0.1922016143798828, "step": 192 }, { "epoch": 0.1, "learning_rate": 9.999426946385725e-08, "logits/chosen": -2.238467216491699, "logits/rejected": -2.2407562732696533, "logps/chosen": -7.57138729095459, "logps/rejected": -6.91178560256958, "loss": 0.6645, "rewards/accuracies": 1.0, "rewards/chosen": 0.1999107450246811, "rewards/margins": 0.05813613533973694, "rewards/rejected": 0.14177460968494415, "step": 193 }, { "epoch": 0.1, "learning_rate": 9.999382018551553e-08, "logits/chosen": -2.222426652908325, "logits/rejected": -2.222764492034912, "logps/chosen": -7.2940168380737305, "logps/rejected": -7.2171854972839355, "loss": 0.6149, "rewards/accuracies": 1.0, "rewards/chosen": 0.24109335243701935, "rewards/margins": 0.1632283627986908, "rewards/rejected": 0.07786498218774796, "step": 194 }, { "epoch": 0.11, "learning_rate": 9.999335395469699e-08, "logits/chosen": -2.2043728828430176, "logits/rejected": -2.2096166610717773, "logps/chosen": -8.698447227478027, "logps/rejected": -10.06596565246582, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.1669604331254959, "rewards/margins": 0.022857576608657837, "rewards/rejected": 0.14410285651683807, "step": 195 }, { "epoch": 0.11, "learning_rate": 9.999287077155976e-08, "logits/chosen": -2.09234881401062, "logits/rejected": -2.1301937103271484, "logps/chosen": -18.96909523010254, "logps/rejected": -12.978675842285156, "loss": 0.6621, "rewards/accuracies": 1.0, "rewards/chosen": 0.18784771859645844, "rewards/margins": 0.06305771321058273, "rewards/rejected": 0.1247900053858757, "step": 196 }, { "epoch": 0.11, "learning_rate": 9.999237063626764e-08, "logits/chosen": -2.0182011127471924, "logits/rejected": -2.0198397636413574, "logps/chosen": -15.530750274658203, "logps/rejected": -7.631843566894531, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.19790764153003693, "rewards/margins": -0.01422920823097229, "rewards/rejected": 0.21213684976100922, "step": 197 }, { "epoch": 0.11, "learning_rate": 9.999185354899025e-08, "logits/chosen": -2.190065383911133, "logits/rejected": -2.300408363342285, "logps/chosen": -7.774911880493164, "logps/rejected": -12.619563102722168, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.19877319037914276, "rewards/margins": 0.042535483837127686, "rewards/rejected": 0.15623770654201508, "step": 198 }, { "epoch": 0.11, "learning_rate": 9.999131950990292e-08, "logits/chosen": -2.0457427501678467, "logits/rejected": -2.3123137950897217, "logps/chosen": -7.358709335327148, "logps/rejected": -7.374157428741455, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.2579154968261719, "rewards/margins": 0.023533672094345093, "rewards/rejected": 0.23438182473182678, "step": 199 }, { "epoch": 0.11, "learning_rate": 9.999076851918673e-08, "logits/chosen": -2.1917076110839844, "logits/rejected": -2.176229238510132, "logps/chosen": -23.581459045410156, "logps/rejected": -8.284305572509766, "loss": 0.5761, "rewards/accuracies": 1.0, "rewards/chosen": 0.34785348176956177, "rewards/margins": 0.24965927004814148, "rewards/rejected": 0.09819421917200089, "step": 200 }, { "epoch": 0.11, "learning_rate": 9.999020057702854e-08, "logits/chosen": -2.077150821685791, "logits/rejected": -2.0600881576538086, "logps/chosen": -17.8856201171875, "logps/rejected": -8.242608070373535, "loss": 0.6512, "rewards/accuracies": 1.0, "rewards/chosen": 0.20038071274757385, "rewards/margins": 0.08582039177417755, "rewards/rejected": 0.1145603209733963, "step": 201 }, { "epoch": 0.11, "learning_rate": 9.99896156836209e-08, "logits/chosen": -2.0862200260162354, "logits/rejected": -2.101123332977295, "logps/chosen": -12.198607444763184, "logps/rejected": -12.189401626586914, "loss": 0.6745, "rewards/accuracies": 1.0, "rewards/chosen": 0.19592523574829102, "rewards/margins": 0.037657931447029114, "rewards/rejected": 0.1582673043012619, "step": 202 }, { "epoch": 0.11, "learning_rate": 9.998901383916219e-08, "logits/chosen": -1.9986311197280884, "logits/rejected": -2.294386625289917, "logps/chosen": -7.318279266357422, "logps/rejected": -7.066373348236084, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.20036545395851135, "rewards/margins": 0.015282973647117615, "rewards/rejected": 0.18508248031139374, "step": 203 }, { "epoch": 0.11, "learning_rate": 9.998839504385644e-08, "logits/chosen": -2.0693304538726807, "logits/rejected": -2.276110887527466, "logps/chosen": -8.108627319335938, "logps/rejected": -8.210007667541504, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.25107547640800476, "rewards/margins": 0.002300277352333069, "rewards/rejected": 0.2487751990556717, "step": 204 }, { "epoch": 0.11, "learning_rate": 9.998775929791353e-08, "logits/chosen": -2.037731647491455, "logits/rejected": -2.0378427505493164, "logps/chosen": -10.332984924316406, "logps/rejected": -8.971658706665039, "loss": 0.7293, "rewards/accuracies": 0.0, "rewards/chosen": 0.17929582297801971, "rewards/margins": -0.07105846703052521, "rewards/rejected": 0.2503542900085449, "step": 205 }, { "epoch": 0.11, "learning_rate": 9.998710660154896e-08, "logits/chosen": -2.235109329223633, "logits/rejected": -2.2603743076324463, "logps/chosen": -34.21550369262695, "logps/rejected": -25.5551815032959, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.18124733865261078, "rewards/margins": 0.04847927391529083, "rewards/rejected": 0.13276806473731995, "step": 206 }, { "epoch": 0.11, "learning_rate": 9.998643695498415e-08, "logits/chosen": -2.131469964981079, "logits/rejected": -2.316901683807373, "logps/chosen": -8.05000114440918, "logps/rejected": -7.671285152435303, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.1893966645002365, "rewards/margins": -0.008487939834594727, "rewards/rejected": 0.19788460433483124, "step": 207 }, { "epoch": 0.11, "learning_rate": 9.998575035844609e-08, "logits/chosen": -2.1065280437469482, "logits/rejected": -2.0993010997772217, "logps/chosen": -11.087653160095215, "logps/rejected": -8.194300651550293, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 0.18690024316310883, "rewards/margins": -0.020561978220939636, "rewards/rejected": 0.20746222138404846, "step": 208 }, { "epoch": 0.11, "learning_rate": 9.998504681216764e-08, "logits/chosen": -2.114325761795044, "logits/rejected": -2.2779881954193115, "logps/chosen": -12.840688705444336, "logps/rejected": -9.705984115600586, "loss": 0.7066, "rewards/accuracies": 0.0, "rewards/chosen": 0.21430455148220062, "rewards/margins": -0.026820838451385498, "rewards/rejected": 0.24112538993358612, "step": 209 }, { "epoch": 0.11, "learning_rate": 9.998432631638737e-08, "logits/chosen": -2.010284662246704, "logits/rejected": -2.0230014324188232, "logps/chosen": -13.435134887695312, "logps/rejected": -13.55058479309082, "loss": 0.6358, "rewards/accuracies": 1.0, "rewards/chosen": 0.20929089188575745, "rewards/margins": 0.11816368252038956, "rewards/rejected": 0.09112720936536789, "step": 210 }, { "epoch": 0.11, "learning_rate": 9.998358887134956e-08, "logits/chosen": -2.1424875259399414, "logits/rejected": -2.157200574874878, "logps/chosen": -19.098472595214844, "logps/rejected": -9.9305419921875, "loss": 0.6268, "rewards/accuracies": 1.0, "rewards/chosen": 0.31831303238868713, "rewards/margins": 0.13746432960033417, "rewards/rejected": 0.18084870278835297, "step": 211 }, { "epoch": 0.11, "learning_rate": 9.99828344773043e-08, "logits/chosen": -2.1213719844818115, "logits/rejected": -2.1131210327148438, "logps/chosen": -9.22949504852295, "logps/rejected": -8.270204544067383, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.2347085028886795, "rewards/margins": 0.04056844115257263, "rewards/rejected": 0.19414006173610687, "step": 212 }, { "epoch": 0.11, "learning_rate": 9.998206313450741e-08, "logits/chosen": -2.1388864517211914, "logits/rejected": -2.1099634170532227, "logps/chosen": -29.635940551757812, "logps/rejected": -8.027835845947266, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.18257160484790802, "rewards/margins": -0.009460926055908203, "rewards/rejected": 0.19203253090381622, "step": 213 }, { "epoch": 0.12, "learning_rate": 9.998127484322041e-08, "logits/chosen": -2.1816108226776123, "logits/rejected": -2.1844050884246826, "logps/chosen": -9.447705268859863, "logps/rejected": -7.94569206237793, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 0.17654180526733398, "rewards/margins": 0.056731224060058594, "rewards/rejected": 0.11981058120727539, "step": 214 }, { "epoch": 0.12, "learning_rate": 9.998046960371063e-08, "logits/chosen": -2.127737283706665, "logits/rejected": -2.134955406188965, "logps/chosen": -11.05094051361084, "logps/rejected": -7.780493259429932, "loss": 0.6231, "rewards/accuracies": 1.0, "rewards/chosen": 0.32230398058891296, "rewards/margins": 0.1454748809337616, "rewards/rejected": 0.17682909965515137, "step": 215 }, { "epoch": 0.12, "learning_rate": 9.99796474162511e-08, "logits/chosen": -2.0884597301483154, "logits/rejected": -2.2833242416381836, "logps/chosen": -10.924288749694824, "logps/rejected": -8.401596069335938, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.21290789544582367, "rewards/margins": 0.02237434685230255, "rewards/rejected": 0.19053354859352112, "step": 216 }, { "epoch": 0.12, "learning_rate": 9.997880828112064e-08, "logits/chosen": -2.101409912109375, "logits/rejected": -2.104218006134033, "logps/chosen": -12.169754028320312, "logps/rejected": -8.288612365722656, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 0.2663228213787079, "rewards/margins": 0.05669508874416351, "rewards/rejected": 0.20962773263454437, "step": 217 }, { "epoch": 0.12, "learning_rate": 9.997795219860377e-08, "logits/chosen": -2.20501708984375, "logits/rejected": -2.1893980503082275, "logps/chosen": -26.235530853271484, "logps/rejected": -7.954805850982666, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.27333107590675354, "rewards/margins": -0.02518695592880249, "rewards/rejected": 0.29851803183555603, "step": 218 }, { "epoch": 0.12, "learning_rate": 9.997707916899078e-08, "logits/chosen": -2.1808183193206787, "logits/rejected": -2.29939866065979, "logps/chosen": -10.26947021484375, "logps/rejected": -13.904784202575684, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.24439440667629242, "rewards/margins": 0.0003389418125152588, "rewards/rejected": 0.24405546486377716, "step": 219 }, { "epoch": 0.12, "learning_rate": 9.997618919257774e-08, "logits/chosen": -2.173717975616455, "logits/rejected": -2.2239558696746826, "logps/chosen": -26.296161651611328, "logps/rejected": -26.42555809020996, "loss": 0.7151, "rewards/accuracies": 0.0, "rewards/chosen": 0.20660439133644104, "rewards/margins": -0.04352244734764099, "rewards/rejected": 0.25012683868408203, "step": 220 }, { "epoch": 0.12, "learning_rate": 9.99752822696664e-08, "logits/chosen": -2.0848898887634277, "logits/rejected": -2.0929224491119385, "logps/chosen": -8.43840217590332, "logps/rejected": -6.809311866760254, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 0.34167787432670593, "rewards/margins": 0.1433451771736145, "rewards/rejected": 0.19833269715309143, "step": 221 }, { "epoch": 0.12, "learning_rate": 9.99743584005643e-08, "logits/chosen": -2.1459906101226807, "logits/rejected": -2.307655096054077, "logps/chosen": -16.144508361816406, "logps/rejected": -7.822479724884033, "loss": 0.7361, "rewards/accuracies": 0.0, "rewards/chosen": 0.15908508002758026, "rewards/margins": -0.08422160148620605, "rewards/rejected": 0.24330668151378632, "step": 222 }, { "epoch": 0.12, "learning_rate": 9.997341758558471e-08, "logits/chosen": -2.1443936824798584, "logits/rejected": -2.349841594696045, "logps/chosen": -10.704084396362305, "logps/rejected": -10.694997787475586, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.25228291749954224, "rewards/margins": -0.003841102123260498, "rewards/rejected": 0.25612401962280273, "step": 223 }, { "epoch": 0.12, "learning_rate": 9.997245982504667e-08, "logits/chosen": -2.2279906272888184, "logits/rejected": -2.2370190620422363, "logps/chosen": -12.74058723449707, "logps/rejected": -6.60615873336792, "loss": 0.6306, "rewards/accuracies": 1.0, "rewards/chosen": 0.3415535092353821, "rewards/margins": 0.12935081124305725, "rewards/rejected": 0.21220269799232483, "step": 224 }, { "epoch": 0.12, "learning_rate": 9.997148511927493e-08, "logits/chosen": -2.0902187824249268, "logits/rejected": -2.2963156700134277, "logps/chosen": -7.601818561553955, "logps/rejected": -7.27847146987915, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.3045210540294647, "rewards/margins": 0.011489778757095337, "rewards/rejected": 0.2930312752723694, "step": 225 }, { "epoch": 0.12, "learning_rate": 9.997049346860003e-08, "logits/chosen": -2.090674638748169, "logits/rejected": -2.0963621139526367, "logps/chosen": -9.909343719482422, "logps/rejected": -6.4641265869140625, "loss": 0.6296, "rewards/accuracies": 1.0, "rewards/chosen": 0.3706524074077606, "rewards/margins": 0.13146965205669403, "rewards/rejected": 0.2391827553510666, "step": 226 }, { "epoch": 0.12, "learning_rate": 9.996948487335818e-08, "logits/chosen": -2.0933218002319336, "logits/rejected": -2.260032892227173, "logps/chosen": -9.675145149230957, "logps/rejected": -10.877883911132812, "loss": 0.7149, "rewards/accuracies": 0.0, "rewards/chosen": 0.2524906098842621, "rewards/margins": -0.04297983646392822, "rewards/rejected": 0.2954704463481903, "step": 227 }, { "epoch": 0.12, "learning_rate": 9.996845933389144e-08, "logits/chosen": -2.1625561714172363, "logits/rejected": -2.1655702590942383, "logps/chosen": -7.214556694030762, "logps/rejected": -7.977644920349121, "loss": 0.6376, "rewards/accuracies": 1.0, "rewards/chosen": 0.22832803428173065, "rewards/margins": 0.1143336221575737, "rewards/rejected": 0.11399441212415695, "step": 228 }, { "epoch": 0.12, "learning_rate": 9.996741685054755e-08, "logits/chosen": -2.2256410121917725, "logits/rejected": -2.3678383827209473, "logps/chosen": -10.051511764526367, "logps/rejected": -10.070146560668945, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.19467706978321075, "rewards/margins": -0.0036745965480804443, "rewards/rejected": 0.1983516663312912, "step": 229 }, { "epoch": 0.12, "learning_rate": 9.996635742367999e-08, "logits/chosen": -2.207920551300049, "logits/rejected": -2.207484722137451, "logps/chosen": -8.475793838500977, "logps/rejected": -8.797066688537598, "loss": 0.649, "rewards/accuracies": 1.0, "rewards/chosen": 0.2845284640789032, "rewards/margins": 0.09028618037700653, "rewards/rejected": 0.19424228370189667, "step": 230 }, { "epoch": 0.12, "learning_rate": 9.996528105364799e-08, "logits/chosen": -2.164074182510376, "logits/rejected": -2.310853958129883, "logps/chosen": -6.9442267417907715, "logps/rejected": -6.776099681854248, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.30064091086387634, "rewards/margins": -0.013286501169204712, "rewards/rejected": 0.31392741203308105, "step": 231 }, { "epoch": 0.13, "learning_rate": 9.996418774081656e-08, "logits/chosen": -2.157689332962036, "logits/rejected": -2.157567024230957, "logps/chosen": -18.183204650878906, "logps/rejected": -6.098960876464844, "loss": 0.5841, "rewards/accuracies": 1.0, "rewards/chosen": 0.44426193833351135, "rewards/margins": 0.23142604529857635, "rewards/rejected": 0.212835893034935, "step": 232 }, { "epoch": 0.13, "learning_rate": 9.996307748555644e-08, "logits/chosen": -2.072812795639038, "logits/rejected": -2.262686252593994, "logps/chosen": -6.3262529373168945, "logps/rejected": -6.252285003662109, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.347281277179718, "rewards/margins": 0.01499176025390625, "rewards/rejected": 0.33228951692581177, "step": 233 }, { "epoch": 0.13, "learning_rate": 9.99619502882441e-08, "logits/chosen": -2.2337088584899902, "logits/rejected": -2.242037296295166, "logps/chosen": -8.538265228271484, "logps/rejected": -6.006162166595459, "loss": 0.6096, "rewards/accuracies": 1.0, "rewards/chosen": 0.425214022397995, "rewards/margins": 0.1747874915599823, "rewards/rejected": 0.2504265308380127, "step": 234 }, { "epoch": 0.13, "learning_rate": 9.996080614926177e-08, "logits/chosen": -2.0066487789154053, "logits/rejected": -2.2751381397247314, "logps/chosen": -6.505316734313965, "logps/rejected": -6.242354393005371, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.17374010384082794, "rewards/margins": -0.015545368194580078, "rewards/rejected": 0.18928547203540802, "step": 235 }, { "epoch": 0.13, "learning_rate": 9.99596450689974e-08, "logits/chosen": -2.1544573307037354, "logits/rejected": -2.1325650215148926, "logps/chosen": -30.65877914428711, "logps/rejected": -9.161983489990234, "loss": 0.6028, "rewards/accuracies": 1.0, "rewards/chosen": 0.3360595703125, "rewards/margins": 0.18957014381885529, "rewards/rejected": 0.14648942649364471, "step": 236 }, { "epoch": 0.13, "learning_rate": 9.995846704784471e-08, "logits/chosen": -2.071498155593872, "logits/rejected": -2.07661509513855, "logps/chosen": -18.375896453857422, "logps/rejected": -19.83071517944336, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": 0.4382017254829407, "rewards/margins": 0.2930673658847809, "rewards/rejected": 0.1451343595981598, "step": 237 }, { "epoch": 0.13, "learning_rate": 9.995727208620316e-08, "logits/chosen": -1.9958386421203613, "logits/rejected": -1.9991286993026733, "logps/chosen": -13.16961669921875, "logps/rejected": -10.725434303283691, "loss": 0.6212, "rewards/accuracies": 1.0, "rewards/chosen": 0.3737359941005707, "rewards/margins": 0.14955243468284607, "rewards/rejected": 0.2241835594177246, "step": 238 }, { "epoch": 0.13, "learning_rate": 9.995606018447796e-08, "logits/chosen": -2.141094207763672, "logits/rejected": -2.137906551361084, "logps/chosen": -12.346811294555664, "logps/rejected": -9.284651756286621, "loss": 0.6185, "rewards/accuracies": 1.0, "rewards/chosen": 0.3235604465007782, "rewards/margins": 0.15528565645217896, "rewards/rejected": 0.16827479004859924, "step": 239 }, { "epoch": 0.13, "learning_rate": 9.995483134308002e-08, "logits/chosen": -2.085911750793457, "logits/rejected": -2.08522891998291, "logps/chosen": -14.273037910461426, "logps/rejected": -8.232479095458984, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.2930193841457367, "rewards/margins": 0.0261458158493042, "rewards/rejected": 0.2668735682964325, "step": 240 }, { "epoch": 0.13, "learning_rate": 9.995358556242608e-08, "logits/chosen": -2.092602491378784, "logits/rejected": -2.08371901512146, "logps/chosen": -13.237015724182129, "logps/rejected": -7.580197334289551, "loss": 0.5938, "rewards/accuracies": 1.0, "rewards/chosen": 0.44217196106910706, "rewards/margins": 0.20967531204223633, "rewards/rejected": 0.23249664902687073, "step": 241 }, { "epoch": 0.13, "learning_rate": 9.995232284293853e-08, "logits/chosen": -2.134394645690918, "logits/rejected": -2.1347970962524414, "logps/chosen": -7.263946533203125, "logps/rejected": -7.127353668212891, "loss": 0.6671, "rewards/accuracies": 1.0, "rewards/chosen": 0.37767964601516724, "rewards/margins": 0.052792370319366455, "rewards/rejected": 0.3248872756958008, "step": 242 }, { "epoch": 0.13, "learning_rate": 9.995104318504557e-08, "logits/chosen": -2.0347862243652344, "logits/rejected": -2.0297255516052246, "logps/chosen": -9.018664360046387, "logps/rejected": -6.872889041900635, "loss": 0.6395, "rewards/accuracies": 1.0, "rewards/chosen": 0.29816389083862305, "rewards/margins": 0.11039681732654572, "rewards/rejected": 0.18776707351207733, "step": 243 }, { "epoch": 0.13, "learning_rate": 9.994974658918113e-08, "logits/chosen": -2.188673257827759, "logits/rejected": -2.1890058517456055, "logps/chosen": -33.67694091796875, "logps/rejected": -11.399374008178711, "loss": 0.6099, "rewards/accuracies": 1.0, "rewards/chosen": 0.3248451352119446, "rewards/margins": 0.17405138909816742, "rewards/rejected": 0.15079374611377716, "step": 244 }, { "epoch": 0.13, "learning_rate": 9.994843305578485e-08, "logits/chosen": -2.116427421569824, "logits/rejected": -2.270346164703369, "logps/chosen": -5.441055774688721, "logps/rejected": -5.437333106994629, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.3977791368961334, "rewards/margins": 0.008890002965927124, "rewards/rejected": 0.3888891339302063, "step": 245 }, { "epoch": 0.13, "learning_rate": 9.994710258530215e-08, "logits/chosen": -2.0827012062072754, "logits/rejected": -2.080291509628296, "logps/chosen": -17.74495506286621, "logps/rejected": -7.200425148010254, "loss": 0.6267, "rewards/accuracies": 1.0, "rewards/chosen": 0.4661388397216797, "rewards/margins": 0.1375449001789093, "rewards/rejected": 0.3285939395427704, "step": 246 }, { "epoch": 0.13, "learning_rate": 9.994575517818419e-08, "logits/chosen": -2.168630599975586, "logits/rejected": -2.1726582050323486, "logps/chosen": -9.739006042480469, "logps/rejected": -13.756587982177734, "loss": 0.6176, "rewards/accuracies": 1.0, "rewards/chosen": 0.4144926071166992, "rewards/margins": 0.1572023332118988, "rewards/rejected": 0.2572902739048004, "step": 247 }, { "epoch": 0.13, "learning_rate": 9.994439083488784e-08, "logits/chosen": -2.1044089794158936, "logits/rejected": -2.1128997802734375, "logps/chosen": -9.331058502197266, "logps/rejected": -7.545881748199463, "loss": 0.5506, "rewards/accuracies": 1.0, "rewards/chosen": 0.548435389995575, "rewards/margins": 0.3087473213672638, "rewards/rejected": 0.23968806862831116, "step": 248 }, { "epoch": 0.13, "learning_rate": 9.994300955587575e-08, "logits/chosen": -2.1729819774627686, "logits/rejected": -2.1807167530059814, "logps/chosen": -7.627651691436768, "logps/rejected": -6.517669677734375, "loss": 0.5936, "rewards/accuracies": 1.0, "rewards/chosen": 0.47076088190078735, "rewards/margins": 0.21006131172180176, "rewards/rejected": 0.2606995701789856, "step": 249 }, { "epoch": 0.13, "learning_rate": 9.994161134161633e-08, "logits/chosen": -2.0314419269561768, "logits/rejected": -2.041855812072754, "logps/chosen": -9.623208999633789, "logps/rejected": -6.245420455932617, "loss": 0.5566, "rewards/accuracies": 1.0, "rewards/chosen": 0.5233255624771118, "rewards/margins": 0.29465818405151367, "rewards/rejected": 0.22866736352443695, "step": 250 }, { "epoch": 0.14, "learning_rate": 9.994019619258365e-08, "logits/chosen": -2.0403988361358643, "logits/rejected": -2.041997194290161, "logps/chosen": -4.753566265106201, "logps/rejected": -7.625927925109863, "loss": 0.5908, "rewards/accuracies": 1.0, "rewards/chosen": 0.4392866790294647, "rewards/margins": 0.2163529098033905, "rewards/rejected": 0.22293376922607422, "step": 251 }, { "epoch": 0.14, "learning_rate": 9.993876410925761e-08, "logits/chosen": -2.125713348388672, "logits/rejected": -2.1845762729644775, "logps/chosen": -14.839675903320312, "logps/rejected": -21.569339752197266, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 0.44403305649757385, "rewards/margins": 0.3613004684448242, "rewards/rejected": 0.08273258060216904, "step": 252 }, { "epoch": 0.14, "learning_rate": 9.99373150921238e-08, "logits/chosen": -2.1386945247650146, "logits/rejected": -2.1231703758239746, "logps/chosen": -12.20473575592041, "logps/rejected": -9.51345443725586, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": 0.35275229811668396, "rewards/margins": 0.1771741807460785, "rewards/rejected": 0.17557811737060547, "step": 253 }, { "epoch": 0.14, "learning_rate": 9.993584914167356e-08, "logits/chosen": -1.9543402194976807, "logits/rejected": -1.9624477624893188, "logps/chosen": -9.33866024017334, "logps/rejected": -8.55909252166748, "loss": 0.6055, "rewards/accuracies": 1.0, "rewards/chosen": 0.37786418199539185, "rewards/margins": 0.18382473289966583, "rewards/rejected": 0.194039449095726, "step": 254 }, { "epoch": 0.14, "learning_rate": 9.993436625840404e-08, "logits/chosen": -2.1308698654174805, "logits/rejected": -2.134965658187866, "logps/chosen": -10.714397430419922, "logps/rejected": -6.784745216369629, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 0.475633442401886, "rewards/margins": 0.07094955444335938, "rewards/rejected": 0.4046838879585266, "step": 255 }, { "epoch": 0.14, "learning_rate": 9.9932866442818e-08, "logits/chosen": -2.256882905960083, "logits/rejected": -2.4970574378967285, "logps/chosen": -20.399206161499023, "logps/rejected": -35.12434005737305, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.1273515671491623, "rewards/margins": 0.0003858506679534912, "rewards/rejected": 0.1269657164812088, "step": 256 }, { "epoch": 0.14, "learning_rate": 9.993134969542405e-08, "logits/chosen": -2.2026429176330566, "logits/rejected": -2.205108165740967, "logps/chosen": -15.405714988708496, "logps/rejected": -7.002631664276123, "loss": 0.6145, "rewards/accuracies": 1.0, "rewards/chosen": 0.3735886514186859, "rewards/margins": 0.16407464444637299, "rewards/rejected": 0.20951400697231293, "step": 257 }, { "epoch": 0.14, "learning_rate": 9.992981601673649e-08, "logits/chosen": -2.0682528018951416, "logits/rejected": -2.0642006397247314, "logps/chosen": -15.946550369262695, "logps/rejected": -6.738924026489258, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 0.4676595628261566, "rewards/margins": 0.24714259803295135, "rewards/rejected": 0.22051696479320526, "step": 258 }, { "epoch": 0.14, "learning_rate": 9.99282654072754e-08, "logits/chosen": -1.9873965978622437, "logits/rejected": -2.289041519165039, "logps/chosen": -5.513155460357666, "logps/rejected": -5.618996620178223, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.5234542489051819, "rewards/margins": 0.006158590316772461, "rewards/rejected": 0.5172956585884094, "step": 259 }, { "epoch": 0.14, "learning_rate": 9.992669786756657e-08, "logits/chosen": -2.0769476890563965, "logits/rejected": -2.0772507190704346, "logps/chosen": -11.662100791931152, "logps/rejected": -6.503054141998291, "loss": 0.6363, "rewards/accuracies": 1.0, "rewards/chosen": 0.5576581358909607, "rewards/margins": 0.11713406443595886, "rewards/rejected": 0.44052407145500183, "step": 260 }, { "epoch": 0.14, "learning_rate": 9.992511339814153e-08, "logits/chosen": -2.1581475734710693, "logits/rejected": -2.2657341957092285, "logps/chosen": -5.900561332702637, "logps/rejected": -6.008807182312012, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.49923038482666016, "rewards/margins": -0.0007475912570953369, "rewards/rejected": 0.4999779760837555, "step": 261 }, { "epoch": 0.14, "learning_rate": 9.992351199953756e-08, "logits/chosen": -2.151777982711792, "logits/rejected": -2.1636574268341064, "logps/chosen": -17.09432601928711, "logps/rejected": -8.193134307861328, "loss": 0.6628, "rewards/accuracies": 1.0, "rewards/chosen": 0.3560333251953125, "rewards/margins": 0.06164512038230896, "rewards/rejected": 0.29438820481300354, "step": 262 }, { "epoch": 0.14, "learning_rate": 9.99218936722977e-08, "logits/chosen": -2.083937406539917, "logits/rejected": -2.313955545425415, "logps/chosen": -7.65142822265625, "logps/rejected": -7.635248184204102, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.35207387804985046, "rewards/margins": 0.01947137713432312, "rewards/rejected": 0.33260250091552734, "step": 263 }, { "epoch": 0.14, "learning_rate": 9.992025841697068e-08, "logits/chosen": -2.143327236175537, "logits/rejected": -2.295318365097046, "logps/chosen": -4.594616889953613, "logps/rejected": -4.337674617767334, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.29645583033561707, "rewards/margins": 0.011904716491699219, "rewards/rejected": 0.28455111384391785, "step": 264 }, { "epoch": 0.14, "learning_rate": 9.991860623411104e-08, "logits/chosen": -2.002923011779785, "logits/rejected": -1.994982361793518, "logps/chosen": -7.395331859588623, "logps/rejected": -8.170682907104492, "loss": 0.6068, "rewards/accuracies": 1.0, "rewards/chosen": 0.4826817214488983, "rewards/margins": 0.18076416850090027, "rewards/rejected": 0.30191755294799805, "step": 265 }, { "epoch": 0.14, "learning_rate": 9.991693712427898e-08, "logits/chosen": -2.081225633621216, "logits/rejected": -2.3363096714019775, "logps/chosen": -11.564748764038086, "logps/rejected": -11.463815689086914, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.35760727524757385, "rewards/margins": 0.01082611083984375, "rewards/rejected": 0.3467811644077301, "step": 266 }, { "epoch": 0.14, "learning_rate": 9.991525108804051e-08, "logits/chosen": -2.0307118892669678, "logits/rejected": -2.03055477142334, "logps/chosen": -8.490070343017578, "logps/rejected": -8.35582160949707, "loss": 0.562, "rewards/accuracies": 1.0, "rewards/chosen": 0.5001987814903259, "rewards/margins": 0.2821182608604431, "rewards/rejected": 0.2180805206298828, "step": 267 }, { "epoch": 0.14, "learning_rate": 9.991354812596734e-08, "logits/chosen": -2.0887811183929443, "logits/rejected": -2.088846445083618, "logps/chosen": -12.571435928344727, "logps/rejected": -5.879888534545898, "loss": 0.5475, "rewards/accuracies": 1.0, "rewards/chosen": 0.6173955798149109, "rewards/margins": 0.3162151277065277, "rewards/rejected": 0.3011804521083832, "step": 268 }, { "epoch": 0.15, "learning_rate": 9.991182823863693e-08, "logits/chosen": -2.109288215637207, "logits/rejected": -2.123326063156128, "logps/chosen": -10.241907119750977, "logps/rejected": -7.782729148864746, "loss": 0.5577, "rewards/accuracies": 1.0, "rewards/chosen": 0.6620367169380188, "rewards/margins": 0.29226428270339966, "rewards/rejected": 0.36977243423461914, "step": 269 }, { "epoch": 0.15, "learning_rate": 9.991009142663247e-08, "logits/chosen": -2.145596742630005, "logits/rejected": -2.393075466156006, "logps/chosen": -6.994880676269531, "logps/rejected": -6.992701530456543, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.42635783553123474, "rewards/margins": 0.01811063289642334, "rewards/rejected": 0.4082472026348114, "step": 270 }, { "epoch": 0.15, "learning_rate": 9.990833769054292e-08, "logits/chosen": -1.9983032941818237, "logits/rejected": -1.9984210729599, "logps/chosen": -6.053711891174316, "logps/rejected": -7.774024963378906, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.421979159116745, "rewards/margins": 0.021694183349609375, "rewards/rejected": 0.4002849757671356, "step": 271 }, { "epoch": 0.15, "learning_rate": 9.990656703096295e-08, "logits/chosen": -2.089071750640869, "logits/rejected": -2.0897908210754395, "logps/chosen": -13.100172996520996, "logps/rejected": -8.263365745544434, "loss": 0.5815, "rewards/accuracies": 1.0, "rewards/chosen": 0.49673032760620117, "rewards/margins": 0.2373061180114746, "rewards/rejected": 0.25942420959472656, "step": 272 }, { "epoch": 0.15, "learning_rate": 9.990477944849296e-08, "logits/chosen": -2.1597440242767334, "logits/rejected": -2.27441668510437, "logps/chosen": -4.535297393798828, "logps/rejected": -4.614883899688721, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.3837305009365082, "rewards/margins": 0.001450568437576294, "rewards/rejected": 0.3822799324989319, "step": 273 }, { "epoch": 0.15, "learning_rate": 9.990297494373911e-08, "logits/chosen": -2.0467262268066406, "logits/rejected": -2.3137593269348145, "logps/chosen": -4.029392719268799, "logps/rejected": -4.1580810546875, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.5658353567123413, "rewards/margins": 0.026911914348602295, "rewards/rejected": 0.538923442363739, "step": 274 }, { "epoch": 0.15, "learning_rate": 9.990115351731332e-08, "logits/chosen": -2.0573549270629883, "logits/rejected": -2.0528697967529297, "logps/chosen": -14.487357139587402, "logps/rejected": -6.045941352844238, "loss": 0.4851, "rewards/accuracies": 1.0, "rewards/chosen": 0.6156472563743591, "rewards/margins": 0.47094589471817017, "rewards/rejected": 0.14470134675502777, "step": 275 }, { "epoch": 0.15, "learning_rate": 9.989931516983317e-08, "logits/chosen": -2.1550729274749756, "logits/rejected": -2.1576642990112305, "logps/chosen": -6.691764831542969, "logps/rejected": -7.44627046585083, "loss": 0.5982, "rewards/accuracies": 1.0, "rewards/chosen": 0.5319501757621765, "rewards/margins": 0.199823796749115, "rewards/rejected": 0.3321263790130615, "step": 276 }, { "epoch": 0.15, "learning_rate": 9.989745990192207e-08, "logits/chosen": -2.076079845428467, "logits/rejected": -2.2674150466918945, "logps/chosen": -4.301116466522217, "logps/rejected": -4.208254337310791, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.47252747416496277, "rewards/margins": 0.02712908387184143, "rewards/rejected": 0.44539839029312134, "step": 277 }, { "epoch": 0.15, "learning_rate": 9.98955877142091e-08, "logits/chosen": -2.111248254776001, "logits/rejected": -2.3579037189483643, "logps/chosen": -12.737058639526367, "logps/rejected": -12.964011192321777, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.4811677932739258, "rewards/margins": 0.02767801284790039, "rewards/rejected": 0.4534897804260254, "step": 278 }, { "epoch": 0.15, "learning_rate": 9.989369860732912e-08, "logits/chosen": -2.1746442317962646, "logits/rejected": -2.249199628829956, "logps/chosen": -4.138955116271973, "logps/rejected": -4.0501909255981445, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.4868519902229309, "rewards/margins": 0.015519529581069946, "rewards/rejected": 0.47133246064186096, "step": 279 }, { "epoch": 0.15, "learning_rate": 9.989179258192269e-08, "logits/chosen": -2.1126794815063477, "logits/rejected": -2.335395336151123, "logps/chosen": -14.93378734588623, "logps/rejected": -12.465192794799805, "loss": 0.6361, "rewards/accuracies": 1.0, "rewards/chosen": 0.4299897253513336, "rewards/margins": 0.11749440431594849, "rewards/rejected": 0.31249532103538513, "step": 280 }, { "epoch": 0.15, "learning_rate": 9.988986963863617e-08, "logits/chosen": -2.2188422679901123, "logits/rejected": -2.2752161026000977, "logps/chosen": -6.215962886810303, "logps/rejected": -6.096853256225586, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.37790170311927795, "rewards/margins": 0.016646921634674072, "rewards/rejected": 0.3612547814846039, "step": 281 }, { "epoch": 0.15, "learning_rate": 9.988792977812154e-08, "logits/chosen": -2.2118263244628906, "logits/rejected": -2.2079427242279053, "logps/chosen": -13.046476364135742, "logps/rejected": -7.1727399826049805, "loss": 0.5371, "rewards/accuracies": 1.0, "rewards/chosen": 0.6173517107963562, "rewards/margins": 0.34092995524406433, "rewards/rejected": 0.27642175555229187, "step": 282 }, { "epoch": 0.15, "learning_rate": 9.988597300103667e-08, "logits/chosen": -2.06188702583313, "logits/rejected": -2.3149490356445312, "logps/chosen": -11.367313385009766, "logps/rejected": -5.671079635620117, "loss": 0.7579, "rewards/accuracies": 0.0, "rewards/chosen": 0.3762211799621582, "rewards/margins": -0.12562286853790283, "rewards/rejected": 0.501844048500061, "step": 283 }, { "epoch": 0.15, "learning_rate": 9.988399930804503e-08, "logits/chosen": -2.1537535190582275, "logits/rejected": -2.3537046909332275, "logps/chosen": -11.525510787963867, "logps/rejected": -18.679885864257812, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.32886335253715515, "rewards/margins": 0.14981938898563385, "rewards/rejected": 0.1790439635515213, "step": 284 }, { "epoch": 0.15, "learning_rate": 9.988200869981592e-08, "logits/chosen": -2.075482130050659, "logits/rejected": -2.2722365856170654, "logps/chosen": -4.852715015411377, "logps/rejected": -4.682159900665283, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.3491639792919159, "rewards/margins": -0.02134549617767334, "rewards/rejected": 0.37050947546958923, "step": 285 }, { "epoch": 0.15, "learning_rate": 9.98800011770243e-08, "logits/chosen": -2.1560285091400146, "logits/rejected": -2.1581835746765137, "logps/chosen": -6.209895133972168, "logps/rejected": -6.46019172668457, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 0.5404519438743591, "rewards/margins": 0.22738143801689148, "rewards/rejected": 0.31307050585746765, "step": 286 }, { "epoch": 0.15, "learning_rate": 9.987797674035093e-08, "logits/chosen": -2.0639209747314453, "logits/rejected": -2.3519833087921143, "logps/chosen": -4.699275970458984, "logps/rejected": -4.56667423248291, "loss": 0.6689, "rewards/accuracies": 1.0, "rewards/chosen": 0.5328512191772461, "rewards/margins": 0.04902723431587219, "rewards/rejected": 0.4838239848613739, "step": 287 }, { "epoch": 0.16, "learning_rate": 9.987593539048227e-08, "logits/chosen": -2.1490073204040527, "logits/rejected": -2.326047897338867, "logps/chosen": -5.1074137687683105, "logps/rejected": -5.06866979598999, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.5090081691741943, "rewards/margins": 0.02797907590866089, "rewards/rejected": 0.48102909326553345, "step": 288 }, { "epoch": 0.16, "learning_rate": 9.987387712811055e-08, "logits/chosen": -2.219130754470825, "logits/rejected": -2.1031250953674316, "logps/chosen": -75.36454010009766, "logps/rejected": -9.051593780517578, "loss": 0.6463, "rewards/accuracies": 1.0, "rewards/chosen": 0.35484620928764343, "rewards/margins": 0.09600555896759033, "rewards/rejected": 0.2588406503200531, "step": 289 }, { "epoch": 0.16, "learning_rate": 9.987180195393366e-08, "logits/chosen": -2.145977020263672, "logits/rejected": -2.1537554264068604, "logps/chosen": -25.105884552001953, "logps/rejected": -11.873947143554688, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 0.8900344967842102, "rewards/margins": 0.5671030282974243, "rewards/rejected": 0.3229314982891083, "step": 290 }, { "epoch": 0.16, "learning_rate": 9.98697098686553e-08, "logits/chosen": -2.178452730178833, "logits/rejected": -2.1847517490386963, "logps/chosen": -5.801213264465332, "logps/rejected": -5.3476786613464355, "loss": 0.6051, "rewards/accuracies": 1.0, "rewards/chosen": 0.6056962013244629, "rewards/margins": 0.18457207083702087, "rewards/rejected": 0.421124130487442, "step": 291 }, { "epoch": 0.16, "learning_rate": 9.986760087298491e-08, "logits/chosen": -2.0809850692749023, "logits/rejected": -2.162079095840454, "logps/chosen": -10.266651153564453, "logps/rejected": -22.899152755737305, "loss": 0.5393, "rewards/accuracies": 1.0, "rewards/chosen": 0.5960624814033508, "rewards/margins": 0.3357546031475067, "rewards/rejected": 0.2603078782558441, "step": 292 }, { "epoch": 0.16, "learning_rate": 9.986547496763757e-08, "logits/chosen": -1.9804507493972778, "logits/rejected": -1.9725204706192017, "logps/chosen": -14.06764030456543, "logps/rejected": -5.810801029205322, "loss": 0.5338, "rewards/accuracies": 1.0, "rewards/chosen": 0.5754202008247375, "rewards/margins": 0.34895557165145874, "rewards/rejected": 0.22646461427211761, "step": 293 }, { "epoch": 0.16, "learning_rate": 9.98633321533342e-08, "logits/chosen": -1.9758713245391846, "logits/rejected": -1.9761502742767334, "logps/chosen": -4.53476619720459, "logps/rejected": -3.845674991607666, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.578381359577179, "rewards/margins": 0.0284346342086792, "rewards/rejected": 0.5499467253684998, "step": 294 }, { "epoch": 0.16, "learning_rate": 9.986117243080139e-08, "logits/chosen": -2.2299699783325195, "logits/rejected": -2.2321250438690186, "logps/chosen": -6.501764297485352, "logps/rejected": -7.5302557945251465, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 0.7454997897148132, "rewards/margins": 0.4172358810901642, "rewards/rejected": 0.32826390862464905, "step": 295 }, { "epoch": 0.16, "learning_rate": 9.98589958007715e-08, "logits/chosen": -2.18792986869812, "logits/rejected": -2.321885347366333, "logps/chosen": -5.145108699798584, "logps/rejected": -5.071083068847656, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.5028946399688721, "rewards/margins": 0.006062537431716919, "rewards/rejected": 0.49683210253715515, "step": 296 }, { "epoch": 0.16, "learning_rate": 9.98568022639826e-08, "logits/chosen": -2.0704848766326904, "logits/rejected": -2.072730302810669, "logps/chosen": -4.1036763191223145, "logps/rejected": -2.9769556522369385, "loss": 0.6245, "rewards/accuracies": 1.0, "rewards/chosen": 0.6272820830345154, "rewards/margins": 0.14235517382621765, "rewards/rejected": 0.48492690920829773, "step": 297 }, { "epoch": 0.16, "learning_rate": 9.985459182117849e-08, "logits/chosen": -2.0486555099487305, "logits/rejected": -2.3209428787231445, "logps/chosen": -10.621664047241211, "logps/rejected": -10.054428100585938, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.2661583125591278, "rewards/margins": -0.001936793327331543, "rewards/rejected": 0.26809510588645935, "step": 298 }, { "epoch": 0.16, "learning_rate": 9.985236447310871e-08, "logits/chosen": -2.2983884811401367, "logits/rejected": -2.043665885925293, "logps/chosen": -95.80242919921875, "logps/rejected": -16.4885311126709, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.394827276468277, "rewards/margins": 0.02952098846435547, "rewards/rejected": 0.3653062880039215, "step": 299 }, { "epoch": 0.16, "learning_rate": 9.985012022052857e-08, "logits/chosen": -2.0329110622406006, "logits/rejected": -2.0324153900146484, "logps/chosen": -7.126235008239746, "logps/rejected": -4.480960845947266, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 0.6241300702095032, "rewards/margins": 0.07403087615966797, "rewards/rejected": 0.5500991940498352, "step": 300 }, { "epoch": 0.16, "learning_rate": 9.984785906419904e-08, "logits/chosen": -2.001243829727173, "logits/rejected": -1.9993096590042114, "logps/chosen": -7.750667095184326, "logps/rejected": -8.581191062927246, "loss": 0.5781, "rewards/accuracies": 1.0, "rewards/chosen": 0.6730717420578003, "rewards/margins": 0.245065838098526, "rewards/rejected": 0.4280059039592743, "step": 301 }, { "epoch": 0.16, "learning_rate": 9.984558100488686e-08, "logits/chosen": -2.1120150089263916, "logits/rejected": -2.2685463428497314, "logps/chosen": -3.8988285064697266, "logps/rejected": -10.38664436340332, "loss": 0.6425, "rewards/accuracies": 1.0, "rewards/chosen": 0.5322098731994629, "rewards/margins": 0.103936105966568, "rewards/rejected": 0.4282737672328949, "step": 302 }, { "epoch": 0.16, "learning_rate": 9.984328604336452e-08, "logits/chosen": -1.9902746677398682, "logits/rejected": -1.9995670318603516, "logps/chosen": -7.0777177810668945, "logps/rejected": -8.736178398132324, "loss": 0.5836, "rewards/accuracies": 1.0, "rewards/chosen": 0.65276700258255, "rewards/margins": 0.232502281665802, "rewards/rejected": 0.42026472091674805, "step": 303 }, { "epoch": 0.16, "learning_rate": 9.98409741804102e-08, "logits/chosen": -2.228184461593628, "logits/rejected": -2.097322940826416, "logps/chosen": -50.037193298339844, "logps/rejected": -12.142529487609863, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": 0.39106711745262146, "rewards/margins": -0.026530742645263672, "rewards/rejected": 0.41759786009788513, "step": 304 }, { "epoch": 0.16, "learning_rate": 9.983864541680785e-08, "logits/chosen": -2.07011342048645, "logits/rejected": -2.066723346710205, "logps/chosen": -16.866382598876953, "logps/rejected": -5.742284774780273, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.4651786983013153, "rewards/margins": 0.06348180770874023, "rewards/rejected": 0.4016968905925751, "step": 305 }, { "epoch": 0.17, "learning_rate": 9.983629975334713e-08, "logits/chosen": -2.133862018585205, "logits/rejected": -2.3024001121520996, "logps/chosen": -8.174165725708008, "logps/rejected": -6.074926376342773, "loss": 0.7199, "rewards/accuracies": 0.0, "rewards/chosen": 0.2743232846260071, "rewards/margins": -0.05273580551147461, "rewards/rejected": 0.3270590901374817, "step": 306 }, { "epoch": 0.17, "learning_rate": 9.983393719082345e-08, "logits/chosen": -2.2206344604492188, "logits/rejected": -2.388056755065918, "logps/chosen": -4.691246032714844, "logps/rejected": -4.705246925354004, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.5033354163169861, "rewards/margins": 0.012167960405349731, "rewards/rejected": 0.49116745591163635, "step": 307 }, { "epoch": 0.17, "learning_rate": 9.983155773003788e-08, "logits/chosen": -2.0870730876922607, "logits/rejected": -2.28475284576416, "logps/chosen": -4.669895172119141, "logps/rejected": -4.659619331359863, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.5541159510612488, "rewards/margins": -0.00189971923828125, "rewards/rejected": 0.55601567029953, "step": 308 }, { "epoch": 0.17, "learning_rate": 9.982916137179736e-08, "logits/chosen": -2.1424975395202637, "logits/rejected": -2.1559126377105713, "logps/chosen": -7.64335298538208, "logps/rejected": -8.596445083618164, "loss": 0.5077, "rewards/accuracies": 1.0, "rewards/chosen": 0.9090524911880493, "rewards/margins": 0.41333943605422974, "rewards/rejected": 0.4957130551338196, "step": 309 }, { "epoch": 0.17, "learning_rate": 9.982674811691438e-08, "logits/chosen": -2.03010892868042, "logits/rejected": -2.0301411151885986, "logps/chosen": -5.324283599853516, "logps/rejected": -5.564876556396484, "loss": 0.6037, "rewards/accuracies": 1.0, "rewards/chosen": 0.6934324502944946, "rewards/margins": 0.18758058547973633, "rewards/rejected": 0.5058518648147583, "step": 310 }, { "epoch": 0.17, "learning_rate": 9.982431796620734e-08, "logits/chosen": -2.190189838409424, "logits/rejected": -2.2980968952178955, "logps/chosen": -4.633477687835693, "logps/rejected": -4.639430999755859, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.575429379940033, "rewards/margins": -0.006389200687408447, "rewards/rejected": 0.5818185806274414, "step": 311 }, { "epoch": 0.17, "learning_rate": 9.982187092050023e-08, "logits/chosen": -2.1447908878326416, "logits/rejected": -2.1476171016693115, "logps/chosen": -4.4898834228515625, "logps/rejected": -5.414607048034668, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.7022525072097778, "rewards/margins": 0.3106258511543274, "rewards/rejected": 0.39162665605545044, "step": 312 }, { "epoch": 0.17, "learning_rate": 9.981940698062283e-08, "logits/chosen": -2.2797515392303467, "logits/rejected": -2.454387903213501, "logps/chosen": -4.427160263061523, "logps/rejected": -4.19930362701416, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": 0.47338762879371643, "rewards/margins": -0.02295321226119995, "rewards/rejected": 0.4963408410549164, "step": 313 }, { "epoch": 0.17, "learning_rate": 9.981692614741065e-08, "logits/chosen": -2.135021686553955, "logits/rejected": -2.305630922317505, "logps/chosen": -5.494198322296143, "logps/rejected": -4.034680366516113, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.548133134841919, "rewards/margins": 0.026062428951263428, "rewards/rejected": 0.5220707058906555, "step": 314 }, { "epoch": 0.17, "learning_rate": 9.981442842170492e-08, "logits/chosen": -2.185689926147461, "logits/rejected": -2.0862250328063965, "logps/chosen": -39.46454620361328, "logps/rejected": -4.75273323059082, "loss": 0.5744, "rewards/accuracies": 1.0, "rewards/chosen": 0.6311065554618835, "rewards/margins": 0.2535802721977234, "rewards/rejected": 0.37752628326416016, "step": 315 }, { "epoch": 0.17, "learning_rate": 9.98119138043526e-08, "logits/chosen": -2.084765672683716, "logits/rejected": -2.291036605834961, "logps/chosen": -5.031302452087402, "logps/rejected": -8.43242359161377, "loss": 0.6358, "rewards/accuracies": 1.0, "rewards/chosen": 0.6960075497627258, "rewards/margins": 0.11816328763961792, "rewards/rejected": 0.5778442621231079, "step": 316 }, { "epoch": 0.17, "learning_rate": 9.980938229620636e-08, "logits/chosen": -2.1700992584228516, "logits/rejected": -2.1550345420837402, "logps/chosen": -19.374813079833984, "logps/rejected": -5.841821670532227, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.6387434005737305, "rewards/margins": 0.2307056486606598, "rewards/rejected": 0.4080377519130707, "step": 317 }, { "epoch": 0.17, "learning_rate": 9.980683389812463e-08, "logits/chosen": -2.2259206771850586, "logits/rejected": -2.2283058166503906, "logps/chosen": -7.22728967666626, "logps/rejected": -4.0624213218688965, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 0.759321391582489, "rewards/margins": 0.13812518119812012, "rewards/rejected": 0.6211962103843689, "step": 318 }, { "epoch": 0.17, "learning_rate": 9.980426861097154e-08, "logits/chosen": -2.1412014961242676, "logits/rejected": -2.39418363571167, "logps/chosen": -2.8576509952545166, "logps/rejected": -2.7643771171569824, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.6386251449584961, "rewards/margins": 0.0008903741836547852, "rewards/rejected": 0.6377347707748413, "step": 319 }, { "epoch": 0.17, "learning_rate": 9.980168643561696e-08, "logits/chosen": -2.1953935623168945, "logits/rejected": -2.1559431552886963, "logps/chosen": -26.92713165283203, "logps/rejected": -5.442574501037598, "loss": 0.5779, "rewards/accuracies": 1.0, "rewards/chosen": 0.6366416811943054, "rewards/margins": 0.24547165632247925, "rewards/rejected": 0.39117002487182617, "step": 320 }, { "epoch": 0.17, "learning_rate": 9.979908737293649e-08, "logits/chosen": -2.1707918643951416, "logits/rejected": -2.1755754947662354, "logps/chosen": -6.26176118850708, "logps/rejected": -11.254218101501465, "loss": 0.5413, "rewards/accuracies": 1.0, "rewards/chosen": 0.6844334006309509, "rewards/margins": 0.3308570086956024, "rewards/rejected": 0.3535763919353485, "step": 321 }, { "epoch": 0.17, "learning_rate": 9.979647142381143e-08, "logits/chosen": -2.0823309421539307, "logits/rejected": -2.320389986038208, "logps/chosen": -2.7380008697509766, "logps/rejected": -2.712055206298828, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.657221257686615, "rewards/margins": 0.02279496192932129, "rewards/rejected": 0.6344262957572937, "step": 322 }, { "epoch": 0.17, "learning_rate": 9.979383858912884e-08, "logits/chosen": -1.9934293031692505, "logits/rejected": -1.992599368095398, "logps/chosen": -3.442934036254883, "logps/rejected": -3.842222213745117, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.6881486773490906, "rewards/margins": 0.03231334686279297, "rewards/rejected": 0.6558353304862976, "step": 323 }, { "epoch": 0.17, "learning_rate": 9.97911888697815e-08, "logits/chosen": -2.150571346282959, "logits/rejected": -2.315776824951172, "logps/chosen": -3.222386598587036, "logps/rejected": -3.247319221496582, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.5668732523918152, "rewards/margins": 0.02558755874633789, "rewards/rejected": 0.5412856936454773, "step": 324 }, { "epoch": 0.18, "learning_rate": 9.97885222666679e-08, "logits/chosen": -2.121077060699463, "logits/rejected": -2.1085681915283203, "logps/chosen": -23.28360366821289, "logps/rejected": -3.997568368911743, "loss": 0.4698, "rewards/accuracies": 1.0, "rewards/chosen": 0.9172767996788025, "rewards/margins": 0.5114279985427856, "rewards/rejected": 0.40584880113601685, "step": 325 }, { "epoch": 0.18, "learning_rate": 9.978583878069224e-08, "logits/chosen": -2.072929859161377, "logits/rejected": -2.29314923286438, "logps/chosen": -3.548207998275757, "logps/rejected": -3.468998908996582, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.5267462134361267, "rewards/margins": 0.011461973190307617, "rewards/rejected": 0.5152842402458191, "step": 326 }, { "epoch": 0.18, "learning_rate": 9.978313841276451e-08, "logits/chosen": -2.177304267883301, "logits/rejected": -2.2991108894348145, "logps/chosen": -5.937639236450195, "logps/rejected": -5.893078327178955, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 0.5722258687019348, "rewards/margins": -0.027804970741271973, "rewards/rejected": 0.6000308394432068, "step": 327 }, { "epoch": 0.18, "learning_rate": 9.978042116380036e-08, "logits/chosen": -2.09747576713562, "logits/rejected": -2.104832649230957, "logps/chosen": -5.461416721343994, "logps/rejected": -5.08820104598999, "loss": 0.5143, "rewards/accuracies": 1.0, "rewards/chosen": 0.8026787042617798, "rewards/margins": 0.3967658579349518, "rewards/rejected": 0.405912846326828, "step": 328 }, { "epoch": 0.18, "learning_rate": 9.977768703472118e-08, "logits/chosen": -2.0420072078704834, "logits/rejected": -2.080584764480591, "logps/chosen": -4.556092262268066, "logps/rejected": -19.024049758911133, "loss": 0.6275, "rewards/accuracies": 1.0, "rewards/chosen": 0.426351934671402, "rewards/margins": 0.13592225313186646, "rewards/rejected": 0.2904296815395355, "step": 329 }, { "epoch": 0.18, "learning_rate": 9.977493602645408e-08, "logits/chosen": -2.0978434085845947, "logits/rejected": -2.098090410232544, "logps/chosen": -12.866496086120605, "logps/rejected": -5.268485069274902, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 0.7872368693351746, "rewards/margins": 0.2741319537162781, "rewards/rejected": 0.5131049156188965, "step": 330 }, { "epoch": 0.18, "learning_rate": 9.977216813993192e-08, "logits/chosen": -2.017894744873047, "logits/rejected": -2.281825304031372, "logps/chosen": -4.0805511474609375, "logps/rejected": -3.773266315460205, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.6434709429740906, "rewards/margins": 0.012405633926391602, "rewards/rejected": 0.631065309047699, "step": 331 }, { "epoch": 0.18, "learning_rate": 9.976938337609328e-08, "logits/chosen": -1.9828970432281494, "logits/rejected": -2.2908260822296143, "logps/chosen": -14.346960067749023, "logps/rejected": -4.588107109069824, "loss": 0.7559, "rewards/accuracies": 0.0, "rewards/chosen": 0.4622230529785156, "rewards/margins": -0.1217760443687439, "rewards/rejected": 0.5839990973472595, "step": 332 }, { "epoch": 0.18, "learning_rate": 9.976658173588242e-08, "logits/chosen": -2.1319639682769775, "logits/rejected": -2.1357030868530273, "logps/chosen": -9.121716499328613, "logps/rejected": -8.894962310791016, "loss": 0.5886, "rewards/accuracies": 1.0, "rewards/chosen": 0.7100871205329895, "rewards/margins": 0.22122812271118164, "rewards/rejected": 0.48885899782180786, "step": 333 }, { "epoch": 0.18, "learning_rate": 9.976376322024938e-08, "logits/chosen": -2.038510322570801, "logits/rejected": -2.0481560230255127, "logps/chosen": -5.3366169929504395, "logps/rejected": -3.801018476486206, "loss": 0.5465, "rewards/accuracies": 1.0, "rewards/chosen": 0.8048868179321289, "rewards/margins": 0.3185731768608093, "rewards/rejected": 0.4863136410713196, "step": 334 }, { "epoch": 0.18, "learning_rate": 9.976092783014988e-08, "logits/chosen": -2.1115121841430664, "logits/rejected": -2.113105535507202, "logps/chosen": -2.428466796875, "logps/rejected": -7.53021240234375, "loss": 0.5553, "rewards/accuracies": 1.0, "rewards/chosen": 0.8146262168884277, "rewards/margins": 0.2978954315185547, "rewards/rejected": 0.516730785369873, "step": 335 }, { "epoch": 0.18, "learning_rate": 9.975807556654536e-08, "logits/chosen": -2.1124839782714844, "logits/rejected": -2.3116238117218018, "logps/chosen": -3.7292590141296387, "logps/rejected": -3.5032248497009277, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.5365549921989441, "rewards/margins": 0.018347740173339844, "rewards/rejected": 0.5182072520256042, "step": 336 }, { "epoch": 0.18, "learning_rate": 9.975520643040304e-08, "logits/chosen": -2.142773151397705, "logits/rejected": -2.1061739921569824, "logps/chosen": -27.42668914794922, "logps/rejected": -10.749088287353516, "loss": 0.5867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7590950131416321, "rewards/margins": 0.2254917025566101, "rewards/rejected": 0.533603310585022, "step": 337 }, { "epoch": 0.18, "learning_rate": 9.975232042269578e-08, "logits/chosen": -2.051558256149292, "logits/rejected": -2.0529632568359375, "logps/chosen": -3.0695457458496094, "logps/rejected": -5.083012580871582, "loss": 0.5778, "rewards/accuracies": 1.0, "rewards/chosen": 0.5816102027893066, "rewards/margins": 0.24586114287376404, "rewards/rejected": 0.3357490599155426, "step": 338 }, { "epoch": 0.18, "learning_rate": 9.974941754440223e-08, "logits/chosen": -2.114666223526001, "logits/rejected": -2.3145034313201904, "logps/chosen": -2.775428295135498, "logps/rejected": -2.7785236835479736, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.6302502155303955, "rewards/margins": 0.008663415908813477, "rewards/rejected": 0.621586799621582, "step": 339 }, { "epoch": 0.18, "learning_rate": 9.97464977965067e-08, "logits/chosen": -2.128389358520508, "logits/rejected": -2.3631186485290527, "logps/chosen": -2.646808624267578, "logps/rejected": -2.546497344970703, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.6250885128974915, "rewards/margins": -0.006062805652618408, "rewards/rejected": 0.6311513185501099, "step": 340 }, { "epoch": 0.18, "learning_rate": 9.974356117999927e-08, "logits/chosen": -2.180717945098877, "logits/rejected": -2.1834816932678223, "logps/chosen": -5.229081153869629, "logps/rejected": -4.408329010009766, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878905415534973, "rewards/margins": 0.5506681203842163, "rewards/rejected": 0.4372223913669586, "step": 341 }, { "epoch": 0.18, "learning_rate": 9.974060769587573e-08, "logits/chosen": -2.1521430015563965, "logits/rejected": -2.152597188949585, "logps/chosen": -5.074809551239014, "logps/rejected": -3.4973602294921875, "loss": 0.525, "rewards/accuracies": 1.0, "rewards/chosen": 0.8352228403091431, "rewards/margins": 0.37049686908721924, "rewards/rejected": 0.46472597122192383, "step": 342 }, { "epoch": 0.19, "learning_rate": 9.973763734513756e-08, "logits/chosen": -2.0515174865722656, "logits/rejected": -2.050832748413086, "logps/chosen": -2.8767011165618896, "logps/rejected": -4.188157558441162, "loss": 0.5922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7123162150382996, "rewards/margins": 0.21323367953300476, "rewards/rejected": 0.4990825355052948, "step": 343 }, { "epoch": 0.19, "learning_rate": 9.973465012879201e-08, "logits/chosen": -2.0612740516662598, "logits/rejected": -2.2667248249053955, "logps/chosen": -9.973630905151367, "logps/rejected": -8.185602188110352, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.5457448363304138, "rewards/margins": 0.0021399855613708496, "rewards/rejected": 0.543604850769043, "step": 344 }, { "epoch": 0.19, "learning_rate": 9.973164604785198e-08, "logits/chosen": -2.109760284423828, "logits/rejected": -2.3264968395233154, "logps/chosen": -5.300682067871094, "logps/rejected": -6.244744300842285, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.63360595703125, "rewards/margins": 0.054118454456329346, "rewards/rejected": 0.5794875025749207, "step": 345 }, { "epoch": 0.19, "learning_rate": 9.972862510333616e-08, "logits/chosen": -2.021282196044922, "logits/rejected": -2.233741521835327, "logps/chosen": -2.195343255996704, "logps/rejected": -2.1732378005981445, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.686525821685791, "rewards/margins": 0.00018072128295898438, "rewards/rejected": 0.686345100402832, "step": 346 }, { "epoch": 0.19, "learning_rate": 9.972558729626892e-08, "logits/chosen": -2.019357204437256, "logits/rejected": -2.0188167095184326, "logps/chosen": -3.343471050262451, "logps/rejected": -5.048086166381836, "loss": 0.5409, "rewards/accuracies": 1.0, "rewards/chosen": 0.7617136836051941, "rewards/margins": 0.33192166686058044, "rewards/rejected": 0.42979201674461365, "step": 347 }, { "epoch": 0.19, "learning_rate": 9.972253262768034e-08, "logits/chosen": -2.1590237617492676, "logits/rejected": -2.2981624603271484, "logps/chosen": -3.1406378746032715, "logps/rejected": -3.202720880508423, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.6701139211654663, "rewards/margins": 0.010729730129241943, "rewards/rejected": 0.6593841910362244, "step": 348 }, { "epoch": 0.19, "learning_rate": 9.971946109860625e-08, "logits/chosen": -2.1520705223083496, "logits/rejected": -2.3539531230926514, "logps/chosen": -3.133728504180908, "logps/rejected": -14.803014755249023, "loss": 0.5672, "rewards/accuracies": 1.0, "rewards/chosen": 0.8045101165771484, "rewards/margins": 0.27013128995895386, "rewards/rejected": 0.5343788266181946, "step": 349 }, { "epoch": 0.19, "learning_rate": 9.971637271008817e-08, "logits/chosen": -2.2249574661254883, "logits/rejected": -2.305441379547119, "logps/chosen": -3.479168176651001, "logps/rejected": -3.525665283203125, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.69810551404953, "rewards/margins": 0.014172136783599854, "rewards/rejected": 0.6839333772659302, "step": 350 }, { "epoch": 0.19, "learning_rate": 9.971326746317334e-08, "logits/chosen": -2.0294220447540283, "logits/rejected": -2.306723117828369, "logps/chosen": -4.057042121887207, "logps/rejected": -4.079983711242676, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.6347412467002869, "rewards/margins": 0.0018666386604309082, "rewards/rejected": 0.632874608039856, "step": 351 }, { "epoch": 0.19, "learning_rate": 9.971014535891476e-08, "logits/chosen": -2.071566343307495, "logits/rejected": -2.0702595710754395, "logps/chosen": -1.8986903429031372, "logps/rejected": -3.523193597793579, "loss": 0.5536, "rewards/accuracies": 1.0, "rewards/chosen": 0.8250337839126587, "rewards/margins": 0.30178165435791016, "rewards/rejected": 0.5232521295547485, "step": 352 }, { "epoch": 0.19, "learning_rate": 9.970700639837105e-08, "logits/chosen": -2.069091320037842, "logits/rejected": -2.2733101844787598, "logps/chosen": -2.4942803382873535, "logps/rejected": -2.182375192642212, "loss": 0.7035, "rewards/accuracies": 0.0, "rewards/chosen": 0.5998222231864929, "rewards/margins": -0.020679771900177002, "rewards/rejected": 0.6205019950866699, "step": 353 }, { "epoch": 0.19, "learning_rate": 9.970385058260664e-08, "logits/chosen": -2.1065733432769775, "logits/rejected": -2.1077427864074707, "logps/chosen": -4.174045085906982, "logps/rejected": -4.154199123382568, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.7432729601860046, "rewards/margins": 0.051877498626708984, "rewards/rejected": 0.6913954615592957, "step": 354 }, { "epoch": 0.19, "learning_rate": 9.970067791269164e-08, "logits/chosen": -2.148552656173706, "logits/rejected": -2.1537179946899414, "logps/chosen": -5.713425636291504, "logps/rejected": -6.000421524047852, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 1.071794033050537, "rewards/margins": 0.6088608503341675, "rewards/rejected": 0.46293315291404724, "step": 355 }, { "epoch": 0.19, "learning_rate": 9.969748838970185e-08, "logits/chosen": -2.142429828643799, "logits/rejected": -2.2520296573638916, "logps/chosen": -4.225045680999756, "logps/rejected": -3.753866195678711, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.7549633383750916, "rewards/margins": -0.0024511218070983887, "rewards/rejected": 0.7574144601821899, "step": 356 }, { "epoch": 0.19, "learning_rate": 9.969428201471886e-08, "logits/chosen": -2.060563325881958, "logits/rejected": -2.317607879638672, "logps/chosen": -2.552090883255005, "logps/rejected": -2.235743999481201, "loss": 0.7048, "rewards/accuracies": 0.0, "rewards/chosen": 0.6351874470710754, "rewards/margins": -0.023122429847717285, "rewards/rejected": 0.6583098769187927, "step": 357 }, { "epoch": 0.19, "learning_rate": 9.969105878882986e-08, "logits/chosen": -2.170560836791992, "logits/rejected": -2.1716227531433105, "logps/chosen": -4.768401622772217, "logps/rejected": -5.010958194732666, "loss": 0.509, "rewards/accuracies": 1.0, "rewards/chosen": 0.6897292733192444, "rewards/margins": 0.4101579189300537, "rewards/rejected": 0.2795713543891907, "step": 358 }, { "epoch": 0.19, "learning_rate": 9.968781871312788e-08, "logits/chosen": -2.127917766571045, "logits/rejected": -2.1258063316345215, "logps/chosen": -14.4134521484375, "logps/rejected": -4.21321964263916, "loss": 0.6534, "rewards/accuracies": 1.0, "rewards/chosen": 0.5839397311210632, "rewards/margins": 0.08112835884094238, "rewards/rejected": 0.5028113722801208, "step": 359 }, { "epoch": 0.19, "learning_rate": 9.968456178871156e-08, "logits/chosen": -2.0662055015563965, "logits/rejected": -2.2542850971221924, "logps/chosen": -2.5935983657836914, "logps/rejected": -2.6841468811035156, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.6011198163032532, "rewards/margins": 0.013258278369903564, "rewards/rejected": 0.5878615379333496, "step": 360 }, { "epoch": 0.19, "learning_rate": 9.968128801668532e-08, "logits/chosen": -2.0849289894104004, "logits/rejected": -2.0823428630828857, "logps/chosen": -14.053863525390625, "logps/rejected": -2.608781337738037, "loss": 0.5508, "rewards/accuracies": 1.0, "rewards/chosen": 1.0050534009933472, "rewards/margins": 0.30831557512283325, "rewards/rejected": 0.6967378258705139, "step": 361 }, { "epoch": 0.2, "learning_rate": 9.967799739815924e-08, "logits/chosen": -2.021890163421631, "logits/rejected": -2.018434524536133, "logps/chosen": -4.822541236877441, "logps/rejected": -6.800718307495117, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": 0.7677054405212402, "rewards/margins": 0.45418861508369446, "rewards/rejected": 0.3135168254375458, "step": 362 }, { "epoch": 0.2, "learning_rate": 9.967468993424917e-08, "logits/chosen": -1.9617880582809448, "logits/rejected": -1.9634032249450684, "logps/chosen": -4.192770957946777, "logps/rejected": -6.022067546844482, "loss": 0.6214, "rewards/accuracies": 1.0, "rewards/chosen": 0.800297200679779, "rewards/margins": 0.149078369140625, "rewards/rejected": 0.651218831539154, "step": 363 }, { "epoch": 0.2, "learning_rate": 9.967136562607663e-08, "logits/chosen": -2.1120681762695312, "logits/rejected": -2.1122841835021973, "logps/chosen": -5.557740211486816, "logps/rejected": -3.8974337577819824, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.6961283087730408, "rewards/margins": -0.008636713027954102, "rewards/rejected": 0.7047650218009949, "step": 364 }, { "epoch": 0.2, "learning_rate": 9.966802447476885e-08, "logits/chosen": -2.1759276390075684, "logits/rejected": -2.2057762145996094, "logps/chosen": -12.36790657043457, "logps/rejected": -21.197452545166016, "loss": 0.3816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9048580527305603, "rewards/margins": 0.7664211392402649, "rewards/rejected": 0.13843689858913422, "step": 365 }, { "epoch": 0.2, "learning_rate": 9.966466648145881e-08, "logits/chosen": -2.038867235183716, "logits/rejected": -2.2299721240997314, "logps/chosen": -7.762862682342529, "logps/rejected": -7.721823692321777, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.7076727747917175, "rewards/margins": -0.0075817108154296875, "rewards/rejected": 0.7152544856071472, "step": 366 }, { "epoch": 0.2, "learning_rate": 9.966129164728518e-08, "logits/chosen": -2.0079267024993896, "logits/rejected": -2.005807876586914, "logps/chosen": -2.1339032649993896, "logps/rejected": -4.0285797119140625, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 0.8668085932731628, "rewards/margins": 0.32964086532592773, "rewards/rejected": 0.5371677279472351, "step": 367 }, { "epoch": 0.2, "learning_rate": 9.965789997339231e-08, "logits/chosen": -2.044872283935547, "logits/rejected": -2.2819836139678955, "logps/chosen": -1.3944776058197021, "logps/rejected": -1.4517741203308105, "loss": 0.6683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9106249213218689, "rewards/margins": 0.0504152774810791, "rewards/rejected": 0.8602096438407898, "step": 368 }, { "epoch": 0.2, "learning_rate": 9.965449146093032e-08, "logits/chosen": -2.120227098464966, "logits/rejected": -2.118178367614746, "logps/chosen": -3.7360801696777344, "logps/rejected": -5.5610432624816895, "loss": 0.4934, "rewards/accuracies": 1.0, "rewards/chosen": 0.8945701718330383, "rewards/margins": 0.4495951235294342, "rewards/rejected": 0.4449750483036041, "step": 369 }, { "epoch": 0.2, "learning_rate": 9.965106611105498e-08, "logits/chosen": -1.976503849029541, "logits/rejected": -2.273815393447876, "logps/chosen": -3.17354679107666, "logps/rejected": -3.399036407470703, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.5961388945579529, "rewards/margins": 0.014921486377716064, "rewards/rejected": 0.5812174081802368, "step": 370 }, { "epoch": 0.2, "learning_rate": 9.964762392492781e-08, "logits/chosen": -2.126953601837158, "logits/rejected": -2.108229398727417, "logps/chosen": -27.221633911132812, "logps/rejected": -4.862920761108398, "loss": 0.4836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0875930786132812, "rewards/margins": 0.47492271661758423, "rewards/rejected": 0.612670361995697, "step": 371 }, { "epoch": 0.2, "learning_rate": 9.964416490371601e-08, "logits/chosen": -2.0817041397094727, "logits/rejected": -2.082883358001709, "logps/chosen": -3.354848623275757, "logps/rejected": -3.007885217666626, "loss": 0.5589, "rewards/accuracies": 1.0, "rewards/chosen": 0.9031161665916443, "rewards/margins": 0.28942346572875977, "rewards/rejected": 0.6136927008628845, "step": 372 }, { "epoch": 0.2, "learning_rate": 9.964068904859253e-08, "logits/chosen": -2.0787644386291504, "logits/rejected": -2.2593231201171875, "logps/chosen": -3.9649295806884766, "logps/rejected": -3.8886730670928955, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8369553685188293, "rewards/margins": 0.008862853050231934, "rewards/rejected": 0.8280925154685974, "step": 373 }, { "epoch": 0.2, "learning_rate": 9.963719636073597e-08, "logits/chosen": -2.0837008953094482, "logits/rejected": -2.0884487628936768, "logps/chosen": -4.742971420288086, "logps/rejected": -4.05778169631958, "loss": 0.7146, "rewards/accuracies": 0.0, "rewards/chosen": 0.6993135809898376, "rewards/margins": -0.04237884283065796, "rewards/rejected": 0.7416924238204956, "step": 374 }, { "epoch": 0.2, "learning_rate": 9.963368684133071e-08, "logits/chosen": -2.101961612701416, "logits/rejected": -2.1012563705444336, "logps/chosen": -3.3051748275756836, "logps/rejected": -11.137455940246582, "loss": 0.5836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8361789584159851, "rewards/margins": 0.2326403260231018, "rewards/rejected": 0.6035386323928833, "step": 375 }, { "epoch": 0.2, "learning_rate": 9.963016049156678e-08, "logits/chosen": -1.9321812391281128, "logits/rejected": -2.2796406745910645, "logps/chosen": -7.172908782958984, "logps/rejected": -8.91479206085205, "loss": 0.5336, "rewards/accuracies": 1.0, "rewards/chosen": 0.691990315914154, "rewards/margins": 0.34947168827056885, "rewards/rejected": 0.3425186276435852, "step": 376 }, { "epoch": 0.2, "learning_rate": 9.962661731263992e-08, "logits/chosen": -2.1199657917022705, "logits/rejected": -2.3026294708251953, "logps/chosen": -13.388533592224121, "logps/rejected": -12.46190071105957, "loss": 0.7155, "rewards/accuracies": 0.0, "rewards/chosen": 0.7076340913772583, "rewards/margins": -0.04427915811538696, "rewards/rejected": 0.7519132494926453, "step": 377 }, { "epoch": 0.2, "learning_rate": 9.962305730575163e-08, "logits/chosen": -2.041227102279663, "logits/rejected": -2.04955792427063, "logps/chosen": -3.924323558807373, "logps/rejected": -3.289991617202759, "loss": 0.5574, "rewards/accuracies": 1.0, "rewards/chosen": 0.8365150690078735, "rewards/margins": 0.2927798628807068, "rewards/rejected": 0.5437352061271667, "step": 378 }, { "epoch": 0.2, "learning_rate": 9.961948047210903e-08, "logits/chosen": -2.092434883117676, "logits/rejected": -2.066251516342163, "logps/chosen": -12.54562759399414, "logps/rejected": -5.383822917938232, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.5438705682754517, "rewards/margins": 0.022677600383758545, "rewards/rejected": 0.5211929678916931, "step": 379 }, { "epoch": 0.2, "learning_rate": 9.961588681292503e-08, "logits/chosen": -2.1657800674438477, "logits/rejected": -2.1466519832611084, "logps/chosen": -33.26313781738281, "logps/rejected": -16.946413040161133, "loss": 0.653, "rewards/accuracies": 1.0, "rewards/chosen": 0.7728752493858337, "rewards/margins": 0.0819433331489563, "rewards/rejected": 0.6909319162368774, "step": 380 }, { "epoch": 0.21, "learning_rate": 9.961227632941818e-08, "logits/chosen": -2.1146631240844727, "logits/rejected": -2.3355095386505127, "logps/chosen": -3.007628917694092, "logps/rejected": -2.912635087966919, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.9906316995620728, "rewards/margins": 0.019333064556121826, "rewards/rejected": 0.9712986350059509, "step": 381 }, { "epoch": 0.21, "learning_rate": 9.96086490228128e-08, "logits/chosen": -2.1215174198150635, "logits/rejected": -2.2254042625427246, "logps/chosen": -16.71958351135254, "logps/rejected": -23.762582778930664, "loss": 0.4018, "rewards/accuracies": 1.0, "rewards/chosen": 1.11697518825531, "rewards/margins": 0.7041884660720825, "rewards/rejected": 0.41278669238090515, "step": 382 }, { "epoch": 0.21, "learning_rate": 9.960500489433885e-08, "logits/chosen": -2.094780683517456, "logits/rejected": -2.401761054992676, "logps/chosen": -24.245800018310547, "logps/rejected": -22.82387351989746, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": 0.33369407057762146, "rewards/margins": -0.08286762237548828, "rewards/rejected": 0.41656169295310974, "step": 383 }, { "epoch": 0.21, "learning_rate": 9.960134394523202e-08, "logits/chosen": -2.106107473373413, "logits/rejected": -2.2993216514587402, "logps/chosen": -2.07399845123291, "logps/rejected": -1.973113775253296, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.7757854461669922, "rewards/margins": 0.02283269166946411, "rewards/rejected": 0.7529527544975281, "step": 384 }, { "epoch": 0.21, "learning_rate": 9.959766617673372e-08, "logits/chosen": -2.03281307220459, "logits/rejected": -2.0184783935546875, "logps/chosen": -12.35594367980957, "logps/rejected": -12.006172180175781, "loss": 0.5521, "rewards/accuracies": 1.0, "rewards/chosen": 0.9596500396728516, "rewards/margins": 0.3053155541419983, "rewards/rejected": 0.6543344855308533, "step": 385 }, { "epoch": 0.21, "learning_rate": 9.959397159009106e-08, "logits/chosen": -2.132309913635254, "logits/rejected": -2.2673940658569336, "logps/chosen": -1.859419345855713, "logps/rejected": -1.7540035247802734, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.78416508436203, "rewards/margins": 0.002585291862487793, "rewards/rejected": 0.7815797924995422, "step": 386 }, { "epoch": 0.21, "learning_rate": 9.959026018655683e-08, "logits/chosen": -2.0559699535369873, "logits/rejected": -2.2453126907348633, "logps/chosen": -3.1841235160827637, "logps/rejected": -2.518787145614624, "loss": 0.7363, "rewards/accuracies": 0.0, "rewards/chosen": 0.792195200920105, "rewards/margins": -0.08444362878799438, "rewards/rejected": 0.8766388297080994, "step": 387 }, { "epoch": 0.21, "learning_rate": 9.958653196738954e-08, "logits/chosen": -2.1465744972229004, "logits/rejected": -2.1536622047424316, "logps/chosen": -4.01967191696167, "logps/rejected": -4.204869270324707, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9701277017593384, "rewards/margins": 0.4564041495323181, "rewards/rejected": 0.5137235522270203, "step": 388 }, { "epoch": 0.21, "learning_rate": 9.958278693385338e-08, "logits/chosen": -2.058199644088745, "logits/rejected": -2.0573253631591797, "logps/chosen": -2.4591915607452393, "logps/rejected": -4.441277503967285, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9386316537857056, "rewards/margins": 0.32800304889678955, "rewards/rejected": 0.610628604888916, "step": 389 }, { "epoch": 0.21, "learning_rate": 9.957902508721827e-08, "logits/chosen": -2.115539789199829, "logits/rejected": -2.127729654312134, "logps/chosen": -2.1909284591674805, "logps/rejected": -8.350787162780762, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 0.9352368712425232, "rewards/margins": 0.291370153427124, "rewards/rejected": 0.6438667178153992, "step": 390 }, { "epoch": 0.21, "learning_rate": 9.957524642875983e-08, "logits/chosen": -2.032808303833008, "logits/rejected": -2.260662078857422, "logps/chosen": -1.5918580293655396, "logps/rejected": -1.6459112167358398, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8588229417800903, "rewards/margins": 0.0017310380935668945, "rewards/rejected": 0.8570919036865234, "step": 391 }, { "epoch": 0.21, "learning_rate": 9.957145095975936e-08, "logits/chosen": -2.127095937728882, "logits/rejected": -2.32247257232666, "logps/chosen": -18.898815155029297, "logps/rejected": -14.090340614318848, "loss": 0.8337, "rewards/accuracies": 0.0, "rewards/chosen": 0.29386407136917114, "rewards/margins": -0.26368284225463867, "rewards/rejected": 0.5575469136238098, "step": 392 }, { "epoch": 0.21, "learning_rate": 9.956763868150388e-08, "logits/chosen": -2.0400640964508057, "logits/rejected": -2.2289247512817383, "logps/chosen": -8.074418067932129, "logps/rejected": -8.349808692932129, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.5370334982872009, "rewards/margins": 0.0346720814704895, "rewards/rejected": 0.5023614168167114, "step": 393 }, { "epoch": 0.21, "learning_rate": 9.95638095952861e-08, "logits/chosen": -2.038057565689087, "logits/rejected": -2.03719162940979, "logps/chosen": -2.6477346420288086, "logps/rejected": -5.611385345458984, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587889909744263, "rewards/margins": 0.39362412691116333, "rewards/rejected": 0.5651648640632629, "step": 394 }, { "epoch": 0.21, "learning_rate": 9.95599637024044e-08, "logits/chosen": -2.1555190086364746, "logits/rejected": -2.0794477462768555, "logps/chosen": -29.379871368408203, "logps/rejected": -3.6689224243164062, "loss": 0.6575, "rewards/accuracies": 1.0, "rewards/chosen": 0.7049053311347961, "rewards/margins": 0.07263487577438354, "rewards/rejected": 0.6322704553604126, "step": 395 }, { "epoch": 0.21, "learning_rate": 9.955610100416295e-08, "logits/chosen": -2.0634615421295166, "logits/rejected": -2.0633153915405273, "logps/chosen": -9.049270629882812, "logps/rejected": -3.6070311069488525, "loss": 0.4346, "rewards/accuracies": 1.0, "rewards/chosen": 1.1191357374191284, "rewards/margins": 0.6081135272979736, "rewards/rejected": 0.5110222101211548, "step": 396 }, { "epoch": 0.21, "learning_rate": 9.955222150187148e-08, "logits/chosen": -2.134082317352295, "logits/rejected": -2.3268532752990723, "logps/chosen": -3.122270345687866, "logps/rejected": -3.0969319343566895, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.7557060122489929, "rewards/margins": -0.012582957744598389, "rewards/rejected": 0.7682889699935913, "step": 397 }, { "epoch": 0.21, "learning_rate": 9.954832519684557e-08, "logits/chosen": -2.11594557762146, "logits/rejected": -2.122117042541504, "logps/chosen": -8.61508560180664, "logps/rejected": -9.188971519470215, "loss": 0.3843, "rewards/accuracies": 1.0, "rewards/chosen": 1.0683910846710205, "rewards/margins": 0.7581876516342163, "rewards/rejected": 0.3102034628391266, "step": 398 }, { "epoch": 0.22, "learning_rate": 9.954441209040639e-08, "logits/chosen": -2.0285470485687256, "logits/rejected": -2.026240825653076, "logps/chosen": -11.908604621887207, "logps/rejected": -5.464859962463379, "loss": 0.4752, "rewards/accuracies": 1.0, "rewards/chosen": 0.9015040397644043, "rewards/margins": 0.4969792366027832, "rewards/rejected": 0.4045248031616211, "step": 399 }, { "epoch": 0.22, "learning_rate": 9.954048218388083e-08, "logits/chosen": -2.1100099086761475, "logits/rejected": -2.262207269668579, "logps/chosen": -3.7761247158050537, "logps/rejected": -3.8014187812805176, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.7918269038200378, "rewards/margins": -0.0021820068359375, "rewards/rejected": 0.7940089106559753, "step": 400 }, { "epoch": 0.22, "learning_rate": 9.953653547860151e-08, "logits/chosen": -2.0916390419006348, "logits/rejected": -2.0906307697296143, "logps/chosen": -1.970499038696289, "logps/rejected": -4.723759651184082, "loss": 0.6, "rewards/accuracies": 1.0, "rewards/chosen": 0.7558069229125977, "rewards/margins": 0.1958756446838379, "rewards/rejected": 0.5599312782287598, "step": 401 }, { "epoch": 0.22, "learning_rate": 9.95325719759067e-08, "logits/chosen": -1.9789390563964844, "logits/rejected": -2.2655889987945557, "logps/chosen": -2.571178913116455, "logps/rejected": -2.4759597778320312, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.7222678065299988, "rewards/margins": 0.013444125652313232, "rewards/rejected": 0.7088236808776855, "step": 402 }, { "epoch": 0.22, "learning_rate": 9.952859167714042e-08, "logits/chosen": -2.0540080070495605, "logits/rejected": -2.309648275375366, "logps/chosen": -1.2113455533981323, "logps/rejected": -3.7811009883880615, "loss": 0.6486, "rewards/accuracies": 1.0, "rewards/chosen": 0.8570181131362915, "rewards/margins": 0.09117138385772705, "rewards/rejected": 0.7658467292785645, "step": 403 }, { "epoch": 0.22, "learning_rate": 9.952459458365234e-08, "logits/chosen": -2.000678539276123, "logits/rejected": -2.220635175704956, "logps/chosen": -3.5224595069885254, "logps/rejected": -3.149547576904297, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.745065450668335, "rewards/margins": 0.004344522953033447, "rewards/rejected": 0.7407209277153015, "step": 404 }, { "epoch": 0.22, "learning_rate": 9.952058069679783e-08, "logits/chosen": -2.163477897644043, "logits/rejected": -2.3169219493865967, "logps/chosen": -2.8464879989624023, "logps/rejected": -2.8328423500061035, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8537775874137878, "rewards/margins": 0.0020384788513183594, "rewards/rejected": 0.8517391085624695, "step": 405 }, { "epoch": 0.22, "learning_rate": 9.951655001793797e-08, "logits/chosen": -2.147329092025757, "logits/rejected": -2.1385409832000732, "logps/chosen": -3.0339102745056152, "logps/rejected": -6.799982070922852, "loss": 0.5503, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076297879219055, "rewards/margins": 0.309481143951416, "rewards/rejected": 0.5981486439704895, "step": 406 }, { "epoch": 0.22, "learning_rate": 9.951250254843955e-08, "logits/chosen": -2.1180224418640137, "logits/rejected": -2.2990031242370605, "logps/chosen": -1.7967486381530762, "logps/rejected": -1.8457629680633545, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.7873964905738831, "rewards/margins": -0.0018395185470581055, "rewards/rejected": 0.7892360091209412, "step": 407 }, { "epoch": 0.22, "learning_rate": 9.950843828967502e-08, "logits/chosen": -2.136991262435913, "logits/rejected": -2.1378087997436523, "logps/chosen": -3.3455967903137207, "logps/rejected": -5.699227333068848, "loss": 0.5734, "rewards/accuracies": 1.0, "rewards/chosen": 0.6831284165382385, "rewards/margins": 0.2558083236217499, "rewards/rejected": 0.42732009291648865, "step": 408 }, { "epoch": 0.22, "learning_rate": 9.95043572430225e-08, "logits/chosen": -2.1794533729553223, "logits/rejected": -2.1737332344055176, "logps/chosen": -6.590200424194336, "logps/rejected": -10.825026512145996, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 0.9390193223953247, "rewards/margins": 0.47383061051368713, "rewards/rejected": 0.4651887118816376, "step": 409 }, { "epoch": 0.22, "learning_rate": 9.95002594098659e-08, "logits/chosen": -2.131230354309082, "logits/rejected": -2.1274425983428955, "logps/chosen": -7.5056562423706055, "logps/rejected": -3.894334554672241, "loss": 0.5635, "rewards/accuracies": 1.0, "rewards/chosen": 0.8105593919754028, "rewards/margins": 0.2786356210708618, "rewards/rejected": 0.531923770904541, "step": 410 }, { "epoch": 0.22, "learning_rate": 9.949614479159472e-08, "logits/chosen": -2.0752131938934326, "logits/rejected": -2.0812745094299316, "logps/chosen": -6.007015705108643, "logps/rejected": -4.629556179046631, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 1.0531662702560425, "rewards/margins": 0.5211165547370911, "rewards/rejected": 0.5320497155189514, "step": 411 }, { "epoch": 0.22, "learning_rate": 9.94920133896042e-08, "logits/chosen": -1.9704217910766602, "logits/rejected": -2.2705392837524414, "logps/chosen": -3.2263755798339844, "logps/rejected": -3.452561140060425, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.7138673663139343, "rewards/margins": 0.03405541181564331, "rewards/rejected": 0.679811954498291, "step": 412 }, { "epoch": 0.22, "learning_rate": 9.948786520529527e-08, "logits/chosen": -2.0393059253692627, "logits/rejected": -2.0803303718566895, "logps/chosen": -8.62308120727539, "logps/rejected": -20.335433959960938, "loss": 0.5283, "rewards/accuracies": 1.0, "rewards/chosen": 0.8344072699546814, "rewards/margins": 0.3622848689556122, "rewards/rejected": 0.4721224009990692, "step": 413 }, { "epoch": 0.22, "learning_rate": 9.948370024007452e-08, "logits/chosen": -2.1360385417938232, "logits/rejected": -2.3255693912506104, "logps/chosen": -2.13716721534729, "logps/rejected": -2.2074177265167236, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8769850134849548, "rewards/margins": 0.019407033920288086, "rewards/rejected": 0.8575779795646667, "step": 414 }, { "epoch": 0.22, "learning_rate": 9.94795184953543e-08, "logits/chosen": -2.1928274631500244, "logits/rejected": -2.293271780014038, "logps/chosen": -2.834239959716797, "logps/rejected": -2.78909969329834, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8412322402000427, "rewards/margins": 0.012225449085235596, "rewards/rejected": 0.8290067911148071, "step": 415 }, { "epoch": 0.22, "learning_rate": 9.947531997255255e-08, "logits/chosen": -2.0433881282806396, "logits/rejected": -2.0933055877685547, "logps/chosen": -5.9388957023620605, "logps/rejected": -11.136988639831543, "loss": 0.4941, "rewards/accuracies": 1.0, "rewards/chosen": 1.0632749795913696, "rewards/margins": 0.4476918578147888, "rewards/rejected": 0.6155831217765808, "step": 416 }, { "epoch": 0.22, "learning_rate": 9.947110467309298e-08, "logits/chosen": -2.059903383255005, "logits/rejected": -2.0586860179901123, "logps/chosen": -2.30936861038208, "logps/rejected": -5.333141326904297, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 0.9594680666923523, "rewards/margins": 0.36263418197631836, "rewards/rejected": 0.5968338847160339, "step": 417 }, { "epoch": 0.23, "learning_rate": 9.946687259840498e-08, "logits/chosen": -2.0598866939544678, "logits/rejected": -2.03596830368042, "logps/chosen": -21.24955940246582, "logps/rejected": -2.719712018966675, "loss": 0.5137, "rewards/accuracies": 1.0, "rewards/chosen": 0.8956241607666016, "rewards/margins": 0.3982640206813812, "rewards/rejected": 0.49736014008522034, "step": 418 }, { "epoch": 0.23, "learning_rate": 9.946262374992358e-08, "logits/chosen": -2.083284616470337, "logits/rejected": -2.3135781288146973, "logps/chosen": -2.7777633666992188, "logps/rejected": -2.8261470794677734, "loss": 0.7069, "rewards/accuracies": 0.0, "rewards/chosen": 0.6605227589607239, "rewards/margins": -0.0272790789604187, "rewards/rejected": 0.6878018379211426, "step": 419 }, { "epoch": 0.23, "learning_rate": 9.945835812908955e-08, "logits/chosen": -2.087085247039795, "logits/rejected": -2.2396037578582764, "logps/chosen": -1.2879791259765625, "logps/rejected": -2.3226397037506104, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.7851411700248718, "rewards/margins": 0.021787166595458984, "rewards/rejected": 0.7633540034294128, "step": 420 }, { "epoch": 0.23, "learning_rate": 9.94540757373493e-08, "logits/chosen": -2.079033136367798, "logits/rejected": -2.2487945556640625, "logps/chosen": -1.6413893699645996, "logps/rejected": -1.66470205783844, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.7012436389923096, "rewards/margins": 0.004805862903594971, "rewards/rejected": 0.6964377760887146, "step": 421 }, { "epoch": 0.23, "learning_rate": 9.944977657615499e-08, "logits/chosen": -2.0578181743621826, "logits/rejected": -2.258551836013794, "logps/chosen": -1.6944944858551025, "logps/rejected": -1.5446693897247314, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.722084641456604, "rewards/margins": 0.004983067512512207, "rewards/rejected": 0.7171015739440918, "step": 422 }, { "epoch": 0.23, "learning_rate": 9.944546064696438e-08, "logits/chosen": -2.027158737182617, "logits/rejected": -2.271798849105835, "logps/chosen": -10.193449020385742, "logps/rejected": -6.427292346954346, "loss": 0.7883, "rewards/accuracies": 0.0, "rewards/chosen": 0.7508298754692078, "rewards/margins": -0.18196260929107666, "rewards/rejected": 0.9327924847602844, "step": 423 }, { "epoch": 0.23, "learning_rate": 9.9441127951241e-08, "logits/chosen": -1.9917088747024536, "logits/rejected": -1.9926670789718628, "logps/chosen": -3.5600173473358154, "logps/rejected": -4.522427082061768, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 0.7922335863113403, "rewards/margins": 0.37163224816322327, "rewards/rejected": 0.42060133814811707, "step": 424 }, { "epoch": 0.23, "learning_rate": 9.943677849045401e-08, "logits/chosen": -2.0516018867492676, "logits/rejected": -2.052971363067627, "logps/chosen": -10.358366012573242, "logps/rejected": -3.3835701942443848, "loss": 0.5274, "rewards/accuracies": 1.0, "rewards/chosen": 0.9960840344429016, "rewards/margins": 0.3646399974822998, "rewards/rejected": 0.6314440369606018, "step": 425 }, { "epoch": 0.23, "learning_rate": 9.943241226607833e-08, "logits/chosen": -2.1610682010650635, "logits/rejected": -2.2729501724243164, "logps/chosen": -7.568031311035156, "logps/rejected": -28.971216201782227, "loss": 0.3909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9646793603897095, "rewards/margins": 0.7373526096343994, "rewards/rejected": 0.22732678055763245, "step": 426 }, { "epoch": 0.23, "learning_rate": 9.942802927959441e-08, "logits/chosen": -2.0527708530426025, "logits/rejected": -2.3056445121765137, "logps/chosen": -1.678112506866455, "logps/rejected": -1.7917169332504272, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.6686956286430359, "rewards/margins": 0.02784121036529541, "rewards/rejected": 0.6408544182777405, "step": 427 }, { "epoch": 0.23, "learning_rate": 9.942362953248857e-08, "logits/chosen": -2.2134933471679688, "logits/rejected": -2.3237709999084473, "logps/chosen": -1.5744043588638306, "logps/rejected": -1.5741899013519287, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.865848183631897, "rewards/margins": 0.002613663673400879, "rewards/rejected": 0.8632345199584961, "step": 428 }, { "epoch": 0.23, "learning_rate": 9.941921302625269e-08, "logits/chosen": -2.045865297317505, "logits/rejected": -2.2618980407714844, "logps/chosen": -1.6922014951705933, "logps/rejected": -1.5861870050430298, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.8272881507873535, "rewards/margins": 0.02625828981399536, "rewards/rejected": 0.8010298609733582, "step": 429 }, { "epoch": 0.23, "learning_rate": 9.941477976238438e-08, "logits/chosen": -1.986388921737671, "logits/rejected": -1.9929783344268799, "logps/chosen": -6.636472702026367, "logps/rejected": -5.7529520988464355, "loss": 0.4597, "rewards/accuracies": 1.0, "rewards/chosen": 1.0524193048477173, "rewards/margins": 0.5385738015174866, "rewards/rejected": 0.5138455033302307, "step": 430 }, { "epoch": 0.23, "learning_rate": 9.941032974238691e-08, "logits/chosen": -2.1599578857421875, "logits/rejected": -2.1544806957244873, "logps/chosen": -8.137004852294922, "logps/rejected": -6.421879768371582, "loss": 0.4561, "rewards/accuracies": 1.0, "rewards/chosen": 0.9965538382530212, "rewards/margins": 0.5484116077423096, "rewards/rejected": 0.44814226031303406, "step": 431 }, { "epoch": 0.23, "learning_rate": 9.940586296776925e-08, "logits/chosen": -2.0123491287231445, "logits/rejected": -2.01863956451416, "logps/chosen": -2.6808547973632812, "logps/rejected": -5.101343631744385, "loss": 0.5008, "rewards/accuracies": 1.0, "rewards/chosen": 0.9051519632339478, "rewards/margins": 0.4305954575538635, "rewards/rejected": 0.47455650568008423, "step": 432 }, { "epoch": 0.23, "learning_rate": 9.940137944004605e-08, "logits/chosen": -2.009531259536743, "logits/rejected": -2.015256643295288, "logps/chosen": -3.9190356731414795, "logps/rejected": -4.7530317306518555, "loss": 0.5105, "rewards/accuracies": 1.0, "rewards/chosen": 0.8952662348747253, "rewards/margins": 0.40637263655662537, "rewards/rejected": 0.4888935983181, "step": 433 }, { "epoch": 0.23, "learning_rate": 9.939687916073763e-08, "logits/chosen": -2.0665485858917236, "logits/rejected": -2.0721967220306396, "logps/chosen": -10.410897254943848, "logps/rejected": -1.6734790802001953, "loss": 0.6228, "rewards/accuracies": 1.0, "rewards/chosen": 0.878279983997345, "rewards/margins": 0.1459762454032898, "rewards/rejected": 0.7323037385940552, "step": 434 }, { "epoch": 0.23, "learning_rate": 9.939236213136999e-08, "logits/chosen": -2.070315361022949, "logits/rejected": -2.0656864643096924, "logps/chosen": -7.06215238571167, "logps/rejected": -10.417933464050293, "loss": 0.6516, "rewards/accuracies": 1.0, "rewards/chosen": 0.9716898202896118, "rewards/margins": 0.08490896224975586, "rewards/rejected": 0.886780858039856, "step": 435 }, { "epoch": 0.24, "learning_rate": 9.938782835347483e-08, "logits/chosen": -1.9443416595458984, "logits/rejected": -1.9437545537948608, "logps/chosen": -1.9491517543792725, "logps/rejected": -1.789790391921997, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.7453964352607727, "rewards/margins": 0.028921961784362793, "rewards/rejected": 0.7164744734764099, "step": 436 }, { "epoch": 0.24, "learning_rate": 9.93832778285895e-08, "logits/chosen": -2.054725170135498, "logits/rejected": -2.3133153915405273, "logps/chosen": -4.040816307067871, "logps/rejected": -1.740933895111084, "loss": 0.7272, "rewards/accuracies": 0.0, "rewards/chosen": 0.5297999382019043, "rewards/margins": -0.06696242094039917, "rewards/rejected": 0.5967623591423035, "step": 437 }, { "epoch": 0.24, "learning_rate": 9.937871055825707e-08, "logits/chosen": -2.13035249710083, "logits/rejected": -2.1248619556427, "logps/chosen": -6.816621780395508, "logps/rejected": -4.749639987945557, "loss": 0.6714, "rewards/accuracies": 1.0, "rewards/chosen": 1.0202487707138062, "rewards/margins": 0.04390996694564819, "rewards/rejected": 0.976338803768158, "step": 438 }, { "epoch": 0.24, "learning_rate": 9.937412654402624e-08, "logits/chosen": -2.1635026931762695, "logits/rejected": -2.269214391708374, "logps/chosen": -4.265314102172852, "logps/rejected": -4.142309188842773, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.7637374997138977, "rewards/margins": -0.0002359151840209961, "rewards/rejected": 0.7639734148979187, "step": 439 }, { "epoch": 0.24, "learning_rate": 9.936952578745141e-08, "logits/chosen": -2.041168451309204, "logits/rejected": -2.047280788421631, "logps/chosen": -3.150240659713745, "logps/rejected": -2.4084696769714355, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": 0.8837186694145203, "rewards/margins": 0.26090264320373535, "rewards/rejected": 0.6228160262107849, "step": 440 }, { "epoch": 0.24, "learning_rate": 9.936490829009266e-08, "logits/chosen": -2.0374176502227783, "logits/rejected": -2.289283514022827, "logps/chosen": -14.974161148071289, "logps/rejected": -15.29334545135498, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.5819732546806335, "rewards/margins": 0.016463160514831543, "rewards/rejected": 0.565510094165802, "step": 441 }, { "epoch": 0.24, "learning_rate": 9.936027405351575e-08, "logits/chosen": -2.0070748329162598, "logits/rejected": -2.2213406562805176, "logps/chosen": -1.4705241918563843, "logps/rejected": -1.4511665105819702, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.7971187233924866, "rewards/margins": -0.002007603645324707, "rewards/rejected": 0.7991263270378113, "step": 442 }, { "epoch": 0.24, "learning_rate": 9.93556230792921e-08, "logits/chosen": -2.043726682662964, "logits/rejected": -2.0507709980010986, "logps/chosen": -2.9179317951202393, "logps/rejected": -3.4859750270843506, "loss": 0.4502, "rewards/accuracies": 1.0, "rewards/chosen": 1.084757685661316, "rewards/margins": 0.5644263625144958, "rewards/rejected": 0.5203313231468201, "step": 443 }, { "epoch": 0.24, "learning_rate": 9.935095536899885e-08, "logits/chosen": -2.1992275714874268, "logits/rejected": -2.0926122665405273, "logps/chosen": -31.867353439331055, "logps/rejected": -19.978525161743164, "loss": 0.8703, "rewards/accuracies": 0.0, "rewards/chosen": 0.10668964684009552, "rewards/margins": -0.3276418447494507, "rewards/rejected": 0.4343315064907074, "step": 444 }, { "epoch": 0.24, "learning_rate": 9.934627092421873e-08, "logits/chosen": -2.084946870803833, "logits/rejected": -2.2584385871887207, "logps/chosen": -2.036071538925171, "logps/rejected": -1.9237957000732422, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.9050559401512146, "rewards/margins": -0.004463076591491699, "rewards/rejected": 0.9095190167427063, "step": 445 }, { "epoch": 0.24, "learning_rate": 9.93415697465402e-08, "logits/chosen": -2.299119472503662, "logits/rejected": -2.249884843826294, "logps/chosen": -7.486584186553955, "logps/rejected": -7.286174774169922, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.48157602548599243, "rewards/margins": 0.005991309881210327, "rewards/rejected": 0.4755847156047821, "step": 446 }, { "epoch": 0.24, "learning_rate": 9.933685183755744e-08, "logits/chosen": -2.0994985103607178, "logits/rejected": -2.25264835357666, "logps/chosen": -1.348006248474121, "logps/rejected": -1.3723706007003784, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9245917201042175, "rewards/margins": 0.005822360515594482, "rewards/rejected": 0.918769359588623, "step": 447 }, { "epoch": 0.24, "learning_rate": 9.93321171988702e-08, "logits/chosen": -2.157575845718384, "logits/rejected": -2.2764129638671875, "logps/chosen": -2.9326326847076416, "logps/rejected": -2.734184503555298, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.49664565920829773, "rewards/margins": -0.0018145442008972168, "rewards/rejected": 0.49846020340919495, "step": 448 }, { "epoch": 0.24, "learning_rate": 9.932736583208398e-08, "logits/chosen": -2.1014976501464844, "logits/rejected": -2.13396954536438, "logps/chosen": -5.761539936065674, "logps/rejected": -16.362064361572266, "loss": 0.4239, "rewards/accuracies": 1.0, "rewards/chosen": 1.0694410800933838, "rewards/margins": 0.6389326453208923, "rewards/rejected": 0.43050843477249146, "step": 449 }, { "epoch": 0.24, "learning_rate": 9.932259773880993e-08, "logits/chosen": -2.125253915786743, "logits/rejected": -2.124958038330078, "logps/chosen": -2.370793104171753, "logps/rejected": -1.3894850015640259, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.9140484929084778, "rewards/margins": 0.15000402927398682, "rewards/rejected": 0.764044463634491, "step": 450 }, { "epoch": 0.24, "learning_rate": 9.931781292066486e-08, "logits/chosen": -1.98301100730896, "logits/rejected": -2.2534339427948, "logps/chosen": -1.4787750244140625, "logps/rejected": -1.4838995933532715, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.8100002408027649, "rewards/margins": 0.02879232168197632, "rewards/rejected": 0.7812079191207886, "step": 451 }, { "epoch": 0.24, "learning_rate": 9.931301137927126e-08, "logits/chosen": -1.9667562246322632, "logits/rejected": -2.2914159297943115, "logps/chosen": -1.1112706661224365, "logps/rejected": -1.1625962257385254, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.7332367897033691, "rewards/margins": 0.010486900806427002, "rewards/rejected": 0.7227498888969421, "step": 452 }, { "epoch": 0.24, "learning_rate": 9.930819311625729e-08, "logits/chosen": -2.0510447025299072, "logits/rejected": -2.288944959640503, "logps/chosen": -3.586956024169922, "logps/rejected": -4.658337116241455, "loss": 0.6484, "rewards/accuracies": 1.0, "rewards/chosen": 0.8799166679382324, "rewards/margins": 0.09164738655090332, "rewards/rejected": 0.7882692813873291, "step": 453 }, { "epoch": 0.24, "learning_rate": 9.930335813325679e-08, "logits/chosen": -2.0927987098693848, "logits/rejected": -2.2241122722625732, "logps/chosen": -1.4591132402420044, "logps/rejected": -1.4483197927474976, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8494586944580078, "rewards/margins": 0.009800612926483154, "rewards/rejected": 0.8396580815315247, "step": 454 }, { "epoch": 0.25, "learning_rate": 9.929850643190925e-08, "logits/chosen": -2.005740165710449, "logits/rejected": -2.2347710132598877, "logps/chosen": -2.2168197631835938, "logps/rejected": -2.296595811843872, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941982388496399, "rewards/margins": 0.03720635175704956, "rewards/rejected": 0.8569918870925903, "step": 455 }, { "epoch": 0.25, "learning_rate": 9.929363801385985e-08, "logits/chosen": -2.0629093647003174, "logits/rejected": -2.282484292984009, "logps/chosen": -8.72753620147705, "logps/rejected": -8.735275268554688, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.5912501215934753, "rewards/margins": 0.033136844635009766, "rewards/rejected": 0.5581132769584656, "step": 456 }, { "epoch": 0.25, "learning_rate": 9.928875288075944e-08, "logits/chosen": -2.0337679386138916, "logits/rejected": -2.2694249153137207, "logps/chosen": -3.7426095008850098, "logps/rejected": -3.586430072784424, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.658900797367096, "rewards/margins": 0.006294548511505127, "rewards/rejected": 0.6526062488555908, "step": 457 }, { "epoch": 0.25, "learning_rate": 9.928385103426451e-08, "logits/chosen": -2.0748069286346436, "logits/rejected": -2.0761101245880127, "logps/chosen": -4.471127510070801, "logps/rejected": -3.4536876678466797, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.8194987177848816, "rewards/margins": 0.03690505027770996, "rewards/rejected": 0.7825936675071716, "step": 458 }, { "epoch": 0.25, "learning_rate": 9.927893247603724e-08, "logits/chosen": -1.9617376327514648, "logits/rejected": -1.9638231992721558, "logps/chosen": -3.7658047676086426, "logps/rejected": -1.3763234615325928, "loss": 0.6501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9948744177818298, "rewards/margins": 0.08797574043273926, "rewards/rejected": 0.9068986773490906, "step": 459 }, { "epoch": 0.25, "learning_rate": 9.927399720774547e-08, "logits/chosen": -2.143796682357788, "logits/rejected": -2.3129870891571045, "logps/chosen": -10.217793464660645, "logps/rejected": -12.945149421691895, "loss": 0.7434, "rewards/accuracies": 0.0, "rewards/chosen": 0.6465622782707214, "rewards/margins": -0.09813332557678223, "rewards/rejected": 0.7446956038475037, "step": 460 }, { "epoch": 0.25, "learning_rate": 9.926904523106269e-08, "logits/chosen": -2.108372449874878, "logits/rejected": -2.2809741497039795, "logps/chosen": -3.7720372676849365, "logps/rejected": -3.4209611415863037, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.5607194900512695, "rewards/margins": 0.03271299600601196, "rewards/rejected": 0.5280064940452576, "step": 461 }, { "epoch": 0.25, "learning_rate": 9.926407654766811e-08, "logits/chosen": -2.0329484939575195, "logits/rejected": -2.217257022857666, "logps/chosen": -1.4988715648651123, "logps/rejected": -1.4527435302734375, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.874264657497406, "rewards/margins": 0.02045583724975586, "rewards/rejected": 0.8538088202476501, "step": 462 }, { "epoch": 0.25, "learning_rate": 9.925909115924655e-08, "logits/chosen": -2.0281457901000977, "logits/rejected": -2.036271810531616, "logps/chosen": -2.1571755409240723, "logps/rejected": -4.367532253265381, "loss": 0.4822, "rewards/accuracies": 1.0, "rewards/chosen": 0.9624630212783813, "rewards/margins": 0.47855469584465027, "rewards/rejected": 0.4839083254337311, "step": 463 }, { "epoch": 0.25, "learning_rate": 9.925408906748849e-08, "logits/chosen": -1.991594672203064, "logits/rejected": -1.9894261360168457, "logps/chosen": -0.9613337516784668, "logps/rejected": -10.62264633178711, "loss": 0.4918, "rewards/accuracies": 1.0, "rewards/chosen": 0.8846277594566345, "rewards/margins": 0.4538010060787201, "rewards/rejected": 0.43082675337791443, "step": 464 }, { "epoch": 0.25, "learning_rate": 9.924907027409013e-08, "logits/chosen": -1.9975523948669434, "logits/rejected": -2.246375322341919, "logps/chosen": -2.243884801864624, "logps/rejected": -2.1271653175354004, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.8701285719871521, "rewards/margins": 0.03429841995239258, "rewards/rejected": 0.8358301520347595, "step": 465 }, { "epoch": 0.25, "learning_rate": 9.92440347807533e-08, "logits/chosen": -2.0508675575256348, "logits/rejected": -2.2674219608306885, "logps/chosen": -1.5846683979034424, "logps/rejected": -1.739353895187378, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.9519863128662109, "rewards/margins": 0.039105117321014404, "rewards/rejected": 0.9128811955451965, "step": 466 }, { "epoch": 0.25, "learning_rate": 9.923898258918544e-08, "logits/chosen": -2.0283522605895996, "logits/rejected": -2.021211862564087, "logps/chosen": -9.609999656677246, "logps/rejected": -7.307793140411377, "loss": 0.4521, "rewards/accuracies": 1.0, "rewards/chosen": 1.1774731874465942, "rewards/margins": 0.5592929720878601, "rewards/rejected": 0.6181802153587341, "step": 467 }, { "epoch": 0.25, "learning_rate": 9.923391370109979e-08, "logits/chosen": -1.9743083715438843, "logits/rejected": -1.9745256900787354, "logps/chosen": -1.707465648651123, "logps/rejected": -1.82603120803833, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8840298056602478, "rewards/margins": 0.007521092891693115, "rewards/rejected": 0.8765087127685547, "step": 468 }, { "epoch": 0.25, "learning_rate": 9.922882811821509e-08, "logits/chosen": -1.9541020393371582, "logits/rejected": -2.198737382888794, "logps/chosen": -2.186823844909668, "logps/rejected": -2.121295928955078, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7195166945457458, "rewards/margins": 0.0127640962600708, "rewards/rejected": 0.706752598285675, "step": 469 }, { "epoch": 0.25, "learning_rate": 9.922372584225585e-08, "logits/chosen": -2.052445411682129, "logits/rejected": -2.055912971496582, "logps/chosen": -2.0967438220977783, "logps/rejected": -4.064013481140137, "loss": 0.4678, "rewards/accuracies": 1.0, "rewards/chosen": 1.0360816717147827, "rewards/margins": 0.5167074799537659, "rewards/rejected": 0.5193741917610168, "step": 470 }, { "epoch": 0.25, "learning_rate": 9.921860687495222e-08, "logits/chosen": -2.0656955242156982, "logits/rejected": -2.1091079711914062, "logps/chosen": -7.1621174812316895, "logps/rejected": -23.54322052001953, "loss": 0.2887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9828667044639587, "rewards/margins": 1.094704270362854, "rewards/rejected": -0.11183758080005646, "step": 471 }, { "epoch": 0.25, "learning_rate": 9.921347121803998e-08, "logits/chosen": -2.0735814571380615, "logits/rejected": -2.2610220909118652, "logps/chosen": -5.984420299530029, "logps/rejected": -0.980964183807373, "loss": 0.8041, "rewards/accuracies": 0.0, "rewards/chosen": 0.5993160605430603, "rewards/margins": -0.21075016260147095, "rewards/rejected": 0.8100662231445312, "step": 472 }, { "epoch": 0.26, "learning_rate": 9.92083188732606e-08, "logits/chosen": -2.20632266998291, "logits/rejected": -2.2059309482574463, "logps/chosen": -1.3112934827804565, "logps/rejected": -2.645108222961426, "loss": 0.5304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070086479187012, "rewards/margins": 0.35717087984085083, "rewards/rejected": 0.5498377680778503, "step": 473 }, { "epoch": 0.26, "learning_rate": 9.920314984236118e-08, "logits/chosen": -2.1167116165161133, "logits/rejected": -2.067671775817871, "logps/chosen": -24.117950439453125, "logps/rejected": -3.736473560333252, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.0127887725830078, "rewards/margins": 0.42629092931747437, "rewards/rejected": 0.5864978432655334, "step": 474 }, { "epoch": 0.26, "learning_rate": 9.919796412709449e-08, "logits/chosen": -2.0697669982910156, "logits/rejected": -2.2556309700012207, "logps/chosen": -2.3398900032043457, "logps/rejected": -2.3435921669006348, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.7308551669120789, "rewards/margins": 0.005291759967803955, "rewards/rejected": 0.7255634069442749, "step": 475 }, { "epoch": 0.26, "learning_rate": 9.9192761729219e-08, "logits/chosen": -2.1970009803771973, "logits/rejected": -2.239084243774414, "logps/chosen": -10.156576156616211, "logps/rejected": -5.747710227966309, "loss": 0.7706, "rewards/accuracies": 0.0, "rewards/chosen": 0.657214343547821, "rewards/margins": -0.14924347400665283, "rewards/rejected": 0.8064578175544739, "step": 476 }, { "epoch": 0.26, "learning_rate": 9.918754265049876e-08, "logits/chosen": -2.0776681900024414, "logits/rejected": -2.0084152221679688, "logps/chosen": -53.81886291503906, "logps/rejected": -3.323978900909424, "loss": 0.6637, "rewards/accuracies": 1.0, "rewards/chosen": 0.4619606137275696, "rewards/margins": 0.059790611267089844, "rewards/rejected": 0.40217000246047974, "step": 477 }, { "epoch": 0.26, "learning_rate": 9.918230689270353e-08, "logits/chosen": -2.2775678634643555, "logits/rejected": -2.285109519958496, "logps/chosen": -5.4667863845825195, "logps/rejected": -3.957434892654419, "loss": 0.4507, "rewards/accuracies": 1.0, "rewards/chosen": 1.2488257884979248, "rewards/margins": 0.5630548596382141, "rewards/rejected": 0.6857709288597107, "step": 478 }, { "epoch": 0.26, "learning_rate": 9.917705445760869e-08, "logits/chosen": -2.2320709228515625, "logits/rejected": -2.1999869346618652, "logps/chosen": -32.29376220703125, "logps/rejected": -33.45516586303711, "loss": 0.5529, "rewards/accuracies": 1.0, "rewards/chosen": 0.6856910586357117, "rewards/margins": 0.3033958375453949, "rewards/rejected": 0.3822952210903168, "step": 479 }, { "epoch": 0.26, "learning_rate": 9.917178534699533e-08, "logits/chosen": -2.1415786743164062, "logits/rejected": -2.1302707195281982, "logps/chosen": -3.7985591888427734, "logps/rejected": -6.562707901000977, "loss": 0.487, "rewards/accuracies": 1.0, "rewards/chosen": 0.9517847895622253, "rewards/margins": 0.46616342663764954, "rewards/rejected": 0.4856213629245758, "step": 480 }, { "epoch": 0.26, "learning_rate": 9.916649956265016e-08, "logits/chosen": -2.1151063442230225, "logits/rejected": -2.0203254222869873, "logps/chosen": -25.43071174621582, "logps/rejected": -4.917394161224365, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": 0.8588289618492126, "rewards/margins": 0.17645937204360962, "rewards/rejected": 0.682369589805603, "step": 481 }, { "epoch": 0.26, "learning_rate": 9.91611971063655e-08, "logits/chosen": -1.975743293762207, "logits/rejected": -2.239076614379883, "logps/chosen": -1.8136825561523438, "logps/rejected": -7.524742126464844, "loss": 0.5692, "rewards/accuracies": 1.0, "rewards/chosen": 0.7586946487426758, "rewards/margins": 0.26542186737060547, "rewards/rejected": 0.4932727813720703, "step": 482 }, { "epoch": 0.26, "learning_rate": 9.91558779799394e-08, "logits/chosen": -1.936888575553894, "logits/rejected": -2.2552151679992676, "logps/chosen": -1.1699541807174683, "logps/rejected": -1.2261927127838135, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.7765881419181824, "rewards/margins": 0.022165954113006592, "rewards/rejected": 0.7544221878051758, "step": 483 }, { "epoch": 0.26, "learning_rate": 9.915054218517554e-08, "logits/chosen": -2.225355386734009, "logits/rejected": -2.228084087371826, "logps/chosen": -1.547163486480713, "logps/rejected": -3.0260703563690186, "loss": 0.5768, "rewards/accuracies": 1.0, "rewards/chosen": 0.6220718026161194, "rewards/margins": 0.24810397624969482, "rewards/rejected": 0.37396782636642456, "step": 484 }, { "epoch": 0.26, "learning_rate": 9.914518972388324e-08, "logits/chosen": -2.156637191772461, "logits/rejected": -2.3251845836639404, "logps/chosen": -8.296174049377441, "logps/rejected": -6.184150218963623, "loss": 0.7784, "rewards/accuracies": 0.0, "rewards/chosen": 0.3183883726596832, "rewards/margins": -0.16375526785850525, "rewards/rejected": 0.4821436405181885, "step": 485 }, { "epoch": 0.26, "learning_rate": 9.913982059787745e-08, "logits/chosen": -2.0791969299316406, "logits/rejected": -2.302227258682251, "logps/chosen": -6.344446182250977, "logps/rejected": -1.9935643672943115, "loss": 0.7346, "rewards/accuracies": 0.0, "rewards/chosen": 0.7497255206108093, "rewards/margins": -0.08129829168319702, "rewards/rejected": 0.8310238122940063, "step": 486 }, { "epoch": 0.26, "learning_rate": 9.913443480897879e-08, "logits/chosen": -2.274710178375244, "logits/rejected": -2.400235414505005, "logps/chosen": -15.23396110534668, "logps/rejected": -18.15328025817871, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": 0.45645809173583984, "rewards/margins": -0.041969865560531616, "rewards/rejected": 0.49842795729637146, "step": 487 }, { "epoch": 0.26, "learning_rate": 9.912903235901358e-08, "logits/chosen": -2.05073881149292, "logits/rejected": -2.0457029342651367, "logps/chosen": -8.109586715698242, "logps/rejected": -3.719452381134033, "loss": 0.5646, "rewards/accuracies": 1.0, "rewards/chosen": 0.9975734949111938, "rewards/margins": 0.27618908882141113, "rewards/rejected": 0.7213844060897827, "step": 488 }, { "epoch": 0.26, "learning_rate": 9.91236132498137e-08, "logits/chosen": -2.156619071960449, "logits/rejected": -2.2867391109466553, "logps/chosen": -1.302551031112671, "logps/rejected": -1.3432884216308594, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8587251901626587, "rewards/margins": 0.007232367992401123, "rewards/rejected": 0.8514928221702576, "step": 489 }, { "epoch": 0.26, "learning_rate": 9.911817748321674e-08, "logits/chosen": -2.0631704330444336, "logits/rejected": -2.06868577003479, "logps/chosen": -1.4927430152893066, "logps/rejected": -11.033300399780273, "loss": 0.5979, "rewards/accuracies": 1.0, "rewards/chosen": 0.9418699145317078, "rewards/margins": 0.2004706859588623, "rewards/rejected": 0.7413992285728455, "step": 490 }, { "epoch": 0.26, "learning_rate": 9.911272506106593e-08, "logits/chosen": -1.9921655654907227, "logits/rejected": -2.1880171298980713, "logps/chosen": -2.537170886993408, "logps/rejected": -2.8149526119232178, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8873611688613892, "rewards/margins": 0.012422740459442139, "rewards/rejected": 0.874938428401947, "step": 491 }, { "epoch": 0.27, "learning_rate": 9.910725598521012e-08, "logits/chosen": -2.0694291591644287, "logits/rejected": -2.0672333240509033, "logps/chosen": -13.915656089782715, "logps/rejected": -12.679468154907227, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.8080565333366394, "rewards/margins": -0.016081154346466064, "rewards/rejected": 0.8241376876831055, "step": 492 }, { "epoch": 0.27, "learning_rate": 9.910177025750386e-08, "logits/chosen": -2.0284361839294434, "logits/rejected": -2.302000045776367, "logps/chosen": -1.859939694404602, "logps/rejected": -1.7785873413085938, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8756165504455566, "rewards/margins": 0.006280422210693359, "rewards/rejected": 0.8693361282348633, "step": 493 }, { "epoch": 0.27, "learning_rate": 9.909626787980728e-08, "logits/chosen": -2.068765878677368, "logits/rejected": -2.0722851753234863, "logps/chosen": -2.083949089050293, "logps/rejected": -2.2309226989746094, "loss": 0.5117, "rewards/accuracies": 1.0, "rewards/chosen": 1.0865261554718018, "rewards/margins": 0.4033019542694092, "rewards/rejected": 0.6832242012023926, "step": 494 }, { "epoch": 0.27, "learning_rate": 9.909074885398621e-08, "logits/chosen": -2.2021944522857666, "logits/rejected": -2.2032909393310547, "logps/chosen": -1.0949372053146362, "logps/rejected": -3.1000864505767822, "loss": 0.5213, "rewards/accuracies": 1.0, "rewards/chosen": 0.962865948677063, "rewards/margins": 0.37938159704208374, "rewards/rejected": 0.5834843516349792, "step": 495 }, { "epoch": 0.27, "learning_rate": 9.908521318191207e-08, "logits/chosen": -2.000171184539795, "logits/rejected": -2.253133773803711, "logps/chosen": -3.693622589111328, "logps/rejected": -3.4530725479125977, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.4129420220851898, "rewards/margins": 0.016337960958480835, "rewards/rejected": 0.396604061126709, "step": 496 }, { "epoch": 0.27, "learning_rate": 9.907966086546202e-08, "logits/chosen": -2.033827781677246, "logits/rejected": -2.2636895179748535, "logps/chosen": -5.43010139465332, "logps/rejected": -3.7966713905334473, "loss": 0.7556, "rewards/accuracies": 0.0, "rewards/chosen": 0.6085954904556274, "rewards/margins": -0.12119060754776001, "rewards/rejected": 0.7297860980033875, "step": 497 }, { "epoch": 0.27, "learning_rate": 9.907409190651875e-08, "logits/chosen": -1.9557838439941406, "logits/rejected": -1.9618117809295654, "logps/chosen": -0.9967549443244934, "logps/rejected": -7.103750228881836, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 0.9870859980583191, "rewards/margins": 0.4386274814605713, "rewards/rejected": 0.5484585165977478, "step": 498 }, { "epoch": 0.27, "learning_rate": 9.906850630697066e-08, "logits/chosen": -2.1453661918640137, "logits/rejected": -2.1559648513793945, "logps/chosen": -4.622318267822266, "logps/rejected": -4.521288871765137, "loss": 0.5852, "rewards/accuracies": 1.0, "rewards/chosen": 1.1605160236358643, "rewards/margins": 0.2290058732032776, "rewards/rejected": 0.9315101504325867, "step": 499 }, { "epoch": 0.27, "learning_rate": 9.90629040687118e-08, "logits/chosen": -2.166715145111084, "logits/rejected": -2.2548205852508545, "logps/chosen": -4.3087944984436035, "logps/rejected": -8.688837051391602, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495639801025391, "rewards/margins": 0.14987868070602417, "rewards/rejected": 0.7996852993965149, "step": 500 }, { "epoch": 0.27, "learning_rate": 9.905728519364182e-08, "logits/chosen": -2.186169147491455, "logits/rejected": -2.1910037994384766, "logps/chosen": -2.9338555335998535, "logps/rejected": -3.7584924697875977, "loss": 0.5228, "rewards/accuracies": 1.0, "rewards/chosen": 0.8286089301109314, "rewards/margins": 0.37587329745292664, "rewards/rejected": 0.45273563265800476, "step": 501 }, { "epoch": 0.27, "learning_rate": 9.905164968366602e-08, "logits/chosen": -2.0522735118865967, "logits/rejected": -2.255176305770874, "logps/chosen": -2.0295634269714355, "logps/rejected": -1.9546245336532593, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.6940845251083374, "rewards/margins": 0.028931617736816406, "rewards/rejected": 0.665152907371521, "step": 502 }, { "epoch": 0.27, "learning_rate": 9.90459975406954e-08, "logits/chosen": -2.1304657459259033, "logits/rejected": -2.1416757106781006, "logps/chosen": -6.0742950439453125, "logps/rejected": -4.114386081695557, "loss": 0.5955, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570160150527954, "rewards/margins": 0.205780029296875, "rewards/rejected": 0.8512359857559204, "step": 503 }, { "epoch": 0.27, "learning_rate": 9.90403287666465e-08, "logits/chosen": -2.0506486892700195, "logits/rejected": -2.3028461933135986, "logps/chosen": -2.1878695487976074, "logps/rejected": -2.3792102336883545, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.5723869204521179, "rewards/margins": 0.036561012268066406, "rewards/rejected": 0.5358259081840515, "step": 504 }, { "epoch": 0.27, "learning_rate": 9.903464336344159e-08, "logits/chosen": -2.0714778900146484, "logits/rejected": -2.0711710453033447, "logps/chosen": -1.7514784336090088, "logps/rejected": -1.6513108015060425, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.8673831820487976, "rewards/margins": 0.031046390533447266, "rewards/rejected": 0.8363367915153503, "step": 505 }, { "epoch": 0.27, "learning_rate": 9.902894133300852e-08, "logits/chosen": -2.0155892372131348, "logits/rejected": -2.2956786155700684, "logps/chosen": -2.616529703140259, "logps/rejected": -2.523177146911621, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8764484524726868, "rewards/margins": 0.002066493034362793, "rewards/rejected": 0.874381959438324, "step": 506 }, { "epoch": 0.27, "learning_rate": 9.90232226772808e-08, "logits/chosen": -2.099933385848999, "logits/rejected": -2.0945568084716797, "logps/chosen": -6.39109992980957, "logps/rejected": -4.635281085968018, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9740411639213562, "rewards/margins": 0.45679914951324463, "rewards/rejected": 0.5172420144081116, "step": 507 }, { "epoch": 0.27, "learning_rate": 9.90174873981976e-08, "logits/chosen": -2.0955376625061035, "logits/rejected": -2.3054542541503906, "logps/chosen": -1.9264068603515625, "logps/rejected": -2.026968479156494, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 1.0187122821807861, "rewards/margins": 0.01794111728668213, "rewards/rejected": 1.000771164894104, "step": 508 }, { "epoch": 0.27, "learning_rate": 9.901173549770367e-08, "logits/chosen": -2.0483758449554443, "logits/rejected": -2.052386522293091, "logps/chosen": -5.7010579109191895, "logps/rejected": -3.062241554260254, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 1.0803078413009644, "rewards/margins": 0.4569295644760132, "rewards/rejected": 0.6233782768249512, "step": 509 }, { "epoch": 0.28, "learning_rate": 9.900596697774948e-08, "logits/chosen": -2.119919776916504, "logits/rejected": -2.2145590782165527, "logps/chosen": -4.402811050415039, "logps/rejected": -4.588527679443359, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.8714160323143005, "rewards/margins": 0.031037449836730957, "rewards/rejected": 0.8403785824775696, "step": 510 }, { "epoch": 0.28, "learning_rate": 9.900018184029104e-08, "logits/chosen": -2.1771132946014404, "logits/rejected": -2.365297794342041, "logps/chosen": -1.784189224243164, "logps/rejected": -1.749709963798523, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 0.751788318157196, "rewards/margins": -0.017187118530273438, "rewards/rejected": 0.7689754366874695, "step": 511 }, { "epoch": 0.28, "learning_rate": 9.899438008729004e-08, "logits/chosen": -2.0712761878967285, "logits/rejected": -2.346195697784424, "logps/chosen": -1.0374584197998047, "logps/rejected": -1.0177611112594604, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.948570728302002, "rewards/margins": 0.017081737518310547, "rewards/rejected": 0.9314889907836914, "step": 512 }, { "epoch": 0.28, "learning_rate": 9.898856172071383e-08, "logits/chosen": -2.037893772125244, "logits/rejected": -2.0211057662963867, "logps/chosen": -21.37179946899414, "logps/rejected": -5.986860275268555, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.9897907376289368, "rewards/margins": -0.013387858867645264, "rewards/rejected": 1.003178596496582, "step": 513 }, { "epoch": 0.28, "learning_rate": 9.898272674253537e-08, "logits/chosen": -2.0183260440826416, "logits/rejected": -2.021374225616455, "logps/chosen": -5.743528842926025, "logps/rejected": -3.185847759246826, "loss": 0.5681, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285457730293274, "rewards/margins": 0.2679222822189331, "rewards/rejected": 0.6606234908103943, "step": 514 }, { "epoch": 0.28, "learning_rate": 9.897687515473324e-08, "logits/chosen": -2.1336252689361572, "logits/rejected": -2.1304078102111816, "logps/chosen": -12.605079650878906, "logps/rejected": -3.7193379402160645, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 1.011928915977478, "rewards/margins": 0.40961676836013794, "rewards/rejected": 0.6023121476173401, "step": 515 }, { "epoch": 0.28, "learning_rate": 9.897100695929165e-08, "logits/chosen": -2.1017186641693115, "logits/rejected": -2.1049869060516357, "logps/chosen": -1.5303237438201904, "logps/rejected": -1.7380859851837158, "loss": 0.5641, "rewards/accuracies": 1.0, "rewards/chosen": 1.017369031906128, "rewards/margins": 0.27728790044784546, "rewards/rejected": 0.7400811314582825, "step": 516 }, { "epoch": 0.28, "learning_rate": 9.896512215820048e-08, "logits/chosen": -2.073904514312744, "logits/rejected": -2.073138475418091, "logps/chosen": -4.421444416046143, "logps/rejected": -5.120613098144531, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 1.0894087553024292, "rewards/margins": 0.42402780055999756, "rewards/rejected": 0.6653809547424316, "step": 517 }, { "epoch": 0.28, "learning_rate": 9.895922075345519e-08, "logits/chosen": -2.153057336807251, "logits/rejected": -2.029524564743042, "logps/chosen": -40.07892990112305, "logps/rejected": -20.422473907470703, "loss": 0.5245, "rewards/accuracies": 1.0, "rewards/chosen": 0.916325032711029, "rewards/margins": 0.3715452551841736, "rewards/rejected": 0.5447797775268555, "step": 518 }, { "epoch": 0.28, "learning_rate": 9.895330274705693e-08, "logits/chosen": -1.9987839460372925, "logits/rejected": -2.00239634513855, "logps/chosen": -2.268446922302246, "logps/rejected": -3.6039702892303467, "loss": 0.5064, "rewards/accuracies": 1.0, "rewards/chosen": 0.9265090227127075, "rewards/margins": 0.4164935350418091, "rewards/rejected": 0.5100154876708984, "step": 519 }, { "epoch": 0.28, "learning_rate": 9.894736814101241e-08, "logits/chosen": -1.9965918064117432, "logits/rejected": -1.9991824626922607, "logps/chosen": -3.088480234146118, "logps/rejected": -3.2523388862609863, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0475609302520752, "rewards/margins": 0.4676218032836914, "rewards/rejected": 0.5799391269683838, "step": 520 }, { "epoch": 0.28, "learning_rate": 9.894141693733402e-08, "logits/chosen": -1.9858237504959106, "logits/rejected": -1.9862706661224365, "logps/chosen": -2.5266027450561523, "logps/rejected": -1.0618915557861328, "loss": 0.6632, "rewards/accuracies": 1.0, "rewards/chosen": 0.8234415054321289, "rewards/margins": 0.060860633850097656, "rewards/rejected": 0.7625808715820312, "step": 521 }, { "epoch": 0.28, "learning_rate": 9.893544913803977e-08, "logits/chosen": -1.9720909595489502, "logits/rejected": -2.256150484085083, "logps/chosen": -0.6932246685028076, "logps/rejected": -0.7049269676208496, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.777245819568634, "rewards/margins": 0.006308913230895996, "rewards/rejected": 0.770936906337738, "step": 522 }, { "epoch": 0.28, "learning_rate": 9.892946474515328e-08, "logits/chosen": -2.146207094192505, "logits/rejected": -2.147791624069214, "logps/chosen": -4.789748191833496, "logps/rejected": -3.368264675140381, "loss": 0.5389, "rewards/accuracies": 1.0, "rewards/chosen": 0.9565787315368652, "rewards/margins": 0.336601197719574, "rewards/rejected": 0.6199775338172913, "step": 523 }, { "epoch": 0.28, "learning_rate": 9.89234637607038e-08, "logits/chosen": -2.1264760494232178, "logits/rejected": -2.312516450881958, "logps/chosen": -5.257915019989014, "logps/rejected": -5.172889709472656, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.6805972456932068, "rewards/margins": 0.022309720516204834, "rewards/rejected": 0.658287525177002, "step": 524 }, { "epoch": 0.28, "learning_rate": 9.891744618672624e-08, "logits/chosen": -2.114318370819092, "logits/rejected": -2.279849052429199, "logps/chosen": -0.7620605230331421, "logps/rejected": -0.7803424596786499, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.887354850769043, "rewards/margins": 0.025105535984039307, "rewards/rejected": 0.8622493147850037, "step": 525 }, { "epoch": 0.28, "learning_rate": 9.891141202526107e-08, "logits/chosen": -2.070256233215332, "logits/rejected": -2.2313408851623535, "logps/chosen": -3.02341890335083, "logps/rejected": -2.9190211296081543, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.6759520173072815, "rewards/margins": -0.004788219928741455, "rewards/rejected": 0.680740237236023, "step": 526 }, { "epoch": 0.28, "learning_rate": 9.890536127835445e-08, "logits/chosen": -2.0608413219451904, "logits/rejected": -2.051180124282837, "logps/chosen": -12.538450241088867, "logps/rejected": -1.652222990989685, "loss": 0.6487, "rewards/accuracies": 1.0, "rewards/chosen": 1.0313446521759033, "rewards/margins": 0.09104007482528687, "rewards/rejected": 0.9403045773506165, "step": 527 }, { "epoch": 0.28, "learning_rate": 9.889929394805811e-08, "logits/chosen": -2.1095542907714844, "logits/rejected": -2.090817928314209, "logps/chosen": -16.870574951171875, "logps/rejected": -5.432055473327637, "loss": 0.7209, "rewards/accuracies": 0.0, "rewards/chosen": 0.6295021176338196, "rewards/margins": -0.054743289947509766, "rewards/rejected": 0.6842454075813293, "step": 528 }, { "epoch": 0.29, "learning_rate": 9.889321003642946e-08, "logits/chosen": -2.1167492866516113, "logits/rejected": -2.240551233291626, "logps/chosen": -3.8305411338806152, "logps/rejected": -3.7855067253112793, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.49291282892227173, "rewards/margins": -0.017902672290802002, "rewards/rejected": 0.5108155012130737, "step": 529 }, { "epoch": 0.29, "learning_rate": 9.88871095455315e-08, "logits/chosen": -2.078019142150879, "logits/rejected": -2.274899482727051, "logps/chosen": -3.3084659576416016, "logps/rejected": -3.139927864074707, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.9283767938613892, "rewards/margins": -0.0009168386459350586, "rewards/rejected": 0.9292936325073242, "step": 530 }, { "epoch": 0.29, "learning_rate": 9.888099247743283e-08, "logits/chosen": -2.060652256011963, "logits/rejected": -1.9735181331634521, "logps/chosen": -37.811702728271484, "logps/rejected": -2.3782522678375244, "loss": 0.5703, "rewards/accuracies": 1.0, "rewards/chosen": 0.7797630429267883, "rewards/margins": 0.2629404067993164, "rewards/rejected": 0.5168226361274719, "step": 531 }, { "epoch": 0.29, "learning_rate": 9.887485883420771e-08, "logits/chosen": -2.075881004333496, "logits/rejected": -2.230285882949829, "logps/chosen": -4.838769912719727, "logps/rejected": -4.820245742797852, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.8020860552787781, "rewards/margins": -0.004754543304443359, "rewards/rejected": 0.8068405985832214, "step": 532 }, { "epoch": 0.29, "learning_rate": 9.886870861793601e-08, "logits/chosen": -2.1562867164611816, "logits/rejected": -2.130176544189453, "logps/chosen": -12.203160285949707, "logps/rejected": -16.98650360107422, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": 0.6280555129051208, "rewards/margins": 0.3575904071331024, "rewards/rejected": 0.27046510577201843, "step": 533 }, { "epoch": 0.29, "learning_rate": 9.88625418307032e-08, "logits/chosen": -2.218355417251587, "logits/rejected": -2.212275266647339, "logps/chosen": -5.614764213562012, "logps/rejected": -2.146531343460083, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9272632598876953, "rewards/margins": 0.22733813524246216, "rewards/rejected": 0.6999251246452332, "step": 534 }, { "epoch": 0.29, "learning_rate": 9.88563584746004e-08, "logits/chosen": -2.0655252933502197, "logits/rejected": -2.255852699279785, "logps/chosen": -2.0963680744171143, "logps/rejected": -2.0272490978240967, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.7251990437507629, "rewards/margins": 0.018460392951965332, "rewards/rejected": 0.7067386507987976, "step": 535 }, { "epoch": 0.29, "learning_rate": 9.885015855172434e-08, "logits/chosen": -2.0982887744903564, "logits/rejected": -2.093614339828491, "logps/chosen": -10.42471694946289, "logps/rejected": -3.091233253479004, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.0544852018356323, "rewards/margins": 0.508910596370697, "rewards/rejected": 0.5455746054649353, "step": 536 }, { "epoch": 0.29, "learning_rate": 9.884394206417734e-08, "logits/chosen": -2.1221234798431396, "logits/rejected": -2.123152494430542, "logps/chosen": -1.3125735521316528, "logps/rejected": -6.384261131286621, "loss": 0.4953, "rewards/accuracies": 1.0, "rewards/chosen": 0.8797840476036072, "rewards/margins": 0.44477370381355286, "rewards/rejected": 0.4350103437900543, "step": 537 }, { "epoch": 0.29, "learning_rate": 9.883770901406738e-08, "logits/chosen": -1.9806478023529053, "logits/rejected": -1.9797357320785522, "logps/chosen": -1.1014890670776367, "logps/rejected": -2.9984562397003174, "loss": 0.588, "rewards/accuracies": 1.0, "rewards/chosen": 0.9212931990623474, "rewards/margins": 0.22262787818908691, "rewards/rejected": 0.6986653208732605, "step": 538 }, { "epoch": 0.29, "learning_rate": 9.883145940350801e-08, "logits/chosen": -2.0159456729888916, "logits/rejected": -2.0140018463134766, "logps/chosen": -1.8960070610046387, "logps/rejected": -3.9668610095977783, "loss": 0.556, "rewards/accuracies": 1.0, "rewards/chosen": 0.8406227231025696, "rewards/margins": 0.29620617628097534, "rewards/rejected": 0.5444165468215942, "step": 539 }, { "epoch": 0.29, "learning_rate": 9.882519323461844e-08, "logits/chosen": -2.0300350189208984, "logits/rejected": -2.021745443344116, "logps/chosen": -5.098915100097656, "logps/rejected": -5.778911113739014, "loss": 0.5526, "rewards/accuracies": 1.0, "rewards/chosen": 1.0900250673294067, "rewards/margins": 0.30417943000793457, "rewards/rejected": 0.7858456373214722, "step": 540 }, { "epoch": 0.29, "learning_rate": 9.881891050952344e-08, "logits/chosen": -2.016125202178955, "logits/rejected": -2.338249444961548, "logps/chosen": -1.5625848770141602, "logps/rejected": -1.3743212223052979, "loss": 0.7201, "rewards/accuracies": 0.0, "rewards/chosen": 0.9288984537124634, "rewards/margins": -0.05324441194534302, "rewards/rejected": 0.9821428656578064, "step": 541 }, { "epoch": 0.29, "learning_rate": 9.881261123035349e-08, "logits/chosen": -2.0295372009277344, "logits/rejected": -2.298297882080078, "logps/chosen": -1.25961434841156, "logps/rejected": -1.1526787281036377, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.7980306148529053, "rewards/margins": 0.025004148483276367, "rewards/rejected": 0.7730264663696289, "step": 542 }, { "epoch": 0.29, "learning_rate": 9.880629539924456e-08, "logits/chosen": -2.080688953399658, "logits/rejected": -2.0804600715637207, "logps/chosen": -1.939440369606018, "logps/rejected": -1.7638156414031982, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.7761218547821045, "rewards/margins": 0.019651710987091064, "rewards/rejected": 0.7564701437950134, "step": 543 }, { "epoch": 0.29, "learning_rate": 9.879996301833833e-08, "logits/chosen": -2.106370449066162, "logits/rejected": -2.2873098850250244, "logps/chosen": -8.779139518737793, "logps/rejected": -35.957481384277344, "loss": 0.3701, "rewards/accuracies": 1.0, "rewards/chosen": 1.07975172996521, "rewards/margins": 0.8031667470932007, "rewards/rejected": 0.27658501267433167, "step": 544 }, { "epoch": 0.29, "learning_rate": 9.879361408978205e-08, "logits/chosen": -2.019737958908081, "logits/rejected": -2.2624213695526123, "logps/chosen": -1.1240931749343872, "logps/rejected": -1.0621938705444336, "loss": 0.6682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8910408020019531, "rewards/margins": 0.05050086975097656, "rewards/rejected": 0.8405399322509766, "step": 545 }, { "epoch": 0.29, "learning_rate": 9.878724861572857e-08, "logits/chosen": -2.012741804122925, "logits/rejected": -2.2459876537323, "logps/chosen": -0.7897710204124451, "logps/rejected": -0.8703033924102783, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.8317425847053528, "rewards/margins": -0.016276955604553223, "rewards/rejected": 0.848019540309906, "step": 546 }, { "epoch": 0.3, "learning_rate": 9.878086659833638e-08, "logits/chosen": -2.1645662784576416, "logits/rejected": -2.3410212993621826, "logps/chosen": -1.824663758277893, "logps/rejected": -1.9971519708633423, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.9359955191612244, "rewards/margins": 0.02643418312072754, "rewards/rejected": 0.9095613360404968, "step": 547 }, { "epoch": 0.3, "learning_rate": 9.877446803976959e-08, "logits/chosen": -1.9637244939804077, "logits/rejected": -2.2685608863830566, "logps/chosen": -0.6886667609214783, "logps/rejected": -0.7258634567260742, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.8623577356338501, "rewards/margins": 0.024482250213623047, "rewards/rejected": 0.837875485420227, "step": 548 }, { "epoch": 0.3, "learning_rate": 9.876805294219786e-08, "logits/chosen": -2.0499022006988525, "logits/rejected": -2.053244113922119, "logps/chosen": -1.1337109804153442, "logps/rejected": -4.255516529083252, "loss": 0.5617, "rewards/accuracies": 1.0, "rewards/chosen": 0.7733618021011353, "rewards/margins": 0.2828621566295624, "rewards/rejected": 0.4904996454715729, "step": 549 }, { "epoch": 0.3, "learning_rate": 9.876162130779651e-08, "logits/chosen": -2.061875104904175, "logits/rejected": -2.0592873096466064, "logps/chosen": -4.142260551452637, "logps/rejected": -5.575295448303223, "loss": 0.5209, "rewards/accuracies": 1.0, "rewards/chosen": 1.1325312852859497, "rewards/margins": 0.3804571032524109, "rewards/rejected": 0.7520741820335388, "step": 550 }, { "epoch": 0.3, "learning_rate": 9.875517313874644e-08, "logits/chosen": -2.04219651222229, "logits/rejected": -2.1827685832977295, "logps/chosen": -2.2243869304656982, "logps/rejected": -2.3739638328552246, "loss": 0.663, "rewards/accuracies": 1.0, "rewards/chosen": 0.7177016139030457, "rewards/margins": 0.061329782009124756, "rewards/rejected": 0.6563718318939209, "step": 551 }, { "epoch": 0.3, "learning_rate": 9.874870843723419e-08, "logits/chosen": -2.1236791610717773, "logits/rejected": -2.1382784843444824, "logps/chosen": -1.2056739330291748, "logps/rejected": -8.551796913146973, "loss": 0.5157, "rewards/accuracies": 1.0, "rewards/chosen": 0.8925329446792603, "rewards/margins": 0.39324885606765747, "rewards/rejected": 0.4992840886116028, "step": 552 }, { "epoch": 0.3, "learning_rate": 9.874222720545189e-08, "logits/chosen": -2.1870992183685303, "logits/rejected": -2.0927066802978516, "logps/chosen": -31.36623191833496, "logps/rejected": -3.656341075897217, "loss": 0.5697, "rewards/accuracies": 1.0, "rewards/chosen": 0.8569712042808533, "rewards/margins": 0.2642401456832886, "rewards/rejected": 0.5927310585975647, "step": 553 }, { "epoch": 0.3, "learning_rate": 9.873572944559723e-08, "logits/chosen": -1.9676238298416138, "logits/rejected": -2.218562602996826, "logps/chosen": -1.5380593538284302, "logps/rejected": -1.5705621242523193, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8458549380302429, "rewards/margins": 0.018052756786346436, "rewards/rejected": 0.8278021812438965, "step": 554 }, { "epoch": 0.3, "learning_rate": 9.872921515987359e-08, "logits/chosen": -2.036313772201538, "logits/rejected": -2.326779842376709, "logps/chosen": -1.0110900402069092, "logps/rejected": -0.9334367513656616, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.7898549437522888, "rewards/margins": 0.01545262336730957, "rewards/rejected": 0.7744023203849792, "step": 555 }, { "epoch": 0.3, "learning_rate": 9.87226843504899e-08, "logits/chosen": -1.9942982196807861, "logits/rejected": -1.9832196235656738, "logps/chosen": -16.73763656616211, "logps/rejected": -6.031959533691406, "loss": 0.4743, "rewards/accuracies": 1.0, "rewards/chosen": 1.432871699333191, "rewards/margins": 0.499353289604187, "rewards/rejected": 0.9335184097290039, "step": 556 }, { "epoch": 0.3, "learning_rate": 9.871613701966066e-08, "logits/chosen": -2.021042585372925, "logits/rejected": -2.028874397277832, "logps/chosen": -3.8724005222320557, "logps/rejected": -2.796168804168701, "loss": 0.4175, "rewards/accuracies": 1.0, "rewards/chosen": 1.281447410583496, "rewards/margins": 0.6575453877449036, "rewards/rejected": 0.6239020228385925, "step": 557 }, { "epoch": 0.3, "learning_rate": 9.870957316960607e-08, "logits/chosen": -2.1514546871185303, "logits/rejected": -2.3429572582244873, "logps/chosen": -11.957149505615234, "logps/rejected": -11.159420013427734, "loss": 0.7164, "rewards/accuracies": 0.0, "rewards/chosen": 0.21247254312038422, "rewards/margins": -0.04606170952320099, "rewards/rejected": 0.2585342526435852, "step": 558 }, { "epoch": 0.3, "learning_rate": 9.870299280255184e-08, "logits/chosen": -2.0787174701690674, "logits/rejected": -2.0842838287353516, "logps/chosen": -1.219735860824585, "logps/rejected": -4.77692985534668, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 0.8594867587089539, "rewards/margins": 0.1806638240814209, "rewards/rejected": 0.678822934627533, "step": 559 }, { "epoch": 0.3, "learning_rate": 9.869639592072933e-08, "logits/chosen": -2.0944788455963135, "logits/rejected": -2.3170173168182373, "logps/chosen": -3.349781036376953, "logps/rejected": -3.3944358825683594, "loss": 0.6701, "rewards/accuracies": 1.0, "rewards/chosen": 0.8836755156517029, "rewards/margins": 0.04667770862579346, "rewards/rejected": 0.8369978070259094, "step": 560 }, { "epoch": 0.3, "learning_rate": 9.868978252637548e-08, "logits/chosen": -2.185044288635254, "logits/rejected": -2.1645333766937256, "logps/chosen": -5.834765434265137, "logps/rejected": -7.295151233673096, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 0.9649896621704102, "rewards/margins": 0.3955918550491333, "rewards/rejected": 0.5693978071212769, "step": 561 }, { "epoch": 0.3, "learning_rate": 9.868315262173283e-08, "logits/chosen": -2.1828415393829346, "logits/rejected": -2.1075656414031982, "logps/chosen": -44.99730682373047, "logps/rejected": -0.8704475164413452, "loss": 0.7226, "rewards/accuracies": 0.0, "rewards/chosen": 0.8337737917900085, "rewards/margins": -0.058142125606536865, "rewards/rejected": 0.8919159173965454, "step": 562 }, { "epoch": 0.3, "learning_rate": 9.867650620904954e-08, "logits/chosen": -2.043128728866577, "logits/rejected": -2.239138126373291, "logps/chosen": -1.543228030204773, "logps/rejected": -2.3172049522399902, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.830544650554657, "rewards/margins": -0.01317906379699707, "rewards/rejected": 0.843723714351654, "step": 563 }, { "epoch": 0.3, "learning_rate": 9.866984329057935e-08, "logits/chosen": -2.1386826038360596, "logits/rejected": -2.045461893081665, "logps/chosen": -42.691619873046875, "logps/rejected": -4.426666259765625, "loss": 0.6144, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896743655204773, "rewards/margins": 0.164276123046875, "rewards/rejected": 0.7253982424736023, "step": 564 }, { "epoch": 0.3, "learning_rate": 9.86631638685816e-08, "logits/chosen": -2.099484443664551, "logits/rejected": -2.3000309467315674, "logps/chosen": -0.7754479646682739, "logps/rejected": -0.7081884145736694, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8507646918296814, "rewards/margins": 0.00948953628540039, "rewards/rejected": 0.841275155544281, "step": 565 }, { "epoch": 0.31, "learning_rate": 9.865646794532119e-08, "logits/chosen": -2.0446667671203613, "logits/rejected": -2.042280912399292, "logps/chosen": -4.15308952331543, "logps/rejected": -3.995913028717041, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 1.057215929031372, "rewards/margins": 0.3555362820625305, "rewards/rejected": 0.7016796469688416, "step": 566 }, { "epoch": 0.31, "learning_rate": 9.864975552306869e-08, "logits/chosen": -2.1321816444396973, "logits/rejected": -2.2537600994110107, "logps/chosen": -6.461624622344971, "logps/rejected": -6.435380935668945, "loss": 0.6508, "rewards/accuracies": 1.0, "rewards/chosen": 0.5286911725997925, "rewards/margins": 0.08661839365959167, "rewards/rejected": 0.4420727789402008, "step": 567 }, { "epoch": 0.31, "learning_rate": 9.864302660410023e-08, "logits/chosen": -1.9953522682189941, "logits/rejected": -2.261899948120117, "logps/chosen": -1.7424566745758057, "logps/rejected": -1.8017525672912598, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.8163952231407166, "rewards/margins": 0.026699542999267578, "rewards/rejected": 0.789695680141449, "step": 568 }, { "epoch": 0.31, "learning_rate": 9.863628119069749e-08, "logits/chosen": -2.0618207454681396, "logits/rejected": -2.057098865509033, "logps/chosen": -7.048700332641602, "logps/rejected": -4.060202121734619, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207451701164246, "rewards/margins": 0.3685496747493744, "rewards/rejected": 0.45219549536705017, "step": 569 }, { "epoch": 0.31, "learning_rate": 9.862951928514781e-08, "logits/chosen": -2.180917263031006, "logits/rejected": -2.1226210594177246, "logps/chosen": -33.925201416015625, "logps/rejected": -1.963183879852295, "loss": 0.5108, "rewards/accuracies": 1.0, "rewards/chosen": 1.0353740453720093, "rewards/margins": 0.4056324362754822, "rewards/rejected": 0.6297416090965271, "step": 570 }, { "epoch": 0.31, "learning_rate": 9.86227408897441e-08, "logits/chosen": -2.056528329849243, "logits/rejected": -2.2974462509155273, "logps/chosen": -2.259007453918457, "logps/rejected": -2.4849424362182617, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9556900262832642, "rewards/margins": 0.039400577545166016, "rewards/rejected": 0.9162894487380981, "step": 571 }, { "epoch": 0.31, "learning_rate": 9.861594600678482e-08, "logits/chosen": -1.8844656944274902, "logits/rejected": -2.3037590980529785, "logps/chosen": -1.4022653102874756, "logps/rejected": -1.4417861700057983, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 1.0140111446380615, "rewards/margins": -0.00802755355834961, "rewards/rejected": 1.0220386981964111, "step": 572 }, { "epoch": 0.31, "learning_rate": 9.86091346385741e-08, "logits/chosen": -2.175916910171509, "logits/rejected": -2.273226737976074, "logps/chosen": -8.324664115905762, "logps/rejected": -3.62661075592041, "loss": 0.8038, "rewards/accuracies": 0.0, "rewards/chosen": 0.384131520986557, "rewards/margins": -0.21032145619392395, "rewards/rejected": 0.594452977180481, "step": 573 }, { "epoch": 0.31, "learning_rate": 9.860230678742158e-08, "logits/chosen": -2.092215061187744, "logits/rejected": -2.112562656402588, "logps/chosen": -3.1904819011688232, "logps/rejected": -3.101468563079834, "loss": 0.6221, "rewards/accuracies": 1.0, "rewards/chosen": 0.9408559799194336, "rewards/margins": 0.1476198434829712, "rewards/rejected": 0.7932361364364624, "step": 574 }, { "epoch": 0.31, "learning_rate": 9.859546245564255e-08, "logits/chosen": -2.062349796295166, "logits/rejected": -2.0668349266052246, "logps/chosen": -1.6640970706939697, "logps/rejected": -2.454660177230835, "loss": 0.5571, "rewards/accuracies": 1.0, "rewards/chosen": 0.8491975665092468, "rewards/margins": 0.29346609115600586, "rewards/rejected": 0.555731475353241, "step": 575 }, { "epoch": 0.31, "learning_rate": 9.858860164555787e-08, "logits/chosen": -1.9579962491989136, "logits/rejected": -2.238657236099243, "logps/chosen": -0.6228766441345215, "logps/rejected": -0.6333739757537842, "loss": 0.6684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8704339861869812, "rewards/margins": 0.050086379051208496, "rewards/rejected": 0.8203476071357727, "step": 576 }, { "epoch": 0.31, "learning_rate": 9.858172435949395e-08, "logits/chosen": -2.0536177158355713, "logits/rejected": -2.0652976036071777, "logps/chosen": -7.749349594116211, "logps/rejected": -9.928627014160156, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.3755708932876587, "rewards/margins": 0.7969337701797485, "rewards/rejected": 0.5786371231079102, "step": 577 }, { "epoch": 0.31, "learning_rate": 9.857483059978284e-08, "logits/chosen": -2.1098520755767822, "logits/rejected": -2.1172399520874023, "logps/chosen": -3.0097341537475586, "logps/rejected": -2.435513973236084, "loss": 0.5343, "rewards/accuracies": 1.0, "rewards/chosen": 1.0511082410812378, "rewards/margins": 0.3477194905281067, "rewards/rejected": 0.7033887505531311, "step": 578 }, { "epoch": 0.31, "learning_rate": 9.856792036876217e-08, "logits/chosen": -2.1519274711608887, "logits/rejected": -2.11727237701416, "logps/chosen": -39.04924774169922, "logps/rejected": -14.805909156799316, "loss": 0.7984, "rewards/accuracies": 0.0, "rewards/chosen": 0.5880355834960938, "rewards/margins": -0.20043879747390747, "rewards/rejected": 0.7884743809700012, "step": 579 }, { "epoch": 0.31, "learning_rate": 9.856099366877512e-08, "logits/chosen": -2.0299031734466553, "logits/rejected": -2.239168643951416, "logps/chosen": -0.9064377546310425, "logps/rejected": -0.9369895458221436, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9709624648094177, "rewards/margins": -0.0004679560661315918, "rewards/rejected": 0.9714304208755493, "step": 580 }, { "epoch": 0.31, "learning_rate": 9.855405050217048e-08, "logits/chosen": -2.063140869140625, "logits/rejected": -2.0648298263549805, "logps/chosen": -9.170660018920898, "logps/rejected": -6.951109886169434, "loss": 0.7101, "rewards/accuracies": 0.0, "rewards/chosen": 0.9126821756362915, "rewards/margins": -0.033628761768341064, "rewards/rejected": 0.9463109374046326, "step": 581 }, { "epoch": 0.31, "learning_rate": 9.85470908713026e-08, "logits/chosen": -2.143021821975708, "logits/rejected": -2.1938462257385254, "logps/chosen": -8.164588928222656, "logps/rejected": -9.685243606567383, "loss": 0.5692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0113794803619385, "rewards/margins": 0.2654194235801697, "rewards/rejected": 0.7459600567817688, "step": 582 }, { "epoch": 0.31, "learning_rate": 9.854011477853146e-08, "logits/chosen": -2.038367509841919, "logits/rejected": -2.2365641593933105, "logps/chosen": -1.0902832746505737, "logps/rejected": -1.1406975984573364, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9105855822563171, "rewards/margins": 0.029494822025299072, "rewards/rejected": 0.8810907602310181, "step": 583 }, { "epoch": 0.31, "learning_rate": 9.853312222622257e-08, "logits/chosen": -2.1138954162597656, "logits/rejected": -2.140289306640625, "logps/chosen": -11.007452011108398, "logps/rejected": -17.390853881835938, "loss": 0.5527, "rewards/accuracies": 1.0, "rewards/chosen": 1.0547436475753784, "rewards/margins": 0.3038637638092041, "rewards/rejected": 0.7508798837661743, "step": 584 }, { "epoch": 0.32, "learning_rate": 9.852611321674707e-08, "logits/chosen": -2.0846431255340576, "logits/rejected": -2.093623161315918, "logps/chosen": -3.1591796875, "logps/rejected": -2.319049596786499, "loss": 0.5341, "rewards/accuracies": 1.0, "rewards/chosen": 1.0772768259048462, "rewards/margins": 0.34833651781082153, "rewards/rejected": 0.7289403080940247, "step": 585 }, { "epoch": 0.32, "learning_rate": 9.851908775248163e-08, "logits/chosen": -2.005570411682129, "logits/rejected": -2.268688201904297, "logps/chosen": -5.994953155517578, "logps/rejected": -2.4871292114257812, "loss": 0.7446, "rewards/accuracies": 0.0, "rewards/chosen": 0.8463021516799927, "rewards/margins": -0.10032802820205688, "rewards/rejected": 0.9466301798820496, "step": 586 }, { "epoch": 0.32, "learning_rate": 9.851204583580855e-08, "logits/chosen": -2.0107228755950928, "logits/rejected": -2.249307632446289, "logps/chosen": -1.2481504678726196, "logps/rejected": -1.243876576423645, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8113770484924316, "rewards/margins": 0.007985413074493408, "rewards/rejected": 0.8033916354179382, "step": 587 }, { "epoch": 0.32, "learning_rate": 9.850498746911566e-08, "logits/chosen": -2.143240213394165, "logits/rejected": -2.1124792098999023, "logps/chosen": -27.83160400390625, "logps/rejected": -3.374802350997925, "loss": 0.3965, "rewards/accuracies": 1.0, "rewards/chosen": 1.3096656799316406, "rewards/margins": 0.7202824354171753, "rewards/rejected": 0.5893832445144653, "step": 588 }, { "epoch": 0.32, "learning_rate": 9.84979126547964e-08, "logits/chosen": -2.053999900817871, "logits/rejected": -2.0450925827026367, "logps/chosen": -6.26816463470459, "logps/rejected": -6.009485244750977, "loss": 0.4397, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049268007278442, "rewards/margins": 0.5937654972076416, "rewards/rejected": 0.411161333322525, "step": 589 }, { "epoch": 0.32, "learning_rate": 9.849082139524978e-08, "logits/chosen": -2.1038968563079834, "logits/rejected": -2.0890328884124756, "logps/chosen": -43.4478759765625, "logps/rejected": -26.84164047241211, "loss": 0.6206, "rewards/accuracies": 1.0, "rewards/chosen": 0.929334282875061, "rewards/margins": 0.15086710453033447, "rewards/rejected": 0.7784671783447266, "step": 590 }, { "epoch": 0.32, "learning_rate": 9.84837136928804e-08, "logits/chosen": -2.137378215789795, "logits/rejected": -2.2473061084747314, "logps/chosen": -6.008976459503174, "logps/rejected": -5.6099534034729, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.7363293170928955, "rewards/margins": 0.0024883151054382324, "rewards/rejected": 0.7338410019874573, "step": 591 }, { "epoch": 0.32, "learning_rate": 9.84765895500984e-08, "logits/chosen": -2.1519181728363037, "logits/rejected": -2.2917237281799316, "logps/chosen": -1.9267739057540894, "logps/rejected": -1.9296231269836426, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.7726696133613586, "rewards/margins": -0.004768013954162598, "rewards/rejected": 0.7774376273155212, "step": 592 }, { "epoch": 0.32, "learning_rate": 9.846944896931951e-08, "logits/chosen": -2.0460989475250244, "logits/rejected": -2.221233606338501, "logps/chosen": -0.8559690117835999, "logps/rejected": -0.9301103949546814, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.7559682726860046, "rewards/margins": 0.015656709671020508, "rewards/rejected": 0.7403115630149841, "step": 593 }, { "epoch": 0.32, "learning_rate": 9.846229195296505e-08, "logits/chosen": -2.15228271484375, "logits/rejected": -2.2132205963134766, "logps/chosen": -5.945125579833984, "logps/rejected": -3.442613124847412, "loss": 0.7195, "rewards/accuracies": 0.0, "rewards/chosen": 0.7013745307922363, "rewards/margins": -0.05200475454330444, "rewards/rejected": 0.7533792853355408, "step": 594 }, { "epoch": 0.32, "learning_rate": 9.845511850346193e-08, "logits/chosen": -2.156991481781006, "logits/rejected": -2.1504220962524414, "logps/chosen": -5.572810173034668, "logps/rejected": -5.564005374908447, "loss": 0.6378, "rewards/accuracies": 1.0, "rewards/chosen": 0.6947029232978821, "rewards/margins": 0.11401832103729248, "rewards/rejected": 0.5806846022605896, "step": 595 }, { "epoch": 0.32, "learning_rate": 9.844792862324257e-08, "logits/chosen": -2.1465675830841064, "logits/rejected": -2.1767256259918213, "logps/chosen": -1.307602882385254, "logps/rejected": -9.781024932861328, "loss": 0.627, "rewards/accuracies": 1.0, "rewards/chosen": 0.9446257948875427, "rewards/margins": 0.13692885637283325, "rewards/rejected": 0.8076969385147095, "step": 596 }, { "epoch": 0.32, "learning_rate": 9.8440722314745e-08, "logits/chosen": -2.0184786319732666, "logits/rejected": -2.0115373134613037, "logps/chosen": -11.902130126953125, "logps/rejected": -1.399101734161377, "loss": 0.6486, "rewards/accuracies": 1.0, "rewards/chosen": 0.9383465051651001, "rewards/margins": 0.09119319915771484, "rewards/rejected": 0.8471533060073853, "step": 597 }, { "epoch": 0.32, "learning_rate": 9.843349958041284e-08, "logits/chosen": -2.037834405899048, "logits/rejected": -2.310177803039551, "logps/chosen": -1.138478398323059, "logps/rejected": -3.1343307495117188, "loss": 0.559, "rewards/accuracies": 1.0, "rewards/chosen": 0.8740776181221008, "rewards/margins": 0.2891542315483093, "rewards/rejected": 0.5849233865737915, "step": 598 }, { "epoch": 0.32, "learning_rate": 9.842626042269524e-08, "logits/chosen": -2.000431537628174, "logits/rejected": -1.9976123571395874, "logps/chosen": -10.157770156860352, "logps/rejected": -3.345150947570801, "loss": 0.4254, "rewards/accuracies": 1.0, "rewards/chosen": 1.1544567346572876, "rewards/margins": 0.6346368193626404, "rewards/rejected": 0.5198199152946472, "step": 599 }, { "epoch": 0.32, "learning_rate": 9.841900484404692e-08, "logits/chosen": -2.034519672393799, "logits/rejected": -2.2264621257781982, "logps/chosen": -1.0533232688903809, "logps/rejected": -1.0201038122177124, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.8863205313682556, "rewards/margins": 0.0041887760162353516, "rewards/rejected": 0.8821317553520203, "step": 600 }, { "epoch": 0.32, "learning_rate": 9.841173284692823e-08, "logits/chosen": -1.9916801452636719, "logits/rejected": -2.0022964477539062, "logps/chosen": -2.6332530975341797, "logps/rejected": -10.166035652160645, "loss": 0.6481, "rewards/accuracies": 1.0, "rewards/chosen": 0.8466812372207642, "rewards/margins": 0.09223794937133789, "rewards/rejected": 0.7544432878494263, "step": 601 }, { "epoch": 0.32, "learning_rate": 9.840444443380501e-08, "logits/chosen": -1.9758052825927734, "logits/rejected": -2.217991590499878, "logps/chosen": -4.09018611907959, "logps/rejected": -3.9394450187683105, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8384795188903809, "rewards/margins": 0.00785130262374878, "rewards/rejected": 0.8306282162666321, "step": 602 }, { "epoch": 0.33, "learning_rate": 9.839713960714871e-08, "logits/chosen": -2.070082902908325, "logits/rejected": -2.0740535259246826, "logps/chosen": -3.4542505741119385, "logps/rejected": -3.1350040435791016, "loss": 0.5179, "rewards/accuracies": 1.0, "rewards/chosen": 0.9942521452903748, "rewards/margins": 0.38784801959991455, "rewards/rejected": 0.6064041256904602, "step": 603 }, { "epoch": 0.33, "learning_rate": 9.838981836943632e-08, "logits/chosen": -2.068941354751587, "logits/rejected": -2.316157341003418, "logps/chosen": -1.4307645559310913, "logps/rejected": -1.3438009023666382, "loss": 0.7081, "rewards/accuracies": 0.0, "rewards/chosen": 0.7878388166427612, "rewards/margins": -0.029675602912902832, "rewards/rejected": 0.8175144195556641, "step": 604 }, { "epoch": 0.33, "learning_rate": 9.83824807231504e-08, "logits/chosen": -2.0736255645751953, "logits/rejected": -2.0422911643981934, "logps/chosen": -16.44434356689453, "logps/rejected": -3.770663022994995, "loss": 0.4959, "rewards/accuracies": 1.0, "rewards/chosen": 1.1371452808380127, "rewards/margins": 0.4433005452156067, "rewards/rejected": 0.693844735622406, "step": 605 }, { "epoch": 0.33, "learning_rate": 9.837512667077913e-08, "logits/chosen": -2.1181936264038086, "logits/rejected": -2.0644381046295166, "logps/chosen": -41.105403900146484, "logps/rejected": -9.43458366394043, "loss": 0.5506, "rewards/accuracies": 1.0, "rewards/chosen": 1.0642303228378296, "rewards/margins": 0.3088756203651428, "rewards/rejected": 0.7553547024726868, "step": 606 }, { "epoch": 0.33, "learning_rate": 9.836775621481617e-08, "logits/chosen": -2.1802866458892822, "logits/rejected": -2.1807096004486084, "logps/chosen": -1.4252228736877441, "logps/rejected": -4.258847713470459, "loss": 0.5074, "rewards/accuracies": 1.0, "rewards/chosen": 0.9986290335655212, "rewards/margins": 0.4139975309371948, "rewards/rejected": 0.5846315026283264, "step": 607 }, { "epoch": 0.33, "learning_rate": 9.836036935776078e-08, "logits/chosen": -2.046344041824341, "logits/rejected": -2.258013963699341, "logps/chosen": -2.4161787033081055, "logps/rejected": -2.6118862628936768, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.9963887333869934, "rewards/margins": -0.005318105220794678, "rewards/rejected": 1.001706838607788, "step": 608 }, { "epoch": 0.33, "learning_rate": 9.835296610211778e-08, "logits/chosen": -2.0256080627441406, "logits/rejected": -2.0272281169891357, "logps/chosen": -4.710328578948975, "logps/rejected": -3.0715136528015137, "loss": 0.6017, "rewards/accuracies": 1.0, "rewards/chosen": 0.8983623385429382, "rewards/margins": 0.19209790229797363, "rewards/rejected": 0.7062644362449646, "step": 609 }, { "epoch": 0.33, "learning_rate": 9.834554645039756e-08, "logits/chosen": -2.0206210613250732, "logits/rejected": -2.210709810256958, "logps/chosen": -1.097912311553955, "logps/rejected": -1.1063179969787598, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.8051344156265259, "rewards/margins": 0.005962789058685303, "rewards/rejected": 0.7991716265678406, "step": 610 }, { "epoch": 0.33, "learning_rate": 9.833811040511608e-08, "logits/chosen": -1.9704782962799072, "logits/rejected": -1.9713842868804932, "logps/chosen": -2.666658401489258, "logps/rejected": -1.4500064849853516, "loss": 0.6503, "rewards/accuracies": 1.0, "rewards/chosen": 0.8632544875144958, "rewards/margins": 0.08761531114578247, "rewards/rejected": 0.7756391763687134, "step": 611 }, { "epoch": 0.33, "learning_rate": 9.833065796879479e-08, "logits/chosen": -2.0618410110473633, "logits/rejected": -2.2528250217437744, "logps/chosen": -0.7890889048576355, "logps/rejected": -0.7782936096191406, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8222417235374451, "rewards/margins": 0.029433071613311768, "rewards/rejected": 0.7928086519241333, "step": 612 }, { "epoch": 0.33, "learning_rate": 9.832318914396078e-08, "logits/chosen": -2.0757391452789307, "logits/rejected": -2.088301420211792, "logps/chosen": -15.737165451049805, "logps/rejected": -6.2134881019592285, "loss": 0.4455, "rewards/accuracies": 1.0, "rewards/chosen": 1.3209342956542969, "rewards/margins": 0.577683687210083, "rewards/rejected": 0.7432506084442139, "step": 613 }, { "epoch": 0.33, "learning_rate": 9.831570393314667e-08, "logits/chosen": -2.0453929901123047, "logits/rejected": -2.0500593185424805, "logps/chosen": -2.1479332447052, "logps/rejected": -3.82570481300354, "loss": 0.5167, "rewards/accuracies": 1.0, "rewards/chosen": 0.9566651582717896, "rewards/margins": 0.39094287157058716, "rewards/rejected": 0.5657222867012024, "step": 614 }, { "epoch": 0.33, "learning_rate": 9.830820233889062e-08, "logits/chosen": -2.0433976650238037, "logits/rejected": -2.2278244495391846, "logps/chosen": -1.7500122785568237, "logps/rejected": -1.8068287372589111, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 1.003613829612732, "rewards/margins": 0.0219690203666687, "rewards/rejected": 0.9816448092460632, "step": 615 }, { "epoch": 0.33, "learning_rate": 9.830068436373634e-08, "logits/chosen": -2.0338051319122314, "logits/rejected": -2.0331034660339355, "logps/chosen": -0.8322094678878784, "logps/rejected": -2.843379497528076, "loss": 0.5803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8865619897842407, "rewards/margins": 0.24015367031097412, "rewards/rejected": 0.6464083194732666, "step": 616 }, { "epoch": 0.33, "learning_rate": 9.829315001023313e-08, "logits/chosen": -2.053257703781128, "logits/rejected": -2.2844834327697754, "logps/chosen": -7.167888641357422, "logps/rejected": -2.2134621143341064, "loss": 0.7543, "rewards/accuracies": 0.0, "rewards/chosen": 0.8746647834777832, "rewards/margins": -0.1188696026802063, "rewards/rejected": 0.9935343861579895, "step": 617 }, { "epoch": 0.33, "learning_rate": 9.828559928093583e-08, "logits/chosen": -2.0461974143981934, "logits/rejected": -2.0459439754486084, "logps/chosen": -7.069878101348877, "logps/rejected": -6.514370918273926, "loss": 0.6127, "rewards/accuracies": 1.0, "rewards/chosen": 1.1116052865982056, "rewards/margins": 0.16791915893554688, "rewards/rejected": 0.9436861276626587, "step": 618 }, { "epoch": 0.33, "learning_rate": 9.827803217840482e-08, "logits/chosen": -2.108720064163208, "logits/rejected": -2.1092960834503174, "logps/chosen": -3.9867725372314453, "logps/rejected": -1.748612403869629, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.9246324896812439, "rewards/margins": -0.007668018341064453, "rewards/rejected": 0.9323005080223083, "step": 619 }, { "epoch": 0.33, "learning_rate": 9.827044870520602e-08, "logits/chosen": -2.100231409072876, "logits/rejected": -2.3073878288269043, "logps/chosen": -2.2659363746643066, "logps/rejected": -18.844837188720703, "loss": 0.5288, "rewards/accuracies": 1.0, "rewards/chosen": 1.0921167135238647, "rewards/margins": 0.36107462644577026, "rewards/rejected": 0.7310420870780945, "step": 620 }, { "epoch": 0.33, "learning_rate": 9.826284886391097e-08, "logits/chosen": -2.002682685852051, "logits/rejected": -2.0149130821228027, "logps/chosen": -36.58356475830078, "logps/rejected": -21.384122848510742, "loss": 0.8171, "rewards/accuracies": 0.0, "rewards/chosen": 0.13315926492214203, "rewards/margins": -0.23429031670093536, "rewards/rejected": 0.3674495816230774, "step": 621 }, { "epoch": 0.34, "learning_rate": 9.825523265709666e-08, "logits/chosen": -1.9906010627746582, "logits/rejected": -1.9898935556411743, "logps/chosen": -6.585145950317383, "logps/rejected": -14.273859977722168, "loss": 0.5421, "rewards/accuracies": 1.0, "rewards/chosen": 0.7938596606254578, "rewards/margins": 0.3289250135421753, "rewards/rejected": 0.46493464708328247, "step": 622 }, { "epoch": 0.34, "learning_rate": 9.824760008734572e-08, "logits/chosen": -2.065433979034424, "logits/rejected": -2.2927727699279785, "logps/chosen": -18.603565216064453, "logps/rejected": -14.389350891113281, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": 0.6457592248916626, "rewards/margins": -0.08270794153213501, "rewards/rejected": 0.7284671664237976, "step": 623 }, { "epoch": 0.34, "learning_rate": 9.823995115724625e-08, "logits/chosen": -2.0669643878936768, "logits/rejected": -2.306250810623169, "logps/chosen": -2.2511184215545654, "logps/rejected": -2.0790374279022217, "loss": 0.7028, "rewards/accuracies": 0.0, "rewards/chosen": 0.8086116909980774, "rewards/margins": -0.019252777099609375, "rewards/rejected": 0.8278644680976868, "step": 624 }, { "epoch": 0.34, "learning_rate": 9.823228586939196e-08, "logits/chosen": -1.9229406118392944, "logits/rejected": -2.17331600189209, "logps/chosen": -4.288793563842773, "logps/rejected": -1.379244327545166, "loss": 0.7245, "rewards/accuracies": 0.0, "rewards/chosen": 0.7573302388191223, "rewards/margins": -0.06172072887420654, "rewards/rejected": 0.8190509676933289, "step": 625 }, { "epoch": 0.34, "learning_rate": 9.82246042263821e-08, "logits/chosen": -2.1739389896392822, "logits/rejected": -2.278174638748169, "logps/chosen": -11.04104995727539, "logps/rejected": -32.70515823364258, "loss": 0.4919, "rewards/accuracies": 1.0, "rewards/chosen": 0.8990859985351562, "rewards/margins": 0.4533790647983551, "rewards/rejected": 0.44570693373680115, "step": 626 }, { "epoch": 0.34, "learning_rate": 9.821690623082142e-08, "logits/chosen": -2.075674295425415, "logits/rejected": -2.2727577686309814, "logps/chosen": -5.247954845428467, "logps/rejected": -5.244171142578125, "loss": 0.7546, "rewards/accuracies": 0.0, "rewards/chosen": 0.6014630198478699, "rewards/margins": -0.11937439441680908, "rewards/rejected": 0.720837414264679, "step": 627 }, { "epoch": 0.34, "learning_rate": 9.820919188532025e-08, "logits/chosen": -2.01350474357605, "logits/rejected": -2.014395236968994, "logps/chosen": -0.8536143898963928, "logps/rejected": -2.6141517162323, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.9797168970108032, "rewards/margins": 0.24833279848098755, "rewards/rejected": 0.7313840985298157, "step": 628 }, { "epoch": 0.34, "learning_rate": 9.820146119249447e-08, "logits/chosen": -2.0471761226654053, "logits/rejected": -2.04372239112854, "logps/chosen": -0.9168750643730164, "logps/rejected": -2.4862217903137207, "loss": 0.5716, "rewards/accuracies": 1.0, "rewards/chosen": 0.9540721774101257, "rewards/margins": 0.2598680853843689, "rewards/rejected": 0.6942040920257568, "step": 629 }, { "epoch": 0.34, "learning_rate": 9.819371415496549e-08, "logits/chosen": -2.044363260269165, "logits/rejected": -2.0463640689849854, "logps/chosen": -6.604145526885986, "logps/rejected": -1.0545153617858887, "loss": 0.6002, "rewards/accuracies": 1.0, "rewards/chosen": 1.0858840942382812, "rewards/margins": 0.19539600610733032, "rewards/rejected": 0.8904880881309509, "step": 630 }, { "epoch": 0.34, "learning_rate": 9.818595077536024e-08, "logits/chosen": -2.164443254470825, "logits/rejected": -2.2535383701324463, "logps/chosen": -6.030287742614746, "logps/rejected": -1.6226181983947754, "loss": 0.7657, "rewards/accuracies": 0.0, "rewards/chosen": 0.7348058819770813, "rewards/margins": -0.14028054475784302, "rewards/rejected": 0.8750864267349243, "step": 631 }, { "epoch": 0.34, "learning_rate": 9.817817105631126e-08, "logits/chosen": -2.1712138652801514, "logits/rejected": -2.212660074234009, "logps/chosen": -1.1881457567214966, "logps/rejected": -1.280080795288086, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7912241816520691, "rewards/margins": 0.01471620798110962, "rewards/rejected": 0.7765079736709595, "step": 632 }, { "epoch": 0.34, "learning_rate": 9.817037500045655e-08, "logits/chosen": -2.060683250427246, "logits/rejected": -2.2267889976501465, "logps/chosen": -3.200911045074463, "logps/rejected": -6.894357204437256, "loss": 0.626, "rewards/accuracies": 1.0, "rewards/chosen": 0.5682668089866638, "rewards/margins": 0.13917586207389832, "rewards/rejected": 0.4290909469127655, "step": 633 }, { "epoch": 0.34, "learning_rate": 9.816256261043971e-08, "logits/chosen": -2.1382193565368652, "logits/rejected": -2.2991065979003906, "logps/chosen": -10.490066528320312, "logps/rejected": -10.117862701416016, "loss": 0.7063, "rewards/accuracies": 0.0, "rewards/chosen": 0.7863384485244751, "rewards/margins": -0.026130080223083496, "rewards/rejected": 0.8124685287475586, "step": 634 }, { "epoch": 0.34, "learning_rate": 9.815473388890982e-08, "logits/chosen": -2.1378109455108643, "logits/rejected": -2.2512664794921875, "logps/chosen": -1.5908373594284058, "logps/rejected": -4.567632675170898, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9066171050071716, "rewards/margins": 0.03270590305328369, "rewards/rejected": 0.8739112019538879, "step": 635 }, { "epoch": 0.34, "learning_rate": 9.814688883852158e-08, "logits/chosen": -2.1154730319976807, "logits/rejected": -2.3085763454437256, "logps/chosen": -6.800375938415527, "logps/rejected": -6.577117919921875, "loss": 0.6428, "rewards/accuracies": 1.0, "rewards/chosen": 0.8922192454338074, "rewards/margins": 0.10341739654541016, "rewards/rejected": 0.7888018488883972, "step": 636 }, { "epoch": 0.34, "learning_rate": 9.813902746193514e-08, "logits/chosen": -2.1382973194122314, "logits/rejected": -2.230512857437134, "logps/chosen": -2.8276960849761963, "logps/rejected": -3.1389763355255127, "loss": 0.6719, "rewards/accuracies": 1.0, "rewards/chosen": 0.6633926630020142, "rewards/margins": 0.04292196035385132, "rewards/rejected": 0.6204707026481628, "step": 637 }, { "epoch": 0.34, "learning_rate": 9.813114976181624e-08, "logits/chosen": -2.088855028152466, "logits/rejected": -1.9826840162277222, "logps/chosen": -23.45503044128418, "logps/rejected": -11.623796463012695, "loss": 0.6449, "rewards/accuracies": 1.0, "rewards/chosen": 0.8973730206489563, "rewards/margins": 0.09887850284576416, "rewards/rejected": 0.7984945178031921, "step": 638 }, { "epoch": 0.34, "learning_rate": 9.812325574083614e-08, "logits/chosen": -2.1035234928131104, "logits/rejected": -1.924946904182434, "logps/chosen": -48.19010925292969, "logps/rejected": -2.166794776916504, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 1.1146682500839233, "rewards/margins": 0.4182148575782776, "rewards/rejected": 0.6964533925056458, "step": 639 }, { "epoch": 0.35, "learning_rate": 9.811534540167164e-08, "logits/chosen": -2.032158374786377, "logits/rejected": -2.038726806640625, "logps/chosen": -12.261818885803223, "logps/rejected": -6.41753625869751, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.275168538093567, "rewards/margins": 0.47971636056900024, "rewards/rejected": 0.7954521775245667, "step": 640 }, { "epoch": 0.35, "learning_rate": 9.810741874700505e-08, "logits/chosen": -2.0571987628936768, "logits/rejected": -2.0616872310638428, "logps/chosen": -4.345333099365234, "logps/rejected": -0.5523240566253662, "loss": 0.5742, "rewards/accuracies": 1.0, "rewards/chosen": 1.1244187355041504, "rewards/margins": 0.253936767578125, "rewards/rejected": 0.8704819679260254, "step": 641 }, { "epoch": 0.35, "learning_rate": 9.809947577952426e-08, "logits/chosen": -2.0443594455718994, "logits/rejected": -2.0511322021484375, "logps/chosen": -2.245450496673584, "logps/rejected": -3.9962949752807617, "loss": 0.4794, "rewards/accuracies": 1.0, "rewards/chosen": 1.1572731733322144, "rewards/margins": 0.4860686659812927, "rewards/rejected": 0.6712045073509216, "step": 642 }, { "epoch": 0.35, "learning_rate": 9.809151650192264e-08, "logits/chosen": -1.9494003057479858, "logits/rejected": -2.1981394290924072, "logps/chosen": -1.042861819267273, "logps/rejected": -1.060572862625122, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.8319509625434875, "rewards/margins": 0.01613438129425049, "rewards/rejected": 0.8158165812492371, "step": 643 }, { "epoch": 0.35, "learning_rate": 9.808354091689912e-08, "logits/chosen": -2.090069532394409, "logits/rejected": -2.0591862201690674, "logps/chosen": -6.9298882484436035, "logps/rejected": -4.5504279136657715, "loss": 0.4806, "rewards/accuracies": 1.0, "rewards/chosen": 1.0410032272338867, "rewards/margins": 0.4826977849006653, "rewards/rejected": 0.5583054423332214, "step": 644 }, { "epoch": 0.35, "learning_rate": 9.807554902715816e-08, "logits/chosen": -2.0163795948028564, "logits/rejected": -2.2170095443725586, "logps/chosen": -1.5250012874603271, "logps/rejected": -1.612200379371643, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.6851348876953125, "rewards/margins": 0.034323811531066895, "rewards/rejected": 0.6508110761642456, "step": 645 }, { "epoch": 0.35, "learning_rate": 9.806754083540972e-08, "logits/chosen": -2.2211761474609375, "logits/rejected": -2.2215805053710938, "logps/chosen": -1.9311689138412476, "logps/rejected": -1.578009009361267, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": 0.9771512150764465, "rewards/margins": 0.31640011072158813, "rewards/rejected": 0.6607511043548584, "step": 646 }, { "epoch": 0.35, "learning_rate": 9.805951634436933e-08, "logits/chosen": -2.109344482421875, "logits/rejected": -2.105233907699585, "logps/chosen": -2.4806020259857178, "logps/rejected": -10.051456451416016, "loss": 0.5697, "rewards/accuracies": 1.0, "rewards/chosen": 1.0496217012405396, "rewards/margins": 0.2643427848815918, "rewards/rejected": 0.7852789163589478, "step": 647 }, { "epoch": 0.35, "learning_rate": 9.805147555675804e-08, "logits/chosen": -2.0194263458251953, "logits/rejected": -2.019956588745117, "logps/chosen": -6.450214862823486, "logps/rejected": -2.5025200843811035, "loss": 0.3818, "rewards/accuracies": 1.0, "rewards/chosen": 1.3398336172103882, "rewards/margins": 0.765770435333252, "rewards/rejected": 0.5740631818771362, "step": 648 }, { "epoch": 0.35, "learning_rate": 9.804341847530235e-08, "logits/chosen": -2.117006301879883, "logits/rejected": -2.1126325130462646, "logps/chosen": -3.1680071353912354, "logps/rejected": -4.298977851867676, "loss": 0.8166, "rewards/accuracies": 0.0, "rewards/chosen": 0.6973814368247986, "rewards/margins": -0.2333880066871643, "rewards/rejected": 0.9307694435119629, "step": 649 }, { "epoch": 0.35, "learning_rate": 9.803534510273443e-08, "logits/chosen": -2.043844699859619, "logits/rejected": -2.042689561843872, "logps/chosen": -1.798875331878662, "logps/rejected": -1.6100201606750488, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.9109617471694946, "rewards/margins": 0.06754171848297119, "rewards/rejected": 0.8434200286865234, "step": 650 }, { "epoch": 0.35, "learning_rate": 9.802725544179183e-08, "logits/chosen": -2.0462758541107178, "logits/rejected": -2.0475833415985107, "logps/chosen": -1.1323332786560059, "logps/rejected": -3.9681501388549805, "loss": 0.5765, "rewards/accuracies": 1.0, "rewards/chosen": 0.9043659567832947, "rewards/margins": 0.2487892508506775, "rewards/rejected": 0.6555767059326172, "step": 651 }, { "epoch": 0.35, "learning_rate": 9.80191494952177e-08, "logits/chosen": -1.9701411724090576, "logits/rejected": -1.976447343826294, "logps/chosen": -3.3546924591064453, "logps/rejected": -4.080923080444336, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 1.0521608591079712, "rewards/margins": 0.5720551013946533, "rewards/rejected": 0.48010578751564026, "step": 652 }, { "epoch": 0.35, "learning_rate": 9.801102726576071e-08, "logits/chosen": -2.124227285385132, "logits/rejected": -2.1117448806762695, "logps/chosen": -10.267963409423828, "logps/rejected": -3.1359944343566895, "loss": 0.5764, "rewards/accuracies": 1.0, "rewards/chosen": 1.017737627029419, "rewards/margins": 0.24888521432876587, "rewards/rejected": 0.7688524127006531, "step": 653 }, { "epoch": 0.35, "learning_rate": 9.800288875617504e-08, "logits/chosen": -2.0952022075653076, "logits/rejected": -2.100102186203003, "logps/chosen": -3.909646511077881, "logps/rejected": -6.674083709716797, "loss": 0.5262, "rewards/accuracies": 1.0, "rewards/chosen": 0.9718246459960938, "rewards/margins": 0.3674251437187195, "rewards/rejected": 0.6043995022773743, "step": 654 }, { "epoch": 0.35, "learning_rate": 9.799473396922038e-08, "logits/chosen": -2.1477653980255127, "logits/rejected": -2.221803903579712, "logps/chosen": -25.092342376708984, "logps/rejected": -2.1746320724487305, "loss": 0.993, "rewards/accuracies": 0.0, "rewards/chosen": 0.47002869844436646, "rewards/margins": -0.5301640629768372, "rewards/rejected": 1.0001927614212036, "step": 655 }, { "epoch": 0.35, "learning_rate": 9.798656290766194e-08, "logits/chosen": -2.0597710609436035, "logits/rejected": -2.2442147731781006, "logps/chosen": -1.4629626274108887, "logps/rejected": -3.6943321228027344, "loss": 0.6277, "rewards/accuracies": 1.0, "rewards/chosen": 0.8014282584190369, "rewards/margins": 0.13553744554519653, "rewards/rejected": 0.6658908128738403, "step": 656 }, { "epoch": 0.35, "learning_rate": 9.797837557427046e-08, "logits/chosen": -2.1033501625061035, "logits/rejected": -2.292123794555664, "logps/chosen": -4.125957489013672, "logps/rejected": -2.3747267723083496, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0319266319274902, "rewards/margins": 0.011199593544006348, "rewards/rejected": 1.0207270383834839, "step": 657 }, { "epoch": 0.35, "learning_rate": 9.797017197182222e-08, "logits/chosen": -1.9854804277420044, "logits/rejected": -2.000863790512085, "logps/chosen": -2.3039040565490723, "logps/rejected": -7.563756942749023, "loss": 0.5298, "rewards/accuracies": 1.0, "rewards/chosen": 1.0797046422958374, "rewards/margins": 0.3586963415145874, "rewards/rejected": 0.72100830078125, "step": 658 }, { "epoch": 0.36, "learning_rate": 9.796195210309896e-08, "logits/chosen": -1.9917426109313965, "logits/rejected": -1.997715711593628, "logps/chosen": -2.9970920085906982, "logps/rejected": -2.3333089351654053, "loss": 0.5448, "rewards/accuracies": 1.0, "rewards/chosen": 1.1022242307662964, "rewards/margins": 0.32260388135910034, "rewards/rejected": 0.779620349407196, "step": 659 }, { "epoch": 0.36, "learning_rate": 9.795371597088797e-08, "logits/chosen": -1.9419996738433838, "logits/rejected": -2.2475790977478027, "logps/chosen": -0.5796206593513489, "logps/rejected": -0.6190966367721558, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9070425033569336, "rewards/margins": 0.011507570743560791, "rewards/rejected": 0.8955349326133728, "step": 660 }, { "epoch": 0.36, "learning_rate": 9.794546357798208e-08, "logits/chosen": -2.12776517868042, "logits/rejected": -2.10795259475708, "logps/chosen": -13.58186149597168, "logps/rejected": -3.7291486263275146, "loss": 0.5374, "rewards/accuracies": 1.0, "rewards/chosen": 0.9198059439659119, "rewards/margins": 0.3402438163757324, "rewards/rejected": 0.5795621275901794, "step": 661 }, { "epoch": 0.36, "learning_rate": 9.793719492717958e-08, "logits/chosen": -1.9997732639312744, "logits/rejected": -1.9948146343231201, "logps/chosen": -6.902684211730957, "logps/rejected": -1.9207301139831543, "loss": 0.5586, "rewards/accuracies": 1.0, "rewards/chosen": 1.1117209196090698, "rewards/margins": 0.29007667303085327, "rewards/rejected": 0.8216442465782166, "step": 662 }, { "epoch": 0.36, "learning_rate": 9.79289100212843e-08, "logits/chosen": -2.0352044105529785, "logits/rejected": -2.25895619392395, "logps/chosen": -2.018719434738159, "logps/rejected": -2.6728453636169434, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.8131896257400513, "rewards/margins": -0.007100939750671387, "rewards/rejected": 0.8202905654907227, "step": 663 }, { "epoch": 0.36, "learning_rate": 9.79206088631056e-08, "logits/chosen": -2.1564128398895264, "logits/rejected": -2.0959701538085938, "logps/chosen": -23.425294876098633, "logps/rejected": -9.080854415893555, "loss": 0.3534, "rewards/accuracies": 1.0, "rewards/chosen": 1.3552488088607788, "rewards/margins": 0.8581071496009827, "rewards/rejected": 0.49714165925979614, "step": 664 }, { "epoch": 0.36, "learning_rate": 9.79122914554583e-08, "logits/chosen": -2.1245877742767334, "logits/rejected": -2.1247174739837646, "logps/chosen": -2.6312642097473145, "logps/rejected": -1.776637315750122, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9077310562133789, "rewards/margins": 0.03206014633178711, "rewards/rejected": 0.8756709098815918, "step": 665 }, { "epoch": 0.36, "learning_rate": 9.79039578011628e-08, "logits/chosen": -1.9449083805084229, "logits/rejected": -1.955189824104309, "logps/chosen": -2.8462882041931152, "logps/rejected": -3.443753957748413, "loss": 0.5823, "rewards/accuracies": 1.0, "rewards/chosen": 0.7474318146705627, "rewards/margins": 0.23545986413955688, "rewards/rejected": 0.5119719505310059, "step": 666 }, { "epoch": 0.36, "learning_rate": 9.789560790304494e-08, "logits/chosen": -2.1347873210906982, "logits/rejected": -2.2536065578460693, "logps/chosen": -2.9492006301879883, "logps/rejected": -1.1664904356002808, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.8880295753479004, "rewards/margins": 0.04632425308227539, "rewards/rejected": 0.841705322265625, "step": 667 }, { "epoch": 0.36, "learning_rate": 9.788724176393611e-08, "logits/chosen": -2.1894872188568115, "logits/rejected": -2.169477939605713, "logps/chosen": -21.7396240234375, "logps/rejected": -2.086932420730591, "loss": 0.5743, "rewards/accuracies": 1.0, "rewards/chosen": 0.8859524130821228, "rewards/margins": 0.2538442015647888, "rewards/rejected": 0.632108211517334, "step": 668 }, { "epoch": 0.36, "learning_rate": 9.787885938667319e-08, "logits/chosen": -2.111989974975586, "logits/rejected": -2.1256868839263916, "logps/chosen": -4.522233009338379, "logps/rejected": -3.521969795227051, "loss": 0.4981, "rewards/accuracies": 1.0, "rewards/chosen": 1.1236903667449951, "rewards/margins": 0.4375351071357727, "rewards/rejected": 0.6861552596092224, "step": 669 }, { "epoch": 0.36, "learning_rate": 9.78704607740986e-08, "logits/chosen": -1.949800968170166, "logits/rejected": -2.2318785190582275, "logps/chosen": -0.6072626709938049, "logps/rejected": -0.6673832535743713, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9049360156059265, "rewards/margins": 0.024137377738952637, "rewards/rejected": 0.8807986378669739, "step": 670 }, { "epoch": 0.36, "learning_rate": 9.78620459290602e-08, "logits/chosen": -2.1382927894592285, "logits/rejected": -2.1405985355377197, "logps/chosen": -0.9625955820083618, "logps/rejected": -3.008516550064087, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.7814247012138367, "rewards/margins": 0.24814963340759277, "rewards/rejected": 0.5332750678062439, "step": 671 }, { "epoch": 0.36, "learning_rate": 9.785361485441141e-08, "logits/chosen": -2.036606788635254, "logits/rejected": -2.236283540725708, "logps/chosen": -3.642033815383911, "logps/rejected": -3.5635509490966797, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.45054197311401367, "rewards/margins": 0.01532086730003357, "rewards/rejected": 0.4352211058139801, "step": 672 }, { "epoch": 0.36, "learning_rate": 9.784516755301113e-08, "logits/chosen": -1.9490429162979126, "logits/rejected": -1.9446203708648682, "logps/chosen": -7.4248270988464355, "logps/rejected": -4.112933158874512, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 1.336949348449707, "rewards/margins": 0.7096961736679077, "rewards/rejected": 0.6272531747817993, "step": 673 }, { "epoch": 0.36, "learning_rate": 9.783670402772379e-08, "logits/chosen": -2.193401575088501, "logits/rejected": -2.2610931396484375, "logps/chosen": -1.657220482826233, "logps/rejected": -2.3933091163635254, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550625681877136, "rewards/margins": 0.03240591287612915, "rewards/rejected": 0.9226566553115845, "step": 674 }, { "epoch": 0.36, "learning_rate": 9.782822428141926e-08, "logits/chosen": -1.9544342756271362, "logits/rejected": -2.2662065029144287, "logps/chosen": -2.0422523021698, "logps/rejected": -2.14380145072937, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 0.9192859530448914, "rewards/margins": -0.03191876411437988, "rewards/rejected": 0.9512047171592712, "step": 675 }, { "epoch": 0.36, "learning_rate": 9.781972831697297e-08, "logits/chosen": -2.1284520626068115, "logits/rejected": -2.252629041671753, "logps/chosen": -4.122282028198242, "logps/rejected": -0.8107329607009888, "loss": 0.7106, "rewards/accuracies": 0.0, "rewards/chosen": 0.9123736619949341, "rewards/margins": -0.0345805287361145, "rewards/rejected": 0.9469541907310486, "step": 676 }, { "epoch": 0.37, "learning_rate": 9.781121613726584e-08, "logits/chosen": -1.9748364686965942, "logits/rejected": -1.9839603900909424, "logps/chosen": -2.376058578491211, "logps/rejected": -3.45393705368042, "loss": 0.4805, "rewards/accuracies": 1.0, "rewards/chosen": 1.074894905090332, "rewards/margins": 0.48312389850616455, "rewards/rejected": 0.5917710065841675, "step": 677 }, { "epoch": 0.37, "learning_rate": 9.780268774518426e-08, "logits/chosen": -2.070728063583374, "logits/rejected": -1.9880465269088745, "logps/chosen": -11.44515609741211, "logps/rejected": -4.073761463165283, "loss": 0.6091, "rewards/accuracies": 1.0, "rewards/chosen": 1.045008659362793, "rewards/margins": 0.17587357759475708, "rewards/rejected": 0.8691350817680359, "step": 678 }, { "epoch": 0.37, "learning_rate": 9.779414314362014e-08, "logits/chosen": -2.154123544692993, "logits/rejected": -2.154508113861084, "logps/chosen": -3.092927932739258, "logps/rejected": -8.646305084228516, "loss": 0.5987, "rewards/accuracies": 1.0, "rewards/chosen": 1.0315828323364258, "rewards/margins": 0.1987069845199585, "rewards/rejected": 0.8328758478164673, "step": 679 }, { "epoch": 0.37, "learning_rate": 9.778558233547088e-08, "logits/chosen": -1.999205231666565, "logits/rejected": -2.2262492179870605, "logps/chosen": -2.5157852172851562, "logps/rejected": -2.5248160362243652, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.5954522490501404, "rewards/margins": 0.0036118030548095703, "rewards/rejected": 0.5918404459953308, "step": 680 }, { "epoch": 0.37, "learning_rate": 9.77770053236394e-08, "logits/chosen": -2.098188877105713, "logits/rejected": -2.1213154792785645, "logps/chosen": -11.568706512451172, "logps/rejected": -3.1849188804626465, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 1.1974626779556274, "rewards/margins": 0.4349519610404968, "rewards/rejected": 0.7625107169151306, "step": 681 }, { "epoch": 0.37, "learning_rate": 9.776841211103403e-08, "logits/chosen": -2.1059365272521973, "logits/rejected": -2.276017904281616, "logps/chosen": -4.688159465789795, "logps/rejected": -1.164510726928711, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 0.8444631695747375, "rewards/margins": -0.04137146472930908, "rewards/rejected": 0.8858346343040466, "step": 682 }, { "epoch": 0.37, "learning_rate": 9.775980270056874e-08, "logits/chosen": -2.133366823196411, "logits/rejected": -2.1335320472717285, "logps/chosen": -0.6821565628051758, "logps/rejected": -2.520664930343628, "loss": 0.532, "rewards/accuracies": 1.0, "rewards/chosen": 0.9771164059638977, "rewards/margins": 0.35337650775909424, "rewards/rejected": 0.6237398982048035, "step": 683 }, { "epoch": 0.37, "learning_rate": 9.775117709516283e-08, "logits/chosen": -2.107257843017578, "logits/rejected": -2.208616256713867, "logps/chosen": -1.3395336866378784, "logps/rejected": -1.476566195487976, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.8675194978713989, "rewards/margins": 0.017014026641845703, "rewards/rejected": 0.8505054712295532, "step": 684 }, { "epoch": 0.37, "learning_rate": 9.774253529774121e-08, "logits/chosen": -2.115321159362793, "logits/rejected": -2.3032498359680176, "logps/chosen": -2.140524387359619, "logps/rejected": -2.09507417678833, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.5971192717552185, "rewards/margins": 0.022731244564056396, "rewards/rejected": 0.5743880271911621, "step": 685 }, { "epoch": 0.37, "learning_rate": 9.773387731123423e-08, "logits/chosen": -2.155841112136841, "logits/rejected": -2.1561105251312256, "logps/chosen": -1.174064040184021, "logps/rejected": -2.0933678150177, "loss": 0.532, "rewards/accuracies": 1.0, "rewards/chosen": 1.040418028831482, "rewards/margins": 0.3533176779747009, "rewards/rejected": 0.687100350856781, "step": 686 }, { "epoch": 0.37, "learning_rate": 9.772520313857775e-08, "logits/chosen": -2.1228339672088623, "logits/rejected": -2.1183958053588867, "logps/chosen": -12.405905723571777, "logps/rejected": -2.480313301086426, "loss": 0.5352, "rewards/accuracies": 1.0, "rewards/chosen": 1.0366085767745972, "rewards/margins": 0.3456422686576843, "rewards/rejected": 0.6909663081169128, "step": 687 }, { "epoch": 0.37, "learning_rate": 9.771651278271311e-08, "logits/chosen": -2.052828073501587, "logits/rejected": -2.2783401012420654, "logps/chosen": -1.136695384979248, "logps/rejected": -1.1459535360336304, "loss": 0.6691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8206545114517212, "rewards/margins": 0.04859870672225952, "rewards/rejected": 0.7720558047294617, "step": 688 }, { "epoch": 0.37, "learning_rate": 9.77078062465871e-08, "logits/chosen": -2.066328287124634, "logits/rejected": -2.2817230224609375, "logps/chosen": -1.2051409482955933, "logps/rejected": -1.1038872003555298, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9092304110527039, "rewards/margins": 0.016871631145477295, "rewards/rejected": 0.8923587799072266, "step": 689 }, { "epoch": 0.37, "learning_rate": 9.769908353315205e-08, "logits/chosen": -2.0504837036132812, "logits/rejected": -2.053467035293579, "logps/chosen": -4.024552822113037, "logps/rejected": -1.1777026653289795, "loss": 0.56, "rewards/accuracies": 1.0, "rewards/chosen": 1.1915873289108276, "rewards/margins": 0.2868749499320984, "rewards/rejected": 0.9047123789787292, "step": 690 }, { "epoch": 0.37, "learning_rate": 9.769034464536578e-08, "logits/chosen": -2.1318917274475098, "logits/rejected": -2.3255653381347656, "logps/chosen": -1.7100732326507568, "logps/rejected": -6.157217979431152, "loss": 0.643, "rewards/accuracies": 1.0, "rewards/chosen": 0.8422215580940247, "rewards/margins": 0.10289633274078369, "rewards/rejected": 0.739325225353241, "step": 691 }, { "epoch": 0.37, "learning_rate": 9.768158958619155e-08, "logits/chosen": -1.9871143102645874, "logits/rejected": -1.9872678518295288, "logps/chosen": -5.475683689117432, "logps/rejected": -2.457257032394409, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 1.2686903476715088, "rewards/margins": 0.5288355350494385, "rewards/rejected": 0.7398548126220703, "step": 692 }, { "epoch": 0.37, "learning_rate": 9.767281835859813e-08, "logits/chosen": -2.0564444065093994, "logits/rejected": -2.239880084991455, "logps/chosen": -5.843206405639648, "logps/rejected": -1.7689517736434937, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.6650484204292297, "rewards/margins": 0.005057096481323242, "rewards/rejected": 0.6599913239479065, "step": 693 }, { "epoch": 0.37, "learning_rate": 9.766403096555977e-08, "logits/chosen": -2.0243263244628906, "logits/rejected": -2.0318570137023926, "logps/chosen": -4.545343399047852, "logps/rejected": -3.3904573917388916, "loss": 0.5229, "rewards/accuracies": 1.0, "rewards/chosen": 0.9289800524711609, "rewards/margins": 0.3754913806915283, "rewards/rejected": 0.5534886717796326, "step": 694 }, { "epoch": 0.37, "learning_rate": 9.76552274100562e-08, "logits/chosen": -2.1464366912841797, "logits/rejected": -2.1438465118408203, "logps/chosen": -6.270244598388672, "logps/rejected": -4.609103679656982, "loss": 0.3645, "rewards/accuracies": 1.0, "rewards/chosen": 1.2653850317001343, "rewards/margins": 0.8215733766555786, "rewards/rejected": 0.44381165504455566, "step": 695 }, { "epoch": 0.38, "learning_rate": 9.764640769507264e-08, "logits/chosen": -2.0366158485412598, "logits/rejected": -2.0396580696105957, "logps/chosen": -2.21189022064209, "logps/rejected": -0.9617481231689453, "loss": 0.6394, "rewards/accuracies": 1.0, "rewards/chosen": 0.9024203419685364, "rewards/margins": 0.11062842607498169, "rewards/rejected": 0.7917919158935547, "step": 696 }, { "epoch": 0.38, "learning_rate": 9.763757182359975e-08, "logits/chosen": -2.0639798641204834, "logits/rejected": -2.2271931171417236, "logps/chosen": -1.3997069597244263, "logps/rejected": -1.317482829093933, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.6976818442344666, "rewards/margins": 0.00022995471954345703, "rewards/rejected": 0.6974518895149231, "step": 697 }, { "epoch": 0.38, "learning_rate": 9.762871979863372e-08, "logits/chosen": -2.191316843032837, "logits/rejected": -2.082118511199951, "logps/chosen": -29.177873611450195, "logps/rejected": -11.861299514770508, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 1.0228246450424194, "rewards/margins": 0.5566617250442505, "rewards/rejected": 0.46616289019584656, "step": 698 }, { "epoch": 0.38, "learning_rate": 9.76198516231762e-08, "logits/chosen": -2.0197408199310303, "logits/rejected": -2.2533974647521973, "logps/chosen": -0.7807541489601135, "logps/rejected": -0.7613993287086487, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9357610940933228, "rewards/margins": 0.018686771392822266, "rewards/rejected": 0.9170743227005005, "step": 699 }, { "epoch": 0.38, "learning_rate": 9.761096730023431e-08, "logits/chosen": -1.9795068502426147, "logits/rejected": -2.2447140216827393, "logps/chosen": -3.488218307495117, "logps/rejected": -0.8003056049346924, "loss": 0.7318, "rewards/accuracies": 0.0, "rewards/chosen": 0.855840802192688, "rewards/margins": -0.0757746696472168, "rewards/rejected": 0.9316154718399048, "step": 700 }, { "epoch": 0.38, "learning_rate": 9.760206683282063e-08, "logits/chosen": -1.9894081354141235, "logits/rejected": -2.228856325149536, "logps/chosen": -1.5555944442749023, "logps/rejected": -1.5138545036315918, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8034495711326599, "rewards/margins": 0.010950565338134766, "rewards/rejected": 0.7924990057945251, "step": 701 }, { "epoch": 0.38, "learning_rate": 9.759315022395324e-08, "logits/chosen": -2.1132657527923584, "logits/rejected": -2.1048314571380615, "logps/chosen": -14.274261474609375, "logps/rejected": -2.094398021697998, "loss": 0.4481, "rewards/accuracies": 1.0, "rewards/chosen": 1.2192386388778687, "rewards/margins": 0.5702215433120728, "rewards/rejected": 0.6490170955657959, "step": 702 }, { "epoch": 0.38, "learning_rate": 9.758421747665572e-08, "logits/chosen": -2.1727454662323, "logits/rejected": -2.179100513458252, "logps/chosen": -6.713874340057373, "logps/rejected": -2.291755437850952, "loss": 0.7363, "rewards/accuracies": 0.0, "rewards/chosen": 0.5062380433082581, "rewards/margins": -0.08446955680847168, "rewards/rejected": 0.5907076001167297, "step": 703 }, { "epoch": 0.38, "learning_rate": 9.757526859395704e-08, "logits/chosen": -2.0094025135040283, "logits/rejected": -1.9789674282073975, "logps/chosen": -8.84089469909668, "logps/rejected": -3.106464147567749, "loss": 0.446, "rewards/accuracies": 1.0, "rewards/chosen": 1.1175463199615479, "rewards/margins": 0.5760477185249329, "rewards/rejected": 0.541498601436615, "step": 704 }, { "epoch": 0.38, "learning_rate": 9.756630357889173e-08, "logits/chosen": -2.0834832191467285, "logits/rejected": -2.234128952026367, "logps/chosen": -19.283193588256836, "logps/rejected": -6.4264044761657715, "loss": 1.0616, "rewards/accuracies": 0.0, "rewards/chosen": 0.15974770486354828, "rewards/margins": -0.6370294690132141, "rewards/rejected": 0.7967771887779236, "step": 705 }, { "epoch": 0.38, "learning_rate": 9.75573224344997e-08, "logits/chosen": -1.9898067712783813, "logits/rejected": -2.238863945007324, "logps/chosen": -2.9536476135253906, "logps/rejected": -2.816977024078369, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.5155259370803833, "rewards/margins": 0.009379088878631592, "rewards/rejected": 0.5061468482017517, "step": 706 }, { "epoch": 0.38, "learning_rate": 9.754832516382642e-08, "logits/chosen": -1.9560705423355103, "logits/rejected": -2.2407748699188232, "logps/chosen": -1.1242384910583496, "logps/rejected": -1.1658978462219238, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.7327277064323425, "rewards/margins": 0.017879486083984375, "rewards/rejected": 0.7148482203483582, "step": 707 }, { "epoch": 0.38, "learning_rate": 9.75393117699228e-08, "logits/chosen": -2.190382957458496, "logits/rejected": -2.1590628623962402, "logps/chosen": -16.087223052978516, "logps/rejected": -2.8332061767578125, "loss": 0.6089, "rewards/accuracies": 1.0, "rewards/chosen": 0.9342842102050781, "rewards/margins": 0.17621475458145142, "rewards/rejected": 0.7580694556236267, "step": 708 }, { "epoch": 0.38, "learning_rate": 9.753028225584515e-08, "logits/chosen": -2.1173856258392334, "logits/rejected": -2.2684006690979004, "logps/chosen": -2.9243061542510986, "logps/rejected": -2.85524845123291, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.6503620147705078, "rewards/margins": 0.0248640775680542, "rewards/rejected": 0.6254979372024536, "step": 709 }, { "epoch": 0.38, "learning_rate": 9.752123662465535e-08, "logits/chosen": -2.004514455795288, "logits/rejected": -1.9973030090332031, "logps/chosen": -4.013054370880127, "logps/rejected": -2.7884464263916016, "loss": 0.5945, "rewards/accuracies": 1.0, "rewards/chosen": 0.9682895541191101, "rewards/margins": 0.2082078456878662, "rewards/rejected": 0.7600817084312439, "step": 710 }, { "epoch": 0.38, "learning_rate": 9.751217487942067e-08, "logits/chosen": -2.0607657432556152, "logits/rejected": -2.0654361248016357, "logps/chosen": -1.9870922565460205, "logps/rejected": -2.5569920539855957, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.0911035537719727, "rewards/margins": 0.5201250314712524, "rewards/rejected": 0.5709785223007202, "step": 711 }, { "epoch": 0.38, "learning_rate": 9.750309702321388e-08, "logits/chosen": -2.0696849822998047, "logits/rejected": -2.0725290775299072, "logps/chosen": -3.300734043121338, "logps/rejected": -1.3174631595611572, "loss": 0.6129, "rewards/accuracies": 1.0, "rewards/chosen": 1.0715678930282593, "rewards/margins": 0.16759943962097168, "rewards/rejected": 0.9039684534072876, "step": 712 }, { "epoch": 0.38, "learning_rate": 9.749400305911322e-08, "logits/chosen": -2.095655918121338, "logits/rejected": -2.1335151195526123, "logps/chosen": -5.724201679229736, "logps/rejected": -11.75865364074707, "loss": 0.4798, "rewards/accuracies": 1.0, "rewards/chosen": 1.1118030548095703, "rewards/margins": 0.4849916100502014, "rewards/rejected": 0.6268114447593689, "step": 713 }, { "epoch": 0.39, "learning_rate": 9.748489299020233e-08, "logits/chosen": -2.029343843460083, "logits/rejected": -2.2542717456817627, "logps/chosen": -1.9466341733932495, "logps/rejected": -2.128537178039551, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8071237802505493, "rewards/margins": 0.0034667253494262695, "rewards/rejected": 0.803657054901123, "step": 714 }, { "epoch": 0.39, "learning_rate": 9.74757668195704e-08, "logits/chosen": -2.0009689331054688, "logits/rejected": -1.983919382095337, "logps/chosen": -5.219762802124023, "logps/rejected": -5.5905890464782715, "loss": 0.5218, "rewards/accuracies": 1.0, "rewards/chosen": 0.9662864804267883, "rewards/margins": 0.3781527280807495, "rewards/rejected": 0.5881337523460388, "step": 715 }, { "epoch": 0.39, "learning_rate": 9.746662455031202e-08, "logits/chosen": -2.0180561542510986, "logits/rejected": -2.2867462635040283, "logps/chosen": -4.3305816650390625, "logps/rejected": -5.925024032592773, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.8309165835380554, "rewards/margins": 0.016094088554382324, "rewards/rejected": 0.8148224949836731, "step": 716 }, { "epoch": 0.39, "learning_rate": 9.745746618552723e-08, "logits/chosen": -1.9985120296478271, "logits/rejected": -2.267965078353882, "logps/chosen": -4.803171634674072, "logps/rejected": -3.3330702781677246, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.5705098509788513, "rewards/margins": -0.025188684463500977, "rewards/rejected": 0.5956985354423523, "step": 717 }, { "epoch": 0.39, "learning_rate": 9.744829172832161e-08, "logits/chosen": -2.031200647354126, "logits/rejected": -2.020395040512085, "logps/chosen": -6.344085693359375, "logps/rejected": -1.757312297821045, "loss": 0.6593, "rewards/accuracies": 1.0, "rewards/chosen": 0.7204786539077759, "rewards/margins": 0.06897056102752686, "rewards/rejected": 0.651508092880249, "step": 718 }, { "epoch": 0.39, "learning_rate": 9.74391011818061e-08, "logits/chosen": -2.114891529083252, "logits/rejected": -2.112234354019165, "logps/chosen": -5.373620510101318, "logps/rejected": -3.3137118816375732, "loss": 0.417, "rewards/accuracies": 1.0, "rewards/chosen": 1.1975901126861572, "rewards/margins": 0.6590739488601685, "rewards/rejected": 0.5385161638259888, "step": 719 }, { "epoch": 0.39, "learning_rate": 9.742989454909715e-08, "logits/chosen": -2.072242498397827, "logits/rejected": -2.0879769325256348, "logps/chosen": -12.907793998718262, "logps/rejected": -6.484207630157471, "loss": 0.5555, "rewards/accuracies": 1.0, "rewards/chosen": 1.0687510967254639, "rewards/margins": 0.29738670587539673, "rewards/rejected": 0.7713643908500671, "step": 720 }, { "epoch": 0.39, "learning_rate": 9.742067183331663e-08, "logits/chosen": -2.03617525100708, "logits/rejected": -2.049631357192993, "logps/chosen": -1.1500177383422852, "logps/rejected": -3.439513921737671, "loss": 0.607, "rewards/accuracies": 1.0, "rewards/chosen": 0.9787508845329285, "rewards/margins": 0.18041837215423584, "rewards/rejected": 0.7983325123786926, "step": 721 }, { "epoch": 0.39, "learning_rate": 9.741143303759191e-08, "logits/chosen": -2.0223774909973145, "logits/rejected": -2.2987213134765625, "logps/chosen": -0.8886338472366333, "logps/rejected": -0.9406912326812744, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 1.0160936117172241, "rewards/margins": 0.006109714508056641, "rewards/rejected": 1.0099838972091675, "step": 722 }, { "epoch": 0.39, "learning_rate": 9.740217816505578e-08, "logits/chosen": -2.062282085418701, "logits/rejected": -2.2222554683685303, "logps/chosen": -1.1110730171203613, "logps/rejected": -1.1303167343139648, "loss": 0.6565, "rewards/accuracies": 1.0, "rewards/chosen": 0.9300684332847595, "rewards/margins": 0.07462942600250244, "rewards/rejected": 0.8554390072822571, "step": 723 }, { "epoch": 0.39, "learning_rate": 9.739290721884647e-08, "logits/chosen": -2.011697769165039, "logits/rejected": -2.256556987762451, "logps/chosen": -5.085090160369873, "logps/rejected": -6.880647659301758, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7938563823699951, "rewards/margins": 0.23088866472244263, "rewards/rejected": 0.5629677176475525, "step": 724 }, { "epoch": 0.39, "learning_rate": 9.73836202021077e-08, "logits/chosen": -1.9782034158706665, "logits/rejected": -2.246735095977783, "logps/chosen": -2.044254779815674, "logps/rejected": -2.006910800933838, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.6526188254356384, "rewards/margins": 0.014470100402832031, "rewards/rejected": 0.6381487250328064, "step": 725 }, { "epoch": 0.39, "learning_rate": 9.737431711798863e-08, "logits/chosen": -1.9606064558029175, "logits/rejected": -2.2104780673980713, "logps/chosen": -1.8671653270721436, "logps/rejected": -1.7834994792938232, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.7209469676017761, "rewards/margins": 0.01171022653579712, "rewards/rejected": 0.709236741065979, "step": 726 }, { "epoch": 0.39, "learning_rate": 9.73649979696438e-08, "logits/chosen": -1.9731518030166626, "logits/rejected": -1.995228886604309, "logps/chosen": -8.743396759033203, "logps/rejected": -27.464786529541016, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 1.0432449579238892, "rewards/margins": 0.18063700199127197, "rewards/rejected": 0.8626079559326172, "step": 727 }, { "epoch": 0.39, "learning_rate": 9.735566276023332e-08, "logits/chosen": -2.243192672729492, "logits/rejected": -2.2401857376098633, "logps/chosen": -1.5673480033874512, "logps/rejected": -1.540756344795227, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.8474187850952148, "rewards/margins": 0.0136566162109375, "rewards/rejected": 0.8337621688842773, "step": 728 }, { "epoch": 0.39, "learning_rate": 9.734631149292262e-08, "logits/chosen": -2.0274579524993896, "logits/rejected": -2.2857611179351807, "logps/chosen": -0.7140110731124878, "logps/rejected": -0.7212008833885193, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.7898131608963013, "rewards/margins": 0.02941274642944336, "rewards/rejected": 0.7604004144668579, "step": 729 }, { "epoch": 0.39, "learning_rate": 9.733694417088269e-08, "logits/chosen": -2.0465261936187744, "logits/rejected": -2.0870633125305176, "logps/chosen": -6.251544952392578, "logps/rejected": -10.681264877319336, "loss": 0.4172, "rewards/accuracies": 1.0, "rewards/chosen": 1.2147337198257446, "rewards/margins": 0.6582260131835938, "rewards/rejected": 0.5565077066421509, "step": 730 }, { "epoch": 0.39, "learning_rate": 9.732756079728988e-08, "logits/chosen": -2.1273722648620605, "logits/rejected": -2.123654365539551, "logps/chosen": -5.432598114013672, "logps/rejected": -4.940448760986328, "loss": 0.4214, "rewards/accuracies": 1.0, "rewards/chosen": 1.0681060552597046, "rewards/margins": 0.64609694480896, "rewards/rejected": 0.42200908064842224, "step": 731 }, { "epoch": 0.39, "learning_rate": 9.7318161375326e-08, "logits/chosen": -2.04610276222229, "logits/rejected": -2.2388269901275635, "logps/chosen": -1.0163383483886719, "logps/rejected": -0.9731709957122803, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.741945743560791, "rewards/margins": -0.0033028721809387207, "rewards/rejected": 0.7452486157417297, "step": 732 }, { "epoch": 0.4, "learning_rate": 9.730874590817834e-08, "logits/chosen": -2.1140670776367188, "logits/rejected": -2.254772186279297, "logps/chosen": -2.0227742195129395, "logps/rejected": -1.845428705215454, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.9846132397651672, "rewards/margins": 0.03864556550979614, "rewards/rejected": 0.9459676742553711, "step": 733 }, { "epoch": 0.4, "learning_rate": 9.72993143990396e-08, "logits/chosen": -2.0861785411834717, "logits/rejected": -2.2378392219543457, "logps/chosen": -0.5785784721374512, "logps/rejected": -0.6552047729492188, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.7763103246688843, "rewards/margins": 0.021030187606811523, "rewards/rejected": 0.7552801370620728, "step": 734 }, { "epoch": 0.4, "learning_rate": 9.728986685110789e-08, "logits/chosen": -2.0177648067474365, "logits/rejected": -2.013007402420044, "logps/chosen": -1.50954270362854, "logps/rejected": -4.133573532104492, "loss": 0.5265, "rewards/accuracies": 1.0, "rewards/chosen": 0.9863420724868774, "rewards/margins": 0.36671507358551025, "rewards/rejected": 0.6196269989013672, "step": 735 }, { "epoch": 0.4, "learning_rate": 9.728040326758684e-08, "logits/chosen": -2.013305187225342, "logits/rejected": -2.0112385749816895, "logps/chosen": -1.646511197090149, "logps/rejected": -2.1844322681427, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.8289486169815063, "rewards/margins": 0.005975544452667236, "rewards/rejected": 0.8229730725288391, "step": 736 }, { "epoch": 0.4, "learning_rate": 9.727092365168545e-08, "logits/chosen": -2.046541213989258, "logits/rejected": -2.2139523029327393, "logps/chosen": -4.964243412017822, "logps/rejected": -8.208748817443848, "loss": 0.4562, "rewards/accuracies": 1.0, "rewards/chosen": 0.8351419568061829, "rewards/margins": 0.5481747388839722, "rewards/rejected": 0.2869671881198883, "step": 737 }, { "epoch": 0.4, "learning_rate": 9.726142800661818e-08, "logits/chosen": -2.112030029296875, "logits/rejected": -2.2667365074157715, "logps/chosen": -1.574743390083313, "logps/rejected": -1.5459696054458618, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 0.935609757900238, "rewards/margins": -0.012548625469207764, "rewards/rejected": 0.9481583833694458, "step": 738 }, { "epoch": 0.4, "learning_rate": 9.72519163356049e-08, "logits/chosen": -2.1100690364837646, "logits/rejected": -2.3070578575134277, "logps/chosen": -0.9644100666046143, "logps/rejected": -0.9942756295204163, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9269683957099915, "rewards/margins": 0.00960540771484375, "rewards/rejected": 0.9173629879951477, "step": 739 }, { "epoch": 0.4, "learning_rate": 9.724238864187098e-08, "logits/chosen": -2.035681962966919, "logits/rejected": -2.2306761741638184, "logps/chosen": -1.1058220863342285, "logps/rejected": -2.0311412811279297, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8811561465263367, "rewards/margins": 0.040569186210632324, "rewards/rejected": 0.8405869603157043, "step": 740 }, { "epoch": 0.4, "learning_rate": 9.723284492864714e-08, "logits/chosen": -2.081003189086914, "logits/rejected": -2.33840274810791, "logps/chosen": -8.591239929199219, "logps/rejected": -8.68510627746582, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.7637237906455994, "rewards/margins": 0.009864449501037598, "rewards/rejected": 0.7538593411445618, "step": 741 }, { "epoch": 0.4, "learning_rate": 9.722328519916958e-08, "logits/chosen": -2.089224100112915, "logits/rejected": -2.089460611343384, "logps/chosen": -2.038142204284668, "logps/rejected": -1.2722666263580322, "loss": 0.6505, "rewards/accuracies": 1.0, "rewards/chosen": 0.9182615280151367, "rewards/margins": 0.08715230226516724, "rewards/rejected": 0.8311092257499695, "step": 742 }, { "epoch": 0.4, "learning_rate": 9.721370945667992e-08, "logits/chosen": -2.049834966659546, "logits/rejected": -2.270275592803955, "logps/chosen": -0.9323971271514893, "logps/rejected": -0.924955427646637, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.7548972368240356, "rewards/margins": 0.010594785213470459, "rewards/rejected": 0.7443024516105652, "step": 743 }, { "epoch": 0.4, "learning_rate": 9.720411770442525e-08, "logits/chosen": -1.9706133604049683, "logits/rejected": -1.9287755489349365, "logps/chosen": -15.02501106262207, "logps/rejected": -2.3374199867248535, "loss": 0.6373, "rewards/accuracies": 1.0, "rewards/chosen": 0.9943168759346008, "rewards/margins": 0.1149703860282898, "rewards/rejected": 0.879346489906311, "step": 744 }, { "epoch": 0.4, "learning_rate": 9.7194509945658e-08, "logits/chosen": -2.0084171295166016, "logits/rejected": -2.0109829902648926, "logps/chosen": -5.480005741119385, "logps/rejected": -2.910092353820801, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 1.3628580570220947, "rewards/margins": 0.7225258946418762, "rewards/rejected": 0.6403321623802185, "step": 745 }, { "epoch": 0.4, "learning_rate": 9.718488618363611e-08, "logits/chosen": -2.2116780281066895, "logits/rejected": -2.297640085220337, "logps/chosen": -2.8617336750030518, "logps/rejected": -3.06351637840271, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.7615814208984375, "rewards/margins": 0.03821218013763428, "rewards/rejected": 0.7233692407608032, "step": 746 }, { "epoch": 0.4, "learning_rate": 9.717524642162292e-08, "logits/chosen": -2.092013359069824, "logits/rejected": -2.09432315826416, "logps/chosen": -4.589305877685547, "logps/rejected": -3.923258066177368, "loss": 0.6127, "rewards/accuracies": 1.0, "rewards/chosen": 0.960601270198822, "rewards/margins": 0.16802990436553955, "rewards/rejected": 0.7925713658332825, "step": 747 }, { "epoch": 0.4, "learning_rate": 9.716559066288714e-08, "logits/chosen": -2.04732608795166, "logits/rejected": -2.3183743953704834, "logps/chosen": -0.8380643129348755, "logps/rejected": -0.8059197664260864, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9441733360290527, "rewards/margins": 0.006440222263336182, "rewards/rejected": 0.9377331137657166, "step": 748 }, { "epoch": 0.4, "learning_rate": 9.715591891070303e-08, "logits/chosen": -2.041748046875, "logits/rejected": -2.2764382362365723, "logps/chosen": -1.6517877578735352, "logps/rejected": -1.7467368841171265, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9876805543899536, "rewards/margins": 0.01180487871170044, "rewards/rejected": 0.9758756756782532, "step": 749 }, { "epoch": 0.4, "learning_rate": 9.714623116835015e-08, "logits/chosen": -2.1431915760040283, "logits/rejected": -2.1396868228912354, "logps/chosen": -5.1598687171936035, "logps/rejected": -2.6914498805999756, "loss": 0.4335, "rewards/accuracies": 1.0, "rewards/chosen": 1.378989577293396, "rewards/margins": 0.6112921237945557, "rewards/rejected": 0.7676974534988403, "step": 750 }, { "epoch": 0.41, "learning_rate": 9.713652743911352e-08, "logits/chosen": -2.0102670192718506, "logits/rejected": -2.336660861968994, "logps/chosen": -1.6621830463409424, "logps/rejected": -1.5867286920547485, "loss": 0.7057, "rewards/accuracies": 0.0, "rewards/chosen": 0.9325906038284302, "rewards/margins": -0.025022029876708984, "rewards/rejected": 0.9576126337051392, "step": 751 }, { "epoch": 0.41, "learning_rate": 9.712680772628363e-08, "logits/chosen": -1.9618066549301147, "logits/rejected": -2.2215487957000732, "logps/chosen": -0.8554784059524536, "logps/rejected": -0.8345946073532104, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8362676501274109, "rewards/margins": 0.017587363719940186, "rewards/rejected": 0.8186802864074707, "step": 752 }, { "epoch": 0.41, "learning_rate": 9.711707203315633e-08, "logits/chosen": -2.013068437576294, "logits/rejected": -2.235994338989258, "logps/chosen": -1.3495042324066162, "logps/rejected": -1.2388023138046265, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.94776451587677, "rewards/margins": 0.0037655234336853027, "rewards/rejected": 0.9439989924430847, "step": 753 }, { "epoch": 0.41, "learning_rate": 9.710732036303291e-08, "logits/chosen": -2.000225782394409, "logits/rejected": -2.2669899463653564, "logps/chosen": -0.969155490398407, "logps/rejected": -1.0002186298370361, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9647842645645142, "rewards/margins": 0.005757808685302734, "rewards/rejected": 0.9590264558792114, "step": 754 }, { "epoch": 0.41, "learning_rate": 9.70975527192201e-08, "logits/chosen": -2.1148667335510254, "logits/rejected": -2.1160786151885986, "logps/chosen": -2.484484910964966, "logps/rejected": -1.165265440940857, "loss": 0.6075, "rewards/accuracies": 1.0, "rewards/chosen": 1.0320111513137817, "rewards/margins": 0.17924463748931885, "rewards/rejected": 0.8527665138244629, "step": 755 }, { "epoch": 0.41, "learning_rate": 9.708776910502999e-08, "logits/chosen": -2.1923294067382812, "logits/rejected": -2.194971799850464, "logps/chosen": -2.532257556915283, "logps/rejected": -1.4403105974197388, "loss": 0.583, "rewards/accuracies": 1.0, "rewards/chosen": 0.9589902758598328, "rewards/margins": 0.23384559154510498, "rewards/rejected": 0.7251446843147278, "step": 756 }, { "epoch": 0.41, "learning_rate": 9.707796952378014e-08, "logits/chosen": -2.13364577293396, "logits/rejected": -2.1251282691955566, "logps/chosen": -6.593991756439209, "logps/rejected": -3.052295207977295, "loss": 0.5916, "rewards/accuracies": 1.0, "rewards/chosen": 0.8097770810127258, "rewards/margins": 0.21466928720474243, "rewards/rejected": 0.5951077938079834, "step": 757 }, { "epoch": 0.41, "learning_rate": 9.706815397879351e-08, "logits/chosen": -2.025395631790161, "logits/rejected": -2.2348673343658447, "logps/chosen": -1.4774433374404907, "logps/rejected": -1.7450966835021973, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.7821733951568604, "rewards/margins": 0.03711622953414917, "rewards/rejected": 0.7450571656227112, "step": 758 }, { "epoch": 0.41, "learning_rate": 9.705832247339844e-08, "logits/chosen": -1.9905824661254883, "logits/rejected": -2.239884376525879, "logps/chosen": -1.9148958921432495, "logps/rejected": -2.2088704109191895, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.766610324382782, "rewards/margins": 0.027183353900909424, "rewards/rejected": 0.7394269704818726, "step": 759 }, { "epoch": 0.41, "learning_rate": 9.704847501092875e-08, "logits/chosen": -2.126952886581421, "logits/rejected": -2.112907648086548, "logps/chosen": -11.404644966125488, "logps/rejected": -2.8752760887145996, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": 1.204024076461792, "rewards/margins": 0.5295281410217285, "rewards/rejected": 0.6744959354400635, "step": 760 }, { "epoch": 0.41, "learning_rate": 9.70386115947236e-08, "logits/chosen": -2.181783437728882, "logits/rejected": -2.1889126300811768, "logps/chosen": -3.284464120864868, "logps/rejected": -2.982703924179077, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 1.1909228563308716, "rewards/margins": 0.6117130517959595, "rewards/rejected": 0.5792098045349121, "step": 761 }, { "epoch": 0.41, "learning_rate": 9.70287322281276e-08, "logits/chosen": -2.1029579639434814, "logits/rejected": -2.2296624183654785, "logps/chosen": -9.784246444702148, "logps/rejected": -1.500848650932312, "loss": 0.7333, "rewards/accuracies": 0.0, "rewards/chosen": 0.6894058585166931, "rewards/margins": -0.07882839441299438, "rewards/rejected": 0.7682342529296875, "step": 762 }, { "epoch": 0.41, "learning_rate": 9.701883691449075e-08, "logits/chosen": -2.04935359954834, "logits/rejected": -2.300246477127075, "logps/chosen": -3.602918863296509, "logps/rejected": -3.8452022075653076, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.7175182700157166, "rewards/margins": 0.0010712146759033203, "rewards/rejected": 0.7164470553398132, "step": 763 }, { "epoch": 0.41, "learning_rate": 9.700892565716847e-08, "logits/chosen": -2.0838623046875, "logits/rejected": -2.082430124282837, "logps/chosen": -5.250256538391113, "logps/rejected": -4.198360919952393, "loss": 0.4275, "rewards/accuracies": 1.0, "rewards/chosen": 1.1927863359451294, "rewards/margins": 0.6285889148712158, "rewards/rejected": 0.5641974210739136, "step": 764 }, { "epoch": 0.41, "learning_rate": 9.699899845952158e-08, "logits/chosen": -2.03361177444458, "logits/rejected": -2.2823870182037354, "logps/chosen": -0.9670379161834717, "logps/rejected": -0.9411323070526123, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.8749400973320007, "rewards/margins": 0.0323789119720459, "rewards/rejected": 0.8425611853599548, "step": 765 }, { "epoch": 0.41, "learning_rate": 9.698905532491633e-08, "logits/chosen": -2.0058934688568115, "logits/rejected": -2.0083260536193848, "logps/chosen": -2.689042091369629, "logps/rejected": -4.6281280517578125, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": 0.9017060399055481, "rewards/margins": 0.39353859424591064, "rewards/rejected": 0.5081674456596375, "step": 766 }, { "epoch": 0.41, "learning_rate": 9.697909625672433e-08, "logits/chosen": -2.100552558898926, "logits/rejected": -2.290801525115967, "logps/chosen": -2.7595672607421875, "logps/rejected": -6.73707914352417, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9542363286018372, "rewards/margins": 0.065684974193573, "rewards/rejected": 0.8885513544082642, "step": 767 }, { "epoch": 0.41, "learning_rate": 9.696912125832262e-08, "logits/chosen": -2.1386520862579346, "logits/rejected": -2.1382386684417725, "logps/chosen": -6.370325088500977, "logps/rejected": -2.9003419876098633, "loss": 0.3457, "rewards/accuracies": 1.0, "rewards/chosen": 1.461936593055725, "rewards/margins": 0.8842113614082336, "rewards/rejected": 0.5777252316474915, "step": 768 }, { "epoch": 0.41, "learning_rate": 9.695913033309364e-08, "logits/chosen": -1.9313015937805176, "logits/rejected": -2.2767333984375, "logps/chosen": -10.139973640441895, "logps/rejected": -10.197734832763672, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.8133748173713684, "rewards/margins": 0.05407840013504028, "rewards/rejected": 0.7592964172363281, "step": 769 }, { "epoch": 0.42, "learning_rate": 9.694912348442521e-08, "logits/chosen": -2.0874390602111816, "logits/rejected": -2.2527942657470703, "logps/chosen": -2.028528928756714, "logps/rejected": -10.79062557220459, "loss": 0.6268, "rewards/accuracies": 1.0, "rewards/chosen": 0.9619420170783997, "rewards/margins": 0.13740652799606323, "rewards/rejected": 0.8245354890823364, "step": 770 }, { "epoch": 0.42, "learning_rate": 9.69391007157106e-08, "logits/chosen": -2.087169885635376, "logits/rejected": -2.087157726287842, "logps/chosen": -3.0384294986724854, "logps/rejected": -1.7240428924560547, "loss": 0.6374, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482694864273071, "rewards/margins": 0.11481708288192749, "rewards/rejected": 0.8334524035453796, "step": 771 }, { "epoch": 0.42, "learning_rate": 9.69290620303484e-08, "logits/chosen": -2.0611965656280518, "logits/rejected": -2.204190254211426, "logps/chosen": -8.28017807006836, "logps/rejected": -1.3969473838806152, "loss": 0.7141, "rewards/accuracies": 0.0, "rewards/chosen": 0.7470809817314148, "rewards/margins": -0.04150122404098511, "rewards/rejected": 0.7885822057723999, "step": 772 }, { "epoch": 0.42, "learning_rate": 9.69190074317427e-08, "logits/chosen": -2.100212574005127, "logits/rejected": -2.102410316467285, "logps/chosen": -1.393657922744751, "logps/rejected": -8.702144622802734, "loss": 0.4225, "rewards/accuracies": 1.0, "rewards/chosen": 1.1122748851776123, "rewards/margins": 0.6428994536399841, "rewards/rejected": 0.4693754315376282, "step": 773 }, { "epoch": 0.42, "learning_rate": 9.690893692330288e-08, "logits/chosen": -2.0735206604003906, "logits/rejected": -2.2481086254119873, "logps/chosen": -0.9650331139564514, "logps/rejected": -0.9796010255813599, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9232983589172363, "rewards/margins": 0.006793498992919922, "rewards/rejected": 0.9165048599243164, "step": 774 }, { "epoch": 0.42, "learning_rate": 9.689885050844378e-08, "logits/chosen": -1.9897007942199707, "logits/rejected": -2.214829683303833, "logps/chosen": -1.7652766704559326, "logps/rejected": -1.7489763498306274, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.659632682800293, "rewards/margins": 0.027992665767669678, "rewards/rejected": 0.6316400170326233, "step": 775 }, { "epoch": 0.42, "learning_rate": 9.688874819058562e-08, "logits/chosen": -1.9599381685256958, "logits/rejected": -2.2385668754577637, "logps/chosen": -1.8840409517288208, "logps/rejected": -1.9522767066955566, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.9308390617370605, "rewards/margins": 0.04418933391571045, "rewards/rejected": 0.8866497278213501, "step": 776 }, { "epoch": 0.42, "learning_rate": 9.6878629973154e-08, "logits/chosen": -2.0411736965179443, "logits/rejected": -2.0319161415100098, "logps/chosen": -8.64848518371582, "logps/rejected": -2.3254342079162598, "loss": 0.5201, "rewards/accuracies": 1.0, "rewards/chosen": 1.288423776626587, "rewards/margins": 0.3825380802154541, "rewards/rejected": 0.9058856964111328, "step": 777 }, { "epoch": 0.42, "learning_rate": 9.686849585957994e-08, "logits/chosen": -2.078685998916626, "logits/rejected": -2.086790084838867, "logps/chosen": -5.599409103393555, "logps/rejected": -8.018377304077148, "loss": 0.5139, "rewards/accuracies": 1.0, "rewards/chosen": 1.07417893409729, "rewards/margins": 0.3979074954986572, "rewards/rejected": 0.6762714385986328, "step": 778 }, { "epoch": 0.42, "learning_rate": 9.68583458532998e-08, "logits/chosen": -2.022348403930664, "logits/rejected": -2.3310139179229736, "logps/chosen": -5.182393550872803, "logps/rejected": -7.654712677001953, "loss": 0.6455, "rewards/accuracies": 1.0, "rewards/chosen": 1.0902996063232422, "rewards/margins": 0.09776383638381958, "rewards/rejected": 0.9925357699394226, "step": 779 }, { "epoch": 0.42, "learning_rate": 9.68481799577554e-08, "logits/chosen": -2.0146102905273438, "logits/rejected": -2.0224595069885254, "logps/chosen": -2.392914056777954, "logps/rejected": -2.257627010345459, "loss": 0.5558, "rewards/accuracies": 1.0, "rewards/chosen": 0.9079494476318359, "rewards/margins": 0.2966311573982239, "rewards/rejected": 0.6113182902336121, "step": 780 }, { "epoch": 0.42, "learning_rate": 9.683799817639386e-08, "logits/chosen": -2.1146819591522217, "logits/rejected": -2.1103463172912598, "logps/chosen": -4.377567768096924, "logps/rejected": -2.250520944595337, "loss": 0.4982, "rewards/accuracies": 1.0, "rewards/chosen": 1.1555843353271484, "rewards/margins": 0.43731099367141724, "rewards/rejected": 0.7182733416557312, "step": 781 }, { "epoch": 0.42, "learning_rate": 9.682780051266779e-08, "logits/chosen": -2.022207498550415, "logits/rejected": -2.2513742446899414, "logps/chosen": -3.1749603748321533, "logps/rejected": -3.0537424087524414, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.5818638205528259, "rewards/margins": 0.033712029457092285, "rewards/rejected": 0.5481517910957336, "step": 782 }, { "epoch": 0.42, "learning_rate": 9.68175869700351e-08, "logits/chosen": -2.013465404510498, "logits/rejected": -2.24326753616333, "logps/chosen": -5.624499320983887, "logps/rejected": -1.6247422695159912, "loss": 0.7269, "rewards/accuracies": 0.0, "rewards/chosen": 0.7995306849479675, "rewards/margins": -0.06648468971252441, "rewards/rejected": 0.8660153746604919, "step": 783 }, { "epoch": 0.42, "learning_rate": 9.68073575519591e-08, "logits/chosen": -2.055377244949341, "logits/rejected": -2.2638299465179443, "logps/chosen": -0.8814398050308228, "logps/rejected": -0.8668960332870483, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207877278327942, "rewards/margins": 0.0006629824638366699, "rewards/rejected": 0.8201247453689575, "step": 784 }, { "epoch": 0.42, "learning_rate": 9.679711226190853e-08, "logits/chosen": -1.9487719535827637, "logits/rejected": -2.240553140640259, "logps/chosen": -3.5732922554016113, "logps/rejected": -3.6140456199645996, "loss": 0.7087, "rewards/accuracies": 0.0, "rewards/chosen": 0.6752411723136902, "rewards/margins": -0.03089386224746704, "rewards/rejected": 0.7061350345611572, "step": 785 }, { "epoch": 0.42, "learning_rate": 9.678685110335746e-08, "logits/chosen": -2.139615774154663, "logits/rejected": -2.139557123184204, "logps/chosen": -1.2831547260284424, "logps/rejected": -1.7615350484848022, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 0.8567973971366882, "rewards/margins": -0.021875977516174316, "rewards/rejected": 0.8786733746528625, "step": 786 }, { "epoch": 0.42, "learning_rate": 9.677657407978537e-08, "logits/chosen": -2.0216457843780518, "logits/rejected": -2.229576826095581, "logps/chosen": -1.8524473905563354, "logps/rejected": -1.8388597965240479, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9155839085578918, "rewards/margins": 0.008634865283966064, "rewards/rejected": 0.9069490432739258, "step": 787 }, { "epoch": 0.43, "learning_rate": 9.676628119467711e-08, "logits/chosen": -2.051889657974243, "logits/rejected": -2.260061025619507, "logps/chosen": -1.1364774703979492, "logps/rejected": -1.0443987846374512, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7577983140945435, "rewards/margins": 0.01737743616104126, "rewards/rejected": 0.7404208779335022, "step": 788 }, { "epoch": 0.43, "learning_rate": 9.675597245152289e-08, "logits/chosen": -2.078721761703491, "logits/rejected": -2.276027202606201, "logps/chosen": -1.3963927030563354, "logps/rejected": -1.4871025085449219, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8474928140640259, "rewards/margins": 0.0405956506729126, "rewards/rejected": 0.8068971633911133, "step": 789 }, { "epoch": 0.43, "learning_rate": 9.674564785381836e-08, "logits/chosen": -2.079357385635376, "logits/rejected": -2.0872175693511963, "logps/chosen": -1.8218424320220947, "logps/rejected": -9.093782424926758, "loss": 0.6424, "rewards/accuracies": 1.0, "rewards/chosen": 0.8598939776420593, "rewards/margins": 0.10420054197311401, "rewards/rejected": 0.7556934356689453, "step": 790 }, { "epoch": 0.43, "learning_rate": 9.673530740506446e-08, "logits/chosen": -2.1138265132904053, "logits/rejected": -2.2651479244232178, "logps/chosen": -1.2992531061172485, "logps/rejected": -1.3674349784851074, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 1.0331698656082153, "rewards/margins": 0.021388888359069824, "rewards/rejected": 1.0117809772491455, "step": 791 }, { "epoch": 0.43, "learning_rate": 9.672495110876758e-08, "logits/chosen": -2.0110068321228027, "logits/rejected": -2.2225120067596436, "logps/chosen": -1.9644379615783691, "logps/rejected": -3.902566909790039, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 0.9028162360191345, "rewards/margins": 0.07967007160186768, "rewards/rejected": 0.8231461644172668, "step": 792 }, { "epoch": 0.43, "learning_rate": 9.671457896843942e-08, "logits/chosen": -2.2067370414733887, "logits/rejected": -2.0339291095733643, "logps/chosen": -57.09090805053711, "logps/rejected": -1.5629684925079346, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.3955090045928955, "rewards/margins": 0.6484652161598206, "rewards/rejected": 0.747043788433075, "step": 793 }, { "epoch": 0.43, "learning_rate": 9.67041909875971e-08, "logits/chosen": -2.040182113647461, "logits/rejected": -2.223554849624634, "logps/chosen": -0.5868239402770996, "logps/rejected": -0.5706867575645447, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.6890859007835388, "rewards/margins": 0.015318691730499268, "rewards/rejected": 0.6737672090530396, "step": 794 }, { "epoch": 0.43, "learning_rate": 9.669378716976311e-08, "logits/chosen": -2.001227378845215, "logits/rejected": -2.262833833694458, "logps/chosen": -1.2400363683700562, "logps/rejected": -1.2498173713684082, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.7790939807891846, "rewards/margins": 0.008258700370788574, "rewards/rejected": 0.770835280418396, "step": 795 }, { "epoch": 0.43, "learning_rate": 9.668336751846527e-08, "logits/chosen": -2.015437602996826, "logits/rejected": -2.0194482803344727, "logps/chosen": -0.6163105368614197, "logps/rejected": -3.4502406120300293, "loss": 0.6152, "rewards/accuracies": 1.0, "rewards/chosen": 0.7227783799171448, "rewards/margins": 0.16253626346588135, "rewards/rejected": 0.5602421164512634, "step": 796 }, { "epoch": 0.43, "learning_rate": 9.667293203723681e-08, "logits/chosen": -1.9600400924682617, "logits/rejected": -2.2397782802581787, "logps/chosen": -0.5598790049552917, "logps/rejected": -0.5668911933898926, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8053030371665955, "rewards/margins": 0.007512152194976807, "rewards/rejected": 0.7977908849716187, "step": 797 }, { "epoch": 0.43, "learning_rate": 9.66624807296163e-08, "logits/chosen": -1.969586968421936, "logits/rejected": -2.2393994331359863, "logps/chosen": -1.1812512874603271, "logps/rejected": -1.1361204385757446, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9887839555740356, "rewards/margins": 0.021691441535949707, "rewards/rejected": 0.9670925140380859, "step": 798 }, { "epoch": 0.43, "learning_rate": 9.665201359914771e-08, "logits/chosen": -1.9919116497039795, "logits/rejected": -2.271893262863159, "logps/chosen": -0.6541263461112976, "logps/rejected": -0.636017918586731, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9562947154045105, "rewards/margins": 0.02028900384902954, "rewards/rejected": 0.936005711555481, "step": 799 }, { "epoch": 0.43, "learning_rate": 9.664153064938033e-08, "logits/chosen": -2.0285003185272217, "logits/rejected": -2.0247373580932617, "logps/chosen": -6.999521255493164, "logps/rejected": -2.6593990325927734, "loss": 0.5291, "rewards/accuracies": 1.0, "rewards/chosen": 1.0142191648483276, "rewards/margins": 0.36035698652267456, "rewards/rejected": 0.6538621783256531, "step": 800 }, { "epoch": 0.43, "learning_rate": 9.663103188386886e-08, "logits/chosen": -2.057323694229126, "logits/rejected": -2.013324499130249, "logps/chosen": -35.04690933227539, "logps/rejected": -2.038470506668091, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8839802145957947, "rewards/margins": 0.0221976637840271, "rewards/rejected": 0.8617825508117676, "step": 801 }, { "epoch": 0.43, "learning_rate": 9.662051730617332e-08, "logits/chosen": -2.117913007736206, "logits/rejected": -2.128199577331543, "logps/chosen": -5.877665996551514, "logps/rejected": -2.7865664958953857, "loss": 0.4811, "rewards/accuracies": 1.0, "rewards/chosen": 1.1668213605880737, "rewards/margins": 0.48145145177841187, "rewards/rejected": 0.6853699088096619, "step": 802 }, { "epoch": 0.43, "learning_rate": 9.660998691985914e-08, "logits/chosen": -2.138530969619751, "logits/rejected": -2.164036512374878, "logps/chosen": -1.6523969173431396, "logps/rejected": -8.849287033081055, "loss": 0.4709, "rewards/accuracies": 1.0, "rewards/chosen": 1.019452691078186, "rewards/margins": 0.5085687637329102, "rewards/rejected": 0.5108839273452759, "step": 803 }, { "epoch": 0.43, "learning_rate": 9.659944072849706e-08, "logits/chosen": -2.1419084072113037, "logits/rejected": -2.321089506149292, "logps/chosen": -1.9686139822006226, "logps/rejected": -1.8576544523239136, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.616416335105896, "rewards/margins": 0.00021636486053466797, "rewards/rejected": 0.6161999702453613, "step": 804 }, { "epoch": 0.43, "learning_rate": 9.658887873566322e-08, "logits/chosen": -2.0410046577453613, "logits/rejected": -2.0294692516326904, "logps/chosen": -27.173139572143555, "logps/rejected": -0.6482908725738525, "loss": 0.4654, "rewards/accuracies": 1.0, "rewards/chosen": 1.413060188293457, "rewards/margins": 0.5230329632759094, "rewards/rejected": 0.8900272250175476, "step": 805 }, { "epoch": 0.43, "learning_rate": 9.657830094493907e-08, "logits/chosen": -2.069002389907837, "logits/rejected": -1.9631575345993042, "logps/chosen": -32.72806167602539, "logps/rejected": -5.094482421875, "loss": 0.7359, "rewards/accuracies": 0.0, "rewards/chosen": 0.8269810080528259, "rewards/margins": -0.08373820781707764, "rewards/rejected": 0.9107192158699036, "step": 806 }, { "epoch": 0.44, "learning_rate": 9.656770735991149e-08, "logits/chosen": -2.120147466659546, "logits/rejected": -2.0437817573547363, "logps/chosen": -30.487817764282227, "logps/rejected": -6.340915679931641, "loss": 0.6104, "rewards/accuracies": 1.0, "rewards/chosen": 0.816706657409668, "rewards/margins": 0.1729181408882141, "rewards/rejected": 0.6437885165214539, "step": 807 }, { "epoch": 0.44, "learning_rate": 9.655709798417267e-08, "logits/chosen": -2.0328457355499268, "logits/rejected": -2.0265860557556152, "logps/chosen": -7.88062047958374, "logps/rejected": -2.2739644050598145, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.2774709463119507, "rewards/margins": 0.6348302960395813, "rewards/rejected": 0.6426406502723694, "step": 808 }, { "epoch": 0.44, "learning_rate": 9.654647282132011e-08, "logits/chosen": -2.081669807434082, "logits/rejected": -2.3004117012023926, "logps/chosen": -2.1385245323181152, "logps/rejected": -1.696093201637268, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 1.133098840713501, "rewards/margins": 0.0702815055847168, "rewards/rejected": 1.0628173351287842, "step": 809 }, { "epoch": 0.44, "learning_rate": 9.653583187495677e-08, "logits/chosen": -2.0963988304138184, "logits/rejected": -2.0847983360290527, "logps/chosen": -7.544620513916016, "logps/rejected": -3.8668594360351562, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": 1.3154348134994507, "rewards/margins": 0.47324568033218384, "rewards/rejected": 0.8421891331672668, "step": 810 }, { "epoch": 0.44, "learning_rate": 9.652517514869089e-08, "logits/chosen": -1.9740761518478394, "logits/rejected": -2.255922555923462, "logps/chosen": -2.6784543991088867, "logps/rejected": -1.0250300168991089, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.7458288073539734, "rewards/margins": -0.012699723243713379, "rewards/rejected": 0.7585285305976868, "step": 811 }, { "epoch": 0.44, "learning_rate": 9.651450264613605e-08, "logits/chosen": -2.113679885864258, "logits/rejected": -2.1096720695495605, "logps/chosen": -2.370622396469116, "logps/rejected": -6.502180099487305, "loss": 0.4923, "rewards/accuracies": 1.0, "rewards/chosen": 1.0292233228683472, "rewards/margins": 0.4523887038230896, "rewards/rejected": 0.5768346190452576, "step": 812 }, { "epoch": 0.44, "learning_rate": 9.650381437091122e-08, "logits/chosen": -2.121037006378174, "logits/rejected": -2.114499807357788, "logps/chosen": -7.174473285675049, "logps/rejected": -3.208158493041992, "loss": 0.4954, "rewards/accuracies": 1.0, "rewards/chosen": 1.09811270236969, "rewards/margins": 0.4445396661758423, "rewards/rejected": 0.6535730361938477, "step": 813 }, { "epoch": 0.44, "learning_rate": 9.64931103266407e-08, "logits/chosen": -2.1249589920043945, "logits/rejected": -2.1841228008270264, "logps/chosen": -10.09322738647461, "logps/rejected": -21.57384490966797, "loss": 0.4863, "rewards/accuracies": 1.0, "rewards/chosen": 1.194196343421936, "rewards/margins": 0.4679420590400696, "rewards/rejected": 0.7262542843818665, "step": 814 }, { "epoch": 0.44, "learning_rate": 9.648239051695415e-08, "logits/chosen": -2.039646625518799, "logits/rejected": -2.0468037128448486, "logps/chosen": -2.159597873687744, "logps/rejected": -13.292329788208008, "loss": 0.5572, "rewards/accuracies": 1.0, "rewards/chosen": 1.011181116104126, "rewards/margins": 0.2933313846588135, "rewards/rejected": 0.7178497314453125, "step": 815 }, { "epoch": 0.44, "learning_rate": 9.647165494548655e-08, "logits/chosen": -2.076857566833496, "logits/rejected": -1.9704256057739258, "logps/chosen": -41.5711669921875, "logps/rejected": -2.1634838581085205, "loss": 0.4597, "rewards/accuracies": 1.0, "rewards/chosen": 1.1579288244247437, "rewards/margins": 0.538449227809906, "rewards/rejected": 0.6194795966148376, "step": 816 }, { "epoch": 0.44, "learning_rate": 9.646090361587826e-08, "logits/chosen": -2.145608901977539, "logits/rejected": -2.2664730548858643, "logps/chosen": -3.163407325744629, "logps/rejected": -3.0640740394592285, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.5174679756164551, "rewards/margins": 0.02979564666748047, "rewards/rejected": 0.4876723289489746, "step": 817 }, { "epoch": 0.44, "learning_rate": 9.645013653177494e-08, "logits/chosen": -2.117046356201172, "logits/rejected": -2.117485523223877, "logps/chosen": -2.896228551864624, "logps/rejected": -3.6690502166748047, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 0.936376690864563, "rewards/margins": 0.36262351274490356, "rewards/rejected": 0.5737531781196594, "step": 818 }, { "epoch": 0.44, "learning_rate": 9.643935369682763e-08, "logits/chosen": -2.0246477127075195, "logits/rejected": -2.0138888359069824, "logps/chosen": -9.682572364807129, "logps/rejected": -0.5223731994628906, "loss": 0.5124, "rewards/accuracies": 1.0, "rewards/chosen": 1.2526049613952637, "rewards/margins": 0.40145939588546753, "rewards/rejected": 0.8511455655097961, "step": 819 }, { "epoch": 0.44, "learning_rate": 9.64285551146927e-08, "logits/chosen": -2.058802366256714, "logits/rejected": -2.0651676654815674, "logps/chosen": -5.635256290435791, "logps/rejected": -2.1057021617889404, "loss": 0.5689, "rewards/accuracies": 1.0, "rewards/chosen": 0.992247998714447, "rewards/margins": 0.2661975026130676, "rewards/rejected": 0.7260504961013794, "step": 820 }, { "epoch": 0.44, "learning_rate": 9.641774078903186e-08, "logits/chosen": -2.173077344894409, "logits/rejected": -2.237314462661743, "logps/chosen": -1.0546534061431885, "logps/rejected": -1.0639114379882812, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 1.014255166053772, "rewards/margins": 0.01740187406539917, "rewards/rejected": 0.9968532919883728, "step": 821 }, { "epoch": 0.44, "learning_rate": 9.640691072351213e-08, "logits/chosen": -1.9924441576004028, "logits/rejected": -2.202949285507202, "logps/chosen": -0.7899144887924194, "logps/rejected": -0.8300929665565491, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8110727667808533, "rewards/margins": 0.006603360176086426, "rewards/rejected": 0.8044694066047668, "step": 822 }, { "epoch": 0.44, "learning_rate": 9.639606492180592e-08, "logits/chosen": -2.1352717876434326, "logits/rejected": -2.135981321334839, "logps/chosen": -1.7149513959884644, "logps/rejected": -2.196390151977539, "loss": 0.5737, "rewards/accuracies": 1.0, "rewards/chosen": 0.9896909594535828, "rewards/margins": 0.25514721870422363, "rewards/rejected": 0.7345437407493591, "step": 823 }, { "epoch": 0.44, "learning_rate": 9.638520338759094e-08, "logits/chosen": -2.057572841644287, "logits/rejected": -2.043775796890259, "logps/chosen": -22.65699577331543, "logps/rejected": -5.324042320251465, "loss": 0.6149, "rewards/accuracies": 1.0, "rewards/chosen": 1.0189058780670166, "rewards/margins": 0.16317009925842285, "rewards/rejected": 0.8557357788085938, "step": 824 }, { "epoch": 0.44, "learning_rate": 9.637432612455024e-08, "logits/chosen": -2.034480333328247, "logits/rejected": -2.0322141647338867, "logps/chosen": -5.203022003173828, "logps/rejected": -4.052666664123535, "loss": 0.3448, "rewards/accuracies": 1.0, "rewards/chosen": 1.3367903232574463, "rewards/margins": 0.8874198198318481, "rewards/rejected": 0.44937047362327576, "step": 825 }, { "epoch": 0.45, "learning_rate": 9.636343313637221e-08, "logits/chosen": -2.0302910804748535, "logits/rejected": -2.241858720779419, "logps/chosen": -10.394308090209961, "logps/rejected": -10.494797706604004, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.6523966193199158, "rewards/margins": 0.0001404881477355957, "rewards/rejected": 0.6522561311721802, "step": 826 }, { "epoch": 0.45, "learning_rate": 9.635252442675057e-08, "logits/chosen": -1.9658983945846558, "logits/rejected": -2.2415812015533447, "logps/chosen": -2.3228392601013184, "logps/rejected": -2.1969969272613525, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.9905678033828735, "rewards/margins": 0.03822755813598633, "rewards/rejected": 0.9523402452468872, "step": 827 }, { "epoch": 0.45, "learning_rate": 9.634159999938436e-08, "logits/chosen": -2.1260297298431396, "logits/rejected": -2.028043270111084, "logps/chosen": -34.81416320800781, "logps/rejected": -2.290097951889038, "loss": 0.4833, "rewards/accuracies": 1.0, "rewards/chosen": 1.1704822778701782, "rewards/margins": 0.47583532333374023, "rewards/rejected": 0.694646954536438, "step": 828 }, { "epoch": 0.45, "learning_rate": 9.633065985797797e-08, "logits/chosen": -2.116339921951294, "logits/rejected": -2.117471218109131, "logps/chosen": -2.8368632793426514, "logps/rejected": -2.836322784423828, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.8198380470275879, "rewards/margins": 0.14999741315841675, "rewards/rejected": 0.6698406338691711, "step": 829 }, { "epoch": 0.45, "learning_rate": 9.631970400624111e-08, "logits/chosen": -2.0281147956848145, "logits/rejected": -2.0276522636413574, "logps/chosen": -6.482179164886475, "logps/rejected": -3.648303747177124, "loss": 0.5486, "rewards/accuracies": 1.0, "rewards/chosen": 1.0706900358200073, "rewards/margins": 0.31364673376083374, "rewards/rejected": 0.7570433020591736, "step": 830 }, { "epoch": 0.45, "learning_rate": 9.630873244788881e-08, "logits/chosen": -2.141995429992676, "logits/rejected": -2.1119396686553955, "logps/chosen": -6.501638412475586, "logps/rejected": -4.662588119506836, "loss": 0.4322, "rewards/accuracies": 1.0, "rewards/chosen": 1.1439783573150635, "rewards/margins": 0.6149199604988098, "rewards/rejected": 0.5290583968162537, "step": 831 }, { "epoch": 0.45, "learning_rate": 9.629774518664144e-08, "logits/chosen": -1.9418132305145264, "logits/rejected": -1.936261773109436, "logps/chosen": -6.189367294311523, "logps/rejected": -3.6874992847442627, "loss": 0.4129, "rewards/accuracies": 1.0, "rewards/chosen": 1.2781442403793335, "rewards/margins": 0.6710165739059448, "rewards/rejected": 0.6071276664733887, "step": 832 }, { "epoch": 0.45, "learning_rate": 9.628674222622468e-08, "logits/chosen": -2.118741750717163, "logits/rejected": -2.1106975078582764, "logps/chosen": -16.590381622314453, "logps/rejected": -11.439547538757324, "loss": 0.5283, "rewards/accuracies": 1.0, "rewards/chosen": 1.0939435958862305, "rewards/margins": 0.36237478256225586, "rewards/rejected": 0.7315688133239746, "step": 833 }, { "epoch": 0.45, "learning_rate": 9.627572357036954e-08, "logits/chosen": -2.1726112365722656, "logits/rejected": -2.316812515258789, "logps/chosen": -4.596026420593262, "logps/rejected": -1.7309688329696655, "loss": 0.8007, "rewards/accuracies": 0.0, "rewards/chosen": 0.6354807019233704, "rewards/margins": -0.20467042922973633, "rewards/rejected": 0.8401511311531067, "step": 834 }, { "epoch": 0.45, "learning_rate": 9.626468922281234e-08, "logits/chosen": -1.9855916500091553, "logits/rejected": -1.975486397743225, "logps/chosen": -8.390623092651367, "logps/rejected": -4.3846025466918945, "loss": 0.4496, "rewards/accuracies": 1.0, "rewards/chosen": 1.2959320545196533, "rewards/margins": 0.5661202669143677, "rewards/rejected": 0.7298117876052856, "step": 835 }, { "epoch": 0.45, "learning_rate": 9.625363918729477e-08, "logits/chosen": -2.1066622734069824, "logits/rejected": -2.1053926944732666, "logps/chosen": -1.6502294540405273, "logps/rejected": -1.2848031520843506, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.8236175775527954, "rewards/margins": 0.012732446193695068, "rewards/rejected": 0.8108851313591003, "step": 836 }, { "epoch": 0.45, "learning_rate": 9.624257346756375e-08, "logits/chosen": -2.0642263889312744, "logits/rejected": -2.0561277866363525, "logps/chosen": -4.4016852378845215, "logps/rejected": -7.733988285064697, "loss": 0.5097, "rewards/accuracies": 1.0, "rewards/chosen": 0.8221813440322876, "rewards/margins": 0.4081856310367584, "rewards/rejected": 0.4139957129955292, "step": 837 }, { "epoch": 0.45, "learning_rate": 9.62314920673716e-08, "logits/chosen": -2.1283974647521973, "logits/rejected": -2.077580213546753, "logps/chosen": -32.899452209472656, "logps/rejected": -3.0130224227905273, "loss": 0.5477, "rewards/accuracies": 1.0, "rewards/chosen": 1.1116447448730469, "rewards/margins": 0.31577861309051514, "rewards/rejected": 0.7958661317825317, "step": 838 }, { "epoch": 0.45, "learning_rate": 9.622039499047594e-08, "logits/chosen": -2.004592180252075, "logits/rejected": -1.9927529096603394, "logps/chosen": -13.078922271728516, "logps/rejected": -3.112647294998169, "loss": 0.6527, "rewards/accuracies": 1.0, "rewards/chosen": 0.885497510433197, "rewards/margins": 0.08261805772781372, "rewards/rejected": 0.8028794527053833, "step": 839 }, { "epoch": 0.45, "learning_rate": 9.620928224063968e-08, "logits/chosen": -2.1577165126800537, "logits/rejected": -2.305410146713257, "logps/chosen": -1.5344699621200562, "logps/rejected": -1.6088805198669434, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281040668487549, "rewards/margins": 0.01867985725402832, "rewards/rejected": 1.0094242095947266, "step": 840 }, { "epoch": 0.45, "learning_rate": 9.619815382163106e-08, "logits/chosen": -2.032397508621216, "logits/rejected": -2.0277345180511475, "logps/chosen": -6.700547218322754, "logps/rejected": -2.3513875007629395, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 1.3008906841278076, "rewards/margins": 0.6868820190429688, "rewards/rejected": 0.6140086650848389, "step": 841 }, { "epoch": 0.45, "learning_rate": 9.618700973722362e-08, "logits/chosen": -2.0962040424346924, "logits/rejected": -2.1024105548858643, "logps/chosen": -2.3970131874084473, "logps/rejected": -2.1055452823638916, "loss": 0.5068, "rewards/accuracies": 1.0, "rewards/chosen": 1.0380834341049194, "rewards/margins": 0.41548532247543335, "rewards/rejected": 0.6225981116294861, "step": 842 }, { "epoch": 0.45, "learning_rate": 9.617584999119623e-08, "logits/chosen": -2.152005434036255, "logits/rejected": -2.1588857173919678, "logps/chosen": -2.897982120513916, "logps/rejected": -3.1863107681274414, "loss": 0.5132, "rewards/accuracies": 1.0, "rewards/chosen": 1.028985857963562, "rewards/margins": 0.39952200651168823, "rewards/rejected": 0.6294638514518738, "step": 843 }, { "epoch": 0.46, "learning_rate": 9.616467458733307e-08, "logits/chosen": -2.0884053707122803, "logits/rejected": -2.0962307453155518, "logps/chosen": -3.2481977939605713, "logps/rejected": -2.5934572219848633, "loss": 0.5484, "rewards/accuracies": 1.0, "rewards/chosen": 1.185839295387268, "rewards/margins": 0.31406402587890625, "rewards/rejected": 0.8717752695083618, "step": 844 }, { "epoch": 0.46, "learning_rate": 9.615348352942363e-08, "logits/chosen": -1.965111255645752, "logits/rejected": -1.9659106731414795, "logps/chosen": -4.584875583648682, "logps/rejected": -1.422669768333435, "loss": 0.5497, "rewards/accuracies": 1.0, "rewards/chosen": 1.1649061441421509, "rewards/margins": 0.3108838200569153, "rewards/rejected": 0.8540223240852356, "step": 845 }, { "epoch": 0.46, "learning_rate": 9.61422768212627e-08, "logits/chosen": -1.9999476671218872, "logits/rejected": -2.342942237854004, "logps/chosen": -1.6090232133865356, "logps/rejected": -1.299086332321167, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8108588457107544, "rewards/margins": 0.026936471462249756, "rewards/rejected": 0.7839223742485046, "step": 846 }, { "epoch": 0.46, "learning_rate": 9.613105446665036e-08, "logits/chosen": -2.0524940490722656, "logits/rejected": -2.2488625049591064, "logps/chosen": -3.325071334838867, "logps/rejected": -7.950028896331787, "loss": 0.772, "rewards/accuracies": 0.0, "rewards/chosen": 0.7822829484939575, "rewards/margins": -0.15188056230545044, "rewards/rejected": 0.934163510799408, "step": 847 }, { "epoch": 0.46, "learning_rate": 9.611981646939203e-08, "logits/chosen": -2.0690033435821533, "logits/rejected": -2.0526552200317383, "logps/chosen": -11.929584503173828, "logps/rejected": -3.1932215690612793, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.200553297996521, "rewards/margins": 0.5202569961547852, "rewards/rejected": 0.6802963018417358, "step": 848 }, { "epoch": 0.46, "learning_rate": 9.610856283329842e-08, "logits/chosen": -2.0660207271575928, "logits/rejected": -2.2040538787841797, "logps/chosen": -3.723766803741455, "logps/rejected": -3.528425693511963, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.5802832245826721, "rewards/margins": 0.013866424560546875, "rewards/rejected": 0.5664168000221252, "step": 849 }, { "epoch": 0.46, "learning_rate": 9.609729356218552e-08, "logits/chosen": -2.111104965209961, "logits/rejected": -2.1065003871917725, "logps/chosen": -7.721991539001465, "logps/rejected": -3.390333652496338, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 1.0151774883270264, "rewards/margins": 0.4036906957626343, "rewards/rejected": 0.6114867925643921, "step": 850 }, { "epoch": 0.46, "learning_rate": 9.608600865987468e-08, "logits/chosen": -1.9821255207061768, "logits/rejected": -2.3542637825012207, "logps/chosen": -9.100452423095703, "logps/rejected": -21.7970027923584, "loss": 0.5907, "rewards/accuracies": 1.0, "rewards/chosen": 0.737989068031311, "rewards/margins": 0.21667063236236572, "rewards/rejected": 0.5213184356689453, "step": 851 }, { "epoch": 0.46, "learning_rate": 9.607470813019249e-08, "logits/chosen": -2.0519869327545166, "logits/rejected": -2.0575191974639893, "logps/chosen": -2.629582405090332, "logps/rejected": -3.3555476665496826, "loss": 0.5446, "rewards/accuracies": 1.0, "rewards/chosen": 0.9526087641716003, "rewards/margins": 0.32318180799484253, "rewards/rejected": 0.6294269561767578, "step": 852 }, { "epoch": 0.46, "learning_rate": 9.606339197697089e-08, "logits/chosen": -2.023793935775757, "logits/rejected": -2.2330236434936523, "logps/chosen": -0.9694553017616272, "logps/rejected": -0.9888969659805298, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9654717445373535, "rewards/margins": 0.0009649991989135742, "rewards/rejected": 0.9645067453384399, "step": 853 }, { "epoch": 0.46, "learning_rate": 9.605206020404703e-08, "logits/chosen": -2.0186567306518555, "logits/rejected": -2.017068386077881, "logps/chosen": -1.1544787883758545, "logps/rejected": -1.2677648067474365, "loss": 0.6352, "rewards/accuracies": 1.0, "rewards/chosen": 0.9578043818473816, "rewards/margins": 0.11935508251190186, "rewards/rejected": 0.8384492993354797, "step": 854 }, { "epoch": 0.46, "learning_rate": 9.604071281526347e-08, "logits/chosen": -2.032939910888672, "logits/rejected": -2.0268495082855225, "logps/chosen": -4.287526607513428, "logps/rejected": -4.005575656890869, "loss": 0.4338, "rewards/accuracies": 1.0, "rewards/chosen": 1.194336175918579, "rewards/margins": 0.6103737950325012, "rewards/rejected": 0.5839623808860779, "step": 855 }, { "epoch": 0.46, "learning_rate": 9.602934981446803e-08, "logits/chosen": -2.1054351329803467, "logits/rejected": -2.0910024642944336, "logps/chosen": -18.4743595123291, "logps/rejected": -4.747381210327148, "loss": 0.4458, "rewards/accuracies": 1.0, "rewards/chosen": 1.0743166208267212, "rewards/margins": 0.5767464637756348, "rewards/rejected": 0.49757012724876404, "step": 856 }, { "epoch": 0.46, "learning_rate": 9.601797120551373e-08, "logits/chosen": -1.981574535369873, "logits/rejected": -2.2418925762176514, "logps/chosen": -3.8216195106506348, "logps/rejected": -6.35912561416626, "loss": 0.6484, "rewards/accuracies": 1.0, "rewards/chosen": 0.8908531069755554, "rewards/margins": 0.09166407585144043, "rewards/rejected": 0.799189031124115, "step": 857 }, { "epoch": 0.46, "learning_rate": 9.600657699225903e-08, "logits/chosen": -2.0994911193847656, "logits/rejected": -2.2652084827423096, "logps/chosen": -1.2241575717926025, "logps/rejected": -1.2027336359024048, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.817985475063324, "rewards/margins": -0.007880747318267822, "rewards/rejected": 0.8258662223815918, "step": 858 }, { "epoch": 0.46, "learning_rate": 9.599516717856757e-08, "logits/chosen": -2.0295071601867676, "logits/rejected": -2.2898943424224854, "logps/chosen": -0.8911576271057129, "logps/rejected": -0.8951948881149292, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8056259155273438, "rewards/margins": -0.00739443302154541, "rewards/rejected": 0.8130203485488892, "step": 859 }, { "epoch": 0.46, "learning_rate": 9.598374176830835e-08, "logits/chosen": -2.065929412841797, "logits/rejected": -2.2312943935394287, "logps/chosen": -0.6065900325775146, "logps/rejected": -0.6389264464378357, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.8278384208679199, "rewards/margins": 0.03445899486541748, "rewards/rejected": 0.7933794260025024, "step": 860 }, { "epoch": 0.46, "learning_rate": 9.597230076535561e-08, "logits/chosen": -2.137357711791992, "logits/rejected": -2.143369436264038, "logps/chosen": -3.744251012802124, "logps/rejected": -2.979193687438965, "loss": 0.5876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464228749275208, "rewards/margins": 0.22355711460113525, "rewards/rejected": 0.7228657603263855, "step": 861 }, { "epoch": 0.46, "learning_rate": 9.596084417358889e-08, "logits/chosen": -2.0008111000061035, "logits/rejected": -2.260791540145874, "logps/chosen": -11.804057121276855, "logps/rejected": -13.637153625488281, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9478486180305481, "rewards/margins": 0.013451695442199707, "rewards/rejected": 0.9343969225883484, "step": 862 }, { "epoch": 0.47, "learning_rate": 9.594937199689304e-08, "logits/chosen": -2.1420176029205322, "logits/rejected": -2.1573143005371094, "logps/chosen": -15.529729843139648, "logps/rejected": -9.439155578613281, "loss": 0.4563, "rewards/accuracies": 1.0, "rewards/chosen": 1.4213968515396118, "rewards/margins": 0.5477092862129211, "rewards/rejected": 0.8736875653266907, "step": 863 }, { "epoch": 0.47, "learning_rate": 9.593788423915816e-08, "logits/chosen": -2.108527898788452, "logits/rejected": -2.2733426094055176, "logps/chosen": -1.5930391550064087, "logps/rejected": -1.8005521297454834, "loss": 0.6417, "rewards/accuracies": 1.0, "rewards/chosen": 0.910840630531311, "rewards/margins": 0.1057087779045105, "rewards/rejected": 0.8051318526268005, "step": 864 }, { "epoch": 0.47, "learning_rate": 9.592638090427967e-08, "logits/chosen": -2.123185634613037, "logits/rejected": -2.1163299083709717, "logps/chosen": -4.761325359344482, "logps/rejected": -3.074512243270874, "loss": 0.4724, "rewards/accuracies": 1.0, "rewards/chosen": 1.0121132135391235, "rewards/margins": 0.5044007301330566, "rewards/rejected": 0.5077124834060669, "step": 865 }, { "epoch": 0.47, "learning_rate": 9.591486199615824e-08, "logits/chosen": -2.063443183898926, "logits/rejected": -2.2438840866088867, "logps/chosen": -7.348162651062012, "logps/rejected": -10.329188346862793, "loss": 0.6619, "rewards/accuracies": 1.0, "rewards/chosen": 0.7416917085647583, "rewards/margins": 0.06355303525924683, "rewards/rejected": 0.6781386733055115, "step": 866 }, { "epoch": 0.47, "learning_rate": 9.590332751869983e-08, "logits/chosen": -2.059386968612671, "logits/rejected": -2.060527801513672, "logps/chosen": -1.1226885318756104, "logps/rejected": -2.9964191913604736, "loss": 0.5417, "rewards/accuracies": 1.0, "rewards/chosen": 0.9159688353538513, "rewards/margins": 0.3299598693847656, "rewards/rejected": 0.5860089659690857, "step": 867 }, { "epoch": 0.47, "learning_rate": 9.58917774758157e-08, "logits/chosen": -1.9559911489486694, "logits/rejected": -1.9521455764770508, "logps/chosen": -8.690690994262695, "logps/rejected": -2.4409639835357666, "loss": 0.3982, "rewards/accuracies": 1.0, "rewards/chosen": 1.342219352722168, "rewards/margins": 0.7149973511695862, "rewards/rejected": 0.6272220015525818, "step": 868 }, { "epoch": 0.47, "learning_rate": 9.588021187142234e-08, "logits/chosen": -2.0165910720825195, "logits/rejected": -2.2507150173187256, "logps/chosen": -1.157707691192627, "logps/rejected": -1.1457055807113647, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8718852996826172, "rewards/margins": 0.005243182182312012, "rewards/rejected": 0.8666421175003052, "step": 869 }, { "epoch": 0.47, "learning_rate": 9.586863070944158e-08, "logits/chosen": -2.158409833908081, "logits/rejected": -2.349123954772949, "logps/chosen": -1.087193250656128, "logps/rejected": -1.1302504539489746, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.023018717765808, "rewards/margins": 0.01704871654510498, "rewards/rejected": 1.0059700012207031, "step": 870 }, { "epoch": 0.47, "learning_rate": 9.585703399380047e-08, "logits/chosen": -2.118176221847534, "logits/rejected": -2.3626749515533447, "logps/chosen": -14.997312545776367, "logps/rejected": -19.972829818725586, "loss": 0.6478, "rewards/accuracies": 1.0, "rewards/chosen": 0.45761242508888245, "rewards/margins": 0.09280586242675781, "rewards/rejected": 0.36480656266212463, "step": 871 }, { "epoch": 0.47, "learning_rate": 9.584542172843137e-08, "logits/chosen": -2.020230531692505, "logits/rejected": -2.0244102478027344, "logps/chosen": -1.0078375339508057, "logps/rejected": -2.7191250324249268, "loss": 0.5784, "rewards/accuracies": 1.0, "rewards/chosen": 0.7826453447341919, "rewards/margins": 0.244351327419281, "rewards/rejected": 0.5382940173149109, "step": 872 }, { "epoch": 0.47, "learning_rate": 9.583379391727188e-08, "logits/chosen": -2.1663599014282227, "logits/rejected": -2.295513153076172, "logps/chosen": -5.223212242126465, "logps/rejected": -7.9159746170043945, "loss": 0.6252, "rewards/accuracies": 1.0, "rewards/chosen": 0.8723799586296082, "rewards/margins": 0.14077907800674438, "rewards/rejected": 0.7316008806228638, "step": 873 }, { "epoch": 0.47, "learning_rate": 9.582215056426493e-08, "logits/chosen": -2.063100576400757, "logits/rejected": -2.210270404815674, "logps/chosen": -2.2796149253845215, "logps/rejected": -2.7977664470672607, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8307186365127563, "rewards/margins": 0.02235966920852661, "rewards/rejected": 0.8083589673042297, "step": 874 }, { "epoch": 0.47, "learning_rate": 9.581049167335865e-08, "logits/chosen": -2.187232494354248, "logits/rejected": -2.218095541000366, "logps/chosen": -6.909623146057129, "logps/rejected": -27.755525588989258, "loss": 0.6292, "rewards/accuracies": 1.0, "rewards/chosen": 0.9485594630241394, "rewards/margins": 0.13224303722381592, "rewards/rejected": 0.8163164258003235, "step": 875 }, { "epoch": 0.47, "learning_rate": 9.579881724850648e-08, "logits/chosen": -2.112229108810425, "logits/rejected": -2.107140302658081, "logps/chosen": -9.146270751953125, "logps/rejected": -3.660209894180298, "loss": 0.4399, "rewards/accuracies": 1.0, "rewards/chosen": 1.1765613555908203, "rewards/margins": 0.5930827260017395, "rewards/rejected": 0.5834786295890808, "step": 876 }, { "epoch": 0.47, "learning_rate": 9.578712729366711e-08, "logits/chosen": -2.155303716659546, "logits/rejected": -2.302415132522583, "logps/chosen": -4.828212738037109, "logps/rejected": -4.759274482727051, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.7195891737937927, "rewards/margins": -0.005790591239929199, "rewards/rejected": 0.7253797650337219, "step": 877 }, { "epoch": 0.47, "learning_rate": 9.577542181280452e-08, "logits/chosen": -2.147969961166382, "logits/rejected": -2.149028778076172, "logps/chosen": -3.328066349029541, "logps/rejected": -12.070717811584473, "loss": 0.617, "rewards/accuracies": 1.0, "rewards/chosen": 0.8969699740409851, "rewards/margins": 0.15851616859436035, "rewards/rejected": 0.7384538054466248, "step": 878 }, { "epoch": 0.47, "learning_rate": 9.576370080988793e-08, "logits/chosen": -2.119009256362915, "logits/rejected": -2.292543888092041, "logps/chosen": -0.9070196747779846, "logps/rejected": -0.9549552202224731, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.8236203193664551, "rewards/margins": 0.029672980308532715, "rewards/rejected": 0.7939473390579224, "step": 879 }, { "epoch": 0.47, "learning_rate": 9.575196428889183e-08, "logits/chosen": -2.003849983215332, "logits/rejected": -2.0002987384796143, "logps/chosen": -7.996644973754883, "logps/rejected": -13.536956787109375, "loss": 0.5718, "rewards/accuracies": 1.0, "rewards/chosen": 1.1028568744659424, "rewards/margins": 0.2594435214996338, "rewards/rejected": 0.8434133529663086, "step": 880 }, { "epoch": 0.48, "learning_rate": 9.574021225379596e-08, "logits/chosen": -1.9783141613006592, "logits/rejected": -1.9845834970474243, "logps/chosen": -2.5834712982177734, "logps/rejected": -2.8808507919311523, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110116362571716, "rewards/margins": 0.3685348629951477, "rewards/rejected": 0.5424767732620239, "step": 881 }, { "epoch": 0.48, "learning_rate": 9.572844470858536e-08, "logits/chosen": -2.1320760250091553, "logits/rejected": -2.003864049911499, "logps/chosen": -38.24116516113281, "logps/rejected": -4.198277473449707, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 1.034462332725525, "rewards/margins": 0.5297955274581909, "rewards/rejected": 0.504666805267334, "step": 882 }, { "epoch": 0.48, "learning_rate": 9.57166616572503e-08, "logits/chosen": -2.0747063159942627, "logits/rejected": -2.016382932662964, "logps/chosen": -26.89242935180664, "logps/rejected": -2.731830596923828, "loss": 0.566, "rewards/accuracies": 1.0, "rewards/chosen": 1.0224064588546753, "rewards/margins": 0.27279990911483765, "rewards/rejected": 0.7496065497398376, "step": 883 }, { "epoch": 0.48, "learning_rate": 9.570486310378629e-08, "logits/chosen": -2.0769729614257812, "logits/rejected": -2.271113395690918, "logps/chosen": -0.9129024147987366, "logps/rejected": -1.005799412727356, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.6773948073387146, "rewards/margins": 0.01688295602798462, "rewards/rejected": 0.66051185131073, "step": 884 }, { "epoch": 0.48, "learning_rate": 9.569304905219413e-08, "logits/chosen": -1.9842125177383423, "logits/rejected": -2.2480719089508057, "logps/chosen": -3.7257673740386963, "logps/rejected": -3.842113494873047, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 1.0992193222045898, "rewards/margins": 0.030272841453552246, "rewards/rejected": 1.0689464807510376, "step": 885 }, { "epoch": 0.48, "learning_rate": 9.568121950647987e-08, "logits/chosen": -2.1143922805786133, "logits/rejected": -2.103891611099243, "logps/chosen": -5.355874061584473, "logps/rejected": -7.18923282623291, "loss": 0.5626, "rewards/accuracies": 1.0, "rewards/chosen": 1.003995656967163, "rewards/margins": 0.2807028293609619, "rewards/rejected": 0.7232928276062012, "step": 886 }, { "epoch": 0.48, "learning_rate": 9.566937447065481e-08, "logits/chosen": -2.119478464126587, "logits/rejected": -2.1199140548706055, "logps/chosen": -2.778611421585083, "logps/rejected": -2.7238779067993164, "loss": 0.6173, "rewards/accuracies": 1.0, "rewards/chosen": 0.9319669008255005, "rewards/margins": 0.157958984375, "rewards/rejected": 0.7740079164505005, "step": 887 }, { "epoch": 0.48, "learning_rate": 9.565751394873551e-08, "logits/chosen": -1.9974185228347778, "logits/rejected": -2.009828805923462, "logps/chosen": -3.9555130004882812, "logps/rejected": -7.793799877166748, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.057023286819458, "rewards/margins": 0.4818721413612366, "rewards/rejected": 0.5751511454582214, "step": 888 }, { "epoch": 0.48, "learning_rate": 9.564563794474374e-08, "logits/chosen": -2.1692657470703125, "logits/rejected": -2.1705408096313477, "logps/chosen": -2.037961959838867, "logps/rejected": -1.3739099502563477, "loss": 0.6201, "rewards/accuracies": 1.0, "rewards/chosen": 1.168028473854065, "rewards/margins": 0.15189099311828613, "rewards/rejected": 1.0161374807357788, "step": 889 }, { "epoch": 0.48, "learning_rate": 9.563374646270657e-08, "logits/chosen": -2.079162120819092, "logits/rejected": -2.0671072006225586, "logps/chosen": -8.245336532592773, "logps/rejected": -2.520353078842163, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 0.9967283606529236, "rewards/margins": 0.36847710609436035, "rewards/rejected": 0.6282512545585632, "step": 890 }, { "epoch": 0.48, "learning_rate": 9.56218395066563e-08, "logits/chosen": -2.177875518798828, "logits/rejected": -2.131155490875244, "logps/chosen": -20.65157127380371, "logps/rejected": -3.4489657878875732, "loss": 0.4174, "rewards/accuracies": 1.0, "rewards/chosen": 1.262669563293457, "rewards/margins": 0.6577604413032532, "rewards/rejected": 0.6049091219902039, "step": 891 }, { "epoch": 0.48, "learning_rate": 9.560991708063047e-08, "logits/chosen": -2.0756285190582275, "logits/rejected": -2.267646074295044, "logps/chosen": -1.6314404010772705, "logps/rejected": -5.8590240478515625, "loss": 0.5909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7833582162857056, "rewards/margins": 0.216072678565979, "rewards/rejected": 0.5672855377197266, "step": 892 }, { "epoch": 0.48, "learning_rate": 9.559797918867187e-08, "logits/chosen": -2.110952615737915, "logits/rejected": -2.1176254749298096, "logps/chosen": -2.123415470123291, "logps/rejected": -3.3782734870910645, "loss": 0.5496, "rewards/accuracies": 1.0, "rewards/chosen": 0.9931129813194275, "rewards/margins": 0.3111129403114319, "rewards/rejected": 0.6820000410079956, "step": 893 }, { "epoch": 0.48, "learning_rate": 9.558602583482856e-08, "logits/chosen": -1.931272268295288, "logits/rejected": -1.904443383216858, "logps/chosen": -12.34697151184082, "logps/rejected": -3.0450599193573, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": 0.7687931060791016, "rewards/margins": 0.1545853614807129, "rewards/rejected": 0.6142077445983887, "step": 894 }, { "epoch": 0.48, "learning_rate": 9.557405702315379e-08, "logits/chosen": -2.1389360427856445, "logits/rejected": -2.208477735519409, "logps/chosen": -0.9767094254493713, "logps/rejected": -2.5818212032318115, "loss": 0.6396, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823887705802917, "rewards/margins": 0.11016088724136353, "rewards/rejected": 0.7722278833389282, "step": 895 }, { "epoch": 0.48, "learning_rate": 9.556207275770611e-08, "logits/chosen": -2.02841854095459, "logits/rejected": -2.025710344314575, "logps/chosen": -8.007657051086426, "logps/rejected": -3.2713518142700195, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.3907843828201294, "rewards/margins": 0.8601972460746765, "rewards/rejected": 0.5305871367454529, "step": 896 }, { "epoch": 0.48, "learning_rate": 9.555007304254928e-08, "logits/chosen": -1.959173321723938, "logits/rejected": -2.2071142196655273, "logps/chosen": -1.2096188068389893, "logps/rejected": -3.5644333362579346, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 0.9522073864936829, "rewards/margins": 0.2837584614753723, "rewards/rejected": 0.6684489250183105, "step": 897 }, { "epoch": 0.48, "learning_rate": 9.553805788175227e-08, "logits/chosen": -2.1735568046569824, "logits/rejected": -2.179203510284424, "logps/chosen": -2.331693172454834, "logps/rejected": -1.3011854887008667, "loss": 0.5358, "rewards/accuracies": 1.0, "rewards/chosen": 0.9734857678413391, "rewards/margins": 0.3441465497016907, "rewards/rejected": 0.6293392181396484, "step": 898 }, { "epoch": 0.48, "learning_rate": 9.552602727938934e-08, "logits/chosen": -2.1054422855377197, "logits/rejected": -2.1053261756896973, "logps/chosen": -4.379853248596191, "logps/rejected": -1.6012659072875977, "loss": 0.6059, "rewards/accuracies": 1.0, "rewards/chosen": 1.164962649345398, "rewards/margins": 0.18285918235778809, "rewards/rejected": 0.9821034669876099, "step": 899 }, { "epoch": 0.49, "learning_rate": 9.551398123953995e-08, "logits/chosen": -1.9938222169876099, "logits/rejected": -2.2868645191192627, "logps/chosen": -0.8923888802528381, "logps/rejected": -1.0449697971343994, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8302286267280579, "rewards/margins": 0.0313715934753418, "rewards/rejected": 0.7988570332527161, "step": 900 }, { "epoch": 0.49, "learning_rate": 9.550191976628884e-08, "logits/chosen": -2.12939453125, "logits/rejected": -2.1287920475006104, "logps/chosen": -1.224503517150879, "logps/rejected": -1.967633843421936, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.8799957633018494, "rewards/margins": -0.02521437406539917, "rewards/rejected": 0.9052101373672485, "step": 901 }, { "epoch": 0.49, "learning_rate": 9.548984286372593e-08, "logits/chosen": -2.040508985519409, "logits/rejected": -2.24017333984375, "logps/chosen": -1.4964699745178223, "logps/rejected": -1.4767768383026123, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.7456045746803284, "rewards/margins": 0.010714948177337646, "rewards/rejected": 0.7348896265029907, "step": 902 }, { "epoch": 0.49, "learning_rate": 9.547775053594638e-08, "logits/chosen": -2.0620944499969482, "logits/rejected": -2.058242082595825, "logps/chosen": -0.47550150752067566, "logps/rejected": -10.56717300415039, "loss": 0.565, "rewards/accuracies": 1.0, "rewards/chosen": 0.8437572717666626, "rewards/margins": 0.27509576082229614, "rewards/rejected": 0.5686615109443665, "step": 903 }, { "epoch": 0.49, "learning_rate": 9.546564278705064e-08, "logits/chosen": -1.996509075164795, "logits/rejected": -1.9788357019424438, "logps/chosen": -37.50908660888672, "logps/rejected": -7.659600734710693, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9392772912979126, "rewards/margins": 0.48832833766937256, "rewards/rejected": 0.45094895362854004, "step": 904 }, { "epoch": 0.49, "learning_rate": 9.54535196211443e-08, "logits/chosen": -2.118807554244995, "logits/rejected": -2.122764825820923, "logps/chosen": -3.800652503967285, "logps/rejected": -12.096445083618164, "loss": 0.3498, "rewards/accuracies": 1.0, "rewards/chosen": 1.4035613536834717, "rewards/margins": 0.8705475926399231, "rewards/rejected": 0.5330137610435486, "step": 905 }, { "epoch": 0.49, "learning_rate": 9.544138104233823e-08, "logits/chosen": -2.038416862487793, "logits/rejected": -2.2352612018585205, "logps/chosen": -0.8224509358406067, "logps/rejected": -0.8640026450157166, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.7405836582183838, "rewards/margins": 0.009183943271636963, "rewards/rejected": 0.7313997149467468, "step": 906 }, { "epoch": 0.49, "learning_rate": 9.542922705474853e-08, "logits/chosen": -2.064307451248169, "logits/rejected": -2.0661909580230713, "logps/chosen": -0.9663909077644348, "logps/rejected": -1.52740478515625, "loss": 0.5926, "rewards/accuracies": 1.0, "rewards/chosen": 1.022684931755066, "rewards/margins": 0.21228569746017456, "rewards/rejected": 0.8103992342948914, "step": 907 }, { "epoch": 0.49, "learning_rate": 9.541705766249653e-08, "logits/chosen": -1.960240364074707, "logits/rejected": -1.9602410793304443, "logps/chosen": -2.1082684993743896, "logps/rejected": -1.1047837734222412, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.8254759907722473, "rewards/margins": -0.013303875923156738, "rewards/rejected": 0.838779866695404, "step": 908 }, { "epoch": 0.49, "learning_rate": 9.540487286970873e-08, "logits/chosen": -2.05825138092041, "logits/rejected": -2.2732226848602295, "logps/chosen": -13.767621994018555, "logps/rejected": -15.398187637329102, "loss": 0.5998, "rewards/accuracies": 1.0, "rewards/chosen": 0.9232189059257507, "rewards/margins": 0.19635391235351562, "rewards/rejected": 0.7268649935722351, "step": 909 }, { "epoch": 0.49, "learning_rate": 9.539267268051691e-08, "logits/chosen": -2.056124448776245, "logits/rejected": -2.290581226348877, "logps/chosen": -5.3238139152526855, "logps/rejected": -1.472605586051941, "loss": 0.9048, "rewards/accuracies": 0.0, "rewards/chosen": 0.6562513113021851, "rewards/margins": -0.3863006830215454, "rewards/rejected": 1.0425519943237305, "step": 910 }, { "epoch": 0.49, "learning_rate": 9.538045709905806e-08, "logits/chosen": -2.0237390995025635, "logits/rejected": -2.018512487411499, "logps/chosen": -3.0334115028381348, "logps/rejected": -8.458649635314941, "loss": 0.4714, "rewards/accuracies": 1.0, "rewards/chosen": 1.004336953163147, "rewards/margins": 0.507207989692688, "rewards/rejected": 0.497128963470459, "step": 911 }, { "epoch": 0.49, "learning_rate": 9.536822612947434e-08, "logits/chosen": -2.0311381816864014, "logits/rejected": -2.2472445964813232, "logps/chosen": -0.8379088640213013, "logps/rejected": -0.8100053668022156, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.828129231929779, "rewards/margins": 0.011275112628936768, "rewards/rejected": 0.8168541193008423, "step": 912 }, { "epoch": 0.49, "learning_rate": 9.53559797759132e-08, "logits/chosen": -2.090744733810425, "logits/rejected": -2.100227117538452, "logps/chosen": -2.117182731628418, "logps/rejected": -1.9304420948028564, "loss": 0.5348, "rewards/accuracies": 1.0, "rewards/chosen": 1.1308172941207886, "rewards/margins": 0.3465988039970398, "rewards/rejected": 0.7842184901237488, "step": 913 }, { "epoch": 0.49, "learning_rate": 9.534371804252727e-08, "logits/chosen": -2.2019824981689453, "logits/rejected": -2.0166118144989014, "logps/chosen": -51.04412841796875, "logps/rejected": -3.1025495529174805, "loss": 0.3711, "rewards/accuracies": 1.0, "rewards/chosen": 1.4142128229141235, "rewards/margins": 0.8000914454460144, "rewards/rejected": 0.6141213774681091, "step": 914 }, { "epoch": 0.49, "learning_rate": 9.533144093347438e-08, "logits/chosen": -2.066649913787842, "logits/rejected": -2.0603346824645996, "logps/chosen": -15.519072532653809, "logps/rejected": -6.314748287200928, "loss": 0.5899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8613282442092896, "rewards/margins": 0.2183130979537964, "rewards/rejected": 0.6430151462554932, "step": 915 }, { "epoch": 0.49, "learning_rate": 9.531914845291762e-08, "logits/chosen": -2.038907766342163, "logits/rejected": -2.284411668777466, "logps/chosen": -2.28351092338562, "logps/rejected": -2.2422451972961426, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7570095062255859, "rewards/margins": -1.5437602996826172e-05, "rewards/rejected": 0.7570249438285828, "step": 916 }, { "epoch": 0.49, "learning_rate": 9.530684060502525e-08, "logits/chosen": -2.046557664871216, "logits/rejected": -2.360116481781006, "logps/chosen": -7.359820365905762, "logps/rejected": -23.101011276245117, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 0.6239760518074036, "rewards/margins": 0.2861815392971039, "rewards/rejected": 0.3377945125102997, "step": 917 }, { "epoch": 0.5, "learning_rate": 9.529451739397076e-08, "logits/chosen": -2.048726797103882, "logits/rejected": -2.0487663745880127, "logps/chosen": -2.4862515926361084, "logps/rejected": -2.5894322395324707, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8820304870605469, "rewards/margins": 0.022367477416992188, "rewards/rejected": 0.8596630096435547, "step": 918 }, { "epoch": 0.5, "learning_rate": 9.528217882393285e-08, "logits/chosen": -2.088153123855591, "logits/rejected": -2.0855398178100586, "logps/chosen": -4.766857147216797, "logps/rejected": -4.362890243530273, "loss": 0.356, "rewards/accuracies": 1.0, "rewards/chosen": 1.4154396057128906, "rewards/margins": 0.8496017456054688, "rewards/rejected": 0.5658378601074219, "step": 919 }, { "epoch": 0.5, "learning_rate": 9.52698248990954e-08, "logits/chosen": -1.9997891187667847, "logits/rejected": -2.0020761489868164, "logps/chosen": -1.8195561170578003, "logps/rejected": -2.8734073638916016, "loss": 0.5467, "rewards/accuracies": 1.0, "rewards/chosen": 0.8973425030708313, "rewards/margins": 0.31796932220458984, "rewards/rejected": 0.5793731808662415, "step": 920 }, { "epoch": 0.5, "learning_rate": 9.525745562364756e-08, "logits/chosen": -1.9974002838134766, "logits/rejected": -2.244936466217041, "logps/chosen": -1.0778887271881104, "logps/rejected": -5.807923316955566, "loss": 0.6189, "rewards/accuracies": 1.0, "rewards/chosen": 0.7594962120056152, "rewards/margins": 0.15444475412368774, "rewards/rejected": 0.6050514578819275, "step": 921 }, { "epoch": 0.5, "learning_rate": 9.524507100178361e-08, "logits/chosen": -2.0576610565185547, "logits/rejected": -2.236067295074463, "logps/chosen": -1.2241114377975464, "logps/rejected": -2.3586983680725098, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.841645359992981, "rewards/margins": -0.004735291004180908, "rewards/rejected": 0.8463806509971619, "step": 922 }, { "epoch": 0.5, "learning_rate": 9.523267103770307e-08, "logits/chosen": -2.0335967540740967, "logits/rejected": -2.2325215339660645, "logps/chosen": -0.9594781994819641, "logps/rejected": -0.9170852899551392, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.8401174545288086, "rewards/margins": -0.004849135875701904, "rewards/rejected": 0.8449665904045105, "step": 923 }, { "epoch": 0.5, "learning_rate": 9.52202557356107e-08, "logits/chosen": -2.080021619796753, "logits/rejected": -2.078277349472046, "logps/chosen": -1.7114678621292114, "logps/rejected": -2.7766194343566895, "loss": 0.5138, "rewards/accuracies": 1.0, "rewards/chosen": 1.0645121335983276, "rewards/margins": 0.3980942964553833, "rewards/rejected": 0.6664178371429443, "step": 924 }, { "epoch": 0.5, "learning_rate": 9.520782509971637e-08, "logits/chosen": -2.061627149581909, "logits/rejected": -2.235548496246338, "logps/chosen": -0.7484545707702637, "logps/rejected": -0.7658794522285461, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.8270317316055298, "rewards/margins": 0.011410176753997803, "rewards/rejected": 0.815621554851532, "step": 925 }, { "epoch": 0.5, "learning_rate": 9.519537913423524e-08, "logits/chosen": -2.1214208602905273, "logits/rejected": -2.1189069747924805, "logps/chosen": -3.610971689224243, "logps/rejected": -2.4213199615478516, "loss": 0.5042, "rewards/accuracies": 1.0, "rewards/chosen": 1.1283191442489624, "rewards/margins": 0.4222058057785034, "rewards/rejected": 0.706113338470459, "step": 926 }, { "epoch": 0.5, "learning_rate": 9.518291784338764e-08, "logits/chosen": -2.09417986869812, "logits/rejected": -2.0963432788848877, "logps/chosen": -3.0661399364471436, "logps/rejected": -2.701741933822632, "loss": 0.4852, "rewards/accuracies": 1.0, "rewards/chosen": 1.068141222000122, "rewards/margins": 0.4708194136619568, "rewards/rejected": 0.5973218083381653, "step": 927 }, { "epoch": 0.5, "learning_rate": 9.517044123139903e-08, "logits/chosen": -2.127009868621826, "logits/rejected": -2.234332323074341, "logps/chosen": -3.2103002071380615, "logps/rejected": -3.3394999504089355, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.6033541560173035, "rewards/margins": 0.026838600635528564, "rewards/rejected": 0.5765155553817749, "step": 928 }, { "epoch": 0.5, "learning_rate": 9.515794930250019e-08, "logits/chosen": -2.0955564975738525, "logits/rejected": -2.08488392829895, "logps/chosen": -4.054677486419678, "logps/rejected": -3.425647735595703, "loss": 0.5341, "rewards/accuracies": 1.0, "rewards/chosen": 0.9463810324668884, "rewards/margins": 0.34826111793518066, "rewards/rejected": 0.5981199145317078, "step": 929 }, { "epoch": 0.5, "learning_rate": 9.514544206092697e-08, "logits/chosen": -2.09114933013916, "logits/rejected": -2.3190057277679443, "logps/chosen": -2.822260618209839, "logps/rejected": -1.858957290649414, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.9457217454910278, "rewards/margins": 0.04574984312057495, "rewards/rejected": 0.8999719023704529, "step": 930 }, { "epoch": 0.5, "learning_rate": 9.513291951092051e-08, "logits/chosen": -2.1356399059295654, "logits/rejected": -2.1042368412017822, "logps/chosen": -18.837364196777344, "logps/rejected": -6.596668243408203, "loss": 0.5103, "rewards/accuracies": 1.0, "rewards/chosen": 1.2106090784072876, "rewards/margins": 0.40681809186935425, "rewards/rejected": 0.8037909865379333, "step": 931 }, { "epoch": 0.5, "learning_rate": 9.512038165672708e-08, "logits/chosen": -2.082608699798584, "logits/rejected": -2.0914855003356934, "logps/chosen": -3.436539649963379, "logps/rejected": -2.6511757373809814, "loss": 0.5087, "rewards/accuracies": 1.0, "rewards/chosen": 0.9406377077102661, "rewards/margins": 0.4106826186180115, "rewards/rejected": 0.5299550890922546, "step": 932 }, { "epoch": 0.5, "learning_rate": 9.510782850259815e-08, "logits/chosen": -2.055424451828003, "logits/rejected": -2.077716112136841, "logps/chosen": -31.865291595458984, "logps/rejected": -9.025416374206543, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 0.9735256433486938, "rewards/margins": 0.15876835584640503, "rewards/rejected": 0.8147572875022888, "step": 933 }, { "epoch": 0.5, "learning_rate": 9.509526005279042e-08, "logits/chosen": -2.0639188289642334, "logits/rejected": -2.063295364379883, "logps/chosen": -4.9116315841674805, "logps/rejected": -9.823860168457031, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 0.9098907709121704, "rewards/margins": 0.43393784761428833, "rewards/rejected": 0.4759529232978821, "step": 934 }, { "epoch": 0.5, "learning_rate": 9.508267631156572e-08, "logits/chosen": -2.076200485229492, "logits/rejected": -2.255431890487671, "logps/chosen": -3.7171523571014404, "logps/rejected": -2.264249801635742, "loss": 0.722, "rewards/accuracies": 0.0, "rewards/chosen": 0.8984810709953308, "rewards/margins": -0.05688297748565674, "rewards/rejected": 0.9553640484809875, "step": 935 }, { "epoch": 0.5, "learning_rate": 9.507007728319108e-08, "logits/chosen": -2.071399450302124, "logits/rejected": -2.057264804840088, "logps/chosen": -7.12483024597168, "logps/rejected": -4.9666619300842285, "loss": 0.3971, "rewards/accuracies": 1.0, "rewards/chosen": 1.2165426015853882, "rewards/margins": 0.7185850143432617, "rewards/rejected": 0.4979575574398041, "step": 936 }, { "epoch": 0.51, "learning_rate": 9.505746297193875e-08, "logits/chosen": -2.0130035877227783, "logits/rejected": -1.9583905935287476, "logps/chosen": -37.66706466674805, "logps/rejected": -2.654193639755249, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.4105281829833984, "rewards/margins": 0.796699047088623, "rewards/rejected": 0.6138291358947754, "step": 937 }, { "epoch": 0.51, "learning_rate": 9.504483338208611e-08, "logits/chosen": -2.0925111770629883, "logits/rejected": -2.232327699661255, "logps/chosen": -4.691596031188965, "logps/rejected": -4.406190872192383, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.8834282755851746, "rewards/margins": -0.016758739948272705, "rewards/rejected": 0.9001870155334473, "step": 938 }, { "epoch": 0.51, "learning_rate": 9.503218851791576e-08, "logits/chosen": -1.9440566301345825, "logits/rejected": -2.291532516479492, "logps/chosen": -1.7187448740005493, "logps/rejected": -1.7079899311065674, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.8500847220420837, "rewards/margins": 0.01151585578918457, "rewards/rejected": 0.8385688662528992, "step": 939 }, { "epoch": 0.51, "learning_rate": 9.501952838371546e-08, "logits/chosen": -1.9725959300994873, "logits/rejected": -1.9756464958190918, "logps/chosen": -7.610124588012695, "logps/rejected": -1.4605109691619873, "loss": 0.2789, "rewards/accuracies": 1.0, "rewards/chosen": 1.6660387516021729, "rewards/margins": 1.134242296218872, "rewards/rejected": 0.5317964553833008, "step": 940 }, { "epoch": 0.51, "learning_rate": 9.500685298377815e-08, "logits/chosen": -2.0022568702697754, "logits/rejected": -2.010499954223633, "logps/chosen": -2.7362096309661865, "logps/rejected": -2.4034409523010254, "loss": 0.4348, "rewards/accuracies": 1.0, "rewards/chosen": 1.3394005298614502, "rewards/margins": 0.6076993942260742, "rewards/rejected": 0.731701135635376, "step": 941 }, { "epoch": 0.51, "learning_rate": 9.499416232240197e-08, "logits/chosen": -1.9298964738845825, "logits/rejected": -2.1996049880981445, "logps/chosen": -1.1729021072387695, "logps/rejected": -1.3309364318847656, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8527101874351501, "rewards/margins": -0.0011538267135620117, "rewards/rejected": 0.8538640141487122, "step": 942 }, { "epoch": 0.51, "learning_rate": 9.498145640389019e-08, "logits/chosen": -2.1438097953796387, "logits/rejected": -2.1478357315063477, "logps/chosen": -4.039600849151611, "logps/rejected": -3.44749116897583, "loss": 0.4842, "rewards/accuracies": 1.0, "rewards/chosen": 1.0719326734542847, "rewards/margins": 0.4732915163040161, "rewards/rejected": 0.5986411571502686, "step": 943 }, { "epoch": 0.51, "learning_rate": 9.49687352325513e-08, "logits/chosen": -1.9693174362182617, "logits/rejected": -1.949683666229248, "logps/chosen": -20.638158798217773, "logps/rejected": -2.2206432819366455, "loss": 0.4039, "rewards/accuracies": 1.0, "rewards/chosen": 1.4110910892486572, "rewards/margins": 0.697766900062561, "rewards/rejected": 0.7133241891860962, "step": 944 }, { "epoch": 0.51, "learning_rate": 9.495599881269892e-08, "logits/chosen": -2.0956411361694336, "logits/rejected": -2.09865665435791, "logps/chosen": -4.287405967712402, "logps/rejected": -0.6861462593078613, "loss": 0.5997, "rewards/accuracies": 1.0, "rewards/chosen": 1.198949933052063, "rewards/margins": 0.19643986225128174, "rewards/rejected": 1.0025100708007812, "step": 945 }, { "epoch": 0.51, "learning_rate": 9.494324714865185e-08, "logits/chosen": -2.107128381729126, "logits/rejected": -2.235222101211548, "logps/chosen": -2.7410645484924316, "logps/rejected": -1.6821731328964233, "loss": 0.6659, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900903701782227, "rewards/margins": 0.05523282289505005, "rewards/rejected": 0.9348575472831726, "step": 946 }, { "epoch": 0.51, "learning_rate": 9.493048024473411e-08, "logits/chosen": -2.107177257537842, "logits/rejected": -2.243335723876953, "logps/chosen": -0.4920365810394287, "logps/rejected": -0.5319284796714783, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.7955473065376282, "rewards/margins": 0.009407639503479004, "rewards/rejected": 0.7861396670341492, "step": 947 }, { "epoch": 0.51, "learning_rate": 9.49176981052748e-08, "logits/chosen": -2.1407809257507324, "logits/rejected": -2.136335849761963, "logps/chosen": -3.068787097930908, "logps/rejected": -5.346944808959961, "loss": 0.6382, "rewards/accuracies": 1.0, "rewards/chosen": 0.8851652145385742, "rewards/margins": 0.11304569244384766, "rewards/rejected": 0.7721195220947266, "step": 948 }, { "epoch": 0.51, "learning_rate": 9.490490073460831e-08, "logits/chosen": -2.0746288299560547, "logits/rejected": -2.2491469383239746, "logps/chosen": -0.8709273934364319, "logps/rejected": -0.9159038066864014, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8403950929641724, "rewards/margins": 0.015987396240234375, "rewards/rejected": 0.824407696723938, "step": 949 }, { "epoch": 0.51, "learning_rate": 9.489208813707403e-08, "logits/chosen": -2.008837938308716, "logits/rejected": -2.233506441116333, "logps/chosen": -1.337883710861206, "logps/rejected": -5.623003959655762, "loss": 0.528, "rewards/accuracies": 1.0, "rewards/chosen": 1.0587178468704224, "rewards/margins": 0.36303168535232544, "rewards/rejected": 0.6956861615180969, "step": 950 }, { "epoch": 0.51, "learning_rate": 9.487926031701665e-08, "logits/chosen": -2.0962250232696533, "logits/rejected": -2.2385122776031494, "logps/chosen": -1.2420804500579834, "logps/rejected": -1.2588040828704834, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8583208322525024, "rewards/margins": 0.019848287105560303, "rewards/rejected": 0.8384725451469421, "step": 951 }, { "epoch": 0.51, "learning_rate": 9.486641727878597e-08, "logits/chosen": -2.1232993602752686, "logits/rejected": -2.123098850250244, "logps/chosen": -3.5147085189819336, "logps/rejected": -2.7186999320983887, "loss": 0.3506, "rewards/accuracies": 1.0, "rewards/chosen": 1.4349735975265503, "rewards/margins": 0.8676007390022278, "rewards/rejected": 0.5673728585243225, "step": 952 }, { "epoch": 0.51, "learning_rate": 9.485355902673696e-08, "logits/chosen": -1.9492650032043457, "logits/rejected": -1.9491478204727173, "logps/chosen": -0.7136294841766357, "logps/rejected": -4.7836174964904785, "loss": 0.5433, "rewards/accuracies": 1.0, "rewards/chosen": 0.9269124269485474, "rewards/margins": 0.32626092433929443, "rewards/rejected": 0.6006515026092529, "step": 953 }, { "epoch": 0.51, "learning_rate": 9.484068556522972e-08, "logits/chosen": -2.1362507343292236, "logits/rejected": -2.321420907974243, "logps/chosen": -1.79396390914917, "logps/rejected": -1.7912309169769287, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.979756772518158, "rewards/margins": 0.020949184894561768, "rewards/rejected": 0.9588075876235962, "step": 954 }, { "epoch": 0.52, "learning_rate": 9.482779689862957e-08, "logits/chosen": -2.081732749938965, "logits/rejected": -2.2437398433685303, "logps/chosen": -2.3098278045654297, "logps/rejected": -2.365957021713257, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.596089780330658, "rewards/margins": 0.01814216375350952, "rewards/rejected": 0.5779476165771484, "step": 955 }, { "epoch": 0.52, "learning_rate": 9.48148930313069e-08, "logits/chosen": -2.141742467880249, "logits/rejected": -2.1746914386749268, "logps/chosen": -3.5765061378479004, "logps/rejected": -14.07756519317627, "loss": 0.4095, "rewards/accuracies": 1.0, "rewards/chosen": 1.2034037113189697, "rewards/margins": 0.6810130476951599, "rewards/rejected": 0.5223906636238098, "step": 956 }, { "epoch": 0.52, "learning_rate": 9.480197396763732e-08, "logits/chosen": -1.9892929792404175, "logits/rejected": -1.9916776418685913, "logps/chosen": -3.080904483795166, "logps/rejected": -5.971914291381836, "loss": 0.5238, "rewards/accuracies": 1.0, "rewards/chosen": 0.915775716304779, "rewards/margins": 0.37322068214416504, "rewards/rejected": 0.542555034160614, "step": 957 }, { "epoch": 0.52, "learning_rate": 9.478903971200159e-08, "logits/chosen": -2.0618438720703125, "logits/rejected": -2.0715365409851074, "logps/chosen": -3.7802906036376953, "logps/rejected": -11.46763801574707, "loss": 0.662, "rewards/accuracies": 1.0, "rewards/chosen": 0.8396303057670593, "rewards/margins": 0.06338995695114136, "rewards/rejected": 0.776240348815918, "step": 958 }, { "epoch": 0.52, "learning_rate": 9.477609026878557e-08, "logits/chosen": -2.074341058731079, "logits/rejected": -2.2590699195861816, "logps/chosen": -3.7598989009857178, "logps/rejected": -14.896939277648926, "loss": 0.6643, "rewards/accuracies": 1.0, "rewards/chosen": 0.954812228679657, "rewards/margins": 0.0586281418800354, "rewards/rejected": 0.8961840867996216, "step": 959 }, { "epoch": 0.52, "learning_rate": 9.476312564238034e-08, "logits/chosen": -2.0481972694396973, "logits/rejected": -2.2515599727630615, "logps/chosen": -12.894436836242676, "logps/rejected": -6.94932746887207, "loss": 0.7731, "rewards/accuracies": 0.0, "rewards/chosen": 0.7480891346931458, "rewards/margins": -0.1540360450744629, "rewards/rejected": 0.9021251797676086, "step": 960 }, { "epoch": 0.52, "learning_rate": 9.475014583718208e-08, "logits/chosen": -2.0562515258789062, "logits/rejected": -2.2645812034606934, "logps/chosen": -1.4560633897781372, "logps/rejected": -1.491584300994873, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8633519411087036, "rewards/margins": 0.026822984218597412, "rewards/rejected": 0.8365289568901062, "step": 961 }, { "epoch": 0.52, "learning_rate": 9.473715085759215e-08, "logits/chosen": -2.056852340698242, "logits/rejected": -2.05727219581604, "logps/chosen": -4.297023296356201, "logps/rejected": -3.59496808052063, "loss": 0.5763, "rewards/accuracies": 1.0, "rewards/chosen": 0.8790376782417297, "rewards/margins": 0.2492508888244629, "rewards/rejected": 0.6297867894172668, "step": 962 }, { "epoch": 0.52, "learning_rate": 9.4724140708017e-08, "logits/chosen": -2.104313850402832, "logits/rejected": -2.106395959854126, "logps/chosen": -0.6025898456573486, "logps/rejected": -3.199073076248169, "loss": 0.5587, "rewards/accuracies": 1.0, "rewards/chosen": 0.950546383857727, "rewards/margins": 0.2899322509765625, "rewards/rejected": 0.6606141328811646, "step": 963 }, { "epoch": 0.52, "learning_rate": 9.471111539286828e-08, "logits/chosen": -2.0505049228668213, "logits/rejected": -2.259965419769287, "logps/chosen": -2.601534843444824, "logps/rejected": -2.5491886138916016, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8443289995193481, "rewards/margins": 0.01086515188217163, "rewards/rejected": 0.8334638476371765, "step": 964 }, { "epoch": 0.52, "learning_rate": 9.469807491656277e-08, "logits/chosen": -2.0744035243988037, "logits/rejected": -2.258876323699951, "logps/chosen": -32.320655822753906, "logps/rejected": -32.81623840332031, "loss": 0.748, "rewards/accuracies": 0.0, "rewards/chosen": 0.18050765991210938, "rewards/margins": -0.10686683654785156, "rewards/rejected": 0.28737449645996094, "step": 965 }, { "epoch": 0.52, "learning_rate": 9.468501928352237e-08, "logits/chosen": -2.1311428546905518, "logits/rejected": -2.1170530319213867, "logps/chosen": -10.295075416564941, "logps/rejected": -2.580630302429199, "loss": 0.5675, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280693769454956, "rewards/margins": 0.26929765939712524, "rewards/rejected": 0.7587717175483704, "step": 966 }, { "epoch": 0.52, "learning_rate": 9.467194849817414e-08, "logits/chosen": -2.055241107940674, "logits/rejected": -2.2825934886932373, "logps/chosen": -3.0937228202819824, "logps/rejected": -8.206336975097656, "loss": 0.5967, "rewards/accuracies": 1.0, "rewards/chosen": 0.7244083285331726, "rewards/margins": 0.20321577787399292, "rewards/rejected": 0.5211925506591797, "step": 967 }, { "epoch": 0.52, "learning_rate": 9.465886256495027e-08, "logits/chosen": -2.1073553562164307, "logits/rejected": -2.1053192615509033, "logps/chosen": -6.260716438293457, "logps/rejected": -5.451304912567139, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.3161380290985107, "rewards/margins": 0.9209229946136475, "rewards/rejected": 0.3952150046825409, "step": 968 }, { "epoch": 0.52, "learning_rate": 9.464576148828808e-08, "logits/chosen": -2.2305779457092285, "logits/rejected": -2.3144443035125732, "logps/chosen": -1.6395795345306396, "logps/rejected": -1.6665782928466797, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.991188645362854, "rewards/margins": -0.0038126707077026367, "rewards/rejected": 0.9950013160705566, "step": 969 }, { "epoch": 0.52, "learning_rate": 9.463264527263007e-08, "logits/chosen": -2.137601852416992, "logits/rejected": -2.2696609497070312, "logps/chosen": -0.62461918592453, "logps/rejected": -0.6677385568618774, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.9066271185874939, "rewards/margins": 0.012072563171386719, "rewards/rejected": 0.8945545554161072, "step": 970 }, { "epoch": 0.52, "learning_rate": 9.461951392242378e-08, "logits/chosen": -2.022250175476074, "logits/rejected": -2.2778611183166504, "logps/chosen": -1.1123369932174683, "logps/rejected": -1.0619537830352783, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.8005144000053406, "rewards/margins": 0.009234488010406494, "rewards/rejected": 0.7912799119949341, "step": 971 }, { "epoch": 0.52, "learning_rate": 9.460636744212198e-08, "logits/chosen": -2.027036666870117, "logits/rejected": -2.2609734535217285, "logps/chosen": -2.235649824142456, "logps/rejected": -2.4560928344726562, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.9767225384712219, "rewards/margins": 0.020945370197296143, "rewards/rejected": 0.9557771682739258, "step": 972 }, { "epoch": 0.52, "learning_rate": 9.459320583618252e-08, "logits/chosen": -2.055410146713257, "logits/rejected": -2.1051034927368164, "logps/chosen": -7.4897332191467285, "logps/rejected": -15.449471473693848, "loss": 0.3, "rewards/accuracies": 1.0, "rewards/chosen": 1.2719287872314453, "rewards/margins": 1.0500638484954834, "rewards/rejected": 0.2218649834394455, "step": 973 }, { "epoch": 0.53, "learning_rate": 9.458002910906837e-08, "logits/chosen": -2.140410900115967, "logits/rejected": -2.143413543701172, "logps/chosen": -4.862399578094482, "logps/rejected": -9.186290740966797, "loss": 0.5, "rewards/accuracies": 1.0, "rewards/chosen": 1.1982265710830688, "rewards/margins": 0.4327114224433899, "rewards/rejected": 0.765515148639679, "step": 974 }, { "epoch": 0.53, "learning_rate": 9.456683726524767e-08, "logits/chosen": -2.0805323123931885, "logits/rejected": -2.0850744247436523, "logps/chosen": -11.23953914642334, "logps/rejected": -1.952378511428833, "loss": 0.441, "rewards/accuracies": 1.0, "rewards/chosen": 1.2751500606536865, "rewards/margins": 0.5900055170059204, "rewards/rejected": 0.6851445436477661, "step": 975 }, { "epoch": 0.53, "learning_rate": 9.455363030919365e-08, "logits/chosen": -2.039003610610962, "logits/rejected": -2.2274556159973145, "logps/chosen": -6.657717227935791, "logps/rejected": -1.1041840314865112, "loss": 0.7615, "rewards/accuracies": 0.0, "rewards/chosen": 0.7778906226158142, "rewards/margins": -0.13229656219482422, "rewards/rejected": 0.9101871848106384, "step": 976 }, { "epoch": 0.53, "learning_rate": 9.454040824538467e-08, "logits/chosen": -2.0215392112731934, "logits/rejected": -2.033231496810913, "logps/chosen": -3.428741693496704, "logps/rejected": -1.837886929512024, "loss": 0.5061, "rewards/accuracies": 1.0, "rewards/chosen": 1.0612672567367554, "rewards/margins": 0.4172239899635315, "rewards/rejected": 0.6440432667732239, "step": 977 }, { "epoch": 0.53, "learning_rate": 9.452717107830423e-08, "logits/chosen": -2.048476219177246, "logits/rejected": -2.044410228729248, "logps/chosen": -10.385977745056152, "logps/rejected": -6.652296543121338, "loss": 0.4206, "rewards/accuracies": 1.0, "rewards/chosen": 1.2960008382797241, "rewards/margins": 0.6485220193862915, "rewards/rejected": 0.6474788188934326, "step": 978 }, { "epoch": 0.53, "learning_rate": 9.451391881244096e-08, "logits/chosen": -2.0579910278320312, "logits/rejected": -2.2592897415161133, "logps/chosen": -0.3977699875831604, "logps/rejected": -0.38719674944877625, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.6480249762535095, "rewards/margins": 0.020052433013916016, "rewards/rejected": 0.6279725432395935, "step": 979 }, { "epoch": 0.53, "learning_rate": 9.450065145228854e-08, "logits/chosen": -2.040649652481079, "logits/rejected": -2.262420415878296, "logps/chosen": -1.9235200881958008, "logps/rejected": -2.022486448287964, "loss": 0.6722, "rewards/accuracies": 1.0, "rewards/chosen": 0.9143359065055847, "rewards/margins": 0.04233509302139282, "rewards/rejected": 0.8720008134841919, "step": 980 }, { "epoch": 0.53, "learning_rate": 9.448736900234584e-08, "logits/chosen": -2.0563554763793945, "logits/rejected": -2.0342869758605957, "logps/chosen": -8.219139099121094, "logps/rejected": -3.5167839527130127, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 1.2931305170059204, "rewards/margins": 0.6599222421646118, "rewards/rejected": 0.6332082748413086, "step": 981 }, { "epoch": 0.53, "learning_rate": 9.447407146711683e-08, "logits/chosen": -2.0602433681488037, "logits/rejected": -2.0608253479003906, "logps/chosen": -4.078010082244873, "logps/rejected": -2.652966260910034, "loss": 0.3958, "rewards/accuracies": 1.0, "rewards/chosen": 1.6281852722167969, "rewards/margins": 0.722290575504303, "rewards/rejected": 0.9058946967124939, "step": 982 }, { "epoch": 0.53, "learning_rate": 9.446075885111061e-08, "logits/chosen": -2.045489549636841, "logits/rejected": -2.0459988117218018, "logps/chosen": -6.037397384643555, "logps/rejected": -2.573362112045288, "loss": 0.3137, "rewards/accuracies": 1.0, "rewards/chosen": 1.5655605792999268, "rewards/margins": 0.9981887936592102, "rewards/rejected": 0.5673717856407166, "step": 983 }, { "epoch": 0.53, "learning_rate": 9.444743115884134e-08, "logits/chosen": -1.97016179561615, "logits/rejected": -1.9623589515686035, "logps/chosen": -12.789307594299316, "logps/rejected": -11.503480911254883, "loss": 0.5931, "rewards/accuracies": 1.0, "rewards/chosen": 1.171069622039795, "rewards/margins": 0.21129655838012695, "rewards/rejected": 0.959773063659668, "step": 984 }, { "epoch": 0.53, "learning_rate": 9.443408839482834e-08, "logits/chosen": -2.1156654357910156, "logits/rejected": -2.1128573417663574, "logps/chosen": -10.340802192687988, "logps/rejected": -1.4553625583648682, "loss": 0.5698, "rewards/accuracies": 1.0, "rewards/chosen": 1.2031524181365967, "rewards/margins": 0.2641618251800537, "rewards/rejected": 0.938990592956543, "step": 985 }, { "epoch": 0.53, "learning_rate": 9.442073056359603e-08, "logits/chosen": -2.087939500808716, "logits/rejected": -2.1160411834716797, "logps/chosen": -8.873806953430176, "logps/rejected": -8.543838500976562, "loss": 0.5325, "rewards/accuracies": 1.0, "rewards/chosen": 1.3102740049362183, "rewards/margins": 0.3520740866661072, "rewards/rejected": 0.9581999182701111, "step": 986 }, { "epoch": 0.53, "learning_rate": 9.440735766967394e-08, "logits/chosen": -1.9621877670288086, "logits/rejected": -1.96176278591156, "logps/chosen": -0.438217431306839, "logps/rejected": -6.239575386047363, "loss": 0.5621, "rewards/accuracies": 1.0, "rewards/chosen": 0.7980275750160217, "rewards/margins": 0.28192436695098877, "rewards/rejected": 0.516103208065033, "step": 987 }, { "epoch": 0.53, "learning_rate": 9.439396971759668e-08, "logits/chosen": -2.142059087753296, "logits/rejected": -2.2385778427124023, "logps/chosen": -4.188911437988281, "logps/rejected": -4.276714324951172, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9731504321098328, "rewards/margins": 0.028829634189605713, "rewards/rejected": 0.944320797920227, "step": 988 }, { "epoch": 0.53, "learning_rate": 9.438056671190402e-08, "logits/chosen": -2.0308613777160645, "logits/rejected": -2.0407161712646484, "logps/chosen": -5.630744457244873, "logps/rejected": -8.305002212524414, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.0074137449264526, "rewards/margins": 0.3609555959701538, "rewards/rejected": 0.6464581489562988, "step": 989 }, { "epoch": 0.53, "learning_rate": 9.43671486571408e-08, "logits/chosen": -2.137728691101074, "logits/rejected": -2.135241746902466, "logps/chosen": -5.320884704589844, "logps/rejected": -7.068048477172852, "loss": 0.4566, "rewards/accuracies": 1.0, "rewards/chosen": 1.3053468465805054, "rewards/margins": 0.5468610525131226, "rewards/rejected": 0.7584857940673828, "step": 990 }, { "epoch": 0.53, "learning_rate": 9.435371555785694e-08, "logits/chosen": -1.9921908378601074, "logits/rejected": -2.001621723175049, "logps/chosen": -2.3341543674468994, "logps/rejected": -2.134385585784912, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 1.1712422370910645, "rewards/margins": 0.4568555951118469, "rewards/rejected": 0.7143866419792175, "step": 991 }, { "epoch": 0.54, "learning_rate": 9.434026741860752e-08, "logits/chosen": -2.0977087020874023, "logits/rejected": -2.2887415885925293, "logps/chosen": -10.479516983032227, "logps/rejected": -8.590496063232422, "loss": 0.7699, "rewards/accuracies": 0.0, "rewards/chosen": 0.7098907828330994, "rewards/margins": -0.1480756402015686, "rewards/rejected": 0.857966423034668, "step": 992 }, { "epoch": 0.54, "learning_rate": 9.432680424395269e-08, "logits/chosen": -2.1140003204345703, "logits/rejected": -2.2275550365448, "logps/chosen": -7.765037536621094, "logps/rejected": -7.054296970367432, "loss": 0.652, "rewards/accuracies": 1.0, "rewards/chosen": 0.8401718139648438, "rewards/margins": 0.084134042263031, "rewards/rejected": 0.7560377717018127, "step": 993 }, { "epoch": 0.54, "learning_rate": 9.431332603845768e-08, "logits/chosen": -2.0000510215759277, "logits/rejected": -2.2993557453155518, "logps/chosen": -2.9748566150665283, "logps/rejected": -3.271259307861328, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691190719604492, "rewards/margins": 0.0020554661750793457, "rewards/rejected": 0.9670636057853699, "step": 994 }, { "epoch": 0.54, "learning_rate": 9.429983280669282e-08, "logits/chosen": -2.00038480758667, "logits/rejected": -2.0100958347320557, "logps/chosen": -3.1112489700317383, "logps/rejected": -1.7080590724945068, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 1.2888530492782593, "rewards/margins": 0.6094620227813721, "rewards/rejected": 0.6793910264968872, "step": 995 }, { "epoch": 0.54, "learning_rate": 9.428632455323359e-08, "logits/chosen": -2.0813493728637695, "logits/rejected": -2.0590274333953857, "logps/chosen": -14.701329231262207, "logps/rejected": -2.7989325523376465, "loss": 0.4319, "rewards/accuracies": 1.0, "rewards/chosen": 1.2458142042160034, "rewards/margins": 0.615776538848877, "rewards/rejected": 0.6300376653671265, "step": 996 }, { "epoch": 0.54, "learning_rate": 9.427280128266049e-08, "logits/chosen": -2.1096906661987305, "logits/rejected": -2.1068379878997803, "logps/chosen": -1.9086319208145142, "logps/rejected": -4.509848594665527, "loss": 0.6331, "rewards/accuracies": 1.0, "rewards/chosen": 0.9588242769241333, "rewards/margins": 0.12389582395553589, "rewards/rejected": 0.8349284529685974, "step": 997 }, { "epoch": 0.54, "learning_rate": 9.425926299955916e-08, "logits/chosen": -2.0642759799957275, "logits/rejected": -2.2746448516845703, "logps/chosen": -3.7785396575927734, "logps/rejected": -3.8095884323120117, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.7229119539260864, "rewards/margins": -0.007233023643493652, "rewards/rejected": 0.7301449775695801, "step": 998 }, { "epoch": 0.54, "learning_rate": 9.424570970852032e-08, "logits/chosen": -2.170586585998535, "logits/rejected": -2.1556475162506104, "logps/chosen": -3.3046703338623047, "logps/rejected": -7.240659713745117, "loss": 0.4561, "rewards/accuracies": 1.0, "rewards/chosen": 1.195854902267456, "rewards/margins": 0.5483739972114563, "rewards/rejected": 0.6474809050559998, "step": 999 }, { "epoch": 0.54, "learning_rate": 9.423214141413977e-08, "logits/chosen": -1.9742588996887207, "logits/rejected": -1.9812246561050415, "logps/chosen": -4.216744899749756, "logps/rejected": -2.5759449005126953, "loss": 0.4881, "rewards/accuracies": 1.0, "rewards/chosen": 1.1042362451553345, "rewards/margins": 0.46328943967819214, "rewards/rejected": 0.6409468054771423, "step": 1000 }, { "epoch": 0.54, "learning_rate": 9.421855812101839e-08, "logits/chosen": -1.9272620677947998, "logits/rejected": -1.9354743957519531, "logps/chosen": -1.8261610269546509, "logps/rejected": -2.459007740020752, "loss": 0.5114, "rewards/accuracies": 1.0, "rewards/chosen": 1.022072196006775, "rewards/margins": 0.40396738052368164, "rewards/rejected": 0.6181048154830933, "step": 1001 }, { "epoch": 0.54, "learning_rate": 9.420495983376218e-08, "logits/chosen": -2.112351417541504, "logits/rejected": -2.263201951980591, "logps/chosen": -3.0200414657592773, "logps/rejected": -2.8774566650390625, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.6012449264526367, "rewards/margins": 0.007382571697235107, "rewards/rejected": 0.5938623547554016, "step": 1002 }, { "epoch": 0.54, "learning_rate": 9.41913465569822e-08, "logits/chosen": -2.021956205368042, "logits/rejected": -2.0244853496551514, "logps/chosen": -3.924469470977783, "logps/rejected": -0.7545192241668701, "loss": 0.5624, "rewards/accuracies": 1.0, "rewards/chosen": 1.1312789916992188, "rewards/margins": 0.2811813950538635, "rewards/rejected": 0.8500975966453552, "step": 1003 }, { "epoch": 0.54, "learning_rate": 9.417771829529458e-08, "logits/chosen": -2.0860249996185303, "logits/rejected": -2.3245646953582764, "logps/chosen": -0.4512392282485962, "logps/rejected": -0.4885098934173584, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.0089343786239624, "rewards/margins": 0.013684213161468506, "rewards/rejected": 0.9952501654624939, "step": 1004 }, { "epoch": 0.54, "learning_rate": 9.416407505332054e-08, "logits/chosen": -1.9323419332504272, "logits/rejected": -2.246387004852295, "logps/chosen": -0.7763842940330505, "logps/rejected": -0.7870175838470459, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.807386040687561, "rewards/margins": 0.03603869676589966, "rewards/rejected": 0.7713473439216614, "step": 1005 }, { "epoch": 0.54, "learning_rate": 9.415041683568642e-08, "logits/chosen": -2.0452075004577637, "logits/rejected": -2.045527219772339, "logps/chosen": -1.1754077672958374, "logps/rejected": -2.173393726348877, "loss": 0.5335, "rewards/accuracies": 1.0, "rewards/chosen": 1.0168216228485107, "rewards/margins": 0.3498179316520691, "rewards/rejected": 0.6670036911964417, "step": 1006 }, { "epoch": 0.54, "learning_rate": 9.413674364702357e-08, "logits/chosen": -2.011106491088867, "logits/rejected": -2.251720428466797, "logps/chosen": -0.8645858764648438, "logps/rejected": -0.8998215794563293, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9411171078681946, "rewards/margins": 0.06571710109710693, "rewards/rejected": 0.8754000067710876, "step": 1007 }, { "epoch": 0.54, "learning_rate": 9.412305549196849e-08, "logits/chosen": -2.0712430477142334, "logits/rejected": -2.2420573234558105, "logps/chosen": -0.7605608701705933, "logps/rejected": -3.6245360374450684, "loss": 0.6118, "rewards/accuracies": 1.0, "rewards/chosen": 0.9494436383247375, "rewards/margins": 0.16979384422302246, "rewards/rejected": 0.7796497941017151, "step": 1008 }, { "epoch": 0.54, "learning_rate": 9.410935237516269e-08, "logits/chosen": -2.0117013454437256, "logits/rejected": -2.0047433376312256, "logps/chosen": -16.550630569458008, "logps/rejected": -6.233431339263916, "loss": 0.4209, "rewards/accuracies": 1.0, "rewards/chosen": 1.3561718463897705, "rewards/margins": 0.6475192904472351, "rewards/rejected": 0.7086525559425354, "step": 1009 }, { "epoch": 0.54, "learning_rate": 9.409563430125278e-08, "logits/chosen": -2.021404504776001, "logits/rejected": -2.309462070465088, "logps/chosen": -0.8107801079750061, "logps/rejected": -0.8112109303474426, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.8281580805778503, "rewards/margins": 0.002650916576385498, "rewards/rejected": 0.8255071640014648, "step": 1010 }, { "epoch": 0.55, "learning_rate": 9.408190127489044e-08, "logits/chosen": -2.0833377838134766, "logits/rejected": -2.282167911529541, "logps/chosen": -11.060415267944336, "logps/rejected": -0.5507327318191528, "loss": 0.8216, "rewards/accuracies": 0.0, "rewards/chosen": 0.6797043085098267, "rewards/margins": -0.24231797456741333, "rewards/rejected": 0.92202228307724, "step": 1011 }, { "epoch": 0.55, "learning_rate": 9.406815330073244e-08, "logits/chosen": -1.9295055866241455, "logits/rejected": -1.9279977083206177, "logps/chosen": -1.6695094108581543, "logps/rejected": -1.5663808584213257, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 0.9192819595336914, "rewards/margins": -0.02479952573776245, "rewards/rejected": 0.9440814852714539, "step": 1012 }, { "epoch": 0.55, "learning_rate": 9.405439038344058e-08, "logits/chosen": -2.2700417041778564, "logits/rejected": -2.1554338932037354, "logps/chosen": -42.04438781738281, "logps/rejected": -3.628715753555298, "loss": 0.5167, "rewards/accuracies": 1.0, "rewards/chosen": 1.2992523908615112, "rewards/margins": 0.390738844871521, "rewards/rejected": 0.9085135459899902, "step": 1013 }, { "epoch": 0.55, "learning_rate": 9.404061252768176e-08, "logits/chosen": -2.0474252700805664, "logits/rejected": -2.2075486183166504, "logps/chosen": -1.0969101190567017, "logps/rejected": -1.1359565258026123, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.8810227513313293, "rewards/margins": -0.004325985908508301, "rewards/rejected": 0.8853487372398376, "step": 1014 }, { "epoch": 0.55, "learning_rate": 9.402681973812791e-08, "logits/chosen": -2.1260385513305664, "logits/rejected": -2.2914252281188965, "logps/chosen": -33.96730422973633, "logps/rejected": -8.22355842590332, "loss": 1.0986, "rewards/accuracies": 0.0, "rewards/chosen": 0.21698494255542755, "rewards/margins": -0.6931770443916321, "rewards/rejected": 0.9101619720458984, "step": 1015 }, { "epoch": 0.55, "learning_rate": 9.40130120194561e-08, "logits/chosen": -2.1035990715026855, "logits/rejected": -2.106729507446289, "logps/chosen": -2.083446502685547, "logps/rejected": -3.64176082611084, "loss": 0.4466, "rewards/accuracies": 1.0, "rewards/chosen": 1.1601066589355469, "rewards/margins": 0.5743940472602844, "rewards/rejected": 0.5857126116752625, "step": 1016 }, { "epoch": 0.55, "learning_rate": 9.399918937634834e-08, "logits/chosen": -2.0889952182769775, "logits/rejected": -2.0297229290008545, "logps/chosen": -15.221850395202637, "logps/rejected": -19.269542694091797, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 1.445401906967163, "rewards/margins": 0.9198846220970154, "rewards/rejected": 0.5255172848701477, "step": 1017 }, { "epoch": 0.55, "learning_rate": 9.398535181349183e-08, "logits/chosen": -2.0855414867401123, "logits/rejected": -2.2930307388305664, "logps/chosen": -0.8044178485870361, "logps/rejected": -0.8349957466125488, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9062160849571228, "rewards/margins": 0.0044786930084228516, "rewards/rejected": 0.9017373919487, "step": 1018 }, { "epoch": 0.55, "learning_rate": 9.397149933557873e-08, "logits/chosen": -2.1496365070343018, "logits/rejected": -2.1201586723327637, "logps/chosen": -24.912012100219727, "logps/rejected": -4.020376682281494, "loss": 0.3637, "rewards/accuracies": 1.0, "rewards/chosen": 1.3923609256744385, "rewards/margins": 0.8241686820983887, "rewards/rejected": 0.5681922435760498, "step": 1019 }, { "epoch": 0.55, "learning_rate": 9.395763194730632e-08, "logits/chosen": -2.0378336906433105, "logits/rejected": -2.04172420501709, "logps/chosen": -3.1640865802764893, "logps/rejected": -3.6308159828186035, "loss": 0.3668, "rewards/accuracies": 1.0, "rewards/chosen": 1.3695244789123535, "rewards/margins": 0.8138796091079712, "rewards/rejected": 0.5556448698043823, "step": 1020 }, { "epoch": 0.55, "learning_rate": 9.39437496533769e-08, "logits/chosen": -2.112288475036621, "logits/rejected": -2.2586288452148438, "logps/chosen": -9.590699195861816, "logps/rejected": -9.817617416381836, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8180289268493652, "rewards/margins": 0.012301921844482422, "rewards/rejected": 0.8057270050048828, "step": 1021 }, { "epoch": 0.55, "learning_rate": 9.392985245849784e-08, "logits/chosen": -2.1723616123199463, "logits/rejected": -2.173788547515869, "logps/chosen": -1.0324993133544922, "logps/rejected": -2.12372088432312, "loss": 0.562, "rewards/accuracies": 1.0, "rewards/chosen": 1.0392743349075317, "rewards/margins": 0.28212594985961914, "rewards/rejected": 0.7571483850479126, "step": 1022 }, { "epoch": 0.55, "learning_rate": 9.391594036738156e-08, "logits/chosen": -2.050661087036133, "logits/rejected": -2.2128498554229736, "logps/chosen": -0.9175164103507996, "logps/rejected": -0.9530088901519775, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.761775553226471, "rewards/margins": 0.024360299110412598, "rewards/rejected": 0.7374152541160583, "step": 1023 }, { "epoch": 0.55, "learning_rate": 9.390201338474552e-08, "logits/chosen": -2.0593554973602295, "logits/rejected": -2.1721527576446533, "logps/chosen": -0.9287809133529663, "logps/rejected": -0.9098867177963257, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.8074550628662109, "rewards/margins": 0.028233826160430908, "rewards/rejected": 0.77922123670578, "step": 1024 }, { "epoch": 0.55, "learning_rate": 9.388807151531229e-08, "logits/chosen": -2.0467240810394287, "logits/rejected": -2.214707374572754, "logps/chosen": -1.7102539539337158, "logps/rejected": -1.7101415395736694, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997997641563416, "rewards/margins": -0.01617342233657837, "rewards/rejected": 0.9159731864929199, "step": 1025 }, { "epoch": 0.55, "learning_rate": 9.387411476380937e-08, "logits/chosen": -2.1033263206481934, "logits/rejected": -2.1922495365142822, "logps/chosen": -1.2573966979980469, "logps/rejected": -1.3456603288650513, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.7492448687553406, "rewards/margins": -0.005689382553100586, "rewards/rejected": 0.7549342513084412, "step": 1026 }, { "epoch": 0.55, "learning_rate": 9.38601431349694e-08, "logits/chosen": -2.0273616313934326, "logits/rejected": -2.0290942192077637, "logps/chosen": -3.116168737411499, "logps/rejected": -2.6223673820495605, "loss": 0.4082, "rewards/accuracies": 1.0, "rewards/chosen": 1.1739729642868042, "rewards/margins": 0.6848796010017395, "rewards/rejected": 0.4890933632850647, "step": 1027 }, { "epoch": 0.55, "learning_rate": 9.384615663353006e-08, "logits/chosen": -2.029857873916626, "logits/rejected": -2.233793020248413, "logps/chosen": -1.1810460090637207, "logps/rejected": -1.2087624073028564, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.93365478515625, "rewards/margins": 0.02176457643508911, "rewards/rejected": 0.9118902087211609, "step": 1028 }, { "epoch": 0.56, "learning_rate": 9.383215526423404e-08, "logits/chosen": -2.0209004878997803, "logits/rejected": -2.015026807785034, "logps/chosen": -3.727587938308716, "logps/rejected": -4.88071346282959, "loss": 0.568, "rewards/accuracies": 1.0, "rewards/chosen": 0.772197425365448, "rewards/margins": 0.268202006816864, "rewards/rejected": 0.503995418548584, "step": 1029 }, { "epoch": 0.56, "learning_rate": 9.381813903182909e-08, "logits/chosen": -1.9630155563354492, "logits/rejected": -2.269944190979004, "logps/chosen": -18.781814575195312, "logps/rejected": -1.2101852893829346, "loss": 0.6568, "rewards/accuracies": 1.0, "rewards/chosen": 0.8340789675712585, "rewards/margins": 0.07401227951049805, "rewards/rejected": 0.7600666880607605, "step": 1030 }, { "epoch": 0.56, "learning_rate": 9.380410794106797e-08, "logits/chosen": -2.0938143730163574, "logits/rejected": -2.250420093536377, "logps/chosen": -0.6618132591247559, "logps/rejected": -0.6913925409317017, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.8155195116996765, "rewards/margins": -0.0022460222244262695, "rewards/rejected": 0.8177655339241028, "step": 1031 }, { "epoch": 0.56, "learning_rate": 9.379006199670853e-08, "logits/chosen": -2.1492271423339844, "logits/rejected": -2.1516709327697754, "logps/chosen": -1.6900590658187866, "logps/rejected": -2.3207614421844482, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9042661786079407, "rewards/margins": 0.0012885332107543945, "rewards/rejected": 0.9029776453971863, "step": 1032 }, { "epoch": 0.56, "learning_rate": 9.377600120351364e-08, "logits/chosen": -2.053795337677002, "logits/rejected": -2.221071720123291, "logps/chosen": -8.866252899169922, "logps/rejected": -5.356868743896484, "loss": 0.6664, "rewards/accuracies": 1.0, "rewards/chosen": 0.8710262179374695, "rewards/margins": 0.0541689395904541, "rewards/rejected": 0.8168572783470154, "step": 1033 }, { "epoch": 0.56, "learning_rate": 9.376192556625113e-08, "logits/chosen": -2.1655309200286865, "logits/rejected": -2.161410331726074, "logps/chosen": -5.359220027923584, "logps/rejected": -3.403566360473633, "loss": 0.3151, "rewards/accuracies": 1.0, "rewards/chosen": 1.5477958917617798, "rewards/margins": 0.9932145476341248, "rewards/rejected": 0.554581344127655, "step": 1034 }, { "epoch": 0.56, "learning_rate": 9.374783508969401e-08, "logits/chosen": -2.088122606277466, "logits/rejected": -2.24751353263855, "logps/chosen": -0.9295693635940552, "logps/rejected": -1.0061625242233276, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.876960277557373, "rewards/margins": 0.02735060453414917, "rewards/rejected": 0.8496096730232239, "step": 1035 }, { "epoch": 0.56, "learning_rate": 9.37337297786202e-08, "logits/chosen": -2.0138418674468994, "logits/rejected": -2.0149569511413574, "logps/chosen": -4.992314338684082, "logps/rejected": -1.0256785154342651, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 1.2458930015563965, "rewards/margins": 0.28620070219039917, "rewards/rejected": 0.9596922993659973, "step": 1036 }, { "epoch": 0.56, "learning_rate": 9.371960963781268e-08, "logits/chosen": -2.188305377960205, "logits/rejected": -2.180858612060547, "logps/chosen": -8.413743019104004, "logps/rejected": -3.465824604034424, "loss": 0.511, "rewards/accuracies": 1.0, "rewards/chosen": 1.1698635816574097, "rewards/margins": 0.405076801776886, "rewards/rejected": 0.7647867798805237, "step": 1037 }, { "epoch": 0.56, "learning_rate": 9.370547467205949e-08, "logits/chosen": -1.963314175605774, "logits/rejected": -2.2223703861236572, "logps/chosen": -0.6312217116355896, "logps/rejected": -0.7104601860046387, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978952765464783, "rewards/margins": 0.029139339923858643, "rewards/rejected": 0.8687559366226196, "step": 1038 }, { "epoch": 0.56, "learning_rate": 9.369132488615367e-08, "logits/chosen": -1.9628932476043701, "logits/rejected": -2.004275321960449, "logps/chosen": -9.169939041137695, "logps/rejected": -8.363289833068848, "loss": 0.3951, "rewards/accuracies": 1.0, "rewards/chosen": 1.4188038110733032, "rewards/margins": 0.7245233654975891, "rewards/rejected": 0.6942804455757141, "step": 1039 }, { "epoch": 0.56, "learning_rate": 9.367716028489328e-08, "logits/chosen": -1.9565256834030151, "logits/rejected": -2.2445178031921387, "logps/chosen": -2.780001401901245, "logps/rejected": -2.5033323764801025, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.6318632364273071, "rewards/margins": -0.012965023517608643, "rewards/rejected": 0.6448282599449158, "step": 1040 }, { "epoch": 0.56, "learning_rate": 9.366298087308144e-08, "logits/chosen": -2.159701347351074, "logits/rejected": -2.322232246398926, "logps/chosen": -0.7255111336708069, "logps/rejected": -0.720702052116394, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8644189238548279, "rewards/margins": 0.01932692527770996, "rewards/rejected": 0.8450919985771179, "step": 1041 }, { "epoch": 0.56, "learning_rate": 9.364878665552625e-08, "logits/chosen": -2.1367311477661133, "logits/rejected": -2.132908821105957, "logps/chosen": -7.096475124359131, "logps/rejected": -3.633349895477295, "loss": 0.3987, "rewards/accuracies": 1.0, "rewards/chosen": 1.270163893699646, "rewards/margins": 0.7134888768196106, "rewards/rejected": 0.5566750168800354, "step": 1042 }, { "epoch": 0.56, "learning_rate": 9.363457763704083e-08, "logits/chosen": -2.0707433223724365, "logits/rejected": -2.071958303451538, "logps/chosen": -3.586937427520752, "logps/rejected": -1.012067437171936, "loss": 0.611, "rewards/accuracies": 1.0, "rewards/chosen": 1.202479600906372, "rewards/margins": 0.17167985439300537, "rewards/rejected": 1.0307997465133667, "step": 1043 }, { "epoch": 0.56, "learning_rate": 9.362035382244339e-08, "logits/chosen": -2.0911080837249756, "logits/rejected": -2.093153953552246, "logps/chosen": -1.4743309020996094, "logps/rejected": -2.4855892658233643, "loss": 0.5534, "rewards/accuracies": 1.0, "rewards/chosen": 0.9529383778572083, "rewards/margins": 0.30233049392700195, "rewards/rejected": 0.6506078839302063, "step": 1044 }, { "epoch": 0.56, "learning_rate": 9.360611521655705e-08, "logits/chosen": -2.001772403717041, "logits/rejected": -2.247999429702759, "logps/chosen": -1.2273383140563965, "logps/rejected": -1.2867704629898071, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.6517001390457153, "rewards/margins": 0.003744363784790039, "rewards/rejected": 0.6479557752609253, "step": 1045 }, { "epoch": 0.56, "learning_rate": 9.359186182421002e-08, "logits/chosen": -2.2023327350616455, "logits/rejected": -2.303461790084839, "logps/chosen": -0.7088465094566345, "logps/rejected": -0.7609893083572388, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 1.0182712078094482, "rewards/margins": 0.005767107009887695, "rewards/rejected": 1.0125041007995605, "step": 1046 }, { "epoch": 0.56, "learning_rate": 9.357759365023552e-08, "logits/chosen": -2.094620704650879, "logits/rejected": -2.062352180480957, "logps/chosen": -4.025537967681885, "logps/rejected": -2.4736168384552, "loss": 0.521, "rewards/accuracies": 1.0, "rewards/chosen": 1.038389801979065, "rewards/margins": 0.38017719984054565, "rewards/rejected": 0.6582126021385193, "step": 1047 }, { "epoch": 0.57, "learning_rate": 9.356331069947174e-08, "logits/chosen": -2.1202917098999023, "logits/rejected": -2.2934958934783936, "logps/chosen": -4.1494622230529785, "logps/rejected": -4.124778747558594, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.819461464881897, "rewards/margins": 0.015762925148010254, "rewards/rejected": 0.8036985397338867, "step": 1048 }, { "epoch": 0.57, "learning_rate": 9.354901297676191e-08, "logits/chosen": -2.045440912246704, "logits/rejected": -2.2694835662841797, "logps/chosen": -2.717118501663208, "logps/rejected": -7.4768757820129395, "loss": 0.6381, "rewards/accuracies": 1.0, "rewards/chosen": 0.7156888842582703, "rewards/margins": 0.11336994171142578, "rewards/rejected": 0.6023189425468445, "step": 1049 }, { "epoch": 0.57, "learning_rate": 9.353470048695427e-08, "logits/chosen": -2.050396203994751, "logits/rejected": -2.0514745712280273, "logps/chosen": -0.564686119556427, "logps/rejected": -3.395815849304199, "loss": 0.56, "rewards/accuracies": 1.0, "rewards/chosen": 0.8950867056846619, "rewards/margins": 0.2868863344192505, "rewards/rejected": 0.6082003712654114, "step": 1050 }, { "epoch": 0.57, "learning_rate": 9.352037323490207e-08, "logits/chosen": -2.066610336303711, "logits/rejected": -2.19698166847229, "logps/chosen": -4.929473876953125, "logps/rejected": -0.7115447521209717, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.7426319122314453, "rewards/margins": 0.032576918601989746, "rewards/rejected": 0.7100549936294556, "step": 1051 }, { "epoch": 0.57, "learning_rate": 9.350603122546357e-08, "logits/chosen": -2.030426502227783, "logits/rejected": -2.2482666969299316, "logps/chosen": -0.5074241161346436, "logps/rejected": -0.5184804201126099, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8048276901245117, "rewards/margins": 0.031726837158203125, "rewards/rejected": 0.7731008529663086, "step": 1052 }, { "epoch": 0.57, "learning_rate": 9.349167446350198e-08, "logits/chosen": -1.9396333694458008, "logits/rejected": -2.2324881553649902, "logps/chosen": -4.798589706420898, "logps/rejected": -4.86496639251709, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.870951771736145, "rewards/margins": 0.019319355487823486, "rewards/rejected": 0.8516324162483215, "step": 1053 }, { "epoch": 0.57, "learning_rate": 9.347730295388558e-08, "logits/chosen": -2.0606377124786377, "logits/rejected": -2.066387414932251, "logps/chosen": -2.2210628986358643, "logps/rejected": -3.194355010986328, "loss": 0.4581, "rewards/accuracies": 1.0, "rewards/chosen": 1.0796794891357422, "rewards/margins": 0.5429377555847168, "rewards/rejected": 0.5367417335510254, "step": 1054 }, { "epoch": 0.57, "learning_rate": 9.346291670148765e-08, "logits/chosen": -2.1362791061401367, "logits/rejected": -2.280562162399292, "logps/chosen": -5.170077323913574, "logps/rejected": -2.5693821907043457, "loss": 0.639, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553052186965942, "rewards/margins": 0.11138945817947388, "rewards/rejected": 0.8439157605171204, "step": 1055 }, { "epoch": 0.57, "learning_rate": 9.34485157111864e-08, "logits/chosen": -2.0834784507751465, "logits/rejected": -2.0898098945617676, "logps/chosen": -2.94503116607666, "logps/rejected": -4.822634696960449, "loss": 0.4126, "rewards/accuracies": 1.0, "rewards/chosen": 1.182085394859314, "rewards/margins": 0.6718815565109253, "rewards/rejected": 0.5102038383483887, "step": 1056 }, { "epoch": 0.57, "learning_rate": 9.343409998786511e-08, "logits/chosen": -2.0917704105377197, "logits/rejected": -2.023736000061035, "logps/chosen": -24.325986862182617, "logps/rejected": -2.9375245571136475, "loss": 0.4764, "rewards/accuracies": 1.0, "rewards/chosen": 1.0861120223999023, "rewards/margins": 0.49374425411224365, "rewards/rejected": 0.5923677682876587, "step": 1057 }, { "epoch": 0.57, "learning_rate": 9.341966953641203e-08, "logits/chosen": -2.1427271366119385, "logits/rejected": -2.292518138885498, "logps/chosen": -1.0848047733306885, "logps/rejected": -1.0842399597167969, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9613800048828125, "rewards/margins": 0.004942774772644043, "rewards/rejected": 0.9564372301101685, "step": 1058 }, { "epoch": 0.57, "learning_rate": 9.34052243617204e-08, "logits/chosen": -2.034632921218872, "logits/rejected": -2.0303542613983154, "logps/chosen": -2.2206971645355225, "logps/rejected": -3.5503125190734863, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 1.1982941627502441, "rewards/margins": 0.5833305716514587, "rewards/rejected": 0.6149635910987854, "step": 1059 }, { "epoch": 0.57, "learning_rate": 9.339076446868843e-08, "logits/chosen": -2.1275649070739746, "logits/rejected": -2.273010730743408, "logps/chosen": -3.610025405883789, "logps/rejected": -3.3867876529693604, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9424285888671875, "rewards/margins": 0.004777729511260986, "rewards/rejected": 0.9376508593559265, "step": 1060 }, { "epoch": 0.57, "learning_rate": 9.337628986221939e-08, "logits/chosen": -1.9930155277252197, "logits/rejected": -2.258483648300171, "logps/chosen": -1.333322286605835, "logps/rejected": -1.5303256511688232, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9806338548660278, "rewards/margins": 0.02042466402053833, "rewards/rejected": 0.9602091908454895, "step": 1061 }, { "epoch": 0.57, "learning_rate": 9.336180054722147e-08, "logits/chosen": -2.0389087200164795, "logits/rejected": -2.04089093208313, "logps/chosen": -12.227519989013672, "logps/rejected": -3.5136215686798096, "loss": 0.2628, "rewards/accuracies": 1.0, "rewards/chosen": 1.6698862314224243, "rewards/margins": 1.2021547555923462, "rewards/rejected": 0.4677314758300781, "step": 1062 }, { "epoch": 0.57, "learning_rate": 9.334729652860786e-08, "logits/chosen": -2.095573902130127, "logits/rejected": -2.317633867263794, "logps/chosen": -1.2866640090942383, "logps/rejected": -6.223676681518555, "loss": 0.5689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9370631575584412, "rewards/margins": 0.26625704765319824, "rewards/rejected": 0.6708061099052429, "step": 1063 }, { "epoch": 0.57, "learning_rate": 9.333277781129676e-08, "logits/chosen": -2.071540117263794, "logits/rejected": -2.054598569869995, "logps/chosen": -11.283770561218262, "logps/rejected": -1.182987928390503, "loss": 0.4831, "rewards/accuracies": 1.0, "rewards/chosen": 1.3640198707580566, "rewards/margins": 0.4763277769088745, "rewards/rejected": 0.8876920938491821, "step": 1064 }, { "epoch": 0.57, "learning_rate": 9.331824440021137e-08, "logits/chosen": -1.940717101097107, "logits/rejected": -2.193295955657959, "logps/chosen": -0.41353750228881836, "logps/rejected": -0.41162705421447754, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9013710021972656, "rewards/margins": 0.01142740249633789, "rewards/rejected": 0.8899435997009277, "step": 1065 }, { "epoch": 0.57, "learning_rate": 9.33036963002798e-08, "logits/chosen": -2.1303043365478516, "logits/rejected": -2.127800703048706, "logps/chosen": -8.274884223937988, "logps/rejected": -2.324808359146118, "loss": 0.5166, "rewards/accuracies": 1.0, "rewards/chosen": 1.1262567043304443, "rewards/margins": 0.3910316824913025, "rewards/rejected": 0.7352250218391418, "step": 1066 }, { "epoch": 0.58, "learning_rate": 9.328913351643522e-08, "logits/chosen": -1.9817349910736084, "logits/rejected": -2.315387010574341, "logps/chosen": -0.3901398777961731, "logps/rejected": -0.45029371976852417, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.972406804561615, "rewards/margins": -0.00854635238647461, "rewards/rejected": 0.9809531569480896, "step": 1067 }, { "epoch": 0.58, "learning_rate": 9.327455605361574e-08, "logits/chosen": -1.9713208675384521, "logits/rejected": -1.9731262922286987, "logps/chosen": -10.787569046020508, "logps/rejected": -1.3576631546020508, "loss": 0.5338, "rewards/accuracies": 1.0, "rewards/chosen": 1.216373085975647, "rewards/margins": 0.34894275665283203, "rewards/rejected": 0.8674303293228149, "step": 1068 }, { "epoch": 0.58, "learning_rate": 9.325996391676443e-08, "logits/chosen": -2.057847499847412, "logits/rejected": -2.230059862136841, "logps/chosen": -0.8791651129722595, "logps/rejected": -0.8619687557220459, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.7748265862464905, "rewards/margins": 0.01636528968811035, "rewards/rejected": 0.7584612965583801, "step": 1069 }, { "epoch": 0.58, "learning_rate": 9.324535711082939e-08, "logits/chosen": -1.9738959074020386, "logits/rejected": -2.2483160495758057, "logps/chosen": -0.9119623303413391, "logps/rejected": -0.8671619296073914, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.7532646059989929, "rewards/margins": 0.015435457229614258, "rewards/rejected": 0.7378291487693787, "step": 1070 }, { "epoch": 0.58, "learning_rate": 9.323073564076364e-08, "logits/chosen": -2.1887292861938477, "logits/rejected": -2.190063714981079, "logps/chosen": -1.280155897140503, "logps/rejected": -4.537079811096191, "loss": 0.5153, "rewards/accuracies": 1.0, "rewards/chosen": 1.0158965587615967, "rewards/margins": 0.39431506395339966, "rewards/rejected": 0.621581494808197, "step": 1071 }, { "epoch": 0.58, "learning_rate": 9.32160995115252e-08, "logits/chosen": -1.9759026765823364, "logits/rejected": -1.973495602607727, "logps/chosen": -3.247668504714966, "logps/rejected": -7.554187774658203, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.3630439043045044, "rewards/margins": 0.8775036931037903, "rewards/rejected": 0.4855402112007141, "step": 1072 }, { "epoch": 0.58, "learning_rate": 9.320144872807707e-08, "logits/chosen": -1.9828565120697021, "logits/rejected": -1.981723666191101, "logps/chosen": -2.4128246307373047, "logps/rejected": -3.0042195320129395, "loss": 0.5507, "rewards/accuracies": 1.0, "rewards/chosen": 0.8899173140525818, "rewards/margins": 0.3085216283798218, "rewards/rejected": 0.58139568567276, "step": 1073 }, { "epoch": 0.58, "learning_rate": 9.318678329538718e-08, "logits/chosen": -2.0291812419891357, "logits/rejected": -2.1018872261047363, "logps/chosen": -4.506184101104736, "logps/rejected": -12.127950668334961, "loss": 0.3601, "rewards/accuracies": 1.0, "rewards/chosen": 1.2825205326080322, "rewards/margins": 0.8358820676803589, "rewards/rejected": 0.4466384947299957, "step": 1074 }, { "epoch": 0.58, "learning_rate": 9.317210321842849e-08, "logits/chosen": -2.079024076461792, "logits/rejected": -2.08072566986084, "logps/chosen": -3.9608757495880127, "logps/rejected": -1.1010682582855225, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 0.9470793008804321, "rewards/margins": 0.1282130479812622, "rewards/rejected": 0.8188662528991699, "step": 1075 }, { "epoch": 0.58, "learning_rate": 9.315740850217886e-08, "logits/chosen": -2.08134126663208, "logits/rejected": -2.0858941078186035, "logps/chosen": -2.247398853302002, "logps/rejected": -2.9185967445373535, "loss": 0.4959, "rewards/accuracies": 1.0, "rewards/chosen": 1.022228479385376, "rewards/margins": 0.4431934952735901, "rewards/rejected": 0.5790349841117859, "step": 1076 }, { "epoch": 0.58, "learning_rate": 9.314269915162114e-08, "logits/chosen": -2.117910385131836, "logits/rejected": -2.143550395965576, "logps/chosen": -9.148688316345215, "logps/rejected": -18.24083137512207, "loss": 0.3662, "rewards/accuracies": 1.0, "rewards/chosen": 1.200092077255249, "rewards/margins": 0.8160245418548584, "rewards/rejected": 0.3840675354003906, "step": 1077 }, { "epoch": 0.58, "learning_rate": 9.312797517174316e-08, "logits/chosen": -1.9953733682632446, "logits/rejected": -2.0031707286834717, "logps/chosen": -1.936234474182129, "logps/rejected": -2.728729486465454, "loss": 0.419, "rewards/accuracies": 1.0, "rewards/chosen": 1.2754909992218018, "rewards/margins": 0.6529770493507385, "rewards/rejected": 0.6225139498710632, "step": 1078 }, { "epoch": 0.58, "learning_rate": 9.311323656753769e-08, "logits/chosen": -2.042146921157837, "logits/rejected": -2.268789768218994, "logps/chosen": -0.7534434199333191, "logps/rejected": -0.7352105379104614, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8018884658813477, "rewards/margins": 0.016003906726837158, "rewards/rejected": 0.7858845591545105, "step": 1079 }, { "epoch": 0.58, "learning_rate": 9.309848334400246e-08, "logits/chosen": -2.2457528114318848, "logits/rejected": -2.16782808303833, "logps/chosen": -50.93143844604492, "logps/rejected": -5.372373580932617, "loss": 0.5322, "rewards/accuracies": 1.0, "rewards/chosen": 1.1818363666534424, "rewards/margins": 0.3528164029121399, "rewards/rejected": 0.8290199637413025, "step": 1080 }, { "epoch": 0.58, "learning_rate": 9.308371550614017e-08, "logits/chosen": -2.06296706199646, "logits/rejected": -2.0505435466766357, "logps/chosen": -5.091604709625244, "logps/rejected": -4.708209991455078, "loss": 0.3406, "rewards/accuracies": 1.0, "rewards/chosen": 1.3598812818527222, "rewards/margins": 0.9020370244979858, "rewards/rejected": 0.45784425735473633, "step": 1081 }, { "epoch": 0.58, "learning_rate": 9.306893305895846e-08, "logits/chosen": -2.101024627685547, "logits/rejected": -2.2531332969665527, "logps/chosen": -2.6152029037475586, "logps/rejected": -2.5024611949920654, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9971904754638672, "rewards/margins": 0.00543135404586792, "rewards/rejected": 0.9917591214179993, "step": 1082 }, { "epoch": 0.58, "learning_rate": 9.305413600746994e-08, "logits/chosen": -2.01723051071167, "logits/rejected": -2.0147719383239746, "logps/chosen": -6.662571430206299, "logps/rejected": -3.786619186401367, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 1.238390326499939, "rewards/margins": 0.7235602736473083, "rewards/rejected": 0.5148300528526306, "step": 1083 }, { "epoch": 0.58, "learning_rate": 9.303932435669215e-08, "logits/chosen": -1.9492591619491577, "logits/rejected": -1.949608325958252, "logps/chosen": -4.934047222137451, "logps/rejected": -2.2376708984375, "loss": 0.3976, "rewards/accuracies": 1.0, "rewards/chosen": 1.3669861555099487, "rewards/margins": 0.7169830799102783, "rewards/rejected": 0.6500030755996704, "step": 1084 }, { "epoch": 0.59, "learning_rate": 9.30244981116476e-08, "logits/chosen": -2.062432289123535, "logits/rejected": -2.277933120727539, "logps/chosen": -9.482452392578125, "logps/rejected": -10.789507865905762, "loss": 0.6186, "rewards/accuracies": 1.0, "rewards/chosen": 0.8039939999580383, "rewards/margins": 0.15510112047195435, "rewards/rejected": 0.648892879486084, "step": 1085 }, { "epoch": 0.59, "learning_rate": 9.300965727736377e-08, "logits/chosen": -2.107924222946167, "logits/rejected": -2.015768051147461, "logps/chosen": -33.07563781738281, "logps/rejected": -3.4254086017608643, "loss": 0.402, "rewards/accuracies": 1.0, "rewards/chosen": 1.2829402685165405, "rewards/margins": 0.7034775614738464, "rewards/rejected": 0.5794627070426941, "step": 1086 }, { "epoch": 0.59, "learning_rate": 9.299480185887302e-08, "logits/chosen": -1.960771918296814, "logits/rejected": -2.246257781982422, "logps/chosen": -0.729211688041687, "logps/rejected": -0.8015245199203491, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.7440993189811707, "rewards/margins": -0.0019533634185791016, "rewards/rejected": 0.7460526823997498, "step": 1087 }, { "epoch": 0.59, "learning_rate": 9.29799318612127e-08, "logits/chosen": -2.0971083641052246, "logits/rejected": -2.127264976501465, "logps/chosen": -3.5645461082458496, "logps/rejected": -24.51594352722168, "loss": 0.5815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049084424972534, "rewards/margins": 0.23725342750549316, "rewards/rejected": 0.7676550149917603, "step": 1088 }, { "epoch": 0.59, "learning_rate": 9.296504728942513e-08, "logits/chosen": -2.0224714279174805, "logits/rejected": -2.0090372562408447, "logps/chosen": -4.0530686378479, "logps/rejected": -2.3890774250030518, "loss": 0.515, "rewards/accuracies": 1.0, "rewards/chosen": 1.092245101928711, "rewards/margins": 0.39505988359451294, "rewards/rejected": 0.697185218334198, "step": 1089 }, { "epoch": 0.59, "learning_rate": 9.295014814855752e-08, "logits/chosen": -1.955183744430542, "logits/rejected": -1.9595229625701904, "logps/chosen": -3.1579058170318604, "logps/rejected": -3.7892262935638428, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 1.3225802183151245, "rewards/margins": 0.7221921682357788, "rewards/rejected": 0.6003880500793457, "step": 1090 }, { "epoch": 0.59, "learning_rate": 9.293523444366205e-08, "logits/chosen": -1.9724863767623901, "logits/rejected": -2.2959556579589844, "logps/chosen": -0.5526763796806335, "logps/rejected": -0.6562821269035339, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9458404779434204, "rewards/margins": 0.004976391792297363, "rewards/rejected": 0.940864086151123, "step": 1091 }, { "epoch": 0.59, "learning_rate": 9.29203061797958e-08, "logits/chosen": -2.136042594909668, "logits/rejected": -2.2803122997283936, "logps/chosen": -0.8059216141700745, "logps/rejected": -0.7066846489906311, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.9086751937866211, "rewards/margins": -0.002693653106689453, "rewards/rejected": 0.9113688468933105, "step": 1092 }, { "epoch": 0.59, "learning_rate": 9.290536336202085e-08, "logits/chosen": -2.002208709716797, "logits/rejected": -2.208238124847412, "logps/chosen": -0.6577005982398987, "logps/rejected": -0.7001094818115234, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.7904506921768188, "rewards/margins": 0.028236985206604004, "rewards/rejected": 0.7622137069702148, "step": 1093 }, { "epoch": 0.59, "learning_rate": 9.289040599540417e-08, "logits/chosen": -2.1321120262145996, "logits/rejected": -2.3063831329345703, "logps/chosen": -6.666260719299316, "logps/rejected": -6.444934368133545, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8085514903068542, "rewards/margins": -0.003770589828491211, "rewards/rejected": 0.8123220801353455, "step": 1094 }, { "epoch": 0.59, "learning_rate": 9.287543408501767e-08, "logits/chosen": -2.024752616882324, "logits/rejected": -2.30096173286438, "logps/chosen": -2.6790781021118164, "logps/rejected": -1.1153419017791748, "loss": 0.7171, "rewards/accuracies": 0.0, "rewards/chosen": 0.9738608598709106, "rewards/margins": -0.04736638069152832, "rewards/rejected": 1.021227240562439, "step": 1095 }, { "epoch": 0.59, "learning_rate": 9.286044763593821e-08, "logits/chosen": -2.0975089073181152, "logits/rejected": -2.2970898151397705, "logps/chosen": -0.6787180304527283, "logps/rejected": -0.733763575553894, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.812031090259552, "rewards/margins": 0.017174363136291504, "rewards/rejected": 0.7948567271232605, "step": 1096 }, { "epoch": 0.59, "learning_rate": 9.284544665324754e-08, "logits/chosen": -2.01786732673645, "logits/rejected": -2.210406541824341, "logps/chosen": -2.884854555130005, "logps/rejected": -2.889247179031372, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.8137702345848083, "rewards/margins": 0.024848222732543945, "rewards/rejected": 0.7889220118522644, "step": 1097 }, { "epoch": 0.59, "learning_rate": 9.283043114203241e-08, "logits/chosen": -1.89448881149292, "logits/rejected": -1.9035159349441528, "logps/chosen": -2.7437822818756104, "logps/rejected": -4.530940532684326, "loss": 0.4241, "rewards/accuracies": 1.0, "rewards/chosen": 1.199026346206665, "rewards/margins": 0.6381934881210327, "rewards/rejected": 0.5608328580856323, "step": 1098 }, { "epoch": 0.59, "learning_rate": 9.281540110738441e-08, "logits/chosen": -1.9923219680786133, "logits/rejected": -2.2669544219970703, "logps/chosen": -0.6541804671287537, "logps/rejected": -0.7193453311920166, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9275676608085632, "rewards/margins": 0.013028144836425781, "rewards/rejected": 0.9145395159721375, "step": 1099 }, { "epoch": 0.59, "learning_rate": 9.280035655440012e-08, "logits/chosen": -2.0097286701202393, "logits/rejected": -2.003814697265625, "logps/chosen": -16.21650505065918, "logps/rejected": -5.357418060302734, "loss": 0.3142, "rewards/accuracies": 1.0, "rewards/chosen": 1.7009257078170776, "rewards/margins": 0.9964380860328674, "rewards/rejected": 0.7044876217842102, "step": 1100 }, { "epoch": 0.59, "learning_rate": 9.2785297488181e-08, "logits/chosen": -2.0238587856292725, "logits/rejected": -2.033504009246826, "logps/chosen": -2.992835760116577, "logps/rejected": -1.839368462562561, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 1.5341953039169312, "rewards/margins": 0.8349095582962036, "rewards/rejected": 0.6992857456207275, "step": 1101 }, { "epoch": 0.59, "learning_rate": 9.277022391383346e-08, "logits/chosen": -2.0566232204437256, "logits/rejected": -2.0562167167663574, "logps/chosen": -1.2890688180923462, "logps/rejected": -1.5159956216812134, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.874844491481781, "rewards/margins": -0.012032687664031982, "rewards/rejected": 0.886877179145813, "step": 1102 }, { "epoch": 0.59, "learning_rate": 9.275513583646883e-08, "logits/chosen": -2.112727642059326, "logits/rejected": -2.1112241744995117, "logps/chosen": -7.817354202270508, "logps/rejected": -2.629420280456543, "loss": 0.3243, "rewards/accuracies": 1.0, "rewards/chosen": 1.5122705698013306, "rewards/margins": 0.959561288356781, "rewards/rejected": 0.5527092814445496, "step": 1103 }, { "epoch": 0.6, "learning_rate": 9.274003326120334e-08, "logits/chosen": -2.0127217769622803, "logits/rejected": -2.0125982761383057, "logps/chosen": -0.4678698778152466, "logps/rejected": -2.950981616973877, "loss": 0.5329, "rewards/accuracies": 1.0, "rewards/chosen": 0.863019585609436, "rewards/margins": 0.35112643241882324, "rewards/rejected": 0.5118931531906128, "step": 1104 }, { "epoch": 0.6, "learning_rate": 9.272491619315814e-08, "logits/chosen": -1.9712342023849487, "logits/rejected": -2.2168896198272705, "logps/chosen": -0.4009431302547455, "logps/rejected": -0.4331806004047394, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8835873603820801, "rewards/margins": 0.009597599506378174, "rewards/rejected": 0.8739897608757019, "step": 1105 }, { "epoch": 0.6, "learning_rate": 9.270978463745931e-08, "logits/chosen": -2.155691146850586, "logits/rejected": -2.2982237339019775, "logps/chosen": -2.4242208003997803, "logps/rejected": -2.1800031661987305, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.9598304629325867, "rewards/margins": -0.013438701629638672, "rewards/rejected": 0.9732691645622253, "step": 1106 }, { "epoch": 0.6, "learning_rate": 9.269463859923782e-08, "logits/chosen": -2.0468950271606445, "logits/rejected": -2.2136971950531006, "logps/chosen": -0.5239459276199341, "logps/rejected": -0.5638449192047119, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 1.0055745840072632, "rewards/margins": 0.022520244121551514, "rewards/rejected": 0.9830543398857117, "step": 1107 }, { "epoch": 0.6, "learning_rate": 9.267947808362955e-08, "logits/chosen": -1.9956769943237305, "logits/rejected": -2.241692066192627, "logps/chosen": -0.3936631977558136, "logps/rejected": -0.3932475745677948, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.9068540930747986, "rewards/margins": -0.001970529556274414, "rewards/rejected": 0.908824622631073, "step": 1108 }, { "epoch": 0.6, "learning_rate": 9.266430309577533e-08, "logits/chosen": -2.0933003425598145, "logits/rejected": -2.091891288757324, "logps/chosen": -1.1317309141159058, "logps/rejected": -1.684966802597046, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9316554069519043, "rewards/margins": 0.051236510276794434, "rewards/rejected": 0.8804188966751099, "step": 1109 }, { "epoch": 0.6, "learning_rate": 9.264911364082087e-08, "logits/chosen": -2.1401243209838867, "logits/rejected": -2.1405584812164307, "logps/chosen": -4.223397254943848, "logps/rejected": -3.3584635257720947, "loss": 0.48, "rewards/accuracies": 1.0, "rewards/chosen": 1.2973917722702026, "rewards/margins": 0.4843251705169678, "rewards/rejected": 0.8130666017532349, "step": 1110 }, { "epoch": 0.6, "learning_rate": 9.263390972391674e-08, "logits/chosen": -2.042858839035034, "logits/rejected": -2.0465877056121826, "logps/chosen": -5.957507610321045, "logps/rejected": -2.583625555038452, "loss": 0.5052, "rewards/accuracies": 1.0, "rewards/chosen": 1.2658623456954956, "rewards/margins": 0.4197084307670593, "rewards/rejected": 0.8461539149284363, "step": 1111 }, { "epoch": 0.6, "learning_rate": 9.261869135021849e-08, "logits/chosen": -2.1180319786071777, "logits/rejected": -2.110936164855957, "logps/chosen": -5.43225622177124, "logps/rejected": -4.574405193328857, "loss": 0.3989, "rewards/accuracies": 1.0, "rewards/chosen": 1.2955058813095093, "rewards/margins": 0.7130802273750305, "rewards/rejected": 0.5824256539344788, "step": 1112 }, { "epoch": 0.6, "learning_rate": 9.260345852488656e-08, "logits/chosen": -2.1026337146759033, "logits/rejected": -2.333176851272583, "logps/chosen": -1.1173851490020752, "logps/rejected": -1.1699028015136719, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9541775584220886, "rewards/margins": 0.020834803581237793, "rewards/rejected": 0.9333427548408508, "step": 1113 }, { "epoch": 0.6, "learning_rate": 9.258821125308622e-08, "logits/chosen": -2.166645050048828, "logits/rejected": -2.266721725463867, "logps/chosen": -2.3388895988464355, "logps/rejected": -2.365668773651123, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491060376167297, "rewards/margins": 0.027258694171905518, "rewards/rejected": 0.9218473434448242, "step": 1114 }, { "epoch": 0.6, "learning_rate": 9.257294953998774e-08, "logits/chosen": -2.254142999649048, "logits/rejected": -2.1098594665527344, "logps/chosen": -36.98430633544922, "logps/rejected": -3.336651563644409, "loss": 0.3954, "rewards/accuracies": 1.0, "rewards/chosen": 1.3012443780899048, "rewards/margins": 0.7237727046012878, "rewards/rejected": 0.5774716734886169, "step": 1115 }, { "epoch": 0.6, "learning_rate": 9.255767339076622e-08, "logits/chosen": -1.9555836915969849, "logits/rejected": -2.272268295288086, "logps/chosen": -2.0289769172668457, "logps/rejected": -2.122185468673706, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8730335235595703, "rewards/margins": 0.01988828182220459, "rewards/rejected": 0.8531452417373657, "step": 1116 }, { "epoch": 0.6, "learning_rate": 9.254238281060165e-08, "logits/chosen": -2.0500333309173584, "logits/rejected": -2.285142660140991, "logps/chosen": -2.0280213356018066, "logps/rejected": -2.058443307876587, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.9126524329185486, "rewards/margins": -0.0017838478088378906, "rewards/rejected": 0.9144362807273865, "step": 1117 }, { "epoch": 0.6, "learning_rate": 9.252707780467894e-08, "logits/chosen": -1.9553911685943604, "logits/rejected": -1.9604647159576416, "logps/chosen": -2.0169856548309326, "logps/rejected": -3.0502278804779053, "loss": 0.4783, "rewards/accuracies": 1.0, "rewards/chosen": 1.0845404863357544, "rewards/margins": 0.4888726472854614, "rewards/rejected": 0.595667839050293, "step": 1118 }, { "epoch": 0.6, "learning_rate": 9.251175837818793e-08, "logits/chosen": -1.9871159791946411, "logits/rejected": -1.9896970987319946, "logps/chosen": -1.6221225261688232, "logps/rejected": -2.706002712249756, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 1.0945581197738647, "rewards/margins": 0.4198256731033325, "rewards/rejected": 0.6747324466705322, "step": 1119 }, { "epoch": 0.6, "learning_rate": 9.249642453632325e-08, "logits/chosen": -2.1438794136047363, "logits/rejected": -2.302311420440674, "logps/chosen": -1.3523149490356445, "logps/rejected": -1.3689002990722656, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.8643075823783875, "rewards/margins": -0.016798794269561768, "rewards/rejected": 0.8811063766479492, "step": 1120 }, { "epoch": 0.6, "learning_rate": 9.24810762842845e-08, "logits/chosen": -2.1159231662750244, "logits/rejected": -2.123131275177002, "logps/chosen": -1.9602924585342407, "logps/rejected": -4.034993648529053, "loss": 0.4008, "rewards/accuracies": 1.0, "rewards/chosen": 1.302490472793579, "rewards/margins": 0.7072350382804871, "rewards/rejected": 0.595255434513092, "step": 1121 }, { "epoch": 0.61, "learning_rate": 9.246571362727613e-08, "logits/chosen": -2.0924367904663086, "logits/rejected": -2.250040054321289, "logps/chosen": -0.6355730891227722, "logps/rejected": -0.5958875417709351, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.904570996761322, "rewards/margins": 0.033480286598205566, "rewards/rejected": 0.8710907101631165, "step": 1122 }, { "epoch": 0.61, "learning_rate": 9.245033657050751e-08, "logits/chosen": -2.039200782775879, "logits/rejected": -2.0405328273773193, "logps/chosen": -1.0814539194107056, "logps/rejected": -3.7906625270843506, "loss": 0.6191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9685015678405762, "rewards/margins": 0.15407902002334595, "rewards/rejected": 0.8144225478172302, "step": 1123 }, { "epoch": 0.61, "learning_rate": 9.243494511919285e-08, "logits/chosen": -2.0423269271850586, "logits/rejected": -2.300097942352295, "logps/chosen": -8.838254928588867, "logps/rejected": -3.2706401348114014, "loss": 0.7285, "rewards/accuracies": 0.0, "rewards/chosen": 0.8653125762939453, "rewards/margins": -0.06947964429855347, "rewards/rejected": 0.9347922205924988, "step": 1124 }, { "epoch": 0.61, "learning_rate": 9.241953927855126e-08, "logits/chosen": -1.9873778820037842, "logits/rejected": -1.989731788635254, "logps/chosen": -1.7310761213302612, "logps/rejected": -4.524502277374268, "loss": 0.434, "rewards/accuracies": 1.0, "rewards/chosen": 1.0392708778381348, "rewards/margins": 0.6099423170089722, "rewards/rejected": 0.429328590631485, "step": 1125 }, { "epoch": 0.61, "learning_rate": 9.240411905380674e-08, "logits/chosen": -2.0594050884246826, "logits/rejected": -2.2455027103424072, "logps/chosen": -0.8787940740585327, "logps/rejected": -0.8058642745018005, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.871073842048645, "rewards/margins": 0.012790977954864502, "rewards/rejected": 0.8582828640937805, "step": 1126 }, { "epoch": 0.61, "learning_rate": 9.238868445018815e-08, "logits/chosen": -2.1329290866851807, "logits/rejected": -2.307981491088867, "logps/chosen": -2.0388238430023193, "logps/rejected": -4.934920310974121, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.8033124208450317, "rewards/margins": 0.006332814693450928, "rewards/rejected": 0.7969796061515808, "step": 1127 }, { "epoch": 0.61, "learning_rate": 9.237323547292922e-08, "logits/chosen": -2.0536510944366455, "logits/rejected": -2.059706926345825, "logps/chosen": -2.8494346141815186, "logps/rejected": -3.0326499938964844, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 1.312369704246521, "rewards/margins": 0.6019192337989807, "rewards/rejected": 0.7104504704475403, "step": 1128 }, { "epoch": 0.61, "learning_rate": 9.235777212726859e-08, "logits/chosen": -2.0036308765411377, "logits/rejected": -2.0081419944763184, "logps/chosen": -1.6466618776321411, "logps/rejected": -2.9182076454162598, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 0.9722160696983337, "rewards/margins": 0.4095364212989807, "rewards/rejected": 0.562679648399353, "step": 1129 }, { "epoch": 0.61, "learning_rate": 9.234229441844972e-08, "logits/chosen": -2.01566219329834, "logits/rejected": -2.018714189529419, "logps/chosen": -0.8474517464637756, "logps/rejected": -2.635709762573242, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 0.8509919047355652, "rewards/margins": 0.3328746557235718, "rewards/rejected": 0.5181172490119934, "step": 1130 }, { "epoch": 0.61, "learning_rate": 9.232680235172098e-08, "logits/chosen": -1.9585869312286377, "logits/rejected": -1.961079716682434, "logps/chosen": -0.6907652616500854, "logps/rejected": -5.166512966156006, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 0.7838110327720642, "rewards/margins": 0.279742956161499, "rewards/rejected": 0.5040680766105652, "step": 1131 }, { "epoch": 0.61, "learning_rate": 9.23112959323356e-08, "logits/chosen": -2.061927318572998, "logits/rejected": -2.2945003509521484, "logps/chosen": -1.7488120794296265, "logps/rejected": -1.5614418983459473, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.7524883151054382, "rewards/margins": 0.017092227935791016, "rewards/rejected": 0.7353960871696472, "step": 1132 }, { "epoch": 0.61, "learning_rate": 9.229577516555169e-08, "logits/chosen": -2.068634510040283, "logits/rejected": -2.2489514350891113, "logps/chosen": -2.7601873874664307, "logps/rejected": -7.665463924407959, "loss": 0.7041, "rewards/accuracies": 0.0, "rewards/chosen": 1.087224006652832, "rewards/margins": -0.02168905735015869, "rewards/rejected": 1.1089130640029907, "step": 1133 }, { "epoch": 0.61, "learning_rate": 9.228024005663219e-08, "logits/chosen": -2.078186273574829, "logits/rejected": -1.953642725944519, "logps/chosen": -24.72369384765625, "logps/rejected": -8.768508911132812, "loss": 0.553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0949352979660034, "rewards/margins": 0.3031765818595886, "rewards/rejected": 0.7917587161064148, "step": 1134 }, { "epoch": 0.61, "learning_rate": 9.226469061084492e-08, "logits/chosen": -2.013162851333618, "logits/rejected": -2.2040982246398926, "logps/chosen": -1.6195778846740723, "logps/rejected": -1.7025022506713867, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8302536010742188, "rewards/margins": 0.005467414855957031, "rewards/rejected": 0.8247861862182617, "step": 1135 }, { "epoch": 0.61, "learning_rate": 9.224912683346258e-08, "logits/chosen": -2.0218658447265625, "logits/rejected": -2.2601852416992188, "logps/chosen": -2.5832178592681885, "logps/rejected": -2.6131677627563477, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.8342675566673279, "rewards/margins": -0.007762432098388672, "rewards/rejected": 0.8420299887657166, "step": 1136 }, { "epoch": 0.61, "learning_rate": 9.223354872976268e-08, "logits/chosen": -2.0235352516174316, "logits/rejected": -1.9769647121429443, "logps/chosen": -12.010660171508789, "logps/rejected": -7.983924865722656, "loss": 0.5541, "rewards/accuracies": 1.0, "rewards/chosen": 1.0906645059585571, "rewards/margins": 0.3005968928337097, "rewards/rejected": 0.7900676131248474, "step": 1137 }, { "epoch": 0.61, "learning_rate": 9.221795630502767e-08, "logits/chosen": -1.9994810819625854, "logits/rejected": -2.2040011882781982, "logps/chosen": -1.3473656177520752, "logps/rejected": -1.3509654998779297, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8281266093254089, "rewards/margins": 0.004662215709686279, "rewards/rejected": 0.8234643936157227, "step": 1138 }, { "epoch": 0.61, "learning_rate": 9.220234956454478e-08, "logits/chosen": -2.082658290863037, "logits/rejected": -2.0852253437042236, "logps/chosen": -1.1711961030960083, "logps/rejected": -2.760561466217041, "loss": 0.5264, "rewards/accuracies": 1.0, "rewards/chosen": 1.030593752861023, "rewards/margins": 0.36690014600753784, "rewards/rejected": 0.6636936068534851, "step": 1139 }, { "epoch": 0.61, "learning_rate": 9.218672851360611e-08, "logits/chosen": -2.0441973209381104, "logits/rejected": -2.0530993938446045, "logps/chosen": -2.118642807006836, "logps/rejected": -2.5536067485809326, "loss": 0.5146, "rewards/accuracies": 1.0, "rewards/chosen": 1.0596050024032593, "rewards/margins": 0.39615654945373535, "rewards/rejected": 0.6634484529495239, "step": 1140 }, { "epoch": 0.62, "learning_rate": 9.217109315750867e-08, "logits/chosen": -1.9687626361846924, "logits/rejected": -1.9672205448150635, "logps/chosen": -7.480353355407715, "logps/rejected": -2.24106764793396, "loss": 0.3713, "rewards/accuracies": 1.0, "rewards/chosen": 1.499838948249817, "rewards/margins": 0.7992845773696899, "rewards/rejected": 0.700554370880127, "step": 1141 }, { "epoch": 0.62, "learning_rate": 9.215544350155421e-08, "logits/chosen": -2.0383238792419434, "logits/rejected": -2.2471728324890137, "logps/chosen": -0.7277504801750183, "logps/rejected": -0.6796770095825195, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8015437126159668, "rewards/margins": 0.0028971433639526367, "rewards/rejected": 0.7986465692520142, "step": 1142 }, { "epoch": 0.62, "learning_rate": 9.213977955104946e-08, "logits/chosen": -2.036102533340454, "logits/rejected": -2.2444164752960205, "logps/chosen": -3.186339855194092, "logps/rejected": -3.141252040863037, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.784481942653656, "rewards/margins": 0.018869876861572266, "rewards/rejected": 0.7656120657920837, "step": 1143 }, { "epoch": 0.62, "learning_rate": 9.21241013113059e-08, "logits/chosen": -2.1182146072387695, "logits/rejected": -2.290590763092041, "logps/chosen": -8.695240020751953, "logps/rejected": -8.474764823913574, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 1.0284849405288696, "rewards/margins": 0.027272939682006836, "rewards/rejected": 1.0012120008468628, "step": 1144 }, { "epoch": 0.62, "learning_rate": 9.210840878763987e-08, "logits/chosen": -2.1439478397369385, "logits/rejected": -2.247133731842041, "logps/chosen": -2.081932306289673, "logps/rejected": -1.9018590450286865, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.832148015499115, "rewards/margins": 0.014913797378540039, "rewards/rejected": 0.817234218120575, "step": 1145 }, { "epoch": 0.62, "learning_rate": 9.20927019853726e-08, "logits/chosen": -1.9442777633666992, "logits/rejected": -1.917506456375122, "logps/chosen": -15.199264526367188, "logps/rejected": -3.811309337615967, "loss": 0.4279, "rewards/accuracies": 1.0, "rewards/chosen": 1.0988506078720093, "rewards/margins": 0.6274373531341553, "rewards/rejected": 0.4714132845401764, "step": 1146 }, { "epoch": 0.62, "learning_rate": 9.207698090983012e-08, "logits/chosen": -2.1063079833984375, "logits/rejected": -2.305912971496582, "logps/chosen": -1.2572985887527466, "logps/rejected": -1.1970912218093872, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.9373872876167297, "rewards/margins": 0.010681629180908203, "rewards/rejected": 0.9267056584358215, "step": 1147 }, { "epoch": 0.62, "learning_rate": 9.206124556634332e-08, "logits/chosen": -2.0693557262420654, "logits/rejected": -2.0967929363250732, "logps/chosen": -5.109064102172852, "logps/rejected": -5.781940460205078, "loss": 0.4276, "rewards/accuracies": 1.0, "rewards/chosen": 1.4360086917877197, "rewards/margins": 0.6282812356948853, "rewards/rejected": 0.8077274560928345, "step": 1148 }, { "epoch": 0.62, "learning_rate": 9.204549596024791e-08, "logits/chosen": -2.010889768600464, "logits/rejected": -2.203402042388916, "logps/chosen": -0.688818633556366, "logps/rejected": -0.7927802801132202, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9267629981040955, "rewards/margins": 0.003189980983734131, "rewards/rejected": 0.9235730171203613, "step": 1149 }, { "epoch": 0.62, "learning_rate": 9.202973209688447e-08, "logits/chosen": -2.1007747650146484, "logits/rejected": -2.0998713970184326, "logps/chosen": -1.4982041120529175, "logps/rejected": -3.637587308883667, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 0.8820120692253113, "rewards/margins": 0.35547852516174316, "rewards/rejected": 0.5265335440635681, "step": 1150 }, { "epoch": 0.62, "learning_rate": 9.201395398159835e-08, "logits/chosen": -2.12618088722229, "logits/rejected": -2.131220579147339, "logps/chosen": -1.3789958953857422, "logps/rejected": -2.260714292526245, "loss": 0.5216, "rewards/accuracies": 1.0, "rewards/chosen": 0.9055486917495728, "rewards/margins": 0.3786763548851013, "rewards/rejected": 0.5268723368644714, "step": 1151 }, { "epoch": 0.62, "learning_rate": 9.199816161973981e-08, "logits/chosen": -1.9343961477279663, "logits/rejected": -1.9298394918441772, "logps/chosen": -3.9491236209869385, "logps/rejected": -5.051234245300293, "loss": 0.5822, "rewards/accuracies": 1.0, "rewards/chosen": 0.7348044514656067, "rewards/margins": 0.2356908917427063, "rewards/rejected": 0.4991135597229004, "step": 1152 }, { "epoch": 0.62, "learning_rate": 9.198235501666388e-08, "logits/chosen": -2.070737838745117, "logits/rejected": -2.0794427394866943, "logps/chosen": -2.0988974571228027, "logps/rejected": -2.753188133239746, "loss": 0.5087, "rewards/accuracies": 1.0, "rewards/chosen": 0.9018970727920532, "rewards/margins": 0.41080090403556824, "rewards/rejected": 0.491096168756485, "step": 1153 }, { "epoch": 0.62, "learning_rate": 9.196653417773046e-08, "logits/chosen": -1.911850094795227, "logits/rejected": -2.228426694869995, "logps/chosen": -0.4591144323348999, "logps/rejected": -0.5247971415519714, "loss": 0.6717, "rewards/accuracies": 1.0, "rewards/chosen": 0.8574259877204895, "rewards/margins": 0.04328644275665283, "rewards/rejected": 0.8141395449638367, "step": 1154 }, { "epoch": 0.62, "learning_rate": 9.195069910830426e-08, "logits/chosen": -1.975893497467041, "logits/rejected": -2.220322370529175, "logps/chosen": -1.2483482360839844, "logps/rejected": -1.3304789066314697, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9621648192405701, "rewards/margins": 0.004332065582275391, "rewards/rejected": 0.9578327536582947, "step": 1155 }, { "epoch": 0.62, "learning_rate": 9.193484981375481e-08, "logits/chosen": -2.026552438735962, "logits/rejected": -2.2548129558563232, "logps/chosen": -0.367880254983902, "logps/rejected": -0.4304412007331848, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.970617413520813, "rewards/margins": 0.0044165849685668945, "rewards/rejected": 0.9662008285522461, "step": 1156 }, { "epoch": 0.62, "learning_rate": 9.191898629945647e-08, "logits/chosen": -2.026672840118408, "logits/rejected": -2.0199248790740967, "logps/chosen": -5.773597717285156, "logps/rejected": -6.673874378204346, "loss": 0.3477, "rewards/accuracies": 1.0, "rewards/chosen": 1.3873471021652222, "rewards/margins": 0.8775593638420105, "rewards/rejected": 0.5097877383232117, "step": 1157 }, { "epoch": 0.62, "learning_rate": 9.190310857078843e-08, "logits/chosen": -2.09538197517395, "logits/rejected": -2.2411282062530518, "logps/chosen": -1.6145333051681519, "logps/rejected": -1.5595661401748657, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.04190194606781, "rewards/margins": 0.015672683715820312, "rewards/rejected": 1.0262292623519897, "step": 1158 }, { "epoch": 0.63, "learning_rate": 9.188721663313469e-08, "logits/chosen": -2.2061774730682373, "logits/rejected": -2.0369229316711426, "logps/chosen": -70.49742126464844, "logps/rejected": -0.3907724618911743, "loss": 0.4203, "rewards/accuracies": 1.0, "rewards/chosen": 1.4916671514511108, "rewards/margins": 0.6492570638656616, "rewards/rejected": 0.8424100875854492, "step": 1159 }, { "epoch": 0.63, "learning_rate": 9.187131049188407e-08, "logits/chosen": -2.114795684814453, "logits/rejected": -2.1148760318756104, "logps/chosen": -4.925068378448486, "logps/rejected": -4.2806243896484375, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.0364124774932861, "rewards/margins": 0.27578598260879517, "rewards/rejected": 0.760626494884491, "step": 1160 }, { "epoch": 0.63, "learning_rate": 9.18553901524302e-08, "logits/chosen": -2.025438070297241, "logits/rejected": -2.022355318069458, "logps/chosen": -4.532875061035156, "logps/rejected": -7.675829887390137, "loss": 0.4876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0456069707870483, "rewards/margins": 0.464682936668396, "rewards/rejected": 0.5809240341186523, "step": 1161 }, { "epoch": 0.63, "learning_rate": 9.183945562017152e-08, "logits/chosen": -2.044297933578491, "logits/rejected": -2.0505282878875732, "logps/chosen": -3.305629253387451, "logps/rejected": -1.3106646537780762, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 1.2899357080459595, "rewards/margins": 0.5489845871925354, "rewards/rejected": 0.7409511208534241, "step": 1162 }, { "epoch": 0.63, "learning_rate": 9.182350690051132e-08, "logits/chosen": -2.0722508430480957, "logits/rejected": -2.2808923721313477, "logps/chosen": -0.6409826874732971, "logps/rejected": -0.7297218441963196, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.8373495936393738, "rewards/margins": 0.013362765312194824, "rewards/rejected": 0.823986828327179, "step": 1163 }, { "epoch": 0.63, "learning_rate": 9.180754399885768e-08, "logits/chosen": -2.139092445373535, "logits/rejected": -2.2962679862976074, "logps/chosen": -0.8221272230148315, "logps/rejected": -0.8783930540084839, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 1.0823038816452026, "rewards/margins": 0.01017308235168457, "rewards/rejected": 1.072130799293518, "step": 1164 }, { "epoch": 0.63, "learning_rate": 9.179156692062345e-08, "logits/chosen": -1.9817570447921753, "logits/rejected": -2.2593581676483154, "logps/chosen": -10.283288955688477, "logps/rejected": -8.310168266296387, "loss": 0.7482, "rewards/accuracies": 0.0, "rewards/chosen": 0.8125039935112, "rewards/margins": -0.10726958513259888, "rewards/rejected": 0.9197735786437988, "step": 1165 }, { "epoch": 0.63, "learning_rate": 9.177557567122633e-08, "logits/chosen": -2.2007896900177, "logits/rejected": -2.292813539505005, "logps/chosen": -2.32881760597229, "logps/rejected": -2.4068989753723145, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9584550857543945, "rewards/margins": 0.017201244831085205, "rewards/rejected": 0.9412538409233093, "step": 1166 }, { "epoch": 0.63, "learning_rate": 9.175957025608884e-08, "logits/chosen": -2.105957508087158, "logits/rejected": -2.0461580753326416, "logps/chosen": -33.59214782714844, "logps/rejected": -2.0999362468719482, "loss": 0.4026, "rewards/accuracies": 1.0, "rewards/chosen": 1.5052727460861206, "rewards/margins": 0.7017354369163513, "rewards/rejected": 0.8035373091697693, "step": 1167 }, { "epoch": 0.63, "learning_rate": 9.174355068063826e-08, "logits/chosen": -1.948043704032898, "logits/rejected": -2.2350428104400635, "logps/chosen": -1.6993460655212402, "logps/rejected": -1.574588656425476, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.9313749670982361, "rewards/margins": -0.01046532392501831, "rewards/rejected": 0.9418402910232544, "step": 1168 }, { "epoch": 0.63, "learning_rate": 9.172751695030669e-08, "logits/chosen": -2.030217409133911, "logits/rejected": -2.2722272872924805, "logps/chosen": -0.6241111755371094, "logps/rejected": -0.6419647932052612, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 0.6812929511070251, "rewards/margins": 0.04076540470123291, "rewards/rejected": 0.6405275464057922, "step": 1169 }, { "epoch": 0.63, "learning_rate": 9.171146907053102e-08, "logits/chosen": -2.01649808883667, "logits/rejected": -2.2291808128356934, "logps/chosen": -2.017188787460327, "logps/rejected": -2.0820555686950684, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 1.1536587476730347, "rewards/margins": 0.01785135269165039, "rewards/rejected": 1.1358073949813843, "step": 1170 }, { "epoch": 0.63, "learning_rate": 9.169540704675298e-08, "logits/chosen": -2.008697986602783, "logits/rejected": -2.231396436691284, "logps/chosen": -6.229551315307617, "logps/rejected": -1.7111802101135254, "loss": 0.816, "rewards/accuracies": 0.0, "rewards/chosen": 0.6318182349205017, "rewards/margins": -0.2322295904159546, "rewards/rejected": 0.8640478253364563, "step": 1171 }, { "epoch": 0.63, "learning_rate": 9.167933088441904e-08, "logits/chosen": -2.0813076496124268, "logits/rejected": -2.2787067890167236, "logps/chosen": -0.522973358631134, "logps/rejected": -0.5994588732719421, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.7990631461143494, "rewards/margins": 0.0070002079010009766, "rewards/rejected": 0.7920629382133484, "step": 1172 }, { "epoch": 0.63, "learning_rate": 9.16632405889805e-08, "logits/chosen": -2.004141092300415, "logits/rejected": -2.0075628757476807, "logps/chosen": -2.3762307167053223, "logps/rejected": -3.264315605163574, "loss": 0.5363, "rewards/accuracies": 1.0, "rewards/chosen": 0.9056358337402344, "rewards/margins": 0.34305763244628906, "rewards/rejected": 0.5625782012939453, "step": 1173 }, { "epoch": 0.63, "learning_rate": 9.164713616589343e-08, "logits/chosen": -2.16198468208313, "logits/rejected": -2.1532585620880127, "logps/chosen": -3.4769957065582275, "logps/rejected": -2.7774362564086914, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.118661880493164, "rewards/margins": 0.5236265063285828, "rewards/rejected": 0.5950353741645813, "step": 1174 }, { "epoch": 0.63, "learning_rate": 9.163101762061873e-08, "logits/chosen": -2.0713095664978027, "logits/rejected": -2.2910542488098145, "logps/chosen": -1.8157285451889038, "logps/rejected": -2.032484531402588, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.5632159113883972, "rewards/margins": 0.03176194429397583, "rewards/rejected": 0.5314539670944214, "step": 1175 }, { "epoch": 0.63, "learning_rate": 9.161488495862201e-08, "logits/chosen": -2.1989285945892334, "logits/rejected": -2.2239878177642822, "logps/chosen": -1.823439359664917, "logps/rejected": -1.8577395677566528, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8419463038444519, "rewards/margins": 0.026871204376220703, "rewards/rejected": 0.8150750994682312, "step": 1176 }, { "epoch": 0.63, "learning_rate": 9.159873818537378e-08, "logits/chosen": -2.0919458866119385, "logits/rejected": -2.097583532333374, "logps/chosen": -4.19831657409668, "logps/rejected": -2.9860730171203613, "loss": 0.4788, "rewards/accuracies": 1.0, "rewards/chosen": 1.1741905212402344, "rewards/margins": 0.48751479387283325, "rewards/rejected": 0.6866757273674011, "step": 1177 }, { "epoch": 0.64, "learning_rate": 9.158257730634923e-08, "logits/chosen": -2.038179397583008, "logits/rejected": -2.047645330429077, "logps/chosen": -8.767594337463379, "logps/rejected": -2.4846272468566895, "loss": 0.563, "rewards/accuracies": 1.0, "rewards/chosen": 1.0225909948349, "rewards/margins": 0.2798392176628113, "rewards/rejected": 0.7427517771720886, "step": 1178 }, { "epoch": 0.64, "learning_rate": 9.156640232702839e-08, "logits/chosen": -1.9809043407440186, "logits/rejected": -2.258690357208252, "logps/chosen": -1.4485772848129272, "logps/rejected": -1.4842582941055298, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0428448915481567, "rewards/margins": 0.0023560523986816406, "rewards/rejected": 1.040488839149475, "step": 1179 }, { "epoch": 0.64, "learning_rate": 9.155021325289605e-08, "logits/chosen": -2.227437973022461, "logits/rejected": -2.2186923027038574, "logps/chosen": -1.700968623161316, "logps/rejected": -3.33217716217041, "loss": 0.5135, "rewards/accuracies": 1.0, "rewards/chosen": 0.9340749979019165, "rewards/margins": 0.3988313674926758, "rewards/rejected": 0.5352436304092407, "step": 1180 }, { "epoch": 0.64, "learning_rate": 9.15340100894418e-08, "logits/chosen": -1.9251993894577026, "logits/rejected": -1.9401575326919556, "logps/chosen": -4.348881244659424, "logps/rejected": -5.709934711456299, "loss": 0.6244, "rewards/accuracies": 1.0, "rewards/chosen": 0.934583306312561, "rewards/margins": 0.1425434947013855, "rewards/rejected": 0.7920398116111755, "step": 1181 }, { "epoch": 0.64, "learning_rate": 9.151779284215997e-08, "logits/chosen": -2.045931100845337, "logits/rejected": -2.349564552307129, "logps/chosen": -2.8280768394470215, "logps/rejected": -2.9791617393493652, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.9190580248832703, "rewards/margins": 0.0074596405029296875, "rewards/rejected": 0.9115983843803406, "step": 1182 }, { "epoch": 0.64, "learning_rate": 9.15015615165497e-08, "logits/chosen": -2.005117654800415, "logits/rejected": -2.3771088123321533, "logps/chosen": -0.6753467917442322, "logps/rejected": -31.39992904663086, "loss": 0.5253, "rewards/accuracies": 1.0, "rewards/chosen": 1.0348622798919678, "rewards/margins": 0.36958593130111694, "rewards/rejected": 0.6652763485908508, "step": 1183 }, { "epoch": 0.64, "learning_rate": 9.148531611811493e-08, "logits/chosen": -2.011270523071289, "logits/rejected": -2.0041427612304688, "logps/chosen": -7.144445896148682, "logps/rejected": -2.8987514972686768, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 1.1777256727218628, "rewards/margins": 0.4983104467391968, "rewards/rejected": 0.679415225982666, "step": 1184 }, { "epoch": 0.64, "learning_rate": 9.146905665236429e-08, "logits/chosen": -1.9706345796585083, "logits/rejected": -2.3200950622558594, "logps/chosen": -5.109609127044678, "logps/rejected": -5.392942428588867, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.8995195627212524, "rewards/margins": 0.01475745439529419, "rewards/rejected": 0.8847621083259583, "step": 1185 }, { "epoch": 0.64, "learning_rate": 9.145278312481124e-08, "logits/chosen": -2.1359617710113525, "logits/rejected": -2.1413023471832275, "logps/chosen": -2.57967209815979, "logps/rejected": -1.8073538541793823, "loss": 0.6351, "rewards/accuracies": 1.0, "rewards/chosen": 0.9924975633621216, "rewards/margins": 0.11973696947097778, "rewards/rejected": 0.8727605938911438, "step": 1186 }, { "epoch": 0.64, "learning_rate": 9.143649554097398e-08, "logits/chosen": -2.054823875427246, "logits/rejected": -2.0453968048095703, "logps/chosen": -6.401223182678223, "logps/rejected": -1.8806297779083252, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 1.4006679058074951, "rewards/margins": 0.4984070062637329, "rewards/rejected": 0.9022608995437622, "step": 1187 }, { "epoch": 0.64, "learning_rate": 9.142019390637553e-08, "logits/chosen": -2.1235806941986084, "logits/rejected": -2.287937641143799, "logps/chosen": -7.309934616088867, "logps/rejected": -6.223628997802734, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.7094667553901672, "rewards/margins": -0.010714709758758545, "rewards/rejected": 0.7201814651489258, "step": 1188 }, { "epoch": 0.64, "learning_rate": 9.140387822654359e-08, "logits/chosen": -2.0815951824188232, "logits/rejected": -2.2318577766418457, "logps/chosen": -0.8497703075408936, "logps/rejected": -0.8779521584510803, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.8331364989280701, "rewards/margins": 0.010517597198486328, "rewards/rejected": 0.8226189017295837, "step": 1189 }, { "epoch": 0.64, "learning_rate": 9.13875485070107e-08, "logits/chosen": -2.0951545238494873, "logits/rejected": -2.128809690475464, "logps/chosen": -4.392993450164795, "logps/rejected": -7.253872871398926, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.848253071308136, "rewards/margins": 0.018592536449432373, "rewards/rejected": 0.8296605348587036, "step": 1190 }, { "epoch": 0.64, "learning_rate": 9.13712047533141e-08, "logits/chosen": -2.0459861755371094, "logits/rejected": -2.277784585952759, "logps/chosen": -0.801207959651947, "logps/rejected": -0.9057971835136414, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0380101203918457, "rewards/margins": 0.002061009407043457, "rewards/rejected": 1.0359491109848022, "step": 1191 }, { "epoch": 0.64, "learning_rate": 9.135484697099585e-08, "logits/chosen": -2.0118069648742676, "logits/rejected": -1.996134877204895, "logps/chosen": -8.144946098327637, "logps/rejected": -4.37446928024292, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 1.2545182704925537, "rewards/margins": 0.722655177116394, "rewards/rejected": 0.5318630933761597, "step": 1192 }, { "epoch": 0.64, "learning_rate": 9.13384751656027e-08, "logits/chosen": -2.0194482803344727, "logits/rejected": -2.1881959438323975, "logps/chosen": -2.5739099979400635, "logps/rejected": -2.289835214614868, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.5535281300544739, "rewards/margins": 0.012343108654022217, "rewards/rejected": 0.5411850214004517, "step": 1193 }, { "epoch": 0.64, "learning_rate": 9.132208934268622e-08, "logits/chosen": -2.1400909423828125, "logits/rejected": -2.1402928829193115, "logps/chosen": -1.9143927097320557, "logps/rejected": -5.850518703460693, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9986152648925781, "rewards/margins": 0.0498010516166687, "rewards/rejected": 0.9488142132759094, "step": 1194 }, { "epoch": 0.64, "learning_rate": 9.130568950780268e-08, "logits/chosen": -1.9902263879776, "logits/rejected": -1.9862014055252075, "logps/chosen": -4.173974990844727, "logps/rejected": -3.6006712913513184, "loss": 0.382, "rewards/accuracies": 1.0, "rewards/chosen": 1.3225332498550415, "rewards/margins": 0.7651992440223694, "rewards/rejected": 0.5573340058326721, "step": 1195 }, { "epoch": 0.65, "learning_rate": 9.128927566651314e-08, "logits/chosen": -2.0007739067077637, "logits/rejected": -2.255218505859375, "logps/chosen": -0.6703810095787048, "logps/rejected": -0.675180971622467, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": 0.9848087430000305, "rewards/margins": 0.04201698303222656, "rewards/rejected": 0.942791759967804, "step": 1196 }, { "epoch": 0.65, "learning_rate": 9.127284782438335e-08, "logits/chosen": -2.0403945446014404, "logits/rejected": -2.2542762756347656, "logps/chosen": -0.8836520910263062, "logps/rejected": -0.9361611008644104, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.7077285051345825, "rewards/margins": 0.016372263431549072, "rewards/rejected": 0.6913562417030334, "step": 1197 }, { "epoch": 0.65, "learning_rate": 9.125640598698393e-08, "logits/chosen": -2.0400350093841553, "logits/rejected": -2.2850341796875, "logps/chosen": -4.471646308898926, "logps/rejected": -4.792056560516357, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.6319880485534668, "rewards/margins": 0.009938299655914307, "rewards/rejected": 0.6220497488975525, "step": 1198 }, { "epoch": 0.65, "learning_rate": 9.12399501598901e-08, "logits/chosen": -1.9356240034103394, "logits/rejected": -2.2122323513031006, "logps/chosen": -0.733089804649353, "logps/rejected": -0.7749875783920288, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.8960745930671692, "rewards/margins": -0.026925206184387207, "rewards/rejected": 0.9229997992515564, "step": 1199 }, { "epoch": 0.65, "learning_rate": 9.122348034868192e-08, "logits/chosen": -2.132422924041748, "logits/rejected": -2.2964706420898438, "logps/chosen": -0.5249032974243164, "logps/rejected": -0.5198979377746582, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8340606093406677, "rewards/margins": 0.008711576461791992, "rewards/rejected": 0.8253490328788757, "step": 1200 }, { "epoch": 0.65, "learning_rate": 9.120699655894415e-08, "logits/chosen": -2.043489694595337, "logits/rejected": -2.049856185913086, "logps/chosen": -0.6753057241439819, "logps/rejected": -5.438624858856201, "loss": 0.4614, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550483822822571, "rewards/margins": 0.5339833498001099, "rewards/rejected": 0.42106500267982483, "step": 1201 }, { "epoch": 0.65, "learning_rate": 9.119049879626632e-08, "logits/chosen": -2.0493218898773193, "logits/rejected": -2.2611258029937744, "logps/chosen": -1.1111516952514648, "logps/rejected": -1.0660278797149658, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.870520293712616, "rewards/margins": 0.021568655967712402, "rewards/rejected": 0.8489516377449036, "step": 1202 }, { "epoch": 0.65, "learning_rate": 9.117398706624265e-08, "logits/chosen": -2.0153915882110596, "logits/rejected": -2.2048468589782715, "logps/chosen": -2.5487139225006104, "logps/rejected": -2.2970333099365234, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.6939089894294739, "rewards/margins": 0.0007672309875488281, "rewards/rejected": 0.693141758441925, "step": 1203 }, { "epoch": 0.65, "learning_rate": 9.115746137447216e-08, "logits/chosen": -2.0840752124786377, "logits/rejected": -2.0336713790893555, "logps/chosen": -34.94575500488281, "logps/rejected": -3.7348198890686035, "loss": 0.487, "rewards/accuracies": 1.0, "rewards/chosen": 1.0801597833633423, "rewards/margins": 0.4661557674407959, "rewards/rejected": 0.6140040159225464, "step": 1204 }, { "epoch": 0.65, "learning_rate": 9.114092172655854e-08, "logits/chosen": -2.104428291320801, "logits/rejected": -2.028925657272339, "logps/chosen": -10.510746002197266, "logps/rejected": -1.4384372234344482, "loss": 0.5418, "rewards/accuracies": 1.0, "rewards/chosen": 1.2533034086227417, "rewards/margins": 0.32965826988220215, "rewards/rejected": 0.9236451387405396, "step": 1205 }, { "epoch": 0.65, "learning_rate": 9.112436812811027e-08, "logits/chosen": -2.0181736946105957, "logits/rejected": -2.2937679290771484, "logps/chosen": -1.5594464540481567, "logps/rejected": -1.4307612180709839, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0731343030929565, "rewards/margins": 0.013626813888549805, "rewards/rejected": 1.0595074892044067, "step": 1206 }, { "epoch": 0.65, "learning_rate": 9.11078005847405e-08, "logits/chosen": -2.12935209274292, "logits/rejected": -2.2333340644836426, "logps/chosen": -7.23068380355835, "logps/rejected": -5.312969207763672, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.8406143188476562, "rewards/margins": 0.04612940549850464, "rewards/rejected": 0.7944849133491516, "step": 1207 }, { "epoch": 0.65, "learning_rate": 9.109121910206717e-08, "logits/chosen": -2.07149338722229, "logits/rejected": -2.0686352252960205, "logps/chosen": -4.42755126953125, "logps/rejected": -2.0279996395111084, "loss": 0.5879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0335071086883545, "rewards/margins": 0.22281843423843384, "rewards/rejected": 0.8106886744499207, "step": 1208 }, { "epoch": 0.65, "learning_rate": 9.107462368571291e-08, "logits/chosen": -2.0672497749328613, "logits/rejected": -2.211012363433838, "logps/chosen": -1.1247869729995728, "logps/rejected": -1.1849173307418823, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8944416046142578, "rewards/margins": 0.02256840467453003, "rewards/rejected": 0.8718731999397278, "step": 1209 }, { "epoch": 0.65, "learning_rate": 9.105801434130509e-08, "logits/chosen": -1.977967619895935, "logits/rejected": -1.9860340356826782, "logps/chosen": -3.139313220977783, "logps/rejected": -4.150257110595703, "loss": 0.4425, "rewards/accuracies": 1.0, "rewards/chosen": 1.32778799533844, "rewards/margins": 0.5860101580619812, "rewards/rejected": 0.7417778372764587, "step": 1210 }, { "epoch": 0.65, "learning_rate": 9.10413910744758e-08, "logits/chosen": -2.093391180038452, "logits/rejected": -1.99562406539917, "logps/chosen": -46.05775451660156, "logps/rejected": -3.4930171966552734, "loss": 0.3751, "rewards/accuracies": 1.0, "rewards/chosen": 1.502465844154358, "rewards/margins": 0.7871271371841431, "rewards/rejected": 0.7153387069702148, "step": 1211 }, { "epoch": 0.65, "learning_rate": 9.102475389086183e-08, "logits/chosen": -2.163715362548828, "logits/rejected": -2.275505542755127, "logps/chosen": -0.5286909341812134, "logps/rejected": -0.5410433411598206, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 1.0340572595596313, "rewards/margins": 0.01948726177215576, "rewards/rejected": 1.0145699977874756, "step": 1212 }, { "epoch": 0.65, "learning_rate": 9.100810279610472e-08, "logits/chosen": -2.0447704792022705, "logits/rejected": -2.2897419929504395, "logps/chosen": -3.8684630393981934, "logps/rejected": -3.9616825580596924, "loss": 0.6656, "rewards/accuracies": 1.0, "rewards/chosen": 0.7534350156784058, "rewards/margins": 0.05595594644546509, "rewards/rejected": 0.6974790692329407, "step": 1213 }, { "epoch": 0.65, "learning_rate": 9.099143779585071e-08, "logits/chosen": -2.0915439128875732, "logits/rejected": -2.1908304691314697, "logps/chosen": -22.997005462646484, "logps/rejected": -21.605960845947266, "loss": 0.4497, "rewards/accuracies": 1.0, "rewards/chosen": 1.2449432611465454, "rewards/margins": 0.565952479839325, "rewards/rejected": 0.6789907813072205, "step": 1214 }, { "epoch": 0.66, "learning_rate": 9.097475889575076e-08, "logits/chosen": -2.0502066612243652, "logits/rejected": -2.043454647064209, "logps/chosen": -4.92811393737793, "logps/rejected": -3.2361056804656982, "loss": 0.5628, "rewards/accuracies": 1.0, "rewards/chosen": 0.8267641067504883, "rewards/margins": 0.280278742313385, "rewards/rejected": 0.5464853644371033, "step": 1215 }, { "epoch": 0.66, "learning_rate": 9.095806610146055e-08, "logits/chosen": -2.0653984546661377, "logits/rejected": -2.27900767326355, "logps/chosen": -0.6684821248054504, "logps/rejected": -0.712302565574646, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8934213519096375, "rewards/margins": 0.020703494548797607, "rewards/rejected": 0.8727178573608398, "step": 1216 }, { "epoch": 0.66, "learning_rate": 9.094135941864043e-08, "logits/chosen": -2.214123487472534, "logits/rejected": -2.1425459384918213, "logps/chosen": -26.139602661132812, "logps/rejected": -6.29180908203125, "loss": 0.5861, "rewards/accuracies": 1.0, "rewards/chosen": 1.0178871154785156, "rewards/margins": 0.22683829069137573, "rewards/rejected": 0.7910488247871399, "step": 1217 }, { "epoch": 0.66, "learning_rate": 9.092463885295553e-08, "logits/chosen": -2.005645513534546, "logits/rejected": -2.0051660537719727, "logps/chosen": -0.7598445415496826, "logps/rejected": -2.7055373191833496, "loss": 0.5341, "rewards/accuracies": 1.0, "rewards/chosen": 1.0174500942230225, "rewards/margins": 0.3483648896217346, "rewards/rejected": 0.6690852046012878, "step": 1218 }, { "epoch": 0.66, "learning_rate": 9.090790441007565e-08, "logits/chosen": -2.2140235900878906, "logits/rejected": -2.073331832885742, "logps/chosen": -62.41239929199219, "logps/rejected": -25.705642700195312, "loss": 0.297, "rewards/accuracies": 1.0, "rewards/chosen": 1.7031081914901733, "rewards/margins": 1.0619330406188965, "rewards/rejected": 0.6411750912666321, "step": 1219 }, { "epoch": 0.66, "learning_rate": 9.08911560956753e-08, "logits/chosen": -1.9594982862472534, "logits/rejected": -1.9718469381332397, "logps/chosen": -2.8173398971557617, "logps/rejected": -6.442859649658203, "loss": 0.4633, "rewards/accuracies": 1.0, "rewards/chosen": 1.1800639629364014, "rewards/margins": 0.5288689136505127, "rewards/rejected": 0.6511950492858887, "step": 1220 }, { "epoch": 0.66, "learning_rate": 9.087439391543366e-08, "logits/chosen": -2.1718249320983887, "logits/rejected": -2.161189079284668, "logps/chosen": -4.04270076751709, "logps/rejected": -9.060439109802246, "loss": 0.6637, "rewards/accuracies": 1.0, "rewards/chosen": 0.7929652333259583, "rewards/margins": 0.05980968475341797, "rewards/rejected": 0.7331555485725403, "step": 1221 }, { "epoch": 0.66, "learning_rate": 9.085761787503464e-08, "logits/chosen": -1.9661970138549805, "logits/rejected": -2.2369165420532227, "logps/chosen": -3.5593085289001465, "logps/rejected": -3.4226112365722656, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0609415769577026, "rewards/margins": 0.01856207847595215, "rewards/rejected": 1.0423794984817505, "step": 1222 }, { "epoch": 0.66, "learning_rate": 9.084082798016689e-08, "logits/chosen": -2.307098150253296, "logits/rejected": -2.0721917152404785, "logps/chosen": -55.12406921386719, "logps/rejected": -4.9342570304870605, "loss": 0.3773, "rewards/accuracies": 1.0, "rewards/chosen": 1.3456779718399048, "rewards/margins": 0.7801374197006226, "rewards/rejected": 0.5655405521392822, "step": 1223 }, { "epoch": 0.66, "learning_rate": 9.08240242365237e-08, "logits/chosen": -2.0833921432495117, "logits/rejected": -2.237391710281372, "logps/chosen": -0.8312222361564636, "logps/rejected": -0.9533546566963196, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0071204900741577, "rewards/margins": 0.008543729782104492, "rewards/rejected": 0.9985767602920532, "step": 1224 }, { "epoch": 0.66, "learning_rate": 9.080720664980305e-08, "logits/chosen": -2.065319538116455, "logits/rejected": -2.0671210289001465, "logps/chosen": -4.578096389770508, "logps/rejected": -13.449223518371582, "loss": 0.313, "rewards/accuracies": 1.0, "rewards/chosen": 1.0925098657608032, "rewards/margins": 1.0009160041809082, "rewards/rejected": 0.09159383922815323, "step": 1225 }, { "epoch": 0.66, "learning_rate": 9.07903752257077e-08, "logits/chosen": -2.014796018600464, "logits/rejected": -2.009310722351074, "logps/chosen": -5.732001781463623, "logps/rejected": -2.588653802871704, "loss": 0.489, "rewards/accuracies": 1.0, "rewards/chosen": 1.203283667564392, "rewards/margins": 0.46084439754486084, "rewards/rejected": 0.7424392700195312, "step": 1226 }, { "epoch": 0.66, "learning_rate": 9.077352996994499e-08, "logits/chosen": -2.2067172527313232, "logits/rejected": -2.0787346363067627, "logps/chosen": -42.4195671081543, "logps/rejected": -0.42961612343788147, "loss": 0.264, "rewards/accuracies": 1.0, "rewards/chosen": 2.0037503242492676, "rewards/margins": 1.197087049484253, "rewards/rejected": 0.8066632151603699, "step": 1227 }, { "epoch": 0.66, "learning_rate": 9.075667088822702e-08, "logits/chosen": -2.1398634910583496, "logits/rejected": -2.26009464263916, "logps/chosen": -1.2735021114349365, "logps/rejected": -1.2873194217681885, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.8989656567573547, "rewards/margins": 0.0014142990112304688, "rewards/rejected": 0.8975513577461243, "step": 1228 }, { "epoch": 0.66, "learning_rate": 9.073979798627056e-08, "logits/chosen": -2.21867299079895, "logits/rejected": -2.215266704559326, "logps/chosen": -8.663153648376465, "logps/rejected": -3.7054083347320557, "loss": 0.3579, "rewards/accuracies": 1.0, "rewards/chosen": 1.3458088636398315, "rewards/margins": 0.8431804180145264, "rewards/rejected": 0.5026284456253052, "step": 1229 }, { "epoch": 0.66, "learning_rate": 9.072291126979707e-08, "logits/chosen": -1.9241628646850586, "logits/rejected": -2.2111623287200928, "logps/chosen": -2.577014923095703, "logps/rejected": -2.5217032432556152, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.7883325815200806, "rewards/margins": -0.017603278160095215, "rewards/rejected": 0.8059358596801758, "step": 1230 }, { "epoch": 0.66, "learning_rate": 9.070601074453268e-08, "logits/chosen": -2.1207120418548584, "logits/rejected": -2.02158522605896, "logps/chosen": -27.3095703125, "logps/rejected": -2.038151264190674, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 1.5044784545898438, "rewards/margins": 0.7222450375556946, "rewards/rejected": 0.7822334170341492, "step": 1231 }, { "epoch": 0.66, "learning_rate": 9.068909641620824e-08, "logits/chosen": -2.0309104919433594, "logits/rejected": -2.0347423553466797, "logps/chosen": -2.8478753566741943, "logps/rejected": -3.9738826751708984, "loss": 0.4789, "rewards/accuracies": 1.0, "rewards/chosen": 1.06887948513031, "rewards/margins": 0.48732972145080566, "rewards/rejected": 0.5815497636795044, "step": 1232 }, { "epoch": 0.67, "learning_rate": 9.067216829055921e-08, "logits/chosen": -1.9542326927185059, "logits/rejected": -2.2550241947174072, "logps/chosen": -4.604156494140625, "logps/rejected": -5.281750679016113, "loss": 0.6508, "rewards/accuracies": 1.0, "rewards/chosen": 0.7871593832969666, "rewards/margins": 0.08665162324905396, "rewards/rejected": 0.7005077600479126, "step": 1233 }, { "epoch": 0.67, "learning_rate": 9.065522637332581e-08, "logits/chosen": -2.0712554454803467, "logits/rejected": -2.221856117248535, "logps/chosen": -2.158076286315918, "logps/rejected": -2.037702798843384, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9436982274055481, "rewards/margins": 0.01826256513595581, "rewards/rejected": 0.9254356622695923, "step": 1234 }, { "epoch": 0.67, "learning_rate": 9.063827067025289e-08, "logits/chosen": -2.0210719108581543, "logits/rejected": -2.267246723175049, "logps/chosen": -0.5766528248786926, "logps/rejected": -0.6147873997688293, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9307320713996887, "rewards/margins": 0.005680680274963379, "rewards/rejected": 0.9250513911247253, "step": 1235 }, { "epoch": 0.67, "learning_rate": 9.062130118708997e-08, "logits/chosen": -2.1160824298858643, "logits/rejected": -2.1180553436279297, "logps/chosen": -0.36840924620628357, "logps/rejected": -3.453835964202881, "loss": 0.586, "rewards/accuracies": 1.0, "rewards/chosen": 0.8259289860725403, "rewards/margins": 0.227256178855896, "rewards/rejected": 0.5986728072166443, "step": 1236 }, { "epoch": 0.67, "learning_rate": 9.060431792959128e-08, "logits/chosen": -1.9156912565231323, "logits/rejected": -1.9491724967956543, "logps/chosen": -1.723585844039917, "logps/rejected": -12.829241752624512, "loss": 0.5958, "rewards/accuracies": 1.0, "rewards/chosen": 0.9706746339797974, "rewards/margins": 0.20530074834823608, "rewards/rejected": 0.7653738856315613, "step": 1237 }, { "epoch": 0.67, "learning_rate": 9.058732090351568e-08, "logits/chosen": -2.014065980911255, "logits/rejected": -2.011537790298462, "logps/chosen": -1.9110881090164185, "logps/rejected": -5.129183292388916, "loss": 0.5042, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420822262763977, "rewards/margins": 0.4222021698951721, "rewards/rejected": 0.5198800563812256, "step": 1238 }, { "epoch": 0.67, "learning_rate": 9.057031011462673e-08, "logits/chosen": -2.1041665077209473, "logits/rejected": -2.33408522605896, "logps/chosen": -4.165591239929199, "logps/rejected": -4.329138278961182, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.485323041677475, "rewards/margins": 0.03684648871421814, "rewards/rejected": 0.44847655296325684, "step": 1239 }, { "epoch": 0.67, "learning_rate": 9.055328556869262e-08, "logits/chosen": -1.9995278120040894, "logits/rejected": -2.003153085708618, "logps/chosen": -3.2159640789031982, "logps/rejected": -1.123095989227295, "loss": 0.6307, "rewards/accuracies": 1.0, "rewards/chosen": 1.0962671041488647, "rewards/margins": 0.12906813621520996, "rewards/rejected": 0.9671989679336548, "step": 1240 }, { "epoch": 0.67, "learning_rate": 9.053624727148623e-08, "logits/chosen": -2.068307638168335, "logits/rejected": -2.2732491493225098, "logps/chosen": -1.5821973085403442, "logps/rejected": -1.5596580505371094, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0337580442428589, "rewards/margins": 0.03585249185562134, "rewards/rejected": 0.9979055523872375, "step": 1241 }, { "epoch": 0.67, "learning_rate": 9.051919522878513e-08, "logits/chosen": -2.0145115852355957, "logits/rejected": -2.01411771774292, "logps/chosen": -1.3547879457473755, "logps/rejected": -2.5186407566070557, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 0.923250138759613, "rewards/margins": 0.3274383544921875, "rewards/rejected": 0.5958117842674255, "step": 1242 }, { "epoch": 0.67, "learning_rate": 9.050212944637152e-08, "logits/chosen": -2.001216411590576, "logits/rejected": -2.2296254634857178, "logps/chosen": -0.8337447643280029, "logps/rejected": -0.7682327032089233, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8149928450584412, "rewards/margins": 0.019236445426940918, "rewards/rejected": 0.7957563996315002, "step": 1243 }, { "epoch": 0.67, "learning_rate": 9.048504993003221e-08, "logits/chosen": -2.0392863750457764, "logits/rejected": -2.0409514904022217, "logps/chosen": -1.2986375093460083, "logps/rejected": -2.1687374114990234, "loss": 0.5648, "rewards/accuracies": 1.0, "rewards/chosen": 0.8857482075691223, "rewards/margins": 0.27560532093048096, "rewards/rejected": 0.6101428866386414, "step": 1244 }, { "epoch": 0.67, "learning_rate": 9.046795668555878e-08, "logits/chosen": -1.9485034942626953, "logits/rejected": -2.305035352706909, "logps/chosen": -2.9617106914520264, "logps/rejected": -3.0730764865875244, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.8153105974197388, "rewards/margins": -0.013851702213287354, "rewards/rejected": 0.8291622996330261, "step": 1245 }, { "epoch": 0.67, "learning_rate": 9.045084971874737e-08, "logits/chosen": -2.1212832927703857, "logits/rejected": -2.311469793319702, "logps/chosen": -13.500970840454102, "logps/rejected": -14.400270462036133, "loss": 0.6198, "rewards/accuracies": 1.0, "rewards/chosen": 0.869872510433197, "rewards/margins": 0.1525566577911377, "rewards/rejected": 0.7173158526420593, "step": 1246 }, { "epoch": 0.67, "learning_rate": 9.043372903539881e-08, "logits/chosen": -1.9800221920013428, "logits/rejected": -2.2637081146240234, "logps/chosen": -0.7854486703872681, "logps/rejected": -0.6889188289642334, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7856077551841736, "rewards/margins": 0.012520134449005127, "rewards/rejected": 0.7730876207351685, "step": 1247 }, { "epoch": 0.67, "learning_rate": 9.041659464131859e-08, "logits/chosen": -2.0739595890045166, "logits/rejected": -2.0750081539154053, "logps/chosen": -4.003058910369873, "logps/rejected": -0.3961409330368042, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.2992390394210815, "rewards/margins": 0.5202205181121826, "rewards/rejected": 0.7790185213088989, "step": 1248 }, { "epoch": 0.67, "learning_rate": 9.03994465423168e-08, "logits/chosen": -2.095571517944336, "logits/rejected": -2.1212151050567627, "logps/chosen": -3.5535335540771484, "logps/rejected": -6.110225200653076, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": 1.0349234342575073, "rewards/margins": 0.3235788345336914, "rewards/rejected": 0.7113445997238159, "step": 1249 }, { "epoch": 0.67, "learning_rate": 9.038228474420826e-08, "logits/chosen": -1.9675158262252808, "logits/rejected": -2.278139352798462, "logps/chosen": -0.8416234850883484, "logps/rejected": -0.8846527934074402, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8460758328437805, "rewards/margins": 0.008863091468811035, "rewards/rejected": 0.8372127413749695, "step": 1250 }, { "epoch": 0.67, "learning_rate": 9.036510925281236e-08, "logits/chosen": -2.0554096698760986, "logits/rejected": -2.0146303176879883, "logps/chosen": -34.86533737182617, "logps/rejected": -1.9494887590408325, "loss": 0.4892, "rewards/accuracies": 1.0, "rewards/chosen": 1.269826889038086, "rewards/margins": 0.460482656955719, "rewards/rejected": 0.8093442320823669, "step": 1251 }, { "epoch": 0.68, "learning_rate": 9.034792007395317e-08, "logits/chosen": -1.9216868877410889, "logits/rejected": -2.2470345497131348, "logps/chosen": -3.132050037384033, "logps/rejected": -6.3241729736328125, "loss": 0.6446, "rewards/accuracies": 1.0, "rewards/chosen": 0.8531879782676697, "rewards/margins": 0.09966164827346802, "rewards/rejected": 0.7535263299942017, "step": 1252 }, { "epoch": 0.68, "learning_rate": 9.033071721345942e-08, "logits/chosen": -2.201298713684082, "logits/rejected": -2.29386043548584, "logps/chosen": -18.583444595336914, "logps/rejected": -13.482540130615234, "loss": 0.7027, "rewards/accuracies": 0.0, "rewards/chosen": 0.4901967942714691, "rewards/margins": -0.01897069811820984, "rewards/rejected": 0.509167492389679, "step": 1253 }, { "epoch": 0.68, "learning_rate": 9.03135006771644e-08, "logits/chosen": -2.070139169692993, "logits/rejected": -2.3272197246551514, "logps/chosen": -1.5602256059646606, "logps/rejected": -1.7434542179107666, "loss": 0.6632, "rewards/accuracies": 1.0, "rewards/chosen": 0.7960721254348755, "rewards/margins": 0.060731589794158936, "rewards/rejected": 0.7353405356407166, "step": 1254 }, { "epoch": 0.68, "learning_rate": 9.029627047090613e-08, "logits/chosen": -2.0591156482696533, "logits/rejected": -2.0576932430267334, "logps/chosen": -1.108335018157959, "logps/rejected": -1.7481790781021118, "loss": 0.589, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028774738311768, "rewards/margins": 0.2203373908996582, "rewards/rejected": 0.7825400829315186, "step": 1255 }, { "epoch": 0.68, "learning_rate": 9.027902660052721e-08, "logits/chosen": -2.0880720615386963, "logits/rejected": -2.0911898612976074, "logps/chosen": -0.8508493900299072, "logps/rejected": -2.3228743076324463, "loss": 0.552, "rewards/accuracies": 1.0, "rewards/chosen": 0.9237821698188782, "rewards/margins": 0.30558884143829346, "rewards/rejected": 0.6181933283805847, "step": 1256 }, { "epoch": 0.68, "learning_rate": 9.026176907187489e-08, "logits/chosen": -2.0506348609924316, "logits/rejected": -2.303682327270508, "logps/chosen": -6.396646499633789, "logps/rejected": -3.1279029846191406, "loss": 0.8035, "rewards/accuracies": 0.0, "rewards/chosen": 0.6235930323600769, "rewards/margins": -0.20973199605941772, "rewards/rejected": 0.8333250284194946, "step": 1257 }, { "epoch": 0.68, "learning_rate": 9.024449789080106e-08, "logits/chosen": -2.1602842807769775, "logits/rejected": -2.062046527862549, "logps/chosen": -48.39337921142578, "logps/rejected": -1.9982191324234009, "loss": 0.3735, "rewards/accuracies": 1.0, "rewards/chosen": 1.4414558410644531, "rewards/margins": 0.7922387719154358, "rewards/rejected": 0.6492170691490173, "step": 1258 }, { "epoch": 0.68, "learning_rate": 9.022721306316221e-08, "logits/chosen": -2.008064031600952, "logits/rejected": -2.012760877609253, "logps/chosen": -1.6030755043029785, "logps/rejected": -2.6329405307769775, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 1.0613278150558472, "rewards/margins": 0.40599143505096436, "rewards/rejected": 0.6553363800048828, "step": 1259 }, { "epoch": 0.68, "learning_rate": 9.020991459481951e-08, "logits/chosen": -1.9761914014816284, "logits/rejected": -1.9773706197738647, "logps/chosen": -1.1715850830078125, "logps/rejected": -2.711317539215088, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 0.9264248013496399, "rewards/margins": 0.29195839166641235, "rewards/rejected": 0.6344664096832275, "step": 1260 }, { "epoch": 0.68, "learning_rate": 9.019260249163867e-08, "logits/chosen": -2.0648341178894043, "logits/rejected": -2.060886859893799, "logps/chosen": -4.70591926574707, "logps/rejected": -3.1870474815368652, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9245516657829285, "rewards/margins": 0.2300330400466919, "rewards/rejected": 0.6945186257362366, "step": 1261 }, { "epoch": 0.68, "learning_rate": 9.017527675949013e-08, "logits/chosen": -2.0585596561431885, "logits/rejected": -2.04730224609375, "logps/chosen": -0.6122183203697205, "logps/rejected": -5.312292098999023, "loss": 0.5165, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978134989738464, "rewards/margins": 0.3913852572441101, "rewards/rejected": 0.5064282417297363, "step": 1262 }, { "epoch": 0.68, "learning_rate": 9.015793740424887e-08, "logits/chosen": -2.0634970664978027, "logits/rejected": -2.253835678100586, "logps/chosen": -0.5294927358627319, "logps/rejected": -0.6849455833435059, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.8496796488761902, "rewards/margins": -0.008350670337677002, "rewards/rejected": 0.8580303192138672, "step": 1263 }, { "epoch": 0.68, "learning_rate": 9.014058443179453e-08, "logits/chosen": -2.049375057220459, "logits/rejected": -2.051199436187744, "logps/chosen": -0.7621200680732727, "logps/rejected": -2.6367948055267334, "loss": 0.553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281578302383423, "rewards/margins": 0.30318206548690796, "rewards/rejected": 0.7249757647514343, "step": 1264 }, { "epoch": 0.68, "learning_rate": 9.012321784801134e-08, "logits/chosen": -1.9803414344787598, "logits/rejected": -2.2815940380096436, "logps/chosen": -1.5278781652450562, "logps/rejected": -4.936516761779785, "loss": 0.6061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9690141677856445, "rewards/margins": 0.1824072003364563, "rewards/rejected": 0.7866069674491882, "step": 1265 }, { "epoch": 0.68, "learning_rate": 9.010583765878816e-08, "logits/chosen": -2.098066806793213, "logits/rejected": -2.1000585556030273, "logps/chosen": -1.9813075065612793, "logps/rejected": -2.205806255340576, "loss": 0.5323, "rewards/accuracies": 1.0, "rewards/chosen": 1.0144684314727783, "rewards/margins": 0.3525354266166687, "rewards/rejected": 0.6619330048561096, "step": 1266 }, { "epoch": 0.68, "learning_rate": 9.008844387001848e-08, "logits/chosen": -1.9113472700119019, "logits/rejected": -2.243905544281006, "logps/chosen": -0.53487229347229, "logps/rejected": -0.558833658695221, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9338585734367371, "rewards/margins": 0.011308908462524414, "rewards/rejected": 0.9225496649742126, "step": 1267 }, { "epoch": 0.68, "learning_rate": 9.007103648760038e-08, "logits/chosen": -1.9537930488586426, "logits/rejected": -2.2458512783050537, "logps/chosen": -1.2050447463989258, "logps/rejected": -2.721439838409424, "loss": 0.6182, "rewards/accuracies": 1.0, "rewards/chosen": 0.9840189814567566, "rewards/margins": 0.15602654218673706, "rewards/rejected": 0.8279924392700195, "step": 1268 }, { "epoch": 0.68, "learning_rate": 9.005361551743656e-08, "logits/chosen": -2.028141736984253, "logits/rejected": -2.0260488986968994, "logps/chosen": -10.126824378967285, "logps/rejected": -1.6286940574645996, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 1.3703986406326294, "rewards/margins": 0.6659321188926697, "rewards/rejected": 0.7044665217399597, "step": 1269 }, { "epoch": 0.69, "learning_rate": 9.003618096543429e-08, "logits/chosen": -2.0950398445129395, "logits/rejected": -2.092440605163574, "logps/chosen": -2.388469934463501, "logps/rejected": -2.411381244659424, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 1.0359388589859009, "rewards/margins": 0.02968275547027588, "rewards/rejected": 1.006256103515625, "step": 1270 }, { "epoch": 0.69, "learning_rate": 9.001873283750551e-08, "logits/chosen": -2.0683467388153076, "logits/rejected": -2.260317087173462, "logps/chosen": -1.846796989440918, "logps/rejected": -1.3901466131210327, "loss": 0.7058, "rewards/accuracies": 0.0, "rewards/chosen": 0.5129222273826599, "rewards/margins": -0.025194108486175537, "rewards/rejected": 0.5381163358688354, "step": 1271 }, { "epoch": 0.69, "learning_rate": 9.000127113956672e-08, "logits/chosen": -2.054421901702881, "logits/rejected": -2.3182151317596436, "logps/chosen": -0.4364873170852661, "logps/rejected": -0.4508803188800812, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8275771141052246, "rewards/margins": 0.022189795970916748, "rewards/rejected": 0.8053873181343079, "step": 1272 }, { "epoch": 0.69, "learning_rate": 8.998379587753904e-08, "logits/chosen": -2.040001153945923, "logits/rejected": -2.219158172607422, "logps/chosen": -0.5277805328369141, "logps/rejected": -0.6500949263572693, "loss": 0.701, "rewards/accuracies": 0.0, "rewards/chosen": 0.769406259059906, "rewards/margins": -0.015591859817504883, "rewards/rejected": 0.7849981188774109, "step": 1273 }, { "epoch": 0.69, "learning_rate": 8.996630705734816e-08, "logits/chosen": -2.09006929397583, "logits/rejected": -2.224039077758789, "logps/chosen": -2.634760856628418, "logps/rejected": -0.814891517162323, "loss": 0.703, "rewards/accuracies": 0.0, "rewards/chosen": 0.8220290541648865, "rewards/margins": -0.019605815410614014, "rewards/rejected": 0.8416348695755005, "step": 1274 }, { "epoch": 0.69, "learning_rate": 8.99488046849244e-08, "logits/chosen": -2.0200631618499756, "logits/rejected": -2.016788959503174, "logps/chosen": -5.411766052246094, "logps/rejected": -2.4496567249298096, "loss": 0.4102, "rewards/accuracies": 1.0, "rewards/chosen": 1.4373568296432495, "rewards/margins": 0.6790396571159363, "rewards/rejected": 0.7583171725273132, "step": 1275 }, { "epoch": 0.69, "learning_rate": 8.99312887662027e-08, "logits/chosen": -2.06424880027771, "logits/rejected": -2.0636510848999023, "logps/chosen": -1.0373444557189941, "logps/rejected": -2.878945827484131, "loss": 0.5825, "rewards/accuracies": 1.0, "rewards/chosen": 0.8533607721328735, "rewards/margins": 0.23510068655014038, "rewards/rejected": 0.6182600855827332, "step": 1276 }, { "epoch": 0.69, "learning_rate": 8.99137593071225e-08, "logits/chosen": -2.138617992401123, "logits/rejected": -2.1205360889434814, "logps/chosen": -17.271718978881836, "logps/rejected": -2.911206007003784, "loss": 0.4318, "rewards/accuracies": 1.0, "rewards/chosen": 1.1409624814987183, "rewards/margins": 0.6161067485809326, "rewards/rejected": 0.5248557329177856, "step": 1277 }, { "epoch": 0.69, "learning_rate": 8.989621631362794e-08, "logits/chosen": -1.985958456993103, "logits/rejected": -2.263737916946411, "logps/chosen": -6.618094444274902, "logps/rejected": -6.305507183074951, "loss": 0.7085, "rewards/accuracies": 0.0, "rewards/chosen": 0.5118361711502075, "rewards/margins": -0.030459344387054443, "rewards/rejected": 0.542295515537262, "step": 1278 }, { "epoch": 0.69, "learning_rate": 8.987865979166766e-08, "logits/chosen": -2.0643296241760254, "logits/rejected": -2.235888957977295, "logps/chosen": -0.8547320365905762, "logps/rejected": -0.8325465321540833, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.8104097247123718, "rewards/margins": 0.03482288122177124, "rewards/rejected": 0.7755868434906006, "step": 1279 }, { "epoch": 0.69, "learning_rate": 8.986108974719493e-08, "logits/chosen": -2.0553925037384033, "logits/rejected": -2.0822832584381104, "logps/chosen": -3.541074514389038, "logps/rejected": -8.052499771118164, "loss": 0.3623, "rewards/accuracies": 1.0, "rewards/chosen": 1.5270719528198242, "rewards/margins": 0.8287873864173889, "rewards/rejected": 0.6982845664024353, "step": 1280 }, { "epoch": 0.69, "learning_rate": 8.984350618616764e-08, "logits/chosen": -1.9770416021347046, "logits/rejected": -1.9725397825241089, "logps/chosen": -9.635469436645508, "logps/rejected": -3.4649341106414795, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 1.3671716451644897, "rewards/margins": 0.3274726867675781, "rewards/rejected": 1.0396989583969116, "step": 1281 }, { "epoch": 0.69, "learning_rate": 8.98259091145482e-08, "logits/chosen": -2.0294899940490723, "logits/rejected": -2.2201292514801025, "logps/chosen": -0.4640964865684509, "logps/rejected": -0.5621024966239929, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.7559250593185425, "rewards/margins": 0.013547778129577637, "rewards/rejected": 0.7423772811889648, "step": 1282 }, { "epoch": 0.69, "learning_rate": 8.98082985383036e-08, "logits/chosen": -2.006127119064331, "logits/rejected": -2.3005459308624268, "logps/chosen": -0.7286991477012634, "logps/rejected": -0.7143434286117554, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9899043440818787, "rewards/margins": 0.014393806457519531, "rewards/rejected": 0.9755105376243591, "step": 1283 }, { "epoch": 0.69, "learning_rate": 8.979067446340547e-08, "logits/chosen": -2.0070109367370605, "logits/rejected": -2.0191457271575928, "logps/chosen": -2.1372997760772705, "logps/rejected": -7.012421131134033, "loss": 0.4547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0616990327835083, "rewards/margins": 0.5520655512809753, "rewards/rejected": 0.509633481502533, "step": 1284 }, { "epoch": 0.69, "learning_rate": 8.977303689582999e-08, "logits/chosen": -2.137202024459839, "logits/rejected": -2.1392745971679688, "logps/chosen": -2.0096540451049805, "logps/rejected": -2.978839159011841, "loss": 0.5451, "rewards/accuracies": 1.0, "rewards/chosen": 1.0216631889343262, "rewards/margins": 0.32197415828704834, "rewards/rejected": 0.6996890306472778, "step": 1285 }, { "epoch": 0.69, "learning_rate": 8.975538584155789e-08, "logits/chosen": -2.007298707962036, "logits/rejected": -2.0006730556488037, "logps/chosen": -3.158705234527588, "logps/rejected": -3.5801587104797363, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 1.2126505374908447, "rewards/margins": 0.625791609287262, "rewards/rejected": 0.5868589282035828, "step": 1286 }, { "epoch": 0.69, "learning_rate": 8.973772130657448e-08, "logits/chosen": -1.9752793312072754, "logits/rejected": -1.9785277843475342, "logps/chosen": -1.183924913406372, "logps/rejected": -3.293701648712158, "loss": 0.5306, "rewards/accuracies": 1.0, "rewards/chosen": 1.04424250125885, "rewards/margins": 0.35680443048477173, "rewards/rejected": 0.6874380707740784, "step": 1287 }, { "epoch": 0.69, "learning_rate": 8.972004329686969e-08, "logits/chosen": -1.9938459396362305, "logits/rejected": -1.964147925376892, "logps/chosen": -18.640790939331055, "logps/rejected": -1.697766900062561, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 1.2222150564193726, "rewards/margins": 0.3554648756980896, "rewards/rejected": 0.866750180721283, "step": 1288 }, { "epoch": 0.7, "learning_rate": 8.970235181843794e-08, "logits/chosen": -2.1923625469207764, "logits/rejected": -2.1584396362304688, "logps/chosen": -32.76332092285156, "logps/rejected": -2.2083210945129395, "loss": 0.5276, "rewards/accuracies": 1.0, "rewards/chosen": 1.3641575574874878, "rewards/margins": 0.3640599250793457, "rewards/rejected": 1.000097632408142, "step": 1289 }, { "epoch": 0.7, "learning_rate": 8.968464687727828e-08, "logits/chosen": -2.0821356773376465, "logits/rejected": -2.2717175483703613, "logps/chosen": -0.9830381274223328, "logps/rejected": -0.8421672582626343, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.9819585084915161, "rewards/margins": 0.03455078601837158, "rewards/rejected": 0.9474077224731445, "step": 1290 }, { "epoch": 0.7, "learning_rate": 8.96669284793943e-08, "logits/chosen": -2.0714173316955566, "logits/rejected": -2.0819242000579834, "logps/chosen": -5.308095932006836, "logps/rejected": -2.160234212875366, "loss": 0.3922, "rewards/accuracies": 1.0, "rewards/chosen": 1.3252172470092773, "rewards/margins": 0.7336059212684631, "rewards/rejected": 0.5916113257408142, "step": 1291 }, { "epoch": 0.7, "learning_rate": 8.964919663079418e-08, "logits/chosen": -2.118144989013672, "logits/rejected": -2.1203575134277344, "logps/chosen": -3.2604737281799316, "logps/rejected": -4.845133304595947, "loss": 0.5286, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255268812179565, "rewards/margins": 0.36167192459106445, "rewards/rejected": 0.6638549566268921, "step": 1292 }, { "epoch": 0.7, "learning_rate": 8.963145133749059e-08, "logits/chosen": -2.024552583694458, "logits/rejected": -2.0246639251708984, "logps/chosen": -0.5144233703613281, "logps/rejected": -3.0945796966552734, "loss": 0.5321, "rewards/accuracies": 1.0, "rewards/chosen": 1.050737738609314, "rewards/margins": 0.353162944316864, "rewards/rejected": 0.69757479429245, "step": 1293 }, { "epoch": 0.7, "learning_rate": 8.961369260550085e-08, "logits/chosen": -2.2139484882354736, "logits/rejected": -2.25888729095459, "logps/chosen": -11.567842483520508, "logps/rejected": -8.682668685913086, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.7580873370170593, "rewards/margins": -0.01648944616317749, "rewards/rejected": 0.7745767831802368, "step": 1294 }, { "epoch": 0.7, "learning_rate": 8.959592044084679e-08, "logits/chosen": -2.0818893909454346, "logits/rejected": -2.0815749168395996, "logps/chosen": -0.513987123966217, "logps/rejected": -6.488280773162842, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/chosen": 0.9650593996047974, "rewards/margins": 0.6947509050369263, "rewards/rejected": 0.2703084647655487, "step": 1295 }, { "epoch": 0.7, "learning_rate": 8.957813484955477e-08, "logits/chosen": -2.0194804668426514, "logits/rejected": -2.0204877853393555, "logps/chosen": -3.5360825061798096, "logps/rejected": -4.72842264175415, "loss": 0.6633, "rewards/accuracies": 1.0, "rewards/chosen": 0.9739707112312317, "rewards/margins": 0.06069439649581909, "rewards/rejected": 0.9132763147354126, "step": 1296 }, { "epoch": 0.7, "learning_rate": 8.956033583765574e-08, "logits/chosen": -1.9886023998260498, "logits/rejected": -2.068946599960327, "logps/chosen": -17.7374210357666, "logps/rejected": -18.014280319213867, "loss": 0.5887, "rewards/accuracies": 1.0, "rewards/chosen": 1.1141084432601929, "rewards/margins": 0.22107559442520142, "rewards/rejected": 0.8930328488349915, "step": 1297 }, { "epoch": 0.7, "learning_rate": 8.954252341118522e-08, "logits/chosen": -2.1462209224700928, "logits/rejected": -2.1446926593780518, "logps/chosen": -0.5465227961540222, "logps/rejected": -3.398059129714966, "loss": 0.5427, "rewards/accuracies": 1.0, "rewards/chosen": 0.8518816232681274, "rewards/margins": 0.32750797271728516, "rewards/rejected": 0.5243736505508423, "step": 1298 }, { "epoch": 0.7, "learning_rate": 8.952469757618325e-08, "logits/chosen": -1.957271695137024, "logits/rejected": -2.25789737701416, "logps/chosen": -2.7686004638671875, "logps/rejected": -2.823826789855957, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 0.9901129007339478, "rewards/margins": -0.0021103620529174805, "rewards/rejected": 0.9922232627868652, "step": 1299 }, { "epoch": 0.7, "learning_rate": 8.950685833869438e-08, "logits/chosen": -2.0365149974823, "logits/rejected": -2.027655601501465, "logps/chosen": -7.557187080383301, "logps/rejected": -2.183518886566162, "loss": 0.483, "rewards/accuracies": 1.0, "rewards/chosen": 1.352036952972412, "rewards/margins": 0.47658395767211914, "rewards/rejected": 0.875452995300293, "step": 1300 }, { "epoch": 0.7, "learning_rate": 8.948900570476776e-08, "logits/chosen": -2.163250684738159, "logits/rejected": -2.2874250411987305, "logps/chosen": -0.44741886854171753, "logps/rejected": -0.4586683213710785, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.8639830946922302, "rewards/margins": -0.000244140625, "rewards/rejected": 0.8642272353172302, "step": 1301 }, { "epoch": 0.7, "learning_rate": 8.947113968045707e-08, "logits/chosen": -2.023543119430542, "logits/rejected": -2.295922040939331, "logps/chosen": -0.7875057458877563, "logps/rejected": -0.7777209281921387, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.0780444145202637, "rewards/margins": 0.015674233436584473, "rewards/rejected": 1.0623701810836792, "step": 1302 }, { "epoch": 0.7, "learning_rate": 8.945326027182054e-08, "logits/chosen": -2.138223648071289, "logits/rejected": -2.338310718536377, "logps/chosen": -5.880132675170898, "logps/rejected": -0.868118405342102, "loss": 0.7883, "rewards/accuracies": 0.0, "rewards/chosen": 0.8292748332023621, "rewards/margins": -0.1820576786994934, "rewards/rejected": 1.0113325119018555, "step": 1303 }, { "epoch": 0.7, "learning_rate": 8.943536748492091e-08, "logits/chosen": -2.1317100524902344, "logits/rejected": -2.2362704277038574, "logps/chosen": -0.8718318939208984, "logps/rejected": -3.623952865600586, "loss": 0.6183, "rewards/accuracies": 1.0, "rewards/chosen": 1.1020328998565674, "rewards/margins": 0.155734121799469, "rewards/rejected": 0.9462987780570984, "step": 1304 }, { "epoch": 0.7, "learning_rate": 8.941746132582548e-08, "logits/chosen": -1.9765592813491821, "logits/rejected": -2.2665915489196777, "logps/chosen": -3.2571489810943604, "logps/rejected": -10.830684661865234, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.8045257925987244, "rewards/margins": -0.0019931793212890625, "rewards/rejected": 0.8065189719200134, "step": 1305 }, { "epoch": 0.7, "learning_rate": 8.939954180060605e-08, "logits/chosen": -2.1975314617156982, "logits/rejected": -2.0833542346954346, "logps/chosen": -55.12201690673828, "logps/rejected": -9.020007133483887, "loss": 0.5839, "rewards/accuracies": 1.0, "rewards/chosen": 1.281378984451294, "rewards/margins": 0.2318974733352661, "rewards/rejected": 1.0494815111160278, "step": 1306 }, { "epoch": 0.7, "learning_rate": 8.9381608915339e-08, "logits/chosen": -2.011704683303833, "logits/rejected": -2.0109565258026123, "logps/chosen": -0.39640989899635315, "logps/rejected": -3.5033152103424072, "loss": 0.533, "rewards/accuracies": 1.0, "rewards/chosen": 0.9487310647964478, "rewards/margins": 0.35101884603500366, "rewards/rejected": 0.5977122187614441, "step": 1307 }, { "epoch": 0.71, "learning_rate": 8.936366267610523e-08, "logits/chosen": -2.1085972785949707, "logits/rejected": -2.325917959213257, "logps/chosen": -1.7419708967208862, "logps/rejected": -1.2849416732788086, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.944794774055481, "rewards/margins": -0.007133185863494873, "rewards/rejected": 0.9519279599189758, "step": 1308 }, { "epoch": 0.71, "learning_rate": 8.934570308899012e-08, "logits/chosen": -2.0656917095184326, "logits/rejected": -2.295297384262085, "logps/chosen": -2.274362087249756, "logps/rejected": -2.2895474433898926, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.7897372841835022, "rewards/margins": 0.02545863389968872, "rewards/rejected": 0.7642786502838135, "step": 1309 }, { "epoch": 0.71, "learning_rate": 8.932773016008363e-08, "logits/chosen": -2.071288824081421, "logits/rejected": -2.077420949935913, "logps/chosen": -2.0378835201263428, "logps/rejected": -2.11246919631958, "loss": 0.5336, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254578590393066, "rewards/margins": 0.34939253330230713, "rewards/rejected": 0.6760653257369995, "step": 1310 }, { "epoch": 0.71, "learning_rate": 8.930974389548023e-08, "logits/chosen": -1.9855831861495972, "logits/rejected": -2.267103672027588, "logps/chosen": -0.668340265750885, "logps/rejected": -0.8385729789733887, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.8885269165039062, "rewards/margins": -0.006487488746643066, "rewards/rejected": 0.8950144052505493, "step": 1311 }, { "epoch": 0.71, "learning_rate": 8.929174430127891e-08, "logits/chosen": -2.1123127937316895, "logits/rejected": -2.1119205951690674, "logps/chosen": -1.2906451225280762, "logps/rejected": -1.6571787595748901, "loss": 0.7132, "rewards/accuracies": 0.0, "rewards/chosen": 0.8535005450248718, "rewards/margins": -0.039732277393341064, "rewards/rejected": 0.8932328224182129, "step": 1312 }, { "epoch": 0.71, "learning_rate": 8.927373138358318e-08, "logits/chosen": -2.143296718597412, "logits/rejected": -2.0743954181671143, "logps/chosen": -33.94431686401367, "logps/rejected": -3.4056904315948486, "loss": 0.4742, "rewards/accuracies": 1.0, "rewards/chosen": 1.1506832838058472, "rewards/margins": 0.49973732233047485, "rewards/rejected": 0.6509459614753723, "step": 1313 }, { "epoch": 0.71, "learning_rate": 8.925570514850107e-08, "logits/chosen": -2.104107618331909, "logits/rejected": -2.1013288497924805, "logps/chosen": -6.085389614105225, "logps/rejected": -2.866522789001465, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.3606077432632446, "rewards/margins": 0.47058385610580444, "rewards/rejected": 0.8900238871574402, "step": 1314 }, { "epoch": 0.71, "learning_rate": 8.923766560214509e-08, "logits/chosen": -2.0774409770965576, "logits/rejected": -2.088981866836548, "logps/chosen": -3.1558024883270264, "logps/rejected": -4.4736714363098145, "loss": 0.7184, "rewards/accuracies": 0.0, "rewards/chosen": 0.9634499549865723, "rewards/margins": -0.04986608028411865, "rewards/rejected": 1.013316035270691, "step": 1315 }, { "epoch": 0.71, "learning_rate": 8.921961275063234e-08, "logits/chosen": -2.1368086338043213, "logits/rejected": -2.142536163330078, "logps/chosen": -4.201363563537598, "logps/rejected": -7.668123245239258, "loss": 0.4142, "rewards/accuracies": 1.0, "rewards/chosen": 1.2080100774765015, "rewards/margins": 0.6670944094657898, "rewards/rejected": 0.5409156680107117, "step": 1316 }, { "epoch": 0.71, "learning_rate": 8.920154660008436e-08, "logits/chosen": -2.0166029930114746, "logits/rejected": -2.014190435409546, "logps/chosen": -0.5358829498291016, "logps/rejected": -2.832360029220581, "loss": 0.5385, "rewards/accuracies": 1.0, "rewards/chosen": 0.9155451059341431, "rewards/margins": 0.33765000104904175, "rewards/rejected": 0.5778951048851013, "step": 1317 }, { "epoch": 0.71, "learning_rate": 8.918346715662723e-08, "logits/chosen": -2.099351167678833, "logits/rejected": -2.101026773452759, "logps/chosen": -1.706903338432312, "logps/rejected": -1.5199685096740723, "loss": 0.5656, "rewards/accuracies": 1.0, "rewards/chosen": 1.0192965269088745, "rewards/margins": 0.27383488416671753, "rewards/rejected": 0.745461642742157, "step": 1318 }, { "epoch": 0.71, "learning_rate": 8.916537442639154e-08, "logits/chosen": -2.11588978767395, "logits/rejected": -2.2844934463500977, "logps/chosen": -4.125980377197266, "logps/rejected": -5.065474510192871, "loss": 0.7935, "rewards/accuracies": 0.0, "rewards/chosen": 0.8033317923545837, "rewards/margins": -0.19155126810073853, "rewards/rejected": 0.9948830604553223, "step": 1319 }, { "epoch": 0.71, "learning_rate": 8.914726841551239e-08, "logits/chosen": -2.1891121864318848, "logits/rejected": -2.3156797885894775, "logps/chosen": -13.230058670043945, "logps/rejected": -8.575166702270508, "loss": 0.8037, "rewards/accuracies": 0.0, "rewards/chosen": 0.7103641629219055, "rewards/margins": -0.21016138792037964, "rewards/rejected": 0.9205255508422852, "step": 1320 }, { "epoch": 0.71, "learning_rate": 8.912914913012934e-08, "logits/chosen": -2.1656508445739746, "logits/rejected": -2.074148416519165, "logps/chosen": -34.83161163330078, "logps/rejected": -2.6227033138275146, "loss": 0.3898, "rewards/accuracies": 1.0, "rewards/chosen": 1.2981551885604858, "rewards/margins": 0.7410523891448975, "rewards/rejected": 0.5571027994155884, "step": 1321 }, { "epoch": 0.71, "learning_rate": 8.911101657638652e-08, "logits/chosen": -1.980940580368042, "logits/rejected": -2.270905017852783, "logps/chosen": -0.4970185160636902, "logps/rejected": -0.5145081877708435, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.8318907022476196, "rewards/margins": -0.001997649669647217, "rewards/rejected": 0.8338883519172668, "step": 1322 }, { "epoch": 0.71, "learning_rate": 8.909287076043252e-08, "logits/chosen": -2.137044906616211, "logits/rejected": -2.143610954284668, "logps/chosen": -7.013623237609863, "logps/rejected": -3.3340063095092773, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 1.0410106182098389, "rewards/margins": -0.14053797721862793, "rewards/rejected": 1.1815485954284668, "step": 1323 }, { "epoch": 0.71, "learning_rate": 8.90747116884204e-08, "logits/chosen": -1.964871883392334, "logits/rejected": -2.0270581245422363, "logps/chosen": -5.953786849975586, "logps/rejected": -23.155681610107422, "loss": 0.3094, "rewards/accuracies": 1.0, "rewards/chosen": 1.3850382566452026, "rewards/margins": 1.0144643783569336, "rewards/rejected": 0.37057381868362427, "step": 1324 }, { "epoch": 0.71, "learning_rate": 8.905653936650779e-08, "logits/chosen": -2.093940258026123, "logits/rejected": -2.28192138671875, "logps/chosen": -14.97477912902832, "logps/rejected": -10.067139625549316, "loss": 0.5926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9778118133544922, "rewards/margins": 0.2124413251876831, "rewards/rejected": 0.7653704881668091, "step": 1325 }, { "epoch": 0.72, "learning_rate": 8.903835380085674e-08, "logits/chosen": -1.94956374168396, "logits/rejected": -2.2779886722564697, "logps/chosen": -1.4623355865478516, "logps/rejected": -1.5133371353149414, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9033007621765137, "rewards/margins": 3.1828880310058594e-05, "rewards/rejected": 0.9032689332962036, "step": 1326 }, { "epoch": 0.72, "learning_rate": 8.902015499763382e-08, "logits/chosen": -2.043024778366089, "logits/rejected": -2.0472254753112793, "logps/chosen": -1.7774877548217773, "logps/rejected": -4.080756664276123, "loss": 0.4973, "rewards/accuracies": 1.0, "rewards/chosen": 0.9681908488273621, "rewards/margins": 0.4395695924758911, "rewards/rejected": 0.528621256351471, "step": 1327 }, { "epoch": 0.72, "learning_rate": 8.90019429630101e-08, "logits/chosen": -1.9979403018951416, "logits/rejected": -2.249581813812256, "logps/chosen": -0.5579670667648315, "logps/rejected": -0.5804456472396851, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9836182594299316, "rewards/margins": 0.009574711322784424, "rewards/rejected": 0.9740435481071472, "step": 1328 }, { "epoch": 0.72, "learning_rate": 8.898371770316111e-08, "logits/chosen": -2.0043089389801025, "logits/rejected": -2.000786542892456, "logps/chosen": -9.780500411987305, "logps/rejected": -1.6558172702789307, "loss": 0.4623, "rewards/accuracies": 1.0, "rewards/chosen": 1.3355703353881836, "rewards/margins": 0.531475841999054, "rewards/rejected": 0.8040944933891296, "step": 1329 }, { "epoch": 0.72, "learning_rate": 8.896547922426691e-08, "logits/chosen": -2.0625760555267334, "logits/rejected": -2.2429702281951904, "logps/chosen": -2.8122241497039795, "logps/rejected": -2.8944411277770996, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9262930154800415, "rewards/margins": 0.004214465618133545, "rewards/rejected": 0.922078549861908, "step": 1330 }, { "epoch": 0.72, "learning_rate": 8.894722753251198e-08, "logits/chosen": -2.1297659873962402, "logits/rejected": -2.266998529434204, "logps/chosen": -2.7775321006774902, "logps/rejected": -8.443921089172363, "loss": 0.6249, "rewards/accuracies": 1.0, "rewards/chosen": 1.1055837869644165, "rewards/margins": 0.1414126753807068, "rewards/rejected": 0.9641711115837097, "step": 1331 }, { "epoch": 0.72, "learning_rate": 8.892896263408533e-08, "logits/chosen": -1.9932255744934082, "logits/rejected": -2.2519078254699707, "logps/chosen": -0.6776062846183777, "logps/rejected": -0.6805497407913208, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.9851841330528259, "rewards/margins": 0.001726388931274414, "rewards/rejected": 0.9834577441215515, "step": 1332 }, { "epoch": 0.72, "learning_rate": 8.891068453518042e-08, "logits/chosen": -2.031857490539551, "logits/rejected": -2.2278597354888916, "logps/chosen": -2.4116477966308594, "logps/rejected": -0.8049637675285339, "loss": 0.6431, "rewards/accuracies": 1.0, "rewards/chosen": 1.0283355712890625, "rewards/margins": 0.10263603925704956, "rewards/rejected": 0.9256995320320129, "step": 1333 }, { "epoch": 0.72, "learning_rate": 8.88923932419952e-08, "logits/chosen": -2.0786101818084717, "logits/rejected": -2.034999132156372, "logps/chosen": -12.239866256713867, "logps/rejected": -5.909390449523926, "loss": 0.5125, "rewards/accuracies": 1.0, "rewards/chosen": 1.2118828296661377, "rewards/margins": 0.4011620879173279, "rewards/rejected": 0.8107207417488098, "step": 1334 }, { "epoch": 0.72, "learning_rate": 8.88740887607321e-08, "logits/chosen": -2.090782403945923, "logits/rejected": -2.093079090118408, "logps/chosen": -0.8125153183937073, "logps/rejected": -2.7148823738098145, "loss": 0.5844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7454229593276978, "rewards/margins": 0.23075085878372192, "rewards/rejected": 0.5146721005439758, "step": 1335 }, { "epoch": 0.72, "learning_rate": 8.885577109759801e-08, "logits/chosen": -2.006939172744751, "logits/rejected": -2.280866861343384, "logps/chosen": -0.8810293078422546, "logps/rejected": -0.9265121221542358, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9250316619873047, "rewards/margins": 0.018241465091705322, "rewards/rejected": 0.9067901968955994, "step": 1336 }, { "epoch": 0.72, "learning_rate": 8.883744025880427e-08, "logits/chosen": -1.9603725671768188, "logits/rejected": -1.967380166053772, "logps/chosen": -2.213440418243408, "logps/rejected": -3.335688591003418, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.142785668373108, "rewards/margins": 0.6039628982543945, "rewards/rejected": 0.5388227701187134, "step": 1337 }, { "epoch": 0.72, "learning_rate": 8.881909625056675e-08, "logits/chosen": -2.1007277965545654, "logits/rejected": -2.263296604156494, "logps/chosen": -0.9448826909065247, "logps/rejected": -0.9632182121276855, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.8929176330566406, "rewards/margins": -0.012224793434143066, "rewards/rejected": 0.9051424264907837, "step": 1338 }, { "epoch": 0.72, "learning_rate": 8.880073907910573e-08, "logits/chosen": -2.071161985397339, "logits/rejected": -2.0821235179901123, "logps/chosen": -7.6336798667907715, "logps/rejected": -5.6042680740356445, "loss": 0.3778, "rewards/accuracies": 1.0, "rewards/chosen": 1.6252081394195557, "rewards/margins": 0.7785624861717224, "rewards/rejected": 0.8466456532478333, "step": 1339 }, { "epoch": 0.72, "learning_rate": 8.878236875064595e-08, "logits/chosen": -1.9972140789031982, "logits/rejected": -2.000844717025757, "logps/chosen": -5.365240097045898, "logps/rejected": -7.356900215148926, "loss": 0.3906, "rewards/accuracies": 1.0, "rewards/chosen": 1.2162634134292603, "rewards/margins": 0.7383780479431152, "rewards/rejected": 0.47788533568382263, "step": 1340 }, { "epoch": 0.72, "learning_rate": 8.876398527141666e-08, "logits/chosen": -2.031095027923584, "logits/rejected": -2.0308282375335693, "logps/chosen": -1.9983954429626465, "logps/rejected": -0.8747149705886841, "loss": 0.6097, "rewards/accuracies": 1.0, "rewards/chosen": 0.8846341967582703, "rewards/margins": 0.1744421124458313, "rewards/rejected": 0.710192084312439, "step": 1341 }, { "epoch": 0.72, "learning_rate": 8.874558864765151e-08, "logits/chosen": -2.0277397632598877, "logits/rejected": -2.2684662342071533, "logps/chosen": -0.4864293932914734, "logps/rejected": -0.5238986015319824, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8840399980545044, "rewards/margins": 0.020830631256103516, "rewards/rejected": 0.8632093667984009, "step": 1342 }, { "epoch": 0.72, "learning_rate": 8.872717888558868e-08, "logits/chosen": -2.179741382598877, "logits/rejected": -2.1751630306243896, "logps/chosen": -7.135954856872559, "logps/rejected": -5.952631950378418, "loss": 0.428, "rewards/accuracies": 1.0, "rewards/chosen": 1.1204508543014526, "rewards/margins": 0.6270427703857422, "rewards/rejected": 0.49340811371803284, "step": 1343 }, { "epoch": 0.72, "learning_rate": 8.870875599147071e-08, "logits/chosen": -2.047621488571167, "logits/rejected": -2.0473976135253906, "logps/chosen": -4.168988227844238, "logps/rejected": -3.5261847972869873, "loss": 0.3804, "rewards/accuracies": 1.0, "rewards/chosen": 1.4138941764831543, "rewards/margins": 0.7703024744987488, "rewards/rejected": 0.6435917019844055, "step": 1344 }, { "epoch": 0.73, "learning_rate": 8.86903199715447e-08, "logits/chosen": -2.027531862258911, "logits/rejected": -2.2587077617645264, "logps/chosen": -3.5916855335235596, "logps/rejected": -3.562133312225342, "loss": 0.7034, "rewards/accuracies": 0.0, "rewards/chosen": 0.747765064239502, "rewards/margins": -0.020385026931762695, "rewards/rejected": 0.7681500911712646, "step": 1345 }, { "epoch": 0.73, "learning_rate": 8.867187083206211e-08, "logits/chosen": -2.0573151111602783, "logits/rejected": -2.287947654724121, "logps/chosen": -0.4540725350379944, "logps/rejected": -0.44831782579421997, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8530699610710144, "rewards/margins": 0.01809746026992798, "rewards/rejected": 0.8349725008010864, "step": 1346 }, { "epoch": 0.73, "learning_rate": 8.86534085792789e-08, "logits/chosen": -2.0587124824523926, "logits/rejected": -2.063157558441162, "logps/chosen": -1.0568177700042725, "logps/rejected": -4.238213539123535, "loss": 0.499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9217036366462708, "rewards/margins": 0.43528079986572266, "rewards/rejected": 0.4864228367805481, "step": 1347 }, { "epoch": 0.73, "learning_rate": 8.863493321945546e-08, "logits/chosen": -2.1097769737243652, "logits/rejected": -2.094799518585205, "logps/chosen": -2.1198837757110596, "logps/rejected": -9.066974639892578, "loss": 0.4486, "rewards/accuracies": 1.0, "rewards/chosen": 1.1575672626495361, "rewards/margins": 0.568977415561676, "rewards/rejected": 0.5885898470878601, "step": 1348 }, { "epoch": 0.73, "learning_rate": 8.86164447588566e-08, "logits/chosen": -2.1163718700408936, "logits/rejected": -2.1217305660247803, "logps/chosen": -4.1311516761779785, "logps/rejected": -3.4909534454345703, "loss": 0.443, "rewards/accuracies": 1.0, "rewards/chosen": 1.1285772323608398, "rewards/margins": 0.5845524668693542, "rewards/rejected": 0.5440247654914856, "step": 1349 }, { "epoch": 0.73, "learning_rate": 8.859794320375167e-08, "logits/chosen": -2.203404188156128, "logits/rejected": -2.2004268169403076, "logps/chosen": -5.57501220703125, "logps/rejected": -2.343451976776123, "loss": 0.4436, "rewards/accuracies": 1.0, "rewards/chosen": 1.201619029045105, "rewards/margins": 0.5827359557151794, "rewards/rejected": 0.6188830733299255, "step": 1350 }, { "epoch": 0.73, "learning_rate": 8.857942856041432e-08, "logits/chosen": -1.9690279960632324, "logits/rejected": -2.242685317993164, "logps/chosen": -0.7410640716552734, "logps/rejected": -0.7744737863540649, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0535587072372437, "rewards/margins": 0.00687408447265625, "rewards/rejected": 1.0466846227645874, "step": 1351 }, { "epoch": 0.73, "learning_rate": 8.856090083512273e-08, "logits/chosen": -2.056117534637451, "logits/rejected": -2.29843807220459, "logps/chosen": -0.9605938196182251, "logps/rejected": -1.1177608966827393, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.7633579969406128, "rewards/margins": 0.04755091667175293, "rewards/rejected": 0.7158070802688599, "step": 1352 }, { "epoch": 0.73, "learning_rate": 8.85423600341595e-08, "logits/chosen": -2.0713534355163574, "logits/rejected": -2.3138844966888428, "logps/chosen": -0.7814251184463501, "logps/rejected": -1.0742822885513306, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.9630752801895142, "rewards/margins": 0.02806156873703003, "rewards/rejected": 0.9350137114524841, "step": 1353 }, { "epoch": 0.73, "learning_rate": 8.852380616381164e-08, "logits/chosen": -2.1150095462799072, "logits/rejected": -2.2678260803222656, "logps/chosen": -2.250206232070923, "logps/rejected": -2.5393877029418945, "loss": 0.6636, "rewards/accuracies": 1.0, "rewards/chosen": 0.8305057883262634, "rewards/margins": 0.06001484394073486, "rewards/rejected": 0.7704909443855286, "step": 1354 }, { "epoch": 0.73, "learning_rate": 8.850523923037063e-08, "logits/chosen": -2.1305923461914062, "logits/rejected": -2.1303062438964844, "logps/chosen": -5.372392177581787, "logps/rejected": -2.458556890487671, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.5675861835479736, "rewards/margins": 1.0296871662139893, "rewards/rejected": 0.5378989577293396, "step": 1355 }, { "epoch": 0.73, "learning_rate": 8.848665924013233e-08, "logits/chosen": -2.079918146133423, "logits/rejected": -2.0716795921325684, "logps/chosen": -2.8201658725738525, "logps/rejected": -10.51546573638916, "loss": 0.3922, "rewards/accuracies": 1.0, "rewards/chosen": 1.2896260023117065, "rewards/margins": 0.7334520220756531, "rewards/rejected": 0.5561739802360535, "step": 1356 }, { "epoch": 0.73, "learning_rate": 8.846806619939708e-08, "logits/chosen": -1.9843379259109497, "logits/rejected": -1.9846495389938354, "logps/chosen": -0.4565211534500122, "logps/rejected": -2.493213176727295, "loss": 0.5424, "rewards/accuracies": 1.0, "rewards/chosen": 0.882431149482727, "rewards/margins": 0.328338086605072, "rewards/rejected": 0.554093062877655, "step": 1357 }, { "epoch": 0.73, "learning_rate": 8.844946011446964e-08, "logits/chosen": -2.050483465194702, "logits/rejected": -2.114182233810425, "logps/chosen": -8.111430168151855, "logps/rejected": -17.081087112426758, "loss": 0.4093, "rewards/accuracies": 1.0, "rewards/chosen": 1.2832932472229004, "rewards/margins": 0.6817789673805237, "rewards/rejected": 0.6015142798423767, "step": 1358 }, { "epoch": 0.73, "learning_rate": 8.84308409916591e-08, "logits/chosen": -2.0689141750335693, "logits/rejected": -2.05879807472229, "logps/chosen": -0.6035536527633667, "logps/rejected": -5.322592735290527, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 0.9908134341239929, "rewards/margins": 0.4351416230201721, "rewards/rejected": 0.5556718111038208, "step": 1359 }, { "epoch": 0.73, "learning_rate": 8.841220883727914e-08, "logits/chosen": -2.1163110733032227, "logits/rejected": -2.3151843547821045, "logps/chosen": -0.9626787304878235, "logps/rejected": -0.9834728240966797, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0127506256103516, "rewards/margins": 0.024020254611968994, "rewards/rejected": 0.9887303709983826, "step": 1360 }, { "epoch": 0.73, "learning_rate": 8.839356365764769e-08, "logits/chosen": -2.026087760925293, "logits/rejected": -2.0758144855499268, "logps/chosen": -2.7453227043151855, "logps/rejected": -23.36554527282715, "loss": 0.3648, "rewards/accuracies": 1.0, "rewards/chosen": 1.11190927028656, "rewards/margins": 0.8205573558807373, "rewards/rejected": 0.29135188460350037, "step": 1361 }, { "epoch": 0.73, "learning_rate": 8.837490545908721e-08, "logits/chosen": -1.9301795959472656, "logits/rejected": -1.93812096118927, "logps/chosen": -1.83729887008667, "logps/rejected": -2.891709566116333, "loss": 0.5311, "rewards/accuracies": 1.0, "rewards/chosen": 0.9853200912475586, "rewards/margins": 0.3554862141609192, "rewards/rejected": 0.6298338770866394, "step": 1362 }, { "epoch": 0.74, "learning_rate": 8.835623424792451e-08, "logits/chosen": -2.0223114490509033, "logits/rejected": -2.2585902214050293, "logps/chosen": -0.7827439904212952, "logps/rejected": -0.771685779094696, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7784383296966553, "rewards/margins": 0.023357927799224854, "rewards/rejected": 0.7550804018974304, "step": 1363 }, { "epoch": 0.74, "learning_rate": 8.833755003049087e-08, "logits/chosen": -2.0485033988952637, "logits/rejected": -2.0535390377044678, "logps/chosen": -1.1420527696609497, "logps/rejected": -19.897016525268555, "loss": 0.5754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0490974187850952, "rewards/margins": 0.2513253688812256, "rewards/rejected": 0.7977720499038696, "step": 1364 }, { "epoch": 0.74, "learning_rate": 8.831885281312192e-08, "logits/chosen": -1.985158920288086, "logits/rejected": -2.2247793674468994, "logps/chosen": -0.5629680156707764, "logps/rejected": -0.6585940718650818, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9864634871482849, "rewards/margins": 0.02037346363067627, "rewards/rejected": 0.9660900235176086, "step": 1365 }, { "epoch": 0.74, "learning_rate": 8.830014260215775e-08, "logits/chosen": -2.033165693283081, "logits/rejected": -2.0605955123901367, "logps/chosen": -6.076169013977051, "logps/rejected": -4.705861568450928, "loss": 0.6487, "rewards/accuracies": 1.0, "rewards/chosen": 0.983879566192627, "rewards/margins": 0.0910423994064331, "rewards/rejected": 0.8928371667861938, "step": 1366 }, { "epoch": 0.74, "learning_rate": 8.828141940394282e-08, "logits/chosen": -2.124802827835083, "logits/rejected": -2.132798910140991, "logps/chosen": -2.427553176879883, "logps/rejected": -2.2529616355895996, "loss": 0.5725, "rewards/accuracies": 1.0, "rewards/chosen": 0.9365726709365845, "rewards/margins": 0.257778525352478, "rewards/rejected": 0.6787941455841064, "step": 1367 }, { "epoch": 0.74, "learning_rate": 8.8262683224826e-08, "logits/chosen": -2.009052038192749, "logits/rejected": -2.2712111473083496, "logps/chosen": -7.269491195678711, "logps/rejected": -6.696354389190674, "loss": 0.7081, "rewards/accuracies": 0.0, "rewards/chosen": 0.5239424109458923, "rewards/margins": -0.029744267463684082, "rewards/rejected": 0.5536866784095764, "step": 1368 }, { "epoch": 0.74, "learning_rate": 8.824393407116057e-08, "logits/chosen": -2.0351955890655518, "logits/rejected": -2.019524335861206, "logps/chosen": -15.892619132995605, "logps/rejected": -4.574108123779297, "loss": 0.6272, "rewards/accuracies": 1.0, "rewards/chosen": 1.1929023265838623, "rewards/margins": 0.13658463954925537, "rewards/rejected": 1.056317687034607, "step": 1369 }, { "epoch": 0.74, "learning_rate": 8.822517194930422e-08, "logits/chosen": -1.9785736799240112, "logits/rejected": -1.9750499725341797, "logps/chosen": -3.2515573501586914, "logps/rejected": -6.644369125366211, "loss": 0.5423, "rewards/accuracies": 1.0, "rewards/chosen": 0.9655806422233582, "rewards/margins": 0.3284494876861572, "rewards/rejected": 0.6371311545372009, "step": 1370 }, { "epoch": 0.74, "learning_rate": 8.8206396865619e-08, "logits/chosen": -2.148771286010742, "logits/rejected": -2.3023290634155273, "logps/chosen": -2.2702527046203613, "logps/rejected": -2.2654662132263184, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.9527153372764587, "rewards/margins": -0.01116865873336792, "rewards/rejected": 0.9638839960098267, "step": 1371 }, { "epoch": 0.74, "learning_rate": 8.818760882647141e-08, "logits/chosen": -1.9742640256881714, "logits/rejected": -2.243849992752075, "logps/chosen": -1.6418688297271729, "logps/rejected": -1.6072945594787598, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.7839599847793579, "rewards/margins": -0.01200193166732788, "rewards/rejected": 0.7959619164466858, "step": 1372 }, { "epoch": 0.74, "learning_rate": 8.81688078382323e-08, "logits/chosen": -2.07159686088562, "logits/rejected": -2.072695016860962, "logps/chosen": -3.1426913738250732, "logps/rejected": -3.8932621479034424, "loss": 0.3286, "rewards/accuracies": 1.0, "rewards/chosen": 1.4773459434509277, "rewards/margins": 0.9440012574195862, "rewards/rejected": 0.5333446860313416, "step": 1373 }, { "epoch": 0.74, "learning_rate": 8.814999390727693e-08, "logits/chosen": -2.0355820655822754, "logits/rejected": -2.036836624145508, "logps/chosen": -2.4600179195404053, "logps/rejected": -1.819304347038269, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.1183414459228516, "rewards/margins": 0.0095747709274292, "rewards/rejected": 1.1087666749954224, "step": 1374 }, { "epoch": 0.74, "learning_rate": 8.813116703998494e-08, "logits/chosen": -2.1965439319610596, "logits/rejected": -2.2108802795410156, "logps/chosen": -5.297628879547119, "logps/rejected": -10.147261619567871, "loss": 0.5216, "rewards/accuracies": 1.0, "rewards/chosen": 1.1922883987426758, "rewards/margins": 0.378742516040802, "rewards/rejected": 0.8135458827018738, "step": 1375 }, { "epoch": 0.74, "learning_rate": 8.811232724274033e-08, "logits/chosen": -2.0003433227539062, "logits/rejected": -2.0827038288116455, "logps/chosen": -2.991889476776123, "logps/rejected": -23.88561248779297, "loss": 0.5402, "rewards/accuracies": 1.0, "rewards/chosen": 0.7751976847648621, "rewards/margins": 0.33353927731513977, "rewards/rejected": 0.4416584074497223, "step": 1376 }, { "epoch": 0.74, "learning_rate": 8.809347452193157e-08, "logits/chosen": -1.991969347000122, "logits/rejected": -1.991672158241272, "logps/chosen": -1.2274550199508667, "logps/rejected": -1.2705895900726318, "loss": 0.6347, "rewards/accuracies": 1.0, "rewards/chosen": 0.9837027788162231, "rewards/margins": 0.12054938077926636, "rewards/rejected": 0.8631533980369568, "step": 1377 }, { "epoch": 0.74, "learning_rate": 8.807460888395141e-08, "logits/chosen": -2.016037940979004, "logits/rejected": -2.0167593955993652, "logps/chosen": -1.9392366409301758, "logps/rejected": -1.5721824169158936, "loss": 0.6408, "rewards/accuracies": 1.0, "rewards/chosen": 1.0242199897766113, "rewards/margins": 0.10759323835372925, "rewards/rejected": 0.9166267514228821, "step": 1378 }, { "epoch": 0.74, "learning_rate": 8.805573033519707e-08, "logits/chosen": -1.96638822555542, "logits/rejected": -2.2546839714050293, "logps/chosen": -3.337346076965332, "logps/rejected": -6.929727554321289, "loss": 0.5613, "rewards/accuracies": 1.0, "rewards/chosen": 0.7375855445861816, "rewards/margins": 0.28385305404663086, "rewards/rejected": 0.4537324905395508, "step": 1379 }, { "epoch": 0.74, "learning_rate": 8.803683888207006e-08, "logits/chosen": -2.154202938079834, "logits/rejected": -2.149580240249634, "logps/chosen": -7.474222660064697, "logps/rejected": -4.263814449310303, "loss": 0.411, "rewards/accuracies": 1.0, "rewards/chosen": 1.2208337783813477, "rewards/margins": 0.6766423583030701, "rewards/rejected": 0.5441914200782776, "step": 1380 }, { "epoch": 0.74, "learning_rate": 8.801793453097634e-08, "logits/chosen": -2.0680088996887207, "logits/rejected": -2.0722291469573975, "logps/chosen": -1.8181620836257935, "logps/rejected": -2.512019395828247, "loss": 0.4802, "rewards/accuracies": 1.0, "rewards/chosen": 1.1487157344818115, "rewards/margins": 0.4837854504585266, "rewards/rejected": 0.6649302840232849, "step": 1381 }, { "epoch": 0.75, "learning_rate": 8.799901728832619e-08, "logits/chosen": -2.1429519653320312, "logits/rejected": -2.271862745285034, "logps/chosen": -18.77965545654297, "logps/rejected": -12.546428680419922, "loss": 0.5741, "rewards/accuracies": 1.0, "rewards/chosen": 0.7525873184204102, "rewards/margins": 0.2541843354701996, "rewards/rejected": 0.49840298295021057, "step": 1382 }, { "epoch": 0.75, "learning_rate": 8.79800871605343e-08, "logits/chosen": -2.0256006717681885, "logits/rejected": -2.286064386367798, "logps/chosen": -2.9796323776245117, "logps/rejected": -2.857084274291992, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8547369837760925, "rewards/margins": 0.016506731510162354, "rewards/rejected": 0.8382302522659302, "step": 1383 }, { "epoch": 0.75, "learning_rate": 8.796114415401972e-08, "logits/chosen": -2.2928640842437744, "logits/rejected": -2.285677909851074, "logps/chosen": -1.1436524391174316, "logps/rejected": -0.6204984188079834, "loss": 0.7078, "rewards/accuracies": 0.0, "rewards/chosen": 0.7296642065048218, "rewards/margins": -0.02910923957824707, "rewards/rejected": 0.7587734460830688, "step": 1384 }, { "epoch": 0.75, "learning_rate": 8.794218827520585e-08, "logits/chosen": -2.1090500354766846, "logits/rejected": -2.109830617904663, "logps/chosen": -4.041155815124512, "logps/rejected": -9.693819999694824, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 1.2165769338607788, "rewards/margins": 0.5866619944572449, "rewards/rejected": 0.6299149394035339, "step": 1385 }, { "epoch": 0.75, "learning_rate": 8.792321953052046e-08, "logits/chosen": -2.214381456375122, "logits/rejected": -2.2321267127990723, "logps/chosen": -3.9323973655700684, "logps/rejected": -7.641359329223633, "loss": 0.4114, "rewards/accuracies": 1.0, "rewards/chosen": 1.2674273252487183, "rewards/margins": 0.6755363941192627, "rewards/rejected": 0.5918909311294556, "step": 1386 }, { "epoch": 0.75, "learning_rate": 8.790423792639572e-08, "logits/chosen": -2.1806745529174805, "logits/rejected": -2.1802523136138916, "logps/chosen": -2.7926814556121826, "logps/rejected": -4.747395992279053, "loss": 0.4929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9884708523750305, "rewards/margins": 0.4508854150772095, "rewards/rejected": 0.537585437297821, "step": 1387 }, { "epoch": 0.75, "learning_rate": 8.78852434692681e-08, "logits/chosen": -2.0327446460723877, "logits/rejected": -2.2967238426208496, "logps/chosen": -1.1804531812667847, "logps/rejected": -1.0388001203536987, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.788210391998291, "rewards/margins": 0.006770789623260498, "rewards/rejected": 0.7814396023750305, "step": 1388 }, { "epoch": 0.75, "learning_rate": 8.786623616557846e-08, "logits/chosen": -1.9827176332473755, "logits/rejected": -1.9787760972976685, "logps/chosen": -3.9598255157470703, "logps/rejected": -2.95743989944458, "loss": 0.3937, "rewards/accuracies": 1.0, "rewards/chosen": 1.370829463005066, "rewards/margins": 0.728919506072998, "rewards/rejected": 0.6419099569320679, "step": 1389 }, { "epoch": 0.75, "learning_rate": 8.784721602177202e-08, "logits/chosen": -2.194889783859253, "logits/rejected": -2.1899728775024414, "logps/chosen": -6.596125602722168, "logps/rejected": -2.9714417457580566, "loss": 0.5292, "rewards/accuracies": 1.0, "rewards/chosen": 0.9118308424949646, "rewards/margins": 0.36004120111465454, "rewards/rejected": 0.5517896413803101, "step": 1390 }, { "epoch": 0.75, "learning_rate": 8.782818304429838e-08, "logits/chosen": -1.9885746240615845, "logits/rejected": -1.9799444675445557, "logps/chosen": -6.379818916320801, "logps/rejected": -5.032089710235596, "loss": 0.4826, "rewards/accuracies": 1.0, "rewards/chosen": 1.040269374847412, "rewards/margins": 0.4776880741119385, "rewards/rejected": 0.5625813007354736, "step": 1391 }, { "epoch": 0.75, "learning_rate": 8.78091372396114e-08, "logits/chosen": -2.1583642959594727, "logits/rejected": -2.22497820854187, "logps/chosen": -2.6703553199768066, "logps/rejected": -2.7101778984069824, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9274064898490906, "rewards/margins": 0.015119731426239014, "rewards/rejected": 0.9122867584228516, "step": 1392 }, { "epoch": 0.75, "learning_rate": 8.779007861416939e-08, "logits/chosen": -2.165961742401123, "logits/rejected": -2.2354612350463867, "logps/chosen": -2.7263071537017822, "logps/rejected": -2.5410826206207275, "loss": 0.6963, "rewards/accuracies": 0.0, "rewards/chosen": 0.7733233571052551, "rewards/margins": -0.006314098834991455, "rewards/rejected": 0.7796374559402466, "step": 1393 }, { "epoch": 0.75, "learning_rate": 8.777100717443495e-08, "logits/chosen": -2.1871964931488037, "logits/rejected": -2.065661668777466, "logps/chosen": -38.113365173339844, "logps/rejected": -4.050978660583496, "loss": 0.3666, "rewards/accuracies": 1.0, "rewards/chosen": 1.4659172296524048, "rewards/margins": 0.8147335052490234, "rewards/rejected": 0.6511837244033813, "step": 1394 }, { "epoch": 0.75, "learning_rate": 8.775192292687504e-08, "logits/chosen": -2.1280088424682617, "logits/rejected": -2.230524778366089, "logps/chosen": -1.5199000835418701, "logps/rejected": -1.316685438156128, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 1.0012781620025635, "rewards/margins": -0.0095900297164917, "rewards/rejected": 1.0108681917190552, "step": 1395 }, { "epoch": 0.75, "learning_rate": 8.773282587796097e-08, "logits/chosen": -1.9919638633728027, "logits/rejected": -2.241403579711914, "logps/chosen": -0.7245022058486938, "logps/rejected": -0.7185314893722534, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.945960521697998, "rewards/margins": 0.018217086791992188, "rewards/rejected": 0.9277434349060059, "step": 1396 }, { "epoch": 0.75, "learning_rate": 8.771371603416841e-08, "logits/chosen": -2.0360870361328125, "logits/rejected": -2.037661075592041, "logps/chosen": -1.5068817138671875, "logps/rejected": -1.7994911670684814, "loss": 0.5414, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418013334274292, "rewards/margins": 0.3307681679725647, "rewards/rejected": 0.7110331654548645, "step": 1397 }, { "epoch": 0.75, "learning_rate": 8.769459340197731e-08, "logits/chosen": -2.0876386165618896, "logits/rejected": -2.2646641731262207, "logps/chosen": -3.436990737915039, "logps/rejected": -3.5533862113952637, "loss": 0.7053, "rewards/accuracies": 0.0, "rewards/chosen": 0.7389822006225586, "rewards/margins": -0.024180471897125244, "rewards/rejected": 0.7631626725196838, "step": 1398 }, { "epoch": 0.75, "learning_rate": 8.767545798787199e-08, "logits/chosen": -2.1854355335235596, "logits/rejected": -2.1749472618103027, "logps/chosen": -12.698844909667969, "logps/rejected": -2.5174221992492676, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.3170788288116455, "rewards/margins": 0.27582061290740967, "rewards/rejected": 1.0412582159042358, "step": 1399 }, { "epoch": 0.76, "learning_rate": 8.765630979834115e-08, "logits/chosen": -2.0637192726135254, "logits/rejected": -2.266339063644409, "logps/chosen": -0.551750659942627, "logps/rejected": -0.6020761132240295, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.840789794921875, "rewards/margins": 0.02240091562271118, "rewards/rejected": 0.8183888792991638, "step": 1400 }, { "epoch": 0.76, "learning_rate": 8.763714883987773e-08, "logits/chosen": -1.992990255355835, "logits/rejected": -1.9916155338287354, "logps/chosen": -1.5831184387207031, "logps/rejected": -2.7496564388275146, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 0.9486385583877563, "rewards/margins": 0.3834074139595032, "rewards/rejected": 0.5652311444282532, "step": 1401 }, { "epoch": 0.76, "learning_rate": 8.761797511897906e-08, "logits/chosen": -1.9783986806869507, "logits/rejected": -1.987470269203186, "logps/chosen": -8.301820755004883, "logps/rejected": -1.846892237663269, "loss": 0.7494, "rewards/accuracies": 0.0, "rewards/chosen": 1.0029855966567993, "rewards/margins": -0.10945296287536621, "rewards/rejected": 1.1124385595321655, "step": 1402 }, { "epoch": 0.76, "learning_rate": 8.75987886421468e-08, "logits/chosen": -2.1156158447265625, "logits/rejected": -2.1119863986968994, "logps/chosen": -4.780673980712891, "logps/rejected": -7.521662712097168, "loss": 0.6003, "rewards/accuracies": 1.0, "rewards/chosen": 0.9111785888671875, "rewards/margins": 0.19523972272872925, "rewards/rejected": 0.7159388661384583, "step": 1403 }, { "epoch": 0.76, "learning_rate": 8.75795894158869e-08, "logits/chosen": -2.0714282989501953, "logits/rejected": -2.253979444503784, "logps/chosen": -0.5520952343940735, "logps/rejected": -0.6154201030731201, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.8940181732177734, "rewards/margins": 0.025045931339263916, "rewards/rejected": 0.8689722418785095, "step": 1404 }, { "epoch": 0.76, "learning_rate": 8.756037744670965e-08, "logits/chosen": -2.0972537994384766, "logits/rejected": -2.2661077976226807, "logps/chosen": -0.8153191804885864, "logps/rejected": -0.8071233630180359, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8154123425483704, "rewards/margins": 0.019253075122833252, "rewards/rejected": 0.7961592674255371, "step": 1405 }, { "epoch": 0.76, "learning_rate": 8.75411527411297e-08, "logits/chosen": -2.0567069053649902, "logits/rejected": -2.05363392829895, "logps/chosen": -3.7286946773529053, "logps/rejected": -3.242306709289551, "loss": 0.568, "rewards/accuracies": 1.0, "rewards/chosen": 1.0744577646255493, "rewards/margins": 0.2681216597557068, "rewards/rejected": 0.8063361048698425, "step": 1406 }, { "epoch": 0.76, "learning_rate": 8.752191530566595e-08, "logits/chosen": -2.09854793548584, "logits/rejected": -2.073654890060425, "logps/chosen": -31.434356689453125, "logps/rejected": -17.969371795654297, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 1.355303168296814, "rewards/margins": 0.5834153890609741, "rewards/rejected": 0.7718877792358398, "step": 1407 }, { "epoch": 0.76, "learning_rate": 8.750266514684166e-08, "logits/chosen": -2.180901050567627, "logits/rejected": -2.053023099899292, "logps/chosen": -52.648170471191406, "logps/rejected": -3.1772284507751465, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 1.8038994073867798, "rewards/margins": 1.1017003059387207, "rewards/rejected": 0.7021990418434143, "step": 1408 }, { "epoch": 0.76, "learning_rate": 8.74834022711844e-08, "logits/chosen": -2.080209970474243, "logits/rejected": -2.08557391166687, "logps/chosen": -2.051279067993164, "logps/rejected": -3.15476393699646, "loss": 0.501, "rewards/accuracies": 1.0, "rewards/chosen": 1.048736572265625, "rewards/margins": 0.4303341507911682, "rewards/rejected": 0.6184024214744568, "step": 1409 }, { "epoch": 0.76, "learning_rate": 8.746412668522602e-08, "logits/chosen": -2.1180360317230225, "logits/rejected": -2.1190264225006104, "logps/chosen": -0.4048271179199219, "logps/rejected": -3.4872496128082275, "loss": 0.5069, "rewards/accuracies": 1.0, "rewards/chosen": 0.9862769246101379, "rewards/margins": 0.41531479358673096, "rewards/rejected": 0.570962131023407, "step": 1410 }, { "epoch": 0.76, "learning_rate": 8.744483839550275e-08, "logits/chosen": -2.275430917739868, "logits/rejected": -2.333160877227783, "logps/chosen": -0.5654368996620178, "logps/rejected": -0.5686285495758057, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.937407910823822, "rewards/margins": 0.003936052322387695, "rewards/rejected": 0.9334718585014343, "step": 1411 }, { "epoch": 0.76, "learning_rate": 8.742553740855505e-08, "logits/chosen": -2.2155113220214844, "logits/rejected": -2.1098968982696533, "logps/chosen": -44.07331466674805, "logps/rejected": -3.93733811378479, "loss": 0.385, "rewards/accuracies": 1.0, "rewards/chosen": 1.6118758916854858, "rewards/margins": 0.7559600472450256, "rewards/rejected": 0.8559158444404602, "step": 1412 }, { "epoch": 0.76, "learning_rate": 8.740622373092774e-08, "logits/chosen": -2.0598983764648438, "logits/rejected": -2.059298276901245, "logps/chosen": -2.4341280460357666, "logps/rejected": -4.001280307769775, "loss": 0.5371, "rewards/accuracies": 1.0, "rewards/chosen": 0.8909503817558289, "rewards/margins": 0.34096527099609375, "rewards/rejected": 0.5499851107597351, "step": 1413 }, { "epoch": 0.76, "learning_rate": 8.73868973691699e-08, "logits/chosen": -2.1028811931610107, "logits/rejected": -2.2511157989501953, "logps/chosen": -2.900874614715576, "logps/rejected": -4.3263163566589355, "loss": 0.5975, "rewards/accuracies": 1.0, "rewards/chosen": 0.8812532424926758, "rewards/margins": 0.2013322114944458, "rewards/rejected": 0.67992103099823, "step": 1414 }, { "epoch": 0.76, "learning_rate": 8.736755832983495e-08, "logits/chosen": -1.9928585290908813, "logits/rejected": -2.2601547241210938, "logps/chosen": -1.667339563369751, "logps/rejected": -1.5670466423034668, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9548342823982239, "rewards/margins": 0.0056937336921691895, "rewards/rejected": 0.9491405487060547, "step": 1415 }, { "epoch": 0.76, "learning_rate": 8.734820661948059e-08, "logits/chosen": -2.1230051517486572, "logits/rejected": -2.2521958351135254, "logps/chosen": -0.7034559845924377, "logps/rejected": -0.7458905577659607, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.8765891194343567, "rewards/margins": 0.00802069902420044, "rewards/rejected": 0.8685684204101562, "step": 1416 }, { "epoch": 0.76, "learning_rate": 8.732884224466883e-08, "logits/chosen": -2.062969446182251, "logits/rejected": -2.282912254333496, "logps/chosen": -1.553382158279419, "logps/rejected": -1.5629215240478516, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.7466568350791931, "rewards/margins": 0.007005929946899414, "rewards/rejected": 0.7396509051322937, "step": 1417 }, { "epoch": 0.76, "learning_rate": 8.730946521196594e-08, "logits/chosen": -2.1347529888153076, "logits/rejected": -2.0261502265930176, "logps/chosen": -11.330669403076172, "logps/rejected": -2.515639305114746, "loss": 0.3735, "rewards/accuracies": 1.0, "rewards/chosen": 1.5547631978988647, "rewards/margins": 0.7924385666847229, "rewards/rejected": 0.7623246312141418, "step": 1418 }, { "epoch": 0.77, "learning_rate": 8.729007552794252e-08, "logits/chosen": -2.1264524459838867, "logits/rejected": -2.132596731185913, "logps/chosen": -2.2789881229400635, "logps/rejected": -3.384039878845215, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": 1.2874072790145874, "rewards/margins": 0.5946579575538635, "rewards/rejected": 0.6927493214607239, "step": 1419 }, { "epoch": 0.77, "learning_rate": 8.727067319917345e-08, "logits/chosen": -1.9898252487182617, "logits/rejected": -2.2481374740600586, "logps/chosen": -5.272712230682373, "logps/rejected": -5.243740558624268, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7671205997467041, "rewards/margins": 0.015045166015625, "rewards/rejected": 0.7520754337310791, "step": 1420 }, { "epoch": 0.77, "learning_rate": 8.725125823223788e-08, "logits/chosen": -2.082453489303589, "logits/rejected": -2.2157673835754395, "logps/chosen": -1.5338611602783203, "logps/rejected": -1.4749977588653564, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9131292700767517, "rewards/margins": 0.012539982795715332, "rewards/rejected": 0.9005892872810364, "step": 1421 }, { "epoch": 0.77, "learning_rate": 8.723183063371927e-08, "logits/chosen": -2.002995014190674, "logits/rejected": -2.015329122543335, "logps/chosen": -3.4946837425231934, "logps/rejected": -1.8739571571350098, "loss": 0.5705, "rewards/accuracies": 1.0, "rewards/chosen": 0.8736332058906555, "rewards/margins": 0.2623746395111084, "rewards/rejected": 0.6112585663795471, "step": 1422 }, { "epoch": 0.77, "learning_rate": 8.721239041020536e-08, "logits/chosen": -1.9801826477050781, "logits/rejected": -1.920605182647705, "logps/chosen": -21.222576141357422, "logps/rejected": -1.678046464920044, "loss": 0.5345, "rewards/accuracies": 1.0, "rewards/chosen": 1.2371082305908203, "rewards/margins": 0.3473677635192871, "rewards/rejected": 0.8897404670715332, "step": 1423 }, { "epoch": 0.77, "learning_rate": 8.719293756828814e-08, "logits/chosen": -2.211294412612915, "logits/rejected": -2.3806405067443848, "logps/chosen": -11.117271423339844, "logps/rejected": -11.250591278076172, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0246349573135376, "rewards/margins": 0.049694061279296875, "rewards/rejected": 0.9749408960342407, "step": 1424 }, { "epoch": 0.77, "learning_rate": 8.717347211456393e-08, "logits/chosen": -2.0235750675201416, "logits/rejected": -2.014178991317749, "logps/chosen": -5.746757507324219, "logps/rejected": -1.9142987728118896, "loss": 0.436, "rewards/accuracies": 1.0, "rewards/chosen": 1.4903011322021484, "rewards/margins": 0.6041064858436584, "rewards/rejected": 0.88619464635849, "step": 1425 }, { "epoch": 0.77, "learning_rate": 8.715399405563329e-08, "logits/chosen": -2.1381890773773193, "logits/rejected": -2.325232982635498, "logps/chosen": -0.7118072509765625, "logps/rejected": -18.060943603515625, "loss": 0.6293, "rewards/accuracies": 1.0, "rewards/chosen": 0.8120208978652954, "rewards/margins": 0.1319555640220642, "rewards/rejected": 0.6800653338432312, "step": 1426 }, { "epoch": 0.77, "learning_rate": 8.713450339810104e-08, "logits/chosen": -2.0372114181518555, "logits/rejected": -2.0442259311676025, "logps/chosen": -0.84828782081604, "logps/rejected": -5.9902191162109375, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9835090637207031, "rewards/margins": 0.5362833738327026, "rewards/rejected": 0.4472256600856781, "step": 1427 }, { "epoch": 0.77, "learning_rate": 8.711500014857634e-08, "logits/chosen": -2.0079345703125, "logits/rejected": -2.3051509857177734, "logps/chosen": -0.45223721861839294, "logps/rejected": -0.5698856711387634, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.9641566276550293, "rewards/margins": 0.015820562839508057, "rewards/rejected": 0.9483360648155212, "step": 1428 }, { "epoch": 0.77, "learning_rate": 8.709548431367254e-08, "logits/chosen": -1.9811021089553833, "logits/rejected": -1.9813553094863892, "logps/chosen": -0.5165772438049316, "logps/rejected": -6.120936393737793, "loss": 0.4377, "rewards/accuracies": 1.0, "rewards/chosen": 1.0304352045059204, "rewards/margins": 0.5993804931640625, "rewards/rejected": 0.4310546815395355, "step": 1429 }, { "epoch": 0.77, "learning_rate": 8.707595590000728e-08, "logits/chosen": -2.1136364936828613, "logits/rejected": -2.118180513381958, "logps/chosen": -4.7184529304504395, "logps/rejected": -0.4970765709877014, "loss": 0.5835, "rewards/accuracies": 1.0, "rewards/chosen": 1.1963409185409546, "rewards/margins": 0.23283803462982178, "rewards/rejected": 0.9635028839111328, "step": 1430 }, { "epoch": 0.77, "learning_rate": 8.705641491420252e-08, "logits/chosen": -2.087156295776367, "logits/rejected": -2.1006979942321777, "logps/chosen": -4.961414813995361, "logps/rejected": -3.420219898223877, "loss": 0.56, "rewards/accuracies": 1.0, "rewards/chosen": 1.1022411584854126, "rewards/margins": 0.28685474395751953, "rewards/rejected": 0.8153864145278931, "step": 1431 }, { "epoch": 0.77, "learning_rate": 8.703686136288441e-08, "logits/chosen": -1.9990657567977905, "logits/rejected": -1.9715372323989868, "logps/chosen": -10.20844841003418, "logps/rejected": -5.322857856750488, "loss": 0.4831, "rewards/accuracies": 1.0, "rewards/chosen": 1.238921046257019, "rewards/margins": 0.4763829708099365, "rewards/rejected": 0.7625380754470825, "step": 1432 }, { "epoch": 0.77, "learning_rate": 8.70172952526834e-08, "logits/chosen": -2.041449785232544, "logits/rejected": -2.293663263320923, "logps/chosen": -0.8272759318351746, "logps/rejected": -0.8002216815948486, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8960264325141907, "rewards/margins": 0.02297133207321167, "rewards/rejected": 0.873055100440979, "step": 1433 }, { "epoch": 0.77, "learning_rate": 8.699771659023421e-08, "logits/chosen": -2.034205198287964, "logits/rejected": -2.1441023349761963, "logps/chosen": -0.7883018851280212, "logps/rejected": -21.877851486206055, "loss": 0.3425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9461675882339478, "rewards/margins": 0.8952614068984985, "rewards/rejected": 0.05090618133544922, "step": 1434 }, { "epoch": 0.77, "learning_rate": 8.697812538217576e-08, "logits/chosen": -2.311204671859741, "logits/rejected": -2.1989195346832275, "logps/chosen": -37.17417907714844, "logps/rejected": -2.7795629501342773, "loss": 0.4132, "rewards/accuracies": 1.0, "rewards/chosen": 1.4356590509414673, "rewards/margins": 0.6702621579170227, "rewards/rejected": 0.7653968930244446, "step": 1435 }, { "epoch": 0.77, "learning_rate": 8.69585216351513e-08, "logits/chosen": -2.146329164505005, "logits/rejected": -2.1466729640960693, "logps/chosen": -6.955021858215332, "logps/rejected": -10.307411193847656, "loss": 0.2543, "rewards/accuracies": 1.0, "rewards/chosen": 1.6538066864013672, "rewards/margins": 1.2395296096801758, "rewards/rejected": 0.4142770767211914, "step": 1436 }, { "epoch": 0.78, "learning_rate": 8.693890535580825e-08, "logits/chosen": -2.1025147438049316, "logits/rejected": -2.234607458114624, "logps/chosen": -3.5785911083221436, "logps/rejected": -4.45356559753418, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.7395079731941223, "rewards/margins": -0.011495888233184814, "rewards/rejected": 0.7510038614273071, "step": 1437 }, { "epoch": 0.78, "learning_rate": 8.691927655079838e-08, "logits/chosen": -2.0083465576171875, "logits/rejected": -2.2559967041015625, "logps/chosen": -1.7780659198760986, "logps/rejected": -1.8452883958816528, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8548765182495117, "rewards/margins": -0.004561066627502441, "rewards/rejected": 0.8594375848770142, "step": 1438 }, { "epoch": 0.78, "learning_rate": 8.689963522677759e-08, "logits/chosen": -2.0532984733581543, "logits/rejected": -2.3320367336273193, "logps/chosen": -1.8952747583389282, "logps/rejected": -19.237056732177734, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663603901863098, "rewards/margins": 0.8750128746032715, "rewards/rejected": 0.09134750813245773, "step": 1439 }, { "epoch": 0.78, "learning_rate": 8.687998139040613e-08, "logits/chosen": -1.972191572189331, "logits/rejected": -1.975872278213501, "logps/chosen": -2.4852418899536133, "logps/rejected": -3.3893089294433594, "loss": 0.5595, "rewards/accuracies": 1.0, "rewards/chosen": 0.9656817317008972, "rewards/margins": 0.28788983821868896, "rewards/rejected": 0.6777918934822083, "step": 1440 }, { "epoch": 0.78, "learning_rate": 8.686031504834842e-08, "logits/chosen": -2.0243136882781982, "logits/rejected": -2.2443056106567383, "logps/chosen": -1.4101817607879639, "logps/rejected": -1.2997981309890747, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.9327601790428162, "rewards/margins": 0.005805253982543945, "rewards/rejected": 0.9269549250602722, "step": 1441 }, { "epoch": 0.78, "learning_rate": 8.684063620727315e-08, "logits/chosen": -2.0658998489379883, "logits/rejected": -2.0569405555725098, "logps/chosen": -9.391286849975586, "logps/rejected": -5.781535625457764, "loss": 0.3345, "rewards/accuracies": 1.0, "rewards/chosen": 1.4336270093917847, "rewards/margins": 0.9231462478637695, "rewards/rejected": 0.5104807615280151, "step": 1442 }, { "epoch": 0.78, "learning_rate": 8.682094487385327e-08, "logits/chosen": -2.124560832977295, "logits/rejected": -2.2717270851135254, "logps/chosen": -0.7121274471282959, "logps/rejected": -0.7159426808357239, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.9598942995071411, "rewards/margins": -0.0023164749145507812, "rewards/rejected": 0.9622107744216919, "step": 1443 }, { "epoch": 0.78, "learning_rate": 8.680124105476592e-08, "logits/chosen": -2.0240092277526855, "logits/rejected": -2.2192437648773193, "logps/chosen": -7.379704475402832, "logps/rejected": -4.280946731567383, "loss": 0.7079, "rewards/accuracies": 0.0, "rewards/chosen": 0.7859930992126465, "rewards/margins": -0.029253780841827393, "rewards/rejected": 0.8152468800544739, "step": 1444 }, { "epoch": 0.78, "learning_rate": 8.67815247566925e-08, "logits/chosen": -1.9704575538635254, "logits/rejected": -2.272615671157837, "logps/chosen": -0.8380268812179565, "logps/rejected": -0.8247650265693665, "loss": 0.7045, "rewards/accuracies": 0.0, "rewards/chosen": 0.7655431628227234, "rewards/margins": -0.02267688512802124, "rewards/rejected": 0.7882200479507446, "step": 1445 }, { "epoch": 0.78, "learning_rate": 8.676179598631865e-08, "logits/chosen": -2.078094720840454, "logits/rejected": -2.082144021987915, "logps/chosen": -2.840444564819336, "logps/rejected": -4.61815071105957, "loss": 0.4254, "rewards/accuracies": 1.0, "rewards/chosen": 1.1127737760543823, "rewards/margins": 0.6345392465591431, "rewards/rejected": 0.47823449969291687, "step": 1446 }, { "epoch": 0.78, "learning_rate": 8.674205475033422e-08, "logits/chosen": -2.18091082572937, "logits/rejected": -2.2819159030914307, "logps/chosen": -5.121231555938721, "logps/rejected": -1.6522126197814941, "loss": 0.7633, "rewards/accuracies": 0.0, "rewards/chosen": 1.1069908142089844, "rewards/margins": -0.13563668727874756, "rewards/rejected": 1.242627501487732, "step": 1447 }, { "epoch": 0.78, "learning_rate": 8.672230105543328e-08, "logits/chosen": -2.011718511581421, "logits/rejected": -2.0113282203674316, "logps/chosen": -3.0092246532440186, "logps/rejected": -4.178119659423828, "loss": 0.5226, "rewards/accuracies": 1.0, "rewards/chosen": 1.0534263849258423, "rewards/margins": 0.3761877417564392, "rewards/rejected": 0.6772386431694031, "step": 1448 }, { "epoch": 0.78, "learning_rate": 8.670253490831418e-08, "logits/chosen": -2.099574089050293, "logits/rejected": -2.0974085330963135, "logps/chosen": -8.210139274597168, "logps/rejected": -3.7751386165618896, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 1.358641505241394, "rewards/margins": 0.3221261501312256, "rewards/rejected": 1.0365153551101685, "step": 1449 }, { "epoch": 0.78, "learning_rate": 8.66827563156794e-08, "logits/chosen": -2.0315842628479004, "logits/rejected": -2.248915672302246, "logps/chosen": -0.4699033200740814, "logps/rejected": -0.49046769738197327, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.8757006525993347, "rewards/margins": 0.017549753189086914, "rewards/rejected": 0.8581508994102478, "step": 1450 }, { "epoch": 0.78, "learning_rate": 8.666296528423571e-08, "logits/chosen": -1.9294623136520386, "logits/rejected": -2.247860908508301, "logps/chosen": -1.0487449169158936, "logps/rejected": -0.9880080223083496, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8299735188484192, "rewards/margins": 0.031669676303863525, "rewards/rejected": 0.7983038425445557, "step": 1451 }, { "epoch": 0.78, "learning_rate": 8.66431618206941e-08, "logits/chosen": -2.0262980461120605, "logits/rejected": -2.20621657371521, "logps/chosen": -0.8741042017936707, "logps/rejected": -1.0308969020843506, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.7822385430335999, "rewards/margins": -0.00037217140197753906, "rewards/rejected": 0.7826107144355774, "step": 1452 }, { "epoch": 0.78, "learning_rate": 8.662334593176974e-08, "logits/chosen": -2.1035103797912598, "logits/rejected": -2.0918827056884766, "logps/chosen": -7.197021007537842, "logps/rejected": -3.575133800506592, "loss": 0.4111, "rewards/accuracies": 1.0, "rewards/chosen": 1.2240489721298218, "rewards/margins": 0.6763038039207458, "rewards/rejected": 0.5477451682090759, "step": 1453 }, { "epoch": 0.78, "learning_rate": 8.660351762418203e-08, "logits/chosen": -1.9839662313461304, "logits/rejected": -2.21364426612854, "logps/chosen": -0.4662754535675049, "logps/rejected": -0.5234746932983398, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.7727994322776794, "rewards/margins": 0.00873422622680664, "rewards/rejected": 0.7640652060508728, "step": 1454 }, { "epoch": 0.78, "learning_rate": 8.658367690465457e-08, "logits/chosen": -2.0884275436401367, "logits/rejected": -2.2852795124053955, "logps/chosen": -2.407942771911621, "logps/rejected": -7.007408618927002, "loss": 0.5697, "rewards/accuracies": 1.0, "rewards/chosen": 0.6983616948127747, "rewards/margins": 0.2642391622066498, "rewards/rejected": 0.4341225326061249, "step": 1455 }, { "epoch": 0.79, "learning_rate": 8.656382377991518e-08, "logits/chosen": -2.0437612533569336, "logits/rejected": -2.05664324760437, "logps/chosen": -6.56051778793335, "logps/rejected": -9.88823127746582, "loss": 0.2946, "rewards/accuracies": 1.0, "rewards/chosen": 1.590635895729065, "rewards/margins": 1.0711474418640137, "rewards/rejected": 0.519488513469696, "step": 1456 }, { "epoch": 0.79, "learning_rate": 8.65439582566959e-08, "logits/chosen": -2.084789276123047, "logits/rejected": -2.078519582748413, "logps/chosen": -0.9061049818992615, "logps/rejected": -3.6430742740631104, "loss": 0.5986, "rewards/accuracies": 1.0, "rewards/chosen": 0.912544846534729, "rewards/margins": 0.19901341199874878, "rewards/rejected": 0.7135314345359802, "step": 1457 }, { "epoch": 0.79, "learning_rate": 8.652408034173295e-08, "logits/chosen": -2.08364200592041, "logits/rejected": -2.0878524780273438, "logps/chosen": -0.7339478135108948, "logps/rejected": -13.625267028808594, "loss": 0.4494, "rewards/accuracies": 1.0, "rewards/chosen": 0.9272411465644836, "rewards/margins": 0.5667369961738586, "rewards/rejected": 0.360504150390625, "step": 1458 }, { "epoch": 0.79, "learning_rate": 8.650419004176676e-08, "logits/chosen": -2.2290170192718506, "logits/rejected": -2.154385805130005, "logps/chosen": -28.976219177246094, "logps/rejected": -4.111903190612793, "loss": 0.3219, "rewards/accuracies": 1.0, "rewards/chosen": 1.5124553442001343, "rewards/margins": 0.9683096408843994, "rewards/rejected": 0.5441457033157349, "step": 1459 }, { "epoch": 0.79, "learning_rate": 8.648428736354197e-08, "logits/chosen": -2.087622880935669, "logits/rejected": -2.2239482402801514, "logps/chosen": -2.050995111465454, "logps/rejected": -1.4619941711425781, "loss": 0.7327, "rewards/accuracies": 0.0, "rewards/chosen": 0.8541393280029297, "rewards/margins": -0.07759398221969604, "rewards/rejected": 0.9317333102226257, "step": 1460 }, { "epoch": 0.79, "learning_rate": 8.646437231380739e-08, "logits/chosen": -2.135424852371216, "logits/rejected": -2.138903856277466, "logps/chosen": -2.9163358211517334, "logps/rejected": -3.4416093826293945, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 1.2361897230148315, "rewards/margins": 0.6900047659873962, "rewards/rejected": 0.5461849570274353, "step": 1461 }, { "epoch": 0.79, "learning_rate": 8.644444489931605e-08, "logits/chosen": -2.0284535884857178, "logits/rejected": -2.290064573287964, "logps/chosen": -4.073795318603516, "logps/rejected": -2.755532741546631, "loss": 0.7096, "rewards/accuracies": 0.0, "rewards/chosen": 0.8171404004096985, "rewards/margins": -0.03266257047653198, "rewards/rejected": 0.8498029708862305, "step": 1462 }, { "epoch": 0.79, "learning_rate": 8.642450512682517e-08, "logits/chosen": -2.0089378356933594, "logits/rejected": -2.0138766765594482, "logps/chosen": -1.684149146080017, "logps/rejected": -3.6441545486450195, "loss": 0.4537, "rewards/accuracies": 1.0, "rewards/chosen": 0.9108455777168274, "rewards/margins": 0.555016815662384, "rewards/rejected": 0.35582876205444336, "step": 1463 }, { "epoch": 0.79, "learning_rate": 8.640455300309616e-08, "logits/chosen": -2.0082337856292725, "logits/rejected": -2.012355327606201, "logps/chosen": -3.6934409141540527, "logps/rejected": -3.3690876960754395, "loss": 0.4312, "rewards/accuracies": 1.0, "rewards/chosen": 1.2808313369750977, "rewards/margins": 0.6177207231521606, "rewards/rejected": 0.663110613822937, "step": 1464 }, { "epoch": 0.79, "learning_rate": 8.638458853489461e-08, "logits/chosen": -2.1081838607788086, "logits/rejected": -2.102022409439087, "logps/chosen": -5.300889015197754, "logps/rejected": -3.4962375164031982, "loss": 0.3461, "rewards/accuracies": 1.0, "rewards/chosen": 1.367254376411438, "rewards/margins": 0.8831184506416321, "rewards/rejected": 0.4841359257698059, "step": 1465 }, { "epoch": 0.79, "learning_rate": 8.636461172899029e-08, "logits/chosen": -2.1386611461639404, "logits/rejected": -2.3336021900177, "logps/chosen": -2.2714664936065674, "logps/rejected": -0.5315895080566406, "loss": 0.756, "rewards/accuracies": 0.0, "rewards/chosen": 0.8718922734260559, "rewards/margins": -0.12202376127243042, "rewards/rejected": 0.9939160346984863, "step": 1466 }, { "epoch": 0.79, "learning_rate": 8.634462259215718e-08, "logits/chosen": -2.034891366958618, "logits/rejected": -2.255801200866699, "logps/chosen": -1.4207125902175903, "logps/rejected": -1.406290054321289, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285984039306641, "rewards/margins": 0.010110855102539062, "rewards/rejected": 0.918487548828125, "step": 1467 }, { "epoch": 0.79, "learning_rate": 8.632462113117343e-08, "logits/chosen": -2.0884058475494385, "logits/rejected": -2.089172601699829, "logps/chosen": -3.5605180263519287, "logps/rejected": -0.9767487645149231, "loss": 0.6737, "rewards/accuracies": 1.0, "rewards/chosen": 0.7219359278678894, "rewards/margins": 0.03929877281188965, "rewards/rejected": 0.6826371550559998, "step": 1468 }, { "epoch": 0.79, "learning_rate": 8.630460735282133e-08, "logits/chosen": -2.001950740814209, "logits/rejected": -1.9958462715148926, "logps/chosen": -5.894547462463379, "logps/rejected": -1.6186953783035278, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 1.4926040172576904, "rewards/margins": 0.8613613247871399, "rewards/rejected": 0.6312426924705505, "step": 1469 }, { "epoch": 0.79, "learning_rate": 8.628458126388742e-08, "logits/chosen": -2.0853378772735596, "logits/rejected": -2.1822566986083984, "logps/chosen": -1.087203025817871, "logps/rejected": -1.278703212738037, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9081382751464844, "rewards/margins": 0.00460892915725708, "rewards/rejected": 0.9035293459892273, "step": 1470 }, { "epoch": 0.79, "learning_rate": 8.626454287116235e-08, "logits/chosen": -2.1465518474578857, "logits/rejected": -2.2601418495178223, "logps/chosen": -2.1947896480560303, "logps/rejected": -2.015413761138916, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.895213782787323, "rewards/margins": 0.016782641410827637, "rewards/rejected": 0.8784311413764954, "step": 1471 }, { "epoch": 0.79, "learning_rate": 8.624449218144098e-08, "logits/chosen": -2.118000030517578, "logits/rejected": -2.337930917739868, "logps/chosen": -0.9862759709358215, "logps/rejected": -0.9974544644355774, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.7070742249488831, "rewards/margins": 0.024990558624267578, "rewards/rejected": 0.6820836663246155, "step": 1472 }, { "epoch": 0.79, "learning_rate": 8.622442920152233e-08, "logits/chosen": -2.0064921379089355, "logits/rejected": -2.0069613456726074, "logps/chosen": -2.8991029262542725, "logps/rejected": -4.650634765625, "loss": 0.3599, "rewards/accuracies": 1.0, "rewards/chosen": 1.4251480102539062, "rewards/margins": 0.8364331126213074, "rewards/rejected": 0.5887148976325989, "step": 1473 }, { "epoch": 0.8, "learning_rate": 8.620435393820958e-08, "logits/chosen": -2.1688454151153564, "logits/rejected": -2.1658074855804443, "logps/chosen": -7.40460729598999, "logps/rejected": -4.519042491912842, "loss": 0.4045, "rewards/accuracies": 1.0, "rewards/chosen": 1.196223258972168, "rewards/margins": 0.6958962082862854, "rewards/rejected": 0.5003270506858826, "step": 1474 }, { "epoch": 0.8, "learning_rate": 8.618426639831008e-08, "logits/chosen": -2.0735743045806885, "logits/rejected": -2.2282323837280273, "logps/chosen": -1.3274327516555786, "logps/rejected": -1.3315787315368652, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.7000649571418762, "rewards/margins": 0.02668607234954834, "rewards/rejected": 0.6733788847923279, "step": 1475 }, { "epoch": 0.8, "learning_rate": 8.616416658863534e-08, "logits/chosen": -2.0093116760253906, "logits/rejected": -2.2525084018707275, "logps/chosen": -0.463571697473526, "logps/rejected": -0.5580710172653198, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 1.0232536792755127, "rewards/margins": 0.015893936157226562, "rewards/rejected": 1.0073597431182861, "step": 1476 }, { "epoch": 0.8, "learning_rate": 8.614405451600104e-08, "logits/chosen": -2.0862603187561035, "logits/rejected": -2.249105930328369, "logps/chosen": -0.4760742783546448, "logps/rejected": -0.5020257830619812, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7538568377494812, "rewards/margins": 0.017620563507080078, "rewards/rejected": 0.7362362742424011, "step": 1477 }, { "epoch": 0.8, "learning_rate": 8.612393018722699e-08, "logits/chosen": -2.0178704261779785, "logits/rejected": -2.265671491622925, "logps/chosen": -14.522446632385254, "logps/rejected": -8.71523666381836, "loss": 0.7165, "rewards/accuracies": 0.0, "rewards/chosen": 0.6298266649246216, "rewards/margins": -0.046191513538360596, "rewards/rejected": 0.6760181784629822, "step": 1478 }, { "epoch": 0.8, "learning_rate": 8.610379360913722e-08, "logits/chosen": -2.196216344833374, "logits/rejected": -2.1819403171539307, "logps/chosen": -11.52700424194336, "logps/rejected": -5.635226249694824, "loss": 0.6001, "rewards/accuracies": 1.0, "rewards/chosen": 1.1467430591583252, "rewards/margins": 0.19573140144348145, "rewards/rejected": 0.9510116577148438, "step": 1479 }, { "epoch": 0.8, "learning_rate": 8.608364478855983e-08, "logits/chosen": -2.0536506175994873, "logits/rejected": -2.2274672985076904, "logps/chosen": -7.3690924644470215, "logps/rejected": -8.829475402832031, "loss": 0.6094, "rewards/accuracies": 1.0, "rewards/chosen": 0.9499471783638, "rewards/margins": 0.17504960298538208, "rewards/rejected": 0.774897575378418, "step": 1480 }, { "epoch": 0.8, "learning_rate": 8.606348373232714e-08, "logits/chosen": -1.9907763004302979, "logits/rejected": -1.985836148262024, "logps/chosen": -4.2450032234191895, "logps/rejected": -3.305234670639038, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 1.3879879713058472, "rewards/margins": 0.7629884481430054, "rewards/rejected": 0.6249995231628418, "step": 1481 }, { "epoch": 0.8, "learning_rate": 8.604331044727557e-08, "logits/chosen": -2.0970325469970703, "logits/rejected": -2.2458066940307617, "logps/chosen": -0.30655595660209656, "logps/rejected": -0.3293272852897644, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.829403817653656, "rewards/margins": 0.01349252462387085, "rewards/rejected": 0.8159112930297852, "step": 1482 }, { "epoch": 0.8, "learning_rate": 8.602312494024572e-08, "logits/chosen": -2.022883176803589, "logits/rejected": -2.024143695831299, "logps/chosen": -4.278497695922852, "logps/rejected": -0.9880889654159546, "loss": 0.512, "rewards/accuracies": 1.0, "rewards/chosen": 1.3509858846664429, "rewards/margins": 0.4024627208709717, "rewards/rejected": 0.9485231637954712, "step": 1483 }, { "epoch": 0.8, "learning_rate": 8.600292721808235e-08, "logits/chosen": -2.2343711853027344, "logits/rejected": -2.1414101123809814, "logps/chosen": -32.371910095214844, "logps/rejected": -3.2852284908294678, "loss": 0.3678, "rewards/accuracies": 1.0, "rewards/chosen": 1.3453987836837769, "rewards/margins": 0.8106741309165955, "rewards/rejected": 0.5347246527671814, "step": 1484 }, { "epoch": 0.8, "learning_rate": 8.598271728763429e-08, "logits/chosen": -2.1839046478271484, "logits/rejected": -2.2478771209716797, "logps/chosen": -4.224520683288574, "logps/rejected": -12.745134353637695, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 1.2592551708221436, "rewards/margins": 0.4920678734779358, "rewards/rejected": 0.7671872973442078, "step": 1485 }, { "epoch": 0.8, "learning_rate": 8.596249515575456e-08, "logits/chosen": -2.0193541049957275, "logits/rejected": -2.2221930027008057, "logps/chosen": -0.9983518123626709, "logps/rejected": -0.9793733358383179, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.86323082447052, "rewards/margins": 0.027807652950286865, "rewards/rejected": 0.8354231715202332, "step": 1486 }, { "epoch": 0.8, "learning_rate": 8.594226082930035e-08, "logits/chosen": -2.013301372528076, "logits/rejected": -2.018599271774292, "logps/chosen": -0.8373270630836487, "logps/rejected": -3.116227865219116, "loss": 0.5072, "rewards/accuracies": 1.0, "rewards/chosen": 0.9735104441642761, "rewards/margins": 0.4146488904953003, "rewards/rejected": 0.5588615536689758, "step": 1487 }, { "epoch": 0.8, "learning_rate": 8.592201431513288e-08, "logits/chosen": -2.0713255405426025, "logits/rejected": -2.089763641357422, "logps/chosen": -1.3590046167373657, "logps/rejected": -8.222729682922363, "loss": 0.4688, "rewards/accuracies": 1.0, "rewards/chosen": 1.1530406475067139, "rewards/margins": 0.514024555683136, "rewards/rejected": 0.6390160918235779, "step": 1488 }, { "epoch": 0.8, "learning_rate": 8.590175562011766e-08, "logits/chosen": -1.9923070669174194, "logits/rejected": -2.0284738540649414, "logps/chosen": -9.366976737976074, "logps/rejected": -18.954593658447266, "loss": 0.3897, "rewards/accuracies": 1.0, "rewards/chosen": 1.090084195137024, "rewards/margins": 0.741195797920227, "rewards/rejected": 0.3488883972167969, "step": 1489 }, { "epoch": 0.8, "learning_rate": 8.588148475112416e-08, "logits/chosen": -2.057016611099243, "logits/rejected": -2.261796236038208, "logps/chosen": -8.136801719665527, "logps/rejected": -1.3026043176651, "loss": 0.9353, "rewards/accuracies": 0.0, "rewards/chosen": 0.6412798166275024, "rewards/margins": -0.43700528144836426, "rewards/rejected": 1.0782850980758667, "step": 1490 }, { "epoch": 0.8, "learning_rate": 8.586120171502608e-08, "logits/chosen": -1.9927246570587158, "logits/rejected": -1.9987597465515137, "logps/chosen": -2.15041184425354, "logps/rejected": -3.277233600616455, "loss": 0.5297, "rewards/accuracies": 1.0, "rewards/chosen": 0.9242097735404968, "rewards/margins": 0.3588241934776306, "rewards/rejected": 0.5653855800628662, "step": 1491 }, { "epoch": 0.8, "learning_rate": 8.584090651870126e-08, "logits/chosen": -2.0914454460144043, "logits/rejected": -2.230440139770508, "logps/chosen": -0.8611153960227966, "logps/rejected": -5.473180770874023, "loss": 0.7068, "rewards/accuracies": 0.0, "rewards/chosen": 0.7215492129325867, "rewards/margins": -0.027149677276611328, "rewards/rejected": 0.748698890209198, "step": 1492 }, { "epoch": 0.81, "learning_rate": 8.582059916903159e-08, "logits/chosen": -2.003171682357788, "logits/rejected": -1.9964441061019897, "logps/chosen": -4.611106872558594, "logps/rejected": -4.4249067306518555, "loss": 0.4477, "rewards/accuracies": 1.0, "rewards/chosen": 1.250203013420105, "rewards/margins": 0.5715379118919373, "rewards/rejected": 0.6786651015281677, "step": 1493 }, { "epoch": 0.81, "learning_rate": 8.580027967290314e-08, "logits/chosen": -2.0860280990600586, "logits/rejected": -2.271073579788208, "logps/chosen": -2.0555641651153564, "logps/rejected": -1.9081838130950928, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9992216229438782, "rewards/margins": -0.008688032627105713, "rewards/rejected": 1.0079096555709839, "step": 1494 }, { "epoch": 0.81, "learning_rate": 8.577994803720606e-08, "logits/chosen": -2.0491139888763428, "logits/rejected": -2.0538740158081055, "logps/chosen": -2.9541633129119873, "logps/rejected": -11.491437911987305, "loss": 0.4235, "rewards/accuracies": 1.0, "rewards/chosen": 1.0666073560714722, "rewards/margins": 0.640097975730896, "rewards/rejected": 0.42650938034057617, "step": 1495 }, { "epoch": 0.81, "learning_rate": 8.575960426883463e-08, "logits/chosen": -1.977536916732788, "logits/rejected": -2.27593994140625, "logps/chosen": -0.8574438095092773, "logps/rejected": -0.819535493850708, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.7521618008613586, "rewards/margins": 0.02258908748626709, "rewards/rejected": 0.7295727133750916, "step": 1496 }, { "epoch": 0.81, "learning_rate": 8.573924837468727e-08, "logits/chosen": -2.03513503074646, "logits/rejected": -2.0395848751068115, "logps/chosen": -0.36583632230758667, "logps/rejected": -4.796453475952148, "loss": 0.548, "rewards/accuracies": 1.0, "rewards/chosen": 0.9235734343528748, "rewards/margins": 0.31508201360702515, "rewards/rejected": 0.6084914207458496, "step": 1497 }, { "epoch": 0.81, "learning_rate": 8.571888036166646e-08, "logits/chosen": -2.0543079376220703, "logits/rejected": -2.0536911487579346, "logps/chosen": -3.3151400089263916, "logps/rejected": -3.8473589420318604, "loss": 0.3693, "rewards/accuracies": 1.0, "rewards/chosen": 1.37911057472229, "rewards/margins": 0.805669367313385, "rewards/rejected": 0.573441207408905, "step": 1498 }, { "epoch": 0.81, "learning_rate": 8.569850023667886e-08, "logits/chosen": -2.038975238800049, "logits/rejected": -2.3178653717041016, "logps/chosen": -5.238393306732178, "logps/rejected": -5.102468967437744, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 1.0692981481552124, "rewards/margins": -0.010205507278442383, "rewards/rejected": 1.0795036554336548, "step": 1499 }, { "epoch": 0.81, "learning_rate": 8.567810800663517e-08, "logits/chosen": -1.9768441915512085, "logits/rejected": -1.973799705505371, "logps/chosen": -4.734960556030273, "logps/rejected": -1.6989758014678955, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0544134378433228, "rewards/margins": 0.003822922706604004, "rewards/rejected": 1.0505905151367188, "step": 1500 }, { "epoch": 0.81, "learning_rate": 8.565770367845022e-08, "logits/chosen": -2.1185128688812256, "logits/rejected": -2.005446195602417, "logps/chosen": -28.250965118408203, "logps/rejected": -2.7266156673431396, "loss": 0.4747, "rewards/accuracies": 1.0, "rewards/chosen": 1.1221802234649658, "rewards/margins": 0.4982779026031494, "rewards/rejected": 0.6239023208618164, "step": 1501 }, { "epoch": 0.81, "learning_rate": 8.563728725904293e-08, "logits/chosen": -2.044712543487549, "logits/rejected": -2.288163661956787, "logps/chosen": -3.743098020553589, "logps/rejected": -1.2057310342788696, "loss": 0.7932, "rewards/accuracies": 0.0, "rewards/chosen": 0.8728265166282654, "rewards/margins": -0.19101697206497192, "rewards/rejected": 1.0638434886932373, "step": 1502 }, { "epoch": 0.81, "learning_rate": 8.561685875533637e-08, "logits/chosen": -2.076781988143921, "logits/rejected": -2.27228045463562, "logps/chosen": -2.3518714904785156, "logps/rejected": -2.644455909729004, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.8784009218215942, "rewards/margins": 0.01692098379135132, "rewards/rejected": 0.8614799380302429, "step": 1503 }, { "epoch": 0.81, "learning_rate": 8.559641817425764e-08, "logits/chosen": -2.0411620140075684, "logits/rejected": -2.2650067806243896, "logps/chosen": -0.6929004192352295, "logps/rejected": -0.6828538179397583, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.8812641501426697, "rewards/margins": -0.016349494457244873, "rewards/rejected": 0.8976136445999146, "step": 1504 }, { "epoch": 0.81, "learning_rate": 8.5575965522738e-08, "logits/chosen": -2.2357707023620605, "logits/rejected": -2.2456395626068115, "logps/chosen": -5.778040885925293, "logps/rejected": -5.248256683349609, "loss": 0.4535, "rewards/accuracies": 1.0, "rewards/chosen": 1.0560052394866943, "rewards/margins": 0.5553619861602783, "rewards/rejected": 0.500643253326416, "step": 1505 }, { "epoch": 0.81, "learning_rate": 8.555550080771273e-08, "logits/chosen": -2.0783274173736572, "logits/rejected": -2.2758467197418213, "logps/chosen": -1.3540980815887451, "logps/rejected": -1.4039376974105835, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 1.0512927770614624, "rewards/margins": 0.010399341583251953, "rewards/rejected": 1.0408934354782104, "step": 1506 }, { "epoch": 0.81, "learning_rate": 8.553502403612126e-08, "logits/chosen": -2.084385395050049, "logits/rejected": -2.0817878246307373, "logps/chosen": -1.4106661081314087, "logps/rejected": -4.1269941329956055, "loss": 0.5042, "rewards/accuracies": 1.0, "rewards/chosen": 1.0340174436569214, "rewards/margins": 0.42219436168670654, "rewards/rejected": 0.6118230819702148, "step": 1507 }, { "epoch": 0.81, "learning_rate": 8.551453521490708e-08, "logits/chosen": -1.8444569110870361, "logits/rejected": -2.260655403137207, "logps/chosen": -1.6962088346481323, "logps/rejected": -1.9359835386276245, "loss": 0.66, "rewards/accuracies": 1.0, "rewards/chosen": 0.6929364204406738, "rewards/margins": 0.06740140914916992, "rewards/rejected": 0.6255350112915039, "step": 1508 }, { "epoch": 0.81, "learning_rate": 8.549403435101777e-08, "logits/chosen": -2.1719906330108643, "logits/rejected": -2.292874336242676, "logps/chosen": -5.345616817474365, "logps/rejected": -0.4992920756340027, "loss": 0.8049, "rewards/accuracies": 0.0, "rewards/chosen": 0.8184501528739929, "rewards/margins": -0.21218925714492798, "rewards/rejected": 1.030639410018921, "step": 1509 }, { "epoch": 0.81, "learning_rate": 8.547352145140502e-08, "logits/chosen": -2.0450282096862793, "logits/rejected": -2.093319892883301, "logps/chosen": -4.35807991027832, "logps/rejected": -19.023725509643555, "loss": 0.2151, "rewards/accuracies": 1.0, "rewards/chosen": 1.3709009885787964, "rewards/margins": 1.426970362663269, "rewards/rejected": -0.056069374084472656, "step": 1510 }, { "epoch": 0.81, "learning_rate": 8.545299652302456e-08, "logits/chosen": -2.0277960300445557, "logits/rejected": -2.2822184562683105, "logps/chosen": -0.6425192356109619, "logps/rejected": -0.7075401544570923, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.9659662246704102, "rewards/margins": -0.0035266876220703125, "rewards/rejected": 0.9694929122924805, "step": 1511 }, { "epoch": 0.82, "learning_rate": 8.543245957283622e-08, "logits/chosen": -1.998111605644226, "logits/rejected": -2.3126587867736816, "logps/chosen": -3.3773515224456787, "logps/rejected": -3.653688669204712, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9234229922294617, "rewards/margins": -0.0007210969924926758, "rewards/rejected": 0.9241440892219543, "step": 1512 }, { "epoch": 0.82, "learning_rate": 8.541191060780391e-08, "logits/chosen": -2.046074628829956, "logits/rejected": -2.2734837532043457, "logps/chosen": -11.396673202514648, "logps/rejected": -11.091346740722656, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.252720445394516, "rewards/margins": -0.01597091555595398, "rewards/rejected": 0.26869136095046997, "step": 1513 }, { "epoch": 0.82, "learning_rate": 8.53913496348956e-08, "logits/chosen": -2.0580620765686035, "logits/rejected": -2.323246717453003, "logps/chosen": -0.9498355388641357, "logps/rejected": -0.9663276672363281, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.1209219694137573, "rewards/margins": 0.015155673027038574, "rewards/rejected": 1.1057662963867188, "step": 1514 }, { "epoch": 0.82, "learning_rate": 8.537077666108335e-08, "logits/chosen": -2.0959479808807373, "logits/rejected": -2.0954298973083496, "logps/chosen": -1.3033651113510132, "logps/rejected": -3.3171050548553467, "loss": 0.5214, "rewards/accuracies": 1.0, "rewards/chosen": 0.9739915132522583, "rewards/margins": 0.37912750244140625, "rewards/rejected": 0.594864010810852, "step": 1515 }, { "epoch": 0.82, "learning_rate": 8.535019169334328e-08, "logits/chosen": -2.203836679458618, "logits/rejected": -2.1998114585876465, "logps/chosen": -1.6014031171798706, "logps/rejected": -3.9357738494873047, "loss": 0.5283, "rewards/accuracies": 1.0, "rewards/chosen": 1.0021296739578247, "rewards/margins": 0.3623254895210266, "rewards/rejected": 0.6398041844367981, "step": 1516 }, { "epoch": 0.82, "learning_rate": 8.532959473865558e-08, "logits/chosen": -2.1271915435791016, "logits/rejected": -2.122889995574951, "logps/chosen": -3.702843189239502, "logps/rejected": -3.9560885429382324, "loss": 0.5528, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691013693809509, "rewards/margins": 0.3035454750061035, "rewards/rejected": 0.6655558943748474, "step": 1517 }, { "epoch": 0.82, "learning_rate": 8.530898580400447e-08, "logits/chosen": -1.9965505599975586, "logits/rejected": -2.249300479888916, "logps/chosen": -1.7993144989013672, "logps/rejected": -1.459061861038208, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.8589420318603516, "rewards/margins": -0.004696369171142578, "rewards/rejected": 0.8636384010314941, "step": 1518 }, { "epoch": 0.82, "learning_rate": 8.528836489637827e-08, "logits/chosen": -2.117377758026123, "logits/rejected": -2.2944765090942383, "logps/chosen": -7.0187788009643555, "logps/rejected": -6.624732971191406, "loss": 0.7013, "rewards/accuracies": 0.0, "rewards/chosen": 0.47447940707206726, "rewards/margins": -0.016231894493103027, "rewards/rejected": 0.4907113015651703, "step": 1519 }, { "epoch": 0.82, "learning_rate": 8.52677320227694e-08, "logits/chosen": -2.0533041954040527, "logits/rejected": -2.0419421195983887, "logps/chosen": -13.752915382385254, "logps/rejected": -3.251577615737915, "loss": 0.3774, "rewards/accuracies": 1.0, "rewards/chosen": 1.3143523931503296, "rewards/margins": 0.779941737651825, "rewards/rejected": 0.5344106554985046, "step": 1520 }, { "epoch": 0.82, "learning_rate": 8.524708719017426e-08, "logits/chosen": -2.1335623264312744, "logits/rejected": -2.215205669403076, "logps/chosen": -0.8089016675949097, "logps/rejected": -0.8261749744415283, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.7502653002738953, "rewards/margins": 0.013322234153747559, "rewards/rejected": 0.7369430661201477, "step": 1521 }, { "epoch": 0.82, "learning_rate": 8.522643040559332e-08, "logits/chosen": -2.132810354232788, "logits/rejected": -2.1376736164093018, "logps/chosen": -2.675854206085205, "logps/rejected": -2.5910403728485107, "loss": 0.5438, "rewards/accuracies": 1.0, "rewards/chosen": 0.970965564250946, "rewards/margins": 0.3249243497848511, "rewards/rejected": 0.646041214466095, "step": 1522 }, { "epoch": 0.82, "learning_rate": 8.520576167603115e-08, "logits/chosen": -1.9747035503387451, "logits/rejected": -1.9545985460281372, "logps/chosen": -15.268767356872559, "logps/rejected": -6.61430549621582, "loss": 0.4666, "rewards/accuracies": 1.0, "rewards/chosen": 1.2775228023529053, "rewards/margins": 0.5198636651039124, "rewards/rejected": 0.7576591372489929, "step": 1523 }, { "epoch": 0.82, "learning_rate": 8.51850810084963e-08, "logits/chosen": -2.0499260425567627, "logits/rejected": -2.2588934898376465, "logps/chosen": -2.990647315979004, "logps/rejected": -2.609923839569092, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.6789273619651794, "rewards/margins": -0.007797956466674805, "rewards/rejected": 0.6867253184318542, "step": 1524 }, { "epoch": 0.82, "learning_rate": 8.516438841000147e-08, "logits/chosen": -2.042888879776001, "logits/rejected": -2.0408358573913574, "logps/chosen": -0.7282723188400269, "logps/rejected": -1.7074251174926758, "loss": 0.5756, "rewards/accuracies": 1.0, "rewards/chosen": 0.9759117364883423, "rewards/margins": 0.25076866149902344, "rewards/rejected": 0.7251430749893188, "step": 1525 }, { "epoch": 0.82, "learning_rate": 8.514368388756327e-08, "logits/chosen": -2.1124463081359863, "logits/rejected": -2.0897915363311768, "logps/chosen": -17.768836975097656, "logps/rejected": -1.440651297569275, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 1.4161032438278198, "rewards/margins": 0.4635293483734131, "rewards/rejected": 0.9525738954544067, "step": 1526 }, { "epoch": 0.82, "learning_rate": 8.512296744820248e-08, "logits/chosen": -2.1406874656677246, "logits/rejected": -2.2645864486694336, "logps/chosen": -0.4462171792984009, "logps/rejected": -0.4161362051963806, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9186474084854126, "rewards/margins": 0.020522236824035645, "rewards/rejected": 0.898125171661377, "step": 1527 }, { "epoch": 0.82, "learning_rate": 8.510223909894385e-08, "logits/chosen": -2.0720267295837402, "logits/rejected": -2.0734751224517822, "logps/chosen": -4.031334400177002, "logps/rejected": -4.808161735534668, "loss": 0.3829, "rewards/accuracies": 1.0, "rewards/chosen": 1.3689030408859253, "rewards/margins": 0.7624265551567078, "rewards/rejected": 0.6064764857292175, "step": 1528 }, { "epoch": 0.82, "learning_rate": 8.508149884681617e-08, "logits/chosen": -2.075371265411377, "logits/rejected": -2.0757150650024414, "logps/chosen": -0.5006066560745239, "logps/rejected": -3.058370351791382, "loss": 0.4925, "rewards/accuracies": 1.0, "rewards/chosen": 1.0072357654571533, "rewards/margins": 0.4519498944282532, "rewards/rejected": 0.5552858710289001, "step": 1529 }, { "epoch": 0.83, "learning_rate": 8.50607466988523e-08, "logits/chosen": -2.086617946624756, "logits/rejected": -2.105262517929077, "logps/chosen": -6.35752010345459, "logps/rejected": -2.9671926498413086, "loss": 0.5705, "rewards/accuracies": 1.0, "rewards/chosen": 1.0706428289413452, "rewards/margins": 0.2624182105064392, "rewards/rejected": 0.808224618434906, "step": 1530 }, { "epoch": 0.83, "learning_rate": 8.503998266208913e-08, "logits/chosen": -1.9949171543121338, "logits/rejected": -2.2326905727386475, "logps/chosen": -2.5470633506774902, "logps/rejected": -2.4850945472717285, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.6726413369178772, "rewards/margins": 0.04255259037017822, "rewards/rejected": 0.630088746547699, "step": 1531 }, { "epoch": 0.83, "learning_rate": 8.501920674356754e-08, "logits/chosen": -2.0671751499176025, "logits/rejected": -2.0638697147369385, "logps/chosen": -3.4106338024139404, "logps/rejected": -3.9277775287628174, "loss": 0.5956, "rewards/accuracies": 1.0, "rewards/chosen": 0.8941184878349304, "rewards/margins": 0.20569026470184326, "rewards/rejected": 0.6884282231330872, "step": 1532 }, { "epoch": 0.83, "learning_rate": 8.499841895033247e-08, "logits/chosen": -2.105088949203491, "logits/rejected": -2.3039369583129883, "logps/chosen": -0.9298652410507202, "logps/rejected": -0.9951446056365967, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 1.0033807754516602, "rewards/margins": 0.002622246742248535, "rewards/rejected": 1.0007585287094116, "step": 1533 }, { "epoch": 0.83, "learning_rate": 8.49776192894329e-08, "logits/chosen": -2.192228317260742, "logits/rejected": -2.1893298625946045, "logps/chosen": -3.5339131355285645, "logps/rejected": -3.9586822986602783, "loss": 0.539, "rewards/accuracies": 1.0, "rewards/chosen": 0.9535368084907532, "rewards/margins": 0.33639639616012573, "rewards/rejected": 0.6171404123306274, "step": 1534 }, { "epoch": 0.83, "learning_rate": 8.49568077679218e-08, "logits/chosen": -2.1834187507629395, "logits/rejected": -2.0124144554138184, "logps/chosen": -29.6044864654541, "logps/rejected": -3.5455570220947266, "loss": 0.329, "rewards/accuracies": 1.0, "rewards/chosen": 1.5945703983306885, "rewards/margins": 0.9427154660224915, "rewards/rejected": 0.651854932308197, "step": 1535 }, { "epoch": 0.83, "learning_rate": 8.493598439285619e-08, "logits/chosen": -2.086073637008667, "logits/rejected": -2.0955686569213867, "logps/chosen": -6.537712574005127, "logps/rejected": -3.190506935119629, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.5521758794784546, "rewards/margins": 0.9265649318695068, "rewards/rejected": 0.6256109476089478, "step": 1536 }, { "epoch": 0.83, "learning_rate": 8.491514917129709e-08, "logits/chosen": -2.110637664794922, "logits/rejected": -2.0100345611572266, "logps/chosen": -30.316072463989258, "logps/rejected": -3.270979166030884, "loss": 0.3075, "rewards/accuracies": 1.0, "rewards/chosen": 1.6046804189682007, "rewards/margins": 1.0214437246322632, "rewards/rejected": 0.5832366943359375, "step": 1537 }, { "epoch": 0.83, "learning_rate": 8.489430211030954e-08, "logits/chosen": -2.0784928798675537, "logits/rejected": -2.0428061485290527, "logps/chosen": -6.619935512542725, "logps/rejected": -1.7419979572296143, "loss": 0.4013, "rewards/accuracies": 1.0, "rewards/chosen": 1.463476538658142, "rewards/margins": 0.7057226896286011, "rewards/rejected": 0.757753849029541, "step": 1538 }, { "epoch": 0.83, "learning_rate": 8.487344321696262e-08, "logits/chosen": -2.0436716079711914, "logits/rejected": -2.0525732040405273, "logps/chosen": -1.6517713069915771, "logps/rejected": -2.2790088653564453, "loss": 0.5103, "rewards/accuracies": 1.0, "rewards/chosen": 1.0214489698410034, "rewards/margins": 0.4068829417228699, "rewards/rejected": 0.6145660281181335, "step": 1539 }, { "epoch": 0.83, "learning_rate": 8.485257249832938e-08, "logits/chosen": -2.066521644592285, "logits/rejected": -2.1985020637512207, "logps/chosen": -0.5152305364608765, "logps/rejected": -0.5753833055496216, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.9067859649658203, "rewards/margins": 0.023768723011016846, "rewards/rejected": 0.8830172419548035, "step": 1540 }, { "epoch": 0.83, "learning_rate": 8.483168996148694e-08, "logits/chosen": -2.173581600189209, "logits/rejected": -2.1373345851898193, "logps/chosen": -20.334407806396484, "logps/rejected": -2.82131028175354, "loss": 0.3655, "rewards/accuracies": 1.0, "rewards/chosen": 1.4200619459152222, "rewards/margins": 0.8181231021881104, "rewards/rejected": 0.6019388437271118, "step": 1541 }, { "epoch": 0.83, "learning_rate": 8.481079561351634e-08, "logits/chosen": -2.1072304248809814, "logits/rejected": -2.2550151348114014, "logps/chosen": -0.3735562264919281, "logps/rejected": -0.43684887886047363, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.907323956489563, "rewards/margins": -0.005755007266998291, "rewards/rejected": 0.9130789637565613, "step": 1542 }, { "epoch": 0.83, "learning_rate": 8.47898894615027e-08, "logits/chosen": -2.09128475189209, "logits/rejected": -2.138385057449341, "logps/chosen": -9.316869735717773, "logps/rejected": -8.085719108581543, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 1.2660833597183228, "rewards/margins": 0.431022584438324, "rewards/rejected": 0.8350607752799988, "step": 1543 }, { "epoch": 0.83, "learning_rate": 8.476897151253512e-08, "logits/chosen": -1.985080599784851, "logits/rejected": -1.9975024461746216, "logps/chosen": -2.602456569671631, "logps/rejected": -8.34459114074707, "loss": 0.4517, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465748310089111, "rewards/margins": 0.560511589050293, "rewards/rejected": 0.4860632121562958, "step": 1544 }, { "epoch": 0.83, "learning_rate": 8.47480417737067e-08, "logits/chosen": -2.025116443634033, "logits/rejected": -2.0219881534576416, "logps/chosen": -7.269874095916748, "logps/rejected": -1.8669929504394531, "loss": 0.5797, "rewards/accuracies": 1.0, "rewards/chosen": 0.9909696578979492, "rewards/margins": 0.24142932891845703, "rewards/rejected": 0.7495403289794922, "step": 1545 }, { "epoch": 0.83, "learning_rate": 8.472710025211452e-08, "logits/chosen": -2.033391237258911, "logits/rejected": -2.039260149002075, "logps/chosen": -1.8012216091156006, "logps/rejected": -5.049161911010742, "loss": 0.4719, "rewards/accuracies": 1.0, "rewards/chosen": 1.122176170349121, "rewards/margins": 0.5058088898658752, "rewards/rejected": 0.6163672804832458, "step": 1546 }, { "epoch": 0.83, "learning_rate": 8.470614695485966e-08, "logits/chosen": -1.9162795543670654, "logits/rejected": -1.9194366931915283, "logps/chosen": -1.5421462059020996, "logps/rejected": -2.9848110675811768, "loss": 0.5156, "rewards/accuracies": 1.0, "rewards/chosen": 1.0450286865234375, "rewards/margins": 0.3936259150505066, "rewards/rejected": 0.6514027714729309, "step": 1547 }, { "epoch": 0.83, "learning_rate": 8.468518188904725e-08, "logits/chosen": -2.1697850227355957, "logits/rejected": -2.0368378162384033, "logps/chosen": -38.55278778076172, "logps/rejected": -5.073042392730713, "loss": 0.3561, "rewards/accuracies": 1.0, "rewards/chosen": 1.3466728925704956, "rewards/margins": 0.8492566347122192, "rewards/rejected": 0.49741625785827637, "step": 1548 }, { "epoch": 0.84, "learning_rate": 8.466420506178633e-08, "logits/chosen": -1.985327959060669, "logits/rejected": -1.972961664199829, "logps/chosen": -1.2195457220077515, "logps/rejected": -3.574780225753784, "loss": 0.5444, "rewards/accuracies": 1.0, "rewards/chosen": 0.9868532419204712, "rewards/margins": 0.32359087467193604, "rewards/rejected": 0.6632623672485352, "step": 1549 }, { "epoch": 0.84, "learning_rate": 8.464321648018997e-08, "logits/chosen": -2.1083736419677734, "logits/rejected": -2.292022943496704, "logps/chosen": -3.592395782470703, "logps/rejected": -3.5883066654205322, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.736729621887207, "rewards/margins": 0.012735545635223389, "rewards/rejected": 0.7239940762519836, "step": 1550 }, { "epoch": 0.84, "learning_rate": 8.462221615137521e-08, "logits/chosen": -2.096437692642212, "logits/rejected": -2.0906174182891846, "logps/chosen": -3.282179832458496, "logps/rejected": -3.7135536670684814, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 1.1796103715896606, "rewards/margins": 0.2915186882019043, "rewards/rejected": 0.8880916833877563, "step": 1551 }, { "epoch": 0.84, "learning_rate": 8.460120408246311e-08, "logits/chosen": -2.0854477882385254, "logits/rejected": -2.261542558670044, "logps/chosen": -1.3745827674865723, "logps/rejected": -5.901137351989746, "loss": 0.5912, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900871515274048, "rewards/margins": 0.21556681394577026, "rewards/rejected": 0.7745203375816345, "step": 1552 }, { "epoch": 0.84, "learning_rate": 8.458018028057866e-08, "logits/chosen": -2.037076234817505, "logits/rejected": -2.2812952995300293, "logps/chosen": -1.5483791828155518, "logps/rejected": -4.750624179840088, "loss": 0.6553, "rewards/accuracies": 1.0, "rewards/chosen": 1.0179933309555054, "rewards/margins": 0.07722163200378418, "rewards/rejected": 0.9407716989517212, "step": 1553 }, { "epoch": 0.84, "learning_rate": 8.455914475285084e-08, "logits/chosen": -2.1791579723358154, "logits/rejected": -2.369178056716919, "logps/chosen": -17.082658767700195, "logps/rejected": -17.539588928222656, "loss": 0.6046, "rewards/accuracies": 1.0, "rewards/chosen": 1.0712851285934448, "rewards/margins": 0.18570369482040405, "rewards/rejected": 0.8855814337730408, "step": 1554 }, { "epoch": 0.84, "learning_rate": 8.453809750641263e-08, "logits/chosen": -2.135782241821289, "logits/rejected": -2.1309268474578857, "logps/chosen": -7.486761093139648, "logps/rejected": -4.636264801025391, "loss": 0.4325, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465952157974243, "rewards/margins": 0.6141708493232727, "rewards/rejected": 0.4324243664741516, "step": 1555 }, { "epoch": 0.84, "learning_rate": 8.4517038548401e-08, "logits/chosen": -2.0732555389404297, "logits/rejected": -2.2607157230377197, "logps/chosen": -0.3906225562095642, "logps/rejected": -0.37267324328422546, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8943694233894348, "rewards/margins": 0.018262386322021484, "rewards/rejected": 0.8761070370674133, "step": 1556 }, { "epoch": 0.84, "learning_rate": 8.449596788595681e-08, "logits/chosen": -2.2975010871887207, "logits/rejected": -2.166947364807129, "logps/chosen": -42.513797760009766, "logps/rejected": -1.4668693542480469, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": 1.8356876373291016, "rewards/margins": 0.9077571034431458, "rewards/rejected": 0.9279305338859558, "step": 1557 }, { "epoch": 0.84, "learning_rate": 8.447488552622498e-08, "logits/chosen": -2.049734354019165, "logits/rejected": -2.0526137351989746, "logps/chosen": -5.3720293045043945, "logps/rejected": -3.2382867336273193, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9127587676048279, "rewards/margins": 0.22158217430114746, "rewards/rejected": 0.6911765933036804, "step": 1558 }, { "epoch": 0.84, "learning_rate": 8.445379147635433e-08, "logits/chosen": -2.321413278579712, "logits/rejected": -2.184072732925415, "logps/chosen": -27.501079559326172, "logps/rejected": -1.7392001152038574, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 1.6625823974609375, "rewards/margins": 1.0175237655639648, "rewards/rejected": 0.6450585722923279, "step": 1559 }, { "epoch": 0.84, "learning_rate": 8.443268574349771e-08, "logits/chosen": -2.129821538925171, "logits/rejected": -2.1390674114227295, "logps/chosen": -2.108166456222534, "logps/rejected": -3.5420453548431396, "loss": 0.479, "rewards/accuracies": 1.0, "rewards/chosen": 1.0543394088745117, "rewards/margins": 0.4869807958602905, "rewards/rejected": 0.5673586130142212, "step": 1560 }, { "epoch": 0.84, "learning_rate": 8.441156833481186e-08, "logits/chosen": -2.167344570159912, "logits/rejected": -2.310746908187866, "logps/chosen": -0.6805684566497803, "logps/rejected": -0.8036193251609802, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9239677786827087, "rewards/margins": 0.018275976181030273, "rewards/rejected": 0.9056918025016785, "step": 1561 }, { "epoch": 0.84, "learning_rate": 8.439043925745752e-08, "logits/chosen": -2.0587451457977295, "logits/rejected": -2.2039237022399902, "logps/chosen": -1.0243960618972778, "logps/rejected": -0.9608701467514038, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.7917704582214355, "rewards/margins": 0.023859262466430664, "rewards/rejected": 0.7679111957550049, "step": 1562 }, { "epoch": 0.84, "learning_rate": 8.436929851859938e-08, "logits/chosen": -1.998191475868225, "logits/rejected": -2.1754798889160156, "logps/chosen": -1.4201269149780273, "logps/rejected": -1.3555270433425903, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.8778491020202637, "rewards/margins": -0.00036412477493286133, "rewards/rejected": 0.8782132267951965, "step": 1563 }, { "epoch": 0.84, "learning_rate": 8.43481461254061e-08, "logits/chosen": -2.021928548812866, "logits/rejected": -2.0239264965057373, "logps/chosen": -3.095297336578369, "logps/rejected": -0.8782985210418701, "loss": 0.6463, "rewards/accuracies": 1.0, "rewards/chosen": 1.0480035543441772, "rewards/margins": 0.09591090679168701, "rewards/rejected": 0.9520926475524902, "step": 1564 }, { "epoch": 0.84, "learning_rate": 8.432698208505025e-08, "logits/chosen": -2.044433832168579, "logits/rejected": -2.2665328979492188, "logps/chosen": -1.0721561908721924, "logps/rejected": -1.0876985788345337, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 0.7978038787841797, "rewards/margins": 0.024838745594024658, "rewards/rejected": 0.772965133190155, "step": 1565 }, { "epoch": 0.84, "learning_rate": 8.43058064047084e-08, "logits/chosen": -1.9189372062683105, "logits/rejected": -1.9185104370117188, "logps/chosen": -0.9649975299835205, "logps/rejected": -1.3990027904510498, "loss": 0.7111, "rewards/accuracies": 0.0, "rewards/chosen": 0.832111656665802, "rewards/margins": -0.03568696975708008, "rewards/rejected": 0.8677986264228821, "step": 1566 }, { "epoch": 0.85, "learning_rate": 8.428461909156102e-08, "logits/chosen": -2.2442071437835693, "logits/rejected": -2.192002058029175, "logps/chosen": -14.678495407104492, "logps/rejected": -9.593425750732422, "loss": 0.2745, "rewards/accuracies": 1.0, "rewards/chosen": 1.6748141050338745, "rewards/margins": 1.1522578001022339, "rewards/rejected": 0.5225563049316406, "step": 1567 }, { "epoch": 0.85, "learning_rate": 8.426342015279255e-08, "logits/chosen": -2.0806734561920166, "logits/rejected": -2.281270742416382, "logps/chosen": -8.472746849060059, "logps/rejected": -0.5509775876998901, "loss": 0.6674, "rewards/accuracies": 1.0, "rewards/chosen": 1.0031161308288574, "rewards/margins": 0.05208784341812134, "rewards/rejected": 0.9510282874107361, "step": 1568 }, { "epoch": 0.85, "learning_rate": 8.424220959559139e-08, "logits/chosen": -2.204907178878784, "logits/rejected": -2.065267324447632, "logps/chosen": -41.59858703613281, "logps/rejected": -1.546760082244873, "loss": 0.2801, "rewards/accuracies": 1.0, "rewards/chosen": 2.0995068550109863, "rewards/margins": 1.1293123960494995, "rewards/rejected": 0.9701944589614868, "step": 1569 }, { "epoch": 0.85, "learning_rate": 8.422098742714982e-08, "logits/chosen": -2.0535895824432373, "logits/rejected": -2.0461373329162598, "logps/chosen": -5.198451995849609, "logps/rejected": -3.076767683029175, "loss": 0.548, "rewards/accuracies": 1.0, "rewards/chosen": 1.0279955863952637, "rewards/margins": 0.31506699323654175, "rewards/rejected": 0.7129285931587219, "step": 1570 }, { "epoch": 0.85, "learning_rate": 8.419975365466415e-08, "logits/chosen": -1.9791524410247803, "logits/rejected": -1.9958232641220093, "logps/chosen": -1.9552943706512451, "logps/rejected": -6.302623748779297, "loss": 0.3828, "rewards/accuracies": 1.0, "rewards/chosen": 1.236608862876892, "rewards/margins": 0.7626235485076904, "rewards/rejected": 0.4739852845668793, "step": 1571 }, { "epoch": 0.85, "learning_rate": 8.41785082853345e-08, "logits/chosen": -2.2698686122894287, "logits/rejected": -2.130272150039673, "logps/chosen": -36.1485710144043, "logps/rejected": -3.3859994411468506, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 1.2764934301376343, "rewards/margins": 0.8352184891700745, "rewards/rejected": 0.4412749409675598, "step": 1572 }, { "epoch": 0.85, "learning_rate": 8.415725132636506e-08, "logits/chosen": -2.1642744541168213, "logits/rejected": -2.160606861114502, "logps/chosen": -5.360318660736084, "logps/rejected": -4.378066062927246, "loss": 0.3019, "rewards/accuracies": 1.0, "rewards/chosen": 1.5129890441894531, "rewards/margins": 1.0428760051727295, "rewards/rejected": 0.47011300921440125, "step": 1573 }, { "epoch": 0.85, "learning_rate": 8.413598278496385e-08, "logits/chosen": -2.11232852935791, "logits/rejected": -2.2809770107269287, "logps/chosen": -0.7164546847343445, "logps/rejected": -0.7039675116539001, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.8094386458396912, "rewards/margins": 0.03751230239868164, "rewards/rejected": 0.7719263434410095, "step": 1574 }, { "epoch": 0.85, "learning_rate": 8.411470266834286e-08, "logits/chosen": -2.018575668334961, "logits/rejected": -2.2494633197784424, "logps/chosen": -0.7220787405967712, "logps/rejected": -0.7848242521286011, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.9645870327949524, "rewards/margins": 0.05361497402191162, "rewards/rejected": 0.9109720587730408, "step": 1575 }, { "epoch": 0.85, "learning_rate": 8.409341098371801e-08, "logits/chosen": -2.1033942699432373, "logits/rejected": -2.1026129722595215, "logps/chosen": -5.456270217895508, "logps/rejected": -5.674760341644287, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 1.272868037223816, "rewards/margins": 0.8074785470962524, "rewards/rejected": 0.4653894901275635, "step": 1576 }, { "epoch": 0.85, "learning_rate": 8.407210773830907e-08, "logits/chosen": -2.0439419746398926, "logits/rejected": -2.037569046020508, "logps/chosen": -16.391382217407227, "logps/rejected": -8.587424278259277, "loss": 0.3393, "rewards/accuracies": 1.0, "rewards/chosen": 1.2386702299118042, "rewards/margins": 0.9062578678131104, "rewards/rejected": 0.33241233229637146, "step": 1577 }, { "epoch": 0.85, "learning_rate": 8.405079293933985e-08, "logits/chosen": -2.025850296020508, "logits/rejected": -2.0256471633911133, "logps/chosen": -4.105197906494141, "logps/rejected": -5.689981460571289, "loss": 0.4639, "rewards/accuracies": 1.0, "rewards/chosen": 1.079174280166626, "rewards/margins": 0.5272591710090637, "rewards/rejected": 0.5519151091575623, "step": 1578 }, { "epoch": 0.85, "learning_rate": 8.4029466594038e-08, "logits/chosen": -2.0042879581451416, "logits/rejected": -2.271793842315674, "logps/chosen": -0.6936439275741577, "logps/rejected": -0.7977467179298401, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8004908561706543, "rewards/margins": 0.021138250827789307, "rewards/rejected": 0.779352605342865, "step": 1579 }, { "epoch": 0.85, "learning_rate": 8.400812870963508e-08, "logits/chosen": -1.9669559001922607, "logits/rejected": -2.2731118202209473, "logps/chosen": -2.559654474258423, "logps/rejected": -10.579863548278809, "loss": 0.6349, "rewards/accuracies": 1.0, "rewards/chosen": 0.9139819145202637, "rewards/margins": 0.12006491422653198, "rewards/rejected": 0.7939170002937317, "step": 1580 }, { "epoch": 0.85, "learning_rate": 8.398677929336662e-08, "logits/chosen": -2.033836603164673, "logits/rejected": -2.0398333072662354, "logps/chosen": -1.619418978691101, "logps/rejected": -4.3096442222595215, "loss": 0.4761, "rewards/accuracies": 1.0, "rewards/chosen": 0.8133994936943054, "rewards/margins": 0.4947150945663452, "rewards/rejected": 0.3186843991279602, "step": 1581 }, { "epoch": 0.85, "learning_rate": 8.396541835247197e-08, "logits/chosen": -2.086946964263916, "logits/rejected": -2.0889110565185547, "logps/chosen": -0.6025062203407288, "logps/rejected": -3.3688111305236816, "loss": 0.4994, "rewards/accuracies": 1.0, "rewards/chosen": 0.9564414024353027, "rewards/margins": 0.43428874015808105, "rewards/rejected": 0.5221526622772217, "step": 1582 }, { "epoch": 0.85, "learning_rate": 8.39440458941945e-08, "logits/chosen": -2.018836498260498, "logits/rejected": -2.0348775386810303, "logps/chosen": -2.240629196166992, "logps/rejected": -11.141098976135254, "loss": 0.5513, "rewards/accuracies": 1.0, "rewards/chosen": 1.1476627588272095, "rewards/margins": 0.3070862889289856, "rewards/rejected": 0.8405764698982239, "step": 1583 }, { "epoch": 0.85, "learning_rate": 8.392266192578142e-08, "logits/chosen": -2.110197067260742, "logits/rejected": -2.178438425064087, "logps/chosen": -4.040188312530518, "logps/rejected": -28.293331146240234, "loss": 0.4278, "rewards/accuracies": 1.0, "rewards/chosen": 1.063396692276001, "rewards/margins": 0.6274470090866089, "rewards/rejected": 0.4359497129917145, "step": 1584 }, { "epoch": 0.85, "learning_rate": 8.390126645448382e-08, "logits/chosen": -2.011246919631958, "logits/rejected": -2.0071496963500977, "logps/chosen": -3.7577974796295166, "logps/rejected": -4.612905502319336, "loss": 0.307, "rewards/accuracies": 1.0, "rewards/chosen": 1.4422365427017212, "rewards/margins": 1.0232993364334106, "rewards/rejected": 0.41893720626831055, "step": 1585 }, { "epoch": 0.86, "learning_rate": 8.387985948755672e-08, "logits/chosen": -2.114522933959961, "logits/rejected": -2.294567346572876, "logps/chosen": -1.215681552886963, "logps/rejected": -1.2102859020233154, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.9343925714492798, "rewards/margins": 0.013846337795257568, "rewards/rejected": 0.9205462336540222, "step": 1586 }, { "epoch": 0.86, "learning_rate": 8.385844103225906e-08, "logits/chosen": -2.0541012287139893, "logits/rejected": -2.270286798477173, "logps/chosen": -1.0117628574371338, "logps/rejected": -1.078705072402954, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8691388964653015, "rewards/margins": 0.012306928634643555, "rewards/rejected": 0.856831967830658, "step": 1587 }, { "epoch": 0.86, "learning_rate": 8.383701109585366e-08, "logits/chosen": -2.079580068588257, "logits/rejected": -2.0863659381866455, "logps/chosen": -4.125107765197754, "logps/rejected": -4.69926643371582, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 1.2650989294052124, "rewards/margins": 0.5770177245140076, "rewards/rejected": 0.6880812048912048, "step": 1588 }, { "epoch": 0.86, "learning_rate": 8.381556968560721e-08, "logits/chosen": -2.0971691608428955, "logits/rejected": -2.24782133102417, "logps/chosen": -1.6916793584823608, "logps/rejected": -1.6287018060684204, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.9724608659744263, "rewards/margins": 0.03915214538574219, "rewards/rejected": 0.9333087205886841, "step": 1589 }, { "epoch": 0.86, "learning_rate": 8.379411680879034e-08, "logits/chosen": -2.035623550415039, "logits/rejected": -2.2265162467956543, "logps/chosen": -1.366213083267212, "logps/rejected": -2.381664514541626, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 1.0068206787109375, "rewards/margins": -0.009372115135192871, "rewards/rejected": 1.0161927938461304, "step": 1590 }, { "epoch": 0.86, "learning_rate": 8.377265247267749e-08, "logits/chosen": -2.1593565940856934, "logits/rejected": -2.1635613441467285, "logps/chosen": -3.4285356998443604, "logps/rejected": -5.721688270568848, "loss": 0.3342, "rewards/accuracies": 1.0, "rewards/chosen": 1.5139827728271484, "rewards/margins": 0.9242839813232422, "rewards/rejected": 0.5896987915039062, "step": 1591 }, { "epoch": 0.86, "learning_rate": 8.375117668454706e-08, "logits/chosen": -2.005568027496338, "logits/rejected": -2.311229705810547, "logps/chosen": -8.87055492401123, "logps/rejected": -8.588495254516602, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.49644899368286133, "rewards/margins": 0.009821414947509766, "rewards/rejected": 0.48662757873535156, "step": 1592 }, { "epoch": 0.86, "learning_rate": 8.37296894516813e-08, "logits/chosen": -2.1976637840270996, "logits/rejected": -2.1364519596099854, "logps/chosen": -34.128536224365234, "logps/rejected": -5.096578121185303, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 1.4372093677520752, "rewards/margins": 0.7750887870788574, "rewards/rejected": 0.6621205806732178, "step": 1593 }, { "epoch": 0.86, "learning_rate": 8.370819078136637e-08, "logits/chosen": -2.117157459259033, "logits/rejected": -2.27370548248291, "logps/chosen": -3.7290968894958496, "logps/rejected": -3.5022616386413574, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.8964107632637024, "rewards/margins": 0.005665004253387451, "rewards/rejected": 0.8907457590103149, "step": 1594 }, { "epoch": 0.86, "learning_rate": 8.368668068089227e-08, "logits/chosen": -2.1961240768432617, "logits/rejected": -2.190051555633545, "logps/chosen": -2.1715707778930664, "logps/rejected": -10.70522403717041, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 1.3141701221466064, "rewards/margins": 0.8111060261726379, "rewards/rejected": 0.5030640959739685, "step": 1595 }, { "epoch": 0.86, "learning_rate": 8.366515915755286e-08, "logits/chosen": -2.113520383834839, "logits/rejected": -2.310654640197754, "logps/chosen": -2.1044812202453613, "logps/rejected": -2.3374409675598145, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475975036621094, "rewards/margins": 0.038565993309020996, "rewards/rejected": 0.9090315103530884, "step": 1596 }, { "epoch": 0.86, "learning_rate": 8.364362621864594e-08, "logits/chosen": -2.0216057300567627, "logits/rejected": -2.010499954223633, "logps/chosen": -6.813765525817871, "logps/rejected": -5.509830474853516, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": 1.0633527040481567, "rewards/margins": 0.5227448344230652, "rewards/rejected": 0.5406078696250916, "step": 1597 }, { "epoch": 0.86, "learning_rate": 8.362208187147315e-08, "logits/chosen": -2.107771635055542, "logits/rejected": -2.112131118774414, "logps/chosen": -2.123356580734253, "logps/rejected": -1.6780245304107666, "loss": 0.4736, "rewards/accuracies": 1.0, "rewards/chosen": 1.1272485256195068, "rewards/margins": 0.5012231469154358, "rewards/rejected": 0.626025378704071, "step": 1598 }, { "epoch": 0.86, "learning_rate": 8.360052612333997e-08, "logits/chosen": -2.0015194416046143, "logits/rejected": -1.9914965629577637, "logps/chosen": -6.759843349456787, "logps/rejected": -4.726586818695068, "loss": 0.5474, "rewards/accuracies": 1.0, "rewards/chosen": 1.0913034677505493, "rewards/margins": 0.31639838218688965, "rewards/rejected": 0.7749050855636597, "step": 1599 }, { "epoch": 0.86, "learning_rate": 8.357895898155579e-08, "logits/chosen": -2.169719934463501, "logits/rejected": -2.062481164932251, "logps/chosen": -44.72850036621094, "logps/rejected": -2.302051305770874, "loss": 0.3207, "rewards/accuracies": 1.0, "rewards/chosen": 1.7105499505996704, "rewards/margins": 0.9727768301963806, "rewards/rejected": 0.7377731204032898, "step": 1600 }, { "epoch": 0.86, "learning_rate": 8.355738045343381e-08, "logits/chosen": -2.041231870651245, "logits/rejected": -2.274022340774536, "logps/chosen": -7.841370105743408, "logps/rejected": -5.211808681488037, "loss": 0.7492, "rewards/accuracies": 0.0, "rewards/chosen": 0.9357349276542664, "rewards/margins": -0.10909849405288696, "rewards/rejected": 1.0448334217071533, "step": 1601 }, { "epoch": 0.86, "learning_rate": 8.353579054629115e-08, "logits/chosen": -2.0933725833892822, "logits/rejected": -2.273057699203491, "logps/chosen": -4.265344619750977, "logps/rejected": -4.161562442779541, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.927524983882904, "rewards/margins": 0.0026082992553710938, "rewards/rejected": 0.924916684627533, "step": 1602 }, { "epoch": 0.86, "learning_rate": 8.351418926744877e-08, "logits/chosen": -2.0052340030670166, "logits/rejected": -2.273693561553955, "logps/chosen": -3.108856439590454, "logps/rejected": -5.666676044464111, "loss": 0.7265, "rewards/accuracies": 0.0, "rewards/chosen": 0.8276261687278748, "rewards/margins": -0.06558942794799805, "rewards/rejected": 0.8932155966758728, "step": 1603 }, { "epoch": 0.87, "learning_rate": 8.349257662423148e-08, "logits/chosen": -2.0697782039642334, "logits/rejected": -2.342672109603882, "logps/chosen": -21.57639503479004, "logps/rejected": -19.836200714111328, "loss": 0.5788, "rewards/accuracies": 1.0, "rewards/chosen": 0.06591663509607315, "rewards/margins": 0.24353083968162537, "rewards/rejected": -0.1776142120361328, "step": 1604 }, { "epoch": 0.87, "learning_rate": 8.347095262396792e-08, "logits/chosen": -2.0925889015197754, "logits/rejected": -2.2858009338378906, "logps/chosen": -5.210824489593506, "logps/rejected": -4.031696319580078, "loss": 0.5941, "rewards/accuracies": 1.0, "rewards/chosen": 1.0539801120758057, "rewards/margins": 0.20891469717025757, "rewards/rejected": 0.8450654149055481, "step": 1605 }, { "epoch": 0.87, "learning_rate": 8.344931727399063e-08, "logits/chosen": -2.0847890377044678, "logits/rejected": -2.0804169178009033, "logps/chosen": -9.544964790344238, "logps/rejected": -3.4732584953308105, "loss": 0.3476, "rewards/accuracies": 1.0, "rewards/chosen": 1.4485976696014404, "rewards/margins": 0.8779768943786621, "rewards/rejected": 0.5706207752227783, "step": 1606 }, { "epoch": 0.87, "learning_rate": 8.342767058163593e-08, "logits/chosen": -1.986242413520813, "logits/rejected": -2.247972011566162, "logps/chosen": -1.5627598762512207, "logps/rejected": -1.6455559730529785, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.848890483379364, "rewards/margins": 0.015513002872467041, "rewards/rejected": 0.833377480506897, "step": 1607 }, { "epoch": 0.87, "learning_rate": 8.340601255424407e-08, "logits/chosen": -1.9948042631149292, "logits/rejected": -2.232163667678833, "logps/chosen": -1.593218207359314, "logps/rejected": -1.8274474143981934, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.9432719349861145, "rewards/margins": 0.017489314079284668, "rewards/rejected": 0.9257826209068298, "step": 1608 }, { "epoch": 0.87, "learning_rate": 8.33843431991591e-08, "logits/chosen": -2.126971960067749, "logits/rejected": -2.2654643058776855, "logps/chosen": -3.6129374504089355, "logps/rejected": -3.8295700550079346, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8562425971031189, "rewards/margins": 0.020766496658325195, "rewards/rejected": 0.8354761004447937, "step": 1609 }, { "epoch": 0.87, "learning_rate": 8.336266252372888e-08, "logits/chosen": -2.0397143363952637, "logits/rejected": -2.264846086502075, "logps/chosen": -0.6939017176628113, "logps/rejected": -0.7240216732025146, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.9620881080627441, "rewards/margins": 0.013847172260284424, "rewards/rejected": 0.9482409358024597, "step": 1610 }, { "epoch": 0.87, "learning_rate": 8.334097053530517e-08, "logits/chosen": -2.1240506172180176, "logits/rejected": -2.307781219482422, "logps/chosen": -4.059412002563477, "logps/rejected": -4.8220109939575195, "loss": 0.7975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9810218214988708, "rewards/margins": -0.19880789518356323, "rewards/rejected": 1.179829716682434, "step": 1611 }, { "epoch": 0.87, "learning_rate": 8.331926724124354e-08, "logits/chosen": -2.052926540374756, "logits/rejected": -2.32734751701355, "logps/chosen": -0.5875087976455688, "logps/rejected": -0.6756287217140198, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.7669252753257751, "rewards/margins": 0.01988774538040161, "rewards/rejected": 0.7470375299453735, "step": 1612 }, { "epoch": 0.87, "learning_rate": 8.329755264890336e-08, "logits/chosen": -2.144174337387085, "logits/rejected": -2.31483793258667, "logps/chosen": -0.7919929027557373, "logps/rejected": -0.8428332209587097, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 1.0235298871994019, "rewards/margins": 0.008318305015563965, "rewards/rejected": 1.015211582183838, "step": 1613 }, { "epoch": 0.87, "learning_rate": 8.32758267656479e-08, "logits/chosen": -2.03617262840271, "logits/rejected": -2.2514805793762207, "logps/chosen": -1.7063806056976318, "logps/rejected": -1.7058203220367432, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.7867864966392517, "rewards/margins": -0.010760366916656494, "rewards/rejected": 0.7975468635559082, "step": 1614 }, { "epoch": 0.87, "learning_rate": 8.325408959884421e-08, "logits/chosen": -1.9323245286941528, "logits/rejected": -2.2156574726104736, "logps/chosen": -0.5816388726234436, "logps/rejected": -0.555832028388977, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.8099220395088196, "rewards/margins": 0.035798728466033936, "rewards/rejected": 0.7741233110427856, "step": 1615 }, { "epoch": 0.87, "learning_rate": 8.323234115586316e-08, "logits/chosen": -2.1940693855285645, "logits/rejected": -2.0267233848571777, "logps/chosen": -44.091087341308594, "logps/rejected": -2.675165891647339, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 1.9611462354660034, "rewards/margins": 1.3881964683532715, "rewards/rejected": 0.5729498267173767, "step": 1616 }, { "epoch": 0.87, "learning_rate": 8.321058144407949e-08, "logits/chosen": -2.085665464401245, "logits/rejected": -2.0868351459503174, "logps/chosen": -1.9468679428100586, "logps/rejected": -1.5533784627914429, "loss": 0.6509, "rewards/accuracies": 1.0, "rewards/chosen": 1.0844149589538574, "rewards/margins": 0.08642232418060303, "rewards/rejected": 0.9979926347732544, "step": 1617 }, { "epoch": 0.87, "learning_rate": 8.31888104708717e-08, "logits/chosen": -2.1396706104278564, "logits/rejected": -2.134171962738037, "logps/chosen": -2.956369638442993, "logps/rejected": -2.7936418056488037, "loss": 0.5493, "rewards/accuracies": 1.0, "rewards/chosen": 1.09158456325531, "rewards/margins": 0.3119642734527588, "rewards/rejected": 0.7796202898025513, "step": 1618 }, { "epoch": 0.87, "learning_rate": 8.316702824362216e-08, "logits/chosen": -2.0051820278167725, "logits/rejected": -2.01143741607666, "logps/chosen": -1.9164774417877197, "logps/rejected": -4.641687393188477, "loss": 0.4156, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764286518096924, "rewards/margins": 0.6631232500076294, "rewards/rejected": 0.4133053719997406, "step": 1619 }, { "epoch": 0.87, "learning_rate": 8.314523476971703e-08, "logits/chosen": -2.0636634826660156, "logits/rejected": -2.2769415378570557, "logps/chosen": -0.5262861847877502, "logps/rejected": -0.5838369131088257, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.933037281036377, "rewards/margins": 0.016118407249450684, "rewards/rejected": 0.9169188737869263, "step": 1620 }, { "epoch": 0.87, "learning_rate": 8.31234300565463e-08, "logits/chosen": -2.1367931365966797, "logits/rejected": -2.0591530799865723, "logps/chosen": -40.43101501464844, "logps/rejected": -12.123222351074219, "loss": 0.3025, "rewards/accuracies": 1.0, "rewards/chosen": 1.6983692646026611, "rewards/margins": 1.040428876876831, "rewards/rejected": 0.6579403281211853, "step": 1621 }, { "epoch": 0.87, "learning_rate": 8.310161411150375e-08, "logits/chosen": -2.0455198287963867, "logits/rejected": -2.282834768295288, "logps/chosen": -0.3925339877605438, "logps/rejected": -0.4517531394958496, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8443255424499512, "rewards/margins": 0.01517397165298462, "rewards/rejected": 0.8291515707969666, "step": 1622 }, { "epoch": 0.88, "learning_rate": 8.307978694198699e-08, "logits/chosen": -1.9896421432495117, "logits/rejected": -1.9915908575057983, "logps/chosen": -4.654997825622559, "logps/rejected": -1.377792477607727, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 1.2034754753112793, "rewards/margins": -0.00959920883178711, "rewards/rejected": 1.2130746841430664, "step": 1623 }, { "epoch": 0.88, "learning_rate": 8.305794855539741e-08, "logits/chosen": -2.148723602294922, "logits/rejected": -2.3096699714660645, "logps/chosen": -5.964078903198242, "logps/rejected": -3.664862632751465, "loss": 0.6584, "rewards/accuracies": 1.0, "rewards/chosen": 0.6377622485160828, "rewards/margins": 0.07075715065002441, "rewards/rejected": 0.5670050978660583, "step": 1624 }, { "epoch": 0.88, "learning_rate": 8.303609895914026e-08, "logits/chosen": -2.01053786277771, "logits/rejected": -2.0112032890319824, "logps/chosen": -2.5933079719543457, "logps/rejected": -5.103888034820557, "loss": 0.3594, "rewards/accuracies": 1.0, "rewards/chosen": 1.4468010663986206, "rewards/margins": 0.8384072184562683, "rewards/rejected": 0.6083938479423523, "step": 1625 }, { "epoch": 0.88, "learning_rate": 8.30142381606245e-08, "logits/chosen": -2.151477813720703, "logits/rejected": -2.2783424854278564, "logps/chosen": -5.359306335449219, "logps/rejected": -1.5687718391418457, "loss": 0.728, "rewards/accuracies": 0.0, "rewards/chosen": 0.9480113983154297, "rewards/margins": -0.06845176219940186, "rewards/rejected": 1.0164631605148315, "step": 1626 }, { "epoch": 0.88, "learning_rate": 8.299236616726297e-08, "logits/chosen": -2.032461166381836, "logits/rejected": -2.2393383979797363, "logps/chosen": -0.33954447507858276, "logps/rejected": -0.33604907989501953, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9358271956443787, "rewards/margins": 0.008485615253448486, "rewards/rejected": 0.9273415803909302, "step": 1627 }, { "epoch": 0.88, "learning_rate": 8.297048298647226e-08, "logits/chosen": -2.148468494415283, "logits/rejected": -2.2873435020446777, "logps/chosen": -0.9506284594535828, "logps/rejected": -3.486029624938965, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 1.0107343196868896, "rewards/margins": 0.051691651344299316, "rewards/rejected": 0.9590426683425903, "step": 1628 }, { "epoch": 0.88, "learning_rate": 8.29485886256728e-08, "logits/chosen": -2.0718801021575928, "logits/rejected": -2.2594547271728516, "logps/chosen": -0.41014397144317627, "logps/rejected": -0.3739897310733795, "loss": 0.6799, "rewards/accuracies": 1.0, "rewards/chosen": 0.7502390146255493, "rewards/margins": 0.026642560958862305, "rewards/rejected": 0.723596453666687, "step": 1629 }, { "epoch": 0.88, "learning_rate": 8.292668309228874e-08, "logits/chosen": -2.143287181854248, "logits/rejected": -2.3251047134399414, "logps/chosen": -2.90954327583313, "logps/rejected": -2.7761120796203613, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8126932382583618, "rewards/margins": 0.010974347591400146, "rewards/rejected": 0.8017188906669617, "step": 1630 }, { "epoch": 0.88, "learning_rate": 8.290476639374808e-08, "logits/chosen": -2.0710647106170654, "logits/rejected": -2.2391462326049805, "logps/chosen": -3.12607479095459, "logps/rejected": -3.2016096115112305, "loss": 0.6691, "rewards/accuracies": 1.0, "rewards/chosen": 0.6521775126457214, "rewards/margins": 0.04873901605606079, "rewards/rejected": 0.6034384965896606, "step": 1631 }, { "epoch": 0.88, "learning_rate": 8.28828385374826e-08, "logits/chosen": -2.105189561843872, "logits/rejected": -2.215080738067627, "logps/chosen": -1.311332106590271, "logps/rejected": -1.4190149307250977, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.8872662782669067, "rewards/margins": 0.037921369075775146, "rewards/rejected": 0.8493449091911316, "step": 1632 }, { "epoch": 0.88, "learning_rate": 8.286089953092782e-08, "logits/chosen": -2.0807864665985107, "logits/rejected": -2.090083599090576, "logps/chosen": -20.156314849853516, "logps/rejected": -6.288952827453613, "loss": 0.2729, "rewards/accuracies": 1.0, "rewards/chosen": 1.8869606256484985, "rewards/margins": 1.1593034267425537, "rewards/rejected": 0.7276572585105896, "step": 1633 }, { "epoch": 0.88, "learning_rate": 8.28389493815231e-08, "logits/chosen": -2.028214693069458, "logits/rejected": -2.023264169692993, "logps/chosen": -4.0314202308654785, "logps/rejected": -4.488067626953125, "loss": 0.3942, "rewards/accuracies": 1.0, "rewards/chosen": 1.3173573017120361, "rewards/margins": 0.7272470593452454, "rewards/rejected": 0.5901102423667908, "step": 1634 }, { "epoch": 0.88, "learning_rate": 8.28169880967115e-08, "logits/chosen": -2.0455029010772705, "logits/rejected": -2.292841672897339, "logps/chosen": -0.7709256410598755, "logps/rejected": -0.658637285232544, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.9337694048881531, "rewards/margins": 0.038687944412231445, "rewards/rejected": 0.8950814604759216, "step": 1635 }, { "epoch": 0.88, "learning_rate": 8.279501568393994e-08, "logits/chosen": -2.1456832885742188, "logits/rejected": -2.2630341053009033, "logps/chosen": -5.231295108795166, "logps/rejected": -0.6686026453971863, "loss": 0.7297, "rewards/accuracies": 0.0, "rewards/chosen": 0.8873912692070007, "rewards/margins": -0.07181006669998169, "rewards/rejected": 0.9592013359069824, "step": 1636 }, { "epoch": 0.88, "learning_rate": 8.277303215065907e-08, "logits/chosen": -2.0016376972198486, "logits/rejected": -2.0082266330718994, "logps/chosen": -4.187596321105957, "logps/rejected": -5.1433424949646, "loss": 0.4751, "rewards/accuracies": 1.0, "rewards/chosen": 0.8964213728904724, "rewards/margins": 0.49728819727897644, "rewards/rejected": 0.39913317561149597, "step": 1637 }, { "epoch": 0.88, "learning_rate": 8.275103750432332e-08, "logits/chosen": -1.932861328125, "logits/rejected": -1.938485860824585, "logps/chosen": -2.0575151443481445, "logps/rejected": -3.3863863945007324, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 0.8747334480285645, "rewards/margins": 0.40377870202064514, "rewards/rejected": 0.4709547460079193, "step": 1638 }, { "epoch": 0.88, "learning_rate": 8.27290317523909e-08, "logits/chosen": -2.085292100906372, "logits/rejected": -2.0892269611358643, "logps/chosen": -3.782708168029785, "logps/rejected": -3.512662649154663, "loss": 0.5382, "rewards/accuracies": 1.0, "rewards/chosen": 1.1148607730865479, "rewards/margins": 0.338504433631897, "rewards/rejected": 0.7763563394546509, "step": 1639 }, { "epoch": 0.88, "learning_rate": 8.270701490232375e-08, "logits/chosen": -2.1155974864959717, "logits/rejected": -2.114753484725952, "logps/chosen": -6.442215919494629, "logps/rejected": -3.332933187484741, "loss": 0.4003, "rewards/accuracies": 1.0, "rewards/chosen": 1.3597098588943481, "rewards/margins": 0.7087439894676208, "rewards/rejected": 0.6509658694267273, "step": 1640 }, { "epoch": 0.89, "learning_rate": 8.268498696158759e-08, "logits/chosen": -2.0789642333984375, "logits/rejected": -2.2697982788085938, "logps/chosen": -10.239919662475586, "logps/rejected": -10.629354476928711, "loss": 0.7323, "rewards/accuracies": 0.0, "rewards/chosen": 0.8564632534980774, "rewards/margins": -0.0768728256225586, "rewards/rejected": 0.933336079120636, "step": 1641 }, { "epoch": 0.89, "learning_rate": 8.266294793765194e-08, "logits/chosen": -2.037648916244507, "logits/rejected": -2.0296566486358643, "logps/chosen": -6.564261436462402, "logps/rejected": -6.292081356048584, "loss": 0.3996, "rewards/accuracies": 1.0, "rewards/chosen": 1.3204175233840942, "rewards/margins": 0.7108930945396423, "rewards/rejected": 0.6095244288444519, "step": 1642 }, { "epoch": 0.89, "learning_rate": 8.264089783799e-08, "logits/chosen": -2.003227472305298, "logits/rejected": -2.235102891921997, "logps/chosen": -0.48674437403678894, "logps/rejected": -0.5551660060882568, "loss": 0.7001, "rewards/accuracies": 0.0, "rewards/chosen": 0.8595812916755676, "rewards/margins": -0.01395481824874878, "rewards/rejected": 0.8735361099243164, "step": 1643 }, { "epoch": 0.89, "learning_rate": 8.261883667007881e-08, "logits/chosen": -2.050649642944336, "logits/rejected": -2.2822513580322266, "logps/chosen": -0.634148120880127, "logps/rejected": -0.6495034694671631, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134241580963135, "rewards/margins": 0.0033425092697143555, "rewards/rejected": 1.0100816488265991, "step": 1644 }, { "epoch": 0.89, "learning_rate": 8.259676444139909e-08, "logits/chosen": -2.127047300338745, "logits/rejected": -2.1975533962249756, "logps/chosen": -0.685187578201294, "logps/rejected": -0.6886345744132996, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9497737884521484, "rewards/margins": 0.008233904838562012, "rewards/rejected": 0.9415398836135864, "step": 1645 }, { "epoch": 0.89, "learning_rate": 8.257468115943537e-08, "logits/chosen": -2.1407408714294434, "logits/rejected": -2.2617578506469727, "logps/chosen": -1.3921840190887451, "logps/rejected": -1.4913139343261719, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.6628279685974121, "rewards/margins": 0.01191091537475586, "rewards/rejected": 0.6509170532226562, "step": 1646 }, { "epoch": 0.89, "learning_rate": 8.25525868316759e-08, "logits/chosen": -2.19555926322937, "logits/rejected": -2.198219060897827, "logps/chosen": -0.8624207973480225, "logps/rejected": -3.559706926345825, "loss": 0.5256, "rewards/accuracies": 1.0, "rewards/chosen": 1.0193512439727783, "rewards/margins": 0.3689749240875244, "rewards/rejected": 0.6503763198852539, "step": 1647 }, { "epoch": 0.89, "learning_rate": 8.253048146561267e-08, "logits/chosen": -2.141848087310791, "logits/rejected": -2.1353237628936768, "logps/chosen": -4.6905927658081055, "logps/rejected": -3.7709527015686035, "loss": 0.6629, "rewards/accuracies": 1.0, "rewards/chosen": 0.758362889289856, "rewards/margins": 0.06134992837905884, "rewards/rejected": 0.6970129609107971, "step": 1648 }, { "epoch": 0.89, "learning_rate": 8.250836506874141e-08, "logits/chosen": -1.9539670944213867, "logits/rejected": -1.9596331119537354, "logps/chosen": -3.150075674057007, "logps/rejected": -4.1925249099731445, "loss": 0.4614, "rewards/accuracies": 1.0, "rewards/chosen": 1.1778322458267212, "rewards/margins": 0.5338957905769348, "rewards/rejected": 0.6439364552497864, "step": 1649 }, { "epoch": 0.89, "learning_rate": 8.248623764856161e-08, "logits/chosen": -2.083228826522827, "logits/rejected": -2.0900490283966064, "logps/chosen": -2.330247640609741, "logps/rejected": -3.474478244781494, "loss": 0.4908, "rewards/accuracies": 1.0, "rewards/chosen": 1.1057872772216797, "rewards/margins": 0.4562215209007263, "rewards/rejected": 0.6495657563209534, "step": 1650 }, { "epoch": 0.89, "learning_rate": 8.246409921257652e-08, "logits/chosen": -2.050755023956299, "logits/rejected": -2.2266340255737305, "logps/chosen": -4.470071315765381, "logps/rejected": -3.5354537963867188, "loss": 0.7531, "rewards/accuracies": 0.0, "rewards/chosen": 0.7969521284103394, "rewards/margins": -0.11646664142608643, "rewards/rejected": 0.9134187698364258, "step": 1651 }, { "epoch": 0.89, "learning_rate": 8.244194976829303e-08, "logits/chosen": -2.1552488803863525, "logits/rejected": -2.1582491397857666, "logps/chosen": -2.2443647384643555, "logps/rejected": -3.956629991531372, "loss": 0.5017, "rewards/accuracies": 1.0, "rewards/chosen": 1.11896812915802, "rewards/margins": 0.4284214973449707, "rewards/rejected": 0.6905466318130493, "step": 1652 }, { "epoch": 0.89, "learning_rate": 8.241978932322189e-08, "logits/chosen": -1.9264112710952759, "logits/rejected": -2.2393813133239746, "logps/chosen": -0.4031246304512024, "logps/rejected": -0.43718093633651733, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.9647673964500427, "rewards/margins": 0.010435998439788818, "rewards/rejected": 0.9543313980102539, "step": 1653 }, { "epoch": 0.89, "learning_rate": 8.239761788487747e-08, "logits/chosen": -2.0899782180786133, "logits/rejected": -2.088895320892334, "logps/chosen": -2.1484766006469727, "logps/rejected": -6.510437488555908, "loss": 0.4796, "rewards/accuracies": 1.0, "rewards/chosen": 0.9078158736228943, "rewards/margins": 0.4855058491230011, "rewards/rejected": 0.4223100244998932, "step": 1654 }, { "epoch": 0.89, "learning_rate": 8.237543546077794e-08, "logits/chosen": -1.958781361579895, "logits/rejected": -2.2146286964416504, "logps/chosen": -1.7582939863204956, "logps/rejected": -3.864626169204712, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9952625632286072, "rewards/margins": 0.013386547565460205, "rewards/rejected": 0.981876015663147, "step": 1655 }, { "epoch": 0.89, "learning_rate": 8.235324205844515e-08, "logits/chosen": -2.0635874271392822, "logits/rejected": -2.0641794204711914, "logps/chosen": -1.2821016311645508, "logps/rejected": -1.8534730672836304, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.7486724257469177, "rewards/margins": 0.067538321018219, "rewards/rejected": 0.6811341047286987, "step": 1656 }, { "epoch": 0.89, "learning_rate": 8.233103768540475e-08, "logits/chosen": -2.0678329467773438, "logits/rejected": -2.3038625717163086, "logps/chosen": -1.010027527809143, "logps/rejected": -0.9560645222663879, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8139246106147766, "rewards/margins": 0.00990229845046997, "rewards/rejected": 0.8040223121643066, "step": 1657 }, { "epoch": 0.89, "learning_rate": 8.230882234918599e-08, "logits/chosen": -2.138335704803467, "logits/rejected": -2.1106343269348145, "logps/chosen": -30.36888885498047, "logps/rejected": -4.045719623565674, "loss": 0.7985, "rewards/accuracies": 0.0, "rewards/chosen": 0.9766502380371094, "rewards/margins": -0.2006673812866211, "rewards/rejected": 1.1773176193237305, "step": 1658 }, { "epoch": 0.89, "learning_rate": 8.228659605732191e-08, "logits/chosen": -2.0235238075256348, "logits/rejected": -2.089743137359619, "logps/chosen": -4.555437088012695, "logps/rejected": -23.2923583984375, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 1.4799368381500244, "rewards/margins": 1.3196134567260742, "rewards/rejected": 0.1603233367204666, "step": 1659 }, { "epoch": 0.9, "learning_rate": 8.226435881734926e-08, "logits/chosen": -1.9532113075256348, "logits/rejected": -1.952370047569275, "logps/chosen": -1.6616765260696411, "logps/rejected": -2.0603110790252686, "loss": 0.6161, "rewards/accuracies": 1.0, "rewards/chosen": 1.0834760665893555, "rewards/margins": 0.16059386730194092, "rewards/rejected": 0.9228821992874146, "step": 1660 }, { "epoch": 0.9, "learning_rate": 8.224211063680852e-08, "logits/chosen": -2.1457667350769043, "logits/rejected": -2.137098789215088, "logps/chosen": -8.562397956848145, "logps/rejected": -0.7225927710533142, "loss": 0.5039, "rewards/accuracies": 1.0, "rewards/chosen": 1.394432783126831, "rewards/margins": 0.4227731227874756, "rewards/rejected": 0.9716596603393555, "step": 1661 }, { "epoch": 0.9, "learning_rate": 8.221985152324385e-08, "logits/chosen": -2.0244407653808594, "logits/rejected": -2.2422244548797607, "logps/chosen": -1.1365611553192139, "logps/rejected": -1.0617083311080933, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.8954612612724304, "rewards/margins": 0.03332996368408203, "rewards/rejected": 0.8621312975883484, "step": 1662 }, { "epoch": 0.9, "learning_rate": 8.219758148420311e-08, "logits/chosen": -2.115487575531006, "logits/rejected": -2.2441701889038086, "logps/chosen": -1.3739951848983765, "logps/rejected": -1.4354045391082764, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.807729184627533, "rewards/margins": -0.0012710094451904297, "rewards/rejected": 0.8090001940727234, "step": 1663 }, { "epoch": 0.9, "learning_rate": 8.217530052723789e-08, "logits/chosen": -2.162930965423584, "logits/rejected": -2.1669018268585205, "logps/chosen": -0.7781190872192383, "logps/rejected": -4.932704925537109, "loss": 0.445, "rewards/accuracies": 1.0, "rewards/chosen": 1.0046018362045288, "rewards/margins": 0.578860878944397, "rewards/rejected": 0.42574092745780945, "step": 1664 }, { "epoch": 0.9, "learning_rate": 8.215300865990346e-08, "logits/chosen": -2.0271949768066406, "logits/rejected": -2.0347952842712402, "logps/chosen": -1.7900526523590088, "logps/rejected": -2.561925172805786, "loss": 0.5151, "rewards/accuracies": 1.0, "rewards/chosen": 1.0020427703857422, "rewards/margins": 0.3948999047279358, "rewards/rejected": 0.6071428656578064, "step": 1665 }, { "epoch": 0.9, "learning_rate": 8.213070588975881e-08, "logits/chosen": -1.970200538635254, "logits/rejected": -2.2162837982177734, "logps/chosen": -0.7085700035095215, "logps/rejected": -0.7090848088264465, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9357625842094421, "rewards/margins": -0.0005100369453430176, "rewards/rejected": 0.9362726211547852, "step": 1666 }, { "epoch": 0.9, "learning_rate": 8.210839222436663e-08, "logits/chosen": -2.05456805229187, "logits/rejected": -2.255261182785034, "logps/chosen": -0.8797547817230225, "logps/rejected": -0.9288127422332764, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.7279438376426697, "rewards/margins": 0.029909133911132812, "rewards/rejected": 0.6980347037315369, "step": 1667 }, { "epoch": 0.9, "learning_rate": 8.20860676712933e-08, "logits/chosen": -2.021860361099243, "logits/rejected": -2.229916572570801, "logps/chosen": -2.712354898452759, "logps/rejected": -2.8508777618408203, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.7867472767829895, "rewards/margins": 0.019422829151153564, "rewards/rejected": 0.7673244476318359, "step": 1668 }, { "epoch": 0.9, "learning_rate": 8.206373223810883e-08, "logits/chosen": -2.0223171710968018, "logits/rejected": -2.3015215396881104, "logps/chosen": -6.94651985168457, "logps/rejected": -8.098472595214844, "loss": 0.622, "rewards/accuracies": 1.0, "rewards/chosen": 0.4729325473308563, "rewards/margins": 0.14764404296875, "rewards/rejected": 0.3252885043621063, "step": 1669 }, { "epoch": 0.9, "learning_rate": 8.204138593238702e-08, "logits/chosen": -2.052863597869873, "logits/rejected": -2.3059825897216797, "logps/chosen": -0.6566655039787292, "logps/rejected": -5.568070888519287, "loss": 0.5517, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750466346740723, "rewards/margins": 0.3063150644302368, "rewards/rejected": 0.6687315702438354, "step": 1670 }, { "epoch": 0.9, "learning_rate": 8.201902876170532e-08, "logits/chosen": -2.061638355255127, "logits/rejected": -2.0662293434143066, "logps/chosen": -2.964090585708618, "logps/rejected": -4.738020420074463, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/chosen": 0.9946309924125671, "rewards/margins": 0.4658321738243103, "rewards/rejected": 0.5287988185882568, "step": 1671 }, { "epoch": 0.9, "learning_rate": 8.199666073364482e-08, "logits/chosen": -2.021923303604126, "logits/rejected": -2.019632577896118, "logps/chosen": -1.4086962938308716, "logps/rejected": -4.168987274169922, "loss": 0.4828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8982154130935669, "rewards/margins": 0.4769652485847473, "rewards/rejected": 0.4212501645088196, "step": 1672 }, { "epoch": 0.9, "learning_rate": 8.197428185579036e-08, "logits/chosen": -2.0508012771606445, "logits/rejected": -2.2643096446990967, "logps/chosen": -2.2608699798583984, "logps/rejected": -2.322852849960327, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.6153521537780762, "rewards/margins": 0.0335121750831604, "rewards/rejected": 0.5818399786949158, "step": 1673 }, { "epoch": 0.9, "learning_rate": 8.195189213573041e-08, "logits/chosen": -2.000002145767212, "logits/rejected": -2.2572247982025146, "logps/chosen": -0.38621997833251953, "logps/rejected": -0.39644506573677063, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.9747600555419922, "rewards/margins": 0.03832060098648071, "rewards/rejected": 0.9364394545555115, "step": 1674 }, { "epoch": 0.9, "learning_rate": 8.192949158105712e-08, "logits/chosen": -2.096144676208496, "logits/rejected": -2.123805522918701, "logps/chosen": -4.690921783447266, "logps/rejected": -10.867196083068848, "loss": 0.3592, "rewards/accuracies": 1.0, "rewards/chosen": 1.406779170036316, "rewards/margins": 0.8388658165931702, "rewards/rejected": 0.5679133534431458, "step": 1675 }, { "epoch": 0.9, "learning_rate": 8.190708019936634e-08, "logits/chosen": -2.0701591968536377, "logits/rejected": -2.0686168670654297, "logps/chosen": -0.876548707485199, "logps/rejected": -3.3398725986480713, "loss": 0.5059, "rewards/accuracies": 1.0, "rewards/chosen": 0.8976049423217773, "rewards/margins": 0.41783684492111206, "rewards/rejected": 0.4797680974006653, "step": 1676 }, { "epoch": 0.9, "learning_rate": 8.188465799825759e-08, "logits/chosen": -2.055133581161499, "logits/rejected": -2.0543782711029053, "logps/chosen": -1.4323604106903076, "logps/rejected": -4.947846412658691, "loss": 0.4624, "rewards/accuracies": 1.0, "rewards/chosen": 1.0535911321640015, "rewards/margins": 0.5313336849212646, "rewards/rejected": 0.5222574472427368, "step": 1677 }, { "epoch": 0.91, "learning_rate": 8.186222498533404e-08, "logits/chosen": -1.9282923936843872, "logits/rejected": -2.2484240531921387, "logps/chosen": -5.973751544952393, "logps/rejected": -6.306876182556152, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.4318053722381592, "rewards/margins": 0.012917846441268921, "rewards/rejected": 0.41888752579689026, "step": 1678 }, { "epoch": 0.91, "learning_rate": 8.183978116820252e-08, "logits/chosen": -2.093132734298706, "logits/rejected": -2.0914878845214844, "logps/chosen": -6.034902572631836, "logps/rejected": -3.012334108352661, "loss": 0.4549, "rewards/accuracies": 1.0, "rewards/chosen": 1.2662200927734375, "rewards/margins": 0.5516947507858276, "rewards/rejected": 0.7145253419876099, "step": 1679 }, { "epoch": 0.91, "learning_rate": 8.181732655447354e-08, "logits/chosen": -2.094071865081787, "logits/rejected": -2.017599105834961, "logps/chosen": -26.79468536376953, "logps/rejected": -2.60575795173645, "loss": 0.39, "rewards/accuracies": 1.0, "rewards/chosen": 1.3584133386611938, "rewards/margins": 0.740301787853241, "rewards/rejected": 0.6181115508079529, "step": 1680 }, { "epoch": 0.91, "learning_rate": 8.179486115176128e-08, "logits/chosen": -2.0453176498413086, "logits/rejected": -2.0452585220336914, "logps/chosen": -2.069260835647583, "logps/rejected": -0.698180615901947, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.8801781535148621, "rewards/margins": -0.007233917713165283, "rewards/rejected": 0.8874120712280273, "step": 1681 }, { "epoch": 0.91, "learning_rate": 8.177238496768356e-08, "logits/chosen": -2.07810115814209, "logits/rejected": -2.2366840839385986, "logps/chosen": -0.43116652965545654, "logps/rejected": -0.4823981523513794, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.8701114058494568, "rewards/margins": 0.03182852268218994, "rewards/rejected": 0.8382828831672668, "step": 1682 }, { "epoch": 0.91, "learning_rate": 8.174989800986185e-08, "logits/chosen": -2.0137250423431396, "logits/rejected": -2.019235372543335, "logps/chosen": -2.025745391845703, "logps/rejected": -4.330531597137451, "loss": 0.4499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9673144221305847, "rewards/margins": 0.5653479099273682, "rewards/rejected": 0.40196654200553894, "step": 1683 }, { "epoch": 0.91, "learning_rate": 8.172740028592129e-08, "logits/chosen": -1.979788064956665, "logits/rejected": -1.980942964553833, "logps/chosen": -6.532615661621094, "logps/rejected": -0.4072297513484955, "loss": 0.4731, "rewards/accuracies": 1.0, "rewards/chosen": 1.187670111656189, "rewards/margins": 0.5025517344474792, "rewards/rejected": 0.6851183772087097, "step": 1684 }, { "epoch": 0.91, "learning_rate": 8.170489180349067e-08, "logits/chosen": -2.1560394763946533, "logits/rejected": -2.318067789077759, "logps/chosen": -13.066330909729004, "logps/rejected": -14.853580474853516, "loss": 0.8526, "rewards/accuracies": 0.0, "rewards/chosen": 0.7032219767570496, "rewards/margins": -0.29694145917892456, "rewards/rejected": 1.0001634359359741, "step": 1685 }, { "epoch": 0.91, "learning_rate": 8.168237257020243e-08, "logits/chosen": -2.0493533611297607, "logits/rejected": -2.244062900543213, "logps/chosen": -0.6906567811965942, "logps/rejected": -0.669014573097229, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.8604463934898376, "rewards/margins": 0.00965815782546997, "rewards/rejected": 0.8507882356643677, "step": 1686 }, { "epoch": 0.91, "learning_rate": 8.165984259369265e-08, "logits/chosen": -2.072136878967285, "logits/rejected": -2.3345072269439697, "logps/chosen": -7.7246856689453125, "logps/rejected": -16.78978157043457, "loss": 0.5767, "rewards/accuracies": 1.0, "rewards/chosen": 0.9051006436347961, "rewards/margins": 0.24817007780075073, "rewards/rejected": 0.6569305658340454, "step": 1687 }, { "epoch": 0.91, "learning_rate": 8.163730188160104e-08, "logits/chosen": -1.976776361465454, "logits/rejected": -1.9787514209747314, "logps/chosen": -1.0626695156097412, "logps/rejected": -5.711943626403809, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.8027300238609314, "rewards/margins": 0.32807478308677673, "rewards/rejected": 0.47465524077415466, "step": 1688 }, { "epoch": 0.91, "learning_rate": 8.161475044157098e-08, "logits/chosen": -2.047285556793213, "logits/rejected": -2.0478289127349854, "logps/chosen": -0.7336485385894775, "logps/rejected": -3.8069818019866943, "loss": 0.5255, "rewards/accuracies": 1.0, "rewards/chosen": 0.9434353113174438, "rewards/margins": 0.36905890703201294, "rewards/rejected": 0.5743764042854309, "step": 1689 }, { "epoch": 0.91, "learning_rate": 8.159218828124943e-08, "logits/chosen": -1.9855668544769287, "logits/rejected": -1.9939849376678467, "logps/chosen": -3.825389862060547, "logps/rejected": -1.35203218460083, "loss": 0.6623, "rewards/accuracies": 1.0, "rewards/chosen": 1.0312267541885376, "rewards/margins": 0.06275314092636108, "rewards/rejected": 0.9684736132621765, "step": 1690 }, { "epoch": 0.91, "learning_rate": 8.156961540828707e-08, "logits/chosen": -2.0291128158569336, "logits/rejected": -2.273883819580078, "logps/chosen": -2.362558126449585, "logps/rejected": -6.4985175132751465, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": 1.1658740043640137, "rewards/margins": 0.17650634050369263, "rewards/rejected": 0.989367663860321, "step": 1691 }, { "epoch": 0.91, "learning_rate": 8.154703183033816e-08, "logits/chosen": -2.1540005207061768, "logits/rejected": -2.3146350383758545, "logps/chosen": -0.6886966824531555, "logps/rejected": -0.672076940536499, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9492669105529785, "rewards/margins": 0.023360788822174072, "rewards/rejected": 0.9259061217308044, "step": 1692 }, { "epoch": 0.91, "learning_rate": 8.152443755506058e-08, "logits/chosen": -2.014770269393921, "logits/rejected": -2.0214271545410156, "logps/chosen": -2.310283660888672, "logps/rejected": -4.073662757873535, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 1.078232765197754, "rewards/margins": 0.5143237709999084, "rewards/rejected": 0.5639089941978455, "step": 1693 }, { "epoch": 0.91, "learning_rate": 8.150183259011587e-08, "logits/chosen": -2.0723965167999268, "logits/rejected": -2.0992307662963867, "logps/chosen": -22.698692321777344, "logps/rejected": -13.860753059387207, "loss": 0.2003, "rewards/accuracies": 1.0, "rewards/chosen": 1.9287430047988892, "rewards/margins": 1.5062921047210693, "rewards/rejected": 0.4224509298801422, "step": 1694 }, { "epoch": 0.91, "learning_rate": 8.147921694316918e-08, "logits/chosen": -2.0427777767181396, "logits/rejected": -2.0353336334228516, "logps/chosen": -5.324679851531982, "logps/rejected": -3.5894951820373535, "loss": 0.447, "rewards/accuracies": 1.0, "rewards/chosen": 1.6780582666397095, "rewards/margins": 0.5733267068862915, "rewards/rejected": 1.104731559753418, "step": 1695 }, { "epoch": 0.91, "learning_rate": 8.145659062188928e-08, "logits/chosen": -1.9644142389297485, "logits/rejected": -1.9719592332839966, "logps/chosen": -5.067536354064941, "logps/rejected": -3.714589834213257, "loss": 0.2773, "rewards/accuracies": 1.0, "rewards/chosen": 1.677896499633789, "rewards/margins": 1.140885591506958, "rewards/rejected": 0.5370108485221863, "step": 1696 }, { "epoch": 0.92, "learning_rate": 8.143395363394854e-08, "logits/chosen": -2.0874176025390625, "logits/rejected": -2.3190836906433105, "logps/chosen": -1.4244545698165894, "logps/rejected": -1.3040683269500732, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9967698454856873, "rewards/margins": 0.049746930599212646, "rewards/rejected": 0.9470229148864746, "step": 1697 }, { "epoch": 0.92, "learning_rate": 8.141130598702302e-08, "logits/chosen": -2.129917621612549, "logits/rejected": -2.2627902030944824, "logps/chosen": -1.8813085556030273, "logps/rejected": -2.0659985542297363, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.95347660779953, "rewards/margins": 0.02564859390258789, "rewards/rejected": 0.9278280138969421, "step": 1698 }, { "epoch": 0.92, "learning_rate": 8.13886476887923e-08, "logits/chosen": -2.0411038398742676, "logits/rejected": -2.27665114402771, "logps/chosen": -1.080316424369812, "logps/rejected": -1.1088521480560303, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.7991450428962708, "rewards/margins": 0.013281047344207764, "rewards/rejected": 0.785863995552063, "step": 1699 }, { "epoch": 0.92, "learning_rate": 8.136597874693961e-08, "logits/chosen": -2.105518341064453, "logits/rejected": -2.2805912494659424, "logps/chosen": -4.689865589141846, "logps/rejected": -4.473884582519531, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9211465716362, "rewards/margins": -0.00310671329498291, "rewards/rejected": 0.9242532849311829, "step": 1700 }, { "epoch": 0.92, "learning_rate": 8.134329916915183e-08, "logits/chosen": -2.0396058559417725, "logits/rejected": -2.2608604431152344, "logps/chosen": -0.42827123403549194, "logps/rejected": -0.4383286237716675, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 0.9624174237251282, "rewards/margins": 0.045610129833221436, "rewards/rejected": 0.9168072938919067, "step": 1701 }, { "epoch": 0.92, "learning_rate": 8.132060896311939e-08, "logits/chosen": -2.0263686180114746, "logits/rejected": -2.0325047969818115, "logps/chosen": -2.2025856971740723, "logps/rejected": -3.346771717071533, "loss": 0.5045, "rewards/accuracies": 1.0, "rewards/chosen": 1.0565407276153564, "rewards/margins": 0.4212350845336914, "rewards/rejected": 0.635305643081665, "step": 1702 }, { "epoch": 0.92, "learning_rate": 8.129790813653637e-08, "logits/chosen": -2.0440173149108887, "logits/rejected": -2.0369927883148193, "logps/chosen": -9.815267562866211, "logps/rejected": -3.2994511127471924, "loss": 0.3953, "rewards/accuracies": 1.0, "rewards/chosen": 1.3829466104507446, "rewards/margins": 0.7239012718200684, "rewards/rejected": 0.6590453386306763, "step": 1703 }, { "epoch": 0.92, "learning_rate": 8.127519669710037e-08, "logits/chosen": -2.096447467803955, "logits/rejected": -2.103708505630493, "logps/chosen": -2.087285041809082, "logps/rejected": -2.683915853500366, "loss": 0.4813, "rewards/accuracies": 1.0, "rewards/chosen": 1.1748292446136475, "rewards/margins": 0.4808865189552307, "rewards/rejected": 0.6939427256584167, "step": 1704 }, { "epoch": 0.92, "learning_rate": 8.12524746525127e-08, "logits/chosen": -2.1088991165161133, "logits/rejected": -2.2937231063842773, "logps/chosen": -3.890829086303711, "logps/rejected": -3.813957452774048, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.754317581653595, "rewards/margins": 0.002140223979949951, "rewards/rejected": 0.752177357673645, "step": 1705 }, { "epoch": 0.92, "learning_rate": 8.122974201047818e-08, "logits/chosen": -2.1756999492645264, "logits/rejected": -2.27583646774292, "logps/chosen": -0.5082435011863708, "logps/rejected": -0.5402793884277344, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8140655755996704, "rewards/margins": 0.007838249206542969, "rewards/rejected": 0.8062273263931274, "step": 1706 }, { "epoch": 0.92, "learning_rate": 8.120699877870527e-08, "logits/chosen": -1.9473778009414673, "logits/rejected": -2.257711172103882, "logps/chosen": -2.805542469024658, "logps/rejected": -1.6562861204147339, "loss": 0.7699, "rewards/accuracies": 0.0, "rewards/chosen": 0.6527013182640076, "rewards/margins": -0.14805245399475098, "rewards/rejected": 0.8007537722587585, "step": 1707 }, { "epoch": 0.92, "learning_rate": 8.118424496490599e-08, "logits/chosen": -2.070570230484009, "logits/rejected": -2.062598466873169, "logps/chosen": -6.499416828155518, "logps/rejected": -3.250047206878662, "loss": 0.4781, "rewards/accuracies": 1.0, "rewards/chosen": 1.2130895853042603, "rewards/margins": 0.4894147515296936, "rewards/rejected": 0.7236748337745667, "step": 1708 }, { "epoch": 0.92, "learning_rate": 8.116148057679598e-08, "logits/chosen": -1.9481898546218872, "logits/rejected": -1.9587544202804565, "logps/chosen": -6.876374244689941, "logps/rejected": -5.326092720031738, "loss": 0.364, "rewards/accuracies": 1.0, "rewards/chosen": 1.4163964986801147, "rewards/margins": 0.8229248523712158, "rewards/rejected": 0.5934716463088989, "step": 1709 }, { "epoch": 0.92, "learning_rate": 8.113870562209444e-08, "logits/chosen": -2.0848774909973145, "logits/rejected": -2.0950469970703125, "logps/chosen": -1.204962968826294, "logps/rejected": -11.265778541564941, "loss": 0.6327, "rewards/accuracies": 1.0, "rewards/chosen": 0.9569768309593201, "rewards/margins": 0.12468892335891724, "rewards/rejected": 0.8322879076004028, "step": 1710 }, { "epoch": 0.92, "learning_rate": 8.111592010852417e-08, "logits/chosen": -1.926806926727295, "logits/rejected": -1.9107816219329834, "logps/chosen": -8.94001293182373, "logps/rejected": -1.2362627983093262, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 1.2497018575668335, "rewards/margins": 0.19114398956298828, "rewards/rejected": 1.0585578680038452, "step": 1711 }, { "epoch": 0.92, "learning_rate": 8.109312404381155e-08, "logits/chosen": -2.0086417198181152, "logits/rejected": -2.0162909030914307, "logps/chosen": -1.859543800354004, "logps/rejected": -2.6178884506225586, "loss": 0.4347, "rewards/accuracies": 1.0, "rewards/chosen": 1.2491081953048706, "rewards/margins": 0.607775866985321, "rewards/rejected": 0.6413323283195496, "step": 1712 }, { "epoch": 0.92, "learning_rate": 8.10703174356865e-08, "logits/chosen": -1.9941681623458862, "logits/rejected": -2.3097267150878906, "logps/chosen": -0.8675856590270996, "logps/rejected": -0.9253818392753601, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683603405952454, "rewards/margins": 0.032621681690216064, "rewards/rejected": 0.8357386589050293, "step": 1713 }, { "epoch": 0.92, "learning_rate": 8.104750029188256e-08, "logits/chosen": -2.0970542430877686, "logits/rejected": -2.0945885181427, "logps/chosen": -3.794185161590576, "logps/rejected": -1.9763516187667847, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": 1.4713932275772095, "rewards/margins": 0.8066730499267578, "rewards/rejected": 0.6647201776504517, "step": 1714 }, { "epoch": 0.93, "learning_rate": 8.102467262013686e-08, "logits/chosen": -2.052795648574829, "logits/rejected": -2.0517725944519043, "logps/chosen": -1.7340052127838135, "logps/rejected": -4.253348350524902, "loss": 0.4885, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570911169052124, "rewards/margins": 0.4623212218284607, "rewards/rejected": 0.5947698950767517, "step": 1715 }, { "epoch": 0.93, "learning_rate": 8.100183442819002e-08, "logits/chosen": -2.1794393062591553, "logits/rejected": -2.1771388053894043, "logps/chosen": -7.999037265777588, "logps/rejected": -0.678800106048584, "loss": 0.6139, "rewards/accuracies": 1.0, "rewards/chosen": 0.863381564617157, "rewards/margins": 0.1653742790222168, "rewards/rejected": 0.6980072855949402, "step": 1716 }, { "epoch": 0.93, "learning_rate": 8.09789857237863e-08, "logits/chosen": -1.9446089267730713, "logits/rejected": -1.9456866979599, "logps/chosen": -1.7976984977722168, "logps/rejected": -0.9402986764907837, "loss": 0.6298, "rewards/accuracies": 1.0, "rewards/chosen": 1.059370994567871, "rewards/margins": 0.13090306520462036, "rewards/rejected": 0.9284679293632507, "step": 1717 }, { "epoch": 0.93, "learning_rate": 8.095612651467349e-08, "logits/chosen": -2.07271409034729, "logits/rejected": -2.069063901901245, "logps/chosen": -6.353235721588135, "logps/rejected": -4.40691614151001, "loss": 0.3468, "rewards/accuracies": 1.0, "rewards/chosen": 1.5846842527389526, "rewards/margins": 0.8805924654006958, "rewards/rejected": 0.7040917873382568, "step": 1718 }, { "epoch": 0.93, "learning_rate": 8.093325680860297e-08, "logits/chosen": -2.0306849479675293, "logits/rejected": -2.260864019393921, "logps/chosen": -0.45275065302848816, "logps/rejected": -0.44928765296936035, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8797644972801208, "rewards/margins": 0.021306335926055908, "rewards/rejected": 0.8584581613540649, "step": 1719 }, { "epoch": 0.93, "learning_rate": 8.091037661332965e-08, "logits/chosen": -1.9507859945297241, "logits/rejected": -1.9090301990509033, "logps/chosen": -15.484827995300293, "logps/rejected": -8.301061630249023, "loss": 0.3874, "rewards/accuracies": 1.0, "rewards/chosen": 1.165074110031128, "rewards/margins": 0.748405933380127, "rewards/rejected": 0.4166681468486786, "step": 1720 }, { "epoch": 0.93, "learning_rate": 8.0887485936612e-08, "logits/chosen": -1.9990025758743286, "logits/rejected": -2.0065553188323975, "logps/chosen": -8.360244750976562, "logps/rejected": -1.7315666675567627, "loss": 0.6239, "rewards/accuracies": 1.0, "rewards/chosen": 1.2072422504425049, "rewards/margins": 0.14357054233551025, "rewards/rejected": 1.0636717081069946, "step": 1721 }, { "epoch": 0.93, "learning_rate": 8.086458478621207e-08, "logits/chosen": -2.0409295558929443, "logits/rejected": -2.034350633621216, "logps/chosen": -8.431974411010742, "logps/rejected": -7.482441425323486, "loss": 0.461, "rewards/accuracies": 1.0, "rewards/chosen": 1.0514860153198242, "rewards/margins": 0.5349488854408264, "rewards/rejected": 0.5165371298789978, "step": 1722 }, { "epoch": 0.93, "learning_rate": 8.084167316989543e-08, "logits/chosen": -2.0802972316741943, "logits/rejected": -2.2696080207824707, "logps/chosen": -0.7728774547576904, "logps/rejected": -0.8037079572677612, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9793516397476196, "rewards/margins": 0.009685337543487549, "rewards/rejected": 0.9696663022041321, "step": 1723 }, { "epoch": 0.93, "learning_rate": 8.081875109543122e-08, "logits/chosen": -2.0211803913116455, "logits/rejected": -2.0308077335357666, "logps/chosen": -8.627156257629395, "logps/rejected": -3.1334080696105957, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 1.502886176109314, "rewards/margins": 0.7292704582214355, "rewards/rejected": 0.7736157178878784, "step": 1724 }, { "epoch": 0.93, "learning_rate": 8.079581857059212e-08, "logits/chosen": -2.1217122077941895, "logits/rejected": -2.2318429946899414, "logps/chosen": -0.5982494354248047, "logps/rejected": -0.573570728302002, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9740926623344421, "rewards/margins": 0.032186269760131836, "rewards/rejected": 0.9419063925743103, "step": 1725 }, { "epoch": 0.93, "learning_rate": 8.077287560315436e-08, "logits/chosen": -1.9982472658157349, "logits/rejected": -2.0064589977264404, "logps/chosen": -1.8001396656036377, "logps/rejected": -2.710885763168335, "loss": 0.52, "rewards/accuracies": 1.0, "rewards/chosen": 1.0466910600662231, "rewards/margins": 0.3827570080757141, "rewards/rejected": 0.663934051990509, "step": 1726 }, { "epoch": 0.93, "learning_rate": 8.074992220089769e-08, "logits/chosen": -1.995091438293457, "logits/rejected": -1.9959487915039062, "logps/chosen": -0.9130694270133972, "logps/rejected": -2.8933136463165283, "loss": 0.5652, "rewards/accuracies": 1.0, "rewards/chosen": 0.9652944803237915, "rewards/margins": 0.27470821142196655, "rewards/rejected": 0.690586268901825, "step": 1727 }, { "epoch": 0.93, "learning_rate": 8.07269583716054e-08, "logits/chosen": -2.0943074226379395, "logits/rejected": -2.3445892333984375, "logps/chosen": -2.7327239513397217, "logps/rejected": -2.652207136154175, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.6231839060783386, "rewards/margins": 0.017324209213256836, "rewards/rejected": 0.6058596968650818, "step": 1728 }, { "epoch": 0.93, "learning_rate": 8.070398412306436e-08, "logits/chosen": -2.04473876953125, "logits/rejected": -2.040182590484619, "logps/chosen": -6.906972885131836, "logps/rejected": -0.7018710970878601, "loss": 0.31, "rewards/accuracies": 1.0, "rewards/chosen": 1.9435402154922485, "rewards/margins": 1.0123052597045898, "rewards/rejected": 0.9312350153923035, "step": 1729 }, { "epoch": 0.93, "learning_rate": 8.068099946306492e-08, "logits/chosen": -2.1207938194274902, "logits/rejected": -2.2962822914123535, "logps/chosen": -1.861309289932251, "logps/rejected": -1.841217279434204, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.9443908929824829, "rewards/margins": -0.010585129261016846, "rewards/rejected": 0.9549760222434998, "step": 1730 }, { "epoch": 0.93, "learning_rate": 8.0658004399401e-08, "logits/chosen": -2.0161728858947754, "logits/rejected": -2.2865190505981445, "logps/chosen": -0.4315674901008606, "logps/rejected": -0.4551874101161957, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.740402340888977, "rewards/margins": -0.0016170740127563477, "rewards/rejected": 0.7420194149017334, "step": 1731 }, { "epoch": 0.93, "learning_rate": 8.063499893987002e-08, "logits/chosen": -1.919165015220642, "logits/rejected": -2.248342752456665, "logps/chosen": -4.650753021240234, "logps/rejected": -4.489364147186279, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.6454556584358215, "rewards/margins": 0.022359251976013184, "rewards/rejected": 0.6230964064598083, "step": 1732 }, { "epoch": 0.93, "learning_rate": 8.061198309227292e-08, "logits/chosen": -2.0219597816467285, "logits/rejected": -2.0289463996887207, "logps/chosen": -1.7464457750320435, "logps/rejected": -2.2364118099212646, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 1.0511326789855957, "rewards/margins": 0.42417699098587036, "rewards/rejected": 0.6269556879997253, "step": 1733 }, { "epoch": 0.94, "learning_rate": 8.05889568644142e-08, "logits/chosen": -2.114924192428589, "logits/rejected": -2.2565150260925293, "logps/chosen": -3.528425455093384, "logps/rejected": -3.6416025161743164, "loss": 0.6755, "rewards/accuracies": 1.0, "rewards/chosen": 0.8745282292366028, "rewards/margins": 0.03554534912109375, "rewards/rejected": 0.838982880115509, "step": 1734 }, { "epoch": 0.94, "learning_rate": 8.056592026410185e-08, "logits/chosen": -2.0500571727752686, "logits/rejected": -2.24220609664917, "logps/chosen": -0.44353196024894714, "logps/rejected": -0.47766873240470886, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7454807162284851, "rewards/margins": 0.012455761432647705, "rewards/rejected": 0.7330249547958374, "step": 1735 }, { "epoch": 0.94, "learning_rate": 8.054287329914738e-08, "logits/chosen": -2.024259090423584, "logits/rejected": -2.2852237224578857, "logps/chosen": -5.093570232391357, "logps/rejected": -4.781044006347656, "loss": 0.707, "rewards/accuracies": 0.0, "rewards/chosen": 0.47010692954063416, "rewards/margins": -0.027595192193984985, "rewards/rejected": 0.49770212173461914, "step": 1736 }, { "epoch": 0.94, "learning_rate": 8.051981597736581e-08, "logits/chosen": -1.9994819164276123, "logits/rejected": -1.9996720552444458, "logps/chosen": -2.100989818572998, "logps/rejected": -2.104102849960327, "loss": 0.5659, "rewards/accuracies": 1.0, "rewards/chosen": 1.218836784362793, "rewards/margins": 0.27310895919799805, "rewards/rejected": 0.9457278251647949, "step": 1737 }, { "epoch": 0.94, "learning_rate": 8.04967483065757e-08, "logits/chosen": -2.105470895767212, "logits/rejected": -2.217470407485962, "logps/chosen": -1.7528289556503296, "logps/rejected": -4.800010681152344, "loss": 0.6394, "rewards/accuracies": 1.0, "rewards/chosen": 0.8881136178970337, "rewards/margins": 0.11061745882034302, "rewards/rejected": 0.7774961590766907, "step": 1738 }, { "epoch": 0.94, "learning_rate": 8.047367029459908e-08, "logits/chosen": -2.02724027633667, "logits/rejected": -2.190579652786255, "logps/chosen": -5.985611915588379, "logps/rejected": -1.1720744371414185, "loss": 0.8207, "rewards/accuracies": 0.0, "rewards/chosen": 0.6150021553039551, "rewards/margins": -0.24069559574127197, "rewards/rejected": 0.855697751045227, "step": 1739 }, { "epoch": 0.94, "learning_rate": 8.045058194926152e-08, "logits/chosen": -1.964066982269287, "logits/rejected": -2.2257254123687744, "logps/chosen": -10.308094024658203, "logps/rejected": -9.954360961914062, "loss": 0.704, "rewards/accuracies": 0.0, "rewards/chosen": 0.32106420397758484, "rewards/margins": -0.021544158458709717, "rewards/rejected": 0.34260836243629456, "step": 1740 }, { "epoch": 0.94, "learning_rate": 8.042748327839208e-08, "logits/chosen": -2.051513433456421, "logits/rejected": -2.269940137863159, "logps/chosen": -1.3154219388961792, "logps/rejected": -1.439288854598999, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.942999541759491, "rewards/margins": 0.010325133800506592, "rewards/rejected": 0.9326744079589844, "step": 1741 }, { "epoch": 0.94, "learning_rate": 8.040437428982332e-08, "logits/chosen": -2.1108808517456055, "logits/rejected": -2.111412286758423, "logps/chosen": -0.516494870185852, "logps/rejected": -3.90095591545105, "loss": 0.4652, "rewards/accuracies": 1.0, "rewards/chosen": 1.0280786752700806, "rewards/margins": 0.5236853957176208, "rewards/rejected": 0.5043932795524597, "step": 1742 }, { "epoch": 0.94, "learning_rate": 8.038125499139128e-08, "logits/chosen": -2.071958065032959, "logits/rejected": -2.1689929962158203, "logps/chosen": -1.6926084756851196, "logps/rejected": -26.538558959960938, "loss": 0.274, "rewards/accuracies": 1.0, "rewards/chosen": 1.1037944555282593, "rewards/margins": 1.1543264389038086, "rewards/rejected": -0.050531961023807526, "step": 1743 }, { "epoch": 0.94, "learning_rate": 8.035812539093556e-08, "logits/chosen": -2.035710096359253, "logits/rejected": -2.2689297199249268, "logps/chosen": -0.9864454865455627, "logps/rejected": -0.9849883317947388, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.8948696255683899, "rewards/margins": 0.03105628490447998, "rewards/rejected": 0.8638133406639099, "step": 1744 }, { "epoch": 0.94, "learning_rate": 8.033498549629917e-08, "logits/chosen": -2.0683538913726807, "logits/rejected": -2.0681028366088867, "logps/chosen": -1.4114007949829102, "logps/rejected": -2.1521317958831787, "loss": 0.6481, "rewards/accuracies": 1.0, "rewards/chosen": 1.06217360496521, "rewards/margins": 0.09226459264755249, "rewards/rejected": 0.9699090123176575, "step": 1745 }, { "epoch": 0.94, "learning_rate": 8.031183531532868e-08, "logits/chosen": -2.1937367916107178, "logits/rejected": -2.1117422580718994, "logps/chosen": -26.429248809814453, "logps/rejected": -2.0722413063049316, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": 1.7403595447540283, "rewards/margins": 1.062448263168335, "rewards/rejected": 0.6779113411903381, "step": 1746 }, { "epoch": 0.94, "learning_rate": 8.02886748558741e-08, "logits/chosen": -2.2069733142852783, "logits/rejected": -2.0401926040649414, "logps/chosen": -56.25196075439453, "logps/rejected": -12.081334114074707, "loss": 0.3392, "rewards/accuracies": 1.0, "rewards/chosen": 1.7811698913574219, "rewards/margins": 0.9067316651344299, "rewards/rejected": 0.8744382262229919, "step": 1747 }, { "epoch": 0.94, "learning_rate": 8.026550412578892e-08, "logits/chosen": -2.1063895225524902, "logits/rejected": -2.195664167404175, "logps/chosen": -2.4093639850616455, "logps/rejected": -2.4312469959259033, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9345019459724426, "rewards/margins": 0.023407578468322754, "rewards/rejected": 0.9110943675041199, "step": 1748 }, { "epoch": 0.94, "learning_rate": 8.02423231329302e-08, "logits/chosen": -2.0845181941986084, "logits/rejected": -2.086212158203125, "logps/chosen": -1.52058744430542, "logps/rejected": -4.476083278656006, "loss": 0.4705, "rewards/accuracies": 1.0, "rewards/chosen": 0.9975898861885071, "rewards/margins": 0.5094204545021057, "rewards/rejected": 0.48816943168640137, "step": 1749 }, { "epoch": 0.94, "learning_rate": 8.021913188515836e-08, "logits/chosen": -1.9825061559677124, "logits/rejected": -1.986738920211792, "logps/chosen": -1.998218059539795, "logps/rejected": -2.5477283000946045, "loss": 0.534, "rewards/accuracies": 1.0, "rewards/chosen": 1.0042561292648315, "rewards/margins": 0.3484112620353699, "rewards/rejected": 0.6558448672294617, "step": 1750 }, { "epoch": 0.94, "learning_rate": 8.019593039033738e-08, "logits/chosen": -2.038470506668091, "logits/rejected": -2.2236037254333496, "logps/chosen": -0.31253379583358765, "logps/rejected": -0.3681115508079529, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9545871019363403, "rewards/margins": 0.028965115547180176, "rewards/rejected": 0.9256219863891602, "step": 1751 }, { "epoch": 0.94, "learning_rate": 8.017271865633468e-08, "logits/chosen": -2.0294089317321777, "logits/rejected": -2.2523281574249268, "logps/chosen": -1.0125809907913208, "logps/rejected": -1.125433325767517, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.788415253162384, "rewards/margins": 0.0302237868309021, "rewards/rejected": 0.7581914663314819, "step": 1752 }, { "epoch": 0.95, "learning_rate": 8.014949669102116e-08, "logits/chosen": -2.0482187271118164, "logits/rejected": -2.0489985942840576, "logps/chosen": -2.3175747394561768, "logps/rejected": -1.639809250831604, "loss": 0.6225, "rewards/accuracies": 1.0, "rewards/chosen": 1.1191214323043823, "rewards/margins": 0.14667600393295288, "rewards/rejected": 0.9724454283714294, "step": 1753 }, { "epoch": 0.95, "learning_rate": 8.01262645022712e-08, "logits/chosen": -2.045799493789673, "logits/rejected": -2.2942392826080322, "logps/chosen": -2.958228588104248, "logps/rejected": -4.349955081939697, "loss": 0.6149, "rewards/accuracies": 1.0, "rewards/chosen": 0.5581333041191101, "rewards/margins": 0.1631326973438263, "rewards/rejected": 0.3950006067752838, "step": 1754 }, { "epoch": 0.95, "learning_rate": 8.010302209796263e-08, "logits/chosen": -2.182483673095703, "logits/rejected": -2.2558064460754395, "logps/chosen": -4.501677513122559, "logps/rejected": -26.72024154663086, "loss": 0.3967, "rewards/accuracies": 1.0, "rewards/chosen": 1.0726617574691772, "rewards/margins": 0.7197622060775757, "rewards/rejected": 0.35289955139160156, "step": 1755 }, { "epoch": 0.95, "learning_rate": 8.007976948597674e-08, "logits/chosen": -2.109071969985962, "logits/rejected": -1.9380966424942017, "logps/chosen": -41.449729919433594, "logps/rejected": -2.6725847721099854, "loss": 0.3458, "rewards/accuracies": 1.0, "rewards/chosen": 1.5781711339950562, "rewards/margins": 0.8841335773468018, "rewards/rejected": 0.6940375566482544, "step": 1756 }, { "epoch": 0.95, "learning_rate": 8.00565066741983e-08, "logits/chosen": -2.1938436031341553, "logits/rejected": -2.1997478008270264, "logps/chosen": -1.9308298826217651, "logps/rejected": -2.947641611099243, "loss": 0.5268, "rewards/accuracies": 1.0, "rewards/chosen": 0.9627255797386169, "rewards/margins": 0.3660430312156677, "rewards/rejected": 0.5966825485229492, "step": 1757 }, { "epoch": 0.95, "learning_rate": 8.003323367051554e-08, "logits/chosen": -2.084333896636963, "logits/rejected": -2.0878279209136963, "logps/chosen": -6.799038410186768, "logps/rejected": -8.649808883666992, "loss": 0.2999, "rewards/accuracies": 1.0, "rewards/chosen": 1.5056018829345703, "rewards/margins": 1.0507400035858154, "rewards/rejected": 0.4548618495464325, "step": 1758 }, { "epoch": 0.95, "learning_rate": 8.000995048282013e-08, "logits/chosen": -1.9987720251083374, "logits/rejected": -2.203204870223999, "logps/chosen": -2.0717549324035645, "logps/rejected": -2.134504795074463, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9294120669364929, "rewards/margins": 0.0186120867729187, "rewards/rejected": 0.9107999801635742, "step": 1759 }, { "epoch": 0.95, "learning_rate": 7.99866571190072e-08, "logits/chosen": -2.01220703125, "logits/rejected": -2.237926483154297, "logps/chosen": -9.539690017700195, "logps/rejected": -6.338276386260986, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.804717481136322, "rewards/margins": 0.036760568618774414, "rewards/rejected": 0.7679569125175476, "step": 1760 }, { "epoch": 0.95, "learning_rate": 7.996335358697533e-08, "logits/chosen": -2.0455522537231445, "logits/rejected": -2.105325698852539, "logps/chosen": -9.577447891235352, "logps/rejected": -7.911116600036621, "loss": 0.5682, "rewards/accuracies": 1.0, "rewards/chosen": 1.246410608291626, "rewards/margins": 0.26786166429519653, "rewards/rejected": 0.9785489439964294, "step": 1761 }, { "epoch": 0.95, "learning_rate": 7.994003989462655e-08, "logits/chosen": -2.1291794776916504, "logits/rejected": -2.3017737865448, "logps/chosen": -0.5434249639511108, "logps/rejected": -0.5343851447105408, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8470829129219055, "rewards/margins": 0.01914989948272705, "rewards/rejected": 0.8279330134391785, "step": 1762 }, { "epoch": 0.95, "learning_rate": 7.99167160498663e-08, "logits/chosen": -2.0310142040252686, "logits/rejected": -2.280132532119751, "logps/chosen": -2.486553192138672, "logps/rejected": -2.4034461975097656, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 1.175479531288147, "rewards/margins": 0.04142618179321289, "rewards/rejected": 1.134053349494934, "step": 1763 }, { "epoch": 0.95, "learning_rate": 7.989338206060356e-08, "logits/chosen": -2.145946502685547, "logits/rejected": -2.145984649658203, "logps/chosen": -3.375136375427246, "logps/rejected": -3.2366857528686523, "loss": 0.2973, "rewards/accuracies": 1.0, "rewards/chosen": 1.651811957359314, "rewards/margins": 1.060784101486206, "rewards/rejected": 0.5910279154777527, "step": 1764 }, { "epoch": 0.95, "learning_rate": 7.987003793475065e-08, "logits/chosen": -2.0012428760528564, "logits/rejected": -1.997856855392456, "logps/chosen": -3.1750810146331787, "logps/rejected": -4.994535446166992, "loss": 0.5838, "rewards/accuracies": 1.0, "rewards/chosen": 1.065468430519104, "rewards/margins": 0.2320665717124939, "rewards/rejected": 0.8334018588066101, "step": 1765 }, { "epoch": 0.95, "learning_rate": 7.984668368022335e-08, "logits/chosen": -2.1553008556365967, "logits/rejected": -2.117093801498413, "logps/chosen": -24.50440788269043, "logps/rejected": -3.8142576217651367, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 1.3856878280639648, "rewards/margins": 0.8007381558418274, "rewards/rejected": 0.5849496722221375, "step": 1766 }, { "epoch": 0.95, "learning_rate": 7.982331930494092e-08, "logits/chosen": -2.034980297088623, "logits/rejected": -2.2933638095855713, "logps/chosen": -0.8150393962860107, "logps/rejected": -0.8592243194580078, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9149218797683716, "rewards/margins": 0.016274631023406982, "rewards/rejected": 0.8986472487449646, "step": 1767 }, { "epoch": 0.95, "learning_rate": 7.979994481682599e-08, "logits/chosen": -2.1975302696228027, "logits/rejected": -2.2491469383239746, "logps/chosen": -19.937244415283203, "logps/rejected": -10.071704864501953, "loss": 0.659, "rewards/accuracies": 1.0, "rewards/chosen": 0.8724908828735352, "rewards/margins": 0.06958425045013428, "rewards/rejected": 0.8029066324234009, "step": 1768 }, { "epoch": 0.95, "learning_rate": 7.977656022380467e-08, "logits/chosen": -2.1565096378326416, "logits/rejected": -2.0704634189605713, "logps/chosen": -24.298452377319336, "logps/rejected": -3.132949113845825, "loss": 0.2165, "rewards/accuracies": 1.0, "rewards/chosen": 1.9670069217681885, "rewards/margins": 1.4200631380081177, "rewards/rejected": 0.5469437837600708, "step": 1769 }, { "epoch": 0.95, "learning_rate": 7.975316553380647e-08, "logits/chosen": -1.9675557613372803, "logits/rejected": -1.9684028625488281, "logps/chosen": -4.727350234985352, "logps/rejected": -3.328458547592163, "loss": 0.2925, "rewards/accuracies": 1.0, "rewards/chosen": 1.6339645385742188, "rewards/margins": 1.079591155052185, "rewards/rejected": 0.5543733835220337, "step": 1770 }, { "epoch": 0.96, "learning_rate": 7.972976075476431e-08, "logits/chosen": -2.097594738006592, "logits/rejected": -2.1656582355499268, "logps/chosen": -17.179458618164062, "logps/rejected": -12.68536376953125, "loss": 0.553, "rewards/accuracies": 1.0, "rewards/chosen": 1.2831534147262573, "rewards/margins": 0.3031923770904541, "rewards/rejected": 0.9799610376358032, "step": 1771 }, { "epoch": 0.96, "learning_rate": 7.97063458946146e-08, "logits/chosen": -1.9816642999649048, "logits/rejected": -1.987471580505371, "logps/chosen": -2.1191513538360596, "logps/rejected": -2.724712610244751, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 1.0375783443450928, "rewards/margins": 0.44977909326553345, "rewards/rejected": 0.5877992510795593, "step": 1772 }, { "epoch": 0.96, "learning_rate": 7.968292096129707e-08, "logits/chosen": -2.137913942337036, "logits/rejected": -2.1339900493621826, "logps/chosen": -3.1881296634674072, "logps/rejected": -3.020015001296997, "loss": 0.4093, "rewards/accuracies": 1.0, "rewards/chosen": 1.4881362915039062, "rewards/margins": 0.6817647814750671, "rewards/rejected": 0.8063715100288391, "step": 1773 }, { "epoch": 0.96, "learning_rate": 7.965948596275493e-08, "logits/chosen": -2.106583595275879, "logits/rejected": -2.3496861457824707, "logps/chosen": -0.3482743501663208, "logps/rejected": -0.3536348044872284, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.7554039359092712, "rewards/margins": 0.027577579021453857, "rewards/rejected": 0.7278263568878174, "step": 1774 }, { "epoch": 0.96, "learning_rate": 7.963604090693481e-08, "logits/chosen": -2.022587537765503, "logits/rejected": -2.0226473808288574, "logps/chosen": -0.5834206342697144, "logps/rejected": -3.763665199279785, "loss": 0.5568, "rewards/accuracies": 1.0, "rewards/chosen": 0.8926998376846313, "rewards/margins": 0.2942647337913513, "rewards/rejected": 0.59843510389328, "step": 1775 }, { "epoch": 0.96, "learning_rate": 7.961258580178669e-08, "logits/chosen": -1.9700195789337158, "logits/rejected": -2.149805784225464, "logps/chosen": -0.5681440830230713, "logps/rejected": -0.6276452541351318, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.991402804851532, "rewards/margins": 0.021259665489196777, "rewards/rejected": 0.9701431393623352, "step": 1776 }, { "epoch": 0.96, "learning_rate": 7.958912065526402e-08, "logits/chosen": -2.1229920387268066, "logits/rejected": -2.307502031326294, "logps/chosen": -2.5036308765411377, "logps/rejected": -2.381643295288086, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.807388424873352, "rewards/margins": 0.008982837200164795, "rewards/rejected": 0.7984055876731873, "step": 1777 }, { "epoch": 0.96, "learning_rate": 7.956564547532361e-08, "logits/chosen": -2.011512041091919, "logits/rejected": -2.013184070587158, "logps/chosen": -0.8940367698669434, "logps/rejected": -3.0588700771331787, "loss": 0.4975, "rewards/accuracies": 1.0, "rewards/chosen": 1.1385506391525269, "rewards/margins": 0.43918758630752563, "rewards/rejected": 0.6993630528450012, "step": 1778 }, { "epoch": 0.96, "learning_rate": 7.95421602699257e-08, "logits/chosen": -2.0488181114196777, "logits/rejected": -2.039504289627075, "logps/chosen": -13.155985832214355, "logps/rejected": -3.6011507511138916, "loss": 0.4579, "rewards/accuracies": 1.0, "rewards/chosen": 1.0392024517059326, "rewards/margins": 0.5434603691101074, "rewards/rejected": 0.4957420527935028, "step": 1779 }, { "epoch": 0.96, "learning_rate": 7.951866504703395e-08, "logits/chosen": -2.047816038131714, "logits/rejected": -2.2971386909484863, "logps/chosen": -8.829835891723633, "logps/rejected": -10.124646186828613, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.8810157775878906, "rewards/margins": 0.004441559314727783, "rewards/rejected": 0.8765742182731628, "step": 1780 }, { "epoch": 0.96, "learning_rate": 7.949515981461536e-08, "logits/chosen": -2.047523260116577, "logits/rejected": -2.2596182823181152, "logps/chosen": -0.8149874806404114, "logps/rejected": -0.827229380607605, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.9149818420410156, "rewards/margins": 0.025682449340820312, "rewards/rejected": 0.8892993927001953, "step": 1781 }, { "epoch": 0.96, "learning_rate": 7.947164458064037e-08, "logits/chosen": -2.001434803009033, "logits/rejected": -2.007657766342163, "logps/chosen": -1.7038819789886475, "logps/rejected": -3.1535072326660156, "loss": 0.5162, "rewards/accuracies": 1.0, "rewards/chosen": 0.951216995716095, "rewards/margins": 0.39215487241744995, "rewards/rejected": 0.559062123298645, "step": 1782 }, { "epoch": 0.96, "learning_rate": 7.944811935308277e-08, "logits/chosen": -2.2131268978118896, "logits/rejected": -2.1071817874908447, "logps/chosen": -32.76060485839844, "logps/rejected": -4.461193561553955, "loss": 0.2199, "rewards/accuracies": 1.0, "rewards/chosen": 1.8919674158096313, "rewards/margins": 1.4025168418884277, "rewards/rejected": 0.489450603723526, "step": 1783 }, { "epoch": 0.96, "learning_rate": 7.942458413991977e-08, "logits/chosen": -2.036489248275757, "logits/rejected": -2.0418007373809814, "logps/chosen": -11.161712646484375, "logps/rejected": -4.373228549957275, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.8432575464248657, "rewards/margins": 0.0421062707901001, "rewards/rejected": 0.8011512756347656, "step": 1784 }, { "epoch": 0.96, "learning_rate": 7.9401038949132e-08, "logits/chosen": -2.0380637645721436, "logits/rejected": -2.281741142272949, "logps/chosen": -0.3980666697025299, "logps/rejected": -0.491904616355896, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8824559450149536, "rewards/margins": 0.01850426197052002, "rewards/rejected": 0.8639516830444336, "step": 1785 }, { "epoch": 0.96, "learning_rate": 7.937748378870338e-08, "logits/chosen": -2.070960521697998, "logits/rejected": -2.271900177001953, "logps/chosen": -4.318721294403076, "logps/rejected": -1.3877410888671875, "loss": 0.7477, "rewards/accuracies": 0.0, "rewards/chosen": 0.6107714176177979, "rewards/margins": -0.10630577802658081, "rewards/rejected": 0.7170771956443787, "step": 1786 }, { "epoch": 0.96, "learning_rate": 7.935391866662127e-08, "logits/chosen": -2.09425950050354, "logits/rejected": -2.0667033195495605, "logps/chosen": -13.022745132446289, "logps/rejected": -1.9026024341583252, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 1.5574473142623901, "rewards/margins": 0.8695557117462158, "rewards/rejected": 0.6878916025161743, "step": 1787 }, { "epoch": 0.96, "learning_rate": 7.933034359087644e-08, "logits/chosen": -2.0432324409484863, "logits/rejected": -2.0416746139526367, "logps/chosen": -2.9105944633483887, "logps/rejected": -4.481776714324951, "loss": 0.3214, "rewards/accuracies": 1.0, "rewards/chosen": 1.4627552032470703, "rewards/margins": 0.9700626134872437, "rewards/rejected": 0.49269261956214905, "step": 1788 }, { "epoch": 0.96, "learning_rate": 7.930675856946296e-08, "logits/chosen": -1.9891911745071411, "logits/rejected": -1.9891217947006226, "logps/chosen": -0.28991663455963135, "logps/rejected": -4.465846061706543, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 0.937231719493866, "rewards/margins": 0.44878730177879333, "rewards/rejected": 0.48844441771507263, "step": 1789 }, { "epoch": 0.97, "learning_rate": 7.928316361037834e-08, "logits/chosen": -2.018733501434326, "logits/rejected": -2.2585349082946777, "logps/chosen": -0.9730775356292725, "logps/rejected": -0.9822074174880981, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 0.9610365033149719, "rewards/margins": 0.057361900806427, "rewards/rejected": 0.9036746025085449, "step": 1790 }, { "epoch": 0.97, "learning_rate": 7.92595587216234e-08, "logits/chosen": -2.129770278930664, "logits/rejected": -2.1137914657592773, "logps/chosen": -9.40127944946289, "logps/rejected": -5.866420745849609, "loss": 0.4072, "rewards/accuracies": 1.0, "rewards/chosen": 1.3004802465438843, "rewards/margins": 0.6879191994667053, "rewards/rejected": 0.612561047077179, "step": 1791 }, { "epoch": 0.97, "learning_rate": 7.923594391120235e-08, "logits/chosen": -2.040330410003662, "logits/rejected": -2.140495777130127, "logps/chosen": -3.6208267211914062, "logps/rejected": -10.012630462646484, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 1.2063688039779663, "rewards/margins": 0.3562503457069397, "rewards/rejected": 0.8501184582710266, "step": 1792 }, { "epoch": 0.97, "learning_rate": 7.92123191871228e-08, "logits/chosen": -2.112424373626709, "logits/rejected": -2.032480478286743, "logps/chosen": -16.149438858032227, "logps/rejected": -4.385578632354736, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 1.3472578525543213, "rewards/margins": 0.619304358959198, "rewards/rejected": 0.7279534935951233, "step": 1793 }, { "epoch": 0.97, "learning_rate": 7.918868455739568e-08, "logits/chosen": -2.045952081680298, "logits/rejected": -2.173936128616333, "logps/chosen": -0.6916958093643188, "logps/rejected": -0.5430206060409546, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7501418590545654, "rewards/margins": 0.004544615745544434, "rewards/rejected": 0.745597243309021, "step": 1794 }, { "epoch": 0.97, "learning_rate": 7.916504003003528e-08, "logits/chosen": -2.1976730823516846, "logits/rejected": -2.2478888034820557, "logps/chosen": -6.827212333679199, "logps/rejected": -17.83365821838379, "loss": 0.3545, "rewards/accuracies": 1.0, "rewards/chosen": 1.2884007692337036, "rewards/margins": 0.8545371294021606, "rewards/rejected": 0.43386363983154297, "step": 1795 }, { "epoch": 0.97, "learning_rate": 7.914138561305927e-08, "logits/chosen": -2.047926425933838, "logits/rejected": -2.117319107055664, "logps/chosen": -2.7856531143188477, "logps/rejected": -19.499576568603516, "loss": 0.5629, "rewards/accuracies": 1.0, "rewards/chosen": 1.3009799718856812, "rewards/margins": 0.28003835678100586, "rewards/rejected": 1.0209416151046753, "step": 1796 }, { "epoch": 0.97, "learning_rate": 7.911772131448866e-08, "logits/chosen": -1.9684909582138062, "logits/rejected": -1.9660978317260742, "logps/chosen": -3.192887783050537, "logps/rejected": -3.633793592453003, "loss": 0.3901, "rewards/accuracies": 1.0, "rewards/chosen": 1.3416311740875244, "rewards/margins": 0.7400497794151306, "rewards/rejected": 0.6015813946723938, "step": 1797 }, { "epoch": 0.97, "learning_rate": 7.909404714234781e-08, "logits/chosen": -2.0012099742889404, "logits/rejected": -2.2958362102508545, "logps/chosen": -0.655737042427063, "logps/rejected": -0.7819558382034302, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 1.0396314859390259, "rewards/margins": 0.05070841312408447, "rewards/rejected": 0.9889230728149414, "step": 1798 }, { "epoch": 0.97, "learning_rate": 7.907036310466442e-08, "logits/chosen": -1.990256905555725, "logits/rejected": -2.2425665855407715, "logps/chosen": -0.6156795024871826, "logps/rejected": -0.6509857177734375, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.8267180323600769, "rewards/margins": -0.0028482675552368164, "rewards/rejected": 0.8295662999153137, "step": 1799 }, { "epoch": 0.97, "learning_rate": 7.904666920946956e-08, "logits/chosen": -2.1505258083343506, "logits/rejected": -2.2244186401367188, "logps/chosen": -6.328571319580078, "logps/rejected": -17.1632022857666, "loss": 0.5034, "rewards/accuracies": 1.0, "rewards/chosen": 1.0198936462402344, "rewards/margins": 0.42403143644332886, "rewards/rejected": 0.5958622097969055, "step": 1800 }, { "epoch": 0.97, "learning_rate": 7.902296546479761e-08, "logits/chosen": -2.1068179607391357, "logits/rejected": -2.276650905609131, "logps/chosen": -0.7461040019989014, "logps/rejected": -0.7223376035690308, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.9759469032287598, "rewards/margins": 0.031041622161865234, "rewards/rejected": 0.9449052810668945, "step": 1801 }, { "epoch": 0.97, "learning_rate": 7.899925187868632e-08, "logits/chosen": -2.154935598373413, "logits/rejected": -2.1596107482910156, "logps/chosen": -2.9686124324798584, "logps/rejected": -3.574246883392334, "loss": 0.4276, "rewards/accuracies": 1.0, "rewards/chosen": 1.1955889463424683, "rewards/margins": 0.6281158328056335, "rewards/rejected": 0.5674731135368347, "step": 1802 }, { "epoch": 0.97, "learning_rate": 7.897552845917676e-08, "logits/chosen": -2.1410560607910156, "logits/rejected": -2.2743093967437744, "logps/chosen": -2.7668018341064453, "logps/rejected": -0.7272323369979858, "loss": 0.7182, "rewards/accuracies": 0.0, "rewards/chosen": 0.8409199714660645, "rewards/margins": -0.049485862255096436, "rewards/rejected": 0.8904058337211609, "step": 1803 }, { "epoch": 0.97, "learning_rate": 7.895179521431334e-08, "logits/chosen": -2.0312600135803223, "logits/rejected": -2.248901844024658, "logps/chosen": -1.4393701553344727, "logps/rejected": -45.97456741333008, "loss": 0.2561, "rewards/accuracies": 1.0, "rewards/chosen": 0.8638567328453064, "rewards/margins": 1.2312560081481934, "rewards/rejected": -0.3673992156982422, "step": 1804 }, { "epoch": 0.97, "learning_rate": 7.89280521521438e-08, "logits/chosen": -2.0578649044036865, "logits/rejected": -2.0583949089050293, "logps/chosen": -2.8145408630371094, "logps/rejected": -1.1510738134384155, "loss": 0.6173, "rewards/accuracies": 1.0, "rewards/chosen": 1.0836132764816284, "rewards/margins": 0.1579095721244812, "rewards/rejected": 0.9257037043571472, "step": 1805 }, { "epoch": 0.97, "learning_rate": 7.89042992807192e-08, "logits/chosen": -2.072819232940674, "logits/rejected": -2.333815813064575, "logps/chosen": -2.4089386463165283, "logps/rejected": -2.3468875885009766, "loss": 0.662, "rewards/accuracies": 1.0, "rewards/chosen": 1.0559306144714355, "rewards/margins": 0.06334227323532104, "rewards/rejected": 0.9925883412361145, "step": 1806 }, { "epoch": 0.97, "learning_rate": 7.888053660809395e-08, "logits/chosen": -1.9568188190460205, "logits/rejected": -1.9608709812164307, "logps/chosen": -2.0517709255218506, "logps/rejected": -4.116324424743652, "loss": 0.665, "rewards/accuracies": 1.0, "rewards/chosen": 1.0900501012802124, "rewards/margins": 0.05709719657897949, "rewards/rejected": 1.032952904701233, "step": 1807 }, { "epoch": 0.98, "learning_rate": 7.885676414232576e-08, "logits/chosen": -2.055349111557007, "logits/rejected": -2.050389289855957, "logps/chosen": -11.820749282836914, "logps/rejected": -5.618762016296387, "loss": 0.2972, "rewards/accuracies": 1.0, "rewards/chosen": 1.660073161125183, "rewards/margins": 1.0610554218292236, "rewards/rejected": 0.5990177989006042, "step": 1808 }, { "epoch": 0.98, "learning_rate": 7.883298189147567e-08, "logits/chosen": -1.9617233276367188, "logits/rejected": -2.2505600452423096, "logps/chosen": -0.4777212142944336, "logps/rejected": -0.46338891983032227, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9355788230895996, "rewards/margins": 0.0010626912117004395, "rewards/rejected": 0.9345161318778992, "step": 1809 }, { "epoch": 0.98, "learning_rate": 7.880918986360804e-08, "logits/chosen": -2.011737585067749, "logits/rejected": -2.0066211223602295, "logps/chosen": -6.895923614501953, "logps/rejected": -4.305591583251953, "loss": 0.3093, "rewards/accuracies": 1.0, "rewards/chosen": 1.5635101795196533, "rewards/margins": 1.014784574508667, "rewards/rejected": 0.5487255454063416, "step": 1810 }, { "epoch": 0.98, "learning_rate": 7.878538806679056e-08, "logits/chosen": -2.071601152420044, "logits/rejected": -2.0776681900024414, "logps/chosen": -2.3368284702301025, "logps/rejected": -4.100432395935059, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 1.056906819343567, "rewards/margins": 0.41827481985092163, "rewards/rejected": 0.6386319994926453, "step": 1811 }, { "epoch": 0.98, "learning_rate": 7.876157650909417e-08, "logits/chosen": -2.234295606613159, "logits/rejected": -2.1935465335845947, "logps/chosen": -40.733638763427734, "logps/rejected": -11.789846420288086, "loss": 0.2886, "rewards/accuracies": 1.0, "rewards/chosen": 1.8364101648330688, "rewards/margins": 1.0948196649551392, "rewards/rejected": 0.7415904998779297, "step": 1812 }, { "epoch": 0.98, "learning_rate": 7.873775519859323e-08, "logits/chosen": -2.158886194229126, "logits/rejected": -2.2964751720428467, "logps/chosen": -2.442409038543701, "logps/rejected": -1.6954307556152344, "loss": 0.77, "rewards/accuracies": 0.0, "rewards/chosen": 0.8350790143013, "rewards/margins": -0.14812582731246948, "rewards/rejected": 0.9832048416137695, "step": 1813 }, { "epoch": 0.98, "learning_rate": 7.871392414336529e-08, "logits/chosen": -2.031403064727783, "logits/rejected": -2.2617950439453125, "logps/chosen": -0.8584333658218384, "logps/rejected": -0.8803176283836365, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.9889564514160156, "rewards/margins": 0.029726386070251465, "rewards/rejected": 0.9592300653457642, "step": 1814 }, { "epoch": 0.98, "learning_rate": 7.869008335149126e-08, "logits/chosen": -2.0201165676116943, "logits/rejected": -2.3151895999908447, "logps/chosen": -0.9380027651786804, "logps/rejected": -5.318385124206543, "loss": 0.6294, "rewards/accuracies": 1.0, "rewards/chosen": 1.029266595840454, "rewards/margins": 0.13192951679229736, "rewards/rejected": 0.8973370790481567, "step": 1815 }, { "epoch": 0.98, "learning_rate": 7.866623283105538e-08, "logits/chosen": -2.0423266887664795, "logits/rejected": -2.298764705657959, "logps/chosen": -0.19573506712913513, "logps/rejected": -0.19560953974723816, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8571190237998962, "rewards/margins": 0.02304816246032715, "rewards/rejected": 0.8340708613395691, "step": 1816 }, { "epoch": 0.98, "learning_rate": 7.864237259014514e-08, "logits/chosen": -2.117515802383423, "logits/rejected": -2.275339365005493, "logps/chosen": -1.0471447706222534, "logps/rejected": -1.0702426433563232, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.9826914072036743, "rewards/margins": -0.008169233798980713, "rewards/rejected": 0.990860641002655, "step": 1817 }, { "epoch": 0.98, "learning_rate": 7.861850263685133e-08, "logits/chosen": -2.2218589782714844, "logits/rejected": -2.141434669494629, "logps/chosen": -24.38216781616211, "logps/rejected": -3.486752986907959, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 1.6171833276748657, "rewards/margins": 0.9144529104232788, "rewards/rejected": 0.7027304172515869, "step": 1818 }, { "epoch": 0.98, "learning_rate": 7.859462297926809e-08, "logits/chosen": -2.1452863216400146, "logits/rejected": -2.3215465545654297, "logps/chosen": -0.39722833037376404, "logps/rejected": -0.39787113666534424, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9346005320549011, "rewards/margins": 0.01260441541671753, "rewards/rejected": 0.9219961166381836, "step": 1819 }, { "epoch": 0.98, "learning_rate": 7.857073362549276e-08, "logits/chosen": -1.9515864849090576, "logits/rejected": -1.950227975845337, "logps/chosen": -8.085993766784668, "logps/rejected": -3.269115686416626, "loss": 0.3836, "rewards/accuracies": 1.0, "rewards/chosen": 1.3690348863601685, "rewards/margins": 0.7601704001426697, "rewards/rejected": 0.6088644862174988, "step": 1820 }, { "epoch": 0.98, "learning_rate": 7.854683458362603e-08, "logits/chosen": -1.9835889339447021, "logits/rejected": -2.216756820678711, "logps/chosen": -0.6750068664550781, "logps/rejected": -0.7844393849372864, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8551847338676453, "rewards/margins": 0.015891432762145996, "rewards/rejected": 0.8392933011054993, "step": 1821 }, { "epoch": 0.98, "learning_rate": 7.852292586177188e-08, "logits/chosen": -2.1744372844696045, "logits/rejected": -2.336719512939453, "logps/chosen": -18.772235870361328, "logps/rejected": -4.961979866027832, "loss": 0.7815, "rewards/accuracies": 0.0, "rewards/chosen": 0.646579384803772, "rewards/margins": -0.16955095529556274, "rewards/rejected": 0.8161303400993347, "step": 1822 }, { "epoch": 0.98, "learning_rate": 7.849900746803752e-08, "logits/chosen": -2.1094655990600586, "logits/rejected": -2.120485782623291, "logps/chosen": -7.48199462890625, "logps/rejected": -2.44331693649292, "loss": 0.5318, "rewards/accuracies": 1.0, "rewards/chosen": 1.0725246667861938, "rewards/margins": 0.3537862300872803, "rewards/rejected": 0.7187384366989136, "step": 1823 }, { "epoch": 0.98, "learning_rate": 7.847507941053351e-08, "logits/chosen": -2.1067659854888916, "logits/rejected": -2.2344448566436768, "logps/chosen": -0.5799385905265808, "logps/rejected": -0.6536626219749451, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.826130211353302, "rewards/margins": 0.04688626527786255, "rewards/rejected": 0.7792439460754395, "step": 1824 }, { "epoch": 0.98, "learning_rate": 7.845114169737361e-08, "logits/chosen": -2.126478433609009, "logits/rejected": -2.1275272369384766, "logps/chosen": -0.46997392177581787, "logps/rejected": -4.759087562561035, "loss": 0.4863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9421043395996094, "rewards/margins": 0.46806639432907104, "rewards/rejected": 0.47403794527053833, "step": 1825 }, { "epoch": 0.98, "learning_rate": 7.842719433667493e-08, "logits/chosen": -1.9746348857879639, "logits/rejected": -1.979093074798584, "logps/chosen": -1.1295222043991089, "logps/rejected": -1.9762437343597412, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 1.1440181732177734, "rewards/margins": 0.4020307660102844, "rewards/rejected": 0.741987407207489, "step": 1826 }, { "epoch": 0.99, "learning_rate": 7.840323733655779e-08, "logits/chosen": -2.028198003768921, "logits/rejected": -2.0223896503448486, "logps/chosen": -3.4576573371887207, "logps/rejected": -2.668344497680664, "loss": 0.6293, "rewards/accuracies": 1.0, "rewards/chosen": 0.9113489389419556, "rewards/margins": 0.1321067214012146, "rewards/rejected": 0.779242217540741, "step": 1827 }, { "epoch": 0.99, "learning_rate": 7.837927070514581e-08, "logits/chosen": -2.1637630462646484, "logits/rejected": -2.17156982421875, "logps/chosen": -4.868817329406738, "logps/rejected": -6.358755111694336, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": 1.5536086559295654, "rewards/margins": 0.6822890639305115, "rewards/rejected": 0.871319591999054, "step": 1828 }, { "epoch": 0.99, "learning_rate": 7.835529445056587e-08, "logits/chosen": -2.0498945713043213, "logits/rejected": -2.250948190689087, "logps/chosen": -1.2019158601760864, "logps/rejected": -1.3372002840042114, "loss": 0.6789, "rewards/accuracies": 1.0, "rewards/chosen": 0.9204146265983582, "rewards/margins": 0.02879244089126587, "rewards/rejected": 0.8916221857070923, "step": 1829 }, { "epoch": 0.99, "learning_rate": 7.833130858094814e-08, "logits/chosen": -1.912564754486084, "logits/rejected": -2.194934606552124, "logps/chosen": -0.48406386375427246, "logps/rejected": -0.49154144525527954, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.784882664680481, "rewards/margins": 0.021211326122283936, "rewards/rejected": 0.763671338558197, "step": 1830 }, { "epoch": 0.99, "learning_rate": 7.830731310442599e-08, "logits/chosen": -2.106934070587158, "logits/rejected": -2.0455820560455322, "logps/chosen": -12.806928634643555, "logps/rejected": -7.545398712158203, "loss": 0.824, "rewards/accuracies": 0.0, "rewards/chosen": 0.4295744001865387, "rewards/margins": -0.2464762032032013, "rewards/rejected": 0.67605060338974, "step": 1831 }, { "epoch": 0.99, "learning_rate": 7.828330802913607e-08, "logits/chosen": -2.112987995147705, "logits/rejected": -2.117975950241089, "logps/chosen": -2.6077582836151123, "logps/rejected": -3.9482474327087402, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": 1.00918710231781, "rewards/margins": 0.47301989793777466, "rewards/rejected": 0.5361672043800354, "step": 1832 }, { "epoch": 0.99, "learning_rate": 7.825929336321835e-08, "logits/chosen": -2.1475625038146973, "logits/rejected": -2.138868570327759, "logps/chosen": -1.543808937072754, "logps/rejected": -5.747866153717041, "loss": 0.4663, "rewards/accuracies": 1.0, "rewards/chosen": 1.0336774587631226, "rewards/margins": 0.5207476615905762, "rewards/rejected": 0.5129297971725464, "step": 1833 }, { "epoch": 0.99, "learning_rate": 7.823526911481593e-08, "logits/chosen": -2.075684070587158, "logits/rejected": -2.293264865875244, "logps/chosen": -0.20265908539295197, "logps/rejected": -0.20167411863803864, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9260813593864441, "rewards/margins": 0.018587946891784668, "rewards/rejected": 0.9074934124946594, "step": 1834 }, { "epoch": 0.99, "learning_rate": 7.821123529207528e-08, "logits/chosen": -2.0556373596191406, "logits/rejected": -2.0624172687530518, "logps/chosen": -0.8270714282989502, "logps/rejected": -6.913473606109619, "loss": 0.4217, "rewards/accuracies": 1.0, "rewards/chosen": 1.1397863626480103, "rewards/margins": 0.6452603936195374, "rewards/rejected": 0.4945259690284729, "step": 1835 }, { "epoch": 0.99, "learning_rate": 7.818719190314604e-08, "logits/chosen": -2.0629079341888428, "logits/rejected": -2.2996973991394043, "logps/chosen": -0.6018937826156616, "logps/rejected": -0.6045300960540771, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.8955439925193787, "rewards/margins": 0.042449951171875, "rewards/rejected": 0.8530940413475037, "step": 1836 }, { "epoch": 0.99, "learning_rate": 7.816313895618112e-08, "logits/chosen": -2.1621134281158447, "logits/rejected": -1.9961826801300049, "logps/chosen": -41.90658950805664, "logps/rejected": -4.399301052093506, "loss": 0.3478, "rewards/accuracies": 1.0, "rewards/chosen": 1.6100231409072876, "rewards/margins": 0.8770471215248108, "rewards/rejected": 0.7329760193824768, "step": 1837 }, { "epoch": 0.99, "learning_rate": 7.813907645933667e-08, "logits/chosen": -2.0043840408325195, "logits/rejected": -2.321014165878296, "logps/chosen": -2.262199878692627, "logps/rejected": -0.8389002084732056, "loss": 0.7106, "rewards/accuracies": 0.0, "rewards/chosen": 1.0019012689590454, "rewards/margins": -0.03450918197631836, "rewards/rejected": 1.0364104509353638, "step": 1838 }, { "epoch": 0.99, "learning_rate": 7.811500442077208e-08, "logits/chosen": -2.0294198989868164, "logits/rejected": -2.009594678878784, "logps/chosen": -7.182037353515625, "logps/rejected": -5.851757049560547, "loss": 0.3769, "rewards/accuracies": 1.0, "rewards/chosen": 1.3163756132125854, "rewards/margins": 0.7813683152198792, "rewards/rejected": 0.5350072979927063, "step": 1839 }, { "epoch": 0.99, "learning_rate": 7.809092284864998e-08, "logits/chosen": -2.1824662685394287, "logits/rejected": -2.194451332092285, "logps/chosen": -2.379988193511963, "logps/rejected": -2.461676597595215, "loss": 0.5059, "rewards/accuracies": 1.0, "rewards/chosen": 1.2623175382614136, "rewards/margins": 0.4177330732345581, "rewards/rejected": 0.8445844650268555, "step": 1840 }, { "epoch": 0.99, "learning_rate": 7.806683175113618e-08, "logits/chosen": -2.2322187423706055, "logits/rejected": -2.2503130435943604, "logps/chosen": -17.832738876342773, "logps/rejected": -18.946735382080078, "loss": 0.5539, "rewards/accuracies": 1.0, "rewards/chosen": 1.281265139579773, "rewards/margins": 0.3010561466217041, "rewards/rejected": 0.9802089929580688, "step": 1841 }, { "epoch": 0.99, "learning_rate": 7.804273113639983e-08, "logits/chosen": -2.1913065910339355, "logits/rejected": -2.1908833980560303, "logps/chosen": -1.9908185005187988, "logps/rejected": -4.858588218688965, "loss": 0.4334, "rewards/accuracies": 1.0, "rewards/chosen": 1.0730379819869995, "rewards/margins": 0.6114836931228638, "rewards/rejected": 0.46155425906181335, "step": 1842 }, { "epoch": 0.99, "learning_rate": 7.80186210126132e-08, "logits/chosen": -2.072901964187622, "logits/rejected": -2.0626351833343506, "logps/chosen": -1.4942405223846436, "logps/rejected": -10.299192428588867, "loss": 0.5907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9750474095344543, "rewards/margins": 0.21670866012573242, "rewards/rejected": 0.7583387494087219, "step": 1843 }, { "epoch": 0.99, "learning_rate": 7.799450138795184e-08, "logits/chosen": -2.0647823810577393, "logits/rejected": -2.256129026412964, "logps/chosen": -0.31357061862945557, "logps/rejected": -0.3272177278995514, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.9087240099906921, "rewards/margins": 0.0004975795745849609, "rewards/rejected": 0.9082264304161072, "step": 1844 }, { "epoch": 1.0, "learning_rate": 7.797037227059454e-08, "logits/chosen": -2.0839762687683105, "logits/rejected": -2.2336061000823975, "logps/chosen": -1.2041743993759155, "logps/rejected": -1.2398663759231567, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.7394775748252869, "rewards/margins": 0.010128438472747803, "rewards/rejected": 0.7293491363525391, "step": 1845 }, { "epoch": 1.0, "learning_rate": 7.79462336687232e-08, "logits/chosen": -1.969627857208252, "logits/rejected": -1.9695254564285278, "logps/chosen": -1.9387328624725342, "logps/rejected": -1.0091087818145752, "loss": 0.5862, "rewards/accuracies": 1.0, "rewards/chosen": 1.1011748313903809, "rewards/margins": 0.2266407608985901, "rewards/rejected": 0.8745340704917908, "step": 1846 }, { "epoch": 1.0, "learning_rate": 7.792208559052307e-08, "logits/chosen": -1.9865238666534424, "logits/rejected": -1.9780551195144653, "logps/chosen": -5.72704553604126, "logps/rejected": -4.844179153442383, "loss": 0.3144, "rewards/accuracies": 1.0, "rewards/chosen": 1.5080894231796265, "rewards/margins": 0.9957007169723511, "rewards/rejected": 0.5123887062072754, "step": 1847 }, { "epoch": 1.0, "learning_rate": 7.789792804418255e-08, "logits/chosen": -1.9772884845733643, "logits/rejected": -2.2341878414154053, "logps/chosen": -0.9538307785987854, "logps/rejected": -1.0894513130187988, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589853644371033, "rewards/margins": 0.02425217628479004, "rewards/rejected": 0.8347331881523132, "step": 1848 }, { "epoch": 1.0, "learning_rate": 7.787376103789324e-08, "logits/chosen": -2.1309797763824463, "logits/rejected": -2.1743505001068115, "logps/chosen": -5.448548316955566, "logps/rejected": -11.00941276550293, "loss": 0.3585, "rewards/accuracies": 1.0, "rewards/chosen": 1.4793978929519653, "rewards/margins": 0.8413193821907043, "rewards/rejected": 0.638078510761261, "step": 1849 }, { "epoch": 1.0, "learning_rate": 7.784958457985001e-08, "logits/chosen": -2.081620693206787, "logits/rejected": -2.274723768234253, "logps/chosen": -0.5946215987205505, "logps/rejected": -0.694605827331543, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.9207114577293396, "rewards/margins": 0.02118903398513794, "rewards/rejected": 0.8995224237442017, "step": 1850 }, { "epoch": 1.0, "learning_rate": 7.782539867825082e-08, "logits/chosen": -2.01495099067688, "logits/rejected": -2.259774923324585, "logps/chosen": -1.0671645402908325, "logps/rejected": -1.0553269386291504, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 1.0233441591262817, "rewards/margins": 0.0117417573928833, "rewards/rejected": 1.0116024017333984, "step": 1851 }, { "epoch": 1.0, "learning_rate": 7.780120334129696e-08, "logits/chosen": -2.0244734287261963, "logits/rejected": -2.028137683868408, "logps/chosen": -1.616405725479126, "logps/rejected": -2.122354030609131, "loss": 0.508, "rewards/accuracies": 1.0, "rewards/chosen": 1.065348505973816, "rewards/margins": 0.41254186630249023, "rewards/rejected": 0.6528066396713257, "step": 1852 }, { "epoch": 1.0, "learning_rate": 7.777699857719284e-08, "logits/chosen": -2.341142416000366, "logits/rejected": -2.2886393070220947, "logps/chosen": -25.68854522705078, "logps/rejected": -4.744428634643555, "loss": 0.2314, "rewards/accuracies": 1.0, "rewards/chosen": 1.8490371704101562, "rewards/margins": 1.3454501628875732, "rewards/rejected": 0.5035870671272278, "step": 1853 }, { "epoch": 1.0, "learning_rate": 7.775278439414609e-08, "logits/chosen": -1.9927440881729126, "logits/rejected": -2.223738193511963, "logps/chosen": -0.6545814275741577, "logps/rejected": -0.5556369423866272, "loss": 0.7009, "rewards/accuracies": 0.0, "rewards/chosen": 0.8923606872558594, "rewards/margins": -0.01549607515335083, "rewards/rejected": 0.9078567624092102, "step": 1854 }, { "epoch": 1.0, "learning_rate": 7.772856080036753e-08, "logits/chosen": -2.0454511642456055, "logits/rejected": -2.0442705154418945, "logps/chosen": -1.2016643285751343, "logps/rejected": -1.748696208000183, "loss": 0.6251, "rewards/accuracies": 1.0, "rewards/chosen": 0.9706827998161316, "rewards/margins": 0.14113032817840576, "rewards/rejected": 0.8295524716377258, "step": 1855 }, { "epoch": 1.0, "learning_rate": 7.770432780407117e-08, "logits/chosen": -1.9853756427764893, "logits/rejected": -1.984107494354248, "logps/chosen": -1.974853277206421, "logps/rejected": -3.9100842475891113, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 0.9337143898010254, "rewards/margins": 0.4429051876068115, "rewards/rejected": 0.49080920219421387, "step": 1856 }, { "epoch": 1.0, "learning_rate": 7.768008541347422e-08, "logits/chosen": -2.0720319747924805, "logits/rejected": -2.0697600841522217, "logps/chosen": -4.055943965911865, "logps/rejected": -10.976048469543457, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 0.9954595565795898, "rewards/margins": 0.6347254514694214, "rewards/rejected": 0.36073407530784607, "step": 1857 }, { "epoch": 1.0, "learning_rate": 7.765583363679707e-08, "logits/chosen": -2.1634514331817627, "logits/rejected": -2.1618335247039795, "logps/chosen": -1.163830041885376, "logps/rejected": -4.885218620300293, "loss": 0.501, "rewards/accuracies": 1.0, "rewards/chosen": 1.0157248973846436, "rewards/margins": 0.4302977919578552, "rewards/rejected": 0.5854271054267883, "step": 1858 }, { "epoch": 1.0, "learning_rate": 7.763157248226329e-08, "logits/chosen": -2.020683765411377, "logits/rejected": -2.255347490310669, "logps/chosen": -0.4060189127922058, "logps/rejected": -0.4526793360710144, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754491806030273, "rewards/margins": 0.01249009370803833, "rewards/rejected": 0.862959086894989, "step": 1859 }, { "epoch": 1.0, "learning_rate": 7.760730195809961e-08, "logits/chosen": -2.3131728172302246, "logits/rejected": -2.0764760971069336, "logps/chosen": -51.282432556152344, "logps/rejected": -5.475815296173096, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 1.7298415899276733, "rewards/margins": 1.2184568643569946, "rewards/rejected": 0.5113847255706787, "step": 1860 }, { "epoch": 1.0, "learning_rate": 7.758302207253597e-08, "logits/chosen": -2.0796170234680176, "logits/rejected": -2.316345691680908, "logps/chosen": -3.8661322593688965, "logps/rejected": -13.703840255737305, "loss": 0.7185, "rewards/accuracies": 0.0, "rewards/chosen": 0.9441888928413391, "rewards/margins": -0.050158679485321045, "rewards/rejected": 0.9943475723266602, "step": 1861 }, { "epoch": 1.0, "learning_rate": 7.755873283380549e-08, "logits/chosen": -2.1447391510009766, "logits/rejected": -2.3205344676971436, "logps/chosen": -3.0188329219818115, "logps/rejected": -2.782233953475952, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.801764190196991, "rewards/margins": -0.0077326297760009766, "rewards/rejected": 0.8094968199729919, "step": 1862 }, { "epoch": 1.0, "learning_rate": 7.753443425014443e-08, "logits/chosen": -2.062074899673462, "logits/rejected": -2.2418336868286133, "logps/chosen": -3.284979820251465, "logps/rejected": -6.241766929626465, "loss": 0.6546, "rewards/accuracies": 1.0, "rewards/chosen": 0.5598599314689636, "rewards/margins": 0.07859820127487183, "rewards/rejected": 0.4812617301940918, "step": 1863 }, { "epoch": 1.01, "learning_rate": 7.751012632979222e-08, "logits/chosen": -2.1656363010406494, "logits/rejected": -2.169482946395874, "logps/chosen": -0.8982020616531372, "logps/rejected": -5.278018951416016, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925934672355652, "rewards/margins": 0.6013839244842529, "rewards/rejected": 0.39120951294898987, "step": 1864 }, { "epoch": 1.01, "learning_rate": 7.748580908099147e-08, "logits/chosen": -2.014617681503296, "logits/rejected": -2.2384390830993652, "logps/chosen": -0.5769468545913696, "logps/rejected": -0.6458670496940613, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.9379501342773438, "rewards/margins": -0.011539757251739502, "rewards/rejected": 0.9494898915290833, "step": 1865 }, { "epoch": 1.01, "learning_rate": 7.746148251198796e-08, "logits/chosen": -2.0879414081573486, "logits/rejected": -2.3445327281951904, "logps/chosen": -1.9437334537506104, "logps/rejected": -1.716030478477478, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 1.0104047060012817, "rewards/margins": -0.011574506759643555, "rewards/rejected": 1.0219792127609253, "step": 1866 }, { "epoch": 1.01, "learning_rate": 7.743714663103062e-08, "logits/chosen": -2.0417263507843018, "logits/rejected": -2.036708116531372, "logps/chosen": -7.8865861892700195, "logps/rejected": -7.63111686706543, "loss": 0.4359, "rewards/accuracies": 1.0, "rewards/chosen": 1.1060248613357544, "rewards/margins": 0.6043552756309509, "rewards/rejected": 0.5016695857048035, "step": 1867 }, { "epoch": 1.01, "learning_rate": 7.741280144637154e-08, "logits/chosen": -2.0359809398651123, "logits/rejected": -2.2471656799316406, "logps/chosen": -0.5823577642440796, "logps/rejected": -0.5848076343536377, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8536843657493591, "rewards/margins": 0.020906567573547363, "rewards/rejected": 0.8327777981758118, "step": 1868 }, { "epoch": 1.01, "learning_rate": 7.738844696626597e-08, "logits/chosen": -2.1074867248535156, "logits/rejected": -2.317004680633545, "logps/chosen": -0.7559265494346619, "logps/rejected": -0.7713809013366699, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.8648085594177246, "rewards/margins": -0.0007537007331848145, "rewards/rejected": 0.8655622601509094, "step": 1869 }, { "epoch": 1.01, "learning_rate": 7.73640831989723e-08, "logits/chosen": -2.0580780506134033, "logits/rejected": -2.221334457397461, "logps/chosen": -0.9943125247955322, "logps/rejected": -1.0564500093460083, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.8708839416503906, "rewards/margins": 0.008191108703613281, "rewards/rejected": 0.8626928329467773, "step": 1870 }, { "epoch": 1.01, "learning_rate": 7.733971015275205e-08, "logits/chosen": -2.050697088241577, "logits/rejected": -2.253545045852661, "logps/chosen": -8.78093433380127, "logps/rejected": -2.8414595127105713, "loss": 0.7481, "rewards/accuracies": 0.0, "rewards/chosen": 0.8710446357727051, "rewards/margins": -0.107052743434906, "rewards/rejected": 0.9780973792076111, "step": 1871 }, { "epoch": 1.01, "learning_rate": 7.731532783586996e-08, "logits/chosen": -2.030717134475708, "logits/rejected": -2.248830795288086, "logps/chosen": -0.6409630179405212, "logps/rejected": -0.6627905368804932, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8268676996231079, "rewards/margins": 0.020858705043792725, "rewards/rejected": 0.8060089945793152, "step": 1872 }, { "epoch": 1.01, "learning_rate": 7.729093625659381e-08, "logits/chosen": -2.11846661567688, "logits/rejected": -2.299842119216919, "logps/chosen": -2.768637180328369, "logps/rejected": -2.9358510971069336, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 0.6263853907585144, "rewards/margins": 0.039948225021362305, "rewards/rejected": 0.5864371657371521, "step": 1873 }, { "epoch": 1.01, "learning_rate": 7.726653542319463e-08, "logits/chosen": -2.146263837814331, "logits/rejected": -2.27807879447937, "logps/chosen": -0.8272715210914612, "logps/rejected": -0.767108678817749, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9062531590461731, "rewards/margins": 0.008257746696472168, "rewards/rejected": 0.8979954123497009, "step": 1874 }, { "epoch": 1.01, "learning_rate": 7.724212534394649e-08, "logits/chosen": -2.249870777130127, "logits/rejected": -2.1959922313690186, "logps/chosen": -13.189154624938965, "logps/rejected": -9.996596336364746, "loss": 0.2323, "rewards/accuracies": 1.0, "rewards/chosen": 1.823747992515564, "rewards/margins": 1.3415087461471558, "rewards/rejected": 0.4822392463684082, "step": 1875 }, { "epoch": 1.01, "learning_rate": 7.72177060271267e-08, "logits/chosen": -2.248685121536255, "logits/rejected": -2.2869510650634766, "logps/chosen": -1.3057217597961426, "logps/rejected": -1.1800867319107056, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735815286636353, "rewards/margins": 0.004882752895355225, "rewards/rejected": 0.86869877576828, "step": 1876 }, { "epoch": 1.01, "learning_rate": 7.719327748101557e-08, "logits/chosen": -2.2866697311401367, "logits/rejected": -2.2434377670288086, "logps/chosen": -7.9789886474609375, "logps/rejected": -7.857353210449219, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.43233558535575867, "rewards/margins": 0.011457085609436035, "rewards/rejected": 0.42087849974632263, "step": 1877 }, { "epoch": 1.01, "learning_rate": 7.716883971389663e-08, "logits/chosen": -2.122828483581543, "logits/rejected": -2.006678342819214, "logps/chosen": -26.169200897216797, "logps/rejected": -2.5022640228271484, "loss": 0.4085, "rewards/accuracies": 1.0, "rewards/chosen": 1.3303565979003906, "rewards/margins": 0.6840190887451172, "rewards/rejected": 0.6463375091552734, "step": 1878 }, { "epoch": 1.01, "learning_rate": 7.714439273405655e-08, "logits/chosen": -2.00970196723938, "logits/rejected": -2.2501795291900635, "logps/chosen": -0.6891146302223206, "logps/rejected": -0.6512959003448486, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9938627481460571, "rewards/margins": 0.012385070323944092, "rewards/rejected": 0.981477677822113, "step": 1879 }, { "epoch": 1.01, "learning_rate": 7.711993654978506e-08, "logits/chosen": -2.005650281906128, "logits/rejected": -1.9881823062896729, "logps/chosen": -9.739752769470215, "logps/rejected": -10.434122085571289, "loss": 0.5091, "rewards/accuracies": 1.0, "rewards/chosen": 1.221269130706787, "rewards/margins": 0.40972965955734253, "rewards/rejected": 0.8115394711494446, "step": 1880 }, { "epoch": 1.01, "learning_rate": 7.709547116937504e-08, "logits/chosen": -2.0612313747406006, "logits/rejected": -2.280724048614502, "logps/chosen": -1.1991134881973267, "logps/rejected": -1.3628113269805908, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.889046847820282, "rewards/margins": 0.04049402475357056, "rewards/rejected": 0.8485528230667114, "step": 1881 }, { "epoch": 1.02, "learning_rate": 7.707099660112253e-08, "logits/chosen": -1.9939919710159302, "logits/rejected": -1.9962427616119385, "logps/chosen": -1.6713861227035522, "logps/rejected": -4.658858776092529, "loss": 0.4272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0452399253845215, "rewards/margins": 0.6293469667434692, "rewards/rejected": 0.41589292883872986, "step": 1882 }, { "epoch": 1.02, "learning_rate": 7.704651285332662e-08, "logits/chosen": -2.0695416927337646, "logits/rejected": -2.2535977363586426, "logps/chosen": -0.8596264123916626, "logps/rejected": -1.1557101011276245, "loss": 0.7193, "rewards/accuracies": 0.0, "rewards/chosen": 0.827976405620575, "rewards/margins": -0.05157577991485596, "rewards/rejected": 0.8795521855354309, "step": 1883 }, { "epoch": 1.02, "learning_rate": 7.702201993428953e-08, "logits/chosen": -2.101792573928833, "logits/rejected": -2.1016845703125, "logps/chosen": -4.695785045623779, "logps/rejected": -2.3449621200561523, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.7930039167404175, "rewards/margins": 1.204768180847168, "rewards/rejected": 0.5882357954978943, "step": 1884 }, { "epoch": 1.02, "learning_rate": 7.69975178523166e-08, "logits/chosen": -2.134033441543579, "logits/rejected": -2.0322000980377197, "logps/chosen": -27.57879066467285, "logps/rejected": -3.1060032844543457, "loss": 0.2451, "rewards/accuracies": 1.0, "rewards/chosen": 1.8940194845199585, "rewards/margins": 1.2809630632400513, "rewards/rejected": 0.6130564212799072, "step": 1885 }, { "epoch": 1.02, "learning_rate": 7.69730066157163e-08, "logits/chosen": -1.9827749729156494, "logits/rejected": -2.2770745754241943, "logps/chosen": -0.4965636730194092, "logps/rejected": -0.5231456160545349, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9795660972595215, "rewards/margins": 0.022395014762878418, "rewards/rejected": 0.9571710824966431, "step": 1886 }, { "epoch": 1.02, "learning_rate": 7.694848623280017e-08, "logits/chosen": -2.0812947750091553, "logits/rejected": -2.3007853031158447, "logps/chosen": -12.800086975097656, "logps/rejected": -7.050943851470947, "loss": 0.8107, "rewards/accuracies": 0.0, "rewards/chosen": 0.6433597803115845, "rewards/margins": -0.2226734161376953, "rewards/rejected": 0.8660331964492798, "step": 1887 }, { "epoch": 1.02, "learning_rate": 7.692395671188285e-08, "logits/chosen": -2.080566883087158, "logits/rejected": -2.0810484886169434, "logps/chosen": -2.8933165073394775, "logps/rejected": -6.3760986328125, "loss": 0.3849, "rewards/accuracies": 1.0, "rewards/chosen": 1.4124048948287964, "rewards/margins": 0.7560851573944092, "rewards/rejected": 0.6563197374343872, "step": 1888 }, { "epoch": 1.02, "learning_rate": 7.689941806128212e-08, "logits/chosen": -2.1295132637023926, "logits/rejected": -2.1731693744659424, "logps/chosen": -5.473072052001953, "logps/rejected": -10.939675331115723, "loss": 0.3613, "rewards/accuracies": 1.0, "rewards/chosen": 1.4769455194473267, "rewards/margins": 0.831893265247345, "rewards/rejected": 0.6450522541999817, "step": 1889 }, { "epoch": 1.02, "learning_rate": 7.687487028931878e-08, "logits/chosen": -2.1418514251708984, "logits/rejected": -2.282183885574341, "logps/chosen": -0.4732443392276764, "logps/rejected": -0.44968950748443604, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9159446954727173, "rewards/margins": 0.011442601680755615, "rewards/rejected": 0.9045020937919617, "step": 1890 }, { "epoch": 1.02, "learning_rate": 7.685031340431682e-08, "logits/chosen": -2.1043450832366943, "logits/rejected": -2.1173126697540283, "logps/chosen": -8.159941673278809, "logps/rejected": -4.520603656768799, "loss": 0.4418, "rewards/accuracies": 1.0, "rewards/chosen": 1.2494717836380005, "rewards/margins": 0.5878304839134216, "rewards/rejected": 0.6616412997245789, "step": 1891 }, { "epoch": 1.02, "learning_rate": 7.682574741460322e-08, "logits/chosen": -2.1546337604522705, "logits/rejected": -2.256901502609253, "logps/chosen": -5.2321953773498535, "logps/rejected": -2.933194398880005, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": 0.7726675868034363, "rewards/margins": -0.03208744525909424, "rewards/rejected": 0.8047550320625305, "step": 1892 }, { "epoch": 1.02, "learning_rate": 7.680117232850815e-08, "logits/chosen": -2.1169376373291016, "logits/rejected": -2.2959232330322266, "logps/chosen": -2.052175283432007, "logps/rejected": -1.9832026958465576, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.952828049659729, "rewards/margins": 0.014777302742004395, "rewards/rejected": 0.9380507469177246, "step": 1893 }, { "epoch": 1.02, "learning_rate": 7.677658815436478e-08, "logits/chosen": -2.0282793045043945, "logits/rejected": -2.2982723712921143, "logps/chosen": -0.40046173334121704, "logps/rejected": -0.44791167974472046, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.8926367163658142, "rewards/margins": 0.017415344715118408, "rewards/rejected": 0.8752213716506958, "step": 1894 }, { "epoch": 1.02, "learning_rate": 7.675199490050938e-08, "logits/chosen": -2.043722152709961, "logits/rejected": -2.2315292358398438, "logps/chosen": -4.450891494750977, "logps/rejected": -4.7330241203308105, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.6340635418891907, "rewards/margins": 0.018162667751312256, "rewards/rejected": 0.6159008741378784, "step": 1895 }, { "epoch": 1.02, "learning_rate": 7.672739257528134e-08, "logits/chosen": -2.095249891281128, "logits/rejected": -2.033665895462036, "logps/chosen": -13.964828491210938, "logps/rejected": -20.147708892822266, "loss": 0.2791, "rewards/accuracies": 1.0, "rewards/chosen": 1.5711040496826172, "rewards/margins": 1.1334034204483032, "rewards/rejected": 0.43770065903663635, "step": 1896 }, { "epoch": 1.02, "learning_rate": 7.67027811870231e-08, "logits/chosen": -2.0147976875305176, "logits/rejected": -2.0219833850860596, "logps/chosen": -2.2258217334747314, "logps/rejected": -4.266792297363281, "loss": 0.4584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0866789817810059, "rewards/margins": 0.5420829653739929, "rewards/rejected": 0.5445960164070129, "step": 1897 }, { "epoch": 1.02, "learning_rate": 7.667816074408013e-08, "logits/chosen": -2.1730146408081055, "logits/rejected": -2.0640523433685303, "logps/chosen": -42.68394470214844, "logps/rejected": -2.0962274074554443, "loss": 0.2735, "rewards/accuracies": 1.0, "rewards/chosen": 1.9150055646896362, "rewards/margins": 1.1566500663757324, "rewards/rejected": 0.7583555579185486, "step": 1898 }, { "epoch": 1.02, "learning_rate": 7.665353125480106e-08, "logits/chosen": -2.197262763977051, "logits/rejected": -2.199174642562866, "logps/chosen": -0.44186869263648987, "logps/rejected": -3.2000558376312256, "loss": 0.4914, "rewards/accuracies": 1.0, "rewards/chosen": 1.0281728506088257, "rewards/margins": 0.4546854496002197, "rewards/rejected": 0.573487401008606, "step": 1899 }, { "epoch": 1.02, "learning_rate": 7.662889272753752e-08, "logits/chosen": -2.05487060546875, "logits/rejected": -2.0548911094665527, "logps/chosen": -4.791842937469482, "logps/rejected": -2.364525318145752, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 1.6901159286499023, "rewards/margins": 1.101860523223877, "rewards/rejected": 0.5882554650306702, "step": 1900 }, { "epoch": 1.03, "learning_rate": 7.660424517064421e-08, "logits/chosen": -2.1806273460388184, "logits/rejected": -2.2091104984283447, "logps/chosen": -9.431119918823242, "logps/rejected": -10.62124252319336, "loss": 0.6303, "rewards/accuracies": 1.0, "rewards/chosen": 1.1246923208236694, "rewards/margins": 0.129865825176239, "rewards/rejected": 0.9948264956474304, "step": 1901 }, { "epoch": 1.03, "learning_rate": 7.657958859247893e-08, "logits/chosen": -2.078970432281494, "logits/rejected": -2.3009839057922363, "logps/chosen": -0.703110933303833, "logps/rejected": -0.7274026870727539, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.9863283038139343, "rewards/margins": 0.004944503307342529, "rewards/rejected": 0.9813838005065918, "step": 1902 }, { "epoch": 1.03, "learning_rate": 7.655492300140251e-08, "logits/chosen": -1.994442105293274, "logits/rejected": -2.2655484676361084, "logps/chosen": -1.4393197298049927, "logps/rejected": -1.3296287059783936, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9776362776756287, "rewards/margins": 0.00017142295837402344, "rewards/rejected": 0.9774648547172546, "step": 1903 }, { "epoch": 1.03, "learning_rate": 7.653024840577883e-08, "logits/chosen": -2.1136550903320312, "logits/rejected": -2.117929458618164, "logps/chosen": -1.3144243955612183, "logps/rejected": -3.235710620880127, "loss": 0.5152, "rewards/accuracies": 1.0, "rewards/chosen": 1.029999017715454, "rewards/margins": 0.3944804072380066, "rewards/rejected": 0.6355186104774475, "step": 1904 }, { "epoch": 1.03, "learning_rate": 7.650556481397483e-08, "logits/chosen": -2.110762357711792, "logits/rejected": -2.008150577545166, "logps/chosen": -19.686161041259766, "logps/rejected": -4.683237552642822, "loss": 0.3941, "rewards/accuracies": 1.0, "rewards/chosen": 1.433284044265747, "rewards/margins": 0.7274988293647766, "rewards/rejected": 0.7057852149009705, "step": 1905 }, { "epoch": 1.03, "learning_rate": 7.648087223436051e-08, "logits/chosen": -2.022834062576294, "logits/rejected": -2.0189740657806396, "logps/chosen": -4.550625324249268, "logps/rejected": -2.750861167907715, "loss": 0.3726, "rewards/accuracies": 1.0, "rewards/chosen": 1.5234708786010742, "rewards/margins": 0.7952741384506226, "rewards/rejected": 0.7281967401504517, "step": 1906 }, { "epoch": 1.03, "learning_rate": 7.645617067530894e-08, "logits/chosen": -2.0079522132873535, "logits/rejected": -2.0489649772644043, "logps/chosen": -1.882352352142334, "logps/rejected": -15.521509170532227, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 0.6937259435653687, "rewards/margins": 0.05304217338562012, "rewards/rejected": 0.6406837701797485, "step": 1907 }, { "epoch": 1.03, "learning_rate": 7.643146014519617e-08, "logits/chosen": -2.206906795501709, "logits/rejected": -2.2033231258392334, "logps/chosen": -5.208906173706055, "logps/rejected": -2.4383583068847656, "loss": 0.4274, "rewards/accuracies": 1.0, "rewards/chosen": 1.2382296323776245, "rewards/margins": 0.6288371682167053, "rewards/rejected": 0.6093924641609192, "step": 1908 }, { "epoch": 1.03, "learning_rate": 7.640674065240136e-08, "logits/chosen": -2.0426816940307617, "logits/rejected": -2.2581329345703125, "logps/chosen": -1.7922791242599487, "logps/rejected": -1.7942326068878174, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 1.0121508836746216, "rewards/margins": 0.03750365972518921, "rewards/rejected": 0.9746472239494324, "step": 1909 }, { "epoch": 1.03, "learning_rate": 7.638201220530663e-08, "logits/chosen": -2.1237926483154297, "logits/rejected": -2.2157809734344482, "logps/chosen": -0.5742533802986145, "logps/rejected": -0.6331982016563416, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.889509379863739, "rewards/margins": 0.01683521270751953, "rewards/rejected": 0.8726741671562195, "step": 1910 }, { "epoch": 1.03, "learning_rate": 7.635727481229724e-08, "logits/chosen": -1.9678637981414795, "logits/rejected": -2.2280654907226562, "logps/chosen": -0.9743583798408508, "logps/rejected": -0.9226645231246948, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.8604418635368347, "rewards/margins": 0.03409230709075928, "rewards/rejected": 0.8263495564460754, "step": 1911 }, { "epoch": 1.03, "learning_rate": 7.633252848176139e-08, "logits/chosen": -1.9968907833099365, "logits/rejected": -2.2459557056427, "logps/chosen": -1.5158889293670654, "logps/rejected": -1.5796533823013306, "loss": 0.7, "rewards/accuracies": 0.0, "rewards/chosen": 0.9510048031806946, "rewards/margins": -0.013659119606018066, "rewards/rejected": 0.9646639227867126, "step": 1912 }, { "epoch": 1.03, "learning_rate": 7.630777322209039e-08, "logits/chosen": -2.0607471466064453, "logits/rejected": -2.064436435699463, "logps/chosen": -1.4205840826034546, "logps/rejected": -1.6528748273849487, "loss": 0.5083, "rewards/accuracies": 1.0, "rewards/chosen": 1.1528626680374146, "rewards/margins": 0.4118337035179138, "rewards/rejected": 0.7410289645195007, "step": 1913 }, { "epoch": 1.03, "learning_rate": 7.62830090416785e-08, "logits/chosen": -2.0260281562805176, "logits/rejected": -2.0284228324890137, "logps/chosen": -3.432957172393799, "logps/rejected": -0.6491807103157043, "loss": 0.546, "rewards/accuracies": 1.0, "rewards/chosen": 1.180430293083191, "rewards/margins": 0.3197988271713257, "rewards/rejected": 0.8606314659118652, "step": 1914 }, { "epoch": 1.03, "learning_rate": 7.625823594892304e-08, "logits/chosen": -2.1027283668518066, "logits/rejected": -2.1049530506134033, "logps/chosen": -1.4921770095825195, "logps/rejected": -1.5739837884902954, "loss": 0.5541, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407692193984985, "rewards/margins": 0.3007091283798218, "rewards/rejected": 0.7400600910186768, "step": 1915 }, { "epoch": 1.03, "learning_rate": 7.623345395222437e-08, "logits/chosen": -2.020169258117676, "logits/rejected": -2.0312702655792236, "logps/chosen": -8.181241989135742, "logps/rejected": -3.0887887477874756, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 1.5474776029586792, "rewards/margins": 0.7694000005722046, "rewards/rejected": 0.7780776023864746, "step": 1916 }, { "epoch": 1.03, "learning_rate": 7.620866305998585e-08, "logits/chosen": -2.100768804550171, "logits/rejected": -2.305832862854004, "logps/chosen": -0.9213442802429199, "logps/rejected": -0.9568418264389038, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.7976349592208862, "rewards/margins": 0.0035305023193359375, "rewards/rejected": 0.7941044569015503, "step": 1917 }, { "epoch": 1.03, "learning_rate": 7.618386328061388e-08, "logits/chosen": -1.973574161529541, "logits/rejected": -2.248544454574585, "logps/chosen": -0.691545307636261, "logps/rejected": -0.6198773384094238, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.8757293820381165, "rewards/margins": -0.006954014301300049, "rewards/rejected": 0.8826833963394165, "step": 1918 }, { "epoch": 1.04, "learning_rate": 7.615905462251781e-08, "logits/chosen": -2.0907955169677734, "logits/rejected": -2.1699934005737305, "logps/chosen": -1.8973777294158936, "logps/rejected": -19.171361923217773, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 1.2084331512451172, "rewards/margins": 0.43494778871536255, "rewards/rejected": 0.7734853625297546, "step": 1919 }, { "epoch": 1.04, "learning_rate": 7.613423709411007e-08, "logits/chosen": -2.1202142238616943, "logits/rejected": -2.2842884063720703, "logps/chosen": -4.940463066101074, "logps/rejected": -4.621153831481934, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.7123424410820007, "rewards/margins": -0.011379361152648926, "rewards/rejected": 0.7237218022346497, "step": 1920 }, { "epoch": 1.04, "learning_rate": 7.610941070380607e-08, "logits/chosen": -2.1523797512054443, "logits/rejected": -2.152289628982544, "logps/chosen": -2.979433536529541, "logps/rejected": -12.411436080932617, "loss": 0.5859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9318332672119141, "rewards/margins": 0.22745132446289062, "rewards/rejected": 0.7043819427490234, "step": 1921 }, { "epoch": 1.04, "learning_rate": 7.608457546002422e-08, "logits/chosen": -2.064500570297241, "logits/rejected": -2.066737413406372, "logps/chosen": -3.263619899749756, "logps/rejected": -2.5814807415008545, "loss": 0.6585, "rewards/accuracies": 1.0, "rewards/chosen": 0.9402494430541992, "rewards/margins": 0.07043510675430298, "rewards/rejected": 0.8698143362998962, "step": 1922 }, { "epoch": 1.04, "learning_rate": 7.605973137118596e-08, "logits/chosen": -2.1447501182556152, "logits/rejected": -2.1446316242218018, "logps/chosen": -3.834101676940918, "logps/rejected": -3.2221009731292725, "loss": 0.4705, "rewards/accuracies": 1.0, "rewards/chosen": 1.3363213539123535, "rewards/margins": 0.5096185803413391, "rewards/rejected": 0.8267027735710144, "step": 1923 }, { "epoch": 1.04, "learning_rate": 7.603487844571567e-08, "logits/chosen": -2.29679536819458, "logits/rejected": -2.343987464904785, "logps/chosen": -1.0108551979064941, "logps/rejected": -0.6694115996360779, "loss": 0.7072, "rewards/accuracies": 0.0, "rewards/chosen": 0.7429439425468445, "rewards/margins": -0.028004765510559082, "rewards/rejected": 0.7709487080574036, "step": 1924 }, { "epoch": 1.04, "learning_rate": 7.60100166920408e-08, "logits/chosen": -2.012601852416992, "logits/rejected": -2.0062358379364014, "logps/chosen": -6.237833023071289, "logps/rejected": -2.7469470500946045, "loss": 0.4469, "rewards/accuracies": 1.0, "rewards/chosen": 1.2683868408203125, "rewards/margins": 0.5737912058830261, "rewards/rejected": 0.6945956349372864, "step": 1925 }, { "epoch": 1.04, "learning_rate": 7.598514611859172e-08, "logits/chosen": -2.090935468673706, "logits/rejected": -2.10806941986084, "logps/chosen": -5.504558563232422, "logps/rejected": -3.120415687561035, "loss": 0.528, "rewards/accuracies": 1.0, "rewards/chosen": 1.155938982963562, "rewards/margins": 0.3630366921424866, "rewards/rejected": 0.7929022908210754, "step": 1926 }, { "epoch": 1.04, "learning_rate": 7.596026673380189e-08, "logits/chosen": -2.0646257400512695, "logits/rejected": -2.222801923751831, "logps/chosen": -0.8202499151229858, "logps/rejected": -0.8978661298751831, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 0.9591507911682129, "rewards/margins": 0.07576501369476318, "rewards/rejected": 0.8833857774734497, "step": 1927 }, { "epoch": 1.04, "learning_rate": 7.593537854610764e-08, "logits/chosen": -2.0669217109680176, "logits/rejected": -2.0583362579345703, "logps/chosen": -2.6550099849700928, "logps/rejected": -10.062285423278809, "loss": 0.3663, "rewards/accuracies": 1.0, "rewards/chosen": 0.9968487620353699, "rewards/margins": 0.8156827688217163, "rewards/rejected": 0.18116597831249237, "step": 1928 }, { "epoch": 1.04, "learning_rate": 7.591048156394835e-08, "logits/chosen": -1.9644843339920044, "logits/rejected": -1.973153829574585, "logps/chosen": -4.958485126495361, "logps/rejected": -3.876919746398926, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 1.6888017654418945, "rewards/margins": 1.1680238246917725, "rewards/rejected": 0.5207778811454773, "step": 1929 }, { "epoch": 1.04, "learning_rate": 7.58855757957664e-08, "logits/chosen": -2.0110931396484375, "logits/rejected": -2.250967264175415, "logps/chosen": -0.2662654519081116, "logps/rejected": -0.30399876832962036, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8556690216064453, "rewards/margins": 0.015555083751678467, "rewards/rejected": 0.8401139378547668, "step": 1930 }, { "epoch": 1.04, "learning_rate": 7.58606612500071e-08, "logits/chosen": -2.1480047702789307, "logits/rejected": -2.2271487712860107, "logps/chosen": -1.7195160388946533, "logps/rejected": -1.5820181369781494, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683896064758301, "rewards/margins": 0.011191189289093018, "rewards/rejected": 0.8571984171867371, "step": 1931 }, { "epoch": 1.04, "learning_rate": 7.583573793511877e-08, "logits/chosen": -2.1737163066864014, "logits/rejected": -2.251936912536621, "logps/chosen": -0.6506692171096802, "logps/rejected": -0.762786865234375, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8449718356132507, "rewards/margins": 0.010904014110565186, "rewards/rejected": 0.8340678215026855, "step": 1932 }, { "epoch": 1.04, "learning_rate": 7.58108058595527e-08, "logits/chosen": -2.128859519958496, "logits/rejected": -2.134843349456787, "logps/chosen": -1.1967345476150513, "logps/rejected": -2.157947301864624, "loss": 0.5184, "rewards/accuracies": 1.0, "rewards/chosen": 0.9237748384475708, "rewards/margins": 0.3866257667541504, "rewards/rejected": 0.5371490716934204, "step": 1933 }, { "epoch": 1.04, "learning_rate": 7.57858650317631e-08, "logits/chosen": -2.0246572494506836, "logits/rejected": -2.261693239212036, "logps/chosen": -0.5599737167358398, "logps/rejected": -0.6297075748443604, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.914016842842102, "rewards/margins": 0.027660369873046875, "rewards/rejected": 0.8863564729690552, "step": 1934 }, { "epoch": 1.04, "learning_rate": 7.576091546020724e-08, "logits/chosen": -1.951570987701416, "logits/rejected": -2.2735493183135986, "logps/chosen": -1.2644968032836914, "logps/rejected": -1.2546288967132568, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9230846762657166, "rewards/margins": 0.003032982349395752, "rewards/rejected": 0.9200516939163208, "step": 1935 }, { "epoch": 1.04, "learning_rate": 7.573595715334531e-08, "logits/chosen": -2.1290297508239746, "logits/rejected": -2.1302502155303955, "logps/chosen": -1.1795852184295654, "logps/rejected": -1.9996325969696045, "loss": 0.5591, "rewards/accuracies": 1.0, "rewards/chosen": 1.0033538341522217, "rewards/margins": 0.2889373302459717, "rewards/rejected": 0.71441650390625, "step": 1936 }, { "epoch": 1.04, "learning_rate": 7.571099011964042e-08, "logits/chosen": -2.1054797172546387, "logits/rejected": -2.275545120239258, "logps/chosen": -2.210131883621216, "logps/rejected": -2.0221753120422363, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.037697672843933, "rewards/margins": 0.0029480457305908203, "rewards/rejected": 1.0347496271133423, "step": 1937 }, { "epoch": 1.05, "learning_rate": 7.568601436755872e-08, "logits/chosen": -2.1261372566223145, "logits/rejected": -2.28678560256958, "logps/chosen": -3.77583384513855, "logps/rejected": -5.09642219543457, "loss": 0.7764, "rewards/accuracies": 0.0, "rewards/chosen": 1.0093796253204346, "rewards/margins": -0.16012072563171387, "rewards/rejected": 1.1695003509521484, "step": 1938 }, { "epoch": 1.05, "learning_rate": 7.566102990556921e-08, "logits/chosen": -2.025096893310547, "logits/rejected": -2.0332131385803223, "logps/chosen": -3.0814952850341797, "logps/rejected": -2.196831464767456, "loss": 0.4956, "rewards/accuracies": 1.0, "rewards/chosen": 0.9453515410423279, "rewards/margins": 0.4440305829048157, "rewards/rejected": 0.5013209581375122, "step": 1939 }, { "epoch": 1.05, "learning_rate": 7.563603674214399e-08, "logits/chosen": -2.040477752685547, "logits/rejected": -2.2218070030212402, "logps/chosen": -0.46676334738731384, "logps/rejected": -0.5496125221252441, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7948887944221497, "rewards/margins": 0.02350437641143799, "rewards/rejected": 0.7713844180107117, "step": 1940 }, { "epoch": 1.05, "learning_rate": 7.561103488575795e-08, "logits/chosen": -1.9936476945877075, "logits/rejected": -1.982053518295288, "logps/chosen": -11.977893829345703, "logps/rejected": -7.050112724304199, "loss": 0.2931, "rewards/accuracies": 1.0, "rewards/chosen": 1.9088459014892578, "rewards/margins": 1.0771427154541016, "rewards/rejected": 0.8317031264305115, "step": 1941 }, { "epoch": 1.05, "learning_rate": 7.558602434488906e-08, "logits/chosen": -2.0358080863952637, "logits/rejected": -2.022371768951416, "logps/chosen": -15.34600830078125, "logps/rejected": -4.416013240814209, "loss": 0.6093, "rewards/accuracies": 1.0, "rewards/chosen": 1.247563362121582, "rewards/margins": 0.1754361391067505, "rewards/rejected": 1.0721272230148315, "step": 1942 }, { "epoch": 1.05, "learning_rate": 7.556100512801818e-08, "logits/chosen": -2.112541675567627, "logits/rejected": -2.2382214069366455, "logps/chosen": -0.26377934217453003, "logps/rejected": -0.2664165198802948, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8183730244636536, "rewards/margins": 0.022719502449035645, "rewards/rejected": 0.7956535220146179, "step": 1943 }, { "epoch": 1.05, "learning_rate": 7.553597724362906e-08, "logits/chosen": -2.0040388107299805, "logits/rejected": -2.0177624225616455, "logps/chosen": -3.3005452156066895, "logps/rejected": -1.508297324180603, "loss": 0.578, "rewards/accuracies": 1.0, "rewards/chosen": 0.8930471539497375, "rewards/margins": 0.24522262811660767, "rewards/rejected": 0.6478245258331299, "step": 1944 }, { "epoch": 1.05, "learning_rate": 7.551094070020848e-08, "logits/chosen": -2.1262786388397217, "logits/rejected": -2.0471832752227783, "logps/chosen": -26.4785099029541, "logps/rejected": -4.927328109741211, "loss": 0.5001, "rewards/accuracies": 1.0, "rewards/chosen": 1.2176374197006226, "rewards/margins": 0.43249011039733887, "rewards/rejected": 0.7851473093032837, "step": 1945 }, { "epoch": 1.05, "learning_rate": 7.548589550624614e-08, "logits/chosen": -2.0562944412231445, "logits/rejected": -2.2311508655548096, "logps/chosen": -0.5091648101806641, "logps/rejected": -0.49413537979125977, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.858015239238739, "rewards/margins": 0.01854705810546875, "rewards/rejected": 0.8394681811332703, "step": 1946 }, { "epoch": 1.05, "learning_rate": 7.546084167023461e-08, "logits/chosen": -1.9535456895828247, "logits/rejected": -2.2271084785461426, "logps/chosen": -0.9511069655418396, "logps/rejected": -1.022943377494812, "loss": 0.7023, "rewards/accuracies": 0.0, "rewards/chosen": 1.0284005403518677, "rewards/margins": -0.01826024055480957, "rewards/rejected": 1.0466607809066772, "step": 1947 }, { "epoch": 1.05, "learning_rate": 7.543577920066943e-08, "logits/chosen": -2.0360634326934814, "logits/rejected": -2.277454376220703, "logps/chosen": -0.5405784845352173, "logps/rejected": -0.6053157448768616, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.9655560851097107, "rewards/margins": 0.038067758083343506, "rewards/rejected": 0.9274883270263672, "step": 1948 }, { "epoch": 1.05, "learning_rate": 7.54107081060491e-08, "logits/chosen": -1.963657021522522, "logits/rejected": -2.240248918533325, "logps/chosen": -0.5035548210144043, "logps/rejected": -0.532913327217102, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.7666650414466858, "rewards/margins": 8.90493392944336e-05, "rewards/rejected": 0.7665759921073914, "step": 1949 }, { "epoch": 1.05, "learning_rate": 7.5385628394875e-08, "logits/chosen": -2.1782772541046143, "logits/rejected": -2.293623685836792, "logps/chosen": -3.9999122619628906, "logps/rejected": -1.6378329992294312, "loss": 0.7722, "rewards/accuracies": 0.0, "rewards/chosen": 0.6950921416282654, "rewards/margins": -0.15225094556808472, "rewards/rejected": 0.8473430871963501, "step": 1950 }, { "epoch": 1.05, "learning_rate": 7.536054007565145e-08, "logits/chosen": -2.095769166946411, "logits/rejected": -2.100939989089966, "logps/chosen": -3.767080307006836, "logps/rejected": -6.084746837615967, "loss": 0.5447, "rewards/accuracies": 1.0, "rewards/chosen": 0.9860813021659851, "rewards/margins": 0.3227481245994568, "rewards/rejected": 0.6633331775665283, "step": 1951 }, { "epoch": 1.05, "learning_rate": 7.533544315688567e-08, "logits/chosen": -2.186542272567749, "logits/rejected": -2.251560926437378, "logps/chosen": -3.7491796016693115, "logps/rejected": -12.235526084899902, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 1.3067892789840698, "rewards/margins": 0.4886411428451538, "rewards/rejected": 0.818148136138916, "step": 1952 }, { "epoch": 1.05, "learning_rate": 7.531033764708782e-08, "logits/chosen": -2.0897860527038574, "logits/rejected": -2.274946689605713, "logps/chosen": -0.5489855408668518, "logps/rejected": -0.5754696726799011, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9150186777114868, "rewards/margins": 0.019168198108673096, "rewards/rejected": 0.8958504796028137, "step": 1953 }, { "epoch": 1.05, "learning_rate": 7.528522355477096e-08, "logits/chosen": -2.1091020107269287, "logits/rejected": -2.047043561935425, "logps/chosen": -29.57867431640625, "logps/rejected": -2.0036959648132324, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 1.9066200256347656, "rewards/margins": 1.093458652496338, "rewards/rejected": 0.813161313533783, "step": 1954 }, { "epoch": 1.05, "learning_rate": 7.526010088845108e-08, "logits/chosen": -2.1157610416412354, "logits/rejected": -2.1117541790008545, "logps/chosen": -4.66747522354126, "logps/rejected": -4.4421281814575195, "loss": 0.4314, "rewards/accuracies": 1.0, "rewards/chosen": 1.0943775177001953, "rewards/margins": 0.6172330379486084, "rewards/rejected": 0.4771444499492645, "step": 1955 }, { "epoch": 1.06, "learning_rate": 7.523496965664704e-08, "logits/chosen": -2.0366735458374023, "logits/rejected": -2.0440568923950195, "logps/chosen": -4.431623935699463, "logps/rejected": -9.086586952209473, "loss": 0.4522, "rewards/accuracies": 1.0, "rewards/chosen": 1.1273256540298462, "rewards/margins": 0.5590259432792664, "rewards/rejected": 0.5682997107505798, "step": 1956 }, { "epoch": 1.06, "learning_rate": 7.520982986788064e-08, "logits/chosen": -2.0442097187042236, "logits/rejected": -2.2339706420898438, "logps/chosen": -7.469279766082764, "logps/rejected": -5.121928691864014, "loss": 0.7461, "rewards/accuracies": 0.0, "rewards/chosen": 0.972943902015686, "rewards/margins": -0.10328388214111328, "rewards/rejected": 1.0762277841567993, "step": 1957 }, { "epoch": 1.06, "learning_rate": 7.518468153067655e-08, "logits/chosen": -2.119067430496216, "logits/rejected": -2.279453992843628, "logps/chosen": -0.6823633313179016, "logps/rejected": -0.7393882870674133, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9286940693855286, "rewards/margins": 0.010191738605499268, "rewards/rejected": 0.9185023307800293, "step": 1958 }, { "epoch": 1.06, "learning_rate": 7.515952465356236e-08, "logits/chosen": -2.097736120223999, "logits/rejected": -2.2955052852630615, "logps/chosen": -4.334872245788574, "logps/rejected": -4.029955863952637, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9191007018089294, "rewards/margins": 0.0033295154571533203, "rewards/rejected": 0.9157711863517761, "step": 1959 }, { "epoch": 1.06, "learning_rate": 7.51343592450686e-08, "logits/chosen": -2.0122153759002686, "logits/rejected": -2.2233901023864746, "logps/chosen": -0.352781742811203, "logps/rejected": -0.38000819087028503, "loss": 0.6689, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147945880889893, "rewards/margins": 0.04916137456893921, "rewards/rejected": 0.96563321352005, "step": 1960 }, { "epoch": 1.06, "learning_rate": 7.510918531372856e-08, "logits/chosen": -2.1663355827331543, "logits/rejected": -2.1550633907318115, "logps/chosen": -2.982645034790039, "logps/rejected": -3.2372374534606934, "loss": 0.4308, "rewards/accuracies": 1.0, "rewards/chosen": 1.168096899986267, "rewards/margins": 0.6190416216850281, "rewards/rejected": 0.549055278301239, "step": 1961 }, { "epoch": 1.06, "learning_rate": 7.508400286807857e-08, "logits/chosen": -2.074052333831787, "logits/rejected": -2.075350522994995, "logps/chosen": -2.752453565597534, "logps/rejected": -3.92045259475708, "loss": 0.3171, "rewards/accuracies": 1.0, "rewards/chosen": 1.5163697004318237, "rewards/margins": 0.9857441186904907, "rewards/rejected": 0.530625581741333, "step": 1962 }, { "epoch": 1.06, "learning_rate": 7.505881191665775e-08, "logits/chosen": -2.1548945903778076, "logits/rejected": -2.146282434463501, "logps/chosen": -7.35724401473999, "logps/rejected": -3.1618154048919678, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 1.2029527425765991, "rewards/margins": 0.5028699636459351, "rewards/rejected": 0.7000827789306641, "step": 1963 }, { "epoch": 1.06, "learning_rate": 7.503361246800815e-08, "logits/chosen": -2.0372161865234375, "logits/rejected": -2.0425453186035156, "logps/chosen": -0.9348081350326538, "logps/rejected": -1.9905132055282593, "loss": 0.5169, "rewards/accuracies": 1.0, "rewards/chosen": 1.0423635244369507, "rewards/margins": 0.3904528021812439, "rewards/rejected": 0.6519107222557068, "step": 1964 }, { "epoch": 1.06, "learning_rate": 7.500840453067469e-08, "logits/chosen": -2.0557830333709717, "logits/rejected": -2.0605199337005615, "logps/chosen": -0.6424248218536377, "logps/rejected": -11.789087295532227, "loss": 0.5288, "rewards/accuracies": 1.0, "rewards/chosen": 1.0269017219543457, "rewards/margins": 0.36108118295669556, "rewards/rejected": 0.6658205389976501, "step": 1965 }, { "epoch": 1.06, "learning_rate": 7.498318811320517e-08, "logits/chosen": -2.0535240173339844, "logits/rejected": -2.2445461750030518, "logps/chosen": -0.7195175290107727, "logps/rejected": -0.750980794429779, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.9187620282173157, "rewards/margins": -0.006007671356201172, "rewards/rejected": 0.9247696995735168, "step": 1966 }, { "epoch": 1.06, "learning_rate": 7.495796322415025e-08, "logits/chosen": -2.0111424922943115, "logits/rejected": -2.2798376083374023, "logps/chosen": -0.7454683184623718, "logps/rejected": -0.8247107267379761, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.996891438961029, "rewards/margins": -0.0022077560424804688, "rewards/rejected": 0.9990991950035095, "step": 1967 }, { "epoch": 1.06, "learning_rate": 7.493272987206346e-08, "logits/chosen": -2.0281286239624023, "logits/rejected": -2.2633886337280273, "logps/chosen": -2.194829225540161, "logps/rejected": -6.544373512268066, "loss": 0.5934, "rewards/accuracies": 1.0, "rewards/chosen": 1.1826469898223877, "rewards/margins": 0.21049034595489502, "rewards/rejected": 0.9721566438674927, "step": 1968 }, { "epoch": 1.06, "learning_rate": 7.490748806550122e-08, "logits/chosen": -2.09242844581604, "logits/rejected": -2.1400671005249023, "logps/chosen": -8.530990600585938, "logps/rejected": -8.658797264099121, "loss": 0.4493, "rewards/accuracies": 1.0, "rewards/chosen": 1.3446712493896484, "rewards/margins": 0.5669182538986206, "rewards/rejected": 0.7777529954910278, "step": 1969 }, { "epoch": 1.06, "learning_rate": 7.488223781302284e-08, "logits/chosen": -2.0904769897460938, "logits/rejected": -2.2809343338012695, "logps/chosen": -2.4188246726989746, "logps/rejected": -2.474402904510498, "loss": 0.6675, "rewards/accuracies": 1.0, "rewards/chosen": 0.9767711758613586, "rewards/margins": 0.05205804109573364, "rewards/rejected": 0.924713134765625, "step": 1970 }, { "epoch": 1.06, "learning_rate": 7.485697912319044e-08, "logits/chosen": -2.1378612518310547, "logits/rejected": -2.054969072341919, "logps/chosen": -20.134197235107422, "logps/rejected": -2.029189109802246, "loss": 0.3609, "rewards/accuracies": 1.0, "rewards/chosen": 1.6294727325439453, "rewards/margins": 0.8332289457321167, "rewards/rejected": 0.7962437868118286, "step": 1971 }, { "epoch": 1.06, "learning_rate": 7.483171200456905e-08, "logits/chosen": -2.0880813598632812, "logits/rejected": -2.281297206878662, "logps/chosen": -8.380132675170898, "logps/rejected": -8.172216415405273, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 1.2084741592407227, "rewards/margins": 0.02691173553466797, "rewards/rejected": 1.1815624237060547, "step": 1972 }, { "epoch": 1.06, "learning_rate": 7.480643646572649e-08, "logits/chosen": -2.1242144107818604, "logits/rejected": -2.0638010501861572, "logps/chosen": -31.818262100219727, "logps/rejected": -7.631265163421631, "loss": 0.2982, "rewards/accuracies": 1.0, "rewards/chosen": 1.992944598197937, "rewards/margins": 1.057258129119873, "rewards/rejected": 0.9356865286827087, "step": 1973 }, { "epoch": 1.06, "learning_rate": 7.47811525152335e-08, "logits/chosen": -1.994365930557251, "logits/rejected": -2.3063127994537354, "logps/chosen": -0.8398900032043457, "logps/rejected": -0.8456758260726929, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8711299896240234, "rewards/margins": 0.02253180742263794, "rewards/rejected": 0.8485981822013855, "step": 1974 }, { "epoch": 1.07, "learning_rate": 7.475586016166367e-08, "logits/chosen": -2.0234601497650146, "logits/rejected": -2.0689303874969482, "logps/chosen": -6.231226921081543, "logps/rejected": -18.088075637817383, "loss": 0.5224, "rewards/accuracies": 1.0, "rewards/chosen": 1.0735926628112793, "rewards/margins": 0.3767344355583191, "rewards/rejected": 0.6968582272529602, "step": 1975 }, { "epoch": 1.07, "learning_rate": 7.473055941359341e-08, "logits/chosen": -2.116899251937866, "logits/rejected": -2.112739086151123, "logps/chosen": -6.410874366760254, "logps/rejected": -4.375740051269531, "loss": 0.4258, "rewards/accuracies": 1.0, "rewards/chosen": 1.1462892293930054, "rewards/margins": 0.6333431005477905, "rewards/rejected": 0.5129461288452148, "step": 1976 }, { "epoch": 1.07, "learning_rate": 7.470525027960198e-08, "logits/chosen": -2.102442979812622, "logits/rejected": -2.1043498516082764, "logps/chosen": -0.7490432858467102, "logps/rejected": -8.669705390930176, "loss": 0.4018, "rewards/accuracies": 1.0, "rewards/chosen": 1.1767362356185913, "rewards/margins": 0.7041168808937073, "rewards/rejected": 0.47261935472488403, "step": 1977 }, { "epoch": 1.07, "learning_rate": 7.467993276827147e-08, "logits/chosen": -2.1356492042541504, "logits/rejected": -2.2548792362213135, "logps/chosen": -0.661889910697937, "logps/rejected": -0.7204586267471313, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.7649664878845215, "rewards/margins": 0.011960446834564209, "rewards/rejected": 0.7530060410499573, "step": 1978 }, { "epoch": 1.07, "learning_rate": 7.465460688818688e-08, "logits/chosen": -1.9610131978988647, "logits/rejected": -1.9756324291229248, "logps/chosen": -5.047074317932129, "logps/rejected": -7.7657880783081055, "loss": 0.5217, "rewards/accuracies": 1.0, "rewards/chosen": 1.048097014427185, "rewards/margins": 0.378490149974823, "rewards/rejected": 0.6696068644523621, "step": 1979 }, { "epoch": 1.07, "learning_rate": 7.462927264793597e-08, "logits/chosen": -1.9803720712661743, "logits/rejected": -1.9891597032546997, "logps/chosen": -1.9332587718963623, "logps/rejected": -2.7061095237731934, "loss": 0.5249, "rewards/accuracies": 1.0, "rewards/chosen": 1.0107520818710327, "rewards/margins": 0.37074530124664307, "rewards/rejected": 0.6400067806243896, "step": 1980 }, { "epoch": 1.07, "learning_rate": 7.460393005610938e-08, "logits/chosen": -2.1901798248291016, "logits/rejected": -2.170978307723999, "logps/chosen": -18.31801414489746, "logps/rejected": -1.732358455657959, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 1.2281134128570557, "rewards/margins": 0.5605478286743164, "rewards/rejected": 0.6675655841827393, "step": 1981 }, { "epoch": 1.07, "learning_rate": 7.457857912130053e-08, "logits/chosen": -2.1107563972473145, "logits/rejected": -2.363013744354248, "logps/chosen": -0.7832522988319397, "logps/rejected": -0.8280133605003357, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9875907897949219, "rewards/margins": 0.015548408031463623, "rewards/rejected": 0.9720423817634583, "step": 1982 }, { "epoch": 1.07, "learning_rate": 7.455321985210574e-08, "logits/chosen": -2.1212425231933594, "logits/rejected": -2.1321704387664795, "logps/chosen": -5.100315093994141, "logps/rejected": -3.2951061725616455, "loss": 0.4339, "rewards/accuracies": 1.0, "rewards/chosen": 1.2445564270019531, "rewards/margins": 0.6100404858589172, "rewards/rejected": 0.6345159411430359, "step": 1983 }, { "epoch": 1.07, "learning_rate": 7.452785225712413e-08, "logits/chosen": -2.059931755065918, "logits/rejected": -2.08596134185791, "logps/chosen": -2.574021577835083, "logps/rejected": -8.146810531616211, "loss": 0.3312, "rewards/accuracies": 1.0, "rewards/chosen": 1.6237772703170776, "rewards/margins": 0.9349238276481628, "rewards/rejected": 0.6888534426689148, "step": 1984 }, { "epoch": 1.07, "learning_rate": 7.45024763449576e-08, "logits/chosen": -2.175558567047119, "logits/rejected": -2.0409417152404785, "logps/chosen": -35.7530517578125, "logps/rejected": -5.171647071838379, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/chosen": 1.6266463994979858, "rewards/margins": 1.1390905380249023, "rewards/rejected": 0.4875558018684387, "step": 1985 }, { "epoch": 1.07, "learning_rate": 7.447709212421091e-08, "logits/chosen": -2.119338274002075, "logits/rejected": -2.3297085762023926, "logps/chosen": -0.10558248311281204, "logps/rejected": -0.10426563024520874, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.7623724937438965, "rewards/margins": 0.014921009540557861, "rewards/rejected": 0.7474514842033386, "step": 1986 }, { "epoch": 1.07, "learning_rate": 7.445169960349166e-08, "logits/chosen": -2.0719494819641113, "logits/rejected": -2.073328733444214, "logps/chosen": -2.732645034790039, "logps/rejected": -0.8138338923454285, "loss": 0.5815, "rewards/accuracies": 1.0, "rewards/chosen": 1.287908911705017, "rewards/margins": 0.23728585243225098, "rewards/rejected": 1.0506230592727661, "step": 1987 }, { "epoch": 1.07, "learning_rate": 7.442629879141022e-08, "logits/chosen": -2.129828929901123, "logits/rejected": -2.2947564125061035, "logps/chosen": -3.718773365020752, "logps/rejected": -0.5106249451637268, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.9527244567871094, "rewards/margins": -0.025393784046173096, "rewards/rejected": 0.9781182408332825, "step": 1988 }, { "epoch": 1.07, "learning_rate": 7.440088969657977e-08, "logits/chosen": -2.0866453647613525, "logits/rejected": -2.2458975315093994, "logps/chosen": -0.7100837826728821, "logps/rejected": -0.8692876696586609, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 1.019234299659729, "rewards/margins": 0.013302326202392578, "rewards/rejected": 1.0059319734573364, "step": 1989 }, { "epoch": 1.07, "learning_rate": 7.437547232761636e-08, "logits/chosen": -2.142606019973755, "logits/rejected": -2.1358911991119385, "logps/chosen": -4.442751884460449, "logps/rejected": -3.9625911712646484, "loss": 0.6419, "rewards/accuracies": 1.0, "rewards/chosen": 0.7831469774246216, "rewards/margins": 0.10529786348342896, "rewards/rejected": 0.6778491139411926, "step": 1990 }, { "epoch": 1.07, "learning_rate": 7.435004669313876e-08, "logits/chosen": -2.2248482704162598, "logits/rejected": -2.224069118499756, "logps/chosen": -1.608893632888794, "logps/rejected": -1.3471094369888306, "loss": 0.5436, "rewards/accuracies": 1.0, "rewards/chosen": 1.0093787908554077, "rewards/margins": 0.32553768157958984, "rewards/rejected": 0.6838411092758179, "step": 1991 }, { "epoch": 1.07, "learning_rate": 7.43246128017686e-08, "logits/chosen": -1.9852319955825806, "logits/rejected": -2.2772390842437744, "logps/chosen": -0.8020651936531067, "logps/rejected": -0.8444555401802063, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.8895557522773743, "rewards/margins": -0.004880428314208984, "rewards/rejected": 0.8944361805915833, "step": 1992 }, { "epoch": 1.07, "learning_rate": 7.429917066213029e-08, "logits/chosen": -2.2369940280914307, "logits/rejected": -2.248600482940674, "logps/chosen": -5.461402416229248, "logps/rejected": -5.2305216789245605, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 1.087669014930725, "rewards/margins": 0.5852522253990173, "rewards/rejected": 0.5024167895317078, "step": 1993 }, { "epoch": 1.08, "learning_rate": 7.427372028285107e-08, "logits/chosen": -2.1917474269866943, "logits/rejected": -2.0677974224090576, "logps/chosen": -35.71900177001953, "logps/rejected": -4.180452823638916, "loss": 0.2956, "rewards/accuracies": 1.0, "rewards/chosen": 1.7053536176681519, "rewards/margins": 1.0671173334121704, "rewards/rejected": 0.6382362842559814, "step": 1994 }, { "epoch": 1.08, "learning_rate": 7.424826167256091e-08, "logits/chosen": -2.12605357170105, "logits/rejected": -2.272948741912842, "logps/chosen": -0.6031711101531982, "logps/rejected": -0.6073572635650635, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.970789909362793, "rewards/margins": -0.0027605295181274414, "rewards/rejected": 0.9735504388809204, "step": 1995 }, { "epoch": 1.08, "learning_rate": 7.422279483989264e-08, "logits/chosen": -2.049669027328491, "logits/rejected": -2.0526998043060303, "logps/chosen": -5.3689117431640625, "logps/rejected": -3.455263614654541, "loss": 0.5788, "rewards/accuracies": 1.0, "rewards/chosen": 0.9130704998970032, "rewards/margins": 0.2435915470123291, "rewards/rejected": 0.6694789528846741, "step": 1996 }, { "epoch": 1.08, "learning_rate": 7.419731979348181e-08, "logits/chosen": -2.0460562705993652, "logits/rejected": -2.3229546546936035, "logps/chosen": -0.44357752799987793, "logps/rejected": -0.40967658162117004, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9836220145225525, "rewards/margins": 0.0160064697265625, "rewards/rejected": 0.96761554479599, "step": 1997 }, { "epoch": 1.08, "learning_rate": 7.41718365419668e-08, "logits/chosen": -1.9575964212417603, "logits/rejected": -2.2679293155670166, "logps/chosen": -0.2853066325187683, "logps/rejected": -0.3255009651184082, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9026936888694763, "rewards/margins": 0.03060007095336914, "rewards/rejected": 0.8720936179161072, "step": 1998 }, { "epoch": 1.08, "learning_rate": 7.414634509398879e-08, "logits/chosen": -2.3150031566619873, "logits/rejected": -2.1998093128204346, "logps/chosen": -34.243160247802734, "logps/rejected": -2.6348536014556885, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": 1.7287609577178955, "rewards/margins": 0.948893129825592, "rewards/rejected": 0.7798678278923035, "step": 1999 }, { "epoch": 1.08, "learning_rate": 7.412084545819169e-08, "logits/chosen": -1.9579589366912842, "logits/rejected": -1.9585065841674805, "logps/chosen": -4.95710563659668, "logps/rejected": -0.3795401155948639, "loss": 0.5468, "rewards/accuracies": 1.0, "rewards/chosen": 1.2552721500396729, "rewards/margins": 0.3179050087928772, "rewards/rejected": 0.9373671412467957, "step": 2000 }, { "epoch": 1.08, "learning_rate": 7.40953376432222e-08, "logits/chosen": -2.0326449871063232, "logits/rejected": -2.266660213470459, "logps/chosen": -1.3271777629852295, "logps/rejected": -1.2356716394424438, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.7735379338264465, "rewards/margins": 0.012749254703521729, "rewards/rejected": 0.7607886791229248, "step": 2001 }, { "epoch": 1.08, "learning_rate": 7.40698216577298e-08, "logits/chosen": -2.1019020080566406, "logits/rejected": -2.248964786529541, "logps/chosen": -3.6583831310272217, "logps/rejected": -4.488616466522217, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.7315287590026855, "rewards/margins": 0.001993238925933838, "rewards/rejected": 0.7295355200767517, "step": 2002 }, { "epoch": 1.08, "learning_rate": 7.404429751036679e-08, "logits/chosen": -2.094374418258667, "logits/rejected": -2.0919229984283447, "logps/chosen": -3.0974419116973877, "logps/rejected": -4.730203628540039, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": 1.5823811292648315, "rewards/margins": 1.0532746315002441, "rewards/rejected": 0.5291065573692322, "step": 2003 }, { "epoch": 1.08, "learning_rate": 7.401876520978812e-08, "logits/chosen": -2.1761019229888916, "logits/rejected": -2.1618878841400146, "logps/chosen": -2.793205499649048, "logps/rejected": -8.709476470947266, "loss": 0.388, "rewards/accuracies": 1.0, "rewards/chosen": 1.2470014095306396, "rewards/margins": 0.7464022040367126, "rewards/rejected": 0.500599205493927, "step": 2004 }, { "epoch": 1.08, "learning_rate": 7.399322476465165e-08, "logits/chosen": -1.9734827280044556, "logits/rejected": -2.3083362579345703, "logps/chosen": -0.7232762575149536, "logps/rejected": -0.7573211789131165, "loss": 0.7005, "rewards/accuracies": 0.0, "rewards/chosen": 0.7770182490348816, "rewards/margins": -0.014680624008178711, "rewards/rejected": 0.7916988730430603, "step": 2005 }, { "epoch": 1.08, "learning_rate": 7.396767618361784e-08, "logits/chosen": -2.1064200401306152, "logits/rejected": -2.0912487506866455, "logps/chosen": -16.08201026916504, "logps/rejected": -5.800453186035156, "loss": 0.335, "rewards/accuracies": 1.0, "rewards/chosen": 1.3135515451431274, "rewards/margins": 0.9212886095046997, "rewards/rejected": 0.39226293563842773, "step": 2006 }, { "epoch": 1.08, "learning_rate": 7.394211947535006e-08, "logits/chosen": -2.1422386169433594, "logits/rejected": -2.147213935852051, "logps/chosen": -2.0432443618774414, "logps/rejected": -3.081346273422241, "loss": 0.5401, "rewards/accuracies": 1.0, "rewards/chosen": 0.9814931154251099, "rewards/margins": 0.33373576402664185, "rewards/rejected": 0.647757351398468, "step": 2007 }, { "epoch": 1.08, "learning_rate": 7.391655464851435e-08, "logits/chosen": -2.2232823371887207, "logits/rejected": -2.2198166847229004, "logps/chosen": -7.832936763763428, "logps/rejected": -4.753906726837158, "loss": 0.305, "rewards/accuracies": 1.0, "rewards/chosen": 1.4288305044174194, "rewards/margins": 1.0310518741607666, "rewards/rejected": 0.39777857065200806, "step": 2008 }, { "epoch": 1.08, "learning_rate": 7.389098171177953e-08, "logits/chosen": -2.0553886890411377, "logits/rejected": -2.231637477874756, "logps/chosen": -0.350716769695282, "logps/rejected": -0.3543064296245575, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8634056448936462, "rewards/margins": 0.0211370587348938, "rewards/rejected": 0.8422685861587524, "step": 2009 }, { "epoch": 1.08, "learning_rate": 7.386540067381717e-08, "logits/chosen": -1.9813874959945679, "logits/rejected": -1.9788947105407715, "logps/chosen": -0.4459713101387024, "logps/rejected": -3.9250898361206055, "loss": 0.4786, "rewards/accuracies": 1.0, "rewards/chosen": 1.0356018543243408, "rewards/margins": 0.4880850911140442, "rewards/rejected": 0.5475167632102966, "step": 2010 }, { "epoch": 1.08, "learning_rate": 7.383981154330155e-08, "logits/chosen": -1.9867373704910278, "logits/rejected": -1.9869450330734253, "logps/chosen": -2.327308416366577, "logps/rejected": -1.1754523515701294, "loss": 0.7185, "rewards/accuracies": 0.0, "rewards/chosen": 0.9798634648323059, "rewards/margins": -0.050111472606658936, "rewards/rejected": 1.0299749374389648, "step": 2011 }, { "epoch": 1.09, "learning_rate": 7.381421432890975e-08, "logits/chosen": -2.1410772800445557, "logits/rejected": -2.2604801654815674, "logps/chosen": -1.5430268049240112, "logps/rejected": -1.5672332048416138, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.6477437019348145, "rewards/margins": 0.006055891513824463, "rewards/rejected": 0.64168781042099, "step": 2012 }, { "epoch": 1.09, "learning_rate": 7.378860903932158e-08, "logits/chosen": -2.0769543647766113, "logits/rejected": -2.0770506858825684, "logps/chosen": -2.608234167098999, "logps/rejected": -1.5838077068328857, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.9910788536071777, "rewards/margins": 0.054951250553131104, "rewards/rejected": 0.9361276030540466, "step": 2013 }, { "epoch": 1.09, "learning_rate": 7.376299568321956e-08, "logits/chosen": -1.9257882833480835, "logits/rejected": -1.9417046308517456, "logps/chosen": -3.4876322746276855, "logps/rejected": -5.834545612335205, "loss": 0.5798, "rewards/accuracies": 1.0, "rewards/chosen": 1.0207083225250244, "rewards/margins": 0.2411295771598816, "rewards/rejected": 0.7795787453651428, "step": 2014 }, { "epoch": 1.09, "learning_rate": 7.373737426928894e-08, "logits/chosen": -1.966245412826538, "logits/rejected": -1.9666342735290527, "logps/chosen": -3.662574529647827, "logps/rejected": -1.6567466259002686, "loss": 0.5025, "rewards/accuracies": 1.0, "rewards/chosen": 1.2571362257003784, "rewards/margins": 0.4265216588973999, "rewards/rejected": 0.8306145668029785, "step": 2015 }, { "epoch": 1.09, "learning_rate": 7.371174480621774e-08, "logits/chosen": -2.0196657180786133, "logits/rejected": -2.2420992851257324, "logps/chosen": -0.5406943559646606, "logps/rejected": -0.5094560980796814, "loss": 0.7003, "rewards/accuracies": 0.0, "rewards/chosen": 0.933586597442627, "rewards/margins": -0.014348506927490234, "rewards/rejected": 0.9479351043701172, "step": 2016 }, { "epoch": 1.09, "learning_rate": 7.36861073026967e-08, "logits/chosen": -2.0100886821746826, "logits/rejected": -2.1946656703948975, "logps/chosen": -6.410201072692871, "logps/rejected": -1.4391591548919678, "loss": 0.8393, "rewards/accuracies": 0.0, "rewards/chosen": 0.6137532591819763, "rewards/margins": -0.27362334728240967, "rewards/rejected": 0.887376606464386, "step": 2017 }, { "epoch": 1.09, "learning_rate": 7.366046176741928e-08, "logits/chosen": -2.0215206146240234, "logits/rejected": -2.277423858642578, "logps/chosen": -1.6419838666915894, "logps/rejected": -1.4092521667480469, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 1.0648804903030396, "rewards/margins": -0.0013821125030517578, "rewards/rejected": 1.0662626028060913, "step": 2018 }, { "epoch": 1.09, "learning_rate": 7.363480820908164e-08, "logits/chosen": -2.0988552570343018, "logits/rejected": -2.312936544418335, "logps/chosen": -1.1027443408966064, "logps/rejected": -6.3757171630859375, "loss": 0.5531, "rewards/accuracies": 1.0, "rewards/chosen": 0.9554551243782043, "rewards/margins": 0.3029627799987793, "rewards/rejected": 0.652492344379425, "step": 2019 }, { "epoch": 1.09, "learning_rate": 7.36091466363827e-08, "logits/chosen": -2.138259172439575, "logits/rejected": -2.2698557376861572, "logps/chosen": -2.736043930053711, "logps/rejected": -0.6874745488166809, "loss": 0.7148, "rewards/accuracies": 0.0, "rewards/chosen": 0.8439957499504089, "rewards/margins": -0.04280644655227661, "rewards/rejected": 0.8868021965026855, "step": 2020 }, { "epoch": 1.09, "learning_rate": 7.358347705802407e-08, "logits/chosen": -2.006340742111206, "logits/rejected": -2.014115333557129, "logps/chosen": -1.7240217924118042, "logps/rejected": -5.271310806274414, "loss": 0.3884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0956741571426392, "rewards/margins": 0.7453311085700989, "rewards/rejected": 0.3503430485725403, "step": 2021 }, { "epoch": 1.09, "learning_rate": 7.355779948271008e-08, "logits/chosen": -2.044607162475586, "logits/rejected": -2.046792507171631, "logps/chosen": -5.280122756958008, "logps/rejected": -0.6310310959815979, "loss": 0.5606, "rewards/accuracies": 1.0, "rewards/chosen": 1.218286395072937, "rewards/margins": 0.28544992208480835, "rewards/rejected": 0.9328364729881287, "step": 2022 }, { "epoch": 1.09, "learning_rate": 7.35321139191478e-08, "logits/chosen": -2.0290069580078125, "logits/rejected": -2.03780198097229, "logps/chosen": -2.5021259784698486, "logps/rejected": -1.8246644735336304, "loss": 0.3462, "rewards/accuracies": 1.0, "rewards/chosen": 1.5832663774490356, "rewards/margins": 0.882510244846344, "rewards/rejected": 0.7007561326026917, "step": 2023 }, { "epoch": 1.09, "learning_rate": 7.350642037604697e-08, "logits/chosen": -2.1470329761505127, "logits/rejected": -2.1513452529907227, "logps/chosen": -0.583460807800293, "logps/rejected": -6.695390701293945, "loss": 0.4751, "rewards/accuracies": 1.0, "rewards/chosen": 0.9784590601921082, "rewards/margins": 0.4972986876964569, "rewards/rejected": 0.48116037249565125, "step": 2024 }, { "epoch": 1.09, "learning_rate": 7.348071886212004e-08, "logits/chosen": -2.0920212268829346, "logits/rejected": -2.0869081020355225, "logps/chosen": -4.515956878662109, "logps/rejected": -4.788977146148682, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 1.161555528640747, "rewards/margins": 0.659683108329773, "rewards/rejected": 0.5018724203109741, "step": 2025 }, { "epoch": 1.09, "learning_rate": 7.34550093860822e-08, "logits/chosen": -2.0472402572631836, "logits/rejected": -2.0458860397338867, "logps/chosen": -5.20132303237915, "logps/rejected": -6.735598087310791, "loss": 0.5224, "rewards/accuracies": 1.0, "rewards/chosen": 1.298460841178894, "rewards/margins": 0.37689751386642456, "rewards/rejected": 0.9215633273124695, "step": 2026 }, { "epoch": 1.09, "learning_rate": 7.342929195665126e-08, "logits/chosen": -2.032330274581909, "logits/rejected": -2.1090383529663086, "logps/chosen": -2.9918553829193115, "logps/rejected": -11.39529037475586, "loss": 0.3371, "rewards/accuracies": 1.0, "rewards/chosen": 1.4339532852172852, "rewards/margins": 0.9140487313270569, "rewards/rejected": 0.5199045538902283, "step": 2027 }, { "epoch": 1.09, "learning_rate": 7.340356658254785e-08, "logits/chosen": -2.0260679721832275, "logits/rejected": -2.2538514137268066, "logps/chosen": -7.192410469055176, "logps/rejected": -4.431034088134766, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8047224879264832, "rewards/margins": 0.0016963481903076172, "rewards/rejected": 0.8030261397361755, "step": 2028 }, { "epoch": 1.09, "learning_rate": 7.33778332724952e-08, "logits/chosen": -2.138807773590088, "logits/rejected": -2.286098003387451, "logps/chosen": -1.2799450159072876, "logps/rejected": -4.46505880355835, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.9377062916755676, "rewards/margins": 0.05790853500366211, "rewards/rejected": 0.8797977566719055, "step": 2029 }, { "epoch": 1.09, "learning_rate": 7.335209203521922e-08, "logits/chosen": -2.113593101501465, "logits/rejected": -2.1075620651245117, "logps/chosen": -1.0483474731445312, "logps/rejected": -11.684112548828125, "loss": 0.4479, "rewards/accuracies": 1.0, "rewards/chosen": 1.1928471326828003, "rewards/margins": 0.5708338618278503, "rewards/rejected": 0.62201327085495, "step": 2030 }, { "epoch": 1.1, "learning_rate": 7.33263428794486e-08, "logits/chosen": -1.9576791524887085, "logits/rejected": -1.9764912128448486, "logps/chosen": -2.2453596591949463, "logps/rejected": -13.478963851928711, "loss": 0.7786, "rewards/accuracies": 0.0, "rewards/chosen": 0.7820535898208618, "rewards/margins": -0.16418075561523438, "rewards/rejected": 0.9462343454360962, "step": 2031 }, { "epoch": 1.1, "learning_rate": 7.330058581391459e-08, "logits/chosen": -1.978827714920044, "logits/rejected": -1.9847544431686401, "logps/chosen": -2.036527156829834, "logps/rejected": -3.120863676071167, "loss": 0.475, "rewards/accuracies": 1.0, "rewards/chosen": 1.0458406209945679, "rewards/margins": 0.49765652418136597, "rewards/rejected": 0.5481840968132019, "step": 2032 }, { "epoch": 1.1, "learning_rate": 7.327482084735125e-08, "logits/chosen": -2.0875625610351562, "logits/rejected": -2.268791913986206, "logps/chosen": -1.357649803161621, "logps/rejected": -1.3097341060638428, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 1.0034502744674683, "rewards/margins": 0.04157865047454834, "rewards/rejected": 0.9618716239929199, "step": 2033 }, { "epoch": 1.1, "learning_rate": 7.324904798849525e-08, "logits/chosen": -2.136554479598999, "logits/rejected": -2.3182711601257324, "logps/chosen": -0.40199270844459534, "logps/rejected": -0.3810204267501831, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8463516235351562, "rewards/margins": 0.0061514973640441895, "rewards/rejected": 0.8402001261711121, "step": 2034 }, { "epoch": 1.1, "learning_rate": 7.322326724608592e-08, "logits/chosen": -1.9883179664611816, "logits/rejected": -1.9885892868041992, "logps/chosen": -3.3750598430633545, "logps/rejected": -2.8901689052581787, "loss": 0.3767, "rewards/accuracies": 1.0, "rewards/chosen": 1.478752613067627, "rewards/margins": 0.7821889519691467, "rewards/rejected": 0.6965636610984802, "step": 2035 }, { "epoch": 1.1, "learning_rate": 7.319747862886531e-08, "logits/chosen": -2.1739296913146973, "logits/rejected": -2.163381576538086, "logps/chosen": -3.6542773246765137, "logps/rejected": -9.561537742614746, "loss": 0.6215, "rewards/accuracies": 1.0, "rewards/chosen": 0.831807553768158, "rewards/margins": 0.14876186847686768, "rewards/rejected": 0.6830456852912903, "step": 2036 }, { "epoch": 1.1, "learning_rate": 7.31716821455781e-08, "logits/chosen": -2.056877613067627, "logits/rejected": -2.0631887912750244, "logps/chosen": -2.7362418174743652, "logps/rejected": -4.409297943115234, "loss": 0.5059, "rewards/accuracies": 1.0, "rewards/chosen": 0.9419428110122681, "rewards/margins": 0.41789084672927856, "rewards/rejected": 0.5240519642829895, "step": 2037 }, { "epoch": 1.1, "learning_rate": 7.314587780497169e-08, "logits/chosen": -2.1480839252471924, "logits/rejected": -2.149123191833496, "logps/chosen": -5.971985816955566, "logps/rejected": -10.667439460754395, "loss": 0.2256, "rewards/accuracies": 1.0, "rewards/chosen": 1.7521103620529175, "rewards/margins": 1.3738360404968262, "rewards/rejected": 0.37827426195144653, "step": 2038 }, { "epoch": 1.1, "learning_rate": 7.31200656157961e-08, "logits/chosen": -2.0949745178222656, "logits/rejected": -2.1009185314178467, "logps/chosen": -1.5368258953094482, "logps/rejected": -3.6714558601379395, "loss": 0.4464, "rewards/accuracies": 1.0, "rewards/chosen": 1.075131893157959, "rewards/margins": 0.5751795768737793, "rewards/rejected": 0.4999522864818573, "step": 2039 }, { "epoch": 1.1, "learning_rate": 7.309424558680399e-08, "logits/chosen": -2.0755462646484375, "logits/rejected": -2.233812093734741, "logps/chosen": -0.5957326889038086, "logps/rejected": -0.5943347215652466, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.960228443145752, "rewards/margins": 0.0009301304817199707, "rewards/rejected": 0.959298312664032, "step": 2040 }, { "epoch": 1.1, "learning_rate": 7.306841772675074e-08, "logits/chosen": -2.1283533573150635, "logits/rejected": -2.254235029220581, "logps/chosen": -3.8132896423339844, "logps/rejected": -3.903308391571045, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.836207389831543, "rewards/margins": -0.00124359130859375, "rewards/rejected": 0.8374509811401367, "step": 2041 }, { "epoch": 1.1, "learning_rate": 7.304258204439437e-08, "logits/chosen": -2.172816753387451, "logits/rejected": -2.1683313846588135, "logps/chosen": -6.724279403686523, "logps/rejected": -4.826384544372559, "loss": 0.3728, "rewards/accuracies": 1.0, "rewards/chosen": 1.2642561197280884, "rewards/margins": 0.7946632504463196, "rewards/rejected": 0.4695928692817688, "step": 2042 }, { "epoch": 1.1, "learning_rate": 7.301673854849551e-08, "logits/chosen": -2.0013856887817383, "logits/rejected": -2.2536845207214355, "logps/chosen": -5.3013596534729, "logps/rejected": -3.480624198913574, "loss": 0.7295, "rewards/accuracies": 0.0, "rewards/chosen": 0.5206910967826843, "rewards/margins": -0.07136821746826172, "rewards/rejected": 0.592059314250946, "step": 2043 }, { "epoch": 1.1, "learning_rate": 7.299088724781747e-08, "logits/chosen": -2.142127275466919, "logits/rejected": -2.0076661109924316, "logps/chosen": -33.37673568725586, "logps/rejected": -5.112125396728516, "loss": 0.2854, "rewards/accuracies": 1.0, "rewards/chosen": 1.5209053754806519, "rewards/margins": 1.1076233386993408, "rewards/rejected": 0.41328200697898865, "step": 2044 }, { "epoch": 1.1, "learning_rate": 7.296502815112622e-08, "logits/chosen": -1.9232367277145386, "logits/rejected": -1.923130989074707, "logps/chosen": -1.2644457817077637, "logps/rejected": -1.1082309484481812, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.8138670325279236, "rewards/margins": 0.029236674308776855, "rewards/rejected": 0.7846303582191467, "step": 2045 }, { "epoch": 1.1, "learning_rate": 7.293916126719035e-08, "logits/chosen": -2.16790509223938, "logits/rejected": -2.288132429122925, "logps/chosen": -1.3140735626220703, "logps/rejected": -7.219019889831543, "loss": 0.5631, "rewards/accuracies": 1.0, "rewards/chosen": 1.0586328506469727, "rewards/margins": 0.27960842847824097, "rewards/rejected": 0.7790244221687317, "step": 2046 }, { "epoch": 1.1, "learning_rate": 7.291328660478112e-08, "logits/chosen": -2.076688766479492, "logits/rejected": -2.2865328788757324, "logps/chosen": -6.827963352203369, "logps/rejected": -5.985056400299072, "loss": 0.7061, "rewards/accuracies": 0.0, "rewards/chosen": 0.8056768774986267, "rewards/margins": -0.02566850185394287, "rewards/rejected": 0.8313453793525696, "step": 2047 }, { "epoch": 1.1, "learning_rate": 7.288740417267238e-08, "logits/chosen": -2.0264227390289307, "logits/rejected": -2.0329713821411133, "logps/chosen": -1.8990180492401123, "logps/rejected": -3.9323954582214355, "loss": 0.4703, "rewards/accuracies": 1.0, "rewards/chosen": 1.086897373199463, "rewards/margins": 0.5101540684700012, "rewards/rejected": 0.5767433047294617, "step": 2048 }, { "epoch": 1.11, "learning_rate": 7.286151397964064e-08, "logits/chosen": -2.139291763305664, "logits/rejected": -2.1503734588623047, "logps/chosen": -3.2680773735046387, "logps/rejected": -3.408071994781494, "loss": 0.5746, "rewards/accuracies": 1.0, "rewards/chosen": 1.2959400415420532, "rewards/margins": 0.25310826301574707, "rewards/rejected": 1.0428317785263062, "step": 2049 }, { "epoch": 1.11, "learning_rate": 7.283561603446507e-08, "logits/chosen": -2.0820398330688477, "logits/rejected": -2.085653305053711, "logps/chosen": -1.8860282897949219, "logps/rejected": -2.7833762168884277, "loss": 0.4542, "rewards/accuracies": 1.0, "rewards/chosen": 1.3584703207015991, "rewards/margins": 0.5536494851112366, "rewards/rejected": 0.8048208355903625, "step": 2050 }, { "epoch": 1.11, "learning_rate": 7.28097103459274e-08, "logits/chosen": -2.1829652786254883, "logits/rejected": -2.2570688724517822, "logps/chosen": -4.260046005249023, "logps/rejected": -26.789966583251953, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 1.0968248844146729, "rewards/margins": 0.7508978247642517, "rewards/rejected": 0.34592705965042114, "step": 2051 }, { "epoch": 1.11, "learning_rate": 7.278379692281207e-08, "logits/chosen": -2.057037353515625, "logits/rejected": -2.1020708084106445, "logps/chosen": -4.378006935119629, "logps/rejected": -25.235244750976562, "loss": 0.1938, "rewards/accuracies": 1.0, "rewards/chosen": 1.261277675628662, "rewards/margins": 1.5423176288604736, "rewards/rejected": -0.2810400128364563, "step": 2052 }, { "epoch": 1.11, "learning_rate": 7.275787577390608e-08, "logits/chosen": -2.111178398132324, "logits/rejected": -2.257155179977417, "logps/chosen": -2.3866987228393555, "logps/rejected": -1.383595585823059, "loss": 0.6551, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255268812179565, "rewards/margins": 0.07769912481307983, "rewards/rejected": 0.9478277564048767, "step": 2053 }, { "epoch": 1.11, "learning_rate": 7.273194690799908e-08, "logits/chosen": -2.0374484062194824, "logits/rejected": -2.0430586338043213, "logps/chosen": -1.4765362739562988, "logps/rejected": -3.6167972087860107, "loss": 0.4614, "rewards/accuracies": 1.0, "rewards/chosen": 1.0981024503707886, "rewards/margins": 0.534006655216217, "rewards/rejected": 0.5640957951545715, "step": 2054 }, { "epoch": 1.11, "learning_rate": 7.27060103338833e-08, "logits/chosen": -1.9939066171646118, "logits/rejected": -2.029371500015259, "logps/chosen": -8.700748443603516, "logps/rejected": -18.34874153137207, "loss": 0.3878, "rewards/accuracies": 1.0, "rewards/chosen": 1.1567070484161377, "rewards/margins": 0.7472333908081055, "rewards/rejected": 0.40947362780570984, "step": 2055 }, { "epoch": 1.11, "learning_rate": 7.268006606035364e-08, "logits/chosen": -2.0752410888671875, "logits/rejected": -2.0863256454467773, "logps/chosen": -9.925034523010254, "logps/rejected": -6.532990455627441, "loss": 0.2654, "rewards/accuracies": 1.0, "rewards/chosen": 1.9021472930908203, "rewards/margins": 1.1908469200134277, "rewards/rejected": 0.7113003730773926, "step": 2056 }, { "epoch": 1.11, "learning_rate": 7.265411409620758e-08, "logits/chosen": -2.101796865463257, "logits/rejected": -2.3299968242645264, "logps/chosen": -13.620404243469238, "logps/rejected": -8.378872871398926, "loss": 0.86, "rewards/accuracies": 0.0, "rewards/chosen": 0.821705162525177, "rewards/margins": -0.3098079562187195, "rewards/rejected": 1.1315131187438965, "step": 2057 }, { "epoch": 1.11, "learning_rate": 7.262815445024519e-08, "logits/chosen": -1.9789470434188843, "logits/rejected": -1.9752211570739746, "logps/chosen": -2.733187198638916, "logps/rejected": -7.2248053550720215, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 1.0174177885055542, "rewards/margins": 0.4383302927017212, "rewards/rejected": 0.579087495803833, "step": 2058 }, { "epoch": 1.11, "learning_rate": 7.260218713126916e-08, "logits/chosen": -2.1376709938049316, "logits/rejected": -2.0263569355010986, "logps/chosen": -10.191129684448242, "logps/rejected": -2.575108051300049, "loss": 0.3376, "rewards/accuracies": 1.0, "rewards/chosen": 1.6687172651290894, "rewards/margins": 0.9123395085334778, "rewards/rejected": 0.7563777565956116, "step": 2059 }, { "epoch": 1.11, "learning_rate": 7.257621214808483e-08, "logits/chosen": -2.089653968811035, "logits/rejected": -2.0977368354797363, "logps/chosen": -3.27702260017395, "logps/rejected": -6.161011695861816, "loss": 0.7044, "rewards/accuracies": 0.0, "rewards/chosen": 0.9662036895751953, "rewards/margins": -0.022331178188323975, "rewards/rejected": 0.9885348677635193, "step": 2060 }, { "epoch": 1.11, "learning_rate": 7.255022950950004e-08, "logits/chosen": -2.097348928451538, "logits/rejected": -2.103768825531006, "logps/chosen": -1.8584812879562378, "logps/rejected": -3.3008298873901367, "loss": 0.4499, "rewards/accuracies": 1.0, "rewards/chosen": 1.197709560394287, "rewards/margins": 0.5654581785202026, "rewards/rejected": 0.6322513818740845, "step": 2061 }, { "epoch": 1.11, "learning_rate": 7.252423922432531e-08, "logits/chosen": -2.13691782951355, "logits/rejected": -2.1337153911590576, "logps/chosen": -2.836961507797241, "logps/rejected": -3.058417558670044, "loss": 0.3964, "rewards/accuracies": 1.0, "rewards/chosen": 1.523253083229065, "rewards/margins": 0.7207218408584595, "rewards/rejected": 0.8025312423706055, "step": 2062 }, { "epoch": 1.11, "learning_rate": 7.249824130137372e-08, "logits/chosen": -2.044041872024536, "logits/rejected": -2.2422101497650146, "logps/chosen": -1.5741227865219116, "logps/rejected": -1.480841040611267, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.7378392815589905, "rewards/margins": 0.003620326519012451, "rewards/rejected": 0.734218955039978, "step": 2063 }, { "epoch": 1.11, "learning_rate": 7.247223574946093e-08, "logits/chosen": -2.1271471977233887, "logits/rejected": -2.1212472915649414, "logps/chosen": -3.6346919536590576, "logps/rejected": -3.4467246532440186, "loss": 0.4186, "rewards/accuracies": 1.0, "rewards/chosen": 1.1247766017913818, "rewards/margins": 0.6542853713035583, "rewards/rejected": 0.4704912304878235, "step": 2064 }, { "epoch": 1.11, "learning_rate": 7.244622257740523e-08, "logits/chosen": -2.0541162490844727, "logits/rejected": -2.280799388885498, "logps/chosen": -0.8932276964187622, "logps/rejected": -0.9414255619049072, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 0.7588141560554504, "rewards/margins": 0.017617106437683105, "rewards/rejected": 0.7411970496177673, "step": 2065 }, { "epoch": 1.11, "learning_rate": 7.242020179402744e-08, "logits/chosen": -2.0630903244018555, "logits/rejected": -2.055100440979004, "logps/chosen": -5.533587455749512, "logps/rejected": -1.0062528848648071, "loss": 0.4827, "rewards/accuracies": 1.0, "rewards/chosen": 1.4699281454086304, "rewards/margins": 0.4773404002189636, "rewards/rejected": 0.9925877451896667, "step": 2066 }, { "epoch": 1.11, "learning_rate": 7.239417340815098e-08, "logits/chosen": -2.1290690898895264, "logits/rejected": -2.135512590408325, "logps/chosen": -1.9275200366973877, "logps/rejected": -4.120482921600342, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": 1.1793429851531982, "rewards/margins": 0.6571807861328125, "rewards/rejected": 0.5221621990203857, "step": 2067 }, { "epoch": 1.12, "learning_rate": 7.236813742860186e-08, "logits/chosen": -2.059121608734131, "logits/rejected": -2.277850866317749, "logps/chosen": -7.871334552764893, "logps/rejected": -1.5079180002212524, "loss": 0.9123, "rewards/accuracies": 0.0, "rewards/chosen": 0.6678265333175659, "rewards/margins": -0.39882421493530273, "rewards/rejected": 1.0666507482528687, "step": 2068 }, { "epoch": 1.12, "learning_rate": 7.234209386420869e-08, "logits/chosen": -2.0518686771392822, "logits/rejected": -2.2694060802459717, "logps/chosen": -2.428924083709717, "logps/rejected": -2.5089335441589355, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.5985467433929443, "rewards/margins": 0.024647235870361328, "rewards/rejected": 0.573899507522583, "step": 2069 }, { "epoch": 1.12, "learning_rate": 7.231604272380256e-08, "logits/chosen": -2.0492300987243652, "logits/rejected": -2.050971508026123, "logps/chosen": -1.0277259349822998, "logps/rejected": -4.534058094024658, "loss": 0.5081, "rewards/accuracies": 1.0, "rewards/chosen": 0.9140275120735168, "rewards/margins": 0.41235870122909546, "rewards/rejected": 0.5016688108444214, "step": 2070 }, { "epoch": 1.12, "learning_rate": 7.228998401621723e-08, "logits/chosen": -1.9597840309143066, "logits/rejected": -1.9665744304656982, "logps/chosen": -1.8082082271575928, "logps/rejected": -4.040648460388184, "loss": 0.3982, "rewards/accuracies": 1.0, "rewards/chosen": 1.1833088397979736, "rewards/margins": 0.7149820327758789, "rewards/rejected": 0.46832677721977234, "step": 2071 }, { "epoch": 1.12, "learning_rate": 7.226391775028898e-08, "logits/chosen": -2.1190807819366455, "logits/rejected": -2.119476318359375, "logps/chosen": -2.6660637855529785, "logps/rejected": -3.537156820297241, "loss": 0.5242, "rewards/accuracies": 1.0, "rewards/chosen": 0.9593931436538696, "rewards/margins": 0.37245064973831177, "rewards/rejected": 0.5869424939155579, "step": 2072 }, { "epoch": 1.12, "learning_rate": 7.223784393485666e-08, "logits/chosen": -2.1138978004455566, "logits/rejected": -2.1852145195007324, "logps/chosen": -3.579807758331299, "logps/rejected": -28.612226486206055, "loss": 0.4014, "rewards/accuracies": 1.0, "rewards/chosen": 1.109434723854065, "rewards/margins": 0.705374538898468, "rewards/rejected": 0.4040601849555969, "step": 2073 }, { "epoch": 1.12, "learning_rate": 7.221176257876166e-08, "logits/chosen": -2.0526750087738037, "logits/rejected": -1.951851487159729, "logps/chosen": -34.96824264526367, "logps/rejected": -2.282301425933838, "loss": 0.2965, "rewards/accuracies": 1.0, "rewards/chosen": 1.8351677656173706, "rewards/margins": 1.0638635158538818, "rewards/rejected": 0.771304190158844, "step": 2074 }, { "epoch": 1.12, "learning_rate": 7.218567369084796e-08, "logits/chosen": -1.9816021919250488, "logits/rejected": -2.292649745941162, "logps/chosen": -0.6920400857925415, "logps/rejected": -0.6623992323875427, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.81238853931427, "rewards/margins": 0.0007901191711425781, "rewards/rejected": 0.8115984201431274, "step": 2075 }, { "epoch": 1.12, "learning_rate": 7.215957727996206e-08, "logits/chosen": -2.0461230278015137, "logits/rejected": -2.0463624000549316, "logps/chosen": -4.707681179046631, "logps/rejected": -3.617851972579956, "loss": 0.3018, "rewards/accuracies": 1.0, "rewards/chosen": 1.5532945394515991, "rewards/margins": 1.0433545112609863, "rewards/rejected": 0.5099400877952576, "step": 2076 }, { "epoch": 1.12, "learning_rate": 7.213347335495306e-08, "logits/chosen": -2.0404467582702637, "logits/rejected": -2.0472891330718994, "logps/chosen": -1.3588244915008545, "logps/rejected": -3.763291120529175, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 1.0355759859085083, "rewards/margins": 0.4636123776435852, "rewards/rejected": 0.5719636082649231, "step": 2077 }, { "epoch": 1.12, "learning_rate": 7.210736192467255e-08, "logits/chosen": -2.142094612121582, "logits/rejected": -2.1426563262939453, "logps/chosen": -2.7298965454101562, "logps/rejected": -5.65806245803833, "loss": 0.3783, "rewards/accuracies": 1.0, "rewards/chosen": 1.0688648223876953, "rewards/margins": 0.7770799398422241, "rewards/rejected": 0.2917849123477936, "step": 2078 }, { "epoch": 1.12, "learning_rate": 7.208124299797472e-08, "logits/chosen": -2.0870583057403564, "logits/rejected": -2.0879921913146973, "logps/chosen": -1.5159003734588623, "logps/rejected": -1.9132851362228394, "loss": 0.6138, "rewards/accuracies": 1.0, "rewards/chosen": 1.127511739730835, "rewards/margins": 0.16550981998443604, "rewards/rejected": 0.9620019197463989, "step": 2079 }, { "epoch": 1.12, "learning_rate": 7.205511658371625e-08, "logits/chosen": -2.003063917160034, "logits/rejected": -1.9978282451629639, "logps/chosen": -5.019170761108398, "logps/rejected": -2.0512185096740723, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.5801416635513306, "rewards/margins": 0.9921513199806213, "rewards/rejected": 0.5879903435707092, "step": 2080 }, { "epoch": 1.12, "learning_rate": 7.202898269075639e-08, "logits/chosen": -2.111326217651367, "logits/rejected": -2.1098110675811768, "logps/chosen": -4.490280628204346, "logps/rejected": -7.096439361572266, "loss": 0.2492, "rewards/accuracies": 1.0, "rewards/chosen": 1.493181586265564, "rewards/margins": 1.2624800205230713, "rewards/rejected": 0.23070155084133148, "step": 2081 }, { "epoch": 1.12, "learning_rate": 7.200284132795693e-08, "logits/chosen": -2.071463108062744, "logits/rejected": -2.284480571746826, "logps/chosen": -2.236074447631836, "logps/rejected": -1.647318959236145, "loss": 0.717, "rewards/accuracies": 0.0, "rewards/chosen": 0.4739944636821747, "rewards/margins": -0.04714199900627136, "rewards/rejected": 0.521136462688446, "step": 2082 }, { "epoch": 1.12, "learning_rate": 7.197669250418217e-08, "logits/chosen": -2.1425139904022217, "logits/rejected": -2.3392345905303955, "logps/chosen": -1.4918582439422607, "logps/rejected": -1.3800305128097534, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 1.00996732711792, "rewards/margins": 0.007114768028259277, "rewards/rejected": 1.0028525590896606, "step": 2083 }, { "epoch": 1.12, "learning_rate": 7.195053622829896e-08, "logits/chosen": -2.0301284790039062, "logits/rejected": -2.0266358852386475, "logps/chosen": -3.517864465713501, "logps/rejected": -7.227970123291016, "loss": 0.4661, "rewards/accuracies": 1.0, "rewards/chosen": 1.1471079587936401, "rewards/margins": 0.5213979482650757, "rewards/rejected": 0.6257100105285645, "step": 2084 }, { "epoch": 1.12, "learning_rate": 7.192437250917667e-08, "logits/chosen": -2.0299136638641357, "logits/rejected": -2.029167652130127, "logps/chosen": -0.6854180097579956, "logps/rejected": -3.78161358833313, "loss": 0.5339, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012411236763, "rewards/margins": 0.34865617752075195, "rewards/rejected": 0.5525849461555481, "step": 2085 }, { "epoch": 1.13, "learning_rate": 7.189820135568717e-08, "logits/chosen": -2.1431779861450195, "logits/rejected": -2.2828493118286133, "logps/chosen": -5.567214488983154, "logps/rejected": -0.8335109353065491, "loss": 0.7729, "rewards/accuracies": 0.0, "rewards/chosen": 0.8605667352676392, "rewards/margins": -0.1536792516708374, "rewards/rejected": 1.0142459869384766, "step": 2086 }, { "epoch": 1.13, "learning_rate": 7.187202277670491e-08, "logits/chosen": -2.1426947116851807, "logits/rejected": -2.3079497814178467, "logps/chosen": -0.7031754851341248, "logps/rejected": -0.7737143039703369, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0941990613937378, "rewards/margins": 0.01708805561065674, "rewards/rejected": 1.077111005783081, "step": 2087 }, { "epoch": 1.13, "learning_rate": 7.18458367811068e-08, "logits/chosen": -2.094283103942871, "logits/rejected": -2.2838237285614014, "logps/chosen": -1.945117712020874, "logps/rejected": -1.9098647832870483, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.5323463678359985, "rewards/margins": 0.001953303813934326, "rewards/rejected": 0.5303930640220642, "step": 2088 }, { "epoch": 1.13, "learning_rate": 7.181964337777229e-08, "logits/chosen": -1.9808706045150757, "logits/rejected": -1.9740607738494873, "logps/chosen": -1.6531896591186523, "logps/rejected": -4.803046226501465, "loss": 0.488, "rewards/accuracies": 1.0, "rewards/chosen": 0.9801949858665466, "rewards/margins": 0.463653564453125, "rewards/rejected": 0.5165414214134216, "step": 2089 }, { "epoch": 1.13, "learning_rate": 7.179344257558335e-08, "logits/chosen": -2.066162586212158, "logits/rejected": -2.225736141204834, "logps/chosen": -1.1474757194519043, "logps/rejected": -1.0190777778625488, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.7811354398727417, "rewards/margins": 0.03026479482650757, "rewards/rejected": 0.7508706450462341, "step": 2090 }, { "epoch": 1.13, "learning_rate": 7.176723438342444e-08, "logits/chosen": -2.1608803272247314, "logits/rejected": -1.9959861040115356, "logps/chosen": -39.579830169677734, "logps/rejected": -4.502379417419434, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": 1.8426990509033203, "rewards/margins": 1.1200308799743652, "rewards/rejected": 0.7226681709289551, "step": 2091 }, { "epoch": 1.13, "learning_rate": 7.174101881018255e-08, "logits/chosen": -2.1413090229034424, "logits/rejected": -2.123974084854126, "logps/chosen": -16.340124130249023, "logps/rejected": -3.8040640354156494, "loss": 0.3715, "rewards/accuracies": 1.0, "rewards/chosen": 1.2341219186782837, "rewards/margins": 0.7985519170761108, "rewards/rejected": 0.43556997179985046, "step": 2092 }, { "epoch": 1.13, "learning_rate": 7.171479586474711e-08, "logits/chosen": -2.1385467052459717, "logits/rejected": -2.1299915313720703, "logps/chosen": -6.912247180938721, "logps/rejected": -3.3384861946105957, "loss": 0.593, "rewards/accuracies": 1.0, "rewards/chosen": 0.7779514789581299, "rewards/margins": 0.21146279573440552, "rewards/rejected": 0.5664886832237244, "step": 2093 }, { "epoch": 1.13, "learning_rate": 7.168856555601016e-08, "logits/chosen": -2.0858607292175293, "logits/rejected": -2.0882811546325684, "logps/chosen": -0.8161322474479675, "logps/rejected": -3.4425978660583496, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.0661001205444336, "rewards/margins": 0.47061020135879517, "rewards/rejected": 0.5954899191856384, "step": 2094 }, { "epoch": 1.13, "learning_rate": 7.166232789286612e-08, "logits/chosen": -2.1546263694763184, "logits/rejected": -2.1175215244293213, "logps/chosen": -28.84967613220215, "logps/rejected": -12.94232177734375, "loss": 0.4259, "rewards/accuracies": 1.0, "rewards/chosen": 1.6079927682876587, "rewards/margins": 0.6331596374511719, "rewards/rejected": 0.9748331308364868, "step": 2095 }, { "epoch": 1.13, "learning_rate": 7.163608288421198e-08, "logits/chosen": -2.159794807434082, "logits/rejected": -2.1562368869781494, "logps/chosen": -2.868154764175415, "logps/rejected": -10.506133079528809, "loss": 0.3667, "rewards/accuracies": 1.0, "rewards/chosen": 1.3112238645553589, "rewards/margins": 0.8141458034515381, "rewards/rejected": 0.4970780313014984, "step": 2096 }, { "epoch": 1.13, "learning_rate": 7.16098305389472e-08, "logits/chosen": -2.014849901199341, "logits/rejected": -2.2446177005767822, "logps/chosen": -0.9702317118644714, "logps/rejected": -1.0983752012252808, "loss": 0.6749, "rewards/accuracies": 1.0, "rewards/chosen": 0.7406119108200073, "rewards/margins": 0.036815345287323, "rewards/rejected": 0.7037965655326843, "step": 2097 }, { "epoch": 1.13, "learning_rate": 7.158357086597372e-08, "logits/chosen": -2.186537981033325, "logits/rejected": -2.284449815750122, "logps/chosen": -5.265678405761719, "logps/rejected": -1.6354351043701172, "loss": 0.7683, "rewards/accuracies": 0.0, "rewards/chosen": 1.0925461053848267, "rewards/margins": -0.14509928226470947, "rewards/rejected": 1.2376453876495361, "step": 2098 }, { "epoch": 1.13, "learning_rate": 7.155730387419597e-08, "logits/chosen": -2.017343521118164, "logits/rejected": -2.307586908340454, "logps/chosen": -4.315564155578613, "logps/rejected": -5.9710283279418945, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": 0.870809018611908, "rewards/margins": 0.2174292802810669, "rewards/rejected": 0.6533797383308411, "step": 2099 }, { "epoch": 1.13, "learning_rate": 7.153102957252086e-08, "logits/chosen": -2.0050792694091797, "logits/rejected": -2.00107741355896, "logps/chosen": -8.949422836303711, "logps/rejected": -1.7170852422714233, "loss": 0.4302, "rewards/accuracies": 1.0, "rewards/chosen": 1.4186781644821167, "rewards/margins": 0.6207104921340942, "rewards/rejected": 0.7979676723480225, "step": 2100 }, { "epoch": 1.13, "learning_rate": 7.15047479698578e-08, "logits/chosen": -2.1263644695281982, "logits/rejected": -2.1267244815826416, "logps/chosen": -2.662773609161377, "logps/rejected": -2.27542781829834, "loss": 0.6327, "rewards/accuracies": 1.0, "rewards/chosen": 0.943550705909729, "rewards/margins": 0.12469780445098877, "rewards/rejected": 0.8188529014587402, "step": 2101 }, { "epoch": 1.13, "learning_rate": 7.147845907511862e-08, "logits/chosen": -2.152960777282715, "logits/rejected": -2.3003127574920654, "logps/chosen": -0.6594254970550537, "logps/rejected": -0.6421590447425842, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9521940350532532, "rewards/margins": 0.018024563789367676, "rewards/rejected": 0.9341694712638855, "step": 2102 }, { "epoch": 1.13, "learning_rate": 7.14521628972177e-08, "logits/chosen": -2.0125648975372314, "logits/rejected": -2.260728120803833, "logps/chosen": -5.063995361328125, "logps/rejected": -4.996374607086182, "loss": 0.7075, "rewards/accuracies": 0.0, "rewards/chosen": 0.977559506893158, "rewards/margins": -0.028564393520355225, "rewards/rejected": 1.0061239004135132, "step": 2103 }, { "epoch": 1.13, "learning_rate": 7.142585944507183e-08, "logits/chosen": -2.1745195388793945, "logits/rejected": -2.262058734893799, "logps/chosen": -7.031050205230713, "logps/rejected": -3.467014789581299, "loss": 0.7427, "rewards/accuracies": 0.0, "rewards/chosen": 0.5134929418563843, "rewards/margins": -0.09676647186279297, "rewards/rejected": 0.6102594137191772, "step": 2104 }, { "epoch": 1.14, "learning_rate": 7.139954872760027e-08, "logits/chosen": -2.072295665740967, "logits/rejected": -2.0752108097076416, "logps/chosen": -2.179377555847168, "logps/rejected": -4.131720542907715, "loss": 0.4322, "rewards/accuracies": 1.0, "rewards/chosen": 1.121739387512207, "rewards/margins": 0.6150069236755371, "rewards/rejected": 0.5067324638366699, "step": 2105 }, { "epoch": 1.14, "learning_rate": 7.13732307537248e-08, "logits/chosen": -2.0398430824279785, "logits/rejected": -2.0425710678100586, "logps/chosen": -0.3445208668708801, "logps/rejected": -4.66609001159668, "loss": 0.5526, "rewards/accuracies": 1.0, "rewards/chosen": 0.9257049560546875, "rewards/margins": 0.3041771650314331, "rewards/rejected": 0.6215277910232544, "step": 2106 }, { "epoch": 1.14, "learning_rate": 7.134690553236957e-08, "logits/chosen": -2.1660208702087402, "logits/rejected": -2.1598904132843018, "logps/chosen": -6.2835307121276855, "logps/rejected": -4.049098491668701, "loss": 0.4016, "rewards/accuracies": 1.0, "rewards/chosen": 1.293646216392517, "rewards/margins": 0.7048603296279907, "rewards/rejected": 0.5887858867645264, "step": 2107 }, { "epoch": 1.14, "learning_rate": 7.132057307246129e-08, "logits/chosen": -2.0519821643829346, "logits/rejected": -2.118687868118286, "logps/chosen": -7.644397258758545, "logps/rejected": -17.58951759338379, "loss": 0.3776, "rewards/accuracies": 1.0, "rewards/chosen": 1.3299964666366577, "rewards/margins": 0.7793252468109131, "rewards/rejected": 0.5506712198257446, "step": 2108 }, { "epoch": 1.14, "learning_rate": 7.129423338292904e-08, "logits/chosen": -2.0761687755584717, "logits/rejected": -2.2930190563201904, "logps/chosen": -1.0940395593643188, "logps/rejected": -6.461134910583496, "loss": 0.6226, "rewards/accuracies": 1.0, "rewards/chosen": 1.031661033630371, "rewards/margins": 0.14649486541748047, "rewards/rejected": 0.8851661682128906, "step": 2109 }, { "epoch": 1.14, "learning_rate": 7.126788647270436e-08, "logits/chosen": -2.0865633487701416, "logits/rejected": -2.2528188228607178, "logps/chosen": -0.5629672408103943, "logps/rejected": -0.6005122661590576, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.7950637936592102, "rewards/margins": 0.0030921101570129395, "rewards/rejected": 0.7919716835021973, "step": 2110 }, { "epoch": 1.14, "learning_rate": 7.124153235072132e-08, "logits/chosen": -2.117318630218506, "logits/rejected": -2.012972116470337, "logps/chosen": -26.902097702026367, "logps/rejected": -3.4462265968322754, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 1.9460779428482056, "rewards/margins": 1.3803660869598389, "rewards/rejected": 0.5657119154930115, "step": 2111 }, { "epoch": 1.14, "learning_rate": 7.121517102591633e-08, "logits/chosen": -2.1772193908691406, "logits/rejected": -2.1846914291381836, "logps/chosen": -3.2693655490875244, "logps/rejected": -2.9848945140838623, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 0.8506889343261719, "rewards/margins": 0.32929521799087524, "rewards/rejected": 0.5213937163352966, "step": 2112 }, { "epoch": 1.14, "learning_rate": 7.11888025072283e-08, "logits/chosen": -2.097895860671997, "logits/rejected": -2.283134698867798, "logps/chosen": -3.075218915939331, "logps/rejected": -2.9046480655670166, "loss": 0.7084, "rewards/accuracies": 0.0, "rewards/chosen": 1.0151695013046265, "rewards/margins": -0.030307888984680176, "rewards/rejected": 1.0454773902893066, "step": 2113 }, { "epoch": 1.14, "learning_rate": 7.11624268035986e-08, "logits/chosen": -2.086261749267578, "logits/rejected": -2.0861973762512207, "logps/chosen": -4.313488006591797, "logps/rejected": -2.2132797241210938, "loss": 0.3529, "rewards/accuracies": 1.0, "rewards/chosen": 1.4537683725357056, "rewards/margins": 0.8600502014160156, "rewards/rejected": 0.5937181711196899, "step": 2114 }, { "epoch": 1.14, "learning_rate": 7.113604392397095e-08, "logits/chosen": -2.0066723823547363, "logits/rejected": -2.252995252609253, "logps/chosen": -0.5220940709114075, "logps/rejected": -0.5537627935409546, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.8560463190078735, "rewards/margins": -0.013238310813903809, "rewards/rejected": 0.8692846298217773, "step": 2115 }, { "epoch": 1.14, "learning_rate": 7.11096538772916e-08, "logits/chosen": -1.9925971031188965, "logits/rejected": -2.2265946865081787, "logps/chosen": -0.6088517904281616, "logps/rejected": -0.6618121862411499, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8291789889335632, "rewards/margins": 0.008903682231903076, "rewards/rejected": 0.8202753067016602, "step": 2116 }, { "epoch": 1.14, "learning_rate": 7.10832566725092e-08, "logits/chosen": -2.1078901290893555, "logits/rejected": -2.3360559940338135, "logps/chosen": -0.37748077511787415, "logps/rejected": -0.4434271454811096, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8866421580314636, "rewards/margins": 0.030585289001464844, "rewards/rejected": 0.8560568690299988, "step": 2117 }, { "epoch": 1.14, "learning_rate": 7.105685231857479e-08, "logits/chosen": -2.1374995708465576, "logits/rejected": -2.12874436378479, "logps/chosen": -7.584221839904785, "logps/rejected": -1.6510725021362305, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": 1.405903935432434, "rewards/margins": 0.6643063426017761, "rewards/rejected": 0.741597592830658, "step": 2118 }, { "epoch": 1.14, "learning_rate": 7.103044082444185e-08, "logits/chosen": -2.0021724700927734, "logits/rejected": -2.0047037601470947, "logps/chosen": -2.8651225566864014, "logps/rejected": -2.0310044288635254, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": 1.399457335472107, "rewards/margins": 0.35737621784210205, "rewards/rejected": 1.0420811176300049, "step": 2119 }, { "epoch": 1.14, "learning_rate": 7.100402219906631e-08, "logits/chosen": -2.1741750240325928, "logits/rejected": -2.32735276222229, "logps/chosen": -17.638444900512695, "logps/rejected": -4.578916072845459, "loss": 0.7424, "rewards/accuracies": 0.0, "rewards/chosen": 0.7599584460258484, "rewards/margins": -0.09613537788391113, "rewards/rejected": 0.8560938239097595, "step": 2120 }, { "epoch": 1.14, "learning_rate": 7.097759645140651e-08, "logits/chosen": -2.2818634510040283, "logits/rejected": -2.162184238433838, "logps/chosen": -34.38751220703125, "logps/rejected": -3.674283027648926, "loss": 0.2725, "rewards/accuracies": 1.0, "rewards/chosen": 2.0649399757385254, "rewards/margins": 1.1609830856323242, "rewards/rejected": 0.9039568305015564, "step": 2121 }, { "epoch": 1.14, "learning_rate": 7.095116359042318e-08, "logits/chosen": -2.1202096939086914, "logits/rejected": -2.120499610900879, "logps/chosen": -6.106109142303467, "logps/rejected": -2.6320245265960693, "loss": 0.2797, "rewards/accuracies": 1.0, "rewards/chosen": 1.6833950281143188, "rewards/margins": 1.130946159362793, "rewards/rejected": 0.5524488687515259, "step": 2122 }, { "epoch": 1.15, "learning_rate": 7.092472362507949e-08, "logits/chosen": -2.0000057220458984, "logits/rejected": -2.005157470703125, "logps/chosen": -1.5715447664260864, "logps/rejected": -2.9561567306518555, "loss": 0.3992, "rewards/accuracies": 1.0, "rewards/chosen": 1.311959981918335, "rewards/margins": 0.7121887803077698, "rewards/rejected": 0.5997712016105652, "step": 2123 }, { "epoch": 1.15, "learning_rate": 7.089827656434098e-08, "logits/chosen": -2.173682451248169, "logits/rejected": -2.249370574951172, "logps/chosen": -0.7426785230636597, "logps/rejected": -0.7350611686706543, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.04545259475708, "rewards/margins": 0.020050883293151855, "rewards/rejected": 1.0254017114639282, "step": 2124 }, { "epoch": 1.15, "learning_rate": 7.087182241717564e-08, "logits/chosen": -2.080012559890747, "logits/rejected": -2.272618055343628, "logps/chosen": -2.148850917816162, "logps/rejected": -2.4778833389282227, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8987030386924744, "rewards/margins": 0.012090444564819336, "rewards/rejected": 0.886612594127655, "step": 2125 }, { "epoch": 1.15, "learning_rate": 7.084536119255384e-08, "logits/chosen": -2.0418827533721924, "logits/rejected": -2.3001492023468018, "logps/chosen": -0.41042548418045044, "logps/rejected": -0.3823326528072357, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7067257761955261, "rewards/margins": 0.012401580810546875, "rewards/rejected": 0.6943241953849792, "step": 2126 }, { "epoch": 1.15, "learning_rate": 7.081889289944836e-08, "logits/chosen": -2.2090659141540527, "logits/rejected": -2.0649454593658447, "logps/chosen": -36.43163299560547, "logps/rejected": -10.288402557373047, "loss": 0.2754, "rewards/accuracies": 1.0, "rewards/chosen": 1.7516231536865234, "rewards/margins": 1.1486124992370605, "rewards/rejected": 0.6030105948448181, "step": 2127 }, { "epoch": 1.15, "learning_rate": 7.079241754683438e-08, "logits/chosen": -1.952884554862976, "logits/rejected": -1.9100373983383179, "logps/chosen": -13.801313400268555, "logps/rejected": -8.658576011657715, "loss": 0.3263, "rewards/accuracies": 1.0, "rewards/chosen": 1.333425521850586, "rewards/margins": 0.952508807182312, "rewards/rejected": 0.38091668486595154, "step": 2128 }, { "epoch": 1.15, "learning_rate": 7.076593514368945e-08, "logits/chosen": -2.0412888526916504, "logits/rejected": -2.238550901412964, "logps/chosen": -0.30304765701293945, "logps/rejected": -0.3138256072998047, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555357098579407, "rewards/margins": 0.024012207984924316, "rewards/rejected": 0.9315235018730164, "step": 2129 }, { "epoch": 1.15, "learning_rate": 7.073944569899354e-08, "logits/chosen": -2.222766637802124, "logits/rejected": -2.2804324626922607, "logps/chosen": -10.906726837158203, "logps/rejected": -8.120431900024414, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8241989016532898, "rewards/margins": 0.009596407413482666, "rewards/rejected": 0.8146024942398071, "step": 2130 }, { "epoch": 1.15, "learning_rate": 7.071294922172897e-08, "logits/chosen": -2.0971009731292725, "logits/rejected": -2.2901172637939453, "logps/chosen": -3.3177313804626465, "logps/rejected": -3.407128095626831, "loss": 0.6569, "rewards/accuracies": 1.0, "rewards/chosen": 0.6061500906944275, "rewards/margins": 0.07380706071853638, "rewards/rejected": 0.5323430299758911, "step": 2131 }, { "epoch": 1.15, "learning_rate": 7.06864457208805e-08, "logits/chosen": -2.020817756652832, "logits/rejected": -2.0226550102233887, "logps/chosen": -0.40111228823661804, "logps/rejected": -4.717337608337402, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 0.7442982196807861, "rewards/margins": 0.3107657730579376, "rewards/rejected": 0.4335324466228485, "step": 2132 }, { "epoch": 1.15, "learning_rate": 7.065993520543522e-08, "logits/chosen": -2.118927240371704, "logits/rejected": -2.119563579559326, "logps/chosen": -0.3100920617580414, "logps/rejected": -3.9947378635406494, "loss": 0.4834, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957504272460938, "rewards/margins": 0.47553712129592896, "rewards/rejected": 0.5202133059501648, "step": 2133 }, { "epoch": 1.15, "learning_rate": 7.063341768438261e-08, "logits/chosen": -2.107586145401001, "logits/rejected": -2.112886905670166, "logps/chosen": -12.991328239440918, "logps/rejected": -7.455779075622559, "loss": 0.2333, "rewards/accuracies": 1.0, "rewards/chosen": 2.101490020751953, "rewards/margins": 1.3367416858673096, "rewards/rejected": 0.7647482752799988, "step": 2134 }, { "epoch": 1.15, "learning_rate": 7.060689316671456e-08, "logits/chosen": -2.227322578430176, "logits/rejected": -2.0830671787261963, "logps/chosen": -56.463035583496094, "logps/rejected": -23.214750289916992, "loss": 0.2189, "rewards/accuracies": 1.0, "rewards/chosen": 2.2980446815490723, "rewards/margins": 1.407780408859253, "rewards/rejected": 0.8902643322944641, "step": 2135 }, { "epoch": 1.15, "learning_rate": 7.05803616614253e-08, "logits/chosen": -2.0506346225738525, "logits/rejected": -2.2777411937713623, "logps/chosen": -0.7823816537857056, "logps/rejected": -0.8801431059837341, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 1.0398926734924316, "rewards/margins": -0.004365682601928711, "rewards/rejected": 1.0442583560943604, "step": 2136 }, { "epoch": 1.15, "learning_rate": 7.055382317751144e-08, "logits/chosen": -2.1325509548187256, "logits/rejected": -2.321495294570923, "logps/chosen": -8.748746871948242, "logps/rejected": -7.153388023376465, "loss": 0.5203, "rewards/accuracies": 1.0, "rewards/chosen": 1.1292415857315063, "rewards/margins": 0.38195371627807617, "rewards/rejected": 0.7472878694534302, "step": 2137 }, { "epoch": 1.15, "learning_rate": 7.052727772397193e-08, "logits/chosen": -2.138829231262207, "logits/rejected": -2.137704372406006, "logps/chosen": -7.191331386566162, "logps/rejected": -2.057931661605835, "loss": 0.4845, "rewards/accuracies": 1.0, "rewards/chosen": 1.2346118688583374, "rewards/margins": 0.47269922494888306, "rewards/rejected": 0.7619126439094543, "step": 2138 }, { "epoch": 1.15, "learning_rate": 7.050072530980812e-08, "logits/chosen": -2.148684024810791, "logits/rejected": -2.139040946960449, "logps/chosen": -1.3825740814208984, "logps/rejected": -5.531330585479736, "loss": 0.4684, "rewards/accuracies": 1.0, "rewards/chosen": 1.0498008728027344, "rewards/margins": 0.5152175426483154, "rewards/rejected": 0.534583330154419, "step": 2139 }, { "epoch": 1.15, "learning_rate": 7.047416594402371e-08, "logits/chosen": -2.142303943634033, "logits/rejected": -2.1413023471832275, "logps/chosen": -24.68224334716797, "logps/rejected": -8.407247543334961, "loss": 0.3791, "rewards/accuracies": 1.0, "rewards/chosen": 1.2243149280548096, "rewards/margins": 0.7743085622787476, "rewards/rejected": 0.4500063955783844, "step": 2140 }, { "epoch": 1.15, "learning_rate": 7.044759963562477e-08, "logits/chosen": -2.013044834136963, "logits/rejected": -2.3421552181243896, "logps/chosen": -6.041821479797363, "logps/rejected": -5.475850582122803, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.6467093825340271, "rewards/margins": -0.006602883338928223, "rewards/rejected": 0.6533122658729553, "step": 2141 }, { "epoch": 1.16, "learning_rate": 7.042102639361967e-08, "logits/chosen": -2.1456375122070312, "logits/rejected": -2.1753859519958496, "logps/chosen": -4.77970027923584, "logps/rejected": -21.032100677490234, "loss": 0.1998, "rewards/accuracies": 1.0, "rewards/chosen": 1.663678765296936, "rewards/margins": 1.5087066888809204, "rewards/rejected": 0.15497207641601562, "step": 2142 }, { "epoch": 1.16, "learning_rate": 7.039444622701921e-08, "logits/chosen": -2.0077178478240967, "logits/rejected": -2.3020195960998535, "logps/chosen": -1.2756212949752808, "logps/rejected": -0.6482980251312256, "loss": 0.7369, "rewards/accuracies": 0.0, "rewards/chosen": 0.9575948119163513, "rewards/margins": -0.08561068773269653, "rewards/rejected": 1.0432054996490479, "step": 2143 }, { "epoch": 1.16, "learning_rate": 7.036785914483646e-08, "logits/chosen": -1.9298943281173706, "logits/rejected": -2.2360167503356934, "logps/chosen": -1.022710919380188, "logps/rejected": -1.027268886566162, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.7913125157356262, "rewards/margins": 0.025500893592834473, "rewards/rejected": 0.7658116221427917, "step": 2144 }, { "epoch": 1.16, "learning_rate": 7.034126515608688e-08, "logits/chosen": -2.043022871017456, "logits/rejected": -2.230395555496216, "logps/chosen": -0.6767069101333618, "logps/rejected": -0.6051096320152283, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.7667648196220398, "rewards/margins": -0.010437369346618652, "rewards/rejected": 0.7772021889686584, "step": 2145 }, { "epoch": 1.16, "learning_rate": 7.03146642697883e-08, "logits/chosen": -2.1693668365478516, "logits/rejected": -2.309229850769043, "logps/chosen": -2.3366942405700684, "logps/rejected": -2.3661131858825684, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.8122846484184265, "rewards/margins": 0.004891753196716309, "rewards/rejected": 0.8073928952217102, "step": 2146 }, { "epoch": 1.16, "learning_rate": 7.02880564949608e-08, "logits/chosen": -2.091297149658203, "logits/rejected": -2.0894038677215576, "logps/chosen": -0.43323197960853577, "logps/rejected": -4.550748348236084, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 0.9731349349021912, "rewards/margins": 0.5090732574462891, "rewards/rejected": 0.4640617072582245, "step": 2147 }, { "epoch": 1.16, "learning_rate": 7.026144184062688e-08, "logits/chosen": -2.2078239917755127, "logits/rejected": -2.0549910068511963, "logps/chosen": -38.07243347167969, "logps/rejected": -4.005557060241699, "loss": 0.1279, "rewards/accuracies": 1.0, "rewards/chosen": 2.497682571411133, "rewards/margins": 1.991878867149353, "rewards/rejected": 0.5058037042617798, "step": 2148 }, { "epoch": 1.16, "learning_rate": 7.023482031581134e-08, "logits/chosen": -2.1020398139953613, "logits/rejected": -2.127351760864258, "logps/chosen": -2.256481409072876, "logps/rejected": -6.6462836265563965, "loss": 0.4715, "rewards/accuracies": 1.0, "rewards/chosen": 1.1646287441253662, "rewards/margins": 0.5068899989128113, "rewards/rejected": 0.6577387452125549, "step": 2149 }, { "epoch": 1.16, "learning_rate": 7.020819192954131e-08, "logits/chosen": -2.0384087562561035, "logits/rejected": -2.047067165374756, "logps/chosen": -3.9749603271484375, "logps/rejected": -2.8145363330841064, "loss": 0.4547, "rewards/accuracies": 1.0, "rewards/chosen": 1.189706802368164, "rewards/margins": 0.5521774291992188, "rewards/rejected": 0.6375293731689453, "step": 2150 }, { "epoch": 1.16, "learning_rate": 7.018155669084623e-08, "logits/chosen": -2.128434181213379, "logits/rejected": -2.30734920501709, "logps/chosen": -3.7232351303100586, "logps/rejected": -3.74780535697937, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8620840907096863, "rewards/margins": 0.010341823101043701, "rewards/rejected": 0.8517422676086426, "step": 2151 }, { "epoch": 1.16, "learning_rate": 7.015491460875792e-08, "logits/chosen": -2.036167860031128, "logits/rejected": -2.262181043624878, "logps/chosen": -1.9847075939178467, "logps/rejected": -0.9412992000579834, "loss": 0.6192, "rewards/accuracies": 1.0, "rewards/chosen": 1.071029543876648, "rewards/margins": 0.1537860631942749, "rewards/rejected": 0.917243480682373, "step": 2152 }, { "epoch": 1.16, "learning_rate": 7.012826569231045e-08, "logits/chosen": -2.1524879932403564, "logits/rejected": -2.3281641006469727, "logps/chosen": -1.3157401084899902, "logps/rejected": -1.3865779638290405, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8679651618003845, "rewards/margins": -0.004421234130859375, "rewards/rejected": 0.8723863959312439, "step": 2153 }, { "epoch": 1.16, "learning_rate": 7.010160995054024e-08, "logits/chosen": -2.1403236389160156, "logits/rejected": -2.3330931663513184, "logps/chosen": -6.5450263023376465, "logps/rejected": -6.507282733917236, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8206748962402344, "rewards/margins": 0.004771053791046143, "rewards/rejected": 0.8159038424491882, "step": 2154 }, { "epoch": 1.16, "learning_rate": 7.007494739248603e-08, "logits/chosen": -1.9424382448196411, "logits/rejected": -2.2402799129486084, "logps/chosen": -4.08922004699707, "logps/rejected": -4.577116012573242, "loss": 0.6627, "rewards/accuracies": 1.0, "rewards/chosen": 0.9418887495994568, "rewards/margins": 0.06190675497055054, "rewards/rejected": 0.8799819946289062, "step": 2155 }, { "epoch": 1.16, "learning_rate": 7.004827802718889e-08, "logits/chosen": -2.0527360439300537, "logits/rejected": -2.053194522857666, "logps/chosen": -0.8802263736724854, "logps/rejected": -2.673200845718384, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 1.046339750289917, "rewards/margins": 0.42931675910949707, "rewards/rejected": 0.6170229911804199, "step": 2156 }, { "epoch": 1.16, "learning_rate": 7.002160186369214e-08, "logits/chosen": -2.1821868419647217, "logits/rejected": -2.1784939765930176, "logps/chosen": -7.991135597229004, "logps/rejected": -0.6746060252189636, "loss": 0.6137, "rewards/accuracies": 1.0, "rewards/chosen": 0.8641716837882996, "rewards/margins": 0.16574496030807495, "rewards/rejected": 0.6984267234802246, "step": 2157 }, { "epoch": 1.16, "learning_rate": 6.999491891104146e-08, "logits/chosen": -1.9604485034942627, "logits/rejected": -2.2933292388916016, "logps/chosen": -1.1902685165405273, "logps/rejected": -1.1848005056381226, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 1.0002163648605347, "rewards/margins": 0.03156161308288574, "rewards/rejected": 0.9686547517776489, "step": 2158 }, { "epoch": 1.16, "learning_rate": 6.996822917828477e-08, "logits/chosen": -2.040952444076538, "logits/rejected": -2.289330244064331, "logps/chosen": -0.5253778100013733, "logps/rejected": -0.5289690494537354, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 0.8835274577140808, "rewards/margins": -0.005038261413574219, "rewards/rejected": 0.888565719127655, "step": 2159 }, { "epoch": 1.17, "learning_rate": 6.994153267447238e-08, "logits/chosen": -2.1391563415527344, "logits/rejected": -2.2880213260650635, "logps/chosen": -0.7064861059188843, "logps/rejected": -2.9790332317352295, "loss": 0.639, "rewards/accuracies": 1.0, "rewards/chosen": 1.1185674667358398, "rewards/margins": 0.11136066913604736, "rewards/rejected": 1.0072067975997925, "step": 2160 }, { "epoch": 1.17, "learning_rate": 6.991482940865685e-08, "logits/chosen": -2.1154353618621826, "logits/rejected": -2.12648344039917, "logps/chosen": -2.940676212310791, "logps/rejected": -12.413806915283203, "loss": 0.5363, "rewards/accuracies": 1.0, "rewards/chosen": 1.2457197904586792, "rewards/margins": 0.3430456519126892, "rewards/rejected": 0.90267413854599, "step": 2161 }, { "epoch": 1.17, "learning_rate": 6.988811938989299e-08, "logits/chosen": -2.056474208831787, "logits/rejected": -2.3328301906585693, "logps/chosen": -4.5939130783081055, "logps/rejected": -3.4727330207824707, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.8038663864135742, "rewards/margins": -0.0023756027221679688, "rewards/rejected": 0.8062419891357422, "step": 2162 }, { "epoch": 1.17, "learning_rate": 6.986140262723794e-08, "logits/chosen": -2.1528310775756836, "logits/rejected": -2.292147159576416, "logps/chosen": -2.2362866401672363, "logps/rejected": -2.270869731903076, "loss": 0.7007, "rewards/accuracies": 0.0, "rewards/chosen": 0.9561119079589844, "rewards/margins": -0.015135586261749268, "rewards/rejected": 0.9712474942207336, "step": 2163 }, { "epoch": 1.17, "learning_rate": 6.983467912975116e-08, "logits/chosen": -2.2075774669647217, "logits/rejected": -2.203242540359497, "logps/chosen": -1.343859314918518, "logps/rejected": -3.804661512374878, "loss": 0.5231, "rewards/accuracies": 1.0, "rewards/chosen": 1.0278840065002441, "rewards/margins": 0.37496858835220337, "rewards/rejected": 0.6529154181480408, "step": 2164 }, { "epoch": 1.17, "learning_rate": 6.980794890649432e-08, "logits/chosen": -2.051500082015991, "logits/rejected": -2.3030028343200684, "logps/chosen": -3.696305513381958, "logps/rejected": -4.081089973449707, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.7081795930862427, "rewards/margins": -0.003989279270172119, "rewards/rejected": 0.7121688723564148, "step": 2165 }, { "epoch": 1.17, "learning_rate": 6.978121196653146e-08, "logits/chosen": -2.0150156021118164, "logits/rejected": -2.0095155239105225, "logps/chosen": -31.408206939697266, "logps/rejected": -8.238838195800781, "loss": 0.2871, "rewards/accuracies": 1.0, "rewards/chosen": 1.6750160455703735, "rewards/margins": 1.1009280681610107, "rewards/rejected": 0.574087917804718, "step": 2166 }, { "epoch": 1.17, "learning_rate": 6.975446831892881e-08, "logits/chosen": -2.228933572769165, "logits/rejected": -2.193800210952759, "logps/chosen": -25.541173934936523, "logps/rejected": -26.85799789428711, "loss": 0.5463, "rewards/accuracies": 1.0, "rewards/chosen": 1.3609498739242554, "rewards/margins": 0.3189377784729004, "rewards/rejected": 1.042012095451355, "step": 2167 }, { "epoch": 1.17, "learning_rate": 6.972771797275492e-08, "logits/chosen": -2.2014315128326416, "logits/rejected": -2.1958632469177246, "logps/chosen": -1.9623467922210693, "logps/rejected": -10.759220123291016, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 1.3350924253463745, "rewards/margins": 0.8374279737472534, "rewards/rejected": 0.4976644515991211, "step": 2168 }, { "epoch": 1.17, "learning_rate": 6.97009609370806e-08, "logits/chosen": -1.9673690795898438, "logits/rejected": -2.033550977706909, "logps/chosen": -4.9868245124816895, "logps/rejected": -22.694746017456055, "loss": 0.2962, "rewards/accuracies": 1.0, "rewards/chosen": 1.4817345142364502, "rewards/margins": 1.065067172050476, "rewards/rejected": 0.4166673719882965, "step": 2169 }, { "epoch": 1.17, "learning_rate": 6.967419722097894e-08, "logits/chosen": -1.9893001317977905, "logits/rejected": -2.248311996459961, "logps/chosen": -0.656173586845398, "logps/rejected": -0.7854296565055847, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.8897436261177063, "rewards/margins": -0.0007440447807312012, "rewards/rejected": 0.8904876708984375, "step": 2170 }, { "epoch": 1.17, "learning_rate": 6.964742683352528e-08, "logits/chosen": -2.0520551204681396, "logits/rejected": -2.0567269325256348, "logps/chosen": -1.1951160430908203, "logps/rejected": -18.573455810546875, "loss": 0.6379, "rewards/accuracies": 1.0, "rewards/chosen": 1.0437910556793213, "rewards/margins": 0.1136629581451416, "rewards/rejected": 0.9301280975341797, "step": 2171 }, { "epoch": 1.17, "learning_rate": 6.962064978379724e-08, "logits/chosen": -2.0862584114074707, "logits/rejected": -2.0881025791168213, "logps/chosen": -4.5907464027404785, "logps/rejected": -2.46954345703125, "loss": 0.2456, "rewards/accuracies": 1.0, "rewards/chosen": 1.871352195739746, "rewards/margins": 1.2788370847702026, "rewards/rejected": 0.5925151109695435, "step": 2172 }, { "epoch": 1.17, "learning_rate": 6.959386608087466e-08, "logits/chosen": -2.052130937576294, "logits/rejected": -2.2918777465820312, "logps/chosen": -1.2117950916290283, "logps/rejected": -1.349761724472046, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.9533622860908508, "rewards/margins": 0.01987636089324951, "rewards/rejected": 0.9334859251976013, "step": 2173 }, { "epoch": 1.17, "learning_rate": 6.95670757338397e-08, "logits/chosen": -1.9472286701202393, "logits/rejected": -2.2193565368652344, "logps/chosen": -1.0981743335723877, "logps/rejected": -1.0289044380187988, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.9888788461685181, "rewards/margins": 0.046425044536590576, "rewards/rejected": 0.9424538016319275, "step": 2174 }, { "epoch": 1.17, "learning_rate": 6.954027875177672e-08, "logits/chosen": -2.0219740867614746, "logits/rejected": -2.245924472808838, "logps/chosen": -3.234992742538452, "logps/rejected": -3.4539413452148438, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.7384389042854309, "rewards/margins": 0.02682429552078247, "rewards/rejected": 0.7116146087646484, "step": 2175 }, { "epoch": 1.17, "learning_rate": 6.951347514377237e-08, "logits/chosen": -2.0454139709472656, "logits/rejected": -2.2473080158233643, "logps/chosen": -0.5615006685256958, "logps/rejected": -0.47208964824676514, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 0.7631613612174988, "rewards/margins": 0.005831241607666016, "rewards/rejected": 0.7573301196098328, "step": 2176 }, { "epoch": 1.17, "learning_rate": 6.948666491891549e-08, "logits/chosen": -2.0384771823883057, "logits/rejected": -2.052095651626587, "logps/chosen": -0.8004845380783081, "logps/rejected": -3.4087376594543457, "loss": 0.5926, "rewards/accuracies": 1.0, "rewards/chosen": 1.0137041807174683, "rewards/margins": 0.21229404211044312, "rewards/rejected": 0.8014101386070251, "step": 2177 }, { "epoch": 1.17, "learning_rate": 6.945984808629722e-08, "logits/chosen": -2.0285284519195557, "logits/rejected": -2.2827298641204834, "logps/chosen": -2.327155590057373, "logps/rejected": -2.4417316913604736, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8598737716674805, "rewards/margins": 0.012617111206054688, "rewards/rejected": 0.8472566604614258, "step": 2178 }, { "epoch": 1.18, "learning_rate": 6.943302465501089e-08, "logits/chosen": -2.0476913452148438, "logits/rejected": -2.2462034225463867, "logps/chosen": -3.864283800125122, "logps/rejected": -3.8577821254730225, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.7538529634475708, "rewards/margins": 0.04368555545806885, "rewards/rejected": 0.710167407989502, "step": 2179 }, { "epoch": 1.18, "learning_rate": 6.940619463415214e-08, "logits/chosen": -1.9922568798065186, "logits/rejected": -2.2989513874053955, "logps/chosen": -2.200028896331787, "logps/rejected": -2.381960868835449, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.5908877849578857, "rewards/margins": 0.03380709886550903, "rewards/rejected": 0.5570806860923767, "step": 2180 }, { "epoch": 1.18, "learning_rate": 6.937935803281878e-08, "logits/chosen": -1.9756394624710083, "logits/rejected": -1.9792059659957886, "logps/chosen": -0.9927603006362915, "logps/rejected": -1.7831212282180786, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 1.1576943397521973, "rewards/margins": 0.39639461040496826, "rewards/rejected": 0.761299729347229, "step": 2181 }, { "epoch": 1.18, "learning_rate": 6.935251486011087e-08, "logits/chosen": -2.002479314804077, "logits/rejected": -2.154853582382202, "logps/chosen": -1.1841645240783691, "logps/rejected": -1.1320500373840332, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9014453887939453, "rewards/margins": -0.008765816688537598, "rewards/rejected": 0.9102112054824829, "step": 2182 }, { "epoch": 1.18, "learning_rate": 6.932566512513067e-08, "logits/chosen": -2.0576529502868652, "logits/rejected": -2.2708182334899902, "logps/chosen": -0.8045356273651123, "logps/rejected": -1.0484576225280762, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.7789637446403503, "rewards/margins": 0.04650270938873291, "rewards/rejected": 0.7324610352516174, "step": 2183 }, { "epoch": 1.18, "learning_rate": 6.929880883698276e-08, "logits/chosen": -2.081176996231079, "logits/rejected": -2.0935120582580566, "logps/chosen": -2.228266477584839, "logps/rejected": -4.526612281799316, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.0562036037445068, "rewards/margins": 0.04818165302276611, "rewards/rejected": 1.0080219507217407, "step": 2184 }, { "epoch": 1.18, "learning_rate": 6.927194600477383e-08, "logits/chosen": -2.0755207538604736, "logits/rejected": -2.3458945751190186, "logps/chosen": -0.23117710649967194, "logps/rejected": -0.42320072650909424, "loss": 0.6732, "rewards/accuracies": 1.0, "rewards/chosen": 1.0181001424789429, "rewards/margins": 0.040341317653656006, "rewards/rejected": 0.9777588248252869, "step": 2185 }, { "epoch": 1.18, "learning_rate": 6.924507663761286e-08, "logits/chosen": -2.056220769882202, "logits/rejected": -2.2914555072784424, "logps/chosen": -1.7008981704711914, "logps/rejected": -17.444345474243164, "loss": 0.3947, "rewards/accuracies": 1.0, "rewards/chosen": 0.9857980608940125, "rewards/margins": 0.7257812023162842, "rewards/rejected": 0.2600168287754059, "step": 2186 }, { "epoch": 1.18, "learning_rate": 6.9218200744611e-08, "logits/chosen": -2.0875494480133057, "logits/rejected": -2.276695966720581, "logps/chosen": -0.30439215898513794, "logps/rejected": -0.3581123352050781, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8037289977073669, "rewards/margins": 0.010013997554779053, "rewards/rejected": 0.7937150001525879, "step": 2187 }, { "epoch": 1.18, "learning_rate": 6.919131833488164e-08, "logits/chosen": -2.1015255451202393, "logits/rejected": -2.2941513061523438, "logps/chosen": -0.7926260232925415, "logps/rejected": -0.7320880889892578, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.817681610584259, "rewards/margins": 0.0016540288925170898, "rewards/rejected": 0.8160275816917419, "step": 2188 }, { "epoch": 1.18, "learning_rate": 6.916442941754041e-08, "logits/chosen": -2.053603410720825, "logits/rejected": -2.0467207431793213, "logps/chosen": -4.1851806640625, "logps/rejected": -3.2624168395996094, "loss": 0.5304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9010574221611023, "rewards/margins": 0.3572031855583191, "rewards/rejected": 0.5438542366027832, "step": 2189 }, { "epoch": 1.18, "learning_rate": 6.913753400170507e-08, "logits/chosen": -1.8827481269836426, "logits/rejected": -2.2638986110687256, "logps/chosen": -0.9875327944755554, "logps/rejected": -1.036494255065918, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 1.055484414100647, "rewards/margins": -0.007207751274108887, "rewards/rejected": 1.0626921653747559, "step": 2190 }, { "epoch": 1.18, "learning_rate": 6.911063209649565e-08, "logits/chosen": -2.090683698654175, "logits/rejected": -2.0934603214263916, "logps/chosen": -3.623112916946411, "logps/rejected": -0.9144036769866943, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.715676486492157, "rewards/margins": 0.026804804801940918, "rewards/rejected": 0.6888716816902161, "step": 2191 }, { "epoch": 1.18, "learning_rate": 6.908372371103434e-08, "logits/chosen": -2.1092159748077393, "logits/rejected": -2.1081016063690186, "logps/chosen": -1.2792575359344482, "logps/rejected": -3.2756593227386475, "loss": 0.537, "rewards/accuracies": 1.0, "rewards/chosen": 0.9039067625999451, "rewards/margins": 0.3411804437637329, "rewards/rejected": 0.5627263188362122, "step": 2192 }, { "epoch": 1.18, "learning_rate": 6.905680885444554e-08, "logits/chosen": -2.1593000888824463, "logits/rejected": -2.117957830429077, "logps/chosen": -22.758846282958984, "logps/rejected": -3.7465832233428955, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 1.5602439641952515, "rewards/margins": 0.9685267806053162, "rewards/rejected": 0.5917171835899353, "step": 2193 }, { "epoch": 1.18, "learning_rate": 6.902988753585588e-08, "logits/chosen": -2.0636556148529053, "logits/rejected": -2.258147716522217, "logps/chosen": -1.1459954977035522, "logps/rejected": -1.1562695503234863, "loss": 0.7022, "rewards/accuracies": 0.0, "rewards/chosen": 0.9453949332237244, "rewards/margins": -0.01796203851699829, "rewards/rejected": 0.9633569717407227, "step": 2194 }, { "epoch": 1.18, "learning_rate": 6.900295976439412e-08, "logits/chosen": -1.9521034955978394, "logits/rejected": -1.9510375261306763, "logps/chosen": -7.880643844604492, "logps/rejected": -2.9907901287078857, "loss": 0.386, "rewards/accuracies": 1.0, "rewards/chosen": 1.389569878578186, "rewards/margins": 0.752872884273529, "rewards/rejected": 0.636696994304657, "step": 2195 }, { "epoch": 1.18, "learning_rate": 6.897602554919123e-08, "logits/chosen": -2.064621686935425, "logits/rejected": -2.0638270378112793, "logps/chosen": -1.1111291646957397, "logps/rejected": -1.6738438606262207, "loss": 0.5925, "rewards/accuracies": 1.0, "rewards/chosen": 1.0025981664657593, "rewards/margins": 0.21262454986572266, "rewards/rejected": 0.7899736166000366, "step": 2196 }, { "epoch": 1.19, "learning_rate": 6.894908489938041e-08, "logits/chosen": -2.018249034881592, "logits/rejected": -2.249738931655884, "logps/chosen": -0.8955532312393188, "logps/rejected": -0.9136359691619873, "loss": 0.6632, "rewards/accuracies": 1.0, "rewards/chosen": 0.9687889218330383, "rewards/margins": 0.060912489891052246, "rewards/rejected": 0.9078764319419861, "step": 2197 }, { "epoch": 1.19, "learning_rate": 6.892213782409694e-08, "logits/chosen": -1.997441291809082, "logits/rejected": -2.2438297271728516, "logps/chosen": -2.4345529079437256, "logps/rejected": -2.416252851486206, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.6838923692703247, "rewards/margins": 0.03875678777694702, "rewards/rejected": 0.6451355814933777, "step": 2198 }, { "epoch": 1.19, "learning_rate": 6.889518433247843e-08, "logits/chosen": -2.227398633956909, "logits/rejected": -2.1778454780578613, "logps/chosen": -24.19631576538086, "logps/rejected": -4.043217182159424, "loss": 0.2466, "rewards/accuracies": 1.0, "rewards/chosen": 2.068809986114502, "rewards/margins": 1.2740411758422852, "rewards/rejected": 0.794768750667572, "step": 2199 }, { "epoch": 1.19, "learning_rate": 6.886822443366451e-08, "logits/chosen": -2.1451380252838135, "logits/rejected": -2.1703648567199707, "logps/chosen": -1.1277767419815063, "logps/rejected": -9.392674446105957, "loss": 0.4321, "rewards/accuracies": 1.0, "rewards/chosen": 1.0719146728515625, "rewards/margins": 0.6153694987297058, "rewards/rejected": 0.4565451741218567, "step": 2200 }, { "epoch": 1.19, "learning_rate": 6.884125813679706e-08, "logits/chosen": -2.1449027061462402, "logits/rejected": -2.3372654914855957, "logps/chosen": -0.9712745547294617, "logps/rejected": -0.9466977119445801, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9291884303092957, "rewards/margins": 0.006502211093902588, "rewards/rejected": 0.9226862192153931, "step": 2201 }, { "epoch": 1.19, "learning_rate": 6.881428545102015e-08, "logits/chosen": -2.0334084033966064, "logits/rejected": -2.033278465270996, "logps/chosen": -6.226076126098633, "logps/rejected": -2.350950002670288, "loss": 0.328, "rewards/accuracies": 1.0, "rewards/chosen": 1.5689424276351929, "rewards/margins": 0.9463151693344116, "rewards/rejected": 0.6226272583007812, "step": 2202 }, { "epoch": 1.19, "learning_rate": 6.878730638547995e-08, "logits/chosen": -2.0624399185180664, "logits/rejected": -2.0470848083496094, "logps/chosen": -19.429752349853516, "logps/rejected": -5.1211347579956055, "loss": 0.4872, "rewards/accuracies": 1.0, "rewards/chosen": 1.341630220413208, "rewards/margins": 0.46560364961624146, "rewards/rejected": 0.8760265707969666, "step": 2203 }, { "epoch": 1.19, "learning_rate": 6.876032094932487e-08, "logits/chosen": -2.1071832180023193, "logits/rejected": -2.0783028602600098, "logps/chosen": -28.04590606689453, "logps/rejected": -17.00594711303711, "loss": 0.3631, "rewards/accuracies": 1.0, "rewards/chosen": 1.694148302078247, "rewards/margins": 0.8259180188179016, "rewards/rejected": 0.8682302832603455, "step": 2204 }, { "epoch": 1.19, "learning_rate": 6.873332915170543e-08, "logits/chosen": -2.1472997665405273, "logits/rejected": -2.1512153148651123, "logps/chosen": -3.5665955543518066, "logps/rejected": -9.48359203338623, "loss": 0.4403, "rewards/accuracies": 1.0, "rewards/chosen": 1.327807068824768, "rewards/margins": 0.5920220613479614, "rewards/rejected": 0.7357850074768066, "step": 2205 }, { "epoch": 1.19, "learning_rate": 6.870633100177427e-08, "logits/chosen": -2.012949228286743, "logits/rejected": -2.2135937213897705, "logps/chosen": -2.0125041007995605, "logps/rejected": -1.809927225112915, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.9368509650230408, "rewards/margins": -0.0037360787391662598, "rewards/rejected": 0.940587043762207, "step": 2206 }, { "epoch": 1.19, "learning_rate": 6.867932650868629e-08, "logits/chosen": -2.121363401412964, "logits/rejected": -2.1147255897521973, "logps/chosen": -1.5019639730453491, "logps/rejected": -7.506224155426025, "loss": 0.4236, "rewards/accuracies": 1.0, "rewards/chosen": 1.1160892248153687, "rewards/margins": 0.63965904712677, "rewards/rejected": 0.47643017768859863, "step": 2207 }, { "epoch": 1.19, "learning_rate": 6.865231568159846e-08, "logits/chosen": -2.016839027404785, "logits/rejected": -2.021214723587036, "logps/chosen": -0.6942183375358582, "logps/rejected": -2.954507350921631, "loss": 0.5079, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878212809562683, "rewards/margins": 0.41278767585754395, "rewards/rejected": 0.5750336050987244, "step": 2208 }, { "epoch": 1.19, "learning_rate": 6.862529852966994e-08, "logits/chosen": -1.9274362325668335, "logits/rejected": -2.193173885345459, "logps/chosen": -0.3435143828392029, "logps/rejected": -0.36761197447776794, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.9707283973693848, "rewards/margins": -0.0061209797859191895, "rewards/rejected": 0.976849377155304, "step": 2209 }, { "epoch": 1.19, "learning_rate": 6.859827506206198e-08, "logits/chosen": -2.0048668384552, "logits/rejected": -2.3054189682006836, "logps/chosen": -2.7231221199035645, "logps/rejected": -2.5014288425445557, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.5747185349464417, "rewards/margins": -0.02136361598968506, "rewards/rejected": 0.5960821509361267, "step": 2210 }, { "epoch": 1.19, "learning_rate": 6.857124528793803e-08, "logits/chosen": -1.9688371419906616, "logits/rejected": -2.267198324203491, "logps/chosen": -1.5690152645111084, "logps/rejected": -1.5127160549163818, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 1.0659502744674683, "rewards/margins": 0.036646366119384766, "rewards/rejected": 1.0293039083480835, "step": 2211 }, { "epoch": 1.19, "learning_rate": 6.854420921646365e-08, "logits/chosen": -2.0287961959838867, "logits/rejected": -2.298322916030884, "logps/chosen": -3.0721306800842285, "logps/rejected": -2.72432279586792, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.8454872369766235, "rewards/margins": -0.00996243953704834, "rewards/rejected": 0.8554496765136719, "step": 2212 }, { "epoch": 1.19, "learning_rate": 6.851716685680653e-08, "logits/chosen": -2.002843141555786, "logits/rejected": -1.9840326309204102, "logps/chosen": -32.99256896972656, "logps/rejected": -8.680914878845215, "loss": 0.3021, "rewards/accuracies": 1.0, "rewards/chosen": 1.390929102897644, "rewards/margins": 1.0421115159988403, "rewards/rejected": 0.3488175570964813, "step": 2213 }, { "epoch": 1.19, "learning_rate": 6.849011821813653e-08, "logits/chosen": -2.070875406265259, "logits/rejected": -2.2612366676330566, "logps/chosen": -2.019174337387085, "logps/rejected": -1.9912712574005127, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.8152560591697693, "rewards/margins": 0.016964435577392578, "rewards/rejected": 0.7982916235923767, "step": 2214 }, { "epoch": 1.19, "learning_rate": 6.846306330962559e-08, "logits/chosen": -2.022937536239624, "logits/rejected": -2.2141494750976562, "logps/chosen": -10.072628021240234, "logps/rejected": -8.312186241149902, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.5358451008796692, "rewards/margins": 0.014426589012145996, "rewards/rejected": 0.5214185118675232, "step": 2215 }, { "epoch": 1.2, "learning_rate": 6.84360021404478e-08, "logits/chosen": -1.9928022623062134, "logits/rejected": -1.9957374334335327, "logps/chosen": -3.1636199951171875, "logps/rejected": -6.333252906799316, "loss": 0.5126, "rewards/accuracies": 1.0, "rewards/chosen": 0.9075042009353638, "rewards/margins": 0.40108299255371094, "rewards/rejected": 0.5064212083816528, "step": 2216 }, { "epoch": 1.2, "learning_rate": 6.840893471977938e-08, "logits/chosen": -1.9999854564666748, "logits/rejected": -2.0142862796783447, "logps/chosen": -30.186542510986328, "logps/rejected": -20.94270133972168, "loss": 0.5287, "rewards/accuracies": 1.0, "rewards/chosen": 0.7728614807128906, "rewards/margins": 0.3612697422504425, "rewards/rejected": 0.4115917384624481, "step": 2217 }, { "epoch": 1.2, "learning_rate": 6.838186105679864e-08, "logits/chosen": -1.9944522380828857, "logits/rejected": -2.227250337600708, "logps/chosen": -0.4971403181552887, "logps/rejected": -0.45476698875427246, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9081048369407654, "rewards/margins": 0.0011265873908996582, "rewards/rejected": 0.9069782495498657, "step": 2218 }, { "epoch": 1.2, "learning_rate": 6.835478116068607e-08, "logits/chosen": -2.0758650302886963, "logits/rejected": -2.2679576873779297, "logps/chosen": -0.45023828744888306, "logps/rejected": -0.5236179828643799, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8564240336418152, "rewards/margins": 0.011009395122528076, "rewards/rejected": 0.8454146385192871, "step": 2219 }, { "epoch": 1.2, "learning_rate": 6.832769504062418e-08, "logits/chosen": -2.017707347869873, "logits/rejected": -2.010911226272583, "logps/chosen": -14.809853553771973, "logps/rejected": -5.996762275695801, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 1.5302494764328003, "rewards/margins": 0.7979300022125244, "rewards/rejected": 0.7323194742202759, "step": 2220 }, { "epoch": 1.2, "learning_rate": 6.830060270579768e-08, "logits/chosen": -2.063001871109009, "logits/rejected": -2.2378227710723877, "logps/chosen": -1.040229082107544, "logps/rejected": -4.03120756149292, "loss": 0.5882, "rewards/accuracies": 1.0, "rewards/chosen": 0.8437016606330872, "rewards/margins": 0.22228294610977173, "rewards/rejected": 0.6214187145233154, "step": 2221 }, { "epoch": 1.2, "learning_rate": 6.827350416539333e-08, "logits/chosen": -2.2003138065338135, "logits/rejected": -2.287933349609375, "logps/chosen": -18.43923568725586, "logps/rejected": -9.723341941833496, "loss": 0.6036, "rewards/accuracies": 1.0, "rewards/chosen": 1.0222917795181274, "rewards/margins": 0.18784970045089722, "rewards/rejected": 0.8344420790672302, "step": 2222 }, { "epoch": 1.2, "learning_rate": 6.824639942860001e-08, "logits/chosen": -1.9653198719024658, "logits/rejected": -1.9655510187149048, "logps/chosen": -1.7677983045578003, "logps/rejected": -0.8696110248565674, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.8595229983329773, "rewards/margins": -0.002774178981781006, "rewards/rejected": 0.8622971773147583, "step": 2223 }, { "epoch": 1.2, "learning_rate": 6.821928850460873e-08, "logits/chosen": -1.946902871131897, "logits/rejected": -2.227952480316162, "logps/chosen": -8.117865562438965, "logps/rejected": -1.252488613128662, "loss": 0.6109, "rewards/accuracies": 1.0, "rewards/chosen": 1.0851324796676636, "rewards/margins": 0.17180222272872925, "rewards/rejected": 0.9133302569389343, "step": 2224 }, { "epoch": 1.2, "learning_rate": 6.819217140261255e-08, "logits/chosen": -2.0627505779266357, "logits/rejected": -2.25226092338562, "logps/chosen": -0.7451794743537903, "logps/rejected": -0.7505922317504883, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8196921348571777, "rewards/margins": 0.02955615520477295, "rewards/rejected": 0.7901359796524048, "step": 2225 }, { "epoch": 1.2, "learning_rate": 6.816504813180663e-08, "logits/chosen": -2.1315953731536865, "logits/rejected": -2.0013976097106934, "logps/chosen": -35.51533889770508, "logps/rejected": -1.8985533714294434, "loss": 0.229, "rewards/accuracies": 1.0, "rewards/chosen": 2.0832879543304443, "rewards/margins": 1.3572108745574951, "rewards/rejected": 0.726077139377594, "step": 2226 }, { "epoch": 1.2, "learning_rate": 6.813791870138827e-08, "logits/chosen": -1.9807438850402832, "logits/rejected": -2.2989344596862793, "logps/chosen": -1.0038927793502808, "logps/rejected": -1.0613276958465576, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9866103529930115, "rewards/margins": 0.0022755861282348633, "rewards/rejected": 0.9843347668647766, "step": 2227 }, { "epoch": 1.2, "learning_rate": 6.811078312055685e-08, "logits/chosen": -2.040989637374878, "logits/rejected": -2.0403006076812744, "logps/chosen": -3.1775083541870117, "logps/rejected": -3.799960136413574, "loss": 0.3007, "rewards/accuracies": 1.0, "rewards/chosen": 1.5567883253097534, "rewards/margins": 1.0476150512695312, "rewards/rejected": 0.5091733336448669, "step": 2228 }, { "epoch": 1.2, "learning_rate": 6.808364139851375e-08, "logits/chosen": -2.0968616008758545, "logits/rejected": -2.29915714263916, "logps/chosen": -3.9358835220336914, "logps/rejected": -3.9012279510498047, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.9604710936546326, "rewards/margins": -0.005396068096160889, "rewards/rejected": 0.9658671617507935, "step": 2229 }, { "epoch": 1.2, "learning_rate": 6.805649354446255e-08, "logits/chosen": -1.9699031114578247, "logits/rejected": -2.0127644538879395, "logps/chosen": -7.262634754180908, "logps/rejected": -8.360355377197266, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 1.6095342636108398, "rewards/margins": 0.9149603843688965, "rewards/rejected": 0.6945738792419434, "step": 2230 }, { "epoch": 1.2, "learning_rate": 6.802933956760881e-08, "logits/chosen": -2.0302786827087402, "logits/rejected": -2.250427007675171, "logps/chosen": -1.3413357734680176, "logps/rejected": -45.95296859741211, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": 0.8736600875854492, "rewards/margins": 1.2388994693756104, "rewards/rejected": -0.36523935198783875, "step": 2231 }, { "epoch": 1.2, "learning_rate": 6.800217947716025e-08, "logits/chosen": -2.069176197052002, "logits/rejected": -2.3217616081237793, "logps/chosen": -0.34683388471603394, "logps/rejected": -0.4144562780857086, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978241086006165, "rewards/margins": 0.012543797492980957, "rewards/rejected": 0.8852803111076355, "step": 2232 }, { "epoch": 1.2, "learning_rate": 6.797501328232661e-08, "logits/chosen": -2.0349698066711426, "logits/rejected": -2.040635108947754, "logps/chosen": -2.1176865100860596, "logps/rejected": -1.8992443084716797, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": 1.1498181819915771, "rewards/margins": 0.4541471600532532, "rewards/rejected": 0.695671021938324, "step": 2233 }, { "epoch": 1.2, "learning_rate": 6.794784099231971e-08, "logits/chosen": -1.9497661590576172, "logits/rejected": -1.9567248821258545, "logps/chosen": -0.4790644645690918, "logps/rejected": -9.559248924255371, "loss": 0.3914, "rewards/accuracies": 1.0, "rewards/chosen": 1.038854956626892, "rewards/margins": 0.7359462976455688, "rewards/rejected": 0.30290862917900085, "step": 2234 }, { "epoch": 1.21, "learning_rate": 6.792066261635343e-08, "logits/chosen": -2.1258506774902344, "logits/rejected": -2.122457981109619, "logps/chosen": -9.923921585083008, "logps/rejected": -2.5610384941101074, "loss": 0.4368, "rewards/accuracies": 1.0, "rewards/chosen": 1.2848070859909058, "rewards/margins": 0.6019132733345032, "rewards/rejected": 0.6828938126564026, "step": 2235 }, { "epoch": 1.21, "learning_rate": 6.789347816364377e-08, "logits/chosen": -2.1362972259521484, "logits/rejected": -2.264988422393799, "logps/chosen": -5.167978286743164, "logps/rejected": -4.927946090698242, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8204291462898254, "rewards/margins": 0.021909892559051514, "rewards/rejected": 0.7985192537307739, "step": 2236 }, { "epoch": 1.21, "learning_rate": 6.786628764340868e-08, "logits/chosen": -2.031757116317749, "logits/rejected": -2.024729013442993, "logps/chosen": -2.2414231300354004, "logps/rejected": -7.369359970092773, "loss": 0.4826, "rewards/accuracies": 1.0, "rewards/chosen": 1.0835357904434204, "rewards/margins": 0.4774778485298157, "rewards/rejected": 0.6060579419136047, "step": 2237 }, { "epoch": 1.21, "learning_rate": 6.783909106486829e-08, "logits/chosen": -2.0292999744415283, "logits/rejected": -2.0237948894500732, "logps/chosen": -3.1364047527313232, "logps/rejected": -2.791264295578003, "loss": 0.6088, "rewards/accuracies": 1.0, "rewards/chosen": 0.9434741139411926, "rewards/margins": 0.1765238642692566, "rewards/rejected": 0.766950249671936, "step": 2238 }, { "epoch": 1.21, "learning_rate": 6.78118884372447e-08, "logits/chosen": -2.140991449356079, "logits/rejected": -2.1468212604522705, "logps/chosen": -6.419661998748779, "logps/rejected": -2.9237375259399414, "loss": 0.7561, "rewards/accuracies": 0.0, "rewards/chosen": 1.1004066467285156, "rewards/margins": -0.1221688985824585, "rewards/rejected": 1.2225755453109741, "step": 2239 }, { "epoch": 1.21, "learning_rate": 6.77846797697621e-08, "logits/chosen": -2.0358574390411377, "logits/rejected": -2.273369789123535, "logps/chosen": -0.788484513759613, "logps/rejected": -0.8147833943367004, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.914665699005127, "rewards/margins": 0.03413587808609009, "rewards/rejected": 0.8805298209190369, "step": 2240 }, { "epoch": 1.21, "learning_rate": 6.77574650716467e-08, "logits/chosen": -1.9955978393554688, "logits/rejected": -2.0011682510375977, "logps/chosen": -2.052321434020996, "logps/rejected": -3.1066298484802246, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 0.9340187907218933, "rewards/margins": 0.35157281160354614, "rewards/rejected": 0.5824459791183472, "step": 2241 }, { "epoch": 1.21, "learning_rate": 6.773024435212677e-08, "logits/chosen": -1.9901621341705322, "logits/rejected": -2.2865171432495117, "logps/chosen": -5.184386253356934, "logps/rejected": -4.959733486175537, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.7759532332420349, "rewards/margins": 0.0003393888473510742, "rewards/rejected": 0.7756138443946838, "step": 2242 }, { "epoch": 1.21, "learning_rate": 6.770301762043266e-08, "logits/chosen": -2.063244342803955, "logits/rejected": -2.062532424926758, "logps/chosen": -4.178904056549072, "logps/rejected": -3.313850164413452, "loss": 0.5834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8908497095108032, "rewards/margins": 0.23295116424560547, "rewards/rejected": 0.6578985452651978, "step": 2243 }, { "epoch": 1.21, "learning_rate": 6.767578488579668e-08, "logits/chosen": -2.097480535507202, "logits/rejected": -2.272608995437622, "logps/chosen": -0.33271175622940063, "logps/rejected": -0.30886775255203247, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 0.8950382471084595, "rewards/margins": 0.0023252367973327637, "rewards/rejected": 0.8927130103111267, "step": 2244 }, { "epoch": 1.21, "learning_rate": 6.764854615745324e-08, "logits/chosen": -2.1424314975738525, "logits/rejected": -2.1198959350585938, "logps/chosen": -23.973831176757812, "logps/rejected": -15.416568756103516, "loss": 0.3535, "rewards/accuracies": 1.0, "rewards/chosen": 1.7018059492111206, "rewards/margins": 0.8578895926475525, "rewards/rejected": 0.8439163565635681, "step": 2245 }, { "epoch": 1.21, "learning_rate": 6.762130144463875e-08, "logits/chosen": -2.065251111984253, "logits/rejected": -2.0613672733306885, "logps/chosen": -12.224625587463379, "logps/rejected": -11.4923677444458, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 0.9771596193313599, "rewards/margins": 0.034311890602111816, "rewards/rejected": 0.942847728729248, "step": 2246 }, { "epoch": 1.21, "learning_rate": 6.759405075659165e-08, "logits/chosen": -2.0154478549957275, "logits/rejected": -2.325751543045044, "logps/chosen": -1.0382064580917358, "logps/rejected": -0.962670624256134, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 0.9949882626533508, "rewards/margins": -0.015761077404022217, "rewards/rejected": 1.010749340057373, "step": 2247 }, { "epoch": 1.21, "learning_rate": 6.756679410255244e-08, "logits/chosen": -1.9733593463897705, "logits/rejected": -2.2504889965057373, "logps/chosen": -0.6879525780677795, "logps/rejected": -0.7926490902900696, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 0.8078100085258484, "rewards/margins": 0.0536574125289917, "rewards/rejected": 0.7541525959968567, "step": 2248 }, { "epoch": 1.21, "learning_rate": 6.753953149176361e-08, "logits/chosen": -1.9978017807006836, "logits/rejected": -2.251051664352417, "logps/chosen": -0.42596668004989624, "logps/rejected": -0.41174328327178955, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9791107177734375, "rewards/margins": 0.026267528533935547, "rewards/rejected": 0.952843189239502, "step": 2249 }, { "epoch": 1.21, "learning_rate": 6.751226293346967e-08, "logits/chosen": -2.0940816402435303, "logits/rejected": -2.0942907333374023, "logps/chosen": -1.815529227256775, "logps/rejected": -0.9514538049697876, "loss": 0.6552, "rewards/accuracies": 1.0, "rewards/chosen": 0.9405228495597839, "rewards/margins": 0.07733237743377686, "rewards/rejected": 0.8631904721260071, "step": 2250 }, { "epoch": 1.21, "learning_rate": 6.748498843691715e-08, "logits/chosen": -1.9616944789886475, "logits/rejected": -2.2957210540771484, "logps/chosen": -0.3420378565788269, "logps/rejected": -0.3649955689907074, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491471648216248, "rewards/margins": 0.008882641792297363, "rewards/rejected": 0.9402645230293274, "step": 2251 }, { "epoch": 1.21, "learning_rate": 6.74577080113546e-08, "logits/chosen": -1.9297877550125122, "logits/rejected": -1.9117097854614258, "logps/chosen": -8.564631462097168, "logps/rejected": -1.163684606552124, "loss": 0.5886, "rewards/accuracies": 1.0, "rewards/chosen": 1.287239909172058, "rewards/margins": 0.22142422199249268, "rewards/rejected": 1.0658156871795654, "step": 2252 }, { "epoch": 1.22, "learning_rate": 6.743042166603262e-08, "logits/chosen": -2.153926372528076, "logits/rejected": -2.150770425796509, "logps/chosen": -0.4307692050933838, "logps/rejected": -3.9451210498809814, "loss": 0.5155, "rewards/accuracies": 1.0, "rewards/chosen": 0.8634570240974426, "rewards/margins": 0.3937895894050598, "rewards/rejected": 0.4696674346923828, "step": 2253 }, { "epoch": 1.22, "learning_rate": 6.740312941020371e-08, "logits/chosen": -2.083402395248413, "logits/rejected": -2.2541491985321045, "logps/chosen": -9.516310691833496, "logps/rejected": -10.984615325927734, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9288241267204285, "rewards/margins": 0.0012679100036621094, "rewards/rejected": 0.9275562167167664, "step": 2254 }, { "epoch": 1.22, "learning_rate": 6.737583125312248e-08, "logits/chosen": -2.0997490882873535, "logits/rejected": -2.3030319213867188, "logps/chosen": -0.8318713903427124, "logps/rejected": -0.8492018580436707, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9721771478652954, "rewards/margins": 0.02322930097579956, "rewards/rejected": 0.9489478468894958, "step": 2255 }, { "epoch": 1.22, "learning_rate": 6.734852720404551e-08, "logits/chosen": -1.9380897283554077, "logits/rejected": -2.2927160263061523, "logps/chosen": -7.828677177429199, "logps/rejected": -8.08072566986084, "loss": 0.6557, "rewards/accuracies": 1.0, "rewards/chosen": 1.0445045232772827, "rewards/margins": 0.07643228769302368, "rewards/rejected": 0.968072235584259, "step": 2256 }, { "epoch": 1.22, "learning_rate": 6.732121727223134e-08, "logits/chosen": -2.0717921257019043, "logits/rejected": -2.0715715885162354, "logps/chosen": -1.3509539365768433, "logps/rejected": -2.1218981742858887, "loss": 0.6466, "rewards/accuracies": 1.0, "rewards/chosen": 1.0682182312011719, "rewards/margins": 0.09528577327728271, "rewards/rejected": 0.9729324579238892, "step": 2257 }, { "epoch": 1.22, "learning_rate": 6.729390146694055e-08, "logits/chosen": -1.9816172122955322, "logits/rejected": -2.260012626647949, "logps/chosen": -1.8642499446868896, "logps/rejected": -10.653748512268066, "loss": 0.6358, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438156485557556, "rewards/margins": 0.11812829971313477, "rewards/rejected": 0.8256873488426208, "step": 2258 }, { "epoch": 1.22, "learning_rate": 6.726657979743568e-08, "logits/chosen": -2.154040813446045, "logits/rejected": -2.2462210655212402, "logps/chosen": -1.9003870487213135, "logps/rejected": -1.790516972541809, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9246541261672974, "rewards/margins": 0.01630038022994995, "rewards/rejected": 0.9083537459373474, "step": 2259 }, { "epoch": 1.22, "learning_rate": 6.723925227298131e-08, "logits/chosen": -2.042909860610962, "logits/rejected": -2.0183305740356445, "logps/chosen": -15.413450241088867, "logps/rejected": -2.2977652549743652, "loss": 0.3298, "rewards/accuracies": 1.0, "rewards/chosen": 1.479235053062439, "rewards/margins": 0.9396802186965942, "rewards/rejected": 0.5395548343658447, "step": 2260 }, { "epoch": 1.22, "learning_rate": 6.721191890284395e-08, "logits/chosen": -2.0187854766845703, "logits/rejected": -2.0115480422973633, "logps/chosen": -3.3892693519592285, "logps/rejected": -4.496218681335449, "loss": 0.3005, "rewards/accuracies": 1.0, "rewards/chosen": 1.479089379310608, "rewards/margins": 1.0484834909439087, "rewards/rejected": 0.4306058883666992, "step": 2261 }, { "epoch": 1.22, "learning_rate": 6.71845796962921e-08, "logits/chosen": -2.1400022506713867, "logits/rejected": -2.1433839797973633, "logps/chosen": -2.408945322036743, "logps/rejected": -2.5768253803253174, "loss": 0.5333, "rewards/accuracies": 1.0, "rewards/chosen": 0.9976564645767212, "rewards/margins": 0.35019367933273315, "rewards/rejected": 0.647462785243988, "step": 2262 }, { "epoch": 1.22, "learning_rate": 6.715723466259626e-08, "logits/chosen": -2.1244943141937256, "logits/rejected": -2.322974920272827, "logps/chosen": -1.6105842590332031, "logps/rejected": -1.5362863540649414, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.9694633483886719, "rewards/margins": -0.014049649238586426, "rewards/rejected": 0.9835129976272583, "step": 2263 }, { "epoch": 1.22, "learning_rate": 6.71298838110289e-08, "logits/chosen": -2.0141541957855225, "logits/rejected": -2.0205905437469482, "logps/chosen": -0.9490095376968384, "logps/rejected": -4.90877628326416, "loss": 0.4981, "rewards/accuracies": 1.0, "rewards/chosen": 1.0488048791885376, "rewards/margins": 0.43763887882232666, "rewards/rejected": 0.6111660003662109, "step": 2264 }, { "epoch": 1.22, "learning_rate": 6.710252715086446e-08, "logits/chosen": -2.0262415409088135, "logits/rejected": -2.269280195236206, "logps/chosen": -0.5462702512741089, "logps/rejected": -0.5193517804145813, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.8693650364875793, "rewards/margins": 0.04526251554489136, "rewards/rejected": 0.824102520942688, "step": 2265 }, { "epoch": 1.22, "learning_rate": 6.707516469137935e-08, "logits/chosen": -2.053685188293457, "logits/rejected": -2.0454838275909424, "logps/chosen": -3.240504264831543, "logps/rejected": -3.9826979637145996, "loss": 0.5363, "rewards/accuracies": 1.0, "rewards/chosen": 0.9716151356697083, "rewards/margins": 0.3429926633834839, "rewards/rejected": 0.6286224722862244, "step": 2266 }, { "epoch": 1.22, "learning_rate": 6.704779644185195e-08, "logits/chosen": -2.0521726608276367, "logits/rejected": -2.24796199798584, "logps/chosen": -1.0901776552200317, "logps/rejected": -1.110001802444458, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 0.961807370185852, "rewards/margins": -0.011706650257110596, "rewards/rejected": 0.9735140204429626, "step": 2267 }, { "epoch": 1.22, "learning_rate": 6.702042241156258e-08, "logits/chosen": -2.095818519592285, "logits/rejected": -2.0976552963256836, "logps/chosen": -0.6687659621238708, "logps/rejected": -2.982048273086548, "loss": 0.5664, "rewards/accuracies": 1.0, "rewards/chosen": 0.7597978711128235, "rewards/margins": 0.2718423902988434, "rewards/rejected": 0.4879554808139801, "step": 2268 }, { "epoch": 1.22, "learning_rate": 6.699304260979355e-08, "logits/chosen": -2.1934139728546143, "logits/rejected": -2.196192741394043, "logps/chosen": -3.8281753063201904, "logps/rejected": -0.7127880454063416, "loss": 0.6242, "rewards/accuracies": 1.0, "rewards/chosen": 1.099232792854309, "rewards/margins": 0.1430732011795044, "rewards/rejected": 0.9561595916748047, "step": 2269 }, { "epoch": 1.22, "learning_rate": 6.696565704582913e-08, "logits/chosen": -2.132582426071167, "logits/rejected": -2.1288528442382812, "logps/chosen": -3.455111026763916, "logps/rejected": -5.347460746765137, "loss": 0.3456, "rewards/accuracies": 1.0, "rewards/chosen": 1.265854835510254, "rewards/margins": 0.8845469355583191, "rewards/rejected": 0.3813078999519348, "step": 2270 }, { "epoch": 1.22, "learning_rate": 6.693826572895552e-08, "logits/chosen": -2.124974012374878, "logits/rejected": -2.1123321056365967, "logps/chosen": -5.154325008392334, "logps/rejected": -7.816287994384766, "loss": 0.5278, "rewards/accuracies": 1.0, "rewards/chosen": 1.0241504907608032, "rewards/margins": 0.3635631799697876, "rewards/rejected": 0.6605873107910156, "step": 2271 }, { "epoch": 1.23, "learning_rate": 6.691086866846087e-08, "logits/chosen": -2.1161489486694336, "logits/rejected": -2.307293176651001, "logps/chosen": -0.6169519424438477, "logps/rejected": -0.6085184812545776, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.9617142081260681, "rewards/margins": 0.00725102424621582, "rewards/rejected": 0.9544631838798523, "step": 2272 }, { "epoch": 1.23, "learning_rate": 6.688346587363531e-08, "logits/chosen": -2.061284065246582, "logits/rejected": -2.067798137664795, "logps/chosen": -2.3513810634613037, "logps/rejected": -2.7776787281036377, "loss": 0.4283, "rewards/accuracies": 1.0, "rewards/chosen": 1.3621749877929688, "rewards/margins": 0.6262273788452148, "rewards/rejected": 0.7359476089477539, "step": 2273 }, { "epoch": 1.23, "learning_rate": 6.685605735377088e-08, "logits/chosen": -2.1286861896514893, "logits/rejected": -2.1255600452423096, "logps/chosen": -3.05595064163208, "logps/rejected": -2.2184274196624756, "loss": 0.4904, "rewards/accuracies": 1.0, "rewards/chosen": 1.1838213205337524, "rewards/margins": 0.4574187397956848, "rewards/rejected": 0.7264025807380676, "step": 2274 }, { "epoch": 1.23, "learning_rate": 6.682864311816158e-08, "logits/chosen": -2.1083970069885254, "logits/rejected": -2.233567714691162, "logps/chosen": -2.288511037826538, "logps/rejected": -2.364739418029785, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9465872049331665, "rewards/margins": 0.01799488067626953, "rewards/rejected": 0.928592324256897, "step": 2275 }, { "epoch": 1.23, "learning_rate": 6.680122317610334e-08, "logits/chosen": -2.0707545280456543, "logits/rejected": -2.26503586769104, "logps/chosen": -0.4865339696407318, "logps/rejected": -0.6166008114814758, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.8539754748344421, "rewards/margins": -0.002819061279296875, "rewards/rejected": 0.856794536113739, "step": 2276 }, { "epoch": 1.23, "learning_rate": 6.677379753689402e-08, "logits/chosen": -2.118842601776123, "logits/rejected": -2.2434239387512207, "logps/chosen": -0.9859713315963745, "logps/rejected": -1.018571138381958, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.994486927986145, "rewards/margins": 0.00055694580078125, "rewards/rejected": 0.9939299821853638, "step": 2277 }, { "epoch": 1.23, "learning_rate": 6.674636620983343e-08, "logits/chosen": -2.0854177474975586, "logits/rejected": -2.048189878463745, "logps/chosen": -5.858371734619141, "logps/rejected": -1.9293171167373657, "loss": 0.3709, "rewards/accuracies": 1.0, "rewards/chosen": 1.53963303565979, "rewards/margins": 0.8006110787391663, "rewards/rejected": 0.7390219569206238, "step": 2278 }, { "epoch": 1.23, "learning_rate": 6.671892920422331e-08, "logits/chosen": -1.998000979423523, "logits/rejected": -1.9923646450042725, "logps/chosen": -3.346367835998535, "logps/rejected": -4.359542369842529, "loss": 0.3343, "rewards/accuracies": 1.0, "rewards/chosen": 1.4052939414978027, "rewards/margins": 0.9238470792770386, "rewards/rejected": 0.48144689202308655, "step": 2279 }, { "epoch": 1.23, "learning_rate": 6.66914865293673e-08, "logits/chosen": -2.1671359539031982, "logits/rejected": -2.3527746200561523, "logps/chosen": -1.674498438835144, "logps/rejected": -1.6639186143875122, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.9510120749473572, "rewards/margins": 0.011404454708099365, "rewards/rejected": 0.9396076202392578, "step": 2280 }, { "epoch": 1.23, "learning_rate": 6.666403819457096e-08, "logits/chosen": -1.9525399208068848, "logits/rejected": -2.286728620529175, "logps/chosen": -1.9177452325820923, "logps/rejected": -2.023711919784546, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.7217190265655518, "rewards/margins": 0.005170285701751709, "rewards/rejected": 0.7165487408638, "step": 2281 }, { "epoch": 1.23, "learning_rate": 6.663658420914182e-08, "logits/chosen": -2.221007823944092, "logits/rejected": -2.0259883403778076, "logps/chosen": -42.05652618408203, "logps/rejected": -3.6543211936950684, "loss": 0.1596, "rewards/accuracies": 1.0, "rewards/chosen": 2.3129730224609375, "rewards/margins": 1.7540287971496582, "rewards/rejected": 0.5589441657066345, "step": 2282 }, { "epoch": 1.23, "learning_rate": 6.660912458238925e-08, "logits/chosen": -2.093770980834961, "logits/rejected": -1.9780024290084839, "logps/chosen": -33.90418243408203, "logps/rejected": -2.9934511184692383, "loss": 0.2228, "rewards/accuracies": 1.0, "rewards/chosen": 1.9246273040771484, "rewards/margins": 1.3881444931030273, "rewards/rejected": 0.5364828109741211, "step": 2283 }, { "epoch": 1.23, "learning_rate": 6.658165932362463e-08, "logits/chosen": -2.070793867111206, "logits/rejected": -2.2300357818603516, "logps/chosen": -0.9570491313934326, "logps/rejected": -0.9167784452438354, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.8192224502563477, "rewards/margins": 0.0075078606605529785, "rewards/rejected": 0.8117145895957947, "step": 2284 }, { "epoch": 1.23, "learning_rate": 6.655418844216115e-08, "logits/chosen": -2.0256264209747314, "logits/rejected": -2.034416675567627, "logps/chosen": -1.9191020727157593, "logps/rejected": -2.188504219055176, "loss": 0.5284, "rewards/accuracies": 1.0, "rewards/chosen": 1.0068325996398926, "rewards/margins": 0.36202001571655273, "rewards/rejected": 0.6448125839233398, "step": 2285 }, { "epoch": 1.23, "learning_rate": 6.652671194731395e-08, "logits/chosen": -2.1373870372772217, "logits/rejected": -2.1226935386657715, "logps/chosen": -1.886838674545288, "logps/rejected": -6.860173225402832, "loss": 0.4075, "rewards/accuracies": 1.0, "rewards/chosen": 1.1429568529129028, "rewards/margins": 0.687082052230835, "rewards/rejected": 0.45587483048439026, "step": 2286 }, { "epoch": 1.23, "learning_rate": 6.64992298484001e-08, "logits/chosen": -2.060333251953125, "logits/rejected": -2.2451255321502686, "logps/chosen": -0.6062240600585938, "logps/rejected": -0.6164641380310059, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8021206855773926, "rewards/margins": 0.016311585903167725, "rewards/rejected": 0.7858090996742249, "step": 2287 }, { "epoch": 1.23, "learning_rate": 6.647174215473852e-08, "logits/chosen": -1.9963892698287964, "logits/rejected": -1.9917941093444824, "logps/chosen": -1.3725471496582031, "logps/rejected": -4.861145496368408, "loss": 0.4356, "rewards/accuracies": 1.0, "rewards/chosen": 1.112704873085022, "rewards/margins": 0.6052307486534119, "rewards/rejected": 0.5074741244316101, "step": 2288 }, { "epoch": 1.23, "learning_rate": 6.644424887565008e-08, "logits/chosen": -2.0843148231506348, "logits/rejected": -2.0224132537841797, "logps/chosen": -21.602035522460938, "logps/rejected": -3.0215582847595215, "loss": 0.3616, "rewards/accuracies": 1.0, "rewards/chosen": 1.5514458417892456, "rewards/margins": 0.8308120965957642, "rewards/rejected": 0.7206337451934814, "step": 2289 }, { "epoch": 1.24, "learning_rate": 6.641675002045751e-08, "logits/chosen": -2.0925796031951904, "logits/rejected": -2.2838857173919678, "logps/chosen": -3.6207399368286133, "logps/rejected": -3.546003818511963, "loss": 0.715, "rewards/accuracies": 0.0, "rewards/chosen": 0.7206072807312012, "rewards/margins": -0.04318511486053467, "rewards/rejected": 0.7637923955917358, "step": 2290 }, { "epoch": 1.24, "learning_rate": 6.638924559848541e-08, "logits/chosen": -2.0812933444976807, "logits/rejected": -2.2761428356170654, "logps/chosen": -0.5001959204673767, "logps/rejected": -3.3187344074249268, "loss": 0.6166, "rewards/accuracies": 1.0, "rewards/chosen": 0.9754801988601685, "rewards/margins": 0.15942001342773438, "rewards/rejected": 0.8160601854324341, "step": 2291 }, { "epoch": 1.24, "learning_rate": 6.636173561906032e-08, "logits/chosen": -2.069936990737915, "logits/rejected": -2.28981876373291, "logps/chosen": -15.167879104614258, "logps/rejected": -12.199909210205078, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893278479576111, "rewards/margins": 0.05409836769104004, "rewards/rejected": 0.935229480266571, "step": 2292 }, { "epoch": 1.24, "learning_rate": 6.633422009151061e-08, "logits/chosen": -2.088021993637085, "logits/rejected": -2.084897518157959, "logps/chosen": -1.2370964288711548, "logps/rejected": -3.1852266788482666, "loss": 0.4792, "rewards/accuracies": 1.0, "rewards/chosen": 1.1119492053985596, "rewards/margins": 0.486392080783844, "rewards/rejected": 0.6255571246147156, "step": 2293 }, { "epoch": 1.24, "learning_rate": 6.63066990251666e-08, "logits/chosen": -2.23860502243042, "logits/rejected": -2.1964902877807617, "logps/chosen": -38.4922981262207, "logps/rejected": -11.85869312286377, "loss": 0.2355, "rewards/accuracies": 1.0, "rewards/chosen": 2.06054425239563, "rewards/margins": 1.325838327407837, "rewards/rejected": 0.7347058653831482, "step": 2294 }, { "epoch": 1.24, "learning_rate": 6.627917242936043e-08, "logits/chosen": -2.036792516708374, "logits/rejected": -2.3076424598693848, "logps/chosen": -0.5763432383537292, "logps/rejected": -0.4825923442840576, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.9140095710754395, "rewards/margins": 0.0322992205619812, "rewards/rejected": 0.8817103505134583, "step": 2295 }, { "epoch": 1.24, "learning_rate": 6.625164031342611e-08, "logits/chosen": -2.0502023696899414, "logits/rejected": -2.256225109100342, "logps/chosen": -0.33459922671318054, "logps/rejected": -0.370390921831131, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 1.024509310722351, "rewards/margins": 0.021016478538513184, "rewards/rejected": 1.003492832183838, "step": 2296 }, { "epoch": 1.24, "learning_rate": 6.622410268669957e-08, "logits/chosen": -2.076770782470703, "logits/rejected": -2.3184738159179688, "logps/chosen": -7.71635627746582, "logps/rejected": -15.701139450073242, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": 0.9059335589408875, "rewards/margins": 0.12652796506881714, "rewards/rejected": 0.7794055938720703, "step": 2297 }, { "epoch": 1.24, "learning_rate": 6.61965595585186e-08, "logits/chosen": -2.107630491256714, "logits/rejected": -2.116278648376465, "logps/chosen": -1.9733957052230835, "logps/rejected": -2.4353675842285156, "loss": 0.4927, "rewards/accuracies": 1.0, "rewards/chosen": 1.1547421216964722, "rewards/margins": 0.4513387084007263, "rewards/rejected": 0.7034034132957458, "step": 2298 }, { "epoch": 1.24, "learning_rate": 6.616901093822282e-08, "logits/chosen": -2.0901567935943604, "logits/rejected": -2.3232421875, "logps/chosen": -0.6169280409812927, "logps/rejected": -0.6789880990982056, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.8564206957817078, "rewards/margins": 0.007195174694061279, "rewards/rejected": 0.8492255210876465, "step": 2299 }, { "epoch": 1.24, "learning_rate": 6.614145683515373e-08, "logits/chosen": -2.1486313343048096, "logits/rejected": -2.297028064727783, "logps/chosen": -2.2741832733154297, "logps/rejected": -2.091841220855713, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.5858594179153442, "rewards/margins": -0.003268003463745117, "rewards/rejected": 0.5891274213790894, "step": 2300 }, { "epoch": 1.24, "learning_rate": 6.611389725865467e-08, "logits/chosen": -2.0447769165039062, "logits/rejected": -2.0482444763183594, "logps/chosen": -5.302878379821777, "logps/rejected": -3.317411422729492, "loss": 0.1427, "rewards/accuracies": 1.0, "rewards/chosen": 2.3623504638671875, "rewards/margins": 1.8749979734420776, "rewards/rejected": 0.4873524606227875, "step": 2301 }, { "epoch": 1.24, "learning_rate": 6.608633221807088e-08, "logits/chosen": -1.9658020734786987, "logits/rejected": -2.2542850971221924, "logps/chosen": -0.20669707655906677, "logps/rejected": -0.22375324368476868, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8258985877037048, "rewards/margins": 0.012943506240844727, "rewards/rejected": 0.8129550814628601, "step": 2302 }, { "epoch": 1.24, "learning_rate": 6.605876172274945e-08, "logits/chosen": -1.9717991352081299, "logits/rejected": -2.305971384048462, "logps/chosen": -0.6191985011100769, "logps/rejected": -0.6483535766601562, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 1.065745234489441, "rewards/margins": 0.012452960014343262, "rewards/rejected": 1.0532922744750977, "step": 2303 }, { "epoch": 1.24, "learning_rate": 6.603118578203926e-08, "logits/chosen": -1.9762977361679077, "logits/rejected": -1.93245530128479, "logps/chosen": -12.266668319702148, "logps/rejected": -1.7322684526443481, "loss": 0.5416, "rewards/accuracies": 1.0, "rewards/chosen": 1.270151138305664, "rewards/margins": 0.3302895426750183, "rewards/rejected": 0.9398615956306458, "step": 2304 }, { "epoch": 1.24, "learning_rate": 6.60036044052911e-08, "logits/chosen": -2.093426465988159, "logits/rejected": -2.0973570346832275, "logps/chosen": -0.7340012788772583, "logps/rejected": -2.3277668952941895, "loss": 0.5468, "rewards/accuracies": 1.0, "rewards/chosen": 0.9354669451713562, "rewards/margins": 0.3177628517150879, "rewards/rejected": 0.6177040934562683, "step": 2305 }, { "epoch": 1.24, "learning_rate": 6.597601760185757e-08, "logits/chosen": -2.0312087535858154, "logits/rejected": -2.0413436889648438, "logps/chosen": -3.3538920879364014, "logps/rejected": -2.256849765777588, "loss": 0.4868, "rewards/accuracies": 1.0, "rewards/chosen": 1.0687521696090698, "rewards/margins": 0.46660518646240234, "rewards/rejected": 0.6021469831466675, "step": 2306 }, { "epoch": 1.24, "learning_rate": 6.594842538109312e-08, "logits/chosen": -1.9819730520248413, "logits/rejected": -1.9916303157806396, "logps/chosen": -1.819814682006836, "logps/rejected": -5.050388336181641, "loss": 0.3686, "rewards/accuracies": 1.0, "rewards/chosen": 1.459737777709961, "rewards/margins": 0.8079730868339539, "rewards/rejected": 0.6517646908760071, "step": 2307 }, { "epoch": 1.24, "learning_rate": 6.592082775235403e-08, "logits/chosen": -2.1204402446746826, "logits/rejected": -2.1120290756225586, "logps/chosen": -5.393831729888916, "logps/rejected": -2.328275203704834, "loss": 0.399, "rewards/accuracies": 1.0, "rewards/chosen": 1.5560318231582642, "rewards/margins": 0.7126699686050415, "rewards/rejected": 0.8433618545532227, "step": 2308 }, { "epoch": 1.25, "learning_rate": 6.589322472499845e-08, "logits/chosen": -2.0739665031433105, "logits/rejected": -2.318528175354004, "logps/chosen": -4.699409484863281, "logps/rejected": -4.7042436599731445, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 1.2849327325820923, "rewards/margins": 0.01210474967956543, "rewards/rejected": 1.2728279829025269, "step": 2309 }, { "epoch": 1.25, "learning_rate": 6.586561630838627e-08, "logits/chosen": -2.048801898956299, "logits/rejected": -2.052098274230957, "logps/chosen": -0.6393107175827026, "logps/rejected": -4.339240550994873, "loss": 0.5372, "rewards/accuracies": 1.0, "rewards/chosen": 0.8228017687797546, "rewards/margins": 0.3406745195388794, "rewards/rejected": 0.48212724924087524, "step": 2310 }, { "epoch": 1.25, "learning_rate": 6.583800251187933e-08, "logits/chosen": -2.119384765625, "logits/rejected": -2.118274688720703, "logps/chosen": -6.2002081871032715, "logps/rejected": -3.203594207763672, "loss": 0.3966, "rewards/accuracies": 1.0, "rewards/chosen": 1.3839105367660522, "rewards/margins": 0.7200106978416443, "rewards/rejected": 0.663899838924408, "step": 2311 }, { "epoch": 1.25, "learning_rate": 6.581038334484119e-08, "logits/chosen": -2.111687660217285, "logits/rejected": -2.282477378845215, "logps/chosen": -1.1023778915405273, "logps/rejected": -1.0767874717712402, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.9528793692588806, "rewards/margins": 0.018965542316436768, "rewards/rejected": 0.9339138269424438, "step": 2312 }, { "epoch": 1.25, "learning_rate": 6.578275881663731e-08, "logits/chosen": -1.9511414766311646, "logits/rejected": -2.2552616596221924, "logps/chosen": -1.4079606533050537, "logps/rejected": -1.3548262119293213, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.9605135321617126, "rewards/margins": -0.003359973430633545, "rewards/rejected": 0.9638735055923462, "step": 2313 }, { "epoch": 1.25, "learning_rate": 6.575512893663491e-08, "logits/chosen": -2.038252115249634, "logits/rejected": -2.043879985809326, "logps/chosen": -8.061443328857422, "logps/rejected": -7.495867729187012, "loss": 0.3112, "rewards/accuracies": 1.0, "rewards/chosen": 1.6952060461044312, "rewards/margins": 1.00758695602417, "rewards/rejected": 0.6876190304756165, "step": 2314 }, { "epoch": 1.25, "learning_rate": 6.572749371420303e-08, "logits/chosen": -2.1186420917510986, "logits/rejected": -2.0216426849365234, "logps/chosen": -27.13982582092285, "logps/rejected": -3.3799784183502197, "loss": 0.2426, "rewards/accuracies": 1.0, "rewards/chosen": 1.8765214681625366, "rewards/margins": 1.292515754699707, "rewards/rejected": 0.5840057730674744, "step": 2315 }, { "epoch": 1.25, "learning_rate": 6.569985315871256e-08, "logits/chosen": -2.1156845092773438, "logits/rejected": -2.112393379211426, "logps/chosen": -5.262596607208252, "logps/rejected": -2.4604408740997314, "loss": 0.324, "rewards/accuracies": 1.0, "rewards/chosen": 1.6117171049118042, "rewards/margins": 0.9606443047523499, "rewards/rejected": 0.6510728001594543, "step": 2316 }, { "epoch": 1.25, "learning_rate": 6.567220727953618e-08, "logits/chosen": -2.1893908977508545, "logits/rejected": -2.0159811973571777, "logps/chosen": -26.80461883544922, "logps/rejected": -3.891416549682617, "loss": 0.2503, "rewards/accuracies": 1.0, "rewards/chosen": 1.8745571374893188, "rewards/margins": 1.2572882175445557, "rewards/rejected": 0.617268979549408, "step": 2317 }, { "epoch": 1.25, "learning_rate": 6.564455608604835e-08, "logits/chosen": -2.0999488830566406, "logits/rejected": -2.138500213623047, "logps/chosen": -3.6788578033447266, "logps/rejected": -9.730446815490723, "loss": 0.4791, "rewards/accuracies": 1.0, "rewards/chosen": 1.3163374662399292, "rewards/margins": 0.48670536279678345, "rewards/rejected": 0.8296321034431458, "step": 2318 }, { "epoch": 1.25, "learning_rate": 6.561689958762538e-08, "logits/chosen": -2.056720018386841, "logits/rejected": -2.27640438079834, "logps/chosen": -1.3117685317993164, "logps/rejected": -1.163060188293457, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8036589622497559, "rewards/margins": 0.01114422082901001, "rewards/rejected": 0.7925147414207458, "step": 2319 }, { "epoch": 1.25, "learning_rate": 6.558923779364533e-08, "logits/chosen": -2.045307159423828, "logits/rejected": -2.294098138809204, "logps/chosen": -0.6815810203552246, "logps/rejected": -0.6964108347892761, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9105958938598633, "rewards/margins": 0.02739197015762329, "rewards/rejected": 0.88320392370224, "step": 2320 }, { "epoch": 1.25, "learning_rate": 6.556157071348807e-08, "logits/chosen": -2.0396018028259277, "logits/rejected": -2.032806634902954, "logps/chosen": -5.898778915405273, "logps/rejected": -6.061454772949219, "loss": 0.3855, "rewards/accuracies": 1.0, "rewards/chosen": 1.3869657516479492, "rewards/margins": 0.7543786764144897, "rewards/rejected": 0.6325870752334595, "step": 2321 }, { "epoch": 1.25, "learning_rate": 6.553389835653532e-08, "logits/chosen": -2.0956268310546875, "logits/rejected": -2.092485189437866, "logps/chosen": -7.734389781951904, "logps/rejected": -3.490413188934326, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 1.3235177993774414, "rewards/margins": 0.8178611993789673, "rewards/rejected": 0.5056565999984741, "step": 2322 }, { "epoch": 1.25, "learning_rate": 6.550622073217047e-08, "logits/chosen": -2.057945966720581, "logits/rejected": -2.0486762523651123, "logps/chosen": -4.6381611824035645, "logps/rejected": -2.0629446506500244, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.5769741535186768, "rewards/margins": 0.6929447650909424, "rewards/rejected": 0.8840293884277344, "step": 2323 }, { "epoch": 1.25, "learning_rate": 6.547853784977883e-08, "logits/chosen": -2.063971519470215, "logits/rejected": -2.228618860244751, "logps/chosen": -7.63733434677124, "logps/rejected": -1.0586172342300415, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8113653063774109, "rewards/margins": -0.005488693714141846, "rewards/rejected": 0.8168540000915527, "step": 2324 }, { "epoch": 1.25, "learning_rate": 6.545084971874738e-08, "logits/chosen": -2.0983705520629883, "logits/rejected": -2.1275999546051025, "logps/chosen": -4.038541793823242, "logps/rejected": -11.73172664642334, "loss": 0.3158, "rewards/accuracies": 1.0, "rewards/chosen": 1.4720171689987183, "rewards/margins": 0.9905568361282349, "rewards/rejected": 0.481460303068161, "step": 2325 }, { "epoch": 1.25, "learning_rate": 6.542315634846493e-08, "logits/chosen": -2.1821887493133545, "logits/rejected": -2.190133571624756, "logps/chosen": -1.8299429416656494, "logps/rejected": -1.7513889074325562, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 1.0236608982086182, "rewards/margins": 0.43934202194213867, "rewards/rejected": 0.5843188762664795, "step": 2326 }, { "epoch": 1.26, "learning_rate": 6.539545774832211e-08, "logits/chosen": -2.042030096054077, "logits/rejected": -2.2400102615356445, "logps/chosen": -0.5664867162704468, "logps/rejected": -0.5953876376152039, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9748296737670898, "rewards/margins": 0.01832711696624756, "rewards/rejected": 0.9565025568008423, "step": 2327 }, { "epoch": 1.26, "learning_rate": 6.536775392771126e-08, "logits/chosen": -2.065061092376709, "logits/rejected": -2.2808144092559814, "logps/chosen": -5.246535778045654, "logps/rejected": -1.0255882740020752, "loss": 0.9209, "rewards/accuracies": 0.0, "rewards/chosen": 0.6639791131019592, "rewards/margins": -0.4131724238395691, "rewards/rejected": 1.0771515369415283, "step": 2328 }, { "epoch": 1.26, "learning_rate": 6.534004489602649e-08, "logits/chosen": -2.1546289920806885, "logits/rejected": -2.157236099243164, "logps/chosen": -1.3464189767837524, "logps/rejected": -2.2080273628234863, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.9386301040649414, "rewards/margins": 0.024378955364227295, "rewards/rejected": 0.9142511487007141, "step": 2329 }, { "epoch": 1.26, "learning_rate": 6.531233066266368e-08, "logits/chosen": -2.067953586578369, "logits/rejected": -2.2876322269439697, "logps/chosen": -2.5678272247314453, "logps/rejected": -2.6247365474700928, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9507327079772949, "rewards/margins": 0.01001960039138794, "rewards/rejected": 0.940713107585907, "step": 2330 }, { "epoch": 1.26, "learning_rate": 6.528461123702054e-08, "logits/chosen": -2.029261350631714, "logits/rejected": -2.0248682498931885, "logps/chosen": -6.642531394958496, "logps/rejected": -2.2294387817382812, "loss": 0.5374, "rewards/accuracies": 1.0, "rewards/chosen": 1.053704023361206, "rewards/margins": 0.3404082655906677, "rewards/rejected": 0.7132957577705383, "step": 2331 }, { "epoch": 1.26, "learning_rate": 6.525688662849647e-08, "logits/chosen": -1.9553779363632202, "logits/rejected": -1.9611567258834839, "logps/chosen": -3.114820718765259, "logps/rejected": -4.569360256195068, "loss": 0.4464, "rewards/accuracies": 1.0, "rewards/chosen": 1.181357741355896, "rewards/margins": 0.575104832649231, "rewards/rejected": 0.606252908706665, "step": 2332 }, { "epoch": 1.26, "learning_rate": 6.522915684649262e-08, "logits/chosen": -2.101135015487671, "logits/rejected": -2.1981098651885986, "logps/chosen": -1.3615919351577759, "logps/rejected": -1.3135969638824463, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.067196011543274, "rewards/margins": 0.011030077934265137, "rewards/rejected": 1.0561659336090088, "step": 2333 }, { "epoch": 1.26, "learning_rate": 6.520142190041197e-08, "logits/chosen": -2.0576171875, "logits/rejected": -2.3015525341033936, "logps/chosen": -0.6795855760574341, "logps/rejected": -0.7188109755516052, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.7974240183830261, "rewards/margins": -0.0006604194641113281, "rewards/rejected": 0.7980844378471375, "step": 2334 }, { "epoch": 1.26, "learning_rate": 6.517368179965915e-08, "logits/chosen": -2.00644850730896, "logits/rejected": -2.008862257003784, "logps/chosen": -2.145871877670288, "logps/rejected": -6.948832988739014, "loss": 0.4099, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560230374336243, "rewards/margins": 0.6799260377883911, "rewards/rejected": 0.27609696984291077, "step": 2335 }, { "epoch": 1.26, "learning_rate": 6.514593655364064e-08, "logits/chosen": -2.0840160846710205, "logits/rejected": -2.250567674636841, "logps/chosen": -1.2109320163726807, "logps/rejected": -1.1550695896148682, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.738801896572113, "rewards/margins": 0.009029507637023926, "rewards/rejected": 0.7297723889350891, "step": 2336 }, { "epoch": 1.26, "learning_rate": 6.511818617176457e-08, "logits/chosen": -2.020864963531494, "logits/rejected": -2.001840114593506, "logps/chosen": -4.992119789123535, "logps/rejected": -5.233755111694336, "loss": 0.2938, "rewards/accuracies": 1.0, "rewards/chosen": 1.489730715751648, "rewards/margins": 1.0742850303649902, "rewards/rejected": 0.41544562578201294, "step": 2337 }, { "epoch": 1.26, "learning_rate": 6.50904306634409e-08, "logits/chosen": -2.1869328022003174, "logits/rejected": -2.1372194290161133, "logps/chosen": -17.207019805908203, "logps/rejected": -3.914515495300293, "loss": 0.3004, "rewards/accuracies": 1.0, "rewards/chosen": 1.60712468624115, "rewards/margins": 1.0487704277038574, "rewards/rejected": 0.5583541989326477, "step": 2338 }, { "epoch": 1.26, "learning_rate": 6.506267003808128e-08, "logits/chosen": -2.108597993850708, "logits/rejected": -2.044753313064575, "logps/chosen": -12.30194091796875, "logps/rejected": -7.693511486053467, "loss": 0.7878, "rewards/accuracies": 0.0, "rewards/chosen": 0.4800731837749481, "rewards/margins": -0.1811661422252655, "rewards/rejected": 0.6612393260002136, "step": 2339 }, { "epoch": 1.26, "learning_rate": 6.503490430509905e-08, "logits/chosen": -1.9853614568710327, "logits/rejected": -2.254241943359375, "logps/chosen": -0.32452771067619324, "logps/rejected": -0.3529825210571289, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.873607337474823, "rewards/margins": 0.007047414779663086, "rewards/rejected": 0.8665599226951599, "step": 2340 }, { "epoch": 1.26, "learning_rate": 6.50071334739094e-08, "logits/chosen": -2.050870180130005, "logits/rejected": -2.042112350463867, "logps/chosen": -4.115994930267334, "logps/rejected": -6.137690544128418, "loss": 0.3644, "rewards/accuracies": 1.0, "rewards/chosen": 1.2201436758041382, "rewards/margins": 0.8218028545379639, "rewards/rejected": 0.39834079146385193, "step": 2341 }, { "epoch": 1.26, "learning_rate": 6.497935755392913e-08, "logits/chosen": -1.9357460737228394, "logits/rejected": -2.2274093627929688, "logps/chosen": -0.5318396091461182, "logps/rejected": -0.46858301758766174, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8149020075798035, "rewards/margins": 0.015923500061035156, "rewards/rejected": 0.7989785075187683, "step": 2342 }, { "epoch": 1.26, "learning_rate": 6.495157655457686e-08, "logits/chosen": -2.1839449405670166, "logits/rejected": -2.1803183555603027, "logps/chosen": -6.318576812744141, "logps/rejected": -6.393867492675781, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 1.2021887302398682, "rewards/margins": 0.7529041767120361, "rewards/rejected": 0.44928455352783203, "step": 2343 }, { "epoch": 1.26, "learning_rate": 6.492379048527286e-08, "logits/chosen": -2.059364080429077, "logits/rejected": -2.111015796661377, "logps/chosen": -5.374510765075684, "logps/rejected": -14.748032569885254, "loss": 0.2653, "rewards/accuracies": 1.0, "rewards/chosen": 1.483451008796692, "rewards/margins": 1.1914421319961548, "rewards/rejected": 0.2920088768005371, "step": 2344 }, { "epoch": 1.26, "learning_rate": 6.489599935543915e-08, "logits/chosen": -2.2313597202301025, "logits/rejected": -2.2906558513641357, "logps/chosen": -8.356907844543457, "logps/rejected": -26.46978187561035, "loss": 0.4437, "rewards/accuracies": 1.0, "rewards/chosen": 1.1594680547714233, "rewards/margins": 0.5826978087425232, "rewards/rejected": 0.5767702460289001, "step": 2345 }, { "epoch": 1.27, "learning_rate": 6.486820317449947e-08, "logits/chosen": -1.9643758535385132, "logits/rejected": -2.2458930015563965, "logps/chosen": -2.499828338623047, "logps/rejected": -2.4801857471466064, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 1.016990065574646, "rewards/margins": -0.004337787628173828, "rewards/rejected": 1.0213278532028198, "step": 2346 }, { "epoch": 1.27, "learning_rate": 6.484040195187927e-08, "logits/chosen": -2.0585415363311768, "logits/rejected": -2.2667160034179688, "logps/chosen": -2.2216365337371826, "logps/rejected": -2.120666980743408, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823187947273254, "rewards/margins": 0.004594326019287109, "rewards/rejected": 0.8777244687080383, "step": 2347 }, { "epoch": 1.27, "learning_rate": 6.48125956970057e-08, "logits/chosen": -2.142796039581299, "logits/rejected": -2.1439294815063477, "logps/chosen": -1.5206120014190674, "logps/rejected": -2.8860602378845215, "loss": 0.5361, "rewards/accuracies": 1.0, "rewards/chosen": 1.009124994277954, "rewards/margins": 0.3435482978820801, "rewards/rejected": 0.665576696395874, "step": 2348 }, { "epoch": 1.27, "learning_rate": 6.478478441930762e-08, "logits/chosen": -2.105311393737793, "logits/rejected": -2.3157994747161865, "logps/chosen": -0.5490676164627075, "logps/rejected": -0.5417693257331848, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.9648427963256836, "rewards/margins": 0.014928638935089111, "rewards/rejected": 0.9499141573905945, "step": 2349 }, { "epoch": 1.27, "learning_rate": 6.475696812821562e-08, "logits/chosen": -2.083707332611084, "logits/rejected": -2.3118371963500977, "logps/chosen": -0.32372552156448364, "logps/rejected": -0.2953648865222931, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.8920060396194458, "rewards/margins": 0.04634571075439453, "rewards/rejected": 0.8456603288650513, "step": 2350 }, { "epoch": 1.27, "learning_rate": 6.472914683316195e-08, "logits/chosen": -1.9919499158859253, "logits/rejected": -2.2699549198150635, "logps/chosen": -0.6665410399436951, "logps/rejected": -0.6893168091773987, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9271323084831238, "rewards/margins": 0.02316528558731079, "rewards/rejected": 0.903967022895813, "step": 2351 }, { "epoch": 1.27, "learning_rate": 6.470132054358056e-08, "logits/chosen": -1.9807400703430176, "logits/rejected": -2.246380567550659, "logps/chosen": -1.8878147602081299, "logps/rejected": -1.9797054529190063, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 0.9522967338562012, "rewards/margins": -0.004628956317901611, "rewards/rejected": 0.9569256901741028, "step": 2352 }, { "epoch": 1.27, "learning_rate": 6.467348926890714e-08, "logits/chosen": -2.0358331203460693, "logits/rejected": -2.0257089138031006, "logps/chosen": -4.201320648193359, "logps/rejected": -1.882293701171875, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 1.5906429290771484, "rewards/margins": 0.7226055860519409, "rewards/rejected": 0.8680373430252075, "step": 2353 }, { "epoch": 1.27, "learning_rate": 6.464565301857899e-08, "logits/chosen": -1.9797182083129883, "logits/rejected": -1.977948546409607, "logps/chosen": -3.710902214050293, "logps/rejected": -1.5715949535369873, "loss": 0.6475, "rewards/accuracies": 1.0, "rewards/chosen": 1.156819224357605, "rewards/margins": 0.0934906005859375, "rewards/rejected": 1.0633286237716675, "step": 2354 }, { "epoch": 1.27, "learning_rate": 6.461781180203517e-08, "logits/chosen": -2.2019119262695312, "logits/rejected": -2.0296857357025146, "logps/chosen": -38.875022888183594, "logps/rejected": -3.1379501819610596, "loss": 0.1323, "rewards/accuracies": 1.0, "rewards/chosen": 2.482752561569214, "rewards/margins": 1.9560811519622803, "rewards/rejected": 0.5266714096069336, "step": 2355 }, { "epoch": 1.27, "learning_rate": 6.458996562871643e-08, "logits/chosen": -1.8964014053344727, "logits/rejected": -1.9055609703063965, "logps/chosen": -2.383279323577881, "logps/rejected": -4.9813079833984375, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.2350765466690063, "rewards/margins": 0.7192804217338562, "rewards/rejected": 0.5157961249351501, "step": 2356 }, { "epoch": 1.27, "learning_rate": 6.456211450806513e-08, "logits/chosen": -2.0761890411376953, "logits/rejected": -2.0912845134735107, "logps/chosen": -8.713582992553711, "logps/rejected": -5.311840057373047, "loss": 0.4376, "rewards/accuracies": 1.0, "rewards/chosen": 1.488172173500061, "rewards/margins": 0.5995710492134094, "rewards/rejected": 0.8886011242866516, "step": 2357 }, { "epoch": 1.27, "learning_rate": 6.453425844952535e-08, "logits/chosen": -2.067079544067383, "logits/rejected": -2.2720563411712646, "logps/chosen": -4.077369213104248, "logps/rejected": -2.6376898288726807, "loss": 0.783, "rewards/accuracies": 0.0, "rewards/chosen": 0.9236038327217102, "rewards/margins": -0.17227715253829956, "rewards/rejected": 1.0958809852600098, "step": 2358 }, { "epoch": 1.27, "learning_rate": 6.450639746254282e-08, "logits/chosen": -2.015049934387207, "logits/rejected": -2.0236165523529053, "logps/chosen": -1.579517126083374, "logps/rejected": -3.5255367755889893, "loss": 0.3242, "rewards/accuracies": 1.0, "rewards/chosen": 1.5107357501983643, "rewards/margins": 0.9597705006599426, "rewards/rejected": 0.5509652495384216, "step": 2359 }, { "epoch": 1.27, "learning_rate": 6.447853155656502e-08, "logits/chosen": -2.1284215450286865, "logits/rejected": -2.136439323425293, "logps/chosen": -2.153738021850586, "logps/rejected": -3.7071499824523926, "loss": 0.4237, "rewards/accuracies": 1.0, "rewards/chosen": 1.2999323606491089, "rewards/margins": 0.6394940614700317, "rewards/rejected": 0.6604382991790771, "step": 2360 }, { "epoch": 1.27, "learning_rate": 6.445066074104101e-08, "logits/chosen": -2.032588243484497, "logits/rejected": -2.224867343902588, "logps/chosen": -0.814393937587738, "logps/rejected": -0.825635552406311, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427124261856079, "rewards/margins": 0.03176039457321167, "rewards/rejected": 0.9109520316123962, "step": 2361 }, { "epoch": 1.27, "learning_rate": 6.442278502542154e-08, "logits/chosen": -2.093492031097412, "logits/rejected": -2.3492863178253174, "logps/chosen": -1.486842393875122, "logps/rejected": -1.4269055128097534, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 1.0626686811447144, "rewards/margins": -0.005052924156188965, "rewards/rejected": 1.0677216053009033, "step": 2362 }, { "epoch": 1.27, "learning_rate": 6.439490441915904e-08, "logits/chosen": -2.138612747192383, "logits/rejected": -2.2425525188446045, "logps/chosen": -2.4662623405456543, "logps/rejected": -0.8896573781967163, "loss": 0.6582, "rewards/accuracies": 1.0, "rewards/chosen": 0.936323344707489, "rewards/margins": 0.07122981548309326, "rewards/rejected": 0.8650935292243958, "step": 2363 }, { "epoch": 1.28, "learning_rate": 6.436701893170756e-08, "logits/chosen": -2.076427459716797, "logits/rejected": -2.255923271179199, "logps/chosen": -0.5389376878738403, "logps/rejected": -0.4690001904964447, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7373596429824829, "rewards/margins": 0.01301884651184082, "rewards/rejected": 0.7243407964706421, "step": 2364 }, { "epoch": 1.28, "learning_rate": 6.433912857252285e-08, "logits/chosen": -2.125544309616089, "logits/rejected": -2.248445510864258, "logps/chosen": -8.15198802947998, "logps/rejected": -8.249200820922852, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0828101634979248, "rewards/margins": 0.019279956817626953, "rewards/rejected": 1.0635302066802979, "step": 2365 }, { "epoch": 1.28, "learning_rate": 6.43112333510623e-08, "logits/chosen": -2.240164041519165, "logits/rejected": -2.3134689331054688, "logps/chosen": -2.393692970275879, "logps/rejected": -2.4570231437683105, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.7597153782844543, "rewards/margins": 0.003259897232055664, "rewards/rejected": 0.7564554810523987, "step": 2366 }, { "epoch": 1.28, "learning_rate": 6.428333327678494e-08, "logits/chosen": -2.0857625007629395, "logits/rejected": -2.0958147048950195, "logps/chosen": -2.105186700820923, "logps/rejected": -2.3305060863494873, "loss": 0.4914, "rewards/accuracies": 1.0, "rewards/chosen": 1.1826761960983276, "rewards/margins": 0.45488154888153076, "rewards/rejected": 0.7277946472167969, "step": 2367 }, { "epoch": 1.28, "learning_rate": 6.425542835915141e-08, "logits/chosen": -2.0994760990142822, "logits/rejected": -2.2521228790283203, "logps/chosen": -0.5009051561355591, "logps/rejected": -0.5208449363708496, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.8316103219985962, "rewards/margins": -0.0065389275550842285, "rewards/rejected": 0.8381492495536804, "step": 2368 }, { "epoch": 1.28, "learning_rate": 6.422751860762406e-08, "logits/chosen": -1.9818817377090454, "logits/rejected": -1.9853042364120483, "logps/chosen": -1.2550439834594727, "logps/rejected": -4.2533416748046875, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 1.03713059425354, "rewards/margins": 0.44565653800964355, "rewards/rejected": 0.5914740562438965, "step": 2369 }, { "epoch": 1.28, "learning_rate": 6.419960403166685e-08, "logits/chosen": -2.173431634902954, "logits/rejected": -2.239198923110962, "logps/chosen": -0.5107994675636292, "logps/rejected": -0.48633354902267456, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 1.035846471786499, "rewards/margins": 0.004948854446411133, "rewards/rejected": 1.030897617340088, "step": 2370 }, { "epoch": 1.28, "learning_rate": 6.417168464074537e-08, "logits/chosen": -1.9779975414276123, "logits/rejected": -1.983555793762207, "logps/chosen": -5.211871147155762, "logps/rejected": -1.259674072265625, "loss": 0.2297, "rewards/accuracies": 1.0, "rewards/chosen": 1.9058640003204346, "rewards/margins": 1.3539838790893555, "rewards/rejected": 0.5518801808357239, "step": 2371 }, { "epoch": 1.28, "learning_rate": 6.414376044432685e-08, "logits/chosen": -2.0134260654449463, "logits/rejected": -2.263558864593506, "logps/chosen": -0.32792502641677856, "logps/rejected": -0.43016546964645386, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9765878915786743, "rewards/margins": 0.00869673490524292, "rewards/rejected": 0.9678911566734314, "step": 2372 }, { "epoch": 1.28, "learning_rate": 6.411583145188013e-08, "logits/chosen": -2.1270992755889893, "logits/rejected": -2.291095733642578, "logps/chosen": -0.6725678443908691, "logps/rejected": -0.6760590076446533, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8470655679702759, "rewards/margins": 0.0268515944480896, "rewards/rejected": 0.8202139735221863, "step": 2373 }, { "epoch": 1.28, "learning_rate": 6.40878976728757e-08, "logits/chosen": -2.00227952003479, "logits/rejected": -2.257689952850342, "logps/chosen": -0.530293345451355, "logps/rejected": -0.5454487800598145, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 1.008670449256897, "rewards/margins": 0.0061223506927490234, "rewards/rejected": 1.002548098564148, "step": 2374 }, { "epoch": 1.28, "learning_rate": 6.405995911678568e-08, "logits/chosen": -1.982125163078308, "logits/rejected": -1.982244849205017, "logps/chosen": -1.2009307146072388, "logps/rejected": -1.4285694360733032, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9560087323188782, "rewards/margins": 0.23009562492370605, "rewards/rejected": 0.7259131073951721, "step": 2375 }, { "epoch": 1.28, "learning_rate": 6.403201579308378e-08, "logits/chosen": -2.1329896450042725, "logits/rejected": -2.1870484352111816, "logps/chosen": -12.83260726928711, "logps/rejected": -20.95863914489746, "loss": 0.3849, "rewards/accuracies": 1.0, "rewards/chosen": 1.5529597997665405, "rewards/margins": 0.7561410665512085, "rewards/rejected": 0.796818733215332, "step": 2376 }, { "epoch": 1.28, "learning_rate": 6.400406771124536e-08, "logits/chosen": -2.214115619659424, "logits/rejected": -2.041116952896118, "logps/chosen": -52.277587890625, "logps/rejected": -11.771147727966309, "loss": 0.2468, "rewards/accuracies": 1.0, "rewards/chosen": 2.178607225418091, "rewards/margins": 1.2731504440307617, "rewards/rejected": 0.9054568409919739, "step": 2377 }, { "epoch": 1.28, "learning_rate": 6.397611488074734e-08, "logits/chosen": -2.096708059310913, "logits/rejected": -2.0639395713806152, "logps/chosen": -5.705142021179199, "logps/rejected": -5.1450114250183105, "loss": 0.4151, "rewards/accuracies": 1.0, "rewards/chosen": 1.1634777784347534, "rewards/margins": 0.6646307110786438, "rewards/rejected": 0.4988470673561096, "step": 2378 }, { "epoch": 1.28, "learning_rate": 6.39481573110683e-08, "logits/chosen": -2.0810444355010986, "logits/rejected": -2.2674500942230225, "logps/chosen": -1.9913355112075806, "logps/rejected": -2.862212657928467, "loss": 0.6605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9645406007766724, "rewards/margins": 0.06638920307159424, "rewards/rejected": 0.8981513977050781, "step": 2379 }, { "epoch": 1.28, "learning_rate": 6.392019501168844e-08, "logits/chosen": -2.039829730987549, "logits/rejected": -2.0374679565429688, "logps/chosen": -1.213713526725769, "logps/rejected": -5.025829315185547, "loss": 0.3614, "rewards/accuracies": 1.0, "rewards/chosen": 1.2989925146102905, "rewards/margins": 0.831580638885498, "rewards/rejected": 0.46741190552711487, "step": 2380 }, { "epoch": 1.28, "learning_rate": 6.389222799208951e-08, "logits/chosen": -2.026054859161377, "logits/rejected": -2.0284411907196045, "logps/chosen": -1.5102224349975586, "logps/rejected": -2.091186761856079, "loss": 0.505, "rewards/accuracies": 1.0, "rewards/chosen": 1.075966715812683, "rewards/margins": 0.42004334926605225, "rewards/rejected": 0.6559233665466309, "step": 2381 }, { "epoch": 1.28, "learning_rate": 6.38642562617549e-08, "logits/chosen": -2.0903191566467285, "logits/rejected": -2.084136724472046, "logps/chosen": -8.39231014251709, "logps/rejected": -4.040075302124023, "loss": 0.3001, "rewards/accuracies": 1.0, "rewards/chosen": 1.5638631582260132, "rewards/margins": 1.0499241352081299, "rewards/rejected": 0.5139390826225281, "step": 2382 }, { "epoch": 1.29, "learning_rate": 6.383627983016956e-08, "logits/chosen": -2.2265381813049316, "logits/rejected": -2.4009850025177, "logps/chosen": -11.399280548095703, "logps/rejected": -27.450037002563477, "loss": 0.6422, "rewards/accuracies": 1.0, "rewards/chosen": 1.0273441076278687, "rewards/margins": 0.10467547178268433, "rewards/rejected": 0.9226686358451843, "step": 2383 }, { "epoch": 1.29, "learning_rate": 6.380829870682008e-08, "logits/chosen": -2.046600580215454, "logits/rejected": -2.293088674545288, "logps/chosen": -0.47804009914398193, "logps/rejected": -0.5861184597015381, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.7743802666664124, "rewards/margins": -0.011778295040130615, "rewards/rejected": 0.786158561706543, "step": 2384 }, { "epoch": 1.29, "learning_rate": 6.378031290119463e-08, "logits/chosen": -1.9871337413787842, "logits/rejected": -1.9928693771362305, "logps/chosen": -1.384611964225769, "logps/rejected": -4.588475704193115, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.0148924589157104, "rewards/margins": 0.6033275127410889, "rewards/rejected": 0.41156497597694397, "step": 2385 }, { "epoch": 1.29, "learning_rate": 6.375232242278295e-08, "logits/chosen": -2.0263876914978027, "logits/rejected": -2.0172595977783203, "logps/chosen": -4.451621055603027, "logps/rejected": -2.1655166149139404, "loss": 0.3841, "rewards/accuracies": 1.0, "rewards/chosen": 1.6198147535324097, "rewards/margins": 0.7587419152259827, "rewards/rejected": 0.861072838306427, "step": 2386 }, { "epoch": 1.29, "learning_rate": 6.372432728107635e-08, "logits/chosen": -2.0274486541748047, "logits/rejected": -2.0290000438690186, "logps/chosen": -2.7470593452453613, "logps/rejected": -0.8746978044509888, "loss": 0.6301, "rewards/accuracies": 1.0, "rewards/chosen": 1.0828274488449097, "rewards/margins": 0.13037467002868652, "rewards/rejected": 0.9524527788162231, "step": 2387 }, { "epoch": 1.29, "learning_rate": 6.369632748556776e-08, "logits/chosen": -2.0775492191314697, "logits/rejected": -2.271613836288452, "logps/chosen": -1.4340510368347168, "logps/rejected": -1.530196189880371, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.6013836860656738, "rewards/margins": 0.031215310096740723, "rewards/rejected": 0.5701683759689331, "step": 2388 }, { "epoch": 1.29, "learning_rate": 6.366832304575167e-08, "logits/chosen": -2.0661017894744873, "logits/rejected": -2.065471649169922, "logps/chosen": -1.0425299406051636, "logps/rejected": -1.5744305849075317, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.8994984030723572, "rewards/margins": 0.01846468448638916, "rewards/rejected": 0.881033718585968, "step": 2389 }, { "epoch": 1.29, "learning_rate": 6.364031397112415e-08, "logits/chosen": -2.0359902381896973, "logits/rejected": -2.2818362712860107, "logps/chosen": -0.5800485610961914, "logps/rejected": -0.6127195358276367, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8335291743278503, "rewards/margins": 0.02102142572402954, "rewards/rejected": 0.8125077486038208, "step": 2390 }, { "epoch": 1.29, "learning_rate": 6.361230027118283e-08, "logits/chosen": -2.1216201782226562, "logits/rejected": -2.1215128898620605, "logps/chosen": -2.2762451171875, "logps/rejected": -1.6646621227264404, "loss": 0.4866, "rewards/accuracies": 1.0, "rewards/chosen": 1.1150792837142944, "rewards/margins": 0.46708351373672485, "rewards/rejected": 0.6479957699775696, "step": 2391 }, { "epoch": 1.29, "learning_rate": 6.358428195542692e-08, "logits/chosen": -2.0875205993652344, "logits/rejected": -2.2297215461730957, "logps/chosen": -0.6279772520065308, "logps/rejected": -0.6306979656219482, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9325723052024841, "rewards/margins": 0.01775801181793213, "rewards/rejected": 0.914814293384552, "step": 2392 }, { "epoch": 1.29, "learning_rate": 6.355625903335718e-08, "logits/chosen": -2.020772695541382, "logits/rejected": -2.233567476272583, "logps/chosen": -0.5083768963813782, "logps/rejected": -0.5090899467468262, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9629988074302673, "rewards/margins": 0.0070590972900390625, "rewards/rejected": 0.9559397101402283, "step": 2393 }, { "epoch": 1.29, "learning_rate": 6.352823151447597e-08, "logits/chosen": -2.0117266178131104, "logits/rejected": -2.048717737197876, "logps/chosen": -1.940410852432251, "logps/rejected": -9.836496353149414, "loss": 0.312, "rewards/accuracies": 1.0, "rewards/chosen": 1.4787330627441406, "rewards/margins": 1.0047292709350586, "rewards/rejected": 0.47400379180908203, "step": 2394 }, { "epoch": 1.29, "learning_rate": 6.350019940828717e-08, "logits/chosen": -2.023761510848999, "logits/rejected": -2.0265729427337646, "logps/chosen": -0.5371900200843811, "logps/rejected": -3.5362801551818848, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8820180892944336, "rewards/margins": 0.4539578855037689, "rewards/rejected": 0.4280602037906647, "step": 2395 }, { "epoch": 1.29, "learning_rate": 6.347216272429621e-08, "logits/chosen": -2.0934016704559326, "logits/rejected": -2.095900774002075, "logps/chosen": -0.4303903579711914, "logps/rejected": -4.355515956878662, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 0.9736530184745789, "rewards/margins": 0.5501708388328552, "rewards/rejected": 0.42348217964172363, "step": 2396 }, { "epoch": 1.29, "learning_rate": 6.34441214720101e-08, "logits/chosen": -2.091747283935547, "logits/rejected": -2.279473066329956, "logps/chosen": -11.377679824829102, "logps/rejected": -0.494662880897522, "loss": 0.839, "rewards/accuracies": 0.0, "rewards/chosen": 0.6479778289794922, "rewards/margins": -0.2731098532676697, "rewards/rejected": 0.9210876822471619, "step": 2397 }, { "epoch": 1.29, "learning_rate": 6.34160756609374e-08, "logits/chosen": -1.9765406847000122, "logits/rejected": -2.2237324714660645, "logps/chosen": -1.39213228225708, "logps/rejected": -1.3988182544708252, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.808933675289154, "rewards/margins": -0.007932662963867188, "rewards/rejected": 0.8168663382530212, "step": 2398 }, { "epoch": 1.29, "learning_rate": 6.338802530058819e-08, "logits/chosen": -2.0994508266448975, "logits/rejected": -2.315843105316162, "logps/chosen": -2.4171502590179443, "logps/rejected": -1.2655813694000244, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9862327575683594, "rewards/margins": 0.020604968070983887, "rewards/rejected": 0.9656277894973755, "step": 2399 }, { "epoch": 1.29, "learning_rate": 6.335997040047412e-08, "logits/chosen": -2.0255637168884277, "logits/rejected": -2.0161194801330566, "logps/chosen": -6.258500099182129, "logps/rejected": -5.40966272354126, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 1.1188791990280151, "rewards/margins": 0.5682545900344849, "rewards/rejected": 0.5506246089935303, "step": 2400 }, { "epoch": 1.3, "learning_rate": 6.333191097010838e-08, "logits/chosen": -2.012596607208252, "logits/rejected": -2.011752128601074, "logps/chosen": -1.0810679197311401, "logps/rejected": -3.4133763313293457, "loss": 0.5415, "rewards/accuracies": 1.0, "rewards/chosen": 1.1154556274414062, "rewards/margins": 0.3304898738861084, "rewards/rejected": 0.7849657535552979, "step": 2401 }, { "epoch": 1.3, "learning_rate": 6.330384701900563e-08, "logits/chosen": -2.1258325576782227, "logits/rejected": -2.130811929702759, "logps/chosen": -2.6318206787109375, "logps/rejected": -14.548650741577148, "loss": 0.2558, "rewards/accuracies": 1.0, "rewards/chosen": 1.5204445123672485, "rewards/margins": 1.2326513528823853, "rewards/rejected": 0.2877931594848633, "step": 2402 }, { "epoch": 1.3, "learning_rate": 6.327577855668214e-08, "logits/chosen": -2.0734598636627197, "logits/rejected": -2.063096284866333, "logps/chosen": -1.226809024810791, "logps/rejected": -10.475341796875, "loss": 0.5711, "rewards/accuracies": 1.0, "rewards/chosen": 1.0017906427383423, "rewards/margins": 0.26106685400009155, "rewards/rejected": 0.7407237887382507, "step": 2403 }, { "epoch": 1.3, "learning_rate": 6.324770559265573e-08, "logits/chosen": -2.0527241230010986, "logits/rejected": -2.0579285621643066, "logps/chosen": -3.031414031982422, "logps/rejected": -12.591275215148926, "loss": 0.3893, "rewards/accuracies": 1.0, "rewards/chosen": 1.058882236480713, "rewards/margins": 0.742356538772583, "rewards/rejected": 0.3165256679058075, "step": 2404 }, { "epoch": 1.3, "learning_rate": 6.321962813644566e-08, "logits/chosen": -2.028315782546997, "logits/rejected": -2.2577171325683594, "logps/chosen": -0.45548897981643677, "logps/rejected": -0.44042232632637024, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.8636872172355652, "rewards/margins": 0.0017482638359069824, "rewards/rejected": 0.8619389533996582, "step": 2405 }, { "epoch": 1.3, "learning_rate": 6.319154619757276e-08, "logits/chosen": -2.075657367706299, "logits/rejected": -2.072850465774536, "logps/chosen": -5.591738700866699, "logps/rejected": -5.188739776611328, "loss": 0.304, "rewards/accuracies": 1.0, "rewards/chosen": 1.660833716392517, "rewards/margins": 1.0349242687225342, "rewards/rejected": 0.6259094476699829, "step": 2406 }, { "epoch": 1.3, "learning_rate": 6.316345978555936e-08, "logits/chosen": -2.072096586227417, "logits/rejected": -2.0434439182281494, "logps/chosen": -6.188425540924072, "logps/rejected": -3.632234573364258, "loss": 0.4804, "rewards/accuracies": 1.0, "rewards/chosen": 1.1795908212661743, "rewards/margins": 0.48323899507522583, "rewards/rejected": 0.6963518261909485, "step": 2407 }, { "epoch": 1.3, "learning_rate": 6.313536890992934e-08, "logits/chosen": -2.018677234649658, "logits/rejected": -2.0195798873901367, "logps/chosen": -0.4598856568336487, "logps/rejected": -2.8029372692108154, "loss": 0.5516, "rewards/accuracies": 1.0, "rewards/chosen": 1.0190898180007935, "rewards/margins": 0.30658429861068726, "rewards/rejected": 0.7125055193901062, "step": 2408 }, { "epoch": 1.3, "learning_rate": 6.310727358020809e-08, "logits/chosen": -2.0100536346435547, "logits/rejected": -2.011955976486206, "logps/chosen": -1.5223867893218994, "logps/rejected": -5.723467826843262, "loss": 0.5162, "rewards/accuracies": 1.0, "rewards/chosen": 0.8937996029853821, "rewards/margins": 0.39200639724731445, "rewards/rejected": 0.5017932057380676, "step": 2409 }, { "epoch": 1.3, "learning_rate": 6.307917380592247e-08, "logits/chosen": -2.0765228271484375, "logits/rejected": -2.1386101245880127, "logps/chosen": -3.5423831939697266, "logps/rejected": -15.474494934082031, "loss": 0.3465, "rewards/accuracies": 1.0, "rewards/chosen": 1.57376229763031, "rewards/margins": 0.8815452456474304, "rewards/rejected": 0.6922170519828796, "step": 2410 }, { "epoch": 1.3, "learning_rate": 6.305106959660089e-08, "logits/chosen": -2.001099109649658, "logits/rejected": -1.9701297283172607, "logps/chosen": -15.450552940368652, "logps/rejected": -2.2930564880371094, "loss": 0.392, "rewards/accuracies": 1.0, "rewards/chosen": 1.5412389039993286, "rewards/margins": 0.7340176701545715, "rewards/rejected": 0.8072212338447571, "step": 2411 }, { "epoch": 1.3, "learning_rate": 6.302296096177324e-08, "logits/chosen": -2.0054123401641846, "logits/rejected": -2.0884757041931152, "logps/chosen": -2.579730272293091, "logps/rejected": -22.002588272094727, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": 0.8164135813713074, "rewards/margins": 0.1864527463912964, "rewards/rejected": 0.629960834980011, "step": 2412 }, { "epoch": 1.3, "learning_rate": 6.299484791097092e-08, "logits/chosen": -2.04205322265625, "logits/rejected": -2.2723004817962646, "logps/chosen": -0.48330196738243103, "logps/rejected": -0.5233960151672363, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.7744985818862915, "rewards/margins": 0.011755704879760742, "rewards/rejected": 0.7627428770065308, "step": 2413 }, { "epoch": 1.3, "learning_rate": 6.296673045372681e-08, "logits/chosen": -2.1964080333709717, "logits/rejected": -2.202977180480957, "logps/chosen": -1.7113527059555054, "logps/rejected": -3.837315320968628, "loss": 0.4828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9846733212471008, "rewards/margins": 0.47695815563201904, "rewards/rejected": 0.5077151656150818, "step": 2414 }, { "epoch": 1.3, "learning_rate": 6.293860859957534e-08, "logits/chosen": -1.9686338901519775, "logits/rejected": -1.9635263681411743, "logps/chosen": -7.701697826385498, "logps/rejected": -3.527764081954956, "loss": 0.3085, "rewards/accuracies": 1.0, "rewards/chosen": 1.624802827835083, "rewards/margins": 1.017822504043579, "rewards/rejected": 0.6069802641868591, "step": 2415 }, { "epoch": 1.3, "learning_rate": 6.291048235805233e-08, "logits/chosen": -2.006721258163452, "logits/rejected": -2.3237035274505615, "logps/chosen": -6.0113205909729, "logps/rejected": -2.166569948196411, "loss": 0.7636, "rewards/accuracies": 0.0, "rewards/chosen": 0.8446653485298157, "rewards/margins": -0.1363547444343567, "rewards/rejected": 0.9810200929641724, "step": 2416 }, { "epoch": 1.3, "learning_rate": 6.28823517386952e-08, "logits/chosen": -2.0188350677490234, "logits/rejected": -2.0129332542419434, "logps/chosen": -2.633737087249756, "logps/rejected": -5.479271411895752, "loss": 0.3635, "rewards/accuracies": 1.0, "rewards/chosen": 1.2429016828536987, "rewards/margins": 0.8247106671333313, "rewards/rejected": 0.41819101572036743, "step": 2417 }, { "epoch": 1.3, "learning_rate": 6.285421675104277e-08, "logits/chosen": -2.1094517707824707, "logits/rejected": -2.1098923683166504, "logps/chosen": -2.1431636810302734, "logps/rejected": -1.5430487394332886, "loss": 0.6181, "rewards/accuracies": 1.0, "rewards/chosen": 1.108993411064148, "rewards/margins": 0.15613657236099243, "rewards/rejected": 0.9528568387031555, "step": 2418 }, { "epoch": 1.3, "learning_rate": 6.282607740463542e-08, "logits/chosen": -2.121701955795288, "logits/rejected": -2.2904365062713623, "logps/chosen": -3.9264321327209473, "logps/rejected": -4.422713279724121, "loss": 0.8201, "rewards/accuracies": 0.0, "rewards/chosen": 0.8232866525650024, "rewards/margins": -0.23962199687957764, "rewards/rejected": 1.06290864944458, "step": 2419 }, { "epoch": 1.31, "learning_rate": 6.27979337090149e-08, "logits/chosen": -2.1013333797454834, "logits/rejected": -2.0685112476348877, "logps/chosen": -3.1845293045043945, "logps/rejected": -3.1039204597473145, "loss": 0.4639, "rewards/accuracies": 1.0, "rewards/chosen": 1.122490644454956, "rewards/margins": 0.527308464050293, "rewards/rejected": 0.5951821804046631, "step": 2420 }, { "epoch": 1.31, "learning_rate": 6.276978567372453e-08, "logits/chosen": -2.0661697387695312, "logits/rejected": -2.071699380874634, "logps/chosen": -1.4315199851989746, "logps/rejected": -3.1982171535491943, "loss": 0.4236, "rewards/accuracies": 1.0, "rewards/chosen": 1.1466608047485352, "rewards/margins": 0.6398048400878906, "rewards/rejected": 0.5068559646606445, "step": 2421 }, { "epoch": 1.31, "learning_rate": 6.274163330830906e-08, "logits/chosen": -2.161931037902832, "logits/rejected": -2.1386642456054688, "logps/chosen": -9.982377052307129, "logps/rejected": -6.941145420074463, "loss": 0.2992, "rewards/accuracies": 1.0, "rewards/chosen": 1.682777762413025, "rewards/margins": 1.0531375408172607, "rewards/rejected": 0.6296401619911194, "step": 2422 }, { "epoch": 1.31, "learning_rate": 6.271347662231472e-08, "logits/chosen": -2.0060105323791504, "logits/rejected": -2.2110583782196045, "logps/chosen": -1.0909055471420288, "logps/rejected": -1.0786168575286865, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 0.8271016478538513, "rewards/margins": -0.0017154216766357422, "rewards/rejected": 0.8288170695304871, "step": 2423 }, { "epoch": 1.31, "learning_rate": 6.26853156252892e-08, "logits/chosen": -2.1561245918273926, "logits/rejected": -2.307133436203003, "logps/chosen": -2.2885286808013916, "logps/rejected": -2.298783540725708, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9614160656929016, "rewards/margins": 0.023213684558868408, "rewards/rejected": 0.9382023811340332, "step": 2424 }, { "epoch": 1.31, "learning_rate": 6.265715032678168e-08, "logits/chosen": -2.028184652328491, "logits/rejected": -2.028181791305542, "logps/chosen": -2.462153911590576, "logps/rejected": -4.7516889572143555, "loss": 0.6116, "rewards/accuracies": 1.0, "rewards/chosen": 1.081363558769226, "rewards/margins": 0.17041391134262085, "rewards/rejected": 0.9109496474266052, "step": 2425 }, { "epoch": 1.31, "learning_rate": 6.262898073634271e-08, "logits/chosen": -2.0604240894317627, "logits/rejected": -2.0510082244873047, "logps/chosen": -8.067115783691406, "logps/rejected": -0.8130238056182861, "loss": 0.4916, "rewards/accuracies": 1.0, "rewards/chosen": 1.4784780740737915, "rewards/margins": 0.45425355434417725, "rewards/rejected": 1.0242245197296143, "step": 2426 }, { "epoch": 1.31, "learning_rate": 6.26008068635244e-08, "logits/chosen": -2.0537407398223877, "logits/rejected": -2.094691038131714, "logps/chosen": -3.7724812030792236, "logps/rejected": -9.372695922851562, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 1.4626401662826538, "rewards/margins": 0.7752755880355835, "rewards/rejected": 0.6873645782470703, "step": 2427 }, { "epoch": 1.31, "learning_rate": 6.257262871788028e-08, "logits/chosen": -2.059412956237793, "logits/rejected": -2.053920269012451, "logps/chosen": -8.319869041442871, "logps/rejected": -8.105177879333496, "loss": 0.3131, "rewards/accuracies": 1.0, "rewards/chosen": 1.5026116371154785, "rewards/margins": 1.0004209280014038, "rewards/rejected": 0.5021907091140747, "step": 2428 }, { "epoch": 1.31, "learning_rate": 6.254444630896528e-08, "logits/chosen": -2.109168529510498, "logits/rejected": -2.2496910095214844, "logps/chosen": -0.6101630926132202, "logps/rejected": -0.5765738487243652, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9895410537719727, "rewards/margins": 0.029584288597106934, "rewards/rejected": 0.9599567651748657, "step": 2429 }, { "epoch": 1.31, "learning_rate": 6.251625964633585e-08, "logits/chosen": -2.093508243560791, "logits/rejected": -2.316558599472046, "logps/chosen": -1.2950795888900757, "logps/rejected": -1.221889615058899, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1618866920471191, "rewards/margins": 0.03242695331573486, "rewards/rejected": 1.1294597387313843, "step": 2430 }, { "epoch": 1.31, "learning_rate": 6.248806873954982e-08, "logits/chosen": -2.045961856842041, "logits/rejected": -2.2377729415893555, "logps/chosen": -0.5662865042686462, "logps/rejected": -0.6434153914451599, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8939256072044373, "rewards/margins": -0.007395625114440918, "rewards/rejected": 0.9013212323188782, "step": 2431 }, { "epoch": 1.31, "learning_rate": 6.24598735981665e-08, "logits/chosen": -1.9748557806015015, "logits/rejected": -2.3031795024871826, "logps/chosen": -3.2692551612854004, "logps/rejected": -6.2589006423950195, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.7443946599960327, "rewards/margins": 0.22973448038101196, "rewards/rejected": 0.5146601796150208, "step": 2432 }, { "epoch": 1.31, "learning_rate": 6.243167423174661e-08, "logits/chosen": -2.098139762878418, "logits/rejected": -2.0979068279266357, "logps/chosen": -4.33754825592041, "logps/rejected": -3.128962755203247, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.1275115013122559, "rewards/margins": 0.5009852051734924, "rewards/rejected": 0.6265262961387634, "step": 2433 }, { "epoch": 1.31, "learning_rate": 6.240347064985234e-08, "logits/chosen": -2.0583536624908447, "logits/rejected": -2.2690916061401367, "logps/chosen": -7.523125648498535, "logps/rejected": -8.007200241088867, "loss": 0.663, "rewards/accuracies": 1.0, "rewards/chosen": 0.7116912007331848, "rewards/margins": 0.06116020679473877, "rewards/rejected": 0.650530993938446, "step": 2434 }, { "epoch": 1.31, "learning_rate": 6.237526286204725e-08, "logits/chosen": -1.9653269052505493, "logits/rejected": -2.2459614276885986, "logps/chosen": -1.806064248085022, "logps/rejected": -1.88556706905365, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.89532470703125, "rewards/margins": 0.010321974754333496, "rewards/rejected": 0.8850027322769165, "step": 2435 }, { "epoch": 1.31, "learning_rate": 6.234705087789637e-08, "logits/chosen": -2.0769691467285156, "logits/rejected": -2.228654146194458, "logps/chosen": -0.31864744424819946, "logps/rejected": -0.315062940120697, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9015669226646423, "rewards/margins": 0.018753349781036377, "rewards/rejected": 0.882813572883606, "step": 2436 }, { "epoch": 1.31, "learning_rate": 6.231883470696615e-08, "logits/chosen": -2.1847352981567383, "logits/rejected": -2.1891093254089355, "logps/chosen": -1.130293846130371, "logps/rejected": -3.3649966716766357, "loss": 0.4673, "rewards/accuracies": 1.0, "rewards/chosen": 0.9520745277404785, "rewards/margins": 0.5180947184562683, "rewards/rejected": 0.4339798092842102, "step": 2437 }, { "epoch": 1.31, "learning_rate": 6.229061435882445e-08, "logits/chosen": -2.0471324920654297, "logits/rejected": -2.0542359352111816, "logps/chosen": -2.298133134841919, "logps/rejected": -4.521170616149902, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 0.9843353629112244, "rewards/margins": 0.4438226819038391, "rewards/rejected": 0.5405126810073853, "step": 2438 }, { "epoch": 1.32, "learning_rate": 6.226238984304055e-08, "logits/chosen": -1.9755206108093262, "logits/rejected": -1.9758234024047852, "logps/chosen": -0.4812053143978119, "logps/rejected": -3.341482639312744, "loss": 0.5463, "rewards/accuracies": 1.0, "rewards/chosen": 0.9833216071128845, "rewards/margins": 0.31895893812179565, "rewards/rejected": 0.6643626689910889, "step": 2439 }, { "epoch": 1.32, "learning_rate": 6.223416116918514e-08, "logits/chosen": -2.0533618927001953, "logits/rejected": -2.044398069381714, "logps/chosen": -4.74256706237793, "logps/rejected": -4.54332160949707, "loss": 0.3851, "rewards/accuracies": 1.0, "rewards/chosen": 1.2916167974472046, "rewards/margins": 0.7554325461387634, "rewards/rejected": 0.5361842513084412, "step": 2440 }, { "epoch": 1.32, "learning_rate": 6.220592834683032e-08, "logits/chosen": -2.061136484146118, "logits/rejected": -2.306623935699463, "logps/chosen": -2.8695740699768066, "logps/rejected": -9.042963027954102, "loss": 0.5454, "rewards/accuracies": 1.0, "rewards/chosen": 0.7468231916427612, "rewards/margins": 0.3212798833847046, "rewards/rejected": 0.42554330825805664, "step": 2441 }, { "epoch": 1.32, "learning_rate": 6.217769138554959e-08, "logits/chosen": -1.9976162910461426, "logits/rejected": -1.9984439611434937, "logps/chosen": -0.7194930911064148, "logps/rejected": -3.8986830711364746, "loss": 0.4819, "rewards/accuracies": 1.0, "rewards/chosen": 1.0241115093231201, "rewards/margins": 0.47937917709350586, "rewards/rejected": 0.5447323322296143, "step": 2442 }, { "epoch": 1.32, "learning_rate": 6.21494502949179e-08, "logits/chosen": -2.1030800342559814, "logits/rejected": -2.2970187664031982, "logps/chosen": -1.0576727390289307, "logps/rejected": -17.27750587463379, "loss": 0.5461, "rewards/accuracies": 1.0, "rewards/chosen": 1.2129430770874023, "rewards/margins": 0.3195316195487976, "rewards/rejected": 0.8934114575386047, "step": 2443 }, { "epoch": 1.32, "learning_rate": 6.21212050845115e-08, "logits/chosen": -2.100616216659546, "logits/rejected": -2.2665789127349854, "logps/chosen": -0.3556515574455261, "logps/rejected": -0.3989945948123932, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.989801824092865, "rewards/margins": 0.022174954414367676, "rewards/rejected": 0.9676268696784973, "step": 2444 }, { "epoch": 1.32, "learning_rate": 6.209295576390815e-08, "logits/chosen": -2.081629514694214, "logits/rejected": -2.2881760597229004, "logps/chosen": -0.3967532515525818, "logps/rejected": -0.42068642377853394, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.8735527396202087, "rewards/margins": 0.02896064519882202, "rewards/rejected": 0.8445920944213867, "step": 2445 }, { "epoch": 1.32, "learning_rate": 6.206470234268694e-08, "logits/chosen": -2.138395309448242, "logits/rejected": -2.1491427421569824, "logps/chosen": -2.0131165981292725, "logps/rejected": -4.222035884857178, "loss": 0.4502, "rewards/accuracies": 1.0, "rewards/chosen": 1.0638444423675537, "rewards/margins": 0.5644848346710205, "rewards/rejected": 0.4993595778942108, "step": 2446 }, { "epoch": 1.32, "learning_rate": 6.203644483042835e-08, "logits/chosen": -2.023076295852661, "logits/rejected": -2.0137338638305664, "logps/chosen": -5.2865729331970215, "logps/rejected": -7.563946723937988, "loss": 0.3087, "rewards/accuracies": 1.0, "rewards/chosen": 1.6098158359527588, "rewards/margins": 1.0172510147094727, "rewards/rejected": 0.5925648808479309, "step": 2447 }, { "epoch": 1.32, "learning_rate": 6.200818323671427e-08, "logits/chosen": -2.0919320583343506, "logits/rejected": -2.293848991394043, "logps/chosen": -0.1965799629688263, "logps/rejected": -0.20961791276931763, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0344003438949585, "rewards/margins": 0.023435115814208984, "rewards/rejected": 1.0109652280807495, "step": 2448 }, { "epoch": 1.32, "learning_rate": 6.197991757112799e-08, "logits/chosen": -2.070591449737549, "logits/rejected": -2.281698226928711, "logps/chosen": -7.099171161651611, "logps/rejected": -0.646809995174408, "loss": 0.7481, "rewards/accuracies": 0.0, "rewards/chosen": 0.7822884321212769, "rewards/margins": -0.10696035623550415, "rewards/rejected": 0.889248788356781, "step": 2449 }, { "epoch": 1.32, "learning_rate": 6.19516478432541e-08, "logits/chosen": -1.959209680557251, "logits/rejected": -2.210101366043091, "logps/chosen": -0.8482319116592407, "logps/rejected": -3.034724712371826, "loss": 0.5847, "rewards/accuracies": 1.0, "rewards/chosen": 1.019700288772583, "rewards/margins": 0.23007172346115112, "rewards/rejected": 0.7896285653114319, "step": 2450 }, { "epoch": 1.32, "learning_rate": 6.192337406267866e-08, "logits/chosen": -2.025846242904663, "logits/rejected": -2.218360662460327, "logps/chosen": -0.5370885133743286, "logps/rejected": -0.5818332433700562, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 1.0087084770202637, "rewards/margins": 0.005699753761291504, "rewards/rejected": 1.0030087232589722, "step": 2451 }, { "epoch": 1.32, "learning_rate": 6.189509623898905e-08, "logits/chosen": -2.0335257053375244, "logits/rejected": -2.0297224521636963, "logps/chosen": -9.193878173828125, "logps/rejected": -2.1047847270965576, "loss": 0.369, "rewards/accuracies": 1.0, "rewards/chosen": 1.4636932611465454, "rewards/margins": 0.8068358302116394, "rewards/rejected": 0.656857430934906, "step": 2452 }, { "epoch": 1.32, "learning_rate": 6.186681438177408e-08, "logits/chosen": -2.0245907306671143, "logits/rejected": -2.193599224090576, "logps/chosen": -0.8359360098838806, "logps/rejected": -0.7821778655052185, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207939267158508, "rewards/margins": 0.02126169204711914, "rewards/rejected": 0.7995322346687317, "step": 2453 }, { "epoch": 1.32, "learning_rate": 6.183852850062384e-08, "logits/chosen": -2.164123296737671, "logits/rejected": -2.1044135093688965, "logps/chosen": -17.649669647216797, "logps/rejected": -10.430261611938477, "loss": 0.1889, "rewards/accuracies": 1.0, "rewards/chosen": 1.9328113794326782, "rewards/margins": 1.5706104040145874, "rewards/rejected": 0.36220094561576843, "step": 2454 }, { "epoch": 1.32, "learning_rate": 6.181023860512984e-08, "logits/chosen": -2.060224771499634, "logits/rejected": -2.2740345001220703, "logps/chosen": -3.329291820526123, "logps/rejected": -8.778236389160156, "loss": 0.7354, "rewards/accuracies": 0.0, "rewards/chosen": 0.781860888004303, "rewards/margins": -0.08271145820617676, "rewards/rejected": 0.8645723462104797, "step": 2455 }, { "epoch": 1.32, "learning_rate": 6.178194470488495e-08, "logits/chosen": -2.050751209259033, "logits/rejected": -2.1195154190063477, "logps/chosen": -2.323281764984131, "logps/rejected": -19.822795867919922, "loss": 0.5298, "rewards/accuracies": 1.0, "rewards/chosen": 1.3472169637680054, "rewards/margins": 0.3585973381996155, "rewards/rejected": 0.9886196255683899, "step": 2456 }, { "epoch": 1.33, "learning_rate": 6.175364680948338e-08, "logits/chosen": -2.0007805824279785, "logits/rejected": -2.2613461017608643, "logps/chosen": -0.878804624080658, "logps/rejected": -1.0194942951202393, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 1.0260857343673706, "rewards/margins": 0.026089608669281006, "rewards/rejected": 0.9999961256980896, "step": 2457 }, { "epoch": 1.33, "learning_rate": 6.172534492852072e-08, "logits/chosen": -2.0323073863983154, "logits/rejected": -2.2091856002807617, "logps/chosen": -0.9967638254165649, "logps/rejected": -1.0014824867248535, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0175403356552124, "rewards/margins": 0.012994766235351562, "rewards/rejected": 1.0045455694198608, "step": 2458 }, { "epoch": 1.33, "learning_rate": 6.169703907159388e-08, "logits/chosen": -2.0478453636169434, "logits/rejected": -2.0391204357147217, "logps/chosen": -9.139364242553711, "logps/rejected": -4.005528450012207, "loss": 0.3523, "rewards/accuracies": 1.0, "rewards/chosen": 1.4505369663238525, "rewards/margins": 0.8620994091033936, "rewards/rejected": 0.588437557220459, "step": 2459 }, { "epoch": 1.33, "learning_rate": 6.166872924830115e-08, "logits/chosen": -2.0546929836273193, "logits/rejected": -2.005352735519409, "logps/chosen": -14.913700103759766, "logps/rejected": -3.354556083679199, "loss": 0.3413, "rewards/accuracies": 1.0, "rewards/chosen": 1.657263159751892, "rewards/margins": 0.8995764255523682, "rewards/rejected": 0.7576867341995239, "step": 2460 }, { "epoch": 1.33, "learning_rate": 6.164041546824213e-08, "logits/chosen": -2.226813554763794, "logits/rejected": -2.148946523666382, "logps/chosen": -23.00444221496582, "logps/rejected": -6.415395259857178, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 1.331403136253357, "rewards/margins": 0.5527129173278809, "rewards/rejected": 0.7786902189254761, "step": 2461 }, { "epoch": 1.33, "learning_rate": 6.161209774101779e-08, "logits/chosen": -2.1321380138397217, "logits/rejected": -2.116828680038452, "logps/chosen": -9.319770812988281, "logps/rejected": -4.132586479187012, "loss": 0.3518, "rewards/accuracies": 1.0, "rewards/chosen": 1.4125114679336548, "rewards/margins": 0.8637465834617615, "rewards/rejected": 0.5487648844718933, "step": 2462 }, { "epoch": 1.33, "learning_rate": 6.158377607623044e-08, "logits/chosen": -2.234896421432495, "logits/rejected": -2.254570245742798, "logps/chosen": -15.491110801696777, "logps/rejected": -17.59037208557129, "loss": 0.5132, "rewards/accuracies": 1.0, "rewards/chosen": 1.5154279470443726, "rewards/margins": 0.3995826244354248, "rewards/rejected": 1.1158453226089478, "step": 2463 }, { "epoch": 1.33, "learning_rate": 6.155545048348367e-08, "logits/chosen": -2.073361873626709, "logits/rejected": -2.0855143070220947, "logps/chosen": -5.666042804718018, "logps/rejected": -5.696463108062744, "loss": 0.3174, "rewards/accuracies": 1.0, "rewards/chosen": 1.8219718933105469, "rewards/margins": 0.9845457077026367, "rewards/rejected": 0.8374261856079102, "step": 2464 }, { "epoch": 1.33, "learning_rate": 6.152712097238251e-08, "logits/chosen": -2.2010886669158936, "logits/rejected": -2.2028467655181885, "logps/chosen": -0.8887845277786255, "logps/rejected": -4.153631210327148, "loss": 0.5028, "rewards/accuracies": 1.0, "rewards/chosen": 1.0167149305343628, "rewards/margins": 0.4257310628890991, "rewards/rejected": 0.5909838676452637, "step": 2465 }, { "epoch": 1.33, "learning_rate": 6.149878755253323e-08, "logits/chosen": -2.0302276611328125, "logits/rejected": -2.031205892562866, "logps/chosen": -1.8000158071517944, "logps/rejected": -0.47353705763816833, "loss": 0.6338, "rewards/accuracies": 1.0, "rewards/chosen": 0.8576481938362122, "rewards/margins": 0.12237942218780518, "rewards/rejected": 0.735268771648407, "step": 2466 }, { "epoch": 1.33, "learning_rate": 6.147045023354342e-08, "logits/chosen": -2.008277177810669, "logits/rejected": -2.0171029567718506, "logps/chosen": -1.9242039918899536, "logps/rejected": -3.046145439147949, "loss": 0.3859, "rewards/accuracies": 1.0, "rewards/chosen": 1.4206011295318604, "rewards/margins": 0.7531704306602478, "rewards/rejected": 0.6674306988716125, "step": 2467 }, { "epoch": 1.33, "learning_rate": 6.144210902502205e-08, "logits/chosen": -2.025733232498169, "logits/rejected": -2.2649664878845215, "logps/chosen": -0.7585643529891968, "logps/rejected": -0.7688046097755432, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8795419931411743, "rewards/margins": 0.031305909156799316, "rewards/rejected": 0.848236083984375, "step": 2468 }, { "epoch": 1.33, "learning_rate": 6.141376393657939e-08, "logits/chosen": -2.1276872158050537, "logits/rejected": -2.1196303367614746, "logps/chosen": -0.6156755685806274, "logps/rejected": -6.945126533508301, "loss": 0.4497, "rewards/accuracies": 1.0, "rewards/chosen": 1.1494531631469727, "rewards/margins": 0.5658189654350281, "rewards/rejected": 0.5836341977119446, "step": 2469 }, { "epoch": 1.33, "learning_rate": 6.138541497782701e-08, "logits/chosen": -2.0456018447875977, "logits/rejected": -2.0522818565368652, "logps/chosen": -4.973767280578613, "logps/rejected": -3.603073835372925, "loss": 0.442, "rewards/accuracies": 1.0, "rewards/chosen": 1.1079767942428589, "rewards/margins": 0.5873364806175232, "rewards/rejected": 0.5206403136253357, "step": 2470 }, { "epoch": 1.33, "learning_rate": 6.13570621583778e-08, "logits/chosen": -2.1102445125579834, "logits/rejected": -2.0986053943634033, "logps/chosen": -6.941847801208496, "logps/rejected": -4.0644097328186035, "loss": 0.3866, "rewards/accuracies": 1.0, "rewards/chosen": 1.2495664358139038, "rewards/margins": 0.750748872756958, "rewards/rejected": 0.4988175928592682, "step": 2471 }, { "epoch": 1.33, "learning_rate": 6.132870548784592e-08, "logits/chosen": -1.953118085861206, "logits/rejected": -1.9527077674865723, "logps/chosen": -0.40403932332992554, "logps/rejected": -5.838657855987549, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9578714370727539, "rewards/margins": 0.462723970413208, "rewards/rejected": 0.4951474666595459, "step": 2472 }, { "epoch": 1.33, "learning_rate": 6.130034497584694e-08, "logits/chosen": -2.103700637817383, "logits/rejected": -2.111464262008667, "logps/chosen": -5.4120965003967285, "logps/rejected": -2.312481164932251, "loss": 0.2075, "rewards/accuracies": 1.0, "rewards/chosen": 2.0966269969940186, "rewards/margins": 1.4671580791473389, "rewards/rejected": 0.6294688582420349, "step": 2473 }, { "epoch": 1.33, "learning_rate": 6.127198063199762e-08, "logits/chosen": -2.1122870445251465, "logits/rejected": -2.3103415966033936, "logps/chosen": -0.3296907842159271, "logps/rejected": -0.36797159910202026, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.9117104411125183, "rewards/margins": -0.005223751068115234, "rewards/rejected": 0.9169341921806335, "step": 2474 }, { "epoch": 1.33, "learning_rate": 6.12436124659161e-08, "logits/chosen": -2.141641616821289, "logits/rejected": -2.31501841545105, "logps/chosen": -1.6913955211639404, "logps/rejected": -4.249593257904053, "loss": 0.7081, "rewards/accuracies": 0.0, "rewards/chosen": 0.8380552530288696, "rewards/margins": -0.029709041118621826, "rewards/rejected": 0.8677642941474915, "step": 2475 }, { "epoch": 1.34, "learning_rate": 6.121524048722175e-08, "logits/chosen": -2.18839430809021, "logits/rejected": -2.1073215007781982, "logps/chosen": -33.82190704345703, "logps/rejected": -1.6440964937210083, "loss": 0.2783, "rewards/accuracies": 1.0, "rewards/chosen": 1.951313853263855, "rewards/margins": 1.1367628574371338, "rewards/rejected": 0.8145509958267212, "step": 2476 }, { "epoch": 1.34, "learning_rate": 6.118686470553527e-08, "logits/chosen": -1.9444432258605957, "logits/rejected": -2.2688148021698, "logps/chosen": -0.36442047357559204, "logps/rejected": -0.3862851858139038, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 1.038327693939209, "rewards/margins": -0.0028810501098632812, "rewards/rejected": 1.0412087440490723, "step": 2477 }, { "epoch": 1.34, "learning_rate": 6.115848513047866e-08, "logits/chosen": -2.0653202533721924, "logits/rejected": -2.2598252296447754, "logps/chosen": -2.6278889179229736, "logps/rejected": -2.970430374145508, "loss": 0.6939, "rewards/accuracies": 0.0, "rewards/chosen": 0.6755102276802063, "rewards/margins": -0.0014253854751586914, "rewards/rejected": 0.676935613155365, "step": 2478 }, { "epoch": 1.34, "learning_rate": 6.113010177167518e-08, "logits/chosen": -1.983598232269287, "logits/rejected": -2.3235151767730713, "logps/chosen": -3.982051372528076, "logps/rejected": -4.234534740447998, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 1.0122753381729126, "rewards/margins": 0.03129333257675171, "rewards/rejected": 0.9809820055961609, "step": 2479 }, { "epoch": 1.34, "learning_rate": 6.110171463874941e-08, "logits/chosen": -2.0243682861328125, "logits/rejected": -2.2572312355041504, "logps/chosen": -0.5723379254341125, "logps/rejected": -0.5919764041900635, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.8576918840408325, "rewards/margins": 0.00453341007232666, "rewards/rejected": 0.8531584739685059, "step": 2480 }, { "epoch": 1.34, "learning_rate": 6.107332374132714e-08, "logits/chosen": -2.117262125015259, "logits/rejected": -2.124833822250366, "logps/chosen": -1.7215943336486816, "logps/rejected": -4.744579315185547, "loss": 0.4787, "rewards/accuracies": 1.0, "rewards/chosen": 1.0332950353622437, "rewards/margins": 0.48792558908462524, "rewards/rejected": 0.5453694462776184, "step": 2481 }, { "epoch": 1.34, "learning_rate": 6.10449290890355e-08, "logits/chosen": -1.963332176208496, "logits/rejected": -2.255018472671509, "logps/chosen": -0.8934463262557983, "logps/rejected": -0.8279971480369568, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.8900410532951355, "rewards/margins": 0.012476027011871338, "rewards/rejected": 0.8775650262832642, "step": 2482 }, { "epoch": 1.34, "learning_rate": 6.10165306915029e-08, "logits/chosen": -2.109516143798828, "logits/rejected": -2.260932683944702, "logps/chosen": -1.697249174118042, "logps/rejected": -5.232433319091797, "loss": 0.6186, "rewards/accuracies": 1.0, "rewards/chosen": 0.8936715126037598, "rewards/margins": 0.15516823530197144, "rewards/rejected": 0.7385032773017883, "step": 2483 }, { "epoch": 1.34, "learning_rate": 6.098812855835897e-08, "logits/chosen": -2.12176251411438, "logits/rejected": -2.1300160884857178, "logps/chosen": -4.13040018081665, "logps/rejected": -4.658196449279785, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.25628662109375, "rewards/margins": 0.726262629032135, "rewards/rejected": 0.530023992061615, "step": 2484 }, { "epoch": 1.34, "learning_rate": 6.095972269923463e-08, "logits/chosen": -2.050816774368286, "logits/rejected": -2.0529651641845703, "logps/chosen": -1.2170631885528564, "logps/rejected": -5.762019157409668, "loss": 0.4154, "rewards/accuracies": 1.0, "rewards/chosen": 1.024233341217041, "rewards/margins": 0.6637383699417114, "rewards/rejected": 0.360495001077652, "step": 2485 }, { "epoch": 1.34, "learning_rate": 6.093131312376205e-08, "logits/chosen": -2.1688554286956787, "logits/rejected": -2.175751209259033, "logps/chosen": -3.984403133392334, "logps/rejected": -6.551015853881836, "loss": 0.3742, "rewards/accuracies": 1.0, "rewards/chosen": 1.64205002784729, "rewards/margins": 0.7899565100669861, "rewards/rejected": 0.852093517780304, "step": 2486 }, { "epoch": 1.34, "learning_rate": 6.090289984157471e-08, "logits/chosen": -2.020573616027832, "logits/rejected": -2.265612840652466, "logps/chosen": -0.9592021703720093, "logps/rejected": -0.8531865477561951, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9867947697639465, "rewards/margins": 0.0037962794303894043, "rewards/rejected": 0.9829984903335571, "step": 2487 }, { "epoch": 1.34, "learning_rate": 6.087448286230728e-08, "logits/chosen": -2.1076600551605225, "logits/rejected": -2.373441219329834, "logps/chosen": -11.213410377502441, "logps/rejected": -9.033580780029297, "loss": 0.7548, "rewards/accuracies": 0.0, "rewards/chosen": 0.6365014314651489, "rewards/margins": -0.11980772018432617, "rewards/rejected": 0.7563091516494751, "step": 2488 }, { "epoch": 1.34, "learning_rate": 6.084606219559572e-08, "logits/chosen": -2.1019718647003174, "logits/rejected": -2.1696338653564453, "logps/chosen": -15.828773498535156, "logps/rejected": -12.372055053710938, "loss": 0.5102, "rewards/accuracies": 1.0, "rewards/chosen": 1.4182218313217163, "rewards/margins": 0.40692996978759766, "rewards/rejected": 1.0112918615341187, "step": 2489 }, { "epoch": 1.34, "learning_rate": 6.081763785107725e-08, "logits/chosen": -2.0158441066741943, "logits/rejected": -2.3005449771881104, "logps/chosen": -0.8616030216217041, "logps/rejected": -0.7299790978431702, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9044761657714844, "rewards/margins": 0.032196998596191406, "rewards/rejected": 0.872279167175293, "step": 2490 }, { "epoch": 1.34, "learning_rate": 6.07892098383903e-08, "logits/chosen": -2.032532215118408, "logits/rejected": -2.040254592895508, "logps/chosen": -1.4215662479400635, "logps/rejected": -3.2100257873535156, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0388914346694946, "rewards/margins": 0.49655866622924805, "rewards/rejected": 0.5423327684402466, "step": 2491 }, { "epoch": 1.34, "learning_rate": 6.07607781671746e-08, "logits/chosen": -2.034926176071167, "logits/rejected": -2.230415105819702, "logps/chosen": -1.1502331495285034, "logps/rejected": -1.0924757719039917, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.7746500372886658, "rewards/margins": 0.011243462562561035, "rewards/rejected": 0.7634065747261047, "step": 2492 }, { "epoch": 1.34, "learning_rate": 6.073234284707104e-08, "logits/chosen": -2.0721535682678223, "logits/rejected": -2.068936586380005, "logps/chosen": -4.312473773956299, "logps/rejected": -4.134150981903076, "loss": 0.5276, "rewards/accuracies": 1.0, "rewards/chosen": 0.9638962149620056, "rewards/margins": 0.36408793926239014, "rewards/rejected": 0.5998082756996155, "step": 2493 }, { "epoch": 1.35, "learning_rate": 6.070390388772183e-08, "logits/chosen": -2.1068127155303955, "logits/rejected": -2.280935287475586, "logps/chosen": -3.596214771270752, "logps/rejected": -1.6270259618759155, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 1.0849008560180664, "rewards/margins": -0.003247857093811035, "rewards/rejected": 1.0881487131118774, "step": 2494 }, { "epoch": 1.35, "learning_rate": 6.067546129877037e-08, "logits/chosen": -1.9850780963897705, "logits/rejected": -1.9863252639770508, "logps/chosen": -0.4684201776981354, "logps/rejected": -7.3440446853637695, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.0352509021759033, "rewards/margins": 0.7265070676803589, "rewards/rejected": 0.3087438642978668, "step": 2495 }, { "epoch": 1.35, "learning_rate": 6.064701508986129e-08, "logits/chosen": -2.059882879257202, "logits/rejected": -2.244384527206421, "logps/chosen": -0.6344763040542603, "logps/rejected": -0.6359058618545532, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.8708763122558594, "rewards/margins": 0.04043376445770264, "rewards/rejected": 0.8304425477981567, "step": 2496 }, { "epoch": 1.35, "learning_rate": 6.061856527064047e-08, "logits/chosen": -2.1052887439727783, "logits/rejected": -2.2817842960357666, "logps/chosen": -0.4066011309623718, "logps/rejected": -0.4265434145927429, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.8392427563667297, "rewards/margins": 0.012808620929718018, "rewards/rejected": 0.8264341354370117, "step": 2497 }, { "epoch": 1.35, "learning_rate": 6.059011185075496e-08, "logits/chosen": -1.9978852272033691, "logits/rejected": -2.0029468536376953, "logps/chosen": -2.391342878341675, "logps/rejected": -2.0642800331115723, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1627992391586304, "rewards/margins": 0.35627591609954834, "rewards/rejected": 0.806523323059082, "step": 2498 }, { "epoch": 1.35, "learning_rate": 6.056165483985314e-08, "logits/chosen": -1.9667977094650269, "logits/rejected": -1.9608608484268188, "logps/chosen": -6.652514457702637, "logps/rejected": -3.1660454273223877, "loss": 0.3156, "rewards/accuracies": 1.0, "rewards/chosen": 1.5460370779037476, "rewards/margins": 0.9913232326507568, "rewards/rejected": 0.5547138452529907, "step": 2499 }, { "epoch": 1.35, "learning_rate": 6.05331942475845e-08, "logits/chosen": -2.095107316970825, "logits/rejected": -2.280611038208008, "logps/chosen": -0.7805957794189453, "logps/rejected": -5.546523094177246, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.7296012043952942, "rewards/margins": -0.012266457080841064, "rewards/rejected": 0.7418676614761353, "step": 2500 }, { "epoch": 1.35, "learning_rate": 6.050473008359977e-08, "logits/chosen": -2.1448452472686768, "logits/rejected": -2.1500513553619385, "logps/chosen": -3.3964667320251465, "logps/rejected": -3.3678746223449707, "loss": 0.5555, "rewards/accuracies": 1.0, "rewards/chosen": 0.9812013506889343, "rewards/margins": 0.2972036600112915, "rewards/rejected": 0.6839976906776428, "step": 2501 }, { "epoch": 1.35, "learning_rate": 6.047626235755093e-08, "logits/chosen": -1.949392557144165, "logits/rejected": -1.961819052696228, "logps/chosen": -2.041992664337158, "logps/rejected": -4.483493804931641, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 0.8278614282608032, "rewards/margins": 0.41986343264579773, "rewards/rejected": 0.4079979956150055, "step": 2502 }, { "epoch": 1.35, "learning_rate": 6.04477910790911e-08, "logits/chosen": -2.0917458534240723, "logits/rejected": -2.091702938079834, "logps/chosen": -2.2025861740112305, "logps/rejected": -1.5980921983718872, "loss": 0.6046, "rewards/accuracies": 1.0, "rewards/chosen": 1.0318537950515747, "rewards/margins": 0.1858062744140625, "rewards/rejected": 0.8460475206375122, "step": 2503 }, { "epoch": 1.35, "learning_rate": 6.041931625787471e-08, "logits/chosen": -2.0380170345306396, "logits/rejected": -2.21685791015625, "logps/chosen": -0.3826453983783722, "logps/rejected": -0.4030351936817169, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.8173055648803711, "rewards/margins": 0.04263448715209961, "rewards/rejected": 0.7746710777282715, "step": 2504 }, { "epoch": 1.35, "learning_rate": 6.03908379035573e-08, "logits/chosen": -2.066132068634033, "logits/rejected": -2.239074230194092, "logps/chosen": -0.5851157307624817, "logps/rejected": -0.6132010817527771, "loss": 0.6753, "rewards/accuracies": 1.0, "rewards/chosen": 0.8418216109275818, "rewards/margins": 0.03610032796859741, "rewards/rejected": 0.8057212829589844, "step": 2505 }, { "epoch": 1.35, "learning_rate": 6.036235602579562e-08, "logits/chosen": -2.234560489654541, "logits/rejected": -2.1245954036712646, "logps/chosen": -29.478439331054688, "logps/rejected": -5.79550838470459, "loss": 0.3499, "rewards/accuracies": 1.0, "rewards/chosen": 1.2701596021652222, "rewards/margins": 0.8699707984924316, "rewards/rejected": 0.4001888334751129, "step": 2506 }, { "epoch": 1.35, "learning_rate": 6.033387063424764e-08, "logits/chosen": -2.0634584426879883, "logits/rejected": -2.0677289962768555, "logps/chosen": -0.8166526556015015, "logps/rejected": -6.132145881652832, "loss": 0.4205, "rewards/accuracies": 1.0, "rewards/chosen": 0.9457201957702637, "rewards/margins": 0.6486905813217163, "rewards/rejected": 0.297029584646225, "step": 2507 }, { "epoch": 1.35, "learning_rate": 6.030538173857254e-08, "logits/chosen": -2.052433729171753, "logits/rejected": -2.042476177215576, "logps/chosen": -12.234777450561523, "logps/rejected": -4.017540454864502, "loss": 0.4108, "rewards/accuracies": 1.0, "rewards/chosen": 1.131323218345642, "rewards/margins": 0.6772201657295227, "rewards/rejected": 0.4541030526161194, "step": 2508 }, { "epoch": 1.35, "learning_rate": 6.027688934843063e-08, "logits/chosen": -2.1716599464416504, "logits/rejected": -2.3259382247924805, "logps/chosen": -0.718163788318634, "logps/rejected": -0.6911647915840149, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 1.0599216222763062, "rewards/margins": -0.0038396120071411133, "rewards/rejected": 1.0637612342834473, "step": 2509 }, { "epoch": 1.35, "learning_rate": 6.024839347348344e-08, "logits/chosen": -2.0229156017303467, "logits/rejected": -2.21710467338562, "logps/chosen": -4.031002521514893, "logps/rejected": -3.786043167114258, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.6300615072250366, "rewards/margins": 0.0017395615577697754, "rewards/rejected": 0.6283219456672668, "step": 2510 }, { "epoch": 1.35, "learning_rate": 6.021989412339367e-08, "logits/chosen": -2.0546863079071045, "logits/rejected": -2.2540299892425537, "logps/chosen": -0.6162729263305664, "logps/rejected": -0.644752025604248, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.8678847551345825, "rewards/margins": 0.01508396863937378, "rewards/rejected": 0.8528007864952087, "step": 2511 }, { "epoch": 1.35, "learning_rate": 6.019139130782523e-08, "logits/chosen": -2.0460798740386963, "logits/rejected": -2.0446722507476807, "logps/chosen": -6.342310905456543, "logps/rejected": -6.379952907562256, "loss": 0.395, "rewards/accuracies": 1.0, "rewards/chosen": 1.1725165843963623, "rewards/margins": 0.7247511148452759, "rewards/rejected": 0.4477654993534088, "step": 2512 }, { "epoch": 1.36, "learning_rate": 6.016288503644318e-08, "logits/chosen": -2.0307109355926514, "logits/rejected": -2.2326231002807617, "logps/chosen": -5.738163471221924, "logps/rejected": -1.0410146713256836, "loss": 0.8128, "rewards/accuracies": 0.0, "rewards/chosen": 0.6397470235824585, "rewards/margins": -0.226595938205719, "rewards/rejected": 0.8663429617881775, "step": 2513 }, { "epoch": 1.36, "learning_rate": 6.013437531891373e-08, "logits/chosen": -2.155155897140503, "logits/rejected": -2.1362955570220947, "logps/chosen": -13.25711727142334, "logps/rejected": -1.6414748430252075, "loss": 0.423, "rewards/accuracies": 1.0, "rewards/chosen": 1.5711724758148193, "rewards/margins": 0.6413213014602661, "rewards/rejected": 0.9298511743545532, "step": 2514 }, { "epoch": 1.36, "learning_rate": 6.01058621649043e-08, "logits/chosen": -2.1004340648651123, "logits/rejected": -2.214528799057007, "logps/chosen": -0.6145886182785034, "logps/rejected": -0.6395580768585205, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.9056124687194824, "rewards/margins": 0.0007458329200744629, "rewards/rejected": 0.904866635799408, "step": 2515 }, { "epoch": 1.36, "learning_rate": 6.007734558408342e-08, "logits/chosen": -1.9773346185684204, "logits/rejected": -1.9683154821395874, "logps/chosen": -9.98269271850586, "logps/rejected": -10.355077743530273, "loss": 0.5223, "rewards/accuracies": 1.0, "rewards/chosen": 1.4517310857772827, "rewards/margins": 0.3771176338195801, "rewards/rejected": 1.0746134519577026, "step": 2516 }, { "epoch": 1.36, "learning_rate": 6.004882558612088e-08, "logits/chosen": -2.105640172958374, "logits/rejected": -2.247819185256958, "logps/chosen": -0.9254496693611145, "logps/rejected": -0.8688549995422363, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.8899839520454407, "rewards/margins": 0.011731922626495361, "rewards/rejected": 0.8782520294189453, "step": 2517 }, { "epoch": 1.36, "learning_rate": 6.002030218068752e-08, "logits/chosen": -2.0319907665252686, "logits/rejected": -2.270005941390991, "logps/chosen": -0.6913838386535645, "logps/rejected": -0.6754538416862488, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8426098227500916, "rewards/margins": 0.016590237617492676, "rewards/rejected": 0.8260195851325989, "step": 2518 }, { "epoch": 1.36, "learning_rate": 5.999177537745541e-08, "logits/chosen": -2.093583822250366, "logits/rejected": -2.1920268535614014, "logps/chosen": -1.3190293312072754, "logps/rejected": -1.4289867877960205, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8849555850028992, "rewards/margins": -0.0037798285484313965, "rewards/rejected": 0.8887354135513306, "step": 2519 }, { "epoch": 1.36, "learning_rate": 5.996324518609773e-08, "logits/chosen": -2.1247777938842773, "logits/rejected": -2.1321213245391846, "logps/chosen": -1.4676165580749512, "logps/rejected": -4.340156078338623, "loss": 0.3751, "rewards/accuracies": 1.0, "rewards/chosen": 1.3517580032348633, "rewards/margins": 0.7870188355445862, "rewards/rejected": 0.5647391676902771, "step": 2520 }, { "epoch": 1.36, "learning_rate": 5.993471161628882e-08, "logits/chosen": -2.102295160293579, "logits/rejected": -2.110609531402588, "logps/chosen": -1.4892910718917847, "logps/rejected": -2.166440725326538, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 1.1936064958572388, "rewards/margins": 0.43298786878585815, "rewards/rejected": 0.7606186270713806, "step": 2521 }, { "epoch": 1.36, "learning_rate": 5.990617467770417e-08, "logits/chosen": -1.9970221519470215, "logits/rejected": -2.007037401199341, "logps/chosen": -1.7965108156204224, "logps/rejected": -2.3601133823394775, "loss": 0.4617, "rewards/accuracies": 1.0, "rewards/chosen": 1.2250065803527832, "rewards/margins": 0.5331926941871643, "rewards/rejected": 0.6918138861656189, "step": 2522 }, { "epoch": 1.36, "learning_rate": 5.987763438002044e-08, "logits/chosen": -2.0622775554656982, "logits/rejected": -2.3325815200805664, "logps/chosen": -0.291134238243103, "logps/rejected": -0.32159316539764404, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.8421124815940857, "rewards/margins": 0.019039928913116455, "rewards/rejected": 0.8230725526809692, "step": 2523 }, { "epoch": 1.36, "learning_rate": 5.984909073291537e-08, "logits/chosen": -2.062591791152954, "logits/rejected": -2.0636324882507324, "logps/chosen": -1.5342642068862915, "logps/rejected": -1.5833485126495361, "loss": 0.5421, "rewards/accuracies": 1.0, "rewards/chosen": 1.085174560546875, "rewards/margins": 0.3290281891822815, "rewards/rejected": 0.7561463713645935, "step": 2524 }, { "epoch": 1.36, "learning_rate": 5.982054374606786e-08, "logits/chosen": -2.0513341426849365, "logits/rejected": -2.0512502193450928, "logps/chosen": -1.7852236032485962, "logps/rejected": -0.6928819417953491, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.9085818529129028, "rewards/margins": 0.020639896392822266, "rewards/rejected": 0.8879419565200806, "step": 2525 }, { "epoch": 1.36, "learning_rate": 5.979199342915799e-08, "logits/chosen": -2.1460535526275635, "logits/rejected": -2.148085117340088, "logps/chosen": -1.718454360961914, "logps/rejected": -4.386429786682129, "loss": 0.4772, "rewards/accuracies": 1.0, "rewards/chosen": 1.0507831573486328, "rewards/margins": 0.4918532371520996, "rewards/rejected": 0.5589299201965332, "step": 2526 }, { "epoch": 1.36, "learning_rate": 5.976343979186688e-08, "logits/chosen": -2.077688694000244, "logits/rejected": -2.257272720336914, "logps/chosen": -1.6382851600646973, "logps/rejected": -1.4263272285461426, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.9956774115562439, "rewards/margins": 0.012270748615264893, "rewards/rejected": 0.983406662940979, "step": 2527 }, { "epoch": 1.36, "learning_rate": 5.973488284387686e-08, "logits/chosen": -2.0459790229797363, "logits/rejected": -2.258500576019287, "logps/chosen": -1.0047411918640137, "logps/rejected": -0.6685613393783569, "loss": 0.706, "rewards/accuracies": 0.0, "rewards/chosen": 0.7487761378288269, "rewards/margins": -0.025592684745788574, "rewards/rejected": 0.7743688225746155, "step": 2528 }, { "epoch": 1.36, "learning_rate": 5.970632259487134e-08, "logits/chosen": -1.991106629371643, "logits/rejected": -2.003909111022949, "logps/chosen": -1.198395848274231, "logps/rejected": -10.172208786010742, "loss": 0.5819, "rewards/accuracies": 1.0, "rewards/chosen": 0.990166962146759, "rewards/margins": 0.23634099960327148, "rewards/rejected": 0.7538259625434875, "step": 2529 }, { "epoch": 1.36, "learning_rate": 5.967775905453483e-08, "logits/chosen": -2.0400760173797607, "logits/rejected": -2.036071300506592, "logps/chosen": -6.770928382873535, "logps/rejected": -4.51004695892334, "loss": 0.2746, "rewards/accuracies": 1.0, "rewards/chosen": 1.632655382156372, "rewards/margins": 1.1521029472351074, "rewards/rejected": 0.48055240511894226, "step": 2530 }, { "epoch": 1.37, "learning_rate": 5.9649192232553e-08, "logits/chosen": -2.0444467067718506, "logits/rejected": -2.0504205226898193, "logps/chosen": -1.7884243726730347, "logps/rejected": -4.954905986785889, "loss": 0.4278, "rewards/accuracies": 1.0, "rewards/chosen": 1.2029757499694824, "rewards/margins": 0.6276323795318604, "rewards/rejected": 0.5753433704376221, "step": 2531 }, { "epoch": 1.37, "learning_rate": 5.962062213861265e-08, "logits/chosen": -2.0893707275390625, "logits/rejected": -2.0925638675689697, "logps/chosen": -6.057077884674072, "logps/rejected": -9.408381462097168, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": 1.5797979831695557, "rewards/margins": 1.2007933855056763, "rewards/rejected": 0.379004567861557, "step": 2532 }, { "epoch": 1.37, "learning_rate": 5.959204878240159e-08, "logits/chosen": -2.032146692276001, "logits/rejected": -2.2998476028442383, "logps/chosen": -4.0022687911987305, "logps/rejected": -2.548698663711548, "loss": 0.7202, "rewards/accuracies": 0.0, "rewards/chosen": 0.8242930769920349, "rewards/margins": -0.05335956811904907, "rewards/rejected": 0.877652645111084, "step": 2533 }, { "epoch": 1.37, "learning_rate": 5.956347217360885e-08, "logits/chosen": -2.0494496822357178, "logits/rejected": -2.312147378921509, "logps/chosen": -0.9081656336784363, "logps/rejected": -0.9983270764350891, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0620427131652832, "rewards/margins": 0.006807208061218262, "rewards/rejected": 1.055235505104065, "step": 2534 }, { "epoch": 1.37, "learning_rate": 5.953489232192449e-08, "logits/chosen": -2.1298727989196777, "logits/rejected": -2.137160301208496, "logps/chosen": -1.5037686824798584, "logps/rejected": -4.1409077644348145, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 1.0831491947174072, "rewards/margins": 0.5847734212875366, "rewards/rejected": 0.4983757436275482, "step": 2535 }, { "epoch": 1.37, "learning_rate": 5.9506309237039695e-08, "logits/chosen": -2.1102802753448486, "logits/rejected": -2.1078033447265625, "logps/chosen": -6.713151454925537, "logps/rejected": -4.1039018630981445, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 1.5083402395248413, "rewards/margins": 0.5047011375427246, "rewards/rejected": 1.0036391019821167, "step": 2536 }, { "epoch": 1.37, "learning_rate": 5.947772292864676e-08, "logits/chosen": -2.0396711826324463, "logits/rejected": -2.2670583724975586, "logps/chosen": -0.42072561383247375, "logps/rejected": -0.4124566316604614, "loss": 0.6766, "rewards/accuracies": 1.0, "rewards/chosen": 0.701631486415863, "rewards/margins": 0.03334611654281616, "rewards/rejected": 0.6682853698730469, "step": 2537 }, { "epoch": 1.37, "learning_rate": 5.944913340643904e-08, "logits/chosen": -2.0872719287872314, "logits/rejected": -2.0928895473480225, "logps/chosen": -9.021357536315918, "logps/rejected": -2.559614419937134, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 1.496968150138855, "rewards/margins": 0.8725472092628479, "rewards/rejected": 0.6244209408760071, "step": 2538 }, { "epoch": 1.37, "learning_rate": 5.942054068011102e-08, "logits/chosen": -2.030686616897583, "logits/rejected": -2.2368853092193604, "logps/chosen": -3.2356772422790527, "logps/rejected": -2.670654296875, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 0.7833659052848816, "rewards/margins": -0.027669429779052734, "rewards/rejected": 0.8110353350639343, "step": 2539 }, { "epoch": 1.37, "learning_rate": 5.9391944759358203e-08, "logits/chosen": -2.0391767024993896, "logits/rejected": -2.032197952270508, "logps/chosen": -4.924160957336426, "logps/rejected": -3.9646053314208984, "loss": 0.2875, "rewards/accuracies": 1.0, "rewards/chosen": 1.5731170177459717, "rewards/margins": 1.0995404720306396, "rewards/rejected": 0.47357654571533203, "step": 2540 }, { "epoch": 1.37, "learning_rate": 5.936334565387726e-08, "logits/chosen": -2.1713919639587402, "logits/rejected": -2.0898349285125732, "logps/chosen": -35.955326080322266, "logps/rejected": -4.607868194580078, "loss": 0.2517, "rewards/accuracies": 1.0, "rewards/chosen": 1.6836216449737549, "rewards/margins": 1.2511382102966309, "rewards/rejected": 0.4324834942817688, "step": 2541 }, { "epoch": 1.37, "learning_rate": 5.9334743373365905e-08, "logits/chosen": -2.105682849884033, "logits/rejected": -1.9990469217300415, "logps/chosen": -39.21542739868164, "logps/rejected": -4.177371978759766, "loss": 0.1943, "rewards/accuracies": 1.0, "rewards/chosen": 2.1866986751556396, "rewards/margins": 1.5397953987121582, "rewards/rejected": 0.6469032168388367, "step": 2542 }, { "epoch": 1.37, "learning_rate": 5.930613792752292e-08, "logits/chosen": -2.2037198543548584, "logits/rejected": -2.25475811958313, "logps/chosen": -6.211597442626953, "logps/rejected": -18.569082260131836, "loss": 0.3161, "rewards/accuracies": 1.0, "rewards/chosen": 1.3499622344970703, "rewards/margins": 0.9896409511566162, "rewards/rejected": 0.3603212535381317, "step": 2543 }, { "epoch": 1.37, "learning_rate": 5.9277529326048156e-08, "logits/chosen": -1.950282096862793, "logits/rejected": -1.9609931707382202, "logps/chosen": -1.84959876537323, "logps/rejected": -7.355851173400879, "loss": 0.4314, "rewards/accuracies": 1.0, "rewards/chosen": 1.1755789518356323, "rewards/margins": 0.6172814965248108, "rewards/rejected": 0.5582974553108215, "step": 2544 }, { "epoch": 1.37, "learning_rate": 5.9248917578642544e-08, "logits/chosen": -2.0187318325042725, "logits/rejected": -2.018704891204834, "logps/chosen": -2.1440916061401367, "logps/rejected": -5.374724388122559, "loss": 0.3382, "rewards/accuracies": 1.0, "rewards/chosen": 1.491722583770752, "rewards/margins": 0.9104123711585999, "rewards/rejected": 0.5813102126121521, "step": 2545 }, { "epoch": 1.37, "learning_rate": 5.922030269500808e-08, "logits/chosen": -1.9551242589950562, "logits/rejected": -1.9435299634933472, "logps/chosen": -0.875363826751709, "logps/rejected": -4.188085079193115, "loss": 0.4993, "rewards/accuracies": 1.0, "rewards/chosen": 1.1346784830093384, "rewards/margins": 0.43450111150741577, "rewards/rejected": 0.7001773715019226, "step": 2546 }, { "epoch": 1.37, "learning_rate": 5.9191684684847866e-08, "logits/chosen": -2.0916759967803955, "logits/rejected": -2.280496120452881, "logps/chosen": -0.7281175851821899, "logps/rejected": -0.6409755349159241, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0074504613876343, "rewards/margins": 0.030129313468933105, "rewards/rejected": 0.9773211479187012, "step": 2547 }, { "epoch": 1.37, "learning_rate": 5.9163063557865986e-08, "logits/chosen": -2.1349170207977295, "logits/rejected": -2.129652261734009, "logps/chosen": -3.000424385070801, "logps/rejected": -4.489340305328369, "loss": 0.5022, "rewards/accuracies": 1.0, "rewards/chosen": 1.039343237876892, "rewards/margins": 0.42711251974105835, "rewards/rejected": 0.6122307181358337, "step": 2548 }, { "epoch": 1.37, "learning_rate": 5.913443932376764e-08, "logits/chosen": -2.0837433338165283, "logits/rejected": -2.242870807647705, "logps/chosen": -0.5377991795539856, "logps/rejected": -7.66966438293457, "loss": 0.6141, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683128356933594, "rewards/margins": 0.16494017839431763, "rewards/rejected": 0.7033726572990417, "step": 2549 }, { "epoch": 1.38, "learning_rate": 5.9105811992259025e-08, "logits/chosen": -1.986696720123291, "logits/rejected": -1.9245578050613403, "logps/chosen": -17.421634674072266, "logps/rejected": -1.5483834743499756, "loss": 0.3984, "rewards/accuracies": 1.0, "rewards/chosen": 1.6172024011611938, "rewards/margins": 0.7144956588745117, "rewards/rejected": 0.9027067422866821, "step": 2550 }, { "epoch": 1.38, "learning_rate": 5.907718157304745e-08, "logits/chosen": -2.076277732849121, "logits/rejected": -2.066605567932129, "logps/chosen": -7.617020606994629, "logps/rejected": -6.427324295043945, "loss": 0.2715, "rewards/accuracies": 1.0, "rewards/chosen": 1.611053466796875, "rewards/margins": 1.165151596069336, "rewards/rejected": 0.44590187072753906, "step": 2551 }, { "epoch": 1.38, "learning_rate": 5.9048548075841255e-08, "logits/chosen": -1.9967533349990845, "logits/rejected": -2.219562292098999, "logps/chosen": -0.9546341300010681, "logps/rejected": -1.054460883140564, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8626365661621094, "rewards/margins": 0.011209368705749512, "rewards/rejected": 0.8514271974563599, "step": 2552 }, { "epoch": 1.38, "learning_rate": 5.901991151034981e-08, "logits/chosen": -1.9775785207748413, "logits/rejected": -1.9750640392303467, "logps/chosen": -5.713464736938477, "logps/rejected": -2.6326944828033447, "loss": 0.3092, "rewards/accuracies": 1.0, "rewards/chosen": 1.6765278577804565, "rewards/margins": 1.0151361227035522, "rewards/rejected": 0.6613917350769043, "step": 2553 }, { "epoch": 1.38, "learning_rate": 5.8991271886283513e-08, "logits/chosen": -2.003755807876587, "logits/rejected": -2.2899301052093506, "logps/chosen": -0.6251106858253479, "logps/rejected": -0.6828871369361877, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.8569564819335938, "rewards/margins": 0.025582492351531982, "rewards/rejected": 0.8313739895820618, "step": 2554 }, { "epoch": 1.38, "learning_rate": 5.896262921335382e-08, "logits/chosen": -2.0402700901031494, "logits/rejected": -2.0434067249298096, "logps/chosen": -1.7612322568893433, "logps/rejected": -0.9296969771385193, "loss": 0.6198, "rewards/accuracies": 1.0, "rewards/chosen": 0.9474861025810242, "rewards/margins": 0.1524890661239624, "rewards/rejected": 0.7949970364570618, "step": 2555 }, { "epoch": 1.38, "learning_rate": 5.893398350127323e-08, "logits/chosen": -2.0847790241241455, "logits/rejected": -2.00479793548584, "logps/chosen": -42.47880554199219, "logps/rejected": -4.580612659454346, "loss": 0.2369, "rewards/accuracies": 1.0, "rewards/chosen": 1.5959663391113281, "rewards/margins": 1.3194596767425537, "rewards/rejected": 0.276506632566452, "step": 2556 }, { "epoch": 1.38, "learning_rate": 5.890533475975525e-08, "logits/chosen": -2.0534493923187256, "logits/rejected": -2.29909086227417, "logps/chosen": -0.33667656779289246, "logps/rejected": -0.37300193309783936, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.7561662793159485, "rewards/margins": 0.007293105125427246, "rewards/rejected": 0.7488731741905212, "step": 2557 }, { "epoch": 1.38, "learning_rate": 5.887668299851442e-08, "logits/chosen": -2.021669626235962, "logits/rejected": -2.2603371143341064, "logps/chosen": -1.1376391649246216, "logps/rejected": -1.055199384689331, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9478465914726257, "rewards/margins": 0.007706344127655029, "rewards/rejected": 0.9401402473449707, "step": 2558 }, { "epoch": 1.38, "learning_rate": 5.884802822726632e-08, "logits/chosen": -1.9668564796447754, "logits/rejected": -2.263258218765259, "logps/chosen": -2.6693475246429443, "logps/rejected": -2.793816328048706, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.5573369860649109, "rewards/margins": 0.016422271728515625, "rewards/rejected": 0.5409147143363953, "step": 2559 }, { "epoch": 1.38, "learning_rate": 5.881937045572751e-08, "logits/chosen": -2.1153910160064697, "logits/rejected": -2.1201274394989014, "logps/chosen": -1.9678046703338623, "logps/rejected": -4.904637813568115, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 1.1716707944869995, "rewards/margins": 0.7122458219528198, "rewards/rejected": 0.4594249427318573, "step": 2560 }, { "epoch": 1.38, "learning_rate": 5.8790709693615637e-08, "logits/chosen": -2.128143548965454, "logits/rejected": -2.3141019344329834, "logps/chosen": -2.5794618129730225, "logps/rejected": -2.2772881984710693, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.7998052835464478, "rewards/margins": -0.004924356937408447, "rewards/rejected": 0.8047296404838562, "step": 2561 }, { "epoch": 1.38, "learning_rate": 5.876204595064929e-08, "logits/chosen": -2.0242764949798584, "logits/rejected": -2.248148202896118, "logps/chosen": -1.3312524557113647, "logps/rejected": -1.4583314657211304, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.7489244937896729, "rewards/margins": 0.022393226623535156, "rewards/rejected": 0.7265312671661377, "step": 2562 }, { "epoch": 1.38, "learning_rate": 5.873337923654811e-08, "logits/chosen": -2.1050901412963867, "logits/rejected": -2.102426528930664, "logps/chosen": -8.812363624572754, "logps/rejected": -3.577542304992676, "loss": 0.4455, "rewards/accuracies": 1.0, "rewards/chosen": 1.1440485715866089, "rewards/margins": 0.5776695013046265, "rewards/rejected": 0.5663790702819824, "step": 2563 }, { "epoch": 1.38, "learning_rate": 5.8704709561032714e-08, "logits/chosen": -2.0299627780914307, "logits/rejected": -2.2576658725738525, "logps/chosen": -2.738430976867676, "logps/rejected": -2.7757880687713623, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.8284125328063965, "rewards/margins": 0.02839648723602295, "rewards/rejected": 0.8000160455703735, "step": 2564 }, { "epoch": 1.38, "learning_rate": 5.867603693382477e-08, "logits/chosen": -2.061384439468384, "logits/rejected": -2.2883996963500977, "logps/chosen": -1.975111722946167, "logps/rejected": -2.2498221397399902, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.9179434180259705, "rewards/margins": 0.03531479835510254, "rewards/rejected": 0.8826286196708679, "step": 2565 }, { "epoch": 1.38, "learning_rate": 5.864736136464692e-08, "logits/chosen": -2.0694432258605957, "logits/rejected": -2.2150120735168457, "logps/chosen": -0.22463957965373993, "logps/rejected": -0.19722986221313477, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.6653380393981934, "rewards/margins": 0.023191511631011963, "rewards/rejected": 0.6421465277671814, "step": 2566 }, { "epoch": 1.38, "learning_rate": 5.861868286322279e-08, "logits/chosen": -2.03406023979187, "logits/rejected": -2.0363340377807617, "logps/chosen": -2.618117570877075, "logps/rejected": -3.265197515487671, "loss": 0.3714, "rewards/accuracies": 1.0, "rewards/chosen": 1.2237781286239624, "rewards/margins": 0.7989678382873535, "rewards/rejected": 0.4248103201389313, "step": 2567 }, { "epoch": 1.39, "learning_rate": 5.859000143927702e-08, "logits/chosen": -2.053313732147217, "logits/rejected": -2.297597646713257, "logps/chosen": -0.6414929032325745, "logps/rejected": -0.5445217490196228, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.9467126727104187, "rewards/margins": 0.03157663345336914, "rewards/rejected": 0.9151360392570496, "step": 2568 }, { "epoch": 1.39, "learning_rate": 5.856131710253523e-08, "logits/chosen": -2.1298787593841553, "logits/rejected": -2.1395263671875, "logps/chosen": -2.0122287273406982, "logps/rejected": -2.585648536682129, "loss": 0.5406, "rewards/accuracies": 1.0, "rewards/chosen": 0.9781050682067871, "rewards/margins": 0.3325796127319336, "rewards/rejected": 0.6455254554748535, "step": 2569 }, { "epoch": 1.39, "learning_rate": 5.853262986272404e-08, "logits/chosen": -1.9401973485946655, "logits/rejected": -1.936897873878479, "logps/chosen": -3.410780906677246, "logps/rejected": -6.026228427886963, "loss": 0.5182, "rewards/accuracies": 1.0, "rewards/chosen": 0.788638710975647, "rewards/margins": 0.38702455163002014, "rewards/rejected": 0.40161415934562683, "step": 2570 }, { "epoch": 1.39, "learning_rate": 5.850393972957105e-08, "logits/chosen": -1.9652501344680786, "logits/rejected": -1.969897985458374, "logps/chosen": -2.690613269805908, "logps/rejected": -4.576509475708008, "loss": 0.3566, "rewards/accuracies": 1.0, "rewards/chosen": 1.369309425354004, "rewards/margins": 0.8476496338844299, "rewards/rejected": 0.521659791469574, "step": 2571 }, { "epoch": 1.39, "learning_rate": 5.847524671280484e-08, "logits/chosen": -2.054192543029785, "logits/rejected": -2.061723470687866, "logps/chosen": -4.39788818359375, "logps/rejected": -6.100103855133057, "loss": 0.4079, "rewards/accuracies": 1.0, "rewards/chosen": 1.555706262588501, "rewards/margins": 0.6858456134796143, "rewards/rejected": 0.8698606491088867, "step": 2572 }, { "epoch": 1.39, "learning_rate": 5.8446550822154964e-08, "logits/chosen": -1.9735853672027588, "logits/rejected": -1.9744583368301392, "logps/chosen": -3.8680315017700195, "logps/rejected": -3.8298895359039307, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": 1.7198963165283203, "rewards/margins": 1.2156660556793213, "rewards/rejected": 0.5042303204536438, "step": 2573 }, { "epoch": 1.39, "learning_rate": 5.841785206735191e-08, "logits/chosen": -2.135051727294922, "logits/rejected": -2.135145902633667, "logps/chosen": -2.0561020374298096, "logps/rejected": -3.7856686115264893, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 1.5808342695236206, "rewards/margins": 1.1201581954956055, "rewards/rejected": 0.46067601442337036, "step": 2574 }, { "epoch": 1.39, "learning_rate": 5.838915045812727e-08, "logits/chosen": -1.9964075088500977, "logits/rejected": -2.2963857650756836, "logps/chosen": -2.4440293312072754, "logps/rejected": -2.6241698265075684, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 1.2273931503295898, "rewards/margins": 0.03993189334869385, "rewards/rejected": 1.187461256980896, "step": 2575 }, { "epoch": 1.39, "learning_rate": 5.836044600421345e-08, "logits/chosen": -2.0061888694763184, "logits/rejected": -2.2663216590881348, "logps/chosen": -0.317544162273407, "logps/rejected": -0.34400278329849243, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816276431083679, "rewards/margins": 0.03843677043914795, "rewards/rejected": 0.94319087266922, "step": 2576 }, { "epoch": 1.39, "learning_rate": 5.8331738715343906e-08, "logits/chosen": -2.1023924350738525, "logits/rejected": -2.1040797233581543, "logps/chosen": -2.7240145206451416, "logps/rejected": -3.73508620262146, "loss": 0.4345, "rewards/accuracies": 1.0, "rewards/chosen": 1.102353811264038, "rewards/margins": 0.6083664298057556, "rewards/rejected": 0.49398738145828247, "step": 2577 }, { "epoch": 1.39, "learning_rate": 5.8303028601253015e-08, "logits/chosen": -2.01407790184021, "logits/rejected": -2.0111920833587646, "logps/chosen": -0.5593258142471313, "logps/rejected": -3.046558380126953, "loss": 0.512, "rewards/accuracies": 1.0, "rewards/chosen": 1.0375019311904907, "rewards/margins": 0.4025188684463501, "rewards/rejected": 0.6349830627441406, "step": 2578 }, { "epoch": 1.39, "learning_rate": 5.8274315671676144e-08, "logits/chosen": -2.015255928039551, "logits/rejected": -2.027813673019409, "logps/chosen": -1.632523775100708, "logps/rejected": -7.465479850769043, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 1.1121766567230225, "rewards/margins": 0.6478490233421326, "rewards/rejected": 0.4643276333808899, "step": 2579 }, { "epoch": 1.39, "learning_rate": 5.824559993634961e-08, "logits/chosen": -2.016284704208374, "logits/rejected": -2.2184700965881348, "logps/chosen": -5.725428104400635, "logps/rejected": -1.4724979400634766, "loss": 0.7405, "rewards/accuracies": 0.0, "rewards/chosen": 0.7894378304481506, "rewards/margins": -0.09263074398040771, "rewards/rejected": 0.8820685744285583, "step": 2580 }, { "epoch": 1.39, "learning_rate": 5.821688140501067e-08, "logits/chosen": -2.1186561584472656, "logits/rejected": -1.939927339553833, "logps/chosen": -38.49065017700195, "logps/rejected": -3.3140246868133545, "loss": 0.2532, "rewards/accuracies": 1.0, "rewards/chosen": 1.8740791082382202, "rewards/margins": 1.2441856861114502, "rewards/rejected": 0.6298934817314148, "step": 2581 }, { "epoch": 1.39, "learning_rate": 5.818816008739753e-08, "logits/chosen": -2.2203235626220703, "logits/rejected": -2.2137959003448486, "logps/chosen": -3.92191743850708, "logps/rejected": -1.8466485738754272, "loss": 0.5265, "rewards/accuracies": 1.0, "rewards/chosen": 1.0965479612350464, "rewards/margins": 0.3666345477104187, "rewards/rejected": 0.7299134135246277, "step": 2582 }, { "epoch": 1.39, "learning_rate": 5.815943599324933e-08, "logits/chosen": -2.2660837173461914, "logits/rejected": -2.179922342300415, "logps/chosen": -42.447853088378906, "logps/rejected": -5.8183722496032715, "loss": 0.2529, "rewards/accuracies": 1.0, "rewards/chosen": 2.0301949977874756, "rewards/margins": 1.2457749843597412, "rewards/rejected": 0.7844200730323792, "step": 2583 }, { "epoch": 1.39, "learning_rate": 5.813070913230618e-08, "logits/chosen": -2.0824899673461914, "logits/rejected": -2.2793474197387695, "logps/chosen": -1.0836578607559204, "logps/rejected": -1.1811472177505493, "loss": 0.6763, "rewards/accuracies": 1.0, "rewards/chosen": 0.7244424223899841, "rewards/margins": 0.034020185470581055, "rewards/rejected": 0.6904222369194031, "step": 2584 }, { "epoch": 1.39, "learning_rate": 5.8101979514309104e-08, "logits/chosen": -2.0278830528259277, "logits/rejected": -2.0351545810699463, "logps/chosen": -4.112163543701172, "logps/rejected": -5.180153846740723, "loss": 0.4383, "rewards/accuracies": 1.0, "rewards/chosen": 0.9722980856895447, "rewards/margins": 0.5977791547775269, "rewards/rejected": 0.3745189607143402, "step": 2585 }, { "epoch": 1.39, "learning_rate": 5.807324714900008e-08, "logits/chosen": -2.1461775302886963, "logits/rejected": -2.1967954635620117, "logps/chosen": -5.966305732727051, "logps/rejected": -8.480597496032715, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 1.2312077283859253, "rewards/margins": 0.36478304862976074, "rewards/rejected": 0.8664246797561646, "step": 2586 }, { "epoch": 1.4, "learning_rate": 5.804451204612201e-08, "logits/chosen": -2.002934217453003, "logits/rejected": -2.197110652923584, "logps/chosen": -1.8030214309692383, "logps/rejected": -1.8779520988464355, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9562854170799255, "rewards/margins": 0.022474229335784912, "rewards/rejected": 0.9338111877441406, "step": 2587 }, { "epoch": 1.4, "learning_rate": 5.801577421541869e-08, "logits/chosen": -2.067018747329712, "logits/rejected": -2.073434352874756, "logps/chosen": -2.714158296585083, "logps/rejected": -5.399798393249512, "loss": 0.4529, "rewards/accuracies": 1.0, "rewards/chosen": 1.0196242332458496, "rewards/margins": 0.5570032000541687, "rewards/rejected": 0.4626210331916809, "step": 2588 }, { "epoch": 1.4, "learning_rate": 5.7987033666634897e-08, "logits/chosen": -2.0134754180908203, "logits/rejected": -2.3297219276428223, "logps/chosen": -0.6961361765861511, "logps/rejected": -0.7369797825813293, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9435209631919861, "rewards/margins": 0.016068100929260254, "rewards/rejected": 0.9274528622627258, "step": 2589 }, { "epoch": 1.4, "learning_rate": 5.795829040951633e-08, "logits/chosen": -2.149620532989502, "logits/rejected": -2.1520605087280273, "logps/chosen": -4.04153299331665, "logps/rejected": -3.4215614795684814, "loss": 0.5063, "rewards/accuracies": 1.0, "rewards/chosen": 1.0314003229141235, "rewards/margins": 0.4167524576187134, "rewards/rejected": 0.6146478652954102, "step": 2590 }, { "epoch": 1.4, "learning_rate": 5.7929544453809545e-08, "logits/chosen": -2.104724645614624, "logits/rejected": -2.222710132598877, "logps/chosen": -2.5697712898254395, "logps/rejected": -3.6653852462768555, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 0.91436368227005, "rewards/margins": 0.1893036961555481, "rewards/rejected": 0.725059986114502, "step": 2591 }, { "epoch": 1.4, "learning_rate": 5.790079580926206e-08, "logits/chosen": -2.1963682174682617, "logits/rejected": -2.203892469406128, "logps/chosen": -1.4417294263839722, "logps/rejected": -2.9070677757263184, "loss": 0.4466, "rewards/accuracies": 1.0, "rewards/chosen": 1.134867548942566, "rewards/margins": 0.5745315551757812, "rewards/rejected": 0.5603359937667847, "step": 2592 }, { "epoch": 1.4, "learning_rate": 5.7872044485622284e-08, "logits/chosen": -2.1124331951141357, "logits/rejected": -2.3062381744384766, "logps/chosen": -2.416553020477295, "logps/rejected": -2.3867106437683105, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.6343116164207458, "rewards/margins": -0.0012640953063964844, "rewards/rejected": 0.6355757117271423, "step": 2593 }, { "epoch": 1.4, "learning_rate": 5.7843290492639554e-08, "logits/chosen": -2.0539791584014893, "logits/rejected": -2.252403497695923, "logps/chosen": -0.3329586088657379, "logps/rejected": -0.44565263390541077, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.850283145904541, "rewards/margins": 0.009234905242919922, "rewards/rejected": 0.8410482406616211, "step": 2594 }, { "epoch": 1.4, "learning_rate": 5.7814533840064115e-08, "logits/chosen": -2.2486839294433594, "logits/rejected": -2.149536371231079, "logps/chosen": -28.977563858032227, "logps/rejected": -3.909619092941284, "loss": 0.2604, "rewards/accuracies": 1.0, "rewards/chosen": 1.6848334074020386, "rewards/margins": 1.212547779083252, "rewards/rejected": 0.47228556871414185, "step": 2595 }, { "epoch": 1.4, "learning_rate": 5.778577453764709e-08, "logits/chosen": -2.0528011322021484, "logits/rejected": -2.2795732021331787, "logps/chosen": -3.9835221767425537, "logps/rejected": -0.9656448364257812, "loss": 0.8242, "rewards/accuracies": 0.0, "rewards/chosen": 0.8487840890884399, "rewards/margins": -0.24688780307769775, "rewards/rejected": 1.0956718921661377, "step": 2596 }, { "epoch": 1.4, "learning_rate": 5.775701259514052e-08, "logits/chosen": -2.0377614498138428, "logits/rejected": -2.0250556468963623, "logps/chosen": -2.2865853309631348, "logps/rejected": -5.399440288543701, "loss": 0.4564, "rewards/accuracies": 1.0, "rewards/chosen": 1.371258020401001, "rewards/margins": 0.5474653840065002, "rewards/rejected": 0.8237926363945007, "step": 2597 }, { "epoch": 1.4, "learning_rate": 5.772824802229732e-08, "logits/chosen": -1.9704804420471191, "logits/rejected": -2.23740816116333, "logps/chosen": -0.24632872641086578, "logps/rejected": -0.2658426761627197, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8366581201553345, "rewards/margins": 0.019573330879211426, "rewards/rejected": 0.817084789276123, "step": 2598 }, { "epoch": 1.4, "learning_rate": 5.7699480828871304e-08, "logits/chosen": -1.995491623878479, "logits/rejected": -2.2665529251098633, "logps/chosen": -0.5182278752326965, "logps/rejected": -0.5261034965515137, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.966588020324707, "rewards/margins": 0.015443027019500732, "rewards/rejected": 0.9511449933052063, "step": 2599 }, { "epoch": 1.4, "learning_rate": 5.7670711024617224e-08, "logits/chosen": -2.0574116706848145, "logits/rejected": -2.285038471221924, "logps/chosen": -1.0001078844070435, "logps/rejected": -1.096280813217163, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.7314204573631287, "rewards/margins": -0.002529144287109375, "rewards/rejected": 0.733949601650238, "step": 2600 }, { "epoch": 1.4, "learning_rate": 5.764193861929063e-08, "logits/chosen": -2.146618127822876, "logits/rejected": -2.14121413230896, "logps/chosen": -2.6002070903778076, "logps/rejected": -2.896141290664673, "loss": 0.5302, "rewards/accuracies": 1.0, "rewards/chosen": 1.127200722694397, "rewards/margins": 0.35783034563064575, "rewards/rejected": 0.7693703770637512, "step": 2601 }, { "epoch": 1.4, "learning_rate": 5.761316362264801e-08, "logits/chosen": -2.071934223175049, "logits/rejected": -2.0958118438720703, "logps/chosen": -3.159884452819824, "logps/rejected": -6.193259239196777, "loss": 0.4719, "rewards/accuracies": 1.0, "rewards/chosen": 1.178360104560852, "rewards/margins": 0.5057985186576843, "rewards/rejected": 0.6725615859031677, "step": 2602 }, { "epoch": 1.4, "learning_rate": 5.758438604444674e-08, "logits/chosen": -2.0338544845581055, "logits/rejected": -2.0130860805511475, "logps/chosen": -7.044717788696289, "logps/rejected": -6.230997562408447, "loss": 0.361, "rewards/accuracies": 1.0, "rewards/chosen": 1.330107569694519, "rewards/margins": 0.8330243229866028, "rewards/rejected": 0.49708324670791626, "step": 2603 }, { "epoch": 1.4, "learning_rate": 5.755560589444503e-08, "logits/chosen": -2.1494598388671875, "logits/rejected": -2.289677619934082, "logps/chosen": -0.5181054472923279, "logps/rejected": -0.5168090462684631, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 1.050918698310852, "rewards/margins": -0.0003731250762939453, "rewards/rejected": 1.051291823387146, "step": 2604 }, { "epoch": 1.41, "learning_rate": 5.7526823182401983e-08, "logits/chosen": -2.1287569999694824, "logits/rejected": -2.1305229663848877, "logps/chosen": -0.9577614665031433, "logps/rejected": -2.689715623855591, "loss": 0.5547, "rewards/accuracies": 1.0, "rewards/chosen": 1.0624308586120605, "rewards/margins": 0.29927366971969604, "rewards/rejected": 0.7631571888923645, "step": 2605 }, { "epoch": 1.41, "learning_rate": 5.749803791807758e-08, "logits/chosen": -2.0525457859039307, "logits/rejected": -2.2903761863708496, "logps/chosen": -1.36907160282135, "logps/rejected": -1.4438725709915161, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9697807431221008, "rewards/margins": 0.0514148473739624, "rewards/rejected": 0.9183658957481384, "step": 2606 }, { "epoch": 1.41, "learning_rate": 5.7469250111232635e-08, "logits/chosen": -2.2005343437194824, "logits/rejected": -2.2922117710113525, "logps/chosen": -13.425365447998047, "logps/rejected": -8.859067916870117, "loss": 0.7991, "rewards/accuracies": 0.0, "rewards/chosen": 0.6908335089683533, "rewards/margins": -0.20181846618652344, "rewards/rejected": 0.8926519751548767, "step": 2607 }, { "epoch": 1.41, "learning_rate": 5.744045977162888e-08, "logits/chosen": -1.966747760772705, "logits/rejected": -2.234318494796753, "logps/chosen": -1.5877659320831299, "logps/rejected": -3.9530446529388428, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 1.0123153924942017, "rewards/margins": 0.04379969835281372, "rewards/rejected": 0.9685156941413879, "step": 2608 }, { "epoch": 1.41, "learning_rate": 5.7411666909028855e-08, "logits/chosen": -2.0790605545043945, "logits/rejected": -2.1775217056274414, "logps/chosen": -1.4707767963409424, "logps/rejected": -27.107006072998047, "loss": 0.2557, "rewards/accuracies": 1.0, "rewards/chosen": 1.1259777545928955, "rewards/margins": 1.2333544492721558, "rewards/rejected": -0.10737667232751846, "step": 2609 }, { "epoch": 1.41, "learning_rate": 5.7382871533195984e-08, "logits/chosen": -2.0396344661712646, "logits/rejected": -2.2874526977539062, "logps/chosen": -0.24418362975120544, "logps/rejected": -0.2381005734205246, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.9453632235527039, "rewards/margins": 0.009027957916259766, "rewards/rejected": 0.9363352656364441, "step": 2610 }, { "epoch": 1.41, "learning_rate": 5.735407365389452e-08, "logits/chosen": -2.1554102897644043, "logits/rejected": -2.3273942470550537, "logps/chosen": -0.8112886548042297, "logps/rejected": -3.1405272483825684, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 1.0246682167053223, "rewards/margins": 0.035093605518341064, "rewards/rejected": 0.9895746111869812, "step": 2611 }, { "epoch": 1.41, "learning_rate": 5.7325273280889575e-08, "logits/chosen": -2.1504311561584473, "logits/rejected": -2.0941083431243896, "logps/chosen": -23.27344512939453, "logps/rejected": -3.4320731163024902, "loss": 0.2367, "rewards/accuracies": 1.0, "rewards/chosen": 2.0742454528808594, "rewards/margins": 1.320284366607666, "rewards/rejected": 0.7539610266685486, "step": 2612 }, { "epoch": 1.41, "learning_rate": 5.729647042394711e-08, "logits/chosen": -2.0211570262908936, "logits/rejected": -2.2694733142852783, "logps/chosen": -1.9824230670928955, "logps/rejected": -4.5529584884643555, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 0.8750857710838318, "rewards/margins": 0.18922901153564453, "rewards/rejected": 0.6858567595481873, "step": 2613 }, { "epoch": 1.41, "learning_rate": 5.726766509283395e-08, "logits/chosen": -2.1059789657592773, "logits/rejected": -2.216397523880005, "logps/chosen": -8.97461223602295, "logps/rejected": -21.639036178588867, "loss": 0.2483, "rewards/accuracies": 1.0, "rewards/chosen": 1.8914722204208374, "rewards/margins": 1.2663309574127197, "rewards/rejected": 0.6251413226127625, "step": 2614 }, { "epoch": 1.41, "learning_rate": 5.723885729731772e-08, "logits/chosen": -1.9761284589767456, "logits/rejected": -1.9766219854354858, "logps/chosen": -9.463444709777832, "logps/rejected": -1.2683013677597046, "loss": 0.4846, "rewards/accuracies": 1.0, "rewards/chosen": 1.3487855195999146, "rewards/margins": 0.47241896390914917, "rewards/rejected": 0.8763665556907654, "step": 2615 }, { "epoch": 1.41, "learning_rate": 5.7210047047166897e-08, "logits/chosen": -2.050565481185913, "logits/rejected": -2.2682197093963623, "logps/chosen": -1.1374105215072632, "logps/rejected": -1.2252538204193115, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0648740530014038, "rewards/margins": 0.022090673446655273, "rewards/rejected": 1.0427833795547485, "step": 2616 }, { "epoch": 1.41, "learning_rate": 5.7181234352150774e-08, "logits/chosen": -2.0669915676116943, "logits/rejected": -2.0741007328033447, "logps/chosen": -5.432315826416016, "logps/rejected": -3.0028865337371826, "loss": 0.5226, "rewards/accuracies": 1.0, "rewards/chosen": 1.0125421285629272, "rewards/margins": 0.37621009349823, "rewards/rejected": 0.6363320350646973, "step": 2617 }, { "epoch": 1.41, "learning_rate": 5.7152419222039515e-08, "logits/chosen": -2.155463218688965, "logits/rejected": -2.2546024322509766, "logps/chosen": -11.328492164611816, "logps/rejected": -3.730665445327759, "loss": 1.1145, "rewards/accuracies": 0.0, "rewards/chosen": 0.015156554989516735, "rewards/margins": -0.7169469594955444, "rewards/rejected": 0.7321035265922546, "step": 2618 }, { "epoch": 1.41, "learning_rate": 5.712360166660408e-08, "logits/chosen": -2.017467498779297, "logits/rejected": -2.0152173042297363, "logps/chosen": -0.9355286359786987, "logps/rejected": -4.081059455871582, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 0.9366704821586609, "rewards/margins": 0.40367382764816284, "rewards/rejected": 0.532996654510498, "step": 2619 }, { "epoch": 1.41, "learning_rate": 5.7094781695616255e-08, "logits/chosen": -2.1640937328338623, "logits/rejected": -2.256152868270874, "logps/chosen": -2.065880537033081, "logps/rejected": -2.210785388946533, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.9878539443016052, "rewards/margins": 0.019586682319641113, "rewards/rejected": 0.9682672619819641, "step": 2620 }, { "epoch": 1.41, "learning_rate": 5.7065959318848645e-08, "logits/chosen": -2.0326485633850098, "logits/rejected": -2.0321097373962402, "logps/chosen": -0.34686580300331116, "logps/rejected": -4.196262836456299, "loss": 0.4816, "rewards/accuracies": 1.0, "rewards/chosen": 1.0674935579299927, "rewards/margins": 0.4800870418548584, "rewards/rejected": 0.5874065160751343, "step": 2621 }, { "epoch": 1.41, "learning_rate": 5.703713454607465e-08, "logits/chosen": -1.9421172142028809, "logits/rejected": -2.2091901302337646, "logps/chosen": -0.25037091970443726, "logps/rejected": -0.3114197850227356, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 0.9232346415519714, "rewards/margins": -0.014518976211547852, "rewards/rejected": 0.9377536177635193, "step": 2622 }, { "epoch": 1.41, "learning_rate": 5.700830738706852e-08, "logits/chosen": -2.0616726875305176, "logits/rejected": -2.055393934249878, "logps/chosen": -5.674044132232666, "logps/rejected": -4.683429718017578, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": 0.9582107663154602, "rewards/margins": 0.5683380365371704, "rewards/rejected": 0.3898727595806122, "step": 2623 }, { "epoch": 1.42, "learning_rate": 5.697947785160532e-08, "logits/chosen": -2.0274546146392822, "logits/rejected": -2.021726131439209, "logps/chosen": -0.8120176792144775, "logps/rejected": -4.621848106384277, "loss": 0.4797, "rewards/accuracies": 1.0, "rewards/chosen": 1.0560945272445679, "rewards/margins": 0.48529499769210815, "rewards/rejected": 0.5707995295524597, "step": 2624 }, { "epoch": 1.42, "learning_rate": 5.695064594946087e-08, "logits/chosen": -2.003103256225586, "logits/rejected": -2.1960463523864746, "logps/chosen": -1.2337614297866821, "logps/rejected": -1.2665627002716064, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9170692563056946, "rewards/margins": -8.022785186767578e-05, "rewards/rejected": 0.9171494841575623, "step": 2625 }, { "epoch": 1.42, "learning_rate": 5.6921811690411825e-08, "logits/chosen": -2.0610575675964355, "logits/rejected": -2.27213454246521, "logps/chosen": -6.903450965881348, "logps/rejected": -3.080265522003174, "loss": 0.7413, "rewards/accuracies": 0.0, "rewards/chosen": 0.8080283403396606, "rewards/margins": -0.09401309490203857, "rewards/rejected": 0.9020414352416992, "step": 2626 }, { "epoch": 1.42, "learning_rate": 5.689297508423565e-08, "logits/chosen": -2.0165226459503174, "logits/rejected": -2.022252082824707, "logps/chosen": -1.2685515880584717, "logps/rejected": -4.218923568725586, "loss": 0.4187, "rewards/accuracies": 1.0, "rewards/chosen": 0.9524052739143372, "rewards/margins": 0.6540534496307373, "rewards/rejected": 0.29835185408592224, "step": 2627 }, { "epoch": 1.42, "learning_rate": 5.686413614071057e-08, "logits/chosen": -2.0779757499694824, "logits/rejected": -2.2691149711608887, "logps/chosen": -1.2962008714675903, "logps/rejected": -1.4443963766098022, "loss": 0.6696, "rewards/accuracies": 1.0, "rewards/chosen": 0.8224745988845825, "rewards/margins": 0.04773080348968506, "rewards/rejected": 0.7747437953948975, "step": 2628 }, { "epoch": 1.42, "learning_rate": 5.6835294869615646e-08, "logits/chosen": -2.0176963806152344, "logits/rejected": -2.0167946815490723, "logps/chosen": -2.5740182399749756, "logps/rejected": -4.895241737365723, "loss": 0.4773, "rewards/accuracies": 1.0, "rewards/chosen": 1.0969470739364624, "rewards/margins": 0.4914206266403198, "rewards/rejected": 0.6055264472961426, "step": 2629 }, { "epoch": 1.42, "learning_rate": 5.6806451280730694e-08, "logits/chosen": -2.104889392852783, "logits/rejected": -2.024125099182129, "logps/chosen": -24.617813110351562, "logps/rejected": -2.8680107593536377, "loss": 0.3175, "rewards/accuracies": 1.0, "rewards/chosen": 1.5761005878448486, "rewards/margins": 0.9842143654823303, "rewards/rejected": 0.5918862223625183, "step": 2630 }, { "epoch": 1.42, "learning_rate": 5.677760538383634e-08, "logits/chosen": -2.155416250228882, "logits/rejected": -2.260171413421631, "logps/chosen": -1.2071759700775146, "logps/rejected": -1.233188509941101, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8446294069290161, "rewards/margins": 0.000455319881439209, "rewards/rejected": 0.8441740870475769, "step": 2631 }, { "epoch": 1.42, "learning_rate": 5.674875718871396e-08, "logits/chosen": -2.0081841945648193, "logits/rejected": -2.271527051925659, "logps/chosen": -2.0398201942443848, "logps/rejected": -0.733642041683197, "loss": 0.7064, "rewards/accuracies": 0.0, "rewards/chosen": 1.0241392850875854, "rewards/margins": -0.026233673095703125, "rewards/rejected": 1.0503729581832886, "step": 2632 }, { "epoch": 1.42, "learning_rate": 5.6719906705145734e-08, "logits/chosen": -1.9859683513641357, "logits/rejected": -2.0047833919525146, "logps/chosen": -1.830437183380127, "logps/rejected": -6.118809223175049, "loss": 0.3847, "rewards/accuracies": 1.0, "rewards/chosen": 1.2490946054458618, "rewards/margins": 0.7567278146743774, "rewards/rejected": 0.492366760969162, "step": 2633 }, { "epoch": 1.42, "learning_rate": 5.669105394291464e-08, "logits/chosen": -2.033536195755005, "logits/rejected": -2.2550201416015625, "logps/chosen": -0.6630253195762634, "logps/rejected": -0.6813362240791321, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 1.0084973573684692, "rewards/margins": 0.038790881633758545, "rewards/rejected": 0.9697064757347107, "step": 2634 }, { "epoch": 1.42, "learning_rate": 5.6662198911804384e-08, "logits/chosen": -2.1186444759368896, "logits/rejected": -2.111624240875244, "logps/chosen": -4.699293613433838, "logps/rejected": -4.253516674041748, "loss": 0.3082, "rewards/accuracies": 1.0, "rewards/chosen": 1.4274139404296875, "rewards/margins": 1.0190058946609497, "rewards/rejected": 0.4084080159664154, "step": 2635 }, { "epoch": 1.42, "learning_rate": 5.663334162159945e-08, "logits/chosen": -2.1563913822174072, "logits/rejected": -2.083818197250366, "logps/chosen": -31.401792526245117, "logps/rejected": -3.909635305404663, "loss": 0.3697, "rewards/accuracies": 1.0, "rewards/chosen": 1.4049357175827026, "rewards/margins": 0.8043842911720276, "rewards/rejected": 0.600551426410675, "step": 2636 }, { "epoch": 1.42, "learning_rate": 5.6604482082085126e-08, "logits/chosen": -2.119548797607422, "logits/rejected": -2.1156692504882812, "logps/chosen": -6.88892936706543, "logps/rejected": -3.5097522735595703, "loss": 0.3699, "rewards/accuracies": 1.0, "rewards/chosen": 1.402295470237732, "rewards/margins": 0.8037710785865784, "rewards/rejected": 0.5985243916511536, "step": 2637 }, { "epoch": 1.42, "learning_rate": 5.65756203030474e-08, "logits/chosen": -2.015280246734619, "logits/rejected": -2.015613555908203, "logps/chosen": -2.298982858657837, "logps/rejected": -4.183917045593262, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 1.6809602975845337, "rewards/margins": 1.1680107116699219, "rewards/rejected": 0.5129496455192566, "step": 2638 }, { "epoch": 1.42, "learning_rate": 5.654675629427308e-08, "logits/chosen": -2.068706512451172, "logits/rejected": -2.068115472793579, "logps/chosen": -2.363719940185547, "logps/rejected": -3.8758206367492676, "loss": 0.5394, "rewards/accuracies": 1.0, "rewards/chosen": 0.8979911804199219, "rewards/margins": 0.3354601263999939, "rewards/rejected": 0.562531054019928, "step": 2639 }, { "epoch": 1.42, "learning_rate": 5.65178900655497e-08, "logits/chosen": -2.046894073486328, "logits/rejected": -2.038949966430664, "logps/chosen": -2.5289199352264404, "logps/rejected": -5.416823863983154, "loss": 0.3333, "rewards/accuracies": 1.0, "rewards/chosen": 1.37019681930542, "rewards/margins": 0.9273592233657837, "rewards/rejected": 0.44283756613731384, "step": 2640 }, { "epoch": 1.42, "learning_rate": 5.648902162666555e-08, "logits/chosen": -2.0718653202056885, "logits/rejected": -2.0688600540161133, "logps/chosen": -2.8843448162078857, "logps/rejected": -4.848288536071777, "loss": 0.5332, "rewards/accuracies": 1.0, "rewards/chosen": 0.9467474222183228, "rewards/margins": 0.3503703474998474, "rewards/rejected": 0.5963770747184753, "step": 2641 }, { "epoch": 1.43, "learning_rate": 5.646015098740967e-08, "logits/chosen": -2.09126877784729, "logits/rejected": -2.099252462387085, "logps/chosen": -3.026900291442871, "logps/rejected": -3.0465571880340576, "loss": 0.4774, "rewards/accuracies": 1.0, "rewards/chosen": 0.9816016554832458, "rewards/margins": 0.4911847412586212, "rewards/rejected": 0.49041691422462463, "step": 2642 }, { "epoch": 1.43, "learning_rate": 5.643127815757184e-08, "logits/chosen": -2.2071421146392822, "logits/rejected": -2.1923935413360596, "logps/chosen": -10.87622356414795, "logps/rejected": -5.970460891723633, "loss": 0.5568, "rewards/accuracies": 1.0, "rewards/chosen": 1.2118210792541504, "rewards/margins": 0.2943328619003296, "rewards/rejected": 0.9174882173538208, "step": 2643 }, { "epoch": 1.43, "learning_rate": 5.64024031469426e-08, "logits/chosen": -2.086077928543091, "logits/rejected": -2.085190773010254, "logps/chosen": -0.701217532157898, "logps/rejected": -10.441180229187012, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0965746641159058, "rewards/margins": 0.4234084486961365, "rewards/rejected": 0.6731662154197693, "step": 2644 }, { "epoch": 1.43, "learning_rate": 5.637352596531321e-08, "logits/chosen": -1.9928561449050903, "logits/rejected": -2.25433349609375, "logps/chosen": -1.2991538047790527, "logps/rejected": -1.2945835590362549, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8752509951591492, "rewards/margins": 0.006103992462158203, "rewards/rejected": 0.869147002696991, "step": 2645 }, { "epoch": 1.43, "learning_rate": 5.634464662247569e-08, "logits/chosen": -1.9736578464508057, "logits/rejected": -1.9736303091049194, "logps/chosen": -1.7533663511276245, "logps/rejected": -0.8016843199729919, "loss": 0.5872, "rewards/accuracies": 1.0, "rewards/chosen": 1.1197115182876587, "rewards/margins": 0.22443503141403198, "rewards/rejected": 0.8952764868736267, "step": 2646 }, { "epoch": 1.43, "learning_rate": 5.631576512822277e-08, "logits/chosen": -2.059797763824463, "logits/rejected": -2.0662529468536377, "logps/chosen": -3.0805397033691406, "logps/rejected": -0.4630148708820343, "loss": 0.5246, "rewards/accuracies": 1.0, "rewards/chosen": 1.2508981227874756, "rewards/margins": 0.37148517370224, "rewards/rejected": 0.8794129490852356, "step": 2647 }, { "epoch": 1.43, "learning_rate": 5.6286881492347904e-08, "logits/chosen": -1.9492237567901611, "logits/rejected": -2.230847120285034, "logps/chosen": -3.4810190200805664, "logps/rejected": -3.7928414344787598, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.6844684481620789, "rewards/margins": 0.0001125335693359375, "rewards/rejected": 0.6843559145927429, "step": 2648 }, { "epoch": 1.43, "learning_rate": 5.6257995724645304e-08, "logits/chosen": -2.0739328861236572, "logits/rejected": -2.0599915981292725, "logps/chosen": -3.9029722213745117, "logps/rejected": -5.894265174865723, "loss": 0.2776, "rewards/accuracies": 1.0, "rewards/chosen": 1.478744387626648, "rewards/margins": 1.1395056247711182, "rewards/rejected": 0.3392387330532074, "step": 2649 }, { "epoch": 1.43, "learning_rate": 5.622910783490987e-08, "logits/chosen": -2.145751714706421, "logits/rejected": -2.263946771621704, "logps/chosen": -4.7732062339782715, "logps/rejected": -2.252406358718872, "loss": 0.6425, "rewards/accuracies": 1.0, "rewards/chosen": 0.9949922561645508, "rewards/margins": 0.10409027338027954, "rewards/rejected": 0.8909019827842712, "step": 2650 }, { "epoch": 1.43, "learning_rate": 5.620021783293727e-08, "logits/chosen": -2.020820140838623, "logits/rejected": -2.019056797027588, "logps/chosen": -0.9902936816215515, "logps/rejected": -3.6241352558135986, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9596995711326599, "rewards/margins": 0.47443726658821106, "rewards/rejected": 0.48526230454444885, "step": 2651 }, { "epoch": 1.43, "learning_rate": 5.617132572852382e-08, "logits/chosen": -2.154676675796509, "logits/rejected": -2.2938270568847656, "logps/chosen": -5.6854753494262695, "logps/rejected": -3.595248222351074, "loss": 0.6574, "rewards/accuracies": 1.0, "rewards/chosen": 0.6656226515769958, "rewards/margins": 0.07282829284667969, "rewards/rejected": 0.5927943587303162, "step": 2652 }, { "epoch": 1.43, "learning_rate": 5.6142431531466617e-08, "logits/chosen": -2.1719677448272705, "logits/rejected": -2.302703857421875, "logps/chosen": -0.3967934548854828, "logps/rejected": -0.36723318696022034, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8972907066345215, "rewards/margins": 0.014606475830078125, "rewards/rejected": 0.8826842308044434, "step": 2653 }, { "epoch": 1.43, "learning_rate": 5.6113535251563415e-08, "logits/chosen": -2.05485200881958, "logits/rejected": -2.360260486602783, "logps/chosen": -2.4991655349731445, "logps/rejected": -2.605198383331299, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9519492387771606, "rewards/margins": 0.008611202239990234, "rewards/rejected": 0.9433380365371704, "step": 2654 }, { "epoch": 1.43, "learning_rate": 5.608463689861269e-08, "logits/chosen": -2.2110395431518555, "logits/rejected": -2.3380892276763916, "logps/chosen": -0.49336981773376465, "logps/rejected": -0.5511276721954346, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 1.0398187637329102, "rewards/margins": -0.0006659030914306641, "rewards/rejected": 1.0404846668243408, "step": 2655 }, { "epoch": 1.43, "learning_rate": 5.6055736482413643e-08, "logits/chosen": -1.992391586303711, "logits/rejected": -2.2206616401672363, "logps/chosen": -0.8791359066963196, "logps/rejected": -0.8789234161376953, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.0411444902420044, "rewards/margins": 0.023096561431884766, "rewards/rejected": 1.0180479288101196, "step": 2656 }, { "epoch": 1.43, "learning_rate": 5.6026834012766146e-08, "logits/chosen": -1.9505847692489624, "logits/rejected": -1.9517053365707397, "logps/chosen": -1.6862825155258179, "logps/rejected": -0.8249867558479309, "loss": 0.63, "rewards/accuracies": 1.0, "rewards/chosen": 1.0705126523971558, "rewards/margins": 0.13051342964172363, "rewards/rejected": 0.9399992227554321, "step": 2657 }, { "epoch": 1.43, "learning_rate": 5.5997929499470795e-08, "logits/chosen": -1.9732602834701538, "logits/rejected": -2.2795519828796387, "logps/chosen": -0.4554198682308197, "logps/rejected": -0.5361145734786987, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 1.0026752948760986, "rewards/margins": 0.027693510055541992, "rewards/rejected": 0.9749817848205566, "step": 2658 }, { "epoch": 1.43, "learning_rate": 5.596902295232886e-08, "logits/chosen": -2.1165311336517334, "logits/rejected": -2.117687940597534, "logps/chosen": -0.2531243860721588, "logps/rejected": -5.215500831604004, "loss": 0.4613, "rewards/accuracies": 1.0, "rewards/chosen": 0.9244713187217712, "rewards/margins": 0.5342625379562378, "rewards/rejected": 0.39020881056785583, "step": 2659 }, { "epoch": 1.43, "learning_rate": 5.594011438114228e-08, "logits/chosen": -2.094287633895874, "logits/rejected": -2.1092042922973633, "logps/chosen": -5.015511989593506, "logps/rejected": -3.692945718765259, "loss": 0.5507, "rewards/accuracies": 1.0, "rewards/chosen": 1.0968313217163086, "rewards/margins": 0.3087175488471985, "rewards/rejected": 0.7881137728691101, "step": 2660 }, { "epoch": 1.44, "learning_rate": 5.5911203795713734e-08, "logits/chosen": -2.111036539077759, "logits/rejected": -2.1153647899627686, "logps/chosen": -3.3828423023223877, "logps/rejected": -1.315898060798645, "loss": 0.6113, "rewards/accuracies": 1.0, "rewards/chosen": 1.355767011642456, "rewards/margins": 0.17100274562835693, "rewards/rejected": 1.1847642660140991, "step": 2661 }, { "epoch": 1.44, "learning_rate": 5.588229120584653e-08, "logits/chosen": -2.058671236038208, "logits/rejected": -2.2725670337677, "logps/chosen": -0.5014556646347046, "logps/rejected": -5.259143352508545, "loss": 0.5532, "rewards/accuracies": 1.0, "rewards/chosen": 0.990567684173584, "rewards/margins": 0.3026310205459595, "rewards/rejected": 0.6879366636276245, "step": 2662 }, { "epoch": 1.44, "learning_rate": 5.58533766213447e-08, "logits/chosen": -2.1053171157836914, "logits/rejected": -2.098252773284912, "logps/chosen": -2.483046770095825, "logps/rejected": -3.686917304992676, "loss": 0.5257, "rewards/accuracies": 1.0, "rewards/chosen": 1.2595237493515015, "rewards/margins": 0.36876845359802246, "rewards/rejected": 0.890755295753479, "step": 2663 }, { "epoch": 1.44, "learning_rate": 5.5824460052012924e-08, "logits/chosen": -2.0505874156951904, "logits/rejected": -2.290769338607788, "logps/chosen": -6.618964195251465, "logps/rejected": -0.7577424049377441, "loss": 0.7767, "rewards/accuracies": 0.0, "rewards/chosen": 0.7817659378051758, "rewards/margins": -0.16063958406448364, "rewards/rejected": 0.9424055218696594, "step": 2664 }, { "epoch": 1.44, "learning_rate": 5.579554150765655e-08, "logits/chosen": -2.0902485847473145, "logits/rejected": -2.2820818424224854, "logps/chosen": -0.8771148920059204, "logps/rejected": -0.8787583112716675, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.8947550654411316, "rewards/margins": 0.024018526077270508, "rewards/rejected": 0.8707365393638611, "step": 2665 }, { "epoch": 1.44, "learning_rate": 5.576662099808165e-08, "logits/chosen": -1.946295142173767, "logits/rejected": -1.916488528251648, "logps/chosen": -11.225053787231445, "logps/rejected": -4.74381160736084, "loss": 0.4985, "rewards/accuracies": 1.0, "rewards/chosen": 0.880984902381897, "rewards/margins": 0.4366523027420044, "rewards/rejected": 0.4443325996398926, "step": 2666 }, { "epoch": 1.44, "learning_rate": 5.5737698533094865e-08, "logits/chosen": -2.0411629676818848, "logits/rejected": -2.308133125305176, "logps/chosen": -0.5214123725891113, "logps/rejected": -0.5471449494361877, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.8294258117675781, "rewards/margins": 0.010276496410369873, "rewards/rejected": 0.8191493153572083, "step": 2667 }, { "epoch": 1.44, "learning_rate": 5.5708774122503596e-08, "logits/chosen": -1.9525066614151, "logits/rejected": -1.953143835067749, "logps/chosen": -2.737576723098755, "logps/rejected": -0.6203404068946838, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.7915688753128052, "rewards/margins": 0.016864776611328125, "rewards/rejected": 0.774704098701477, "step": 2668 }, { "epoch": 1.44, "learning_rate": 5.567984777611585e-08, "logits/chosen": -2.011791229248047, "logits/rejected": -2.0174973011016846, "logps/chosen": -1.2187391519546509, "logps/rejected": -3.4917728900909424, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0150083303451538, "rewards/margins": 0.509685218334198, "rewards/rejected": 0.5053231120109558, "step": 2669 }, { "epoch": 1.44, "learning_rate": 5.565091950374031e-08, "logits/chosen": -2.0306222438812256, "logits/rejected": -2.034001588821411, "logps/chosen": -0.5222373604774475, "logps/rejected": -3.8762848377227783, "loss": 0.5096, "rewards/accuracies": 1.0, "rewards/chosen": 0.8312053084373474, "rewards/margins": 0.4086272418498993, "rewards/rejected": 0.4225780665874481, "step": 2670 }, { "epoch": 1.44, "learning_rate": 5.56219893151863e-08, "logits/chosen": -2.1502983570098877, "logits/rejected": -2.2693490982055664, "logps/chosen": -1.8076385259628296, "logps/rejected": -1.6671384572982788, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.6091451048851013, "rewards/margins": -0.003879070281982422, "rewards/rejected": 0.6130241751670837, "step": 2671 }, { "epoch": 1.44, "learning_rate": 5.559305722026379e-08, "logits/chosen": -2.1498122215270996, "logits/rejected": -2.0505447387695312, "logps/chosen": -30.778093338012695, "logps/rejected": -5.929637908935547, "loss": 0.2003, "rewards/accuracies": 1.0, "rewards/chosen": 2.081027030944824, "rewards/margins": 1.5059258937835693, "rewards/rejected": 0.5751010775566101, "step": 2672 }, { "epoch": 1.44, "learning_rate": 5.556412322878343e-08, "logits/chosen": -2.150963068008423, "logits/rejected": -2.271972894668579, "logps/chosen": -0.265641450881958, "logps/rejected": -0.27030178904533386, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9477592706680298, "rewards/margins": 0.014454483985900879, "rewards/rejected": 0.9333047866821289, "step": 2673 }, { "epoch": 1.44, "learning_rate": 5.553518735055647e-08, "logits/chosen": -2.070180892944336, "logits/rejected": -2.277181625366211, "logps/chosen": -0.456380158662796, "logps/rejected": -0.40809598565101624, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.9133152365684509, "rewards/margins": 0.0202481746673584, "rewards/rejected": 0.8930670619010925, "step": 2674 }, { "epoch": 1.44, "learning_rate": 5.550624959539483e-08, "logits/chosen": -2.0746443271636963, "logits/rejected": -2.0760996341705322, "logps/chosen": -3.5241799354553223, "logps/rejected": -14.327842712402344, "loss": 0.2646, "rewards/accuracies": 1.0, "rewards/chosen": 1.1979016065597534, "rewards/margins": 1.1941696405410767, "rewards/rejected": 0.0037319182883948088, "step": 2675 }, { "epoch": 1.44, "learning_rate": 5.547730997311104e-08, "logits/chosen": -2.2033281326293945, "logits/rejected": -2.170025587081909, "logps/chosen": -29.506010055541992, "logps/rejected": -1.9947223663330078, "loss": 0.4138, "rewards/accuracies": 1.0, "rewards/chosen": 1.689888596534729, "rewards/margins": 0.6684310436248779, "rewards/rejected": 1.021457552909851, "step": 2676 }, { "epoch": 1.44, "learning_rate": 5.5448368493518293e-08, "logits/chosen": -1.993480920791626, "logits/rejected": -2.3037829399108887, "logps/chosen": -0.4033520221710205, "logps/rejected": -0.5079346895217896, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.0024250745773315, "rewards/margins": 0.025877535343170166, "rewards/rejected": 0.9765475392341614, "step": 2677 }, { "epoch": 1.44, "learning_rate": 5.541942516643039e-08, "logits/chosen": -2.1423745155334473, "logits/rejected": -2.285102367401123, "logps/chosen": -5.866875648498535, "logps/rejected": -5.592303276062012, "loss": 0.6604, "rewards/accuracies": 1.0, "rewards/chosen": 0.5881660580635071, "rewards/margins": 0.06653499603271484, "rewards/rejected": 0.5216310620307922, "step": 2678 }, { "epoch": 1.44, "learning_rate": 5.539048000166177e-08, "logits/chosen": -2.134533166885376, "logits/rejected": -2.194490671157837, "logps/chosen": -7.233266353607178, "logps/rejected": -17.81116485595703, "loss": 0.522, "rewards/accuracies": 1.0, "rewards/chosen": 1.4801924228668213, "rewards/margins": 0.37767016887664795, "rewards/rejected": 1.1025222539901733, "step": 2679 }, { "epoch": 1.45, "learning_rate": 5.536153300902748e-08, "logits/chosen": -2.1888859272003174, "logits/rejected": -2.3042101860046387, "logps/chosen": -2.776888370513916, "logps/rejected": -2.7703425884246826, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.9221393465995789, "rewards/margins": -0.001263737678527832, "rewards/rejected": 0.9234030842781067, "step": 2680 }, { "epoch": 1.45, "learning_rate": 5.533258419834321e-08, "logits/chosen": -2.0531396865844727, "logits/rejected": -2.049588680267334, "logps/chosen": -2.5951788425445557, "logps/rejected": -3.376035213470459, "loss": 0.5681, "rewards/accuracies": 1.0, "rewards/chosen": 0.909842312335968, "rewards/margins": 0.2680847644805908, "rewards/rejected": 0.6417575478553772, "step": 2681 }, { "epoch": 1.45, "learning_rate": 5.530363357942526e-08, "logits/chosen": -2.055086851119995, "logits/rejected": -2.0573160648345947, "logps/chosen": -0.8286996483802795, "logps/rejected": -4.379397869110107, "loss": 0.5458, "rewards/accuracies": 1.0, "rewards/chosen": 0.9347292184829712, "rewards/margins": 0.32027727365493774, "rewards/rejected": 0.6144519448280334, "step": 2682 }, { "epoch": 1.45, "learning_rate": 5.527468116209052e-08, "logits/chosen": -2.1103572845458984, "logits/rejected": -2.1345198154449463, "logps/chosen": -8.371309280395508, "logps/rejected": -1.90434730052948, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": 1.517202377319336, "rewards/margins": 0.6266344785690308, "rewards/rejected": 0.8905678987503052, "step": 2683 }, { "epoch": 1.45, "learning_rate": 5.52457269561565e-08, "logits/chosen": -1.9839037656784058, "logits/rejected": -1.9927141666412354, "logps/chosen": -1.945144534111023, "logps/rejected": -4.141624927520752, "loss": 0.4393, "rewards/accuracies": 1.0, "rewards/chosen": 1.1179863214492798, "rewards/margins": 0.5949841141700745, "rewards/rejected": 0.5230022072792053, "step": 2684 }, { "epoch": 1.45, "learning_rate": 5.5216770971441365e-08, "logits/chosen": -2.047381639480591, "logits/rejected": -2.2832653522491455, "logps/chosen": -0.33149096369743347, "logps/rejected": -0.3353745937347412, "loss": 0.6694, "rewards/accuracies": 1.0, "rewards/chosen": 0.9720954895019531, "rewards/margins": 0.04815441370010376, "rewards/rejected": 0.9239410758018494, "step": 2685 }, { "epoch": 1.45, "learning_rate": 5.51878132177638e-08, "logits/chosen": -1.9862579107284546, "logits/rejected": -1.984676480293274, "logps/chosen": -2.2308616638183594, "logps/rejected": -8.528812408447266, "loss": 0.2932, "rewards/accuracies": 1.0, "rewards/chosen": 1.4647245407104492, "rewards/margins": 1.0766468048095703, "rewards/rejected": 0.3880777359008789, "step": 2686 }, { "epoch": 1.45, "learning_rate": 5.5158853704943144e-08, "logits/chosen": -2.027434825897217, "logits/rejected": -2.0209598541259766, "logps/chosen": -5.134843826293945, "logps/rejected": -2.653352737426758, "loss": 0.464, "rewards/accuracies": 1.0, "rewards/chosen": 1.2629995346069336, "rewards/margins": 0.5270301699638367, "rewards/rejected": 0.7359693646430969, "step": 2687 }, { "epoch": 1.45, "learning_rate": 5.512989244279933e-08, "logits/chosen": -2.1139700412750244, "logits/rejected": -2.1094276905059814, "logps/chosen": -4.045007228851318, "logps/rejected": -1.7557193040847778, "loss": 0.3736, "rewards/accuracies": 1.0, "rewards/chosen": 1.6080007553100586, "rewards/margins": 0.791816234588623, "rewards/rejected": 0.8161845207214355, "step": 2688 }, { "epoch": 1.45, "learning_rate": 5.5100929441152856e-08, "logits/chosen": -2.0386717319488525, "logits/rejected": -2.040372848510742, "logps/chosen": -2.0558314323425293, "logps/rejected": -0.735055685043335, "loss": 0.6188, "rewards/accuracies": 1.0, "rewards/chosen": 0.8788906335830688, "rewards/margins": 0.15473264455795288, "rewards/rejected": 0.724157989025116, "step": 2689 }, { "epoch": 1.45, "learning_rate": 5.5071964709824846e-08, "logits/chosen": -2.1392691135406494, "logits/rejected": -2.299638032913208, "logps/chosen": -3.0223464965820312, "logps/rejected": -2.97046160697937, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 1.0011965036392212, "rewards/margins": -0.0009058713912963867, "rewards/rejected": 1.0021023750305176, "step": 2690 }, { "epoch": 1.45, "learning_rate": 5.504299825863699e-08, "logits/chosen": -2.057438850402832, "logits/rejected": -2.0592331886291504, "logps/chosen": -2.2460803985595703, "logps/rejected": -1.1326030492782593, "loss": 0.5403, "rewards/accuracies": 1.0, "rewards/chosen": 1.2586902379989624, "rewards/margins": 0.3334615230560303, "rewards/rejected": 0.9252287149429321, "step": 2691 }, { "epoch": 1.45, "learning_rate": 5.5014030097411544e-08, "logits/chosen": -2.0562944412231445, "logits/rejected": -2.0604188442230225, "logps/chosen": -3.556994676589966, "logps/rejected": -4.402470588684082, "loss": 0.389, "rewards/accuracies": 1.0, "rewards/chosen": 1.2981683015823364, "rewards/margins": 0.743410050868988, "rewards/rejected": 0.5547582507133484, "step": 2692 }, { "epoch": 1.45, "learning_rate": 5.498506023597138e-08, "logits/chosen": -2.1482183933258057, "logits/rejected": -2.1487481594085693, "logps/chosen": -0.7555601000785828, "logps/rejected": -4.3088178634643555, "loss": 0.3897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0910134315490723, "rewards/margins": 0.7412280440330505, "rewards/rejected": 0.34978538751602173, "step": 2693 }, { "epoch": 1.45, "learning_rate": 5.495608868413994e-08, "logits/chosen": -2.1233620643615723, "logits/rejected": -2.260620355606079, "logps/chosen": -3.0760498046875, "logps/rejected": -3.080613851547241, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9197657704353333, "rewards/margins": 0.017244458198547363, "rewards/rejected": 0.9025213122367859, "step": 2694 }, { "epoch": 1.45, "learning_rate": 5.4927115451741226e-08, "logits/chosen": -2.067617893218994, "logits/rejected": -2.0739715099334717, "logps/chosen": -2.729379415512085, "logps/rejected": -4.360790729522705, "loss": 0.4199, "rewards/accuracies": 1.0, "rewards/chosen": 1.2086032629013062, "rewards/margins": 0.6504060626029968, "rewards/rejected": 0.5581972002983093, "step": 2695 }, { "epoch": 1.45, "learning_rate": 5.489814054859979e-08, "logits/chosen": -2.0416512489318848, "logits/rejected": -2.2647104263305664, "logps/chosen": -2.229526996612549, "logps/rejected": -1.5376050472259521, "loss": 0.5915, "rewards/accuracies": 1.0, "rewards/chosen": 1.0823841094970703, "rewards/margins": 0.21485918760299683, "rewards/rejected": 0.8675249218940735, "step": 2696 }, { "epoch": 1.45, "learning_rate": 5.48691639845408e-08, "logits/chosen": -2.1663873195648193, "logits/rejected": -2.165891170501709, "logps/chosen": -0.7985218167304993, "logps/rejected": -3.319434404373169, "loss": 0.469, "rewards/accuracies": 1.0, "rewards/chosen": 1.077972173690796, "rewards/margins": 0.5134785175323486, "rewards/rejected": 0.5644936561584473, "step": 2697 }, { "epoch": 1.46, "learning_rate": 5.484018576938993e-08, "logits/chosen": -2.0799453258514404, "logits/rejected": -2.2827188968658447, "logps/chosen": -0.34920164942741394, "logps/rejected": -0.3837789297103882, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.9143075942993164, "rewards/margins": 0.016897082328796387, "rewards/rejected": 0.89741051197052, "step": 2698 }, { "epoch": 1.46, "learning_rate": 5.481120591297349e-08, "logits/chosen": -2.0434868335723877, "logits/rejected": -2.2528581619262695, "logps/chosen": -2.5932281017303467, "logps/rejected": -2.6884732246398926, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 0.6808175444602966, "rewards/margins": 0.0566411018371582, "rewards/rejected": 0.6241764426231384, "step": 2699 }, { "epoch": 1.46, "learning_rate": 5.4782224425118285e-08, "logits/chosen": -2.0523436069488525, "logits/rejected": -2.0604305267333984, "logps/chosen": -1.2563107013702393, "logps/rejected": -12.567656517028809, "loss": 0.5496, "rewards/accuracies": 1.0, "rewards/chosen": 1.1015098094940186, "rewards/margins": 0.31119275093078613, "rewards/rejected": 0.7903170585632324, "step": 2700 }, { "epoch": 1.46, "learning_rate": 5.475324131565165e-08, "logits/chosen": -2.1246864795684814, "logits/rejected": -2.2744956016540527, "logps/chosen": -3.2028400897979736, "logps/rejected": -3.138141632080078, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.9914131164550781, "rewards/margins": -0.0012017488479614258, "rewards/rejected": 0.9926148653030396, "step": 2701 }, { "epoch": 1.46, "learning_rate": 5.472425659440156e-08, "logits/chosen": -2.0282957553863525, "logits/rejected": -2.2554852962493896, "logps/chosen": -2.638252019882202, "logps/rejected": -2.77555775642395, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.7941575050354004, "rewards/margins": 0.029327392578125, "rewards/rejected": 0.7648301124572754, "step": 2702 }, { "epoch": 1.46, "learning_rate": 5.469527027119646e-08, "logits/chosen": -2.153526782989502, "logits/rejected": -2.188084363937378, "logps/chosen": -2.757615804672241, "logps/rejected": -12.347579956054688, "loss": 0.4411, "rewards/accuracies": 1.0, "rewards/chosen": 1.285292625427246, "rewards/margins": 0.5899034142494202, "rewards/rejected": 0.6953892111778259, "step": 2703 }, { "epoch": 1.46, "learning_rate": 5.466628235586538e-08, "logits/chosen": -1.9565805196762085, "logits/rejected": -2.239112138748169, "logps/chosen": -1.316345453262329, "logps/rejected": -1.4950339794158936, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.9048704504966736, "rewards/margins": 0.027837395668029785, "rewards/rejected": 0.8770330548286438, "step": 2704 }, { "epoch": 1.46, "learning_rate": 5.463729285823787e-08, "logits/chosen": -2.0331640243530273, "logits/rejected": -2.038465738296509, "logps/chosen": -0.9205046892166138, "logps/rejected": -11.859750747680664, "loss": 0.5042, "rewards/accuracies": 1.0, "rewards/chosen": 1.0823537111282349, "rewards/margins": 0.4222111701965332, "rewards/rejected": 0.6601425409317017, "step": 2705 }, { "epoch": 1.46, "learning_rate": 5.4608301788143996e-08, "logits/chosen": -2.1100430488586426, "logits/rejected": -2.250673532485962, "logps/chosen": -3.605764389038086, "logps/rejected": -3.548754930496216, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.3641473948955536, "rewards/margins": 0.021615535020828247, "rewards/rejected": 0.34253185987472534, "step": 2706 }, { "epoch": 1.46, "learning_rate": 5.4579309155414424e-08, "logits/chosen": -1.9915088415145874, "logits/rejected": -2.317631721496582, "logps/chosen": -0.20621563494205475, "logps/rejected": -0.2355392724275589, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.9907991290092468, "rewards/margins": -0.010653913021087646, "rewards/rejected": 1.0014530420303345, "step": 2707 }, { "epoch": 1.46, "learning_rate": 5.4550314969880276e-08, "logits/chosen": -2.0980746746063232, "logits/rejected": -2.2359931468963623, "logps/chosen": -2.003185987472534, "logps/rejected": -0.6258931756019592, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 1.0419484376907349, "rewards/margins": 0.030401945114135742, "rewards/rejected": 1.0115464925765991, "step": 2708 }, { "epoch": 1.46, "learning_rate": 5.452131924137324e-08, "logits/chosen": -1.9998657703399658, "logits/rejected": -2.315398693084717, "logps/chosen": -3.1897385120391846, "logps/rejected": -2.953031063079834, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.46333047747612, "rewards/margins": 0.021635591983795166, "rewards/rejected": 0.44169488549232483, "step": 2709 }, { "epoch": 1.46, "learning_rate": 5.449232197972553e-08, "logits/chosen": -2.062335729598999, "logits/rejected": -2.0627896785736084, "logps/chosen": -2.1796939373016357, "logps/rejected": -1.195730447769165, "loss": 0.5866, "rewards/accuracies": 1.0, "rewards/chosen": 1.1470979452133179, "rewards/margins": 0.2258598804473877, "rewards/rejected": 0.9212380647659302, "step": 2710 }, { "epoch": 1.46, "learning_rate": 5.446332319476985e-08, "logits/chosen": -2.1499521732330322, "logits/rejected": -2.149657726287842, "logps/chosen": -1.2072452306747437, "logps/rejected": -6.045614242553711, "loss": 0.6256, "rewards/accuracies": 1.0, "rewards/chosen": 1.069330096244812, "rewards/margins": 0.14002537727355957, "rewards/rejected": 0.9293047189712524, "step": 2711 }, { "epoch": 1.46, "learning_rate": 5.4434322896339456e-08, "logits/chosen": -1.9695234298706055, "logits/rejected": -1.9730489253997803, "logps/chosen": -0.27607274055480957, "logps/rejected": -6.5451459884643555, "loss": 0.4897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8252803087234497, "rewards/margins": 0.45907557010650635, "rewards/rejected": 0.36620473861694336, "step": 2712 }, { "epoch": 1.46, "learning_rate": 5.44053210942681e-08, "logits/chosen": -2.1027655601501465, "logits/rejected": -2.2432796955108643, "logps/chosen": -4.980959415435791, "logps/rejected": -3.7174386978149414, "loss": 0.595, "rewards/accuracies": 1.0, "rewards/chosen": 1.076966643333435, "rewards/margins": 0.2070329189300537, "rewards/rejected": 0.8699337244033813, "step": 2713 }, { "epoch": 1.46, "learning_rate": 5.437631779839004e-08, "logits/chosen": -2.162026882171631, "logits/rejected": -2.162583351135254, "logps/chosen": -1.8380122184753418, "logps/rejected": -7.940104007720947, "loss": 0.5744, "rewards/accuracies": 1.0, "rewards/chosen": 1.1570743322372437, "rewards/margins": 0.2535783648490906, "rewards/rejected": 0.9034959673881531, "step": 2714 }, { "epoch": 1.46, "learning_rate": 5.434731301854005e-08, "logits/chosen": -2.2056124210357666, "logits/rejected": -2.3140854835510254, "logps/chosen": -0.4845477342605591, "logps/rejected": -0.45920971035957336, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 1.0485906600952148, "rewards/margins": 0.021367788314819336, "rewards/rejected": 1.0272228717803955, "step": 2715 }, { "epoch": 1.46, "learning_rate": 5.431830676455339e-08, "logits/chosen": -2.1597490310668945, "logits/rejected": -2.049178123474121, "logps/chosen": -28.250513076782227, "logps/rejected": -3.2926688194274902, "loss": 0.2566, "rewards/accuracies": 1.0, "rewards/chosen": 1.7525099515914917, "rewards/margins": 1.2289772033691406, "rewards/rejected": 0.5235327482223511, "step": 2716 }, { "epoch": 1.47, "learning_rate": 5.4289299046265836e-08, "logits/chosen": -1.990346074104309, "logits/rejected": -2.272242307662964, "logps/chosen": -1.3560795783996582, "logps/rejected": -4.594440460205078, "loss": 0.6061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9861940741539001, "rewards/margins": 0.18241357803344727, "rewards/rejected": 0.8037804961204529, "step": 2717 }, { "epoch": 1.47, "learning_rate": 5.426028987351368e-08, "logits/chosen": -2.099698781967163, "logits/rejected": -2.097869634628296, "logps/chosen": -5.5941033363342285, "logps/rejected": -3.240981340408325, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 1.3103001117706299, "rewards/margins": 0.6186395287513733, "rewards/rejected": 0.6916605830192566, "step": 2718 }, { "epoch": 1.47, "learning_rate": 5.423127925613367e-08, "logits/chosen": -2.063474655151367, "logits/rejected": -2.052251100540161, "logps/chosen": -13.326661109924316, "logps/rejected": -3.634608745574951, "loss": 0.3526, "rewards/accuracies": 1.0, "rewards/chosen": 1.3569778203964233, "rewards/margins": 0.8608702421188354, "rewards/rejected": 0.4961075484752655, "step": 2719 }, { "epoch": 1.47, "learning_rate": 5.4202267203963046e-08, "logits/chosen": -2.1636385917663574, "logits/rejected": -2.170147657394409, "logps/chosen": -2.0356204509735107, "logps/rejected": -3.8171348571777344, "loss": 0.4966, "rewards/accuracies": 1.0, "rewards/chosen": 0.9325793385505676, "rewards/margins": 0.44137364625930786, "rewards/rejected": 0.49120569229125977, "step": 2720 }, { "epoch": 1.47, "learning_rate": 5.4173253726839564e-08, "logits/chosen": -2.1824567317962646, "logits/rejected": -2.2795283794403076, "logps/chosen": -0.7920540571212769, "logps/rejected": -0.8066413998603821, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.9202925562858582, "rewards/margins": 0.011953890323638916, "rewards/rejected": 0.9083386659622192, "step": 2721 }, { "epoch": 1.47, "learning_rate": 5.4144238834601417e-08, "logits/chosen": -2.0970048904418945, "logits/rejected": -2.043842315673828, "logps/chosen": -29.70603370666504, "logps/rejected": -4.029993534088135, "loss": 0.308, "rewards/accuracies": 1.0, "rewards/chosen": 1.6041319370269775, "rewards/margins": 1.0196452140808105, "rewards/rejected": 0.5844866633415222, "step": 2722 }, { "epoch": 1.47, "learning_rate": 5.4115222537087356e-08, "logits/chosen": -2.089386463165283, "logits/rejected": -2.0997679233551025, "logps/chosen": -0.8111859560012817, "logps/rejected": -11.71297836303711, "loss": 0.5942, "rewards/accuracies": 1.0, "rewards/chosen": 0.9963545203208923, "rewards/margins": 0.20878660678863525, "rewards/rejected": 0.7875679135322571, "step": 2723 }, { "epoch": 1.47, "learning_rate": 5.408620484413653e-08, "logits/chosen": -2.144204616546631, "logits/rejected": -2.128024101257324, "logps/chosen": -9.922650337219238, "logps/rejected": -2.860677480697632, "loss": 0.5398, "rewards/accuracies": 1.0, "rewards/chosen": 1.0653119087219238, "rewards/margins": 0.33454495668411255, "rewards/rejected": 0.7307669520378113, "step": 2724 }, { "epoch": 1.47, "learning_rate": 5.405718576558859e-08, "logits/chosen": -2.057636022567749, "logits/rejected": -2.0562655925750732, "logps/chosen": -0.484864741563797, "logps/rejected": -3.293323278427124, "loss": 0.5196, "rewards/accuracies": 1.0, "rewards/chosen": 0.9972731471061707, "rewards/margins": 0.38377922773361206, "rewards/rejected": 0.6134939193725586, "step": 2725 }, { "epoch": 1.47, "learning_rate": 5.4028165311283644e-08, "logits/chosen": -2.194257974624634, "logits/rejected": -2.18212628364563, "logps/chosen": -11.340287208557129, "logps/rejected": -2.3480513095855713, "loss": 0.5151, "rewards/accuracies": 1.0, "rewards/chosen": 1.4529346227645874, "rewards/margins": 0.39473915100097656, "rewards/rejected": 1.0581954717636108, "step": 2726 }, { "epoch": 1.47, "learning_rate": 5.399914349106229e-08, "logits/chosen": -2.081772804260254, "logits/rejected": -2.0673420429229736, "logps/chosen": -6.319404602050781, "logps/rejected": -6.673803329467773, "loss": 0.3215, "rewards/accuracies": 1.0, "rewards/chosen": 1.297085165977478, "rewards/margins": 0.969841718673706, "rewards/rejected": 0.3272434175014496, "step": 2727 }, { "epoch": 1.47, "learning_rate": 5.397012031476561e-08, "logits/chosen": -2.0970067977905273, "logits/rejected": -2.2530229091644287, "logps/chosen": -0.3826017379760742, "logps/rejected": -0.41181236505508423, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.7632040977478027, "rewards/margins": 0.010690450668334961, "rewards/rejected": 0.7525136470794678, "step": 2728 }, { "epoch": 1.47, "learning_rate": 5.394109579223508e-08, "logits/chosen": -2.070300579071045, "logits/rejected": -2.0466229915618896, "logps/chosen": -6.400717735290527, "logps/rejected": -3.742826223373413, "loss": 0.3516, "rewards/accuracies": 1.0, "rewards/chosen": 1.4749726057052612, "rewards/margins": 0.8643686175346375, "rewards/rejected": 0.6106039881706238, "step": 2729 }, { "epoch": 1.47, "learning_rate": 5.391206993331269e-08, "logits/chosen": -1.9577536582946777, "logits/rejected": -1.9531816244125366, "logps/chosen": -4.078717231750488, "logps/rejected": -5.060736179351807, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/chosen": 1.671560287475586, "rewards/margins": 1.139087438583374, "rewards/rejected": 0.5324728488922119, "step": 2730 }, { "epoch": 1.47, "learning_rate": 5.3883042747840823e-08, "logits/chosen": -1.9616645574569702, "logits/rejected": -1.9671591520309448, "logps/chosen": -1.4386491775512695, "logps/rejected": -3.9378209114074707, "loss": 0.6441, "rewards/accuracies": 1.0, "rewards/chosen": 1.1513622999191284, "rewards/margins": 0.10055911540985107, "rewards/rejected": 1.0508031845092773, "step": 2731 }, { "epoch": 1.47, "learning_rate": 5.385401424566237e-08, "logits/chosen": -2.175846815109253, "logits/rejected": -2.1730756759643555, "logps/chosen": -3.716648817062378, "logps/rejected": -4.230395793914795, "loss": 0.2541, "rewards/accuracies": 1.0, "rewards/chosen": 1.7120531797409058, "rewards/margins": 1.24015474319458, "rewards/rejected": 0.4718984067440033, "step": 2732 }, { "epoch": 1.47, "learning_rate": 5.382498443662067e-08, "logits/chosen": -2.1177377700805664, "logits/rejected": -2.1201331615448, "logps/chosen": -0.3312857449054718, "logps/rejected": -4.786232948303223, "loss": 0.4267, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465996265411377, "rewards/margins": 0.6307339668273926, "rewards/rejected": 0.41586562991142273, "step": 2733 }, { "epoch": 1.47, "learning_rate": 5.379595333055946e-08, "logits/chosen": -2.094663143157959, "logits/rejected": -2.264407157897949, "logps/chosen": -2.149568557739258, "logps/rejected": -6.803973197937012, "loss": 0.5634, "rewards/accuracies": 1.0, "rewards/chosen": 0.724199116230011, "rewards/margins": 0.27882441878318787, "rewards/rejected": 0.4453746974468231, "step": 2734 }, { "epoch": 1.48, "learning_rate": 5.3766920937322956e-08, "logits/chosen": -2.0213463306427, "logits/rejected": -2.0123679637908936, "logps/chosen": -6.497062683105469, "logps/rejected": -14.693401336669922, "loss": 0.4647, "rewards/accuracies": 1.0, "rewards/chosen": 1.2528151273727417, "rewards/margins": 0.5250462293624878, "rewards/rejected": 0.7277688980102539, "step": 2735 }, { "epoch": 1.48, "learning_rate": 5.373788726675576e-08, "logits/chosen": -2.121032476425171, "logits/rejected": -2.1290104389190674, "logps/chosen": -3.4469587802886963, "logps/rejected": -2.4021267890930176, "loss": 0.4303, "rewards/accuracies": 1.0, "rewards/chosen": 1.2015206813812256, "rewards/margins": 0.6203548312187195, "rewards/rejected": 0.5811658501625061, "step": 2736 }, { "epoch": 1.48, "learning_rate": 5.370885232870296e-08, "logits/chosen": -2.134434938430786, "logits/rejected": -2.119234561920166, "logps/chosen": -8.267507553100586, "logps/rejected": -6.174391746520996, "loss": 0.3613, "rewards/accuracies": 1.0, "rewards/chosen": 1.4138574600219727, "rewards/margins": 0.8320935368537903, "rewards/rejected": 0.5817639231681824, "step": 2737 }, { "epoch": 1.48, "learning_rate": 5.367981613301008e-08, "logits/chosen": -2.197481393814087, "logits/rejected": -2.090604066848755, "logps/chosen": -20.75800132751465, "logps/rejected": -2.566474676132202, "loss": 0.2596, "rewards/accuracies": 1.0, "rewards/chosen": 1.9177942276000977, "rewards/margins": 1.2160766124725342, "rewards/rejected": 0.7017176747322083, "step": 2738 }, { "epoch": 1.48, "learning_rate": 5.365077868952301e-08, "logits/chosen": -2.0590829849243164, "logits/rejected": -2.219771385192871, "logps/chosen": -4.174693584442139, "logps/rejected": -3.359504222869873, "loss": 0.7472, "rewards/accuracies": 0.0, "rewards/chosen": 0.8264898657798767, "rewards/margins": -0.10525625944137573, "rewards/rejected": 0.9317461252212524, "step": 2739 }, { "epoch": 1.48, "learning_rate": 5.3621740008088124e-08, "logits/chosen": -2.0436418056488037, "logits/rejected": -2.0522356033325195, "logps/chosen": -1.3638426065444946, "logps/rejected": -3.289660930633545, "loss": 0.4745, "rewards/accuracies": 1.0, "rewards/chosen": 1.0491337776184082, "rewards/margins": 0.4988359808921814, "rewards/rejected": 0.5502977967262268, "step": 2740 }, { "epoch": 1.48, "learning_rate": 5.359270009855216e-08, "logits/chosen": -2.004767894744873, "logits/rejected": -2.293818712234497, "logps/chosen": -3.0194051265716553, "logps/rejected": -3.445922613143921, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9592176675796509, "rewards/margins": 0.022120654582977295, "rewards/rejected": 0.9370970129966736, "step": 2741 }, { "epoch": 1.48, "learning_rate": 5.356365897076234e-08, "logits/chosen": -1.9244858026504517, "logits/rejected": -1.9310373067855835, "logps/chosen": -3.051064968109131, "logps/rejected": -4.04176139831543, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.00662362575531, "rewards/margins": 0.3608510494232178, "rewards/rejected": 0.6457725763320923, "step": 2742 }, { "epoch": 1.48, "learning_rate": 5.3534616634566246e-08, "logits/chosen": -2.0280325412750244, "logits/rejected": -2.2710931301116943, "logps/chosen": -12.970867156982422, "logps/rejected": -8.126943588256836, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.7849845886230469, "rewards/margins": 0.046853601932525635, "rewards/rejected": 0.7381309866905212, "step": 2743 }, { "epoch": 1.48, "learning_rate": 5.350557309981188e-08, "logits/chosen": -2.036529541015625, "logits/rejected": -2.324408769607544, "logps/chosen": -0.6933960914611816, "logps/rejected": -0.6408304572105408, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 1.08745539188385, "rewards/margins": 0.012354135513305664, "rewards/rejected": 1.0751012563705444, "step": 2744 }, { "epoch": 1.48, "learning_rate": 5.347652837634765e-08, "logits/chosen": -2.1265709400177, "logits/rejected": -2.1217329502105713, "logps/chosen": -2.5018820762634277, "logps/rejected": -3.6520819664001465, "loss": 0.8156, "rewards/accuracies": 0.0, "rewards/chosen": 0.7639939188957214, "rewards/margins": -0.23146510124206543, "rewards/rejected": 0.9954590201377869, "step": 2745 }, { "epoch": 1.48, "learning_rate": 5.3447482474022385e-08, "logits/chosen": -1.9384536743164062, "logits/rejected": -1.9371166229248047, "logps/chosen": -1.4588245153427124, "logps/rejected": -1.2142040729522705, "loss": 0.7128, "rewards/accuracies": 0.0, "rewards/chosen": 0.9403504729270935, "rewards/margins": -0.03894871473312378, "rewards/rejected": 0.9792991876602173, "step": 2746 }, { "epoch": 1.48, "learning_rate": 5.341843540268531e-08, "logits/chosen": -2.139631748199463, "logits/rejected": -2.256460666656494, "logps/chosen": -0.3322914242744446, "logps/rejected": -7.169809818267822, "loss": 0.585, "rewards/accuracies": 1.0, "rewards/chosen": 0.9430352449417114, "rewards/margins": 0.22932666540145874, "rewards/rejected": 0.7137085795402527, "step": 2747 }, { "epoch": 1.48, "learning_rate": 5.338938717218604e-08, "logits/chosen": -2.1465771198272705, "logits/rejected": -2.149416208267212, "logps/chosen": -2.689220428466797, "logps/rejected": -4.422988414764404, "loss": 0.3677, "rewards/accuracies": 1.0, "rewards/chosen": 1.2589012384414673, "rewards/margins": 0.8108541965484619, "rewards/rejected": 0.448047012090683, "step": 2748 }, { "epoch": 1.48, "learning_rate": 5.3360337792374554e-08, "logits/chosen": -2.1933863162994385, "logits/rejected": -2.2033627033233643, "logps/chosen": -1.9214473962783813, "logps/rejected": -2.875305414199829, "loss": 0.4722, "rewards/accuracies": 1.0, "rewards/chosen": 1.3081715106964111, "rewards/margins": 0.5049498677253723, "rewards/rejected": 0.8032216429710388, "step": 2749 }, { "epoch": 1.48, "learning_rate": 5.333128727310127e-08, "logits/chosen": -2.1122872829437256, "logits/rejected": -2.353790283203125, "logps/chosen": -0.6814630627632141, "logps/rejected": -0.7219480276107788, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 1.0282210111618042, "rewards/margins": 0.020740866661071777, "rewards/rejected": 1.0074801445007324, "step": 2750 }, { "epoch": 1.48, "learning_rate": 5.330223562421695e-08, "logits/chosen": -2.1172282695770264, "logits/rejected": -2.12148118019104, "logps/chosen": -2.6746933460235596, "logps/rejected": -4.333446502685547, "loss": 0.4723, "rewards/accuracies": 1.0, "rewards/chosen": 1.0024936199188232, "rewards/margins": 0.5048463344573975, "rewards/rejected": 0.4976472854614258, "step": 2751 }, { "epoch": 1.48, "learning_rate": 5.3273182855572795e-08, "logits/chosen": -1.9849238395690918, "logits/rejected": -1.9949703216552734, "logps/chosen": -3.3237502574920654, "logps/rejected": -2.9313995838165283, "loss": 0.4438, "rewards/accuracies": 1.0, "rewards/chosen": 1.2028287649154663, "rewards/margins": 0.5823447704315186, "rewards/rejected": 0.6204839944839478, "step": 2752 }, { "epoch": 1.48, "learning_rate": 5.324412897702033e-08, "logits/chosen": -2.011361598968506, "logits/rejected": -1.9936038255691528, "logps/chosen": -3.7252371311187744, "logps/rejected": -5.821366310119629, "loss": 0.4552, "rewards/accuracies": 1.0, "rewards/chosen": 1.115739107131958, "rewards/margins": 0.5506830811500549, "rewards/rejected": 0.5650560259819031, "step": 2753 }, { "epoch": 1.49, "learning_rate": 5.3215073998411477e-08, "logits/chosen": -1.9243121147155762, "logits/rejected": -1.9601290225982666, "logps/chosen": -0.9937689900398254, "logps/rejected": -13.320636749267578, "loss": 0.5428, "rewards/accuracies": 1.0, "rewards/chosen": 1.043656349182129, "rewards/margins": 0.32742196321487427, "rewards/rejected": 0.7162343859672546, "step": 2754 }, { "epoch": 1.49, "learning_rate": 5.3186017929598516e-08, "logits/chosen": -2.1423938274383545, "logits/rejected": -2.1485800743103027, "logps/chosen": -3.4216232299804688, "logps/rejected": -9.363802909851074, "loss": 0.3369, "rewards/accuracies": 1.0, "rewards/chosen": 1.2859840393066406, "rewards/margins": 0.9146363139152527, "rewards/rejected": 0.37134772539138794, "step": 2755 }, { "epoch": 1.49, "learning_rate": 5.315696078043413e-08, "logits/chosen": -2.121062994003296, "logits/rejected": -2.1486380100250244, "logps/chosen": -9.56021499633789, "logps/rejected": -15.292330741882324, "loss": 0.5809, "rewards/accuracies": 1.0, "rewards/chosen": 1.1994673013687134, "rewards/margins": 0.2387351393699646, "rewards/rejected": 0.9607321619987488, "step": 2756 }, { "epoch": 1.49, "learning_rate": 5.312790256077135e-08, "logits/chosen": -1.987301230430603, "logits/rejected": -2.2972869873046875, "logps/chosen": -0.9323521852493286, "logps/rejected": -0.804466724395752, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.9582908749580383, "rewards/margins": 0.03696095943450928, "rewards/rejected": 0.921329915523529, "step": 2757 }, { "epoch": 1.49, "learning_rate": 5.309884328046358e-08, "logits/chosen": -2.0754921436309814, "logits/rejected": -2.3113741874694824, "logps/chosen": -3.5473742485046387, "logps/rejected": -3.432128429412842, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.931225597858429, "rewards/margins": -0.008372128009796143, "rewards/rejected": 0.9395977258682251, "step": 2758 }, { "epoch": 1.49, "learning_rate": 5.306978294936455e-08, "logits/chosen": -2.2814791202545166, "logits/rejected": -2.290027379989624, "logps/chosen": -2.7525479793548584, "logps/rejected": -4.531561374664307, "loss": 0.3435, "rewards/accuracies": 1.0, "rewards/chosen": 1.520249605178833, "rewards/margins": 0.891891360282898, "rewards/rejected": 0.6283582448959351, "step": 2759 }, { "epoch": 1.49, "learning_rate": 5.3040721577328365e-08, "logits/chosen": -2.0343642234802246, "logits/rejected": -2.0276453495025635, "logps/chosen": -2.984740972518921, "logps/rejected": -5.7795586585998535, "loss": 0.5001, "rewards/accuracies": 1.0, "rewards/chosen": 0.8464820981025696, "rewards/margins": 0.43237119913101196, "rewards/rejected": 0.4141108989715576, "step": 2760 }, { "epoch": 1.49, "learning_rate": 5.301165917420951e-08, "logits/chosen": -2.08229398727417, "logits/rejected": -2.0492441654205322, "logps/chosen": -11.642702102661133, "logps/rejected": -3.6110005378723145, "loss": 0.339, "rewards/accuracies": 1.0, "rewards/chosen": 1.6173094511032104, "rewards/margins": 0.9074984788894653, "rewards/rejected": 0.7098109722137451, "step": 2761 }, { "epoch": 1.49, "learning_rate": 5.29825957498628e-08, "logits/chosen": -2.1629350185394287, "logits/rejected": -2.2908852100372314, "logps/chosen": -5.568350791931152, "logps/rejected": -1.5285543203353882, "loss": 0.7477, "rewards/accuracies": 0.0, "rewards/chosen": 0.9271069765090942, "rewards/margins": -0.10631763935089111, "rewards/rejected": 1.0334246158599854, "step": 2762 }, { "epoch": 1.49, "learning_rate": 5.2953531314143376e-08, "logits/chosen": -2.0502090454101562, "logits/rejected": -2.2745273113250732, "logps/chosen": -2.524383068084717, "logps/rejected": -2.434814929962158, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8506776690483093, "rewards/margins": 0.016387939453125, "rewards/rejected": 0.8342897295951843, "step": 2763 }, { "epoch": 1.49, "learning_rate": 5.2924465876906744e-08, "logits/chosen": -2.0131993293762207, "logits/rejected": -2.347712278366089, "logps/chosen": -0.47906073927879333, "logps/rejected": -28.78375244140625, "loss": 0.6399, "rewards/accuracies": 1.0, "rewards/chosen": 1.0544908046722412, "rewards/margins": 0.10958534479141235, "rewards/rejected": 0.9449054598808289, "step": 2764 }, { "epoch": 1.49, "learning_rate": 5.289539944800874e-08, "logits/chosen": -2.1351542472839355, "logits/rejected": -2.2035248279571533, "logps/chosen": -0.6732643246650696, "logps/rejected": -0.72446209192276, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.9509660601615906, "rewards/margins": 0.02111995220184326, "rewards/rejected": 0.9298461079597473, "step": 2765 }, { "epoch": 1.49, "learning_rate": 5.286633203730556e-08, "logits/chosen": -2.0426836013793945, "logits/rejected": -2.0708675384521484, "logps/chosen": -4.790533065795898, "logps/rejected": -4.283183574676514, "loss": 0.6084, "rewards/accuracies": 1.0, "rewards/chosen": 1.112443208694458, "rewards/margins": 0.17733824253082275, "rewards/rejected": 0.9351049661636353, "step": 2766 }, { "epoch": 1.49, "learning_rate": 5.2837263654653706e-08, "logits/chosen": -2.0546669960021973, "logits/rejected": -2.06081223487854, "logps/chosen": -6.7039361000061035, "logps/rejected": -0.8186031579971313, "loss": 0.5006, "rewards/accuracies": 1.0, "rewards/chosen": 1.2489761114120483, "rewards/margins": 0.4311848282814026, "rewards/rejected": 0.8177912831306458, "step": 2767 }, { "epoch": 1.49, "learning_rate": 5.280819430991e-08, "logits/chosen": -2.0822885036468506, "logits/rejected": -2.0828864574432373, "logps/chosen": -2.569495916366577, "logps/rejected": -0.48089396953582764, "loss": 0.4125, "rewards/accuracies": 1.0, "rewards/chosen": 1.442595362663269, "rewards/margins": 0.6720521450042725, "rewards/rejected": 0.7705432176589966, "step": 2768 }, { "epoch": 1.49, "learning_rate": 5.277912401293163e-08, "logits/chosen": -2.1205008029937744, "logits/rejected": -2.119640350341797, "logps/chosen": -1.1519182920455933, "logps/rejected": -1.1405805349349976, "loss": 0.6694, "rewards/accuracies": 1.0, "rewards/chosen": 0.8734486699104309, "rewards/margins": 0.04814118146896362, "rewards/rejected": 0.8253074884414673, "step": 2769 }, { "epoch": 1.49, "learning_rate": 5.275005277357608e-08, "logits/chosen": -2.0898776054382324, "logits/rejected": -2.0772767066955566, "logps/chosen": -6.843939781188965, "logps/rejected": -2.666116237640381, "loss": 0.4654, "rewards/accuracies": 1.0, "rewards/chosen": 1.1368680000305176, "rewards/margins": 0.5231930017471313, "rewards/rejected": 0.6136749982833862, "step": 2770 }, { "epoch": 1.49, "learning_rate": 5.2720980601701156e-08, "logits/chosen": -2.004197835922241, "logits/rejected": -2.0140864849090576, "logps/chosen": -1.7222541570663452, "logps/rejected": -2.011418104171753, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.1663230657577515, "rewards/margins": 0.5010493993759155, "rewards/rejected": 0.6652736663818359, "step": 2771 }, { "epoch": 1.5, "learning_rate": 5.269190750716499e-08, "logits/chosen": -2.1103029251098633, "logits/rejected": -2.368126153945923, "logps/chosen": -0.23489107191562653, "logps/rejected": -0.22239071130752563, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.7667422294616699, "rewards/margins": 0.010271012783050537, "rewards/rejected": 0.7564712166786194, "step": 2772 }, { "epoch": 1.5, "learning_rate": 5.266283349982601e-08, "logits/chosen": -2.0162787437438965, "logits/rejected": -2.0209150314331055, "logps/chosen": -3.145150661468506, "logps/rejected": -3.0409862995147705, "loss": 0.4236, "rewards/accuracies": 1.0, "rewards/chosen": 1.3356603384017944, "rewards/margins": 0.6397395730018616, "rewards/rejected": 0.6959207653999329, "step": 2773 }, { "epoch": 1.5, "learning_rate": 5.2633758589542954e-08, "logits/chosen": -2.104012966156006, "logits/rejected": -2.073880910873413, "logps/chosen": -11.808748245239258, "logps/rejected": -2.043414831161499, "loss": 0.3119, "rewards/accuracies": 1.0, "rewards/chosen": 1.6788469552993774, "rewards/margins": 1.0050365924835205, "rewards/rejected": 0.6738103032112122, "step": 2774 }, { "epoch": 1.5, "learning_rate": 5.260468278617489e-08, "logits/chosen": -1.9876987934112549, "logits/rejected": -2.269319534301758, "logps/chosen": -0.7036405801773071, "logps/rejected": -0.5481255054473877, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.7740968465805054, "rewards/margins": -0.00644993782043457, "rewards/rejected": 0.7805467844009399, "step": 2775 }, { "epoch": 1.5, "learning_rate": 5.257560609958119e-08, "logits/chosen": -2.1469578742980957, "logits/rejected": -2.150165319442749, "logps/chosen": -0.3817618489265442, "logps/rejected": -4.757373332977295, "loss": 0.4812, "rewards/accuracies": 1.0, "rewards/chosen": 0.839508056640625, "rewards/margins": 0.48111870884895325, "rewards/rejected": 0.35838934779167175, "step": 2776 }, { "epoch": 1.5, "learning_rate": 5.254652853962148e-08, "logits/chosen": -1.9366536140441895, "logits/rejected": -1.9372248649597168, "logps/chosen": -1.6652252674102783, "logps/rejected": -5.58349609375, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 1.0530518293380737, "rewards/margins": 0.3579758405685425, "rewards/rejected": 0.6950759887695312, "step": 2777 }, { "epoch": 1.5, "learning_rate": 5.2517450116155737e-08, "logits/chosen": -1.9299578666687012, "logits/rejected": -2.240194082260132, "logps/chosen": -3.035308361053467, "logps/rejected": -6.230650901794434, "loss": 0.6418, "rewards/accuracies": 1.0, "rewards/chosen": 0.8628620505332947, "rewards/margins": 0.10540038347244263, "rewards/rejected": 0.757461667060852, "step": 2778 }, { "epoch": 1.5, "learning_rate": 5.2488370839044175e-08, "logits/chosen": -1.9990253448486328, "logits/rejected": -1.9998730421066284, "logps/chosen": -1.3120484352111816, "logps/rejected": -2.6746203899383545, "loss": 0.4941, "rewards/accuracies": 1.0, "rewards/chosen": 1.125565528869629, "rewards/margins": 0.4476948380470276, "rewards/rejected": 0.6778706908226013, "step": 2779 }, { "epoch": 1.5, "learning_rate": 5.245929071814734e-08, "logits/chosen": -2.1608452796936035, "logits/rejected": -2.2752292156219482, "logps/chosen": -3.071176767349243, "logps/rejected": -2.8139145374298096, "loss": 0.6912, "rewards/accuracies": 1.0, "rewards/chosen": 0.5899500250816345, "rewards/margins": 0.003947734832763672, "rewards/rejected": 0.5860022902488708, "step": 2780 }, { "epoch": 1.5, "learning_rate": 5.243020976332607e-08, "logits/chosen": -2.090573787689209, "logits/rejected": -2.3177616596221924, "logps/chosen": -1.4117412567138672, "logps/rejected": -1.2737830877304077, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.9253412485122681, "rewards/margins": 0.0071811676025390625, "rewards/rejected": 0.918160080909729, "step": 2781 }, { "epoch": 1.5, "learning_rate": 5.240112798444146e-08, "logits/chosen": -2.1468822956085205, "logits/rejected": -2.113201856613159, "logps/chosen": -21.76400375366211, "logps/rejected": -3.5211076736450195, "loss": 0.2322, "rewards/accuracies": 1.0, "rewards/chosen": 1.9164257049560547, "rewards/margins": 1.3416728973388672, "rewards/rejected": 0.5747527480125427, "step": 2782 }, { "epoch": 1.5, "learning_rate": 5.237204539135489e-08, "logits/chosen": -2.140599012374878, "logits/rejected": -2.1328227519989014, "logps/chosen": -3.3687148094177246, "logps/rejected": -8.960165023803711, "loss": 0.3543, "rewards/accuracies": 1.0, "rewards/chosen": 1.310128092765808, "rewards/margins": 0.8552464246749878, "rewards/rejected": 0.4548816680908203, "step": 2783 }, { "epoch": 1.5, "learning_rate": 5.2342961993927993e-08, "logits/chosen": -2.2315609455108643, "logits/rejected": -2.2318034172058105, "logps/chosen": -0.42867547273635864, "logps/rejected": -3.1059868335723877, "loss": 0.526, "rewards/accuracies": 1.0, "rewards/chosen": 0.7339205741882324, "rewards/margins": 0.3679443895816803, "rewards/rejected": 0.3659761846065521, "step": 2784 }, { "epoch": 1.5, "learning_rate": 5.2313877802022746e-08, "logits/chosen": -2.1221892833709717, "logits/rejected": -2.3567137718200684, "logps/chosen": -0.5151874423027039, "logps/rejected": -0.6016740202903748, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.8295653462409973, "rewards/margins": 0.03709679841995239, "rewards/rejected": 0.7924685478210449, "step": 2785 }, { "epoch": 1.5, "learning_rate": 5.2284792825501325e-08, "logits/chosen": -2.1254749298095703, "logits/rejected": -2.1401145458221436, "logps/chosen": -0.5550123453140259, "logps/rejected": -8.994681358337402, "loss": 0.4731, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575991034507751, "rewards/margins": 0.5026034712791443, "rewards/rejected": 0.45499563217163086, "step": 2786 }, { "epoch": 1.5, "learning_rate": 5.2255707074226196e-08, "logits/chosen": -2.1258177757263184, "logits/rejected": -2.117755889892578, "logps/chosen": -2.807643175125122, "logps/rejected": -2.448411226272583, "loss": 0.4325, "rewards/accuracies": 1.0, "rewards/chosen": 1.3125768899917603, "rewards/margins": 0.6140925288200378, "rewards/rejected": 0.6984843611717224, "step": 2787 }, { "epoch": 1.5, "learning_rate": 5.2226620558060087e-08, "logits/chosen": -1.9408502578735352, "logits/rejected": -2.235499620437622, "logps/chosen": -0.4326072931289673, "logps/rejected": -0.43501371145248413, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.8417636752128601, "rewards/margins": 0.03695476055145264, "rewards/rejected": 0.8048089146614075, "step": 2788 }, { "epoch": 1.5, "learning_rate": 5.2197533286865994e-08, "logits/chosen": -2.0561904907226562, "logits/rejected": -2.0543839931488037, "logps/chosen": -0.6391075849533081, "logps/rejected": -4.737704277038574, "loss": 0.5573, "rewards/accuracies": 1.0, "rewards/chosen": 1.012736201286316, "rewards/margins": 0.29301780462265015, "rewards/rejected": 0.7197183966636658, "step": 2789 }, { "epoch": 1.5, "learning_rate": 5.2168445270507165e-08, "logits/chosen": -1.9970829486846924, "logits/rejected": -1.9984391927719116, "logps/chosen": -0.2817956209182739, "logps/rejected": -4.845858097076416, "loss": 0.5189, "rewards/accuracies": 1.0, "rewards/chosen": 0.8864637613296509, "rewards/margins": 0.3855230212211609, "rewards/rejected": 0.50094074010849, "step": 2790 }, { "epoch": 1.51, "learning_rate": 5.21393565188471e-08, "logits/chosen": -2.0119903087615967, "logits/rejected": -2.00140643119812, "logps/chosen": -6.470396041870117, "logps/rejected": -3.8714489936828613, "loss": 0.5716, "rewards/accuracies": 1.0, "rewards/chosen": 1.1202481985092163, "rewards/margins": 0.2598292827606201, "rewards/rejected": 0.8604189157485962, "step": 2791 }, { "epoch": 1.51, "learning_rate": 5.2110267041749536e-08, "logits/chosen": -2.0300076007843018, "logits/rejected": -2.294066905975342, "logps/chosen": -4.274540424346924, "logps/rejected": -7.24810791015625, "loss": 0.6198, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810849905014038, "rewards/margins": 0.1524209976196289, "rewards/rejected": 1.028663992881775, "step": 2792 }, { "epoch": 1.51, "learning_rate": 5.208117684907846e-08, "logits/chosen": -1.9589184522628784, "logits/rejected": -1.9623486995697021, "logps/chosen": -3.3538641929626465, "logps/rejected": -6.906434059143066, "loss": 0.3871, "rewards/accuracies": 1.0, "rewards/chosen": 1.3553112745285034, "rewards/margins": 0.7492277026176453, "rewards/rejected": 0.6060835719108582, "step": 2793 }, { "epoch": 1.51, "learning_rate": 5.20520859506981e-08, "logits/chosen": -2.0778698921203613, "logits/rejected": -2.07759952545166, "logps/chosen": -0.7518578171730042, "logps/rejected": -4.647637367248535, "loss": 0.5501, "rewards/accuracies": 1.0, "rewards/chosen": 0.8776710629463196, "rewards/margins": 0.3101275563240051, "rewards/rejected": 0.5675435066223145, "step": 2794 }, { "epoch": 1.51, "learning_rate": 5.202299435647297e-08, "logits/chosen": -2.0244662761688232, "logits/rejected": -2.236952066421509, "logps/chosen": -1.6386470794677734, "logps/rejected": -1.8653209209442139, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8283466696739197, "rewards/margins": 0.02255183458328247, "rewards/rejected": 0.8057948350906372, "step": 2795 }, { "epoch": 1.51, "learning_rate": 5.199390207626776e-08, "logits/chosen": -2.091740608215332, "logits/rejected": -2.098227024078369, "logps/chosen": -2.0388317108154297, "logps/rejected": -3.5402095317840576, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1349289417266846, "rewards/margins": 0.4919363260269165, "rewards/rejected": 0.6429926156997681, "step": 2796 }, { "epoch": 1.51, "learning_rate": 5.196480911994741e-08, "logits/chosen": -2.040379762649536, "logits/rejected": -2.263148069381714, "logps/chosen": -1.3732194900512695, "logps/rejected": -1.3113058805465698, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.9333477020263672, "rewards/margins": 0.015336394309997559, "rewards/rejected": 0.9180113077163696, "step": 2797 }, { "epoch": 1.51, "learning_rate": 5.193571549737707e-08, "logits/chosen": -2.0421440601348877, "logits/rejected": -2.2776482105255127, "logps/chosen": -1.6313326358795166, "logps/rejected": -1.9064366817474365, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 1.0371543169021606, "rewards/margins": 0.017171621322631836, "rewards/rejected": 1.0199826955795288, "step": 2798 }, { "epoch": 1.51, "learning_rate": 5.1906621218422174e-08, "logits/chosen": -2.050971746444702, "logits/rejected": -2.3049824237823486, "logps/chosen": -4.9533772468566895, "logps/rejected": -4.7380146980285645, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 1.0977996587753296, "rewards/margins": -0.017139434814453125, "rewards/rejected": 1.1149390935897827, "step": 2799 }, { "epoch": 1.51, "learning_rate": 5.1877526292948334e-08, "logits/chosen": -2.1067187786102295, "logits/rejected": -2.3592641353607178, "logps/chosen": -1.047437071800232, "logps/rejected": -25.865217208862305, "loss": 0.4526, "rewards/accuracies": 1.0, "rewards/chosen": 1.8529218435287476, "rewards/margins": 0.5579104423522949, "rewards/rejected": 1.2950114011764526, "step": 2800 }, { "epoch": 1.51, "learning_rate": 5.184843073082139e-08, "logits/chosen": -2.0863637924194336, "logits/rejected": -2.337841272354126, "logps/chosen": -0.9510735869407654, "logps/rejected": -1.058212161064148, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 0.8920246958732605, "rewards/margins": 0.044770002365112305, "rewards/rejected": 0.8472546935081482, "step": 2801 }, { "epoch": 1.51, "learning_rate": 5.18193345419074e-08, "logits/chosen": -2.154550313949585, "logits/rejected": -2.295306921005249, "logps/chosen": -0.5624277591705322, "logps/rejected": -0.5997292399406433, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.9825344085693359, "rewards/margins": 0.0029081106185913086, "rewards/rejected": 0.9796262979507446, "step": 2802 }, { "epoch": 1.51, "learning_rate": 5.1790237736072606e-08, "logits/chosen": -2.20932936668396, "logits/rejected": -2.147995948791504, "logps/chosen": -30.621356964111328, "logps/rejected": -5.884920597076416, "loss": 0.2622, "rewards/accuracies": 1.0, "rewards/chosen": 1.787927269935608, "rewards/margins": 1.2046408653259277, "rewards/rejected": 0.5832863450050354, "step": 2803 }, { "epoch": 1.51, "learning_rate": 5.1761140323183516e-08, "logits/chosen": -1.9798781871795654, "logits/rejected": -2.2331044673919678, "logps/chosen": -2.8673410415649414, "logps/rejected": -2.8377161026000977, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.9607640504837036, "rewards/margins": 0.016038715839385986, "rewards/rejected": 0.9447253346443176, "step": 2804 }, { "epoch": 1.51, "learning_rate": 5.173204231310683e-08, "logits/chosen": -2.161848306655884, "logits/rejected": -2.1585330963134766, "logps/chosen": -3.9103362560272217, "logps/rejected": -4.213726997375488, "loss": 0.3084, "rewards/accuracies": 1.0, "rewards/chosen": 1.5013759136199951, "rewards/margins": 1.01802659034729, "rewards/rejected": 0.4833493232727051, "step": 2805 }, { "epoch": 1.51, "learning_rate": 5.170294371570939e-08, "logits/chosen": -2.119992971420288, "logits/rejected": -2.1174654960632324, "logps/chosen": -1.0664749145507812, "logps/rejected": -5.495197296142578, "loss": 0.5515, "rewards/accuracies": 1.0, "rewards/chosen": 1.0430400371551514, "rewards/margins": 0.306646466255188, "rewards/rejected": 0.7363935708999634, "step": 2806 }, { "epoch": 1.51, "learning_rate": 5.167384454085831e-08, "logits/chosen": -2.0089776515960693, "logits/rejected": -2.233715057373047, "logps/chosen": -0.539940357208252, "logps/rejected": -0.546835720539093, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 0.9978527426719666, "rewards/margins": 0.054444730281829834, "rewards/rejected": 0.9434080123901367, "step": 2807 }, { "epoch": 1.51, "learning_rate": 5.164474479842085e-08, "logits/chosen": -2.095283031463623, "logits/rejected": -2.265637159347534, "logps/chosen": -1.1399970054626465, "logps/rejected": -6.420668125152588, "loss": 0.559, "rewards/accuracies": 1.0, "rewards/chosen": 1.0135456323623657, "rewards/margins": 0.289196252822876, "rewards/rejected": 0.7243493795394897, "step": 2808 }, { "epoch": 1.52, "learning_rate": 5.16156444982645e-08, "logits/chosen": -2.1826558113098145, "logits/rejected": -2.182990312576294, "logps/chosen": -0.6768369078636169, "logps/rejected": -2.2753593921661377, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 1.0748405456542969, "rewards/margins": 0.3328559994697571, "rewards/rejected": 0.7419845461845398, "step": 2809 }, { "epoch": 1.52, "learning_rate": 5.158654365025693e-08, "logits/chosen": -1.9823079109191895, "logits/rejected": -1.9610259532928467, "logps/chosen": -16.379016876220703, "logps/rejected": -2.270390510559082, "loss": 0.2803, "rewards/accuracies": 1.0, "rewards/chosen": 1.8370052576065063, "rewards/margins": 1.1286559104919434, "rewards/rejected": 0.7083494067192078, "step": 2810 }, { "epoch": 1.52, "learning_rate": 5.155744226426598e-08, "logits/chosen": -2.0371131896972656, "logits/rejected": -2.2356085777282715, "logps/chosen": -3.125074863433838, "logps/rejected": -3.0459933280944824, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.8390981554985046, "rewards/margins": 0.032181501388549805, "rewards/rejected": 0.8069166541099548, "step": 2811 }, { "epoch": 1.52, "learning_rate": 5.152834035015966e-08, "logits/chosen": -2.108311414718628, "logits/rejected": -2.1069421768188477, "logps/chosen": -0.8919410705566406, "logps/rejected": -1.7268266677856445, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": 0.9556344151496887, "rewards/margins": 0.07940149307250977, "rewards/rejected": 0.876232922077179, "step": 2812 }, { "epoch": 1.52, "learning_rate": 5.1499237917806226e-08, "logits/chosen": -2.089895486831665, "logits/rejected": -2.079820394515991, "logps/chosen": -1.7546952962875366, "logps/rejected": -11.574776649475098, "loss": 0.3281, "rewards/accuracies": 1.0, "rewards/chosen": 1.3961730003356934, "rewards/margins": 0.9459301233291626, "rewards/rejected": 0.45024290680885315, "step": 2813 }, { "epoch": 1.52, "learning_rate": 5.147013497707402e-08, "logits/chosen": -2.0096089839935303, "logits/rejected": -2.2084877490997314, "logps/chosen": -1.7471688985824585, "logps/rejected": -1.5354154109954834, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.7236503958702087, "rewards/margins": -0.002261519432067871, "rewards/rejected": 0.7259119153022766, "step": 2814 }, { "epoch": 1.52, "learning_rate": 5.144103153783163e-08, "logits/chosen": -2.1372363567352295, "logits/rejected": -2.318441390991211, "logps/chosen": -1.9329277276992798, "logps/rejected": -7.470907688140869, "loss": 0.6308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1900442838668823, "rewards/margins": 0.12891626358032227, "rewards/rejected": 1.06112802028656, "step": 2815 }, { "epoch": 1.52, "learning_rate": 5.141192760994776e-08, "logits/chosen": -2.0807547569274902, "logits/rejected": -2.2736902236938477, "logps/chosen": -4.924752712249756, "logps/rejected": -4.74919319152832, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 1.000597596168518, "rewards/margins": 0.004887700080871582, "rewards/rejected": 0.9957098960876465, "step": 2816 }, { "epoch": 1.52, "learning_rate": 5.138282320329131e-08, "logits/chosen": -2.1267709732055664, "logits/rejected": -2.2842581272125244, "logps/chosen": -0.6992244124412537, "logps/rejected": -0.7185564041137695, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.7357794046401978, "rewards/margins": 0.02401787042617798, "rewards/rejected": 0.7117615342140198, "step": 2817 }, { "epoch": 1.52, "learning_rate": 5.1353718327731355e-08, "logits/chosen": -2.0578112602233887, "logits/rejected": -2.3522870540618896, "logps/chosen": -0.13446977734565735, "logps/rejected": -0.1381940245628357, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.9318774342536926, "rewards/margins": -0.009891867637634277, "rewards/rejected": 0.9417693018913269, "step": 2818 }, { "epoch": 1.52, "learning_rate": 5.1324612993137085e-08, "logits/chosen": -2.0203657150268555, "logits/rejected": -2.237818956375122, "logps/chosen": -8.984369277954102, "logps/rejected": -9.149023056030273, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.4460383355617523, "rewards/margins": 0.025453269481658936, "rewards/rejected": 0.4205850660800934, "step": 2819 }, { "epoch": 1.52, "learning_rate": 5.129550720937787e-08, "logits/chosen": -2.036888599395752, "logits/rejected": -2.039018154144287, "logps/chosen": -4.323459148406982, "logps/rejected": -3.10158109664917, "loss": 0.3298, "rewards/accuracies": 1.0, "rewards/chosen": 1.5995746850967407, "rewards/margins": 0.9399317502975464, "rewards/rejected": 0.6596429347991943, "step": 2820 }, { "epoch": 1.52, "learning_rate": 5.1266400986323266e-08, "logits/chosen": -2.0651509761810303, "logits/rejected": -2.054281234741211, "logps/chosen": -0.3593592047691345, "logps/rejected": -6.446319580078125, "loss": 0.4628, "rewards/accuracies": 1.0, "rewards/chosen": 0.9230994582176208, "rewards/margins": 0.5300740003585815, "rewards/rejected": 0.3930254876613617, "step": 2821 }, { "epoch": 1.52, "learning_rate": 5.1237294333842894e-08, "logits/chosen": -2.042910099029541, "logits/rejected": -2.039818048477173, "logps/chosen": -2.3111720085144043, "logps/rejected": -4.949294090270996, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 1.2414076328277588, "rewards/margins": 0.6350660920143127, "rewards/rejected": 0.606341540813446, "step": 2822 }, { "epoch": 1.52, "learning_rate": 5.1208187261806616e-08, "logits/chosen": -2.073503255844116, "logits/rejected": -2.309622287750244, "logps/chosen": -7.202873229980469, "logps/rejected": -1.7314423322677612, "loss": 0.7902, "rewards/accuracies": 0.0, "rewards/chosen": 0.663882851600647, "rewards/margins": -0.18543356657028198, "rewards/rejected": 0.849316418170929, "step": 2823 }, { "epoch": 1.52, "learning_rate": 5.1179079780084356e-08, "logits/chosen": -2.086181163787842, "logits/rejected": -2.116724729537964, "logps/chosen": -3.329888343811035, "logps/rejected": -5.230093955993652, "loss": 0.3865, "rewards/accuracies": 1.0, "rewards/chosen": 1.6139262914657593, "rewards/margins": 0.7510141730308533, "rewards/rejected": 0.862912118434906, "step": 2824 }, { "epoch": 1.52, "learning_rate": 5.114997189854624e-08, "logits/chosen": -2.1138198375701904, "logits/rejected": -2.115049123764038, "logps/chosen": -2.9272239208221436, "logps/rejected": -1.4498443603515625, "loss": 0.5489, "rewards/accuracies": 1.0, "rewards/chosen": 1.3102256059646606, "rewards/margins": 0.3129799962043762, "rewards/rejected": 0.9972456097602844, "step": 2825 }, { "epoch": 1.52, "learning_rate": 5.112086362706249e-08, "logits/chosen": -1.9652763605117798, "logits/rejected": -2.2535910606384277, "logps/chosen": -0.39657679200172424, "logps/rejected": -0.38727572560310364, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.8047062158584595, "rewards/margins": 0.006636559963226318, "rewards/rejected": 0.7980696558952332, "step": 2826 }, { "epoch": 1.52, "learning_rate": 5.1091754975503466e-08, "logits/chosen": -2.0582144260406494, "logits/rejected": -2.056713342666626, "logps/chosen": -1.5106710195541382, "logps/rejected": -5.033685207366943, "loss": 0.4509, "rewards/accuracies": 1.0, "rewards/chosen": 1.0794246196746826, "rewards/margins": 0.562688410282135, "rewards/rejected": 0.5167362093925476, "step": 2827 }, { "epoch": 1.53, "learning_rate": 5.106264595373966e-08, "logits/chosen": -1.9619853496551514, "logits/rejected": -1.9687063694000244, "logps/chosen": -1.8216969966888428, "logps/rejected": -2.9797980785369873, "loss": 0.4736, "rewards/accuracies": 1.0, "rewards/chosen": 1.1040693521499634, "rewards/margins": 0.5013585090637207, "rewards/rejected": 0.6027108430862427, "step": 2828 }, { "epoch": 1.53, "learning_rate": 5.1033536571641714e-08, "logits/chosen": -2.084940195083618, "logits/rejected": -2.0928096771240234, "logps/chosen": -4.978126049041748, "logps/rejected": -9.401862144470215, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.1363071203231812, "rewards/margins": 0.5983841419219971, "rewards/rejected": 0.5379229784011841, "step": 2829 }, { "epoch": 1.53, "learning_rate": 5.100442683908035e-08, "logits/chosen": -2.190990686416626, "logits/rejected": -2.3381001949310303, "logps/chosen": -16.32819938659668, "logps/rejected": -16.93433380126953, "loss": 0.5955, "rewards/accuracies": 1.0, "rewards/chosen": 1.1467310190200806, "rewards/margins": 0.2059003710746765, "rewards/rejected": 0.940830647945404, "step": 2830 }, { "epoch": 1.53, "learning_rate": 5.097531676592643e-08, "logits/chosen": -2.1250905990600586, "logits/rejected": -2.1272449493408203, "logps/chosen": -0.22205854952335358, "logps/rejected": -4.237613201141357, "loss": 0.5458, "rewards/accuracies": 1.0, "rewards/chosen": 0.8405640721321106, "rewards/margins": 0.32026898860931396, "rewards/rejected": 0.5202950835227966, "step": 2831 }, { "epoch": 1.53, "learning_rate": 5.094620636205095e-08, "logits/chosen": -2.1487808227539062, "logits/rejected": -2.1481974124908447, "logps/chosen": -0.23679912090301514, "logps/rejected": -2.9273247718811035, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 1.021652102470398, "rewards/margins": 0.43857812881469727, "rewards/rejected": 0.5830739736557007, "step": 2832 }, { "epoch": 1.53, "learning_rate": 5.091709563732498e-08, "logits/chosen": -2.094400405883789, "logits/rejected": -2.088716745376587, "logps/chosen": -0.6757371425628662, "logps/rejected": -3.6539087295532227, "loss": 0.5878, "rewards/accuracies": 1.0, "rewards/chosen": 0.935581624507904, "rewards/margins": 0.22313368320465088, "rewards/rejected": 0.7124479413032532, "step": 2833 }, { "epoch": 1.53, "learning_rate": 5.088798460161973e-08, "logits/chosen": -2.1508750915527344, "logits/rejected": -2.1180710792541504, "logps/chosen": -15.291248321533203, "logps/rejected": -6.681094646453857, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 1.5652207136154175, "rewards/margins": 0.7698723673820496, "rewards/rejected": 0.7953483462333679, "step": 2834 }, { "epoch": 1.53, "learning_rate": 5.085887326480649e-08, "logits/chosen": -1.9706554412841797, "logits/rejected": -2.266983985900879, "logps/chosen": -9.940715789794922, "logps/rejected": -9.87314224243164, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.35780200362205505, "rewards/margins": -0.00527992844581604, "rewards/rejected": 0.3630819320678711, "step": 2835 }, { "epoch": 1.53, "learning_rate": 5.082976163675666e-08, "logits/chosen": -1.943811058998108, "logits/rejected": -2.2358925342559814, "logps/chosen": -1.1348069906234741, "logps/rejected": -1.1250696182250977, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8247184157371521, "rewards/margins": 0.025877773761749268, "rewards/rejected": 0.7988406419754028, "step": 2836 }, { "epoch": 1.53, "learning_rate": 5.080064972734178e-08, "logits/chosen": -2.0031001567840576, "logits/rejected": -2.247529983520508, "logps/chosen": -0.40853986144065857, "logps/rejected": -0.472296804189682, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 1.012090802192688, "rewards/margins": 0.005102872848510742, "rewards/rejected": 1.0069879293441772, "step": 2837 }, { "epoch": 1.53, "learning_rate": 5.077153754643341e-08, "logits/chosen": -2.052041530609131, "logits/rejected": -2.1136343479156494, "logps/chosen": -9.62199878692627, "logps/rejected": -7.625530242919922, "loss": 0.5826, "rewards/accuracies": 1.0, "rewards/chosen": 1.2419555187225342, "rewards/margins": 0.23484790325164795, "rewards/rejected": 1.0071076154708862, "step": 2838 }, { "epoch": 1.53, "learning_rate": 5.074242510390326e-08, "logits/chosen": -2.1473922729492188, "logits/rejected": -2.324800729751587, "logps/chosen": -10.200477600097656, "logps/rejected": -9.952388763427734, "loss": 0.7029, "rewards/accuracies": 0.0, "rewards/chosen": 0.8152973055839539, "rewards/margins": -0.019364356994628906, "rewards/rejected": 0.8346616625785828, "step": 2839 }, { "epoch": 1.53, "learning_rate": 5.071331240962312e-08, "logits/chosen": -1.9629770517349243, "logits/rejected": -2.2855961322784424, "logps/chosen": -0.7577424049377441, "logps/rejected": -0.8267509341239929, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7693772912025452, "rewards/margins": 0.004540383815765381, "rewards/rejected": 0.7648369073867798, "step": 2840 }, { "epoch": 1.53, "learning_rate": 5.068419947346483e-08, "logits/chosen": -2.1299784183502197, "logits/rejected": -2.1342544555664062, "logps/chosen": -1.642367959022522, "logps/rejected": -3.501596689224243, "loss": 0.485, "rewards/accuracies": 1.0, "rewards/chosen": 0.9501683115959167, "rewards/margins": 0.47144582867622375, "rewards/rejected": 0.478722482919693, "step": 2841 }, { "epoch": 1.53, "learning_rate": 5.0655086305300354e-08, "logits/chosen": -2.1088054180145264, "logits/rejected": -2.2469279766082764, "logps/chosen": -0.35676708817481995, "logps/rejected": -0.37507131695747375, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.8484474420547485, "rewards/margins": 0.03035414218902588, "rewards/rejected": 0.8180932998657227, "step": 2842 }, { "epoch": 1.53, "learning_rate": 5.06259729150017e-08, "logits/chosen": -2.0529942512512207, "logits/rejected": -2.065213441848755, "logps/chosen": -4.333188056945801, "logps/rejected": -10.931714057922363, "loss": 0.2544, "rewards/accuracies": 1.0, "rewards/chosen": 1.7171871662139893, "rewards/margins": 1.238858699798584, "rewards/rejected": 0.4783284366130829, "step": 2843 }, { "epoch": 1.53, "learning_rate": 5.059685931244101e-08, "logits/chosen": -2.1990439891815186, "logits/rejected": -2.2788922786712646, "logps/chosen": -0.9367753267288208, "logps/rejected": -1.976757287979126, "loss": 0.6656, "rewards/accuracies": 1.0, "rewards/chosen": 1.0271071195602417, "rewards/margins": 0.05587881803512573, "rewards/rejected": 0.971228301525116, "step": 2844 }, { "epoch": 1.53, "learning_rate": 5.056774550749042e-08, "logits/chosen": -2.0836241245269775, "logits/rejected": -2.0938515663146973, "logps/chosen": -3.5944716930389404, "logps/rejected": -3.185316562652588, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": 1.496579647064209, "rewards/margins": 1.007476568222046, "rewards/rejected": 0.4891030788421631, "step": 2845 }, { "epoch": 1.54, "learning_rate": 5.0538631510022187e-08, "logits/chosen": -2.1828179359436035, "logits/rejected": -2.3066771030426025, "logps/chosen": -0.3635233938694, "logps/rejected": -0.3762071430683136, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8285375833511353, "rewards/margins": 0.004667401313781738, "rewards/rejected": 0.8238701820373535, "step": 2846 }, { "epoch": 1.54, "learning_rate": 5.0509517329908615e-08, "logits/chosen": -2.1488258838653564, "logits/rejected": -2.1444156169891357, "logps/chosen": -3.6211915016174316, "logps/rejected": -7.4046430587768555, "loss": 0.3867, "rewards/accuracies": 1.0, "rewards/chosen": 1.4753162860870361, "rewards/margins": 0.7504899501800537, "rewards/rejected": 0.7248263359069824, "step": 2847 }, { "epoch": 1.54, "learning_rate": 5.048040297702207e-08, "logits/chosen": -2.056082010269165, "logits/rejected": -2.2540383338928223, "logps/chosen": -1.564440369606018, "logps/rejected": -6.717134475708008, "loss": 0.6133, "rewards/accuracies": 1.0, "rewards/chosen": 0.8309566378593445, "rewards/margins": 0.16655385494232178, "rewards/rejected": 0.6644027829170227, "step": 2848 }, { "epoch": 1.54, "learning_rate": 5.0451288461234995e-08, "logits/chosen": -2.073333978652954, "logits/rejected": -2.0803332328796387, "logps/chosen": -1.6830458641052246, "logps/rejected": -2.601741075515747, "loss": 0.4717, "rewards/accuracies": 1.0, "rewards/chosen": 1.1622272729873657, "rewards/margins": 0.5062691569328308, "rewards/rejected": 0.6559581160545349, "step": 2849 }, { "epoch": 1.54, "learning_rate": 5.042217379241986e-08, "logits/chosen": -2.071584939956665, "logits/rejected": -2.2552738189697266, "logps/chosen": -6.0874505043029785, "logps/rejected": -0.45005276799201965, "loss": 0.841, "rewards/accuracies": 0.0, "rewards/chosen": 0.5890130400657654, "rewards/margins": -0.2766183614730835, "rewards/rejected": 0.8656314015388489, "step": 2850 }, { "epoch": 1.54, "learning_rate": 5.03930589804492e-08, "logits/chosen": -2.0540990829467773, "logits/rejected": -2.242307424545288, "logps/chosen": -1.6846468448638916, "logps/rejected": -1.7590211629867554, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.7285761833190918, "rewards/margins": 0.03812897205352783, "rewards/rejected": 0.690447211265564, "step": 2851 }, { "epoch": 1.54, "learning_rate": 5.0363944035195604e-08, "logits/chosen": -1.9742151498794556, "logits/rejected": -1.9724974632263184, "logps/chosen": -2.8995065689086914, "logps/rejected": -3.933318614959717, "loss": 0.3713, "rewards/accuracies": 1.0, "rewards/chosen": 1.370969295501709, "rewards/margins": 0.7993403673171997, "rewards/rejected": 0.5716289281845093, "step": 2852 }, { "epoch": 1.54, "learning_rate": 5.0334828966531686e-08, "logits/chosen": -2.180086851119995, "logits/rejected": -2.2658421993255615, "logps/chosen": -4.287476539611816, "logps/rejected": -6.460348606109619, "loss": 0.6538, "rewards/accuracies": 1.0, "rewards/chosen": 0.965953528881073, "rewards/margins": 0.08023762702941895, "rewards/rejected": 0.885715901851654, "step": 2853 }, { "epoch": 1.54, "learning_rate": 5.030571378433015e-08, "logits/chosen": -1.9935517311096191, "logits/rejected": -2.2872514724731445, "logps/chosen": -1.1457021236419678, "logps/rejected": -0.8963898420333862, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.828225314617157, "rewards/margins": 0.000525057315826416, "rewards/rejected": 0.8277002573013306, "step": 2854 }, { "epoch": 1.54, "learning_rate": 5.027659849846366e-08, "logits/chosen": -2.087836742401123, "logits/rejected": -2.293691396713257, "logps/chosen": -1.1035971641540527, "logps/rejected": -1.1771976947784424, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 1.0763429403305054, "rewards/margins": 0.011208176612854004, "rewards/rejected": 1.0651347637176514, "step": 2855 }, { "epoch": 1.54, "learning_rate": 5.024748311880501e-08, "logits/chosen": -2.11441969871521, "logits/rejected": -2.264333963394165, "logps/chosen": -0.553817629814148, "logps/rejected": -0.6245582699775696, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.8196027874946594, "rewards/margins": -0.00315016508102417, "rewards/rejected": 0.8227529525756836, "step": 2856 }, { "epoch": 1.54, "learning_rate": 5.0218367655226943e-08, "logits/chosen": -2.0987844467163086, "logits/rejected": -2.2927565574645996, "logps/chosen": -1.6709651947021484, "logps/rejected": -1.2892060279846191, "loss": 0.7217, "rewards/accuracies": 0.0, "rewards/chosen": 0.8921422958374023, "rewards/margins": -0.05635261535644531, "rewards/rejected": 0.9484949111938477, "step": 2857 }, { "epoch": 1.54, "learning_rate": 5.018925211760227e-08, "logits/chosen": -2.158419132232666, "logits/rejected": -2.2636561393737793, "logps/chosen": -9.118496894836426, "logps/rejected": -8.918099403381348, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.49633780121803284, "rewards/margins": 0.005663871765136719, "rewards/rejected": 0.4906739294528961, "step": 2858 }, { "epoch": 1.54, "learning_rate": 5.016013651580382e-08, "logits/chosen": -2.197512626647949, "logits/rejected": -2.1991710662841797, "logps/chosen": -0.9411797523498535, "logps/rejected": -3.252228260040283, "loss": 0.4571, "rewards/accuracies": 1.0, "rewards/chosen": 1.3015583753585815, "rewards/margins": 0.5454916954040527, "rewards/rejected": 0.7560666799545288, "step": 2859 }, { "epoch": 1.54, "learning_rate": 5.013102085970444e-08, "logits/chosen": -2.0194621086120605, "logits/rejected": -2.2725956439971924, "logps/chosen": -0.3009469509124756, "logps/rejected": -0.344656765460968, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0395160913467407, "rewards/margins": 0.02553093433380127, "rewards/rejected": 1.0139851570129395, "step": 2860 }, { "epoch": 1.54, "learning_rate": 5.0101905159177026e-08, "logits/chosen": -2.2139530181884766, "logits/rejected": -2.0908780097961426, "logps/chosen": -47.45032501220703, "logps/rejected": -9.48964786529541, "loss": 0.3011, "rewards/accuracies": 1.0, "rewards/chosen": 2.0485482215881348, "rewards/margins": 1.0460307598114014, "rewards/rejected": 1.0025174617767334, "step": 2861 }, { "epoch": 1.54, "learning_rate": 5.0072789424094445e-08, "logits/chosen": -2.00232195854187, "logits/rejected": -2.258173942565918, "logps/chosen": -0.16431593894958496, "logps/rejected": -0.17618942260742188, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297887682914734, "rewards/margins": 0.0006182193756103516, "rewards/rejected": 0.929170548915863, "step": 2862 }, { "epoch": 1.54, "learning_rate": 5.004367366432957e-08, "logits/chosen": -2.0555546283721924, "logits/rejected": -2.3178439140319824, "logps/chosen": -9.713711738586426, "logps/rejected": -9.315162658691406, "loss": 0.7068, "rewards/accuracies": 0.0, "rewards/chosen": 0.4210166037082672, "rewards/margins": -0.027052223682403564, "rewards/rejected": 0.4480688273906708, "step": 2863 }, { "epoch": 1.54, "learning_rate": 5.0014557889755346e-08, "logits/chosen": -2.023972749710083, "logits/rejected": -2.289463758468628, "logps/chosen": -0.42623743414878845, "logps/rejected": -0.40310993790626526, "loss": 0.6642, "rewards/accuracies": 1.0, "rewards/chosen": 0.9849519729614258, "rewards/margins": 0.05868500471115112, "rewards/rejected": 0.9262669682502747, "step": 2864 }, { "epoch": 1.55, "learning_rate": 4.9985442110244656e-08, "logits/chosen": -2.1189985275268555, "logits/rejected": -2.223170042037964, "logps/chosen": -6.7785539627075195, "logps/rejected": -0.6497082710266113, "loss": 0.628, "rewards/accuracies": 1.0, "rewards/chosen": 0.989975094795227, "rewards/margins": 0.13483285903930664, "rewards/rejected": 0.8551422357559204, "step": 2865 }, { "epoch": 1.55, "learning_rate": 4.995632633567043e-08, "logits/chosen": -2.049645185470581, "logits/rejected": -2.322916030883789, "logps/chosen": -2.528426170349121, "logps/rejected": -2.62872314453125, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.7325180172920227, "rewards/margins": 0.016298949718475342, "rewards/rejected": 0.7162190675735474, "step": 2866 }, { "epoch": 1.55, "learning_rate": 4.992721057590558e-08, "logits/chosen": -1.9935011863708496, "logits/rejected": -1.9980230331420898, "logps/chosen": -2.063326597213745, "logps/rejected": -3.1585309505462646, "loss": 0.4516, "rewards/accuracies": 1.0, "rewards/chosen": 1.1500762701034546, "rewards/margins": 0.5607563853263855, "rewards/rejected": 0.5893198847770691, "step": 2867 }, { "epoch": 1.55, "learning_rate": 4.989809484082298e-08, "logits/chosen": -2.0645434856414795, "logits/rejected": -2.0584723949432373, "logps/chosen": -2.765413284301758, "logps/rejected": -5.686121940612793, "loss": 0.4631, "rewards/accuracies": 1.0, "rewards/chosen": 1.2702159881591797, "rewards/margins": 0.5292244553565979, "rewards/rejected": 0.7409915328025818, "step": 2868 }, { "epoch": 1.55, "learning_rate": 4.986897914029556e-08, "logits/chosen": -2.075350522994995, "logits/rejected": -2.081397533416748, "logps/chosen": -2.455826997756958, "logps/rejected": -4.001935958862305, "loss": 0.5144, "rewards/accuracies": 1.0, "rewards/chosen": 1.0450069904327393, "rewards/margins": 0.3965253233909607, "rewards/rejected": 0.6484816670417786, "step": 2869 }, { "epoch": 1.55, "learning_rate": 4.983986348419618e-08, "logits/chosen": -1.9939122200012207, "logits/rejected": -2.256530284881592, "logps/chosen": -0.3095337152481079, "logps/rejected": -0.3524377644062042, "loss": 0.6952, "rewards/accuracies": 0.0, "rewards/chosen": 0.8573326468467712, "rewards/margins": -0.004085242748260498, "rewards/rejected": 0.8614178895950317, "step": 2870 }, { "epoch": 1.55, "learning_rate": 4.981074788239772e-08, "logits/chosen": -2.1800241470336914, "logits/rejected": -2.2790591716766357, "logps/chosen": -4.755382537841797, "logps/rejected": -0.38738957047462463, "loss": 0.7762, "rewards/accuracies": 0.0, "rewards/chosen": 0.8774736523628235, "rewards/margins": -0.15975850820541382, "rewards/rejected": 1.0372321605682373, "step": 2871 }, { "epoch": 1.55, "learning_rate": 4.978163234477306e-08, "logits/chosen": -2.0551257133483887, "logits/rejected": -2.0576133728027344, "logps/chosen": -3.1681456565856934, "logps/rejected": -3.989283323287964, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 1.513978362083435, "rewards/margins": 0.9166964888572693, "rewards/rejected": 0.5972818732261658, "step": 2872 }, { "epoch": 1.55, "learning_rate": 4.9752516881195e-08, "logits/chosen": -2.0749573707580566, "logits/rejected": -2.2539150714874268, "logps/chosen": -0.4048365354537964, "logps/rejected": -0.4086936116218567, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9197859168052673, "rewards/margins": 0.022258877754211426, "rewards/rejected": 0.8975270390510559, "step": 2873 }, { "epoch": 1.55, "learning_rate": 4.972340150153634e-08, "logits/chosen": -2.052631139755249, "logits/rejected": -2.3158693313598633, "logps/chosen": -1.4731593132019043, "logps/rejected": -5.770316123962402, "loss": 0.6035, "rewards/accuracies": 1.0, "rewards/chosen": 1.0518219470977783, "rewards/margins": 0.18818098306655884, "rewards/rejected": 0.8636409640312195, "step": 2874 }, { "epoch": 1.55, "learning_rate": 4.9694286215669855e-08, "logits/chosen": -2.103821277618408, "logits/rejected": -2.2248098850250244, "logps/chosen": -2.281233549118042, "logps/rejected": -0.7378581762313843, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8573817610740662, "rewards/margins": 0.01795417070388794, "rewards/rejected": 0.8394275903701782, "step": 2875 }, { "epoch": 1.55, "learning_rate": 4.966517103346831e-08, "logits/chosen": -2.1430442333221436, "logits/rejected": -2.1256110668182373, "logps/chosen": -12.309503555297852, "logps/rejected": -6.845406532287598, "loss": 0.3033, "rewards/accuracies": 1.0, "rewards/chosen": 1.3452743291854858, "rewards/margins": 1.0375950336456299, "rewards/rejected": 0.30767926573753357, "step": 2876 }, { "epoch": 1.55, "learning_rate": 4.963605596480441e-08, "logits/chosen": -2.204615354537964, "logits/rejected": -2.1698954105377197, "logps/chosen": -14.034261703491211, "logps/rejected": -2.948442220687866, "loss": 0.5158, "rewards/accuracies": 1.0, "rewards/chosen": 1.1395803689956665, "rewards/margins": 0.39303457736968994, "rewards/rejected": 0.7465457916259766, "step": 2877 }, { "epoch": 1.55, "learning_rate": 4.960694101955082e-08, "logits/chosen": -2.0638041496276855, "logits/rejected": -2.3345062732696533, "logps/chosen": -0.3187454044818878, "logps/rejected": -0.3388001322746277, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7938016057014465, "rewards/margins": -2.390146255493164e-05, "rewards/rejected": 0.7938255071640015, "step": 2878 }, { "epoch": 1.55, "learning_rate": 4.9577826207580154e-08, "logits/chosen": -1.9944233894348145, "logits/rejected": -2.2276666164398193, "logps/chosen": -0.6360229849815369, "logps/rejected": -0.7358294129371643, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.7923182249069214, "rewards/margins": 0.015325725078582764, "rewards/rejected": 0.7769924998283386, "step": 2879 }, { "epoch": 1.55, "learning_rate": 4.9548711538765e-08, "logits/chosen": -2.1275646686553955, "logits/rejected": -2.1238584518432617, "logps/chosen": -4.501516819000244, "logps/rejected": -4.683547019958496, "loss": 0.346, "rewards/accuracies": 1.0, "rewards/chosen": 1.2848005294799805, "rewards/margins": 0.883267879486084, "rewards/rejected": 0.4015326499938965, "step": 2880 }, { "epoch": 1.55, "learning_rate": 4.951959702297792e-08, "logits/chosen": -1.9639538526535034, "logits/rejected": -2.2390005588531494, "logps/chosen": -0.7991323471069336, "logps/rejected": -0.8286526203155518, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.8277503252029419, "rewards/margins": 0.024337947368621826, "rewards/rejected": 0.8034123778343201, "step": 2881 }, { "epoch": 1.55, "learning_rate": 4.949048267009138e-08, "logits/chosen": -2.0775375366210938, "logits/rejected": -2.209320306777954, "logps/chosen": -2.918264865875244, "logps/rejected": -2.4251718521118164, "loss": 0.7143, "rewards/accuracies": 0.0, "rewards/chosen": 0.827265202999115, "rewards/margins": -0.04185950756072998, "rewards/rejected": 0.869124710559845, "step": 2882 }, { "epoch": 1.56, "learning_rate": 4.9461368489977816e-08, "logits/chosen": -2.2114500999450684, "logits/rejected": -2.3145551681518555, "logps/chosen": -1.6217217445373535, "logps/rejected": -1.6730375289916992, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 1.0291646718978882, "rewards/margins": 0.027788400650024414, "rewards/rejected": 1.0013762712478638, "step": 2883 }, { "epoch": 1.56, "learning_rate": 4.943225449250958e-08, "logits/chosen": -1.986008882522583, "logits/rejected": -2.293360471725464, "logps/chosen": -0.34839510917663574, "logps/rejected": -0.39221006631851196, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.9662685394287109, "rewards/margins": -0.0005577206611633301, "rewards/rejected": 0.9668262600898743, "step": 2884 }, { "epoch": 1.56, "learning_rate": 4.940314068755899e-08, "logits/chosen": -1.9392144680023193, "logits/rejected": -1.9473237991333008, "logps/chosen": -1.8182315826416016, "logps/rejected": -3.460998058319092, "loss": 0.4991, "rewards/accuracies": 1.0, "rewards/chosen": 0.8986617922782898, "rewards/margins": 0.43516820669174194, "rewards/rejected": 0.46349358558654785, "step": 2885 }, { "epoch": 1.56, "learning_rate": 4.93740270849983e-08, "logits/chosen": -1.9589446783065796, "logits/rejected": -2.3084559440612793, "logps/chosen": -4.312191009521484, "logps/rejected": -4.8495635986328125, "loss": 0.6505, "rewards/accuracies": 1.0, "rewards/chosen": 0.8163558840751648, "rewards/margins": 0.08718717098236084, "rewards/rejected": 0.729168713092804, "step": 2886 }, { "epoch": 1.56, "learning_rate": 4.934491369469966e-08, "logits/chosen": -2.0047311782836914, "logits/rejected": -2.2550623416900635, "logps/chosen": -0.29731401801109314, "logps/rejected": -0.3251710534095764, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9632543921470642, "rewards/margins": 0.014258980751037598, "rewards/rejected": 0.9489954113960266, "step": 2887 }, { "epoch": 1.56, "learning_rate": 4.931580052653518e-08, "logits/chosen": -2.0177836418151855, "logits/rejected": -2.0054917335510254, "logps/chosen": -11.964373588562012, "logps/rejected": -2.73644757270813, "loss": 0.618, "rewards/accuracies": 1.0, "rewards/chosen": 0.9969523549079895, "rewards/margins": 0.1564529538154602, "rewards/rejected": 0.8404994010925293, "step": 2888 }, { "epoch": 1.56, "learning_rate": 4.928668759037689e-08, "logits/chosen": -2.2071645259857178, "logits/rejected": -2.267873764038086, "logps/chosen": -1.2347394227981567, "logps/rejected": -1.3208059072494507, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.9008163809776306, "rewards/margins": 0.03512090444564819, "rewards/rejected": 0.8656954765319824, "step": 2889 }, { "epoch": 1.56, "learning_rate": 4.925757489609673e-08, "logits/chosen": -2.0795390605926514, "logits/rejected": -2.0830466747283936, "logps/chosen": -0.457895964384079, "logps/rejected": -5.856573104858398, "loss": 0.5273, "rewards/accuracies": 1.0, "rewards/chosen": 0.935670793056488, "rewards/margins": 0.3648121953010559, "rewards/rejected": 0.5708585977554321, "step": 2890 }, { "epoch": 1.56, "learning_rate": 4.922846245356658e-08, "logits/chosen": -1.9958305358886719, "logits/rejected": -2.313264846801758, "logps/chosen": -5.198293209075928, "logps/rejected": -4.61775541305542, "loss": 0.7293, "rewards/accuracies": 0.0, "rewards/chosen": 0.653816282749176, "rewards/margins": -0.07112342119216919, "rewards/rejected": 0.7249397039413452, "step": 2891 }, { "epoch": 1.56, "learning_rate": 4.9199350272658227e-08, "logits/chosen": -2.050896167755127, "logits/rejected": -2.297588348388672, "logps/chosen": -1.3334028720855713, "logps/rejected": -3.2178566455841064, "loss": 0.6044, "rewards/accuracies": 1.0, "rewards/chosen": 1.1052720546722412, "rewards/margins": 0.1861182451248169, "rewards/rejected": 0.9191538095474243, "step": 2892 }, { "epoch": 1.56, "learning_rate": 4.9170238363243336e-08, "logits/chosen": -1.9898290634155273, "logits/rejected": -1.9871978759765625, "logps/chosen": -3.345468521118164, "logps/rejected": -2.420733690261841, "loss": 0.3912, "rewards/accuracies": 1.0, "rewards/chosen": 1.4322651624679565, "rewards/margins": 0.7366845607757568, "rewards/rejected": 0.6955806016921997, "step": 2893 }, { "epoch": 1.56, "learning_rate": 4.914112673519351e-08, "logits/chosen": -2.046210765838623, "logits/rejected": -2.0546391010284424, "logps/chosen": -1.3077164888381958, "logps/rejected": -4.093023777008057, "loss": 0.4725, "rewards/accuracies": 1.0, "rewards/chosen": 0.8445698022842407, "rewards/margins": 0.5042233467102051, "rewards/rejected": 0.34034642577171326, "step": 2894 }, { "epoch": 1.56, "learning_rate": 4.911201539838028e-08, "logits/chosen": -2.13191819190979, "logits/rejected": -2.1348283290863037, "logps/chosen": -2.4403634071350098, "logps/rejected": -3.032108783721924, "loss": 0.594, "rewards/accuracies": 1.0, "rewards/chosen": 0.8594881296157837, "rewards/margins": 0.2092260718345642, "rewards/rejected": 0.6502620577812195, "step": 2895 }, { "epoch": 1.56, "learning_rate": 4.908290436267502e-08, "logits/chosen": -2.0894007682800293, "logits/rejected": -2.2504565715789795, "logps/chosen": -1.5011297464370728, "logps/rejected": -1.5599437952041626, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 0.6769595742225647, "rewards/margins": 0.027374088764190674, "rewards/rejected": 0.649585485458374, "step": 2896 }, { "epoch": 1.56, "learning_rate": 4.905379363794906e-08, "logits/chosen": -2.0869247913360596, "logits/rejected": -1.9747918844223022, "logps/chosen": -27.035749435424805, "logps/rejected": -4.334019660949707, "loss": 0.5092, "rewards/accuracies": 1.0, "rewards/chosen": 1.3962122201919556, "rewards/margins": 0.40944671630859375, "rewards/rejected": 0.9867655038833618, "step": 2897 }, { "epoch": 1.56, "learning_rate": 4.9024683234073563e-08, "logits/chosen": -2.0953731536865234, "logits/rejected": -2.3311164379119873, "logps/chosen": -1.0203043222427368, "logps/rejected": -0.8936207294464111, "loss": 0.6328, "rewards/accuracies": 1.0, "rewards/chosen": 1.2449208498001099, "rewards/margins": 0.12452316284179688, "rewards/rejected": 1.120397686958313, "step": 2898 }, { "epoch": 1.56, "learning_rate": 4.899557316091964e-08, "logits/chosen": -2.1282074451446533, "logits/rejected": -2.157597541809082, "logps/chosen": -8.699545860290527, "logps/rejected": -17.88766098022461, "loss": 0.3632, "rewards/accuracies": 1.0, "rewards/chosen": 1.2450063228607178, "rewards/margins": 0.8256217241287231, "rewards/rejected": 0.41938456892967224, "step": 2899 }, { "epoch": 1.56, "learning_rate": 4.896646342835828e-08, "logits/chosen": -2.0259437561035156, "logits/rejected": -1.9717847108840942, "logps/chosen": -32.508689880371094, "logps/rejected": -2.7926111221313477, "loss": 0.2354, "rewards/accuracies": 1.0, "rewards/chosen": 1.9263657331466675, "rewards/margins": 1.326378345489502, "rewards/rejected": 0.5999874472618103, "step": 2900 }, { "epoch": 1.56, "learning_rate": 4.893735404626033e-08, "logits/chosen": -2.007248640060425, "logits/rejected": -2.276088237762451, "logps/chosen": -0.4588933289051056, "logps/rejected": -0.5074179172515869, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 1.059315800666809, "rewards/margins": 0.04560065269470215, "rewards/rejected": 1.013715147972107, "step": 2901 }, { "epoch": 1.57, "learning_rate": 4.890824502449654e-08, "logits/chosen": -2.0083508491516113, "logits/rejected": -2.0085361003875732, "logps/chosen": -1.7628964185714722, "logps/rejected": -2.08856201171875, "loss": 0.5521, "rewards/accuracies": 1.0, "rewards/chosen": 1.2526462078094482, "rewards/margins": 0.3053642511367798, "rewards/rejected": 0.9472819566726685, "step": 2902 }, { "epoch": 1.57, "learning_rate": 4.887913637293751e-08, "logits/chosen": -2.171290397644043, "logits/rejected": -2.4219441413879395, "logps/chosen": -11.24168872833252, "logps/rejected": -13.524436950683594, "loss": 0.833, "rewards/accuracies": 0.0, "rewards/chosen": 0.8856862187385559, "rewards/margins": -0.26249247789382935, "rewards/rejected": 1.1481786966323853, "step": 2903 }, { "epoch": 1.57, "learning_rate": 4.885002810145376e-08, "logits/chosen": -2.030630111694336, "logits/rejected": -2.2327182292938232, "logps/chosen": -2.8974015712738037, "logps/rejected": -4.06837797164917, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9742345809936523, "rewards/margins": -5.91278076171875e-05, "rewards/rejected": 0.9742937088012695, "step": 2904 }, { "epoch": 1.57, "learning_rate": 4.882092021991564e-08, "logits/chosen": -2.119973659515381, "logits/rejected": -2.065469980239868, "logps/chosen": -17.697269439697266, "logps/rejected": -3.3006718158721924, "loss": 0.3067, "rewards/accuracies": 1.0, "rewards/chosen": 1.6548569202423096, "rewards/margins": 1.0247788429260254, "rewards/rejected": 0.6300780177116394, "step": 2905 }, { "epoch": 1.57, "learning_rate": 4.879181273819339e-08, "logits/chosen": -1.9970897436141968, "logits/rejected": -2.011096954345703, "logps/chosen": -3.161862850189209, "logps/rejected": -7.47683048248291, "loss": 0.5059, "rewards/accuracies": 1.0, "rewards/chosen": 0.9906341433525085, "rewards/margins": 0.4177948832511902, "rewards/rejected": 0.5728392601013184, "step": 2906 }, { "epoch": 1.57, "learning_rate": 4.87627056661571e-08, "logits/chosen": -2.0521399974823, "logits/rejected": -2.3154633045196533, "logps/chosen": -1.779029369354248, "logps/rejected": -1.908494472503662, "loss": 0.7004, "rewards/accuracies": 0.0, "rewards/chosen": 1.0601036548614502, "rewards/margins": -0.01440274715423584, "rewards/rejected": 1.074506402015686, "step": 2907 }, { "epoch": 1.57, "learning_rate": 4.873359901367675e-08, "logits/chosen": -2.127216100692749, "logits/rejected": -2.2988462448120117, "logps/chosen": -1.21007239818573, "logps/rejected": -1.0603469610214233, "loss": 0.6693, "rewards/accuracies": 1.0, "rewards/chosen": 1.0658833980560303, "rewards/margins": 0.048358917236328125, "rewards/rejected": 1.0175244808197021, "step": 2908 }, { "epoch": 1.57, "learning_rate": 4.870449279062211e-08, "logits/chosen": -2.1901097297668457, "logits/rejected": -2.181816577911377, "logps/chosen": -6.285672187805176, "logps/rejected": -5.6702399253845215, "loss": 0.3123, "rewards/accuracies": 1.0, "rewards/chosen": 1.364841341972351, "rewards/margins": 1.0036976337432861, "rewards/rejected": 0.36114364862442017, "step": 2909 }, { "epoch": 1.57, "learning_rate": 4.867538700686291e-08, "logits/chosen": -2.169003963470459, "logits/rejected": -2.0785882472991943, "logps/chosen": -21.06194305419922, "logps/rejected": -3.4294979572296143, "loss": 0.1568, "rewards/accuracies": 1.0, "rewards/chosen": 2.2906577587127686, "rewards/margins": 1.7733688354492188, "rewards/rejected": 0.517288863658905, "step": 2910 }, { "epoch": 1.57, "learning_rate": 4.864628167226865e-08, "logits/chosen": -1.9595400094985962, "logits/rejected": -1.9585825204849243, "logps/chosen": -1.2892228364944458, "logps/rejected": -2.5067944526672363, "loss": 0.5792, "rewards/accuracies": 1.0, "rewards/chosen": 1.120721459388733, "rewards/margins": 0.24248754978179932, "rewards/rejected": 0.8782339096069336, "step": 2911 }, { "epoch": 1.57, "learning_rate": 4.861717679670869e-08, "logits/chosen": -2.093351364135742, "logits/rejected": -2.045624256134033, "logps/chosen": -9.82752799987793, "logps/rejected": -5.955918788909912, "loss": 0.4211, "rewards/accuracies": 1.0, "rewards/chosen": 1.4531166553497314, "rewards/margins": 0.6470487713813782, "rewards/rejected": 0.8060678839683533, "step": 2912 }, { "epoch": 1.57, "learning_rate": 4.858807239005224e-08, "logits/chosen": -2.081256866455078, "logits/rejected": -2.080892324447632, "logps/chosen": -1.3653981685638428, "logps/rejected": -1.4413466453552246, "loss": 0.671, "rewards/accuracies": 1.0, "rewards/chosen": 0.833526074886322, "rewards/margins": 0.044809043407440186, "rewards/rejected": 0.7887170314788818, "step": 2913 }, { "epoch": 1.57, "learning_rate": 4.855896846216838e-08, "logits/chosen": -1.9987635612487793, "logits/rejected": -1.9881668090820312, "logps/chosen": -5.947007179260254, "logps/rejected": -4.786090850830078, "loss": 0.3557, "rewards/accuracies": 1.0, "rewards/chosen": 1.540293574333191, "rewards/margins": 0.8506306409835815, "rewards/rejected": 0.6896629333496094, "step": 2914 }, { "epoch": 1.57, "learning_rate": 4.8529865022925984e-08, "logits/chosen": -2.1139416694641113, "logits/rejected": -2.114475727081299, "logps/chosen": -1.5349602699279785, "logps/rejected": -1.9381428956985474, "loss": 0.525, "rewards/accuracies": 1.0, "rewards/chosen": 1.05910325050354, "rewards/margins": 0.37040388584136963, "rewards/rejected": 0.6886993646621704, "step": 2915 }, { "epoch": 1.57, "learning_rate": 4.850076208219379e-08, "logits/chosen": -2.158029556274414, "logits/rejected": -2.3502085208892822, "logps/chosen": -1.5575947761535645, "logps/rejected": -1.5951111316680908, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 1.1677865982055664, "rewards/margins": 0.038649678230285645, "rewards/rejected": 1.1291369199752808, "step": 2916 }, { "epoch": 1.57, "learning_rate": 4.8471659649840335e-08, "logits/chosen": -1.987014889717102, "logits/rejected": -1.993208646774292, "logps/chosen": -3.484221935272217, "logps/rejected": -2.4305002689361572, "loss": 0.4658, "rewards/accuracies": 1.0, "rewards/chosen": 1.1774885654449463, "rewards/margins": 0.5219972729682922, "rewards/rejected": 0.655491292476654, "step": 2917 }, { "epoch": 1.57, "learning_rate": 4.8442557735734036e-08, "logits/chosen": -2.1243395805358887, "logits/rejected": -2.100648880004883, "logps/chosen": -15.346774101257324, "logps/rejected": -1.275528073310852, "loss": 0.4068, "rewards/accuracies": 1.0, "rewards/chosen": 1.6583095788955688, "rewards/margins": 0.6892234086990356, "rewards/rejected": 0.9690861701965332, "step": 2918 }, { "epoch": 1.57, "learning_rate": 4.841345634974306e-08, "logits/chosen": -2.0939934253692627, "logits/rejected": -2.2907161712646484, "logps/chosen": -0.49200859665870667, "logps/rejected": -0.5317370295524597, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.8782870173454285, "rewards/margins": 0.02027338743209839, "rewards/rejected": 0.8580136299133301, "step": 2919 }, { "epoch": 1.57, "learning_rate": 4.838435550173549e-08, "logits/chosen": -2.0921638011932373, "logits/rejected": -2.098452091217041, "logps/chosen": -3.723951816558838, "logps/rejected": -3.6628265380859375, "loss": 0.5295, "rewards/accuracies": 1.0, "rewards/chosen": 1.1207363605499268, "rewards/margins": 0.35939639806747437, "rewards/rejected": 0.7613399624824524, "step": 2920 }, { "epoch": 1.58, "learning_rate": 4.835525520157915e-08, "logits/chosen": -2.108621597290039, "logits/rejected": -2.111186981201172, "logps/chosen": -3.9066996574401855, "logps/rejected": -0.34576818346977234, "loss": 0.5979, "rewards/accuracies": 1.0, "rewards/chosen": 1.237020492553711, "rewards/margins": 0.20047259330749512, "rewards/rejected": 1.0365478992462158, "step": 2921 }, { "epoch": 1.58, "learning_rate": 4.832615545914169e-08, "logits/chosen": -2.032165050506592, "logits/rejected": -2.029435157775879, "logps/chosen": -1.2280489206314087, "logps/rejected": -4.841830253601074, "loss": 0.451, "rewards/accuracies": 1.0, "rewards/chosen": 0.9162800908088684, "rewards/margins": 0.5623142719268799, "rewards/rejected": 0.3539658486843109, "step": 2922 }, { "epoch": 1.58, "learning_rate": 4.829705628429061e-08, "logits/chosen": -2.0406205654144287, "logits/rejected": -2.0288007259368896, "logps/chosen": -5.347688674926758, "logps/rejected": -2.179094076156616, "loss": 0.5933, "rewards/accuracies": 1.0, "rewards/chosen": 0.8201183676719666, "rewards/margins": 0.21078848838806152, "rewards/rejected": 0.609329879283905, "step": 2923 }, { "epoch": 1.58, "learning_rate": 4.826795768689318e-08, "logits/chosen": -2.0642175674438477, "logits/rejected": -2.0662481784820557, "logps/chosen": -0.45140406489372253, "logps/rejected": -3.5318894386291504, "loss": 0.5035, "rewards/accuracies": 1.0, "rewards/chosen": 1.0592294931411743, "rewards/margins": 0.4237631559371948, "rewards/rejected": 0.6354663372039795, "step": 2924 }, { "epoch": 1.58, "learning_rate": 4.823885967681648e-08, "logits/chosen": -2.027320146560669, "logits/rejected": -2.268291711807251, "logps/chosen": -2.5560061931610107, "logps/rejected": -2.2776939868927, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.693179726600647, "rewards/margins": 0.0034316182136535645, "rewards/rejected": 0.6897481083869934, "step": 2925 }, { "epoch": 1.58, "learning_rate": 4.820976226392739e-08, "logits/chosen": -1.9864630699157715, "logits/rejected": -1.98551607131958, "logps/chosen": -2.8350043296813965, "logps/rejected": -3.0698156356811523, "loss": 0.4302, "rewards/accuracies": 1.0, "rewards/chosen": 1.1905758380889893, "rewards/margins": 0.6207974553108215, "rewards/rejected": 0.5697783827781677, "step": 2926 }, { "epoch": 1.58, "learning_rate": 4.818066545809262e-08, "logits/chosen": -2.0909759998321533, "logits/rejected": -2.297207832336426, "logps/chosen": -3.5122435092926025, "logps/rejected": -1.7409300804138184, "loss": 0.7363, "rewards/accuracies": 0.0, "rewards/chosen": 0.9189720153808594, "rewards/margins": -0.08444178104400635, "rewards/rejected": 1.0034137964248657, "step": 2927 }, { "epoch": 1.58, "learning_rate": 4.815156926917862e-08, "logits/chosen": -2.1006228923797607, "logits/rejected": -2.0775146484375, "logps/chosen": -13.471128463745117, "logps/rejected": -2.244614362716675, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 1.368834376335144, "rewards/margins": 0.6833649277687073, "rewards/rejected": 0.6854694485664368, "step": 2928 }, { "epoch": 1.58, "learning_rate": 4.812247370705166e-08, "logits/chosen": -2.1140778064727783, "logits/rejected": -2.3040931224823, "logps/chosen": -3.798297643661499, "logps/rejected": -3.854750156402588, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.5220524072647095, "rewards/margins": 0.02391296625137329, "rewards/rejected": 0.4981394410133362, "step": 2929 }, { "epoch": 1.58, "learning_rate": 4.809337878157782e-08, "logits/chosen": -2.11814546585083, "logits/rejected": -2.077314615249634, "logps/chosen": -20.469362258911133, "logps/rejected": -8.977743148803711, "loss": 0.3888, "rewards/accuracies": 1.0, "rewards/chosen": 1.4548276662826538, "rewards/margins": 0.7440898418426514, "rewards/rejected": 0.7107378244400024, "step": 2930 }, { "epoch": 1.58, "learning_rate": 4.806428450262293e-08, "logits/chosen": -2.1250579357147217, "logits/rejected": -2.305363416671753, "logps/chosen": -1.661228060722351, "logps/rejected": -2.9079105854034424, "loss": 0.6179, "rewards/accuracies": 1.0, "rewards/chosen": 0.8894035220146179, "rewards/margins": 0.15654301643371582, "rewards/rejected": 0.7328605055809021, "step": 2931 }, { "epoch": 1.58, "learning_rate": 4.80351908800526e-08, "logits/chosen": -2.0825095176696777, "logits/rejected": -2.3340885639190674, "logps/chosen": -7.5354905128479, "logps/rejected": -6.084902286529541, "loss": 0.8268, "rewards/accuracies": 0.0, "rewards/chosen": 0.3727094233036041, "rewards/margins": -0.25159284472465515, "rewards/rejected": 0.6243022680282593, "step": 2932 }, { "epoch": 1.58, "learning_rate": 4.8006097923732245e-08, "logits/chosen": -2.0709075927734375, "logits/rejected": -2.269986152648926, "logps/chosen": -0.38175880908966064, "logps/rejected": -0.3870782256126404, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8637012839317322, "rewards/margins": 0.006932675838470459, "rewards/rejected": 0.8567686080932617, "step": 2933 }, { "epoch": 1.58, "learning_rate": 4.7977005643527027e-08, "logits/chosen": -2.131345272064209, "logits/rejected": -2.2882049083709717, "logps/chosen": -0.5781073570251465, "logps/rejected": -0.530544638633728, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 0.9761068224906921, "rewards/margins": 0.03307199478149414, "rewards/rejected": 0.943034827709198, "step": 2934 }, { "epoch": 1.58, "learning_rate": 4.7947914049301896e-08, "logits/chosen": -2.193479061126709, "logits/rejected": -2.2015366554260254, "logps/chosen": -1.9670295715332031, "logps/rejected": -4.413461685180664, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 1.3226662874221802, "rewards/margins": 0.886532187461853, "rewards/rejected": 0.43613407015800476, "step": 2935 }, { "epoch": 1.58, "learning_rate": 4.791882315092155e-08, "logits/chosen": -2.0221705436706543, "logits/rejected": -2.015641927719116, "logps/chosen": -4.802243709564209, "logps/rejected": -3.737999677658081, "loss": 0.2613, "rewards/accuracies": 1.0, "rewards/chosen": 1.5841585397720337, "rewards/margins": 1.208662986755371, "rewards/rejected": 0.3754955232143402, "step": 2936 }, { "epoch": 1.58, "learning_rate": 4.788973295825048e-08, "logits/chosen": -2.2114920616149902, "logits/rejected": -2.207634925842285, "logps/chosen": -0.2579439878463745, "logps/rejected": -10.539604187011719, "loss": 0.3763, "rewards/accuracies": 1.0, "rewards/chosen": 0.932942807674408, "rewards/margins": 0.7833690643310547, "rewards/rejected": 0.14957371354103088, "step": 2937 }, { "epoch": 1.58, "learning_rate": 4.786064348115289e-08, "logits/chosen": -2.123220682144165, "logits/rejected": -2.288058280944824, "logps/chosen": -0.8807345628738403, "logps/rejected": -0.9103528261184692, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.075021743774414, "rewards/margins": 0.02293097972869873, "rewards/rejected": 1.0520907640457153, "step": 2938 }, { "epoch": 1.59, "learning_rate": 4.7831554729492823e-08, "logits/chosen": -2.075528860092163, "logits/rejected": -2.197028636932373, "logps/chosen": -0.47785016894340515, "logps/rejected": -0.5185706615447998, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.8940997123718262, "rewards/margins": 0.023905575275421143, "rewards/rejected": 0.870194137096405, "step": 2939 }, { "epoch": 1.59, "learning_rate": 4.7802466713133995e-08, "logits/chosen": -2.049016237258911, "logits/rejected": -2.248732089996338, "logps/chosen": -0.10737285017967224, "logps/rejected": -0.11075660586357117, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.8659551739692688, "rewards/margins": 0.02322244644165039, "rewards/rejected": 0.8427327275276184, "step": 2940 }, { "epoch": 1.59, "learning_rate": 4.777337944193991e-08, "logits/chosen": -2.149552822113037, "logits/rejected": -2.275172233581543, "logps/chosen": -4.733484268188477, "logps/rejected": -27.909299850463867, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 1.2481340169906616, "rewards/margins": 0.9146156311035156, "rewards/rejected": 0.3335184156894684, "step": 2941 }, { "epoch": 1.59, "learning_rate": 4.77442929257738e-08, "logits/chosen": -2.1304895877838135, "logits/rejected": -2.130382776260376, "logps/chosen": -1.2123591899871826, "logps/rejected": -3.0342891216278076, "loss": 0.4712, "rewards/accuracies": 1.0, "rewards/chosen": 1.111533522605896, "rewards/margins": 0.5075933933258057, "rewards/rejected": 0.6039401292800903, "step": 2942 }, { "epoch": 1.59, "learning_rate": 4.771520717449868e-08, "logits/chosen": -1.9559890031814575, "logits/rejected": -2.235421657562256, "logps/chosen": -1.3174479007720947, "logps/rejected": -1.3288654088974, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.8902144432067871, "rewards/margins": 0.020690977573394775, "rewards/rejected": 0.8695234656333923, "step": 2943 }, { "epoch": 1.59, "learning_rate": 4.7686122197977256e-08, "logits/chosen": -2.099555730819702, "logits/rejected": -2.1051928997039795, "logps/chosen": -1.0770432949066162, "logps/rejected": -1.725576639175415, "loss": 0.5453, "rewards/accuracies": 1.0, "rewards/chosen": 1.0626970529556274, "rewards/margins": 0.32136499881744385, "rewards/rejected": 0.7413320541381836, "step": 2944 }, { "epoch": 1.59, "learning_rate": 4.765703800607201e-08, "logits/chosen": -2.0367748737335205, "logits/rejected": -2.0466392040252686, "logps/chosen": -2.3655805587768555, "logps/rejected": -2.3181049823760986, "loss": 0.4613, "rewards/accuracies": 1.0, "rewards/chosen": 1.230872631072998, "rewards/margins": 0.5341499447822571, "rewards/rejected": 0.696722686290741, "step": 2945 }, { "epoch": 1.59, "learning_rate": 4.7627954608645126e-08, "logits/chosen": -2.0131726264953613, "logits/rejected": -2.2815299034118652, "logps/chosen": -11.17878532409668, "logps/rejected": -12.568283081054688, "loss": 0.708, "rewards/accuracies": 0.0, "rewards/chosen": 1.0103758573532104, "rewards/margins": -0.029471397399902344, "rewards/rejected": 1.0398472547531128, "step": 2946 }, { "epoch": 1.59, "learning_rate": 4.759887201555855e-08, "logits/chosen": -2.1051828861236572, "logits/rejected": -2.3006763458251953, "logps/chosen": -4.612288475036621, "logps/rejected": -2.6863675117492676, "loss": 0.7175, "rewards/accuracies": 0.0, "rewards/chosen": 0.6305109858512878, "rewards/margins": -0.04815816879272461, "rewards/rejected": 0.6786691546440125, "step": 2947 }, { "epoch": 1.59, "learning_rate": 4.756979023667392e-08, "logits/chosen": -2.0307960510253906, "logits/rejected": -2.0289671421051025, "logps/chosen": -0.9151701331138611, "logps/rejected": -1.1184550523757935, "loss": 0.631, "rewards/accuracies": 1.0, "rewards/chosen": 0.981735348701477, "rewards/margins": 0.1283550262451172, "rewards/rejected": 0.8533803224563599, "step": 2948 }, { "epoch": 1.59, "learning_rate": 4.7540709281852654e-08, "logits/chosen": -2.043710708618164, "logits/rejected": -2.2879927158355713, "logps/chosen": -0.4182082414627075, "logps/rejected": -0.4314640164375305, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.7946860194206238, "rewards/margins": 0.021060526371002197, "rewards/rejected": 0.7736254930496216, "step": 2949 }, { "epoch": 1.59, "learning_rate": 4.751162916095583e-08, "logits/chosen": -2.2024905681610107, "logits/rejected": -2.0725362300872803, "logps/chosen": -56.16529083251953, "logps/rejected": -4.777775287628174, "loss": 0.1858, "rewards/accuracies": 1.0, "rewards/chosen": 2.274771213531494, "rewards/margins": 1.5885486602783203, "rewards/rejected": 0.686222493648529, "step": 2950 }, { "epoch": 1.59, "learning_rate": 4.748254988384427e-08, "logits/chosen": -2.0712223052978516, "logits/rejected": -2.3295938968658447, "logps/chosen": -13.72641372680664, "logps/rejected": -15.624717712402344, "loss": 0.5903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9273397326469421, "rewards/margins": 0.21759581565856934, "rewards/rejected": 0.7097439169883728, "step": 2951 }, { "epoch": 1.59, "learning_rate": 4.7453471460378516e-08, "logits/chosen": -1.9951246976852417, "logits/rejected": -1.9867939949035645, "logps/chosen": -5.47538948059082, "logps/rejected": -4.562498092651367, "loss": 0.3152, "rewards/accuracies": 1.0, "rewards/chosen": 1.5332549810409546, "rewards/margins": 0.9926981329917908, "rewards/rejected": 0.5405568480491638, "step": 2952 }, { "epoch": 1.59, "learning_rate": 4.742439390041881e-08, "logits/chosen": -2.0867063999176025, "logits/rejected": -2.083818197250366, "logps/chosen": -2.3685097694396973, "logps/rejected": -5.6112589836120605, "loss": 0.2843, "rewards/accuracies": 1.0, "rewards/chosen": 1.539331316947937, "rewards/margins": 1.112228274345398, "rewards/rejected": 0.4271030128002167, "step": 2953 }, { "epoch": 1.59, "learning_rate": 4.739531721382511e-08, "logits/chosen": -2.138603925704956, "logits/rejected": -2.11657452583313, "logps/chosen": -9.879158973693848, "logps/rejected": -4.49141788482666, "loss": 0.3752, "rewards/accuracies": 1.0, "rewards/chosen": 1.2900761365890503, "rewards/margins": 0.7867409586906433, "rewards/rejected": 0.503335177898407, "step": 2954 }, { "epoch": 1.59, "learning_rate": 4.736624141045704e-08, "logits/chosen": -2.1112892627716064, "logits/rejected": -2.2108848094940186, "logps/chosen": -0.7818590402603149, "logps/rejected": -0.8845294117927551, "loss": 0.6727, "rewards/accuracies": 1.0, "rewards/chosen": 0.6777316331863403, "rewards/margins": 0.041229426860809326, "rewards/rejected": 0.636502206325531, "step": 2955 }, { "epoch": 1.59, "learning_rate": 4.7337166500174e-08, "logits/chosen": -2.119152307510376, "logits/rejected": -2.3134684562683105, "logps/chosen": -0.3493989109992981, "logps/rejected": -0.36953699588775635, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8541720509529114, "rewards/margins": 0.023012518882751465, "rewards/rejected": 0.8311595320701599, "step": 2956 }, { "epoch": 1.59, "learning_rate": 4.730809249283502e-08, "logits/chosen": -2.140387773513794, "logits/rejected": -2.215378522872925, "logps/chosen": -7.11339807510376, "logps/rejected": -5.933137893676758, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": 0.7291204333305359, "rewards/margins": -0.01995849609375, "rewards/rejected": 0.7490789294242859, "step": 2957 }, { "epoch": 1.6, "learning_rate": 4.727901939829884e-08, "logits/chosen": -2.1093575954437256, "logits/rejected": -2.0390448570251465, "logps/chosen": -21.361562728881836, "logps/rejected": -2.4942233562469482, "loss": 0.3882, "rewards/accuracies": 1.0, "rewards/chosen": 1.3825544118881226, "rewards/margins": 0.745856523513794, "rewards/rejected": 0.6366978883743286, "step": 2958 }, { "epoch": 1.6, "learning_rate": 4.7249947226423913e-08, "logits/chosen": -2.139658212661743, "logits/rejected": -2.1393747329711914, "logps/chosen": -4.522087097167969, "logps/rejected": -2.3815112113952637, "loss": 0.2856, "rewards/accuracies": 1.0, "rewards/chosen": 1.6526167392730713, "rewards/margins": 1.10701322555542, "rewards/rejected": 0.5456035137176514, "step": 2959 }, { "epoch": 1.6, "learning_rate": 4.722087598706837e-08, "logits/chosen": -1.9349703788757324, "logits/rejected": -1.9410524368286133, "logps/chosen": -1.531808614730835, "logps/rejected": -5.435681343078613, "loss": 0.4246, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691929221153259, "rewards/margins": 0.6369372606277466, "rewards/rejected": 0.33225566148757935, "step": 2960 }, { "epoch": 1.6, "learning_rate": 4.7191805690089995e-08, "logits/chosen": -2.067392349243164, "logits/rejected": -2.247413158416748, "logps/chosen": -0.46147626638412476, "logps/rejected": -0.5320314764976501, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 1.029710292816162, "rewards/margins": -0.005023002624511719, "rewards/rejected": 1.0347332954406738, "step": 2961 }, { "epoch": 1.6, "learning_rate": 4.7162736345346296e-08, "logits/chosen": -2.084932327270508, "logits/rejected": -2.0884082317352295, "logps/chosen": -2.8430428504943848, "logps/rejected": -5.272211074829102, "loss": 0.3332, "rewards/accuracies": 1.0, "rewards/chosen": 1.487732172012329, "rewards/margins": 0.9276605844497681, "rewards/rejected": 0.560071587562561, "step": 2962 }, { "epoch": 1.6, "learning_rate": 4.713366796269444e-08, "logits/chosen": -2.0345420837402344, "logits/rejected": -2.274766683578491, "logps/chosen": -1.015069842338562, "logps/rejected": -0.9396631717681885, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076104164123535, "rewards/margins": 0.029138565063476562, "rewards/rejected": 0.878471851348877, "step": 2963 }, { "epoch": 1.6, "learning_rate": 4.7104600551991263e-08, "logits/chosen": -2.053184747695923, "logits/rejected": -2.0537314414978027, "logps/chosen": -1.785796880722046, "logps/rejected": -0.6952632665634155, "loss": 0.3785, "rewards/accuracies": 1.0, "rewards/chosen": 1.5202171802520752, "rewards/margins": 0.7761793732643127, "rewards/rejected": 0.7440378069877625, "step": 2964 }, { "epoch": 1.6, "learning_rate": 4.7075534123093264e-08, "logits/chosen": -1.9252028465270996, "logits/rejected": -1.929295301437378, "logps/chosen": -1.4275343418121338, "logps/rejected": -2.7186388969421387, "loss": 0.5217, "rewards/accuracies": 1.0, "rewards/chosen": 1.0564898252487183, "rewards/margins": 0.37846988439559937, "rewards/rejected": 0.6780199408531189, "step": 2965 }, { "epoch": 1.6, "learning_rate": 4.704646868585663e-08, "logits/chosen": -2.0317299365997314, "logits/rejected": -2.0297296047210693, "logps/chosen": -0.32440027594566345, "logps/rejected": -2.57663631439209, "loss": 0.5425, "rewards/accuracies": 1.0, "rewards/chosen": 0.8773665428161621, "rewards/margins": 0.32803887128829956, "rewards/rejected": 0.5493276715278625, "step": 2966 }, { "epoch": 1.6, "learning_rate": 4.7017404250137213e-08, "logits/chosen": -2.126319169998169, "logits/rejected": -2.0452778339385986, "logps/chosen": -7.575961112976074, "logps/rejected": -1.7275148630142212, "loss": 0.4194, "rewards/accuracies": 1.0, "rewards/chosen": 1.5467818975448608, "rewards/margins": 0.6520445346832275, "rewards/rejected": 0.8947373628616333, "step": 2967 }, { "epoch": 1.6, "learning_rate": 4.698834082579048e-08, "logits/chosen": -2.1113243103027344, "logits/rejected": -2.1432955265045166, "logps/chosen": -2.9558820724487305, "logps/rejected": -23.260799407958984, "loss": 0.6106, "rewards/accuracies": 1.0, "rewards/chosen": 1.0657747983932495, "rewards/margins": 0.17260539531707764, "rewards/rejected": 0.8931694030761719, "step": 2968 }, { "epoch": 1.6, "learning_rate": 4.695927842267163e-08, "logits/chosen": -2.044271230697632, "logits/rejected": -2.2627127170562744, "logps/chosen": -0.39238253235816956, "logps/rejected": -0.36655956506729126, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.8555033802986145, "rewards/margins": -0.011537432670593262, "rewards/rejected": 0.8670408129692078, "step": 2969 }, { "epoch": 1.6, "learning_rate": 4.693021705063545e-08, "logits/chosen": -1.9623667001724243, "logits/rejected": -2.247619867324829, "logps/chosen": -2.099144220352173, "logps/rejected": -2.0971033573150635, "loss": 0.6788, "rewards/accuracies": 1.0, "rewards/chosen": 0.7222790122032166, "rewards/margins": 0.02895951271057129, "rewards/rejected": 0.6933194994926453, "step": 2970 }, { "epoch": 1.6, "learning_rate": 4.6901156719536424e-08, "logits/chosen": -2.022125482559204, "logits/rejected": -2.2688348293304443, "logps/chosen": -8.205204010009766, "logps/rejected": -6.851576328277588, "loss": 0.7426, "rewards/accuracies": 0.0, "rewards/chosen": 0.5078043341636658, "rewards/margins": -0.09662753343582153, "rewards/rejected": 0.6044318675994873, "step": 2971 }, { "epoch": 1.6, "learning_rate": 4.687209743922864e-08, "logits/chosen": -1.988879680633545, "logits/rejected": -1.9670275449752808, "logps/chosen": -13.417332649230957, "logps/rejected": -6.50515079498291, "loss": 0.4052, "rewards/accuracies": 1.0, "rewards/chosen": 1.4626662731170654, "rewards/margins": 0.6940916180610657, "rewards/rejected": 0.7685746550559998, "step": 2972 }, { "epoch": 1.6, "learning_rate": 4.684303921956587e-08, "logits/chosen": -2.011591672897339, "logits/rejected": -2.2584948539733887, "logps/chosen": -0.3316764831542969, "logps/rejected": -0.3585573732852936, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.9848410487174988, "rewards/margins": 0.0008098483085632324, "rewards/rejected": 0.9840312004089355, "step": 2973 }, { "epoch": 1.6, "learning_rate": 4.681398207040148e-08, "logits/chosen": -2.105473756790161, "logits/rejected": -2.081848621368408, "logps/chosen": -16.80797576904297, "logps/rejected": -3.3196911811828613, "loss": 0.2281, "rewards/accuracies": 1.0, "rewards/chosen": 2.1289589405059814, "rewards/margins": 1.3619656562805176, "rewards/rejected": 0.7669932842254639, "step": 2974 }, { "epoch": 1.6, "learning_rate": 4.678492600158854e-08, "logits/chosen": -2.208918571472168, "logits/rejected": -2.0958774089813232, "logps/chosen": -19.033790588378906, "logps/rejected": -14.268007278442383, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.3900460004806519, "rewards/margins": 0.3846626281738281, "rewards/rejected": 1.0053833723068237, "step": 2975 }, { "epoch": 1.61, "learning_rate": 4.675587102297968e-08, "logits/chosen": -2.042623519897461, "logits/rejected": -2.0486679077148438, "logps/chosen": -2.859431743621826, "logps/rejected": -4.853806018829346, "loss": 0.4467, "rewards/accuracies": 1.0, "rewards/chosen": 1.0677238702774048, "rewards/margins": 0.5741664171218872, "rewards/rejected": 0.4935574233531952, "step": 2976 }, { "epoch": 1.61, "learning_rate": 4.6726817144427194e-08, "logits/chosen": -2.092670440673828, "logits/rejected": -2.1011180877685547, "logps/chosen": -3.371877431869507, "logps/rejected": -6.71004581451416, "loss": 0.4216, "rewards/accuracies": 1.0, "rewards/chosen": 1.5271309614181519, "rewards/margins": 0.6455806493759155, "rewards/rejected": 0.8815503120422363, "step": 2977 }, { "epoch": 1.61, "learning_rate": 4.669776437578304e-08, "logits/chosen": -2.144406318664551, "logits/rejected": -2.351060628890991, "logps/chosen": -11.402952194213867, "logps/rejected": -11.925573348999023, "loss": 0.6441, "rewards/accuracies": 1.0, "rewards/chosen": 1.0796743631362915, "rewards/margins": 0.10059589147567749, "rewards/rejected": 0.979078471660614, "step": 2978 }, { "epoch": 1.61, "learning_rate": 4.6668712726898734e-08, "logits/chosen": -2.105034589767456, "logits/rejected": -2.1135315895080566, "logps/chosen": -4.191563606262207, "logps/rejected": -7.6174540519714355, "loss": 0.3018, "rewards/accuracies": 1.0, "rewards/chosen": 1.510743260383606, "rewards/margins": 1.04338800907135, "rewards/rejected": 0.46735522150993347, "step": 2979 }, { "epoch": 1.61, "learning_rate": 4.663966220762544e-08, "logits/chosen": -2.129925489425659, "logits/rejected": -2.1212143898010254, "logps/chosen": -14.460687637329102, "logps/rejected": -11.720088005065918, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.3069130182266235, "rewards/margins": 0.6033982634544373, "rewards/rejected": 0.7035147547721863, "step": 2980 }, { "epoch": 1.61, "learning_rate": 4.661061282781396e-08, "logits/chosen": -2.202936887741089, "logits/rejected": -2.2007577419281006, "logps/chosen": -3.007955312728882, "logps/rejected": -5.059917449951172, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 1.0061326026916504, "rewards/margins": 0.4991157650947571, "rewards/rejected": 0.5070168375968933, "step": 2981 }, { "epoch": 1.61, "learning_rate": 4.658156459731469e-08, "logits/chosen": -2.0069777965545654, "logits/rejected": -2.016402244567871, "logps/chosen": -1.6134463548660278, "logps/rejected": -3.06823468208313, "loss": 0.54, "rewards/accuracies": 1.0, "rewards/chosen": 1.0118927955627441, "rewards/margins": 0.33402538299560547, "rewards/rejected": 0.6778674125671387, "step": 2982 }, { "epoch": 1.61, "learning_rate": 4.655251752597761e-08, "logits/chosen": -2.1562788486480713, "logits/rejected": -2.2651302814483643, "logps/chosen": -0.9668066501617432, "logps/rejected": -0.9849499464035034, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0417457818984985, "rewards/margins": 0.002301931381225586, "rewards/rejected": 1.039443850517273, "step": 2983 }, { "epoch": 1.61, "learning_rate": 4.652347162365235e-08, "logits/chosen": -2.2066526412963867, "logits/rejected": -2.1205742359161377, "logps/chosen": -22.638296127319336, "logps/rejected": -1.968428611755371, "loss": 0.2143, "rewards/accuracies": 1.0, "rewards/chosen": 2.119454860687256, "rewards/margins": 1.4311622381210327, "rewards/rejected": 0.6882926225662231, "step": 2984 }, { "epoch": 1.61, "learning_rate": 4.649442690018813e-08, "logits/chosen": -2.1523921489715576, "logits/rejected": -2.121244430541992, "logps/chosen": -27.10720443725586, "logps/rejected": -4.700318813323975, "loss": 0.6022, "rewards/accuracies": 1.0, "rewards/chosen": 1.3028186559677124, "rewards/margins": 0.19096100330352783, "rewards/rejected": 1.1118576526641846, "step": 2985 }, { "epoch": 1.61, "learning_rate": 4.646538336543377e-08, "logits/chosen": -1.996995210647583, "logits/rejected": -2.2636444568634033, "logps/chosen": -1.5568569898605347, "logps/rejected": -1.5157663822174072, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.6804746985435486, "rewards/margins": 0.018466472625732422, "rewards/rejected": 0.6620082259178162, "step": 2986 }, { "epoch": 1.61, "learning_rate": 4.643634102923765e-08, "logits/chosen": -2.140477418899536, "logits/rejected": -2.4247660636901855, "logps/chosen": -10.615134239196777, "logps/rejected": -14.2684965133667, "loss": 0.7162, "rewards/accuracies": 0.0, "rewards/chosen": 0.8958302736282349, "rewards/margins": -0.04554671049118042, "rewards/rejected": 0.9413769841194153, "step": 2987 }, { "epoch": 1.61, "learning_rate": 4.640729990144783e-08, "logits/chosen": -2.0591237545013428, "logits/rejected": -2.061347007751465, "logps/chosen": -2.593783378601074, "logps/rejected": -1.2720248699188232, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 1.334664225578308, "rewards/margins": 0.4393840432167053, "rewards/rejected": 0.8952801823616028, "step": 2988 }, { "epoch": 1.61, "learning_rate": 4.6378259991911884e-08, "logits/chosen": -2.129486083984375, "logits/rejected": -2.2892544269561768, "logps/chosen": -0.6583864688873291, "logps/rejected": -0.673547625541687, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 1.0248631238937378, "rewards/margins": 0.02372300624847412, "rewards/rejected": 1.0011401176452637, "step": 2989 }, { "epoch": 1.61, "learning_rate": 4.6349221310476983e-08, "logits/chosen": -2.183475971221924, "logits/rejected": -2.3328182697296143, "logps/chosen": -1.5874921083450317, "logps/rejected": -1.5049444437026978, "loss": 0.699, "rewards/accuracies": 0.0, "rewards/chosen": 0.771458089351654, "rewards/margins": -0.011671066284179688, "rewards/rejected": 0.7831291556358337, "step": 2990 }, { "epoch": 1.61, "learning_rate": 4.6320183866989926e-08, "logits/chosen": -1.9786858558654785, "logits/rejected": -2.2467215061187744, "logps/chosen": -0.5417378544807434, "logps/rejected": -0.5732845067977905, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8760644197463989, "rewards/margins": 0.0050612688064575195, "rewards/rejected": 0.8710031509399414, "step": 2991 }, { "epoch": 1.61, "learning_rate": 4.6291147671297034e-08, "logits/chosen": -2.0662856101989746, "logits/rejected": -2.05708384513855, "logps/chosen": -9.985939025878906, "logps/rejected": -1.9458646774291992, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 1.104293704032898, "rewards/margins": 0.02797567844390869, "rewards/rejected": 1.0763180255889893, "step": 2992 }, { "epoch": 1.61, "learning_rate": 4.626211273324424e-08, "logits/chosen": -2.0521411895751953, "logits/rejected": -2.2521812915802, "logps/chosen": -0.9524498581886292, "logps/rejected": -0.8408015966415405, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.9012355804443359, "rewards/margins": 0.012666881084442139, "rewards/rejected": 0.8885686993598938, "step": 2993 }, { "epoch": 1.61, "learning_rate": 4.623307906267706e-08, "logits/chosen": -2.0631937980651855, "logits/rejected": -2.2323460578918457, "logps/chosen": -0.538420557975769, "logps/rejected": -0.5163577795028687, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 1.0229969024658203, "rewards/margins": 0.010887742042541504, "rewards/rejected": 1.0121091604232788, "step": 2994 }, { "epoch": 1.62, "learning_rate": 4.620404666944054e-08, "logits/chosen": -2.099215507507324, "logits/rejected": -2.1339268684387207, "logps/chosen": -4.018892288208008, "logps/rejected": -13.757781028747559, "loss": 0.4545, "rewards/accuracies": 1.0, "rewards/chosen": 1.2437057495117188, "rewards/margins": 0.5527690052986145, "rewards/rejected": 0.6909367442131042, "step": 2995 }, { "epoch": 1.62, "learning_rate": 4.617501556337934e-08, "logits/chosen": -2.06634521484375, "logits/rejected": -2.2317986488342285, "logps/chosen": -0.7434006333351135, "logps/rejected": -0.8876879215240479, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.7415792346000671, "rewards/margins": 0.03516554832458496, "rewards/rejected": 0.7064136862754822, "step": 2996 }, { "epoch": 1.62, "learning_rate": 4.614598575433762e-08, "logits/chosen": -2.0358316898345947, "logits/rejected": -2.0424368381500244, "logps/chosen": -5.149209976196289, "logps/rejected": -3.3410727977752686, "loss": 0.3976, "rewards/accuracies": 1.0, "rewards/chosen": 1.3800276517868042, "rewards/margins": 0.7168105840682983, "rewards/rejected": 0.6632170677185059, "step": 2997 }, { "epoch": 1.62, "learning_rate": 4.6116957252159185e-08, "logits/chosen": -2.0223453044891357, "logits/rejected": -2.0270915031433105, "logps/chosen": -1.8314313888549805, "logps/rejected": -4.548766613006592, "loss": 0.4352, "rewards/accuracies": 1.0, "rewards/chosen": 0.9867458343505859, "rewards/margins": 0.6066027879714966, "rewards/rejected": 0.38014301657676697, "step": 2998 }, { "epoch": 1.62, "learning_rate": 4.608793006668732e-08, "logits/chosen": -2.0683624744415283, "logits/rejected": -2.0659806728363037, "logps/chosen": -7.106842994689941, "logps/rejected": -5.7593994140625, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 1.1190638542175293, "rewards/margins": 0.053581833839416504, "rewards/rejected": 1.0654820203781128, "step": 2999 }, { "epoch": 1.62, "learning_rate": 4.605890420776492e-08, "logits/chosen": -2.1214957237243652, "logits/rejected": -2.0981104373931885, "logps/chosen": -13.181453704833984, "logps/rejected": -5.170410633087158, "loss": 0.5595, "rewards/accuracies": 1.0, "rewards/chosen": 0.9984142184257507, "rewards/margins": 0.2880043387413025, "rewards/rejected": 0.7104098796844482, "step": 3000 }, { "epoch": 1.62, "learning_rate": 4.602987968523439e-08, "logits/chosen": -2.0517380237579346, "logits/rejected": -2.039736270904541, "logps/chosen": -6.538035869598389, "logps/rejected": -2.6561710834503174, "loss": 0.4284, "rewards/accuracies": 1.0, "rewards/chosen": 1.4539520740509033, "rewards/margins": 0.6257641911506653, "rewards/rejected": 0.828187882900238, "step": 3001 }, { "epoch": 1.62, "learning_rate": 4.6000856508937705e-08, "logits/chosen": -1.9754334688186646, "logits/rejected": -2.2626137733459473, "logps/chosen": -3.4322547912597656, "logps/rejected": -3.165792465209961, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 1.073646903038025, "rewards/margins": 0.002436041831970215, "rewards/rejected": 1.0712108612060547, "step": 3002 }, { "epoch": 1.62, "learning_rate": 4.597183468871636e-08, "logits/chosen": -2.035813570022583, "logits/rejected": -2.0234622955322266, "logps/chosen": -6.594161510467529, "logps/rejected": -0.5649160146713257, "loss": 0.3984, "rewards/accuracies": 1.0, "rewards/chosen": 1.5614460706710815, "rewards/margins": 0.7145547270774841, "rewards/rejected": 0.8468913435935974, "step": 3003 }, { "epoch": 1.62, "learning_rate": 4.594281423441143e-08, "logits/chosen": -2.1791300773620605, "logits/rejected": -2.3038291931152344, "logps/chosen": -0.5212540030479431, "logps/rejected": -0.6169096827507019, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.9398992657661438, "rewards/margins": 0.01579892635345459, "rewards/rejected": 0.9241003394126892, "step": 3004 }, { "epoch": 1.62, "learning_rate": 4.591379515586348e-08, "logits/chosen": -2.0020523071289062, "logits/rejected": -2.0111501216888428, "logps/chosen": -2.127567768096924, "logps/rejected": -2.0204477310180664, "loss": 0.4298, "rewards/accuracies": 1.0, "rewards/chosen": 1.2728897333145142, "rewards/margins": 0.6217250823974609, "rewards/rejected": 0.6511646509170532, "step": 3005 }, { "epoch": 1.62, "learning_rate": 4.588477746291265e-08, "logits/chosen": -2.055462121963501, "logits/rejected": -2.0681753158569336, "logps/chosen": -5.79518985748291, "logps/rejected": -9.114766120910645, "loss": 0.2948, "rewards/accuracies": 1.0, "rewards/chosen": 1.6671688556671143, "rewards/margins": 1.07033371925354, "rewards/rejected": 0.5968350768089294, "step": 3006 }, { "epoch": 1.62, "learning_rate": 4.5855761165398566e-08, "logits/chosen": -2.178314208984375, "logits/rejected": -2.212801933288574, "logps/chosen": -11.588451385498047, "logps/rejected": -11.956393241882324, "loss": 0.4404, "rewards/accuracies": 1.0, "rewards/chosen": 1.3478659391403198, "rewards/margins": 0.5918828845024109, "rewards/rejected": 0.7559830546379089, "step": 3007 }, { "epoch": 1.62, "learning_rate": 4.5826746273160445e-08, "logits/chosen": -2.039642810821533, "logits/rejected": -2.2863152027130127, "logps/chosen": -0.5484428405761719, "logps/rejected": -0.6247612833976746, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.9753738641738892, "rewards/margins": 0.0026833415031433105, "rewards/rejected": 0.9726905226707458, "step": 3008 }, { "epoch": 1.62, "learning_rate": 4.579773279603695e-08, "logits/chosen": -2.0089619159698486, "logits/rejected": -2.0072970390319824, "logps/chosen": -2.871814012527466, "logps/rejected": -5.019660949707031, "loss": 0.2732, "rewards/accuracies": 1.0, "rewards/chosen": 1.5700716972351074, "rewards/margins": 1.1577903032302856, "rewards/rejected": 0.41228142380714417, "step": 3009 }, { "epoch": 1.62, "learning_rate": 4.5768720743866334e-08, "logits/chosen": -2.0028536319732666, "logits/rejected": -2.2868316173553467, "logps/chosen": -0.8562831878662109, "logps/rejected": -0.9367890357971191, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 1.0102308988571167, "rewards/margins": -0.004546642303466797, "rewards/rejected": 1.0147775411605835, "step": 3010 }, { "epoch": 1.62, "learning_rate": 4.5739710126486316e-08, "logits/chosen": -2.1019530296325684, "logits/rejected": -2.1145260334014893, "logps/chosen": -0.6000519394874573, "logps/rejected": -6.007376670837402, "loss": 0.5909, "rewards/accuracies": 1.0, "rewards/chosen": 1.0943244695663452, "rewards/margins": 0.21611666679382324, "rewards/rejected": 0.878207802772522, "step": 3011 }, { "epoch": 1.62, "learning_rate": 4.5710700953734166e-08, "logits/chosen": -2.079742431640625, "logits/rejected": -2.077946186065674, "logps/chosen": -0.7191413640975952, "logps/rejected": -4.058250904083252, "loss": 0.472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9133457541465759, "rewards/margins": 0.5054155588150024, "rewards/rejected": 0.4079302251338959, "step": 3012 }, { "epoch": 1.63, "learning_rate": 4.5681693235446615e-08, "logits/chosen": -2.145051956176758, "logits/rejected": -2.145578145980835, "logps/chosen": -2.3363473415374756, "logps/rejected": -4.127511501312256, "loss": 0.2832, "rewards/accuracies": 1.0, "rewards/chosen": 1.6673849821090698, "rewards/margins": 1.116851806640625, "rewards/rejected": 0.5505331754684448, "step": 3013 }, { "epoch": 1.63, "learning_rate": 4.5652686981459966e-08, "logits/chosen": -2.1709418296813965, "logits/rejected": -2.0355260372161865, "logps/chosen": -30.64696502685547, "logps/rejected": -17.42321014404297, "loss": 0.3093, "rewards/accuracies": 1.0, "rewards/chosen": 1.859521508216858, "rewards/margins": 1.014815330505371, "rewards/rejected": 0.8447061777114868, "step": 3014 }, { "epoch": 1.63, "learning_rate": 4.562368220160998e-08, "logits/chosen": -2.2929677963256836, "logits/rejected": -2.012148141860962, "logps/chosen": -68.8826904296875, "logps/rejected": -12.771685600280762, "loss": 0.0911, "rewards/accuracies": 1.0, "rewards/chosen": 3.086801290512085, "rewards/margins": 2.3498103618621826, "rewards/rejected": 0.7369908690452576, "step": 3015 }, { "epoch": 1.63, "learning_rate": 4.5594678905731896e-08, "logits/chosen": -1.958500862121582, "logits/rejected": -1.9579503536224365, "logps/chosen": -1.3252840042114258, "logps/rejected": -0.85395747423172, "loss": 0.6229, "rewards/accuracies": 1.0, "rewards/chosen": 0.919639527797699, "rewards/margins": 0.14584136009216309, "rewards/rejected": 0.7737981677055359, "step": 3016 }, { "epoch": 1.63, "learning_rate": 4.5565677103660546e-08, "logits/chosen": -2.151618719100952, "logits/rejected": -2.2585268020629883, "logps/chosen": -0.7941125631332397, "logps/rejected": -2.2978572845458984, "loss": 0.6488, "rewards/accuracies": 1.0, "rewards/chosen": 0.9006485342979431, "rewards/margins": 0.09068089723587036, "rewards/rejected": 0.8099676370620728, "step": 3017 }, { "epoch": 1.63, "learning_rate": 4.553667680523014e-08, "logits/chosen": -1.9267191886901855, "logits/rejected": -2.246797561645508, "logps/chosen": -0.24886921048164368, "logps/rejected": -0.26972341537475586, "loss": 0.668, "rewards/accuracies": 1.0, "rewards/chosen": 0.8784505128860474, "rewards/margins": 0.050920963287353516, "rewards/rejected": 0.8275295495986938, "step": 3018 }, { "epoch": 1.63, "learning_rate": 4.550767802027447e-08, "logits/chosen": -2.1804869174957275, "logits/rejected": -2.296203851699829, "logps/chosen": -0.3237696588039398, "logps/rejected": -0.3228360414505005, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 0.8763480186462402, "rewards/margins": 0.0008838772773742676, "rewards/rejected": 0.875464141368866, "step": 3019 }, { "epoch": 1.63, "learning_rate": 4.5478680758626766e-08, "logits/chosen": -2.1409761905670166, "logits/rejected": -2.322098970413208, "logps/chosen": -1.2192373275756836, "logps/rejected": -1.1080145835876465, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.8854155540466309, "rewards/margins": 0.0036780238151550293, "rewards/rejected": 0.8817375302314758, "step": 3020 }, { "epoch": 1.63, "learning_rate": 4.544968503011973e-08, "logits/chosen": -2.1049890518188477, "logits/rejected": -2.1329636573791504, "logps/chosen": -6.312859535217285, "logps/rejected": -9.014627456665039, "loss": 0.4183, "rewards/accuracies": 1.0, "rewards/chosen": 1.566368818283081, "rewards/margins": 0.6552478075027466, "rewards/rejected": 0.9111210107803345, "step": 3021 }, { "epoch": 1.63, "learning_rate": 4.542069084458558e-08, "logits/chosen": -2.227628469467163, "logits/rejected": -2.373842239379883, "logps/chosen": -8.943521499633789, "logps/rejected": -9.090984344482422, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 1.2420099973678589, "rewards/margins": 0.049478769302368164, "rewards/rejected": 1.1925312280654907, "step": 3022 }, { "epoch": 1.63, "learning_rate": 4.5391698211856e-08, "logits/chosen": -2.150209426879883, "logits/rejected": -2.1465258598327637, "logps/chosen": -6.21767520904541, "logps/rejected": -3.71551251411438, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 1.3580440282821655, "rewards/margins": 0.8095852732658386, "rewards/rejected": 0.5484587550163269, "step": 3023 }, { "epoch": 1.63, "learning_rate": 4.5362707141762144e-08, "logits/chosen": -2.0156795978546143, "logits/rejected": -2.304109573364258, "logps/chosen": -1.1349644660949707, "logps/rejected": -0.849666953086853, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8582647442817688, "rewards/margins": 0.030508697032928467, "rewards/rejected": 0.8277560472488403, "step": 3024 }, { "epoch": 1.63, "learning_rate": 4.533371764413463e-08, "logits/chosen": -2.1236255168914795, "logits/rejected": -2.108154773712158, "logps/chosen": -34.123085021972656, "logps/rejected": -27.111127853393555, "loss": 0.2848, "rewards/accuracies": 1.0, "rewards/chosen": 1.8618134260177612, "rewards/margins": 1.110295057296753, "rewards/rejected": 0.7515184283256531, "step": 3025 }, { "epoch": 1.63, "learning_rate": 4.5304729728803523e-08, "logits/chosen": -1.9825536012649536, "logits/rejected": -2.3203318119049072, "logps/chosen": -0.7058876752853394, "logps/rejected": -0.6513441801071167, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.0363203287124634, "rewards/margins": 0.01969742774963379, "rewards/rejected": 1.0166229009628296, "step": 3026 }, { "epoch": 1.63, "learning_rate": 4.527574340559844e-08, "logits/chosen": -2.152019739151001, "logits/rejected": -2.2995338439941406, "logps/chosen": -2.0559425354003906, "logps/rejected": -0.6152886748313904, "loss": 0.7335, "rewards/accuracies": 0.0, "rewards/chosen": 0.8934446573257446, "rewards/margins": -0.07910984754562378, "rewards/rejected": 0.9725545048713684, "step": 3027 }, { "epoch": 1.63, "learning_rate": 4.5246758684348336e-08, "logits/chosen": -2.1080245971679688, "logits/rejected": -2.271820068359375, "logps/chosen": -0.26994097232818604, "logps/rejected": -0.2581978142261505, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.9411341547966003, "rewards/margins": 0.04424095153808594, "rewards/rejected": 0.8968932032585144, "step": 3028 }, { "epoch": 1.63, "learning_rate": 4.5217775574881724e-08, "logits/chosen": -2.070279836654663, "logits/rejected": -2.070323944091797, "logps/chosen": -0.9623412489891052, "logps/rejected": -3.259909152984619, "loss": 0.5263, "rewards/accuracies": 1.0, "rewards/chosen": 1.171613097190857, "rewards/margins": 0.3672527074813843, "rewards/rejected": 0.8043603897094727, "step": 3029 }, { "epoch": 1.63, "learning_rate": 4.51887940870265e-08, "logits/chosen": -2.1055238246917725, "logits/rejected": -2.101994752883911, "logps/chosen": -1.2814953327178955, "logps/rejected": -2.9854252338409424, "loss": 0.4981, "rewards/accuracies": 1.0, "rewards/chosen": 1.140028715133667, "rewards/margins": 0.43749183416366577, "rewards/rejected": 0.7025368809700012, "step": 3030 }, { "epoch": 1.63, "learning_rate": 4.515981423061006e-08, "logits/chosen": -2.1370654106140137, "logits/rejected": -2.13806414604187, "logps/chosen": -0.32271620631217957, "logps/rejected": -5.329833984375, "loss": 0.4592, "rewards/accuracies": 1.0, "rewards/chosen": 0.9568301439285278, "rewards/margins": 0.5398668050765991, "rewards/rejected": 0.4169633090496063, "step": 3031 }, { "epoch": 1.64, "learning_rate": 4.51308360154592e-08, "logits/chosen": -2.1745963096618652, "logits/rejected": -2.1774628162384033, "logps/chosen": -2.6339735984802246, "logps/rejected": -5.594259738922119, "loss": 0.3157, "rewards/accuracies": 1.0, "rewards/chosen": 1.593438982963562, "rewards/margins": 0.990997314453125, "rewards/rejected": 0.602441668510437, "step": 3032 }, { "epoch": 1.64, "learning_rate": 4.5101859451400216e-08, "logits/chosen": -2.1056180000305176, "logits/rejected": -2.2822141647338867, "logps/chosen": -1.7690035104751587, "logps/rejected": -1.6957591772079468, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.9925390481948853, "rewards/margins": -0.004442751407623291, "rewards/rejected": 0.9969817996025085, "step": 3033 }, { "epoch": 1.64, "learning_rate": 4.507288454825879e-08, "logits/chosen": -2.04788875579834, "logits/rejected": -2.2614359855651855, "logps/chosen": -0.5643414855003357, "logps/rejected": -1.4436246156692505, "loss": 0.6308, "rewards/accuracies": 1.0, "rewards/chosen": 1.0082687139511108, "rewards/margins": 0.12884873151779175, "rewards/rejected": 0.8794199824333191, "step": 3034 }, { "epoch": 1.64, "learning_rate": 4.5043911315860066e-08, "logits/chosen": -2.1064882278442383, "logits/rejected": -2.1125357151031494, "logps/chosen": -1.605672836303711, "logps/rejected": -2.8373050689697266, "loss": 0.449, "rewards/accuracies": 1.0, "rewards/chosen": 1.1172174215316772, "rewards/margins": 0.5677953362464905, "rewards/rejected": 0.5494220852851868, "step": 3035 }, { "epoch": 1.64, "learning_rate": 4.501493976402861e-08, "logits/chosen": -2.1265745162963867, "logits/rejected": -2.1325533390045166, "logps/chosen": -3.3538546562194824, "logps/rejected": -4.077447891235352, "loss": 0.3963, "rewards/accuracies": 1.0, "rewards/chosen": 1.2063068151474, "rewards/margins": 0.7209315299987793, "rewards/rejected": 0.485375314950943, "step": 3036 }, { "epoch": 1.64, "learning_rate": 4.4985969902588445e-08, "logits/chosen": -2.116218328475952, "logits/rejected": -2.3046066761016846, "logps/chosen": -1.364923357963562, "logps/rejected": -1.4204647541046143, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.989353597164154, "rewards/margins": 0.019337594509124756, "rewards/rejected": 0.9700160026550293, "step": 3037 }, { "epoch": 1.64, "learning_rate": 4.4957001741363013e-08, "logits/chosen": -2.0233652591705322, "logits/rejected": -2.2513861656188965, "logps/chosen": -1.0373644828796387, "logps/rejected": -1.0085179805755615, "loss": 0.6671, "rewards/accuracies": 1.0, "rewards/chosen": 1.0365608930587769, "rewards/margins": 0.05288618803024292, "rewards/rejected": 0.9836747050285339, "step": 3038 }, { "epoch": 1.64, "learning_rate": 4.492803529017516e-08, "logits/chosen": -2.1989479064941406, "logits/rejected": -2.1766130924224854, "logps/chosen": -4.9207234382629395, "logps/rejected": -7.686211585998535, "loss": 0.4643, "rewards/accuracies": 1.0, "rewards/chosen": 1.0563938617706299, "rewards/margins": 0.5261021256446838, "rewards/rejected": 0.530291736125946, "step": 3039 }, { "epoch": 1.64, "learning_rate": 4.489907055884715e-08, "logits/chosen": -2.0246217250823975, "logits/rejected": -2.2626543045043945, "logps/chosen": -0.2840011417865753, "logps/rejected": -0.3161112666130066, "loss": 0.7054, "rewards/accuracies": 0.0, "rewards/chosen": 0.8823195695877075, "rewards/margins": -0.024417579174041748, "rewards/rejected": 0.9067371487617493, "step": 3040 }, { "epoch": 1.64, "learning_rate": 4.4870107557200675e-08, "logits/chosen": -2.11531925201416, "logits/rejected": -2.2840356826782227, "logps/chosen": -0.24636436998844147, "logps/rejected": -0.25991421937942505, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9454705119132996, "rewards/margins": 0.013300597667694092, "rewards/rejected": 0.9321699142456055, "step": 3041 }, { "epoch": 1.64, "learning_rate": 4.4841146295056864e-08, "logits/chosen": -2.0734851360321045, "logits/rejected": -2.33171010017395, "logps/chosen": -0.20200899243354797, "logps/rejected": -0.20972856879234314, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.919880211353302, "rewards/margins": 0.0065888166427612305, "rewards/rejected": 0.9132913947105408, "step": 3042 }, { "epoch": 1.64, "learning_rate": 4.4812186782236215e-08, "logits/chosen": -2.0498759746551514, "logits/rejected": -2.3315513134002686, "logps/chosen": -1.0567681789398193, "logps/rejected": -1.074110746383667, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.8014998435974121, "rewards/margins": 0.02411937713623047, "rewards/rejected": 0.7773804664611816, "step": 3043 }, { "epoch": 1.64, "learning_rate": 4.478322902855866e-08, "logits/chosen": -2.0739550590515137, "logits/rejected": -2.0692248344421387, "logps/chosen": -0.19598262012004852, "logps/rejected": -11.56667423248291, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 0.8717090487480164, "rewards/margins": 0.4029976725578308, "rewards/rejected": 0.46871137619018555, "step": 3044 }, { "epoch": 1.64, "learning_rate": 4.47542730438435e-08, "logits/chosen": -2.050114393234253, "logits/rejected": -2.33358097076416, "logps/chosen": -1.3388856649398804, "logps/rejected": -1.1242873668670654, "loss": 0.7172, "rewards/accuracies": 0.0, "rewards/chosen": 0.9767190217971802, "rewards/margins": -0.047479867935180664, "rewards/rejected": 1.0241988897323608, "step": 3045 }, { "epoch": 1.64, "learning_rate": 4.472531883790949e-08, "logits/chosen": -2.2238988876342773, "logits/rejected": -2.315717935562134, "logps/chosen": -2.8658084869384766, "logps/rejected": -3.1679301261901855, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.7611739039421082, "rewards/margins": 0.04591172933578491, "rewards/rejected": 0.7152621746063232, "step": 3046 }, { "epoch": 1.64, "learning_rate": 4.469636642057474e-08, "logits/chosen": -2.0489044189453125, "logits/rejected": -2.0505640506744385, "logps/chosen": -1.3116381168365479, "logps/rejected": -2.0971696376800537, "loss": 0.5211, "rewards/accuracies": 1.0, "rewards/chosen": 1.0613257884979248, "rewards/margins": 0.38006049394607544, "rewards/rejected": 0.6812652945518494, "step": 3047 }, { "epoch": 1.64, "learning_rate": 4.4667415801656786e-08, "logits/chosen": -2.112785816192627, "logits/rejected": -2.0991039276123047, "logps/chosen": -4.138264179229736, "logps/rejected": -4.5264739990234375, "loss": 0.347, "rewards/accuracies": 1.0, "rewards/chosen": 1.656070590019226, "rewards/margins": 0.8798429369926453, "rewards/rejected": 0.7762276530265808, "step": 3048 }, { "epoch": 1.64, "learning_rate": 4.463846699097252e-08, "logits/chosen": -2.166794538497925, "logits/rejected": -2.161489248275757, "logps/chosen": -7.155804634094238, "logps/rejected": -5.195172309875488, "loss": 0.3706, "rewards/accuracies": 1.0, "rewards/chosen": 1.2526755332946777, "rewards/margins": 0.8016198873519897, "rewards/rejected": 0.4510556161403656, "step": 3049 }, { "epoch": 1.65, "learning_rate": 4.460951999833824e-08, "logits/chosen": -2.0519821643829346, "logits/rejected": -2.0461490154266357, "logps/chosen": -4.352424144744873, "logps/rejected": -3.206345796585083, "loss": 0.3113, "rewards/accuracies": 1.0, "rewards/chosen": 1.5357029438018799, "rewards/margins": 1.0071899890899658, "rewards/rejected": 0.5285128951072693, "step": 3050 }, { "epoch": 1.65, "learning_rate": 4.4580574833569614e-08, "logits/chosen": -2.001765251159668, "logits/rejected": -2.0012404918670654, "logps/chosen": -0.8110752105712891, "logps/rejected": -1.4752289056777954, "loss": 0.606, "rewards/accuracies": 1.0, "rewards/chosen": 1.0253407955169678, "rewards/margins": 0.1826513409614563, "rewards/rejected": 0.8426894545555115, "step": 3051 }, { "epoch": 1.65, "learning_rate": 4.4551631506481715e-08, "logits/chosen": -2.1150777339935303, "logits/rejected": -2.234872341156006, "logps/chosen": -0.2073766440153122, "logps/rejected": -0.21857210993766785, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9428232312202454, "rewards/margins": 0.030468106269836426, "rewards/rejected": 0.9123551249504089, "step": 3052 }, { "epoch": 1.65, "learning_rate": 4.452269002688896e-08, "logits/chosen": -2.0925257205963135, "logits/rejected": -2.3171944618225098, "logps/chosen": -0.4919309914112091, "logps/rejected": -0.5008898973464966, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0101993083953857, "rewards/margins": 0.008490204811096191, "rewards/rejected": 1.0017091035842896, "step": 3053 }, { "epoch": 1.65, "learning_rate": 4.449375040460519e-08, "logits/chosen": -2.078885078430176, "logits/rejected": -2.2721521854400635, "logps/chosen": -1.0424976348876953, "logps/rejected": -1.0998247861862183, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9026705622673035, "rewards/margins": 0.028440237045288086, "rewards/rejected": 0.8742303252220154, "step": 3054 }, { "epoch": 1.65, "learning_rate": 4.4464812649443535e-08, "logits/chosen": -2.0807812213897705, "logits/rejected": -2.090634346008301, "logps/chosen": -1.6967037916183472, "logps/rejected": -2.488546848297119, "loss": 0.5032, "rewards/accuracies": 1.0, "rewards/chosen": 0.942116379737854, "rewards/margins": 0.42455607652664185, "rewards/rejected": 0.5175603032112122, "step": 3055 }, { "epoch": 1.65, "learning_rate": 4.4435876771216565e-08, "logits/chosen": -2.035785436630249, "logits/rejected": -2.2374801635742188, "logps/chosen": -1.2719300985336304, "logps/rejected": -1.2452247142791748, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.9465853571891785, "rewards/margins": 0.015007972717285156, "rewards/rejected": 0.9315773844718933, "step": 3056 }, { "epoch": 1.65, "learning_rate": 4.4406942779736196e-08, "logits/chosen": -2.040344715118408, "logits/rejected": -2.2552390098571777, "logps/chosen": -0.318085640668869, "logps/rejected": -0.381915420293808, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.770526111125946, "rewards/margins": 0.012491226196289062, "rewards/rejected": 0.758034884929657, "step": 3057 }, { "epoch": 1.65, "learning_rate": 4.4378010684813696e-08, "logits/chosen": -2.1308419704437256, "logits/rejected": -2.233375072479248, "logps/chosen": -0.3755864202976227, "logps/rejected": -0.4824371039867401, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.95750492811203, "rewards/margins": 0.031029224395751953, "rewards/rejected": 0.9264757037162781, "step": 3058 }, { "epoch": 1.65, "learning_rate": 4.434908049625969e-08, "logits/chosen": -2.1990084648132324, "logits/rejected": -2.278249979019165, "logps/chosen": -9.2626314163208, "logps/rejected": -5.795601844787598, "loss": 0.72, "rewards/accuracies": 0.0, "rewards/chosen": 0.7466088533401489, "rewards/margins": -0.052957236766815186, "rewards/rejected": 0.7995660901069641, "step": 3059 }, { "epoch": 1.65, "learning_rate": 4.432015222388415e-08, "logits/chosen": -2.010629892349243, "logits/rejected": -2.2800164222717285, "logps/chosen": -0.5165418386459351, "logps/rejected": -5.697305679321289, "loss": 0.601, "rewards/accuracies": 1.0, "rewards/chosen": 0.8156309127807617, "rewards/margins": 0.19374561309814453, "rewards/rejected": 0.6218852996826172, "step": 3060 }, { "epoch": 1.65, "learning_rate": 4.42912258774964e-08, "logits/chosen": -2.0187017917633057, "logits/rejected": -2.27799129486084, "logps/chosen": -0.6011108756065369, "logps/rejected": -0.7771122455596924, "loss": 0.6713, "rewards/accuracies": 1.0, "rewards/chosen": 0.8097441792488098, "rewards/margins": 0.04421651363372803, "rewards/rejected": 0.7655276656150818, "step": 3061 }, { "epoch": 1.65, "learning_rate": 4.426230146690514e-08, "logits/chosen": -1.9750133752822876, "logits/rejected": -2.284468650817871, "logps/chosen": -1.0193774700164795, "logps/rejected": -8.182465553283691, "loss": 0.5089, "rewards/accuracies": 1.0, "rewards/chosen": 0.8381251692771912, "rewards/margins": 0.41031667590141296, "rewards/rejected": 0.4278084933757782, "step": 3062 }, { "epoch": 1.65, "learning_rate": 4.423337900191837e-08, "logits/chosen": -2.0488452911376953, "logits/rejected": -2.050297498703003, "logps/chosen": -1.9753143787384033, "logps/rejected": -1.7377784252166748, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 1.1668118238449097, "rewards/margins": 0.049892544746398926, "rewards/rejected": 1.1169192790985107, "step": 3063 }, { "epoch": 1.65, "learning_rate": 4.4204458492343456e-08, "logits/chosen": -2.032097339630127, "logits/rejected": -2.293031930923462, "logps/chosen": -5.818583011627197, "logps/rejected": -7.152808666229248, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 0.5857262015342712, "rewards/margins": 0.15433529019355774, "rewards/rejected": 0.4313909113407135, "step": 3064 }, { "epoch": 1.65, "learning_rate": 4.417553994798707e-08, "logits/chosen": -1.9896624088287354, "logits/rejected": -1.9915472269058228, "logps/chosen": -1.4731109142303467, "logps/rejected": -3.4044911861419678, "loss": 0.5412, "rewards/accuracies": 1.0, "rewards/chosen": 0.8962721824645996, "rewards/margins": 0.3311231732368469, "rewards/rejected": 0.5651490092277527, "step": 3065 }, { "epoch": 1.65, "learning_rate": 4.414662337865529e-08, "logits/chosen": -2.056879758834839, "logits/rejected": -2.232778549194336, "logps/chosen": -1.9691170454025269, "logps/rejected": -4.672840595245361, "loss": 0.6398, "rewards/accuracies": 1.0, "rewards/chosen": 1.0230934619903564, "rewards/margins": 0.10973703861236572, "rewards/rejected": 0.9133564233779907, "step": 3066 }, { "epoch": 1.65, "learning_rate": 4.4117708794153466e-08, "logits/chosen": -2.0450072288513184, "logits/rejected": -2.0490310192108154, "logps/chosen": -10.29726505279541, "logps/rejected": -4.122173309326172, "loss": 0.6428, "rewards/accuracies": 1.0, "rewards/chosen": 0.9297022819519043, "rewards/margins": 0.10344552993774414, "rewards/rejected": 0.8262567520141602, "step": 3067 }, { "epoch": 1.65, "learning_rate": 4.4088796204286274e-08, "logits/chosen": -2.201117753982544, "logits/rejected": -2.1997997760772705, "logps/chosen": -2.5716209411621094, "logps/rejected": -4.982989311218262, "loss": 0.326, "rewards/accuracies": 1.0, "rewards/chosen": 1.379044771194458, "rewards/margins": 0.9533272981643677, "rewards/rejected": 0.42571744322776794, "step": 3068 }, { "epoch": 1.66, "learning_rate": 4.405988561885772e-08, "logits/chosen": -2.3162388801574707, "logits/rejected": -2.17708683013916, "logps/chosen": -36.508018493652344, "logps/rejected": -1.7350549697875977, "loss": 0.1951, "rewards/accuracies": 1.0, "rewards/chosen": 2.436265707015991, "rewards/margins": 1.5351536273956299, "rewards/rejected": 0.9011120200157166, "step": 3069 }, { "epoch": 1.66, "learning_rate": 4.403097704767115e-08, "logits/chosen": -2.2330024242401123, "logits/rejected": -2.0470876693725586, "logps/chosen": -44.36760711669922, "logps/rejected": -2.54746150970459, "loss": 0.1247, "rewards/accuracies": 1.0, "rewards/chosen": 2.6678390502929688, "rewards/margins": 2.01924467086792, "rewards/rejected": 0.6485944986343384, "step": 3070 }, { "epoch": 1.66, "learning_rate": 4.40020705005292e-08, "logits/chosen": -2.0351080894470215, "logits/rejected": -2.0346624851226807, "logps/chosen": -3.8250763416290283, "logps/rejected": -2.403376579284668, "loss": 0.4775, "rewards/accuracies": 1.0, "rewards/chosen": 1.3413605690002441, "rewards/margins": 0.4908687472343445, "rewards/rejected": 0.8504918217658997, "step": 3071 }, { "epoch": 1.66, "learning_rate": 4.397316598723385e-08, "logits/chosen": -2.067530870437622, "logits/rejected": -2.297339677810669, "logps/chosen": -0.6960121393203735, "logps/rejected": -0.6852744817733765, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 1.1541725397109985, "rewards/margins": 0.0012400150299072266, "rewards/rejected": 1.1529325246810913, "step": 3072 }, { "epoch": 1.66, "learning_rate": 4.3944263517586365e-08, "logits/chosen": -2.158402681350708, "logits/rejected": -2.3209476470947266, "logps/chosen": -4.9740190505981445, "logps/rejected": -0.5329495668411255, "loss": 0.7237, "rewards/accuracies": 0.0, "rewards/chosen": 0.9131189584732056, "rewards/margins": -0.06026250123977661, "rewards/rejected": 0.9733814597129822, "step": 3073 }, { "epoch": 1.66, "learning_rate": 4.3915363101387316e-08, "logits/chosen": -2.116334915161133, "logits/rejected": -2.115567445755005, "logps/chosen": -4.5442633628845215, "logps/rejected": -5.711089611053467, "loss": 0.3405, "rewards/accuracies": 1.0, "rewards/chosen": 1.364068627357483, "rewards/margins": 0.9023120403289795, "rewards/rejected": 0.46175655722618103, "step": 3074 }, { "epoch": 1.66, "learning_rate": 4.3886464748436594e-08, "logits/chosen": -2.1553943157196045, "logits/rejected": -2.37699294090271, "logps/chosen": -15.280245780944824, "logps/rejected": -10.028669357299805, "loss": 0.5305, "rewards/accuracies": 1.0, "rewards/chosen": 1.1025283336639404, "rewards/margins": 0.3568902611732483, "rewards/rejected": 0.7456380724906921, "step": 3075 }, { "epoch": 1.66, "learning_rate": 4.385756846853338e-08, "logits/chosen": -2.029045581817627, "logits/rejected": -2.026737689971924, "logps/chosen": -5.870930194854736, "logps/rejected": -3.421053886413574, "loss": 0.3346, "rewards/accuracies": 1.0, "rewards/chosen": 1.4752216339111328, "rewards/margins": 0.9229176640510559, "rewards/rejected": 0.5523039698600769, "step": 3076 }, { "epoch": 1.66, "learning_rate": 4.382867427147617e-08, "logits/chosen": -2.0444343090057373, "logits/rejected": -2.318028688430786, "logps/chosen": -0.36308616399765015, "logps/rejected": -0.39403825998306274, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8863824009895325, "rewards/margins": 0.02156895399093628, "rewards/rejected": 0.8648134469985962, "step": 3077 }, { "epoch": 1.66, "learning_rate": 4.3799782167062735e-08, "logits/chosen": -2.0750322341918945, "logits/rejected": -2.2618002891540527, "logps/chosen": -0.43642836809158325, "logps/rejected": -0.45680496096611023, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420230984687805, "rewards/margins": 0.01621878147125244, "rewards/rejected": 0.9258043169975281, "step": 3078 }, { "epoch": 1.66, "learning_rate": 4.3770892165090123e-08, "logits/chosen": -2.1274027824401855, "logits/rejected": -2.2861886024475098, "logps/chosen": -0.4229698181152344, "logps/rejected": -0.4216013550758362, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.8290193676948547, "rewards/margins": 0.024693608283996582, "rewards/rejected": 0.8043257594108582, "step": 3079 }, { "epoch": 1.66, "learning_rate": 4.374200427535469e-08, "logits/chosen": -2.0561959743499756, "logits/rejected": -2.0986244678497314, "logps/chosen": -4.092458248138428, "logps/rejected": -9.632867813110352, "loss": 0.2596, "rewards/accuracies": 1.0, "rewards/chosen": 1.675511360168457, "rewards/margins": 1.2161405086517334, "rewards/rejected": 0.45937082171440125, "step": 3080 }, { "epoch": 1.66, "learning_rate": 4.3713118507652104e-08, "logits/chosen": -2.154439926147461, "logits/rejected": -2.2868399620056152, "logps/chosen": -0.26996612548828125, "logps/rejected": -0.2869154214859009, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9420924186706543, "rewards/margins": 0.014508306980133057, "rewards/rejected": 0.9275841116905212, "step": 3081 }, { "epoch": 1.66, "learning_rate": 4.3684234871777246e-08, "logits/chosen": -2.081850051879883, "logits/rejected": -2.0826640129089355, "logps/chosen": -1.6302802562713623, "logps/rejected": -1.8577542304992676, "loss": 0.6556, "rewards/accuracies": 1.0, "rewards/chosen": 0.9976493716239929, "rewards/margins": 0.07660937309265137, "rewards/rejected": 0.9210399985313416, "step": 3082 }, { "epoch": 1.66, "learning_rate": 4.3655353377524324e-08, "logits/chosen": -2.108765125274658, "logits/rejected": -2.2741782665252686, "logps/chosen": -0.8255252838134766, "logps/rejected": -0.8469420671463013, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.904853343963623, "rewards/margins": 0.0030198097229003906, "rewards/rejected": 0.9018335342407227, "step": 3083 }, { "epoch": 1.66, "learning_rate": 4.3626474034686795e-08, "logits/chosen": -2.143666982650757, "logits/rejected": -2.291954755783081, "logps/chosen": -2.2296173572540283, "logps/rejected": -2.413038492202759, "loss": 0.675, "rewards/accuracies": 1.0, "rewards/chosen": 0.7014225125312805, "rewards/margins": 0.03656214475631714, "rewards/rejected": 0.6648603677749634, "step": 3084 }, { "epoch": 1.66, "learning_rate": 4.359759685305739e-08, "logits/chosen": -2.0533456802368164, "logits/rejected": -2.059659481048584, "logps/chosen": -2.2517664432525635, "logps/rejected": -3.74874210357666, "loss": 0.3363, "rewards/accuracies": 1.0, "rewards/chosen": 1.460756540298462, "rewards/margins": 0.9169042706489563, "rewards/rejected": 0.5438522696495056, "step": 3085 }, { "epoch": 1.66, "learning_rate": 4.356872184242816e-08, "logits/chosen": -2.2274932861328125, "logits/rejected": -2.246670722961426, "logps/chosen": -2.8345084190368652, "logps/rejected": -8.237135887145996, "loss": 0.3574, "rewards/accuracies": 1.0, "rewards/chosen": 1.3772162199020386, "rewards/margins": 0.8449029326438904, "rewards/rejected": 0.5323132872581482, "step": 3086 }, { "epoch": 1.67, "learning_rate": 4.353984901259034e-08, "logits/chosen": -2.129002571105957, "logits/rejected": -2.1295950412750244, "logps/chosen": -1.1067625284194946, "logps/rejected": -1.6027201414108276, "loss": 0.5289, "rewards/accuracies": 1.0, "rewards/chosen": 1.169783353805542, "rewards/margins": 0.360762357711792, "rewards/rejected": 0.80902099609375, "step": 3087 }, { "epoch": 1.67, "learning_rate": 4.3510978373334455e-08, "logits/chosen": -2.202693462371826, "logits/rejected": -2.1991961002349854, "logps/chosen": -0.7440209984779358, "logps/rejected": -6.5350422859191895, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 1.066749095916748, "rewards/margins": 0.7097370624542236, "rewards/rejected": 0.3570120334625244, "step": 3088 }, { "epoch": 1.67, "learning_rate": 4.34821099344503e-08, "logits/chosen": -2.10628080368042, "logits/rejected": -2.087179183959961, "logps/chosen": -4.391078948974609, "logps/rejected": -5.305156707763672, "loss": 0.46, "rewards/accuracies": 1.0, "rewards/chosen": 1.1341179609298706, "rewards/margins": 0.5377100706100464, "rewards/rejected": 0.5964078903198242, "step": 3089 }, { "epoch": 1.67, "learning_rate": 4.345324370572692e-08, "logits/chosen": -1.997402310371399, "logits/rejected": -1.9982620477676392, "logps/chosen": -0.20414523780345917, "logps/rejected": -5.255125045776367, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9458088874816895, "rewards/margins": 0.536292314529419, "rewards/rejected": 0.4095165431499481, "step": 3090 }, { "epoch": 1.67, "learning_rate": 4.342437969695261e-08, "logits/chosen": -2.1030352115631104, "logits/rejected": -2.277608633041382, "logps/chosen": -1.399962306022644, "logps/rejected": -1.2488077878952026, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8431890606880188, "rewards/margins": 0.011164844036102295, "rewards/rejected": 0.8320242166519165, "step": 3091 }, { "epoch": 1.67, "learning_rate": 4.339551791791489e-08, "logits/chosen": -2.1143579483032227, "logits/rejected": -2.221334218978882, "logps/chosen": -1.3574802875518799, "logps/rejected": -1.4660307168960571, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 0.9321851134300232, "rewards/margins": -0.011235713958740234, "rewards/rejected": 0.9434208273887634, "step": 3092 }, { "epoch": 1.67, "learning_rate": 4.336665837840056e-08, "logits/chosen": -2.0192785263061523, "logits/rejected": -2.023787498474121, "logps/chosen": -5.128576755523682, "logps/rejected": -1.8923351764678955, "loss": 0.5981, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900409579277039, "rewards/margins": 0.20006614923477173, "rewards/rejected": 0.7899748086929321, "step": 3093 }, { "epoch": 1.67, "learning_rate": 4.333780108819563e-08, "logits/chosen": -2.048948049545288, "logits/rejected": -2.2182648181915283, "logps/chosen": -0.6267455220222473, "logps/rejected": -1.7232023477554321, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.9290639162063599, "rewards/margins": 0.04615718126296997, "rewards/rejected": 0.8829067349433899, "step": 3094 }, { "epoch": 1.67, "learning_rate": 4.330894605708535e-08, "logits/chosen": -2.050438404083252, "logits/rejected": -2.28871488571167, "logps/chosen": -1.3204965591430664, "logps/rejected": -4.153494834899902, "loss": 0.673, "rewards/accuracies": 1.0, "rewards/chosen": 1.0407814979553223, "rewards/margins": 0.04075777530670166, "rewards/rejected": 1.0000237226486206, "step": 3095 }, { "epoch": 1.67, "learning_rate": 4.3280093294854254e-08, "logits/chosen": -1.9377729892730713, "logits/rejected": -1.9454050064086914, "logps/chosen": -1.4364835023880005, "logps/rejected": -2.953601598739624, "loss": 0.5123, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254017114639282, "rewards/margins": 0.40175706148147583, "rewards/rejected": 0.6236446499824524, "step": 3096 }, { "epoch": 1.67, "learning_rate": 4.3251242811286045e-08, "logits/chosen": -2.0548412799835205, "logits/rejected": -2.22160005569458, "logps/chosen": -4.404316425323486, "logps/rejected": -6.582533359527588, "loss": 0.4984, "rewards/accuracies": 1.0, "rewards/chosen": 0.8911346793174744, "rewards/margins": 0.4368995428085327, "rewards/rejected": 0.45423513650894165, "step": 3097 }, { "epoch": 1.67, "learning_rate": 4.322239461616367e-08, "logits/chosen": -1.9910088777542114, "logits/rejected": -2.2849574089050293, "logps/chosen": -0.1944063901901245, "logps/rejected": -0.20779262483119965, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9042410254478455, "rewards/margins": 0.012526929378509521, "rewards/rejected": 0.8917140960693359, "step": 3098 }, { "epoch": 1.67, "learning_rate": 4.31935487192693e-08, "logits/chosen": -2.0094242095947266, "logits/rejected": -2.021413564682007, "logps/chosen": -2.754551887512207, "logps/rejected": -1.6202125549316406, "loss": 0.4248, "rewards/accuracies": 1.0, "rewards/chosen": 1.324522852897644, "rewards/margins": 0.6363471746444702, "rewards/rejected": 0.6881756782531738, "step": 3099 }, { "epoch": 1.67, "learning_rate": 4.3164705130384356e-08, "logits/chosen": -1.9876813888549805, "logits/rejected": -2.2219161987304688, "logps/chosen": -2.0061216354370117, "logps/rejected": -0.9642091393470764, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8130621314048767, "rewards/margins": 0.04948538541793823, "rewards/rejected": 0.7635767459869385, "step": 3100 }, { "epoch": 1.67, "learning_rate": 4.313586385928944e-08, "logits/chosen": -2.0294017791748047, "logits/rejected": -2.020420789718628, "logps/chosen": -9.769474029541016, "logps/rejected": -0.7815731167793274, "loss": 0.5791, "rewards/accuracies": 1.0, "rewards/chosen": 1.1516121625900269, "rewards/margins": 0.2427060604095459, "rewards/rejected": 0.908906102180481, "step": 3101 }, { "epoch": 1.67, "learning_rate": 4.310702491576436e-08, "logits/chosen": -2.1735169887542725, "logits/rejected": -2.1777455806732178, "logps/chosen": -0.23638023436069489, "logps/rejected": -3.8269901275634766, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 0.8936319351196289, "rewards/margins": 0.45451077818870544, "rewards/rejected": 0.43912115693092346, "step": 3102 }, { "epoch": 1.67, "learning_rate": 4.307818830958818e-08, "logits/chosen": -2.0248842239379883, "logits/rejected": -2.0253026485443115, "logps/chosen": -1.4491512775421143, "logps/rejected": -1.9839047193527222, "loss": 0.5991, "rewards/accuracies": 1.0, "rewards/chosen": 1.0732285976409912, "rewards/margins": 0.19777405261993408, "rewards/rejected": 0.8754545450210571, "step": 3103 }, { "epoch": 1.67, "learning_rate": 4.304935405053913e-08, "logits/chosen": -2.1073882579803467, "logits/rejected": -2.110106945037842, "logps/chosen": -0.8651831746101379, "logps/rejected": -3.131897449493408, "loss": 0.5019, "rewards/accuracies": 1.0, "rewards/chosen": 1.0138531923294067, "rewards/margins": 0.42787617444992065, "rewards/rejected": 0.5859770178794861, "step": 3104 }, { "epoch": 1.67, "learning_rate": 4.302052214839467e-08, "logits/chosen": -2.0545313358306885, "logits/rejected": -2.0557923316955566, "logps/chosen": -1.2086437940597534, "logps/rejected": -2.4709999561309814, "loss": 0.5481, "rewards/accuracies": 1.0, "rewards/chosen": 0.8947475552558899, "rewards/margins": 0.3148308992385864, "rewards/rejected": 0.5799166560173035, "step": 3105 }, { "epoch": 1.68, "learning_rate": 4.299169261293147e-08, "logits/chosen": -2.10617733001709, "logits/rejected": -2.1052629947662354, "logps/chosen": -0.9165724515914917, "logps/rejected": -1.2249832153320312, "loss": 0.6377, "rewards/accuracies": 1.0, "rewards/chosen": 0.9801054000854492, "rewards/margins": 0.11409503221511841, "rewards/rejected": 0.8660103678703308, "step": 3106 }, { "epoch": 1.68, "learning_rate": 4.296286545392535e-08, "logits/chosen": -2.071265935897827, "logits/rejected": -2.215052843093872, "logps/chosen": -2.609241008758545, "logps/rejected": -2.492936372756958, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.717369794845581, "rewards/margins": -0.004500150680541992, "rewards/rejected": 0.721869945526123, "step": 3107 }, { "epoch": 1.68, "learning_rate": 4.293404068115136e-08, "logits/chosen": -2.0399272441864014, "logits/rejected": -2.0937254428863525, "logps/chosen": -1.7999589443206787, "logps/rejected": -8.078473091125488, "loss": 0.4534, "rewards/accuracies": 1.0, "rewards/chosen": 1.4771686792373657, "rewards/margins": 0.5557339787483215, "rewards/rejected": 0.9214347004890442, "step": 3108 }, { "epoch": 1.68, "learning_rate": 4.290521830438375e-08, "logits/chosen": -2.1051671504974365, "logits/rejected": -2.112612247467041, "logps/chosen": -2.0218420028686523, "logps/rejected": -3.5897634029388428, "loss": 0.4605, "rewards/accuracies": 1.0, "rewards/chosen": 1.3084748983383179, "rewards/margins": 0.5363302826881409, "rewards/rejected": 0.772144615650177, "step": 3109 }, { "epoch": 1.68, "learning_rate": 4.287639833339592e-08, "logits/chosen": -2.1503002643585205, "logits/rejected": -2.2921061515808105, "logps/chosen": -0.45238178968429565, "logps/rejected": -0.4293379485607147, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9440291523933411, "rewards/margins": 0.012497901916503906, "rewards/rejected": 0.9315312504768372, "step": 3110 }, { "epoch": 1.68, "learning_rate": 4.284758077796049e-08, "logits/chosen": -2.1698458194732666, "logits/rejected": -2.067049264907837, "logps/chosen": -25.72083854675293, "logps/rejected": -3.6697170734405518, "loss": 0.1773, "rewards/accuracies": 1.0, "rewards/chosen": 2.2240869998931885, "rewards/margins": 1.6399316787719727, "rewards/rejected": 0.5841553807258606, "step": 3111 }, { "epoch": 1.68, "learning_rate": 4.281876564784922e-08, "logits/chosen": -2.054844379425049, "logits/rejected": -2.3162784576416016, "logps/chosen": -1.1118215322494507, "logps/rejected": -1.0623372793197632, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.793837308883667, "rewards/margins": 0.014503955841064453, "rewards/rejected": 0.7793333530426025, "step": 3112 }, { "epoch": 1.68, "learning_rate": 4.278995295283312e-08, "logits/chosen": -2.098686933517456, "logits/rejected": -2.0983448028564453, "logps/chosen": -0.9546384215354919, "logps/rejected": -1.3240752220153809, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9269894957542419, "rewards/margins": 0.06584024429321289, "rewards/rejected": 0.861149251461029, "step": 3113 }, { "epoch": 1.68, "learning_rate": 4.2761142702682274e-08, "logits/chosen": -2.131377696990967, "logits/rejected": -2.142713785171509, "logps/chosen": -2.7571351528167725, "logps/rejected": -3.3256900310516357, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 1.3887319564819336, "rewards/margins": 0.4586264491081238, "rewards/rejected": 0.9301055073738098, "step": 3114 }, { "epoch": 1.68, "learning_rate": 4.273233490716604e-08, "logits/chosen": -2.0095417499542236, "logits/rejected": -2.0187783241271973, "logps/chosen": -1.5416098833084106, "logps/rejected": -3.453537702560425, "loss": 0.4978, "rewards/accuracies": 1.0, "rewards/chosen": 0.9674442410469055, "rewards/margins": 0.4383851885795593, "rewards/rejected": 0.5290590524673462, "step": 3115 }, { "epoch": 1.68, "learning_rate": 4.2703529576052875e-08, "logits/chosen": -2.0996882915496826, "logits/rejected": -2.2670340538024902, "logps/chosen": -1.2712445259094238, "logps/rejected": -10.039430618286133, "loss": 0.6286, "rewards/accuracies": 1.0, "rewards/chosen": 1.0376704931259155, "rewards/margins": 0.1335107684135437, "rewards/rejected": 0.9041597247123718, "step": 3116 }, { "epoch": 1.68, "learning_rate": 4.2674726719110434e-08, "logits/chosen": -1.9550408124923706, "logits/rejected": -2.2995243072509766, "logps/chosen": -0.18262837827205658, "logps/rejected": -0.23060208559036255, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.9467417001724243, "rewards/margins": 0.003661632537841797, "rewards/rejected": 0.9430800676345825, "step": 3117 }, { "epoch": 1.68, "learning_rate": 4.2645926346105484e-08, "logits/chosen": -1.9389479160308838, "logits/rejected": -1.9471745491027832, "logps/chosen": -1.2759780883789062, "logps/rejected": -2.6136090755462646, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 1.0770905017852783, "rewards/margins": 0.4744458794593811, "rewards/rejected": 0.6026446223258972, "step": 3118 }, { "epoch": 1.68, "learning_rate": 4.261712846680402e-08, "logits/chosen": -2.0498242378234863, "logits/rejected": -2.0434906482696533, "logps/chosen": -4.45607328414917, "logps/rejected": -3.338909864425659, "loss": 0.4252, "rewards/accuracies": 1.0, "rewards/chosen": 1.7649189233779907, "rewards/margins": 0.6351288557052612, "rewards/rejected": 1.1297900676727295, "step": 3119 }, { "epoch": 1.68, "learning_rate": 4.258833309097115e-08, "logits/chosen": -2.036961793899536, "logits/rejected": -2.021322250366211, "logps/chosen": -2.517685890197754, "logps/rejected": -2.9630086421966553, "loss": 0.4354, "rewards/accuracies": 1.0, "rewards/chosen": 1.2457833290100098, "rewards/margins": 0.6059912443161011, "rewards/rejected": 0.6397920846939087, "step": 3120 }, { "epoch": 1.68, "learning_rate": 4.2559540228371125e-08, "logits/chosen": -2.13297438621521, "logits/rejected": -2.1242380142211914, "logps/chosen": -3.446207046508789, "logps/rejected": -4.66754150390625, "loss": 0.524, "rewards/accuracies": 1.0, "rewards/chosen": 1.357290267944336, "rewards/margins": 0.37274169921875, "rewards/rejected": 0.9845485687255859, "step": 3121 }, { "epoch": 1.68, "learning_rate": 4.253074988876737e-08, "logits/chosen": -2.011382579803467, "logits/rejected": -2.016791582107544, "logps/chosen": -1.657802939414978, "logps/rejected": -4.411585807800293, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.121389627456665, "rewards/margins": 0.5983514189720154, "rewards/rejected": 0.5230382084846497, "step": 3122 }, { "epoch": 1.68, "learning_rate": 4.250196208192244e-08, "logits/chosen": -2.192711353302002, "logits/rejected": -2.303927421569824, "logps/chosen": -2.055820941925049, "logps/rejected": -2.0511536598205566, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 0.7668778896331787, "rewards/margins": 0.02022266387939453, "rewards/rejected": 0.7466552257537842, "step": 3123 }, { "epoch": 1.69, "learning_rate": 4.247317681759801e-08, "logits/chosen": -2.094184637069702, "logits/rejected": -2.281766414642334, "logps/chosen": -0.5965001583099365, "logps/rejected": -0.6520053744316101, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.7090350389480591, "rewards/margins": 0.012111783027648926, "rewards/rejected": 0.6969232559204102, "step": 3124 }, { "epoch": 1.69, "learning_rate": 4.244439410555497e-08, "logits/chosen": -2.076601505279541, "logits/rejected": -2.2620232105255127, "logps/chosen": -0.4321156442165375, "logps/rejected": -0.5036346912384033, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.9150974154472351, "rewards/margins": 0.02353692054748535, "rewards/rejected": 0.8915604948997498, "step": 3125 }, { "epoch": 1.69, "learning_rate": 4.241561395555326e-08, "logits/chosen": -2.2440435886383057, "logits/rejected": -2.2348220348358154, "logps/chosen": -1.3207380771636963, "logps/rejected": -3.45566463470459, "loss": 0.4936, "rewards/accuracies": 1.0, "rewards/chosen": 0.9720980525016785, "rewards/margins": 0.4492031931877136, "rewards/rejected": 0.5228948593139648, "step": 3126 }, { "epoch": 1.69, "learning_rate": 4.238683637735199e-08, "logits/chosen": -1.9947909116744995, "logits/rejected": -2.011317014694214, "logps/chosen": -1.5472743511199951, "logps/rejected": -8.936151504516602, "loss": 0.4476, "rewards/accuracies": 1.0, "rewards/chosen": 1.1553676128387451, "rewards/margins": 0.5715987682342529, "rewards/rejected": 0.5837688446044922, "step": 3127 }, { "epoch": 1.69, "learning_rate": 4.235806138070937e-08, "logits/chosen": -2.0991547107696533, "logits/rejected": -2.379171371459961, "logps/chosen": -17.171173095703125, "logps/rejected": -14.37460708618164, "loss": 0.7888, "rewards/accuracies": 0.0, "rewards/chosen": 1.0411567687988281, "rewards/margins": -0.18293464183807373, "rewards/rejected": 1.2240914106369019, "step": 3128 }, { "epoch": 1.69, "learning_rate": 4.2329288975382785e-08, "logits/chosen": -2.0439376831054688, "logits/rejected": -2.3119313716888428, "logps/chosen": -0.7331148386001587, "logps/rejected": -0.9545602202415466, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9284756779670715, "rewards/margins": 0.008681654930114746, "rewards/rejected": 0.9197940230369568, "step": 3129 }, { "epoch": 1.69, "learning_rate": 4.230051917112869e-08, "logits/chosen": -1.9899770021438599, "logits/rejected": -1.9854223728179932, "logps/chosen": -7.112428188323975, "logps/rejected": -3.659274101257324, "loss": 0.4378, "rewards/accuracies": 1.0, "rewards/chosen": 1.6194757223129272, "rewards/margins": 0.5992107391357422, "rewards/rejected": 1.020264983177185, "step": 3130 }, { "epoch": 1.69, "learning_rate": 4.2271751977702695e-08, "logits/chosen": -2.3384876251220703, "logits/rejected": -2.1945714950561523, "logps/chosen": -22.836994171142578, "logps/rejected": -1.7484198808670044, "loss": 0.2042, "rewards/accuracies": 1.0, "rewards/chosen": 2.128990888595581, "rewards/margins": 1.4848542213439941, "rewards/rejected": 0.6441366076469421, "step": 3131 }, { "epoch": 1.69, "learning_rate": 4.224298740485949e-08, "logits/chosen": -2.075065851211548, "logits/rejected": -2.0754587650299072, "logps/chosen": -0.7773787379264832, "logps/rejected": -2.377150535583496, "loss": 0.6116, "rewards/accuracies": 1.0, "rewards/chosen": 0.7991446852684021, "rewards/margins": 0.17037832736968994, "rewards/rejected": 0.6287663578987122, "step": 3132 }, { "epoch": 1.69, "learning_rate": 4.221422546235292e-08, "logits/chosen": -2.0887303352355957, "logits/rejected": -2.095370292663574, "logps/chosen": -2.597853183746338, "logps/rejected": -5.4469218254089355, "loss": 0.3895, "rewards/accuracies": 1.0, "rewards/chosen": 1.1370328664779663, "rewards/margins": 0.7416754961013794, "rewards/rejected": 0.3953573703765869, "step": 3133 }, { "epoch": 1.69, "learning_rate": 4.218546615993588e-08, "logits/chosen": -2.034219264984131, "logits/rejected": -2.036214590072632, "logps/chosen": -2.5441412925720215, "logps/rejected": -1.0761034488677979, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 1.5244215726852417, "rewards/margins": 0.5846998691558838, "rewards/rejected": 0.9397217035293579, "step": 3134 }, { "epoch": 1.69, "learning_rate": 4.215670950736044e-08, "logits/chosen": -2.0948591232299805, "logits/rejected": -2.1049721240997314, "logps/chosen": -17.619604110717773, "logps/rejected": -7.309677600860596, "loss": 0.1987, "rewards/accuracies": 1.0, "rewards/chosen": 2.140631675720215, "rewards/margins": 1.5150468349456787, "rewards/rejected": 0.6255847811698914, "step": 3135 }, { "epoch": 1.69, "learning_rate": 4.2127955514377725e-08, "logits/chosen": -2.0867056846618652, "logits/rejected": -2.297603130340576, "logps/chosen": -2.8190019130706787, "logps/rejected": -2.716839551925659, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.6828848123550415, "rewards/margins": 0.0315362811088562, "rewards/rejected": 0.6513485312461853, "step": 3136 }, { "epoch": 1.69, "learning_rate": 4.209920419073795e-08, "logits/chosen": -1.8542269468307495, "logits/rejected": -2.3022000789642334, "logps/chosen": -0.8518403172492981, "logps/rejected": -1.127210259437561, "loss": 0.6611, "rewards/accuracies": 1.0, "rewards/chosen": 0.7773732542991638, "rewards/margins": 0.06506478786468506, "rewards/rejected": 0.7123084664344788, "step": 3137 }, { "epoch": 1.69, "learning_rate": 4.2070455546190464e-08, "logits/chosen": -1.986461877822876, "logits/rejected": -2.312497138977051, "logps/chosen": -1.0419162511825562, "logps/rejected": -0.9358710050582886, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.7337145209312439, "rewards/margins": 0.022658348083496094, "rewards/rejected": 0.7110561728477478, "step": 3138 }, { "epoch": 1.69, "learning_rate": 4.204170959048368e-08, "logits/chosen": -2.0454185009002686, "logits/rejected": -2.042907953262329, "logps/chosen": -2.1484627723693848, "logps/rejected": -5.1731414794921875, "loss": 0.4305, "rewards/accuracies": 1.0, "rewards/chosen": 1.022072672843933, "rewards/margins": 0.6197797060012817, "rewards/rejected": 0.402292937040329, "step": 3139 }, { "epoch": 1.69, "learning_rate": 4.20129663333651e-08, "logits/chosen": -2.1703014373779297, "logits/rejected": -2.1959338188171387, "logps/chosen": -15.54863166809082, "logps/rejected": -4.231422424316406, "loss": 0.3568, "rewards/accuracies": 1.0, "rewards/chosen": 1.8258039951324463, "rewards/margins": 0.8470409512519836, "rewards/rejected": 0.9787630438804626, "step": 3140 }, { "epoch": 1.69, "learning_rate": 4.1984225784581304e-08, "logits/chosen": -2.158468008041382, "logits/rejected": -2.262862205505371, "logps/chosen": -3.29958438873291, "logps/rejected": -3.347402572631836, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 1.062083125114441, "rewards/margins": 0.027966737747192383, "rewards/rejected": 1.0341163873672485, "step": 3141 }, { "epoch": 1.69, "learning_rate": 4.195548795387801e-08, "logits/chosen": -2.0483219623565674, "logits/rejected": -2.2904136180877686, "logps/chosen": -3.630782127380371, "logps/rejected": -3.2912774085998535, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.4516671299934387, "rewards/margins": -0.008264869451522827, "rewards/rejected": 0.45993199944496155, "step": 3142 }, { "epoch": 1.7, "learning_rate": 4.192675285099993e-08, "logits/chosen": -1.9416356086730957, "logits/rejected": -1.9349051713943481, "logps/chosen": -4.439058780670166, "logps/rejected": -4.040067195892334, "loss": 0.2788, "rewards/accuracies": 1.0, "rewards/chosen": 1.5382784605026245, "rewards/margins": 1.1347404718399048, "rewards/rejected": 0.4035379886627197, "step": 3143 }, { "epoch": 1.7, "learning_rate": 4.189802048569089e-08, "logits/chosen": -2.1653873920440674, "logits/rejected": -2.1683077812194824, "logps/chosen": -2.7660021781921387, "logps/rejected": -3.8617329597473145, "loss": 0.4108, "rewards/accuracies": 1.0, "rewards/chosen": 1.2158501148223877, "rewards/margins": 0.6771256327629089, "rewards/rejected": 0.5387244820594788, "step": 3144 }, { "epoch": 1.7, "learning_rate": 4.186929086769382e-08, "logits/chosen": -2.0238304138183594, "logits/rejected": -2.203885078430176, "logps/chosen": -0.5335524082183838, "logps/rejected": -0.5603464245796204, "loss": 0.6715, "rewards/accuracies": 1.0, "rewards/chosen": 0.9500948786735535, "rewards/margins": 0.043755531311035156, "rewards/rejected": 0.9063393473625183, "step": 3145 }, { "epoch": 1.7, "learning_rate": 4.184056400675067e-08, "logits/chosen": -2.0536270141601562, "logits/rejected": -2.2582309246063232, "logps/chosen": -0.42216795682907104, "logps/rejected": -0.41129153966903687, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.8321020007133484, "rewards/margins": -0.008552193641662598, "rewards/rejected": 0.840654194355011, "step": 3146 }, { "epoch": 1.7, "learning_rate": 4.181183991260247e-08, "logits/chosen": -2.07745623588562, "logits/rejected": -2.2805988788604736, "logps/chosen": -6.8295464515686035, "logps/rejected": -10.799369812011719, "loss": 0.6103, "rewards/accuracies": 1.0, "rewards/chosen": 0.7935532927513123, "rewards/margins": 0.17314749956130981, "rewards/rejected": 0.6204057931900024, "step": 3147 }, { "epoch": 1.7, "learning_rate": 4.178311859498933e-08, "logits/chosen": -1.962247371673584, "logits/rejected": -1.9629722833633423, "logps/chosen": -3.9392244815826416, "logps/rejected": -1.8530932664871216, "loss": 0.378, "rewards/accuracies": 1.0, "rewards/chosen": 1.4664684534072876, "rewards/margins": 0.7780076265335083, "rewards/rejected": 0.6884608268737793, "step": 3148 }, { "epoch": 1.7, "learning_rate": 4.175440006365038e-08, "logits/chosen": -1.9930436611175537, "logits/rejected": -2.231292486190796, "logps/chosen": -0.7582264542579651, "logps/rejected": -0.8466148376464844, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 1.1118799448013306, "rewards/margins": 0.010334253311157227, "rewards/rejected": 1.1015456914901733, "step": 3149 }, { "epoch": 1.7, "learning_rate": 4.172568432832386e-08, "logits/chosen": -1.9999394416809082, "logits/rejected": -2.0031301975250244, "logps/chosen": -3.887369155883789, "logps/rejected": -1.4395148754119873, "loss": 0.6572, "rewards/accuracies": 1.0, "rewards/chosen": 1.280238389968872, "rewards/margins": 0.07333600521087646, "rewards/rejected": 1.2069023847579956, "step": 3150 }, { "epoch": 1.7, "learning_rate": 4.169697139874699e-08, "logits/chosen": -2.1024160385131836, "logits/rejected": -1.9729195833206177, "logps/chosen": -19.160171508789062, "logps/rejected": -8.834146499633789, "loss": 0.3511, "rewards/accuracies": 1.0, "rewards/chosen": 1.6512874364852905, "rewards/margins": 0.8660924434661865, "rewards/rejected": 0.785194993019104, "step": 3151 }, { "epoch": 1.7, "learning_rate": 4.166826128465611e-08, "logits/chosen": -2.126382350921631, "logits/rejected": -2.125840425491333, "logps/chosen": -1.0746595859527588, "logps/rejected": -3.944967746734619, "loss": 0.5512, "rewards/accuracies": 1.0, "rewards/chosen": 0.9102221727371216, "rewards/margins": 0.30747610330581665, "rewards/rejected": 0.6027460694313049, "step": 3152 }, { "epoch": 1.7, "learning_rate": 4.163955399578654e-08, "logits/chosen": -2.132932186126709, "logits/rejected": -2.326847791671753, "logps/chosen": -1.4449673891067505, "logps/rejected": -1.5124882459640503, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.6666750311851501, "rewards/margins": 0.02562391757965088, "rewards/rejected": 0.6410511136054993, "step": 3153 }, { "epoch": 1.7, "learning_rate": 4.161084954187272e-08, "logits/chosen": -2.044337272644043, "logits/rejected": -2.32257342338562, "logps/chosen": -6.052911758422852, "logps/rejected": -5.901249885559082, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9087910056114197, "rewards/margins": 0.0035584568977355957, "rewards/rejected": 0.9052325487136841, "step": 3154 }, { "epoch": 1.7, "learning_rate": 4.158214793264807e-08, "logits/chosen": -1.9587926864624023, "logits/rejected": -2.29506254196167, "logps/chosen": -0.7686368227005005, "logps/rejected": -12.000659942626953, "loss": 0.7612, "rewards/accuracies": 0.0, "rewards/chosen": 0.8755812048912048, "rewards/margins": -0.131852924823761, "rewards/rejected": 1.0074341297149658, "step": 3155 }, { "epoch": 1.7, "learning_rate": 4.1553449177845045e-08, "logits/chosen": -2.078223943710327, "logits/rejected": -2.280958890914917, "logps/chosen": -0.229145810008049, "logps/rejected": -0.22508540749549866, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.8655828833580017, "rewards/margins": 0.030658721923828125, "rewards/rejected": 0.8349241614341736, "step": 3156 }, { "epoch": 1.7, "learning_rate": 4.1524753287195163e-08, "logits/chosen": -2.125310182571411, "logits/rejected": -2.276904821395874, "logps/chosen": -0.9954875707626343, "logps/rejected": -1.8910945653915405, "loss": 0.6081, "rewards/accuracies": 1.0, "rewards/chosen": 0.9705958366394043, "rewards/margins": 0.17796730995178223, "rewards/rejected": 0.7926285266876221, "step": 3157 }, { "epoch": 1.7, "learning_rate": 4.1496060270428956e-08, "logits/chosen": -2.172790050506592, "logits/rejected": -2.291165351867676, "logps/chosen": -1.1052463054656982, "logps/rejected": -1.1267331838607788, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 1.07102632522583, "rewards/margins": 0.01616048812866211, "rewards/rejected": 1.054865837097168, "step": 3158 }, { "epoch": 1.7, "learning_rate": 4.146737013727597e-08, "logits/chosen": -2.033662796020508, "logits/rejected": -2.288581371307373, "logps/chosen": -8.47470474243164, "logps/rejected": -7.937861442565918, "loss": 0.7036, "rewards/accuracies": 0.0, "rewards/chosen": 0.8331630825996399, "rewards/margins": -0.020803332328796387, "rewards/rejected": 0.8539664149284363, "step": 3159 }, { "epoch": 1.7, "learning_rate": 4.143868289746477e-08, "logits/chosen": -1.9621018171310425, "logits/rejected": -2.255403757095337, "logps/chosen": -3.249875068664551, "logps/rejected": -1.698017954826355, "loss": 0.7908, "rewards/accuracies": 0.0, "rewards/chosen": 0.6082680821418762, "rewards/margins": -0.1866006851196289, "rewards/rejected": 0.7948687672615051, "step": 3160 }, { "epoch": 1.7, "learning_rate": 4.140999856072299e-08, "logits/chosen": -2.0040204524993896, "logits/rejected": -2.005263328552246, "logps/chosen": -1.067935585975647, "logps/rejected": -2.8060250282287598, "loss": 0.4969, "rewards/accuracies": 1.0, "rewards/chosen": 1.0001567602157593, "rewards/margins": 0.4405624270439148, "rewards/rejected": 0.5595943331718445, "step": 3161 }, { "epoch": 1.71, "learning_rate": 4.138131713677723e-08, "logits/chosen": -2.052919387817383, "logits/rejected": -2.056318998336792, "logps/chosen": -1.3921585083007812, "logps/rejected": -4.118378639221191, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 1.20509672164917, "rewards/margins": 0.4766799807548523, "rewards/rejected": 0.7284167408943176, "step": 3162 }, { "epoch": 1.71, "learning_rate": 4.135263863535308e-08, "logits/chosen": -2.072634696960449, "logits/rejected": -2.2913591861724854, "logps/chosen": -8.32507038116455, "logps/rejected": -10.862534523010254, "loss": 0.5608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9197322130203247, "rewards/margins": 0.2848079800605774, "rewards/rejected": 0.6349242329597473, "step": 3163 }, { "epoch": 1.71, "learning_rate": 4.132396306617522e-08, "logits/chosen": -2.0325417518615723, "logits/rejected": -2.324697971343994, "logps/chosen": -1.3184300661087036, "logps/rejected": -1.4818639755249023, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 1.2235345840454102, "rewards/margins": 0.01992189884185791, "rewards/rejected": 1.2036126852035522, "step": 3164 }, { "epoch": 1.71, "learning_rate": 4.129529043896728e-08, "logits/chosen": -2.0013303756713867, "logits/rejected": -2.2472684383392334, "logps/chosen": -0.918536365032196, "logps/rejected": -0.8824451565742493, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671553730964661, "rewards/margins": 0.004334568977355957, "rewards/rejected": 0.8628208041191101, "step": 3165 }, { "epoch": 1.71, "learning_rate": 4.126662076345189e-08, "logits/chosen": -2.0634045600891113, "logits/rejected": -2.3192174434661865, "logps/chosen": -19.29644775390625, "logps/rejected": -17.24044418334961, "loss": 0.7387, "rewards/accuracies": 0.0, "rewards/chosen": -0.02510814741253853, "rewards/margins": -0.0890466719865799, "rewards/rejected": 0.06393852084875107, "step": 3166 }, { "epoch": 1.71, "learning_rate": 4.123795404935071e-08, "logits/chosen": -2.0622546672821045, "logits/rejected": -2.0691158771514893, "logps/chosen": -2.518751382827759, "logps/rejected": -3.222238540649414, "loss": 0.6461, "rewards/accuracies": 1.0, "rewards/chosen": 0.9217355847358704, "rewards/margins": 0.0964888334274292, "rewards/rejected": 0.8252467513084412, "step": 3167 }, { "epoch": 1.71, "learning_rate": 4.1209290306384365e-08, "logits/chosen": -2.160053253173828, "logits/rejected": -2.3226053714752197, "logps/chosen": -3.07934308052063, "logps/rejected": -2.924381732940674, "loss": 0.6774, "rewards/accuracies": 1.0, "rewards/chosen": 0.5258744359016418, "rewards/margins": 0.03172898292541504, "rewards/rejected": 0.4941454529762268, "step": 3168 }, { "epoch": 1.71, "learning_rate": 4.118062954427249e-08, "logits/chosen": -2.011901617050171, "logits/rejected": -2.222771167755127, "logps/chosen": -0.6416604518890381, "logps/rejected": -0.6792958378791809, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8389315605163574, "rewards/margins": 0.0065705180168151855, "rewards/rejected": 0.8323610424995422, "step": 3169 }, { "epoch": 1.71, "learning_rate": 4.115197177273368e-08, "logits/chosen": -2.004823923110962, "logits/rejected": -2.09226393699646, "logps/chosen": -13.846561431884766, "logps/rejected": -16.50710678100586, "loss": 0.4896, "rewards/accuracies": 1.0, "rewards/chosen": 1.5031944513320923, "rewards/margins": 0.4594442844390869, "rewards/rejected": 1.0437501668930054, "step": 3170 }, { "epoch": 1.71, "learning_rate": 4.112331700148558e-08, "logits/chosen": -2.0482897758483887, "logits/rejected": -2.1504979133605957, "logps/chosen": -2.8108410835266113, "logps/rejected": -10.069206237792969, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 1.2873674631118774, "rewards/margins": 0.44290655851364136, "rewards/rejected": 0.8444609045982361, "step": 3171 }, { "epoch": 1.71, "learning_rate": 4.1094665240244756e-08, "logits/chosen": -2.1619505882263184, "logits/rejected": -2.2863917350769043, "logps/chosen": -1.3826273679733276, "logps/rejected": -1.4582123756408691, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 1.0277270078659058, "rewards/margins": -0.011426091194152832, "rewards/rejected": 1.0391530990600586, "step": 3172 }, { "epoch": 1.71, "learning_rate": 4.106601649872676e-08, "logits/chosen": -2.0722436904907227, "logits/rejected": -2.025027275085449, "logps/chosen": -28.056659698486328, "logps/rejected": -2.776623249053955, "loss": 0.3726, "rewards/accuracies": 1.0, "rewards/chosen": 1.5830051898956299, "rewards/margins": 0.7950379252433777, "rewards/rejected": 0.7879672646522522, "step": 3173 }, { "epoch": 1.71, "learning_rate": 4.103737078664617e-08, "logits/chosen": -2.1510918140411377, "logits/rejected": -2.1509368419647217, "logps/chosen": -1.0895674228668213, "logps/rejected": -1.5474148988723755, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.8761561512947083, "rewards/margins": -0.023929238319396973, "rewards/rejected": 0.9000853896141052, "step": 3174 }, { "epoch": 1.71, "learning_rate": 4.1008728113716495e-08, "logits/chosen": -2.066946268081665, "logits/rejected": -2.0657413005828857, "logps/chosen": -6.91508150100708, "logps/rejected": -1.1290078163146973, "loss": 0.3487, "rewards/accuracies": 1.0, "rewards/chosen": 1.7189315557479858, "rewards/margins": 0.8742164373397827, "rewards/rejected": 0.8447151184082031, "step": 3175 }, { "epoch": 1.71, "learning_rate": 4.098008848965019e-08, "logits/chosen": -2.069610834121704, "logits/rejected": -2.0649025440216064, "logps/chosen": -3.823741912841797, "logps/rejected": -10.289143562316895, "loss": 0.5147, "rewards/accuracies": 1.0, "rewards/chosen": 1.295530915260315, "rewards/margins": 0.395871102809906, "rewards/rejected": 0.8996598124504089, "step": 3176 }, { "epoch": 1.71, "learning_rate": 4.095145192415874e-08, "logits/chosen": -2.053971529006958, "logits/rejected": -2.0592081546783447, "logps/chosen": -3.754183292388916, "logps/rejected": -3.476149320602417, "loss": 0.4055, "rewards/accuracies": 1.0, "rewards/chosen": 1.27499520778656, "rewards/margins": 0.6930077075958252, "rewards/rejected": 0.5819875001907349, "step": 3177 }, { "epoch": 1.71, "learning_rate": 4.092281842695255e-08, "logits/chosen": -1.994860291481018, "logits/rejected": -2.2832882404327393, "logps/chosen": -4.84859037399292, "logps/rejected": -7.0372209548950195, "loss": 0.663, "rewards/accuracies": 1.0, "rewards/chosen": 0.7881559729576111, "rewards/margins": 0.061260998249053955, "rewards/rejected": 0.7268949747085571, "step": 3178 }, { "epoch": 1.71, "learning_rate": 4.0894188007740984e-08, "logits/chosen": -2.12758207321167, "logits/rejected": -2.1181812286376953, "logps/chosen": -12.775461196899414, "logps/rejected": -1.0395725965499878, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 1.3691186904907227, "rewards/margins": 0.6146190762519836, "rewards/rejected": 0.754499614238739, "step": 3179 }, { "epoch": 1.72, "learning_rate": 4.086556067623238e-08, "logits/chosen": -2.0815958976745605, "logits/rejected": -2.2283682823181152, "logps/chosen": -4.0920305252075195, "logps/rejected": -0.34707799553871155, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 0.8263762593269348, "rewards/margins": 0.07969433069229126, "rewards/rejected": 0.7466819286346436, "step": 3180 }, { "epoch": 1.72, "learning_rate": 4.083693644213402e-08, "logits/chosen": -1.9724621772766113, "logits/rejected": -1.973036766052246, "logps/chosen": -1.3174333572387695, "logps/rejected": -1.6982827186584473, "loss": 0.5771, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981769919395447, "rewards/margins": 0.24736547470092773, "rewards/rejected": 0.7508115172386169, "step": 3181 }, { "epoch": 1.72, "learning_rate": 4.080831531515214e-08, "logits/chosen": -2.176504373550415, "logits/rejected": -2.1760456562042236, "logps/chosen": -4.9122090339660645, "logps/rejected": -3.284827709197998, "loss": 0.3584, "rewards/accuracies": 1.0, "rewards/chosen": 1.4229393005371094, "rewards/margins": 0.8416448831558228, "rewards/rejected": 0.5812944173812866, "step": 3182 }, { "epoch": 1.72, "learning_rate": 4.07796973049919e-08, "logits/chosen": -2.159473419189453, "logits/rejected": -2.1513047218322754, "logps/chosen": -7.833028316497803, "logps/rejected": -0.7937318682670593, "loss": 0.473, "rewards/accuracies": 1.0, "rewards/chosen": 1.4673696756362915, "rewards/margins": 0.5028239488601685, "rewards/rejected": 0.964545726776123, "step": 3183 }, { "epoch": 1.72, "learning_rate": 4.075108242135746e-08, "logits/chosen": -2.0998075008392334, "logits/rejected": -2.108903169631958, "logps/chosen": -5.709209442138672, "logps/rejected": -3.1655068397521973, "loss": 0.3114, "rewards/accuracies": 1.0, "rewards/chosen": 1.635026216506958, "rewards/margins": 1.0069153308868408, "rewards/rejected": 0.628110945224762, "step": 3184 }, { "epoch": 1.72, "learning_rate": 4.0722470673951846e-08, "logits/chosen": -2.0319950580596924, "logits/rejected": -2.041696310043335, "logps/chosen": -1.388769507408142, "logps/rejected": -3.5181329250335693, "loss": 0.4853, "rewards/accuracies": 1.0, "rewards/chosen": 1.0393036603927612, "rewards/margins": 0.4704554080963135, "rewards/rejected": 0.5688482522964478, "step": 3185 }, { "epoch": 1.72, "learning_rate": 4.069386207247708e-08, "logits/chosen": -2.0136165618896484, "logits/rejected": -2.0206615924835205, "logps/chosen": -4.040903568267822, "logps/rejected": -5.902377128601074, "loss": 0.4418, "rewards/accuracies": 1.0, "rewards/chosen": 0.9110906720161438, "rewards/margins": 0.5878609418869019, "rewards/rejected": 0.32322970032691956, "step": 3186 }, { "epoch": 1.72, "learning_rate": 4.066525662663409e-08, "logits/chosen": -2.117906332015991, "logits/rejected": -2.1119112968444824, "logps/chosen": -13.617195129394531, "logps/rejected": -3.5151519775390625, "loss": 0.4615, "rewards/accuracies": 1.0, "rewards/chosen": 1.0935531854629517, "rewards/margins": 0.5335295796394348, "rewards/rejected": 0.5600236058235168, "step": 3187 }, { "epoch": 1.72, "learning_rate": 4.063665434612274e-08, "logits/chosen": -2.082648515701294, "logits/rejected": -2.070307731628418, "logps/chosen": -0.7915009260177612, "logps/rejected": -5.121542453765869, "loss": 0.4484, "rewards/accuracies": 1.0, "rewards/chosen": 1.078507900238037, "rewards/margins": 0.5695016384124756, "rewards/rejected": 0.5090062618255615, "step": 3188 }, { "epoch": 1.72, "learning_rate": 4.06080552406418e-08, "logits/chosen": -2.0303456783294678, "logits/rejected": -2.3227744102478027, "logps/chosen": -4.987155914306641, "logps/rejected": -5.005558013916016, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.4006442129611969, "rewards/margins": 0.03649577498435974, "rewards/rejected": 0.36414843797683716, "step": 3189 }, { "epoch": 1.72, "learning_rate": 4.0579459319889e-08, "logits/chosen": -2.1885018348693848, "logits/rejected": -2.1527597904205322, "logps/chosen": -17.984764099121094, "logps/rejected": -2.8791894912719727, "loss": 0.2978, "rewards/accuracies": 1.0, "rewards/chosen": 1.6550263166427612, "rewards/margins": 1.0588754415512085, "rewards/rejected": 0.5961508750915527, "step": 3190 }, { "epoch": 1.72, "learning_rate": 4.0550866593560965e-08, "logits/chosen": -1.954249382019043, "logits/rejected": -1.9267256259918213, "logps/chosen": -10.417805671691895, "logps/rejected": -3.381453514099121, "loss": 0.2968, "rewards/accuracies": 1.0, "rewards/chosen": 1.5769965648651123, "rewards/margins": 1.0625977516174316, "rewards/rejected": 0.5143988728523254, "step": 3191 }, { "epoch": 1.72, "learning_rate": 4.052227707135323e-08, "logits/chosen": -2.12907075881958, "logits/rejected": -2.0452182292938232, "logps/chosen": -14.06080436706543, "logps/rejected": -3.709416389465332, "loss": 0.3835, "rewards/accuracies": 1.0, "rewards/chosen": 1.5561212301254272, "rewards/margins": 0.7605515122413635, "rewards/rejected": 0.7955697178840637, "step": 3192 }, { "epoch": 1.72, "learning_rate": 4.04936907629603e-08, "logits/chosen": -2.0207462310791016, "logits/rejected": -2.294261932373047, "logps/chosen": -7.7421793937683105, "logps/rejected": -6.975176811218262, "loss": 0.7156, "rewards/accuracies": 0.0, "rewards/chosen": 0.6092865467071533, "rewards/margins": -0.04439026117324829, "rewards/rejected": 0.6536768078804016, "step": 3193 }, { "epoch": 1.72, "learning_rate": 4.046510767807551e-08, "logits/chosen": -2.2087597846984863, "logits/rejected": -2.2238805294036865, "logps/chosen": -3.790461301803589, "logps/rejected": -10.245723724365234, "loss": 0.4594, "rewards/accuracies": 1.0, "rewards/chosen": 1.3430050611495972, "rewards/margins": 0.5393053889274597, "rewards/rejected": 0.8036996722221375, "step": 3194 }, { "epoch": 1.72, "learning_rate": 4.043652782639114e-08, "logits/chosen": -2.151975393295288, "logits/rejected": -2.2793796062469482, "logps/chosen": -1.1320141553878784, "logps/rejected": -7.928230285644531, "loss": 0.5468, "rewards/accuracies": 1.0, "rewards/chosen": 1.1581400632858276, "rewards/margins": 0.31784313917160034, "rewards/rejected": 0.8402969241142273, "step": 3195 }, { "epoch": 1.72, "learning_rate": 4.0407951217598404e-08, "logits/chosen": -2.030360221862793, "logits/rejected": -2.023881673812866, "logps/chosen": -12.050457000732422, "logps/rejected": -5.554393291473389, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 2.117530584335327, "rewards/margins": 1.4327404499053955, "rewards/rejected": 0.6847900748252869, "step": 3196 }, { "epoch": 1.72, "learning_rate": 4.037937786138736e-08, "logits/chosen": -2.1093251705169678, "logits/rejected": -2.0953145027160645, "logps/chosen": -3.197291135787964, "logps/rejected": -4.430203914642334, "loss": 0.4612, "rewards/accuracies": 1.0, "rewards/chosen": 1.032119631767273, "rewards/margins": 0.5344552993774414, "rewards/rejected": 0.49766430258750916, "step": 3197 }, { "epoch": 1.72, "learning_rate": 4.0350807767447e-08, "logits/chosen": -2.1371350288391113, "logits/rejected": -2.3348939418792725, "logps/chosen": -1.3881419897079468, "logps/rejected": -1.4152255058288574, "loss": 0.6743, "rewards/accuracies": 1.0, "rewards/chosen": 0.803978443145752, "rewards/margins": 0.03807854652404785, "rewards/rejected": 0.7658998966217041, "step": 3198 }, { "epoch": 1.73, "learning_rate": 4.0322240945465176e-08, "logits/chosen": -2.010728359222412, "logits/rejected": -2.01385498046875, "logps/chosen": -5.049841403961182, "logps/rejected": -6.690740585327148, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 1.2478033304214478, "rewards/margins": 0.7033020257949829, "rewards/rejected": 0.5445013046264648, "step": 3199 }, { "epoch": 1.73, "learning_rate": 4.029367740512868e-08, "logits/chosen": -2.2305052280426025, "logits/rejected": -2.330538749694824, "logps/chosen": -0.2530955970287323, "logps/rejected": -0.20128613710403442, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.8032509684562683, "rewards/margins": 0.047077953815460205, "rewards/rejected": 0.7561730146408081, "step": 3200 }, { "epoch": 1.73, "learning_rate": 4.026511715612315e-08, "logits/chosen": -2.0629827976226807, "logits/rejected": -2.0587406158447266, "logps/chosen": -10.708017349243164, "logps/rejected": -5.168375492095947, "loss": 0.2806, "rewards/accuracies": 1.0, "rewards/chosen": 1.7713463306427002, "rewards/margins": 1.1272897720336914, "rewards/rejected": 0.644056499004364, "step": 3201 }, { "epoch": 1.73, "learning_rate": 4.023656020813311e-08, "logits/chosen": -2.1200459003448486, "logits/rejected": -2.1302671432495117, "logps/chosen": -7.586513996124268, "logps/rejected": -1.7225743532180786, "loss": 0.5667, "rewards/accuracies": 1.0, "rewards/chosen": 1.06207275390625, "rewards/margins": 0.2712600827217102, "rewards/rejected": 0.7908126711845398, "step": 3202 }, { "epoch": 1.73, "learning_rate": 4.020800657084202e-08, "logits/chosen": -2.1836702823638916, "logits/rejected": -2.2802226543426514, "logps/chosen": -1.5964785814285278, "logps/rejected": -1.414530634880066, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.02334725856781, "rewards/margins": 0.005446195602416992, "rewards/rejected": 1.017901062965393, "step": 3203 }, { "epoch": 1.73, "learning_rate": 4.017945625393213e-08, "logits/chosen": -2.1309773921966553, "logits/rejected": -2.1260604858398438, "logps/chosen": -4.607316017150879, "logps/rejected": -7.21893310546875, "loss": 0.6061, "rewards/accuracies": 1.0, "rewards/chosen": 0.9285144209861755, "rewards/margins": 0.1823025941848755, "rewards/rejected": 0.7462118268013, "step": 3204 }, { "epoch": 1.73, "learning_rate": 4.0150909267084636e-08, "logits/chosen": -2.0812530517578125, "logits/rejected": -2.3003342151641846, "logps/chosen": -1.4233418703079224, "logps/rejected": -1.3337892293930054, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.0496435165405273, "rewards/margins": 0.01833522319793701, "rewards/rejected": 1.0313082933425903, "step": 3205 }, { "epoch": 1.73, "learning_rate": 4.012236561997957e-08, "logits/chosen": -2.288994312286377, "logits/rejected": -2.277740001678467, "logps/chosen": -0.4990951120853424, "logps/rejected": -0.5191648602485657, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.9440421462059021, "rewards/margins": 0.005536019802093506, "rewards/rejected": 0.9385061264038086, "step": 3206 }, { "epoch": 1.73, "learning_rate": 4.0093825322295827e-08, "logits/chosen": -2.2518749237060547, "logits/rejected": -2.382526397705078, "logps/chosen": -0.6629510521888733, "logps/rejected": -0.8044533133506775, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 0.8498085141181946, "rewards/margins": 0.009423553943634033, "rewards/rejected": 0.8403849601745605, "step": 3207 }, { "epoch": 1.73, "learning_rate": 4.006528838371118e-08, "logits/chosen": -2.027944564819336, "logits/rejected": -2.2732911109924316, "logps/chosen": -0.5073190927505493, "logps/rejected": -0.5399479866027832, "loss": 0.6706, "rewards/accuracies": 1.0, "rewards/chosen": 0.9520468711853027, "rewards/margins": 0.04565894603729248, "rewards/rejected": 0.9063879251480103, "step": 3208 }, { "epoch": 1.73, "learning_rate": 4.0036754813902277e-08, "logits/chosen": -2.052682399749756, "logits/rejected": -2.290142059326172, "logps/chosen": -0.9743751287460327, "logps/rejected": -1.036954641342163, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8884531855583191, "rewards/margins": 0.01809871196746826, "rewards/rejected": 0.8703544735908508, "step": 3209 }, { "epoch": 1.73, "learning_rate": 4.00082246225446e-08, "logits/chosen": -2.182799816131592, "logits/rejected": -2.0756635665893555, "logps/chosen": -39.50593566894531, "logps/rejected": -2.2019381523132324, "loss": 0.1676, "rewards/accuracies": 1.0, "rewards/chosen": 2.3302001953125, "rewards/margins": 1.70135498046875, "rewards/rejected": 0.6288451552391052, "step": 3210 }, { "epoch": 1.73, "learning_rate": 3.997969781931249e-08, "logits/chosen": -2.0544471740722656, "logits/rejected": -2.0469918251037598, "logps/chosen": -4.036855220794678, "logps/rejected": -6.132957935333252, "loss": 0.334, "rewards/accuracies": 1.0, "rewards/chosen": 1.4048465490341187, "rewards/margins": 0.9248126745223999, "rewards/rejected": 0.48003384470939636, "step": 3211 }, { "epoch": 1.73, "learning_rate": 3.995117441387912e-08, "logits/chosen": -2.074368953704834, "logits/rejected": -2.266129970550537, "logps/chosen": -0.268824964761734, "logps/rejected": -0.352654367685318, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8690824508666992, "rewards/margins": 0.021667659282684326, "rewards/rejected": 0.8474147915840149, "step": 3212 }, { "epoch": 1.73, "learning_rate": 3.9922654415916565e-08, "logits/chosen": -1.9670026302337646, "logits/rejected": -2.2501535415649414, "logps/chosen": -0.3327036499977112, "logps/rejected": -0.25840628147125244, "loss": 0.6721, "rewards/accuracies": 1.0, "rewards/chosen": 0.8994513750076294, "rewards/margins": 0.04263871908187866, "rewards/rejected": 0.8568126559257507, "step": 3213 }, { "epoch": 1.73, "learning_rate": 3.989413783509571e-08, "logits/chosen": -2.062014102935791, "logits/rejected": -2.3158857822418213, "logps/chosen": -0.7082302570343018, "logps/rejected": -0.6345081329345703, "loss": 0.6733, "rewards/accuracies": 1.0, "rewards/chosen": 1.025024175643921, "rewards/margins": 0.04017692804336548, "rewards/rejected": 0.9848472476005554, "step": 3214 }, { "epoch": 1.73, "learning_rate": 3.9865624681086276e-08, "logits/chosen": -2.0697286128997803, "logits/rejected": -2.2437798976898193, "logps/chosen": -0.9310750961303711, "logps/rejected": -0.9854844808578491, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.8717366456985474, "rewards/margins": 0.014710426330566406, "rewards/rejected": 0.857026219367981, "step": 3215 }, { "epoch": 1.73, "learning_rate": 3.9837114963556826e-08, "logits/chosen": -2.1270651817321777, "logits/rejected": -2.3188283443450928, "logps/chosen": -7.9673027992248535, "logps/rejected": -8.36974048614502, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9803686141967773, "rewards/margins": 0.03276777267456055, "rewards/rejected": 0.9476008415222168, "step": 3216 }, { "epoch": 1.74, "learning_rate": 3.9808608692174774e-08, "logits/chosen": -2.0801684856414795, "logits/rejected": -2.2683370113372803, "logps/chosen": -1.1517486572265625, "logps/rejected": -1.2280707359313965, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.7224776148796082, "rewards/margins": 0.03139156103134155, "rewards/rejected": 0.6910860538482666, "step": 3217 }, { "epoch": 1.74, "learning_rate": 3.978010587660632e-08, "logits/chosen": -2.0994038581848145, "logits/rejected": -2.0990118980407715, "logps/chosen": -1.4429389238357544, "logps/rejected": -2.6259660720825195, "loss": 0.605, "rewards/accuracies": 1.0, "rewards/chosen": 0.9597803950309753, "rewards/margins": 0.1847543716430664, "rewards/rejected": 0.7750260233879089, "step": 3218 }, { "epoch": 1.74, "learning_rate": 3.975160652651657e-08, "logits/chosen": -2.231308698654175, "logits/rejected": -2.0966591835021973, "logps/chosen": -33.681922912597656, "logps/rejected": -1.0713880062103271, "loss": 0.1118, "rewards/accuracies": 1.0, "rewards/chosen": 2.877514600753784, "rewards/margins": 2.135028600692749, "rewards/rejected": 0.7424860000610352, "step": 3219 }, { "epoch": 1.74, "learning_rate": 3.972311065156939e-08, "logits/chosen": -2.06587553024292, "logits/rejected": -2.234694480895996, "logps/chosen": -3.0007681846618652, "logps/rejected": -2.4305684566497803, "loss": 0.7077, "rewards/accuracies": 0.0, "rewards/chosen": 0.6779152750968933, "rewards/margins": -0.028981924057006836, "rewards/rejected": 0.7068971991539001, "step": 3220 }, { "epoch": 1.74, "learning_rate": 3.969461826142748e-08, "logits/chosen": -2.1181745529174805, "logits/rejected": -2.1320745944976807, "logps/chosen": -2.0921995639801025, "logps/rejected": -5.638111591339111, "loss": 0.3434, "rewards/accuracies": 1.0, "rewards/chosen": 1.3666937351226807, "rewards/margins": 0.8921526670455933, "rewards/rejected": 0.474541038274765, "step": 3221 }, { "epoch": 1.74, "learning_rate": 3.966612936575235e-08, "logits/chosen": -2.019705295562744, "logits/rejected": -2.020991086959839, "logps/chosen": -1.39104163646698, "logps/rejected": -2.7232015132904053, "loss": 0.5351, "rewards/accuracies": 1.0, "rewards/chosen": 0.9401939511299133, "rewards/margins": 0.3458002209663391, "rewards/rejected": 0.5943937301635742, "step": 3222 }, { "epoch": 1.74, "learning_rate": 3.963764397420437e-08, "logits/chosen": -2.0162336826324463, "logits/rejected": -2.030465602874756, "logps/chosen": -3.230246067047119, "logps/rejected": -8.084317207336426, "loss": 0.4434, "rewards/accuracies": 1.0, "rewards/chosen": 1.1295499801635742, "rewards/margins": 0.5834506154060364, "rewards/rejected": 0.5460993647575378, "step": 3223 }, { "epoch": 1.74, "learning_rate": 3.96091620964427e-08, "logits/chosen": -2.123379945755005, "logits/rejected": -2.3239564895629883, "logps/chosen": -3.7895891666412354, "logps/rejected": -1.1440685987472534, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.9343202710151672, "rewards/margins": 0.04581022262573242, "rewards/rejected": 0.8885100483894348, "step": 3224 }, { "epoch": 1.74, "learning_rate": 3.958068374212529e-08, "logits/chosen": -2.0524606704711914, "logits/rejected": -2.041105031967163, "logps/chosen": -20.531352996826172, "logps/rejected": -0.9485530853271484, "loss": 0.2593, "rewards/accuracies": 1.0, "rewards/chosen": 2.0772387981414795, "rewards/margins": 1.2172377109527588, "rewards/rejected": 0.8600010275840759, "step": 3225 }, { "epoch": 1.74, "learning_rate": 3.95522089209089e-08, "logits/chosen": -2.0410399436950684, "logits/rejected": -2.301666498184204, "logps/chosen": -2.308643341064453, "logps/rejected": -2.1529388427734375, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 1.1932705640792847, "rewards/margins": 0.03229475021362305, "rewards/rejected": 1.1609758138656616, "step": 3226 }, { "epoch": 1.74, "learning_rate": 3.952373764244908e-08, "logits/chosen": -2.081920623779297, "logits/rejected": -2.0861213207244873, "logps/chosen": -1.5226409435272217, "logps/rejected": -2.063462972640991, "loss": 0.505, "rewards/accuracies": 1.0, "rewards/chosen": 1.2493771314620972, "rewards/margins": 0.42000871896743774, "rewards/rejected": 0.8293684124946594, "step": 3227 }, { "epoch": 1.74, "learning_rate": 3.949526991640024e-08, "logits/chosen": -2.131784677505493, "logits/rejected": -1.939348578453064, "logps/chosen": -38.089012145996094, "logps/rejected": -2.7429494857788086, "loss": 0.204, "rewards/accuracies": 1.0, "rewards/chosen": 2.1247780323028564, "rewards/margins": 1.4859400987625122, "rewards/rejected": 0.6388379335403442, "step": 3228 }, { "epoch": 1.74, "learning_rate": 3.946680575241552e-08, "logits/chosen": -2.046116828918457, "logits/rejected": -2.1621158123016357, "logps/chosen": -0.5432863235473633, "logps/rejected": -18.241411209106445, "loss": 0.4533, "rewards/accuracies": 1.0, "rewards/chosen": 0.9706690907478333, "rewards/margins": 0.5561188459396362, "rewards/rejected": 0.41455021500587463, "step": 3229 }, { "epoch": 1.74, "learning_rate": 3.943834516014687e-08, "logits/chosen": -2.0636801719665527, "logits/rejected": -2.062462091445923, "logps/chosen": -0.9772096276283264, "logps/rejected": -5.0383124351501465, "loss": 0.4425, "rewards/accuracies": 1.0, "rewards/chosen": 1.0991061925888062, "rewards/margins": 0.5858953595161438, "rewards/rejected": 0.5132108330726624, "step": 3230 }, { "epoch": 1.74, "learning_rate": 3.940988814924503e-08, "logits/chosen": -2.1476333141326904, "logits/rejected": -2.1467363834381104, "logps/chosen": -3.3417398929595947, "logps/rejected": -2.201298952102661, "loss": 0.4541, "rewards/accuracies": 1.0, "rewards/chosen": 1.3715356588363647, "rewards/margins": 0.5537971258163452, "rewards/rejected": 0.8177385330200195, "step": 3231 }, { "epoch": 1.74, "learning_rate": 3.938143472935954e-08, "logits/chosen": -2.0168206691741943, "logits/rejected": -2.016348123550415, "logps/chosen": -0.30202019214630127, "logps/rejected": -3.7803635597229004, "loss": 0.5011, "rewards/accuracies": 1.0, "rewards/chosen": 0.9697843790054321, "rewards/margins": 0.4299224615097046, "rewards/rejected": 0.5398619174957275, "step": 3232 }, { "epoch": 1.74, "learning_rate": 3.9352984910138705e-08, "logits/chosen": -2.016235113143921, "logits/rejected": -2.1823623180389404, "logps/chosen": -0.3261124789714813, "logps/rejected": -0.35769233107566833, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.823609471321106, "rewards/margins": 0.0219762921333313, "rewards/rejected": 0.8016331791877747, "step": 3233 }, { "epoch": 1.74, "learning_rate": 3.932453870122963e-08, "logits/chosen": -2.003293752670288, "logits/rejected": -1.9927513599395752, "logps/chosen": -5.595674514770508, "logps/rejected": -4.5810747146606445, "loss": 0.4699, "rewards/accuracies": 1.0, "rewards/chosen": 1.1186838150024414, "rewards/margins": 0.5110009908676147, "rewards/rejected": 0.6076828241348267, "step": 3234 }, { "epoch": 1.74, "learning_rate": 3.929609611227817e-08, "logits/chosen": -2.0682547092437744, "logits/rejected": -2.0677719116210938, "logps/chosen": -1.4181004762649536, "logps/rejected": -2.7983219623565674, "loss": 0.6209, "rewards/accuracies": 1.0, "rewards/chosen": 0.9888456463813782, "rewards/margins": 0.15007150173187256, "rewards/rejected": 0.8387741446495056, "step": 3235 }, { "epoch": 1.75, "learning_rate": 3.9267657152928964e-08, "logits/chosen": -1.9578732252120972, "logits/rejected": -1.9715138673782349, "logps/chosen": -6.304379463195801, "logps/rejected": -4.71135139465332, "loss": 0.3654, "rewards/accuracies": 1.0, "rewards/chosen": 1.4735959768295288, "rewards/margins": 0.8186501860618591, "rewards/rejected": 0.6549457907676697, "step": 3236 }, { "epoch": 1.75, "learning_rate": 3.923922183282541e-08, "logits/chosen": -2.0416970252990723, "logits/rejected": -2.285703182220459, "logps/chosen": -10.000137329101562, "logps/rejected": -9.077466011047363, "loss": 0.7461, "rewards/accuracies": 0.0, "rewards/chosen": 0.6918136477470398, "rewards/margins": -0.10319244861602783, "rewards/rejected": 0.7950060963630676, "step": 3237 }, { "epoch": 1.75, "learning_rate": 3.9210790161609696e-08, "logits/chosen": -2.087362766265869, "logits/rejected": -2.090514659881592, "logps/chosen": -0.42205101251602173, "logps/rejected": -3.2561442852020264, "loss": 0.4818, "rewards/accuracies": 1.0, "rewards/chosen": 1.0150913000106812, "rewards/margins": 0.47958284616470337, "rewards/rejected": 0.5355084538459778, "step": 3238 }, { "epoch": 1.75, "learning_rate": 3.918236214892276e-08, "logits/chosen": -2.0266788005828857, "logits/rejected": -2.3196353912353516, "logps/chosen": -0.5979422330856323, "logps/rejected": -5.290288925170898, "loss": 0.5108, "rewards/accuracies": 1.0, "rewards/chosen": 1.1327120065689087, "rewards/margins": 0.40542834997177124, "rewards/rejected": 0.7272836565971375, "step": 3239 }, { "epoch": 1.75, "learning_rate": 3.915393780440429e-08, "logits/chosen": -2.0589849948883057, "logits/rejected": -2.2527122497558594, "logps/chosen": -1.8018862009048462, "logps/rejected": -1.8379124402999878, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.610985279083252, "rewards/margins": 0.012631475925445557, "rewards/rejected": 0.5983538031578064, "step": 3240 }, { "epoch": 1.75, "learning_rate": 3.912551713769272e-08, "logits/chosen": -2.070047616958618, "logits/rejected": -2.0718789100646973, "logps/chosen": -4.6405029296875, "logps/rejected": -2.6907103061676025, "loss": 0.3592, "rewards/accuracies": 1.0, "rewards/chosen": 1.609836220741272, "rewards/margins": 0.8389537930488586, "rewards/rejected": 0.7708824276924133, "step": 3241 }, { "epoch": 1.75, "learning_rate": 3.909710015842529e-08, "logits/chosen": -2.142258405685425, "logits/rejected": -2.0388083457946777, "logps/chosen": -22.96126365661621, "logps/rejected": -1.2915951013565063, "loss": 0.2918, "rewards/accuracies": 1.0, "rewards/chosen": 1.9393091201782227, "rewards/margins": 1.0824201107025146, "rewards/rejected": 0.8568889498710632, "step": 3242 }, { "epoch": 1.75, "learning_rate": 3.906868687623793e-08, "logits/chosen": -2.1599583625793457, "logits/rejected": -2.1632821559906006, "logps/chosen": -3.7606635093688965, "logps/rejected": -3.902970314025879, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 1.0998262166976929, "rewards/margins": 0.5467329621315002, "rewards/rejected": 0.5530932545661926, "step": 3243 }, { "epoch": 1.75, "learning_rate": 3.904027730076537e-08, "logits/chosen": -1.942764163017273, "logits/rejected": -2.341136932373047, "logps/chosen": -5.627968788146973, "logps/rejected": -5.755474090576172, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.4663836658000946, "rewards/margins": -0.006136894226074219, "rewards/rejected": 0.4725205600261688, "step": 3244 }, { "epoch": 1.75, "learning_rate": 3.9011871441641034e-08, "logits/chosen": -2.1060638427734375, "logits/rejected": -2.2153961658477783, "logps/chosen": -18.71328353881836, "logps/rejected": -21.332542419433594, "loss": 0.3222, "rewards/accuracies": 1.0, "rewards/chosen": 1.673315405845642, "rewards/margins": 0.9669827818870544, "rewards/rejected": 0.7063326239585876, "step": 3245 }, { "epoch": 1.75, "learning_rate": 3.8983469308497095e-08, "logits/chosen": -2.2349746227264404, "logits/rejected": -2.1518120765686035, "logps/chosen": -21.311542510986328, "logps/rejected": -3.4519309997558594, "loss": 0.2591, "rewards/accuracies": 1.0, "rewards/chosen": 1.924245834350586, "rewards/margins": 1.2180331945419312, "rewards/rejected": 0.7062126398086548, "step": 3246 }, { "epoch": 1.75, "learning_rate": 3.895507091096448e-08, "logits/chosen": -2.038747549057007, "logits/rejected": -2.248711347579956, "logps/chosen": -2.3865838050842285, "logps/rejected": -0.6191803216934204, "loss": 0.7169, "rewards/accuracies": 0.0, "rewards/chosen": 1.0031102895736694, "rewards/margins": -0.047013282775878906, "rewards/rejected": 1.0501235723495483, "step": 3247 }, { "epoch": 1.75, "learning_rate": 3.892667625867286e-08, "logits/chosen": -1.935078501701355, "logits/rejected": -2.2598745822906494, "logps/chosen": -2.8221914768218994, "logps/rejected": -2.1179845333099365, "loss": 0.7277, "rewards/accuracies": 0.0, "rewards/chosen": 0.7638149261474609, "rewards/margins": -0.06803750991821289, "rewards/rejected": 0.8318524360656738, "step": 3248 }, { "epoch": 1.75, "learning_rate": 3.88982853612506e-08, "logits/chosen": -2.1927335262298584, "logits/rejected": -2.1959691047668457, "logps/chosen": -0.30766332149505615, "logps/rejected": -3.9317281246185303, "loss": 0.4792, "rewards/accuracies": 1.0, "rewards/chosen": 0.9262831807136536, "rewards/margins": 0.48650285601615906, "rewards/rejected": 0.4397803246974945, "step": 3249 }, { "epoch": 1.75, "learning_rate": 3.8869898228324815e-08, "logits/chosen": -2.088195323944092, "logits/rejected": -2.248994827270508, "logps/chosen": -3.1681230068206787, "logps/rejected": -2.970487356185913, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.6358475685119629, "rewards/margins": 0.013012111186981201, "rewards/rejected": 0.6228354573249817, "step": 3250 }, { "epoch": 1.75, "learning_rate": 3.884151486952134e-08, "logits/chosen": -2.079827070236206, "logits/rejected": -2.07924485206604, "logps/chosen": -4.787997245788574, "logps/rejected": -4.762231349945068, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 1.0527533292770386, "rewards/margins": 0.3515341877937317, "rewards/rejected": 0.7012191414833069, "step": 3251 }, { "epoch": 1.75, "learning_rate": 3.881313529446473e-08, "logits/chosen": -2.0647056102752686, "logits/rejected": -2.07305908203125, "logps/chosen": -0.5993472933769226, "logps/rejected": -7.038218021392822, "loss": 0.4097, "rewards/accuracies": 1.0, "rewards/chosen": 1.1625587940216064, "rewards/margins": 0.6805073022842407, "rewards/rejected": 0.4820515215396881, "step": 3252 }, { "epoch": 1.75, "learning_rate": 3.878475951277826e-08, "logits/chosen": -2.029536247253418, "logits/rejected": -2.2616207599639893, "logps/chosen": -3.8280844688415527, "logps/rejected": -0.6778206825256348, "loss": 0.6259, "rewards/accuracies": 1.0, "rewards/chosen": 1.1301441192626953, "rewards/margins": 0.13941001892089844, "rewards/rejected": 0.9907341003417969, "step": 3253 }, { "epoch": 1.76, "learning_rate": 3.8756387534083913e-08, "logits/chosen": -1.939980149269104, "logits/rejected": -2.240755319595337, "logps/chosen": -2.880659341812134, "logps/rejected": -0.9410701394081116, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.8981437087059021, "rewards/margins": 0.018510103225708008, "rewards/rejected": 0.8796336054801941, "step": 3254 }, { "epoch": 1.76, "learning_rate": 3.872801936800238e-08, "logits/chosen": -2.0756473541259766, "logits/rejected": -2.029114007949829, "logps/chosen": -29.40445899963379, "logps/rejected": -1.9130945205688477, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 1.8159147500991821, "rewards/margins": 1.0029311180114746, "rewards/rejected": 0.8129836320877075, "step": 3255 }, { "epoch": 1.76, "learning_rate": 3.869965502415306e-08, "logits/chosen": -2.0781748294830322, "logits/rejected": -2.2769081592559814, "logps/chosen": -2.456993579864502, "logps/rejected": -3.3534979820251465, "loss": 0.668, "rewards/accuracies": 1.0, "rewards/chosen": 0.8129808306694031, "rewards/margins": 0.050885915756225586, "rewards/rejected": 0.7620949149131775, "step": 3256 }, { "epoch": 1.76, "learning_rate": 3.8671294512154075e-08, "logits/chosen": -2.1087138652801514, "logits/rejected": -2.0920968055725098, "logps/chosen": -16.204315185546875, "logps/rejected": -1.7758135795593262, "loss": 0.4208, "rewards/accuracies": 1.0, "rewards/chosen": 1.1629819869995117, "rewards/margins": 0.6477912664413452, "rewards/rejected": 0.5151907205581665, "step": 3257 }, { "epoch": 1.76, "learning_rate": 3.864293784162222e-08, "logits/chosen": -2.1325511932373047, "logits/rejected": -2.3663549423217773, "logps/chosen": -3.2657270431518555, "logps/rejected": -3.0911366939544678, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.9427477121353149, "rewards/margins": -0.012683212757110596, "rewards/rejected": 0.9554309248924255, "step": 3258 }, { "epoch": 1.76, "learning_rate": 3.8614585022173e-08, "logits/chosen": -2.095918655395508, "logits/rejected": -2.103076934814453, "logps/chosen": -3.925135850906372, "logps/rejected": -4.24890661239624, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.2850960493087769, "rewards/margins": 0.5519788861274719, "rewards/rejected": 0.7331171631813049, "step": 3259 }, { "epoch": 1.76, "learning_rate": 3.858623606342061e-08, "logits/chosen": -2.047727108001709, "logits/rejected": -1.9954321384429932, "logps/chosen": -8.558796882629395, "logps/rejected": -7.6822285652160645, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 1.4358508586883545, "rewards/margins": 0.6156136989593506, "rewards/rejected": 0.8202371597290039, "step": 3260 }, { "epoch": 1.76, "learning_rate": 3.855789097497794e-08, "logits/chosen": -1.9603815078735352, "logits/rejected": -2.2574055194854736, "logps/chosen": -2.5464136600494385, "logps/rejected": -2.7212867736816406, "loss": 0.6978, "rewards/accuracies": 0.0, "rewards/chosen": 0.8568403124809265, "rewards/margins": -0.00933849811553955, "rewards/rejected": 0.8661788105964661, "step": 3261 }, { "epoch": 1.76, "learning_rate": 3.852954976645657e-08, "logits/chosen": -2.157926082611084, "logits/rejected": -2.158228635787964, "logps/chosen": -2.934359073638916, "logps/rejected": -3.278682231903076, "loss": 0.2851, "rewards/accuracies": 1.0, "rewards/chosen": 1.6958898305892944, "rewards/margins": 1.1090614795684814, "rewards/rejected": 0.5868282914161682, "step": 3262 }, { "epoch": 1.76, "learning_rate": 3.850121244746678e-08, "logits/chosen": -2.209851026535034, "logits/rejected": -2.237713098526001, "logps/chosen": -18.52644920349121, "logps/rejected": -14.67929744720459, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 1.750152826309204, "rewards/margins": 0.5297963619232178, "rewards/rejected": 1.2203564643859863, "step": 3263 }, { "epoch": 1.76, "learning_rate": 3.847287902761749e-08, "logits/chosen": -2.103527307510376, "logits/rejected": -2.2739946842193604, "logps/chosen": -12.742395401000977, "logps/rejected": -5.943901062011719, "loss": 0.7074, "rewards/accuracies": 0.0, "rewards/chosen": 0.8138275146484375, "rewards/margins": -0.02837318181991577, "rewards/rejected": 0.8422006964683533, "step": 3264 }, { "epoch": 1.76, "learning_rate": 3.8444549516516327e-08, "logits/chosen": -2.0462305545806885, "logits/rejected": -2.0397844314575195, "logps/chosen": -5.732645511627197, "logps/rejected": -1.9777956008911133, "loss": 0.366, "rewards/accuracies": 1.0, "rewards/chosen": 1.6673698425292969, "rewards/margins": 0.816512942314148, "rewards/rejected": 0.8508569002151489, "step": 3265 }, { "epoch": 1.76, "learning_rate": 3.841622392376957e-08, "logits/chosen": -2.0112621784210205, "logits/rejected": -2.262432813644409, "logps/chosen": -1.8307538032531738, "logps/rejected": -1.4737119674682617, "loss": 0.6967, "rewards/accuracies": 0.0, "rewards/chosen": 0.8557981848716736, "rewards/margins": -0.007104635238647461, "rewards/rejected": 0.862902820110321, "step": 3266 }, { "epoch": 1.76, "learning_rate": 3.838790225898222e-08, "logits/chosen": -2.013709783554077, "logits/rejected": -2.2566745281219482, "logps/chosen": -0.21158847212791443, "logps/rejected": -0.24883507192134857, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9476157426834106, "rewards/margins": 0.017748773097991943, "rewards/rejected": 0.9298669695854187, "step": 3267 }, { "epoch": 1.76, "learning_rate": 3.835958453175788e-08, "logits/chosen": -2.105262041091919, "logits/rejected": -2.100980520248413, "logps/chosen": -2.4426729679107666, "logps/rejected": -6.696218490600586, "loss": 0.4167, "rewards/accuracies": 1.0, "rewards/chosen": 1.1189583539962769, "rewards/margins": 0.6598560810089111, "rewards/rejected": 0.45910224318504333, "step": 3268 }, { "epoch": 1.76, "learning_rate": 3.8331270751698874e-08, "logits/chosen": -2.125054121017456, "logits/rejected": -2.2752749919891357, "logps/chosen": -3.2884984016418457, "logps/rejected": -3.212101697921753, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.7671193480491638, "rewards/margins": 0.007486581802368164, "rewards/rejected": 0.7596327662467957, "step": 3269 }, { "epoch": 1.76, "learning_rate": 3.830296092840613e-08, "logits/chosen": -2.0607569217681885, "logits/rejected": -2.067533254623413, "logps/chosen": -0.4631801247596741, "logps/rejected": -5.724242687225342, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 0.9762608408927917, "rewards/margins": 0.5837576389312744, "rewards/rejected": 0.3925032317638397, "step": 3270 }, { "epoch": 1.76, "learning_rate": 3.8274655071479274e-08, "logits/chosen": -2.093696117401123, "logits/rejected": -2.3353846073150635, "logps/chosen": -0.8669872879981995, "logps/rejected": -0.8758972883224487, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8210943341255188, "rewards/margins": 0.01968449354171753, "rewards/rejected": 0.8014098405838013, "step": 3271 }, { "epoch": 1.76, "learning_rate": 3.824635319051661e-08, "logits/chosen": -1.9922412633895874, "logits/rejected": -2.250364303588867, "logps/chosen": -1.3200275897979736, "logps/rejected": -1.2340528964996338, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.7321498990058899, "rewards/margins": 0.015911102294921875, "rewards/rejected": 0.716238796710968, "step": 3272 }, { "epoch": 1.77, "learning_rate": 3.821805529511505e-08, "logits/chosen": -2.024048328399658, "logits/rejected": -2.2596588134765625, "logps/chosen": -1.4344371557235718, "logps/rejected": -4.035645484924316, "loss": 0.6213, "rewards/accuracies": 1.0, "rewards/chosen": 0.9558162689208984, "rewards/margins": 0.1493111252784729, "rewards/rejected": 0.8065051436424255, "step": 3273 }, { "epoch": 1.77, "learning_rate": 3.818976139487017e-08, "logits/chosen": -2.0869431495666504, "logits/rejected": -2.0826616287231445, "logps/chosen": -13.076221466064453, "logps/rejected": -1.4784749746322632, "loss": 0.8197, "rewards/accuracies": 0.0, "rewards/chosen": 0.7107952237129211, "rewards/margins": -0.23880422115325928, "rewards/rejected": 0.9495994448661804, "step": 3274 }, { "epoch": 1.77, "learning_rate": 3.8161471499376163e-08, "logits/chosen": -2.1399106979370117, "logits/rejected": -2.2382593154907227, "logps/chosen": -1.5602355003356934, "logps/rejected": -1.594490885734558, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9855839014053345, "rewards/margins": 0.01382535696029663, "rewards/rejected": 0.9717585444450378, "step": 3275 }, { "epoch": 1.77, "learning_rate": 3.8133185618225927e-08, "logits/chosen": -2.186811685562134, "logits/rejected": -2.1893341541290283, "logps/chosen": -1.9724180698394775, "logps/rejected": -0.8606613278388977, "loss": 0.641, "rewards/accuracies": 1.0, "rewards/chosen": 1.174582839012146, "rewards/margins": 0.10712051391601562, "rewards/rejected": 1.0674623250961304, "step": 3276 }, { "epoch": 1.77, "learning_rate": 3.810490376101094e-08, "logits/chosen": -2.180424213409424, "logits/rejected": -2.1774535179138184, "logps/chosen": -5.503657341003418, "logps/rejected": -5.129082202911377, "loss": 0.2864, "rewards/accuracies": 1.0, "rewards/chosen": 1.4986552000045776, "rewards/margins": 1.103643774986267, "rewards/rejected": 0.39501139521598816, "step": 3277 }, { "epoch": 1.77, "learning_rate": 3.807662593732135e-08, "logits/chosen": -2.157036781311035, "logits/rejected": -2.3191380500793457, "logps/chosen": -0.6209112405776978, "logps/rejected": -0.6584508419036865, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.9553143382072449, "rewards/margins": 0.011568605899810791, "rewards/rejected": 0.9437457323074341, "step": 3278 }, { "epoch": 1.77, "learning_rate": 3.804835215674591e-08, "logits/chosen": -2.025519371032715, "logits/rejected": -2.034680128097534, "logps/chosen": -1.8263527154922485, "logps/rejected": -3.0291383266448975, "loss": 0.5009, "rewards/accuracies": 1.0, "rewards/chosen": 0.9646055102348328, "rewards/margins": 0.43043839931488037, "rewards/rejected": 0.5341671109199524, "step": 3279 }, { "epoch": 1.77, "learning_rate": 3.8020082428872023e-08, "logits/chosen": -2.0604729652404785, "logits/rejected": -2.0739498138427734, "logps/chosen": -2.303377389907837, "logps/rejected": -1.998065710067749, "loss": 0.545, "rewards/accuracies": 1.0, "rewards/chosen": 1.041131615638733, "rewards/margins": 0.3221290707588196, "rewards/rejected": 0.7190025448799133, "step": 3280 }, { "epoch": 1.77, "learning_rate": 3.7991816763285716e-08, "logits/chosen": -2.1873538494110107, "logits/rejected": -2.2986249923706055, "logps/chosen": -6.551829814910889, "logps/rejected": -31.096099853515625, "loss": 0.3896, "rewards/accuracies": 1.0, "rewards/chosen": 1.3480080366134644, "rewards/margins": 0.7413952350616455, "rewards/rejected": 0.6066128015518188, "step": 3281 }, { "epoch": 1.77, "learning_rate": 3.7963555169571644e-08, "logits/chosen": -2.0861260890960693, "logits/rejected": -2.2783429622650146, "logps/chosen": -1.96750807762146, "logps/rejected": -1.810652494430542, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 1.0624725818634033, "rewards/margins": -0.0020966529846191406, "rewards/rejected": 1.0645692348480225, "step": 3282 }, { "epoch": 1.77, "learning_rate": 3.7935297657313066e-08, "logits/chosen": -2.1013143062591553, "logits/rejected": -2.120783567428589, "logps/chosen": -1.9488744735717773, "logps/rejected": -3.1177549362182617, "loss": 0.5658, "rewards/accuracies": 1.0, "rewards/chosen": 1.065016746520996, "rewards/margins": 0.27340924739837646, "rewards/rejected": 0.7916074991226196, "step": 3283 }, { "epoch": 1.77, "learning_rate": 3.790704423609185e-08, "logits/chosen": -2.0963213443756104, "logits/rejected": -2.2995078563690186, "logps/chosen": -4.68829870223999, "logps/rejected": -4.683576583862305, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.7375714182853699, "rewards/margins": 0.027425646781921387, "rewards/rejected": 0.7101457715034485, "step": 3284 }, { "epoch": 1.77, "learning_rate": 3.7878794915488494e-08, "logits/chosen": -2.099844455718994, "logits/rejected": -2.2698357105255127, "logps/chosen": -2.358476161956787, "logps/rejected": -0.6771700382232666, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8617054224014282, "rewards/margins": 0.006928622722625732, "rewards/rejected": 0.8547767996788025, "step": 3285 }, { "epoch": 1.77, "learning_rate": 3.7850549705082115e-08, "logits/chosen": -2.1144473552703857, "logits/rejected": -2.329719066619873, "logps/chosen": -11.71445083618164, "logps/rejected": -10.041532516479492, "loss": 0.4586, "rewards/accuracies": 1.0, "rewards/chosen": 1.303844690322876, "rewards/margins": 0.5414087772369385, "rewards/rejected": 0.7624359130859375, "step": 3286 }, { "epoch": 1.77, "learning_rate": 3.7822308614450404e-08, "logits/chosen": -2.0819640159606934, "logits/rejected": -2.318500280380249, "logps/chosen": -0.541734516620636, "logps/rejected": -0.48827648162841797, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.8767418265342712, "rewards/margins": -0.011432826519012451, "rewards/rejected": 0.8881746530532837, "step": 3287 }, { "epoch": 1.77, "learning_rate": 3.779407165316969e-08, "logits/chosen": -2.0386414527893066, "logits/rejected": -2.2920563220977783, "logps/chosen": -0.9475442171096802, "logps/rejected": -0.8388510942459106, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8851521611213684, "rewards/margins": 0.0034246444702148438, "rewards/rejected": 0.8817275166511536, "step": 3288 }, { "epoch": 1.77, "learning_rate": 3.7765838830814864e-08, "logits/chosen": -2.0659453868865967, "logits/rejected": -2.0655815601348877, "logps/chosen": -0.7272992134094238, "logps/rejected": -1.6441282033920288, "loss": 0.629, "rewards/accuracies": 1.0, "rewards/chosen": 0.969801127910614, "rewards/margins": 0.13274598121643066, "rewards/rejected": 0.8370551466941833, "step": 3289 }, { "epoch": 1.77, "learning_rate": 3.773761015695943e-08, "logits/chosen": -2.0955371856689453, "logits/rejected": -2.2833092212677, "logps/chosen": -7.925146102905273, "logps/rejected": -0.36296939849853516, "loss": 0.6584, "rewards/accuracies": 1.0, "rewards/chosen": 1.0578762292861938, "rewards/margins": 0.0707044005393982, "rewards/rejected": 0.9871718287467957, "step": 3290 }, { "epoch": 1.78, "learning_rate": 3.7709385641175536e-08, "logits/chosen": -2.092942714691162, "logits/rejected": -2.09429669380188, "logps/chosen": -3.5313262939453125, "logps/rejected": -0.756879985332489, "loss": 0.6271, "rewards/accuracies": 1.0, "rewards/chosen": 0.9900342226028442, "rewards/margins": 0.13674908876419067, "rewards/rejected": 0.8532851338386536, "step": 3291 }, { "epoch": 1.78, "learning_rate": 3.768116529303384e-08, "logits/chosen": -2.204587459564209, "logits/rejected": -2.3620388507843018, "logps/chosen": -4.245260715484619, "logps/rejected": -1.024702548980713, "loss": 0.8262, "rewards/accuracies": 0.0, "rewards/chosen": 0.9056318402290344, "rewards/margins": -0.2503839135169983, "rewards/rejected": 1.1560157537460327, "step": 3292 }, { "epoch": 1.78, "learning_rate": 3.765294912210363e-08, "logits/chosen": -1.9985936880111694, "logits/rejected": -1.9996141195297241, "logps/chosen": -2.058861255645752, "logps/rejected": -5.9746809005737305, "loss": 0.4075, "rewards/accuracies": 1.0, "rewards/chosen": 1.1433197259902954, "rewards/margins": 0.6871250867843628, "rewards/rejected": 0.45619460940361023, "step": 3293 }, { "epoch": 1.78, "learning_rate": 3.762473713795275e-08, "logits/chosen": -1.9400299787521362, "logits/rejected": -2.273846387863159, "logps/chosen": -0.7211570739746094, "logps/rejected": -0.6951250433921814, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8627323508262634, "rewards/margins": 0.011951923370361328, "rewards/rejected": 0.8507804274559021, "step": 3294 }, { "epoch": 1.78, "learning_rate": 3.7596529350147665e-08, "logits/chosen": -2.1279003620147705, "logits/rejected": -2.1269655227661133, "logps/chosen": -0.6646085381507874, "logps/rejected": -2.3124194145202637, "loss": 0.6499, "rewards/accuracies": 1.0, "rewards/chosen": 0.9161041378974915, "rewards/margins": 0.08839529752731323, "rewards/rejected": 0.8277088403701782, "step": 3295 }, { "epoch": 1.78, "learning_rate": 3.756832576825339e-08, "logits/chosen": -2.0298166275024414, "logits/rejected": -2.0264956951141357, "logps/chosen": -1.2413872480392456, "logps/rejected": -6.703836441040039, "loss": 0.4212, "rewards/accuracies": 1.0, "rewards/chosen": 1.0090522766113281, "rewards/margins": 0.6466375589370728, "rewards/rejected": 0.36241474747657776, "step": 3296 }, { "epoch": 1.78, "learning_rate": 3.754012640183351e-08, "logits/chosen": -2.176544427871704, "logits/rejected": -2.1411757469177246, "logps/chosen": -6.340799331665039, "logps/rejected": -15.844858169555664, "loss": 0.362, "rewards/accuracies": 1.0, "rewards/chosen": 1.2142915725708008, "rewards/margins": 0.8296619653701782, "rewards/rejected": 0.38462963700294495, "step": 3297 }, { "epoch": 1.78, "learning_rate": 3.751193126045018e-08, "logits/chosen": -2.0842318534851074, "logits/rejected": -2.0678727626800537, "logps/chosen": -8.543457984924316, "logps/rejected": -1.8683247566223145, "loss": 0.3945, "rewards/accuracies": 1.0, "rewards/chosen": 1.53916597366333, "rewards/margins": 0.7263800501823425, "rewards/rejected": 0.8127859234809875, "step": 3298 }, { "epoch": 1.78, "learning_rate": 3.7483740353664164e-08, "logits/chosen": -2.115666151046753, "logits/rejected": -2.118255138397217, "logps/chosen": -2.3615705966949463, "logps/rejected": -1.0639231204986572, "loss": 0.509, "rewards/accuracies": 1.0, "rewards/chosen": 1.1950820684432983, "rewards/margins": 0.409911572933197, "rewards/rejected": 0.7851704955101013, "step": 3299 }, { "epoch": 1.78, "learning_rate": 3.7455553691034705e-08, "logits/chosen": -2.0592916011810303, "logits/rejected": -2.327791452407837, "logps/chosen": -0.33135566115379333, "logps/rejected": -0.38662368059158325, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 0.8978857398033142, "rewards/margins": 0.021005094051361084, "rewards/rejected": 0.8768806457519531, "step": 3300 }, { "epoch": 1.78, "learning_rate": 3.7427371282119726e-08, "logits/chosen": -2.098106861114502, "logits/rejected": -2.0935356616973877, "logps/chosen": -2.566819190979004, "logps/rejected": -5.358531475067139, "loss": 0.3899, "rewards/accuracies": 1.0, "rewards/chosen": 1.3015596866607666, "rewards/margins": 0.7406728863716125, "rewards/rejected": 0.560886800289154, "step": 3301 }, { "epoch": 1.78, "learning_rate": 3.739919313647559e-08, "logits/chosen": -2.078272581100464, "logits/rejected": -2.3027031421661377, "logps/chosen": -0.7383166551589966, "logps/rejected": -0.66914963722229, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.9559128880500793, "rewards/margins": 0.016639888286590576, "rewards/rejected": 0.9392729997634888, "step": 3302 }, { "epoch": 1.78, "learning_rate": 3.7371019263657296e-08, "logits/chosen": -2.0361809730529785, "logits/rejected": -2.2859456539154053, "logps/chosen": -0.3001403510570526, "logps/rejected": -0.32312822341918945, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 1.0315922498703003, "rewards/margins": 0.002137422561645508, "rewards/rejected": 1.0294548273086548, "step": 3303 }, { "epoch": 1.78, "learning_rate": 3.734284967321833e-08, "logits/chosen": -1.9907292127609253, "logits/rejected": -1.9886553287506104, "logps/chosen": -0.5463337898254395, "logps/rejected": -8.278114318847656, "loss": 0.5712, "rewards/accuracies": 1.0, "rewards/chosen": 0.9261276125907898, "rewards/margins": 0.26084762811660767, "rewards/rejected": 0.6652799844741821, "step": 3304 }, { "epoch": 1.78, "learning_rate": 3.731468437471079e-08, "logits/chosen": -2.090925693511963, "logits/rejected": -2.0977258682250977, "logps/chosen": -1.0748662948608398, "logps/rejected": -11.15456485748291, "loss": 0.5191, "rewards/accuracies": 1.0, "rewards/chosen": 0.9345915913581848, "rewards/margins": 0.3849763870239258, "rewards/rejected": 0.549615204334259, "step": 3305 }, { "epoch": 1.78, "learning_rate": 3.7286523377685286e-08, "logits/chosen": -2.0603785514831543, "logits/rejected": -2.283625602722168, "logps/chosen": -0.9294867515563965, "logps/rejected": -0.9801428318023682, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.947657585144043, "rewards/margins": 0.030964195728302002, "rewards/rejected": 0.916693389415741, "step": 3306 }, { "epoch": 1.78, "learning_rate": 3.725836669169095e-08, "logits/chosen": -2.168783664703369, "logits/rejected": -2.1789612770080566, "logps/chosen": -2.1250293254852295, "logps/rejected": -2.482858180999756, "loss": 0.5104, "rewards/accuracies": 1.0, "rewards/chosen": 1.1062811613082886, "rewards/margins": 0.40647202730178833, "rewards/rejected": 0.6998091340065002, "step": 3307 }, { "epoch": 1.78, "learning_rate": 3.7230214326275475e-08, "logits/chosen": -2.139010429382324, "logits/rejected": -2.139204502105713, "logps/chosen": -1.212581992149353, "logps/rejected": -1.8251549005508423, "loss": 0.6077, "rewards/accuracies": 1.0, "rewards/chosen": 1.049599289894104, "rewards/margins": 0.178780198097229, "rewards/rejected": 0.870819091796875, "step": 3308 }, { "epoch": 1.78, "learning_rate": 3.720206629098511e-08, "logits/chosen": -2.1412417888641357, "logits/rejected": -2.1451754570007324, "logps/chosen": -1.5275756120681763, "logps/rejected": -11.351116180419922, "loss": 0.3668, "rewards/accuracies": 1.0, "rewards/chosen": 1.157852053642273, "rewards/margins": 0.8139654397964478, "rewards/rejected": 0.3438865840435028, "step": 3309 }, { "epoch": 1.79, "learning_rate": 3.7173922595364583e-08, "logits/chosen": -2.1276450157165527, "logits/rejected": -2.3058793544769287, "logps/chosen": -5.171815395355225, "logps/rejected": -11.197548866271973, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 0.964232861995697, "rewards/margins": 0.05151629447937012, "rewards/rejected": 0.9127165675163269, "step": 3310 }, { "epoch": 1.79, "learning_rate": 3.714578324895721e-08, "logits/chosen": -2.024092197418213, "logits/rejected": -2.2418534755706787, "logps/chosen": -8.844489097595215, "logps/rejected": -5.564699172973633, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 0.8742375373840332, "rewards/margins": 0.02447730302810669, "rewards/rejected": 0.8497602343559265, "step": 3311 }, { "epoch": 1.79, "learning_rate": 3.7117648261304796e-08, "logits/chosen": -2.1801223754882812, "logits/rejected": -2.326981544494629, "logps/chosen": -6.885067939758301, "logps/rejected": -0.7978628873825073, "loss": 0.7169, "rewards/accuracies": 0.0, "rewards/chosen": 1.037613034248352, "rewards/margins": -0.04686903953552246, "rewards/rejected": 1.0844820737838745, "step": 3312 }, { "epoch": 1.79, "learning_rate": 3.708951764194767e-08, "logits/chosen": -2.2061257362365723, "logits/rejected": -2.237002372741699, "logps/chosen": -6.2991790771484375, "logps/rejected": -24.627803802490234, "loss": 0.7547, "rewards/accuracies": 0.0, "rewards/chosen": 1.0096038579940796, "rewards/margins": -0.1194847822189331, "rewards/rejected": 1.1290886402130127, "step": 3313 }, { "epoch": 1.79, "learning_rate": 3.706139140042467e-08, "logits/chosen": -2.154376983642578, "logits/rejected": -2.249737024307251, "logps/chosen": -5.56825590133667, "logps/rejected": -8.357529640197754, "loss": 0.6109, "rewards/accuracies": 1.0, "rewards/chosen": 0.8704383969306946, "rewards/margins": 0.17194730043411255, "rewards/rejected": 0.698491096496582, "step": 3314 }, { "epoch": 1.79, "learning_rate": 3.703326954627318e-08, "logits/chosen": -2.052083730697632, "logits/rejected": -2.2806389331817627, "logps/chosen": -1.2937166690826416, "logps/rejected": -1.2916367053985596, "loss": 0.6945, "rewards/accuracies": 0.0, "rewards/chosen": 0.8280529379844666, "rewards/margins": -0.002698659896850586, "rewards/rejected": 0.8307515978813171, "step": 3315 }, { "epoch": 1.79, "learning_rate": 3.700515208902909e-08, "logits/chosen": -2.0945897102355957, "logits/rejected": -2.289546489715576, "logps/chosen": -0.43486154079437256, "logps/rejected": -0.42004528641700745, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.9509106874465942, "rewards/margins": 0.026943504810333252, "rewards/rejected": 0.923967182636261, "step": 3316 }, { "epoch": 1.79, "learning_rate": 3.697703903822677e-08, "logits/chosen": -2.0867295265197754, "logits/rejected": -2.255863666534424, "logps/chosen": -0.13746298849582672, "logps/rejected": -0.13642928004264832, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9326009750366211, "rewards/margins": 0.017176508903503418, "rewards/rejected": 0.9154244661331177, "step": 3317 }, { "epoch": 1.79, "learning_rate": 3.6948930403399117e-08, "logits/chosen": -2.1214725971221924, "logits/rejected": -2.1246585845947266, "logps/chosen": -0.690611720085144, "logps/rejected": -3.2944023609161377, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.1320655345916748, "rewards/margins": 0.3847523331642151, "rewards/rejected": 0.7473132014274597, "step": 3318 }, { "epoch": 1.79, "learning_rate": 3.692082619407752e-08, "logits/chosen": -2.0272765159606934, "logits/rejected": -2.022470712661743, "logps/chosen": -0.9034361839294434, "logps/rejected": -2.3446033000946045, "loss": 0.6462, "rewards/accuracies": 1.0, "rewards/chosen": 0.9032560586929321, "rewards/margins": 0.09630012512207031, "rewards/rejected": 0.8069559335708618, "step": 3319 }, { "epoch": 1.79, "learning_rate": 3.68927264197919e-08, "logits/chosen": -1.9535733461380005, "logits/rejected": -1.9486421346664429, "logps/chosen": -4.70313024520874, "logps/rejected": -4.231839656829834, "loss": 0.3487, "rewards/accuracies": 1.0, "rewards/chosen": 1.4267679452896118, "rewards/margins": 0.8740743398666382, "rewards/rejected": 0.5526936054229736, "step": 3320 }, { "epoch": 1.79, "learning_rate": 3.6864631090070656e-08, "logits/chosen": -1.993894100189209, "logits/rejected": -2.2930686473846436, "logps/chosen": -9.468117713928223, "logps/rejected": -7.504071235656738, "loss": 0.7398, "rewards/accuracies": 0.0, "rewards/chosen": 0.8940211534500122, "rewards/margins": -0.09126889705657959, "rewards/rejected": 0.9852900505065918, "step": 3321 }, { "epoch": 1.79, "learning_rate": 3.683654021444065e-08, "logits/chosen": -2.117142915725708, "logits/rejected": -2.3366940021514893, "logps/chosen": -4.23818826675415, "logps/rejected": -3.952751398086548, "loss": 0.6989, "rewards/accuracies": 0.0, "rewards/chosen": 0.9663143157958984, "rewards/margins": -0.011391818523406982, "rewards/rejected": 0.9777061343193054, "step": 3322 }, { "epoch": 1.79, "learning_rate": 3.680845380242725e-08, "logits/chosen": -2.0260188579559326, "logits/rejected": -2.024545669555664, "logps/chosen": -0.3112720847129822, "logps/rejected": -3.855388641357422, "loss": 0.4884, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380062222480774, "rewards/margins": 0.4624139964580536, "rewards/rejected": 0.4755922257900238, "step": 3323 }, { "epoch": 1.79, "learning_rate": 3.678037186355435e-08, "logits/chosen": -2.0137157440185547, "logits/rejected": -1.9847160577774048, "logps/chosen": -8.632630348205566, "logps/rejected": -5.325451374053955, "loss": 0.4255, "rewards/accuracies": 1.0, "rewards/chosen": 1.3965028524398804, "rewards/margins": 0.6342241168022156, "rewards/rejected": 0.7622787356376648, "step": 3324 }, { "epoch": 1.79, "learning_rate": 3.675229440734428e-08, "logits/chosen": -1.9636503458023071, "logits/rejected": -2.26381778717041, "logps/chosen": -0.1946420967578888, "logps/rejected": -0.2029421031475067, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9461981058120728, "rewards/margins": 0.02747488021850586, "rewards/rejected": 0.9187232255935669, "step": 3325 }, { "epoch": 1.79, "learning_rate": 3.672422144331785e-08, "logits/chosen": -2.0616636276245117, "logits/rejected": -2.0645081996917725, "logps/chosen": -4.6949968338012695, "logps/rejected": -1.8004412651062012, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 1.392113447189331, "rewards/margins": 0.4676409959793091, "rewards/rejected": 0.924472451210022, "step": 3326 }, { "epoch": 1.79, "learning_rate": 3.669615298099438e-08, "logits/chosen": -2.08634352684021, "logits/rejected": -2.2577931880950928, "logps/chosen": -27.46126937866211, "logps/rejected": -28.182952880859375, "loss": 0.7359, "rewards/accuracies": 0.0, "rewards/chosen": 0.666446328163147, "rewards/margins": -0.08379173278808594, "rewards/rejected": 0.7502380609512329, "step": 3327 }, { "epoch": 1.8, "learning_rate": 3.666808902989164e-08, "logits/chosen": -2.051926851272583, "logits/rejected": -2.063054323196411, "logps/chosen": -6.81625509262085, "logps/rejected": -2.2728233337402344, "loss": 0.4918, "rewards/accuracies": 1.0, "rewards/chosen": 1.2177248001098633, "rewards/margins": 0.45379263162612915, "rewards/rejected": 0.7639321684837341, "step": 3328 }, { "epoch": 1.8, "learning_rate": 3.6640029599525865e-08, "logits/chosen": -1.9466042518615723, "logits/rejected": -2.255063056945801, "logps/chosen": -0.759135365486145, "logps/rejected": -0.9368021488189697, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.8940868377685547, "rewards/margins": -0.00384443998336792, "rewards/rejected": 0.8979312777519226, "step": 3329 }, { "epoch": 1.8, "learning_rate": 3.6611974699411796e-08, "logits/chosen": -2.044412851333618, "logits/rejected": -2.0356547832489014, "logps/chosen": -3.7904675006866455, "logps/rejected": -7.207775115966797, "loss": 0.2801, "rewards/accuracies": 1.0, "rewards/chosen": 1.5856600999832153, "rewards/margins": 1.1292624473571777, "rewards/rejected": 0.4563976228237152, "step": 3330 }, { "epoch": 1.8, "learning_rate": 3.658392433906259e-08, "logits/chosen": -2.2847397327423096, "logits/rejected": -2.129913330078125, "logps/chosen": -28.713973999023438, "logps/rejected": -3.3211305141448975, "loss": 0.1926, "rewards/accuracies": 1.0, "rewards/chosen": 2.128277540206909, "rewards/margins": 1.5492537021636963, "rewards/rejected": 0.5790237784385681, "step": 3331 }, { "epoch": 1.8, "learning_rate": 3.6555878527989895e-08, "logits/chosen": -2.1159796714782715, "logits/rejected": -2.3013007640838623, "logps/chosen": -0.1978982388973236, "logps/rejected": -0.23310977220535278, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.8402695655822754, "rewards/margins": 0.03271615505218506, "rewards/rejected": 0.8075534105300903, "step": 3332 }, { "epoch": 1.8, "learning_rate": 3.652783727570379e-08, "logits/chosen": -1.9927548170089722, "logits/rejected": -2.252040386199951, "logps/chosen": -3.2683753967285156, "logps/rejected": -0.4327021837234497, "loss": 0.7401, "rewards/accuracies": 0.0, "rewards/chosen": 0.8778250813484192, "rewards/margins": -0.09180498123168945, "rewards/rejected": 0.9696300625801086, "step": 3333 }, { "epoch": 1.8, "learning_rate": 3.6499800591712837e-08, "logits/chosen": -2.055497407913208, "logits/rejected": -2.0625197887420654, "logps/chosen": -1.3315865993499756, "logps/rejected": -2.013049364089966, "loss": 0.5081, "rewards/accuracies": 1.0, "rewards/chosen": 1.0534673929214478, "rewards/margins": 0.4123053550720215, "rewards/rejected": 0.6411620378494263, "step": 3334 }, { "epoch": 1.8, "learning_rate": 3.647176848552403e-08, "logits/chosen": -2.1028175354003906, "logits/rejected": -2.0897395610809326, "logps/chosen": -14.434388160705566, "logps/rejected": -5.977623462677002, "loss": 0.2439, "rewards/accuracies": 1.0, "rewards/chosen": 1.7230538129806519, "rewards/margins": 1.2865791320800781, "rewards/rejected": 0.43647465109825134, "step": 3335 }, { "epoch": 1.8, "learning_rate": 3.644374096664282e-08, "logits/chosen": -2.1098403930664062, "logits/rejected": -2.1081044673919678, "logps/chosen": -1.5680135488510132, "logps/rejected": -2.2821340560913086, "loss": 0.645, "rewards/accuracies": 1.0, "rewards/chosen": 1.1179845333099365, "rewards/margins": 0.09880375862121582, "rewards/rejected": 1.0191807746887207, "step": 3336 }, { "epoch": 1.8, "learning_rate": 3.6415718044573087e-08, "logits/chosen": -1.9937679767608643, "logits/rejected": -2.25500750541687, "logps/chosen": -0.38915377855300903, "logps/rejected": -0.4539935886859894, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.8837701082229614, "rewards/margins": 0.012671113014221191, "rewards/rejected": 0.8710989952087402, "step": 3337 }, { "epoch": 1.8, "learning_rate": 3.638769972881718e-08, "logits/chosen": -2.1551456451416016, "logits/rejected": -2.1332180500030518, "logps/chosen": -8.462630271911621, "logps/rejected": -2.185378313064575, "loss": 0.2747, "rewards/accuracies": 1.0, "rewards/chosen": 1.8597363233566284, "rewards/margins": 1.1516493558883667, "rewards/rejected": 0.7080869674682617, "step": 3338 }, { "epoch": 1.8, "learning_rate": 3.6359686028875845e-08, "logits/chosen": -2.040329694747925, "logits/rejected": -2.2588844299316406, "logps/chosen": -1.148464560508728, "logps/rejected": -1.1029709577560425, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.9859822392463684, "rewards/margins": -0.005215287208557129, "rewards/rejected": 0.9911975264549255, "step": 3339 }, { "epoch": 1.8, "learning_rate": 3.6331676954248326e-08, "logits/chosen": -2.1321542263031006, "logits/rejected": -2.1271934509277344, "logps/chosen": -4.157869338989258, "logps/rejected": -4.952708721160889, "loss": 0.3475, "rewards/accuracies": 1.0, "rewards/chosen": 1.4229446649551392, "rewards/margins": 0.8783493638038635, "rewards/rejected": 0.5445953011512756, "step": 3340 }, { "epoch": 1.8, "learning_rate": 3.630367251443224e-08, "logits/chosen": -1.9941506385803223, "logits/rejected": -1.9954389333724976, "logps/chosen": -5.79459285736084, "logps/rejected": -1.0196245908737183, "loss": 0.4243, "rewards/accuracies": 1.0, "rewards/chosen": 1.261472463607788, "rewards/margins": 0.6375935673713684, "rewards/rejected": 0.6238788962364197, "step": 3341 }, { "epoch": 1.8, "learning_rate": 3.627567271892364e-08, "logits/chosen": -2.2230305671691895, "logits/rejected": -2.312818765640259, "logps/chosen": -15.181529998779297, "logps/rejected": -10.490060806274414, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.8303882479667664, "rewards/margins": 0.02405107021331787, "rewards/rejected": 0.8063371777534485, "step": 3342 }, { "epoch": 1.8, "learning_rate": 3.6247677577217055e-08, "logits/chosen": -2.1295413970947266, "logits/rejected": -2.127723217010498, "logps/chosen": -9.149394989013672, "logps/rejected": -1.7401416301727295, "loss": 0.5083, "rewards/accuracies": 1.0, "rewards/chosen": 1.3222931623458862, "rewards/margins": 0.41178053617477417, "rewards/rejected": 0.9105126261711121, "step": 3343 }, { "epoch": 1.8, "learning_rate": 3.6219687098805366e-08, "logits/chosen": -2.0041956901550293, "logits/rejected": -2.003474473953247, "logps/chosen": -5.949033260345459, "logps/rejected": -15.582645416259766, "loss": 0.4653, "rewards/accuracies": 1.0, "rewards/chosen": 0.8574709296226501, "rewards/margins": 0.5234148502349854, "rewards/rejected": 0.3340561091899872, "step": 3344 }, { "epoch": 1.8, "learning_rate": 3.619170129317992e-08, "logits/chosen": -2.100156545639038, "logits/rejected": -2.103168487548828, "logps/chosen": -3.0388906002044678, "logps/rejected": -3.1395809650421143, "loss": 0.5783, "rewards/accuracies": 1.0, "rewards/chosen": 1.115642786026001, "rewards/margins": 0.24470371007919312, "rewards/rejected": 0.8709390759468079, "step": 3345 }, { "epoch": 1.8, "learning_rate": 3.616372016983044e-08, "logits/chosen": -1.954271674156189, "logits/rejected": -2.255786418914795, "logps/chosen": -0.2717980742454529, "logps/rejected": -0.3007899820804596, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.9155449271202087, "rewards/margins": 0.0043790340423583984, "rewards/rejected": 0.9111658930778503, "step": 3346 }, { "epoch": 1.81, "learning_rate": 3.613574373824512e-08, "logits/chosen": -2.0807812213897705, "logits/rejected": -2.073385238647461, "logps/chosen": -13.450305938720703, "logps/rejected": -5.822713375091553, "loss": 0.5227, "rewards/accuracies": 1.0, "rewards/chosen": 1.0682048797607422, "rewards/margins": 0.3759862184524536, "rewards/rejected": 0.6922186613082886, "step": 3347 }, { "epoch": 1.81, "learning_rate": 3.61077720079105e-08, "logits/chosen": -2.091144323348999, "logits/rejected": -2.097566843032837, "logps/chosen": -1.5200732946395874, "logps/rejected": -2.3055167198181152, "loss": 0.5048, "rewards/accuracies": 1.0, "rewards/chosen": 1.077238917350769, "rewards/margins": 0.42047834396362305, "rewards/rejected": 0.656760573387146, "step": 3348 }, { "epoch": 1.81, "learning_rate": 3.607980498831156e-08, "logits/chosen": -2.0995066165924072, "logits/rejected": -2.1065101623535156, "logps/chosen": -2.331281900405884, "logps/rejected": -5.840524673461914, "loss": 0.3604, "rewards/accuracies": 1.0, "rewards/chosen": 1.2434602975845337, "rewards/margins": 0.8350454568862915, "rewards/rejected": 0.4084148406982422, "step": 3349 }, { "epoch": 1.81, "learning_rate": 3.605184268893169e-08, "logits/chosen": -2.1329357624053955, "logits/rejected": -2.047084331512451, "logps/chosen": -14.633277893066406, "logps/rejected": -4.8522186279296875, "loss": 0.2674, "rewards/accuracies": 1.0, "rewards/chosen": 1.6721805334091187, "rewards/margins": 1.182356357574463, "rewards/rejected": 0.48982420563697815, "step": 3350 }, { "epoch": 1.81, "learning_rate": 3.6023885119252675e-08, "logits/chosen": -2.0730671882629395, "logits/rejected": -2.073497772216797, "logps/chosen": -1.3119726181030273, "logps/rejected": -3.3658804893493652, "loss": 0.5732, "rewards/accuracies": 1.0, "rewards/chosen": 0.8258979916572571, "rewards/margins": 0.2563313841819763, "rewards/rejected": 0.5695666074752808, "step": 3351 }, { "epoch": 1.81, "learning_rate": 3.599593228875465e-08, "logits/chosen": -2.121857166290283, "logits/rejected": -2.12800931930542, "logps/chosen": -1.786224603652954, "logps/rejected": -2.505464553833008, "loss": 0.4193, "rewards/accuracies": 1.0, "rewards/chosen": 1.464514136314392, "rewards/margins": 0.6520852446556091, "rewards/rejected": 0.812428891658783, "step": 3352 }, { "epoch": 1.81, "learning_rate": 3.5967984206916216e-08, "logits/chosen": -2.018773317337036, "logits/rejected": -2.3396551609039307, "logps/chosen": -1.7143211364746094, "logps/rejected": -1.8752391338348389, "loss": 0.6994, "rewards/accuracies": 0.0, "rewards/chosen": 1.095172643661499, "rewards/margins": -0.012485504150390625, "rewards/rejected": 1.1076581478118896, "step": 3353 }, { "epoch": 1.81, "learning_rate": 3.5940040883214326e-08, "logits/chosen": -1.9962118864059448, "logits/rejected": -1.9842957258224487, "logps/chosen": -0.9944089651107788, "logps/rejected": -3.2374393939971924, "loss": 0.5491, "rewards/accuracies": 1.0, "rewards/chosen": 1.0093668699264526, "rewards/margins": 0.3123703598976135, "rewards/rejected": 0.6969965100288391, "step": 3354 }, { "epoch": 1.81, "learning_rate": 3.5912102327124307e-08, "logits/chosen": -2.0151209831237793, "logits/rejected": -2.0192325115203857, "logps/chosen": -2.908212661743164, "logps/rejected": -0.8066710233688354, "loss": 0.6311, "rewards/accuracies": 1.0, "rewards/chosen": 1.1270421743392944, "rewards/margins": 0.1282007098197937, "rewards/rejected": 0.9988414645195007, "step": 3355 }, { "epoch": 1.81, "learning_rate": 3.588416854811988e-08, "logits/chosen": -2.1406233310699463, "logits/rejected": -2.3124239444732666, "logps/chosen": -0.42519471049308777, "logps/rejected": -0.42413854598999023, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589059114456177, "rewards/margins": 0.01812124252319336, "rewards/rejected": 0.8407846689224243, "step": 3356 }, { "epoch": 1.81, "learning_rate": 3.585623955567316e-08, "logits/chosen": -2.012049436569214, "logits/rejected": -2.244506359100342, "logps/chosen": -1.0586782693862915, "logps/rejected": -1.237054467201233, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.6685661673545837, "rewards/margins": 0.018730580806732178, "rewards/rejected": 0.6498355865478516, "step": 3357 }, { "epoch": 1.81, "learning_rate": 3.582831535925464e-08, "logits/chosen": -2.1964240074157715, "logits/rejected": -2.2065205574035645, "logps/chosen": -2.6917166709899902, "logps/rejected": -5.380248069763184, "loss": 0.3101, "rewards/accuracies": 1.0, "rewards/chosen": 1.3464405536651611, "rewards/margins": 1.0116467475891113, "rewards/rejected": 0.3347937762737274, "step": 3358 }, { "epoch": 1.81, "learning_rate": 3.580039596833314e-08, "logits/chosen": -2.072857141494751, "logits/rejected": -1.9766098260879517, "logps/chosen": -25.25356674194336, "logps/rejected": -2.265880584716797, "loss": 0.2, "rewards/accuracies": 1.0, "rewards/chosen": 2.035576581954956, "rewards/margins": 1.507516860961914, "rewards/rejected": 0.5280597805976868, "step": 3359 }, { "epoch": 1.81, "learning_rate": 3.577248139237593e-08, "logits/chosen": -2.0563762187957764, "logits/rejected": -2.0501275062561035, "logps/chosen": -4.97829532623291, "logps/rejected": -0.858292818069458, "loss": 0.2585, "rewards/accuracies": 1.0, "rewards/chosen": 2.1364078521728516, "rewards/margins": 1.2208149433135986, "rewards/rejected": 0.9155928492546082, "step": 3360 }, { "epoch": 1.81, "learning_rate": 3.5744571640848597e-08, "logits/chosen": -2.08699631690979, "logits/rejected": -2.105173349380493, "logps/chosen": -1.9221495389938354, "logps/rejected": -7.155727863311768, "loss": 0.533, "rewards/accuracies": 1.0, "rewards/chosen": 1.0967261791229248, "rewards/margins": 0.35100990533828735, "rewards/rejected": 0.7457162737846375, "step": 3361 }, { "epoch": 1.81, "learning_rate": 3.5716666723215075e-08, "logits/chosen": -2.125856637954712, "logits/rejected": -2.13128924369812, "logps/chosen": -1.7438143491744995, "logps/rejected": -1.5502272844314575, "loss": 0.4642, "rewards/accuracies": 1.0, "rewards/chosen": 1.1652027368545532, "rewards/margins": 0.5263976454734802, "rewards/rejected": 0.638805091381073, "step": 3362 }, { "epoch": 1.81, "learning_rate": 3.56887666489377e-08, "logits/chosen": -1.975062608718872, "logits/rejected": -2.2187178134918213, "logps/chosen": -1.2171484231948853, "logps/rejected": -1.0717757940292358, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.7984884977340698, "rewards/margins": 0.012500107288360596, "rewards/rejected": 0.7859883904457092, "step": 3363 }, { "epoch": 1.81, "learning_rate": 3.5660871427477146e-08, "logits/chosen": -2.168078660964966, "logits/rejected": -2.3196475505828857, "logps/chosen": -1.8805702924728394, "logps/rejected": -1.3402512073516846, "loss": 0.7604, "rewards/accuracies": 0.0, "rewards/chosen": 0.891262948513031, "rewards/margins": -0.13029175996780396, "rewards/rejected": 1.021554708480835, "step": 3364 }, { "epoch": 1.81, "learning_rate": 3.563298106829244e-08, "logits/chosen": -2.0429327487945557, "logits/rejected": -2.0407097339630127, "logps/chosen": -0.6169312596321106, "logps/rejected": -6.284539699554443, "loss": 0.4043, "rewards/accuracies": 1.0, "rewards/chosen": 1.122857689857483, "rewards/margins": 0.6965552568435669, "rewards/rejected": 0.42630240321159363, "step": 3365 }, { "epoch": 1.82, "learning_rate": 3.560509558084097e-08, "logits/chosen": -2.019261121749878, "logits/rejected": -2.020411491394043, "logps/chosen": -1.0172662734985352, "logps/rejected": -2.700352668762207, "loss": 0.5166, "rewards/accuracies": 1.0, "rewards/chosen": 1.126227617263794, "rewards/margins": 0.3910127878189087, "rewards/rejected": 0.7352148294448853, "step": 3366 }, { "epoch": 1.82, "learning_rate": 3.5577214974578464e-08, "logits/chosen": -2.0959479808807373, "logits/rejected": -2.0951650142669678, "logps/chosen": -3.094647169113159, "logps/rejected": -4.2835917472839355, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.408347249031067, "rewards/margins": 0.8526729345321655, "rewards/rejected": 0.5556743144989014, "step": 3367 }, { "epoch": 1.82, "learning_rate": 3.554933925895898e-08, "logits/chosen": -2.0192766189575195, "logits/rejected": -2.019540309906006, "logps/chosen": -2.1188292503356934, "logps/rejected": -4.5079169273376465, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": 1.5031753778457642, "rewards/margins": 0.900188684463501, "rewards/rejected": 0.6029866933822632, "step": 3368 }, { "epoch": 1.82, "learning_rate": 3.5521468443434964e-08, "logits/chosen": -2.1299352645874023, "logits/rejected": -2.131976366043091, "logps/chosen": -0.6806013584136963, "logps/rejected": -7.017448425292969, "loss": 0.4478, "rewards/accuracies": 1.0, "rewards/chosen": 0.9429812431335449, "rewards/margins": 0.5712896585464478, "rewards/rejected": 0.37169161438941956, "step": 3369 }, { "epoch": 1.82, "learning_rate": 3.549360253745717e-08, "logits/chosen": -2.0837745666503906, "logits/rejected": -2.252983570098877, "logps/chosen": -3.822237014770508, "logps/rejected": -1.2890028953552246, "loss": 0.7297, "rewards/accuracies": 0.0, "rewards/chosen": 0.6604198813438416, "rewards/margins": -0.07180655002593994, "rewards/rejected": 0.7322264313697815, "step": 3370 }, { "epoch": 1.82, "learning_rate": 3.546574155047466e-08, "logits/chosen": -2.045762538909912, "logits/rejected": -2.043520927429199, "logps/chosen": -7.75359582901001, "logps/rejected": -2.3413755893707275, "loss": 0.4253, "rewards/accuracies": 1.0, "rewards/chosen": 1.3764574527740479, "rewards/margins": 0.6346696019172668, "rewards/rejected": 0.741787850856781, "step": 3371 }, { "epoch": 1.82, "learning_rate": 3.543788549193488e-08, "logits/chosen": -2.042280673980713, "logits/rejected": -2.3239002227783203, "logps/chosen": -0.5046497583389282, "logps/rejected": -3.1446433067321777, "loss": 0.5304, "rewards/accuracies": 1.0, "rewards/chosen": 0.9374604225158691, "rewards/margins": 0.357185959815979, "rewards/rejected": 0.5802744626998901, "step": 3372 }, { "epoch": 1.82, "learning_rate": 3.541003437128358e-08, "logits/chosen": -2.1022610664367676, "logits/rejected": -2.0874688625335693, "logps/chosen": -16.917186737060547, "logps/rejected": -3.4617879390716553, "loss": 0.2887, "rewards/accuracies": 1.0, "rewards/chosen": 1.553918480873108, "rewards/margins": 1.094491720199585, "rewards/rejected": 0.45942679047584534, "step": 3373 }, { "epoch": 1.82, "learning_rate": 3.538218819796483e-08, "logits/chosen": -2.032102108001709, "logits/rejected": -2.3085622787475586, "logps/chosen": -0.6994953155517578, "logps/rejected": -11.34011459350586, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.916381299495697, "rewards/margins": 0.04267483949661255, "rewards/rejected": 0.8737064599990845, "step": 3374 }, { "epoch": 1.82, "learning_rate": 3.5354346981421015e-08, "logits/chosen": -2.0610880851745605, "logits/rejected": -2.058469533920288, "logps/chosen": -0.8082669377326965, "logps/rejected": -6.49722146987915, "loss": 0.4273, "rewards/accuracies": 1.0, "rewards/chosen": 1.1095783710479736, "rewards/margins": 0.6291524767875671, "rewards/rejected": 0.4804258942604065, "step": 3375 }, { "epoch": 1.82, "learning_rate": 3.532651073109288e-08, "logits/chosen": -2.1694817543029785, "logits/rejected": -2.174062490463257, "logps/chosen": -2.244840145111084, "logps/rejected": -3.8171136379241943, "loss": 0.5073, "rewards/accuracies": 1.0, "rewards/chosen": 1.1189205646514893, "rewards/margins": 0.41442233324050903, "rewards/rejected": 0.7044982314109802, "step": 3376 }, { "epoch": 1.82, "learning_rate": 3.529867945641945e-08, "logits/chosen": -2.116053581237793, "logits/rejected": -2.1492137908935547, "logps/chosen": -2.1466102600097656, "logps/rejected": -6.182119369506836, "loss": 0.6274, "rewards/accuracies": 1.0, "rewards/chosen": 1.0728914737701416, "rewards/margins": 0.136055588722229, "rewards/rejected": 0.9368358850479126, "step": 3377 }, { "epoch": 1.82, "learning_rate": 3.527085316683805e-08, "logits/chosen": -2.1272664070129395, "logits/rejected": -2.300027847290039, "logps/chosen": -1.204660415649414, "logps/rejected": -1.2577846050262451, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.8246626257896423, "rewards/margins": -0.005831778049468994, "rewards/rejected": 0.8304944038391113, "step": 3378 }, { "epoch": 1.82, "learning_rate": 3.524303187178438e-08, "logits/chosen": -2.077239751815796, "logits/rejected": -2.3120226860046387, "logps/chosen": -3.3597872257232666, "logps/rejected": -3.745912551879883, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7647871971130371, "rewards/margins": 0.017435908317565918, "rewards/rejected": 0.7473512887954712, "step": 3379 }, { "epoch": 1.82, "learning_rate": 3.521521558069237e-08, "logits/chosen": -2.0023508071899414, "logits/rejected": -2.3102121353149414, "logps/chosen": -0.4968501329421997, "logps/rejected": -0.5854505300521851, "loss": 0.6797, "rewards/accuracies": 1.0, "rewards/chosen": 0.995742917060852, "rewards/margins": 0.027172863483428955, "rewards/rejected": 0.9685700535774231, "step": 3380 }, { "epoch": 1.82, "learning_rate": 3.518740430299429e-08, "logits/chosen": -2.0354342460632324, "logits/rejected": -2.044066905975342, "logps/chosen": -1.6524521112442017, "logps/rejected": -2.4835548400878906, "loss": 0.4408, "rewards/accuracies": 1.0, "rewards/chosen": 1.2113056182861328, "rewards/margins": 0.5907322764396667, "rewards/rejected": 0.6205733418464661, "step": 3381 }, { "epoch": 1.82, "learning_rate": 3.515959804812073e-08, "logits/chosen": -1.9651546478271484, "logits/rejected": -1.9626089334487915, "logps/chosen": -1.5934420824050903, "logps/rejected": -3.567796468734741, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.86800616979599, "rewards/margins": 0.047098398208618164, "rewards/rejected": 0.8209077715873718, "step": 3382 }, { "epoch": 1.82, "learning_rate": 3.513179682550054e-08, "logits/chosen": -2.019829750061035, "logits/rejected": -2.288083553314209, "logps/chosen": -2.2979624271392822, "logps/rejected": -5.692908763885498, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.908715546131134, "rewards/margins": 0.03027927875518799, "rewards/rejected": 0.878436267375946, "step": 3383 }, { "epoch": 1.83, "learning_rate": 3.510400064456086e-08, "logits/chosen": -2.02902889251709, "logits/rejected": -2.308522939682007, "logps/chosen": -0.27888867259025574, "logps/rejected": -0.2989543080329895, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.7556702494621277, "rewards/margins": -0.007988333702087402, "rewards/rejected": 0.7636585831642151, "step": 3384 }, { "epoch": 1.83, "learning_rate": 3.5076209514727145e-08, "logits/chosen": -2.146652936935425, "logits/rejected": -2.2388689517974854, "logps/chosen": -0.3121703863143921, "logps/rejected": -0.3001447021961212, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.8695304989814758, "rewards/margins": 0.010455965995788574, "rewards/rejected": 0.8590745329856873, "step": 3385 }, { "epoch": 1.83, "learning_rate": 3.504842344542315e-08, "logits/chosen": -2.032911777496338, "logits/rejected": -2.21209716796875, "logps/chosen": -2.5844333171844482, "logps/rejected": -2.482016086578369, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.5524757504463196, "rewards/margins": 0.01841527223587036, "rewards/rejected": 0.5340604782104492, "step": 3386 }, { "epoch": 1.83, "learning_rate": 3.502064244607087e-08, "logits/chosen": -2.0868661403656006, "logits/rejected": -2.0845370292663574, "logps/chosen": -3.3832247257232666, "logps/rejected": -2.4171719551086426, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": 1.1379398107528687, "rewards/margins": 0.3661683201789856, "rewards/rejected": 0.7717714905738831, "step": 3387 }, { "epoch": 1.83, "learning_rate": 3.499286652609059e-08, "logits/chosen": -1.942586064338684, "logits/rejected": -2.2713143825531006, "logps/chosen": -0.733056902885437, "logps/rejected": -0.8662360906600952, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028876066207886, "rewards/margins": 0.07087868452072144, "rewards/rejected": 0.9320089221000671, "step": 3388 }, { "epoch": 1.83, "learning_rate": 3.4965095694900936e-08, "logits/chosen": -2.1277546882629395, "logits/rejected": -2.3515636920928955, "logps/chosen": -1.3527004718780518, "logps/rejected": -1.0612496137619019, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 0.9837218523025513, "rewards/margins": 0.010016441345214844, "rewards/rejected": 0.9737054109573364, "step": 3389 }, { "epoch": 1.83, "learning_rate": 3.493732996191872e-08, "logits/chosen": -2.248310089111328, "logits/rejected": -2.1685538291931152, "logps/chosen": -24.20037078857422, "logps/rejected": -3.7385060787200928, "loss": 0.2187, "rewards/accuracies": 1.0, "rewards/chosen": 1.9900401830673218, "rewards/margins": 1.4085547924041748, "rewards/rejected": 0.581485390663147, "step": 3390 }, { "epoch": 1.83, "learning_rate": 3.490956933655908e-08, "logits/chosen": -2.14471435546875, "logits/rejected": -2.1444451808929443, "logps/chosen": -0.850040078163147, "logps/rejected": -2.037602663040161, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9174420237541199, "rewards/margins": 0.019228696823120117, "rewards/rejected": 0.8982133269309998, "step": 3391 }, { "epoch": 1.83, "learning_rate": 3.488181382823542e-08, "logits/chosen": -2.0293819904327393, "logits/rejected": -2.0312881469726562, "logps/chosen": -4.250886917114258, "logps/rejected": -2.1968350410461426, "loss": 0.3255, "rewards/accuracies": 1.0, "rewards/chosen": 1.559766411781311, "rewards/margins": 0.9551347494125366, "rewards/rejected": 0.6046316623687744, "step": 3392 }, { "epoch": 1.83, "learning_rate": 3.485406344635937e-08, "logits/chosen": -1.994179606437683, "logits/rejected": -1.9934579133987427, "logps/chosen": -4.380393981933594, "logps/rejected": -0.5989739894866943, "loss": 0.7131, "rewards/accuracies": 0.0, "rewards/chosen": 0.8987141847610474, "rewards/margins": -0.039583683013916016, "rewards/rejected": 0.9382978677749634, "step": 3393 }, { "epoch": 1.83, "learning_rate": 3.482631820034084e-08, "logits/chosen": -2.0794126987457275, "logits/rejected": -2.266491174697876, "logps/chosen": -0.9543236494064331, "logps/rejected": -2.1272685527801514, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 0.9382911920547485, "rewards/margins": 0.13251322507858276, "rewards/rejected": 0.8057779669761658, "step": 3394 }, { "epoch": 1.83, "learning_rate": 3.479857809958804e-08, "logits/chosen": -2.050708055496216, "logits/rejected": -2.266270637512207, "logps/chosen": -0.2540372610092163, "logps/rejected": -0.32041555643081665, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8968588709831238, "rewards/margins": 0.016344904899597168, "rewards/rejected": 0.8805139660835266, "step": 3395 }, { "epoch": 1.83, "learning_rate": 3.4770843153507384e-08, "logits/chosen": -2.178316116333008, "logits/rejected": -2.297001361846924, "logps/chosen": -6.138336181640625, "logps/rejected": -1.4594883918762207, "loss": 0.7807, "rewards/accuracies": 0.0, "rewards/chosen": 0.7240010499954224, "rewards/margins": -0.1680660843849182, "rewards/rejected": 0.8920671343803406, "step": 3396 }, { "epoch": 1.83, "learning_rate": 3.474311337150355e-08, "logits/chosen": -2.0174858570098877, "logits/rejected": -2.2689366340637207, "logps/chosen": -1.0107251405715942, "logps/rejected": -1.0823001861572266, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.861790657043457, "rewards/margins": 0.013111650943756104, "rewards/rejected": 0.8486790060997009, "step": 3397 }, { "epoch": 1.83, "learning_rate": 3.4715388762979454e-08, "logits/chosen": -2.0755908489227295, "logits/rejected": -2.213247537612915, "logps/chosen": -7.3681864738464355, "logps/rejected": -5.007288455963135, "loss": 0.6085, "rewards/accuracies": 1.0, "rewards/chosen": 1.020832896232605, "rewards/margins": 0.17708247900009155, "rewards/rejected": 0.8437504172325134, "step": 3398 }, { "epoch": 1.83, "learning_rate": 3.4687669337336314e-08, "logits/chosen": -1.983206033706665, "logits/rejected": -2.2879412174224854, "logps/chosen": -0.6901338696479797, "logps/rejected": -3.391852855682373, "loss": 0.5478, "rewards/accuracies": 1.0, "rewards/chosen": 1.0041558742523193, "rewards/margins": 0.3155466914176941, "rewards/rejected": 0.6886091828346252, "step": 3399 }, { "epoch": 1.83, "learning_rate": 3.4659955103973516e-08, "logits/chosen": -2.0608291625976562, "logits/rejected": -2.3081037998199463, "logps/chosen": -7.620471954345703, "logps/rejected": -8.605158805847168, "loss": 0.7117, "rewards/accuracies": 0.0, "rewards/chosen": 1.0019521713256836, "rewards/margins": -0.03670954704284668, "rewards/rejected": 1.0386617183685303, "step": 3400 }, { "epoch": 1.83, "learning_rate": 3.4632246072288744e-08, "logits/chosen": -2.0672566890716553, "logits/rejected": -2.0768237113952637, "logps/chosen": -1.9325202703475952, "logps/rejected": -2.1770808696746826, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 1.1555683612823486, "rewards/margins": 0.4585435390472412, "rewards/rejected": 0.6970248222351074, "step": 3401 }, { "epoch": 1.83, "learning_rate": 3.4604542251677885e-08, "logits/chosen": -2.138129711151123, "logits/rejected": -2.26054048538208, "logps/chosen": -0.4723312258720398, "logps/rejected": -0.6660217046737671, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.9369446039199829, "rewards/margins": 0.017695605754852295, "rewards/rejected": 0.9192489981651306, "step": 3402 }, { "epoch": 1.84, "learning_rate": 3.457684365153506e-08, "logits/chosen": -2.1970911026000977, "logits/rejected": -2.1965184211730957, "logps/chosen": -2.2328436374664307, "logps/rejected": -4.768148899078369, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444546937942505, "rewards/margins": 0.508944571018219, "rewards/rejected": 0.5355101227760315, "step": 3403 }, { "epoch": 1.84, "learning_rate": 3.4549150281252633e-08, "logits/chosen": -2.0155601501464844, "logits/rejected": -2.015977144241333, "logps/chosen": -0.5085787773132324, "logps/rejected": -4.272429943084717, "loss": 0.5047, "rewards/accuracies": 1.0, "rewards/chosen": 0.837706983089447, "rewards/margins": 0.4208996593952179, "rewards/rejected": 0.4168073236942291, "step": 3404 }, { "epoch": 1.84, "learning_rate": 3.452146215022119e-08, "logits/chosen": -1.9299063682556152, "logits/rejected": -2.252127170562744, "logps/chosen": -3.9107205867767334, "logps/rejected": -3.6642777919769287, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.7194588780403137, "rewards/margins": -0.0018677711486816406, "rewards/rejected": 0.7213266491889954, "step": 3405 }, { "epoch": 1.84, "learning_rate": 3.449377926782954e-08, "logits/chosen": -2.1180925369262695, "logits/rejected": -2.0928425788879395, "logps/chosen": -16.528339385986328, "logps/rejected": -8.01573371887207, "loss": 0.2036, "rewards/accuracies": 1.0, "rewards/chosen": 1.7491035461425781, "rewards/margins": 1.4879891872406006, "rewards/rejected": 0.2611144185066223, "step": 3406 }, { "epoch": 1.84, "learning_rate": 3.446610164346469e-08, "logits/chosen": -2.162881851196289, "logits/rejected": -2.308013916015625, "logps/chosen": -4.021336555480957, "logps/rejected": -3.6378977298736572, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.9030961990356445, "rewards/margins": -0.01868981122970581, "rewards/rejected": 0.9217860102653503, "step": 3407 }, { "epoch": 1.84, "learning_rate": 3.443842928651192e-08, "logits/chosen": -2.1408724784851074, "logits/rejected": -2.295724391937256, "logps/chosen": -7.8432207107543945, "logps/rejected": -4.766922473907471, "loss": 0.7262, "rewards/accuracies": 0.0, "rewards/chosen": 0.7793607115745544, "rewards/margins": -0.06513810157775879, "rewards/rejected": 0.8444988131523132, "step": 3408 }, { "epoch": 1.84, "learning_rate": 3.441076220635467e-08, "logits/chosen": -2.1188809871673584, "logits/rejected": -2.212139844894409, "logps/chosen": -1.1117602586746216, "logps/rejected": -1.281272530555725, "loss": 0.6704, "rewards/accuracies": 1.0, "rewards/chosen": 0.9072234034538269, "rewards/margins": 0.045984625816345215, "rewards/rejected": 0.8612387776374817, "step": 3409 }, { "epoch": 1.84, "learning_rate": 3.438310041237462e-08, "logits/chosen": -2.1607236862182617, "logits/rejected": -2.1672251224517822, "logps/chosen": -2.372856378555298, "logps/rejected": -3.4826035499572754, "loss": 0.3889, "rewards/accuracies": 1.0, "rewards/chosen": 1.2735130786895752, "rewards/margins": 0.743718147277832, "rewards/rejected": 0.5297949314117432, "step": 3410 }, { "epoch": 1.84, "learning_rate": 3.435544391395165e-08, "logits/chosen": -2.140827178955078, "logits/rejected": -2.141517162322998, "logps/chosen": -2.4312450885772705, "logps/rejected": -1.2320327758789062, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 1.0122994184494019, "rewards/margins": 0.0394899845123291, "rewards/rejected": 0.9728094339370728, "step": 3411 }, { "epoch": 1.84, "learning_rate": 3.432779272046383e-08, "logits/chosen": -2.1282644271850586, "logits/rejected": -2.135283946990967, "logps/chosen": -3.0501039028167725, "logps/rejected": -13.870450973510742, "loss": 0.3596, "rewards/accuracies": 1.0, "rewards/chosen": 1.0833828449249268, "rewards/margins": 0.8374788761138916, "rewards/rejected": 0.24590396881103516, "step": 3412 }, { "epoch": 1.84, "learning_rate": 3.430014684128743e-08, "logits/chosen": -1.960497498512268, "logits/rejected": -2.259366989135742, "logps/chosen": -2.112586259841919, "logps/rejected": -2.1223576068878174, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.7599380612373352, "rewards/margins": 0.02342003583908081, "rewards/rejected": 0.7365180253982544, "step": 3413 }, { "epoch": 1.84, "learning_rate": 3.4272506285796976e-08, "logits/chosen": -1.9871952533721924, "logits/rejected": -1.9932059049606323, "logps/chosen": -2.0891270637512207, "logps/rejected": -2.6878609657287598, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0052932500839233, "rewards/margins": 0.25735658407211304, "rewards/rejected": 0.7479366660118103, "step": 3414 }, { "epoch": 1.84, "learning_rate": 3.4244871063365114e-08, "logits/chosen": -2.169168472290039, "logits/rejected": -2.163694381713867, "logps/chosen": -4.269352912902832, "logps/rejected": -4.411762237548828, "loss": 0.6307, "rewards/accuracies": 1.0, "rewards/chosen": 0.8250486254692078, "rewards/margins": 0.12913966178894043, "rewards/rejected": 0.6959089636802673, "step": 3415 }, { "epoch": 1.84, "learning_rate": 3.4217241183362703e-08, "logits/chosen": -2.1308815479278564, "logits/rejected": -2.2935407161712646, "logps/chosen": -0.7504895925521851, "logps/rejected": -0.8445723056793213, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0123569965362549, "rewards/margins": 0.0028623342514038086, "rewards/rejected": 1.009494662284851, "step": 3416 }, { "epoch": 1.84, "learning_rate": 3.4189616655158796e-08, "logits/chosen": -2.053511619567871, "logits/rejected": -2.2514712810516357, "logps/chosen": -0.7757001519203186, "logps/rejected": -0.7379363179206848, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7185236811637878, "rewards/margins": 0.014656305313110352, "rewards/rejected": 0.7038673758506775, "step": 3417 }, { "epoch": 1.84, "learning_rate": 3.4161997488120677e-08, "logits/chosen": -2.1034398078918457, "logits/rejected": -2.1013128757476807, "logps/chosen": -3.877340793609619, "logps/rejected": -1.8299533128738403, "loss": 0.5695, "rewards/accuracies": 1.0, "rewards/chosen": 1.1016649007797241, "rewards/margins": 0.2648288607597351, "rewards/rejected": 0.836836040019989, "step": 3418 }, { "epoch": 1.84, "learning_rate": 3.4134383691613723e-08, "logits/chosen": -1.9502294063568115, "logits/rejected": -2.2209572792053223, "logps/chosen": -0.33853358030319214, "logps/rejected": -0.39853471517562866, "loss": 0.7012, "rewards/accuracies": 0.0, "rewards/chosen": 0.9355301856994629, "rewards/margins": -0.01611196994781494, "rewards/rejected": 0.9516421556472778, "step": 3419 }, { "epoch": 1.84, "learning_rate": 3.410677527500156e-08, "logits/chosen": -2.2305898666381836, "logits/rejected": -2.050079584121704, "logps/chosen": -59.791954040527344, "logps/rejected": -0.3537538945674896, "loss": 0.1653, "rewards/accuracies": 1.0, "rewards/chosen": 2.562213897705078, "rewards/margins": 1.716101884841919, "rewards/rejected": 0.8461119532585144, "step": 3420 }, { "epoch": 1.85, "learning_rate": 3.407917224764597e-08, "logits/chosen": -2.227165699005127, "logits/rejected": -2.079153299331665, "logps/chosen": -36.24016571044922, "logps/rejected": -1.5624916553497314, "loss": 0.173, "rewards/accuracies": 1.0, "rewards/chosen": 2.6353490352630615, "rewards/margins": 1.6667277812957764, "rewards/rejected": 0.9686212539672852, "step": 3421 }, { "epoch": 1.85, "learning_rate": 3.4051574618906887e-08, "logits/chosen": -2.0665132999420166, "logits/rejected": -2.2699756622314453, "logps/chosen": -0.4217653274536133, "logps/rejected": -0.4049485921859741, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.8589740991592407, "rewards/margins": 0.022975146770477295, "rewards/rejected": 0.8359989523887634, "step": 3422 }, { "epoch": 1.85, "learning_rate": 3.4023982398142425e-08, "logits/chosen": -2.0452752113342285, "logits/rejected": -2.3137006759643555, "logps/chosen": -0.5402044057846069, "logps/rejected": -0.5192869901657104, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8248453140258789, "rewards/margins": 0.014494895935058594, "rewards/rejected": 0.8103504180908203, "step": 3423 }, { "epoch": 1.85, "learning_rate": 3.3996395594708905e-08, "logits/chosen": -2.065154790878296, "logits/rejected": -2.2974510192871094, "logps/chosen": -1.205833911895752, "logps/rejected": -1.2223244905471802, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.1128648519515991, "rewards/margins": 0.008043289184570312, "rewards/rejected": 1.1048215627670288, "step": 3424 }, { "epoch": 1.85, "learning_rate": 3.396881421796074e-08, "logits/chosen": -1.9939796924591064, "logits/rejected": -2.2603940963745117, "logps/chosen": -0.2444821298122406, "logps/rejected": -0.2540384829044342, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.7949787378311157, "rewards/margins": -2.300739288330078e-05, "rewards/rejected": 0.795001745223999, "step": 3425 }, { "epoch": 1.85, "learning_rate": 3.394123827725056e-08, "logits/chosen": -1.9908509254455566, "logits/rejected": -1.9932475090026855, "logps/chosen": -1.0482007265090942, "logps/rejected": -5.24780797958374, "loss": 0.5616, "rewards/accuracies": 1.0, "rewards/chosen": 0.804176926612854, "rewards/margins": 0.283108115196228, "rewards/rejected": 0.521068811416626, "step": 3426 }, { "epoch": 1.85, "learning_rate": 3.39136677819291e-08, "logits/chosen": -2.063133716583252, "logits/rejected": -2.111499309539795, "logps/chosen": -3.0068678855895996, "logps/rejected": -18.34589385986328, "loss": 0.2025, "rewards/accuracies": 1.0, "rewards/chosen": 1.5060220956802368, "rewards/margins": 1.494308352470398, "rewards/rejected": 0.011713790707290173, "step": 3427 }, { "epoch": 1.85, "learning_rate": 3.388610274134533e-08, "logits/chosen": -2.0951712131500244, "logits/rejected": -2.331667184829712, "logps/chosen": -7.254693508148193, "logps/rejected": -7.368110656738281, "loss": 0.6969, "rewards/accuracies": 0.0, "rewards/chosen": 0.8973783850669861, "rewards/margins": -0.007394969463348389, "rewards/rejected": 0.9047733545303345, "step": 3428 }, { "epoch": 1.85, "learning_rate": 3.385854316484628e-08, "logits/chosen": -2.044006586074829, "logits/rejected": -2.266972780227661, "logps/chosen": -0.34105783700942993, "logps/rejected": -0.3177418112754822, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.9575470089912415, "rewards/margins": 0.007552146911621094, "rewards/rejected": 0.9499948620796204, "step": 3429 }, { "epoch": 1.85, "learning_rate": 3.3830989061777183e-08, "logits/chosen": -2.14119815826416, "logits/rejected": -2.294715166091919, "logps/chosen": -1.6228716373443604, "logps/rejected": -1.6546423435211182, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.9909810423851013, "rewards/margins": 0.008800268173217773, "rewards/rejected": 0.9821807742118835, "step": 3430 }, { "epoch": 1.85, "learning_rate": 3.380344044148139e-08, "logits/chosen": -2.032468795776367, "logits/rejected": -2.2657508850097656, "logps/chosen": -0.39742401242256165, "logps/rejected": -0.41620826721191406, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.9486549496650696, "rewards/margins": -0.0011728405952453613, "rewards/rejected": 0.9498277902603149, "step": 3431 }, { "epoch": 1.85, "learning_rate": 3.3775897313300415e-08, "logits/chosen": -2.0943214893341064, "logits/rejected": -1.9996929168701172, "logps/chosen": -6.062219142913818, "logps/rejected": -4.762719631195068, "loss": 0.3764, "rewards/accuracies": 1.0, "rewards/chosen": 1.58330237865448, "rewards/margins": 0.7830631732940674, "rewards/rejected": 0.8002392053604126, "step": 3432 }, { "epoch": 1.85, "learning_rate": 3.374835968657388e-08, "logits/chosen": -2.1391005516052246, "logits/rejected": -2.2636399269104004, "logps/chosen": -6.055575370788574, "logps/rejected": -5.387269496917725, "loss": 0.6542, "rewards/accuracies": 1.0, "rewards/chosen": 1.0111180543899536, "rewards/margins": 0.07943123579025269, "rewards/rejected": 0.9316868185997009, "step": 3433 }, { "epoch": 1.85, "learning_rate": 3.372082757063958e-08, "logits/chosen": -2.0509064197540283, "logits/rejected": -2.051067590713501, "logps/chosen": -5.517746448516846, "logps/rejected": -4.558705806732178, "loss": 0.4737, "rewards/accuracies": 1.0, "rewards/chosen": 1.1671333312988281, "rewards/margins": 0.501130223274231, "rewards/rejected": 0.6660031080245972, "step": 3434 }, { "epoch": 1.85, "learning_rate": 3.36933009748334e-08, "logits/chosen": -2.025813341140747, "logits/rejected": -1.9958386421203613, "logps/chosen": -5.636345863342285, "logps/rejected": -3.3042566776275635, "loss": 0.3365, "rewards/accuracies": 1.0, "rewards/chosen": 1.4380011558532715, "rewards/margins": 0.9162817597389221, "rewards/rejected": 0.5217193961143494, "step": 3435 }, { "epoch": 1.85, "learning_rate": 3.366577990848939e-08, "logits/chosen": -2.137171745300293, "logits/rejected": -2.1389946937561035, "logps/chosen": -3.3665688037872314, "logps/rejected": -4.194677352905273, "loss": 0.5603, "rewards/accuracies": 1.0, "rewards/chosen": 1.014917254447937, "rewards/margins": 0.28601670265197754, "rewards/rejected": 0.7289005517959595, "step": 3436 }, { "epoch": 1.85, "learning_rate": 3.3638264380939686e-08, "logits/chosen": -2.0643415451049805, "logits/rejected": -2.3869128227233887, "logps/chosen": -6.22183895111084, "logps/rejected": -19.35566520690918, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.7377741932868958, "rewards/margins": 0.025574207305908203, "rewards/rejected": 0.7121999859809875, "step": 3437 }, { "epoch": 1.85, "learning_rate": 3.3610754401514584e-08, "logits/chosen": -2.200761556625366, "logits/rejected": -2.2012712955474854, "logps/chosen": -4.161013126373291, "logps/rejected": -4.3253936767578125, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 1.4485143423080444, "rewards/margins": 0.4492618441581726, "rewards/rejected": 0.9992524981498718, "step": 3438 }, { "epoch": 1.85, "learning_rate": 3.358324997954249e-08, "logits/chosen": -2.03606915473938, "logits/rejected": -2.2483303546905518, "logps/chosen": -10.907832145690918, "logps/rejected": -11.020537376403809, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.9886061549186707, "rewards/margins": -0.010440647602081299, "rewards/rejected": 0.999046802520752, "step": 3439 }, { "epoch": 1.86, "learning_rate": 3.355575112434991e-08, "logits/chosen": -2.039022445678711, "logits/rejected": -2.3264718055725098, "logps/chosen": -4.125195503234863, "logps/rejected": -4.086737155914307, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.566944420337677, "rewards/margins": 0.0254402756690979, "rewards/rejected": 0.5415041446685791, "step": 3440 }, { "epoch": 1.86, "learning_rate": 3.352825784526148e-08, "logits/chosen": -2.1132919788360596, "logits/rejected": -2.332976818084717, "logps/chosen": -2.2258036136627197, "logps/rejected": -6.391336917877197, "loss": 0.6477, "rewards/accuracies": 1.0, "rewards/chosen": 1.007612705230713, "rewards/margins": 0.09309148788452148, "rewards/rejected": 0.9145212173461914, "step": 3441 }, { "epoch": 1.86, "learning_rate": 3.35007701515999e-08, "logits/chosen": -2.080336809158325, "logits/rejected": -2.333343029022217, "logps/chosen": -1.304947018623352, "logps/rejected": -1.1215490102767944, "loss": 0.687, "rewards/accuracies": 1.0, "rewards/chosen": 0.7968748211860657, "rewards/margins": 0.012325406074523926, "rewards/rejected": 0.7845494151115417, "step": 3442 }, { "epoch": 1.86, "learning_rate": 3.347328805268605e-08, "logits/chosen": -2.099372386932373, "logits/rejected": -2.266805648803711, "logps/chosen": -9.661866188049316, "logps/rejected": -0.9593100547790527, "loss": 0.7544, "rewards/accuracies": 0.0, "rewards/chosen": 0.8073493242263794, "rewards/margins": -0.11893433332443237, "rewards/rejected": 0.9262836575508118, "step": 3443 }, { "epoch": 1.86, "learning_rate": 3.344581155783886e-08, "logits/chosen": -2.010319709777832, "logits/rejected": -2.242912530899048, "logps/chosen": -0.3459129333496094, "logps/rejected": -0.38309112191200256, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 1.0048236846923828, "rewards/margins": 0.012057006359100342, "rewards/rejected": 0.9927666783332825, "step": 3444 }, { "epoch": 1.86, "learning_rate": 3.341834067637539e-08, "logits/chosen": -2.0814592838287354, "logits/rejected": -2.082223415374756, "logps/chosen": -0.4221193194389343, "logps/rejected": -3.0181002616882324, "loss": 0.5067, "rewards/accuracies": 1.0, "rewards/chosen": 1.077112078666687, "rewards/margins": 0.4157823920249939, "rewards/rejected": 0.6613296866416931, "step": 3445 }, { "epoch": 1.86, "learning_rate": 3.339087541761074e-08, "logits/chosen": -2.0802857875823975, "logits/rejected": -2.0869040489196777, "logps/chosen": -2.2618725299835205, "logps/rejected": -3.5949671268463135, "loss": 0.4354, "rewards/accuracies": 1.0, "rewards/chosen": 1.201210856437683, "rewards/margins": 0.605829119682312, "rewards/rejected": 0.5953817367553711, "step": 3446 }, { "epoch": 1.86, "learning_rate": 3.336341579085819e-08, "logits/chosen": -2.09743070602417, "logits/rejected": -2.0966484546661377, "logps/chosen": -0.9784443974494934, "logps/rejected": -3.688854932785034, "loss": 0.5044, "rewards/accuracies": 1.0, "rewards/chosen": 1.077239751815796, "rewards/margins": 0.4216027855873108, "rewards/rejected": 0.6556369662284851, "step": 3447 }, { "epoch": 1.86, "learning_rate": 3.3335961805429024e-08, "logits/chosen": -2.157989501953125, "logits/rejected": -2.3117964267730713, "logps/chosen": -0.9281704425811768, "logps/rejected": -0.9901173114776611, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9770434498786926, "rewards/margins": 0.00412064790725708, "rewards/rejected": 0.9729228019714355, "step": 3448 }, { "epoch": 1.86, "learning_rate": 3.3308513470632705e-08, "logits/chosen": -2.0939042568206787, "logits/rejected": -2.098233938217163, "logps/chosen": -2.270596742630005, "logps/rejected": -3.3210995197296143, "loss": 0.503, "rewards/accuracies": 1.0, "rewards/chosen": 1.026804804801941, "rewards/margins": 0.42503589391708374, "rewards/rejected": 0.6017689108848572, "step": 3449 }, { "epoch": 1.86, "learning_rate": 3.328107079577669e-08, "logits/chosen": -2.151221513748169, "logits/rejected": -2.3036093711853027, "logps/chosen": -0.40023738145828247, "logps/rejected": -16.273075103759766, "loss": 0.7052, "rewards/accuracies": 0.0, "rewards/chosen": 0.8431779146194458, "rewards/margins": -0.024055957794189453, "rewards/rejected": 0.8672338724136353, "step": 3450 }, { "epoch": 1.86, "learning_rate": 3.325363379016657e-08, "logits/chosen": -2.0209333896636963, "logits/rejected": -2.030452251434326, "logps/chosen": -1.6783920526504517, "logps/rejected": -2.517442464828491, "loss": 0.4319, "rewards/accuracies": 1.0, "rewards/chosen": 1.2672232389450073, "rewards/margins": 0.6158463358879089, "rewards/rejected": 0.6513769030570984, "step": 3451 }, { "epoch": 1.86, "learning_rate": 3.3226202463105974e-08, "logits/chosen": -2.1225814819335938, "logits/rejected": -2.1243844032287598, "logps/chosen": -3.5591626167297363, "logps/rejected": -8.047438621520996, "loss": 0.4854, "rewards/accuracies": 1.0, "rewards/chosen": 1.2647762298583984, "rewards/margins": 0.4702231287956238, "rewards/rejected": 0.7945531010627747, "step": 3452 }, { "epoch": 1.86, "learning_rate": 3.3198776823896666e-08, "logits/chosen": -2.229917287826538, "logits/rejected": -2.121368885040283, "logps/chosen": -28.48349952697754, "logps/rejected": -4.415947437286377, "loss": 0.1494, "rewards/accuracies": 1.0, "rewards/chosen": 2.3196780681610107, "rewards/margins": 1.8257029056549072, "rewards/rejected": 0.4939752221107483, "step": 3453 }, { "epoch": 1.86, "learning_rate": 3.317135688183843e-08, "logits/chosen": -2.095919370651245, "logits/rejected": -2.308194875717163, "logps/chosen": -0.47939038276672363, "logps/rejected": -0.5674296617507935, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9322345852851868, "rewards/margins": 0.018735110759735107, "rewards/rejected": 0.9134994745254517, "step": 3454 }, { "epoch": 1.86, "learning_rate": 3.3143942646229126e-08, "logits/chosen": -2.0359132289886475, "logits/rejected": -2.034010648727417, "logps/chosen": -6.578615188598633, "logps/rejected": -4.509599208831787, "loss": 0.3698, "rewards/accuracies": 1.0, "rewards/chosen": 1.2467859983444214, "rewards/margins": 0.8042539358139038, "rewards/rejected": 0.4425320327281952, "step": 3455 }, { "epoch": 1.86, "learning_rate": 3.311653412636468e-08, "logits/chosen": -2.13710880279541, "logits/rejected": -2.2946791648864746, "logps/chosen": -0.7994732856750488, "logps/rejected": -1.2693651914596558, "loss": 0.6341, "rewards/accuracies": 1.0, "rewards/chosen": 0.9372434616088867, "rewards/margins": 0.12187564373016357, "rewards/rejected": 0.8153678178787231, "step": 3456 }, { "epoch": 1.86, "learning_rate": 3.3089131331539116e-08, "logits/chosen": -2.1356422901153564, "logits/rejected": -2.13140606880188, "logps/chosen": -7.626712799072266, "logps/rejected": -3.251675605773926, "loss": 0.3527, "rewards/accuracies": 1.0, "rewards/chosen": 1.509765625, "rewards/margins": 0.860687255859375, "rewards/rejected": 0.649078369140625, "step": 3457 }, { "epoch": 1.87, "learning_rate": 3.306173427104448e-08, "logits/chosen": -2.035698890686035, "logits/rejected": -2.3021888732910156, "logps/chosen": -0.46730855107307434, "logps/rejected": -0.4092721939086914, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 0.9497774243354797, "rewards/margins": 0.01849079132080078, "rewards/rejected": 0.931286633014679, "step": 3458 }, { "epoch": 1.87, "learning_rate": 3.303434295417087e-08, "logits/chosen": -2.1586458683013916, "logits/rejected": -2.236556053161621, "logps/chosen": -5.225546360015869, "logps/rejected": -16.125513076782227, "loss": 0.5009, "rewards/accuracies": 1.0, "rewards/chosen": 1.130196213722229, "rewards/margins": 0.4305650591850281, "rewards/rejected": 0.6996311545372009, "step": 3459 }, { "epoch": 1.87, "learning_rate": 3.300695739020645e-08, "logits/chosen": -2.0418004989624023, "logits/rejected": -2.108944892883301, "logps/chosen": -3.197310447692871, "logps/rejected": -25.404115676879883, "loss": 0.173, "rewards/accuracies": 1.0, "rewards/chosen": 1.6157493591308594, "rewards/margins": 1.6666017770767212, "rewards/rejected": -0.050852395594120026, "step": 3460 }, { "epoch": 1.87, "learning_rate": 3.2979577588437424e-08, "logits/chosen": -2.0531256198883057, "logits/rejected": -2.270002841949463, "logps/chosen": -0.3768768012523651, "logps/rejected": -0.4221760034561157, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8227919936180115, "rewards/margins": 0.004264950752258301, "rewards/rejected": 0.8185270428657532, "step": 3461 }, { "epoch": 1.87, "learning_rate": 3.2952203558148064e-08, "logits/chosen": -2.2046756744384766, "logits/rejected": -2.205256938934326, "logps/chosen": -0.9740638136863708, "logps/rejected": -6.179562568664551, "loss": 0.4413, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465058088302612, "rewards/margins": 0.5891726016998291, "rewards/rejected": 0.45733317732810974, "step": 3462 }, { "epoch": 1.87, "learning_rate": 3.292483530862066e-08, "logits/chosen": -2.1472041606903076, "logits/rejected": -2.304616689682007, "logps/chosen": -25.26124382019043, "logps/rejected": -8.191873550415039, "loss": 0.6125, "rewards/accuracies": 1.0, "rewards/chosen": 1.0875910520553589, "rewards/margins": 0.1683863401412964, "rewards/rejected": 0.9192047119140625, "step": 3463 }, { "epoch": 1.87, "learning_rate": 3.2897472849135546e-08, "logits/chosen": -1.9760783910751343, "logits/rejected": -1.9850859642028809, "logps/chosen": -3.3786661624908447, "logps/rejected": -4.104916572570801, "loss": 0.4033, "rewards/accuracies": 1.0, "rewards/chosen": 1.3781999349594116, "rewards/margins": 0.6995508670806885, "rewards/rejected": 0.6786490678787231, "step": 3464 }, { "epoch": 1.87, "learning_rate": 3.28701161889711e-08, "logits/chosen": -1.989227533340454, "logits/rejected": -2.262537717819214, "logps/chosen": -2.1322133541107178, "logps/rejected": -1.9026296138763428, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.8840900659561157, "rewards/margins": 0.02378159761428833, "rewards/rejected": 0.8603084683418274, "step": 3465 }, { "epoch": 1.87, "learning_rate": 3.2842765337403735e-08, "logits/chosen": -2.029989242553711, "logits/rejected": -2.319916009902954, "logps/chosen": -0.7693592309951782, "logps/rejected": -5.04448127746582, "loss": 0.6337, "rewards/accuracies": 1.0, "rewards/chosen": 1.046130895614624, "rewards/margins": 0.12266421318054199, "rewards/rejected": 0.923466682434082, "step": 3466 }, { "epoch": 1.87, "learning_rate": 3.28154203037079e-08, "logits/chosen": -2.0807175636291504, "logits/rejected": -2.090576410293579, "logps/chosen": -3.6209876537323, "logps/rejected": -11.21381950378418, "loss": 0.6665, "rewards/accuracies": 1.0, "rewards/chosen": 0.8555606007575989, "rewards/margins": 0.05393838882446289, "rewards/rejected": 0.801622211933136, "step": 3467 }, { "epoch": 1.87, "learning_rate": 3.2788081097156054e-08, "logits/chosen": -2.0474724769592285, "logits/rejected": -2.281266689300537, "logps/chosen": -0.2777852416038513, "logps/rejected": -0.3421115279197693, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9796268343925476, "rewards/margins": 0.0012968778610229492, "rewards/rejected": 0.9783299565315247, "step": 3468 }, { "epoch": 1.87, "learning_rate": 3.276074772701869e-08, "logits/chosen": -2.0377678871154785, "logits/rejected": -2.3364641666412354, "logps/chosen": -0.41599762439727783, "logps/rejected": -0.38021841645240784, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8487967848777771, "rewards/margins": 0.002057492733001709, "rewards/rejected": 0.8467392921447754, "step": 3469 }, { "epoch": 1.87, "learning_rate": 3.273342020256431e-08, "logits/chosen": -2.092241048812866, "logits/rejected": -2.281005859375, "logps/chosen": -1.4728577136993408, "logps/rejected": -6.197237491607666, "loss": 0.5678, "rewards/accuracies": 1.0, "rewards/chosen": 0.7992164492607117, "rewards/margins": 0.26857423782348633, "rewards/rejected": 0.5306422114372253, "step": 3470 }, { "epoch": 1.87, "learning_rate": 3.270609853305945e-08, "logits/chosen": -2.0503041744232178, "logits/rejected": -2.204618215560913, "logps/chosen": -1.1524635553359985, "logps/rejected": -2.1154277324676514, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 1.0281956195831299, "rewards/margins": -0.015903949737548828, "rewards/rejected": 1.0440995693206787, "step": 3471 }, { "epoch": 1.87, "learning_rate": 3.2678782727768664e-08, "logits/chosen": -2.018115758895874, "logits/rejected": -2.0101475715637207, "logps/chosen": -2.002943992614746, "logps/rejected": -3.0252647399902344, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 1.1693006753921509, "rewards/margins": 0.4329008460044861, "rewards/rejected": 0.7363998293876648, "step": 3472 }, { "epoch": 1.87, "learning_rate": 3.26514727959545e-08, "logits/chosen": -2.1239712238311768, "logits/rejected": -2.319965362548828, "logps/chosen": -1.1639195680618286, "logps/rejected": -1.1565006971359253, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.8850809335708618, "rewards/margins": -0.00854414701461792, "rewards/rejected": 0.8936250805854797, "step": 3473 }, { "epoch": 1.87, "learning_rate": 3.2624168746877524e-08, "logits/chosen": -2.067519187927246, "logits/rejected": -2.067253589630127, "logps/chosen": -2.5945661067962646, "logps/rejected": -3.672128915786743, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.4511679410934448, "rewards/margins": 0.8602036833763123, "rewards/rejected": 0.5909642577171326, "step": 3474 }, { "epoch": 1.87, "learning_rate": 3.2596870589796296e-08, "logits/chosen": -1.9950597286224365, "logits/rejected": -2.0041415691375732, "logps/chosen": -7.056727886199951, "logps/rejected": -1.5124192237854004, "loss": 0.7024, "rewards/accuracies": 0.0, "rewards/chosen": 1.1274948120117188, "rewards/margins": -0.018391013145446777, "rewards/rejected": 1.1458858251571655, "step": 3475 }, { "epoch": 1.87, "learning_rate": 3.256957833396738e-08, "logits/chosen": -2.009779453277588, "logits/rejected": -2.0065195560455322, "logps/chosen": -7.490981578826904, "logps/rejected": -3.841728687286377, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 1.4211355447769165, "rewards/margins": 0.950973391532898, "rewards/rejected": 0.47016215324401855, "step": 3476 }, { "epoch": 1.88, "learning_rate": 3.254229198864538e-08, "logits/chosen": -2.043048620223999, "logits/rejected": -2.2770488262176514, "logps/chosen": -0.2880622446537018, "logps/rejected": -0.2881026864051819, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.83240807056427, "rewards/margins": 0.024171650409698486, "rewards/rejected": 0.8082364201545715, "step": 3477 }, { "epoch": 1.88, "learning_rate": 3.251501156308285e-08, "logits/chosen": -2.130410671234131, "logits/rejected": -2.3332977294921875, "logps/chosen": -0.8926515579223633, "logps/rejected": -0.8745628595352173, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 1.019753336906433, "rewards/margins": 0.015796184539794922, "rewards/rejected": 1.0039571523666382, "step": 3478 }, { "epoch": 1.88, "learning_rate": 3.248773706653034e-08, "logits/chosen": -2.1881420612335205, "logits/rejected": -2.18888258934021, "logps/chosen": -0.34884876012802124, "logps/rejected": -3.6961164474487305, "loss": 0.472, "rewards/accuracies": 1.0, "rewards/chosen": 0.9356102347373962, "rewards/margins": 0.5056383609771729, "rewards/rejected": 0.4299719035625458, "step": 3479 }, { "epoch": 1.88, "learning_rate": 3.2460468508236396e-08, "logits/chosen": -2.1669368743896484, "logits/rejected": -2.135523796081543, "logps/chosen": -21.12596321105957, "logps/rejected": -4.248425483703613, "loss": 0.2574, "rewards/accuracies": 1.0, "rewards/chosen": 1.770965814590454, "rewards/margins": 1.2255784273147583, "rewards/rejected": 0.5453873872756958, "step": 3480 }, { "epoch": 1.88, "learning_rate": 3.243320589744756e-08, "logits/chosen": -1.9716787338256836, "logits/rejected": -2.2246577739715576, "logps/chosen": -0.4507213234901428, "logps/rejected": -0.4253513216972351, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.8767433166503906, "rewards/margins": 0.026394546031951904, "rewards/rejected": 0.8503487706184387, "step": 3481 }, { "epoch": 1.88, "learning_rate": 3.240594924340835e-08, "logits/chosen": -2.0597455501556396, "logits/rejected": -2.0573244094848633, "logps/chosen": -0.5930245518684387, "logps/rejected": -1.6800637245178223, "loss": 0.5709, "rewards/accuracies": 1.0, "rewards/chosen": 0.9894365668296814, "rewards/margins": 0.26155734062194824, "rewards/rejected": 0.7278792262077332, "step": 3482 }, { "epoch": 1.88, "learning_rate": 3.237869855536126e-08, "logits/chosen": -2.135995864868164, "logits/rejected": -2.133077621459961, "logps/chosen": -4.7529706954956055, "logps/rejected": -5.390212059020996, "loss": 0.5114, "rewards/accuracies": 1.0, "rewards/chosen": 1.0536221265792847, "rewards/margins": 0.4039543867111206, "rewards/rejected": 0.6496677398681641, "step": 3483 }, { "epoch": 1.88, "learning_rate": 3.235145384254677e-08, "logits/chosen": -2.179685354232788, "logits/rejected": -2.297909736633301, "logps/chosen": -10.759349822998047, "logps/rejected": -13.216675758361816, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 1.0651054382324219, "rewards/margins": -0.0175856351852417, "rewards/rejected": 1.0826910734176636, "step": 3484 }, { "epoch": 1.88, "learning_rate": 3.2324215114203326e-08, "logits/chosen": -2.1240415573120117, "logits/rejected": -2.3501570224761963, "logps/chosen": -0.7160466909408569, "logps/rejected": -10.395898818969727, "loss": 0.6678, "rewards/accuracies": 1.0, "rewards/chosen": 1.0462783575057983, "rewards/margins": 0.05129885673522949, "rewards/rejected": 0.9949795007705688, "step": 3485 }, { "epoch": 1.88, "learning_rate": 3.229698237956733e-08, "logits/chosen": -2.0339784622192383, "logits/rejected": -2.290889263153076, "logps/chosen": -0.48527729511260986, "logps/rejected": -0.503759503364563, "loss": 0.666, "rewards/accuracies": 1.0, "rewards/chosen": 0.9882671236991882, "rewards/margins": 0.05508202314376831, "rewards/rejected": 0.9331851005554199, "step": 3486 }, { "epoch": 1.88, "learning_rate": 3.2269755647873216e-08, "logits/chosen": -2.0082573890686035, "logits/rejected": -2.0100228786468506, "logps/chosen": -0.9112151861190796, "logps/rejected": -6.101083755493164, "loss": 0.4897, "rewards/accuracies": 1.0, "rewards/chosen": 1.0785531997680664, "rewards/margins": 0.45905572175979614, "rewards/rejected": 0.6194974780082703, "step": 3487 }, { "epoch": 1.88, "learning_rate": 3.22425349283533e-08, "logits/chosen": -2.2180368900299072, "logits/rejected": -2.318863868713379, "logps/chosen": -0.32229381799697876, "logps/rejected": -0.3193851709365845, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.9910592436790466, "rewards/margins": 0.0077277421951293945, "rewards/rejected": 0.9833315014839172, "step": 3488 }, { "epoch": 1.88, "learning_rate": 3.2215320230237905e-08, "logits/chosen": -2.0621843338012695, "logits/rejected": -2.25750732421875, "logps/chosen": -0.6334770917892456, "logps/rejected": -0.5894278883934021, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9182877540588379, "rewards/margins": 0.005078494548797607, "rewards/rejected": 0.9132092595100403, "step": 3489 }, { "epoch": 1.88, "learning_rate": 3.218811156275529e-08, "logits/chosen": -2.128621816635132, "logits/rejected": -2.1327195167541504, "logps/chosen": -4.577901363372803, "logps/rejected": -0.39506083726882935, "loss": 0.5818, "rewards/accuracies": 1.0, "rewards/chosen": 1.2103960514068604, "rewards/margins": 0.23669153451919556, "rewards/rejected": 0.9737045168876648, "step": 3490 }, { "epoch": 1.88, "learning_rate": 3.216090893513171e-08, "logits/chosen": -2.077078342437744, "logits/rejected": -2.07745623588562, "logps/chosen": -0.4469485282897949, "logps/rejected": -3.638148069381714, "loss": 0.4887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9835428595542908, "rewards/margins": 0.46170687675476074, "rewards/rejected": 0.52183598279953, "step": 3491 }, { "epoch": 1.88, "learning_rate": 3.213371235659131e-08, "logits/chosen": -2.1161301136016846, "logits/rejected": -1.997004747390747, "logps/chosen": -15.590422630310059, "logps/rejected": -10.582396507263184, "loss": 0.377, "rewards/accuracies": 1.0, "rewards/chosen": 1.6838337182998657, "rewards/margins": 0.7811991572380066, "rewards/rejected": 0.9026345610618591, "step": 3492 }, { "epoch": 1.88, "learning_rate": 3.210652183635625e-08, "logits/chosen": -2.0254011154174805, "logits/rejected": -2.028499126434326, "logps/chosen": -6.234065055847168, "logps/rejected": -12.451939582824707, "loss": 0.3807, "rewards/accuracies": 1.0, "rewards/chosen": 1.6523849964141846, "rewards/margins": 0.7693730592727661, "rewards/rejected": 0.8830119371414185, "step": 3493 }, { "epoch": 1.88, "learning_rate": 3.2079337383646564e-08, "logits/chosen": -2.1473233699798584, "logits/rejected": -2.365267753601074, "logps/chosen": -0.8924896121025085, "logps/rejected": -5.877378940582275, "loss": 0.6136, "rewards/accuracies": 1.0, "rewards/chosen": 0.9239799380302429, "rewards/margins": 0.16591906547546387, "rewards/rejected": 0.758060872554779, "step": 3494 }, { "epoch": 1.89, "learning_rate": 3.205215900768029e-08, "logits/chosen": -2.063417434692383, "logits/rejected": -2.323392391204834, "logps/chosen": -1.6680998802185059, "logps/rejected": -1.7402435541152954, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 1.014780879020691, "rewards/margins": 0.010023951530456543, "rewards/rejected": 1.0047569274902344, "step": 3495 }, { "epoch": 1.89, "learning_rate": 3.202498671767339e-08, "logits/chosen": -2.0758328437805176, "logits/rejected": -2.090820550918579, "logps/chosen": -1.5011003017425537, "logps/rejected": -3.3700668811798096, "loss": 0.3949, "rewards/accuracies": 1.0, "rewards/chosen": 1.536117434501648, "rewards/margins": 0.7250787615776062, "rewards/rejected": 0.8110386729240417, "step": 3496 }, { "epoch": 1.89, "learning_rate": 3.1997820522839744e-08, "logits/chosen": -2.2917444705963135, "logits/rejected": -2.392711639404297, "logps/chosen": -8.738609313964844, "logps/rejected": -13.484533309936523, "loss": 0.6291, "rewards/accuracies": 1.0, "rewards/chosen": 1.1059932708740234, "rewards/margins": 0.1325216293334961, "rewards/rejected": 0.9734716415405273, "step": 3497 }, { "epoch": 1.89, "learning_rate": 3.197066043239118e-08, "logits/chosen": -1.9765158891677856, "logits/rejected": -2.2477457523345947, "logps/chosen": -0.2455112338066101, "logps/rejected": -0.27023184299468994, "loss": 0.6786, "rewards/accuracies": 1.0, "rewards/chosen": 0.9364663362503052, "rewards/margins": 0.029345989227294922, "rewards/rejected": 0.9071203470230103, "step": 3498 }, { "epoch": 1.89, "learning_rate": 3.194350645553746e-08, "logits/chosen": -2.186386823654175, "logits/rejected": -2.086392879486084, "logps/chosen": -27.13656997680664, "logps/rejected": -3.1005876064300537, "loss": 0.191, "rewards/accuracies": 1.0, "rewards/chosen": 2.067659378051758, "rewards/margins": 1.558345079421997, "rewards/rejected": 0.5093143582344055, "step": 3499 }, { "epoch": 1.89, "learning_rate": 3.191635860148624e-08, "logits/chosen": -1.9828497171401978, "logits/rejected": -2.308039903640747, "logps/chosen": -0.486505925655365, "logps/rejected": -0.5075981616973877, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.9310488700866699, "rewards/margins": 0.029932081699371338, "rewards/rejected": 0.9011167883872986, "step": 3500 }, { "epoch": 1.89, "learning_rate": 3.188921687944316e-08, "logits/chosen": -2.16139554977417, "logits/rejected": -2.288104295730591, "logps/chosen": -20.379520416259766, "logps/rejected": -2.0748238563537598, "loss": 0.7235, "rewards/accuracies": 0.0, "rewards/chosen": 0.9413108825683594, "rewards/margins": -0.059758543968200684, "rewards/rejected": 1.00106942653656, "step": 3501 }, { "epoch": 1.89, "learning_rate": 3.186208129861172e-08, "logits/chosen": -2.1728782653808594, "logits/rejected": -2.1710379123687744, "logps/chosen": -7.334141731262207, "logps/rejected": -3.136016607284546, "loss": 0.2612, "rewards/accuracies": 1.0, "rewards/chosen": 1.7599483728408813, "rewards/margins": 1.2090625762939453, "rewards/rejected": 0.550885796546936, "step": 3502 }, { "epoch": 1.89, "learning_rate": 3.183495186819337e-08, "logits/chosen": -2.056398391723633, "logits/rejected": -2.1444525718688965, "logps/chosen": -3.1333069801330566, "logps/rejected": -19.098827362060547, "loss": 0.4136, "rewards/accuracies": 1.0, "rewards/chosen": 1.3093969821929932, "rewards/margins": 0.6690565347671509, "rewards/rejected": 0.6403404474258423, "step": 3503 }, { "epoch": 1.89, "learning_rate": 3.1807828597387463e-08, "logits/chosen": -2.152721643447876, "logits/rejected": -2.1514811515808105, "logps/chosen": -4.201544284820557, "logps/rejected": -4.267170429229736, "loss": 0.2547, "rewards/accuracies": 1.0, "rewards/chosen": 1.6788147687911987, "rewards/margins": 1.2377723455429077, "rewards/rejected": 0.44104239344596863, "step": 3504 }, { "epoch": 1.89, "learning_rate": 3.1780711495391265e-08, "logits/chosen": -2.044217109680176, "logits/rejected": -2.026184320449829, "logps/chosen": -14.520023345947266, "logps/rejected": -4.9137468338012695, "loss": 0.4502, "rewards/accuracies": 1.0, "rewards/chosen": 1.6749683618545532, "rewards/margins": 0.5644783973693848, "rewards/rejected": 1.1104899644851685, "step": 3505 }, { "epoch": 1.89, "learning_rate": 3.1753600571399975e-08, "logits/chosen": -2.074127674102783, "logits/rejected": -2.0999345779418945, "logps/chosen": -24.590822219848633, "logps/rejected": -8.447250366210938, "loss": 0.3624, "rewards/accuracies": 1.0, "rewards/chosen": 1.700972557067871, "rewards/margins": 0.8283987045288086, "rewards/rejected": 0.8725738525390625, "step": 3506 }, { "epoch": 1.89, "learning_rate": 3.172649583460667e-08, "logits/chosen": -2.1470630168914795, "logits/rejected": -2.192713975906372, "logps/chosen": -5.642940998077393, "logps/rejected": -23.34864616394043, "loss": 0.4004, "rewards/accuracies": 1.0, "rewards/chosen": 1.3166967630386353, "rewards/margins": 0.7084934115409851, "rewards/rejected": 0.6082033514976501, "step": 3507 }, { "epoch": 1.89, "learning_rate": 3.1699397294202325e-08, "logits/chosen": -2.05356764793396, "logits/rejected": -2.3396360874176025, "logps/chosen": -0.6709742546081543, "logps/rejected": -0.7954310774803162, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 1.0433558225631714, "rewards/margins": 0.03621518611907959, "rewards/rejected": 1.0071406364440918, "step": 3508 }, { "epoch": 1.89, "learning_rate": 3.167230495937581e-08, "logits/chosen": -2.2127909660339355, "logits/rejected": -2.214979648590088, "logps/chosen": -0.5450482368469238, "logps/rejected": -3.0504114627838135, "loss": 0.4838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9836332201957703, "rewards/margins": 0.4743257164955139, "rewards/rejected": 0.5093075037002563, "step": 3509 }, { "epoch": 1.89, "learning_rate": 3.1645218839313936e-08, "logits/chosen": -2.1117286682128906, "logits/rejected": -2.107706069946289, "logps/chosen": -2.9574906826019287, "logps/rejected": -2.017061710357666, "loss": 0.3428, "rewards/accuracies": 1.0, "rewards/chosen": 1.5550626516342163, "rewards/margins": 0.8944134712219238, "rewards/rejected": 0.6606491804122925, "step": 3510 }, { "epoch": 1.89, "learning_rate": 3.161813894320136e-08, "logits/chosen": -2.0781803131103516, "logits/rejected": -2.0844650268554688, "logps/chosen": -1.8631919622421265, "logps/rejected": -3.279097557067871, "loss": 0.442, "rewards/accuracies": 1.0, "rewards/chosen": 1.115466594696045, "rewards/margins": 0.5871990919113159, "rewards/rejected": 0.528267502784729, "step": 3511 }, { "epoch": 1.89, "learning_rate": 3.159106528022063e-08, "logits/chosen": -2.1042375564575195, "logits/rejected": -2.1184191703796387, "logps/chosen": -3.9132206439971924, "logps/rejected": -4.6153459548950195, "loss": 0.2813, "rewards/accuracies": 1.0, "rewards/chosen": 1.8368381261825562, "rewards/margins": 1.1244697570800781, "rewards/rejected": 0.7123683094978333, "step": 3512 }, { "epoch": 1.89, "learning_rate": 3.1563997859552206e-08, "logits/chosen": -1.9673924446105957, "logits/rejected": -2.2516119480133057, "logps/chosen": -1.716133952140808, "logps/rejected": -1.455464482307434, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.7382499575614929, "rewards/margins": -0.007995367050170898, "rewards/rejected": 0.7462453246116638, "step": 3513 }, { "epoch": 1.9, "learning_rate": 3.1536936690374415e-08, "logits/chosen": -2.1092629432678223, "logits/rejected": -2.179079532623291, "logps/chosen": -0.3790515661239624, "logps/rejected": -29.88899040222168, "loss": 0.2144, "rewards/accuracies": 1.0, "rewards/chosen": 0.9622854590415955, "rewards/margins": 1.430545687675476, "rewards/rejected": -0.4682601988315582, "step": 3514 }, { "epoch": 1.9, "learning_rate": 3.1509881781863464e-08, "logits/chosen": -2.0542221069335938, "logits/rejected": -2.045762300491333, "logps/chosen": -5.374970436096191, "logps/rejected": -1.867428183555603, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": 1.6157753467559814, "rewards/margins": 0.664089024066925, "rewards/rejected": 0.9516863226890564, "step": 3515 }, { "epoch": 1.9, "learning_rate": 3.148283314319346e-08, "logits/chosen": -2.0438613891601562, "logits/rejected": -2.0381858348846436, "logps/chosen": -3.633065700531006, "logps/rejected": -3.9369585514068604, "loss": 0.3992, "rewards/accuracies": 1.0, "rewards/chosen": 1.3571926355361938, "rewards/margins": 0.7119714617729187, "rewards/rejected": 0.6452211737632751, "step": 3516 }, { "epoch": 1.9, "learning_rate": 3.145579078353635e-08, "logits/chosen": -2.064304828643799, "logits/rejected": -2.2898170948028564, "logps/chosen": -5.777551174163818, "logps/rejected": -1.5874989032745361, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 1.0136985778808594, "rewards/margins": -0.030753374099731445, "rewards/rejected": 1.0444519519805908, "step": 3517 }, { "epoch": 1.9, "learning_rate": 3.142875471206198e-08, "logits/chosen": -1.9638289213180542, "logits/rejected": -1.9619642496109009, "logps/chosen": -0.6054873466491699, "logps/rejected": -2.3827507495880127, "loss": 0.6117, "rewards/accuracies": 1.0, "rewards/chosen": 0.971893310546875, "rewards/margins": 0.1701107621192932, "rewards/rejected": 0.8017825484275818, "step": 3518 }, { "epoch": 1.9, "learning_rate": 3.140172493793802e-08, "logits/chosen": -2.0169639587402344, "logits/rejected": -2.0122454166412354, "logps/chosen": -2.866884231567383, "logps/rejected": -3.9559333324432373, "loss": 0.6168, "rewards/accuracies": 1.0, "rewards/chosen": 1.0962880849838257, "rewards/margins": 0.15902602672576904, "rewards/rejected": 0.9372620582580566, "step": 3519 }, { "epoch": 1.9, "learning_rate": 3.137470147033007e-08, "logits/chosen": -2.0233824253082275, "logits/rejected": -2.016829013824463, "logps/chosen": -2.136868953704834, "logps/rejected": -4.70818567276001, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 1.3148339986801147, "rewards/margins": 0.8407777547836304, "rewards/rejected": 0.474056214094162, "step": 3520 }, { "epoch": 1.9, "learning_rate": 3.1347684318401534e-08, "logits/chosen": -2.1141517162323, "logits/rejected": -2.1172006130218506, "logps/chosen": -2.285963773727417, "logps/rejected": -6.820130348205566, "loss": 0.4041, "rewards/accuracies": 1.0, "rewards/chosen": 1.3936623334884644, "rewards/margins": 0.6973201632499695, "rewards/rejected": 0.6963421702384949, "step": 3521 }, { "epoch": 1.9, "learning_rate": 3.132067349131372e-08, "logits/chosen": -2.004448175430298, "logits/rejected": -2.2751100063323975, "logps/chosen": -0.8743727803230286, "logps/rejected": -0.964375913143158, "loss": 0.6777, "rewards/accuracies": 1.0, "rewards/chosen": 0.9032036066055298, "rewards/margins": 0.03113633394241333, "rewards/rejected": 0.8720672726631165, "step": 3522 }, { "epoch": 1.9, "learning_rate": 3.129366899822573e-08, "logits/chosen": -1.937970519065857, "logits/rejected": -1.9374051094055176, "logps/chosen": -0.5856261253356934, "logps/rejected": -1.8467457294464111, "loss": 0.6699, "rewards/accuracies": 1.0, "rewards/chosen": 0.8700487017631531, "rewards/margins": 0.047024309635162354, "rewards/rejected": 0.8230243921279907, "step": 3523 }, { "epoch": 1.9, "learning_rate": 3.1266670848294595e-08, "logits/chosen": -2.109879493713379, "logits/rejected": -2.109523057937622, "logps/chosen": -1.2356709241867065, "logps/rejected": -3.1336042881011963, "loss": 0.5262, "rewards/accuracies": 1.0, "rewards/chosen": 0.9807608723640442, "rewards/margins": 0.3675467371940613, "rewards/rejected": 0.6132141351699829, "step": 3524 }, { "epoch": 1.9, "learning_rate": 3.123967905067512e-08, "logits/chosen": -2.1089258193969727, "logits/rejected": -2.11637020111084, "logps/chosen": -4.125689506530762, "logps/rejected": -3.59816837310791, "loss": 0.4533, "rewards/accuracies": 1.0, "rewards/chosen": 1.1814532279968262, "rewards/margins": 0.5559870600700378, "rewards/rejected": 0.6254661679267883, "step": 3525 }, { "epoch": 1.9, "learning_rate": 3.1212693614520035e-08, "logits/chosen": -2.034820079803467, "logits/rejected": -2.0416247844696045, "logps/chosen": -1.7158594131469727, "logps/rejected": -2.6891090869903564, "loss": 0.4845, "rewards/accuracies": 1.0, "rewards/chosen": 1.0541913509368896, "rewards/margins": 0.4725053310394287, "rewards/rejected": 0.5816860198974609, "step": 3526 }, { "epoch": 1.9, "learning_rate": 3.118571454897986e-08, "logits/chosen": -2.1596250534057617, "logits/rejected": -2.125932455062866, "logps/chosen": -4.816546440124512, "logps/rejected": -7.721691131591797, "loss": 0.29, "rewards/accuracies": 1.0, "rewards/chosen": 1.3124874830245972, "rewards/margins": 1.0893393754959106, "rewards/rejected": 0.22314806282520294, "step": 3527 }, { "epoch": 1.9, "learning_rate": 3.1158741863202933e-08, "logits/chosen": -2.0083110332489014, "logits/rejected": -2.369553565979004, "logps/chosen": -8.247187614440918, "logps/rejected": -14.8197660446167, "loss": 0.9025, "rewards/accuracies": 0.0, "rewards/chosen": 0.8233155608177185, "rewards/margins": -0.3823409676551819, "rewards/rejected": 1.2056565284729004, "step": 3528 }, { "epoch": 1.9, "learning_rate": 3.1131775566335494e-08, "logits/chosen": -2.0051589012145996, "logits/rejected": -2.001915216445923, "logps/chosen": -4.218573093414307, "logps/rejected": -2.0177524089813232, "loss": 0.4489, "rewards/accuracies": 1.0, "rewards/chosen": 1.3801320791244507, "rewards/margins": 0.5681900382041931, "rewards/rejected": 0.8119420409202576, "step": 3529 }, { "epoch": 1.9, "learning_rate": 3.1104815667521576e-08, "logits/chosen": -2.101834774017334, "logits/rejected": -2.1002674102783203, "logps/chosen": -1.6706455945968628, "logps/rejected": -7.926133632659912, "loss": 0.4116, "rewards/accuracies": 1.0, "rewards/chosen": 0.9555989503860474, "rewards/margins": 0.674858570098877, "rewards/rejected": 0.2807404100894928, "step": 3530 }, { "epoch": 1.9, "learning_rate": 3.1077862175903045e-08, "logits/chosen": -2.156198263168335, "logits/rejected": -2.1522328853607178, "logps/chosen": -2.693108320236206, "logps/rejected": -5.274847507476807, "loss": 0.624, "rewards/accuracies": 1.0, "rewards/chosen": 0.9227330088615417, "rewards/margins": 0.1434037685394287, "rewards/rejected": 0.779329240322113, "step": 3531 }, { "epoch": 1.91, "learning_rate": 3.10509151006196e-08, "logits/chosen": -1.9850057363510132, "logits/rejected": -2.233628511428833, "logps/chosen": -0.5082158446311951, "logps/rejected": -0.5493023991584778, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9557979702949524, "rewards/margins": -0.0007422566413879395, "rewards/rejected": 0.9565402269363403, "step": 3532 }, { "epoch": 1.91, "learning_rate": 3.102397445080877e-08, "logits/chosen": -1.9983046054840088, "logits/rejected": -2.004538059234619, "logps/chosen": -3.353879928588867, "logps/rejected": -1.491260290145874, "loss": 0.6331, "rewards/accuracies": 1.0, "rewards/chosen": 1.0783777236938477, "rewards/margins": 0.12382698059082031, "rewards/rejected": 0.9545507431030273, "step": 3533 }, { "epoch": 1.91, "learning_rate": 3.099704023560587e-08, "logits/chosen": -2.0419068336486816, "logits/rejected": -2.2780404090881348, "logps/chosen": -0.9244354963302612, "logps/rejected": -1.1532976627349854, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.8374741673469543, "rewards/margins": 0.045113444328308105, "rewards/rejected": 0.7923607230186462, "step": 3534 }, { "epoch": 1.91, "learning_rate": 3.097011246414412e-08, "logits/chosen": -2.0253422260284424, "logits/rejected": -2.28930401802063, "logps/chosen": -0.8371789455413818, "logps/rejected": -0.7810792922973633, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 1.0463427305221558, "rewards/margins": 0.023737430572509766, "rewards/rejected": 1.022605299949646, "step": 3535 }, { "epoch": 1.91, "learning_rate": 3.0943191145554445e-08, "logits/chosen": -2.157203435897827, "logits/rejected": -2.0752551555633545, "logps/chosen": -36.17512893676758, "logps/rejected": -9.666025161743164, "loss": 0.2586, "rewards/accuracies": 1.0, "rewards/chosen": 2.123957872390747, "rewards/margins": 1.2202978134155273, "rewards/rejected": 0.903659999370575, "step": 3536 }, { "epoch": 1.91, "learning_rate": 3.091627628896566e-08, "logits/chosen": -2.044374942779541, "logits/rejected": -2.050403594970703, "logps/chosen": -1.5363496541976929, "logps/rejected": -5.002938270568848, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 1.1486634016036987, "rewards/margins": 0.5276737809181213, "rewards/rejected": 0.6209896206855774, "step": 3537 }, { "epoch": 1.91, "learning_rate": 3.088936790350435e-08, "logits/chosen": -1.9192229509353638, "logits/rejected": -2.326784133911133, "logps/chosen": -5.515368938446045, "logps/rejected": -4.3694586753845215, "loss": 0.6599, "rewards/accuracies": 1.0, "rewards/chosen": 0.8577442169189453, "rewards/margins": 0.06754106283187866, "rewards/rejected": 0.7902031540870667, "step": 3538 }, { "epoch": 1.91, "learning_rate": 3.086246599829493e-08, "logits/chosen": -2.009507656097412, "logits/rejected": -2.019423484802246, "logps/chosen": -1.6028379201889038, "logps/rejected": -2.7058351039886475, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 1.0664212703704834, "rewards/margins": 0.4019821286201477, "rewards/rejected": 0.6644391417503357, "step": 3539 }, { "epoch": 1.91, "learning_rate": 3.083557058245959e-08, "logits/chosen": -2.0794119834899902, "logits/rejected": -2.072575569152832, "logps/chosen": -5.929013252258301, "logps/rejected": -3.82607102394104, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.270129919052124, "rewards/margins": 0.6040574312210083, "rewards/rejected": 0.6660724878311157, "step": 3540 }, { "epoch": 1.91, "learning_rate": 3.080868166511835e-08, "logits/chosen": -2.039675235748291, "logits/rejected": -2.0353660583496094, "logps/chosen": -5.700226306915283, "logps/rejected": -3.71616268157959, "loss": 0.4389, "rewards/accuracies": 1.0, "rewards/chosen": 1.1441487073898315, "rewards/margins": 0.5959628820419312, "rewards/rejected": 0.5481858253479004, "step": 3541 }, { "epoch": 1.91, "learning_rate": 3.0781799255389006e-08, "logits/chosen": -2.157597780227661, "logits/rejected": -2.151561737060547, "logps/chosen": -3.5468766689300537, "logps/rejected": -7.471259117126465, "loss": 0.2843, "rewards/accuracies": 1.0, "rewards/chosen": 1.455566644668579, "rewards/margins": 1.1123623847961426, "rewards/rejected": 0.3432043194770813, "step": 3542 }, { "epoch": 1.91, "learning_rate": 3.075492336238715e-08, "logits/chosen": -1.9729119539260864, "logits/rejected": -1.985168695449829, "logps/chosen": -1.4844088554382324, "logps/rejected": -7.241377830505371, "loss": 0.3894, "rewards/accuracies": 1.0, "rewards/chosen": 1.3133569955825806, "rewards/margins": 0.7420137524604797, "rewards/rejected": 0.5713432431221008, "step": 3543 }, { "epoch": 1.91, "learning_rate": 3.072805399522616e-08, "logits/chosen": -2.0611438751220703, "logits/rejected": -2.070460081100464, "logps/chosen": -2.9338219165802, "logps/rejected": -1.1375832557678223, "loss": 0.4486, "rewards/accuracies": 1.0, "rewards/chosen": 1.3271164894104004, "rewards/margins": 0.5688572525978088, "rewards/rejected": 0.7582592368125916, "step": 3544 }, { "epoch": 1.91, "learning_rate": 3.070119116301724e-08, "logits/chosen": -1.9742999076843262, "logits/rejected": -1.9721360206604004, "logps/chosen": -1.966985821723938, "logps/rejected": -4.687946796417236, "loss": 0.4995, "rewards/accuracies": 1.0, "rewards/chosen": 1.2514399290084839, "rewards/margins": 0.43410956859588623, "rewards/rejected": 0.8173303604125977, "step": 3545 }, { "epoch": 1.91, "learning_rate": 3.067433487486932e-08, "logits/chosen": -2.073331832885742, "logits/rejected": -2.2605655193328857, "logps/chosen": -5.859260559082031, "logps/rejected": -1.2113311290740967, "loss": 0.7235, "rewards/accuracies": 0.0, "rewards/chosen": 0.6634430289268494, "rewards/margins": -0.05972546339035034, "rewards/rejected": 0.7231684923171997, "step": 3546 }, { "epoch": 1.91, "learning_rate": 3.064748513988914e-08, "logits/chosen": -2.0342636108398438, "logits/rejected": -2.2804880142211914, "logps/chosen": -0.7861614227294922, "logps/rejected": -0.698714554309845, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.8844498991966248, "rewards/margins": 0.013879239559173584, "rewards/rejected": 0.8705706596374512, "step": 3547 }, { "epoch": 1.91, "learning_rate": 3.0620641967181224e-08, "logits/chosen": -2.057384490966797, "logits/rejected": -2.051142930984497, "logps/chosen": -14.886859893798828, "logps/rejected": -9.18226146697998, "loss": 0.2833, "rewards/accuracies": 1.0, "rewards/chosen": 1.3891223669052124, "rewards/margins": 1.1161937713623047, "rewards/rejected": 0.2729286253452301, "step": 3548 }, { "epoch": 1.91, "learning_rate": 3.059380536584786e-08, "logits/chosen": -2.019150733947754, "logits/rejected": -2.0283892154693604, "logps/chosen": -1.741865634918213, "logps/rejected": -2.67199444770813, "loss": 0.4936, "rewards/accuracies": 1.0, "rewards/chosen": 1.0547608137130737, "rewards/margins": 0.44922584295272827, "rewards/rejected": 0.6055349707603455, "step": 3549 }, { "epoch": 1.91, "learning_rate": 3.056697534498911e-08, "logits/chosen": -2.189319610595703, "logits/rejected": -2.260484218597412, "logps/chosen": -1.514920949935913, "logps/rejected": -1.5025644302368164, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9731640815734863, "rewards/margins": 0.01351773738861084, "rewards/rejected": 0.9596463441848755, "step": 3550 }, { "epoch": 1.92, "learning_rate": 3.054015191370279e-08, "logits/chosen": -1.9585379362106323, "logits/rejected": -2.2521257400512695, "logps/chosen": -0.6928296685218811, "logps/rejected": -0.8629356026649475, "loss": 0.6776, "rewards/accuracies": 1.0, "rewards/chosen": 0.8669541478157043, "rewards/margins": 0.031282782554626465, "rewards/rejected": 0.8356713652610779, "step": 3551 }, { "epoch": 1.92, "learning_rate": 3.051333508108452e-08, "logits/chosen": -2.024207592010498, "logits/rejected": -2.0290706157684326, "logps/chosen": -1.3748904466629028, "logps/rejected": -2.946315288543701, "loss": 0.4893, "rewards/accuracies": 1.0, "rewards/chosen": 1.0841463804244995, "rewards/margins": 0.460147500038147, "rewards/rejected": 0.6239988803863525, "step": 3552 }, { "epoch": 1.92, "learning_rate": 3.048652485622765e-08, "logits/chosen": -1.9861392974853516, "logits/rejected": -2.2649741172790527, "logps/chosen": -0.6369085907936096, "logps/rejected": -0.7456703782081604, "loss": 0.676, "rewards/accuracies": 1.0, "rewards/chosen": 0.8906775712966919, "rewards/margins": 0.03455686569213867, "rewards/rejected": 0.8561207056045532, "step": 3553 }, { "epoch": 1.92, "learning_rate": 3.0459721248223266e-08, "logits/chosen": -2.0842390060424805, "logits/rejected": -2.3376693725585938, "logps/chosen": -2.3109183311462402, "logps/rejected": -1.9333101511001587, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0657325983047485, "rewards/margins": 0.025379538536071777, "rewards/rejected": 1.0403530597686768, "step": 3554 }, { "epoch": 1.92, "learning_rate": 3.043292426616029e-08, "logits/chosen": -1.987484335899353, "logits/rejected": -2.2794768810272217, "logps/chosen": -0.9154782295227051, "logps/rejected": -1.0418952703475952, "loss": 0.6674, "rewards/accuracies": 1.0, "rewards/chosen": 0.7654964923858643, "rewards/margins": 0.0522196888923645, "rewards/rejected": 0.7132768034934998, "step": 3555 }, { "epoch": 1.92, "learning_rate": 3.040613391912534e-08, "logits/chosen": -2.0376129150390625, "logits/rejected": -2.279433012008667, "logps/chosen": -0.466922402381897, "logps/rejected": -0.5601763129234314, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 1.0582647323608398, "rewards/margins": 0.014284133911132812, "rewards/rejected": 1.043980598449707, "step": 3556 }, { "epoch": 1.92, "learning_rate": 3.0379350216202774e-08, "logits/chosen": -2.0775935649871826, "logits/rejected": -2.078343152999878, "logps/chosen": -2.8000071048736572, "logps/rejected": -2.547997236251831, "loss": 0.359, "rewards/accuracies": 1.0, "rewards/chosen": 1.7559856176376343, "rewards/margins": 0.8395940661430359, "rewards/rejected": 0.9163915514945984, "step": 3557 }, { "epoch": 1.92, "learning_rate": 3.0352573166474724e-08, "logits/chosen": -2.095059394836426, "logits/rejected": -2.097240447998047, "logps/chosen": -1.2974759340286255, "logps/rejected": -5.463930606842041, "loss": 0.4268, "rewards/accuracies": 1.0, "rewards/chosen": 1.019901156425476, "rewards/margins": 0.6305164098739624, "rewards/rejected": 0.3893847167491913, "step": 3558 }, { "epoch": 1.92, "learning_rate": 3.032580277902107e-08, "logits/chosen": -2.0349936485290527, "logits/rejected": -2.0898637771606445, "logps/chosen": -2.8015999794006348, "logps/rejected": -23.828174591064453, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 1.1062816381454468, "rewards/margins": 0.8611927032470703, "rewards/rejected": 0.24508896470069885, "step": 3559 }, { "epoch": 1.92, "learning_rate": 3.0299039062919415e-08, "logits/chosen": -2.0827126502990723, "logits/rejected": -2.2928197383880615, "logps/chosen": -3.2412264347076416, "logps/rejected": -7.022361755371094, "loss": 0.7609, "rewards/accuracies": 0.0, "rewards/chosen": 1.039120078086853, "rewards/margins": -0.13111484050750732, "rewards/rejected": 1.1702349185943604, "step": 3560 }, { "epoch": 1.92, "learning_rate": 3.027228202724509e-08, "logits/chosen": -2.0432188510894775, "logits/rejected": -2.043865442276001, "logps/chosen": -3.8482253551483154, "logps/rejected": -4.270457744598389, "loss": 0.5086, "rewards/accuracies": 1.0, "rewards/chosen": 1.1048715114593506, "rewards/margins": 0.41100406646728516, "rewards/rejected": 0.6938674449920654, "step": 3561 }, { "epoch": 1.92, "learning_rate": 3.02455316810712e-08, "logits/chosen": -2.0584049224853516, "logits/rejected": -2.253042697906494, "logps/chosen": -0.3903074264526367, "logps/rejected": -0.3724327087402344, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8324152827262878, "rewards/margins": 0.025841116905212402, "rewards/rejected": 0.8065741658210754, "step": 3562 }, { "epoch": 1.92, "learning_rate": 3.021878803346855e-08, "logits/chosen": -1.9954631328582764, "logits/rejected": -2.0022385120391846, "logps/chosen": -1.6624925136566162, "logps/rejected": -3.0258448123931885, "loss": 0.4835, "rewards/accuracies": 1.0, "rewards/chosen": 1.0031094551086426, "rewards/margins": 0.4751320481300354, "rewards/rejected": 0.5279774069786072, "step": 3563 }, { "epoch": 1.92, "learning_rate": 3.019205109350566e-08, "logits/chosen": -2.074061393737793, "logits/rejected": -2.2864959239959717, "logps/chosen": -0.6684954166412354, "logps/rejected": -0.5918909907341003, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.9854670763015747, "rewards/margins": 0.0002314448356628418, "rewards/rejected": 0.9852356314659119, "step": 3564 }, { "epoch": 1.92, "learning_rate": 3.016532087024884e-08, "logits/chosen": -1.9789282083511353, "logits/rejected": -2.2867555618286133, "logps/chosen": -2.3756191730499268, "logps/rejected": -11.040096282958984, "loss": 0.6044, "rewards/accuracies": 1.0, "rewards/chosen": 0.9323854446411133, "rewards/margins": 0.18616217374801636, "rewards/rejected": 0.7462232708930969, "step": 3565 }, { "epoch": 1.92, "learning_rate": 3.013859737276206e-08, "logits/chosen": -2.0660486221313477, "logits/rejected": -2.252159357070923, "logps/chosen": -7.487537384033203, "logps/rejected": -9.209415435791016, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": 0.9381027221679688, "rewards/margins": 0.20172327756881714, "rewards/rejected": 0.7363794445991516, "step": 3566 }, { "epoch": 1.92, "learning_rate": 3.011188061010702e-08, "logits/chosen": -2.193124294281006, "logits/rejected": -2.199416160583496, "logps/chosen": -2.703681230545044, "logps/rejected": -5.355804920196533, "loss": 0.4523, "rewards/accuracies": 1.0, "rewards/chosen": 0.8516263961791992, "rewards/margins": 0.5586220026016235, "rewards/rejected": 0.2930043637752533, "step": 3567 }, { "epoch": 1.92, "learning_rate": 3.008517059134316e-08, "logits/chosen": -1.9918586015701294, "logits/rejected": -2.2523746490478516, "logps/chosen": -0.5970525145530701, "logps/rejected": -0.5897471308708191, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 1.0348118543624878, "rewards/margins": 0.03979372978210449, "rewards/rejected": 0.9950181245803833, "step": 3568 }, { "epoch": 1.93, "learning_rate": 3.0058467325527616e-08, "logits/chosen": -2.0858192443847656, "logits/rejected": -2.0751192569732666, "logps/chosen": -0.3762592077255249, "logps/rejected": -6.509397029876709, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.0135430097579956, "rewards/margins": 0.5765516757965088, "rewards/rejected": 0.4369913637638092, "step": 3569 }, { "epoch": 1.93, "learning_rate": 3.003177082171523e-08, "logits/chosen": -1.9841530323028564, "logits/rejected": -1.9643398523330688, "logps/chosen": -10.649141311645508, "logps/rejected": -3.1370725631713867, "loss": 0.5019, "rewards/accuracies": 1.0, "rewards/chosen": 1.4920618534088135, "rewards/margins": 0.42792463302612305, "rewards/rejected": 1.0641372203826904, "step": 3570 }, { "epoch": 1.93, "learning_rate": 3.000508108895856e-08, "logits/chosen": -2.1511924266815186, "logits/rejected": -2.155595064163208, "logps/chosen": -2.7308053970336914, "logps/rejected": -1.7348376512527466, "loss": 0.6456, "rewards/accuracies": 1.0, "rewards/chosen": 0.9773842096328735, "rewards/margins": 0.09737199544906616, "rewards/rejected": 0.8800122141838074, "step": 3571 }, { "epoch": 1.93, "learning_rate": 2.997839813630787e-08, "logits/chosen": -2.1692938804626465, "logits/rejected": -2.276334285736084, "logps/chosen": -1.5971068143844604, "logps/rejected": -1.9280411005020142, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 1.0425418615341187, "rewards/margins": 0.053001582622528076, "rewards/rejected": 0.9895402789115906, "step": 3572 }, { "epoch": 1.93, "learning_rate": 2.995172197281113e-08, "logits/chosen": -2.0607378482818604, "logits/rejected": -2.2057478427886963, "logps/chosen": -0.33822864294052124, "logps/rejected": -0.37492135167121887, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8197043538093567, "rewards/margins": 0.022384703159332275, "rewards/rejected": 0.7973196506500244, "step": 3573 }, { "epoch": 1.93, "learning_rate": 2.9925052607513955e-08, "logits/chosen": -2.0038347244262695, "logits/rejected": -2.006054401397705, "logps/chosen": -0.837578535079956, "logps/rejected": -3.2758736610412598, "loss": 0.5457, "rewards/accuracies": 1.0, "rewards/chosen": 0.972843587398529, "rewards/margins": 0.3205133080482483, "rewards/rejected": 0.6523302793502808, "step": 3574 }, { "epoch": 1.93, "learning_rate": 2.989839004945976e-08, "logits/chosen": -2.159130811691284, "logits/rejected": -2.1935839653015137, "logps/chosen": -0.6421750783920288, "logps/rejected": -8.245598793029785, "loss": 0.6685, "rewards/accuracies": 1.0, "rewards/chosen": 1.0111685991287231, "rewards/margins": 0.04992908239364624, "rewards/rejected": 0.9612395167350769, "step": 3575 }, { "epoch": 1.93, "learning_rate": 2.987173430768955e-08, "logits/chosen": -2.12267804145813, "logits/rejected": -2.12369966506958, "logps/chosen": -0.32307448983192444, "logps/rejected": -4.132213115692139, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 0.9784979224205017, "rewards/margins": 0.41119784116744995, "rewards/rejected": 0.5673000812530518, "step": 3576 }, { "epoch": 1.93, "learning_rate": 2.984508539124208e-08, "logits/chosen": -2.103450298309326, "logits/rejected": -2.2894554138183594, "logps/chosen": -0.4776609539985657, "logps/rejected": -0.453053742647171, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.9388917088508606, "rewards/margins": 0.0062667131423950195, "rewards/rejected": 0.9326249957084656, "step": 3577 }, { "epoch": 1.93, "learning_rate": 2.981844330915376e-08, "logits/chosen": -2.086256980895996, "logits/rejected": -2.378899097442627, "logps/chosen": -19.64139175415039, "logps/rejected": -17.109289169311523, "loss": 0.6087, "rewards/accuracies": 1.0, "rewards/chosen": 0.25941696763038635, "rewards/margins": 0.17674846947193146, "rewards/rejected": 0.0826684981584549, "step": 3578 }, { "epoch": 1.93, "learning_rate": 2.97918080704587e-08, "logits/chosen": -2.0594122409820557, "logits/rejected": -2.059656858444214, "logps/chosen": -0.20920702815055847, "logps/rejected": -5.167283058166504, "loss": 0.4742, "rewards/accuracies": 1.0, "rewards/chosen": 0.930634617805481, "rewards/margins": 0.499580979347229, "rewards/rejected": 0.43105363845825195, "step": 3579 }, { "epoch": 1.93, "learning_rate": 2.9765179684188656e-08, "logits/chosen": -2.130805492401123, "logits/rejected": -2.2873411178588867, "logps/chosen": -0.2739894390106201, "logps/rejected": -0.3200390338897705, "loss": 0.6968, "rewards/accuracies": 0.0, "rewards/chosen": 0.8098613023757935, "rewards/margins": -0.00732266902923584, "rewards/rejected": 0.8171839714050293, "step": 3580 }, { "epoch": 1.93, "learning_rate": 2.9738558159373124e-08, "logits/chosen": -2.0572681427001953, "logits/rejected": -2.0552353858947754, "logps/chosen": -2.3433640003204346, "logps/rejected": -5.151636600494385, "loss": 0.2889, "rewards/accuracies": 1.0, "rewards/chosen": 1.5194783210754395, "rewards/margins": 1.0937716960906982, "rewards/rejected": 0.4257066249847412, "step": 3581 }, { "epoch": 1.93, "learning_rate": 2.971194350503921e-08, "logits/chosen": -2.227262258529663, "logits/rejected": -2.1922175884246826, "logps/chosen": -24.74399757385254, "logps/rejected": -11.64062213897705, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": 1.8194020986557007, "rewards/margins": 0.7675554752349854, "rewards/rejected": 1.0518466234207153, "step": 3582 }, { "epoch": 1.93, "learning_rate": 2.9685335730211703e-08, "logits/chosen": -2.0481979846954346, "logits/rejected": -2.2418482303619385, "logps/chosen": -1.187540054321289, "logps/rejected": -1.1943514347076416, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.8728681802749634, "rewards/margins": 0.02425926923751831, "rewards/rejected": 0.8486089110374451, "step": 3583 }, { "epoch": 1.93, "learning_rate": 2.9658734843913112e-08, "logits/chosen": -2.191408634185791, "logits/rejected": -2.2006258964538574, "logps/chosen": -2.5375118255615234, "logps/rejected": -4.324170112609863, "loss": 0.3519, "rewards/accuracies": 1.0, "rewards/chosen": 1.358093500137329, "rewards/margins": 0.8634262084960938, "rewards/rejected": 0.49466726183891296, "step": 3584 }, { "epoch": 1.93, "learning_rate": 2.9632140855163546e-08, "logits/chosen": -2.073615074157715, "logits/rejected": -2.344304323196411, "logps/chosen": -0.6760812997817993, "logps/rejected": -0.7283097505569458, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 1.1482974290847778, "rewards/margins": 0.014615178108215332, "rewards/rejected": 1.1336822509765625, "step": 3585 }, { "epoch": 1.93, "learning_rate": 2.9605553772980794e-08, "logits/chosen": -2.0422205924987793, "logits/rejected": -2.2444560527801514, "logps/chosen": -0.3180318772792816, "logps/rejected": -4.839580059051514, "loss": 0.5574, "rewards/accuracies": 1.0, "rewards/chosen": 0.9463494420051575, "rewards/margins": 0.29278379678726196, "rewards/rejected": 0.6535656452178955, "step": 3586 }, { "epoch": 1.93, "learning_rate": 2.957897360638032e-08, "logits/chosen": -2.069122314453125, "logits/rejected": -2.0594546794891357, "logps/chosen": -4.496217727661133, "logps/rejected": -3.0327205657958984, "loss": 0.5207, "rewards/accuracies": 1.0, "rewards/chosen": 1.0982190370559692, "rewards/margins": 0.3808857202529907, "rewards/rejected": 0.7173333168029785, "step": 3587 }, { "epoch": 1.94, "learning_rate": 2.9552400364375234e-08, "logits/chosen": -2.022847890853882, "logits/rejected": -2.2508654594421387, "logps/chosen": -9.127025604248047, "logps/rejected": -5.3797173500061035, "loss": 0.7814, "rewards/accuracies": 0.0, "rewards/chosen": 0.8574722409248352, "rewards/margins": -0.1693984866142273, "rewards/rejected": 1.0268707275390625, "step": 3588 }, { "epoch": 1.94, "learning_rate": 2.9525834055976284e-08, "logits/chosen": -2.073312520980835, "logits/rejected": -2.0685484409332275, "logps/chosen": -3.5043587684631348, "logps/rejected": -2.8985586166381836, "loss": 0.5732, "rewards/accuracies": 1.0, "rewards/chosen": 1.0968914031982422, "rewards/margins": 0.25618046522140503, "rewards/rejected": 0.8407109379768372, "step": 3589 }, { "epoch": 1.94, "learning_rate": 2.9499274690191876e-08, "logits/chosen": -2.1304991245269775, "logits/rejected": -2.2961130142211914, "logps/chosen": -0.9350080490112305, "logps/rejected": -0.931200385093689, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.9624599814414978, "rewards/margins": 0.022717297077178955, "rewards/rejected": 0.9397426843643188, "step": 3590 }, { "epoch": 1.94, "learning_rate": 2.947272227602808e-08, "logits/chosen": -2.1176977157592773, "logits/rejected": -2.1321001052856445, "logps/chosen": -3.175405979156494, "logps/rejected": -5.668182849884033, "loss": 0.4492, "rewards/accuracies": 1.0, "rewards/chosen": 1.3558472394943237, "rewards/margins": 0.5673079490661621, "rewards/rejected": 0.7885392904281616, "step": 3591 }, { "epoch": 1.94, "learning_rate": 2.9446176822488577e-08, "logits/chosen": -2.133816719055176, "logits/rejected": -2.1365597248077393, "logps/chosen": -1.1028664112091064, "logps/rejected": -3.4765233993530273, "loss": 0.4771, "rewards/accuracies": 1.0, "rewards/chosen": 1.0038113594055176, "rewards/margins": 0.4920511245727539, "rewards/rejected": 0.5117602348327637, "step": 3592 }, { "epoch": 1.94, "learning_rate": 2.941963833857469e-08, "logits/chosen": -2.036787986755371, "logits/rejected": -2.0356838703155518, "logps/chosen": -6.910058975219727, "logps/rejected": -5.4908318519592285, "loss": 0.4785, "rewards/accuracies": 1.0, "rewards/chosen": 1.0293172597885132, "rewards/margins": 0.48827898502349854, "rewards/rejected": 0.5410382747650146, "step": 3593 }, { "epoch": 1.94, "learning_rate": 2.9393106833285432e-08, "logits/chosen": -2.1718432903289795, "logits/rejected": -2.125678300857544, "logps/chosen": -17.532257080078125, "logps/rejected": -2.8875863552093506, "loss": 0.3327, "rewards/accuracies": 1.0, "rewards/chosen": 1.5761291980743408, "rewards/margins": 0.929460346698761, "rewards/rejected": 0.6466688513755798, "step": 3594 }, { "epoch": 1.94, "learning_rate": 2.9366582315617383e-08, "logits/chosen": -2.1904327869415283, "logits/rejected": -2.1885952949523926, "logps/chosen": -2.3084678649902344, "logps/rejected": -3.3513357639312744, "loss": 0.5272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0991514921188354, "rewards/margins": 0.36514222621917725, "rewards/rejected": 0.7340092658996582, "step": 3595 }, { "epoch": 1.94, "learning_rate": 2.934006479456479e-08, "logits/chosen": -2.034003734588623, "logits/rejected": -2.051246404647827, "logps/chosen": -1.654564619064331, "logps/rejected": -11.248796463012695, "loss": 0.5225, "rewards/accuracies": 1.0, "rewards/chosen": 1.2062691450119019, "rewards/margins": 0.3764623999595642, "rewards/rejected": 0.8298067450523376, "step": 3596 }, { "epoch": 1.94, "learning_rate": 2.9313554279119505e-08, "logits/chosen": -2.0547523498535156, "logits/rejected": -2.2686548233032227, "logps/chosen": -1.0708688497543335, "logps/rejected": -1.3512094020843506, "loss": 0.7275, "rewards/accuracies": 0.0, "rewards/chosen": 0.8777806162834167, "rewards/margins": -0.0675961971282959, "rewards/rejected": 0.9453768134117126, "step": 3597 }, { "epoch": 1.94, "learning_rate": 2.928705077827104e-08, "logits/chosen": -2.2010209560394287, "logits/rejected": -2.229353427886963, "logps/chosen": -0.7326988577842712, "logps/rejected": -9.138251304626465, "loss": 0.5372, "rewards/accuracies": 1.0, "rewards/chosen": 0.931922435760498, "rewards/margins": 0.340850830078125, "rewards/rejected": 0.591071605682373, "step": 3598 }, { "epoch": 1.94, "learning_rate": 2.926055430100647e-08, "logits/chosen": -1.9736545085906982, "logits/rejected": -2.23286771774292, "logps/chosen": -0.5321721434593201, "logps/rejected": -0.5196971297264099, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464436769485474, "rewards/margins": 0.02753084897994995, "rewards/rejected": 0.9189128279685974, "step": 3599 }, { "epoch": 1.94, "learning_rate": 2.923406485631055e-08, "logits/chosen": -1.9843600988388062, "logits/rejected": -1.9865766763687134, "logps/chosen": -1.1218184232711792, "logps/rejected": -2.3492398262023926, "loss": 0.5662, "rewards/accuracies": 1.0, "rewards/chosen": 0.8203849792480469, "rewards/margins": 0.2724161744117737, "rewards/rejected": 0.5479688048362732, "step": 3600 }, { "epoch": 1.94, "learning_rate": 2.9207582453165626e-08, "logits/chosen": -2.123462677001953, "logits/rejected": -2.2532145977020264, "logps/chosen": -5.583244323730469, "logps/rejected": -6.5114030838012695, "loss": 0.5873, "rewards/accuracies": 1.0, "rewards/chosen": 1.013932466506958, "rewards/margins": 0.2243272066116333, "rewards/rejected": 0.7896052598953247, "step": 3601 }, { "epoch": 1.94, "learning_rate": 2.9181107100551638e-08, "logits/chosen": -2.0253121852874756, "logits/rejected": -2.0313284397125244, "logps/chosen": -0.9158537983894348, "logps/rejected": -5.4369940757751465, "loss": 0.43, "rewards/accuracies": 1.0, "rewards/chosen": 0.964560329914093, "rewards/margins": 0.6211721897125244, "rewards/rejected": 0.3433881402015686, "step": 3602 }, { "epoch": 1.94, "learning_rate": 2.9154638807446153e-08, "logits/chosen": -2.057150363922119, "logits/rejected": -2.234712600708008, "logps/chosen": -0.3550318777561188, "logps/rejected": -0.3195244073867798, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.8080763816833496, "rewards/margins": 0.017434895038604736, "rewards/rejected": 0.7906414866447449, "step": 3603 }, { "epoch": 1.94, "learning_rate": 2.9128177582824354e-08, "logits/chosen": -2.1159963607788086, "logits/rejected": -2.3135721683502197, "logps/chosen": -1.5013941526412964, "logps/rejected": -1.5317137241363525, "loss": 0.672, "rewards/accuracies": 1.0, "rewards/chosen": 0.9914893507957458, "rewards/margins": 0.042668282985687256, "rewards/rejected": 0.9488210678100586, "step": 3604 }, { "epoch": 1.94, "learning_rate": 2.910172343565903e-08, "logits/chosen": -2.081312417984009, "logits/rejected": -2.0842113494873047, "logps/chosen": -1.4436767101287842, "logps/rejected": -10.945287704467773, "loss": 0.4457, "rewards/accuracies": 1.0, "rewards/chosen": 1.0985502004623413, "rewards/margins": 0.5768927931785583, "rewards/rejected": 0.521657407283783, "step": 3605 }, { "epoch": 1.94, "learning_rate": 2.9075276374920522e-08, "logits/chosen": -2.0246145725250244, "logits/rejected": -2.252223253250122, "logps/chosen": -1.5072855949401855, "logps/rejected": -1.573909044265747, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.8819546103477478, "rewards/margins": -0.0008959770202636719, "rewards/rejected": 0.8828505873680115, "step": 3606 }, { "epoch": 1.95, "learning_rate": 2.904883640957681e-08, "logits/chosen": -2.0396344661712646, "logits/rejected": -2.232933282852173, "logps/chosen": -0.6289049386978149, "logps/rejected": -0.7191789746284485, "loss": 0.7021, "rewards/accuracies": 0.0, "rewards/chosen": 0.8067584037780762, "rewards/margins": -0.01782459020614624, "rewards/rejected": 0.8245829939842224, "step": 3607 }, { "epoch": 1.95, "learning_rate": 2.9022403548593494e-08, "logits/chosen": -2.0193042755126953, "logits/rejected": -2.011699914932251, "logps/chosen": -4.335826873779297, "logps/rejected": -4.830244064331055, "loss": 0.4236, "rewards/accuracies": 1.0, "rewards/chosen": 1.277730941772461, "rewards/margins": 0.639599621295929, "rewards/rejected": 0.638131320476532, "step": 3608 }, { "epoch": 1.95, "learning_rate": 2.899597780093368e-08, "logits/chosen": -2.0175626277923584, "logits/rejected": -2.0224831104278564, "logps/chosen": -1.7974708080291748, "logps/rejected": -3.5365521907806396, "loss": 0.5018, "rewards/accuracies": 1.0, "rewards/chosen": 0.9635117650032043, "rewards/margins": 0.4281572103500366, "rewards/rejected": 0.5353545546531677, "step": 3609 }, { "epoch": 1.95, "learning_rate": 2.896955917555817e-08, "logits/chosen": -2.036271810531616, "logits/rejected": -2.036790132522583, "logps/chosen": -0.5605340600013733, "logps/rejected": -4.264161586761475, "loss": 0.5348, "rewards/accuracies": 1.0, "rewards/chosen": 0.894988477230072, "rewards/margins": 0.3466029763221741, "rewards/rejected": 0.548385500907898, "step": 3610 }, { "epoch": 1.95, "learning_rate": 2.8943147681425228e-08, "logits/chosen": -2.0183539390563965, "logits/rejected": -2.0133800506591797, "logps/chosen": -6.049914360046387, "logps/rejected": -3.9999308586120605, "loss": 0.2952, "rewards/accuracies": 1.0, "rewards/chosen": 1.6481109857559204, "rewards/margins": 1.0688194036483765, "rewards/rejected": 0.579291582107544, "step": 3611 }, { "epoch": 1.95, "learning_rate": 2.89167433274908e-08, "logits/chosen": -2.085906505584717, "logits/rejected": -2.083686351776123, "logps/chosen": -0.20202156901359558, "logps/rejected": -5.975986003875732, "loss": 0.5272, "rewards/accuracies": 1.0, "rewards/chosen": 1.0372707843780518, "rewards/margins": 0.3651173710823059, "rewards/rejected": 0.6721534132957458, "step": 3612 }, { "epoch": 1.95, "learning_rate": 2.8890346122708386e-08, "logits/chosen": -2.2433197498321533, "logits/rejected": -2.127786159515381, "logps/chosen": -35.04032897949219, "logps/rejected": -4.932895660400391, "loss": 0.1589, "rewards/accuracies": 1.0, "rewards/chosen": 2.515174627304077, "rewards/margins": 1.7588145732879639, "rewards/rejected": 0.7563600540161133, "step": 3613 }, { "epoch": 1.95, "learning_rate": 2.8863956076029038e-08, "logits/chosen": -2.2082223892211914, "logits/rejected": -2.2102744579315186, "logps/chosen": -2.3789682388305664, "logps/rejected": -1.0626881122589111, "loss": 0.593, "rewards/accuracies": 1.0, "rewards/chosen": 0.9743191599845886, "rewards/margins": 0.2114121913909912, "rewards/rejected": 0.7629069685935974, "step": 3614 }, { "epoch": 1.95, "learning_rate": 2.883757319640142e-08, "logits/chosen": -2.1449220180511475, "logits/rejected": -2.1478333473205566, "logps/chosen": -7.502164840698242, "logps/rejected": -6.7582783699035645, "loss": 0.6003, "rewards/accuracies": 1.0, "rewards/chosen": 1.320420265197754, "rewards/margins": 0.1952420473098755, "rewards/rejected": 1.1251782178878784, "step": 3615 }, { "epoch": 1.95, "learning_rate": 2.88111974927717e-08, "logits/chosen": -2.0777716636657715, "logits/rejected": -2.2801713943481445, "logps/chosen": -1.1833521127700806, "logps/rejected": -1.2244713306427002, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.783659815788269, "rewards/margins": 0.002566516399383545, "rewards/rejected": 0.7810932993888855, "step": 3616 }, { "epoch": 1.95, "learning_rate": 2.8784828974083675e-08, "logits/chosen": -2.0227062702178955, "logits/rejected": -2.020209550857544, "logps/chosen": -7.306746482849121, "logps/rejected": -4.665562629699707, "loss": 0.3478, "rewards/accuracies": 1.0, "rewards/chosen": 1.3616899251937866, "rewards/margins": 0.8772354125976562, "rewards/rejected": 0.48445454239845276, "step": 3617 }, { "epoch": 1.95, "learning_rate": 2.87584676492787e-08, "logits/chosen": -2.1298515796661377, "logits/rejected": -2.1382498741149902, "logps/chosen": -2.6442697048187256, "logps/rejected": -1.5955055952072144, "loss": 0.5551, "rewards/accuracies": 1.0, "rewards/chosen": 1.3401873111724854, "rewards/margins": 0.29831182956695557, "rewards/rejected": 1.0418754816055298, "step": 3618 }, { "epoch": 1.95, "learning_rate": 2.8732113527295644e-08, "logits/chosen": -1.977157711982727, "logits/rejected": -1.9867886304855347, "logps/chosen": -3.0537679195404053, "logps/rejected": -4.246409893035889, "loss": 0.4309, "rewards/accuracies": 1.0, "rewards/chosen": 1.082253336906433, "rewards/margins": 0.6186962127685547, "rewards/rejected": 0.46355709433555603, "step": 3619 }, { "epoch": 1.95, "learning_rate": 2.870576661707097e-08, "logits/chosen": -2.089866876602173, "logits/rejected": -2.277108669281006, "logps/chosen": -0.8197374939918518, "logps/rejected": -0.8729617595672607, "loss": 0.6844, "rewards/accuracies": 1.0, "rewards/chosen": 1.0266894102096558, "rewards/margins": 0.017510414123535156, "rewards/rejected": 1.0091789960861206, "step": 3620 }, { "epoch": 1.95, "learning_rate": 2.8679426927538726e-08, "logits/chosen": -2.139693260192871, "logits/rejected": -2.12317156791687, "logps/chosen": -7.280028343200684, "logps/rejected": -4.05177640914917, "loss": 0.4238, "rewards/accuracies": 1.0, "rewards/chosen": 1.3165310621261597, "rewards/margins": 0.6392568945884705, "rewards/rejected": 0.6772741675376892, "step": 3621 }, { "epoch": 1.95, "learning_rate": 2.86530944676304e-08, "logits/chosen": -2.032785654067993, "logits/rejected": -2.036283016204834, "logps/chosen": -3.434603214263916, "logps/rejected": -3.939039945602417, "loss": 0.5104, "rewards/accuracies": 1.0, "rewards/chosen": 1.0259348154067993, "rewards/margins": 0.40642303228378296, "rewards/rejected": 0.6195117831230164, "step": 3622 }, { "epoch": 1.95, "learning_rate": 2.8626769246275195e-08, "logits/chosen": -2.0451531410217285, "logits/rejected": -2.2192704677581787, "logps/chosen": -0.6197260022163391, "logps/rejected": -0.6204188466072083, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.8406664729118347, "rewards/margins": 0.02908158302307129, "rewards/rejected": 0.8115848898887634, "step": 3623 }, { "epoch": 1.95, "learning_rate": 2.8600451272399707e-08, "logits/chosen": -2.2960875034332275, "logits/rejected": -2.1459169387817383, "logps/chosen": -30.59673309326172, "logps/rejected": -3.6449854373931885, "loss": 0.2172, "rewards/accuracies": 1.0, "rewards/chosen": 1.8316773176193237, "rewards/margins": 1.416300892829895, "rewards/rejected": 0.4153763949871063, "step": 3624 }, { "epoch": 1.96, "learning_rate": 2.8574140554928173e-08, "logits/chosen": -2.0599093437194824, "logits/rejected": -2.318118095397949, "logps/chosen": -0.4090796113014221, "logps/rejected": -1.4533308744430542, "loss": 0.6305, "rewards/accuracies": 1.0, "rewards/chosen": 0.8130481839179993, "rewards/margins": 0.1295129656791687, "rewards/rejected": 0.6835352182388306, "step": 3625 }, { "epoch": 1.96, "learning_rate": 2.854783710278229e-08, "logits/chosen": -2.046673536300659, "logits/rejected": -2.060289144515991, "logps/chosen": -1.6717913150787354, "logps/rejected": -7.073764324188232, "loss": 0.4451, "rewards/accuracies": 1.0, "rewards/chosen": 1.2486069202423096, "rewards/margins": 0.5787758827209473, "rewards/rejected": 0.6698310375213623, "step": 3626 }, { "epoch": 1.96, "learning_rate": 2.852154092488136e-08, "logits/chosen": -2.1027705669403076, "logits/rejected": -2.069465398788452, "logps/chosen": -16.898696899414062, "logps/rejected": -3.1055471897125244, "loss": 0.3799, "rewards/accuracies": 1.0, "rewards/chosen": 1.4562959671020508, "rewards/margins": 0.7720345258712769, "rewards/rejected": 0.6842614412307739, "step": 3627 }, { "epoch": 1.96, "learning_rate": 2.8495252030142212e-08, "logits/chosen": -2.068540334701538, "logits/rejected": -2.3223769664764404, "logps/chosen": -0.3421895205974579, "logps/rejected": -0.35977503657341003, "loss": 0.6955, "rewards/accuracies": 0.0, "rewards/chosen": 1.0729953050613403, "rewards/margins": -0.004601955413818359, "rewards/rejected": 1.0775972604751587, "step": 3628 }, { "epoch": 1.96, "learning_rate": 2.8468970427479134e-08, "logits/chosen": -2.3593647480010986, "logits/rejected": -2.3029096126556396, "logps/chosen": -21.88447380065918, "logps/rejected": -5.188277721405029, "loss": 0.1573, "rewards/accuracies": 1.0, "rewards/chosen": 2.2294442653656006, "rewards/margins": 1.7702420949935913, "rewards/rejected": 0.4592021405696869, "step": 3629 }, { "epoch": 1.96, "learning_rate": 2.8442696125804028e-08, "logits/chosen": -1.954776644706726, "logits/rejected": -1.9573345184326172, "logps/chosen": -2.6807708740234375, "logps/rejected": -0.6250441670417786, "loss": 0.6343, "rewards/accuracies": 1.0, "rewards/chosen": 1.1033776998519897, "rewards/margins": 0.12135100364685059, "rewards/rejected": 0.9820266962051392, "step": 3630 }, { "epoch": 1.96, "learning_rate": 2.8416429134026293e-08, "logits/chosen": -2.086850881576538, "logits/rejected": -2.0661394596099854, "logps/chosen": -8.60258674621582, "logps/rejected": -1.444000244140625, "loss": 0.3803, "rewards/accuracies": 1.0, "rewards/chosen": 1.6321382522583008, "rewards/margins": 0.7705473899841309, "rewards/rejected": 0.8615908622741699, "step": 3631 }, { "epoch": 1.96, "learning_rate": 2.8390169461052787e-08, "logits/chosen": -1.9255322217941284, "logits/rejected": -2.26271915435791, "logps/chosen": -0.39225226640701294, "logps/rejected": -0.35946500301361084, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9481205940246582, "rewards/margins": -2.300739288330078e-05, "rewards/rejected": 0.9481436014175415, "step": 3632 }, { "epoch": 1.96, "learning_rate": 2.8363917115788016e-08, "logits/chosen": -2.0861222743988037, "logits/rejected": -2.314056873321533, "logps/chosen": -0.37026575207710266, "logps/rejected": -0.31626877188682556, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.8724542856216431, "rewards/margins": 0.03916323184967041, "rewards/rejected": 0.8332910537719727, "step": 3633 }, { "epoch": 1.96, "learning_rate": 2.8337672107133875e-08, "logits/chosen": -2.2222695350646973, "logits/rejected": -2.0983035564422607, "logps/chosen": -21.869644165039062, "logps/rejected": -11.419761657714844, "loss": 0.2534, "rewards/accuracies": 1.0, "rewards/chosen": 1.7536476850509644, "rewards/margins": 1.243330955505371, "rewards/rejected": 0.5103166699409485, "step": 3634 }, { "epoch": 1.96, "learning_rate": 2.8311434443989856e-08, "logits/chosen": -2.06415057182312, "logits/rejected": -2.3270809650421143, "logps/chosen": -0.4858371615409851, "logps/rejected": -0.42988190054893494, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.8286491632461548, "rewards/margins": 0.014370143413543701, "rewards/rejected": 0.8142790198326111, "step": 3635 }, { "epoch": 1.96, "learning_rate": 2.828520413525288e-08, "logits/chosen": -2.044153928756714, "logits/rejected": -2.3010122776031494, "logps/chosen": -0.5415674448013306, "logps/rejected": -0.6134001612663269, "loss": 0.6931, "rewards/accuracies": 1.0, "rewards/chosen": 0.9422690272331238, "rewards/margins": 6.693601608276367e-05, "rewards/rejected": 0.942202091217041, "step": 3636 }, { "epoch": 1.96, "learning_rate": 2.8258981189817455e-08, "logits/chosen": -2.241445779800415, "logits/rejected": -2.239884614944458, "logps/chosen": -3.315744400024414, "logps/rejected": -4.883483409881592, "loss": 0.4876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8914491534233093, "rewards/margins": 0.4644917845726013, "rewards/rejected": 0.426957368850708, "step": 3637 }, { "epoch": 1.96, "learning_rate": 2.823276561657556e-08, "logits/chosen": -2.059476852416992, "logits/rejected": -2.255427598953247, "logps/chosen": -0.9679602384567261, "logps/rejected": -0.9662430286407471, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.9243094325065613, "rewards/margins": 0.006048381328582764, "rewards/rejected": 0.9182610511779785, "step": 3638 }, { "epoch": 1.96, "learning_rate": 2.8206557424416645e-08, "logits/chosen": -2.0887866020202637, "logits/rejected": -2.1163527965545654, "logps/chosen": -18.869945526123047, "logps/rejected": -14.457496643066406, "loss": 0.1332, "rewards/accuracies": 1.0, "rewards/chosen": 2.311617612838745, "rewards/margins": 1.9488410949707031, "rewards/rejected": 0.36277657747268677, "step": 3639 }, { "epoch": 1.96, "learning_rate": 2.8180356622227696e-08, "logits/chosen": -2.00730299949646, "logits/rejected": -2.0021188259124756, "logps/chosen": -3.3974902629852295, "logps/rejected": -3.419325590133667, "loss": 0.3531, "rewards/accuracies": 1.0, "rewards/chosen": 1.4727391004562378, "rewards/margins": 0.8591486811637878, "rewards/rejected": 0.61359041929245, "step": 3640 }, { "epoch": 1.96, "learning_rate": 2.8154163218893213e-08, "logits/chosen": -2.010033130645752, "logits/rejected": -2.002877712249756, "logps/chosen": -5.0525617599487305, "logps/rejected": -1.248314380645752, "loss": 0.484, "rewards/accuracies": 1.0, "rewards/chosen": 1.2198315858840942, "rewards/margins": 0.47381436824798584, "rewards/rejected": 0.7460172176361084, "step": 3641 }, { "epoch": 1.96, "learning_rate": 2.812797722329508e-08, "logits/chosen": -2.149559259414673, "logits/rejected": -2.286653518676758, "logps/chosen": -1.5881634950637817, "logps/rejected": -1.1183655261993408, "loss": 0.6483, "rewards/accuracies": 1.0, "rewards/chosen": 0.956808865070343, "rewards/margins": 0.0918768048286438, "rewards/rejected": 0.8649320602416992, "step": 3642 }, { "epoch": 1.96, "learning_rate": 2.8101798644312834e-08, "logits/chosen": -2.250744104385376, "logits/rejected": -2.3358547687530518, "logps/chosen": -1.137279987335205, "logps/rejected": -1.2091431617736816, "loss": 0.6957, "rewards/accuracies": 0.0, "rewards/chosen": 1.0414186716079712, "rewards/margins": -0.005136251449584961, "rewards/rejected": 1.0465549230575562, "step": 3643 }, { "epoch": 1.97, "learning_rate": 2.807562749082334e-08, "logits/chosen": -2.1771254539489746, "logits/rejected": -2.3156321048736572, "logps/chosen": -3.784910202026367, "logps/rejected": -10.475611686706543, "loss": 0.5193, "rewards/accuracies": 1.0, "rewards/chosen": 1.0019524097442627, "rewards/margins": 0.38434481620788574, "rewards/rejected": 0.617607593536377, "step": 3644 }, { "epoch": 1.97, "learning_rate": 2.8049463771701037e-08, "logits/chosen": -2.000608444213867, "logits/rejected": -2.0000076293945312, "logps/chosen": -0.9024187326431274, "logps/rejected": -2.743581533432007, "loss": 0.5344, "rewards/accuracies": 1.0, "rewards/chosen": 1.1356189250946045, "rewards/margins": 0.3476375937461853, "rewards/rejected": 0.7879813313484192, "step": 3645 }, { "epoch": 1.97, "learning_rate": 2.8023307495817837e-08, "logits/chosen": -2.074960708618164, "logits/rejected": -2.2640793323516846, "logps/chosen": -0.966468334197998, "logps/rejected": -1.5319416522979736, "loss": 0.7243, "rewards/accuracies": 0.0, "rewards/chosen": 0.8674097061157227, "rewards/margins": -0.06138736009597778, "rewards/rejected": 0.9287970662117004, "step": 3646 }, { "epoch": 1.97, "learning_rate": 2.799715867204307e-08, "logits/chosen": -2.0380477905273438, "logits/rejected": -2.031446695327759, "logps/chosen": -3.6879501342773438, "logps/rejected": -1.4100017547607422, "loss": 0.6495, "rewards/accuracies": 1.0, "rewards/chosen": 0.8445907831192017, "rewards/margins": 0.0893048644065857, "rewards/rejected": 0.755285918712616, "step": 3647 }, { "epoch": 1.97, "learning_rate": 2.7971017309243618e-08, "logits/chosen": -2.1286208629608154, "logits/rejected": -2.111733913421631, "logps/chosen": -1.4837114810943604, "logps/rejected": -8.66387939453125, "loss": 0.4402, "rewards/accuracies": 1.0, "rewards/chosen": 1.221184492111206, "rewards/margins": 0.5922850966453552, "rewards/rejected": 0.6288993954658508, "step": 3648 }, { "epoch": 1.97, "learning_rate": 2.794488341628376e-08, "logits/chosen": -2.2047736644744873, "logits/rejected": -2.0678985118865967, "logps/chosen": -44.41596603393555, "logps/rejected": -3.0207862854003906, "loss": 0.1382, "rewards/accuracies": 1.0, "rewards/chosen": 2.627119779586792, "rewards/margins": 1.9092764854431152, "rewards/rejected": 0.717843234539032, "step": 3649 }, { "epoch": 1.97, "learning_rate": 2.7918757002025284e-08, "logits/chosen": -1.9791293144226074, "logits/rejected": -1.9790244102478027, "logps/chosen": -1.9056475162506104, "logps/rejected": -2.718628168106079, "loss": 0.605, "rewards/accuracies": 1.0, "rewards/chosen": 1.088213324546814, "rewards/margins": 0.1847476363182068, "rewards/rejected": 0.9034656882286072, "step": 3650 }, { "epoch": 1.97, "learning_rate": 2.7892638075327458e-08, "logits/chosen": -2.013984441757202, "logits/rejected": -2.022829294204712, "logps/chosen": -8.021700859069824, "logps/rejected": -1.6406145095825195, "loss": 0.6125, "rewards/accuracies": 1.0, "rewards/chosen": 1.2410966157913208, "rewards/margins": 0.16832971572875977, "rewards/rejected": 1.072766900062561, "step": 3651 }, { "epoch": 1.97, "learning_rate": 2.7866526645046928e-08, "logits/chosen": -1.9823460578918457, "logits/rejected": -1.9849424362182617, "logps/chosen": -1.983939528465271, "logps/rejected": -3.5192792415618896, "loss": 0.5015, "rewards/accuracies": 1.0, "rewards/chosen": 0.9498414397239685, "rewards/margins": 0.4289253354072571, "rewards/rejected": 0.5209161043167114, "step": 3652 }, { "epoch": 1.97, "learning_rate": 2.7840422720037938e-08, "logits/chosen": -2.057429075241089, "logits/rejected": -2.0504796504974365, "logps/chosen": -3.185260772705078, "logps/rejected": -5.793909549713135, "loss": 0.249, "rewards/accuracies": 1.0, "rewards/chosen": 1.5385664701461792, "rewards/margins": 1.2633202075958252, "rewards/rejected": 0.27524620294570923, "step": 3653 }, { "epoch": 1.97, "learning_rate": 2.7814326309152042e-08, "logits/chosen": -2.1556389331817627, "logits/rejected": -2.14701771736145, "logps/chosen": -5.174092769622803, "logps/rejected": -4.467671871185303, "loss": 0.6454, "rewards/accuracies": 1.0, "rewards/chosen": 0.8725833892822266, "rewards/margins": 0.09791386127471924, "rewards/rejected": 0.7746695280075073, "step": 3654 }, { "epoch": 1.97, "learning_rate": 2.7788237421238335e-08, "logits/chosen": -2.109621524810791, "logits/rejected": -2.345721960067749, "logps/chosen": -2.013134241104126, "logps/rejected": -2.2138233184814453, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.695142924785614, "rewards/margins": 0.039148032665252686, "rewards/rejected": 0.6559948921203613, "step": 3655 }, { "epoch": 1.97, "learning_rate": 2.776215606514335e-08, "logits/chosen": -2.0565550327301025, "logits/rejected": -2.2808239459991455, "logps/chosen": -4.437200546264648, "logps/rejected": -0.6656914353370667, "loss": 0.8083, "rewards/accuracies": 0.0, "rewards/chosen": 0.4901615083217621, "rewards/margins": -0.21847692131996155, "rewards/rejected": 0.7086384296417236, "step": 3656 }, { "epoch": 1.97, "learning_rate": 2.773608224971101e-08, "logits/chosen": -2.1453957557678223, "logits/rejected": -2.324099540710449, "logps/chosen": -7.288125038146973, "logps/rejected": -10.989350318908691, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9395291209220886, "rewards/margins": 0.004678905010223389, "rewards/rejected": 0.9348502159118652, "step": 3657 }, { "epoch": 1.97, "learning_rate": 2.7710015983782776e-08, "logits/chosen": -2.127197742462158, "logits/rejected": -2.128899335861206, "logps/chosen": -0.666926920413971, "logps/rejected": -4.0612335205078125, "loss": 0.4527, "rewards/accuracies": 1.0, "rewards/chosen": 1.0845481157302856, "rewards/margins": 0.5575841069221497, "rewards/rejected": 0.526964008808136, "step": 3658 }, { "epoch": 1.97, "learning_rate": 2.7683957276197444e-08, "logits/chosen": -2.1210477352142334, "logits/rejected": -2.2661406993865967, "logps/chosen": -2.2240636348724365, "logps/rejected": -2.0688602924346924, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.9550615549087524, "rewards/margins": 0.032627761363983154, "rewards/rejected": 0.9224337935447693, "step": 3659 }, { "epoch": 1.97, "learning_rate": 2.765790613579132e-08, "logits/chosen": -2.1221115589141846, "logits/rejected": -2.1220028400421143, "logps/chosen": -1.0752789974212646, "logps/rejected": -1.1674782037734985, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0435999631881714, "rewards/margins": 0.25735485553741455, "rewards/rejected": 0.7862451076507568, "step": 3660 }, { "epoch": 1.97, "learning_rate": 2.7631862571398147e-08, "logits/chosen": -2.0805716514587402, "logits/rejected": -2.068558692932129, "logps/chosen": -12.615256309509277, "logps/rejected": -3.791440725326538, "loss": 0.3362, "rewards/accuracies": 1.0, "rewards/chosen": 1.4661656618118286, "rewards/margins": 0.9173693060874939, "rewards/rejected": 0.5487963557243347, "step": 3661 }, { "epoch": 1.98, "learning_rate": 2.7605826591849012e-08, "logits/chosen": -1.9795036315917969, "logits/rejected": -1.979742169380188, "logps/chosen": -0.6879322528839111, "logps/rejected": -2.031978130340576, "loss": 0.6302, "rewards/accuracies": 1.0, "rewards/chosen": 0.9859831929206848, "rewards/margins": 0.1300690770149231, "rewards/rejected": 0.8559141159057617, "step": 3662 }, { "epoch": 1.98, "learning_rate": 2.757979820597257e-08, "logits/chosen": -2.121886730194092, "logits/rejected": -2.1154370307922363, "logps/chosen": -4.751142978668213, "logps/rejected": -5.616977691650391, "loss": 0.3473, "rewards/accuracies": 1.0, "rewards/chosen": 1.4940325021743774, "rewards/margins": 0.879054069519043, "rewards/rejected": 0.6149784326553345, "step": 3663 }, { "epoch": 1.98, "learning_rate": 2.755377742259477e-08, "logits/chosen": -2.03079891204834, "logits/rejected": -2.0301859378814697, "logps/chosen": -0.31254148483276367, "logps/rejected": -4.890059947967529, "loss": 0.4748, "rewards/accuracies": 1.0, "rewards/chosen": 0.9571178555488586, "rewards/margins": 0.4980801045894623, "rewards/rejected": 0.45903775095939636, "step": 3664 }, { "epoch": 1.98, "learning_rate": 2.7527764250539052e-08, "logits/chosen": -2.028203248977661, "logits/rejected": -2.0293149948120117, "logps/chosen": -3.0794687271118164, "logps/rejected": -1.0460656881332397, "loss": 0.4819, "rewards/accuracies": 1.0, "rewards/chosen": 1.4371775388717651, "rewards/margins": 0.4795238971710205, "rewards/rejected": 0.9576536417007446, "step": 3665 }, { "epoch": 1.98, "learning_rate": 2.7501758698626286e-08, "logits/chosen": -2.0924978256225586, "logits/rejected": -2.0992140769958496, "logps/chosen": -2.0222067832946777, "logps/rejected": -12.695908546447754, "loss": 0.5282, "rewards/accuracies": 1.0, "rewards/chosen": 1.3381439447402954, "rewards/margins": 0.3626481294631958, "rewards/rejected": 0.9754958152770996, "step": 3666 }, { "epoch": 1.98, "learning_rate": 2.7475760775674693e-08, "logits/chosen": -2.0729591846466064, "logits/rejected": -2.32094669342041, "logps/chosen": -0.40851593017578125, "logps/rejected": -0.37529265880584717, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.914881706237793, "rewards/margins": 0.030158698558807373, "rewards/rejected": 0.8847230076789856, "step": 3667 }, { "epoch": 1.98, "learning_rate": 2.744977049049996e-08, "logits/chosen": -2.1237759590148926, "logits/rejected": -2.2572379112243652, "logps/chosen": -3.539655923843384, "logps/rejected": -3.4964005947113037, "loss": 0.6965, "rewards/accuracies": 0.0, "rewards/chosen": 0.7894349098205566, "rewards/margins": -0.006761014461517334, "rewards/rejected": 0.796195924282074, "step": 3668 }, { "epoch": 1.98, "learning_rate": 2.7423787851915185e-08, "logits/chosen": -2.053684949874878, "logits/rejected": -2.3008217811584473, "logps/chosen": -0.3670305609703064, "logps/rejected": -0.41531136631965637, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9266786575317383, "rewards/margins": 0.02217942476272583, "rewards/rejected": 0.9044992327690125, "step": 3669 }, { "epoch": 1.98, "learning_rate": 2.739781286873083e-08, "logits/chosen": -2.006878137588501, "logits/rejected": -2.2396976947784424, "logps/chosen": -0.6997219920158386, "logps/rejected": -0.7842701077461243, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.8741989135742188, "rewards/margins": -0.0009793639183044434, "rewards/rejected": 0.8751782774925232, "step": 3670 }, { "epoch": 1.98, "learning_rate": 2.7371845549754812e-08, "logits/chosen": -1.9446752071380615, "logits/rejected": -1.9440611600875854, "logps/chosen": -0.8860807418823242, "logps/rejected": -1.3973040580749512, "loss": 0.6217, "rewards/accuracies": 1.0, "rewards/chosen": 0.9432498812675476, "rewards/margins": 0.1484660506248474, "rewards/rejected": 0.7947838306427002, "step": 3671 }, { "epoch": 1.98, "learning_rate": 2.734588590379241e-08, "logits/chosen": -2.17452335357666, "logits/rejected": -2.29465913772583, "logps/chosen": -6.83633279800415, "logps/rejected": -6.65885591506958, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.5187771320343018, "rewards/margins": -0.016787171363830566, "rewards/rejected": 0.5355643033981323, "step": 3672 }, { "epoch": 1.98, "learning_rate": 2.7319933939646356e-08, "logits/chosen": -2.1589155197143555, "logits/rejected": -2.301936626434326, "logps/chosen": -2.035065174102783, "logps/rejected": -2.282240152359009, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.7426556944847107, "rewards/margins": 0.04214286804199219, "rewards/rejected": 0.7005128264427185, "step": 3673 }, { "epoch": 1.98, "learning_rate": 2.729398966611669e-08, "logits/chosen": -2.0669617652893066, "logits/rejected": -1.9632936716079712, "logps/chosen": -25.535926818847656, "logps/rejected": -8.371671676635742, "loss": 0.16, "rewards/accuracies": 1.0, "rewards/chosen": 2.0341687202453613, "rewards/margins": 1.7514334917068481, "rewards/rejected": 0.28273525834083557, "step": 3674 }, { "epoch": 1.98, "learning_rate": 2.726805309200092e-08, "logits/chosen": -1.998239278793335, "logits/rejected": -2.265338897705078, "logps/chosen": -0.9976202845573425, "logps/rejected": -0.965287983417511, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 1.0161181688308716, "rewards/margins": 0.019656002521514893, "rewards/rejected": 0.9964621663093567, "step": 3675 }, { "epoch": 1.98, "learning_rate": 2.7242124226093922e-08, "logits/chosen": -2.201430320739746, "logits/rejected": -2.1377549171447754, "logps/chosen": -23.646198272705078, "logps/rejected": -1.9112623929977417, "loss": 0.2149, "rewards/accuracies": 1.0, "rewards/chosen": 2.063274383544922, "rewards/margins": 1.4283406734466553, "rewards/rejected": 0.6349337697029114, "step": 3676 }, { "epoch": 1.98, "learning_rate": 2.7216203077187928e-08, "logits/chosen": -2.0360171794891357, "logits/rejected": -2.241678476333618, "logps/chosen": -0.5657584071159363, "logps/rejected": -0.5513749122619629, "loss": 0.6795, "rewards/accuracies": 1.0, "rewards/chosen": 0.8001368641853333, "rewards/margins": 0.02743297815322876, "rewards/rejected": 0.7727038860321045, "step": 3677 }, { "epoch": 1.98, "learning_rate": 2.7190289654072585e-08, "logits/chosen": -2.2108118534088135, "logits/rejected": -2.2056503295898438, "logps/chosen": -6.304153919219971, "logps/rejected": -3.5735368728637695, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 0.9410280585289001, "rewards/margins": 0.44944795966148376, "rewards/rejected": 0.4915800988674164, "step": 3678 }, { "epoch": 1.98, "learning_rate": 2.716438396553494e-08, "logits/chosen": -1.9226500988006592, "logits/rejected": -2.2594404220581055, "logps/chosen": -0.23410549759864807, "logps/rejected": -0.23182621598243713, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.8098785281181335, "rewards/margins": 0.00518423318862915, "rewards/rejected": 0.8046942949295044, "step": 3679 }, { "epoch": 1.98, "learning_rate": 2.7138486020359364e-08, "logits/chosen": -2.06988525390625, "logits/rejected": -2.0762367248535156, "logps/chosen": -0.7725850343704224, "logps/rejected": -3.2586169242858887, "loss": 0.4882, "rewards/accuracies": 1.0, "rewards/chosen": 0.9383487701416016, "rewards/margins": 0.4630129337310791, "rewards/rejected": 0.47533583641052246, "step": 3680 }, { "epoch": 1.99, "learning_rate": 2.7112595827327622e-08, "logits/chosen": -2.0538785457611084, "logits/rejected": -2.059318780899048, "logps/chosen": -0.4662623703479767, "logps/rejected": -7.423912048339844, "loss": 0.3973, "rewards/accuracies": 1.0, "rewards/chosen": 1.0217115879058838, "rewards/margins": 0.7178552150726318, "rewards/rejected": 0.30385637283325195, "step": 3681 }, { "epoch": 1.99, "learning_rate": 2.7086713395218874e-08, "logits/chosen": -2.0433390140533447, "logits/rejected": -2.2465150356292725, "logps/chosen": -0.47842538356781006, "logps/rejected": -0.49509820342063904, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 1.00391685962677, "rewards/margins": -0.0017402172088623047, "rewards/rejected": 1.0056570768356323, "step": 3682 }, { "epoch": 1.99, "learning_rate": 2.7060838732809642e-08, "logits/chosen": -2.060530185699463, "logits/rejected": -2.0631306171417236, "logps/chosen": -1.5834088325500488, "logps/rejected": -2.295241594314575, "loss": 0.5605, "rewards/accuracies": 1.0, "rewards/chosen": 1.1925381422042847, "rewards/margins": 0.28563588857650757, "rewards/rejected": 0.9069022536277771, "step": 3683 }, { "epoch": 1.99, "learning_rate": 2.703497184887378e-08, "logits/chosen": -1.9881716966629028, "logits/rejected": -2.012782573699951, "logps/chosen": -6.917612075805664, "logps/rejected": -24.685434341430664, "loss": 0.6514, "rewards/accuracies": 1.0, "rewards/chosen": 1.2258234024047852, "rewards/margins": 0.08528017997741699, "rewards/rejected": 1.1405432224273682, "step": 3684 }, { "epoch": 1.99, "learning_rate": 2.700911275218253e-08, "logits/chosen": -2.1176044940948486, "logits/rejected": -2.232168436050415, "logps/chosen": -0.7548193335533142, "logps/rejected": -0.8043572902679443, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 1.0138046741485596, "rewards/margins": -0.0029069185256958008, "rewards/rejected": 1.0167115926742554, "step": 3685 }, { "epoch": 1.99, "learning_rate": 2.6983261451504508e-08, "logits/chosen": -2.055433750152588, "logits/rejected": -2.054730176925659, "logps/chosen": -0.20484617352485657, "logps/rejected": -4.680108070373535, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 0.9944181442260742, "rewards/margins": 0.5868574976921082, "rewards/rejected": 0.40756064653396606, "step": 3686 }, { "epoch": 1.99, "learning_rate": 2.6957417955605642e-08, "logits/chosen": -2.162903308868408, "logits/rejected": -2.1591131687164307, "logps/chosen": -3.1774680614471436, "logps/rejected": -3.506143808364868, "loss": 0.3438, "rewards/accuracies": 1.0, "rewards/chosen": 1.577229619026184, "rewards/margins": 0.8910015225410461, "rewards/rejected": 0.6862280964851379, "step": 3687 }, { "epoch": 1.99, "learning_rate": 2.6931582273249254e-08, "logits/chosen": -2.027966022491455, "logits/rejected": -2.010342836380005, "logps/chosen": -7.00403356552124, "logps/rejected": -4.5428619384765625, "loss": 0.3548, "rewards/accuracies": 1.0, "rewards/chosen": 1.3686094284057617, "rewards/margins": 0.8535856008529663, "rewards/rejected": 0.5150238275527954, "step": 3688 }, { "epoch": 1.99, "learning_rate": 2.6905754413196026e-08, "logits/chosen": -2.2088065147399902, "logits/rejected": -2.200984239578247, "logps/chosen": -7.196788787841797, "logps/rejected": -3.5203769207000732, "loss": 0.462, "rewards/accuracies": 1.0, "rewards/chosen": 1.291559100151062, "rewards/margins": 0.5322275757789612, "rewards/rejected": 0.7593315243721008, "step": 3689 }, { "epoch": 1.99, "learning_rate": 2.6879934384203916e-08, "logits/chosen": -2.2021844387054443, "logits/rejected": -2.3077735900878906, "logps/chosen": -1.0032587051391602, "logps/rejected": -1.0774457454681396, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.9456964731216431, "rewards/margins": 0.01609182357788086, "rewards/rejected": 0.9296046495437622, "step": 3690 }, { "epoch": 1.99, "learning_rate": 2.6854122195028313e-08, "logits/chosen": -2.152385950088501, "logits/rejected": -2.1469616889953613, "logps/chosen": -7.02495813369751, "logps/rejected": -5.482683181762695, "loss": 0.3885, "rewards/accuracies": 1.0, "rewards/chosen": 1.092775583267212, "rewards/margins": 0.7449930906295776, "rewards/rejected": 0.34778252243995667, "step": 3691 }, { "epoch": 1.99, "learning_rate": 2.682831785442189e-08, "logits/chosen": -1.9667032957077026, "logits/rejected": -1.9662362337112427, "logps/chosen": -0.7441202402114868, "logps/rejected": -2.3362016677856445, "loss": 0.5953, "rewards/accuracies": 1.0, "rewards/chosen": 0.9272739291191101, "rewards/margins": 0.20624607801437378, "rewards/rejected": 0.7210278511047363, "step": 3692 }, { "epoch": 1.99, "learning_rate": 2.680252137113469e-08, "logits/chosen": -2.049769163131714, "logits/rejected": -2.2734124660491943, "logps/chosen": -1.5022022724151611, "logps/rejected": -2.0328288078308105, "loss": 0.7051, "rewards/accuracies": 0.0, "rewards/chosen": 0.8648414015769958, "rewards/margins": -0.023823261260986328, "rewards/rejected": 0.8886646628379822, "step": 3693 }, { "epoch": 1.99, "learning_rate": 2.677673275391409e-08, "logits/chosen": -2.003588914871216, "logits/rejected": -1.998839259147644, "logps/chosen": -3.1448237895965576, "logps/rejected": -5.361126899719238, "loss": 0.4567, "rewards/accuracies": 1.0, "rewards/chosen": 0.885547935962677, "rewards/margins": 0.5466046333312988, "rewards/rejected": 0.3389433026313782, "step": 3694 }, { "epoch": 1.99, "learning_rate": 2.6750952011504756e-08, "logits/chosen": -2.167335271835327, "logits/rejected": -2.181579828262329, "logps/chosen": -11.214293479919434, "logps/rejected": -10.84770393371582, "loss": 0.2824, "rewards/accuracies": 1.0, "rewards/chosen": 1.8529404401779175, "rewards/margins": 1.120107650756836, "rewards/rejected": 0.7328327298164368, "step": 3695 }, { "epoch": 1.99, "learning_rate": 2.6725179152648758e-08, "logits/chosen": -2.155564785003662, "logits/rejected": -2.1527278423309326, "logps/chosen": -3.8693480491638184, "logps/rejected": -3.6059060096740723, "loss": 0.5698, "rewards/accuracies": 1.0, "rewards/chosen": 0.7715306878089905, "rewards/margins": 0.2641145586967468, "rewards/rejected": 0.5074161291122437, "step": 3696 }, { "epoch": 1.99, "learning_rate": 2.6699414186085412e-08, "logits/chosen": -1.9840729236602783, "logits/rejected": -2.3303096294403076, "logps/chosen": -16.079504013061523, "logps/rejected": -0.6254007816314697, "loss": 0.5582, "rewards/accuracies": 1.0, "rewards/chosen": 1.1043100357055664, "rewards/margins": 0.2910703420639038, "rewards/rejected": 0.8132396936416626, "step": 3697 }, { "epoch": 1.99, "learning_rate": 2.6673657120551418e-08, "logits/chosen": -2.0439209938049316, "logits/rejected": -2.213585138320923, "logps/chosen": -0.28622108697891235, "logps/rejected": -0.31277939677238464, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.8964173197746277, "rewards/margins": 0.025319576263427734, "rewards/rejected": 0.8710977435112, "step": 3698 }, { "epoch": 2.0, "learning_rate": 2.6647907964780792e-08, "logits/chosen": -1.996321439743042, "logits/rejected": -1.9963072538375854, "logps/chosen": -0.4618813991546631, "logps/rejected": -1.849547266960144, "loss": 0.5701, "rewards/accuracies": 1.0, "rewards/chosen": 0.8818950653076172, "rewards/margins": 0.263435423374176, "rewards/rejected": 0.6184596419334412, "step": 3699 }, { "epoch": 2.0, "learning_rate": 2.6622166727504818e-08, "logits/chosen": -2.0856080055236816, "logits/rejected": -2.0846197605133057, "logps/chosen": -5.196979522705078, "logps/rejected": -3.336508274078369, "loss": 0.2719, "rewards/accuracies": 1.0, "rewards/chosen": 1.681771159172058, "rewards/margins": 1.1632981300354004, "rewards/rejected": 0.5184730887413025, "step": 3700 }, { "epoch": 2.0, "learning_rate": 2.6596433417452146e-08, "logits/chosen": -1.9746631383895874, "logits/rejected": -1.9736230373382568, "logps/chosen": -0.15145795047283173, "logps/rejected": -7.521554946899414, "loss": 0.4976, "rewards/accuracies": 1.0, "rewards/chosen": 0.8267035484313965, "rewards/margins": 0.43879833817481995, "rewards/rejected": 0.38790521025657654, "step": 3701 }, { "epoch": 2.0, "learning_rate": 2.657070804334872e-08, "logits/chosen": -2.1352651119232178, "logits/rejected": -2.302821636199951, "logps/chosen": -5.885167121887207, "logps/rejected": -5.746728897094727, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.5878405570983887, "rewards/margins": 0.00036525726318359375, "rewards/rejected": 0.5874752998352051, "step": 3702 }, { "epoch": 2.0, "learning_rate": 2.65449906139178e-08, "logits/chosen": -2.0727341175079346, "logits/rejected": -2.25724458694458, "logps/chosen": -10.182390213012695, "logps/rejected": -5.612362861633301, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 1.0192937850952148, "rewards/margins": -0.011150121688842773, "rewards/rejected": 1.0304439067840576, "step": 3703 }, { "epoch": 2.0, "learning_rate": 2.651928113787996e-08, "logits/chosen": -2.0956132411956787, "logits/rejected": -2.1021931171417236, "logps/chosen": -2.048773765563965, "logps/rejected": -3.2743356227874756, "loss": 0.4746, "rewards/accuracies": 1.0, "rewards/chosen": 1.042090892791748, "rewards/margins": 0.4986298680305481, "rewards/rejected": 0.5434610247612, "step": 3704 }, { "epoch": 2.0, "learning_rate": 2.6493579623953033e-08, "logits/chosen": -2.0440592765808105, "logits/rejected": -2.0442423820495605, "logps/chosen": -3.1714894771575928, "logps/rejected": -5.5162672996521, "loss": 0.3014, "rewards/accuracies": 1.0, "rewards/chosen": 1.610147476196289, "rewards/margins": 1.0448546409606934, "rewards/rejected": 0.5652928948402405, "step": 3705 }, { "epoch": 2.0, "learning_rate": 2.646788608085221e-08, "logits/chosen": -2.201638698577881, "logits/rejected": -2.2009782791137695, "logps/chosen": -1.6622198820114136, "logps/rejected": -5.957336902618408, "loss": 0.3855, "rewards/accuracies": 1.0, "rewards/chosen": 1.1058977842330933, "rewards/margins": 0.7542183995246887, "rewards/rejected": 0.35167938470840454, "step": 3706 }, { "epoch": 2.0, "learning_rate": 2.6442200517289913e-08, "logits/chosen": -2.0986428260803223, "logits/rejected": -2.1041367053985596, "logps/chosen": -0.4903962314128876, "logps/rejected": -15.817872047424316, "loss": 0.3679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515962600708008, "rewards/margins": 0.810352623462677, "rewards/rejected": 0.14124365150928497, "step": 3707 }, { "epoch": 2.0, "learning_rate": 2.6416522941975928e-08, "logits/chosen": -2.1364262104034424, "logits/rejected": -2.128145694732666, "logps/chosen": -5.411570072174072, "logps/rejected": -4.013559341430664, "loss": 0.4027, "rewards/accuracies": 1.0, "rewards/chosen": 1.2744029760360718, "rewards/margins": 0.7013700008392334, "rewards/rejected": 0.5730329751968384, "step": 3708 }, { "epoch": 2.0, "learning_rate": 2.6390853363617315e-08, "logits/chosen": -2.056644916534424, "logits/rejected": -2.281604766845703, "logps/chosen": -7.257056713104248, "logps/rejected": -5.036309719085693, "loss": 0.7335, "rewards/accuracies": 0.0, "rewards/chosen": 0.9941661953926086, "rewards/margins": -0.07913249731063843, "rewards/rejected": 1.073298692703247, "step": 3709 }, { "epoch": 2.0, "learning_rate": 2.6365191790918344e-08, "logits/chosen": -2.1461477279663086, "logits/rejected": -2.131845235824585, "logps/chosen": -1.6071112155914307, "logps/rejected": -7.751938819885254, "loss": 0.3698, "rewards/accuracies": 1.0, "rewards/chosen": 1.1709296703338623, "rewards/margins": 0.8042314052581787, "rewards/rejected": 0.3666982650756836, "step": 3710 }, { "epoch": 2.0, "learning_rate": 2.633953823258072e-08, "logits/chosen": -2.111506223678589, "logits/rejected": -2.098409414291382, "logps/chosen": -3.0681838989257812, "logps/rejected": -4.938312530517578, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.0450303554534912, "rewards/margins": 0.598176896572113, "rewards/rejected": 0.4468534588813782, "step": 3711 }, { "epoch": 2.0, "learning_rate": 2.6313892697303286e-08, "logits/chosen": -1.9817856550216675, "logits/rejected": -1.982979416847229, "logps/chosen": -3.737653970718384, "logps/rejected": -3.5302584171295166, "loss": 0.2636, "rewards/accuracies": 1.0, "rewards/chosen": 1.7329342365264893, "rewards/margins": 1.1987407207489014, "rewards/rejected": 0.5341934561729431, "step": 3712 }, { "epoch": 2.0, "learning_rate": 2.6288255193782238e-08, "logits/chosen": -2.017188787460327, "logits/rejected": -2.2912509441375732, "logps/chosen": -1.0844440460205078, "logps/rejected": -0.7770159244537354, "loss": 0.6798, "rewards/accuracies": 1.0, "rewards/chosen": 0.8633167147636414, "rewards/margins": 0.026916980743408203, "rewards/rejected": 0.8363997340202332, "step": 3713 }, { "epoch": 2.0, "learning_rate": 2.6262625730711062e-08, "logits/chosen": -2.1763978004455566, "logits/rejected": -2.1806955337524414, "logps/chosen": -0.21401433646678925, "logps/rejected": -4.2576823234558105, "loss": 0.4741, "rewards/accuracies": 1.0, "rewards/chosen": 0.8958685994148254, "rewards/margins": 0.4998166561126709, "rewards/rejected": 0.39605194330215454, "step": 3714 }, { "epoch": 2.0, "learning_rate": 2.6237004316780442e-08, "logits/chosen": -2.1362481117248535, "logits/rejected": -2.3124032020568848, "logps/chosen": -1.3892970085144043, "logps/rejected": -1.42948317527771, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.9915920495986938, "rewards/margins": -0.00322723388671875, "rewards/rejected": 0.9948192834854126, "step": 3715 }, { "epoch": 2.0, "learning_rate": 2.621139096067841e-08, "logits/chosen": -2.019523859024048, "logits/rejected": -2.0186729431152344, "logps/chosen": -0.3083474338054657, "logps/rejected": -4.44913387298584, "loss": 0.4755, "rewards/accuracies": 1.0, "rewards/chosen": 0.9691516160964966, "rewards/margins": 0.49616673588752747, "rewards/rejected": 0.4729848802089691, "step": 3716 }, { "epoch": 2.0, "learning_rate": 2.6185785671090243e-08, "logits/chosen": -2.064851760864258, "logits/rejected": -2.3323161602020264, "logps/chosen": -18.013723373413086, "logps/rejected": -15.756214141845703, "loss": 0.7545, "rewards/accuracies": 0.0, "rewards/chosen": 0.10316429287195206, "rewards/margins": -0.11908722668886185, "rewards/rejected": 0.2222515195608139, "step": 3717 }, { "epoch": 2.01, "learning_rate": 2.616018845669845e-08, "logits/chosen": -1.9716566801071167, "logits/rejected": -2.2708418369293213, "logps/chosen": -0.8505493998527527, "logps/rejected": -0.8060446977615356, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.034188151359558, "rewards/margins": 0.022957563400268555, "rewards/rejected": 1.0112305879592896, "step": 3718 }, { "epoch": 2.01, "learning_rate": 2.6134599326182856e-08, "logits/chosen": -2.176928997039795, "logits/rejected": -2.1817972660064697, "logps/chosen": -2.5758748054504395, "logps/rejected": -6.007700443267822, "loss": 0.3031, "rewards/accuracies": 1.0, "rewards/chosen": 1.5992488861083984, "rewards/margins": 1.0381512641906738, "rewards/rejected": 0.5610975623130798, "step": 3719 }, { "epoch": 2.01, "learning_rate": 2.6109018288220458e-08, "logits/chosen": -2.1195785999298096, "logits/rejected": -2.3372557163238525, "logps/chosen": -0.5284469723701477, "logps/rejected": -0.5711073875427246, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8875564932823181, "rewards/margins": 0.019708871841430664, "rewards/rejected": 0.8678476214408875, "step": 3720 }, { "epoch": 2.01, "learning_rate": 2.608344535148565e-08, "logits/chosen": -2.097320795059204, "logits/rejected": -2.249080181121826, "logps/chosen": -0.22207200527191162, "logps/rejected": -0.252746045589447, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.8119609951972961, "rewards/margins": 0.007296741008758545, "rewards/rejected": 0.8046642541885376, "step": 3721 }, { "epoch": 2.01, "learning_rate": 2.605788052464994e-08, "logits/chosen": -2.0872626304626465, "logits/rejected": -2.331294536590576, "logps/chosen": -0.3551585376262665, "logps/rejected": -0.3464219868183136, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 0.7557375431060791, "rewards/margins": 0.024123847484588623, "rewards/rejected": 0.7316136956214905, "step": 3722 }, { "epoch": 2.01, "learning_rate": 2.603232381638215e-08, "logits/chosen": -2.2015960216522217, "logits/rejected": -2.1883246898651123, "logps/chosen": -10.442107200622559, "logps/rejected": -2.2700958251953125, "loss": 0.4829, "rewards/accuracies": 1.0, "rewards/chosen": 1.5427526235580444, "rewards/margins": 0.47676169872283936, "rewards/rejected": 1.065990924835205, "step": 3723 }, { "epoch": 2.01, "learning_rate": 2.6006775235348376e-08, "logits/chosen": -2.1074771881103516, "logits/rejected": -2.1044538021087646, "logps/chosen": -7.274574279785156, "logps/rejected": -3.31514573097229, "loss": 0.357, "rewards/accuracies": 1.0, "rewards/chosen": 1.3694994449615479, "rewards/margins": 0.8463160991668701, "rewards/rejected": 0.5231833457946777, "step": 3724 }, { "epoch": 2.01, "learning_rate": 2.598123479021187e-08, "logits/chosen": -2.1443991661071777, "logits/rejected": -2.2679409980773926, "logps/chosen": -3.0414721965789795, "logps/rejected": -2.81522798538208, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9992839097976685, "rewards/margins": 0.003525257110595703, "rewards/rejected": 0.9957586526870728, "step": 3725 }, { "epoch": 2.01, "learning_rate": 2.5955702489633214e-08, "logits/chosen": -2.1040313243865967, "logits/rejected": -2.1001360416412354, "logps/chosen": -0.2756795585155487, "logps/rejected": -4.799802303314209, "loss": 0.4556, "rewards/accuracies": 1.0, "rewards/chosen": 0.9888901710510254, "rewards/margins": 0.5497338771820068, "rewards/rejected": 0.43915629386901855, "step": 3726 }, { "epoch": 2.01, "learning_rate": 2.5930178342270193e-08, "logits/chosen": -2.0331220626831055, "logits/rejected": -2.2812891006469727, "logps/chosen": -0.38209596276283264, "logps/rejected": -0.43317893147468567, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9501876831054688, "rewards/margins": 0.0014209747314453125, "rewards/rejected": 0.9487667083740234, "step": 3727 }, { "epoch": 2.01, "learning_rate": 2.5904662356777806e-08, "logits/chosen": -2.1624064445495605, "logits/rejected": -2.2965731620788574, "logps/chosen": -1.5740094184875488, "logps/rejected": -1.4523019790649414, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9572917819023132, "rewards/margins": 0.013585329055786133, "rewards/rejected": 0.9437064528465271, "step": 3728 }, { "epoch": 2.01, "learning_rate": 2.5879154541808336e-08, "logits/chosen": -2.1761136054992676, "logits/rejected": -2.11356520652771, "logps/chosen": -16.802627563476562, "logps/rejected": -9.614786148071289, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": 2.0175156593322754, "rewards/margins": 1.5737671852111816, "rewards/rejected": 0.44374847412109375, "step": 3729 }, { "epoch": 2.01, "learning_rate": 2.5853654906011203e-08, "logits/chosen": -2.0273947715759277, "logits/rejected": -2.018756151199341, "logps/chosen": -6.253227233886719, "logps/rejected": -15.181163787841797, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.2771986722946167, "rewards/margins": 0.5982059836387634, "rewards/rejected": 0.6789926886558533, "step": 3730 }, { "epoch": 2.01, "learning_rate": 2.5828163458033176e-08, "logits/chosen": -2.0635082721710205, "logits/rejected": -2.3059301376342773, "logps/chosen": -0.25041866302490234, "logps/rejected": -0.2772679030895233, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.7647920846939087, "rewards/margins": 0.0006881952285766602, "rewards/rejected": 0.764103889465332, "step": 3731 }, { "epoch": 2.01, "learning_rate": 2.580268020651819e-08, "logits/chosen": -2.0136585235595703, "logits/rejected": -2.2353925704956055, "logps/chosen": -0.9055637717247009, "logps/rejected": -0.890419602394104, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.8456358313560486, "rewards/margins": 0.014744400978088379, "rewards/rejected": 0.8308914303779602, "step": 3732 }, { "epoch": 2.01, "learning_rate": 2.577720516010736e-08, "logits/chosen": -2.1214616298675537, "logits/rejected": -2.2972452640533447, "logps/chosen": -1.8594036102294922, "logps/rejected": -1.6879873275756836, "loss": 0.6902, "rewards/accuracies": 1.0, "rewards/chosen": 1.0727704763412476, "rewards/margins": 0.00587010383605957, "rewards/rejected": 1.066900372505188, "step": 3733 }, { "epoch": 2.01, "learning_rate": 2.575173832743909e-08, "logits/chosen": -2.2091641426086426, "logits/rejected": -2.20089054107666, "logps/chosen": -7.0494866371154785, "logps/rejected": -3.4951891899108887, "loss": 0.4575, "rewards/accuracies": 1.0, "rewards/chosen": 1.3062893152236938, "rewards/margins": 0.5444390177726746, "rewards/rejected": 0.7618502974510193, "step": 3734 }, { "epoch": 2.01, "learning_rate": 2.572627971714893e-08, "logits/chosen": -2.1699678897857666, "logits/rejected": -2.213182210922241, "logps/chosen": -1.5127665996551514, "logps/rejected": -1.897611379623413, "loss": 0.6709, "rewards/accuracies": 1.0, "rewards/chosen": 1.0509759187698364, "rewards/margins": 0.04504275321960449, "rewards/rejected": 1.005933165550232, "step": 3735 }, { "epoch": 2.02, "learning_rate": 2.5700829337869695e-08, "logits/chosen": -1.9754184484481812, "logits/rejected": -1.9801182746887207, "logps/chosen": -2.6105613708496094, "logps/rejected": -4.545680999755859, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.377314567565918, "rewards/margins": 0.852571964263916, "rewards/rejected": 0.524742603302002, "step": 3736 }, { "epoch": 2.02, "learning_rate": 2.5675387198231413e-08, "logits/chosen": -2.227954387664795, "logits/rejected": -2.219909906387329, "logps/chosen": -3.613922119140625, "logps/rejected": -1.6081339120864868, "loss": 0.5237, "rewards/accuracies": 1.0, "rewards/chosen": 1.127347469329834, "rewards/margins": 0.3735826015472412, "rewards/rejected": 0.7537648677825928, "step": 3737 }, { "epoch": 2.02, "learning_rate": 2.564995330686125e-08, "logits/chosen": -2.0067849159240723, "logits/rejected": -2.0942306518554688, "logps/chosen": -13.250961303710938, "logps/rejected": -16.424535751342773, "loss": 0.47, "rewards/accuracies": 1.0, "rewards/chosen": 1.562754511833191, "rewards/margins": 0.5107471942901611, "rewards/rejected": 1.0520073175430298, "step": 3738 }, { "epoch": 2.02, "learning_rate": 2.5624527672383645e-08, "logits/chosen": -2.005441427230835, "logits/rejected": -2.0047051906585693, "logps/chosen": -5.886008262634277, "logps/rejected": -15.773123741149902, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 0.8637734651565552, "rewards/margins": 0.5487651824951172, "rewards/rejected": 0.3150082528591156, "step": 3739 }, { "epoch": 2.02, "learning_rate": 2.5599110303420213e-08, "logits/chosen": -2.157041311264038, "logits/rejected": -2.3254005908966064, "logps/chosen": -0.5276443958282471, "logps/rejected": -0.5636460781097412, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 1.1117521524429321, "rewards/margins": 0.015198111534118652, "rewards/rejected": 1.0965540409088135, "step": 3740 }, { "epoch": 2.02, "learning_rate": 2.557370120858977e-08, "logits/chosen": -2.0118370056152344, "logits/rejected": -2.254868507385254, "logps/chosen": -0.6413400769233704, "logps/rejected": -0.7006452083587646, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.8389636278152466, "rewards/margins": 0.006788372993469238, "rewards/rejected": 0.8321752548217773, "step": 3741 }, { "epoch": 2.02, "learning_rate": 2.5548300396508336e-08, "logits/chosen": -2.113009214401245, "logits/rejected": -2.285165309906006, "logps/chosen": -0.9492629766464233, "logps/rejected": -6.057106971740723, "loss": 0.561, "rewards/accuracies": 1.0, "rewards/chosen": 0.9708032608032227, "rewards/margins": 0.2845257520675659, "rewards/rejected": 0.6862775087356567, "step": 3742 }, { "epoch": 2.02, "learning_rate": 2.552290787578908e-08, "logits/chosen": -2.0692267417907715, "logits/rejected": -2.0695390701293945, "logps/chosen": -1.840143084526062, "logps/rejected": -1.305587649345398, "loss": 0.5669, "rewards/accuracies": 1.0, "rewards/chosen": 1.1810530424118042, "rewards/margins": 0.27080076932907104, "rewards/rejected": 0.9102522730827332, "step": 3743 }, { "epoch": 2.02, "learning_rate": 2.5497523655042413e-08, "logits/chosen": -1.9830501079559326, "logits/rejected": -2.047297716140747, "logps/chosen": -4.432976245880127, "logps/rejected": -23.351116180419922, "loss": 0.2665, "rewards/accuracies": 1.0, "rewards/chosen": 1.5371192693710327, "rewards/margins": 1.1860889196395874, "rewards/rejected": 0.3510303497314453, "step": 3744 }, { "epoch": 2.02, "learning_rate": 2.5472147742875882e-08, "logits/chosen": -2.0560922622680664, "logits/rejected": -2.2430059909820557, "logps/chosen": -4.300022602081299, "logps/rejected": -6.267977714538574, "loss": 0.5076, "rewards/accuracies": 1.0, "rewards/chosen": 0.9015640616416931, "rewards/margins": 0.41361087560653687, "rewards/rejected": 0.48795318603515625, "step": 3745 }, { "epoch": 2.02, "learning_rate": 2.5446780147894248e-08, "logits/chosen": -2.298079252243042, "logits/rejected": -2.1743500232696533, "logps/chosen": -30.619327545166016, "logps/rejected": -3.401323080062866, "loss": 0.1995, "rewards/accuracies": 1.0, "rewards/chosen": 2.441758394241333, "rewards/margins": 1.5105056762695312, "rewards/rejected": 0.9312527775764465, "step": 3746 }, { "epoch": 2.02, "learning_rate": 2.5421420878699474e-08, "logits/chosen": -2.1187071800231934, "logits/rejected": -2.1225268840789795, "logps/chosen": -2.96354341506958, "logps/rejected": -1.3674871921539307, "loss": 0.59, "rewards/accuracies": 1.0, "rewards/chosen": 1.397696852684021, "rewards/margins": 0.21809148788452148, "rewards/rejected": 1.1796053647994995, "step": 3747 }, { "epoch": 2.02, "learning_rate": 2.539606994389063e-08, "logits/chosen": -2.003748655319214, "logits/rejected": -2.014707088470459, "logps/chosen": -1.6712979078292847, "logps/rejected": -2.1990301609039307, "loss": 0.463, "rewards/accuracies": 1.0, "rewards/chosen": 1.237527847290039, "rewards/margins": 0.5296056866645813, "rewards/rejected": 0.7079221606254578, "step": 3748 }, { "epoch": 2.02, "learning_rate": 2.5370727352064025e-08, "logits/chosen": -2.1915056705474854, "logits/rejected": -2.3419172763824463, "logps/chosen": -16.770328521728516, "logps/rejected": -4.394383430480957, "loss": 0.7077, "rewards/accuracies": 0.0, "rewards/chosen": 0.8467701077461243, "rewards/margins": -0.028933703899383545, "rewards/rejected": 0.8757038116455078, "step": 3749 }, { "epoch": 2.02, "learning_rate": 2.5345393111813106e-08, "logits/chosen": -1.9904173612594604, "logits/rejected": -2.0001938343048096, "logps/chosen": -1.9825546741485596, "logps/rejected": -4.077496528625488, "loss": 0.4429, "rewards/accuracies": 1.0, "rewards/chosen": 1.1142452955245972, "rewards/margins": 0.5848302245140076, "rewards/rejected": 0.5294150710105896, "step": 3750 }, { "epoch": 2.02, "learning_rate": 2.532006723172851e-08, "logits/chosen": -2.1776552200317383, "logits/rejected": -2.006791114807129, "logps/chosen": -36.240821838378906, "logps/rejected": -4.294415473937988, "loss": 0.2139, "rewards/accuracies": 1.0, "rewards/chosen": 2.1765999794006348, "rewards/margins": 1.4331353902816772, "rewards/rejected": 0.7434645891189575, "step": 3751 }, { "epoch": 2.02, "learning_rate": 2.5294749720398033e-08, "logits/chosen": -2.2098422050476074, "logits/rejected": -2.211472511291504, "logps/chosen": -0.7445698976516724, "logps/rejected": -3.9367599487304688, "loss": 0.5056, "rewards/accuracies": 1.0, "rewards/chosen": 1.031136393547058, "rewards/margins": 0.41846537590026855, "rewards/rejected": 0.6126710176467896, "step": 3752 }, { "epoch": 2.02, "learning_rate": 2.5269440586406588e-08, "logits/chosen": -2.02266788482666, "logits/rejected": -2.035954713821411, "logps/chosen": -1.4664945602416992, "logps/rejected": -7.127536296844482, "loss": 0.4267, "rewards/accuracies": 1.0, "rewards/chosen": 1.1287795305252075, "rewards/margins": 0.6306575536727905, "rewards/rejected": 0.498121976852417, "step": 3753 }, { "epoch": 2.02, "learning_rate": 2.5244139838336336e-08, "logits/chosen": -2.14713978767395, "logits/rejected": -2.317112445831299, "logps/chosen": -24.351268768310547, "logps/rejected": -8.250288009643555, "loss": 0.5692, "rewards/accuracies": 1.0, "rewards/chosen": 1.1785885095596313, "rewards/margins": 0.26545220613479614, "rewards/rejected": 0.9131363034248352, "step": 3754 }, { "epoch": 2.03, "learning_rate": 2.521884748476649e-08, "logits/chosen": -2.219423532485962, "logits/rejected": -2.214776039123535, "logps/chosen": -5.232460021972656, "logps/rejected": -2.703268051147461, "loss": 0.419, "rewards/accuracies": 1.0, "rewards/chosen": 1.2358741760253906, "rewards/margins": 0.6529726982116699, "rewards/rejected": 0.5829014778137207, "step": 3755 }, { "epoch": 2.03, "learning_rate": 2.519356353427351e-08, "logits/chosen": -2.0596396923065186, "logits/rejected": -2.051168918609619, "logps/chosen": -11.649482727050781, "logps/rejected": -4.0118021965026855, "loss": 0.3917, "rewards/accuracies": 1.0, "rewards/chosen": 1.1898527145385742, "rewards/margins": 0.7351758480072021, "rewards/rejected": 0.45467686653137207, "step": 3756 }, { "epoch": 2.03, "learning_rate": 2.516828799543097e-08, "logits/chosen": -1.9935309886932373, "logits/rejected": -2.2985568046569824, "logps/chosen": -3.259645938873291, "logps/rejected": -0.41021645069122314, "loss": 0.74, "rewards/accuracies": 0.0, "rewards/chosen": 0.8786979913711548, "rewards/margins": -0.09162378311157227, "rewards/rejected": 0.970321774482727, "step": 3757 }, { "epoch": 2.03, "learning_rate": 2.5143020876809562e-08, "logits/chosen": -2.0773916244506836, "logits/rejected": -2.277578115463257, "logps/chosen": -0.4319159984588623, "logps/rejected": -0.44084522128105164, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8986930847167969, "rewards/margins": 0.008622825145721436, "rewards/rejected": 0.8900702595710754, "step": 3758 }, { "epoch": 2.03, "learning_rate": 2.5117762186977154e-08, "logits/chosen": -1.945525884628296, "logits/rejected": -2.3288381099700928, "logps/chosen": -5.571563720703125, "logps/rejected": -5.846494674682617, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.47202417254447937, "rewards/margins": 0.0035892724990844727, "rewards/rejected": 0.4684349000453949, "step": 3759 }, { "epoch": 2.03, "learning_rate": 2.5092511934498765e-08, "logits/chosen": -2.153609275817871, "logits/rejected": -2.2970263957977295, "logps/chosen": -1.3542979955673218, "logps/rejected": -4.144567012786865, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9302710890769958, "rewards/margins": 0.012939751148223877, "rewards/rejected": 0.917331337928772, "step": 3760 }, { "epoch": 2.03, "learning_rate": 2.5067270127936533e-08, "logits/chosen": -2.0715696811676025, "logits/rejected": -2.2655234336853027, "logps/chosen": -0.7129586338996887, "logps/rejected": -0.7720737457275391, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8990193605422974, "rewards/margins": 0.011985242366790771, "rewards/rejected": 0.8870341181755066, "step": 3761 }, { "epoch": 2.03, "learning_rate": 2.5042036775849766e-08, "logits/chosen": -2.2506659030914307, "logits/rejected": -2.261549234390259, "logps/chosen": -4.976166725158691, "logps/rejected": -4.954462051391602, "loss": 0.4353, "rewards/accuracies": 1.0, "rewards/chosen": 1.1361926794052124, "rewards/margins": 0.6061699390411377, "rewards/rejected": 0.5300227403640747, "step": 3762 }, { "epoch": 2.03, "learning_rate": 2.5016811886794832e-08, "logits/chosen": -2.120995283126831, "logits/rejected": -2.1089046001434326, "logps/chosen": -6.963409423828125, "logps/rejected": -3.9444079399108887, "loss": 0.3912, "rewards/accuracies": 1.0, "rewards/chosen": 1.2474101781845093, "rewards/margins": 0.7365924119949341, "rewards/rejected": 0.5108177661895752, "step": 3763 }, { "epoch": 2.03, "learning_rate": 2.4991595469325296e-08, "logits/chosen": -2.1131083965301514, "logits/rejected": -2.082474946975708, "logps/chosen": -11.43871021270752, "logps/rejected": -2.0304043292999268, "loss": 0.3025, "rewards/accuracies": 1.0, "rewards/chosen": 1.715850830078125, "rewards/margins": 1.0407394170761108, "rewards/rejected": 0.6751114130020142, "step": 3764 }, { "epoch": 2.03, "learning_rate": 2.4966387531991845e-08, "logits/chosen": -2.1469008922576904, "logits/rejected": -2.1930720806121826, "logps/chosen": -5.498162269592285, "logps/rejected": -23.094867706298828, "loss": 0.404, "rewards/accuracies": 1.0, "rewards/chosen": 1.331174612045288, "rewards/margins": 0.6975934505462646, "rewards/rejected": 0.6335811614990234, "step": 3765 }, { "epoch": 2.03, "learning_rate": 2.4941188083342247e-08, "logits/chosen": -2.0138609409332275, "logits/rejected": -2.2553436756134033, "logps/chosen": -0.3152726888656616, "logps/rejected": -0.33253711462020874, "loss": 0.6948, "rewards/accuracies": 0.0, "rewards/chosen": 0.9864814877510071, "rewards/margins": -0.0033255815505981445, "rewards/rejected": 0.9898070693016052, "step": 3766 }, { "epoch": 2.03, "learning_rate": 2.4915997131921446e-08, "logits/chosen": -2.1088802814483643, "logits/rejected": -2.0745222568511963, "logps/chosen": -5.476120471954346, "logps/rejected": -4.9000678062438965, "loss": 0.4156, "rewards/accuracies": 1.0, "rewards/chosen": 1.1863800287246704, "rewards/margins": 0.6630386114120483, "rewards/rejected": 0.5233414173126221, "step": 3767 }, { "epoch": 2.03, "learning_rate": 2.4890814686271445e-08, "logits/chosen": -2.1325414180755615, "logits/rejected": -2.120323896408081, "logps/chosen": -5.005621433258057, "logps/rejected": -8.29786491394043, "loss": 0.5024, "rewards/accuracies": 1.0, "rewards/chosen": 1.0390208959579468, "rewards/margins": 0.42659127712249756, "rewards/rejected": 0.6124296188354492, "step": 3768 }, { "epoch": 2.03, "learning_rate": 2.4865640754931415e-08, "logits/chosen": -2.10638165473938, "logits/rejected": -2.1055614948272705, "logps/chosen": -0.8210310339927673, "logps/rejected": -1.2723273038864136, "loss": 0.631, "rewards/accuracies": 1.0, "rewards/chosen": 0.9896596074104309, "rewards/margins": 0.12838363647460938, "rewards/rejected": 0.8612759709358215, "step": 3769 }, { "epoch": 2.03, "learning_rate": 2.4840475346437622e-08, "logits/chosen": -2.1564202308654785, "logits/rejected": -2.3159844875335693, "logps/chosen": -0.9607855081558228, "logps/rejected": -0.9647687673568726, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.9737818837165833, "rewards/margins": 0.016826152801513672, "rewards/rejected": 0.9569557309150696, "step": 3770 }, { "epoch": 2.03, "learning_rate": 2.4815318469323443e-08, "logits/chosen": -1.9848965406417847, "logits/rejected": -2.2499232292175293, "logps/chosen": -14.960037231445312, "logps/rejected": -0.6132150888442993, "loss": 0.5094, "rewards/accuracies": 1.0, "rewards/chosen": 1.2162567377090454, "rewards/margins": 0.40903347730636597, "rewards/rejected": 0.8072232604026794, "step": 3771 }, { "epoch": 2.03, "learning_rate": 2.479017013211937e-08, "logits/chosen": -2.016005277633667, "logits/rejected": -2.295334577560425, "logps/chosen": -2.5688986778259277, "logps/rejected": -2.2141709327697754, "loss": 0.7144, "rewards/accuracies": 0.0, "rewards/chosen": 0.5901408791542053, "rewards/margins": -0.042137324810028076, "rewards/rejected": 0.6322782039642334, "step": 3772 }, { "epoch": 2.04, "learning_rate": 2.476503034335296e-08, "logits/chosen": -2.092216968536377, "logits/rejected": -2.0987327098846436, "logps/chosen": -1.7691304683685303, "logps/rejected": -3.261251926422119, "loss": 0.4329, "rewards/accuracies": 1.0, "rewards/chosen": 1.3701601028442383, "rewards/margins": 0.6131268739700317, "rewards/rejected": 0.7570332288742065, "step": 3773 }, { "epoch": 2.04, "learning_rate": 2.4739899111548918e-08, "logits/chosen": -2.1146278381347656, "logits/rejected": -2.1407439708709717, "logps/chosen": -1.6300685405731201, "logps/rejected": -6.884424209594727, "loss": 0.4399, "rewards/accuracies": 1.0, "rewards/chosen": 1.2272700071334839, "rewards/margins": 0.5933453440666199, "rewards/rejected": 0.633924663066864, "step": 3774 }, { "epoch": 2.04, "learning_rate": 2.471477644522904e-08, "logits/chosen": -2.013040542602539, "logits/rejected": -1.9811978340148926, "logps/chosen": -14.040152549743652, "logps/rejected": -2.419426202774048, "loss": 0.3447, "rewards/accuracies": 1.0, "rewards/chosen": 1.6822788715362549, "rewards/margins": 0.8876945972442627, "rewards/rejected": 0.7945842742919922, "step": 3775 }, { "epoch": 2.04, "learning_rate": 2.4689662352912184e-08, "logits/chosen": -1.9615435600280762, "logits/rejected": -2.2627243995666504, "logps/chosen": -1.873732566833496, "logps/rejected": -2.016247510910034, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.7261202931404114, "rewards/margins": 0.0066375732421875, "rewards/rejected": 0.7194827198982239, "step": 3776 }, { "epoch": 2.04, "learning_rate": 2.4664556843114348e-08, "logits/chosen": -2.1278152465820312, "logits/rejected": -2.2996771335601807, "logps/chosen": -3.1555914878845215, "logps/rejected": -3.0589587688446045, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 0.7804100513458252, "rewards/margins": 0.0006840229034423828, "rewards/rejected": 0.7797260284423828, "step": 3777 }, { "epoch": 2.04, "learning_rate": 2.4639459924348567e-08, "logits/chosen": -1.9705584049224854, "logits/rejected": -1.975167989730835, "logps/chosen": -1.0625286102294922, "logps/rejected": -4.026889324188232, "loss": 0.6223, "rewards/accuracies": 1.0, "rewards/chosen": 1.1889742612838745, "rewards/margins": 0.1470777988433838, "rewards/rejected": 1.0418964624404907, "step": 3778 }, { "epoch": 2.04, "learning_rate": 2.4614371605124983e-08, "logits/chosen": -2.171755075454712, "logits/rejected": -2.1749558448791504, "logps/chosen": -1.9866085052490234, "logps/rejected": -4.072898864746094, "loss": 0.4871, "rewards/accuracies": 1.0, "rewards/chosen": 1.144743800163269, "rewards/margins": 0.46582406759262085, "rewards/rejected": 0.6789197325706482, "step": 3779 }, { "epoch": 2.04, "learning_rate": 2.45892918939509e-08, "logits/chosen": -2.0349531173706055, "logits/rejected": -2.0346293449401855, "logps/chosen": -0.30008023977279663, "logps/rejected": -2.686244487762451, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 0.8797985315322876, "rewards/margins": 0.3414316773414612, "rewards/rejected": 0.5383668541908264, "step": 3780 }, { "epoch": 2.04, "learning_rate": 2.456422079933056e-08, "logits/chosen": -2.055861473083496, "logits/rejected": -2.050119161605835, "logps/chosen": -7.680582046508789, "logps/rejected": -8.00339126586914, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 1.1266252994537354, "rewards/margins": 0.6621831655502319, "rewards/rejected": 0.4644421637058258, "step": 3781 }, { "epoch": 2.04, "learning_rate": 2.4539158329765408e-08, "logits/chosen": -2.025501012802124, "logits/rejected": -2.311490058898926, "logps/chosen": -7.539988040924072, "logps/rejected": -7.373757839202881, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.6295056939125061, "rewards/margins": 0.006053447723388672, "rewards/rejected": 0.6234522461891174, "step": 3782 }, { "epoch": 2.04, "learning_rate": 2.451410449375387e-08, "logits/chosen": -2.0901153087615967, "logits/rejected": -2.289684295654297, "logps/chosen": -0.8940868377685547, "logps/rejected": -0.950062096118927, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 1.019254446029663, "rewards/margins": 0.008409380912780762, "rewards/rejected": 1.0108450651168823, "step": 3783 }, { "epoch": 2.04, "learning_rate": 2.448905929979151e-08, "logits/chosen": -2.1543009281158447, "logits/rejected": -2.3026371002197266, "logps/chosen": -1.9349795579910278, "logps/rejected": -0.5991855263710022, "loss": 0.7268, "rewards/accuracies": 0.0, "rewards/chosen": 0.905540943145752, "rewards/margins": -0.06624335050582886, "rewards/rejected": 0.9717842936515808, "step": 3784 }, { "epoch": 2.04, "learning_rate": 2.446402275637095e-08, "logits/chosen": -2.0721614360809326, "logits/rejected": -2.073981761932373, "logps/chosen": -4.247497081756592, "logps/rejected": -3.175832509994507, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.6491367816925049, "rewards/margins": 0.9267666339874268, "rewards/rejected": 0.7223701477050781, "step": 3785 }, { "epoch": 2.04, "learning_rate": 2.443899487198184e-08, "logits/chosen": -2.0382702350616455, "logits/rejected": -2.038455009460449, "logps/chosen": -2.3781588077545166, "logps/rejected": -1.1112656593322754, "loss": 0.4358, "rewards/accuracies": 1.0, "rewards/chosen": 1.5410197973251343, "rewards/margins": 0.6048142910003662, "rewards/rejected": 0.9362055063247681, "step": 3786 }, { "epoch": 2.04, "learning_rate": 2.4413975655110936e-08, "logits/chosen": -2.111464738845825, "logits/rejected": -2.12103009223938, "logps/chosen": -1.4123674631118774, "logps/rejected": -1.9429817199707031, "loss": 0.5057, "rewards/accuracies": 1.0, "rewards/chosen": 1.2012988328933716, "rewards/margins": 0.41833430528640747, "rewards/rejected": 0.7829645276069641, "step": 3787 }, { "epoch": 2.04, "learning_rate": 2.4388965114242056e-08, "logits/chosen": -2.158360481262207, "logits/rejected": -2.1539080142974854, "logps/chosen": -2.5208070278167725, "logps/rejected": -5.506963729858398, "loss": 0.6054, "rewards/accuracies": 1.0, "rewards/chosen": 0.939963161945343, "rewards/margins": 0.18384552001953125, "rewards/rejected": 0.7561176419258118, "step": 3788 }, { "epoch": 2.04, "learning_rate": 2.4363963257856007e-08, "logits/chosen": -2.1533398628234863, "logits/rejected": -2.2564685344696045, "logps/chosen": -4.334662437438965, "logps/rejected": -2.1105575561523438, "loss": 0.6241, "rewards/accuracies": 1.0, "rewards/chosen": 1.038846731185913, "rewards/margins": 0.14312797784805298, "rewards/rejected": 0.8957187533378601, "step": 3789 }, { "epoch": 2.04, "learning_rate": 2.4338970094430777e-08, "logits/chosen": -2.1604530811309814, "logits/rejected": -2.3518128395080566, "logps/chosen": -1.24560546875, "logps/rejected": -1.2049148082733154, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 1.034592628479004, "rewards/margins": 0.008215904235839844, "rewards/rejected": 1.026376724243164, "step": 3790 }, { "epoch": 2.04, "learning_rate": 2.4313985632441286e-08, "logits/chosen": -2.1193530559539795, "logits/rejected": -2.331552505493164, "logps/chosen": -0.6094588041305542, "logps/rejected": -0.6929055452346802, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 1.035421371459961, "rewards/margins": 0.013810157775878906, "rewards/rejected": 1.021611213684082, "step": 3791 }, { "epoch": 2.05, "learning_rate": 2.4289009880359584e-08, "logits/chosen": -2.0081825256347656, "logits/rejected": -2.0030059814453125, "logps/chosen": -2.949537992477417, "logps/rejected": -5.318995952606201, "loss": 0.2976, "rewards/accuracies": 1.0, "rewards/chosen": 1.4449769258499146, "rewards/margins": 1.0594754219055176, "rewards/rejected": 0.38550153374671936, "step": 3792 }, { "epoch": 2.05, "learning_rate": 2.4264042846654692e-08, "logits/chosen": -1.9997785091400146, "logits/rejected": -2.2808425426483154, "logps/chosen": -0.5731610655784607, "logps/rejected": -0.6697818040847778, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.8980448842048645, "rewards/margins": -0.017692208290100098, "rewards/rejected": 0.9157370924949646, "step": 3793 }, { "epoch": 2.05, "learning_rate": 2.4239084539792743e-08, "logits/chosen": -2.033287286758423, "logits/rejected": -2.3534021377563477, "logps/chosen": -0.7396760582923889, "logps/rejected": -5.097623825073242, "loss": 0.6253, "rewards/accuracies": 1.0, "rewards/chosen": 1.0490992069244385, "rewards/margins": 0.14059168100357056, "rewards/rejected": 0.9085075259208679, "step": 3794 }, { "epoch": 2.05, "learning_rate": 2.4214134968236897e-08, "logits/chosen": -2.176568031311035, "logits/rejected": -2.1746320724487305, "logps/chosen": -0.9027928709983826, "logps/rejected": -5.486149311065674, "loss": 0.4679, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418285131454468, "rewards/margins": 0.5164944529533386, "rewards/rejected": 0.5253340601921082, "step": 3795 }, { "epoch": 2.05, "learning_rate": 2.4189194140447315e-08, "logits/chosen": -2.1414523124694824, "logits/rejected": -2.2675092220306396, "logps/chosen": -7.859533786773682, "logps/rejected": -4.701817035675049, "loss": 0.7293, "rewards/accuracies": 0.0, "rewards/chosen": 0.7777293920516968, "rewards/margins": -0.07106977701187134, "rewards/rejected": 0.8487991690635681, "step": 3796 }, { "epoch": 2.05, "learning_rate": 2.416426206488123e-08, "logits/chosen": -2.0622925758361816, "logits/rejected": -2.208991050720215, "logps/chosen": -0.9136696457862854, "logps/rejected": -0.8790656328201294, "loss": 0.6986, "rewards/accuracies": 0.0, "rewards/chosen": 0.9794581532478333, "rewards/margins": -0.010810673236846924, "rewards/rejected": 0.9902688264846802, "step": 3797 }, { "epoch": 2.05, "learning_rate": 2.4139338749992893e-08, "logits/chosen": -2.0985724925994873, "logits/rejected": -2.1027872562408447, "logps/chosen": -2.0186607837677, "logps/rejected": -3.433594226837158, "loss": 0.4675, "rewards/accuracies": 1.0, "rewards/chosen": 1.0451022386550903, "rewards/margins": 0.5175670385360718, "rewards/rejected": 0.5275352001190186, "step": 3798 }, { "epoch": 2.05, "learning_rate": 2.4114424204233586e-08, "logits/chosen": -2.087981939315796, "logits/rejected": -2.3131022453308105, "logps/chosen": -3.7795891761779785, "logps/rejected": -3.813652992248535, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 1.3769148588180542, "rewards/margins": 0.005126357078552246, "rewards/rejected": 1.371788501739502, "step": 3799 }, { "epoch": 2.05, "learning_rate": 2.408951843605165e-08, "logits/chosen": -1.9773504734039307, "logits/rejected": -2.27630877494812, "logps/chosen": -0.9250231981277466, "logps/rejected": -8.114486694335938, "loss": 0.5022, "rewards/accuracies": 1.0, "rewards/chosen": 0.8475605845451355, "rewards/margins": 0.42705103754997253, "rewards/rejected": 0.42050954699516296, "step": 3800 }, { "epoch": 2.05, "learning_rate": 2.4064621453892358e-08, "logits/chosen": -2.129347324371338, "logits/rejected": -2.1360230445861816, "logps/chosen": -3.6148343086242676, "logps/rejected": -2.4864323139190674, "loss": 0.4333, "rewards/accuracies": 1.0, "rewards/chosen": 1.1847330331802368, "rewards/margins": 0.6119977235794067, "rewards/rejected": 0.5727353096008301, "step": 3801 }, { "epoch": 2.05, "learning_rate": 2.4039733266198103e-08, "logits/chosen": -2.105921506881714, "logits/rejected": -2.1166560649871826, "logps/chosen": -0.5555586814880371, "logps/rejected": -6.322690963745117, "loss": 0.575, "rewards/accuracies": 1.0, "rewards/chosen": 1.0987738370895386, "rewards/margins": 0.252097487449646, "rewards/rejected": 0.8466763496398926, "step": 3802 }, { "epoch": 2.05, "learning_rate": 2.401485388140827e-08, "logits/chosen": -2.0780231952667236, "logits/rejected": -2.1889350414276123, "logps/chosen": -0.941166341304779, "logps/rejected": -0.9524626135826111, "loss": 0.6738, "rewards/accuracies": 1.0, "rewards/chosen": 0.8017663955688477, "rewards/margins": 0.03898841142654419, "rewards/rejected": 0.7627779841423035, "step": 3803 }, { "epoch": 2.05, "learning_rate": 2.39899833079592e-08, "logits/chosen": -2.009843349456787, "logits/rejected": -2.019766330718994, "logps/chosen": -1.578297734260559, "logps/rejected": -2.8921573162078857, "loss": 0.5038, "rewards/accuracies": 1.0, "rewards/chosen": 1.0688751935958862, "rewards/margins": 0.4230682849884033, "rewards/rejected": 0.6458069086074829, "step": 3804 }, { "epoch": 2.05, "learning_rate": 2.396512155428434e-08, "logits/chosen": -2.1026194095611572, "logits/rejected": -2.2876856327056885, "logps/chosen": -0.4975489377975464, "logps/rejected": -0.5427446961402893, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683586120605469, "rewards/margins": 0.010830223560333252, "rewards/rejected": 0.8575283885002136, "step": 3805 }, { "epoch": 2.05, "learning_rate": 2.3940268628814053e-08, "logits/chosen": -2.239567756652832, "logits/rejected": -2.2401301860809326, "logps/chosen": -1.7620365619659424, "logps/rejected": -1.165149450302124, "loss": 0.5578, "rewards/accuracies": 1.0, "rewards/chosen": 0.994064450263977, "rewards/margins": 0.29202741384506226, "rewards/rejected": 0.7020370364189148, "step": 3806 }, { "epoch": 2.05, "learning_rate": 2.3915424539975777e-08, "logits/chosen": -2.154879331588745, "logits/rejected": -2.263904571533203, "logps/chosen": -9.633209228515625, "logps/rejected": -9.578369140625, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8720241785049438, "rewards/margins": 0.0035366415977478027, "rewards/rejected": 0.868487536907196, "step": 3807 }, { "epoch": 2.05, "learning_rate": 2.3890589296193925e-08, "logits/chosen": -2.2167768478393555, "logits/rejected": -2.2125372886657715, "logps/chosen": -0.2237977385520935, "logps/rejected": -11.242624282836914, "loss": 0.3537, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363574385643005, "rewards/margins": 0.8570857644081116, "rewards/rejected": 0.07927169650793076, "step": 3808 }, { "epoch": 2.05, "learning_rate": 2.386576290588992e-08, "logits/chosen": -2.158916711807251, "logits/rejected": -2.250779151916504, "logps/chosen": -1.4215257167816162, "logps/rejected": -1.4174492359161377, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.898188591003418, "rewards/margins": 0.023429572582244873, "rewards/rejected": 0.8747590184211731, "step": 3809 }, { "epoch": 2.06, "learning_rate": 2.3840945377482196e-08, "logits/chosen": -2.0728206634521484, "logits/rejected": -2.0766828060150146, "logps/chosen": -0.7756703495979309, "logps/rejected": -3.263899087905884, "loss": 0.4881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9380402565002441, "rewards/margins": 0.4632326066493988, "rewards/rejected": 0.47480764985084534, "step": 3810 }, { "epoch": 2.06, "learning_rate": 2.3816136719386128e-08, "logits/chosen": -2.043480634689331, "logits/rejected": -2.2523112297058105, "logps/chosen": -0.5187888741493225, "logps/rejected": -0.5738453269004822, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.9783391952514648, "rewards/margins": -0.008153736591339111, "rewards/rejected": 0.986492931842804, "step": 3811 }, { "epoch": 2.06, "learning_rate": 2.3791336940014135e-08, "logits/chosen": -2.1115052700042725, "logits/rejected": -2.242060422897339, "logps/chosen": -4.493160724639893, "logps/rejected": -4.0512518882751465, "loss": 0.7071, "rewards/accuracies": 0.0, "rewards/chosen": 0.9032718539237976, "rewards/margins": -0.02777808904647827, "rewards/rejected": 0.9310499429702759, "step": 3812 }, { "epoch": 2.06, "learning_rate": 2.376654604777563e-08, "logits/chosen": -2.0535690784454346, "logits/rejected": -2.0612988471984863, "logps/chosen": -0.42887410521507263, "logps/rejected": -7.4268574714660645, "loss": 0.396, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254504680633545, "rewards/margins": 0.7218886613845825, "rewards/rejected": 0.30356183648109436, "step": 3813 }, { "epoch": 2.06, "learning_rate": 2.3741764051076963e-08, "logits/chosen": -2.222245216369629, "logits/rejected": -2.098696708679199, "logps/chosen": -21.69426155090332, "logps/rejected": -11.464265823364258, "loss": 0.2485, "rewards/accuracies": 1.0, "rewards/chosen": 1.7711858749389648, "rewards/margins": 1.265319585800171, "rewards/rejected": 0.5058662295341492, "step": 3814 }, { "epoch": 2.06, "learning_rate": 2.3716990958321526e-08, "logits/chosen": -2.076761484146118, "logits/rejected": -2.2891275882720947, "logps/chosen": -5.0650739669799805, "logps/rejected": -0.8712446689605713, "loss": 0.9256, "rewards/accuracies": 0.0, "rewards/chosen": 0.6821252703666687, "rewards/margins": -0.4208764433860779, "rewards/rejected": 1.1030017137527466, "step": 3815 }, { "epoch": 2.06, "learning_rate": 2.3692226777909623e-08, "logits/chosen": -2.1531033515930176, "logits/rejected": -2.1567330360412598, "logps/chosen": -2.7642526626586914, "logps/rejected": -5.2002997398376465, "loss": 0.3467, "rewards/accuracies": 1.0, "rewards/chosen": 1.251397967338562, "rewards/margins": 0.8810820579528809, "rewards/rejected": 0.37031587958335876, "step": 3816 }, { "epoch": 2.06, "learning_rate": 2.36674715182386e-08, "logits/chosen": -2.1008787155151367, "logits/rejected": -2.1007094383239746, "logps/chosen": -1.7434053421020508, "logps/rejected": -1.7388006448745728, "loss": 0.5778, "rewards/accuracies": 1.0, "rewards/chosen": 1.0777719020843506, "rewards/margins": 0.24579530954360962, "rewards/rejected": 0.831976592540741, "step": 3817 }, { "epoch": 2.06, "learning_rate": 2.3642725187702756e-08, "logits/chosen": -2.114633560180664, "logits/rejected": -2.113856077194214, "logps/chosen": -3.759922981262207, "logps/rejected": -2.153432846069336, "loss": 0.2455, "rewards/accuracies": 1.0, "rewards/chosen": 1.8865902423858643, "rewards/margins": 1.2792015075683594, "rewards/rejected": 0.6073886752128601, "step": 3818 }, { "epoch": 2.06, "learning_rate": 2.3617987794693355e-08, "logits/chosen": -1.989564061164856, "logits/rejected": -2.245169162750244, "logps/chosen": -1.0210047960281372, "logps/rejected": -0.9092351794242859, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.7358056902885437, "rewards/margins": 0.013683497905731201, "rewards/rejected": 0.7221221923828125, "step": 3819 }, { "epoch": 2.06, "learning_rate": 2.3593259347598656e-08, "logits/chosen": -2.0792176723480225, "logits/rejected": -2.066298246383667, "logps/chosen": -3.813809394836426, "logps/rejected": -5.906346321105957, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 1.4876607656478882, "rewards/margins": 1.149630069732666, "rewards/rejected": 0.3380306363105774, "step": 3820 }, { "epoch": 2.06, "learning_rate": 2.3568539854803825e-08, "logits/chosen": -2.1580514907836914, "logits/rejected": -2.336888313293457, "logps/chosen": -0.8758779764175415, "logps/rejected": -0.7526816725730896, "loss": 0.7006, "rewards/accuracies": 0.0, "rewards/chosen": 0.9387280344963074, "rewards/margins": -0.014880955219268799, "rewards/rejected": 0.9536089897155762, "step": 3821 }, { "epoch": 2.06, "learning_rate": 2.354382932469105e-08, "logits/chosen": -2.205324649810791, "logits/rejected": -2.205341339111328, "logps/chosen": -0.9728839993476868, "logps/rejected": -6.691705226898193, "loss": 0.4233, "rewards/accuracies": 1.0, "rewards/chosen": 1.046623706817627, "rewards/margins": 0.640504777431488, "rewards/rejected": 0.4061189293861389, "step": 3822 }, { "epoch": 2.06, "learning_rate": 2.3519127765639485e-08, "logits/chosen": -2.1817712783813477, "logits/rejected": -2.177408456802368, "logps/chosen": -5.396266937255859, "logps/rejected": -5.873056888580322, "loss": 0.2659, "rewards/accuracies": 1.0, "rewards/chosen": 1.5093942880630493, "rewards/margins": 1.1887803077697754, "rewards/rejected": 0.32061392068862915, "step": 3823 }, { "epoch": 2.06, "learning_rate": 2.349443518602517e-08, "logits/chosen": -2.093599557876587, "logits/rejected": -2.2564034461975098, "logps/chosen": -0.42860448360443115, "logps/rejected": -7.317495346069336, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 0.8792323470115662, "rewards/margins": 0.13870567083358765, "rewards/rejected": 0.7405266761779785, "step": 3824 }, { "epoch": 2.06, "learning_rate": 2.346975159422119e-08, "logits/chosen": -2.027818202972412, "logits/rejected": -2.281052350997925, "logps/chosen": -0.8301474452018738, "logps/rejected": -0.7900190949440002, "loss": 0.6808, "rewards/accuracies": 1.0, "rewards/chosen": 1.0470458269119263, "rewards/margins": 0.024882197380065918, "rewards/rejected": 1.0221636295318604, "step": 3825 }, { "epoch": 2.06, "learning_rate": 2.3445076998597503e-08, "logits/chosen": -1.9738541841506958, "logits/rejected": -2.237434148788452, "logps/chosen": -1.5332105159759521, "logps/rejected": -4.089029312133789, "loss": 0.6649, "rewards/accuracies": 1.0, "rewards/chosen": 1.0177708864212036, "rewards/margins": 0.05739110708236694, "rewards/rejected": 0.9603797793388367, "step": 3826 }, { "epoch": 2.06, "learning_rate": 2.3420411407521066e-08, "logits/chosen": -1.9804213047027588, "logits/rejected": -1.980556845664978, "logps/chosen": -3.065641403198242, "logps/rejected": -2.4484386444091797, "loss": 0.4499, "rewards/accuracies": 1.0, "rewards/chosen": 1.3168295621871948, "rewards/margins": 0.5653841495513916, "rewards/rejected": 0.7514454126358032, "step": 3827 }, { "epoch": 2.06, "learning_rate": 2.3395754829355784e-08, "logits/chosen": -2.0600810050964355, "logits/rejected": -2.2672505378723145, "logps/chosen": -0.2767331302165985, "logps/rejected": -0.3089510500431061, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 1.0302958488464355, "rewards/margins": 0.028281569480895996, "rewards/rejected": 1.0020142793655396, "step": 3828 }, { "epoch": 2.07, "learning_rate": 2.3371107272462477e-08, "logits/chosen": -2.1058757305145264, "logits/rejected": -2.107276439666748, "logps/chosen": -3.8874387741088867, "logps/rejected": -3.0565550327301025, "loss": 0.4596, "rewards/accuracies": 1.0, "rewards/chosen": 1.1725224256515503, "rewards/margins": 0.5387553572654724, "rewards/rejected": 0.6337670683860779, "step": 3829 }, { "epoch": 2.07, "learning_rate": 2.3346468745198944e-08, "logits/chosen": -1.9625372886657715, "logits/rejected": -1.9612042903900146, "logps/chosen": -0.3325209617614746, "logps/rejected": -6.106978893280029, "loss": 0.4753, "rewards/accuracies": 1.0, "rewards/chosen": 0.9650232195854187, "rewards/margins": 0.49670785665512085, "rewards/rejected": 0.46831536293029785, "step": 3830 }, { "epoch": 2.07, "learning_rate": 2.3321839255919868e-08, "logits/chosen": -2.0379133224487305, "logits/rejected": -2.0367844104766846, "logps/chosen": -6.837355613708496, "logps/rejected": -5.757425308227539, "loss": 0.4658, "rewards/accuracies": 1.0, "rewards/chosen": 1.0365875959396362, "rewards/margins": 0.5222086310386658, "rewards/rejected": 0.5143789649009705, "step": 3831 }, { "epoch": 2.07, "learning_rate": 2.329721881297691e-08, "logits/chosen": -1.929348111152649, "logits/rejected": -1.9332807064056396, "logps/chosen": -1.560663104057312, "logps/rejected": -3.1666269302368164, "loss": 0.509, "rewards/accuracies": 1.0, "rewards/chosen": 1.0431770086288452, "rewards/margins": 0.40995585918426514, "rewards/rejected": 0.6332211494445801, "step": 3832 }, { "epoch": 2.07, "learning_rate": 2.327260742471867e-08, "logits/chosen": -2.0940849781036377, "logits/rejected": -2.2167227268218994, "logps/chosen": -1.464241862297058, "logps/rejected": -1.5475590229034424, "loss": 0.6739, "rewards/accuracies": 1.0, "rewards/chosen": 0.6806483268737793, "rewards/margins": 0.038834452629089355, "rewards/rejected": 0.6418138742446899, "step": 3833 }, { "epoch": 2.07, "learning_rate": 2.324800509949062e-08, "logits/chosen": -2.1304306983947754, "logits/rejected": -2.13236927986145, "logps/chosen": -0.20025403797626495, "logps/rejected": -4.412725925445557, "loss": 0.5375, "rewards/accuracies": 1.0, "rewards/chosen": 0.8427444696426392, "rewards/margins": 0.33996063470840454, "rewards/rejected": 0.5027838349342346, "step": 3834 }, { "epoch": 2.07, "learning_rate": 2.3223411845635228e-08, "logits/chosen": -2.1054749488830566, "logits/rejected": -2.2859084606170654, "logps/chosen": -0.33698493242263794, "logps/rejected": -0.32742005586624146, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.7677657604217529, "rewards/margins": 0.03159064054489136, "rewards/rejected": 0.7361751198768616, "step": 3835 }, { "epoch": 2.07, "learning_rate": 2.319882767149186e-08, "logits/chosen": -2.040020227432251, "logits/rejected": -2.3020033836364746, "logps/chosen": -3.876098871231079, "logps/rejected": -2.4631834030151367, "loss": 0.7131, "rewards/accuracies": 0.0, "rewards/chosen": 0.8369100689888, "rewards/margins": -0.0395125150680542, "rewards/rejected": 0.8764225840568542, "step": 3836 }, { "epoch": 2.07, "learning_rate": 2.317425258539676e-08, "logits/chosen": -2.0247204303741455, "logits/rejected": -2.03218936920166, "logps/chosen": -1.4112882614135742, "logps/rejected": -3.3762614727020264, "loss": 0.3237, "rewards/accuracies": 1.0, "rewards/chosen": 1.52755868434906, "rewards/margins": 0.9616659879684448, "rewards/rejected": 0.5658926963806152, "step": 3837 }, { "epoch": 2.07, "learning_rate": 2.314968659568318e-08, "logits/chosen": -2.020547866821289, "logits/rejected": -2.0254015922546387, "logps/chosen": -1.0465126037597656, "logps/rejected": -3.2611308097839355, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 1.0322309732437134, "rewards/margins": 0.5038436651229858, "rewards/rejected": 0.5283873081207275, "step": 3838 }, { "epoch": 2.07, "learning_rate": 2.3125129710681208e-08, "logits/chosen": -2.042274236679077, "logits/rejected": -2.0361733436584473, "logps/chosen": -1.8255271911621094, "logps/rejected": -6.806208610534668, "loss": 0.4883, "rewards/accuracies": 1.0, "rewards/chosen": 1.1251252889633179, "rewards/margins": 0.46275222301483154, "rewards/rejected": 0.6623730659484863, "step": 3839 }, { "epoch": 2.07, "learning_rate": 2.3100581938717896e-08, "logits/chosen": -2.0004992485046387, "logits/rejected": -2.000023603439331, "logps/chosen": -0.9572343230247498, "logps/rejected": -2.6421093940734863, "loss": 0.5409, "rewards/accuracies": 1.0, "rewards/chosen": 1.130137324333191, "rewards/margins": 0.3320087790489197, "rewards/rejected": 0.7981285452842712, "step": 3840 }, { "epoch": 2.07, "learning_rate": 2.3076043288117143e-08, "logits/chosen": -2.1285903453826904, "logits/rejected": -1.9485780000686646, "logps/chosen": -36.03280258178711, "logps/rejected": -2.8463892936706543, "loss": 0.212, "rewards/accuracies": 1.0, "rewards/chosen": 2.119863986968994, "rewards/margins": 1.4432069063186646, "rewards/rejected": 0.6766570806503296, "step": 3841 }, { "epoch": 2.07, "learning_rate": 2.3051513767199824e-08, "logits/chosen": -2.1538689136505127, "logits/rejected": -2.289029598236084, "logps/chosen": -0.4469989538192749, "logps/rejected": -0.41094231605529785, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.9445675015449524, "rewards/margins": 0.019672691822052002, "rewards/rejected": 0.9248948097229004, "step": 3842 }, { "epoch": 2.07, "learning_rate": 2.3026993384283704e-08, "logits/chosen": -2.077177047729492, "logits/rejected": -2.264888286590576, "logps/chosen": -0.389617383480072, "logps/rejected": -0.41354265809059143, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9467042088508606, "rewards/margins": 0.012435555458068848, "rewards/rejected": 0.9342686533927917, "step": 3843 }, { "epoch": 2.07, "learning_rate": 2.30024821476834e-08, "logits/chosen": -2.0910446643829346, "logits/rejected": -2.252129316329956, "logps/chosen": -0.41038578748703003, "logps/rejected": -3.1751930713653564, "loss": 0.6165, "rewards/accuracies": 1.0, "rewards/chosen": 0.9844611287117004, "rewards/margins": 0.1597299575805664, "rewards/rejected": 0.824731171131134, "step": 3844 }, { "epoch": 2.07, "learning_rate": 2.2977980065710473e-08, "logits/chosen": -2.0648038387298584, "logits/rejected": -2.3928332328796387, "logps/chosen": -0.11652917414903641, "logps/rejected": -0.11944568157196045, "loss": 0.702, "rewards/accuracies": 0.0, "rewards/chosen": 0.9336714744567871, "rewards/margins": -0.017609119415283203, "rewards/rejected": 0.9512805938720703, "step": 3845 }, { "epoch": 2.07, "learning_rate": 2.29534871466734e-08, "logits/chosen": -1.937670111656189, "logits/rejected": -2.2209577560424805, "logps/chosen": -2.8292219638824463, "logps/rejected": -5.7618913650512695, "loss": 0.6505, "rewards/accuracies": 1.0, "rewards/chosen": 0.8834707140922546, "rewards/margins": 0.08715254068374634, "rewards/rejected": 0.7963181734085083, "step": 3846 }, { "epoch": 2.07, "learning_rate": 2.2929003398877457e-08, "logits/chosen": -2.0519862174987793, "logits/rejected": -1.9973057508468628, "logps/chosen": -8.139471054077148, "logps/rejected": -8.219944953918457, "loss": 0.3994, "rewards/accuracies": 1.0, "rewards/chosen": 1.477783441543579, "rewards/margins": 0.7113178372383118, "rewards/rejected": 0.7664656043052673, "step": 3847 }, { "epoch": 2.08, "learning_rate": 2.2904528830624947e-08, "logits/chosen": -2.1120219230651855, "logits/rejected": -2.1119601726531982, "logps/chosen": -1.2194064855575562, "logps/rejected": -3.338132858276367, "loss": 0.5172, "rewards/accuracies": 1.0, "rewards/chosen": 0.982387363910675, "rewards/margins": 0.389626145362854, "rewards/rejected": 0.592761218547821, "step": 3848 }, { "epoch": 2.08, "learning_rate": 2.2880063450214936e-08, "logits/chosen": -2.0702614784240723, "logits/rejected": -2.2930757999420166, "logps/chosen": -0.7442750930786133, "logps/rejected": -0.85249924659729, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.7414917945861816, "rewards/margins": 0.03249025344848633, "rewards/rejected": 0.7090015411376953, "step": 3849 }, { "epoch": 2.08, "learning_rate": 2.2855607265943444e-08, "logits/chosen": -2.0697240829467773, "logits/rejected": -2.3024253845214844, "logps/chosen": -6.650835990905762, "logps/rejected": -7.260716438293457, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": 0.7989201545715332, "rewards/margins": 0.07845687866210938, "rewards/rejected": 0.7204632759094238, "step": 3850 }, { "epoch": 2.08, "learning_rate": 2.2831160286103367e-08, "logits/chosen": -2.139270544052124, "logits/rejected": -2.2648980617523193, "logps/chosen": -1.393386721611023, "logps/rejected": -1.407682180404663, "loss": 0.6778, "rewards/accuracies": 1.0, "rewards/chosen": 0.8034539222717285, "rewards/margins": 0.03086698055267334, "rewards/rejected": 0.7725869417190552, "step": 3851 }, { "epoch": 2.08, "learning_rate": 2.2806722518984433e-08, "logits/chosen": -2.0490195751190186, "logits/rejected": -2.0443079471588135, "logps/chosen": -6.428447246551514, "logps/rejected": -4.624910354614258, "loss": 0.2638, "rewards/accuracies": 1.0, "rewards/chosen": 1.6669033765792847, "rewards/margins": 1.1978373527526855, "rewards/rejected": 0.4690660536289215, "step": 3852 }, { "epoch": 2.08, "learning_rate": 2.2782293972873323e-08, "logits/chosen": -2.0647802352905273, "logits/rejected": -2.287397861480713, "logps/chosen": -8.230539321899414, "logps/rejected": -2.4099488258361816, "loss": 0.7404, "rewards/accuracies": 0.0, "rewards/chosen": 0.9260841608047485, "rewards/margins": -0.0923759937286377, "rewards/rejected": 1.0184601545333862, "step": 3853 }, { "epoch": 2.08, "learning_rate": 2.2757874656053495e-08, "logits/chosen": -2.1347901821136475, "logits/rejected": -2.2791550159454346, "logps/chosen": -2.20810604095459, "logps/rejected": -2.200746536254883, "loss": 0.6814, "rewards/accuracies": 1.0, "rewards/chosen": 0.6824384927749634, "rewards/margins": 0.023570358753204346, "rewards/rejected": 0.658868134021759, "step": 3854 }, { "epoch": 2.08, "learning_rate": 2.273346457680536e-08, "logits/chosen": -2.0151052474975586, "logits/rejected": -2.2264297008514404, "logps/chosen": -2.1466798782348633, "logps/rejected": -1.8189492225646973, "loss": 0.7008, "rewards/accuracies": 0.0, "rewards/chosen": 0.6836993098258972, "rewards/margins": -0.01524043083190918, "rewards/rejected": 0.6989397406578064, "step": 3855 }, { "epoch": 2.08, "learning_rate": 2.2709063743406194e-08, "logits/chosen": -1.9712224006652832, "logits/rejected": -2.235222101211548, "logps/chosen": -0.2988112270832062, "logps/rejected": -0.22421394288539886, "loss": 0.6718, "rewards/accuracies": 1.0, "rewards/chosen": 0.9028406143188477, "rewards/margins": 0.0432438850402832, "rewards/rejected": 0.8595967292785645, "step": 3856 }, { "epoch": 2.08, "learning_rate": 2.2684672164130036e-08, "logits/chosen": -2.0716090202331543, "logits/rejected": -2.312035322189331, "logps/chosen": -2.507404088973999, "logps/rejected": -8.314340591430664, "loss": 0.5652, "rewards/accuracies": 1.0, "rewards/chosen": 0.7830401659011841, "rewards/margins": 0.2747839093208313, "rewards/rejected": 0.5082562565803528, "step": 3857 }, { "epoch": 2.08, "learning_rate": 2.2660289847247943e-08, "logits/chosen": -2.1600921154022217, "logits/rejected": -2.0775301456451416, "logps/chosen": -35.83495330810547, "logps/rejected": -9.318042755126953, "loss": 0.2588, "rewards/accuracies": 1.0, "rewards/chosen": 2.157975435256958, "rewards/margins": 1.2195172309875488, "rewards/rejected": 0.938458263874054, "step": 3858 }, { "epoch": 2.08, "learning_rate": 2.2635916801027704e-08, "logits/chosen": -2.113002300262451, "logits/rejected": -2.114548444747925, "logps/chosen": -0.8263245820999146, "logps/rejected": -3.5318691730499268, "loss": 0.4848, "rewards/accuracies": 1.0, "rewards/chosen": 1.0177390575408936, "rewards/margins": 0.4717591404914856, "rewards/rejected": 0.545979917049408, "step": 3859 }, { "epoch": 2.08, "learning_rate": 2.261155303373402e-08, "logits/chosen": -2.0727388858795166, "logits/rejected": -2.2468600273132324, "logps/chosen": -0.9394934177398682, "logps/rejected": -0.9269684553146362, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8708948493003845, "rewards/margins": 0.0028294920921325684, "rewards/rejected": 0.868065357208252, "step": 3860 }, { "epoch": 2.08, "learning_rate": 2.2587198553628456e-08, "logits/chosen": -2.046316623687744, "logits/rejected": -2.052464008331299, "logps/chosen": -1.58735990524292, "logps/rejected": -2.3298096656799316, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 1.2028506994247437, "rewards/margins": 0.5502362251281738, "rewards/rejected": 0.6526144742965698, "step": 3861 }, { "epoch": 2.08, "learning_rate": 2.256285336896937e-08, "logits/chosen": -2.059147357940674, "logits/rejected": -2.3093700408935547, "logps/chosen": -4.643293380737305, "logps/rejected": -4.397303581237793, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 1.1288081407546997, "rewards/margins": -0.008396387100219727, "rewards/rejected": 1.1372045278549194, "step": 3862 }, { "epoch": 2.08, "learning_rate": 2.2538517488012043e-08, "logits/chosen": -2.1327965259552, "logits/rejected": -2.3356711864471436, "logps/chosen": -0.5353704690933228, "logps/rejected": -0.607537031173706, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.943393349647522, "rewards/margins": 0.014180302619934082, "rewards/rejected": 0.9292130470275879, "step": 3863 }, { "epoch": 2.08, "learning_rate": 2.2514190919008536e-08, "logits/chosen": -1.9953020811080933, "logits/rejected": -1.9949818849563599, "logps/chosen": -1.3697940111160278, "logps/rejected": -3.5635716915130615, "loss": 0.5303, "rewards/accuracies": 1.0, "rewards/chosen": 0.9066039323806763, "rewards/margins": 0.35736292600631714, "rewards/rejected": 0.5492410063743591, "step": 3864 }, { "epoch": 2.08, "learning_rate": 2.2489873670207783e-08, "logits/chosen": -2.0991029739379883, "logits/rejected": -2.2893309593200684, "logps/chosen": -7.181269645690918, "logps/rejected": -7.271334648132324, "loss": 0.6698, "rewards/accuracies": 1.0, "rewards/chosen": 1.3283604383468628, "rewards/margins": 0.047289252281188965, "rewards/rejected": 1.2810711860656738, "step": 3865 }, { "epoch": 2.09, "learning_rate": 2.2465565749855585e-08, "logits/chosen": -2.082195520401001, "logits/rejected": -2.088022232055664, "logps/chosen": -2.153196096420288, "logps/rejected": -3.8859288692474365, "loss": 0.4215, "rewards/accuracies": 1.0, "rewards/chosen": 1.2120784521102905, "rewards/margins": 0.6457929015159607, "rewards/rejected": 0.5662855505943298, "step": 3866 }, { "epoch": 2.09, "learning_rate": 2.2441267166194493e-08, "logits/chosen": -2.001925468444824, "logits/rejected": -2.267212152481079, "logps/chosen": -1.1868956089019775, "logps/rejected": -1.1621735095977783, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.8864769339561462, "rewards/margins": 0.003587067127227783, "rewards/rejected": 0.8828898668289185, "step": 3867 }, { "epoch": 2.09, "learning_rate": 2.241697792746402e-08, "logits/chosen": -2.0791232585906982, "logits/rejected": -2.286036968231201, "logps/chosen": -6.877025604248047, "logps/rejected": -1.9113473892211914, "loss": 0.7659, "rewards/accuracies": 0.0, "rewards/chosen": 0.6964675784111023, "rewards/margins": -0.14051419496536255, "rewards/rejected": 0.8369817733764648, "step": 3868 }, { "epoch": 2.09, "learning_rate": 2.239269804190038e-08, "logits/chosen": -2.2038185596466064, "logits/rejected": -2.0254921913146973, "logps/chosen": -24.818828582763672, "logps/rejected": -4.023904323577881, "loss": 0.2071, "rewards/accuracies": 1.0, "rewards/chosen": 2.0731360912323, "rewards/margins": 1.469115972518921, "rewards/rejected": 0.6040201783180237, "step": 3869 }, { "epoch": 2.09, "learning_rate": 2.2368427517736706e-08, "logits/chosen": -2.0413854122161865, "logits/rejected": -2.2713475227355957, "logps/chosen": -0.39052093029022217, "logps/rejected": -0.4094190001487732, "loss": 0.6862, "rewards/accuracies": 1.0, "rewards/chosen": 0.930962085723877, "rewards/margins": 0.013911306858062744, "rewards/rejected": 0.9170507788658142, "step": 3870 }, { "epoch": 2.09, "learning_rate": 2.234416636320293e-08, "logits/chosen": -2.157998561859131, "logits/rejected": -2.1575865745544434, "logps/chosen": -1.0555179119110107, "logps/rejected": -6.471106052398682, "loss": 0.5992, "rewards/accuracies": 1.0, "rewards/chosen": 1.0845026969909668, "rewards/margins": 0.1977471113204956, "rewards/rejected": 0.8867555856704712, "step": 3871 }, { "epoch": 2.09, "learning_rate": 2.2319914586525774e-08, "logits/chosen": -2.196359872817993, "logits/rejected": -2.1991591453552246, "logps/chosen": -0.28087836503982544, "logps/rejected": -4.623538970947266, "loss": 0.4524, "rewards/accuracies": 1.0, "rewards/chosen": 0.9289616942405701, "rewards/margins": 0.5583624243736267, "rewards/rejected": 0.37059926986694336, "step": 3872 }, { "epoch": 2.09, "learning_rate": 2.2295672195928823e-08, "logits/chosen": -1.9714521169662476, "logits/rejected": -2.2685697078704834, "logps/chosen": -0.5992372035980225, "logps/rejected": -2.9797961711883545, "loss": 0.5803, "rewards/accuracies": 1.0, "rewards/chosen": 1.0445997714996338, "rewards/margins": 0.23996376991271973, "rewards/rejected": 0.8046360015869141, "step": 3873 }, { "epoch": 2.09, "learning_rate": 2.227143919963248e-08, "logits/chosen": -2.1048290729522705, "logits/rejected": -2.0701520442962646, "logps/chosen": -16.827110290527344, "logps/rejected": -3.201103925704956, "loss": 0.3746, "rewards/accuracies": 1.0, "rewards/chosen": 1.4634546041488647, "rewards/margins": 0.7887489199638367, "rewards/rejected": 0.6747056841850281, "step": 3874 }, { "epoch": 2.09, "learning_rate": 2.2247215605853914e-08, "logits/chosen": -2.0758280754089355, "logits/rejected": -2.3623523712158203, "logps/chosen": -0.6486793756484985, "logps/rejected": -0.7144461870193481, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 1.151037573814392, "rewards/margins": 0.03041815757751465, "rewards/rejected": 1.1206194162368774, "step": 3875 }, { "epoch": 2.09, "learning_rate": 2.2223001422807176e-08, "logits/chosen": -2.1514663696289062, "logits/rejected": -2.1352992057800293, "logps/chosen": -9.631531715393066, "logps/rejected": -3.0821592807769775, "loss": 0.5187, "rewards/accuracies": 1.0, "rewards/chosen": 1.094423770904541, "rewards/margins": 0.3858049511909485, "rewards/rejected": 0.7086188197135925, "step": 3876 }, { "epoch": 2.09, "learning_rate": 2.219879665870303e-08, "logits/chosen": -2.2081146240234375, "logits/rejected": -2.2106990814208984, "logps/chosen": -2.3227264881134033, "logps/rejected": -1.1800552606582642, "loss": 0.5853, "rewards/accuracies": 1.0, "rewards/chosen": 0.9799433946609497, "rewards/margins": 0.22877317667007446, "rewards/rejected": 0.7511702179908752, "step": 3877 }, { "epoch": 2.09, "learning_rate": 2.2174601321749174e-08, "logits/chosen": -2.1792876720428467, "logits/rejected": -2.2664377689361572, "logps/chosen": -1.9479215145111084, "logps/rejected": -1.809980869293213, "loss": 0.705, "rewards/accuracies": 0.0, "rewards/chosen": 0.8511618971824646, "rewards/margins": -0.023589611053466797, "rewards/rejected": 0.8747515082359314, "step": 3878 }, { "epoch": 2.09, "learning_rate": 2.2150415420149997e-08, "logits/chosen": -2.003901720046997, "logits/rejected": -2.004558563232422, "logps/chosen": -3.593712329864502, "logps/rejected": -2.84306001663208, "loss": 0.3851, "rewards/accuracies": 1.0, "rewards/chosen": 1.4568874835968018, "rewards/margins": 0.7556129097938538, "rewards/rejected": 0.701274573802948, "step": 3879 }, { "epoch": 2.09, "learning_rate": 2.2126238962106743e-08, "logits/chosen": -2.1355152130126953, "logits/rejected": -2.274735450744629, "logps/chosen": -0.3927997946739197, "logps/rejected": -0.39878740906715393, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.9076547622680664, "rewards/margins": 0.007211863994598389, "rewards/rejected": 0.900442898273468, "step": 3880 }, { "epoch": 2.09, "learning_rate": 2.2102071955817453e-08, "logits/chosen": -2.0357775688171387, "logits/rejected": -2.295285701751709, "logps/chosen": -0.4844556152820587, "logps/rejected": -0.46629631519317627, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.8082671165466309, "rewards/margins": 0.027881503105163574, "rewards/rejected": 0.7803856134414673, "step": 3881 }, { "epoch": 2.09, "learning_rate": 2.207791440947693e-08, "logits/chosen": -2.0183143615722656, "logits/rejected": -2.2926058769226074, "logps/chosen": -0.32873910665512085, "logps/rejected": -0.36945390701293945, "loss": 0.6987, "rewards/accuracies": 0.0, "rewards/chosen": 0.8753817677497864, "rewards/margins": -0.011011958122253418, "rewards/rejected": 0.8863937258720398, "step": 3882 }, { "epoch": 2.09, "learning_rate": 2.2053766331276798e-08, "logits/chosen": -2.051950454711914, "logits/rejected": -2.284841775894165, "logps/chosen": -0.4261169731616974, "logps/rejected": -0.4375483989715576, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.9888666272163391, "rewards/margins": 0.01965320110321045, "rewards/rejected": 0.9692134261131287, "step": 3883 }, { "epoch": 2.09, "learning_rate": 2.2029627729405488e-08, "logits/chosen": -2.1062655448913574, "logits/rejected": -2.1085681915283203, "logps/chosen": -0.6917897462844849, "logps/rejected": -2.142880916595459, "loss": 0.5529, "rewards/accuracies": 1.0, "rewards/chosen": 0.9396881461143494, "rewards/margins": 0.30349546670913696, "rewards/rejected": 0.6361926794052124, "step": 3884 }, { "epoch": 2.1, "learning_rate": 2.200549861204815e-08, "logits/chosen": -2.060173273086548, "logits/rejected": -2.3446240425109863, "logps/chosen": -0.32968011498451233, "logps/rejected": -1.098038911819458, "loss": 0.6422, "rewards/accuracies": 1.0, "rewards/chosen": 0.820988118648529, "rewards/margins": 0.10453855991363525, "rewards/rejected": 0.7164495587348938, "step": 3885 }, { "epoch": 2.1, "learning_rate": 2.1981378987386788e-08, "logits/chosen": -2.14123797416687, "logits/rejected": -2.1483867168426514, "logps/chosen": -0.8692690134048462, "logps/rejected": -2.114363193511963, "loss": 0.507, "rewards/accuracies": 1.0, "rewards/chosen": 0.9565213322639465, "rewards/margins": 0.4150139093399048, "rewards/rejected": 0.5415074229240417, "step": 3886 }, { "epoch": 2.1, "learning_rate": 2.195726886360016e-08, "logits/chosen": -2.1909444332122803, "logits/rejected": -2.305784225463867, "logps/chosen": -1.4539406299591064, "logps/rejected": -1.4541642665863037, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9792621731758118, "rewards/margins": 0.013685047626495361, "rewards/rejected": 0.9655771255493164, "step": 3887 }, { "epoch": 2.1, "learning_rate": 2.1933168248863815e-08, "logits/chosen": -2.127831220626831, "logits/rejected": -2.3162612915039062, "logps/chosen": -0.8731058835983276, "logps/rejected": -0.8567810654640198, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 1.0057734251022339, "rewards/margins": -0.009552478790283203, "rewards/rejected": 1.015325903892517, "step": 3888 }, { "epoch": 2.1, "learning_rate": 2.190907715135003e-08, "logits/chosen": -2.062696933746338, "logits/rejected": -2.0685510635375977, "logps/chosen": -0.38649237155914307, "logps/rejected": -6.305109024047852, "loss": 0.4202, "rewards/accuracies": 1.0, "rewards/chosen": 0.9839296340942383, "rewards/margins": 0.6495130062103271, "rewards/rejected": 0.33441659808158875, "step": 3889 }, { "epoch": 2.1, "learning_rate": 2.1884995579227916e-08, "logits/chosen": -2.1306240558624268, "logits/rejected": -2.130908489227295, "logps/chosen": -9.131839752197266, "logps/rejected": -1.6244425773620605, "loss": 0.5122, "rewards/accuracies": 1.0, "rewards/chosen": 1.324048638343811, "rewards/margins": 0.4019660949707031, "rewards/rejected": 0.9220825433731079, "step": 3890 }, { "epoch": 2.1, "learning_rate": 2.1860923540663335e-08, "logits/chosen": -2.06874942779541, "logits/rejected": -2.3389835357666016, "logps/chosen": -1.5146491527557373, "logps/rejected": -15.453215599060059, "loss": 0.4604, "rewards/accuracies": 1.0, "rewards/chosen": 1.0044230222702026, "rewards/margins": 0.5367150902748108, "rewards/rejected": 0.46770793199539185, "step": 3891 }, { "epoch": 2.1, "learning_rate": 2.183686104381888e-08, "logits/chosen": -2.0546631813049316, "logits/rejected": -2.063664197921753, "logps/chosen": -5.1420440673828125, "logps/rejected": -3.150393009185791, "loss": 0.4646, "rewards/accuracies": 1.0, "rewards/chosen": 1.0911492109298706, "rewards/margins": 0.5252408385276794, "rewards/rejected": 0.5659083724021912, "step": 3892 }, { "epoch": 2.1, "learning_rate": 2.181280809685396e-08, "logits/chosen": -2.1387884616851807, "logits/rejected": -2.288775682449341, "logps/chosen": -0.4534774422645569, "logps/rejected": -0.4536677598953247, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.868974506855011, "rewards/margins": 0.025358378887176514, "rewards/rejected": 0.8436161279678345, "step": 3893 }, { "epoch": 2.1, "learning_rate": 2.1788764707924728e-08, "logits/chosen": -2.0233466625213623, "logits/rejected": -2.017455577850342, "logps/chosen": -5.324199676513672, "logps/rejected": -2.719757080078125, "loss": 0.4158, "rewards/accuracies": 1.0, "rewards/chosen": 1.3597501516342163, "rewards/margins": 0.6624354720115662, "rewards/rejected": 0.6973146796226501, "step": 3894 }, { "epoch": 2.1, "learning_rate": 2.176473088518407e-08, "logits/chosen": -2.1107664108276367, "logits/rejected": -2.2792880535125732, "logps/chosen": -0.7406829595565796, "logps/rejected": -0.7088531255722046, "loss": 0.7074, "rewards/accuracies": 0.0, "rewards/chosen": 0.9133376479148865, "rewards/margins": -0.02839493751525879, "rewards/rejected": 0.9417325854301453, "step": 3895 }, { "epoch": 2.1, "learning_rate": 2.174070663678166e-08, "logits/chosen": -2.200021505355835, "logits/rejected": -2.1188879013061523, "logps/chosen": -31.63865089416504, "logps/rejected": -2.160137176513672, "loss": 0.2191, "rewards/accuracies": 1.0, "rewards/chosen": 2.1696393489837646, "rewards/margins": 1.4066925048828125, "rewards/rejected": 0.7629469037055969, "step": 3896 }, { "epoch": 2.1, "learning_rate": 2.171669197086391e-08, "logits/chosen": -2.0975894927978516, "logits/rejected": -2.279005289077759, "logps/chosen": -0.4876451790332794, "logps/rejected": -0.535641074180603, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9314090609550476, "rewards/margins": 0.020059823989868164, "rewards/rejected": 0.9113492369651794, "step": 3897 }, { "epoch": 2.1, "learning_rate": 2.1692686895574004e-08, "logits/chosen": -2.031433582305908, "logits/rejected": -2.2703936100006104, "logps/chosen": -0.9055995941162109, "logps/rejected": -0.7623131275177002, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.9921550154685974, "rewards/margins": -0.0022611618041992188, "rewards/rejected": 0.9944161772727966, "step": 3898 }, { "epoch": 2.1, "learning_rate": 2.166869141905186e-08, "logits/chosen": -1.9408369064331055, "logits/rejected": -1.957835078239441, "logps/chosen": -2.9487974643707275, "logps/rejected": -5.639693260192871, "loss": 0.5648, "rewards/accuracies": 1.0, "rewards/chosen": 1.0745917558670044, "rewards/margins": 0.2755277752876282, "rewards/rejected": 0.7990639805793762, "step": 3899 }, { "epoch": 2.1, "learning_rate": 2.164470554943411e-08, "logits/chosen": -2.117938756942749, "logits/rejected": -2.1186506748199463, "logps/chosen": -1.9710824489593506, "logps/rejected": -1.305436611175537, "loss": 0.4843, "rewards/accuracies": 1.0, "rewards/chosen": 1.234130859375, "rewards/margins": 0.47311174869537354, "rewards/rejected": 0.7610191106796265, "step": 3900 }, { "epoch": 2.1, "learning_rate": 2.1620729294854196e-08, "logits/chosen": -2.1384294033050537, "logits/rejected": -2.2678256034851074, "logps/chosen": -0.7034066319465637, "logps/rejected": -0.715959370136261, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9479519128799438, "rewards/margins": 0.00532686710357666, "rewards/rejected": 0.9426250457763672, "step": 3901 }, { "epoch": 2.1, "learning_rate": 2.1596762663442214e-08, "logits/chosen": -2.0445921421051025, "logits/rejected": -2.2076549530029297, "logps/chosen": -0.5223729610443115, "logps/rejected": -0.5673444271087646, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.8504018187522888, "rewards/margins": 0.049604713916778564, "rewards/rejected": 0.8007971048355103, "step": 3902 }, { "epoch": 2.11, "learning_rate": 2.157280566332507e-08, "logits/chosen": -2.063232183456421, "logits/rejected": -2.3189921379089355, "logps/chosen": -3.182851552963257, "logps/rejected": -3.644998788833618, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.759524941444397, "rewards/margins": 0.012987732887268066, "rewards/rejected": 0.7465372085571289, "step": 3903 }, { "epoch": 2.11, "learning_rate": 2.15488583026264e-08, "logits/chosen": -2.1457784175872803, "logits/rejected": -2.14548659324646, "logps/chosen": -0.8058630228042603, "logps/rejected": -1.9070863723754883, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.9218597412109375, "rewards/margins": 0.010594844818115234, "rewards/rejected": 0.9112648963928223, "step": 3904 }, { "epoch": 2.11, "learning_rate": 2.1524920589466504e-08, "logits/chosen": -2.0987279415130615, "logits/rejected": -2.267071008682251, "logps/chosen": -9.277944564819336, "logps/rejected": -0.8910431265830994, "loss": 0.7339, "rewards/accuracies": 0.0, "rewards/chosen": 0.8457414507865906, "rewards/margins": -0.07999253273010254, "rewards/rejected": 0.9257339835166931, "step": 3905 }, { "epoch": 2.11, "learning_rate": 2.150099253196248e-08, "logits/chosen": -2.1377673149108887, "logits/rejected": -2.148552179336548, "logps/chosen": -4.716734886169434, "logps/rejected": -3.4338138103485107, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 1.2829145193099976, "rewards/margins": 0.6622693538665771, "rewards/rejected": 0.6206451654434204, "step": 3906 }, { "epoch": 2.11, "learning_rate": 2.1477074138228125e-08, "logits/chosen": -2.1708879470825195, "logits/rejected": -2.147679090499878, "logps/chosen": -10.035679817199707, "logps/rejected": -7.259925842285156, "loss": 0.2924, "rewards/accuracies": 1.0, "rewards/chosen": 1.6774475574493408, "rewards/margins": 1.0796854496002197, "rewards/rejected": 0.5977621078491211, "step": 3907 }, { "epoch": 2.11, "learning_rate": 2.1453165416373958e-08, "logits/chosen": -1.9664541482925415, "logits/rejected": -1.9646222591400146, "logps/chosen": -0.5566834211349487, "logps/rejected": -2.2680578231811523, "loss": 0.6147, "rewards/accuracies": 1.0, "rewards/chosen": 0.976773738861084, "rewards/margins": 0.16352194547653198, "rewards/rejected": 0.813251793384552, "step": 3908 }, { "epoch": 2.11, "learning_rate": 2.1429266374507248e-08, "logits/chosen": -2.1335251331329346, "logits/rejected": -2.127312421798706, "logps/chosen": -6.031387805938721, "logps/rejected": -5.504191875457764, "loss": 0.376, "rewards/accuracies": 1.0, "rewards/chosen": 1.1842378377914429, "rewards/margins": 0.7841368913650513, "rewards/rejected": 0.4001009464263916, "step": 3909 }, { "epoch": 2.11, "learning_rate": 2.140537702073192e-08, "logits/chosen": -2.0428667068481445, "logits/rejected": -2.054871082305908, "logps/chosen": -3.3294835090637207, "logps/rejected": -2.316714286804199, "loss": 0.4836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0711930990219116, "rewards/margins": 0.4750325679779053, "rewards/rejected": 0.5961605310440063, "step": 3910 }, { "epoch": 2.11, "learning_rate": 2.138149736314867e-08, "logits/chosen": -2.090648651123047, "logits/rejected": -2.087829351425171, "logps/chosen": -2.379625082015991, "logps/rejected": -6.103039264678955, "loss": 0.2726, "rewards/accuracies": 1.0, "rewards/chosen": 1.538219690322876, "rewards/margins": 1.160294771194458, "rewards/rejected": 0.37792497873306274, "step": 3911 }, { "epoch": 2.11, "learning_rate": 2.1357627409854866e-08, "logits/chosen": -2.19466233253479, "logits/rejected": -2.2023062705993652, "logps/chosen": -3.0509378910064697, "logps/rejected": -5.519939422607422, "loss": 0.459, "rewards/accuracies": 1.0, "rewards/chosen": 0.8169007301330566, "rewards/margins": 0.5403097867965698, "rewards/rejected": 0.27659091353416443, "step": 3912 }, { "epoch": 2.11, "learning_rate": 2.1333767168944622e-08, "logits/chosen": -2.1031529903411865, "logits/rejected": -2.270533800125122, "logps/chosen": -0.5912715196609497, "logps/rejected": -5.042032241821289, "loss": 0.7159, "rewards/accuracies": 0.0, "rewards/chosen": 0.7485336661338806, "rewards/margins": -0.04501533508300781, "rewards/rejected": 0.7935490012168884, "step": 3913 }, { "epoch": 2.11, "learning_rate": 2.1309916648508753e-08, "logits/chosen": -2.1918246746063232, "logits/rejected": -2.177375555038452, "logps/chosen": -3.018961191177368, "logps/rejected": -9.38381576538086, "loss": 0.3738, "rewards/accuracies": 1.0, "rewards/chosen": 1.2244257926940918, "rewards/margins": 0.7912604808807373, "rewards/rejected": 0.4331652820110321, "step": 3914 }, { "epoch": 2.11, "learning_rate": 2.1286075856634728e-08, "logits/chosen": -2.1646995544433594, "logits/rejected": -2.320115089416504, "logps/chosen": -1.2947402000427246, "logps/rejected": -1.2460012435913086, "loss": 0.7038, "rewards/accuracies": 0.0, "rewards/chosen": 0.8700651526451111, "rewards/margins": -0.02121943235397339, "rewards/rejected": 0.8912845849990845, "step": 3915 }, { "epoch": 2.11, "learning_rate": 2.126224480140678e-08, "logits/chosen": -2.078885078430176, "logits/rejected": -2.228797674179077, "logps/chosen": -0.18424221873283386, "logps/rejected": -0.1448148488998413, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.6693777441978455, "rewards/margins": 0.022281765937805176, "rewards/rejected": 0.6470959782600403, "step": 3916 }, { "epoch": 2.11, "learning_rate": 2.123842349090582e-08, "logits/chosen": -2.0600500106811523, "logits/rejected": -2.0638678073883057, "logps/chosen": -0.4611225724220276, "logps/rejected": -4.523454189300537, "loss": 0.5223, "rewards/accuracies": 1.0, "rewards/chosen": 0.8406206369400024, "rewards/margins": 0.37691476941108704, "rewards/rejected": 0.4637058675289154, "step": 3917 }, { "epoch": 2.11, "learning_rate": 2.121461193320944e-08, "logits/chosen": -2.1540627479553223, "logits/rejected": -2.2815449237823486, "logps/chosen": -0.25724872946739197, "logps/rejected": -0.2595406472682953, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8608260154724121, "rewards/margins": 0.0076465606689453125, "rewards/rejected": 0.8531794548034668, "step": 3918 }, { "epoch": 2.11, "learning_rate": 2.119081013639196e-08, "logits/chosen": -2.136845827102661, "logits/rejected": -2.1271324157714844, "logps/chosen": -0.4812619090080261, "logps/rejected": -7.158395290374756, "loss": 0.4373, "rewards/accuracies": 1.0, "rewards/chosen": 1.162894606590271, "rewards/margins": 0.6005873084068298, "rewards/rejected": 0.5623072981834412, "step": 3919 }, { "epoch": 2.11, "learning_rate": 2.1167018108524332e-08, "logits/chosen": -2.065293550491333, "logits/rejected": -2.415686845779419, "logps/chosen": -5.983381748199463, "logps/rejected": -18.93053436279297, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.7616199254989624, "rewards/margins": 0.0128440260887146, "rewards/rejected": 0.7487758994102478, "step": 3920 }, { "epoch": 2.11, "learning_rate": 2.1143235857674237e-08, "logits/chosen": -2.0955731868743896, "logits/rejected": -2.0956952571868896, "logps/chosen": -0.6609930396080017, "logps/rejected": -10.000411033630371, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 1.1005972623825073, "rewards/margins": 0.383354127407074, "rewards/rejected": 0.7172431349754333, "step": 3921 }, { "epoch": 2.12, "learning_rate": 2.1119463391906062e-08, "logits/chosen": -2.0902156829833984, "logits/rejected": -2.101187229156494, "logps/chosen": -3.3452956676483154, "logps/rejected": -3.26557993888855, "loss": 0.3026, "rewards/accuracies": 1.0, "rewards/chosen": 1.5214972496032715, "rewards/margins": 1.0404205322265625, "rewards/rejected": 0.481076717376709, "step": 3922 }, { "epoch": 2.12, "learning_rate": 2.1095700719280807e-08, "logits/chosen": -1.9072991609573364, "logits/rejected": -1.9165105819702148, "logps/chosen": -2.7422804832458496, "logps/rejected": -5.339842796325684, "loss": 0.3968, "rewards/accuracies": 1.0, "rewards/chosen": 1.1991764307022095, "rewards/margins": 0.7192338109016418, "rewards/rejected": 0.4799426198005676, "step": 3923 }, { "epoch": 2.12, "learning_rate": 2.1071947847856218e-08, "logits/chosen": -2.1611454486846924, "logits/rejected": -2.1964666843414307, "logps/chosen": -0.6261211037635803, "logps/rejected": -8.080799102783203, "loss": 0.6758, "rewards/accuracies": 1.0, "rewards/chosen": 1.012773871421814, "rewards/margins": 0.03505438566207886, "rewards/rejected": 0.9777194857597351, "step": 3924 }, { "epoch": 2.12, "learning_rate": 2.1048204785686652e-08, "logits/chosen": -2.070460796356201, "logits/rejected": -2.0663692951202393, "logps/chosen": -6.646912574768066, "logps/rejected": -1.1430892944335938, "loss": 0.3404, "rewards/accuracies": 1.0, "rewards/chosen": 1.745748519897461, "rewards/margins": 0.9024415016174316, "rewards/rejected": 0.8433070182800293, "step": 3925 }, { "epoch": 2.12, "learning_rate": 2.1024471540823246e-08, "logits/chosen": -2.082352876663208, "logits/rejected": -2.085172176361084, "logps/chosen": -1.3658164739608765, "logps/rejected": -11.12150764465332, "loss": 0.4367, "rewards/accuracies": 1.0, "rewards/chosen": 1.1063363552093506, "rewards/margins": 0.6023009419441223, "rewards/rejected": 0.5040354132652283, "step": 3926 }, { "epoch": 2.12, "learning_rate": 2.1000748121313676e-08, "logits/chosen": -2.052518367767334, "logits/rejected": -2.046198844909668, "logps/chosen": -5.741815090179443, "logps/rejected": -5.879868984222412, "loss": 0.3863, "rewards/accuracies": 1.0, "rewards/chosen": 1.4026621580123901, "rewards/margins": 0.7519165277481079, "rewards/rejected": 0.6507456302642822, "step": 3927 }, { "epoch": 2.12, "learning_rate": 2.097703453520238e-08, "logits/chosen": -2.05696439743042, "logits/rejected": -2.063488721847534, "logps/chosen": -3.7086181640625, "logps/rejected": -4.039519309997559, "loss": 0.3856, "rewards/accuracies": 1.0, "rewards/chosen": 1.2795517444610596, "rewards/margins": 0.7539012432098389, "rewards/rejected": 0.5256505012512207, "step": 3928 }, { "epoch": 2.12, "learning_rate": 2.0953330790530448e-08, "logits/chosen": -1.9620064496994019, "logits/rejected": -1.9616001844406128, "logps/chosen": -1.285845160484314, "logps/rejected": -0.7703024744987488, "loss": 0.6249, "rewards/accuracies": 1.0, "rewards/chosen": 0.923583447933197, "rewards/margins": 0.14141976833343506, "rewards/rejected": 0.782163679599762, "step": 3929 }, { "epoch": 2.12, "learning_rate": 2.092963689533558e-08, "logits/chosen": -2.1791210174560547, "logits/rejected": -2.1826119422912598, "logps/chosen": -0.4980062246322632, "logps/rejected": -6.263314723968506, "loss": 0.3901, "rewards/accuracies": 1.0, "rewards/chosen": 1.0326131582260132, "rewards/margins": 0.7399332523345947, "rewards/rejected": 0.29267993569374084, "step": 3930 }, { "epoch": 2.12, "learning_rate": 2.0905952857652183e-08, "logits/chosen": -2.111811876296997, "logits/rejected": -2.3053603172302246, "logps/chosen": -4.335715293884277, "logps/rejected": -2.9061598777770996, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.6581683158874512, "rewards/margins": 0.014392197132110596, "rewards/rejected": 0.6437761187553406, "step": 3931 }, { "epoch": 2.12, "learning_rate": 2.088227868551134e-08, "logits/chosen": -2.067457914352417, "logits/rejected": -2.058135747909546, "logps/chosen": -4.140501499176025, "logps/rejected": -2.348914623260498, "loss": 0.3801, "rewards/accuracies": 1.0, "rewards/chosen": 1.6267400979995728, "rewards/margins": 0.7713077664375305, "rewards/rejected": 0.8554323315620422, "step": 3932 }, { "epoch": 2.12, "learning_rate": 2.0858614386940725e-08, "logits/chosen": -2.054999828338623, "logits/rejected": -2.2254638671875, "logps/chosen": -0.7716301083564758, "logps/rejected": -0.7460152506828308, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.7189306616783142, "rewards/margins": 0.01019972562789917, "rewards/rejected": 0.708730936050415, "step": 3933 }, { "epoch": 2.12, "learning_rate": 2.0834959969964726e-08, "logits/chosen": -2.0486626625061035, "logits/rejected": -2.053264617919922, "logps/chosen": -9.970983505249023, "logps/rejected": -4.292319297790527, "loss": 0.6195, "rewards/accuracies": 1.0, "rewards/chosen": 0.9623304605484009, "rewards/margins": 0.1530882716178894, "rewards/rejected": 0.8092421889305115, "step": 3934 }, { "epoch": 2.12, "learning_rate": 2.081131544260431e-08, "logits/chosen": -2.0563619136810303, "logits/rejected": -2.058563709259033, "logps/chosen": -4.787980556488037, "logps/rejected": -3.0911166667938232, "loss": 0.1389, "rewards/accuracies": 1.0, "rewards/chosen": 2.4138400554656982, "rewards/margins": 1.9038581848144531, "rewards/rejected": 0.5099819302558899, "step": 3935 }, { "epoch": 2.12, "learning_rate": 2.0787680812877202e-08, "logits/chosen": -2.0591578483581543, "logits/rejected": -2.062761068344116, "logps/chosen": -1.0840797424316406, "logps/rejected": -5.975158214569092, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0375317335128784, "rewards/margins": 0.6983506679534912, "rewards/rejected": 0.3391810953617096, "step": 3936 }, { "epoch": 2.12, "learning_rate": 2.0764056088797645e-08, "logits/chosen": -2.0253796577453613, "logits/rejected": -2.0265915393829346, "logps/chosen": -2.362016201019287, "logps/rejected": -5.306227684020996, "loss": 0.4541, "rewards/accuracies": 1.0, "rewards/chosen": 1.1181472539901733, "rewards/margins": 0.5537194013595581, "rewards/rejected": 0.5644278526306152, "step": 3937 }, { "epoch": 2.12, "learning_rate": 2.07404412783766e-08, "logits/chosen": -2.181769371032715, "logits/rejected": -2.2938733100891113, "logps/chosen": -6.567105770111084, "logps/rejected": -0.7137468457221985, "loss": 0.7095, "rewards/accuracies": 0.0, "rewards/chosen": 1.0694092512130737, "rewards/margins": -0.032534122467041016, "rewards/rejected": 1.1019433736801147, "step": 3938 }, { "epoch": 2.12, "learning_rate": 2.0716836389621673e-08, "logits/chosen": -2.0522143840789795, "logits/rejected": -2.2631804943084717, "logps/chosen": -1.4799472093582153, "logps/rejected": -1.3591532707214355, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.043384075164795, "rewards/margins": 0.030190706253051758, "rewards/rejected": 1.0131933689117432, "step": 3939 }, { "epoch": 2.13, "learning_rate": 2.0693241430537033e-08, "logits/chosen": -2.1261656284332275, "logits/rejected": -2.1277291774749756, "logps/chosen": -0.2266770899295807, "logps/rejected": -5.492489337921143, "loss": 0.3993, "rewards/accuracies": 1.0, "rewards/chosen": 1.0570604801177979, "rewards/margins": 0.7118204832077026, "rewards/rejected": 0.3452399671077728, "step": 3940 }, { "epoch": 2.13, "learning_rate": 2.0669656409123553e-08, "logits/chosen": -2.1623849868774414, "logits/rejected": -2.1544013023376465, "logps/chosen": -7.773095607757568, "logps/rejected": -0.7836360931396484, "loss": 0.4711, "rewards/accuracies": 1.0, "rewards/chosen": 1.473362922668457, "rewards/margins": 0.5078076124191284, "rewards/rejected": 0.9655553102493286, "step": 3941 }, { "epoch": 2.13, "learning_rate": 2.064608133337873e-08, "logits/chosen": -2.053874969482422, "logits/rejected": -2.267939567565918, "logps/chosen": -1.345189094543457, "logps/rejected": -1.3453377485275269, "loss": 0.7032, "rewards/accuracies": 0.0, "rewards/chosen": 0.8229056596755981, "rewards/margins": -0.02005898952484131, "rewards/rejected": 0.8429646492004395, "step": 3942 }, { "epoch": 2.13, "learning_rate": 2.062251621129663e-08, "logits/chosen": -2.20499324798584, "logits/rejected": -2.2037770748138428, "logps/chosen": -2.747485637664795, "logps/rejected": -5.453686714172363, "loss": 0.3179, "rewards/accuracies": 1.0, "rewards/chosen": 1.3614581823349, "rewards/margins": 0.9828104972839355, "rewards/rejected": 0.37864771485328674, "step": 3943 }, { "epoch": 2.13, "learning_rate": 2.0598961050868023e-08, "logits/chosen": -2.016444444656372, "logits/rejected": -2.246617317199707, "logps/chosen": -1.8344295024871826, "logps/rejected": -0.7401666641235352, "loss": 0.6928, "rewards/accuracies": 1.0, "rewards/chosen": 1.0446783304214478, "rewards/margins": 0.0006793737411499023, "rewards/rejected": 1.0439989566802979, "step": 3944 }, { "epoch": 2.13, "learning_rate": 2.0575415860080213e-08, "logits/chosen": -2.0434374809265137, "logits/rejected": -2.0395164489746094, "logps/chosen": -8.741881370544434, "logps/rejected": -1.8251211643218994, "loss": 0.3637, "rewards/accuracies": 1.0, "rewards/chosen": 1.5088928937911987, "rewards/margins": 0.824069082736969, "rewards/rejected": 0.6848238110542297, "step": 3945 }, { "epoch": 2.13, "learning_rate": 2.0551880646917224e-08, "logits/chosen": -2.1594605445861816, "logits/rejected": -2.1595876216888428, "logps/chosen": -2.7442963123321533, "logps/rejected": -3.9330360889434814, "loss": 0.2648, "rewards/accuracies": 1.0, "rewards/chosen": 1.7148960828781128, "rewards/margins": 1.1935031414031982, "rewards/rejected": 0.5213929414749146, "step": 3946 }, { "epoch": 2.13, "learning_rate": 2.0528355419359637e-08, "logits/chosen": -2.075681686401367, "logits/rejected": -1.9765753746032715, "logps/chosen": -24.394289016723633, "logps/rejected": -2.6605019569396973, "loss": 0.1784, "rewards/accuracies": 1.0, "rewards/chosen": 2.121504545211792, "rewards/margins": 1.6329069137573242, "rewards/rejected": 0.4885976314544678, "step": 3947 }, { "epoch": 2.13, "learning_rate": 2.0504840185384638e-08, "logits/chosen": -2.1792585849761963, "logits/rejected": -2.3686649799346924, "logps/chosen": -9.433297157287598, "logps/rejected": -12.835692405700684, "loss": 0.7605, "rewards/accuracies": 0.0, "rewards/chosen": 1.0665253400802612, "rewards/margins": -0.13051187992095947, "rewards/rejected": 1.1970372200012207, "step": 3948 }, { "epoch": 2.13, "learning_rate": 2.0481334952966055e-08, "logits/chosen": -2.2279183864593506, "logits/rejected": -2.0777158737182617, "logps/chosen": -32.3291130065918, "logps/rejected": -11.207290649414062, "loss": 0.1756, "rewards/accuracies": 1.0, "rewards/chosen": 2.1618752479553223, "rewards/margins": 1.6507534980773926, "rewards/rejected": 0.5111217498779297, "step": 3949 }, { "epoch": 2.13, "learning_rate": 2.045783973007429e-08, "logits/chosen": -2.1339406967163086, "logits/rejected": -2.1424174308776855, "logps/chosen": -4.307408809661865, "logps/rejected": -4.471357822418213, "loss": 0.4066, "rewards/accuracies": 1.0, "rewards/chosen": 1.2385857105255127, "rewards/margins": 0.6898778676986694, "rewards/rejected": 0.5487078428268433, "step": 3950 }, { "epoch": 2.13, "learning_rate": 2.043435452467639e-08, "logits/chosen": -2.1089277267456055, "logits/rejected": -2.085561513900757, "logps/chosen": -16.072189331054688, "logps/rejected": -3.5451548099517822, "loss": 0.2092, "rewards/accuracies": 1.0, "rewards/chosen": 2.2025375366210938, "rewards/margins": 1.4580905437469482, "rewards/rejected": 0.7444469332695007, "step": 3951 }, { "epoch": 2.13, "learning_rate": 2.0410879344735998e-08, "logits/chosen": -2.020965814590454, "logits/rejected": -2.0200722217559814, "logps/chosen": -0.9631083011627197, "logps/rejected": -3.6798620223999023, "loss": 0.5256, "rewards/accuracies": 1.0, "rewards/chosen": 1.1272516250610352, "rewards/margins": 0.3689344525337219, "rewards/rejected": 0.7583171725273132, "step": 3952 }, { "epoch": 2.13, "learning_rate": 2.0387414198213315e-08, "logits/chosen": -2.0563137531280518, "logits/rejected": -2.0567424297332764, "logps/chosen": -0.18188126385211945, "logps/rejected": -4.980774402618408, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 0.99671471118927, "rewards/margins": 0.6192207336425781, "rewards/rejected": 0.3774940073490143, "step": 3953 }, { "epoch": 2.13, "learning_rate": 2.036395909306519e-08, "logits/chosen": -2.0880677700042725, "logits/rejected": -2.2255570888519287, "logps/chosen": -2.576822519302368, "logps/rejected": -2.5708649158477783, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.7071027755737305, "rewards/margins": 0.041504085063934326, "rewards/rejected": 0.6655986905097961, "step": 3954 }, { "epoch": 2.13, "learning_rate": 2.0340514037245055e-08, "logits/chosen": -2.1119582653045654, "logits/rejected": -2.1977362632751465, "logps/chosen": -2.242215156555176, "logps/rejected": -3.202831268310547, "loss": 0.6089, "rewards/accuracies": 1.0, "rewards/chosen": 0.9471192359924316, "rewards/margins": 0.17623001337051392, "rewards/rejected": 0.7708892226219177, "step": 3955 }, { "epoch": 2.13, "learning_rate": 2.0317079038702912e-08, "logits/chosen": -2.142853021621704, "logits/rejected": -2.1373767852783203, "logps/chosen": -2.7190279960632324, "logps/rejected": -4.13971471786499, "loss": 0.5049, "rewards/accuracies": 1.0, "rewards/chosen": 1.067482829093933, "rewards/margins": 0.4202895164489746, "rewards/rejected": 0.6471933126449585, "step": 3956 }, { "epoch": 2.13, "learning_rate": 2.02936541053854e-08, "logits/chosen": -2.2000246047973633, "logits/rejected": -2.2299716472625732, "logps/chosen": -8.568100929260254, "logps/rejected": -9.952400207519531, "loss": 0.6213, "rewards/accuracies": 1.0, "rewards/chosen": 1.2109942436218262, "rewards/margins": 0.1492835283279419, "rewards/rejected": 1.0617107152938843, "step": 3957 }, { "epoch": 2.13, "learning_rate": 2.027023924523567e-08, "logits/chosen": -2.15539813041687, "logits/rejected": -2.2835700511932373, "logps/chosen": -4.556297779083252, "logps/rejected": -27.318374633789062, "loss": 0.349, "rewards/accuracies": 1.0, "rewards/chosen": 1.2658528089523315, "rewards/margins": 0.8732419013977051, "rewards/rejected": 0.39261093735694885, "step": 3958 }, { "epoch": 2.14, "learning_rate": 2.024683446619354e-08, "logits/chosen": -2.076022148132324, "logits/rejected": -2.2621610164642334, "logps/chosen": -0.46200618147850037, "logps/rejected": -0.4964090883731842, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8480094075202942, "rewards/margins": 0.025893986225128174, "rewards/rejected": 0.822115421295166, "step": 3959 }, { "epoch": 2.14, "learning_rate": 2.0223439776195333e-08, "logits/chosen": -2.145275592803955, "logits/rejected": -2.3453211784362793, "logps/chosen": -1.7491214275360107, "logps/rejected": -7.0568647384643555, "loss": 0.6414, "rewards/accuracies": 1.0, "rewards/chosen": 1.2084249258041382, "rewards/margins": 0.10635948181152344, "rewards/rejected": 1.1020654439926147, "step": 3960 }, { "epoch": 2.14, "learning_rate": 2.0200055183174008e-08, "logits/chosen": -2.070162057876587, "logits/rejected": -2.0721664428710938, "logps/chosen": -4.333624362945557, "logps/rejected": -2.0149223804473877, "loss": 0.2842, "rewards/accuracies": 1.0, "rewards/chosen": 1.7359379529953003, "rewards/margins": 1.1127221584320068, "rewards/rejected": 0.6232157945632935, "step": 3961 }, { "epoch": 2.14, "learning_rate": 2.01766806950591e-08, "logits/chosen": -2.042717933654785, "logits/rejected": -2.0458145141601562, "logps/chosen": -4.156932353973389, "logps/rejected": -3.4315404891967773, "loss": 0.3161, "rewards/accuracies": 1.0, "rewards/chosen": 1.616227388381958, "rewards/margins": 0.9895803928375244, "rewards/rejected": 0.6266469955444336, "step": 3962 }, { "epoch": 2.14, "learning_rate": 2.015331631977666e-08, "logits/chosen": -2.1292335987091064, "logits/rejected": -2.128459930419922, "logps/chosen": -0.9894986748695374, "logps/rejected": -1.1180970668792725, "loss": 0.6626, "rewards/accuracies": 1.0, "rewards/chosen": 0.8896905779838562, "rewards/margins": 0.06213480234146118, "rewards/rejected": 0.827555775642395, "step": 3963 }, { "epoch": 2.14, "learning_rate": 2.0129962065249363e-08, "logits/chosen": -2.0833740234375, "logits/rejected": -2.0828959941864014, "logps/chosen": -0.9058595895767212, "logps/rejected": -2.386322498321533, "loss": 0.6135, "rewards/accuracies": 1.0, "rewards/chosen": 1.1127276420593262, "rewards/margins": 0.1662377119064331, "rewards/rejected": 0.9464899301528931, "step": 3964 }, { "epoch": 2.14, "learning_rate": 2.0106617939396437e-08, "logits/chosen": -2.0324478149414062, "logits/rejected": -2.282005786895752, "logps/chosen": -0.4223865866661072, "logps/rejected": -0.4570866823196411, "loss": 0.6661, "rewards/accuracies": 1.0, "rewards/chosen": 0.960540235042572, "rewards/margins": 0.054930031299591064, "rewards/rejected": 0.905610203742981, "step": 3965 }, { "epoch": 2.14, "learning_rate": 2.0083283950133678e-08, "logits/chosen": -1.9745149612426758, "logits/rejected": -1.9723323583602905, "logps/chosen": -2.020660161972046, "logps/rejected": -4.987491607666016, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 1.2460724115371704, "rewards/margins": 0.45869654417037964, "rewards/rejected": 0.7873758673667908, "step": 3966 }, { "epoch": 2.14, "learning_rate": 2.0059960105373462e-08, "logits/chosen": -2.12162709236145, "logits/rejected": -2.291266679763794, "logps/chosen": -1.898079752922058, "logps/rejected": -1.828109622001648, "loss": 0.698, "rewards/accuracies": 0.0, "rewards/chosen": 0.6861589550971985, "rewards/margins": -0.009646236896514893, "rewards/rejected": 0.6958051919937134, "step": 3967 }, { "epoch": 2.14, "learning_rate": 2.0036646413024673e-08, "logits/chosen": -2.1859683990478516, "logits/rejected": -2.3714306354522705, "logps/chosen": -0.39605656266212463, "logps/rejected": -0.40956440567970276, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 1.0473207235336304, "rewards/margins": 0.011406540870666504, "rewards/rejected": 1.0359141826629639, "step": 3968 }, { "epoch": 2.14, "learning_rate": 2.001334288099279e-08, "logits/chosen": -2.066844940185547, "logits/rejected": -2.059722423553467, "logps/chosen": -3.8830559253692627, "logps/rejected": -3.4779739379882812, "loss": 0.5094, "rewards/accuracies": 1.0, "rewards/chosen": 0.9312699437141418, "rewards/margins": 0.4089714288711548, "rewards/rejected": 0.5222985148429871, "step": 3969 }, { "epoch": 2.14, "learning_rate": 1.999004951717987e-08, "logits/chosen": -2.056626081466675, "logits/rejected": -2.300088405609131, "logps/chosen": -1.083456039428711, "logps/rejected": -1.3243341445922852, "loss": 0.7266, "rewards/accuracies": 0.0, "rewards/chosen": 0.87652188539505, "rewards/margins": -0.0658334493637085, "rewards/rejected": 0.9423553347587585, "step": 3970 }, { "epoch": 2.14, "learning_rate": 1.9966766329484452e-08, "logits/chosen": -2.0832974910736084, "logits/rejected": -2.0965802669525146, "logps/chosen": -5.380639553070068, "logps/rejected": -5.562567710876465, "loss": 0.3133, "rewards/accuracies": 1.0, "rewards/chosen": 1.850512146949768, "rewards/margins": 0.9996964335441589, "rewards/rejected": 0.8508157134056091, "step": 3971 }, { "epoch": 2.14, "learning_rate": 1.99434933258017e-08, "logits/chosen": -2.0783932209014893, "logits/rejected": -2.094388246536255, "logps/chosen": -1.4115378856658936, "logps/rejected": -3.305560350418091, "loss": 0.3941, "rewards/accuracies": 1.0, "rewards/chosen": 1.5450736284255981, "rewards/margins": 0.7275843024253845, "rewards/rejected": 0.8174893260002136, "step": 3972 }, { "epoch": 2.14, "learning_rate": 1.992023051402326e-08, "logits/chosen": -2.040839433670044, "logits/rejected": -2.085970401763916, "logps/chosen": -6.51035737991333, "logps/rejected": -17.929407119750977, "loss": 0.5405, "rewards/accuracies": 1.0, "rewards/chosen": 1.0456796884536743, "rewards/margins": 0.3329545855522156, "rewards/rejected": 0.7127251029014587, "step": 3973 }, { "epoch": 2.14, "learning_rate": 1.9896977902037376e-08, "logits/chosen": -1.9814165830612183, "logits/rejected": -1.9804503917694092, "logps/chosen": -2.899643898010254, "logps/rejected": -3.8907222747802734, "loss": 0.3726, "rewards/accuracies": 1.0, "rewards/chosen": 1.3709555864334106, "rewards/margins": 0.7950670123100281, "rewards/rejected": 0.5758885741233826, "step": 3974 }, { "epoch": 2.14, "learning_rate": 1.9873735497728794e-08, "logits/chosen": -2.1149847507476807, "logits/rejected": -2.3628687858581543, "logps/chosen": -0.8724490404129028, "logps/rejected": -1.003223180770874, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8025245070457458, "rewards/margins": 0.015509605407714844, "rewards/rejected": 0.787014901638031, "step": 3975 }, { "epoch": 2.14, "learning_rate": 1.9850503308978826e-08, "logits/chosen": -2.1129724979400635, "logits/rejected": -2.109954833984375, "logps/chosen": -8.425445556640625, "logps/rejected": -3.354419231414795, "loss": 0.4396, "rewards/accuracies": 1.0, "rewards/chosen": 1.1827404499053955, "rewards/margins": 0.5940490365028381, "rewards/rejected": 0.5886914134025574, "step": 3976 }, { "epoch": 2.15, "learning_rate": 1.9827281343665324e-08, "logits/chosen": -2.0464751720428467, "logits/rejected": -2.2672269344329834, "logps/chosen": -0.38584449887275696, "logps/rejected": -0.38311678171157837, "loss": 0.689, "rewards/accuracies": 1.0, "rewards/chosen": 0.8733356595039368, "rewards/margins": 0.008357346057891846, "rewards/rejected": 0.8649783134460449, "step": 3977 }, { "epoch": 2.15, "learning_rate": 1.980406960966262e-08, "logits/chosen": -2.0797770023345947, "logits/rejected": -2.0842108726501465, "logps/chosen": -1.4447603225708008, "logps/rejected": -2.7726330757141113, "loss": 0.4389, "rewards/accuracies": 1.0, "rewards/chosen": 1.1453367471694946, "rewards/margins": 0.5959223508834839, "rewards/rejected": 0.5494143962860107, "step": 3978 }, { "epoch": 2.15, "learning_rate": 1.978086811484163e-08, "logits/chosen": -2.157604217529297, "logits/rejected": -2.1876044273376465, "logps/chosen": -4.381824970245361, "logps/rejected": -20.701601028442383, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": 1.7034662961959839, "rewards/margins": 1.515444278717041, "rewards/rejected": 0.18802204728126526, "step": 3979 }, { "epoch": 2.15, "learning_rate": 1.9757676867069813e-08, "logits/chosen": -2.0902957916259766, "logits/rejected": -2.264697313308716, "logps/chosen": -0.8724752068519592, "logps/rejected": -0.962802529335022, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.6575412750244141, "rewards/margins": 0.015862345695495605, "rewards/rejected": 0.6416789293289185, "step": 3980 }, { "epoch": 2.15, "learning_rate": 1.973449587421107e-08, "logits/chosen": -2.0383453369140625, "logits/rejected": -2.036344528198242, "logps/chosen": -6.380021095275879, "logps/rejected": -5.087818622589111, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 1.2666453123092651, "rewards/margins": 0.8819352388381958, "rewards/rejected": 0.38471007347106934, "step": 3981 }, { "epoch": 2.15, "learning_rate": 1.9711325144125923e-08, "logits/chosen": -1.9634437561035156, "logits/rejected": -1.9523308277130127, "logps/chosen": -0.7287784218788147, "logps/rejected": -4.028126239776611, "loss": 0.4998, "rewards/accuracies": 1.0, "rewards/chosen": 1.1493370532989502, "rewards/margins": 0.4331638216972351, "rewards/rejected": 0.7161732316017151, "step": 3982 }, { "epoch": 2.15, "learning_rate": 1.9688164684671333e-08, "logits/chosen": -2.002490758895874, "logits/rejected": -2.006631851196289, "logps/chosen": -3.709707260131836, "logps/rejected": -1.4719798564910889, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 1.2980045080184937, "rewards/margins": 0.09434854984283447, "rewards/rejected": 1.2036559581756592, "step": 3983 }, { "epoch": 2.15, "learning_rate": 1.9665014503700827e-08, "logits/chosen": -2.128383159637451, "logits/rejected": -2.2775816917419434, "logps/chosen": -0.5100070238113403, "logps/rejected": -0.5220491886138916, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.9724087119102478, "rewards/margins": 0.01214689016342163, "rewards/rejected": 0.9602618217468262, "step": 3984 }, { "epoch": 2.15, "learning_rate": 1.964187460906444e-08, "logits/chosen": -2.0615506172180176, "logits/rejected": -2.0594754219055176, "logps/chosen": -2.2888875007629395, "logps/rejected": -3.417738437652588, "loss": 0.5531, "rewards/accuracies": 1.0, "rewards/chosen": 0.9404714703559875, "rewards/margins": 0.30288422107696533, "rewards/rejected": 0.6375872492790222, "step": 3985 }, { "epoch": 2.15, "learning_rate": 1.9618745008608707e-08, "logits/chosen": -2.004363536834717, "logits/rejected": -2.2695281505584717, "logps/chosen": -4.588382244110107, "logps/rejected": -4.497840404510498, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 0.714807391166687, "rewards/margins": -0.011818408966064453, "rewards/rejected": 0.7266258001327515, "step": 3986 }, { "epoch": 2.15, "learning_rate": 1.9595625710176694e-08, "logits/chosen": -2.141159772872925, "logits/rejected": -2.019812822341919, "logps/chosen": -22.580684661865234, "logps/rejected": -2.4698476791381836, "loss": 0.3028, "rewards/accuracies": 1.0, "rewards/chosen": 1.6892082691192627, "rewards/margins": 1.0396291017532349, "rewards/rejected": 0.6495791673660278, "step": 3987 }, { "epoch": 2.15, "learning_rate": 1.9572516721607922e-08, "logits/chosen": -2.1428136825561523, "logits/rejected": -2.203631639480591, "logps/chosen": -6.902024269104004, "logps/rejected": -17.035572052001953, "loss": 0.5403, "rewards/accuracies": 1.0, "rewards/chosen": 1.5133166313171387, "rewards/margins": 0.3332350254058838, "rewards/rejected": 1.1800816059112549, "step": 3988 }, { "epoch": 2.15, "learning_rate": 1.9549418050738474e-08, "logits/chosen": -2.0126421451568604, "logits/rejected": -2.0228724479675293, "logps/chosen": -1.6618298292160034, "logps/rejected": -1.7714916467666626, "loss": 0.4805, "rewards/accuracies": 1.0, "rewards/chosen": 1.1723655462265015, "rewards/margins": 0.48309916257858276, "rewards/rejected": 0.6892663836479187, "step": 3989 }, { "epoch": 2.15, "learning_rate": 1.952632970540093e-08, "logits/chosen": -2.1918704509735107, "logits/rejected": -2.1998109817504883, "logps/chosen": -1.4822039604187012, "logps/rejected": -1.7749583721160889, "loss": 0.483, "rewards/accuracies": 1.0, "rewards/chosen": 1.0584347248077393, "rewards/margins": 0.47647279500961304, "rewards/rejected": 0.5819619297981262, "step": 3990 }, { "epoch": 2.15, "learning_rate": 1.9503251693424306e-08, "logits/chosen": -2.1885828971862793, "logits/rejected": -2.339064598083496, "logps/chosen": -0.3248722553253174, "logps/rejected": -0.3682333528995514, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.8324027061462402, "rewards/margins": 0.016421139240264893, "rewards/rejected": 0.8159815669059753, "step": 3991 }, { "epoch": 2.15, "learning_rate": 1.9480184022634188e-08, "logits/chosen": -2.0587992668151855, "logits/rejected": -2.3133294582366943, "logps/chosen": -0.3618474304676056, "logps/rejected": -0.46529608964920044, "loss": 0.6998, "rewards/accuracies": 0.0, "rewards/chosen": 0.7859995365142822, "rewards/margins": -0.013346493244171143, "rewards/rejected": 0.7993460297584534, "step": 3992 }, { "epoch": 2.15, "learning_rate": 1.9457126700852634e-08, "logits/chosen": -2.0703494548797607, "logits/rejected": -2.0732696056365967, "logps/chosen": -0.4995163679122925, "logps/rejected": -13.000353813171387, "loss": 0.4754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0411925315856934, "rewards/margins": 0.49649864435195923, "rewards/rejected": 0.5446938872337341, "step": 3993 }, { "epoch": 2.15, "learning_rate": 1.9434079735898134e-08, "logits/chosen": -2.0422468185424805, "logits/rejected": -2.0417683124542236, "logps/chosen": -0.32617276906967163, "logps/rejected": -4.043999671936035, "loss": 0.4867, "rewards/accuracies": 1.0, "rewards/chosen": 1.0695627927780151, "rewards/margins": 0.46692997217178345, "rewards/rejected": 0.6026328206062317, "step": 3994 }, { "epoch": 2.15, "learning_rate": 1.9411043135585793e-08, "logits/chosen": -2.0390260219573975, "logits/rejected": -2.2341737747192383, "logps/chosen": -4.825159549713135, "logps/rejected": -0.8117250204086304, "loss": 0.7754, "rewards/accuracies": 0.0, "rewards/chosen": 0.7310473918914795, "rewards/margins": -0.1581963300704956, "rewards/rejected": 0.8892437219619751, "step": 3995 }, { "epoch": 2.16, "learning_rate": 1.9388016907727063e-08, "logits/chosen": -2.1666338443756104, "logits/rejected": -2.3234307765960693, "logps/chosen": -0.4449542164802551, "logps/rejected": -0.4278731644153595, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9736412167549133, "rewards/margins": 0.010251343250274658, "rewards/rejected": 0.9633898735046387, "step": 3996 }, { "epoch": 2.16, "learning_rate": 1.936500106012998e-08, "logits/chosen": -2.0690805912017822, "logits/rejected": -2.2570810317993164, "logps/chosen": -0.4019961953163147, "logps/rejected": -0.3880044221878052, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.8687320947647095, "rewards/margins": 0.021588146686553955, "rewards/rejected": 0.8471439480781555, "step": 3997 }, { "epoch": 2.16, "learning_rate": 1.9341995600598992e-08, "logits/chosen": -2.0566344261169434, "logits/rejected": -2.220323324203491, "logps/chosen": -1.7156792879104614, "logps/rejected": -1.9331634044647217, "loss": 0.6884, "rewards/accuracies": 1.0, "rewards/chosen": 1.0664386749267578, "rewards/margins": 0.009434342384338379, "rewards/rejected": 1.0570043325424194, "step": 3998 }, { "epoch": 2.16, "learning_rate": 1.9319000536935065e-08, "logits/chosen": -1.9690873622894287, "logits/rejected": -2.2072596549987793, "logps/chosen": -0.7611147165298462, "logps/rejected": -0.8260717391967773, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.769040048122406, "rewards/margins": 0.013363301753997803, "rewards/rejected": 0.7556767463684082, "step": 3999 }, { "epoch": 2.16, "learning_rate": 1.9296015876935647e-08, "logits/chosen": -2.0839340686798096, "logits/rejected": -2.231823682785034, "logps/chosen": -3.2308430671691895, "logps/rejected": -7.147982597351074, "loss": 0.7477, "rewards/accuracies": 0.0, "rewards/chosen": 1.0401585102081299, "rewards/margins": -0.10631036758422852, "rewards/rejected": 1.1464688777923584, "step": 4000 }, { "epoch": 2.16, "learning_rate": 1.92730416283946e-08, "logits/chosen": -2.077195644378662, "logits/rejected": -2.265387773513794, "logps/chosen": -1.2071613073349, "logps/rejected": -1.260344386100769, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 0.7812788486480713, "rewards/margins": 0.00816875696182251, "rewards/rejected": 0.7731100916862488, "step": 4001 }, { "epoch": 2.16, "learning_rate": 1.9250077799102322e-08, "logits/chosen": -2.0456464290618896, "logits/rejected": -2.291940689086914, "logps/chosen": -0.5940651893615723, "logps/rejected": -0.5813226699829102, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.832127571105957, "rewards/margins": 0.013388633728027344, "rewards/rejected": 0.8187389373779297, "step": 4002 }, { "epoch": 2.16, "learning_rate": 1.9227124396845663e-08, "logits/chosen": -2.128243923187256, "logits/rejected": -2.1297922134399414, "logps/chosen": -0.6295434236526489, "logps/rejected": -4.2069091796875, "loss": 0.4461, "rewards/accuracies": 1.0, "rewards/chosen": 1.0882865190505981, "rewards/margins": 0.575890064239502, "rewards/rejected": 0.5123964548110962, "step": 4003 }, { "epoch": 2.16, "learning_rate": 1.9204181429407868e-08, "logits/chosen": -2.136911630630493, "logits/rejected": -2.134434461593628, "logps/chosen": -8.646717071533203, "logps/rejected": -3.0171847343444824, "loss": 0.3788, "rewards/accuracies": 1.0, "rewards/chosen": 1.4125274419784546, "rewards/margins": 0.7752482295036316, "rewards/rejected": 0.637279212474823, "step": 4004 }, { "epoch": 2.16, "learning_rate": 1.918124890456878e-08, "logits/chosen": -2.0477278232574463, "logits/rejected": -2.0416505336761475, "logps/chosen": -4.625769138336182, "logps/rejected": -3.885737419128418, "loss": 0.282, "rewards/accuracies": 1.0, "rewards/chosen": 1.6029561758041382, "rewards/margins": 1.121492862701416, "rewards/rejected": 0.48146334290504456, "step": 4005 }, { "epoch": 2.16, "learning_rate": 1.9158326830104567e-08, "logits/chosen": -2.0592122077941895, "logits/rejected": -2.057560682296753, "logps/chosen": -0.5258919596672058, "logps/rejected": -1.86843740940094, "loss": 0.5599, "rewards/accuracies": 1.0, "rewards/chosen": 0.9961497187614441, "rewards/margins": 0.2871078848838806, "rewards/rejected": 0.7090418338775635, "step": 4006 }, { "epoch": 2.16, "learning_rate": 1.9135415213787942e-08, "logits/chosen": -1.9504879713058472, "logits/rejected": -1.9438599348068237, "logps/chosen": -3.148343801498413, "logps/rejected": -6.00425386428833, "loss": 0.5086, "rewards/accuracies": 1.0, "rewards/chosen": 0.8148823976516724, "rewards/margins": 0.4110707938671112, "rewards/rejected": 0.40381160378456116, "step": 4007 }, { "epoch": 2.16, "learning_rate": 1.9112514063388003e-08, "logits/chosen": -2.202519178390503, "logits/rejected": -2.232410430908203, "logps/chosen": -0.7200618386268616, "logps/rejected": -9.439533233642578, "loss": 0.5242, "rewards/accuracies": 1.0, "rewards/chosen": 0.9331860542297363, "rewards/margins": 0.37224262952804565, "rewards/rejected": 0.5609434247016907, "step": 4008 }, { "epoch": 2.16, "learning_rate": 1.908962338667035e-08, "logits/chosen": -2.07704496383667, "logits/rejected": -2.067439079284668, "logps/chosen": -5.259127616882324, "logps/rejected": -0.7836642265319824, "loss": 0.4807, "rewards/accuracies": 1.0, "rewards/chosen": 1.4973740577697754, "rewards/margins": 0.48252737522125244, "rewards/rejected": 1.014846682548523, "step": 4009 }, { "epoch": 2.16, "learning_rate": 1.906674319139704e-08, "logits/chosen": -2.2023069858551025, "logits/rejected": -2.202556848526001, "logps/chosen": -3.9012370109558105, "logps/rejected": -4.5640153884887695, "loss": 0.4744, "rewards/accuracies": 1.0, "rewards/chosen": 1.4744919538497925, "rewards/margins": 0.49910157918930054, "rewards/rejected": 0.9753903746604919, "step": 4010 }, { "epoch": 2.16, "learning_rate": 1.904387348532651e-08, "logits/chosen": -2.111682653427124, "logits/rejected": -2.2615697383880615, "logps/chosen": -0.2449188530445099, "logps/rejected": -0.2340318262577057, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.9038175940513611, "rewards/margins": 0.004043519496917725, "rewards/rejected": 0.8997740745544434, "step": 4011 }, { "epoch": 2.16, "learning_rate": 1.9021014276213705e-08, "logits/chosen": -2.0788397789001465, "logits/rejected": -2.0698297023773193, "logps/chosen": -1.6987026929855347, "logps/rejected": -11.79429817199707, "loss": 0.2912, "rewards/accuracies": 1.0, "rewards/chosen": 1.0924795866012573, "rewards/margins": 1.084514856338501, "rewards/rejected": 0.007964706979691982, "step": 4012 }, { "epoch": 2.16, "learning_rate": 1.8998165571809975e-08, "logits/chosen": -2.004511833190918, "logits/rejected": -2.2711222171783447, "logps/chosen": -1.9606702327728271, "logps/rejected": -2.052381992340088, "loss": 0.6752, "rewards/accuracies": 1.0, "rewards/chosen": 0.6148236393928528, "rewards/margins": 0.0362277626991272, "rewards/rejected": 0.5785958766937256, "step": 4013 }, { "epoch": 2.17, "learning_rate": 1.8975327379863136e-08, "logits/chosen": -2.0176615715026855, "logits/rejected": -2.1029467582702637, "logps/chosen": -2.075589656829834, "logps/rejected": -20.209962844848633, "loss": 0.6648, "rewards/accuracies": 1.0, "rewards/chosen": 0.8668276071548462, "rewards/margins": 0.05760425329208374, "rewards/rejected": 0.8092233538627625, "step": 4014 }, { "epoch": 2.17, "learning_rate": 1.895249970811743e-08, "logits/chosen": -2.2564733028411865, "logits/rejected": -2.408773422241211, "logps/chosen": -0.617149829864502, "logps/rejected": -0.667105495929718, "loss": 0.6922, "rewards/accuracies": 1.0, "rewards/chosen": 0.8543886542320251, "rewards/margins": 0.0018500685691833496, "rewards/rejected": 0.8525385856628418, "step": 4015 }, { "epoch": 2.17, "learning_rate": 1.8929682564313492e-08, "logits/chosen": -2.0613791942596436, "logits/rejected": -2.064535140991211, "logps/chosen": -1.636572003364563, "logps/rejected": -2.378722667694092, "loss": 0.5592, "rewards/accuracies": 1.0, "rewards/chosen": 1.1872217655181885, "rewards/margins": 0.2886677384376526, "rewards/rejected": 0.8985540270805359, "step": 4016 }, { "epoch": 2.17, "learning_rate": 1.8906875956188444e-08, "logits/chosen": -2.0619170665740967, "logits/rejected": -2.0630526542663574, "logps/chosen": -0.2004539668560028, "logps/rejected": -5.632686614990234, "loss": 0.4566, "rewards/accuracies": 1.0, "rewards/chosen": 0.9315099120140076, "rewards/margins": 0.5469965934753418, "rewards/rejected": 0.3845132887363434, "step": 4017 }, { "epoch": 2.17, "learning_rate": 1.8884079891475824e-08, "logits/chosen": -2.155714511871338, "logits/rejected": -2.155033826828003, "logps/chosen": -2.8495006561279297, "logps/rejected": -3.2980639934539795, "loss": 0.432, "rewards/accuracies": 1.0, "rewards/chosen": 1.4347814321517944, "rewards/margins": 0.6156749129295349, "rewards/rejected": 0.8191065192222595, "step": 4018 }, { "epoch": 2.17, "learning_rate": 1.886129437790555e-08, "logits/chosen": -2.04565167427063, "logits/rejected": -2.290816307067871, "logps/chosen": -0.5050563216209412, "logps/rejected": -0.6073439121246338, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9459201693534851, "rewards/margins": 0.013257205486297607, "rewards/rejected": 0.9326629638671875, "step": 4019 }, { "epoch": 2.17, "learning_rate": 1.8838519423204024e-08, "logits/chosen": -2.1332550048828125, "logits/rejected": -2.251193046569824, "logps/chosen": -0.24747584760189056, "logps/rejected": -0.29478326439857483, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.8125126957893372, "rewards/margins": -0.006006836891174316, "rewards/rejected": 0.8185195326805115, "step": 4020 }, { "epoch": 2.17, "learning_rate": 1.881575503509401e-08, "logits/chosen": -2.0611412525177, "logits/rejected": -2.0634491443634033, "logps/chosen": -0.6153446435928345, "logps/rejected": -4.948273658752441, "loss": 0.476, "rewards/accuracies": 1.0, "rewards/chosen": 0.9552656412124634, "rewards/margins": 0.4950183928012848, "rewards/rejected": 0.4602472484111786, "step": 4021 }, { "epoch": 2.17, "learning_rate": 1.8793001221294735e-08, "logits/chosen": -2.0869622230529785, "logits/rejected": -2.24834942817688, "logps/chosen": -1.864708662033081, "logps/rejected": -1.701986312866211, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 1.0727524757385254, "rewards/margins": 0.003183603286743164, "rewards/rejected": 1.0695688724517822, "step": 4022 }, { "epoch": 2.17, "learning_rate": 1.877025798952182e-08, "logits/chosen": -2.138695240020752, "logits/rejected": -2.1471245288848877, "logps/chosen": -2.135807514190674, "logps/rejected": -3.3826723098754883, "loss": 0.4344, "rewards/accuracies": 1.0, "rewards/chosen": 1.3017253875732422, "rewards/margins": 0.6088393330574036, "rewards/rejected": 0.6928860545158386, "step": 4023 }, { "epoch": 2.17, "learning_rate": 1.8747525347487297e-08, "logits/chosen": -1.9856849908828735, "logits/rejected": -2.2891733646392822, "logps/chosen": -0.5589039921760559, "logps/rejected": -0.6231970191001892, "loss": 0.6669, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207149505615234, "rewards/margins": 0.053121984004974365, "rewards/rejected": 0.7675929665565491, "step": 4024 }, { "epoch": 2.17, "learning_rate": 1.8724803302899634e-08, "logits/chosen": -2.0836615562438965, "logits/rejected": -2.2942869663238525, "logps/chosen": -0.5110681056976318, "logps/rejected": -0.462088942527771, "loss": 0.7039, "rewards/accuracies": 0.0, "rewards/chosen": 0.8798084259033203, "rewards/margins": -0.02145862579345703, "rewards/rejected": 0.9012670516967773, "step": 4025 }, { "epoch": 2.17, "learning_rate": 1.8702091863463643e-08, "logits/chosen": -2.144505262374878, "logits/rejected": -2.1442008018493652, "logps/chosen": -4.466371059417725, "logps/rejected": -2.434669017791748, "loss": 0.2829, "rewards/accuracies": 1.0, "rewards/chosen": 1.6581882238388062, "rewards/margins": 1.1179004907608032, "rewards/rejected": 0.5402877330780029, "step": 4026 }, { "epoch": 2.17, "learning_rate": 1.86793910368806e-08, "logits/chosen": -1.991688847541809, "logits/rejected": -1.9897029399871826, "logps/chosen": -0.5595685243606567, "logps/rejected": -8.402379035949707, "loss": 0.5664, "rewards/accuracies": 1.0, "rewards/chosen": 0.9248042106628418, "rewards/margins": 0.27195072174072266, "rewards/rejected": 0.6528534889221191, "step": 4027 }, { "epoch": 2.17, "learning_rate": 1.8656700830848172e-08, "logits/chosen": -2.0818822383880615, "logits/rejected": -2.0812857151031494, "logps/chosen": -4.833366870880127, "logps/rejected": -5.326034069061279, "loss": 0.5117, "rewards/accuracies": 1.0, "rewards/chosen": 1.048216462135315, "rewards/margins": 0.40337759256362915, "rewards/rejected": 0.6448388695716858, "step": 4028 }, { "epoch": 2.17, "learning_rate": 1.8634021253060384e-08, "logits/chosen": -2.1261746883392334, "logits/rejected": -2.1130385398864746, "logps/chosen": -32.81599426269531, "logps/rejected": -27.66149139404297, "loss": 0.2419, "rewards/accuracies": 1.0, "rewards/chosen": 1.9925224781036377, "rewards/margins": 1.2960402965545654, "rewards/rejected": 0.6964821219444275, "step": 4029 }, { "epoch": 2.17, "learning_rate": 1.8611352311207723e-08, "logits/chosen": -2.033808708190918, "logits/rejected": -2.309950351715088, "logps/chosen": -0.6879528760910034, "logps/rejected": -0.6820331811904907, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.886603057384491, "rewards/margins": 0.029507815837860107, "rewards/rejected": 0.8570952415466309, "step": 4030 }, { "epoch": 2.17, "learning_rate": 1.8588694012976995e-08, "logits/chosen": -2.010932683944702, "logits/rejected": -2.047100305557251, "logps/chosen": -8.711282730102539, "logps/rejected": -16.065021514892578, "loss": 0.4674, "rewards/accuracies": 1.0, "rewards/chosen": 1.1556535959243774, "rewards/margins": 0.5178079605102539, "rewards/rejected": 0.6378456354141235, "step": 4031 }, { "epoch": 2.17, "learning_rate": 1.856604636605146e-08, "logits/chosen": -1.9381005764007568, "logits/rejected": -2.267906904220581, "logps/chosen": -0.256619930267334, "logps/rejected": -0.27474886178970337, "loss": 0.6938, "rewards/accuracies": 0.0, "rewards/chosen": 0.9794178009033203, "rewards/margins": -0.0012737512588500977, "rewards/rejected": 0.9806915521621704, "step": 4032 }, { "epoch": 2.18, "learning_rate": 1.8543409378110724e-08, "logits/chosen": -2.032625913619995, "logits/rejected": -2.026423215866089, "logps/chosen": -11.68143367767334, "logps/rejected": -5.462480068206787, "loss": 0.2087, "rewards/accuracies": 1.0, "rewards/chosen": 2.154432773590088, "rewards/margins": 1.460451364517212, "rewards/rejected": 0.693981409072876, "step": 4033 }, { "epoch": 2.18, "learning_rate": 1.8520783056830813e-08, "logits/chosen": -2.098001718521118, "logits/rejected": -2.094998836517334, "logps/chosen": -0.9916883111000061, "logps/rejected": -3.688220739364624, "loss": 0.4514, "rewards/accuracies": 1.0, "rewards/chosen": 1.136489987373352, "rewards/margins": 0.5612322688102722, "rewards/rejected": 0.5752577185630798, "step": 4034 }, { "epoch": 2.18, "learning_rate": 1.8498167409884135e-08, "logits/chosen": -2.076319694519043, "logits/rejected": -2.313535451889038, "logps/chosen": -5.635550498962402, "logps/rejected": -1.2452890872955322, "loss": 0.7072, "rewards/accuracies": 0.0, "rewards/chosen": 0.6858140230178833, "rewards/margins": -0.02798163890838623, "rewards/rejected": 0.7137956619262695, "step": 4035 }, { "epoch": 2.18, "learning_rate": 1.8475562444939418e-08, "logits/chosen": -2.0359973907470703, "logits/rejected": -2.2976818084716797, "logps/chosen": -1.2277077436447144, "logps/rejected": -1.344980001449585, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.2326067686080933, "rewards/margins": 0.025893092155456543, "rewards/rejected": 1.2067136764526367, "step": 4036 }, { "epoch": 2.18, "learning_rate": 1.8452968169661837e-08, "logits/chosen": -2.070479393005371, "logits/rejected": -2.2882978916168213, "logps/chosen": -0.2587891221046448, "logps/rejected": -0.29930850863456726, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.7997972369194031, "rewards/margins": 0.007257699966430664, "rewards/rejected": 0.7925395369529724, "step": 4037 }, { "epoch": 2.18, "learning_rate": 1.8430384591712934e-08, "logits/chosen": -2.100635051727295, "logits/rejected": -2.111161947250366, "logps/chosen": -5.461299896240234, "logps/rejected": -3.554133892059326, "loss": 0.2948, "rewards/accuracies": 1.0, "rewards/chosen": 1.659817099571228, "rewards/margins": 1.0705687999725342, "rewards/rejected": 0.5892482399940491, "step": 4038 }, { "epoch": 2.18, "learning_rate": 1.8407811718750566e-08, "logits/chosen": -2.105870485305786, "logits/rejected": -2.303988456726074, "logps/chosen": -1.3558964729309082, "logps/rejected": -1.3945226669311523, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 1.0757633447647095, "rewards/margins": 0.017884016036987305, "rewards/rejected": 1.0578793287277222, "step": 4039 }, { "epoch": 2.18, "learning_rate": 1.8385249558429028e-08, "logits/chosen": -2.2997515201568604, "logits/rejected": -2.014940023422241, "logps/chosen": -66.82029724121094, "logps/rejected": -12.778353691101074, "loss": 0.0747, "rewards/accuracies": 1.0, "rewards/chosen": 3.2930405139923096, "rewards/margins": 2.5567164421081543, "rewards/rejected": 0.7363240122795105, "step": 4040 }, { "epoch": 2.18, "learning_rate": 1.8362698118398967e-08, "logits/chosen": -2.0560219287872314, "logits/rejected": -2.3135018348693848, "logps/chosen": -0.26706641912460327, "logps/rejected": -0.269546777009964, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.9785379767417908, "rewards/margins": 0.03951835632324219, "rewards/rejected": 0.9390196204185486, "step": 4041 }, { "epoch": 2.18, "learning_rate": 1.834015740630735e-08, "logits/chosen": -2.0368192195892334, "logits/rejected": -2.305516242980957, "logps/chosen": -0.29151320457458496, "logps/rejected": -0.2809751629829407, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.8800848126411438, "rewards/margins": -0.010510921478271484, "rewards/rejected": 0.8905957341194153, "step": 4042 }, { "epoch": 2.18, "learning_rate": 1.831762742979756e-08, "logits/chosen": -2.0501914024353027, "logits/rejected": -2.0586981773376465, "logps/chosen": -3.638173818588257, "logps/rejected": -3.3335609436035156, "loss": 0.4243, "rewards/accuracies": 1.0, "rewards/chosen": 1.2233854532241821, "rewards/margins": 0.6377585530281067, "rewards/rejected": 0.5856269001960754, "step": 4043 }, { "epoch": 2.18, "learning_rate": 1.8295108196509312e-08, "logits/chosen": -2.250689744949341, "logits/rejected": -2.334033489227295, "logps/chosen": -2.181215286254883, "logps/rejected": -2.349381923675537, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.780963122844696, "rewards/margins": 0.012781262397766113, "rewards/rejected": 0.7681818604469299, "step": 4044 }, { "epoch": 2.18, "learning_rate": 1.8272599714078714e-08, "logits/chosen": -2.043802499771118, "logits/rejected": -2.2568254470825195, "logps/chosen": -0.31698372960090637, "logps/rejected": -5.006994724273682, "loss": 0.5458, "rewards/accuracies": 1.0, "rewards/chosen": 0.9464542269706726, "rewards/margins": 0.3202199339866638, "rewards/rejected": 0.6262342929840088, "step": 4045 }, { "epoch": 2.18, "learning_rate": 1.8250101990138155e-08, "logits/chosen": -1.977301001548767, "logits/rejected": -2.2646520137786865, "logps/chosen": -1.180908203125, "logps/rejected": -1.1130083799362183, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.8021125197410583, "rewards/margins": 0.028332650661468506, "rewards/rejected": 0.7737798690795898, "step": 4046 }, { "epoch": 2.18, "learning_rate": 1.822761503231644e-08, "logits/chosen": -2.2059056758880615, "logits/rejected": -2.212895631790161, "logps/chosen": -1.3170160055160522, "logps/rejected": -2.769216537475586, "loss": 0.4471, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473389863967896, "rewards/margins": 0.5732178688049316, "rewards/rejected": 0.5741211175918579, "step": 4047 }, { "epoch": 2.18, "learning_rate": 1.8205138848238726e-08, "logits/chosen": -2.1326065063476562, "logits/rejected": -2.025783061981201, "logps/chosen": -23.864154815673828, "logps/rejected": -3.1807548999786377, "loss": 0.1745, "rewards/accuracies": 1.0, "rewards/chosen": 2.2498722076416016, "rewards/margins": 1.6576130390167236, "rewards/rejected": 0.5922591090202332, "step": 4048 }, { "epoch": 2.18, "learning_rate": 1.818267344552646e-08, "logits/chosen": -2.051100015640259, "logits/rejected": -2.2457919120788574, "logps/chosen": -3.6555464267730713, "logps/rejected": -3.3774263858795166, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.4491907060146332, "rewards/margins": 0.006211280822753906, "rewards/rejected": 0.4429794251918793, "step": 4049 }, { "epoch": 2.18, "learning_rate": 1.816021883179748e-08, "logits/chosen": -2.0571210384368896, "logits/rejected": -2.208425283432007, "logps/chosen": -1.2547320127487183, "logps/rejected": -1.2373316287994385, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.7697783708572388, "rewards/margins": 0.02513277530670166, "rewards/rejected": 0.7446455955505371, "step": 4050 }, { "epoch": 2.19, "learning_rate": 1.813777501466597e-08, "logits/chosen": -2.1039087772369385, "logits/rejected": -2.1229825019836426, "logps/chosen": -1.8106975555419922, "logps/rejected": -3.5386300086975098, "loss": 0.542, "rewards/accuracies": 1.0, "rewards/chosen": 1.0788344144821167, "rewards/margins": 0.3293144106864929, "rewards/rejected": 0.7495200037956238, "step": 4051 }, { "epoch": 2.19, "learning_rate": 1.811534200174239e-08, "logits/chosen": -2.0552706718444824, "logits/rejected": -2.2931790351867676, "logps/chosen": -0.9868231415748596, "logps/rejected": -0.922387957572937, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8977982401847839, "rewards/margins": 0.02576512098312378, "rewards/rejected": 0.8720331192016602, "step": 4052 }, { "epoch": 2.19, "learning_rate": 1.809291980063365e-08, "logits/chosen": -2.180445432662964, "logits/rejected": -2.3137924671173096, "logps/chosen": -3.7827749252319336, "logps/rejected": -10.638591766357422, "loss": 0.5106, "rewards/accuracies": 1.0, "rewards/chosen": 1.0021659135818481, "rewards/margins": 0.40602368116378784, "rewards/rejected": 0.5961422324180603, "step": 4053 }, { "epoch": 2.19, "learning_rate": 1.8070508418942875e-08, "logits/chosen": -2.0649209022521973, "logits/rejected": -2.113025426864624, "logps/chosen": -2.9562151432037354, "logps/rejected": -18.505725860595703, "loss": 0.1986, "rewards/accuracies": 1.0, "rewards/chosen": 1.511087417602539, "rewards/margins": 1.5153567790985107, "rewards/rejected": -0.004269409459084272, "step": 4054 }, { "epoch": 2.19, "learning_rate": 1.8048107864269606e-08, "logits/chosen": -2.2495901584625244, "logits/rejected": -2.1365315914154053, "logps/chosen": -26.756637573242188, "logps/rejected": -6.285457611083984, "loss": 0.2653, "rewards/accuracies": 1.0, "rewards/chosen": 1.5423396825790405, "rewards/margins": 1.1911457777023315, "rewards/rejected": 0.351193904876709, "step": 4055 }, { "epoch": 2.19, "learning_rate": 1.8025718144209644e-08, "logits/chosen": -2.116960287094116, "logits/rejected": -2.3776936531066895, "logps/chosen": -0.15079136192798615, "logps/rejected": -0.15338371694087982, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.7751522064208984, "rewards/margins": 0.013311445713043213, "rewards/rejected": 0.7618407607078552, "step": 4056 }, { "epoch": 2.19, "learning_rate": 1.8003339266355173e-08, "logits/chosen": -2.2371087074279785, "logits/rejected": -2.1579360961914062, "logps/chosen": -20.962814331054688, "logps/rejected": -6.383792400360107, "loss": 0.3857, "rewards/accuracies": 1.0, "rewards/chosen": 1.535565972328186, "rewards/margins": 0.753715455532074, "rewards/rejected": 0.7818505167961121, "step": 4057 }, { "epoch": 2.19, "learning_rate": 1.7980971238294695e-08, "logits/chosen": -2.0604610443115234, "logits/rejected": -2.061699390411377, "logps/chosen": -2.9127612113952637, "logps/rejected": -4.46000862121582, "loss": 0.3161, "rewards/accuracies": 1.0, "rewards/chosen": 1.539516806602478, "rewards/margins": 0.9893074631690979, "rewards/rejected": 0.5502093434333801, "step": 4058 }, { "epoch": 2.19, "learning_rate": 1.795861406761298e-08, "logits/chosen": -2.105438470840454, "logits/rejected": -2.052311897277832, "logps/chosen": -27.937143325805664, "logps/rejected": -4.152135372161865, "loss": 0.2613, "rewards/accuracies": 1.0, "rewards/chosen": 1.781020998954773, "rewards/margins": 1.2087485790252686, "rewards/rejected": 0.5722724795341492, "step": 4059 }, { "epoch": 2.19, "learning_rate": 1.7936267761891172e-08, "logits/chosen": -2.287966012954712, "logits/rejected": -2.2958340644836426, "logps/chosen": -2.7269015312194824, "logps/rejected": -4.686534404754639, "loss": 0.3383, "rewards/accuracies": 1.0, "rewards/chosen": 1.522814154624939, "rewards/margins": 0.909953236579895, "rewards/rejected": 0.612860918045044, "step": 4060 }, { "epoch": 2.19, "learning_rate": 1.791393232870673e-08, "logits/chosen": -2.097848653793335, "logits/rejected": -2.1040148735046387, "logps/chosen": -3.6656436920166016, "logps/rejected": -4.664096355438232, "loss": 0.4306, "rewards/accuracies": 1.0, "rewards/chosen": 1.3110452890396118, "rewards/margins": 0.6194471120834351, "rewards/rejected": 0.6915981769561768, "step": 4061 }, { "epoch": 2.19, "learning_rate": 1.7891607775633357e-08, "logits/chosen": -2.028027057647705, "logits/rejected": -2.2519567012786865, "logps/chosen": -1.7153631448745728, "logps/rejected": -1.5957907438278198, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.9665650725364685, "rewards/margins": 0.027744531631469727, "rewards/rejected": 0.9388205409049988, "step": 4062 }, { "epoch": 2.19, "learning_rate": 1.7869294110241183e-08, "logits/chosen": -2.1156115531921387, "logits/rejected": -2.308603286743164, "logps/chosen": -0.8075246810913086, "logps/rejected": -0.7595406174659729, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.8161917924880981, "rewards/margins": 0.004504799842834473, "rewards/rejected": 0.8116869926452637, "step": 4063 }, { "epoch": 2.19, "learning_rate": 1.7846991340096534e-08, "logits/chosen": -2.0586740970611572, "logits/rejected": -2.27596116065979, "logps/chosen": -6.128503799438477, "logps/rejected": -0.7244309186935425, "loss": 0.7547, "rewards/accuracies": 0.0, "rewards/chosen": 0.8308119773864746, "rewards/margins": -0.1195746660232544, "rewards/rejected": 0.950386643409729, "step": 4064 }, { "epoch": 2.19, "learning_rate": 1.7824699472762105e-08, "logits/chosen": -2.0485310554504395, "logits/rejected": -2.0470798015594482, "logps/chosen": -7.017002105712891, "logps/rejected": -2.514486312866211, "loss": 0.3948, "rewards/accuracies": 1.0, "rewards/chosen": 1.450116753578186, "rewards/margins": 0.7256399989128113, "rewards/rejected": 0.7244767546653748, "step": 4065 }, { "epoch": 2.19, "learning_rate": 1.7802418515796892e-08, "logits/chosen": -2.1947641372680664, "logits/rejected": -2.2020082473754883, "logps/chosen": -1.092291235923767, "logps/rejected": -2.8222999572753906, "loss": 0.4864, "rewards/accuracies": 1.0, "rewards/chosen": 0.9558748602867126, "rewards/margins": 0.46762534976005554, "rewards/rejected": 0.4882495105266571, "step": 4066 }, { "epoch": 2.19, "learning_rate": 1.7780148476756146e-08, "logits/chosen": -2.131211042404175, "logits/rejected": -2.131542444229126, "logps/chosen": -2.3519160747528076, "logps/rejected": -3.139594793319702, "loss": 0.5276, "rewards/accuracies": 1.0, "rewards/chosen": 0.9908079504966736, "rewards/margins": 0.3641092777252197, "rewards/rejected": 0.6266986727714539, "step": 4067 }, { "epoch": 2.19, "learning_rate": 1.775788936319148e-08, "logits/chosen": -2.1910791397094727, "logits/rejected": -2.1892709732055664, "logps/chosen": -2.193843126296997, "logps/rejected": -3.5487213134765625, "loss": 0.5145, "rewards/accuracies": 1.0, "rewards/chosen": 1.1106140613555908, "rewards/margins": 0.3963433504104614, "rewards/rejected": 0.7142707109451294, "step": 4068 }, { "epoch": 2.19, "learning_rate": 1.7735641182650736e-08, "logits/chosen": -2.1528406143188477, "logits/rejected": -2.2592263221740723, "logps/chosen": -0.541003942489624, "logps/rejected": -2.7401041984558105, "loss": 0.6472, "rewards/accuracies": 1.0, "rewards/chosen": 1.135115623474121, "rewards/margins": 0.09421026706695557, "rewards/rejected": 1.0409053564071655, "step": 4069 }, { "epoch": 2.2, "learning_rate": 1.7713403942678095e-08, "logits/chosen": -2.076258897781372, "logits/rejected": -2.101778030395508, "logps/chosen": -24.254024505615234, "logps/rejected": -8.732839584350586, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 1.7346524000167847, "rewards/margins": 0.8906374573707581, "rewards/rejected": 0.8440149426460266, "step": 4070 }, { "epoch": 2.2, "learning_rate": 1.7691177650814032e-08, "logits/chosen": -2.1103310585021973, "logits/rejected": -2.181478977203369, "logps/chosen": -0.35209423303604126, "logps/rejected": -29.987977981567383, "loss": 0.212, "rewards/accuracies": 1.0, "rewards/chosen": 0.9649810791015625, "rewards/margins": 1.4431400299072266, "rewards/rejected": -0.47815895080566406, "step": 4071 }, { "epoch": 2.2, "learning_rate": 1.766896231459525e-08, "logits/chosen": -2.0231409072875977, "logits/rejected": -2.013916492462158, "logps/chosen": -4.297948837280273, "logps/rejected": -5.05722713470459, "loss": 0.4146, "rewards/accuracies": 1.0, "rewards/chosen": 1.281518816947937, "rewards/margins": 0.6660857796669006, "rewards/rejected": 0.6154330372810364, "step": 4072 }, { "epoch": 2.2, "learning_rate": 1.7646757941554834e-08, "logits/chosen": -2.144120931625366, "logits/rejected": -2.1443750858306885, "logps/chosen": -2.061460494995117, "logps/rejected": -3.812864303588867, "loss": 0.2818, "rewards/accuracies": 1.0, "rewards/chosen": 1.5802984237670898, "rewards/margins": 1.1223419904708862, "rewards/rejected": 0.4579564034938812, "step": 4073 }, { "epoch": 2.2, "learning_rate": 1.7624564539222052e-08, "logits/chosen": -2.062217950820923, "logits/rejected": -2.2427356243133545, "logps/chosen": -1.9148004055023193, "logps/rejected": -4.585831642150879, "loss": 0.6379, "rewards/accuracies": 1.0, "rewards/chosen": 1.0285251140594482, "rewards/margins": 0.11368852853775024, "rewards/rejected": 0.914836585521698, "step": 4074 }, { "epoch": 2.2, "learning_rate": 1.760238211512252e-08, "logits/chosen": -1.9461913108825684, "logits/rejected": -1.9393153190612793, "logps/chosen": -4.088471412658691, "logps/rejected": -4.336517810821533, "loss": 0.2634, "rewards/accuracies": 1.0, "rewards/chosen": 1.5733370780944824, "rewards/margins": 1.1994441747665405, "rewards/rejected": 0.3738929331302643, "step": 4075 }, { "epoch": 2.2, "learning_rate": 1.7580210676778117e-08, "logits/chosen": -2.0354037284851074, "logits/rejected": -2.265453815460205, "logps/chosen": -0.4770229458808899, "logps/rejected": -0.5097017288208008, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 1.0147150754928589, "rewards/margins": 0.004478931427001953, "rewards/rejected": 1.010236144065857, "step": 4076 }, { "epoch": 2.2, "learning_rate": 1.755805023170696e-08, "logits/chosen": -2.1032981872558594, "logits/rejected": -2.115182399749756, "logps/chosen": -3.102565288543701, "logps/rejected": -6.160183429718018, "loss": 0.6956, "rewards/accuracies": 0.0, "rewards/chosen": 0.9836494326591492, "rewards/margins": -0.00496828556060791, "rewards/rejected": 0.9886177182197571, "step": 4077 }, { "epoch": 2.2, "learning_rate": 1.7535900787423497e-08, "logits/chosen": -2.2120273113250732, "logits/rejected": -2.0811121463775635, "logps/chosen": -32.468894958496094, "logps/rejected": -3.7876548767089844, "loss": 0.2299, "rewards/accuracies": 1.0, "rewards/chosen": 2.0303642749786377, "rewards/margins": 1.3528481721878052, "rewards/rejected": 0.6775161027908325, "step": 4078 }, { "epoch": 2.2, "learning_rate": 1.751376235143838e-08, "logits/chosen": -2.0070786476135254, "logits/rejected": -2.0084869861602783, "logps/chosen": -0.7168920636177063, "logps/rejected": -3.4661154747009277, "loss": 0.5327, "rewards/accuracies": 1.0, "rewards/chosen": 0.9849122166633606, "rewards/margins": 0.3516061305999756, "rewards/rejected": 0.633306086063385, "step": 4079 }, { "epoch": 2.2, "learning_rate": 1.7491634931258586e-08, "logits/chosen": -1.9788755178451538, "logits/rejected": -2.3063735961914062, "logps/chosen": -0.3901180326938629, "logps/rejected": -0.37917807698249817, "loss": 0.6982, "rewards/accuracies": 0.0, "rewards/chosen": 0.7780086994171143, "rewards/margins": -0.010132014751434326, "rewards/rejected": 0.7881407141685486, "step": 4080 }, { "epoch": 2.2, "learning_rate": 1.7469518534387345e-08, "logits/chosen": -2.116870880126953, "logits/rejected": -2.3209447860717773, "logps/chosen": -0.4075940251350403, "logps/rejected": -0.41956284642219543, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9789901971817017, "rewards/margins": 0.014262616634368896, "rewards/rejected": 0.9647275805473328, "step": 4081 }, { "epoch": 2.2, "learning_rate": 1.7447413168324088e-08, "logits/chosen": -2.0134570598602295, "logits/rejected": -2.021381378173828, "logps/chosen": -1.3773845434188843, "logps/rejected": -2.8600916862487793, "loss": 0.3959, "rewards/accuracies": 1.0, "rewards/chosen": 1.3313759565353394, "rewards/margins": 0.7219982147216797, "rewards/rejected": 0.6093777418136597, "step": 4082 }, { "epoch": 2.2, "learning_rate": 1.7425318840564624e-08, "logits/chosen": -1.9896084070205688, "logits/rejected": -2.014806032180786, "logps/chosen": -6.71864128112793, "logps/rejected": -24.384260177612305, "loss": 0.6563, "rewards/accuracies": 1.0, "rewards/chosen": 1.2457205057144165, "rewards/margins": 0.07505989074707031, "rewards/rejected": 1.1706606149673462, "step": 4083 }, { "epoch": 2.2, "learning_rate": 1.7403235558600903e-08, "logits/chosen": -2.0999937057495117, "logits/rejected": -2.1051278114318848, "logps/chosen": -0.45800063014030457, "logps/rejected": -15.61053466796875, "loss": 0.3733, "rewards/accuracies": 1.0, "rewards/chosen": 0.9548358917236328, "rewards/margins": 0.7928584814071655, "rewards/rejected": 0.1619773954153061, "step": 4084 }, { "epoch": 2.2, "learning_rate": 1.738116332992119e-08, "logits/chosen": -2.062195301055908, "logits/rejected": -2.061129331588745, "logps/chosen": -4.625893592834473, "logps/rejected": -6.866405487060547, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 1.3560036420822144, "rewards/margins": 0.4475209712982178, "rewards/rejected": 0.9084826707839966, "step": 4085 }, { "epoch": 2.2, "learning_rate": 1.7359102162010003e-08, "logits/chosen": -2.022827625274658, "logits/rejected": -2.32643723487854, "logps/chosen": -1.2479697465896606, "logps/rejected": -0.4448259770870209, "loss": 0.7518, "rewards/accuracies": 0.0, "rewards/chosen": 0.9603599905967712, "rewards/margins": -0.1141158938407898, "rewards/rejected": 1.074475884437561, "step": 4086 }, { "epoch": 2.2, "learning_rate": 1.733705206234807e-08, "logits/chosen": -2.0404036045074463, "logits/rejected": -2.268181562423706, "logps/chosen": -0.5489312410354614, "logps/rejected": -0.6232466101646423, "loss": 0.7046, "rewards/accuracies": 0.0, "rewards/chosen": 0.8147558569908142, "rewards/margins": -0.02275228500366211, "rewards/rejected": 0.8375081419944763, "step": 4087 }, { "epoch": 2.2, "learning_rate": 1.7315013038412402e-08, "logits/chosen": -2.2385201454162598, "logits/rejected": -2.037844181060791, "logps/chosen": -38.773738861083984, "logps/rejected": -4.155465602874756, "loss": 0.112, "rewards/accuracies": 1.0, "rewards/chosen": 2.641251802444458, "rewards/margins": 2.1324219703674316, "rewards/rejected": 0.5088297724723816, "step": 4088 }, { "epoch": 2.21, "learning_rate": 1.729298509767626e-08, "logits/chosen": -2.1095738410949707, "logits/rejected": -2.253965139389038, "logps/chosen": -3.685720205307007, "logps/rejected": -3.4991934299468994, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.9854874014854431, "rewards/margins": -0.004371047019958496, "rewards/rejected": 0.9898584485054016, "step": 4089 }, { "epoch": 2.21, "learning_rate": 1.72709682476091e-08, "logits/chosen": -2.0083937644958496, "logits/rejected": -1.997645378112793, "logps/chosen": -5.4817609786987305, "logps/rejected": -5.021461009979248, "loss": 0.4495, "rewards/accuracies": 1.0, "rewards/chosen": 1.130075216293335, "rewards/margins": 0.5664310455322266, "rewards/rejected": 0.5636441707611084, "step": 4090 }, { "epoch": 2.21, "learning_rate": 1.7248962495676682e-08, "logits/chosen": -2.1798384189605713, "logits/rejected": -2.1811442375183105, "logps/chosen": -4.383293151855469, "logps/rejected": -3.340034008026123, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": 1.4758309125900269, "rewards/margins": 0.9000571370124817, "rewards/rejected": 0.5757737755775452, "step": 4091 }, { "epoch": 2.21, "learning_rate": 1.7226967849340912e-08, "logits/chosen": -2.0133309364318848, "logits/rejected": -2.3083293437957764, "logps/chosen": -0.2660635709762573, "logps/rejected": -0.29577627778053284, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9663793444633484, "rewards/margins": 0.00538325309753418, "rewards/rejected": 0.9609960913658142, "step": 4092 }, { "epoch": 2.21, "learning_rate": 1.7204984316060063e-08, "logits/chosen": -2.203401803970337, "logits/rejected": -2.182469606399536, "logps/chosen": -15.866231918334961, "logps/rejected": -1.4396981000900269, "loss": 0.3785, "rewards/accuracies": 1.0, "rewards/chosen": 1.4732916355133057, "rewards/margins": 0.7764599919319153, "rewards/rejected": 0.6968316435813904, "step": 4093 }, { "epoch": 2.21, "learning_rate": 1.71830119032885e-08, "logits/chosen": -2.033318519592285, "logits/rejected": -2.025270700454712, "logps/chosen": -2.7156777381896973, "logps/rejected": -5.659750938415527, "loss": 0.3605, "rewards/accuracies": 1.0, "rewards/chosen": 1.2347077131271362, "rewards/margins": 0.8345646858215332, "rewards/rejected": 0.4001430571079254, "step": 4094 }, { "epoch": 2.21, "learning_rate": 1.716105061847691e-08, "logits/chosen": -2.0689496994018555, "logits/rejected": -2.199225902557373, "logps/chosen": -3.758441925048828, "logps/rejected": -2.9882044792175293, "loss": 0.7487, "rewards/accuracies": 0.0, "rewards/chosen": 0.8681150674819946, "rewards/margins": -0.10810810327529907, "rewards/rejected": 0.9762231707572937, "step": 4095 }, { "epoch": 2.21, "learning_rate": 1.7139100469072183e-08, "logits/chosen": -2.1224260330200195, "logits/rejected": -2.1196844577789307, "logps/chosen": -6.177104473114014, "logps/rejected": -4.53305196762085, "loss": 0.4371, "rewards/accuracies": 1.0, "rewards/chosen": 1.5619449615478516, "rewards/margins": 0.6012209057807922, "rewards/rejected": 0.9607240557670593, "step": 4096 }, { "epoch": 2.21, "learning_rate": 1.71171614625174e-08, "logits/chosen": -2.103928565979004, "logits/rejected": -2.1183559894561768, "logps/chosen": -4.77361536026001, "logps/rejected": -3.5677168369293213, "loss": 0.5457, "rewards/accuracies": 1.0, "rewards/chosen": 1.1210211515426636, "rewards/margins": 0.3203844428062439, "rewards/rejected": 0.8006367087364197, "step": 4097 }, { "epoch": 2.21, "learning_rate": 1.70952336062519e-08, "logits/chosen": -2.0077149868011475, "logits/rejected": -2.2363083362579346, "logps/chosen": -0.8520179390907288, "logps/rejected": -0.9276124238967896, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.905439019203186, "rewards/margins": 0.022666573524475098, "rewards/rejected": 0.8827724456787109, "step": 4098 }, { "epoch": 2.21, "learning_rate": 1.707331690771126e-08, "logits/chosen": -2.054572582244873, "logits/rejected": -2.2558233737945557, "logps/chosen": -0.5060056447982788, "logps/rejected": -1.6895623207092285, "loss": 0.6663, "rewards/accuracies": 1.0, "rewards/chosen": 0.9411379098892212, "rewards/margins": 0.05448383092880249, "rewards/rejected": 0.8866540789604187, "step": 4099 }, { "epoch": 2.21, "learning_rate": 1.7051411374327206e-08, "logits/chosen": -2.0047988891601562, "logits/rejected": -2.01145601272583, "logps/chosen": -1.472673773765564, "logps/rejected": -5.15144681930542, "loss": 0.4037, "rewards/accuracies": 1.0, "rewards/chosen": 1.0651111602783203, "rewards/margins": 0.698477029800415, "rewards/rejected": 0.3666341304779053, "step": 4100 }, { "epoch": 2.21, "learning_rate": 1.7029517013527727e-08, "logits/chosen": -2.0366220474243164, "logits/rejected": -2.0407042503356934, "logps/chosen": -0.3250420391559601, "logps/rejected": -3.6424834728240967, "loss": 0.4795, "rewards/accuracies": 1.0, "rewards/chosen": 0.903232991695404, "rewards/margins": 0.4857931435108185, "rewards/rejected": 0.41743984818458557, "step": 4101 }, { "epoch": 2.21, "learning_rate": 1.7007633832737024e-08, "logits/chosen": -2.15842342376709, "logits/rejected": -2.275312900543213, "logps/chosen": -0.5782058835029602, "logps/rejected": -0.5731973052024841, "loss": 0.6915, "rewards/accuracies": 1.0, "rewards/chosen": 0.9054485559463501, "rewards/margins": 0.0032225847244262695, "rewards/rejected": 0.9022259712219238, "step": 4102 }, { "epoch": 2.21, "learning_rate": 1.698576183937549e-08, "logits/chosen": -1.959878921508789, "logits/rejected": -1.9721369743347168, "logps/chosen": -1.8644442558288574, "logps/rejected": -3.8473241329193115, "loss": 0.5235, "rewards/accuracies": 1.0, "rewards/chosen": 0.8456161618232727, "rewards/margins": 0.37400123476982117, "rewards/rejected": 0.47161492705345154, "step": 4103 }, { "epoch": 2.21, "learning_rate": 1.6963901040859745e-08, "logits/chosen": -2.105515718460083, "logits/rejected": -2.2741541862487793, "logps/chosen": -1.9585673809051514, "logps/rejected": -0.583623468875885, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 1.046410322189331, "rewards/margins": 0.01327526569366455, "rewards/rejected": 1.0331350564956665, "step": 4104 }, { "epoch": 2.21, "learning_rate": 1.6942051444602578e-08, "logits/chosen": -2.127042055130005, "logits/rejected": -2.277388334274292, "logps/chosen": -1.2302069664001465, "logps/rejected": -1.2128517627716064, "loss": 0.6971, "rewards/accuracies": 0.0, "rewards/chosen": 0.8221079111099243, "rewards/margins": -0.007821619510650635, "rewards/rejected": 0.829929530620575, "step": 4105 }, { "epoch": 2.21, "learning_rate": 1.6920213058013022e-08, "logits/chosen": -2.1989586353302, "logits/rejected": -2.3563365936279297, "logps/chosen": -2.131941556930542, "logps/rejected": -2.0133461952209473, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.7592658400535583, "rewards/margins": 0.010105013847351074, "rewards/rejected": 0.7491608262062073, "step": 4106 }, { "epoch": 2.22, "learning_rate": 1.6898385888496253e-08, "logits/chosen": -2.1234021186828613, "logits/rejected": -2.1291372776031494, "logps/chosen": -2.6719703674316406, "logps/rejected": -4.735708236694336, "loss": 0.4572, "rewards/accuracies": 1.0, "rewards/chosen": 1.0027658939361572, "rewards/margins": 0.5453447699546814, "rewards/rejected": 0.45742112398147583, "step": 4107 }, { "epoch": 2.22, "learning_rate": 1.68765699434537e-08, "logits/chosen": -2.0256083011627197, "logits/rejected": -2.033438205718994, "logps/chosen": -0.9038000106811523, "logps/rejected": -4.873156547546387, "loss": 0.4977, "rewards/accuracies": 1.0, "rewards/chosen": 1.053325891494751, "rewards/margins": 0.4385979175567627, "rewards/rejected": 0.6147279739379883, "step": 4108 }, { "epoch": 2.22, "learning_rate": 1.685476523028298e-08, "logits/chosen": -2.1186363697052, "logits/rejected": -2.2857871055603027, "logps/chosen": -0.19659073650836945, "logps/rejected": -0.2344963103532791, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.8404003381729126, "rewards/margins": 0.02563410997390747, "rewards/rejected": 0.8147662281990051, "step": 4109 }, { "epoch": 2.22, "learning_rate": 1.6832971756377846e-08, "logits/chosen": -1.9940311908721924, "logits/rejected": -1.9910038709640503, "logps/chosen": -3.0055880546569824, "logps/rejected": -1.6362214088439941, "loss": 0.6115, "rewards/accuracies": 1.0, "rewards/chosen": 1.2273505926132202, "rewards/margins": 0.17048466205596924, "rewards/rejected": 1.056865930557251, "step": 4110 }, { "epoch": 2.22, "learning_rate": 1.6811189529128302e-08, "logits/chosen": -1.9793401956558228, "logits/rejected": -1.98576021194458, "logps/chosen": -2.993126392364502, "logps/rejected": -4.618696689605713, "loss": 0.3744, "rewards/accuracies": 1.0, "rewards/chosen": 1.4167540073394775, "rewards/margins": 0.7894829511642456, "rewards/rejected": 0.6272710561752319, "step": 4111 }, { "epoch": 2.22, "learning_rate": 1.6789418555920508e-08, "logits/chosen": -2.134308338165283, "logits/rejected": -2.13087797164917, "logps/chosen": -4.490147590637207, "logps/rejected": -5.195706367492676, "loss": 0.331, "rewards/accuracies": 1.0, "rewards/chosen": 1.2859374284744263, "rewards/margins": 0.9356206655502319, "rewards/rejected": 0.35031673312187195, "step": 4112 }, { "epoch": 2.22, "learning_rate": 1.6767658844136824e-08, "logits/chosen": -2.0550365447998047, "logits/rejected": -2.066141366958618, "logps/chosen": -6.180751323699951, "logps/rejected": -2.356565475463867, "loss": 0.4644, "rewards/accuracies": 1.0, "rewards/chosen": 1.2812751531600952, "rewards/margins": 0.5257171988487244, "rewards/rejected": 0.7555579543113708, "step": 4113 }, { "epoch": 2.22, "learning_rate": 1.6745910401155787e-08, "logits/chosen": -2.0369632244110107, "logits/rejected": -2.044140100479126, "logps/chosen": -3.842641830444336, "logps/rejected": -5.433387756347656, "loss": 0.42, "rewards/accuracies": 1.0, "rewards/chosen": 0.9992502331733704, "rewards/margins": 0.6500546932220459, "rewards/rejected": 0.34919556975364685, "step": 4114 }, { "epoch": 2.22, "learning_rate": 1.6724173234352083e-08, "logits/chosen": -2.0467655658721924, "logits/rejected": -2.342379093170166, "logps/chosen": -0.44579267501831055, "logps/rejected": -0.3709956705570221, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.9270647168159485, "rewards/margins": 0.03549814224243164, "rewards/rejected": 0.8915665745735168, "step": 4115 }, { "epoch": 2.22, "learning_rate": 1.670244735109664e-08, "logits/chosen": -2.0515553951263428, "logits/rejected": -2.0576744079589844, "logps/chosen": -1.6140270233154297, "logps/rejected": -3.385892629623413, "loss": 0.4751, "rewards/accuracies": 1.0, "rewards/chosen": 1.0843533277511597, "rewards/margins": 0.49716705083847046, "rewards/rejected": 0.5871862769126892, "step": 4116 }, { "epoch": 2.22, "learning_rate": 1.6680732758756467e-08, "logits/chosen": -1.9978207349777222, "logits/rejected": -2.3316457271575928, "logps/chosen": -1.2735819816589355, "logps/rejected": -4.279637336730957, "loss": 0.6218, "rewards/accuracies": 1.0, "rewards/chosen": 0.9944437146186829, "rewards/margins": 0.14813882112503052, "rewards/rejected": 0.8463048934936523, "step": 4117 }, { "epoch": 2.22, "learning_rate": 1.6659029464694825e-08, "logits/chosen": -1.9779695272445679, "logits/rejected": -1.9721928834915161, "logps/chosen": -6.3902764320373535, "logps/rejected": -2.981008529663086, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 1.572260856628418, "rewards/margins": 0.999043345451355, "rewards/rejected": 0.573217511177063, "step": 4118 }, { "epoch": 2.22, "learning_rate": 1.6637337476271125e-08, "logits/chosen": -2.162057876586914, "logits/rejected": -2.167051076889038, "logps/chosen": -2.7608108520507812, "logps/rejected": -9.368206024169922, "loss": 0.4163, "rewards/accuracies": 1.0, "rewards/chosen": 1.4083855152130127, "rewards/margins": 0.6610618829727173, "rewards/rejected": 0.7473236322402954, "step": 4119 }, { "epoch": 2.22, "learning_rate": 1.661565680084091e-08, "logits/chosen": -2.03371262550354, "logits/rejected": -2.027172088623047, "logps/chosen": -9.16306209564209, "logps/rejected": -0.749503493309021, "loss": 0.5543, "rewards/accuracies": 1.0, "rewards/chosen": 1.2122533321380615, "rewards/margins": 0.3001402020454407, "rewards/rejected": 0.9121131300926208, "step": 4120 }, { "epoch": 2.22, "learning_rate": 1.6593987445755925e-08, "logits/chosen": -2.0464751720428467, "logits/rejected": -2.2716164588928223, "logps/chosen": -0.25730112195014954, "logps/rejected": -0.30116575956344604, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.8354841470718384, "rewards/margins": 0.022570252418518066, "rewards/rejected": 0.8129138946533203, "step": 4121 }, { "epoch": 2.22, "learning_rate": 1.657232941836406e-08, "logits/chosen": -1.9906753301620483, "logits/rejected": -2.2979187965393066, "logps/chosen": -0.9466724991798401, "logps/rejected": -1.0502662658691406, "loss": 0.6746, "rewards/accuracies": 1.0, "rewards/chosen": 0.7623770833015442, "rewards/margins": 0.037532150745391846, "rewards/rejected": 0.7248449325561523, "step": 4122 }, { "epoch": 2.22, "learning_rate": 1.6550682726009374e-08, "logits/chosen": -2.0185840129852295, "logits/rejected": -2.3170762062072754, "logps/chosen": -4.765116214752197, "logps/rejected": -2.8179593086242676, "loss": 0.7278, "rewards/accuracies": 0.0, "rewards/chosen": 0.5743154287338257, "rewards/margins": -0.06815832853317261, "rewards/rejected": 0.6424737572669983, "step": 4123 }, { "epoch": 2.22, "learning_rate": 1.6529047376032078e-08, "logits/chosen": -2.0378427505493164, "logits/rejected": -2.2779300212860107, "logps/chosen": -7.9044389724731445, "logps/rejected": -7.5046234130859375, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.8901896476745605, "rewards/margins": 0.005639135837554932, "rewards/rejected": 0.8845505118370056, "step": 4124 }, { "epoch": 2.22, "learning_rate": 1.650742337576852e-08, "logits/chosen": -2.156217336654663, "logits/rejected": -2.262078046798706, "logps/chosen": -0.6985375285148621, "logps/rejected": -2.248812437057495, "loss": 0.6389, "rewards/accuracies": 1.0, "rewards/chosen": 0.9102059602737427, "rewards/margins": 0.11163800954818726, "rewards/rejected": 0.7985679507255554, "step": 4125 }, { "epoch": 2.23, "learning_rate": 1.6485810732551232e-08, "logits/chosen": -2.0653576850891113, "logits/rejected": -2.0738139152526855, "logps/chosen": -2.3701329231262207, "logps/rejected": -3.607470750808716, "loss": 0.621, "rewards/accuracies": 1.0, "rewards/chosen": 0.936597466468811, "rewards/margins": 0.14987391233444214, "rewards/rejected": 0.7867235541343689, "step": 4126 }, { "epoch": 2.23, "learning_rate": 1.646420945370885e-08, "logits/chosen": -2.076011896133423, "logits/rejected": -2.3252882957458496, "logps/chosen": -0.601906418800354, "logps/rejected": -0.534595787525177, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.992125928401947, "rewards/margins": 0.009322941303253174, "rewards/rejected": 0.9828029870986938, "step": 4127 }, { "epoch": 2.23, "learning_rate": 1.6442619546566195e-08, "logits/chosen": -2.0306520462036133, "logits/rejected": -2.2763185501098633, "logps/chosen": -0.6037301421165466, "logps/rejected": -5.363389015197754, "loss": 0.5118, "rewards/accuracies": 1.0, "rewards/chosen": 1.1321332454681396, "rewards/margins": 0.4031001329421997, "rewards/rejected": 0.7290331125259399, "step": 4128 }, { "epoch": 2.23, "learning_rate": 1.6421041018444237e-08, "logits/chosen": -2.074756383895874, "logits/rejected": -2.0671160221099854, "logps/chosen": -3.5078632831573486, "logps/rejected": -10.220669746398926, "loss": 0.5048, "rewards/accuracies": 1.0, "rewards/chosen": 1.3271187543869019, "rewards/margins": 0.42061156034469604, "rewards/rejected": 0.9065071940422058, "step": 4129 }, { "epoch": 2.23, "learning_rate": 1.639947387666004e-08, "logits/chosen": -2.070251703262329, "logits/rejected": -2.310819149017334, "logps/chosen": -0.5296574234962463, "logps/rejected": -0.5808637738227844, "loss": 0.7, "rewards/accuracies": 0.0, "rewards/chosen": 0.9377480745315552, "rewards/margins": -0.013605296611785889, "rewards/rejected": 0.9513533711433411, "step": 4130 }, { "epoch": 2.23, "learning_rate": 1.6377918128526857e-08, "logits/chosen": -1.9948228597640991, "logits/rejected": -2.250549077987671, "logps/chosen": -0.7606909871101379, "logps/rejected": -0.7202135920524597, "loss": 0.667, "rewards/accuracies": 1.0, "rewards/chosen": 0.9754570126533508, "rewards/margins": 0.052916646003723145, "rewards/rejected": 0.9225403666496277, "step": 4131 }, { "epoch": 2.23, "learning_rate": 1.6356373781354056e-08, "logits/chosen": -2.1575815677642822, "logits/rejected": -2.262397050857544, "logps/chosen": -5.466524124145508, "logps/rejected": -8.582165718078613, "loss": 0.5987, "rewards/accuracies": 1.0, "rewards/chosen": 0.8806115984916687, "rewards/margins": 0.19876974821090698, "rewards/rejected": 0.6818418502807617, "step": 4132 }, { "epoch": 2.23, "learning_rate": 1.633484084244713e-08, "logits/chosen": -1.974514365196228, "logits/rejected": -2.265950918197632, "logps/chosen": -0.759160578250885, "logps/rejected": -0.7427458167076111, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9034695625305176, "rewards/margins": 0.02627408504486084, "rewards/rejected": 0.8771954774856567, "step": 4133 }, { "epoch": 2.23, "learning_rate": 1.6313319319107743e-08, "logits/chosen": -2.1513662338256836, "logits/rejected": -2.2592506408691406, "logps/chosen": -4.845725059509277, "logps/rejected": -4.548638820648193, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.8526544570922852, "rewards/margins": 0.00703352689743042, "rewards/rejected": 0.8456209301948547, "step": 4134 }, { "epoch": 2.23, "learning_rate": 1.6291809218633623e-08, "logits/chosen": -1.9917657375335693, "logits/rejected": -1.9989519119262695, "logps/chosen": -1.6662908792495728, "logps/rejected": -2.7788853645324707, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 1.0828642845153809, "rewards/margins": 0.5004823207855225, "rewards/rejected": 0.5823819637298584, "step": 4135 }, { "epoch": 2.23, "learning_rate": 1.6270310548318678e-08, "logits/chosen": -2.1150922775268555, "logits/rejected": -2.3038783073425293, "logps/chosen": -0.8140625357627869, "logps/rejected": -0.766810417175293, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.9011226892471313, "rewards/margins": 0.012859523296356201, "rewards/rejected": 0.8882631659507751, "step": 4136 }, { "epoch": 2.23, "learning_rate": 1.6248823315452942e-08, "logits/chosen": -2.0252280235290527, "logits/rejected": -2.0227839946746826, "logps/chosen": -7.244616508483887, "logps/rejected": -4.977078914642334, "loss": 0.337, "rewards/accuracies": 1.0, "rewards/chosen": 1.3679028749465942, "rewards/margins": 0.9145999550819397, "rewards/rejected": 0.45330291986465454, "step": 4137 }, { "epoch": 2.23, "learning_rate": 1.622734752732251e-08, "logits/chosen": -2.1690263748168945, "logits/rejected": -2.1714229583740234, "logps/chosen": -0.9708428978919983, "logps/rejected": -2.6083579063415527, "loss": 0.6435, "rewards/accuracies": 1.0, "rewards/chosen": 0.9761877059936523, "rewards/margins": 0.10196971893310547, "rewards/rejected": 0.8742179870605469, "step": 4138 }, { "epoch": 2.23, "learning_rate": 1.620588319120968e-08, "logits/chosen": -2.0711581707000732, "logits/rejected": -2.0662879943847656, "logps/chosen": -5.177105903625488, "logps/rejected": -4.276296138763428, "loss": 0.4456, "rewards/accuracies": 1.0, "rewards/chosen": 1.0079046487808228, "rewards/margins": 0.577318549156189, "rewards/rejected": 0.4305860996246338, "step": 4139 }, { "epoch": 2.23, "learning_rate": 1.618443031439277e-08, "logits/chosen": -2.0099546909332275, "logits/rejected": -2.009692668914795, "logps/chosen": -0.5829281210899353, "logps/rejected": -3.6640589237213135, "loss": 0.4857, "rewards/accuracies": 1.0, "rewards/chosen": 1.037768006324768, "rewards/margins": 0.4695731997489929, "rewards/rejected": 0.5681948065757751, "step": 4140 }, { "epoch": 2.23, "learning_rate": 1.616298890414634e-08, "logits/chosen": -2.0585546493530273, "logits/rejected": -2.2998788356781006, "logps/chosen": -2.113882064819336, "logps/rejected": -2.028961181640625, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8917277455329895, "rewards/margins": 0.013010501861572266, "rewards/rejected": 0.8787172436714172, "step": 4141 }, { "epoch": 2.23, "learning_rate": 1.6141558967740937e-08, "logits/chosen": -2.0981764793395996, "logits/rejected": -2.097245454788208, "logps/chosen": -2.9615492820739746, "logps/rejected": -4.934930324554443, "loss": 0.3323, "rewards/accuracies": 1.0, "rewards/chosen": 1.4216569662094116, "rewards/margins": 0.9311164617538452, "rewards/rejected": 0.490540474653244, "step": 4142 }, { "epoch": 2.23, "learning_rate": 1.612014051244328e-08, "logits/chosen": -2.0359749794006348, "logits/rejected": -2.0146567821502686, "logps/chosen": -4.628114223480225, "logps/rejected": -5.270066738128662, "loss": 0.2838, "rewards/accuracies": 1.0, "rewards/chosen": 1.526131272315979, "rewards/margins": 1.1143168210983276, "rewards/rejected": 0.41181445121765137, "step": 4143 }, { "epoch": 2.24, "learning_rate": 1.6098733545516202e-08, "logits/chosen": -2.0553078651428223, "logits/rejected": -2.0582220554351807, "logps/chosen": -1.2943222522735596, "logps/rejected": -4.213569641113281, "loss": 0.4756, "rewards/accuracies": 1.0, "rewards/chosen": 1.214880347251892, "rewards/margins": 0.49598270654678345, "rewards/rejected": 0.7188976407051086, "step": 4144 }, { "epoch": 2.24, "learning_rate": 1.6077338074218593e-08, "logits/chosen": -2.0648281574249268, "logits/rejected": -2.0726492404937744, "logps/chosen": -2.739063262939453, "logps/rejected": -1.2873082160949707, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.3465923070907593, "rewards/margins": 0.6033055186271667, "rewards/rejected": 0.7432867884635925, "step": 4145 }, { "epoch": 2.24, "learning_rate": 1.6055954105805486e-08, "logits/chosen": -2.106306552886963, "logits/rejected": -2.2950966358184814, "logps/chosen": -1.5953606367111206, "logps/rejected": -1.2575058937072754, "loss": 0.7237, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997027277946472, "rewards/margins": -0.0602225661277771, "rewards/rejected": 0.9599252939224243, "step": 4146 }, { "epoch": 2.24, "learning_rate": 1.6034581647528028e-08, "logits/chosen": -2.121492862701416, "logits/rejected": -2.1201136112213135, "logps/chosen": -1.3104231357574463, "logps/rejected": -3.2470438480377197, "loss": 0.5395, "rewards/accuracies": 1.0, "rewards/chosen": 0.9007902145385742, "rewards/margins": 0.33520227670669556, "rewards/rejected": 0.5655879378318787, "step": 4147 }, { "epoch": 2.24, "learning_rate": 1.601322070663339e-08, "logits/chosen": -2.1407039165496826, "logits/rejected": -2.2969024181365967, "logps/chosen": -0.7485002875328064, "logps/rejected": -1.2668946981430054, "loss": 0.634, "rewards/accuracies": 1.0, "rewards/chosen": 0.9423407912254333, "rewards/margins": 0.12201958894729614, "rewards/rejected": 0.8203212022781372, "step": 4148 }, { "epoch": 2.24, "learning_rate": 1.5991871290364926e-08, "logits/chosen": -2.0590877532958984, "logits/rejected": -2.3196918964385986, "logps/chosen": -1.353943109512329, "logps/rejected": -3.265890598297119, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 1.1032179594039917, "rewards/margins": 0.18055367469787598, "rewards/rejected": 0.9226642847061157, "step": 4149 }, { "epoch": 2.24, "learning_rate": 1.5970533405961995e-08, "logits/chosen": -1.9773024320602417, "logits/rejected": -2.2986104488372803, "logps/chosen": -0.13242630660533905, "logps/rejected": -0.13789530098438263, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.8333256840705872, "rewards/margins": 0.00890970230102539, "rewards/rejected": 0.8244159817695618, "step": 4150 }, { "epoch": 2.24, "learning_rate": 1.5949207060660135e-08, "logits/chosen": -2.1107537746429443, "logits/rejected": -2.0921647548675537, "logps/chosen": -4.230645656585693, "logps/rejected": -5.285538673400879, "loss": 0.4548, "rewards/accuracies": 1.0, "rewards/chosen": 1.1501613855361938, "rewards/margins": 0.5517916679382324, "rewards/rejected": 0.5983697175979614, "step": 4151 }, { "epoch": 2.24, "learning_rate": 1.5927892261690922e-08, "logits/chosen": -2.1729726791381836, "logits/rejected": -2.1812949180603027, "logps/chosen": -1.9177433252334595, "logps/rejected": -3.0760648250579834, "loss": 0.4792, "rewards/accuracies": 1.0, "rewards/chosen": 1.1270097494125366, "rewards/margins": 0.48652130365371704, "rewards/rejected": 0.6404884457588196, "step": 4152 }, { "epoch": 2.24, "learning_rate": 1.5906589016281998e-08, "logits/chosen": -2.1576225757598877, "logits/rejected": -2.157538652420044, "logps/chosen": -0.7392047047615051, "logps/rejected": -4.535823822021484, "loss": 0.3819, "rewards/accuracies": 1.0, "rewards/chosen": 1.092648983001709, "rewards/margins": 0.7655642032623291, "rewards/rejected": 0.3270847797393799, "step": 4153 }, { "epoch": 2.24, "learning_rate": 1.588529733165714e-08, "logits/chosen": -2.0357987880706787, "logits/rejected": -2.2947981357574463, "logps/chosen": -2.523653507232666, "logps/rejected": -2.3408572673797607, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.6964150071144104, "rewards/margins": 0.007932603359222412, "rewards/rejected": 0.688482403755188, "step": 4154 }, { "epoch": 2.24, "learning_rate": 1.586401721503614e-08, "logits/chosen": -2.0847363471984863, "logits/rejected": -2.2545437812805176, "logps/chosen": -6.8109049797058105, "logps/rejected": -10.745981216430664, "loss": 0.6131, "rewards/accuracies": 1.0, "rewards/chosen": 0.7954174876213074, "rewards/margins": 0.16716259717941284, "rewards/rejected": 0.6282548904418945, "step": 4155 }, { "epoch": 2.24, "learning_rate": 1.5842748673634924e-08, "logits/chosen": -2.1424171924591064, "logits/rejected": -2.1277127265930176, "logps/chosen": -8.974875450134277, "logps/rejected": -4.055487155914307, "loss": 0.3439, "rewards/accuracies": 1.0, "rewards/chosen": 1.4470009803771973, "rewards/margins": 0.8905261158943176, "rewards/rejected": 0.5564748644828796, "step": 4156 }, { "epoch": 2.24, "learning_rate": 1.5821491714665498e-08, "logits/chosen": -2.1568124294281006, "logits/rejected": -2.152616024017334, "logps/chosen": -3.9048449993133545, "logps/rejected": -4.794935703277588, "loss": 0.2367, "rewards/accuracies": 1.0, "rewards/chosen": 1.7084846496582031, "rewards/margins": 1.320218801498413, "rewards/rejected": 0.38826584815979004, "step": 4157 }, { "epoch": 2.24, "learning_rate": 1.5800246345335867e-08, "logits/chosen": -2.0791373252868652, "logits/rejected": -2.077756881713867, "logps/chosen": -0.9792285561561584, "logps/rejected": -2.074671983718872, "loss": 0.569, "rewards/accuracies": 1.0, "rewards/chosen": 1.0157880783081055, "rewards/margins": 0.265897274017334, "rewards/rejected": 0.7498908042907715, "step": 4158 }, { "epoch": 2.24, "learning_rate": 1.5779012572850175e-08, "logits/chosen": -2.0668091773986816, "logits/rejected": -2.0621886253356934, "logps/chosen": -10.335351943969727, "logps/rejected": -5.597569942474365, "loss": 0.2616, "rewards/accuracies": 1.0, "rewards/chosen": 1.8086128234863281, "rewards/margins": 1.2074757814407349, "rewards/rejected": 0.6011370420455933, "step": 4159 }, { "epoch": 2.24, "learning_rate": 1.5757790404408612e-08, "logits/chosen": -1.9789025783538818, "logits/rejected": -2.250276803970337, "logps/chosen": -1.4090405702590942, "logps/rejected": -1.5833635330200195, "loss": 0.6809, "rewards/accuracies": 1.0, "rewards/chosen": 0.9350271224975586, "rewards/margins": 0.024700701236724854, "rewards/rejected": 0.9103264212608337, "step": 4160 }, { "epoch": 2.24, "learning_rate": 1.5736579847207438e-08, "logits/chosen": -2.0978119373321533, "logits/rejected": -2.332620859146118, "logps/chosen": -6.891857147216797, "logps/rejected": -7.191474914550781, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.9336620569229126, "rewards/margins": 0.037883758544921875, "rewards/rejected": 0.8957782983779907, "step": 4161 }, { "epoch": 2.24, "learning_rate": 1.5715380908438985e-08, "logits/chosen": -2.133566379547119, "logits/rejected": -2.128568649291992, "logps/chosen": -4.29290771484375, "logps/rejected": -7.448808670043945, "loss": 0.5818, "rewards/accuracies": 1.0, "rewards/chosen": 0.9599552154541016, "rewards/margins": 0.2367309331893921, "rewards/rejected": 0.7232242822647095, "step": 4162 }, { "epoch": 2.25, "learning_rate": 1.5694193595291606e-08, "logits/chosen": -2.0551867485046387, "logits/rejected": -2.0612423419952393, "logps/chosen": -0.7768676280975342, "logps/rejected": -2.3061609268188477, "loss": 0.498, "rewards/accuracies": 1.0, "rewards/chosen": 1.0581574440002441, "rewards/margins": 0.4378114342689514, "rewards/rejected": 0.6203460097312927, "step": 4163 }, { "epoch": 2.25, "learning_rate": 1.5673017914949756e-08, "logits/chosen": -1.9997735023498535, "logits/rejected": -1.997750163078308, "logps/chosen": -0.3260079622268677, "logps/rejected": -4.6885528564453125, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.0475982427597046, "rewards/margins": 0.5764278173446655, "rewards/rejected": 0.47117042541503906, "step": 4164 }, { "epoch": 2.25, "learning_rate": 1.5651853874593907e-08, "logits/chosen": -2.136669635772705, "logits/rejected": -2.1387107372283936, "logps/chosen": -0.9559411406517029, "logps/rejected": -2.5146799087524414, "loss": 0.5621, "rewards/accuracies": 1.0, "rewards/chosen": 1.0626128911972046, "rewards/margins": 0.2819521427154541, "rewards/rejected": 0.7806607484817505, "step": 4165 }, { "epoch": 2.25, "learning_rate": 1.5630701481400614e-08, "logits/chosen": -2.0980958938598633, "logits/rejected": -2.315141439437866, "logps/chosen": -1.0560662746429443, "logps/rejected": -1.052696704864502, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9609087109565735, "rewards/margins": 0.02035367488861084, "rewards/rejected": 0.9405550360679626, "step": 4166 }, { "epoch": 2.25, "learning_rate": 1.5609560742542494e-08, "logits/chosen": -2.133788585662842, "logits/rejected": -2.1287522315979004, "logps/chosen": -2.8834850788116455, "logps/rejected": -2.4780192375183105, "loss": 0.4341, "rewards/accuracies": 1.0, "rewards/chosen": 1.30499267578125, "rewards/margins": 0.6094691753387451, "rewards/rejected": 0.6955235004425049, "step": 4167 }, { "epoch": 2.25, "learning_rate": 1.558843166518815e-08, "logits/chosen": -2.0791492462158203, "logits/rejected": -2.0794870853424072, "logps/chosen": -0.6864597201347351, "logps/rejected": -2.444547653198242, "loss": 0.6044, "rewards/accuracies": 1.0, "rewards/chosen": 0.8082365989685059, "rewards/margins": 0.18620997667312622, "rewards/rejected": 0.6220266222953796, "step": 4168 }, { "epoch": 2.25, "learning_rate": 1.5567314256502296e-08, "logits/chosen": -2.07637357711792, "logits/rejected": -2.0810627937316895, "logps/chosen": -2.597533941268921, "logps/rejected": -4.486055374145508, "loss": 0.4112, "rewards/accuracies": 1.0, "rewards/chosen": 1.2217878103256226, "rewards/margins": 0.6761171221733093, "rewards/rejected": 0.5456706881523132, "step": 4169 }, { "epoch": 2.25, "learning_rate": 1.554620852364566e-08, "logits/chosen": -2.0496315956115723, "logits/rejected": -2.0466115474700928, "logps/chosen": -2.1854138374328613, "logps/rejected": -5.28628396987915, "loss": 0.4094, "rewards/accuracies": 1.0, "rewards/chosen": 1.253983497619629, "rewards/margins": 0.6813409328460693, "rewards/rejected": 0.5726425647735596, "step": 4170 }, { "epoch": 2.25, "learning_rate": 1.5525114473775013e-08, "logits/chosen": -2.0681793689727783, "logits/rejected": -2.293407917022705, "logps/chosen": -3.438795566558838, "logps/rejected": -2.815260410308838, "loss": 0.712, "rewards/accuracies": 0.0, "rewards/chosen": 0.634112536907196, "rewards/margins": -0.03740328550338745, "rewards/rejected": 0.6715158224105835, "step": 4171 }, { "epoch": 2.25, "learning_rate": 1.550403211404319e-08, "logits/chosen": -2.0011441707611084, "logits/rejected": -1.991546392440796, "logps/chosen": -5.032262325286865, "logps/rejected": -5.274500846862793, "loss": 0.2853, "rewards/accuracies": 1.0, "rewards/chosen": 1.57756769657135, "rewards/margins": 1.1082111597061157, "rewards/rejected": 0.4693565368652344, "step": 4172 }, { "epoch": 2.25, "learning_rate": 1.548296145159901e-08, "logits/chosen": -1.9644029140472412, "logits/rejected": -1.9678984880447388, "logps/chosen": -3.8103458881378174, "logps/rejected": -1.9309117794036865, "loss": 0.3715, "rewards/accuracies": 1.0, "rewards/chosen": 1.479356288909912, "rewards/margins": 0.7986773252487183, "rewards/rejected": 0.6806789636611938, "step": 4173 }, { "epoch": 2.25, "learning_rate": 1.5461902493587352e-08, "logits/chosen": -2.1880574226379395, "logits/rejected": -2.3019368648529053, "logps/chosen": -6.166009426116943, "logps/rejected": -30.817401885986328, "loss": 0.3862, "rewards/accuracies": 1.0, "rewards/chosen": 1.3865900039672852, "rewards/margins": 0.7521074414253235, "rewards/rejected": 0.6344825625419617, "step": 4174 }, { "epoch": 2.25, "learning_rate": 1.5440855247149165e-08, "logits/chosen": -2.100720167160034, "logits/rejected": -2.1073362827301025, "logps/chosen": -2.171450138092041, "logps/rejected": -6.53225564956665, "loss": 0.3354, "rewards/accuracies": 1.0, "rewards/chosen": 1.2594435214996338, "rewards/margins": 0.9202017784118652, "rewards/rejected": 0.33924174308776855, "step": 4175 }, { "epoch": 2.25, "learning_rate": 1.541981971942135e-08, "logits/chosen": -2.1987133026123047, "logits/rejected": -2.1480045318603516, "logps/chosen": -15.78124713897705, "logps/rejected": -4.093708038330078, "loss": 0.2611, "rewards/accuracies": 1.0, "rewards/chosen": 1.7497020959854126, "rewards/margins": 1.2092671394348145, "rewards/rejected": 0.5404349565505981, "step": 4176 }, { "epoch": 2.25, "learning_rate": 1.53987959175369e-08, "logits/chosen": -1.9853367805480957, "logits/rejected": -2.2834582328796387, "logps/chosen": -0.5669227838516235, "logps/rejected": -0.6067014336585999, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 1.0709728002548218, "rewards/margins": 0.01587831974029541, "rewards/rejected": 1.0550944805145264, "step": 4177 }, { "epoch": 2.25, "learning_rate": 1.5377783848624788e-08, "logits/chosen": -2.0008838176727295, "logits/rejected": -2.0125389099121094, "logps/chosen": -0.9074835181236267, "logps/rejected": -10.525318145751953, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 1.0192582607269287, "rewards/margins": 0.3007432222366333, "rewards/rejected": 0.7185150384902954, "step": 4178 }, { "epoch": 2.25, "learning_rate": 1.5356783519810028e-08, "logits/chosen": -2.0882694721221924, "logits/rejected": -2.0770668983459473, "logps/chosen": -0.3303554952144623, "logps/rejected": -7.027386665344238, "loss": 0.4259, "rewards/accuracies": 1.0, "rewards/chosen": 1.018133282661438, "rewards/margins": 0.6329408884048462, "rewards/rejected": 0.3851923942565918, "step": 4179 }, { "epoch": 2.25, "learning_rate": 1.5335794938213665e-08, "logits/chosen": -2.1175289154052734, "logits/rejected": -2.333965539932251, "logps/chosen": -0.5338603854179382, "logps/rejected": -23.138572692871094, "loss": 0.5448, "rewards/accuracies": 1.0, "rewards/chosen": 1.9042795896530151, "rewards/margins": 0.3225942850112915, "rewards/rejected": 1.5816853046417236, "step": 4180 }, { "epoch": 2.26, "learning_rate": 1.5314818110952737e-08, "logits/chosen": -2.1610546112060547, "logits/rejected": -2.237830400466919, "logps/chosen": -4.994015216827393, "logps/rejected": -16.09171485900879, "loss": 0.4931, "rewards/accuracies": 1.0, "rewards/chosen": 1.1533492803573608, "rewards/margins": 0.45033830404281616, "rewards/rejected": 0.7030109763145447, "step": 4181 }, { "epoch": 2.26, "learning_rate": 1.5293853045140332e-08, "logits/chosen": -2.073967695236206, "logits/rejected": -2.237513303756714, "logps/chosen": -0.4830339848995209, "logps/rejected": -0.4768550992012024, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.8520297408103943, "rewards/margins": 0.02576279640197754, "rewards/rejected": 0.8262669444084167, "step": 4182 }, { "epoch": 2.26, "learning_rate": 1.5272899747885486e-08, "logits/chosen": -2.0474562644958496, "logits/rejected": -2.0523221492767334, "logps/chosen": -2.804250955581665, "logps/rejected": -5.442863464355469, "loss": 0.424, "rewards/accuracies": 1.0, "rewards/chosen": 1.0732420682907104, "rewards/margins": 0.638590395450592, "rewards/rejected": 0.4346516728401184, "step": 4183 }, { "epoch": 2.26, "learning_rate": 1.5251958226293305e-08, "logits/chosen": -2.1566543579101562, "logits/rejected": -2.153073310852051, "logps/chosen": -3.1765918731689453, "logps/rejected": -7.3968505859375, "loss": 0.3729, "rewards/accuracies": 1.0, "rewards/chosen": 1.5197762250900269, "rewards/margins": 0.794170618057251, "rewards/rejected": 0.7256056070327759, "step": 4184 }, { "epoch": 2.26, "learning_rate": 1.5231028487464885e-08, "logits/chosen": -2.1635687351226807, "logits/rejected": -2.2786290645599365, "logps/chosen": -3.333652973175049, "logps/rejected": -3.00182843208313, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.5004433989524841, "rewards/margins": 0.005616694688796997, "rewards/rejected": 0.49482670426368713, "step": 4185 }, { "epoch": 2.26, "learning_rate": 1.52101105384973e-08, "logits/chosen": -2.074734687805176, "logits/rejected": -2.0755550861358643, "logps/chosen": -0.8472753167152405, "logps/rejected": -3.6340901851654053, "loss": 0.5066, "rewards/accuracies": 1.0, "rewards/chosen": 1.1831196546554565, "rewards/margins": 0.41617733240127563, "rewards/rejected": 0.7669423222541809, "step": 4186 }, { "epoch": 2.26, "learning_rate": 1.5189204386483677e-08, "logits/chosen": -2.1399712562561035, "logits/rejected": -2.314897060394287, "logps/chosen": -0.46742671728134155, "logps/rejected": -0.5307889580726624, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9374350905418396, "rewards/margins": 0.015074551105499268, "rewards/rejected": 0.9223605394363403, "step": 4187 }, { "epoch": 2.26, "learning_rate": 1.5168310038513076e-08, "logits/chosen": -2.014519214630127, "logits/rejected": -2.0263257026672363, "logps/chosen": -2.650994300842285, "logps/rejected": -1.7960597276687622, "loss": 0.4152, "rewards/accuracies": 1.0, "rewards/chosen": 1.3348785638809204, "rewards/margins": 0.6642875671386719, "rewards/rejected": 0.6705909967422485, "step": 4188 }, { "epoch": 2.26, "learning_rate": 1.5147427501670613e-08, "logits/chosen": -2.04840350151062, "logits/rejected": -2.056612968444824, "logps/chosen": -2.07893967628479, "logps/rejected": -1.4398189783096313, "loss": 0.3451, "rewards/accuracies": 1.0, "rewards/chosen": 1.6255849599838257, "rewards/margins": 0.8863442540168762, "rewards/rejected": 0.7392407059669495, "step": 4189 }, { "epoch": 2.26, "learning_rate": 1.5126556783037382e-08, "logits/chosen": -2.1258151531219482, "logits/rejected": -2.122941493988037, "logps/chosen": -4.784602165222168, "logps/rejected": -8.570104598999023, "loss": 0.2243, "rewards/accuracies": 1.0, "rewards/chosen": 1.4637494087219238, "rewards/margins": 1.380414366722107, "rewards/rejected": 0.0833350196480751, "step": 4190 }, { "epoch": 2.26, "learning_rate": 1.510569788969045e-08, "logits/chosen": -2.1884684562683105, "logits/rejected": -2.1959891319274902, "logps/chosen": -1.7398123741149902, "logps/rejected": -3.416836738586426, "loss": 0.4645, "rewards/accuracies": 1.0, "rewards/chosen": 1.0036442279815674, "rewards/margins": 0.525444746017456, "rewards/rejected": 0.47819948196411133, "step": 4191 }, { "epoch": 2.26, "learning_rate": 1.508485082870292e-08, "logits/chosen": -2.1056315898895264, "logits/rejected": -2.2530484199523926, "logps/chosen": -1.5366456508636475, "logps/rejected": -1.4571101665496826, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.8295207023620605, "rewards/margins": 0.019888460636138916, "rewards/rejected": 0.8096322417259216, "step": 4192 }, { "epoch": 2.26, "learning_rate": 1.5064015607143816e-08, "logits/chosen": -2.0209884643554688, "logits/rejected": -2.0279200077056885, "logps/chosen": -1.6724305152893066, "logps/rejected": -5.826696395874023, "loss": 0.3692, "rewards/accuracies": 1.0, "rewards/chosen": 1.1008332967758179, "rewards/margins": 0.8060288429260254, "rewards/rejected": 0.29480448365211487, "step": 4193 }, { "epoch": 2.26, "learning_rate": 1.50431922320782e-08, "logits/chosen": -2.1267025470733643, "logits/rejected": -2.1195173263549805, "logps/chosen": -0.6983359456062317, "logps/rejected": -12.721956253051758, "loss": 0.4, "rewards/accuracies": 1.0, "rewards/chosen": 1.2278484106063843, "rewards/margins": 0.7096194624900818, "rewards/rejected": 0.5182289481163025, "step": 4194 }, { "epoch": 2.26, "learning_rate": 1.502238071056711e-08, "logits/chosen": -2.1513214111328125, "logits/rejected": -2.1557016372680664, "logps/chosen": -2.1278605461120605, "logps/rejected": -3.084831714630127, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 1.0257648229599, "rewards/margins": 0.42910271883010864, "rewards/rejected": 0.5966621041297913, "step": 4195 }, { "epoch": 2.26, "learning_rate": 1.5001581049667527e-08, "logits/chosen": -2.0575785636901855, "logits/rejected": -2.0478086471557617, "logps/chosen": -5.060728549957275, "logps/rejected": -1.9745734930038452, "loss": 0.4011, "rewards/accuracies": 1.0, "rewards/chosen": 1.6471995115280151, "rewards/margins": 0.706227719783783, "rewards/rejected": 0.9409717917442322, "step": 4196 }, { "epoch": 2.26, "learning_rate": 1.4980793256432472e-08, "logits/chosen": -2.131200075149536, "logits/rejected": -2.2619152069091797, "logps/chosen": -7.572055816650391, "logps/rejected": -7.735318183898926, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 1.0198932886123657, "rewards/margins": 0.01144719123840332, "rewards/rejected": 1.0084460973739624, "step": 4197 }, { "epoch": 2.26, "learning_rate": 1.496001733791088e-08, "logits/chosen": -2.0846469402313232, "logits/rejected": -2.0853607654571533, "logps/chosen": -1.4430570602416992, "logps/rejected": -1.969611644744873, "loss": 0.6413, "rewards/accuracies": 1.0, "rewards/chosen": 1.016371726989746, "rewards/margins": 0.10651761293411255, "rewards/rejected": 0.9098541140556335, "step": 4198 }, { "epoch": 2.26, "learning_rate": 1.4939253301147675e-08, "logits/chosen": -2.0969276428222656, "logits/rejected": -2.0347676277160645, "logps/chosen": -19.564876556396484, "logps/rejected": -3.284433603286743, "loss": 0.2973, "rewards/accuracies": 1.0, "rewards/chosen": 1.7551616430282593, "rewards/margins": 1.0608153343200684, "rewards/rejected": 0.6943462491035461, "step": 4199 }, { "epoch": 2.27, "learning_rate": 1.4918501153183827e-08, "logits/chosen": -2.00317645072937, "logits/rejected": -2.2774012088775635, "logps/chosen": -1.3812090158462524, "logps/rejected": -1.0919203758239746, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.8046746253967285, "rewards/margins": -0.010370194911956787, "rewards/rejected": 0.8150448203086853, "step": 4200 }, { "epoch": 2.27, "learning_rate": 1.489776090105615e-08, "logits/chosen": -2.206303358078003, "logits/rejected": -2.204591751098633, "logps/chosen": -0.7010913491249084, "logps/rejected": -7.362605094909668, "loss": 0.3721, "rewards/accuracies": 1.0, "rewards/chosen": 1.0710420608520508, "rewards/margins": 0.7967863082885742, "rewards/rejected": 0.27425575256347656, "step": 4201 }, { "epoch": 2.27, "learning_rate": 1.4877032551797524e-08, "logits/chosen": -1.9538301229476929, "logits/rejected": -2.2917227745056152, "logps/chosen": -3.237196207046509, "logps/rejected": -4.005529880523682, "loss": 0.6541, "rewards/accuracies": 1.0, "rewards/chosen": 1.027091145515442, "rewards/margins": 0.07969391345977783, "rewards/rejected": 0.9473972320556641, "step": 4202 }, { "epoch": 2.27, "learning_rate": 1.4856316112436722e-08, "logits/chosen": -2.1050496101379395, "logits/rejected": -2.2742133140563965, "logps/chosen": -0.985227644443512, "logps/rejected": -1.0523263216018677, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 1.1928719282150269, "rewards/margins": 0.028578519821166992, "rewards/rejected": 1.1642934083938599, "step": 4203 }, { "epoch": 2.27, "learning_rate": 1.4835611589998531e-08, "logits/chosen": -1.9483985900878906, "logits/rejected": -2.2777910232543945, "logps/chosen": -0.842186450958252, "logps/rejected": -0.9201853275299072, "loss": 0.6953, "rewards/accuracies": 0.0, "rewards/chosen": 0.8857817053794861, "rewards/margins": -0.004248499870300293, "rewards/rejected": 0.8900302052497864, "step": 4204 }, { "epoch": 2.27, "learning_rate": 1.4814918991503695e-08, "logits/chosen": -2.012830972671509, "logits/rejected": -2.011401653289795, "logps/chosen": -2.8896002769470215, "logps/rejected": -5.490785598754883, "loss": 0.2626, "rewards/accuracies": 1.0, "rewards/chosen": 1.5682929754257202, "rewards/margins": 1.2031240463256836, "rewards/rejected": 0.365168958902359, "step": 4205 }, { "epoch": 2.27, "learning_rate": 1.4794238323968855e-08, "logits/chosen": -2.0877723693847656, "logits/rejected": -2.304030656814575, "logps/chosen": -0.29207271337509155, "logps/rejected": -0.32841652631759644, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 0.9200205206871033, "rewards/margins": 0.021629929542541504, "rewards/rejected": 0.8983905911445618, "step": 4206 }, { "epoch": 2.27, "learning_rate": 1.4773569594406671e-08, "logits/chosen": -2.1412720680236816, "logits/rejected": -2.1484596729278564, "logps/chosen": -2.305893659591675, "logps/rejected": -4.5318427085876465, "loss": 0.4165, "rewards/accuracies": 1.0, "rewards/chosen": 1.1415055990219116, "rewards/margins": 0.6604793667793274, "rewards/rejected": 0.48102623224258423, "step": 4207 }, { "epoch": 2.27, "learning_rate": 1.4752912809825751e-08, "logits/chosen": -2.1999783515930176, "logits/rejected": -2.211136817932129, "logps/chosen": -1.6617431640625, "logps/rejected": -3.549345016479492, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.3341419696807861, "rewards/margins": 0.5983243584632874, "rewards/rejected": 0.7358176112174988, "step": 4208 }, { "epoch": 2.27, "learning_rate": 1.4732267977230583e-08, "logits/chosen": -2.035705804824829, "logits/rejected": -2.0444557666778564, "logps/chosen": -1.4632303714752197, "logps/rejected": -2.492319107055664, "loss": 0.4979, "rewards/accuracies": 1.0, "rewards/chosen": 1.0524197816848755, "rewards/margins": 0.4379886984825134, "rewards/rejected": 0.6144310832023621, "step": 4209 }, { "epoch": 2.27, "learning_rate": 1.4711635103621717e-08, "logits/chosen": -2.0704619884490967, "logits/rejected": -2.2511165142059326, "logps/chosen": -1.0202832221984863, "logps/rejected": -0.9978506565093994, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.8328075408935547, "rewards/margins": 0.0207100510597229, "rewards/rejected": 0.8120974898338318, "step": 4210 }, { "epoch": 2.27, "learning_rate": 1.4691014195995533e-08, "logits/chosen": -2.079432964324951, "logits/rejected": -2.0317535400390625, "logps/chosen": -26.372343063354492, "logps/rejected": -2.9966847896575928, "loss": 0.3172, "rewards/accuracies": 1.0, "rewards/chosen": 1.7514368295669556, "rewards/margins": 0.9854757189750671, "rewards/rejected": 0.7659611105918884, "step": 4211 }, { "epoch": 2.27, "learning_rate": 1.4670405261344442e-08, "logits/chosen": -2.0593316555023193, "logits/rejected": -2.272697687149048, "logps/chosen": -0.6269035935401917, "logps/rejected": -0.6353524923324585, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9160637259483337, "rewards/margins": 0.018360435962677002, "rewards/rejected": 0.8977032899856567, "step": 4212 }, { "epoch": 2.27, "learning_rate": 1.4649808306656718e-08, "logits/chosen": -2.111862897872925, "logits/rejected": -2.0503439903259277, "logps/chosen": -12.560611724853516, "logps/rejected": -21.279735565185547, "loss": 0.223, "rewards/accuracies": 1.0, "rewards/chosen": 1.711525797843933, "rewards/margins": 1.3870277404785156, "rewards/rejected": 0.3244979977607727, "step": 4213 }, { "epoch": 2.27, "learning_rate": 1.462922333891664e-08, "logits/chosen": -1.9869989156723022, "logits/rejected": -1.9903533458709717, "logps/chosen": -0.9818507432937622, "logps/rejected": -1.6990152597427368, "loss": 0.5174, "rewards/accuracies": 1.0, "rewards/chosen": 1.158785343170166, "rewards/margins": 0.38907504081726074, "rewards/rejected": 0.7697103023529053, "step": 4214 }, { "epoch": 2.27, "learning_rate": 1.4608650365104403e-08, "logits/chosen": -2.069105625152588, "logits/rejected": -1.9636765718460083, "logps/chosen": -30.34825325012207, "logps/rejected": -1.7721916437149048, "loss": 0.206, "rewards/accuracies": 1.0, "rewards/chosen": 2.297166585922241, "rewards/margins": 1.474851369857788, "rewards/rejected": 0.8223151564598083, "step": 4215 }, { "epoch": 2.27, "learning_rate": 1.458808939219609e-08, "logits/chosen": -1.9905710220336914, "logits/rejected": -2.2838938236236572, "logps/chosen": -2.223153829574585, "logps/rejected": -1.835646629333496, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8749960064888, "rewards/margins": 0.015971779823303223, "rewards/rejected": 0.8590242266654968, "step": 4216 }, { "epoch": 2.27, "learning_rate": 1.4567540427163777e-08, "logits/chosen": -2.0731146335601807, "logits/rejected": -2.290322780609131, "logps/chosen": -2.1396543979644775, "logps/rejected": -2.2550556659698486, "loss": 0.6865, "rewards/accuracies": 1.0, "rewards/chosen": 0.9014891982078552, "rewards/margins": 0.013376176357269287, "rewards/rejected": 0.8881130218505859, "step": 4217 }, { "epoch": 2.28, "learning_rate": 1.454700347697545e-08, "logits/chosen": -2.008472204208374, "logits/rejected": -2.003349542617798, "logps/chosen": -3.9998254776000977, "logps/rejected": -2.117173194885254, "loss": 0.4375, "rewards/accuracies": 1.0, "rewards/chosen": 1.4020068645477295, "rewards/margins": 0.6000068783760071, "rewards/rejected": 0.8019999861717224, "step": 4218 }, { "epoch": 2.28, "learning_rate": 1.4526478548594973e-08, "logits/chosen": -2.078965187072754, "logits/rejected": -2.0309667587280273, "logps/chosen": -28.38247299194336, "logps/rejected": -1.9388105869293213, "loss": 0.2854, "rewards/accuracies": 1.0, "rewards/chosen": 1.918113350868225, "rewards/margins": 1.107701301574707, "rewards/rejected": 0.8104120492935181, "step": 4219 }, { "epoch": 2.28, "learning_rate": 1.4505965648982226e-08, "logits/chosen": -2.102391481399536, "logits/rejected": -2.3453726768493652, "logps/chosen": -1.221283197402954, "logps/rejected": -1.115418791770935, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 1.0170869827270508, "rewards/margins": 0.04101616144180298, "rewards/rejected": 0.9760708212852478, "step": 4220 }, { "epoch": 2.28, "learning_rate": 1.4485464785092921e-08, "logits/chosen": -2.036498546600342, "logits/rejected": -2.047820568084717, "logps/chosen": -2.577402114868164, "logps/rejected": -2.161607027053833, "loss": 0.4775, "rewards/accuracies": 1.0, "rewards/chosen": 0.9957608580589294, "rewards/margins": 0.490917444229126, "rewards/rejected": 0.5048434138298035, "step": 4221 }, { "epoch": 2.28, "learning_rate": 1.446497596387874e-08, "logits/chosen": -1.9898079633712769, "logits/rejected": -1.9930987358093262, "logps/chosen": -2.0672194957733154, "logps/rejected": -3.0369760990142822, "loss": 0.5567, "rewards/accuracies": 1.0, "rewards/chosen": 1.007483959197998, "rewards/margins": 0.2944587469100952, "rewards/rejected": 0.7130252122879028, "step": 4222 }, { "epoch": 2.28, "learning_rate": 1.4444499192287274e-08, "logits/chosen": -2.1070923805236816, "logits/rejected": -1.9778739213943481, "logps/chosen": -17.79528045654297, "logps/rejected": -8.515583038330078, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 1.7877765893936157, "rewards/margins": 0.9707252383232117, "rewards/rejected": 0.817051351070404, "step": 4223 }, { "epoch": 2.28, "learning_rate": 1.4424034477262004e-08, "logits/chosen": -2.0870161056518555, "logits/rejected": -2.1913607120513916, "logps/chosen": -1.1026549339294434, "logps/rejected": -25.959075927734375, "loss": 0.2738, "rewards/accuracies": 1.0, "rewards/chosen": 1.1627899408340454, "rewards/margins": 1.1553735733032227, "rewards/rejected": 0.007416343782097101, "step": 4224 }, { "epoch": 2.28, "learning_rate": 1.4403581825742355e-08, "logits/chosen": -2.0570333003997803, "logits/rejected": -2.0440948009490967, "logps/chosen": -19.56194305419922, "logps/rejected": -1.003908634185791, "loss": 0.2368, "rewards/accuracies": 1.0, "rewards/chosen": 2.174179792404175, "rewards/margins": 1.3197143077850342, "rewards/rejected": 0.8544654846191406, "step": 4225 }, { "epoch": 2.28, "learning_rate": 1.4383141244663626e-08, "logits/chosen": -2.1365299224853516, "logits/rejected": -2.293278217315674, "logps/chosen": -3.1647257804870605, "logps/rejected": -3.0119736194610596, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.9528478980064392, "rewards/margins": 0.012747764587402344, "rewards/rejected": 0.9401001334190369, "step": 4226 }, { "epoch": 2.28, "learning_rate": 1.4362712740957061e-08, "logits/chosen": -2.0662522315979004, "logits/rejected": -2.267554998397827, "logps/chosen": -0.4469663202762604, "logps/rejected": -4.996649742126465, "loss": 0.5667, "rewards/accuracies": 1.0, "rewards/chosen": 0.9960166215896606, "rewards/margins": 0.27116966247558594, "rewards/rejected": 0.7248469591140747, "step": 4227 }, { "epoch": 2.28, "learning_rate": 1.434229632154978e-08, "logits/chosen": -2.0584967136383057, "logits/rejected": -2.2160515785217285, "logps/chosen": -0.8604322671890259, "logps/rejected": -0.9109643697738647, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 1.0668160915374756, "rewards/margins": 0.0017069578170776367, "rewards/rejected": 1.065109133720398, "step": 4228 }, { "epoch": 2.28, "learning_rate": 1.4321891993364826e-08, "logits/chosen": -1.9968324899673462, "logits/rejected": -2.288757562637329, "logps/chosen": -0.25660014152526855, "logps/rejected": -0.25553637742996216, "loss": 0.691, "rewards/accuracies": 1.0, "rewards/chosen": 0.7937669157981873, "rewards/margins": 0.004210054874420166, "rewards/rejected": 0.7895568609237671, "step": 4229 }, { "epoch": 2.28, "learning_rate": 1.430149976332114e-08, "logits/chosen": -2.1217265129089355, "logits/rejected": -2.1340343952178955, "logps/chosen": -7.442286491394043, "logps/rejected": -1.9740512371063232, "loss": 0.5498, "rewards/accuracies": 1.0, "rewards/chosen": 1.0764955282211304, "rewards/margins": 0.3108305335044861, "rewards/rejected": 0.7656649947166443, "step": 4230 }, { "epoch": 2.28, "learning_rate": 1.4281119638333527e-08, "logits/chosen": -1.9993654489517212, "logits/rejected": -2.0158846378326416, "logps/chosen": -1.5254803895950317, "logps/rejected": -8.715791702270508, "loss": 0.4549, "rewards/accuracies": 1.0, "rewards/chosen": 1.1575469970703125, "rewards/margins": 0.5517421364784241, "rewards/rejected": 0.6058048605918884, "step": 4231 }, { "epoch": 2.28, "learning_rate": 1.4260751625312727e-08, "logits/chosen": -2.1265599727630615, "logits/rejected": -2.1279475688934326, "logps/chosen": -0.3152964115142822, "logps/rejected": -4.55295991897583, "loss": 0.4917, "rewards/accuracies": 1.0, "rewards/chosen": 0.9792757034301758, "rewards/margins": 0.45405030250549316, "rewards/rejected": 0.5252254009246826, "step": 4232 }, { "epoch": 2.28, "learning_rate": 1.4240395731165372e-08, "logits/chosen": -2.2026138305664062, "logits/rejected": -2.2019505500793457, "logps/chosen": -1.6458879709243774, "logps/rejected": -6.375247478485107, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": 1.107530951499939, "rewards/margins": 0.7976425886154175, "rewards/rejected": 0.3098883330821991, "step": 4233 }, { "epoch": 2.28, "learning_rate": 1.422005196279395e-08, "logits/chosen": -1.9752733707427979, "logits/rejected": -2.2598745822906494, "logps/chosen": -9.132914543151855, "logps/rejected": -9.538763999938965, "loss": 0.6684, "rewards/accuracies": 1.0, "rewards/chosen": 0.4385821521282196, "rewards/margins": 0.050138115882873535, "rewards/rejected": 0.38844403624534607, "step": 4234 }, { "epoch": 2.28, "learning_rate": 1.4199720327096875e-08, "logits/chosen": -1.954986810684204, "logits/rejected": -1.92392098903656, "logps/chosen": -10.117393493652344, "logps/rejected": -4.8836894035339355, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 0.9917508959770203, "rewards/margins": 0.5614060759544373, "rewards/rejected": 0.430344820022583, "step": 4235 }, { "epoch": 2.28, "learning_rate": 1.4179400830968412e-08, "logits/chosen": -2.1515183448791504, "logits/rejected": -2.157421350479126, "logps/chosen": -2.67525577545166, "logps/rejected": -1.8194350004196167, "loss": 0.639, "rewards/accuracies": 1.0, "rewards/chosen": 0.9829391837120056, "rewards/margins": 0.11138671636581421, "rewards/rejected": 0.8715524673461914, "step": 4236 }, { "epoch": 2.29, "learning_rate": 1.415909348129874e-08, "logits/chosen": -2.155449151992798, "logits/rejected": -2.3607001304626465, "logps/chosen": -6.262379169464111, "logps/rejected": -5.973100662231445, "loss": 0.7002, "rewards/accuracies": 0.0, "rewards/chosen": 0.8489397168159485, "rewards/margins": -0.014103114604949951, "rewards/rejected": 0.8630428314208984, "step": 4237 }, { "epoch": 2.29, "learning_rate": 1.4138798284973902e-08, "logits/chosen": -2.1828196048736572, "logits/rejected": -2.1721854209899902, "logps/chosen": -2.6813907623291016, "logps/rejected": -3.7898447513580322, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 1.1982223987579346, "rewards/margins": 0.7044278383255005, "rewards/rejected": 0.4937945306301117, "step": 4238 }, { "epoch": 2.29, "learning_rate": 1.4118515248875834e-08, "logits/chosen": -2.0030746459960938, "logits/rejected": -2.005746841430664, "logps/chosen": -3.0220837593078613, "logps/rejected": -7.350729465484619, "loss": 0.4677, "rewards/accuracies": 1.0, "rewards/chosen": 0.9216577410697937, "rewards/margins": 0.516984224319458, "rewards/rejected": 0.4046735465526581, "step": 4239 }, { "epoch": 2.29, "learning_rate": 1.4098244379882352e-08, "logits/chosen": -2.1295018196105957, "logits/rejected": -2.274618625640869, "logps/chosen": -0.8224857449531555, "logps/rejected": -0.8331384062767029, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 1.0808466672897339, "rewards/margins": 0.007710933685302734, "rewards/rejected": 1.0731357336044312, "step": 4240 }, { "epoch": 2.29, "learning_rate": 1.4077985684867105e-08, "logits/chosen": -2.0231282711029053, "logits/rejected": -2.3972554206848145, "logps/chosen": -0.6537453532218933, "logps/rejected": -25.79018783569336, "loss": 0.7956, "rewards/accuracies": 0.0, "rewards/chosen": 1.037022352218628, "rewards/margins": -0.19541096687316895, "rewards/rejected": 1.2324333190917969, "step": 4241 }, { "epoch": 2.29, "learning_rate": 1.4057739170699657e-08, "logits/chosen": -2.050319194793701, "logits/rejected": -2.059603691101074, "logps/chosen": -3.2073445320129395, "logps/rejected": -10.347224235534668, "loss": 0.3688, "rewards/accuracies": 1.0, "rewards/chosen": 1.2497535943984985, "rewards/margins": 0.8075176477432251, "rewards/rejected": 0.44223594665527344, "step": 4242 }, { "epoch": 2.29, "learning_rate": 1.403750484424544e-08, "logits/chosen": -2.0458240509033203, "logits/rejected": -2.1674981117248535, "logps/chosen": -0.5688499212265015, "logps/rejected": -0.5884428024291992, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 0.967266857624054, "rewards/margins": 0.020568549633026123, "rewards/rejected": 0.9466983079910278, "step": 4243 }, { "epoch": 2.29, "learning_rate": 1.401728271236572e-08, "logits/chosen": -2.2488481998443604, "logits/rejected": -2.0993924140930176, "logps/chosen": -52.31431579589844, "logps/rejected": -21.74994659423828, "loss": 0.1715, "rewards/accuracies": 1.0, "rewards/chosen": 2.712916612625122, "rewards/margins": 1.676171898841858, "rewards/rejected": 1.0367447137832642, "step": 4244 }, { "epoch": 2.29, "learning_rate": 1.3997072781917651e-08, "logits/chosen": -2.2474524974823, "logits/rejected": -2.267199993133545, "logps/chosen": -13.800931930541992, "logps/rejected": -16.220232009887695, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 1.6844457387924194, "rewards/margins": 0.4315863847732544, "rewards/rejected": 1.252859354019165, "step": 4245 }, { "epoch": 2.29, "learning_rate": 1.3976875059754273e-08, "logits/chosen": -1.9955328702926636, "logits/rejected": -2.320117950439453, "logps/chosen": -0.5737108588218689, "logps/rejected": -0.551617443561554, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 1.037145972251892, "rewards/margins": 0.03725093603134155, "rewards/rejected": 0.9998950362205505, "step": 4246 }, { "epoch": 2.29, "learning_rate": 1.3956689552724427e-08, "logits/chosen": -2.1457550525665283, "logits/rejected": -2.302537679672241, "logps/chosen": -1.1522116661071777, "logps/rejected": -0.9818258285522461, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8921180963516235, "rewards/margins": 0.017123520374298096, "rewards/rejected": 0.8749945759773254, "step": 4247 }, { "epoch": 2.29, "learning_rate": 1.393651626767286e-08, "logits/chosen": -2.031435251235962, "logits/rejected": -2.0323214530944824, "logps/chosen": -4.0899658203125, "logps/rejected": -2.5039138793945312, "loss": 0.3127, "rewards/accuracies": 1.0, "rewards/chosen": 1.575858473777771, "rewards/margins": 1.0019347667694092, "rewards/rejected": 0.5739237666130066, "step": 4248 }, { "epoch": 2.29, "learning_rate": 1.3916355211440162e-08, "logits/chosen": -2.2620010375976562, "logits/rejected": -2.159243583679199, "logps/chosen": -26.518901824951172, "logps/rejected": -3.962291717529297, "loss": 0.2081, "rewards/accuracies": 1.0, "rewards/chosen": 1.9306995868682861, "rewards/margins": 1.4636812210083008, "rewards/rejected": 0.46701833605766296, "step": 4249 }, { "epoch": 2.29, "learning_rate": 1.3896206390862786e-08, "logits/chosen": -2.114856004714966, "logits/rejected": -2.291719675064087, "logps/chosen": -0.2776612639427185, "logps/rejected": -0.32891133427619934, "loss": 0.6866, "rewards/accuracies": 1.0, "rewards/chosen": 0.8521367311477661, "rewards/margins": 0.013104736804962158, "rewards/rejected": 0.839031994342804, "step": 4250 }, { "epoch": 2.29, "learning_rate": 1.3876069812773e-08, "logits/chosen": -2.1172149181365967, "logits/rejected": -2.0445196628570557, "logps/chosen": -20.3713321685791, "logps/rejected": -2.7811906337738037, "loss": 0.3489, "rewards/accuracies": 1.0, "rewards/chosen": 1.4815775156021118, "rewards/margins": 0.8735763430595398, "rewards/rejected": 0.608001172542572, "step": 4251 }, { "epoch": 2.29, "learning_rate": 1.385594548399896e-08, "logits/chosen": -2.111995220184326, "logits/rejected": -2.1166820526123047, "logps/chosen": -3.6083414554595947, "logps/rejected": -5.86776065826416, "loss": 0.5472, "rewards/accuracies": 1.0, "rewards/chosen": 1.001955270767212, "rewards/margins": 0.31692343950271606, "rewards/rejected": 0.6850318312644958, "step": 4252 }, { "epoch": 2.29, "learning_rate": 1.3835833411364666e-08, "logits/chosen": -2.1478781700134277, "logits/rejected": -2.296733856201172, "logps/chosen": -6.9195356369018555, "logps/rejected": -10.608882904052734, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9763880968093872, "rewards/margins": 0.00936347246170044, "rewards/rejected": 0.9670246243476868, "step": 4253 }, { "epoch": 2.29, "learning_rate": 1.3815733601689922e-08, "logits/chosen": -2.06074595451355, "logits/rejected": -2.0596537590026855, "logps/chosen": -0.7735189199447632, "logps/rejected": -7.0030317306518555, "loss": 0.4088, "rewards/accuracies": 1.0, "rewards/chosen": 1.1130532026290894, "rewards/margins": 0.6832083463668823, "rewards/rejected": 0.42984485626220703, "step": 4254 }, { "epoch": 2.3, "learning_rate": 1.3795646061790416e-08, "logits/chosen": -2.15876841545105, "logits/rejected": -2.305487632751465, "logps/chosen": -1.5365418195724487, "logps/rejected": -1.489790916442871, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 0.6362547278404236, "rewards/margins": 0.007027328014373779, "rewards/rejected": 0.6292273998260498, "step": 4255 }, { "epoch": 2.3, "learning_rate": 1.3775570798477671e-08, "logits/chosen": -2.0479612350463867, "logits/rejected": -2.2782015800476074, "logps/chosen": -5.009214878082275, "logps/rejected": -5.150389671325684, "loss": 0.6767, "rewards/accuracies": 1.0, "rewards/chosen": 1.0131607055664062, "rewards/margins": 0.03325432538986206, "rewards/rejected": 0.9799063801765442, "step": 4256 }, { "epoch": 2.3, "learning_rate": 1.3755507818559014e-08, "logits/chosen": -2.098555564880371, "logits/rejected": -2.0159175395965576, "logps/chosen": -38.59830093383789, "logps/rejected": -4.400814533233643, "loss": 0.1694, "rewards/accuracies": 1.0, "rewards/chosen": 1.9840167760849, "rewards/margins": 1.689530372619629, "rewards/rejected": 0.2944864332675934, "step": 4257 }, { "epoch": 2.3, "learning_rate": 1.3735457128837646e-08, "logits/chosen": -2.117694854736328, "logits/rejected": -2.1496689319610596, "logps/chosen": -2.5795364379882812, "logps/rejected": -23.83395767211914, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": 1.1034094095230103, "rewards/margins": 0.267555832862854, "rewards/rejected": 0.8358535766601562, "step": 4258 }, { "epoch": 2.3, "learning_rate": 1.3715418736112577e-08, "logits/chosen": -2.05956768989563, "logits/rejected": -2.066732883453369, "logps/chosen": -2.160316228866577, "logps/rejected": -4.769832611083984, "loss": 0.4807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981169700622559, "rewards/margins": 0.4824705123901367, "rewards/rejected": 0.5156464576721191, "step": 4259 }, { "epoch": 2.3, "learning_rate": 1.3695392647178672e-08, "logits/chosen": -2.1116397380828857, "logits/rejected": -2.3152551651000977, "logps/chosen": -2.239436626434326, "logps/rejected": -1.0764377117156982, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 1.0040041208267212, "rewards/margins": 0.009920775890350342, "rewards/rejected": 0.9940833449363708, "step": 4260 }, { "epoch": 2.3, "learning_rate": 1.3675378868826582e-08, "logits/chosen": -2.098907470703125, "logits/rejected": -2.1058571338653564, "logps/chosen": -3.07352614402771, "logps/rejected": -6.90544319152832, "loss": 0.4049, "rewards/accuracies": 1.0, "rewards/chosen": 1.5569660663604736, "rewards/margins": 0.6949554681777954, "rewards/rejected": 0.8620105981826782, "step": 4261 }, { "epoch": 2.3, "learning_rate": 1.365537740784281e-08, "logits/chosen": -2.0651936531066895, "logits/rejected": -2.056941032409668, "logps/chosen": -2.8716490268707275, "logps/rejected": -4.68328857421875, "loss": 0.4933, "rewards/accuracies": 1.0, "rewards/chosen": 1.0085006952285767, "rewards/margins": 0.44993728399276733, "rewards/rejected": 0.5585634112358093, "step": 4262 }, { "epoch": 2.3, "learning_rate": 1.3635388271009712e-08, "logits/chosen": -2.1420342922210693, "logits/rejected": -2.3294506072998047, "logps/chosen": -3.0830726623535156, "logps/rejected": -5.013336658477783, "loss": 0.74, "rewards/accuracies": 0.0, "rewards/chosen": 1.07865571975708, "rewards/margins": -0.09159839153289795, "rewards/rejected": 1.170254111289978, "step": 4263 }, { "epoch": 2.3, "learning_rate": 1.3615411465105393e-08, "logits/chosen": -2.119478225708008, "logits/rejected": -2.3227314949035645, "logps/chosen": -3.4335548877716064, "logps/rejected": -3.4141650199890137, "loss": 0.6819, "rewards/accuracies": 1.0, "rewards/chosen": 0.38136833906173706, "rewards/margins": 0.022657573223114014, "rewards/rejected": 0.35871076583862305, "step": 4264 }, { "epoch": 2.3, "learning_rate": 1.3595446996903837e-08, "logits/chosen": -2.084876298904419, "logits/rejected": -2.0840706825256348, "logps/chosen": -3.207051992416382, "logps/rejected": -11.907169342041016, "loss": 0.3672, "rewards/accuracies": 1.0, "rewards/chosen": 1.0803487300872803, "rewards/margins": 0.8127267360687256, "rewards/rejected": 0.2676219940185547, "step": 4265 }, { "epoch": 2.3, "learning_rate": 1.3575494873174837e-08, "logits/chosen": -2.094578742980957, "logits/rejected": -2.0964152812957764, "logps/chosen": -2.1096537113189697, "logps/rejected": -6.336735725402832, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 1.4907711744308472, "rewards/margins": 0.8305151462554932, "rewards/rejected": 0.660256028175354, "step": 4266 }, { "epoch": 2.3, "learning_rate": 1.3555555100683952e-08, "logits/chosen": -2.0434861183166504, "logits/rejected": -2.3331799507141113, "logps/chosen": -1.716568946838379, "logps/rejected": -7.031975269317627, "loss": 0.5513, "rewards/accuracies": 1.0, "rewards/chosen": 1.2304729223251343, "rewards/margins": 0.30714279413223267, "rewards/rejected": 0.9233301281929016, "step": 4267 }, { "epoch": 2.3, "learning_rate": 1.3535627686192614e-08, "logits/chosen": -2.0056819915771484, "logits/rejected": -2.2688982486724854, "logps/chosen": -0.7607847452163696, "logps/rejected": -0.800150990486145, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 1.0197807550430298, "rewards/margins": -0.0023641586303710938, "rewards/rejected": 1.0221449136734009, "step": 4268 }, { "epoch": 2.3, "learning_rate": 1.3515712636458026e-08, "logits/chosen": -2.042569160461426, "logits/rejected": -2.0497865676879883, "logps/chosen": -1.391871452331543, "logps/rejected": -3.4906327724456787, "loss": 0.4638, "rewards/accuracies": 1.0, "rewards/chosen": 1.0418609380722046, "rewards/margins": 0.5275889039039612, "rewards/rejected": 0.5142720341682434, "step": 4269 }, { "epoch": 2.3, "learning_rate": 1.3495809958233228e-08, "logits/chosen": -2.0235555171966553, "logits/rejected": -2.272502899169922, "logps/chosen": -0.26244479417800903, "logps/rejected": -0.29167675971984863, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 1.0238282680511475, "rewards/margins": 0.051733553409576416, "rewards/rejected": 0.972094714641571, "step": 4270 }, { "epoch": 2.3, "learning_rate": 1.347591965826705e-08, "logits/chosen": -2.1999497413635254, "logits/rejected": -2.36893892288208, "logps/chosen": -15.528751373291016, "logps/rejected": -16.570722579956055, "loss": 0.5735, "rewards/accuracies": 1.0, "rewards/chosen": 1.2266758680343628, "rewards/margins": 0.2555456757545471, "rewards/rejected": 0.9711301922798157, "step": 4271 }, { "epoch": 2.3, "learning_rate": 1.3456041743304091e-08, "logits/chosen": -2.2231531143188477, "logits/rejected": -2.065032720565796, "logps/chosen": -35.15398025512695, "logps/rejected": -4.305508613586426, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 2.7895278930664062, "rewards/margins": 2.3137192726135254, "rewards/rejected": 0.4758085310459137, "step": 4272 }, { "epoch": 2.3, "learning_rate": 1.3436176220084822e-08, "logits/chosen": -2.1381618976593018, "logits/rejected": -2.130396842956543, "logps/chosen": -3.211242914199829, "logps/rejected": -4.941079139709473, "loss": 0.5036, "rewards/accuracies": 1.0, "rewards/chosen": 1.3807867765426636, "rewards/margins": 0.4235919713973999, "rewards/rejected": 0.9571948051452637, "step": 4273 }, { "epoch": 2.31, "learning_rate": 1.341632309534544e-08, "logits/chosen": -1.9734495878219604, "logits/rejected": -1.980810284614563, "logps/chosen": -1.9356906414031982, "logps/rejected": -3.892775297164917, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": 1.170560598373413, "rewards/margins": 0.6874465346336365, "rewards/rejected": 0.4831140637397766, "step": 4274 }, { "epoch": 2.31, "learning_rate": 1.3396482375817975e-08, "logits/chosen": -1.9937883615493774, "logits/rejected": -1.9947834014892578, "logps/chosen": -0.9454348683357239, "logps/rejected": -5.723940372467041, "loss": 0.5371, "rewards/accuracies": 1.0, "rewards/chosen": 0.8144535422325134, "rewards/margins": 0.34099796414375305, "rewards/rejected": 0.4734555780887604, "step": 4275 }, { "epoch": 2.31, "learning_rate": 1.337665406823027e-08, "logits/chosen": -2.135469436645508, "logits/rejected": -2.149273633956909, "logps/chosen": -0.46929267048835754, "logps/rejected": -8.914979934692383, "loss": 0.4729, "rewards/accuracies": 1.0, "rewards/chosen": 0.9661710858345032, "rewards/margins": 0.5032052993774414, "rewards/rejected": 0.46296578645706177, "step": 4276 }, { "epoch": 2.31, "learning_rate": 1.3356838179305885e-08, "logits/chosen": -2.243662118911743, "logits/rejected": -2.3050520420074463, "logps/chosen": -8.30461597442627, "logps/rejected": -26.09505271911621, "loss": 0.4553, "rewards/accuracies": 1.0, "rewards/chosen": 1.1646971702575684, "rewards/margins": 0.5504540205001831, "rewards/rejected": 0.6142431497573853, "step": 4277 }, { "epoch": 2.31, "learning_rate": 1.3337034715764284e-08, "logits/chosen": -2.124699592590332, "logits/rejected": -2.058763265609741, "logps/chosen": -25.708724975585938, "logps/rejected": -2.1067118644714355, "loss": 0.2031, "rewards/accuracies": 1.0, "rewards/chosen": 2.2936151027679443, "rewards/margins": 1.490755319595337, "rewards/rejected": 0.8028597235679626, "step": 4278 }, { "epoch": 2.31, "learning_rate": 1.33172436843206e-08, "logits/chosen": -2.0632903575897217, "logits/rejected": -2.253831624984741, "logps/chosen": -0.2712378203868866, "logps/rejected": -0.3788902461528778, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.8564552664756775, "rewards/margins": 0.018738389015197754, "rewards/rejected": 0.8377168774604797, "step": 4279 }, { "epoch": 2.31, "learning_rate": 1.3297465091685822e-08, "logits/chosen": -2.134298324584961, "logits/rejected": -2.3440732955932617, "logps/chosen": -0.603351354598999, "logps/rejected": -0.6136577129364014, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.7453666925430298, "rewards/margins": 0.03229951858520508, "rewards/rejected": 0.7130671739578247, "step": 4280 }, { "epoch": 2.31, "learning_rate": 1.3277698944566713e-08, "logits/chosen": -2.078000545501709, "logits/rejected": -2.0853214263916016, "logps/chosen": -5.02698278427124, "logps/rejected": -3.043368339538574, "loss": 0.5047, "rewards/accuracies": 1.0, "rewards/chosen": 1.0530754327774048, "rewards/margins": 0.4207915663719177, "rewards/rejected": 0.6322838664054871, "step": 4281 }, { "epoch": 2.31, "learning_rate": 1.3257945249665781e-08, "logits/chosen": -2.0624687671661377, "logits/rejected": -2.310011148452759, "logps/chosen": -0.40341857075691223, "logps/rejected": -0.3475238084793091, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8311042189598083, "rewards/margins": 0.01964569091796875, "rewards/rejected": 0.8114585280418396, "step": 4282 }, { "epoch": 2.31, "learning_rate": 1.3238204013681354e-08, "logits/chosen": -2.062971591949463, "logits/rejected": -2.0606372356414795, "logps/chosen": -2.192079544067383, "logps/rejected": -5.522305965423584, "loss": 0.2761, "rewards/accuracies": 1.0, "rewards/chosen": 1.5346068143844604, "rewards/margins": 1.1459671258926392, "rewards/rejected": 0.3886396884918213, "step": 4283 }, { "epoch": 2.31, "learning_rate": 1.3218475243307498e-08, "logits/chosen": -2.074833393096924, "logits/rejected": -2.3252413272857666, "logps/chosen": -13.20569896697998, "logps/rejected": -15.665984153747559, "loss": 0.5692, "rewards/accuracies": 1.0, "rewards/chosen": 0.979411244392395, "rewards/margins": 0.26543253660202026, "rewards/rejected": 0.7139787077903748, "step": 4284 }, { "epoch": 2.31, "learning_rate": 1.3198758945234078e-08, "logits/chosen": -2.038839340209961, "logits/rejected": -2.0361380577087402, "logps/chosen": -1.3394949436187744, "logps/rejected": -5.301152229309082, "loss": 0.4385, "rewards/accuracies": 1.0, "rewards/chosen": 0.9051355719566345, "rewards/margins": 0.5971019268035889, "rewards/rejected": 0.30803367495536804, "step": 4285 }, { "epoch": 2.31, "learning_rate": 1.3179055126146737e-08, "logits/chosen": -2.205838441848755, "logits/rejected": -2.2078781127929688, "logps/chosen": -3.551110029220581, "logps/rejected": -0.810763955116272, "loss": 0.6069, "rewards/accuracies": 1.0, "rewards/chosen": 1.126939296722412, "rewards/margins": 0.1805773377418518, "rewards/rejected": 0.9463619589805603, "step": 4286 }, { "epoch": 2.31, "learning_rate": 1.3159363792726835e-08, "logits/chosen": -2.028541088104248, "logits/rejected": -2.3262763023376465, "logps/chosen": -0.8157221078872681, "logps/rejected": -0.7383788228034973, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 1.017236590385437, "rewards/margins": -0.021904945373535156, "rewards/rejected": 1.0391415357589722, "step": 4287 }, { "epoch": 2.31, "learning_rate": 1.3139684951651585e-08, "logits/chosen": -2.113727569580078, "logits/rejected": -2.1167709827423096, "logps/chosen": -2.9558863639831543, "logps/rejected": -3.9004454612731934, "loss": 0.4369, "rewards/accuracies": 1.0, "rewards/chosen": 1.0791666507720947, "rewards/margins": 0.6017152070999146, "rewards/rejected": 0.47745147347450256, "step": 4288 }, { "epoch": 2.31, "learning_rate": 1.312001860959387e-08, "logits/chosen": -2.045173168182373, "logits/rejected": -2.2613883018493652, "logps/chosen": -3.244635581970215, "logps/rejected": -3.0613088607788086, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.827142059803009, "rewards/margins": 0.020739078521728516, "rewards/rejected": 0.8064029812812805, "step": 4289 }, { "epoch": 2.31, "learning_rate": 1.31003647732224e-08, "logits/chosen": -2.1150364875793457, "logits/rejected": -2.1845102310180664, "logps/chosen": -14.926366806030273, "logps/rejected": -11.969795227050781, "loss": 0.4906, "rewards/accuracies": 1.0, "rewards/chosen": 1.5084625482559204, "rewards/margins": 0.45694470405578613, "rewards/rejected": 1.0515178442001343, "step": 4290 }, { "epoch": 2.31, "learning_rate": 1.308072344920163e-08, "logits/chosen": -2.0262842178344727, "logits/rejected": -2.019587516784668, "logps/chosen": -4.482400894165039, "logps/rejected": -3.972062110900879, "loss": 0.2488, "rewards/accuracies": 1.0, "rewards/chosen": 1.6161428689956665, "rewards/margins": 1.2640535831451416, "rewards/rejected": 0.3520892560482025, "step": 4291 }, { "epoch": 2.31, "learning_rate": 1.306109464419174e-08, "logits/chosen": -2.0226614475250244, "logits/rejected": -2.032418727874756, "logps/chosen": -1.4121297597885132, "logps/rejected": -3.2171080112457275, "loss": 0.3991, "rewards/accuracies": 1.0, "rewards/chosen": 1.2938495874404907, "rewards/margins": 0.7124391794204712, "rewards/rejected": 0.5814104080200195, "step": 4292 }, { "epoch": 2.32, "learning_rate": 1.3041478364848701e-08, "logits/chosen": -2.0719072818756104, "logits/rejected": -2.071406841278076, "logps/chosen": -2.2488346099853516, "logps/rejected": -4.438530921936035, "loss": 0.321, "rewards/accuracies": 1.0, "rewards/chosen": 1.4857410192489624, "rewards/margins": 0.9714170098304749, "rewards/rejected": 0.5143240094184875, "step": 4293 }, { "epoch": 2.32, "learning_rate": 1.302187461782424e-08, "logits/chosen": -2.136632204055786, "logits/rejected": -2.1462244987487793, "logps/chosen": -1.7734192609786987, "logps/rejected": -2.581867218017578, "loss": 0.5309, "rewards/accuracies": 1.0, "rewards/chosen": 1.001986026763916, "rewards/margins": 0.3560824394226074, "rewards/rejected": 0.6459035873413086, "step": 4294 }, { "epoch": 2.32, "learning_rate": 1.3002283409765796e-08, "logits/chosen": -2.2518551349639893, "logits/rejected": -2.17063045501709, "logps/chosen": -24.10346031188965, "logps/rejected": -4.452760219573975, "loss": 0.2033, "rewards/accuracies": 1.0, "rewards/chosen": 1.9997313022613525, "rewards/margins": 1.489671230316162, "rewards/rejected": 0.5100600123405457, "step": 4295 }, { "epoch": 2.32, "learning_rate": 1.2982704747316608e-08, "logits/chosen": -2.037602663040161, "logits/rejected": -2.0390677452087402, "logps/chosen": -0.5484836101531982, "logps/rejected": -4.691196441650391, "loss": 0.5168, "rewards/accuracies": 1.0, "rewards/chosen": 0.8961935043334961, "rewards/margins": 0.39051151275634766, "rewards/rejected": 0.5056819915771484, "step": 4296 }, { "epoch": 2.32, "learning_rate": 1.2963138637115584e-08, "logits/chosen": -2.110463857650757, "logits/rejected": -2.119781732559204, "logps/chosen": -1.765852928161621, "logps/rejected": -2.8642196655273438, "loss": 0.4625, "rewards/accuracies": 1.0, "rewards/chosen": 1.2069724798202515, "rewards/margins": 0.5310600996017456, "rewards/rejected": 0.6759123802185059, "step": 4297 }, { "epoch": 2.32, "learning_rate": 1.2943585085797493e-08, "logits/chosen": -1.974581003189087, "logits/rejected": -2.2727365493774414, "logps/chosen": -0.2507244050502777, "logps/rejected": -0.26623111963272095, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.958278477191925, "rewards/margins": 0.00539320707321167, "rewards/rejected": 0.9528852701187134, "step": 4298 }, { "epoch": 2.32, "learning_rate": 1.292404409999272e-08, "logits/chosen": -2.04333233833313, "logits/rejected": -2.2480030059814453, "logps/chosen": -0.27026963233947754, "logps/rejected": -0.290283203125, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.9056559801101685, "rewards/margins": 0.03535717725753784, "rewards/rejected": 0.8702988028526306, "step": 4299 }, { "epoch": 2.32, "learning_rate": 1.2904515686327472e-08, "logits/chosen": -2.1706249713897705, "logits/rejected": -2.3144500255584717, "logps/chosen": -5.303137302398682, "logps/rejected": -1.3758716583251953, "loss": 0.746, "rewards/accuracies": 0.0, "rewards/chosen": 0.9536283612251282, "rewards/margins": -0.10296708345413208, "rewards/rejected": 1.0565954446792603, "step": 4300 }, { "epoch": 2.32, "learning_rate": 1.2884999851423672e-08, "logits/chosen": -2.121829032897949, "logits/rejected": -2.1349618434906006, "logps/chosen": -1.843173623085022, "logps/rejected": -5.7315993309021, "loss": 0.3336, "rewards/accuracies": 1.0, "rewards/chosen": 1.3915961980819702, "rewards/margins": 0.9264038801193237, "rewards/rejected": 0.4651922881603241, "step": 4301 }, { "epoch": 2.32, "learning_rate": 1.2865496601898951e-08, "logits/chosen": -1.9877989292144775, "logits/rejected": -1.9759149551391602, "logps/chosen": -9.413495063781738, "logps/rejected": -9.969549179077148, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 1.5086508989334106, "rewards/margins": 0.39548468589782715, "rewards/rejected": 1.1131662130355835, "step": 4302 }, { "epoch": 2.32, "learning_rate": 1.2846005944366706e-08, "logits/chosen": -1.929731011390686, "logits/rejected": -2.2483906745910645, "logps/chosen": -3.734990358352661, "logps/rejected": -3.590054988861084, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.7370319366455078, "rewards/margins": 0.008532345294952393, "rewards/rejected": 0.7284995913505554, "step": 4303 }, { "epoch": 2.32, "learning_rate": 1.2826527885436067e-08, "logits/chosen": -2.0520520210266113, "logits/rejected": -2.251239776611328, "logps/chosen": -1.5111520290374756, "logps/rejected": -1.8271397352218628, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 1.049172282218933, "rewards/margins": 0.02958965301513672, "rewards/rejected": 1.0195826292037964, "step": 4304 }, { "epoch": 2.32, "learning_rate": 1.2807062431711852e-08, "logits/chosen": -2.1536736488342285, "logits/rejected": -2.043794870376587, "logps/chosen": -22.87487030029297, "logps/rejected": -3.6209540367126465, "loss": 0.1526, "rewards/accuracies": 1.0, "rewards/chosen": 2.3644115924835205, "rewards/margins": 1.8028502464294434, "rewards/rejected": 0.5615613460540771, "step": 4305 }, { "epoch": 2.32, "learning_rate": 1.278760958979465e-08, "logits/chosen": -2.1540417671203613, "logits/rejected": -2.038118839263916, "logps/chosen": -9.69525146484375, "logps/rejected": -2.177225112915039, "loss": 0.3348, "rewards/accuracies": 1.0, "rewards/chosen": 1.718304991722107, "rewards/margins": 0.9221389293670654, "rewards/rejected": 0.7961660623550415, "step": 4306 }, { "epoch": 2.32, "learning_rate": 1.2768169366280719e-08, "logits/chosen": -2.2070329189300537, "logits/rejected": -2.099897861480713, "logps/chosen": -18.154878616333008, "logps/rejected": -2.4028513431549072, "loss": 0.2088, "rewards/accuracies": 1.0, "rewards/chosen": 2.1781065464019775, "rewards/margins": 1.460026502609253, "rewards/rejected": 0.7180800437927246, "step": 4307 }, { "epoch": 2.32, "learning_rate": 1.2748741767762122e-08, "logits/chosen": -2.0479085445404053, "logits/rejected": -2.048293352127075, "logps/chosen": -2.15766978263855, "logps/rejected": -0.7036733031272888, "loss": 0.6249, "rewards/accuracies": 1.0, "rewards/chosen": 0.8687067031860352, "rewards/margins": 0.1414104700088501, "rewards/rejected": 0.7272962331771851, "step": 4308 }, { "epoch": 2.32, "learning_rate": 1.2729326800826556e-08, "logits/chosen": -2.119788408279419, "logits/rejected": -2.290289878845215, "logps/chosen": -1.4947612285614014, "logps/rejected": -1.4747183322906494, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.9921526312828064, "rewards/margins": 0.03075432777404785, "rewards/rejected": 0.9613983035087585, "step": 4309 }, { "epoch": 2.32, "learning_rate": 1.2709924472057477e-08, "logits/chosen": -2.1813015937805176, "logits/rejected": -2.1451313495635986, "logps/chosen": -5.927671909332275, "logps/rejected": -16.15471076965332, "loss": 0.3406, "rewards/accuracies": 1.0, "rewards/chosen": 1.2556043863296509, "rewards/margins": 0.901960015296936, "rewards/rejected": 0.35364437103271484, "step": 4310 }, { "epoch": 2.33, "learning_rate": 1.2690534788034075e-08, "logits/chosen": -2.1240487098693848, "logits/rejected": -2.126439332962036, "logps/chosen": -3.1498169898986816, "logps/rejected": -8.111295700073242, "loss": 0.4675, "rewards/accuracies": 1.0, "rewards/chosen": 1.305710792541504, "rewards/margins": 0.517543375492096, "rewards/rejected": 0.788167417049408, "step": 4311 }, { "epoch": 2.33, "learning_rate": 1.267115775533118e-08, "logits/chosen": -2.1177046298980713, "logits/rejected": -2.2276856899261475, "logps/chosen": -8.176055908203125, "logps/rejected": -20.952682495117188, "loss": 0.2459, "rewards/accuracies": 1.0, "rewards/chosen": 1.9713280200958252, "rewards/margins": 1.277551293373108, "rewards/rejected": 0.6937767267227173, "step": 4312 }, { "epoch": 2.33, "learning_rate": 1.2651793380519404e-08, "logits/chosen": -2.1494855880737305, "logits/rejected": -2.15484356880188, "logps/chosen": -4.015692234039307, "logps/rejected": -9.97827434539795, "loss": 0.3364, "rewards/accuracies": 1.0, "rewards/chosen": 1.2265771627426147, "rewards/margins": 0.9166765809059143, "rewards/rejected": 0.30990058183670044, "step": 4313 }, { "epoch": 2.33, "learning_rate": 1.2632441670165057e-08, "logits/chosen": -2.0547852516174316, "logits/rejected": -2.2896811962127686, "logps/chosen": -1.1145764589309692, "logps/rejected": -1.1272107362747192, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.7957190871238708, "rewards/margins": 0.030664682388305664, "rewards/rejected": 0.7650544047355652, "step": 4314 }, { "epoch": 2.33, "learning_rate": 1.2613102630830103e-08, "logits/chosen": -2.104919910430908, "logits/rejected": -2.31894588470459, "logps/chosen": -0.14678938686847687, "logps/rejected": -0.1600995808839798, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.039379358291626, "rewards/margins": 0.018300175666809082, "rewards/rejected": 1.021079182624817, "step": 4315 }, { "epoch": 2.33, "learning_rate": 1.2593776269072264e-08, "logits/chosen": -1.9762403964996338, "logits/rejected": -1.9767595529556274, "logps/chosen": -1.1044645309448242, "logps/rejected": -1.926161766052246, "loss": 0.558, "rewards/accuracies": 1.0, "rewards/chosen": 1.019473910331726, "rewards/margins": 0.29145026206970215, "rewards/rejected": 0.7280236482620239, "step": 4316 }, { "epoch": 2.33, "learning_rate": 1.257446259144494e-08, "logits/chosen": -2.062375068664551, "logits/rejected": -2.1338984966278076, "logps/chosen": -2.0087730884552, "logps/rejected": -18.895753860473633, "loss": 0.5555, "rewards/accuracies": 1.0, "rewards/chosen": 1.378667950630188, "rewards/margins": 0.2973440885543823, "rewards/rejected": 1.0813238620758057, "step": 4317 }, { "epoch": 2.33, "learning_rate": 1.2555161604497238e-08, "logits/chosen": -2.0482351779937744, "logits/rejected": -2.039505958557129, "logps/chosen": -3.6634531021118164, "logps/rejected": -7.570219993591309, "loss": 0.2684, "rewards/accuracies": 1.0, "rewards/chosen": 1.5983614921569824, "rewards/margins": 1.178208351135254, "rewards/rejected": 0.4201531410217285, "step": 4318 }, { "epoch": 2.33, "learning_rate": 1.253587331477397e-08, "logits/chosen": -2.093982219696045, "logits/rejected": -2.1000053882598877, "logps/chosen": -1.9324339628219604, "logps/rejected": -12.693338394165039, "loss": 0.5246, "rewards/accuracies": 1.0, "rewards/chosen": 1.3471211194992065, "rewards/margins": 0.37136828899383545, "rewards/rejected": 0.9757528305053711, "step": 4319 }, { "epoch": 2.33, "learning_rate": 1.2516597728815597e-08, "logits/chosen": -2.0860819816589355, "logits/rejected": -2.084577798843384, "logps/chosen": -0.20171338319778442, "logps/rejected": -6.17145299911499, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373015403747559, "rewards/margins": 0.38469481468200684, "rewards/rejected": 0.652606725692749, "step": 4320 }, { "epoch": 2.33, "learning_rate": 1.2497334853158342e-08, "logits/chosen": -2.1445345878601074, "logits/rejected": -2.126861095428467, "logps/chosen": -7.820252895355225, "logps/rejected": -6.442598342895508, "loss": 0.3401, "rewards/accuracies": 1.0, "rewards/chosen": 1.458582878112793, "rewards/margins": 0.9036396145820618, "rewards/rejected": 0.5549432635307312, "step": 4321 }, { "epoch": 2.33, "learning_rate": 1.2478084694334051e-08, "logits/chosen": -2.0056183338165283, "logits/rejected": -2.3017375469207764, "logps/chosen": -0.47746172547340393, "logps/rejected": -0.5308984518051147, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9460402727127075, "rewards/margins": 0.015502870082855225, "rewards/rejected": 0.9305374026298523, "step": 4322 }, { "epoch": 2.33, "learning_rate": 1.2458847258870297e-08, "logits/chosen": -1.980089545249939, "logits/rejected": -2.2728211879730225, "logps/chosen": -2.301853895187378, "logps/rejected": -2.3785738945007324, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.036787509918213, "rewards/margins": 0.016879558563232422, "rewards/rejected": 1.0199079513549805, "step": 4323 }, { "epoch": 2.33, "learning_rate": 1.2439622553290346e-08, "logits/chosen": -2.065058469772339, "logits/rejected": -2.0702130794525146, "logps/chosen": -1.0434062480926514, "logps/rejected": -12.564752578735352, "loss": 0.5408, "rewards/accuracies": 1.0, "rewards/chosen": 1.1228002309799194, "rewards/margins": 0.3321927785873413, "rewards/rejected": 0.7906074523925781, "step": 4324 }, { "epoch": 2.33, "learning_rate": 1.2420410584113106e-08, "logits/chosen": -2.1472666263580322, "logits/rejected": -2.1500468254089355, "logps/chosen": -7.287982940673828, "logps/rejected": -6.727880477905273, "loss": 0.592, "rewards/accuracies": 1.0, "rewards/chosen": 1.3418384790420532, "rewards/margins": 0.21362054347991943, "rewards/rejected": 1.1282179355621338, "step": 4325 }, { "epoch": 2.33, "learning_rate": 1.2401211357853203e-08, "logits/chosen": -2.04179048538208, "logits/rejected": -2.2504897117614746, "logps/chosen": -2.352654218673706, "logps/rejected": -0.5879546403884888, "loss": 0.7247, "rewards/accuracies": 0.0, "rewards/chosen": 1.0065032243728638, "rewards/margins": -0.06222832202911377, "rewards/rejected": 1.0687315464019775, "step": 4326 }, { "epoch": 2.33, "learning_rate": 1.2382024881020936e-08, "logits/chosen": -2.1867456436157227, "logits/rejected": -2.273528814315796, "logps/chosen": -4.106865406036377, "logps/rejected": -6.1448798179626465, "loss": 0.656, "rewards/accuracies": 1.0, "rewards/chosen": 0.9840146899223328, "rewards/margins": 0.0757211446762085, "rewards/rejected": 0.9082935452461243, "step": 4327 }, { "epoch": 2.33, "learning_rate": 1.2362851160122268e-08, "logits/chosen": -1.9700285196304321, "logits/rejected": -2.264841079711914, "logps/chosen": -2.0704116821289062, "logps/rejected": -1.803983449935913, "loss": 0.6993, "rewards/accuracies": 0.0, "rewards/chosen": 0.702822208404541, "rewards/margins": -0.012237250804901123, "rewards/rejected": 0.7150594592094421, "step": 4328 }, { "epoch": 2.33, "learning_rate": 1.2343690201658858e-08, "logits/chosen": -2.0711758136749268, "logits/rejected": -2.2768757343292236, "logps/chosen": -4.179291725158691, "logps/rejected": -3.0571138858795166, "loss": 0.6878, "rewards/accuracies": 1.0, "rewards/chosen": 0.8453285098075867, "rewards/margins": 0.010695338249206543, "rewards/rejected": 0.8346331715583801, "step": 4329 }, { "epoch": 2.34, "learning_rate": 1.2324542012127997e-08, "logits/chosen": -2.0290074348449707, "logits/rejected": -2.0354368686676025, "logps/chosen": -1.9880993366241455, "logps/rejected": -4.67413330078125, "loss": 0.4352, "rewards/accuracies": 1.0, "rewards/chosen": 1.1104512214660645, "rewards/margins": 0.6065893173217773, "rewards/rejected": 0.5038619041442871, "step": 4330 }, { "epoch": 2.34, "learning_rate": 1.23054065980227e-08, "logits/chosen": -2.120577812194824, "logits/rejected": -2.2456867694854736, "logps/chosen": -0.7430810928344727, "logps/rejected": -0.7895572781562805, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 1.0149784088134766, "rewards/margins": 0.007895350456237793, "rewards/rejected": 1.0070830583572388, "step": 4331 }, { "epoch": 2.34, "learning_rate": 1.22862839658316e-08, "logits/chosen": -2.0387322902679443, "logits/rejected": -2.0930137634277344, "logps/chosen": -2.642749071121216, "logps/rejected": -23.578781127929688, "loss": 0.3552, "rewards/accuracies": 1.0, "rewards/chosen": 1.1221667528152466, "rewards/margins": 0.8521384000778198, "rewards/rejected": 0.27002832293510437, "step": 4332 }, { "epoch": 2.34, "learning_rate": 1.226717412203902e-08, "logits/chosen": -2.0690159797668457, "logits/rejected": -2.2815725803375244, "logps/chosen": -7.370222091674805, "logps/rejected": -9.186380386352539, "loss": 0.5957, "rewards/accuracies": 1.0, "rewards/chosen": 0.9498342871665955, "rewards/margins": 0.20542526245117188, "rewards/rejected": 0.7444090247154236, "step": 4333 }, { "epoch": 2.34, "learning_rate": 1.2248077073124974e-08, "logits/chosen": -2.099600076675415, "logits/rejected": -2.3307907581329346, "logps/chosen": -7.122461795806885, "logps/rejected": -0.3236941695213318, "loss": 0.6185, "rewards/accuracies": 1.0, "rewards/chosen": 1.1381447315216064, "rewards/margins": 0.1553831696510315, "rewards/rejected": 0.982761561870575, "step": 4334 }, { "epoch": 2.34, "learning_rate": 1.2228992825565065e-08, "logits/chosen": -2.012078046798706, "logits/rejected": -2.276808500289917, "logps/chosen": -0.766122579574585, "logps/rejected": -0.8324031829833984, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 1.0373538732528687, "rewards/margins": 0.005205273628234863, "rewards/rejected": 1.0321485996246338, "step": 4335 }, { "epoch": 2.34, "learning_rate": 1.2209921385830619e-08, "logits/chosen": -1.9964039325714111, "logits/rejected": -2.281994581222534, "logps/chosen": -8.849418640136719, "logps/rejected": -6.908061504364014, "loss": 0.7353, "rewards/accuracies": 0.0, "rewards/chosen": 0.9558910727500916, "rewards/margins": -0.0826534628868103, "rewards/rejected": 1.0385445356369019, "step": 4336 }, { "epoch": 2.34, "learning_rate": 1.2190862760388598e-08, "logits/chosen": -2.0551156997680664, "logits/rejected": -2.2694945335388184, "logps/chosen": -0.09552937746047974, "logps/rejected": -0.09563849866390228, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.8671395182609558, "rewards/margins": 0.026007235050201416, "rewards/rejected": 0.8411322832107544, "step": 4337 }, { "epoch": 2.34, "learning_rate": 1.2171816955701619e-08, "logits/chosen": -2.0788958072662354, "logits/rejected": -2.07824444770813, "logps/chosen": -2.746429443359375, "logps/rejected": -3.6824936866760254, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 0.8597202301025391, "rewards/margins": 0.2778564691543579, "rewards/rejected": 0.5818637609481812, "step": 4338 }, { "epoch": 2.34, "learning_rate": 1.2152783978227971e-08, "logits/chosen": -2.1058056354522705, "logits/rejected": -2.294898271560669, "logps/chosen": -0.4062122404575348, "logps/rejected": -0.39205941557884216, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.9292960166931152, "rewards/margins": 0.022181391716003418, "rewards/rejected": 0.9071146249771118, "step": 4339 }, { "epoch": 2.34, "learning_rate": 1.2133763834421529e-08, "logits/chosen": -2.0067102909088135, "logits/rejected": -2.0057828426361084, "logps/chosen": -0.6604344844818115, "logps/rejected": -1.7011730670928955, "loss": 0.589, "rewards/accuracies": 1.0, "rewards/chosen": 1.0404047966003418, "rewards/margins": 0.22030973434448242, "rewards/rejected": 0.8200950622558594, "step": 4340 }, { "epoch": 2.34, "learning_rate": 1.2114756530731895e-08, "logits/chosen": -2.199831247329712, "logits/rejected": -2.206967353820801, "logps/chosen": -1.6177846193313599, "logps/rejected": -5.119282245635986, "loss": 0.3154, "rewards/accuracies": 1.0, "rewards/chosen": 1.3575907945632935, "rewards/margins": 0.9920387864112854, "rewards/rejected": 0.36555200815200806, "step": 4341 }, { "epoch": 2.34, "learning_rate": 1.2095762073604283e-08, "logits/chosen": -2.109785556793213, "logits/rejected": -2.263745069503784, "logps/chosen": -0.2113468199968338, "logps/rejected": -0.19617998600006104, "loss": 0.6769, "rewards/accuracies": 1.0, "rewards/chosen": 0.9469936490058899, "rewards/margins": 0.032723307609558105, "rewards/rejected": 0.9142703413963318, "step": 4342 }, { "epoch": 2.34, "learning_rate": 1.2076780469479531e-08, "logits/chosen": -2.148883104324341, "logits/rejected": -2.307025909423828, "logps/chosen": -0.2776980400085449, "logps/rejected": -6.9480695724487305, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.948494553565979, "rewards/margins": 0.22153568267822266, "rewards/rejected": 0.7269588708877563, "step": 4343 }, { "epoch": 2.34, "learning_rate": 1.2057811724794159e-08, "logits/chosen": -2.336893081665039, "logits/rejected": -2.2161858081817627, "logps/chosen": -30.425445556640625, "logps/rejected": -2.7030515670776367, "loss": 0.2331, "rewards/accuracies": 1.0, "rewards/chosen": 2.110532522201538, "rewards/margins": 1.3374844789505005, "rewards/rejected": 0.7730480432510376, "step": 4344 }, { "epoch": 2.34, "learning_rate": 1.2038855845980283e-08, "logits/chosen": -2.1221866607666016, "logits/rejected": -2.276860237121582, "logps/chosen": -0.48542773723602295, "logps/rejected": -0.46026375889778137, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0020145177841187, "rewards/margins": 0.024064719676971436, "rewards/rejected": 0.9779497981071472, "step": 4345 }, { "epoch": 2.34, "learning_rate": 1.2019912839465695e-08, "logits/chosen": -2.1461246013641357, "logits/rejected": -2.389690399169922, "logps/chosen": -9.856191635131836, "logps/rejected": -13.308164596557617, "loss": 0.7338, "rewards/accuracies": 0.0, "rewards/chosen": 0.9717245101928711, "rewards/margins": -0.07963871955871582, "rewards/rejected": 1.051363229751587, "step": 4346 }, { "epoch": 2.34, "learning_rate": 1.2000982711673807e-08, "logits/chosen": -2.0191900730133057, "logits/rejected": -2.024439573287964, "logps/chosen": -1.7093379497528076, "logps/rejected": -3.901634454727173, "loss": 0.4842, "rewards/accuracies": 1.0, "rewards/chosen": 0.9723250269889832, "rewards/margins": 0.47347867488861084, "rewards/rejected": 0.4988463521003723, "step": 4347 }, { "epoch": 2.35, "learning_rate": 1.1982065469023661e-08, "logits/chosen": -2.0775270462036133, "logits/rejected": -2.2724177837371826, "logps/chosen": -2.741959810256958, "logps/rejected": -5.910763263702393, "loss": 0.643, "rewards/accuracies": 1.0, "rewards/chosen": 0.6141619682312012, "rewards/margins": 0.10287070274353027, "rewards/rejected": 0.5112912654876709, "step": 4348 }, { "epoch": 2.35, "learning_rate": 1.1963161117929943e-08, "logits/chosen": -2.093230724334717, "logits/rejected": -2.223731756210327, "logps/chosen": -0.8278281092643738, "logps/rejected": -0.8671513795852661, "loss": 0.6727, "rewards/accuracies": 1.0, "rewards/chosen": 0.7500254511833191, "rewards/margins": 0.041230976581573486, "rewards/rejected": 0.7087944746017456, "step": 4349 }, { "epoch": 2.35, "learning_rate": 1.194426966480293e-08, "logits/chosen": -2.1142795085906982, "logits/rejected": -2.3303282260894775, "logps/chosen": -2.2208235263824463, "logps/rejected": -6.4777512550354, "loss": 0.6429, "rewards/accuracies": 1.0, "rewards/chosen": 1.008110761642456, "rewards/margins": 0.10315811634063721, "rewards/rejected": 0.9049526453018188, "step": 4350 }, { "epoch": 2.35, "learning_rate": 1.1925391116048573e-08, "logits/chosen": -2.0822560787200928, "logits/rejected": -2.322284460067749, "logps/chosen": -0.2664515972137451, "logps/rejected": -0.33166223764419556, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 0.9058623313903809, "rewards/margins": 0.01888275146484375, "rewards/rejected": 0.8869795799255371, "step": 4351 }, { "epoch": 2.35, "learning_rate": 1.1906525478068436e-08, "logits/chosen": -2.096116304397583, "logits/rejected": -2.3565258979797363, "logps/chosen": -1.0404689311981201, "logps/rejected": -1.0529589653015137, "loss": 0.6892, "rewards/accuracies": 1.0, "rewards/chosen": 0.8037461638450623, "rewards/margins": 0.007817745208740234, "rewards/rejected": 0.795928418636322, "step": 4352 }, { "epoch": 2.35, "learning_rate": 1.1887672757259659e-08, "logits/chosen": -2.019198417663574, "logits/rejected": -2.029477119445801, "logps/chosen": -1.8729183673858643, "logps/rejected": -2.5824151039123535, "loss": 0.3992, "rewards/accuracies": 1.0, "rewards/chosen": 1.4257296323776245, "rewards/margins": 0.7119259238243103, "rewards/rejected": 0.7138037085533142, "step": 4353 }, { "epoch": 2.35, "learning_rate": 1.186883296001508e-08, "logits/chosen": -2.058936595916748, "logits/rejected": -2.049295425415039, "logps/chosen": -3.999103546142578, "logps/rejected": -6.554877758026123, "loss": 0.3212, "rewards/accuracies": 1.0, "rewards/chosen": 1.4086217880249023, "rewards/margins": 0.9707798957824707, "rewards/rejected": 0.43784186244010925, "step": 4354 }, { "epoch": 2.35, "learning_rate": 1.1850006092723064e-08, "logits/chosen": -1.9429931640625, "logits/rejected": -2.27378249168396, "logps/chosen": -0.6893490552902222, "logps/rejected": -0.659268319606781, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.8659130930900574, "rewards/margins": 0.014554738998413086, "rewards/rejected": 0.8513583540916443, "step": 4355 }, { "epoch": 2.35, "learning_rate": 1.1831192161767684e-08, "logits/chosen": -2.0399155616760254, "logits/rejected": -2.26308274269104, "logps/chosen": -1.0195704698562622, "logps/rejected": -1.0270799398422241, "loss": 0.6762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9071604013442993, "rewards/margins": 0.03420060873031616, "rewards/rejected": 0.8729597926139832, "step": 4356 }, { "epoch": 2.35, "learning_rate": 1.1812391173528585e-08, "logits/chosen": -2.0655157566070557, "logits/rejected": -2.0692732334136963, "logps/chosen": -4.412642478942871, "logps/rejected": -1.9513226747512817, "loss": 0.47, "rewards/accuracies": 1.0, "rewards/chosen": 1.420348882675171, "rewards/margins": 0.5109646320343018, "rewards/rejected": 0.9093842506408691, "step": 4357 }, { "epoch": 2.35, "learning_rate": 1.1793603134380992e-08, "logits/chosen": -2.019834280014038, "logits/rejected": -2.0363309383392334, "logps/chosen": -3.3719892501831055, "logps/rejected": -8.227333068847656, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 1.1153756380081177, "rewards/margins": 0.5835778117179871, "rewards/rejected": 0.5317978262901306, "step": 4358 }, { "epoch": 2.35, "learning_rate": 1.1774828050695795e-08, "logits/chosen": -2.170516014099121, "logits/rejected": -2.1734976768493652, "logps/chosen": -2.6910171508789062, "logps/rejected": -4.242034435272217, "loss": 0.3957, "rewards/accuracies": 1.0, "rewards/chosen": 1.2233484983444214, "rewards/margins": 0.7226541638374329, "rewards/rejected": 0.5006943345069885, "step": 4359 }, { "epoch": 2.35, "learning_rate": 1.1756065928839431e-08, "logits/chosen": -1.9522066116333008, "logits/rejected": -2.2931625843048096, "logps/chosen": -0.19499270617961884, "logps/rejected": -0.2769930958747864, "loss": 0.7056, "rewards/accuracies": 0.0, "rewards/chosen": 0.9287724494934082, "rewards/margins": -0.024787068367004395, "rewards/rejected": 0.9535595178604126, "step": 4360 }, { "epoch": 2.35, "learning_rate": 1.1737316775174005e-08, "logits/chosen": -2.153301954269409, "logits/rejected": -2.017886161804199, "logps/chosen": -31.20040512084961, "logps/rejected": -2.119335174560547, "loss": 0.1515, "rewards/accuracies": 1.0, "rewards/chosen": 2.5147812366485596, "rewards/margins": 1.8107821941375732, "rewards/rejected": 0.7039989829063416, "step": 4361 }, { "epoch": 2.35, "learning_rate": 1.1718580596057193e-08, "logits/chosen": -2.0914106369018555, "logits/rejected": -2.2820520401000977, "logps/chosen": -1.8488489389419556, "logps/rejected": -6.574329853057861, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 0.7616173624992371, "rewards/margins": 0.27797433733940125, "rewards/rejected": 0.4836430251598358, "step": 4362 }, { "epoch": 2.35, "learning_rate": 1.169985739784225e-08, "logits/chosen": -2.064554452896118, "logits/rejected": -2.0753183364868164, "logps/chosen": -1.9759461879730225, "logps/rejected": -2.6467981338500977, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 1.0738747119903564, "rewards/margins": 0.41974538564682007, "rewards/rejected": 0.6541293263435364, "step": 4363 }, { "epoch": 2.35, "learning_rate": 1.1681147186878071e-08, "logits/chosen": -2.1434569358825684, "logits/rejected": -2.1434433460235596, "logps/chosen": -2.6707777976989746, "logps/rejected": -2.175764322280884, "loss": 0.6378, "rewards/accuracies": 1.0, "rewards/chosen": 0.9427501559257507, "rewards/margins": 0.11393088102340698, "rewards/rejected": 0.8288192749023438, "step": 4364 }, { "epoch": 2.35, "learning_rate": 1.166244996950912e-08, "logits/chosen": -2.0998263359069824, "logits/rejected": -2.1034164428710938, "logps/chosen": -1.3174391984939575, "logps/rejected": -1.879409670829773, "loss": 0.6063, "rewards/accuracies": 1.0, "rewards/chosen": 1.1473578214645386, "rewards/margins": 0.18196839094161987, "rewards/rejected": 0.9653894305229187, "step": 4365 }, { "epoch": 2.35, "learning_rate": 1.1643765752075468e-08, "logits/chosen": -2.0170798301696777, "logits/rejected": -2.2897324562072754, "logps/chosen": -0.256285697221756, "logps/rejected": -0.24550394713878632, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9877535104751587, "rewards/margins": 0.019133567810058594, "rewards/rejected": 0.9686199426651001, "step": 4366 }, { "epoch": 2.36, "learning_rate": 1.1625094540912795e-08, "logits/chosen": -2.076408863067627, "logits/rejected": -2.0766468048095703, "logps/chosen": -1.5319805145263672, "logps/rejected": -4.029882907867432, "loss": 0.554, "rewards/accuracies": 1.0, "rewards/chosen": 0.8038972020149231, "rewards/margins": 0.30073082447052, "rewards/rejected": 0.5031663775444031, "step": 4367 }, { "epoch": 2.36, "learning_rate": 1.1606436342352304e-08, "logits/chosen": -2.182037830352783, "logits/rejected": -2.187232494354248, "logps/chosen": -3.3920812606811523, "logps/rejected": -6.421688079833984, "loss": 0.36, "rewards/accuracies": 1.0, "rewards/chosen": 1.7012821435928345, "rewards/margins": 0.8362558484077454, "rewards/rejected": 0.8650262951850891, "step": 4368 }, { "epoch": 2.36, "learning_rate": 1.1587791162720873e-08, "logits/chosen": -2.060699701309204, "logits/rejected": -2.0728156566619873, "logps/chosen": -5.4641265869140625, "logps/rejected": -9.473455429077148, "loss": 0.2777, "rewards/accuracies": 1.0, "rewards/chosen": 1.7002750635147095, "rewards/margins": 1.1393089294433594, "rewards/rejected": 0.5609661340713501, "step": 4369 }, { "epoch": 2.36, "learning_rate": 1.156915900834089e-08, "logits/chosen": -2.175157308578491, "logits/rejected": -2.1672956943511963, "logps/chosen": -6.842706680297852, "logps/rejected": -2.9428670406341553, "loss": 0.462, "rewards/accuracies": 1.0, "rewards/chosen": 1.2544065713882446, "rewards/margins": 0.5324289202690125, "rewards/rejected": 0.7219776511192322, "step": 4370 }, { "epoch": 2.36, "learning_rate": 1.1550539885530375e-08, "logits/chosen": -2.0719685554504395, "logits/rejected": -2.0713610649108887, "logps/chosen": -1.253324031829834, "logps/rejected": -3.027231454849243, "loss": 0.6029, "rewards/accuracies": 1.0, "rewards/chosen": 1.0053232908248901, "rewards/margins": 0.18944013118743896, "rewards/rejected": 0.8158831596374512, "step": 4371 }, { "epoch": 2.36, "learning_rate": 1.1531933800602923e-08, "logits/chosen": -2.1304335594177246, "logits/rejected": -2.2830758094787598, "logps/chosen": -1.4507033824920654, "logps/rejected": -3.076801061630249, "loss": 0.6021, "rewards/accuracies": 1.0, "rewards/chosen": 0.9104560017585754, "rewards/margins": 0.19127130508422852, "rewards/rejected": 0.7191846966743469, "step": 4372 }, { "epoch": 2.36, "learning_rate": 1.1513340759867673e-08, "logits/chosen": -2.047515392303467, "logits/rejected": -2.061586618423462, "logps/chosen": -1.5702683925628662, "logps/rejected": -7.105440139770508, "loss": 0.4403, "rewards/accuracies": 1.0, "rewards/chosen": 1.2587591409683228, "rewards/margins": 0.592095673084259, "rewards/rejected": 0.6666634678840637, "step": 4373 }, { "epoch": 2.36, "learning_rate": 1.149476076962938e-08, "logits/chosen": -2.0441417694091797, "logits/rejected": -2.273401975631714, "logps/chosen": -0.8546847105026245, "logps/rejected": -0.877183198928833, "loss": 0.6831, "rewards/accuracies": 1.0, "rewards/chosen": 1.0317481756210327, "rewards/margins": 0.020144343376159668, "rewards/rejected": 1.011603832244873, "step": 4374 }, { "epoch": 2.36, "learning_rate": 1.1476193836188358e-08, "logits/chosen": -2.0213205814361572, "logits/rejected": -2.200137138366699, "logps/chosen": -1.0409876108169556, "logps/rejected": -1.122497320175171, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.8587643504142761, "rewards/margins": 0.01045757532119751, "rewards/rejected": 0.8483067750930786, "step": 4375 }, { "epoch": 2.36, "learning_rate": 1.1457639965840498e-08, "logits/chosen": -2.0113778114318848, "logits/rejected": -2.0090086460113525, "logps/chosen": -7.085110664367676, "logps/rejected": -4.330959796905518, "loss": 0.3025, "rewards/accuracies": 1.0, "rewards/chosen": 1.461722731590271, "rewards/margins": 1.0404837131500244, "rewards/rejected": 0.42123904824256897, "step": 4376 }, { "epoch": 2.36, "learning_rate": 1.143909916487727e-08, "logits/chosen": -2.1513149738311768, "logits/rejected": -2.1422009468078613, "logps/chosen": -3.3624281883239746, "logps/rejected": -9.125581741333008, "loss": 0.3492, "rewards/accuracies": 1.0, "rewards/chosen": 1.3107566833496094, "rewards/margins": 0.8724166750907898, "rewards/rejected": 0.4383400082588196, "step": 4377 }, { "epoch": 2.36, "learning_rate": 1.1420571439585674e-08, "logits/chosen": -1.9858530759811401, "logits/rejected": -2.243154764175415, "logps/chosen": -0.6654717922210693, "logps/rejected": -3.460536003112793, "loss": 0.5369, "rewards/accuracies": 1.0, "rewards/chosen": 1.006622076034546, "rewards/margins": 0.3415599465370178, "rewards/rejected": 0.6650621294975281, "step": 4378 }, { "epoch": 2.36, "learning_rate": 1.1402056796248339e-08, "logits/chosen": -2.144867420196533, "logits/rejected": -2.221315860748291, "logps/chosen": -1.6714060306549072, "logps/rejected": -1.606151819229126, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.9861275553703308, "rewards/margins": 0.0047833919525146484, "rewards/rejected": 0.9813441634178162, "step": 4379 }, { "epoch": 2.36, "learning_rate": 1.1383555241143378e-08, "logits/chosen": -2.1910698413848877, "logits/rejected": -2.0774314403533936, "logps/chosen": -36.86180114746094, "logps/rejected": -2.068260669708252, "loss": 0.1623, "rewards/accuracies": 1.0, "rewards/chosen": 2.4972198009490967, "rewards/margins": 1.736067533493042, "rewards/rejected": 0.7611522078514099, "step": 4380 }, { "epoch": 2.36, "learning_rate": 1.1365066780544541e-08, "logits/chosen": -2.0751383304595947, "logits/rejected": -2.291552782058716, "logps/chosen": -0.8838178515434265, "logps/rejected": -1.5788062810897827, "loss": 0.7129, "rewards/accuracies": 0.0, "rewards/chosen": 0.8756747245788574, "rewards/margins": -0.0392112135887146, "rewards/rejected": 0.914885938167572, "step": 4381 }, { "epoch": 2.36, "learning_rate": 1.1346591420721107e-08, "logits/chosen": -2.017923593521118, "logits/rejected": -1.9878331422805786, "logps/chosen": -8.354893684387207, "logps/rejected": -5.703448295593262, "loss": 0.4033, "rewards/accuracies": 1.0, "rewards/chosen": 1.4242764711380005, "rewards/margins": 0.6997974514961243, "rewards/rejected": 0.7244790196418762, "step": 4382 }, { "epoch": 2.36, "learning_rate": 1.1328129167937894e-08, "logits/chosen": -2.1991543769836426, "logits/rejected": -2.2074031829833984, "logps/chosen": -2.6218550205230713, "logps/rejected": -6.10210657119751, "loss": 0.2896, "rewards/accuracies": 1.0, "rewards/chosen": 1.3534268140792847, "rewards/margins": 1.0908188819885254, "rewards/rejected": 0.2626079022884369, "step": 4383 }, { "epoch": 2.36, "learning_rate": 1.1309680028455298e-08, "logits/chosen": -2.0917227268218994, "logits/rejected": -2.316446304321289, "logps/chosen": -0.10402227193117142, "logps/rejected": -0.11703336983919144, "loss": 0.6886, "rewards/accuracies": 1.0, "rewards/chosen": 0.9359450340270996, "rewards/margins": 0.009199976921081543, "rewards/rejected": 0.9267450571060181, "step": 4384 }, { "epoch": 2.37, "learning_rate": 1.1291244008529278e-08, "logits/chosen": -2.0453169345855713, "logits/rejected": -2.0429728031158447, "logps/chosen": -0.6031396389007568, "logps/rejected": -6.820355415344238, "loss": 0.3864, "rewards/accuracies": 1.0, "rewards/chosen": 1.124236822128296, "rewards/margins": 0.7515159845352173, "rewards/rejected": 0.3727208077907562, "step": 4385 }, { "epoch": 2.37, "learning_rate": 1.1272821114411318e-08, "logits/chosen": -1.9675140380859375, "logits/rejected": -1.9630045890808105, "logps/chosen": -3.7375850677490234, "logps/rejected": -5.515553951263428, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 1.7056735754013062, "rewards/margins": 1.2186825275421143, "rewards/rejected": 0.4869910776615143, "step": 4386 }, { "epoch": 2.37, "learning_rate": 1.1254411352348486e-08, "logits/chosen": -2.141672372817993, "logits/rejected": -2.139737844467163, "logps/chosen": -3.2771570682525635, "logps/rejected": -2.32415771484375, "loss": 0.4949, "rewards/accuracies": 1.0, "rewards/chosen": 1.1617006063461304, "rewards/margins": 0.4458710551261902, "rewards/rejected": 0.7158295512199402, "step": 4387 }, { "epoch": 2.37, "learning_rate": 1.1236014728583349e-08, "logits/chosen": -1.9988155364990234, "logits/rejected": -2.005904197692871, "logps/chosen": -1.6034594774246216, "logps/rejected": -4.588597774505615, "loss": 0.4441, "rewards/accuracies": 1.0, "rewards/chosen": 0.9930077791213989, "rewards/margins": 0.5814549922943115, "rewards/rejected": 0.411552757024765, "step": 4388 }, { "epoch": 2.37, "learning_rate": 1.1217631249354049e-08, "logits/chosen": -2.2375411987304688, "logits/rejected": -2.271409511566162, "logps/chosen": -9.452767372131348, "logps/rejected": -7.271456718444824, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 0.9695948958396912, "rewards/margins": 0.0644075870513916, "rewards/rejected": 0.9051873087882996, "step": 4389 }, { "epoch": 2.37, "learning_rate": 1.1199260920894282e-08, "logits/chosen": -2.112243413925171, "logits/rejected": -2.310300350189209, "logps/chosen": -0.5288763046264648, "logps/rejected": -0.581636905670166, "loss": 0.6923, "rewards/accuracies": 1.0, "rewards/chosen": 0.9141837358474731, "rewards/margins": 0.0017264485359191895, "rewards/rejected": 0.912457287311554, "step": 4390 }, { "epoch": 2.37, "learning_rate": 1.1180903749433252e-08, "logits/chosen": -2.029071569442749, "logits/rejected": -2.2876780033111572, "logps/chosen": -0.6274864077568054, "logps/rejected": -0.4879450798034668, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.9278878569602966, "rewards/margins": 0.02179861068725586, "rewards/rejected": 0.9060892462730408, "step": 4391 }, { "epoch": 2.37, "learning_rate": 1.1162559741195731e-08, "logits/chosen": -2.0388331413269043, "logits/rejected": -2.1849920749664307, "logps/chosen": -1.222030520439148, "logps/rejected": -1.1741268634796143, "loss": 0.6857, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515752792358398, "rewards/margins": 0.014926016330718994, "rewards/rejected": 0.9366492629051208, "step": 4392 }, { "epoch": 2.37, "learning_rate": 1.1144228902402002e-08, "logits/chosen": -2.140692949295044, "logits/rejected": -2.1419484615325928, "logps/chosen": -3.07248592376709, "logps/rejected": -4.216122150421143, "loss": 0.5469, "rewards/accuracies": 1.0, "rewards/chosen": 1.044325590133667, "rewards/margins": 0.3175695538520813, "rewards/rejected": 0.7267560362815857, "step": 4393 }, { "epoch": 2.37, "learning_rate": 1.1125911239267904e-08, "logits/chosen": -1.9777024984359741, "logits/rejected": -2.311445474624634, "logps/chosen": -3.1002659797668457, "logps/rejected": -2.924851179122925, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 1.1068458557128906, "rewards/margins": 0.012813925743103027, "rewards/rejected": 1.0940319299697876, "step": 4394 }, { "epoch": 2.37, "learning_rate": 1.1107606758004801e-08, "logits/chosen": -2.191455602645874, "logits/rejected": -2.1902499198913574, "logps/chosen": -0.34192225337028503, "logps/rejected": -4.445052146911621, "loss": 0.4442, "rewards/accuracies": 1.0, "rewards/chosen": 0.9363028407096863, "rewards/margins": 0.5812245607376099, "rewards/rejected": 0.3550783097743988, "step": 4395 }, { "epoch": 2.37, "learning_rate": 1.1089315464819576e-08, "logits/chosen": -2.114351511001587, "logits/rejected": -2.2557246685028076, "logps/chosen": -4.681334495544434, "logps/rejected": -3.678171396255493, "loss": 0.5865, "rewards/accuracies": 1.0, "rewards/chosen": 1.1069291830062866, "rewards/margins": 0.22600215673446655, "rewards/rejected": 0.8809270262718201, "step": 4396 }, { "epoch": 2.37, "learning_rate": 1.1071037365914682e-08, "logits/chosen": -2.1633410453796387, "logits/rejected": -2.1451261043548584, "logps/chosen": -12.621434211730957, "logps/rejected": -1.480432391166687, "loss": 0.4069, "rewards/accuracies": 1.0, "rewards/chosen": 1.6347408294677734, "rewards/margins": 0.6887854337692261, "rewards/rejected": 0.9459553956985474, "step": 4397 }, { "epoch": 2.37, "learning_rate": 1.105277246748802e-08, "logits/chosen": -2.120953321456909, "logits/rejected": -2.291105031967163, "logps/chosen": -4.160545825958252, "logps/rejected": -3.7925877571105957, "loss": 0.7072, "rewards/accuracies": 0.0, "rewards/chosen": 0.9740785956382751, "rewards/margins": -0.027838289737701416, "rewards/rejected": 1.0019168853759766, "step": 4398 }, { "epoch": 2.37, "learning_rate": 1.1034520775733085e-08, "logits/chosen": -2.1734163761138916, "logits/rejected": -2.1340901851654053, "logps/chosen": -25.410289764404297, "logps/rejected": -12.454483032226562, "loss": 0.3331, "rewards/accuracies": 1.0, "rewards/chosen": 1.951931357383728, "rewards/margins": 0.9283143281936646, "rewards/rejected": 1.0236170291900635, "step": 4399 }, { "epoch": 2.37, "learning_rate": 1.1016282296838886e-08, "logits/chosen": -2.1056880950927734, "logits/rejected": -2.336994171142578, "logps/chosen": -1.4523496627807617, "logps/rejected": -1.2413476705551147, "loss": 0.7043, "rewards/accuracies": 0.0, "rewards/chosen": 1.0595430135726929, "rewards/margins": -0.02228093147277832, "rewards/rejected": 1.0818239450454712, "step": 4400 }, { "epoch": 2.37, "learning_rate": 1.0998057036989904e-08, "logits/chosen": -2.0668089389801025, "logits/rejected": -2.0690462589263916, "logps/chosen": -1.744753122329712, "logps/rejected": -1.4161450862884521, "loss": 0.5082, "rewards/accuracies": 1.0, "rewards/chosen": 1.3088229894638062, "rewards/margins": 0.411948561668396, "rewards/rejected": 0.8968744277954102, "step": 4401 }, { "epoch": 2.37, "learning_rate": 1.097984500236619e-08, "logits/chosen": -2.0310475826263428, "logits/rejected": -2.0235862731933594, "logps/chosen": -27.923547744750977, "logps/rejected": -8.374934196472168, "loss": 0.2083, "rewards/accuracies": 1.0, "rewards/chosen": 2.023482084274292, "rewards/margins": 1.4630037546157837, "rewards/rejected": 0.5604783296585083, "step": 4402 }, { "epoch": 2.37, "learning_rate": 1.0961646199143271e-08, "logits/chosen": -2.1518170833587646, "logits/rejected": -2.2797019481658936, "logps/chosen": -2.2380805015563965, "logps/rejected": -0.7246890068054199, "loss": 0.6531, "rewards/accuracies": 1.0, "rewards/chosen": 0.9591415524482727, "rewards/margins": 0.08185863494873047, "rewards/rejected": 0.8772829174995422, "step": 4403 }, { "epoch": 2.38, "learning_rate": 1.0943460633492203e-08, "logits/chosen": -2.0027079582214355, "logits/rejected": -2.257967710494995, "logps/chosen": -0.7499851584434509, "logps/rejected": -0.7670649886131287, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.8947637677192688, "rewards/margins": -0.005835354328155518, "rewards/rejected": 0.9005991220474243, "step": 4404 }, { "epoch": 2.38, "learning_rate": 1.0925288311579588e-08, "logits/chosen": -2.1913416385650635, "logits/rejected": -2.1571085453033447, "logps/chosen": -17.47142219543457, "logps/rejected": -3.387033224105835, "loss": 0.2724, "rewards/accuracies": 1.0, "rewards/chosen": 1.7063604593276978, "rewards/margins": 1.160994052886963, "rewards/rejected": 0.5453664660453796, "step": 4405 }, { "epoch": 2.38, "learning_rate": 1.0907129239567481e-08, "logits/chosen": -2.010605812072754, "logits/rejected": -2.010373115539551, "logps/chosen": -0.8557835817337036, "logps/rejected": -6.680771827697754, "loss": 0.4656, "rewards/accuracies": 1.0, "rewards/chosen": 1.0840963125228882, "rewards/margins": 0.522567629814148, "rewards/rejected": 0.5615286827087402, "step": 4406 }, { "epoch": 2.38, "learning_rate": 1.0888983423613479e-08, "logits/chosen": -2.158357858657837, "logits/rejected": -2.3128015995025635, "logps/chosen": -0.41973212361335754, "logps/rejected": -0.41902559995651245, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 1.0607560873031616, "rewards/margins": 0.001173853874206543, "rewards/rejected": 1.059582233428955, "step": 4407 }, { "epoch": 2.38, "learning_rate": 1.0870850869870657e-08, "logits/chosen": -2.101405382156372, "logits/rejected": -2.0968990325927734, "logps/chosen": -2.321831464767456, "logps/rejected": -6.116017818450928, "loss": 0.3586, "rewards/accuracies": 1.0, "rewards/chosen": 1.3260583877563477, "rewards/margins": 0.8409202098846436, "rewards/rejected": 0.4851381778717041, "step": 4408 }, { "epoch": 2.38, "learning_rate": 1.0852731584487612e-08, "logits/chosen": -2.289891004562378, "logits/rejected": -2.133589029312134, "logps/chosen": -27.600120544433594, "logps/rejected": -4.032565593719482, "loss": 0.1629, "rewards/accuracies": 1.0, "rewards/chosen": 2.2396628856658936, "rewards/margins": 1.7317826747894287, "rewards/rejected": 0.5078802704811096, "step": 4409 }, { "epoch": 2.38, "learning_rate": 1.0834625573608464e-08, "logits/chosen": -2.213252544403076, "logits/rejected": -2.215932846069336, "logps/chosen": -0.2377970814704895, "logps/rejected": -4.546738624572754, "loss": 0.434, "rewards/accuracies": 1.0, "rewards/chosen": 1.0485800504684448, "rewards/margins": 0.6097608804702759, "rewards/rejected": 0.43881914019584656, "step": 4410 }, { "epoch": 2.38, "learning_rate": 1.0816532843372772e-08, "logits/chosen": -2.2077114582061768, "logits/rejected": -2.070868730545044, "logps/chosen": -43.31851577758789, "logps/rejected": -3.047330856323242, "loss": 0.1244, "rewards/accuracies": 1.0, "rewards/chosen": 2.7368648052215576, "rewards/margins": 2.0216760635375977, "rewards/rejected": 0.7151888012886047, "step": 4411 }, { "epoch": 2.38, "learning_rate": 1.0798453399915642e-08, "logits/chosen": -2.151488780975342, "logits/rejected": -2.3164703845977783, "logps/chosen": -1.571199893951416, "logps/rejected": -1.0592420101165771, "loss": 0.6527, "rewards/accuracies": 1.0, "rewards/chosen": 0.9585052728652954, "rewards/margins": 0.0825003981590271, "rewards/rejected": 0.8760048747062683, "step": 4412 }, { "epoch": 2.38, "learning_rate": 1.0780387249367667e-08, "logits/chosen": -2.2302186489105225, "logits/rejected": -2.080556869506836, "logps/chosen": -35.35897445678711, "logps/rejected": -1.5914835929870605, "loss": 0.1591, "rewards/accuracies": 1.0, "rewards/chosen": 2.723468065261841, "rewards/margins": 1.7577459812164307, "rewards/rejected": 0.9657220840454102, "step": 4413 }, { "epoch": 2.38, "learning_rate": 1.0762334397854894e-08, "logits/chosen": -2.119713306427002, "logits/rejected": -2.279974937438965, "logps/chosen": -11.23355484008789, "logps/rejected": -9.486157417297363, "loss": 0.4696, "rewards/accuracies": 1.0, "rewards/chosen": 1.3519343137741089, "rewards/margins": 0.5118825435638428, "rewards/rejected": 0.8400517702102661, "step": 4414 }, { "epoch": 2.38, "learning_rate": 1.0744294851498936e-08, "logits/chosen": -2.1479456424713135, "logits/rejected": -2.1473937034606934, "logps/chosen": -6.826018333435059, "logps/rejected": -2.125916004180908, "loss": 0.4681, "rewards/accuracies": 1.0, "rewards/chosen": 1.2711433172225952, "rewards/margins": 0.5160290598869324, "rewards/rejected": 0.7551142573356628, "step": 4415 }, { "epoch": 2.38, "learning_rate": 1.0726268616416811e-08, "logits/chosen": -2.1855549812316895, "logits/rejected": -2.2308459281921387, "logps/chosen": -1.3784961700439453, "logps/rejected": -1.283986210823059, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 1.0451453924179077, "rewards/margins": 0.004556894302368164, "rewards/rejected": 1.0405884981155396, "step": 4416 }, { "epoch": 2.38, "learning_rate": 1.0708255698721086e-08, "logits/chosen": -2.163086414337158, "logits/rejected": -2.263421058654785, "logps/chosen": -3.097982406616211, "logps/rejected": -3.071852207183838, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0822433233261108, "rewards/margins": 0.018524169921875, "rewards/rejected": 1.0637191534042358, "step": 4417 }, { "epoch": 2.38, "learning_rate": 1.0690256104519762e-08, "logits/chosen": -2.1618781089782715, "logits/rejected": -2.3263771533966064, "logps/chosen": -2.0242152214050293, "logps/rejected": -2.2385451793670654, "loss": 0.6722, "rewards/accuracies": 1.0, "rewards/chosen": 0.7437407374382019, "rewards/margins": 0.04243302345275879, "rewards/rejected": 0.7013077139854431, "step": 4418 }, { "epoch": 2.38, "learning_rate": 1.0672269839916364e-08, "logits/chosen": -2.1573400497436523, "logits/rejected": -2.150179147720337, "logps/chosen": -2.4086358547210693, "logps/rejected": -2.6886751651763916, "loss": 0.5308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1463578939437866, "rewards/margins": 0.3562409281730652, "rewards/rejected": 0.7901169657707214, "step": 4419 }, { "epoch": 2.38, "learning_rate": 1.065429691100989e-08, "logits/chosen": -1.893597960472107, "logits/rejected": -2.273880958557129, "logps/chosen": -0.809593677520752, "logps/rejected": -0.9042040109634399, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0732783079147339, "rewards/margins": 0.0029181241989135742, "rewards/rejected": 1.0703601837158203, "step": 4420 }, { "epoch": 2.38, "learning_rate": 1.0636337323894785e-08, "logits/chosen": -2.0692315101623535, "logits/rejected": -2.2711997032165527, "logps/chosen": -0.4663955271244049, "logps/rejected": -0.3778749704360962, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.8545110821723938, "rewards/margins": 0.015623211860656738, "rewards/rejected": 0.8388878703117371, "step": 4421 }, { "epoch": 2.39, "learning_rate": 1.0618391084660994e-08, "logits/chosen": -2.162794828414917, "logits/rejected": -2.2906711101531982, "logps/chosen": -4.8480706214904785, "logps/rejected": -0.4923672080039978, "loss": 0.7212, "rewards/accuracies": 0.0, "rewards/chosen": 0.9257137179374695, "rewards/margins": -0.05538821220397949, "rewards/rejected": 0.981101930141449, "step": 4422 }, { "epoch": 2.39, "learning_rate": 1.0600458199393957e-08, "logits/chosen": -2.1580007076263428, "logits/rejected": -2.1393964290618896, "logps/chosen": -14.821451187133789, "logps/rejected": -3.8942337036132812, "loss": 0.3243, "rewards/accuracies": 1.0, "rewards/chosen": 1.3859891891479492, "rewards/margins": 0.9594361782073975, "rewards/rejected": 0.42655298113822937, "step": 4423 }, { "epoch": 2.39, "learning_rate": 1.0582538674174513e-08, "logits/chosen": -1.9657284021377563, "logits/rejected": -2.2565433979034424, "logps/chosen": -1.097977638244629, "logps/rejected": -1.2451034784317017, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9267072081565857, "rewards/margins": 0.03023761510848999, "rewards/rejected": 0.8964695930480957, "step": 4424 }, { "epoch": 2.39, "learning_rate": 1.0564632515079087e-08, "logits/chosen": -2.102847099304199, "logits/rejected": -2.088151454925537, "logps/chosen": -17.083566665649414, "logps/rejected": -4.178615093231201, "loss": 0.2752, "rewards/accuracies": 1.0, "rewards/chosen": 1.5372804403305054, "rewards/margins": 1.149536371231079, "rewards/rejected": 0.38774409890174866, "step": 4425 }, { "epoch": 2.39, "learning_rate": 1.054673972817945e-08, "logits/chosen": -1.9682347774505615, "logits/rejected": -1.974701166152954, "logps/chosen": -1.5608829259872437, "logps/rejected": -3.129438877105713, "loss": 0.4583, "rewards/accuracies": 1.0, "rewards/chosen": 1.1301507949829102, "rewards/margins": 0.5424039959907532, "rewards/rejected": 0.587746798992157, "step": 4426 }, { "epoch": 2.39, "learning_rate": 1.052886031954291e-08, "logits/chosen": -2.033679962158203, "logits/rejected": -2.0373497009277344, "logps/chosen": -3.378331184387207, "logps/rejected": -4.111177444458008, "loss": 0.5014, "rewards/accuracies": 1.0, "rewards/chosen": 1.031562089920044, "rewards/margins": 0.42926400899887085, "rewards/rejected": 0.6022980809211731, "step": 4427 }, { "epoch": 2.39, "learning_rate": 1.0510994295232239e-08, "logits/chosen": -2.128434658050537, "logits/rejected": -2.124692916870117, "logps/chosen": -0.8352457284927368, "logps/rejected": -6.044891357421875, "loss": 0.5192, "rewards/accuracies": 1.0, "rewards/chosen": 1.0661629438400269, "rewards/margins": 0.3847388029098511, "rewards/rejected": 0.6814241409301758, "step": 4428 }, { "epoch": 2.39, "learning_rate": 1.0493141661305621e-08, "logits/chosen": -2.042437791824341, "logits/rejected": -2.0356550216674805, "logps/chosen": -2.935314178466797, "logps/rejected": -3.2295145988464355, "loss": 0.5801, "rewards/accuracies": 1.0, "rewards/chosen": 0.9635831713676453, "rewards/margins": 0.2404579520225525, "rewards/rejected": 0.7231252193450928, "step": 4429 }, { "epoch": 2.39, "learning_rate": 1.0475302423816768e-08, "logits/chosen": -2.09836745262146, "logits/rejected": -2.336725950241089, "logps/chosen": -12.951777458190918, "logps/rejected": -6.452503204345703, "loss": 0.8574, "rewards/accuracies": 0.0, "rewards/chosen": 0.6281906962394714, "rewards/margins": -0.30533498525619507, "rewards/rejected": 0.9335256814956665, "step": 4430 }, { "epoch": 2.39, "learning_rate": 1.0457476588814773e-08, "logits/chosen": -2.072521686553955, "logits/rejected": -2.336174249649048, "logps/chosen": -0.3168284296989441, "logps/rejected": -0.33115172386169434, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 1.0755313634872437, "rewards/margins": -0.009972929954528809, "rewards/rejected": 1.0855042934417725, "step": 4431 }, { "epoch": 2.39, "learning_rate": 1.0439664162344247e-08, "logits/chosen": -2.032120943069458, "logits/rejected": -2.2868247032165527, "logps/chosen": -0.730322003364563, "logps/rejected": -0.7313631176948547, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 0.9853121042251587, "rewards/margins": 0.044463276863098145, "rewards/rejected": 0.9408488273620605, "step": 4432 }, { "epoch": 2.39, "learning_rate": 1.0421865150445241e-08, "logits/chosen": -2.042776346206665, "logits/rejected": -2.2759406566619873, "logps/chosen": -1.1008458137512207, "logps/rejected": -1.0572947263717651, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 0.9907440543174744, "rewards/margins": 0.0011587142944335938, "rewards/rejected": 0.9895853400230408, "step": 4433 }, { "epoch": 2.39, "learning_rate": 1.0404079559153212e-08, "logits/chosen": -2.0702829360961914, "logits/rejected": -2.2734506130218506, "logps/chosen": -0.5026238560676575, "logps/rejected": -0.51556396484375, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 1.0265766382217407, "rewards/margins": 0.0040166378021240234, "rewards/rejected": 1.0225600004196167, "step": 4434 }, { "epoch": 2.39, "learning_rate": 1.038630739449915e-08, "logits/chosen": -2.1279046535491943, "logits/rejected": -2.13466215133667, "logps/chosen": -1.4703327417373657, "logps/rejected": -1.913072943687439, "loss": 0.441, "rewards/accuracies": 1.0, "rewards/chosen": 1.1925508975982666, "rewards/margins": 0.5900303721427917, "rewards/rejected": 0.6025205254554749, "step": 4435 }, { "epoch": 2.39, "learning_rate": 1.0368548662509402e-08, "logits/chosen": -2.028014659881592, "logits/rejected": -2.0226004123687744, "logps/chosen": -2.6859569549560547, "logps/rejected": -4.730383396148682, "loss": 0.277, "rewards/accuracies": 1.0, "rewards/chosen": 1.5494205951690674, "rewards/margins": 1.1422312259674072, "rewards/rejected": 0.40718942880630493, "step": 4436 }, { "epoch": 2.39, "learning_rate": 1.0350803369205824e-08, "logits/chosen": -2.0735716819763184, "logits/rejected": -2.0744261741638184, "logps/chosen": -1.433518886566162, "logps/rejected": -1.539475440979004, "loss": 0.5397, "rewards/accuracies": 1.0, "rewards/chosen": 1.0952491760253906, "rewards/margins": 0.3347155451774597, "rewards/rejected": 0.7605336308479309, "step": 4437 }, { "epoch": 2.39, "learning_rate": 1.0333071520605696e-08, "logits/chosen": -2.019315719604492, "logits/rejected": -2.0027401447296143, "logps/chosen": -3.707881450653076, "logps/rejected": -6.312592506408691, "loss": 0.4369, "rewards/accuracies": 1.0, "rewards/chosen": 1.1174745559692383, "rewards/margins": 0.6015411019325256, "rewards/rejected": 0.5159334540367126, "step": 4438 }, { "epoch": 2.39, "learning_rate": 1.031535312272172e-08, "logits/chosen": -1.9706405401229858, "logits/rejected": -2.2544620037078857, "logps/chosen": -0.7269818186759949, "logps/rejected": -0.7337825894355774, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8349653482437134, "rewards/margins": 0.019218862056732178, "rewards/rejected": 0.8157464861869812, "step": 4439 }, { "epoch": 2.39, "learning_rate": 1.0297648181562074e-08, "logits/chosen": -2.2040793895721436, "logits/rejected": -2.3137764930725098, "logps/chosen": -0.9442945718765259, "logps/rejected": -0.9963042140007019, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.9515928626060486, "rewards/margins": 0.010028660297393799, "rewards/rejected": 0.9415642023086548, "step": 4440 }, { "epoch": 2.4, "learning_rate": 1.027995670313032e-08, "logits/chosen": -2.1772520542144775, "logits/rejected": -2.083454132080078, "logps/chosen": -19.462923049926758, "logps/rejected": -4.238952159881592, "loss": 0.1252, "rewards/accuracies": 1.0, "rewards/chosen": 2.4505598545074463, "rewards/margins": 2.014216423034668, "rewards/rejected": 0.4363434314727783, "step": 4441 }, { "epoch": 2.4, "learning_rate": 1.0262278693425513e-08, "logits/chosen": -1.966490387916565, "logits/rejected": -2.284142017364502, "logps/chosen": -0.5898815989494324, "logps/rejected": -0.6866052746772766, "loss": 0.7125, "rewards/accuracies": 0.0, "rewards/chosen": 1.0645231008529663, "rewards/margins": -0.038255929946899414, "rewards/rejected": 1.1027790307998657, "step": 4442 }, { "epoch": 2.4, "learning_rate": 1.0244614158442111e-08, "logits/chosen": -2.2253549098968506, "logits/rejected": -2.2736141681671143, "logps/chosen": -2.478309154510498, "logps/rejected": -2.6419060230255127, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.799923837184906, "rewards/margins": 0.04155057668685913, "rewards/rejected": 0.7583732604980469, "step": 4443 }, { "epoch": 2.4, "learning_rate": 1.022696310417e-08, "logits/chosen": -2.0932581424713135, "logits/rejected": -2.2929468154907227, "logps/chosen": -0.6687626838684082, "logps/rejected": -1.0382351875305176, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 1.0967979431152344, "rewards/margins": 0.009348511695861816, "rewards/rejected": 1.0874494314193726, "step": 4444 }, { "epoch": 2.4, "learning_rate": 1.020932553659452e-08, "logits/chosen": -2.077150344848633, "logits/rejected": -2.251112937927246, "logps/chosen": -2.230700969696045, "logps/rejected": -2.720707893371582, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 0.7152289748191833, "rewards/margins": 0.004887819290161133, "rewards/rejected": 0.7103411555290222, "step": 4445 }, { "epoch": 2.4, "learning_rate": 1.0191701461696395e-08, "logits/chosen": -1.9930254220962524, "logits/rejected": -2.28535532951355, "logps/chosen": -1.9513853788375854, "logps/rejected": -1.017483115196228, "loss": 0.6646, "rewards/accuracies": 1.0, "rewards/chosen": 0.8185357451438904, "rewards/margins": 0.057836294174194336, "rewards/rejected": 0.760699450969696, "step": 4446 }, { "epoch": 2.4, "learning_rate": 1.0174090885451809e-08, "logits/chosen": -2.10137939453125, "logits/rejected": -2.2530581951141357, "logps/chosen": -0.4712342917919159, "logps/rejected": -0.47870197892189026, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.8803644180297852, "rewards/margins": 0.011867046356201172, "rewards/rejected": 0.868497371673584, "step": 4447 }, { "epoch": 2.4, "learning_rate": 1.0156493813832363e-08, "logits/chosen": -2.1434872150421143, "logits/rejected": -2.289735794067383, "logps/chosen": -3.2353081703186035, "logps/rejected": -3.466104507446289, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8940055966377258, "rewards/margins": 0.016027092933654785, "rewards/rejected": 0.877978503704071, "step": 4448 }, { "epoch": 2.4, "learning_rate": 1.0138910252805061e-08, "logits/chosen": -2.1591382026672363, "logits/rejected": -2.3073928356170654, "logps/chosen": -0.22176599502563477, "logps/rejected": -0.20555102825164795, "loss": 0.685, "rewards/accuracies": 1.0, "rewards/chosen": 0.9521467089653015, "rewards/margins": 0.016329944133758545, "rewards/rejected": 0.935816764831543, "step": 4449 }, { "epoch": 2.4, "learning_rate": 1.0121340208332352e-08, "logits/chosen": -2.157041549682617, "logits/rejected": -2.0698814392089844, "logps/chosen": -16.764881134033203, "logps/rejected": -1.6330182552337646, "loss": 0.2798, "rewards/accuracies": 1.0, "rewards/chosen": 1.9664043188095093, "rewards/margins": 1.1305434703826904, "rewards/rejected": 0.8358608484268188, "step": 4450 }, { "epoch": 2.4, "learning_rate": 1.0103783686372075e-08, "logits/chosen": -2.0287563800811768, "logits/rejected": -2.2677042484283447, "logps/chosen": -5.358130931854248, "logps/rejected": -1.297006368637085, "loss": 0.7874, "rewards/accuracies": 0.0, "rewards/chosen": 0.7189602255821228, "rewards/margins": -0.1804477572441101, "rewards/rejected": 0.8994079828262329, "step": 4451 }, { "epoch": 2.4, "learning_rate": 1.0086240692877496e-08, "logits/chosen": -2.0504043102264404, "logits/rejected": -2.288926839828491, "logps/chosen": -0.28111177682876587, "logps/rejected": -0.27871930599212646, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 0.8274589776992798, "rewards/margins": 0.03725457191467285, "rewards/rejected": 0.7902044057846069, "step": 4452 }, { "epoch": 2.4, "learning_rate": 1.00687112337973e-08, "logits/chosen": -2.054659128189087, "logits/rejected": -2.0556423664093018, "logps/chosen": -5.929076194763184, "logps/rejected": -7.319499969482422, "loss": 0.3529, "rewards/accuracies": 1.0, "rewards/chosen": 1.2138400077819824, "rewards/margins": 0.8600292205810547, "rewards/rejected": 0.35381078720092773, "step": 4453 }, { "epoch": 2.4, "learning_rate": 1.0051195315075578e-08, "logits/chosen": -2.064436197280884, "logits/rejected": -2.3625075817108154, "logps/chosen": -7.211641311645508, "logps/rejected": -8.541960716247559, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 1.0428352355957031, "rewards/margins": 0.002922534942626953, "rewards/rejected": 1.0399127006530762, "step": 4454 }, { "epoch": 2.4, "learning_rate": 1.0033692942651838e-08, "logits/chosen": -2.133889675140381, "logits/rejected": -2.1257457733154297, "logps/chosen": -13.66120719909668, "logps/rejected": -12.636235237121582, "loss": 0.3789, "rewards/accuracies": 1.0, "rewards/chosen": 1.3868610858917236, "rewards/margins": 0.7749610543251038, "rewards/rejected": 0.6119000315666199, "step": 4455 }, { "epoch": 2.4, "learning_rate": 1.0016204122460964e-08, "logits/chosen": -1.9878520965576172, "logits/rejected": -2.317986488342285, "logps/chosen": -0.5619277954101562, "logps/rejected": -0.515201210975647, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 1.0507162809371948, "rewards/margins": 0.01924419403076172, "rewards/rejected": 1.031472086906433, "step": 4456 }, { "epoch": 2.4, "learning_rate": 9.998728860433275e-09, "logits/chosen": -2.0630972385406494, "logits/rejected": -2.101970672607422, "logps/chosen": -3.5462734699249268, "logps/rejected": -9.707066535949707, "loss": 0.2457, "rewards/accuracies": 1.0, "rewards/chosen": 1.7301298379898071, "rewards/margins": 1.2781789302825928, "rewards/rejected": 0.45195093750953674, "step": 4457 }, { "epoch": 2.4, "learning_rate": 9.981267162494494e-09, "logits/chosen": -2.0059521198272705, "logits/rejected": -2.0124905109405518, "logps/chosen": -2.4059946537017822, "logps/rejected": -1.7748098373413086, "loss": 0.5434, "rewards/accuracies": 1.0, "rewards/chosen": 1.1613340377807617, "rewards/margins": 0.32586371898651123, "rewards/rejected": 0.8354703187942505, "step": 4458 }, { "epoch": 2.41, "learning_rate": 9.963819034565707e-09, "logits/chosen": -2.1209492683410645, "logits/rejected": -2.308075428009033, "logps/chosen": -3.9231863021850586, "logps/rejected": -4.228845119476318, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.5095635652542114, "rewards/margins": 0.041475266218185425, "rewards/rejected": 0.468088299036026, "step": 4459 }, { "epoch": 2.41, "learning_rate": 9.946384482563442e-09, "logits/chosen": -1.9857923984527588, "logits/rejected": -1.9862074851989746, "logps/chosen": -0.3920968770980835, "logps/rejected": -3.4627838134765625, "loss": 0.5375, "rewards/accuracies": 1.0, "rewards/chosen": 0.9922324419021606, "rewards/margins": 0.33999985456466675, "rewards/rejected": 0.6522325873374939, "step": 4460 }, { "epoch": 2.41, "learning_rate": 9.928963512399618e-09, "logits/chosen": -2.0011394023895264, "logits/rejected": -2.248959541320801, "logps/chosen": -0.9346351027488708, "logps/rejected": -0.8866375088691711, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0224167108535767, "rewards/margins": 0.018503904342651367, "rewards/rejected": 1.0039128065109253, "step": 4461 }, { "epoch": 2.41, "learning_rate": 9.911556129981512e-09, "logits/chosen": -2.0223963260650635, "logits/rejected": -2.0306246280670166, "logps/chosen": -1.6533647775650024, "logps/rejected": -2.8962626457214355, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 1.0636109113693237, "rewards/margins": 0.48050278425216675, "rewards/rejected": 0.583108127117157, "step": 4462 }, { "epoch": 2.41, "learning_rate": 9.894162341211832e-09, "logits/chosen": -2.0128793716430664, "logits/rejected": -2.2530629634857178, "logps/chosen": -0.29292410612106323, "logps/rejected": -0.33990737795829773, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 1.0101226568222046, "rewards/margins": 0.016671359539031982, "rewards/rejected": 0.9934512972831726, "step": 4463 }, { "epoch": 2.41, "learning_rate": 9.876782151988655e-09, "logits/chosen": -1.9831433296203613, "logits/rejected": -2.2677597999572754, "logps/chosen": -1.1398547887802124, "logps/rejected": -1.0083492994308472, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 0.8438922762870789, "rewards/margins": 0.02531200647354126, "rewards/rejected": 0.8185802698135376, "step": 4464 }, { "epoch": 2.41, "learning_rate": 9.859415568205476e-09, "logits/chosen": -2.324126720428467, "logits/rejected": -2.1804885864257812, "logps/chosen": -34.89387893676758, "logps/rejected": -1.6938576698303223, "loss": 0.169, "rewards/accuracies": 1.0, "rewards/chosen": 2.597679615020752, "rewards/margins": 1.6924479007720947, "rewards/rejected": 0.9052316546440125, "step": 4465 }, { "epoch": 2.41, "learning_rate": 9.842062595751127e-09, "logits/chosen": -2.189544200897217, "logits/rejected": -2.181164503097534, "logps/chosen": -0.48704299330711365, "logps/rejected": -0.5200619697570801, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8613344430923462, "rewards/margins": 0.006503283977508545, "rewards/rejected": 0.8548311591148376, "step": 4466 }, { "epoch": 2.41, "learning_rate": 9.824723240509863e-09, "logits/chosen": -2.0467891693115234, "logits/rejected": -2.1286814212799072, "logps/chosen": -2.1348648071289062, "logps/rejected": -10.512422561645508, "loss": 0.3379, "rewards/accuracies": 1.0, "rewards/chosen": 1.5196523666381836, "rewards/margins": 0.9114610552787781, "rewards/rejected": 0.6081913113594055, "step": 4467 }, { "epoch": 2.41, "learning_rate": 9.807397508361331e-09, "logits/chosen": -2.1858222484588623, "logits/rejected": -2.3019871711730957, "logps/chosen": -4.489363670349121, "logps/rejected": -0.4002685546875, "loss": 0.7585, "rewards/accuracies": 0.0, "rewards/chosen": 0.904075562953949, "rewards/margins": -0.12675923109054565, "rewards/rejected": 1.0308347940444946, "step": 4468 }, { "epoch": 2.41, "learning_rate": 9.790085405180504e-09, "logits/chosen": -2.0143849849700928, "logits/rejected": -2.2922964096069336, "logps/chosen": -1.115136981010437, "logps/rejected": -1.213929533958435, "loss": 0.69, "rewards/accuracies": 1.0, "rewards/chosen": 0.6629202365875244, "rewards/margins": 0.006346225738525391, "rewards/rejected": 0.656574010848999, "step": 4469 }, { "epoch": 2.41, "learning_rate": 9.772786936837785e-09, "logits/chosen": -2.091090440750122, "logits/rejected": -2.3388564586639404, "logps/chosen": -5.973299503326416, "logps/rejected": -4.885092735290527, "loss": 0.7263, "rewards/accuracies": 0.0, "rewards/chosen": 0.891143262386322, "rewards/margins": -0.06533020734786987, "rewards/rejected": 0.9564734697341919, "step": 4470 }, { "epoch": 2.41, "learning_rate": 9.755502109198955e-09, "logits/chosen": -2.2005109786987305, "logits/rejected": -2.199754476547241, "logps/chosen": -2.319063663482666, "logps/rejected": -5.402948379516602, "loss": 0.4505, "rewards/accuracies": 1.0, "rewards/chosen": 1.0358326435089111, "rewards/margins": 0.5638024806976318, "rewards/rejected": 0.4720301628112793, "step": 4471 }, { "epoch": 2.41, "learning_rate": 9.738230928125114e-09, "logits/chosen": -2.130939245223999, "logits/rejected": -2.28849720954895, "logps/chosen": -4.900765419006348, "logps/rejected": -10.883467674255371, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 0.9913378953933716, "rewards/margins": 0.039978623390197754, "rewards/rejected": 0.9513592720031738, "step": 4472 }, { "epoch": 2.41, "learning_rate": 9.720973399472788e-09, "logits/chosen": -2.0803496837615967, "logits/rejected": -2.244560480117798, "logps/chosen": -5.494814872741699, "logps/rejected": -0.4825224280357361, "loss": 0.8023, "rewards/accuracies": 0.0, "rewards/chosen": 0.6482766270637512, "rewards/margins": -0.20753556489944458, "rewards/rejected": 0.8558121919631958, "step": 4473 }, { "epoch": 2.41, "learning_rate": 9.703729529093863e-09, "logits/chosen": -2.118112325668335, "logits/rejected": -2.0344455242156982, "logps/chosen": -23.388660430908203, "logps/rejected": -2.495182514190674, "loss": 0.295, "rewards/accuracies": 1.0, "rewards/chosen": 1.6990158557891846, "rewards/margins": 1.0698468685150146, "rewards/rejected": 0.6291690468788147, "step": 4474 }, { "epoch": 2.41, "learning_rate": 9.68649932283559e-09, "logits/chosen": -1.982558012008667, "logits/rejected": -2.2969703674316406, "logps/chosen": -2.3062143325805664, "logps/rejected": -11.183526992797852, "loss": 0.5964, "rewards/accuracies": 1.0, "rewards/chosen": 0.9393259286880493, "rewards/margins": 0.20390796661376953, "rewards/rejected": 0.7354179620742798, "step": 4475 }, { "epoch": 2.41, "learning_rate": 9.669282786540583e-09, "logits/chosen": -2.06314754486084, "logits/rejected": -2.125434637069702, "logps/chosen": -10.124505996704102, "logps/rejected": -7.179923057556152, "loss": 0.6256, "rewards/accuracies": 1.0, "rewards/chosen": 1.1917047500610352, "rewards/margins": 0.1400364637374878, "rewards/rejected": 1.0516682863235474, "step": 4476 }, { "epoch": 2.41, "learning_rate": 9.652079926046814e-09, "logits/chosen": -2.1818339824676514, "logits/rejected": -2.217362403869629, "logps/chosen": -11.454090118408203, "logps/rejected": -12.37564468383789, "loss": 0.421, "rewards/accuracies": 1.0, "rewards/chosen": 1.3613020181655884, "rewards/margins": 0.6472440958023071, "rewards/rejected": 0.7140579223632812, "step": 4477 }, { "epoch": 2.42, "learning_rate": 9.634890747187641e-09, "logits/chosen": -2.1825737953186035, "logits/rejected": -2.343052387237549, "logps/chosen": -1.4733710289001465, "logps/rejected": -1.5536178350448608, "loss": 0.6816, "rewards/accuracies": 1.0, "rewards/chosen": 0.9711248278617859, "rewards/margins": 0.02324986457824707, "rewards/rejected": 0.9478749632835388, "step": 4478 }, { "epoch": 2.42, "learning_rate": 9.617715255791742e-09, "logits/chosen": -2.1884751319885254, "logits/rejected": -2.0790014266967773, "logps/chosen": -38.1461181640625, "logps/rejected": -2.6006062030792236, "loss": 0.1424, "rewards/accuracies": 1.0, "rewards/chosen": 2.466181993484497, "rewards/margins": 1.8772035837173462, "rewards/rejected": 0.5889784097671509, "step": 4479 }, { "epoch": 2.42, "learning_rate": 9.600553457683191e-09, "logits/chosen": -2.0760293006896973, "logits/rejected": -2.0582871437072754, "logps/chosen": -17.24007797241211, "logps/rejected": -4.946608543395996, "loss": 0.4142, "rewards/accuracies": 1.0, "rewards/chosen": 1.5605976581573486, "rewards/margins": 0.667118489742279, "rewards/rejected": 0.8934791684150696, "step": 4480 }, { "epoch": 2.42, "learning_rate": 9.583405358681429e-09, "logits/chosen": -2.0096733570098877, "logits/rejected": -2.0095860958099365, "logps/chosen": -0.8824008107185364, "logps/rejected": -3.413517713546753, "loss": 0.4666, "rewards/accuracies": 1.0, "rewards/chosen": 1.0187103748321533, "rewards/margins": 0.5198653936386108, "rewards/rejected": 0.49884501099586487, "step": 4481 }, { "epoch": 2.42, "learning_rate": 9.566270964601193e-09, "logits/chosen": -2.071762800216675, "logits/rejected": -2.0697171688079834, "logps/chosen": -6.6520514488220215, "logps/rejected": -5.746248245239258, "loss": 0.6455, "rewards/accuracies": 1.0, "rewards/chosen": 1.1645430326461792, "rewards/margins": 0.09774589538574219, "rewards/rejected": 1.066797137260437, "step": 4482 }, { "epoch": 2.42, "learning_rate": 9.549150281252633e-09, "logits/chosen": -2.0303902626037598, "logits/rejected": -2.0362181663513184, "logps/chosen": -0.7463834881782532, "logps/rejected": -3.570889472961426, "loss": 0.4858, "rewards/accuracies": 1.0, "rewards/chosen": 0.9826048016548157, "rewards/margins": 0.46920937299728394, "rewards/rejected": 0.5133954286575317, "step": 4483 }, { "epoch": 2.42, "learning_rate": 9.532043314441218e-09, "logits/chosen": -2.1532809734344482, "logits/rejected": -2.1499476432800293, "logps/chosen": -5.956389427185059, "logps/rejected": -4.116828918457031, "loss": 0.3482, "rewards/accuracies": 1.0, "rewards/chosen": 1.3841725587844849, "rewards/margins": 0.8758454322814941, "rewards/rejected": 0.5083271265029907, "step": 4484 }, { "epoch": 2.42, "learning_rate": 9.514950069967775e-09, "logits/chosen": -2.183570384979248, "logits/rejected": -2.343108892440796, "logps/chosen": -1.0365544557571411, "logps/rejected": -6.892946720123291, "loss": 0.5687, "rewards/accuracies": 1.0, "rewards/chosen": 1.0863847732543945, "rewards/margins": 0.2666894793510437, "rewards/rejected": 0.8196952939033508, "step": 4485 }, { "epoch": 2.42, "learning_rate": 9.49787055362849e-09, "logits/chosen": -2.0584678649902344, "logits/rejected": -2.0589826107025146, "logps/chosen": -1.4620814323425293, "logps/rejected": -0.8124167323112488, "loss": 0.3649, "rewards/accuracies": 1.0, "rewards/chosen": 1.552588701248169, "rewards/margins": 0.8202662467956543, "rewards/rejected": 0.7323224544525146, "step": 4486 }, { "epoch": 2.42, "learning_rate": 9.480804771214862e-09, "logits/chosen": -2.0710175037384033, "logits/rejected": -2.062290906906128, "logps/chosen": -9.70949935913086, "logps/rejected": -1.8875250816345215, "loss": 0.6686, "rewards/accuracies": 1.0, "rewards/chosen": 1.1319376230239868, "rewards/margins": 0.049785614013671875, "rewards/rejected": 1.082152009010315, "step": 4487 }, { "epoch": 2.42, "learning_rate": 9.463752728513768e-09, "logits/chosen": -2.0281710624694824, "logits/rejected": -2.2664687633514404, "logps/chosen": -0.3375661075115204, "logps/rejected": -0.35713258385658264, "loss": 0.6988, "rewards/accuracies": 0.0, "rewards/chosen": 0.9618882536888123, "rewards/margins": -0.011345922946929932, "rewards/rejected": 0.9732341766357422, "step": 4488 }, { "epoch": 2.42, "learning_rate": 9.44671443130739e-09, "logits/chosen": -2.166134834289551, "logits/rejected": -2.167231321334839, "logps/chosen": -4.758026599884033, "logps/rejected": -10.119000434875488, "loss": 0.2126, "rewards/accuracies": 1.0, "rewards/chosen": 1.8735064268112183, "rewards/margins": 1.4403882026672363, "rewards/rejected": 0.43311816453933716, "step": 4489 }, { "epoch": 2.42, "learning_rate": 9.429689885373282e-09, "logits/chosen": -2.11916446685791, "logits/rejected": -2.104938268661499, "logps/chosen": -14.569108009338379, "logps/rejected": -6.276113986968994, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 1.4648417234420776, "rewards/margins": 1.1201448440551758, "rewards/rejected": 0.34469684958457947, "step": 4490 }, { "epoch": 2.42, "learning_rate": 9.412679096484333e-09, "logits/chosen": -2.13149356842041, "logits/rejected": -2.2843263149261475, "logps/chosen": -0.9441916942596436, "logps/rejected": -0.8958373069763184, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9615415930747986, "rewards/margins": 0.015157341957092285, "rewards/rejected": 0.9463842511177063, "step": 4491 }, { "epoch": 2.42, "learning_rate": 9.395682070408712e-09, "logits/chosen": -1.998345971107483, "logits/rejected": -2.009277582168579, "logps/chosen": -6.815874099731445, "logps/rejected": -1.4648370742797852, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 1.1515802145004272, "rewards/margins": 0.0009361505508422852, "rewards/rejected": 1.150644063949585, "step": 4492 }, { "epoch": 2.42, "learning_rate": 9.378698812910024e-09, "logits/chosen": -2.025676965713501, "logits/rejected": -2.250088691711426, "logps/chosen": -8.982743263244629, "logps/rejected": -5.338613510131836, "loss": 0.7803, "rewards/accuracies": 0.0, "rewards/chosen": 0.8719004988670349, "rewards/margins": -0.16734272241592407, "rewards/rejected": 1.039243221282959, "step": 4493 }, { "epoch": 2.42, "learning_rate": 9.361729329747108e-09, "logits/chosen": -2.078322649002075, "logits/rejected": -2.222365140914917, "logps/chosen": -0.7186606526374817, "logps/rejected": -0.7067120671272278, "loss": 0.6585, "rewards/accuracies": 1.0, "rewards/chosen": 0.9693097472190857, "rewards/margins": 0.07056307792663574, "rewards/rejected": 0.89874666929245, "step": 4494 }, { "epoch": 2.42, "learning_rate": 9.34477362667418e-09, "logits/chosen": -2.046231746673584, "logits/rejected": -2.0474255084991455, "logps/chosen": -3.123211145401001, "logps/rejected": -5.862210750579834, "loss": 0.2913, "rewards/accuracies": 1.0, "rewards/chosen": 1.6149753332138062, "rewards/margins": 1.084276795387268, "rewards/rejected": 0.5306985378265381, "step": 4495 }, { "epoch": 2.43, "learning_rate": 9.327831709440793e-09, "logits/chosen": -2.0376856327056885, "logits/rejected": -2.040064573287964, "logps/chosen": -2.553147554397583, "logps/rejected": -0.664814829826355, "loss": 0.6308, "rewards/accuracies": 1.0, "rewards/chosen": 1.1022186279296875, "rewards/margins": 0.12877756357192993, "rewards/rejected": 0.9734410643577576, "step": 4496 }, { "epoch": 2.43, "learning_rate": 9.310903583791769e-09, "logits/chosen": -2.01265811920166, "logits/rejected": -2.2610700130462646, "logps/chosen": -0.4110034108161926, "logps/rejected": -0.45041704177856445, "loss": 0.6716, "rewards/accuracies": 1.0, "rewards/chosen": 1.0641049146652222, "rewards/margins": 0.043498992919921875, "rewards/rejected": 1.0206059217453003, "step": 4497 }, { "epoch": 2.43, "learning_rate": 9.293989255467311e-09, "logits/chosen": -2.005819082260132, "logits/rejected": -2.0159919261932373, "logps/chosen": -2.000162363052368, "logps/rejected": -2.2047576904296875, "loss": 0.4191, "rewards/accuracies": 1.0, "rewards/chosen": 1.285630226135254, "rewards/margins": 0.6528965830802917, "rewards/rejected": 0.6327336430549622, "step": 4498 }, { "epoch": 2.43, "learning_rate": 9.277088730202931e-09, "logits/chosen": -2.0612640380859375, "logits/rejected": -2.262007474899292, "logps/chosen": -0.9104448556900024, "logps/rejected": -0.9661470055580139, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.9300609827041626, "rewards/margins": 0.014647960662841797, "rewards/rejected": 0.9154130220413208, "step": 4499 }, { "epoch": 2.43, "learning_rate": 9.260202013729434e-09, "logits/chosen": -2.0343363285064697, "logits/rejected": -2.244408369064331, "logps/chosen": -10.25433349609375, "logps/rejected": -8.565622329711914, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.5176745653152466, "rewards/margins": 0.015969574451446533, "rewards/rejected": 0.5017049908638, "step": 4500 }, { "epoch": 2.43, "learning_rate": 9.243329111772985e-09, "logits/chosen": -2.2133708000183105, "logits/rejected": -2.125835657119751, "logps/chosen": -20.607505798339844, "logps/rejected": -1.9810243844985962, "loss": 0.178, "rewards/accuracies": 1.0, "rewards/chosen": 2.3225338459014893, "rewards/margins": 1.6355007886886597, "rewards/rejected": 0.6870330572128296, "step": 4501 }, { "epoch": 2.43, "learning_rate": 9.226470030055e-09, "logits/chosen": -2.131977081298828, "logits/rejected": -2.135802745819092, "logps/chosen": -4.386563301086426, "logps/rejected": -0.4511979818344116, "loss": 0.5709, "rewards/accuracies": 1.0, "rewards/chosen": 1.229529857635498, "rewards/margins": 0.2614390254020691, "rewards/rejected": 0.968090832233429, "step": 4502 }, { "epoch": 2.43, "learning_rate": 9.209624774292302e-09, "logits/chosen": -1.9842119216918945, "logits/rejected": -2.029309034347534, "logps/chosen": -5.921475887298584, "logps/rejected": -8.16362190246582, "loss": 0.3054, "rewards/accuracies": 1.0, "rewards/chosen": 1.7436500787734985, "rewards/margins": 1.0294028520584106, "rewards/rejected": 0.7142472267150879, "step": 4503 }, { "epoch": 2.43, "learning_rate": 9.192793350196936e-09, "logits/chosen": -2.0333101749420166, "logits/rejected": -2.029719829559326, "logps/chosen": -1.2001702785491943, "logps/rejected": -7.895390510559082, "loss": 0.3805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0131739377975464, "rewards/margins": 0.7699146270751953, "rewards/rejected": 0.24325934052467346, "step": 4504 }, { "epoch": 2.43, "learning_rate": 9.175975763476301e-09, "logits/chosen": -2.033156156539917, "logits/rejected": -2.2209091186523438, "logps/chosen": -0.3507407605648041, "logps/rejected": -0.36766862869262695, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.8798516392707825, "rewards/margins": -0.0034356117248535156, "rewards/rejected": 0.883287250995636, "step": 4505 }, { "epoch": 2.43, "learning_rate": 9.159172019833117e-09, "logits/chosen": -2.0783538818359375, "logits/rejected": -2.3141119480133057, "logps/chosen": -2.7602577209472656, "logps/rejected": -3.1522157192230225, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.8247401118278503, "rewards/margins": 0.026415348052978516, "rewards/rejected": 0.7983247637748718, "step": 4506 }, { "epoch": 2.43, "learning_rate": 9.142382124965353e-09, "logits/chosen": -2.0822112560272217, "logits/rejected": -2.3280680179595947, "logps/chosen": -0.9286631941795349, "logps/rejected": -2.205672025680542, "loss": 0.6301, "rewards/accuracies": 1.0, "rewards/chosen": 0.9408572316169739, "rewards/margins": 0.13023817539215088, "rewards/rejected": 0.810619056224823, "step": 4507 }, { "epoch": 2.43, "learning_rate": 9.125606084566345e-09, "logits/chosen": -2.0865395069122314, "logits/rejected": -2.0743167400360107, "logps/chosen": -1.0037643909454346, "logps/rejected": -10.83405876159668, "loss": 0.5462, "rewards/accuracies": 1.0, "rewards/chosen": 1.024095058441162, "rewards/margins": 0.3192429542541504, "rewards/rejected": 0.7048521041870117, "step": 4508 }, { "epoch": 2.43, "learning_rate": 9.108843904324714e-09, "logits/chosen": -2.0525970458984375, "logits/rejected": -2.225576877593994, "logps/chosen": -0.15178316831588745, "logps/rejected": -0.16319023072719574, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 0.9706621170043945, "rewards/margins": 0.02508467435836792, "rewards/rejected": 0.9455774426460266, "step": 4509 }, { "epoch": 2.43, "learning_rate": 9.092095589924342e-09, "logits/chosen": -2.054926633834839, "logits/rejected": -2.0562474727630615, "logps/chosen": -1.7694967985153198, "logps/rejected": -1.7998573780059814, "loss": 0.6555, "rewards/accuracies": 1.0, "rewards/chosen": 1.1873935461044312, "rewards/margins": 0.07668209075927734, "rewards/rejected": 1.1107114553451538, "step": 4510 }, { "epoch": 2.43, "learning_rate": 9.075361147044464e-09, "logits/chosen": -2.0135908126831055, "logits/rejected": -2.013815402984619, "logps/chosen": -1.435605764389038, "logps/rejected": -2.291370391845703, "loss": 0.5299, "rewards/accuracies": 1.0, "rewards/chosen": 1.2853752374649048, "rewards/margins": 0.35837411880493164, "rewards/rejected": 0.9270011186599731, "step": 4511 }, { "epoch": 2.43, "learning_rate": 9.058640581359551e-09, "logits/chosen": -2.0675437450408936, "logits/rejected": -2.0170400142669678, "logps/chosen": -12.668089866638184, "logps/rejected": -3.0737714767456055, "loss": 0.2883, "rewards/accuracies": 1.0, "rewards/chosen": 1.8818241357803345, "rewards/margins": 1.096058964729309, "rewards/rejected": 0.7857651710510254, "step": 4512 }, { "epoch": 2.43, "learning_rate": 9.041933898539455e-09, "logits/chosen": -2.0885164737701416, "logits/rejected": -2.1009738445281982, "logps/chosen": -8.164435386657715, "logps/rejected": -7.163644313812256, "loss": 0.2146, "rewards/accuracies": 1.0, "rewards/chosen": 2.078207492828369, "rewards/margins": 1.4299724102020264, "rewards/rejected": 0.648235023021698, "step": 4513 }, { "epoch": 2.43, "learning_rate": 9.025241104249237e-09, "logits/chosen": -2.129589557647705, "logits/rejected": -2.140187978744507, "logps/chosen": -2.581439256668091, "logps/rejected": -1.5980836153030396, "loss": 0.5523, "rewards/accuracies": 1.0, "rewards/chosen": 1.3464703559875488, "rewards/margins": 0.30485260486602783, "rewards/rejected": 1.041617751121521, "step": 4514 }, { "epoch": 2.44, "learning_rate": 9.008562204149284e-09, "logits/chosen": -1.9830838441848755, "logits/rejected": -1.9910426139831543, "logps/chosen": -4.26269006729126, "logps/rejected": -4.607874870300293, "loss": 0.2387, "rewards/accuracies": 1.0, "rewards/chosen": 1.7583812475204468, "rewards/margins": 1.3106988668441772, "rewards/rejected": 0.44768238067626953, "step": 4515 }, { "epoch": 2.44, "learning_rate": 8.991897203895282e-09, "logits/chosen": -2.0269153118133545, "logits/rejected": -2.0295569896698, "logps/chosen": -3.20733642578125, "logps/rejected": -3.1870224475860596, "loss": 0.4207, "rewards/accuracies": 1.0, "rewards/chosen": 1.3294419050216675, "rewards/margins": 0.6481247544288635, "rewards/rejected": 0.681317150592804, "step": 4516 }, { "epoch": 2.44, "learning_rate": 8.975246109138169e-09, "logits/chosen": -2.1919610500335693, "logits/rejected": -2.2823429107666016, "logps/chosen": -0.7425416707992554, "logps/rejected": -0.7544799447059631, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252438545227051, "rewards/margins": 0.024225294589996338, "rewards/rejected": 0.9010185599327087, "step": 4517 }, { "epoch": 2.44, "learning_rate": 8.958608925524197e-09, "logits/chosen": -2.016207218170166, "logits/rejected": -1.9962936639785767, "logps/chosen": -30.081439971923828, "logps/rejected": -9.614194869995117, "loss": 0.2152, "rewards/accuracies": 1.0, "rewards/chosen": 1.6820420026779175, "rewards/margins": 1.426552414894104, "rewards/rejected": 0.2554895579814911, "step": 4518 }, { "epoch": 2.44, "learning_rate": 8.941985658694907e-09, "logits/chosen": -1.9338147640228271, "logits/rejected": -1.9401198625564575, "logps/chosen": -3.146825075149536, "logps/rejected": -4.4630446434021, "loss": 0.5157, "rewards/accuracies": 1.0, "rewards/chosen": 0.9970477223396301, "rewards/margins": 0.39340347051620483, "rewards/rejected": 0.6036442518234253, "step": 4519 }, { "epoch": 2.44, "learning_rate": 8.925376314287086e-09, "logits/chosen": -2.0332305431365967, "logits/rejected": -1.9793837070465088, "logps/chosen": -30.58275604248047, "logps/rejected": -3.0808629989624023, "loss": 0.1929, "rewards/accuracies": 1.0, "rewards/chosen": 2.1189591884613037, "rewards/margins": 1.5477969646453857, "rewards/rejected": 0.571162223815918, "step": 4520 }, { "epoch": 2.44, "learning_rate": 8.908780897932838e-09, "logits/chosen": -2.0516715049743652, "logits/rejected": -2.281334400177002, "logps/chosen": -0.2495572417974472, "logps/rejected": -0.3485318422317505, "loss": 0.6894, "rewards/accuracies": 1.0, "rewards/chosen": 0.9824496507644653, "rewards/margins": 0.007607102394104004, "rewards/rejected": 0.9748425483703613, "step": 4521 }, { "epoch": 2.44, "learning_rate": 8.892199415259499e-09, "logits/chosen": -2.049879789352417, "logits/rejected": -2.285987138748169, "logps/chosen": -0.3378572165966034, "logps/rejected": -0.3730228543281555, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8889052271842957, "rewards/margins": 0.016585886478424072, "rewards/rejected": 0.8723193407058716, "step": 4522 }, { "epoch": 2.44, "learning_rate": 8.875631871889732e-09, "logits/chosen": -2.1373746395111084, "logits/rejected": -2.130033493041992, "logps/chosen": -4.432339668273926, "logps/rejected": -5.913773536682129, "loss": 0.3749, "rewards/accuracies": 1.0, "rewards/chosen": 1.1178910732269287, "rewards/margins": 0.7879111766815186, "rewards/rejected": 0.32997989654541016, "step": 4523 }, { "epoch": 2.44, "learning_rate": 8.859078273441461e-09, "logits/chosen": -2.3619625568389893, "logits/rejected": -2.304872751235962, "logps/chosen": -20.903236389160156, "logps/rejected": -5.551878929138184, "loss": 0.1388, "rewards/accuracies": 1.0, "rewards/chosen": 2.3275680541992188, "rewards/margins": 1.9047260284423828, "rewards/rejected": 0.42284202575683594, "step": 4524 }, { "epoch": 2.44, "learning_rate": 8.84253862552784e-09, "logits/chosen": -2.1413168907165527, "logits/rejected": -2.1440231800079346, "logps/chosen": -2.1589765548706055, "logps/rejected": -1.2341265678405762, "loss": 0.6602, "rewards/accuracies": 1.0, "rewards/chosen": 1.039526343345642, "rewards/margins": 0.06692636013031006, "rewards/rejected": 0.972599983215332, "step": 4525 }, { "epoch": 2.44, "learning_rate": 8.826012933757348e-09, "logits/chosen": -2.085308313369751, "logits/rejected": -2.2220120429992676, "logps/chosen": -3.956820011138916, "logps/rejected": -0.3488060534000397, "loss": 0.6502, "rewards/accuracies": 1.0, "rewards/chosen": 0.8398973345756531, "rewards/margins": 0.0877370834350586, "rewards/rejected": 0.7521602511405945, "step": 4526 }, { "epoch": 2.44, "learning_rate": 8.809501203733682e-09, "logits/chosen": -2.0585784912109375, "logits/rejected": -2.052274227142334, "logps/chosen": -4.548367977142334, "logps/rejected": -0.9161183834075928, "loss": 0.2476, "rewards/accuracies": 1.0, "rewards/chosen": 2.179400682449341, "rewards/margins": 1.2695903778076172, "rewards/rejected": 0.9098102450370789, "step": 4527 }, { "epoch": 2.44, "learning_rate": 8.79300344105584e-09, "logits/chosen": -1.966019630432129, "logits/rejected": -1.9630329608917236, "logps/chosen": -1.4126988649368286, "logps/rejected": -3.5372674465179443, "loss": 0.6626, "rewards/accuracies": 1.0, "rewards/chosen": 0.8860804438591003, "rewards/margins": 0.06211972236633301, "rewards/rejected": 0.8239607214927673, "step": 4528 }, { "epoch": 2.44, "learning_rate": 8.776519651318082e-09, "logits/chosen": -2.1154444217681885, "logits/rejected": -2.312696933746338, "logps/chosen": -1.1468186378479004, "logps/rejected": -1.0937459468841553, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.08867347240448, "rewards/margins": 0.018345355987548828, "rewards/rejected": 1.0703281164169312, "step": 4529 }, { "epoch": 2.44, "learning_rate": 8.760049840109895e-09, "logits/chosen": -2.1611440181732178, "logits/rejected": -2.1271169185638428, "logps/chosen": -14.228677749633789, "logps/rejected": -6.737906455993652, "loss": 0.3464, "rewards/accuracies": 1.0, "rewards/chosen": 1.6714776754379272, "rewards/margins": 0.8818105459213257, "rewards/rejected": 0.7896671295166016, "step": 4530 }, { "epoch": 2.44, "learning_rate": 8.743594013016064e-09, "logits/chosen": -2.102494716644287, "logits/rejected": -2.1054604053497314, "logps/chosen": -2.6311604976654053, "logps/rejected": -3.2092642784118652, "loss": 0.5576, "rewards/accuracies": 1.0, "rewards/chosen": 1.1564158201217651, "rewards/margins": 0.2924450635910034, "rewards/rejected": 0.8639707565307617, "step": 4531 }, { "epoch": 2.44, "learning_rate": 8.727152175616626e-09, "logits/chosen": -2.024989366531372, "logits/rejected": -2.276171922683716, "logps/chosen": -0.9403250217437744, "logps/rejected": -0.8744430541992188, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 1.046264886856079, "rewards/margins": 0.041084885597229004, "rewards/rejected": 1.00518000125885, "step": 4532 }, { "epoch": 2.44, "learning_rate": 8.710724333486863e-09, "logits/chosen": -2.1409969329833984, "logits/rejected": -2.224600315093994, "logps/chosen": -6.234312534332275, "logps/rejected": -5.619210720062256, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": 0.9932443499565125, "rewards/margins": 0.07834470272064209, "rewards/rejected": 0.9148996472358704, "step": 4533 }, { "epoch": 2.45, "learning_rate": 8.69431049219732e-09, "logits/chosen": -2.214063882827759, "logits/rejected": -2.267860174179077, "logps/chosen": -5.641195297241211, "logps/rejected": -17.401771545410156, "loss": 0.3326, "rewards/accuracies": 1.0, "rewards/chosen": 1.4070024490356445, "rewards/margins": 0.9299501180648804, "rewards/rejected": 0.4770523011684418, "step": 4534 }, { "epoch": 2.45, "learning_rate": 8.677910657313781e-09, "logits/chosen": -2.116952419281006, "logits/rejected": -2.10363507270813, "logps/chosen": -3.8894474506378174, "logps/rejected": -4.72846794128418, "loss": 0.334, "rewards/accuracies": 1.0, "rewards/chosen": 1.6809520721435547, "rewards/margins": 0.9249237775802612, "rewards/rejected": 0.7560282945632935, "step": 4535 }, { "epoch": 2.45, "learning_rate": 8.661524834397304e-09, "logits/chosen": -2.079671859741211, "logits/rejected": -2.2911641597747803, "logps/chosen": -0.8248157501220703, "logps/rejected": -0.9389686584472656, "loss": 0.7279, "rewards/accuracies": 0.0, "rewards/chosen": 0.8314575552940369, "rewards/margins": -0.06830120086669922, "rewards/rejected": 0.8997587561607361, "step": 4536 }, { "epoch": 2.45, "learning_rate": 8.64515302900416e-09, "logits/chosen": -2.1587107181549072, "logits/rejected": -2.1587963104248047, "logps/chosen": -2.5007057189941406, "logps/rejected": -6.911389350891113, "loss": 0.3339, "rewards/accuracies": 1.0, "rewards/chosen": 1.091783881187439, "rewards/margins": 0.9253316521644592, "rewards/rejected": 0.16645221412181854, "step": 4537 }, { "epoch": 2.45, "learning_rate": 8.628795246685893e-09, "logits/chosen": -2.087456703186035, "logits/rejected": -2.2400062084198, "logps/chosen": -0.44077345728874207, "logps/rejected": -0.40911075472831726, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 0.9757243990898132, "rewards/margins": -0.0018696188926696777, "rewards/rejected": 0.9775940179824829, "step": 4538 }, { "epoch": 2.45, "learning_rate": 8.612451492989315e-09, "logits/chosen": -1.9466567039489746, "logits/rejected": -1.9455193281173706, "logps/chosen": -1.3057470321655273, "logps/rejected": -1.329607367515564, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.9556581377983093, "rewards/margins": -0.012100696563720703, "rewards/rejected": 0.96775883436203, "step": 4539 }, { "epoch": 2.45, "learning_rate": 8.59612177345641e-09, "logits/chosen": -2.1052439212799072, "logits/rejected": -2.3672754764556885, "logps/chosen": -16.125757217407227, "logps/rejected": -13.688034057617188, "loss": 0.7701, "rewards/accuracies": 0.0, "rewards/chosen": 1.1456984281539917, "rewards/margins": -0.14848339557647705, "rewards/rejected": 1.2941818237304688, "step": 4540 }, { "epoch": 2.45, "learning_rate": 8.579806093624475e-09, "logits/chosen": -2.094496011734009, "logits/rejected": -2.101464033126831, "logps/chosen": -2.4722118377685547, "logps/rejected": -6.056501865386963, "loss": 0.3664, "rewards/accuracies": 1.0, "rewards/chosen": 1.1495970487594604, "rewards/margins": 0.8151977062225342, "rewards/rejected": 0.33439937233924866, "step": 4541 }, { "epoch": 2.45, "learning_rate": 8.563504459026011e-09, "logits/chosen": -2.0132389068603516, "logits/rejected": -2.3269214630126953, "logps/chosen": -0.7840400338172913, "logps/rejected": -0.8224374651908875, "loss": 0.6771, "rewards/accuracies": 1.0, "rewards/chosen": 0.8767148852348328, "rewards/margins": 0.032318294048309326, "rewards/rejected": 0.8443965911865234, "step": 4542 }, { "epoch": 2.45, "learning_rate": 8.547216875188756e-09, "logits/chosen": -2.0469706058502197, "logits/rejected": -2.0521583557128906, "logps/chosen": -1.48239004611969, "logps/rejected": -5.415999889373779, "loss": 0.4466, "rewards/accuracies": 1.0, "rewards/chosen": 1.1540592908859253, "rewards/margins": 0.5743758082389832, "rewards/rejected": 0.5796834826469421, "step": 4543 }, { "epoch": 2.45, "learning_rate": 8.530943347635716e-09, "logits/chosen": -2.0660815238952637, "logits/rejected": -2.069430112838745, "logps/chosen": -0.9231342077255249, "logps/rejected": -17.732120513916016, "loss": 0.6652, "rewards/accuracies": 1.0, "rewards/chosen": 1.0709892511367798, "rewards/margins": 0.05672764778137207, "rewards/rejected": 1.0142616033554077, "step": 4544 }, { "epoch": 2.45, "learning_rate": 8.514683881885065e-09, "logits/chosen": -1.991052508354187, "logits/rejected": -1.9460347890853882, "logps/chosen": -10.617622375488281, "logps/rejected": -1.5857564210891724, "loss": 0.4815, "rewards/accuracies": 1.0, "rewards/chosen": 1.4350557327270508, "rewards/margins": 0.48054295778274536, "rewards/rejected": 0.9545127749443054, "step": 4545 }, { "epoch": 2.45, "learning_rate": 8.498438483450277e-09, "logits/chosen": -2.158653736114502, "logits/rejected": -2.1855320930480957, "logps/chosen": -0.9529436230659485, "logps/rejected": -9.081031799316406, "loss": 0.4369, "rewards/accuracies": 1.0, "rewards/chosen": 1.0893980264663696, "rewards/margins": 0.6016886234283447, "rewards/rejected": 0.4877094328403473, "step": 4546 }, { "epoch": 2.45, "learning_rate": 8.482207157840033e-09, "logits/chosen": -2.05719256401062, "logits/rejected": -2.043959140777588, "logps/chosen": -6.170373916625977, "logps/rejected": -2.832613945007324, "loss": 0.4098, "rewards/accuracies": 1.0, "rewards/chosen": 1.4907182455062866, "rewards/margins": 0.680174708366394, "rewards/rejected": 0.8105435371398926, "step": 4547 }, { "epoch": 2.45, "learning_rate": 8.465989910558207e-09, "logits/chosen": -2.2196602821350098, "logits/rejected": -2.3217384815216064, "logps/chosen": -0.5142480134963989, "logps/rejected": -0.5309300422668457, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 1.0377310514450073, "rewards/margins": -0.0016425848007202148, "rewards/rejected": 1.0393736362457275, "step": 4548 }, { "epoch": 2.45, "learning_rate": 8.44978674710396e-09, "logits/chosen": -2.065009117126465, "logits/rejected": -2.276937484741211, "logps/chosen": -0.46649885177612305, "logps/rejected": -0.3742256164550781, "loss": 0.6751, "rewards/accuracies": 1.0, "rewards/chosen": 0.9642120599746704, "rewards/margins": 0.03639310598373413, "rewards/rejected": 0.9278189539909363, "step": 4549 }, { "epoch": 2.45, "learning_rate": 8.433597672971616e-09, "logits/chosen": -1.960018515586853, "logits/rejected": -2.250586986541748, "logps/chosen": -1.3719114065170288, "logps/rejected": -1.3178151845932007, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.964118480682373, "rewards/margins": -0.009418487548828125, "rewards/rejected": 0.9735369682312012, "step": 4550 }, { "epoch": 2.45, "learning_rate": 8.417422693650773e-09, "logits/chosen": -2.048367500305176, "logits/rejected": -2.050305128097534, "logps/chosen": -1.5240628719329834, "logps/rejected": -1.2359344959259033, "loss": 0.5951, "rewards/accuracies": 1.0, "rewards/chosen": 0.9712030291557312, "rewards/margins": 0.2068297266960144, "rewards/rejected": 0.7643733024597168, "step": 4551 }, { "epoch": 2.46, "learning_rate": 8.401261814626215e-09, "logits/chosen": -2.0736234188079834, "logits/rejected": -2.2817270755767822, "logps/chosen": -2.9761781692504883, "logps/rejected": -8.799515724182129, "loss": 0.7089, "rewards/accuracies": 0.0, "rewards/chosen": 0.8171722292900085, "rewards/margins": -0.03127986192703247, "rewards/rejected": 0.848452091217041, "step": 4552 }, { "epoch": 2.46, "learning_rate": 8.385115041377972e-09, "logits/chosen": -2.066392183303833, "logits/rejected": -2.281496286392212, "logps/chosen": -1.1967259645462036, "logps/rejected": -1.2476754188537598, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.1137757301330566, "rewards/margins": 0.016977906227111816, "rewards/rejected": 1.0967978239059448, "step": 4553 }, { "epoch": 2.46, "learning_rate": 8.368982379381279e-09, "logits/chosen": -2.084652900695801, "logits/rejected": -2.3062548637390137, "logps/chosen": -1.3302167654037476, "logps/rejected": -1.2133606672286987, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 1.0589560270309448, "rewards/margins": 0.024150371551513672, "rewards/rejected": 1.0348056554794312, "step": 4554 }, { "epoch": 2.46, "learning_rate": 8.352863834106561e-09, "logits/chosen": -2.0404293537139893, "logits/rejected": -2.0435385704040527, "logps/chosen": -1.5287160873413086, "logps/rejected": -1.5371472835540771, "loss": 0.5281, "rewards/accuracies": 1.0, "rewards/chosen": 1.0741174221038818, "rewards/margins": 0.36279016733169556, "rewards/rejected": 0.7113272547721863, "step": 4555 }, { "epoch": 2.46, "learning_rate": 8.336759411019495e-09, "logits/chosen": -2.1502010822296143, "logits/rejected": -2.149151086807251, "logps/chosen": -3.0622901916503906, "logps/rejected": -2.285985231399536, "loss": 0.441, "rewards/accuracies": 1.0, "rewards/chosen": 1.3994807004928589, "rewards/margins": 0.5902107954025269, "rewards/rejected": 0.809269905090332, "step": 4556 }, { "epoch": 2.46, "learning_rate": 8.320669115580964e-09, "logits/chosen": -2.116069793701172, "logits/rejected": -2.260291337966919, "logps/chosen": -0.3607337474822998, "logps/rejected": -0.35313835740089417, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.8480507135391235, "rewards/margins": 0.02856987714767456, "rewards/rejected": 0.819480836391449, "step": 4557 }, { "epoch": 2.46, "learning_rate": 8.304592953247019e-09, "logits/chosen": -2.0300984382629395, "logits/rejected": -2.0139973163604736, "logps/chosen": -6.7333197593688965, "logps/rejected": -4.768036842346191, "loss": 0.3402, "rewards/accuracies": 1.0, "rewards/chosen": 1.3956807851791382, "rewards/margins": 0.9031744599342346, "rewards/rejected": 0.49250632524490356, "step": 4558 }, { "epoch": 2.46, "learning_rate": 8.288530929468984e-09, "logits/chosen": -2.2044010162353516, "logits/rejected": -2.290132522583008, "logps/chosen": -5.405877590179443, "logps/rejected": -1.3993847370147705, "loss": 0.793, "rewards/accuracies": 0.0, "rewards/chosen": 1.0785261392593384, "rewards/margins": -0.19071340560913086, "rewards/rejected": 1.2692395448684692, "step": 4559 }, { "epoch": 2.46, "learning_rate": 8.272483049693318e-09, "logits/chosen": -2.0625810623168945, "logits/rejected": -2.254974126815796, "logps/chosen": -1.799370527267456, "logps/rejected": -1.8036327362060547, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.7171037793159485, "rewards/margins": 0.026172518730163574, "rewards/rejected": 0.6909312605857849, "step": 4560 }, { "epoch": 2.46, "learning_rate": 8.256449319361746e-09, "logits/chosen": -2.0422825813293457, "logits/rejected": -2.275451421737671, "logps/chosen": -0.5729169249534607, "logps/rejected": -0.5517639517784119, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 1.0175081491470337, "rewards/margins": 0.018131375312805176, "rewards/rejected": 0.9993767738342285, "step": 4561 }, { "epoch": 2.46, "learning_rate": 8.240429743911153e-09, "logits/chosen": -1.9727271795272827, "logits/rejected": -1.9879692792892456, "logps/chosen": -1.4516265392303467, "logps/rejected": -7.380852699279785, "loss": 0.3839, "rewards/accuracies": 1.0, "rewards/chosen": 1.316635251045227, "rewards/margins": 0.7592394948005676, "rewards/rejected": 0.5573957562446594, "step": 4562 }, { "epoch": 2.46, "learning_rate": 8.224424328773656e-09, "logits/chosen": -2.0870461463928223, "logits/rejected": -2.0727362632751465, "logps/chosen": -0.7265646457672119, "logps/rejected": -5.877294540405273, "loss": 0.4195, "rewards/accuracies": 1.0, "rewards/chosen": 1.0850015878677368, "rewards/margins": 0.6515705585479736, "rewards/rejected": 0.43343105912208557, "step": 4563 }, { "epoch": 2.46, "learning_rate": 8.208433079376553e-09, "logits/chosen": -2.23245906829834, "logits/rejected": -2.057206630706787, "logps/chosen": -48.600006103515625, "logps/rejected": -10.973916053771973, "loss": 0.1905, "rewards/accuracies": 1.0, "rewards/chosen": 2.54636549949646, "rewards/margins": 1.5611854791641235, "rewards/rejected": 0.9851800203323364, "step": 4564 }, { "epoch": 2.46, "learning_rate": 8.192456001142318e-09, "logits/chosen": -2.209064245223999, "logits/rejected": -2.2396395206451416, "logps/chosen": -5.734951019287109, "logps/rejected": -24.386173248291016, "loss": 0.7377, "rewards/accuracies": 0.0, "rewards/chosen": 1.0660266876220703, "rewards/margins": -0.08722496032714844, "rewards/rejected": 1.1532516479492188, "step": 4565 }, { "epoch": 2.46, "learning_rate": 8.176493099488663e-09, "logits/chosen": -2.0282227993011475, "logits/rejected": -2.26884126663208, "logps/chosen": -0.24539715051651, "logps/rejected": -0.2757076025009155, "loss": 0.7048, "rewards/accuracies": 0.0, "rewards/chosen": 0.8861799240112305, "rewards/margins": -0.023200690746307373, "rewards/rejected": 0.9093806147575378, "step": 4566 }, { "epoch": 2.46, "learning_rate": 8.160544379828471e-09, "logits/chosen": -2.0643045902252197, "logits/rejected": -2.295644760131836, "logps/chosen": -0.9391084313392639, "logps/rejected": -1.0223565101623535, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.9466953277587891, "rewards/margins": 0.022469520568847656, "rewards/rejected": 0.9242258071899414, "step": 4567 }, { "epoch": 2.46, "learning_rate": 8.144609847569806e-09, "logits/chosen": -2.0082366466522217, "logits/rejected": -2.306401491165161, "logps/chosen": -0.3303287625312805, "logps/rejected": -0.33962583541870117, "loss": 0.68, "rewards/accuracies": 1.0, "rewards/chosen": 0.9886744618415833, "rewards/margins": 0.026558995246887207, "rewards/rejected": 0.962115466594696, "step": 4568 }, { "epoch": 2.46, "learning_rate": 8.128689508115927e-09, "logits/chosen": -2.13452410697937, "logits/rejected": -2.3078155517578125, "logps/chosen": -3.533670663833618, "logps/rejected": -3.9885692596435547, "loss": 0.8239, "rewards/accuracies": 0.0, "rewards/chosen": 0.8625627756118774, "rewards/margins": -0.24634873867034912, "rewards/rejected": 1.1089115142822266, "step": 4569 }, { "epoch": 2.46, "learning_rate": 8.1127833668653e-09, "logits/chosen": -2.054870128631592, "logits/rejected": -2.2745048999786377, "logps/chosen": -0.4404703974723816, "logps/rejected": -0.5196011066436768, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.906507134437561, "rewards/margins": -0.005633413791656494, "rewards/rejected": 0.9121405482292175, "step": 4570 }, { "epoch": 2.47, "learning_rate": 8.096891429211556e-09, "logits/chosen": -2.0300307273864746, "logits/rejected": -2.327414035797119, "logps/chosen": -1.5654354095458984, "logps/rejected": -4.3716535568237305, "loss": 0.5899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9167844653129578, "rewards/margins": 0.21833109855651855, "rewards/rejected": 0.6984533667564392, "step": 4571 }, { "epoch": 2.47, "learning_rate": 8.081013700543521e-09, "logits/chosen": -2.1614620685577393, "logits/rejected": -2.1292073726654053, "logps/chosen": -4.717005252838135, "logps/rejected": -8.382651329040527, "loss": 0.2714, "rewards/accuracies": 1.0, "rewards/chosen": 1.3224416971206665, "rewards/margins": 1.1653896570205688, "rewards/rejected": 0.15705204010009766, "step": 4572 }, { "epoch": 2.47, "learning_rate": 8.065150186245184e-09, "logits/chosen": -2.0440568923950195, "logits/rejected": -2.3306596279144287, "logps/chosen": -0.4886167347431183, "logps/rejected": -0.500748872756958, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.8421022295951843, "rewards/margins": 0.0143890380859375, "rewards/rejected": 0.8277131915092468, "step": 4573 }, { "epoch": 2.47, "learning_rate": 8.049300891695743e-09, "logits/chosen": -2.0918221473693848, "logits/rejected": -2.092076539993286, "logps/chosen": -2.1445908546447754, "logps/rejected": -0.46756693720817566, "loss": 0.3988, "rewards/accuracies": 1.0, "rewards/chosen": 1.4850858449935913, "rewards/margins": 0.7132099270820618, "rewards/rejected": 0.7718759179115295, "step": 4574 }, { "epoch": 2.47, "learning_rate": 8.033465822269536e-09, "logits/chosen": -1.9929802417755127, "logits/rejected": -2.2192375659942627, "logps/chosen": -0.4913502335548401, "logps/rejected": -0.41498634219169617, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.7953258752822876, "rewards/margins": 0.01624464988708496, "rewards/rejected": 0.7790812253952026, "step": 4575 }, { "epoch": 2.47, "learning_rate": 8.017644983336114e-09, "logits/chosen": -2.0744524002075195, "logits/rejected": -2.0690016746520996, "logps/chosen": -7.828645706176758, "logps/rejected": -8.99008846282959, "loss": 0.278, "rewards/accuracies": 1.0, "rewards/chosen": 1.5517339706420898, "rewards/margins": 1.1380343437194824, "rewards/rejected": 0.4136996269226074, "step": 4576 }, { "epoch": 2.47, "learning_rate": 8.0018383802602e-09, "logits/chosen": -2.0772922039031982, "logits/rejected": -2.316272735595703, "logps/chosen": -0.2464122623205185, "logps/rejected": -0.28001439571380615, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8713237047195435, "rewards/margins": -0.0010193586349487305, "rewards/rejected": 0.8723430633544922, "step": 4577 }, { "epoch": 2.47, "learning_rate": 7.98604601840165e-09, "logits/chosen": -1.9963014125823975, "logits/rejected": -1.9316775798797607, "logps/chosen": -15.639530181884766, "logps/rejected": -1.6188747882843018, "loss": 0.3412, "rewards/accuracies": 1.0, "rewards/chosen": 1.7954128980636597, "rewards/margins": 0.8997552394866943, "rewards/rejected": 0.8956576585769653, "step": 4578 }, { "epoch": 2.47, "learning_rate": 7.970267903115535e-09, "logits/chosen": -1.9988086223602295, "logits/rejected": -2.008073329925537, "logps/chosen": -3.3323569297790527, "logps/rejected": -1.570129156112671, "loss": 0.6285, "rewards/accuracies": 1.0, "rewards/chosen": 1.080530047416687, "rewards/margins": 0.13386619091033936, "rewards/rejected": 0.9466638565063477, "step": 4579 }, { "epoch": 2.47, "learning_rate": 7.954504039752075e-09, "logits/chosen": -2.0097835063934326, "logits/rejected": -2.2788853645324707, "logps/chosen": -0.15687784552574158, "logps/rejected": -0.15350216627120972, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9305326342582703, "rewards/margins": 0.0045212507247924805, "rewards/rejected": 0.9260113835334778, "step": 4580 }, { "epoch": 2.47, "learning_rate": 7.938754433656664e-09, "logits/chosen": -2.13567852973938, "logits/rejected": -2.3186872005462646, "logps/chosen": -0.5786262154579163, "logps/rejected": -0.5094774961471558, "loss": 0.6783, "rewards/accuracies": 1.0, "rewards/chosen": 0.9760549664497375, "rewards/margins": 0.029897093772888184, "rewards/rejected": 0.9461578726768494, "step": 4581 }, { "epoch": 2.47, "learning_rate": 7.923019090169875e-09, "logits/chosen": -2.141702651977539, "logits/rejected": -2.1429946422576904, "logps/chosen": -0.24442654848098755, "logps/rejected": -6.071053504943848, "loss": 0.4298, "rewards/accuracies": 1.0, "rewards/chosen": 0.9646590352058411, "rewards/margins": 0.6218177080154419, "rewards/rejected": 0.34284135699272156, "step": 4582 }, { "epoch": 2.47, "learning_rate": 7.907298014627395e-09, "logits/chosen": -2.0995402336120605, "logits/rejected": -2.2958221435546875, "logps/chosen": -3.443361520767212, "logps/rejected": -1.7652475833892822, "loss": 0.7357, "rewards/accuracies": 0.0, "rewards/chosen": 0.9258602261543274, "rewards/margins": -0.08335751295089722, "rewards/rejected": 1.0092177391052246, "step": 4583 }, { "epoch": 2.47, "learning_rate": 7.891591212360132e-09, "logits/chosen": -1.9943935871124268, "logits/rejected": -2.0030322074890137, "logps/chosen": -1.5397766828536987, "logps/rejected": -2.914393186569214, "loss": 0.5007, "rewards/accuracies": 1.0, "rewards/chosen": 1.0501002073287964, "rewards/margins": 0.4309217929840088, "rewards/rejected": 0.6191784143447876, "step": 4584 }, { "epoch": 2.47, "learning_rate": 7.87589868869411e-09, "logits/chosen": -2.0651493072509766, "logits/rejected": -2.260038137435913, "logps/chosen": -0.44173353910446167, "logps/rejected": -0.4152691066265106, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8330594897270203, "rewards/margins": 0.017950057983398438, "rewards/rejected": 0.8151094317436218, "step": 4585 }, { "epoch": 2.47, "learning_rate": 7.860220448950538e-09, "logits/chosen": -2.0569911003112793, "logits/rejected": -2.2986762523651123, "logps/chosen": -0.1679522693157196, "logps/rejected": -0.17549768090248108, "loss": 0.6843, "rewards/accuracies": 1.0, "rewards/chosen": 0.7309731245040894, "rewards/margins": 0.017732322216033936, "rewards/rejected": 0.7132408022880554, "step": 4586 }, { "epoch": 2.47, "learning_rate": 7.844556498445787e-09, "logits/chosen": -2.025747537612915, "logits/rejected": -2.2491438388824463, "logps/chosen": -0.1924736499786377, "logps/rejected": -0.25878995656967163, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.9901329874992371, "rewards/margins": 0.004479944705963135, "rewards/rejected": 0.9856530427932739, "step": 4587 }, { "epoch": 2.47, "learning_rate": 7.828906842491345e-09, "logits/chosen": -2.0146656036376953, "logits/rejected": -2.2201383113861084, "logps/chosen": -0.8743436336517334, "logps/rejected": -0.7562636137008667, "loss": 0.6914, "rewards/accuracies": 1.0, "rewards/chosen": 0.9324274063110352, "rewards/margins": 0.0035291314125061035, "rewards/rejected": 0.928898274898529, "step": 4588 }, { "epoch": 2.48, "learning_rate": 7.813271486393886e-09, "logits/chosen": -2.0976784229278564, "logits/rejected": -1.9820080995559692, "logps/chosen": -24.893016815185547, "logps/rejected": -4.439820766448975, "loss": 0.4255, "rewards/accuracies": 1.0, "rewards/chosen": 1.6104854345321655, "rewards/margins": 0.6342999935150146, "rewards/rejected": 0.9761854410171509, "step": 4589 }, { "epoch": 2.48, "learning_rate": 7.797650435455222e-09, "logits/chosen": -2.164475202560425, "logits/rejected": -2.3258719444274902, "logps/chosen": -0.6594605445861816, "logps/rejected": -2.9631175994873047, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0398510694503784, "rewards/margins": 0.02217864990234375, "rewards/rejected": 1.0176724195480347, "step": 4590 }, { "epoch": 2.48, "learning_rate": 7.782043694972324e-09, "logits/chosen": -2.13992977142334, "logits/rejected": -2.1419990062713623, "logps/chosen": -2.203014850616455, "logps/rejected": -2.8381714820861816, "loss": 0.5921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8832229971885681, "rewards/margins": 0.21356725692749023, "rewards/rejected": 0.6696557402610779, "step": 4591 }, { "epoch": 2.48, "learning_rate": 7.766451270237317e-09, "logits/chosen": -2.149303674697876, "logits/rejected": -2.275946855545044, "logps/chosen": -2.2927732467651367, "logps/rejected": -2.458156108856201, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.6951069235801697, "rewards/margins": 0.025879383087158203, "rewards/rejected": 0.6692275404930115, "step": 4592 }, { "epoch": 2.48, "learning_rate": 7.750873166537431e-09, "logits/chosen": -2.120149850845337, "logits/rejected": -2.096658706665039, "logps/chosen": -16.120418548583984, "logps/rejected": -8.829240798950195, "loss": 0.1822, "rewards/accuracies": 1.0, "rewards/chosen": 1.7898956537246704, "rewards/margins": 1.6101319789886475, "rewards/rejected": 0.17976370453834534, "step": 4593 }, { "epoch": 2.48, "learning_rate": 7.735309389155075e-09, "logits/chosen": -2.138253927230835, "logits/rejected": -2.1298227310180664, "logps/chosen": -1.025628924369812, "logps/rejected": -9.116925239562988, "loss": 0.3564, "rewards/accuracies": 1.0, "rewards/chosen": 1.1637226343154907, "rewards/margins": 0.8483625650405884, "rewards/rejected": 0.31536006927490234, "step": 4594 }, { "epoch": 2.48, "learning_rate": 7.719759943367816e-09, "logits/chosen": -2.0279808044433594, "logits/rejected": -2.026334047317505, "logps/chosen": -0.27523383498191833, "logps/rejected": -4.51791524887085, "loss": 0.462, "rewards/accuracies": 1.0, "rewards/chosen": 0.9416099786758423, "rewards/margins": 0.5322704315185547, "rewards/rejected": 0.40933957695961, "step": 4595 }, { "epoch": 2.48, "learning_rate": 7.704224834448308e-09, "logits/chosen": -2.0416646003723145, "logits/rejected": -2.3402771949768066, "logps/chosen": -2.5605945587158203, "logps/rejected": -2.2269346714019775, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.8966407775878906, "rewards/margins": 0.008668720722198486, "rewards/rejected": 0.8879720568656921, "step": 4596 }, { "epoch": 2.48, "learning_rate": 7.688704067664398e-09, "logits/chosen": -2.184260606765747, "logits/rejected": -2.325298309326172, "logps/chosen": -10.360516548156738, "logps/rejected": -13.070638656616211, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 1.1049888134002686, "rewards/margins": 0.005694270133972168, "rewards/rejected": 1.0992945432662964, "step": 4597 }, { "epoch": 2.48, "learning_rate": 7.673197648279022e-09, "logits/chosen": -2.043057441711426, "logits/rejected": -2.2651991844177246, "logps/chosen": -2.2852323055267334, "logps/rejected": -2.1744816303253174, "loss": 0.6973, "rewards/accuracies": 0.0, "rewards/chosen": 0.8640660643577576, "rewards/margins": -0.008337259292602539, "rewards/rejected": 0.8724033236503601, "step": 4598 }, { "epoch": 2.48, "learning_rate": 7.657705581550283e-09, "logits/chosen": -2.038954257965088, "logits/rejected": -2.0382020473480225, "logps/chosen": -3.4597628116607666, "logps/rejected": -2.6886672973632812, "loss": 0.4533, "rewards/accuracies": 1.0, "rewards/chosen": 1.3778918981552124, "rewards/margins": 0.5559291243553162, "rewards/rejected": 0.8219627737998962, "step": 4599 }, { "epoch": 2.48, "learning_rate": 7.642227872731417e-09, "logits/chosen": -2.0263025760650635, "logits/rejected": -2.0199007987976074, "logps/chosen": -2.232473373413086, "logps/rejected": -5.388129711151123, "loss": 0.3414, "rewards/accuracies": 1.0, "rewards/chosen": 1.3052736520767212, "rewards/margins": 0.8992118835449219, "rewards/rejected": 0.4060617983341217, "step": 4600 }, { "epoch": 2.48, "learning_rate": 7.626764527070773e-09, "logits/chosen": -2.1709015369415283, "logits/rejected": -2.1678197383880615, "logps/chosen": -7.029999256134033, "logps/rejected": -5.8150553703308105, "loss": 0.3481, "rewards/accuracies": 1.0, "rewards/chosen": 1.2652561664581299, "rewards/margins": 0.8761888742446899, "rewards/rejected": 0.38906732201576233, "step": 4601 }, { "epoch": 2.48, "learning_rate": 7.611315549811853e-09, "logits/chosen": -2.2421786785125732, "logits/rejected": -2.4259188175201416, "logps/chosen": -10.349943161010742, "logps/rejected": -26.55596923828125, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 1.1322778463363647, "rewards/margins": 0.12781083583831787, "rewards/rejected": 1.0044670104980469, "step": 4602 }, { "epoch": 2.48, "learning_rate": 7.59588094619325e-09, "logits/chosen": -2.2090682983398438, "logits/rejected": -2.2057900428771973, "logps/chosen": -2.963827610015869, "logps/rejected": -6.1309309005737305, "loss": 0.4337, "rewards/accuracies": 1.0, "rewards/chosen": 1.0105453729629517, "rewards/margins": 0.6106298565864563, "rewards/rejected": 0.39991551637649536, "step": 4603 }, { "epoch": 2.48, "learning_rate": 7.580460721448722e-09, "logits/chosen": -2.007457733154297, "logits/rejected": -2.280611515045166, "logps/chosen": -1.137535810470581, "logps/rejected": -0.9954732656478882, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 1.007814645767212, "rewards/margins": 0.0005897283554077148, "rewards/rejected": 1.0072249174118042, "step": 4604 }, { "epoch": 2.48, "learning_rate": 7.565054880807148e-09, "logits/chosen": -2.0750131607055664, "logits/rejected": -2.264810562133789, "logps/chosen": -1.2577712535858154, "logps/rejected": -1.2933257818222046, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.883181095123291, "rewards/margins": 0.031454265117645264, "rewards/rejected": 0.8517268300056458, "step": 4605 }, { "epoch": 2.48, "learning_rate": 7.549663429492486e-09, "logits/chosen": -2.0900115966796875, "logits/rejected": -2.0567474365234375, "logps/chosen": -11.318292617797852, "logps/rejected": -3.580460548400879, "loss": 0.3306, "rewards/accuracies": 1.0, "rewards/chosen": 1.6497503519058228, "rewards/margins": 0.9368853569030762, "rewards/rejected": 0.7128649950027466, "step": 4606 }, { "epoch": 2.48, "learning_rate": 7.53428637272387e-09, "logits/chosen": -2.1862990856170654, "logits/rejected": -2.2766408920288086, "logps/chosen": -0.559476375579834, "logps/rejected": -0.5299054384231567, "loss": 0.6841, "rewards/accuracies": 1.0, "rewards/chosen": 1.0637727975845337, "rewards/margins": 0.01811087131500244, "rewards/rejected": 1.0456619262695312, "step": 4607 }, { "epoch": 2.49, "learning_rate": 7.518923715715508e-09, "logits/chosen": -2.0977704524993896, "logits/rejected": -2.0020968914031982, "logps/chosen": -5.503671646118164, "logps/rejected": -4.847795009613037, "loss": 0.3566, "rewards/accuracies": 1.0, "rewards/chosen": 1.6391571760177612, "rewards/margins": 0.8474254608154297, "rewards/rejected": 0.7917317152023315, "step": 4608 }, { "epoch": 2.49, "learning_rate": 7.503575463676758e-09, "logits/chosen": -2.0580921173095703, "logits/rejected": -2.0530076026916504, "logps/chosen": -3.127683639526367, "logps/rejected": -6.237268447875977, "loss": 0.2382, "rewards/accuracies": 1.0, "rewards/chosen": 1.5443241596221924, "rewards/margins": 1.3134138584136963, "rewards/rejected": 0.2309103012084961, "step": 4609 }, { "epoch": 2.49, "learning_rate": 7.488241621812081e-09, "logits/chosen": -2.0579237937927246, "logits/rejected": -2.050492286682129, "logps/chosen": -2.6689670085906982, "logps/rejected": -6.003930568695068, "loss": 0.3209, "rewards/accuracies": 1.0, "rewards/chosen": 1.3561921119689941, "rewards/margins": 0.9720652103424072, "rewards/rejected": 0.3841269016265869, "step": 4610 }, { "epoch": 2.49, "learning_rate": 7.472922195321046e-09, "logits/chosen": -2.125028371810913, "logits/rejected": -2.288663148880005, "logps/chosen": -1.0434023141860962, "logps/rejected": -1.0727028846740723, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587768912315369, "rewards/margins": 0.01953887939453125, "rewards/rejected": 0.9392380118370056, "step": 4611 }, { "epoch": 2.49, "learning_rate": 7.457617189398357e-09, "logits/chosen": -2.0500431060791016, "logits/rejected": -2.2389588356018066, "logps/chosen": -2.430354595184326, "logps/rejected": -2.5741660594940186, "loss": 0.6644, "rewards/accuracies": 1.0, "rewards/chosen": 0.6971048712730408, "rewards/margins": 0.05833941698074341, "rewards/rejected": 0.6387654542922974, "step": 4612 }, { "epoch": 2.49, "learning_rate": 7.4423266092337845e-09, "logits/chosen": -2.1295130252838135, "logits/rejected": -2.125898599624634, "logps/chosen": -6.126676559448242, "logps/rejected": -3.10150146484375, "loss": 0.3591, "rewards/accuracies": 1.0, "rewards/chosen": 1.4785207509994507, "rewards/margins": 0.839171290397644, "rewards/rejected": 0.6393494606018066, "step": 4613 }, { "epoch": 2.49, "learning_rate": 7.427050460012252e-09, "logits/chosen": -1.9999728202819824, "logits/rejected": -2.0037479400634766, "logps/chosen": -1.8123962879180908, "logps/rejected": -3.4836180210113525, "loss": 0.431, "rewards/accuracies": 1.0, "rewards/chosen": 1.1751693487167358, "rewards/margins": 0.6183580756187439, "rewards/rejected": 0.5568112730979919, "step": 4614 }, { "epoch": 2.49, "learning_rate": 7.411788746913772e-09, "logits/chosen": -2.026808023452759, "logits/rejected": -2.0251522064208984, "logps/chosen": -1.9139429330825806, "logps/rejected": -5.041116714477539, "loss": 0.3411, "rewards/accuracies": 1.0, "rewards/chosen": 1.5147374868392944, "rewards/margins": 0.9000665545463562, "rewards/rejected": 0.6146709322929382, "step": 4615 }, { "epoch": 2.49, "learning_rate": 7.3965414751134435e-09, "logits/chosen": -2.1187045574188232, "logits/rejected": -2.274831771850586, "logps/chosen": -0.7422263026237488, "logps/rejected": -0.9145345687866211, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.6816949248313904, "rewards/margins": 0.04621070623397827, "rewards/rejected": 0.6354842185974121, "step": 4616 }, { "epoch": 2.49, "learning_rate": 7.381308649781498e-09, "logits/chosen": -2.0707201957702637, "logits/rejected": -2.072675943374634, "logps/chosen": -0.394381582736969, "logps/rejected": -4.513882160186768, "loss": 0.4637, "rewards/accuracies": 1.0, "rewards/chosen": 1.0649317502975464, "rewards/margins": 0.5276646614074707, "rewards/rejected": 0.5372670888900757, "step": 4617 }, { "epoch": 2.49, "learning_rate": 7.366090276083264e-09, "logits/chosen": -2.1101181507110596, "logits/rejected": -2.220384120941162, "logps/chosen": -18.125808715820312, "logps/rejected": -21.200523376464844, "loss": 0.3099, "rewards/accuracies": 1.0, "rewards/chosen": 1.7320629358291626, "rewards/margins": 1.012528419494629, "rewards/rejected": 0.7195345163345337, "step": 4618 }, { "epoch": 2.49, "learning_rate": 7.3508863591791316e-09, "logits/chosen": -2.182633876800537, "logits/rejected": -2.3739187717437744, "logps/chosen": -0.5494048595428467, "logps/rejected": -0.572002649307251, "loss": 0.6891, "rewards/accuracies": 1.0, "rewards/chosen": 1.0767974853515625, "rewards/margins": 0.008205175399780273, "rewards/rejected": 1.0685923099517822, "step": 4619 }, { "epoch": 2.49, "learning_rate": 7.335696904224658e-09, "logits/chosen": -2.2664217948913574, "logits/rejected": -2.2095460891723633, "logps/chosen": -11.451913833618164, "logps/rejected": -10.115270614624023, "loss": 0.1965, "rewards/accuracies": 1.0, "rewards/chosen": 1.9974721670150757, "rewards/margins": 1.5271003246307373, "rewards/rejected": 0.470371812582016, "step": 4620 }, { "epoch": 2.49, "learning_rate": 7.320521916370437e-09, "logits/chosen": -2.1388819217681885, "logits/rejected": -2.3358988761901855, "logps/chosen": -2.149440050125122, "logps/rejected": -1.9606586694717407, "loss": 0.695, "rewards/accuracies": 0.0, "rewards/chosen": 0.8428074717521667, "rewards/margins": -0.0036234259605407715, "rewards/rejected": 0.8464308977127075, "step": 4621 }, { "epoch": 2.49, "learning_rate": 7.305361400762183e-09, "logits/chosen": -2.099836587905884, "logits/rejected": -2.0618770122528076, "logps/chosen": -5.402207374572754, "logps/rejected": -2.4618513584136963, "loss": 0.3413, "rewards/accuracies": 1.0, "rewards/chosen": 1.585249423980713, "rewards/margins": 0.8994808793067932, "rewards/rejected": 0.6857685446739197, "step": 4622 }, { "epoch": 2.49, "learning_rate": 7.290215362540686e-09, "logits/chosen": -2.0212206840515137, "logits/rejected": -2.0209569931030273, "logps/chosen": -1.85603928565979, "logps/rejected": -5.197309494018555, "loss": 0.3145, "rewards/accuracies": 1.0, "rewards/chosen": 1.5294544696807861, "rewards/margins": 0.9954070448875427, "rewards/rejected": 0.5340474247932434, "step": 4623 }, { "epoch": 2.49, "learning_rate": 7.2750838068418475e-09, "logits/chosen": -2.0761141777038574, "logits/rejected": -2.074784517288208, "logps/chosen": -4.262286186218262, "logps/rejected": -3.851764678955078, "loss": 0.5636, "rewards/accuracies": 1.0, "rewards/chosen": 0.8825114369392395, "rewards/margins": 0.2784043550491333, "rewards/rejected": 0.6041070818901062, "step": 4624 }, { "epoch": 2.49, "learning_rate": 7.259966738796658e-09, "logits/chosen": -2.1315605640411377, "logits/rejected": -2.122359275817871, "logps/chosen": -12.287748336791992, "logps/rejected": -1.24152410030365, "loss": 0.4087, "rewards/accuracies": 1.0, "rewards/chosen": 1.417889952659607, "rewards/margins": 0.6835854649543762, "rewards/rejected": 0.7343044877052307, "step": 4625 }, { "epoch": 2.5, "learning_rate": 7.244864163531161e-09, "logits/chosen": -2.1316304206848145, "logits/rejected": -2.1320855617523193, "logps/chosen": -0.8976516723632812, "logps/rejected": -1.937997579574585, "loss": 0.5069, "rewards/accuracies": 1.0, "rewards/chosen": 1.1906944513320923, "rewards/margins": 0.41520118713378906, "rewards/rejected": 0.7754932641983032, "step": 4626 }, { "epoch": 2.5, "learning_rate": 7.229776086166528e-09, "logits/chosen": -2.090245008468628, "logits/rejected": -2.087200164794922, "logps/chosen": -3.1258444786071777, "logps/rejected": -2.886457920074463, "loss": 0.4976, "rewards/accuracies": 1.0, "rewards/chosen": 1.1636778116226196, "rewards/margins": 0.43883490562438965, "rewards/rejected": 0.72484290599823, "step": 4627 }, { "epoch": 2.5, "learning_rate": 7.214702511819004e-09, "logits/chosen": -2.15167498588562, "logits/rejected": -2.3615102767944336, "logps/chosen": -0.41747555136680603, "logps/rejected": -16.17139434814453, "loss": 0.703, "rewards/accuracies": 0.0, "rewards/chosen": 0.841454029083252, "rewards/margins": -0.019685864448547363, "rewards/rejected": 0.8611398935317993, "step": 4628 }, { "epoch": 2.5, "learning_rate": 7.1996434455998736e-09, "logits/chosen": -1.965937614440918, "logits/rejected": -1.9652823209762573, "logps/chosen": -6.774374485015869, "logps/rejected": -3.561640977859497, "loss": 0.3352, "rewards/accuracies": 1.0, "rewards/chosen": 1.5001968145370483, "rewards/margins": 0.9205848574638367, "rewards/rejected": 0.5796119570732117, "step": 4629 }, { "epoch": 2.5, "learning_rate": 7.18459889261559e-09, "logits/chosen": -2.062028646469116, "logits/rejected": -2.0731441974639893, "logps/chosen": -4.1060791015625, "logps/rejected": -10.674301147460938, "loss": 0.2551, "rewards/accuracies": 1.0, "rewards/chosen": 1.7398979663848877, "rewards/margins": 1.235828161239624, "rewards/rejected": 0.5040697455406189, "step": 4630 }, { "epoch": 2.5, "learning_rate": 7.1695688579675895e-09, "logits/chosen": -2.033221483230591, "logits/rejected": -2.2591190338134766, "logps/chosen": -4.018194198608398, "logps/rejected": -3.748047351837158, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.631342351436615, "rewards/margins": -0.005257725715637207, "rewards/rejected": 0.6366000771522522, "step": 4631 }, { "epoch": 2.5, "learning_rate": 7.154553346752457e-09, "logits/chosen": -2.0303494930267334, "logits/rejected": -2.3109352588653564, "logps/chosen": -0.25383129715919495, "logps/rejected": -0.2912728488445282, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 0.7581759691238403, "rewards/margins": -0.005890905857086182, "rewards/rejected": 0.7640668749809265, "step": 4632 }, { "epoch": 2.5, "learning_rate": 7.139552364061796e-09, "logits/chosen": -2.0543010234832764, "logits/rejected": -2.271440267562866, "logps/chosen": -1.2242345809936523, "logps/rejected": -1.070069432258606, "loss": 0.7109, "rewards/accuracies": 0.0, "rewards/chosen": 0.9881840944290161, "rewards/margins": -0.035240769386291504, "rewards/rejected": 1.0234248638153076, "step": 4633 }, { "epoch": 2.5, "learning_rate": 7.124565914982328e-09, "logits/chosen": -2.131222724914551, "logits/rejected": -2.131605863571167, "logps/chosen": -0.16178065538406372, "logps/rejected": -5.2371368408203125, "loss": 0.4323, "rewards/accuracies": 1.0, "rewards/chosen": 1.010581612586975, "rewards/margins": 0.6146081686019897, "rewards/rejected": 0.39597341418266296, "step": 4634 }, { "epoch": 2.5, "learning_rate": 7.109594004595837e-09, "logits/chosen": -2.001163959503174, "logits/rejected": -2.281538963317871, "logps/chosen": -0.8095441460609436, "logps/rejected": -0.7380905151367188, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 1.0481036901474, "rewards/margins": 0.012140750885009766, "rewards/rejected": 1.0359629392623901, "step": 4635 }, { "epoch": 2.5, "learning_rate": 7.094636637979151e-09, "logits/chosen": -2.0928761959075928, "logits/rejected": -2.0933034420013428, "logps/chosen": -0.3486534357070923, "logps/rejected": -4.152026653289795, "loss": 0.4459, "rewards/accuracies": 1.0, "rewards/chosen": 1.0224310159683228, "rewards/margins": 0.5765107870101929, "rewards/rejected": 0.4459202289581299, "step": 4636 }, { "epoch": 2.5, "learning_rate": 7.079693820204197e-09, "logits/chosen": -1.9546525478363037, "logits/rejected": -2.272733211517334, "logps/chosen": -1.0665265321731567, "logps/rejected": -1.0305278301239014, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.831546425819397, "rewards/margins": 0.01683962345123291, "rewards/rejected": 0.8147068023681641, "step": 4637 }, { "epoch": 2.5, "learning_rate": 7.064765556337965e-09, "logits/chosen": -2.035214900970459, "logits/rejected": -2.2773094177246094, "logps/chosen": -3.0474443435668945, "logps/rejected": -2.7968411445617676, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.5061746835708618, "rewards/margins": 0.004407048225402832, "rewards/rejected": 0.501767635345459, "step": 4638 }, { "epoch": 2.5, "learning_rate": 7.0498518514424675e-09, "logits/chosen": -2.1234376430511475, "logits/rejected": -2.123042583465576, "logps/chosen": -2.5030362606048584, "logps/rejected": -1.57958984375, "loss": 0.5258, "rewards/accuracies": 1.0, "rewards/chosen": 1.3526443243026733, "rewards/margins": 0.3683732748031616, "rewards/rejected": 0.9842710494995117, "step": 4639 }, { "epoch": 2.5, "learning_rate": 7.034952710574859e-09, "logits/chosen": -2.088697910308838, "logits/rejected": -2.090440273284912, "logps/chosen": -2.0593090057373047, "logps/rejected": -5.230854511260986, "loss": 0.3911, "rewards/accuracies": 1.0, "rewards/chosen": 1.1337462663650513, "rewards/margins": 0.7369271516799927, "rewards/rejected": 0.3968190848827362, "step": 4640 }, { "epoch": 2.5, "learning_rate": 7.020068138787278e-09, "logits/chosen": -2.0080482959747314, "logits/rejected": -2.222529172897339, "logps/chosen": -0.3958718478679657, "logps/rejected": -0.3681322932243347, "loss": 0.6935, "rewards/accuracies": 0.0, "rewards/chosen": 0.9182316064834595, "rewards/margins": -0.0007414817810058594, "rewards/rejected": 0.9189730882644653, "step": 4641 }, { "epoch": 2.5, "learning_rate": 7.005198141126967e-09, "logits/chosen": -2.1179001331329346, "logits/rejected": -2.2786402702331543, "logps/chosen": -1.3778358697891235, "logps/rejected": -1.405979871749878, "loss": 0.7026, "rewards/accuracies": 0.0, "rewards/chosen": 0.930149495601654, "rewards/margins": -0.018842756748199463, "rewards/rejected": 0.9489922523498535, "step": 4642 }, { "epoch": 2.5, "learning_rate": 6.990342722636228e-09, "logits/chosen": -2.063538074493408, "logits/rejected": -2.069681167602539, "logps/chosen": -3.3858115673065186, "logps/rejected": -13.466800689697266, "loss": 0.3728, "rewards/accuracies": 1.0, "rewards/chosen": 1.0234425067901611, "rewards/margins": 0.79446941614151, "rewards/rejected": 0.22897310554981232, "step": 4643 }, { "epoch": 2.5, "learning_rate": 6.975501888352381e-09, "logits/chosen": -2.151881456375122, "logits/rejected": -2.1323301792144775, "logps/chosen": -11.918231964111328, "logps/rejected": -7.756526947021484, "loss": 0.2708, "rewards/accuracies": 1.0, "rewards/chosen": 1.384401559829712, "rewards/margins": 1.1678342819213867, "rewards/rejected": 0.2165672332048416, "step": 4644 }, { "epoch": 2.51, "learning_rate": 6.960675643307851e-09, "logits/chosen": -2.0278329849243164, "logits/rejected": -2.263904571533203, "logps/chosen": -1.5567529201507568, "logps/rejected": -4.1968994140625, "loss": 0.6208, "rewards/accuracies": 1.0, "rewards/chosen": 0.9435846209526062, "rewards/margins": 0.15032386779785156, "rewards/rejected": 0.7932607531547546, "step": 4645 }, { "epoch": 2.51, "learning_rate": 6.9458639925300655e-09, "logits/chosen": -2.11822247505188, "logits/rejected": -2.1436452865600586, "logps/chosen": -7.565850257873535, "logps/rejected": -1.6992436647415161, "loss": 0.4076, "rewards/accuracies": 1.0, "rewards/chosen": 1.5977482795715332, "rewards/margins": 0.6866700053215027, "rewards/rejected": 0.9110782742500305, "step": 4646 }, { "epoch": 2.51, "learning_rate": 6.9310669410415355e-09, "logits/chosen": -1.957276463508606, "logits/rejected": -1.9510842561721802, "logps/chosen": -4.443421363830566, "logps/rejected": -4.966991901397705, "loss": 0.3204, "rewards/accuracies": 1.0, "rewards/chosen": 1.452738881111145, "rewards/margins": 0.9735604524612427, "rewards/rejected": 0.47917839884757996, "step": 4647 }, { "epoch": 2.51, "learning_rate": 6.916284493859836e-09, "logits/chosen": -2.0543036460876465, "logits/rejected": -2.039076805114746, "logps/chosen": -14.00535774230957, "logps/rejected": -4.410677433013916, "loss": 0.5505, "rewards/accuracies": 1.0, "rewards/chosen": 1.381628394126892, "rewards/margins": 0.3089677095413208, "rewards/rejected": 1.0726606845855713, "step": 4648 }, { "epoch": 2.51, "learning_rate": 6.901516655997536e-09, "logits/chosen": -2.079026460647583, "logits/rejected": -2.0546324253082275, "logps/chosen": -5.865802764892578, "logps/rejected": -3.7871925830841064, "loss": 0.3348, "rewards/accuracies": 1.0, "rewards/chosen": 1.5284641981124878, "rewards/margins": 0.9222967624664307, "rewards/rejected": 0.6061674356460571, "step": 4649 }, { "epoch": 2.51, "learning_rate": 6.886763432462317e-09, "logits/chosen": -2.0403034687042236, "logits/rejected": -2.2898709774017334, "logps/chosen": -4.319334983825684, "logps/rejected": -4.19854211807251, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.547530472278595, "rewards/margins": 0.016053318977355957, "rewards/rejected": 0.531477153301239, "step": 4650 }, { "epoch": 2.51, "learning_rate": 6.8720248282568395e-09, "logits/chosen": -1.975460171699524, "logits/rejected": -2.216665267944336, "logps/chosen": -0.5144492983818054, "logps/rejected": -0.5172232389450073, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9482159614562988, "rewards/margins": 0.00858694314956665, "rewards/rejected": 0.9396290183067322, "step": 4651 }, { "epoch": 2.51, "learning_rate": 6.857300848378856e-09, "logits/chosen": -1.9959410429000854, "logits/rejected": -2.0148684978485107, "logps/chosen": -1.8401490449905396, "logps/rejected": -5.606840133666992, "loss": 0.4017, "rewards/accuracies": 1.0, "rewards/chosen": 1.2481234073638916, "rewards/margins": 0.7045597434043884, "rewards/rejected": 0.5435636639595032, "step": 4652 }, { "epoch": 2.51, "learning_rate": 6.8425914978211485e-09, "logits/chosen": -2.053840398788452, "logits/rejected": -2.2750375270843506, "logps/chosen": -1.0671054124832153, "logps/rejected": -2.3205230236053467, "loss": 0.6926, "rewards/accuracies": 1.0, "rewards/chosen": 1.036731481552124, "rewards/margins": 0.0011595487594604492, "rewards/rejected": 1.0355719327926636, "step": 4653 }, { "epoch": 2.51, "learning_rate": 6.827896781571507e-09, "logits/chosen": -2.0454745292663574, "logits/rejected": -2.2475786209106445, "logps/chosen": -9.795246124267578, "logps/rejected": -8.767814636230469, "loss": 0.7507, "rewards/accuracies": 0.0, "rewards/chosen": 0.7123028039932251, "rewards/margins": -0.11195904016494751, "rewards/rejected": 0.8242618441581726, "step": 4654 }, { "epoch": 2.51, "learning_rate": 6.813216704612817e-09, "logits/chosen": -1.965679407119751, "logits/rejected": -2.272799491882324, "logps/chosen": -1.0974059104919434, "logps/rejected": -1.029001235961914, "loss": 0.6899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9397937655448914, "rewards/margins": 0.00658339262008667, "rewards/rejected": 0.9332103729248047, "step": 4655 }, { "epoch": 2.51, "learning_rate": 6.798551271922931e-09, "logits/chosen": -2.0989556312561035, "logits/rejected": -2.103875160217285, "logps/chosen": -5.756021976470947, "logps/rejected": -9.646526336669922, "loss": 0.2509, "rewards/accuracies": 1.0, "rewards/chosen": 1.6099035739898682, "rewards/margins": 1.254713535308838, "rewards/rejected": 0.35519009828567505, "step": 4656 }, { "epoch": 2.51, "learning_rate": 6.783900488474792e-09, "logits/chosen": -2.081990957260132, "logits/rejected": -2.080246925354004, "logps/chosen": -0.6004334688186646, "logps/rejected": -4.776467800140381, "loss": 0.4414, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252164959907532, "rewards/margins": 0.5891079902648926, "rewards/rejected": 0.336108535528183, "step": 4657 }, { "epoch": 2.51, "learning_rate": 6.769264359236354e-09, "logits/chosen": -2.060452461242676, "logits/rejected": -2.253084421157837, "logps/chosen": -4.067926406860352, "logps/rejected": -0.9005274772644043, "loss": 0.8244, "rewards/accuracies": 0.0, "rewards/chosen": 0.8403436541557312, "rewards/margins": -0.24717313051223755, "rewards/rejected": 1.0875167846679688, "step": 4658 }, { "epoch": 2.51, "learning_rate": 6.754642889170603e-09, "logits/chosen": -1.945365309715271, "logits/rejected": -1.9546440839767456, "logps/chosen": -1.429202675819397, "logps/rejected": -3.931497097015381, "loss": 0.4662, "rewards/accuracies": 1.0, "rewards/chosen": 0.9375646710395813, "rewards/margins": 0.5211210250854492, "rewards/rejected": 0.41644367575645447, "step": 4659 }, { "epoch": 2.51, "learning_rate": 6.740036083235567e-09, "logits/chosen": -2.141845226287842, "logits/rejected": -2.1258862018585205, "logps/chosen": -6.966137886047363, "logps/rejected": -4.492319107055664, "loss": 0.3983, "rewards/accuracies": 1.0, "rewards/chosen": 1.3479201793670654, "rewards/margins": 0.7147002816200256, "rewards/rejected": 0.6332198977470398, "step": 4660 }, { "epoch": 2.51, "learning_rate": 6.725443946384263e-09, "logits/chosen": -1.960748553276062, "logits/rejected": -2.2133994102478027, "logps/chosen": -0.6786371469497681, "logps/rejected": -0.8298136591911316, "loss": 0.6729, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683733940124512, "rewards/margins": 0.040881335735321045, "rewards/rejected": 0.8274920582771301, "step": 4661 }, { "epoch": 2.51, "learning_rate": 6.71086648356477e-09, "logits/chosen": -2.2161214351654053, "logits/rejected": -2.320805311203003, "logps/chosen": -16.845596313476562, "logps/rejected": -8.646270751953125, "loss": 0.5762, "rewards/accuracies": 1.0, "rewards/chosen": 1.181655764579773, "rewards/margins": 0.24949920177459717, "rewards/rejected": 0.9321565628051758, "step": 4662 }, { "epoch": 2.52, "learning_rate": 6.696303699720196e-09, "logits/chosen": -2.084547281265259, "logits/rejected": -2.338956117630005, "logps/chosen": -3.0961313247680664, "logps/rejected": -3.1203813552856445, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.976349949836731, "rewards/margins": 0.011629879474639893, "rewards/rejected": 0.9647200703620911, "step": 4663 }, { "epoch": 2.52, "learning_rate": 6.68175559978863e-09, "logits/chosen": -2.0546891689300537, "logits/rejected": -2.301678419113159, "logps/chosen": -1.2440611124038696, "logps/rejected": -4.064290523529053, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 1.0484250783920288, "rewards/margins": 0.03973591327667236, "rewards/rejected": 1.0086891651153564, "step": 4664 }, { "epoch": 2.52, "learning_rate": 6.667222188703226e-09, "logits/chosen": -2.0757930278778076, "logits/rejected": -2.079049825668335, "logps/chosen": -1.1033438444137573, "logps/rejected": -1.8235293626785278, "loss": 0.4891, "rewards/accuracies": 1.0, "rewards/chosen": 1.1845866441726685, "rewards/margins": 0.4606230854988098, "rewards/rejected": 0.7239635586738586, "step": 4665 }, { "epoch": 2.52, "learning_rate": 6.65270347139214e-09, "logits/chosen": -2.0372161865234375, "logits/rejected": -2.0458850860595703, "logps/chosen": -1.4957822561264038, "logps/rejected": -2.6858437061309814, "loss": 0.4281, "rewards/accuracies": 1.0, "rewards/chosen": 1.2269726991653442, "rewards/margins": 0.6266282200813293, "rewards/rejected": 0.6003444790840149, "step": 4666 }, { "epoch": 2.52, "learning_rate": 6.638199452778536e-09, "logits/chosen": -2.0404484272003174, "logits/rejected": -2.0497512817382812, "logps/chosen": -2.0710079669952393, "logps/rejected": -2.6890738010406494, "loss": 0.4372, "rewards/accuracies": 1.0, "rewards/chosen": 1.2603299617767334, "rewards/margins": 0.6007041335105896, "rewards/rejected": 0.6596258282661438, "step": 4667 }, { "epoch": 2.52, "learning_rate": 6.623710137780608e-09, "logits/chosen": -1.8609018325805664, "logits/rejected": -2.276702880859375, "logps/chosen": -0.7136086225509644, "logps/rejected": -0.9274275302886963, "loss": 0.6621, "rewards/accuracies": 1.0, "rewards/chosen": 0.7911964654922485, "rewards/margins": 0.0630611777305603, "rewards/rejected": 0.7281352877616882, "step": 4668 }, { "epoch": 2.52, "learning_rate": 6.609235531311552e-09, "logits/chosen": -1.9460222721099854, "logits/rejected": -2.301607847213745, "logps/chosen": -0.4023568332195282, "logps/rejected": -0.2961835563182831, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.8278502821922302, "rewards/margins": 0.015439510345458984, "rewards/rejected": 0.8124107718467712, "step": 4669 }, { "epoch": 2.52, "learning_rate": 6.5947756382796036e-09, "logits/chosen": -2.2117342948913574, "logits/rejected": -2.179830312728882, "logps/chosen": -13.260019302368164, "logps/rejected": -3.039228677749634, "loss": 0.4819, "rewards/accuracies": 1.0, "rewards/chosen": 1.217004656791687, "rewards/margins": 0.4795374274253845, "rewards/rejected": 0.7374672293663025, "step": 4670 }, { "epoch": 2.52, "learning_rate": 6.580330463587963e-09, "logits/chosen": -1.9614574909210205, "logits/rejected": -2.2541332244873047, "logps/chosen": -1.3654348850250244, "logps/rejected": -1.2728283405303955, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8854157328605652, "rewards/margins": 0.0027970075607299805, "rewards/rejected": 0.8826187252998352, "step": 4671 }, { "epoch": 2.52, "learning_rate": 6.565900012134879e-09, "logits/chosen": -2.009493112564087, "logits/rejected": -2.31846022605896, "logps/chosen": -0.7052434682846069, "logps/rejected": -0.775165319442749, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.8736468553543091, "rewards/margins": 0.0015668869018554688, "rewards/rejected": 0.8720799684524536, "step": 4672 }, { "epoch": 2.52, "learning_rate": 6.5514842888135964e-09, "logits/chosen": -1.980886459350586, "logits/rejected": -2.2536637783050537, "logps/chosen": -0.38091418147087097, "logps/rejected": -0.4688517153263092, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 1.0101258754730225, "rewards/margins": 0.025929272174835205, "rewards/rejected": 0.9841966032981873, "step": 4673 }, { "epoch": 2.52, "learning_rate": 6.537083298512352e-09, "logits/chosen": -2.047041416168213, "logits/rejected": -2.231628179550171, "logps/chosen": -1.5369573831558228, "logps/rejected": -0.9330289363861084, "loss": 0.5945, "rewards/accuracies": 1.0, "rewards/chosen": 1.1158045530319214, "rewards/margins": 0.20807665586471558, "rewards/rejected": 0.9077278971672058, "step": 4674 }, { "epoch": 2.52, "learning_rate": 6.522697046114406e-09, "logits/chosen": -2.0264809131622314, "logits/rejected": -2.3015542030334473, "logps/chosen": -1.445532202720642, "logps/rejected": -1.556049108505249, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.8881298899650574, "rewards/margins": 0.006018340587615967, "rewards/rejected": 0.8821115493774414, "step": 4675 }, { "epoch": 2.52, "learning_rate": 6.508325536498022e-09, "logits/chosen": -2.0799617767333984, "logits/rejected": -2.0823802947998047, "logps/chosen": -2.5988388061523438, "logps/rejected": -2.800527572631836, "loss": 0.6169, "rewards/accuracies": 1.0, "rewards/chosen": 1.0067275762557983, "rewards/margins": 0.15881794691085815, "rewards/rejected": 0.8479096293449402, "step": 4676 }, { "epoch": 2.52, "learning_rate": 6.49396877453644e-09, "logits/chosen": -2.1223034858703613, "logits/rejected": -2.127617835998535, "logps/chosen": -11.364970207214355, "logps/rejected": -7.633938312530518, "loss": 0.1983, "rewards/accuracies": 1.0, "rewards/chosen": 2.2641258239746094, "rewards/margins": 1.5171934366226196, "rewards/rejected": 0.7469323873519897, "step": 4677 }, { "epoch": 2.52, "learning_rate": 6.479626765097917e-09, "logits/chosen": -2.0521464347839355, "logits/rejected": -2.282985210418701, "logps/chosen": -1.3633136749267578, "logps/rejected": -1.9682868719100952, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.8787302374839783, "rewards/margins": -0.009830355644226074, "rewards/rejected": 0.8885605931282043, "step": 4678 }, { "epoch": 2.52, "learning_rate": 6.465299513045719e-09, "logits/chosen": -2.0429904460906982, "logits/rejected": -2.04291033744812, "logps/chosen": -0.35792094469070435, "logps/rejected": -4.703948020935059, "loss": 0.4841, "rewards/accuracies": 1.0, "rewards/chosen": 0.9339908957481384, "rewards/margins": 0.47363942861557007, "rewards/rejected": 0.46035146713256836, "step": 4679 }, { "epoch": 2.52, "learning_rate": 6.450987023238092e-09, "logits/chosen": -2.019700050354004, "logits/rejected": -2.184021234512329, "logps/chosen": -0.3148699998855591, "logps/rejected": -0.3129745125770569, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 0.8247337341308594, "rewards/margins": 0.01706629991531372, "rewards/rejected": 0.8076674342155457, "step": 4680 }, { "epoch": 2.52, "learning_rate": 6.436689300528264e-09, "logits/chosen": -2.0989761352539062, "logits/rejected": -2.100489377975464, "logps/chosen": -4.275332927703857, "logps/rejected": -2.7436764240264893, "loss": 0.233, "rewards/accuracies": 1.0, "rewards/chosen": 1.9028934240341187, "rewards/margins": 1.3377916812896729, "rewards/rejected": 0.5651018023490906, "step": 4681 }, { "epoch": 2.53, "learning_rate": 6.422406349764481e-09, "logits/chosen": -2.182814598083496, "logits/rejected": -2.180784225463867, "logps/chosen": -3.3449151515960693, "logps/rejected": -4.77552604675293, "loss": 0.2343, "rewards/accuracies": 1.0, "rewards/chosen": 1.749226450920105, "rewards/margins": 1.331840991973877, "rewards/rejected": 0.41738539934158325, "step": 4682 }, { "epoch": 2.53, "learning_rate": 6.408138175789973e-09, "logits/chosen": -2.0856070518493652, "logits/rejected": -2.2840569019317627, "logps/chosen": -2.7585818767547607, "logps/rejected": -2.3984696865081787, "loss": 0.7093, "rewards/accuracies": 0.0, "rewards/chosen": 0.8432334065437317, "rewards/margins": -0.03203022480010986, "rewards/rejected": 0.8752636313438416, "step": 4683 }, { "epoch": 2.53, "learning_rate": 6.393884783442949e-09, "logits/chosen": -2.1376352310180664, "logits/rejected": -2.132786750793457, "logps/chosen": -7.025874137878418, "logps/rejected": -3.775570869445801, "loss": 0.3206, "rewards/accuracies": 1.0, "rewards/chosen": 1.5698494911193848, "rewards/margins": 0.9731606245040894, "rewards/rejected": 0.5966888666152954, "step": 4684 }, { "epoch": 2.53, "learning_rate": 6.379646177556608e-09, "logits/chosen": -2.1192259788513184, "logits/rejected": -2.265392780303955, "logps/chosen": -1.5032455921173096, "logps/rejected": -4.919997692108154, "loss": 0.619, "rewards/accuracies": 1.0, "rewards/chosen": 0.9130719304084778, "rewards/margins": 0.15433204174041748, "rewards/rejected": 0.7587398886680603, "step": 4685 }, { "epoch": 2.53, "learning_rate": 6.365422362959161e-09, "logits/chosen": -2.137554407119751, "logits/rejected": -1.9420775175094604, "logps/chosen": -36.79024124145508, "logps/rejected": -3.4705960750579834, "loss": 0.1696, "rewards/accuracies": 1.0, "rewards/chosen": 2.254655122756958, "rewards/margins": 1.688581943511963, "rewards/rejected": 0.5660732388496399, "step": 4686 }, { "epoch": 2.53, "learning_rate": 6.351213344473755e-09, "logits/chosen": -2.0422098636627197, "logits/rejected": -2.2794952392578125, "logps/chosen": -0.497187077999115, "logps/rejected": -0.5202823877334595, "loss": 0.6913, "rewards/accuracies": 1.0, "rewards/chosen": 0.862029492855072, "rewards/margins": 0.003776371479034424, "rewards/rejected": 0.8582531213760376, "step": 4687 }, { "epoch": 2.53, "learning_rate": 6.337019126918558e-09, "logits/chosen": -2.1772429943084717, "logits/rejected": -2.173654317855835, "logps/chosen": -3.1873326301574707, "logps/rejected": -11.345993995666504, "loss": 0.351, "rewards/accuracies": 1.0, "rewards/chosen": 1.2793060541152954, "rewards/margins": 0.8662140965461731, "rewards/rejected": 0.4130919575691223, "step": 4688 }, { "epoch": 2.53, "learning_rate": 6.322839715106709e-09, "logits/chosen": -1.9304598569869995, "logits/rejected": -2.2889764308929443, "logps/chosen": -0.1767803430557251, "logps/rejected": -0.18991811573505402, "loss": 0.6712, "rewards/accuracies": 1.0, "rewards/chosen": 0.8856593370437622, "rewards/margins": 0.044320881366729736, "rewards/rejected": 0.8413384556770325, "step": 4689 }, { "epoch": 2.53, "learning_rate": 6.308675113846324e-09, "logits/chosen": -2.067798376083374, "logits/rejected": -2.065265417098999, "logps/chosen": -1.2325806617736816, "logps/rejected": -5.498435020446777, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 1.1072336435317993, "rewards/margins": 0.6369724273681641, "rewards/rejected": 0.47026118636131287, "step": 4690 }, { "epoch": 2.53, "learning_rate": 6.2945253279405145e-09, "logits/chosen": -2.124483823776245, "logits/rejected": -2.1043925285339355, "logps/chosen": -11.836220741271973, "logps/rejected": -5.246991157531738, "loss": 0.501, "rewards/accuracies": 1.0, "rewards/chosen": 1.1329375505447388, "rewards/margins": 0.4301857352256775, "rewards/rejected": 0.7027518153190613, "step": 4691 }, { "epoch": 2.53, "learning_rate": 6.280390362187315e-09, "logits/chosen": -2.0487749576568604, "logits/rejected": -2.2713217735290527, "logps/chosen": -0.37507715821266174, "logps/rejected": -0.3465314507484436, "loss": 0.683, "rewards/accuracies": 1.0, "rewards/chosen": 0.7989991307258606, "rewards/margins": 0.02041316032409668, "rewards/rejected": 0.7785859704017639, "step": 4692 }, { "epoch": 2.53, "learning_rate": 6.26627022137981e-09, "logits/chosen": -2.0867960453033447, "logits/rejected": -2.287393093109131, "logps/chosen": -1.2966192960739136, "logps/rejected": -1.2913694381713867, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.7079905867576599, "rewards/margins": 0.014254570007324219, "rewards/rejected": 0.6937360167503357, "step": 4693 }, { "epoch": 2.53, "learning_rate": 6.2521649103059904e-09, "logits/chosen": -2.0732898712158203, "logits/rejected": -2.3284912109375, "logps/chosen": -0.16321542859077454, "logps/rejected": -0.17371441423892975, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.854904294013977, "rewards/margins": 0.015257656574249268, "rewards/rejected": 0.8396466374397278, "step": 4694 }, { "epoch": 2.53, "learning_rate": 6.238074433748858e-09, "logits/chosen": -2.1278626918792725, "logits/rejected": -2.242574691772461, "logps/chosen": -6.098247528076172, "logps/rejected": -0.707596480846405, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": 1.058005690574646, "rewards/margins": 0.20207685232162476, "rewards/rejected": 0.8559288382530212, "step": 4695 }, { "epoch": 2.53, "learning_rate": 6.2239987964863796e-09, "logits/chosen": -2.13055682182312, "logits/rejected": -2.3266427516937256, "logps/chosen": -1.5040814876556396, "logps/rejected": -1.4773000478744507, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 1.00763738155365, "rewards/margins": 0.018388986587524414, "rewards/rejected": 0.9892483949661255, "step": 4696 }, { "epoch": 2.53, "learning_rate": 6.209938003291471e-09, "logits/chosen": -2.061516046524048, "logits/rejected": -2.1482107639312744, "logps/chosen": -2.9386818408966064, "logps/rejected": -19.287090301513672, "loss": 0.4008, "rewards/accuracies": 1.0, "rewards/chosen": 1.3288594484329224, "rewards/margins": 0.7073453068733215, "rewards/rejected": 0.6215141415596008, "step": 4697 }, { "epoch": 2.53, "learning_rate": 6.195892058932029e-09, "logits/chosen": -1.9547594785690308, "logits/rejected": -2.2650668621063232, "logps/chosen": -0.26394620537757874, "logps/rejected": -0.29476580023765564, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9163300395011902, "rewards/margins": 0.010417461395263672, "rewards/rejected": 0.9059125781059265, "step": 4698 }, { "epoch": 2.53, "learning_rate": 6.18186096817091e-09, "logits/chosen": -1.941691517829895, "logits/rejected": -2.3091821670532227, "logps/chosen": -2.7907843589782715, "logps/rejected": -0.8787985444068909, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 0.9071311950683594, "rewards/margins": 0.03699147701263428, "rewards/rejected": 0.8701397180557251, "step": 4699 }, { "epoch": 2.54, "learning_rate": 6.167844735765948e-09, "logits/chosen": -2.0916237831115723, "logits/rejected": -2.3347666263580322, "logps/chosen": -0.7725374698638916, "logps/rejected": -6.66956901550293, "loss": 0.6035, "rewards/accuracies": 1.0, "rewards/chosen": 1.0638113021850586, "rewards/margins": 0.1881309151649475, "rewards/rejected": 0.8756803870201111, "step": 4700 }, { "epoch": 2.54, "learning_rate": 6.153843366469935e-09, "logits/chosen": -2.0739548206329346, "logits/rejected": -2.265378713607788, "logps/chosen": -9.667485237121582, "logps/rejected": -5.43681001663208, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.070784330368042, "rewards/margins": 0.023059606552124023, "rewards/rejected": 1.047724723815918, "step": 4701 }, { "epoch": 2.54, "learning_rate": 6.13985686503059e-09, "logits/chosen": -2.1011035442352295, "logits/rejected": -2.108877182006836, "logps/chosen": -2.699314594268799, "logps/rejected": -2.936912775039673, "loss": 0.4692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0143603086471558, "rewards/margins": 0.5129789113998413, "rewards/rejected": 0.5013813972473145, "step": 4702 }, { "epoch": 2.54, "learning_rate": 6.125885236190642e-09, "logits/chosen": -2.104398012161255, "logits/rejected": -2.3414087295532227, "logps/chosen": -2.2049503326416016, "logps/rejected": -2.2005221843719482, "loss": 0.6694, "rewards/accuracies": 1.0, "rewards/chosen": 0.9981585741043091, "rewards/margins": 0.04813206195831299, "rewards/rejected": 0.9500265121459961, "step": 4703 }, { "epoch": 2.54, "learning_rate": 6.111928484687723e-09, "logits/chosen": -2.0377514362335205, "logits/rejected": -2.225325345993042, "logps/chosen": -0.7152813673019409, "logps/rejected": -0.6294382810592651, "loss": 0.6842, "rewards/accuracies": 1.0, "rewards/chosen": 0.8915378451347351, "rewards/margins": 0.01788574457168579, "rewards/rejected": 0.8736521005630493, "step": 4704 }, { "epoch": 2.54, "learning_rate": 6.097986615254464e-09, "logits/chosen": -2.04327392578125, "logits/rejected": -2.099170684814453, "logps/chosen": -1.4541804790496826, "logps/rejected": -8.091058731079102, "loss": 0.4405, "rewards/accuracies": 1.0, "rewards/chosen": 1.5117465257644653, "rewards/margins": 0.5915703773498535, "rewards/rejected": 0.9201761484146118, "step": 4705 }, { "epoch": 2.54, "learning_rate": 6.084059632618443e-09, "logits/chosen": -2.2403876781463623, "logits/rejected": -2.154144763946533, "logps/chosen": -20.77063751220703, "logps/rejected": -3.830547571182251, "loss": 0.2389, "rewards/accuracies": 1.0, "rewards/chosen": 1.9783363342285156, "rewards/margins": 1.3099853992462158, "rewards/rejected": 0.6683509945869446, "step": 4706 }, { "epoch": 2.54, "learning_rate": 6.070147541502152e-09, "logits/chosen": -2.1238014698028564, "logits/rejected": -2.120997667312622, "logps/chosen": -3.929837226867676, "logps/rejected": -1.660485029220581, "loss": 0.373, "rewards/accuracies": 1.0, "rewards/chosen": 1.6195179224014282, "rewards/margins": 0.7938100099563599, "rewards/rejected": 0.8257079124450684, "step": 4707 }, { "epoch": 2.54, "learning_rate": 6.056250346623098e-09, "logits/chosen": -2.151935577392578, "logits/rejected": -2.1635379791259766, "logps/chosen": -2.371831178665161, "logps/rejected": -4.391035079956055, "loss": 0.4571, "rewards/accuracies": 1.0, "rewards/chosen": 1.0279730558395386, "rewards/margins": 0.545513391494751, "rewards/rejected": 0.4824596345424652, "step": 4708 }, { "epoch": 2.54, "learning_rate": 6.042368052693675e-09, "logits/chosen": -2.10921049118042, "logits/rejected": -2.0860040187835693, "logps/chosen": -12.921573638916016, "logps/rejected": -2.3059723377227783, "loss": 0.3887, "rewards/accuracies": 1.0, "rewards/chosen": 1.4237898588180542, "rewards/margins": 0.7444561719894409, "rewards/rejected": 0.6793336868286133, "step": 4709 }, { "epoch": 2.54, "learning_rate": 6.02850066442126e-09, "logits/chosen": -2.078840732574463, "logits/rejected": -2.0746078491210938, "logps/chosen": -0.15411335229873657, "logps/rejected": -12.218157768249512, "loss": 0.4846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8758959770202637, "rewards/margins": 0.4723329544067383, "rewards/rejected": 0.4035630226135254, "step": 4710 }, { "epoch": 2.54, "learning_rate": 6.01464818650817e-09, "logits/chosen": -2.011889934539795, "logits/rejected": -2.2165958881378174, "logps/chosen": -1.5413110256195068, "logps/rejected": -1.6129035949707031, "loss": 0.6859, "rewards/accuracies": 1.0, "rewards/chosen": 0.982456386089325, "rewards/margins": 0.014459192752838135, "rewards/rejected": 0.9679971933364868, "step": 4711 }, { "epoch": 2.54, "learning_rate": 6.0008106236516475e-09, "logits/chosen": -2.0208399295806885, "logits/rejected": -2.022057294845581, "logps/chosen": -1.1105973720550537, "logps/rejected": -6.5257391929626465, "loss": 0.469, "rewards/accuracies": 1.0, "rewards/chosen": 0.9349784851074219, "rewards/margins": 0.5134124159812927, "rewards/rejected": 0.42156606912612915, "step": 4712 }, { "epoch": 2.54, "learning_rate": 5.9869879805439e-09, "logits/chosen": -1.967512607574463, "logits/rejected": -1.9743596315383911, "logps/chosen": -3.1697888374328613, "logps/rejected": -5.056799411773682, "loss": 0.431, "rewards/accuracies": 1.0, "rewards/chosen": 1.1758610010147095, "rewards/margins": 0.6183519959449768, "rewards/rejected": 0.5575090050697327, "step": 4713 }, { "epoch": 2.54, "learning_rate": 5.9731802618720795e-09, "logits/chosen": -2.0663528442382812, "logits/rejected": -2.329371213912964, "logps/chosen": -0.8732983469963074, "logps/rejected": -1.0769288539886475, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 0.7441014051437378, "rewards/margins": 0.0072484612464904785, "rewards/rejected": 0.7368529438972473, "step": 4714 }, { "epoch": 2.54, "learning_rate": 5.9593874723182445e-09, "logits/chosen": -2.0559439659118652, "logits/rejected": -2.261873483657837, "logps/chosen": -0.35548606514930725, "logps/rejected": -0.4090408384799957, "loss": 0.6917, "rewards/accuracies": 1.0, "rewards/chosen": 0.8249310851097107, "rewards/margins": 0.0028191804885864258, "rewards/rejected": 0.8221119046211243, "step": 4715 }, { "epoch": 2.54, "learning_rate": 5.9456096165594275e-09, "logits/chosen": -2.138176202774048, "logits/rejected": -2.2632341384887695, "logps/chosen": -6.803267002105713, "logps/rejected": -7.091463088989258, "loss": 0.662, "rewards/accuracies": 1.0, "rewards/chosen": 1.2176822423934937, "rewards/margins": 0.06332123279571533, "rewards/rejected": 1.1543610095977783, "step": 4716 }, { "epoch": 2.54, "learning_rate": 5.931846699267556e-09, "logits/chosen": -2.126446008682251, "logits/rejected": -2.2910428047180176, "logps/chosen": -3.6254982948303223, "logps/rejected": -3.3372175693511963, "loss": 0.7067, "rewards/accuracies": 0.0, "rewards/chosen": 0.7808506488800049, "rewards/margins": -0.026938796043395996, "rewards/rejected": 0.8077894449234009, "step": 4717 }, { "epoch": 2.54, "learning_rate": 5.918098725109555e-09, "logits/chosen": -2.1293232440948486, "logits/rejected": -2.237271547317505, "logps/chosen": -0.8721972703933716, "logps/rejected": -1.9809848070144653, "loss": 0.5972, "rewards/accuracies": 1.0, "rewards/chosen": 0.9829248785972595, "rewards/margins": 0.20199930667877197, "rewards/rejected": 0.7809255719184875, "step": 4718 }, { "epoch": 2.55, "learning_rate": 5.904365698747215e-09, "logits/chosen": -2.111607313156128, "logits/rejected": -2.354285478591919, "logps/chosen": -1.9894684553146362, "logps/rejected": -2.1247551441192627, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.6975094676017761, "rewards/margins": 0.01667696237564087, "rewards/rejected": 0.6808325052261353, "step": 4719 }, { "epoch": 2.55, "learning_rate": 5.890647624837303e-09, "logits/chosen": -2.213890790939331, "logits/rejected": -2.2746529579162598, "logps/chosen": -1.03278648853302, "logps/rejected": -1.1644151210784912, "loss": 0.674, "rewards/accuracies": 1.0, "rewards/chosen": 0.9210116267204285, "rewards/margins": 0.03860902786254883, "rewards/rejected": 0.8824025988578796, "step": 4720 }, { "epoch": 2.55, "learning_rate": 5.876944508031506e-09, "logits/chosen": -2.0868961811065674, "logits/rejected": -2.086467981338501, "logps/chosen": -1.1173992156982422, "logps/rejected": -1.6365548372268677, "loss": 0.6496, "rewards/accuracies": 1.0, "rewards/chosen": 0.8583259582519531, "rewards/margins": 0.0891297459602356, "rewards/rejected": 0.7691962122917175, "step": 4721 }, { "epoch": 2.55, "learning_rate": 5.863256352976414e-09, "logits/chosen": -2.0180115699768066, "logits/rejected": -2.2369637489318848, "logps/chosen": -0.49007269740104675, "logps/rejected": -0.49154508113861084, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 1.0028395652770996, "rewards/margins": 0.049420833587646484, "rewards/rejected": 0.9534187316894531, "step": 4722 }, { "epoch": 2.55, "learning_rate": 5.849583164313576e-09, "logits/chosen": -1.9628480672836304, "logits/rejected": -2.3180713653564453, "logps/chosen": -0.6618430018424988, "logps/rejected": -11.441118240356445, "loss": 0.7692, "rewards/accuracies": 0.0, "rewards/chosen": 0.8862605094909668, "rewards/margins": -0.14667177200317383, "rewards/rejected": 1.0329322814941406, "step": 4723 }, { "epoch": 2.55, "learning_rate": 5.835924946679461e-09, "logits/chosen": -2.0271713733673096, "logits/rejected": -2.2594993114471436, "logps/chosen": -4.824041366577148, "logps/rejected": -4.799676895141602, "loss": 0.6991, "rewards/accuracies": 0.0, "rewards/chosen": 1.0015548467636108, "rewards/margins": -0.011771321296691895, "rewards/rejected": 1.0133261680603027, "step": 4724 }, { "epoch": 2.55, "learning_rate": 5.822281704705428e-09, "logits/chosen": -2.0218889713287354, "logits/rejected": -2.0206265449523926, "logps/chosen": -0.5488616824150085, "logps/rejected": -2.8822972774505615, "loss": 0.5182, "rewards/accuracies": 1.0, "rewards/chosen": 1.0385483503341675, "rewards/margins": 0.3871392011642456, "rewards/rejected": 0.6514091491699219, "step": 4725 }, { "epoch": 2.55, "learning_rate": 5.808653443017814e-09, "logits/chosen": -2.1207029819488525, "logits/rejected": -2.3852245807647705, "logps/chosen": -10.567302703857422, "logps/rejected": -8.667634963989258, "loss": 0.7428, "rewards/accuracies": 0.0, "rewards/chosen": 0.7011122107505798, "rewards/margins": -0.0969855785369873, "rewards/rejected": 0.7980977892875671, "step": 4726 }, { "epoch": 2.55, "learning_rate": 5.795040166237814e-09, "logits/chosen": -2.029675245285034, "logits/rejected": -2.2424020767211914, "logps/chosen": -0.5421915650367737, "logps/rejected": -0.6690626740455627, "loss": 0.6731, "rewards/accuracies": 1.0, "rewards/chosen": 0.7834159135818481, "rewards/margins": 0.04042142629623413, "rewards/rejected": 0.742994487285614, "step": 4727 }, { "epoch": 2.55, "learning_rate": 5.781441878981596e-09, "logits/chosen": -2.094421863555908, "logits/rejected": -2.095196008682251, "logps/chosen": -2.179063320159912, "logps/rejected": -5.76391077041626, "loss": 0.3018, "rewards/accuracies": 1.0, "rewards/chosen": 1.554129958152771, "rewards/margins": 1.0432283878326416, "rewards/rejected": 0.5109016299247742, "step": 4728 }, { "epoch": 2.55, "learning_rate": 5.76785858586023e-09, "logits/chosen": -2.057399272918701, "logits/rejected": -2.2786202430725098, "logps/chosen": -0.6140041947364807, "logps/rejected": -0.7198522090911865, "loss": 0.6827, "rewards/accuracies": 1.0, "rewards/chosen": 1.0490527153015137, "rewards/margins": 0.02109384536743164, "rewards/rejected": 1.027958869934082, "step": 4729 }, { "epoch": 2.55, "learning_rate": 5.7542902914796746e-09, "logits/chosen": -2.2009668350219727, "logits/rejected": -2.277231216430664, "logps/chosen": -3.871354579925537, "logps/rejected": -25.46387481689453, "loss": 0.4176, "rewards/accuracies": 1.0, "rewards/chosen": 1.135693907737732, "rewards/margins": 0.6571576595306396, "rewards/rejected": 0.4785362184047699, "step": 4730 }, { "epoch": 2.55, "learning_rate": 5.740737000440837e-09, "logits/chosen": -2.1542699337005615, "logits/rejected": -2.2976958751678467, "logps/chosen": -1.1073929071426392, "logps/rejected": -8.027202606201172, "loss": 0.5433, "rewards/accuracies": 1.0, "rewards/chosen": 1.16060209274292, "rewards/margins": 0.32615840435028076, "rewards/rejected": 0.8344436883926392, "step": 4731 }, { "epoch": 2.55, "learning_rate": 5.72719871733951e-09, "logits/chosen": -2.091986894607544, "logits/rejected": -2.3002395629882812, "logps/chosen": -0.6080904603004456, "logps/rejected": -0.5908236503601074, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9958303570747375, "rewards/margins": 0.0057506561279296875, "rewards/rejected": 0.9900797009468079, "step": 4732 }, { "epoch": 2.55, "learning_rate": 5.7136754467664086e-09, "logits/chosen": -2.0895895957946777, "logits/rejected": -2.3061516284942627, "logps/chosen": -26.477947235107422, "logps/rejected": -27.866188049316406, "loss": 0.7011, "rewards/accuracies": 0.0, "rewards/chosen": 0.7647785544395447, "rewards/margins": -0.015785574913024902, "rewards/rejected": 0.7805641293525696, "step": 4733 }, { "epoch": 2.55, "learning_rate": 5.700167193307182e-09, "logits/chosen": -2.0600764751434326, "logits/rejected": -2.2406699657440186, "logps/chosen": -0.4878552258014679, "logps/rejected": -0.32387298345565796, "loss": 0.6879, "rewards/accuracies": 1.0, "rewards/chosen": 0.7705258727073669, "rewards/margins": 0.010575830936431885, "rewards/rejected": 0.7599500417709351, "step": 4734 }, { "epoch": 2.55, "learning_rate": 5.686673961542332e-09, "logits/chosen": -2.0217957496643066, "logits/rejected": -2.24409556388855, "logps/chosen": -2.144174098968506, "logps/rejected": -5.862732887268066, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.9240943789482117, "rewards/margins": 0.06625163555145264, "rewards/rejected": 0.857842743396759, "step": 4735 }, { "epoch": 2.55, "learning_rate": 5.673195756047311e-09, "logits/chosen": -2.17572283744812, "logits/rejected": -2.0708863735198975, "logps/chosen": -24.271190643310547, "logps/rejected": -4.185694694519043, "loss": 0.1479, "rewards/accuracies": 1.0, "rewards/chosen": 2.369051694869995, "rewards/margins": 1.8364940881729126, "rewards/rejected": 0.5325576066970825, "step": 4736 }, { "epoch": 2.56, "learning_rate": 5.659732581392468e-09, "logits/chosen": -1.988061785697937, "logits/rejected": -1.9862244129180908, "logps/chosen": -5.461306571960449, "logps/rejected": -2.2589573860168457, "loss": 0.3125, "rewards/accuracies": 1.0, "rewards/chosen": 1.701743721961975, "rewards/margins": 1.0029783248901367, "rewards/rejected": 0.6987654566764832, "step": 4737 }, { "epoch": 2.56, "learning_rate": 5.646284442143046e-09, "logits/chosen": -2.086890697479248, "logits/rejected": -2.260650157928467, "logps/chosen": -0.32249748706817627, "logps/rejected": -0.29514244198799133, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9011818766593933, "rewards/margins": 0.011754512786865234, "rewards/rejected": 0.8894273638725281, "step": 4738 }, { "epoch": 2.56, "learning_rate": 5.632851342859202e-09, "logits/chosen": -2.0549285411834717, "logits/rejected": -2.0567820072174072, "logps/chosen": -1.0926158428192139, "logps/rejected": -2.5532989501953125, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 1.0832279920578003, "rewards/margins": 0.4475756287574768, "rewards/rejected": 0.6356523633003235, "step": 4739 }, { "epoch": 2.56, "learning_rate": 5.619433288095971e-09, "logits/chosen": -2.1244056224823, "logits/rejected": -2.0930256843566895, "logps/chosen": -24.765689849853516, "logps/rejected": -15.874547958374023, "loss": 0.3025, "rewards/accuracies": 1.0, "rewards/chosen": 2.022169828414917, "rewards/margins": 1.040799617767334, "rewards/rejected": 0.9813701510429382, "step": 4740 }, { "epoch": 2.56, "learning_rate": 5.606030282403318e-09, "logits/chosen": -2.124910354614258, "logits/rejected": -2.1328723430633545, "logps/chosen": -1.677168369293213, "logps/rejected": -2.6446173191070557, "loss": 0.4109, "rewards/accuracies": 1.0, "rewards/chosen": 1.4754198789596558, "rewards/margins": 0.6769062876701355, "rewards/rejected": 0.7985135912895203, "step": 4741 }, { "epoch": 2.56, "learning_rate": 5.5926423303260635e-09, "logits/chosen": -2.143826961517334, "logits/rejected": -2.1408493518829346, "logps/chosen": -2.7841005325317383, "logps/rejected": -6.007134437561035, "loss": 0.3086, "rewards/accuracies": 1.0, "rewards/chosen": 1.3329558372497559, "rewards/margins": 1.0176153182983398, "rewards/rejected": 0.315340518951416, "step": 4742 }, { "epoch": 2.56, "learning_rate": 5.579269436403966e-09, "logits/chosen": -2.1124541759490967, "logits/rejected": -2.0947957038879395, "logps/chosen": -15.396879196166992, "logps/rejected": -2.2912254333496094, "loss": 0.3773, "rewards/accuracies": 1.0, "rewards/chosen": 1.2437256574630737, "rewards/margins": 0.7800761461257935, "rewards/rejected": 0.4636495113372803, "step": 4743 }, { "epoch": 2.56, "learning_rate": 5.565911605171664e-09, "logits/chosen": -2.0283827781677246, "logits/rejected": -2.0344784259796143, "logps/chosen": -0.9734973311424255, "logps/rejected": -5.905498504638672, "loss": 0.4159, "rewards/accuracies": 1.0, "rewards/chosen": 0.958795964717865, "rewards/margins": 0.6622582674026489, "rewards/rejected": 0.29653769731521606, "step": 4744 }, { "epoch": 2.56, "learning_rate": 5.552568841158661e-09, "logits/chosen": -2.0448014736175537, "logits/rejected": -2.2850730419158936, "logps/chosen": -0.9651052951812744, "logps/rejected": -1.2451307773590088, "loss": 0.67, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334072232246399, "rewards/margins": 0.0467529296875, "rewards/rejected": 0.7866542935371399, "step": 4745 }, { "epoch": 2.56, "learning_rate": 5.53924114888939e-09, "logits/chosen": -2.1237664222717285, "logits/rejected": -2.1194217205047607, "logps/chosen": -12.762056350708008, "logps/rejected": -4.014566421508789, "loss": 0.4136, "rewards/accuracies": 1.0, "rewards/chosen": 1.1790670156478882, "rewards/margins": 0.6689848303794861, "rewards/rejected": 0.5100821852684021, "step": 4746 }, { "epoch": 2.56, "learning_rate": 5.525928532883156e-09, "logits/chosen": -2.0210049152374268, "logits/rejected": -2.2331759929656982, "logps/chosen": -0.5919508337974548, "logps/rejected": -0.5240490436553955, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 1.0035791397094727, "rewards/margins": 0.005515456199645996, "rewards/rejected": 0.9980636835098267, "step": 4747 }, { "epoch": 2.56, "learning_rate": 5.51263099765415e-09, "logits/chosen": -2.13448166847229, "logits/rejected": -2.142864465713501, "logps/chosen": -1.4104728698730469, "logps/rejected": -4.172728538513184, "loss": 0.3786, "rewards/accuracies": 1.0, "rewards/chosen": 1.3574724197387695, "rewards/margins": 0.7759904861450195, "rewards/rejected": 0.58148193359375, "step": 4748 }, { "epoch": 2.56, "learning_rate": 5.499348547711463e-09, "logits/chosen": -2.075082302093506, "logits/rejected": -2.279252767562866, "logps/chosen": -0.3265261948108673, "logps/rejected": -0.33311954140663147, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.869224488735199, "rewards/margins": 0.011023163795471191, "rewards/rejected": 0.8582013249397278, "step": 4749 }, { "epoch": 2.56, "learning_rate": 5.486081187559044e-09, "logits/chosen": -2.101250410079956, "logits/rejected": -2.110701322555542, "logps/chosen": -16.84339141845703, "logps/rejected": -7.640676975250244, "loss": 0.1796, "rewards/accuracies": 1.0, "rewards/chosen": 2.2182528972625732, "rewards/margins": 1.6257680654525757, "rewards/rejected": 0.5924848318099976, "step": 4750 }, { "epoch": 2.56, "learning_rate": 5.47282892169576e-09, "logits/chosen": -1.9690983295440674, "logits/rejected": -1.923037052154541, "logps/chosen": -12.857442855834961, "logps/rejected": -10.179019927978516, "loss": 0.2635, "rewards/accuracies": 1.0, "rewards/chosen": 1.4278125762939453, "rewards/margins": 1.1989402770996094, "rewards/rejected": 0.22887229919433594, "step": 4751 }, { "epoch": 2.56, "learning_rate": 5.4595917546153166e-09, "logits/chosen": -2.1361072063446045, "logits/rejected": -2.1367552280426025, "logps/chosen": -5.079204559326172, "logps/rejected": -2.4501664638519287, "loss": 0.2597, "rewards/accuracies": 1.0, "rewards/chosen": 1.7860854864120483, "rewards/margins": 1.2154507637023926, "rewards/rejected": 0.570634663105011, "step": 4752 }, { "epoch": 2.56, "learning_rate": 5.446369690806341e-09, "logits/chosen": -2.0681910514831543, "logits/rejected": -2.067819833755493, "logps/chosen": -0.6013835668563843, "logps/rejected": -1.7667999267578125, "loss": 0.6174, "rewards/accuracies": 1.0, "rewards/chosen": 0.9823927283287048, "rewards/margins": 0.15760481357574463, "rewards/rejected": 0.8247879147529602, "step": 4753 }, { "epoch": 2.56, "learning_rate": 5.433162734752328e-09, "logits/chosen": -2.162541151046753, "logits/rejected": -2.3262786865234375, "logps/chosen": -5.372754096984863, "logps/rejected": -3.62060546875, "loss": 0.6319, "rewards/accuracies": 1.0, "rewards/chosen": 0.6968947649002075, "rewards/margins": 0.12650680541992188, "rewards/rejected": 0.5703879594802856, "step": 4754 }, { "epoch": 2.56, "learning_rate": 5.4199708909316245e-09, "logits/chosen": -2.0849692821502686, "logits/rejected": -2.095202684402466, "logps/chosen": -1.530822992324829, "logps/rejected": -2.5663650035858154, "loss": 0.4937, "rewards/accuracies": 1.0, "rewards/chosen": 0.9587044715881348, "rewards/margins": 0.4489259719848633, "rewards/rejected": 0.5097784996032715, "step": 4755 }, { "epoch": 2.57, "learning_rate": 5.4067941638174795e-09, "logits/chosen": -2.035771131515503, "logits/rejected": -2.281798839569092, "logps/chosen": -0.4507788121700287, "logps/rejected": -0.5131381154060364, "loss": 0.6616, "rewards/accuracies": 1.0, "rewards/chosen": 0.9917169809341431, "rewards/margins": 0.06419318914413452, "rewards/rejected": 0.9275237917900085, "step": 4756 }, { "epoch": 2.57, "learning_rate": 5.393632557878014e-09, "logits/chosen": -2.0293266773223877, "logits/rejected": -2.026890993118286, "logps/chosen": -0.8331500291824341, "logps/rejected": -4.0194244384765625, "loss": 0.5099, "rewards/accuracies": 1.0, "rewards/chosen": 0.9469084143638611, "rewards/margins": 0.4077482223510742, "rewards/rejected": 0.5391601920127869, "step": 4757 }, { "epoch": 2.57, "learning_rate": 5.3804860775762076e-09, "logits/chosen": -2.0908310413360596, "logits/rejected": -2.2918343544006348, "logps/chosen": -0.33185428380966187, "logps/rejected": -0.2999576926231384, "loss": 0.6723, "rewards/accuracies": 1.0, "rewards/chosen": 0.8762955069541931, "rewards/margins": 0.04207479953765869, "rewards/rejected": 0.8342207074165344, "step": 4758 }, { "epoch": 2.57, "learning_rate": 5.367354727369938e-09, "logits/chosen": -2.0350894927978516, "logits/rejected": -2.2402122020721436, "logps/chosen": -0.3143934905529022, "logps/rejected": -0.30602124333381653, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.9562167525291443, "rewards/margins": -0.0038849711418151855, "rewards/rejected": 0.9601017236709595, "step": 4759 }, { "epoch": 2.57, "learning_rate": 5.354238511711906e-09, "logits/chosen": -2.0151848793029785, "logits/rejected": -2.0175588130950928, "logps/chosen": -2.3730673789978027, "logps/rejected": -2.554964780807495, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 1.4486627578735352, "rewards/margins": 0.45897775888442993, "rewards/rejected": 0.9896849989891052, "step": 4760 }, { "epoch": 2.57, "learning_rate": 5.341137435049725e-09, "logits/chosen": -2.021954298019409, "logits/rejected": -2.0158779621124268, "logps/chosen": -2.6064586639404297, "logps/rejected": -4.036600112915039, "loss": 0.6012, "rewards/accuracies": 1.0, "rewards/chosen": 1.122330665588379, "rewards/margins": 0.19313526153564453, "rewards/rejected": 0.9291954040527344, "step": 4761 }, { "epoch": 2.57, "learning_rate": 5.328051501825864e-09, "logits/chosen": -2.046581506729126, "logits/rejected": -2.2867636680603027, "logps/chosen": -0.27645742893218994, "logps/rejected": -0.27724477648735046, "loss": 0.6835, "rewards/accuracies": 1.0, "rewards/chosen": 0.8973937034606934, "rewards/margins": 0.019329726696014404, "rewards/rejected": 0.878063976764679, "step": 4762 }, { "epoch": 2.57, "learning_rate": 5.314980716477629e-09, "logits/chosen": -2.0547688007354736, "logits/rejected": -2.0475687980651855, "logps/chosen": -4.068213939666748, "logps/rejected": -3.4235191345214844, "loss": 0.4091, "rewards/accuracies": 1.0, "rewards/chosen": 1.803704857826233, "rewards/margins": 0.6823756694793701, "rewards/rejected": 1.1213291883468628, "step": 4763 }, { "epoch": 2.57, "learning_rate": 5.301925083437242e-09, "logits/chosen": -2.12384295463562, "logits/rejected": -2.1176979541778564, "logps/chosen": -4.665184497833252, "logps/rejected": -5.716794967651367, "loss": 0.3418, "rewards/accuracies": 1.0, "rewards/chosen": 1.5026283264160156, "rewards/margins": 0.8976316452026367, "rewards/rejected": 0.6049966812133789, "step": 4764 }, { "epoch": 2.57, "learning_rate": 5.288884607131722e-09, "logits/chosen": -2.156954765319824, "logits/rejected": -2.2115399837493896, "logps/chosen": -5.591360092163086, "logps/rejected": -7.884986877441406, "loss": 0.5364, "rewards/accuracies": 1.0, "rewards/chosen": 1.2687023878097534, "rewards/margins": 0.34271663427352905, "rewards/rejected": 0.9259857535362244, "step": 4765 }, { "epoch": 2.57, "learning_rate": 5.275859291983004e-09, "logits/chosen": -2.1646180152893066, "logits/rejected": -2.1042256355285645, "logps/chosen": -20.427770614624023, "logps/rejected": -3.7263975143432617, "loss": 0.1782, "rewards/accuracies": 1.0, "rewards/chosen": 2.3588130474090576, "rewards/margins": 1.634284496307373, "rewards/rejected": 0.7245286107063293, "step": 4766 }, { "epoch": 2.57, "learning_rate": 5.262849142407855e-09, "logits/chosen": -2.185196876525879, "logits/rejected": -2.1010499000549316, "logps/chosen": -33.00636672973633, "logps/rejected": -4.88289737701416, "loss": 0.1884, "rewards/accuracies": 1.0, "rewards/chosen": 1.9785175323486328, "rewards/margins": 1.573536992073059, "rewards/rejected": 0.4049805700778961, "step": 4767 }, { "epoch": 2.57, "learning_rate": 5.249854162817907e-09, "logits/chosen": -1.9861935377120972, "logits/rejected": -1.9868136644363403, "logps/chosen": -9.40481948852539, "logps/rejected": -1.6589581966400146, "loss": 0.4676, "rewards/accuracies": 1.0, "rewards/chosen": 1.3546479940414429, "rewards/margins": 0.5173472166061401, "rewards/rejected": 0.8373007774353027, "step": 4768 }, { "epoch": 2.57, "learning_rate": 5.2368743576196525e-09, "logits/chosen": -2.1108145713806152, "logits/rejected": -2.11952805519104, "logps/chosen": -3.978627920150757, "logps/rejected": -7.902923583984375, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 1.5320367813110352, "rewards/margins": 1.0932285785675049, "rewards/rejected": 0.43880826234817505, "step": 4769 }, { "epoch": 2.57, "learning_rate": 5.223909731214427e-09, "logits/chosen": -2.0720880031585693, "logits/rejected": -2.062211513519287, "logps/chosen": -4.269139766693115, "logps/rejected": -3.2965128421783447, "loss": 0.5011, "rewards/accuracies": 1.0, "rewards/chosen": 1.120926856994629, "rewards/margins": 0.42997270822525024, "rewards/rejected": 0.6909541487693787, "step": 4770 }, { "epoch": 2.57, "learning_rate": 5.2109602879984175e-09, "logits/chosen": -2.083693027496338, "logits/rejected": -2.243867874145508, "logps/chosen": -0.7998350262641907, "logps/rejected": -0.7548990845680237, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.8349438905715942, "rewards/margins": 0.005280792713165283, "rewards/rejected": 0.829663097858429, "step": 4771 }, { "epoch": 2.57, "learning_rate": 5.198026032362684e-09, "logits/chosen": -2.0337162017822266, "logits/rejected": -2.2265920639038086, "logps/chosen": -0.4517040252685547, "logps/rejected": -0.4580984115600586, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.9686660766601562, "rewards/margins": 0.022001147270202637, "rewards/rejected": 0.9466649293899536, "step": 4772 }, { "epoch": 2.57, "learning_rate": 5.185106968693109e-09, "logits/chosen": -1.9762091636657715, "logits/rejected": -1.9930812120437622, "logps/chosen": -4.778898239135742, "logps/rejected": -7.45937967300415, "loss": 0.5233, "rewards/accuracies": 1.0, "rewards/chosen": 1.074914574623108, "rewards/margins": 0.37466686964035034, "rewards/rejected": 0.7002477049827576, "step": 4773 }, { "epoch": 2.57, "learning_rate": 5.172203101370448e-09, "logits/chosen": -1.9925659894943237, "logits/rejected": -2.2835090160369873, "logps/chosen": -0.8079354166984558, "logps/rejected": -0.8317470550537109, "loss": 0.6941, "rewards/accuracies": 0.0, "rewards/chosen": 1.0062060356140137, "rewards/margins": -0.001882791519165039, "rewards/rejected": 1.0080888271331787, "step": 4774 }, { "epoch": 2.58, "learning_rate": 5.159314434770273e-09, "logits/chosen": -2.081294298171997, "logits/rejected": -2.0738325119018555, "logps/chosen": -5.965405464172363, "logps/rejected": -4.174851417541504, "loss": 0.4251, "rewards/accuracies": 1.0, "rewards/chosen": 1.2664906978607178, "rewards/margins": 0.6352962851524353, "rewards/rejected": 0.6311944127082825, "step": 4775 }, { "epoch": 2.58, "learning_rate": 5.146440973263028e-09, "logits/chosen": -2.1580426692962646, "logits/rejected": -2.3174240589141846, "logps/chosen": -5.043688774108887, "logps/rejected": -0.8143762350082397, "loss": 0.7441, "rewards/accuracies": 0.0, "rewards/chosen": 0.9129192233085632, "rewards/margins": -0.09950035810470581, "rewards/rejected": 1.012419581413269, "step": 4776 }, { "epoch": 2.58, "learning_rate": 5.1335827212140196e-09, "logits/chosen": -2.1317808628082275, "logits/rejected": -2.137263774871826, "logps/chosen": -2.96850323677063, "logps/rejected": -14.86507797241211, "loss": 0.3283, "rewards/accuracies": 1.0, "rewards/chosen": 1.0915429592132568, "rewards/margins": 0.9451016783714294, "rewards/rejected": 0.1464412659406662, "step": 4777 }, { "epoch": 2.58, "learning_rate": 5.120739682983339e-09, "logits/chosen": -2.064943313598633, "logits/rejected": -2.1096551418304443, "logps/chosen": -3.184837818145752, "logps/rejected": -9.633489608764648, "loss": 0.3528, "rewards/accuracies": 1.0, "rewards/chosen": 1.521404504776001, "rewards/margins": 0.8601192831993103, "rewards/rejected": 0.6612852215766907, "step": 4778 }, { "epoch": 2.58, "learning_rate": 5.107911862925968e-09, "logits/chosen": -2.136328935623169, "logits/rejected": -2.1477553844451904, "logps/chosen": -2.349287509918213, "logps/rejected": -3.3714680671691895, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 1.4295167922973633, "rewards/margins": 0.503989040851593, "rewards/rejected": 0.9255277514457703, "step": 4779 }, { "epoch": 2.58, "learning_rate": 5.0950992653917e-09, "logits/chosen": -2.164093017578125, "logits/rejected": -2.2989771366119385, "logps/chosen": -2.058453321456909, "logps/rejected": -2.0816903114318848, "loss": 0.7017, "rewards/accuracies": 0.0, "rewards/chosen": 0.973895251750946, "rewards/margins": -0.01709580421447754, "rewards/rejected": 0.9909910559654236, "step": 4780 }, { "epoch": 2.58, "learning_rate": 5.0823018947251775e-09, "logits/chosen": -2.0590550899505615, "logits/rejected": -2.0577657222747803, "logps/chosen": -0.8053654432296753, "logps/rejected": -1.8452943563461304, "loss": 0.6025, "rewards/accuracies": 1.0, "rewards/chosen": 1.0103126764297485, "rewards/margins": 0.1904199719429016, "rewards/rejected": 0.8198927044868469, "step": 4781 }, { "epoch": 2.58, "learning_rate": 5.0695197552659e-09, "logits/chosen": -2.1729774475097656, "logits/rejected": -2.1729981899261475, "logps/chosen": -0.6970624923706055, "logps/rejected": -3.9526023864746094, "loss": 0.4421, "rewards/accuracies": 1.0, "rewards/chosen": 1.088118076324463, "rewards/margins": 0.5869412422180176, "rewards/rejected": 0.5011768341064453, "step": 4782 }, { "epoch": 2.58, "learning_rate": 5.0567528513481506e-09, "logits/chosen": -2.070131778717041, "logits/rejected": -2.0805346965789795, "logps/chosen": -1.7803279161453247, "logps/rejected": -2.371013641357422, "loss": 0.4767, "rewards/accuracies": 1.0, "rewards/chosen": 1.1707875728607178, "rewards/margins": 0.4931560158729553, "rewards/rejected": 0.6776315569877625, "step": 4783 }, { "epoch": 2.58, "learning_rate": 5.044001187301089e-09, "logits/chosen": -2.162588119506836, "logits/rejected": -2.3139631748199463, "logps/chosen": -0.48021113872528076, "logps/rejected": -0.48891234397888184, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 0.9907560348510742, "rewards/margins": -0.01640307903289795, "rewards/rejected": 1.0071591138839722, "step": 4784 }, { "epoch": 2.58, "learning_rate": 5.031264767448712e-09, "logits/chosen": -2.132488250732422, "logits/rejected": -2.3456473350524902, "logps/chosen": -0.40383997559547424, "logps/rejected": -0.40370070934295654, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.840700089931488, "rewards/margins": 0.020847320556640625, "rewards/rejected": 0.8198527693748474, "step": 4785 }, { "epoch": 2.58, "learning_rate": 5.018543596109798e-09, "logits/chosen": -2.1120259761810303, "logits/rejected": -2.27449893951416, "logps/chosen": -0.256454199552536, "logps/rejected": -0.2863231301307678, "loss": 0.6803, "rewards/accuracies": 1.0, "rewards/chosen": 0.9997215270996094, "rewards/margins": 0.025820910930633545, "rewards/rejected": 0.9739006161689758, "step": 4786 }, { "epoch": 2.58, "learning_rate": 5.005837677598029e-09, "logits/chosen": -2.15109920501709, "logits/rejected": -2.3093628883361816, "logps/chosen": -1.4046558141708374, "logps/rejected": -3.530728340148926, "loss": 0.7273, "rewards/accuracies": 0.0, "rewards/chosen": 0.8667292594909668, "rewards/margins": -0.0672370195388794, "rewards/rejected": 0.9339662790298462, "step": 4787 }, { "epoch": 2.58, "learning_rate": 4.9931470162218325e-09, "logits/chosen": -2.0375330448150635, "logits/rejected": -2.282494306564331, "logps/chosen": -10.437408447265625, "logps/rejected": -10.826576232910156, "loss": 0.6796, "rewards/accuracies": 1.0, "rewards/chosen": 1.0356485843658447, "rewards/margins": 0.027194857597351074, "rewards/rejected": 1.0084537267684937, "step": 4788 }, { "epoch": 2.58, "learning_rate": 4.980471616284537e-09, "logits/chosen": -1.9892584085464478, "logits/rejected": -2.226193428039551, "logps/chosen": -0.5340466499328613, "logps/rejected": -0.5937196016311646, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.9009637832641602, "rewards/margins": 0.027738869190216064, "rewards/rejected": 0.8732249140739441, "step": 4789 }, { "epoch": 2.58, "learning_rate": 4.967811482084239e-09, "logits/chosen": -2.0507395267486572, "logits/rejected": -2.080056667327881, "logps/chosen": -4.3086066246032715, "logps/rejected": -3.8997395038604736, "loss": 0.6039, "rewards/accuracies": 1.0, "rewards/chosen": 1.1606358289718628, "rewards/margins": 0.18718641996383667, "rewards/rejected": 0.9734494090080261, "step": 4790 }, { "epoch": 2.58, "learning_rate": 4.955166617913886e-09, "logits/chosen": -2.0617692470550537, "logits/rejected": -2.2309558391571045, "logps/chosen": -2.0829076766967773, "logps/rejected": -2.107764959335327, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 0.5828831195831299, "rewards/margins": 0.01969856023788452, "rewards/rejected": 0.5631845593452454, "step": 4791 }, { "epoch": 2.58, "learning_rate": 4.942537028061256e-09, "logits/chosen": -2.0302023887634277, "logits/rejected": -2.030426025390625, "logps/chosen": -1.1656205654144287, "logps/rejected": -2.249025344848633, "loss": 0.5748, "rewards/accuracies": 1.0, "rewards/chosen": 1.1015815734863281, "rewards/margins": 0.25263911485671997, "rewards/rejected": 0.8489424586296082, "step": 4792 }, { "epoch": 2.59, "learning_rate": 4.929922716808915e-09, "logits/chosen": -1.9657164812088013, "logits/rejected": -2.2901625633239746, "logps/chosen": -3.1354475021362305, "logps/rejected": -1.7511730194091797, "loss": 0.7755, "rewards/accuracies": 0.0, "rewards/chosen": 0.6197108626365662, "rewards/margins": -0.15847039222717285, "rewards/rejected": 0.778181254863739, "step": 4793 }, { "epoch": 2.59, "learning_rate": 4.917323688434283e-09, "logits/chosen": -1.9996782541275024, "logits/rejected": -2.238731622695923, "logps/chosen": -0.36636248230934143, "logps/rejected": -0.38241928815841675, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.9925862550735474, "rewards/margins": 0.019008100032806396, "rewards/rejected": 0.973578155040741, "step": 4794 }, { "epoch": 2.59, "learning_rate": 4.904739947209574e-09, "logits/chosen": -1.9894014596939087, "logits/rejected": -1.9948482513427734, "logps/chosen": -5.353055477142334, "logps/rejected": -1.1282085180282593, "loss": 0.2353, "rewards/accuracies": 1.0, "rewards/chosen": 1.8917455673217773, "rewards/margins": 1.326718807220459, "rewards/rejected": 0.5650267004966736, "step": 4795 }, { "epoch": 2.59, "learning_rate": 4.892171497401832e-09, "logits/chosen": -2.035223960876465, "logits/rejected": -2.0302228927612305, "logps/chosen": -5.650778770446777, "logps/rejected": -3.7879998683929443, "loss": 0.3182, "rewards/accuracies": 1.0, "rewards/chosen": 1.497236728668213, "rewards/margins": 0.981627345085144, "rewards/rejected": 0.5156093835830688, "step": 4796 }, { "epoch": 2.59, "learning_rate": 4.8796183432729235e-09, "logits/chosen": -2.0191402435302734, "logits/rejected": -2.001213788986206, "logps/chosen": -9.628596305847168, "logps/rejected": -10.960538864135742, "loss": 0.4842, "rewards/accuracies": 1.0, "rewards/chosen": 1.2323848009109497, "rewards/margins": 0.4734870195388794, "rewards/rejected": 0.7588977813720703, "step": 4797 }, { "epoch": 2.59, "learning_rate": 4.8670804890794894e-09, "logits/chosen": -2.1256394386291504, "logits/rejected": -2.3712263107299805, "logps/chosen": -0.7070306539535522, "logps/rejected": -10.24366569519043, "loss": 0.6667, "rewards/accuracies": 1.0, "rewards/chosen": 1.0471800565719604, "rewards/margins": 0.05365490913391113, "rewards/rejected": 0.9935251474380493, "step": 4798 }, { "epoch": 2.59, "learning_rate": 4.854557939073018e-09, "logits/chosen": -2.1350972652435303, "logits/rejected": -2.0527045726776123, "logps/chosen": -6.524157524108887, "logps/rejected": -1.785959005355835, "loss": 0.3827, "rewards/accuracies": 1.0, "rewards/chosen": 1.6519622802734375, "rewards/margins": 0.7630693316459656, "rewards/rejected": 0.8888929486274719, "step": 4799 }, { "epoch": 2.59, "learning_rate": 4.842050697499817e-09, "logits/chosen": -2.170396089553833, "logits/rejected": -2.1390833854675293, "logps/chosen": -20.895923614501953, "logps/rejected": -4.787165641784668, "loss": 0.2405, "rewards/accuracies": 1.0, "rewards/chosen": 1.793969750404358, "rewards/margins": 1.3024563789367676, "rewards/rejected": 0.49151334166526794, "step": 4800 }, { "epoch": 2.59, "learning_rate": 4.829558768600961e-09, "logits/chosen": -1.9792424440383911, "logits/rejected": -2.2366392612457275, "logps/chosen": -0.2263895720243454, "logps/rejected": -0.26262903213500977, "loss": 0.6765, "rewards/accuracies": 1.0, "rewards/chosen": 0.9383784532546997, "rewards/margins": 0.0336112380027771, "rewards/rejected": 0.9047672152519226, "step": 4801 }, { "epoch": 2.59, "learning_rate": 4.817082156612378e-09, "logits/chosen": -2.161982774734497, "logits/rejected": -2.1951181888580322, "logps/chosen": -2.6658315658569336, "logps/rejected": -11.774625778198242, "loss": 0.4585, "rewards/accuracies": 1.0, "rewards/chosen": 1.2944711446762085, "rewards/margins": 0.5417865514755249, "rewards/rejected": 0.7526845932006836, "step": 4802 }, { "epoch": 2.59, "learning_rate": 4.804620865764753e-09, "logits/chosen": -2.063488721847534, "logits/rejected": -2.3494763374328613, "logps/chosen": -0.2863514721393585, "logps/rejected": -0.32726144790649414, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.9023861885070801, "rewards/margins": 0.017157256603240967, "rewards/rejected": 0.8852289319038391, "step": 4803 }, { "epoch": 2.59, "learning_rate": 4.792174900283624e-09, "logits/chosen": -2.126877784729004, "logits/rejected": -2.339705467224121, "logps/chosen": -0.3108786344528198, "logps/rejected": -0.31668001413345337, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 0.8580240607261658, "rewards/margins": 0.021785080432891846, "rewards/rejected": 0.8362389802932739, "step": 4804 }, { "epoch": 2.59, "learning_rate": 4.779744264389301e-09, "logits/chosen": -2.109470844268799, "logits/rejected": -2.1184797286987305, "logps/chosen": -1.7396001815795898, "logps/rejected": -3.8506550788879395, "loss": 0.4408, "rewards/accuracies": 1.0, "rewards/chosen": 1.3366990089416504, "rewards/margins": 0.5906435251235962, "rewards/rejected": 0.7460554838180542, "step": 4805 }, { "epoch": 2.59, "learning_rate": 4.767328962296918e-09, "logits/chosen": -2.1340227127075195, "logits/rejected": -2.289132833480835, "logps/chosen": -0.6140755414962769, "logps/rejected": -0.6226353645324707, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.0292941331863403, "rewards/margins": 0.022265911102294922, "rewards/rejected": 1.0070282220840454, "step": 4806 }, { "epoch": 2.59, "learning_rate": 4.754928998216395e-09, "logits/chosen": -2.0992307662963867, "logits/rejected": -2.1004233360290527, "logps/chosen": -0.5905674695968628, "logps/rejected": -4.317264080047607, "loss": 0.4444, "rewards/accuracies": 1.0, "rewards/chosen": 1.0886566638946533, "rewards/margins": 0.5806333422660828, "rewards/rejected": 0.5080233216285706, "step": 4807 }, { "epoch": 2.59, "learning_rate": 4.742544376352442e-09, "logits/chosen": -2.1072590351104736, "logits/rejected": -2.1215217113494873, "logps/chosen": -3.6418724060058594, "logps/rejected": -4.8032989501953125, "loss": 0.2702, "rewards/accuracies": 1.0, "rewards/chosen": 1.8639730215072632, "rewards/margins": 1.1704000234603882, "rewards/rejected": 0.693572998046875, "step": 4808 }, { "epoch": 2.59, "learning_rate": 4.730175100904588e-09, "logits/chosen": -2.1045775413513184, "logits/rejected": -2.2445881366729736, "logps/chosen": -11.067403793334961, "logps/rejected": -0.4653196930885315, "loss": 0.8206, "rewards/accuracies": 0.0, "rewards/chosen": 0.6790054440498352, "rewards/margins": -0.24046021699905396, "rewards/rejected": 0.9194656610488892, "step": 4809 }, { "epoch": 2.59, "learning_rate": 4.717821176067149e-09, "logits/chosen": -2.1082072257995605, "logits/rejected": -2.3488452434539795, "logps/chosen": -0.4522472023963928, "logps/rejected": -0.4197665750980377, "loss": 0.6924, "rewards/accuracies": 1.0, "rewards/chosen": 0.9414330720901489, "rewards/margins": 0.0014559626579284668, "rewards/rejected": 0.9399771094322205, "step": 4810 }, { "epoch": 2.59, "learning_rate": 4.7054826060292316e-09, "logits/chosen": -2.234910488128662, "logits/rejected": -2.319474458694458, "logps/chosen": -0.21342742443084717, "logps/rejected": -0.19130894541740417, "loss": 0.6726, "rewards/accuracies": 1.0, "rewards/chosen": 0.8072177767753601, "rewards/margins": 0.04147744178771973, "rewards/rejected": 0.7657403349876404, "step": 4811 }, { "epoch": 2.6, "learning_rate": 4.6931593949747435e-09, "logits/chosen": -2.136448383331299, "logits/rejected": -2.140143632888794, "logps/chosen": -1.0910171270370483, "logps/rejected": -3.84251070022583, "loss": 0.4629, "rewards/accuracies": 1.0, "rewards/chosen": 1.0049962997436523, "rewards/margins": 0.5298347473144531, "rewards/rejected": 0.47516152262687683, "step": 4812 }, { "epoch": 2.6, "learning_rate": 4.6808515470823765e-09, "logits/chosen": -2.0915307998657227, "logits/rejected": -2.0916547775268555, "logps/chosen": -1.7463046312332153, "logps/rejected": -2.064804792404175, "loss": 0.603, "rewards/accuracies": 1.0, "rewards/chosen": 1.077271819114685, "rewards/margins": 0.18924397230148315, "rewards/rejected": 0.8880278468132019, "step": 4813 }, { "epoch": 2.6, "learning_rate": 4.6685590665256066e-09, "logits/chosen": -2.1206259727478027, "logits/rejected": -2.122304677963257, "logps/chosen": -1.3013193607330322, "logps/rejected": -2.1898727416992188, "loss": 0.5055, "rewards/accuracies": 1.0, "rewards/chosen": 1.0824673175811768, "rewards/margins": 0.4189409613609314, "rewards/rejected": 0.6635263562202454, "step": 4814 }, { "epoch": 2.6, "learning_rate": 4.65628195747273e-09, "logits/chosen": -2.0531861782073975, "logits/rejected": -2.0534658432006836, "logps/chosen": -5.526317596435547, "logps/rejected": -4.837326526641846, "loss": 0.4635, "rewards/accuracies": 1.0, "rewards/chosen": 1.166276216506958, "rewards/margins": 0.5281351804733276, "rewards/rejected": 0.6381410360336304, "step": 4815 }, { "epoch": 2.6, "learning_rate": 4.644020224086792e-09, "logits/chosen": -2.0960211753845215, "logits/rejected": -2.0969865322113037, "logps/chosen": -1.207252860069275, "logps/rejected": -5.718178749084473, "loss": 0.4149, "rewards/accuracies": 1.0, "rewards/chosen": 1.0289233922958374, "rewards/margins": 0.6649634838104248, "rewards/rejected": 0.3639598786830902, "step": 4816 }, { "epoch": 2.6, "learning_rate": 4.631773870525657e-09, "logits/chosen": -2.027350664138794, "logits/rejected": -2.293471574783325, "logps/chosen": -0.1446278840303421, "logps/rejected": -0.1823187917470932, "loss": 0.6826, "rewards/accuracies": 1.0, "rewards/chosen": 0.8678328394889832, "rewards/margins": 0.021294772624969482, "rewards/rejected": 0.8465380668640137, "step": 4817 }, { "epoch": 2.6, "learning_rate": 4.619542900941947e-09, "logits/chosen": -2.2270853519439697, "logits/rejected": -2.354015588760376, "logps/chosen": -14.132469177246094, "logps/rejected": -9.773716926574707, "loss": 0.6689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9352943301200867, "rewards/margins": 0.04910820722579956, "rewards/rejected": 0.8861861228942871, "step": 4818 }, { "epoch": 2.6, "learning_rate": 4.607327319483078e-09, "logits/chosen": -2.0534510612487793, "logits/rejected": -2.313155174255371, "logps/chosen": -0.24523258209228516, "logps/rejected": -0.30225440859794617, "loss": 0.6871, "rewards/accuracies": 1.0, "rewards/chosen": 0.8977393507957458, "rewards/margins": 0.012086331844329834, "rewards/rejected": 0.885653018951416, "step": 4819 }, { "epoch": 2.6, "learning_rate": 4.5951271302912655e-09, "logits/chosen": -2.109480381011963, "logits/rejected": -2.2673451900482178, "logps/chosen": -1.536041259765625, "logps/rejected": -1.6372687816619873, "loss": 0.6823, "rewards/accuracies": 1.0, "rewards/chosen": 1.0158352851867676, "rewards/margins": 0.021843254566192627, "rewards/rejected": 0.993992030620575, "step": 4820 }, { "epoch": 2.6, "learning_rate": 4.582942337503465e-09, "logits/chosen": -2.0036675930023193, "logits/rejected": -2.300905227661133, "logps/chosen": -0.34752675890922546, "logps/rejected": -0.4447369873523712, "loss": 0.6807, "rewards/accuracies": 1.0, "rewards/chosen": 1.0080076456069946, "rewards/margins": 0.024998366832733154, "rewards/rejected": 0.9830092787742615, "step": 4821 }, { "epoch": 2.6, "learning_rate": 4.570772945251461e-09, "logits/chosen": -2.0326592922210693, "logits/rejected": -2.26723575592041, "logps/chosen": -1.7677662372589111, "logps/rejected": -2.00530743598938, "loss": 0.6852, "rewards/accuracies": 1.0, "rewards/chosen": 0.8154347538948059, "rewards/margins": 0.015949130058288574, "rewards/rejected": 0.7994856238365173, "step": 4822 }, { "epoch": 2.6, "learning_rate": 4.558618957661764e-09, "logits/chosen": -2.012606143951416, "logits/rejected": -2.002793073654175, "logps/chosen": -4.801800727844238, "logps/rejected": -1.2844183444976807, "loss": 0.4731, "rewards/accuracies": 1.0, "rewards/chosen": 1.2449077367782593, "rewards/margins": 0.5025008916854858, "rewards/rejected": 0.7424068450927734, "step": 4823 }, { "epoch": 2.6, "learning_rate": 4.546480378855699e-09, "logits/chosen": -2.095837354660034, "logits/rejected": -2.312424898147583, "logps/chosen": -0.2407134771347046, "logps/rejected": -0.3248492479324341, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.8891567587852478, "rewards/margins": 0.03826028108596802, "rewards/rejected": 0.8508964776992798, "step": 4824 }, { "epoch": 2.6, "learning_rate": 4.534357212949358e-09, "logits/chosen": -1.9857927560806274, "logits/rejected": -1.988277554512024, "logps/chosen": -0.22584371268749237, "logps/rejected": -2.563113212585449, "loss": 0.5197, "rewards/accuracies": 1.0, "rewards/chosen": 0.9099825024604797, "rewards/margins": 0.3834010362625122, "rewards/rejected": 0.5265814661979675, "step": 4825 }, { "epoch": 2.6, "learning_rate": 4.522249464053601e-09, "logits/chosen": -2.075303316116333, "logits/rejected": -2.279628276824951, "logps/chosen": -2.474454641342163, "logps/rejected": -2.427978038787842, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.7308484315872192, "rewards/margins": 0.016111135482788086, "rewards/rejected": 0.7147372961044312, "step": 4826 }, { "epoch": 2.6, "learning_rate": 4.5101571362740734e-09, "logits/chosen": -1.9407151937484741, "logits/rejected": -2.240882396697998, "logps/chosen": -2.9281423091888428, "logps/rejected": -2.149905204772949, "loss": 0.7376, "rewards/accuracies": 0.0, "rewards/chosen": 0.7532197833061218, "rewards/margins": -0.08701813220977783, "rewards/rejected": 0.8402379155158997, "step": 4827 }, { "epoch": 2.6, "learning_rate": 4.498080233711154e-09, "logits/chosen": -2.0576283931732178, "logits/rejected": -2.2940480709075928, "logps/chosen": -2.5829291343688965, "logps/rejected": -2.671909809112549, "loss": 0.6813, "rewards/accuracies": 1.0, "rewards/chosen": 0.7270677089691162, "rewards/margins": 0.02380424737930298, "rewards/rejected": 0.7032634615898132, "step": 4828 }, { "epoch": 2.6, "learning_rate": 4.486018760460036e-09, "logits/chosen": -2.066128730773926, "logits/rejected": -2.0654337406158447, "logps/chosen": -0.34068915247917175, "logps/rejected": -5.360877990722656, "loss": 0.519, "rewards/accuracies": 1.0, "rewards/chosen": 1.042578101158142, "rewards/margins": 0.3851770758628845, "rewards/rejected": 0.6574010252952576, "step": 4829 }, { "epoch": 2.61, "learning_rate": 4.473972720610664e-09, "logits/chosen": -2.1492581367492676, "logits/rejected": -2.3434667587280273, "logps/chosen": -0.8487682938575745, "logps/rejected": -5.801812648773193, "loss": 0.6163, "rewards/accuracies": 1.0, "rewards/chosen": 0.9283520579338074, "rewards/margins": 0.16015338897705078, "rewards/rejected": 0.7681986689567566, "step": 4830 }, { "epoch": 2.61, "learning_rate": 4.461942118247736e-09, "logits/chosen": -2.0357418060302734, "logits/rejected": -2.0454342365264893, "logps/chosen": -7.3070526123046875, "logps/rejected": -3.0216000080108643, "loss": 0.3558, "rewards/accuracies": 1.0, "rewards/chosen": 1.6348965167999268, "rewards/margins": 0.8500999808311462, "rewards/rejected": 0.7847965359687805, "step": 4831 }, { "epoch": 2.61, "learning_rate": 4.449926957450723e-09, "logits/chosen": -2.039964437484741, "logits/rejected": -2.28914737701416, "logps/chosen": -5.489968776702881, "logps/rejected": -6.6987104415893555, "loss": 0.6276, "rewards/accuracies": 1.0, "rewards/chosen": 0.6185876727104187, "rewards/margins": 0.13568398356437683, "rewards/rejected": 0.48290368914604187, "step": 4832 }, { "epoch": 2.61, "learning_rate": 4.437927242293882e-09, "logits/chosen": -2.1199018955230713, "logits/rejected": -2.000544786453247, "logps/chosen": -14.868460655212402, "logps/rejected": -10.800949096679688, "loss": 0.3484, "rewards/accuracies": 1.0, "rewards/chosen": 1.7560299634933472, "rewards/margins": 0.8752506971359253, "rewards/rejected": 0.8807792663574219, "step": 4833 }, { "epoch": 2.61, "learning_rate": 4.425942976846187e-09, "logits/chosen": -1.9594347476959229, "logits/rejected": -1.9302738904953003, "logps/chosen": -9.846972465515137, "logps/rejected": -3.967334508895874, "loss": 0.2684, "rewards/accuracies": 1.0, "rewards/chosen": 1.6340798139572144, "rewards/margins": 1.1782690286636353, "rewards/rejected": 0.4558107554912567, "step": 4834 }, { "epoch": 2.61, "learning_rate": 4.413974165171436e-09, "logits/chosen": -1.9616799354553223, "logits/rejected": -2.2294700145721436, "logps/chosen": -2.2303993701934814, "logps/rejected": -2.195585250854492, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.748156726360321, "rewards/margins": 0.026345431804656982, "rewards/rejected": 0.7218112945556641, "step": 4835 }, { "epoch": 2.61, "learning_rate": 4.402020811328116e-09, "logits/chosen": -2.038175344467163, "logits/rejected": -2.2766129970550537, "logps/chosen": -0.38808688521385193, "logps/rejected": -0.3621829152107239, "loss": 0.6775, "rewards/accuracies": 1.0, "rewards/chosen": 0.9576995968818665, "rewards/margins": 0.03157311677932739, "rewards/rejected": 0.9261264801025391, "step": 4836 }, { "epoch": 2.61, "learning_rate": 4.3900829193695366e-09, "logits/chosen": -2.1175074577331543, "logits/rejected": -2.2369279861450195, "logps/chosen": -3.5594685077667236, "logps/rejected": -4.381718158721924, "loss": 0.6997, "rewards/accuracies": 0.0, "rewards/chosen": 0.7414202690124512, "rewards/margins": -0.013140082359313965, "rewards/rejected": 0.7545603513717651, "step": 4837 }, { "epoch": 2.61, "learning_rate": 4.378160493343702e-09, "logits/chosen": -2.1488423347473145, "logits/rejected": -2.041407823562622, "logps/chosen": -21.790462493896484, "logps/rejected": -1.3350355625152588, "loss": 0.2624, "rewards/accuracies": 1.0, "rewards/chosen": 2.056389331817627, "rewards/margins": 1.2038443088531494, "rewards/rejected": 0.8525449633598328, "step": 4838 }, { "epoch": 2.61, "learning_rate": 4.366253537293429e-09, "logits/chosen": -2.136098623275757, "logits/rejected": -2.049201726913452, "logps/chosen": -14.06114387512207, "logps/rejected": -5.531736373901367, "loss": 0.2394, "rewards/accuracies": 1.0, "rewards/chosen": 1.7293939590454102, "rewards/margins": 1.3075215816497803, "rewards/rejected": 0.42187243700027466, "step": 4839 }, { "epoch": 2.61, "learning_rate": 4.354362055256266e-09, "logits/chosen": -2.129673480987549, "logits/rejected": -2.3462331295013428, "logps/chosen": -0.5370664596557617, "logps/rejected": -0.5839943289756775, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 1.012209415435791, "rewards/margins": 0.014768898487091064, "rewards/rejected": 0.9974405169487, "step": 4840 }, { "epoch": 2.61, "learning_rate": 4.342486051264493e-09, "logits/chosen": -2.170865535736084, "logits/rejected": -2.165006160736084, "logps/chosen": -4.0948686599731445, "logps/rejected": -5.130634784698486, "loss": 0.5899, "rewards/accuracies": 1.0, "rewards/chosen": 0.8424970507621765, "rewards/margins": 0.218475341796875, "rewards/rejected": 0.6240217089653015, "step": 4841 }, { "epoch": 2.61, "learning_rate": 4.3306255293451775e-09, "logits/chosen": -1.995781660079956, "logits/rejected": -1.9946801662445068, "logps/chosen": -2.019775867462158, "logps/rejected": -8.5920991897583, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 1.4858331680297852, "rewards/margins": 1.1040841341018677, "rewards/rejected": 0.38174906373023987, "step": 4842 }, { "epoch": 2.61, "learning_rate": 4.318780493520124e-09, "logits/chosen": -1.9327616691589355, "logits/rejected": -1.9678313732147217, "logps/chosen": -0.7571017742156982, "logps/rejected": -13.452383995056152, "loss": 0.5275, "rewards/accuracies": 1.0, "rewards/chosen": 1.067323088645935, "rewards/margins": 0.3642634153366089, "rewards/rejected": 0.7030596733093262, "step": 4843 }, { "epoch": 2.61, "learning_rate": 4.30695094780586e-09, "logits/chosen": -2.0792462825775146, "logits/rejected": -2.080599784851074, "logps/chosen": -0.43687137961387634, "logps/rejected": -4.318204402923584, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 0.9845504760742188, "rewards/margins": 0.5307201147079468, "rewards/rejected": 0.45383039116859436, "step": 4844 }, { "epoch": 2.61, "learning_rate": 4.2951368962137135e-09, "logits/chosen": -2.2152724266052246, "logits/rejected": -2.244476318359375, "logps/chosen": -17.341670989990234, "logps/rejected": -14.230050086975098, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.8686306476593018, "rewards/margins": 0.6033494472503662, "rewards/rejected": 1.2652812004089355, "step": 4845 }, { "epoch": 2.61, "learning_rate": 4.283338342749704e-09, "logits/chosen": -2.158048152923584, "logits/rejected": -2.1614530086517334, "logps/chosen": -1.5838260650634766, "logps/rejected": -4.947543621063232, "loss": 0.4513, "rewards/accuracies": 1.0, "rewards/chosen": 1.0642460584640503, "rewards/margins": 0.5614275336265564, "rewards/rejected": 0.5028185248374939, "step": 4846 }, { "epoch": 2.61, "learning_rate": 4.271555291414636e-09, "logits/chosen": -2.042656660079956, "logits/rejected": -2.0211150646209717, "logps/chosen": -6.859500885009766, "logps/rejected": -6.068778038024902, "loss": 0.3603, "rewards/accuracies": 1.0, "rewards/chosen": 1.3486292362213135, "rewards/margins": 0.8353240489959717, "rewards/rejected": 0.5133051872253418, "step": 4847 }, { "epoch": 2.61, "learning_rate": 4.259787746204036e-09, "logits/chosen": -2.080307960510254, "logits/rejected": -2.2668988704681396, "logps/chosen": -6.703496932983398, "logps/rejected": -0.9354530572891235, "loss": 0.6586, "rewards/accuracies": 1.0, "rewards/chosen": 0.9047490954399109, "rewards/margins": 0.07041090726852417, "rewards/rejected": 0.8343381881713867, "step": 4848 }, { "epoch": 2.62, "learning_rate": 4.248035711108172e-09, "logits/chosen": -2.2135863304138184, "logits/rejected": -2.1775124073028564, "logps/chosen": -27.516204833984375, "logps/rejected": -1.9745371341705322, "loss": 0.3513, "rewards/accuracies": 1.0, "rewards/chosen": 1.8888691663742065, "rewards/margins": 0.8653931617736816, "rewards/rejected": 1.023476004600525, "step": 4849 }, { "epoch": 2.62, "learning_rate": 4.236299190112075e-09, "logits/chosen": -2.116969347000122, "logits/rejected": -2.1191980838775635, "logps/chosen": -1.737907886505127, "logps/rejected": -1.969458818435669, "loss": 0.5806, "rewards/accuracies": 1.0, "rewards/chosen": 1.1495189666748047, "rewards/margins": 0.2393031120300293, "rewards/rejected": 0.9102158546447754, "step": 4850 }, { "epoch": 2.62, "learning_rate": 4.224578187195477e-09, "logits/chosen": -2.218977928161621, "logits/rejected": -2.211512804031372, "logps/chosen": -1.681714415550232, "logps/rejected": -10.602596282958984, "loss": 0.3559, "rewards/accuracies": 1.0, "rewards/chosen": 1.363155722618103, "rewards/margins": 0.8498288989067078, "rewards/rejected": 0.5133268237113953, "step": 4851 }, { "epoch": 2.62, "learning_rate": 4.212872706332882e-09, "logits/chosen": -2.1021149158477783, "logits/rejected": -2.3307101726531982, "logps/chosen": -0.9997062087059021, "logps/rejected": -0.8842338919639587, "loss": 0.6265, "rewards/accuracies": 1.0, "rewards/chosen": 1.2469806671142578, "rewards/margins": 0.13796591758728027, "rewards/rejected": 1.1090147495269775, "step": 4852 }, { "epoch": 2.62, "learning_rate": 4.20118275149352e-09, "logits/chosen": -2.1044833660125732, "logits/rejected": -2.106929302215576, "logps/chosen": -0.5284678339958191, "logps/rejected": -3.498826503753662, "loss": 0.5385, "rewards/accuracies": 1.0, "rewards/chosen": 0.7738277316093445, "rewards/margins": 0.3375500440597534, "rewards/rejected": 0.43627768754959106, "step": 4853 }, { "epoch": 2.62, "learning_rate": 4.189508326641339e-09, "logits/chosen": -2.094271183013916, "logits/rejected": -2.301551103591919, "logps/chosen": -0.9397778511047363, "logps/rejected": -1.0404597520828247, "loss": 0.6756, "rewards/accuracies": 1.0, "rewards/chosen": 0.8931543231010437, "rewards/margins": 0.03531169891357422, "rewards/rejected": 0.8578426241874695, "step": 4854 }, { "epoch": 2.62, "learning_rate": 4.17784943573507e-09, "logits/chosen": -2.098930597305298, "logits/rejected": -2.259326457977295, "logps/chosen": -0.37373417615890503, "logps/rejected": -0.4180868864059448, "loss": 0.6747, "rewards/accuracies": 1.0, "rewards/chosen": 0.9570233225822449, "rewards/margins": 0.03715682029724121, "rewards/rejected": 0.9198665022850037, "step": 4855 }, { "epoch": 2.62, "learning_rate": 4.1662060827281076e-09, "logits/chosen": -2.1009585857391357, "logits/rejected": -2.111341953277588, "logps/chosen": -1.9782088994979858, "logps/rejected": -1.881695032119751, "loss": 0.504, "rewards/accuracies": 1.0, "rewards/chosen": 1.1953738927841187, "rewards/margins": 0.42269808053970337, "rewards/rejected": 0.7726758122444153, "step": 4856 }, { "epoch": 2.62, "learning_rate": 4.154578271568632e-09, "logits/chosen": -2.0746209621429443, "logits/rejected": -2.312513589859009, "logps/chosen": -0.38429200649261475, "logps/rejected": -0.3694842457771301, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.9173041582107544, "rewards/margins": 0.03196918964385986, "rewards/rejected": 0.8853349685668945, "step": 4857 }, { "epoch": 2.62, "learning_rate": 4.142966006199533e-09, "logits/chosen": -2.112758159637451, "logits/rejected": -2.119418144226074, "logps/chosen": -4.074713706970215, "logps/rejected": -3.97471284866333, "loss": 0.4379, "rewards/accuracies": 1.0, "rewards/chosen": 1.1865508556365967, "rewards/margins": 0.5987391471862793, "rewards/rejected": 0.5878117084503174, "step": 4858 }, { "epoch": 2.62, "learning_rate": 4.131369290558423e-09, "logits/chosen": -2.144341230392456, "logits/rejected": -2.3226821422576904, "logps/chosen": -0.3625325858592987, "logps/rejected": -0.35828641057014465, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.8651720881462097, "rewards/margins": 0.01890474557876587, "rewards/rejected": 0.8462673425674438, "step": 4859 }, { "epoch": 2.62, "learning_rate": 4.119788128577667e-09, "logits/chosen": -2.1137773990631104, "logits/rejected": -2.296952962875366, "logps/chosen": -2.054448366165161, "logps/rejected": -0.8232895135879517, "loss": 0.6705, "rewards/accuracies": 1.0, "rewards/chosen": 0.8800603151321411, "rewards/margins": 0.04579848051071167, "rewards/rejected": 0.8342618346214294, "step": 4860 }, { "epoch": 2.62, "learning_rate": 4.108222524184313e-09, "logits/chosen": -2.196387767791748, "logits/rejected": -2.29223895072937, "logps/chosen": -2.548740863800049, "logps/rejected": -2.587486982345581, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9449540972709656, "rewards/margins": 0.013809442520141602, "rewards/rejected": 0.931144654750824, "step": 4861 }, { "epoch": 2.62, "learning_rate": 4.096672481300167e-09, "logits/chosen": -2.109863519668579, "logits/rejected": -2.1383378505706787, "logps/chosen": -5.907855987548828, "logps/rejected": -9.044666290283203, "loss": 0.4036, "rewards/accuracies": 1.0, "rewards/chosen": 1.606869101524353, "rewards/margins": 0.6987519860267639, "rewards/rejected": 0.9081171154975891, "step": 4862 }, { "epoch": 2.62, "learning_rate": 4.085138003841765e-09, "logits/chosen": -2.008511543273926, "logits/rejected": -2.004760980606079, "logps/chosen": -3.3385910987854004, "logps/rejected": -3.535943031311035, "loss": 0.3479, "rewards/accuracies": 1.0, "rewards/chosen": 1.4786291122436523, "rewards/margins": 0.8767004013061523, "rewards/rejected": 0.6019287109375, "step": 4863 }, { "epoch": 2.62, "learning_rate": 4.073619095720321e-09, "logits/chosen": -2.1584365367889404, "logits/rejected": -2.1557188034057617, "logps/chosen": -3.7713699340820312, "logps/rejected": -3.7071731090545654, "loss": 0.5612, "rewards/accuracies": 1.0, "rewards/chosen": 0.7813284993171692, "rewards/margins": 0.28403910994529724, "rewards/rejected": 0.49728938937187195, "step": 4864 }, { "epoch": 2.62, "learning_rate": 4.062115760841833e-09, "logits/chosen": -2.120309591293335, "logits/rejected": -2.122096538543701, "logps/chosen": -0.555580198764801, "logps/rejected": -8.828458786010742, "loss": 0.3903, "rewards/accuracies": 1.0, "rewards/chosen": 1.196082592010498, "rewards/margins": 0.7393385767936707, "rewards/rejected": 0.4567440152168274, "step": 4865 }, { "epoch": 2.62, "learning_rate": 4.05062800310696e-09, "logits/chosen": -1.9523539543151855, "logits/rejected": -2.2081074714660645, "logps/chosen": -0.32105809450149536, "logps/rejected": -0.3704296350479126, "loss": 0.6999, "rewards/accuracies": 0.0, "rewards/chosen": 0.9372777342796326, "rewards/margins": -0.01350027322769165, "rewards/rejected": 0.9507780075073242, "step": 4866 }, { "epoch": 2.63, "learning_rate": 4.039155826411106e-09, "logits/chosen": -2.0545811653137207, "logits/rejected": -2.240069627761841, "logps/chosen": -0.3265169560909271, "logps/rejected": -0.32183873653411865, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.9034135937690735, "rewards/margins": -0.00697249174118042, "rewards/rejected": 0.9103860855102539, "step": 4867 }, { "epoch": 2.63, "learning_rate": 4.0276992346443935e-09, "logits/chosen": -2.101440668106079, "logits/rejected": -2.0971601009368896, "logps/chosen": -7.992295265197754, "logps/rejected": -4.096051216125488, "loss": 0.2885, "rewards/accuracies": 1.0, "rewards/chosen": 1.6038646697998047, "rewards/margins": 1.0955231189727783, "rewards/rejected": 0.5083414912223816, "step": 4868 }, { "epoch": 2.63, "learning_rate": 4.016258231691649e-09, "logits/chosen": -2.1036338806152344, "logits/rejected": -2.2356958389282227, "logps/chosen": -3.533461809158325, "logps/rejected": -3.4717626571655273, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 0.7293351292610168, "rewards/margins": -0.04123717546463013, "rewards/rejected": 0.770572304725647, "step": 4869 }, { "epoch": 2.63, "learning_rate": 4.004832821432419e-09, "logits/chosen": -2.040759801864624, "logits/rejected": -2.338603973388672, "logps/chosen": -0.9318448901176453, "logps/rejected": -0.9134427309036255, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.8867220878601074, "rewards/margins": 0.01743030548095703, "rewards/rejected": 0.8692917823791504, "step": 4870 }, { "epoch": 2.63, "learning_rate": 3.99342300774097e-09, "logits/chosen": -2.116255760192871, "logits/rejected": -2.3716564178466797, "logps/chosen": -12.094324111938477, "logps/rejected": -7.476490020751953, "loss": 0.8303, "rewards/accuracies": 0.0, "rewards/chosen": 0.9743131995201111, "rewards/margins": -0.2577943205833435, "rewards/rejected": 1.2321075201034546, "step": 4871 }, { "epoch": 2.63, "learning_rate": 3.9820287944862644e-09, "logits/chosen": -2.1995468139648438, "logits/rejected": -2.2656772136688232, "logps/chosen": -2.4945385456085205, "logps/rejected": -11.85038948059082, "loss": 0.4462, "rewards/accuracies": 1.0, "rewards/chosen": 1.432253360748291, "rewards/margins": 0.5755915641784668, "rewards/rejected": 0.8566617965698242, "step": 4872 }, { "epoch": 2.63, "learning_rate": 3.9706501855319764e-09, "logits/chosen": -2.050023078918457, "logits/rejected": -2.1679203510284424, "logps/chosen": -0.4434114098548889, "logps/rejected": -17.619640350341797, "loss": 0.4726, "rewards/accuracies": 1.0, "rewards/chosen": 0.9806565642356873, "rewards/margins": 0.5039292573928833, "rewards/rejected": 0.47672730684280396, "step": 4873 }, { "epoch": 2.63, "learning_rate": 3.959287184736509e-09, "logits/chosen": -1.9954540729522705, "logits/rejected": -2.3575971126556396, "logps/chosen": -3.3386454582214355, "logps/rejected": -3.657714366912842, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 1.0766159296035767, "rewards/margins": 0.016502022743225098, "rewards/rejected": 1.0601139068603516, "step": 4874 }, { "epoch": 2.63, "learning_rate": 3.947939795952959e-09, "logits/chosen": -1.986882209777832, "logits/rejected": -2.259963035583496, "logps/chosen": -0.45648691058158875, "logps/rejected": -0.4807468056678772, "loss": 0.6903, "rewards/accuracies": 1.0, "rewards/chosen": 0.9609708786010742, "rewards/margins": 0.005729556083679199, "rewards/rejected": 0.955241322517395, "step": 4875 }, { "epoch": 2.63, "learning_rate": 3.93660802302912e-09, "logits/chosen": -2.019185781478882, "logits/rejected": -2.022515058517456, "logps/chosen": -2.5959627628326416, "logps/rejected": -0.7867071032524109, "loss": 0.6175, "rewards/accuracies": 1.0, "rewards/chosen": 1.1582672595977783, "rewards/margins": 0.15742933750152588, "rewards/rejected": 1.0008379220962524, "step": 4876 }, { "epoch": 2.63, "learning_rate": 3.925291869807496e-09, "logits/chosen": -2.200758695602417, "logits/rejected": -2.195842742919922, "logps/chosen": -6.098664283752441, "logps/rejected": -6.256224632263184, "loss": 0.3833, "rewards/accuracies": 1.0, "rewards/chosen": 1.224179983139038, "rewards/margins": 0.7611311674118042, "rewards/rejected": 0.4630488455295563, "step": 4877 }, { "epoch": 2.63, "learning_rate": 3.9139913401253164e-09, "logits/chosen": -2.1222658157348633, "logits/rejected": -2.135819911956787, "logps/chosen": -3.073068380355835, "logps/rejected": -5.6132683753967285, "loss": 0.4475, "rewards/accuracies": 1.0, "rewards/chosen": 1.3660808801651, "rewards/margins": 0.572050154209137, "rewards/rejected": 0.7940307259559631, "step": 4878 }, { "epoch": 2.63, "learning_rate": 3.902706437814468e-09, "logits/chosen": -1.9987417459487915, "logits/rejected": -1.9986422061920166, "logps/chosen": -0.4244113862514496, "logps/rejected": -1.8361388444900513, "loss": 0.569, "rewards/accuracies": 1.0, "rewards/chosen": 0.8856420516967773, "rewards/margins": 0.265841543674469, "rewards/rejected": 0.6198005080223083, "step": 4879 }, { "epoch": 2.63, "learning_rate": 3.891437166701583e-09, "logits/chosen": -2.0904126167297363, "logits/rejected": -2.089470624923706, "logps/chosen": -0.6893364787101746, "logps/rejected": -5.075158596038818, "loss": 0.5296, "rewards/accuracies": 1.0, "rewards/chosen": 0.8839231729507446, "rewards/margins": 0.35913175344467163, "rewards/rejected": 0.524791419506073, "step": 4880 }, { "epoch": 2.63, "learning_rate": 3.880183530607972e-09, "logits/chosen": -2.1010048389434814, "logits/rejected": -2.313868999481201, "logps/chosen": -0.4716894328594208, "logps/rejected": -0.45726388692855835, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 1.0330933332443237, "rewards/margins": 0.035845935344696045, "rewards/rejected": 0.9972473978996277, "step": 4881 }, { "epoch": 2.63, "learning_rate": 3.868945533349643e-09, "logits/chosen": -2.071018695831299, "logits/rejected": -2.125962257385254, "logps/chosen": -4.748021602630615, "logps/rejected": -13.223613739013672, "loss": 0.2869, "rewards/accuracies": 1.0, "rewards/chosen": 1.5460999011993408, "rewards/margins": 1.1016491651535034, "rewards/rejected": 0.4444507658481598, "step": 4882 }, { "epoch": 2.63, "learning_rate": 3.857723178737304e-09, "logits/chosen": -2.081631898880005, "logits/rejected": -2.3267624378204346, "logps/chosen": -0.6599777340888977, "logps/rejected": -0.6309259533882141, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 0.9637467265129089, "rewards/margins": 0.024329662322998047, "rewards/rejected": 0.9394170641899109, "step": 4883 }, { "epoch": 2.63, "learning_rate": 3.846516470576361e-09, "logits/chosen": -2.228633403778076, "logits/rejected": -2.1960785388946533, "logps/chosen": -23.725753784179688, "logps/rejected": -11.529099464416504, "loss": 0.3534, "rewards/accuracies": 1.0, "rewards/chosen": 1.9212265014648438, "rewards/margins": 0.8582276105880737, "rewards/rejected": 1.06299889087677, "step": 4884 }, { "epoch": 2.63, "learning_rate": 3.835325412666929e-09, "logits/chosen": -2.020402669906616, "logits/rejected": -2.0153110027313232, "logps/chosen": -5.83796501159668, "logps/rejected": -4.300931453704834, "loss": 0.2823, "rewards/accuracies": 1.0, "rewards/chosen": 1.6693060398101807, "rewards/margins": 1.1201145648956299, "rewards/rejected": 0.5491915345191956, "step": 4885 }, { "epoch": 2.64, "learning_rate": 3.824150008803767e-09, "logits/chosen": -2.133791208267212, "logits/rejected": -2.298970937728882, "logps/chosen": -0.6814918518066406, "logps/rejected": -0.7825188636779785, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 1.0192567110061646, "rewards/margins": 0.008480548858642578, "rewards/rejected": 1.010776162147522, "step": 4886 }, { "epoch": 2.64, "learning_rate": 3.812990262776383e-09, "logits/chosen": -2.210376024246216, "logits/rejected": -2.216834545135498, "logps/chosen": -1.7054004669189453, "logps/rejected": -4.308074474334717, "loss": 0.4648, "rewards/accuracies": 1.0, "rewards/chosen": 0.9852685332298279, "rewards/margins": 0.5246292948722839, "rewards/rejected": 0.46063923835754395, "step": 4887 }, { "epoch": 2.64, "learning_rate": 3.80184617836895e-09, "logits/chosen": -2.0590896606445312, "logits/rejected": -2.0670061111450195, "logps/chosen": -1.072295069694519, "logps/rejected": -2.636449098587036, "loss": 0.4739, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793966054916382, "rewards/margins": 0.50057452917099, "rewards/rejected": 0.5788220763206482, "step": 4888 }, { "epoch": 2.64, "learning_rate": 3.790717759360324e-09, "logits/chosen": -2.0296645164489746, "logits/rejected": -2.030790328979492, "logps/chosen": -3.150845527648926, "logps/rejected": -1.0076572895050049, "loss": 0.4861, "rewards/accuracies": 1.0, "rewards/chosen": 1.430039882659912, "rewards/margins": 0.46854543685913086, "rewards/rejected": 0.9614944458007812, "step": 4889 }, { "epoch": 2.64, "learning_rate": 3.7796050095240505e-09, "logits/chosen": -2.0800282955169678, "logits/rejected": -2.285209894180298, "logps/chosen": -6.861189365386963, "logps/rejected": -0.6225022077560425, "loss": 0.7343, "rewards/accuracies": 0.0, "rewards/chosen": 0.806086540222168, "rewards/margins": -0.08066314458847046, "rewards/rejected": 0.8867496848106384, "step": 4890 }, { "epoch": 2.64, "learning_rate": 3.768507932628396e-09, "logits/chosen": -2.180311679840088, "logits/rejected": -2.3197031021118164, "logps/chosen": -0.30071279406547546, "logps/rejected": -0.30593183636665344, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.9068987965583801, "rewards/margins": 0.026015102863311768, "rewards/rejected": 0.8808836936950684, "step": 4891 }, { "epoch": 2.64, "learning_rate": 3.757426532436248e-09, "logits/chosen": -2.1334822177886963, "logits/rejected": -2.325376033782959, "logps/chosen": -0.8778951168060303, "logps/rejected": -0.8370203971862793, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 1.0212290287017822, "rewards/margins": 0.011572122573852539, "rewards/rejected": 1.0096569061279297, "step": 4892 }, { "epoch": 2.64, "learning_rate": 3.746360812705235e-09, "logits/chosen": -2.1208271980285645, "logits/rejected": -2.260845899581909, "logps/chosen": -0.5023139715194702, "logps/rejected": -0.5620715618133545, "loss": 0.6962, "rewards/accuracies": 0.0, "rewards/chosen": 0.8247532248497009, "rewards/margins": -0.0061699748039245605, "rewards/rejected": 0.8309231996536255, "step": 4893 }, { "epoch": 2.64, "learning_rate": 3.735310777187645e-09, "logits/chosen": -2.0416948795318604, "logits/rejected": -2.0305070877075195, "logps/chosen": -3.8155035972595215, "logps/rejected": -2.3993122577667236, "loss": 0.3571, "rewards/accuracies": 1.0, "rewards/chosen": 1.6834264993667603, "rewards/margins": 0.8457331657409668, "rewards/rejected": 0.8376933336257935, "step": 4894 }, { "epoch": 2.64, "learning_rate": 3.724276429630452e-09, "logits/chosen": -2.09486722946167, "logits/rejected": -2.3173577785491943, "logps/chosen": -1.0004745721817017, "logps/rejected": -1.15071439743042, "loss": 0.6811, "rewards/accuracies": 1.0, "rewards/chosen": 1.0866551399230957, "rewards/margins": 0.024286389350891113, "rewards/rejected": 1.0623687505722046, "step": 4895 }, { "epoch": 2.64, "learning_rate": 3.7132577737753134e-09, "logits/chosen": -2.036257266998291, "logits/rejected": -2.04325008392334, "logps/chosen": -0.4450777769088745, "logps/rejected": -4.03347110748291, "loss": 0.5003, "rewards/accuracies": 1.0, "rewards/chosen": 0.8389212489128113, "rewards/margins": 0.4320618510246277, "rewards/rejected": 0.4068593978881836, "step": 4896 }, { "epoch": 2.64, "learning_rate": 3.7022548133585463e-09, "logits/chosen": -2.129634141921997, "logits/rejected": -2.1330864429473877, "logps/chosen": -1.9395636320114136, "logps/rejected": -5.056308746337891, "loss": 0.3933, "rewards/accuracies": 1.0, "rewards/chosen": 1.174494981765747, "rewards/margins": 0.7302371263504028, "rewards/rejected": 0.44425782561302185, "step": 4897 }, { "epoch": 2.64, "learning_rate": 3.6912675521111823e-09, "logits/chosen": -1.9824405908584595, "logits/rejected": -1.982283353805542, "logps/chosen": -1.7156059741973877, "logps/rejected": -2.6972427368164062, "loss": 0.5974, "rewards/accuracies": 1.0, "rewards/chosen": 1.10721755027771, "rewards/margins": 0.20161336660385132, "rewards/rejected": 0.9056041836738586, "step": 4898 }, { "epoch": 2.64, "learning_rate": 3.6802959937588807e-09, "logits/chosen": -2.0874791145324707, "logits/rejected": -2.3221805095672607, "logps/chosen": -1.3571851253509521, "logps/rejected": -0.9756225347518921, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.5618833899497986, "rewards/margins": -0.025300264358520508, "rewards/rejected": 0.5871836543083191, "step": 4899 }, { "epoch": 2.64, "learning_rate": 3.6693401420220205e-09, "logits/chosen": -2.1661431789398193, "logits/rejected": -2.1575276851654053, "logps/chosen": -0.8967150449752808, "logps/rejected": -5.893204689025879, "loss": 0.4375, "rewards/accuracies": 1.0, "rewards/chosen": 1.0983867645263672, "rewards/margins": 0.5999908447265625, "rewards/rejected": 0.4983959197998047, "step": 4900 }, { "epoch": 2.64, "learning_rate": 3.658400000615636e-09, "logits/chosen": -2.004929542541504, "logits/rejected": -2.2728633880615234, "logps/chosen": -0.3846389353275299, "logps/rejected": -0.4531199038028717, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.9799469113349915, "rewards/margins": 0.02835923433303833, "rewards/rejected": 0.9515876770019531, "step": 4901 }, { "epoch": 2.64, "learning_rate": 3.647475573249431e-09, "logits/chosen": -2.1449697017669678, "logits/rejected": -2.1480019092559814, "logps/chosen": -1.3643656969070435, "logps/rejected": -11.996969223022461, "loss": 0.3426, "rewards/accuracies": 1.0, "rewards/chosen": 1.1741729974746704, "rewards/margins": 0.894871711730957, "rewards/rejected": 0.279301255941391, "step": 4902 }, { "epoch": 2.64, "learning_rate": 3.636566863627788e-09, "logits/chosen": -2.0132551193237305, "logits/rejected": -2.4047136306762695, "logps/chosen": -7.856863975524902, "logps/rejected": -14.5846529006958, "loss": 0.8959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8623479008674622, "rewards/margins": -0.3711826205253601, "rewards/rejected": 1.2335305213928223, "step": 4903 }, { "epoch": 2.65, "learning_rate": 3.625673875449753e-09, "logits/chosen": -2.079244375228882, "logits/rejected": -2.3129897117614746, "logps/chosen": -0.3218867778778076, "logps/rejected": -0.30240511894226074, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9267644882202148, "rewards/margins": 0.01844918727874756, "rewards/rejected": 0.9083153009414673, "step": 4904 }, { "epoch": 2.65, "learning_rate": 3.614796612409049e-09, "logits/chosen": -2.1006133556365967, "logits/rejected": -2.28460955619812, "logps/chosen": -0.2733061909675598, "logps/rejected": -0.2684325575828552, "loss": 0.6918, "rewards/accuracies": 1.0, "rewards/chosen": 0.8240299224853516, "rewards/margins": 0.002710402011871338, "rewards/rejected": 0.8213195204734802, "step": 4905 }, { "epoch": 2.65, "learning_rate": 3.603935078194076e-09, "logits/chosen": -2.024069309234619, "logits/rejected": -2.026667833328247, "logps/chosen": -2.0078952312469482, "logps/rejected": -4.399311542510986, "loss": 0.259, "rewards/accuracies": 1.0, "rewards/chosen": 1.7100690603256226, "rewards/margins": 1.2186588048934937, "rewards/rejected": 0.4914102256298065, "step": 4906 }, { "epoch": 2.65, "learning_rate": 3.5930892764878595e-09, "logits/chosen": -2.0687432289123535, "logits/rejected": -2.288465976715088, "logps/chosen": -5.565899848937988, "logps/rejected": -1.510762095451355, "loss": 0.7072, "rewards/accuracies": 0.0, "rewards/chosen": 1.0348637104034424, "rewards/margins": -0.027883172035217285, "rewards/rejected": 1.0627468824386597, "step": 4907 }, { "epoch": 2.65, "learning_rate": 3.582259210968147e-09, "logits/chosen": -2.0414235591888428, "logits/rejected": -2.0380992889404297, "logps/chosen": -5.540661811828613, "logps/rejected": -4.1800103187561035, "loss": 0.4172, "rewards/accuracies": 1.0, "rewards/chosen": 1.1601051092147827, "rewards/margins": 0.6583040356636047, "rewards/rejected": 0.501801073551178, "step": 4908 }, { "epoch": 2.65, "learning_rate": 3.5714448853072955e-09, "logits/chosen": -2.1561903953552246, "logits/rejected": -2.3455708026885986, "logps/chosen": -1.0655920505523682, "logps/rejected": -1.1276768445968628, "loss": 0.6877, "rewards/accuracies": 1.0, "rewards/chosen": 0.6954872012138367, "rewards/margins": 0.010836482048034668, "rewards/rejected": 0.684650719165802, "step": 4909 }, { "epoch": 2.65, "learning_rate": 3.56064630317236e-09, "logits/chosen": -2.1066088676452637, "logits/rejected": -2.125720739364624, "logps/chosen": -4.219489097595215, "logps/rejected": -3.3115241527557373, "loss": 0.4701, "rewards/accuracies": 1.0, "rewards/chosen": 1.284445881843567, "rewards/margins": 0.5106544494628906, "rewards/rejected": 0.7737914323806763, "step": 4910 }, { "epoch": 2.65, "learning_rate": 3.549863468225062e-09, "logits/chosen": -2.170274257659912, "logits/rejected": -2.1694576740264893, "logps/chosen": -2.641934394836426, "logps/rejected": -13.040243148803711, "loss": 0.5442, "rewards/accuracies": 1.0, "rewards/chosen": 0.9655831456184387, "rewards/margins": 0.3240818977355957, "rewards/rejected": 0.641501247882843, "step": 4911 }, { "epoch": 2.65, "learning_rate": 3.5390963841217425e-09, "logits/chosen": -1.9939740896224976, "logits/rejected": -2.000459909439087, "logps/chosen": -3.0556020736694336, "logps/rejected": -2.7894718647003174, "loss": 0.4372, "rewards/accuracies": 1.0, "rewards/chosen": 1.2203506231307983, "rewards/margins": 0.6007565259933472, "rewards/rejected": 0.6195940971374512, "step": 4912 }, { "epoch": 2.65, "learning_rate": 3.528345054513443e-09, "logits/chosen": -2.1311943531036377, "logits/rejected": -2.312211036682129, "logps/chosen": -2.7906131744384766, "logps/rejected": -2.7341933250427246, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483094215393066, "rewards/margins": 0.015089333057403564, "rewards/rejected": 0.9332200884819031, "step": 4913 }, { "epoch": 2.65, "learning_rate": 3.517609483045847e-09, "logits/chosen": -2.151001453399658, "logits/rejected": -2.203519821166992, "logps/chosen": -11.38436508178711, "logps/rejected": -20.48166847229004, "loss": 0.3549, "rewards/accuracies": 1.0, "rewards/chosen": 1.6977840662002563, "rewards/margins": 0.8532682657241821, "rewards/rejected": 0.8445158004760742, "step": 4914 }, { "epoch": 2.65, "learning_rate": 3.506889673359292e-09, "logits/chosen": -2.0066936016082764, "logits/rejected": -2.2762320041656494, "logps/chosen": -1.0157712697982788, "logps/rejected": -1.046026587486267, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8565228581428528, "rewards/margins": -0.001063227653503418, "rewards/rejected": 0.8575860857963562, "step": 4915 }, { "epoch": 2.65, "learning_rate": 3.4961856290887827e-09, "logits/chosen": -2.240784168243408, "logits/rejected": -2.189645290374756, "logps/chosen": -21.90105438232422, "logps/rejected": -3.9347996711730957, "loss": 0.2027, "rewards/accuracies": 1.0, "rewards/chosen": 2.2983360290527344, "rewards/margins": 1.4927256107330322, "rewards/rejected": 0.8056104779243469, "step": 4916 }, { "epoch": 2.65, "learning_rate": 3.4854973538639476e-09, "logits/chosen": -2.213331937789917, "logits/rejected": -2.209327459335327, "logps/chosen": -6.208907127380371, "logps/rejected": -3.870620012283325, "loss": 0.4784, "rewards/accuracies": 1.0, "rewards/chosen": 0.9505526423454285, "rewards/margins": 0.48868080973625183, "rewards/rejected": 0.46187183260917664, "step": 4917 }, { "epoch": 2.65, "learning_rate": 3.4748248513091106e-09, "logits/chosen": -2.0908517837524414, "logits/rejected": -2.0923807621002197, "logps/chosen": -2.007643461227417, "logps/rejected": -0.9154716730117798, "loss": 0.5459, "rewards/accuracies": 1.0, "rewards/chosen": 1.3604090213775635, "rewards/margins": 0.319949746131897, "rewards/rejected": 1.0404592752456665, "step": 4918 }, { "epoch": 2.65, "learning_rate": 3.464168125043221e-09, "logits/chosen": -2.0691277980804443, "logits/rejected": -2.0589122772216797, "logps/chosen": -4.937938690185547, "logps/rejected": -4.778232574462891, "loss": 0.3839, "rewards/accuracies": 1.0, "rewards/chosen": 1.2720797061920166, "rewards/margins": 0.7593865990638733, "rewards/rejected": 0.5126931071281433, "step": 4919 }, { "epoch": 2.65, "learning_rate": 3.453527178679877e-09, "logits/chosen": -2.094295024871826, "logits/rejected": -2.1013455390930176, "logps/chosen": -1.1782609224319458, "logps/rejected": -2.8932197093963623, "loss": 0.469, "rewards/accuracies": 1.0, "rewards/chosen": 1.1114201545715332, "rewards/margins": 0.5134299397468567, "rewards/rejected": 0.5979902148246765, "step": 4920 }, { "epoch": 2.65, "learning_rate": 3.4429020158273425e-09, "logits/chosen": -2.153186798095703, "logits/rejected": -2.154651641845703, "logps/chosen": -1.388885736465454, "logps/rejected": -3.242173433303833, "loss": 0.5161, "rewards/accuracies": 1.0, "rewards/chosen": 1.0222976207733154, "rewards/margins": 0.39233219623565674, "rewards/rejected": 0.6299654245376587, "step": 4921 }, { "epoch": 2.65, "learning_rate": 3.4322926400884965e-09, "logits/chosen": -2.02517032623291, "logits/rejected": -2.0146877765655518, "logps/chosen": -12.338996887207031, "logps/rejected": -2.750349998474121, "loss": 0.6348, "rewards/accuracies": 1.0, "rewards/chosen": 0.9594900012016296, "rewards/margins": 0.12038087844848633, "rewards/rejected": 0.8391091227531433, "step": 4922 }, { "epoch": 2.66, "learning_rate": 3.4216990550609226e-09, "logits/chosen": -2.1119208335876465, "logits/rejected": -2.194796323776245, "logps/chosen": -1.5301001071929932, "logps/rejected": -17.211666107177734, "loss": 0.5648, "rewards/accuracies": 1.0, "rewards/chosen": 1.2451609373092651, "rewards/margins": 0.2757059931755066, "rewards/rejected": 0.9694549441337585, "step": 4923 }, { "epoch": 2.66, "learning_rate": 3.411121264336786e-09, "logits/chosen": -2.131355047225952, "logits/rejected": -2.1249642372131348, "logps/chosen": -4.52777099609375, "logps/rejected": -4.334216594696045, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": 1.4445661306381226, "rewards/margins": 1.0442280769348145, "rewards/rejected": 0.4003380239009857, "step": 4924 }, { "epoch": 2.66, "learning_rate": 3.4005592715029396e-09, "logits/chosen": -2.1107640266418457, "logits/rejected": -2.103187322616577, "logps/chosen": -4.358249187469482, "logps/rejected": -5.027547836303711, "loss": 0.4034, "rewards/accuracies": 1.0, "rewards/chosen": 1.1773262023925781, "rewards/margins": 0.6993108987808228, "rewards/rejected": 0.47801533341407776, "step": 4925 }, { "epoch": 2.66, "learning_rate": 3.390013080140863e-09, "logits/chosen": -2.121821165084839, "logits/rejected": -2.1210734844207764, "logps/chosen": -4.352608680725098, "logps/rejected": -5.98259973526001, "loss": 0.3273, "rewards/accuracies": 1.0, "rewards/chosen": 1.3832341432571411, "rewards/margins": 0.9486285448074341, "rewards/rejected": 0.43460556864738464, "step": 4926 }, { "epoch": 2.66, "learning_rate": 3.379482693826674e-09, "logits/chosen": -2.0692903995513916, "logits/rejected": -2.0749354362487793, "logps/chosen": -2.512047290802002, "logps/rejected": -0.6101825833320618, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 1.3077472448349, "rewards/margins": 0.4430510997772217, "rewards/rejected": 0.8646961450576782, "step": 4927 }, { "epoch": 2.66, "learning_rate": 3.368968116131138e-09, "logits/chosen": -1.9715708494186401, "logits/rejected": -2.3185958862304688, "logps/chosen": -0.1526549756526947, "logps/rejected": -0.17920321226119995, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9159588813781738, "rewards/margins": 0.018451392650604248, "rewards/rejected": 0.8975074887275696, "step": 4928 }, { "epoch": 2.66, "learning_rate": 3.3584693506196692e-09, "logits/chosen": -2.1098792552948, "logits/rejected": -2.1037702560424805, "logps/chosen": -1.3806134462356567, "logps/rejected": -7.987785339355469, "loss": 0.3425, "rewards/accuracies": 1.0, "rewards/chosen": 1.2251642942428589, "rewards/margins": 0.8952187299728394, "rewards/rejected": 0.32994556427001953, "step": 4929 }, { "epoch": 2.66, "learning_rate": 3.347986400852293e-09, "logits/chosen": -2.0528676509857178, "logits/rejected": -2.15567684173584, "logps/chosen": -2.5466127395629883, "logps/rejected": -10.398681640625, "loss": 0.4732, "rewards/accuracies": 1.0, "rewards/chosen": 1.313790202140808, "rewards/margins": 0.5022768378257751, "rewards/rejected": 0.811513364315033, "step": 4930 }, { "epoch": 2.66, "learning_rate": 3.337519270383704e-09, "logits/chosen": -2.1078646183013916, "logits/rejected": -2.1080751419067383, "logps/chosen": -1.3160344362258911, "logps/rejected": -1.0525221824645996, "loss": 0.6268, "rewards/accuracies": 1.0, "rewards/chosen": 0.9904723167419434, "rewards/margins": 0.1373887062072754, "rewards/rejected": 0.853083610534668, "step": 4931 }, { "epoch": 2.66, "learning_rate": 3.327067962763186e-09, "logits/chosen": -2.215766191482544, "logits/rejected": -2.3338372707366943, "logps/chosen": -0.4476148784160614, "logps/rejected": -0.44967567920684814, "loss": 0.6838, "rewards/accuracies": 1.0, "rewards/chosen": 1.0522840023040771, "rewards/margins": 0.01878201961517334, "rewards/rejected": 1.0335019826889038, "step": 4932 }, { "epoch": 2.66, "learning_rate": 3.3166324815347313e-09, "logits/chosen": -2.0622293949127197, "logits/rejected": -2.269947052001953, "logps/chosen": -1.0724042654037476, "logps/rejected": -1.1501246690750122, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 1.071374773979187, "rewards/margins": 0.022281885147094727, "rewards/rejected": 1.0490928888320923, "step": 4933 }, { "epoch": 2.66, "learning_rate": 3.306212830236893e-09, "logits/chosen": -2.03641939163208, "logits/rejected": -2.030100107192993, "logps/chosen": -5.1580657958984375, "logps/rejected": -2.4132611751556396, "loss": 0.4738, "rewards/accuracies": 1.0, "rewards/chosen": 1.2606773376464844, "rewards/margins": 0.5006988644599915, "rewards/rejected": 0.7599784731864929, "step": 4934 }, { "epoch": 2.66, "learning_rate": 3.2958090124028893e-09, "logits/chosen": -2.0881576538085938, "logits/rejected": -2.347050905227661, "logps/chosen": -7.550474643707275, "logps/rejected": -5.862106800079346, "loss": 0.8546, "rewards/accuracies": 0.0, "rewards/chosen": 0.3712110221385956, "rewards/margins": -0.30040690302848816, "rewards/rejected": 0.6716179251670837, "step": 4935 }, { "epoch": 2.66, "learning_rate": 3.285421031560581e-09, "logits/chosen": -2.1653010845184326, "logits/rejected": -2.2735331058502197, "logps/chosen": -10.8055419921875, "logps/rejected": -3.41235089302063, "loss": 1.1056, "rewards/accuracies": 0.0, "rewards/chosen": 0.06745157390832901, "rewards/margins": -0.7036120295524597, "rewards/rejected": 0.7710636258125305, "step": 4936 }, { "epoch": 2.66, "learning_rate": 3.275048891232424e-09, "logits/chosen": -2.0550484657287598, "logits/rejected": -2.062676429748535, "logps/chosen": -1.1099486351013184, "logps/rejected": -4.531415939331055, "loss": 0.449, "rewards/accuracies": 1.0, "rewards/chosen": 0.8643465042114258, "rewards/margins": 0.5678392648696899, "rewards/rejected": 0.29650720953941345, "step": 4937 }, { "epoch": 2.66, "learning_rate": 3.2646925949355307e-09, "logits/chosen": -2.176516532897949, "logits/rejected": -2.1341090202331543, "logps/chosen": -19.9954776763916, "logps/rejected": -4.9521965980529785, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 1.8365808725357056, "rewards/margins": 1.3654251098632812, "rewards/rejected": 0.4711557924747467, "step": 4938 }, { "epoch": 2.66, "learning_rate": 3.2543521461816436e-09, "logits/chosen": -2.022134780883789, "logits/rejected": -2.022491216659546, "logps/chosen": -0.9437140822410583, "logps/rejected": -3.10920786857605, "loss": 0.4974, "rewards/accuracies": 1.0, "rewards/chosen": 1.1335828304290771, "rewards/margins": 0.439253568649292, "rewards/rejected": 0.6943292617797852, "step": 4939 }, { "epoch": 2.66, "learning_rate": 3.2440275484770964e-09, "logits/chosen": -1.9999432563781738, "logits/rejected": -2.2436623573303223, "logps/chosen": -0.25099363923072815, "logps/rejected": -0.2876695394515991, "loss": 0.6929, "rewards/accuracies": 1.0, "rewards/chosen": 0.8631866574287415, "rewards/margins": 0.0005714297294616699, "rewards/rejected": 0.8626152276992798, "step": 4940 }, { "epoch": 2.67, "learning_rate": 3.233718805322888e-09, "logits/chosen": -2.0894196033477783, "logits/rejected": -2.1039130687713623, "logps/chosen": -6.958469390869141, "logps/rejected": -5.130242347717285, "loss": 0.3847, "rewards/accuracies": 1.0, "rewards/chosen": 1.663683533668518, "rewards/margins": 0.7569226622581482, "rewards/rejected": 0.9067608714103699, "step": 4941 }, { "epoch": 2.67, "learning_rate": 3.223425920214623e-09, "logits/chosen": -2.0741260051727295, "logits/rejected": -2.3027076721191406, "logps/chosen": -0.5657004714012146, "logps/rejected": -0.5826978087425232, "loss": 0.6858, "rewards/accuracies": 1.0, "rewards/chosen": 0.7915668487548828, "rewards/margins": 0.014679312705993652, "rewards/rejected": 0.7768875360488892, "step": 4942 }, { "epoch": 2.67, "learning_rate": 3.2131488966425313e-09, "logits/chosen": -2.1720993518829346, "logits/rejected": -2.169922351837158, "logps/chosen": -3.520113706588745, "logps/rejected": -4.022839069366455, "loss": 0.3032, "rewards/accuracies": 1.0, "rewards/chosen": 1.540398120880127, "rewards/margins": 1.0379600524902344, "rewards/rejected": 0.5024381279945374, "step": 4943 }, { "epoch": 2.67, "learning_rate": 3.20288773809147e-09, "logits/chosen": -2.1010799407958984, "logits/rejected": -2.0518651008605957, "logps/chosen": -8.798426628112793, "logps/rejected": -6.073486328125, "loss": 0.3831, "rewards/accuracies": 1.0, "rewards/chosen": 1.556026816368103, "rewards/margins": 0.7617156505584717, "rewards/rejected": 0.7943111658096313, "step": 4944 }, { "epoch": 2.67, "learning_rate": 3.192642448040894e-09, "logits/chosen": -2.048218250274658, "logits/rejected": -2.038386821746826, "logps/chosen": -3.3412389755249023, "logps/rejected": -2.0354251861572266, "loss": 0.3637, "rewards/accuracies": 1.0, "rewards/chosen": 1.6766510009765625, "rewards/margins": 0.8239268064498901, "rewards/rejected": 0.8527241945266724, "step": 4945 }, { "epoch": 2.67, "learning_rate": 3.182413029964909e-09, "logits/chosen": -2.117326498031616, "logits/rejected": -2.315502882003784, "logps/chosen": -3.4800450801849365, "logps/rejected": -1.4148019552230835, "loss": 0.7014, "rewards/accuracies": 0.0, "rewards/chosen": 1.0965179204940796, "rewards/margins": -0.016475200653076172, "rewards/rejected": 1.1129931211471558, "step": 4946 }, { "epoch": 2.67, "learning_rate": 3.1721994873322055e-09, "logits/chosen": -1.9955735206604004, "logits/rejected": -2.2870607376098633, "logps/chosen": -0.15713752806186676, "logps/rejected": -0.1646619290113449, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.9079678654670715, "rewards/margins": -0.0002576112747192383, "rewards/rejected": 0.9082254767417908, "step": 4947 }, { "epoch": 2.67, "learning_rate": 3.162001823606125e-09, "logits/chosen": -2.1515822410583496, "logits/rejected": -2.158461332321167, "logps/chosen": -5.2544403076171875, "logps/rejected": -2.8087782859802246, "loss": 0.7018, "rewards/accuracies": 0.0, "rewards/chosen": 1.2169288396835327, "rewards/margins": -0.017142534255981445, "rewards/rejected": 1.2340713739395142, "step": 4948 }, { "epoch": 2.67, "learning_rate": 3.1518200422446073e-09, "logits/chosen": -2.047954797744751, "logits/rejected": -2.3166327476501465, "logps/chosen": -1.9172160625457764, "logps/rejected": -1.831703782081604, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 0.7145341038703918, "rewards/margins": 0.011262059211730957, "rewards/rejected": 0.7032720446586609, "step": 4949 }, { "epoch": 2.67, "learning_rate": 3.1416541467001957e-09, "logits/chosen": -2.033060312271118, "logits/rejected": -2.2362117767333984, "logps/chosen": -3.6631879806518555, "logps/rejected": -0.6223176717758179, "loss": 0.6233, "rewards/accuracies": 1.0, "rewards/chosen": 1.1466337442398071, "rewards/margins": 0.14486312866210938, "rewards/rejected": 1.0017706155776978, "step": 4950 }, { "epoch": 2.67, "learning_rate": 3.131504140420066e-09, "logits/chosen": -2.076221466064453, "logits/rejected": -2.0720653533935547, "logps/chosen": -3.5238871574401855, "logps/rejected": -3.135284900665283, "loss": 0.5638, "rewards/accuracies": 1.0, "rewards/chosen": 1.0949385166168213, "rewards/margins": 0.27790015935897827, "rewards/rejected": 0.817038357257843, "step": 4951 }, { "epoch": 2.67, "learning_rate": 3.121370026845993e-09, "logits/chosen": -1.996533751487732, "logits/rejected": -2.006448984146118, "logps/chosen": -1.1847409009933472, "logps/rejected": -5.604383945465088, "loss": 0.3335, "rewards/accuracies": 1.0, "rewards/chosen": 1.5232452154159546, "rewards/margins": 0.9268800616264343, "rewards/rejected": 0.5963651537895203, "step": 4952 }, { "epoch": 2.67, "learning_rate": 3.111251809414378e-09, "logits/chosen": -2.080268383026123, "logits/rejected": -2.2749781608581543, "logps/chosen": -2.651906967163086, "logps/rejected": -3.315563201904297, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.7934894561767578, "rewards/margins": 0.030488967895507812, "rewards/rejected": 0.76300048828125, "step": 4953 }, { "epoch": 2.67, "learning_rate": 3.101149491556221e-09, "logits/chosen": -2.092987537384033, "logits/rejected": -2.3434934616088867, "logps/chosen": -7.647394180297852, "logps/rejected": -14.701525688171387, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9128298163414001, "rewards/margins": 0.03220587968826294, "rewards/rejected": 0.8806239366531372, "step": 4954 }, { "epoch": 2.67, "learning_rate": 3.0910630766971203e-09, "logits/chosen": -2.013495922088623, "logits/rejected": -2.3185694217681885, "logps/chosen": -0.3885175287723541, "logps/rejected": -0.4100491404533386, "loss": 0.6907, "rewards/accuracies": 1.0, "rewards/chosen": 1.0228480100631714, "rewards/margins": 0.004839181900024414, "rewards/rejected": 1.018008828163147, "step": 4955 }, { "epoch": 2.67, "learning_rate": 3.080992568257307e-09, "logits/chosen": -2.172822952270508, "logits/rejected": -2.3049209117889404, "logps/chosen": -4.921518325805664, "logps/rejected": -2.3107078075408936, "loss": 0.7291, "rewards/accuracies": 0.0, "rewards/chosen": 0.8037352561950684, "rewards/margins": -0.07074511051177979, "rewards/rejected": 0.8744803667068481, "step": 4956 }, { "epoch": 2.67, "learning_rate": 3.0709379696515882e-09, "logits/chosen": -2.1472954750061035, "logits/rejected": -2.2667860984802246, "logps/chosen": -0.4752509891986847, "logps/rejected": -0.4780036509037018, "loss": 0.6888, "rewards/accuracies": 1.0, "rewards/chosen": 0.78363037109375, "rewards/margins": 0.008677661418914795, "rewards/rejected": 0.7749527096748352, "step": 4957 }, { "epoch": 2.67, "learning_rate": 3.060899284289403e-09, "logits/chosen": -2.0909745693206787, "logits/rejected": -2.108366012573242, "logps/chosen": -1.62094247341156, "logps/rejected": -7.44163703918457, "loss": 0.5091, "rewards/accuracies": 1.0, "rewards/chosen": 1.1268469095230103, "rewards/margins": 0.4097215533256531, "rewards/rejected": 0.7171253561973572, "step": 4958 }, { "epoch": 2.67, "learning_rate": 3.050876515574785e-09, "logits/chosen": -1.9925806522369385, "logits/rejected": -1.9886200428009033, "logps/chosen": -3.102675676345825, "logps/rejected": -7.914125442504883, "loss": 0.4854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9804688692092896, "rewards/margins": 0.4703133702278137, "rewards/rejected": 0.5101554989814758, "step": 4959 }, { "epoch": 2.68, "learning_rate": 3.040869666906365e-09, "logits/chosen": -2.233163833618164, "logits/rejected": -2.2513339519500732, "logps/chosen": -2.4400951862335205, "logps/rejected": -8.610631942749023, "loss": 0.3349, "rewards/accuracies": 1.0, "rewards/chosen": 1.416657567024231, "rewards/margins": 0.9216939210891724, "rewards/rejected": 0.4949636459350586, "step": 4960 }, { "epoch": 2.68, "learning_rate": 3.0308787416773783e-09, "logits/chosen": -2.1314239501953125, "logits/rejected": -2.1085002422332764, "logps/chosen": -14.230729103088379, "logps/rejected": -1.1628838777542114, "loss": 0.3743, "rewards/accuracies": 1.0, "rewards/chosen": 1.769913911819458, "rewards/margins": 0.7895632982254028, "rewards/rejected": 0.9803506135940552, "step": 4961 }, { "epoch": 2.68, "learning_rate": 3.0209037432756657e-09, "logits/chosen": -2.0787038803100586, "logits/rejected": -2.083348274230957, "logps/chosen": -2.802429437637329, "logps/rejected": -5.785713195800781, "loss": 0.4422, "rewards/accuracies": 1.0, "rewards/chosen": 1.010797142982483, "rewards/margins": 0.5867675542831421, "rewards/rejected": 0.42402955889701843, "step": 4962 }, { "epoch": 2.68, "learning_rate": 3.0109446750836596e-09, "logits/chosen": -2.2398488521575928, "logits/rejected": -2.2365028858184814, "logps/chosen": -7.745177268981934, "logps/rejected": -6.100645065307617, "loss": 0.2692, "rewards/accuracies": 1.0, "rewards/chosen": 1.4376065731048584, "rewards/margins": 1.174501895904541, "rewards/rejected": 0.26310473680496216, "step": 4963 }, { "epoch": 2.68, "learning_rate": 3.0010015404784095e-09, "logits/chosen": -2.224996566772461, "logits/rejected": -2.1010630130767822, "logps/chosen": -44.945281982421875, "logps/rejected": -10.009284019470215, "loss": 0.2308, "rewards/accuracies": 1.0, "rewards/chosen": 2.2990524768829346, "rewards/margins": 1.3484985828399658, "rewards/rejected": 0.950553834438324, "step": 4964 }, { "epoch": 2.68, "learning_rate": 2.991074342831529e-09, "logits/chosen": -1.990074872970581, "logits/rejected": -2.316498279571533, "logps/chosen": -0.49456465244293213, "logps/rejected": -0.4055004119873047, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.895427405834198, "rewards/margins": -0.0030823945999145508, "rewards/rejected": 0.8985098004341125, "step": 4965 }, { "epoch": 2.68, "learning_rate": 2.981163085509253e-09, "logits/chosen": -2.1036412715911865, "logits/rejected": -2.2482147216796875, "logps/chosen": -2.283191680908203, "logps/rejected": -6.996579170227051, "loss": 0.5663, "rewards/accuracies": 1.0, "rewards/chosen": 0.7108368277549744, "rewards/margins": 0.2722073793411255, "rewards/rejected": 0.4386294484138489, "step": 4966 }, { "epoch": 2.68, "learning_rate": 2.9712677718724098e-09, "logits/chosen": -2.1059365272521973, "logits/rejected": -2.140031099319458, "logps/chosen": -4.09580659866333, "logps/rejected": -14.345136642456055, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.2360143661499023, "rewards/margins": 0.6038131713867188, "rewards/rejected": 0.6322011947631836, "step": 4967 }, { "epoch": 2.68, "learning_rate": 2.961388405276399e-09, "logits/chosen": -2.1263837814331055, "logits/rejected": -2.1279051303863525, "logps/chosen": -0.6574109792709351, "logps/rejected": -3.6467273235321045, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 1.1353856325149536, "rewards/margins": 0.4233049750328064, "rewards/rejected": 0.7120806574821472, "step": 4968 }, { "epoch": 2.68, "learning_rate": 2.9515249890712522e-09, "logits/chosen": -2.129072666168213, "logits/rejected": -2.289212942123413, "logps/chosen": -1.1476237773895264, "logps/rejected": -1.12781822681427, "loss": 0.6846, "rewards/accuracies": 1.0, "rewards/chosen": 0.8867104649543762, "rewards/margins": 0.017238497734069824, "rewards/rejected": 0.8694719672203064, "step": 4969 }, { "epoch": 2.68, "learning_rate": 2.9416775266015492e-09, "logits/chosen": -2.15228009223938, "logits/rejected": -2.286381721496582, "logps/chosen": -5.099102973937988, "logps/rejected": -4.978926181793213, "loss": 0.6509, "rewards/accuracies": 1.0, "rewards/chosen": 0.6649433374404907, "rewards/margins": 0.08640950918197632, "rewards/rejected": 0.5785338282585144, "step": 4970 }, { "epoch": 2.68, "learning_rate": 2.931846021206491e-09, "logits/chosen": -2.129483699798584, "logits/rejected": -2.1356725692749023, "logps/chosen": -3.0814385414123535, "logps/rejected": -4.408888816833496, "loss": 0.3769, "rewards/accuracies": 1.0, "rewards/chosen": 1.2335485219955444, "rewards/margins": 0.7813172936439514, "rewards/rejected": 0.452231228351593, "step": 4971 }, { "epoch": 2.68, "learning_rate": 2.92203047621985e-09, "logits/chosen": -2.0267107486724854, "logits/rejected": -2.030372142791748, "logps/chosen": -4.962599754333496, "logps/rejected": -1.929719090461731, "loss": 0.589, "rewards/accuracies": 1.0, "rewards/chosen": 1.0066386461257935, "rewards/margins": 0.22040224075317383, "rewards/rejected": 0.7862364053726196, "step": 4972 }, { "epoch": 2.68, "learning_rate": 2.9122308949700026e-09, "logits/chosen": -2.112999200820923, "logits/rejected": -2.1219441890716553, "logps/chosen": -4.961635112762451, "logps/rejected": -2.0857772827148438, "loss": 0.2033, "rewards/accuracies": 1.0, "rewards/chosen": 2.1416728496551514, "rewards/margins": 1.4895336627960205, "rewards/rejected": 0.6521391868591309, "step": 4973 }, { "epoch": 2.68, "learning_rate": 2.9024472807799018e-09, "logits/chosen": -2.159810781478882, "logits/rejected": -2.336331605911255, "logps/chosen": -2.1529152393341064, "logps/rejected": -2.001685380935669, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.5979862213134766, "rewards/margins": -0.0031742453575134277, "rewards/rejected": 0.60116046667099, "step": 4974 }, { "epoch": 2.68, "learning_rate": 2.892679636967077e-09, "logits/chosen": -2.100396156311035, "logits/rejected": -2.098900079727173, "logps/chosen": -0.9932119250297546, "logps/rejected": -4.30537223815918, "loss": 0.481, "rewards/accuracies": 1.0, "rewards/chosen": 1.0757629871368408, "rewards/margins": 0.48177772760391235, "rewards/rejected": 0.5939852595329285, "step": 4975 }, { "epoch": 2.68, "learning_rate": 2.8829279668436624e-09, "logits/chosen": -2.0183651447296143, "logits/rejected": -2.2743656635284424, "logps/chosen": -10.502750396728516, "logps/rejected": -12.060250282287598, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 1.077979326248169, "rewards/margins": -0.005279660224914551, "rewards/rejected": 1.0832589864730835, "step": 4976 }, { "epoch": 2.68, "learning_rate": 2.8731922737163683e-09, "logits/chosen": -2.1193010807037354, "logits/rejected": -2.1531996726989746, "logps/chosen": -2.1618833541870117, "logps/rejected": -6.133262634277344, "loss": 0.6304, "rewards/accuracies": 1.0, "rewards/chosen": 1.071364164352417, "rewards/margins": 0.12964260578155518, "rewards/rejected": 0.9417215585708618, "step": 4977 }, { "epoch": 2.69, "learning_rate": 2.8634725608864763e-09, "logits/chosen": -2.0378732681274414, "logits/rejected": -2.318852186203003, "logps/chosen": -11.830975532531738, "logps/rejected": -7.400092124938965, "loss": 0.6464, "rewards/accuracies": 1.0, "rewards/chosen": 0.8989737629890442, "rewards/margins": 0.09582358598709106, "rewards/rejected": 0.8031501770019531, "step": 4978 }, { "epoch": 2.69, "learning_rate": 2.8537688316498664e-09, "logits/chosen": -2.156388521194458, "logits/rejected": -2.149698495864868, "logps/chosen": -4.721732139587402, "logps/rejected": -4.0039873123168945, "loss": 0.6532, "rewards/accuracies": 1.0, "rewards/chosen": 0.7552489638328552, "rewards/margins": 0.08153945207595825, "rewards/rejected": 0.673709511756897, "step": 4979 }, { "epoch": 2.69, "learning_rate": 2.8440810892969793e-09, "logits/chosen": -1.9569964408874512, "logits/rejected": -2.2538797855377197, "logps/chosen": -3.235539436340332, "logps/rejected": -3.7814464569091797, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.7090164422988892, "rewards/margins": 0.022738933563232422, "rewards/rejected": 0.6862775087356567, "step": 4980 }, { "epoch": 2.69, "learning_rate": 2.8344093371128418e-09, "logits/chosen": -2.1470630168914795, "logits/rejected": -2.064286947250366, "logps/chosen": -23.85352325439453, "logps/rejected": -4.0177693367004395, "loss": 0.4361, "rewards/accuracies": 1.0, "rewards/chosen": 1.4801361560821533, "rewards/margins": 0.6040329337120056, "rewards/rejected": 0.8761032223701477, "step": 4981 }, { "epoch": 2.69, "learning_rate": 2.824753578377087e-09, "logits/chosen": -2.072145462036133, "logits/rejected": -2.066406011581421, "logps/chosen": -2.53075909614563, "logps/rejected": -6.029646873474121, "loss": 0.4421, "rewards/accuracies": 1.0, "rewards/chosen": 1.2936815023422241, "rewards/margins": 0.587042510509491, "rewards/rejected": 0.7066389918327332, "step": 4982 }, { "epoch": 2.69, "learning_rate": 2.815113816363879e-09, "logits/chosen": -2.081430673599243, "logits/rejected": -2.271130323410034, "logps/chosen": -0.3700551986694336, "logps/rejected": -0.4045291841030121, "loss": 0.6825, "rewards/accuracies": 1.0, "rewards/chosen": 0.9232640266418457, "rewards/margins": 0.021485328674316406, "rewards/rejected": 0.9017786979675293, "step": 4983 }, { "epoch": 2.69, "learning_rate": 2.8054900543419914e-09, "logits/chosen": -2.132507801055908, "logits/rejected": -2.1333060264587402, "logps/chosen": -2.598634958267212, "logps/rejected": -1.271075963973999, "loss": 0.5148, "rewards/accuracies": 1.0, "rewards/chosen": 1.0828403234481812, "rewards/margins": 0.39548593759536743, "rewards/rejected": 0.6873543858528137, "step": 4984 }, { "epoch": 2.69, "learning_rate": 2.7958822955747475e-09, "logits/chosen": -2.0938122272491455, "logits/rejected": -2.1052939891815186, "logps/chosen": -1.486144781112671, "logps/rejected": -5.0774827003479, "loss": 0.6083, "rewards/accuracies": 1.0, "rewards/chosen": 1.1304157972335815, "rewards/margins": 0.17748093605041504, "rewards/rejected": 0.9529348611831665, "step": 4985 }, { "epoch": 2.69, "learning_rate": 2.7862905433200633e-09, "logits/chosen": -2.134516477584839, "logits/rejected": -2.27762770652771, "logps/chosen": -1.2027710676193237, "logps/rejected": -0.9210481643676758, "loss": 0.6806, "rewards/accuracies": 1.0, "rewards/chosen": 1.0666135549545288, "rewards/margins": 0.025168418884277344, "rewards/rejected": 1.0414451360702515, "step": 4986 }, { "epoch": 2.69, "learning_rate": 2.776714800830421e-09, "logits/chosen": -2.1464343070983887, "logits/rejected": -2.2133617401123047, "logps/chosen": -7.063275337219238, "logps/rejected": -5.8434038162231445, "loss": 0.7065, "rewards/accuracies": 0.0, "rewards/chosen": 0.734132707118988, "rewards/margins": -0.02654951810836792, "rewards/rejected": 0.760682225227356, "step": 4987 }, { "epoch": 2.69, "learning_rate": 2.7671550713528667e-09, "logits/chosen": -2.084961175918579, "logits/rejected": -2.253293037414551, "logps/chosen": -1.379743218421936, "logps/rejected": -1.3241569995880127, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 1.0215314626693726, "rewards/margins": 0.030038058757781982, "rewards/rejected": 0.9914934039115906, "step": 4988 }, { "epoch": 2.69, "learning_rate": 2.7576113581290252e-09, "logits/chosen": -2.169084310531616, "logits/rejected": -2.0950968265533447, "logps/chosen": -28.640380859375, "logps/rejected": -3.4576897621154785, "loss": 0.3039, "rewards/accuracies": 1.0, "rewards/chosen": 1.6810768842697144, "rewards/margins": 1.035330891609192, "rewards/rejected": 0.6457459926605225, "step": 4989 }, { "epoch": 2.69, "learning_rate": 2.7480836643950956e-09, "logits/chosen": -2.0748965740203857, "logits/rejected": -2.1031265258789062, "logps/chosen": -1.8829048871994019, "logps/rejected": -8.623912811279297, "loss": 0.2996, "rewards/accuracies": 1.0, "rewards/chosen": 1.6928890943527222, "rewards/margins": 1.051745891571045, "rewards/rejected": 0.641143262386322, "step": 4990 }, { "epoch": 2.69, "learning_rate": 2.738571993381822e-09, "logits/chosen": -2.083625555038452, "logits/rejected": -2.074059009552002, "logps/chosen": -12.885643005371094, "logps/rejected": -6.119363784790039, "loss": 0.4885, "rewards/accuracies": 1.0, "rewards/chosen": 1.124671220779419, "rewards/margins": 0.46211761236190796, "rewards/rejected": 0.662553608417511, "step": 4991 }, { "epoch": 2.69, "learning_rate": 2.7290763483145462e-09, "logits/chosen": -1.984928846359253, "logits/rejected": -1.9867908954620361, "logps/chosen": -1.9606982469558716, "logps/rejected": -3.6995372772216797, "loss": 0.4935, "rewards/accuracies": 1.0, "rewards/chosen": 0.9521655440330505, "rewards/margins": 0.44927525520324707, "rewards/rejected": 0.5028902888298035, "step": 4992 }, { "epoch": 2.69, "learning_rate": 2.719596732413154e-09, "logits/chosen": -2.084043025970459, "logits/rejected": -2.0563371181488037, "logps/chosen": -5.317456245422363, "logps/rejected": -3.912266969680786, "loss": 0.4381, "rewards/accuracies": 1.0, "rewards/chosen": 1.2666877508163452, "rewards/margins": 0.5983391404151917, "rewards/rejected": 0.6683486104011536, "step": 4993 }, { "epoch": 2.69, "learning_rate": 2.7101331488921018e-09, "logits/chosen": -2.0701308250427246, "logits/rejected": -2.0582354068756104, "logps/chosen": -12.700433731079102, "logps/rejected": -3.73516583442688, "loss": 0.3316, "rewards/accuracies": 1.0, "rewards/chosen": 1.419600486755371, "rewards/margins": 0.933548629283905, "rewards/rejected": 0.48605185747146606, "step": 4994 }, { "epoch": 2.69, "learning_rate": 2.7006856009604116e-09, "logits/chosen": -2.1170167922973633, "logits/rejected": -2.3286428451538086, "logps/chosen": -0.9211276769638062, "logps/rejected": -17.133142471313477, "loss": 0.5465, "rewards/accuracies": 1.0, "rewards/chosen": 1.2265976667404175, "rewards/margins": 0.3184899091720581, "rewards/rejected": 0.9081077575683594, "step": 4995 }, { "epoch": 2.69, "learning_rate": 2.6912540918216598e-09, "logits/chosen": -2.115699291229248, "logits/rejected": -2.1114394664764404, "logps/chosen": -2.7187161445617676, "logps/rejected": -2.4738059043884277, "loss": 0.3231, "rewards/accuracies": 1.0, "rewards/chosen": 1.5789402723312378, "rewards/margins": 0.9639655351638794, "rewards/rejected": 0.6149747371673584, "step": 4996 }, { "epoch": 2.7, "learning_rate": 2.681838624673999e-09, "logits/chosen": -1.985674262046814, "logits/rejected": -2.2708115577697754, "logps/chosen": -1.0660569667816162, "logps/rejected": -1.0704563856124878, "loss": 0.7025, "rewards/accuracies": 0.0, "rewards/chosen": 0.8415411114692688, "rewards/margins": -0.018558025360107422, "rewards/rejected": 0.8600991368293762, "step": 4997 }, { "epoch": 2.7, "learning_rate": 2.672439202710125e-09, "logits/chosen": -2.0233123302459717, "logits/rejected": -2.0297906398773193, "logps/chosen": -1.6998224258422852, "logps/rejected": -5.268192291259766, "loss": 0.4059, "rewards/accuracies": 1.0, "rewards/chosen": 0.9999067187309265, "rewards/margins": 0.6917062997817993, "rewards/rejected": 0.3082004487514496, "step": 4998 }, { "epoch": 2.7, "learning_rate": 2.6630558291173056e-09, "logits/chosen": -2.055488348007202, "logits/rejected": -2.0548551082611084, "logps/chosen": -2.8656768798828125, "logps/rejected": -5.467597961425781, "loss": 0.2529, "rewards/accuracies": 1.0, "rewards/chosen": 1.587971568107605, "rewards/margins": 1.2455620765686035, "rewards/rejected": 0.34240952134132385, "step": 4999 }, { "epoch": 2.7, "learning_rate": 2.6536885070773727e-09, "logits/chosen": -2.033217191696167, "logits/rejected": -2.0356180667877197, "logps/chosen": -0.1766945868730545, "logps/rejected": -6.103339672088623, "loss": 0.4848, "rewards/accuracies": 1.0, "rewards/chosen": 0.7667400240898132, "rewards/margins": 0.4718078076839447, "rewards/rejected": 0.29493221640586853, "step": 5000 }, { "epoch": 2.7, "learning_rate": 2.6443372397666806e-09, "logits/chosen": -2.0231359004974365, "logits/rejected": -2.025192975997925, "logps/chosen": -1.3190714120864868, "logps/rejected": -3.2665045261383057, "loss": 0.5101, "rewards/accuracies": 1.0, "rewards/chosen": 0.9473909735679626, "rewards/margins": 0.40732747316360474, "rewards/rejected": 0.5400635004043579, "step": 5001 }, { "epoch": 2.7, "learning_rate": 2.6350020303561925e-09, "logits/chosen": -2.189723253250122, "logits/rejected": -2.086345911026001, "logps/chosen": -25.818565368652344, "logps/rejected": -3.600480556488037, "loss": 0.1617, "rewards/accuracies": 1.0, "rewards/chosen": 2.1994597911834717, "rewards/margins": 1.7401347160339355, "rewards/rejected": 0.45932507514953613, "step": 5002 }, { "epoch": 2.7, "learning_rate": 2.6256828820113765e-09, "logits/chosen": -2.086773633956909, "logits/rejected": -2.0778470039367676, "logps/chosen": -7.471038818359375, "logps/rejected": -6.249882698059082, "loss": 0.2722, "rewards/accuracies": 1.0, "rewards/chosen": 1.6256517171859741, "rewards/margins": 1.1620056629180908, "rewards/rejected": 0.4636460244655609, "step": 5003 }, { "epoch": 2.7, "learning_rate": 2.616379797892293e-09, "logits/chosen": -2.237020254135132, "logits/rejected": -2.1015350818634033, "logps/chosen": -31.999553680419922, "logps/rejected": -1.1735621690750122, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": 3.0457515716552734, "rewards/margins": 2.3134829998016357, "rewards/rejected": 0.7322686314582825, "step": 5004 }, { "epoch": 2.7, "learning_rate": 2.6070927811535194e-09, "logits/chosen": -2.014011859893799, "logits/rejected": -2.0280396938323975, "logps/chosen": -27.99850082397461, "logps/rejected": -20.086633682250977, "loss": 0.4762, "rewards/accuracies": 1.0, "rewards/chosen": 0.9916656613349915, "rewards/margins": 0.494467169046402, "rewards/rejected": 0.4971984922885895, "step": 5005 }, { "epoch": 2.7, "learning_rate": 2.5978218349442137e-09, "logits/chosen": -2.0346758365631104, "logits/rejected": -2.030979633331299, "logps/chosen": -3.568108558654785, "logps/rejected": -3.1730644702911377, "loss": 0.331, "rewards/accuracies": 1.0, "rewards/chosen": 1.6217224597930908, "rewards/margins": 0.9357460141181946, "rewards/rejected": 0.6859764456748962, "step": 5006 }, { "epoch": 2.7, "learning_rate": 2.588566962408084e-09, "logits/chosen": -1.9431017637252808, "logits/rejected": -2.2380545139312744, "logps/chosen": -0.7416282892227173, "logps/rejected": -0.7367737889289856, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.8194207549095154, "rewards/margins": 0.015338361263275146, "rewards/rejected": 0.8040823936462402, "step": 5007 }, { "epoch": 2.7, "learning_rate": 2.579328166683359e-09, "logits/chosen": -2.0741543769836426, "logits/rejected": -2.3188323974609375, "logps/chosen": -1.0442352294921875, "logps/rejected": -4.432013511657715, "loss": 0.5705, "rewards/accuracies": 1.0, "rewards/chosen": 0.843300998210907, "rewards/margins": 0.2623702883720398, "rewards/rejected": 0.5809307098388672, "step": 5008 }, { "epoch": 2.7, "learning_rate": 2.5701054509028487e-09, "logits/chosen": -2.1709444522857666, "logits/rejected": -2.1845273971557617, "logps/chosen": -10.825362205505371, "logps/rejected": -10.791276931762695, "loss": 0.2743, "rewards/accuracies": 1.0, "rewards/chosen": 1.8918335437774658, "rewards/margins": 1.1533581018447876, "rewards/rejected": 0.7384754419326782, "step": 5009 }, { "epoch": 2.7, "learning_rate": 2.5608988181938907e-09, "logits/chosen": -2.0630037784576416, "logits/rejected": -2.0692408084869385, "logps/chosen": -3.244311571121216, "logps/rejected": -4.744797229766846, "loss": 0.3683, "rewards/accuracies": 1.0, "rewards/chosen": 1.3294366598129272, "rewards/margins": 0.8089110255241394, "rewards/rejected": 0.5205256342887878, "step": 5010 }, { "epoch": 2.7, "learning_rate": 2.5517082716783768e-09, "logits/chosen": -2.1487619876861572, "logits/rejected": -2.295867443084717, "logps/chosen": -3.204540252685547, "logps/rejected": -0.4499623775482178, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 1.0041478872299194, "rewards/margins": 0.0254480242729187, "rewards/rejected": 0.9786998629570007, "step": 5011 }, { "epoch": 2.7, "learning_rate": 2.5425338144727526e-09, "logits/chosen": -2.0433669090270996, "logits/rejected": -2.045729398727417, "logps/chosen": -1.965463638305664, "logps/rejected": -0.3576533794403076, "loss": 0.6471, "rewards/accuracies": 1.0, "rewards/chosen": 0.8411033749580383, "rewards/margins": 0.09424620866775513, "rewards/rejected": 0.7468571662902832, "step": 5012 }, { "epoch": 2.7, "learning_rate": 2.53337544968798e-09, "logits/chosen": -2.155799150466919, "logits/rejected": -2.119462013244629, "logps/chosen": -20.805217742919922, "logps/rejected": -3.475402355194092, "loss": 0.214, "rewards/accuracies": 1.0, "rewards/chosen": 2.0123043060302734, "rewards/margins": 1.432981014251709, "rewards/rejected": 0.5793232321739197, "step": 5013 }, { "epoch": 2.7, "learning_rate": 2.524233180429597e-09, "logits/chosen": -2.0145456790924072, "logits/rejected": -2.2482798099517822, "logps/chosen": -0.44283396005630493, "logps/rejected": -5.626060485839844, "loss": 0.5977, "rewards/accuracies": 1.0, "rewards/chosen": 0.8230016827583313, "rewards/margins": 0.20107614994049072, "rewards/rejected": 0.6219255328178406, "step": 5014 }, { "epoch": 2.7, "learning_rate": 2.515107009797668e-09, "logits/chosen": -2.0547826290130615, "logits/rejected": -2.253756046295166, "logps/chosen": -1.0164246559143066, "logps/rejected": -0.928773820400238, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.8842481970787048, "rewards/margins": 0.0024477243423461914, "rewards/rejected": 0.8818004727363586, "step": 5015 }, { "epoch": 2.71, "learning_rate": 2.505996940886784e-09, "logits/chosen": -2.2150373458862305, "logits/rejected": -2.217911958694458, "logps/chosen": -0.4639802873134613, "logps/rejected": -3.6781563758850098, "loss": 0.4572, "rewards/accuracies": 1.0, "rewards/chosen": 0.9917399287223816, "rewards/margins": 0.5452069640159607, "rewards/rejected": 0.4465329647064209, "step": 5016 }, { "epoch": 2.71, "learning_rate": 2.4969029767861138e-09, "logits/chosen": -2.036560297012329, "logits/rejected": -2.234750270843506, "logps/chosen": -3.025620460510254, "logps/rejected": -3.5263407230377197, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 0.7593761682510376, "rewards/margins": 0.051145195960998535, "rewards/rejected": 0.7082309722900391, "step": 5017 }, { "epoch": 2.71, "learning_rate": 2.4878251205793232e-09, "logits/chosen": -2.1041860580444336, "logits/rejected": -2.1024367809295654, "logps/chosen": -1.7150605916976929, "logps/rejected": -8.836762428283691, "loss": 0.3832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9511575102806091, "rewards/margins": 0.7614799737930298, "rewards/rejected": 0.18967752158641815, "step": 5018 }, { "epoch": 2.71, "learning_rate": 2.4787633753446445e-09, "logits/chosen": -2.0059757232666016, "logits/rejected": -2.2463080883026123, "logps/chosen": -2.096261501312256, "logps/rejected": -2.1387388706207275, "loss": 0.681, "rewards/accuracies": 1.0, "rewards/chosen": 1.2621698379516602, "rewards/margins": 0.024453282356262207, "rewards/rejected": 1.237716555595398, "step": 5019 }, { "epoch": 2.71, "learning_rate": 2.469717744154842e-09, "logits/chosen": -2.064507007598877, "logits/rejected": -2.0707666873931885, "logps/chosen": -3.609945297241211, "logps/rejected": -5.949834823608398, "loss": 0.387, "rewards/accuracies": 1.0, "rewards/chosen": 1.634500503540039, "rewards/margins": 0.7496129870414734, "rewards/rejected": 0.8848875164985657, "step": 5020 }, { "epoch": 2.71, "learning_rate": 2.4606882300772015e-09, "logits/chosen": -2.074643850326538, "logits/rejected": -2.2504851818084717, "logps/chosen": -7.71835994720459, "logps/rejected": -1.140830159187317, "loss": 0.9264, "rewards/accuracies": 0.0, "rewards/chosen": 0.6831240057945251, "rewards/margins": -0.4222450852394104, "rewards/rejected": 1.1053690910339355, "step": 5021 }, { "epoch": 2.71, "learning_rate": 2.4516748361735673e-09, "logits/chosen": -2.046644449234009, "logits/rejected": -2.03570556640625, "logps/chosen": -4.987143516540527, "logps/rejected": -2.706332206726074, "loss": 0.5545, "rewards/accuracies": 1.0, "rewards/chosen": 0.8561728596687317, "rewards/margins": 0.29956674575805664, "rewards/rejected": 0.556606113910675, "step": 5022 }, { "epoch": 2.71, "learning_rate": 2.4426775655002895e-09, "logits/chosen": -2.018240213394165, "logits/rejected": -2.2921195030212402, "logps/chosen": -5.583100318908691, "logps/rejected": -1.876539707183838, "loss": 0.7567, "rewards/accuracies": 0.0, "rewards/chosen": 0.8874874114990234, "rewards/margins": -0.12337052822113037, "rewards/rejected": 1.0108579397201538, "step": 5023 }, { "epoch": 2.71, "learning_rate": 2.433696421108272e-09, "logits/chosen": -1.9427874088287354, "logits/rejected": -1.9229482412338257, "logps/chosen": -7.845019340515137, "logps/rejected": -0.9713444709777832, "loss": 0.5654, "rewards/accuracies": 1.0, "rewards/chosen": 1.359201192855835, "rewards/margins": 0.2741515636444092, "rewards/rejected": 1.0850496292114258, "step": 5024 }, { "epoch": 2.71, "learning_rate": 2.4247314060429512e-09, "logits/chosen": -2.054539680480957, "logits/rejected": -2.0625057220458984, "logps/chosen": -1.058898687362671, "logps/rejected": -3.142023801803589, "loss": 0.5005, "rewards/accuracies": 1.0, "rewards/chosen": 1.0655685663223267, "rewards/margins": 0.43147820234298706, "rewards/rejected": 0.6340903639793396, "step": 5025 }, { "epoch": 2.71, "learning_rate": 2.4157825233442786e-09, "logits/chosen": -2.0184860229492188, "logits/rejected": -2.019047498703003, "logps/chosen": -0.5708521604537964, "logps/rejected": -4.8790717124938965, "loss": 0.4835, "rewards/accuracies": 1.0, "rewards/chosen": 0.831479549407959, "rewards/margins": 0.4753364026546478, "rewards/rejected": 0.35614314675331116, "step": 5026 }, { "epoch": 2.71, "learning_rate": 2.406849776046749e-09, "logits/chosen": -2.142087459564209, "logits/rejected": -2.142256498336792, "logps/chosen": -1.0428454875946045, "logps/rejected": -1.9879066944122314, "loss": 0.5927, "rewards/accuracies": 1.0, "rewards/chosen": 1.066572904586792, "rewards/margins": 0.21202892065048218, "rewards/rejected": 0.8545439839363098, "step": 5027 }, { "epoch": 2.71, "learning_rate": 2.3979331671793725e-09, "logits/chosen": -2.040814161300659, "logits/rejected": -2.3235039710998535, "logps/chosen": -2.101869583129883, "logps/rejected": -2.2760605812072754, "loss": 0.6608, "rewards/accuracies": 1.0, "rewards/chosen": 0.8920686841011047, "rewards/margins": 0.06578028202056885, "rewards/rejected": 0.8262884020805359, "step": 5028 }, { "epoch": 2.71, "learning_rate": 2.3890326997656974e-09, "logits/chosen": -2.0731029510498047, "logits/rejected": -2.3558902740478516, "logps/chosen": -0.7833593487739563, "logps/rejected": -1.0174857378005981, "loss": 0.6703, "rewards/accuracies": 1.0, "rewards/chosen": 0.7810813784599304, "rewards/margins": 0.04625678062438965, "rewards/rejected": 0.7348245978355408, "step": 5029 }, { "epoch": 2.71, "learning_rate": 2.3801483768237986e-09, "logits/chosen": -2.084123134613037, "logits/rejected": -2.107927083969116, "logps/chosen": -3.037184715270996, "logps/rejected": -6.035760402679443, "loss": 0.4732, "rewards/accuracies": 1.0, "rewards/chosen": 1.1906300783157349, "rewards/margins": 0.5023186206817627, "rewards/rejected": 0.6883114576339722, "step": 5030 }, { "epoch": 2.71, "learning_rate": 2.3712802013662713e-09, "logits/chosen": -2.1377644538879395, "logits/rejected": -2.3228952884674072, "logps/chosen": -5.863933563232422, "logps/rejected": -5.618141174316406, "loss": 0.6933, "rewards/accuracies": 0.0, "rewards/chosen": 0.5899639129638672, "rewards/margins": -0.00029718875885009766, "rewards/rejected": 0.5902611017227173, "step": 5031 }, { "epoch": 2.71, "learning_rate": 2.362428176400255e-09, "logits/chosen": -2.047508478164673, "logits/rejected": -2.3006436824798584, "logps/chosen": -1.2355247735977173, "logps/rejected": -1.2231708765029907, "loss": 0.688, "rewards/accuracies": 1.0, "rewards/chosen": 0.9471171498298645, "rewards/margins": 0.01030808687210083, "rewards/rejected": 0.9368090629577637, "step": 5032 }, { "epoch": 2.71, "learning_rate": 2.3535923049273654e-09, "logits/chosen": -2.144294500350952, "logits/rejected": -2.1386067867279053, "logps/chosen": -3.1383440494537354, "logps/rejected": -3.458019256591797, "loss": 0.4015, "rewards/accuracies": 1.0, "rewards/chosen": 1.174411416053772, "rewards/margins": 0.7050496339797974, "rewards/rejected": 0.4693617820739746, "step": 5033 }, { "epoch": 2.72, "learning_rate": 2.344772589943794e-09, "logits/chosen": -2.0061614513397217, "logits/rejected": -2.008488178253174, "logps/chosen": -0.9291483759880066, "logps/rejected": -3.0176262855529785, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.1638555526733398, "rewards/margins": 0.520285427570343, "rewards/rejected": 0.6435701251029968, "step": 5034 }, { "epoch": 2.72, "learning_rate": 2.335969034440227e-09, "logits/chosen": -1.9422937631607056, "logits/rejected": -1.9494071006774902, "logps/chosen": -1.2403541803359985, "logps/rejected": -3.31870698928833, "loss": 0.4902, "rewards/accuracies": 1.0, "rewards/chosen": 1.0450146198272705, "rewards/margins": 0.45788049697875977, "rewards/rejected": 0.5871341228485107, "step": 5035 }, { "epoch": 2.72, "learning_rate": 2.3271816414018653e-09, "logits/chosen": -2.136957883834839, "logits/rejected": -2.1415634155273438, "logps/chosen": -1.5033308267593384, "logps/rejected": -3.8660290241241455, "loss": 0.4659, "rewards/accuracies": 1.0, "rewards/chosen": 0.964072048664093, "rewards/margins": 0.5217927694320679, "rewards/rejected": 0.44227924942970276, "step": 5036 }, { "epoch": 2.72, "learning_rate": 2.3184104138084413e-09, "logits/chosen": -1.9863101243972778, "logits/rejected": -2.2524197101593018, "logps/chosen": -2.5683515071868896, "logps/rejected": -2.514195203781128, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 0.9906629920005798, "rewards/margins": 0.02206319570541382, "rewards/rejected": 0.968599796295166, "step": 5037 }, { "epoch": 2.72, "learning_rate": 2.3096553546342155e-09, "logits/chosen": -2.125352621078491, "logits/rejected": -2.1257448196411133, "logps/chosen": -0.21338897943496704, "logps/rejected": -5.77850341796875, "loss": 0.4394, "rewards/accuracies": 1.0, "rewards/chosen": 0.9284448027610779, "rewards/margins": 0.5945362448692322, "rewards/rejected": 0.3339085578918457, "step": 5038 }, { "epoch": 2.72, "learning_rate": 2.3009164668479407e-09, "logits/chosen": -2.126282215118408, "logits/rejected": -2.1376450061798096, "logps/chosen": -2.722790002822876, "logps/rejected": -11.835186004638672, "loss": 0.5514, "rewards/accuracies": 1.0, "rewards/chosen": 1.2675083875656128, "rewards/margins": 0.3069722056388855, "rewards/rejected": 0.9605361819267273, "step": 5039 }, { "epoch": 2.72, "learning_rate": 2.292193753412902e-09, "logits/chosen": -1.973157286643982, "logits/rejected": -2.2786197662353516, "logps/chosen": -0.3947976529598236, "logps/rejected": -0.31992027163505554, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8823357820510864, "rewards/margins": 0.019511818885803223, "rewards/rejected": 0.8628239631652832, "step": 5040 }, { "epoch": 2.72, "learning_rate": 2.2834872172868947e-09, "logits/chosen": -2.217533588409424, "logits/rejected": -2.2032155990600586, "logps/chosen": -10.495508193969727, "logps/rejected": -6.015336036682129, "loss": 0.5388, "rewards/accuracies": 1.0, "rewards/chosen": 1.2498925924301147, "rewards/margins": 0.33689188957214355, "rewards/rejected": 0.9130007028579712, "step": 5041 }, { "epoch": 2.72, "learning_rate": 2.2747968614222458e-09, "logits/chosen": -2.187580108642578, "logits/rejected": -2.1824703216552734, "logps/chosen": -6.447517395019531, "logps/rejected": -4.910952568054199, "loss": 0.3617, "rewards/accuracies": 1.0, "rewards/chosen": 1.2919323444366455, "rewards/margins": 0.8307962417602539, "rewards/rejected": 0.4611360728740692, "step": 5042 }, { "epoch": 2.72, "learning_rate": 2.2661226887657537e-09, "logits/chosen": -2.000600814819336, "logits/rejected": -2.342161178588867, "logps/chosen": -0.29708361625671387, "logps/rejected": -0.3244394063949585, "loss": 0.6897, "rewards/accuracies": 1.0, "rewards/chosen": 0.8763517737388611, "rewards/margins": 0.006820797920227051, "rewards/rejected": 0.869530975818634, "step": 5043 }, { "epoch": 2.72, "learning_rate": 2.2574647022587768e-09, "logits/chosen": -1.9786121845245361, "logits/rejected": -1.9813085794448853, "logps/chosen": -0.2037641704082489, "logps/rejected": -7.607731819152832, "loss": 0.4473, "rewards/accuracies": 1.0, "rewards/chosen": 0.8325111269950867, "rewards/margins": 0.572564959526062, "rewards/rejected": 0.25994616746902466, "step": 5044 }, { "epoch": 2.72, "learning_rate": 2.2488229048371667e-09, "logits/chosen": -2.0966107845306396, "logits/rejected": -2.250854015350342, "logps/chosen": -0.37342172861099243, "logps/rejected": -0.415131539106369, "loss": 0.6869, "rewards/accuracies": 1.0, "rewards/chosen": 0.9580278396606445, "rewards/margins": 0.012515842914581299, "rewards/rejected": 0.9455119967460632, "step": 5045 }, { "epoch": 2.72, "learning_rate": 2.240197299431268e-09, "logits/chosen": -2.0102643966674805, "logits/rejected": -2.005568027496338, "logps/chosen": -1.2066388130187988, "logps/rejected": -4.631286144256592, "loss": 0.4379, "rewards/accuracies": 1.0, "rewards/chosen": 1.1292957067489624, "rewards/margins": 0.5988356471061707, "rewards/rejected": 0.5304600596427917, "step": 5046 }, { "epoch": 2.72, "learning_rate": 2.2315878889659523e-09, "logits/chosen": -2.052983045578003, "logits/rejected": -2.0572292804718018, "logps/chosen": -7.388774871826172, "logps/rejected": -8.25632095336914, "loss": 0.2749, "rewards/accuracies": 1.0, "rewards/chosen": 1.7624729871749878, "rewards/margins": 1.1508992910385132, "rewards/rejected": 0.6115736961364746, "step": 5047 }, { "epoch": 2.72, "learning_rate": 2.2229946763606123e-09, "logits/chosen": -2.0448760986328125, "logits/rejected": -2.1134603023529053, "logps/chosen": -2.907783031463623, "logps/rejected": -25.80059814453125, "loss": 0.1624, "rewards/accuracies": 1.0, "rewards/chosen": 1.6447021961212158, "rewards/margins": 1.7352027893066406, "rewards/rejected": -0.09050064533948898, "step": 5048 }, { "epoch": 2.72, "learning_rate": 2.2144176645291057e-09, "logits/chosen": -1.9999028444290161, "logits/rejected": -2.24214243888855, "logps/chosen": -1.7839534282684326, "logps/rejected": -1.7692383527755737, "loss": 0.684, "rewards/accuracies": 1.0, "rewards/chosen": 0.6577650308609009, "rewards/margins": 0.018351852893829346, "rewards/rejected": 0.6394131779670715, "step": 5049 }, { "epoch": 2.72, "learning_rate": 2.205856856379856e-09, "logits/chosen": -2.081038236618042, "logits/rejected": -2.0871222019195557, "logps/chosen": -2.2974724769592285, "logps/rejected": -4.307802677154541, "loss": 0.496, "rewards/accuracies": 1.0, "rewards/chosen": 1.0608423948287964, "rewards/margins": 0.4429473876953125, "rewards/rejected": 0.6178950071334839, "step": 5050 }, { "epoch": 2.72, "learning_rate": 2.1973122548157364e-09, "logits/chosen": -1.9579848051071167, "logits/rejected": -2.2745771408081055, "logps/chosen": -0.24887262284755707, "logps/rejected": -0.2671785056591034, "loss": 0.694, "rewards/accuracies": 0.0, "rewards/chosen": 1.0498825311660767, "rewards/margins": -0.001789093017578125, "rewards/rejected": 1.0516716241836548, "step": 5051 }, { "epoch": 2.72, "learning_rate": 2.188783862734156e-09, "logits/chosen": -2.012101650238037, "logits/rejected": -2.0212619304656982, "logps/chosen": -1.5131721496582031, "logps/rejected": -3.281165838241577, "loss": 0.5271, "rewards/accuracies": 1.0, "rewards/chosen": 1.0219202041625977, "rewards/margins": 0.36534595489501953, "rewards/rejected": 0.6565742492675781, "step": 5052 }, { "epoch": 2.73, "learning_rate": 2.180271683027024e-09, "logits/chosen": -2.1739020347595215, "logits/rejected": -2.180269956588745, "logps/chosen": -1.929927945137024, "logps/rejected": -3.8874289989471436, "loss": 0.4898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9431486129760742, "rewards/margins": 0.45897236466407776, "rewards/rejected": 0.48417624831199646, "step": 5053 }, { "epoch": 2.73, "learning_rate": 2.1717757185807372e-09, "logits/chosen": -2.0269172191619873, "logits/rejected": -2.328056573867798, "logps/chosen": -6.001402378082275, "logps/rejected": -5.40978479385376, "loss": 0.7016, "rewards/accuracies": 0.0, "rewards/chosen": 0.6507512927055359, "rewards/margins": -0.016801059246063232, "rewards/rejected": 0.6675523519515991, "step": 5054 }, { "epoch": 2.73, "learning_rate": 2.163295972276219e-09, "logits/chosen": -2.037623643875122, "logits/rejected": -2.027538299560547, "logps/chosen": -6.21162748336792, "logps/rejected": -4.807299613952637, "loss": 0.4693, "rewards/accuracies": 1.0, "rewards/chosen": 1.1235665082931519, "rewards/margins": 0.5127055644989014, "rewards/rejected": 0.6108609437942505, "step": 5055 }, { "epoch": 2.73, "learning_rate": 2.1548324469888633e-09, "logits/chosen": -2.037165880203247, "logits/rejected": -2.0354111194610596, "logps/chosen": -0.6387106776237488, "logps/rejected": -1.4223191738128662, "loss": 0.6043, "rewards/accuracies": 1.0, "rewards/chosen": 1.0093811750411987, "rewards/margins": 0.186387300491333, "rewards/rejected": 0.8229938745498657, "step": 5056 }, { "epoch": 2.73, "learning_rate": 2.146385145588586e-09, "logits/chosen": -2.082859516143799, "logits/rejected": -2.07025146484375, "logps/chosen": -12.042165756225586, "logps/rejected": -4.0374884605407715, "loss": 0.3135, "rewards/accuracies": 1.0, "rewards/chosen": 1.5234746932983398, "rewards/margins": 0.9992830753326416, "rewards/rejected": 0.5241916179656982, "step": 5057 }, { "epoch": 2.73, "learning_rate": 2.1379540709398014e-09, "logits/chosen": -1.9681370258331299, "logits/rejected": -1.972783088684082, "logps/chosen": -3.0983409881591797, "logps/rejected": -6.190066337585449, "loss": 0.4021, "rewards/accuracies": 1.0, "rewards/chosen": 1.3808635473251343, "rewards/margins": 0.7031431794166565, "rewards/rejected": 0.6777203679084778, "step": 5058 }, { "epoch": 2.73, "learning_rate": 2.1295392259014e-09, "logits/chosen": -2.253408670425415, "logits/rejected": -2.3277835845947266, "logps/chosen": -1.0486490726470947, "logps/rejected": -1.2092113494873047, "loss": 0.6942, "rewards/accuracies": 0.0, "rewards/chosen": 1.0502817630767822, "rewards/margins": -0.0020961761474609375, "rewards/rejected": 1.0523779392242432, "step": 5059 }, { "epoch": 2.73, "learning_rate": 2.1211406133267997e-09, "logits/chosen": -1.9893834590911865, "logits/rejected": -2.2449796199798584, "logps/chosen": -0.6972146034240723, "logps/rejected": -0.7868890762329102, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 0.7796244025230408, "rewards/margins": -0.00681614875793457, "rewards/rejected": 0.7864405512809753, "step": 5060 }, { "epoch": 2.73, "learning_rate": 2.1127582360638884e-09, "logits/chosen": -1.9682904481887817, "logits/rejected": -2.248947858810425, "logps/chosen": -0.18096472322940826, "logps/rejected": -0.18615710735321045, "loss": 0.6863, "rewards/accuracies": 1.0, "rewards/chosen": 0.9475658535957336, "rewards/margins": 0.013794779777526855, "rewards/rejected": 0.9337710738182068, "step": 5061 }, { "epoch": 2.73, "learning_rate": 2.1043920969550545e-09, "logits/chosen": -2.037407159805298, "logits/rejected": -2.053382396697998, "logps/chosen": -1.6827000379562378, "logps/rejected": -11.233786582946777, "loss": 0.5243, "rewards/accuracies": 1.0, "rewards/chosen": 1.2034556865692139, "rewards/margins": 0.37214797735214233, "rewards/rejected": 0.8313077092170715, "step": 5062 }, { "epoch": 2.73, "learning_rate": 2.0960421988372e-09, "logits/chosen": -2.156911849975586, "logits/rejected": -2.156611680984497, "logps/chosen": -0.9211136698722839, "logps/rejected": -1.7476470470428467, "loss": 0.6867, "rewards/accuracies": 1.0, "rewards/chosen": 0.8930014967918396, "rewards/margins": 0.012939393520355225, "rewards/rejected": 0.8800621032714844, "step": 5063 }, { "epoch": 2.73, "learning_rate": 2.0877085445416886e-09, "logits/chosen": -2.036494016647339, "logits/rejected": -2.0457117557525635, "logps/chosen": -1.194996953010559, "logps/rejected": -3.968083143234253, "loss": 0.4611, "rewards/accuracies": 1.0, "rewards/chosen": 1.0586808919906616, "rewards/margins": 0.5348276495933533, "rewards/rejected": 0.5238532423973083, "step": 5064 }, { "epoch": 2.73, "learning_rate": 2.079391136894404e-09, "logits/chosen": -2.135366201400757, "logits/rejected": -2.130357027053833, "logps/chosen": -3.877884864807129, "logps/rejected": -5.548083305358887, "loss": 0.3226, "rewards/accuracies": 1.0, "rewards/chosen": 1.450943112373352, "rewards/margins": 0.9658852815628052, "rewards/rejected": 0.4850578308105469, "step": 5065 }, { "epoch": 2.73, "learning_rate": 2.0710899787156954e-09, "logits/chosen": -2.0713958740234375, "logits/rejected": -2.269728183746338, "logps/chosen": -0.5621621012687683, "logps/rejected": -0.6161999702453613, "loss": 0.6882, "rewards/accuracies": 1.0, "rewards/chosen": 1.1675575971603394, "rewards/margins": 0.00995492935180664, "rewards/rejected": 1.1576026678085327, "step": 5066 }, { "epoch": 2.73, "learning_rate": 2.062805072820417e-09, "logits/chosen": -2.163038492202759, "logits/rejected": -2.060209274291992, "logps/chosen": -28.419126510620117, "logps/rejected": -6.393636703491211, "loss": 0.1547, "rewards/accuracies": 1.0, "rewards/chosen": 2.3169238567352295, "rewards/margins": 1.7882225513458252, "rewards/rejected": 0.5287012457847595, "step": 5067 }, { "epoch": 2.73, "learning_rate": 2.0545364220179216e-09, "logits/chosen": -2.0800468921661377, "logits/rejected": -2.079275131225586, "logps/chosen": -0.9009771347045898, "logps/rejected": -1.5968421697616577, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.9136536717414856, "rewards/margins": 0.034861087799072266, "rewards/rejected": 0.8787925839424133, "step": 5068 }, { "epoch": 2.73, "learning_rate": 2.0462840291120163e-09, "logits/chosen": -2.027385711669922, "logits/rejected": -2.0330586433410645, "logps/chosen": -1.0389225482940674, "logps/rejected": -4.371484279632568, "loss": 0.4058, "rewards/accuracies": 1.0, "rewards/chosen": 0.9753682017326355, "rewards/margins": 0.692272424697876, "rewards/rejected": 0.2830958068370819, "step": 5069 }, { "epoch": 2.73, "learning_rate": 2.038047896901041e-09, "logits/chosen": -2.157463788986206, "logits/rejected": -2.147728443145752, "logps/chosen": -5.049702167510986, "logps/rejected": -4.518937587738037, "loss": 0.6371, "rewards/accuracies": 1.0, "rewards/chosen": 0.8850223422050476, "rewards/margins": 0.11547940969467163, "rewards/rejected": 0.769542932510376, "step": 5070 }, { "epoch": 2.74, "learning_rate": 2.0298280281777822e-09, "logits/chosen": -2.335132360458374, "logits/rejected": -2.0889241695404053, "logps/chosen": -47.30009841918945, "logps/rejected": -6.223773956298828, "loss": 0.1691, "rewards/accuracies": 1.0, "rewards/chosen": 2.128075122833252, "rewards/margins": 1.6914862394332886, "rewards/rejected": 0.436588853597641, "step": 5071 }, { "epoch": 2.74, "learning_rate": 2.021624425729529e-09, "logits/chosen": -1.997523307800293, "logits/rejected": -2.2803876399993896, "logps/chosen": -0.34340783953666687, "logps/rejected": -0.3925197422504425, "loss": 0.6872, "rewards/accuracies": 1.0, "rewards/chosen": 0.847251832485199, "rewards/margins": 0.011876702308654785, "rewards/rejected": 0.8353751301765442, "step": 5072 }, { "epoch": 2.74, "learning_rate": 2.0134370923380604e-09, "logits/chosen": -2.1017863750457764, "logits/rejected": -2.2530195713043213, "logps/chosen": -1.2915964126586914, "logps/rejected": -1.4977940320968628, "loss": 0.6904, "rewards/accuracies": 1.0, "rewards/chosen": 0.8876989483833313, "rewards/margins": 0.005433559417724609, "rewards/rejected": 0.8822653889656067, "step": 5073 }, { "epoch": 2.74, "learning_rate": 2.005266030779623e-09, "logits/chosen": -2.097332000732422, "logits/rejected": -2.084549903869629, "logps/chosen": -6.822652816772461, "logps/rejected": -2.916381359100342, "loss": 0.4554, "rewards/accuracies": 1.0, "rewards/chosen": 1.1389967203140259, "rewards/margins": 0.5503482222557068, "rewards/rejected": 0.5886484980583191, "step": 5074 }, { "epoch": 2.74, "learning_rate": 1.9971112438249614e-09, "logits/chosen": -2.0238475799560547, "logits/rejected": -2.2347521781921387, "logps/chosen": -0.5682002305984497, "logps/rejected": -0.6120855212211609, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.9563145637512207, "rewards/margins": 0.02822113037109375, "rewards/rejected": 0.928093433380127, "step": 5075 }, { "epoch": 2.74, "learning_rate": 1.9889727342392803e-09, "logits/chosen": -2.0600929260253906, "logits/rejected": -2.054513692855835, "logps/chosen": -14.528189659118652, "logps/rejected": -9.676275253295898, "loss": 0.2629, "rewards/accuracies": 1.0, "rewards/chosen": 1.4249894618988037, "rewards/margins": 1.2014622688293457, "rewards/rejected": 0.2235272377729416, "step": 5076 }, { "epoch": 2.74, "learning_rate": 1.9808505047822942e-09, "logits/chosen": -2.3119959831237793, "logits/rejected": -2.342360019683838, "logps/chosen": -0.8803922533988953, "logps/rejected": -0.6671331524848938, "loss": 0.6985, "rewards/accuracies": 0.0, "rewards/chosen": 0.755990207195282, "rewards/margins": -0.010654866695404053, "rewards/rejected": 0.766645073890686, "step": 5077 }, { "epoch": 2.74, "learning_rate": 1.972744558208178e-09, "logits/chosen": -2.057685136795044, "logits/rejected": -2.0649166107177734, "logps/chosen": -2.084587335586548, "logps/rejected": -4.191626071929932, "loss": 0.3193, "rewards/accuracies": 1.0, "rewards/chosen": 1.4774744510650635, "rewards/margins": 0.9779106378555298, "rewards/rejected": 0.4995638430118561, "step": 5078 }, { "epoch": 2.74, "learning_rate": 1.964654897265572e-09, "logits/chosen": -1.9742268323898315, "logits/rejected": -1.9751769304275513, "logps/chosen": -3.749177932739258, "logps/rejected": -0.43036118149757385, "loss": 0.4957, "rewards/accuracies": 1.0, "rewards/chosen": 1.3760648965835571, "rewards/margins": 0.44377976655960083, "rewards/rejected": 0.9322851300239563, "step": 5079 }, { "epoch": 2.74, "learning_rate": 1.956581524697637e-09, "logits/chosen": -2.077148199081421, "logits/rejected": -2.2665693759918213, "logps/chosen": -3.7340123653411865, "logps/rejected": -2.28068208694458, "loss": 0.7819, "rewards/accuracies": 0.0, "rewards/chosen": 0.9579395651817322, "rewards/margins": -0.17033499479293823, "rewards/rejected": 1.1282745599746704, "step": 5080 }, { "epoch": 2.74, "learning_rate": 1.9485244432419666e-09, "logits/chosen": -2.236292600631714, "logits/rejected": -2.054640293121338, "logps/chosen": -58.60114288330078, "logps/rejected": -0.36153125762939453, "loss": 0.148, "rewards/accuracies": 1.0, "rewards/chosen": 2.681295156478882, "rewards/margins": 1.835960865020752, "rewards/rejected": 0.8453342318534851, "step": 5081 }, { "epoch": 2.74, "learning_rate": 1.940483655630659e-09, "logits/chosen": -2.0885839462280273, "logits/rejected": -2.0850043296813965, "logps/chosen": -5.602558135986328, "logps/rejected": -5.529118537902832, "loss": 0.2955, "rewards/accuracies": 1.0, "rewards/chosen": 1.6597518920898438, "rewards/margins": 1.067880392074585, "rewards/rejected": 0.5918715596199036, "step": 5082 }, { "epoch": 2.74, "learning_rate": 1.9324591645902723e-09, "logits/chosen": -2.067035436630249, "logits/rejected": -2.3053934574127197, "logps/chosen": -0.9895121455192566, "logps/rejected": -1.1330093145370483, "loss": 0.6916, "rewards/accuracies": 1.0, "rewards/chosen": 0.9755905270576477, "rewards/margins": 0.003052055835723877, "rewards/rejected": 0.9725384712219238, "step": 5083 }, { "epoch": 2.74, "learning_rate": 1.92445097284184e-09, "logits/chosen": -1.9948396682739258, "logits/rejected": -1.9900318384170532, "logps/chosen": -6.767351150512695, "logps/rejected": -3.57875919342041, "loss": 0.4285, "rewards/accuracies": 1.0, "rewards/chosen": 1.653983473777771, "rewards/margins": 0.6256670951843262, "rewards/rejected": 1.0283163785934448, "step": 5084 }, { "epoch": 2.74, "learning_rate": 1.916459083100874e-09, "logits/chosen": -2.0414257049560547, "logits/rejected": -2.296751022338867, "logps/chosen": -1.497067928314209, "logps/rejected": -1.3614065647125244, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793720483779907, "rewards/margins": 0.017058134078979492, "rewards/rejected": 1.0623139142990112, "step": 5085 }, { "epoch": 2.74, "learning_rate": 1.9084834980773567e-09, "logits/chosen": -2.0409998893737793, "logits/rejected": -2.0347740650177, "logps/chosen": -3.5037710666656494, "logps/rejected": -1.4862676858901978, "loss": 0.6371, "rewards/accuracies": 1.0, "rewards/chosen": 0.8630086779594421, "rewards/margins": 0.11534929275512695, "rewards/rejected": 0.7476593852043152, "step": 5086 }, { "epoch": 2.74, "learning_rate": 1.900524220475741e-09, "logits/chosen": -2.107006788253784, "logits/rejected": -2.1585731506347656, "logps/chosen": -7.333383083343506, "logps/rejected": -9.330201148986816, "loss": 0.3856, "rewards/accuracies": 1.0, "rewards/chosen": 1.4644320011138916, "rewards/margins": 0.7538194060325623, "rewards/rejected": 0.7106125950813293, "step": 5087 }, { "epoch": 2.74, "learning_rate": 1.8925812529949403e-09, "logits/chosen": -2.034332513809204, "logits/rejected": -2.294612169265747, "logps/chosen": -1.005078911781311, "logps/rejected": -0.8551642894744873, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.9611026048660278, "rewards/margins": -0.002879917621612549, "rewards/rejected": 0.9639825224876404, "step": 5088 }, { "epoch": 2.74, "learning_rate": 1.8846545983283614e-09, "logits/chosen": -2.047963857650757, "logits/rejected": -2.306816816329956, "logps/chosen": -0.45367708802223206, "logps/rejected": -0.42566975951194763, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.8334980010986328, "rewards/margins": 0.023436248302459717, "rewards/rejected": 0.8100617527961731, "step": 5089 }, { "epoch": 2.75, "learning_rate": 1.876744259163859e-09, "logits/chosen": -2.114393711090088, "logits/rejected": -2.1123549938201904, "logps/chosen": -2.2751243114471436, "logps/rejected": -4.514048099517822, "loss": 0.2839, "rewards/accuracies": 1.0, "rewards/chosen": 1.6646130084991455, "rewards/margins": 1.1138908863067627, "rewards/rejected": 0.550722062587738, "step": 5090 }, { "epoch": 2.75, "learning_rate": 1.8688502381837612e-09, "logits/chosen": -2.105891466140747, "logits/rejected": -2.1100215911865234, "logps/chosen": -0.8113623261451721, "logps/rejected": -2.1117796897888184, "loss": 0.5184, "rewards/accuracies": 1.0, "rewards/chosen": 1.08926522731781, "rewards/margins": 0.38655346632003784, "rewards/rejected": 0.7027117609977722, "step": 5091 }, { "epoch": 2.75, "learning_rate": 1.860972538064859e-09, "logits/chosen": -2.1956303119659424, "logits/rejected": -2.2036938667297363, "logps/chosen": -2.5395264625549316, "logps/rejected": -4.528378486633301, "loss": 0.3459, "rewards/accuracies": 1.0, "rewards/chosen": 1.3578920364379883, "rewards/margins": 0.8836456537246704, "rewards/rejected": 0.47424641251564026, "step": 5092 }, { "epoch": 2.75, "learning_rate": 1.8531111614784268e-09, "logits/chosen": -2.0472702980041504, "logits/rejected": -2.312429428100586, "logps/chosen": -0.6608411073684692, "logps/rejected": -0.6104216575622559, "loss": 0.6847, "rewards/accuracies": 1.0, "rewards/chosen": 1.090710997581482, "rewards/margins": 0.016907095909118652, "rewards/rejected": 1.0738039016723633, "step": 5093 }, { "epoch": 2.75, "learning_rate": 1.8452661110901713e-09, "logits/chosen": -2.0654306411743164, "logits/rejected": -2.2311007976531982, "logps/chosen": -0.6351339221000671, "logps/rejected": -0.5940523147583008, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 0.9181221127510071, "rewards/margins": 0.02304697036743164, "rewards/rejected": 0.8950751423835754, "step": 5094 }, { "epoch": 2.75, "learning_rate": 1.8374373895602925e-09, "logits/chosen": -1.9824364185333252, "logits/rejected": -1.977474570274353, "logps/chosen": -7.450075149536133, "logps/rejected": -3.5399835109710693, "loss": 0.3016, "rewards/accuracies": 1.0, "rewards/chosen": 1.6499651670455933, "rewards/margins": 1.0442068576812744, "rewards/rejected": 0.6057583093643188, "step": 5095 }, { "epoch": 2.75, "learning_rate": 1.8296249995434444e-09, "logits/chosen": -2.0715994834899902, "logits/rejected": -2.0785253047943115, "logps/chosen": -2.738348960876465, "logps/rejected": -5.068125247955322, "loss": 0.4803, "rewards/accuracies": 1.0, "rewards/chosen": 0.9417321085929871, "rewards/margins": 0.4835628867149353, "rewards/rejected": 0.45816922187805176, "step": 5096 }, { "epoch": 2.75, "learning_rate": 1.8218289436887358e-09, "logits/chosen": -2.3441576957702637, "logits/rejected": -2.198458671569824, "logps/chosen": -21.39531898498535, "logps/rejected": -1.7498242855072021, "loss": 0.1791, "rewards/accuracies": 1.0, "rewards/chosen": 2.273158550262451, "rewards/margins": 1.629162311553955, "rewards/rejected": 0.6439961791038513, "step": 5097 }, { "epoch": 2.75, "learning_rate": 1.814049224639741e-09, "logits/chosen": -1.9625792503356934, "logits/rejected": -1.9636460542678833, "logps/chosen": -2.8070225715637207, "logps/rejected": -0.5748100876808167, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.7846242785453796, "rewards/margins": 0.005367159843444824, "rewards/rejected": 0.7792571187019348, "step": 5098 }, { "epoch": 2.75, "learning_rate": 1.8062858450345054e-09, "logits/chosen": -2.0456390380859375, "logits/rejected": -2.2552895545959473, "logps/chosen": -0.4437892436981201, "logps/rejected": -0.4335423707962036, "loss": 0.6839, "rewards/accuracies": 1.0, "rewards/chosen": 1.0073803663253784, "rewards/margins": 0.018481791019439697, "rewards/rejected": 0.9888985753059387, "step": 5099 }, { "epoch": 2.75, "learning_rate": 1.7985388075055175e-09, "logits/chosen": -2.144209861755371, "logits/rejected": -2.3052170276641846, "logps/chosen": -3.0884487628936768, "logps/rejected": -3.3849809169769287, "loss": 0.6684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9255627989768982, "rewards/margins": 0.05020028352737427, "rewards/rejected": 0.8753625154495239, "step": 5100 }, { "epoch": 2.75, "learning_rate": 1.7908081146797427e-09, "logits/chosen": -2.212491512298584, "logits/rejected": -2.227285623550415, "logps/chosen": -3.5711827278137207, "logps/rejected": -10.247023582458496, "loss": 0.4514, "rewards/accuracies": 1.0, "rewards/chosen": 1.3649330139160156, "rewards/margins": 0.5613632798194885, "rewards/rejected": 0.8035697340965271, "step": 5101 }, { "epoch": 2.75, "learning_rate": 1.7830937691785786e-09, "logits/chosen": -1.9985350370407104, "logits/rejected": -1.9858081340789795, "logps/chosen": -0.8178373575210571, "logps/rejected": -3.589672088623047, "loss": 0.5271, "rewards/accuracies": 1.0, "rewards/chosen": 1.0270240306854248, "rewards/margins": 0.365250825881958, "rewards/rejected": 0.6617732048034668, "step": 5102 }, { "epoch": 2.75, "learning_rate": 1.7753957736178993e-09, "logits/chosen": -2.104445457458496, "logits/rejected": -2.1078121662139893, "logps/chosen": -3.394583225250244, "logps/rejected": -0.9346845149993896, "loss": 0.6676, "rewards/accuracies": 1.0, "rewards/chosen": 0.7385294437408447, "rewards/margins": 0.051685869693756104, "rewards/rejected": 0.6868435740470886, "step": 5103 }, { "epoch": 2.75, "learning_rate": 1.7677141306080279e-09, "logits/chosen": -2.011451244354248, "logits/rejected": -2.3038840293884277, "logps/chosen": -1.0311819314956665, "logps/rejected": -1.057216763496399, "loss": 0.6932, "rewards/accuracies": 0.0, "rewards/chosen": 0.9373272061347961, "rewards/margins": -5.310773849487305e-05, "rewards/rejected": 0.937380313873291, "step": 5104 }, { "epoch": 2.75, "learning_rate": 1.7600488427537475e-09, "logits/chosen": -2.055473804473877, "logits/rejected": -2.057400941848755, "logps/chosen": -0.21831440925598145, "logps/rejected": -5.294583797454834, "loss": 0.5212, "rewards/accuracies": 1.0, "rewards/chosen": 0.9383255839347839, "rewards/margins": 0.3796471953392029, "rewards/rejected": 0.558678388595581, "step": 5105 }, { "epoch": 2.75, "learning_rate": 1.75239991265429e-09, "logits/chosen": -2.059108018875122, "logits/rejected": -2.0624988079071045, "logps/chosen": -1.146683931350708, "logps/rejected": -2.8660717010498047, "loss": 0.529, "rewards/accuracies": 1.0, "rewards/chosen": 0.9009435772895813, "rewards/margins": 0.36053407192230225, "rewards/rejected": 0.540409505367279, "step": 5106 }, { "epoch": 2.75, "learning_rate": 1.744767342903336e-09, "logits/chosen": -2.294757127761841, "logits/rejected": -2.3776707649230957, "logps/chosen": -8.234901428222656, "logps/rejected": -13.26378059387207, "loss": 0.6176, "rewards/accuracies": 1.0, "rewards/chosen": 1.1563640832901, "rewards/margins": 0.15720194578170776, "rewards/rejected": 0.9991621375083923, "step": 5107 }, { "epoch": 2.76, "learning_rate": 1.7371511360890322e-09, "logits/chosen": -2.150707960128784, "logits/rejected": -2.2809934616088867, "logps/chosen": -2.3868565559387207, "logps/rejected": -0.4989875555038452, "loss": 0.7112, "rewards/accuracies": 0.0, "rewards/chosen": 0.878914475440979, "rewards/margins": -0.03574115037918091, "rewards/rejected": 0.9146556258201599, "step": 5108 }, { "epoch": 2.76, "learning_rate": 1.7295512947939683e-09, "logits/chosen": -2.0069539546966553, "logits/rejected": -1.9960769414901733, "logps/chosen": -5.574202537536621, "logps/rejected": -5.414999961853027, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 1.5775741338729858, "rewards/margins": 0.9508020877838135, "rewards/rejected": 0.6267720460891724, "step": 5109 }, { "epoch": 2.76, "learning_rate": 1.721967821595177e-09, "logits/chosen": -2.0157623291015625, "logits/rejected": -2.2650845050811768, "logps/chosen": -1.7809367179870605, "logps/rejected": -1.407125473022461, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.8607797622680664, "rewards/margins": -0.0029088854789733887, "rewards/rejected": 0.8636886477470398, "step": 5110 }, { "epoch": 2.76, "learning_rate": 1.7144007190641685e-09, "logits/chosen": -2.0391950607299805, "logits/rejected": -2.044780969619751, "logps/chosen": -1.6596379280090332, "logps/rejected": -3.1106834411621094, "loss": 0.4665, "rewards/accuracies": 1.0, "rewards/chosen": 1.0598134994506836, "rewards/margins": 0.5202849507331848, "rewards/rejected": 0.5395285487174988, "step": 5111 }, { "epoch": 2.76, "learning_rate": 1.706849989766862e-09, "logits/chosen": -2.0174922943115234, "logits/rejected": -2.3017172813415527, "logps/chosen": -0.18637369573116302, "logps/rejected": -0.21542830765247345, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.9501372575759888, "rewards/margins": 0.009730517864227295, "rewards/rejected": 0.9404067397117615, "step": 5112 }, { "epoch": 2.76, "learning_rate": 1.6993156362636652e-09, "logits/chosen": -2.02227783203125, "logits/rejected": -2.0119199752807617, "logps/chosen": -6.151014804840088, "logps/rejected": -3.712520122528076, "loss": 0.5647, "rewards/accuracies": 1.0, "rewards/chosen": 1.152186393737793, "rewards/margins": 0.27587467432022095, "rewards/rejected": 0.876311719417572, "step": 5113 }, { "epoch": 2.76, "learning_rate": 1.6917976611093898e-09, "logits/chosen": -2.063539981842041, "logits/rejected": -2.2577130794525146, "logps/chosen": -1.2641425132751465, "logps/rejected": -6.408594131469727, "loss": 0.6184, "rewards/accuracies": 1.0, "rewards/chosen": 0.8609865307807922, "rewards/margins": 0.15560472011566162, "rewards/rejected": 0.7053818106651306, "step": 5114 }, { "epoch": 2.76, "learning_rate": 1.6842960668533302e-09, "logits/chosen": -2.1661458015441895, "logits/rejected": -2.349900484085083, "logps/chosen": -6.946681976318359, "logps/rejected": -6.6644415855407715, "loss": 0.693, "rewards/accuracies": 1.0, "rewards/chosen": 0.713519275188446, "rewards/margins": 0.00039142370223999023, "rewards/rejected": 0.713127851486206, "step": 5115 }, { "epoch": 2.76, "learning_rate": 1.6768108560392125e-09, "logits/chosen": -2.1518967151641846, "logits/rejected": -2.1432530879974365, "logps/chosen": -7.333098411560059, "logps/rejected": -1.6589516401290894, "loss": 0.4064, "rewards/accuracies": 1.0, "rewards/chosen": 1.4310163259506226, "rewards/margins": 0.6902067065238953, "rewards/rejected": 0.7408096194267273, "step": 5116 }, { "epoch": 2.76, "learning_rate": 1.6693420312052064e-09, "logits/chosen": -1.9585528373718262, "logits/rejected": -1.9597216844558716, "logps/chosen": -1.5536150932312012, "logps/rejected": -0.7749052047729492, "loss": 0.6262, "rewards/accuracies": 1.0, "rewards/chosen": 1.0837793350219727, "rewards/margins": 0.13877201080322266, "rewards/rejected": 0.94500732421875, "step": 5117 }, { "epoch": 2.76, "learning_rate": 1.6618895948839251e-09, "logits/chosen": -2.093548059463501, "logits/rejected": -2.100863218307495, "logps/chosen": -4.957547187805176, "logps/rejected": -10.403703689575195, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 1.1383651494979858, "rewards/margins": 0.7006263732910156, "rewards/rejected": 0.4377388060092926, "step": 5118 }, { "epoch": 2.76, "learning_rate": 1.6544535496024248e-09, "logits/chosen": -2.1121158599853516, "logits/rejected": -2.3212528228759766, "logps/chosen": -0.6164774298667908, "logps/rejected": -0.6478089690208435, "loss": 0.6781, "rewards/accuracies": 1.0, "rewards/chosen": 0.9937165379524231, "rewards/margins": 0.030304312705993652, "rewards/rejected": 0.9634122252464294, "step": 5119 }, { "epoch": 2.76, "learning_rate": 1.6470338978822107e-09, "logits/chosen": -2.0649313926696777, "logits/rejected": -2.1334753036499023, "logps/chosen": -7.5073699951171875, "logps/rejected": -18.09579849243164, "loss": 0.3578, "rewards/accuracies": 1.0, "rewards/chosen": 1.343699336051941, "rewards/margins": 0.8436562418937683, "rewards/rejected": 0.5000430941581726, "step": 5120 }, { "epoch": 2.76, "learning_rate": 1.6396306422392203e-09, "logits/chosen": -2.1228418350219727, "logits/rejected": -2.2412548065185547, "logps/chosen": -2.0781383514404297, "logps/rejected": -2.034874200820923, "loss": 0.6724, "rewards/accuracies": 1.0, "rewards/chosen": 0.9696540832519531, "rewards/margins": 0.04200667142868042, "rewards/rejected": 0.9276474118232727, "step": 5121 }, { "epoch": 2.76, "learning_rate": 1.6322437851838344e-09, "logits/chosen": -2.059492826461792, "logits/rejected": -2.287943124771118, "logps/chosen": -0.40350601077079773, "logps/rejected": -0.37007951736450195, "loss": 0.6937, "rewards/accuracies": 0.0, "rewards/chosen": 0.8339681625366211, "rewards/margins": -0.001017749309539795, "rewards/rejected": 0.8349859118461609, "step": 5122 }, { "epoch": 2.76, "learning_rate": 1.6248733292208661e-09, "logits/chosen": -2.2503743171691895, "logits/rejected": -2.2405874729156494, "logps/chosen": -1.1872867345809937, "logps/rejected": -3.968125820159912, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 0.9854432344436646, "rewards/margins": 0.513794481754303, "rewards/rejected": 0.4716487526893616, "step": 5123 }, { "epoch": 2.76, "learning_rate": 1.6175192768495938e-09, "logits/chosen": -2.157356023788452, "logits/rejected": -2.1590776443481445, "logps/chosen": -0.2487519383430481, "logps/rejected": -5.614756107330322, "loss": 0.4446, "rewards/accuracies": 1.0, "rewards/chosen": 0.8528090715408325, "rewards/margins": 0.580157995223999, "rewards/rejected": 0.2726510465145111, "step": 5124 }, { "epoch": 2.76, "learning_rate": 1.6101816305636894e-09, "logits/chosen": -2.2067692279815674, "logits/rejected": -2.1409130096435547, "logps/chosen": -22.768817901611328, "logps/rejected": -2.003282308578491, "loss": 0.1968, "rewards/accuracies": 1.0, "rewards/chosen": 2.151012420654297, "rewards/margins": 1.5252807140350342, "rewards/rejected": 0.6257317662239075, "step": 5125 }, { "epoch": 2.76, "learning_rate": 1.6028603928513018e-09, "logits/chosen": -2.0422747135162354, "logits/rejected": -2.0480964183807373, "logps/chosen": -2.6294002532958984, "logps/rejected": -2.9626882076263428, "loss": 0.3813, "rewards/accuracies": 1.0, "rewards/chosen": 1.2226499319076538, "rewards/margins": 0.7675886750221252, "rewards/rejected": 0.45506125688552856, "step": 5126 }, { "epoch": 2.77, "learning_rate": 1.5955555661949894e-09, "logits/chosen": -2.097611427307129, "logits/rejected": -2.267845630645752, "logps/chosen": -5.115970611572266, "logps/rejected": -5.097500324249268, "loss": 0.6757, "rewards/accuracies": 1.0, "rewards/chosen": 0.6948041915893555, "rewards/margins": 0.035282254219055176, "rewards/rejected": 0.6595219373703003, "step": 5127 }, { "epoch": 2.77, "learning_rate": 1.588267153071765e-09, "logits/chosen": -2.140787363052368, "logits/rejected": -2.2534470558166504, "logps/chosen": -0.56192946434021, "logps/rejected": -0.7109999060630798, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.9749140739440918, "rewards/margins": 0.019174158573150635, "rewards/rejected": 0.9557399153709412, "step": 5128 }, { "epoch": 2.77, "learning_rate": 1.5809951559530633e-09, "logits/chosen": -1.9977024793624878, "logits/rejected": -2.307812213897705, "logps/chosen": -4.86741304397583, "logps/rejected": -7.118086338043213, "loss": 0.6637, "rewards/accuracies": 1.0, "rewards/chosen": 0.786273717880249, "rewards/margins": 0.05970054864883423, "rewards/rejected": 0.7265731692314148, "step": 5129 }, { "epoch": 2.77, "learning_rate": 1.5737395773047556e-09, "logits/chosen": -2.2212367057800293, "logits/rejected": -2.3333964347839355, "logps/chosen": -0.2898244261741638, "logps/rejected": -0.29023659229278564, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.994306206703186, "rewards/margins": 0.0046784281730651855, "rewards/rejected": 0.9896277785301208, "step": 5130 }, { "epoch": 2.77, "learning_rate": 1.5665004195871578e-09, "logits/chosen": -2.0382401943206787, "logits/rejected": -2.032283067703247, "logps/chosen": -0.6305038928985596, "logps/rejected": -4.8934760093688965, "loss": 0.4626, "rewards/accuracies": 1.0, "rewards/chosen": 1.0742459297180176, "rewards/margins": 0.5306091904640198, "rewards/rejected": 0.5436367392539978, "step": 5131 }, { "epoch": 2.77, "learning_rate": 1.5592776852549893e-09, "logits/chosen": -2.1975173950195312, "logits/rejected": -2.05722975730896, "logps/chosen": -31.340789794921875, "logps/rejected": -5.481180667877197, "loss": 0.182, "rewards/accuracies": 1.0, "rewards/chosen": 2.0678727626800537, "rewards/margins": 1.6112703084945679, "rewards/rejected": 0.45660242438316345, "step": 5132 }, { "epoch": 2.77, "learning_rate": 1.5520713767574244e-09, "logits/chosen": -2.0832881927490234, "logits/rejected": -2.278369665145874, "logps/chosen": -0.3992875814437866, "logps/rejected": -0.47755536437034607, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.8627001643180847, "rewards/margins": -0.006496250629425049, "rewards/rejected": 0.8691964149475098, "step": 5133 }, { "epoch": 2.77, "learning_rate": 1.5448814965380696e-09, "logits/chosen": -2.044300079345703, "logits/rejected": -2.2683496475219727, "logps/chosen": -1.3294594287872314, "logps/rejected": -46.47690963745117, "loss": 0.2426, "rewards/accuracies": 1.0, "rewards/chosen": 0.8748478293418884, "rewards/margins": 1.2924813032150269, "rewards/rejected": -0.41763344407081604, "step": 5134 }, { "epoch": 2.77, "learning_rate": 1.5377080470349358e-09, "logits/chosen": -2.1128203868865967, "logits/rejected": -2.079752206802368, "logps/chosen": -2.6581461429595947, "logps/rejected": -3.4807004928588867, "loss": 0.4313, "rewards/accuracies": 1.0, "rewards/chosen": 1.1751289367675781, "rewards/margins": 0.6176247596740723, "rewards/rejected": 0.5575041770935059, "step": 5135 }, { "epoch": 2.77, "learning_rate": 1.5305510306804937e-09, "logits/chosen": -1.9948487281799316, "logits/rejected": -2.272418260574341, "logps/chosen": -1.8495995998382568, "logps/rejected": -9.88505744934082, "loss": 0.669, "rewards/accuracies": 1.0, "rewards/chosen": 0.9452806711196899, "rewards/margins": 0.048818230628967285, "rewards/rejected": 0.8964624404907227, "step": 5136 }, { "epoch": 2.77, "learning_rate": 1.523410449901602e-09, "logits/chosen": -1.9946635961532593, "logits/rejected": -1.9926295280456543, "logps/chosen": -2.933187484741211, "logps/rejected": -2.5997238159179688, "loss": 0.3724, "rewards/accuracies": 1.0, "rewards/chosen": 1.473493218421936, "rewards/margins": 0.795811653137207, "rewards/rejected": 0.677681565284729, "step": 5137 }, { "epoch": 2.77, "learning_rate": 1.5162863071196074e-09, "logits/chosen": -2.0202441215515137, "logits/rejected": -2.2955501079559326, "logps/chosen": -1.6685590744018555, "logps/rejected": -1.7844312191009521, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 1.0997488498687744, "rewards/margins": -0.023460030555725098, "rewards/rejected": 1.1232088804244995, "step": 5138 }, { "epoch": 2.77, "learning_rate": 1.509178604750211e-09, "logits/chosen": -1.9929516315460205, "logits/rejected": -1.9707995653152466, "logps/chosen": -12.444707870483398, "logps/rejected": -7.030758380889893, "loss": 0.3577, "rewards/accuracies": 1.0, "rewards/chosen": 1.5599287748336792, "rewards/margins": 0.8439149260520935, "rewards/rejected": 0.7160138487815857, "step": 5139 }, { "epoch": 2.77, "learning_rate": 1.5020873452035954e-09, "logits/chosen": -2.048466920852661, "logits/rejected": -2.0413272380828857, "logps/chosen": -5.347146987915039, "logps/rejected": -2.040700912475586, "loss": 0.3525, "rewards/accuracies": 1.0, "rewards/chosen": 1.705919623374939, "rewards/margins": 0.8613532781600952, "rewards/rejected": 0.8445663452148438, "step": 5140 }, { "epoch": 2.77, "learning_rate": 1.495012530884343e-09, "logits/chosen": -2.050994873046875, "logits/rejected": -2.3208043575286865, "logps/chosen": -1.233080506324768, "logps/rejected": -1.2220021486282349, "loss": 0.6853, "rewards/accuracies": 1.0, "rewards/chosen": 0.8683140873908997, "rewards/margins": 0.01577693223953247, "rewards/rejected": 0.8525371551513672, "step": 5141 }, { "epoch": 2.77, "learning_rate": 1.4879541641914517e-09, "logits/chosen": -2.086946487426758, "logits/rejected": -2.311788558959961, "logps/chosen": -1.0773335695266724, "logps/rejected": -1.205312967300415, "loss": 0.6669, "rewards/accuracies": 1.0, "rewards/chosen": 0.8443613052368164, "rewards/margins": 0.05311155319213867, "rewards/rejected": 0.7912497520446777, "step": 5142 }, { "epoch": 2.77, "learning_rate": 1.4809122475183621e-09, "logits/chosen": -2.129862070083618, "logits/rejected": -2.131715774536133, "logps/chosen": -0.9401311874389648, "logps/rejected": -4.195411682128906, "loss": 0.535, "rewards/accuracies": 1.0, "rewards/chosen": 0.923675000667572, "rewards/margins": 0.34597331285476685, "rewards/rejected": 0.5777016878128052, "step": 5143 }, { "epoch": 2.77, "learning_rate": 1.473886783252931e-09, "logits/chosen": -2.1375961303710938, "logits/rejected": -2.1380114555358887, "logps/chosen": -1.063887357711792, "logps/rejected": -3.9339683055877686, "loss": 0.4331, "rewards/accuracies": 1.0, "rewards/chosen": 1.1263806819915771, "rewards/margins": 0.612408459186554, "rewards/rejected": 0.5139722228050232, "step": 5144 }, { "epoch": 2.78, "learning_rate": 1.4668777737774251e-09, "logits/chosen": -2.1326589584350586, "logits/rejected": -2.331435203552246, "logps/chosen": -0.3654974699020386, "logps/rejected": -0.3878743052482605, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.8347665667533875, "rewards/margins": 0.006723880767822266, "rewards/rejected": 0.8280426859855652, "step": 5145 }, { "epoch": 2.78, "learning_rate": 1.4598852214685486e-09, "logits/chosen": -2.138052463531494, "logits/rejected": -2.301349639892578, "logps/chosen": -1.5669372081756592, "logps/rejected": -1.637817621231079, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.6544780135154724, "rewards/margins": 0.03387635946273804, "rewards/rejected": 0.6206016540527344, "step": 5146 }, { "epoch": 2.78, "learning_rate": 1.4529091286973993e-09, "logits/chosen": -2.1752965450286865, "logits/rejected": -2.129718780517578, "logps/chosen": -17.054895401000977, "logps/rejected": -3.2068123817443848, "loss": 0.3108, "rewards/accuracies": 1.0, "rewards/chosen": 1.6238653659820557, "rewards/margins": 1.0091190338134766, "rewards/rejected": 0.6147462725639343, "step": 5147 }, { "epoch": 2.78, "learning_rate": 1.445949497829524e-09, "logits/chosen": -1.9587604999542236, "logits/rejected": -1.9720278978347778, "logps/chosen": -1.5019052028656006, "logps/rejected": -6.971303939819336, "loss": 0.4327, "rewards/accuracies": 1.0, "rewards/chosen": 1.2103482484817505, "rewards/margins": 0.6135960817337036, "rewards/rejected": 0.5967521667480469, "step": 5148 }, { "epoch": 2.78, "learning_rate": 1.4390063312248847e-09, "logits/chosen": -2.0643820762634277, "logits/rejected": -2.2675387859344482, "logps/chosen": -0.4839191138744354, "logps/rejected": -0.5391433238983154, "loss": 0.682, "rewards/accuracies": 1.0, "rewards/chosen": 0.8811201453208923, "rewards/margins": 0.02237522602081299, "rewards/rejected": 0.8587449193000793, "step": 5149 }, { "epoch": 2.78, "learning_rate": 1.4320796312378258e-09, "logits/chosen": -2.058318853378296, "logits/rejected": -2.0514345169067383, "logps/chosen": -8.437897682189941, "logps/rejected": -4.17642879486084, "loss": 0.3271, "rewards/accuracies": 1.0, "rewards/chosen": 1.5206836462020874, "rewards/margins": 0.9493361115455627, "rewards/rejected": 0.5713475346565247, "step": 5150 }, { "epoch": 2.78, "learning_rate": 1.4251694002171522e-09, "logits/chosen": -2.0107741355895996, "logits/rejected": -2.2439303398132324, "logps/chosen": -0.3919186592102051, "logps/rejected": -0.42018282413482666, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 1.0137529373168945, "rewards/margins": 0.009726762771606445, "rewards/rejected": 1.004026174545288, "step": 5151 }, { "epoch": 2.78, "learning_rate": 1.4182756405060447e-09, "logits/chosen": -2.0843684673309326, "logits/rejected": -2.0880789756774902, "logps/chosen": -1.5735825300216675, "logps/rejected": -2.5723984241485596, "loss": 0.4687, "rewards/accuracies": 1.0, "rewards/chosen": 1.1731736660003662, "rewards/margins": 0.5142813324928284, "rewards/rejected": 0.6588923335075378, "step": 5152 }, { "epoch": 2.78, "learning_rate": 1.4113983544421338e-09, "logits/chosen": -2.065577745437622, "logits/rejected": -2.0672717094421387, "logps/chosen": -0.9237382411956787, "logps/rejected": -4.455028533935547, "loss": 0.5466, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252253770828247, "rewards/margins": 0.31833648681640625, "rewards/rejected": 0.6068888902664185, "step": 5153 }, { "epoch": 2.78, "learning_rate": 1.404537544357448e-09, "logits/chosen": -1.9233200550079346, "logits/rejected": -2.2550747394561768, "logps/chosen": -5.3725104331970215, "logps/rejected": -4.359610557556152, "loss": 0.6606, "rewards/accuracies": 1.0, "rewards/chosen": 0.8720300793647766, "rewards/margins": 0.066292405128479, "rewards/rejected": 0.8057376742362976, "step": 5154 }, { "epoch": 2.78, "learning_rate": 1.3976932125784158e-09, "logits/chosen": -2.001460313796997, "logits/rejected": -2.237126588821411, "logps/chosen": -0.1821328103542328, "logps/rejected": -0.18712268769741058, "loss": 0.6979, "rewards/accuracies": 0.0, "rewards/chosen": 0.9932074546813965, "rewards/margins": -0.009464621543884277, "rewards/rejected": 1.0026720762252808, "step": 5155 }, { "epoch": 2.78, "learning_rate": 1.390865361425908e-09, "logits/chosen": -2.0789523124694824, "logits/rejected": -2.0756986141204834, "logps/chosen": -12.76852035522461, "logps/rejected": -11.605921745300293, "loss": 0.6975, "rewards/accuracies": 0.0, "rewards/chosen": 0.9227701425552368, "rewards/margins": -0.008722186088562012, "rewards/rejected": 0.9314923286437988, "step": 5156 }, { "epoch": 2.78, "learning_rate": 1.3840539932151784e-09, "logits/chosen": -2.1082823276519775, "logits/rejected": -1.9900115728378296, "logps/chosen": -30.606199264526367, "logps/rejected": -3.6760776042938232, "loss": 0.1549, "rewards/accuracies": 1.0, "rewards/chosen": 2.2544257640838623, "rewards/margins": 1.7862056493759155, "rewards/rejected": 0.46822014451026917, "step": 5157 }, { "epoch": 2.78, "learning_rate": 1.3772591102559072e-09, "logits/chosen": -2.047347068786621, "logits/rejected": -2.0477523803710938, "logps/chosen": -3.7327091693878174, "logps/rejected": -4.524674415588379, "loss": 0.494, "rewards/accuracies": 1.0, "rewards/chosen": 1.1164231300354004, "rewards/margins": 0.44797736406326294, "rewards/rejected": 0.6684457659721375, "step": 5158 }, { "epoch": 2.78, "learning_rate": 1.3704807148521903e-09, "logits/chosen": -2.0042550563812256, "logits/rejected": -2.289701223373413, "logps/chosen": -0.42553436756134033, "logps/rejected": -0.49224674701690674, "loss": 0.6885, "rewards/accuracies": 1.0, "rewards/chosen": 1.002874493598938, "rewards/margins": 0.009343147277832031, "rewards/rejected": 0.993531346321106, "step": 5159 }, { "epoch": 2.78, "learning_rate": 1.3637188093025054e-09, "logits/chosen": -2.000469207763672, "logits/rejected": -2.0005741119384766, "logps/chosen": -0.1605825424194336, "logps/rejected": -6.14792013168335, "loss": 0.427, "rewards/accuracies": 1.0, "rewards/chosen": 0.9501652121543884, "rewards/margins": 0.6299282312393188, "rewards/rejected": 0.32023701071739197, "step": 5160 }, { "epoch": 2.78, "learning_rate": 1.3569733958997798e-09, "logits/chosen": -2.020157814025879, "logits/rejected": -2.01175594329834, "logps/chosen": -1.9004267454147339, "logps/rejected": -3.7018351554870605, "loss": 0.47, "rewards/accuracies": 1.0, "rewards/chosen": 1.179552435874939, "rewards/margins": 0.510809600353241, "rewards/rejected": 0.668742835521698, "step": 5161 }, { "epoch": 2.78, "learning_rate": 1.3502444769313003e-09, "logits/chosen": -2.0498971939086914, "logits/rejected": -2.2565553188323975, "logps/chosen": -1.9432724714279175, "logps/rejected": -1.69399094581604, "loss": 0.5772, "rewards/accuracies": 1.0, "rewards/chosen": 1.1110094785690308, "rewards/margins": 0.24710839986801147, "rewards/rejected": 0.8639010787010193, "step": 5162 }, { "epoch": 2.78, "learning_rate": 1.3435320546788033e-09, "logits/chosen": -2.1387381553649902, "logits/rejected": -2.131091594696045, "logps/chosen": -5.272495746612549, "logps/rejected": -4.252734184265137, "loss": 0.3904, "rewards/accuracies": 1.0, "rewards/chosen": 1.2883104085922241, "rewards/margins": 0.739194929599762, "rewards/rejected": 0.5491154789924622, "step": 5163 }, { "epoch": 2.79, "learning_rate": 1.3368361314184117e-09, "logits/chosen": -2.0799238681793213, "logits/rejected": -2.08219838142395, "logps/chosen": -2.569444417953491, "logps/rejected": -2.617683172225952, "loss": 0.35, "rewards/accuracies": 1.0, "rewards/chosen": 1.7790418863296509, "rewards/margins": 0.8696188926696777, "rewards/rejected": 0.9094229936599731, "step": 5164 }, { "epoch": 2.79, "learning_rate": 1.330156709420649e-09, "logits/chosen": -2.2407548427581787, "logits/rejected": -2.0520896911621094, "logps/chosen": -42.52525329589844, "logps/rejected": -3.1764509677886963, "loss": 0.0987, "rewards/accuracies": 1.0, "rewards/chosen": 2.852074384689331, "rewards/margins": 2.266378879547119, "rewards/rejected": 0.5856955647468567, "step": 5165 }, { "epoch": 2.79, "learning_rate": 1.323493790950453e-09, "logits/chosen": -2.244241714477539, "logits/rejected": -2.2077083587646484, "logps/chosen": -22.380075454711914, "logps/rejected": -24.431522369384766, "loss": 0.5161, "rewards/accuracies": 1.0, "rewards/chosen": 1.6770597696304321, "rewards/margins": 0.3924001455307007, "rewards/rejected": 1.2846596240997314, "step": 5166 }, { "epoch": 2.79, "learning_rate": 1.3168473782671608e-09, "logits/chosen": -2.204117774963379, "logits/rejected": -2.3053271770477295, "logps/chosen": -8.831342697143555, "logps/rejected": -5.649630546569824, "loss": 0.7059, "rewards/accuracies": 0.0, "rewards/chosen": 0.7897377014160156, "rewards/margins": -0.025290191173553467, "rewards/rejected": 0.8150278925895691, "step": 5167 }, { "epoch": 2.79, "learning_rate": 1.3102174736245197e-09, "logits/chosen": -1.9434024095535278, "logits/rejected": -1.9439032077789307, "logps/chosen": -1.5348494052886963, "logps/rejected": -5.8731536865234375, "loss": 0.513, "rewards/accuracies": 1.0, "rewards/chosen": 1.066089391708374, "rewards/margins": 0.3999791741371155, "rewards/rejected": 0.6661102175712585, "step": 5168 }, { "epoch": 2.79, "learning_rate": 1.3036040792706704e-09, "logits/chosen": -1.9629353284835815, "logits/rejected": -2.2606725692749023, "logps/chosen": -7.295860290527344, "logps/rejected": -1.174565076828003, "loss": 0.5784, "rewards/accuracies": 1.0, "rewards/chosen": 1.1673330068588257, "rewards/margins": 0.24449628591537476, "rewards/rejected": 0.9228367209434509, "step": 5169 }, { "epoch": 2.79, "learning_rate": 1.2970071974481577e-09, "logits/chosen": -2.0971500873565674, "logits/rejected": -2.08687686920166, "logps/chosen": -1.6126974821090698, "logps/rejected": -12.232561111450195, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 1.4103727340698242, "rewards/margins": 1.0259082317352295, "rewards/rejected": 0.38446447253227234, "step": 5170 }, { "epoch": 2.79, "learning_rate": 1.290426830393926e-09, "logits/chosen": -2.0459439754486084, "logits/rejected": -2.2804198265075684, "logps/chosen": -0.7203107476234436, "logps/rejected": -0.6944814324378967, "loss": 0.6794, "rewards/accuracies": 1.0, "rewards/chosen": 0.921483039855957, "rewards/margins": 0.027623236179351807, "rewards/rejected": 0.8938598036766052, "step": 5171 }, { "epoch": 2.79, "learning_rate": 1.283862980339334e-09, "logits/chosen": -2.0804433822631836, "logits/rejected": -2.3165199756622314, "logps/chosen": -13.077948570251465, "logps/rejected": -11.522201538085938, "loss": 0.6038, "rewards/accuracies": 1.0, "rewards/chosen": 1.1983208656311035, "rewards/margins": 0.18750393390655518, "rewards/rejected": 1.0108169317245483, "step": 5172 }, { "epoch": 2.79, "learning_rate": 1.2773156495101078e-09, "logits/chosen": -2.1656644344329834, "logits/rejected": -2.367004632949829, "logps/chosen": -1.0775192975997925, "logps/rejected": -1.025578498840332, "loss": 0.697, "rewards/accuracies": 0.0, "rewards/chosen": 0.8575950860977173, "rewards/margins": -0.00774991512298584, "rewards/rejected": 0.8653450012207031, "step": 5173 }, { "epoch": 2.79, "learning_rate": 1.2707848401264042e-09, "logits/chosen": -2.0980148315429688, "logits/rejected": -2.1020913124084473, "logps/chosen": -3.4481873512268066, "logps/rejected": -3.8559021949768066, "loss": 0.5105, "rewards/accuracies": 1.0, "rewards/chosen": 1.1483128070831299, "rewards/margins": 0.40628039836883545, "rewards/rejected": 0.7420324087142944, "step": 5174 }, { "epoch": 2.79, "learning_rate": 1.2642705544027633e-09, "logits/chosen": -2.058976173400879, "logits/rejected": -2.2675487995147705, "logps/chosen": -4.098302364349365, "logps/rejected": -4.185278415679932, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 0.7304510474205017, "rewards/margins": 0.0410158634185791, "rewards/rejected": 0.6894351840019226, "step": 5175 }, { "epoch": 2.79, "learning_rate": 1.257772794548112e-09, "logits/chosen": -1.9901320934295654, "logits/rejected": -2.2775580883026123, "logps/chosen": -2.411198139190674, "logps/rejected": -5.368799209594727, "loss": 0.5849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8302003741264343, "rewards/margins": 0.229628324508667, "rewards/rejected": 0.6005720496177673, "step": 5176 }, { "epoch": 2.79, "learning_rate": 1.2512915627657994e-09, "logits/chosen": -2.11198353767395, "logits/rejected": -2.1194984912872314, "logps/chosen": -1.4942166805267334, "logps/rejected": -3.891099452972412, "loss": 0.437, "rewards/accuracies": 1.0, "rewards/chosen": 1.0793927907943726, "rewards/margins": 0.6014049053192139, "rewards/rejected": 0.4779879152774811, "step": 5177 }, { "epoch": 2.79, "learning_rate": 1.2448268612535507e-09, "logits/chosen": -2.29209041595459, "logits/rejected": -2.309009552001953, "logps/chosen": -0.47415319085121155, "logps/rejected": -0.499206006526947, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.9465362429618835, "rewards/margins": -0.002956986427307129, "rewards/rejected": 0.9494932293891907, "step": 5178 }, { "epoch": 2.79, "learning_rate": 1.2383786922034956e-09, "logits/chosen": -2.0589966773986816, "logits/rejected": -2.2498507499694824, "logps/chosen": -3.804251194000244, "logps/rejected": -3.9431538581848145, "loss": 0.6944, "rewards/accuracies": 0.0, "rewards/chosen": 0.698727548122406, "rewards/margins": -0.002559959888458252, "rewards/rejected": 0.7012875080108643, "step": 5179 }, { "epoch": 2.79, "learning_rate": 1.2319470578021462e-09, "logits/chosen": -2.132030725479126, "logits/rejected": -2.129596710205078, "logps/chosen": -5.6101975440979, "logps/rejected": -3.061858892440796, "loss": 0.3821, "rewards/accuracies": 1.0, "rewards/chosen": 1.4429117441177368, "rewards/margins": 0.7648383975028992, "rewards/rejected": 0.6780733466148376, "step": 5180 }, { "epoch": 2.79, "learning_rate": 1.2255319602304137e-09, "logits/chosen": -2.0831315517425537, "logits/rejected": -2.2775542736053467, "logps/chosen": -0.9060689210891724, "logps/rejected": -0.9573178291320801, "loss": 0.677, "rewards/accuracies": 1.0, "rewards/chosen": 0.9163134694099426, "rewards/margins": 0.03246927261352539, "rewards/rejected": 0.8838441967964172, "step": 5181 }, { "epoch": 2.8, "learning_rate": 1.2191334016636134e-09, "logits/chosen": -2.0610857009887695, "logits/rejected": -2.2408392429351807, "logps/chosen": -0.3275183141231537, "logps/rejected": -0.31805524230003357, "loss": 0.6821, "rewards/accuracies": 1.0, "rewards/chosen": 0.8207753300666809, "rewards/margins": 0.022218525409698486, "rewards/rejected": 0.7985568046569824, "step": 5182 }, { "epoch": 2.8, "learning_rate": 1.2127513842714266e-09, "logits/chosen": -2.0233426094055176, "logits/rejected": -2.2764453887939453, "logps/chosen": -0.6154617667198181, "logps/rejected": -0.6718320250511169, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 1.0098921060562134, "rewards/margins": -0.0028754472732543945, "rewards/rejected": 1.0127675533294678, "step": 5183 }, { "epoch": 2.8, "learning_rate": 1.2063859102179608e-09, "logits/chosen": -2.2461657524108887, "logits/rejected": -2.129685401916504, "logps/chosen": -34.40292739868164, "logps/rejected": -4.9281158447265625, "loss": 0.1499, "rewards/accuracies": 1.0, "rewards/chosen": 2.5789146423339844, "rewards/margins": 1.8220765590667725, "rewards/rejected": 0.7568380236625671, "step": 5184 }, { "epoch": 2.8, "learning_rate": 1.2000369816616674e-09, "logits/chosen": -2.1712586879730225, "logits/rejected": -2.0573060512542725, "logps/chosen": -27.044588088989258, "logps/rejected": -3.4286997318267822, "loss": 0.2278, "rewards/accuracies": 1.0, "rewards/chosen": 1.8731024265289307, "rewards/margins": 1.3631727695465088, "rewards/rejected": 0.5099296569824219, "step": 5185 }, { "epoch": 2.8, "learning_rate": 1.1937046007554352e-09, "logits/chosen": -2.0966250896453857, "logits/rejected": -2.31272554397583, "logps/chosen": -8.838180541992188, "logps/rejected": -9.976945877075195, "loss": 0.6936, "rewards/accuracies": 0.0, "rewards/chosen": 0.9966371655464172, "rewards/margins": -0.0009420514106750488, "rewards/rejected": 0.9975792169570923, "step": 5186 }, { "epoch": 2.8, "learning_rate": 1.1873887696465135e-09, "logits/chosen": -2.0848212242126465, "logits/rejected": -2.2741103172302246, "logps/chosen": -2.108285903930664, "logps/rejected": -2.16471529006958, "loss": 0.6887, "rewards/accuracies": 1.0, "rewards/chosen": 0.9966868758201599, "rewards/margins": 0.008926868438720703, "rewards/rejected": 0.9877600073814392, "step": 5187 }, { "epoch": 2.8, "learning_rate": 1.1810894904765444e-09, "logits/chosen": -2.163360357284546, "logits/rejected": -2.167958974838257, "logps/chosen": -0.3591722846031189, "logps/rejected": -7.064515113830566, "loss": 0.4531, "rewards/accuracies": 1.0, "rewards/chosen": 1.0008879899978638, "rewards/margins": 0.5566400289535522, "rewards/rejected": 0.44424793124198914, "step": 5188 }, { "epoch": 2.8, "learning_rate": 1.1748067653815641e-09, "logits/chosen": -2.121263265609741, "logits/rejected": -2.1301751136779785, "logps/chosen": -1.81740403175354, "logps/rejected": -2.218432903289795, "loss": 0.4951, "rewards/accuracies": 1.0, "rewards/chosen": 1.1703413724899292, "rewards/margins": 0.4452444911003113, "rewards/rejected": 0.7250968813896179, "step": 5189 }, { "epoch": 2.8, "learning_rate": 1.1685405964919847e-09, "logits/chosen": -2.0738954544067383, "logits/rejected": -2.249279260635376, "logps/chosen": -0.414559006690979, "logps/rejected": -0.39231717586517334, "loss": 0.6741, "rewards/accuracies": 1.0, "rewards/chosen": 0.8928680419921875, "rewards/margins": 0.03856390714645386, "rewards/rejected": 0.8543041348457336, "step": 5190 }, { "epoch": 2.8, "learning_rate": 1.162290985932618e-09, "logits/chosen": -2.0726819038391113, "logits/rejected": -2.2663910388946533, "logps/chosen": -0.42307665944099426, "logps/rejected": -0.4749821126461029, "loss": 0.705, "rewards/accuracies": 0.0, "rewards/chosen": 1.0335502624511719, "rewards/margins": -0.02346932888031006, "rewards/rejected": 1.057019591331482, "step": 5191 }, { "epoch": 2.8, "learning_rate": 1.1560579358226518e-09, "logits/chosen": -2.0351006984710693, "logits/rejected": -2.326911449432373, "logps/chosen": -0.6112477779388428, "logps/rejected": -11.924676895141602, "loss": 0.6627, "rewards/accuracies": 1.0, "rewards/chosen": 0.9252060055732727, "rewards/margins": 0.061754822731018066, "rewards/rejected": 0.8634511828422546, "step": 5192 }, { "epoch": 2.8, "learning_rate": 1.1498414482756513e-09, "logits/chosen": -2.1292998790740967, "logits/rejected": -2.258967638015747, "logps/chosen": -0.1645098477602005, "logps/rejected": -0.1556721329689026, "loss": 0.6908, "rewards/accuracies": 1.0, "rewards/chosen": 0.8282999992370605, "rewards/margins": 0.004660189151763916, "rewards/rejected": 0.8236398100852966, "step": 5193 }, { "epoch": 2.8, "learning_rate": 1.1436415253995858e-09, "logits/chosen": -2.028137445449829, "logits/rejected": -2.035620927810669, "logps/chosen": -1.5333584547042847, "logps/rejected": -3.748784065246582, "loss": 0.4622, "rewards/accuracies": 1.0, "rewards/chosen": 0.9939050078392029, "rewards/margins": 0.53170245885849, "rewards/rejected": 0.4622025489807129, "step": 5194 }, { "epoch": 2.8, "learning_rate": 1.13745816929679e-09, "logits/chosen": -2.0736656188964844, "logits/rejected": -2.063640594482422, "logps/chosen": -7.442124366760254, "logps/rejected": -0.7350805997848511, "loss": 0.4707, "rewards/accuracies": 1.0, "rewards/chosen": 1.5409772396087646, "rewards/margins": 0.5089584589004517, "rewards/rejected": 1.032018780708313, "step": 5195 }, { "epoch": 2.8, "learning_rate": 1.131291382063987e-09, "logits/chosen": -2.0687832832336426, "logits/rejected": -2.3122386932373047, "logps/chosen": -1.8260128498077393, "logps/rejected": -1.7407366037368774, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.6588379144668579, "rewards/margins": 0.019663214683532715, "rewards/rejected": 0.6391746997833252, "step": 5196 }, { "epoch": 2.8, "learning_rate": 1.125141165792287e-09, "logits/chosen": -2.1178641319274902, "logits/rejected": -2.133777141571045, "logps/chosen": -7.226998805999756, "logps/rejected": -4.375405311584473, "loss": 0.4144, "rewards/accuracies": 1.0, "rewards/chosen": 1.3427660465240479, "rewards/margins": 0.6666049361228943, "rewards/rejected": 0.6761611104011536, "step": 5197 }, { "epoch": 2.8, "learning_rate": 1.119007522567167e-09, "logits/chosen": -2.06201434135437, "logits/rejected": -2.3326191902160645, "logps/chosen": -0.3162184953689575, "logps/rejected": -0.30241236090660095, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9963579177856445, "rewards/margins": 0.011153757572174072, "rewards/rejected": 0.9852041602134705, "step": 5198 }, { "epoch": 2.8, "learning_rate": 1.1128904544685014e-09, "logits/chosen": -2.1574599742889404, "logits/rejected": -2.129380226135254, "logps/chosen": -25.97231674194336, "logps/rejected": -4.939337253570557, "loss": 0.5424, "rewards/accuracies": 1.0, "rewards/chosen": 1.4163074493408203, "rewards/margins": 0.32835161685943604, "rewards/rejected": 1.0879558324813843, "step": 5199 }, { "epoch": 2.8, "learning_rate": 1.1067899635705314e-09, "logits/chosen": -2.124053478240967, "logits/rejected": -2.0831139087677, "logps/chosen": -19.64167594909668, "logps/rejected": -9.04376220703125, "loss": 0.3608, "rewards/accuracies": 1.0, "rewards/chosen": 1.537596344947815, "rewards/margins": 0.8334604501724243, "rewards/rejected": 0.7041358947753906, "step": 5200 }, { "epoch": 2.81, "learning_rate": 1.10070605194188e-09, "logits/chosen": -2.1212074756622314, "logits/rejected": -2.0550830364227295, "logps/chosen": -10.516525268554688, "logps/rejected": -7.662628173828125, "loss": 0.696, "rewards/accuracies": 0.0, "rewards/chosen": 0.6586147546768188, "rewards/margins": -0.00571286678314209, "rewards/rejected": 0.6643276214599609, "step": 5201 }, { "epoch": 2.81, "learning_rate": 1.0946387216455577e-09, "logits/chosen": -2.086073160171509, "logits/rejected": -2.297292709350586, "logps/chosen": -3.666520595550537, "logps/rejected": -1.2236378192901611, "loss": 0.7234, "rewards/accuracies": 0.0, "rewards/chosen": 0.6759914755821228, "rewards/margins": -0.05967545509338379, "rewards/rejected": 0.7356669306755066, "step": 5202 }, { "epoch": 2.81, "learning_rate": 1.0885879747389305e-09, "logits/chosen": -2.2129507064819336, "logits/rejected": -2.335334300994873, "logps/chosen": -12.458580017089844, "logps/rejected": -9.280010223388672, "loss": 0.7278, "rewards/accuracies": 0.0, "rewards/chosen": 0.7875120043754578, "rewards/margins": -0.06810534000396729, "rewards/rejected": 0.855617344379425, "step": 5203 }, { "epoch": 2.81, "learning_rate": 1.0825538132737622e-09, "logits/chosen": -2.07350492477417, "logits/rejected": -2.2902045249938965, "logps/chosen": -2.0562777519226074, "logps/rejected": -1.9753801822662354, "loss": 0.6919, "rewards/accuracies": 1.0, "rewards/chosen": 0.898854672908783, "rewards/margins": 0.0025662779808044434, "rewards/rejected": 0.8962883949279785, "step": 5204 }, { "epoch": 2.81, "learning_rate": 1.076536239296194e-09, "logits/chosen": -1.9427955150604248, "logits/rejected": -1.9517040252685547, "logps/chosen": -1.12704336643219, "logps/rejected": -2.995903968811035, "loss": 0.4638, "rewards/accuracies": 1.0, "rewards/chosen": 1.0919839143753052, "rewards/margins": 0.5275687575340271, "rewards/rejected": 0.5644151568412781, "step": 5205 }, { "epoch": 2.81, "learning_rate": 1.0705352548467106e-09, "logits/chosen": -2.0810353755950928, "logits/rejected": -2.0873570442199707, "logps/chosen": -1.7486470937728882, "logps/rejected": -3.746094226837158, "loss": 0.4217, "rewards/accuracies": 1.0, "rewards/chosen": 1.1269210577011108, "rewards/margins": 0.6453531980514526, "rewards/rejected": 0.4815678298473358, "step": 5206 }, { "epoch": 2.81, "learning_rate": 1.0645508619602227e-09, "logits/chosen": -1.9936211109161377, "logits/rejected": -2.2772865295410156, "logps/chosen": -0.31048017740249634, "logps/rejected": -0.3991994261741638, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9700600504875183, "rewards/margins": 0.0051250457763671875, "rewards/rejected": 0.9649350047111511, "step": 5207 }, { "epoch": 2.81, "learning_rate": 1.0585830626659686e-09, "logits/chosen": -2.0982954502105713, "logits/rejected": -2.104572057723999, "logps/chosen": -8.69639778137207, "logps/rejected": -2.5005311965942383, "loss": 0.3414, "rewards/accuracies": 1.0, "rewards/chosen": 1.5294641256332397, "rewards/margins": 0.8991348743438721, "rewards/rejected": 0.6303292512893677, "step": 5208 }, { "epoch": 2.81, "learning_rate": 1.0526318589875848e-09, "logits/chosen": -2.1808888912200928, "logits/rejected": -2.172877311706543, "logps/chosen": -6.1631622314453125, "logps/rejected": -4.814204692840576, "loss": 0.3731, "rewards/accuracies": 1.0, "rewards/chosen": 1.3056831359863281, "rewards/margins": 0.7934078574180603, "rewards/rejected": 0.5122752785682678, "step": 5209 }, { "epoch": 2.81, "learning_rate": 1.0466972529430684e-09, "logits/chosen": -2.100510358810425, "logits/rejected": -2.286414623260498, "logps/chosen": -0.5991700291633606, "logps/rejected": -0.6186677813529968, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7087680697441101, "rewards/margins": 0.017354071140289307, "rewards/rejected": 0.6914139986038208, "step": 5210 }, { "epoch": 2.81, "learning_rate": 1.0407792465447985e-09, "logits/chosen": -2.301466941833496, "logits/rejected": -2.2422354221343994, "logps/chosen": -7.552048683166504, "logps/rejected": -7.546056270599365, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.4750295579433441, "rewards/margins": 0.032109200954437256, "rewards/rejected": 0.44292035698890686, "step": 5211 }, { "epoch": 2.81, "learning_rate": 1.0348778417995196e-09, "logits/chosen": -2.2082622051239014, "logits/rejected": -2.0782408714294434, "logps/chosen": -53.86537170410156, "logps/rejected": -5.223968982696533, "loss": 0.1443, "rewards/accuracies": 1.0, "rewards/chosen": 2.504763126373291, "rewards/margins": 1.8631598949432373, "rewards/rejected": 0.6416031718254089, "step": 5212 }, { "epoch": 2.81, "learning_rate": 1.028993040708348e-09, "logits/chosen": -2.117547035217285, "logits/rejected": -2.273310422897339, "logps/chosen": -0.1626405417919159, "logps/rejected": -0.16714632511138916, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.9472967982292175, "rewards/margins": 0.029536426067352295, "rewards/rejected": 0.9177603721618652, "step": 5213 }, { "epoch": 2.81, "learning_rate": 1.0231248452667596e-09, "logits/chosen": -2.216169834136963, "logits/rejected": -2.1029553413391113, "logps/chosen": -17.031198501586914, "logps/rejected": -13.727596282958984, "loss": 0.4625, "rewards/accuracies": 1.0, "rewards/chosen": 1.590305209159851, "rewards/margins": 0.5308808088302612, "rewards/rejected": 1.0594244003295898, "step": 5214 }, { "epoch": 2.81, "learning_rate": 1.0172732574646292e-09, "logits/chosen": -2.068697452545166, "logits/rejected": -2.0673861503601074, "logps/chosen": -0.8512611389160156, "logps/rejected": -5.925652980804443, "loss": 0.4074, "rewards/accuracies": 1.0, "rewards/chosen": 1.1117011308670044, "rewards/margins": 0.6872243881225586, "rewards/rejected": 0.4244767725467682, "step": 5215 }, { "epoch": 2.81, "learning_rate": 1.0114382792861586e-09, "logits/chosen": -2.1355981826782227, "logits/rejected": -2.279691696166992, "logps/chosen": -2.9446094036102295, "logps/rejected": -2.7921950817108154, "loss": 0.6966, "rewards/accuracies": 0.0, "rewards/chosen": 1.0172361135482788, "rewards/margins": -0.006864666938781738, "rewards/rejected": 1.0241007804870605, "step": 5216 }, { "epoch": 2.81, "learning_rate": 1.005619912709954e-09, "logits/chosen": -2.2260630130767822, "logits/rejected": -2.2210490703582764, "logps/chosen": -0.9107792377471924, "logps/rejected": -4.5295562744140625, "loss": 0.4776, "rewards/accuracies": 1.0, "rewards/chosen": 1.0711921453475952, "rewards/margins": 0.4907662272453308, "rewards/rejected": 0.5804259181022644, "step": 5217 }, { "epoch": 2.81, "learning_rate": 9.99818159708965e-10, "logits/chosen": -2.0983757972717285, "logits/rejected": -2.254359006881714, "logps/chosen": -0.9603856801986694, "logps/rejected": -1.005539059638977, "loss": 0.6837, "rewards/accuracies": 1.0, "rewards/chosen": 0.7638564705848694, "rewards/margins": 0.01903975009918213, "rewards/rejected": 0.7448167204856873, "step": 5218 }, { "epoch": 2.81, "learning_rate": 9.940330222505233e-10, "logits/chosen": -2.124927282333374, "logits/rejected": -2.293336868286133, "logps/chosen": -5.46115779876709, "logps/rejected": -6.25006103515625, "loss": 0.5864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0261410474777222, "rewards/margins": 0.2262522578239441, "rewards/rejected": 0.7998887896537781, "step": 5219 }, { "epoch": 2.82, "learning_rate": 9.88264502296321e-10, "logits/chosen": -2.0907950401306152, "logits/rejected": -2.298628330230713, "logps/chosen": -1.5424995422363281, "logps/rejected": -1.8738470077514648, "loss": 0.6815, "rewards/accuracies": 1.0, "rewards/chosen": 0.9593381285667419, "rewards/margins": 0.02337646484375, "rewards/rejected": 0.9359616637229919, "step": 5220 }, { "epoch": 2.82, "learning_rate": 9.825126018023988e-10, "logits/chosen": -2.1610233783721924, "logits/rejected": -2.342531681060791, "logps/chosen": -3.0797104835510254, "logps/rejected": -3.0746283531188965, "loss": 0.6909, "rewards/accuracies": 1.0, "rewards/chosen": 0.7956764698028564, "rewards/margins": 0.0045607686042785645, "rewards/rejected": 0.7911157011985779, "step": 5221 }, { "epoch": 2.82, "learning_rate": 9.767773227191967e-10, "logits/chosen": -2.1298611164093018, "logits/rejected": -2.1591339111328125, "logps/chosen": -9.541455268859863, "logps/rejected": -14.774778366088867, "loss": 0.6032, "rewards/accuracies": 1.0, "rewards/chosen": 1.201343297958374, "rewards/margins": 0.18885588645935059, "rewards/rejected": 1.0124874114990234, "step": 5222 }, { "epoch": 2.82, "learning_rate": 9.710586669914866e-10, "logits/chosen": -2.038935661315918, "logits/rejected": -2.3457834720611572, "logps/chosen": -0.3789122998714447, "logps/rejected": -0.33962857723236084, "loss": 0.6958, "rewards/accuracies": 0.0, "rewards/chosen": 0.8525053262710571, "rewards/margins": -0.005247652530670166, "rewards/rejected": 0.8577529788017273, "step": 5223 }, { "epoch": 2.82, "learning_rate": 9.653566365584176e-10, "logits/chosen": -2.1195898056030273, "logits/rejected": -2.26607346534729, "logps/chosen": -0.20515182614326477, "logps/rejected": -0.22019872069358826, "loss": 0.6898, "rewards/accuracies": 1.0, "rewards/chosen": 0.9495918154716492, "rewards/margins": 0.006704807281494141, "rewards/rejected": 0.942887008190155, "step": 5224 }, { "epoch": 2.82, "learning_rate": 9.596712333534985e-10, "logits/chosen": -1.997387409210205, "logits/rejected": -1.9965840578079224, "logps/chosen": -4.442412376403809, "logps/rejected": -0.5565236806869507, "loss": 0.7185, "rewards/accuracies": 0.0, "rewards/chosen": 0.892512321472168, "rewards/margins": -0.050030648708343506, "rewards/rejected": 0.9425429701805115, "step": 5225 }, { "epoch": 2.82, "learning_rate": 9.540024593046037e-10, "logits/chosen": -2.0241613388061523, "logits/rejected": -2.2659125328063965, "logps/chosen": -5.1058759689331055, "logps/rejected": -1.341103196144104, "loss": 0.714, "rewards/accuracies": 0.0, "rewards/chosen": 0.8513930439949036, "rewards/margins": -0.04133641719818115, "rewards/rejected": 0.8927294611930847, "step": 5226 }, { "epoch": 2.82, "learning_rate": 9.48350316333968e-10, "logits/chosen": -2.1634669303894043, "logits/rejected": -2.0244901180267334, "logps/chosen": -30.981578826904297, "logps/rejected": -5.107273101806641, "loss": 0.2312, "rewards/accuracies": 1.0, "rewards/chosen": 1.7604210376739502, "rewards/margins": 1.3466538190841675, "rewards/rejected": 0.4137672483921051, "step": 5227 }, { "epoch": 2.82, "learning_rate": 9.427148063581803e-10, "logits/chosen": -2.033255100250244, "logits/rejected": -2.031799077987671, "logps/chosen": -0.2915593683719635, "logps/rejected": -5.243200302124023, "loss": 0.4608, "rewards/accuracies": 1.0, "rewards/chosen": 0.9592161178588867, "rewards/margins": 0.5354924201965332, "rewards/rejected": 0.4237236976623535, "step": 5228 }, { "epoch": 2.82, "learning_rate": 9.370959312881953e-10, "logits/chosen": -2.1614797115325928, "logits/rejected": -2.3842453956604004, "logps/chosen": -0.6017314195632935, "logps/rejected": -0.5855157375335693, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.9572324156761169, "rewards/margins": 0.011045098304748535, "rewards/rejected": 0.9461873173713684, "step": 5229 }, { "epoch": 2.82, "learning_rate": 9.314936930293283e-10, "logits/chosen": -2.0279483795166016, "logits/rejected": -2.246349573135376, "logps/chosen": -8.678950309753418, "logps/rejected": -5.37878942489624, "loss": 0.6785, "rewards/accuracies": 1.0, "rewards/chosen": 0.8907914161682129, "rewards/margins": 0.029422104358673096, "rewards/rejected": 0.8613693118095398, "step": 5230 }, { "epoch": 2.82, "learning_rate": 9.25908093481248e-10, "logits/chosen": -2.178457260131836, "logits/rejected": -2.3055641651153564, "logps/chosen": -6.7571611404418945, "logps/rejected": -6.496583461761475, "loss": 0.7092, "rewards/accuracies": 0.0, "rewards/chosen": 0.5266942977905273, "rewards/margins": -0.03186136484146118, "rewards/rejected": 0.5585556626319885, "step": 5231 }, { "epoch": 2.82, "learning_rate": 9.203391345379841e-10, "logits/chosen": -1.9981309175491333, "logits/rejected": -2.2564890384674072, "logps/chosen": -0.6425691246986389, "logps/rejected": -0.7009057402610779, "loss": 0.6896, "rewards/accuracies": 1.0, "rewards/chosen": 1.1234456300735474, "rewards/margins": 0.0071152448654174805, "rewards/rejected": 1.1163303852081299, "step": 5232 }, { "epoch": 2.82, "learning_rate": 9.147868180879148e-10, "logits/chosen": -2.0767524242401123, "logits/rejected": -2.3359997272491455, "logps/chosen": -7.974860668182373, "logps/rejected": -10.52669620513916, "loss": 0.5619, "rewards/accuracies": 1.0, "rewards/chosen": 0.9547531008720398, "rewards/margins": 0.282290518283844, "rewards/rejected": 0.6724625825881958, "step": 5233 }, { "epoch": 2.82, "learning_rate": 9.092511460137952e-10, "logits/chosen": -1.9674075841903687, "logits/rejected": -1.9661130905151367, "logps/chosen": -0.9867451786994934, "logps/rejected": -3.2216882705688477, "loss": 0.5358, "rewards/accuracies": 1.0, "rewards/chosen": 1.1509692668914795, "rewards/margins": 0.344224750995636, "rewards/rejected": 0.8067445158958435, "step": 5234 }, { "epoch": 2.82, "learning_rate": 9.037321201927128e-10, "logits/chosen": -2.058269739151001, "logits/rejected": -2.05232834815979, "logps/chosen": -3.974155902862549, "logps/rejected": -3.5767271518707275, "loss": 0.2918, "rewards/accuracies": 1.0, "rewards/chosen": 1.573529839515686, "rewards/margins": 1.0820550918579102, "rewards/rejected": 0.4914747178554535, "step": 5235 }, { "epoch": 2.82, "learning_rate": 8.982297424961371e-10, "logits/chosen": -2.1363918781280518, "logits/rejected": -2.126826047897339, "logps/chosen": -4.300439834594727, "logps/rejected": -2.6250672340393066, "loss": 0.3554, "rewards/accuracies": 1.0, "rewards/chosen": 1.6653709411621094, "rewards/margins": 0.8516882061958313, "rewards/rejected": 0.8136827349662781, "step": 5236 }, { "epoch": 2.82, "learning_rate": 8.927440147898701e-10, "logits/chosen": -2.0882248878479004, "logits/rejected": -2.086916446685791, "logps/chosen": -4.988081932067871, "logps/rejected": -3.5011661052703857, "loss": 0.2631, "rewards/accuracies": 1.0, "rewards/chosen": 1.7026607990264893, "rewards/margins": 1.2006535530090332, "rewards/rejected": 0.5020073056221008, "step": 5237 }, { "epoch": 2.83, "learning_rate": 8.872749389340683e-10, "logits/chosen": -2.124847173690796, "logits/rejected": -2.2945950031280518, "logps/chosen": -2.4566659927368164, "logps/rejected": -1.2457449436187744, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 1.018530249595642, "rewards/margins": 0.039955079555511475, "rewards/rejected": 0.9785751700401306, "step": 5238 }, { "epoch": 2.83, "learning_rate": 8.818225167832538e-10, "logits/chosen": -2.0916285514831543, "logits/rejected": -2.0876712799072266, "logps/chosen": -12.379570960998535, "logps/rejected": -1.4169666767120361, "loss": 0.7846, "rewards/accuracies": 0.0, "rewards/chosen": 0.7804602980613708, "rewards/margins": -0.17528998851776123, "rewards/rejected": 0.9557502865791321, "step": 5239 }, { "epoch": 2.83, "learning_rate": 8.763867501863031e-10, "logits/chosen": -2.129650354385376, "logits/rejected": -2.0240318775177, "logps/chosen": -17.781431198120117, "logps/rejected": -4.917153358459473, "loss": 0.3294, "rewards/accuracies": 1.0, "rewards/chosen": 1.623757004737854, "rewards/margins": 0.9413633346557617, "rewards/rejected": 0.6823936700820923, "step": 5240 }, { "epoch": 2.83, "learning_rate": 8.709676409864253e-10, "logits/chosen": -2.1627941131591797, "logits/rejected": -2.169450521469116, "logps/chosen": -2.1670148372650146, "logps/rejected": -4.304270267486572, "loss": 0.3569, "rewards/accuracies": 1.0, "rewards/chosen": 1.2940971851348877, "rewards/margins": 0.8464689254760742, "rewards/rejected": 0.4476282596588135, "step": 5241 }, { "epoch": 2.83, "learning_rate": 8.655651910212003e-10, "logits/chosen": -2.0557079315185547, "logits/rejected": -2.2670211791992188, "logps/chosen": -0.3325015902519226, "logps/rejected": -0.3485635221004486, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.9301315546035767, "rewards/margins": 0.03050750494003296, "rewards/rejected": 0.8996240496635437, "step": 5242 }, { "epoch": 2.83, "learning_rate": 8.601794021225573e-10, "logits/chosen": -1.9930475950241089, "logits/rejected": -1.9720377922058105, "logps/chosen": -15.824145317077637, "logps/rejected": -2.37652587890625, "loss": 0.2645, "rewards/accuracies": 1.0, "rewards/chosen": 1.8924922943115234, "rewards/margins": 1.1947563886642456, "rewards/rejected": 0.6977359056472778, "step": 5243 }, { "epoch": 2.83, "learning_rate": 8.548102761167631e-10, "logits/chosen": -2.188318967819214, "logits/rejected": -2.268817186355591, "logps/chosen": -5.973511219024658, "logps/rejected": -2.9364383220672607, "loss": 0.7104, "rewards/accuracies": 0.0, "rewards/chosen": 0.6192468404769897, "rewards/margins": -0.03423285484313965, "rewards/rejected": 0.6534796953201294, "step": 5244 }, { "epoch": 2.83, "learning_rate": 8.494578148244502e-10, "logits/chosen": -2.180161952972412, "logits/rejected": -2.2722082138061523, "logps/chosen": -6.003852367401123, "logps/rejected": -1.587029218673706, "loss": 0.7652, "rewards/accuracies": 0.0, "rewards/chosen": 0.7374494075775146, "rewards/margins": -0.13933169841766357, "rewards/rejected": 0.8767811059951782, "step": 5245 }, { "epoch": 2.83, "learning_rate": 8.441220200605836e-10, "logits/chosen": -2.017594337463379, "logits/rejected": -2.015094518661499, "logps/chosen": -9.257272720336914, "logps/rejected": -1.176843523979187, "loss": 0.4607, "rewards/accuracies": 1.0, "rewards/chosen": 1.3878930807113647, "rewards/margins": 0.5359012484550476, "rewards/rejected": 0.8519918322563171, "step": 5246 }, { "epoch": 2.83, "learning_rate": 8.388028936344938e-10, "logits/chosen": -1.9757485389709473, "logits/rejected": -2.2886548042297363, "logps/chosen": -0.368958979845047, "logps/rejected": -0.3552514314651489, "loss": 0.6874, "rewards/accuracies": 1.0, "rewards/chosen": 0.8074679374694824, "rewards/margins": 0.011448442935943604, "rewards/rejected": 0.7960194945335388, "step": 5247 }, { "epoch": 2.83, "learning_rate": 8.335004373498488e-10, "logits/chosen": -2.052718162536621, "logits/rejected": -2.2828075885772705, "logps/chosen": -0.4600123167037964, "logps/rejected": -1.5915415287017822, "loss": 0.6216, "rewards/accuracies": 1.0, "rewards/chosen": 1.0187016725540161, "rewards/margins": 0.1486274003982544, "rewards/rejected": 0.8700742721557617, "step": 5248 }, { "epoch": 2.83, "learning_rate": 8.282146530046608e-10, "logits/chosen": -2.176377534866333, "logits/rejected": -2.2022016048431396, "logps/chosen": -14.866992950439453, "logps/rejected": -3.82669734954834, "loss": 0.3485, "rewards/accuracies": 1.0, "rewards/chosen": 1.893967866897583, "rewards/margins": 0.8747323751449585, "rewards/rejected": 1.0192354917526245, "step": 5249 }, { "epoch": 2.83, "learning_rate": 8.229455423913012e-10, "logits/chosen": -2.046739101409912, "logits/rejected": -2.302316665649414, "logps/chosen": -2.1979267597198486, "logps/rejected": -1.9610910415649414, "loss": 0.6748, "rewards/accuracies": 1.0, "rewards/chosen": 1.204342246055603, "rewards/margins": 0.0369952917098999, "rewards/rejected": 1.1673469543457031, "step": 5250 }, { "epoch": 2.83, "learning_rate": 8.176931072964743e-10, "logits/chosen": -2.0526278018951416, "logits/rejected": -2.060863494873047, "logps/chosen": -1.193673014640808, "logps/rejected": -3.5047528743743896, "loss": 0.4601, "rewards/accuracies": 1.0, "rewards/chosen": 1.0661507844924927, "rewards/margins": 0.5373622179031372, "rewards/rejected": 0.5287885665893555, "step": 5251 }, { "epoch": 2.83, "learning_rate": 8.124573495012443e-10, "logits/chosen": -1.982079029083252, "logits/rejected": -2.2411680221557617, "logps/chosen": -0.19134117662906647, "logps/rejected": -0.2098226547241211, "loss": 0.6791, "rewards/accuracies": 1.0, "rewards/chosen": 0.8421568274497986, "rewards/margins": 0.028306126594543457, "rewards/rejected": 0.8138507008552551, "step": 5252 }, { "epoch": 2.83, "learning_rate": 8.072382707810022e-10, "logits/chosen": -2.083869457244873, "logits/rejected": -2.2668991088867188, "logps/chosen": -0.16517166793346405, "logps/rejected": -0.16836170852184296, "loss": 0.6728, "rewards/accuracies": 1.0, "rewards/chosen": 0.8719803094863892, "rewards/margins": 0.04109823703765869, "rewards/rejected": 0.8308820724487305, "step": 5253 }, { "epoch": 2.83, "learning_rate": 8.020358729054987e-10, "logits/chosen": -2.034929037094116, "logits/rejected": -2.252868175506592, "logps/chosen": -0.49071162939071655, "logps/rejected": -0.45827579498291016, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.8553163409233093, "rewards/margins": 0.01682382822036743, "rewards/rejected": 0.8384925127029419, "step": 5254 }, { "epoch": 2.83, "learning_rate": 7.968501576388232e-10, "logits/chosen": -2.178342342376709, "logits/rejected": -2.042206287384033, "logps/chosen": -29.742807388305664, "logps/rejected": -17.873506546020508, "loss": 0.275, "rewards/accuracies": 1.0, "rewards/chosen": 1.9499372243881226, "rewards/margins": 1.1502606868743896, "rewards/rejected": 0.7996765375137329, "step": 5255 }, { "epoch": 2.83, "learning_rate": 7.91681126739402e-10, "logits/chosen": -2.0550954341888428, "logits/rejected": -2.2815544605255127, "logps/chosen": -0.3512475788593292, "logps/rejected": -0.3931809067726135, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8064403533935547, "rewards/margins": 0.019224822521209717, "rewards/rejected": 0.787215530872345, "step": 5256 }, { "epoch": 2.84, "learning_rate": 7.865287819600164e-10, "logits/chosen": -2.0306057929992676, "logits/rejected": -2.241459608078003, "logps/chosen": -0.24445679783821106, "logps/rejected": -0.3116180896759033, "loss": 0.6834, "rewards/accuracies": 1.0, "rewards/chosen": 0.8916053771972656, "rewards/margins": 0.01949363946914673, "rewards/rejected": 0.8721117377281189, "step": 5257 }, { "epoch": 2.84, "learning_rate": 7.813931250477745e-10, "logits/chosen": -2.0581772327423096, "logits/rejected": -2.2183923721313477, "logps/chosen": -4.235384941101074, "logps/rejected": -0.5902249813079834, "loss": 0.7995, "rewards/accuracies": 0.0, "rewards/chosen": 0.510343074798584, "rewards/margins": -0.20243752002716064, "rewards/rejected": 0.7127805948257446, "step": 5258 }, { "epoch": 2.84, "learning_rate": 7.76274157744139e-10, "logits/chosen": -2.1673076152801514, "logits/rejected": -2.3261847496032715, "logps/chosen": -3.7635409832000732, "logps/rejected": -3.43172287940979, "loss": 0.7086, "rewards/accuracies": 0.0, "rewards/chosen": 0.928875744342804, "rewards/margins": -0.030753910541534424, "rewards/rejected": 0.9596296548843384, "step": 5259 }, { "epoch": 2.84, "learning_rate": 7.7117188178491e-10, "logits/chosen": -2.165435314178467, "logits/rejected": -2.3053016662597656, "logps/chosen": -1.263379693031311, "logps/rejected": -1.270368218421936, "loss": 0.7047, "rewards/accuracies": 0.0, "rewards/chosen": 1.0396517515182495, "rewards/margins": -0.022950291633605957, "rewards/rejected": 1.0626020431518555, "step": 5260 }, { "epoch": 2.84, "learning_rate": 7.660862989002204e-10, "logits/chosen": -2.0184290409088135, "logits/rejected": -2.024232864379883, "logps/chosen": -1.5393959283828735, "logps/rejected": -4.797725200653076, "loss": 0.4205, "rewards/accuracies": 1.0, "rewards/chosen": 1.133230209350586, "rewards/margins": 0.6488059759140015, "rewards/rejected": 0.48442426323890686, "step": 5261 }, { "epoch": 2.84, "learning_rate": 7.610174108145462e-10, "logits/chosen": -2.095609664916992, "logits/rejected": -2.2920303344726562, "logps/chosen": -0.1950223594903946, "logps/rejected": -0.18692055344581604, "loss": 0.6679, "rewards/accuracies": 1.0, "rewards/chosen": 0.9048763513565063, "rewards/margins": 0.05118793249130249, "rewards/rejected": 0.8536884188652039, "step": 5262 }, { "epoch": 2.84, "learning_rate": 7.559652192467125e-10, "logits/chosen": -2.0517165660858154, "logits/rejected": -2.2733521461486816, "logps/chosen": -0.4152154326438904, "logps/rejected": -0.5178627967834473, "loss": 0.6802, "rewards/accuracies": 1.0, "rewards/chosen": 0.97809237241745, "rewards/margins": 0.026157379150390625, "rewards/rejected": 0.9519349932670593, "step": 5263 }, { "epoch": 2.84, "learning_rate": 7.509297259098601e-10, "logits/chosen": -1.9943956136703491, "logits/rejected": -1.998035192489624, "logps/chosen": -0.9083797931671143, "logps/rejected": -4.513174533843994, "loss": 0.4717, "rewards/accuracies": 1.0, "rewards/chosen": 1.0717970132827759, "rewards/margins": 0.5063062310218811, "rewards/rejected": 0.5654907822608948, "step": 5264 }, { "epoch": 2.84, "learning_rate": 7.459109325115009e-10, "logits/chosen": -1.963512897491455, "logits/rejected": -2.267615556716919, "logps/chosen": -2.484494686126709, "logps/rejected": -2.518270492553711, "loss": 0.6992, "rewards/accuracies": 0.0, "rewards/chosen": 0.8630321621894836, "rewards/margins": -0.01198434829711914, "rewards/rejected": 0.8750165104866028, "step": 5265 }, { "epoch": 2.84, "learning_rate": 7.409088407534514e-10, "logits/chosen": -2.0334413051605225, "logits/rejected": -2.2365901470184326, "logps/chosen": -3.6614317893981934, "logps/rejected": -5.58400821685791, "loss": 0.5796, "rewards/accuracies": 1.0, "rewards/chosen": 0.93622225522995, "rewards/margins": 0.24173849821090698, "rewards/rejected": 0.694483757019043, "step": 5266 }, { "epoch": 2.84, "learning_rate": 7.359234523318825e-10, "logits/chosen": -2.135300397872925, "logits/rejected": -2.304227113723755, "logps/chosen": -0.32898572087287903, "logps/rejected": -0.43040668964385986, "loss": 0.6832, "rewards/accuracies": 1.0, "rewards/chosen": 0.9621649980545044, "rewards/margins": 0.020050227642059326, "rewards/rejected": 0.9421147704124451, "step": 5267 }, { "epoch": 2.84, "learning_rate": 7.309547689372975e-10, "logits/chosen": -2.176131248474121, "logits/rejected": -2.303812026977539, "logps/chosen": -1.0964281558990479, "logps/rejected": -1.0604164600372314, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0719082355499268, "rewards/margins": 0.007337450981140137, "rewards/rejected": 1.0645707845687866, "step": 5268 }, { "epoch": 2.84, "learning_rate": 7.260027922545319e-10, "logits/chosen": -2.162076473236084, "logits/rejected": -2.357689619064331, "logps/chosen": -0.7926647663116455, "logps/rejected": -0.840966522693634, "loss": 0.6895, "rewards/accuracies": 1.0, "rewards/chosen": 1.0591599941253662, "rewards/margins": 0.00724339485168457, "rewards/rejected": 1.0519165992736816, "step": 5269 }, { "epoch": 2.84, "learning_rate": 7.210675239627595e-10, "logits/chosen": -2.0143911838531494, "logits/rejected": -2.0002827644348145, "logps/chosen": -9.88094425201416, "logps/rejected": -7.659981727600098, "loss": 0.231, "rewards/accuracies": 1.0, "rewards/chosen": 2.1185410022735596, "rewards/margins": 1.3478248119354248, "rewards/rejected": 0.7707161903381348, "step": 5270 }, { "epoch": 2.84, "learning_rate": 7.161489657354858e-10, "logits/chosen": -2.1606364250183105, "logits/rejected": -2.1541976928710938, "logps/chosen": -3.5151124000549316, "logps/rejected": -7.823944091796875, "loss": 0.2749, "rewards/accuracies": 1.0, "rewards/chosen": 1.4587430953979492, "rewards/margins": 1.15080726146698, "rewards/rejected": 0.30793580412864685, "step": 5271 }, { "epoch": 2.84, "learning_rate": 7.112471192405545e-10, "logits/chosen": -2.0769617557525635, "logits/rejected": -2.2269442081451416, "logps/chosen": -1.1030553579330444, "logps/rejected": -1.0827816724777222, "loss": 0.7037, "rewards/accuracies": 0.0, "rewards/chosen": 0.9496889114379883, "rewards/margins": -0.021012961864471436, "rewards/rejected": 0.9707018733024597, "step": 5272 }, { "epoch": 2.84, "learning_rate": 7.063619861401415e-10, "logits/chosen": -2.0866026878356934, "logits/rejected": -2.094614267349243, "logps/chosen": -3.6986007690429688, "logps/rejected": -11.460672378540039, "loss": 0.6583, "rewards/accuracies": 1.0, "rewards/chosen": 0.8477993011474609, "rewards/margins": 0.07086235284805298, "rewards/rejected": 0.776936948299408, "step": 5273 }, { "epoch": 2.84, "learning_rate": 7.014935680907385e-10, "logits/chosen": -2.1036906242370605, "logits/rejected": -2.29335355758667, "logps/chosen": -1.0590054988861084, "logps/rejected": -6.5521111488342285, "loss": 0.5546, "rewards/accuracies": 1.0, "rewards/chosen": 1.0216448307037354, "rewards/margins": 0.2995123863220215, "rewards/rejected": 0.7221324443817139, "step": 5274 }, { "epoch": 2.85, "learning_rate": 6.966418667432139e-10, "logits/chosen": -2.1593260765075684, "logits/rejected": -2.1371233463287354, "logps/chosen": -8.201192855834961, "logps/rejected": -2.555393934249878, "loss": 0.2599, "rewards/accuracies": 1.0, "rewards/chosen": 1.8858801126480103, "rewards/margins": 1.2147947549819946, "rewards/rejected": 0.6710853576660156, "step": 5275 }, { "epoch": 2.85, "learning_rate": 6.918068837427127e-10, "logits/chosen": -2.110197067260742, "logits/rejected": -2.1060080528259277, "logps/chosen": -1.0079090595245361, "logps/rejected": -3.1660845279693604, "loss": 0.4806, "rewards/accuracies": 1.0, "rewards/chosen": 1.1673873662948608, "rewards/margins": 0.4829164147377014, "rewards/rejected": 0.6844709515571594, "step": 5276 }, { "epoch": 2.85, "learning_rate": 6.869886207287457e-10, "logits/chosen": -2.1648850440979004, "logits/rejected": -2.3367204666137695, "logps/chosen": -1.4640836715698242, "logps/rejected": -1.3757526874542236, "loss": 0.6824, "rewards/accuracies": 1.0, "rewards/chosen": 1.1771377325057983, "rewards/margins": 0.021675467491149902, "rewards/rejected": 1.1554622650146484, "step": 5277 }, { "epoch": 2.85, "learning_rate": 6.821870793351447e-10, "logits/chosen": -2.1334569454193115, "logits/rejected": -2.340677261352539, "logps/chosen": -4.635461807250977, "logps/rejected": -4.408782005310059, "loss": 0.6911, "rewards/accuracies": 1.0, "rewards/chosen": 0.7428426146507263, "rewards/margins": 0.004063069820404053, "rewards/rejected": 0.7387795448303223, "step": 5278 }, { "epoch": 2.85, "learning_rate": 6.774022611900687e-10, "logits/chosen": -2.0121660232543945, "logits/rejected": -2.2997732162475586, "logps/chosen": -2.8195369243621826, "logps/rejected": -2.7494254112243652, "loss": 0.6744, "rewards/accuracies": 1.0, "rewards/chosen": 0.5003505945205688, "rewards/margins": 0.037923961877822876, "rewards/rejected": 0.46242663264274597, "step": 5279 }, { "epoch": 2.85, "learning_rate": 6.726341679160141e-10, "logits/chosen": -2.2084717750549316, "logits/rejected": -2.370004653930664, "logps/chosen": -4.278541088104248, "logps/rejected": -0.9162935018539429, "loss": 0.8224, "rewards/accuracies": 0.0, "rewards/chosen": 0.9023036956787109, "rewards/margins": -0.2436976432800293, "rewards/rejected": 1.1460013389587402, "step": 5280 }, { "epoch": 2.85, "learning_rate": 6.678828011297932e-10, "logits/chosen": -2.097644329071045, "logits/rejected": -2.1080989837646484, "logps/chosen": -0.5370105504989624, "logps/rejected": -12.251171112060547, "loss": 0.5586, "rewards/accuracies": 1.0, "rewards/chosen": 1.023772120475769, "rewards/margins": 0.2900235056877136, "rewards/rejected": 0.7337486147880554, "step": 5281 }, { "epoch": 2.85, "learning_rate": 6.631481624425561e-10, "logits/chosen": -2.1314268112182617, "logits/rejected": -2.332044839859009, "logps/chosen": -1.448155403137207, "logps/rejected": -1.0119950771331787, "loss": 0.6943, "rewards/accuracies": 0.0, "rewards/chosen": 0.9741763472557068, "rewards/margins": -0.0023090243339538574, "rewards/rejected": 0.9764853715896606, "step": 5282 }, { "epoch": 2.85, "learning_rate": 6.584302534597852e-10, "logits/chosen": -2.0714123249053955, "logits/rejected": -2.2802517414093018, "logps/chosen": -6.32182502746582, "logps/rejected": -2.931743860244751, "loss": 0.718, "rewards/accuracies": 0.0, "rewards/chosen": 0.8661909103393555, "rewards/margins": -0.04912424087524414, "rewards/rejected": 0.9153151512145996, "step": 5283 }, { "epoch": 2.85, "learning_rate": 6.53729075781273e-10, "logits/chosen": -2.158627986907959, "logits/rejected": -2.159451723098755, "logps/chosen": -0.17551298439502716, "logps/rejected": -3.119619131088257, "loss": 0.4878, "rewards/accuracies": 1.0, "rewards/chosen": 1.0277807712554932, "rewards/margins": 0.4639362692832947, "rewards/rejected": 0.5638445019721985, "step": 5284 }, { "epoch": 2.85, "learning_rate": 6.490446310011554e-10, "logits/chosen": -1.9400449991226196, "logits/rejected": -1.9471194744110107, "logps/chosen": -1.3331904411315918, "logps/rejected": -5.730525493621826, "loss": 0.4078, "rewards/accuracies": 1.0, "rewards/chosen": 0.9890546798706055, "rewards/margins": 0.6862834692001343, "rewards/rejected": 0.3027712404727936, "step": 5285 }, { "epoch": 2.85, "learning_rate": 6.443769207078841e-10, "logits/chosen": -2.01515793800354, "logits/rejected": -2.0239975452423096, "logps/chosen": -1.4467177391052246, "logps/rejected": -3.9304089546203613, "loss": 0.4758, "rewards/accuracies": 1.0, "rewards/chosen": 0.976933479309082, "rewards/margins": 0.49556154012680054, "rewards/rejected": 0.4813719391822815, "step": 5286 }, { "epoch": 2.85, "learning_rate": 6.397259464842375e-10, "logits/chosen": -2.047501802444458, "logits/rejected": -2.047398090362549, "logps/chosen": -5.437220573425293, "logps/rejected": -2.8359155654907227, "loss": 0.294, "rewards/accuracies": 1.0, "rewards/chosen": 1.6478279829025269, "rewards/margins": 1.0736973285675049, "rewards/rejected": 0.5741307139396667, "step": 5287 }, { "epoch": 2.85, "learning_rate": 6.350917099073317e-10, "logits/chosen": -1.9989182949066162, "logits/rejected": -2.0002291202545166, "logps/chosen": -5.751920700073242, "logps/rejected": -1.1327341794967651, "loss": 0.419, "rewards/accuracies": 1.0, "rewards/chosen": 1.2657396793365479, "rewards/margins": 0.653171718120575, "rewards/rejected": 0.6125679612159729, "step": 5288 }, { "epoch": 2.85, "learning_rate": 6.304742125485873e-10, "logits/chosen": -2.0593156814575195, "logits/rejected": -2.267054319381714, "logps/chosen": -0.33824682235717773, "logps/rejected": -0.3068455755710602, "loss": 0.6876, "rewards/accuracies": 1.0, "rewards/chosen": 0.8097549676895142, "rewards/margins": 0.011105656623840332, "rewards/rejected": 0.7986493110656738, "step": 5289 }, { "epoch": 2.85, "learning_rate": 6.258734559737578e-10, "logits/chosen": -2.218810796737671, "logits/rejected": -2.04131817817688, "logps/chosen": -35.110008239746094, "logps/rejected": -3.2632079124450684, "loss": 0.0915, "rewards/accuracies": 1.0, "rewards/chosen": 2.8592541217803955, "rewards/margins": 2.3451085090637207, "rewards/rejected": 0.5141456127166748, "step": 5290 }, { "epoch": 2.85, "learning_rate": 6.212894417429226e-10, "logits/chosen": -2.0802290439605713, "logits/rejected": -2.0760841369628906, "logps/chosen": -2.6380441188812256, "logps/rejected": -5.331273078918457, "loss": 0.5037, "rewards/accuracies": 1.0, "rewards/chosen": 0.9713774919509888, "rewards/margins": 0.42329883575439453, "rewards/rejected": 0.5480786561965942, "step": 5291 }, { "epoch": 2.85, "learning_rate": 6.167221714104831e-10, "logits/chosen": -2.238527536392212, "logits/rejected": -2.2404918670654297, "logps/chosen": -0.5019834041595459, "logps/rejected": -3.8296635150909424, "loss": 0.4999, "rewards/accuracies": 1.0, "rewards/chosen": 0.7265897989273071, "rewards/margins": 0.4329812824726105, "rewards/rejected": 0.29360851645469666, "step": 5292 }, { "epoch": 2.85, "learning_rate": 6.121716465251669e-10, "logits/chosen": -2.054492473602295, "logits/rejected": -2.0599541664123535, "logps/chosen": -1.792954683303833, "logps/rejected": -4.816975116729736, "loss": 0.4328, "rewards/accuracies": 1.0, "rewards/chosen": 1.2025227546691895, "rewards/margins": 0.6133862733840942, "rewards/rejected": 0.5891364812850952, "step": 5293 }, { "epoch": 2.86, "learning_rate": 6.07637868630001e-10, "logits/chosen": -2.1910321712493896, "logits/rejected": -2.190089702606201, "logps/chosen": -0.6310604214668274, "logps/rejected": -2.4794156551361084, "loss": 0.5301, "rewards/accuracies": 1.0, "rewards/chosen": 1.0794181823730469, "rewards/margins": 0.35783928632736206, "rewards/rejected": 0.7215788960456848, "step": 5294 }, { "epoch": 2.86, "learning_rate": 6.031208392623665e-10, "logits/chosen": -2.135261058807373, "logits/rejected": -2.051535129547119, "logps/chosen": -13.264860153198242, "logps/rejected": -4.189014434814453, "loss": 0.3446, "rewards/accuracies": 1.0, "rewards/chosen": 1.6357157230377197, "rewards/margins": 0.8881058096885681, "rewards/rejected": 0.7476099133491516, "step": 5295 }, { "epoch": 2.86, "learning_rate": 5.98620559953944e-10, "logits/chosen": -2.0920820236206055, "logits/rejected": -2.1192853450775146, "logps/chosen": -18.218406677246094, "logps/rejected": -14.352088928222656, "loss": 0.1265, "rewards/accuracies": 1.0, "rewards/chosen": 2.3767716884613037, "rewards/margins": 2.0034544467926025, "rewards/rejected": 0.37331733107566833, "step": 5296 }, { "epoch": 2.86, "learning_rate": 5.941370322307405e-10, "logits/chosen": -2.205836772918701, "logits/rejected": -2.2075395584106445, "logps/chosen": -1.0659714937210083, "logps/rejected": -3.363600254058838, "loss": 0.4576, "rewards/accuracies": 1.0, "rewards/chosen": 1.2890790700912476, "rewards/margins": 0.5441495776176453, "rewards/rejected": 0.7449294924736023, "step": 5297 }, { "epoch": 2.86, "learning_rate": 5.896702576130841e-10, "logits/chosen": -2.002420663833618, "logits/rejected": -2.002858877182007, "logps/chosen": -1.9951423406600952, "logps/rejected": -1.0158089399337769, "loss": 0.7097, "rewards/accuracies": 0.0, "rewards/chosen": 1.01308012008667, "rewards/margins": -0.032859206199645996, "rewards/rejected": 1.045939326286316, "step": 5298 }, { "epoch": 2.86, "learning_rate": 5.852202376156134e-10, "logits/chosen": -2.0897254943847656, "logits/rejected": -2.0687367916107178, "logps/chosen": -8.221305847167969, "logps/rejected": -1.4530739784240723, "loss": 0.3681, "rewards/accuracies": 1.0, "rewards/chosen": 1.6702663898468018, "rewards/margins": 0.8095829486846924, "rewards/rejected": 0.8606834411621094, "step": 5299 }, { "epoch": 2.86, "learning_rate": 5.807869737472993e-10, "logits/chosen": -2.0728354454040527, "logits/rejected": -2.062542676925659, "logps/chosen": -0.32566389441490173, "logps/rejected": -6.966529846191406, "loss": 0.4427, "rewards/accuracies": 1.0, "rewards/chosen": 0.9264689683914185, "rewards/margins": 0.5854644775390625, "rewards/rejected": 0.34100446105003357, "step": 5300 }, { "epoch": 2.86, "learning_rate": 5.763704675114222e-10, "logits/chosen": -2.165205955505371, "logits/rejected": -2.1608612537384033, "logps/chosen": -3.131838798522949, "logps/rejected": -3.6281771659851074, "loss": 0.3389, "rewards/accuracies": 1.0, "rewards/chosen": 1.5817925930023193, "rewards/margins": 0.9077678322792053, "rewards/rejected": 0.674024760723114, "step": 5301 }, { "epoch": 2.86, "learning_rate": 5.719707204055735e-10, "logits/chosen": -2.0485939979553223, "logits/rejected": -2.0355353355407715, "logps/chosen": -2.0025293827056885, "logps/rejected": -5.129254341125488, "loss": 0.4559, "rewards/accuracies": 1.0, "rewards/chosen": 1.3996636867523193, "rewards/margins": 0.5488523840904236, "rewards/rejected": 0.8508113026618958, "step": 5302 }, { "epoch": 2.86, "learning_rate": 5.675877339216817e-10, "logits/chosen": -1.9971596002578735, "logits/rejected": -2.006666660308838, "logps/chosen": -1.5765652656555176, "logps/rejected": -3.286064624786377, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0117021799087524, "rewards/margins": 0.509746789932251, "rewards/rejected": 0.5019553899765015, "step": 5303 }, { "epoch": 2.86, "learning_rate": 5.632215095459691e-10, "logits/chosen": -2.066157341003418, "logits/rejected": -2.0684759616851807, "logps/chosen": -5.305917739868164, "logps/rejected": -3.700248956680298, "loss": 0.5653, "rewards/accuracies": 1.0, "rewards/chosen": 0.9193698763847351, "rewards/margins": 0.27438944578170776, "rewards/rejected": 0.6449804306030273, "step": 5304 }, { "epoch": 2.86, "learning_rate": 5.58872048758996e-10, "logits/chosen": -1.977065920829773, "logits/rejected": -1.9770667552947998, "logps/chosen": -0.13841135799884796, "logps/rejected": -7.64856481552124, "loss": 0.4922, "rewards/accuracies": 1.0, "rewards/chosen": 0.8280081748962402, "rewards/margins": 0.45280393958091736, "rewards/rejected": 0.3752042353153229, "step": 5305 }, { "epoch": 2.86, "learning_rate": 5.545393530356157e-10, "logits/chosen": -2.105001211166382, "logits/rejected": -2.25482439994812, "logps/chosen": -0.5736088752746582, "logps/rejected": -0.6894821524620056, "loss": 0.6833, "rewards/accuracies": 1.0, "rewards/chosen": 1.032881736755371, "rewards/margins": 0.019770145416259766, "rewards/rejected": 1.0131115913391113, "step": 5306 }, { "epoch": 2.86, "learning_rate": 5.502234238450143e-10, "logits/chosen": -2.1243624687194824, "logits/rejected": -2.247877359390259, "logps/chosen": -0.1886511743068695, "logps/rejected": -0.2635560929775238, "loss": 0.6772, "rewards/accuracies": 1.0, "rewards/chosen": 0.9055251479148865, "rewards/margins": 0.0321010947227478, "rewards/rejected": 0.8734240531921387, "step": 5307 }, { "epoch": 2.86, "learning_rate": 5.459242626506932e-10, "logits/chosen": -2.0303120613098145, "logits/rejected": -1.9992618560791016, "logps/chosen": -5.125393867492676, "logps/rejected": -3.887559652328491, "loss": 0.3064, "rewards/accuracies": 1.0, "rewards/chosen": 1.4890964031219482, "rewards/margins": 1.0257073640823364, "rewards/rejected": 0.46338900923728943, "step": 5308 }, { "epoch": 2.86, "learning_rate": 5.416418709104532e-10, "logits/chosen": -2.060657024383545, "logits/rejected": -2.3013813495635986, "logps/chosen": -1.0325459241867065, "logps/rejected": -0.9649986624717712, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 1.003433346748352, "rewards/margins": 0.030784904956817627, "rewards/rejected": 0.9726484417915344, "step": 5309 }, { "epoch": 2.86, "learning_rate": 5.373762500764101e-10, "logits/chosen": -1.9765689373016357, "logits/rejected": -1.9936777353286743, "logps/chosen": -1.0684808492660522, "logps/rejected": -13.844712257385254, "loss": 0.6981, "rewards/accuracies": 0.0, "rewards/chosen": 0.8997414708137512, "rewards/margins": -0.009918034076690674, "rewards/rejected": 0.9096595048904419, "step": 5310 }, { "epoch": 2.86, "learning_rate": 5.331274015950183e-10, "logits/chosen": -2.0616183280944824, "logits/rejected": -2.062408447265625, "logps/chosen": -4.744778156280518, "logps/rejected": -4.260921955108643, "loss": 0.2863, "rewards/accuracies": 1.0, "rewards/chosen": 1.549584984779358, "rewards/margins": 1.1039519309997559, "rewards/rejected": 0.44563308358192444, "step": 5311 }, { "epoch": 2.87, "learning_rate": 5.288953269070084e-10, "logits/chosen": -2.1202266216278076, "logits/rejected": -2.242398738861084, "logps/chosen": -1.1432790756225586, "logps/rejected": -1.238267183303833, "loss": 0.6764, "rewards/accuracies": 1.0, "rewards/chosen": 0.9040715098381042, "rewards/margins": 0.03377443552017212, "rewards/rejected": 0.8702970743179321, "step": 5312 }, { "epoch": 2.87, "learning_rate": 5.246800274474439e-10, "logits/chosen": -1.9972962141036987, "logits/rejected": -2.277149200439453, "logps/chosen": -0.39054134488105774, "logps/rejected": -0.4998649060726166, "loss": 0.6849, "rewards/accuracies": 1.0, "rewards/chosen": 0.8836313486099243, "rewards/margins": 0.016509950160980225, "rewards/rejected": 0.8671213984489441, "step": 5313 }, { "epoch": 2.87, "learning_rate": 5.204815046457034e-10, "logits/chosen": -2.089916706085205, "logits/rejected": -2.2619950771331787, "logps/chosen": -4.676585674285889, "logps/rejected": -4.449283599853516, "loss": 0.692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0254143476486206, "rewards/margins": 0.002254962921142578, "rewards/rejected": 1.023159384727478, "step": 5314 }, { "epoch": 2.87, "learning_rate": 5.162997599254703e-10, "logits/chosen": -2.1290249824523926, "logits/rejected": -2.0733630657196045, "logps/chosen": -16.63445281982422, "logps/rejected": -3.9116790294647217, "loss": 0.2651, "rewards/accuracies": 1.0, "rewards/chosen": 1.7611385583877563, "rewards/margins": 1.1921613216400146, "rewards/rejected": 0.5689772963523865, "step": 5315 }, { "epoch": 2.87, "learning_rate": 5.121347947047327e-10, "logits/chosen": -2.1042656898498535, "logits/rejected": -2.1038787364959717, "logps/chosen": -1.2212839126586914, "logps/rejected": -2.850708246231079, "loss": 0.585, "rewards/accuracies": 1.0, "rewards/chosen": 0.9819459319114685, "rewards/margins": 0.2293940782546997, "rewards/rejected": 0.7525518536567688, "step": 5316 }, { "epoch": 2.87, "learning_rate": 5.079866103957942e-10, "logits/chosen": -2.1399731636047363, "logits/rejected": -2.147148370742798, "logps/chosen": -1.296453595161438, "logps/rejected": -4.49322509765625, "loss": 0.4232, "rewards/accuracies": 1.0, "rewards/chosen": 1.103880763053894, "rewards/margins": 0.6407366991043091, "rewards/rejected": 0.4631440341472626, "step": 5317 }, { "epoch": 2.87, "learning_rate": 5.038552084052794e-10, "logits/chosen": -2.016204357147217, "logits/rejected": -2.0129826068878174, "logps/chosen": -4.481776714324951, "logps/rejected": -1.7507613897323608, "loss": 0.309, "rewards/accuracies": 1.0, "rewards/chosen": 1.6338809728622437, "rewards/margins": 1.0158449411392212, "rewards/rejected": 0.6180360317230225, "step": 5318 }, { "epoch": 2.87, "learning_rate": 4.997405901340956e-10, "logits/chosen": -2.028911590576172, "logits/rejected": -2.2654049396514893, "logps/chosen": -8.127476692199707, "logps/rejected": -6.857584476470947, "loss": 0.735, "rewards/accuracies": 0.0, "rewards/chosen": 0.5155770182609558, "rewards/margins": -0.08202368021011353, "rewards/rejected": 0.5976006984710693, "step": 5319 }, { "epoch": 2.87, "learning_rate": 4.956427569774879e-10, "logits/chosen": -2.0229098796844482, "logits/rejected": -2.0653326511383057, "logps/chosen": -1.1393520832061768, "logps/rejected": -14.549604415893555, "loss": 0.6782, "rewards/accuracies": 1.0, "rewards/chosen": 0.7680259943008423, "rewards/margins": 0.030151784420013428, "rewards/rejected": 0.7378742098808289, "step": 5320 }, { "epoch": 2.87, "learning_rate": 4.915617103249892e-10, "logits/chosen": -2.219146490097046, "logits/rejected": -2.3501951694488525, "logps/chosen": -1.574115514755249, "logps/rejected": -1.635789155960083, "loss": 0.6822, "rewards/accuracies": 1.0, "rewards/chosen": 1.0339252948760986, "rewards/margins": 0.022043585777282715, "rewards/rejected": 1.011881709098816, "step": 5321 }, { "epoch": 2.87, "learning_rate": 4.874974515604479e-10, "logits/chosen": -2.096796989440918, "logits/rejected": -2.127110004425049, "logps/chosen": -3.2148873805999756, "logps/rejected": -5.974590301513672, "loss": 0.3598, "rewards/accuracies": 1.0, "rewards/chosen": 1.6254262924194336, "rewards/margins": 0.8369638323783875, "rewards/rejected": 0.7884624600410461, "step": 5322 }, { "epoch": 2.87, "learning_rate": 4.834499820620175e-10, "logits/chosen": -2.084667921066284, "logits/rejected": -2.0858712196350098, "logps/chosen": -3.1039249897003174, "logps/rejected": -13.059228897094727, "loss": 0.285, "rewards/accuracies": 1.0, "rewards/chosen": 1.239927053451538, "rewards/margins": 1.1093337535858154, "rewards/rejected": 0.13059329986572266, "step": 5323 }, { "epoch": 2.87, "learning_rate": 4.794193032021665e-10, "logits/chosen": -2.1529409885406494, "logits/rejected": -2.148184061050415, "logps/chosen": -6.804015159606934, "logps/rejected": -5.792989730834961, "loss": 0.3717, "rewards/accuracies": 1.0, "rewards/chosen": 1.1148698329925537, "rewards/margins": 0.798117995262146, "rewards/rejected": 0.3167518675327301, "step": 5324 }, { "epoch": 2.87, "learning_rate": 4.754054163476629e-10, "logits/chosen": -2.0927977561950684, "logits/rejected": -2.0942418575286865, "logps/chosen": -2.16964054107666, "logps/rejected": -4.446686744689941, "loss": 0.2882, "rewards/accuracies": 1.0, "rewards/chosen": 1.5746511220932007, "rewards/margins": 1.096648931503296, "rewards/rejected": 0.4780021607875824, "step": 5325 }, { "epoch": 2.87, "learning_rate": 4.714083228595789e-10, "logits/chosen": -1.9896925687789917, "logits/rejected": -2.253966808319092, "logps/chosen": -1.7024534940719604, "logps/rejected": -1.873582124710083, "loss": 0.6972, "rewards/accuracies": 0.0, "rewards/chosen": 0.9708328247070312, "rewards/margins": -0.008139252662658691, "rewards/rejected": 0.9789720773696899, "step": 5326 }, { "epoch": 2.87, "learning_rate": 4.674280240932916e-10, "logits/chosen": -2.1353273391723633, "logits/rejected": -2.333085298538208, "logps/chosen": -0.06250165402889252, "logps/rejected": -0.06319751590490341, "loss": 0.6793, "rewards/accuracies": 1.0, "rewards/chosen": 0.7666806578636169, "rewards/margins": 0.027928292751312256, "rewards/rejected": 0.7387523651123047, "step": 5327 }, { "epoch": 2.87, "learning_rate": 4.6346452139849337e-10, "logits/chosen": -2.0434510707855225, "logits/rejected": -2.2745463848114014, "logps/chosen": -6.475546836853027, "logps/rejected": -4.257998943328857, "loss": 0.6677, "rewards/accuracies": 1.0, "rewards/chosen": 0.8764088749885559, "rewards/margins": 0.05146455764770508, "rewards/rejected": 0.8249443173408508, "step": 5328 }, { "epoch": 2.87, "learning_rate": 4.595178161191704e-10, "logits/chosen": -2.1637091636657715, "logits/rejected": -2.1659200191497803, "logps/chosen": -0.27847301959991455, "logps/rejected": -5.294415473937988, "loss": 0.4577, "rewards/accuracies": 1.0, "rewards/chosen": 0.8786866068840027, "rewards/margins": 0.5439486503601074, "rewards/rejected": 0.33473798632621765, "step": 5329 }, { "epoch": 2.87, "learning_rate": 4.5558790959361327e-10, "logits/chosen": -2.046046495437622, "logits/rejected": -2.308572292327881, "logps/chosen": -0.7260379195213318, "logps/rejected": -0.7559643983840942, "loss": 0.6784, "rewards/accuracies": 1.0, "rewards/chosen": 0.8170695304870605, "rewards/margins": 0.029654383659362793, "rewards/rejected": 0.7874151468276978, "step": 5330 }, { "epoch": 2.88, "learning_rate": 4.5167480315442817e-10, "logits/chosen": -1.99418306350708, "logits/rejected": -1.9931213855743408, "logps/chosen": -2.7098429203033447, "logps/rejected": -3.460730791091919, "loss": 0.4124, "rewards/accuracies": 1.0, "rewards/chosen": 1.2030919790267944, "rewards/margins": 0.6724051237106323, "rewards/rejected": 0.5306868553161621, "step": 5331 }, { "epoch": 2.88, "learning_rate": 4.477784981285093e-10, "logits/chosen": -2.040198564529419, "logits/rejected": -2.3106353282928467, "logps/chosen": -0.4513489305973053, "logps/rejected": -0.5245234370231628, "loss": 0.6961, "rewards/accuracies": 0.0, "rewards/chosen": 1.0598220825195312, "rewards/margins": -0.005922198295593262, "rewards/rejected": 1.0657442808151245, "step": 5332 }, { "epoch": 2.88, "learning_rate": 4.4389899583706093e-10, "logits/chosen": -2.120290517807007, "logits/rejected": -2.12180233001709, "logps/chosen": -1.7047265768051147, "logps/rejected": -1.5431784391403198, "loss": 0.5645, "rewards/accuracies": 1.0, "rewards/chosen": 1.0195143222808838, "rewards/margins": 0.2763736844062805, "rewards/rejected": 0.7431406378746033, "step": 5333 }, { "epoch": 2.88, "learning_rate": 4.4003629759558626e-10, "logits/chosen": -2.125527858734131, "logits/rejected": -2.1254143714904785, "logps/chosen": -1.066969394683838, "logps/rejected": -1.1587952375411987, "loss": 0.5727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0444309711456299, "rewards/margins": 0.25731760263442993, "rewards/rejected": 0.7871133685112, "step": 5334 }, { "epoch": 2.88, "learning_rate": 4.3619040471390425e-10, "logits/chosen": -2.1442201137542725, "logits/rejected": -2.1905131340026855, "logps/chosen": -3.626781463623047, "logps/rejected": -11.775824546813965, "loss": 0.2873, "rewards/accuracies": 1.0, "rewards/chosen": 1.6615746021270752, "rewards/margins": 1.1001372337341309, "rewards/rejected": 0.5614373087882996, "step": 5335 }, { "epoch": 2.88, "learning_rate": 4.3236131849611614e-10, "logits/chosen": -2.0416176319122314, "logits/rejected": -2.026085376739502, "logps/chosen": -2.241304874420166, "logps/rejected": -3.6455283164978027, "loss": 0.4026, "rewards/accuracies": 1.0, "rewards/chosen": 1.2734215259552002, "rewards/margins": 0.7018814086914062, "rewards/rejected": 0.571540117263794, "step": 5336 }, { "epoch": 2.88, "learning_rate": 4.2854904024062777e-10, "logits/chosen": -2.084967851638794, "logits/rejected": -2.089755058288574, "logps/chosen": -0.38891279697418213, "logps/rejected": -6.658723831176758, "loss": 0.4925, "rewards/accuracies": 1.0, "rewards/chosen": 0.9425690770149231, "rewards/margins": 0.4519255757331848, "rewards/rejected": 0.4906435012817383, "step": 5337 }, { "epoch": 2.88, "learning_rate": 4.2475357124016065e-10, "logits/chosen": -2.0690219402313232, "logits/rejected": -2.0668785572052, "logps/chosen": -0.43031150102615356, "logps/rejected": -3.7977678775787354, "loss": 0.4973, "rewards/accuracies": 1.0, "rewards/chosen": 1.0027285814285278, "rewards/margins": 0.4396790862083435, "rewards/rejected": 0.5630494952201843, "step": 5338 }, { "epoch": 2.88, "learning_rate": 4.209749127817186e-10, "logits/chosen": -2.1272592544555664, "logits/rejected": -2.2951486110687256, "logps/chosen": -3.641597270965576, "logps/rejected": -1.0686928033828735, "loss": 0.6684, "rewards/accuracies": 1.0, "rewards/chosen": 0.9491193890571594, "rewards/margins": 0.05002099275588989, "rewards/rejected": 0.8990983963012695, "step": 5339 }, { "epoch": 2.88, "learning_rate": 4.1721306614661554e-10, "logits/chosen": -2.0335748195648193, "logits/rejected": -2.2893922328948975, "logps/chosen": -2.441614866256714, "logps/rejected": -2.4743926525115967, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.8138212561607361, "rewards/margins": 0.008557438850402832, "rewards/rejected": 0.8052638173103333, "step": 5340 }, { "epoch": 2.88, "learning_rate": 4.134680326104645e-10, "logits/chosen": -2.103937864303589, "logits/rejected": -2.1033802032470703, "logps/chosen": -0.7505673766136169, "logps/rejected": -1.5350289344787598, "loss": 0.6409, "rewards/accuracies": 1.0, "rewards/chosen": 0.9473965764045715, "rewards/margins": 0.10734260082244873, "rewards/rejected": 0.8400539755821228, "step": 5341 }, { "epoch": 2.88, "learning_rate": 4.0973981344316624e-10, "logits/chosen": -2.0134990215301514, "logits/rejected": -2.30719256401062, "logps/chosen": -2.6450490951538086, "logps/rejected": -3.3175814151763916, "loss": 0.6702, "rewards/accuracies": 1.0, "rewards/chosen": 0.9966532588005066, "rewards/margins": 0.046384572982788086, "rewards/rejected": 0.9502686858177185, "step": 5342 }, { "epoch": 2.88, "learning_rate": 4.060284099089317e-10, "logits/chosen": -2.040724754333496, "logits/rejected": -2.040971517562866, "logps/chosen": -2.107164144515991, "logps/rejected": -4.657040119171143, "loss": 0.5997, "rewards/accuracies": 1.0, "rewards/chosen": 1.1168625354766846, "rewards/margins": 0.1964479684829712, "rewards/rejected": 0.9204145669937134, "step": 5343 }, { "epoch": 2.88, "learning_rate": 4.023338232662654e-10, "logits/chosen": -2.0421619415283203, "logits/rejected": -2.047180414199829, "logps/chosen": -0.8445122241973877, "logps/rejected": -12.57595443725586, "loss": 0.4735, "rewards/accuracies": 1.0, "rewards/chosen": 1.0899529457092285, "rewards/margins": 0.5014308094978333, "rewards/rejected": 0.5885221362113953, "step": 5344 }, { "epoch": 2.88, "learning_rate": 3.986560547679707e-10, "logits/chosen": -2.1132750511169434, "logits/rejected": -2.143404960632324, "logps/chosen": -3.165618896484375, "logps/rejected": -12.611297607421875, "loss": 0.2713, "rewards/accuracies": 1.0, "rewards/chosen": 1.5593093633651733, "rewards/margins": 1.1658061742782593, "rewards/rejected": 0.39350318908691406, "step": 5345 }, { "epoch": 2.88, "learning_rate": 3.949951056611556e-10, "logits/chosen": -2.030904769897461, "logits/rejected": -2.027078866958618, "logps/chosen": -0.7559175491333008, "logps/rejected": -2.4573800563812256, "loss": 0.6339, "rewards/accuracies": 1.0, "rewards/chosen": 0.9180079698562622, "rewards/margins": 0.1223297119140625, "rewards/rejected": 0.7956782579421997, "step": 5346 }, { "epoch": 2.88, "learning_rate": 3.9135097718719923e-10, "logits/chosen": -2.061610460281372, "logits/rejected": -2.0677201747894287, "logps/chosen": -5.706410884857178, "logps/rejected": -0.8596532940864563, "loss": 0.461, "rewards/accuracies": 1.0, "rewards/chosen": 1.3487285375595093, "rewards/margins": 0.5350422263145447, "rewards/rejected": 0.8136863112449646, "step": 5347 }, { "epoch": 2.88, "learning_rate": 3.877236705818132e-10, "logits/chosen": -1.9925543069839478, "logits/rejected": -1.99253511428833, "logps/chosen": -0.7758625149726868, "logps/rejected": -1.8017245531082153, "loss": 0.5501, "rewards/accuracies": 1.0, "rewards/chosen": 0.9985155463218689, "rewards/margins": 0.3099179267883301, "rewards/rejected": 0.6885976195335388, "step": 5348 }, { "epoch": 2.89, "learning_rate": 3.8411318707497476e-10, "logits/chosen": -2.041004180908203, "logits/rejected": -2.0434107780456543, "logps/chosen": -2.518629550933838, "logps/rejected": -1.0060982704162598, "loss": 0.4944, "rewards/accuracies": 1.0, "rewards/chosen": 1.2718629837036133, "rewards/margins": 0.44692325592041016, "rewards/rejected": 0.8249397277832031, "step": 5349 }, { "epoch": 2.89, "learning_rate": 3.8051952789096563e-10, "logits/chosen": -2.105945587158203, "logits/rejected": -2.102238655090332, "logps/chosen": -3.5222389698028564, "logps/rejected": -2.2891955375671387, "loss": 0.5349, "rewards/accuracies": 1.0, "rewards/chosen": 1.1371750831604004, "rewards/margins": 0.34626322984695435, "rewards/rejected": 0.790911853313446, "step": 5350 }, { "epoch": 2.89, "learning_rate": 3.769426942483722e-10, "logits/chosen": -2.0951504707336426, "logits/rejected": -2.1001744270324707, "logps/chosen": -2.0134260654449463, "logps/rejected": -3.9102420806884766, "loss": 0.4704, "rewards/accuracies": 1.0, "rewards/chosen": 1.0525219440460205, "rewards/margins": 0.5096673369407654, "rewards/rejected": 0.5428546071052551, "step": 5351 }, { "epoch": 2.89, "learning_rate": 3.7338268736006315e-10, "logits/chosen": -2.1662533283233643, "logits/rejected": -2.3240036964416504, "logps/chosen": -2.0695793628692627, "logps/rejected": -1.8820151090621948, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9833109974861145, "rewards/margins": 0.005395054817199707, "rewards/rejected": 0.9779159426689148, "step": 5352 }, { "epoch": 2.89, "learning_rate": 3.6983950843321174e-10, "logits/chosen": -1.924763560295105, "logits/rejected": -2.229095458984375, "logps/chosen": -0.1948903650045395, "logps/rejected": -0.20500020682811737, "loss": 0.6883, "rewards/accuracies": 1.0, "rewards/chosen": 0.813800036907196, "rewards/margins": 0.0097237229347229, "rewards/rejected": 0.8040763139724731, "step": 5353 }, { "epoch": 2.89, "learning_rate": 3.6631315866927915e-10, "logits/chosen": -2.090548276901245, "logits/rejected": -2.07767653465271, "logps/chosen": -5.79698371887207, "logps/rejected": -7.3854660987854, "loss": 0.289, "rewards/accuracies": 1.0, "rewards/chosen": 1.3493273258209229, "rewards/margins": 1.0932501554489136, "rewards/rejected": 0.2560771405696869, "step": 5354 }, { "epoch": 2.89, "learning_rate": 3.6280363926401435e-10, "logits/chosen": -2.158182144165039, "logits/rejected": -2.1575374603271484, "logps/chosen": -21.726375579833984, "logps/rejected": -8.381206512451172, "loss": 0.2956, "rewards/accuracies": 1.0, "rewards/chosen": 1.5199016332626343, "rewards/margins": 1.0672911405563354, "rewards/rejected": 0.45261049270629883, "step": 5355 }, { "epoch": 2.89, "learning_rate": 3.5931095140747104e-10, "logits/chosen": -2.0598554611206055, "logits/rejected": -2.0640711784362793, "logps/chosen": -4.837214946746826, "logps/rejected": -0.4811466336250305, "loss": 0.5481, "rewards/accuracies": 1.0, "rewards/chosen": 1.2625770568847656, "rewards/margins": 0.31475216150283813, "rewards/rejected": 0.9478248953819275, "step": 5356 }, { "epoch": 2.89, "learning_rate": 3.558350962839851e-10, "logits/chosen": -2.041031837463379, "logits/rejected": -2.2999048233032227, "logps/chosen": -4.113681316375732, "logps/rejected": -6.987855434417725, "loss": 0.6257, "rewards/accuracies": 1.0, "rewards/chosen": 1.1971708536148071, "rewards/margins": 0.1396944522857666, "rewards/rejected": 1.0574764013290405, "step": 5357 }, { "epoch": 2.89, "learning_rate": 3.523760750721916e-10, "logits/chosen": -2.172459840774536, "logits/rejected": -2.294698476791382, "logps/chosen": -1.9087789058685303, "logps/rejected": -1.8520610332489014, "loss": 0.6925, "rewards/accuracies": 1.0, "rewards/chosen": 1.0035641193389893, "rewards/margins": 0.0013538599014282227, "rewards/rejected": 1.002210259437561, "step": 5358 }, { "epoch": 2.89, "learning_rate": 3.489338889450189e-10, "logits/chosen": -1.9697251319885254, "logits/rejected": -1.9692071676254272, "logps/chosen": -0.6993470788002014, "logps/rejected": -2.4436850547790527, "loss": 0.5885, "rewards/accuracies": 1.0, "rewards/chosen": 0.9317512512207031, "rewards/margins": 0.22147172689437866, "rewards/rejected": 0.7102795243263245, "step": 5359 }, { "epoch": 2.89, "learning_rate": 3.455085390696777e-10, "logits/chosen": -2.107182502746582, "logits/rejected": -2.109161615371704, "logps/chosen": -0.28706684708595276, "logps/rejected": -5.081279754638672, "loss": 0.4245, "rewards/accuracies": 1.0, "rewards/chosen": 0.9879854321479797, "rewards/margins": 0.6370795965194702, "rewards/rejected": 0.35090580582618713, "step": 5360 }, { "epoch": 2.89, "learning_rate": 3.421000266076779e-10, "logits/chosen": -2.195798397064209, "logits/rejected": -2.191822052001953, "logps/chosen": -8.29770565032959, "logps/rejected": -0.6004675030708313, "loss": 0.6313, "rewards/accuracies": 1.0, "rewards/chosen": 0.8335146903991699, "rewards/margins": 0.1276741623878479, "rewards/rejected": 0.705840528011322, "step": 5361 }, { "epoch": 2.89, "learning_rate": 3.387083527148116e-10, "logits/chosen": -2.2209229469299316, "logits/rejected": -2.155352830886841, "logps/chosen": -29.12084197998047, "logps/rejected": -6.225755214691162, "loss": 0.2226, "rewards/accuracies": 1.0, "rewards/chosen": 1.937978744506836, "rewards/margins": 1.3887758255004883, "rewards/rejected": 0.5492028594017029, "step": 5362 }, { "epoch": 2.89, "learning_rate": 3.353335185411699e-10, "logits/chosen": -2.048165798187256, "logits/rejected": -2.0458786487579346, "logps/chosen": -1.966377854347229, "logps/rejected": -5.688292026519775, "loss": 0.4067, "rewards/accuracies": 1.0, "rewards/chosen": 1.0402811765670776, "rewards/margins": 0.6895033121109009, "rewards/rejected": 0.35077786445617676, "step": 5363 }, { "epoch": 2.89, "learning_rate": 3.3197552523113737e-10, "logits/chosen": -2.2451093196868896, "logits/rejected": -2.243734121322632, "logps/chosen": -3.0961599349975586, "logps/rejected": -5.406961917877197, "loss": 0.4596, "rewards/accuracies": 1.0, "rewards/chosen": 0.9134076237678528, "rewards/margins": 0.5387980937957764, "rewards/rejected": 0.3746095299720764, "step": 5364 }, { "epoch": 2.89, "learning_rate": 3.286343739233699e-10, "logits/chosen": -2.13067626953125, "logits/rejected": -2.202935218811035, "logps/chosen": -4.301942825317383, "logps/rejected": -25.87418556213379, "loss": 0.5295, "rewards/accuracies": 1.0, "rewards/chosen": 1.0372213125228882, "rewards/margins": 0.3593570590019226, "rewards/rejected": 0.6778642535209656, "step": 5365 }, { "epoch": 2.89, "learning_rate": 3.253100657508279e-10, "logits/chosen": -1.9795628786087036, "logits/rejected": -1.9905602931976318, "logps/chosen": -3.105534076690674, "logps/rejected": -4.543773174285889, "loss": 0.4224, "rewards/accuracies": 1.0, "rewards/chosen": 1.07707679271698, "rewards/margins": 0.6432560086250305, "rewards/rejected": 0.43382078409194946, "step": 5366 }, { "epoch": 2.89, "learning_rate": 3.22002601840754e-10, "logits/chosen": -2.0789480209350586, "logits/rejected": -2.2701010704040527, "logps/chosen": -0.15709233283996582, "logps/rejected": -0.15914255380630493, "loss": 0.6946, "rewards/accuracies": 0.0, "rewards/chosen": 0.9243718981742859, "rewards/margins": -0.002930760383605957, "rewards/rejected": 0.9273026585578918, "step": 5367 }, { "epoch": 2.9, "learning_rate": 3.187119833146845e-10, "logits/chosen": -2.1518070697784424, "logits/rejected": -2.1461708545684814, "logps/chosen": -2.2954916954040527, "logps/rejected": -4.009788990020752, "loss": 0.3499, "rewards/accuracies": 1.0, "rewards/chosen": 1.5774000883102417, "rewards/margins": 0.8700060248374939, "rewards/rejected": 0.7073940634727478, "step": 5368 }, { "epoch": 2.9, "learning_rate": 3.1543821128843796e-10, "logits/chosen": -2.0895578861236572, "logits/rejected": -2.3941962718963623, "logps/chosen": -19.11908531188965, "logps/rejected": -16.918445587158203, "loss": 0.5978, "rewards/accuracies": 1.0, "rewards/chosen": 0.3116476237773895, "rewards/margins": 0.2008516490459442, "rewards/rejected": 0.11079597473144531, "step": 5369 }, { "epoch": 2.9, "learning_rate": 3.1218128687212075e-10, "logits/chosen": -2.089437961578369, "logits/rejected": -2.1529617309570312, "logps/chosen": -3.0974361896514893, "logps/rejected": -14.79362964630127, "loss": 0.3535, "rewards/accuracies": 1.0, "rewards/chosen": 1.61825692653656, "rewards/margins": 0.8579533100128174, "rewards/rejected": 0.7603036165237427, "step": 5370 }, { "epoch": 2.9, "learning_rate": 3.089412111701273e-10, "logits/chosen": -2.2345941066741943, "logits/rejected": -2.3629980087280273, "logps/chosen": -8.096179962158203, "logps/rejected": -8.38935661315918, "loss": 0.6615, "rewards/accuracies": 1.0, "rewards/chosen": 1.3267440795898438, "rewards/margins": 0.06430912017822266, "rewards/rejected": 1.262434959411621, "step": 5371 }, { "epoch": 2.9, "learning_rate": 3.057179852811453e-10, "logits/chosen": -2.0648484230041504, "logits/rejected": -2.0663681030273438, "logps/chosen": -2.143467903137207, "logps/rejected": -1.5657442808151245, "loss": 0.4689, "rewards/accuracies": 1.0, "rewards/chosen": 1.3796957731246948, "rewards/margins": 0.5137875080108643, "rewards/rejected": 0.8659082651138306, "step": 5372 }, { "epoch": 2.9, "learning_rate": 3.0251161029813953e-10, "logits/chosen": -2.0671117305755615, "logits/rejected": -2.067380905151367, "logps/chosen": -0.7222006320953369, "logps/rejected": -3.9237661361694336, "loss": 0.4482, "rewards/accuracies": 1.0, "rewards/chosen": 1.062142252922058, "rewards/margins": 0.5701757669448853, "rewards/rejected": 0.49196645617485046, "step": 5373 }, { "epoch": 2.9, "learning_rate": 2.9932208730836795e-10, "logits/chosen": -2.117706060409546, "logits/rejected": -2.007997989654541, "logps/chosen": -36.42853546142578, "logps/rejected": -4.152496337890625, "loss": 0.1507, "rewards/accuracies": 1.0, "rewards/chosen": 2.4653878211975098, "rewards/margins": 1.8159970045089722, "rewards/rejected": 0.6493908166885376, "step": 5374 }, { "epoch": 2.9, "learning_rate": 2.9614941739335987e-10, "logits/chosen": -2.021144390106201, "logits/rejected": -2.0362305641174316, "logps/chosen": -3.135671615600586, "logps/rejected": -1.0910484790802002, "loss": 0.5892, "rewards/accuracies": 1.0, "rewards/chosen": 0.9095344543457031, "rewards/margins": 0.2199850082397461, "rewards/rejected": 0.689549446105957, "step": 5375 }, { "epoch": 2.9, "learning_rate": 2.929936016289547e-10, "logits/chosen": -2.1803812980651855, "logits/rejected": -2.340893507003784, "logps/chosen": -0.4178808033466339, "logps/rejected": -0.5044747591018677, "loss": 0.6889, "rewards/accuracies": 1.0, "rewards/chosen": 0.9502364993095398, "rewards/margins": 0.008582353591918945, "rewards/rejected": 0.9416541457176208, "step": 5376 }, { "epoch": 2.9, "learning_rate": 2.8985464108525205e-10, "logits/chosen": -2.1442694664001465, "logits/rejected": -2.1453919410705566, "logps/chosen": -0.9138239026069641, "logps/rejected": -2.5062038898468018, "loss": 0.5267, "rewards/accuracies": 1.0, "rewards/chosen": 1.0299299955368042, "rewards/margins": 0.36617058515548706, "rewards/rejected": 0.6637594103813171, "step": 5377 }, { "epoch": 2.9, "learning_rate": 2.867325368266449e-10, "logits/chosen": -2.0287926197052, "logits/rejected": -2.0325090885162354, "logps/chosen": -5.831874370574951, "logps/rejected": -12.484125137329102, "loss": 0.3671, "rewards/accuracies": 1.0, "rewards/chosen": 1.6926039457321167, "rewards/margins": 0.8128105998039246, "rewards/rejected": 0.8797933459281921, "step": 5378 }, { "epoch": 2.9, "learning_rate": 2.836272899118253e-10, "logits/chosen": -2.0872371196746826, "logits/rejected": -2.0702311992645264, "logps/chosen": -7.612253665924072, "logps/rejected": -2.1990838050842285, "loss": 0.3551, "rewards/accuracies": 1.0, "rewards/chosen": 1.6322864294052124, "rewards/margins": 0.8525763750076294, "rewards/rejected": 0.779710054397583, "step": 5379 }, { "epoch": 2.9, "learning_rate": 2.8053890139374536e-10, "logits/chosen": -2.187469005584717, "logits/rejected": -2.3036162853240967, "logps/chosen": -1.3877918720245361, "logps/rejected": -1.4804043769836426, "loss": 0.6964, "rewards/accuracies": 0.0, "rewards/chosen": 0.7914280891418457, "rewards/margins": -0.006560266017913818, "rewards/rejected": 0.7979883551597595, "step": 5380 }, { "epoch": 2.9, "learning_rate": 2.774673723196508e-10, "logits/chosen": -2.1584391593933105, "logits/rejected": -2.1340134143829346, "logps/chosen": -20.82647705078125, "logps/rejected": -15.921597480773926, "loss": 0.258, "rewards/accuracies": 1.0, "rewards/chosen": 2.0165412425994873, "rewards/margins": 1.223127841949463, "rewards/rejected": 0.7934134602546692, "step": 5381 }, { "epoch": 2.9, "learning_rate": 2.7441270373108063e-10, "logits/chosen": -2.1531524658203125, "logits/rejected": -2.3088443279266357, "logps/chosen": -10.634244918823242, "logps/rejected": -11.164422988891602, "loss": 0.6473, "rewards/accuracies": 1.0, "rewards/chosen": 1.1565450429916382, "rewards/margins": 0.09386396408081055, "rewards/rejected": 1.0626810789108276, "step": 5382 }, { "epoch": 2.9, "learning_rate": 2.7137489666383405e-10, "logits/chosen": -2.153665065765381, "logits/rejected": -2.1430044174194336, "logps/chosen": -7.374634742736816, "logps/rejected": -3.2782692909240723, "loss": 0.6167, "rewards/accuracies": 1.0, "rewards/chosen": 0.7317127585411072, "rewards/margins": 0.15920233726501465, "rewards/rejected": 0.5725104212760925, "step": 5383 }, { "epoch": 2.9, "learning_rate": 2.6835395214801494e-10, "logits/chosen": -2.0400609970092773, "logits/rejected": -2.234204053878784, "logps/chosen": -2.8229379653930664, "logps/rejected": -3.1321067810058594, "loss": 0.6836, "rewards/accuracies": 1.0, "rewards/chosen": 0.8246397972106934, "rewards/margins": 0.01920253038406372, "rewards/rejected": 0.8054372668266296, "step": 5384 }, { "epoch": 2.9, "learning_rate": 2.653498712079927e-10, "logits/chosen": -1.9281381368637085, "logits/rejected": -2.2115159034729004, "logps/chosen": -0.34079334139823914, "logps/rejected": -0.319685161113739, "loss": 0.6855, "rewards/accuracies": 1.0, "rewards/chosen": 0.953266441822052, "rewards/margins": 0.015277683734893799, "rewards/rejected": 0.9379887580871582, "step": 5385 }, { "epoch": 2.91, "learning_rate": 2.6236265486243025e-10, "logits/chosen": -2.0210654735565186, "logits/rejected": -2.0233547687530518, "logps/chosen": -1.9147095680236816, "logps/rejected": -7.608954906463623, "loss": 0.3808, "rewards/accuracies": 1.0, "rewards/chosen": 0.9791393280029297, "rewards/margins": 0.7690545320510864, "rewards/rejected": 0.21008478105068207, "step": 5386 }, { "epoch": 2.91, "learning_rate": 2.593923041242674e-10, "logits/chosen": -2.0859858989715576, "logits/rejected": -2.0820817947387695, "logps/chosen": -3.9493794441223145, "logps/rejected": -3.9506819248199463, "loss": 0.5203, "rewards/accuracies": 1.0, "rewards/chosen": 1.000205636024475, "rewards/margins": 0.38205045461654663, "rewards/rejected": 0.6181551814079285, "step": 5387 }, { "epoch": 2.91, "learning_rate": 2.564388200007261e-10, "logits/chosen": -2.110649585723877, "logits/rejected": -2.285026788711548, "logps/chosen": -2.323518991470337, "logps/rejected": -2.22700572013855, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 1.0903395414352417, "rewards/margins": -0.010492324829101562, "rewards/rejected": 1.1008318662643433, "step": 5388 }, { "epoch": 2.91, "learning_rate": 2.535022034932999e-10, "logits/chosen": -1.9598619937896729, "logits/rejected": -2.2734925746917725, "logps/chosen": -0.1649288386106491, "logps/rejected": -0.18841984868049622, "loss": 0.6861, "rewards/accuracies": 1.0, "rewards/chosen": 0.9485117197036743, "rewards/margins": 0.014156341552734375, "rewards/rejected": 0.9343553781509399, "step": 5389 }, { "epoch": 2.91, "learning_rate": 2.5058245559777536e-10, "logits/chosen": -2.1380484104156494, "logits/rejected": -2.134094715118408, "logps/chosen": -4.820326805114746, "logps/rejected": -6.0561065673828125, "loss": 0.4879, "rewards/accuracies": 1.0, "rewards/chosen": 1.0468865633010864, "rewards/margins": 0.46380823850631714, "rewards/rejected": 0.5830783247947693, "step": 5390 }, { "epoch": 2.91, "learning_rate": 2.4767957730421617e-10, "logits/chosen": -2.038316011428833, "logits/rejected": -2.311591625213623, "logps/chosen": -2.651851177215576, "logps/rejected": -3.969479560852051, "loss": 0.6974, "rewards/accuracies": 0.0, "rewards/chosen": 0.9987896084785461, "rewards/margins": -0.008424460887908936, "rewards/rejected": 1.007214069366455, "step": 5391 }, { "epoch": 2.91, "learning_rate": 2.447935695969572e-10, "logits/chosen": -2.1148531436920166, "logits/rejected": -2.154448986053467, "logps/chosen": -3.990431070327759, "logps/rejected": -8.902246475219727, "loss": 0.524, "rewards/accuracies": 1.0, "rewards/chosen": 1.2851800918579102, "rewards/margins": 0.3727279305458069, "rewards/rejected": 0.9124521613121033, "step": 5392 }, { "epoch": 2.91, "learning_rate": 2.4192443345462663e-10, "logits/chosen": -1.9964311122894287, "logits/rejected": -1.997958779335022, "logps/chosen": -0.24020595848560333, "logps/rejected": -8.028609275817871, "loss": 0.3656, "rewards/accuracies": 1.0, "rewards/chosen": 1.0580723285675049, "rewards/margins": 0.8177849054336548, "rewards/rejected": 0.2402874082326889, "step": 5393 }, { "epoch": 2.91, "learning_rate": 2.390721698501186e-10, "logits/chosen": -2.0059754848480225, "logits/rejected": -2.266547679901123, "logps/chosen": -0.8354438543319702, "logps/rejected": -0.8300183415412903, "loss": 0.6868, "rewards/accuracies": 1.0, "rewards/chosen": 0.8754646182060242, "rewards/margins": 0.012833952903747559, "rewards/rejected": 0.8626306653022766, "step": 5394 }, { "epoch": 2.91, "learning_rate": 2.3623677975061485e-10, "logits/chosen": -2.204806089401245, "logits/rejected": -2.18123459815979, "logps/chosen": -4.822612762451172, "logps/rejected": -8.225545883178711, "loss": 0.4411, "rewards/accuracies": 1.0, "rewards/chosen": 1.0662049055099487, "rewards/margins": 0.5898466110229492, "rewards/rejected": 0.4763583242893219, "step": 5395 }, { "epoch": 2.91, "learning_rate": 2.334182641175686e-10, "logits/chosen": -2.121792793273926, "logits/rejected": -2.3019802570343018, "logps/chosen": -1.1235014200210571, "logps/rejected": -1.108970046043396, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 1.0134958028793335, "rewards/margins": 0.020502150058746338, "rewards/rejected": 0.9929936528205872, "step": 5396 }, { "epoch": 2.91, "learning_rate": 2.3061662390671532e-10, "logits/chosen": -2.154186248779297, "logits/rejected": -2.167795419692993, "logps/chosen": -3.474010944366455, "logps/rejected": -2.7233691215515137, "loss": 0.6145, "rewards/accuracies": 1.0, "rewards/chosen": 1.2753467559814453, "rewards/margins": 0.16404449939727783, "rewards/rejected": 1.1113022565841675, "step": 5397 }, { "epoch": 2.91, "learning_rate": 2.2783186006806731e-10, "logits/chosen": -2.0419394969940186, "logits/rejected": -2.0355045795440674, "logps/chosen": -2.8957035541534424, "logps/rejected": -6.452136039733887, "loss": 0.4709, "rewards/accuracies": 1.0, "rewards/chosen": 0.8553858995437622, "rewards/margins": 0.5085327625274658, "rewards/rejected": 0.3468531668186188, "step": 5398 }, { "epoch": 2.91, "learning_rate": 2.250639735459192e-10, "logits/chosen": -2.1472716331481934, "logits/rejected": -2.0824053287506104, "logps/chosen": -28.127389907836914, "logps/rejected": -7.76291561126709, "loss": 0.2127, "rewards/accuracies": 1.0, "rewards/chosen": 2.3620316982269287, "rewards/margins": 1.4395101070404053, "rewards/rejected": 0.9225215315818787, "step": 5399 }, { "epoch": 2.91, "learning_rate": 2.2231296527883137e-10, "logits/chosen": -2.0029749870300293, "logits/rejected": -2.0194599628448486, "logps/chosen": -3.4287338256835938, "logps/rejected": -7.158225059509277, "loss": 0.5296, "rewards/accuracies": 1.0, "rewards/chosen": 0.9639471173286438, "rewards/margins": 0.3592473268508911, "rewards/rejected": 0.6046997904777527, "step": 5400 }, { "epoch": 2.91, "learning_rate": 2.195788361996409e-10, "logits/chosen": -2.0495541095733643, "logits/rejected": -2.2711405754089355, "logps/chosen": -0.1889391988515854, "logps/rejected": -0.19308355450630188, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9508876800537109, "rewards/margins": 0.00512617826461792, "rewards/rejected": 0.945761501789093, "step": 5401 }, { "epoch": 2.91, "learning_rate": 2.1686158723548398e-10, "logits/chosen": -2.076042652130127, "logits/rejected": -2.1201915740966797, "logps/chosen": -4.2074995040893555, "logps/rejected": -24.320859909057617, "loss": 0.2073, "rewards/accuracies": 1.0, "rewards/chosen": 1.2783284187316895, "rewards/margins": 1.4679299592971802, "rewards/rejected": -0.18960152566432953, "step": 5402 }, { "epoch": 2.91, "learning_rate": 2.1416121930774577e-10, "logits/chosen": -2.0392913818359375, "logits/rejected": -2.0456345081329346, "logps/chosen": -4.865052223205566, "logps/rejected": -3.731163263320923, "loss": 0.376, "rewards/accuracies": 1.0, "rewards/chosen": 1.4084433317184448, "rewards/margins": 0.7842352390289307, "rewards/rejected": 0.6242080926895142, "step": 5403 }, { "epoch": 2.91, "learning_rate": 2.114777333320994e-10, "logits/chosen": -2.133641004562378, "logits/rejected": -2.1300344467163086, "logps/chosen": -2.136190414428711, "logps/rejected": -3.641221284866333, "loss": 0.7959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8005630373954773, "rewards/margins": -0.19598209857940674, "rewards/rejected": 0.996545135974884, "step": 5404 }, { "epoch": 2.92, "learning_rate": 2.088111302184947e-10, "logits/chosen": -2.0226519107818604, "logits/rejected": -2.311396360397339, "logps/chosen": -0.5229185819625854, "logps/rejected": -0.6778848171234131, "loss": 0.6759, "rewards/accuracies": 1.0, "rewards/chosen": 0.8175633549690247, "rewards/margins": 0.03482896089553833, "rewards/rejected": 0.7827343940734863, "step": 5405 }, { "epoch": 2.92, "learning_rate": 2.0616141087114734e-10, "logits/chosen": -2.006483554840088, "logits/rejected": -2.000751495361328, "logps/chosen": -3.0708377361297607, "logps/rejected": -5.658535480499268, "loss": 0.4433, "rewards/accuracies": 1.0, "rewards/chosen": 0.8929465413093567, "rewards/margins": 0.5837441086769104, "rewards/rejected": 0.3092024326324463, "step": 5406 }, { "epoch": 2.92, "learning_rate": 2.0352857618856634e-10, "logits/chosen": -2.0283548831939697, "logits/rejected": -2.3020105361938477, "logps/chosen": -0.24784709513187408, "logps/rejected": -0.2993529736995697, "loss": 0.6829, "rewards/accuracies": 1.0, "rewards/chosen": 1.0448260307312012, "rewards/margins": 0.02057802677154541, "rewards/rejected": 1.0242480039596558, "step": 5407 }, { "epoch": 2.92, "learning_rate": 2.0091262706350974e-10, "logits/chosen": -2.055753469467163, "logits/rejected": -2.0499584674835205, "logps/chosen": -0.8971288204193115, "logps/rejected": -5.519097328186035, "loss": 0.3375, "rewards/accuracies": 1.0, "rewards/chosen": 1.330651044845581, "rewards/margins": 0.9125659465789795, "rewards/rejected": 0.41808509826660156, "step": 5408 }, { "epoch": 2.92, "learning_rate": 1.983135643830347e-10, "logits/chosen": -2.0276682376861572, "logits/rejected": -2.269634246826172, "logps/chosen": -0.5417140126228333, "logps/rejected": -0.49111270904541016, "loss": 0.6692, "rewards/accuracies": 1.0, "rewards/chosen": 0.9492787718772888, "rewards/margins": 0.04849034547805786, "rewards/rejected": 0.900788426399231, "step": 5409 }, { "epoch": 2.92, "learning_rate": 1.9573138902845287e-10, "logits/chosen": -2.1961371898651123, "logits/rejected": -2.314842462539673, "logps/chosen": -4.005342960357666, "logps/rejected": -1.3832781314849854, "loss": 0.7778, "rewards/accuracies": 0.0, "rewards/chosen": 0.694549024105072, "rewards/margins": -0.16264218091964722, "rewards/rejected": 0.8571912050247192, "step": 5410 }, { "epoch": 2.92, "learning_rate": 1.9316610187536387e-10, "logits/chosen": -2.1388673782348633, "logits/rejected": -2.142310619354248, "logps/chosen": -2.277127742767334, "logps/rejected": -14.47791862487793, "loss": 0.2495, "rewards/accuracies": 1.0, "rewards/chosen": 1.5559138059616089, "rewards/margins": 1.26104736328125, "rewards/rejected": 0.2948663830757141, "step": 5411 }, { "epoch": 2.92, "learning_rate": 1.9061770379363297e-10, "logits/chosen": -2.013209581375122, "logits/rejected": -2.2822470664978027, "logps/chosen": -2.8967697620391846, "logps/rejected": -2.7340753078460693, "loss": 0.679, "rewards/accuracies": 1.0, "rewards/chosen": 0.6376706957817078, "rewards/margins": 0.028424978256225586, "rewards/rejected": 0.6092457175254822, "step": 5412 }, { "epoch": 2.92, "learning_rate": 1.880861956473967e-10, "logits/chosen": -1.9919590950012207, "logits/rejected": -2.000645637512207, "logps/chosen": -2.9677340984344482, "logps/rejected": -3.1767477989196777, "loss": 0.4226, "rewards/accuracies": 1.0, "rewards/chosen": 1.238430380821228, "rewards/margins": 0.64248126745224, "rewards/rejected": 0.595949113368988, "step": 5413 }, { "epoch": 2.92, "learning_rate": 1.8557157829507397e-10, "logits/chosen": -2.070289134979248, "logits/rejected": -1.9665772914886475, "logps/chosen": -24.785594940185547, "logps/rejected": -8.292251586914062, "loss": 0.1504, "rewards/accuracies": 1.0, "rewards/chosen": 2.1092019081115723, "rewards/margins": 1.8185245990753174, "rewards/rejected": 0.2906772792339325, "step": 5414 }, { "epoch": 2.92, "learning_rate": 1.8307385258934382e-10, "logits/chosen": -2.1015238761901855, "logits/rejected": -2.108243703842163, "logps/chosen": -1.9911365509033203, "logps/rejected": -3.6385035514831543, "loss": 0.4716, "rewards/accuracies": 1.0, "rewards/chosen": 1.1396983861923218, "rewards/margins": 0.5065351724624634, "rewards/rejected": 0.6331632137298584, "step": 5415 }, { "epoch": 2.92, "learning_rate": 1.8059301937716764e-10, "logits/chosen": -1.9415106773376465, "logits/rejected": -1.9408906698226929, "logps/chosen": -0.5325904488563538, "logps/rejected": -1.809621810913086, "loss": 0.6691, "rewards/accuracies": 1.0, "rewards/chosen": 0.8753523230552673, "rewards/margins": 0.04861557483673096, "rewards/rejected": 0.8267367482185364, "step": 5416 }, { "epoch": 2.92, "learning_rate": 1.7812907949977252e-10, "logits/chosen": -2.2353618144989014, "logits/rejected": -2.1232056617736816, "logps/chosen": -27.587017059326172, "logps/rejected": -5.078441143035889, "loss": 0.1291, "rewards/accuracies": 1.0, "rewards/chosen": 2.4093263149261475, "rewards/margins": 1.9816005229949951, "rewards/rejected": 0.4277258515357971, "step": 5417 }, { "epoch": 2.92, "learning_rate": 1.7568203379266233e-10, "logits/chosen": -2.0330100059509277, "logits/rejected": -2.0330252647399902, "logps/chosen": -0.3182150721549988, "logps/rejected": -3.0198404788970947, "loss": 0.5365, "rewards/accuracies": 1.0, "rewards/chosen": 1.0332568883895874, "rewards/margins": 0.3424416184425354, "rewards/rejected": 0.690815269947052, "step": 5418 }, { "epoch": 2.92, "learning_rate": 1.7325188308560668e-10, "logits/chosen": -2.051440954208374, "logits/rejected": -2.3618316650390625, "logps/chosen": -0.37428754568099976, "logps/rejected": -0.42139118909835815, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.8441382646560669, "rewards/margins": 0.016761958599090576, "rewards/rejected": 0.8273763060569763, "step": 5419 }, { "epoch": 2.92, "learning_rate": 1.708386282026464e-10, "logits/chosen": -2.1032915115356445, "logits/rejected": -2.303232192993164, "logps/chosen": -2.1804206371307373, "logps/rejected": -0.6957225203514099, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.8795108795166016, "rewards/margins": 0.022886812686920166, "rewards/rejected": 0.8566240668296814, "step": 5420 }, { "epoch": 2.92, "learning_rate": 1.6844226996210465e-10, "logits/chosen": -2.113903522491455, "logits/rejected": -2.10762619972229, "logps/chosen": -1.9083006381988525, "logps/rejected": -3.969667434692383, "loss": 0.4915, "rewards/accuracies": 1.0, "rewards/chosen": 1.3169983625411987, "rewards/margins": 0.45451807975769043, "rewards/rejected": 0.8624802827835083, "step": 5421 }, { "epoch": 2.92, "learning_rate": 1.660628091765537e-10, "logits/chosen": -2.035067319869995, "logits/rejected": -2.2805957794189453, "logps/chosen": -1.0401960611343384, "logps/rejected": -1.1543333530426025, "loss": 0.6754, "rewards/accuracies": 1.0, "rewards/chosen": 0.7780301570892334, "rewards/margins": 0.03591513633728027, "rewards/rejected": 0.7421150207519531, "step": 5422 }, { "epoch": 2.93, "learning_rate": 1.637002466528592e-10, "logits/chosen": -2.1637651920318604, "logits/rejected": -2.1670095920562744, "logps/chosen": -3.9227490425109863, "logps/rejected": -4.736628532409668, "loss": 0.4326, "rewards/accuracies": 1.0, "rewards/chosen": 1.0836178064346313, "rewards/margins": 0.6138904094696045, "rewards/rejected": 0.46972742676734924, "step": 5423 }, { "epoch": 2.93, "learning_rate": 1.6135458319214145e-10, "logits/chosen": -1.9576665163040161, "logits/rejected": -2.241925001144409, "logps/chosen": -0.9433018565177917, "logps/rejected": -0.8633238077163696, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 1.0043660402297974, "rewards/margins": 0.03975391387939453, "rewards/rejected": 0.9646121263504028, "step": 5424 }, { "epoch": 2.93, "learning_rate": 1.5902581958979755e-10, "logits/chosen": -2.1974146366119385, "logits/rejected": -2.191211462020874, "logps/chosen": -6.000028610229492, "logps/rejected": -6.668015003204346, "loss": 0.2794, "rewards/accuracies": 1.0, "rewards/chosen": 1.393405795097351, "rewards/margins": 1.1320396661758423, "rewards/rejected": 0.2613661289215088, "step": 5425 }, { "epoch": 2.93, "learning_rate": 1.567139566354847e-10, "logits/chosen": -2.037782907485962, "logits/rejected": -2.2911882400512695, "logps/chosen": -0.2527312934398651, "logps/rejected": -0.2679975628852844, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 1.0363330841064453, "rewards/margins": -0.0035753250122070312, "rewards/rejected": 1.0399084091186523, "step": 5426 }, { "epoch": 2.93, "learning_rate": 1.544189951131425e-10, "logits/chosen": -2.085890293121338, "logits/rejected": -2.3218488693237305, "logps/chosen": -1.7043445110321045, "logps/rejected": -1.720415711402893, "loss": 0.6787, "rewards/accuracies": 1.0, "rewards/chosen": 0.8467389941215515, "rewards/margins": 0.029065489768981934, "rewards/rejected": 0.8176735043525696, "step": 5427 }, { "epoch": 2.93, "learning_rate": 1.521409358009651e-10, "logits/chosen": -2.0470030307769775, "logits/rejected": -2.0416390895843506, "logps/chosen": -3.343242645263672, "logps/rejected": -4.3008012771606445, "loss": 0.3782, "rewards/accuracies": 1.0, "rewards/chosen": 1.3861750364303589, "rewards/margins": 0.7773382067680359, "rewards/rejected": 0.608836829662323, "step": 5428 }, { "epoch": 2.93, "learning_rate": 1.4987977947142904e-10, "logits/chosen": -2.1303961277008057, "logits/rejected": -2.125767469406128, "logps/chosen": -4.711479663848877, "logps/rejected": -2.541325330734253, "loss": 0.3069, "rewards/accuracies": 1.0, "rewards/chosen": 1.6668287515640259, "rewards/margins": 1.0238444805145264, "rewards/rejected": 0.6429843306541443, "step": 5429 }, { "epoch": 2.93, "learning_rate": 1.4763552689127657e-10, "logits/chosen": -2.1652379035949707, "logits/rejected": -2.293483257293701, "logps/chosen": -19.91057777404785, "logps/rejected": -1.8907979726791382, "loss": 0.71, "rewards/accuracies": 0.0, "rewards/chosen": 0.9882051348686218, "rewards/margins": -0.033390939235687256, "rewards/rejected": 1.021596074104309, "step": 5430 }, { "epoch": 2.93, "learning_rate": 1.4540817882150446e-10, "logits/chosen": -2.0331666469573975, "logits/rejected": -2.023540496826172, "logps/chosen": -5.423735618591309, "logps/rejected": -7.713857650756836, "loss": 0.3083, "rewards/accuracies": 1.0, "rewards/chosen": 1.596099615097046, "rewards/margins": 1.0185258388519287, "rewards/rejected": 0.5775737762451172, "step": 5431 }, { "epoch": 2.93, "learning_rate": 1.431977360173975e-10, "logits/chosen": -2.0813980102539062, "logits/rejected": -2.305819034576416, "logps/chosen": -0.37927669286727905, "logps/rejected": -0.4553431570529938, "loss": 0.6734, "rewards/accuracies": 1.0, "rewards/chosen": 0.9203813672065735, "rewards/margins": 0.03991901874542236, "rewards/rejected": 0.8804623484611511, "step": 5432 }, { "epoch": 2.93, "learning_rate": 1.4100419922849492e-10, "logits/chosen": -2.074756622314453, "logits/rejected": -2.07983660697937, "logps/chosen": -1.787003755569458, "logps/rejected": -2.9576175212860107, "loss": 0.403, "rewards/accuracies": 1.0, "rewards/chosen": 1.4186128377914429, "rewards/margins": 0.7006591558456421, "rewards/rejected": 0.7179536819458008, "step": 5433 }, { "epoch": 2.93, "learning_rate": 1.3882756919860184e-10, "logits/chosen": -1.9821159839630127, "logits/rejected": -1.9820054769515991, "logps/chosen": -1.3174275159835815, "logps/rejected": -0.9507445693016052, "loss": 0.5617, "rewards/accuracies": 1.0, "rewards/chosen": 1.163305401802063, "rewards/margins": 0.2829349637031555, "rewards/rejected": 0.8803704380989075, "step": 5434 }, { "epoch": 2.93, "learning_rate": 1.3666784666579446e-10, "logits/chosen": -2.0588431358337402, "logits/rejected": -2.3036000728607178, "logps/chosen": -1.465672254562378, "logps/rejected": -5.421060085296631, "loss": 0.6111, "rewards/accuracies": 1.0, "rewards/chosen": 1.0525705814361572, "rewards/margins": 0.17138636112213135, "rewards/rejected": 0.8811842203140259, "step": 5435 }, { "epoch": 2.93, "learning_rate": 1.3452503236242585e-10, "logits/chosen": -1.9394886493682861, "logits/rejected": -1.9390614032745361, "logps/chosen": -1.0241681337356567, "logps/rejected": -1.0281511545181274, "loss": 0.6708, "rewards/accuracies": 1.0, "rewards/chosen": 0.837894856929779, "rewards/margins": 0.04525649547576904, "rewards/rejected": 0.79263836145401, "step": 5436 }, { "epoch": 2.93, "learning_rate": 1.3239912701509814e-10, "logits/chosen": -2.002476215362549, "logits/rejected": -2.2718000411987305, "logps/chosen": -0.37257280945777893, "logps/rejected": -0.3881475627422333, "loss": 0.6934, "rewards/accuracies": 0.0, "rewards/chosen": 0.8528068661689758, "rewards/margins": -0.0004361271858215332, "rewards/rejected": 0.8532429933547974, "step": 5437 }, { "epoch": 2.93, "learning_rate": 1.302901313446847e-10, "logits/chosen": -2.110544443130493, "logits/rejected": -2.301727294921875, "logps/chosen": -1.3950660228729248, "logps/rejected": -1.3537825345993042, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.5873515009880066, "rewards/margins": -0.003872394561767578, "rewards/rejected": 0.5912238955497742, "step": 5438 }, { "epoch": 2.93, "learning_rate": 1.2819804606633568e-10, "logits/chosen": -2.115987539291382, "logits/rejected": -2.114434003829956, "logps/chosen": -0.6570120453834534, "logps/rejected": -2.0651960372924805, "loss": 0.6271, "rewards/accuracies": 1.0, "rewards/chosen": 0.9791273474693298, "rewards/margins": 0.13673138618469238, "rewards/rejected": 0.8423959612846375, "step": 5439 }, { "epoch": 2.93, "learning_rate": 1.2612287188945582e-10, "logits/chosen": -2.0261778831481934, "logits/rejected": -2.063598394393921, "logps/chosen": -1.3968065977096558, "logps/rejected": -10.358166694641113, "loss": 0.2845, "rewards/accuracies": 1.0, "rewards/chosen": 1.5330934524536133, "rewards/margins": 1.111256718635559, "rewards/rejected": 0.4218367636203766, "step": 5440 }, { "epoch": 2.93, "learning_rate": 1.2406460951772113e-10, "logits/chosen": -2.1904358863830566, "logits/rejected": -2.1914165019989014, "logps/chosen": -1.6907587051391602, "logps/rejected": -0.8970382809638977, "loss": 0.6261, "rewards/accuracies": 1.0, "rewards/chosen": 1.2027487754821777, "rewards/margins": 0.138924241065979, "rewards/rejected": 1.0638245344161987, "step": 5441 }, { "epoch": 2.94, "learning_rate": 1.220232596490678e-10, "logits/chosen": -2.077364683151245, "logits/rejected": -2.234046220779419, "logps/chosen": -7.172789096832275, "logps/rejected": -5.042341709136963, "loss": 0.6016, "rewards/accuracies": 1.0, "rewards/chosen": 1.0403727293014526, "rewards/margins": 0.19225376844406128, "rewards/rejected": 0.8481189608573914, "step": 5442 }, { "epoch": 2.94, "learning_rate": 1.1999882297569764e-10, "logits/chosen": -1.994309663772583, "logits/rejected": -1.9928603172302246, "logps/chosen": -1.2706222534179688, "logps/rejected": -5.216751575469971, "loss": 0.4221, "rewards/accuracies": 1.0, "rewards/chosen": 1.0041375160217285, "rewards/margins": 0.6439950466156006, "rewards/rejected": 0.36014246940612793, "step": 5443 }, { "epoch": 2.94, "learning_rate": 1.1799130018408377e-10, "logits/chosen": -2.0513503551483154, "logits/rejected": -2.276193857192993, "logps/chosen": -0.3635360300540924, "logps/rejected": -0.3307410478591919, "loss": 0.6984, "rewards/accuracies": 0.0, "rewards/chosen": 0.8583880662918091, "rewards/margins": -0.010426878929138184, "rewards/rejected": 0.8688149452209473, "step": 5444 }, { "epoch": 2.94, "learning_rate": 1.1600069195496498e-10, "logits/chosen": -2.0967721939086914, "logits/rejected": -2.0989162921905518, "logps/chosen": -3.3065848350524902, "logps/rejected": -0.8773816227912903, "loss": 0.6112, "rewards/accuracies": 1.0, "rewards/chosen": 1.0125083923339844, "rewards/margins": 0.17127341032028198, "rewards/rejected": 0.8412349820137024, "step": 5445 }, { "epoch": 2.94, "learning_rate": 1.1402699896332912e-10, "logits/chosen": -2.1245927810668945, "logits/rejected": -2.2882492542266846, "logps/chosen": -0.19852066040039062, "logps/rejected": -0.24877813458442688, "loss": 0.6983, "rewards/accuracies": 0.0, "rewards/chosen": 0.924827516078949, "rewards/margins": -0.010186851024627686, "rewards/rejected": 0.9350143671035767, "step": 5446 }, { "epoch": 2.94, "learning_rate": 1.120702218784464e-10, "logits/chosen": -2.0402650833129883, "logits/rejected": -2.0275120735168457, "logps/chosen": -6.024520397186279, "logps/rejected": -0.7118844985961914, "loss": 0.3754, "rewards/accuracies": 1.0, "rewards/chosen": 1.6184101104736328, "rewards/margins": 0.7862156629562378, "rewards/rejected": 0.832194447517395, "step": 5447 }, { "epoch": 2.94, "learning_rate": 1.1013036136384158e-10, "logits/chosen": -2.005192279815674, "logits/rejected": -2.007091522216797, "logps/chosen": -0.19978207349777222, "logps/rejected": -5.66986083984375, "loss": 0.4832, "rewards/accuracies": 1.0, "rewards/chosen": 0.8946650624275208, "rewards/margins": 0.4761245846748352, "rewards/rejected": 0.41854047775268555, "step": 5448 }, { "epoch": 2.94, "learning_rate": 1.082074180772996e-10, "logits/chosen": -2.102407693862915, "logits/rejected": -2.095060348510742, "logps/chosen": -0.5932058095932007, "logps/rejected": -4.014384746551514, "loss": 0.5683, "rewards/accuracies": 1.0, "rewards/chosen": 0.9438347816467285, "rewards/margins": 0.26743441820144653, "rewards/rejected": 0.676400363445282, "step": 5449 }, { "epoch": 2.94, "learning_rate": 1.0630139267087668e-10, "logits/chosen": -2.1304759979248047, "logits/rejected": -2.111969232559204, "logps/chosen": -1.3981915712356567, "logps/rejected": -8.847735404968262, "loss": 0.4307, "rewards/accuracies": 1.0, "rewards/chosen": 1.2297364473342896, "rewards/margins": 0.6192226409912109, "rewards/rejected": 0.6105138063430786, "step": 5450 }, { "epoch": 2.94, "learning_rate": 1.0441228579089467e-10, "logits/chosen": -2.056354522705078, "logits/rejected": -2.0294859409332275, "logps/chosen": -14.093831062316895, "logps/rejected": -2.6029961109161377, "loss": 0.2868, "rewards/accuracies": 1.0, "rewards/chosen": 1.6111968755722046, "rewards/margins": 1.1021651029586792, "rewards/rejected": 0.5090317726135254, "step": 5451 }, { "epoch": 2.94, "learning_rate": 1.0254009807793008e-10, "logits/chosen": -2.186098575592041, "logits/rejected": -2.2605929374694824, "logps/chosen": -0.26111966371536255, "logps/rejected": -0.28268957138061523, "loss": 0.6921, "rewards/accuracies": 1.0, "rewards/chosen": 0.8826130032539368, "rewards/margins": 0.0021680593490600586, "rewards/rejected": 0.8804449439048767, "step": 5452 }, { "epoch": 2.94, "learning_rate": 1.0068483016682505e-10, "logits/chosen": -2.0323657989501953, "logits/rejected": -2.0259594917297363, "logps/chosen": -13.12497329711914, "logps/rejected": -5.841866493225098, "loss": 0.3267, "rewards/accuracies": 1.0, "rewards/chosen": 1.6987375020980835, "rewards/margins": 0.9509284496307373, "rewards/rejected": 0.7478090524673462, "step": 5453 }, { "epoch": 2.94, "learning_rate": 9.884648268668194e-11, "logits/chosen": -2.0305678844451904, "logits/rejected": -2.0291452407836914, "logps/chosen": -0.9896281957626343, "logps/rejected": -3.8209357261657715, "loss": 0.4763, "rewards/accuracies": 1.0, "rewards/chosen": 0.9597660899162292, "rewards/margins": 0.49418386816978455, "rewards/rejected": 0.4655822217464447, "step": 5454 }, { "epoch": 2.94, "learning_rate": 9.702505626087987e-11, "logits/chosen": -2.0708792209625244, "logits/rejected": -2.281904935836792, "logps/chosen": -0.22503772377967834, "logps/rejected": -0.24945548176765442, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.8759735226631165, "rewards/margins": 0.02626943588256836, "rewards/rejected": 0.8497040867805481, "step": 5455 }, { "epoch": 2.94, "learning_rate": 9.52205515070359e-11, "logits/chosen": -2.0685527324676514, "logits/rejected": -2.219987630844116, "logps/chosen": -0.3704061806201935, "logps/rejected": -0.39067086577415466, "loss": 0.6959, "rewards/accuracies": 0.0, "rewards/chosen": 0.8283419609069824, "rewards/margins": -0.005572497844696045, "rewards/rejected": 0.8339144587516785, "step": 5456 }, { "epoch": 2.94, "learning_rate": 9.343296903704944e-11, "logits/chosen": -2.1605453491210938, "logits/rejected": -2.3192954063415527, "logps/chosen": -0.23251569271087646, "logps/rejected": -0.24046236276626587, "loss": 0.6873, "rewards/accuracies": 1.0, "rewards/chosen": 0.9458374977111816, "rewards/margins": 0.011707305908203125, "rewards/rejected": 0.9341301918029785, "step": 5457 }, { "epoch": 2.94, "learning_rate": 9.166230945706899e-11, "logits/chosen": -2.030421495437622, "logits/rejected": -2.3373234272003174, "logps/chosen": -8.680564880371094, "logps/rejected": -9.202055931091309, "loss": 0.6662, "rewards/accuracies": 1.0, "rewards/chosen": 0.47641879320144653, "rewards/margins": 0.054550379514694214, "rewards/rejected": 0.4218684136867523, "step": 5458 }, { "epoch": 2.94, "learning_rate": 8.990857336751978e-11, "logits/chosen": -2.066685676574707, "logits/rejected": -2.2803211212158203, "logps/chosen": -8.304450988769531, "logps/rejected": -8.665534019470215, "loss": 0.6697, "rewards/accuracies": 1.0, "rewards/chosen": 0.5619426965713501, "rewards/margins": 0.04743319749832153, "rewards/rejected": 0.5145094990730286, "step": 5459 }, { "epoch": 2.94, "learning_rate": 8.817176136306503e-11, "logits/chosen": -2.0447826385498047, "logits/rejected": -2.0404741764068604, "logps/chosen": -2.4194912910461426, "logps/rejected": -7.026491641998291, "loss": 0.4336, "rewards/accuracies": 1.0, "rewards/chosen": 1.256945252418518, "rewards/margins": 0.6110873818397522, "rewards/rejected": 0.6458578705787659, "step": 5460 }, { "epoch": 2.95, "learning_rate": 8.645187403265586e-11, "logits/chosen": -2.0915613174438477, "logits/rejected": -2.236002206802368, "logps/chosen": -3.586027145385742, "logps/rejected": -3.4853289127349854, "loss": 0.6779, "rewards/accuracies": 1.0, "rewards/chosen": 0.5940572023391724, "rewards/margins": 0.030728816986083984, "rewards/rejected": 0.5633283853530884, "step": 5461 }, { "epoch": 2.95, "learning_rate": 8.474891195948686e-11, "logits/chosen": -2.092533588409424, "logits/rejected": -2.100611925125122, "logps/chosen": -0.9375976324081421, "logps/rejected": -11.754863739013672, "loss": 0.4899, "rewards/accuracies": 1.0, "rewards/chosen": 0.9483184814453125, "rewards/margins": 0.4587331712245941, "rewards/rejected": 0.4895853102207184, "step": 5462 }, { "epoch": 2.95, "learning_rate": 8.306287572101278e-11, "logits/chosen": -1.9823787212371826, "logits/rejected": -1.9825239181518555, "logps/chosen": -0.6542789936065674, "logps/rejected": -2.116724967956543, "loss": 0.6247, "rewards/accuracies": 1.0, "rewards/chosen": 0.9893485307693481, "rewards/margins": 0.14190912246704102, "rewards/rejected": 0.8474394083023071, "step": 5463 }, { "epoch": 2.95, "learning_rate": 8.139376588896519e-11, "logits/chosen": -2.132066011428833, "logits/rejected": -2.1340432167053223, "logps/chosen": -0.6424176692962646, "logps/rejected": -7.780567646026611, "loss": 0.4196, "rewards/accuracies": 1.0, "rewards/chosen": 0.9467995762825012, "rewards/margins": 0.6514198780059814, "rewards/rejected": 0.2953796982765198, "step": 5464 }, { "epoch": 2.95, "learning_rate": 7.974158302931355e-11, "logits/chosen": -2.0582237243652344, "logits/rejected": -2.3057780265808105, "logps/chosen": -1.3300961256027222, "logps/rejected": -0.8334282636642456, "loss": 0.7162, "rewards/accuracies": 0.0, "rewards/chosen": 0.7162405848503113, "rewards/margins": -0.04567551612854004, "rewards/rejected": 0.7619161009788513, "step": 5465 }, { "epoch": 2.95, "learning_rate": 7.810632770230419e-11, "logits/chosen": -2.070167303085327, "logits/rejected": -2.076662302017212, "logps/chosen": -0.44446367025375366, "logps/rejected": -7.958127975463867, "loss": 0.3748, "rewards/accuracies": 1.0, "rewards/chosen": 1.1780470609664917, "rewards/margins": 0.7879865169525146, "rewards/rejected": 0.39006051421165466, "step": 5466 }, { "epoch": 2.95, "learning_rate": 7.648800046243242e-11, "logits/chosen": -2.0164408683776855, "logits/rejected": -2.2958590984344482, "logps/chosen": -0.5945357084274292, "logps/rejected": -0.6467791199684143, "loss": 0.678, "rewards/accuracies": 1.0, "rewards/chosen": 0.8600139617919922, "rewards/margins": 0.03046518564224243, "rewards/rejected": 0.8295487761497498, "step": 5467 }, { "epoch": 2.95, "learning_rate": 7.48866018584704e-11, "logits/chosen": -2.1616151332855225, "logits/rejected": -2.308131694793701, "logps/chosen": -0.6554215550422668, "logps/rejected": -0.6436368823051453, "loss": 0.6854, "rewards/accuracies": 1.0, "rewards/chosen": 0.9234381914138794, "rewards/margins": 0.01550436019897461, "rewards/rejected": 0.9079338312149048, "step": 5468 }, { "epoch": 2.95, "learning_rate": 7.330213243343375e-11, "logits/chosen": -2.033554792404175, "logits/rejected": -2.282423496246338, "logps/chosen": -0.32854390144348145, "logps/rejected": -0.3166026473045349, "loss": 0.6681, "rewards/accuracies": 1.0, "rewards/chosen": 0.9947213530540466, "rewards/margins": 0.0507504940032959, "rewards/rejected": 0.9439708590507507, "step": 5469 }, { "epoch": 2.95, "learning_rate": 7.173459272459826e-11, "logits/chosen": -1.9692806005477905, "logits/rejected": -2.2511985301971436, "logps/chosen": -2.2376821041107178, "logps/rejected": -2.245208501815796, "loss": 0.6801, "rewards/accuracies": 1.0, "rewards/chosen": 0.7084251642227173, "rewards/margins": 0.026344478130340576, "rewards/rejected": 0.6820806860923767, "step": 5470 }, { "epoch": 2.95, "learning_rate": 7.018398326350539e-11, "logits/chosen": -2.064284324645996, "logits/rejected": -2.27681303024292, "logps/chosen": -0.6536677479743958, "logps/rejected": -0.7316466569900513, "loss": 0.6927, "rewards/accuracies": 1.0, "rewards/chosen": 1.0527641773223877, "rewards/margins": 0.0009733438491821289, "rewards/rejected": 1.0517908334732056, "step": 5471 }, { "epoch": 2.95, "learning_rate": 6.865030457595123e-11, "logits/chosen": -2.096705198287964, "logits/rejected": -2.3132362365722656, "logps/chosen": -0.4333213269710541, "logps/rejected": -0.3969229459762573, "loss": 0.6875, "rewards/accuracies": 1.0, "rewards/chosen": 1.01606023311615, "rewards/margins": 0.01129305362701416, "rewards/rejected": 1.0047671794891357, "step": 5472 }, { "epoch": 2.95, "learning_rate": 6.713355718200309e-11, "logits/chosen": -2.1164472103118896, "logits/rejected": -2.1196084022521973, "logps/chosen": -2.1483633518218994, "logps/rejected": -7.000349998474121, "loss": 0.3936, "rewards/accuracies": 1.0, "rewards/chosen": 1.407422423362732, "rewards/margins": 0.7291021943092346, "rewards/rejected": 0.6783202290534973, "step": 5473 }, { "epoch": 2.95, "learning_rate": 6.563374159596624e-11, "logits/chosen": -2.085054397583008, "logits/rejected": -2.0871481895446777, "logps/chosen": -0.3399609923362732, "logps/rejected": -3.591507911682129, "loss": 0.4812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0853278636932373, "rewards/margins": 0.4813389182090759, "rewards/rejected": 0.6039889454841614, "step": 5474 }, { "epoch": 2.95, "learning_rate": 6.415085832642275e-11, "logits/chosen": -2.1700963973999023, "logits/rejected": -2.2835640907287598, "logps/chosen": -1.7147488594055176, "logps/rejected": -1.3312269449234009, "loss": 0.754, "rewards/accuracies": 0.0, "rewards/chosen": 0.907845139503479, "rewards/margins": -0.11822724342346191, "rewards/rejected": 1.026072382926941, "step": 5475 }, { "epoch": 2.95, "learning_rate": 6.268490787619818e-11, "logits/chosen": -2.162236213684082, "logits/rejected": -2.352725028991699, "logps/chosen": -14.445127487182617, "logps/rejected": -9.781105041503906, "loss": 0.5051, "rewards/accuracies": 1.0, "rewards/chosen": 1.1860401630401611, "rewards/margins": 0.4197710156440735, "rewards/rejected": 0.7662691473960876, "step": 5476 }, { "epoch": 2.95, "learning_rate": 6.123589074238933e-11, "logits/chosen": -1.9791525602340698, "logits/rejected": -1.9793013334274292, "logps/chosen": -1.2122434377670288, "logps/rejected": -1.0338190793991089, "loss": 0.6591, "rewards/accuracies": 1.0, "rewards/chosen": 0.9150785803794861, "rewards/margins": 0.0692022442817688, "rewards/rejected": 0.8458763360977173, "step": 5477 }, { "epoch": 2.95, "learning_rate": 5.98038074163476e-11, "logits/chosen": -2.034301996231079, "logits/rejected": -2.2512197494506836, "logps/chosen": -5.210613250732422, "logps/rejected": -5.170252799987793, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.37829849123954773, "rewards/margins": 0.039442747831344604, "rewards/rejected": 0.3388557434082031, "step": 5478 }, { "epoch": 2.96, "learning_rate": 5.83886583836679e-11, "logits/chosen": -2.207041025161743, "logits/rejected": -2.337038040161133, "logps/chosen": -0.9311882853507996, "logps/rejected": -1.971146821975708, "loss": 0.6605, "rewards/accuracies": 1.0, "rewards/chosen": 1.0276658535003662, "rewards/margins": 0.06633973121643066, "rewards/rejected": 0.9613261222839355, "step": 5479 }, { "epoch": 2.96, "learning_rate": 5.699044412423304e-11, "logits/chosen": -1.962354063987732, "logits/rejected": -1.9750398397445679, "logps/chosen": -5.911581993103027, "logps/rejected": -5.002225399017334, "loss": 0.3449, "rewards/accuracies": 1.0, "rewards/chosen": 1.5128756761550903, "rewards/margins": 0.8870173096656799, "rewards/rejected": 0.6258583664894104, "step": 5480 }, { "epoch": 2.96, "learning_rate": 5.5609165112152636e-11, "logits/chosen": -2.1432785987854004, "logits/rejected": -2.2771997451782227, "logps/chosen": -1.336726188659668, "logps/rejected": -1.447092890739441, "loss": 0.6864, "rewards/accuracies": 1.0, "rewards/chosen": 1.0079349279403687, "rewards/margins": 0.013489305973052979, "rewards/rejected": 0.9944456219673157, "step": 5481 }, { "epoch": 2.96, "learning_rate": 5.4244821815807586e-11, "logits/chosen": -1.9449892044067383, "logits/rejected": -2.2599294185638428, "logps/chosen": -0.6612622737884521, "logps/rejected": -0.776386022567749, "loss": 0.6601, "rewards/accuracies": 1.0, "rewards/chosen": 1.010067105293274, "rewards/margins": 0.06725245714187622, "rewards/rejected": 0.9428146481513977, "step": 5482 }, { "epoch": 2.96, "learning_rate": 5.289741469784448e-11, "logits/chosen": -2.044076681137085, "logits/rejected": -2.1825921535491943, "logps/chosen": -0.2420523762702942, "logps/rejected": -0.3087059557437897, "loss": 0.6901, "rewards/accuracies": 1.0, "rewards/chosen": 0.7781294584274292, "rewards/margins": 0.006179928779602051, "rewards/rejected": 0.7719495296478271, "step": 5483 }, { "epoch": 2.96, "learning_rate": 5.1566944215142294e-11, "logits/chosen": -1.9459216594696045, "logits/rejected": -1.945383906364441, "logps/chosen": -0.8253061771392822, "logps/rejected": -1.4606696367263794, "loss": 0.6159, "rewards/accuracies": 1.0, "rewards/chosen": 0.9493274092674255, "rewards/margins": 0.16088008880615234, "rewards/rejected": 0.7884473204612732, "step": 5484 }, { "epoch": 2.96, "learning_rate": 5.025341081886791e-11, "logits/chosen": -2.1706953048706055, "logits/rejected": -2.306173801422119, "logps/chosen": -3.048678398132324, "logps/rejected": -2.5087387561798096, "loss": 0.7049, "rewards/accuracies": 0.0, "rewards/chosen": 0.5921998023986816, "rewards/margins": -0.023412227630615234, "rewards/rejected": 0.6156120300292969, "step": 5485 }, { "epoch": 2.96, "learning_rate": 4.895681495442061e-11, "logits/chosen": -2.0560061931610107, "logits/rejected": -2.2563579082489014, "logps/chosen": -0.3362470269203186, "logps/rejected": -0.36736738681793213, "loss": 0.6845, "rewards/accuracies": 1.0, "rewards/chosen": 0.7892040610313416, "rewards/margins": 0.017309725284576416, "rewards/rejected": 0.7718943357467651, "step": 5486 }, { "epoch": 2.96, "learning_rate": 4.767715706146536e-11, "logits/chosen": -2.104140281677246, "logits/rejected": -2.2846555709838867, "logps/chosen": -1.1266534328460693, "logps/rejected": -9.830048561096191, "loss": 0.6339, "rewards/accuracies": 1.0, "rewards/chosen": 1.0521296262741089, "rewards/margins": 0.12214505672454834, "rewards/rejected": 0.9299845695495605, "step": 5487 }, { "epoch": 2.96, "learning_rate": 4.6414437573921716e-11, "logits/chosen": -2.018233060836792, "logits/rejected": -2.0251288414001465, "logps/chosen": -7.7527337074279785, "logps/rejected": -1.5320796966552734, "loss": 0.6052, "rewards/accuracies": 1.0, "rewards/chosen": 1.2679933309555054, "rewards/margins": 0.1843729019165039, "rewards/rejected": 1.0836204290390015, "step": 5488 }, { "epoch": 2.96, "learning_rate": 4.516865691996941e-11, "logits/chosen": -1.9858890771865845, "logits/rejected": -2.3061187267303467, "logps/chosen": -0.47820162773132324, "logps/rejected": -0.49941518902778625, "loss": 0.6828, "rewards/accuracies": 1.0, "rewards/chosen": 0.9318792223930359, "rewards/margins": 0.02073049545288086, "rewards/rejected": 0.911148726940155, "step": 5489 }, { "epoch": 2.96, "learning_rate": 4.3939815522048285e-11, "logits/chosen": -2.094428062438965, "logits/rejected": -2.3113696575164795, "logps/chosen": -3.6862637996673584, "logps/rejected": -12.243343353271484, "loss": 0.7924, "rewards/accuracies": 0.0, "rewards/chosen": 0.9621757864952087, "rewards/margins": -0.1895652413368225, "rewards/rejected": 1.1517410278320312, "step": 5490 }, { "epoch": 2.96, "learning_rate": 4.272791379683616e-11, "logits/chosen": -2.049522638320923, "logits/rejected": -2.02871036529541, "logps/chosen": -14.061325073242188, "logps/rejected": -4.953301429748535, "loss": 0.4324, "rewards/accuracies": 1.0, "rewards/chosen": 1.720838189125061, "rewards/margins": 0.614303708076477, "rewards/rejected": 1.106534481048584, "step": 5491 }, { "epoch": 2.96, "learning_rate": 4.1532952155287627e-11, "logits/chosen": -1.948517084121704, "logits/rejected": -2.25093150138855, "logps/chosen": -0.32713863253593445, "logps/rejected": -0.32989993691444397, "loss": 0.6773, "rewards/accuracies": 1.0, "rewards/chosen": 0.8523105978965759, "rewards/margins": 0.03192025423049927, "rewards/rejected": 0.8203903436660767, "step": 5492 }, { "epoch": 2.96, "learning_rate": 4.035493100260079e-11, "logits/chosen": -2.1111276149749756, "logits/rejected": -2.24009370803833, "logps/chosen": -2.956261396408081, "logps/rejected": -3.079420328140259, "loss": 0.6558, "rewards/accuracies": 1.0, "rewards/chosen": 0.642297089099884, "rewards/margins": 0.07624399662017822, "rewards/rejected": 0.5660530924797058, "step": 5493 }, { "epoch": 2.96, "learning_rate": 3.919385073822834e-11, "logits/chosen": -2.1754519939422607, "logits/rejected": -2.1735036373138428, "logps/chosen": -6.578312397003174, "logps/rejected": -3.580374002456665, "loss": 0.2349, "rewards/accuracies": 1.0, "rewards/chosen": 1.835531234741211, "rewards/margins": 1.3290810585021973, "rewards/rejected": 0.5064501166343689, "step": 5494 }, { "epoch": 2.96, "learning_rate": 3.804971175588867e-11, "logits/chosen": -1.981898307800293, "logits/rejected": -2.2572922706604004, "logps/chosen": -0.4626365900039673, "logps/rejected": -0.47390690445899963, "loss": 0.6954, "rewards/accuracies": 0.0, "rewards/chosen": 0.8839744925498962, "rewards/margins": -0.004433155059814453, "rewards/rejected": 0.8884076476097107, "step": 5495 }, { "epoch": 2.96, "learning_rate": 3.6922514443549215e-11, "logits/chosen": -2.149343490600586, "logits/rejected": -2.1513280868530273, "logps/chosen": -1.9863909482955933, "logps/rejected": -4.511040687561035, "loss": 0.2656, "rewards/accuracies": 1.0, "rewards/chosen": 1.7023805379867554, "rewards/margins": 1.1902003288269043, "rewards/rejected": 0.5121802687644958, "step": 5496 }, { "epoch": 2.96, "learning_rate": 3.581225918342645e-11, "logits/chosen": -2.1861605644226074, "logits/rejected": -2.1743202209472656, "logps/chosen": -3.3418660163879395, "logps/rejected": -10.361276626586914, "loss": 0.5716, "rewards/accuracies": 1.0, "rewards/chosen": 0.8630487322807312, "rewards/margins": 0.25997692346572876, "rewards/rejected": 0.6030718088150024, "step": 5497 }, { "epoch": 2.97, "learning_rate": 3.4718946352008115e-11, "logits/chosen": -2.1154961585998535, "logits/rejected": -2.1198596954345703, "logps/chosen": -3.5620462894439697, "logps/rejected": -0.32217124104499817, "loss": 0.5836, "rewards/accuracies": 1.0, "rewards/chosen": 1.2714859247207642, "rewards/margins": 0.23257839679718018, "rewards/rejected": 1.038907527923584, "step": 5498 }, { "epoch": 2.97, "learning_rate": 3.3642576320019875e-11, "logits/chosen": -2.016923666000366, "logits/rejected": -2.0232574939727783, "logps/chosen": -3.9675397872924805, "logps/rejected": -6.792712211608887, "loss": 0.4084, "rewards/accuracies": 1.0, "rewards/chosen": 0.9184269905090332, "rewards/margins": 0.6842308044433594, "rewards/rejected": 0.23419618606567383, "step": 5499 }, { "epoch": 2.97, "learning_rate": 3.258314945245311e-11, "logits/chosen": -2.2976133823394775, "logits/rejected": -2.148282527923584, "logps/chosen": -29.538490295410156, "logps/rejected": -3.858036518096924, "loss": 0.1936, "rewards/accuracies": 1.0, "rewards/chosen": 1.9375015497207642, "rewards/margins": 1.5434303283691406, "rewards/rejected": 0.3940712511539459, "step": 5500 }, { "epoch": 2.97, "learning_rate": 3.154066610854822e-11, "logits/chosen": -2.100517749786377, "logits/rejected": -2.1008269786834717, "logps/chosen": -3.888831615447998, "logps/rejected": -2.1876726150512695, "loss": 0.3412, "rewards/accuracies": 1.0, "rewards/chosen": 1.4962339401245117, "rewards/margins": 0.8999550938606262, "rewards/rejected": 0.5962788462638855, "step": 5501 }, { "epoch": 2.97, "learning_rate": 3.051512664180578e-11, "logits/chosen": -1.9640480279922485, "logits/rejected": -1.9722130298614502, "logps/chosen": -0.39854949712753296, "logps/rejected": -11.889822959899902, "loss": 0.3195, "rewards/accuracies": 1.0, "rewards/chosen": 1.046906590461731, "rewards/margins": 0.9770553708076477, "rewards/rejected": 0.06985121220350266, "step": 5502 }, { "epoch": 2.97, "learning_rate": 2.950653139997539e-11, "logits/chosen": -2.1345956325531006, "logits/rejected": -2.1641643047332764, "logps/chosen": -8.11379337310791, "logps/rejected": -16.788328170776367, "loss": 0.3792, "rewards/accuracies": 1.0, "rewards/chosen": 1.3035815954208374, "rewards/margins": 0.7742637395858765, "rewards/rejected": 0.5293178558349609, "step": 5503 }, { "epoch": 2.97, "learning_rate": 2.8514880725061253e-11, "logits/chosen": -2.161255359649658, "logits/rejected": -2.1631741523742676, "logps/chosen": -3.833855628967285, "logps/rejected": -3.445716381072998, "loss": 0.4972, "rewards/accuracies": 1.0, "rewards/chosen": 1.0521680116653442, "rewards/margins": 0.4399356245994568, "rewards/rejected": 0.6122323870658875, "step": 5504 }, { "epoch": 2.97, "learning_rate": 2.754017495332217e-11, "logits/chosen": -2.0482852458953857, "logits/rejected": -2.266303777694702, "logps/chosen": -0.2933984696865082, "logps/rejected": -0.28100699186325073, "loss": 0.686, "rewards/accuracies": 1.0, "rewards/chosen": 0.9623128771781921, "rewards/margins": 0.014389514923095703, "rewards/rejected": 0.9479233622550964, "step": 5505 }, { "epoch": 2.97, "learning_rate": 2.6582414415282638e-11, "logits/chosen": -2.069249153137207, "logits/rejected": -2.264554500579834, "logps/chosen": -0.44983527064323425, "logps/rejected": -0.3967593312263489, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.81775963306427, "rewards/margins": 0.016830742359161377, "rewards/rejected": 0.8009288907051086, "step": 5506 }, { "epoch": 2.97, "learning_rate": 2.5641599435699545e-11, "logits/chosen": -2.0997939109802246, "logits/rejected": -2.302992820739746, "logps/chosen": -0.8427666425704956, "logps/rejected": -0.836613655090332, "loss": 0.6893, "rewards/accuracies": 1.0, "rewards/chosen": 0.8981899619102478, "rewards/margins": 0.007749497890472412, "rewards/rejected": 0.8904404640197754, "step": 5507 }, { "epoch": 2.97, "learning_rate": 2.4717730333601028e-11, "logits/chosen": -1.9671034812927246, "logits/rejected": -2.245015859603882, "logps/chosen": -4.37495756149292, "logps/rejected": -4.582000732421875, "loss": 0.6707, "rewards/accuracies": 1.0, "rewards/chosen": 0.8100792169570923, "rewards/margins": 0.0454026460647583, "rewards/rejected": 0.764676570892334, "step": 5508 }, { "epoch": 2.97, "learning_rate": 2.3810807422258717e-11, "logits/chosen": -1.9960887432098389, "logits/rejected": -1.9905799627304077, "logps/chosen": -1.0483626127243042, "logps/rejected": -4.944915771484375, "loss": 0.4598, "rewards/accuracies": 1.0, "rewards/chosen": 1.0406776666641235, "rewards/margins": 0.5383232235908508, "rewards/rejected": 0.5023544430732727, "step": 5509 }, { "epoch": 2.97, "learning_rate": 2.292083100920994e-11, "logits/chosen": -2.0455615520477295, "logits/rejected": -2.04263973236084, "logps/chosen": -6.222296714782715, "logps/rejected": -2.5184762477874756, "loss": 0.5085, "rewards/accuracies": 1.0, "rewards/chosen": 1.0957274436950684, "rewards/margins": 0.41133546829223633, "rewards/rejected": 0.684391975402832, "step": 5510 }, { "epoch": 2.97, "learning_rate": 2.204780139622997e-11, "logits/chosen": -2.0896823406219482, "logits/rejected": -2.299520492553711, "logps/chosen": -0.15636388957500458, "logps/rejected": -0.24790522456169128, "loss": 0.6812, "rewards/accuracies": 1.0, "rewards/chosen": 1.0255814790725708, "rewards/margins": 0.024123787879943848, "rewards/rejected": 1.001457691192627, "step": 5511 }, { "epoch": 2.97, "learning_rate": 2.1191718879359776e-11, "logits/chosen": -2.0647881031036377, "logits/rejected": -2.3295395374298096, "logps/chosen": -2.1815128326416016, "logps/rejected": -2.2363648414611816, "loss": 0.6905, "rewards/accuracies": 1.0, "rewards/chosen": 0.9837145209312439, "rewards/margins": 0.00528872013092041, "rewards/rejected": 0.9784258008003235, "step": 5512 }, { "epoch": 2.97, "learning_rate": 2.0352583748894925e-11, "logits/chosen": -2.0671565532684326, "logits/rejected": -2.288689374923706, "logps/chosen": -1.701810598373413, "logps/rejected": -1.7870211601257324, "loss": 0.6817, "rewards/accuracies": 1.0, "rewards/chosen": 1.0114096403121948, "rewards/margins": 0.02303779125213623, "rewards/rejected": 0.9883718490600586, "step": 5513 }, { "epoch": 2.97, "learning_rate": 1.9530396289368922e-11, "logits/chosen": -2.136765718460083, "logits/rejected": -2.03420090675354, "logps/chosen": -24.47732925415039, "logps/rejected": -3.3406856060028076, "loss": 0.1916, "rewards/accuracies": 1.0, "rewards/chosen": 2.1427712440490723, "rewards/margins": 1.5548362731933594, "rewards/rejected": 0.5879349708557129, "step": 5514 }, { "epoch": 2.97, "learning_rate": 1.8725156779580974e-11, "logits/chosen": -2.038240432739258, "logits/rejected": -2.2537693977355957, "logps/chosen": -0.39001011848449707, "logps/rejected": -0.3696419894695282, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 0.8849911093711853, "rewards/margins": 0.0395733118057251, "rewards/rejected": 0.8454177975654602, "step": 5515 }, { "epoch": 2.98, "learning_rate": 1.7936865492584886e-11, "logits/chosen": -2.131963014602661, "logits/rejected": -2.1347362995147705, "logps/chosen": -1.1493114233016968, "logps/rejected": -3.7100296020507812, "loss": 0.49, "rewards/accuracies": 1.0, "rewards/chosen": 1.0465103387832642, "rewards/margins": 0.4584236145019531, "rewards/rejected": 0.588086724281311, "step": 5516 }, { "epoch": 2.98, "learning_rate": 1.7165522695683498e-11, "logits/chosen": -2.2796437740325928, "logits/rejected": -2.1924891471862793, "logps/chosen": -39.862388610839844, "logps/rejected": -6.793581962585449, "loss": 0.1836, "rewards/accuracies": 1.0, "rewards/chosen": 2.28874135017395, "rewards/margins": 1.6018421649932861, "rewards/rejected": 0.6868991255760193, "step": 5517 }, { "epoch": 2.98, "learning_rate": 1.6411128650428708e-11, "logits/chosen": -2.0920522212982178, "logits/rejected": -2.273447036743164, "logps/chosen": -0.4707072973251343, "logps/rejected": -0.5690633654594421, "loss": 0.6881, "rewards/accuracies": 1.0, "rewards/chosen": 0.8543772101402283, "rewards/margins": 0.010128438472747803, "rewards/rejected": 0.8442487716674805, "step": 5518 }, { "epoch": 2.98, "learning_rate": 1.5673683612627e-11, "logits/chosen": -2.066917896270752, "logits/rejected": -2.284492015838623, "logps/chosen": -0.678758978843689, "logps/rejected": -0.6118987202644348, "loss": 0.6735, "rewards/accuracies": 1.0, "rewards/chosen": 1.0279712677001953, "rewards/margins": 0.03975635766983032, "rewards/rejected": 0.988214910030365, "step": 5519 }, { "epoch": 2.98, "learning_rate": 1.4953187832350555e-11, "logits/chosen": -2.12760591506958, "logits/rejected": -2.134913206100464, "logps/chosen": -1.632530689239502, "logps/rejected": -4.904840469360352, "loss": 0.4692, "rewards/accuracies": 1.0, "rewards/chosen": 1.0422013998031616, "rewards/margins": 0.5128580927848816, "rewards/rejected": 0.52934330701828, "step": 5520 }, { "epoch": 2.98, "learning_rate": 1.4249641553903957e-11, "logits/chosen": -2.1541972160339355, "logits/rejected": -2.160756826400757, "logps/chosen": -3.5239150524139404, "logps/rejected": -3.464751720428467, "loss": 0.5569, "rewards/accuracies": 1.0, "rewards/chosen": 0.9684564471244812, "rewards/margins": 0.29414647817611694, "rewards/rejected": 0.6743099689483643, "step": 5521 }, { "epoch": 2.98, "learning_rate": 1.3563045015851926e-11, "logits/chosen": -1.9583677053451538, "logits/rejected": -1.9613419771194458, "logps/chosen": -2.7515549659729004, "logps/rejected": -0.6059789657592773, "loss": 0.6385, "rewards/accuracies": 1.0, "rewards/chosen": 1.096299409866333, "rewards/margins": 0.11236625909805298, "rewards/rejected": 0.98393315076828, "step": 5522 }, { "epoch": 2.98, "learning_rate": 1.2893398451024884e-11, "logits/chosen": -2.1138265132904053, "logits/rejected": -2.1115691661834717, "logps/chosen": -1.2515413761138916, "logps/rejected": -2.363532066345215, "loss": 0.6263, "rewards/accuracies": 1.0, "rewards/chosen": 1.1496317386627197, "rewards/margins": 0.13859069347381592, "rewards/rejected": 1.0110410451889038, "step": 5523 }, { "epoch": 2.98, "learning_rate": 1.2240702086480092e-11, "logits/chosen": -2.0654280185699463, "logits/rejected": -2.0554702281951904, "logps/chosen": -4.049854278564453, "logps/rejected": -6.955839157104492, "loss": 0.3382, "rewards/accuracies": 1.0, "rewards/chosen": 1.2267578840255737, "rewards/margins": 0.9102319478988647, "rewards/rejected": 0.316525936126709, "step": 5524 }, { "epoch": 2.98, "learning_rate": 1.1604956143551614e-11, "logits/chosen": -2.0017919540405273, "logits/rejected": -2.234243869781494, "logps/chosen": -0.6227521896362305, "logps/rejected": -0.8217865228652954, "loss": 0.6848, "rewards/accuracies": 1.0, "rewards/chosen": 0.7936453223228455, "rewards/margins": 0.016773879528045654, "rewards/rejected": 0.7768714427947998, "step": 5525 }, { "epoch": 2.98, "learning_rate": 1.0986160837811453e-11, "logits/chosen": -2.1703951358795166, "logits/rejected": -2.1710045337677, "logps/chosen": -1.3825979232788086, "logps/rejected": -7.934484958648682, "loss": 0.555, "rewards/accuracies": 1.0, "rewards/chosen": 1.2026158571243286, "rewards/margins": 0.29855793714523315, "rewards/rejected": 0.9040579199790955, "step": 5526 }, { "epoch": 2.98, "learning_rate": 1.0384316379086211e-11, "logits/chosen": -2.082058906555176, "logits/rejected": -2.308655023574829, "logps/chosen": -1.4481160640716553, "logps/rejected": -1.343637228012085, "loss": 0.6818, "rewards/accuracies": 1.0, "rewards/chosen": 0.7825579047203064, "rewards/margins": 0.022810757160186768, "rewards/rejected": 0.7597471475601196, "step": 5527 }, { "epoch": 2.98, "learning_rate": 9.799422971462635e-12, "logits/chosen": -2.108227252960205, "logits/rejected": -2.3018996715545654, "logps/chosen": -11.418603897094727, "logps/rejected": -5.783216953277588, "loss": 0.6522, "rewards/accuracies": 1.0, "rewards/chosen": 0.9462066888809204, "rewards/margins": 0.08365482091903687, "rewards/rejected": 0.8625518679618835, "step": 5528 }, { "epoch": 2.98, "learning_rate": 9.231480813265413e-12, "logits/chosen": -2.0874311923980713, "logits/rejected": -2.0892341136932373, "logps/chosen": -1.313817024230957, "logps/rejected": -2.131004810333252, "loss": 0.4942, "rewards/accuracies": 1.0, "rewards/chosen": 1.2702596187591553, "rewards/margins": 0.44764530658721924, "rewards/rejected": 0.822614312171936, "step": 5529 }, { "epoch": 2.98, "learning_rate": 8.680490097084936e-12, "logits/chosen": -1.9824405908584595, "logits/rejected": -2.28483247756958, "logps/chosen": -1.2729328870773315, "logps/rejected": -1.2158012390136719, "loss": 0.6736, "rewards/accuracies": 1.0, "rewards/chosen": 1.095558524131775, "rewards/margins": 0.03950917720794678, "rewards/rejected": 1.0560493469238281, "step": 5530 }, { "epoch": 2.98, "learning_rate": 8.146451009755084e-12, "logits/chosen": -1.954642653465271, "logits/rejected": -2.329468250274658, "logps/chosen": -6.682612419128418, "logps/rejected": -7.097035884857178, "loss": 0.6547, "rewards/accuracies": 1.0, "rewards/chosen": 1.1591109037399292, "rewards/margins": 0.07851791381835938, "rewards/rejected": 1.0805929899215698, "step": 5531 }, { "epoch": 2.98, "learning_rate": 7.629363732358785e-12, "logits/chosen": -2.1302077770233154, "logits/rejected": -2.1290693283081055, "logps/chosen": -0.5031879544258118, "logps/rejected": -2.5136983394622803, "loss": 0.6328, "rewards/accuracies": 1.0, "rewards/chosen": 0.932246208190918, "rewards/margins": 0.12466531991958618, "rewards/rejected": 0.8075808882713318, "step": 5532 }, { "epoch": 2.98, "learning_rate": 7.129228440244661e-12, "logits/chosen": -2.0046346187591553, "logits/rejected": -2.0107245445251465, "logps/chosen": -1.6321214437484741, "logps/rejected": -3.6223037242889404, "loss": 0.4951, "rewards/accuracies": 1.0, "rewards/chosen": 0.9760388731956482, "rewards/margins": 0.4451603293418884, "rewards/rejected": 0.5308785438537598, "step": 5533 }, { "epoch": 2.98, "learning_rate": 6.646045303004832e-12, "logits/chosen": -1.989262580871582, "logits/rejected": -1.9663164615631104, "logps/chosen": -10.429471015930176, "logps/rejected": -3.1756887435913086, "loss": 0.4918, "rewards/accuracies": 1.0, "rewards/chosen": 1.5140289068222046, "rewards/margins": 0.45375335216522217, "rewards/rejected": 1.0602755546569824, "step": 5534 }, { "epoch": 2.99, "learning_rate": 6.179814484469359e-12, "logits/chosen": -2.2627944946289062, "logits/rejected": -2.267031669616699, "logps/chosen": -0.9497478604316711, "logps/rejected": -0.8860072493553162, "loss": 0.6947, "rewards/accuracies": 0.0, "rewards/chosen": 0.9091788530349731, "rewards/margins": -0.003153860569000244, "rewards/rejected": 0.9123327136039734, "step": 5535 }, { "epoch": 2.99, "learning_rate": 5.730536142745102e-12, "logits/chosen": -2.1089916229248047, "logits/rejected": -2.1060800552368164, "logps/chosen": -5.46268367767334, "logps/rejected": -3.822154998779297, "loss": 0.4065, "rewards/accuracies": 1.0, "rewards/chosen": 1.323441982269287, "rewards/margins": 0.6898987889289856, "rewards/rejected": 0.6335431933403015, "step": 5536 }, { "epoch": 2.99, "learning_rate": 5.2982104301768636e-12, "logits/chosen": -2.027268886566162, "logits/rejected": -2.031773805618286, "logps/chosen": -1.2616146802902222, "logps/rejected": -3.269299030303955, "loss": 0.4727, "rewards/accuracies": 1.0, "rewards/chosen": 1.0954738855361938, "rewards/margins": 0.5037733912467957, "rewards/rejected": 0.5917004942893982, "step": 5537 }, { "epoch": 2.99, "learning_rate": 4.8828374933640445e-12, "logits/chosen": -2.086178779602051, "logits/rejected": -2.3488552570343018, "logps/chosen": -2.135552167892456, "logps/rejected": -1.828501582145691, "loss": 0.6761, "rewards/accuracies": 1.0, "rewards/chosen": 1.0832692384719849, "rewards/margins": 0.03435325622558594, "rewards/rejected": 1.048915982246399, "step": 5538 }, { "epoch": 2.99, "learning_rate": 4.484417473149537e-12, "logits/chosen": -2.111675977706909, "logits/rejected": -2.302659034729004, "logps/chosen": -0.4257473349571228, "logps/rejected": -0.43866169452667236, "loss": 0.6951, "rewards/accuracies": 0.0, "rewards/chosen": 0.8391261100769043, "rewards/margins": -0.003914952278137207, "rewards/rejected": 0.8430410623550415, "step": 5539 }, { "epoch": 2.99, "learning_rate": 4.102950504636382e-12, "logits/chosen": -2.004411458969116, "logits/rejected": -2.004478693008423, "logps/chosen": -1.9090297222137451, "logps/rejected": -6.756038665771484, "loss": 0.3773, "rewards/accuracies": 1.0, "rewards/chosen": 1.1583027839660645, "rewards/margins": 0.780243992805481, "rewards/rejected": 0.3780588209629059, "step": 5540 }, { "epoch": 2.99, "learning_rate": 3.738436717176663e-12, "logits/chosen": -2.105703353881836, "logits/rejected": -2.113233804702759, "logps/chosen": -1.4528279304504395, "logps/rejected": -3.0394067764282227, "loss": 0.4363, "rewards/accuracies": 1.0, "rewards/chosen": 1.1325019598007202, "rewards/margins": 0.6032900214195251, "rewards/rejected": 0.5292119383811951, "step": 5541 }, { "epoch": 2.99, "learning_rate": 3.3908762343770604e-12, "logits/chosen": -2.144575357437134, "logits/rejected": -2.1224122047424316, "logps/chosen": -8.605073928833008, "logps/rejected": -5.1268157958984375, "loss": 0.3193, "rewards/accuracies": 1.0, "rewards/chosen": 1.4174846410751343, "rewards/margins": 0.977689266204834, "rewards/rejected": 0.4397954046726227, "step": 5542 }, { "epoch": 2.99, "learning_rate": 3.0602691740932996e-12, "logits/chosen": -2.0529589653015137, "logits/rejected": -2.06844425201416, "logps/chosen": -0.8117408752441406, "logps/rejected": -3.441953659057617, "loss": 0.5916, "rewards/accuracies": 1.0, "rewards/chosen": 1.0125786066055298, "rewards/margins": 0.21449005603790283, "rewards/rejected": 0.798088550567627, "step": 5543 }, { "epoch": 2.99, "learning_rate": 2.7466156484301507e-12, "logits/chosen": -2.154715061187744, "logits/rejected": -2.1602694988250732, "logps/chosen": -1.6730998754501343, "logps/rejected": -3.411175489425659, "loss": 0.5115, "rewards/accuracies": 1.0, "rewards/chosen": 1.0185075998306274, "rewards/margins": 0.40373319387435913, "rewards/rejected": 0.6147744059562683, "step": 5544 }, { "epoch": 2.99, "learning_rate": 2.449915763735877e-12, "logits/chosen": -2.1179072856903076, "logits/rejected": -2.2296674251556396, "logps/chosen": -2.233126163482666, "logps/rejected": -2.2702043056488037, "loss": 0.6805, "rewards/accuracies": 1.0, "rewards/chosen": 0.9521257281303406, "rewards/margins": 0.025532901287078857, "rewards/rejected": 0.9265928268432617, "step": 5545 }, { "epoch": 2.99, "learning_rate": 2.1701696206355423e-12, "logits/chosen": -2.144296407699585, "logits/rejected": -2.326141595840454, "logps/chosen": -8.141840934753418, "logps/rejected": -7.9480204582214355, "loss": 0.462, "rewards/accuracies": 1.0, "rewards/chosen": 1.1899322271347046, "rewards/margins": 0.5324433445930481, "rewards/rejected": 0.6574888825416565, "step": 5546 }, { "epoch": 2.99, "learning_rate": 1.9073773139699488e-12, "logits/chosen": -2.148554801940918, "logits/rejected": -2.2399637699127197, "logps/chosen": -0.2774512767791748, "logps/rejected": -0.2750672399997711, "loss": 0.6851, "rewards/accuracies": 1.0, "rewards/chosen": 0.8730023503303528, "rewards/margins": 0.016136765480041504, "rewards/rejected": 0.8568655848503113, "step": 5547 }, { "epoch": 2.99, "learning_rate": 1.6615389328678009e-12, "logits/chosen": -2.0094621181488037, "logits/rejected": -2.2643094062805176, "logps/chosen": -1.1620733737945557, "logps/rejected": -1.239112138748169, "loss": 0.7042, "rewards/accuracies": 0.0, "rewards/chosen": 0.9863864183425903, "rewards/margins": -0.021898865699768066, "rewards/rejected": 1.0082852840423584, "step": 5548 }, { "epoch": 2.99, "learning_rate": 1.432654560679092e-12, "logits/chosen": -2.041457176208496, "logits/rejected": -2.047262191772461, "logps/chosen": -1.688205361366272, "logps/rejected": -4.294544219970703, "loss": 0.4491, "rewards/accuracies": 1.0, "rewards/chosen": 1.1079787015914917, "rewards/margins": 0.5674502849578857, "rewards/rejected": 0.540528416633606, "step": 5549 }, { "epoch": 2.99, "learning_rate": 1.2207242750195134e-12, "logits/chosen": -2.002405881881714, "logits/rejected": -2.2547833919525146, "logps/chosen": -4.438247203826904, "logps/rejected": -4.4004082679748535, "loss": 0.6804, "rewards/accuracies": 1.0, "rewards/chosen": 0.8505670428276062, "rewards/margins": 0.025579452514648438, "rewards/rejected": 0.8249875903129578, "step": 5550 }, { "epoch": 2.99, "learning_rate": 1.025748147753802e-12, "logits/chosen": -2.255023241043091, "logits/rejected": -2.2093472480773926, "logps/chosen": -34.3206787109375, "logps/rejected": -11.812310218811035, "loss": 0.162, "rewards/accuracies": 1.0, "rewards/chosen": 2.47770619392395, "rewards/margins": 1.7383620738983154, "rewards/rejected": 0.7393441200256348, "step": 5551 }, { "epoch": 2.99, "learning_rate": 8.477262449901878e-13, "logits/chosen": -2.056976556777954, "logits/rejected": -2.289966106414795, "logps/chosen": -1.1674516201019287, "logps/rejected": -1.0996251106262207, "loss": 0.6856, "rewards/accuracies": 1.0, "rewards/chosen": 0.7882742881774902, "rewards/margins": 0.015153169631958008, "rewards/rejected": 0.7731211185455322, "step": 5552 }, { "epoch": 3.0, "learning_rate": 6.866586271081498e-13, "logits/chosen": -2.0445685386657715, "logits/rejected": -2.311767101287842, "logps/chosen": -0.44262948632240295, "logps/rejected": -3.269636631011963, "loss": 0.5261, "rewards/accuracies": 1.0, "rewards/chosen": 0.9436624646186829, "rewards/margins": 0.3677375316619873, "rewards/rejected": 0.5759249329566956, "step": 5553 }, { "epoch": 3.0, "learning_rate": 5.425453487140075e-13, "logits/chosen": -2.0496246814727783, "logits/rejected": -2.2393784523010254, "logps/chosen": -0.3346524238586426, "logps/rejected": -0.33515113592147827, "loss": 0.6742, "rewards/accuracies": 1.0, "rewards/chosen": 0.7102388739585876, "rewards/margins": 0.03827691078186035, "rewards/rejected": 0.6719619631767273, "step": 5554 }, { "epoch": 3.0, "learning_rate": 4.153864586742273e-13, "logits/chosen": -2.047335624694824, "logits/rejected": -2.2454347610473633, "logps/chosen": -0.6564099192619324, "logps/rejected": -0.8252699375152588, "loss": 0.6906, "rewards/accuracies": 1.0, "rewards/chosen": 0.9361461997032166, "rewards/margins": 0.005108356475830078, "rewards/rejected": 0.9310378432273865, "step": 5555 }, { "epoch": 3.0, "learning_rate": 3.051820001154226e-13, "logits/chosen": -2.0574190616607666, "logits/rejected": -2.2098987102508545, "logps/chosen": -0.4185575842857361, "logps/rejected": -0.4142831861972809, "loss": 0.6949, "rewards/accuracies": 0.0, "rewards/chosen": 0.792579710483551, "rewards/margins": -0.003430664539337158, "rewards/rejected": 0.7960103750228882, "step": 5556 }, { "epoch": 3.0, "learning_rate": 2.1193201040214937e-13, "logits/chosen": -1.976366400718689, "logits/rejected": -2.2812094688415527, "logps/chosen": -1.6159573793411255, "logps/rejected": -1.8468106985092163, "loss": 0.6792, "rewards/accuracies": 1.0, "rewards/chosen": 0.6626760363578796, "rewards/margins": 0.02805328369140625, "rewards/rejected": 0.6346227526664734, "step": 5557 }, { "epoch": 3.0, "learning_rate": 1.356365211535593e-13, "logits/chosen": -2.073293924331665, "logits/rejected": -2.0780415534973145, "logps/chosen": -0.5560543537139893, "logps/rejected": -7.36126708984375, "loss": 0.3718, "rewards/accuracies": 1.0, "rewards/chosen": 0.971780002117157, "rewards/margins": 0.7976624965667725, "rewards/rejected": 0.17411747574806213, "step": 5558 }, { "epoch": 3.0, "learning_rate": 7.629555823784884e-14, "logits/chosen": -2.0159425735473633, "logits/rejected": -2.0197160243988037, "logps/chosen": -4.626803874969482, "logps/rejected": -7.069414138793945, "loss": 0.3763, "rewards/accuracies": 1.0, "rewards/chosen": 1.290107011795044, "rewards/margins": 0.7834730744361877, "rewards/rejected": 0.5066339373588562, "step": 5559 }, { "epoch": 3.0, "learning_rate": 3.390914178336146e-14, "logits/chosen": -2.063309907913208, "logits/rejected": -2.0631215572357178, "logps/chosen": -1.512205958366394, "logps/rejected": -0.7094734907150269, "loss": 0.6687, "rewards/accuracies": 1.0, "rewards/chosen": 0.935883641242981, "rewards/margins": 0.04960089921951294, "rewards/rejected": 0.886282742023468, "step": 5560 }, { "epoch": 3.0, "learning_rate": 8.47728616748533e-15, "logits/chosen": -1.9941738843917847, "logits/rejected": -2.224307060241699, "logps/chosen": -1.6401546001434326, "logps/rejected": -1.3683781623840332, "loss": 0.6995, "rewards/accuracies": 0.0, "rewards/chosen": 0.7001371383666992, "rewards/margins": -0.0126875638961792, "rewards/rejected": 0.7128247022628784, "step": 5561 }, { "epoch": 3.0, "learning_rate": 0.0, "logits/chosen": -2.1069791316986084, "logits/rejected": -2.093248128890991, "logps/chosen": -13.854514122009277, "logps/rejected": -6.185429573059082, "loss": 0.2274, "rewards/accuracies": 1.0, "rewards/chosen": 1.7810413837432861, "rewards/margins": 1.3653473854064941, "rewards/rejected": 0.41569405794143677, "step": 5562 }, { "epoch": 3.0, "step": 5562, "total_flos": 0.0, "train_loss": 0.5620730580445225, "train_runtime": 35350.3872, "train_samples_per_second": 0.157, "train_steps_per_second": 0.157 } ], "logging_steps": 1.0, "max_steps": 5562, "num_train_epochs": 3, "save_steps": 200, "total_flos": 0.0, "trial_name": null, "trial_params": null }