{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.0, "eval_steps": 100, "global_step": 2776, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0007204610951008645, "grad_norm": 16.86738074547546, "learning_rate": 1.7985611510791367e-10, "logits/chosen": -1.901450514793396, "logits/rejected": -1.9076323509216309, "logps/chosen": -0.8524526953697205, "logps/rejected": -0.9626365900039673, "loss": 1.6316, "rewards/accuracies": 0.5, "rewards/chosen": -1.704905390739441, "rewards/margins": 0.22036786377429962, "rewards/rejected": -1.9252731800079346, "step": 1 }, { "epoch": 0.007204610951008645, "grad_norm": 20.67220170920981, "learning_rate": 1.7985611510791365e-09, "logits/chosen": -2.020613670349121, "logits/rejected": -2.006347894668579, "logps/chosen": -1.005244255065918, "logps/rejected": -1.1096515655517578, "loss": 1.6546, "rewards/accuracies": 0.5208333134651184, "rewards/chosen": -2.010488510131836, "rewards/margins": 0.20881448686122894, "rewards/rejected": -2.2193031311035156, "step": 10 }, { "epoch": 0.01440922190201729, "grad_norm": 26.108277039722253, "learning_rate": 3.597122302158273e-09, "logits/chosen": -2.0260705947875977, "logits/rejected": -2.022770643234253, "logps/chosen": -1.052295446395874, "logps/rejected": -1.1837208271026611, "loss": 1.6167, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.104590892791748, "rewards/margins": 0.26285091042518616, "rewards/rejected": -2.3674416542053223, "step": 20 }, { "epoch": 0.021613832853025938, "grad_norm": 20.47682519715639, "learning_rate": 5.3956834532374095e-09, "logits/chosen": -1.9848406314849854, "logits/rejected": -1.9775378704071045, "logps/chosen": -1.0540497303009033, "logps/rejected": -1.1514469385147095, "loss": 1.6715, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1080994606018066, "rewards/margins": 0.19479455053806305, "rewards/rejected": -2.302893877029419, "step": 30 }, { "epoch": 0.02881844380403458, "grad_norm": 22.578054082763025, "learning_rate": 7.194244604316546e-09, "logits/chosen": -2.0309205055236816, "logits/rejected": -2.030827045440674, "logps/chosen": -1.0357428789138794, "logps/rejected": -1.1376559734344482, "loss": 1.674, "rewards/accuracies": 0.59375, "rewards/chosen": -2.071485757827759, "rewards/margins": 0.20382657647132874, "rewards/rejected": -2.2753119468688965, "step": 40 }, { "epoch": 0.03602305475504323, "grad_norm": 17.189127890707947, "learning_rate": 8.992805755395683e-09, "logits/chosen": -1.9604355096817017, "logits/rejected": -1.9610908031463623, "logps/chosen": -0.9419905543327332, "logps/rejected": -1.0071475505828857, "loss": 1.7048, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8839811086654663, "rewards/margins": 0.13031414151191711, "rewards/rejected": -2.0142951011657715, "step": 50 }, { "epoch": 0.043227665706051875, "grad_norm": 24.394161121983817, "learning_rate": 1.0791366906474819e-08, "logits/chosen": -2.0403716564178467, "logits/rejected": -2.035911798477173, "logps/chosen": -1.0892378091812134, "logps/rejected": -1.1461578607559204, "loss": 1.7173, "rewards/accuracies": 0.53125, "rewards/chosen": -2.1784756183624268, "rewards/margins": 0.11384035646915436, "rewards/rejected": -2.292315721511841, "step": 60 }, { "epoch": 0.05043227665706052, "grad_norm": 23.080584749878106, "learning_rate": 1.2589928057553956e-08, "logits/chosen": -2.0298831462860107, "logits/rejected": -2.0174343585968018, "logps/chosen": -1.109933614730835, "logps/rejected": -1.2047233581542969, "loss": 1.6667, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.21986722946167, "rewards/margins": 0.18957947194576263, "rewards/rejected": -2.4094467163085938, "step": 70 }, { "epoch": 0.05763688760806916, "grad_norm": 28.510083775511152, "learning_rate": 1.4388489208633092e-08, "logits/chosen": -2.0415005683898926, "logits/rejected": -2.0385377407073975, "logps/chosen": -1.1662975549697876, "logps/rejected": -1.2378699779510498, "loss": 1.7003, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.332595109939575, "rewards/margins": 0.1431449055671692, "rewards/rejected": -2.4757399559020996, "step": 80 }, { "epoch": 0.06484149855907781, "grad_norm": 18.099831598265492, "learning_rate": 1.618705035971223e-08, "logits/chosen": -2.003298044204712, "logits/rejected": -2.004725933074951, "logps/chosen": -1.0415083169937134, "logps/rejected": -1.149029016494751, "loss": 1.6519, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.0830166339874268, "rewards/margins": 0.21504120528697968, "rewards/rejected": -2.298058032989502, "step": 90 }, { "epoch": 0.07204610951008646, "grad_norm": 21.6296417312396, "learning_rate": 1.7985611510791365e-08, "logits/chosen": -2.036734104156494, "logits/rejected": -2.0305848121643066, "logps/chosen": -1.0069749355316162, "logps/rejected": -1.1141220331192017, "loss": 1.654, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0139498710632324, "rewards/margins": 0.21429400146007538, "rewards/rejected": -2.2282440662384033, "step": 100 }, { "epoch": 0.0792507204610951, "grad_norm": 18.402417176588862, "learning_rate": 1.9784172661870502e-08, "logits/chosen": -1.9797817468643188, "logits/rejected": -1.9685176610946655, "logps/chosen": -1.0294291973114014, "logps/rejected": -1.1286334991455078, "loss": 1.6659, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0588583946228027, "rewards/margins": 0.19840820133686066, "rewards/rejected": -2.2572669982910156, "step": 110 }, { "epoch": 0.08645533141210375, "grad_norm": 20.729624339345094, "learning_rate": 2.1582733812949638e-08, "logits/chosen": -1.9758269786834717, "logits/rejected": -1.974029541015625, "logps/chosen": -0.964306652545929, "logps/rejected": -1.0657222270965576, "loss": 1.6486, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.928613305091858, "rewards/margins": 0.20283102989196777, "rewards/rejected": -2.1314444541931152, "step": 120 }, { "epoch": 0.0936599423631124, "grad_norm": 20.17439332595769, "learning_rate": 2.3381294964028775e-08, "logits/chosen": -2.0696139335632324, "logits/rejected": -2.068974733352661, "logps/chosen": -1.0797998905181885, "logps/rejected": -1.1516422033309937, "loss": 1.7012, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.159599781036377, "rewards/margins": 0.14368471503257751, "rewards/rejected": -2.3032844066619873, "step": 130 }, { "epoch": 0.10086455331412104, "grad_norm": 24.064780949371126, "learning_rate": 2.517985611510791e-08, "logits/chosen": -1.9815738201141357, "logits/rejected": -1.9751752614974976, "logps/chosen": -0.9776951670646667, "logps/rejected": -1.1230800151824951, "loss": 1.5974, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9553903341293335, "rewards/margins": 0.2907695770263672, "rewards/rejected": -2.2461600303649902, "step": 140 }, { "epoch": 0.10806916426512968, "grad_norm": 23.00427709241572, "learning_rate": 2.6978417266187048e-08, "logits/chosen": -1.99484121799469, "logits/rejected": -1.9905335903167725, "logps/chosen": -1.0193841457366943, "logps/rejected": -1.1368898153305054, "loss": 1.6404, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0387682914733887, "rewards/margins": 0.23501136898994446, "rewards/rejected": -2.2737796306610107, "step": 150 }, { "epoch": 0.11527377521613832, "grad_norm": 20.432294969630874, "learning_rate": 2.8776978417266184e-08, "logits/chosen": -1.997571587562561, "logits/rejected": -1.9914041757583618, "logps/chosen": -0.947496771812439, "logps/rejected": -1.0964053869247437, "loss": 1.5792, "rewards/accuracies": 0.65625, "rewards/chosen": -1.894993543624878, "rewards/margins": 0.2978169918060303, "rewards/rejected": -2.1928107738494873, "step": 160 }, { "epoch": 0.12247838616714697, "grad_norm": 25.103842505396916, "learning_rate": 3.057553956834532e-08, "logits/chosen": -2.006762981414795, "logits/rejected": -1.9991118907928467, "logps/chosen": -1.0366116762161255, "logps/rejected": -1.1614980697631836, "loss": 1.6344, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.073223352432251, "rewards/margins": 0.2497730255126953, "rewards/rejected": -2.322996139526367, "step": 170 }, { "epoch": 0.12968299711815562, "grad_norm": 26.27968111051558, "learning_rate": 3.237410071942446e-08, "logits/chosen": -2.0409793853759766, "logits/rejected": -2.034149646759033, "logps/chosen": -1.0202006101608276, "logps/rejected": -1.108983039855957, "loss": 1.6865, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0404012203216553, "rewards/margins": 0.177564799785614, "rewards/rejected": -2.217966079711914, "step": 180 }, { "epoch": 0.13688760806916425, "grad_norm": 25.939125627248345, "learning_rate": 3.4172661870503594e-08, "logits/chosen": -2.0743298530578613, "logits/rejected": -2.072180986404419, "logps/chosen": -0.9696714282035828, "logps/rejected": -1.065748929977417, "loss": 1.6537, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9393428564071655, "rewards/margins": 0.19215507805347443, "rewards/rejected": -2.131497859954834, "step": 190 }, { "epoch": 0.1440922190201729, "grad_norm": 26.153404922121894, "learning_rate": 3.597122302158273e-08, "logits/chosen": -2.0394513607025146, "logits/rejected": -2.0364089012145996, "logps/chosen": -1.0258630514144897, "logps/rejected": -1.1529974937438965, "loss": 1.6189, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0517261028289795, "rewards/margins": 0.2542688548564911, "rewards/rejected": -2.305994987487793, "step": 200 }, { "epoch": 0.15129682997118155, "grad_norm": 23.72175833145625, "learning_rate": 3.776978417266187e-08, "logits/chosen": -2.034412384033203, "logits/rejected": -2.0315709114074707, "logps/chosen": -1.073925256729126, "logps/rejected": -1.1507259607315063, "loss": 1.6945, "rewards/accuracies": 0.5625, "rewards/chosen": -2.147850513458252, "rewards/margins": 0.1536014825105667, "rewards/rejected": -2.3014519214630127, "step": 210 }, { "epoch": 0.1585014409221902, "grad_norm": 18.049421620748525, "learning_rate": 3.9568345323741003e-08, "logits/chosen": -1.9837948083877563, "logits/rejected": -1.9797385931015015, "logps/chosen": -1.007852554321289, "logps/rejected": -1.1767760515213013, "loss": 1.5721, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.015705108642578, "rewards/margins": 0.33784690499305725, "rewards/rejected": -2.3535521030426025, "step": 220 }, { "epoch": 0.16570605187319884, "grad_norm": 19.432289400345837, "learning_rate": 4.136690647482014e-08, "logits/chosen": -2.0252606868743896, "logits/rejected": -2.025735378265381, "logps/chosen": -1.0125794410705566, "logps/rejected": -1.1261564493179321, "loss": 1.6379, "rewards/accuracies": 0.625, "rewards/chosen": -2.0251588821411133, "rewards/margins": 0.22715386748313904, "rewards/rejected": -2.2523128986358643, "step": 230 }, { "epoch": 0.1729106628242075, "grad_norm": 25.42149329995876, "learning_rate": 4.3165467625899276e-08, "logits/chosen": -2.0474588871002197, "logits/rejected": -2.042466163635254, "logps/chosen": -1.0612871646881104, "logps/rejected": -1.1391594409942627, "loss": 1.7029, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1225743293762207, "rewards/margins": 0.15574422478675842, "rewards/rejected": -2.2783188819885254, "step": 240 }, { "epoch": 0.18011527377521613, "grad_norm": 21.827295632014227, "learning_rate": 4.496402877697841e-08, "logits/chosen": -1.9690139293670654, "logits/rejected": -1.9651823043823242, "logps/chosen": -1.081837773323059, "logps/rejected": -1.173208236694336, "loss": 1.6762, "rewards/accuracies": 0.5625, "rewards/chosen": -2.163675546646118, "rewards/margins": 0.18274101614952087, "rewards/rejected": -2.346416473388672, "step": 250 }, { "epoch": 0.1873198847262248, "grad_norm": 23.883537952919536, "learning_rate": 4.676258992805755e-08, "logits/chosen": -1.9890559911727905, "logits/rejected": -1.9971050024032593, "logps/chosen": -1.1051918268203735, "logps/rejected": -1.2165734767913818, "loss": 1.6485, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.210383653640747, "rewards/margins": 0.22276310622692108, "rewards/rejected": -2.4331469535827637, "step": 260 }, { "epoch": 0.19452449567723343, "grad_norm": 23.452229796100493, "learning_rate": 4.8561151079136686e-08, "logits/chosen": -2.0651626586914062, "logits/rejected": -2.0570404529571533, "logps/chosen": -1.0715770721435547, "logps/rejected": -1.2007033824920654, "loss": 1.6136, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1431541442871094, "rewards/margins": 0.2582527697086334, "rewards/rejected": -2.401406764984131, "step": 270 }, { "epoch": 0.2017291066282421, "grad_norm": 29.05416335946149, "learning_rate": 4.999992091672379e-08, "logits/chosen": -2.0108678340911865, "logits/rejected": -2.0090978145599365, "logps/chosen": -0.9353054761886597, "logps/rejected": -1.0496169328689575, "loss": 1.6345, "rewards/accuracies": 0.53125, "rewards/chosen": -1.8706109523773193, "rewards/margins": 0.22862282395362854, "rewards/rejected": -2.099233865737915, "step": 280 }, { "epoch": 0.20893371757925072, "grad_norm": 24.746568624535307, "learning_rate": 4.999715305459108e-08, "logits/chosen": -2.0434165000915527, "logits/rejected": -2.045293092727661, "logps/chosen": -1.0135209560394287, "logps/rejected": -1.1082584857940674, "loss": 1.6735, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0270419120788574, "rewards/margins": 0.18947532773017883, "rewards/rejected": -2.2165169715881348, "step": 290 }, { "epoch": 0.21613832853025935, "grad_norm": 23.704466162259774, "learning_rate": 4.9990431528966836e-08, "logits/chosen": -2.0209240913391113, "logits/rejected": -2.012465476989746, "logps/chosen": -1.0895938873291016, "logps/rejected": -1.1909050941467285, "loss": 1.653, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.179187774658203, "rewards/margins": 0.2026222050189972, "rewards/rejected": -2.381810188293457, "step": 300 }, { "epoch": 0.22334293948126802, "grad_norm": 21.29734105171553, "learning_rate": 4.997975740295813e-08, "logits/chosen": -1.9576565027236938, "logits/rejected": -1.9576654434204102, "logps/chosen": -1.0862493515014648, "logps/rejected": -1.172272801399231, "loss": 1.6787, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1724987030029297, "rewards/margins": 0.17204709351062775, "rewards/rejected": -2.344545602798462, "step": 310 }, { "epoch": 0.23054755043227665, "grad_norm": 18.963202937894597, "learning_rate": 4.996513236483331e-08, "logits/chosen": -2.033639907836914, "logits/rejected": -2.025113821029663, "logps/chosen": -1.008597493171692, "logps/rejected": -1.1411330699920654, "loss": 1.623, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.017194986343384, "rewards/margins": 0.26507094502449036, "rewards/rejected": -2.282266139984131, "step": 320 }, { "epoch": 0.2377521613832853, "grad_norm": 18.118259435513053, "learning_rate": 4.9946558727754974e-08, "logits/chosen": -2.013747453689575, "logits/rejected": -2.015979528427124, "logps/chosen": -1.0458552837371826, "logps/rejected": -1.0690838098526, "loss": 1.7903, "rewards/accuracies": 0.46875, "rewards/chosen": -2.0917105674743652, "rewards/margins": 0.04645707830786705, "rewards/rejected": -2.1381676197052, "step": 330 }, { "epoch": 0.24495677233429394, "grad_norm": 21.713334152733406, "learning_rate": 4.9924039429414086e-08, "logits/chosen": -2.0605921745300293, "logits/rejected": -2.0548267364501953, "logps/chosen": -1.0870946645736694, "logps/rejected": -1.1672402620315552, "loss": 1.6817, "rewards/accuracies": 0.5625, "rewards/chosen": -2.174189329147339, "rewards/margins": 0.1602911800146103, "rewards/rejected": -2.3344805240631104, "step": 340 }, { "epoch": 0.2521613832853026, "grad_norm": 21.7230708429387, "learning_rate": 4.989757803156537e-08, "logits/chosen": -1.9891109466552734, "logits/rejected": -1.983432412147522, "logps/chosen": -0.988193690776825, "logps/rejected": -1.115260362625122, "loss": 1.6191, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.97638738155365, "rewards/margins": 0.2541332542896271, "rewards/rejected": -2.230520725250244, "step": 350 }, { "epoch": 0.25936599423631124, "grad_norm": 24.29869182695538, "learning_rate": 4.986717871946393e-08, "logits/chosen": -1.995234727859497, "logits/rejected": -1.9911472797393799, "logps/chosen": -1.0861847400665283, "logps/rejected": -1.202515959739685, "loss": 1.6318, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.1723694801330566, "rewards/margins": 0.23266229033470154, "rewards/rejected": -2.40503191947937, "step": 360 }, { "epoch": 0.2665706051873199, "grad_norm": 21.151241640685495, "learning_rate": 4.983284630120331e-08, "logits/chosen": -2.0005943775177, "logits/rejected": -2.0005276203155518, "logps/chosen": -1.050954818725586, "logps/rejected": -1.180293083190918, "loss": 1.6091, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.101909637451172, "rewards/margins": 0.25867652893066406, "rewards/rejected": -2.360586166381836, "step": 370 }, { "epoch": 0.2737752161383285, "grad_norm": 18.702979404643415, "learning_rate": 4.979458620695505e-08, "logits/chosen": -2.0280709266662598, "logits/rejected": -2.0321407318115234, "logps/chosen": -1.01195228099823, "logps/rejected": -1.0852843523025513, "loss": 1.7127, "rewards/accuracies": 0.5625, "rewards/chosen": -2.02390456199646, "rewards/margins": 0.14666402339935303, "rewards/rejected": -2.1705687046051025, "step": 380 }, { "epoch": 0.28097982708933716, "grad_norm": 18.04619323368983, "learning_rate": 4.975240448810977e-08, "logits/chosen": -2.0287792682647705, "logits/rejected": -2.0225093364715576, "logps/chosen": -1.0217763185501099, "logps/rejected": -1.1489847898483276, "loss": 1.614, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0435526371002197, "rewards/margins": 0.25441741943359375, "rewards/rejected": -2.2979695796966553, "step": 390 }, { "epoch": 0.2881844380403458, "grad_norm": 22.031189167579363, "learning_rate": 4.970630781632009e-08, "logits/chosen": -2.034381628036499, "logits/rejected": -2.034792423248291, "logps/chosen": -0.9954347610473633, "logps/rejected": -1.0486609935760498, "loss": 1.7224, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9908695220947266, "rewards/margins": 0.10645285993814468, "rewards/rejected": -2.0973219871520996, "step": 400 }, { "epoch": 0.2953890489913545, "grad_norm": 21.234234457439968, "learning_rate": 4.965630348244542e-08, "logits/chosen": -2.0295231342315674, "logits/rejected": -2.027569532394409, "logps/chosen": -1.0738043785095215, "logps/rejected": -1.1459261178970337, "loss": 1.7042, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.147608757019043, "rewards/margins": 0.14424346387386322, "rewards/rejected": -2.2918522357940674, "step": 410 }, { "epoch": 0.3025936599423631, "grad_norm": 19.466288881985548, "learning_rate": 4.9602399395398786e-08, "logits/chosen": -2.0115177631378174, "logits/rejected": -2.0157418251037598, "logps/chosen": -1.0445759296417236, "logps/rejected": -1.1231411695480347, "loss": 1.6872, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0891518592834473, "rewards/margins": 0.15713071823120117, "rewards/rejected": -2.2462823390960693, "step": 420 }, { "epoch": 0.30979827089337175, "grad_norm": 21.116060561215924, "learning_rate": 4.95446040808959e-08, "logits/chosen": -1.9870742559432983, "logits/rejected": -1.9879848957061768, "logps/chosen": -1.0581797361373901, "logps/rejected": -1.099675178527832, "loss": 1.7526, "rewards/accuracies": 0.45625001192092896, "rewards/chosen": -2.1163594722747803, "rewards/margins": 0.08299090713262558, "rewards/rejected": -2.199350357055664, "step": 430 }, { "epoch": 0.3170028818443804, "grad_norm": 18.736237528011962, "learning_rate": 4.948292668010676e-08, "logits/chosen": -1.9880409240722656, "logits/rejected": -1.988071084022522, "logps/chosen": -1.0214247703552246, "logps/rejected": -1.1438568830490112, "loss": 1.6286, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.042849540710449, "rewards/margins": 0.2448642998933792, "rewards/rejected": -2.2877137660980225, "step": 440 }, { "epoch": 0.3242074927953891, "grad_norm": 20.43882931641836, "learning_rate": 4.941737694820975e-08, "logits/chosen": -2.0112996101379395, "logits/rejected": -2.0076920986175537, "logps/chosen": -1.144315242767334, "logps/rejected": -1.1844433546066284, "loss": 1.7537, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.288630485534668, "rewards/margins": 0.08025630563497543, "rewards/rejected": -2.368886709213257, "step": 450 }, { "epoch": 0.3314121037463977, "grad_norm": 28.239708991055796, "learning_rate": 4.93479652528488e-08, "logits/chosen": -2.007514476776123, "logits/rejected": -2.0019874572753906, "logps/chosen": -1.1697793006896973, "logps/rejected": -1.2875298261642456, "loss": 1.6351, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.3395586013793945, "rewards/margins": 0.23550114035606384, "rewards/rejected": -2.575059652328491, "step": 460 }, { "epoch": 0.33861671469740634, "grad_norm": 26.337152572859576, "learning_rate": 4.9274702572493555e-08, "logits/chosen": -2.040773868560791, "logits/rejected": -2.028566598892212, "logps/chosen": -1.0992854833602905, "logps/rejected": -1.2018510103225708, "loss": 1.6561, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.198570966720581, "rewards/margins": 0.20513089001178741, "rewards/rejected": -2.4037020206451416, "step": 470 }, { "epoch": 0.345821325648415, "grad_norm": 25.04785849160215, "learning_rate": 4.9197600494702955e-08, "logits/chosen": -2.10146164894104, "logits/rejected": -2.091294765472412, "logps/chosen": -0.9840625524520874, "logps/rejected": -1.106227993965149, "loss": 1.6196, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -1.9681251049041748, "rewards/margins": 0.24433092772960663, "rewards/rejected": -2.212455987930298, "step": 480 }, { "epoch": 0.3530259365994236, "grad_norm": 21.442417002640138, "learning_rate": 4.9116671214292526e-08, "logits/chosen": -2.017040491104126, "logits/rejected": -2.0160226821899414, "logps/chosen": -0.98698490858078, "logps/rejected": -1.1014493703842163, "loss": 1.6301, "rewards/accuracies": 0.625, "rewards/chosen": -1.97396981716156, "rewards/margins": 0.22892877459526062, "rewards/rejected": -2.2028987407684326, "step": 490 }, { "epoch": 0.36023054755043227, "grad_norm": 20.760566848988777, "learning_rate": 4.903192753140557e-08, "logits/chosen": -2.019878387451172, "logits/rejected": -2.0063798427581787, "logps/chosen": -1.0110963582992554, "logps/rejected": -1.1396064758300781, "loss": 1.6125, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0221927165985107, "rewards/margins": 0.2570200562477112, "rewards/rejected": -2.2792129516601562, "step": 500 }, { "epoch": 0.36743515850144093, "grad_norm": 21.964679850201083, "learning_rate": 4.894338284948866e-08, "logits/chosen": -2.088066577911377, "logits/rejected": -2.081502676010132, "logps/chosen": -1.0429285764694214, "logps/rejected": -1.1580368280410767, "loss": 1.6406, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0858571529388428, "rewards/margins": 0.23021626472473145, "rewards/rejected": -2.3160736560821533, "step": 510 }, { "epoch": 0.3746397694524496, "grad_norm": 18.869869897878324, "learning_rate": 4.8851051173171656e-08, "logits/chosen": -2.0141520500183105, "logits/rejected": -2.0126335620880127, "logps/chosen": -1.1057158708572388, "logps/rejected": -1.2025840282440186, "loss": 1.6603, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2114317417144775, "rewards/margins": 0.1937362551689148, "rewards/rejected": -2.405168056488037, "step": 520 }, { "epoch": 0.3818443804034582, "grad_norm": 20.027593416077924, "learning_rate": 4.8754947106052696e-08, "logits/chosen": -1.9756828546524048, "logits/rejected": -1.9658939838409424, "logps/chosen": -0.983010470867157, "logps/rejected": -1.060978651046753, "loss": 1.6924, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.966020941734314, "rewards/margins": 0.15593627095222473, "rewards/rejected": -2.121957302093506, "step": 530 }, { "epoch": 0.38904899135446686, "grad_norm": 25.513748888084603, "learning_rate": 4.865508584838841e-08, "logits/chosen": -2.0021884441375732, "logits/rejected": -1.9952195882797241, "logps/chosen": -1.02981436252594, "logps/rejected": -1.1320369243621826, "loss": 1.6593, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.05962872505188, "rewards/margins": 0.2044449746608734, "rewards/rejected": -2.2640738487243652, "step": 540 }, { "epoch": 0.3962536023054755, "grad_norm": 20.013888186622026, "learning_rate": 4.855148319468979e-08, "logits/chosen": -1.9607187509536743, "logits/rejected": -1.9611743688583374, "logps/chosen": -0.9991506338119507, "logps/rejected": -1.0771278142929077, "loss": 1.7089, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -1.9983012676239014, "rewards/margins": 0.15595442056655884, "rewards/rejected": -2.1542556285858154, "step": 550 }, { "epoch": 0.4034582132564842, "grad_norm": 19.766020419371234, "learning_rate": 4.8444155531224065e-08, "logits/chosen": -2.0277419090270996, "logits/rejected": -2.0282044410705566, "logps/chosen": -1.0792930126190186, "logps/rejected": -1.1801952123641968, "loss": 1.6624, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.158586025238037, "rewards/margins": 0.20180411636829376, "rewards/rejected": -2.3603904247283936, "step": 560 }, { "epoch": 0.4106628242074928, "grad_norm": 18.138026924157977, "learning_rate": 4.833311983342292e-08, "logits/chosen": -2.031890869140625, "logits/rejected": -2.0176992416381836, "logps/chosen": -1.0929630994796753, "logps/rejected": -1.2075343132019043, "loss": 1.6422, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1859261989593506, "rewards/margins": 0.22914230823516846, "rewards/rejected": -2.4150686264038086, "step": 570 }, { "epoch": 0.41786743515850144, "grad_norm": 21.60868876989794, "learning_rate": 4.821839366319768e-08, "logits/chosen": -2.0339295864105225, "logits/rejected": -2.0323901176452637, "logps/chosen": -0.9861429929733276, "logps/rejected": -1.0666794776916504, "loss": 1.6891, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9722859859466553, "rewards/margins": 0.16107279062271118, "rewards/rejected": -2.133358955383301, "step": 580 }, { "epoch": 0.4250720461095101, "grad_norm": 24.05826502769421, "learning_rate": 4.8099995166161536e-08, "logits/chosen": -2.0223276615142822, "logits/rejected": -2.022588014602661, "logps/chosen": -1.0257574319839478, "logps/rejected": -1.1649348735809326, "loss": 1.5961, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0515148639678955, "rewards/margins": 0.27835482358932495, "rewards/rejected": -2.3298697471618652, "step": 590 }, { "epoch": 0.4322766570605187, "grad_norm": 22.540205396639372, "learning_rate": 4.797794306875963e-08, "logits/chosen": -2.0817413330078125, "logits/rejected": -2.0777947902679443, "logps/chosen": -1.032037615776062, "logps/rejected": -1.1749569177627563, "loss": 1.597, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.064075231552124, "rewards/margins": 0.28583866357803345, "rewards/rejected": -2.3499138355255127, "step": 600 }, { "epoch": 0.43948126801152737, "grad_norm": 24.185799296259226, "learning_rate": 4.785225667530716e-08, "logits/chosen": -2.0293679237365723, "logits/rejected": -2.019531488418579, "logps/chosen": -1.0911657810211182, "logps/rejected": -1.1501439809799194, "loss": 1.7163, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.1823315620422363, "rewards/margins": 0.11795620620250702, "rewards/rejected": -2.300287961959839, "step": 610 }, { "epoch": 0.44668587896253603, "grad_norm": 28.363607554135946, "learning_rate": 4.772295586493613e-08, "logits/chosen": -2.0295448303222656, "logits/rejected": -2.0222747325897217, "logps/chosen": -0.9944518804550171, "logps/rejected": -1.082177758216858, "loss": 1.6763, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -1.9889037609100342, "rewards/margins": 0.1754516065120697, "rewards/rejected": -2.164355516433716, "step": 620 }, { "epoch": 0.4538904899135447, "grad_norm": 23.047596709189968, "learning_rate": 4.759006108845116e-08, "logits/chosen": -2.039217472076416, "logits/rejected": -2.0391504764556885, "logps/chosen": -1.0260940790176392, "logps/rejected": -1.1543083190917969, "loss": 1.6159, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -2.0521881580352783, "rewards/margins": 0.2564285397529602, "rewards/rejected": -2.3086166381835938, "step": 630 }, { "epoch": 0.4610951008645533, "grad_norm": 18.761921757090445, "learning_rate": 4.7453593365094926e-08, "logits/chosen": -1.9543355703353882, "logits/rejected": -1.955255150794983, "logps/chosen": -1.0088173151016235, "logps/rejected": -1.116272211074829, "loss": 1.6458, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.017634630203247, "rewards/margins": 0.21490976214408875, "rewards/rejected": -2.232544422149658, "step": 640 }, { "epoch": 0.46829971181556196, "grad_norm": 24.54554910010224, "learning_rate": 4.731357427922361e-08, "logits/chosen": -2.053588390350342, "logits/rejected": -2.038914442062378, "logps/chosen": -1.0363117456436157, "logps/rejected": -1.1169908046722412, "loss": 1.6897, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.0726234912872314, "rewards/margins": 0.1613583266735077, "rewards/rejected": -2.2339816093444824, "step": 650 }, { "epoch": 0.4755043227665706, "grad_norm": 26.37090068258741, "learning_rate": 4.71700259768931e-08, "logits/chosen": -2.032832145690918, "logits/rejected": -2.0338807106018066, "logps/chosen": -1.0859205722808838, "logps/rejected": -1.174154281616211, "loss": 1.6831, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.1718411445617676, "rewards/margins": 0.17646734416484833, "rewards/rejected": -2.348308563232422, "step": 660 }, { "epoch": 0.4827089337175792, "grad_norm": 24.073708164903586, "learning_rate": 4.7022971162356176e-08, "logits/chosen": -2.0062692165374756, "logits/rejected": -1.9969465732574463, "logps/chosen": -1.060937523841858, "logps/rejected": -1.1657707691192627, "loss": 1.6525, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.121875047683716, "rewards/margins": 0.20966656506061554, "rewards/rejected": -2.3315415382385254, "step": 670 }, { "epoch": 0.4899135446685879, "grad_norm": 21.794836443162072, "learning_rate": 4.6872433094471577e-08, "logits/chosen": -1.962633490562439, "logits/rejected": -1.953546166419983, "logps/chosen": -1.0352494716644287, "logps/rejected": -1.1061433553695679, "loss": 1.7057, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0704989433288574, "rewards/margins": 0.1417877972126007, "rewards/rejected": -2.2122867107391357, "step": 680 }, { "epoch": 0.49711815561959655, "grad_norm": 29.899603243129143, "learning_rate": 4.671843558302522e-08, "logits/chosen": -2.025979518890381, "logits/rejected": -2.020822525024414, "logps/chosen": -1.1030082702636719, "logps/rejected": -1.2090809345245361, "loss": 1.6608, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2060165405273438, "rewards/margins": 0.21214473247528076, "rewards/rejected": -2.4181618690490723, "step": 690 }, { "epoch": 0.5043227665706052, "grad_norm": 23.0987112187863, "learning_rate": 4.656100298496439e-08, "logits/chosen": -2.0181777477264404, "logits/rejected": -2.0126051902770996, "logps/chosen": -1.017418622970581, "logps/rejected": -1.1510635614395142, "loss": 1.6139, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.034837245941162, "rewards/margins": 0.267289936542511, "rewards/rejected": -2.3021271228790283, "step": 700 }, { "epoch": 0.5115273775216138, "grad_norm": 21.978629277918778, "learning_rate": 4.640016020054527e-08, "logits/chosen": -1.9824316501617432, "logits/rejected": -1.979188323020935, "logps/chosen": -0.8954793810844421, "logps/rejected": -1.0217931270599365, "loss": 1.6279, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.7909587621688843, "rewards/margins": 0.25262752175331116, "rewards/rejected": -2.043586254119873, "step": 710 }, { "epoch": 0.5187319884726225, "grad_norm": 21.932059439411073, "learning_rate": 4.6235932669394676e-08, "logits/chosen": -2.011847972869873, "logits/rejected": -2.0056471824645996, "logps/chosen": -1.0409430265426636, "logps/rejected": -1.1656509637832642, "loss": 1.6179, "rewards/accuracies": 0.59375, "rewards/chosen": -2.081886053085327, "rewards/margins": 0.24941587448120117, "rewards/rejected": -2.3313019275665283, "step": 720 }, { "epoch": 0.5259365994236311, "grad_norm": 23.306210789596093, "learning_rate": 4.6068346366486325e-08, "logits/chosen": -2.013566732406616, "logits/rejected": -2.002554178237915, "logps/chosen": -1.0203325748443604, "logps/rejected": -1.104425072669983, "loss": 1.6898, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0406651496887207, "rewards/margins": 0.168185293674469, "rewards/rejected": -2.208850145339966, "step": 730 }, { "epoch": 0.5331412103746398, "grad_norm": 20.259951331338133, "learning_rate": 4.589742779803259e-08, "logits/chosen": -1.9962953329086304, "logits/rejected": -2.003087043762207, "logps/chosen": -1.017392635345459, "logps/rejected": -1.0792462825775146, "loss": 1.7292, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -2.034785270690918, "rewards/margins": 0.12370713800191879, "rewards/rejected": -2.1584925651550293, "step": 740 }, { "epoch": 0.5403458213256485, "grad_norm": 26.41919083462638, "learning_rate": 4.5723203997292146e-08, "logits/chosen": -2.014768123626709, "logits/rejected": -2.009498119354248, "logps/chosen": -1.0986969470977783, "logps/rejected": -1.1911667585372925, "loss": 1.6761, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1973938941955566, "rewards/margins": 0.18493981659412384, "rewards/rejected": -2.382333517074585, "step": 750 }, { "epoch": 0.547550432276657, "grad_norm": 22.856174101882264, "learning_rate": 4.554570252029421e-08, "logits/chosen": -1.9752864837646484, "logits/rejected": -1.9790761470794678, "logps/chosen": -1.0023285150527954, "logps/rejected": -1.1136945486068726, "loss": 1.6506, "rewards/accuracies": 0.59375, "rewards/chosen": -2.004657030105591, "rewards/margins": 0.22273211181163788, "rewards/rejected": -2.227389097213745, "step": 760 }, { "epoch": 0.5547550432276657, "grad_norm": 24.809677260951396, "learning_rate": 4.536495144148021e-08, "logits/chosen": -1.9749062061309814, "logits/rejected": -1.9776439666748047, "logps/chosen": -0.974290668964386, "logps/rejected": -1.1291836500167847, "loss": 1.5873, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.948581337928772, "rewards/margins": 0.3097858428955078, "rewards/rejected": -2.2583673000335693, "step": 770 }, { "epoch": 0.5619596541786743, "grad_norm": 25.374366616475292, "learning_rate": 4.518097934926339e-08, "logits/chosen": -1.9943599700927734, "logits/rejected": -1.993017554283142, "logps/chosen": -1.0385878086090088, "logps/rejected": -1.1217256784439087, "loss": 1.681, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0771756172180176, "rewards/margins": 0.16627538204193115, "rewards/rejected": -2.2434513568878174, "step": 780 }, { "epoch": 0.569164265129683, "grad_norm": 20.014037840378, "learning_rate": 4.499381534150714e-08, "logits/chosen": -2.0125200748443604, "logits/rejected": -2.0065932273864746, "logps/chosen": -1.0741461515426636, "logps/rejected": -1.2335011959075928, "loss": 1.5826, "rewards/accuracies": 0.625, "rewards/chosen": -2.148292303085327, "rewards/margins": 0.31871041655540466, "rewards/rejected": -2.4670023918151855, "step": 790 }, { "epoch": 0.5763688760806917, "grad_norm": 21.859452356308726, "learning_rate": 4.48034890209227e-08, "logits/chosen": -2.037662982940674, "logits/rejected": -2.0339465141296387, "logps/chosen": -1.0423095226287842, "logps/rejected": -1.1221187114715576, "loss": 1.6972, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0846190452575684, "rewards/margins": 0.1596187800168991, "rewards/rejected": -2.2442374229431152, "step": 800 }, { "epoch": 0.5835734870317003, "grad_norm": 18.638956420354962, "learning_rate": 4.4610030490387154e-08, "logits/chosen": -2.01869797706604, "logits/rejected": -2.0211963653564453, "logps/chosen": -1.0117915868759155, "logps/rejected": -1.1026709079742432, "loss": 1.6729, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.023583173751831, "rewards/margins": 0.18175864219665527, "rewards/rejected": -2.2053418159484863, "step": 810 }, { "epoch": 0.590778097982709, "grad_norm": 24.338984987337728, "learning_rate": 4.4413470348182124e-08, "logits/chosen": -2.020242214202881, "logits/rejected": -2.0103182792663574, "logps/chosen": -1.0275824069976807, "logps/rejected": -1.1156085729599, "loss": 1.6885, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0551648139953613, "rewards/margins": 0.1760522723197937, "rewards/rejected": -2.2312171459198, "step": 820 }, { "epoch": 0.5979827089337176, "grad_norm": 21.286386599253383, "learning_rate": 4.421383968315427e-08, "logits/chosen": -2.000786781311035, "logits/rejected": -1.9987096786499023, "logps/chosen": -0.9581828117370605, "logps/rejected": -1.079776644706726, "loss": 1.6345, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.916365623474121, "rewards/margins": 0.24318790435791016, "rewards/rejected": -2.159553289413452, "step": 830 }, { "epoch": 0.6051873198847262, "grad_norm": 18.656428501584173, "learning_rate": 4.4011170069798126e-08, "logits/chosen": -2.02819561958313, "logits/rejected": -2.0282022953033447, "logps/chosen": -1.0859931707382202, "logps/rejected": -1.1599055528640747, "loss": 1.7042, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1719863414764404, "rewards/margins": 0.14782461524009705, "rewards/rejected": -2.3198111057281494, "step": 840 }, { "epoch": 0.6123919308357348, "grad_norm": 18.413528164156084, "learning_rate": 4.380549356326208e-08, "logits/chosen": -2.0375325679779053, "logits/rejected": -2.031755208969116, "logps/chosen": -1.0479528903961182, "logps/rejected": -1.1553928852081299, "loss": 1.6564, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0959057807922363, "rewards/margins": 0.21487931907176971, "rewards/rejected": -2.3107857704162598, "step": 850 }, { "epoch": 0.6195965417867435, "grad_norm": 20.693940592783264, "learning_rate": 4.359684269427848e-08, "logits/chosen": -2.009840726852417, "logits/rejected": -2.0057568550109863, "logps/chosen": -1.1601811647415161, "logps/rejected": -1.2186861038208008, "loss": 1.7248, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.3203623294830322, "rewards/margins": 0.11700980365276337, "rewards/rejected": -2.4373722076416016, "step": 860 }, { "epoch": 0.6268011527377522, "grad_norm": 23.036875621985878, "learning_rate": 4.3385250464018355e-08, "logits/chosen": -2.0459847450256348, "logits/rejected": -2.0400002002716064, "logps/chosen": -1.0027254819869995, "logps/rejected": -1.122941017150879, "loss": 1.6293, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.005450963973999, "rewards/margins": 0.2404308319091797, "rewards/rejected": -2.245882034301758, "step": 870 }, { "epoch": 0.6340057636887608, "grad_norm": 23.415166967310086, "learning_rate": 4.3170750338871806e-08, "logits/chosen": -2.034942626953125, "logits/rejected": -2.032872438430786, "logps/chosen": -1.0137414932250977, "logps/rejected": -1.0794751644134521, "loss": 1.7127, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0274829864501953, "rewards/margins": 0.13146750628948212, "rewards/rejected": -2.1589503288269043, "step": 880 }, { "epoch": 0.6412103746397695, "grad_norm": 21.97241223292353, "learning_rate": 4.295337624515485e-08, "logits/chosen": -2.0610389709472656, "logits/rejected": -2.0595154762268066, "logps/chosen": -1.014024019241333, "logps/rejected": -1.1112914085388184, "loss": 1.6585, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.028048038482666, "rewards/margins": 0.1945347934961319, "rewards/rejected": -2.2225828170776367, "step": 890 }, { "epoch": 0.6484149855907781, "grad_norm": 23.8540421436762, "learning_rate": 4.273316256374342e-08, "logits/chosen": -1.9776846170425415, "logits/rejected": -1.9790922403335571, "logps/chosen": -1.14115309715271, "logps/rejected": -1.2139250040054321, "loss": 1.7115, "rewards/accuracies": 0.53125, "rewards/chosen": -2.28230619430542, "rewards/margins": 0.14554361999034882, "rewards/rejected": -2.4278500080108643, "step": 900 }, { "epoch": 0.6556195965417867, "grad_norm": 22.88947778871445, "learning_rate": 4.2510144124635605e-08, "logits/chosen": -1.9872970581054688, "logits/rejected": -1.9910697937011719, "logps/chosen": -1.0412156581878662, "logps/rejected": -1.1080083847045898, "loss": 1.7088, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.0824313163757324, "rewards/margins": 0.1335853785276413, "rewards/rejected": -2.2160167694091797, "step": 910 }, { "epoch": 0.6628242074927954, "grad_norm": 25.19603603966565, "learning_rate": 4.22843562014427e-08, "logits/chosen": -2.016618490219116, "logits/rejected": -2.0112040042877197, "logps/chosen": -0.9714315533638, "logps/rejected": -1.0625754594802856, "loss": 1.675, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9428631067276, "rewards/margins": 0.18228788673877716, "rewards/rejected": -2.1251509189605713, "step": 920 }, { "epoch": 0.670028818443804, "grad_norm": 20.60480228984638, "learning_rate": 4.205583450581023e-08, "logits/chosen": -2.050994873046875, "logits/rejected": -2.0482630729675293, "logps/chosen": -1.0324729681015015, "logps/rejected": -1.1509991884231567, "loss": 1.6261, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.064945936203003, "rewards/margins": 0.23705241084098816, "rewards/rejected": -2.3019983768463135, "step": 930 }, { "epoch": 0.6772334293948127, "grad_norm": 22.35823019195503, "learning_rate": 4.1824615181769577e-08, "logits/chosen": -2.0053532123565674, "logits/rejected": -2.0111613273620605, "logps/chosen": -1.1246191263198853, "logps/rejected": -1.1943109035491943, "loss": 1.6997, "rewards/accuracies": 0.59375, "rewards/chosen": -2.2492382526397705, "rewards/margins": 0.13938355445861816, "rewards/rejected": -2.3886218070983887, "step": 940 }, { "epoch": 0.6844380403458213, "grad_norm": 18.162143432621694, "learning_rate": 4.1590734800021354e-08, "logits/chosen": -1.9661105871200562, "logits/rejected": -1.9702253341674805, "logps/chosen": -1.0093214511871338, "logps/rejected": -1.1327455043792725, "loss": 1.633, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0186429023742676, "rewards/margins": 0.24684815108776093, "rewards/rejected": -2.265491008758545, "step": 950 }, { "epoch": 0.69164265129683, "grad_norm": 23.965182376610645, "learning_rate": 4.1354230352151143e-08, "logits/chosen": -2.041020154953003, "logits/rejected": -2.040239095687866, "logps/chosen": -1.0478591918945312, "logps/rejected": -1.1593987941741943, "loss": 1.6429, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0957183837890625, "rewards/margins": 0.223079115152359, "rewards/rejected": -2.3187975883483887, "step": 960 }, { "epoch": 0.6988472622478387, "grad_norm": 24.464714064797324, "learning_rate": 4.111513924477878e-08, "logits/chosen": -2.043121337890625, "logits/rejected": -2.0391170978546143, "logps/chosen": -0.9660174250602722, "logps/rejected": -1.0939273834228516, "loss": 1.6115, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.9320348501205444, "rewards/margins": 0.2558196187019348, "rewards/rejected": -2.187854766845703, "step": 970 }, { "epoch": 0.7060518731988472, "grad_norm": 20.201261103575792, "learning_rate": 4.087349929364192e-08, "logits/chosen": -1.9678367376327515, "logits/rejected": -1.9675403833389282, "logps/chosen": -1.1067652702331543, "logps/rejected": -1.2355201244354248, "loss": 1.6115, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.2135305404663086, "rewards/margins": 0.25750917196273804, "rewards/rejected": -2.4710402488708496, "step": 980 }, { "epoch": 0.7132564841498559, "grad_norm": 25.954821168667177, "learning_rate": 4.062934871761497e-08, "logits/chosen": -2.0314321517944336, "logits/rejected": -2.0284628868103027, "logps/chosen": -1.1079853773117065, "logps/rejected": -1.2067363262176514, "loss": 1.6694, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.215970754623413, "rewards/margins": 0.19750212132930756, "rewards/rejected": -2.4134726524353027, "step": 990 }, { "epoch": 0.7204610951008645, "grad_norm": 22.341732516108657, "learning_rate": 4.038272613266419e-08, "logits/chosen": -2.0092735290527344, "logits/rejected": -2.0061213970184326, "logps/chosen": -1.00548255443573, "logps/rejected": -1.1272741556167603, "loss": 1.6421, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.01096510887146, "rewards/margins": 0.24358315765857697, "rewards/rejected": -2.2545483112335205, "step": 1000 }, { "epoch": 0.7276657060518732, "grad_norm": 26.857801922416726, "learning_rate": 4.0133670545740014e-08, "logits/chosen": -2.0216879844665527, "logits/rejected": -2.0182127952575684, "logps/chosen": -0.9998480677604675, "logps/rejected": -1.0946764945983887, "loss": 1.6911, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.999696135520935, "rewards/margins": 0.1896568387746811, "rewards/rejected": -2.1893529891967773, "step": 1010 }, { "epoch": 0.7348703170028819, "grad_norm": 20.733064848722005, "learning_rate": 3.988222134860755e-08, "logits/chosen": -2.016014337539673, "logits/rejected": -2.0111701488494873, "logps/chosen": -1.0305395126342773, "logps/rejected": -1.1271092891693115, "loss": 1.6525, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0610790252685547, "rewards/margins": 0.19313934445381165, "rewards/rejected": -2.254218578338623, "step": 1020 }, { "epoch": 0.7420749279538905, "grad_norm": 20.161762859239953, "learning_rate": 3.962841831161617e-08, "logits/chosen": -1.9683122634887695, "logits/rejected": -1.9676278829574585, "logps/chosen": -1.0196747779846191, "logps/rejected": -1.1555341482162476, "loss": 1.6269, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.0393495559692383, "rewards/margins": 0.2717186510562897, "rewards/rejected": -2.311068296432495, "step": 1030 }, { "epoch": 0.7492795389048992, "grad_norm": 20.049038068269237, "learning_rate": 3.937230157740931e-08, "logits/chosen": -2.0240588188171387, "logits/rejected": -2.018101692199707, "logps/chosen": -1.0107640027999878, "logps/rejected": -1.0943820476531982, "loss": 1.6788, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0215280055999756, "rewards/margins": 0.1672360599040985, "rewards/rejected": -2.1887640953063965, "step": 1040 }, { "epoch": 0.7564841498559077, "grad_norm": 21.02175811345582, "learning_rate": 3.9113911654575246e-08, "logits/chosen": -1.967104196548462, "logits/rejected": -1.9632108211517334, "logps/chosen": -0.935411810874939, "logps/rejected": -1.0687172412872314, "loss": 1.6138, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.870823621749878, "rewards/margins": 0.2666108012199402, "rewards/rejected": -2.137434482574463, "step": 1050 }, { "epoch": 0.7636887608069164, "grad_norm": 21.519122032376863, "learning_rate": 3.885328941124014e-08, "logits/chosen": -1.9991518259048462, "logits/rejected": -1.993080496788025, "logps/chosen": -1.0368106365203857, "logps/rejected": -1.1475738286972046, "loss": 1.6499, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0736212730407715, "rewards/margins": 0.2215261161327362, "rewards/rejected": -2.295147657394409, "step": 1060 }, { "epoch": 0.770893371757925, "grad_norm": 23.262898735103672, "learning_rate": 3.8590476068604106e-08, "logits/chosen": -2.00036358833313, "logits/rejected": -1.998552918434143, "logps/chosen": -1.071908712387085, "logps/rejected": -1.2022292613983154, "loss": 1.6331, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.14381742477417, "rewards/margins": 0.2606413960456848, "rewards/rejected": -2.404458522796631, "step": 1070 }, { "epoch": 0.7780979827089337, "grad_norm": 24.381883140284064, "learning_rate": 3.832551319442151e-08, "logits/chosen": -2.025217056274414, "logits/rejected": -2.0259487628936768, "logps/chosen": -1.0844666957855225, "logps/rejected": -1.1969218254089355, "loss": 1.6485, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.168933391571045, "rewards/margins": 0.2249099314212799, "rewards/rejected": -2.393843650817871, "step": 1080 }, { "epoch": 0.7853025936599424, "grad_norm": 27.86719418333446, "learning_rate": 3.8058442696426404e-08, "logits/chosen": -2.0382745265960693, "logits/rejected": -2.030484676361084, "logps/chosen": -1.1074187755584717, "logps/rejected": -1.211817979812622, "loss": 1.6631, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.2148375511169434, "rewards/margins": 0.20879819989204407, "rewards/rejected": -2.423635959625244, "step": 1090 }, { "epoch": 0.792507204610951, "grad_norm": 27.071495780290135, "learning_rate": 3.7789306815704216e-08, "logits/chosen": -2.026120185852051, "logits/rejected": -2.020829916000366, "logps/chosen": -1.0431629419326782, "logps/rejected": -1.1798484325408936, "loss": 1.6059, "rewards/accuracies": 0.625, "rewards/chosen": -2.0863258838653564, "rewards/margins": 0.27337145805358887, "rewards/rejected": -2.359696865081787, "step": 1100 }, { "epoch": 0.7997118155619597, "grad_norm": 18.88096690716272, "learning_rate": 3.7518148120010705e-08, "logits/chosen": -2.0271174907684326, "logits/rejected": -2.0201258659362793, "logps/chosen": -1.0074636936187744, "logps/rejected": -1.1304162740707397, "loss": 1.6266, "rewards/accuracies": 0.59375, "rewards/chosen": -2.014927387237549, "rewards/margins": 0.24590542912483215, "rewards/rejected": -2.2608325481414795, "step": 1110 }, { "epoch": 0.8069164265129684, "grad_norm": 21.911720790106997, "learning_rate": 3.7245009497039244e-08, "logits/chosen": -1.9778887033462524, "logits/rejected": -1.9702438116073608, "logps/chosen": -1.045611023902893, "logps/rejected": -1.1082854270935059, "loss": 1.7162, "rewards/accuracies": 0.5625, "rewards/chosen": -2.091222047805786, "rewards/margins": 0.1253490000963211, "rewards/rejected": -2.2165708541870117, "step": 1120 }, { "epoch": 0.8141210374639769, "grad_norm": 20.41807103685389, "learning_rate": 3.696993414763753e-08, "logits/chosen": -2.0123400688171387, "logits/rejected": -2.0082285404205322, "logps/chosen": -0.9978957176208496, "logps/rejected": -1.085761308670044, "loss": 1.6839, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.9957914352416992, "rewards/margins": 0.17573121190071106, "rewards/rejected": -2.171522617340088, "step": 1130 }, { "epoch": 0.8213256484149856, "grad_norm": 19.092407867541205, "learning_rate": 3.66929655789747e-08, "logits/chosen": -2.0528008937835693, "logits/rejected": -2.051527738571167, "logps/chosen": -1.046097993850708, "logps/rejected": -1.165477991104126, "loss": 1.6311, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.092195987701416, "rewards/margins": 0.23876003921031952, "rewards/rejected": -2.330955982208252, "step": 1140 }, { "epoch": 0.8285302593659942, "grad_norm": 21.260673310622927, "learning_rate": 3.64141475976601e-08, "logits/chosen": -2.041018009185791, "logits/rejected": -2.0345215797424316, "logps/chosen": -1.0739690065383911, "logps/rejected": -1.1673578023910522, "loss": 1.6763, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.1479380130767822, "rewards/margins": 0.18677765130996704, "rewards/rejected": -2.3347156047821045, "step": 1150 }, { "epoch": 0.8357348703170029, "grad_norm": 26.34484214118367, "learning_rate": 3.61335243028146e-08, "logits/chosen": -1.9964382648468018, "logits/rejected": -1.9950309991836548, "logps/chosen": -1.0375313758850098, "logps/rejected": -1.1169970035552979, "loss": 1.706, "rewards/accuracies": 0.53125, "rewards/chosen": -2.0750627517700195, "rewards/margins": 0.15893153846263885, "rewards/rejected": -2.2339940071105957, "step": 1160 }, { "epoch": 0.8429394812680115, "grad_norm": 24.32927550103409, "learning_rate": 3.585114007909562e-08, "logits/chosen": -1.9961084127426147, "logits/rejected": -1.9874632358551025, "logps/chosen": -1.0144343376159668, "logps/rejected": -1.1264407634735107, "loss": 1.6405, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.0288686752319336, "rewards/margins": 0.2240130603313446, "rewards/rejected": -2.2528815269470215, "step": 1170 }, { "epoch": 0.8501440922190202, "grad_norm": 26.36978810424015, "learning_rate": 3.556703958967716e-08, "logits/chosen": -1.9620872735977173, "logits/rejected": -1.9580965042114258, "logps/chosen": -1.059852123260498, "logps/rejected": -1.1655702590942383, "loss": 1.6635, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.119704246520996, "rewards/margins": 0.21143603324890137, "rewards/rejected": -2.3311405181884766, "step": 1180 }, { "epoch": 0.8573487031700289, "grad_norm": 20.25819384222818, "learning_rate": 3.528126776918559e-08, "logits/chosen": -2.0512845516204834, "logits/rejected": -2.0443384647369385, "logps/chosen": -1.071276068687439, "logps/rejected": -1.1480839252471924, "loss": 1.6952, "rewards/accuracies": 0.5625, "rewards/chosen": -2.142552137374878, "rewards/margins": 0.15361574292182922, "rewards/rejected": -2.2961678504943848, "step": 1190 }, { "epoch": 0.8645533141210374, "grad_norm": 25.590655270691236, "learning_rate": 3.499386981659262e-08, "logits/chosen": -1.9815905094146729, "logits/rejected": -1.9693584442138672, "logps/chosen": -1.086232304573059, "logps/rejected": -1.1749858856201172, "loss": 1.6708, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.172464609146118, "rewards/margins": 0.17750723659992218, "rewards/rejected": -2.3499717712402344, "step": 1200 }, { "epoch": 0.8717579250720461, "grad_norm": 22.74687405755938, "learning_rate": 3.47048911880664e-08, "logits/chosen": -1.971374750137329, "logits/rejected": -1.979832410812378, "logps/chosen": -0.9375821352005005, "logps/rejected": -1.0928103923797607, "loss": 1.5833, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.875164270401001, "rewards/margins": 0.31045612692832947, "rewards/rejected": -2.1856207847595215, "step": 1210 }, { "epoch": 0.8789625360230547, "grad_norm": 20.348332311917044, "learning_rate": 3.4414377589782e-08, "logits/chosen": -1.998797059059143, "logits/rejected": -1.9944807291030884, "logps/chosen": -1.008535623550415, "logps/rejected": -1.1152924299240112, "loss": 1.6564, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.01707124710083, "rewards/margins": 0.21351365745067596, "rewards/rejected": -2.2305848598480225, "step": 1220 }, { "epoch": 0.8861671469740634, "grad_norm": 20.10685961084486, "learning_rate": 3.412237497069226e-08, "logits/chosen": -1.9737958908081055, "logits/rejected": -1.9617822170257568, "logps/chosen": -0.9835951924324036, "logps/rejected": -1.0770342350006104, "loss": 1.6699, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9671903848648071, "rewards/margins": 0.1868787556886673, "rewards/rejected": -2.1540684700012207, "step": 1230 }, { "epoch": 0.8933717579250721, "grad_norm": 24.001680453622406, "learning_rate": 3.382892951526036e-08, "logits/chosen": -1.9840329885482788, "logits/rejected": -1.9820177555084229, "logps/chosen": -0.9354456067085266, "logps/rejected": -1.0733263492584229, "loss": 1.6076, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.8708912134170532, "rewards/margins": 0.2757616937160492, "rewards/rejected": -2.1466526985168457, "step": 1240 }, { "epoch": 0.9005763688760807, "grad_norm": 21.463794442952526, "learning_rate": 3.353408763615502e-08, "logits/chosen": -2.019768238067627, "logits/rejected": -2.0219955444335938, "logps/chosen": -1.0638011693954468, "logps/rejected": -1.219588041305542, "loss": 1.5926, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.1276023387908936, "rewards/margins": 0.31157371401786804, "rewards/rejected": -2.439176082611084, "step": 1250 }, { "epoch": 0.9077809798270894, "grad_norm": 28.944721871130742, "learning_rate": 3.323789596690971e-08, "logits/chosen": -2.0201096534729004, "logits/rejected": -2.02502703666687, "logps/chosen": -1.1166651248931885, "logps/rejected": -1.2446470260620117, "loss": 1.6227, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.233330249786377, "rewards/margins": 0.25596344470977783, "rewards/rejected": -2.4892940521240234, "step": 1260 }, { "epoch": 0.9149855907780979, "grad_norm": 20.87871503781233, "learning_rate": 3.294040135454681e-08, "logits/chosen": -1.9817100763320923, "logits/rejected": -1.975229024887085, "logps/chosen": -0.9639909863471985, "logps/rejected": -1.0865840911865234, "loss": 1.6229, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -1.927981972694397, "rewards/margins": 0.2451862096786499, "rewards/rejected": -2.173168182373047, "step": 1270 }, { "epoch": 0.9221902017291066, "grad_norm": 24.375366521893024, "learning_rate": 3.264165085216817e-08, "logits/chosen": -2.073403835296631, "logits/rejected": -2.0691773891448975, "logps/chosen": -1.0255587100982666, "logps/rejected": -1.1626403331756592, "loss": 1.6102, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.051117420196533, "rewards/margins": 0.2741633653640747, "rewards/rejected": -2.3252806663513184, "step": 1280 }, { "epoch": 0.9293948126801153, "grad_norm": 19.708714819889934, "learning_rate": 3.2341691711512854e-08, "logits/chosen": -2.0348494052886963, "logits/rejected": -2.0337963104248047, "logps/chosen": -0.993812084197998, "logps/rejected": -1.0996659994125366, "loss": 1.648, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.987624168395996, "rewards/margins": 0.2117079198360443, "rewards/rejected": -2.1993319988250732, "step": 1290 }, { "epoch": 0.9365994236311239, "grad_norm": 25.666809748463557, "learning_rate": 3.204057137548371e-08, "logits/chosen": -2.0750679969787598, "logits/rejected": -2.077117919921875, "logps/chosen": -1.0476573705673218, "logps/rejected": -1.1119945049285889, "loss": 1.7181, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0953147411346436, "rewards/margins": 0.12867406010627747, "rewards/rejected": -2.2239890098571777, "step": 1300 }, { "epoch": 0.9438040345821326, "grad_norm": 24.64906958910084, "learning_rate": 3.173833747064351e-08, "logits/chosen": -2.0429582595825195, "logits/rejected": -2.0435373783111572, "logps/chosen": -0.9831833839416504, "logps/rejected": -1.0801106691360474, "loss": 1.6616, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9663667678833008, "rewards/margins": 0.19385461509227753, "rewards/rejected": -2.1602213382720947, "step": 1310 }, { "epoch": 0.9510086455331412, "grad_norm": 24.10994908767819, "learning_rate": 3.143503779968213e-08, "logits/chosen": -2.0116159915924072, "logits/rejected": -2.005117893218994, "logps/chosen": -1.0751299858093262, "logps/rejected": -1.2202341556549072, "loss": 1.594, "rewards/accuracies": 0.625, "rewards/chosen": -2.1502599716186523, "rewards/margins": 0.29020795226097107, "rewards/rejected": -2.4404683113098145, "step": 1320 }, { "epoch": 0.9582132564841499, "grad_norm": 17.63664818825136, "learning_rate": 3.113072033385589e-08, "logits/chosen": -2.0390655994415283, "logits/rejected": -2.0347390174865723, "logps/chosen": -1.0593435764312744, "logps/rejected": -1.1827442646026611, "loss": 1.6315, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.118687152862549, "rewards/margins": 0.24680104851722717, "rewards/rejected": -2.3654885292053223, "step": 1330 }, { "epoch": 0.9654178674351584, "grad_norm": 29.367574205793645, "learning_rate": 3.082543320540015e-08, "logits/chosen": -2.010270357131958, "logits/rejected": -2.0109667778015137, "logps/chosen": -1.0154287815093994, "logps/rejected": -1.132817029953003, "loss": 1.641, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.030857563018799, "rewards/margins": 0.23477670550346375, "rewards/rejected": -2.265634059906006, "step": 1340 }, { "epoch": 0.9726224783861671, "grad_norm": 18.34859679144091, "learning_rate": 3.051922469991655e-08, "logits/chosen": -1.9400978088378906, "logits/rejected": -1.9382463693618774, "logps/chosen": -1.0125986337661743, "logps/rejected": -1.087135910987854, "loss": 1.7066, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0251972675323486, "rewards/margins": 0.149074524641037, "rewards/rejected": -2.174271821975708, "step": 1350 }, { "epoch": 0.9798270893371758, "grad_norm": 18.971281667579635, "learning_rate": 3.0212143248735886e-08, "logits/chosen": -1.9810125827789307, "logits/rejected": -1.9789069890975952, "logps/chosen": -0.961616039276123, "logps/rejected": -1.0554723739624023, "loss": 1.676, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.923232078552246, "rewards/margins": 0.18771259486675262, "rewards/rejected": -2.1109447479248047, "step": 1360 }, { "epoch": 0.9870317002881844, "grad_norm": 24.96298831746159, "learning_rate": 2.9904237421258046e-08, "logits/chosen": -2.00824236869812, "logits/rejected": -2.0035648345947266, "logps/chosen": -0.9997411966323853, "logps/rejected": -1.1224539279937744, "loss": 1.6216, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9994823932647705, "rewards/margins": 0.24542562663555145, "rewards/rejected": -2.244907855987549, "step": 1370 }, { "epoch": 0.9942363112391931, "grad_norm": 23.916004353098554, "learning_rate": 2.9595555917269997e-08, "logits/chosen": -1.9720462560653687, "logits/rejected": -1.968483328819275, "logps/chosen": -1.0500491857528687, "logps/rejected": -1.1262010335922241, "loss": 1.6923, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1000983715057373, "rewards/margins": 0.1523038148880005, "rewards/rejected": -2.2524020671844482, "step": 1380 }, { "epoch": 1.0014409221902016, "grad_norm": 31.544680289629245, "learning_rate": 2.928614755924327e-08, "logits/chosen": -2.049835443496704, "logits/rejected": -2.050297737121582, "logps/chosen": -1.0141699314117432, "logps/rejected": -1.12843656539917, "loss": 1.645, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0283398628234863, "rewards/margins": 0.22853314876556396, "rewards/rejected": -2.25687313079834, "step": 1390 }, { "epoch": 1.0086455331412103, "grad_norm": 22.171264495293624, "learning_rate": 2.8976061284611908e-08, "logits/chosen": -1.9490327835083008, "logits/rejected": -1.9464311599731445, "logps/chosen": -1.058870553970337, "logps/rejected": -1.195708990097046, "loss": 1.627, "rewards/accuracies": 0.59375, "rewards/chosen": -2.117741107940674, "rewards/margins": 0.27367717027664185, "rewards/rejected": -2.391417980194092, "step": 1400 }, { "epoch": 1.015850144092219, "grad_norm": 26.077670247013415, "learning_rate": 2.8665346138032327e-08, "logits/chosen": -1.9846904277801514, "logits/rejected": -1.9889650344848633, "logps/chosen": -1.0112468004226685, "logps/rejected": -1.1389497518539429, "loss": 1.6326, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.022493600845337, "rewards/margins": 0.2554059326648712, "rewards/rejected": -2.2778995037078857, "step": 1410 }, { "epoch": 1.0230547550432276, "grad_norm": 21.678779034806855, "learning_rate": 2.8354051263626227e-08, "logits/chosen": -2.014312505722046, "logits/rejected": -2.0092453956604004, "logps/chosen": -1.1152961254119873, "logps/rejected": -1.2331750392913818, "loss": 1.6569, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2305922508239746, "rewards/margins": 0.23575782775878906, "rewards/rejected": -2.4663500785827637, "step": 1420 }, { "epoch": 1.0302593659942363, "grad_norm": 24.392573956049755, "learning_rate": 2.8042225897207648e-08, "logits/chosen": -2.05131196975708, "logits/rejected": -2.0500950813293457, "logps/chosen": -0.9399210214614868, "logps/rejected": -1.0278102159500122, "loss": 1.6851, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -1.8798420429229736, "rewards/margins": 0.17577846348285675, "rewards/rejected": -2.0556204319000244, "step": 1430 }, { "epoch": 1.037463976945245, "grad_norm": 26.584656132405183, "learning_rate": 2.7729919358495728e-08, "logits/chosen": -2.0092933177948, "logits/rejected": -2.0025553703308105, "logps/chosen": -1.1368526220321655, "logps/rejected": -1.2202892303466797, "loss": 1.6972, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.273705244064331, "rewards/margins": 0.16687336564064026, "rewards/rejected": -2.4405784606933594, "step": 1440 }, { "epoch": 1.0446685878962536, "grad_norm": 19.430388178160936, "learning_rate": 2.741718104331393e-08, "logits/chosen": -2.06870698928833, "logits/rejected": -2.0779881477355957, "logps/chosen": -0.9939098358154297, "logps/rejected": -1.1466522216796875, "loss": 1.593, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9878196716308594, "rewards/margins": 0.30548471212387085, "rewards/rejected": -2.293304443359375, "step": 1450 }, { "epoch": 1.0518731988472623, "grad_norm": 19.07915573550666, "learning_rate": 2.710406041577751e-08, "logits/chosen": -1.9887897968292236, "logits/rejected": -1.9764807224273682, "logps/chosen": -0.9991080164909363, "logps/rejected": -1.1605703830718994, "loss": 1.568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.9982160329818726, "rewards/margins": 0.3229246735572815, "rewards/rejected": -2.321140766143799, "step": 1460 }, { "epoch": 1.059077809798271, "grad_norm": 19.453172319867473, "learning_rate": 2.679060700046994e-08, "logits/chosen": -2.0260889530181885, "logits/rejected": -2.0165085792541504, "logps/chosen": -0.9590311050415039, "logps/rejected": -1.092279076576233, "loss": 1.6143, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9180622100830078, "rewards/margins": 0.266495943069458, "rewards/rejected": -2.184558153152466, "step": 1470 }, { "epoch": 1.0662824207492796, "grad_norm": 19.938817408083903, "learning_rate": 2.647687037460996e-08, "logits/chosen": -1.9772266149520874, "logits/rejected": -1.9762458801269531, "logps/chosen": -1.005311369895935, "logps/rejected": -1.1241614818572998, "loss": 1.6424, "rewards/accuracies": 0.59375, "rewards/chosen": -2.01062273979187, "rewards/margins": 0.23770026862621307, "rewards/rejected": -2.2483229637145996, "step": 1480 }, { "epoch": 1.0734870317002883, "grad_norm": 22.895606799094736, "learning_rate": 2.616290016021016e-08, "logits/chosen": -1.9965251684188843, "logits/rejected": -1.993642807006836, "logps/chosen": -1.1106324195861816, "logps/rejected": -1.1621644496917725, "loss": 1.736, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.2212648391723633, "rewards/margins": 0.10306410491466522, "rewards/rejected": -2.324328899383545, "step": 1490 }, { "epoch": 1.080691642651297, "grad_norm": 21.445560487885672, "learning_rate": 2.584874601622854e-08, "logits/chosen": -1.999669075012207, "logits/rejected": -1.986670732498169, "logps/chosen": -1.0090255737304688, "logps/rejected": -1.12135910987854, "loss": 1.6376, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0180511474609375, "rewards/margins": 0.2246670424938202, "rewards/rejected": -2.24271821975708, "step": 1500 }, { "epoch": 1.0878962536023056, "grad_norm": 20.277040631917046, "learning_rate": 2.5534457630714267e-08, "logits/chosen": -2.044276714324951, "logits/rejected": -2.0462608337402344, "logps/chosen": -1.014711618423462, "logps/rejected": -1.1253163814544678, "loss": 1.65, "rewards/accuracies": 0.5625, "rewards/chosen": -2.029423236846924, "rewards/margins": 0.22120928764343262, "rewards/rejected": -2.2506327629089355, "step": 1510 }, { "epoch": 1.0951008645533142, "grad_norm": 18.830969455717604, "learning_rate": 2.5220084712948764e-08, "logits/chosen": -1.985337495803833, "logits/rejected": -1.9856802225112915, "logps/chosen": -1.097121000289917, "logps/rejected": -1.1095194816589355, "loss": 1.7974, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -2.194242000579834, "rewards/margins": 0.024796944111585617, "rewards/rejected": -2.219038963317871, "step": 1520 }, { "epoch": 1.1023054755043227, "grad_norm": 20.22397260920369, "learning_rate": 2.490567698558343e-08, "logits/chosen": -2.030097723007202, "logits/rejected": -2.0213112831115723, "logps/chosen": -0.9496325254440308, "logps/rejected": -1.1169651746749878, "loss": 1.5633, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -1.8992650508880615, "rewards/margins": 0.3346652686595917, "rewards/rejected": -2.2339303493499756, "step": 1530 }, { "epoch": 1.1095100864553313, "grad_norm": 27.8479778302733, "learning_rate": 2.4591284176775326e-08, "logits/chosen": -2.033017158508301, "logits/rejected": -2.0267956256866455, "logps/chosen": -1.0237690210342407, "logps/rejected": -1.148371934890747, "loss": 1.6197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0475380420684814, "rewards/margins": 0.24920621514320374, "rewards/rejected": -2.296743869781494, "step": 1540 }, { "epoch": 1.11671469740634, "grad_norm": 21.41619902791393, "learning_rate": 2.4276956012321926e-08, "logits/chosen": -2.0108845233917236, "logits/rejected": -2.0055813789367676, "logps/chosen": -1.026641607284546, "logps/rejected": -1.133044719696045, "loss": 1.6573, "rewards/accuracies": 0.59375, "rewards/chosen": -2.053283214569092, "rewards/margins": 0.21280638873577118, "rewards/rejected": -2.26608943939209, "step": 1550 }, { "epoch": 1.1239193083573487, "grad_norm": 29.804837301654878, "learning_rate": 2.3962742207796268e-08, "logits/chosen": -2.0699496269226074, "logits/rejected": -2.0638415813446045, "logps/chosen": -1.0469386577606201, "logps/rejected": -1.1839386224746704, "loss": 1.6112, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.0938773155212402, "rewards/margins": 0.2739998698234558, "rewards/rejected": -2.367877244949341, "step": 1560 }, { "epoch": 1.1311239193083573, "grad_norm": 19.072257470295916, "learning_rate": 2.364869246068368e-08, "logits/chosen": -2.049614429473877, "logits/rejected": -2.047759771347046, "logps/chosen": -1.0033305883407593, "logps/rejected": -1.1312639713287354, "loss": 1.6191, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0066611766815186, "rewards/margins": 0.255866676568985, "rewards/rejected": -2.2625279426574707, "step": 1570 }, { "epoch": 1.138328530259366, "grad_norm": 35.24129814573759, "learning_rate": 2.3334856442521435e-08, "logits/chosen": -2.021381139755249, "logits/rejected": -2.0155997276306152, "logps/chosen": -1.0470027923583984, "logps/rejected": -1.1944993734359741, "loss": 1.6022, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.094005584716797, "rewards/margins": 0.29499319195747375, "rewards/rejected": -2.3889987468719482, "step": 1580 }, { "epoch": 1.1455331412103746, "grad_norm": 19.787153457776164, "learning_rate": 2.3021283791042474e-08, "logits/chosen": -1.9919068813323975, "logits/rejected": -1.9873685836791992, "logps/chosen": -0.9659102559089661, "logps/rejected": -1.1006077527999878, "loss": 1.6015, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.9318205118179321, "rewards/margins": 0.26939502358436584, "rewards/rejected": -2.2012155055999756, "step": 1590 }, { "epoch": 1.1527377521613833, "grad_norm": 24.041299463029606, "learning_rate": 2.2708024102324454e-08, "logits/chosen": -2.0335323810577393, "logits/rejected": -2.0249056816101074, "logps/chosen": -1.0667665004730225, "logps/rejected": -1.1801570653915405, "loss": 1.6442, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.133533000946045, "rewards/margins": 0.2267809808254242, "rewards/rejected": -2.360314130783081, "step": 1600 }, { "epoch": 1.159942363112392, "grad_norm": 24.12111757342678, "learning_rate": 2.23951269229454e-08, "logits/chosen": -1.9858362674713135, "logits/rejected": -1.9893901348114014, "logps/chosen": -1.0084689855575562, "logps/rejected": -1.100411295890808, "loss": 1.6765, "rewards/accuracies": 0.5625, "rewards/chosen": -2.0169379711151123, "rewards/margins": 0.18388447165489197, "rewards/rejected": -2.200822591781616, "step": 1610 }, { "epoch": 1.1671469740634006, "grad_norm": 24.80127928292751, "learning_rate": 2.2082641742147238e-08, "logits/chosen": -2.059333562850952, "logits/rejected": -2.060598373413086, "logps/chosen": -1.0564236640930176, "logps/rejected": -1.1844590902328491, "loss": 1.6208, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.112847328186035, "rewards/margins": 0.25607064366340637, "rewards/rejected": -2.3689181804656982, "step": 1620 }, { "epoch": 1.1743515850144093, "grad_norm": 20.13313537520615, "learning_rate": 2.177061798400832e-08, "logits/chosen": -1.9521719217300415, "logits/rejected": -1.9484403133392334, "logps/chosen": -1.0265686511993408, "logps/rejected": -1.0971999168395996, "loss": 1.7039, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0531373023986816, "rewards/margins": 0.14126275479793549, "rewards/rejected": -2.194399833679199, "step": 1630 }, { "epoch": 1.181556195965418, "grad_norm": 19.495579780217277, "learning_rate": 2.145910499962628e-08, "logits/chosen": -1.9787580966949463, "logits/rejected": -1.9796241521835327, "logps/chosen": -1.0164722204208374, "logps/rejected": -1.1089410781860352, "loss": 1.6796, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.032944440841675, "rewards/margins": 0.1849372386932373, "rewards/rejected": -2.2178821563720703, "step": 1640 }, { "epoch": 1.1887608069164266, "grad_norm": 25.48988945819277, "learning_rate": 2.1148152059312437e-08, "logits/chosen": -2.0072319507598877, "logits/rejected": -2.0051167011260986, "logps/chosen": -1.005936861038208, "logps/rejected": -1.077120304107666, "loss": 1.707, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.011873722076416, "rewards/margins": 0.14236697554588318, "rewards/rejected": -2.154240608215332, "step": 1650 }, { "epoch": 1.195965417867435, "grad_norm": 21.152990734847933, "learning_rate": 2.0837808344799028e-08, "logits/chosen": -1.9343931674957275, "logits/rejected": -1.938122034072876, "logps/chosen": -0.9725497961044312, "logps/rejected": -1.0786354541778564, "loss": 1.659, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9450995922088623, "rewards/margins": 0.21217119693756104, "rewards/rejected": -2.157270908355713, "step": 1660 }, { "epoch": 1.2031700288184437, "grad_norm": 26.377266925973135, "learning_rate": 2.052812294146033e-08, "logits/chosen": -2.03047776222229, "logits/rejected": -2.027374744415283, "logps/chosen": -1.0395774841308594, "logps/rejected": -1.1767067909240723, "loss": 1.6092, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.0791549682617188, "rewards/margins": 0.27425867319107056, "rewards/rejected": -2.3534135818481445, "step": 1670 }, { "epoch": 1.2103746397694524, "grad_norm": 23.34792034910019, "learning_rate": 2.0219144830549163e-08, "logits/chosen": -1.971253752708435, "logits/rejected": -1.9633668661117554, "logps/chosen": -1.011054515838623, "logps/rejected": -1.148241400718689, "loss": 1.6033, "rewards/accuracies": 0.625, "rewards/chosen": -2.022109031677246, "rewards/margins": 0.2743736207485199, "rewards/rejected": -2.296482801437378, "step": 1680 }, { "epoch": 1.217579250720461, "grad_norm": 21.51470689698241, "learning_rate": 1.9910922881449716e-08, "logits/chosen": -2.011819362640381, "logits/rejected": -2.01347017288208, "logps/chosen": -1.0252724885940552, "logps/rejected": -1.1493511199951172, "loss": 1.6223, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.0505449771881104, "rewards/margins": 0.2481573075056076, "rewards/rejected": -2.2987022399902344, "step": 1690 }, { "epoch": 1.2247838616714697, "grad_norm": 26.201233097415667, "learning_rate": 1.9603505843948214e-08, "logits/chosen": -1.9847558736801147, "logits/rejected": -1.9857571125030518, "logps/chosen": -1.078355073928833, "logps/rejected": -1.2002760171890259, "loss": 1.6286, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.156710147857666, "rewards/margins": 0.2438419610261917, "rewards/rejected": -2.4005520343780518, "step": 1700 }, { "epoch": 1.2319884726224783, "grad_norm": 23.36257553020346, "learning_rate": 1.929694234052239e-08, "logits/chosen": -2.0325675010681152, "logits/rejected": -2.021353244781494, "logps/chosen": -0.9391233325004578, "logps/rejected": -1.0919990539550781, "loss": 1.5888, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8782466650009155, "rewards/margins": 0.3057512640953064, "rewards/rejected": -2.1839981079101562, "step": 1710 }, { "epoch": 1.239193083573487, "grad_norm": 19.47841832843257, "learning_rate": 1.8991280858651157e-08, "logits/chosen": -1.9777787923812866, "logits/rejected": -1.9743425846099854, "logps/chosen": -0.9831492304801941, "logps/rejected": -1.1153368949890137, "loss": 1.6254, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9662984609603882, "rewards/margins": 0.2643755078315735, "rewards/rejected": -2.2306737899780273, "step": 1720 }, { "epoch": 1.2463976945244957, "grad_norm": 29.21200511886615, "learning_rate": 1.868656974314557e-08, "logits/chosen": -2.0204837322235107, "logits/rejected": -2.0202364921569824, "logps/chosen": -1.032915711402893, "logps/rejected": -1.1591503620147705, "loss": 1.6285, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.065831422805786, "rewards/margins": 0.2524695098400116, "rewards/rejected": -2.318300724029541, "step": 1730 }, { "epoch": 1.2536023054755043, "grad_norm": 19.748132847469815, "learning_rate": 1.8382857188502422e-08, "logits/chosen": -2.005788803100586, "logits/rejected": -2.0102851390838623, "logps/chosen": -1.0914397239685059, "logps/rejected": -1.2216801643371582, "loss": 1.6228, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.1828794479370117, "rewards/margins": 0.26048025488853455, "rewards/rejected": -2.4433603286743164, "step": 1740 }, { "epoch": 1.260806916426513, "grad_norm": 21.739700200932262, "learning_rate": 1.8080191231281594e-08, "logits/chosen": -1.9720125198364258, "logits/rejected": -1.9585212469100952, "logps/chosen": -1.0438520908355713, "logps/rejected": -1.120289921760559, "loss": 1.7131, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.0877041816711426, "rewards/margins": 0.15287601947784424, "rewards/rejected": -2.240579843521118, "step": 1750 }, { "epoch": 1.2680115273775217, "grad_norm": 22.41910419325543, "learning_rate": 1.7778619742508345e-08, "logits/chosen": -2.0582499504089355, "logits/rejected": -2.063490390777588, "logps/chosen": -1.131137728691101, "logps/rejected": -1.2315350770950317, "loss": 1.6769, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.262275457382202, "rewards/margins": 0.20079448819160461, "rewards/rejected": -2.4630701541900635, "step": 1760 }, { "epoch": 1.2752161383285303, "grad_norm": 21.482297600422083, "learning_rate": 1.7478190420101796e-08, "logits/chosen": -2.036742687225342, "logits/rejected": -2.032109260559082, "logps/chosen": -1.050010085105896, "logps/rejected": -1.183793544769287, "loss": 1.6195, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.100020170211792, "rewards/margins": 0.2675670385360718, "rewards/rejected": -2.367587089538574, "step": 1770 }, { "epoch": 1.282420749279539, "grad_norm": 27.784893259936197, "learning_rate": 1.717895078133088e-08, "logits/chosen": -2.0239245891571045, "logits/rejected": -2.0181150436401367, "logps/chosen": -1.1388823986053467, "logps/rejected": -1.2011306285858154, "loss": 1.7186, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.2777647972106934, "rewards/margins": 0.12449675798416138, "rewards/rejected": -2.402261257171631, "step": 1780 }, { "epoch": 1.2896253602305476, "grad_norm": 15.507226212932258, "learning_rate": 1.688094815529873e-08, "logits/chosen": -1.978154182434082, "logits/rejected": -1.9796226024627686, "logps/chosen": -0.9750539660453796, "logps/rejected": -1.0688438415527344, "loss": 1.6649, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -1.9501079320907593, "rewards/margins": 0.18757975101470947, "rewards/rejected": -2.1376876831054688, "step": 1790 }, { "epoch": 1.2968299711815563, "grad_norm": 23.291152738627, "learning_rate": 1.658422967545693e-08, "logits/chosen": -2.0640625953674316, "logits/rejected": -2.0586647987365723, "logps/chosen": -1.0171083211898804, "logps/rejected": -1.207260251045227, "loss": 1.5423, "rewards/accuracies": 0.625, "rewards/chosen": -2.0342166423797607, "rewards/margins": 0.38030365109443665, "rewards/rejected": -2.414520502090454, "step": 1800 }, { "epoch": 1.304034582132565, "grad_norm": 24.79688067458518, "learning_rate": 1.6288842272150614e-08, "logits/chosen": -1.9829915761947632, "logits/rejected": -1.9848480224609375, "logps/chosen": -0.9952167272567749, "logps/rejected": -1.1160945892333984, "loss": 1.6428, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -1.9904334545135498, "rewards/margins": 0.24175576865673065, "rewards/rejected": -2.232189178466797, "step": 1810 }, { "epoch": 1.3112391930835736, "grad_norm": 31.41860730573992, "learning_rate": 1.5994832665195853e-08, "logits/chosen": -2.014812469482422, "logits/rejected": -2.0102453231811523, "logps/chosen": -0.9722223281860352, "logps/rejected": -1.119134545326233, "loss": 1.5972, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9444446563720703, "rewards/margins": 0.29382452368736267, "rewards/rejected": -2.238269090652466, "step": 1820 }, { "epoch": 1.318443804034582, "grad_norm": 18.540859576791913, "learning_rate": 1.5702247356490134e-08, "logits/chosen": -1.985517144203186, "logits/rejected": -1.9945405721664429, "logps/chosen": -1.0170261859893799, "logps/rejected": -1.1490360498428345, "loss": 1.6267, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0340523719787598, "rewards/margins": 0.2640196681022644, "rewards/rejected": -2.298072099685669, "step": 1830 }, { "epoch": 1.3256484149855907, "grad_norm": 20.824160350380623, "learning_rate": 1.541113262265748e-08, "logits/chosen": -1.9940106868743896, "logits/rejected": -1.9816381931304932, "logps/chosen": -0.9921188354492188, "logps/rejected": -1.1125357151031494, "loss": 1.6419, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9842376708984375, "rewards/margins": 0.2408340871334076, "rewards/rejected": -2.225071430206299, "step": 1840 }, { "epoch": 1.3328530259365994, "grad_norm": 20.85405330465015, "learning_rate": 1.5121534507729073e-08, "logits/chosen": -2.0356698036193848, "logits/rejected": -2.029043197631836, "logps/chosen": -0.9926624298095703, "logps/rejected": -1.119905710220337, "loss": 1.6287, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.9853248596191406, "rewards/margins": 0.25448688864707947, "rewards/rejected": -2.239811420440674, "step": 1850 }, { "epoch": 1.340057636887608, "grad_norm": 20.598069658664354, "learning_rate": 1.4833498815860756e-08, "logits/chosen": -2.0153400897979736, "logits/rejected": -2.012422800064087, "logps/chosen": -1.0502384901046753, "logps/rejected": -1.1984398365020752, "loss": 1.5909, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.1004769802093506, "rewards/margins": 0.2964025139808655, "rewards/rejected": -2.3968796730041504, "step": 1860 }, { "epoch": 1.3472622478386167, "grad_norm": 24.169363767878462, "learning_rate": 1.4547071104088443e-08, "logits/chosen": -1.9878826141357422, "logits/rejected": -1.976165771484375, "logps/chosen": -0.9248664975166321, "logps/rejected": -1.0936037302017212, "loss": 1.5519, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8497329950332642, "rewards/margins": 0.33747443556785583, "rewards/rejected": -2.1872074604034424, "step": 1870 }, { "epoch": 1.3544668587896254, "grad_norm": 22.28586277053279, "learning_rate": 1.4262296675122592e-08, "logits/chosen": -1.9980617761611938, "logits/rejected": -1.993934988975525, "logps/chosen": -1.0281975269317627, "logps/rejected": -1.1155600547790527, "loss": 1.6881, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0563950538635254, "rewards/margins": 0.1747252643108368, "rewards/rejected": -2.2311201095581055, "step": 1880 }, { "epoch": 1.361671469740634, "grad_norm": 25.91029433001521, "learning_rate": 1.3979220570182902e-08, "logits/chosen": -1.9705870151519775, "logits/rejected": -1.9714816808700562, "logps/chosen": -1.0206265449523926, "logps/rejected": -1.153298258781433, "loss": 1.612, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.041253089904785, "rewards/margins": 0.26534369587898254, "rewards/rejected": -2.306596517562866, "step": 1890 }, { "epoch": 1.3688760806916427, "grad_norm": 18.134528292331787, "learning_rate": 1.369788756187445e-08, "logits/chosen": -1.9956543445587158, "logits/rejected": -2.0001983642578125, "logps/chosen": -1.0346391201019287, "logps/rejected": -1.1629480123519897, "loss": 1.6253, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0692782402038574, "rewards/margins": 0.2566176652908325, "rewards/rejected": -2.3258960247039795, "step": 1900 }, { "epoch": 1.3760806916426513, "grad_norm": 24.41945869354139, "learning_rate": 1.3418342147106212e-08, "logits/chosen": -2.027116537094116, "logits/rejected": -2.0312001705169678, "logps/chosen": -1.052958607673645, "logps/rejected": -1.2006646394729614, "loss": 1.5902, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.10591721534729, "rewards/margins": 0.29541200399398804, "rewards/rejected": -2.401329278945923, "step": 1910 }, { "epoch": 1.38328530259366, "grad_norm": 20.07983364842535, "learning_rate": 1.3140628540053218e-08, "logits/chosen": -2.0397636890411377, "logits/rejected": -2.039858341217041, "logps/chosen": -0.9341287612915039, "logps/rejected": -1.1015546321868896, "loss": 1.5639, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8682575225830078, "rewards/margins": 0.334852010011673, "rewards/rejected": -2.2031092643737793, "step": 1920 }, { "epoch": 1.3904899135446687, "grad_norm": 21.51361282786986, "learning_rate": 1.286479066516345e-08, "logits/chosen": -1.9665225744247437, "logits/rejected": -1.9671493768692017, "logps/chosen": -1.0339243412017822, "logps/rejected": -1.096381425857544, "loss": 1.7206, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.0678486824035645, "rewards/margins": 0.12491452693939209, "rewards/rejected": -2.192762851715088, "step": 1930 }, { "epoch": 1.397694524495677, "grad_norm": 18.366780492328118, "learning_rate": 1.2590872150210574e-08, "logits/chosen": -2.0531787872314453, "logits/rejected": -2.0568125247955322, "logps/chosen": -0.9896559715270996, "logps/rejected": -1.0928928852081299, "loss": 1.6574, "rewards/accuracies": 0.59375, "rewards/chosen": -1.9793119430541992, "rewards/margins": 0.20647385716438293, "rewards/rejected": -2.1857857704162598, "step": 1940 }, { "epoch": 1.4048991354466858, "grad_norm": 20.45388742088791, "learning_rate": 1.2318916319393555e-08, "logits/chosen": -2.0167720317840576, "logits/rejected": -2.011418581008911, "logps/chosen": -0.9759003520011902, "logps/rejected": -1.079329490661621, "loss": 1.6557, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9518007040023804, "rewards/margins": 0.20685818791389465, "rewards/rejected": -2.158658981323242, "step": 1950 }, { "epoch": 1.4121037463976944, "grad_norm": 22.381857191050415, "learning_rate": 1.2048966186484282e-08, "logits/chosen": -1.9989935159683228, "logits/rejected": -1.995570421218872, "logps/chosen": -1.0002979040145874, "logps/rejected": -1.1596596240997314, "loss": 1.5749, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.000595808029175, "rewards/margins": 0.3187234103679657, "rewards/rejected": -2.319319248199463, "step": 1960 }, { "epoch": 1.419308357348703, "grad_norm": 26.68547824879629, "learning_rate": 1.1781064448024333e-08, "logits/chosen": -2.0441131591796875, "logits/rejected": -2.0380544662475586, "logps/chosen": -1.0262136459350586, "logps/rejected": -1.1702954769134521, "loss": 1.6003, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.052427291870117, "rewards/margins": 0.288163959980011, "rewards/rejected": -2.3405909538269043, "step": 1970 }, { "epoch": 1.4265129682997117, "grad_norm": 22.583692034377382, "learning_rate": 1.1515253476571923e-08, "logits/chosen": -2.01566481590271, "logits/rejected": -2.0159027576446533, "logps/chosen": -1.0137196779251099, "logps/rejected": -1.1474217176437378, "loss": 1.6281, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0274393558502197, "rewards/margins": 0.2674042582511902, "rewards/rejected": -2.2948434352874756, "step": 1980 }, { "epoch": 1.4337175792507204, "grad_norm": 20.384401270587638, "learning_rate": 1.1251575314000034e-08, "logits/chosen": -1.9947608709335327, "logits/rejected": -1.9923511743545532, "logps/chosen": -0.996396541595459, "logps/rejected": -1.1149427890777588, "loss": 1.6386, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.992793083190918, "rewards/margins": 0.23709246516227722, "rewards/rejected": -2.2298855781555176, "step": 1990 }, { "epoch": 1.440922190201729, "grad_norm": 18.925724233722455, "learning_rate": 1.0990071664846861e-08, "logits/chosen": -2.055290460586548, "logits/rejected": -2.047081232070923, "logps/chosen": -0.9910066723823547, "logps/rejected": -1.1598938703536987, "loss": 1.5608, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.9820133447647095, "rewards/margins": 0.33777478337287903, "rewards/rejected": -2.3197877407073975, "step": 2000 }, { "epoch": 1.4481268011527377, "grad_norm": 19.05620802651582, "learning_rate": 1.0730783889719711e-08, "logits/chosen": -1.999265432357788, "logits/rejected": -1.992251992225647, "logps/chosen": -1.0045114755630493, "logps/rejected": -1.145806908607483, "loss": 1.6002, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0090229511260986, "rewards/margins": 0.2825910151004791, "rewards/rejected": -2.291613817214966, "step": 2010 }, { "epoch": 1.4553314121037464, "grad_norm": 21.429538223490898, "learning_rate": 1.0473752998753114e-08, "logits/chosen": -2.030794143676758, "logits/rejected": -2.0306010246276855, "logps/chosen": -1.0277831554412842, "logps/rejected": -1.1082103252410889, "loss": 1.6945, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.0555663108825684, "rewards/margins": 0.16085436940193176, "rewards/rejected": -2.2164206504821777, "step": 2020 }, { "epoch": 1.462536023054755, "grad_norm": 22.050870113308804, "learning_rate": 1.0219019645122575e-08, "logits/chosen": -2.045313835144043, "logits/rejected": -2.04082989692688, "logps/chosen": -0.9714498519897461, "logps/rejected": -1.1051620244979858, "loss": 1.6095, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -1.9428997039794922, "rewards/margins": 0.26742416620254517, "rewards/rejected": -2.2103240489959717, "step": 2030 }, { "epoch": 1.4697406340057637, "grad_norm": 26.157508425601968, "learning_rate": 9.966624118614611e-09, "logits/chosen": -2.032585620880127, "logits/rejected": -2.0330874919891357, "logps/chosen": -0.9972192645072937, "logps/rejected": -1.1311516761779785, "loss": 1.6103, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9944385290145874, "rewards/margins": 0.26786476373672485, "rewards/rejected": -2.262303352355957, "step": 2040 }, { "epoch": 1.4769452449567724, "grad_norm": 23.13602722680495, "learning_rate": 9.71660633925438e-09, "logits/chosen": -2.020906925201416, "logits/rejected": -2.011904001235962, "logps/chosen": -1.1174659729003906, "logps/rejected": -1.2822870016098022, "loss": 1.5902, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.2349319458007812, "rewards/margins": 0.32964205741882324, "rewards/rejected": -2.5645740032196045, "step": 2050 }, { "epoch": 1.484149855907781, "grad_norm": 24.5077374796, "learning_rate": 9.469005850991705e-09, "logits/chosen": -2.0096890926361084, "logits/rejected": -2.0027241706848145, "logps/chosen": -1.0285447835922241, "logps/rejected": -1.0884907245635986, "loss": 1.726, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.0570895671844482, "rewards/margins": 0.11989171802997589, "rewards/rejected": -2.1769814491271973, "step": 2060 }, { "epoch": 1.4913544668587897, "grad_norm": 28.391707951147488, "learning_rate": 9.223861815446682e-09, "logits/chosen": -2.0371224880218506, "logits/rejected": -2.022277593612671, "logps/chosen": -1.1392552852630615, "logps/rejected": -1.2324645519256592, "loss": 1.6607, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.278510570526123, "rewards/margins": 0.18641893565654755, "rewards/rejected": -2.4649291038513184, "step": 2070 }, { "epoch": 1.4985590778097984, "grad_norm": 21.381110932577513, "learning_rate": 8.981213005715627e-09, "logits/chosen": -1.949776291847229, "logits/rejected": -1.952210783958435, "logps/chosen": -1.0574856996536255, "logps/rejected": -1.1887096166610718, "loss": 1.6153, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.114971399307251, "rewards/margins": 0.26244813203811646, "rewards/rejected": -2.3774192333221436, "step": 2080 }, { "epoch": 1.505763688760807, "grad_norm": 22.794596171681757, "learning_rate": 8.741097800238617e-09, "logits/chosen": -2.036937713623047, "logits/rejected": -2.031627893447876, "logps/chosen": -1.0450177192687988, "logps/rejected": -1.1693742275238037, "loss": 1.6249, "rewards/accuracies": 0.59375, "rewards/chosen": -2.0900354385375977, "rewards/margins": 0.24871286749839783, "rewards/rejected": -2.3387484550476074, "step": 2090 }, { "epoch": 1.5129682997118157, "grad_norm": 21.632409202600087, "learning_rate": 8.503554176729341e-09, "logits/chosen": -1.989061713218689, "logits/rejected": -1.9979686737060547, "logps/chosen": -0.9342214465141296, "logps/rejected": -1.0600559711456299, "loss": 1.6299, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8684428930282593, "rewards/margins": 0.2516690790653229, "rewards/rejected": -2.1201119422912598, "step": 2100 }, { "epoch": 1.5201729106628243, "grad_norm": 24.73490951688704, "learning_rate": 8.268619706168376e-09, "logits/chosen": -1.9841842651367188, "logits/rejected": -1.9747326374053955, "logps/chosen": -1.015921950340271, "logps/rejected": -1.1286394596099854, "loss": 1.6542, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.031843900680542, "rewards/margins": 0.2254345417022705, "rewards/rejected": -2.2572789192199707, "step": 2110 }, { "epoch": 1.527377521613833, "grad_norm": 21.05879085590256, "learning_rate": 8.036331546860777e-09, "logits/chosen": -1.908087134361267, "logits/rejected": -1.9043972492218018, "logps/chosen": -1.0467476844787598, "logps/rejected": -1.1627973318099976, "loss": 1.6392, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0934953689575195, "rewards/margins": 0.232099249958992, "rewards/rejected": -2.325594663619995, "step": 2120 }, { "epoch": 1.5345821325648417, "grad_norm": 22.29777503336236, "learning_rate": 7.806726438559003e-09, "logits/chosen": -1.9843000173568726, "logits/rejected": -1.9898990392684937, "logps/chosen": -1.0579800605773926, "logps/rejected": -1.1668568849563599, "loss": 1.6476, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.115960121154785, "rewards/margins": 0.21775400638580322, "rewards/rejected": -2.3337137699127197, "step": 2130 }, { "epoch": 1.54178674351585, "grad_norm": 23.19617849497256, "learning_rate": 7.579840696651938e-09, "logits/chosen": -1.9837186336517334, "logits/rejected": -1.975818395614624, "logps/chosen": -1.062901258468628, "logps/rejected": -1.1917634010314941, "loss": 1.624, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.125802516937256, "rewards/margins": 0.25772443413734436, "rewards/rejected": -2.3835268020629883, "step": 2140 }, { "epoch": 1.5489913544668588, "grad_norm": 28.073689367264755, "learning_rate": 7.355710206421098e-09, "logits/chosen": -1.939091444015503, "logits/rejected": -1.9357961416244507, "logps/chosen": -1.0485239028930664, "logps/rejected": -1.1550390720367432, "loss": 1.6545, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.097047805786133, "rewards/margins": 0.2130306214094162, "rewards/rejected": -2.3100781440734863, "step": 2150 }, { "epoch": 1.5561959654178674, "grad_norm": 22.523519226977555, "learning_rate": 7.134370417364849e-09, "logits/chosen": -2.002406120300293, "logits/rejected": -2.003296136856079, "logps/chosen": -1.1109195947647095, "logps/rejected": -1.1835315227508545, "loss": 1.7081, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.221839189529419, "rewards/margins": 0.14522375166416168, "rewards/rejected": -2.367063045501709, "step": 2160 }, { "epoch": 1.563400576368876, "grad_norm": 21.935686976360966, "learning_rate": 6.915856337591572e-09, "logits/chosen": -1.994996428489685, "logits/rejected": -1.9938243627548218, "logps/chosen": -0.9442762136459351, "logps/rejected": -1.0544160604476929, "loss": 1.6479, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -1.8885524272918701, "rewards/margins": 0.22027930617332458, "rewards/rejected": -2.1088321208953857, "step": 2170 }, { "epoch": 1.5706051873198847, "grad_norm": 25.455047703257026, "learning_rate": 6.700202528282603e-09, "logits/chosen": -1.9900974035263062, "logits/rejected": -1.9881980419158936, "logps/chosen": -1.0776493549346924, "logps/rejected": -1.2152760028839111, "loss": 1.6069, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -2.1552987098693848, "rewards/margins": 0.27525344491004944, "rewards/rejected": -2.4305520057678223, "step": 2180 }, { "epoch": 1.5778097982708934, "grad_norm": 24.84944470319713, "learning_rate": 6.487443098225892e-09, "logits/chosen": -2.0501418113708496, "logits/rejected": -2.0468356609344482, "logps/chosen": -1.031200885772705, "logps/rejected": -1.1816002130508423, "loss": 1.5922, "rewards/accuracies": 0.625, "rewards/chosen": -2.06240177154541, "rewards/margins": 0.30079856514930725, "rewards/rejected": -2.3632004261016846, "step": 2190 }, { "epoch": 1.585014409221902, "grad_norm": 20.909336414895183, "learning_rate": 6.277611698421179e-09, "logits/chosen": -2.01149845123291, "logits/rejected": -2.007169723510742, "logps/chosen": -1.0256855487823486, "logps/rejected": -1.1708935499191284, "loss": 1.5954, "rewards/accuracies": 0.625, "rewards/chosen": -2.0513710975646973, "rewards/margins": 0.29041624069213867, "rewards/rejected": -2.341787099838257, "step": 2200 }, { "epoch": 1.5922190201729105, "grad_norm": 20.206486553431677, "learning_rate": 6.070741516757608e-09, "logits/chosen": -2.0089163780212402, "logits/rejected": -2.0105767250061035, "logps/chosen": -1.0603289604187012, "logps/rejected": -1.1521384716033936, "loss": 1.6878, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.1206579208374023, "rewards/margins": 0.18361885845661163, "rewards/rejected": -2.304276943206787, "step": 2210 }, { "epoch": 1.5994236311239192, "grad_norm": 22.350708958975336, "learning_rate": 5.866865272764607e-09, "logits/chosen": -2.014432668685913, "logits/rejected": -2.0136168003082275, "logps/chosen": -1.0870046615600586, "logps/rejected": -1.2768774032592773, "loss": 1.5475, "rewards/accuracies": 0.65625, "rewards/chosen": -2.174009323120117, "rewards/margins": 0.3797454833984375, "rewards/rejected": -2.5537548065185547, "step": 2220 }, { "epoch": 1.6066282420749278, "grad_norm": 23.288337045237636, "learning_rate": 5.666015212436795e-09, "logits/chosen": -2.0216281414031982, "logits/rejected": -2.0152599811553955, "logps/chosen": -1.0820379257202148, "logps/rejected": -1.2009848356246948, "loss": 1.6402, "rewards/accuracies": 0.625, "rewards/chosen": -2.1640758514404297, "rewards/margins": 0.2378939837217331, "rewards/rejected": -2.4019696712493896, "step": 2230 }, { "epoch": 1.6138328530259365, "grad_norm": 27.01583069454552, "learning_rate": 5.46822310313379e-09, "logits/chosen": -1.9930493831634521, "logits/rejected": -1.9963033199310303, "logps/chosen": -0.9150048494338989, "logps/rejected": -1.0773169994354248, "loss": 1.5742, "rewards/accuracies": 0.6312500238418579, "rewards/chosen": -1.8300096988677979, "rewards/margins": 0.32462412118911743, "rewards/rejected": -2.1546339988708496, "step": 2240 }, { "epoch": 1.6210374639769451, "grad_norm": 21.967134306318375, "learning_rate": 5.273520228555767e-09, "logits/chosen": -2.0560572147369385, "logits/rejected": -2.0469000339508057, "logps/chosen": -1.0821588039398193, "logps/rejected": -1.2086551189422607, "loss": 1.6406, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1643176078796387, "rewards/margins": 0.25299257040023804, "rewards/rejected": -2.4173102378845215, "step": 2250 }, { "epoch": 1.6282420749279538, "grad_norm": 24.925763712447488, "learning_rate": 5.081937383795484e-09, "logits/chosen": -1.9834489822387695, "logits/rejected": -1.9737904071807861, "logps/chosen": -0.9765061140060425, "logps/rejected": -1.092413306236267, "loss": 1.6417, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.953012228012085, "rewards/margins": 0.2318144291639328, "rewards/rejected": -2.184826612472534, "step": 2260 }, { "epoch": 1.6354466858789625, "grad_norm": 24.22018193079342, "learning_rate": 4.893504870467588e-09, "logits/chosen": -2.0181922912597656, "logits/rejected": -2.016724109649658, "logps/chosen": -1.0404140949249268, "logps/rejected": -1.1570017337799072, "loss": 1.6379, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0808281898498535, "rewards/margins": 0.2331748753786087, "rewards/rejected": -2.3140034675598145, "step": 2270 }, { "epoch": 1.6426512968299711, "grad_norm": 26.66835743730832, "learning_rate": 4.708252491915951e-09, "logits/chosen": -1.979501724243164, "logits/rejected": -1.9686695337295532, "logps/chosen": -1.1171302795410156, "logps/rejected": -1.232032060623169, "loss": 1.6298, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.2342605590820312, "rewards/margins": 0.22980327904224396, "rewards/rejected": -2.464064121246338, "step": 2280 }, { "epoch": 1.6498559077809798, "grad_norm": 22.00835339861606, "learning_rate": 4.526209548499877e-09, "logits/chosen": -1.9684820175170898, "logits/rejected": -1.9653692245483398, "logps/chosen": -1.0476069450378418, "logps/rejected": -1.1009780168533325, "loss": 1.7427, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0952138900756836, "rewards/margins": 0.10674209892749786, "rewards/rejected": -2.201956033706665, "step": 2290 }, { "epoch": 1.6570605187319885, "grad_norm": 24.136328386644493, "learning_rate": 4.347404832959775e-09, "logits/chosen": -2.0073463916778564, "logits/rejected": -1.9953422546386719, "logps/chosen": -1.00043785572052, "logps/rejected": -1.1069265604019165, "loss": 1.6592, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.00087571144104, "rewards/margins": 0.21297720074653625, "rewards/rejected": -2.213853120803833, "step": 2300 }, { "epoch": 1.6642651296829971, "grad_norm": 22.057010770613108, "learning_rate": 4.171866625863229e-09, "logits/chosen": -1.9719337224960327, "logits/rejected": -1.968071699142456, "logps/chosen": -1.0742835998535156, "logps/rejected": -1.1518973112106323, "loss": 1.7035, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.1485671997070312, "rewards/margins": 0.15522751212120056, "rewards/rejected": -2.3037946224212646, "step": 2310 }, { "epoch": 1.6714697406340058, "grad_norm": 24.46288263177118, "learning_rate": 3.9996226911319546e-09, "logits/chosen": -2.044360637664795, "logits/rejected": -2.044651746749878, "logps/chosen": -1.0162107944488525, "logps/rejected": -1.1387598514556885, "loss": 1.635, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.032421588897705, "rewards/margins": 0.24509792029857635, "rewards/rejected": -2.277519702911377, "step": 2320 }, { "epoch": 1.6786743515850144, "grad_norm": 22.36911873035069, "learning_rate": 3.830700271650567e-09, "logits/chosen": -2.045510768890381, "logits/rejected": -2.0477523803710938, "logps/chosen": -0.9628473520278931, "logps/rejected": -1.124125599861145, "loss": 1.5848, "rewards/accuracies": 0.625, "rewards/chosen": -1.9256947040557861, "rewards/margins": 0.3225559890270233, "rewards/rejected": -2.24825119972229, "step": 2330 }, { "epoch": 1.685878962536023, "grad_norm": 22.36024023960374, "learning_rate": 3.665126084957723e-09, "logits/chosen": -1.9869651794433594, "logits/rejected": -1.9849302768707275, "logps/chosen": -0.9548214077949524, "logps/rejected": -1.1073800325393677, "loss": 1.5961, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.9096428155899048, "rewards/margins": 0.30511730909347534, "rewards/rejected": -2.2147600650787354, "step": 2340 }, { "epoch": 1.6930835734870318, "grad_norm": 26.20538266440962, "learning_rate": 3.502926319020327e-09, "logits/chosen": -1.953912377357483, "logits/rejected": -1.943302869796753, "logps/chosen": -1.060532569885254, "logps/rejected": -1.1716785430908203, "loss": 1.6529, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.121065139770508, "rewards/margins": 0.22229187190532684, "rewards/rejected": -2.3433570861816406, "step": 2350 }, { "epoch": 1.7002881844380404, "grad_norm": 21.248350350748197, "learning_rate": 3.3441266280915427e-09, "logits/chosen": -1.9893739223480225, "logits/rejected": -1.9806448221206665, "logps/chosen": -0.9349273443222046, "logps/rejected": -1.122243046760559, "loss": 1.5398, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8698546886444092, "rewards/margins": 0.3746311664581299, "rewards/rejected": -2.244486093521118, "step": 2360 }, { "epoch": 1.707492795389049, "grad_norm": 21.37718363400938, "learning_rate": 3.1887521286532023e-09, "logits/chosen": -2.0375638008117676, "logits/rejected": -2.030097484588623, "logps/chosen": -1.0926265716552734, "logps/rejected": -1.156776785850525, "loss": 1.7217, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.185253143310547, "rewards/margins": 0.12830035388469696, "rewards/rejected": -2.31355357170105, "step": 2370 }, { "epoch": 1.7146974063400577, "grad_norm": 22.764740803633266, "learning_rate": 3.0368273954432698e-09, "logits/chosen": -2.0525734424591064, "logits/rejected": -2.0523922443389893, "logps/chosen": -1.1625360250473022, "logps/rejected": -1.2549631595611572, "loss": 1.6774, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.3250720500946045, "rewards/margins": 0.18485429883003235, "rewards/rejected": -2.5099263191223145, "step": 2380 }, { "epoch": 1.7219020172910664, "grad_norm": 23.35653645378998, "learning_rate": 2.888376457568964e-09, "logits/chosen": -2.067924737930298, "logits/rejected": -2.062840461730957, "logps/chosen": -1.0438759326934814, "logps/rejected": -1.1699734926223755, "loss": 1.6262, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -2.087751865386963, "rewards/margins": 0.25219520926475525, "rewards/rejected": -2.339946985244751, "step": 2390 }, { "epoch": 1.729106628242075, "grad_norm": 18.117475149068106, "learning_rate": 2.7434227947062324e-09, "logits/chosen": -2.028148651123047, "logits/rejected": -2.022372245788574, "logps/chosen": -1.0291882753372192, "logps/rejected": -1.1964164972305298, "loss": 1.5828, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.0583765506744385, "rewards/margins": 0.3344564139842987, "rewards/rejected": -2.3928329944610596, "step": 2400 }, { "epoch": 1.7363112391930837, "grad_norm": 25.38586926098497, "learning_rate": 2.6019893333860954e-09, "logits/chosen": -2.0227832794189453, "logits/rejected": -2.0191638469696045, "logps/chosen": -1.0688263177871704, "logps/rejected": -1.1506297588348389, "loss": 1.6948, "rewards/accuracies": 0.5625, "rewards/chosen": -2.137652635574341, "rewards/margins": 0.16360695660114288, "rewards/rejected": -2.3012595176696777, "step": 2410 }, { "epoch": 1.7435158501440924, "grad_norm": 23.482865925242617, "learning_rate": 2.4640984433684758e-09, "logits/chosen": -2.0008485317230225, "logits/rejected": -1.9945675134658813, "logps/chosen": -0.9947766065597534, "logps/rejected": -1.1175730228424072, "loss": 1.6519, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -1.9895532131195068, "rewards/margins": 0.24559286236763, "rewards/rejected": -2.2351460456848145, "step": 2420 }, { "epoch": 1.7507204610951008, "grad_norm": 21.083704816516438, "learning_rate": 2.3297719341040856e-09, "logits/chosen": -1.9876625537872314, "logits/rejected": -1.9809608459472656, "logps/chosen": -1.0115702152252197, "logps/rejected": -1.1943610906600952, "loss": 1.5407, "rewards/accuracies": 0.65625, "rewards/chosen": -2.0231404304504395, "rewards/margins": 0.3655821681022644, "rewards/rejected": -2.3887221813201904, "step": 2430 }, { "epoch": 1.7579250720461095, "grad_norm": 22.214397294857353, "learning_rate": 2.199031051284972e-09, "logits/chosen": -2.029468297958374, "logits/rejected": -2.0226287841796875, "logps/chosen": -1.0236752033233643, "logps/rejected": -1.0963797569274902, "loss": 1.7051, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -2.0473504066467285, "rewards/margins": 0.14540909230709076, "rewards/rejected": -2.1927595138549805, "step": 2440 }, { "epoch": 1.7651296829971181, "grad_norm": 22.659240095278513, "learning_rate": 2.0718964734841667e-09, "logits/chosen": -2.0216195583343506, "logits/rejected": -2.0182180404663086, "logps/chosen": -1.1063746213912964, "logps/rejected": -1.191334843635559, "loss": 1.6945, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -2.2127492427825928, "rewards/margins": 0.16992013156414032, "rewards/rejected": -2.382669687271118, "step": 2450 }, { "epoch": 1.7723342939481268, "grad_norm": 17.951670482135615, "learning_rate": 1.948388308885102e-09, "logits/chosen": -2.0658774375915527, "logits/rejected": -2.0580222606658936, "logps/chosen": -0.9547419548034668, "logps/rejected": -1.0859801769256592, "loss": 1.6238, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9094839096069336, "rewards/margins": 0.2624765932559967, "rewards/rejected": -2.1719603538513184, "step": 2460 }, { "epoch": 1.7795389048991355, "grad_norm": 26.064153513909826, "learning_rate": 1.8285260921011846e-09, "logits/chosen": -2.047785520553589, "logits/rejected": -2.038276433944702, "logps/chosen": -1.1647402048110962, "logps/rejected": -1.2458956241607666, "loss": 1.7181, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.3294804096221924, "rewards/margins": 0.16231082379817963, "rewards/rejected": -2.491791248321533, "step": 2470 }, { "epoch": 1.7867435158501441, "grad_norm": 19.498652452320453, "learning_rate": 1.712328781086131e-09, "logits/chosen": -1.9989614486694336, "logits/rejected": -1.9959602355957031, "logps/chosen": -1.0413289070129395, "logps/rejected": -1.1756139993667603, "loss": 1.6205, "rewards/accuracies": 0.625, "rewards/chosen": -2.082657814025879, "rewards/margins": 0.2685699760913849, "rewards/rejected": -2.3512279987335205, "step": 2480 }, { "epoch": 1.7939481268011528, "grad_norm": 19.579842521680934, "learning_rate": 1.59981475413547e-09, "logits/chosen": -1.982996940612793, "logits/rejected": -1.9784364700317383, "logps/chosen": -0.9387162923812866, "logps/rejected": -1.0598050355911255, "loss": 1.6288, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -1.8774325847625732, "rewards/margins": 0.24217729270458221, "rewards/rejected": -2.119610071182251, "step": 2490 }, { "epoch": 1.8011527377521612, "grad_norm": 21.279275035018046, "learning_rate": 1.491001806979772e-09, "logits/chosen": -1.986486792564392, "logits/rejected": -1.9824678897857666, "logps/chosen": -1.0033233165740967, "logps/rejected": -1.1795743703842163, "loss": 1.5452, "rewards/accuracies": 0.6937500238418579, "rewards/chosen": -2.0066466331481934, "rewards/margins": 0.35250231623649597, "rewards/rejected": -2.3591487407684326, "step": 2500 }, { "epoch": 1.8083573487031699, "grad_norm": 22.81377086234191, "learning_rate": 1.3859071499699698e-09, "logits/chosen": -1.9927501678466797, "logits/rejected": -1.9867427349090576, "logps/chosen": -1.0045326948165894, "logps/rejected": -1.1114981174468994, "loss": 1.6603, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -2.0090653896331787, "rewards/margins": 0.2139308750629425, "rewards/rejected": -2.222996234893799, "step": 2510 }, { "epoch": 1.8155619596541785, "grad_norm": 22.243359575974143, "learning_rate": 1.2845474053553156e-09, "logits/chosen": -1.967153549194336, "logits/rejected": -1.9660823345184326, "logps/chosen": -1.0100979804992676, "logps/rejected": -1.1443712711334229, "loss": 1.6266, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -2.020195960998535, "rewards/margins": 0.26854681968688965, "rewards/rejected": -2.2887425422668457, "step": 2520 }, { "epoch": 1.8227665706051872, "grad_norm": 20.940981692691256, "learning_rate": 1.1869386046543222e-09, "logits/chosen": -1.9824377298355103, "logits/rejected": -1.9759935140609741, "logps/chosen": -1.0096970796585083, "logps/rejected": -1.1292235851287842, "loss": 1.6418, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -2.0193941593170166, "rewards/margins": 0.23905305564403534, "rewards/rejected": -2.2584471702575684, "step": 2530 }, { "epoch": 1.8299711815561959, "grad_norm": 23.31599170849686, "learning_rate": 1.0930961861191302e-09, "logits/chosen": -1.979270339012146, "logits/rejected": -1.9758975505828857, "logps/chosen": -0.9248117208480835, "logps/rejected": -1.046858549118042, "loss": 1.6419, "rewards/accuracies": 0.5625, "rewards/chosen": -1.849623441696167, "rewards/margins": 0.2440938651561737, "rewards/rejected": -2.093717098236084, "step": 2540 }, { "epoch": 1.8371757925072045, "grad_norm": 25.087005895011544, "learning_rate": 1.003034992293733e-09, "logits/chosen": -2.0150609016418457, "logits/rejected": -2.0050504207611084, "logps/chosen": -0.9422048330307007, "logps/rejected": -1.1044690608978271, "loss": 1.5718, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -1.8844096660614014, "rewards/margins": 0.3245285153388977, "rewards/rejected": -2.2089381217956543, "step": 2550 }, { "epoch": 1.8443804034582132, "grad_norm": 21.9012224134329, "learning_rate": 9.16769267666434e-10, "logits/chosen": -1.9812209606170654, "logits/rejected": -1.9651670455932617, "logps/chosen": -0.984574019908905, "logps/rejected": -1.0492041110992432, "loss": 1.7224, "rewards/accuracies": 0.5, "rewards/chosen": -1.96914803981781, "rewards/margins": 0.12925995886325836, "rewards/rejected": -2.0984082221984863, "step": 2560 }, { "epoch": 1.8515850144092219, "grad_norm": 29.798850096367598, "learning_rate": 8.343126564168412e-10, "logits/chosen": -2.023308277130127, "logits/rejected": -2.0172171592712402, "logps/chosen": -1.0593010187149048, "logps/rejected": -1.1926862001419067, "loss": 1.6084, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.1186020374298096, "rewards/margins": 0.2667704224586487, "rewards/rejected": -2.3853724002838135, "step": 2570 }, { "epoch": 1.8587896253602305, "grad_norm": 23.081298452800397, "learning_rate": 7.55678200257856e-10, "logits/chosen": -1.986285924911499, "logits/rejected": -1.9805923700332642, "logps/chosen": -1.0580508708953857, "logps/rejected": -1.134381651878357, "loss": 1.7026, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -2.1161017417907715, "rewards/margins": 0.1526612788438797, "rewards/rejected": -2.268763303756714, "step": 2580 }, { "epoch": 1.8659942363112392, "grad_norm": 19.289572768646487, "learning_rate": 6.808783363729364e-10, "logits/chosen": -1.9875192642211914, "logits/rejected": -1.9795039892196655, "logps/chosen": -0.9947047233581543, "logps/rejected": -1.128720998764038, "loss": 1.6133, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -1.9894094467163086, "rewards/margins": 0.2680323123931885, "rewards/rejected": -2.257441997528076, "step": 2590 }, { "epoch": 1.8731988472622478, "grad_norm": 19.644352031599038, "learning_rate": 6.099248954489794e-10, "logits/chosen": -2.0019712448120117, "logits/rejected": -2.0046229362487793, "logps/chosen": -0.9696542024612427, "logps/rejected": -1.0707123279571533, "loss": 1.6688, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -1.9393084049224854, "rewards/margins": 0.20211617648601532, "rewards/rejected": -2.1414246559143066, "step": 2600 }, { "epoch": 1.8804034582132565, "grad_norm": 20.13597620650645, "learning_rate": 5.428290998051116e-10, "logits/chosen": -1.991913080215454, "logits/rejected": -1.9869178533554077, "logps/chosen": -0.9798202514648438, "logps/rejected": -1.0955702066421509, "loss": 1.63, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -1.9596405029296875, "rewards/margins": 0.2314998209476471, "rewards/rejected": -2.1911404132843018, "step": 2610 }, { "epoch": 1.8876080691642652, "grad_norm": 24.79934802241825, "learning_rate": 4.796015616177401e-10, "logits/chosen": -1.9711564779281616, "logits/rejected": -1.9690234661102295, "logps/chosen": -1.028472900390625, "logps/rejected": -1.1603518724441528, "loss": 1.6215, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -2.05694580078125, "rewards/margins": 0.2637581527233124, "rewards/rejected": -2.3207037448883057, "step": 2620 }, { "epoch": 1.8948126801152738, "grad_norm": 22.569211582194647, "learning_rate": 4.2025228124205335e-10, "logits/chosen": -2.0415008068084717, "logits/rejected": -2.0464255809783936, "logps/chosen": -1.1165236234664917, "logps/rejected": -1.1973609924316406, "loss": 1.6887, "rewards/accuracies": 0.53125, "rewards/chosen": -2.2330472469329834, "rewards/margins": 0.1616745889186859, "rewards/rejected": -2.3947219848632812, "step": 2630 }, { "epoch": 1.9020172910662825, "grad_norm": 29.251729021197608, "learning_rate": 3.64790645630339e-10, "logits/chosen": -2.0050177574157715, "logits/rejected": -1.9978790283203125, "logps/chosen": -1.0866100788116455, "logps/rejected": -1.1683754920959473, "loss": 1.7045, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.173220157623291, "rewards/margins": 0.16353097558021545, "rewards/rejected": -2.3367509841918945, "step": 2640 }, { "epoch": 1.9092219020172911, "grad_norm": 26.686501597422076, "learning_rate": 3.1322542684729945e-10, "logits/chosen": -1.9958645105361938, "logits/rejected": -1.9863427877426147, "logps/chosen": -1.1009094715118408, "logps/rejected": -1.2481192350387573, "loss": 1.5988, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -2.2018189430236816, "rewards/margins": 0.29441922903060913, "rewards/rejected": -2.4962384700775146, "step": 2650 }, { "epoch": 1.9164265129682998, "grad_norm": 24.025977247368846, "learning_rate": 2.6556478068261447e-10, "logits/chosen": -2.011669158935547, "logits/rejected": -2.0208234786987305, "logps/chosen": -1.0808687210083008, "logps/rejected": -1.210559606552124, "loss": 1.6254, "rewards/accuracies": 0.5625, "rewards/chosen": -2.1617374420166016, "rewards/margins": 0.2593816816806793, "rewards/rejected": -2.421119213104248, "step": 2660 }, { "epoch": 1.9236311239193085, "grad_norm": 23.159728407542556, "learning_rate": 2.2181624536098952e-10, "logits/chosen": -2.06213641166687, "logits/rejected": -2.0583481788635254, "logps/chosen": -1.054503321647644, "logps/rejected": -1.183261513710022, "loss": 1.624, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.109006643295288, "rewards/margins": 0.2575165629386902, "rewards/rejected": -2.366523027420044, "step": 2670 }, { "epoch": 1.9308357348703171, "grad_norm": 24.285074754554824, "learning_rate": 1.819867403498737e-10, "logits/chosen": -2.0223278999328613, "logits/rejected": -2.0300443172454834, "logps/chosen": -1.0237973928451538, "logps/rejected": -1.1488239765167236, "loss": 1.616, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -2.0475947856903076, "rewards/margins": 0.250053346157074, "rewards/rejected": -2.2976479530334473, "step": 2680 }, { "epoch": 1.9380403458213258, "grad_norm": 22.444579015136416, "learning_rate": 1.4608256526505157e-10, "logits/chosen": -1.9494588375091553, "logits/rejected": -1.9468958377838135, "logps/chosen": -1.1418302059173584, "logps/rejected": -1.2206158638000488, "loss": 1.6977, "rewards/accuracies": 0.53125, "rewards/chosen": -2.283660411834717, "rewards/margins": 0.15757131576538086, "rewards/rejected": -2.4412317276000977, "step": 2690 }, { "epoch": 1.9452449567723344, "grad_norm": 26.381118967286472, "learning_rate": 1.1410939887425141e-10, "logits/chosen": -2.0545475482940674, "logits/rejected": -2.041222095489502, "logps/chosen": -0.9961267709732056, "logps/rejected": -1.0998473167419434, "loss": 1.6658, "rewards/accuracies": 0.5, "rewards/chosen": -1.9922535419464111, "rewards/margins": 0.20744113624095917, "rewards/rejected": -2.1996946334838867, "step": 2700 }, { "epoch": 1.952449567723343, "grad_norm": 24.607639118161103, "learning_rate": 8.607229819898865e-11, "logits/chosen": -2.02439022064209, "logits/rejected": -2.0220091342926025, "logps/chosen": -1.0608162879943848, "logps/rejected": -1.1870168447494507, "loss": 1.6251, "rewards/accuracies": 0.59375, "rewards/chosen": -2.1216325759887695, "rewards/margins": 0.2524010241031647, "rewards/rejected": -2.3740336894989014, "step": 2710 }, { "epoch": 1.9596541786743515, "grad_norm": 25.59822036865297, "learning_rate": 6.19756977147029e-11, "logits/chosen": -2.024225950241089, "logits/rejected": -2.0201265811920166, "logps/chosen": -0.9928004145622253, "logps/rejected": -1.1591542959213257, "loss": 1.5624, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -1.9856008291244507, "rewards/margins": 0.3327081501483917, "rewards/rejected": -2.3183085918426514, "step": 2720 }, { "epoch": 1.9668587896253602, "grad_norm": 19.72589484365043, "learning_rate": 4.1823408649391265e-11, "logits/chosen": -1.9768123626708984, "logits/rejected": -1.9772489070892334, "logps/chosen": -1.0286433696746826, "logps/rejected": -1.1329104900360107, "loss": 1.6564, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -2.0572867393493652, "rewards/margins": 0.2085343301296234, "rewards/rejected": -2.2658209800720215, "step": 2730 }, { "epoch": 1.9740634005763689, "grad_norm": 22.64173380411296, "learning_rate": 2.5618618380812694e-11, "logits/chosen": -2.074462413787842, "logits/rejected": -2.0713300704956055, "logps/chosen": -0.9406960606575012, "logps/rejected": -1.0647523403167725, "loss": 1.6234, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -1.8813921213150024, "rewards/margins": 0.24811263382434845, "rewards/rejected": -2.129504680633545, "step": 2740 }, { "epoch": 1.9812680115273775, "grad_norm": 18.289142985147425, "learning_rate": 1.3363889932338501e-11, "logits/chosen": -1.980743408203125, "logits/rejected": -1.9810924530029297, "logps/chosen": -1.0640665292739868, "logps/rejected": -1.2089464664459229, "loss": 1.5996, "rewards/accuracies": 0.65625, "rewards/chosen": -2.1281330585479736, "rewards/margins": 0.2897598147392273, "rewards/rejected": -2.4178929328918457, "step": 2750 }, { "epoch": 1.9884726224783862, "grad_norm": 21.489973290523885, "learning_rate": 5.061161567596061e-12, "logits/chosen": -2.085202217102051, "logits/rejected": -2.083030939102173, "logps/chosen": -1.0205986499786377, "logps/rejected": -1.12671959400177, "loss": 1.6554, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.0411972999572754, "rewards/margins": 0.2122419774532318, "rewards/rejected": -2.25343918800354, "step": 2760 }, { "epoch": 1.9956772334293948, "grad_norm": 27.92987313520658, "learning_rate": 7.11746483889053e-13, "logits/chosen": -2.0238137245178223, "logits/rejected": -2.018078327178955, "logps/chosen": -1.0771089792251587, "logps/rejected": -1.1666157245635986, "loss": 1.6928, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -2.1542179584503174, "rewards/margins": 0.17901378870010376, "rewards/rejected": -2.3332314491271973, "step": 2770 }, { "epoch": 2.0, "step": 2776, "total_flos": 0.0, "train_loss": 1.6477044489696322, "train_runtime": 3633.5789, "train_samples_per_second": 12.22, "train_steps_per_second": 0.764 } ], "logging_steps": 10, "max_steps": 2776, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }