{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8668746948242188, "logits/rejected": -1.8712046146392822, "logps/chosen": -36.99528884887695, "logps/rejected": -33.6615104675293, "loss": 0.6855, "rewards/accuracies": 0.5, "rewards/chosen": 0.004523592535406351, "rewards/margins": 0.017096158117055893, "rewards/rejected": -0.012572565115988255, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9979562759399414, "logits/rejected": -2.000598907470703, "logps/chosen": -29.644357681274414, "logps/rejected": -29.06288719177246, "loss": 0.6942, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.0008665517088957131, "rewards/margins": -0.0010885533411055803, "rewards/rejected": 0.00022200122475624084, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9213365316390991, "logits/rejected": -1.9186455011367798, "logps/chosen": -31.3991756439209, "logps/rejected": -33.220787048339844, "loss": 0.6903, "rewards/accuracies": 0.625, "rewards/chosen": 0.006767785642296076, "rewards/margins": 0.007698210421949625, "rewards/rejected": -0.0009304238483309746, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0176024436950684, "logits/rejected": -2.008852481842041, "logps/chosen": -32.565155029296875, "logps/rejected": -32.51045608520508, "loss": 0.6924, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004661071114242077, "rewards/margins": 0.0037749619223177433, "rewards/rejected": 0.0008861090755090117, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8624731302261353, "logits/rejected": -1.8516931533813477, "logps/chosen": -33.547359466552734, "logps/rejected": -35.463809967041016, "loss": 0.69, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.004046988673508167, "rewards/margins": 0.008634108118712902, "rewards/rejected": -0.004587120376527309, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9406204223632812, "logits/rejected": -1.9425837993621826, "logps/chosen": -32.54151153564453, "logps/rejected": -33.21025848388672, "loss": 0.6772, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.023469632491469383, "rewards/margins": 0.03902136906981468, "rewards/rejected": -0.015551735647022724, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.0720112323760986, "logits/rejected": -2.0769832134246826, "logps/chosen": -33.98130416870117, "logps/rejected": -36.64153289794922, "loss": 0.6804, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0028717622626572847, "rewards/margins": 0.03193504735827446, "rewards/rejected": -0.029063284397125244, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9326984882354736, "logits/rejected": -1.935831069946289, "logps/chosen": -34.31356430053711, "logps/rejected": -34.65351867675781, "loss": 0.6626, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.045696478337049484, "rewards/margins": 0.07014231383800507, "rewards/rejected": -0.02444584295153618, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9407522678375244, "logits/rejected": -1.9452556371688843, "logps/chosen": -32.40108108520508, "logps/rejected": -32.342872619628906, "loss": 0.6859, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.02653699554502964, "rewards/margins": 0.020122777670621872, "rewards/rejected": 0.0064142136834561825, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0378258228302, "logits/rejected": -2.0358424186706543, "logps/chosen": -32.15534973144531, "logps/rejected": -31.297805786132812, "loss": 0.6726, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.03419475629925728, "rewards/margins": 0.04683210328221321, "rewards/rejected": -0.01263735257089138, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.233442544937134, "eval_logits/rejected": -2.2285873889923096, "eval_logps/chosen": -34.044193267822266, "eval_logps/rejected": -37.55242919921875, "eval_loss": 0.6902939081192017, "eval_rewards/accuracies": 0.5481727719306946, "eval_rewards/chosen": -0.0038560593966394663, "eval_rewards/margins": 0.010466905310750008, "eval_rewards/rejected": -0.014322965405881405, "eval_runtime": 146.038, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9928157329559326, "logits/rejected": -1.9904266595840454, "logps/chosen": -33.1363410949707, "logps/rejected": -34.00283432006836, "loss": 0.6883, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04299246892333031, "rewards/margins": 0.03237856179475784, "rewards/rejected": 0.010613908991217613, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.0046679973602295, "logits/rejected": -1.9963252544403076, "logps/chosen": -32.3227653503418, "logps/rejected": -32.157779693603516, "loss": 0.6741, "rewards/accuracies": 0.5625, "rewards/chosen": 0.04885613173246384, "rewards/margins": 0.0473661907017231, "rewards/rejected": 0.0014899425441399217, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0337133407592773, "logits/rejected": -2.0257389545440674, "logps/chosen": -30.330951690673828, "logps/rejected": -32.077327728271484, "loss": 0.6662, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.05790700390934944, "rewards/margins": 0.06763499230146408, "rewards/rejected": -0.00972799677401781, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9636684656143188, "logits/rejected": -1.9739125967025757, "logps/chosen": -31.2012996673584, "logps/rejected": -32.56267547607422, "loss": 0.6505, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.08734508603811264, "rewards/margins": 0.09846383333206177, "rewards/rejected": -0.011118754744529724, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8753896951675415, "logits/rejected": -1.8765614032745361, "logps/chosen": -33.930477142333984, "logps/rejected": -34.81265640258789, "loss": 0.6411, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.10488543659448624, "rewards/margins": 0.1270444542169571, "rewards/rejected": -0.022159017622470856, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9266326427459717, "logits/rejected": -1.9231981039047241, "logps/chosen": -36.001312255859375, "logps/rejected": -32.70484924316406, "loss": 0.6671, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0681454986333847, "rewards/margins": 0.06098458915948868, "rewards/rejected": 0.007160906679928303, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0281758308410645, "logits/rejected": -2.0208277702331543, "logps/chosen": -33.47995376586914, "logps/rejected": -31.4173526763916, "loss": 0.6278, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.12716376781463623, "rewards/margins": 0.15336360037326813, "rewards/rejected": -0.026199836283922195, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.033982038497925, "logits/rejected": -2.0392203330993652, "logps/chosen": -32.239437103271484, "logps/rejected": -32.44209671020508, "loss": 0.6474, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12213903665542603, "rewards/margins": 0.10492189973592758, "rewards/rejected": 0.017217133194208145, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.034848928451538, "logits/rejected": -2.032095432281494, "logps/chosen": -31.259105682373047, "logps/rejected": -31.31390953063965, "loss": 0.6567, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.09193893522024155, "rewards/margins": 0.09021683037281036, "rewards/rejected": 0.0017221048474311829, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9051767587661743, "logits/rejected": -1.9098323583602905, "logps/chosen": -31.32687759399414, "logps/rejected": -32.82324981689453, "loss": 0.6419, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.1117621660232544, "rewards/margins": 0.12468986213207245, "rewards/rejected": -0.012927706353366375, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2309587001800537, "eval_logits/rejected": -2.2261300086975098, "eval_logps/chosen": -34.06596374511719, "eval_logps/rejected": -37.5656852722168, "eval_loss": 0.6932108402252197, "eval_rewards/accuracies": 0.5041528344154358, "eval_rewards/chosen": -0.0125651890411973, "eval_rewards/margins": 0.007060302421450615, "eval_rewards/rejected": -0.01962549053132534, "eval_runtime": 145.8797, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.0179035663604736, "logits/rejected": -2.0285422801971436, "logps/chosen": -31.756057739257812, "logps/rejected": -33.936058044433594, "loss": 0.6444, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.08609838038682938, "rewards/margins": 0.11544252932071686, "rewards/rejected": -0.029344135895371437, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.910278081893921, "logits/rejected": -1.9250171184539795, "logps/chosen": -29.805334091186523, "logps/rejected": -31.600574493408203, "loss": 0.6326, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1224818006157875, "rewards/margins": 0.139441579580307, "rewards/rejected": -0.016959769651293755, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9668325185775757, "logits/rejected": -1.970798134803772, "logps/chosen": -33.078094482421875, "logps/rejected": -31.6514949798584, "loss": 0.621, "rewards/accuracies": 0.6875, "rewards/chosen": 0.13575060665607452, "rewards/margins": 0.1766270101070404, "rewards/rejected": -0.04087639972567558, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.965409278869629, "logits/rejected": -1.9435851573944092, "logps/chosen": -33.83237075805664, "logps/rejected": -35.09648132324219, "loss": 0.6165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.12855152785778046, "rewards/margins": 0.18682697415351868, "rewards/rejected": -0.05827543884515762, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.006166696548462, "logits/rejected": -2.002840042114258, "logps/chosen": -32.69712448120117, "logps/rejected": -36.255043029785156, "loss": 0.6502, "rewards/accuracies": 0.625, "rewards/chosen": 0.09117679297924042, "rewards/margins": 0.10322580486536026, "rewards/rejected": -0.01204901933670044, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8737766742706299, "logits/rejected": -1.87137770652771, "logps/chosen": -33.97405242919922, "logps/rejected": -35.56011962890625, "loss": 0.6473, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.08574129641056061, "rewards/margins": 0.1079399362206459, "rewards/rejected": -0.022198637947440147, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8586937189102173, "logits/rejected": -1.8563038110733032, "logps/chosen": -34.204627990722656, "logps/rejected": -31.85955810546875, "loss": 0.6478, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.07718975841999054, "rewards/margins": 0.11248154938220978, "rewards/rejected": -0.03529178351163864, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.962044358253479, "logits/rejected": -1.9515289068222046, "logps/chosen": -35.0092887878418, "logps/rejected": -31.877635955810547, "loss": 0.6267, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.13894058763980865, "rewards/margins": 0.15359947085380554, "rewards/rejected": -0.01465887576341629, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0572900772094727, "logits/rejected": -2.0424036979675293, "logps/chosen": -30.72947120666504, "logps/rejected": -32.645362854003906, "loss": 0.6657, "rewards/accuracies": 0.625, "rewards/chosen": 0.07786226272583008, "rewards/margins": 0.07527298480272293, "rewards/rejected": 0.002589278621599078, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9281587600708008, "logits/rejected": -1.9256232976913452, "logps/chosen": -32.37285614013672, "logps/rejected": -30.912500381469727, "loss": 0.5902, "rewards/accuracies": 0.75, "rewards/chosen": 0.22170230746269226, "rewards/margins": 0.2599778473377228, "rewards/rejected": -0.03827553242444992, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2279410362243652, "eval_logits/rejected": -2.2231125831604004, "eval_logps/chosen": -34.09661865234375, "eval_logps/rejected": -37.598716735839844, "eval_loss": 0.6926039457321167, "eval_rewards/accuracies": 0.49543190002441406, "eval_rewards/chosen": -0.024825766682624817, "eval_rewards/margins": 0.008013932965695858, "eval_rewards/rejected": -0.03283970057964325, "eval_runtime": 145.6649, "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "grad_norm": 7.8125, "learning_rate": 4.84533120650964e-06, "logits/chosen": -2.0616166591644287, "logits/rejected": -2.048832416534424, "logps/chosen": -32.10347366333008, "logps/rejected": -32.91118621826172, "loss": 0.5726, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.16996803879737854, "rewards/margins": 0.2780183255672455, "rewards/rejected": -0.10805028676986694, "step": 310 }, { "epoch": 0.83, "grad_norm": 7.25, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.9727706909179688, "logits/rejected": -1.9641412496566772, "logps/chosen": -31.7288818359375, "logps/rejected": -30.45233154296875, "loss": 0.5546, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.25044065713882446, "rewards/margins": 0.33481746912002563, "rewards/rejected": -0.08437685668468475, "step": 320 }, { "epoch": 0.86, "grad_norm": 8.1875, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.9075199365615845, "logits/rejected": -1.9195775985717773, "logps/chosen": -29.810604095458984, "logps/rejected": -33.72663497924805, "loss": 0.5272, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.2674657702445984, "rewards/margins": 0.3972177803516388, "rewards/rejected": -0.1297520250082016, "step": 330 }, { "epoch": 0.88, "grad_norm": 10.0, "learning_rate": 4.781089396387968e-06, "logits/chosen": -1.8698316812515259, "logits/rejected": -1.8606189489364624, "logps/chosen": -33.96432876586914, "logps/rejected": -36.236751556396484, "loss": 0.5218, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.29383981227874756, "rewards/margins": 0.44584956765174866, "rewards/rejected": -0.15200971066951752, "step": 340 }, { "epoch": 0.91, "grad_norm": 7.09375, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.9213838577270508, "logits/rejected": -1.9220517873764038, "logps/chosen": -33.60356140136719, "logps/rejected": -34.14032745361328, "loss": 0.527, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3206879496574402, "rewards/margins": 0.43722790479660034, "rewards/rejected": -0.11653995513916016, "step": 350 }, { "epoch": 0.94, "grad_norm": 7.4375, "learning_rate": 4.73238359114687e-06, "logits/chosen": -2.0467612743377686, "logits/rejected": -2.0528926849365234, "logps/chosen": -31.081350326538086, "logps/rejected": -33.01131057739258, "loss": 0.5605, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.1967591494321823, "rewards/margins": 0.3339698314666748, "rewards/rejected": -0.1372106969356537, "step": 360 }, { "epoch": 0.96, "grad_norm": 11.25, "learning_rate": 4.706303941965804e-06, "logits/chosen": -1.9743931293487549, "logits/rejected": -1.9739511013031006, "logps/chosen": -32.78725814819336, "logps/rejected": -36.41063690185547, "loss": 0.5304, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.26307255029678345, "rewards/margins": 0.4074175953865051, "rewards/rejected": -0.14434504508972168, "step": 370 }, { "epoch": 0.99, "grad_norm": 6.78125, "learning_rate": 4.679090796681225e-06, "logits/chosen": -2.004617214202881, "logits/rejected": -2.000023365020752, "logps/chosen": -30.0998592376709, "logps/rejected": -29.62681007385254, "loss": 0.531, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2442973405122757, "rewards/margins": 0.4094654619693756, "rewards/rejected": -0.1651681363582611, "step": 380 }, { "epoch": 1.01, "grad_norm": 8.75, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.7745968103408813, "logits/rejected": -1.7809730768203735, "logps/chosen": -31.600543975830078, "logps/rejected": -36.76545333862305, "loss": 0.4808, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.31406909227371216, "rewards/margins": 0.5729340314865112, "rewards/rejected": -0.2588648796081543, "step": 390 }, { "epoch": 1.04, "grad_norm": 7.21875, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.9251466989517212, "logits/rejected": -1.9188610315322876, "logps/chosen": -32.989654541015625, "logps/rejected": -32.73323059082031, "loss": 0.5071, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.33721691370010376, "rewards/margins": 0.5069588422775269, "rewards/rejected": -0.16974198818206787, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.1922616958618164, "eval_logits/rejected": -2.1874282360076904, "eval_logps/chosen": -34.242794036865234, "eval_logps/rejected": -37.80149841308594, "eval_loss": 0.6867499947547913, "eval_rewards/accuracies": 0.5510797500610352, "eval_rewards/chosen": -0.08329664915800095, "eval_rewards/margins": 0.030653679743409157, "eval_rewards/rejected": -0.11395032703876495, "eval_runtime": 145.5198, "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 400 }, { "epoch": 1.06, "grad_norm": 6.8125, "learning_rate": 4.590793060955158e-06, "logits/chosen": -1.922146201133728, "logits/rejected": -1.9294363260269165, "logps/chosen": -28.543231964111328, "logps/rejected": -29.782611846923828, "loss": 0.4969, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.23085805773735046, "rewards/margins": 0.5046137571334839, "rewards/rejected": -0.2737556993961334, "step": 410 }, { "epoch": 1.09, "grad_norm": 7.375, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.939373254776001, "logits/rejected": -1.938410758972168, "logps/chosen": -33.535621643066406, "logps/rejected": -31.325557708740234, "loss": 0.5345, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.28603512048721313, "rewards/margins": 0.4390401244163513, "rewards/rejected": -0.15300500392913818, "step": 420 }, { "epoch": 1.12, "grad_norm": 8.0625, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.9316809177398682, "logits/rejected": -1.9144115447998047, "logps/chosen": -30.42580223083496, "logps/rejected": -33.712886810302734, "loss": 0.4939, "rewards/accuracies": 0.875, "rewards/chosen": 0.2205621749162674, "rewards/margins": 0.5352030992507935, "rewards/rejected": -0.31464093923568726, "step": 430 }, { "epoch": 1.14, "grad_norm": 8.125, "learning_rate": 4.492831268057307e-06, "logits/chosen": -1.9621837139129639, "logits/rejected": -1.9643888473510742, "logps/chosen": -35.697532653808594, "logps/rejected": -35.595943450927734, "loss": 0.4474, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.36593085527420044, "rewards/margins": 0.6603227853775024, "rewards/rejected": -0.2943919003009796, "step": 440 }, { "epoch": 1.17, "grad_norm": 7.125, "learning_rate": 4.458106782690094e-06, "logits/chosen": -2.035297155380249, "logits/rejected": -2.035062074661255, "logps/chosen": -31.8659725189209, "logps/rejected": -34.10185241699219, "loss": 0.5013, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.306951105594635, "rewards/margins": 0.523645281791687, "rewards/rejected": -0.2166941612958908, "step": 450 }, { "epoch": 1.19, "grad_norm": 7.78125, "learning_rate": 4.422376313348405e-06, "logits/chosen": -1.9762487411499023, "logits/rejected": -1.968775987625122, "logps/chosen": -31.517736434936523, "logps/rejected": -36.918033599853516, "loss": 0.4394, "rewards/accuracies": 0.875, "rewards/chosen": 0.3552888035774231, "rewards/margins": 0.7008857131004333, "rewards/rejected": -0.3455968499183655, "step": 460 }, { "epoch": 1.22, "grad_norm": 8.375, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.877435326576233, "logits/rejected": -1.8740183115005493, "logps/chosen": -33.043678283691406, "logps/rejected": -33.61932373046875, "loss": 0.4679, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.3554477095603943, "rewards/margins": 0.6336251497268677, "rewards/rejected": -0.2781774401664734, "step": 470 }, { "epoch": 1.25, "grad_norm": 8.5, "learning_rate": 4.347971356735789e-06, "logits/chosen": -2.00216007232666, "logits/rejected": -1.9952083826065063, "logps/chosen": -30.32107925415039, "logps/rejected": -32.764427185058594, "loss": 0.5165, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20298178493976593, "rewards/margins": 0.5113586187362671, "rewards/rejected": -0.30837681889533997, "step": 480 }, { "epoch": 1.27, "grad_norm": 6.78125, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.943036437034607, "logits/rejected": -1.9446159601211548, "logps/chosen": -34.45885467529297, "logps/rejected": -34.452476501464844, "loss": 0.4477, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.35773298144340515, "rewards/margins": 0.6821189522743225, "rewards/rejected": -0.32438600063323975, "step": 490 }, { "epoch": 1.3, "grad_norm": 9.1875, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.8326492309570312, "logits/rejected": -1.830275297164917, "logps/chosen": -32.28834915161133, "logps/rejected": -37.49669647216797, "loss": 0.4361, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.3535749316215515, "rewards/margins": 0.766524076461792, "rewards/rejected": -0.4129491448402405, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.173431634902954, "eval_logits/rejected": -2.1686556339263916, "eval_logps/chosen": -34.44293975830078, "eval_logps/rejected": -38.02019500732422, "eval_loss": 0.6902198195457458, "eval_rewards/accuracies": 0.5365448594093323, "eval_rewards/chosen": -0.16335560381412506, "eval_rewards/margins": 0.03807440027594566, "eval_rewards/rejected": -0.20143000781536102, "eval_runtime": 145.4318, "eval_samples_per_second": 2.358, "eval_steps_per_second": 0.296, "step": 500 }, { "epoch": 1.32, "grad_norm": 6.96875, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.9319264888763428, "logits/rejected": -1.936668038368225, "logps/chosen": -32.5339241027832, "logps/rejected": -32.27501678466797, "loss": 0.471, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.3693718910217285, "rewards/margins": 0.6209360361099243, "rewards/rejected": -0.2515642046928406, "step": 510 }, { "epoch": 1.35, "grad_norm": 7.53125, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.9589513540267944, "logits/rejected": -1.969935655593872, "logps/chosen": -30.51776123046875, "logps/rejected": -32.34286117553711, "loss": 0.4588, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3490603566169739, "rewards/margins": 0.6831297874450684, "rewards/rejected": -0.3340694308280945, "step": 520 }, { "epoch": 1.38, "grad_norm": 4.96875, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.8863223791122437, "logits/rejected": -1.8831069469451904, "logps/chosen": -31.701770782470703, "logps/rejected": -33.229095458984375, "loss": 0.4686, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.2874954342842102, "rewards/margins": 0.6340224742889404, "rewards/rejected": -0.3465271592140198, "step": 530 }, { "epoch": 1.4, "grad_norm": 6.9375, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.7517893314361572, "logits/rejected": -1.7610219717025757, "logps/chosen": -31.630868911743164, "logps/rejected": -32.625404357910156, "loss": 0.4697, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.3754081726074219, "rewards/margins": 0.6831758618354797, "rewards/rejected": -0.3077676296234131, "step": 540 }, { "epoch": 1.43, "grad_norm": 7.15625, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.8443384170532227, "logits/rejected": -1.8382275104522705, "logps/chosen": -32.76479721069336, "logps/rejected": -31.679241180419922, "loss": 0.4737, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.3774031698703766, "rewards/margins": 0.6525603532791138, "rewards/rejected": -0.2751571536064148, "step": 550 }, { "epoch": 1.45, "grad_norm": 6.75, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.932416319847107, "logits/rejected": -1.9304752349853516, "logps/chosen": -33.61432647705078, "logps/rejected": -32.20708084106445, "loss": 0.4728, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.3827466368675232, "rewards/margins": 0.6666241884231567, "rewards/rejected": -0.2838776111602783, "step": 560 }, { "epoch": 1.48, "grad_norm": 7.28125, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.7655744552612305, "logits/rejected": -1.7635431289672852, "logps/chosen": -31.894184112548828, "logps/rejected": -31.694026947021484, "loss": 0.4873, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.3608909249305725, "rewards/margins": 0.6375805735588074, "rewards/rejected": -0.27668967843055725, "step": 570 }, { "epoch": 1.51, "grad_norm": 10.4375, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.9005470275878906, "logits/rejected": -1.8971643447875977, "logps/chosen": -30.140361785888672, "logps/rejected": -35.64023971557617, "loss": 0.4311, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.38571348786354065, "rewards/margins": 0.7913459539413452, "rewards/rejected": -0.40563249588012695, "step": 580 }, { "epoch": 1.53, "grad_norm": 6.0, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.835296392440796, "logits/rejected": -1.8387609720230103, "logps/chosen": -28.92897605895996, "logps/rejected": -31.326541900634766, "loss": 0.4892, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.2744215726852417, "rewards/margins": 0.5414907336235046, "rewards/rejected": -0.2670692205429077, "step": 590 }, { "epoch": 1.56, "grad_norm": 7.34375, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.870452880859375, "logits/rejected": -1.8704124689102173, "logps/chosen": -31.048049926757812, "logps/rejected": -31.96176528930664, "loss": 0.4356, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.4693926274776459, "rewards/margins": 0.776892364025116, "rewards/rejected": -0.3074997365474701, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.1470563411712646, "eval_logits/rejected": -2.1422781944274902, "eval_logps/chosen": -34.333770751953125, "eval_logps/rejected": -37.99900436401367, "eval_loss": 0.6749633550643921, "eval_rewards/accuracies": 0.5834717750549316, "eval_rewards/chosen": -0.11968887597322464, "eval_rewards/margins": 0.07326464354991913, "eval_rewards/rejected": -0.19295351207256317, "eval_runtime": 145.1843, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.296, "step": 600 }, { "epoch": 1.58, "grad_norm": 7.53125, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -1.9326798915863037, "logits/rejected": -1.9334518909454346, "logps/chosen": -33.740760803222656, "logps/rejected": -34.0605354309082, "loss": 0.4085, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5157531499862671, "rewards/margins": 0.9062908887863159, "rewards/rejected": -0.3905377686023712, "step": 610 }, { "epoch": 1.61, "grad_norm": 6.25, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.894683837890625, "logits/rejected": -1.8922332525253296, "logps/chosen": -31.973896026611328, "logps/rejected": -32.93384552001953, "loss": 0.4327, "rewards/accuracies": 0.875, "rewards/chosen": 0.460601270198822, "rewards/margins": 0.8061720132827759, "rewards/rejected": -0.34557071328163147, "step": 620 }, { "epoch": 1.64, "grad_norm": 13.625, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.9046722650527954, "logits/rejected": -1.8957364559173584, "logps/chosen": -31.66204833984375, "logps/rejected": -31.843969345092773, "loss": 0.4473, "rewards/accuracies": 0.8125, "rewards/chosen": 0.44562751054763794, "rewards/margins": 0.8090687990188599, "rewards/rejected": -0.36344125866889954, "step": 630 }, { "epoch": 1.66, "grad_norm": 9.25, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.905128836631775, "logits/rejected": -1.901825189590454, "logps/chosen": -31.2297306060791, "logps/rejected": -33.26097106933594, "loss": 0.4435, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.38747596740722656, "rewards/margins": 0.7240976095199585, "rewards/rejected": -0.3366217017173767, "step": 640 }, { "epoch": 1.69, "grad_norm": 7.21875, "learning_rate": 3.579601087369492e-06, "logits/chosen": -1.9152917861938477, "logits/rejected": -1.9175786972045898, "logps/chosen": -32.58631134033203, "logps/rejected": -34.52180099487305, "loss": 0.421, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.426957368850708, "rewards/margins": 0.7623184323310852, "rewards/rejected": -0.3353610634803772, "step": 650 }, { "epoch": 1.71, "grad_norm": 10.8125, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.8232342004776, "logits/rejected": -1.8236335515975952, "logps/chosen": -32.828582763671875, "logps/rejected": -32.33021545410156, "loss": 0.4383, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.447300523519516, "rewards/margins": 0.736484169960022, "rewards/rejected": -0.28918370604515076, "step": 660 }, { "epoch": 1.74, "grad_norm": 9.9375, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.8623815774917603, "logits/rejected": -1.852773666381836, "logps/chosen": -32.85457992553711, "logps/rejected": -33.26173782348633, "loss": 0.4009, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.4586804509162903, "rewards/margins": 0.8402576446533203, "rewards/rejected": -0.38157716393470764, "step": 670 }, { "epoch": 1.77, "grad_norm": 6.28125, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.7598998546600342, "logits/rejected": -1.7566627264022827, "logps/chosen": -30.202777862548828, "logps/rejected": -35.252830505371094, "loss": 0.3982, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5032743215560913, "rewards/margins": 0.8769756555557251, "rewards/rejected": -0.37370121479034424, "step": 680 }, { "epoch": 1.79, "grad_norm": 5.84375, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.8207229375839233, "logits/rejected": -1.8200151920318604, "logps/chosen": -33.87450408935547, "logps/rejected": -36.40219497680664, "loss": 0.3646, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5583134293556213, "rewards/margins": 0.9776619076728821, "rewards/rejected": -0.41934847831726074, "step": 690 }, { "epoch": 1.82, "grad_norm": 5.9375, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.7926433086395264, "logits/rejected": -1.7952711582183838, "logps/chosen": -31.316104888916016, "logps/rejected": -36.66993713378906, "loss": 0.4036, "rewards/accuracies": 0.875, "rewards/chosen": 0.51469886302948, "rewards/margins": 0.9260055422782898, "rewards/rejected": -0.4113067090511322, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.1232047080993652, "eval_logits/rejected": -2.11846923828125, "eval_logps/chosen": -34.40776824951172, "eval_logps/rejected": -38.10317611694336, "eval_loss": 0.6718631386756897, "eval_rewards/accuracies": 0.5859634280204773, "eval_rewards/chosen": -0.14928743243217468, "eval_rewards/margins": 0.08533468097448349, "eval_rewards/rejected": -0.23462210595607758, "eval_runtime": 145.3862, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.296, "step": 700 }, { "epoch": 1.84, "grad_norm": 8.4375, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.8965635299682617, "logits/rejected": -1.8991920948028564, "logps/chosen": -33.687984466552734, "logps/rejected": -35.19561004638672, "loss": 0.4379, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.483041912317276, "rewards/margins": 0.8532260060310364, "rewards/rejected": -0.3701840341091156, "step": 710 }, { "epoch": 1.87, "grad_norm": 8.5625, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.7496124505996704, "logits/rejected": -1.7437019348144531, "logps/chosen": -33.61560821533203, "logps/rejected": -33.33930587768555, "loss": 0.4064, "rewards/accuracies": 0.875, "rewards/chosen": 0.4870489239692688, "rewards/margins": 0.882034182548523, "rewards/rejected": -0.3949853479862213, "step": 720 }, { "epoch": 1.9, "grad_norm": 5.875, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.923055648803711, "logits/rejected": -1.919946312904358, "logps/chosen": -29.562047958374023, "logps/rejected": -32.68622970581055, "loss": 0.3979, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.4363761842250824, "rewards/margins": 0.9315892457962036, "rewards/rejected": -0.49521297216415405, "step": 730 }, { "epoch": 1.92, "grad_norm": 6.71875, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.7599384784698486, "logits/rejected": -1.7601182460784912, "logps/chosen": -32.07990264892578, "logps/rejected": -38.58934783935547, "loss": 0.3994, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5533084869384766, "rewards/margins": 0.9379235506057739, "rewards/rejected": -0.38461512327194214, "step": 740 }, { "epoch": 1.95, "grad_norm": 5.59375, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.6459096670150757, "logits/rejected": -1.651227355003357, "logps/chosen": -35.65028762817383, "logps/rejected": -35.04454803466797, "loss": 0.4218, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.5128704905509949, "rewards/margins": 0.9417527914047241, "rewards/rejected": -0.428882360458374, "step": 750 }, { "epoch": 1.97, "grad_norm": 8.125, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.8414586782455444, "logits/rejected": -1.8431018590927124, "logps/chosen": -31.405654907226562, "logps/rejected": -33.640296936035156, "loss": 0.4105, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.48001009225845337, "rewards/margins": 0.8244431614875793, "rewards/rejected": -0.34443309903144836, "step": 760 }, { "epoch": 2.0, "grad_norm": 8.1875, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.8179328441619873, "logits/rejected": -1.8168070316314697, "logps/chosen": -30.935861587524414, "logps/rejected": -35.76841354370117, "loss": 0.433, "rewards/accuracies": 0.85833340883255, "rewards/chosen": 0.4958992004394531, "rewards/margins": 0.7920029759407043, "rewards/rejected": -0.2961038649082184, "step": 770 }, { "epoch": 2.03, "grad_norm": 5.3125, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.8167203664779663, "logits/rejected": -1.8160884380340576, "logps/chosen": -33.24877166748047, "logps/rejected": -32.76862716674805, "loss": 0.3406, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5598764419555664, "rewards/margins": 1.0995185375213623, "rewards/rejected": -0.5396420359611511, "step": 780 }, { "epoch": 2.05, "grad_norm": 6.75, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.8443057537078857, "logits/rejected": -1.842907190322876, "logps/chosen": -28.221912384033203, "logps/rejected": -34.14670944213867, "loss": 0.3187, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.536507248878479, "rewards/margins": 1.140665054321289, "rewards/rejected": -0.6041578054428101, "step": 790 }, { "epoch": 2.08, "grad_norm": 5.0625, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.8954391479492188, "logits/rejected": -1.892087697982788, "logps/chosen": -30.82273292541504, "logps/rejected": -35.755470275878906, "loss": 0.2952, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6417907476425171, "rewards/margins": 1.2643665075302124, "rewards/rejected": -0.6225756406784058, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.1026289463043213, "eval_logits/rejected": -2.097952127456665, "eval_logps/chosen": -34.54673767089844, "eval_logps/rejected": -38.26842498779297, "eval_loss": 0.6757632493972778, "eval_rewards/accuracies": 0.5776578187942505, "eval_rewards/chosen": -0.20487497746944427, "eval_rewards/margins": 0.0958474650979042, "eval_rewards/rejected": -0.3007224202156067, "eval_runtime": 145.3563, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 800 }, { "epoch": 2.1, "grad_norm": 5.71875, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.721960425376892, "logits/rejected": -1.7143983840942383, "logps/chosen": -32.55122375488281, "logps/rejected": -36.20327377319336, "loss": 0.2827, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.7620068192481995, "rewards/margins": 1.4192689657211304, "rewards/rejected": -0.6572622060775757, "step": 810 }, { "epoch": 2.13, "grad_norm": 6.25, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.797978401184082, "logits/rejected": -1.8082430362701416, "logps/chosen": -33.965431213378906, "logps/rejected": -33.7777214050293, "loss": 0.3245, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6921030282974243, "rewards/margins": 1.2306172847747803, "rewards/rejected": -0.5385143160820007, "step": 820 }, { "epoch": 2.16, "grad_norm": 7.71875, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.847389817237854, "logits/rejected": -1.8522322177886963, "logps/chosen": -33.48334503173828, "logps/rejected": -30.594324111938477, "loss": 0.3479, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5677754282951355, "rewards/margins": 1.0982062816619873, "rewards/rejected": -0.5304308533668518, "step": 830 }, { "epoch": 2.18, "grad_norm": 5.40625, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.846261978149414, "logits/rejected": -1.8545395135879517, "logps/chosen": -33.14823913574219, "logps/rejected": -31.363750457763672, "loss": 0.3104, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6559160947799683, "rewards/margins": 1.2623378038406372, "rewards/rejected": -0.6064217686653137, "step": 840 }, { "epoch": 2.21, "grad_norm": 6.96875, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.7743536233901978, "logits/rejected": -1.7648853063583374, "logps/chosen": -30.302188873291016, "logps/rejected": -33.442298889160156, "loss": 0.329, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5964034795761108, "rewards/margins": 1.1678011417388916, "rewards/rejected": -0.5713975429534912, "step": 850 }, { "epoch": 2.23, "grad_norm": 4.78125, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.9121280908584595, "logits/rejected": -1.9022724628448486, "logps/chosen": -24.69386100769043, "logps/rejected": -31.476959228515625, "loss": 0.3155, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6120836138725281, "rewards/margins": 1.2539312839508057, "rewards/rejected": -0.6418476104736328, "step": 860 }, { "epoch": 2.26, "grad_norm": 5.78125, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.7656368017196655, "logits/rejected": -1.7670552730560303, "logps/chosen": -32.631500244140625, "logps/rejected": -31.436681747436523, "loss": 0.3053, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6806014180183411, "rewards/margins": 1.2622734308242798, "rewards/rejected": -0.5816720724105835, "step": 870 }, { "epoch": 2.29, "grad_norm": 6.125, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.7689151763916016, "logits/rejected": -1.7699298858642578, "logps/chosen": -31.100326538085938, "logps/rejected": -34.68801498413086, "loss": 0.3503, "rewards/accuracies": 0.875, "rewards/chosen": 0.515569806098938, "rewards/margins": 1.1704087257385254, "rewards/rejected": -0.6548389196395874, "step": 880 }, { "epoch": 2.31, "grad_norm": 5.5625, "learning_rate": 2.256719512667651e-06, "logits/chosen": -1.6725788116455078, "logits/rejected": -1.671099066734314, "logps/chosen": -34.11555099487305, "logps/rejected": -37.912750244140625, "loss": 0.2954, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6197448968887329, "rewards/margins": 1.4339497089385986, "rewards/rejected": -0.8142046928405762, "step": 890 }, { "epoch": 2.34, "grad_norm": 6.25, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.8256248235702515, "logits/rejected": -1.8188546895980835, "logps/chosen": -30.36061668395996, "logps/rejected": -34.45724105834961, "loss": 0.304, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6568080186843872, "rewards/margins": 1.3364773988723755, "rewards/rejected": -0.6796693801879883, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.0867063999176025, "eval_logits/rejected": -2.082029342651367, "eval_logps/chosen": -34.61775207519531, "eval_logps/rejected": -38.34459686279297, "eval_loss": 0.6778721809387207, "eval_rewards/accuracies": 0.595099687576294, "eval_rewards/chosen": -0.23328028619289398, "eval_rewards/margins": 0.09790942072868347, "eval_rewards/rejected": -0.33118972182273865, "eval_runtime": 145.1775, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.296, "step": 900 }, { "epoch": 2.36, "grad_norm": 6.46875, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.858393907546997, "logits/rejected": -1.8537986278533936, "logps/chosen": -31.981470108032227, "logps/rejected": -36.285892486572266, "loss": 0.2951, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6304842233657837, "rewards/margins": 1.3333914279937744, "rewards/rejected": -0.7029072642326355, "step": 910 }, { "epoch": 2.39, "grad_norm": 5.8125, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.777714729309082, "logits/rejected": -1.7827917337417603, "logps/chosen": -33.658485412597656, "logps/rejected": -34.12433624267578, "loss": 0.2995, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6760393977165222, "rewards/margins": 1.3258110284805298, "rewards/rejected": -0.649771511554718, "step": 920 }, { "epoch": 2.42, "grad_norm": 5.0, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -1.8777525424957275, "logits/rejected": -1.8685945272445679, "logps/chosen": -30.748672485351562, "logps/rejected": -36.116886138916016, "loss": 0.294, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.614503026008606, "rewards/margins": 1.2841379642486572, "rewards/rejected": -0.6696349382400513, "step": 930 }, { "epoch": 2.44, "grad_norm": 7.125, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.8024845123291016, "logits/rejected": -1.7992064952850342, "logps/chosen": -30.062992095947266, "logps/rejected": -33.89337921142578, "loss": 0.356, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.49018430709838867, "rewards/margins": 1.0613863468170166, "rewards/rejected": -0.5712020993232727, "step": 940 }, { "epoch": 2.47, "grad_norm": 4.4375, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.73916757106781, "logits/rejected": -1.7464996576309204, "logps/chosen": -31.560144424438477, "logps/rejected": -37.22822189331055, "loss": 0.3426, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5243637561798096, "rewards/margins": 1.1956799030303955, "rewards/rejected": -0.6713162660598755, "step": 950 }, { "epoch": 2.49, "grad_norm": 4.5, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.7508230209350586, "logits/rejected": -1.7439250946044922, "logps/chosen": -32.305728912353516, "logps/rejected": -38.560791015625, "loss": 0.2871, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6091153621673584, "rewards/margins": 1.3939114809036255, "rewards/rejected": -0.784795880317688, "step": 960 }, { "epoch": 2.52, "grad_norm": 5.75, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.752611517906189, "logits/rejected": -1.7530120611190796, "logps/chosen": -29.592453002929688, "logps/rejected": -34.51519775390625, "loss": 0.3156, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6575800180435181, "rewards/margins": 1.2711557149887085, "rewards/rejected": -0.6135755777359009, "step": 970 }, { "epoch": 2.55, "grad_norm": 6.53125, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.7550010681152344, "logits/rejected": -1.7661269903182983, "logps/chosen": -31.351938247680664, "logps/rejected": -35.77216339111328, "loss": 0.3031, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.68990159034729, "rewards/margins": 1.3462820053100586, "rewards/rejected": -0.6563804149627686, "step": 980 }, { "epoch": 2.57, "grad_norm": 4.59375, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.8242708444595337, "logits/rejected": -1.821933388710022, "logps/chosen": -34.63307189941406, "logps/rejected": -33.89019012451172, "loss": 0.3595, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.5951918363571167, "rewards/margins": 1.1915771961212158, "rewards/rejected": -0.5963853597640991, "step": 990 }, { "epoch": 2.6, "grad_norm": 6.53125, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.7967815399169922, "logits/rejected": -1.8010671138763428, "logps/chosen": -32.9234619140625, "logps/rejected": -35.8967399597168, "loss": 0.333, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6159319281578064, "rewards/margins": 1.1802124977111816, "rewards/rejected": -0.5642803907394409, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.072319984436035, "eval_logits/rejected": -2.0676779747009277, "eval_logps/chosen": -34.67308044433594, "eval_logps/rejected": -38.430641174316406, "eval_loss": 0.676977813243866, "eval_rewards/accuracies": 0.5859634280204773, "eval_rewards/chosen": -0.2554103434085846, "eval_rewards/margins": 0.11019979417324066, "eval_rewards/rejected": -0.36561012268066406, "eval_runtime": 145.2025, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.296, "step": 1000 }, { "epoch": 2.62, "grad_norm": 9.9375, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.8064178228378296, "logits/rejected": -1.811234712600708, "logps/chosen": -31.44000244140625, "logps/rejected": -33.90888977050781, "loss": 0.3712, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.5441684126853943, "rewards/margins": 1.0992209911346436, "rewards/rejected": -0.5550524592399597, "step": 1010 }, { "epoch": 2.65, "grad_norm": 7.46875, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -1.7619049549102783, "logits/rejected": -1.7642395496368408, "logps/chosen": -28.927509307861328, "logps/rejected": -33.50086212158203, "loss": 0.2941, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6072515845298767, "rewards/margins": 1.3279320001602173, "rewards/rejected": -0.7206803560256958, "step": 1020 }, { "epoch": 2.68, "grad_norm": 4.5, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.8155397176742554, "logits/rejected": -1.8104407787322998, "logps/chosen": -32.632835388183594, "logps/rejected": -35.2443962097168, "loss": 0.3301, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.5425110459327698, "rewards/margins": 1.233070969581604, "rewards/rejected": -0.6905598640441895, "step": 1030 }, { "epoch": 2.7, "grad_norm": 9.5, "learning_rate": 1.440887158673332e-06, "logits/chosen": -1.7551313638687134, "logits/rejected": -1.7592474222183228, "logps/chosen": -34.41304397583008, "logps/rejected": -35.99690246582031, "loss": 0.3295, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5362745523452759, "rewards/margins": 1.167203664779663, "rewards/rejected": -0.6309291124343872, "step": 1040 }, { "epoch": 2.73, "grad_norm": 5.75, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.751050591468811, "logits/rejected": -1.7637557983398438, "logps/chosen": -30.462039947509766, "logps/rejected": -34.61774444580078, "loss": 0.3249, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5603402853012085, "rewards/margins": 1.1994216442108154, "rewards/rejected": -0.6390813589096069, "step": 1050 }, { "epoch": 2.75, "grad_norm": 9.4375, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -1.7376810312271118, "logits/rejected": -1.7309871912002563, "logps/chosen": -30.55047035217285, "logps/rejected": -34.8571662902832, "loss": 0.3058, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6435432434082031, "rewards/margins": 1.259224534034729, "rewards/rejected": -0.6156812310218811, "step": 1060 }, { "epoch": 2.78, "grad_norm": 6.1875, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.8616443872451782, "logits/rejected": -1.8623626232147217, "logps/chosen": -32.02442169189453, "logps/rejected": -34.386295318603516, "loss": 0.3145, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6090625524520874, "rewards/margins": 1.2257022857666016, "rewards/rejected": -0.6166397333145142, "step": 1070 }, { "epoch": 2.81, "grad_norm": 6.59375, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -1.7933332920074463, "logits/rejected": -1.7921574115753174, "logps/chosen": -32.67540740966797, "logps/rejected": -34.564029693603516, "loss": 0.3561, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.5657684206962585, "rewards/margins": 1.1146466732025146, "rewards/rejected": -0.5488781929016113, "step": 1080 }, { "epoch": 2.83, "grad_norm": 5.84375, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -1.8151838779449463, "logits/rejected": -1.8076585531234741, "logps/chosen": -32.7107048034668, "logps/rejected": -32.47800827026367, "loss": 0.3302, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6060681939125061, "rewards/margins": 1.1806586980819702, "rewards/rejected": -0.5745903849601746, "step": 1090 }, { "epoch": 2.86, "grad_norm": 4.59375, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.7889270782470703, "logits/rejected": -1.787825584411621, "logps/chosen": -33.79990005493164, "logps/rejected": -38.28570556640625, "loss": 0.2846, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6474052667617798, "rewards/margins": 1.4206043481826782, "rewards/rejected": -0.7731989622116089, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.069221258163452, "eval_logits/rejected": -2.0645899772644043, "eval_logps/chosen": -34.66777420043945, "eval_logps/rejected": -38.41979217529297, "eval_loss": 0.6772051453590393, "eval_rewards/accuracies": 0.6009136438369751, "eval_rewards/chosen": -0.2532878816127777, "eval_rewards/margins": 0.10798129439353943, "eval_rewards/rejected": -0.36126917600631714, "eval_runtime": 145.1766, "eval_samples_per_second": 2.363, "eval_steps_per_second": 0.296, "step": 1100 }, { "epoch": 2.88, "grad_norm": 6.9375, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -1.836913824081421, "logits/rejected": -1.8424360752105713, "logps/chosen": -34.19898223876953, "logps/rejected": -37.27791213989258, "loss": 0.3393, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5072540044784546, "rewards/margins": 1.200984239578247, "rewards/rejected": -0.6937301158905029, "step": 1110 }, { "epoch": 2.91, "grad_norm": 5.09375, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.8054929971694946, "logits/rejected": -1.8034013509750366, "logps/chosen": -31.228893280029297, "logps/rejected": -34.67009353637695, "loss": 0.3249, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.5549246668815613, "rewards/margins": 1.229860544204712, "rewards/rejected": -0.6749356985092163, "step": 1120 }, { "epoch": 2.94, "grad_norm": 4.78125, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.7074657678604126, "logits/rejected": -1.710054636001587, "logps/chosen": -28.319087982177734, "logps/rejected": -32.21437454223633, "loss": 0.2869, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5718191266059875, "rewards/margins": 1.3595529794692993, "rewards/rejected": -0.7877337336540222, "step": 1130 }, { "epoch": 2.96, "grad_norm": 5.8125, "learning_rate": 9.59060791022566e-07, "logits/chosen": -1.8046897649765015, "logits/rejected": -1.8014905452728271, "logps/chosen": -31.595922470092773, "logps/rejected": -33.86729431152344, "loss": 0.3331, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6431301832199097, "rewards/margins": 1.1792962551116943, "rewards/rejected": -0.5361660718917847, "step": 1140 }, { "epoch": 2.99, "grad_norm": 6.40625, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.7242072820663452, "logits/rejected": -1.7179205417633057, "logps/chosen": -26.89887046813965, "logps/rejected": -33.96310806274414, "loss": 0.3234, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.49564680457115173, "rewards/margins": 1.2215702533721924, "rewards/rejected": -0.7259235382080078, "step": 1150 }, { "epoch": 3.01, "grad_norm": 5.34375, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.833547830581665, "logits/rejected": -1.8309831619262695, "logps/chosen": -31.566417694091797, "logps/rejected": -36.94977569580078, "loss": 0.3005, "rewards/accuracies": 0.9458333849906921, "rewards/chosen": 0.611860990524292, "rewards/margins": 1.3551568984985352, "rewards/rejected": -0.7432958483695984, "step": 1160 }, { "epoch": 3.04, "grad_norm": 4.875, "learning_rate": 8.288586291031025e-07, "logits/chosen": -1.8228060007095337, "logits/rejected": -1.8206332921981812, "logps/chosen": -30.116840362548828, "logps/rejected": -34.350860595703125, "loss": 0.2812, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5990268588066101, "rewards/margins": 1.3937623500823975, "rewards/rejected": -0.7947354912757874, "step": 1170 }, { "epoch": 3.06, "grad_norm": 4.65625, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.8162040710449219, "logits/rejected": -1.804120659828186, "logps/chosen": -34.508934020996094, "logps/rejected": -36.088863372802734, "loss": 0.2775, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6145464181900024, "rewards/margins": 1.4215012788772583, "rewards/rejected": -0.8069549798965454, "step": 1180 }, { "epoch": 3.09, "grad_norm": 5.625, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.7128658294677734, "logits/rejected": -1.7075669765472412, "logps/chosen": -33.452003479003906, "logps/rejected": -35.91048812866211, "loss": 0.2738, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7590751647949219, "rewards/margins": 1.42814040184021, "rewards/rejected": -0.6690651774406433, "step": 1190 }, { "epoch": 3.12, "grad_norm": 5.125, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.7600603103637695, "logits/rejected": -1.7632497549057007, "logps/chosen": -31.24569320678711, "logps/rejected": -33.11017608642578, "loss": 0.3301, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5477610230445862, "rewards/margins": 1.1973613500595093, "rewards/rejected": -0.6496003866195679, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.0681374073028564, "eval_logits/rejected": -2.0634970664978027, "eval_logps/chosen": -34.671016693115234, "eval_logps/rejected": -38.42964172363281, "eval_loss": 0.6771031618118286, "eval_rewards/accuracies": 0.5776578187942505, "eval_rewards/chosen": -0.25458672642707825, "eval_rewards/margins": 0.11062110960483551, "eval_rewards/rejected": -0.36520785093307495, "eval_runtime": 145.3934, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.296, "step": 1200 }, { "epoch": 3.14, "grad_norm": 6.3125, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.7789433002471924, "logits/rejected": -1.7744804620742798, "logps/chosen": -29.76243019104004, "logps/rejected": -32.72071075439453, "loss": 0.2864, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6044963002204895, "rewards/margins": 1.3295423984527588, "rewards/rejected": -0.7250461578369141, "step": 1210 }, { "epoch": 3.17, "grad_norm": 5.03125, "learning_rate": 6.293013489185315e-07, "logits/chosen": -1.7943885326385498, "logits/rejected": -1.7889045476913452, "logps/chosen": -33.75127410888672, "logps/rejected": -36.586204528808594, "loss": 0.2973, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6090667247772217, "rewards/margins": 1.4155981540679932, "rewards/rejected": -0.8065314292907715, "step": 1220 }, { "epoch": 3.19, "grad_norm": 4.9375, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.7955095767974854, "logits/rejected": -1.782134771347046, "logps/chosen": -31.721553802490234, "logps/rejected": -36.032196044921875, "loss": 0.3156, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.5489839315414429, "rewards/margins": 1.3139245510101318, "rewards/rejected": -0.764940619468689, "step": 1230 }, { "epoch": 3.22, "grad_norm": 4.5625, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.8425153493881226, "logits/rejected": -1.8402408361434937, "logps/chosen": -33.340415954589844, "logps/rejected": -33.357208251953125, "loss": 0.2976, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.6769941449165344, "rewards/margins": 1.2839314937591553, "rewards/rejected": -0.6069372296333313, "step": 1240 }, { "epoch": 3.25, "grad_norm": 4.4375, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.7631075382232666, "logits/rejected": -1.7602026462554932, "logps/chosen": -32.7774658203125, "logps/rejected": -37.03419876098633, "loss": 0.2631, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6474747061729431, "rewards/margins": 1.5423027276992798, "rewards/rejected": -0.8948280215263367, "step": 1250 }, { "epoch": 3.27, "grad_norm": 5.0, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.836340308189392, "logits/rejected": -1.8310245275497437, "logps/chosen": -29.116525650024414, "logps/rejected": -34.146141052246094, "loss": 0.2836, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6577095985412598, "rewards/margins": 1.3631455898284912, "rewards/rejected": -0.7054358720779419, "step": 1260 }, { "epoch": 3.3, "grad_norm": 4.90625, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.8614356517791748, "logits/rejected": -1.8663842678070068, "logps/chosen": -30.975250244140625, "logps/rejected": -33.32194137573242, "loss": 0.2845, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6942988634109497, "rewards/margins": 1.350528359413147, "rewards/rejected": -0.656229555606842, "step": 1270 }, { "epoch": 3.32, "grad_norm": 6.34375, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.7541017532348633, "logits/rejected": -1.761639952659607, "logps/chosen": -33.46928024291992, "logps/rejected": -34.43730163574219, "loss": 0.3088, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5751846432685852, "rewards/margins": 1.3107969760894775, "rewards/rejected": -0.7356122732162476, "step": 1280 }, { "epoch": 3.35, "grad_norm": 5.78125, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.6694676876068115, "logits/rejected": -1.671893835067749, "logps/chosen": -31.393539428710938, "logps/rejected": -38.413612365722656, "loss": 0.2938, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6953436732292175, "rewards/margins": 1.4390814304351807, "rewards/rejected": -0.7437376976013184, "step": 1290 }, { "epoch": 3.38, "grad_norm": 5.34375, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.7161693572998047, "logits/rejected": -1.719686508178711, "logps/chosen": -31.36373519897461, "logps/rejected": -34.30319595336914, "loss": 0.2648, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.741172194480896, "rewards/margins": 1.4767869710922241, "rewards/rejected": -0.7356146574020386, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.067815065383911, "eval_logits/rejected": -2.0631678104400635, "eval_logps/chosen": -34.68254089355469, "eval_logps/rejected": -38.4415397644043, "eval_loss": 0.6774489283561707, "eval_rewards/accuracies": 0.5834717750549316, "eval_rewards/chosen": -0.2591961622238159, "eval_rewards/margins": 0.11077102273702621, "eval_rewards/rejected": -0.3699672222137451, "eval_runtime": 145.3523, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.296, "step": 1300 }, { "epoch": 3.4, "grad_norm": 3.765625, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.8252456188201904, "logits/rejected": -1.8048315048217773, "logps/chosen": -29.41412925720215, "logps/rejected": -37.30854415893555, "loss": 0.2709, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6797068119049072, "rewards/margins": 1.5365630388259888, "rewards/rejected": -0.8568561673164368, "step": 1310 }, { "epoch": 3.43, "grad_norm": 4.90625, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.799944519996643, "logits/rejected": -1.8057048320770264, "logps/chosen": -31.14408302307129, "logps/rejected": -34.24466323852539, "loss": 0.2632, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7558823823928833, "rewards/margins": 1.4574472904205322, "rewards/rejected": -0.7015649080276489, "step": 1320 }, { "epoch": 3.45, "grad_norm": 4.59375, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.7793442010879517, "logits/rejected": -1.7743394374847412, "logps/chosen": -31.542285919189453, "logps/rejected": -36.73578643798828, "loss": 0.2666, "rewards/accuracies": 0.9375, "rewards/chosen": 0.5920882821083069, "rewards/margins": 1.4574869871139526, "rewards/rejected": -0.8653987646102905, "step": 1330 }, { "epoch": 3.48, "grad_norm": 4.3125, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.6390937566757202, "logits/rejected": -1.648639440536499, "logps/chosen": -31.974777221679688, "logps/rejected": -32.479408264160156, "loss": 0.3034, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7299244999885559, "rewards/margins": 1.3080428838729858, "rewards/rejected": -0.5781184434890747, "step": 1340 }, { "epoch": 3.51, "grad_norm": 5.25, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.6959750652313232, "logits/rejected": -1.7006906270980835, "logps/chosen": -32.08546447753906, "logps/rejected": -34.80610656738281, "loss": 0.2624, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.7461883425712585, "rewards/margins": 1.3867065906524658, "rewards/rejected": -0.6405184268951416, "step": 1350 }, { "epoch": 3.53, "grad_norm": 11.0625, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.7251228094100952, "logits/rejected": -1.7304246425628662, "logps/chosen": -32.209083557128906, "logps/rejected": -36.53895950317383, "loss": 0.2908, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6490303874015808, "rewards/margins": 1.4393888711929321, "rewards/rejected": -0.790358304977417, "step": 1360 }, { "epoch": 3.56, "grad_norm": 3.953125, "learning_rate": 1.833161662683672e-07, "logits/chosen": -1.8194055557250977, "logits/rejected": -1.8149350881576538, "logps/chosen": -31.16534423828125, "logps/rejected": -32.77348709106445, "loss": 0.2875, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5881365537643433, "rewards/margins": 1.3327710628509521, "rewards/rejected": -0.7446345090866089, "step": 1370 }, { "epoch": 3.58, "grad_norm": 6.03125, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.7297048568725586, "logits/rejected": -1.7319259643554688, "logps/chosen": -28.14961814880371, "logps/rejected": -32.501651763916016, "loss": 0.299, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6038740277290344, "rewards/margins": 1.306862235069275, "rewards/rejected": -0.7029882669448853, "step": 1380 }, { "epoch": 3.61, "grad_norm": 6.15625, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.7326571941375732, "logits/rejected": -1.7373113632202148, "logps/chosen": -30.526447296142578, "logps/rejected": -34.334877014160156, "loss": 0.322, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5972187519073486, "rewards/margins": 1.1359431743621826, "rewards/rejected": -0.5387245416641235, "step": 1390 }, { "epoch": 3.64, "grad_norm": 4.5, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -1.7946516275405884, "logits/rejected": -1.7978696823120117, "logps/chosen": -33.16937255859375, "logps/rejected": -35.18634033203125, "loss": 0.2661, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.7044798135757446, "rewards/margins": 1.409590244293213, "rewards/rejected": -0.7051103711128235, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.068145990371704, "eval_logits/rejected": -2.0635111331939697, "eval_logps/chosen": -34.682861328125, "eval_logps/rejected": -38.453575134277344, "eval_loss": 0.6737242937088013, "eval_rewards/accuracies": 0.5888704061508179, "eval_rewards/chosen": -0.25932323932647705, "eval_rewards/margins": 0.11545901745557785, "eval_rewards/rejected": -0.3747822642326355, "eval_runtime": 145.2165, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.296, "step": 1400 }, { "epoch": 3.66, "grad_norm": 5.5625, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.861598253250122, "logits/rejected": -1.8546861410140991, "logps/chosen": -32.03349685668945, "logps/rejected": -35.89234924316406, "loss": 0.2836, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7076212763786316, "rewards/margins": 1.3667352199554443, "rewards/rejected": -0.6591139435768127, "step": 1410 }, { "epoch": 3.69, "grad_norm": 7.25, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.754303216934204, "logits/rejected": -1.7654063701629639, "logps/chosen": -32.40322494506836, "logps/rejected": -35.30275344848633, "loss": 0.2752, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.6984306573867798, "rewards/margins": 1.383098840713501, "rewards/rejected": -0.6846679449081421, "step": 1420 }, { "epoch": 3.71, "grad_norm": 6.09375, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.8308874368667603, "logits/rejected": -1.8427495956420898, "logps/chosen": -33.228111267089844, "logps/rejected": -35.49666213989258, "loss": 0.2652, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.7330778241157532, "rewards/margins": 1.476409912109375, "rewards/rejected": -0.743332028388977, "step": 1430 }, { "epoch": 3.74, "grad_norm": 6.875, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.8214858770370483, "logits/rejected": -1.823809266090393, "logps/chosen": -31.59054183959961, "logps/rejected": -35.78374481201172, "loss": 0.2894, "rewards/accuracies": 0.9375, "rewards/chosen": 0.6471369862556458, "rewards/margins": 1.4166474342346191, "rewards/rejected": -0.7695104479789734, "step": 1440 }, { "epoch": 3.77, "grad_norm": 4.71875, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.8054840564727783, "logits/rejected": -1.798434853553772, "logps/chosen": -33.86231231689453, "logps/rejected": -35.45820617675781, "loss": 0.2973, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.682405948638916, "rewards/margins": 1.3575280904769897, "rewards/rejected": -0.6751221418380737, "step": 1450 }, { "epoch": 3.79, "grad_norm": 5.15625, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.772991418838501, "logits/rejected": -1.7789325714111328, "logps/chosen": -29.6343994140625, "logps/rejected": -35.68324661254883, "loss": 0.2694, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.6315112709999084, "rewards/margins": 1.4368103742599487, "rewards/rejected": -0.8052991032600403, "step": 1460 }, { "epoch": 3.82, "grad_norm": 4.875, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.7217044830322266, "logits/rejected": -1.719930648803711, "logps/chosen": -29.1605281829834, "logps/rejected": -32.21342086791992, "loss": 0.2994, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6740103363990784, "rewards/margins": 1.3216867446899414, "rewards/rejected": -0.6476765275001526, "step": 1470 }, { "epoch": 3.84, "grad_norm": 5.96875, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.896645188331604, "logits/rejected": -1.8910773992538452, "logps/chosen": -33.91704177856445, "logps/rejected": -33.85138702392578, "loss": 0.3006, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.5472885370254517, "rewards/margins": 1.2651593685150146, "rewards/rejected": -0.7178710103034973, "step": 1480 }, { "epoch": 3.87, "grad_norm": 5.78125, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.790889024734497, "logits/rejected": -1.793336272239685, "logps/chosen": -27.0684757232666, "logps/rejected": -29.4942684173584, "loss": 0.338, "rewards/accuracies": 0.875, "rewards/chosen": 0.5631623864173889, "rewards/margins": 1.1519577503204346, "rewards/rejected": -0.5887953639030457, "step": 1490 }, { "epoch": 3.9, "grad_norm": 6.21875, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.796856164932251, "logits/rejected": -1.7912604808807373, "logps/chosen": -31.488414764404297, "logps/rejected": -33.81616973876953, "loss": 0.3194, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.5249518156051636, "rewards/margins": 1.2638393640518188, "rewards/rejected": -0.7388876080513, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.067756175994873, "eval_logits/rejected": -2.0631155967712402, "eval_logps/chosen": -34.68146514892578, "eval_logps/rejected": -38.44291687011719, "eval_loss": 0.676816999912262, "eval_rewards/accuracies": 0.5801494717597961, "eval_rewards/chosen": -0.25876519083976746, "eval_rewards/margins": 0.11175353080034256, "eval_rewards/rejected": -0.3705187141895294, "eval_runtime": 145.241, "eval_samples_per_second": 2.362, "eval_steps_per_second": 0.296, "step": 1500 }, { "epoch": 3.92, "grad_norm": 7.375, "learning_rate": 5.777746105209147e-09, "logits/chosen": -1.7225477695465088, "logits/rejected": -1.7265936136245728, "logps/chosen": -32.69451141357422, "logps/rejected": -35.99700164794922, "loss": 0.3195, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.5905929803848267, "rewards/margins": 1.2252399921417236, "rewards/rejected": -0.6346471309661865, "step": 1510 }, { "epoch": 3.95, "grad_norm": 3.671875, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.7768518924713135, "logits/rejected": -1.7753241062164307, "logps/chosen": -29.13608741760254, "logps/rejected": -34.37911605834961, "loss": 0.3089, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.6392368078231812, "rewards/margins": 1.3549953699111938, "rewards/rejected": -0.7157586216926575, "step": 1520 }, { "epoch": 3.97, "grad_norm": 7.34375, "learning_rate": 6.421917227455999e-10, "logits/chosen": -1.8761787414550781, "logits/rejected": -1.8682851791381836, "logps/chosen": -26.693634033203125, "logps/rejected": -30.43243408203125, "loss": 0.3054, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.547723114490509, "rewards/margins": 1.2520538568496704, "rewards/rejected": -0.7043307423591614, "step": 1530 }, { "epoch": 4.0, "grad_norm": 5.3125, "learning_rate": 0.0, "logits/chosen": -1.7809568643569946, "logits/rejected": -1.7705955505371094, "logps/chosen": -31.539371490478516, "logps/rejected": -37.31859588623047, "loss": 0.2762, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.6107311248779297, "rewards/margins": 1.3737008571624756, "rewards/rejected": -0.7629695534706116, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.2942203808140445, "train_runtime": 10768.7061, "train_samples_per_second": 1.144, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }