{ "best_metric": 1.3996269702911377, "best_model_checkpoint": "saves/Gemma-2B-It/lora/orpo/checkpoint-1500", "epoch": 2.997999555456768, "eval_steps": 500, "global_step": 1686, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.017781729273171815, "grad_norm": 1.9148550033569336, "learning_rate": 4.9995745934141085e-06, "logits/chosen": -22.071788787841797, "logits/rejected": -21.994897842407227, "logps/chosen": -1.9321304559707642, "logps/rejected": -2.141274929046631, "loss": 2.0148, "odds_ratio_loss": 0.8263328671455383, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -0.19321303069591522, "rewards/margins": 0.020914455875754356, "rewards/rejected": -0.21412746608257294, "sft_loss": 1.9321304559707642, "step": 10 }, { "epoch": 0.03556345854634363, "grad_norm": 4.772641181945801, "learning_rate": 4.9982812903243405e-06, "logits/chosen": -21.850475311279297, "logits/rejected": -22.140661239624023, "logps/chosen": -2.000199556350708, "logps/rejected": -1.9620949029922485, "loss": 2.0912, "odds_ratio_loss": 0.9096724390983582, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.2000199258327484, "rewards/margins": -0.003810454858466983, "rewards/rejected": -0.19620949029922485, "sft_loss": 2.000199556350708, "step": 20 }, { "epoch": 0.05334518781951545, "grad_norm": 1.9645005464553833, "learning_rate": 4.996120496405222e-06, "logits/chosen": -22.181926727294922, "logits/rejected": -22.236988067626953, "logps/chosen": -1.9057893753051758, "logps/rejected": -2.2623982429504395, "loss": 1.9768, "odds_ratio_loss": 0.7102858424186707, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.19057895243167877, "rewards/margins": 0.03566090017557144, "rewards/rejected": -0.22623984515666962, "sft_loss": 1.9057893753051758, "step": 30 }, { "epoch": 0.07112691709268726, "grad_norm": 1.9976199865341187, "learning_rate": 4.99309296196014e-06, "logits/chosen": -22.178194046020508, "logits/rejected": -22.227825164794922, "logps/chosen": -1.8588358163833618, "logps/rejected": -2.0477230548858643, "loss": 1.94, "odds_ratio_loss": 0.8119063377380371, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.18588361144065857, "rewards/margins": 0.01888870820403099, "rewards/rejected": -0.20477227866649628, "sft_loss": 1.8588358163833618, "step": 40 }, { "epoch": 0.08890864636585907, "grad_norm": 1.6904418468475342, "learning_rate": 4.989199738255166e-06, "logits/chosen": -22.063446044921875, "logits/rejected": -22.088878631591797, "logps/chosen": -1.8785845041275024, "logps/rejected": -2.0510427951812744, "loss": 1.9601, "odds_ratio_loss": 0.81475830078125, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.187858447432518, "rewards/margins": 0.0172458253800869, "rewards/rejected": -0.2051042765378952, "sft_loss": 1.8785845041275024, "step": 50 }, { "epoch": 0.1066903756390309, "grad_norm": 1.597947359085083, "learning_rate": 4.984442177154031e-06, "logits/chosen": -22.34821319580078, "logits/rejected": -22.315746307373047, "logps/chosen": -1.9788051843643188, "logps/rejected": -2.084188461303711, "loss": 2.0713, "odds_ratio_loss": 0.924887478351593, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.19788053631782532, "rewards/margins": 0.01053832471370697, "rewards/rejected": -0.20841887593269348, "sft_loss": 1.9788051843643188, "step": 60 }, { "epoch": 0.12447210491220272, "grad_norm": 2.274142026901245, "learning_rate": 4.978821930648704e-06, "logits/chosen": -22.288013458251953, "logits/rejected": -22.197546005249023, "logps/chosen": -1.9143011569976807, "logps/rejected": -1.8838450908660889, "loss": 2.0168, "odds_ratio_loss": 1.0252134799957275, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.19143010675907135, "rewards/margins": -0.0030455999076366425, "rewards/rejected": -0.1883845031261444, "sft_loss": 1.9143011569976807, "step": 70 }, { "epoch": 0.14225383418537452, "grad_norm": 2.63519549369812, "learning_rate": 4.97234095028576e-06, "logits/chosen": -22.663928985595703, "logits/rejected": -22.51036834716797, "logps/chosen": -1.8908298015594482, "logps/rejected": -1.9426231384277344, "loss": 1.9749, "odds_ratio_loss": 0.8411667943000793, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1890830099582672, "rewards/margins": 0.005179307423532009, "rewards/rejected": -0.1942623108625412, "sft_loss": 1.8908298015594482, "step": 80 }, { "epoch": 0.16003556345854633, "grad_norm": 1.5385671854019165, "learning_rate": 4.965001486488743e-06, "logits/chosen": -22.35540199279785, "logits/rejected": -22.453685760498047, "logps/chosen": -1.6930122375488281, "logps/rejected": -1.8899803161621094, "loss": 1.7643, "odds_ratio_loss": 0.7125129699707031, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.16930122673511505, "rewards/margins": 0.019696824252605438, "rewards/rejected": -0.1889980286359787, "sft_loss": 1.6930122375488281, "step": 90 }, { "epoch": 0.17781729273171815, "grad_norm": 1.6486105918884277, "learning_rate": 4.956806087776732e-06, "logits/chosen": -22.912220001220703, "logits/rejected": -22.764265060424805, "logps/chosen": -1.7519505023956299, "logps/rejected": -2.004110813140869, "loss": 1.8222, "odds_ratio_loss": 0.7028593420982361, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.17519506812095642, "rewards/margins": 0.025216031819581985, "rewards/rejected": -0.2004111111164093, "sft_loss": 1.7519505023956299, "step": 100 }, { "epoch": 0.19559902200489, "grad_norm": 2.1504974365234375, "learning_rate": 4.947757599879411e-06, "logits/chosen": -22.865467071533203, "logits/rejected": -23.005489349365234, "logps/chosen": -1.727837324142456, "logps/rejected": -1.8906141519546509, "loss": 1.8026, "odds_ratio_loss": 0.7475000023841858, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.1727837324142456, "rewards/margins": 0.016277695074677467, "rewards/rejected": -0.18906141817569733, "sft_loss": 1.727837324142456, "step": 110 }, { "epoch": 0.2133807512780618, "grad_norm": 3.6934735774993896, "learning_rate": 4.937859164748931e-06, "logits/chosen": -22.859783172607422, "logits/rejected": -23.031169891357422, "logps/chosen": -1.5483535528182983, "logps/rejected": -1.6135647296905518, "loss": 1.624, "odds_ratio_loss": 0.7560455203056335, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15483535826206207, "rewards/margins": 0.006521114148199558, "rewards/rejected": -0.16135647892951965, "sft_loss": 1.5483535528182983, "step": 120 }, { "epoch": 0.23116248055123362, "grad_norm": 3.72802734375, "learning_rate": 4.92711421946891e-06, "logits/chosen": -23.100276947021484, "logits/rejected": -22.69415283203125, "logps/chosen": -1.5568244457244873, "logps/rejected": -1.8098100423812866, "loss": 1.6294, "odds_ratio_loss": 0.7258477210998535, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15568244457244873, "rewards/margins": 0.025298580527305603, "rewards/rejected": -0.18098104000091553, "sft_loss": 1.5568244457244873, "step": 130 }, { "epoch": 0.24894420982440543, "grad_norm": 5.469463348388672, "learning_rate": 4.915526495060961e-06, "logits/chosen": -23.371618270874023, "logits/rejected": -23.214031219482422, "logps/chosen": -1.4800597429275513, "logps/rejected": -1.746651291847229, "loss": 1.5507, "odds_ratio_loss": 0.7065833806991577, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.14800596237182617, "rewards/margins": 0.026659172028303146, "rewards/rejected": -0.17466513812541962, "sft_loss": 1.4800597429275513, "step": 140 }, { "epoch": 0.26672593909757725, "grad_norm": 4.669017314910889, "learning_rate": 4.903100015189153e-06, "logits/chosen": -22.959320068359375, "logits/rejected": -23.156007766723633, "logps/chosen": -1.5119131803512573, "logps/rejected": -1.708356261253357, "loss": 1.5853, "odds_ratio_loss": 0.7340201735496521, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1511913239955902, "rewards/margins": 0.01964429020881653, "rewards/rejected": -0.17083561420440674, "sft_loss": 1.5119131803512573, "step": 150 }, { "epoch": 0.28450766837074903, "grad_norm": 1.5560555458068848, "learning_rate": 4.889839094762848e-06, "logits/chosen": -22.915985107421875, "logits/rejected": -22.794408798217773, "logps/chosen": -1.6158710718154907, "logps/rejected": -1.7288596630096436, "loss": 1.6909, "odds_ratio_loss": 0.7498828768730164, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.16158713400363922, "rewards/margins": 0.011298839002847672, "rewards/rejected": -0.1728859841823578, "sft_loss": 1.6158710718154907, "step": 160 }, { "epoch": 0.3022893976439209, "grad_norm": 1.565077304840088, "learning_rate": 4.875748338438416e-06, "logits/chosen": -23.140369415283203, "logits/rejected": -23.20174789428711, "logps/chosen": -1.532865285873413, "logps/rejected": -1.6764837503433228, "loss": 1.6069, "odds_ratio_loss": 0.7403478026390076, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.15328654646873474, "rewards/margins": 0.014361831359565258, "rewards/rejected": -0.16764836013317108, "sft_loss": 1.532865285873413, "step": 170 }, { "epoch": 0.32007112691709266, "grad_norm": 3.263695240020752, "learning_rate": 4.8608326390203386e-06, "logits/chosen": -22.981613159179688, "logits/rejected": -22.818286895751953, "logps/chosen": -1.485670804977417, "logps/rejected": -1.6812422275543213, "loss": 1.5542, "odds_ratio_loss": 0.6854217052459717, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1485670804977417, "rewards/margins": 0.019557146355509758, "rewards/rejected": -0.1681242287158966, "sft_loss": 1.485670804977417, "step": 180 }, { "epoch": 0.3378528561902645, "grad_norm": 2.1444835662841797, "learning_rate": 4.845097175762251e-06, "logits/chosen": -23.199800491333008, "logits/rejected": -23.2564640045166, "logps/chosen": -1.4873155355453491, "logps/rejected": -1.5498250722885132, "loss": 1.5614, "odds_ratio_loss": 0.7410100698471069, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1487315446138382, "rewards/margins": 0.006250949110835791, "rewards/rejected": -0.15498249232769012, "sft_loss": 1.4873155355453491, "step": 190 }, { "epoch": 0.3556345854634363, "grad_norm": 5.516879558563232, "learning_rate": 4.8285474125685286e-06, "logits/chosen": -23.00992774963379, "logits/rejected": -22.893043518066406, "logps/chosen": -1.520996332168579, "logps/rejected": -1.6076465845108032, "loss": 1.5966, "odds_ratio_loss": 0.7559183239936829, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1520996391773224, "rewards/margins": 0.008664996363222599, "rewards/rejected": -0.1607646495103836, "sft_loss": 1.520996332168579, "step": 200 }, { "epoch": 0.37341631473660813, "grad_norm": 1.6477737426757812, "learning_rate": 4.811189096097025e-06, "logits/chosen": -22.601619720458984, "logits/rejected": -22.704158782958984, "logps/chosen": -1.5167438983917236, "logps/rejected": -1.700338363647461, "loss": 1.5896, "odds_ratio_loss": 0.7285597920417786, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.15167437493801117, "rewards/margins": 0.018359454348683357, "rewards/rejected": -0.17003384232521057, "sft_loss": 1.5167438983917236, "step": 210 }, { "epoch": 0.39119804400978, "grad_norm": 2.6526737213134766, "learning_rate": 4.793028253763633e-06, "logits/chosen": -22.879850387573242, "logits/rejected": -22.78567123413086, "logps/chosen": -1.4604800939559937, "logps/rejected": -1.6260970830917358, "loss": 1.537, "odds_ratio_loss": 0.7654477953910828, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14604800939559937, "rewards/margins": 0.016561714932322502, "rewards/rejected": -0.16260972619056702, "sft_loss": 1.4604800939559937, "step": 220 }, { "epoch": 0.40897977328295176, "grad_norm": 2.860865354537964, "learning_rate": 4.774071191649352e-06, "logits/chosen": -22.46622657775879, "logits/rejected": -22.480607986450195, "logps/chosen": -1.376908779144287, "logps/rejected": -1.6316293478012085, "loss": 1.4435, "odds_ratio_loss": 0.6654683351516724, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13769087195396423, "rewards/margins": 0.025472048670053482, "rewards/rejected": -0.16316291689872742, "sft_loss": 1.376908779144287, "step": 230 }, { "epoch": 0.4267615025561236, "grad_norm": 2.685337781906128, "learning_rate": 4.7543244923105975e-06, "logits/chosen": -22.682777404785156, "logits/rejected": -22.806440353393555, "logps/chosen": -1.5592434406280518, "logps/rejected": -1.5708439350128174, "loss": 1.6395, "odds_ratio_loss": 0.8026041984558105, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.15592436492443085, "rewards/margins": 0.0011600303696468472, "rewards/rejected": -0.1570843905210495, "sft_loss": 1.5592434406280518, "step": 240 }, { "epoch": 0.4445432318292954, "grad_norm": 1.3707021474838257, "learning_rate": 4.733795012493506e-06, "logits/chosen": -22.8146915435791, "logits/rejected": -22.913793563842773, "logps/chosen": -1.5595623254776, "logps/rejected": -1.6163349151611328, "loss": 1.6357, "odds_ratio_loss": 0.7609250545501709, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1559562385082245, "rewards/margins": 0.005677259061485529, "rewards/rejected": -0.1616334766149521, "sft_loss": 1.5595623254776, "step": 250 }, { "epoch": 0.46232496110246724, "grad_norm": 1.3753399848937988, "learning_rate": 4.712489880753035e-06, "logits/chosen": -22.511287689208984, "logits/rejected": -22.446317672729492, "logps/chosen": -1.3289070129394531, "logps/rejected": -1.4945290088653564, "loss": 1.3952, "odds_ratio_loss": 0.6626302003860474, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.1328907012939453, "rewards/margins": 0.016562188044190407, "rewards/rejected": -0.14945289492607117, "sft_loss": 1.3289070129394531, "step": 260 }, { "epoch": 0.480106690375639, "grad_norm": 5.678652286529541, "learning_rate": 4.690416494977673e-06, "logits/chosen": -22.829517364501953, "logits/rejected": -22.87631607055664, "logps/chosen": -1.4606059789657593, "logps/rejected": -1.6754430532455444, "loss": 1.5279, "odds_ratio_loss": 0.6730555295944214, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14606061577796936, "rewards/margins": 0.02148369327187538, "rewards/rejected": -0.16754429042339325, "sft_loss": 1.4606059789657593, "step": 270 }, { "epoch": 0.49788841964881086, "grad_norm": 1.2074130773544312, "learning_rate": 4.667582519820639e-06, "logits/chosen": -22.504804611206055, "logits/rejected": -22.659706115722656, "logps/chosen": -1.479263424873352, "logps/rejected": -1.5646381378173828, "loss": 1.5534, "odds_ratio_loss": 0.7413693070411682, "rewards/accuracies": 0.5625, "rewards/chosen": -0.14792636036872864, "rewards/margins": 0.008537469431757927, "rewards/rejected": -0.15646381676197052, "sft_loss": 1.479263424873352, "step": 280 }, { "epoch": 0.5156701489219827, "grad_norm": 2.7386221885681152, "learning_rate": 4.643995884038443e-06, "logits/chosen": -22.59560775756836, "logits/rejected": -22.57559585571289, "logps/chosen": -1.3870899677276611, "logps/rejected": -1.5722427368164062, "loss": 1.4562, "odds_ratio_loss": 0.6912243962287903, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13870900869369507, "rewards/margins": 0.01851527765393257, "rewards/rejected": -0.15722428262233734, "sft_loss": 1.3870899677276611, "step": 290 }, { "epoch": 0.5334518781951545, "grad_norm": 1.5327345132827759, "learning_rate": 4.6196647777377475e-06, "logits/chosen": -22.43231201171875, "logits/rejected": -22.38507652282715, "logps/chosen": -1.4321014881134033, "logps/rejected": -1.4831379652023315, "loss": 1.5072, "odds_ratio_loss": 0.7509574294090271, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.14321014285087585, "rewards/margins": 0.005103647243231535, "rewards/rejected": -0.14831380546092987, "sft_loss": 1.4321014881134033, "step": 300 }, { "epoch": 0.5512336074683263, "grad_norm": 1.276062250137329, "learning_rate": 4.59459764953147e-06, "logits/chosen": -22.478355407714844, "logits/rejected": -22.29865264892578, "logps/chosen": -1.445011854171753, "logps/rejected": -1.550065279006958, "loss": 1.5147, "odds_ratio_loss": 0.6971566081047058, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.14450117945671082, "rewards/margins": 0.010505353100597858, "rewards/rejected": -0.1550065577030182, "sft_loss": 1.445011854171753, "step": 310 }, { "epoch": 0.5690153367414981, "grad_norm": 2.80613112449646, "learning_rate": 4.568803203605133e-06, "logits/chosen": -22.582855224609375, "logits/rejected": -22.391347885131836, "logps/chosen": -1.3941065073013306, "logps/rejected": -1.580993413925171, "loss": 1.4659, "odds_ratio_loss": 0.7180419564247131, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13941065967082977, "rewards/margins": 0.018688684329390526, "rewards/rejected": -0.15809933841228485, "sft_loss": 1.3941065073013306, "step": 320 }, { "epoch": 0.58679706601467, "grad_norm": 5.356297492980957, "learning_rate": 4.542290396694462e-06, "logits/chosen": -22.250286102294922, "logits/rejected": -22.175914764404297, "logps/chosen": -1.4387528896331787, "logps/rejected": -1.5810470581054688, "loss": 1.5106, "odds_ratio_loss": 0.7184728980064392, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14387528598308563, "rewards/margins": 0.014229406602680683, "rewards/rejected": -0.15810470283031464, "sft_loss": 1.4387528896331787, "step": 330 }, { "epoch": 0.6045787952878418, "grad_norm": 8.996047019958496, "learning_rate": 4.515068434975298e-06, "logits/chosen": -22.00839614868164, "logits/rejected": -22.072261810302734, "logps/chosen": -1.4673653841018677, "logps/rejected": -1.6608636379241943, "loss": 1.5376, "odds_ratio_loss": 0.7021427154541016, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14673654735088348, "rewards/margins": 0.019349832087755203, "rewards/rejected": -0.16608639061450958, "sft_loss": 1.4673653841018677, "step": 340 }, { "epoch": 0.6223605245610135, "grad_norm": 1.460726261138916, "learning_rate": 4.487146770866887e-06, "logits/chosen": -22.291297912597656, "logits/rejected": -22.382854461669922, "logps/chosen": -1.406706690788269, "logps/rejected": -1.4625658988952637, "loss": 1.4815, "odds_ratio_loss": 0.747775673866272, "rewards/accuracies": 0.4375, "rewards/chosen": -0.14067067205905914, "rewards/margins": 0.005585917271673679, "rewards/rejected": -0.14625659584999084, "sft_loss": 1.406706690788269, "step": 350 }, { "epoch": 0.6401422538341853, "grad_norm": 1.722812533378601, "learning_rate": 4.458535099749666e-06, "logits/chosen": -22.276639938354492, "logits/rejected": -22.166675567626953, "logps/chosen": -1.5117685794830322, "logps/rejected": -1.5999605655670166, "loss": 1.5911, "odds_ratio_loss": 0.7935177087783813, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.15117685496807098, "rewards/margins": 0.008819197304546833, "rewards/rejected": -0.15999604761600494, "sft_loss": 1.5117685794830322, "step": 360 }, { "epoch": 0.6579239831073572, "grad_norm": 2.568336248397827, "learning_rate": 4.429243356598694e-06, "logits/chosen": -21.958419799804688, "logits/rejected": -21.927824020385742, "logps/chosen": -1.4804319143295288, "logps/rejected": -1.6579450368881226, "loss": 1.5496, "odds_ratio_loss": 0.6912356615066528, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1480431854724884, "rewards/margins": 0.017751310020685196, "rewards/rejected": -0.1657945215702057, "sft_loss": 1.4804319143295288, "step": 370 }, { "epoch": 0.675705712380529, "grad_norm": 1.4206441640853882, "learning_rate": 4.399281712533875e-06, "logits/chosen": -22.067081451416016, "logits/rejected": -22.091421127319336, "logps/chosen": -1.4124424457550049, "logps/rejected": -1.4996305704116821, "loss": 1.4873, "odds_ratio_loss": 0.7487770318984985, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.14124426245689392, "rewards/margins": 0.008718819357454777, "rewards/rejected": -0.14996306598186493, "sft_loss": 1.4124424457550049, "step": 380 }, { "epoch": 0.6934874416537008, "grad_norm": 2.3953347206115723, "learning_rate": 4.368660571288192e-06, "logits/chosen": -22.193960189819336, "logits/rejected": -22.225393295288086, "logps/chosen": -1.394415020942688, "logps/rejected": -1.500723123550415, "loss": 1.4702, "odds_ratio_loss": 0.7577108144760132, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13944150507450104, "rewards/margins": 0.010630805976688862, "rewards/rejected": -0.15007230639457703, "sft_loss": 1.394415020942688, "step": 390 }, { "epoch": 0.7112691709268726, "grad_norm": 1.4220997095108032, "learning_rate": 4.337390565595163e-06, "logits/chosen": -21.68547248840332, "logits/rejected": -21.761310577392578, "logps/chosen": -1.464005708694458, "logps/rejected": -1.5315691232681274, "loss": 1.5392, "odds_ratio_loss": 0.75159752368927, "rewards/accuracies": 0.46875, "rewards/chosen": -0.1464005708694458, "rewards/margins": 0.006756337825208902, "rewards/rejected": -0.15315690636634827, "sft_loss": 1.464005708694458, "step": 400 }, { "epoch": 0.7290509002000445, "grad_norm": 1.8401212692260742, "learning_rate": 4.305482553496786e-06, "logits/chosen": -21.259353637695312, "logits/rejected": -21.159082412719727, "logps/chosen": -1.3970698118209839, "logps/rejected": -1.5361021757125854, "loss": 1.4689, "odds_ratio_loss": 0.7183545827865601, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13970699906349182, "rewards/margins": 0.01390322856605053, "rewards/rejected": -0.1536101996898651, "sft_loss": 1.3970698118209839, "step": 410 }, { "epoch": 0.7468326294732163, "grad_norm": 2.2964348793029785, "learning_rate": 4.272947614573244e-06, "logits/chosen": -21.679210662841797, "logits/rejected": -21.884136199951172, "logps/chosen": -1.447422981262207, "logps/rejected": -1.5282857418060303, "loss": 1.5205, "odds_ratio_loss": 0.730276346206665, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14474229514598846, "rewards/margins": 0.008086279034614563, "rewards/rejected": -0.15282857418060303, "sft_loss": 1.447422981262207, "step": 420 }, { "epoch": 0.7646143587463881, "grad_norm": 1.2190438508987427, "learning_rate": 4.23979704609569e-06, "logits/chosen": -21.96237564086914, "logits/rejected": -22.065784454345703, "logps/chosen": -1.398108959197998, "logps/rejected": -1.495884656906128, "loss": 1.4676, "odds_ratio_loss": 0.6946425437927246, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13981090486049652, "rewards/margins": 0.009777549654245377, "rewards/rejected": -0.1495884507894516, "sft_loss": 1.398108959197998, "step": 430 }, { "epoch": 0.78239608801956, "grad_norm": 2.3040215969085693, "learning_rate": 4.206042359103435e-06, "logits/chosen": -21.56629180908203, "logits/rejected": -21.716127395629883, "logps/chosen": -1.487396001815796, "logps/rejected": -1.617078185081482, "loss": 1.5602, "odds_ratio_loss": 0.7281696796417236, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.14873960614204407, "rewards/margins": 0.012968212366104126, "rewards/rejected": -0.1617078334093094, "sft_loss": 1.487396001815796, "step": 440 }, { "epoch": 0.8001778172927317, "grad_norm": 2.5727310180664062, "learning_rate": 4.17169527440691e-06, "logits/chosen": -21.884145736694336, "logits/rejected": -21.738811492919922, "logps/chosen": -1.4501018524169922, "logps/rejected": -1.4668500423431396, "loss": 1.5286, "odds_ratio_loss": 0.7853611707687378, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.14501020312309265, "rewards/margins": 0.0016747992485761642, "rewards/rejected": -0.14668500423431396, "sft_loss": 1.4501018524169922, "step": 450 }, { "epoch": 0.8179595465659035, "grad_norm": 2.54972243309021, "learning_rate": 4.136767718517797e-06, "logits/chosen": -21.746496200561523, "logits/rejected": -21.7362060546875, "logps/chosen": -1.3023537397384644, "logps/rejected": -1.5028297901153564, "loss": 1.368, "odds_ratio_loss": 0.6567283868789673, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13023535907268524, "rewards/margins": 0.020047612488269806, "rewards/rejected": -0.15028299391269684, "sft_loss": 1.3023537397384644, "step": 460 }, { "epoch": 0.8357412758390753, "grad_norm": 6.595831871032715, "learning_rate": 4.1012718195077196e-06, "logits/chosen": -21.96458625793457, "logits/rejected": -22.172712326049805, "logps/chosen": -1.4211918115615845, "logps/rejected": -1.4663982391357422, "loss": 1.4961, "odds_ratio_loss": 0.7494389414787292, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14211918413639069, "rewards/margins": 0.004520639777183533, "rewards/rejected": -0.14663982391357422, "sft_loss": 1.4211918115615845, "step": 470 }, { "epoch": 0.8535230051122472, "grad_norm": 3.3689377307891846, "learning_rate": 4.065219902796953e-06, "logits/chosen": -21.535301208496094, "logits/rejected": -21.487293243408203, "logps/chosen": -1.3686919212341309, "logps/rejected": -1.5178884267807007, "loss": 1.4414, "odds_ratio_loss": 0.7275662422180176, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13686920702457428, "rewards/margins": 0.014919650740921497, "rewards/rejected": -0.1517888456583023, "sft_loss": 1.3686919212341309, "step": 480 }, { "epoch": 0.871304734385419, "grad_norm": 1.1600650548934937, "learning_rate": 4.028624486874608e-06, "logits/chosen": -21.398052215576172, "logits/rejected": -21.58942222595215, "logps/chosen": -1.3973881006240845, "logps/rejected": -1.5505540370941162, "loss": 1.4689, "odds_ratio_loss": 0.7148123383522034, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1397387981414795, "rewards/margins": 0.015316602773964405, "rewards/rejected": -0.15505541861057281, "sft_loss": 1.3973881006240845, "step": 490 }, { "epoch": 0.8890864636585908, "grad_norm": 1.584820032119751, "learning_rate": 3.99149827895177e-06, "logits/chosen": -21.60881996154785, "logits/rejected": -21.396835327148438, "logps/chosen": -1.4577990770339966, "logps/rejected": -1.5503555536270142, "loss": 1.5296, "odds_ratio_loss": 0.7181479334831238, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14577992260456085, "rewards/margins": 0.009255652315914631, "rewards/rejected": -0.15503555536270142, "sft_loss": 1.4577990770339966, "step": 500 }, { "epoch": 0.8890864636585908, "eval_logits/chosen": -21.487462997436523, "eval_logits/rejected": -21.574512481689453, "eval_logps/chosen": -1.3780959844589233, "eval_logps/rejected": -1.5480619668960571, "eval_loss": 1.4461547136306763, "eval_odds_ratio_loss": 0.6805880665779114, "eval_rewards/accuracies": 0.546999990940094, "eval_rewards/chosen": -0.13780958950519562, "eval_rewards/margins": 0.016996610909700394, "eval_rewards/rejected": -0.1548061966896057, "eval_runtime": 80.0397, "eval_samples_per_second": 12.494, "eval_sft_loss": 1.3780959844589233, "eval_steps_per_second": 6.247, "step": 500 }, { "epoch": 0.9068681929317626, "grad_norm": 2.9641082286834717, "learning_rate": 3.953854170549114e-06, "logits/chosen": -21.522262573242188, "logits/rejected": -21.48137092590332, "logps/chosen": -1.3978930711746216, "logps/rejected": -1.4638049602508545, "loss": 1.4702, "odds_ratio_loss": 0.722897469997406, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13978929817676544, "rewards/margins": 0.006591203156858683, "rewards/rejected": -0.1463804990053177, "sft_loss": 1.3978930711746216, "step": 510 }, { "epoch": 0.9246499222049345, "grad_norm": 1.988604187965393, "learning_rate": 3.91570523302051e-06, "logits/chosen": -21.512929916381836, "logits/rejected": -21.350711822509766, "logps/chosen": -1.4139622449874878, "logps/rejected": -1.502074122428894, "loss": 1.4897, "odds_ratio_loss": 0.7573299407958984, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14139625430107117, "rewards/margins": 0.008811171166598797, "rewards/rejected": -0.15020740032196045, "sft_loss": 1.4139622449874878, "step": 520 }, { "epoch": 0.9424316514781063, "grad_norm": 1.4567950963974, "learning_rate": 3.8770647130141996e-06, "logits/chosen": -21.612693786621094, "logits/rejected": -21.457687377929688, "logps/chosen": -1.3569138050079346, "logps/rejected": -1.5465893745422363, "loss": 1.4271, "odds_ratio_loss": 0.7022345662117004, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13569141924381256, "rewards/margins": 0.01896754465997219, "rewards/rejected": -0.1546589434146881, "sft_loss": 1.3569138050079346, "step": 530 }, { "epoch": 0.960213380751278, "grad_norm": 1.3677376508712769, "learning_rate": 3.837946027873086e-06, "logits/chosen": -21.51246452331543, "logits/rejected": -21.367631912231445, "logps/chosen": -1.4506080150604248, "logps/rejected": -1.578880786895752, "loss": 1.5258, "odds_ratio_loss": 0.7515760660171509, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.14506080746650696, "rewards/margins": 0.012827281840145588, "rewards/rejected": -0.15788806974887848, "sft_loss": 1.4506080150604248, "step": 540 }, { "epoch": 0.9779951100244498, "grad_norm": 2.220174789428711, "learning_rate": 3.7983627609757713e-06, "logits/chosen": -21.598114013671875, "logits/rejected": -21.58673095703125, "logps/chosen": -1.4242851734161377, "logps/rejected": -1.521756887435913, "loss": 1.495, "odds_ratio_loss": 0.7070504426956177, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.14242851734161377, "rewards/margins": 0.009747164323925972, "rewards/rejected": -0.1521756947040558, "sft_loss": 1.4242851734161377, "step": 550 }, { "epoch": 0.9957768392976217, "grad_norm": 4.630890369415283, "learning_rate": 3.758328657019924e-06, "logits/chosen": -21.449283599853516, "logits/rejected": -21.295719146728516, "logps/chosen": -1.3235647678375244, "logps/rejected": -1.4640743732452393, "loss": 1.394, "odds_ratio_loss": 0.7047213315963745, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.1323564499616623, "rewards/margins": 0.014050972647964954, "rewards/rejected": -0.14640744030475616, "sft_loss": 1.3235647678375244, "step": 560 }, { "epoch": 1.0135585685707935, "grad_norm": 4.50676155090332, "learning_rate": 3.717857617249642e-06, "logits/chosen": -21.255306243896484, "logits/rejected": -21.378076553344727, "logps/chosen": -1.4302732944488525, "logps/rejected": -1.5925706624984741, "loss": 1.5037, "odds_ratio_loss": 0.7346171140670776, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14302733540534973, "rewards/margins": 0.016229737550020218, "rewards/rejected": -0.15925706923007965, "sft_loss": 1.4302732944488525, "step": 570 }, { "epoch": 1.0313402978439654, "grad_norm": 1.0999839305877686, "learning_rate": 3.6769636946284543e-06, "logits/chosen": -21.335865020751953, "logits/rejected": -21.204635620117188, "logps/chosen": -1.2982518672943115, "logps/rejected": -1.4139636754989624, "loss": 1.3699, "odds_ratio_loss": 0.7168216705322266, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.12982520461082458, "rewards/margins": 0.011571166105568409, "rewards/rejected": -0.14139637351036072, "sft_loss": 1.2982518672943115, "step": 580 }, { "epoch": 1.049122027117137, "grad_norm": 2.193345069885254, "learning_rate": 3.6356610889596355e-06, "logits/chosen": -21.441791534423828, "logits/rejected": -21.434829711914062, "logps/chosen": -1.371517539024353, "logps/rejected": -1.4539680480957031, "loss": 1.444, "odds_ratio_loss": 0.724626362323761, "rewards/accuracies": 0.46875, "rewards/chosen": -0.13715174794197083, "rewards/margins": 0.008245043456554413, "rewards/rejected": -0.14539679884910583, "sft_loss": 1.371517539024353, "step": 590 }, { "epoch": 1.066903756390309, "grad_norm": 1.4549708366394043, "learning_rate": 3.593964141955541e-06, "logits/chosen": -21.462820053100586, "logits/rejected": -21.234458923339844, "logps/chosen": -1.355276107788086, "logps/rejected": -1.4370988607406616, "loss": 1.4288, "odds_ratio_loss": 0.7349004149436951, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1355275958776474, "rewards/margins": 0.008182285353541374, "rewards/rejected": -0.14370988309383392, "sft_loss": 1.355276107788086, "step": 600 }, { "epoch": 1.0846854856634809, "grad_norm": 1.5671168565750122, "learning_rate": 3.5518873322576573e-06, "logits/chosen": -20.91110610961914, "logits/rejected": -21.30324935913086, "logps/chosen": -1.3413856029510498, "logps/rejected": -1.4344289302825928, "loss": 1.4148, "odds_ratio_loss": 0.7339103817939758, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1341385543346405, "rewards/margins": 0.009304327890276909, "rewards/rejected": -0.14344289898872375, "sft_loss": 1.3413856029510498, "step": 610 }, { "epoch": 1.1024672149366526, "grad_norm": 1.315266728401184, "learning_rate": 3.5094452704091143e-06, "logits/chosen": -21.287628173828125, "logits/rejected": -21.211769104003906, "logps/chosen": -1.3973969221115112, "logps/rejected": -1.5142686367034912, "loss": 1.4697, "odds_ratio_loss": 0.7228156924247742, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13973967730998993, "rewards/margins": 0.01168716698884964, "rewards/rejected": -0.15142686665058136, "sft_loss": 1.3973969221115112, "step": 620 }, { "epoch": 1.1202489442098245, "grad_norm": 3.814415693283081, "learning_rate": 3.46665269378139e-06, "logits/chosen": -21.241634368896484, "logits/rejected": -21.107349395751953, "logps/chosen": -1.4169210195541382, "logps/rejected": -1.4841772317886353, "loss": 1.4919, "odds_ratio_loss": 0.7493141889572144, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1416921317577362, "rewards/margins": 0.006725601851940155, "rewards/rejected": -0.14841774106025696, "sft_loss": 1.4169210195541382, "step": 630 }, { "epoch": 1.1380306734829961, "grad_norm": 5.05172872543335, "learning_rate": 3.4235244614569794e-06, "logits/chosen": -21.426654815673828, "logits/rejected": -21.443878173828125, "logps/chosen": -1.452530026435852, "logps/rejected": -1.5365841388702393, "loss": 1.528, "odds_ratio_loss": 0.7546505928039551, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1452530175447464, "rewards/margins": 0.00840541161596775, "rewards/rejected": -0.1536584198474884, "sft_loss": 1.452530026435852, "step": 640 }, { "epoch": 1.155812402756168, "grad_norm": 1.0250731706619263, "learning_rate": 3.3800755490698008e-06, "logits/chosen": -21.588850021362305, "logits/rejected": -21.425325393676758, "logps/chosen": -1.321417212486267, "logps/rejected": -1.539794921875, "loss": 1.3862, "odds_ratio_loss": 0.6476024985313416, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13214172422885895, "rewards/margins": 0.021837763488292694, "rewards/rejected": -0.15397948026657104, "sft_loss": 1.321417212486267, "step": 650 }, { "epoch": 1.17359413202934, "grad_norm": 1.5032236576080322, "learning_rate": 3.3363210436051287e-06, "logits/chosen": -21.39287567138672, "logits/rejected": -21.30692481994629, "logps/chosen": -1.409549593925476, "logps/rejected": -1.5230066776275635, "loss": 1.4823, "odds_ratio_loss": 0.7271451950073242, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14095497131347656, "rewards/margins": 0.011345705017447472, "rewards/rejected": -0.15230068564414978, "sft_loss": 1.409549593925476, "step": 660 }, { "epoch": 1.1913758613025116, "grad_norm": 1.4157321453094482, "learning_rate": 3.292276138160867e-06, "logits/chosen": -21.29572105407715, "logits/rejected": -21.25027084350586, "logps/chosen": -1.4160873889923096, "logps/rejected": -1.496361255645752, "loss": 1.491, "odds_ratio_loss": 0.749149739742279, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14160871505737305, "rewards/margins": 0.00802740640938282, "rewards/rejected": -0.14963611960411072, "sft_loss": 1.4160873889923096, "step": 670 }, { "epoch": 1.2091575905756835, "grad_norm": 1.573752522468567, "learning_rate": 3.2479561266719694e-06, "logits/chosen": -21.345748901367188, "logits/rejected": -21.232250213623047, "logps/chosen": -1.3891535997390747, "logps/rejected": -1.5042526721954346, "loss": 1.4603, "odds_ratio_loss": 0.7116107940673828, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13891534507274628, "rewards/margins": 0.011509908363223076, "rewards/rejected": -0.1504252701997757, "sft_loss": 1.3891535997390747, "step": 680 }, { "epoch": 1.2269393198488552, "grad_norm": 2.5362017154693604, "learning_rate": 3.2033763985998533e-06, "logits/chosen": -21.208703994750977, "logits/rejected": -21.204181671142578, "logps/chosen": -1.3326551914215088, "logps/rejected": -1.5722543001174927, "loss": 1.3975, "odds_ratio_loss": 0.6482537984848022, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.13326552510261536, "rewards/margins": 0.02395990863442421, "rewards/rejected": -0.15722543001174927, "sft_loss": 1.3326551914215088, "step": 690 }, { "epoch": 1.244721049122027, "grad_norm": 1.910599946975708, "learning_rate": 3.1585524335886335e-06, "logits/chosen": -21.477584838867188, "logits/rejected": -21.243457794189453, "logps/chosen": -1.299839735031128, "logps/rejected": -1.449894666671753, "loss": 1.3692, "odds_ratio_loss": 0.6940584182739258, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.12998396158218384, "rewards/margins": 0.015005489811301231, "rewards/rejected": -0.14498946070671082, "sft_loss": 1.299839735031128, "step": 700 }, { "epoch": 1.262502778395199, "grad_norm": 2.3555686473846436, "learning_rate": 3.1134997960900536e-06, "logits/chosen": -20.757158279418945, "logits/rejected": -20.784774780273438, "logps/chosen": -1.2707315683364868, "logps/rejected": -1.538629174232483, "loss": 1.334, "odds_ratio_loss": 0.6324664950370789, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.12707316875457764, "rewards/margins": 0.026789745315909386, "rewards/rejected": -0.15386290848255157, "sft_loss": 1.2707315683364868, "step": 710 }, { "epoch": 1.2802845076683709, "grad_norm": 1.596251130104065, "learning_rate": 3.0682341299589583e-06, "logits/chosen": -20.7999324798584, "logits/rejected": -20.802942276000977, "logps/chosen": -1.3453733921051025, "logps/rejected": -1.4210965633392334, "loss": 1.4194, "odds_ratio_loss": 0.7405422329902649, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.13453733921051025, "rewards/margins": 0.0075723156332969666, "rewards/rejected": -0.14210966229438782, "sft_loss": 1.3453733921051025, "step": 720 }, { "epoch": 1.2980662369415426, "grad_norm": 9.634610176086426, "learning_rate": 3.022771153021201e-06, "logits/chosen": -21.071128845214844, "logits/rejected": -21.114444732666016, "logps/chosen": -1.3551054000854492, "logps/rejected": -1.5129811763763428, "loss": 1.4248, "odds_ratio_loss": 0.6970704197883606, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13551053404808044, "rewards/margins": 0.015787573531270027, "rewards/rejected": -0.15129812061786652, "sft_loss": 1.3551054000854492, "step": 730 }, { "epoch": 1.3158479662147144, "grad_norm": 1.6211514472961426, "learning_rate": 2.9771266516158625e-06, "logits/chosen": -20.895437240600586, "logits/rejected": -21.04778480529785, "logps/chosen": -1.3368757963180542, "logps/rejected": -1.4986876249313354, "loss": 1.4062, "odds_ratio_loss": 0.6928601264953613, "rewards/accuracies": 0.5, "rewards/chosen": -0.1336875855922699, "rewards/margins": 0.016181182116270065, "rewards/rejected": -0.14986875653266907, "sft_loss": 1.3368757963180542, "step": 740 }, { "epoch": 1.3336296954878861, "grad_norm": 1.4428294897079468, "learning_rate": 2.9313164751136802e-06, "logits/chosen": -20.872339248657227, "logits/rejected": -21.019441604614258, "logps/chosen": -1.4122194051742554, "logps/rejected": -1.487066626548767, "loss": 1.4842, "odds_ratio_loss": 0.7194846272468567, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14122194051742554, "rewards/margins": 0.007484720554202795, "rewards/rejected": -0.14870667457580566, "sft_loss": 1.4122194051742554, "step": 750 }, { "epoch": 1.351411424761058, "grad_norm": 2.2369892597198486, "learning_rate": 2.8853565304135956e-06, "logits/chosen": -21.462568283081055, "logits/rejected": -21.10171127319336, "logps/chosen": -1.352461576461792, "logps/rejected": -1.3995507955551147, "loss": 1.4271, "odds_ratio_loss": 0.7464177012443542, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1352461576461792, "rewards/margins": 0.004708918742835522, "rewards/rejected": -0.1399550884962082, "sft_loss": 1.352461576461792, "step": 760 }, { "epoch": 1.36919315403423, "grad_norm": 4.38085412979126, "learning_rate": 2.839262776419313e-06, "logits/chosen": -20.986604690551758, "logits/rejected": -20.851150512695312, "logps/chosen": -1.3386285305023193, "logps/rejected": -1.5874344110488892, "loss": 1.4054, "odds_ratio_loss": 0.6678156852722168, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13386288285255432, "rewards/margins": 0.024880561977624893, "rewards/rejected": -0.15874342620372772, "sft_loss": 1.3386285305023193, "step": 770 }, { "epoch": 1.3869748833074016, "grad_norm": 1.5637321472167969, "learning_rate": 2.793051218497817e-06, "logits/chosen": -21.250728607177734, "logits/rejected": -21.10789680480957, "logps/chosen": -1.3795894384384155, "logps/rejected": -1.4174426794052124, "loss": 1.4542, "odds_ratio_loss": 0.7466022968292236, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13795895874500275, "rewards/margins": 0.003785322653129697, "rewards/rejected": -0.14174428582191467, "sft_loss": 1.3795894384384155, "step": 780 }, { "epoch": 1.4047566125805735, "grad_norm": 1.276485800743103, "learning_rate": 2.7467379029217437e-06, "logits/chosen": -20.930208206176758, "logits/rejected": -20.79922103881836, "logps/chosen": -1.3249105215072632, "logps/rejected": -1.5102876424789429, "loss": 1.3942, "odds_ratio_loss": 0.6923983693122864, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1324910670518875, "rewards/margins": 0.018537694588303566, "rewards/rejected": -0.15102875232696533, "sft_loss": 1.3249105215072632, "step": 790 }, { "epoch": 1.4225383418537452, "grad_norm": 1.1495212316513062, "learning_rate": 2.7003389112975546e-06, "logits/chosen": -21.19894027709961, "logits/rejected": -21.32394790649414, "logps/chosen": -1.3503518104553223, "logps/rejected": -1.484006404876709, "loss": 1.4195, "odds_ratio_loss": 0.6918057203292847, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1350351870059967, "rewards/margins": 0.013365456834435463, "rewards/rejected": -0.14840063452720642, "sft_loss": 1.3503518104553223, "step": 800 }, { "epoch": 1.440320071126917, "grad_norm": 4.020893573760986, "learning_rate": 2.653870354981437e-06, "logits/chosen": -21.07791519165039, "logits/rejected": -20.885608673095703, "logps/chosen": -1.2470946311950684, "logps/rejected": -1.3942331075668335, "loss": 1.3155, "odds_ratio_loss": 0.6838669776916504, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12470944970846176, "rewards/margins": 0.014713853597640991, "rewards/rejected": -0.13942332565784454, "sft_loss": 1.2470946311950684, "step": 810 }, { "epoch": 1.458101800400089, "grad_norm": 3.656785011291504, "learning_rate": 2.6073483694848777e-06, "logits/chosen": -20.693532943725586, "logits/rejected": -21.043460845947266, "logps/chosen": -1.288588285446167, "logps/rejected": -1.4409494400024414, "loss": 1.3582, "odds_ratio_loss": 0.6965524554252625, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12885884940624237, "rewards/margins": 0.015236112289130688, "rewards/rejected": -0.14409494400024414, "sft_loss": 1.288588285446167, "step": 820 }, { "epoch": 1.4758835296732609, "grad_norm": 1.309704303741455, "learning_rate": 2.560789108871847e-06, "logits/chosen": -20.856311798095703, "logits/rejected": -20.888708114624023, "logps/chosen": -1.3494679927825928, "logps/rejected": -1.5807578563690186, "loss": 1.4162, "odds_ratio_loss": 0.6674301028251648, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1349468231201172, "rewards/margins": 0.02312898077070713, "rewards/rejected": -0.15807577967643738, "sft_loss": 1.3494679927825928, "step": 830 }, { "epoch": 1.4936652589464325, "grad_norm": 5.437036037445068, "learning_rate": 2.514208740149544e-06, "logits/chosen": -21.045909881591797, "logits/rejected": -21.26214599609375, "logps/chosen": -1.4145755767822266, "logps/rejected": -1.549298644065857, "loss": 1.4864, "odds_ratio_loss": 0.7186577320098877, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14145755767822266, "rewards/margins": 0.013472315855324268, "rewards/rejected": -0.15492987632751465, "sft_loss": 1.4145755767822266, "step": 840 }, { "epoch": 1.5114469882196042, "grad_norm": 2.7086102962493896, "learning_rate": 2.46762343765464e-06, "logits/chosen": -21.045820236206055, "logits/rejected": -21.116756439208984, "logps/chosen": -1.4063694477081299, "logps/rejected": -1.5858089923858643, "loss": 1.4749, "odds_ratio_loss": 0.6853106021881104, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.14063693583011627, "rewards/margins": 0.01794394478201866, "rewards/rejected": -0.15858088433742523, "sft_loss": 1.4063694477081299, "step": 850 }, { "epoch": 1.5292287174927761, "grad_norm": 3.8162646293640137, "learning_rate": 2.4210493774369903e-06, "logits/chosen": -20.788593292236328, "logits/rejected": -20.716583251953125, "logps/chosen": -1.406285285949707, "logps/rejected": -1.5719993114471436, "loss": 1.4773, "odds_ratio_loss": 0.7099908590316772, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14062853157520294, "rewards/margins": 0.016571396961808205, "rewards/rejected": -0.1571999341249466, "sft_loss": 1.406285285949707, "step": 860 }, { "epoch": 1.547010446765948, "grad_norm": 1.2074657678604126, "learning_rate": 2.374502731642732e-06, "logits/chosen": -20.910995483398438, "logits/rejected": -20.997020721435547, "logps/chosen": -1.3468477725982666, "logps/rejected": -1.490969181060791, "loss": 1.4171, "odds_ratio_loss": 0.7025480270385742, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.13468477129936218, "rewards/margins": 0.014412140473723412, "rewards/rejected": -0.14909692108631134, "sft_loss": 1.3468477725982666, "step": 870 }, { "epoch": 1.56479217603912, "grad_norm": 1.6771084070205688, "learning_rate": 2.3279996628987556e-06, "logits/chosen": -21.090503692626953, "logits/rejected": -21.15408706665039, "logps/chosen": -1.3241709470748901, "logps/rejected": -1.4298092126846313, "loss": 1.397, "odds_ratio_loss": 0.7280608415603638, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13241711258888245, "rewards/margins": 0.010563802905380726, "rewards/rejected": -0.1429809182882309, "sft_loss": 1.3241709470748901, "step": 880 }, { "epoch": 1.5825739053122916, "grad_norm": 2.092092514038086, "learning_rate": 2.281556318700474e-06, "logits/chosen": -20.86192512512207, "logits/rejected": -21.044658660888672, "logps/chosen": -1.3072993755340576, "logps/rejected": -1.3738138675689697, "loss": 1.3822, "odds_ratio_loss": 0.7485288381576538, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13072994351387024, "rewards/margins": 0.006651435978710651, "rewards/rejected": -0.13738137483596802, "sft_loss": 1.3072993755340576, "step": 890 }, { "epoch": 1.6003556345854635, "grad_norm": 6.660823822021484, "learning_rate": 2.2351888258048408e-06, "logits/chosen": -20.55089569091797, "logits/rejected": -20.74386978149414, "logps/chosen": -1.3101674318313599, "logps/rejected": -1.4409325122833252, "loss": 1.3826, "odds_ratio_loss": 0.7239800691604614, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.13101674616336823, "rewards/margins": 0.013076506555080414, "rewards/rejected": -0.14409324526786804, "sft_loss": 1.3101674318313599, "step": 900 }, { "epoch": 1.6181373638586352, "grad_norm": 1.3889552354812622, "learning_rate": 2.188913284630584e-06, "logits/chosen": -21.00895118713379, "logits/rejected": -21.11439323425293, "logps/chosen": -1.3723797798156738, "logps/rejected": -1.4007512331008911, "loss": 1.449, "odds_ratio_loss": 0.7658642530441284, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13723799586296082, "rewards/margins": 0.0028371470980346203, "rewards/rejected": -0.14007511734962463, "sft_loss": 1.3723797798156738, "step": 910 }, { "epoch": 1.635919093131807, "grad_norm": 4.06219482421875, "learning_rate": 2.1427457636675652e-06, "logits/chosen": -21.082805633544922, "logits/rejected": -21.207538604736328, "logps/chosen": -1.336096167564392, "logps/rejected": -1.4373667240142822, "loss": 1.4096, "odds_ratio_loss": 0.7348427176475525, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13360963761806488, "rewards/margins": 0.010127037763595581, "rewards/rejected": -0.14373667538166046, "sft_loss": 1.336096167564392, "step": 920 }, { "epoch": 1.653700822404979, "grad_norm": 1.464863657951355, "learning_rate": 2.096702293897247e-06, "logits/chosen": -20.881576538085938, "logits/rejected": -20.812564849853516, "logps/chosen": -1.3259438276290894, "logps/rejected": -1.5576345920562744, "loss": 1.3925, "odds_ratio_loss": 0.6658841371536255, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13259439170360565, "rewards/margins": 0.02316909097135067, "rewards/rejected": -0.15576346218585968, "sft_loss": 1.3259438276290894, "step": 930 }, { "epoch": 1.6714825516781509, "grad_norm": 2.923940420150757, "learning_rate": 2.0507988632261672e-06, "logits/chosen": -20.792316436767578, "logits/rejected": -20.86935043334961, "logps/chosen": -1.3512170314788818, "logps/rejected": -1.4976381063461304, "loss": 1.4213, "odds_ratio_loss": 0.7008249163627625, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1351216733455658, "rewards/margins": 0.014642128720879555, "rewards/rejected": -0.14976383745670319, "sft_loss": 1.3512170314788818, "step": 940 }, { "epoch": 1.6892642809513225, "grad_norm": 3.5238471031188965, "learning_rate": 2.005051410934382e-06, "logits/chosen": -20.95963478088379, "logits/rejected": -20.97479248046875, "logps/chosen": -1.4458208084106445, "logps/rejected": -1.5041887760162354, "loss": 1.5203, "odds_ratio_loss": 0.7446683645248413, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.14458207786083221, "rewards/margins": 0.005836788564920425, "rewards/rejected": -0.15041887760162354, "sft_loss": 1.4458208084106445, "step": 950 }, { "epoch": 1.7070460102244942, "grad_norm": 2.6721088886260986, "learning_rate": 1.9594758221407843e-06, "logits/chosen": -20.884212493896484, "logits/rejected": -20.890071868896484, "logps/chosen": -1.3164139986038208, "logps/rejected": -1.506830096244812, "loss": 1.3826, "odds_ratio_loss": 0.6615261435508728, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13164140284061432, "rewards/margins": 0.019041623920202255, "rewards/rejected": -0.15068301558494568, "sft_loss": 1.3164139986038208, "step": 960 }, { "epoch": 1.724827739497666, "grad_norm": 2.1806442737579346, "learning_rate": 1.9140879222872408e-06, "logits/chosen": -20.64748191833496, "logits/rejected": -20.80613136291504, "logps/chosen": -1.3217017650604248, "logps/rejected": -1.4095408916473389, "loss": 1.3952, "odds_ratio_loss": 0.7351614236831665, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.1321701854467392, "rewards/margins": 0.008783898316323757, "rewards/rejected": -0.14095407724380493, "sft_loss": 1.3217017650604248, "step": 970 }, { "epoch": 1.742609468770838, "grad_norm": 1.7276735305786133, "learning_rate": 1.8689034716434346e-06, "logits/chosen": -21.096982955932617, "logits/rejected": -21.016551971435547, "logps/chosen": -1.3933743238449097, "logps/rejected": -1.4783251285552979, "loss": 1.4665, "odds_ratio_loss": 0.7310749292373657, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1393374353647232, "rewards/margins": 0.008495080284774303, "rewards/rejected": -0.14783251285552979, "sft_loss": 1.3933743238449097, "step": 980 }, { "epoch": 1.76039119804401, "grad_norm": 1.8123304843902588, "learning_rate": 1.8239381598343576e-06, "logits/chosen": -20.750640869140625, "logits/rejected": -20.75037956237793, "logps/chosen": -1.3481253385543823, "logps/rejected": -1.4228088855743408, "loss": 1.4223, "odds_ratio_loss": 0.7416225671768188, "rewards/accuracies": 0.5, "rewards/chosen": -0.13481254875659943, "rewards/margins": 0.0074683367274701595, "rewards/rejected": -0.14228087663650513, "sft_loss": 1.3481253385543823, "step": 990 }, { "epoch": 1.7781729273171816, "grad_norm": 3.5161044597625732, "learning_rate": 1.779207600392312e-06, "logits/chosen": -21.101184844970703, "logits/rejected": -21.04250717163086, "logps/chosen": -1.4057555198669434, "logps/rejected": -1.5241641998291016, "loss": 1.4761, "odds_ratio_loss": 0.7033491134643555, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.1405755579471588, "rewards/margins": 0.011840855702757835, "rewards/rejected": -0.1524164229631424, "sft_loss": 1.4057555198669434, "step": 1000 }, { "epoch": 1.7781729273171816, "eval_logits/chosen": -20.803815841674805, "eval_logits/rejected": -20.891420364379883, "eval_logps/chosen": -1.3395137786865234, "eval_logps/rejected": -1.5095441341400146, "eval_loss": 1.407221794128418, "eval_odds_ratio_loss": 0.6770801544189453, "eval_rewards/accuracies": 0.5350000262260437, "eval_rewards/chosen": -0.1339513659477234, "eval_rewards/margins": 0.017003033310174942, "eval_rewards/rejected": -0.15095441043376923, "eval_runtime": 80.0538, "eval_samples_per_second": 12.492, "eval_sft_loss": 1.3395137786865234, "eval_steps_per_second": 6.246, "step": 1000 }, { "epoch": 1.7959546565903532, "grad_norm": 3.0343945026397705, "learning_rate": 1.7347273253353552e-06, "logits/chosen": -20.704559326171875, "logits/rejected": -20.68727684020996, "logps/chosen": -1.314007043838501, "logps/rejected": -1.4146376848220825, "loss": 1.389, "odds_ratio_loss": 0.7499723434448242, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1314007043838501, "rewards/margins": 0.01006306428462267, "rewards/rejected": -0.14146378636360168, "sft_loss": 1.314007043838501, "step": 1010 }, { "epoch": 1.8137363858635251, "grad_norm": 6.905886650085449, "learning_rate": 1.690512779774029e-06, "logits/chosen": -20.81467056274414, "logits/rejected": -20.834705352783203, "logps/chosen": -1.4023054838180542, "logps/rejected": -1.652772307395935, "loss": 1.4691, "odds_ratio_loss": 0.6682060956954956, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.14023055136203766, "rewards/margins": 0.025046680122613907, "rewards/rejected": -0.16527722775936127, "sft_loss": 1.4023054838180542, "step": 1020 }, { "epoch": 1.831518115136697, "grad_norm": 1.7513582706451416, "learning_rate": 1.6465793165482838e-06, "logits/chosen": -20.86380386352539, "logits/rejected": -20.82488441467285, "logps/chosen": -1.2588412761688232, "logps/rejected": -1.4600279331207275, "loss": 1.3236, "odds_ratio_loss": 0.6471126079559326, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.12588414549827576, "rewards/margins": 0.02011866495013237, "rewards/rejected": -0.14600279927253723, "sft_loss": 1.2588412761688232, "step": 1030 }, { "epoch": 1.849299844409869, "grad_norm": 2.2100415229797363, "learning_rate": 1.6029421908964305e-06, "logits/chosen": -20.778493881225586, "logits/rejected": -20.64494514465332, "logps/chosen": -1.3084795475006104, "logps/rejected": -1.622815728187561, "loss": 1.375, "odds_ratio_loss": 0.664787232875824, "rewards/accuracies": 0.5625, "rewards/chosen": -0.1308479607105255, "rewards/margins": 0.03143361583352089, "rewards/rejected": -0.1622815579175949, "sft_loss": 1.3084795475006104, "step": 1040 }, { "epoch": 1.8670815736830408, "grad_norm": 4.5934367179870605, "learning_rate": 1.559616555157985e-06, "logits/chosen": -21.22179412841797, "logits/rejected": -20.979957580566406, "logps/chosen": -1.3719347715377808, "logps/rejected": -1.5128790140151978, "loss": 1.4452, "odds_ratio_loss": 0.7331027388572693, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1371934711933136, "rewards/margins": 0.014094437472522259, "rewards/rejected": -0.15128789842128754, "sft_loss": 1.3719347715377808, "step": 1050 }, { "epoch": 1.8848633029562125, "grad_norm": 1.7163333892822266, "learning_rate": 1.516617453512252e-06, "logits/chosen": -21.102188110351562, "logits/rejected": -21.131221771240234, "logps/chosen": -1.4223716259002686, "logps/rejected": -1.4797694683074951, "loss": 1.498, "odds_ratio_loss": 0.7560666799545288, "rewards/accuracies": 0.46875, "rewards/chosen": -0.14223715662956238, "rewards/margins": 0.005739795975387096, "rewards/rejected": -0.14797696471214294, "sft_loss": 1.4223716259002686, "step": 1060 }, { "epoch": 1.9026450322293842, "grad_norm": 9.351452827453613, "learning_rate": 1.473959816754449e-06, "logits/chosen": -20.615371704101562, "logits/rejected": -20.649810791015625, "logps/chosen": -1.3047014474868774, "logps/rejected": -1.3762633800506592, "loss": 1.3786, "odds_ratio_loss": 0.7393638491630554, "rewards/accuracies": 0.5, "rewards/chosen": -0.1304701417684555, "rewards/margins": 0.00715619046241045, "rewards/rejected": -0.13762633502483368, "sft_loss": 1.3047014474868774, "step": 1070 }, { "epoch": 1.920426761502556, "grad_norm": 2.190560817718506, "learning_rate": 1.4316584571112213e-06, "logits/chosen": -21.255840301513672, "logits/rejected": -21.131498336791992, "logps/chosen": -1.3643953800201416, "logps/rejected": -1.461114525794983, "loss": 1.4364, "odds_ratio_loss": 0.7197447419166565, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13643954694271088, "rewards/margins": 0.009671924635767937, "rewards/rejected": -0.14611145853996277, "sft_loss": 1.3643953800201416, "step": 1080 }, { "epoch": 1.938208490775728, "grad_norm": 2.542182683944702, "learning_rate": 1.389728063097306e-06, "logits/chosen": -20.93314552307129, "logits/rejected": -20.863218307495117, "logps/chosen": -1.389034390449524, "logps/rejected": -1.5989328622817993, "loss": 1.4573, "odds_ratio_loss": 0.6827085614204407, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.1389034539461136, "rewards/margins": 0.020989837124943733, "rewards/rejected": -0.15989328920841217, "sft_loss": 1.389034390449524, "step": 1090 }, { "epoch": 1.9559902200488999, "grad_norm": 1.5650415420532227, "learning_rate": 1.348183194415179e-06, "logits/chosen": -20.95106315612793, "logits/rejected": -20.61818504333496, "logps/chosen": -1.323676347732544, "logps/rejected": -1.5667550563812256, "loss": 1.3885, "odds_ratio_loss": 0.6487289071083069, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13236764073371887, "rewards/margins": 0.02430787682533264, "rewards/rejected": -0.1566755324602127, "sft_loss": 1.323676347732544, "step": 1100 }, { "epoch": 1.9737719493220716, "grad_norm": 1.7203210592269897, "learning_rate": 1.3070382768994015e-06, "logits/chosen": -20.69628143310547, "logits/rejected": -20.650815963745117, "logps/chosen": -1.3079763650894165, "logps/rejected": -1.449339747428894, "loss": 1.3762, "odds_ratio_loss": 0.6826270818710327, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13079765439033508, "rewards/margins": 0.014136332087218761, "rewards/rejected": -0.14493396878242493, "sft_loss": 1.3079763650894165, "step": 1110 }, { "epoch": 1.9915536785952432, "grad_norm": 1.5598257780075073, "learning_rate": 1.2663075975074746e-06, "logits/chosen": -20.689380645751953, "logits/rejected": -20.69732666015625, "logps/chosen": -1.3402197360992432, "logps/rejected": -1.503177285194397, "loss": 1.4129, "odds_ratio_loss": 0.7268449664115906, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13402198255062103, "rewards/margins": 0.016295749694108963, "rewards/rejected": -0.1503177136182785, "sft_loss": 1.3402197360992432, "step": 1120 }, { "epoch": 2.009335407868415, "grad_norm": 5.007309436798096, "learning_rate": 1.2260052993589034e-06, "logits/chosen": -20.855276107788086, "logits/rejected": -20.814468383789062, "logps/chosen": -1.4246357679367065, "logps/rejected": -1.4585391283035278, "loss": 1.5014, "odds_ratio_loss": 0.7673634886741638, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1424635797739029, "rewards/margins": 0.003390337573364377, "rewards/rejected": -0.1458539217710495, "sft_loss": 1.4246357679367065, "step": 1130 }, { "epoch": 2.027117137141587, "grad_norm": 1.2563971281051636, "learning_rate": 1.1861453768242099e-06, "logits/chosen": -20.794506072998047, "logits/rejected": -20.795894622802734, "logps/chosen": -1.2917953729629517, "logps/rejected": -1.483782172203064, "loss": 1.3577, "odds_ratio_loss": 0.6590424180030823, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.12917952239513397, "rewards/margins": 0.019198691472411156, "rewards/rejected": -0.14837822318077087, "sft_loss": 1.2917953729629517, "step": 1140 }, { "epoch": 2.044898866414759, "grad_norm": 8.363728523254395, "learning_rate": 1.1467416706655982e-06, "logits/chosen": -20.971622467041016, "logits/rejected": -21.218524932861328, "logps/chosen": -1.418050765991211, "logps/rejected": -1.580128788948059, "loss": 1.4924, "odds_ratio_loss": 0.7438761591911316, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.14180508255958557, "rewards/margins": 0.016207797452807426, "rewards/rejected": -0.15801288187503815, "sft_loss": 1.418050765991211, "step": 1150 }, { "epoch": 2.062680595687931, "grad_norm": 1.5622318983078003, "learning_rate": 1.1078078632309559e-06, "logits/chosen": -20.65304946899414, "logits/rejected": -20.797122955322266, "logps/chosen": -1.3177438974380493, "logps/rejected": -1.4770663976669312, "loss": 1.3849, "odds_ratio_loss": 0.6711241006851196, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1317743957042694, "rewards/margins": 0.015932243317365646, "rewards/rejected": -0.14770662784576416, "sft_loss": 1.3177438974380493, "step": 1160 }, { "epoch": 2.0804623249611023, "grad_norm": 1.6626743078231812, "learning_rate": 1.0693574737028627e-06, "logits/chosen": -20.749677658081055, "logits/rejected": -20.718555450439453, "logps/chosen": -1.354952096939087, "logps/rejected": -1.4761542081832886, "loss": 1.428, "odds_ratio_loss": 0.73005211353302, "rewards/accuracies": 0.5, "rewards/chosen": -0.13549521565437317, "rewards/margins": 0.012120204977691174, "rewards/rejected": -0.14761541783809662, "sft_loss": 1.354952096939087, "step": 1170 }, { "epoch": 2.098244054234274, "grad_norm": 3.4684457778930664, "learning_rate": 1.0314038534042586e-06, "logits/chosen": -20.948108673095703, "logits/rejected": -20.716609954833984, "logps/chosen": -1.2302569150924683, "logps/rejected": -1.4216673374176025, "loss": 1.2983, "odds_ratio_loss": 0.6800249814987183, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.12302567809820175, "rewards/margins": 0.019141051918268204, "rewards/rejected": -0.14216673374176025, "sft_loss": 1.2302569150924683, "step": 1180 }, { "epoch": 2.116025783507446, "grad_norm": 1.7580640316009521, "learning_rate": 9.939601811623946e-07, "logits/chosen": -20.846065521240234, "logits/rejected": -20.80862045288086, "logps/chosen": -1.3318583965301514, "logps/rejected": -1.4921131134033203, "loss": 1.4031, "odds_ratio_loss": 0.7127273678779602, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.13318583369255066, "rewards/margins": 0.016025487333536148, "rewards/rejected": -0.1492113173007965, "sft_loss": 1.3318583965301514, "step": 1190 }, { "epoch": 2.133807512780618, "grad_norm": 2.0461864471435547, "learning_rate": 9.570394587326825e-07, "logits/chosen": -21.051130294799805, "logits/rejected": -20.864850997924805, "logps/chosen": -1.322939395904541, "logps/rejected": -1.5531421899795532, "loss": 1.3893, "odds_ratio_loss": 0.6637840867042542, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -0.1322939246892929, "rewards/margins": 0.023020274937152863, "rewards/rejected": -0.15531422197818756, "sft_loss": 1.322939395904541, "step": 1200 }, { "epoch": 2.15158924205379, "grad_norm": 1.2578119039535522, "learning_rate": 9.206545062840302e-07, "logits/chosen": -21.234752655029297, "logits/rejected": -20.857492446899414, "logps/chosen": -1.2849655151367188, "logps/rejected": -1.4767402410507202, "loss": 1.3511, "odds_ratio_loss": 0.6610640287399292, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -0.12849654257297516, "rewards/margins": 0.019177492707967758, "rewards/rejected": -0.14767403900623322, "sft_loss": 1.2849655151367188, "step": 1210 }, { "epoch": 2.1693709713269618, "grad_norm": 1.5944854021072388, "learning_rate": 8.848179579472285e-07, "logits/chosen": -20.92203140258789, "logits/rejected": -20.818485260009766, "logps/chosen": -1.2799731492996216, "logps/rejected": -1.326030969619751, "loss": 1.3534, "odds_ratio_loss": 0.7338781952857971, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.12799732387065887, "rewards/margins": 0.0046057915315032005, "rewards/rejected": -0.13260310888290405, "sft_loss": 1.2799731492996216, "step": 1220 }, { "epoch": 2.1871527006001332, "grad_norm": 3.263883352279663, "learning_rate": 8.495422574279403e-07, "logits/chosen": -20.327661514282227, "logits/rejected": -20.28653907775879, "logps/chosen": -1.262486219406128, "logps/rejected": -1.4986459016799927, "loss": 1.3277, "odds_ratio_loss": 0.6521891951560974, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -0.12624862790107727, "rewards/margins": 0.02361595258116722, "rewards/rejected": -0.1498645842075348, "sft_loss": 1.262486219406128, "step": 1230 }, { "epoch": 2.204934429873305, "grad_norm": 2.0986313819885254, "learning_rate": 8.148396536858063e-07, "logits/chosen": -21.014957427978516, "logits/rejected": -21.147602081298828, "logps/chosen": -1.3925727605819702, "logps/rejected": -1.589383840560913, "loss": 1.463, "odds_ratio_loss": 0.7045022249221802, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.1392572671175003, "rewards/margins": 0.019681129604578018, "rewards/rejected": -0.15893837809562683, "sft_loss": 1.3925727605819702, "step": 1240 }, { "epoch": 2.222716159146477, "grad_norm": 1.3406250476837158, "learning_rate": 7.807221966811815e-07, "logits/chosen": -20.607036590576172, "logits/rejected": -20.66307830810547, "logps/chosen": -1.2920827865600586, "logps/rejected": -1.41164231300354, "loss": 1.3638, "odds_ratio_loss": 0.7169677019119263, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1292082816362381, "rewards/margins": 0.01195596344769001, "rewards/rejected": -0.14116425812244415, "sft_loss": 1.2920827865600586, "step": 1250 }, { "epoch": 2.240497888419649, "grad_norm": 2.084696054458618, "learning_rate": 7.47201733190962e-07, "logits/chosen": -20.630435943603516, "logits/rejected": -20.60986328125, "logps/chosen": -1.2978394031524658, "logps/rejected": -1.403597116470337, "loss": 1.3682, "odds_ratio_loss": 0.7040928602218628, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.12978394329547882, "rewards/margins": 0.010575750842690468, "rewards/rejected": -0.14035969972610474, "sft_loss": 1.2978394031524658, "step": 1260 }, { "epoch": 2.258279617692821, "grad_norm": 1.7839128971099854, "learning_rate": 7.142899026949721e-07, "logits/chosen": -20.951190948486328, "logits/rejected": -20.935705184936523, "logps/chosen": -1.320299744606018, "logps/rejected": -1.4439074993133545, "loss": 1.3892, "odds_ratio_loss": 0.6885126233100891, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13202998042106628, "rewards/margins": 0.012360776774585247, "rewards/rejected": -0.1443907469511032, "sft_loss": 1.320299744606018, "step": 1270 }, { "epoch": 2.2760613469659923, "grad_norm": 7.137161731719971, "learning_rate": 6.819981333343273e-07, "logits/chosen": -20.221033096313477, "logits/rejected": -20.27283477783203, "logps/chosen": -1.2987910509109497, "logps/rejected": -1.4801701307296753, "loss": 1.3676, "odds_ratio_loss": 0.6884258985519409, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12987910211086273, "rewards/margins": 0.01813790202140808, "rewards/rejected": -0.148017019033432, "sft_loss": 1.2987910509109497, "step": 1280 }, { "epoch": 2.293843076239164, "grad_norm": 2.513110876083374, "learning_rate": 6.503376379431839e-07, "logits/chosen": -20.69548797607422, "logits/rejected": -20.705198287963867, "logps/chosen": -1.4108153581619263, "logps/rejected": -1.396875262260437, "loss": 1.4867, "odds_ratio_loss": 0.7589144110679626, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.1410815417766571, "rewards/margins": -0.0013940061908215284, "rewards/rejected": -0.13968753814697266, "sft_loss": 1.4108153581619263, "step": 1290 }, { "epoch": 2.311624805512336, "grad_norm": 6.825961112976074, "learning_rate": 6.193194101552502e-07, "logits/chosen": -20.706968307495117, "logits/rejected": -20.34494400024414, "logps/chosen": -1.327618956565857, "logps/rejected": -1.4798409938812256, "loss": 1.3947, "odds_ratio_loss": 0.6709089279174805, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.1327619105577469, "rewards/margins": 0.01522219367325306, "rewards/rejected": -0.147984117269516, "sft_loss": 1.327618956565857, "step": 1300 }, { "epoch": 2.329406534785508, "grad_norm": 2.9888756275177, "learning_rate": 5.889542205864083e-07, "logits/chosen": -20.558048248291016, "logits/rejected": -20.51228904724121, "logps/chosen": -1.3237196207046509, "logps/rejected": -1.4659796953201294, "loss": 1.3931, "odds_ratio_loss": 0.6935244798660278, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13237197697162628, "rewards/margins": 0.014225991442799568, "rewards/rejected": -0.1465979516506195, "sft_loss": 1.3237196207046509, "step": 1310 }, { "epoch": 2.34718826405868, "grad_norm": 1.8925628662109375, "learning_rate": 5.592526130947862e-07, "logits/chosen": -20.927398681640625, "logits/rejected": -20.855573654174805, "logps/chosen": -1.3616701364517212, "logps/rejected": -1.4716918468475342, "loss": 1.4372, "odds_ratio_loss": 0.7552787065505981, "rewards/accuracies": 0.4437499940395355, "rewards/chosen": -0.1361670196056366, "rewards/margins": 0.011002160608768463, "rewards/rejected": -0.14716917276382446, "sft_loss": 1.3616701364517212, "step": 1320 }, { "epoch": 2.3649699933318518, "grad_norm": 2.2592906951904297, "learning_rate": 5.302249011195507e-07, "logits/chosen": -20.643238067626953, "logits/rejected": -20.706254959106445, "logps/chosen": -1.286387324333191, "logps/rejected": -1.3748500347137451, "loss": 1.3577, "odds_ratio_loss": 0.712990939617157, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.12863874435424805, "rewards/margins": 0.008846262469887733, "rewards/rejected": -0.13748499751091003, "sft_loss": 1.286387324333191, "step": 1330 }, { "epoch": 2.382751722605023, "grad_norm": 3.97871470451355, "learning_rate": 5.018811640997307e-07, "logits/chosen": -20.570959091186523, "logits/rejected": -20.81188201904297, "logps/chosen": -1.383998155593872, "logps/rejected": -1.646945595741272, "loss": 1.4509, "odds_ratio_loss": 0.6689848899841309, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13839980959892273, "rewards/margins": 0.026294732466340065, "rewards/rejected": -0.16469456255435944, "sft_loss": 1.383998155593872, "step": 1340 }, { "epoch": 2.400533451878195, "grad_norm": 1.2727420330047607, "learning_rate": 4.7423124397427105e-07, "logits/chosen": -20.430959701538086, "logits/rejected": -20.650379180908203, "logps/chosen": -1.35360848903656, "logps/rejected": -1.4245867729187012, "loss": 1.4266, "odds_ratio_loss": 0.7302565574645996, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13536083698272705, "rewards/margins": 0.007097836583852768, "rewards/rejected": -0.1424586921930313, "sft_loss": 1.35360848903656, "step": 1350 }, { "epoch": 2.418315181151367, "grad_norm": 2.574122428894043, "learning_rate": 4.472847417645787e-07, "logits/chosen": -20.755605697631836, "logits/rejected": -20.458105087280273, "logps/chosen": -1.3647658824920654, "logps/rejected": -1.634526252746582, "loss": 1.4294, "odds_ratio_loss": 0.6465052366256714, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.13647659122943878, "rewards/margins": 0.026976028457283974, "rewards/rejected": -0.1634526252746582, "sft_loss": 1.3647658824920654, "step": 1360 }, { "epoch": 2.436096910424539, "grad_norm": 1.2747830152511597, "learning_rate": 4.210510142406993e-07, "logits/chosen": -20.725910186767578, "logits/rejected": -20.539182662963867, "logps/chosen": -1.3636181354522705, "logps/rejected": -1.573249101638794, "loss": 1.4312, "odds_ratio_loss": 0.6754659414291382, "rewards/accuracies": 0.53125, "rewards/chosen": -0.13636180758476257, "rewards/margins": 0.020963111892342567, "rewards/rejected": -0.1573249250650406, "sft_loss": 1.3636181354522705, "step": 1370 }, { "epoch": 2.4538786396977104, "grad_norm": 1.5959084033966064, "learning_rate": 3.9553917067232966e-07, "logits/chosen": -20.685565948486328, "logits/rejected": -20.612730026245117, "logps/chosen": -1.3631963729858398, "logps/rejected": -1.501734972000122, "loss": 1.4351, "odds_ratio_loss": 0.7191514372825623, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13631963729858398, "rewards/margins": 0.013853861019015312, "rewards/rejected": -0.15017351508140564, "sft_loss": 1.3631963729858398, "step": 1380 }, { "epoch": 2.4716603689708823, "grad_norm": 1.8356739282608032, "learning_rate": 3.707580696657509e-07, "logits/chosen": -20.62293243408203, "logits/rejected": -20.265270233154297, "logps/chosen": -1.3268606662750244, "logps/rejected": -1.4021425247192383, "loss": 1.3996, "odds_ratio_loss": 0.727665364742279, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.1326860636472702, "rewards/margins": 0.007528189569711685, "rewards/rejected": -0.1402142494916916, "sft_loss": 1.3268606662750244, "step": 1390 }, { "epoch": 2.489442098244054, "grad_norm": 2.215832471847534, "learning_rate": 3.4671631608781815e-07, "logits/chosen": -20.820430755615234, "logits/rejected": -20.70709991455078, "logps/chosen": -1.3705365657806396, "logps/rejected": -1.4663138389587402, "loss": 1.4448, "odds_ratio_loss": 0.742554783821106, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.13705363869667053, "rewards/margins": 0.009577738121151924, "rewards/rejected": -0.1466313898563385, "sft_loss": 1.3705365657806396, "step": 1400 }, { "epoch": 2.507223827517226, "grad_norm": 2.842649221420288, "learning_rate": 3.234222580780405e-07, "logits/chosen": -20.579906463623047, "logits/rejected": -20.50626564025879, "logps/chosen": -1.3471759557724, "logps/rejected": -1.4215319156646729, "loss": 1.4184, "odds_ratio_loss": 0.711919367313385, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13471761345863342, "rewards/margins": 0.007435592822730541, "rewards/rejected": -0.14215318858623505, "sft_loss": 1.3471759557724, "step": 1410 }, { "epoch": 2.525005556790398, "grad_norm": 3.6692733764648438, "learning_rate": 3.0088398414982375e-07, "logits/chosen": -20.674327850341797, "logits/rejected": -20.809429168701172, "logps/chosen": -1.3552839756011963, "logps/rejected": -1.5087939500808716, "loss": 1.4268, "odds_ratio_loss": 0.7148610353469849, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13552840054035187, "rewards/margins": 0.015351003035902977, "rewards/rejected": -0.1508793979883194, "sft_loss": 1.3552839756011963, "step": 1420 }, { "epoch": 2.54278728606357, "grad_norm": 1.8147318363189697, "learning_rate": 2.7910932038184487e-07, "logits/chosen": -20.291900634765625, "logits/rejected": -19.921438217163086, "logps/chosen": -1.3218873739242554, "logps/rejected": -1.475524663925171, "loss": 1.3899, "odds_ratio_loss": 0.6805239319801331, "rewards/accuracies": 0.5625, "rewards/chosen": -0.13218875229358673, "rewards/margins": 0.015363717451691628, "rewards/rejected": -0.1475524604320526, "sft_loss": 1.3218873739242554, "step": 1430 }, { "epoch": 2.5605690153367417, "grad_norm": 2.6163878440856934, "learning_rate": 2.5810582770057325e-07, "logits/chosen": -20.752613067626953, "logits/rejected": -20.92694854736328, "logps/chosen": -1.2694684267044067, "logps/rejected": -1.3633973598480225, "loss": 1.3412, "odds_ratio_loss": 0.7170311212539673, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.1269468367099762, "rewards/margins": 0.009392908774316311, "rewards/rejected": -0.13633975386619568, "sft_loss": 1.2694684267044067, "step": 1440 }, { "epoch": 2.578350744609913, "grad_norm": 2.4267303943634033, "learning_rate": 2.3788079925484402e-07, "logits/chosen": -20.907817840576172, "logits/rejected": -20.742984771728516, "logps/chosen": -1.3328653573989868, "logps/rejected": -1.4286470413208008, "loss": 1.4041, "odds_ratio_loss": 0.7125917673110962, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1332865208387375, "rewards/margins": 0.009578163735568523, "rewards/rejected": -0.14286470413208008, "sft_loss": 1.3328653573989868, "step": 1450 }, { "epoch": 2.596132473883085, "grad_norm": 2.563065528869629, "learning_rate": 2.1844125788342661e-07, "logits/chosen": -20.36819076538086, "logits/rejected": -20.245798110961914, "logps/chosen": -1.3011656999588013, "logps/rejected": -1.601665735244751, "loss": 1.3692, "odds_ratio_loss": 0.679993212223053, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1301165670156479, "rewards/margins": 0.030050003901124, "rewards/rejected": -0.16016657650470734, "sft_loss": 1.3011656999588013, "step": 1460 }, { "epoch": 2.613914203156257, "grad_norm": 1.592044711112976, "learning_rate": 1.9979395367644428e-07, "logits/chosen": -20.988916397094727, "logits/rejected": -20.9386043548584, "logps/chosen": -1.2825675010681152, "logps/rejected": -1.444154977798462, "loss": 1.3506, "odds_ratio_loss": 0.6804038286209106, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.12825676798820496, "rewards/margins": 0.01615874283015728, "rewards/rejected": -0.1444154977798462, "sft_loss": 1.2825675010681152, "step": 1470 }, { "epoch": 2.631695932429429, "grad_norm": 3.1699938774108887, "learning_rate": 1.81945361631512e-07, "logits/chosen": -21.14181900024414, "logits/rejected": -21.21465301513672, "logps/chosen": -1.3464009761810303, "logps/rejected": -1.4395297765731812, "loss": 1.4199, "odds_ratio_loss": 0.7345655560493469, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -0.13464009761810303, "rewards/margins": 0.00931286346167326, "rewards/rejected": -0.14395298063755035, "sft_loss": 1.3464009761810303, "step": 1480 }, { "epoch": 2.6494776617026004, "grad_norm": 2.9426472187042236, "learning_rate": 1.6490167940538343e-07, "logits/chosen": -20.980464935302734, "logits/rejected": -20.800823211669922, "logps/chosen": -1.3066675662994385, "logps/rejected": -1.4621120691299438, "loss": 1.3758, "odds_ratio_loss": 0.691235363483429, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -0.13066676259040833, "rewards/margins": 0.015544441528618336, "rewards/rejected": -0.1462111920118332, "sft_loss": 1.3066675662994385, "step": 1490 }, { "epoch": 2.6672593909757722, "grad_norm": 2.0618135929107666, "learning_rate": 1.4866882516191339e-07, "logits/chosen": -20.438335418701172, "logits/rejected": -20.611011505126953, "logps/chosen": -1.3370510339736938, "logps/rejected": -1.4072545766830444, "loss": 1.4105, "odds_ratio_loss": 0.7341033220291138, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.13370510935783386, "rewards/margins": 0.007020360324531794, "rewards/rejected": -0.14072546362876892, "sft_loss": 1.3370510339736938, "step": 1500 }, { "epoch": 2.6672593909757722, "eval_logits/chosen": -20.655466079711914, "eval_logits/rejected": -20.744272232055664, "eval_logps/chosen": -1.3319367170333862, "eval_logps/rejected": -1.5009632110595703, "eval_loss": 1.3996269702911377, "eval_odds_ratio_loss": 0.67690110206604, "eval_rewards/accuracies": 0.5460000038146973, "eval_rewards/chosen": -0.133193701505661, "eval_rewards/margins": 0.016902634873986244, "eval_rewards/rejected": -0.1500963419675827, "eval_runtime": 80.0771, "eval_samples_per_second": 12.488, "eval_sft_loss": 1.3319367170333862, "eval_steps_per_second": 6.244, "step": 1500 }, { "epoch": 2.685041120248944, "grad_norm": 4.3807573318481445, "learning_rate": 1.3325243551706057e-07, "logits/chosen": -20.31595802307129, "logits/rejected": -20.66552734375, "logps/chosen": -1.3314330577850342, "logps/rejected": -1.618486762046814, "loss": 1.3972, "odds_ratio_loss": 0.6577640175819397, "rewards/accuracies": 0.59375, "rewards/chosen": -0.13314330577850342, "rewards/margins": 0.028705382719635963, "rewards/rejected": -0.16184869408607483, "sft_loss": 1.3314330577850342, "step": 1510 }, { "epoch": 2.702822849522116, "grad_norm": 5.399725437164307, "learning_rate": 1.1865786358165737e-07, "logits/chosen": -20.374225616455078, "logits/rejected": -20.766555786132812, "logps/chosen": -1.3559472560882568, "logps/rejected": -1.465380311012268, "loss": 1.4273, "odds_ratio_loss": 0.71399986743927, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13559472560882568, "rewards/margins": 0.010943309403955936, "rewards/rejected": -0.14653804898262024, "sft_loss": 1.3559472560882568, "step": 1520 }, { "epoch": 2.720604578795288, "grad_norm": 5.161293029785156, "learning_rate": 1.0489017710262311e-07, "logits/chosen": -20.828411102294922, "logits/rejected": -21.007801055908203, "logps/chosen": -1.3781417608261108, "logps/rejected": -1.6057850122451782, "loss": 1.4502, "odds_ratio_loss": 0.7206953763961792, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.13781419396400452, "rewards/margins": 0.02276432514190674, "rewards/rejected": -0.16057850420475006, "sft_loss": 1.3781417608261108, "step": 1530 }, { "epoch": 2.73838630806846, "grad_norm": 1.9645308256149292, "learning_rate": 9.195415670326446e-08, "logits/chosen": -20.687061309814453, "logits/rejected": -20.75905990600586, "logps/chosen": -1.3485379219055176, "logps/rejected": -1.5022733211517334, "loss": 1.4177, "odds_ratio_loss": 0.6911865472793579, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1348538100719452, "rewards/margins": 0.015373537316918373, "rewards/rejected": -0.150227352976799, "sft_loss": 1.3485379219055176, "step": 1540 }, { "epoch": 2.7561680373416317, "grad_norm": 2.820127010345459, "learning_rate": 7.985429422327384e-08, "logits/chosen": -20.722209930419922, "logits/rejected": -20.719024658203125, "logps/chosen": -1.3103783130645752, "logps/rejected": -1.3722031116485596, "loss": 1.3834, "odds_ratio_loss": 0.7300290465354919, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.1310378611087799, "rewards/margins": 0.006182484794408083, "rewards/rejected": -0.1372203379869461, "sft_loss": 1.3103783130645752, "step": 1550 }, { "epoch": 2.773949766614803, "grad_norm": 3.8620612621307373, "learning_rate": 6.859479115900818e-08, "logits/chosen": -20.64493179321289, "logits/rejected": -20.700607299804688, "logps/chosen": -1.4513204097747803, "logps/rejected": -1.611519455909729, "loss": 1.5216, "odds_ratio_loss": 0.7024157047271729, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.14513204991817474, "rewards/margins": 0.01601991057395935, "rewards/rejected": -0.1611519604921341, "sft_loss": 1.4513204097747803, "step": 1560 }, { "epoch": 2.791731495887975, "grad_norm": 8.068270683288574, "learning_rate": 5.817955720457902e-08, "logits/chosen": -20.495128631591797, "logits/rejected": -20.559017181396484, "logps/chosen": -1.27366042137146, "logps/rejected": -1.3641878366470337, "loss": 1.3449, "odds_ratio_loss": 0.7120680809020996, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.12736603617668152, "rewards/margins": 0.009052738547325134, "rewards/rejected": -0.13641878962516785, "sft_loss": 1.27366042137146, "step": 1570 }, { "epoch": 2.809513225161147, "grad_norm": 3.788001775741577, "learning_rate": 4.861220889427199e-08, "logits/chosen": -20.778738021850586, "logits/rejected": -20.58936309814453, "logps/chosen": -1.3479427099227905, "logps/rejected": -1.4240316152572632, "loss": 1.423, "odds_ratio_loss": 0.7501288652420044, "rewards/accuracies": 0.543749988079071, "rewards/chosen": -0.13479426503181458, "rewards/margins": 0.007608892861753702, "rewards/rejected": -0.14240317046642303, "sft_loss": 1.3479427099227905, "step": 1580 }, { "epoch": 2.827294954434319, "grad_norm": 1.3882092237472534, "learning_rate": 3.9896068346758074e-08, "logits/chosen": -20.645978927612305, "logits/rejected": -20.691020965576172, "logps/chosen": -1.3436458110809326, "logps/rejected": -1.4700592756271362, "loss": 1.4137, "odds_ratio_loss": 0.7008241415023804, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.13436457514762878, "rewards/margins": 0.012641333043575287, "rewards/rejected": -0.14700593054294586, "sft_loss": 1.3436458110809326, "step": 1590 }, { "epoch": 2.8450766837074903, "grad_norm": 3.9510364532470703, "learning_rate": 3.203416211153832e-08, "logits/chosen": -20.51412582397461, "logits/rejected": -20.81777572631836, "logps/chosen": -1.331721544265747, "logps/rejected": -1.4455146789550781, "loss": 1.4049, "odds_ratio_loss": 0.7322754859924316, "rewards/accuracies": 0.4625000059604645, "rewards/chosen": -0.1331721693277359, "rewards/margins": 0.011379324831068516, "rewards/rejected": -0.14455147087574005, "sft_loss": 1.331721544265747, "step": 1600 }, { "epoch": 2.8628584129806622, "grad_norm": 5.2995758056640625, "learning_rate": 2.5029220118019393e-08, "logits/chosen": -20.452526092529297, "logits/rejected": -20.59510612487793, "logps/chosen": -1.3901276588439941, "logps/rejected": -1.4555182456970215, "loss": 1.4637, "odds_ratio_loss": 0.7356002330780029, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13901275396347046, "rewards/margins": 0.006539070047438145, "rewards/rejected": -0.14555183053016663, "sft_loss": 1.3901276588439941, "step": 1610 }, { "epoch": 2.880640142253834, "grad_norm": 2.9100406169891357, "learning_rate": 1.8883674727586122e-08, "logits/chosen": -20.593090057373047, "logits/rejected": -20.566762924194336, "logps/chosen": -1.2591346502304077, "logps/rejected": -1.5285673141479492, "loss": 1.3231, "odds_ratio_loss": 0.6392361521720886, "rewards/accuracies": 0.625, "rewards/chosen": -0.12591347098350525, "rewards/margins": 0.026943260803818703, "rewards/rejected": -0.1528567224740982, "sft_loss": 1.2591346502304077, "step": 1620 }, { "epoch": 2.898421871527006, "grad_norm": 1.2503418922424316, "learning_rate": 1.3599659889000639e-08, "logits/chosen": -20.94070816040039, "logits/rejected": -20.831439971923828, "logps/chosen": -1.3583745956420898, "logps/rejected": -1.4623037576675415, "loss": 1.4301, "odds_ratio_loss": 0.7174537181854248, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": -0.13583745062351227, "rewards/margins": 0.010392926633358002, "rewards/rejected": -0.14623036980628967, "sft_loss": 1.3583745956420898, "step": 1630 }, { "epoch": 2.916203600800178, "grad_norm": 1.5849334001541138, "learning_rate": 9.179010397421528e-09, "logits/chosen": -20.463802337646484, "logits/rejected": -20.60258674621582, "logps/chosen": -1.3220821619033813, "logps/rejected": -1.4576328992843628, "loss": 1.3923, "odds_ratio_loss": 0.7020986676216125, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": -0.1322081983089447, "rewards/margins": 0.01355508528649807, "rewards/rejected": -0.1457633078098297, "sft_loss": 1.3220821619033813, "step": 1640 }, { "epoch": 2.93398533007335, "grad_norm": 1.5637987852096558, "learning_rate": 5.623261257296509e-09, "logits/chosen": -20.550914764404297, "logits/rejected": -20.748790740966797, "logps/chosen": -1.2091234922409058, "logps/rejected": -1.3757600784301758, "loss": 1.2759, "odds_ratio_loss": 0.6680835485458374, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.1209123507142067, "rewards/margins": 0.01666366681456566, "rewards/rejected": -0.13757601380348206, "sft_loss": 1.2091234922409058, "step": 1650 }, { "epoch": 2.9517670593465217, "grad_norm": 2.060124158859253, "learning_rate": 2.933647149357122e-09, "logits/chosen": -20.7076358795166, "logits/rejected": -20.74884605407715, "logps/chosen": -1.344455361366272, "logps/rejected": -1.4792556762695312, "loss": 1.415, "odds_ratio_loss": 0.7057270407676697, "rewards/accuracies": 0.5, "rewards/chosen": -0.13444553315639496, "rewards/margins": 0.013480030000209808, "rewards/rejected": -0.14792557060718536, "sft_loss": 1.344455361366272, "step": 1660 }, { "epoch": 2.969548788619693, "grad_norm": 1.8397283554077148, "learning_rate": 1.1111020018930717e-09, "logits/chosen": -20.895946502685547, "logits/rejected": -20.76508903503418, "logps/chosen": -1.319896936416626, "logps/rejected": -1.4024416208267212, "loss": 1.3912, "odds_ratio_loss": 0.7127953767776489, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.13198968768119812, "rewards/margins": 0.008254442363977432, "rewards/rejected": -0.14024415612220764, "sft_loss": 1.319896936416626, "step": 1670 }, { "epoch": 2.987330517892865, "grad_norm": 1.2750743627548218, "learning_rate": 1.5625866646051813e-10, "logits/chosen": -20.67104721069336, "logits/rejected": -20.601062774658203, "logps/chosen": -1.2744053602218628, "logps/rejected": -1.4778211116790771, "loss": 1.3388, "odds_ratio_loss": 0.6436463594436646, "rewards/accuracies": 0.59375, "rewards/chosen": -0.12744054198265076, "rewards/margins": 0.02034156210720539, "rewards/rejected": -0.1477821171283722, "sft_loss": 1.2744053602218628, "step": 1680 }, { "epoch": 2.997999555456768, "step": 1686, "total_flos": 5.313908590588723e+17, "train_loss": 1.477055920117832, "train_runtime": 8055.1491, "train_samples_per_second": 3.351, "train_steps_per_second": 0.209 } ], "logging_steps": 10, "max_steps": 1686, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "total_flos": 5.313908590588723e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }