{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 100, "global_step": 1065, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 2.1205010754043525, "learning_rate": 4.672897196261682e-08, "logits/chosen": -2.8477635383605957, "logits/rejected": -2.8469698429107666, "logps/chosen": -522.6112670898438, "logps/rejected": -359.48583984375, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 10.218544680897951, "learning_rate": 4.6728971962616824e-07, "logits/chosen": -2.9212379455566406, "logits/rejected": -2.7965469360351562, "logps/chosen": -313.4451904296875, "logps/rejected": -170.3771209716797, "loss": 0.6932, "rewards/accuracies": 0.5, "rewards/chosen": 0.0002524534647818655, "rewards/margins": 0.0003799269034061581, "rewards/margins_max": 0.0016077507752925158, "rewards/margins_min": -0.0008478969684801996, "rewards/margins_std": 0.0017364051891490817, "rewards/rejected": -0.0001274734386242926, "step": 10 }, { "epoch": 0.06, "grad_norm": 2.0408708876984667, "learning_rate": 9.345794392523365e-07, "logits/chosen": -2.7633142471313477, "logits/rejected": -2.7104804515838623, "logps/chosen": -380.93878173828125, "logps/rejected": -244.42214965820312, "loss": 0.6916, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0017110242042690516, "rewards/margins": 0.002610816154628992, "rewards/margins_max": 0.004759171046316624, "rewards/margins_min": 0.0004624614375643432, "rewards/margins_std": 0.0030382319819182158, "rewards/rejected": -0.0008997917175292969, "step": 20 }, { "epoch": 0.08, "grad_norm": 2.293731718484229, "learning_rate": 1.4018691588785047e-06, "logits/chosen": -2.8749966621398926, "logits/rejected": -2.8233141899108887, "logps/chosen": -375.4239196777344, "logps/rejected": -252.9129638671875, "loss": 0.687, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0067976354621350765, "rewards/margins": 0.009298587217926979, "rewards/margins_max": 0.015676181763410568, "rewards/margins_min": 0.0029209901113063097, "rewards/margins_std": 0.009019283577799797, "rewards/rejected": -0.0025009517557919025, "step": 30 }, { "epoch": 0.11, "grad_norm": 1.9265009094442067, "learning_rate": 1.869158878504673e-06, "logits/chosen": -2.7316184043884277, "logits/rejected": -2.7654078006744385, "logps/chosen": -305.0208740234375, "logps/rejected": -318.15576171875, "loss": 0.6783, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.01904786378145218, "rewards/margins": 0.02529343031346798, "rewards/margins_max": 0.03756815567612648, "rewards/margins_min": 0.013018706813454628, "rewards/margins_std": 0.017359081655740738, "rewards/rejected": -0.006245566997677088, "step": 40 }, { "epoch": 0.14, "grad_norm": 2.2762718753507225, "learning_rate": 2.3364485981308413e-06, "logits/chosen": -2.7840142250061035, "logits/rejected": -2.695960521697998, "logps/chosen": -241.2890167236328, "logps/rejected": -175.4230194091797, "loss": 0.6612, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.039340294897556305, "rewards/margins": 0.05124547332525253, "rewards/margins_max": 0.07519420981407166, "rewards/margins_min": 0.027296727523207664, "rewards/margins_std": 0.03386863321065903, "rewards/rejected": -0.011905180290341377, "step": 50 }, { "epoch": 0.17, "grad_norm": 2.278929693070735, "learning_rate": 2.8037383177570094e-06, "logits/chosen": -2.7337279319763184, "logits/rejected": -2.6699888706207275, "logps/chosen": -257.01812744140625, "logps/rejected": -237.2047119140625, "loss": 0.636, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.10417316108942032, "rewards/margins": 0.12125153839588165, "rewards/margins_max": 0.19414573907852173, "rewards/margins_min": 0.04835732653737068, "rewards/margins_std": 0.10308797657489777, "rewards/rejected": -0.01707836613059044, "step": 60 }, { "epoch": 0.2, "grad_norm": 1.9261684067245632, "learning_rate": 3.2710280373831774e-06, "logits/chosen": -2.6452136039733887, "logits/rejected": -2.649742364883423, "logps/chosen": -320.9119567871094, "logps/rejected": -220.4650421142578, "loss": 0.6066, "rewards/accuracies": 1.0, "rewards/chosen": 0.1302875578403473, "rewards/margins": 0.186918243765831, "rewards/margins_max": 0.2680404782295227, "rewards/margins_min": 0.10579605400562286, "rewards/margins_std": 0.11472412198781967, "rewards/rejected": -0.0566307008266449, "step": 70 }, { "epoch": 0.23, "grad_norm": 1.899604093562728, "learning_rate": 3.738317757009346e-06, "logits/chosen": -2.856180191040039, "logits/rejected": -2.781043291091919, "logps/chosen": -324.0494079589844, "logps/rejected": -299.65643310546875, "loss": 0.5744, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.12999968230724335, "rewards/margins": 0.25530779361724854, "rewards/margins_max": 0.37520045042037964, "rewards/margins_min": 0.13541515171527863, "rewards/margins_std": 0.16955383121967316, "rewards/rejected": -0.12530812621116638, "step": 80 }, { "epoch": 0.25, "grad_norm": 2.438635537156189, "learning_rate": 4.205607476635514e-06, "logits/chosen": -2.6444644927978516, "logits/rejected": -2.6486284732818604, "logps/chosen": -272.92718505859375, "logps/rejected": -228.8600616455078, "loss": 0.523, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04048062115907669, "rewards/margins": 0.29693564772605896, "rewards/margins_max": 0.4845455288887024, "rewards/margins_min": 0.1093258485198021, "rewards/margins_std": 0.265320360660553, "rewards/rejected": -0.25645506381988525, "step": 90 }, { "epoch": 0.28, "grad_norm": 2.676590355830037, "learning_rate": 4.6728971962616825e-06, "logits/chosen": -2.7964138984680176, "logits/rejected": -2.735548973083496, "logps/chosen": -437.5833435058594, "logps/rejected": -379.58123779296875, "loss": 0.4777, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.20675165951251984, "rewards/margins": 0.581081748008728, "rewards/margins_max": 0.8298590779304504, "rewards/margins_min": 0.3323042690753937, "rewards/margins_std": 0.3518243730068207, "rewards/rejected": -0.3743300139904022, "step": 100 }, { "epoch": 0.28, "eval_logits/chosen": -2.670954704284668, "eval_logits/rejected": -2.6312379837036133, "eval_logps/chosen": -321.22222900390625, "eval_logps/rejected": -301.6253967285156, "eval_loss": 0.6754581928253174, "eval_rewards/accuracies": 0.60317462682724, "eval_rewards/chosen": -0.3600099980831146, "eval_rewards/margins": 0.06441720575094223, "eval_rewards/margins_max": 0.35590171813964844, "eval_rewards/margins_min": -0.22098243236541748, "eval_rewards/margins_std": 0.25287726521492004, "eval_rewards/rejected": -0.42442721128463745, "eval_runtime": 283.3412, "eval_samples_per_second": 7.059, "eval_steps_per_second": 0.222, "step": 100 }, { "epoch": 0.31, "grad_norm": 2.5201742608505686, "learning_rate": 4.999879018839288e-06, "logits/chosen": -2.637324810028076, "logits/rejected": -2.529784679412842, "logps/chosen": -315.1212158203125, "logps/rejected": -298.06903076171875, "loss": 0.4234, "rewards/accuracies": 1.0, "rewards/chosen": 0.12577927112579346, "rewards/margins": 0.6422899961471558, "rewards/margins_max": 0.9393427968025208, "rewards/margins_min": 0.3452370762825012, "rewards/margins_std": 0.42009615898132324, "rewards/rejected": -0.5165106058120728, "step": 110 }, { "epoch": 0.34, "grad_norm": 6.261552433653697, "learning_rate": 4.99772856836941e-06, "logits/chosen": -2.7266364097595215, "logits/rejected": -2.7145590782165527, "logps/chosen": -347.3783264160156, "logps/rejected": -389.63299560546875, "loss": 0.3956, "rewards/accuracies": 1.0, "rewards/chosen": 0.24562442302703857, "rewards/margins": 0.8258479237556458, "rewards/margins_max": 1.141953468322754, "rewards/margins_min": 0.5097422003746033, "rewards/margins_std": 0.44704094529151917, "rewards/rejected": -0.5802234411239624, "step": 120 }, { "epoch": 0.37, "grad_norm": 2.5117234961196413, "learning_rate": 4.992892309373227e-06, "logits/chosen": -2.5119540691375732, "logits/rejected": -2.4644391536712646, "logps/chosen": -370.6039733886719, "logps/rejected": -361.2594909667969, "loss": 0.3218, "rewards/accuracies": 1.0, "rewards/chosen": 0.20368309319019318, "rewards/margins": 1.2330464124679565, "rewards/margins_max": 1.4150781631469727, "rewards/margins_min": 1.0510146617889404, "rewards/margins_std": 0.25743168592453003, "rewards/rejected": -1.0293633937835693, "step": 130 }, { "epoch": 0.39, "grad_norm": 5.066809244826759, "learning_rate": 4.985375442281969e-06, "logits/chosen": -2.325155019760132, "logits/rejected": -2.2663826942443848, "logps/chosen": -366.98211669921875, "logps/rejected": -403.01495361328125, "loss": 0.2761, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.1577085703611374, "rewards/margins": 1.5553103685379028, "rewards/margins_max": 2.037226676940918, "rewards/margins_min": 1.0733940601348877, "rewards/margins_std": 0.681532621383667, "rewards/rejected": -1.7130190134048462, "step": 140 }, { "epoch": 0.42, "grad_norm": 7.190427764349362, "learning_rate": 4.9751860499858175e-06, "logits/chosen": -2.1403324604034424, "logits/rejected": -2.041670560836792, "logps/chosen": -324.15667724609375, "logps/rejected": -441.0560607910156, "loss": 0.2399, "rewards/accuracies": 1.0, "rewards/chosen": -0.27334439754486084, "rewards/margins": 1.659519910812378, "rewards/margins_max": 2.2249293327331543, "rewards/margins_min": 1.0941104888916016, "rewards/margins_std": 0.7996099591255188, "rewards/rejected": -1.9328645467758179, "step": 150 }, { "epoch": 0.45, "grad_norm": 7.116224539942571, "learning_rate": 4.962335089142376e-06, "logits/chosen": -1.9535696506500244, "logits/rejected": -1.7718425989151, "logps/chosen": -358.6165466308594, "logps/rejected": -501.46856689453125, "loss": 0.1556, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.26896899938583374, "rewards/margins": 2.3143906593322754, "rewards/margins_max": 2.8530867099761963, "rewards/margins_min": 1.7756941318511963, "rewards/margins_std": 0.7618317008018494, "rewards/rejected": -2.5833592414855957, "step": 160 }, { "epoch": 0.48, "grad_norm": 12.210481387434758, "learning_rate": 4.946836378394967e-06, "logits/chosen": -1.838096022605896, "logits/rejected": -1.5799922943115234, "logps/chosen": -445.1002502441406, "logps/rejected": -597.6307373046875, "loss": 0.1406, "rewards/accuracies": 1.0, "rewards/chosen": -0.4461892545223236, "rewards/margins": 3.19466233253479, "rewards/margins_max": 4.110939979553223, "rewards/margins_min": 2.2783844470977783, "rewards/margins_std": 1.2958126068115234, "rewards/rejected": -3.6408514976501465, "step": 170 }, { "epoch": 0.51, "grad_norm": 27.562973883397905, "learning_rate": 4.928706583513441e-06, "logits/chosen": -1.3463890552520752, "logits/rejected": -1.2715332508087158, "logps/chosen": -605.5383911132812, "logps/rejected": -967.7098388671875, "loss": 0.1672, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.81402325630188, "rewards/margins": 3.0660033226013184, "rewards/margins_max": 3.8246688842773438, "rewards/margins_min": 2.307338237762451, "rewards/margins_std": 1.072914719581604, "rewards/rejected": -5.880026817321777, "step": 180 }, { "epoch": 0.54, "grad_norm": 3.9080684244028343, "learning_rate": 4.907965199473471e-06, "logits/chosen": -1.3362934589385986, "logits/rejected": -1.0377042293548584, "logps/chosen": -732.0992431640625, "logps/rejected": -907.0653076171875, "loss": 0.131, "rewards/accuracies": 1.0, "rewards/chosen": -2.7598698139190674, "rewards/margins": 4.08551549911499, "rewards/margins_max": 4.806515693664551, "rewards/margins_min": 3.3645145893096924, "rewards/margins_std": 1.019648551940918, "rewards/rejected": -6.8453850746154785, "step": 190 }, { "epoch": 0.56, "grad_norm": 42.83035382744783, "learning_rate": 4.884634529493591e-06, "logits/chosen": -1.4783378839492798, "logits/rejected": -1.2933928966522217, "logps/chosen": -735.5909423828125, "logps/rejected": -1023.0391845703125, "loss": 0.1416, "rewards/accuracies": 1.0, "rewards/chosen": -4.047953128814697, "rewards/margins": 4.137004375457764, "rewards/margins_max": 5.257144927978516, "rewards/margins_min": 3.0168652534484863, "rewards/margins_std": 1.5841166973114014, "rewards/rejected": -8.184958457946777, "step": 200 }, { "epoch": 0.56, "eval_logits/chosen": -1.4607926607131958, "eval_logits/rejected": -1.4055131673812866, "eval_logps/chosen": -955.6170043945312, "eval_logps/rejected": -980.7882080078125, "eval_loss": 0.9053447246551514, "eval_rewards/accuracies": 0.6269841194152832, "eval_rewards/chosen": -6.703957557678223, "eval_rewards/margins": 0.5120973587036133, "eval_rewards/margins_max": 2.7698452472686768, "eval_rewards/margins_min": -1.7983918190002441, "eval_rewards/margins_std": 2.0239174365997314, "eval_rewards/rejected": -7.216055393218994, "eval_runtime": 281.707, "eval_samples_per_second": 7.1, "eval_steps_per_second": 0.224, "step": 200 }, { "epoch": 0.59, "grad_norm": 11.323675041923366, "learning_rate": 4.858739661052539e-06, "logits/chosen": -1.350990891456604, "logits/rejected": -1.2011955976486206, "logps/chosen": -738.5956420898438, "logps/rejected": -1072.1134033203125, "loss": 0.1359, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -3.817591905593872, "rewards/margins": 4.215450286865234, "rewards/margins_max": 6.099488735198975, "rewards/margins_min": 2.3314108848571777, "rewards/margins_std": 2.664433240890503, "rewards/rejected": -8.033041000366211, "step": 210 }, { "epoch": 0.62, "grad_norm": 2.145861603880887, "learning_rate": 4.830308438912687e-06, "logits/chosen": -1.5942816734313965, "logits/rejected": -1.3603050708770752, "logps/chosen": -854.7412109375, "logps/rejected": -1243.659423828125, "loss": 0.0774, "rewards/accuracies": 1.0, "rewards/chosen": -4.454717636108398, "rewards/margins": 5.1989240646362305, "rewards/margins_max": 6.37256383895874, "rewards/margins_min": 4.025284290313721, "rewards/margins_std": 1.6597778797149658, "rewards/rejected": -9.653641700744629, "step": 220 }, { "epoch": 0.65, "grad_norm": 4.962012371252307, "learning_rate": 4.799371435178544e-06, "logits/chosen": -1.7452170848846436, "logits/rejected": -1.609167456626892, "logps/chosen": -769.598876953125, "logps/rejected": -1189.131103515625, "loss": 0.104, "rewards/accuracies": 1.0, "rewards/chosen": -3.888404130935669, "rewards/margins": 4.6370439529418945, "rewards/margins_max": 5.980400085449219, "rewards/margins_min": 3.293687343597412, "rewards/margins_std": 1.8997926712036133, "rewards/rejected": -8.5254487991333, "step": 230 }, { "epoch": 0.68, "grad_norm": 2.001005873458455, "learning_rate": 4.765961916422575e-06, "logits/chosen": -1.6597576141357422, "logits/rejected": -1.444551944732666, "logps/chosen": -838.1024169921875, "logps/rejected": -1238.279052734375, "loss": 0.0955, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -4.969546318054199, "rewards/margins": 4.475127696990967, "rewards/margins_max": 5.603785514831543, "rewards/margins_min": 3.346471071243286, "rewards/margins_std": 1.59616219997406, "rewards/rejected": -9.444674491882324, "step": 240 }, { "epoch": 0.7, "grad_norm": 17.06427775193877, "learning_rate": 4.730115807913627e-06, "logits/chosen": -1.6722052097320557, "logits/rejected": -1.393259882926941, "logps/chosen": -916.7503662109375, "logps/rejected": -1274.2889404296875, "loss": 0.0866, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.015233039855957, "rewards/margins": 5.172359943389893, "rewards/margins_max": 6.111589431762695, "rewards/margins_min": 4.233129501342773, "rewards/margins_std": 1.328271508216858, "rewards/rejected": -10.187592506408691, "step": 250 }, { "epoch": 0.73, "grad_norm": 1.9182916124757974, "learning_rate": 4.691871654986485e-06, "logits/chosen": -1.7107824087142944, "logits/rejected": -1.6128714084625244, "logps/chosen": -878.5494384765625, "logps/rejected": -1255.8555908203125, "loss": 0.079, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -5.5672712326049805, "rewards/margins": 4.748871803283691, "rewards/margins_max": 5.786838531494141, "rewards/margins_min": 3.7109055519104004, "rewards/margins_std": 1.4679062366485596, "rewards/rejected": -10.316143035888672, "step": 260 }, { "epoch": 0.76, "grad_norm": 14.786553042508123, "learning_rate": 4.651270581594054e-06, "logits/chosen": -1.8650672435760498, "logits/rejected": -1.613443374633789, "logps/chosen": -834.0842895507812, "logps/rejected": -1138.3665771484375, "loss": 0.0875, "rewards/accuracies": 1.0, "rewards/chosen": -4.050877571105957, "rewards/margins": 5.007403373718262, "rewards/margins_max": 5.84472131729126, "rewards/margins_min": 4.170086860656738, "rewards/margins_std": 1.184145212173462, "rewards/rejected": -9.058280944824219, "step": 270 }, { "epoch": 0.79, "grad_norm": 5.30439894597876, "learning_rate": 4.6083562460867545e-06, "logits/chosen": -1.6716859340667725, "logits/rejected": -1.5429413318634033, "logps/chosen": -701.3162841796875, "logps/rejected": -1120.8736572265625, "loss": 0.0896, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.7223961353302, "rewards/margins": 4.8294267654418945, "rewards/margins_max": 6.9812211990356445, "rewards/margins_min": 2.6776328086853027, "rewards/margins_std": 3.0430965423583984, "rewards/rejected": -8.551824569702148, "step": 280 }, { "epoch": 0.82, "grad_norm": 12.724182318476426, "learning_rate": 4.563174794266684e-06, "logits/chosen": -1.8460794687271118, "logits/rejected": -1.6377445459365845, "logps/chosen": -858.4215698242188, "logps/rejected": -1289.198974609375, "loss": 0.0576, "rewards/accuracies": 1.0, "rewards/chosen": -4.965760231018066, "rewards/margins": 5.2121992111206055, "rewards/margins_max": 6.927371025085449, "rewards/margins_min": 3.49702525138855, "rewards/margins_std": 2.4256205558776855, "rewards/rejected": -10.177958488464355, "step": 290 }, { "epoch": 0.85, "grad_norm": 5.778488241840074, "learning_rate": 4.5157748097670125e-06, "logits/chosen": -1.7077114582061768, "logits/rejected": -1.5558173656463623, "logps/chosen": -739.67333984375, "logps/rejected": -1423.210693359375, "loss": 0.0426, "rewards/accuracies": 1.0, "rewards/chosen": -4.0192999839782715, "rewards/margins": 7.085653781890869, "rewards/margins_max": 7.969016075134277, "rewards/margins_min": 6.202291488647461, "rewards/margins_std": 1.2492637634277344, "rewards/rejected": -11.104954719543457, "step": 300 }, { "epoch": 0.85, "eval_logits/chosen": -1.7101370096206665, "eval_logits/rejected": -1.6507517099380493, "eval_logps/chosen": -1041.5823974609375, "eval_logps/rejected": -1121.1776123046875, "eval_loss": 0.9213338494300842, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": -7.563611030578613, "eval_rewards/margins": 1.0563386678695679, "eval_rewards/margins_max": 4.265172481536865, "eval_rewards/margins_min": -2.1614327430725098, "eval_rewards/margins_std": 2.8564813137054443, "eval_rewards/rejected": -8.619950294494629, "eval_runtime": 281.7456, "eval_samples_per_second": 7.099, "eval_steps_per_second": 0.224, "step": 300 }, { "epoch": 0.87, "grad_norm": 12.853675144552225, "learning_rate": 4.466207261809989e-06, "logits/chosen": -1.9336496591567993, "logits/rejected": -1.6221659183502197, "logps/chosen": -901.4439697265625, "logps/rejected": -1262.938720703125, "loss": 0.0633, "rewards/accuracies": 1.0, "rewards/chosen": -4.771965980529785, "rewards/margins": 5.247581958770752, "rewards/margins_max": 6.526535987854004, "rewards/margins_min": 3.968628406524658, "rewards/margins_std": 1.8087135553359985, "rewards/rejected": -10.019546508789062, "step": 310 }, { "epoch": 0.9, "grad_norm": 12.332833632235157, "learning_rate": 4.414525450399713e-06, "logits/chosen": -1.6821091175079346, "logits/rejected": -1.511785626411438, "logps/chosen": -956.3181762695312, "logps/rejected": -1481.1754150390625, "loss": 0.0978, "rewards/accuracies": 1.0, "rewards/chosen": -6.100653171539307, "rewards/margins": 6.3301496505737305, "rewards/margins_max": 8.061585426330566, "rewards/margins_min": 4.598714828491211, "rewards/margins_std": 2.4486188888549805, "rewards/rejected": -12.430803298950195, "step": 320 }, { "epoch": 0.93, "grad_norm": 3.9044155848949162, "learning_rate": 4.360784949008615e-06, "logits/chosen": -1.768561601638794, "logits/rejected": -1.5437813997268677, "logps/chosen": -1006.9339599609375, "logps/rejected": -1522.902587890625, "loss": 0.1091, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.272473335266113, "rewards/margins": 6.482227325439453, "rewards/margins_max": 8.401371002197266, "rewards/margins_min": 4.563082695007324, "rewards/margins_std": 2.7140800952911377, "rewards/rejected": -12.754700660705566, "step": 330 }, { "epoch": 0.96, "grad_norm": 4.01171637277802, "learning_rate": 4.30504354481929e-06, "logits/chosen": -1.7665777206420898, "logits/rejected": -1.5484760999679565, "logps/chosen": -942.85888671875, "logps/rejected": -1260.244384765625, "loss": 0.0741, "rewards/accuracies": 1.0, "rewards/chosen": -5.743631839752197, "rewards/margins": 4.815784931182861, "rewards/margins_max": 6.530648708343506, "rewards/margins_min": 3.1009204387664795, "rewards/margins_std": 2.425184488296509, "rewards/rejected": -10.559415817260742, "step": 340 }, { "epoch": 0.99, "grad_norm": 12.659683176327913, "learning_rate": 4.247361176585904e-06, "logits/chosen": -1.831321120262146, "logits/rejected": -1.6549314260482788, "logps/chosen": -909.5006713867188, "logps/rejected": -1532.635986328125, "loss": 0.0943, "rewards/accuracies": 1.0, "rewards/chosen": -4.810971736907959, "rewards/margins": 7.531504154205322, "rewards/margins_max": 8.548044204711914, "rewards/margins_min": 6.514962673187256, "rewards/margins_std": 1.4376055002212524, "rewards/rejected": -12.342476844787598, "step": 350 }, { "epoch": 1.01, "grad_norm": 3.001942641389469, "learning_rate": 4.187799870182038e-06, "logits/chosen": -1.7835716009140015, "logits/rejected": -1.5620241165161133, "logps/chosen": -896.9002075195312, "logps/rejected": -1392.6307373046875, "loss": 0.0555, "rewards/accuracies": 1.0, "rewards/chosen": -5.5069427490234375, "rewards/margins": 6.391612529754639, "rewards/margins_max": 7.894322872161865, "rewards/margins_min": 4.888903617858887, "rewards/margins_std": 2.125152349472046, "rewards/rejected": -11.898555755615234, "step": 360 }, { "epoch": 1.04, "grad_norm": 34.14422714120664, "learning_rate": 4.1264236719042365e-06, "logits/chosen": -1.5919651985168457, "logits/rejected": -1.5377094745635986, "logps/chosen": -915.7950439453125, "logps/rejected": -1490.6865234375, "loss": 0.0808, "rewards/accuracies": 1.0, "rewards/chosen": -5.6413750648498535, "rewards/margins": 6.627654075622559, "rewards/margins_max": 8.43530559539795, "rewards/margins_min": 4.820002555847168, "rewards/margins_std": 2.5564048290252686, "rewards/rejected": -12.26902961730957, "step": 370 }, { "epoch": 1.07, "grad_norm": 2.1290534012360847, "learning_rate": 4.063298579603001e-06, "logits/chosen": -1.8492443561553955, "logits/rejected": -1.5422757863998413, "logps/chosen": -937.0126953125, "logps/rejected": -1458.616455078125, "loss": 0.0231, "rewards/accuracies": 1.0, "rewards/chosen": -5.460320949554443, "rewards/margins": 7.281059265136719, "rewards/margins_max": 8.49816608428955, "rewards/margins_min": 6.0639543533325195, "rewards/margins_std": 1.7212467193603516, "rewards/rejected": -12.74138069152832, "step": 380 }, { "epoch": 1.1, "grad_norm": 5.584775064800199, "learning_rate": 3.998492471715272e-06, "logits/chosen": -1.8397998809814453, "logits/rejected": -1.6857073307037354, "logps/chosen": -913.9352416992188, "logps/rejected": -1781.8939208984375, "loss": 0.0278, "rewards/accuracies": 1.0, "rewards/chosen": -5.164222717285156, "rewards/margins": 9.338297843933105, "rewards/margins_max": 11.463502883911133, "rewards/margins_min": 7.2130937576293945, "rewards/margins_std": 3.005493640899658, "rewards/rejected": -14.502520561218262, "step": 390 }, { "epoch": 1.13, "grad_norm": 0.9893449328848739, "learning_rate": 3.932075034274723e-06, "logits/chosen": -1.5922348499298096, "logits/rejected": -1.4688727855682373, "logps/chosen": -871.9650268554688, "logps/rejected": -1526.658935546875, "loss": 0.0537, "rewards/accuracies": 1.0, "rewards/chosen": -5.7322564125061035, "rewards/margins": 7.261972904205322, "rewards/margins_max": 8.895970344543457, "rewards/margins_min": 5.627974510192871, "rewards/margins_std": 2.3108224868774414, "rewards/rejected": -12.994227409362793, "step": 400 }, { "epoch": 1.13, "eval_logits/chosen": -1.6575742959976196, "eval_logits/rejected": -1.5926053524017334, "eval_logps/chosen": -1505.182861328125, "eval_logps/rejected": -1577.3876953125, "eval_loss": 1.1419050693511963, "eval_rewards/accuracies": 0.64682537317276, "eval_rewards/chosen": -12.199617385864258, "eval_rewards/margins": 0.9824325442314148, "eval_rewards/margins_max": 5.48787260055542, "eval_rewards/margins_min": -3.0621237754821777, "eval_rewards/margins_std": 3.7889323234558105, "eval_rewards/rejected": -13.182049751281738, "eval_runtime": 282.4562, "eval_samples_per_second": 7.081, "eval_steps_per_second": 0.223, "step": 400 }, { "epoch": 1.15, "grad_norm": 0.9794540017501292, "learning_rate": 3.864117685978339e-06, "logits/chosen": -1.6234560012817383, "logits/rejected": -1.4928052425384521, "logps/chosen": -1131.8265380859375, "logps/rejected": -1794.791015625, "loss": 0.0776, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -8.371360778808594, "rewards/margins": 7.494576454162598, "rewards/margins_max": 10.048029899597168, "rewards/margins_min": 4.941121578216553, "rewards/margins_std": 3.61112904548645, "rewards/rejected": -15.865939140319824, "step": 410 }, { "epoch": 1.18, "grad_norm": 5.020955613205059, "learning_rate": 3.794693501389861e-06, "logits/chosen": -1.7987747192382812, "logits/rejected": -1.6164734363555908, "logps/chosen": -1037.0328369140625, "logps/rejected": -1667.540283203125, "loss": 0.054, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.372786045074463, "rewards/margins": 7.646895408630371, "rewards/margins_max": 8.891626358032227, "rewards/margins_min": 6.402162075042725, "rewards/margins_std": 1.7603172063827515, "rewards/rejected": -14.019680976867676, "step": 420 }, { "epoch": 1.21, "grad_norm": 15.978168852619268, "learning_rate": 3.7238771323626822e-06, "logits/chosen": -1.6425611972808838, "logits/rejected": -1.4570006132125854, "logps/chosen": -1138.6572265625, "logps/rejected": -1780.6002197265625, "loss": 0.044, "rewards/accuracies": 1.0, "rewards/chosen": -7.515681266784668, "rewards/margins": 7.655673027038574, "rewards/margins_max": 9.563043594360352, "rewards/margins_min": 5.748303413391113, "rewards/margins_std": 2.6974284648895264, "rewards/rejected": -15.171353340148926, "step": 430 }, { "epoch": 1.24, "grad_norm": 1.4394479904186748, "learning_rate": 3.651744727766676e-06, "logits/chosen": -1.565843939781189, "logits/rejected": -1.3031253814697266, "logps/chosen": -1135.116943359375, "logps/rejected": -1897.188232421875, "loss": 0.0356, "rewards/accuracies": 1.0, "rewards/chosen": -8.198633193969727, "rewards/margins": 8.82483196258545, "rewards/margins_max": 11.5381441116333, "rewards/margins_min": 6.1115217208862305, "rewards/margins_std": 3.8372015953063965, "rewards/rejected": -17.023466110229492, "step": 440 }, { "epoch": 1.27, "grad_norm": 2.5233082457705853, "learning_rate": 3.57837385160529e-06, "logits/chosen": -1.6333341598510742, "logits/rejected": -1.419213056564331, "logps/chosen": -991.2794799804688, "logps/rejected": -1686.808837890625, "loss": 0.0246, "rewards/accuracies": 1.0, "rewards/chosen": -6.5310492515563965, "rewards/margins": 7.6606926918029785, "rewards/margins_max": 9.670614242553711, "rewards/margins_min": 5.650770664215088, "rewards/margins_std": 2.842459201812744, "rewards/rejected": -14.191740036010742, "step": 450 }, { "epoch": 1.3, "grad_norm": 1.432241857413985, "learning_rate": 3.503843399610941e-06, "logits/chosen": -1.6662094593048096, "logits/rejected": -1.5159740447998047, "logps/chosen": -1023.26220703125, "logps/rejected": -1997.1787109375, "loss": 0.0208, "rewards/accuracies": 1.0, "rewards/chosen": -6.279843330383301, "rewards/margins": 9.666014671325684, "rewards/margins_max": 11.908063888549805, "rewards/margins_min": 7.423966407775879, "rewards/margins_std": 3.1707358360290527, "rewards/rejected": -15.945857048034668, "step": 460 }, { "epoch": 1.32, "grad_norm": 1.3845844015706055, "learning_rate": 3.4282335144083985e-06, "logits/chosen": -1.5941836833953857, "logits/rejected": -1.34697425365448, "logps/chosen": -1180.2171630859375, "logps/rejected": -1964.836181640625, "loss": 0.0304, "rewards/accuracies": 1.0, "rewards/chosen": -8.229662895202637, "rewards/margins": 9.211896896362305, "rewards/margins_max": 11.3733549118042, "rewards/margins_min": 7.050437927246094, "rewards/margins_std": 3.0567641258239746, "rewards/rejected": -17.441558837890625, "step": 470 }, { "epoch": 1.35, "grad_norm": 0.25091350074864577, "learning_rate": 3.351625499337395e-06, "logits/chosen": -1.7405236959457397, "logits/rejected": -1.4616386890411377, "logps/chosen": -1157.209716796875, "logps/rejected": -1899.130126953125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -7.413580417633057, "rewards/margins": 8.81358528137207, "rewards/margins_max": 10.952999114990234, "rewards/margins_min": 6.674172401428223, "rewards/margins_std": 3.0255870819091797, "rewards/rejected": -16.227169036865234, "step": 480 }, { "epoch": 1.38, "grad_norm": 1.9987349085330508, "learning_rate": 3.2741017310271056e-06, "logits/chosen": -1.3325449228286743, "logits/rejected": -1.044908881187439, "logps/chosen": -1130.028076171875, "logps/rejected": -2392.521728515625, "loss": 0.0448, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -9.127466201782227, "rewards/margins": 12.631993293762207, "rewards/margins_max": 19.333314895629883, "rewards/margins_min": 5.930669784545898, "rewards/margins_std": 9.47710132598877, "rewards/rejected": -21.759456634521484, "step": 490 }, { "epoch": 1.41, "grad_norm": 1.7094204242814826, "learning_rate": 3.195745570816532e-06, "logits/chosen": -1.3385294675827026, "logits/rejected": -1.144627571105957, "logps/chosen": -1425.61474609375, "logps/rejected": -2558.358642578125, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -10.41409969329834, "rewards/margins": 12.790387153625488, "rewards/margins_max": 14.778757095336914, "rewards/margins_min": 10.802019119262695, "rewards/margins_std": 2.811978340148926, "rewards/rejected": -23.204486846923828, "step": 500 }, { "epoch": 1.41, "eval_logits/chosen": -1.5026105642318726, "eval_logits/rejected": -1.4330366849899292, "eval_logps/chosen": -2000.166259765625, "eval_logps/rejected": -2146.479736328125, "eval_loss": 1.684375524520874, "eval_rewards/accuracies": 0.6666666865348816, "eval_rewards/chosen": -17.149450302124023, "eval_rewards/margins": 1.7235194444656372, "eval_rewards/margins_max": 9.41946029663086, "eval_rewards/margins_min": -5.146158218383789, "eval_rewards/margins_std": 6.577420711517334, "eval_rewards/rejected": -18.872970581054688, "eval_runtime": 282.6761, "eval_samples_per_second": 7.075, "eval_steps_per_second": 0.223, "step": 500 }, { "epoch": 1.44, "grad_norm": 19.195207569920772, "learning_rate": 3.116641275116018e-06, "logits/chosen": -1.2405312061309814, "logits/rejected": -0.9798258543014526, "logps/chosen": -1318.967041015625, "logps/rejected": -3077.10986328125, "loss": 0.0229, "rewards/accuracies": 1.0, "rewards/chosen": -10.26286792755127, "rewards/margins": 17.355688095092773, "rewards/margins_max": 25.170244216918945, "rewards/margins_min": 9.541135787963867, "rewards/margins_std": 11.051448822021484, "rewards/rejected": -27.618555068969727, "step": 510 }, { "epoch": 1.46, "grad_norm": 18.23076880980296, "learning_rate": 3.0368739048062956e-06, "logits/chosen": -1.6826045513153076, "logits/rejected": -1.4554195404052734, "logps/chosen": -1159.925048828125, "logps/rejected": -2069.19580078125, "loss": 0.0355, "rewards/accuracies": 1.0, "rewards/chosen": -8.183090209960938, "rewards/margins": 10.176679611206055, "rewards/margins_max": 13.777229309082031, "rewards/margins_min": 6.5761308670043945, "rewards/margins_std": 5.091946125030518, "rewards/rejected": -18.359769821166992, "step": 520 }, { "epoch": 1.49, "grad_norm": 7.345312333811953, "learning_rate": 2.956529233772492e-06, "logits/chosen": -1.6696984767913818, "logits/rejected": -1.566896915435791, "logps/chosen": -1206.398681640625, "logps/rejected": -2070.3857421875, "loss": 0.0184, "rewards/accuracies": 1.0, "rewards/chosen": -8.273930549621582, "rewards/margins": 9.733041763305664, "rewards/margins_max": 12.174661636352539, "rewards/margins_min": 7.291422367095947, "rewards/margins_std": 3.4529712200164795, "rewards/rejected": -18.006973266601562, "step": 530 }, { "epoch": 1.52, "grad_norm": 21.78105244485373, "learning_rate": 2.8756936566714317e-06, "logits/chosen": -1.8572250604629517, "logits/rejected": -1.5829768180847168, "logps/chosen": -1132.333740234375, "logps/rejected": -1908.844970703125, "loss": 0.0256, "rewards/accuracies": 1.0, "rewards/chosen": -7.327805519104004, "rewards/margins": 9.385960578918457, "rewards/margins_max": 10.629077911376953, "rewards/margins_min": 8.142843246459961, "rewards/margins_std": 1.7580335140228271, "rewards/rejected": -16.713764190673828, "step": 540 }, { "epoch": 1.55, "grad_norm": 0.0011589092808777935, "learning_rate": 2.794454096031429e-06, "logits/chosen": -1.7256653308868408, "logits/rejected": -1.5292785167694092, "logps/chosen": -1160.131591796875, "logps/rejected": -2000.1337890625, "loss": 0.0223, "rewards/accuracies": 1.0, "rewards/chosen": -8.491829872131348, "rewards/margins": 8.8389892578125, "rewards/margins_max": 10.393911361694336, "rewards/margins_min": 7.284067630767822, "rewards/margins_std": 2.1989917755126953, "rewards/rejected": -17.33081817626953, "step": 550 }, { "epoch": 1.58, "grad_norm": 1.1029358007262624, "learning_rate": 2.71289790878446e-06, "logits/chosen": -1.5588399171829224, "logits/rejected": -1.3718044757843018, "logps/chosen": -1313.054443359375, "logps/rejected": -2318.33544921875, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -9.963714599609375, "rewards/margins": 9.831637382507324, "rewards/margins_max": 12.691813468933105, "rewards/margins_min": 6.971460819244385, "rewards/margins_std": 4.044900894165039, "rewards/rejected": -19.795352935791016, "step": 560 }, { "epoch": 1.61, "grad_norm": 0.032589510422147, "learning_rate": 2.6311127923312156e-06, "logits/chosen": -1.7382599115371704, "logits/rejected": -1.5052683353424072, "logps/chosen": -1249.270263671875, "logps/rejected": -2084.659912109375, "loss": 0.0177, "rewards/accuracies": 1.0, "rewards/chosen": -8.170693397521973, "rewards/margins": 9.51733684539795, "rewards/margins_max": 11.196283340454102, "rewards/margins_min": 7.8383917808532715, "rewards/margins_std": 2.374387741088867, "rewards/rejected": -17.68802833557129, "step": 570 }, { "epoch": 1.63, "grad_norm": 12.99158263963332, "learning_rate": 2.549186690240057e-06, "logits/chosen": -1.610082983970642, "logits/rejected": -1.3717553615570068, "logps/chosen": -1186.931884765625, "logps/rejected": -2215.44970703125, "loss": 0.0096, "rewards/accuracies": 1.0, "rewards/chosen": -8.884663581848145, "rewards/margins": 11.055347442626953, "rewards/margins_max": 13.794784545898438, "rewards/margins_min": 8.315912246704102, "rewards/margins_std": 3.874147891998291, "rewards/rejected": -19.94001007080078, "step": 580 }, { "epoch": 1.66, "grad_norm": 0.09893386521593805, "learning_rate": 2.4672076976812548e-06, "logits/chosen": -1.504370927810669, "logits/rejected": -1.24093759059906, "logps/chosen": -1294.6529541015625, "logps/rejected": -2374.53271484375, "loss": 0.0182, "rewards/accuracies": 1.0, "rewards/chosen": -9.253921508789062, "rewards/margins": 11.56922721862793, "rewards/margins_max": 16.103586196899414, "rewards/margins_min": 7.034867763519287, "rewards/margins_std": 6.412552833557129, "rewards/rejected": -20.823148727416992, "step": 590 }, { "epoch": 1.69, "grad_norm": 1.4677452546622722, "learning_rate": 2.3852639666982218e-06, "logits/chosen": -1.5387322902679443, "logits/rejected": -1.3424365520477295, "logps/chosen": -1172.688232421875, "logps/rejected": -2390.56689453125, "loss": 0.0029, "rewards/accuracies": 1.0, "rewards/chosen": -8.940356254577637, "rewards/margins": 12.360175132751465, "rewards/margins_max": 14.774116516113281, "rewards/margins_min": 9.946235656738281, "rewards/margins_std": 3.4138267040252686, "rewards/rejected": -21.300533294677734, "step": 600 }, { "epoch": 1.69, "eval_logits/chosen": -1.5330660343170166, "eval_logits/rejected": -1.4547291994094849, "eval_logps/chosen": -1739.8331298828125, "eval_logps/rejected": -2005.7900390625, "eval_loss": 1.9743393659591675, "eval_rewards/accuracies": 0.6865079402923584, "eval_rewards/chosen": -14.546117782592773, "eval_rewards/margins": 2.9199535846710205, "eval_rewards/margins_max": 12.400845527648926, "eval_rewards/margins_min": -5.716708660125732, "eval_rewards/margins_std": 8.164259910583496, "eval_rewards/rejected": -17.46607208251953, "eval_runtime": 281.995, "eval_samples_per_second": 7.092, "eval_steps_per_second": 0.223, "step": 600 }, { "epoch": 1.72, "grad_norm": 16.662428863900104, "learning_rate": 2.303443611417584e-06, "logits/chosen": -1.2892029285430908, "logits/rejected": -1.0749212503433228, "logps/chosen": -1583.099609375, "logps/rejected": -2742.760498046875, "loss": 0.3581, "rewards/accuracies": 1.0, "rewards/chosen": -12.176101684570312, "rewards/margins": 12.56828498840332, "rewards/margins_max": 17.369625091552734, "rewards/margins_min": 7.766943454742432, "rewards/margins_std": 6.790121555328369, "rewards/rejected": -24.744388580322266, "step": 610 }, { "epoch": 1.75, "grad_norm": 0.1502185307527533, "learning_rate": 2.2218346133000264e-06, "logits/chosen": -1.1851621866226196, "logits/rejected": -0.8747516870498657, "logps/chosen": -1684.5989990234375, "logps/rejected": -2998.321044921875, "loss": 0.0851, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -13.733156204223633, "rewards/margins": 14.27801513671875, "rewards/margins_max": 20.737751007080078, "rewards/margins_min": 7.818281650543213, "rewards/margins_std": 9.135442733764648, "rewards/rejected": -28.011173248291016, "step": 620 }, { "epoch": 1.77, "grad_norm": 0.608737783564001, "learning_rate": 2.140524726533792e-06, "logits/chosen": -1.4635207653045654, "logits/rejected": -1.206559658050537, "logps/chosen": -1263.6993408203125, "logps/rejected": -2158.978759765625, "loss": 0.0474, "rewards/accuracies": 1.0, "rewards/chosen": -8.689355850219727, "rewards/margins": 10.659037590026855, "rewards/margins_max": 13.989839553833008, "rewards/margins_min": 7.3282365798950195, "rewards/margins_std": 4.710465431213379, "rewards/rejected": -19.3483943939209, "step": 630 }, { "epoch": 1.8, "grad_norm": 37.51094566818964, "learning_rate": 2.059601383672566e-06, "logits/chosen": -1.6980371475219727, "logits/rejected": -1.5178521871566772, "logps/chosen": -964.2796630859375, "logps/rejected": -1743.4036865234375, "loss": 0.0669, "rewards/accuracies": 1.0, "rewards/chosen": -6.6180419921875, "rewards/margins": 8.817036628723145, "rewards/margins_max": 10.244000434875488, "rewards/margins_min": 7.390072822570801, "rewards/margins_std": 2.018031597137451, "rewards/rejected": -15.435079574584961, "step": 640 }, { "epoch": 1.83, "grad_norm": 0.824336798291059, "learning_rate": 1.9791516016192214e-06, "logits/chosen": -1.8461487293243408, "logits/rejected": -1.5655087232589722, "logps/chosen": -941.0548706054688, "logps/rejected": -1621.322265625, "loss": 0.0587, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -6.0665507316589355, "rewards/margins": 7.918545722961426, "rewards/margins_max": 10.15103530883789, "rewards/margins_min": 5.6860551834106445, "rewards/margins_std": 3.157217502593994, "rewards/rejected": -13.985095024108887, "step": 650 }, { "epoch": 1.86, "grad_norm": 0.2329366656877762, "learning_rate": 1.8992618880565039e-06, "logits/chosen": -1.4127376079559326, "logits/rejected": -1.204310655593872, "logps/chosen": -974.7972412109375, "logps/rejected": -1706.96484375, "loss": 0.0472, "rewards/accuracies": 1.0, "rewards/chosen": -6.5077385902404785, "rewards/margins": 8.661420822143555, "rewards/margins_max": 11.35025691986084, "rewards/margins_min": 5.972585678100586, "rewards/margins_std": 3.8025870323181152, "rewards/rejected": -15.169160842895508, "step": 660 }, { "epoch": 1.89, "grad_norm": 0.2766932797893532, "learning_rate": 1.8200181484252888e-06, "logits/chosen": -1.6775104999542236, "logits/rejected": -1.5603760480880737, "logps/chosen": -1146.943603515625, "logps/rejected": -2180.825927734375, "loss": 0.0303, "rewards/accuracies": 1.0, "rewards/chosen": -7.641868591308594, "rewards/margins": 10.99293327331543, "rewards/margins_max": 14.466341018676758, "rewards/margins_min": 7.519525051116943, "rewards/margins_std": 4.912140369415283, "rewards/rejected": -18.634801864624023, "step": 670 }, { "epoch": 1.92, "grad_norm": 1.9894517252535326, "learning_rate": 1.7415055935504234e-06, "logits/chosen": -1.6779143810272217, "logits/rejected": -1.3088996410369873, "logps/chosen": -1250.79345703125, "logps/rejected": -2332.5302734375, "loss": 0.0268, "rewards/accuracies": 1.0, "rewards/chosen": -8.574339866638184, "rewards/margins": 11.780553817749023, "rewards/margins_max": 17.217056274414062, "rewards/margins_min": 6.344052314758301, "rewards/margins_std": 7.688374996185303, "rewards/rejected": -20.35489273071289, "step": 680 }, { "epoch": 1.94, "grad_norm": 1.2264882447915335, "learning_rate": 1.6638086480134954e-06, "logits/chosen": -1.133843183517456, "logits/rejected": -0.9121431112289429, "logps/chosen": -1320.951171875, "logps/rejected": -2429.5537109375, "loss": 0.014, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.83985710144043, "rewards/margins": 12.160634994506836, "rewards/margins_max": 17.855926513671875, "rewards/margins_min": 6.465344429016113, "rewards/margins_std": 8.054357528686523, "rewards/rejected": -23.000492095947266, "step": 690 }, { "epoch": 1.97, "grad_norm": 4.223913353219136, "learning_rate": 1.5870108593710473e-06, "logits/chosen": -1.4314680099487305, "logits/rejected": -1.1393955945968628, "logps/chosen": -1421.0302734375, "logps/rejected": -2616.06005859375, "loss": 0.018, "rewards/accuracies": 1.0, "rewards/chosen": -9.940652847290039, "rewards/margins": 14.069793701171875, "rewards/margins_max": 18.50979995727539, "rewards/margins_min": 9.62978744506836, "rewards/margins_std": 6.279117584228516, "rewards/rejected": -24.010446548461914, "step": 700 }, { "epoch": 1.97, "eval_logits/chosen": -1.4977593421936035, "eval_logits/rejected": -1.4133175611495972, "eval_logps/chosen": -1938.2783203125, "eval_logps/rejected": -2177.001708984375, "eval_loss": 1.8029882907867432, "eval_rewards/accuracies": 0.6785714030265808, "eval_rewards/chosen": -16.53057098388672, "eval_rewards/margins": 2.6476187705993652, "eval_rewards/margins_max": 11.230785369873047, "eval_rewards/margins_min": -5.27154541015625, "eval_rewards/margins_std": 7.43382453918457, "eval_rewards/rejected": -19.178190231323242, "eval_runtime": 282.2867, "eval_samples_per_second": 7.085, "eval_steps_per_second": 0.223, "step": 700 }, { "epoch": 2.0, "grad_norm": 0.027200756028801846, "learning_rate": 1.511194808315853e-06, "logits/chosen": -1.4225877523422241, "logits/rejected": -1.1490380764007568, "logps/chosen": -1361.941162109375, "logps/rejected": -2227.452880859375, "loss": 0.0423, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.556672096252441, "rewards/margins": 9.88037109375, "rewards/margins_max": 13.63640022277832, "rewards/margins_min": 6.124342441558838, "rewards/margins_std": 5.311827182769775, "rewards/rejected": -20.437042236328125, "step": 710 }, { "epoch": 2.03, "grad_norm": 0.318786591879142, "learning_rate": 1.4364420198778662e-06, "logits/chosen": -1.5894582271575928, "logits/rejected": -1.3686472177505493, "logps/chosen": -1422.156005859375, "logps/rejected": -2683.84814453125, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -10.745410919189453, "rewards/margins": 12.789144515991211, "rewards/margins_max": 16.427227020263672, "rewards/margins_min": 9.15106201171875, "rewards/margins_std": 5.14502477645874, "rewards/rejected": -23.53455352783203, "step": 720 }, { "epoch": 2.06, "grad_norm": 1.5807231251466567, "learning_rate": 1.3628328757603243e-06, "logits/chosen": -1.6512333154678345, "logits/rejected": -1.3885473012924194, "logps/chosen": -1368.7022705078125, "logps/rejected": -2550.4912109375, "loss": 0.0091, "rewards/accuracies": 1.0, "rewards/chosen": -9.459519386291504, "rewards/margins": 13.517751693725586, "rewards/margins_max": 18.180484771728516, "rewards/margins_min": 8.855023384094238, "rewards/margins_std": 6.5940961837768555, "rewards/rejected": -22.97727394104004, "step": 730 }, { "epoch": 2.08, "grad_norm": 0.1516893711186873, "learning_rate": 1.2904465279052725e-06, "logits/chosen": -1.6209065914154053, "logits/rejected": -1.351872444152832, "logps/chosen": -1231.8480224609375, "logps/rejected": -2237.622802734375, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -8.798944473266602, "rewards/margins": 11.324702262878418, "rewards/margins_max": 13.88591480255127, "rewards/margins_min": 8.763489723205566, "rewards/margins_std": 3.6221022605895996, "rewards/rejected": -20.123645782470703, "step": 740 }, { "epoch": 2.11, "grad_norm": 0.8035507691467565, "learning_rate": 1.219360813381446e-06, "logits/chosen": -1.247396469116211, "logits/rejected": -1.033151388168335, "logps/chosen": -1316.85546875, "logps/rejected": -2502.35400390625, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -11.282798767089844, "rewards/margins": 12.374329566955566, "rewards/margins_max": 16.396432876586914, "rewards/margins_min": 8.352226257324219, "rewards/margins_std": 5.688112258911133, "rewards/rejected": -23.657127380371094, "step": 750 }, { "epoch": 2.14, "grad_norm": 0.10201527009610997, "learning_rate": 1.1496521706860392e-06, "logits/chosen": -1.5233542919158936, "logits/rejected": -1.1838680505752563, "logps/chosen": -1417.0087890625, "logps/rejected": -2805.773681640625, "loss": 0.0051, "rewards/accuracies": 1.0, "rewards/chosen": -10.839475631713867, "rewards/margins": 14.590258598327637, "rewards/margins_max": 17.661457061767578, "rewards/margins_min": 11.519063949584961, "rewards/margins_std": 4.343328475952148, "rewards/rejected": -25.429737091064453, "step": 760 }, { "epoch": 2.17, "grad_norm": 0.0015806759819360625, "learning_rate": 1.0813955575503588e-06, "logits/chosen": -1.355691909790039, "logits/rejected": -1.144424557685852, "logps/chosen": -1348.842041015625, "logps/rejected": -2898.0224609375, "loss": 0.0066, "rewards/accuracies": 1.0, "rewards/chosen": -10.152058601379395, "rewards/margins": 16.251543045043945, "rewards/margins_max": 22.687950134277344, "rewards/margins_min": 9.815134048461914, "rewards/margins_std": 9.102456092834473, "rewards/rejected": -26.40359878540039, "step": 770 }, { "epoch": 2.2, "grad_norm": 0.408380187113466, "learning_rate": 1.0146643703377488e-06, "logits/chosen": -1.6056991815567017, "logits/rejected": -1.3266913890838623, "logps/chosen": -1298.9927978515625, "logps/rejected": -2409.390869140625, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -9.719507217407227, "rewards/margins": 12.09427261352539, "rewards/margins_max": 15.695422172546387, "rewards/margins_min": 8.493124008178711, "rewards/margins_std": 5.092793941497803, "rewards/rejected": -21.813779830932617, "step": 780 }, { "epoch": 2.23, "grad_norm": 0.001344347508367163, "learning_rate": 9.495303651204496e-07, "logits/chosen": -1.563906192779541, "logits/rejected": -1.3474560976028442, "logps/chosen": -1254.9219970703125, "logps/rejected": -2623.2822265625, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -8.889430046081543, "rewards/margins": 14.575796127319336, "rewards/margins_max": 18.69800567626953, "rewards/margins_min": 10.453584671020508, "rewards/margins_std": 5.829684734344482, "rewards/rejected": -23.465227127075195, "step": 790 }, { "epoch": 2.25, "grad_norm": 1.6920469977748351, "learning_rate": 8.860635805202616e-07, "logits/chosen": -1.551922082901001, "logits/rejected": -1.2580442428588867, "logps/chosen": -1456.9490966796875, "logps/rejected": -2604.62744140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -10.935505867004395, "rewards/margins": 12.657417297363281, "rewards/margins_max": 15.51282024383545, "rewards/margins_min": 9.802014350891113, "rewards/margins_std": 4.038149833679199, "rewards/rejected": -23.59292221069336, "step": 800 }, { "epoch": 2.25, "eval_logits/chosen": -1.5266377925872803, "eval_logits/rejected": -1.4433014392852783, "eval_logps/chosen": -1957.578857421875, "eval_logps/rejected": -2208.484375, "eval_loss": 1.8519227504730225, "eval_rewards/accuracies": 0.6746031641960144, "eval_rewards/chosen": -16.72357749938965, "eval_rewards/margins": 2.7694385051727295, "eval_rewards/margins_max": 11.662981033325195, "eval_rewards/margins_min": -5.304656982421875, "eval_rewards/margins_std": 7.62367582321167, "eval_rewards/rejected": -19.493017196655273, "eval_runtime": 282.5434, "eval_samples_per_second": 7.079, "eval_steps_per_second": 0.223, "step": 800 }, { "epoch": 2.28, "grad_norm": 3.2305387145726234, "learning_rate": 8.24332262395994e-07, "logits/chosen": -1.5742024183273315, "logits/rejected": -1.3343318700790405, "logps/chosen": -1459.0062255859375, "logps/rejected": -2835.21044921875, "loss": 0.0055, "rewards/accuracies": 1.0, "rewards/chosen": -11.644388198852539, "rewards/margins": 14.268835067749023, "rewards/margins_max": 19.221527099609375, "rewards/margins_min": 9.316144943237305, "rewards/margins_std": 7.0041632652282715, "rewards/rejected": -25.913223266601562, "step": 810 }, { "epoch": 2.31, "grad_norm": 0.26542768442550385, "learning_rate": 7.644027904586587e-07, "logits/chosen": -1.50737726688385, "logits/rejected": -1.2445927858352661, "logps/chosen": -1452.3663330078125, "logps/rejected": -2697.02880859375, "loss": 0.0067, "rewards/accuracies": 1.0, "rewards/chosen": -11.468404769897461, "rewards/margins": 13.425836563110352, "rewards/margins_max": 16.106616973876953, "rewards/margins_min": 10.745055198669434, "rewards/margins_std": 3.791196823120117, "rewards/rejected": -24.894241333007812, "step": 820 }, { "epoch": 2.34, "grad_norm": 0.8567763833713586, "learning_rate": 7.06339606893347e-07, "logits/chosen": -1.6803547143936157, "logits/rejected": -1.4048993587493896, "logps/chosen": -1588.3795166015625, "logps/rejected": -2856.94873046875, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -11.344830513000488, "rewards/margins": 14.68242073059082, "rewards/margins_max": 20.33969497680664, "rewards/margins_min": 9.025145530700684, "rewards/margins_std": 8.000594139099121, "rewards/rejected": -26.02724838256836, "step": 830 }, { "epoch": 2.37, "grad_norm": 0.19797390603665133, "learning_rate": 6.502051470645149e-07, "logits/chosen": -1.7654281854629517, "logits/rejected": -1.40230393409729, "logps/chosen": -1327.5189208984375, "logps/rejected": -2276.90771484375, "loss": 0.0218, "rewards/accuracies": 1.0, "rewards/chosen": -9.360559463500977, "rewards/margins": 10.55632495880127, "rewards/margins_max": 12.99437141418457, "rewards/margins_min": 8.118279457092285, "rewards/margins_std": 3.4479167461395264, "rewards/rejected": -19.916885375976562, "step": 840 }, { "epoch": 2.39, "grad_norm": 0.0023467881665189677, "learning_rate": 5.960597723792194e-07, "logits/chosen": -1.5812981128692627, "logits/rejected": -1.1608024835586548, "logps/chosen": -1374.124267578125, "logps/rejected": -2819.462158203125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -9.890588760375977, "rewards/margins": 15.723424911499023, "rewards/margins_max": 21.0240421295166, "rewards/margins_min": 10.422807693481445, "rewards/margins_std": 7.4962053298950195, "rewards/rejected": -25.614009857177734, "step": 850 }, { "epoch": 2.42, "grad_norm": 1.4084849928658003, "learning_rate": 5.43961705380465e-07, "logits/chosen": -1.646162986755371, "logits/rejected": -1.4091808795928955, "logps/chosen": -1218.2606201171875, "logps/rejected": -2409.643798828125, "loss": 0.0078, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -8.628401756286621, "rewards/margins": 12.78498649597168, "rewards/margins_max": 17.431535720825195, "rewards/margins_min": 8.138437271118164, "rewards/margins_std": 6.5712127685546875, "rewards/rejected": -21.413387298583984, "step": 860 }, { "epoch": 2.45, "grad_norm": 0.13595105985996128, "learning_rate": 4.939669671404871e-07, "logits/chosen": -1.5396533012390137, "logits/rejected": -1.2183513641357422, "logps/chosen": -1237.326904296875, "logps/rejected": -3156.015380859375, "loss": 0.0039, "rewards/accuracies": 1.0, "rewards/chosen": -8.967730522155762, "rewards/margins": 19.433839797973633, "rewards/margins_max": 26.383316040039062, "rewards/margins_min": 12.484365463256836, "rewards/margins_std": 9.828042984008789, "rewards/rejected": -28.40157127380371, "step": 870 }, { "epoch": 2.48, "grad_norm": 0.012403182973777866, "learning_rate": 4.461293170212644e-07, "logits/chosen": -1.6268768310546875, "logits/rejected": -1.3297674655914307, "logps/chosen": -1231.2391357421875, "logps/rejected": -2482.310546875, "loss": 0.0125, "rewards/accuracies": 1.0, "rewards/chosen": -9.140237808227539, "rewards/margins": 13.229069709777832, "rewards/margins_max": 16.058679580688477, "rewards/margins_min": 10.399457931518555, "rewards/margins_std": 4.001674175262451, "rewards/rejected": -22.369308471679688, "step": 880 }, { "epoch": 2.51, "grad_norm": 5.925107209728559, "learning_rate": 4.005001948670606e-07, "logits/chosen": -1.7953965663909912, "logits/rejected": -1.5808696746826172, "logps/chosen": -1377.26611328125, "logps/rejected": -2234.20849609375, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -9.003216743469238, "rewards/margins": 10.078218460083008, "rewards/margins_max": 11.774847030639648, "rewards/margins_min": 8.381589889526367, "rewards/margins_std": 2.39939546585083, "rewards/rejected": -19.08143424987793, "step": 890 }, { "epoch": 2.54, "grad_norm": 0.0018034560654693567, "learning_rate": 3.571286656911377e-07, "logits/chosen": -1.6509956121444702, "logits/rejected": -1.2617855072021484, "logps/chosen": -1374.924072265625, "logps/rejected": -2686.83154296875, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -9.74584674835205, "rewards/margins": 14.469047546386719, "rewards/margins_max": 20.866533279418945, "rewards/margins_min": 8.071561813354492, "rewards/margins_std": 9.04741096496582, "rewards/rejected": -24.214895248413086, "step": 900 }, { "epoch": 2.54, "eval_logits/chosen": -1.5324345827102661, "eval_logits/rejected": -1.4488511085510254, "eval_logps/chosen": -1899.9781494140625, "eval_logps/rejected": -2137.156982421875, "eval_loss": 1.6798701286315918, "eval_rewards/accuracies": 0.6865079402923584, "eval_rewards/chosen": -16.14756965637207, "eval_rewards/margins": 2.632173776626587, "eval_rewards/margins_max": 10.763092994689941, "eval_rewards/margins_min": -4.875840663909912, "eval_rewards/margins_std": 7.033862590789795, "eval_rewards/rejected": -18.77974510192871, "eval_runtime": 281.9065, "eval_samples_per_second": 7.095, "eval_steps_per_second": 0.223, "step": 900 }, { "epoch": 2.56, "grad_norm": 0.39851941407344293, "learning_rate": 3.1606136691612555e-07, "logits/chosen": -1.7041774988174438, "logits/rejected": -1.4187756776809692, "logps/chosen": -1301.1878662109375, "logps/rejected": -2172.826904296875, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -8.820059776306152, "rewards/margins": 10.524114608764648, "rewards/margins_max": 12.688272476196289, "rewards/margins_min": 8.359955787658691, "rewards/margins_std": 3.060582160949707, "rewards/rejected": -19.344173431396484, "step": 910 }, { "epoch": 2.59, "grad_norm": 0.0005374838985619683, "learning_rate": 2.773424582247844e-07, "logits/chosen": -1.5690796375274658, "logits/rejected": -1.2215526103973389, "logps/chosen": -1358.075927734375, "logps/rejected": -2381.899169921875, "loss": 0.0024, "rewards/accuracies": 1.0, "rewards/chosen": -9.972057342529297, "rewards/margins": 11.921777725219727, "rewards/margins_max": 14.729642868041992, "rewards/margins_min": 9.113912582397461, "rewards/margins_std": 3.970921754837036, "rewards/rejected": -21.893835067749023, "step": 920 }, { "epoch": 2.62, "grad_norm": 0.8257494267996711, "learning_rate": 2.410135740750821e-07, "logits/chosen": -1.5338929891586304, "logits/rejected": -1.259865164756775, "logps/chosen": -1410.4990234375, "logps/rejected": -2998.914794921875, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -10.63892936706543, "rewards/margins": 16.653705596923828, "rewards/margins_max": 21.365177154541016, "rewards/margins_min": 11.942238807678223, "rewards/margins_std": 6.663023471832275, "rewards/rejected": -27.29263687133789, "step": 930 }, { "epoch": 2.65, "grad_norm": 0.06916221157748438, "learning_rate": 2.0711377893064182e-07, "logits/chosen": -1.5516988039016724, "logits/rejected": -1.2729582786560059, "logps/chosen": -1308.211669921875, "logps/rejected": -2490.35693359375, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -9.68997573852539, "rewards/margins": 13.111665725708008, "rewards/margins_max": 18.273632049560547, "rewards/margins_min": 7.9496965408325195, "rewards/margins_std": 7.300126075744629, "rewards/rejected": -22.801639556884766, "step": 940 }, { "epoch": 2.68, "grad_norm": 2.498417925921994, "learning_rate": 1.756795252547111e-07, "logits/chosen": -1.4785737991333008, "logits/rejected": -1.2068592309951782, "logps/chosen": -1470.0135498046875, "logps/rejected": -2859.243408203125, "loss": 0.0078, "rewards/accuracies": 1.0, "rewards/chosen": -11.678686141967773, "rewards/margins": 14.885587692260742, "rewards/margins_max": 18.92436981201172, "rewards/margins_min": 10.846805572509766, "rewards/margins_std": 5.7117018699646, "rewards/rejected": -26.564273834228516, "step": 950 }, { "epoch": 2.7, "grad_norm": 0.30835027385045066, "learning_rate": 1.4674461431281013e-07, "logits/chosen": -1.6750847101211548, "logits/rejected": -1.3757655620574951, "logps/chosen": -1276.86669921875, "logps/rejected": -2703.418701171875, "loss": 0.0151, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.499726295471191, "rewards/margins": 15.09521198272705, "rewards/margins_max": 21.079849243164062, "rewards/margins_min": 9.11056900024414, "rewards/margins_std": 8.463561058044434, "rewards/rejected": -24.59493637084961, "step": 960 }, { "epoch": 2.73, "grad_norm": 0.23235990194938522, "learning_rate": 1.2034015982622243e-07, "logits/chosen": -1.5666346549987793, "logits/rejected": -1.2590982913970947, "logps/chosen": -1482.5379638671875, "logps/rejected": -2852.9375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -11.074012756347656, "rewards/margins": 14.420585632324219, "rewards/margins_max": 18.83799934387207, "rewards/margins_min": 10.003174781799316, "rewards/margins_std": 6.24716329574585, "rewards/rejected": -25.494598388671875, "step": 970 }, { "epoch": 2.76, "grad_norm": 0.003130078676672441, "learning_rate": 9.649455451539419e-08, "logits/chosen": -1.2376658916473389, "logits/rejected": -0.9727104306221008, "logps/chosen": -1320.026123046875, "logps/rejected": -2890.248291015625, "loss": 0.0043, "rewards/accuracies": 1.0, "rewards/chosen": -10.823871612548828, "rewards/margins": 16.33503532409668, "rewards/margins_max": 22.118406295776367, "rewards/margins_min": 10.551666259765625, "rewards/margins_std": 8.178921699523926, "rewards/rejected": -27.15890884399414, "step": 980 }, { "epoch": 2.79, "grad_norm": 0.01106748013868886, "learning_rate": 7.523343956923196e-08, "logits/chosen": -1.6014173030853271, "logits/rejected": -1.3725566864013672, "logps/chosen": -1455.7508544921875, "logps/rejected": -2784.856201171875, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -11.036726951599121, "rewards/margins": 13.958398818969727, "rewards/margins_max": 18.721614837646484, "rewards/margins_min": 9.19517993927002, "rewards/margins_std": 6.736205101013184, "rewards/rejected": -24.995126724243164, "step": 990 }, { "epoch": 2.82, "grad_norm": 0.21777107682252947, "learning_rate": 5.657967707312195e-08, "logits/chosen": -1.4147546291351318, "logits/rejected": -1.2533682584762573, "logps/chosen": -1340.80859375, "logps/rejected": -2710.937255859375, "loss": 0.0118, "rewards/accuracies": 1.0, "rewards/chosen": -10.588825225830078, "rewards/margins": 13.658398628234863, "rewards/margins_max": 17.033788681030273, "rewards/margins_min": 10.28300666809082, "rewards/margins_std": 4.773523807525635, "rewards/rejected": -24.247220993041992, "step": 1000 }, { "epoch": 2.82, "eval_logits/chosen": -1.51563560962677, "eval_logits/rejected": -1.4296027421951294, "eval_logps/chosen": -1952.324462890625, "eval_logps/rejected": -2219.474609375, "eval_loss": 1.8351484537124634, "eval_rewards/accuracies": 0.682539701461792, "eval_rewards/chosen": -16.671031951904297, "eval_rewards/margins": 2.931889057159424, "eval_rewards/margins_max": 11.962862014770508, "eval_rewards/margins_min": -5.289890766143799, "eval_rewards/margins_std": 7.766205787658691, "eval_rewards/rejected": -19.602922439575195, "eval_runtime": 281.5027, "eval_samples_per_second": 7.105, "eval_steps_per_second": 0.224, "step": 1000 }, { "epoch": 2.85, "grad_norm": 0.4419550733032763, "learning_rate": 4.055332542531959e-08, "logits/chosen": -1.5433815717697144, "logits/rejected": -1.295972228050232, "logps/chosen": -1293.6630859375, "logps/rejected": -2648.736572265625, "loss": 0.0096, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.779963493347168, "rewards/margins": 14.112527847290039, "rewards/margins_max": 18.39639663696289, "rewards/margins_min": 9.828656196594238, "rewards/margins_std": 6.058306694030762, "rewards/rejected": -23.89249038696289, "step": 1010 }, { "epoch": 2.87, "grad_norm": 0.14005943320430667, "learning_rate": 2.7171617768147472e-08, "logits/chosen": -1.398990273475647, "logits/rejected": -1.063157320022583, "logps/chosen": -1454.0186767578125, "logps/rejected": -2948.3251953125, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -11.425373077392578, "rewards/margins": 15.727473258972168, "rewards/margins_max": 20.60434341430664, "rewards/margins_min": 10.850606918334961, "rewards/margins_std": 6.896933078765869, "rewards/rejected": -27.152847290039062, "step": 1020 }, { "epoch": 2.9, "grad_norm": 0.2626213621970617, "learning_rate": 1.6448943457189616e-08, "logits/chosen": -1.5582804679870605, "logits/rejected": -1.3218994140625, "logps/chosen": -1478.698974609375, "logps/rejected": -2884.353271484375, "loss": 0.0057, "rewards/accuracies": 1.0, "rewards/chosen": -11.135309219360352, "rewards/margins": 14.943025588989258, "rewards/margins_max": 20.703128814697266, "rewards/margins_min": 9.1829195022583, "rewards/margins_std": 8.146018981933594, "rewards/rejected": -26.07833480834961, "step": 1030 }, { "epoch": 2.93, "grad_norm": 2.8326701528782565, "learning_rate": 8.39683258841123e-09, "logits/chosen": -1.5044890642166138, "logits/rejected": -1.2109694480895996, "logps/chosen": -1402.8773193359375, "logps/rejected": -2849.219970703125, "loss": 0.0062, "rewards/accuracies": 1.0, "rewards/chosen": -10.566572189331055, "rewards/margins": 15.638870239257812, "rewards/margins_max": 20.092174530029297, "rewards/margins_min": 11.185564041137695, "rewards/margins_std": 6.297926425933838, "rewards/rejected": -26.2054443359375, "step": 1040 }, { "epoch": 2.96, "grad_norm": 0.3213477153635432, "learning_rate": 3.0239435998430376e-09, "logits/chosen": -1.4634066820144653, "logits/rejected": -1.1483074426651, "logps/chosen": -1369.406494140625, "logps/rejected": -2688.2548828125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -10.378218650817871, "rewards/margins": 13.92640495300293, "rewards/margins_max": 18.696613311767578, "rewards/margins_min": 9.156195640563965, "rewards/margins_std": 6.746094703674316, "rewards/rejected": -24.304622650146484, "step": 1050 }, { "epoch": 2.99, "grad_norm": 0.31694097428400714, "learning_rate": 3.3605396115826695e-10, "logits/chosen": -1.4050662517547607, "logits/rejected": -1.1527583599090576, "logps/chosen": -1549.754150390625, "logps/rejected": -2639.6474609375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -12.238971710205078, "rewards/margins": 12.063154220581055, "rewards/margins_max": 15.284955978393555, "rewards/margins_min": 8.841352462768555, "rewards/margins_std": 4.5563154220581055, "rewards/rejected": -24.302127838134766, "step": 1060 }, { "epoch": 3.0, "step": 1065, "total_flos": 0.0, "train_loss": 0.1103198329137612, "train_runtime": 9245.0119, "train_samples_per_second": 1.843, "train_steps_per_second": 0.115 } ], "logging_steps": 10, "max_steps": 1065, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }