{ "best_metric": 21.601177215576172, "best_model_checkpoint": "./output/checkpoints/2024-05-27_09-02-19/checkpoint-600", "epoch": 1.0, "eval_steps": 100, "global_step": 1271, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003933910306845004, "grad_norm": 26.446468353271484, "learning_rate": 9.375000000000001e-07, "logits/chosen": -0.2329835593700409, "logits/rejected": -0.7131723165512085, "logps/chosen": -1.0090148448944092, "logps/rejected": -1.6766555309295654, "loss": 25.0031, "rewards/accuracies": 0.1875, "rewards/chosen": 8.527375939593185e-06, "rewards/margins": -3.058705624425784e-05, "rewards/rejected": 3.911443127435632e-05, "step": 5 }, { "epoch": 0.007867820613690008, "grad_norm": 11.936881065368652, "learning_rate": 2.5e-06, "logits/chosen": -0.396948903799057, "logits/rejected": -0.7360211610794067, "logps/chosen": -0.8984262347221375, "logps/rejected": -1.1693015098571777, "loss": 24.9925, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -4.258692206349224e-05, "rewards/margins": 7.496408943552524e-05, "rewards/rejected": -0.00011755101149901748, "step": 10 }, { "epoch": 0.011801730920535013, "grad_norm": 13.576423645019531, "learning_rate": 4.0625000000000005e-06, "logits/chosen": -0.3573324680328369, "logits/rejected": -0.6578253507614136, "logps/chosen": -0.8142125010490417, "logps/rejected": -1.0063048601150513, "loss": 24.98, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.00027080357540398836, "rewards/margins": 0.00020028329163324088, "rewards/rejected": -0.00047108688158914447, "step": 15 }, { "epoch": 0.015735641227380016, "grad_norm": 34.40192794799805, "learning_rate": 5.3125e-06, "logits/chosen": -0.3880882263183594, "logits/rejected": -0.7228592038154602, "logps/chosen": -1.1428436040878296, "logps/rejected": -1.567692756652832, "loss": 24.8648, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.001833672053180635, "rewards/margins": 0.0014055297942832112, "rewards/rejected": -0.003239201847463846, "step": 20 }, { "epoch": 0.01966955153422502, "grad_norm": 16.62054443359375, "learning_rate": 6.875e-06, "logits/chosen": -0.25890520215034485, "logits/rejected": -0.7020931839942932, "logps/chosen": -1.212777853012085, "logps/rejected": -1.3589212894439697, "loss": 24.939, "rewards/accuracies": 0.5625, "rewards/chosen": -0.0046936506405472755, "rewards/margins": 0.0006289022276178002, "rewards/rejected": -0.005322552751749754, "step": 25 }, { "epoch": 0.023603461841070025, "grad_norm": 22.588180541992188, "learning_rate": 8.4375e-06, "logits/chosen": -0.3284154236316681, "logits/rejected": -0.6061900854110718, "logps/chosen": -0.9161252975463867, "logps/rejected": -1.1756784915924072, "loss": 24.7008, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.004226677119731903, "rewards/margins": 0.003145116614177823, "rewards/rejected": -0.0073717935010790825, "step": 30 }, { "epoch": 0.02753737214791503, "grad_norm": 35.44740295410156, "learning_rate": 1e-05, "logits/chosen": -0.4888971447944641, "logits/rejected": -0.7553955912590027, "logps/chosen": -1.252327561378479, "logps/rejected": -1.473224401473999, "loss": 24.5665, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.013578332960605621, "rewards/margins": 0.004552872385829687, "rewards/rejected": -0.018131205812096596, "step": 35 }, { "epoch": 0.03147128245476003, "grad_norm": 32.20027160644531, "learning_rate": 1.1562500000000002e-05, "logits/chosen": -0.4067641794681549, "logits/rejected": -0.7352877855300903, "logps/chosen": -1.055959939956665, "logps/rejected": -1.4485868215560913, "loss": 24.0967, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.016065727919340134, "rewards/margins": 0.011377329006791115, "rewards/rejected": -0.02744305692613125, "step": 40 }, { "epoch": 0.03540519276160504, "grad_norm": NaN, "learning_rate": 1.2812500000000001e-05, "logits/chosen": -0.7447024583816528, "logits/rejected": -1.0448763370513916, "logps/chosen": -1.723064661026001, "logps/rejected": -2.249486207962036, "loss": 24.0293, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.041548244655132294, "rewards/margins": 0.013970533385872841, "rewards/rejected": -0.05551878362894058, "step": 45 }, { "epoch": 0.03933910306845004, "grad_norm": 26.65926742553711, "learning_rate": 1.4375e-05, "logits/chosen": -0.37696924805641174, "logits/rejected": -0.46783286333084106, "logps/chosen": -1.015779733657837, "logps/rejected": -1.599536418914795, "loss": 24.0592, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.020270783454179764, "rewards/margins": 0.019904401153326035, "rewards/rejected": -0.0401751883327961, "step": 50 }, { "epoch": 0.043273013375295044, "grad_norm": 30.033103942871094, "learning_rate": 1.59375e-05, "logits/chosen": -0.7907823324203491, "logits/rejected": -0.990174412727356, "logps/chosen": -1.7764304876327515, "logps/rejected": -1.9972736835479736, "loss": 24.6893, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.05663704872131348, "rewards/margins": 0.012498116120696068, "rewards/rejected": -0.0691351667046547, "step": 55 }, { "epoch": 0.04720692368214005, "grad_norm": 155.12327575683594, "learning_rate": 1.7500000000000002e-05, "logits/chosen": -0.5883419513702393, "logits/rejected": -0.9729728698730469, "logps/chosen": -1.749121904373169, "logps/rejected": -2.5301637649536133, "loss": 23.3501, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.06644631177186966, "rewards/margins": 0.04214775934815407, "rewards/rejected": -0.10859407484531403, "step": 60 }, { "epoch": 0.05114083398898505, "grad_norm": 106.24934387207031, "learning_rate": 1.8750000000000002e-05, "logits/chosen": -0.8320428133010864, "logits/rejected": -1.0106093883514404, "logps/chosen": -1.1940712928771973, "logps/rejected": -2.7599964141845703, "loss": 23.1753, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.03951374441385269, "rewards/margins": 0.08484560251235962, "rewards/rejected": -0.12435934692621231, "step": 65 }, { "epoch": 0.05507474429583006, "grad_norm": 82.20561981201172, "learning_rate": 2.0312500000000002e-05, "logits/chosen": -0.9974054098129272, "logits/rejected": -1.2483726739883423, "logps/chosen": -1.6322723627090454, "logps/rejected": -2.4456088542938232, "loss": 21.933, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.07070576399564743, "rewards/margins": 0.055024802684783936, "rewards/rejected": -0.12573055922985077, "step": 70 }, { "epoch": 0.059008654602675056, "grad_norm": 1317.8922119140625, "learning_rate": 2.1562500000000002e-05, "logits/chosen": -1.0032284259796143, "logits/rejected": -1.2543575763702393, "logps/chosen": -1.648602843284607, "logps/rejected": -3.2052788734436035, "loss": 30.9638, "rewards/accuracies": 0.625, "rewards/chosen": -0.08028480410575867, "rewards/margins": 0.09190882742404938, "rewards/rejected": -0.17219363152980804, "step": 75 }, { "epoch": 0.06294256490952006, "grad_norm": 135.98031616210938, "learning_rate": 2.3125000000000003e-05, "logits/chosen": -1.1862694025039673, "logits/rejected": -1.268090844154358, "logps/chosen": -1.7252533435821533, "logps/rejected": -2.730776786804199, "loss": 23.1291, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.08682449907064438, "rewards/margins": 0.07040676474571228, "rewards/rejected": -0.15723127126693726, "step": 80 }, { "epoch": 0.06687647521636507, "grad_norm": 83.77543640136719, "learning_rate": 2.46875e-05, "logits/chosen": -1.4288493394851685, "logits/rejected": -1.6324199438095093, "logps/chosen": -1.858473539352417, "logps/rejected": -2.4925625324249268, "loss": 22.47, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.10322336852550507, "rewards/margins": 0.040188394486904144, "rewards/rejected": -0.143411785364151, "step": 85 }, { "epoch": 0.07081038552321008, "grad_norm": 112.26095581054688, "learning_rate": 2.625e-05, "logits/chosen": -1.5094571113586426, "logits/rejected": -1.6503517627716064, "logps/chosen": -2.1784815788269043, "logps/rejected": -2.8247978687286377, "loss": 26.8652, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.12195611000061035, "rewards/margins": 0.04399397224187851, "rewards/rejected": -0.16595008969306946, "step": 90 }, { "epoch": 0.07474429583005507, "grad_norm": 116.14981079101562, "learning_rate": 2.7812500000000002e-05, "logits/chosen": -1.705255150794983, "logits/rejected": -1.8774993419647217, "logps/chosen": -2.0536391735076904, "logps/rejected": -2.913367748260498, "loss": 23.1756, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11939144134521484, "rewards/margins": 0.03494938462972641, "rewards/rejected": -0.15434083342552185, "step": 95 }, { "epoch": 0.07867820613690008, "grad_norm": 171.99806213378906, "learning_rate": 2.9375000000000003e-05, "logits/chosen": -1.7922956943511963, "logits/rejected": -1.805193305015564, "logps/chosen": -2.2087619304656982, "logps/rejected": -2.953051805496216, "loss": 23.3214, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1307235062122345, "rewards/margins": 0.03435593843460083, "rewards/rejected": -0.16507944464683533, "step": 100 }, { "epoch": 0.07867820613690008, "eval_logits/chosen": -1.966138243675232, "eval_logits/rejected": -2.09938645362854, "eval_logps/chosen": -2.3467371463775635, "eval_logps/rejected": -2.9913861751556396, "eval_loss": 22.537601470947266, "eval_rewards/accuracies": 0.643750011920929, "eval_rewards/chosen": -0.13215361535549164, "eval_rewards/margins": 0.04243787005543709, "eval_rewards/rejected": -0.17459148168563843, "eval_runtime": 254.2532, "eval_samples_per_second": 2.517, "eval_steps_per_second": 0.157, "step": 100 }, { "epoch": 0.08261211644374508, "grad_norm": 90.37792205810547, "learning_rate": 3.09375e-05, "logits/chosen": -1.7015612125396729, "logits/rejected": -1.8363323211669922, "logps/chosen": -2.096281051635742, "logps/rejected": -3.1252198219299316, "loss": 27.6536, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.11915542930364609, "rewards/margins": 0.06005290150642395, "rewards/rejected": -0.17920835316181183, "step": 105 }, { "epoch": 0.08654602675059009, "grad_norm": 88.4887466430664, "learning_rate": 3.2500000000000004e-05, "logits/chosen": -1.657274842262268, "logits/rejected": -1.8521808385849, "logps/chosen": -1.8666527271270752, "logps/rejected": -2.9845376014709473, "loss": 21.5034, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.10541417449712753, "rewards/margins": 0.05897489935159683, "rewards/rejected": -0.16438907384872437, "step": 110 }, { "epoch": 0.0904799370574351, "grad_norm": 134.99591064453125, "learning_rate": 3.40625e-05, "logits/chosen": -1.6925245523452759, "logits/rejected": -1.7081615924835205, "logps/chosen": -2.6438632011413574, "logps/rejected": -3.7139625549316406, "loss": 22.8601, "rewards/accuracies": 0.625, "rewards/chosen": -0.14907808601856232, "rewards/margins": 0.05926816537976265, "rewards/rejected": -0.20834624767303467, "step": 115 }, { "epoch": 0.0944138473642801, "grad_norm": 105.24943542480469, "learning_rate": 3.5625000000000005e-05, "logits/chosen": -1.591507911682129, "logits/rejected": -1.58656907081604, "logps/chosen": -1.9171836376190186, "logps/rejected": -2.5378520488739014, "loss": 23.0976, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.10242807865142822, "rewards/margins": 0.03649063780903816, "rewards/rejected": -0.1389187127351761, "step": 120 }, { "epoch": 0.0983477576711251, "grad_norm": 87.82530212402344, "learning_rate": 3.71875e-05, "logits/chosen": -1.3108810186386108, "logits/rejected": -1.4434765577316284, "logps/chosen": -2.124854564666748, "logps/rejected": -3.0788867473602295, "loss": 24.5017, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.1214764267206192, "rewards/margins": 0.04934501647949219, "rewards/rejected": -0.1708214432001114, "step": 125 }, { "epoch": 0.1022816679779701, "grad_norm": 69.61011505126953, "learning_rate": 3.875e-05, "logits/chosen": -1.286228895187378, "logits/rejected": -1.5050832033157349, "logps/chosen": -2.5508532524108887, "logps/rejected": -3.24528169631958, "loss": 21.4116, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15334758162498474, "rewards/margins": 0.04556337743997574, "rewards/rejected": -0.19891095161437988, "step": 130 }, { "epoch": 0.10621557828481511, "grad_norm": 101.8541259765625, "learning_rate": 3.999992445477636e-05, "logits/chosen": -1.3636066913604736, "logits/rejected": -1.5931237936019897, "logps/chosen": -3.0847220420837402, "logps/rejected": -3.839167356491089, "loss": 21.3367, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2012835294008255, "rewards/margins": 0.05702406167984009, "rewards/rejected": -0.2583075761795044, "step": 135 }, { "epoch": 0.11014948859166011, "grad_norm": 701.3626098632812, "learning_rate": 3.999728043187288e-05, "logits/chosen": -1.4217129945755005, "logits/rejected": -1.4933011531829834, "logps/chosen": -3.9832420349121094, "logps/rejected": -5.435095310211182, "loss": 23.8821, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2787173390388489, "rewards/margins": 0.09217057377099991, "rewards/rejected": -0.3708879351615906, "step": 140 }, { "epoch": 0.11408339889850512, "grad_norm": 163.25831604003906, "learning_rate": 3.9990859718476166e-05, "logits/chosen": -1.4570497274398804, "logits/rejected": -1.473787784576416, "logps/chosen": -3.3077120780944824, "logps/rejected": -4.8310956954956055, "loss": 20.1222, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2514913082122803, "rewards/margins": 0.11996223777532578, "rewards/rejected": -0.37145355343818665, "step": 145 }, { "epoch": 0.11801730920535011, "grad_norm": 274.31146240234375, "learning_rate": 3.998066352720348e-05, "logits/chosen": -1.4901472330093384, "logits/rejected": -1.5641014575958252, "logps/chosen": -4.366189956665039, "logps/rejected": -5.792475700378418, "loss": 24.8947, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3093239367008209, "rewards/margins": 0.1069142073392868, "rewards/rejected": -0.4162382185459137, "step": 150 }, { "epoch": 0.12195121951219512, "grad_norm": 204.0838623046875, "learning_rate": 3.9966693783709596e-05, "logits/chosen": -1.775489091873169, "logits/rejected": -1.6681087017059326, "logps/chosen": -3.3862712383270264, "logps/rejected": -3.850006580352783, "loss": 25.0897, "rewards/accuracies": 0.5625, "rewards/chosen": -0.22455720603466034, "rewards/margins": 0.03894208371639252, "rewards/rejected": -0.26349928975105286, "step": 155 }, { "epoch": 0.12588512981904013, "grad_norm": 117.58898162841797, "learning_rate": 3.9948953126323144e-05, "logits/chosen": -1.7140939235687256, "logits/rejected": -1.8748031854629517, "logps/chosen": -2.808436632156372, "logps/rejected": -3.4621639251708984, "loss": 22.909, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.17321309447288513, "rewards/margins": 0.03799115866422653, "rewards/rejected": -0.21120426058769226, "step": 160 }, { "epoch": 0.12981904012588513, "grad_norm": 75.52522277832031, "learning_rate": 3.992744490554832e-05, "logits/chosen": -1.5584402084350586, "logits/rejected": -1.6980777978897095, "logps/chosen": -2.5916876792907715, "logps/rejected": -3.250844955444336, "loss": 22.8805, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.16778233647346497, "rewards/margins": 0.046813301742076874, "rewards/rejected": -0.21459563076496124, "step": 165 }, { "epoch": 0.13375295043273014, "grad_norm": 210.5413818359375, "learning_rate": 3.990217318343214e-05, "logits/chosen": -1.6046726703643799, "logits/rejected": -1.785196304321289, "logps/chosen": -3.144779920578003, "logps/rejected": -4.314496040344238, "loss": 21.1638, "rewards/accuracies": 0.625, "rewards/chosen": -0.2154751569032669, "rewards/margins": 0.07778888940811157, "rewards/rejected": -0.2932640314102173, "step": 170 }, { "epoch": 0.13768686073957515, "grad_norm": 137.43014526367188, "learning_rate": 3.987314273279721e-05, "logits/chosen": -1.538189172744751, "logits/rejected": -1.7611982822418213, "logps/chosen": -3.314256191253662, "logps/rejected": -4.361076354980469, "loss": 22.1568, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.24516558647155762, "rewards/margins": 0.08248710632324219, "rewards/rejected": -0.3276526927947998, "step": 175 }, { "epoch": 0.14162077104642015, "grad_norm": 162.08778381347656, "learning_rate": 3.9840359036340424e-05, "logits/chosen": -1.5785366296768188, "logits/rejected": -1.6759214401245117, "logps/chosen": -3.9628891944885254, "logps/rejected": -4.663954257965088, "loss": 23.3996, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2786514163017273, "rewards/margins": 0.06107773259282112, "rewards/rejected": -0.3397291302680969, "step": 180 }, { "epoch": 0.14555468135326516, "grad_norm": 183.8037567138672, "learning_rate": 3.980382828559743e-05, "logits/chosen": -1.8036388158798218, "logits/rejected": -1.9109561443328857, "logps/chosen": -4.924368381500244, "logps/rejected": -5.794642448425293, "loss": 22.9192, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.357871949672699, "rewards/margins": 0.06735062599182129, "rewards/rejected": -0.42522257566452026, "step": 185 }, { "epoch": 0.14948859166011014, "grad_norm": 137.1122283935547, "learning_rate": 3.9763557379773316e-05, "logits/chosen": -1.7101930379867554, "logits/rejected": -1.8198864459991455, "logps/chosen": -3.584138870239258, "logps/rejected": -4.54425048828125, "loss": 20.9665, "rewards/accuracies": 0.625, "rewards/chosen": -0.27856582403182983, "rewards/margins": 0.0747852548956871, "rewards/rejected": -0.3533511161804199, "step": 190 }, { "epoch": 0.15342250196695514, "grad_norm": 164.39016723632812, "learning_rate": 3.971955392443965e-05, "logits/chosen": -1.697361707687378, "logits/rejected": -1.7193193435668945, "logps/chosen": -3.8646721839904785, "logps/rejected": -5.100863456726074, "loss": 21.2867, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.29381299018859863, "rewards/margins": 0.07520242035388947, "rewards/rejected": -0.3690153658390045, "step": 195 }, { "epoch": 0.15735641227380015, "grad_norm": 125.37354278564453, "learning_rate": 3.9671826230098045e-05, "logits/chosen": -1.6001428365707397, "logits/rejected": -1.736572504043579, "logps/chosen": -3.7506911754608154, "logps/rejected": -4.7256364822387695, "loss": 21.3918, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2859867215156555, "rewards/margins": 0.07923749834299088, "rewards/rejected": -0.3652242124080658, "step": 200 }, { "epoch": 0.15735641227380015, "eval_logits/chosen": -1.5953483581542969, "eval_logits/rejected": -1.725934624671936, "eval_logps/chosen": -4.086066246032715, "eval_logps/rejected": -5.129216194152832, "eval_loss": 23.703336715698242, "eval_rewards/accuracies": 0.6546875238418579, "eval_rewards/chosen": -0.3060864806175232, "eval_rewards/margins": 0.0822879821062088, "eval_rewards/rejected": -0.3883745074272156, "eval_runtime": 254.3055, "eval_samples_per_second": 2.517, "eval_steps_per_second": 0.157, "step": 200 }, { "epoch": 0.16129032258064516, "grad_norm": 167.259033203125, "learning_rate": 3.962038331061065e-05, "logits/chosen": -1.4170001745224, "logits/rejected": -1.6189939975738525, "logps/chosen": -3.634209156036377, "logps/rejected": -5.206550121307373, "loss": 26.5886, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.2781711220741272, "rewards/margins": 0.10549378395080566, "rewards/rejected": -0.38366490602493286, "step": 205 }, { "epoch": 0.16522423288749016, "grad_norm": 109.77017211914062, "learning_rate": 3.9565234881497835e-05, "logits/chosen": -1.5879325866699219, "logits/rejected": -1.6509323120117188, "logps/chosen": -2.8328652381896973, "logps/rejected": -3.345362901687622, "loss": 22.6272, "rewards/accuracies": 0.6875, "rewards/chosen": -0.20151302218437195, "rewards/margins": 0.04314180836081505, "rewards/rejected": -0.2446548491716385, "step": 210 }, { "epoch": 0.16915814319433517, "grad_norm": 162.56524658203125, "learning_rate": 3.950639135810326e-05, "logits/chosen": -1.6067664623260498, "logits/rejected": -1.7900478839874268, "logps/chosen": -3.400160312652588, "logps/rejected": -4.594171047210693, "loss": 20.8583, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2197611778974533, "rewards/margins": 0.0824299305677414, "rewards/rejected": -0.3021911084651947, "step": 215 }, { "epoch": 0.17309205350118018, "grad_norm": 134.93789672851562, "learning_rate": 3.944386385362683e-05, "logits/chosen": -1.7304567098617554, "logits/rejected": -1.7651231288909912, "logps/chosen": -4.1914567947387695, "logps/rejected": -5.027632713317871, "loss": 21.3514, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.2970493733882904, "rewards/margins": 0.06264514476060867, "rewards/rejected": -0.3596945106983185, "step": 220 }, { "epoch": 0.17702596380802518, "grad_norm": 106.4392318725586, "learning_rate": 3.937766417702591e-05, "logits/chosen": -1.645422339439392, "logits/rejected": -1.7480659484863281, "logps/chosen": -4.8556694984436035, "logps/rejected": -5.551773548126221, "loss": 25.2991, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.39037808775901794, "rewards/margins": 0.040518540889024734, "rewards/rejected": -0.4308966100215912, "step": 225 }, { "epoch": 0.1809598741148702, "grad_norm": 75.40190124511719, "learning_rate": 3.9307804830785033e-05, "logits/chosen": -1.710780382156372, "logits/rejected": -1.759790062904358, "logps/chosen": -4.1053643226623535, "logps/rejected": -5.580018520355225, "loss": 20.5995, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.32871341705322266, "rewards/margins": 0.09709561616182327, "rewards/rejected": -0.42580899596214294, "step": 230 }, { "epoch": 0.1848937844217152, "grad_norm": 129.5159912109375, "learning_rate": 3.923429900855468e-05, "logits/chosen": -1.5250613689422607, "logits/rejected": -1.7375835180282593, "logps/chosen": -3.8208870887756348, "logps/rejected": -5.471742630004883, "loss": 19.4905, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2826174199581146, "rewards/margins": 0.10505588352680206, "rewards/rejected": -0.3876733183860779, "step": 235 }, { "epoch": 0.1888276947285602, "grad_norm": 358.99334716796875, "learning_rate": 3.915716059265956e-05, "logits/chosen": -1.2919423580169678, "logits/rejected": -1.5058711767196655, "logps/chosen": -4.358604907989502, "logps/rejected": -5.214530944824219, "loss": 21.5458, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.30291062593460083, "rewards/margins": 0.06571656465530396, "rewards/rejected": -0.3686271905899048, "step": 240 }, { "epoch": 0.19276160503540518, "grad_norm": 83.99627685546875, "learning_rate": 3.907640415147675e-05, "logits/chosen": -1.1905521154403687, "logits/rejected": -1.4401142597198486, "logps/chosen": -3.447228193283081, "logps/rejected": -4.264595985412598, "loss": 21.469, "rewards/accuracies": 0.6875, "rewards/chosen": -0.2503766417503357, "rewards/margins": 0.06672003120183945, "rewards/rejected": -0.31709665060043335, "step": 245 }, { "epoch": 0.1966955153422502, "grad_norm": 97.8551025390625, "learning_rate": 3.8992044936684326e-05, "logits/chosen": -1.167415976524353, "logits/rejected": -1.3312307596206665, "logps/chosen": -3.2072510719299316, "logps/rejected": -3.7459397315979004, "loss": 24.394, "rewards/accuracies": 0.5625, "rewards/chosen": -0.23177361488342285, "rewards/margins": 0.04146546125411987, "rewards/rejected": -0.2732390761375427, "step": 250 }, { "epoch": 0.2006294256490952, "grad_norm": 81.79573822021484, "learning_rate": 3.8904098880380946e-05, "logits/chosen": -1.0507287979125977, "logits/rejected": -1.1515988111495972, "logps/chosen": -2.6618685722351074, "logps/rejected": -3.6504600048065186, "loss": 21.5489, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.17464396357536316, "rewards/margins": 0.08120186626911163, "rewards/rejected": -0.2558458149433136, "step": 255 }, { "epoch": 0.2045633359559402, "grad_norm": 74.57398986816406, "learning_rate": 3.881258259207688e-05, "logits/chosen": -1.026132583618164, "logits/rejected": -1.1835418939590454, "logps/chosen": -3.0442395210266113, "logps/rejected": -3.4265129566192627, "loss": 24.2589, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.18103419244289398, "rewards/margins": 0.02801087498664856, "rewards/rejected": -0.20904505252838135, "step": 260 }, { "epoch": 0.2084972462627852, "grad_norm": 136.9661102294922, "learning_rate": 3.8717513355557156e-05, "logits/chosen": -1.0285115242004395, "logits/rejected": -1.2167742252349854, "logps/chosen": -2.512547016143799, "logps/rejected": -3.510410785675049, "loss": 22.8105, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.1588296890258789, "rewards/margins": 0.07337381690740585, "rewards/rejected": -0.23220351338386536, "step": 265 }, { "epoch": 0.21243115656963021, "grad_norm": 69.99120330810547, "learning_rate": 3.861890912561731e-05, "logits/chosen": -0.8553465604782104, "logits/rejected": -1.1523014307022095, "logps/chosen": -2.420487880706787, "logps/rejected": -3.4689393043518066, "loss": 20.5014, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.15432177484035492, "rewards/margins": 0.08016739785671234, "rewards/rejected": -0.23448920249938965, "step": 270 }, { "epoch": 0.21636506687647522, "grad_norm": 75.06553649902344, "learning_rate": 3.85167885246725e-05, "logits/chosen": -1.0096137523651123, "logits/rejected": -0.9508267641067505, "logps/chosen": -3.4687724113464355, "logps/rejected": -4.444920539855957, "loss": 22.2787, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2557123303413391, "rewards/margins": 0.07582716643810272, "rewards/rejected": -0.3315395414829254, "step": 275 }, { "epoch": 0.22029897718332023, "grad_norm": 80.17613983154297, "learning_rate": 3.8411170839240394e-05, "logits/chosen": -0.9037753939628601, "logits/rejected": -0.9584333300590515, "logps/chosen": -3.32795786857605, "logps/rejected": -4.637690544128418, "loss": 23.2314, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2455664873123169, "rewards/margins": 0.09604751318693161, "rewards/rejected": -0.3416139781475067, "step": 280 }, { "epoch": 0.22423288749016523, "grad_norm": 51.01013946533203, "learning_rate": 3.8302076016298786e-05, "logits/chosen": -0.7821402549743652, "logits/rejected": -0.8784409761428833, "logps/chosen": -3.464825391769409, "logps/rejected": -4.3120927810668945, "loss": 26.3572, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.22129957377910614, "rewards/margins": 0.05804131552577019, "rewards/rejected": -0.27934086322784424, "step": 285 }, { "epoch": 0.22816679779701024, "grad_norm": 50.36003494262695, "learning_rate": 3.818952465951836e-05, "logits/chosen": -0.8527859449386597, "logits/rejected": -1.0032708644866943, "logps/chosen": -2.805267810821533, "logps/rejected": -3.6299731731414795, "loss": 22.6246, "rewards/accuracies": 0.625, "rewards/chosen": -0.19116182625293732, "rewards/margins": 0.04059046879410744, "rewards/rejected": -0.23175227642059326, "step": 290 }, { "epoch": 0.23210070810385522, "grad_norm": 73.20655059814453, "learning_rate": 3.80735380253715e-05, "logits/chosen": -1.153649926185608, "logits/rejected": -1.3198211193084717, "logps/chosen": -3.344242811203003, "logps/rejected": -3.948678970336914, "loss": 23.0667, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.24411065876483917, "rewards/margins": 0.03483257442712784, "rewards/rejected": -0.2789432406425476, "step": 295 }, { "epoch": 0.23603461841070023, "grad_norm": 74.69874572753906, "learning_rate": 3.7954138019117764e-05, "logits/chosen": -1.2777029275894165, "logits/rejected": -1.4231250286102295, "logps/chosen": -3.8555595874786377, "logps/rejected": -4.460744380950928, "loss": 23.7264, "rewards/accuracies": 0.5625, "rewards/chosen": -0.2938821017742157, "rewards/margins": 0.04352904483675957, "rewards/rejected": -0.33741116523742676, "step": 300 }, { "epoch": 0.23603461841070023, "eval_logits/chosen": -1.299351453781128, "eval_logits/rejected": -1.5808088779449463, "eval_logps/chosen": -3.9815101623535156, "eval_logps/rejected": -4.813979148864746, "eval_loss": 22.46109962463379, "eval_rewards/accuracies": 0.6578124761581421, "eval_rewards/chosen": -0.29563087224960327, "eval_rewards/margins": 0.06121987849473953, "eval_rewards/rejected": -0.3568507432937622, "eval_runtime": 256.5735, "eval_samples_per_second": 2.494, "eval_steps_per_second": 0.156, "step": 300 }, { "epoch": 0.23996852871754523, "grad_norm": 100.70768737792969, "learning_rate": 3.7831347190666886e-05, "logits/chosen": -1.4278929233551025, "logits/rejected": -1.5912885665893555, "logps/chosen": -4.3169355392456055, "logps/rejected": -5.223280906677246, "loss": 21.9094, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.32976609468460083, "rewards/margins": 0.05834154412150383, "rewards/rejected": -0.38810762763023376, "step": 305 }, { "epoch": 0.24390243902439024, "grad_norm": 220.49977111816406, "learning_rate": 3.770518873031997e-05, "logits/chosen": -1.3963868618011475, "logits/rejected": -1.5353944301605225, "logps/chosen": -4.6622209548950195, "logps/rejected": -5.298556327819824, "loss": 25.9132, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.34076324105262756, "rewards/margins": 0.03211987391114235, "rewards/rejected": -0.3728831112384796, "step": 310 }, { "epoch": 0.24783634933123525, "grad_norm": 45.653079986572266, "learning_rate": 3.757568646438977e-05, "logits/chosen": -1.3604671955108643, "logits/rejected": -1.4712865352630615, "logps/chosen": -4.468562126159668, "logps/rejected": -5.09032678604126, "loss": 23.4367, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.32228899002075195, "rewards/margins": 0.037079013884067535, "rewards/rejected": -0.3593679964542389, "step": 315 }, { "epoch": 0.25177025963808025, "grad_norm": 66.92400360107422, "learning_rate": 3.744286485070085e-05, "logits/chosen": -1.082240343093872, "logits/rejected": -1.4719394445419312, "logps/chosen": -4.237619876861572, "logps/rejected": -5.15458345413208, "loss": 23.5263, "rewards/accuracies": 0.6875, "rewards/chosen": -0.33307066559791565, "rewards/margins": 0.0633757933974266, "rewards/rejected": -0.39644646644592285, "step": 320 }, { "epoch": 0.25570416994492523, "grad_norm": 177.88168334960938, "learning_rate": 3.730674897397048e-05, "logits/chosen": -1.114916443824768, "logits/rejected": -1.6648147106170654, "logps/chosen": -4.053045749664307, "logps/rejected": -5.189083099365234, "loss": 22.1226, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3164909780025482, "rewards/margins": 0.07537268847227097, "rewards/rejected": -0.3918636739253998, "step": 325 }, { "epoch": 0.25963808025177026, "grad_norm": 100.30528259277344, "learning_rate": 3.7167364541071115e-05, "logits/chosen": -0.988497257232666, "logits/rejected": -1.1937472820281982, "logps/chosen": -4.265296459197998, "logps/rejected": -4.853640556335449, "loss": 20.9283, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3129270374774933, "rewards/margins": 0.0673552006483078, "rewards/rejected": -0.3802822232246399, "step": 330 }, { "epoch": 0.26357199055861524, "grad_norm": 83.86753845214844, "learning_rate": 3.7024737876175406e-05, "logits/chosen": -0.8350197076797485, "logits/rejected": -1.1670969724655151, "logps/chosen": -5.4864630699157715, "logps/rejected": -6.544106960296631, "loss": 20.1224, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4627310335636139, "rewards/margins": 0.0860290378332138, "rewards/rejected": -0.5487600564956665, "step": 335 }, { "epoch": 0.2675059008654603, "grad_norm": 73.52009582519531, "learning_rate": 3.6878895915784616e-05, "logits/chosen": -0.6294984221458435, "logits/rejected": -0.6951633095741272, "logps/chosen": -6.682524681091309, "logps/rejected": -7.499720573425293, "loss": 23.3106, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.576111912727356, "rewards/margins": 0.06401752680540085, "rewards/rejected": -0.6401294469833374, "step": 340 }, { "epoch": 0.27143981117230526, "grad_norm": 47.29294204711914, "learning_rate": 3.6729866203641346e-05, "logits/chosen": -0.30728015303611755, "logits/rejected": -0.7238900065422058, "logps/chosen": -5.2912421226501465, "logps/rejected": -6.754895210266113, "loss": 20.956, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.44258102774620056, "rewards/margins": 0.09092569351196289, "rewards/rejected": -0.5335067510604858, "step": 345 }, { "epoch": 0.2753737214791503, "grad_norm": 92.66437530517578, "learning_rate": 3.6577676885527676e-05, "logits/chosen": -0.4043883681297302, "logits/rejected": -0.6512165069580078, "logps/chosen": -4.932716369628906, "logps/rejected": -6.014449596405029, "loss": 19.8567, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.39097142219543457, "rewards/margins": 0.09061526507139206, "rewards/rejected": -0.48158663511276245, "step": 350 }, { "epoch": 0.27930763178599527, "grad_norm": 147.38983154296875, "learning_rate": 3.6422356703949525e-05, "logits/chosen": -0.1327817142009735, "logits/rejected": -0.62315833568573, "logps/chosen": -5.30325984954834, "logps/rejected": -6.832304954528809, "loss": 21.7519, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4314250946044922, "rewards/margins": 0.1263391077518463, "rewards/rejected": -0.5577641725540161, "step": 355 }, { "epoch": 0.2832415420928403, "grad_norm": 31.5225772857666, "learning_rate": 3.62639349927083e-05, "logits/chosen": -0.2402796745300293, "logits/rejected": -0.8563323020935059, "logps/chosen": -4.622067928314209, "logps/rejected": -6.146195411682129, "loss": 17.5716, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.37094250321388245, "rewards/margins": 0.14315392076969147, "rewards/rejected": -0.5140964388847351, "step": 360 }, { "epoch": 0.2871754523996853, "grad_norm": 62.2461051940918, "learning_rate": 3.610244167136095e-05, "logits/chosen": -0.09466689825057983, "logits/rejected": -0.4544064402580261, "logps/chosen": -5.546882629394531, "logps/rejected": -6.525860786437988, "loss": 24.1383, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.47001561522483826, "rewards/margins": 0.06077291816473007, "rewards/rejected": -0.5307885408401489, "step": 365 }, { "epoch": 0.2911093627065303, "grad_norm": 48.28466796875, "learning_rate": 3.593790723956935e-05, "logits/chosen": -0.2374683916568756, "logits/rejected": -0.373137503862381, "logps/chosen": -8.164457321166992, "logps/rejected": -8.156596183776855, "loss": 27.2267, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7038506269454956, "rewards/margins": 0.0024669456761330366, "rewards/rejected": -0.7063175439834595, "step": 370 }, { "epoch": 0.2950432730133753, "grad_norm": 51.48979568481445, "learning_rate": 3.577036277134012e-05, "logits/chosen": 0.5883103609085083, "logits/rejected": 0.345896452665329, "logps/chosen": -7.786595344543457, "logps/rejected": -8.453828811645508, "loss": 23.7109, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6850046515464783, "rewards/margins": 0.03222974017262459, "rewards/rejected": -0.7172344326972961, "step": 375 }, { "epoch": 0.2989771833202203, "grad_norm": 47.203521728515625, "learning_rate": 3.5599839909155954e-05, "logits/chosen": 0.8187308311462402, "logits/rejected": 0.5813020467758179, "logps/chosen": -7.474339962005615, "logps/rejected": -8.487818717956543, "loss": 21.9183, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.649897038936615, "rewards/margins": 0.06481163203716278, "rewards/rejected": -0.714708685874939, "step": 380 }, { "epoch": 0.3029110936270653, "grad_norm": 66.43785095214844, "learning_rate": 3.542637085799967e-05, "logits/chosen": 0.7243896722793579, "logits/rejected": 0.633999228477478, "logps/chosen": -6.237640380859375, "logps/rejected": -7.760465145111084, "loss": 21.8449, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5326268672943115, "rewards/margins": 0.1084168553352356, "rewards/rejected": -0.6410436630249023, "step": 385 }, { "epoch": 0.3068450039339103, "grad_norm": 37.43156814575195, "learning_rate": 3.524998837927192e-05, "logits/chosen": 0.06674204766750336, "logits/rejected": -0.2028542459011078, "logps/chosen": -3.726958751678467, "logps/rejected": -4.262044906616211, "loss": 24.4969, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.28034740686416626, "rewards/margins": 0.04259229078888893, "rewards/rejected": -0.3229396939277649, "step": 390 }, { "epoch": 0.3107789142407553, "grad_norm": 43.61391067504883, "learning_rate": 3.5070725784603906e-05, "logits/chosen": -0.40263357758522034, "logits/rejected": -0.7386992573738098, "logps/chosen": -2.9942004680633545, "logps/rejected": -3.7471251487731934, "loss": 22.408, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.19999414682388306, "rewards/margins": 0.05711622163653374, "rewards/rejected": -0.2571103572845459, "step": 395 }, { "epoch": 0.3147128245476003, "grad_norm": 39.55924606323242, "learning_rate": 3.488861692956612e-05, "logits/chosen": -0.3814232647418976, "logits/rejected": -0.7326269149780273, "logps/chosen": -3.743687391281128, "logps/rejected": -4.247666835784912, "loss": 22.5808, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2401810586452484, "rewards/margins": 0.03669751435518265, "rewards/rejected": -0.27687856554985046, "step": 400 }, { "epoch": 0.3147128245476003, "eval_logits/chosen": 1.05367910861969, "eval_logits/rejected": 0.8413508534431458, "eval_logps/chosen": -3.476516008377075, "eval_logps/rejected": -4.1527628898620605, "eval_loss": 22.322988510131836, "eval_rewards/accuracies": 0.643750011920929, "eval_rewards/chosen": -0.2451314926147461, "eval_rewards/margins": 0.045597635209560394, "eval_rewards/rejected": -0.2907291054725647, "eval_runtime": 262.6324, "eval_samples_per_second": 2.437, "eval_steps_per_second": 0.152, "step": 400 }, { "epoch": 0.31864673485444533, "grad_norm": 106.93277740478516, "learning_rate": 3.470369620727433e-05, "logits/chosen": -0.2646043300628662, "logits/rejected": -0.5908231735229492, "logps/chosen": -4.6682844161987305, "logps/rejected": -4.9077653884887695, "loss": 24.9016, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.31124863028526306, "rewards/margins": 0.024502381682395935, "rewards/rejected": -0.3357509970664978, "step": 405 }, { "epoch": 0.3225806451612903, "grad_norm": 74.05709838867188, "learning_rate": 3.451599854189419e-05, "logits/chosen": 0.022719597443938255, "logits/rejected": -0.2066432684659958, "logps/chosen": -4.9405951499938965, "logps/rejected": -5.324969291687012, "loss": 24.7214, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.37897494435310364, "rewards/margins": 0.023352503776550293, "rewards/rejected": -0.40232744812965393, "step": 410 }, { "epoch": 0.32651455546813535, "grad_norm": 39.29796600341797, "learning_rate": 3.4325559382045344e-05, "logits/chosen": 0.5940214395523071, "logits/rejected": 0.3333088755607605, "logps/chosen": -4.753328323364258, "logps/rejected": -5.081197738647461, "loss": 24.0772, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.36598071455955505, "rewards/margins": 0.0194702185690403, "rewards/rejected": -0.38545092940330505, "step": 415 }, { "epoch": 0.3304484657749803, "grad_norm": 77.69596862792969, "learning_rate": 3.413241469410669e-05, "logits/chosen": 0.5746585726737976, "logits/rejected": 0.37664318084716797, "logps/chosen": -4.366871356964111, "logps/rejected": -4.923527717590332, "loss": 23.106, "rewards/accuracies": 0.625, "rewards/chosen": -0.33017784357070923, "rewards/margins": 0.03850778192281723, "rewards/rejected": -0.36868563294410706, "step": 420 }, { "epoch": 0.33438237608182536, "grad_norm": 37.37318801879883, "learning_rate": 3.3936600955423684e-05, "logits/chosen": 0.5197581052780151, "logits/rejected": 0.22815366089344025, "logps/chosen": -4.27942419052124, "logps/rejected": -5.002130031585693, "loss": 21.5593, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.3183726668357849, "rewards/margins": 0.0549759566783905, "rewards/rejected": -0.3733486235141754, "step": 425 }, { "epoch": 0.33831628638867034, "grad_norm": 51.467933654785156, "learning_rate": 3.373815514741928e-05, "logits/chosen": 0.5920094847679138, "logits/rejected": 0.27607864141464233, "logps/chosen": -4.812392234802246, "logps/rejected": -6.322897434234619, "loss": 20.3197, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3940238356590271, "rewards/margins": 0.08363697677850723, "rewards/rejected": -0.47766080498695374, "step": 430 }, { "epoch": 0.3422501966955153, "grad_norm": 92.83573150634766, "learning_rate": 3.353711474860957e-05, "logits/chosen": 0.2608449459075928, "logits/rejected": 0.022218376398086548, "logps/chosen": -6.094309329986572, "logps/rejected": -6.749911308288574, "loss": 22.7358, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.48861637711524963, "rewards/margins": 0.06393507868051529, "rewards/rejected": -0.5525515079498291, "step": 435 }, { "epoch": 0.34618410700236035, "grad_norm": 117.08031463623047, "learning_rate": 3.333351772752559e-05, "logits/chosen": 0.36764952540397644, "logits/rejected": 0.11572384834289551, "logps/chosen": -5.9948601722717285, "logps/rejected": -7.810961723327637, "loss": 22.5633, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4657825827598572, "rewards/margins": 0.0877792239189148, "rewards/rejected": -0.553561806678772, "step": 440 }, { "epoch": 0.35011801730920533, "grad_norm": 42.3343620300293, "learning_rate": 3.31274025355426e-05, "logits/chosen": -0.1417434960603714, "logits/rejected": -0.3869401216506958, "logps/chosen": -4.742400169372559, "logps/rejected": -5.590722560882568, "loss": 23.695, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.37819725275039673, "rewards/margins": 0.05572297424077988, "rewards/rejected": -0.4339202046394348, "step": 445 }, { "epoch": 0.35405192761605037, "grad_norm": 48.5923957824707, "learning_rate": 3.2918808099618145e-05, "logits/chosen": -0.13320264220237732, "logits/rejected": -0.4407239854335785, "logps/chosen": -4.052382946014404, "logps/rejected": -5.202944755554199, "loss": 23.9333, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2898481488227844, "rewards/margins": 0.06827215105295181, "rewards/rejected": -0.35812026262283325, "step": 450 }, { "epoch": 0.35798583792289534, "grad_norm": 40.526222229003906, "learning_rate": 3.270777381494025e-05, "logits/chosen": 0.15568742156028748, "logits/rejected": -0.1973900943994522, "logps/chosen": -3.3848178386688232, "logps/rejected": -4.244564056396484, "loss": 22.4256, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.24747803807258606, "rewards/margins": 0.0624859556555748, "rewards/rejected": -0.30996400117874146, "step": 455 }, { "epoch": 0.3619197482297404, "grad_norm": 39.41511917114258, "learning_rate": 3.2494339537487316e-05, "logits/chosen": 0.2709997296333313, "logits/rejected": -0.02324852906167507, "logps/chosen": -4.2176713943481445, "logps/rejected": -4.712052345275879, "loss": 22.1339, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.30599862337112427, "rewards/margins": 0.05780113860964775, "rewards/rejected": -0.3637998104095459, "step": 460 }, { "epoch": 0.36585365853658536, "grad_norm": 64.34850311279297, "learning_rate": 3.227854557650086e-05, "logits/chosen": 0.43827542662620544, "logits/rejected": 0.3022671937942505, "logps/chosen": -4.146628379821777, "logps/rejected": -4.721283912658691, "loss": 24.9713, "rewards/accuracies": 0.625, "rewards/chosen": -0.32393088936805725, "rewards/margins": 0.03894919902086258, "rewards/rejected": -0.36288008093833923, "step": 465 }, { "epoch": 0.3697875688434304, "grad_norm": 48.07391357421875, "learning_rate": 3.206043268687271e-05, "logits/chosen": 0.8742543458938599, "logits/rejected": 0.6306554079055786, "logps/chosen": -4.235721111297607, "logps/rejected": -4.655104160308838, "loss": 24.504, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3277904689311981, "rewards/margins": 0.03070756234228611, "rewards/rejected": -0.3584980368614197, "step": 470 }, { "epoch": 0.37372147915027537, "grad_norm": 48.7944450378418, "learning_rate": 3.1840042061448034e-05, "logits/chosen": 0.8953019380569458, "logits/rejected": 0.6509414315223694, "logps/chosen": -3.809953212738037, "logps/rejected": -4.727410793304443, "loss": 21.5063, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.2697634696960449, "rewards/margins": 0.04955270141363144, "rewards/rejected": -0.31931617856025696, "step": 475 }, { "epoch": 0.3776553894571204, "grad_norm": 50.050758361816406, "learning_rate": 3.161741532324567e-05, "logits/chosen": 0.6901669502258301, "logits/rejected": 0.4217056632041931, "logps/chosen": -3.9408392906188965, "logps/rejected": -4.815566062927246, "loss": 20.9824, "rewards/accuracies": 0.625, "rewards/chosen": -0.3004041314125061, "rewards/margins": 0.06339363753795624, "rewards/rejected": -0.36379775404930115, "step": 480 }, { "epoch": 0.3815892997639654, "grad_norm": 31.20033836364746, "learning_rate": 3.139259451759715e-05, "logits/chosen": 0.13000288605690002, "logits/rejected": -0.044496648013591766, "logps/chosen": -3.5587058067321777, "logps/rejected": -4.0875444412231445, "loss": 23.3392, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -0.25211071968078613, "rewards/margins": 0.0408257320523262, "rewards/rejected": -0.2929364740848541, "step": 485 }, { "epoch": 0.38552321007081036, "grad_norm": 42.778175354003906, "learning_rate": 3.116562210420604e-05, "logits/chosen": 0.001088732504285872, "logits/rejected": -0.32067522406578064, "logps/chosen": -3.792126178741455, "logps/rejected": -5.263632297515869, "loss": 19.5225, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2830085754394531, "rewards/margins": 0.11108819395303726, "rewards/rejected": -0.3940967619419098, "step": 490 }, { "epoch": 0.3894571203776554, "grad_norm": 51.158023834228516, "learning_rate": 3.093654094912901e-05, "logits/chosen": 0.14770345389842987, "logits/rejected": -0.27998632192611694, "logps/chosen": -3.2016499042510986, "logps/rejected": -4.281766414642334, "loss": 20.9501, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.23279622197151184, "rewards/margins": 0.07809507101774216, "rewards/rejected": -0.3108913004398346, "step": 495 }, { "epoch": 0.3933910306845004, "grad_norm": 56.20425033569336, "learning_rate": 3.070539431668008e-05, "logits/chosen": 0.262935608625412, "logits/rejected": 0.04751387611031532, "logps/chosen": -3.5432257652282715, "logps/rejected": -4.892638683319092, "loss": 19.8621, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.2724161148071289, "rewards/margins": 0.10621275752782822, "rewards/rejected": -0.3786288797855377, "step": 500 }, { "epoch": 0.3933910306845004, "eval_logits/chosen": 0.6740007400512695, "eval_logits/rejected": 0.4591088891029358, "eval_logps/chosen": -4.5571136474609375, "eval_logps/rejected": -5.613867282867432, "eval_loss": 21.918312072753906, "eval_rewards/accuracies": 0.659375011920929, "eval_rewards/chosen": -0.35319122672080994, "eval_rewards/margins": 0.08364833891391754, "eval_rewards/rejected": -0.43683958053588867, "eval_runtime": 263.6435, "eval_samples_per_second": 2.428, "eval_steps_per_second": 0.152, "step": 500 }, { "epoch": 0.3973249409913454, "grad_norm": 58.8707275390625, "learning_rate": 3.0472225861259792e-05, "logits/chosen": 0.5369864702224731, "logits/rejected": 0.22990770637989044, "logps/chosen": -4.736105442047119, "logps/rejected": -6.215359687805176, "loss": 19.8037, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.3894490599632263, "rewards/margins": 0.10317282378673553, "rewards/rejected": -0.49262189865112305, "step": 505 }, { "epoch": 0.4012588512981904, "grad_norm": 70.14684295654297, "learning_rate": 3.023707961911056e-05, "logits/chosen": 0.8286817669868469, "logits/rejected": 0.553676187992096, "logps/chosen": -5.788529872894287, "logps/rejected": -7.536166191101074, "loss": 17.8624, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.49081355333328247, "rewards/margins": 0.14760908484458923, "rewards/rejected": -0.6384226083755493, "step": 510 }, { "epoch": 0.4051927616050354, "grad_norm": 64.01305389404297, "learning_rate": 3.0000000000000004e-05, "logits/chosen": 0.9336326718330383, "logits/rejected": 0.7778112292289734, "logps/chosen": -6.726840972900391, "logps/rejected": -7.550817966461182, "loss": 22.4924, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.57944256067276, "rewards/margins": 0.06806603819131851, "rewards/rejected": -0.6475085616111755, "step": 515 }, { "epoch": 0.4091266719118804, "grad_norm": 78.60610961914062, "learning_rate": 2.976103177883374e-05, "logits/chosen": 1.424285650253296, "logits/rejected": 1.2528765201568604, "logps/chosen": -6.3146467208862305, "logps/rejected": -7.42047643661499, "loss": 22.4632, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5491231083869934, "rewards/margins": 0.08809302002191544, "rewards/rejected": -0.6372160911560059, "step": 520 }, { "epoch": 0.41306058221872544, "grad_norm": 53.94759750366211, "learning_rate": 2.9520220087199142e-05, "logits/chosen": 2.070854663848877, "logits/rejected": 1.9312273263931274, "logps/chosen": -7.275876522064209, "logps/rejected": -7.790124416351318, "loss": 22.9161, "rewards/accuracies": 0.625, "rewards/chosen": -0.6180551052093506, "rewards/margins": 0.04511018842458725, "rewards/rejected": -0.663165271282196, "step": 525 }, { "epoch": 0.4169944925255704, "grad_norm": 56.0859489440918, "learning_rate": 2.9277610404841792e-05, "logits/chosen": 2.1373679637908936, "logits/rejected": 1.8990551233291626, "logps/chosen": -6.323026180267334, "logps/rejected": -7.068973541259766, "loss": 21.5577, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.5458236932754517, "rewards/margins": 0.058930903673172, "rewards/rejected": -0.6047546863555908, "step": 530 }, { "epoch": 0.4209284028324154, "grad_norm": 38.24552536010742, "learning_rate": 2.903324855107617e-05, "logits/chosen": 1.698553442955017, "logits/rejected": 1.4752168655395508, "logps/chosen": -6.036180019378662, "logps/rejected": -7.159039497375488, "loss": 20.6918, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4921696186065674, "rewards/margins": 0.09151138365268707, "rewards/rejected": -0.5836809873580933, "step": 535 }, { "epoch": 0.42486231313926043, "grad_norm": 78.7889633178711, "learning_rate": 2.8787180676132222e-05, "logits/chosen": 1.3787410259246826, "logits/rejected": 1.1702592372894287, "logps/chosen": -5.483891010284424, "logps/rejected": -6.863039493560791, "loss": 21.8569, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4555833339691162, "rewards/margins": 0.10881421715021133, "rewards/rejected": -0.5643975138664246, "step": 540 }, { "epoch": 0.4287962234461054, "grad_norm": 58.13717269897461, "learning_rate": 2.8539453252439388e-05, "logits/chosen": 1.238527536392212, "logits/rejected": 1.0659363269805908, "logps/chosen": -4.524688243865967, "logps/rejected": -5.795032978057861, "loss": 19.8882, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.365116685628891, "rewards/margins": 0.0977514460682869, "rewards/rejected": -0.4628681540489197, "step": 545 }, { "epoch": 0.43273013375295044, "grad_norm": 57.31916809082031, "learning_rate": 2.829011306584983e-05, "logits/chosen": 1.0495655536651611, "logits/rejected": 0.9189395904541016, "logps/chosen": -4.698520183563232, "logps/rejected": -5.4618024826049805, "loss": 22.771, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3808877170085907, "rewards/margins": 0.05942262336611748, "rewards/rejected": -0.4403103291988373, "step": 550 }, { "epoch": 0.4366640440597954, "grad_norm": 57.70218276977539, "learning_rate": 2.8039207206802444e-05, "logits/chosen": 1.0372337102890015, "logits/rejected": 0.7637672424316406, "logps/chosen": -5.192538261413574, "logps/rejected": -6.2920708656311035, "loss": 21.0183, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4280625283718109, "rewards/margins": 0.07608579099178314, "rewards/rejected": -0.5041483640670776, "step": 555 }, { "epoch": 0.44059795436664045, "grad_norm": 38.57474899291992, "learning_rate": 2.778678306142936e-05, "logits/chosen": 1.3346863985061646, "logits/rejected": 1.2482701539993286, "logps/chosen": -4.537243843078613, "logps/rejected": -5.363685607910156, "loss": 21.725, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.367723673582077, "rewards/margins": 0.07422361522912979, "rewards/rejected": -0.4419472813606262, "step": 560 }, { "epoch": 0.44453186467348543, "grad_norm": 47.30175018310547, "learning_rate": 2.753288830260655e-05, "logits/chosen": 1.081312894821167, "logits/rejected": 1.0209182500839233, "logps/chosen": -4.504608154296875, "logps/rejected": -5.094980716705322, "loss": 23.979, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34262794256210327, "rewards/margins": 0.04645577073097229, "rewards/rejected": -0.38908374309539795, "step": 565 }, { "epoch": 0.44846577498033047, "grad_norm": 41.14936065673828, "learning_rate": 2.727757088095037e-05, "logits/chosen": 1.2744576930999756, "logits/rejected": 0.9736446142196655, "logps/chosen": -4.100034236907959, "logps/rejected": -5.128809452056885, "loss": 20.4892, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3064883053302765, "rewards/margins": 0.08413257449865341, "rewards/rejected": -0.3906208872795105, "step": 570 }, { "epoch": 0.45239968528717545, "grad_norm": 29.25827407836914, "learning_rate": 2.7020879015761555e-05, "logits/chosen": 1.315836787223816, "logits/rejected": 1.091429352760315, "logps/chosen": -4.02718448638916, "logps/rejected": -4.86886739730835, "loss": 21.384, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.2975132465362549, "rewards/margins": 0.07079926878213882, "rewards/rejected": -0.3683125078678131, "step": 575 }, { "epoch": 0.4563335955940205, "grad_norm": 156.1727752685547, "learning_rate": 2.6762861185918532e-05, "logits/chosen": 1.271761178970337, "logits/rejected": 1.0751326084136963, "logps/chosen": -4.122300624847412, "logps/rejected": -5.07947301864624, "loss": 20.2656, "rewards/accuracies": 0.6875, "rewards/chosen": -0.3042159378528595, "rewards/margins": 0.0826338604092598, "rewards/rejected": -0.3868497610092163, "step": 580 }, { "epoch": 0.46026750590086546, "grad_norm": 33.745567321777344, "learning_rate": 2.6503566120721685e-05, "logits/chosen": 1.1284042596817017, "logits/rejected": 0.8111549615859985, "logps/chosen": -4.277104377746582, "logps/rejected": -5.083390235900879, "loss": 20.6651, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3438721299171448, "rewards/margins": 0.06679800897836685, "rewards/rejected": -0.41067013144493103, "step": 585 }, { "epoch": 0.46420141620771044, "grad_norm": 51.66657638549805, "learning_rate": 2.6243042790690332e-05, "logits/chosen": 0.8191879987716675, "logits/rejected": 0.7653782367706299, "logps/chosen": -5.171725273132324, "logps/rejected": -5.932669162750244, "loss": 22.7086, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4288380742073059, "rewards/margins": 0.0465051606297493, "rewards/rejected": -0.4753432869911194, "step": 590 }, { "epoch": 0.46813532651455547, "grad_norm": 44.73387908935547, "learning_rate": 2.5981340398314148e-05, "logits/chosen": 0.5237664580345154, "logits/rejected": 0.2646290957927704, "logps/chosen": -4.674435615539551, "logps/rejected": -6.298100471496582, "loss": 19.2422, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3852913975715637, "rewards/margins": 0.1304633468389511, "rewards/rejected": -0.5157546997070312, "step": 595 }, { "epoch": 0.47206923682140045, "grad_norm": 45.94012451171875, "learning_rate": 2.571850836876074e-05, "logits/chosen": 0.5761805176734924, "logits/rejected": 0.38747546076774597, "logps/chosen": -5.026305198669434, "logps/rejected": -7.055234432220459, "loss": 19.6134, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4065484404563904, "rewards/margins": 0.10743912309408188, "rewards/rejected": -0.5139876008033752, "step": 600 }, { "epoch": 0.47206923682140045, "eval_logits/chosen": 0.718625545501709, "eval_logits/rejected": 0.5276994705200195, "eval_logps/chosen": -5.122862815856934, "eval_logps/rejected": -6.157461643218994, "eval_loss": 21.601177215576172, "eval_rewards/accuracies": 0.6812499761581421, "eval_rewards/chosen": -0.40976619720458984, "eval_rewards/margins": 0.08143284171819687, "eval_rewards/rejected": -0.4911990761756897, "eval_runtime": 265.5044, "eval_samples_per_second": 2.411, "eval_steps_per_second": 0.151, "step": 600 }, { "epoch": 0.4760031471282455, "grad_norm": 186.21047973632812, "learning_rate": 2.5454596340541246e-05, "logits/chosen": 0.4267461895942688, "logits/rejected": 0.17012283205986023, "logps/chosen": -5.3673906326293945, "logps/rejected": -6.8712663650512695, "loss": 20.017, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4380454421043396, "rewards/margins": 0.10050982236862183, "rewards/rejected": -0.5385553240776062, "step": 605 }, { "epoch": 0.47993705743509046, "grad_norm": 137.08309936523438, "learning_rate": 2.5189654156135577e-05, "logits/chosen": 0.4136212468147278, "logits/rejected": 0.08081427961587906, "logps/chosen": -4.9327521324157715, "logps/rejected": -6.341540813446045, "loss": 21.9344, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3804043233394623, "rewards/margins": 0.10515154898166656, "rewards/rejected": -0.48555582761764526, "step": 610 }, { "epoch": 0.4838709677419355, "grad_norm": 45.95155334472656, "learning_rate": 2.492373185257913e-05, "logits/chosen": 0.255919486284256, "logits/rejected": 0.06857960671186447, "logps/chosen": -4.774691104888916, "logps/rejected": -5.879635334014893, "loss": 21.4035, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.38787540793418884, "rewards/margins": 0.08788047730922699, "rewards/rejected": -0.47575584053993225, "step": 615 }, { "epoch": 0.4878048780487805, "grad_norm": 93.56800079345703, "learning_rate": 2.465687965201283e-05, "logits/chosen": -0.002777445362880826, "logits/rejected": -0.19223739206790924, "logps/chosen": -4.975631237030029, "logps/rejected": -6.164121150970459, "loss": 21.7253, "rewards/accuracies": 0.625, "rewards/chosen": -0.40307608246803284, "rewards/margins": 0.09124691784381866, "rewards/rejected": -0.4943229556083679, "step": 620 }, { "epoch": 0.4917387883556255, "grad_norm": 48.37771987915039, "learning_rate": 2.438914795219813e-05, "logits/chosen": 0.19498832523822784, "logits/rejected": 0.00378171494230628, "logps/chosen": -4.89281702041626, "logps/rejected": -6.380954742431641, "loss": 18.3391, "rewards/accuracies": 0.75, "rewards/chosen": -0.4015694558620453, "rewards/margins": 0.10364029556512833, "rewards/rejected": -0.505209743976593, "step": 625 }, { "epoch": 0.4956726986624705, "grad_norm": 92.10182189941406, "learning_rate": 2.41205873169989e-05, "logits/chosen": 0.3381834626197815, "logits/rejected": 0.16358311474323273, "logps/chosen": -5.862008094787598, "logps/rejected": -7.257102966308594, "loss": 21.0954, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.49373936653137207, "rewards/margins": 0.10812152922153473, "rewards/rejected": -0.6018608808517456, "step": 630 }, { "epoch": 0.4996066089693155, "grad_norm": 71.71492767333984, "learning_rate": 2.3851248466831906e-05, "logits/chosen": 0.3445281982421875, "logits/rejected": 0.19256843626499176, "logps/chosen": -7.764035224914551, "logps/rejected": -8.462240219116211, "loss": 24.1744, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.6419528126716614, "rewards/margins": 0.05703103542327881, "rewards/rejected": -0.6989837884902954, "step": 635 }, { "epoch": 0.5035405192761605, "grad_norm": 66.27214813232422, "learning_rate": 2.3581182269087756e-05, "logits/chosen": 0.6563648581504822, "logits/rejected": 0.4416646957397461, "logps/chosen": -7.931412696838379, "logps/rejected": -9.330202102661133, "loss": 20.7496, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6889309883117676, "rewards/margins": 0.10431470721960068, "rewards/rejected": -0.7932456731796265, "step": 640 }, { "epoch": 0.5074744295830055, "grad_norm": 58.666744232177734, "learning_rate": 2.331043972852408e-05, "logits/chosen": 0.7555745840072632, "logits/rejected": 0.6508604884147644, "logps/chosen": -8.447460174560547, "logps/rejected": -9.591973304748535, "loss": 19.7029, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.7366517782211304, "rewards/margins": 0.09356808662414551, "rewards/rejected": -0.8302198648452759, "step": 645 }, { "epoch": 0.5114083398898505, "grad_norm": 72.17485046386719, "learning_rate": 2.303907197763275e-05, "logits/chosen": 1.018164038658142, "logits/rejected": 0.8766286969184875, "logps/chosen": -8.899210929870605, "logps/rejected": -10.095988273620605, "loss": 20.9781, "rewards/accuracies": 0.625, "rewards/chosen": -0.7549716830253601, "rewards/margins": 0.07867839932441711, "rewards/rejected": -0.8336501121520996, "step": 650 }, { "epoch": 0.5153422501966956, "grad_norm": 67.8235855102539, "learning_rate": 2.2767130266982972e-05, "logits/chosen": 1.225295066833496, "logits/rejected": 1.1447144746780396, "logps/chosen": -9.052233695983887, "logps/rejected": -11.032278060913086, "loss": 21.8437, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.8185558319091797, "rewards/margins": 0.0942615494132042, "rewards/rejected": -0.9128173589706421, "step": 655 }, { "epoch": 0.5192761605035405, "grad_norm": 68.50457000732422, "learning_rate": 2.2494665955542128e-05, "logits/chosen": 1.5384838581085205, "logits/rejected": 1.455594778060913, "logps/chosen": -8.344881057739258, "logps/rejected": -9.565633773803711, "loss": 20.1429, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.7527450323104858, "rewards/margins": 0.09417356550693512, "rewards/rejected": -0.8469184637069702, "step": 660 }, { "epoch": 0.5232100708103855, "grad_norm": 82.57818603515625, "learning_rate": 2.2221730500976095e-05, "logits/chosen": 1.622971534729004, "logits/rejected": 1.5073456764221191, "logps/chosen": -8.968372344970703, "logps/rejected": -10.068008422851562, "loss": 23.63, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.8010263442993164, "rewards/margins": 0.061060793697834015, "rewards/rejected": -0.862087070941925, "step": 665 }, { "epoch": 0.5271439811172305, "grad_norm": 68.11051177978516, "learning_rate": 2.1948375449930918e-05, "logits/chosen": 1.7366511821746826, "logits/rejected": 1.4829221963882446, "logps/chosen": -8.090542793273926, "logps/rejected": -9.549524307250977, "loss": 19.8611, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.7041796445846558, "rewards/margins": 0.09810546785593033, "rewards/rejected": -0.8022850751876831, "step": 670 }, { "epoch": 0.5310778914240756, "grad_norm": 65.59056854248047, "learning_rate": 2.167465242829774e-05, "logits/chosen": 1.7103216648101807, "logits/rejected": 1.6612507104873657, "logps/chosen": -7.633252143859863, "logps/rejected": -9.149633407592773, "loss": 19.6635, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.678862988948822, "rewards/margins": 0.08954181522130966, "rewards/rejected": -0.7684048414230347, "step": 675 }, { "epoch": 0.5350118017309206, "grad_norm": 82.65924835205078, "learning_rate": 2.1400613131462697e-05, "logits/chosen": 1.8782835006713867, "logits/rejected": 1.8185780048370361, "logps/chosen": -8.152751922607422, "logps/rejected": -9.006729125976562, "loss": 23.3166, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -0.6963292360305786, "rewards/margins": 0.05258508399128914, "rewards/rejected": -0.7489142417907715, "step": 680 }, { "epoch": 0.5389457120377655, "grad_norm": 52.20330810546875, "learning_rate": 2.1126309314543712e-05, "logits/chosen": 2.1228325366973877, "logits/rejected": 1.9640865325927734, "logps/chosen": -7.649266242980957, "logps/rejected": -9.048944473266602, "loss": 18.8418, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.6748228669166565, "rewards/margins": 0.11289025843143463, "rewards/rejected": -0.7877130508422852, "step": 685 }, { "epoch": 0.5428796223446105, "grad_norm": 71.97002410888672, "learning_rate": 2.0851792782616055e-05, "logits/chosen": 2.075559377670288, "logits/rejected": 1.9426681995391846, "logps/chosen": -7.587338924407959, "logps/rejected": -8.79751968383789, "loss": 22.4863, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.6690691113471985, "rewards/margins": 0.0750223845243454, "rewards/rejected": -0.7440915107727051, "step": 690 }, { "epoch": 0.5468135326514555, "grad_norm": 72.18091583251953, "learning_rate": 2.0577115380928366e-05, "logits/chosen": 2.057607889175415, "logits/rejected": 1.905311942100525, "logps/chosen": -7.9985480308532715, "logps/rejected": -9.41818618774414, "loss": 19.677, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.6911519169807434, "rewards/margins": 0.11198244243860245, "rewards/rejected": -0.8031343221664429, "step": 695 }, { "epoch": 0.5507474429583006, "grad_norm": 58.584835052490234, "learning_rate": 2.0302328985111197e-05, "logits/chosen": 1.9190715551376343, "logits/rejected": 1.7894951105117798, "logps/chosen": -7.271109580993652, "logps/rejected": -8.35887336730957, "loss": 20.514, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.6410033702850342, "rewards/margins": 0.08995092660188675, "rewards/rejected": -0.7309542894363403, "step": 700 }, { "epoch": 0.5507474429583006, "eval_logits/chosen": 1.4903769493103027, "eval_logits/rejected": 1.3117529153823853, "eval_logps/chosen": -7.017483711242676, "eval_logps/rejected": -8.199769973754883, "eval_loss": 22.447879791259766, "eval_rewards/accuracies": 0.667187511920929, "eval_rewards/chosen": -0.5992282629013062, "eval_rewards/margins": 0.09620151668787003, "eval_rewards/rejected": -0.6954299211502075, "eval_runtime": 271.3259, "eval_samples_per_second": 2.359, "eval_steps_per_second": 0.147, "step": 700 }, { "epoch": 0.5546813532651456, "grad_norm": 53.3320426940918, "learning_rate": 2.0027485491379747e-05, "logits/chosen": 1.7803634405136108, "logits/rejected": 1.7076250314712524, "logps/chosen": -7.167619228363037, "logps/rejected": -8.508960723876953, "loss": 23.2853, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6200074553489685, "rewards/margins": 0.06713174283504486, "rewards/rejected": -0.6871392130851746, "step": 705 }, { "epoch": 0.5586152635719905, "grad_norm": 50.50627136230469, "learning_rate": 1.9752636806732742e-05, "logits/chosen": 1.3914200067520142, "logits/rejected": 1.207233190536499, "logps/chosen": -5.589415073394775, "logps/rejected": -7.128883361816406, "loss": 18.8527, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4748566746711731, "rewards/margins": 0.10387661308050156, "rewards/rejected": -0.5787333250045776, "step": 710 }, { "epoch": 0.5625491738788355, "grad_norm": 48.835845947265625, "learning_rate": 1.9477834839149278e-05, "logits/chosen": 1.0721666812896729, "logits/rejected": 0.9233131408691406, "logps/chosen": -5.5658769607543945, "logps/rejected": -7.002392768859863, "loss": 18.0462, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.46734148263931274, "rewards/margins": 0.12140637636184692, "rewards/rejected": -0.5887478590011597, "step": 715 }, { "epoch": 0.5664830841856806, "grad_norm": 90.04922485351562, "learning_rate": 1.9203131487785428e-05, "logits/chosen": 0.7360326051712036, "logits/rejected": 0.5881434679031372, "logps/chosen": -5.03780460357666, "logps/rejected": -5.854475498199463, "loss": 22.7303, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.4176352024078369, "rewards/margins": 0.06857079267501831, "rewards/rejected": -0.4862059950828552, "step": 720 }, { "epoch": 0.5704169944925256, "grad_norm": 38.25209045410156, "learning_rate": 1.8928578633172605e-05, "logits/chosen": 0.27817726135253906, "logits/rejected": 0.11374132335186005, "logps/chosen": -4.7726898193359375, "logps/rejected": -6.507106781005859, "loss": 18.5401, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3321690857410431, "rewards/margins": 0.12912718951702118, "rewards/rejected": -0.4612962603569031, "step": 725 }, { "epoch": 0.5743509047993706, "grad_norm": 63.3168830871582, "learning_rate": 1.8654228127419375e-05, "logits/chosen": 0.14177891612052917, "logits/rejected": 0.005235266871750355, "logps/chosen": -3.9255287647247314, "logps/rejected": -4.774594783782959, "loss": 22.0643, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.30579084157943726, "rewards/margins": 0.0727209746837616, "rewards/rejected": -0.37851184606552124, "step": 730 }, { "epoch": 0.5782848151062155, "grad_norm": 85.18032836914062, "learning_rate": 1.838013178441866e-05, "logits/chosen": 0.020315665751695633, "logits/rejected": -0.2219502180814743, "logps/chosen": -3.9957587718963623, "logps/rejected": -5.301746368408203, "loss": 19.3506, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3082605302333832, "rewards/margins": 0.10884448140859604, "rewards/rejected": -0.4171050190925598, "step": 735 }, { "epoch": 0.5822187254130606, "grad_norm": 51.162899017333984, "learning_rate": 1.810634137006213e-05, "logits/chosen": 0.07239419966936111, "logits/rejected": -0.12188796699047089, "logps/chosen": -3.5929577350616455, "logps/rejected": -4.78910493850708, "loss": 20.6575, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.2725752294063568, "rewards/margins": 0.10044431686401367, "rewards/rejected": -0.3730195164680481, "step": 740 }, { "epoch": 0.5861526357199056, "grad_norm": 92.04205322265625, "learning_rate": 1.7832908592463733e-05, "logits/chosen": 0.04120447859168053, "logits/rejected": -0.11522980034351349, "logps/chosen": -4.338095188140869, "logps/rejected": -5.353451251983643, "loss": 22.649, "rewards/accuracies": 0.625, "rewards/chosen": -0.34081414341926575, "rewards/margins": 0.08662021160125732, "rewards/rejected": -0.42743435502052307, "step": 745 }, { "epoch": 0.5900865460267506, "grad_norm": 116.44744873046875, "learning_rate": 1.755988509219406e-05, "logits/chosen": 0.23367616534233093, "logits/rejected": -0.08756458014249802, "logps/chosen": -4.6500091552734375, "logps/rejected": -5.806417465209961, "loss": 21.2043, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3509804606437683, "rewards/margins": 0.10566103458404541, "rewards/rejected": -0.4566414952278137, "step": 750 }, { "epoch": 0.5940204563335956, "grad_norm": 80.40803527832031, "learning_rate": 1.7287322432527488e-05, "logits/chosen": 0.2616182565689087, "logits/rejected": 0.0007495712488889694, "logps/chosen": -4.33780574798584, "logps/rejected": -5.812748432159424, "loss": 22.7481, "rewards/accuracies": 0.6875, "rewards/chosen": -0.346657931804657, "rewards/margins": 0.09951233118772507, "rewards/rejected": -0.44617027044296265, "step": 755 }, { "epoch": 0.5979543666404405, "grad_norm": 55.723270416259766, "learning_rate": 1.7015272089703957e-05, "logits/chosen": 0.28436246514320374, "logits/rejected": 0.11714988946914673, "logps/chosen": -5.426861763000488, "logps/rejected": -6.6721086502075195, "loss": 19.9111, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.43509548902511597, "rewards/margins": 0.09244825690984726, "rewards/rejected": -0.527543842792511, "step": 760 }, { "epoch": 0.6018882769472856, "grad_norm": 57.092342376708984, "learning_rate": 1.6743785443207143e-05, "logits/chosen": 0.41758331656455994, "logits/rejected": 0.11633528769016266, "logps/chosen": -4.913158416748047, "logps/rejected": -6.685948848724365, "loss": 17.5309, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -0.37690046429634094, "rewards/margins": 0.1555865854024887, "rewards/rejected": -0.5324870347976685, "step": 765 }, { "epoch": 0.6058221872541306, "grad_norm": 52.56325149536133, "learning_rate": 1.6472913766060902e-05, "logits/chosen": 0.4455360770225525, "logits/rejected": 0.2686857581138611, "logps/chosen": -5.640677452087402, "logps/rejected": -6.480513572692871, "loss": 22.17, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4680722653865814, "rewards/margins": 0.06571700423955917, "rewards/rejected": -0.5337892770767212, "step": 770 }, { "epoch": 0.6097560975609756, "grad_norm": 97.6286392211914, "learning_rate": 1.6202708215145872e-05, "logits/chosen": 0.5606172680854797, "logits/rejected": 0.4327201247215271, "logps/chosen": -6.182036876678467, "logps/rejected": -7.41018533706665, "loss": 20.9249, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5261003375053406, "rewards/margins": 0.10036258399486542, "rewards/rejected": -0.6264629364013672, "step": 775 }, { "epoch": 0.6136900078678206, "grad_norm": 52.04305648803711, "learning_rate": 1.5933219821537954e-05, "logits/chosen": 0.5304365754127502, "logits/rejected": 0.35703176259994507, "logps/chosen": -6.3372802734375, "logps/rejected": -7.8030829429626465, "loss": 21.2755, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.5311385989189148, "rewards/margins": 0.08466647565364838, "rewards/rejected": -0.6158050298690796, "step": 780 }, { "epoch": 0.6176239181746657, "grad_norm": 102.9618148803711, "learning_rate": 1.566449948087054e-05, "logits/chosen": 0.5141326785087585, "logits/rejected": 0.36102861166000366, "logps/chosen": -6.295104026794434, "logps/rejected": -8.318052291870117, "loss": 20.8088, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5228041410446167, "rewards/margins": 0.10621719062328339, "rewards/rejected": -0.6290213465690613, "step": 785 }, { "epoch": 0.6215578284815106, "grad_norm": 68.3493423461914, "learning_rate": 1.5396597943722432e-05, "logits/chosen": 0.6660643815994263, "logits/rejected": 0.5413907766342163, "logps/chosen": -6.168806552886963, "logps/rejected": -7.278306484222412, "loss": 22.7281, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.49865293502807617, "rewards/margins": 0.07277169823646545, "rewards/rejected": -0.571424663066864, "step": 790 }, { "epoch": 0.6254917387883556, "grad_norm": 82.17771911621094, "learning_rate": 1.512956580603299e-05, "logits/chosen": 0.7329293489456177, "logits/rejected": 0.5695565938949585, "logps/chosen": -6.647820949554443, "logps/rejected": -7.977785587310791, "loss": 22.417, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5436100959777832, "rewards/margins": 0.09925737231969833, "rewards/rejected": -0.6428674459457397, "step": 795 }, { "epoch": 0.6294256490952006, "grad_norm": 46.18910598754883, "learning_rate": 1.4863453499546645e-05, "logits/chosen": 0.8801663517951965, "logits/rejected": 0.7344497442245483, "logps/chosen": -5.812638282775879, "logps/rejected": -7.028628349304199, "loss": 19.7149, "rewards/accuracies": 0.8125, "rewards/chosen": -0.4914124608039856, "rewards/margins": 0.10450112819671631, "rewards/rejected": -0.5959135890007019, "step": 800 }, { "epoch": 0.6294256490952006, "eval_logits/chosen": 0.3036060333251953, "eval_logits/rejected": 0.1599578559398651, "eval_logps/chosen": -6.097789287567139, "eval_logps/rejected": -7.161267280578613, "eval_loss": 21.823678970336914, "eval_rewards/accuracies": 0.6781250238418579, "eval_rewards/chosen": -0.5072587728500366, "eval_rewards/margins": 0.08432072401046753, "eval_rewards/rejected": -0.5915795564651489, "eval_runtime": 269.2822, "eval_samples_per_second": 2.377, "eval_steps_per_second": 0.149, "step": 800 }, { "epoch": 0.6333595594020456, "grad_norm": 59.289424896240234, "learning_rate": 1.4598311282288303e-05, "logits/chosen": 0.9243080019950867, "logits/rejected": 0.7870502471923828, "logps/chosen": -6.291727542877197, "logps/rejected": -6.99709939956665, "loss": 23.6839, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.5228357315063477, "rewards/margins": 0.0496952123939991, "rewards/rejected": -0.5725310444831848, "step": 805 }, { "epoch": 0.6372934697088907, "grad_norm": 77.22916412353516, "learning_rate": 1.4334189229071616e-05, "logits/chosen": 0.9643747210502625, "logits/rejected": 0.8659998774528503, "logps/chosen": -6.07599401473999, "logps/rejected": -6.853678226470947, "loss": 22.2309, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5087305307388306, "rewards/margins": 0.06423305720090866, "rewards/rejected": -0.572963535785675, "step": 810 }, { "epoch": 0.6412273800157356, "grad_norm": 59.69563674926758, "learning_rate": 1.4071137222041853e-05, "logits/chosen": 1.0996719598770142, "logits/rejected": 1.134555459022522, "logps/chosen": -6.2107062339782715, "logps/rejected": -6.798120021820068, "loss": 24.029, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5292443037033081, "rewards/margins": 0.04952489584684372, "rewards/rejected": -0.578769326210022, "step": 815 }, { "epoch": 0.6451612903225806, "grad_norm": 60.09773254394531, "learning_rate": 1.3809204941255145e-05, "logits/chosen": 1.1656619310379028, "logits/rejected": 1.024043083190918, "logps/chosen": -6.1883721351623535, "logps/rejected": -7.462551116943359, "loss": 19.7327, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5342612862586975, "rewards/margins": 0.09449473023414612, "rewards/rejected": -0.6287559270858765, "step": 820 }, { "epoch": 0.6490952006294256, "grad_norm": 41.9356689453125, "learning_rate": 1.3548441855295875e-05, "logits/chosen": 1.4191844463348389, "logits/rejected": 1.1811176538467407, "logps/chosen": -6.1646833419799805, "logps/rejected": -7.954948425292969, "loss": 19.8497, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5303637385368347, "rewards/margins": 0.10388074070215225, "rewards/rejected": -0.634244441986084, "step": 825 }, { "epoch": 0.6530291109362707, "grad_norm": 48.11675262451172, "learning_rate": 1.3288897211934068e-05, "logits/chosen": 1.2117464542388916, "logits/rejected": 1.0153982639312744, "logps/chosen": -6.489108085632324, "logps/rejected": -7.619096279144287, "loss": 21.6481, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5611577033996582, "rewards/margins": 0.08501636981964111, "rewards/rejected": -0.6461740732192993, "step": 830 }, { "epoch": 0.6569630212431157, "grad_norm": 49.94951248168945, "learning_rate": 1.3030620028824426e-05, "logits/chosen": 1.3170408010482788, "logits/rejected": 1.1979198455810547, "logps/chosen": -6.271700382232666, "logps/rejected": -7.738451957702637, "loss": 21.4144, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.5391649603843689, "rewards/margins": 0.10710246860980988, "rewards/rejected": -0.6462674736976624, "step": 835 }, { "epoch": 0.6608969315499607, "grad_norm": 71.95092010498047, "learning_rate": 1.2773659084248847e-05, "logits/chosen": 1.2662818431854248, "logits/rejected": 1.0763803720474243, "logps/chosen": -6.253358364105225, "logps/rejected": -7.545995235443115, "loss": 20.9379, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5418421030044556, "rewards/margins": 0.0893731564283371, "rewards/rejected": -0.6312152147293091, "step": 840 }, { "epoch": 0.6648308418568056, "grad_norm": 68.2335205078125, "learning_rate": 1.2518062907904139e-05, "logits/chosen": 1.0089311599731445, "logits/rejected": 0.9394065141677856, "logps/chosen": -6.143620014190674, "logps/rejected": -6.944084167480469, "loss": 22.3766, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5299480557441711, "rewards/margins": 0.06761987507343292, "rewards/rejected": -0.5975678563117981, "step": 845 }, { "epoch": 0.6687647521636507, "grad_norm": 57.40378189086914, "learning_rate": 1.2263879771736715e-05, "logits/chosen": 1.062596321105957, "logits/rejected": 0.9756869077682495, "logps/chosen": -6.35455322265625, "logps/rejected": -7.452897071838379, "loss": 21.0019, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.5414422750473022, "rewards/margins": 0.0917770117521286, "rewards/rejected": -0.6332192420959473, "step": 850 }, { "epoch": 0.6726986624704957, "grad_norm": 126.5318374633789, "learning_rate": 1.2011157680825928e-05, "logits/chosen": 1.1028703451156616, "logits/rejected": 0.9885491132736206, "logps/chosen": -6.448549747467041, "logps/rejected": -7.387596130371094, "loss": 23.4204, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.552506148815155, "rewards/margins": 0.06099040061235428, "rewards/rejected": -0.6134966015815735, "step": 855 }, { "epoch": 0.6766325727773407, "grad_norm": 47.99146270751953, "learning_rate": 1.1759944364317813e-05, "logits/chosen": 1.2233279943466187, "logits/rejected": 1.0970909595489502, "logps/chosen": -6.800168037414551, "logps/rejected": -7.861006259918213, "loss": 20.2209, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.5629354119300842, "rewards/margins": 0.08497332781553268, "rewards/rejected": -0.6479086875915527, "step": 860 }, { "epoch": 0.6805664830841857, "grad_norm": 45.71125793457031, "learning_rate": 1.151028726641097e-05, "logits/chosen": 1.211646318435669, "logits/rejected": 1.0901798009872437, "logps/chosen": -6.566044807434082, "logps/rejected": -7.516194820404053, "loss": 23.8428, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5404679775238037, "rewards/margins": 0.05825378745794296, "rewards/rejected": -0.5987217426300049, "step": 865 }, { "epoch": 0.6845003933910306, "grad_norm": 197.55665588378906, "learning_rate": 1.126223353739623e-05, "logits/chosen": 1.3044805526733398, "logits/rejected": 1.134115219116211, "logps/chosen": -5.714540958404541, "logps/rejected": -6.580451965332031, "loss": 20.3569, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4907086491584778, "rewards/margins": 0.07508612424135208, "rewards/rejected": -0.5657947659492493, "step": 870 }, { "epoch": 0.6884343036978757, "grad_norm": 57.10478210449219, "learning_rate": 1.1015830024751855e-05, "logits/chosen": 1.2777565717697144, "logits/rejected": 1.1199305057525635, "logps/chosen": -5.988950252532959, "logps/rejected": -7.0798020362854, "loss": 21.1749, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5039869546890259, "rewards/margins": 0.08334285765886307, "rewards/rejected": -0.5873297452926636, "step": 875 }, { "epoch": 0.6923682140047207, "grad_norm": 81.65518188476562, "learning_rate": 1.0771123264295898e-05, "logits/chosen": 1.3681554794311523, "logits/rejected": 1.1427236795425415, "logps/chosen": -5.568511486053467, "logps/rejected": -6.888806343078613, "loss": 21.5826, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.4716271460056305, "rewards/margins": 0.09705589711666107, "rewards/rejected": -0.5686829686164856, "step": 880 }, { "epoch": 0.6963021243115657, "grad_norm": 51.81953811645508, "learning_rate": 1.0528159471397425e-05, "logits/chosen": 1.4481117725372314, "logits/rejected": 1.2932265996932983, "logps/chosen": -5.606583595275879, "logps/rejected": -6.986734867095947, "loss": 20.0762, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.46911725401878357, "rewards/margins": 0.10834591090679169, "rewards/rejected": -0.5774631500244141, "step": 885 }, { "epoch": 0.7002360346184107, "grad_norm": 63.05445098876953, "learning_rate": 1.0286984532248327e-05, "logits/chosen": 1.2815773487091064, "logits/rejected": 1.1918110847473145, "logps/chosen": -6.13564920425415, "logps/rejected": -7.042695045471191, "loss": 23.5487, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.5119949579238892, "rewards/margins": 0.050577979534864426, "rewards/rejected": -0.5625730156898499, "step": 890 }, { "epoch": 0.7041699449252558, "grad_norm": 95.1924057006836, "learning_rate": 1.004764399519718e-05, "logits/chosen": 1.4892219305038452, "logits/rejected": 1.1956894397735596, "logps/chosen": -5.164662837982178, "logps/rejected": -6.872800350189209, "loss": 18.8206, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -0.4294440746307373, "rewards/margins": 0.1322946548461914, "rewards/rejected": -0.5617387294769287, "step": 895 }, { "epoch": 0.7081038552321007, "grad_norm": 71.83849334716797, "learning_rate": 9.81018306214702e-06, "logits/chosen": 1.2133238315582275, "logits/rejected": 1.130614995956421, "logps/chosen": -5.295372009277344, "logps/rejected": -6.396082878112793, "loss": 21.4856, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4392309784889221, "rewards/margins": 0.06895321607589722, "rewards/rejected": -0.5081842541694641, "step": 900 }, { "epoch": 0.7081038552321007, "eval_logits/chosen": 0.8757205009460449, "eval_logits/rejected": 0.7193918824195862, "eval_logps/chosen": -5.373657703399658, "eval_logps/rejected": -6.414725303649902, "eval_loss": 21.844945907592773, "eval_rewards/accuracies": 0.682812511920929, "eval_rewards/chosen": -0.43484562635421753, "eval_rewards/margins": 0.08207974582910538, "eval_rewards/rejected": -0.5169254541397095, "eval_runtime": 270.5677, "eval_samples_per_second": 2.365, "eval_steps_per_second": 0.148, "step": 900 }, { "epoch": 0.7120377655389457, "grad_norm": 56.13790512084961, "learning_rate": 9.574646580018483e-06, "logits/chosen": 1.3937456607818604, "logits/rejected": 1.1991198062896729, "logps/chosen": -5.508763313293457, "logps/rejected": -7.044470310211182, "loss": 20.5192, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.439894437789917, "rewards/margins": 0.10662362724542618, "rewards/rejected": -0.5465181469917297, "step": 905 }, { "epoch": 0.7159716758457907, "grad_norm": 58.68037796020508, "learning_rate": 9.341079032279987e-06, "logits/chosen": 1.3311818838119507, "logits/rejected": 1.1792255640029907, "logps/chosen": -4.972262382507324, "logps/rejected": -6.138205528259277, "loss": 20.7723, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3926599621772766, "rewards/margins": 0.0938471108675003, "rewards/rejected": -0.4865070879459381, "step": 910 }, { "epoch": 0.7199055861526357, "grad_norm": 60.50275802612305, "learning_rate": 9.109524530546622e-06, "logits/chosen": 1.4016880989074707, "logits/rejected": 1.1274712085723877, "logps/chosen": -5.319563865661621, "logps/rejected": -6.878640174865723, "loss": 18.247, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.442908376455307, "rewards/margins": 0.12117187678813934, "rewards/rejected": -0.5640802979469299, "step": 915 }, { "epoch": 0.7238394964594808, "grad_norm": 51.57741165161133, "learning_rate": 8.880026806249194e-06, "logits/chosen": 1.2216346263885498, "logits/rejected": 1.1520761251449585, "logps/chosen": -5.210920333862305, "logps/rejected": -6.013674736022949, "loss": 22.637, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4350380003452301, "rewards/margins": 0.06822977215051651, "rewards/rejected": -0.503267765045166, "step": 920 }, { "epoch": 0.7277734067663257, "grad_norm": 113.00042724609375, "learning_rate": 8.652629202375075e-06, "logits/chosen": 1.1651921272277832, "logits/rejected": 1.0577062368392944, "logps/chosen": -5.723333835601807, "logps/rejected": -7.094163417816162, "loss": 21.495, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4707266390323639, "rewards/margins": 0.0914454534649849, "rewards/rejected": -0.5621720552444458, "step": 925 }, { "epoch": 0.7317073170731707, "grad_norm": 48.148902893066406, "learning_rate": 8.427374665282488e-06, "logits/chosen": 1.2118332386016846, "logits/rejected": 1.0361649990081787, "logps/chosen": -5.214700222015381, "logps/rejected": -6.316381931304932, "loss": 21.8246, "rewards/accuracies": 0.75, "rewards/chosen": -0.4387596547603607, "rewards/margins": 0.07641210407018661, "rewards/rejected": -0.5151717066764832, "step": 930 }, { "epoch": 0.7356412273800157, "grad_norm": 55.0938835144043, "learning_rate": 8.204305736589613e-06, "logits/chosen": 1.1594150066375732, "logits/rejected": 0.9741013646125793, "logps/chosen": -6.014735221862793, "logps/rejected": -7.042943000793457, "loss": 24.2135, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.5000636577606201, "rewards/margins": 0.07266019284725189, "rewards/rejected": -0.5727238655090332, "step": 935 }, { "epoch": 0.7395751376868608, "grad_norm": 54.356346130371094, "learning_rate": 7.98346454514018e-06, "logits/chosen": 1.2832156419754028, "logits/rejected": 1.0834671258926392, "logps/chosen": -5.47364616394043, "logps/rejected": -6.59670352935791, "loss": 20.1251, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4491247236728668, "rewards/margins": 0.0896775871515274, "rewards/rejected": -0.5388022661209106, "step": 940 }, { "epoch": 0.7435090479937058, "grad_norm": 45.92910385131836, "learning_rate": 7.764892799047005e-06, "logits/chosen": 1.2785322666168213, "logits/rejected": 1.157806158065796, "logps/chosen": -5.310911178588867, "logps/rejected": -6.4970197677612305, "loss": 22.7858, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.44885730743408203, "rewards/margins": 0.09017422050237656, "rewards/rejected": -0.5390315055847168, "step": 945 }, { "epoch": 0.7474429583005507, "grad_norm": 59.7811393737793, "learning_rate": 7.548631777814996e-06, "logits/chosen": 1.3047298192977905, "logits/rejected": 1.089996099472046, "logps/chosen": -5.754390239715576, "logps/rejected": -6.797402858734131, "loss": 20.7907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4849552512168884, "rewards/margins": 0.07941243052482605, "rewards/rejected": -0.5643676519393921, "step": 950 }, { "epoch": 0.7513768686073957, "grad_norm": 40.247764587402344, "learning_rate": 7.334722324545065e-06, "logits/chosen": 1.2972372770309448, "logits/rejected": 1.0613635778427124, "logps/chosen": -5.219810485839844, "logps/rejected": -6.806240081787109, "loss": 22.302, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4275331497192383, "rewards/margins": 0.08785694092512131, "rewards/rejected": -0.5153900980949402, "step": 955 }, { "epoch": 0.7553107789142408, "grad_norm": 46.78326416015625, "learning_rate": 7.123204838220534e-06, "logits/chosen": 1.3515903949737549, "logits/rejected": 1.223388910293579, "logps/chosen": -5.440323829650879, "logps/rejected": -6.528807163238525, "loss": 20.2834, "rewards/accuracies": 0.625, "rewards/chosen": -0.45285138487815857, "rewards/margins": 0.08332471549510956, "rewards/rejected": -0.5361760854721069, "step": 960 }, { "epoch": 0.7592446892210858, "grad_norm": 40.53936767578125, "learning_rate": 6.914119266077355e-06, "logits/chosen": 1.3715136051177979, "logits/rejected": 1.172968864440918, "logps/chosen": -5.467780590057373, "logps/rejected": -6.763506889343262, "loss": 20.8753, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4584290385246277, "rewards/margins": 0.08680377900600433, "rewards/rejected": -0.5452328324317932, "step": 965 }, { "epoch": 0.7631785995279308, "grad_norm": 49.12062454223633, "learning_rate": 6.707505096059663e-06, "logits/chosen": 1.3483985662460327, "logits/rejected": 1.2560744285583496, "logps/chosen": -5.542896747589111, "logps/rejected": -6.603106498718262, "loss": 21.623, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.43749743700027466, "rewards/margins": 0.08478393405675888, "rewards/rejected": -0.5222813487052917, "step": 970 }, { "epoch": 0.7671125098347757, "grad_norm": 88.77938079833984, "learning_rate": 6.503401349362084e-06, "logits/chosen": 1.2470347881317139, "logits/rejected": 1.128948450088501, "logps/chosen": -5.131396293640137, "logps/rejected": -6.444796085357666, "loss": 22.2456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4255082607269287, "rewards/margins": 0.08464544266462326, "rewards/rejected": -0.5101537704467773, "step": 975 }, { "epoch": 0.7710464201416207, "grad_norm": 43.77740478515625, "learning_rate": 6.301846573060177e-06, "logits/chosen": 1.3094112873077393, "logits/rejected": 1.1933720111846924, "logps/chosen": -4.36563777923584, "logps/rejected": -5.267557621002197, "loss": 20.8653, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35090750455856323, "rewards/margins": 0.0695296972990036, "rewards/rejected": -0.4204372465610504, "step": 980 }, { "epoch": 0.7749803304484658, "grad_norm": 51.79399871826172, "learning_rate": 6.102878832830432e-06, "logits/chosen": 1.196919560432434, "logits/rejected": 1.0146095752716064, "logps/chosen": -5.188233852386475, "logps/rejected": -6.542181491851807, "loss": 19.3124, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.4062345027923584, "rewards/margins": 0.10461841523647308, "rewards/rejected": -0.5108529925346375, "step": 985 }, { "epoch": 0.7789142407553108, "grad_norm": 55.651023864746094, "learning_rate": 5.90653570576116e-06, "logits/chosen": 1.1126521825790405, "logits/rejected": 0.9780498743057251, "logps/chosen": -5.8701982498168945, "logps/rejected": -6.6015424728393555, "loss": 21.2432, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4567411541938782, "rewards/margins": 0.07785584777593613, "rewards/rejected": -0.5345970392227173, "step": 990 }, { "epoch": 0.7828481510621558, "grad_norm": 41.18940353393555, "learning_rate": 5.712854273255708e-06, "logits/chosen": 1.3726425170898438, "logits/rejected": 1.0831149816513062, "logps/chosen": -4.694300651550293, "logps/rejected": -6.523576259613037, "loss": 18.0565, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -0.38610920310020447, "rewards/margins": 0.1167839914560318, "rewards/rejected": -0.5028932094573975, "step": 995 }, { "epoch": 0.7867820613690008, "grad_norm": 55.95176696777344, "learning_rate": 5.521871114029233e-06, "logits/chosen": 1.1702280044555664, "logits/rejected": 0.8772756457328796, "logps/chosen": -4.847783088684082, "logps/rejected": -6.067271709442139, "loss": 20.8844, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.3914787769317627, "rewards/margins": 0.09186731278896332, "rewards/rejected": -0.4833460748195648, "step": 1000 }, { "epoch": 0.7867820613690008, "eval_logits/chosen": 0.7199884653091431, "eval_logits/rejected": 0.5619722604751587, "eval_logps/chosen": -5.075138092041016, "eval_logps/rejected": -6.114271640777588, "eval_loss": 21.61347198486328, "eval_rewards/accuracies": 0.676562488079071, "eval_rewards/chosen": -0.4049936830997467, "eval_rewards/margins": 0.08188632875680923, "eval_rewards/rejected": -0.4868800640106201, "eval_runtime": 272.6784, "eval_samples_per_second": 2.347, "eval_steps_per_second": 0.147, "step": 1000 }, { "epoch": 0.7907159716758458, "grad_norm": 50.42679214477539, "learning_rate": 5.3336222972004494e-06, "logits/chosen": 1.0932950973510742, "logits/rejected": 1.0363296270370483, "logps/chosen": -5.089923858642578, "logps/rejected": -5.958308219909668, "loss": 22.5933, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.4146384596824646, "rewards/margins": 0.0728868916630745, "rewards/rejected": -0.4875253140926361, "step": 1005 }, { "epoch": 0.7946498819826908, "grad_norm": 50.056846618652344, "learning_rate": 5.148143375479602e-06, "logits/chosen": 1.0465943813323975, "logits/rejected": 0.7968215346336365, "logps/chosen": -5.255601406097412, "logps/rejected": -6.39734411239624, "loss": 19.5322, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.43599051237106323, "rewards/margins": 0.08602426201105118, "rewards/rejected": -0.5220147967338562, "step": 1010 }, { "epoch": 0.7985837922895358, "grad_norm": 63.85953140258789, "learning_rate": 4.96546937845398e-06, "logits/chosen": 0.9397533535957336, "logits/rejected": 0.6560163497924805, "logps/chosen": -5.794540882110596, "logps/rejected": -7.656335353851318, "loss": 18.5173, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.4594026207923889, "rewards/margins": 0.11265122890472412, "rewards/rejected": -0.5720537900924683, "step": 1015 }, { "epoch": 0.8025177025963808, "grad_norm": 42.84744644165039, "learning_rate": 4.785634805972201e-06, "logits/chosen": 0.8836447596549988, "logits/rejected": 0.6324716806411743, "logps/chosen": -5.117056369781494, "logps/rejected": -6.833028316497803, "loss": 17.5394, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.4234439730644226, "rewards/margins": 0.13116177916526794, "rewards/rejected": -0.5546057224273682, "step": 1020 }, { "epoch": 0.8064516129032258, "grad_norm": 47.17292404174805, "learning_rate": 4.60867362162861e-06, "logits/chosen": 0.8913745880126953, "logits/rejected": 0.7286295294761658, "logps/chosen": -5.609414577484131, "logps/rejected": -6.968409061431885, "loss": 18.6049, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.46564167737960815, "rewards/margins": 0.107306107878685, "rewards/rejected": -0.5729478001594543, "step": 1025 }, { "epoch": 0.8103855232100708, "grad_norm": 72.6835708618164, "learning_rate": 4.434619246348843e-06, "logits/chosen": 1.0678660869598389, "logits/rejected": 0.8571092486381531, "logps/chosen": -5.482309818267822, "logps/rejected": -6.8230743408203125, "loss": 21.0093, "rewards/accuracies": 0.625, "rewards/chosen": -0.4621756076812744, "rewards/margins": 0.09944047778844833, "rewards/rejected": -0.561616063117981, "step": 1030 }, { "epoch": 0.8143194335169158, "grad_norm": 63.443233489990234, "learning_rate": 4.263504552078004e-06, "logits/chosen": 0.8602091073989868, "logits/rejected": 0.7144955396652222, "logps/chosen": -5.040487289428711, "logps/rejected": -6.352028846740723, "loss": 21.8699, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.42594534158706665, "rewards/margins": 0.08123664557933807, "rewards/rejected": -0.5071820020675659, "step": 1035 }, { "epoch": 0.8182533438237608, "grad_norm": 100.11986541748047, "learning_rate": 4.095361855572431e-06, "logits/chosen": 0.6691738367080688, "logits/rejected": 0.5654880404472351, "logps/chosen": -5.479603290557861, "logps/rejected": -6.441510200500488, "loss": 23.1216, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.45797911286354065, "rewards/margins": 0.0781431570649147, "rewards/rejected": -0.53612220287323, "step": 1040 }, { "epoch": 0.8221872541306058, "grad_norm": 60.643890380859375, "learning_rate": 3.9302229122963465e-06, "logits/chosen": 0.8846467733383179, "logits/rejected": 0.6783546209335327, "logps/chosen": -6.16463565826416, "logps/rejected": -6.873899936676025, "loss": 25.3631, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4791037440299988, "rewards/margins": 0.06751880794763565, "rewards/rejected": -0.5466224551200867, "step": 1045 }, { "epoch": 0.8261211644374509, "grad_norm": 43.71245574951172, "learning_rate": 3.768118910424532e-06, "logits/chosen": 0.9008040428161621, "logits/rejected": 0.6831108331680298, "logps/chosen": -5.510120868682861, "logps/rejected": -6.556514739990234, "loss": 22.7596, "rewards/accuracies": 0.625, "rewards/chosen": -0.4446142315864563, "rewards/margins": 0.0855989158153534, "rewards/rejected": -0.5302131175994873, "step": 1050 }, { "epoch": 0.8300550747442959, "grad_norm": 62.271480560302734, "learning_rate": 3.6090804649521037e-06, "logits/chosen": 0.7602671384811401, "logits/rejected": 0.7593709826469421, "logps/chosen": -5.140334606170654, "logps/rejected": -5.595780372619629, "loss": 26.1903, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.42375484108924866, "rewards/margins": 0.03734329715371132, "rewards/rejected": -0.4610981345176697, "step": 1055 }, { "epoch": 0.8339889850511408, "grad_norm": 55.21813201904297, "learning_rate": 3.4531376119125605e-06, "logits/chosen": 0.9315360188484192, "logits/rejected": 0.9035407900810242, "logps/chosen": -4.700660705566406, "logps/rejected": -5.618612766265869, "loss": 22.3302, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3840516209602356, "rewards/margins": 0.06768553704023361, "rewards/rejected": -0.45173710584640503, "step": 1060 }, { "epoch": 0.8379228953579858, "grad_norm": 82.28722381591797, "learning_rate": 3.3003198027051897e-06, "logits/chosen": 0.8791500329971313, "logits/rejected": 0.7902938723564148, "logps/chosen": -5.500949382781982, "logps/rejected": -6.393127918243408, "loss": 22.1454, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.424252986907959, "rewards/margins": 0.07516753673553467, "rewards/rejected": -0.49942055344581604, "step": 1065 }, { "epoch": 0.8418568056648308, "grad_norm": 58.98760223388672, "learning_rate": 3.150655898532853e-06, "logits/chosen": 0.9076460003852844, "logits/rejected": 0.7676557302474976, "logps/chosen": -5.2781171798706055, "logps/rejected": -6.154453754425049, "loss": 21.2363, "rewards/accuracies": 0.625, "rewards/chosen": -0.4085540771484375, "rewards/margins": 0.07775183767080307, "rewards/rejected": -0.4863058924674988, "step": 1070 }, { "epoch": 0.8457907159716759, "grad_norm": 50.36076354980469, "learning_rate": 3.004174164951259e-06, "logits/chosen": 0.8667048215866089, "logits/rejected": 0.7412594556808472, "logps/chosen": -5.391347408294678, "logps/rejected": -6.3621954917907715, "loss": 23.4492, "rewards/accuracies": 0.625, "rewards/chosen": -0.4194518029689789, "rewards/margins": 0.05420858785510063, "rewards/rejected": -0.4736603796482086, "step": 1075 }, { "epoch": 0.8497246262785209, "grad_norm": 44.32093048095703, "learning_rate": 2.860902266530723e-06, "logits/chosen": 0.846762478351593, "logits/rejected": 0.6929168701171875, "logps/chosen": -5.092324256896973, "logps/rejected": -5.855961799621582, "loss": 23.0306, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.41406020522117615, "rewards/margins": 0.05717035382986069, "rewards/rejected": -0.47123056650161743, "step": 1080 }, { "epoch": 0.8536585365853658, "grad_norm": 57.81728744506836, "learning_rate": 2.7208672616314345e-06, "logits/chosen": 0.8338634371757507, "logits/rejected": 0.731313169002533, "logps/chosen": -5.652237415313721, "logps/rejected": -6.4818902015686035, "loss": 23.1995, "rewards/accuracies": 0.625, "rewards/chosen": -0.4492163062095642, "rewards/margins": 0.0547863133251667, "rewards/rejected": -0.5040026903152466, "step": 1085 }, { "epoch": 0.8575924468922108, "grad_norm": 60.14872360229492, "learning_rate": 2.58409559729321e-06, "logits/chosen": 0.8530842065811157, "logits/rejected": 0.546228289604187, "logps/chosen": -5.260161399841309, "logps/rejected": -6.7824811935424805, "loss": 18.7626, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.4305374026298523, "rewards/margins": 0.10589683055877686, "rewards/rejected": -0.5364342331886292, "step": 1090 }, { "epoch": 0.8615263571990559, "grad_norm": 63.61619186401367, "learning_rate": 2.4506131042406844e-06, "logits/chosen": 0.8177973628044128, "logits/rejected": 0.547761082649231, "logps/chosen": -5.860104560852051, "logps/rejected": -6.826254367828369, "loss": 21.7568, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.44510722160339355, "rewards/margins": 0.07002703845500946, "rewards/rejected": -0.5151342153549194, "step": 1095 }, { "epoch": 0.8654602675059009, "grad_norm": 63.3447151184082, "learning_rate": 2.3204449920049378e-06, "logits/chosen": 0.9175206422805786, "logits/rejected": 0.7014715671539307, "logps/chosen": -4.718405723571777, "logps/rejected": -6.031472682952881, "loss": 21.5778, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3723874092102051, "rewards/margins": 0.08356385678052902, "rewards/rejected": -0.4559513032436371, "step": 1100 }, { "epoch": 0.8654602675059009, "eval_logits/chosen": 0.5780611038208008, "eval_logits/rejected": 0.40807875990867615, "eval_logps/chosen": -4.8285369873046875, "eval_logps/rejected": -5.842423439025879, "eval_loss": 21.6467342376709, "eval_rewards/accuracies": 0.668749988079071, "eval_rewards/chosen": -0.3803336024284363, "eval_rewards/margins": 0.07936159521341324, "eval_rewards/rejected": -0.4596951901912689, "eval_runtime": 276.4407, "eval_samples_per_second": 2.315, "eval_steps_per_second": 0.145, "step": 1100 }, { "epoch": 0.8693941778127459, "grad_norm": 54.00641632080078, "learning_rate": 2.1936158441624113e-06, "logits/chosen": 0.7385736107826233, "logits/rejected": 0.6566623449325562, "logps/chosen": -5.136672019958496, "logps/rejected": -5.858384609222412, "loss": 23.5285, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4080820679664612, "rewards/margins": 0.05175456404685974, "rewards/rejected": -0.4598366320133209, "step": 1105 }, { "epoch": 0.8733280881195908, "grad_norm": 63.94253921508789, "learning_rate": 2.070149613692032e-06, "logits/chosen": 0.8581029772758484, "logits/rejected": 0.6364805102348328, "logps/chosen": -5.164943695068359, "logps/rejected": -5.946603298187256, "loss": 23.1104, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.4210405945777893, "rewards/margins": 0.06227627396583557, "rewards/rejected": -0.4833168387413025, "step": 1110 }, { "epoch": 0.8772619984264359, "grad_norm": 44.03200149536133, "learning_rate": 1.9500696184514735e-06, "logits/chosen": 0.813498318195343, "logits/rejected": 0.7083032131195068, "logps/chosen": -4.999638557434082, "logps/rejected": -5.514034271240234, "loss": 23.8753, "rewards/accuracies": 0.625, "rewards/chosen": -0.3863885700702667, "rewards/margins": 0.060605116188526154, "rewards/rejected": -0.4469936788082123, "step": 1115 }, { "epoch": 0.8811959087332809, "grad_norm": 91.94346618652344, "learning_rate": 1.8333985367733208e-06, "logits/chosen": 0.6693506240844727, "logits/rejected": 0.6530565023422241, "logps/chosen": -4.876978874206543, "logps/rejected": -5.703049659729004, "loss": 25.4636, "rewards/accuracies": 0.5625, "rewards/chosen": -0.3906908929347992, "rewards/margins": 0.038804419338703156, "rewards/rejected": -0.42949533462524414, "step": 1120 }, { "epoch": 0.8851298190401259, "grad_norm": 43.88142395019531, "learning_rate": 1.7201584031820418e-06, "logits/chosen": 0.7986466884613037, "logits/rejected": 0.637027382850647, "logps/chosen": -4.8898234367370605, "logps/rejected": -6.180974960327148, "loss": 20.0693, "rewards/accuracies": 0.625, "rewards/chosen": -0.3792203366756439, "rewards/margins": 0.09974167495965958, "rewards/rejected": -0.4789620339870453, "step": 1125 }, { "epoch": 0.8890637293469709, "grad_norm": 137.21253967285156, "learning_rate": 1.610370604232543e-06, "logits/chosen": 0.9146040678024292, "logits/rejected": 0.7076241970062256, "logps/chosen": -4.883869647979736, "logps/rejected": -5.884066104888916, "loss": 21.7767, "rewards/accuracies": 0.625, "rewards/chosen": -0.3782455325126648, "rewards/margins": 0.05688385292887688, "rewards/rejected": -0.4351293444633484, "step": 1130 }, { "epoch": 0.8929976396538158, "grad_norm": 64.58161163330078, "learning_rate": 1.5040558744711087e-06, "logits/chosen": 0.7862271070480347, "logits/rejected": 0.6925013065338135, "logps/chosen": -4.76452112197876, "logps/rejected": -5.642160892486572, "loss": 22.3309, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.3690032362937927, "rewards/margins": 0.06053303927183151, "rewards/rejected": -0.42953628301620483, "step": 1135 }, { "epoch": 0.8969315499606609, "grad_norm": 61.42625045776367, "learning_rate": 1.4012342925194532e-06, "logits/chosen": 0.9055964350700378, "logits/rejected": 0.7526835799217224, "logps/chosen": -4.5060014724731445, "logps/rejected": -5.722702980041504, "loss": 20.1537, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3619564175605774, "rewards/margins": 0.09387658536434174, "rewards/rejected": -0.4558330178260803, "step": 1140 }, { "epoch": 0.9008654602675059, "grad_norm": 55.103858947753906, "learning_rate": 1.3019252772826874e-06, "logits/chosen": 0.8664646148681641, "logits/rejected": 0.6308975219726562, "logps/chosen": -4.155823230743408, "logps/rejected": -5.087547302246094, "loss": 21.2004, "rewards/accuracies": 0.625, "rewards/chosen": -0.3356073200702667, "rewards/margins": 0.06966021656990051, "rewards/rejected": -0.40526753664016724, "step": 1145 }, { "epoch": 0.9047993705743509, "grad_norm": 36.31183624267578, "learning_rate": 1.2061475842818337e-06, "logits/chosen": 0.8728748559951782, "logits/rejected": 0.8325087428092957, "logps/chosen": -4.736401557922363, "logps/rejected": -5.618078708648682, "loss": 22.6537, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.37614908814430237, "rewards/margins": 0.07038898766040802, "rewards/rejected": -0.4465380609035492, "step": 1150 }, { "epoch": 0.9087332808811959, "grad_norm": 66.35440826416016, "learning_rate": 1.1139193021116878e-06, "logits/chosen": 0.9341157674789429, "logits/rejected": 0.9482291340827942, "logps/chosen": -4.348454475402832, "logps/rejected": -4.784420967102051, "loss": 25.3501, "rewards/accuracies": 0.5625, "rewards/chosen": -0.34729093313217163, "rewards/margins": 0.020721841603517532, "rewards/rejected": -0.36801275610923767, "step": 1155 }, { "epoch": 0.912667191188041, "grad_norm": 89.98934173583984, "learning_rate": 1.0252578490245812e-06, "logits/chosen": 0.9573014974594116, "logits/rejected": 0.7914093136787415, "logps/chosen": -4.558495044708252, "logps/rejected": -5.923836708068848, "loss": 21.659, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3659791052341461, "rewards/margins": 0.07632460445165634, "rewards/rejected": -0.44230371713638306, "step": 1160 }, { "epoch": 0.9166011014948859, "grad_norm": 65.70526885986328, "learning_rate": 9.401799696407643e-07, "logits/chosen": 0.8820127248764038, "logits/rejected": 0.5988653898239136, "logps/chosen": -4.5979132652282715, "logps/rejected": -5.598001003265381, "loss": 20.0052, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3443650007247925, "rewards/margins": 0.09784060716629028, "rewards/rejected": -0.4422055780887604, "step": 1165 }, { "epoch": 0.9205350118017309, "grad_norm": 47.08201599121094, "learning_rate": 8.587017317860291e-07, "logits/chosen": 0.9893172979354858, "logits/rejected": 0.7521982192993164, "logps/chosen": -3.797804355621338, "logps/rejected": -5.014548301696777, "loss": 19.899, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.30206161737442017, "rewards/margins": 0.09998033195734024, "rewards/rejected": -0.4020419120788574, "step": 1170 }, { "epoch": 0.9244689221085759, "grad_norm": 44.627437591552734, "learning_rate": 7.808385234571303e-07, "logits/chosen": 0.9069494009017944, "logits/rejected": 0.6808138489723206, "logps/chosen": -4.21389627456665, "logps/rejected": -5.213305473327637, "loss": 20.0447, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -0.33665183186531067, "rewards/margins": 0.0863773375749588, "rewards/rejected": -0.4230291247367859, "step": 1175 }, { "epoch": 0.9284028324154209, "grad_norm": 223.38873291015625, "learning_rate": 7.066050499155941e-07, "logits/chosen": 0.866726279258728, "logits/rejected": 0.7506811022758484, "logps/chosen": -4.992801189422607, "logps/rejected": -5.7411932945251465, "loss": 21.8626, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.38969671726226807, "rewards/margins": 0.07754883170127869, "rewards/rejected": -0.46724551916122437, "step": 1180 }, { "epoch": 0.932336742722266, "grad_norm": 39.60309600830078, "learning_rate": 6.360153309104999e-07, "logits/chosen": 0.8703472018241882, "logits/rejected": 0.6217884421348572, "logps/chosen": -4.574381351470947, "logps/rejected": -6.075875759124756, "loss": 17.0961, "rewards/accuracies": 0.75, "rewards/chosen": -0.36664003133773804, "rewards/margins": 0.116389200091362, "rewards/rejected": -0.48302921652793884, "step": 1185 }, { "epoch": 0.9362706530291109, "grad_norm": 44.12582015991211, "learning_rate": 5.690826980306851e-07, "logits/chosen": 0.8729127645492554, "logits/rejected": 0.6510919332504272, "logps/chosen": -4.34609842300415, "logps/rejected": -5.6441850662231445, "loss": 21.2825, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3452923893928528, "rewards/margins": 0.08857759088277817, "rewards/rejected": -0.43387001752853394, "step": 1190 }, { "epoch": 0.9402045633359559, "grad_norm": 53.13776779174805, "learning_rate": 5.058197921869568e-07, "logits/chosen": 0.9216381311416626, "logits/rejected": 0.6996389031410217, "logps/chosen": -5.07051420211792, "logps/rejected": -6.018683433532715, "loss": 21.8654, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.38140612840652466, "rewards/margins": 0.07850027084350586, "rewards/rejected": -0.4599063992500305, "step": 1195 }, { "epoch": 0.9441384736428009, "grad_norm": 41.1729736328125, "learning_rate": 4.4623856122471665e-07, "logits/chosen": 0.7972432971000671, "logits/rejected": 0.6558005213737488, "logps/chosen": -4.346493721008301, "logps/rejected": -5.412367343902588, "loss": 20.3207, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.3416510224342346, "rewards/margins": 0.08416789770126343, "rewards/rejected": -0.42581886053085327, "step": 1200 }, { "epoch": 0.9441384736428009, "eval_logits/chosen": 0.6238431334495544, "eval_logits/rejected": 0.4534227252006531, "eval_logps/chosen": -4.653068542480469, "eval_logps/rejected": -5.671896934509277, "eval_loss": 21.683839797973633, "eval_rewards/accuracies": 0.671875, "eval_rewards/chosen": -0.3627867102622986, "eval_rewards/margins": 0.07985583692789078, "eval_rewards/rejected": -0.44264253973960876, "eval_runtime": 271.153, "eval_samples_per_second": 2.36, "eval_steps_per_second": 0.148, "step": 1200 }, { "epoch": 0.948072383949646, "grad_norm": 89.22261810302734, "learning_rate": 3.9035025766749333e-07, "logits/chosen": 1.065003752708435, "logits/rejected": 0.8652560114860535, "logps/chosen": -4.475849151611328, "logps/rejected": -5.141944885253906, "loss": 23.3289, "rewards/accuracies": 0.625, "rewards/chosen": -0.355093777179718, "rewards/margins": 0.06265915930271149, "rewards/rejected": -0.4177529215812683, "step": 1205 }, { "epoch": 0.952006294256491, "grad_norm": 54.871742248535156, "learning_rate": 3.381654365917864e-07, "logits/chosen": 0.9329290390014648, "logits/rejected": 0.725328803062439, "logps/chosen": -4.456474304199219, "logps/rejected": -5.94174337387085, "loss": 21.236, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.35311684012413025, "rewards/margins": 0.08815713971853256, "rewards/rejected": -0.4412739872932434, "step": 1210 }, { "epoch": 0.955940204563336, "grad_norm": 50.6146354675293, "learning_rate": 2.896939536336296e-07, "logits/chosen": 0.9757564663887024, "logits/rejected": 0.7466751933097839, "logps/chosen": -3.781454563140869, "logps/rejected": -4.780596733093262, "loss": 20.805, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.30329400300979614, "rewards/margins": 0.0837683156132698, "rewards/rejected": -0.38706234097480774, "step": 1215 }, { "epoch": 0.9598741148701809, "grad_norm": 60.844093322753906, "learning_rate": 2.449449631272605e-07, "logits/chosen": 0.8028494715690613, "logits/rejected": 0.6643998622894287, "logps/chosen": -5.040999412536621, "logps/rejected": -6.319550037384033, "loss": 20.5501, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.38902026414871216, "rewards/margins": 0.09444370120763779, "rewards/rejected": -0.48346394300460815, "step": 1220 }, { "epoch": 0.963808025177026, "grad_norm": 48.02137756347656, "learning_rate": 2.0392691637622698e-07, "logits/chosen": 0.9626764059066772, "logits/rejected": 0.7285498380661011, "logps/chosen": -4.898202896118164, "logps/rejected": -5.569176197052002, "loss": 23.1671, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.37996894121170044, "rewards/margins": 0.05454573780298233, "rewards/rejected": -0.43451470136642456, "step": 1225 }, { "epoch": 0.967741935483871, "grad_norm": 78.87560272216797, "learning_rate": 1.666475600572648e-07, "logits/chosen": 0.9195537567138672, "logits/rejected": 0.7668181657791138, "logps/chosen": -4.843678951263428, "logps/rejected": -6.038217067718506, "loss": 21.6436, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -0.3704824447631836, "rewards/margins": 0.08607770502567291, "rewards/rejected": -0.4565601348876953, "step": 1230 }, { "epoch": 0.971675845790716, "grad_norm": 42.323211669921875, "learning_rate": 1.331139347572763e-07, "logits/chosen": 0.8599262237548828, "logits/rejected": 0.6428951025009155, "logps/chosen": -5.1760969161987305, "logps/rejected": -6.792412757873535, "loss": 19.5088, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -0.3994407057762146, "rewards/margins": 0.11559662967920303, "rewards/rejected": -0.5150374174118042, "step": 1235 }, { "epoch": 0.975609756097561, "grad_norm": 45.8914909362793, "learning_rate": 1.033323736436298e-07, "logits/chosen": 0.9099301099777222, "logits/rejected": 0.8088103532791138, "logps/chosen": -4.615214824676514, "logps/rejected": -5.559386253356934, "loss": 21.5362, "rewards/accuracies": 0.625, "rewards/chosen": -0.37754908204078674, "rewards/margins": 0.07683036476373672, "rewards/rejected": -0.4543794095516205, "step": 1240 }, { "epoch": 0.9795436664044059, "grad_norm": 46.62201690673828, "learning_rate": 7.730850126807854e-08, "logits/chosen": 0.9434062838554382, "logits/rejected": 0.7752779126167297, "logps/chosen": -4.551201820373535, "logps/rejected": -5.316429615020752, "loss": 22.0776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.3628130853176117, "rewards/margins": 0.0656280368566513, "rewards/rejected": -0.4284411370754242, "step": 1245 }, { "epoch": 0.983477576711251, "grad_norm": 48.4689826965332, "learning_rate": 5.5047232504505943e-08, "logits/chosen": 0.8963427543640137, "logits/rejected": 0.684687077999115, "logps/chosen": -4.659503936767578, "logps/rejected": -6.174535751342773, "loss": 19.0895, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -0.37634724378585815, "rewards/margins": 0.10511218011379242, "rewards/rejected": -0.48145943880081177, "step": 1250 }, { "epoch": 0.987411487018096, "grad_norm": 55.87615203857422, "learning_rate": 3.655277162071258e-08, "logits/chosen": 0.7668878436088562, "logits/rejected": 0.6688386797904968, "logps/chosen": -5.46780252456665, "logps/rejected": -6.352745056152344, "loss": 24.3925, "rewards/accuracies": 0.574999988079071, "rewards/chosen": -0.40356582403182983, "rewards/margins": 0.07050136476755142, "rewards/rejected": -0.47406721115112305, "step": 1255 }, { "epoch": 0.991345397324941, "grad_norm": 76.68724822998047, "learning_rate": 2.1828611484377983e-08, "logits/chosen": 0.9311445355415344, "logits/rejected": 0.7878313064575195, "logps/chosen": -4.927813529968262, "logps/rejected": -6.390625476837158, "loss": 21.7847, "rewards/accuracies": 0.6875, "rewards/chosen": -0.40384235978126526, "rewards/margins": 0.09595279395580292, "rewards/rejected": -0.499795138835907, "step": 1260 }, { "epoch": 0.995279307631786, "grad_norm": 47.24720764160156, "learning_rate": 1.0877532903414979e-08, "logits/chosen": 0.7036594152450562, "logits/rejected": 0.573765754699707, "logps/chosen": -5.013037204742432, "logps/rejected": -6.5157151222229, "loss": 21.7697, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": -0.4053846001625061, "rewards/margins": 0.09475921839475632, "rewards/rejected": -0.500143826007843, "step": 1265 }, { "epoch": 0.999213217938631, "grad_norm": 195.9378662109375, "learning_rate": 3.7016041007742474e-09, "logits/chosen": 0.9408377408981323, "logits/rejected": 0.719150185585022, "logps/chosen": -4.878444194793701, "logps/rejected": -6.0785698890686035, "loss": 20.698, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -0.3925797641277313, "rewards/margins": 0.07607986032962799, "rewards/rejected": -0.4686596393585205, "step": 1270 }, { "epoch": 1.0, "step": 1271, "total_flos": 0.0, "train_loss": 21.99049959550403, "train_runtime": 22427.3619, "train_samples_per_second": 0.906, "train_steps_per_second": 0.057 } ], "logging_steps": 5, "max_steps": 1271, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }