{ "best_metric": null, "best_model_checkpoint": null, "epoch": 4.0, "eval_steps": 100, "global_step": 1540, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8663302659988403, "logits/rejected": -1.870653748512268, "logps/chosen": -36.98862075805664, "logps/rejected": -33.65410232543945, "loss": 0.4999, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.00017976858362089843, "rewards/margins": 0.0004200442635919899, "rewards/rejected": -0.00024027563631534576, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.997056245803833, "logits/rejected": -1.9996883869171143, "logps/chosen": -29.644180297851562, "logps/rejected": -29.042306900024414, "loss": 0.5, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -1.9853896446875297e-05, "rewards/margins": -0.00023122904531192034, "rewards/rejected": 0.00021137515432201326, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9208399057388306, "logits/rejected": -1.9181444644927979, "logps/chosen": -31.4141902923584, "logps/rejected": -33.234039306640625, "loss": 0.5, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 1.908373087644577e-05, "rewards/margins": 0.0001748828508425504, "rewards/rejected": -0.00015579909086227417, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.016510009765625, "logits/rejected": -2.007784843444824, "logps/chosen": -32.554359436035156, "logps/rejected": -32.493995666503906, "loss": 0.5, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00022449034440796822, "rewards/margins": 3.7771409552078694e-05, "rewards/rejected": 0.00018671892757993191, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8620290756225586, "logits/rejected": -1.851264238357544, "logps/chosen": -33.51964569091797, "logps/rejected": -35.407535552978516, "loss": 0.5, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.000378346536308527, "rewards/margins": -6.972800474613905e-05, "rewards/rejected": 0.0004480745701584965, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.941019058227539, "logits/rejected": -1.9429725408554077, "logps/chosen": -32.50640869140625, "logps/rejected": -33.16156768798828, "loss": 0.4998, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0009377357782796025, "rewards/margins": 0.000839641026686877, "rewards/rejected": 9.809464972931892e-05, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.071539878845215, "logits/rejected": -2.0764715671539307, "logps/chosen": -33.93183898925781, "logps/rejected": -36.525550842285156, "loss": 0.5, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.000566466711461544, "rewards/margins": 0.0001332084066234529, "rewards/rejected": 0.0004332582466304302, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9310013055801392, "logits/rejected": -1.9341026544570923, "logps/chosen": -34.225975036621094, "logps/rejected": -34.53061294555664, "loss": 0.4997, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.0020183573942631483, "rewards/margins": 0.0014004094991832972, "rewards/rejected": 0.0006179477786645293, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9398448467254639, "logits/rejected": -1.9443466663360596, "logps/chosen": -32.252197265625, "logps/rejected": -32.24930953979492, "loss": 0.4997, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0021522401366382837, "rewards/margins": 0.0010562599636614323, "rewards/rejected": 0.0010959801729768515, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.037635564804077, "logits/rejected": -2.0356411933898926, "logps/chosen": -31.99556541442871, "logps/rejected": -31.139856338500977, "loss": 0.4997, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0024526813067495823, "rewards/margins": 0.0011891307076439261, "rewards/rejected": 0.0012635505991056561, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.232027292251587, "eval_logits/rejected": -2.227186441421509, "eval_logps/chosen": -33.85300064086914, "eval_logps/rejected": -37.352054595947266, "eval_loss": 0.49996307492256165, "eval_rewards/accuracies": 0.5182723999023438, "eval_rewards/chosen": 0.0018155159195885062, "eval_rewards/margins": 0.00016987840353976935, "eval_rewards/rejected": 0.0016456374432891607, "eval_runtime": 145.752, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9933750629425049, "logits/rejected": -1.9910228252410889, "logps/chosen": -32.92595291137695, "logps/rejected": -33.842830657958984, "loss": 0.4996, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0031786723993718624, "rewards/margins": 0.0013132576132193208, "rewards/rejected": 0.0018654146697372198, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.003373861312866, "logits/rejected": -1.9950469732284546, "logps/chosen": -32.12664031982422, "logps/rejected": -31.956974029541016, "loss": 0.4997, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0031826242338865995, "rewards/margins": 0.0011372944572940469, "rewards/rejected": 0.002045330125838518, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.0322792530059814, "logits/rejected": -2.024336099624634, "logps/chosen": -30.118215560913086, "logps/rejected": -31.88967514038086, "loss": 0.4995, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0035750004462897778, "rewards/margins": 0.0019416653085500002, "rewards/rejected": 0.0016333358362317085, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9619420766830444, "logits/rejected": -1.9721254110336304, "logps/chosen": -31.056041717529297, "logps/rejected": -32.40135955810547, "loss": 0.4994, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0036362011451274157, "rewards/margins": 0.002301006345078349, "rewards/rejected": 0.0013351945672184229, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.873915433883667, "logits/rejected": -1.8750746250152588, "logps/chosen": -33.66141891479492, "logps/rejected": -34.53316116333008, "loss": 0.4993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.005312725435942411, "rewards/margins": 0.003071808721870184, "rewards/rejected": 0.002240917179733515, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9251092672348022, "logits/rejected": -1.921754240989685, "logps/chosen": -35.78556823730469, "logps/rejected": -32.478919982910156, "loss": 0.4996, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0038611083291471004, "rewards/margins": 0.0014227699721232057, "rewards/rejected": 0.0024383387062698603, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0260913372039795, "logits/rejected": -2.0187911987304688, "logps/chosen": -33.21813201904297, "logps/rejected": -31.20585060119629, "loss": 0.499, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.005797281861305237, "rewards/margins": 0.00433726841583848, "rewards/rejected": 0.0014600132126361132, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0327439308166504, "logits/rejected": -2.0379390716552734, "logps/chosen": -31.954986572265625, "logps/rejected": -32.14731979370117, "loss": 0.4994, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.00589799415320158, "rewards/margins": 0.0025197656359523535, "rewards/rejected": 0.0033782287500798702, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.0335724353790283, "logits/rejected": -2.0308480262756348, "logps/chosen": -31.060842514038086, "logps/rejected": -31.11257553100586, "loss": 0.4995, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004281100817024708, "rewards/margins": 0.002224702388048172, "rewards/rejected": 0.0020563979633152485, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.9044897556304932, "logits/rejected": -1.9091441631317139, "logps/chosen": -31.081249237060547, "logps/rejected": -32.595829010009766, "loss": 0.4992, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0052503314800560474, "rewards/margins": 0.0032992898486554623, "rewards/rejected": 0.0019510419806465507, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.2288246154785156, "eval_logits/rejected": -2.2240161895751953, "eval_logps/chosen": -33.74216842651367, "eval_logps/rejected": -37.281192779541016, "eval_loss": 0.49986347556114197, "eval_rewards/accuracies": 0.5747508406639099, "eval_rewards/chosen": 0.0029238576535135508, "eval_rewards/margins": 0.0005695598665624857, "eval_rewards/rejected": 0.002354297786951065, "eval_runtime": 145.8032, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.016956090927124, "logits/rejected": -2.0275344848632812, "logps/chosen": -31.490520477294922, "logps/rejected": -33.689552307128906, "loss": 0.4993, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004807817749679089, "rewards/margins": 0.003076353808864951, "rewards/rejected": 0.0017314634751528502, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.9092109203338623, "logits/rejected": -1.9238770008087158, "logps/chosen": -29.577035903930664, "logps/rejected": -31.389028549194336, "loss": 0.4991, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00534503348171711, "rewards/margins": 0.003653537482023239, "rewards/rejected": 0.0016914959996938705, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9660460948944092, "logits/rejected": -1.9700311422348022, "logps/chosen": -32.787933349609375, "logps/rejected": -31.389507293701172, "loss": 0.4988, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006295348517596722, "rewards/margins": 0.004697396419942379, "rewards/rejected": 0.001597951864823699, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9643852710723877, "logits/rejected": -1.9426368474960327, "logps/chosen": -33.608482360839844, "logps/rejected": -34.86387252807617, "loss": 0.4989, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.005452706944197416, "rewards/margins": 0.004583484493196011, "rewards/rejected": 0.0008692230330780149, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -2.006474256515503, "logits/rejected": -2.003183364868164, "logps/chosen": -32.441749572753906, "logps/rejected": -35.984519958496094, "loss": 0.4994, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.004833144135773182, "rewards/margins": 0.00242913281545043, "rewards/rejected": 0.002404011320322752, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8745994567871094, "logits/rejected": -1.8721377849578857, "logps/chosen": -33.70234298706055, "logps/rejected": -35.24811553955078, "loss": 0.4995, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.004860556218773127, "rewards/margins": 0.002295448211953044, "rewards/rejected": 0.0025651075411587954, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8601783514022827, "logits/rejected": -1.8576805591583252, "logps/chosen": -33.903480529785156, "logps/rejected": -31.573394775390625, "loss": 0.4993, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.0049412003718316555, "rewards/margins": 0.0029618421103805304, "rewards/rejected": 0.001979358261451125, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.963313341140747, "logits/rejected": -1.9528875350952148, "logps/chosen": -34.740501403808594, "logps/rejected": -31.602115631103516, "loss": 0.4991, "rewards/accuracies": 0.6875, "rewards/chosen": 0.006161376368254423, "rewards/margins": 0.0037725958973169327, "rewards/rejected": 0.002388780238106847, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.058793544769287, "logits/rejected": -2.043962240219116, "logps/chosen": -30.4116268157959, "logps/rejected": -32.341819763183594, "loss": 0.4995, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.005124981049448252, "rewards/margins": 0.0020247932989150286, "rewards/rejected": 0.003100187750533223, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9307619333267212, "logits/rejected": -1.9283145666122437, "logps/chosen": -32.04631042480469, "logps/rejected": -30.6721134185791, "loss": 0.4982, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.008808004669845104, "rewards/margins": 0.007361026015132666, "rewards/rejected": 0.0014469798188656569, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2289085388183594, "eval_logits/rejected": -2.2240960597991943, "eval_logps/chosen": -33.75447463989258, "eval_logps/rejected": -37.274105072021484, "eval_loss": 0.499908983707428, "eval_rewards/accuracies": 0.5041528344154358, "eval_rewards/chosen": 0.0028007798828184605, "eval_rewards/margins": 0.00037563726073130965, "eval_rewards/rejected": 0.00242514256387949, "eval_runtime": 145.6422, "eval_samples_per_second": 2.355, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 4.84533120650964e-06, "logits/chosen": -1.914014458656311, "logits/rejected": -1.9108896255493164, "logps/chosen": -31.016780853271484, "logps/rejected": -33.5390739440918, "loss": 0.4991, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.005865688901394606, "rewards/margins": 0.0038349279202520847, "rewards/rejected": 0.002030761446803808, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.825108134172131e-06, "logits/chosen": -1.9622209072113037, "logits/rejected": -1.9501073360443115, "logps/chosen": -33.98822784423828, "logps/rejected": -33.42522048950195, "loss": 0.4988, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0060784295201301575, "rewards/margins": 0.00494221830740571, "rewards/rejected": 0.0011362109798938036, "step": 320 }, { "epoch": 0.86, "learning_rate": 4.80369052967602e-06, "logits/chosen": -1.997842788696289, "logits/rejected": -1.9965225458145142, "logps/chosen": -32.7388801574707, "logps/rejected": -32.22772216796875, "loss": 0.4988, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.00746917212381959, "rewards/margins": 0.0049505168572068214, "rewards/rejected": 0.002518654800951481, "step": 330 }, { "epoch": 0.88, "learning_rate": 4.781089396387968e-06, "logits/chosen": -2.0849790573120117, "logits/rejected": -2.06939435005188, "logps/chosen": -33.35835266113281, "logps/rejected": -32.73654556274414, "loss": 0.4989, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.008210408501327038, "rewards/margins": 0.004395507741719484, "rewards/rejected": 0.0038149021565914154, "step": 340 }, { "epoch": 0.91, "learning_rate": 4.757316345716554e-06, "logits/chosen": -1.9578001499176025, "logits/rejected": -1.9570014476776123, "logps/chosen": -32.38352966308594, "logps/rejected": -32.16341781616211, "loss": 0.4985, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.008985674008727074, "rewards/margins": 0.005968388635665178, "rewards/rejected": 0.0030172846745699644, "step": 350 }, { "epoch": 0.94, "learning_rate": 4.73238359114687e-06, "logits/chosen": -1.9111387729644775, "logits/rejected": -1.9213663339614868, "logps/chosen": -31.330036163330078, "logps/rejected": -34.964256286621094, "loss": 0.4986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.008746545761823654, "rewards/margins": 0.005501560866832733, "rewards/rejected": 0.003244984894990921, "step": 360 }, { "epoch": 0.96, "learning_rate": 4.706303941965804e-06, "logits/chosen": -2.0449702739715576, "logits/rejected": -2.0385282039642334, "logps/chosen": -32.85245132446289, "logps/rejected": -28.954187393188477, "loss": 0.4987, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0079951835796237, "rewards/margins": 0.005272808950394392, "rewards/rejected": 0.002722373930737376, "step": 370 }, { "epoch": 0.99, "learning_rate": 4.679090796681225e-06, "logits/chosen": -1.901015281677246, "logits/rejected": -1.9031999111175537, "logps/chosen": -33.20323944091797, "logps/rejected": -30.677053451538086, "loss": 0.498, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.010358055122196674, "rewards/margins": 0.00840230192989111, "rewards/rejected": 0.001955752493813634, "step": 380 }, { "epoch": 1.01, "learning_rate": 4.650758136138454e-06, "logits/chosen": -1.9289333820343018, "logits/rejected": -1.9276466369628906, "logps/chosen": -33.32128143310547, "logps/rejected": -35.63185119628906, "loss": 0.4975, "rewards/accuracies": 0.7583333253860474, "rewards/chosen": 0.01063510961830616, "rewards/margins": 0.010393550619482994, "rewards/rejected": 0.00024155918799806386, "step": 390 }, { "epoch": 1.04, "learning_rate": 4.621320516337559e-06, "logits/chosen": -1.863186240196228, "logits/rejected": -1.8549854755401611, "logps/chosen": -30.498249053955078, "logps/rejected": -36.088096618652344, "loss": 0.4967, "rewards/accuracies": 0.875, "rewards/chosen": 0.012757278978824615, "rewards/margins": 0.014232242479920387, "rewards/rejected": -0.001474961405619979, "step": 400 }, { "epoch": 1.04, "eval_logits/chosen": -2.2094738483428955, "eval_logits/rejected": -2.2046518325805664, "eval_logps/chosen": -33.672157287597656, "eval_logps/rejected": -37.215171813964844, "eval_loss": 0.4998500943183899, "eval_rewards/accuracies": 0.5622923374176025, "eval_rewards/chosen": 0.0036239090841263533, "eval_rewards/margins": 0.0006094546988606453, "eval_rewards/rejected": 0.003014454385265708, "eval_runtime": 146.2447, "eval_samples_per_second": 2.345, "eval_steps_per_second": 0.294, "step": 400 }, { "epoch": 1.06, "learning_rate": 4.590793060955158e-06, "logits/chosen": -2.0367393493652344, "logits/rejected": -2.0395472049713135, "logps/chosen": -31.686859130859375, "logps/rejected": -34.91094970703125, "loss": 0.4966, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.013670777902007103, "rewards/margins": 0.0148518281057477, "rewards/rejected": -0.0011810490395873785, "step": 410 }, { "epoch": 1.09, "learning_rate": 4.559191453574582e-06, "logits/chosen": -1.8766273260116577, "logits/rejected": -1.8752492666244507, "logps/chosen": -27.93256187438965, "logps/rejected": -32.38922882080078, "loss": 0.4973, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.011349949985742569, "rewards/margins": 0.012065614573657513, "rewards/rejected": -0.0007156648789532483, "step": 420 }, { "epoch": 1.12, "learning_rate": 4.52653192962838e-06, "logits/chosen": -1.834745168685913, "logits/rejected": -1.8277431726455688, "logps/chosen": -32.58226013183594, "logps/rejected": -34.0352783203125, "loss": 0.4969, "rewards/accuracies": 0.9375, "rewards/chosen": 0.014918235130608082, "rewards/margins": 0.012818296439945698, "rewards/rejected": 0.0020999389234930277, "step": 430 }, { "epoch": 1.14, "learning_rate": 4.492831268057307e-06, "logits/chosen": -2.005305767059326, "logits/rejected": -2.0002236366271973, "logps/chosen": -30.26675033569336, "logps/rejected": -32.066688537597656, "loss": 0.4962, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.015165433287620544, "rewards/margins": 0.01615620031952858, "rewards/rejected": -0.0009907669154927135, "step": 440 }, { "epoch": 1.17, "learning_rate": 4.458106782690094e-06, "logits/chosen": -1.8884214162826538, "logits/rejected": -1.892564058303833, "logps/chosen": -32.953224182128906, "logps/rejected": -32.72943878173828, "loss": 0.4961, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.01562627963721752, "rewards/margins": 0.017034271731972694, "rewards/rejected": -0.0014079909306019545, "step": 450 }, { "epoch": 1.19, "learning_rate": 4.422376313348405e-06, "logits/chosen": -1.8940017223358154, "logits/rejected": -1.888357162475586, "logps/chosen": -33.757293701171875, "logps/rejected": -35.25668716430664, "loss": 0.4956, "rewards/accuracies": 0.875, "rewards/chosen": 0.016622314229607582, "rewards/margins": 0.01969819888472557, "rewards/rejected": -0.003075886517763138, "step": 460 }, { "epoch": 1.22, "learning_rate": 4.3856582166815696e-06, "logits/chosen": -1.9210342168807983, "logits/rejected": -1.920659065246582, "logps/chosen": -32.557655334472656, "logps/rejected": -34.18804168701172, "loss": 0.4962, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.01610674150288105, "rewards/margins": 0.01613234356045723, "rewards/rejected": -2.560380380600691e-05, "step": 470 }, { "epoch": 1.25, "learning_rate": 4.347971356735789e-06, "logits/chosen": -1.9694734811782837, "logits/rejected": -1.9510208368301392, "logps/chosen": -32.425758361816406, "logps/rejected": -33.256771087646484, "loss": 0.4956, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.01735007017850876, "rewards/margins": 0.01921800896525383, "rewards/rejected": -0.001867939019575715, "step": 480 }, { "epoch": 1.27, "learning_rate": 4.309335095262675e-06, "logits/chosen": -1.934520959854126, "logits/rejected": -1.934033751487732, "logps/chosen": -29.948516845703125, "logps/rejected": -31.202651977539062, "loss": 0.4961, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.016627881675958633, "rewards/margins": 0.016193937510252, "rewards/rejected": 0.0004339427687227726, "step": 490 }, { "epoch": 1.3, "learning_rate": 4.269769281772082e-06, "logits/chosen": -1.8985036611557007, "logits/rejected": -1.891710638999939, "logps/chosen": -30.8389892578125, "logps/rejected": -34.76033020019531, "loss": 0.4954, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.018311362713575363, "rewards/margins": 0.01964724436402321, "rewards/rejected": -0.0013358818832784891, "step": 500 }, { "epoch": 1.3, "eval_logits/chosen": -2.1807026863098145, "eval_logits/rejected": -2.1759257316589355, "eval_logps/chosen": -33.619056701660156, "eval_logps/rejected": -37.18906021118164, "eval_loss": 0.4997919797897339, "eval_rewards/accuracies": 0.5772424936294556, "eval_rewards/chosen": 0.0041549475863575935, "eval_rewards/margins": 0.0008793265442363918, "eval_rewards/rejected": 0.003275620751082897, "eval_runtime": 145.8925, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 500 }, { "epoch": 1.32, "learning_rate": 4.22929424333435e-06, "logits/chosen": -1.892907738685608, "logits/rejected": -1.8967831134796143, "logps/chosen": -27.704700469970703, "logps/rejected": -32.9903450012207, "loss": 0.4963, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.014701254665851593, "rewards/margins": 0.016011826694011688, "rewards/rejected": -0.001310571446083486, "step": 510 }, { "epoch": 1.35, "learning_rate": 4.1879307741372085e-06, "logits/chosen": -1.8976653814315796, "logits/rejected": -1.9082101583480835, "logps/chosen": -31.580432891845703, "logps/rejected": -30.9519100189209, "loss": 0.4955, "rewards/accuracies": 0.875, "rewards/chosen": 0.016818998381495476, "rewards/margins": 0.01982123777270317, "rewards/rejected": -0.003002240788191557, "step": 520 }, { "epoch": 1.38, "learning_rate": 4.145700124802693e-06, "logits/chosen": -1.8399875164031982, "logits/rejected": -1.8377044200897217, "logps/chosen": -29.908676147460938, "logps/rejected": -30.362207412719727, "loss": 0.4955, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.01758403517305851, "rewards/margins": 0.019438493996858597, "rewards/rejected": -0.0018544571939855814, "step": 530 }, { "epoch": 1.4, "learning_rate": 4.102623991469562e-06, "logits/chosen": -1.920372724533081, "logits/rejected": -1.9134242534637451, "logps/chosen": -32.44926071166992, "logps/rejected": -33.200767517089844, "loss": 0.4955, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017835671082139015, "rewards/margins": 0.019668519496917725, "rewards/rejected": -0.0018328496953472495, "step": 540 }, { "epoch": 1.43, "learning_rate": 4.058724504646834e-06, "logits/chosen": -1.885063886642456, "logits/rejected": -1.8914177417755127, "logps/chosen": -30.19329261779785, "logps/rejected": -32.799842834472656, "loss": 0.4963, "rewards/accuracies": 0.8125, "rewards/chosen": 0.015598910860717297, "rewards/margins": 0.01600998267531395, "rewards/rejected": -0.0004110717272851616, "step": 550 }, { "epoch": 1.45, "learning_rate": 4.014024217844167e-06, "logits/chosen": -1.9558032751083374, "logits/rejected": -1.9331070184707642, "logps/chosen": -29.753009796142578, "logps/rejected": -33.04769515991211, "loss": 0.4959, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.0168455857783556, "rewards/margins": 0.017361946403980255, "rewards/rejected": -0.0005163621390238404, "step": 560 }, { "epoch": 1.48, "learning_rate": 3.968546095984911e-06, "logits/chosen": -1.8903181552886963, "logits/rejected": -1.8854230642318726, "logps/chosen": -30.62668228149414, "logps/rejected": -32.05100631713867, "loss": 0.4959, "rewards/accuracies": 0.8125, "rewards/chosen": 0.01790049858391285, "rewards/margins": 0.01720140501856804, "rewards/rejected": 0.0006990955444052815, "step": 570 }, { "epoch": 1.51, "learning_rate": 3.922313503607806e-06, "logits/chosen": -1.930395483970642, "logits/rejected": -1.932220220565796, "logps/chosen": -32.589778900146484, "logps/rejected": -35.16767883300781, "loss": 0.4952, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017860129475593567, "rewards/margins": 0.021527757868170738, "rewards/rejected": -0.0036676295567303896, "step": 580 }, { "epoch": 1.53, "learning_rate": 3.875350192863368e-06, "logits/chosen": -1.9129533767700195, "logits/rejected": -1.9123092889785767, "logps/chosen": -28.789093017578125, "logps/rejected": -31.536624908447266, "loss": 0.4956, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.018029652535915375, "rewards/margins": 0.01877954974770546, "rewards/rejected": -0.0007498954655602574, "step": 590 }, { "epoch": 1.56, "learning_rate": 3.8276802913111436e-06, "logits/chosen": -1.9223308563232422, "logits/rejected": -1.9202518463134766, "logps/chosen": -31.292776107788086, "logps/rejected": -32.150634765625, "loss": 0.4964, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.017055068165063858, "rewards/margins": 0.015561839565634727, "rewards/rejected": 0.0014932285994291306, "step": 600 }, { "epoch": 1.56, "eval_logits/chosen": -2.1569008827209473, "eval_logits/rejected": -2.1521544456481934, "eval_logps/chosen": -33.592525482177734, "eval_logps/rejected": -37.25735855102539, "eval_loss": 0.4995650053024292, "eval_rewards/accuracies": 0.5685215592384338, "eval_rewards/chosen": 0.004420237150043249, "eval_rewards/margins": 0.0018276458140462637, "eval_rewards/rejected": 0.0025925911031663418, "eval_runtime": 145.7896, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 600 }, { "epoch": 1.58, "learning_rate": 3.7793282895240927e-06, "logits/chosen": -1.9622409343719482, "logits/rejected": -1.968703031539917, "logps/chosen": -30.59027099609375, "logps/rejected": -32.09941864013672, "loss": 0.4961, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.016471117734909058, "rewards/margins": 0.016833433881402016, "rewards/rejected": -0.0003623150405474007, "step": 610 }, { "epoch": 1.61, "learning_rate": 3.730319028506478e-06, "logits/chosen": -1.9154777526855469, "logits/rejected": -1.913214921951294, "logps/chosen": -32.721229553222656, "logps/rejected": -31.03836441040039, "loss": 0.4951, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.0203465037047863, "rewards/margins": 0.020875070244073868, "rewards/rejected": -0.000528569333255291, "step": 620 }, { "epoch": 1.64, "learning_rate": 3.6806776869317074e-06, "logits/chosen": -1.8615707159042358, "logits/rejected": -1.8549633026123047, "logps/chosen": -33.12920379638672, "logps/rejected": -32.32065963745117, "loss": 0.4946, "rewards/accuracies": 0.875, "rewards/chosen": 0.023350173607468605, "rewards/margins": 0.02298470214009285, "rewards/rejected": 0.00036546969204209745, "step": 630 }, { "epoch": 1.66, "learning_rate": 3.6304297682067146e-06, "logits/chosen": -1.8696269989013672, "logits/rejected": -1.8758395910263062, "logps/chosen": -32.165340423583984, "logps/rejected": -33.2056770324707, "loss": 0.4958, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.018139610067009926, "rewards/margins": 0.01779373362660408, "rewards/rejected": 0.0003458770806901157, "step": 640 }, { "epoch": 1.69, "learning_rate": 3.579601087369492e-06, "logits/chosen": -1.952419638633728, "logits/rejected": -1.966408371925354, "logps/chosen": -30.081417083740234, "logps/rejected": -31.9990291595459, "loss": 0.4958, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.017710469663143158, "rewards/margins": 0.017723901197314262, "rewards/rejected": -1.3430044418782927e-05, "step": 650 }, { "epoch": 1.71, "learning_rate": 3.5282177578265295e-06, "logits/chosen": -1.828285574913025, "logits/rejected": -1.8253133296966553, "logps/chosen": -31.497600555419922, "logps/rejected": -35.03656005859375, "loss": 0.4936, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.024253567680716515, "rewards/margins": 0.02761664055287838, "rewards/rejected": -0.0033630705438554287, "step": 660 }, { "epoch": 1.74, "learning_rate": 3.476306177936961e-06, "logits/chosen": -1.9160382747650146, "logits/rejected": -1.916337251663208, "logps/chosen": -29.65035057067871, "logps/rejected": -34.16764831542969, "loss": 0.4957, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.01635240949690342, "rewards/margins": 0.01976504735648632, "rewards/rejected": -0.003412640420719981, "step": 670 }, { "epoch": 1.77, "learning_rate": 3.423893017450324e-06, "logits/chosen": -1.8645700216293335, "logits/rejected": -1.861588716506958, "logps/chosen": -29.15572166442871, "logps/rejected": -33.00225067138672, "loss": 0.4957, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.01716628111898899, "rewards/margins": 0.01901078037917614, "rewards/rejected": -0.0018444998422637582, "step": 680 }, { "epoch": 1.79, "learning_rate": 3.3710052038048794e-06, "logits/chosen": -1.8963727951049805, "logits/rejected": -1.8964437246322632, "logps/chosen": -27.984289169311523, "logps/rejected": -30.99312973022461, "loss": 0.4943, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02178804762661457, "rewards/margins": 0.025014396756887436, "rewards/rejected": -0.0032263505272567272, "step": 690 }, { "epoch": 1.82, "learning_rate": 3.3176699082935546e-06, "logits/chosen": -1.8081165552139282, "logits/rejected": -1.81146240234375, "logps/chosen": -32.05728530883789, "logps/rejected": -31.581283569335938, "loss": 0.4942, "rewards/accuracies": 0.8125, "rewards/chosen": 0.024211909621953964, "rewards/margins": 0.025377869606018066, "rewards/rejected": -0.001165962778031826, "step": 700 }, { "epoch": 1.82, "eval_logits/chosen": -2.1417086124420166, "eval_logits/rejected": -2.1370012760162354, "eval_logps/chosen": -33.69810485839844, "eval_logps/rejected": -37.340553283691406, "eval_loss": 0.49963295459747314, "eval_rewards/accuracies": 0.579734206199646, "eval_rewards/chosen": 0.0033644884824752808, "eval_rewards/margins": 0.0016038385219871998, "eval_rewards/rejected": 0.0017606498440727592, "eval_runtime": 145.8796, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 700 }, { "epoch": 1.84, "learning_rate": 3.2639145321045933e-06, "logits/chosen": -1.8899507522583008, "logits/rejected": -1.8811132907867432, "logps/chosen": -34.376182556152344, "logps/rejected": -32.027015686035156, "loss": 0.4948, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.021358376368880272, "rewards/margins": 0.02257036231458187, "rewards/rejected": -0.0012119871098548174, "step": 710 }, { "epoch": 1.87, "learning_rate": 3.2097666922441107e-06, "logits/chosen": -1.9003782272338867, "logits/rejected": -1.9015012979507446, "logps/chosen": -34.15888214111328, "logps/rejected": -33.193199157714844, "loss": 0.4944, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.022765014320611954, "rewards/margins": 0.023936782032251358, "rewards/rejected": -0.0011717682937160134, "step": 720 }, { "epoch": 1.9, "learning_rate": 3.1552542073477554e-06, "logits/chosen": -1.9175533056259155, "logits/rejected": -1.9151859283447266, "logps/chosen": -30.27239990234375, "logps/rejected": -33.10813522338867, "loss": 0.4944, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02290928363800049, "rewards/margins": 0.024021681398153305, "rewards/rejected": -0.0011123981093987823, "step": 730 }, { "epoch": 1.92, "learning_rate": 3.100405083388799e-06, "logits/chosen": -1.8999090194702148, "logits/rejected": -1.9051405191421509, "logps/chosen": -29.600543975830078, "logps/rejected": -33.40522384643555, "loss": 0.4944, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.021851813420653343, "rewards/margins": 0.02435588836669922, "rewards/rejected": -0.0025040716864168644, "step": 740 }, { "epoch": 1.95, "learning_rate": 3.0452474992899645e-06, "logits/chosen": -1.8462251424789429, "logits/rejected": -1.8450710773468018, "logps/chosen": -30.871395111083984, "logps/rejected": -34.77960205078125, "loss": 0.4947, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.022535843774676323, "rewards/margins": 0.02347227931022644, "rewards/rejected": -0.0009364362922497094, "step": 750 }, { "epoch": 1.97, "learning_rate": 2.989809792446417e-06, "logits/chosen": -1.720664381980896, "logits/rejected": -1.7157522439956665, "logps/chosen": -33.3805046081543, "logps/rejected": -35.51945877075195, "loss": 0.4932, "rewards/accuracies": 0.875, "rewards/chosen": 0.026179391890764236, "rewards/margins": 0.02928983047604561, "rewards/rejected": -0.003110440680757165, "step": 760 }, { "epoch": 2.0, "learning_rate": 2.9341204441673267e-06, "logits/chosen": -1.8583590984344482, "logits/rejected": -1.8622472286224365, "logps/chosen": -33.08837127685547, "logps/rejected": -33.761695861816406, "loss": 0.4948, "rewards/accuracies": 0.8541666269302368, "rewards/chosen": 0.021252866834402084, "rewards/margins": 0.022921394556760788, "rewards/rejected": -0.0016685245791450143, "step": 770 }, { "epoch": 2.03, "learning_rate": 2.878208065043501e-06, "logits/chosen": -1.803155541419983, "logits/rejected": -1.8014135360717773, "logps/chosen": -30.931873321533203, "logps/rejected": -35.34140396118164, "loss": 0.4914, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.029697230085730553, "rewards/margins": 0.039064671844244, "rewards/rejected": -0.00936744175851345, "step": 780 }, { "epoch": 2.05, "learning_rate": 2.8221013802485974e-06, "logits/chosen": -1.8476654291152954, "logits/rejected": -1.8463468551635742, "logps/chosen": -30.539403915405273, "logps/rejected": -33.56371307373047, "loss": 0.4925, "rewards/accuracies": 0.9375, "rewards/chosen": 0.02836332656443119, "rewards/margins": 0.033134736120700836, "rewards/rejected": -0.004771408159285784, "step": 790 }, { "epoch": 2.08, "learning_rate": 2.76582921478147e-06, "logits/chosen": -1.7794796228408813, "logits/rejected": -1.773779273033142, "logps/chosen": -31.581567764282227, "logps/rejected": -31.875568389892578, "loss": 0.4926, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.028015125542879105, "rewards/margins": 0.03212819993495941, "rewards/rejected": -0.004113074392080307, "step": 800 }, { "epoch": 2.08, "eval_logits/chosen": -2.132253885269165, "eval_logits/rejected": -2.1275634765625, "eval_logps/chosen": -33.6678466796875, "eval_logps/rejected": -37.38642120361328, "eval_loss": 0.499472975730896, "eval_rewards/accuracies": 0.594684362411499, "eval_rewards/chosen": 0.0036670216359198093, "eval_rewards/margins": 0.002365043619647622, "eval_rewards/rejected": 0.0013019782491028309, "eval_runtime": 145.9155, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 800 }, { "epoch": 2.1, "learning_rate": 2.7094204786572254e-06, "logits/chosen": -1.875432014465332, "logits/rejected": -1.8828538656234741, "logps/chosen": -29.421600341796875, "logps/rejected": -34.89478302001953, "loss": 0.4926, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.027066577225923538, "rewards/margins": 0.033455990254879, "rewards/rejected": -0.006389413960278034, "step": 810 }, { "epoch": 2.13, "learning_rate": 2.6529041520546072e-06, "logits/chosen": -1.8495336771011353, "logits/rejected": -1.852142572402954, "logps/chosen": -30.283716201782227, "logps/rejected": -33.47504425048828, "loss": 0.4942, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.022873621433973312, "rewards/margins": 0.02532605454325676, "rewards/rejected": -0.00245243264362216, "step": 820 }, { "epoch": 2.16, "learning_rate": 2.5963092704273302e-06, "logits/chosen": -1.748708963394165, "logits/rejected": -1.7528718709945679, "logps/chosen": -29.96748924255371, "logps/rejected": -35.57471466064453, "loss": 0.4927, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.026239663362503052, "rewards/margins": 0.032885000109672546, "rewards/rejected": -0.0066453381441533566, "step": 830 }, { "epoch": 2.18, "learning_rate": 2.53966490958702e-06, "logits/chosen": -1.8163249492645264, "logits/rejected": -1.8123382329940796, "logps/chosen": -30.351696014404297, "logps/rejected": -33.14829635620117, "loss": 0.4935, "rewards/accuracies": 0.875, "rewards/chosen": 0.026169534772634506, "rewards/margins": 0.027706000953912735, "rewards/rejected": -0.0015364640858024359, "step": 840 }, { "epoch": 2.21, "learning_rate": 2.4830001707654135e-06, "logits/chosen": -1.8956245183944702, "logits/rejected": -1.8976259231567383, "logps/chosen": -29.960979461669922, "logps/rejected": -36.380279541015625, "loss": 0.4921, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.02814355492591858, "rewards/margins": 0.03532697632908821, "rewards/rejected": -0.007183422800153494, "step": 850 }, { "epoch": 2.23, "learning_rate": 2.4263441656635054e-06, "logits/chosen": -1.7016551494598389, "logits/rejected": -1.6961944103240967, "logps/chosen": -33.43014144897461, "logps/rejected": -33.36838150024414, "loss": 0.4925, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.028515327721834183, "rewards/margins": 0.03328012302517891, "rewards/rejected": -0.004764794372022152, "step": 860 }, { "epoch": 2.26, "learning_rate": 2.3697260014953107e-06, "logits/chosen": -1.7582767009735107, "logits/rejected": -1.7583192586898804, "logps/chosen": -32.9239616394043, "logps/rejected": -35.554019927978516, "loss": 0.4919, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029188454151153564, "rewards/margins": 0.036265987902879715, "rewards/rejected": -0.007077532354742289, "step": 870 }, { "epoch": 2.29, "learning_rate": 2.3131747660339396e-06, "logits/chosen": -1.8046823740005493, "logits/rejected": -1.7934318780899048, "logps/chosen": -31.343753814697266, "logps/rejected": -33.549922943115234, "loss": 0.4925, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.027628283947706223, "rewards/margins": 0.033907197415828705, "rewards/rejected": -0.006278916262090206, "step": 880 }, { "epoch": 2.31, "learning_rate": 2.256719512667651e-06, "logits/chosen": -1.9076982736587524, "logits/rejected": -1.9121280908584595, "logps/chosen": -30.531978607177734, "logps/rejected": -33.27050018310547, "loss": 0.492, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.027979601174592972, "rewards/margins": 0.0363197848200798, "rewards/rejected": -0.008340183645486832, "step": 890 }, { "epoch": 2.34, "learning_rate": 2.2003892454735786e-06, "logits/chosen": -1.826246976852417, "logits/rejected": -1.8192167282104492, "logps/chosen": -31.618709564208984, "logps/rejected": -33.17729949951172, "loss": 0.491, "rewards/accuracies": 0.9375, "rewards/chosen": 0.032353680580854416, "rewards/margins": 0.04010532423853874, "rewards/rejected": -0.007751642260700464, "step": 900 }, { "epoch": 2.34, "eval_logits/chosen": -2.1153886318206787, "eval_logits/rejected": -2.1107325553894043, "eval_logps/chosen": -33.72140884399414, "eval_logps/rejected": -37.456180572509766, "eval_loss": 0.4994627833366394, "eval_rewards/accuracies": 0.5739202499389648, "eval_rewards/chosen": 0.003131402190774679, "eval_rewards/margins": 0.0025269899051636457, "eval_rewards/rejected": 0.0006044124602340162, "eval_runtime": 145.8928, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 900 }, { "epoch": 2.36, "learning_rate": 2.1442129043167877e-06, "logits/chosen": -1.8176963329315186, "logits/rejected": -1.8180383443832397, "logps/chosen": -28.452808380126953, "logps/rejected": -35.59280014038086, "loss": 0.4921, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.027143558487296104, "rewards/margins": 0.036174654960632324, "rewards/rejected": -0.00903109647333622, "step": 910 }, { "epoch": 2.39, "learning_rate": 2.088219349982323e-06, "logits/chosen": -1.772467017173767, "logits/rejected": -1.764556646347046, "logps/chosen": -29.524948120117188, "logps/rejected": -34.33641815185547, "loss": 0.4934, "rewards/accuracies": 0.9375, "rewards/chosen": 0.023139983415603638, "rewards/margins": 0.030790451914072037, "rewards/rejected": -0.007650467567145824, "step": 920 }, { "epoch": 2.42, "learning_rate": 2.0324373493478803e-06, "logits/chosen": -1.9374393224716187, "logits/rejected": -1.9376726150512695, "logps/chosen": -27.288341522216797, "logps/rejected": -33.29496383666992, "loss": 0.493, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.027086907997727394, "rewards/margins": 0.03138797730207443, "rewards/rejected": -0.0043010651133954525, "step": 930 }, { "epoch": 2.44, "learning_rate": 1.976895560604729e-06, "logits/chosen": -1.8147531747817993, "logits/rejected": -1.8243385553359985, "logps/chosen": -31.1882266998291, "logps/rejected": -33.76173400878906, "loss": 0.491, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.03236837312579155, "rewards/margins": 0.040089257061481476, "rewards/rejected": -0.007720877416431904, "step": 940 }, { "epoch": 2.47, "learning_rate": 1.921622518534466e-06, "logits/chosen": -1.8579801321029663, "logits/rejected": -1.861673355102539, "logps/chosen": -28.247013092041016, "logps/rejected": -32.32887268066406, "loss": 0.4933, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.024007126688957214, "rewards/margins": 0.02993503212928772, "rewards/rejected": -0.0059279040433466434, "step": 950 }, { "epoch": 2.49, "learning_rate": 1.8666466198491794e-06, "logits/chosen": -1.8509804010391235, "logits/rejected": -1.8471883535385132, "logps/chosen": -31.077259063720703, "logps/rejected": -34.63513946533203, "loss": 0.4918, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.029979607090353966, "rewards/margins": 0.037267137318849564, "rewards/rejected": -0.0072875297628343105, "step": 960 }, { "epoch": 2.52, "learning_rate": 1.8119961086025376e-06, "logits/chosen": -1.7676801681518555, "logits/rejected": -1.7701276540756226, "logps/chosen": -29.73238754272461, "logps/rejected": -35.80852127075195, "loss": 0.4919, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.030803611502051353, "rewards/margins": 0.03707288205623627, "rewards/rejected": -0.0062692672945559025, "step": 970 }, { "epoch": 2.55, "learning_rate": 1.7576990616793139e-06, "logits/chosen": -1.8003818988800049, "logits/rejected": -1.7940855026245117, "logps/chosen": -32.70087814331055, "logps/rejected": -37.19306945800781, "loss": 0.4921, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027041900902986526, "rewards/margins": 0.03635306656360626, "rewards/rejected": -0.009311167523264885, "step": 980 }, { "epoch": 2.57, "learning_rate": 1.7037833743707892e-06, "logits/chosen": -1.776210069656372, "logits/rejected": -1.7711480855941772, "logps/chosen": -28.2900390625, "logps/rejected": -36.815948486328125, "loss": 0.4922, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.02712910808622837, "rewards/margins": 0.03538975864648819, "rewards/rejected": -0.008260652422904968, "step": 990 }, { "epoch": 2.6, "learning_rate": 1.6502767460434588e-06, "logits/chosen": -1.7549470663070679, "logits/rejected": -1.7445472478866577, "logps/chosen": -29.535968780517578, "logps/rejected": -30.032791137695312, "loss": 0.4941, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.022127199918031693, "rewards/margins": 0.026457766070961952, "rewards/rejected": -0.004330565221607685, "step": 1000 }, { "epoch": 2.6, "eval_logits/chosen": -2.1005983352661133, "eval_logits/rejected": -2.095959424972534, "eval_logps/chosen": -33.75029373168945, "eval_logps/rejected": -37.502525329589844, "eval_loss": 0.49945273995399475, "eval_rewards/accuracies": 0.6058970093727112, "eval_rewards/chosen": 0.0028425909113138914, "eval_rewards/margins": 0.002701645949855447, "eval_rewards/rejected": 0.00014094497601035982, "eval_runtime": 145.8673, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1000 }, { "epoch": 2.62, "learning_rate": 1.5972066659083796e-06, "logits/chosen": -1.8609699010849, "logits/rejected": -1.8604313135147095, "logps/chosen": -29.16253662109375, "logps/rejected": -30.912038803100586, "loss": 0.4927, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.026616577059030533, "rewards/margins": 0.03315214067697525, "rewards/rejected": -0.006535563617944717, "step": 1010 }, { "epoch": 2.65, "learning_rate": 1.5446003988985041e-06, "logits/chosen": -1.9033771753311157, "logits/rejected": -1.9041751623153687, "logps/chosen": -29.415115356445312, "logps/rejected": -31.9776611328125, "loss": 0.4927, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.025284653529524803, "rewards/margins": 0.033403180539608, "rewards/rejected": -0.008118532598018646, "step": 1020 }, { "epoch": 2.68, "learning_rate": 1.4924849716612211e-06, "logits/chosen": -1.870643973350525, "logits/rejected": -1.8740373849868774, "logps/chosen": -29.589406967163086, "logps/rejected": -28.370319366455078, "loss": 0.4929, "rewards/accuracies": 0.875, "rewards/chosen": 0.02690659835934639, "rewards/margins": 0.030772637575864792, "rewards/rejected": -0.003866040613502264, "step": 1030 }, { "epoch": 2.7, "learning_rate": 1.440887158673332e-06, "logits/chosen": -1.8781054019927979, "logits/rejected": -1.8704578876495361, "logps/chosen": -28.706920623779297, "logps/rejected": -34.413326263427734, "loss": 0.4928, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02470974065363407, "rewards/margins": 0.03384070843458176, "rewards/rejected": -0.009130971506237984, "step": 1040 }, { "epoch": 2.73, "learning_rate": 1.3898334684855647e-06, "logits/chosen": -1.818250298500061, "logits/rejected": -1.8290458917617798, "logps/chosen": -30.612024307250977, "logps/rejected": -32.82752227783203, "loss": 0.492, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.02941080555319786, "rewards/margins": 0.035235948860645294, "rewards/rejected": -0.005825136322528124, "step": 1050 }, { "epoch": 2.75, "learning_rate": 1.3393501301037245e-06, "logits/chosen": -1.8953624963760376, "logits/rejected": -1.8862526416778564, "logps/chosen": -30.589797973632812, "logps/rejected": -37.13585662841797, "loss": 0.4921, "rewards/accuracies": 0.875, "rewards/chosen": 0.029109802097082138, "rewards/margins": 0.03715446963906288, "rewards/rejected": -0.008044666610658169, "step": 1060 }, { "epoch": 2.78, "learning_rate": 1.2894630795134454e-06, "logits/chosen": -1.8016560077667236, "logits/rejected": -1.8037437200546265, "logps/chosen": -32.77046203613281, "logps/rejected": -33.44564437866211, "loss": 0.4911, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03218822926282883, "rewards/margins": 0.039665985852479935, "rewards/rejected": -0.007477754261344671, "step": 1070 }, { "epoch": 2.81, "learning_rate": 1.2401979463554984e-06, "logits/chosen": -1.9271634817123413, "logits/rejected": -1.9280281066894531, "logps/chosen": -30.242328643798828, "logps/rejected": -35.20170211791992, "loss": 0.4917, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.028153110295534134, "rewards/margins": 0.03901328891515732, "rewards/rejected": -0.010860181413590908, "step": 1080 }, { "epoch": 2.83, "learning_rate": 1.1915800407584705e-06, "logits/chosen": -1.9012151956558228, "logits/rejected": -1.9054481983184814, "logps/chosen": -28.445846557617188, "logps/rejected": -34.31275177001953, "loss": 0.493, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.02447468414902687, "rewards/margins": 0.03212384134531021, "rewards/rejected": -0.0076491571962833405, "step": 1090 }, { "epoch": 2.86, "learning_rate": 1.1436343403356019e-06, "logits/chosen": -1.8907396793365479, "logits/rejected": -1.895939588546753, "logps/chosen": -31.212976455688477, "logps/rejected": -30.585351943969727, "loss": 0.4943, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.0235602967441082, "rewards/margins": 0.02539706788957119, "rewards/rejected": -0.001836769632063806, "step": 1100 }, { "epoch": 2.86, "eval_logits/chosen": -2.0964860916137695, "eval_logits/rejected": -2.091848373413086, "eval_logps/chosen": -33.77998352050781, "eval_logps/rejected": -37.53815841674805, "eval_loss": 0.4994538724422455, "eval_rewards/accuracies": 0.5826411843299866, "eval_rewards/chosen": 0.0025456694420427084, "eval_rewards/margins": 0.0027610675897449255, "eval_rewards/rejected": -0.0002153978421119973, "eval_runtime": 145.7726, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 1100 }, { "epoch": 2.88, "learning_rate": 1.0963854773524548e-06, "logits/chosen": -1.8822052478790283, "logits/rejected": -1.8824293613433838, "logps/chosen": -30.01492691040039, "logps/rejected": -31.45803451538086, "loss": 0.4923, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.029252270236611366, "rewards/margins": 0.03367346525192261, "rewards/rejected": -0.004421197809278965, "step": 1110 }, { "epoch": 2.91, "learning_rate": 1.049857726072005e-06, "logits/chosen": -1.7276147603988647, "logits/rejected": -1.7294237613677979, "logps/chosen": -31.513330459594727, "logps/rejected": -33.66464614868164, "loss": 0.4919, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.030516481027007103, "rewards/margins": 0.03675025328993797, "rewards/rejected": -0.006233775056898594, "step": 1120 }, { "epoch": 2.94, "learning_rate": 1.0040749902836508e-06, "logits/chosen": -1.7628084421157837, "logits/rejected": -1.760263442993164, "logps/chosen": -28.515111923217773, "logps/rejected": -31.614593505859375, "loss": 0.4935, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.024966087192296982, "rewards/margins": 0.02945462428033352, "rewards/rejected": -0.00448854174464941, "step": 1130 }, { "epoch": 2.96, "learning_rate": 9.59060791022566e-07, "logits/chosen": -1.8981006145477295, "logits/rejected": -1.8926544189453125, "logps/chosen": -30.10824966430664, "logps/rejected": -33.54905319213867, "loss": 0.4922, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.02931547723710537, "rewards/margins": 0.035265903919935226, "rewards/rejected": -0.005950425285845995, "step": 1140 }, { "epoch": 2.99, "learning_rate": 9.148382544856885e-07, "logits/chosen": -1.759209394454956, "logits/rejected": -1.7495086193084717, "logps/chosen": -30.550457000732422, "logps/rejected": -31.75992202758789, "loss": 0.4925, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02901277504861355, "rewards/margins": 0.03368794918060303, "rewards/rejected": -0.004675174597650766, "step": 1150 }, { "epoch": 3.01, "learning_rate": 8.714301001505568e-07, "logits/chosen": -1.8278675079345703, "logits/rejected": -1.8278887271881104, "logps/chosen": -30.80059814453125, "logps/rejected": -31.803054809570312, "loss": 0.4921, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.028811518102884293, "rewards/margins": 0.0348275825381279, "rewards/rejected": -0.00601606722921133, "step": 1160 }, { "epoch": 3.04, "learning_rate": 8.288586291031025e-07, "logits/chosen": -1.902727484703064, "logits/rejected": -1.8974792957305908, "logps/chosen": -30.870708465576172, "logps/rejected": -33.48421859741211, "loss": 0.4928, "rewards/accuracies": 0.9375, "rewards/chosen": 0.026552587747573853, "rewards/margins": 0.03198261931538582, "rewards/rejected": -0.005430030170828104, "step": 1170 }, { "epoch": 3.06, "learning_rate": 7.871457125803897e-07, "logits/chosen": -1.7634168863296509, "logits/rejected": -1.7711273431777954, "logps/chosen": -30.915302276611328, "logps/rejected": -33.02967071533203, "loss": 0.4932, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.024176979437470436, "rewards/margins": 0.030859291553497314, "rewards/rejected": -0.006682313047349453, "step": 1180 }, { "epoch": 3.09, "learning_rate": 7.463127807341966e-07, "logits/chosen": -1.8290866613388062, "logits/rejected": -1.8232570886611938, "logps/chosen": -29.006099700927734, "logps/rejected": -34.01789093017578, "loss": 0.4914, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.03167518228292465, "rewards/margins": 0.03901957720518112, "rewards/rejected": -0.007344390265643597, "step": 1190 }, { "epoch": 3.12, "learning_rate": 7.063808116212021e-07, "logits/chosen": -1.7752304077148438, "logits/rejected": -1.7772098779678345, "logps/chosen": -30.416156768798828, "logps/rejected": -33.775291442871094, "loss": 0.4911, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03074287250638008, "rewards/margins": 0.0408952571451664, "rewards/rejected": -0.010152382776141167, "step": 1200 }, { "epoch": 3.12, "eval_logits/chosen": -2.0958993434906006, "eval_logits/rejected": -2.0912692546844482, "eval_logps/chosen": -33.775428771972656, "eval_logps/rejected": -37.52570724487305, "eval_loss": 0.4994668662548065, "eval_rewards/accuracies": 0.6009136438369751, "eval_rewards/chosen": 0.0025912297423928976, "eval_rewards/margins": 0.002682073274627328, "eval_rewards/rejected": -9.084340126719326e-05, "eval_runtime": 145.907, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1200 }, { "epoch": 3.14, "learning_rate": 6.673703204254348e-07, "logits/chosen": -1.7069997787475586, "logits/rejected": -1.7061100006103516, "logps/chosen": -32.673728942871094, "logps/rejected": -33.5372314453125, "loss": 0.4906, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.0337875559926033, "rewards/margins": 0.042066238820552826, "rewards/rejected": -0.008278685621917248, "step": 1210 }, { "epoch": 3.17, "learning_rate": 6.293013489185315e-07, "logits/chosen": -1.8786895275115967, "logits/rejected": -1.8728017807006836, "logps/chosen": -28.626781463623047, "logps/rejected": -33.97628402709961, "loss": 0.491, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.030769089236855507, "rewards/margins": 0.04174653813242912, "rewards/rejected": -0.010977448895573616, "step": 1220 }, { "epoch": 3.19, "learning_rate": 5.921934551632086e-07, "logits/chosen": -1.7307627201080322, "logits/rejected": -1.7194693088531494, "logps/chosen": -31.0261173248291, "logps/rejected": -33.56462478637695, "loss": 0.4912, "rewards/accuracies": 0.9375, "rewards/chosen": 0.033595647662878036, "rewards/margins": 0.040303632616996765, "rewards/rejected": -0.006707982625812292, "step": 1230 }, { "epoch": 3.22, "learning_rate": 5.560657034652405e-07, "logits/chosen": -1.8212391138076782, "logits/rejected": -1.8145866394042969, "logps/chosen": -28.291839599609375, "logps/rejected": -29.779611587524414, "loss": 0.4929, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.024083953350782394, "rewards/margins": 0.033204466104507446, "rewards/rejected": -0.009120511822402477, "step": 1240 }, { "epoch": 3.25, "learning_rate": 5.2093665457911e-07, "logits/chosen": -1.8461530208587646, "logits/rejected": -1.853011131286621, "logps/chosen": -32.12201690673828, "logps/rejected": -32.083274841308594, "loss": 0.4913, "rewards/accuracies": 0.9375, "rewards/chosen": 0.032733749598264694, "rewards/margins": 0.0383833646774292, "rewards/rejected": -0.005649610888212919, "step": 1250 }, { "epoch": 3.27, "learning_rate": 4.868243561723535e-07, "logits/chosen": -1.8253049850463867, "logits/rejected": -1.8258377313613892, "logps/chosen": -30.2758846282959, "logps/rejected": -33.74597930908203, "loss": 0.492, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.027526726946234703, "rewards/margins": 0.03683784604072571, "rewards/rejected": -0.00931111816316843, "step": 1260 }, { "epoch": 3.3, "learning_rate": 4.537463335535161e-07, "logits/chosen": -1.7631717920303345, "logits/rejected": -1.7627891302108765, "logps/chosen": -30.03763771057129, "logps/rejected": -34.385276794433594, "loss": 0.4914, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03053201362490654, "rewards/margins": 0.039178695529699326, "rewards/rejected": -0.008646685630083084, "step": 1270 }, { "epoch": 3.32, "learning_rate": 4.217195806684629e-07, "logits/chosen": -1.648390531539917, "logits/rejected": -1.6438019275665283, "logps/chosen": -32.16303253173828, "logps/rejected": -31.598995208740234, "loss": 0.4915, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.0307441595941782, "rewards/margins": 0.03746918961405754, "rewards/rejected": -0.00672503188252449, "step": 1280 }, { "epoch": 3.35, "learning_rate": 3.907605513696808e-07, "logits/chosen": -1.8533127307891846, "logits/rejected": -1.838148832321167, "logps/chosen": -31.7786865234375, "logps/rejected": -35.894744873046875, "loss": 0.4922, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.025688907131552696, "rewards/margins": 0.03698316216468811, "rewards/rejected": -0.011294253170490265, "step": 1290 }, { "epoch": 3.38, "learning_rate": 3.6088515096305675e-07, "logits/chosen": -1.7974460124969482, "logits/rejected": -1.802198052406311, "logps/chosen": -30.211551666259766, "logps/rejected": -37.193172454833984, "loss": 0.4906, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.0330607071518898, "rewards/margins": 0.04396023601293564, "rewards/rejected": -0.010899528861045837, "step": 1300 }, { "epoch": 3.38, "eval_logits/chosen": -2.0954809188842773, "eval_logits/rejected": -2.0908546447753906, "eval_logps/chosen": -33.78174591064453, "eval_logps/rejected": -37.53657150268555, "eval_loss": 0.49945539236068726, "eval_rewards/accuracies": 0.6034052968025208, "eval_rewards/chosen": 0.0025280837435275316, "eval_rewards/margins": 0.002727580489590764, "eval_rewards/rejected": -0.0001994967897189781, "eval_runtime": 145.8892, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 1300 }, { "epoch": 3.4, "learning_rate": 3.321087280364757e-07, "logits/chosen": -1.770353078842163, "logits/rejected": -1.770532250404358, "logps/chosen": -32.75363540649414, "logps/rejected": -37.96880340576172, "loss": 0.4904, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.03490384668111801, "rewards/margins": 0.04375072568655014, "rewards/rejected": -0.008846879936754704, "step": 1310 }, { "epoch": 3.43, "learning_rate": 3.044460665744284e-07, "logits/chosen": -1.8576828241348267, "logits/rejected": -1.8563992977142334, "logps/chosen": -29.168567657470703, "logps/rejected": -31.714990615844727, "loss": 0.492, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02925288677215576, "rewards/margins": 0.036869172006845474, "rewards/rejected": -0.007616284303367138, "step": 1320 }, { "epoch": 3.45, "learning_rate": 2.779113783626916e-07, "logits/chosen": -1.766406774520874, "logits/rejected": -1.7676417827606201, "logps/chosen": -31.04288673400879, "logps/rejected": -34.331932067871094, "loss": 0.491, "rewards/accuracies": 0.9375, "rewards/chosen": 0.03331366181373596, "rewards/margins": 0.04014625400304794, "rewards/rejected": -0.006832593586295843, "step": 1330 }, { "epoch": 3.48, "learning_rate": 2.5251829568697204e-07, "logits/chosen": -1.8259414434432983, "logits/rejected": -1.8247743844985962, "logps/chosen": -28.54180335998535, "logps/rejected": -32.83352279663086, "loss": 0.4923, "rewards/accuracies": 0.9375, "rewards/chosen": 0.026568632572889328, "rewards/margins": 0.03498411923646927, "rewards/rejected": -0.008415484800934792, "step": 1340 }, { "epoch": 3.51, "learning_rate": 2.2827986432927774e-07, "logits/chosen": -1.8422014713287354, "logits/rejected": -1.8278350830078125, "logps/chosen": -31.403839111328125, "logps/rejected": -37.33091354370117, "loss": 0.4914, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.02912832237780094, "rewards/margins": 0.039793141186237335, "rewards/rejected": -0.010664817877113819, "step": 1350 }, { "epoch": 3.53, "learning_rate": 2.0520853686560177e-07, "logits/chosen": -1.834238052368164, "logits/rejected": -1.8465309143066406, "logps/chosen": -29.14828872680664, "logps/rejected": -32.92185592651367, "loss": 0.492, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.029201796278357506, "rewards/margins": 0.03602294623851776, "rewards/rejected": -0.006821149028837681, "step": 1360 }, { "epoch": 3.56, "learning_rate": 1.833161662683672e-07, "logits/chosen": -1.937150239944458, "logits/rejected": -1.9364910125732422, "logps/chosen": -28.896930694580078, "logps/rejected": -37.03318786621094, "loss": 0.4907, "rewards/accuracies": 0.949999988079071, "rewards/chosen": 0.03133498877286911, "rewards/margins": 0.043358031660318375, "rewards/rejected": -0.012023041024804115, "step": 1370 }, { "epoch": 3.58, "learning_rate": 1.626139998169246e-07, "logits/chosen": -1.7997983694076538, "logits/rejected": -1.8073842525482178, "logps/chosen": -30.809356689453125, "logps/rejected": -38.418704986572266, "loss": 0.4906, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.03371125087141991, "rewards/margins": 0.04340182989835739, "rewards/rejected": -0.009690576232969761, "step": 1380 }, { "epoch": 3.61, "learning_rate": 1.4311267331922535e-07, "logits/chosen": -1.756156325340271, "logits/rejected": -1.7522109746932983, "logps/chosen": -31.52239418029785, "logps/rejected": -32.12908172607422, "loss": 0.4915, "rewards/accuracies": 0.875, "rewards/chosen": 0.0324481800198555, "rewards/margins": 0.03776510804891586, "rewards/rejected": -0.005316923372447491, "step": 1390 }, { "epoch": 3.64, "learning_rate": 1.2482220564763669e-07, "logits/chosen": -1.9121891260147095, "logits/rejected": -1.90972101688385, "logps/chosen": -28.551815032958984, "logps/rejected": -32.69978713989258, "loss": 0.4928, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.027447333559393883, "rewards/margins": 0.03300053998827934, "rewards/rejected": -0.0055532073602080345, "step": 1400 }, { "epoch": 3.64, "eval_logits/chosen": -2.0949766635894775, "eval_logits/rejected": -2.0903432369232178, "eval_logps/chosen": -33.78425216674805, "eval_logps/rejected": -37.54228591918945, "eval_loss": 0.4994584321975708, "eval_rewards/accuracies": 0.5921927094459534, "eval_rewards/chosen": 0.0025030241813510656, "eval_rewards/margins": 0.0027596852742135525, "eval_rewards/rejected": -0.0002566613839007914, "eval_runtime": 145.7016, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 1400 }, { "epoch": 3.66, "learning_rate": 1.0775199359171346e-07, "logits/chosen": -1.8566265106201172, "logits/rejected": -1.8526551723480225, "logps/chosen": -30.65496826171875, "logps/rejected": -30.23004722595215, "loss": 0.4918, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.029875636100769043, "rewards/margins": 0.036631517112255096, "rewards/rejected": -0.0067558870650827885, "step": 1410 }, { "epoch": 3.69, "learning_rate": 9.191080703056604e-08, "logits/chosen": -1.8138599395751953, "logits/rejected": -1.8149265050888062, "logps/chosen": -30.453847885131836, "logps/rejected": -35.29817581176758, "loss": 0.4926, "rewards/accuracies": 0.887499988079071, "rewards/chosen": 0.029110288247466087, "rewards/margins": 0.0328519269824028, "rewards/rejected": -0.0037416405975818634, "step": 1420 }, { "epoch": 3.71, "learning_rate": 7.730678442730539e-08, "logits/chosen": -1.7643918991088867, "logits/rejected": -1.7578294277191162, "logps/chosen": -31.150136947631836, "logps/rejected": -37.63862228393555, "loss": 0.4914, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.0299163106828928, "rewards/margins": 0.039408471435308456, "rewards/rejected": -0.009492164477705956, "step": 1430 }, { "epoch": 3.74, "learning_rate": 6.394742864787806e-08, "logits/chosen": -1.7766668796539307, "logits/rejected": -1.7712846994400024, "logps/chosen": -26.59188461303711, "logps/rejected": -32.42449188232422, "loss": 0.4925, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.027213579043745995, "rewards/margins": 0.03415192291140556, "rewards/rejected": -0.006938344333320856, "step": 1440 }, { "epoch": 3.77, "learning_rate": 5.183960310644748e-08, "logits/chosen": -1.8049747943878174, "logits/rejected": -1.7948274612426758, "logps/chosen": -30.14188575744629, "logps/rejected": -36.50958251953125, "loss": 0.4926, "rewards/accuracies": 0.925000011920929, "rewards/chosen": 0.023396430537104607, "rewards/margins": 0.03563863784074783, "rewards/rejected": -0.012242205440998077, "step": 1450 }, { "epoch": 3.79, "learning_rate": 4.098952823928693e-08, "logits/chosen": -1.776219129562378, "logits/rejected": -1.773436188697815, "logps/chosen": -30.742990493774414, "logps/rejected": -31.301162719726562, "loss": 0.4933, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.025667887181043625, "rewards/margins": 0.029771283268928528, "rewards/rejected": -0.004103394225239754, "step": 1460 }, { "epoch": 3.82, "learning_rate": 3.1402778309014284e-08, "logits/chosen": -1.835860013961792, "logits/rejected": -1.8421319723129272, "logps/chosen": -28.727758407592773, "logps/rejected": -33.723846435546875, "loss": 0.4909, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.033414192497730255, "rewards/margins": 0.04102843254804611, "rewards/rejected": -0.007614238653331995, "step": 1470 }, { "epoch": 3.84, "learning_rate": 2.3084278540791427e-08, "logits/chosen": -1.8385893106460571, "logits/rejected": -1.8487203121185303, "logps/chosen": -28.86956787109375, "logps/rejected": -30.35264015197754, "loss": 0.4921, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": 0.02825688198208809, "rewards/margins": 0.03549762815237045, "rewards/rejected": -0.007240750826895237, "step": 1480 }, { "epoch": 3.87, "learning_rate": 1.6038302591975807e-08, "logits/chosen": -1.7700088024139404, "logits/rejected": -1.7631587982177734, "logps/chosen": -31.02837562561035, "logps/rejected": -33.01637649536133, "loss": 0.4924, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": 0.02628110721707344, "rewards/margins": 0.03434007614850998, "rewards/rejected": -0.008058969862759113, "step": 1490 }, { "epoch": 3.9, "learning_rate": 1.0268470356514237e-08, "logits/chosen": -1.823885202407837, "logits/rejected": -1.8207124471664429, "logps/chosen": -30.59462547302246, "logps/rejected": -34.283512115478516, "loss": 0.4912, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.030309468507766724, "rewards/margins": 0.04054233804345131, "rewards/rejected": -0.010232868604362011, "step": 1500 }, { "epoch": 3.9, "eval_logits/chosen": -2.0954155921936035, "eval_logits/rejected": -2.090785264968872, "eval_logps/chosen": -33.78346252441406, "eval_logps/rejected": -37.54273223876953, "eval_loss": 0.49945247173309326, "eval_rewards/accuracies": 0.6034052968025208, "eval_rewards/chosen": 0.0025108722038567066, "eval_rewards/margins": 0.0027720185462385416, "eval_rewards/rejected": -0.00026114637148566544, "eval_runtime": 145.8069, "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 1500 }, { "epoch": 3.92, "learning_rate": 5.777746105209147e-09, "logits/chosen": -1.900957465171814, "logits/rejected": -1.9016454219818115, "logps/chosen": -26.679061889648438, "logps/rejected": -33.479774475097656, "loss": 0.4914, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.03137214854359627, "rewards/margins": 0.03876541927456856, "rewards/rejected": -0.007393266074359417, "step": 1510 }, { "epoch": 3.95, "learning_rate": 2.5684369628148352e-09, "logits/chosen": -1.7551101446151733, "logits/rejected": -1.7550216913223267, "logps/chosen": -30.129684448242188, "logps/rejected": -34.65699005126953, "loss": 0.4918, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": 0.029121821746230125, "rewards/margins": 0.036954350769519806, "rewards/rejected": -0.007832523435354233, "step": 1520 }, { "epoch": 3.97, "learning_rate": 6.421917227455999e-10, "logits/chosen": -1.9060754776000977, "logits/rejected": -1.9034589529037476, "logps/chosen": -28.90531349182129, "logps/rejected": -32.31726837158203, "loss": 0.4926, "rewards/accuracies": 0.9624999761581421, "rewards/chosen": 0.02568744495511055, "rewards/margins": 0.03402363136410713, "rewards/rejected": -0.008336183615028858, "step": 1530 }, { "epoch": 4.0, "learning_rate": 0.0, "logits/chosen": -1.890435814857483, "logits/rejected": -1.8915026187896729, "logps/chosen": -28.199377059936523, "logps/rejected": -30.4346981048584, "loss": 0.4932, "rewards/accuracies": 0.9333333969116211, "rewards/chosen": 0.024205626919865608, "rewards/margins": 0.030650783330202103, "rewards/rejected": -0.006445156875997782, "step": 1540 }, { "epoch": 4.0, "step": 1540, "total_flos": 0.0, "train_loss": 0.39746343532165923, "train_runtime": 10799.7737, "train_samples_per_second": 1.14, "train_steps_per_second": 0.143 } ], "logging_steps": 10, "max_steps": 1540, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }