{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.998691442030882, "eval_steps": 500, "global_step": 477, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.0416666666666667e-07, "logits/chosen": -2.7967724800109863, "logits/rejected": -2.805750846862793, "logps/chosen": -270.3558654785156, "logps/rejected": -243.22396850585938, "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.02, "learning_rate": 1.0416666666666667e-06, "logits/chosen": -2.671762466430664, "logits/rejected": -2.713010549545288, "logps/chosen": -228.7700958251953, "logps/rejected": -213.14617919921875, "loss": 0.5, "rewards/accuracies": 0.3680555522441864, "rewards/chosen": 6.792499334551394e-06, "rewards/margins": -1.524862182122888e-05, "rewards/rejected": 2.204112388426438e-05, "step": 10 }, { "epoch": 0.04, "learning_rate": 2.0833333333333334e-06, "logits/chosen": -2.7296969890594482, "logits/rejected": -2.7327723503112793, "logps/chosen": -242.0876922607422, "logps/rejected": -224.0583038330078, "loss": 0.5, "rewards/accuracies": 0.4781250059604645, "rewards/chosen": 0.00022381536837201566, "rewards/margins": 7.298699347302318e-05, "rewards/rejected": 0.0001508283894509077, "step": 20 }, { "epoch": 0.06, "learning_rate": 3.125e-06, "logits/chosen": -2.7316231727600098, "logits/rejected": -2.7253520488739014, "logps/chosen": -253.94223022460938, "logps/rejected": -235.99728393554688, "loss": 0.4999, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.000914489384740591, "rewards/margins": 0.0003135653678327799, "rewards/rejected": 0.0006009239004924893, "step": 30 }, { "epoch": 0.08, "learning_rate": 4.166666666666667e-06, "logits/chosen": -2.6916933059692383, "logits/rejected": -2.690849781036377, "logps/chosen": -252.69357299804688, "logps/rejected": -234.3971710205078, "loss": 0.4998, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0020142460707575083, "rewards/margins": 0.0008797285263426602, "rewards/rejected": 0.0011345174862071872, "step": 40 }, { "epoch": 0.1, "learning_rate": 4.999731868769027e-06, "logits/chosen": -2.6373677253723145, "logits/rejected": -2.641324281692505, "logps/chosen": -224.9910430908203, "logps/rejected": -220.6387939453125, "loss": 0.4996, "rewards/accuracies": 0.59375, "rewards/chosen": 0.003733439836651087, "rewards/margins": 0.0014786701649427414, "rewards/rejected": 0.0022547696717083454, "step": 50 }, { "epoch": 0.13, "learning_rate": 4.9903533134293035e-06, "logits/chosen": -2.6569573879241943, "logits/rejected": -2.6791322231292725, "logps/chosen": -221.05709838867188, "logps/rejected": -203.5032501220703, "loss": 0.4994, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.005443854723125696, "rewards/margins": 0.0024864792358130217, "rewards/rejected": 0.0029573754873126745, "step": 60 }, { "epoch": 0.15, "learning_rate": 4.967625656594782e-06, "logits/chosen": -2.707890748977661, "logits/rejected": -2.6958038806915283, "logps/chosen": -227.9226531982422, "logps/rejected": -209.75619506835938, "loss": 0.4992, "rewards/accuracies": 0.6031249761581421, "rewards/chosen": 0.006226530764251947, "rewards/margins": 0.0026825761888176203, "rewards/rejected": 0.0035439543426036835, "step": 70 }, { "epoch": 0.17, "learning_rate": 4.93167072587771e-06, "logits/chosen": -2.691779375076294, "logits/rejected": -2.6711440086364746, "logps/chosen": -218.3474578857422, "logps/rejected": -207.0499725341797, "loss": 0.499, "rewards/accuracies": 0.609375, "rewards/chosen": 0.007510344497859478, "rewards/margins": 0.003996443003416061, "rewards/rejected": 0.0035139017272740602, "step": 80 }, { "epoch": 0.19, "learning_rate": 4.882681251368549e-06, "logits/chosen": -2.6946892738342285, "logits/rejected": -2.6706433296203613, "logps/chosen": -233.44540405273438, "logps/rejected": -225.4489288330078, "loss": 0.4988, "rewards/accuracies": 0.606249988079071, "rewards/chosen": 0.007306075654923916, "rewards/margins": 0.004931028466671705, "rewards/rejected": 0.002375046955421567, "step": 90 }, { "epoch": 0.21, "learning_rate": 4.8209198325401815e-06, "logits/chosen": -2.741804838180542, "logits/rejected": -2.722029447555542, "logps/chosen": -244.57785034179688, "logps/rejected": -229.181884765625, "loss": 0.4986, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.00877557136118412, "rewards/margins": 0.007159523665904999, "rewards/rejected": 0.0016160461818799376, "step": 100 }, { "epoch": 0.23, "learning_rate": 4.746717530629565e-06, "logits/chosen": -2.6642231941223145, "logits/rejected": -2.682945966720581, "logps/chosen": -227.15109252929688, "logps/rejected": -215.261474609375, "loss": 0.4986, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.007662790361791849, "rewards/margins": 0.007442783564329147, "rewards/rejected": 0.00022000684111844748, "step": 110 }, { "epoch": 0.25, "learning_rate": 4.660472094042121e-06, "logits/chosen": -2.6481804847717285, "logits/rejected": -2.6557509899139404, "logps/chosen": -243.96139526367188, "logps/rejected": -208.84384155273438, "loss": 0.4984, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.007970081642270088, "rewards/margins": 0.007513949181884527, "rewards/rejected": 0.00045613135444000363, "step": 120 }, { "epoch": 0.27, "learning_rate": 4.5626458262912745e-06, "logits/chosen": -2.607055187225342, "logits/rejected": -2.6165382862091064, "logps/chosen": -214.7068634033203, "logps/rejected": -214.4553680419922, "loss": 0.4985, "rewards/accuracies": 0.5625, "rewards/chosen": 0.007824478670954704, "rewards/margins": 0.007803040556609631, "rewards/rejected": 2.143829988199286e-05, "step": 130 }, { "epoch": 0.29, "learning_rate": 4.453763107901676e-06, "logits/chosen": -2.662113666534424, "logits/rejected": -2.6899476051330566, "logps/chosen": -230.07870483398438, "logps/rejected": -218.9852752685547, "loss": 0.4981, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.008939255960285664, "rewards/margins": 0.00983688049018383, "rewards/rejected": -0.000897624995559454, "step": 140 }, { "epoch": 0.31, "learning_rate": 4.33440758555951e-06, "logits/chosen": -2.6915247440338135, "logits/rejected": -2.684671640396118, "logps/chosen": -239.7102813720703, "logps/rejected": -196.85520935058594, "loss": 0.498, "rewards/accuracies": 0.625, "rewards/chosen": 0.006214095279574394, "rewards/margins": 0.008828171528875828, "rewards/rejected": -0.0026140757836401463, "step": 150 }, { "epoch": 0.33, "learning_rate": 4.205219043576955e-06, "logits/chosen": -2.6351287364959717, "logits/rejected": -2.6126668453216553, "logps/chosen": -217.2763214111328, "logps/rejected": -232.38308715820312, "loss": 0.4977, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.005867967382073402, "rewards/margins": 0.012363599613308907, "rewards/rejected": -0.006495633628219366, "step": 160 }, { "epoch": 0.36, "learning_rate": 4.066889974440757e-06, "logits/chosen": -2.651954412460327, "logits/rejected": -2.6270413398742676, "logps/chosen": -230.48202514648438, "logps/rejected": -207.2578887939453, "loss": 0.4981, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": 0.005082262679934502, "rewards/margins": 0.009124400094151497, "rewards/rejected": -0.004042136482894421, "step": 170 }, { "epoch": 0.38, "learning_rate": 3.92016186682789e-06, "logits/chosen": -2.623335838317871, "logits/rejected": -2.6311707496643066, "logps/chosen": -240.3880157470703, "logps/rejected": -239.96383666992188, "loss": 0.4978, "rewards/accuracies": 0.65625, "rewards/chosen": 0.004731752909719944, "rewards/margins": 0.013921832665801048, "rewards/rejected": -0.009190080687403679, "step": 180 }, { "epoch": 0.4, "learning_rate": 3.7658212309857576e-06, "logits/chosen": -2.6556308269500732, "logits/rejected": -2.635357618331909, "logps/chosen": -227.9666748046875, "logps/rejected": -235.2104949951172, "loss": 0.4976, "rewards/accuracies": 0.6343749761581421, "rewards/chosen": 0.005550178233534098, "rewards/margins": 0.015345620922744274, "rewards/rejected": -0.009795443154871464, "step": 190 }, { "epoch": 0.42, "learning_rate": 3.604695382782159e-06, "logits/chosen": -2.613764524459839, "logits/rejected": -2.5981905460357666, "logps/chosen": -244.93527221679688, "logps/rejected": -219.70816040039062, "loss": 0.4975, "rewards/accuracies": 0.621874988079071, "rewards/chosen": 0.005905141122639179, "rewards/margins": 0.01656787097454071, "rewards/rejected": -0.010662728920578957, "step": 200 }, { "epoch": 0.44, "learning_rate": 3.437648009023905e-06, "logits/chosen": -2.6281652450561523, "logits/rejected": -2.6158878803253174, "logps/chosen": -230.68093872070312, "logps/rejected": -229.5253448486328, "loss": 0.4974, "rewards/accuracies": 0.5843750238418579, "rewards/chosen": 0.0013032301794737577, "rewards/margins": 0.012972685508430004, "rewards/rejected": -0.011669456958770752, "step": 210 }, { "epoch": 0.46, "learning_rate": 3.265574537815398e-06, "logits/chosen": -2.6179089546203613, "logits/rejected": -2.600205898284912, "logps/chosen": -220.5909881591797, "logps/rejected": -219.4180450439453, "loss": 0.4975, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.00289526185952127, "rewards/margins": 0.01316472701728344, "rewards/rejected": -0.0102694658562541, "step": 220 }, { "epoch": 0.48, "learning_rate": 3.089397338773569e-06, "logits/chosen": -2.5429725646972656, "logits/rejected": -2.5193886756896973, "logps/chosen": -219.8972930908203, "logps/rejected": -224.34597778320312, "loss": 0.4974, "rewards/accuracies": 0.578125, "rewards/chosen": 0.001055281376466155, "rewards/margins": 0.015370063483715057, "rewards/rejected": -0.014314780943095684, "step": 230 }, { "epoch": 0.5, "learning_rate": 2.9100607788275547e-06, "logits/chosen": -2.5410211086273193, "logits/rejected": -2.5228562355041504, "logps/chosen": -221.77243041992188, "logps/rejected": -222.72091674804688, "loss": 0.4972, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0065743401646614075, "rewards/margins": 0.014264288358390331, "rewards/rejected": -0.020838629454374313, "step": 240 }, { "epoch": 0.52, "learning_rate": 2.72852616010567e-06, "logits/chosen": -2.5719101428985596, "logits/rejected": -2.5223493576049805, "logps/chosen": -226.51950073242188, "logps/rejected": -222.7196502685547, "loss": 0.4969, "rewards/accuracies": 0.590624988079071, "rewards/chosen": -0.0035572790075093508, "rewards/margins": 0.023138266056776047, "rewards/rejected": -0.02669554576277733, "step": 250 }, { "epoch": 0.54, "learning_rate": 2.5457665670441937e-06, "logits/chosen": -2.6053082942962646, "logits/rejected": -2.5733938217163086, "logps/chosen": -249.0303955078125, "logps/rejected": -240.6870574951172, "loss": 0.4962, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.005832755006849766, "rewards/margins": 0.026431718841195107, "rewards/rejected": -0.03226447105407715, "step": 260 }, { "epoch": 0.57, "learning_rate": 2.3627616503391813e-06, "logits/chosen": -2.6097915172576904, "logits/rejected": -2.565622329711914, "logps/chosen": -272.3103942871094, "logps/rejected": -283.4029846191406, "loss": 0.4962, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -0.009318219497799873, "rewards/margins": 0.03199902921915054, "rewards/rejected": -0.041317250579595566, "step": 270 }, { "epoch": 0.59, "learning_rate": 2.1804923757009885e-06, "logits/chosen": -2.5360941886901855, "logits/rejected": -2.502972364425659, "logps/chosen": -253.5753173828125, "logps/rejected": -242.55581665039062, "loss": 0.4965, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.025225955992937088, "rewards/margins": 0.025721151381731033, "rewards/rejected": -0.05094710737466812, "step": 280 }, { "epoch": 0.61, "learning_rate": 1.9999357655598894e-06, "logits/chosen": -2.5319244861602783, "logits/rejected": -2.465296983718872, "logps/chosen": -268.68438720703125, "logps/rejected": -286.90582275390625, "loss": 0.4959, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.03240751475095749, "rewards/margins": 0.035424619913101196, "rewards/rejected": -0.06783213466405869, "step": 290 }, { "epoch": 0.63, "learning_rate": 1.8220596619089576e-06, "logits/chosen": -2.4394478797912598, "logits/rejected": -2.4182257652282715, "logps/chosen": -272.16973876953125, "logps/rejected": -293.65020751953125, "loss": 0.4947, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.03661135956645012, "rewards/margins": 0.03990130499005318, "rewards/rejected": -0.07651267200708389, "step": 300 }, { "epoch": 0.65, "learning_rate": 1.647817538357072e-06, "logits/chosen": -2.297726631164551, "logits/rejected": -2.2101216316223145, "logps/chosen": -390.9844665527344, "logps/rejected": -431.4691467285156, "loss": 0.4919, "rewards/accuracies": 0.528124988079071, "rewards/chosen": -0.15482836961746216, "rewards/margins": 0.07953239977359772, "rewards/rejected": -0.23436078429222107, "step": 310 }, { "epoch": 0.67, "learning_rate": 1.4781433892011132e-06, "logits/chosen": -1.8453972339630127, "logits/rejected": -1.7317909002304077, "logps/chosen": -681.6414184570312, "logps/rejected": -987.8673706054688, "loss": 0.4842, "rewards/accuracies": 0.5531250238418579, "rewards/chosen": -0.4435897767543793, "rewards/margins": 0.3242936134338379, "rewards/rejected": -0.7678834199905396, "step": 320 }, { "epoch": 0.69, "learning_rate": 1.3139467229135999e-06, "logits/chosen": -1.9067537784576416, "logits/rejected": -1.8014500141143799, "logps/chosen": -885.9420166015625, "logps/rejected": -1086.572509765625, "loss": 0.482, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -0.6387772560119629, "rewards/margins": 0.23005250096321106, "rewards/rejected": -0.8688297271728516, "step": 330 }, { "epoch": 0.71, "learning_rate": 1.1561076868822756e-06, "logits/chosen": -2.052692413330078, "logits/rejected": -1.9470455646514893, "logps/chosen": -739.042236328125, "logps/rejected": -992.4461059570312, "loss": 0.4809, "rewards/accuracies": 0.559374988079071, "rewards/chosen": -0.49481409788131714, "rewards/margins": 0.2876318097114563, "rewards/rejected": -0.7824459075927734, "step": 340 }, { "epoch": 0.73, "learning_rate": 1.0054723495346484e-06, "logits/chosen": -1.9866969585418701, "logits/rejected": -1.8636195659637451, "logps/chosen": -883.5921630859375, "logps/rejected": -1097.7490234375, "loss": 0.4832, "rewards/accuracies": 0.5406249761581421, "rewards/chosen": -0.658557116985321, "rewards/margins": 0.2439090460538864, "rewards/rejected": -0.9024661779403687, "step": 350 }, { "epoch": 0.75, "learning_rate": 8.628481651367876e-07, "logits/chosen": -2.103574275970459, "logits/rejected": -1.9690395593643188, "logps/chosen": -854.9332885742188, "logps/rejected": -1295.6673583984375, "loss": 0.4805, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6121799945831299, "rewards/margins": 0.4702150821685791, "rewards/rejected": -1.082395076751709, "step": 360 }, { "epoch": 0.77, "learning_rate": 7.289996455765749e-07, "logits/chosen": -2.1500675678253174, "logits/rejected": -2.044593572616577, "logps/chosen": -781.8174438476562, "logps/rejected": -1134.3828125, "loss": 0.4804, "rewards/accuracies": 0.49687498807907104, "rewards/chosen": -0.5213514566421509, "rewards/margins": 0.39176231622695923, "rewards/rejected": -0.9131137132644653, "step": 370 }, { "epoch": 0.8, "learning_rate": 6.046442623320145e-07, "logits/chosen": -2.1524083614349365, "logits/rejected": -2.063347816467285, "logps/chosen": -762.1702270507812, "logps/rejected": -1244.992431640625, "loss": 0.477, "rewards/accuracies": 0.6156250238418579, "rewards/chosen": -0.5212429761886597, "rewards/margins": 0.4973062574863434, "rewards/rejected": -1.0185492038726807, "step": 380 }, { "epoch": 0.82, "learning_rate": 4.904486005914027e-07, "logits/chosen": -2.1940252780914307, "logits/rejected": -2.106301784515381, "logps/chosen": -789.7882690429688, "logps/rejected": -1301.745361328125, "loss": 0.4771, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.5382066965103149, "rewards/margins": 0.5309630632400513, "rewards/rejected": -1.0691697597503662, "step": 390 }, { "epoch": 0.84, "learning_rate": 3.8702478614051353e-07, "logits/chosen": -2.1056671142578125, "logits/rejected": -2.0197396278381348, "logps/chosen": -846.3111572265625, "logps/rejected": -1285.551025390625, "loss": 0.478, "rewards/accuracies": 0.518750011920929, "rewards/chosen": -0.6085025072097778, "rewards/margins": 0.4548709988594055, "rewards/rejected": -1.0633734464645386, "step": 400 }, { "epoch": 0.86, "learning_rate": 2.9492720416985004e-07, "logits/chosen": -2.086305618286133, "logits/rejected": -1.996105432510376, "logps/chosen": -932.2883911132812, "logps/rejected": -1177.2127685546875, "loss": 0.4807, "rewards/accuracies": 0.534375011920929, "rewards/chosen": -0.6851298213005066, "rewards/margins": 0.29829445481300354, "rewards/rejected": -0.983424186706543, "step": 410 }, { "epoch": 0.88, "learning_rate": 2.1464952759020857e-07, "logits/chosen": -2.1438374519348145, "logits/rejected": -2.0642104148864746, "logps/chosen": -878.0729370117188, "logps/rejected": -1302.420654296875, "loss": 0.4798, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -0.6570709347724915, "rewards/margins": 0.4223295748233795, "rewards/rejected": -1.0794004201889038, "step": 420 }, { "epoch": 0.9, "learning_rate": 1.4662207078575685e-07, "logits/chosen": -2.1188504695892334, "logits/rejected": -2.0422720909118652, "logps/chosen": -986.8173828125, "logps/rejected": -1470.625244140625, "loss": 0.4786, "rewards/accuracies": 0.546875, "rewards/chosen": -0.7492085695266724, "rewards/margins": 0.5096833109855652, "rewards/rejected": -1.2588918209075928, "step": 430 }, { "epoch": 0.92, "learning_rate": 9.120948298936422e-08, "logits/chosen": -2.160806894302368, "logits/rejected": -2.0456414222717285, "logps/chosen": -828.1990356445312, "logps/rejected": -1279.345703125, "loss": 0.4768, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -0.6060695052146912, "rewards/margins": 0.45984458923339844, "rewards/rejected": -1.0659140348434448, "step": 440 }, { "epoch": 0.94, "learning_rate": 4.870879364444109e-08, "logits/chosen": -2.1377763748168945, "logits/rejected": -2.0168356895446777, "logps/chosen": -823.4695434570312, "logps/rejected": -1253.1837158203125, "loss": 0.4787, "rewards/accuracies": 0.5218750238418579, "rewards/chosen": -0.591966986656189, "rewards/margins": 0.44343581795692444, "rewards/rejected": -1.0354026556015015, "step": 450 }, { "epoch": 0.96, "learning_rate": 1.93478202307823e-08, "logits/chosen": -2.1620395183563232, "logits/rejected": -2.011014223098755, "logps/chosen": -961.3709106445312, "logps/rejected": -1517.798828125, "loss": 0.4762, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -0.7009885907173157, "rewards/margins": 0.592383086681366, "rewards/rejected": -1.2933716773986816, "step": 460 }, { "epoch": 0.98, "learning_rate": 3.283947088983663e-09, "logits/chosen": -2.198997735977173, "logits/rejected": -2.040200710296631, "logps/chosen": -834.0213623046875, "logps/rejected": -1428.2811279296875, "loss": 0.4748, "rewards/accuracies": 0.5718749761581421, "rewards/chosen": -0.594894289970398, "rewards/margins": 0.6237603425979614, "rewards/rejected": -1.2186545133590698, "step": 470 }, { "epoch": 1.0, "step": 477, "total_flos": 0.0, "train_loss": 0.38682256204777044, "train_runtime": 17349.9683, "train_samples_per_second": 3.524, "train_steps_per_second": 0.027 } ], "logging_steps": 10, "max_steps": 477, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 20, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }