{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": -1.7278180122375488, "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, "loss": 1.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, "logits/chosen": -1.8664803504943848, "logits/rejected": -1.8707994222640991, "logps/chosen": -36.978511810302734, "logps/rejected": -33.66939163208008, "loss": 0.9993, "rewards/accuracies": 0.5694444179534912, "rewards/chosen": 0.00028087408281862736, "rewards/margins": 0.0006740752141922712, "rewards/rejected": -0.00039320107316598296, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, "logits/chosen": -1.9984451532363892, "logits/rejected": -2.0010995864868164, "logps/chosen": -29.63176918029785, "logps/rejected": -29.05954933166504, "loss": 0.9999, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.00010425634536659345, "rewards/margins": 6.528960511786863e-05, "rewards/rejected": 3.8966707506915554e-05, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, "logits/chosen": -1.9210799932479858, "logits/rejected": -1.9183847904205322, "logps/chosen": -31.414783477783203, "logps/rejected": -33.19659423828125, "loss": 1.0002, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 1.3138540452928282e-05, "rewards/margins": -0.00020548875909298658, "rewards/rejected": 0.00021862727589905262, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, "logits/chosen": -2.0177221298217773, "logits/rejected": -2.008965492248535, "logps/chosen": -32.57322311401367, "logps/rejected": -32.500308990478516, "loss": 1.0001, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 3.584356090868823e-05, "rewards/margins": -8.777440234553069e-05, "rewards/rejected": 0.00012361796689219773, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, "logits/chosen": -1.8622690439224243, "logits/rejected": -1.851509690284729, "logps/chosen": -33.547603607177734, "logps/rejected": -35.463592529296875, "loss": 0.9998, "rewards/accuracies": 0.5, "rewards/chosen": 9.876764670480043e-05, "rewards/margins": 0.0002112251240760088, "rewards/rejected": -0.00011245747737120837, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, "logits/chosen": -1.9400131702423096, "logits/rejected": -1.9419806003570557, "logps/chosen": -32.52842330932617, "logps/rejected": -33.22877883911133, "loss": 0.9987, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0007175664068199694, "rewards/margins": 0.0012915965635329485, "rewards/rejected": -0.00057403021492064, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, "logits/chosen": -2.070552349090576, "logits/rejected": -2.0755274295806885, "logps/chosen": -34.00461959838867, "logps/rejected": -36.64922332763672, "loss": 0.9994, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -0.00016131921438500285, "rewards/margins": 0.0006422021542675793, "rewards/rejected": -0.0008035213686525822, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, "logits/chosen": -1.9306777715682983, "logits/rejected": -1.9338254928588867, "logps/chosen": -34.32624816894531, "logps/rejected": -34.661468505859375, "loss": 0.9983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0010156143689528108, "rewards/margins": 0.0017062196275219321, "rewards/rejected": -0.0006906053749844432, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, "logits/chosen": -1.9389193058013916, "logits/rejected": -1.9434226751327515, "logps/chosen": -32.38957214355469, "logps/rejected": -32.348140716552734, "loss": 0.9993, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.0007785108755342662, "rewards/margins": 0.0006708315922878683, "rewards/rejected": 0.00010767912317533046, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, "logits/chosen": -2.0358777046203613, "logits/rejected": -2.0339014530181885, "logps/chosen": -32.13254165649414, "logps/rejected": -31.29019546508789, "loss": 0.9987, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0010829826351255178, "rewards/margins": 0.001322855008766055, "rewards/rejected": -0.0002398724900558591, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": -2.2312774658203125, "eval_logits/rejected": -2.226422071456909, "eval_logps/chosen": -34.04991149902344, "eval_logps/rejected": -37.55283737182617, "eval_loss": 0.9997907280921936, "eval_rewards/accuracies": 0.5336378812789917, "eval_rewards/chosen": -0.0001535558985779062, "eval_rewards/margins": 0.00020861340453848243, "eval_rewards/rejected": -0.00036216925946064293, "eval_runtime": 146.0254, "eval_samples_per_second": 2.349, "eval_steps_per_second": 0.294, "step": 100 }, { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, "logits/chosen": -1.9907060861587524, "logits/rejected": -1.9883339405059814, "logps/chosen": -33.13169860839844, "logps/rejected": -34.033958435058594, "loss": 0.9988, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0011211589444428682, "rewards/margins": 0.0011670273961499333, "rewards/rejected": -4.586850991472602e-05, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, "logits/chosen": -2.002023458480835, "logits/rejected": -1.993699312210083, "logps/chosen": -32.341697692871094, "logps/rejected": -32.16511917114258, "loss": 0.9989, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0010320657165721059, "rewards/margins": 0.001068194629624486, "rewards/rejected": -3.612901855376549e-05, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, "logits/chosen": -2.028505802154541, "logits/rejected": -2.020526885986328, "logps/chosen": -30.3519287109375, "logps/rejected": -32.101314544677734, "loss": 0.9983, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0012378758983686566, "rewards/margins": 0.001720982021652162, "rewards/rejected": -0.0004831062688026577, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, "logits/chosen": -1.9588673114776611, "logits/rejected": -1.9690834283828735, "logps/chosen": -31.205490112304688, "logps/rejected": -32.55961608886719, "loss": 0.9976, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0021417266689240932, "rewards/margins": 0.0023890691809356213, "rewards/rejected": -0.00024734257021918893, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, "logits/chosen": -1.8695415258407593, "logits/rejected": -1.8707062005996704, "logps/chosen": -33.88127899169922, "logps/rejected": -34.7686653137207, "loss": 0.9968, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.0031141345389187336, "rewards/margins": 0.003228238318115473, "rewards/rejected": -0.00011410393926780671, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, "logits/chosen": -1.9212032556533813, "logits/rejected": -1.9178003072738647, "logps/chosen": -35.99773406982422, "logps/rejected": -32.705848693847656, "loss": 0.9984, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0017394202295690775, "rewards/margins": 0.0015704210381954908, "rewards/rejected": 0.00016899927868507802, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, "logits/chosen": -2.0206995010375977, "logits/rejected": -2.0133931636810303, "logps/chosen": -33.504085540771484, "logps/rejected": -31.432220458984375, "loss": 0.9963, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0029377774335443974, "rewards/margins": 0.003741443855687976, "rewards/rejected": -0.0008036663057282567, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, "logits/chosen": -2.0269291400909424, "logits/rejected": -2.032160997390747, "logps/chosen": -32.24355697631836, "logps/rejected": -32.431182861328125, "loss": 0.9975, "rewards/accuracies": 0.625, "rewards/chosen": 0.003012270200997591, "rewards/margins": 0.0024727012496441603, "rewards/rejected": 0.0005395688931457698, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, "logits/chosen": -2.027367115020752, "logits/rejected": -2.0246078968048096, "logps/chosen": -31.290613174438477, "logps/rejected": -31.361133575439453, "loss": 0.9976, "rewards/accuracies": 0.625, "rewards/chosen": 0.0019833946134895086, "rewards/margins": 0.002412599278613925, "rewards/rejected": -0.00042920451960526407, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, "logits/chosen": -1.8985168933868408, "logits/rejected": -1.903148889541626, "logps/chosen": -31.30405616760254, "logps/rejected": -32.838443756103516, "loss": 0.9965, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0030222723726183176, "rewards/margins": 0.0034973658621311188, "rewards/rejected": -0.0004750936641357839, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": -2.225177764892578, "eval_logits/rejected": -2.2203547954559326, "eval_logps/chosen": -34.06184387207031, "eval_logps/rejected": -37.579010009765625, "eval_loss": 0.9996482133865356, "eval_rewards/accuracies": 0.5070598125457764, "eval_rewards/chosen": -0.0002729461120907217, "eval_rewards/margins": 0.00035095339990220964, "eval_rewards/rejected": -0.0006238995119929314, "eval_runtime": 145.704, "eval_samples_per_second": 2.354, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, "logits/chosen": -2.011120557785034, "logits/rejected": -2.021751880645752, "logps/chosen": -31.745685577392578, "logps/rejected": -33.96772003173828, "loss": 0.9967, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0022561827208846807, "rewards/margins": 0.003306365106254816, "rewards/rejected": -0.0010501822689548135, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, "logits/chosen": -1.903857946395874, "logits/rejected": -1.918621301651001, "logps/chosen": -29.797290802001953, "logps/rejected": -31.628814697265625, "loss": 0.9962, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0031425058841705322, "rewards/margins": 0.0038488968275487423, "rewards/rejected": -0.0007063907687552273, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, "logits/chosen": -1.9593979120254517, "logits/rejected": -1.9633464813232422, "logps/chosen": -33.067623138427734, "logps/rejected": -31.64206886291504, "loss": 0.9956, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.003498472273349762, "rewards/margins": 0.004426136147230864, "rewards/rejected": -0.0009276636992581189, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, "logits/chosen": -1.9572632312774658, "logits/rejected": -1.9354908466339111, "logps/chosen": -33.843727111816406, "logps/rejected": -35.1453742980957, "loss": 0.995, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0031002266332507133, "rewards/margins": 0.0050460235215723515, "rewards/rejected": -0.0019457967719063163, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, "logits/chosen": -1.9997854232788086, "logits/rejected": -1.9964803457260132, "logps/chosen": -32.75019454956055, "logps/rejected": -36.28661346435547, "loss": 0.9976, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0017487213481217623, "rewards/margins": 0.0023655896075069904, "rewards/rejected": -0.0006168682011775672, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, "logits/chosen": -1.8673791885375977, "logits/rejected": -1.8649587631225586, "logps/chosen": -34.018226623535156, "logps/rejected": -35.539276123046875, "loss": 0.998, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.001701725646853447, "rewards/margins": 0.002048287307843566, "rewards/rejected": -0.0003465614281594753, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, "logits/chosen": -1.8522275686264038, "logits/rejected": -1.849872350692749, "logps/chosen": -34.16339874267578, "logps/rejected": -31.845317840576172, "loss": 0.9969, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.0023420110810548067, "rewards/margins": 0.0030819105450063944, "rewards/rejected": -0.0007398994639515877, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, "logits/chosen": -1.9549518823623657, "logits/rejected": -1.94447922706604, "logps/chosen": -35.027687072753906, "logps/rejected": -31.895471572875977, "loss": 0.9962, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003289591521024704, "rewards/margins": 0.003834384260699153, "rewards/rejected": -0.0005447928560897708, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, "logits/chosen": -2.0498766899108887, "logits/rejected": -2.034980297088623, "logps/chosen": -30.72440528869629, "logps/rejected": -32.658695220947266, "loss": 0.9979, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0019971781875938177, "rewards/margins": 0.002065772656351328, "rewards/rejected": -6.859400309622288e-05, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, "logits/chosen": -1.9201946258544922, "logits/rejected": -1.9177051782608032, "logps/chosen": -32.3183479309082, "logps/rejected": -30.95510482788086, "loss": 0.9925, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.006087628658860922, "rewards/margins": 0.007470599375665188, "rewards/rejected": -0.0013829706003889441, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": -2.2213804721832275, "eval_logits/rejected": -2.216555118560791, "eval_logps/chosen": -34.083614349365234, "eval_logps/rejected": -37.60634994506836, "eval_loss": 0.999591052532196, "eval_rewards/accuracies": 0.559385359287262, "eval_rewards/chosen": -0.0004906260874122381, "eval_rewards/margins": 0.00040669209556654096, "eval_rewards/rejected": -0.0008973181829787791, "eval_runtime": 145.8707, "eval_samples_per_second": 2.351, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, "learning_rate": 5.576113578589035e-07, "logits/chosen": -1.9060642719268799, "logits/rejected": -1.9028133153915405, "logps/chosen": -31.319162368774414, "logps/rejected": -33.85043716430664, "loss": 0.9961, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.002841859357431531, "rewards/margins": 0.003924719989299774, "rewards/rejected": -0.0010828599333763123, "step": 310 }, { "epoch": 0.83, "learning_rate": 4.229036944380913e-07, "logits/chosen": -1.9553836584091187, "logits/rejected": -1.9432109594345093, "logps/chosen": -34.27588653564453, "logps/rejected": -33.672359466552734, "loss": 0.9955, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.0032019000500440598, "rewards/margins": 0.004537059459835291, "rewards/rejected": -0.0013351596426218748, "step": 320 }, { "epoch": 0.86, "learning_rate": 3.053082288996112e-07, "logits/chosen": -1.9905990362167358, "logits/rejected": -1.9891618490219116, "logps/chosen": -33.116233825683594, "logps/rejected": -32.55724334716797, "loss": 0.9955, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0036955769173800945, "rewards/margins": 0.004472161643207073, "rewards/rejected": -0.0007765850750729442, "step": 330 }, { "epoch": 0.88, "learning_rate": 2.0579377374915805e-07, "logits/chosen": -2.0769362449645996, "logits/rejected": -2.0613036155700684, "logps/chosen": -33.791297912597656, "logps/rejected": -33.12422180175781, "loss": 0.9961, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.003880967851728201, "rewards/margins": 0.00394281093031168, "rewards/rejected": -6.184288213262334e-05, "step": 340 }, { "epoch": 0.91, "learning_rate": 1.2518018074041684e-07, "logits/chosen": -1.950060248374939, "logits/rejected": -1.9492241144180298, "logps/chosen": -32.82404327392578, "logps/rejected": -32.50709915161133, "loss": 0.995, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.004580510314553976, "rewards/margins": 0.005000022705644369, "rewards/rejected": -0.000419511750806123, "step": 350 }, { "epoch": 0.94, "learning_rate": 6.41315865106129e-08, "logits/chosen": -1.9050449132919312, "logits/rejected": -1.915305733680725, "logps/chosen": -31.87860679626465, "logps/rejected": -35.34981155395508, "loss": 0.9961, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0032608420588076115, "rewards/margins": 0.0038713677786290646, "rewards/rejected": -0.0006105261854827404, "step": 360 }, { "epoch": 0.96, "learning_rate": 2.3150941078050325e-08, "logits/chosen": -2.04546856880188, "logits/rejected": -2.039043426513672, "logps/chosen": -33.336219787597656, "logps/rejected": -29.269311904907227, "loss": 0.9964, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.0031574335880577564, "rewards/margins": 0.003586276201531291, "rewards/rejected": -0.0004288425261620432, "step": 370 }, { "epoch": 0.99, "learning_rate": 2.575864278703266e-09, "logits/chosen": -1.905160665512085, "logits/rejected": -1.907360315322876, "logps/chosen": -33.86741256713867, "logps/rejected": -30.982807159423828, "loss": 0.9952, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.0037163912784308195, "rewards/margins": 0.004818186163902283, "rewards/rejected": -0.0011017953511327505, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 0.9973225085766284, "train_runtime": 3253.1307, "train_samples_per_second": 0.946, "train_steps_per_second": 0.118 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }