diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 0.6931, + "loss": 0.5, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,155 +25,155 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.8665436506271362, - "logits/rejected": -1.8708542585372925, - "logps/chosen": -36.982460021972656, - "logps/rejected": -33.656585693359375, - "loss": 0.6882, - "rewards/accuracies": 0.5138888955116272, - "rewards/chosen": 0.0048276386223733425, - "rewards/margins": 0.010130541399121284, - "rewards/rejected": -0.005302901845425367, + "logits/chosen": -1.8665586709976196, + "logits/rejected": -1.8708692789077759, + "logps/chosen": -37.00250244140625, + "logps/rejected": -33.66969299316406, + "loss": 0.4985, + "rewards/accuracies": 0.5416666865348816, + "rewards/chosen": 0.0008193479152396321, + "rewards/margins": 0.008743342012166977, + "rewards/rejected": -0.007923995144665241, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.9975639581680298, - "logits/rejected": -2.000220537185669, - "logps/chosen": -29.6324462890625, - "logps/rejected": -29.057437896728516, - "loss": 0.693, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.0019495869055390358, - "rewards/margins": 0.0007479515625163913, - "rewards/rejected": 0.0012016354594379663, + "logits/chosen": -1.9974254369735718, + "logits/rejected": -2.0000810623168945, + "logps/chosen": -29.634906768798828, + "logps/rejected": -29.027408599853516, + "loss": 0.5009, + "rewards/accuracies": 0.38749998807907104, + "rewards/chosen": 0.001457492122426629, + "rewards/margins": -0.005749998614192009, + "rewards/rejected": 0.007207490503787994, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.9208921194076538, - "logits/rejected": -1.918195128440857, - "logps/chosen": -31.420969009399414, - "logps/rejected": -33.22509765625, - "loss": 0.6932, - "rewards/accuracies": 0.48750001192092896, - "rewards/chosen": -0.000974868715275079, - "rewards/margins": 0.0003514128620736301, - "rewards/rejected": -0.0013262818101793528, + "logits/chosen": -1.9204790592193604, + "logits/rejected": -1.9177839756011963, + "logps/chosen": -31.412555694580078, + "logps/rejected": -33.24369812011719, + "loss": 0.4992, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.0007082058000378311, + "rewards/margins": 0.005755766294896603, + "rewards/rejected": -0.005047560669481754, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.018131971359253, - "logits/rejected": -2.0093750953674316, - "logps/chosen": -32.56100082397461, - "logps/rejected": -32.525699615478516, - "loss": 0.6905, - "rewards/accuracies": 0.512499988079071, - "rewards/chosen": 0.0031610552687197924, - "rewards/margins": 0.0057663023471832275, - "rewards/rejected": -0.002605247776955366, + "logits/chosen": -2.0162367820739746, + "logits/rejected": -2.007521629333496, + "logps/chosen": -32.55222702026367, + "logps/rejected": -32.50428771972656, + "loss": 0.4992, + "rewards/accuracies": 0.5, + "rewards/chosen": 0.004915344063192606, + "rewards/margins": 0.0032389431726187468, + "rewards/rejected": 0.0016764007741585374, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8627980947494507, - "logits/rejected": -1.852043867111206, - "logps/chosen": -33.57464599609375, - "logps/rejected": -35.459083557128906, - "loss": 0.6945, - "rewards/accuracies": 0.4625000059604645, - "rewards/chosen": -0.003433463629335165, - "rewards/margins": -0.0020852875895798206, - "rewards/rejected": -0.0013481763890013099, + "logits/chosen": -1.8627817630767822, + "logits/rejected": -1.8519961833953857, + "logps/chosen": -33.52722930908203, + "logps/rejected": -35.41474151611328, + "loss": 0.5002, + "rewards/accuracies": 0.4749999940395355, + "rewards/chosen": 0.0060504418797791, + "rewards/margins": -0.001469915034249425, + "rewards/rejected": 0.007520356681197882, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9412128925323486, - "logits/rejected": -1.9431606531143188, - "logps/chosen": -32.51996612548828, - "logps/rejected": -33.226768493652344, - "loss": 0.6806, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.01604365184903145, - "rewards/margins": 0.02712194062769413, - "rewards/rejected": -0.011078287847340107, + "logits/chosen": -1.9401956796646118, + "logits/rejected": -1.9421344995498657, + "logps/chosen": -32.53112030029297, + "logps/rejected": -33.1898307800293, + "loss": 0.4964, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.01381197851151228, + "rewards/margins": 0.01750265061855316, + "rewards/rejected": -0.00369067071005702, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.072383165359497, - "logits/rejected": -2.0773634910583496, - "logps/chosen": -33.964813232421875, - "logps/rejected": -36.62574005126953, - "loss": 0.6861, - "rewards/accuracies": 0.5625, - "rewards/chosen": 0.004735334776341915, - "rewards/margins": 0.016109289601445198, - "rewards/rejected": -0.011373954825103283, + "logits/chosen": -2.071833610534668, + "logits/rejected": -2.076794147491455, + "logps/chosen": -33.93788528442383, + "logps/rejected": -36.575801849365234, + "loss": 0.4976, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.010120250284671783, + "rewards/margins": 0.011506280861794949, + "rewards/rejected": -0.0013860296458005905, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.9332096576690674, - "logits/rejected": -1.9363422393798828, - "logps/chosen": -34.310150146484375, - "logps/rejected": -34.669891357421875, - "loss": 0.6749, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.023532329127192497, - "rewards/margins": 0.03902902454137802, - "rewards/rejected": -0.015496693551540375, + "logits/chosen": -1.9325587749481201, + "logits/rejected": -1.9356613159179688, + "logps/chosen": -34.26636505126953, + "logps/rejected": -34.54140090942383, + "loss": 0.4947, + "rewards/accuracies": 0.550000011920929, + "rewards/chosen": 0.032289423048496246, + "rewards/margins": 0.022087663412094116, + "rewards/rejected": 0.01020175963640213, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.9401609897613525, - "logits/rejected": -1.944685935974121, - "logps/chosen": -32.39472961425781, - "logps/rejected": -32.360809326171875, - "loss": 0.6865, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.01453828252851963, - "rewards/margins": 0.014918209984898567, - "rewards/rejected": -0.000379929319024086, + "logits/chosen": -1.942940354347229, + "logits/rejected": -1.9474563598632812, + "logps/chosen": -32.32842254638672, + "logps/rejected": -32.28204345703125, + "loss": 0.4965, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.027799556031823158, + "rewards/margins": 0.012426125817000866, + "rewards/rejected": 0.015373429283499718, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.0377113819122314, - "logits/rejected": -2.035726308822632, - "logps/chosen": -32.14560317993164, - "logps/rejected": -31.297948837280273, - "loss": 0.6813, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.01904645375907421, - "rewards/margins": 0.0253940187394619, - "rewards/rejected": -0.006347564049065113, + "logits/chosen": -2.0405964851379395, + "logits/rejected": -2.0386147499084473, + "logps/chosen": -32.07122039794922, + "logps/rejected": -31.215023040771484, + "loss": 0.4947, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.033922821283340454, + "rewards/margins": 0.02368539571762085, + "rewards/rejected": 0.010237427428364754, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.231863498687744, - "eval_logits/rejected": -2.227025270462036, - "eval_logps/chosen": -34.02951431274414, - "eval_logps/rejected": -37.534454345703125, - "eval_loss": 0.6914929151535034, - "eval_rewards/accuracies": 0.5220099687576294, - "eval_rewards/chosen": 0.0010076783364638686, - "eval_rewards/margins": 0.004574176389724016, - "eval_rewards/rejected": -0.0035664979368448257, - "eval_runtime": 145.7925, + "eval_logits/chosen": -2.2347991466522217, + "eval_logits/rejected": -2.229947805404663, + "eval_logps/chosen": -33.91511917114258, + "eval_logps/rejected": -37.412628173828125, + "eval_loss": 0.499397873878479, + "eval_rewards/accuracies": 0.5215947031974792, + "eval_rewards/chosen": 0.023886699229478836, + "eval_rewards/margins": 0.0030889539048075676, + "eval_rewards/rejected": 0.020797746255993843, + "eval_runtime": 145.7619, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 @@ -181,2257 +181,441 @@ { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.992661476135254, - "logits/rejected": -1.9902756214141846, - "logps/chosen": -33.13728713989258, - "logps/rejected": -34.051490783691406, - "loss": 0.6834, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.02130572497844696, - "rewards/margins": 0.025729293003678322, - "rewards/rejected": -0.004423565696924925, + "logits/chosen": -1.9959571361541748, + "logits/rejected": -1.99361252784729, + "logps/chosen": -32.979209899902344, + "logps/rejected": -33.898841857910156, + "loss": 0.4925, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.052921634167432785, + "rewards/margins": 0.026816055178642273, + "rewards/rejected": 0.026105573400855064, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.0037741661071777, - "logits/rejected": -1.995439887046814, - "logps/chosen": -32.33051681518555, - "logps/rejected": -32.13713836669922, - "loss": 0.6853, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.022878289222717285, - "rewards/margins": 0.01800495944917202, - "rewards/rejected": 0.004873327445238829, + "logits/chosen": -2.007497787475586, + "logits/rejected": -1.9991832971572876, + "logps/chosen": -32.195396423339844, + "logps/rejected": -31.9866886138916, + "loss": 0.4967, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.04990261048078537, + "rewards/margins": 0.01493864320218563, + "rewards/rejected": 0.03496397286653519, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.032257318496704, - "logits/rejected": -2.024292230606079, - "logps/chosen": -30.306774139404297, - "logps/rejected": -32.081024169921875, - "loss": 0.6754, - "rewards/accuracies": 0.6499999761581421, - "rewards/chosen": 0.03378837928175926, - "rewards/margins": 0.03939158096909523, - "rewards/rejected": -0.00560319609940052, + "logits/chosen": -2.0352938175201416, + "logits/rejected": -2.027338743209839, + "logps/chosen": -30.16824722290039, + "logps/rejected": -31.9173526763916, + "loss": 0.4919, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.061494071036577225, + "rewards/margins": 0.034362830221652985, + "rewards/rejected": 0.027131233364343643, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.9630043506622314, - "logits/rejected": -1.973233938217163, - "logps/chosen": -31.224035263061523, - "logps/rejected": -32.56679916381836, - "loss": 0.6718, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.0391254760324955, - "rewards/margins": 0.045508723706007004, - "rewards/rejected": -0.006383246276527643, + "logits/chosen": -1.9656566381454468, + "logits/rejected": -1.9758468866348267, + "logps/chosen": -31.05475425720215, + "logps/rejected": -32.390235900878906, + "loss": 0.4893, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.0729818269610405, + "rewards/margins": 0.04405337944626808, + "rewards/rejected": 0.028928453102707863, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.874066710472107, - "logits/rejected": -1.8752658367156982, - "logps/chosen": -33.90449905395508, - "logps/rejected": -34.78742218017578, - "loss": 0.6642, + "logits/chosen": -1.877623200416565, + "logits/rejected": -1.8787848949432373, + "logps/chosen": -33.642906188964844, + "logps/rejected": -34.57593536376953, + "loss": 0.4826, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.05763882398605347, - "rewards/margins": 0.06367169320583344, - "rewards/rejected": -0.006032869219779968, + "rewards/chosen": 0.10995662212371826, + "rewards/margins": 0.07369254529476166, + "rewards/rejected": 0.0362640880048275, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9251247644424438, - "logits/rejected": -1.9217197895050049, - "logps/chosen": -36.002262115478516, - "logps/rejected": -32.70859909057617, - "loss": 0.6788, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.03388362005352974, - "rewards/margins": 0.031053191050887108, - "rewards/rejected": 0.0028304329607635736, + "logits/chosen": -1.9285871982574463, + "logits/rejected": -1.9251912832260132, + "logps/chosen": -35.79336929321289, + "logps/rejected": -32.49341583251953, + "loss": 0.4925, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.07566188275814056, + "rewards/margins": 0.029793858528137207, + "rewards/rejected": 0.045868031680583954, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.025562286376953, - "logits/rejected": -2.018228530883789, - "logps/chosen": -33.46672439575195, - "logps/rejected": -31.409042358398438, - "loss": 0.657, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.06622789800167084, - "rewards/margins": 0.07766623049974442, - "rewards/rejected": -0.011438337154686451, + "logits/chosen": -2.0295512676239014, + "logits/rejected": -2.0222458839416504, + "logps/chosen": -33.23942947387695, + "logps/rejected": -31.19607925415039, + "loss": 0.4806, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11168631166219711, + "rewards/margins": 0.08053232729434967, + "rewards/rejected": 0.0311539676040411, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.0323550701141357, - "logits/rejected": -2.037605047225952, - "logps/chosen": -32.23218536376953, - "logps/rejected": -32.439491271972656, - "loss": 0.6682, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.06252055615186691, - "rewards/margins": 0.05339093878865242, - "rewards/rejected": 0.009129621088504791, + "logits/chosen": -2.0365030765533447, + "logits/rejected": -2.0416781902313232, + "logps/chosen": -31.986160278320312, + "logps/rejected": -32.17144012451172, + "loss": 0.4878, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.11172527074813843, + "rewards/margins": 0.048985324800014496, + "rewards/rejected": 0.06273995339870453, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.0334537029266357, - "logits/rejected": -2.030702590942383, - "logps/chosen": -31.266775131225586, - "logps/rejected": -31.347997665405273, - "loss": 0.67, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.04443611577153206, - "rewards/margins": 0.050393205136060715, - "rewards/rejected": -0.005957084707915783, + "logits/chosen": -2.0367565155029297, + "logits/rejected": -2.0340359210968018, + "logps/chosen": -31.0958194732666, + "logps/rejected": -31.11456298828125, + "loss": 0.4905, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.07862688601016998, + "rewards/margins": 0.03789640590548515, + "rewards/rejected": 0.04073048755526543, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.9037307500839233, - "logits/rejected": -1.9083878993988037, - "logps/chosen": -31.30868148803711, - "logps/rejected": -32.823123931884766, - "loss": 0.6632, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.05951983854174614, - "rewards/margins": 0.0659579336643219, - "rewards/rejected": -0.006438094191253185, + "logits/chosen": -1.90771484375, + "logits/rejected": -1.912388801574707, + "logps/chosen": -31.08389663696289, + "logps/rejected": -32.62942123413086, + "loss": 0.4825, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10447730123996735, + "rewards/margins": 0.07217548787593842, + "rewards/rejected": 0.032301802188158035, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.22880482673645, - "eval_logits/rejected": -2.2239701747894287, - "eval_logps/chosen": -34.05141067504883, - "eval_logps/rejected": -37.58522033691406, - "eval_loss": 0.6888246536254883, - "eval_rewards/accuracies": 0.5660299062728882, - "eval_rewards/chosen": -0.003371814964339137, - "eval_rewards/margins": 0.01034836657345295, - "eval_rewards/rejected": -0.013720180839300156, - "eval_runtime": 145.665, - "eval_samples_per_second": 2.355, + "eval_logits/chosen": -2.2322466373443604, + "eval_logits/rejected": -2.2274229526519775, + "eval_logps/chosen": -33.779205322265625, + "eval_logps/rejected": -37.31794357299805, + "eval_loss": 0.4973558187484741, + "eval_rewards/accuracies": 0.5544019937515259, + "eval_rewards/chosen": 0.05106903612613678, + "eval_rewards/margins": 0.011334729380905628, + "eval_rewards/rejected": 0.039734311401844025, + "eval_runtime": 145.5488, + "eval_samples_per_second": 2.357, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.016369342803955, - "logits/rejected": -2.027027130126953, - "logps/chosen": -31.75009536743164, - "logps/rejected": -33.965911865234375, - "loss": 0.6631, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.04424174129962921, - "rewards/margins": 0.06488426774740219, - "rewards/rejected": -0.020642530173063278, + "logits/chosen": -2.0204057693481445, + "logits/rejected": -2.0309972763061523, + "logps/chosen": -31.54058837890625, + "logps/rejected": -33.71467590332031, + "loss": 0.4864, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.08614270389080048, + "rewards/margins": 0.0565376803278923, + "rewards/rejected": 0.029605034738779068, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.909157156944275, - "logits/rejected": -1.9239362478256226, - "logps/chosen": -29.8031005859375, - "logps/rejected": -31.647253036499023, - "loss": 0.6564, + "logits/chosen": -1.9130290746688843, + "logits/rejected": -1.9277244806289673, + "logps/chosen": -29.63765525817871, + "logps/rejected": -31.437763214111328, + "loss": 0.4831, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.06168792396783829, - "rewards/margins": 0.07950346171855927, - "rewards/rejected": -0.017815548926591873, + "rewards/chosen": 0.09477666765451431, + "rewards/margins": 0.07069384306669235, + "rewards/rejected": 0.02408282831311226, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.9656283855438232, - "logits/rejected": -1.9696086645126343, - "logps/chosen": -33.09147644042969, - "logps/rejected": -31.6539306640625, - "loss": 0.6543, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.06519898772239685, - "rewards/margins": 0.08612470328807831, - "rewards/rejected": -0.02092570997774601, + "logits/chosen": -1.9711300134658813, + "logits/rejected": -1.9751195907592773, + "logps/chosen": -32.86473846435547, + "logps/rejected": -31.40615463256836, + "loss": 0.4801, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.11054597795009613, + "rewards/margins": 0.0819164589047432, + "rewards/rejected": 0.02862953022122383, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.963721513748169, - "logits/rejected": -1.9419059753417969, - "logps/chosen": -33.84326934814453, - "logps/rejected": -35.13850784301758, - "loss": 0.6479, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.06209472939372063, - "rewards/margins": 0.0996372401714325, - "rewards/rejected": -0.03754251450300217, + "logits/chosen": -1.9691890478134155, + "logits/rejected": -1.9474375247955322, + "logps/chosen": -33.62578201293945, + "logps/rejected": -34.88764953613281, + "loss": 0.4781, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.10559363663196564, + "rewards/margins": 0.0929645225405693, + "rewards/rejected": 0.012629099190235138, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.004868268966675, - "logits/rejected": -2.0015549659729004, - "logps/chosen": -32.729896545410156, - "logps/rejected": -36.27812957763672, - "loss": 0.6706, + "logits/chosen": -2.010312557220459, + "logits/rejected": -2.007035732269287, + "logps/chosen": -32.49589920043945, + "logps/rejected": -35.9986686706543, + "loss": 0.49, "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.03903444483876228, - "rewards/margins": 0.04967557638883591, - "rewards/rejected": -0.010641133412718773, + "rewards/chosen": 0.08583381026983261, + "rewards/margins": 0.040582992136478424, + "rewards/rejected": 0.04525081440806389, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8725497722625732, - "logits/rejected": -1.8701589107513428, - "logps/chosen": -33.99323654174805, - "logps/rejected": -35.557159423828125, - "loss": 0.6706, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.039031852036714554, - "rewards/margins": 0.04953894019126892, - "rewards/rejected": -0.010507088154554367, + "logits/chosen": -1.8778746128082275, + "logits/rejected": -1.8754326105117798, + "logps/chosen": -33.7269287109375, + "logps/rejected": -35.267066955566406, + "loss": 0.4893, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.09229358285665512, + "rewards/margins": 0.044782862067222595, + "rewards/rejected": 0.047510724514722824, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8571383953094482, - "logits/rejected": -1.854741096496582, - "logps/chosen": -34.20746612548828, - "logps/rejected": -31.83416748046875, - "loss": 0.6708, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.0380266010761261, - "rewards/margins": 0.05059366300702095, - "rewards/rejected": -0.01256705541163683, + "logits/chosen": -1.8628604412078857, + "logits/rejected": -1.8603498935699463, + "logps/chosen": -33.917144775390625, + "logps/rejected": -31.5814151763916, + "loss": 0.4863, + "rewards/accuracies": 0.637499988079071, + "rewards/chosen": 0.09609059244394302, + "rewards/margins": 0.058107007294893265, + "rewards/rejected": 0.03798357769846916, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9600715637207031, - "logits/rejected": -1.9495933055877686, - "logps/chosen": -34.99934387207031, - "logps/rejected": -31.900096893310547, - "loss": 0.6544, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.07145937532186508, - "rewards/margins": 0.08328022807836533, - "rewards/rejected": -0.011820845305919647, + "logits/chosen": -1.9663734436035156, + "logits/rejected": -1.9559385776519775, + "logps/chosen": -34.753868103027344, + "logps/rejected": -31.635868072509766, + "loss": 0.4806, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.1205538734793663, + "rewards/margins": 0.07952861487865448, + "rewards/rejected": 0.04102526605129242, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.0559241771698, - "logits/rejected": -2.04101300239563, - "logps/chosen": -30.744888305664062, - "logps/rejected": -32.62079620361328, - "loss": 0.6809, - "rewards/accuracies": 0.5375000238418579, - "rewards/chosen": 0.035847391933202744, - "rewards/margins": 0.029639026150107384, - "rewards/rejected": 0.006208371836692095, + "logits/chosen": -2.062568187713623, + "logits/rejected": -2.047731399536133, + "logps/chosen": -30.416040420532227, + "logps/rejected": -32.359901428222656, + "loss": 0.4895, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.1016167551279068, + "rewards/margins": 0.04322956129908562, + "rewards/rejected": 0.05838719755411148, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.9259971380233765, - "logits/rejected": -1.9234883785247803, - "logps/chosen": -32.35750961303711, - "logps/rejected": -30.92331314086914, - "loss": 0.6327, - "rewards/accuracies": 0.7875000238418579, - "rewards/chosen": 0.11392021179199219, - "rewards/margins": 0.13522081077098846, - "rewards/rejected": -0.021300604566931725, + "logits/chosen": -1.93375563621521, + "logits/rejected": -1.9313066005706787, + "logps/chosen": -32.096317291259766, + "logps/rejected": -30.669086456298828, + "loss": 0.4669, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.16615837812423706, + "rewards/margins": 0.13661329448223114, + "rewards/rejected": 0.029545078054070473, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.2262163162231445, - "eval_logits/rejected": -2.2213830947875977, - "eval_logps/chosen": -34.093963623046875, - "eval_logps/rejected": -37.60972213745117, - "eval_loss": 0.6908519268035889, - "eval_rewards/accuracies": 0.49501657485961914, - "eval_rewards/chosen": -0.011881927028298378, - "eval_rewards/margins": 0.006738076452165842, - "eval_rewards/rejected": -0.018620004877448082, - "eval_runtime": 145.4585, - "eval_samples_per_second": 2.358, - "eval_steps_per_second": 0.296, + "eval_logits/chosen": -2.231231927871704, + "eval_logits/rejected": -2.226423978805542, + "eval_logps/chosen": -33.768104553222656, + "eval_logps/rejected": -37.29201889038086, + "eval_loss": 0.49796417355537415, + "eval_rewards/accuracies": 0.5157807469367981, + "eval_rewards/chosen": 0.05328937619924545, + "eval_rewards/margins": 0.008368566632270813, + "eval_rewards/rejected": 0.04492080584168434, + "eval_runtime": 145.91, + "eval_samples_per_second": 2.351, + "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -1.911368727684021, - "logits/rejected": -1.9081214666366577, - "logps/chosen": -31.312576293945312, - "logps/rejected": -33.82461166381836, - "loss": 0.6597, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.05815497785806656, - "rewards/margins": 0.07464680820703506, - "rewards/rejected": -0.0164918415248394, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.9169145822525024, + "logits/rejected": -1.9137403964996338, + "logps/chosen": -31.04461097717285, + "logps/rejected": -33.57860565185547, + "loss": 0.4809, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.11174802482128143, + "rewards/margins": 0.07903869450092316, + "rewards/rejected": 0.03270933777093887, "step": 310 }, { "epoch": 0.83, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.9587844610214233, - "logits/rejected": -1.9465945959091187, - "logps/chosen": -34.316810607910156, - "logps/rejected": -33.684722900390625, - "loss": 0.6546, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.05585243180394173, - "rewards/margins": 0.08502905070781708, - "rewards/rejected": -0.0291766170412302, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.9684025049209595, + "logits/rejected": -1.956284523010254, + "logps/chosen": -34.03736114501953, + "logps/rejected": -33.44590377807617, + "loss": 0.4778, + "rewards/accuracies": 0.762499988079071, + "rewards/chosen": 0.11174283176660538, + "rewards/margins": 0.09315498173236847, + "rewards/rejected": 0.018587838858366013, "step": 320 }, { "epoch": 0.86, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -1.9922815561294556, - "logits/rejected": -1.990870475769043, - "logps/chosen": -33.09821319580078, - "logps/rejected": -32.55400085449219, - "loss": 0.6516, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.07751716673374176, - "rewards/margins": 0.09240014851093292, - "rewards/rejected": -0.014882983639836311, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -2.003418445587158, + "logits/rejected": -2.0020651817321777, + "logps/chosen": -32.87415313720703, + "logps/rejected": -32.24354553222656, + "loss": 0.4816, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.12232844531536102, + "rewards/margins": 0.07512133568525314, + "rewards/rejected": 0.04720713198184967, "step": 330 }, { "epoch": 0.88, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -2.077565908432007, - "logits/rejected": -2.0619282722473145, - "logps/chosen": -33.7425651550293, - "logps/rejected": -33.112735748291016, - "loss": 0.6535, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.08736560493707657, - "rewards/margins": 0.0863056406378746, - "rewards/rejected": 0.0010599673260003328, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.090510845184326, + "logits/rejected": -2.0748770236968994, + "logps/chosen": -33.48200607299805, + "logps/rejected": -32.82493209838867, + "loss": 0.4807, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.13947655260562897, + "rewards/margins": 0.0808553546667099, + "rewards/rejected": 0.05862119793891907, "step": 340 }, { "epoch": 0.91, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.9514176845550537, - "logits/rejected": -1.9505844116210938, - "logps/chosen": -32.773475646972656, - "logps/rejected": -32.514556884765625, - "loss": 0.6447, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.10172417014837265, - "rewards/margins": 0.11160662025213242, - "rewards/rejected": -0.009882445447146893, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.9624559879302979, + "logits/rejected": -1.9616531133651733, + "logps/chosen": -32.56671142578125, + "logps/rejected": -32.222007751464844, + "loss": 0.4774, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.14307641983032227, + "rewards/margins": 0.09444761276245117, + "rewards/rejected": 0.04862881451845169, "step": 350 }, { "epoch": 0.94, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -1.9032083749771118, - "logits/rejected": -1.9134079217910767, - "logps/chosen": -31.7545223236084, - "logps/rejected": -35.40223693847656, - "loss": 0.643, - "rewards/accuracies": 0.737500011920929, - "rewards/chosen": 0.09003358334302902, - "rewards/margins": 0.11272971332073212, - "rewards/rejected": -0.022696146741509438, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9191839694976807, + "logits/rejected": -1.9294923543930054, + "logps/chosen": -31.610462188720703, + "logps/rejected": -35.02184295654297, + "loss": 0.4841, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.11884615570306778, + "rewards/margins": 0.06546333432197571, + "rewards/rejected": 0.05338282510638237, "step": 360 }, { "epoch": 0.96, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -2.0375308990478516, - "logits/rejected": -2.0311739444732666, - "logps/chosen": -33.24945831298828, - "logps/rejected": -29.3138370513916, - "loss": 0.6481, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.08050236850976944, - "rewards/margins": 0.09798407554626465, - "rewards/rejected": -0.017481710761785507, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.0576701164245605, + "logits/rejected": -2.051162004470825, + "logps/chosen": -33.00985336303711, + "logps/rejected": -28.984561920166016, + "loss": 0.4803, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12842342257499695, + "rewards/margins": 0.08005058020353317, + "rewards/rejected": 0.04837283492088318, "step": 370 }, { "epoch": 0.99, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -1.8942573070526123, - "logits/rejected": -1.896502137184143, - "logps/chosen": -33.638389587402344, - "logps/rejected": -30.993362426757812, - "loss": 0.6294, - "rewards/accuracies": 0.7749999761581421, - "rewards/chosen": 0.12013156712055206, - "rewards/margins": 0.14427851140499115, - "rewards/rejected": -0.02414695918560028, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.9184468984603882, + "logits/rejected": -1.9206174612045288, + "logps/chosen": -33.628562927246094, + "logps/rejected": -30.75335121154785, + "loss": 0.4766, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.12209834903478622, + "rewards/margins": 0.09824265539646149, + "rewards/rejected": 0.023855695500969887, "step": 380 }, { - "epoch": 1.01, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.9207279682159424, - "logits/rejected": -1.9194484949111938, - "logps/chosen": -33.73949432373047, - "logps/rejected": -36.0501823425293, - "loss": 0.6032, - "rewards/accuracies": 0.7666667103767395, - "rewards/chosen": 0.12905877828598022, - "rewards/margins": 0.20789436995983124, - "rewards/rejected": -0.07883557677268982, - "step": 390 - }, - { - "epoch": 1.04, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.8543081283569336, - "logits/rejected": -1.845897912979126, - "logps/chosen": -30.96196937561035, - "logps/rejected": -36.452301025390625, - "loss": 0.581, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.1624010056257248, - "rewards/margins": 0.264740914106369, - "rewards/rejected": -0.10233992338180542, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.2016568183898926, - "eval_logits/rejected": -2.1968207359313965, - "eval_logps/chosen": -34.14461135864258, - "eval_logps/rejected": -37.69810104370117, - "eval_loss": 0.6878200173377991, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.022011177614331245, - "eval_rewards/margins": 0.014285118319094181, - "eval_rewards/rejected": -0.036296289414167404, - "eval_runtime": 146.2041, - "eval_samples_per_second": 2.346, - "eval_steps_per_second": 0.294, - "step": 400 - }, - { - "epoch": 1.06, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -2.023873805999756, - "logits/rejected": -2.026768207550049, - "logps/chosen": -32.23768997192383, - "logps/rejected": -35.340965270996094, - "loss": 0.5771, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.16324953734874725, - "rewards/margins": 0.27287349104881287, - "rewards/rejected": -0.10962393134832382, - "step": 410 - }, - { - "epoch": 1.09, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.8587830066680908, - "logits/rejected": -1.857414960861206, - "logps/chosen": -28.370670318603516, - "logps/rejected": -32.84798049926758, - "loss": 0.5834, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.13937747478485107, - "rewards/margins": 0.24544134736061096, - "rewards/rejected": -0.10606386512517929, - "step": 420 - }, - { - "epoch": 1.12, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.8148523569107056, - "logits/rejected": -1.8079469203948975, - "logps/chosen": -33.094722747802734, - "logps/rejected": -34.54511642456055, - "loss": 0.5788, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.1958724707365036, - "rewards/margins": 0.255840927362442, - "rewards/rejected": -0.05996844172477722, - "step": 430 - }, - { - "epoch": 1.14, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -1.9824317693710327, - "logits/rejected": -1.9772965908050537, - "logps/chosen": -30.79900550842285, - "logps/rejected": -32.5714225769043, - "loss": 0.559, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.196858748793602, - "rewards/margins": 0.31762176752090454, - "rewards/rejected": -0.12076298147439957, - "step": 440 - }, - { - "epoch": 1.17, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -1.863440752029419, - "logits/rejected": -1.867692232131958, - "logps/chosen": -33.48546600341797, - "logps/rejected": -33.23447799682617, - "loss": 0.5476, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.20607607066631317, - "rewards/margins": 0.33524394035339355, - "rewards/rejected": -0.1291678547859192, - "step": 450 - }, - { - "epoch": 1.19, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -1.8666127920150757, - "logits/rejected": -1.8609821796417236, - "logps/chosen": -34.34830856323242, - "logps/rejected": -35.8173942565918, - "loss": 0.5326, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.21424289047718048, - "rewards/margins": 0.38790255784988403, - "rewards/rejected": -0.17365965247154236, - "step": 460 - }, - { - "epoch": 1.22, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.8877861499786377, - "logits/rejected": -1.887603998184204, - "logps/chosen": -33.108131408691406, - "logps/rejected": -34.76340866088867, - "loss": 0.5566, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.21203932166099548, - "rewards/margins": 0.32762524485588074, - "rewards/rejected": -0.11558592319488525, - "step": 470 - }, - { - "epoch": 1.25, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -1.9315589666366577, - "logits/rejected": -1.9128243923187256, - "logps/chosen": -33.0328254699707, - "logps/rejected": -33.85285949707031, - "loss": 0.5355, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2255876511335373, - "rewards/margins": 0.3821641206741333, - "rewards/rejected": -0.15657642483711243, - "step": 480 - }, - { - "epoch": 1.27, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.8951761722564697, - "logits/rejected": -1.8945060968399048, - "logps/chosen": -30.589183807373047, - "logps/rejected": -31.776708602905273, - "loss": 0.562, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.20442362129688263, - "rewards/margins": 0.31055623292922974, - "rewards/rejected": -0.1061326265335083, - "step": 490 - }, - { - "epoch": 1.3, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.85430908203125, - "logits/rejected": -1.847357153892517, - "logps/chosen": -31.502410888671875, - "logps/rejected": -35.497947692871094, - "loss": 0.5252, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.2335432469844818, - "rewards/margins": 0.4077844023704529, - "rewards/rejected": -0.17424120008945465, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.143381357192993, - "eval_logits/rejected": -2.138577938079834, - "eval_logps/chosen": -34.26863098144531, - "eval_logps/rejected": -37.889427185058594, - "eval_loss": 0.6833134889602661, - "eval_rewards/accuracies": 0.5801494717597961, - "eval_rewards/chosen": -0.04681617021560669, - "eval_rewards/margins": 0.027745069935917854, - "eval_rewards/rejected": -0.07456124573945999, - "eval_runtime": 145.9946, - "eval_samples_per_second": 2.349, - "eval_steps_per_second": 0.295, - "step": 500 - }, - { - "epoch": 1.32, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.8464529514312744, - "logits/rejected": -1.8500893115997314, - "logps/chosen": -28.302093505859375, - "logps/rejected": -33.746177673339844, - "loss": 0.5444, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.17454631626605988, - "rewards/margins": 0.351924866437912, - "rewards/rejected": -0.1773785799741745, - "step": 510 - }, - { - "epoch": 1.35, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -1.844003438949585, - "logits/rejected": -1.8547565937042236, - "logps/chosen": -32.249412536621094, - "logps/rejected": -31.553197860717773, - "loss": 0.5417, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.2025829255580902, - "rewards/margins": 0.38288554549217224, - "rewards/rejected": -0.18030261993408203, - "step": 520 - }, - { - "epoch": 1.38, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.7861287593841553, - "logits/rejected": -1.7838165760040283, - "logps/chosen": -30.66195297241211, - "logps/rejected": -31.060169219970703, - "loss": 0.5413, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.2010248899459839, - "rewards/margins": 0.3777065873146057, - "rewards/rejected": -0.17668168246746063, - "step": 530 - }, - { - "epoch": 1.4, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.8594945669174194, - "logits/rejected": -1.852640151977539, - "logps/chosen": -33.20045852661133, - "logps/rejected": -33.99872970581055, - "loss": 0.5297, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.20647427439689636, - "rewards/margins": 0.4027228355407715, - "rewards/rejected": -0.19624853134155273, - "step": 540 - }, - { - "epoch": 1.43, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.825993537902832, - "logits/rejected": -1.8323986530303955, - "logps/chosen": -30.916723251342773, - "logps/rejected": -33.54086685180664, - "loss": 0.5574, - "rewards/accuracies": 0.800000011920929, - "rewards/chosen": 0.16729258000850677, - "rewards/margins": 0.3237192630767822, - "rewards/rejected": -0.15642671287059784, - "step": 550 - }, - { - "epoch": 1.45, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.8956279754638672, - "logits/rejected": -1.872667908668518, - "logps/chosen": -30.50515365600586, - "logps/rejected": -33.68445587158203, - "loss": 0.5559, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.1864827573299408, - "rewards/margins": 0.3241623342037201, - "rewards/rejected": -0.1376795470714569, - "step": 560 - }, - { - "epoch": 1.48, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8265231847763062, - "logits/rejected": -1.8216123580932617, - "logps/chosen": -31.426382064819336, - "logps/rejected": -32.83190155029297, - "loss": 0.5552, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.19806936383247375, - "rewards/margins": 0.34026631712913513, - "rewards/rejected": -0.14219696819782257, - "step": 570 - }, - { - "epoch": 1.51, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.8608344793319702, - "logits/rejected": -1.8626718521118164, - "logps/chosen": -33.50287628173828, - "logps/rejected": -35.984031677246094, - "loss": 0.5285, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.17458313703536987, - "rewards/margins": 0.41120585799217224, - "rewards/rejected": -0.23662272095680237, - "step": 580 - }, - { - "epoch": 1.53, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.8413366079330444, - "logits/rejected": -1.840811014175415, - "logps/chosen": -29.51900863647461, - "logps/rejected": -32.4603157043457, - "loss": 0.5273, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.2146090269088745, - "rewards/margins": 0.41434532403945923, - "rewards/rejected": -0.19973623752593994, - "step": 590 - }, - { - "epoch": 1.56, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.8494768142700195, - "logits/rejected": -1.8471746444702148, - "logps/chosen": -31.929302215576172, - "logps/rejected": -33.2027587890625, - "loss": 0.5389, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.2137964963912964, - "rewards/margins": 0.3943565785884857, - "rewards/rejected": -0.18056008219718933, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.0948598384857178, - "eval_logits/rejected": -2.09011173248291, - "eval_logps/chosen": -34.46384048461914, - "eval_logps/rejected": -38.17683792114258, - "eval_loss": 0.6771385073661804, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.08585790544748306, - "eval_rewards/margins": 0.04618564993143082, - "eval_rewards/rejected": -0.13204355537891388, - "eval_runtime": 145.9638, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 600 - }, - { - "epoch": 1.58, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -1.886190414428711, - "logits/rejected": -1.8926368951797485, - "logps/chosen": -31.31719398498535, - "logps/rejected": -33.124969482421875, - "loss": 0.5331, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.1840377300977707, - "rewards/margins": 0.396394282579422, - "rewards/rejected": -0.2123565673828125, - "step": 610 - }, - { - "epoch": 1.61, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.8389345407485962, - "logits/rejected": -1.8366647958755493, - "logps/chosen": -33.65427780151367, - "logps/rejected": -31.928466796875, - "loss": 0.5313, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.22032015025615692, - "rewards/margins": 0.40891194343566895, - "rewards/rejected": -0.18859176337718964, - "step": 620 - }, - { - "epoch": 1.64, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.7820647954940796, - "logits/rejected": -1.7755095958709717, - "logps/chosen": -34.277992248535156, - "logps/rejected": -33.469478607177734, - "loss": 0.5147, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.2372467964887619, - "rewards/margins": 0.45970138907432556, - "rewards/rejected": -0.22245457768440247, - "step": 630 - }, - { - "epoch": 1.66, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.7966333627700806, - "logits/rejected": -1.8029024600982666, - "logps/chosen": -32.984920501708984, - "logps/rejected": -34.18581008911133, - "loss": 0.5374, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.19887647032737732, - "rewards/margins": 0.38798633217811584, - "rewards/rejected": -0.18910983204841614, - "step": 640 - }, - { - "epoch": 1.69, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.8746440410614014, - "logits/rejected": -1.8887230157852173, - "logps/chosen": -31.00433349609375, - "logps/rejected": -32.96894836425781, - "loss": 0.5456, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.16962555050849915, - "rewards/margins": 0.3638772964477539, - "rewards/rejected": -0.19425176084041595, - "step": 650 - }, - { - "epoch": 1.71, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.7442607879638672, - "logits/rejected": -1.7411283254623413, - "logps/chosen": -32.64227294921875, - "logps/rejected": -36.11538314819336, - "loss": 0.4891, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.25613728165626526, - "rewards/margins": 0.5391642451286316, - "rewards/rejected": -0.28302693367004395, - "step": 660 - }, - { - "epoch": 1.74, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.8324668407440186, - "logits/rejected": -1.8325097560882568, - "logps/chosen": -30.450613021850586, - "logps/rejected": -35.19148635864258, - "loss": 0.5221, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.16699588298797607, - "rewards/margins": 0.4400172829627991, - "rewards/rejected": -0.273021399974823, - "step": 670 - }, - { - "epoch": 1.77, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.777939796447754, - "logits/rejected": -1.7746601104736328, - "logps/chosen": -30.014904022216797, - "logps/rejected": -34.034278869628906, - "loss": 0.5343, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.1714888960123062, - "rewards/margins": 0.41478481888771057, - "rewards/rejected": -0.24329595267772675, - "step": 680 - }, - { - "epoch": 1.79, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.8041671514511108, - "logits/rejected": -1.8042271137237549, - "logps/chosen": -28.870250701904297, - "logps/rejected": -31.84161376953125, - "loss": 0.5054, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.25856813788414, - "rewards/margins": 0.492791086435318, - "rewards/rejected": -0.2342229187488556, - "step": 690 - }, - { - "epoch": 1.82, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.7211097478866577, - "logits/rejected": -1.7244020700454712, - "logps/chosen": -33.21913528442383, - "logps/rejected": -32.624568939208984, - "loss": 0.5239, - "rewards/accuracies": 0.762499988079071, - "rewards/chosen": 0.2518671751022339, - "rewards/margins": 0.4838434159755707, - "rewards/rejected": -0.2319762408733368, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.0655694007873535, - "eval_logits/rejected": -2.0608956813812256, - "eval_logps/chosen": -34.67680358886719, - "eval_logps/rejected": -38.3806037902832, - "eval_loss": 0.6811580657958984, - "eval_rewards/accuracies": 0.5627076625823975, - "eval_rewards/chosen": -0.12845046818256378, - "eval_rewards/margins": 0.044346876442432404, - "eval_rewards/rejected": -0.17279735207557678, - "eval_runtime": 145.9699, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 700 - }, - { - "epoch": 1.84, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.8006311655044556, - "logits/rejected": -1.7920291423797607, - "logps/chosen": -35.518272399902344, - "logps/rejected": -33.038963317871094, - "loss": 0.5239, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.19874954223632812, - "rewards/margins": 0.42537909746170044, - "rewards/rejected": -0.22662954032421112, - "step": 710 - }, - { - "epoch": 1.87, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.8101955652236938, - "logits/rejected": -1.8115628957748413, - "logps/chosen": -35.39076614379883, - "logps/rejected": -34.37725067138672, - "loss": 0.5138, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.20892438292503357, - "rewards/margins": 0.4691707491874695, - "rewards/rejected": -0.2602463662624359, - "step": 720 - }, - { - "epoch": 1.9, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -1.830071210861206, - "logits/rejected": -1.8276923894882202, - "logps/chosen": -31.283824920654297, - "logps/rejected": -34.07398223876953, - "loss": 0.5154, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.25590062141418457, - "rewards/margins": 0.47131747007369995, - "rewards/rejected": -0.215416818857193, - "step": 730 - }, - { - "epoch": 1.92, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.8064956665039062, - "logits/rejected": -1.8117122650146484, - "logps/chosen": -30.50588035583496, - "logps/rejected": -34.42152404785156, - "loss": 0.4968, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.25596895813941956, - "rewards/margins": 0.509309709072113, - "rewards/rejected": -0.25334087014198303, - "step": 740 - }, - { - "epoch": 1.95, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.7577202320098877, - "logits/rejected": -1.7564342021942139, - "logps/chosen": -32.0627555847168, - "logps/rejected": -36.04865264892578, - "loss": 0.5152, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.21244528889656067, - "rewards/margins": 0.4849843978881836, - "rewards/rejected": -0.2725391089916229, - "step": 750 - }, - { - "epoch": 1.97, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.633909821510315, - "logits/rejected": -1.6291061639785767, - "logps/chosen": -34.7264404296875, - "logps/rejected": -36.655662536621094, - "loss": 0.4918, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.25440096855163574, - "rewards/margins": 0.5438504815101624, - "rewards/rejected": -0.2894495129585266, - "step": 760 - }, - { - "epoch": 2.0, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.7661796808242798, - "logits/rejected": -1.7702823877334595, - "logps/chosen": -34.160831451416016, - "logps/rejected": -34.7751579284668, - "loss": 0.5223, - "rewards/accuracies": 0.82916659116745, - "rewards/chosen": 0.21056540310382843, - "rewards/margins": 0.44662851095199585, - "rewards/rejected": -0.23606309294700623, - "step": 770 - }, - { - "epoch": 2.03, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.7136824131011963, - "logits/rejected": -1.7119735479354858, - "logps/chosen": -32.115562438964844, - "logps/rejected": -36.59116744995117, - "loss": 0.4005, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.35720616579055786, - "rewards/margins": 0.7945086359977722, - "rewards/rejected": -0.43730250000953674, - "step": 780 - }, - { - "epoch": 2.05, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.7579456567764282, - "logits/rejected": -1.7560312747955322, - "logps/chosen": -31.616933822631836, - "logps/rejected": -34.80826187133789, - "loss": 0.4295, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.35176053643226624, - "rewards/margins": 0.6960979700088501, - "rewards/rejected": -0.34433746337890625, - "step": 790 - }, - { - "epoch": 2.08, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -1.6844114065170288, - "logits/rejected": -1.6787183284759521, - "logps/chosen": -32.91436004638672, - "logps/rejected": -33.189552307128906, - "loss": 0.4527, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.2937435507774353, - "rewards/margins": 0.6388018131256104, - "rewards/rejected": -0.34505826234817505, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.0512404441833496, - "eval_logits/rejected": -2.0466127395629883, - "eval_logps/chosen": -34.70791244506836, - "eval_logps/rejected": -38.482669830322266, - "eval_loss": 0.6754072308540344, - "eval_rewards/accuracies": 0.5627076625823975, - "eval_rewards/chosen": -0.1346716433763504, - "eval_rewards/margins": 0.05853841453790665, - "eval_rewards/rejected": -0.19321005046367645, - "eval_runtime": 145.8672, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, - "step": 800 - }, - { - "epoch": 2.1, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.7809431552886963, - "logits/rejected": -1.788240671157837, - "logps/chosen": -30.48040771484375, - "logps/rejected": -36.21466827392578, - "loss": 0.4324, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.32956990599632263, - "rewards/margins": 0.7213352918624878, - "rewards/rejected": -0.3917653560638428, - "step": 810 - }, - { - "epoch": 2.13, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.756633996963501, - "logits/rejected": -1.7590742111206055, - "logps/chosen": -31.144771575927734, - "logps/rejected": -34.56122589111328, - "loss": 0.4894, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.2852618098258972, - "rewards/margins": 0.5515455603599548, - "rewards/rejected": -0.26628372073173523, - "step": 820 - }, - { - "epoch": 2.16, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.653343915939331, - "logits/rejected": -1.6575119495391846, - "logps/chosen": -31.099651336669922, - "logps/rejected": -36.96272659301758, - "loss": 0.431, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.29836100339889526, - "rewards/margins": 0.7088698744773865, - "rewards/rejected": -0.410508930683136, - "step": 830 - }, - { - "epoch": 2.18, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.72197687625885, - "logits/rejected": -1.718133568763733, - "logps/chosen": -31.479351043701172, - "logps/rejected": -34.477298736572266, - "loss": 0.4712, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.29785966873168945, - "rewards/margins": 0.5943899750709534, - "rewards/rejected": -0.29653024673461914, - "step": 840 - }, - { - "epoch": 2.21, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.7955116033554077, - "logits/rejected": -1.7977268695831299, - "logps/chosen": -31.038639068603516, - "logps/rejected": -37.74627685546875, - "loss": 0.4125, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.3473394811153412, - "rewards/margins": 0.7642072439193726, - "rewards/rejected": -0.41686782240867615, - "step": 850 - }, - { - "epoch": 2.23, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -1.6078054904937744, - "logits/rejected": -1.602609634399414, - "logps/chosen": -34.81996154785156, - "logps/rejected": -34.76050567626953, - "loss": 0.4487, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.2923429608345032, - "rewards/margins": 0.666063666343689, - "rewards/rejected": -0.3737207055091858, - "step": 860 - }, - { - "epoch": 2.26, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.6567983627319336, - "logits/rejected": -1.6568479537963867, - "logps/chosen": -34.18325424194336, - "logps/rejected": -36.891624450683594, - "loss": 0.4223, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.331909716129303, - "rewards/margins": 0.7409814596176147, - "rewards/rejected": -0.40907174348831177, - "step": 870 - }, - { - "epoch": 2.29, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.6990554332733154, - "logits/rejected": -1.6875009536743164, - "logps/chosen": -32.56647491455078, - "logps/rejected": -35.05681610107422, - "loss": 0.4156, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3080216944217682, - "rewards/margins": 0.7349780797958374, - "rewards/rejected": -0.4269563555717468, - "step": 880 - }, - { - "epoch": 2.31, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.7994136810302734, - "logits/rejected": -1.803999662399292, - "logps/chosen": -31.997573852539062, - "logps/rejected": -34.711036682128906, - "loss": 0.4378, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2664729058742523, - "rewards/margins": 0.7213842868804932, - "rewards/rejected": -0.45491141080856323, - "step": 890 - }, - { - "epoch": 2.34, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.7206227779388428, - "logits/rejected": -1.7134149074554443, - "logps/chosen": -33.088829040527344, - "logps/rejected": -34.64997100830078, - "loss": 0.4042, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.3530493974685669, - "rewards/margins": 0.8026164770126343, - "rewards/rejected": -0.44956716895103455, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.022648572921753, - "eval_logits/rejected": -2.0180346965789795, - "eval_logps/chosen": -34.87144470214844, - "eval_logps/rejected": -38.65544128417969, - "eval_loss": 0.6781958937644958, - "eval_rewards/accuracies": 0.565614640712738, - "eval_rewards/chosen": -0.1673787236213684, - "eval_rewards/margins": 0.06038525328040123, - "eval_rewards/rejected": -0.22776399552822113, - "eval_runtime": 145.9687, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 900 - }, - { - "epoch": 2.36, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.7143480777740479, - "logits/rejected": -1.7145404815673828, - "logps/chosen": -29.756851196289062, - "logps/rejected": -37.19713592529297, - "loss": 0.4132, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.28206199407577515, - "rewards/margins": 0.7835511565208435, - "rewards/rejected": -0.5014891028404236, - "step": 910 - }, - { - "epoch": 2.39, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.6657079458236694, - "logits/rejected": -1.6576827764511108, - "logps/chosen": -30.637893676757812, - "logps/rejected": -35.93877410888672, - "loss": 0.431, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.24021020531654358, - "rewards/margins": 0.7136905193328857, - "rewards/rejected": -0.4734802842140198, - "step": 920 - }, - { - "epoch": 2.42, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -1.8297160863876343, - "logits/rejected": -1.829289197921753, - "logps/chosen": -28.57230567932129, - "logps/rejected": -34.938716888427734, - "loss": 0.4478, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.28494688868522644, - "rewards/margins": 0.6997185945510864, - "rewards/rejected": -0.4147717356681824, - "step": 930 - }, - { - "epoch": 2.44, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.7081295251846313, - "logits/rejected": -1.71817946434021, - "logps/chosen": -32.8480110168457, - "logps/rejected": -35.46654510498047, - "loss": 0.4128, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.315410852432251, - "rewards/margins": 0.8107908368110657, - "rewards/rejected": -0.4953800141811371, - "step": 940 - }, - { - "epoch": 2.47, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.7529624700546265, - "logits/rejected": -1.7564818859100342, - "logps/chosen": -29.49724769592285, - "logps/rejected": -33.9348030090332, - "loss": 0.4488, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2300949990749359, - "rewards/margins": 0.6698387861251831, - "rewards/rejected": -0.4397437572479248, - "step": 950 - }, - { - "epoch": 2.49, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.7402656078338623, - "logits/rejected": -1.7361259460449219, - "logps/chosen": -32.50096893310547, - "logps/rejected": -36.015480041503906, - "loss": 0.4371, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.31484872102737427, - "rewards/margins": 0.7366662621498108, - "rewards/rejected": -0.4218175411224365, - "step": 960 - }, - { - "epoch": 2.52, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.6588690280914307, - "logits/rejected": -1.6611392498016357, - "logps/chosen": -31.239105224609375, - "logps/rejected": -37.4363899230957, - "loss": 0.4187, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.3147282004356384, - "rewards/margins": 0.7656875252723694, - "rewards/rejected": -0.45095935463905334, - "step": 970 - }, - { - "epoch": 2.55, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.6898727416992188, - "logits/rejected": -1.6835498809814453, - "logps/chosen": -34.387081146240234, - "logps/rejected": -38.79052734375, - "loss": 0.4381, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.20359770953655243, - "rewards/margins": 0.7093115448951721, - "rewards/rejected": -0.5057138204574585, - "step": 980 - }, - { - "epoch": 2.57, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.6725902557373047, - "logits/rejected": -1.6671783924102783, - "logps/chosen": -29.603830337524414, - "logps/rejected": -38.36696243286133, - "loss": 0.4257, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.27982446551322937, - "rewards/margins": 0.7552399635314941, - "rewards/rejected": -0.4754153788089752, - "step": 990 - }, - { - "epoch": 2.6, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.6507503986358643, - "logits/rejected": -1.6404520273208618, - "logps/chosen": -30.60138511657715, - "logps/rejected": -31.420658111572266, - "loss": 0.4706, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2294611930847168, - "rewards/margins": 0.5936459302902222, - "rewards/rejected": -0.36418476700782776, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.0031397342681885, - "eval_logits/rejected": -1.998576283454895, - "eval_logps/chosen": -34.9782829284668, - "eval_logps/rejected": -38.792564392089844, - "eval_loss": 0.6767791509628296, - "eval_rewards/accuracies": 0.5772424936294556, - "eval_rewards/chosen": -0.1887458711862564, - "eval_rewards/margins": 0.06644343584775925, - "eval_rewards/rejected": -0.25518932938575745, - "eval_runtime": 145.9615, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1000 - }, - { - "epoch": 2.62, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.7468115091323853, - "logits/rejected": -1.7462623119354248, - "logps/chosen": -30.48056983947754, - "logps/rejected": -32.37314987182617, - "loss": 0.4553, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.26872488856315613, - "rewards/margins": 0.6916586756706238, - "rewards/rejected": -0.42293381690979004, - "step": 1010 - }, - { - "epoch": 2.65, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.7937511205673218, - "logits/rejected": -1.7943975925445557, - "logps/chosen": -30.599365234375, - "logps/rejected": -33.3109016418457, - "loss": 0.4334, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.26884275674819946, - "rewards/margins": 0.6978610157966614, - "rewards/rejected": -0.4290183186531067, - "step": 1020 - }, - { - "epoch": 2.68, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.7542606592178345, - "logits/rejected": -1.7581384181976318, - "logps/chosen": -31.039047241210938, - "logps/rejected": -29.711040496826172, - "loss": 0.4752, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.24820418655872345, - "rewards/margins": 0.5936691164970398, - "rewards/rejected": -0.34546491503715515, - "step": 1030 - }, - { - "epoch": 2.7, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.757930040359497, - "logits/rejected": -1.7499282360076904, - "logps/chosen": -29.807552337646484, - "logps/rejected": -36.027854919433594, - "loss": 0.4206, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.2740689218044281, - "rewards/margins": 0.7795940637588501, - "rewards/rejected": -0.5055252313613892, - "step": 1040 - }, - { - "epoch": 2.73, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.702924132347107, - "logits/rejected": -1.713683843612671, - "logps/chosen": -32.3316650390625, - "logps/rejected": -34.53157424926758, - "loss": 0.4326, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.2442883998155594, - "rewards/margins": 0.701600968837738, - "rewards/rejected": -0.45731258392333984, - "step": 1050 - }, - { - "epoch": 2.75, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.7792609930038452, - "logits/rejected": -1.7700929641723633, - "logps/chosen": -32.188419342041016, - "logps/rejected": -39.166893005371094, - "loss": 0.4252, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.2624720335006714, - "rewards/margins": 0.8295727968215942, - "rewards/rejected": -0.5671008229255676, - "step": 1060 - }, - { - "epoch": 2.78, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -1.6865419149398804, - "logits/rejected": -1.6886098384857178, - "logps/chosen": -34.260337829589844, - "logps/rejected": -34.93286895751953, - "loss": 0.4168, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.3457896411418915, - "rewards/margins": 0.7927892208099365, - "rewards/rejected": -0.44699954986572266, - "step": 1070 - }, - { - "epoch": 2.81, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.8080484867095947, - "logits/rejected": -1.8084468841552734, - "logps/chosen": -31.47174644470215, - "logps/rejected": -36.87213134765625, - "loss": 0.3943, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3171781897544861, - "rewards/margins": 0.868468165397644, - "rewards/rejected": -0.551289975643158, - "step": 1080 - }, - { - "epoch": 2.83, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.7872835397720337, - "logits/rejected": -1.7911815643310547, - "logps/chosen": -29.605815887451172, - "logps/rejected": -35.87862014770508, - "loss": 0.4332, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.2574996054172516, - "rewards/margins": 0.7236566543579102, - "rewards/rejected": -0.46615704894065857, - "step": 1090 - }, - { - "epoch": 2.86, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.7762609720230103, - "logits/rejected": -1.7813961505889893, - "logps/chosen": -32.46584701538086, - "logps/rejected": -32.048606872558594, - "loss": 0.4851, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.22063057124614716, - "rewards/margins": 0.5500176548957825, - "rewards/rejected": -0.32938703894615173, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -1.997167944908142, - "eval_logits/rejected": -1.9925878047943115, - "eval_logps/chosen": -35.010650634765625, - "eval_logps/rejected": -38.846195220947266, - "eval_loss": 0.6752685308456421, - "eval_rewards/accuracies": 0.5772424936294556, - "eval_rewards/chosen": -0.1952199786901474, - "eval_rewards/margins": 0.07069465517997742, - "eval_rewards/rejected": -0.265914648771286, - "eval_runtime": 145.9132, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, - "step": 1100 - }, - { - "epoch": 2.88, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.767409324645996, - "logits/rejected": -1.7679336071014404, - "logps/chosen": -31.276708602905273, - "logps/rejected": -32.90143585205078, - "loss": 0.4357, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.3326892554759979, - "rewards/margins": 0.709793210029602, - "rewards/rejected": -0.3771039843559265, - "step": 1110 - }, - { - "epoch": 2.91, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.614660620689392, - "logits/rejected": -1.6168187856674194, - "logps/chosen": -33.12425994873047, - "logps/rejected": -35.205345153808594, - "loss": 0.4468, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.28814348578453064, - "rewards/margins": 0.7209606170654297, - "rewards/rejected": -0.43281716108322144, - "step": 1120 - }, - { - "epoch": 2.94, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.6470248699188232, - "logits/rejected": -1.6448488235473633, - "logps/chosen": -29.9570255279541, - "logps/rejected": -33.207984924316406, - "loss": 0.4831, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.2109387367963791, - "rewards/margins": 0.6193884015083313, - "rewards/rejected": -0.408449649810791, - "step": 1130 - }, - { - "epoch": 2.96, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.7829939126968384, - "logits/rejected": -1.7778551578521729, - "logps/chosen": -31.336633682250977, - "logps/rejected": -35.121238708496094, - "loss": 0.4157, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3406323492527008, - "rewards/margins": 0.7740785479545593, - "rewards/rejected": -0.43344616889953613, - "step": 1140 - }, - { - "epoch": 2.99, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.653777837753296, - "logits/rejected": -1.6442779302597046, - "logps/chosen": -32.26165008544922, - "logps/rejected": -33.48617172241211, - "loss": 0.4453, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.23801711201667786, - "rewards/margins": 0.6767706274986267, - "rewards/rejected": -0.438753604888916, - "step": 1150 - }, - { - "epoch": 3.01, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.7101352214813232, - "logits/rejected": -1.7105070352554321, - "logps/chosen": -32.104759216308594, - "logps/rejected": -33.233394622802734, - "loss": 0.4316, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3153984546661377, - "rewards/margins": 0.7217879891395569, - "rewards/rejected": -0.40638941526412964, - "step": 1160 - }, - { - "epoch": 3.04, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.7904865741729736, - "logits/rejected": -1.7850242853164673, - "logps/chosen": -32.22756576538086, - "logps/rejected": -34.872222900390625, - "loss": 0.4575, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2596810758113861, - "rewards/margins": 0.6458818316459656, - "rewards/rejected": -0.3862007260322571, - "step": 1170 - }, - { - "epoch": 3.06, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.6573622226715088, - "logits/rejected": -1.665421485900879, - "logps/chosen": -32.17869186401367, - "logps/rejected": -34.522247314453125, - "loss": 0.445, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.23086118698120117, - "rewards/margins": 0.6630229353904724, - "rewards/rejected": -0.43216174840927124, - "step": 1180 - }, - { - "epoch": 3.09, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.7110074758529663, - "logits/rejected": -1.705242395401001, - "logps/chosen": -30.413455963134766, - "logps/rejected": -35.588809967041016, - "loss": 0.4052, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3520320653915405, - "rewards/margins": 0.8131035566329956, - "rewards/rejected": -0.46107155084609985, - "step": 1190 - }, - { - "epoch": 3.12, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.6587133407592773, - "logits/rejected": -1.6605675220489502, - "logps/chosen": -32.00436782836914, - "logps/rejected": -35.51262664794922, - "loss": 0.4079, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.29721540212631226, - "rewards/margins": 0.847730815410614, - "rewards/rejected": -0.550515353679657, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -1.995952844619751, - "eval_logits/rejected": -1.991401195526123, - "eval_logps/chosen": -35.02239990234375, - "eval_logps/rejected": -38.853912353515625, - "eval_loss": 0.6756896376609802, - "eval_rewards/accuracies": 0.5685215592384338, - "eval_rewards/chosen": -0.19756944477558136, - "eval_rewards/margins": 0.06989007443189621, - "eval_rewards/rejected": -0.2674594819545746, - "eval_runtime": 145.9713, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1200 - }, - { - "epoch": 3.14, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.5981744527816772, - "logits/rejected": -1.5973711013793945, - "logps/chosen": -34.34618377685547, - "logps/rejected": -35.283565521240234, - "loss": 0.4011, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.34126046299934387, - "rewards/margins": 0.8561012148857117, - "rewards/rejected": -0.5148407220840454, - "step": 1210 - }, - { - "epoch": 3.17, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.7593389749526978, - "logits/rejected": -1.753101110458374, - "logps/chosen": -30.266992568969727, - "logps/rejected": -35.572105407714844, - "loss": 0.4037, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2873390316963196, - "rewards/margins": 0.8260520696640015, - "rewards/rejected": -0.5387130975723267, - "step": 1220 - }, - { - "epoch": 3.19, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.6186577081680298, - "logits/rejected": -1.6075941324234009, - "logps/chosen": -32.524078369140625, - "logps/rejected": -35.393341064453125, - "loss": 0.384, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.37232059240341187, - "rewards/margins": 0.8722246885299683, - "rewards/rejected": -0.49990415573120117, - "step": 1230 - }, - { - "epoch": 3.22, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.7074792385101318, - "logits/rejected": -1.7010929584503174, - "logps/chosen": -29.577234268188477, - "logps/rejected": -31.12249755859375, - "loss": 0.4626, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.22459976375102997, - "rewards/margins": 0.6755872964859009, - "rewards/rejected": -0.4509875178337097, - "step": 1240 - }, - { - "epoch": 3.25, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.7290546894073486, - "logits/rejected": -1.7365970611572266, - "logps/chosen": -33.69880676269531, - "logps/rejected": -33.51887130737305, - "loss": 0.4219, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3393164873123169, - "rewards/margins": 0.7394281625747681, - "rewards/rejected": -0.4001116156578064, - "step": 1250 - }, - { - "epoch": 3.27, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.7125896215438843, - "logits/rejected": -1.7129156589508057, - "logps/chosen": -31.79986572265625, - "logps/rejected": -35.48588180541992, - "loss": 0.4084, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.2457386553287506, - "rewards/margins": 0.7799408435821533, - "rewards/rejected": -0.5342021584510803, - "step": 1260 - }, - { - "epoch": 3.3, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -1.645140290260315, - "logits/rejected": -1.6442397832870483, - "logps/chosen": -31.326940536499023, - "logps/rejected": -36.054588317871094, - "loss": 0.3902, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.35277944803237915, - "rewards/margins": 0.8595754504203796, - "rewards/rejected": -0.5067960023880005, - "step": 1270 - }, - { - "epoch": 3.32, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.535873532295227, - "logits/rejected": -1.5315896272659302, - "logps/chosen": -33.540103912353516, - "logps/rejected": -33.158790588378906, - "loss": 0.4135, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.3394676148891449, - "rewards/margins": 0.7859278917312622, - "rewards/rejected": -0.4464602470397949, - "step": 1280 - }, - { - "epoch": 3.35, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.7358757257461548, - "logits/rejected": -1.720964789390564, - "logps/chosen": -32.98870086669922, - "logps/rejected": -37.79273223876953, - "loss": 0.3817, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.2717747390270233, - "rewards/margins": 0.8772575259208679, - "rewards/rejected": -0.605482816696167, - "step": 1290 - }, - { - "epoch": 3.38, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.6800758838653564, - "logits/rejected": -1.6846822500228882, - "logps/chosen": -31.980499267578125, - "logps/rejected": -39.190223693847656, - "loss": 0.3644, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.3074238896369934, - "rewards/margins": 0.9248245358467102, - "rewards/rejected": -0.6174007654190063, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -1.9952446222305298, - "eval_logits/rejected": -1.9906855821609497, - "eval_logps/chosen": -35.02177810668945, - "eval_logps/rejected": -38.85075759887695, - "eval_loss": 0.6762353777885437, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.19744610786437988, - "eval_rewards/margins": 0.06938131898641586, - "eval_rewards/rejected": -0.26682746410369873, - "eval_runtime": 145.9746, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1300 - }, - { - "epoch": 3.4, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.6574876308441162, - "logits/rejected": -1.657605767250061, - "logps/chosen": -34.51050567626953, - "logps/rejected": -39.725650787353516, - "loss": 0.4025, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.34670281410217285, - "rewards/margins": 0.8750103116035461, - "rewards/rejected": -0.5283073782920837, - "step": 1310 - }, - { - "epoch": 3.43, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.7422373294830322, - "logits/rejected": -1.7410180568695068, - "logps/chosen": -30.673898696899414, - "logps/rejected": -33.566566467285156, - "loss": 0.4088, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.28399163484573364, - "rewards/margins": 0.8066338300704956, - "rewards/rejected": -0.5226421356201172, - "step": 1320 - }, - { - "epoch": 3.45, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.6552095413208008, - "logits/rejected": -1.656686544418335, - "logps/chosen": -32.714210510253906, - "logps/rejected": -36.135841369628906, - "loss": 0.3991, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3320077359676361, - "rewards/margins": 0.8294411897659302, - "rewards/rejected": -0.49743351340293884, - "step": 1330 - }, - { - "epoch": 3.48, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.7152090072631836, - "logits/rejected": -1.7140846252441406, - "logps/chosen": -29.72369384765625, - "logps/rejected": -34.30165100097656, - "loss": 0.4137, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.2949947118759155, - "rewards/margins": 0.7569295167922974, - "rewards/rejected": -0.46193480491638184, - "step": 1340 - }, - { - "epoch": 3.51, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.7289466857910156, - "logits/rejected": -1.7141622304916382, - "logps/chosen": -33.12803268432617, - "logps/rejected": -39.370018005371094, - "loss": 0.3932, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.2377280294895172, - "rewards/margins": 0.8588449358940125, - "rewards/rejected": -0.6211169958114624, - "step": 1350 - }, - { - "epoch": 3.53, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.7152817249298096, - "logits/rejected": -1.7272228002548218, - "logps/chosen": -30.357595443725586, - "logps/rejected": -34.62307357788086, - "loss": 0.4085, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3421744406223297, - "rewards/margins": 0.8188406229019165, - "rewards/rejected": -0.4766661524772644, - "step": 1360 - }, - { - "epoch": 3.56, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.8232405185699463, - "logits/rejected": -1.822654366493225, - "logps/chosen": -30.2298641204834, - "logps/rejected": -39.070777893066406, - "loss": 0.3565, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.3601134717464447, - "rewards/margins": 1.0080931186676025, - "rewards/rejected": -0.6479796767234802, - "step": 1370 - }, - { - "epoch": 3.58, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.6886169910430908, - "logits/rejected": -1.6961129903793335, - "logps/chosen": -32.56038284301758, - "logps/rejected": -40.28064727783203, - "loss": 0.3973, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3240193724632263, - "rewards/margins": 0.8902201652526855, - "rewards/rejected": -0.566200852394104, - "step": 1380 - }, - { - "epoch": 3.61, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.6452823877334595, - "logits/rejected": -1.6412826776504517, - "logps/chosen": -32.96963119506836, - "logps/rejected": -33.69043731689453, - "loss": 0.42, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3595161437988281, - "rewards/margins": 0.7781243324279785, - "rewards/rejected": -0.41860824823379517, - "step": 1390 - }, - { - "epoch": 3.64, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.7927372455596924, - "logits/rejected": -1.7904326915740967, - "logps/chosen": -29.784997940063477, - "logps/rejected": -34.433712005615234, - "loss": 0.4147, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.302310585975647, - "rewards/margins": 0.7601593136787415, - "rewards/rejected": -0.4578487277030945, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -1.9954665899276733, - "eval_logits/rejected": -1.9909135103225708, - "eval_logps/chosen": -35.02546310424805, - "eval_logps/rejected": -38.848751068115234, - "eval_loss": 0.6764604449272156, - "eval_rewards/accuracies": 0.5830564498901367, - "eval_rewards/chosen": -0.19818241894245148, - "eval_rewards/margins": 0.0682431310415268, - "eval_rewards/rejected": -0.26642557978630066, - "eval_runtime": 146.0031, - "eval_samples_per_second": 2.349, - "eval_steps_per_second": 0.295, - "step": 1400 - }, - { - "epoch": 3.66, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -1.7428439855575562, - "logits/rejected": -1.7387222051620483, - "logps/chosen": -32.050254821777344, - "logps/rejected": -31.62164878845215, - "loss": 0.4305, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.31845641136169434, - "rewards/margins": 0.731894850730896, - "rewards/rejected": -0.41343846917152405, - "step": 1410 - }, - { - "epoch": 3.69, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.6962391138076782, - "logits/rejected": -1.6972090005874634, - "logps/chosen": -31.709964752197266, - "logps/rejected": -36.93099594116211, - "loss": 0.4252, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.3309827446937561, - "rewards/margins": 0.7323800325393677, - "rewards/rejected": -0.4013972878456116, - "step": 1420 - }, - { - "epoch": 3.71, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.6504757404327393, - "logits/rejected": -1.644012451171875, - "logps/chosen": -32.439308166503906, - "logps/rejected": -39.37647247314453, - "loss": 0.4018, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.3404920995235443, - "rewards/margins": 0.8779042363166809, - "rewards/rejected": -0.5374122262001038, - "step": 1430 - }, - { - "epoch": 3.74, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.6599947214126587, - "logits/rejected": -1.6544313430786133, - "logps/chosen": -27.771902084350586, - "logps/rejected": -34.0653190612793, - "loss": 0.4212, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.3082676827907562, - "rewards/margins": 0.7752003073692322, - "rewards/rejected": -0.46693262457847595, - "step": 1440 - }, - { - "epoch": 3.77, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.6984790563583374, - "logits/rejected": -1.6879154443740845, - "logps/chosen": -31.496490478515625, - "logps/rejected": -38.104469299316406, - "loss": 0.417, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.19700780510902405, - "rewards/margins": 0.760827898979187, - "rewards/rejected": -0.5638201832771301, - "step": 1450 - }, - { - "epoch": 3.79, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.6632159948349, - "logits/rejected": -1.6603494882583618, - "logps/chosen": -31.930124282836914, - "logps/rejected": -32.78700256347656, - "loss": 0.4537, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.27593058347702026, - "rewards/margins": 0.6551658511161804, - "rewards/rejected": -0.37923532724380493, - "step": 1460 - }, - { - "epoch": 3.82, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.7246555089950562, - "logits/rejected": -1.7306416034698486, - "logps/chosen": -30.238815307617188, - "logps/rejected": -35.39813232421875, - "loss": 0.3994, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.36607176065444946, - "rewards/margins": 0.8532134890556335, - "rewards/rejected": -0.48714175820350647, - "step": 1470 - }, - { - "epoch": 3.84, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -1.7249250411987305, - "logits/rejected": -1.7347908020019531, - "logps/chosen": -30.2387752532959, - "logps/rejected": -31.81027603149414, - "loss": 0.4214, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.2912960648536682, - "rewards/margins": 0.7276380658149719, - "rewards/rejected": -0.4363420605659485, - "step": 1480 - }, - { - "epoch": 3.87, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.6587145328521729, - "logits/rejected": -1.651919960975647, - "logps/chosen": -32.452491760253906, - "logps/rejected": -34.58582305908203, - "loss": 0.4325, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.24079903960227966, - "rewards/margins": 0.7158681154251099, - "rewards/rejected": -0.4750691056251526, - "step": 1490 - }, - { - "epoch": 3.9, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.7148383855819702, - "logits/rejected": -1.7118122577667236, - "logps/chosen": -32.22610092163086, - "logps/rejected": -36.112178802490234, - "loss": 0.4021, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.27989429235458374, - "rewards/margins": 0.8502857089042664, - "rewards/rejected": -0.5703914165496826, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -1.995542287826538, - "eval_logits/rejected": -1.9909816980361938, - "eval_logps/chosen": -35.02688217163086, - "eval_logps/rejected": -38.85047912597656, - "eval_loss": 0.6767085790634155, - "eval_rewards/accuracies": 0.5714285373687744, - "eval_rewards/chosen": -0.1984661966562271, - "eval_rewards/margins": 0.06830596923828125, - "eval_rewards/rejected": -0.26677215099334717, - "eval_runtime": 145.8804, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, - "step": 1500 - }, - { - "epoch": 3.92, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.7851879596710205, - "logits/rejected": -1.7856934070587158, - "logps/chosen": -28.24176025390625, - "logps/rejected": -35.181297302246094, - "loss": 0.4185, - "rewards/accuracies": 0.8125, - "rewards/chosen": 0.314903199672699, - "rewards/margins": 0.8030730485916138, - "rewards/rejected": -0.4881698489189148, - "step": 1510 - }, - { - "epoch": 3.95, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.6452668905258179, - "logits/rejected": -1.6448297500610352, - "logps/chosen": -31.499074935913086, - "logps/rejected": -36.205726623535156, - "loss": 0.4244, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.3085591197013855, - "rewards/margins": 0.7749578356742859, - "rewards/rejected": -0.46639877557754517, - "step": 1520 - }, - { - "epoch": 3.97, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -1.7937875986099243, - "logits/rejected": -1.7911087274551392, - "logps/chosen": -30.171310424804688, - "logps/rejected": -33.855953216552734, - "loss": 0.4275, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.2605496048927307, - "rewards/margins": 0.735010027885437, - "rewards/rejected": -0.4744604527950287, - "step": 1530 - }, - { - "epoch": 4.0, - "learning_rate": 0.0, - "logits/chosen": -1.7776581048965454, - "logits/rejected": -1.7787119150161743, - "logps/chosen": -29.523090362548828, - "logps/rejected": -31.804224014282227, - "loss": 0.4563, - "rewards/accuracies": 0.9416666030883789, - "rewards/chosen": 0.21937060356140137, - "rewards/margins": 0.6221782565116882, - "rewards/rejected": -0.40280765295028687, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.3823247471413055, - "train_runtime": 10805.5642, - "train_samples_per_second": 1.14, - "train_steps_per_second": 0.143 + "train_loss": 0.4876757522682091, + "train_runtime": 3250.1859, + "train_samples_per_second": 0.947, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,