diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,9 +1,9 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 4.0, + "epoch": 1.0, "eval_steps": 100, - "global_step": 1540, + "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, @@ -15,7 +15,7 @@ "logits/rejected": -1.7377450466156006, "logps/chosen": -29.553977966308594, "logps/rejected": -42.813133239746094, - "loss": 0.5, + "loss": 6.25, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, @@ -25,155 +25,155 @@ { "epoch": 0.03, "learning_rate": 1.282051282051282e-06, - "logits/chosen": -1.8665586709976196, - "logits/rejected": -1.8708692789077759, - "logps/chosen": -37.00250244140625, - "logps/rejected": -33.66969299316406, - "loss": 0.4985, - "rewards/accuracies": 0.5416666865348816, - "rewards/chosen": 0.0008193479152396321, - "rewards/margins": 0.008743342012166977, - "rewards/rejected": -0.007923995144665241, + "logits/chosen": -1.866089105606079, + "logits/rejected": -1.8704073429107666, + "logps/chosen": -36.98554229736328, + "logps/rejected": -33.6707763671875, + "loss": 5.979, + "rewards/accuracies": 0.5972222089767456, + "rewards/chosen": 0.004210897721350193, + "rewards/margins": 0.01235075294971466, + "rewards/rejected": -0.008139855228364468, "step": 10 }, { "epoch": 0.05, "learning_rate": 2.564102564102564e-06, - "logits/chosen": -1.9974254369735718, - "logits/rejected": -2.0000810623168945, - "logps/chosen": -29.634906768798828, - "logps/rejected": -29.027408599853516, - "loss": 0.5009, - "rewards/accuracies": 0.38749998807907104, - "rewards/chosen": 0.001457492122426629, - "rewards/margins": -0.005749998614192009, - "rewards/rejected": 0.007207490503787994, + "logits/chosen": -1.9978349208831787, + "logits/rejected": -2.0004687309265137, + "logps/chosen": -29.640878677368164, + "logps/rejected": -29.042272567749023, + "loss": 6.3859, + "rewards/accuracies": 0.4124999940395355, + "rewards/chosen": 0.00026315837749280035, + "rewards/margins": -0.003971050027757883, + "rewards/rejected": 0.004234207794070244, "step": 20 }, { "epoch": 0.08, "learning_rate": 3.846153846153847e-06, - "logits/chosen": -1.9204790592193604, - "logits/rejected": -1.9177839756011963, - "logps/chosen": -31.412555694580078, - "logps/rejected": -33.24369812011719, - "loss": 0.4992, + "logits/chosen": -1.9206383228302002, + "logits/rejected": -1.9179503917694092, + "logps/chosen": -31.404415130615234, + "logps/rejected": -33.228981018066406, + "loss": 6.1874, "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.0007082058000378311, - "rewards/margins": 0.005755766294896603, - "rewards/rejected": -0.005047560669481754, + "rewards/chosen": 0.00233639357611537, + "rewards/margins": 0.004440182354301214, + "rewards/rejected": -0.002103788312524557, "step": 30 }, { "epoch": 0.1, "learning_rate": 4.999896948438434e-06, - "logits/chosen": -2.0162367820739746, - "logits/rejected": -2.007521629333496, - "logps/chosen": -32.55222702026367, - "logps/rejected": -32.50428771972656, - "loss": 0.4992, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.004915344063192606, - "rewards/margins": 0.0032389431726187468, - "rewards/rejected": 0.0016764007741585374, + "logits/chosen": -2.017291784286499, + "logits/rejected": -2.008547067642212, + "logps/chosen": -32.58599090576172, + "logps/rejected": -32.512664794921875, + "loss": 6.3474, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": -0.001837015151977539, + "rewards/margins": -0.0018385002622380853, + "rewards/rejected": 1.4854595065116882e-06, "step": 40 }, { "epoch": 0.13, "learning_rate": 4.987541037542187e-06, - "logits/chosen": -1.8627817630767822, - "logits/rejected": -1.8519961833953857, - "logps/chosen": -33.52722930908203, - "logps/rejected": -35.41474151611328, - "loss": 0.5002, - "rewards/accuracies": 0.4749999940395355, - "rewards/chosen": 0.0060504418797791, - "rewards/margins": -0.001469915034249425, - "rewards/rejected": 0.007520356681197882, + "logits/chosen": -1.8622735738754272, + "logits/rejected": -1.8514816761016846, + "logps/chosen": -33.54685592651367, + "logps/rejected": -35.447818756103516, + "loss": 6.2748, + "rewards/accuracies": 0.512499988079071, + "rewards/chosen": 0.0021248008124530315, + "rewards/margins": 0.0012201189529150724, + "rewards/rejected": 0.0009046817431226373, "step": 50 }, { "epoch": 0.16, "learning_rate": 4.954691471941119e-06, - "logits/chosen": -1.9401956796646118, - "logits/rejected": -1.9421344995498657, - "logps/chosen": -32.53112030029297, - "logps/rejected": -33.1898307800293, - "loss": 0.4964, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.01381197851151228, - "rewards/margins": 0.01750265061855316, - "rewards/rejected": -0.00369067071005702, + "logits/chosen": -1.9413617849349976, + "logits/rejected": -1.943302869796753, + "logps/chosen": -32.52573013305664, + "logps/rejected": -33.22004318237305, + "loss": 5.8451, + "rewards/accuracies": 0.5874999761581421, + "rewards/chosen": 0.014890777878463268, + "rewards/margins": 0.024623576551675797, + "rewards/rejected": -0.009732798673212528, "step": 60 }, { "epoch": 0.18, "learning_rate": 4.901618883413549e-06, - "logits/chosen": -2.071833610534668, - "logits/rejected": -2.076794147491455, - "logps/chosen": -33.93788528442383, - "logps/rejected": -36.575801849365234, - "loss": 0.4976, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.010120250284671783, - "rewards/margins": 0.011506280861794949, - "rewards/rejected": -0.0013860296458005905, + "logits/chosen": -2.0721487998962402, + "logits/rejected": -2.07711124420166, + "logps/chosen": -33.97162628173828, + "logps/rejected": -36.63127517700195, + "loss": 6.0458, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.0033715851604938507, + "rewards/margins": 0.015852421522140503, + "rewards/rejected": -0.012480835430324078, "step": 70 }, { "epoch": 0.21, "learning_rate": 4.828760511501322e-06, - "logits/chosen": -1.9325587749481201, - "logits/rejected": -1.9356613159179688, - "logps/chosen": -34.26636505126953, - "logps/rejected": -34.54140090942383, - "loss": 0.4947, - "rewards/accuracies": 0.550000011920929, - "rewards/chosen": 0.032289423048496246, - "rewards/margins": 0.022087663412094116, - "rewards/rejected": 0.01020175963640213, + "logits/chosen": -1.9334779977798462, + "logits/rejected": -1.936608910560608, + "logps/chosen": -34.305667877197266, + "logps/rejected": -34.637855529785156, + "loss": 5.6523, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.02442934736609459, + "rewards/margins": 0.033519335091114044, + "rewards/rejected": -0.009089985862374306, "step": 80 }, { "epoch": 0.23, "learning_rate": 4.7367166013034295e-06, - "logits/chosen": -1.942940354347229, - "logits/rejected": -1.9474563598632812, - "logps/chosen": -32.32842254638672, - "logps/rejected": -32.28204345703125, - "loss": 0.4965, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.027799556031823158, - "rewards/margins": 0.012426125817000866, - "rewards/rejected": 0.015373429283499718, + "logits/chosen": -1.9406198263168335, + "logits/rejected": -1.9451316595077515, + "logps/chosen": -32.37959289550781, + "logps/rejected": -32.313934326171875, + "loss": 6.1848, + "rewards/accuracies": 0.5249999761581421, + "rewards/chosen": 0.017566002905368805, + "rewards/margins": 0.008571788668632507, + "rewards/rejected": 0.008994214236736298, "step": 90 }, { "epoch": 0.26, "learning_rate": 4.626245458345211e-06, - "logits/chosen": -2.0405964851379395, - "logits/rejected": -2.0386147499084473, - "logps/chosen": -32.07122039794922, - "logps/rejected": -31.215023040771484, - "loss": 0.4947, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.033922821283340454, - "rewards/margins": 0.02368539571762085, - "rewards/rejected": 0.010237427428364754, + "logits/chosen": -2.0386805534362793, + "logits/rejected": -2.036684513092041, + "logps/chosen": -32.129981994628906, + "logps/rejected": -31.296749114990234, + "loss": 5.6745, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.022171173244714737, + "rewards/margins": 0.028278371319174767, + "rewards/rejected": -0.006107199937105179, "step": 100 }, { "epoch": 0.26, - "eval_logits/chosen": -2.2347991466522217, - "eval_logits/rejected": -2.229947805404663, - "eval_logps/chosen": -33.91511917114258, - "eval_logps/rejected": -37.412628173828125, - "eval_loss": 0.499397873878479, - "eval_rewards/accuracies": 0.5215947031974792, - "eval_rewards/chosen": 0.023886699229478836, - "eval_rewards/margins": 0.0030889539048075676, - "eval_rewards/rejected": 0.020797746255993843, - "eval_runtime": 145.7619, + "eval_logits/chosen": -2.2338244915008545, + "eval_logits/rejected": -2.2289819717407227, + "eval_logps/chosen": -34.01533508300781, + "eval_logps/rejected": -37.518131256103516, + "eval_loss": 6.2508440017700195, + "eval_rewards/accuracies": 0.5460963845252991, + "eval_rewards/chosen": 0.0038431365974247456, + "eval_rewards/margins": 0.0041458746418356895, + "eval_rewards/rejected": -0.0003027375496458262, + "eval_runtime": 145.7849, "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 100 @@ -181,2257 +181,441 @@ { "epoch": 0.29, "learning_rate": 4.498257201263691e-06, - "logits/chosen": -1.9959571361541748, - "logits/rejected": -1.99361252784729, - "logps/chosen": -32.979209899902344, - "logps/rejected": -33.898841857910156, - "loss": 0.4925, - "rewards/accuracies": 0.612500011920929, - "rewards/chosen": 0.052921634167432785, - "rewards/margins": 0.026816055178642273, - "rewards/rejected": 0.026105573400855064, + "logits/chosen": -1.9937114715576172, + "logits/rejected": -1.9913352727890015, + "logps/chosen": -33.10139465332031, + "logps/rejected": -33.98957061767578, + "loss": 6.1578, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.028484445065259933, + "rewards/margins": 0.02052464708685875, + "rewards/rejected": 0.007959800772368908, "step": 110 }, { "epoch": 0.31, "learning_rate": 4.353806263777678e-06, - "logits/chosen": -2.007497787475586, - "logits/rejected": -1.9991832971572876, - "logps/chosen": -32.195396423339844, - "logps/rejected": -31.9866886138916, - "loss": 0.4967, + "logits/chosen": -2.0058560371398926, + "logits/rejected": -1.9975417852401733, + "logps/chosen": -32.32807159423828, + "logps/rejected": -32.11988067626953, + "loss": 6.0258, "rewards/accuracies": 0.5249999761581421, - "rewards/chosen": 0.04990261048078537, - "rewards/margins": 0.01493864320218563, - "rewards/rejected": 0.03496397286653519, + "rewards/chosen": 0.02336641401052475, + "rewards/margins": 0.015041453763842583, + "rewards/rejected": 0.008324960246682167, "step": 120 }, { "epoch": 0.34, "learning_rate": 4.1940827077152755e-06, - "logits/chosen": -2.0352938175201416, - "logits/rejected": -2.027338743209839, - "logps/chosen": -30.16824722290039, - "logps/rejected": -31.9173526763916, - "loss": 0.4919, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.061494071036577225, - "rewards/margins": 0.034362830221652985, - "rewards/rejected": 0.027131233364343643, + "logits/chosen": -2.034005641937256, + "logits/rejected": -2.026031494140625, + "logps/chosen": -30.320354461669922, + "logps/rejected": -32.04728698730469, + "loss": 5.7894, + "rewards/accuracies": 0.574999988079071, + "rewards/chosen": 0.031072016805410385, + "rewards/margins": 0.029927905648946762, + "rewards/rejected": 0.0011441137176007032, "step": 130 }, { "epoch": 0.36, "learning_rate": 4.0204024186666215e-06, - "logits/chosen": -1.9656566381454468, - "logits/rejected": -1.9758468866348267, - "logps/chosen": -31.05475425720215, - "logps/rejected": -32.390235900878906, - "loss": 0.4893, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.0729818269610405, - "rewards/margins": 0.04405337944626808, - "rewards/rejected": 0.028928453102707863, + "logits/chosen": -1.9651100635528564, + "logits/rejected": -1.9753506183624268, + "logps/chosen": -31.207500457763672, + "logps/rejected": -32.54130172729492, + "loss": 5.4157, + "rewards/accuracies": 0.6625000238418579, + "rewards/chosen": 0.042432788759469986, + "rewards/margins": 0.0437164343893528, + "rewards/rejected": -0.0012836471432819963, "step": 140 }, { "epoch": 0.39, "learning_rate": 3.834196265035119e-06, - "logits/chosen": -1.877623200416565, - "logits/rejected": -1.8787848949432373, - "logps/chosen": -33.642906188964844, - "logps/rejected": -34.57593536376953, - "loss": 0.4826, + "logits/chosen": -1.8767722845077515, + "logits/rejected": -1.8779083490371704, + "logps/chosen": -33.90843963623047, + "logps/rejected": -34.76317596435547, + "loss": 5.2936, "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.10995662212371826, - "rewards/margins": 0.07369254529476166, - "rewards/rejected": 0.0362640880048275, + "rewards/chosen": 0.05685017257928848, + "rewards/margins": 0.058033354580402374, + "rewards/rejected": -0.0011831853771582246, "step": 150 }, { "epoch": 0.42, "learning_rate": 3.636998309800573e-06, - "logits/chosen": -1.9285871982574463, - "logits/rejected": -1.9251912832260132, - "logps/chosen": -35.79336929321289, - "logps/rejected": -32.49341583251953, - "loss": 0.4925, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.07566188275814056, - "rewards/margins": 0.029793858528137207, - "rewards/rejected": 0.045868031680583954, + "logits/chosen": -1.9284919500350952, + "logits/rejected": -1.9250543117523193, + "logps/chosen": -36.01051712036133, + "logps/rejected": -32.69367980957031, + "loss": 5.7928, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.032231830060482025, + "rewards/margins": 0.026417434215545654, + "rewards/rejected": 0.005814394913613796, "step": 160 }, { "epoch": 0.44, "learning_rate": 3.4304331721118078e-06, - "logits/chosen": -2.0295512676239014, - "logits/rejected": -2.0222458839416504, - "logps/chosen": -33.23942947387695, - "logps/rejected": -31.19607925415039, - "loss": 0.4806, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.11168631166219711, - "rewards/margins": 0.08053232729434967, - "rewards/rejected": 0.0311539676040411, + "logits/chosen": -2.029017925262451, + "logits/rejected": -2.0216681957244873, + "logps/chosen": -33.47245407104492, + "logps/rejected": -31.359905242919922, + "loss": 5.0567, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0650816410779953, + "rewards/margins": 0.06669269502162933, + "rewards/rejected": -0.0016110436990857124, "step": 170 }, { "epoch": 0.47, "learning_rate": 3.2162026428305436e-06, - "logits/chosen": -2.0365030765533447, - "logits/rejected": -2.0416781902313232, - "logps/chosen": -31.986160278320312, - "logps/rejected": -32.17144012451172, - "loss": 0.4878, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.11172527074813843, - "rewards/margins": 0.048985324800014496, - "rewards/rejected": 0.06273995339870453, + "logits/chosen": -2.0359654426574707, + "logits/rejected": -2.041189670562744, + "logps/chosen": -32.1973876953125, + "logps/rejected": -32.39836883544922, + "loss": 5.2683, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.0694805309176445, + "rewards/margins": 0.05212607979774475, + "rewards/rejected": 0.017354462295770645, "step": 180 }, { "epoch": 0.49, "learning_rate": 2.996071664294641e-06, - "logits/chosen": -2.0367565155029297, - "logits/rejected": -2.0340359210968018, - "logps/chosen": -31.0958194732666, - "logps/rejected": -31.11456298828125, - "loss": 0.4905, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.07862688601016998, - "rewards/margins": 0.03789640590548515, - "rewards/rejected": 0.04073048755526543, + "logits/chosen": -2.0371925830841064, + "logits/rejected": -2.0344390869140625, + "logps/chosen": -31.23935317993164, + "logps/rejected": -31.283512115478516, + "loss": 5.5783, + "rewards/accuracies": 0.625, + "rewards/chosen": 0.049920208752155304, + "rewards/margins": 0.04297895357012749, + "rewards/rejected": 0.006941256113350391, "step": 190 }, { "epoch": 0.52, "learning_rate": 2.7718537898066833e-06, - "logits/chosen": -1.90771484375, - "logits/rejected": -1.912388801574707, - "logps/chosen": -31.08389663696289, - "logps/rejected": -32.62942123413086, - "loss": 0.4825, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.10447730123996735, - "rewards/margins": 0.07217548787593842, - "rewards/rejected": 0.032301802188158035, + "logits/chosen": -1.9059925079345703, + "logits/rejected": -1.9106495380401611, + "logps/chosen": -31.285137176513672, + "logps/rejected": -32.777244567871094, + "loss": 5.2135, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.064228855073452, + "rewards/margins": 0.061491239815950394, + "rewards/rejected": 0.002737621311098337, "step": 200 }, { "epoch": 0.52, - "eval_logits/chosen": -2.2322466373443604, - "eval_logits/rejected": -2.2274229526519775, - "eval_logps/chosen": -33.779205322265625, - "eval_logps/rejected": -37.31794357299805, - "eval_loss": 0.4973558187484741, - "eval_rewards/accuracies": 0.5544019937515259, - "eval_rewards/chosen": 0.05106903612613678, - "eval_rewards/margins": 0.011334729380905628, - "eval_rewards/rejected": 0.039734311401844025, - "eval_runtime": 145.5488, - "eval_samples_per_second": 2.357, + "eval_logits/chosen": -2.231491804122925, + "eval_logits/rejected": -2.2266557216644287, + "eval_logps/chosen": -34.00144577026367, + "eval_logps/rejected": -37.5041618347168, + "eval_loss": 6.2880659103393555, + "eval_rewards/accuracies": 0.5402824282646179, + "eval_rewards/chosen": 0.006621644366532564, + "eval_rewards/margins": 0.00412956066429615, + "eval_rewards/rejected": 0.0024920827709138393, + "eval_runtime": 145.8338, + "eval_samples_per_second": 2.352, "eval_steps_per_second": 0.295, "step": 200 }, { "epoch": 0.55, "learning_rate": 2.5453962426402006e-06, - "logits/chosen": -2.0204057693481445, - "logits/rejected": -2.0309972763061523, - "logps/chosen": -31.54058837890625, - "logps/rejected": -33.71467590332031, - "loss": 0.4864, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.08614270389080048, - "rewards/margins": 0.0565376803278923, - "rewards/rejected": 0.029605034738779068, + "logits/chosen": -2.017942428588867, + "logits/rejected": -2.0285964012145996, + "logps/chosen": -31.721837997436523, + "logps/rejected": -33.87845993041992, + "loss": 5.312, + "rewards/accuracies": 0.612500011920929, + "rewards/chosen": 0.0498933307826519, + "rewards/margins": 0.05304562300443649, + "rewards/rejected": -0.003152288496494293, "step": 210 }, { "epoch": 0.57, "learning_rate": 2.3185646976551794e-06, - "logits/chosen": -1.9130290746688843, - "logits/rejected": -1.9277244806289673, - "logps/chosen": -29.63765525817871, - "logps/rejected": -31.437763214111328, - "loss": 0.4831, + "logits/chosen": -1.9104692935943604, + "logits/rejected": -1.9252119064331055, + "logps/chosen": -29.818248748779297, + "logps/rejected": -31.569311141967773, + "loss": 5.1276, "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.09477666765451431, - "rewards/margins": 0.07069384306669235, - "rewards/rejected": 0.02408282831311226, + "rewards/chosen": 0.05865820124745369, + "rewards/margins": 0.060884904116392136, + "rewards/rejected": -0.002226702868938446, "step": 220 }, { "epoch": 0.6, "learning_rate": 2.0932279108998323e-06, - "logits/chosen": -1.9711300134658813, - "logits/rejected": -1.9751195907592773, - "logps/chosen": -32.86473846435547, - "logps/rejected": -31.40615463256836, - "loss": 0.4801, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.11054597795009613, - "rewards/margins": 0.0819164589047432, - "rewards/rejected": 0.02862953022122383, + "logits/chosen": -1.9671802520751953, + "logits/rejected": -1.9711711406707764, + "logps/chosen": -33.06322479248047, + "logps/rejected": -31.57196617126465, + "loss": 4.9986, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": 0.0708485022187233, + "rewards/margins": 0.075381800532341, + "rewards/rejected": -0.004533302970230579, "step": 230 }, { "epoch": 0.62, "learning_rate": 1.8712423238279358e-06, - "logits/chosen": -1.9691890478134155, - "logits/rejected": -1.9474375247955322, - "logps/chosen": -33.62578201293945, - "logps/rejected": -34.88764953613281, - "loss": 0.4781, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.10559363663196564, - "rewards/margins": 0.0929645225405693, - "rewards/rejected": 0.012629099190235138, + "logits/chosen": -1.9659277200698853, + "logits/rejected": -1.9440826177597046, + "logps/chosen": -33.80937957763672, + "logps/rejected": -35.049232482910156, + "loss": 4.6987, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.06887368112802505, + "rewards/margins": 0.08856189250946045, + "rewards/rejected": -0.019688209518790245, "step": 240 }, { "epoch": 0.65, "learning_rate": 1.6544367689701824e-06, - "logits/chosen": -2.010312557220459, - "logits/rejected": -2.007035732269287, - "logps/chosen": -32.49589920043945, - "logps/rejected": -35.9986686706543, - "loss": 0.49, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.08583381026983261, - "rewards/margins": 0.040582992136478424, - "rewards/rejected": 0.04525081440806389, + "logits/chosen": -2.0067532062530518, + "logits/rejected": -2.0034396648406982, + "logps/chosen": -32.64842987060547, + "logps/rejected": -36.244728088378906, + "loss": 5.1569, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": 0.05532795190811157, + "rewards/margins": 0.05928860232234001, + "rewards/rejected": -0.003960648085922003, "step": 250 }, { "epoch": 0.68, "learning_rate": 1.4445974030621963e-06, - "logits/chosen": -1.8778746128082275, - "logits/rejected": -1.8754326105117798, - "logps/chosen": -33.7269287109375, - "logps/rejected": -35.267066955566406, - "loss": 0.4893, - "rewards/accuracies": 0.574999988079071, - "rewards/chosen": 0.09229358285665512, - "rewards/margins": 0.044782862067222595, - "rewards/rejected": 0.047510724514722824, + "logits/chosen": -1.8740017414093018, + "logits/rejected": -1.8715832233428955, + "logps/chosen": -33.948036193847656, + "logps/rejected": -35.48664093017578, + "loss": 5.4652, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.04807313531637192, + "rewards/margins": 0.0444767102599144, + "rewards/rejected": 0.0035964243579655886, "step": 260 }, { "epoch": 0.7, "learning_rate": 1.243452991757889e-06, - "logits/chosen": -1.8628604412078857, - "logits/rejected": -1.8603498935699463, - "logps/chosen": -33.917144775390625, - "logps/rejected": -31.5814151763916, - "loss": 0.4863, - "rewards/accuracies": 0.637499988079071, - "rewards/chosen": 0.09609059244394302, - "rewards/margins": 0.058107007294893265, - "rewards/rejected": 0.03798357769846916, + "logits/chosen": -1.8596279621124268, + "logits/rejected": -1.8571679592132568, + "logps/chosen": -34.14976119995117, + "logps/rejected": -31.774593353271484, + "loss": 5.3924, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": 0.049567773938179016, + "rewards/margins": 0.05022105574607849, + "rewards/rejected": -0.0006532802362926304, "step": 270 }, { "epoch": 0.73, "learning_rate": 1.0526606671603523e-06, - "logits/chosen": -1.9663734436035156, - "logits/rejected": -1.9559385776519775, - "logps/chosen": -34.753868103027344, - "logps/rejected": -31.635868072509766, - "loss": 0.4806, - "rewards/accuracies": 0.625, - "rewards/chosen": 0.1205538734793663, - "rewards/margins": 0.07952861487865448, - "rewards/rejected": 0.04102526605129242, + "logits/chosen": -1.9632982015609741, + "logits/rejected": -1.9527626037597656, + "logps/chosen": -34.97089767456055, + "logps/rejected": -31.810266494750977, + "loss": 4.9049, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.07714872062206268, + "rewards/margins": 0.07100304216146469, + "rewards/rejected": 0.0061456747353076935, "step": 280 }, { "epoch": 0.75, "learning_rate": 8.737922755071455e-07, - "logits/chosen": -2.062568187713623, - "logits/rejected": -2.047731399536133, - "logps/chosen": -30.416040420532227, - "logps/rejected": -32.359901428222656, - "loss": 0.4895, - "rewards/accuracies": 0.5874999761581421, - "rewards/chosen": 0.1016167551279068, - "rewards/margins": 0.04322956129908562, - "rewards/rejected": 0.05838719755411148, + "logits/chosen": -2.0585777759552, + "logits/rejected": -2.043663740158081, + "logps/chosen": -30.67743492126465, + "logps/rejected": -32.60033416748047, + "loss": 5.7039, + "rewards/accuracies": 0.6000000238418579, + "rewards/chosen": 0.04933839291334152, + "rewards/margins": 0.039037786424160004, + "rewards/rejected": 0.01030060462653637, "step": 290 }, { "epoch": 0.78, "learning_rate": 7.08321427484816e-07, - "logits/chosen": -1.93375563621521, - "logits/rejected": -1.9313066005706787, - "logps/chosen": -32.096317291259766, - "logps/rejected": -30.669086456298828, - "loss": 0.4669, + "logits/chosen": -1.9290869235992432, + "logits/rejected": -1.9265540838241577, + "logps/chosen": -32.41482925415039, + "logps/rejected": -30.851070404052734, + "loss": 4.3883, "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.16615837812423706, - "rewards/margins": 0.13661329448223114, - "rewards/rejected": 0.029545078054070473, + "rewards/chosen": 0.1024569720029831, + "rewards/margins": 0.10930945724248886, + "rewards/rejected": -0.006852488033473492, "step": 300 }, { "epoch": 0.78, - "eval_logits/chosen": -2.231231927871704, - "eval_logits/rejected": -2.226423978805542, - "eval_logps/chosen": -33.768104553222656, - "eval_logps/rejected": -37.29201889038086, - "eval_loss": 0.49796417355537415, - "eval_rewards/accuracies": 0.5157807469367981, - "eval_rewards/chosen": 0.05328937619924545, - "eval_rewards/margins": 0.008368566632270813, - "eval_rewards/rejected": 0.04492080584168434, - "eval_runtime": 145.91, - "eval_samples_per_second": 2.351, + "eval_logits/chosen": -2.2290894985198975, + "eval_logits/rejected": -2.2242588996887207, + "eval_logps/chosen": -34.0192756652832, + "eval_logps/rejected": -37.53245544433594, + "eval_loss": 6.2381510734558105, + "eval_rewards/accuracies": 0.5166113376617432, + "eval_rewards/chosen": 0.0030557620339095592, + "eval_rewards/margins": 0.0062218476086854935, + "eval_rewards/rejected": -0.003166085807606578, + "eval_runtime": 145.7657, + "eval_samples_per_second": 2.353, "eval_steps_per_second": 0.295, "step": 300 }, { "epoch": 0.81, - "learning_rate": 4.84533120650964e-06, - "logits/chosen": -1.91664719581604, - "logits/rejected": -1.9135043621063232, - "logps/chosen": -31.018157958984375, - "logps/rejected": -33.52751922607422, - "loss": 0.4821, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.11703909933567047, - "rewards/margins": 0.0741124153137207, - "rewards/rejected": 0.04292667657136917, + "learning_rate": 5.576113578589035e-07, + "logits/chosen": -1.9141037464141846, + "logits/rejected": -1.9108575582504272, + "logps/chosen": -31.29278564453125, + "logps/rejected": -33.755470275878906, + "loss": 5.1987, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.06211275979876518, + "rewards/margins": 0.06477634608745575, + "rewards/rejected": -0.002663586288690567, "step": 310 }, { "epoch": 0.83, - "learning_rate": 4.825108134172131e-06, - "logits/chosen": -1.9666311740875244, - "logits/rejected": -1.954493522644043, - "logps/chosen": -34.01880645751953, - "logps/rejected": -33.413368225097656, - "loss": 0.4782, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.11545337736606598, - "rewards/margins": 0.09035861492156982, - "rewards/rejected": 0.02509475313127041, + "learning_rate": 4.229036944380913e-07, + "logits/chosen": -1.9642059803009033, + "logits/rejected": -1.9519942998886108, + "logps/chosen": -34.297203063964844, + "logps/rejected": -33.615535736083984, + "loss": 4.8567, + "rewards/accuracies": 0.7124999761581421, + "rewards/chosen": 0.05977395921945572, + "rewards/margins": 0.07511334121227264, + "rewards/rejected": -0.015339391306042671, "step": 320 }, { "epoch": 0.86, - "learning_rate": 4.80369052967602e-06, - "logits/chosen": -2.000293254852295, - "logits/rejected": -1.9989759922027588, - "logps/chosen": -32.78376007080078, - "logps/rejected": -32.249916076660156, - "loss": 0.477, - "rewards/accuracies": 0.7124999761581421, - "rewards/chosen": 0.14040763676166534, - "rewards/margins": 0.09447403252124786, - "rewards/rejected": 0.045933615416288376, + "learning_rate": 3.053082288996112e-07, + "logits/chosen": -1.9994945526123047, + "logits/rejected": -1.9980732202529907, + "logps/chosen": -33.15888214111328, + "logps/rejected": -32.49894332885742, + "loss": 5.0065, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.06538230925798416, + "rewards/margins": 0.06925411522388458, + "rewards/rejected": -0.0038718082942068577, "step": 330 }, { "epoch": 0.88, - "learning_rate": 4.781089396387968e-06, - "logits/chosen": -2.088131904602051, - "logits/rejected": -2.0725440979003906, - "logps/chosen": -33.396446228027344, - "logps/rejected": -32.75043869018555, - "loss": 0.48, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.15658864378929138, - "rewards/margins": 0.08306934684515, - "rewards/rejected": 0.07351930439472198, + "learning_rate": 2.0579377374915805e-07, + "logits/chosen": -2.0863518714904785, + "logits/rejected": -2.0706593990325928, + "logps/chosen": -33.7581901550293, + "logps/rejected": -33.056785583496094, + "loss": 5.0199, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.08424069732427597, + "rewards/margins": 0.07199026644229889, + "rewards/rejected": 0.012250429019331932, "step": 340 }, { "epoch": 0.91, - "learning_rate": 4.757316345716554e-06, - "logits/chosen": -1.9608865976333618, - "logits/rejected": -1.9600937366485596, - "logps/chosen": -32.507408142089844, - "logps/rejected": -32.206024169921875, - "loss": 0.4755, - "rewards/accuracies": 0.6875, - "rewards/chosen": 0.15493735671043396, - "rewards/margins": 0.1031133309006691, - "rewards/rejected": 0.05182403326034546, + "learning_rate": 1.2518018074041684e-07, + "logits/chosen": -1.9585072994232178, + "logits/rejected": -1.9576387405395508, + "logps/chosen": -32.82634735107422, + "logps/rejected": -32.51306915283203, + "loss": 4.6899, + "rewards/accuracies": 0.737500011920929, + "rewards/chosen": 0.0911494642496109, + "rewards/margins": 0.10073409974575043, + "rewards/rejected": -0.009584645740687847, "step": 350 }, { "epoch": 0.94, - "learning_rate": 4.73238359114687e-06, - "logits/chosen": -1.9142580032348633, - "logits/rejected": -1.9244797229766846, - "logps/chosen": -31.3671817779541, - "logps/rejected": -35.00855255126953, - "loss": 0.4728, - "rewards/accuracies": 0.675000011920929, - "rewards/chosen": 0.1675015389919281, - "rewards/margins": 0.11145973205566406, - "rewards/rejected": 0.05604177713394165, + "learning_rate": 6.41315865106129e-08, + "logits/chosen": -1.9139328002929688, + "logits/rejected": -1.9242264032363892, + "logps/chosen": -31.86139488220215, + "logps/rejected": -35.2977180480957, + "loss": 4.9538, + "rewards/accuracies": 0.6875, + "rewards/chosen": 0.06865896284580231, + "rewards/margins": 0.0704503208398819, + "rewards/rejected": -0.001791359856724739, "step": 360 }, { "epoch": 0.96, - "learning_rate": 4.706303941965804e-06, - "logits/chosen": -2.048444986343384, - "logits/rejected": -2.0420126914978027, - "logps/chosen": -32.811119079589844, - "logps/rejected": -28.963024139404297, - "loss": 0.4716, - "rewards/accuracies": 0.75, - "rewards/chosen": 0.16816960275173187, - "rewards/margins": 0.11548954248428345, - "rewards/rejected": 0.05268005281686783, + "learning_rate": 2.3150941078050325e-08, + "logits/chosen": -2.0540366172790527, + "logits/rejected": -2.047560214996338, + "logps/chosen": -33.293235778808594, + "logps/rejected": -29.21136474609375, + "loss": 4.9289, + "rewards/accuracies": 0.7250000238418579, + "rewards/chosen": 0.0717458575963974, + "rewards/margins": 0.06873348355293274, + "rewards/rejected": 0.0030123672913759947, "step": 370 }, { "epoch": 0.99, - "learning_rate": 4.679090796681225e-06, - "logits/chosen": -1.9065204858779907, - "logits/rejected": -1.9086847305297852, - "logps/chosen": -33.27626419067383, - "logps/rejected": -30.68581199645996, - "loss": 0.4634, - "rewards/accuracies": 0.7250000238418579, - "rewards/chosen": 0.19255687296390533, - "rewards/margins": 0.15519388020038605, - "rewards/rejected": 0.03736298158764839, + "learning_rate": 2.575864278703266e-09, + "logits/chosen": -1.913193702697754, + "logits/rejected": -1.9154123067855835, + "logps/chosen": -33.81034469604492, + "logps/rejected": -30.896728515625, + "loss": 4.6897, + "rewards/accuracies": 0.75, + "rewards/chosen": 0.08574088662862778, + "rewards/margins": 0.09056078642606735, + "rewards/rejected": -0.004819901194423437, "step": 380 }, { - "epoch": 1.01, - "learning_rate": 4.650758136138454e-06, - "logits/chosen": -1.935220718383789, - "logits/rejected": -1.933953881263733, - "logps/chosen": -33.31889343261719, - "logps/rejected": -35.62324905395508, - "loss": 0.4516, - "rewards/accuracies": 0.7541667222976685, - "rewards/chosen": 0.2131790816783905, - "rewards/margins": 0.20662721991539001, - "rewards/rejected": 0.0065518878400325775, - "step": 390 - }, - { - "epoch": 1.04, - "learning_rate": 4.621320516337559e-06, - "logits/chosen": -1.8704192638397217, - "logits/rejected": -1.8622219562530518, - "logps/chosen": -30.537851333618164, - "logps/rejected": -36.038818359375, - "loss": 0.4399, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.24722521007061005, - "rewards/margins": 0.26686906814575195, - "rewards/rejected": -0.019643839448690414, - "step": 400 - }, - { - "epoch": 1.04, - "eval_logits/chosen": -2.2152392864227295, - "eval_logits/rejected": -2.2104203701019287, - "eval_logps/chosen": -33.64607620239258, - "eval_logps/rejected": -37.199623107910156, - "eval_loss": 0.4964854121208191, - "eval_rewards/accuracies": 0.5859634280204773, - "eval_rewards/chosen": 0.07769521325826645, - "eval_rewards/margins": 0.014295723289251328, - "eval_rewards/rejected": 0.06339949369430542, - "eval_runtime": 146.3322, - "eval_samples_per_second": 2.344, - "eval_steps_per_second": 0.294, - "step": 400 - }, - { - "epoch": 1.06, - "learning_rate": 4.590793060955158e-06, - "logits/chosen": -2.0440499782562256, - "logits/rejected": -2.0468368530273438, - "logps/chosen": -31.769847869873047, - "logps/rejected": -34.8954963684082, - "loss": 0.4374, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.25681817531585693, - "rewards/margins": 0.27734869718551636, - "rewards/rejected": -0.02053055725991726, - "step": 410 - }, - { - "epoch": 1.09, - "learning_rate": 4.559191453574582e-06, - "logits/chosen": -1.8831751346588135, - "logits/rejected": -1.881805419921875, - "logps/chosen": -27.95968246459961, - "logps/rejected": -32.3897590637207, - "loss": 0.4474, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.2215747833251953, - "rewards/margins": 0.23599445819854736, - "rewards/rejected": -0.014419645071029663, - "step": 420 - }, - { - "epoch": 1.12, - "learning_rate": 4.52653192962838e-06, - "logits/chosen": -1.8401778936386108, - "logits/rejected": -1.833168387413025, - "logps/chosen": -32.55414581298828, - "logps/rejected": -34.014617919921875, - "loss": 0.4388, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.3039882481098175, - "rewards/margins": 0.25785765051841736, - "rewards/rejected": 0.04613056033849716, - "step": 430 - }, - { - "epoch": 1.14, - "learning_rate": 4.492831268057307e-06, - "logits/chosen": -2.010392427444458, - "logits/rejected": -2.0052847862243652, - "logps/chosen": -30.339391708374023, - "logps/rejected": -32.0531120300293, - "loss": 0.4306, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.28878048062324524, - "rewards/margins": 0.3058806359767914, - "rewards/rejected": -0.01710016280412674, - "step": 440 - }, - { - "epoch": 1.17, - "learning_rate": 4.458106782690094e-06, - "logits/chosen": -1.893460988998413, - "logits/rejected": -1.8975980281829834, - "logps/chosen": -32.97051239013672, - "logps/rejected": -32.69754409790039, - "loss": 0.4244, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.3090675473213196, - "rewards/margins": 0.3308485448360443, - "rewards/rejected": -0.02178102172911167, - "step": 450 - }, - { - "epoch": 1.19, - "learning_rate": 4.422376313348405e-06, - "logits/chosen": -1.8964331150054932, - "logits/rejected": -1.8907877206802368, - "logps/chosen": -33.82334899902344, - "logps/rejected": -35.24158477783203, - "loss": 0.4182, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.31923526525497437, - "rewards/margins": 0.37773314118385315, - "rewards/rejected": -0.05849781632423401, - "step": 460 - }, - { - "epoch": 1.22, - "learning_rate": 4.3856582166815696e-06, - "logits/chosen": -1.9247757196426392, - "logits/rejected": -1.9244073629379272, - "logps/chosen": -32.55290222167969, - "logps/rejected": -34.13166427612305, - "loss": 0.4291, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.32308587431907654, - "rewards/margins": 0.312322199344635, - "rewards/rejected": 0.010763740167021751, - "step": 470 - }, - { - "epoch": 1.25, - "learning_rate": 4.347971356735789e-06, - "logits/chosen": -1.9735698699951172, - "logits/rejected": -1.9551506042480469, - "logps/chosen": -32.39688491821289, - "logps/rejected": -33.23746871948242, - "loss": 0.4136, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.35277625918388367, - "rewards/margins": 0.38627398014068604, - "rewards/rejected": -0.033497732132673264, - "step": 480 - }, - { - "epoch": 1.27, - "learning_rate": 4.309335095262675e-06, - "logits/chosen": -1.9385684728622437, - "logits/rejected": -1.938098669052124, - "logps/chosen": -29.996129989624023, - "logps/rejected": -31.194690704345703, - "loss": 0.4273, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.3230343759059906, - "rewards/margins": 0.31276363134384155, - "rewards/rejected": 0.010270734317600727, - "step": 490 - }, - { - "epoch": 1.3, - "learning_rate": 4.269769281772082e-06, - "logits/chosen": -1.9020084142684937, - "logits/rejected": -1.8952033519744873, - "logps/chosen": -30.868408203125, - "logps/rejected": -34.74605941772461, - "loss": 0.4135, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.3603435754776001, - "rewards/margins": 0.3842063844203949, - "rewards/rejected": -0.02386278472840786, - "step": 500 - }, - { - "epoch": 1.3, - "eval_logits/chosen": -2.185137987136841, - "eval_logits/rejected": -2.1803719997406006, - "eval_logps/chosen": -33.61381912231445, - "eval_logps/rejected": -37.18801498413086, - "eval_loss": 0.49560075998306274, - "eval_rewards/accuracies": 0.5772424936294556, - "eval_rewards/chosen": 0.08414657413959503, - "eval_rewards/margins": 0.018426479771733284, - "eval_rewards/rejected": 0.0657200962305069, - "eval_runtime": 145.9823, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 500 - }, - { - "epoch": 1.32, - "learning_rate": 4.22929424333435e-06, - "logits/chosen": -1.8966715335845947, - "logits/rejected": -1.9005581140518188, - "logps/chosen": -27.716833114624023, - "logps/rejected": -33.01227569580078, - "loss": 0.428, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.2915978729724884, - "rewards/margins": 0.32219529151916504, - "rewards/rejected": -0.030597442761063576, - "step": 510 - }, - { - "epoch": 1.35, - "learning_rate": 4.1879307741372085e-06, - "logits/chosen": -1.9012483358383179, - "logits/rejected": -1.9117851257324219, - "logps/chosen": -31.59527587890625, - "logps/rejected": -30.912118911743164, - "loss": 0.4171, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.33341097831726074, - "rewards/margins": 0.38549748063087463, - "rewards/rejected": -0.05208650231361389, - "step": 520 - }, - { - "epoch": 1.38, - "learning_rate": 4.145700124802693e-06, - "logits/chosen": -1.8438947200775146, - "logits/rejected": -1.841627836227417, - "logps/chosen": -29.916330337524414, - "logps/rejected": -30.342233657836914, - "loss": 0.4141, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.350149929523468, - "rewards/margins": 0.3832443058490753, - "rewards/rejected": -0.03309439495205879, - "step": 530 - }, - { - "epoch": 1.4, - "learning_rate": 4.102623991469562e-06, - "logits/chosen": -1.9244403839111328, - "logits/rejected": -1.9175224304199219, - "logps/chosen": -32.44102096557617, - "logps/rejected": -33.2357063293457, - "loss": 0.4107, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.3583614230155945, - "rewards/margins": 0.40200528502464294, - "rewards/rejected": -0.04364382103085518, - "step": 540 - }, - { - "epoch": 1.43, - "learning_rate": 4.058724504646834e-06, - "logits/chosen": -1.8887981176376343, - "logits/rejected": -1.8951419591903687, - "logps/chosen": -30.23369789123535, - "logps/rejected": -32.778255462646484, - "loss": 0.4294, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.3038971424102783, - "rewards/margins": 0.3078022301197052, - "rewards/rejected": -0.0039050534833222628, - "step": 550 - }, - { - "epoch": 1.45, - "learning_rate": 4.014024217844167e-06, - "logits/chosen": -1.9593422412872314, - "logits/rejected": -1.9366729259490967, - "logps/chosen": -29.808597564697266, - "logps/rejected": -33.034202575683594, - "loss": 0.4244, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.3257942795753479, - "rewards/margins": 0.3334236741065979, - "rewards/rejected": -0.007629436906427145, - "step": 560 - }, - { - "epoch": 1.48, - "learning_rate": 3.968546095984911e-06, - "logits/chosen": -1.8934186697006226, - "logits/rejected": -1.8884944915771484, - "logps/chosen": -30.610851287841797, - "logps/rejected": -32.035675048828125, - "loss": 0.4208, - "rewards/accuracies": 0.824999988079071, - "rewards/chosen": 0.361176073551178, - "rewards/margins": 0.3441280722618103, - "rewards/rejected": 0.017047986388206482, - "step": 570 - }, - { - "epoch": 1.51, - "learning_rate": 3.922313503607806e-06, - "logits/chosen": -1.934767484664917, - "logits/rejected": -1.9366267919540405, - "logps/chosen": -32.60491180419922, - "logps/rejected": -35.0875129699707, - "loss": 0.4112, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.35417625308036804, - "rewards/margins": 0.4114954471588135, - "rewards/rejected": -0.05731917545199394, - "step": 580 - }, - { - "epoch": 1.53, - "learning_rate": 3.875350192863368e-06, - "logits/chosen": -1.916466474533081, - "logits/rejected": -1.9158313274383545, - "logps/chosen": -28.790512084960938, - "logps/rejected": -31.58980369567871, - "loss": 0.4138, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.3603084981441498, - "rewards/margins": 0.38594144582748413, - "rewards/rejected": -0.02563294768333435, - "step": 590 - }, - { - "epoch": 1.56, - "learning_rate": 3.8276802913111436e-06, - "logits/chosen": -1.9257876873016357, - "logits/rejected": -1.9237552881240845, - "logps/chosen": -31.2141056060791, - "logps/rejected": -32.23310852050781, - "loss": 0.4221, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.3568352162837982, - "rewards/margins": 0.34346500039100647, - "rewards/rejected": 0.013370266184210777, - "step": 600 - }, - { - "epoch": 1.56, - "eval_logits/chosen": -2.1612045764923096, - "eval_logits/rejected": -2.1564619541168213, - "eval_logps/chosen": -33.5879020690918, - "eval_logps/rejected": -37.26316833496094, - "eval_loss": 0.49089184403419495, - "eval_rewards/accuracies": 0.5764119625091553, - "eval_rewards/chosen": 0.08932927250862122, - "eval_rewards/margins": 0.038638439029455185, - "eval_rewards/rejected": 0.05069083347916603, - "eval_runtime": 145.982, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 600 - }, - { - "epoch": 1.58, - "learning_rate": 3.7793282895240927e-06, - "logits/chosen": -1.966064453125, - "logits/rejected": -1.9725717306137085, - "logps/chosen": -30.54904556274414, - "logps/rejected": -32.12957000732422, - "loss": 0.4214, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.33766743540763855, - "rewards/margins": 0.35094374418258667, - "rewards/rejected": -0.013276261277496815, - "step": 610 - }, - { - "epoch": 1.61, - "learning_rate": 3.730319028506478e-06, - "logits/chosen": -1.9191545248031616, - "logits/rejected": -1.9168806076049805, - "logps/chosen": -32.72040557861328, - "logps/rejected": -31.017284393310547, - "loss": 0.4075, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.4070951044559479, - "rewards/margins": 0.4134503901004791, - "rewards/rejected": -0.006355271674692631, - "step": 620 - }, - { - "epoch": 1.64, - "learning_rate": 3.6806776869317074e-06, - "logits/chosen": -1.8647797107696533, - "logits/rejected": -1.8581911325454712, - "logps/chosen": -33.25090408325195, - "logps/rejected": -32.411922454833984, - "loss": 0.3992, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.4426650106906891, - "rewards/margins": 0.4536076486110687, - "rewards/rejected": -0.010942650958895683, - "step": 630 - }, - { - "epoch": 1.66, - "learning_rate": 3.6304297682067146e-06, - "logits/chosen": -1.8728103637695312, - "logits/rejected": -1.8790283203125, - "logps/chosen": -32.14948654174805, - "logps/rejected": -33.18699264526367, - "loss": 0.4191, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.3659629225730896, - "rewards/margins": 0.3553093373775482, - "rewards/rejected": 0.010653568431735039, - "step": 640 - }, - { - "epoch": 1.69, - "learning_rate": 3.579601087369492e-06, - "logits/chosen": -1.9559097290039062, - "logits/rejected": -1.969873070716858, - "logps/chosen": -30.12808609008789, - "logps/rejected": -32.0156364440918, - "loss": 0.4205, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.3448758125305176, - "rewards/margins": 0.3484658896923065, - "rewards/rejected": -0.0035900480579584837, - "step": 650 - }, - { - "epoch": 1.71, - "learning_rate": 3.5282177578265295e-06, - "logits/chosen": -1.829250693321228, - "logits/rejected": -1.8262916803359985, - "logps/chosen": -31.54953384399414, - "logps/rejected": -35.02196502685547, - "loss": 0.3825, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.47468534111976624, - "rewards/margins": 0.5390273928642273, - "rewards/rejected": -0.06434204429388046, - "step": 660 - }, - { - "epoch": 1.74, - "learning_rate": 3.476306177936961e-06, - "logits/chosen": -1.9189475774765015, - "logits/rejected": -1.9192441701889038, - "logps/chosen": -29.566814422607422, - "logps/rejected": -34.125972747802734, - "loss": 0.4135, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.3437557518482208, - "rewards/margins": 0.4036734998226166, - "rewards/rejected": -0.05991772934794426, - "step": 670 - }, - { - "epoch": 1.77, - "learning_rate": 3.423893017450324e-06, - "logits/chosen": -1.8682838678359985, - "logits/rejected": -1.8652855157852173, - "logps/chosen": -29.1109561920166, - "logps/rejected": -32.9627571105957, - "loss": 0.417, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.35227832198143005, - "rewards/margins": 0.38126859068870544, - "rewards/rejected": -0.0289902500808239, - "step": 680 - }, - { - "epoch": 1.79, - "learning_rate": 3.3710052038048794e-06, - "logits/chosen": -1.8988441228866577, - "logits/rejected": -1.898903250694275, - "logps/chosen": -27.93353843688965, - "logps/rejected": -30.931421279907227, - "loss": 0.3906, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.44591015577316284, - "rewards/margins": 0.49809446930885315, - "rewards/rejected": -0.052184272557497025, - "step": 690 - }, - { - "epoch": 1.82, - "learning_rate": 3.3176699082935546e-06, - "logits/chosen": -1.8120651245117188, - "logits/rejected": -1.8154224157333374, - "logps/chosen": -32.11365509033203, - "logps/rejected": -31.554616928100586, - "loss": 0.3967, - "rewards/accuracies": 0.8374999761581421, - "rewards/chosen": 0.4729641079902649, - "rewards/margins": 0.490949809551239, - "rewards/rejected": -0.017985735088586807, - "step": 700 - }, - { - "epoch": 1.82, - "eval_logits/chosen": -2.145447015762329, - "eval_logits/rejected": -2.140751361846924, - "eval_logps/chosen": -33.68964767456055, - "eval_logps/rejected": -37.34749984741211, - "eval_loss": 0.49207815527915955, - "eval_rewards/accuracies": 0.5793189406394958, - "eval_rewards/chosen": 0.0689806342124939, - "eval_rewards/margins": 0.03515753895044327, - "eval_rewards/rejected": 0.03382309526205063, - "eval_runtime": 145.9055, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, - "step": 700 - }, - { - "epoch": 1.84, - "learning_rate": 3.2639145321045933e-06, - "logits/chosen": -1.8932316303253174, - "logits/rejected": -1.8843755722045898, - "logps/chosen": -34.428524017333984, - "logps/rejected": -32.0389404296875, - "loss": 0.4022, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.41669875383377075, - "rewards/margins": 0.4433247447013855, - "rewards/rejected": -0.02662605047225952, - "step": 710 - }, - { - "epoch": 1.87, - "learning_rate": 3.2097666922441107e-06, - "logits/chosen": -1.9056898355484009, - "logits/rejected": -1.9067920446395874, - "logps/chosen": -34.22626876831055, - "logps/rejected": -33.2248420715332, - "loss": 0.3966, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.44182342290878296, - "rewards/margins": 0.4715878367424011, - "rewards/rejected": -0.02976439893245697, - "step": 720 - }, - { - "epoch": 1.9, - "learning_rate": 3.1552542073477554e-06, - "logits/chosen": -1.9228168725967407, - "logits/rejected": -1.9204353094100952, - "logps/chosen": -30.3280029296875, - "logps/rejected": -33.11607360839844, - "loss": 0.3965, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.44706568121910095, - "rewards/margins": 0.4709019064903259, - "rewards/rejected": -0.023836305364966393, - "step": 730 - }, - { - "epoch": 1.92, - "learning_rate": 3.100405083388799e-06, - "logits/chosen": -1.9053897857666016, - "logits/rejected": -1.9105879068374634, - "logps/chosen": -29.603191375732422, - "logps/rejected": -33.39387893676758, - "loss": 0.3939, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.43650713562965393, - "rewards/margins": 0.48431968688964844, - "rewards/rejected": -0.0478125736117363, - "step": 740 - }, - { - "epoch": 1.95, - "learning_rate": 3.0452474992899645e-06, - "logits/chosen": -1.8512208461761475, - "logits/rejected": -1.8501055240631104, - "logps/chosen": -30.947040557861328, - "logps/rejected": -34.80047607421875, - "loss": 0.4016, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.4355888366699219, - "rewards/margins": 0.4584925174713135, - "rewards/rejected": -0.022903719916939735, - "step": 750 - }, - { - "epoch": 1.97, - "learning_rate": 2.989809792446417e-06, - "logits/chosen": -1.7258459329605103, - "logits/rejected": -1.7209047079086304, - "logps/chosen": -33.49466323852539, - "logps/rejected": -35.46110153198242, - "loss": 0.3832, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5007556080818176, - "rewards/margins": 0.5512930154800415, - "rewards/rejected": -0.05053748935461044, - "step": 760 - }, - { - "epoch": 2.0, - "learning_rate": 2.9341204441673267e-06, - "logits/chosen": -1.8652160167694092, - "logits/rejected": -1.8690448999404907, - "logps/chosen": -33.102561950683594, - "logps/rejected": -33.72128677368164, - "loss": 0.403, - "rewards/accuracies": 0.8708333969116211, - "rewards/chosen": 0.4222186207771301, - "rewards/margins": 0.4475080966949463, - "rewards/rejected": -0.025289500132203102, - "step": 770 - }, - { - "epoch": 2.03, - "learning_rate": 2.878208065043501e-06, - "logits/chosen": -1.8099088668823242, - "logits/rejected": -1.808158278465271, - "logps/chosen": -30.957799911499023, - "logps/rejected": -35.31805419921875, - "loss": 0.3439, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5887584090232849, - "rewards/margins": 0.7714377045631409, - "rewards/rejected": -0.18267923593521118, - "step": 780 - }, - { - "epoch": 2.05, - "learning_rate": 2.8221013802485974e-06, - "logits/chosen": -1.8558752536773682, - "logits/rejected": -1.8545467853546143, - "logps/chosen": -30.529211044311523, - "logps/rejected": -33.56177520751953, - "loss": 0.3585, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5693053007125854, - "rewards/margins": 0.6643453240394592, - "rewards/rejected": -0.09504004567861557, - "step": 790 - }, - { - "epoch": 2.08, - "learning_rate": 2.76582921478147e-06, - "logits/chosen": -1.7855558395385742, - "logits/rejected": -1.779860258102417, - "logps/chosen": -31.63662338256836, - "logps/rejected": -31.89573097229004, - "loss": 0.365, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5492910146713257, - "rewards/margins": 0.6355854868888855, - "rewards/rejected": -0.08629439771175385, - "step": 800 - }, - { - "epoch": 2.08, - "eval_logits/chosen": -2.137204170227051, - "eval_logits/rejected": -2.1325197219848633, - "eval_logps/chosen": -33.66006088256836, - "eval_logps/rejected": -37.36638641357422, - "eval_loss": 0.4898916482925415, - "eval_rewards/accuracies": 0.5975913405418396, - "eval_rewards/chosen": 0.07489825040102005, - "eval_rewards/margins": 0.044850923120975494, - "eval_rewards/rejected": 0.03004733845591545, - "eval_runtime": 145.964, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 800 - }, - { - "epoch": 2.1, - "learning_rate": 2.7094204786572254e-06, - "logits/chosen": -1.8811490535736084, - "logits/rejected": -1.8885715007781982, - "logps/chosen": -29.428760528564453, - "logps/rejected": -34.94689178466797, - "loss": 0.3621, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5398995280265808, - "rewards/margins": 0.6781092882156372, - "rewards/rejected": -0.13820984959602356, - "step": 810 - }, - { - "epoch": 2.13, - "learning_rate": 2.6529041520546072e-06, - "logits/chosen": -1.8541055917739868, - "logits/rejected": -1.8567206859588623, - "logps/chosen": -30.24544334411621, - "logps/rejected": -33.46533203125, - "loss": 0.3894, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.4651278853416443, - "rewards/margins": 0.5122340321540833, - "rewards/rejected": -0.047106094658374786, - "step": 820 - }, - { - "epoch": 2.16, - "learning_rate": 2.5963092704273302e-06, - "logits/chosen": -1.7540677785873413, - "logits/rejected": -1.7582552433013916, - "logps/chosen": -29.936010360717773, - "logps/rejected": -35.5365104675293, - "loss": 0.3632, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5310894250869751, - "rewards/margins": 0.6563557982444763, - "rewards/rejected": -0.125266432762146, - "step": 830 - }, - { - "epoch": 2.18, - "learning_rate": 2.53966490958702e-06, - "logits/chosen": -1.8204174041748047, - "logits/rejected": -1.8164218664169312, - "logps/chosen": -30.37557601928711, - "logps/rejected": -33.122283935546875, - "loss": 0.3791, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.5186147093772888, - "rewards/margins": 0.5441412925720215, - "rewards/rejected": -0.025526583194732666, - "step": 840 - }, - { - "epoch": 2.21, - "learning_rate": 2.4830001707654135e-06, - "logits/chosen": -1.8992812633514404, - "logits/rejected": -1.9012937545776367, - "logps/chosen": -29.994953155517578, - "logps/rejected": -36.37163162231445, - "loss": 0.3543, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.5560764074325562, - "rewards/margins": 0.698015570640564, - "rewards/rejected": -0.14193907380104065, - "step": 850 - }, - { - "epoch": 2.23, - "learning_rate": 2.4263441656635054e-06, - "logits/chosen": -1.7072890996932983, - "logits/rejected": -1.7018003463745117, - "logps/chosen": -33.47475814819336, - "logps/rejected": -33.329551696777344, - "loss": 0.3639, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5613837838172913, - "rewards/margins": 0.6489141583442688, - "rewards/rejected": -0.08753031492233276, - "step": 860 - }, - { - "epoch": 2.26, - "learning_rate": 2.3697260014953107e-06, - "logits/chosen": -1.7648935317993164, - "logits/rejected": -1.7649040222167969, - "logps/chosen": -32.934661865234375, - "logps/rejected": -35.514404296875, - "loss": 0.3529, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5816287398338318, - "rewards/margins": 0.7152572870254517, - "rewards/rejected": -0.1336284726858139, - "step": 870 - }, - { - "epoch": 2.29, - "learning_rate": 2.3131747660339396e-06, - "logits/chosen": -1.8092641830444336, - "logits/rejected": -1.7980419397354126, - "logps/chosen": -31.361209869384766, - "logps/rejected": -33.579830169677734, - "loss": 0.3601, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5490745902061462, - "rewards/margins": 0.6806339621543884, - "rewards/rejected": -0.1315593421459198, - "step": 880 - }, - { - "epoch": 2.31, - "learning_rate": 2.256719512667651e-06, - "logits/chosen": -1.9134113788604736, - "logits/rejected": -1.917851448059082, - "logps/chosen": -30.57168197631836, - "logps/rejected": -33.24015808105469, - "loss": 0.3559, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5516507625579834, - "rewards/margins": 0.7123855352401733, - "rewards/rejected": -0.16073477268218994, - "step": 890 - }, - { - "epoch": 2.34, - "learning_rate": 2.2003892454735786e-06, - "logits/chosen": -1.8315509557724, - "logits/rejected": -1.8244893550872803, - "logps/chosen": -31.741907119750977, - "logps/rejected": -33.2019157409668, - "loss": 0.3419, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.6224341988563538, - "rewards/margins": 0.7823906540870667, - "rewards/rejected": -0.15995636582374573, - "step": 900 - }, - { - "epoch": 2.34, - "eval_logits/chosen": -2.1196937561035156, - "eval_logits/rejected": -2.1150262355804443, - "eval_logps/chosen": -33.70163345336914, - "eval_logps/rejected": -37.43189239501953, - "eval_loss": 0.48952463269233704, - "eval_rewards/accuracies": 0.5681062936782837, - "eval_rewards/chosen": 0.06658438593149185, - "eval_rewards/margins": 0.04963872581720352, - "eval_rewards/rejected": 0.01694565825164318, - "eval_runtime": 145.9947, - "eval_samples_per_second": 2.349, - "eval_steps_per_second": 0.295, - "step": 900 - }, - { - "epoch": 2.36, - "learning_rate": 2.1442129043167877e-06, - "logits/chosen": -1.8226476907730103, - "logits/rejected": -1.8230241537094116, - "logps/chosen": -28.407958984375, - "logps/rejected": -35.533790588378906, - "loss": 0.3555, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5518409609794617, - "rewards/margins": 0.7206611633300781, - "rewards/rejected": -0.16882023215293884, - "step": 910 - }, - { - "epoch": 2.39, - "learning_rate": 2.088219349982323e-06, - "logits/chosen": -1.780072569847107, - "logits/rejected": -1.772212028503418, - "logps/chosen": -29.447246551513672, - "logps/rejected": -34.29579544067383, - "loss": 0.3742, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.4783394932746887, - "rewards/margins": 0.6232239007949829, - "rewards/rejected": -0.14488451182842255, - "step": 920 - }, - { - "epoch": 2.42, - "learning_rate": 2.0324373493478803e-06, - "logits/chosen": -1.9453346729278564, - "logits/rejected": -1.9455890655517578, - "logps/chosen": -27.443078994750977, - "logps/rejected": -33.385589599609375, - "loss": 0.3768, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5107913613319397, - "rewards/margins": 0.6149374842643738, - "rewards/rejected": -0.10414610803127289, - "step": 930 - }, - { - "epoch": 2.44, - "learning_rate": 1.976895560604729e-06, - "logits/chosen": -1.821976661682129, - "logits/rejected": -1.8315296173095703, - "logps/chosen": -31.297231674194336, - "logps/rejected": -33.75246047973633, - "loss": 0.3423, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6255664229393005, - "rewards/margins": 0.7781294584274292, - "rewards/rejected": -0.15256305038928986, - "step": 940 - }, - { - "epoch": 2.47, - "learning_rate": 1.921622518534466e-06, - "logits/chosen": -1.8654251098632812, - "logits/rejected": -1.8691349029541016, - "logps/chosen": -28.240697860717773, - "logps/rejected": -32.25327682495117, - "loss": 0.3761, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.4814048707485199, - "rewards/margins": 0.5848445892333984, - "rewards/rejected": -0.10343976318836212, - "step": 950 - }, - { - "epoch": 2.49, - "learning_rate": 1.8666466198491794e-06, - "logits/chosen": -1.8611690998077393, - "logits/rejected": -1.8573919534683228, - "logps/chosen": -31.143264770507812, - "logps/rejected": -34.57156753540039, - "loss": 0.3553, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5863906741142273, - "rewards/margins": 0.719426691532135, - "rewards/rejected": -0.13303601741790771, - "step": 960 - }, - { - "epoch": 2.52, - "learning_rate": 1.8119961086025376e-06, - "logits/chosen": -1.7771320343017578, - "logits/rejected": -1.779615044593811, - "logps/chosen": -29.85514259338379, - "logps/rejected": -35.81364822387695, - "loss": 0.355, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5915218591690063, - "rewards/margins": 0.7179332971572876, - "rewards/rejected": -0.12641148269176483, - "step": 970 - }, - { - "epoch": 2.55, - "learning_rate": 1.7576990616793139e-06, - "logits/chosen": -1.8118658065795898, - "logits/rejected": -1.8055760860443115, - "logps/chosen": -32.73058319091797, - "logps/rejected": -37.12406921386719, - "loss": 0.3587, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5348971486091614, - "rewards/margins": 0.7073200941085815, - "rewards/rejected": -0.17242297530174255, - "step": 980 - }, - { - "epoch": 2.57, - "learning_rate": 1.7037833743707892e-06, - "logits/chosen": -1.787426233291626, - "logits/rejected": -1.7823556661605835, - "logps/chosen": -28.302471160888672, - "logps/rejected": -36.696189880371094, - "loss": 0.3607, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.5400962233543396, - "rewards/margins": 0.6813567876815796, - "rewards/rejected": -0.14126062393188477, - "step": 990 - }, - { - "epoch": 2.6, - "learning_rate": 1.6502767460434588e-06, - "logits/chosen": -1.7660512924194336, - "logits/rejected": -1.7556447982788086, - "logps/chosen": -29.44234275817871, - "logps/rejected": -29.9862003326416, - "loss": 0.3845, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.46126946806907654, - "rewards/margins": 0.5385631322860718, - "rewards/rejected": -0.07729364931583405, - "step": 1000 - }, - { - "epoch": 2.6, - "eval_logits/chosen": -2.110644817352295, - "eval_logits/rejected": -2.1059935092926025, - "eval_logps/chosen": -33.722373962402344, - "eval_logps/rejected": -37.474510192871094, - "eval_loss": 0.4890524744987488, - "eval_rewards/accuracies": 0.5980066657066345, - "eval_rewards/chosen": 0.06243573874235153, - "eval_rewards/margins": 0.054014090448617935, - "eval_rewards/rejected": 0.008421653881669044, - "eval_runtime": 145.8236, - "eval_samples_per_second": 2.352, - "eval_steps_per_second": 0.295, - "step": 1000 - }, - { - "epoch": 2.62, - "learning_rate": 1.5972066659083796e-06, - "logits/chosen": -1.8719408512115479, - "logits/rejected": -1.8713929653167725, - "logps/chosen": -29.192501068115234, - "logps/rejected": -30.8101749420166, - "loss": 0.3704, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.5263386964797974, - "rewards/margins": 0.6366775035858154, - "rewards/rejected": -0.11033886671066284, - "step": 1010 - }, - { - "epoch": 2.65, - "learning_rate": 1.5446003988985041e-06, - "logits/chosen": -1.914655089378357, - "logits/rejected": -1.9154428243637085, - "logps/chosen": -29.429738998413086, - "logps/rejected": -31.938552856445312, - "loss": 0.3642, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5027679204940796, - "rewards/margins": 0.657317042350769, - "rewards/rejected": -0.15454904735088348, - "step": 1020 - }, - { - "epoch": 2.68, - "learning_rate": 1.4924849716612211e-06, - "logits/chosen": -1.8813469409942627, - "logits/rejected": -1.884722113609314, - "logps/chosen": -29.628210067749023, - "logps/rejected": -28.323083877563477, - "loss": 0.3741, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.530371367931366, - "rewards/margins": 0.5982450246810913, - "rewards/rejected": -0.06787357479333878, - "step": 1030 - }, - { - "epoch": 2.7, - "learning_rate": 1.440887158673332e-06, - "logits/chosen": -1.889192819595337, - "logits/rejected": -1.881563425064087, - "logps/chosen": -28.655075073242188, - "logps/rejected": -34.3299446105957, - "loss": 0.3649, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5045636892318726, - "rewards/margins": 0.6705065965652466, - "rewards/rejected": -0.16594286262989044, - "step": 1040 - }, - { - "epoch": 2.73, - "learning_rate": 1.3898334684855647e-06, - "logits/chosen": -1.8291490077972412, - "logits/rejected": -1.8399531841278076, - "logps/chosen": -30.7166748046875, - "logps/rejected": -32.760276794433594, - "loss": 0.3593, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5672865509986877, - "rewards/margins": 0.6703389286994934, - "rewards/rejected": -0.10305234044790268, - "step": 1050 - }, - { - "epoch": 2.75, - "learning_rate": 1.3393501301037245e-06, - "logits/chosen": -1.9070945978164673, - "logits/rejected": -1.897945761680603, - "logps/chosen": -30.696279525756836, - "logps/rejected": -37.12523651123047, - "loss": 0.3626, - "rewards/accuracies": 0.862500011920929, - "rewards/chosen": 0.5608991384506226, - "rewards/margins": 0.7196691632270813, - "rewards/rejected": -0.15877002477645874, - "step": 1060 - }, - { - "epoch": 2.78, - "learning_rate": 1.2894630795134454e-06, - "logits/chosen": -1.8145792484283447, - "logits/rejected": -1.8166484832763672, - "logps/chosen": -32.840938568115234, - "logps/rejected": -33.36056900024414, - "loss": 0.3437, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6296695470809937, - "rewards/margins": 0.7622098922729492, - "rewards/rejected": -0.13254031538963318, - "step": 1070 - }, - { - "epoch": 2.81, - "learning_rate": 1.2401979463554984e-06, - "logits/chosen": -1.9403307437896729, - "logits/rejected": -1.9412422180175781, - "logps/chosen": -30.178918838500977, - "logps/rejected": -35.10880661010742, - "loss": 0.3463, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5757440328598022, - "rewards/margins": 0.774368166923523, - "rewards/rejected": -0.19862423837184906, - "step": 1080 - }, - { - "epoch": 2.83, - "learning_rate": 1.1915800407584705e-06, - "logits/chosen": -1.9139982461929321, - "logits/rejected": -1.9182541370391846, - "logps/chosen": -28.404876708984375, - "logps/rejected": -34.255104064941406, - "loss": 0.3698, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.49768757820129395, - "rewards/margins": 0.639141857624054, - "rewards/rejected": -0.1414542943239212, - "step": 1090 - }, - { - "epoch": 2.86, - "learning_rate": 1.1436343403356019e-06, - "logits/chosen": -1.903759241104126, - "logits/rejected": -1.9089443683624268, - "logps/chosen": -31.17728614807129, - "logps/rejected": -30.53354835510254, - "loss": 0.3914, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.4783433973789215, - "rewards/margins": 0.5047181248664856, - "rewards/rejected": -0.026374701410531998, - "step": 1100 - }, - { - "epoch": 2.86, - "eval_logits/chosen": -2.1077661514282227, - "eval_logits/rejected": -2.103113889694214, - "eval_logps/chosen": -33.75041198730469, - "eval_logps/rejected": -37.48313903808594, - "eval_loss": 0.49010559916496277, - "eval_rewards/accuracies": 0.5917773842811584, - "eval_rewards/chosen": 0.05682789906859398, - "eval_rewards/margins": 0.05013115331530571, - "eval_rewards/rejected": 0.00669674901291728, - "eval_runtime": 145.9699, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1100 - }, - { - "epoch": 2.88, - "learning_rate": 1.0963854773524548e-06, - "logits/chosen": -1.8932660818099976, - "logits/rejected": -1.8934653997421265, - "logps/chosen": -30.050395965576172, - "logps/rejected": -31.422481536865234, - "loss": 0.3614, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5779515504837036, - "rewards/margins": 0.6592650413513184, - "rewards/rejected": -0.08131341636180878, - "step": 1110 - }, - { - "epoch": 2.91, - "learning_rate": 1.049857726072005e-06, - "logits/chosen": -1.7390146255493164, - "logits/rejected": -1.74075448513031, - "logps/chosen": -31.606548309326172, - "logps/rejected": -33.571266174316406, - "loss": 0.3592, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.59168541431427, - "rewards/margins": 0.6976853609085083, - "rewards/rejected": -0.1059999018907547, - "step": 1120 - }, - { - "epoch": 2.94, - "learning_rate": 1.0040749902836508e-06, - "logits/chosen": -1.7736375331878662, - "logits/rejected": -1.7710756063461304, - "logps/chosen": -28.52197265625, - "logps/rejected": -31.5717830657959, - "loss": 0.38, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.49794936180114746, - "rewards/margins": 0.5791577100753784, - "rewards/rejected": -0.08120830357074738, - "step": 1130 - }, - { - "epoch": 2.96, - "learning_rate": 9.59060791022566e-07, - "logits/chosen": -1.9108989238739014, - "logits/rejected": -1.9054679870605469, - "logps/chosen": -30.09890365600586, - "logps/rejected": -33.4709358215332, - "loss": 0.3568, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5881778001785278, - "rewards/margins": 0.6915627717971802, - "rewards/rejected": -0.10338501632213593, - "step": 1140 - }, - { - "epoch": 2.99, - "learning_rate": 9.148382544856885e-07, - "logits/chosen": -1.770045280456543, - "logits/rejected": -1.7603470087051392, - "logps/chosen": -30.64144515991211, - "logps/rejected": -31.74554443359375, - "loss": 0.3671, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5620578527450562, - "rewards/margins": 0.6526859402656555, - "rewards/rejected": -0.09062808007001877, - "step": 1150 - }, - { - "epoch": 3.01, - "learning_rate": 8.714301001505568e-07, - "logits/chosen": -1.8397423028945923, - "logits/rejected": -1.839726209640503, - "logps/chosen": -30.804174423217773, - "logps/rejected": -31.754486083984375, - "loss": 0.357, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5755155086517334, - "rewards/margins": 0.6861231923103333, - "rewards/rejected": -0.11060768365859985, - "step": 1160 - }, - { - "epoch": 3.04, - "learning_rate": 8.288586291031025e-07, - "logits/chosen": -1.9154409170150757, - "logits/rejected": -1.9101753234863281, - "logps/chosen": -30.82037353515625, - "logps/rejected": -33.420101165771484, - "loss": 0.3661, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.541119396686554, - "rewards/margins": 0.6368963718414307, - "rewards/rejected": -0.09577690064907074, - "step": 1170 - }, - { - "epoch": 3.06, - "learning_rate": 7.871457125803897e-07, - "logits/chosen": -1.7755794525146484, - "logits/rejected": -1.7832618951797485, - "logps/chosen": -30.81817626953125, - "logps/rejected": -32.95917892456055, - "loss": 0.3703, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5029646158218384, - "rewards/margins": 0.622512936592102, - "rewards/rejected": -0.11954829841852188, - "step": 1180 - }, - { - "epoch": 3.09, - "learning_rate": 7.463127807341966e-07, - "logits/chosen": -1.8418070077896118, - "logits/rejected": -1.8359706401824951, - "logps/chosen": -29.004486083984375, - "logps/rejected": -33.9888916015625, - "loss": 0.3421, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6338256597518921, - "rewards/margins": 0.7749150395393372, - "rewards/rejected": -0.14108926057815552, - "step": 1190 - }, - { - "epoch": 3.12, - "learning_rate": 7.063808116212021e-07, - "logits/chosen": -1.7879689931869507, - "logits/rejected": -1.789954423904419, - "logps/chosen": -30.48044204711914, - "logps/rejected": -33.68505096435547, - "loss": 0.3451, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.6020002365112305, - "rewards/margins": 0.7870009541511536, - "rewards/rejected": -0.1850006878376007, - "step": 1200 - }, - { - "epoch": 3.12, - "eval_logits/chosen": -2.108360767364502, - "eval_logits/rejected": -2.1037116050720215, - "eval_logps/chosen": -33.73748779296875, - "eval_logps/rejected": -37.49613952636719, - "eval_loss": 0.48899805545806885, - "eval_rewards/accuracies": 0.6004983186721802, - "eval_rewards/chosen": 0.05941270664334297, - "eval_rewards/margins": 0.05531647056341171, - "eval_rewards/rejected": 0.004096232354640961, - "eval_runtime": 145.954, - "eval_samples_per_second": 2.35, - "eval_steps_per_second": 0.295, - "step": 1200 - }, - { - "epoch": 3.14, - "learning_rate": 6.673703204254348e-07, - "logits/chosen": -1.72002375125885, - "logits/rejected": -1.7190821170806885, - "logps/chosen": -32.775211334228516, - "logps/rejected": -33.485801696777344, - "loss": 0.3377, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6554538011550903, - "rewards/margins": 0.8107422590255737, - "rewards/rejected": -0.155288428068161, - "step": 1210 - }, - { - "epoch": 3.17, - "learning_rate": 6.293013489185315e-07, - "logits/chosen": -1.8932708501815796, - "logits/rejected": -1.8874114751815796, - "logps/chosen": -28.695459365844727, - "logps/rejected": -33.90492630004883, - "loss": 0.3422, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6016449332237244, - "rewards/margins": 0.8069217801094055, - "rewards/rejected": -0.20527689158916473, - "step": 1220 - }, - { - "epoch": 3.19, - "learning_rate": 5.921934551632086e-07, - "logits/chosen": -1.7432985305786133, - "logits/rejected": -1.731978178024292, - "logps/chosen": -31.134801864624023, - "logps/rejected": -33.52891159057617, - "loss": 0.3438, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.650175929069519, - "rewards/margins": 0.7771934866905212, - "rewards/rejected": -0.12701748311519623, - "step": 1230 - }, - { - "epoch": 3.22, - "learning_rate": 5.560657034652405e-07, - "logits/chosen": -1.8350751399993896, - "logits/rejected": -1.8283989429473877, - "logps/chosen": -28.29378890991211, - "logps/rejected": -29.679229736328125, - "loss": 0.3749, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.48128899931907654, - "rewards/margins": 0.6436225175857544, - "rewards/rejected": -0.16233357787132263, - "step": 1240 - }, - { - "epoch": 3.25, - "learning_rate": 5.2093665457911e-07, - "logits/chosen": -1.8610950708389282, - "logits/rejected": -1.8678795099258423, - "logps/chosen": -32.20514678955078, - "logps/rejected": -32.009395599365234, - "loss": 0.3467, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.638048529624939, - "rewards/margins": 0.7362658381462097, - "rewards/rejected": -0.09821729362010956, - "step": 1250 - }, - { - "epoch": 3.27, - "learning_rate": 4.868243561723535e-07, - "logits/chosen": -1.8403499126434326, - "logits/rejected": -1.8409210443496704, - "logps/chosen": -30.232723236083984, - "logps/rejected": -33.621315002441406, - "loss": 0.3524, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.5591671466827393, - "rewards/margins": 0.7204564809799194, - "rewards/rejected": -0.1612892895936966, - "step": 1260 - }, - { - "epoch": 3.3, - "learning_rate": 4.537463335535161e-07, - "logits/chosen": -1.7770591974258423, - "logits/rejected": -1.776659369468689, - "logps/chosen": -29.988265991210938, - "logps/rejected": -34.27975082397461, - "loss": 0.3412, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6205139756202698, - "rewards/margins": 0.7723422646522522, - "rewards/rejected": -0.15182837843894958, - "step": 1270 - }, - { - "epoch": 3.32, - "learning_rate": 4.217195806684629e-07, - "logits/chosen": -1.661996841430664, - "logits/rejected": -1.6574077606201172, - "logps/chosen": -32.13587951660156, - "logps/rejected": -31.51412582397461, - "loss": 0.3438, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6203125715255737, - "rewards/margins": 0.7378401756286621, - "rewards/rejected": -0.11752767860889435, - "step": 1280 - }, - { - "epoch": 3.35, - "learning_rate": 3.907605513696808e-07, - "logits/chosen": -1.867071509361267, - "logits/rejected": -1.8518552780151367, - "logps/chosen": -31.667064666748047, - "logps/rejected": -35.789974212646484, - "loss": 0.3514, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5361019968986511, - "rewards/margins": 0.7410336136817932, - "rewards/rejected": -0.2049315869808197, - "step": 1290 - }, - { - "epoch": 3.38, - "learning_rate": 3.6088515096305675e-07, - "logits/chosen": -1.811191201210022, - "logits/rejected": -1.8159558773040771, - "logps/chosen": -30.33746337890625, - "logps/rejected": -37.15138626098633, - "loss": 0.3358, - "rewards/accuracies": 0.987500011920929, - "rewards/chosen": 0.6360319256782532, - "rewards/margins": 0.8456646203994751, - "rewards/rejected": -0.20963260531425476, - "step": 1300 - }, - { - "epoch": 3.38, - "eval_logits/chosen": -2.1072001457214355, - "eval_logits/rejected": -2.1025450229644775, - "eval_logps/chosen": -33.73197937011719, - "eval_logps/rejected": -37.49148941040039, - "eval_loss": 0.4890049993991852, - "eval_rewards/accuracies": 0.6212624907493591, - "eval_rewards/chosen": 0.06051424890756607, - "eval_rewards/margins": 0.05548795312643051, - "eval_rewards/rejected": 0.005026295315474272, - "eval_runtime": 145.9902, - "eval_samples_per_second": 2.349, - "eval_steps_per_second": 0.295, - "step": 1300 - }, - { - "epoch": 3.4, - "learning_rate": 3.321087280364757e-07, - "logits/chosen": -1.7846622467041016, - "logits/rejected": -1.7848495244979858, - "logps/chosen": -32.82987976074219, - "logps/rejected": -37.857730865478516, - "loss": 0.3347, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.6828277707099915, - "rewards/margins": 0.8375515937805176, - "rewards/rejected": -0.15472377836704254, - "step": 1310 - }, - { - "epoch": 3.43, - "learning_rate": 3.044460665744284e-07, - "logits/chosen": -1.8712621927261353, - "logits/rejected": -1.8699556589126587, - "logps/chosen": -29.258586883544922, - "logps/rejected": -31.701663970947266, - "loss": 0.3574, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.5670534372329712, - "rewards/margins": 0.7167149186134338, - "rewards/rejected": -0.14966149628162384, - "step": 1320 - }, - { - "epoch": 3.45, - "learning_rate": 2.779113783626916e-07, - "logits/chosen": -1.7789275646209717, - "logits/rejected": -1.7801557779312134, - "logps/chosen": -31.177318572998047, - "logps/rejected": -34.304542541503906, - "loss": 0.3438, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.6393861770629883, - "rewards/margins": 0.7705596685409546, - "rewards/rejected": -0.1311735361814499, - "step": 1330 - }, - { - "epoch": 3.48, - "learning_rate": 2.5251829568697204e-07, - "logits/chosen": -1.839123010635376, - "logits/rejected": -1.8379541635513306, - "logps/chosen": -28.512765884399414, - "logps/rejected": -32.78876495361328, - "loss": 0.3567, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.53718101978302, - "rewards/margins": 0.696540117263794, - "rewards/rejected": -0.1593591719865799, - "step": 1340 - }, - { - "epoch": 3.51, - "learning_rate": 2.2827986432927774e-07, - "logits/chosen": -1.8561503887176514, - "logits/rejected": -1.841815710067749, - "logps/chosen": -31.484426498413086, - "logps/rejected": -37.18474578857422, - "loss": 0.3512, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5664494037628174, - "rewards/margins": 0.7505115866661072, - "rewards/rejected": -0.18406221270561218, - "step": 1350 - }, - { - "epoch": 3.53, - "learning_rate": 2.0520853686560177e-07, - "logits/chosen": -1.8498824834823608, - "logits/rejected": -1.8622316122055054, - "logps/chosen": -29.09408187866211, - "logps/rejected": -32.83143615722656, - "loss": 0.3511, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5948771834373474, - "rewards/margins": 0.7132156491279602, - "rewards/rejected": -0.118338443338871, - "step": 1360 - }, - { - "epoch": 3.56, - "learning_rate": 1.833161662683672e-07, - "logits/chosen": -1.949222207069397, - "logits/rejected": -1.9485489130020142, - "logps/chosen": -28.93695640563965, - "logps/rejected": -36.96564483642578, - "loss": 0.3344, - "rewards/accuracies": 0.9624999761581421, - "rewards/chosen": 0.6186943650245667, - "rewards/margins": 0.8456465005874634, - "rewards/rejected": -0.22695204615592957, - "step": 1370 - }, - { - "epoch": 3.58, - "learning_rate": 1.626139998169246e-07, - "logits/chosen": -1.8143821954727173, - "logits/rejected": -1.8219850063323975, - "logps/chosen": -30.901662826538086, - "logps/rejected": -38.29996871948242, - "loss": 0.3384, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.6557637453079224, - "rewards/margins": 0.8258283734321594, - "rewards/rejected": -0.17006462812423706, - "step": 1380 - }, - { - "epoch": 3.61, - "learning_rate": 1.4311267331922535e-07, - "logits/chosen": -1.7700645923614502, - "logits/rejected": -1.766127347946167, - "logps/chosen": -31.538192749023438, - "logps/rejected": -32.0847282409668, - "loss": 0.3468, - "rewards/accuracies": 0.875, - "rewards/chosen": 0.6458035111427307, - "rewards/margins": 0.743270754814148, - "rewards/rejected": -0.09746719151735306, - "step": 1390 - }, - { - "epoch": 3.64, - "learning_rate": 1.2482220564763669e-07, - "logits/chosen": -1.927178978919983, - "logits/rejected": -1.9247134923934937, - "logps/chosen": -28.587448120117188, - "logps/rejected": -32.722015380859375, - "loss": 0.3651, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5418199896812439, - "rewards/margins": 0.6573297381401062, - "rewards/rejected": -0.11550970375537872, - "step": 1400 - }, - { - "epoch": 3.64, - "eval_logits/chosen": -2.106520175933838, - "eval_logits/rejected": -2.101874589920044, - "eval_logps/chosen": -33.74518585205078, - "eval_logps/rejected": -37.50544357299805, - "eval_loss": 0.48907649517059326, - "eval_rewards/accuracies": 0.594684362411499, - "eval_rewards/chosen": 0.057873498648405075, - "eval_rewards/margins": 0.055638302117586136, - "eval_rewards/rejected": 0.0022352009546011686, - "eval_runtime": 145.8794, - "eval_samples_per_second": 2.351, - "eval_steps_per_second": 0.295, - "step": 1400 - }, - { - "epoch": 3.66, - "learning_rate": 1.0775199359171346e-07, - "logits/chosen": -1.8713433742523193, - "logits/rejected": -1.8673336505889893, - "logps/chosen": -30.67318344116211, - "logps/rejected": -30.160730361938477, - "loss": 0.3524, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5938695669174194, - "rewards/margins": 0.7151241898536682, - "rewards/rejected": -0.12125466763973236, - "step": 1410 - }, - { - "epoch": 3.69, - "learning_rate": 9.191080703056604e-08, - "logits/chosen": -1.8283851146697998, - "logits/rejected": -1.829474687576294, - "logps/chosen": -30.433517456054688, - "logps/rejected": -35.27409744262695, - "loss": 0.3607, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.5862718820571899, - "rewards/margins": 0.6562893986701965, - "rewards/rejected": -0.07001753151416779, - "step": 1420 - }, - { - "epoch": 3.71, - "learning_rate": 7.730678442730539e-08, - "logits/chosen": -1.7778269052505493, - "logits/rejected": -1.7712900638580322, - "logps/chosen": -31.182327270507812, - "logps/rejected": -37.51873016357422, - "loss": 0.3484, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5918875932693481, - "rewards/margins": 0.7577527165412903, - "rewards/rejected": -0.16586506366729736, - "step": 1430 - }, - { - "epoch": 3.74, - "learning_rate": 6.394742864787806e-08, - "logits/chosen": -1.7914530038833618, - "logits/rejected": -1.7860643863677979, - "logps/chosen": -26.54972267150879, - "logps/rejected": -32.384273529052734, - "loss": 0.3594, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5527039766311646, - "rewards/margins": 0.6834272742271423, - "rewards/rejected": -0.13072335720062256, - "step": 1440 - }, - { - "epoch": 3.77, - "learning_rate": 5.183960310644748e-08, - "logits/chosen": -1.8194904327392578, - "logits/rejected": -1.8093700408935547, - "logps/chosen": -30.115234375, - "logps/rejected": -36.441741943359375, - "loss": 0.3634, - "rewards/accuracies": 0.925000011920929, - "rewards/chosen": 0.473258912563324, - "rewards/margins": 0.7045349478721619, - "rewards/rejected": -0.23127606511116028, - "step": 1450 - }, - { - "epoch": 3.79, - "learning_rate": 4.098952823928693e-08, - "logits/chosen": -1.7905895709991455, - "logits/rejected": -1.7878305912017822, - "logps/chosen": -30.670690536499023, - "logps/rejected": -31.243310928344727, - "loss": 0.3722, - "rewards/accuracies": 0.9125000238418579, - "rewards/chosen": 0.5278172492980957, - "rewards/margins": 0.5983149409294128, - "rewards/rejected": -0.07049769908189774, - "step": 1460 - }, - { - "epoch": 3.82, - "learning_rate": 3.1402778309014284e-08, - "logits/chosen": -1.8513298034667969, - "logits/rejected": -1.8576581478118896, - "logps/chosen": -28.771564483642578, - "logps/rejected": -33.66225051879883, - "loss": 0.3376, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.6595225930213928, - "rewards/margins": 0.799488365650177, - "rewards/rejected": -0.1399657279253006, - "step": 1470 - }, - { - "epoch": 3.84, - "learning_rate": 2.3084278540791427e-08, - "logits/chosen": -1.8537070751190186, - "logits/rejected": -1.8638670444488525, - "logps/chosen": -28.841760635375977, - "logps/rejected": -30.2868709564209, - "loss": 0.3537, - "rewards/accuracies": 0.9750000238418579, - "rewards/chosen": 0.5706985592842102, - "rewards/margins": 0.7023594975471497, - "rewards/rejected": -0.13166090846061707, - "step": 1480 - }, - { - "epoch": 3.87, - "learning_rate": 1.6038302591975807e-08, - "logits/chosen": -1.7844855785369873, - "logits/rejected": -1.7776241302490234, - "logps/chosen": -30.99209213256836, - "logps/rejected": -32.9597053527832, - "loss": 0.359, - "rewards/accuracies": 0.9375, - "rewards/chosen": 0.5328781604766846, - "rewards/margins": 0.6827234029769897, - "rewards/rejected": -0.149845153093338, - "step": 1490 - }, - { - "epoch": 3.9, - "learning_rate": 1.0268470356514237e-08, - "logits/chosen": -1.8382985591888428, - "logits/rejected": -1.8351030349731445, - "logps/chosen": -30.677505493164062, - "logps/rejected": -34.204551696777344, - "loss": 0.3472, - "rewards/accuracies": 0.887499988079071, - "rewards/chosen": 0.5896133184432983, - "rewards/margins": 0.778479278087616, - "rewards/rejected": -0.18886595964431763, - "step": 1500 - }, - { - "epoch": 3.9, - "eval_logits/chosen": -2.106571674346924, - "eval_logits/rejected": -2.101935386657715, - "eval_logps/chosen": -33.74234390258789, - "eval_logps/rejected": -37.49684524536133, - "eval_loss": 0.48921331763267517, - "eval_rewards/accuracies": 0.6092192530632019, - "eval_rewards/chosen": 0.058441367000341415, - "eval_rewards/margins": 0.054486002773046494, - "eval_rewards/rejected": 0.003955358173698187, - "eval_runtime": 145.812, - "eval_samples_per_second": 2.352, - "eval_steps_per_second": 0.295, - "step": 1500 - }, - { - "epoch": 3.92, - "learning_rate": 5.777746105209147e-09, - "logits/chosen": -1.915920615196228, - "logits/rejected": -1.9165786504745483, - "logps/chosen": -26.842731475830078, - "logps/rejected": -33.47703552246094, - "loss": 0.3525, - "rewards/accuracies": 0.8500000238418579, - "rewards/chosen": 0.5947096943855286, - "rewards/margins": 0.7420263290405273, - "rewards/rejected": -0.14731669425964355, - "step": 1510 - }, - { - "epoch": 3.95, - "learning_rate": 2.5684369628148352e-09, - "logits/chosen": -1.7689346075057983, - "logits/rejected": -1.7688779830932617, - "logps/chosen": -30.16294288635254, - "logps/rejected": -34.59621810913086, - "loss": 0.3536, - "rewards/accuracies": 0.8999999761581421, - "rewards/chosen": 0.5757853388786316, - "rewards/margins": 0.7202829122543335, - "rewards/rejected": -0.1444975882768631, - "step": 1520 - }, - { - "epoch": 3.97, - "learning_rate": 6.421917227455999e-10, - "logits/chosen": -1.9208015203475952, - "logits/rejected": -1.9181976318359375, - "logps/chosen": -28.897686004638672, - "logps/rejected": -32.24897003173828, - "loss": 0.3645, - "rewards/accuracies": 0.949999988079071, - "rewards/chosen": 0.5152745246887207, - "rewards/margins": 0.6683377623558044, - "rewards/rejected": -0.15306326746940613, - "step": 1530 - }, - { - "epoch": 4.0, - "learning_rate": 0.0, - "logits/chosen": -1.9047333002090454, - "logits/rejected": -1.9058234691619873, - "logps/chosen": -28.1831111907959, - "logps/rejected": -30.388927459716797, - "loss": 0.3731, - "rewards/accuracies": 0.9583333730697632, - "rewards/chosen": 0.4873664975166321, - "rewards/margins": 0.6071158647537231, - "rewards/rejected": -0.11974940448999405, - "step": 1540 - }, - { - "epoch": 4.0, - "step": 1540, + "epoch": 1.0, + "step": 385, "total_flos": 0.0, - "train_loss": 0.30868637050901143, - "train_runtime": 10803.6, - "train_samples_per_second": 1.14, - "train_steps_per_second": 0.143 + "train_loss": 5.437272009911475, + "train_runtime": 3253.1823, + "train_samples_per_second": 0.946, + "train_steps_per_second": 0.118 } ], "logging_steps": 10, - "max_steps": 1540, + "max_steps": 385, "num_input_tokens_seen": 0, - "num_train_epochs": 4, + "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4,