diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,21 +1,22 @@ { "best_metric": null, "best_model_checkpoint": null, - "epoch": 1.0, + "epoch": 0.9990186457311089, "eval_steps": 100, - "global_step": 2038, + "global_step": 509, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.0, - "grad_norm": 11.418422500469418, - "learning_rate": 2.4509803921568627e-09, - "logits/chosen": -0.4609375, - "logits/rejected": -0.5625, - "logps/chosen": -1832.0, - "logps/rejected": -1832.0, + "epoch": 0.001962708537782139, + "grad_norm": 2.4117076017287205, + "learning_rate": 9.803921568627451e-09, + "logits/chosen": -1.125, + "logits/rejected": -1.1875, + "logps/bottom_tokens": -0.000553131103515625, + "logps/chosen": -500.0, + "logps/rejected": -520.0, "loss": 0.6914, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, @@ -24,3382 +25,902 @@ "step": 1 }, { - "epoch": 0.0, - "grad_norm": 10.679791571668222, - "learning_rate": 2.4509803921568626e-08, - "logits/chosen": -0.55859375, - "logits/rejected": -0.58203125, - "logps/chosen": -2784.0, - "logps/rejected": -2624.0, - "loss": 0.6994, - "rewards/accuracies": 0.2888889014720917, - "rewards/chosen": -0.0011749267578125, - "rewards/margins": -0.0142822265625, - "rewards/rejected": 0.01312255859375, + "epoch": 0.019627085377821395, + "grad_norm": 2.3800058601187866, + "learning_rate": 9.80392156862745e-08, + "logits/chosen": -1.1640625, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.00081634521484375, + "logps/chosen": -380.0, + "logps/rejected": -316.0, + "loss": 0.6922, + "rewards/accuracies": 0.41111111640930176, + "rewards/chosen": -0.000202178955078125, + "rewards/margins": 0.0035247802734375, + "rewards/rejected": -0.00372314453125, "step": 10 }, { - "epoch": 0.01, - "grad_norm": 9.399862723559004, - "learning_rate": 4.901960784313725e-08, - "logits/chosen": -0.62109375, - "logits/rejected": -0.72265625, - "logps/chosen": -2064.0, - "logps/rejected": -1632.0, - "loss": 0.6943, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": 0.01373291015625, - "rewards/margins": 0.012939453125, - "rewards/rejected": 0.00079345703125, + "epoch": 0.03925417075564279, + "grad_norm": 2.4064882227881057, + "learning_rate": 1.96078431372549e-07, + "logits/chosen": -1.0859375, + "logits/rejected": -1.1484375, + "logps/bottom_tokens": -0.0008392333984375, + "logps/chosen": -374.0, + "logps/rejected": -324.0, + "loss": 0.6913, + "rewards/accuracies": 0.4399999976158142, + "rewards/chosen": 0.003662109375, + "rewards/margins": 0.0033111572265625, + "rewards/rejected": 0.0003528594970703125, "step": 20 }, { - "epoch": 0.01, - "grad_norm": 11.520323557414129, - "learning_rate": 7.352941176470588e-08, - "logits/chosen": -0.61328125, - "logits/rejected": -0.6015625, - "logps/chosen": -1984.0, - "logps/rejected": -1968.0, - "loss": 0.6943, - "rewards/accuracies": 0.36000004410743713, - "rewards/chosen": -0.01300048828125, - "rewards/margins": -0.0107421875, - "rewards/rejected": -0.002197265625, + "epoch": 0.058881256133464184, + "grad_norm": 2.3536995350535426, + "learning_rate": 2.941176470588235e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.1796875, + "logps/bottom_tokens": -0.000774383544921875, + "logps/chosen": -364.0, + "logps/rejected": -324.0, + "loss": 0.6938, + "rewards/accuracies": 0.3850000202655792, + "rewards/chosen": -0.0030670166015625, + "rewards/margins": -0.0067138671875, + "rewards/rejected": 0.003631591796875, "step": 30 }, { - "epoch": 0.02, - "grad_norm": 11.053039743567291, - "learning_rate": 9.80392156862745e-08, - "logits/chosen": -0.5078125, - "logits/rejected": -0.54296875, - "logps/chosen": -2176.0, - "logps/rejected": -2024.0, - "loss": 0.69, - "rewards/accuracies": 0.36000004410743713, - "rewards/chosen": 0.01409912109375, - "rewards/margins": 0.0230712890625, - "rewards/rejected": -0.009033203125, + "epoch": 0.07850834151128558, + "grad_norm": 2.3870217018270155, + "learning_rate": 3.92156862745098e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.15625, + "logps/bottom_tokens": -0.000782012939453125, + "logps/chosen": -378.0, + "logps/rejected": -338.0, + "loss": 0.6929, + "rewards/accuracies": 0.445000022649765, + "rewards/chosen": 0.000881195068359375, + "rewards/margins": 0.00244140625, + "rewards/rejected": -0.00154876708984375, "step": 40 }, { - "epoch": 0.02, - "grad_norm": 10.828208455861878, - "learning_rate": 1.2254901960784314e-07, - "logits/chosen": -0.6015625, - "logits/rejected": -0.703125, - "logps/chosen": -2080.0, - "logps/rejected": -1624.0, - "loss": 0.6967, - "rewards/accuracies": 0.4000000059604645, - "rewards/chosen": 0.0101318359375, - "rewards/margins": 0.00958251953125, - "rewards/rejected": 0.000507354736328125, + "epoch": 0.09813542688910697, + "grad_norm": 2.4788478916800147, + "learning_rate": 4.901960784313725e-07, + "logits/chosen": -1.1171875, + "logits/rejected": -1.1484375, + "logps/bottom_tokens": -0.000789642333984375, + "logps/chosen": -406.0, + "logps/rejected": -352.0, + "loss": 0.6915, + "rewards/accuracies": 0.5400000214576721, + "rewards/chosen": 0.0026397705078125, + "rewards/margins": 0.005889892578125, + "rewards/rejected": -0.00323486328125, "step": 50 }, { - "epoch": 0.03, - "grad_norm": 10.30578886430554, - "learning_rate": 1.4705882352941175e-07, - "logits/chosen": -0.5625, - "logits/rejected": -0.703125, - "logps/chosen": -2608.0, - "logps/rejected": -1904.0, - "loss": 0.6971, - "rewards/accuracies": 0.40000003576278687, - "rewards/chosen": -0.004302978515625, - "rewards/margins": -0.017578125, - "rewards/rejected": 0.01324462890625, + "epoch": 0.11776251226692837, + "grad_norm": 2.360316334548125, + "learning_rate": 4.995237599803335e-07, + "logits/chosen": -1.140625, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.000797271728515625, + "logps/chosen": -406.0, + "logps/rejected": -322.0, + "loss": 0.6913, + "rewards/accuracies": 0.42500001192092896, + "rewards/chosen": 0.00244140625, + "rewards/margins": 0.00201416015625, + "rewards/rejected": 0.000431060791015625, "step": 60 }, { - "epoch": 0.03, - "grad_norm": 11.148448836585857, - "learning_rate": 1.715686274509804e-07, - "logits/chosen": -0.66015625, - "logits/rejected": -0.7109375, - "logps/chosen": -2112.0, - "logps/rejected": -1880.0, - "loss": 0.6893, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.055419921875, - "rewards/margins": 0.03466796875, - "rewards/rejected": 0.020751953125, + "epoch": 0.13738959764474976, + "grad_norm": 2.3051434353276847, + "learning_rate": 4.978798275112142e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.1328125, + "logps/bottom_tokens": -0.00078582763671875, + "logps/chosen": -372.0, + "logps/rejected": -330.0, + "loss": 0.688, + "rewards/accuracies": 0.5049999952316284, + "rewards/chosen": 0.00897216796875, + "rewards/margins": 0.01190185546875, + "rewards/rejected": -0.0028839111328125, "step": 70 }, { - "epoch": 0.04, - "grad_norm": 9.100710948464306, - "learning_rate": 1.96078431372549e-07, - "logits/chosen": -0.53515625, - "logits/rejected": -0.5625, - "logps/chosen": -2256.0, - "logps/rejected": -2040.0, - "loss": 0.6961, - "rewards/accuracies": 0.41999998688697815, - "rewards/chosen": 0.050537109375, - "rewards/margins": 0.0162353515625, - "rewards/rejected": 0.034423828125, + "epoch": 0.15701668302257116, + "grad_norm": 2.2866846976386, + "learning_rate": 4.950700530747689e-07, + "logits/chosen": -1.078125, + "logits/rejected": -1.1484375, + "logps/bottom_tokens": -0.000904083251953125, + "logps/chosen": -378.0, + "logps/rejected": -308.0, + "loss": 0.685, + "rewards/accuracies": 0.5450000166893005, + "rewards/chosen": -0.00121307373046875, + "rewards/margins": 0.01483154296875, + "rewards/rejected": -0.01611328125, "step": 80 }, { - "epoch": 0.04, - "grad_norm": 11.809225699546186, - "learning_rate": 2.2058823529411763e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.625, - "logps/chosen": -2000.0, - "logps/rejected": -1712.0, - "loss": 0.6807, - "rewards/accuracies": 0.48000001907348633, - "rewards/chosen": 0.0888671875, - "rewards/margins": 0.0196533203125, - "rewards/rejected": 0.0693359375, + "epoch": 0.17664376840039253, + "grad_norm": 2.3053347338418098, + "learning_rate": 4.911076517558622e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.15625, + "logps/bottom_tokens": -0.000835418701171875, + "logps/chosen": -382.0, + "logps/rejected": -346.0, + "loss": 0.6832, + "rewards/accuracies": 0.5600000023841858, + "rewards/chosen": -0.0106201171875, + "rewards/margins": 0.0159912109375, + "rewards/rejected": -0.026611328125, "step": 90 }, { - "epoch": 0.05, - "grad_norm": 10.00322127291456, - "learning_rate": 2.4509803921568627e-07, - "logits/chosen": -0.5703125, - "logits/rejected": -0.66015625, - "logps/chosen": -2208.0, - "logps/rejected": -1944.0, - "loss": 0.684, - "rewards/accuracies": 0.46000003814697266, - "rewards/chosen": 0.1591796875, - "rewards/margins": 0.0250244140625, - "rewards/rejected": 0.1337890625, + "epoch": 0.19627085377821393, + "grad_norm": 2.2125416576513732, + "learning_rate": 4.860112597371772e-07, + "logits/chosen": -1.125, + "logits/rejected": -1.171875, + "logps/bottom_tokens": -0.0009002685546875, + "logps/chosen": -372.0, + "logps/rejected": -328.0, + "loss": 0.678, + "rewards/accuracies": 0.699999988079071, + "rewards/chosen": -0.017333984375, + "rewards/margins": 0.0361328125, + "rewards/rejected": -0.053466796875, "step": 100 }, { - "epoch": 0.05, - "eval_logits/chosen": -0.6640625, - "eval_logits/rejected": -0.72265625, - "eval_logps/chosen": -2352.0, - "eval_logps/rejected": -2048.0, - "eval_loss": 0.6767656207084656, - "eval_rewards/accuracies": 0.449404776096344, - "eval_rewards/chosen": 0.2314453125, - "eval_rewards/margins": 0.04052734375, - "eval_rewards/rejected": 0.1904296875, - "eval_runtime": 90.1206, - "eval_samples_per_second": 22.192, - "eval_steps_per_second": 0.466, + "epoch": 0.19627085377821393, + "eval_logits/chosen": -1.09375, + "eval_logits/rejected": -1.15625, + "eval_logps/bottom_tokens": -0.0008697509765625, + "eval_logps/chosen": -396.0, + "eval_logps/rejected": -344.0, + "eval_loss": 0.6789160370826721, + "eval_rewards/accuracies": 0.5880597233772278, + "eval_rewards/chosen": -0.0274658203125, + "eval_rewards/margins": 0.033203125, + "eval_rewards/rejected": -0.060791015625, + "eval_runtime": 94.4387, + "eval_samples_per_second": 21.178, + "eval_steps_per_second": 0.709, "step": 100 }, { - "epoch": 0.05, - "grad_norm": 11.799509248452377, - "learning_rate": 2.6960784313725486e-07, - "logits/chosen": -0.55859375, - "logits/rejected": -0.6640625, - "logps/chosen": -2432.0, - "logps/rejected": -1888.0, - "loss": 0.682, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.255859375, - "rewards/margins": 0.04736328125, - "rewards/rejected": 0.2080078125, + "epoch": 0.21589793915603533, + "grad_norm": 2.438395616681449, + "learning_rate": 4.798048466485017e-07, + "logits/chosen": -1.1015625, + "logits/rejected": -1.109375, + "logps/bottom_tokens": -0.0008544921875, + "logps/chosen": -344.0, + "logps/rejected": -332.0, + "loss": 0.6804, + "rewards/accuracies": 0.5899999737739563, + "rewards/chosen": -0.037109375, + "rewards/margins": 0.02001953125, + "rewards/rejected": -0.05712890625, "step": 110 }, { - "epoch": 0.06, - "grad_norm": 9.997348094627812, - "learning_rate": 2.941176470588235e-07, - "logits/chosen": -0.59765625, - "logits/rejected": -0.5546875, - "logps/chosen": -2304.0, - "logps/rejected": -2352.0, - "loss": 0.6822, - "rewards/accuracies": 0.46000003814697266, - "rewards/chosen": 0.28515625, - "rewards/margins": -0.00159454345703125, - "rewards/rejected": 0.287109375, + "epoch": 0.23552502453385674, + "grad_norm": 2.226213549318803, + "learning_rate": 4.725176028314541e-07, + "logits/chosen": -1.109375, + "logits/rejected": -1.1171875, + "logps/bottom_tokens": -0.0008544921875, + "logps/chosen": -372.0, + "logps/rejected": -354.0, + "loss": 0.6745, + "rewards/accuracies": 0.6399999856948853, + "rewards/chosen": -0.03564453125, + "rewards/margins": 0.0517578125, + "rewards/rejected": -0.08740234375, "step": 120 }, { - "epoch": 0.06, - "grad_norm": 10.489336936701164, - "learning_rate": 3.1862745098039215e-07, - "logits/chosen": -0.58203125, - "logits/rejected": -0.69140625, - "logps/chosen": -2240.0, - "logps/rejected": -1608.0, - "loss": 0.6706, - "rewards/accuracies": 0.46000003814697266, - "rewards/chosen": 0.34375, - "rewards/margins": 0.103515625, - "rewards/rejected": 0.2412109375, + "epoch": 0.25515210991167814, + "grad_norm": 2.4135162897156706, + "learning_rate": 4.641838020498713e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.1640625, + "logps/bottom_tokens": -0.000926971435546875, + "logps/chosen": -408.0, + "logps/rejected": -338.0, + "loss": 0.6674, + "rewards/accuracies": 0.5849999785423279, + "rewards/chosen": -0.0703125, + "rewards/margins": 0.0517578125, + "rewards/rejected": -0.1220703125, "step": 130 }, { - "epoch": 0.07, - "grad_norm": 8.373655221952864, - "learning_rate": 3.431372549019608e-07, - "logits/chosen": -0.6328125, - "logits/rejected": -0.6640625, - "logps/chosen": -2256.0, - "logps/rejected": -2128.0, - "loss": 0.6786, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.376953125, - "rewards/margins": 0.0556640625, - "rewards/rejected": 0.322265625, + "epoch": 0.2747791952894995, + "grad_norm": 2.4502181786024004, + "learning_rate": 4.5484264029156733e-07, + "logits/chosen": -1.1015625, + "logits/rejected": -1.1484375, + "logps/bottom_tokens": -0.000823974609375, + "logps/chosen": -386.0, + "logps/rejected": -336.0, + "loss": 0.6635, + "rewards/accuracies": 0.5900000333786011, + "rewards/chosen": -0.1015625, + "rewards/margins": 0.048828125, + "rewards/rejected": -0.150390625, "step": 140 }, { - "epoch": 0.07, - "grad_norm": 7.530198235177303, - "learning_rate": 3.6764705882352943e-07, - "logits/chosen": -0.6328125, - "logits/rejected": -0.74609375, - "logps/chosen": -2560.0, - "logps/rejected": -2000.0, - "loss": 0.6618, - "rewards/accuracies": 0.5399999618530273, - "rewards/chosen": 0.466796875, - "rewards/margins": 0.1103515625, - "rewards/rejected": 0.35546875, + "epoch": 0.2944062806673209, + "grad_norm": 2.4663119079457614, + "learning_rate": 4.445380514196192e-07, + "logits/chosen": -1.09375, + "logits/rejected": -1.171875, + "logps/bottom_tokens": -0.0008697509765625, + "logps/chosen": -428.0, + "logps/rejected": -356.0, + "loss": 0.668, + "rewards/accuracies": 0.6350000500679016, + "rewards/chosen": -0.12890625, + "rewards/margins": 0.0673828125, + "rewards/rejected": -0.1962890625, "step": 150 }, { - "epoch": 0.08, - "grad_norm": 9.93615787272239, - "learning_rate": 3.92156862745098e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.62109375, - "logps/chosen": -2544.0, - "logps/rejected": -2272.0, - "loss": 0.6632, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.4921875, - "rewards/margins": 0.08984375, - "rewards/rejected": 0.40234375, + "epoch": 0.3140333660451423, + "grad_norm": 2.455591342132379, + "learning_rate": 4.33318500540218e-07, + "logits/chosen": -1.0859375, + "logits/rejected": -1.1328125, + "logps/bottom_tokens": -0.000896453857421875, + "logps/chosen": -408.0, + "logps/rejected": -368.0, + "loss": 0.6655, + "rewards/accuracies": 0.6050000190734863, + "rewards/chosen": -0.16796875, + "rewards/margins": 0.078125, + "rewards/rejected": -0.24609375, "step": 160 }, { - "epoch": 0.08, - "grad_norm": 8.034753030681635, - "learning_rate": 4.1666666666666667e-07, - "logits/chosen": -0.59375, - "logits/rejected": -0.703125, - "logps/chosen": -2400.0, - "logps/rejected": -1968.0, - "loss": 0.6664, - "rewards/accuracies": 0.42000001668930054, - "rewards/chosen": 0.48828125, - "rewards/margins": 0.0771484375, - "rewards/rejected": 0.41015625, + "epoch": 0.3336604514229637, + "grad_norm": 2.436300399124971, + "learning_rate": 4.2123675605892985e-07, + "logits/chosen": -1.078125, + "logits/rejected": -1.1484375, + "logps/bottom_tokens": -0.00099945068359375, + "logps/chosen": -422.0, + "logps/rejected": -364.0, + "loss": 0.6585, + "rewards/accuracies": 0.6450000405311584, + "rewards/chosen": -0.1826171875, + "rewards/margins": 0.115234375, + "rewards/rejected": -0.296875, "step": 170 }, { - "epoch": 0.09, - "grad_norm": 8.810211928299461, - "learning_rate": 4.4117647058823526e-07, - "logits/chosen": -0.5, - "logits/rejected": -0.625, - "logps/chosen": -3008.0, - "logps/rejected": -2416.0, - "loss": 0.6614, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.7265625, - "rewards/margins": 0.2216796875, - "rewards/rejected": 0.5078125, + "epoch": 0.35328753680078506, + "grad_norm": 2.5546008416763035, + "learning_rate": 4.0834964149744333e-07, + "logits/chosen": -1.1015625, + "logits/rejected": -1.15625, + "logps/bottom_tokens": -0.00096893310546875, + "logps/chosen": -416.0, + "logps/rejected": -380.0, + "loss": 0.6643, + "rewards/accuracies": 0.6100000143051147, + "rewards/chosen": -0.232421875, + "rewards/margins": 0.06396484375, + "rewards/rejected": -0.296875, "step": 180 }, { - "epoch": 0.09, - "grad_norm": 8.455528608933424, - "learning_rate": 4.656862745098039e-07, - "logits/chosen": -0.484375, - "logits/rejected": -0.62109375, - "logps/chosen": -2544.0, - "logps/rejected": -2064.0, - "loss": 0.6596, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.6171875, - "rewards/margins": 0.1083984375, - "rewards/rejected": 0.51171875, + "epoch": 0.3729146221786065, + "grad_norm": 2.555290762655567, + "learning_rate": 3.947177682380738e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.000804901123046875, + "logps/chosen": -378.0, + "logps/rejected": -356.0, + "loss": 0.6499, + "rewards/accuracies": 0.6450001001358032, + "rewards/chosen": -0.2412109375, + "rewards/margins": 0.11669921875, + "rewards/rejected": -0.357421875, "step": 190 }, { - "epoch": 0.1, - "grad_norm": 9.256083265710721, - "learning_rate": 4.901960784313725e-07, - "logits/chosen": -0.515625, - "logits/rejected": -0.5625, - "logps/chosen": -2768.0, - "logps/rejected": -2576.0, - "loss": 0.663, - "rewards/accuracies": 0.46000003814697266, - "rewards/chosen": 0.70703125, - "rewards/margins": 0.05224609375, - "rewards/rejected": 0.65625, + "epoch": 0.39254170755642787, + "grad_norm": 2.7737043586573313, + "learning_rate": 3.804052504529933e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.2265625, + "logps/bottom_tokens": -0.000858306884765625, + "logps/chosen": -392.0, + "logps/rejected": -370.0, + "loss": 0.645, + "rewards/accuracies": 0.6350000500679016, + "rewards/chosen": -0.271484375, + "rewards/margins": 0.1396484375, + "rewards/rejected": -0.41015625, "step": 200 }, { - "epoch": 0.1, - "eval_logits/chosen": -0.66796875, - "eval_logits/rejected": -0.7265625, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.6565937399864197, - "eval_rewards/accuracies": 0.494047611951828, - "eval_rewards/chosen": 0.59765625, - "eval_rewards/margins": 0.11083984375, - "eval_rewards/rejected": 0.48828125, - "eval_runtime": 89.4808, - "eval_samples_per_second": 22.351, - "eval_steps_per_second": 0.469, + "epoch": 0.39254170755642787, + "eval_logits/chosen": -1.15625, + "eval_logits/rejected": -1.203125, + "eval_logps/bottom_tokens": -0.00091552734375, + "eval_logps/chosen": -422.0, + "eval_logps/rejected": -380.0, + "eval_loss": 0.6488671898841858, + "eval_rewards/accuracies": 0.6447761058807373, + "eval_rewards/chosen": -0.287109375, + "eval_rewards/margins": 0.13671875, + "eval_rewards/rejected": -0.423828125, + "eval_runtime": 94.2281, + "eval_samples_per_second": 21.225, + "eval_steps_per_second": 0.711, "step": 200 }, { - "epoch": 0.1, - "grad_norm": 10.458558020929722, - "learning_rate": 4.999867958705476e-07, - "logits/chosen": -0.58203125, - "logits/rejected": -0.61328125, - "logps/chosen": -2352.0, - "logps/rejected": -2272.0, - "loss": 0.6743, - "rewards/accuracies": 0.47999995946884155, - "rewards/chosen": 0.61328125, - "rewards/margins": 0.0166015625, - "rewards/rejected": 0.59765625, + "epoch": 0.41216879293424924, + "grad_norm": 2.8286672144445277, + "learning_rate": 3.654794035589483e-07, + "logits/chosen": -1.1328125, + "logits/rejected": -1.1640625, + "logps/bottom_tokens": -0.00092315673828125, + "logps/chosen": -362.0, + "logps/rejected": -344.0, + "loss": 0.6512, + "rewards/accuracies": 0.6149999499320984, + "rewards/chosen": -0.298828125, + "rewards/margins": 0.12060546875, + "rewards/rejected": -0.419921875, "step": 210 }, { - "epoch": 0.11, - "grad_norm": 10.655404888327219, - "learning_rate": 4.999061090193831e-07, - "logits/chosen": -0.609375, - "logits/rejected": -0.70703125, - "logps/chosen": -2528.0, - "logps/rejected": -2112.0, - "loss": 0.6801, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.6796875, - "rewards/margins": 0.1416015625, - "rewards/rejected": 0.53515625, + "epoch": 0.43179587831207067, + "grad_norm": 2.98579141751378, + "learning_rate": 3.5001042761570826e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.2109375, + "logps/bottom_tokens": -0.000751495361328125, + "logps/chosen": -414.0, + "logps/rejected": -398.0, + "loss": 0.6507, + "rewards/accuracies": 0.5800000429153442, + "rewards/chosen": -0.333984375, + "rewards/margins": 0.11279296875, + "rewards/rejected": -0.447265625, "step": 220 }, { - "epoch": 0.11, - "grad_norm": 10.56468733064606, - "learning_rate": 4.997520945910046e-07, - "logits/chosen": -0.58203125, - "logits/rejected": -0.7109375, - "logps/chosen": -2368.0, - "logps/rejected": -1848.0, - "loss": 0.674, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.1865234375, - "rewards/rejected": 0.400390625, + "epoch": 0.45142296368989204, + "grad_norm": 2.849801650804548, + "learning_rate": 3.34071077157304e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.2265625, + "logps/bottom_tokens": -0.000789642333984375, + "logps/chosen": -388.0, + "logps/rejected": -354.0, + "loss": 0.6464, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -0.353515625, + "rewards/margins": 0.1337890625, + "rewards/rejected": -0.48828125, "step": 230 }, { - "epoch": 0.12, - "grad_norm": 9.35483083586776, - "learning_rate": 4.995247977764035e-07, - "logits/chosen": -0.6875, - "logits/rejected": -0.75, - "logps/chosen": -2192.0, - "logps/rejected": -1968.0, - "loss": 0.6788, - "rewards/accuracies": 0.4399999976158142, - "rewards/chosen": 0.484375, - "rewards/margins": 0.0615234375, - "rewards/rejected": 0.423828125, + "epoch": 0.47105004906771347, + "grad_norm": 3.020709895469043, + "learning_rate": 3.1773631900892204e-07, + "logits/chosen": -1.1484375, + "logits/rejected": -1.1875, + "logps/bottom_tokens": -0.00077056884765625, + "logps/chosen": -416.0, + "logps/rejected": -396.0, + "loss": 0.6442, + "rewards/accuracies": 0.6200000047683716, + "rewards/chosen": -0.38671875, + "rewards/margins": 0.1337890625, + "rewards/rejected": -0.51953125, "step": 240 }, { - "epoch": 0.12, - "grad_norm": 9.99023631027908, - "learning_rate": 4.992242852691269e-07, - "logits/chosen": -0.62109375, - "logits/rejected": -0.703125, - "logps/chosen": -2160.0, - "logps/rejected": -1960.0, - "loss": 0.6734, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.5, - "rewards/margins": 0.1396484375, - "rewards/rejected": 0.361328125, + "epoch": 0.49067713444553485, + "grad_norm": 2.801068325901482, + "learning_rate": 3.0108297969883103e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.1953125, + "logps/bottom_tokens": -0.000827789306640625, + "logps/chosen": -426.0, + "logps/rejected": -398.0, + "loss": 0.6347, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": -0.39453125, + "rewards/margins": 0.1865234375, + "rewards/rejected": -0.58203125, "step": 250 }, { - "epoch": 0.13, - "grad_norm": 8.81462332150393, - "learning_rate": 4.988506452457066e-07, - "logits/chosen": -0.5625, - "logits/rejected": -0.6484375, - "logps/chosen": -2272.0, - "logps/rejected": -1952.0, - "loss": 0.6606, - "rewards/accuracies": 0.4800000786781311, - "rewards/chosen": 0.419921875, - "rewards/margins": 0.04931640625, - "rewards/rejected": 0.37109375, + "epoch": 0.5103042198233563, + "grad_norm": 2.8119914001202835, + "learning_rate": 2.8418938412365013e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.000873565673828125, + "logps/chosen": -396.0, + "logps/rejected": -372.0, + "loss": 0.6381, + "rewards/accuracies": 0.6350000500679016, + "rewards/chosen": -0.373046875, + "rewards/margins": 0.171875, + "rewards/rejected": -0.546875, "step": 260 }, { - "epoch": 0.13, - "grad_norm": 9.565836577883003, - "learning_rate": 4.984039873397879e-07, - "logits/chosen": -0.5703125, - "logits/rejected": -0.6953125, - "logps/chosen": -2608.0, - "logps/rejected": -2032.0, - "loss": 0.6715, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.60546875, - "rewards/margins": 0.1826171875, - "rewards/rejected": 0.42578125, + "epoch": 0.5299313052011776, + "grad_norm": 2.914608701481186, + "learning_rate": 2.671349871664101e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.171875, + "logps/bottom_tokens": -0.0007781982421875, + "logps/chosen": -398.0, + "logps/rejected": -386.0, + "loss": 0.6315, + "rewards/accuracies": 0.64000004529953, + "rewards/chosen": -0.40625, + "rewards/margins": 0.17578125, + "rewards/rejected": -0.58203125, "step": 270 }, { - "epoch": 0.14, - "grad_norm": 7.882899392773696, - "learning_rate": 4.9788444260996e-07, - "logits/chosen": -0.4921875, - "logits/rejected": -0.55859375, - "logps/chosen": -2512.0, - "logps/rejected": -2144.0, - "loss": 0.6488, - "rewards/accuracies": 0.4800000786781311, - "rewards/chosen": 0.578125, - "rewards/margins": 0.1357421875, - "rewards/rejected": 0.443359375, + "epoch": 0.549558390578999, + "grad_norm": 2.974677635397429, + "learning_rate": 2.5e-07, + "logits/chosen": -1.171875, + "logits/rejected": -1.1953125, + "logps/bottom_tokens": -0.000904083251953125, + "logps/chosen": -438.0, + "logps/rejected": -402.0, + "loss": 0.6384, + "rewards/accuracies": 0.6600000262260437, + "rewards/chosen": -0.42578125, + "rewards/margins": 0.21484375, + "rewards/rejected": -0.640625, "step": 280 }, { - "epoch": 0.14, - "grad_norm": 8.30641415306888, - "learning_rate": 4.97292163501301e-07, - "logits/chosen": -0.57421875, - "logits/rejected": -0.578125, - "logps/chosen": -2400.0, - "logps/rejected": -2352.0, - "loss": 0.641, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.63671875, - "rewards/margins": 0.10107421875, - "rewards/rejected": 0.5390625, + "epoch": 0.5691854759568205, + "grad_norm": 3.4767790428686234, + "learning_rate": 2.3286501283358982e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.2421875, + "logps/bottom_tokens": -0.00086212158203125, + "logps/chosen": -412.0, + "logps/rejected": -376.0, + "loss": 0.632, + "rewards/accuracies": 0.5750000476837158, + "rewards/chosen": -0.4609375, + "rewards/margins": 0.16015625, + "rewards/rejected": -0.62109375, "step": 290 }, { - "epoch": 0.15, - "grad_norm": 8.469934208860764, - "learning_rate": 4.96627323800647e-07, - "logits/chosen": -0.427734375, - "logits/rejected": -0.6328125, - "logps/chosen": -3056.0, - "logps/rejected": -2128.0, - "loss": 0.6529, - "rewards/accuracies": 0.6599999666213989, - "rewards/chosen": 0.76953125, - "rewards/margins": 0.265625, - "rewards/rejected": 0.50390625, + "epoch": 0.5888125613346418, + "grad_norm": 3.0983859451271565, + "learning_rate": 2.1581061587634987e-07, + "logits/chosen": -1.203125, + "logits/rejected": -1.2421875, + "logps/bottom_tokens": -0.000774383544921875, + "logps/chosen": -428.0, + "logps/rejected": -388.0, + "loss": 0.6396, + "rewards/accuracies": 0.5999999642372131, + "rewards/chosen": -0.482421875, + "rewards/margins": 0.162109375, + "rewards/rejected": -0.64453125, "step": 300 }, { - "epoch": 0.15, - "eval_logits/chosen": -0.65625, - "eval_logits/rejected": -0.71875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.651296854019165, - "eval_rewards/accuracies": 0.5148809552192688, - "eval_rewards/chosen": 0.625, - "eval_rewards/margins": 0.1279296875, - "eval_rewards/rejected": 0.494140625, - "eval_runtime": 89.9953, - "eval_samples_per_second": 22.223, - "eval_steps_per_second": 0.467, + "epoch": 0.5888125613346418, + "eval_logits/chosen": -1.1875, + "eval_logits/rejected": -1.234375, + "eval_logps/bottom_tokens": -0.000751495361328125, + "eval_logps/chosen": -438.0, + "eval_logps/rejected": -406.0, + "eval_loss": 0.6303857564926147, + "eval_rewards/accuracies": 0.6626865863800049, + "eval_rewards/chosen": -0.451171875, + "eval_rewards/margins": 0.2275390625, + "eval_rewards/rejected": -0.6796875, + "eval_runtime": 94.2601, + "eval_samples_per_second": 21.218, + "eval_steps_per_second": 0.711, "step": 300 }, { - "epoch": 0.15, - "grad_norm": 9.606536101637007, - "learning_rate": 4.958901185856005e-07, - "logits/chosen": -0.6015625, - "logits/rejected": -0.640625, - "logps/chosen": -2688.0, - "logps/rejected": -2496.0, - "loss": 0.6578, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.69140625, - "rewards/margins": 0.06005859375, - "rewards/rejected": 0.62890625, + "epoch": 0.6084396467124632, + "grad_norm": 3.1412458629194835, + "learning_rate": 1.9891702030116897e-07, + "logits/chosen": -1.140625, + "logits/rejected": -1.2421875, + "logps/bottom_tokens": -0.000728607177734375, + "logps/chosen": -446.0, + "logps/rejected": -358.0, + "loss": 0.6234, + "rewards/accuracies": 0.6949999928474426, + "rewards/chosen": -0.408203125, + "rewards/margins": 0.2451171875, + "rewards/rejected": -0.65625, "step": 310 }, { - "epoch": 0.16, - "grad_norm": 8.306258892059772, - "learning_rate": 4.95080764167289e-07, - "logits/chosen": -0.474609375, - "logits/rejected": -0.4921875, - "logps/chosen": -2400.0, - "logps/rejected": -2336.0, - "loss": 0.6911, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.5859375, - "rewards/margins": 0.042724609375, - "rewards/rejected": 0.54296875, + "epoch": 0.6280667320902846, + "grad_norm": 3.1923082526436986, + "learning_rate": 1.8226368099107792e-07, + "logits/chosen": -1.1640625, + "logits/rejected": -1.2109375, + "logps/bottom_tokens": -0.000823974609375, + "logps/chosen": -424.0, + "logps/rejected": -364.0, + "loss": 0.6241, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.42578125, + "rewards/margins": 0.2216796875, + "rewards/rejected": -0.6484375, "step": 320 }, { - "epoch": 0.16, - "grad_norm": 9.33758707972592, - "learning_rate": 4.941994980268966e-07, - "logits/chosen": -0.640625, - "logits/rejected": -0.65234375, - "logps/chosen": -2008.0, - "logps/rejected": -1832.0, - "loss": 0.6607, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.478515625, - "rewards/margins": 0.11865234375, - "rewards/rejected": 0.361328125, + "epoch": 0.647693817468106, + "grad_norm": 3.064211696764281, + "learning_rate": 1.6592892284269594e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.2109375, + "logps/bottom_tokens": -0.00072479248046875, + "logps/chosen": -408.0, + "logps/rejected": -386.0, + "loss": 0.6224, + "rewards/accuracies": 0.6799999475479126, + "rewards/chosen": -0.431640625, + "rewards/margins": 0.259765625, + "rewards/rejected": -0.69140625, "step": 330 }, { - "epoch": 0.17, - "grad_norm": 8.421773008047236, - "learning_rate": 4.932465787459808e-07, - "logits/chosen": -0.50390625, - "logits/rejected": -0.48828125, - "logps/chosen": -2512.0, - "logps/rejected": -2512.0, - "loss": 0.6597, - "rewards/accuracies": 0.42000001668930054, - "rewards/chosen": 0.56640625, - "rewards/margins": -0.020263671875, - "rewards/rejected": 0.5859375, + "epoch": 0.6673209028459274, + "grad_norm": 3.1791023826814353, + "learning_rate": 1.4998957238429172e-07, + "logits/chosen": -1.21875, + "logits/rejected": -1.2421875, + "logps/bottom_tokens": -0.000789642333984375, + "logps/chosen": -408.0, + "logps/rejected": -380.0, + "loss": 0.6204, + "rewards/accuracies": 0.6300000548362732, + "rewards/chosen": -0.5078125, + "rewards/margins": 0.244140625, + "rewards/rejected": -0.75390625, "step": 340 }, { - "epoch": 0.17, - "grad_norm": 11.142714749304895, - "learning_rate": 4.922222859306005e-07, - "logits/chosen": -0.5, - "logits/rejected": -0.61328125, - "logps/chosen": -2112.0, - "logps/rejected": -1664.0, - "loss": 0.6529, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.46484375, - "rewards/margins": 0.13671875, - "rewards/rejected": 0.328125, + "epoch": 0.6869479882237488, + "grad_norm": 3.295570474728778, + "learning_rate": 1.345205964410517e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.21875, + "logps/bottom_tokens": -0.00087738037109375, + "logps/chosen": -392.0, + "logps/rejected": -372.0, + "loss": 0.627, + "rewards/accuracies": 0.5850000381469727, + "rewards/chosen": -0.49609375, + "rewards/margins": 0.2236328125, + "rewards/rejected": -0.71875, "step": 350 }, { - "epoch": 0.18, - "grad_norm": 9.405996440031476, - "learning_rate": 4.911269201292724e-07, - "logits/chosen": -0.478515625, - "logits/rejected": -0.58203125, - "logps/chosen": -2864.0, - "logps/rejected": -2464.0, - "loss": 0.6625, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.796875, - "rewards/margins": 0.2177734375, - "rewards/rejected": 0.578125, + "epoch": 0.7065750736015701, + "grad_norm": 3.5211819482445184, + "learning_rate": 1.1959474954700665e-07, + "logits/chosen": -1.1796875, + "logits/rejected": -1.21875, + "logps/bottom_tokens": -0.00067138671875, + "logps/chosen": -424.0, + "logps/rejected": -416.0, + "loss": 0.613, + "rewards/accuracies": 0.7049999833106995, + "rewards/chosen": -0.44140625, + "rewards/margins": 0.234375, + "rewards/rejected": -0.67578125, "step": 360 }, { - "epoch": 0.18, - "grad_norm": 9.498724790073537, - "learning_rate": 4.899608027447858e-07, - "logits/chosen": -0.515625, - "logits/rejected": -0.64453125, - "logps/chosen": -2672.0, - "logps/rejected": -2112.0, - "loss": 0.659, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.71875, - "rewards/margins": 0.1923828125, - "rewards/rejected": 0.5234375, + "epoch": 0.7262021589793916, + "grad_norm": 3.3333877037469026, + "learning_rate": 1.0528223176192615e-07, + "logits/chosen": -1.1953125, + "logits/rejected": -1.234375, + "logps/bottom_tokens": -0.0006866455078125, + "logps/chosen": -442.0, + "logps/rejected": -398.0, + "loss": 0.6218, + "rewards/accuracies": 0.6250000596046448, + "rewards/chosen": -0.5234375, + "rewards/margins": 0.2138671875, + "rewards/rejected": -0.73828125, "step": 370 }, { - "epoch": 0.19, - "grad_norm": 9.511360912470519, - "learning_rate": 4.887242759398945e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.6796875, - "logps/chosen": -2416.0, - "logps/rejected": -2008.0, - "loss": 0.6449, - "rewards/accuracies": 0.64000004529953, - "rewards/chosen": 0.60546875, - "rewards/margins": 0.2109375, - "rewards/rejected": 0.39453125, + "epoch": 0.745829244357213, + "grad_norm": 3.3039144354882657, + "learning_rate": 9.16503585025567e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.21875, + "logps/bottom_tokens": -0.0009765625, + "logps/chosen": -420.0, + "logps/rejected": -412.0, + "loss": 0.6279, + "rewards/accuracies": 0.6499999761581421, + "rewards/chosen": -0.5, + "rewards/margins": 0.2041015625, + "rewards/rejected": -0.703125, "step": 380 }, { - "epoch": 0.19, - "grad_norm": 9.32007338457764, - "learning_rate": 4.874177025369207e-07, - "logits/chosen": -0.46484375, - "logits/rejected": -0.5859375, - "logps/chosen": -2800.0, - "logps/rejected": -2256.0, - "loss": 0.6507, - "rewards/accuracies": 0.7400000095367432, - "rewards/chosen": 0.7890625, - "rewards/margins": 0.275390625, - "rewards/rejected": 0.51171875, + "epoch": 0.7654563297350343, + "grad_norm": 3.460907844274303, + "learning_rate": 7.876324394107017e-08, + "logits/chosen": -1.15625, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.0006866455078125, + "logps/chosen": -442.0, + "logps/rejected": -418.0, + "loss": 0.6289, + "rewards/accuracies": 0.6350000500679016, + "rewards/chosen": -0.50390625, + "rewards/margins": 0.2255859375, + "rewards/rejected": -0.7265625, "step": 390 }, { - "epoch": 0.2, - "grad_norm": 9.245573199091506, - "learning_rate": 4.860414659112948e-07, - "logits/chosen": -0.55859375, - "logits/rejected": -0.578125, - "logps/chosen": -2176.0, - "logps/rejected": -1936.0, - "loss": 0.6371, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.6328125, - "rewards/margins": 0.1474609375, - "rewards/rejected": 0.48828125, + "epoch": 0.7850834151128557, + "grad_norm": 3.2842912290921897, + "learning_rate": 6.668149945978201e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.2265625, + "logps/bottom_tokens": -0.000720977783203125, + "logps/chosen": -440.0, + "logps/rejected": -420.0, + "loss": 0.6102, + "rewards/accuracies": 0.6700000166893005, + "rewards/chosen": -0.482421875, + "rewards/margins": 0.291015625, + "rewards/rejected": -0.7734375, "step": 400 }, { - "epoch": 0.2, - "eval_logits/chosen": -0.66796875, - "eval_logits/rejected": -0.7265625, - "eval_logps/chosen": -2304.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.649093747138977, - "eval_rewards/accuracies": 0.5595238208770752, - "eval_rewards/chosen": 0.65625, - "eval_rewards/margins": 0.15234375, - "eval_rewards/rejected": 0.5, - "eval_runtime": 90.1263, - "eval_samples_per_second": 22.191, - "eval_steps_per_second": 0.466, + "epoch": 0.7850834151128557, + "eval_logits/chosen": -1.1875, + "eval_logits/rejected": -1.234375, + "eval_logps/bottom_tokens": -0.000667572021484375, + "eval_logps/chosen": -444.0, + "eval_logps/rejected": -414.0, + "eval_loss": 0.6267920136451721, + "eval_rewards/accuracies": 0.6567164063453674, + "eval_rewards/chosen": -0.50390625, + "eval_rewards/margins": 0.2578125, + "eval_rewards/rejected": -0.76171875, + "eval_runtime": 94.2884, + "eval_samples_per_second": 21.212, + "eval_steps_per_second": 0.711, "step": 400 }, { - "epoch": 0.2, - "grad_norm": 8.878063287442174, - "learning_rate": 4.845959698790652e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.6171875, - "logps/chosen": -2128.0, - "logps/rejected": -1864.0, - "loss": 0.6683, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.59765625, - "rewards/margins": 0.0791015625, - "rewards/rejected": 0.515625, + "epoch": 0.8047105004906772, + "grad_norm": 3.3007954730404303, + "learning_rate": 5.546194858038072e-08, + "logits/chosen": -1.171875, + "logits/rejected": -1.21875, + "logps/bottom_tokens": -0.000797271728515625, + "logps/chosen": -416.0, + "logps/rejected": -374.0, + "loss": 0.6227, + "rewards/accuracies": 0.6699999570846558, + "rewards/chosen": -0.51171875, + "rewards/margins": 0.28515625, + "rewards/rejected": -0.796875, "step": 410 }, { - "epoch": 0.21, - "grad_norm": 9.071445660844004, - "learning_rate": 4.830816385784104e-07, - "logits/chosen": -0.5078125, - "logits/rejected": -0.55078125, - "logps/chosen": -2256.0, - "logps/rejected": -1872.0, - "loss": 0.6581, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.189453125, - "rewards/rejected": 0.44140625, + "epoch": 0.8243375858684985, + "grad_norm": 3.9743687860867185, + "learning_rate": 4.5157359708432626e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.2265625, + "logps/bottom_tokens": -0.000751495361328125, + "logps/chosen": -394.0, + "logps/rejected": -412.0, + "loss": 0.6205, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.474609375, + "rewards/margins": 0.234375, + "rewards/rejected": -0.70703125, "step": 420 }, { - "epoch": 0.21, - "grad_norm": 10.638539511395816, - "learning_rate": 4.814989163451889e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.546875, - "logps/chosen": -1840.0, - "logps/rejected": -1864.0, - "loss": 0.6701, - "rewards/accuracies": 0.5200001001358032, - "rewards/chosen": 0.52734375, - "rewards/margins": 0.03271484375, - "rewards/rejected": 0.4921875, + "epoch": 0.8439646712463199, + "grad_norm": 3.1969688623984633, + "learning_rate": 3.581619795012874e-08, + "logits/chosen": -1.1796875, + "logits/rejected": -1.1875, + "logps/bottom_tokens": -0.000762939453125, + "logps/chosen": -400.0, + "logps/rejected": -404.0, + "loss": 0.6208, + "rewards/accuracies": 0.675000011920929, + "rewards/chosen": -0.474609375, + "rewards/margins": 0.267578125, + "rewards/rejected": -0.7421875, "step": 430 }, { - "epoch": 0.22, - "grad_norm": 9.196165350919857, - "learning_rate": 4.798482675825602e-07, - "logits/chosen": -0.5546875, - "logits/rejected": -0.62109375, - "logps/chosen": -2176.0, - "logps/rejected": -2128.0, - "loss": 0.6605, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.5703125, - "rewards/margins": 0.06591796875, - "rewards/rejected": 0.50390625, + "epoch": 0.8635917566241413, + "grad_norm": 3.705663203159775, + "learning_rate": 2.748239716854589e-08, + "logits/chosen": -1.2109375, + "logits/rejected": -1.1953125, + "logps/bottom_tokens": -0.00074005126953125, + "logps/chosen": -424.0, + "logps/rejected": -420.0, + "loss": 0.6398, + "rewards/accuracies": 0.5849999785423279, + "rewards/chosen": -0.51171875, + "rewards/margins": 0.1435546875, + "rewards/rejected": -0.65625, "step": 440 }, { - "epoch": 0.22, - "grad_norm": 8.671692805246133, - "learning_rate": 4.781301766247215e-07, - "logits/chosen": -0.64453125, - "logits/rejected": -0.640625, - "logps/chosen": -2040.0, - "logps/rejected": -2128.0, - "loss": 0.6486, - "rewards/accuracies": 0.6200000643730164, - "rewards/chosen": 0.494140625, - "rewards/margins": 0.0693359375, - "rewards/rejected": 0.42578125, + "epoch": 0.8832188420019627, + "grad_norm": 3.9792023056235455, + "learning_rate": 2.0195153351498323e-08, + "logits/chosen": -1.1796875, + "logits/rejected": -1.2109375, + "logps/bottom_tokens": -0.0007171630859375, + "logps/chosen": -432.0, + "logps/rejected": -420.0, + "loss": 0.611, + "rewards/accuracies": 0.6149999499320984, + "rewards/chosen": -0.53125, + "rewards/margins": 0.2421875, + "rewards/rejected": -0.7734375, "step": 450 }, { - "epoch": 0.23, - "grad_norm": 12.453319112805517, - "learning_rate": 4.7634514759479275e-07, - "logits/chosen": -0.6171875, - "logits/rejected": -0.671875, - "logps/chosen": -2096.0, - "logps/rejected": -1800.0, - "loss": 0.6577, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.5078125, - "rewards/margins": 0.1748046875, - "rewards/rejected": 0.33203125, + "epoch": 0.9028459273797841, + "grad_norm": 3.598443005581659, + "learning_rate": 1.3988740262822846e-08, + "logits/chosen": -1.1953125, + "logits/rejected": -1.203125, + "logps/bottom_tokens": -0.00067138671875, + "logps/chosen": -428.0, + "logps/rejected": -410.0, + "loss": 0.6138, + "rewards/accuracies": 0.6349999904632568, + "rewards/chosen": -0.490234375, + "rewards/margins": 0.216796875, + "rewards/rejected": -0.70703125, "step": 460 }, { - "epoch": 0.23, - "grad_norm": 10.997561643902582, - "learning_rate": 4.7449370425689694e-07, - "logits/chosen": -0.56640625, - "logits/rejected": -0.6171875, - "logps/chosen": -2240.0, - "logps/rejected": -2008.0, - "loss": 0.6317, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.5390625, - "rewards/margins": 0.2255859375, - "rewards/rejected": 0.3125, + "epoch": 0.9224730127576055, + "grad_norm": 3.423571391469107, + "learning_rate": 8.892348244137788e-09, + "logits/chosen": -1.1875, + "logits/rejected": -1.2421875, + "logps/bottom_tokens": -0.00066375732421875, + "logps/chosen": -474.0, + "logps/rejected": -444.0, + "loss": 0.6106, + "rewards/accuracies": 0.6299999952316284, + "rewards/chosen": -0.494140625, + "rewards/margins": 0.271484375, + "rewards/rejected": -0.765625, "step": 470 }, { - "epoch": 0.24, - "grad_norm": 7.512807533628497, - "learning_rate": 4.7257638986247684e-07, - "logits/chosen": -0.466796875, - "logits/rejected": -0.6640625, - "logps/chosen": -3024.0, - "logps/rejected": -2192.0, - "loss": 0.6469, - "rewards/accuracies": 0.7199999690055847, - "rewards/chosen": 0.72265625, - "rewards/margins": 0.380859375, - "rewards/rejected": 0.34375, + "epoch": 0.9421000981354269, + "grad_norm": 3.1667123948106584, + "learning_rate": 4.929946925231076e-09, + "logits/chosen": -1.1328125, + "logits/rejected": -1.171875, + "logps/bottom_tokens": -0.000850677490234375, + "logps/chosen": -410.0, + "logps/rejected": -412.0, + "loss": 0.6203, + "rewards/accuracies": 0.6049999594688416, + "rewards/chosen": -0.48046875, + "rewards/margins": 0.1953125, + "rewards/rejected": -0.67578125, "step": 480 }, { - "epoch": 0.24, - "grad_norm": 13.645526383003268, - "learning_rate": 4.705937669908943e-07, - "logits/chosen": -0.4921875, - "logits/rejected": -0.6015625, - "logps/chosen": -2624.0, - "logps/rejected": -2240.0, - "loss": 0.6384, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.63671875, - "rewards/margins": 0.2138671875, - "rewards/rejected": 0.421875, + "epoch": 0.9617271835132483, + "grad_norm": 3.5902417143779024, + "learning_rate": 2.1201724887858484e-09, + "logits/chosen": -1.1640625, + "logits/rejected": -1.171875, + "logps/bottom_tokens": -0.000732421875, + "logps/chosen": -422.0, + "logps/rejected": -412.0, + "loss": 0.6235, + "rewards/accuracies": 0.5949999690055847, + "rewards/chosen": -0.5390625, + "rewards/margins": 0.2265625, + "rewards/rejected": -0.765625, "step": 490 }, { - "epoch": 0.25, - "grad_norm": 7.909000541119995, - "learning_rate": 4.685464173843574e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.6484375, - "logps/chosen": -2192.0, - "logps/rejected": -1792.0, - "loss": 0.6206, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.5703125, - "rewards/margins": 0.248046875, - "rewards/rejected": 0.322265625, + "epoch": 0.9813542688910697, + "grad_norm": 3.3154898943344704, + "learning_rate": 4.762400196664518e-10, + "logits/chosen": -1.1484375, + "logits/rejected": -1.1953125, + "logps/bottom_tokens": -0.0006256103515625, + "logps/chosen": -428.0, + "logps/rejected": -388.0, + "loss": 0.6084, + "rewards/accuracies": 0.625, + "rewards/chosen": -0.578125, + "rewards/margins": 0.1787109375, + "rewards/rejected": -0.75390625, "step": 500 }, { - "epoch": 0.25, - "eval_logits/chosen": -0.65625, - "eval_logits/rejected": -0.71484375, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6465859413146973, - "eval_rewards/accuracies": 0.5952380895614624, - "eval_rewards/chosen": 0.5390625, - "eval_rewards/margins": 0.14453125, - "eval_rewards/rejected": 0.39453125, - "eval_runtime": 89.3629, - "eval_samples_per_second": 22.381, - "eval_steps_per_second": 0.47, + "epoch": 0.9813542688910697, + "eval_logits/chosen": -1.1953125, + "eval_logits/rejected": -1.2421875, + "eval_logps/bottom_tokens": -0.000743865966796875, + "eval_logps/chosen": -446.0, + "eval_logps/rejected": -416.0, + "eval_loss": 0.6259472370147705, + "eval_rewards/accuracies": 0.6567164659500122, + "eval_rewards/chosen": -0.5234375, + "eval_rewards/margins": 0.26171875, + "eval_rewards/rejected": -0.78515625, + "eval_runtime": 94.3436, + "eval_samples_per_second": 21.199, + "eval_steps_per_second": 0.71, "step": 500 }, { - "epoch": 0.25, - "grad_norm": 9.183157685366133, - "learning_rate": 4.6643494177722574e-07, - "logits/chosen": -0.5546875, - "logits/rejected": -0.58984375, - "logps/chosen": -2160.0, - "logps/rejected": -1864.0, - "loss": 0.6507, - "rewards/accuracies": 0.6200000643730164, - "rewards/chosen": 0.51953125, - "rewards/margins": 0.1279296875, - "rewards/rejected": 0.392578125, - "step": 510 - }, - { - "epoch": 0.26, - "grad_norm": 10.502858947018517, - "learning_rate": 4.6425995971974265e-07, - "logits/chosen": -0.6015625, - "logits/rejected": -0.75, - "logps/chosen": -2448.0, - "logps/rejected": -1856.0, - "loss": 0.6657, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.46875, - "rewards/margins": 0.11279296875, - "rewards/rejected": 0.357421875, - "step": 520 - }, - { - "epoch": 0.26, - "grad_norm": 10.966207624712743, - "learning_rate": 4.6202210939624607e-07, - "logits/chosen": -0.51953125, - "logits/rejected": -0.546875, - "logps/chosen": -2688.0, - "logps/rejected": -2528.0, - "loss": 0.6652, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.64453125, - "rewards/margins": 0.041015625, - "rewards/rejected": 0.60546875, - "step": 530 - }, - { - "epoch": 0.26, - "grad_norm": 8.724395310063244, - "learning_rate": 4.597220474379125e-07, - "logits/chosen": -0.57421875, - "logits/rejected": -0.6484375, - "logps/chosen": -2448.0, - "logps/rejected": -2040.0, - "loss": 0.6592, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.515625, - "rewards/margins": 0.177734375, - "rewards/rejected": 0.333984375, - "step": 540 - }, - { - "epoch": 0.27, - "grad_norm": 8.826948047502222, - "learning_rate": 4.57360448730088e-07, - "logits/chosen": -0.6171875, - "logits/rejected": -0.6953125, - "logps/chosen": -2496.0, - "logps/rejected": -2112.0, - "loss": 0.6483, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.61328125, - "rewards/margins": 0.08837890625, - "rewards/rejected": 0.52734375, - "step": 550 - }, - { - "epoch": 0.27, - "grad_norm": 7.904587345979932, - "learning_rate": 4.549380062142627e-07, - "logits/chosen": -0.66796875, - "logits/rejected": -0.65625, - "logps/chosen": -1960.0, - "logps/rejected": -1928.0, - "loss": 0.6557, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.515625, - "rewards/margins": 0.052001953125, - "rewards/rejected": 0.46484375, - "step": 560 - }, - { - "epoch": 0.28, - "grad_norm": 7.802080225823436, - "learning_rate": 4.524554306847479e-07, - "logits/chosen": -0.6171875, - "logits/rejected": -0.6171875, - "logps/chosen": -2144.0, - "logps/rejected": -2096.0, - "loss": 0.652, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.5625, - "rewards/margins": 0.062255859375, - "rewards/rejected": 0.498046875, - "step": 570 - }, - { - "epoch": 0.28, - "grad_norm": 10.484471048264675, - "learning_rate": 4.499134505801141e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.51953125, - "logps/chosen": -2352.0, - "logps/rejected": -2336.0, - "loss": 0.6459, - "rewards/accuracies": 0.42000001668930054, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.018798828125, - "rewards/rejected": 0.5703125, - "step": 580 - }, - { - "epoch": 0.29, - "grad_norm": 11.582326622430703, - "learning_rate": 4.4731281176945244e-07, - "logits/chosen": -0.5234375, - "logits/rejected": -0.640625, - "logps/chosen": -2768.0, - "logps/rejected": -2208.0, - "loss": 0.6501, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.72265625, - "rewards/margins": 0.1611328125, - "rewards/rejected": 0.55859375, - "step": 590 - }, - { - "epoch": 0.29, - "grad_norm": 8.880977624721396, - "learning_rate": 4.4465427733352124e-07, - "logits/chosen": -0.5078125, - "logits/rejected": -0.55859375, - "logps/chosen": -2320.0, - "logps/rejected": -2080.0, - "loss": 0.686, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.6640625, - "rewards/margins": 0.251953125, - "rewards/rejected": 0.4140625, - "step": 600 - }, - { - "epoch": 0.29, - "eval_logits/chosen": -0.66015625, - "eval_logits/rejected": -0.71875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6446093916893005, - "eval_rewards/accuracies": 0.5714285969734192, - "eval_rewards/chosen": 0.578125, - "eval_rewards/margins": 0.1591796875, - "eval_rewards/rejected": 0.41796875, - "eval_runtime": 89.7151, - "eval_samples_per_second": 22.293, - "eval_steps_per_second": 0.468, - "step": 600 - }, - { - "epoch": 0.3, - "grad_norm": 11.188626521589404, - "learning_rate": 4.4193862734084277e-07, - "logits/chosen": -0.6796875, - "logits/rejected": -0.76171875, - "logps/chosen": -2192.0, - "logps/rejected": -2024.0, - "loss": 0.6552, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.5703125, - "rewards/margins": 0.1181640625, - "rewards/rejected": 0.453125, - "step": 610 - }, - { - "epoch": 0.3, - "grad_norm": 6.697586616076624, - "learning_rate": 4.391666586188145e-07, - "logits/chosen": -0.640625, - "logits/rejected": -0.703125, - "logps/chosen": -2352.0, - "logps/rejected": -2128.0, - "loss": 0.6495, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.55859375, - "rewards/margins": 0.1611328125, - "rewards/rejected": 0.39453125, - "step": 620 - }, - { - "epoch": 0.31, - "grad_norm": 11.772870086073146, - "learning_rate": 4.363391845199045e-07, - "logits/chosen": -0.56640625, - "logits/rejected": -0.59765625, - "logps/chosen": -2432.0, - "logps/rejected": -2272.0, - "loss": 0.625, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.58203125, - "rewards/margins": 0.271484375, - "rewards/rejected": 0.3125, - "step": 630 - }, - { - "epoch": 0.31, - "grad_norm": 8.98000540892343, - "learning_rate": 4.3345703468299634e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.58203125, - "logps/chosen": -2608.0, - "logps/rejected": -2304.0, - "loss": 0.6203, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.56640625, - "rewards/margins": 0.1865234375, - "rewards/rejected": 0.3828125, - "step": 640 - }, - { - "epoch": 0.32, - "grad_norm": 10.179524289387247, - "learning_rate": 4.3052105478995635e-07, - "logits/chosen": -0.53125, - "logits/rejected": -0.71875, - "logps/chosen": -2448.0, - "logps/rejected": -1608.0, - "loss": 0.6391, - "rewards/accuracies": 0.6800000667572021, - "rewards/chosen": 0.55859375, - "rewards/margins": 0.302734375, - "rewards/rejected": 0.25390625, - "step": 650 - }, - { - "epoch": 0.32, - "grad_norm": 12.855522500267961, - "learning_rate": 4.275321063174936e-07, - "logits/chosen": -0.6015625, - "logits/rejected": -0.640625, - "logps/chosen": -1656.0, - "logps/rejected": -1456.0, - "loss": 0.643, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.39453125, - "rewards/margins": 0.09619140625, - "rewards/rejected": 0.298828125, - "step": 660 - }, - { - "epoch": 0.33, - "grad_norm": 7.0575963436949465, - "learning_rate": 4.24491066284384e-07, - "logits/chosen": -0.73046875, - "logits/rejected": -0.734375, - "logps/chosen": -1664.0, - "logps/rejected": -1544.0, - "loss": 0.6358, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.373046875, - "rewards/margins": 0.1484375, - "rewards/rejected": 0.224609375, - "step": 670 - }, - { - "epoch": 0.33, - "grad_norm": 8.261526871746847, - "learning_rate": 4.2139882699413613e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.5625, - "logps/chosen": -2240.0, - "logps/rejected": -2144.0, - "loss": 0.6533, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.6640625, - "rewards/margins": 0.169921875, - "rewards/rejected": 0.494140625, - "step": 680 - }, - { - "epoch": 0.34, - "grad_norm": 9.857235722928268, - "learning_rate": 4.1825629577317024e-07, - "logits/chosen": -0.55859375, - "logits/rejected": -0.63671875, - "logps/chosen": -2288.0, - "logps/rejected": -1960.0, - "loss": 0.6869, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.5859375, - "rewards/margins": 0.1455078125, - "rewards/rejected": 0.439453125, - "step": 690 - }, - { - "epoch": 0.34, - "grad_norm": 9.152797505715435, - "learning_rate": 4.1506439470459056e-07, - "logits/chosen": -0.51171875, - "logits/rejected": -0.609375, - "logps/chosen": -2496.0, - "logps/rejected": -2208.0, - "loss": 0.6459, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.58203125, - "rewards/margins": 0.166015625, - "rewards/rejected": 0.416015625, - "step": 700 - }, - { - "epoch": 0.34, - "eval_logits/chosen": -0.62890625, - "eval_logits/rejected": -0.6875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2032.0, - "eval_loss": 0.6448671817779541, - "eval_rewards/accuracies": 0.601190447807312, - "eval_rewards/chosen": 0.55078125, - "eval_rewards/margins": 0.1884765625, - "eval_rewards/rejected": 0.36328125, - "eval_runtime": 90.547, - "eval_samples_per_second": 22.088, - "eval_steps_per_second": 0.464, - "step": 700 - }, - { - "epoch": 0.35, - "grad_norm": 9.410823784289315, - "learning_rate": 4.1182406035762684e-07, - "logits/chosen": -0.494140625, - "logits/rejected": -0.5234375, - "logps/chosen": -2288.0, - "logps/rejected": -2016.0, - "loss": 0.6429, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.44140625, - "rewards/margins": 0.1240234375, - "rewards/rejected": 0.31640625, - "step": 710 - }, - { - "epoch": 0.35, - "grad_norm": 9.75600415982287, - "learning_rate": 4.085362435128262e-07, - "logits/chosen": -0.5390625, - "logits/rejected": -0.68359375, - "logps/chosen": -2768.0, - "logps/rejected": -2224.0, - "loss": 0.6704, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.6328125, - "rewards/margins": 0.2470703125, - "rewards/rejected": 0.38671875, - "step": 720 - }, - { - "epoch": 0.36, - "grad_norm": 7.3325000524472, - "learning_rate": 4.0520190888307413e-07, - "logits/chosen": -0.61328125, - "logits/rejected": -0.66796875, - "logps/chosen": -2544.0, - "logps/rejected": -2304.0, - "loss": 0.647, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.63671875, - "rewards/margins": 0.232421875, - "rewards/rejected": 0.40625, - "step": 730 - }, - { - "epoch": 0.36, - "grad_norm": 10.182221180599024, - "learning_rate": 4.0182203483052825e-07, - "logits/chosen": -0.546875, - "logits/rejected": -0.68359375, - "logps/chosen": -2864.0, - "logps/rejected": -2288.0, - "loss": 0.6299, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.7265625, - "rewards/margins": 0.248046875, - "rewards/rejected": 0.4765625, - "step": 740 - }, - { - "epoch": 0.37, - "grad_norm": 7.856122064768362, - "learning_rate": 3.983976130795467e-07, - "logits/chosen": -0.51953125, - "logits/rejected": -0.61328125, - "logps/chosen": -2576.0, - "logps/rejected": -2128.0, - "loss": 0.6215, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.71484375, - "rewards/margins": 0.275390625, - "rewards/rejected": 0.439453125, - "step": 750 - }, - { - "epoch": 0.37, - "grad_norm": 8.363791551838782, - "learning_rate": 3.949296484256959e-07, - "logits/chosen": -0.5859375, - "logits/rejected": -0.64453125, - "logps/chosen": -2128.0, - "logps/rejected": -2008.0, - "loss": 0.6718, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.490234375, - "rewards/margins": 0.0625, - "rewards/rejected": 0.427734375, - "step": 760 - }, - { - "epoch": 0.38, - "grad_norm": 9.779529809569974, - "learning_rate": 3.9141915844092285e-07, - "logits/chosen": -0.546875, - "logits/rejected": -0.6796875, - "logps/chosen": -2208.0, - "logps/rejected": -1912.0, - "loss": 0.6547, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.55078125, - "rewards/margins": 0.23046875, - "rewards/rejected": 0.3203125, - "step": 770 - }, - { - "epoch": 0.38, - "grad_norm": 8.698999015594477, - "learning_rate": 3.8786717317497875e-07, - "logits/chosen": -0.4921875, - "logits/rejected": -0.609375, - "logps/chosen": -2432.0, - "logps/rejected": -2096.0, - "loss": 0.6326, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.6484375, - "rewards/margins": 0.353515625, - "rewards/rejected": 0.296875, - "step": 780 - }, - { - "epoch": 0.39, - "grad_norm": 8.385397043034576, - "learning_rate": 3.842747348531813e-07, - "logits/chosen": -0.53515625, - "logits/rejected": -0.58203125, - "logps/chosen": -2192.0, - "logps/rejected": -1880.0, - "loss": 0.6337, - "rewards/accuracies": 0.64000004529953, - "rewards/chosen": 0.578125, - "rewards/margins": 0.23828125, - "rewards/rejected": 0.337890625, - "step": 790 - }, - { - "epoch": 0.39, - "grad_norm": 8.23475490511833, - "learning_rate": 3.806428975706042e-07, - "logits/chosen": -0.62890625, - "logits/rejected": -0.65625, - "logps/chosen": -2352.0, - "logps/rejected": -2112.0, - "loss": 0.6458, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.2119140625, - "rewards/rejected": 0.41796875, - "step": 800 - }, - { - "epoch": 0.39, - "eval_logits/chosen": -0.640625, - "eval_logits/rejected": -0.6953125, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6421015858650208, - "eval_rewards/accuracies": 0.5773809552192688, - "eval_rewards/chosen": 0.55859375, - "eval_rewards/margins": 0.1708984375, - "eval_rewards/rejected": 0.38671875, - "eval_runtime": 86.7441, - "eval_samples_per_second": 23.056, - "eval_steps_per_second": 0.484, - "step": 800 - }, - { - "epoch": 0.4, - "grad_norm": 9.096444731037868, - "learning_rate": 3.769727269827843e-07, - "logits/chosen": -0.51171875, - "logits/rejected": -0.62109375, - "logps/chosen": -1992.0, - "logps/rejected": -1672.0, - "loss": 0.6547, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.337890625, - "rewards/margins": 0.0634765625, - "rewards/rejected": 0.275390625, - "step": 810 - }, - { - "epoch": 0.4, - "grad_norm": 9.063205334817118, - "learning_rate": 3.7326529999303633e-07, - "logits/chosen": -0.55859375, - "logits/rejected": -0.6484375, - "logps/chosen": -2608.0, - "logps/rejected": -2128.0, - "loss": 0.6512, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.56640625, - "rewards/margins": 0.1396484375, - "rewards/rejected": 0.42578125, - "step": 820 - }, - { - "epoch": 0.41, - "grad_norm": 8.96580135461996, - "learning_rate": 3.6952170443646737e-07, - "logits/chosen": -0.51171875, - "logits/rejected": -0.63671875, - "logps/chosen": -2752.0, - "logps/rejected": -2064.0, - "loss": 0.6351, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.73828125, - "rewards/margins": 0.248046875, - "rewards/rejected": 0.4921875, - "step": 830 - }, - { - "epoch": 0.41, - "grad_norm": 10.61500715901887, - "learning_rate": 3.6574303876078366e-07, - "logits/chosen": -0.5234375, - "logits/rejected": -0.6640625, - "logps/chosen": -2384.0, - "logps/rejected": -1816.0, - "loss": 0.6429, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.734375, - "rewards/margins": 0.30859375, - "rewards/rejected": 0.42578125, - "step": 840 - }, - { - "epoch": 0.42, - "grad_norm": 8.368580291250769, - "learning_rate": 3.619304117039835e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.546875, - "logps/chosen": -2352.0, - "logps/rejected": -2240.0, - "loss": 0.6492, - "rewards/accuracies": 0.440000057220459, - "rewards/chosen": 0.75, - "rewards/margins": 0.12353515625, - "rewards/rejected": 0.625, - "step": 850 - }, - { - "epoch": 0.42, - "grad_norm": 6.994507159815261, - "learning_rate": 3.5808494196903117e-07, - "logits/chosen": -0.55078125, - "logits/rejected": -0.62890625, - "logps/chosen": -2608.0, - "logps/rejected": -2128.0, - "loss": 0.6169, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.87109375, - "rewards/margins": 0.400390625, - "rewards/rejected": 0.47265625, - "step": 860 - }, - { - "epoch": 0.43, - "grad_norm": 8.69645008304575, - "learning_rate": 3.542077578956057e-07, - "logits/chosen": -0.51953125, - "logits/rejected": -0.578125, - "logps/chosen": -2416.0, - "logps/rejected": -2192.0, - "loss": 0.6549, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.7421875, - "rewards/margins": 0.21484375, - "rewards/rejected": 0.52734375, - "step": 870 - }, - { - "epoch": 0.43, - "grad_norm": 6.41242543011292, - "learning_rate": 3.5029999712902387e-07, - "logits/chosen": -0.45703125, - "logits/rejected": -0.53125, - "logps/chosen": -2608.0, - "logps/rejected": -2352.0, - "loss": 0.6401, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.81640625, - "rewards/margins": 0.15234375, - "rewards/rejected": 0.6640625, - "step": 880 - }, - { - "epoch": 0.44, - "grad_norm": 7.750038189045204, - "learning_rate": 3.463628062864312e-07, - "logits/chosen": -0.470703125, - "logits/rejected": -0.58984375, - "logps/chosen": -2688.0, - "logps/rejected": -2128.0, - "loss": 0.622, - "rewards/accuracies": 0.6599999666213989, - "rewards/chosen": 0.8828125, - "rewards/margins": 0.28515625, - "rewards/rejected": 0.59765625, - "step": 890 - }, - { - "epoch": 0.44, - "grad_norm": 7.569717070191467, - "learning_rate": 3.4239734062036067e-07, - "logits/chosen": -0.5, - "logits/rejected": -0.58203125, - "logps/chosen": -2512.0, - "logps/rejected": -2224.0, - "loss": 0.6451, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.625, - "rewards/margins": 0.06591796875, - "rewards/rejected": 0.55859375, - "step": 900 - }, - { - "epoch": 0.44, - "eval_logits/chosen": -0.61328125, - "eval_logits/rejected": -0.671875, - "eval_logps/chosen": -2304.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.6398203372955322, - "eval_rewards/accuracies": 0.5684523582458496, - "eval_rewards/chosen": 0.7109375, - "eval_rewards/margins": 0.20703125, - "eval_rewards/rejected": 0.50390625, - "eval_runtime": 86.5756, - "eval_samples_per_second": 23.101, - "eval_steps_per_second": 0.485, - "step": 900 - }, - { - "epoch": 0.45, - "grad_norm": 8.632582600123046, - "learning_rate": 3.3840476367975874e-07, - "logits/chosen": -0.515625, - "logits/rejected": -0.62890625, - "logps/chosen": -2432.0, - "logps/rejected": -1968.0, - "loss": 0.6196, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.78515625, - "rewards/margins": 0.328125, - "rewards/rejected": 0.45703125, - "step": 910 - }, - { - "epoch": 0.45, - "grad_norm": 8.506506212272358, - "learning_rate": 3.343862469685755e-07, - "logits/chosen": -0.6015625, - "logits/rejected": -0.68359375, - "logps/chosen": -1960.0, - "logps/rejected": -1776.0, - "loss": 0.6652, - "rewards/accuracies": 0.40000003576278687, - "rewards/chosen": 0.53125, - "rewards/margins": 0.045166015625, - "rewards/rejected": 0.486328125, - "step": 920 - }, - { - "epoch": 0.46, - "grad_norm": 11.75407958457082, - "learning_rate": 3.3034296960202195e-07, - "logits/chosen": -0.45703125, - "logits/rejected": -0.5703125, - "logps/chosen": -2656.0, - "logps/rejected": -2192.0, - "loss": 0.6409, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.7734375, - "rewards/margins": 0.232421875, - "rewards/rejected": 0.54296875, - "step": 930 - }, - { - "epoch": 0.46, - "grad_norm": 8.239692706862764, - "learning_rate": 3.2627611796059283e-07, - "logits/chosen": -0.53515625, - "logits/rejected": -0.5625, - "logps/chosen": -2736.0, - "logps/rejected": -2400.0, - "loss": 0.6449, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.91796875, - "rewards/margins": 0.2333984375, - "rewards/rejected": 0.68359375, - "step": 940 - }, - { - "epoch": 0.47, - "grad_norm": 8.160298452954589, - "learning_rate": 3.221868853419587e-07, - "logits/chosen": -0.46875, - "logits/rejected": -0.55078125, - "logps/chosen": -2768.0, - "logps/rejected": -2272.0, - "loss": 0.6157, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.98046875, - "rewards/margins": 0.302734375, - "rewards/rejected": 0.67578125, - "step": 950 - }, - { - "epoch": 0.47, - "grad_norm": 9.590783642097492, - "learning_rate": 3.1807647161082797e-07, - "logits/chosen": -0.47265625, - "logits/rejected": -0.5625, - "logps/chosen": -2656.0, - "logps/rejected": -2336.0, - "loss": 0.6511, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.859375, - "rewards/margins": 0.16796875, - "rewards/rejected": 0.69140625, - "step": 960 - }, - { - "epoch": 0.48, - "grad_norm": 7.99282237541914, - "learning_rate": 3.139460828468815e-07, - "logits/chosen": -0.44140625, - "logits/rejected": -0.494140625, - "logps/chosen": -1976.0, - "logps/rejected": -1776.0, - "loss": 0.6747, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.6328125, - "rewards/margins": 0.166015625, - "rewards/rejected": 0.466796875, - "step": 970 - }, - { - "epoch": 0.48, - "grad_norm": 12.319935997541707, - "learning_rate": 3.097969309908847e-07, - "logits/chosen": -0.60546875, - "logits/rejected": -0.5390625, - "logps/chosen": -1728.0, - "logps/rejected": -2024.0, - "loss": 0.6457, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.5234375, - "rewards/margins": 0.03857421875, - "rewards/rejected": 0.486328125, - "step": 980 - }, - { - "epoch": 0.49, - "grad_norm": 9.488759901702995, - "learning_rate": 3.056302334890786e-07, - "logits/chosen": -0.53515625, - "logits/rejected": -0.62890625, - "logps/chosen": -2400.0, - "logps/rejected": -2096.0, - "loss": 0.6284, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.83203125, - "rewards/margins": 0.1953125, - "rewards/rejected": 0.63671875, - "step": 990 - }, - { - "epoch": 0.49, - "grad_norm": 7.899477826304579, - "learning_rate": 3.01447212935957e-07, - "logits/chosen": -0.5234375, - "logits/rejected": -0.609375, - "logps/chosen": -2432.0, - "logps/rejected": -2064.0, - "loss": 0.6213, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.8203125, - "rewards/margins": 0.27734375, - "rewards/rejected": 0.54296875, - "step": 1000 - }, - { - "epoch": 0.49, - "eval_logits/chosen": -0.6015625, - "eval_logits/rejected": -0.66015625, - "eval_logps/chosen": -2304.0, - "eval_logps/rejected": -2008.0, - "eval_loss": 0.6406640410423279, - "eval_rewards/accuracies": 0.5714285969734192, - "eval_rewards/chosen": 0.7734375, - "eval_rewards/margins": 0.201171875, - "eval_rewards/rejected": 0.57421875, - "eval_runtime": 86.4608, - "eval_samples_per_second": 23.132, - "eval_steps_per_second": 0.486, - "step": 1000 - }, - { - "epoch": 0.5, - "grad_norm": 8.55209914949281, - "learning_rate": 2.9724909671553134e-07, - "logits/chosen": -0.56640625, - "logits/rejected": -0.6015625, - "logps/chosen": -2040.0, - "logps/rejected": -1952.0, - "loss": 0.6205, - "rewards/accuracies": 0.5399999618530273, - "rewards/chosen": 0.66796875, - "rewards/margins": 0.11328125, - "rewards/rejected": 0.5546875, - "step": 1010 - }, - { - "epoch": 0.5, - "grad_norm": 7.990632543907024, - "learning_rate": 2.930371166411915e-07, - "logits/chosen": -0.458984375, - "logits/rejected": -0.53515625, - "logps/chosen": -2912.0, - "logps/rejected": -2672.0, - "loss": 0.6432, - "rewards/accuracies": 0.6200000643730164, - "rewards/chosen": 0.94921875, - "rewards/margins": 0.173828125, - "rewards/rejected": 0.77734375, - "step": 1020 - }, - { - "epoch": 0.51, - "grad_norm": 8.428930236764653, - "learning_rate": 2.888125085942664e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.578125, - "logps/chosen": -1992.0, - "logps/rejected": -1808.0, - "loss": 0.6492, - "rewards/accuracies": 0.6200000643730164, - "rewards/chosen": 0.62109375, - "rewards/margins": 0.1728515625, - "rewards/rejected": 0.447265625, - "step": 1030 - }, - { - "epoch": 0.51, - "grad_norm": 9.162488652610268, - "learning_rate": 2.845765121613912e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.62109375, - "logps/chosen": -2368.0, - "logps/rejected": -2048.0, - "loss": 0.6406, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.7265625, - "rewards/margins": 0.1865234375, - "rewards/rejected": 0.54296875, - "step": 1040 - }, - { - "epoch": 0.52, - "grad_norm": 7.430844898047478, - "learning_rate": 2.803303702707869e-07, - "logits/chosen": -0.56640625, - "logits/rejected": -0.703125, - "logps/chosen": -2240.0, - "logps/rejected": -1752.0, - "loss": 0.6623, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.578125, - "rewards/margins": 0.224609375, - "rewards/rejected": 0.3515625, - "step": 1050 - }, - { - "epoch": 0.52, - "grad_norm": 10.51154519002261, - "learning_rate": 2.760753288275598e-07, - "logits/chosen": -0.58203125, - "logits/rejected": -0.640625, - "logps/chosen": -2544.0, - "logps/rejected": -2320.0, - "loss": 0.6646, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.74609375, - "rewards/margins": 0.259765625, - "rewards/rejected": 0.484375, - "step": 1060 - }, - { - "epoch": 0.53, - "grad_norm": 7.047353993767992, - "learning_rate": 2.718126363481276e-07, - "logits/chosen": -0.59375, - "logits/rejected": -0.78125, - "logps/chosen": -2720.0, - "logps/rejected": -1832.0, - "loss": 0.6346, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.7109375, - "rewards/margins": 0.341796875, - "rewards/rejected": 0.37109375, - "step": 1070 - }, - { - "epoch": 0.53, - "grad_norm": 8.420763714981168, - "learning_rate": 2.675435435938788e-07, - "logits/chosen": -0.59765625, - "logits/rejected": -0.56640625, - "logps/chosen": -1784.0, - "logps/rejected": -1880.0, - "loss": 0.6347, - "rewards/accuracies": 0.47999995946884155, - "rewards/chosen": 0.39453125, - "rewards/margins": 0.039306640625, - "rewards/rejected": 0.353515625, - "step": 1080 - }, - { - "epoch": 0.53, - "grad_norm": 8.77849576592234, - "learning_rate": 2.63269303204174e-07, - "logits/chosen": -0.48046875, - "logits/rejected": -0.5546875, - "logps/chosen": -2656.0, - "logps/rejected": -2352.0, - "loss": 0.6426, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.6640625, - "rewards/margins": 0.275390625, - "rewards/rejected": 0.38671875, - "step": 1090 - }, - { - "epoch": 0.54, - "grad_norm": 8.92354118074767, - "learning_rate": 2.5899116932879534e-07, - "logits/chosen": -0.4765625, - "logits/rejected": -0.5703125, - "logps/chosen": -2496.0, - "logps/rejected": -2144.0, - "loss": 0.6313, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.58203125, - "rewards/margins": 0.2353515625, - "rewards/rejected": 0.345703125, - "step": 1100 - }, - { - "epoch": 0.54, - "eval_logits/chosen": -0.609375, - "eval_logits/rejected": -0.66796875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2032.0, - "eval_loss": 0.638671875, - "eval_rewards/accuracies": 0.5892857313156128, - "eval_rewards/chosen": 0.5390625, - "eval_rewards/margins": 0.1806640625, - "eval_rewards/rejected": 0.35546875, - "eval_runtime": 86.566, - "eval_samples_per_second": 23.104, - "eval_steps_per_second": 0.485, - "step": 1100 - }, - { - "epoch": 0.54, - "grad_norm": 8.680098424161624, - "learning_rate": 2.5471039725995345e-07, - "logits/chosen": -0.5078125, - "logits/rejected": -0.5234375, - "logps/chosen": -2224.0, - "logps/rejected": -2096.0, - "loss": 0.6539, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.259765625, - "rewards/rejected": 0.330078125, - "step": 1110 - }, - { - "epoch": 0.55, - "grad_norm": 12.234325062863915, - "learning_rate": 2.504282430639594e-07, - "logits/chosen": -0.466796875, - "logits/rejected": -0.59765625, - "logps/chosen": -2560.0, - "logps/rejected": -2064.0, - "loss": 0.6367, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.56640625, - "rewards/margins": 0.197265625, - "rewards/rejected": 0.3671875, - "step": 1120 - }, - { - "epoch": 0.55, - "grad_norm": 9.71309302276875, - "learning_rate": 2.4614596321266836e-07, - "logits/chosen": -0.60546875, - "logits/rejected": -0.625, - "logps/chosen": -2352.0, - "logps/rejected": -2272.0, - "loss": 0.632, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.45703125, - "rewards/margins": 0.12451171875, - "rewards/rejected": 0.33203125, - "step": 1130 - }, - { - "epoch": 0.56, - "grad_norm": 9.646759761306011, - "learning_rate": 2.418648142148056e-07, - "logits/chosen": -0.50390625, - "logits/rejected": -0.59375, - "logps/chosen": -2784.0, - "logps/rejected": -2192.0, - "loss": 0.6486, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.6640625, - "rewards/margins": 0.21875, - "rewards/rejected": 0.443359375, - "step": 1140 - }, - { - "epoch": 0.56, - "grad_norm": 8.518198214709173, - "learning_rate": 2.375860522472805e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.68359375, - "logps/chosen": -2064.0, - "logps/rejected": -1624.0, - "loss": 0.6414, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.435546875, - "rewards/margins": 0.19140625, - "rewards/rejected": 0.2451171875, - "step": 1150 - }, - { - "epoch": 0.57, - "grad_norm": 9.732180854785232, - "learning_rate": 2.3331093278659906e-07, - "logits/chosen": -0.5625, - "logits/rejected": -0.5859375, - "logps/chosen": -1992.0, - "logps/rejected": -1888.0, - "loss": 0.6358, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.53515625, - "rewards/margins": 0.212890625, - "rewards/rejected": 0.322265625, - "step": 1160 - }, - { - "epoch": 0.57, - "grad_norm": 8.25240239846351, - "learning_rate": 2.2904071024048089e-07, - "logits/chosen": -0.5546875, - "logits/rejected": -0.515625, - "logps/chosen": -2016.0, - "logps/rejected": -2016.0, - "loss": 0.6236, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.5625, - "rewards/margins": 0.1025390625, - "rewards/rejected": 0.4609375, - "step": 1170 - }, - { - "epoch": 0.58, - "grad_norm": 9.230235569126055, - "learning_rate": 2.247766375797906e-07, - "logits/chosen": -0.5546875, - "logits/rejected": -0.625, - "logps/chosen": -2224.0, - "logps/rejected": -1864.0, - "loss": 0.6549, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.62109375, - "rewards/margins": 0.205078125, - "rewards/rejected": 0.416015625, - "step": 1180 - }, - { - "epoch": 0.58, - "grad_norm": 10.347179580129167, - "learning_rate": 2.2051996597089026e-07, - "logits/chosen": -0.49609375, - "logits/rejected": -0.55078125, - "logps/chosen": -2096.0, - "logps/rejected": -1944.0, - "loss": 0.6314, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.421875, - "rewards/margins": 0.0191650390625, - "rewards/rejected": 0.40234375, - "step": 1190 - }, - { - "epoch": 0.59, - "grad_norm": 7.338817367824801, - "learning_rate": 2.1627194440852142e-07, - "logits/chosen": -0.478515625, - "logits/rejected": -0.59765625, - "logps/chosen": -2592.0, - "logps/rejected": -2064.0, - "loss": 0.6298, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.78515625, - "rewards/margins": 0.265625, - "rewards/rejected": 0.51953125, - "step": 1200 - }, - { - "epoch": 0.59, - "eval_logits/chosen": -0.59765625, - "eval_logits/rejected": -0.65234375, - "eval_logps/chosen": -2304.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.6379843950271606, - "eval_rewards/accuracies": 0.6041666865348816, - "eval_rewards/chosen": 0.6953125, - "eval_rewards/margins": 0.203125, - "eval_rewards/rejected": 0.4921875, - "eval_runtime": 86.5496, - "eval_samples_per_second": 23.108, - "eval_steps_per_second": 0.485, - "step": 1200 - }, - { - "epoch": 0.59, - "grad_norm": 11.049536436218999, - "learning_rate": 2.120338193493248e-07, - "logits/chosen": -0.53515625, - "logits/rejected": -0.59765625, - "logps/chosen": -2624.0, - "logps/rejected": -2144.0, - "loss": 0.6477, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.7421875, - "rewards/margins": 0.2890625, - "rewards/rejected": 0.453125, - "step": 1210 - }, - { - "epoch": 0.6, - "grad_norm": 6.8225110204324615, - "learning_rate": 2.0780683434610413e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.59765625, - "logps/chosen": -2416.0, - "logps/rejected": -2240.0, - "loss": 0.6609, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.734375, - "rewards/margins": 0.2099609375, - "rewards/rejected": 0.52734375, - "step": 1220 - }, - { - "epoch": 0.6, - "grad_norm": 10.18256822065958, - "learning_rate": 2.0359222968294202e-07, - "logits/chosen": -0.5703125, - "logits/rejected": -0.58203125, - "logps/chosen": -2192.0, - "logps/rejected": -2096.0, - "loss": 0.6622, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.5703125, - "rewards/margins": 0.208984375, - "rewards/rejected": 0.359375, - "step": 1230 - }, - { - "epoch": 0.61, - "grad_norm": 8.086963793402472, - "learning_rate": 1.993912420112756e-07, - "logits/chosen": -0.62109375, - "logits/rejected": -0.6015625, - "logps/chosen": -1992.0, - "logps/rejected": -2096.0, - "loss": 0.6603, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.09765625, - "rewards/rejected": 0.53125, - "step": 1240 - }, - { - "epoch": 0.61, - "grad_norm": 8.024306896782054, - "learning_rate": 1.9520510398703766e-07, - "logits/chosen": -0.51953125, - "logits/rejected": -0.5625, - "logps/chosen": -2512.0, - "logps/rejected": -2320.0, - "loss": 0.6632, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.70703125, - "rewards/margins": 0.2353515625, - "rewards/rejected": 0.47265625, - "step": 1250 - }, - { - "epoch": 0.62, - "grad_norm": 9.937034420935973, - "learning_rate": 1.9103504390896944e-07, - "logits/chosen": -0.49609375, - "logits/rejected": -0.5859375, - "logps/chosen": -2464.0, - "logps/rejected": -2240.0, - "loss": 0.6705, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.107421875, - "rewards/rejected": 0.482421875, - "step": 1260 - }, - { - "epoch": 0.62, - "grad_norm": 10.056595402865112, - "learning_rate": 1.8688228535821348e-07, - "logits/chosen": -0.53125, - "logits/rejected": -0.50390625, - "logps/chosen": -1936.0, - "logps/rejected": -2040.0, - "loss": 0.6226, - "rewards/accuracies": 0.440000057220459, - "rewards/chosen": 0.44921875, - "rewards/margins": 0.1337890625, - "rewards/rejected": 0.314453125, - "step": 1270 - }, - { - "epoch": 0.63, - "grad_norm": 8.971832197264595, - "learning_rate": 1.8274804683928913e-07, - "logits/chosen": -0.578125, - "logits/rejected": -0.55859375, - "logps/chosen": -2048.0, - "logps/rejected": -2160.0, - "loss": 0.6517, - "rewards/accuracies": 0.48000001907348633, - "rewards/chosen": 0.498046875, - "rewards/margins": 0.054443359375, - "rewards/rejected": 0.4453125, - "step": 1280 - }, - { - "epoch": 0.63, - "grad_norm": 9.440719762857112, - "learning_rate": 1.786335414225588e-07, - "logits/chosen": -0.5625, - "logits/rejected": -0.625, - "logps/chosen": -2096.0, - "logps/rejected": -1992.0, - "loss": 0.6552, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.400390625, - "rewards/margins": 0.19140625, - "rewards/rejected": 0.208984375, - "step": 1290 - }, - { - "epoch": 0.64, - "grad_norm": 12.661374748922718, - "learning_rate": 1.745399763882881e-07, - "logits/chosen": -0.5, - "logits/rejected": -0.57421875, - "logps/chosen": -2512.0, - "logps/rejected": -2112.0, - "loss": 0.6461, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.5390625, - "rewards/margins": 0.19921875, - "rewards/rejected": 0.33984375, - "step": 1300 - }, - { - "epoch": 0.64, - "eval_logits/chosen": -0.63671875, - "eval_logits/rejected": -0.69140625, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2032.0, - "eval_loss": 0.6395859122276306, - "eval_rewards/accuracies": 0.586309552192688, - "eval_rewards/chosen": 0.55859375, - "eval_rewards/margins": 0.1962890625, - "eval_rewards/rejected": 0.361328125, - "eval_runtime": 86.6403, - "eval_samples_per_second": 23.084, - "eval_steps_per_second": 0.485, - "step": 1300 - }, - { - "epoch": 0.64, - "grad_norm": 8.541591527739216, - "learning_rate": 1.704685528724046e-07, - "logits/chosen": -0.56640625, - "logits/rejected": -0.625, - "logps/chosen": -2496.0, - "logps/rejected": -2304.0, - "loss": 0.6451, - "rewards/accuracies": 0.48000001907348633, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.0703125, - "rewards/rejected": 0.55859375, - "step": 1310 - }, - { - "epoch": 0.65, - "grad_norm": 9.240636114996493, - "learning_rate": 1.664204655140607e-07, - "logits/chosen": -0.5625, - "logits/rejected": -0.61328125, - "logps/chosen": -2272.0, - "logps/rejected": -1944.0, - "loss": 0.6175, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.671875, - "rewards/margins": 0.298828125, - "rewards/rejected": 0.375, - "step": 1320 - }, - { - "epoch": 0.65, - "grad_norm": 9.2814996839246, - "learning_rate": 1.6239690210510166e-07, - "logits/chosen": -0.58984375, - "logits/rejected": -0.65625, - "logps/chosen": -2608.0, - "logps/rejected": -2368.0, - "loss": 0.6566, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.59765625, - "rewards/margins": 0.07421875, - "rewards/rejected": 0.5234375, - "step": 1330 - }, - { - "epoch": 0.66, - "grad_norm": 7.5279255603123, - "learning_rate": 1.5839904324154273e-07, - "logits/chosen": -0.34765625, - "logits/rejected": -0.455078125, - "logps/chosen": -2736.0, - "logps/rejected": -2256.0, - "loss": 0.6349, - "rewards/accuracies": 0.5000000596046448, - "rewards/chosen": 0.71875, - "rewards/margins": 0.2255859375, - "rewards/rejected": 0.49609375, - "step": 1340 - }, - { - "epoch": 0.66, - "grad_norm": 11.057065560114015, - "learning_rate": 1.544280619771588e-07, - "logits/chosen": -0.44140625, - "logits/rejected": -0.515625, - "logps/chosen": -2448.0, - "logps/rejected": -2160.0, - "loss": 0.6424, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.7109375, - "rewards/margins": 0.2080078125, - "rewards/rejected": 0.50390625, - "step": 1350 - }, - { - "epoch": 0.67, - "grad_norm": 9.245099893740868, - "learning_rate": 1.5048512347928564e-07, - "logits/chosen": -0.474609375, - "logits/rejected": -0.578125, - "logps/chosen": -2800.0, - "logps/rejected": -2480.0, - "loss": 0.6604, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.74609375, - "rewards/margins": 0.2138671875, - "rewards/rejected": 0.53125, - "step": 1360 - }, - { - "epoch": 0.67, - "grad_norm": 9.054284348259795, - "learning_rate": 1.4657138468693648e-07, - "logits/chosen": -0.57421875, - "logits/rejected": -0.671875, - "logps/chosen": -2224.0, - "logps/rejected": -1832.0, - "loss": 0.6223, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.59765625, - "rewards/margins": 0.33203125, - "rewards/rejected": 0.265625, - "step": 1370 - }, - { - "epoch": 0.68, - "grad_norm": 8.967356728455812, - "learning_rate": 1.426879939713322e-07, - "logits/chosen": -0.58984375, - "logits/rejected": -0.71484375, - "logps/chosen": -2464.0, - "logps/rejected": -1864.0, - "loss": 0.6278, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.76171875, - "rewards/margins": 0.3515625, - "rewards/rejected": 0.408203125, - "step": 1380 - }, - { - "epoch": 0.68, - "grad_norm": 8.462317796783486, - "learning_rate": 1.3883609079894532e-07, - "logits/chosen": -0.52734375, - "logits/rejected": -0.51171875, - "logps/chosen": -1848.0, - "logps/rejected": -1936.0, - "loss": 0.6453, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.5859375, - "rewards/margins": 0.07177734375, - "rewards/rejected": 0.515625, - "step": 1390 - }, - { - "epoch": 0.69, - "grad_norm": 12.275639958330698, - "learning_rate": 1.350168053971577e-07, - "logits/chosen": -0.51953125, - "logits/rejected": -0.69140625, - "logps/chosen": -2528.0, - "logps/rejected": -1856.0, - "loss": 0.6258, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.75, - "rewards/margins": 0.3671875, - "rewards/rejected": 0.3828125, - "step": 1400 - }, - { - "epoch": 0.69, - "eval_logits/chosen": -0.6171875, - "eval_logits/rejected": -0.67578125, - "eval_logps/chosen": -2304.0, - "eval_logps/rejected": -2016.0, - "eval_loss": 0.6359687447547913, - "eval_rewards/accuracies": 0.5922619104385376, - "eval_rewards/chosen": 0.69140625, - "eval_rewards/margins": 0.220703125, - "eval_rewards/rejected": 0.47265625, - "eval_runtime": 86.3381, - "eval_samples_per_second": 23.165, - "eval_steps_per_second": 0.486, - "step": 1400 - }, - { - "epoch": 0.69, - "grad_norm": 10.6189969938929, - "learning_rate": 1.312312584226284e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.5859375, - "logps/chosen": -2368.0, - "logps/rejected": -2144.0, - "loss": 0.6219, - "rewards/accuracies": 0.48000001907348633, - "rewards/chosen": 0.77734375, - "rewards/margins": 0.271484375, - "rewards/rejected": 0.50390625, - "step": 1410 - }, - { - "epoch": 0.7, - "grad_norm": 7.349435818799782, - "learning_rate": 1.2748056063246994e-07, - "logits/chosen": -0.486328125, - "logits/rejected": -0.62890625, - "logps/chosen": -2416.0, - "logps/rejected": -1912.0, - "loss": 0.6426, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.73828125, - "rewards/margins": 0.25, - "rewards/rejected": 0.486328125, - "step": 1420 - }, - { - "epoch": 0.7, - "grad_norm": 9.740153583722698, - "learning_rate": 1.2376581255832966e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.6640625, - "logps/chosen": -2688.0, - "logps/rejected": -2080.0, - "loss": 0.6293, - "rewards/accuracies": 0.7000000476837158, - "rewards/chosen": 0.78515625, - "rewards/margins": 0.294921875, - "rewards/rejected": 0.48828125, - "step": 1430 - }, - { - "epoch": 0.71, - "grad_norm": 8.753751751983152, - "learning_rate": 1.2008810418347093e-07, - "logits/chosen": -0.5546875, - "logits/rejected": -0.5859375, - "logps/chosen": -2080.0, - "logps/rejected": -1880.0, - "loss": 0.6566, - "rewards/accuracies": 0.64000004529953, - "rewards/chosen": 0.57421875, - "rewards/margins": 0.2021484375, - "rewards/rejected": 0.373046875, - "step": 1440 - }, - { - "epoch": 0.71, - "grad_norm": 8.57529838239571, - "learning_rate": 1.1644851462294956e-07, - "logits/chosen": -0.54296875, - "logits/rejected": -0.65234375, - "logps/chosen": -2032.0, - "logps/rejected": -1632.0, - "loss": 0.636, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.5078125, - "rewards/margins": 0.1904296875, - "rewards/rejected": 0.318359375, - "step": 1450 - }, - { - "epoch": 0.72, - "grad_norm": 8.301288314698684, - "learning_rate": 1.128481118069799e-07, - "logits/chosen": -0.484375, - "logits/rejected": -0.59375, - "logps/chosen": -2768.0, - "logps/rejected": -2208.0, - "loss": 0.6506, - "rewards/accuracies": 0.7400001287460327, - "rewards/chosen": 0.796875, - "rewards/margins": 0.2421875, - "rewards/rejected": 0.5546875, - "step": 1460 - }, - { - "epoch": 0.72, - "grad_norm": 9.889148738488851, - "learning_rate": 1.0928795216758149e-07, - "logits/chosen": -0.55078125, - "logits/rejected": -0.62890625, - "logps/chosen": -2176.0, - "logps/rejected": -1832.0, - "loss": 0.6503, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.52734375, - "rewards/margins": 0.193359375, - "rewards/rejected": 0.3359375, - "step": 1470 - }, - { - "epoch": 0.73, - "grad_norm": 9.51045156043377, - "learning_rate": 1.0576908032860088e-07, - "logits/chosen": -0.62890625, - "logits/rejected": -0.640625, - "logps/chosen": -2208.0, - "logps/rejected": -2144.0, - "loss": 0.6682, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.66796875, - "rewards/margins": 0.24609375, - "rewards/rejected": 0.421875, - "step": 1480 - }, - { - "epoch": 0.73, - "grad_norm": 7.27263720622099, - "learning_rate": 1.0229252879919714e-07, - "logits/chosen": -0.5859375, - "logits/rejected": -0.60546875, - "logps/chosen": -1960.0, - "logps/rejected": -1784.0, - "loss": 0.6574, - "rewards/accuracies": 0.5399999618530273, - "rewards/chosen": 0.48828125, - "rewards/margins": 0.10107421875, - "rewards/rejected": 0.38671875, - "step": 1490 - }, - { - "epoch": 0.74, - "grad_norm": 9.236088100797234, - "learning_rate": 9.88593176708827e-08, - "logits/chosen": -0.4765625, - "logits/rejected": -0.59765625, - "logps/chosen": -2384.0, - "logps/rejected": -1888.0, - "loss": 0.6347, - "rewards/accuracies": 0.64000004529953, - "rewards/chosen": 0.7265625, - "rewards/margins": 0.44921875, - "rewards/rejected": 0.279296875, - "step": 1500 - }, - { - "epoch": 0.74, - "eval_logits/chosen": -0.609375, - "eval_logits/rejected": -0.6640625, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6374765634536743, - "eval_rewards/accuracies": 0.5892857313156128, - "eval_rewards/chosen": 0.625, - "eval_rewards/margins": 0.2099609375, - "eval_rewards/rejected": 0.4140625, - "eval_runtime": 86.4172, - "eval_samples_per_second": 23.144, - "eval_steps_per_second": 0.486, - "step": 1500 - }, - { - "epoch": 0.74, - "grad_norm": 8.84592951975432, - "learning_rate": 9.547045431820749e-08, - "logits/chosen": -0.52734375, - "logits/rejected": -0.5546875, - "logps/chosen": -2384.0, - "logps/rejected": -2240.0, - "loss": 0.6272, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.68359375, - "rewards/margins": 0.11669921875, - "rewards/rejected": 0.56640625, - "step": 1510 - }, - { - "epoch": 0.75, - "grad_norm": 8.84073742236179, - "learning_rate": 9.212693310317479e-08, - "logits/chosen": -0.5703125, - "logits/rejected": -0.58984375, - "logps/chosen": -2176.0, - "logps/rejected": -2048.0, - "loss": 0.6444, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.5625, - "rewards/margins": 0.1689453125, - "rewards/rejected": 0.392578125, - "step": 1520 - }, - { - "epoch": 0.75, - "grad_norm": 6.8580337819405335, - "learning_rate": 8.882973508347449e-08, - "logits/chosen": -0.546875, - "logits/rejected": -0.65234375, - "logps/chosen": -1968.0, - "logps/rejected": -1648.0, - "loss": 0.6548, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.54296875, - "rewards/margins": 0.2451171875, - "rewards/rejected": 0.30078125, - "step": 1530 - }, - { - "epoch": 0.76, - "grad_norm": 10.016816397566908, - "learning_rate": 8.557982772462138e-08, - "logits/chosen": -0.54296875, - "logits/rejected": -0.60546875, - "logps/chosen": -2352.0, - "logps/rejected": -1976.0, - "loss": 0.6418, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.6953125, - "rewards/margins": 0.28125, - "rewards/rejected": 0.4140625, - "step": 1540 - }, - { - "epoch": 0.76, - "grad_norm": 11.142392440409802, - "learning_rate": 8.237816461608049e-08, - "logits/chosen": -0.546875, - "logits/rejected": -0.53125, - "logps/chosen": -2048.0, - "logps/rejected": -1928.0, - "loss": 0.6925, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.515625, - "rewards/margins": 0.1630859375, - "rewards/rejected": 0.3515625, - "step": 1550 - }, - { - "epoch": 0.77, - "grad_norm": 10.436188393180485, - "learning_rate": 7.922568519146425e-08, - "logits/chosen": -0.58984375, - "logits/rejected": -0.62109375, - "logps/chosen": -2432.0, - "logps/rejected": -2240.0, - "loss": 0.662, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.65625, - "rewards/margins": 0.14453125, - "rewards/rejected": 0.515625, - "step": 1560 - }, - { - "epoch": 0.77, - "grad_norm": 8.734002863477851, - "learning_rate": 7.612331445288389e-08, - "logits/chosen": -0.423828125, - "logits/rejected": -0.494140625, - "logps/chosen": -2448.0, - "logps/rejected": -2064.0, - "loss": 0.6432, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.69140625, - "rewards/margins": 0.2001953125, - "rewards/rejected": 0.490234375, - "step": 1570 - }, - { - "epoch": 0.78, - "grad_norm": 10.892280756623634, - "learning_rate": 7.307196269953444e-08, - "logits/chosen": -0.5546875, - "logits/rejected": -0.53515625, - "logps/chosen": -2064.0, - "logps/rejected": -2192.0, - "loss": 0.6439, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.478515625, - "rewards/margins": 0.09716796875, - "rewards/rejected": 0.380859375, - "step": 1580 - }, - { - "epoch": 0.78, - "grad_norm": 11.89094090289422, - "learning_rate": 7.007252526059446e-08, - "logits/chosen": -0.494140625, - "logits/rejected": -0.59375, - "logps/chosen": -2624.0, - "logps/rejected": -2160.0, - "loss": 0.6732, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.62109375, - "rewards/margins": 0.07421875, - "rewards/rejected": 0.546875, - "step": 1590 - }, - { - "epoch": 0.79, - "grad_norm": 8.353720135556827, - "learning_rate": 6.712588223251809e-08, - "logits/chosen": -0.57421875, - "logits/rejected": -0.62890625, - "logps/chosen": -2480.0, - "logps/rejected": -2192.0, - "loss": 0.6185, - "rewards/accuracies": 0.6599999666213989, - "rewards/chosen": 0.66015625, - "rewards/margins": 0.3359375, - "rewards/rejected": 0.32421875, - "step": 1600 - }, - { - "epoch": 0.79, - "eval_logits/chosen": -0.625, - "eval_logits/rejected": -0.6796875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2032.0, - "eval_loss": 0.6382187604904175, - "eval_rewards/accuracies": 0.6041666865348816, - "eval_rewards/chosen": 0.59765625, - "eval_rewards/margins": 0.205078125, - "eval_rewards/rejected": 0.392578125, - "eval_runtime": 86.1215, - "eval_samples_per_second": 23.223, - "eval_steps_per_second": 0.488, - "step": 1600 - }, - { - "epoch": 0.79, - "grad_norm": 9.929495280909341, - "learning_rate": 6.423289822079644e-08, - "logits/chosen": -0.482421875, - "logits/rejected": -0.5234375, - "logps/chosen": -2464.0, - "logps/rejected": -2272.0, - "loss": 0.621, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.73828125, - "rewards/margins": 0.28515625, - "rewards/rejected": 0.453125, - "step": 1610 - }, - { - "epoch": 0.79, - "grad_norm": 7.970603963526829, - "learning_rate": 6.139442208626517e-08, - "logits/chosen": -0.58203125, - "logits/rejected": -0.66015625, - "logps/chosen": -2544.0, - "logps/rejected": -2256.0, - "loss": 0.6499, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.60546875, - "rewards/margins": 0.10693359375, - "rewards/rejected": 0.5, - "step": 1620 - }, - { - "epoch": 0.8, - "grad_norm": 7.712927942409274, - "learning_rate": 5.8611286696030795e-08, - "logits/chosen": -0.546875, - "logits/rejected": -0.58203125, - "logps/chosen": -2800.0, - "logps/rejected": -2480.0, - "loss": 0.6347, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.81640625, - "rewards/margins": 0.3125, - "rewards/rejected": 0.50390625, - "step": 1630 - }, - { - "epoch": 0.8, - "grad_norm": 8.214503728500627, - "learning_rate": 5.5884308679090525e-08, - "logits/chosen": -0.5703125, - "logits/rejected": -0.60546875, - "logps/chosen": -2208.0, - "logps/rejected": -2112.0, - "loss": 0.6556, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.40625, - "rewards/margins": 0.11474609375, - "rewards/rejected": 0.291015625, - "step": 1640 - }, - { - "epoch": 0.81, - "grad_norm": 8.126286436494889, - "learning_rate": 5.321428818671672e-08, - "logits/chosen": -0.52734375, - "logits/rejected": -0.609375, - "logps/chosen": -2128.0, - "logps/rejected": -1752.0, - "loss": 0.6435, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.443359375, - "rewards/margins": 0.166015625, - "rewards/rejected": 0.27734375, - "step": 1650 - }, - { - "epoch": 0.81, - "grad_norm": 9.727164383599991, - "learning_rate": 5.060200865767605e-08, - "logits/chosen": -0.5546875, - "logits/rejected": -0.6171875, - "logps/chosen": -2336.0, - "logps/rejected": -2040.0, - "loss": 0.627, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.458984375, - "rewards/margins": 0.2470703125, - "rewards/rejected": 0.2119140625, - "step": 1660 - }, - { - "epoch": 0.82, - "grad_norm": 8.55962886651413, - "learning_rate": 4.804823658835233e-08, - "logits/chosen": -0.5859375, - "logits/rejected": -0.671875, - "logps/chosen": -2352.0, - "logps/rejected": -1920.0, - "loss": 0.6377, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.51171875, - "rewards/margins": 0.14453125, - "rewards/rejected": 0.369140625, - "step": 1670 - }, - { - "epoch": 0.82, - "grad_norm": 13.3092741964271, - "learning_rate": 4.555372130784102e-08, - "logits/chosen": -0.65625, - "logits/rejected": -0.71875, - "logps/chosen": -1912.0, - "logps/rejected": -1752.0, - "loss": 0.6305, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.376953125, - "rewards/margins": 0.0341796875, - "rewards/rejected": 0.34375, - "step": 1680 - }, - { - "epoch": 0.83, - "grad_norm": 8.172767944806202, - "learning_rate": 4.311919475808037e-08, - "logits/chosen": -0.5546875, - "logits/rejected": -0.609375, - "logps/chosen": -2256.0, - "logps/rejected": -2032.0, - "loss": 0.6572, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.53125, - "rewards/margins": 0.1796875, - "rewards/rejected": 0.3515625, - "step": 1690 - }, - { - "epoch": 0.83, - "grad_norm": 8.68637911065445, - "learning_rate": 4.0745371279084976e-08, - "logits/chosen": -0.5, - "logits/rejected": -0.53515625, - "logps/chosen": -2528.0, - "logps/rejected": -2240.0, - "loss": 0.6408, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.6484375, - "rewards/margins": 0.189453125, - "rewards/rejected": 0.458984375, - "step": 1700 - }, - { - "epoch": 0.83, - "eval_logits/chosen": -0.6171875, - "eval_logits/rejected": -0.671875, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6374297142028809, - "eval_rewards/accuracies": 0.5952380895614624, - "eval_rewards/chosen": 0.59765625, - "eval_rewards/margins": 0.2041015625, - "eval_rewards/rejected": 0.392578125, - "eval_runtime": 86.0796, - "eval_samples_per_second": 23.234, - "eval_steps_per_second": 0.488, - "step": 1700 - }, - { - "epoch": 0.84, - "grad_norm": 12.620733225081068, - "learning_rate": 3.843294739934369e-08, - "logits/chosen": -0.515625, - "logits/rejected": -0.53515625, - "logps/chosen": -2176.0, - "logps/rejected": -2208.0, - "loss": 0.6653, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.6015625, - "rewards/margins": 0.142578125, - "rewards/rejected": 0.45703125, - "step": 1710 - }, - { - "epoch": 0.84, - "grad_norm": 11.28123633647817, - "learning_rate": 3.6182601631443596e-08, - "logits/chosen": -0.5, - "logits/rejected": -0.62890625, - "logps/chosen": -2752.0, - "logps/rejected": -2032.0, - "loss": 0.6464, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.671875, - "rewards/margins": 0.1640625, - "rewards/rejected": 0.5078125, - "step": 1720 - }, - { - "epoch": 0.85, - "grad_norm": 9.053407610350268, - "learning_rate": 3.3994994272980944e-08, - "logits/chosen": -0.50390625, - "logits/rejected": -0.6171875, - "logps/chosen": -2384.0, - "logps/rejected": -1832.0, - "loss": 0.6394, - "rewards/accuracies": 0.6600000262260437, - "rewards/chosen": 0.59375, - "rewards/margins": 0.271484375, - "rewards/rejected": 0.3203125, - "step": 1730 - }, - { - "epoch": 0.85, - "grad_norm": 10.202733795570047, - "learning_rate": 3.187076721281595e-08, - "logits/chosen": -0.55078125, - "logits/rejected": -0.6484375, - "logps/chosen": -2080.0, - "logps/rejected": -1744.0, - "loss": 0.6662, - "rewards/accuracies": 0.5600000023841858, - "rewards/chosen": 0.474609375, - "rewards/margins": 0.1572265625, - "rewards/rejected": 0.31640625, - "step": 1740 - }, - { - "epoch": 0.86, - "grad_norm": 9.310568396714823, - "learning_rate": 2.9810543742729705e-08, - "logits/chosen": -0.5390625, - "logits/rejected": -0.56640625, - "logps/chosen": -2304.0, - "logps/rejected": -2096.0, - "loss": 0.6431, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.6796875, - "rewards/margins": 0.263671875, - "rewards/rejected": 0.4140625, - "step": 1750 - }, - { - "epoch": 0.86, - "grad_norm": 9.254692446674307, - "learning_rate": 2.7814928374537334e-08, - "logits/chosen": -0.48828125, - "logits/rejected": -0.625, - "logps/chosen": -2272.0, - "logps/rejected": -1696.0, - "loss": 0.631, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.51171875, - "rewards/margins": 0.1708984375, - "rewards/rejected": 0.33984375, - "step": 1760 - }, - { - "epoch": 0.87, - "grad_norm": 7.485091349390559, - "learning_rate": 2.5884506662711886e-08, - "logits/chosen": -0.49609375, - "logits/rejected": -0.62109375, - "logps/chosen": -2576.0, - "logps/rejected": -2008.0, - "loss": 0.6258, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.68359375, - "rewards/margins": 0.296875, - "rewards/rejected": 0.388671875, - "step": 1770 - }, - { - "epoch": 0.87, - "grad_norm": 7.583505828929572, - "learning_rate": 2.4019845032570875e-08, - "logits/chosen": -0.4921875, - "logits/rejected": -0.5859375, - "logps/chosen": -2688.0, - "logps/rejected": -2176.0, - "loss": 0.6613, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.1943359375, - "rewards/rejected": 0.435546875, - "step": 1780 - }, - { - "epoch": 0.88, - "grad_norm": 10.11330279692475, - "learning_rate": 2.222149061407527e-08, - "logits/chosen": -0.43359375, - "logits/rejected": -0.5078125, - "logps/chosen": -3072.0, - "logps/rejected": -2704.0, - "loss": 0.6427, - "rewards/accuracies": 0.46000003814697266, - "rewards/chosen": 0.8828125, - "rewards/margins": 0.140625, - "rewards/rejected": 0.7421875, - "step": 1790 - }, - { - "epoch": 0.88, - "grad_norm": 10.263183451230402, - "learning_rate": 2.0489971081290193e-08, - "logits/chosen": -0.5703125, - "logits/rejected": -0.58984375, - "logps/chosen": -2208.0, - "logps/rejected": -1936.0, - "loss": 0.662, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.62890625, - "rewards/margins": 0.11181640625, - "rewards/rejected": 0.515625, - "step": 1800 - }, - { - "epoch": 0.88, - "eval_logits/chosen": -0.62890625, - "eval_logits/rejected": -0.68359375, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6355390548706055, - "eval_rewards/accuracies": 0.601190447807312, - "eval_rewards/chosen": 0.609375, - "eval_rewards/margins": 0.2119140625, - "eval_rewards/rejected": 0.3984375, - "eval_runtime": 86.081, - "eval_samples_per_second": 23.234, - "eval_steps_per_second": 0.488, - "step": 1800 - }, - { - "epoch": 0.89, - "grad_norm": 7.814802788464052, - "learning_rate": 1.882579449755495e-08, - "logits/chosen": -0.515625, - "logits/rejected": -0.640625, - "logps/chosen": -2656.0, - "logps/rejected": -2128.0, - "loss": 0.6186, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.75, - "rewards/margins": 0.29296875, - "rewards/rejected": 0.45703125, - "step": 1810 - }, - { - "epoch": 0.89, - "grad_norm": 9.220542164026453, - "learning_rate": 1.7229449166406477e-08, - "logits/chosen": -0.54296875, - "logits/rejected": -0.58203125, - "logps/chosen": -2496.0, - "logps/rejected": -2256.0, - "loss": 0.658, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.63671875, - "rewards/margins": 0.1083984375, - "rewards/rejected": 0.52734375, - "step": 1820 - }, - { - "epoch": 0.9, - "grad_norm": 11.528605527636161, - "learning_rate": 1.5701403488301235e-08, - "logits/chosen": -0.5546875, - "logits/rejected": -0.625, - "logps/chosen": -2288.0, - "logps/rejected": -2000.0, - "loss": 0.6381, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.53125, - "rewards/margins": 0.255859375, - "rewards/rejected": 0.2734375, - "step": 1830 - }, - { - "epoch": 0.9, - "grad_norm": 13.882026366198952, - "learning_rate": 1.4242105823176837e-08, - "logits/chosen": -0.671875, - "logits/rejected": -0.7421875, - "logps/chosen": -2096.0, - "logps/rejected": -1752.0, - "loss": 0.6506, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.515625, - "rewards/margins": 0.19140625, - "rewards/rejected": 0.32421875, - "step": 1840 - }, - { - "epoch": 0.91, - "grad_norm": 9.608713807399866, - "learning_rate": 1.285198435889398e-08, - "logits/chosen": -0.484375, - "logits/rejected": -0.56640625, - "logps/chosen": -2480.0, - "logps/rejected": -1984.0, - "loss": 0.6507, - "rewards/accuracies": 0.6000000238418579, - "rewards/chosen": 0.6015625, - "rewards/margins": 0.2275390625, - "rewards/rejected": 0.373046875, - "step": 1850 - }, - { - "epoch": 0.91, - "grad_norm": 9.195872685886451, - "learning_rate": 1.1531446985597604e-08, - "logits/chosen": -0.671875, - "logits/rejected": -0.5859375, - "logps/chosen": -1664.0, - "logps/rejected": -2064.0, - "loss": 0.6717, - "rewards/accuracies": 0.5400000214576721, - "rewards/chosen": 0.447265625, - "rewards/margins": -0.01806640625, - "rewards/rejected": 0.46484375, - "step": 1860 - }, - { - "epoch": 0.92, - "grad_norm": 10.06291940471119, - "learning_rate": 1.0280881176033318e-08, - "logits/chosen": -0.55859375, - "logits/rejected": -0.640625, - "logps/chosen": -2400.0, - "logps/rejected": -1952.0, - "loss": 0.6418, - "rewards/accuracies": 0.5600000619888306, - "rewards/chosen": 0.6328125, - "rewards/margins": 0.232421875, - "rewards/rejected": 0.40234375, - "step": 1870 - }, - { - "epoch": 0.92, - "grad_norm": 9.690917680622107, - "learning_rate": 9.100653871854963e-09, - "logits/chosen": -0.55078125, - "logits/rejected": -0.60546875, - "logps/chosen": -2608.0, - "logps/rejected": -2336.0, - "loss": 0.6336, - "rewards/accuracies": 0.5199999809265137, - "rewards/chosen": 0.68359375, - "rewards/margins": 0.31640625, - "rewards/rejected": 0.3671875, - "step": 1880 - }, - { - "epoch": 0.93, - "grad_norm": 7.698287956753982, - "learning_rate": 7.991111375956539e-09, - "logits/chosen": -0.470703125, - "logits/rejected": -0.54296875, - "logps/chosen": -2480.0, - "logps/rejected": -2224.0, - "loss": 0.6679, - "rewards/accuracies": 0.4599999785423279, - "rewards/chosen": 0.60546875, - "rewards/margins": 0.109375, - "rewards/rejected": 0.498046875, - "step": 1890 - }, - { - "epoch": 0.93, - "grad_norm": 9.319385343543805, - "learning_rate": 6.9525792508597634e-09, - "logits/chosen": -0.59765625, - "logits/rejected": -0.6015625, - "logps/chosen": -2336.0, - "logps/rejected": -2320.0, - "loss": 0.6385, - "rewards/accuracies": 0.6599999666213989, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.2236328125, - "rewards/rejected": 0.3671875, - "step": 1900 - }, - { - "epoch": 0.93, - "eval_logits/chosen": -0.62109375, - "eval_logits/rejected": -0.67578125, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6378594040870667, - "eval_rewards/accuracies": 0.625, - "eval_rewards/chosen": 0.60546875, - "eval_rewards/margins": 0.212890625, - "eval_rewards/rejected": 0.392578125, - "eval_runtime": 86.1786, - "eval_samples_per_second": 23.208, - "eval_steps_per_second": 0.487, - "step": 1900 - }, - { - "epoch": 0.94, - "grad_norm": 7.544043526165161, - "learning_rate": 5.985362223187296e-09, - "logits/chosen": -0.47265625, - "logits/rejected": -0.54296875, - "logps/chosen": -2512.0, - "logps/rejected": -2176.0, - "loss": 0.6528, - "rewards/accuracies": 0.5799999833106995, - "rewards/chosen": 0.66015625, - "rewards/margins": 0.21484375, - "rewards/rejected": 0.4453125, - "step": 1910 - }, - { - "epoch": 0.94, - "grad_norm": 9.355498575273248, - "learning_rate": 5.089744094249837e-09, - "logits/chosen": -0.58203125, - "logits/rejected": -0.6953125, - "logps/chosen": -2848.0, - "logps/rejected": -2304.0, - "loss": 0.6129, - "rewards/accuracies": 0.699999988079071, - "rewards/chosen": 0.75390625, - "rewards/margins": 0.279296875, - "rewards/rejected": 0.4765625, - "step": 1920 - }, - { - "epoch": 0.95, - "grad_norm": 9.114147510919343, - "learning_rate": 4.265987656772857e-09, - "logits/chosen": -0.55078125, - "logits/rejected": -0.625, - "logps/chosen": -2352.0, - "logps/rejected": -1928.0, - "loss": 0.6488, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.69921875, - "rewards/margins": 0.31640625, - "rewards/rejected": 0.3828125, - "step": 1930 - }, - { - "epoch": 0.95, - "grad_norm": 11.411261082676203, - "learning_rate": 3.5143346177878565e-09, - "logits/chosen": -0.46484375, - "logits/rejected": -0.578125, - "logps/chosen": -2752.0, - "logps/rejected": -2176.0, - "loss": 0.6235, - "rewards/accuracies": 0.7799999713897705, - "rewards/chosen": 0.83984375, - "rewards/margins": 0.482421875, - "rewards/rejected": 0.357421875, - "step": 1940 - }, - { - "epoch": 0.96, - "grad_norm": 7.165814163311654, - "learning_rate": 2.835005527710682e-09, - "logits/chosen": -0.5234375, - "logits/rejected": -0.60546875, - "logps/chosen": -2368.0, - "logps/rejected": -1904.0, - "loss": 0.6375, - "rewards/accuracies": 0.6800000071525574, - "rewards/chosen": 0.58984375, - "rewards/margins": 0.302734375, - "rewards/rejected": 0.2890625, - "step": 1950 - }, - { - "epoch": 0.96, - "grad_norm": 8.854995734298003, - "learning_rate": 2.2281997156273213e-09, - "logits/chosen": -0.4296875, - "logits/rejected": -0.61328125, - "logps/chosen": -2720.0, - "logps/rejected": -1928.0, - "loss": 0.6276, - "rewards/accuracies": 0.6399999856948853, - "rewards/chosen": 0.671875, - "rewards/margins": 0.373046875, - "rewards/rejected": 0.298828125, - "step": 1960 - }, - { - "epoch": 0.97, - "grad_norm": 8.38257510772057, - "learning_rate": 1.6940952308068523e-09, - "logits/chosen": -0.59375, - "logits/rejected": -0.671875, - "logps/chosen": -2576.0, - "logps/rejected": -2160.0, - "loss": 0.6368, - "rewards/accuracies": 0.7600001096725464, - "rewards/chosen": 0.8125, - "rewards/margins": 0.439453125, - "rewards/rejected": 0.373046875, - "step": 1970 - }, - { - "epoch": 0.97, - "grad_norm": 11.857430374835033, - "learning_rate": 1.2328487904580131e-09, - "logits/chosen": -0.61328125, - "logits/rejected": -0.6796875, - "logps/chosen": -2704.0, - "logps/rejected": -2288.0, - "loss": 0.6438, - "rewards/accuracies": 0.5800000429153442, - "rewards/chosen": 0.63671875, - "rewards/margins": 0.1953125, - "rewards/rejected": 0.443359375, - "step": 1980 - }, - { - "epoch": 0.98, - "grad_norm": 7.7776326201834, - "learning_rate": 8.445957337451515e-10, - "logits/chosen": -0.546875, - "logits/rejected": -0.6328125, - "logps/chosen": -2336.0, - "logps/rejected": -2000.0, - "loss": 0.6336, - "rewards/accuracies": 0.6200000047683716, - "rewards/chosen": 0.6953125, - "rewards/margins": 0.26953125, - "rewards/rejected": 0.423828125, - "step": 1990 - }, - { - "epoch": 0.98, - "grad_norm": 8.452587117057995, - "learning_rate": 5.29449982077046e-10, - "logits/chosen": -0.578125, - "logits/rejected": -0.640625, - "logps/chosen": -2336.0, - "logps/rejected": -2016.0, - "loss": 0.6154, - "rewards/accuracies": 0.6800000667572021, - "rewards/chosen": 0.6171875, - "rewards/margins": 0.2373046875, - "rewards/rejected": 0.37890625, - "step": 2000 - }, - { - "epoch": 0.98, - "eval_logits/chosen": -0.62109375, - "eval_logits/rejected": -0.67578125, - "eval_logps/chosen": -2320.0, - "eval_logps/rejected": -2024.0, - "eval_loss": 0.6380937695503235, - "eval_rewards/accuracies": 0.601190447807312, - "eval_rewards/chosen": 0.609375, - "eval_rewards/margins": 0.2041015625, - "eval_rewards/rejected": 0.404296875, - "eval_runtime": 86.1049, - "eval_samples_per_second": 23.227, - "eval_steps_per_second": 0.488, - "step": 2000 - }, - { - "epoch": 0.99, - "grad_norm": 12.27319369048997, - "learning_rate": 2.875040056799227e-10, - "logits/chosen": -0.5859375, - "logits/rejected": -0.57421875, - "logps/chosen": -2304.0, - "logps/rejected": -2432.0, - "loss": 0.6572, - "rewards/accuracies": 0.5200000405311584, - "rewards/chosen": 0.58203125, - "rewards/margins": 0.08447265625, - "rewards/rejected": 0.498046875, - "step": 2010 - }, - { - "epoch": 0.99, - "grad_norm": 9.006712698632741, - "learning_rate": 1.1882879646485379e-10, - "logits/chosen": -0.54296875, - "logits/rejected": -0.6015625, - "logps/chosen": -2040.0, - "logps/rejected": -1824.0, - "loss": 0.6547, - "rewards/accuracies": 0.5, - "rewards/chosen": 0.53515625, - "rewards/margins": 0.1533203125, - "rewards/rejected": 0.3828125, - "step": 2020 - }, - { - "epoch": 1.0, - "grad_norm": 9.784506721487483, - "learning_rate": 2.3473847197225115e-11, - "logits/chosen": -0.54296875, - "logits/rejected": -0.671875, - "logps/chosen": -2512.0, - "logps/rejected": -1984.0, - "loss": 0.6671, - "rewards/accuracies": 0.5400000810623169, - "rewards/chosen": 0.703125, - "rewards/margins": 0.259765625, - "rewards/rejected": 0.443359375, - "step": 2030 - }, - { - "epoch": 1.0, - "step": 2038, + "epoch": 0.9990186457311089, + "step": 509, "total_flos": 0.0, - "train_loss": 0.6502101503246783, - "train_runtime": 8979.6364, - "train_samples_per_second": 6.808, - "train_steps_per_second": 0.227 + "train_loss": 0.6464882252961105, + "train_runtime": 7224.3746, + "train_samples_per_second": 8.462, + "train_steps_per_second": 0.07 } ], "logging_steps": 10, - "max_steps": 2038, + "max_steps": 509, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100,