{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 1416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.2890625, "learning_rate": 3.5211267605633804e-08, "logits/chosen": -2.859166383743286, "logits/rejected": -2.8096845149993896, "logps/chosen": -99.04647827148438, "logps/rejected": -117.97454071044922, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.265625, "learning_rate": 3.521126760563381e-07, "logits/chosen": -2.9342617988586426, "logits/rejected": -2.741842269897461, "logps/chosen": -173.28025817871094, "logps/rejected": -158.45001220703125, "loss": 0.6929, "rewards/accuracies": 0.5277777910232544, "rewards/chosen": 0.0002738925686571747, "rewards/margins": 0.0006674294709227979, "rewards/rejected": -0.0003935369022656232, "step": 10 }, { "epoch": 0.01, "grad_norm": 1.2578125, "learning_rate": 7.042253521126762e-07, "logits/chosen": -2.9304115772247314, "logits/rejected": -2.773350477218628, "logps/chosen": -148.03622436523438, "logps/rejected": -155.12030029296875, "loss": 0.6917, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.0001768588845152408, "rewards/margins": 0.0034071411937475204, "rewards/rejected": -0.0035840000491589308, "step": 20 }, { "epoch": 0.02, "grad_norm": 1.2890625, "learning_rate": 1.0563380281690142e-06, "logits/chosen": -2.8715338706970215, "logits/rejected": -2.548405170440674, "logps/chosen": -135.1510772705078, "logps/rejected": -153.06016540527344, "loss": 0.6875, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.001646283664740622, "rewards/margins": 0.010566174052655697, "rewards/rejected": -0.012212458066642284, "step": 30 }, { "epoch": 0.03, "grad_norm": 1.34375, "learning_rate": 1.4084507042253523e-06, "logits/chosen": -2.762091636657715, "logits/rejected": -2.660902738571167, "logps/chosen": -136.66757202148438, "logps/rejected": -153.9733123779297, "loss": 0.6813, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -0.005630046594887972, "rewards/margins": 0.020510759204626083, "rewards/rejected": -0.026140809059143066, "step": 40 }, { "epoch": 0.04, "grad_norm": 1.4609375, "learning_rate": 1.7605633802816902e-06, "logits/chosen": -2.8669049739837646, "logits/rejected": -2.475205421447754, "logps/chosen": -188.17373657226562, "logps/rejected": -151.49258422851562, "loss": 0.6713, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -0.000742008734960109, "rewards/margins": 0.04474978148937225, "rewards/rejected": -0.045491788536310196, "step": 50 }, { "epoch": 0.04, "grad_norm": 1.4453125, "learning_rate": 2.1126760563380285e-06, "logits/chosen": -2.926124095916748, "logits/rejected": -2.7637622356414795, "logps/chosen": -153.60772705078125, "logps/rejected": -188.63381958007812, "loss": 0.6557, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.006551164202392101, "rewards/margins": 0.08704066276550293, "rewards/rejected": -0.09359182417392731, "step": 60 }, { "epoch": 0.05, "grad_norm": 1.4296875, "learning_rate": 2.4647887323943666e-06, "logits/chosen": -2.899833917617798, "logits/rejected": -2.4336047172546387, "logps/chosen": -144.35931396484375, "logps/rejected": -145.45330810546875, "loss": 0.6385, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -0.0348644033074379, "rewards/margins": 0.09251449257135391, "rewards/rejected": -0.1273788958787918, "step": 70 }, { "epoch": 0.06, "grad_norm": 1.5859375, "learning_rate": 2.8169014084507046e-06, "logits/chosen": -2.842036008834839, "logits/rejected": -2.6445887088775635, "logps/chosen": -135.467041015625, "logps/rejected": -170.1948699951172, "loss": 0.6131, "rewards/accuracies": 1.0, "rewards/chosen": -0.0638028234243393, "rewards/margins": 0.12937679886817932, "rewards/rejected": -0.193179652094841, "step": 80 }, { "epoch": 0.06, "grad_norm": 2.234375, "learning_rate": 3.1690140845070427e-06, "logits/chosen": -2.8690648078918457, "logits/rejected": -2.761101245880127, "logps/chosen": -145.8473358154297, "logps/rejected": -161.64495849609375, "loss": 0.5925, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.031888581812381744, "rewards/margins": 0.22591328620910645, "rewards/rejected": -0.2578018605709076, "step": 90 }, { "epoch": 0.07, "grad_norm": 2.078125, "learning_rate": 3.5211267605633804e-06, "logits/chosen": -2.9103403091430664, "logits/rejected": -2.630405902862549, "logps/chosen": -157.76466369628906, "logps/rejected": -180.42227172851562, "loss": 0.5404, "rewards/accuracies": 1.0, "rewards/chosen": -0.041967134922742844, "rewards/margins": 0.34702008962631226, "rewards/rejected": -0.388987272977829, "step": 100 }, { "epoch": 0.07, "eval_logits/chosen": -2.9905362129211426, "eval_logits/rejected": -2.9873273372650146, "eval_logps/chosen": -399.27130126953125, "eval_logps/rejected": -344.4871520996094, "eval_loss": 0.7039694786071777, "eval_rewards/accuracies": 0.4276685416698456, "eval_rewards/chosen": -0.15393377840518951, "eval_rewards/margins": -0.010769988410174847, "eval_rewards/rejected": -0.1431637704372406, "eval_runtime": 656.9135, "eval_samples_per_second": 8.668, "eval_steps_per_second": 0.271, "step": 100 }, { "epoch": 0.08, "grad_norm": 3.71875, "learning_rate": 3.873239436619718e-06, "logits/chosen": -2.9433434009552, "logits/rejected": -2.6831233501434326, "logps/chosen": -156.14508056640625, "logps/rejected": -198.12060546875, "loss": 0.5029, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05710986256599426, "rewards/margins": 0.42593106627464294, "rewards/rejected": -0.4830408990383148, "step": 110 }, { "epoch": 0.08, "grad_norm": 4.8125, "learning_rate": 4.225352112676057e-06, "logits/chosen": -2.9455037117004395, "logits/rejected": -2.6296839714050293, "logps/chosen": -162.0118865966797, "logps/rejected": -221.32962036132812, "loss": 0.4583, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.06950317323207855, "rewards/margins": 0.5651465654373169, "rewards/rejected": -0.6346497535705566, "step": 120 }, { "epoch": 0.09, "grad_norm": 2.53125, "learning_rate": 4.577464788732395e-06, "logits/chosen": -2.857548713684082, "logits/rejected": -2.720071315765381, "logps/chosen": -187.21287536621094, "logps/rejected": -258.74365234375, "loss": 0.4505, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.05886738374829292, "rewards/margins": 0.5861214995384216, "rewards/rejected": -0.6449888348579407, "step": 130 }, { "epoch": 0.1, "grad_norm": 2.125, "learning_rate": 4.929577464788733e-06, "logits/chosen": -2.977691173553467, "logits/rejected": -2.716517925262451, "logps/chosen": -156.32614135742188, "logps/rejected": -268.61248779296875, "loss": 0.3921, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.0449405275285244, "rewards/margins": 0.7741391658782959, "rewards/rejected": -0.8190797567367554, "step": 140 }, { "epoch": 0.11, "grad_norm": 7.15625, "learning_rate": 4.99951355158602e-06, "logits/chosen": -2.902024507522583, "logits/rejected": -2.745807647705078, "logps/chosen": -169.4105224609375, "logps/rejected": -243.8219757080078, "loss": 0.3472, "rewards/accuracies": 1.0, "rewards/chosen": -0.09769558906555176, "rewards/margins": 0.9020183682441711, "rewards/rejected": -0.9997137784957886, "step": 150 }, { "epoch": 0.11, "grad_norm": 2.53125, "learning_rate": 4.9975376793444875e-06, "logits/chosen": -2.872546672821045, "logits/rejected": -2.766613483428955, "logps/chosen": -149.41319274902344, "logps/rejected": -265.3560791015625, "loss": 0.351, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -0.20074686408042908, "rewards/margins": 0.9895814061164856, "rewards/rejected": -1.1903283596038818, "step": 160 }, { "epoch": 0.12, "grad_norm": 3.046875, "learning_rate": 4.99404318075312e-06, "logits/chosen": -2.9483747482299805, "logits/rejected": -2.597449541091919, "logps/chosen": -195.63929748535156, "logps/rejected": -323.9130554199219, "loss": 0.2852, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.1841152012348175, "rewards/margins": 1.2758854627609253, "rewards/rejected": -1.46000075340271, "step": 170 }, { "epoch": 0.13, "grad_norm": 1.6015625, "learning_rate": 4.989032180639774e-06, "logits/chosen": -2.7701478004455566, "logits/rejected": -2.6605210304260254, "logps/chosen": -183.76681518554688, "logps/rejected": -313.6458435058594, "loss": 0.2709, "rewards/accuracies": 1.0, "rewards/chosen": -0.2509196698665619, "rewards/margins": 1.244728446006775, "rewards/rejected": -1.4956481456756592, "step": 180 }, { "epoch": 0.13, "grad_norm": 1.2734375, "learning_rate": 4.9825077259401914e-06, "logits/chosen": -2.819714069366455, "logits/rejected": -2.4680557250976562, "logps/chosen": -196.28665161132812, "logps/rejected": -301.0437316894531, "loss": 0.2703, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.2638978660106659, "rewards/margins": 1.3128944635391235, "rewards/rejected": -1.5767922401428223, "step": 190 }, { "epoch": 0.14, "grad_norm": 3.953125, "learning_rate": 4.974473783845297e-06, "logits/chosen": -2.694714307785034, "logits/rejected": -2.337446451187134, "logps/chosen": -246.986083984375, "logps/rejected": -382.73297119140625, "loss": 0.2123, "rewards/accuracies": 1.0, "rewards/chosen": -0.4879836142063141, "rewards/margins": 1.5526775121688843, "rewards/rejected": -2.040661334991455, "step": 200 }, { "epoch": 0.14, "eval_logits/chosen": -2.7971036434173584, "eval_logits/rejected": -2.791273832321167, "eval_logps/chosen": -502.5458068847656, "eval_logps/rejected": -442.856201171875, "eval_loss": 0.7891684174537659, "eval_rewards/accuracies": 0.4227527976036072, "eval_rewards/chosen": -1.186678409576416, "eval_rewards/margins": -0.05982402339577675, "eval_rewards/rejected": -1.126854419708252, "eval_runtime": 655.9346, "eval_samples_per_second": 8.681, "eval_steps_per_second": 0.271, "step": 200 }, { "epoch": 0.15, "grad_norm": 5.125, "learning_rate": 4.9649352393889644e-06, "logits/chosen": -2.584268093109131, "logits/rejected": -2.4863028526306152, "logps/chosen": -193.27066040039062, "logps/rejected": -429.0790100097656, "loss": 0.1598, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.6279975175857544, "rewards/margins": 2.084122657775879, "rewards/rejected": -2.712120294570923, "step": 210 }, { "epoch": 0.16, "grad_norm": 3.828125, "learning_rate": 4.953897892477664e-06, "logits/chosen": -2.4790871143341064, "logits/rejected": -2.2345595359802246, "logps/chosen": -202.48167419433594, "logps/rejected": -474.41192626953125, "loss": 0.1561, "rewards/accuracies": 1.0, "rewards/chosen": -0.5403665900230408, "rewards/margins": 2.5899345874786377, "rewards/rejected": -3.130300998687744, "step": 220 }, { "epoch": 0.16, "grad_norm": 1.578125, "learning_rate": 4.941368454363839e-06, "logits/chosen": -2.440495729446411, "logits/rejected": -2.1979377269744873, "logps/chosen": -222.61294555664062, "logps/rejected": -537.0726318359375, "loss": 0.1131, "rewards/accuracies": 1.0, "rewards/chosen": -0.6902516484260559, "rewards/margins": 3.114187717437744, "rewards/rejected": -3.804439067840576, "step": 230 }, { "epoch": 0.17, "grad_norm": 4.90625, "learning_rate": 4.927354543565131e-06, "logits/chosen": -2.3387157917022705, "logits/rejected": -2.126927614212036, "logps/chosen": -211.66836547851562, "logps/rejected": -530.5004272460938, "loss": 0.0964, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.5667862892150879, "rewards/margins": 2.8909993171691895, "rewards/rejected": -3.4577858448028564, "step": 240 }, { "epoch": 0.18, "grad_norm": 6.15625, "learning_rate": 4.911864681231942e-06, "logits/chosen": -2.1806998252868652, "logits/rejected": -2.0283143520355225, "logps/chosen": -263.16845703125, "logps/rejected": -556.828369140625, "loss": 0.1336, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.8755892515182495, "rewards/margins": 2.895956516265869, "rewards/rejected": -3.771545886993408, "step": 250 }, { "epoch": 0.18, "grad_norm": 0.443359375, "learning_rate": 4.894908285966157e-06, "logits/chosen": -2.236419200897217, "logits/rejected": -1.9524688720703125, "logps/chosen": -261.3115234375, "logps/rejected": -587.9168701171875, "loss": 0.0822, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -0.9085346460342407, "rewards/margins": 3.482004165649414, "rewards/rejected": -4.390538692474365, "step": 260 }, { "epoch": 0.19, "grad_norm": 2.21875, "learning_rate": 4.876495668094168e-06, "logits/chosen": -2.036257743835449, "logits/rejected": -1.8838832378387451, "logps/chosen": -226.5672607421875, "logps/rejected": -604.56884765625, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": -0.8008095026016235, "rewards/margins": 3.5851123332977295, "rewards/rejected": -4.385921478271484, "step": 270 }, { "epoch": 0.2, "grad_norm": 1.609375, "learning_rate": 4.856638023397685e-06, "logits/chosen": -1.990782380104065, "logits/rejected": -1.7287362813949585, "logps/chosen": -232.73056030273438, "logps/rejected": -619.3232421875, "loss": 0.0923, "rewards/accuracies": 1.0, "rewards/chosen": -0.8233655691146851, "rewards/margins": 3.937394618988037, "rewards/rejected": -4.760760307312012, "step": 280 }, { "epoch": 0.2, "grad_norm": 1.53125, "learning_rate": 4.8353474263061465e-06, "logits/chosen": -1.9733736515045166, "logits/rejected": -1.805991530418396, "logps/chosen": -239.6761474609375, "logps/rejected": -666.7669677734375, "loss": 0.0684, "rewards/accuracies": 1.0, "rewards/chosen": -0.8877469897270203, "rewards/margins": 4.119744300842285, "rewards/rejected": -5.007491111755371, "step": 290 }, { "epoch": 0.21, "grad_norm": 3.28125, "learning_rate": 4.812636822554873e-06, "logits/chosen": -1.8918040990829468, "logits/rejected": -1.7033321857452393, "logps/chosen": -259.3042297363281, "logps/rejected": -664.6048583984375, "loss": 0.0678, "rewards/accuracies": 1.0, "rewards/chosen": -1.0919157266616821, "rewards/margins": 3.7808890342712402, "rewards/rejected": -4.872804641723633, "step": 300 }, { "epoch": 0.21, "eval_logits/chosen": -2.1730568408966064, "eval_logits/rejected": -2.157489538192749, "eval_logps/chosen": -567.996826171875, "eval_logps/rejected": -508.4316101074219, "eval_loss": 0.8810476660728455, "eval_rewards/accuracies": 0.4227527976036072, "eval_rewards/chosen": -1.8411885499954224, "eval_rewards/margins": -0.0585799440741539, "eval_rewards/rejected": -1.7826087474822998, "eval_runtime": 656.0248, "eval_samples_per_second": 8.68, "eval_steps_per_second": 0.271, "step": 300 }, { "epoch": 0.22, "grad_norm": 1.671875, "learning_rate": 4.788520021313416e-06, "logits/chosen": -1.9999065399169922, "logits/rejected": -1.8646202087402344, "logps/chosen": -265.22320556640625, "logps/rejected": -667.613037109375, "loss": 0.0878, "rewards/accuracies": 1.0, "rewards/chosen": -1.286360263824463, "rewards/margins": 3.923821210861206, "rewards/rejected": -5.21018123626709, "step": 310 }, { "epoch": 0.23, "grad_norm": 1.234375, "learning_rate": 4.763011686788904e-06, "logits/chosen": -2.001868486404419, "logits/rejected": -1.6644681692123413, "logps/chosen": -342.4534606933594, "logps/rejected": -677.551025390625, "loss": 0.0666, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.5519191026687622, "rewards/margins": 3.5869154930114746, "rewards/rejected": -5.138834476470947, "step": 320 }, { "epoch": 0.23, "grad_norm": 13.8125, "learning_rate": 4.736127329309476e-06, "logits/chosen": -1.9961566925048828, "logits/rejected": -1.7332680225372314, "logps/chosen": -274.7004089355469, "logps/rejected": -663.12451171875, "loss": 0.0633, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -1.1638389825820923, "rewards/margins": 4.054037570953369, "rewards/rejected": -5.217876434326172, "step": 330 }, { "epoch": 0.24, "grad_norm": 10.1875, "learning_rate": 4.707883295893241e-06, "logits/chosen": -1.8405824899673462, "logits/rejected": -1.481626272201538, "logps/chosen": -288.8253173828125, "logps/rejected": -727.4976806640625, "loss": 0.0756, "rewards/accuracies": 1.0, "rewards/chosen": -1.2971699237823486, "rewards/margins": 4.394200325012207, "rewards/rejected": -5.691370487213135, "step": 340 }, { "epoch": 0.25, "grad_norm": 4.375, "learning_rate": 4.678296760308474e-06, "logits/chosen": -1.9200785160064697, "logits/rejected": -1.7004458904266357, "logps/chosen": -295.3323669433594, "logps/rejected": -754.2498779296875, "loss": 0.0597, "rewards/accuracies": 1.0, "rewards/chosen": -1.4161075353622437, "rewards/margins": 4.548061370849609, "rewards/rejected": -5.964169025421143, "step": 350 }, { "epoch": 0.25, "grad_norm": 0.447265625, "learning_rate": 4.647385712631127e-06, "logits/chosen": -1.9339395761489868, "logits/rejected": -1.755002737045288, "logps/chosen": -318.36163330078125, "logps/rejected": -770.7998046875, "loss": 0.0592, "rewards/accuracies": 1.0, "rewards/chosen": -1.5235289335250854, "rewards/margins": 4.540143013000488, "rewards/rejected": -6.063671588897705, "step": 360 }, { "epoch": 0.26, "grad_norm": 0.81640625, "learning_rate": 4.615168948305967e-06, "logits/chosen": -1.864012360572815, "logits/rejected": -1.6661291122436523, "logps/chosen": -292.7504577636719, "logps/rejected": -775.6510009765625, "loss": 0.0457, "rewards/accuracies": 1.0, "rewards/chosen": -1.2696479558944702, "rewards/margins": 4.875089168548584, "rewards/rejected": -6.144737243652344, "step": 370 }, { "epoch": 0.27, "grad_norm": 1.1640625, "learning_rate": 4.581666056718016e-06, "logits/chosen": -1.7839504480361938, "logits/rejected": -1.6283111572265625, "logps/chosen": -398.49481201171875, "logps/rejected": -897.6380615234375, "loss": 0.0541, "rewards/accuracies": 1.0, "rewards/chosen": -2.2088656425476074, "rewards/margins": 5.060395240783691, "rewards/rejected": -7.269261360168457, "step": 380 }, { "epoch": 0.28, "grad_norm": 1.4921875, "learning_rate": 4.546897409281241e-06, "logits/chosen": -1.694427251815796, "logits/rejected": -1.45779550075531, "logps/chosen": -358.84136962890625, "logps/rejected": -952.1790161132812, "loss": 0.0839, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -2.1445980072021484, "rewards/margins": 5.831234931945801, "rewards/rejected": -7.975832939147949, "step": 390 }, { "epoch": 0.28, "grad_norm": 3.3125, "learning_rate": 4.510884147051722e-06, "logits/chosen": -1.791006326675415, "logits/rejected": -1.4730372428894043, "logps/chosen": -456.94854736328125, "logps/rejected": -1104.5611572265625, "loss": 0.0755, "rewards/accuracies": 1.0, "rewards/chosen": -2.662592649459839, "rewards/margins": 6.464757442474365, "rewards/rejected": -9.127350807189941, "step": 400 }, { "epoch": 0.28, "eval_logits/chosen": -1.9529255628585815, "eval_logits/rejected": -1.9330235719680786, "eval_logps/chosen": -686.5277099609375, "eval_logps/rejected": -631.0067138671875, "eval_loss": 0.9693852663040161, "eval_rewards/accuracies": 0.44873595237731934, "eval_rewards/chosen": -3.0264980792999268, "eval_rewards/margins": -0.018139641731977463, "eval_rewards/rejected": -3.0083587169647217, "eval_runtime": 655.9386, "eval_samples_per_second": 8.681, "eval_steps_per_second": 0.271, "step": 400 }, { "epoch": 0.29, "grad_norm": 9.875, "learning_rate": 4.473648167872852e-06, "logits/chosen": -1.5779629945755005, "logits/rejected": -1.3739255666732788, "logps/chosen": -450.720703125, "logps/rejected": -1086.2620849609375, "loss": 0.0719, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9547436237335205, "rewards/margins": 6.432783603668213, "rewards/rejected": -9.387526512145996, "step": 410 }, { "epoch": 0.3, "grad_norm": 0.8046875, "learning_rate": 4.4352121130603576e-06, "logits/chosen": -1.5717002153396606, "logits/rejected": -1.4303909540176392, "logps/chosen": -425.1919860839844, "logps/rejected": -1012.8151245117188, "loss": 0.0392, "rewards/accuracies": 1.0, "rewards/chosen": -2.6251375675201416, "rewards/margins": 5.805235385894775, "rewards/rejected": -8.430373191833496, "step": 420 }, { "epoch": 0.3, "grad_norm": 4.25, "learning_rate": 4.395599353635269e-06, "logits/chosen": -1.8063325881958008, "logits/rejected": -1.5978657007217407, "logps/chosen": -391.03973388671875, "logps/rejected": -1139.81396484375, "loss": 0.0401, "rewards/accuracies": 1.0, "rewards/chosen": -2.1669018268585205, "rewards/margins": 7.3755998611450195, "rewards/rejected": -9.542501449584961, "step": 430 }, { "epoch": 0.31, "grad_norm": 2.90625, "learning_rate": 4.354833976113176e-06, "logits/chosen": -1.6995327472686768, "logits/rejected": -1.304896593093872, "logps/chosen": -442.50927734375, "logps/rejected": -1046.9505615234375, "loss": 0.0484, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.6409192085266113, "rewards/margins": 6.354380130767822, "rewards/rejected": -8.995299339294434, "step": 440 }, { "epoch": 0.32, "grad_norm": 14.1875, "learning_rate": 4.312940767858442e-06, "logits/chosen": -1.4967067241668701, "logits/rejected": -1.2330373525619507, "logps/chosen": -460.6571350097656, "logps/rejected": -1238.811767578125, "loss": 0.0529, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.039698362350464, "rewards/margins": 7.639819145202637, "rewards/rejected": -10.67951774597168, "step": 450 }, { "epoch": 0.32, "grad_norm": 13.875, "learning_rate": 4.2699452020122556e-06, "logits/chosen": -1.5593597888946533, "logits/rejected": -1.3835352659225464, "logps/chosen": -534.43359375, "logps/rejected": -1238.1392822265625, "loss": 0.0578, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.907667875289917, "rewards/margins": 7.0630784034729, "rewards/rejected": -10.970745086669922, "step": 460 }, { "epoch": 0.33, "grad_norm": 4.5, "learning_rate": 4.2258734220037075e-06, "logits/chosen": -1.4951756000518799, "logits/rejected": -1.0847975015640259, "logps/chosen": -468.1197204589844, "logps/rejected": -1282.1279296875, "loss": 0.0583, "rewards/accuracies": 1.0, "rewards/chosen": -3.1975932121276855, "rewards/margins": 7.988409519195557, "rewards/rejected": -11.186002731323242, "step": 470 }, { "epoch": 0.34, "grad_norm": 0.90234375, "learning_rate": 4.1807522256532925e-06, "logits/chosen": -1.4408828020095825, "logits/rejected": -1.2817597389221191, "logps/chosen": -459.5364685058594, "logps/rejected": -1234.5921630859375, "loss": 0.0512, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -3.050990581512451, "rewards/margins": 7.3619384765625, "rewards/rejected": -10.412928581237793, "step": 480 }, { "epoch": 0.35, "grad_norm": 0.60546875, "learning_rate": 4.134609048878504e-06, "logits/chosen": -1.439564824104309, "logits/rejected": -1.068268895149231, "logps/chosen": -430.1298828125, "logps/rejected": -1251.78271484375, "loss": 0.0767, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -2.9712727069854736, "rewards/margins": 8.197530746459961, "rewards/rejected": -11.168804168701172, "step": 490 }, { "epoch": 0.35, "grad_norm": 1.8359375, "learning_rate": 4.08747194901145e-06, "logits/chosen": -1.5350717306137085, "logits/rejected": -1.2265657186508179, "logps/chosen": -478.27838134765625, "logps/rejected": -1366.25048828125, "loss": 0.0722, "rewards/accuracies": 1.0, "rewards/chosen": -3.0265259742736816, "rewards/margins": 8.87561321258545, "rewards/rejected": -11.902138710021973, "step": 500 }, { "epoch": 0.35, "eval_logits/chosen": -1.7579246759414673, "eval_logits/rejected": -1.7334508895874023, "eval_logps/chosen": -776.935546875, "eval_logps/rejected": -734.9275512695312, "eval_loss": 0.9231775999069214, "eval_rewards/accuracies": 0.4852527976036072, "eval_rewards/chosen": -3.9305758476257324, "eval_rewards/margins": 0.11699170619249344, "eval_rewards/rejected": -4.047567844390869, "eval_runtime": 656.0216, "eval_samples_per_second": 8.68, "eval_steps_per_second": 0.271, "step": 500 }, { "epoch": 0.36, "grad_norm": 1.6328125, "learning_rate": 4.039369587738599e-06, "logits/chosen": -1.2768363952636719, "logits/rejected": -0.8066743016242981, "logps/chosen": -658.4061279296875, "logps/rejected": -1867.58984375, "loss": 0.0284, "rewards/accuracies": 1.0, "rewards/chosen": -5.000919818878174, "rewards/margins": 12.019338607788086, "rewards/rejected": -17.020259857177734, "step": 510 }, { "epoch": 0.37, "grad_norm": 0.1513671875, "learning_rate": 3.990331213673064e-06, "logits/chosen": -1.0913686752319336, "logits/rejected": -0.6941195726394653, "logps/chosen": -732.0529174804688, "logps/rejected": -1852.646240234375, "loss": 0.0507, "rewards/accuracies": 1.0, "rewards/chosen": -5.824349880218506, "rewards/margins": 11.102621078491211, "rewards/rejected": -16.926971435546875, "step": 520 }, { "epoch": 0.37, "grad_norm": 0.55078125, "learning_rate": 3.940386644569999e-06, "logits/chosen": -1.1093213558197021, "logits/rejected": -0.7086474895477295, "logps/chosen": -757.3285522460938, "logps/rejected": -2145.50537109375, "loss": 0.0277, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -5.836045742034912, "rewards/margins": 14.031903266906738, "rewards/rejected": -19.86794662475586, "step": 530 }, { "epoch": 0.38, "grad_norm": 0.2353515625, "learning_rate": 3.889566249195929e-06, "logits/chosen": -1.0829203128814697, "logits/rejected": -0.6696628332138062, "logps/chosen": -865.8997802734375, "logps/rejected": -2506.55908203125, "loss": 0.0192, "rewards/accuracies": 1.0, "rewards/chosen": -7.24310302734375, "rewards/margins": 16.113231658935547, "rewards/rejected": -23.356334686279297, "step": 540 }, { "epoch": 0.39, "grad_norm": 0.95703125, "learning_rate": 3.837900928863039e-06, "logits/chosen": -1.1552801132202148, "logits/rejected": -0.8314965963363647, "logps/chosen": -881.5584106445312, "logps/rejected": -2046.798095703125, "loss": 0.0533, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.256842613220215, "rewards/margins": 11.579842567443848, "rewards/rejected": -18.836687088012695, "step": 550 }, { "epoch": 0.4, "grad_norm": 0.224609375, "learning_rate": 3.7854220986396493e-06, "logits/chosen": -1.1160633563995361, "logits/rejected": -0.5819014310836792, "logps/chosen": -919.8805541992188, "logps/rejected": -2116.480224609375, "loss": 0.03, "rewards/accuracies": 1.0, "rewards/chosen": -7.002084255218506, "rewards/margins": 12.451542854309082, "rewards/rejected": -19.453628540039062, "step": 560 }, { "epoch": 0.4, "grad_norm": 25.0, "learning_rate": 3.732161668248303e-06, "logits/chosen": -0.9451616406440735, "logits/rejected": -0.6455805897712708, "logps/chosen": -1017.2947998046875, "logps/rejected": -2289.78125, "loss": 0.0234, "rewards/accuracies": 1.0, "rewards/chosen": -8.655045509338379, "rewards/margins": 12.746968269348145, "rewards/rejected": -21.402013778686523, "step": 570 }, { "epoch": 0.41, "grad_norm": 0.376953125, "learning_rate": 3.6781520226630735e-06, "logits/chosen": -0.9697924852371216, "logits/rejected": -0.4289473593235016, "logps/chosen": -1160.775634765625, "logps/rejected": -3142.683349609375, "loss": 0.0534, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.224233627319336, "rewards/margins": 19.751903533935547, "rewards/rejected": -29.976139068603516, "step": 580 }, { "epoch": 0.42, "grad_norm": 11.9375, "learning_rate": 3.6234260024179036e-06, "logits/chosen": -1.1209807395935059, "logits/rejected": -0.8063465356826782, "logps/chosen": -994.5657348632812, "logps/rejected": -2481.98193359375, "loss": 0.0718, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.60986328125, "rewards/margins": 14.783126831054688, "rewards/rejected": -23.392990112304688, "step": 590 }, { "epoch": 0.42, "grad_norm": 11.3125, "learning_rate": 3.568016883637936e-06, "logits/chosen": -1.2013523578643799, "logits/rejected": -0.7750760316848755, "logps/chosen": -965.8484497070312, "logps/rejected": -2581.89111328125, "loss": 0.0154, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.930583953857422, "rewards/margins": 16.107666015625, "rewards/rejected": -24.03824806213379, "step": 600 }, { "epoch": 0.42, "eval_logits/chosen": -1.6101704835891724, "eval_logits/rejected": -1.5838062763214111, "eval_logps/chosen": -1076.6463623046875, "eval_logps/rejected": -1046.2589111328125, "eval_loss": 1.0930993556976318, "eval_rewards/accuracies": 0.5042135119438171, "eval_rewards/chosen": -6.927684783935547, "eval_rewards/margins": 0.2331954836845398, "eval_rewards/rejected": -7.1608805656433105, "eval_runtime": 656.1015, "eval_samples_per_second": 8.679, "eval_steps_per_second": 0.271, "step": 600 }, { "epoch": 0.43, "grad_norm": 2.015625, "learning_rate": 3.5119583578059845e-06, "logits/chosen": -1.3500652313232422, "logits/rejected": -0.9361883997917175, "logps/chosen": -924.3231201171875, "logps/rejected": -2288.12939453125, "loss": 0.0723, "rewards/accuracies": 1.0, "rewards/chosen": -7.645608425140381, "rewards/margins": 13.6157808303833, "rewards/rejected": -21.261388778686523, "step": 610 }, { "epoch": 0.44, "grad_norm": 1.203125, "learning_rate": 3.455284511276448e-06, "logits/chosen": -1.339277982711792, "logits/rejected": -0.9697107076644897, "logps/chosen": -864.0202026367188, "logps/rejected": -2208.43408203125, "loss": 0.0536, "rewards/accuracies": 1.0, "rewards/chosen": -7.1323041915893555, "rewards/margins": 13.35148811340332, "rewards/rejected": -20.483795166015625, "step": 620 }, { "epoch": 0.44, "grad_norm": 2.671875, "learning_rate": 3.39802980454912e-06, "logits/chosen": -1.1924967765808105, "logits/rejected": -0.9132025837898254, "logps/chosen": -880.2484130859375, "logps/rejected": -2039.931640625, "loss": 0.09, "rewards/accuracies": 1.0, "rewards/chosen": -7.052211761474609, "rewards/margins": 11.527318954467773, "rewards/rejected": -18.579532623291016, "step": 630 }, { "epoch": 0.45, "grad_norm": 1.1328125, "learning_rate": 3.340229051315505e-06, "logits/chosen": -1.247047781944275, "logits/rejected": -0.9143965840339661, "logps/chosen": -864.1796875, "logps/rejected": -2199.156494140625, "loss": 0.0093, "rewards/accuracies": 1.0, "rewards/chosen": -6.9714202880859375, "rewards/margins": 13.445959091186523, "rewards/rejected": -20.41737937927246, "step": 640 }, { "epoch": 0.46, "grad_norm": 4.125, "learning_rate": 3.281917397290371e-06, "logits/chosen": -1.2007062435150146, "logits/rejected": -0.7931119203567505, "logps/chosen": -1153.964111328125, "logps/rejected": -2686.174072265625, "loss": 0.016, "rewards/accuracies": 1.0, "rewards/chosen": -9.766824722290039, "rewards/margins": 15.582504272460938, "rewards/rejected": -25.349327087402344, "step": 650 }, { "epoch": 0.47, "grad_norm": 6.875, "learning_rate": 3.2231302988414198e-06, "logits/chosen": -1.2824809551239014, "logits/rejected": -0.9021598100662231, "logps/chosen": -965.5451049804688, "logps/rejected": -2473.737060546875, "loss": 0.0456, "rewards/accuracies": 1.0, "rewards/chosen": -8.284971237182617, "rewards/margins": 15.063722610473633, "rewards/rejected": -23.348691940307617, "step": 660 }, { "epoch": 0.47, "grad_norm": 0.5546875, "learning_rate": 3.1639035014300583e-06, "logits/chosen": -1.2285374402999878, "logits/rejected": -0.8617483973503113, "logps/chosen": -876.5970458984375, "logps/rejected": -2488.18505859375, "loss": 0.0023, "rewards/accuracies": 1.0, "rewards/chosen": -7.433468818664551, "rewards/margins": 15.856343269348145, "rewards/rejected": -23.289812088012695, "step": 670 }, { "epoch": 0.48, "grad_norm": 24.375, "learning_rate": 3.104273017876399e-06, "logits/chosen": -1.259339451789856, "logits/rejected": -0.8192273378372192, "logps/chosen": -854.6680908203125, "logps/rejected": -2123.60693359375, "loss": 0.0551, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -7.36416482925415, "rewards/margins": 12.604635238647461, "rewards/rejected": -19.968799591064453, "step": 680 }, { "epoch": 0.49, "grad_norm": 19.125, "learning_rate": 3.044275106461678e-06, "logits/chosen": -1.1816984415054321, "logits/rejected": -0.8878567814826965, "logps/chosen": -909.8551635742188, "logps/rejected": -2352.97119140625, "loss": 0.0602, "rewards/accuracies": 1.0, "rewards/chosen": -7.709178924560547, "rewards/margins": 14.284971237182617, "rewards/rejected": -21.994152069091797, "step": 690 }, { "epoch": 0.49, "grad_norm": 0.01129150390625, "learning_rate": 2.983946248881433e-06, "logits/chosen": -1.3704578876495361, "logits/rejected": -1.0677545070648193, "logps/chosen": -945.0372314453125, "logps/rejected": -2145.31689453125, "loss": 0.0049, "rewards/accuracies": 1.0, "rewards/chosen": -7.863336086273193, "rewards/margins": 11.76345443725586, "rewards/rejected": -19.62679100036621, "step": 700 }, { "epoch": 0.49, "eval_logits/chosen": -1.7367656230926514, "eval_logits/rejected": -1.7125349044799805, "eval_logps/chosen": -1059.4774169921875, "eval_logps/rejected": -1021.320556640625, "eval_loss": 1.0778571367263794, "eval_rewards/accuracies": 0.49227526783943176, "eval_rewards/chosen": -6.755995750427246, "eval_rewards/margins": 0.1555027812719345, "eval_rewards/rejected": -6.911497592926025, "eval_runtime": 656.0222, "eval_samples_per_second": 8.68, "eval_steps_per_second": 0.271, "step": 700 }, { "epoch": 0.5, "grad_norm": 0.1279296875, "learning_rate": 2.923323128062825e-06, "logits/chosen": -1.411307692527771, "logits/rejected": -0.9907943606376648, "logps/chosen": -943.5584716796875, "logps/rejected": -2305.66552734375, "loss": 0.0097, "rewards/accuracies": 1.0, "rewards/chosen": -7.608894348144531, "rewards/margins": 13.67431640625, "rewards/rejected": -21.28321075439453, "step": 710 }, { "epoch": 0.51, "grad_norm": 0.056640625, "learning_rate": 2.8624426058596107e-06, "logits/chosen": -1.2269010543823242, "logits/rejected": -0.7131327986717224, "logps/chosen": -1085.274658203125, "logps/rejected": -2826.19873046875, "loss": 0.0081, "rewards/accuracies": 1.0, "rewards/chosen": -9.366337776184082, "rewards/margins": 17.45486068725586, "rewards/rejected": -26.821197509765625, "step": 720 }, { "epoch": 0.52, "grad_norm": 28.375, "learning_rate": 2.8013417006383078e-06, "logits/chosen": -1.2380268573760986, "logits/rejected": -0.7660568356513977, "logps/chosen": -1162.524169921875, "logps/rejected": -2991.434814453125, "loss": 0.0601, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.785096168518066, "rewards/margins": 18.30564308166504, "rewards/rejected": -28.090740203857422, "step": 730 }, { "epoch": 0.52, "grad_norm": 1.2109375, "learning_rate": 2.7400575647692046e-06, "logits/chosen": -1.389968752861023, "logits/rejected": -0.8731257319450378, "logps/chosen": -1055.5992431640625, "logps/rejected": -2900.40087890625, "loss": 0.0547, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.028341293334961, "rewards/margins": 18.45594024658203, "rewards/rejected": -27.48427963256836, "step": 740 }, { "epoch": 0.53, "grad_norm": 2.5, "learning_rate": 2.6786274620358773e-06, "logits/chosen": -1.3587383031845093, "logits/rejected": -0.8094767332077026, "logps/chosen": -1126.9896240234375, "logps/rejected": -2875.303955078125, "loss": 0.007, "rewards/accuracies": 1.0, "rewards/chosen": -9.480697631835938, "rewards/margins": 17.524669647216797, "rewards/rejected": -27.0053653717041, "step": 750 }, { "epoch": 0.54, "grad_norm": 0.59765625, "learning_rate": 2.61708874497697e-06, "logits/chosen": -1.3367315530776978, "logits/rejected": -0.8143109083175659, "logps/chosen": -1111.2471923828125, "logps/rejected": -2995.88330078125, "loss": 0.0112, "rewards/accuracies": 1.0, "rewards/chosen": -9.43135929107666, "rewards/margins": 19.074649810791016, "rewards/rejected": -28.506006240844727, "step": 760 }, { "epoch": 0.54, "grad_norm": 2.765625, "learning_rate": 2.5554788321740054e-06, "logits/chosen": -1.1412795782089233, "logits/rejected": -0.647286057472229, "logps/chosen": -1577.8646240234375, "logps/rejected": -3706.69384765625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -13.945138931274414, "rewards/margins": 21.588544845581055, "rewards/rejected": -35.53368377685547, "step": 770 }, { "epoch": 0.55, "grad_norm": 0.0712890625, "learning_rate": 2.493835185499039e-06, "logits/chosen": -1.161700963973999, "logits/rejected": -0.7219703197479248, "logps/chosen": -1308.0667724609375, "logps/rejected": -3078.884033203125, "loss": 0.0434, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -11.402704238891602, "rewards/margins": 17.901020050048828, "rewards/rejected": -29.303722381591797, "step": 780 }, { "epoch": 0.56, "grad_norm": 0.07568359375, "learning_rate": 2.4321952873359842e-06, "logits/chosen": -1.269687533378601, "logits/rejected": -0.6700019240379333, "logps/chosen": -1245.6978759765625, "logps/rejected": -3713.414794921875, "loss": 0.0197, "rewards/accuracies": 1.0, "rewards/chosen": -10.587652206420898, "rewards/margins": 24.8172607421875, "rewards/rejected": -35.40491485595703, "step": 790 }, { "epoch": 0.56, "grad_norm": 3.09375, "learning_rate": 2.3705966177894763e-06, "logits/chosen": -1.2091081142425537, "logits/rejected": -0.7592669129371643, "logps/chosen": -1207.2398681640625, "logps/rejected": -3553.341796875, "loss": 0.033, "rewards/accuracies": 1.0, "rewards/chosen": -10.607110977172852, "rewards/margins": 23.20706558227539, "rewards/rejected": -33.81417465209961, "step": 800 }, { "epoch": 0.56, "eval_logits/chosen": -1.6792893409729004, "eval_logits/rejected": -1.6524384021759033, "eval_logps/chosen": -1390.235107421875, "eval_logps/rejected": -1383.8111572265625, "eval_loss": 1.3468295335769653, "eval_rewards/accuracies": 0.5372191071510315, "eval_rewards/chosen": -10.063570976257324, "eval_rewards/margins": 0.47283390164375305, "eval_rewards/rejected": -10.53640365600586, "eval_runtime": 655.8108, "eval_samples_per_second": 8.682, "eval_steps_per_second": 0.271, "step": 800 }, { "epoch": 0.57, "grad_norm": 0.60546875, "learning_rate": 2.309076631895116e-06, "logits/chosen": -1.2587814331054688, "logits/rejected": -0.7719920873641968, "logps/chosen": -1253.469970703125, "logps/rejected": -3200.095458984375, "loss": 0.0841, "rewards/accuracies": 1.0, "rewards/chosen": -10.87464714050293, "rewards/margins": 19.867063522338867, "rewards/rejected": -30.741710662841797, "step": 810 }, { "epoch": 0.58, "grad_norm": 7.28125, "learning_rate": 2.2476727368449487e-06, "logits/chosen": -1.248982548713684, "logits/rejected": -0.6828194260597229, "logps/chosen": -1316.671630859375, "logps/rejected": -3310.6484375, "loss": 0.0095, "rewards/accuracies": 1.0, "rewards/chosen": -11.567731857299805, "rewards/margins": 19.971338272094727, "rewards/rejected": -31.5390682220459, "step": 820 }, { "epoch": 0.59, "grad_norm": 2.3125, "learning_rate": 2.1864222692420555e-06, "logits/chosen": -1.3420377969741821, "logits/rejected": -0.8499332666397095, "logps/chosen": -1105.1287841796875, "logps/rejected": -3251.79345703125, "loss": 0.0469, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.636556625366211, "rewards/margins": 21.3193302154541, "rewards/rejected": -30.955890655517578, "step": 830 }, { "epoch": 0.59, "grad_norm": 0.01214599609375, "learning_rate": 2.125362472398041e-06, "logits/chosen": -1.2289090156555176, "logits/rejected": -0.8354812860488892, "logps/chosen": -1310.153564453125, "logps/rejected": -3590.446044921875, "loss": 0.0216, "rewards/accuracies": 1.0, "rewards/chosen": -11.406636238098145, "rewards/margins": 22.582155227661133, "rewards/rejected": -33.988792419433594, "step": 840 }, { "epoch": 0.6, "grad_norm": 1.03125, "learning_rate": 2.0645304736872683e-06, "logits/chosen": -1.2989423274993896, "logits/rejected": -0.7755887508392334, "logps/chosen": -1139.165283203125, "logps/rejected": -3502.372314453125, "loss": 0.0038, "rewards/accuracies": 1.0, "rewards/chosen": -9.853492736816406, "rewards/margins": 23.74998664855957, "rewards/rejected": -33.603477478027344, "step": 850 }, { "epoch": 0.61, "grad_norm": 0.36328125, "learning_rate": 2.0039632619715724e-06, "logits/chosen": -1.2742412090301514, "logits/rejected": -0.886884331703186, "logps/chosen": -1159.204345703125, "logps/rejected": -3300.87109375, "loss": 0.0485, "rewards/accuracies": 1.0, "rewards/chosen": -10.175374984741211, "rewards/margins": 21.167037963867188, "rewards/rejected": -31.342416763305664, "step": 860 }, { "epoch": 0.61, "grad_norm": 5.435943603515625e-05, "learning_rate": 1.9436976651092143e-06, "logits/chosen": -1.4048194885253906, "logits/rejected": -0.8454034924507141, "logps/chosen": -1091.2130126953125, "logps/rejected": -3097.588623046875, "loss": 0.1044, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.176810264587402, "rewards/margins": 20.21010971069336, "rewards/rejected": -29.386920928955078, "step": 870 }, { "epoch": 0.62, "grad_norm": 0.0185546875, "learning_rate": 1.8837703275617106e-06, "logits/chosen": -1.3679651021957397, "logits/rejected": -0.9742671251296997, "logps/chosen": -1107.792724609375, "logps/rejected": -3230.29638671875, "loss": 0.0086, "rewards/accuracies": 1.0, "rewards/chosen": -9.241239547729492, "rewards/margins": 21.296463012695312, "rewards/rejected": -30.537700653076172, "step": 880 }, { "epoch": 0.63, "grad_norm": 1.1484375, "learning_rate": 1.8242176881122004e-06, "logits/chosen": -1.3404251337051392, "logits/rejected": -0.8698049783706665, "logps/chosen": -1174.1446533203125, "logps/rejected": -3308.083251953125, "loss": 0.0611, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.142600059509277, "rewards/margins": 21.470674514770508, "rewards/rejected": -31.613271713256836, "step": 890 }, { "epoch": 0.64, "grad_norm": 0.00439453125, "learning_rate": 1.765075957708856e-06, "logits/chosen": -1.4810196161270142, "logits/rejected": -1.0637757778167725, "logps/chosen": -1093.116943359375, "logps/rejected": -2951.527099609375, "loss": 0.0068, "rewards/accuracies": 1.0, "rewards/chosen": -9.223703384399414, "rewards/margins": 18.70016098022461, "rewards/rejected": -27.923864364624023, "step": 900 }, { "epoch": 0.64, "eval_logits/chosen": -1.7685190439224243, "eval_logits/rejected": -1.7439854145050049, "eval_logps/chosen": -1204.1876220703125, "eval_logps/rejected": -1185.1373291015625, "eval_loss": 1.1494473218917847, "eval_rewards/accuracies": 0.5224719047546387, "eval_rewards/chosen": -8.203096389770508, "eval_rewards/margins": 0.3465690612792969, "eval_rewards/rejected": -8.549664497375488, "eval_runtime": 656.2998, "eval_samples_per_second": 8.676, "eval_steps_per_second": 0.271, "step": 900 }, { "epoch": 0.64, "grad_norm": 21.875, "learning_rate": 1.706381097446845e-06, "logits/chosen": -1.3406823873519897, "logits/rejected": -0.8678423166275024, "logps/chosen": -1017.1760864257812, "logps/rejected": -2439.1611328125, "loss": 0.0813, "rewards/accuracies": 1.0, "rewards/chosen": -8.433879852294922, "rewards/margins": 14.480337142944336, "rewards/rejected": -22.91421890258789, "step": 910 }, { "epoch": 0.65, "grad_norm": 3.265625, "learning_rate": 1.6481687967021976e-06, "logits/chosen": -1.3469676971435547, "logits/rejected": -1.037846565246582, "logps/chosen": -1005.4847412109375, "logps/rejected": -2438.32861328125, "loss": 0.0522, "rewards/accuracies": 1.0, "rewards/chosen": -8.364850997924805, "rewards/margins": 14.493901252746582, "rewards/rejected": -22.858753204345703, "step": 920 }, { "epoch": 0.66, "grad_norm": 0.0072021484375, "learning_rate": 1.590474451430911e-06, "logits/chosen": -1.4652862548828125, "logits/rejected": -1.009216547012329, "logps/chosen": -959.9542236328125, "logps/rejected": -2865.4619140625, "loss": 0.0013, "rewards/accuracies": 1.0, "rewards/chosen": -7.892512321472168, "rewards/margins": 18.82561683654785, "rewards/rejected": -26.718130111694336, "step": 930 }, { "epoch": 0.66, "grad_norm": 1.71875, "learning_rate": 1.5333331426464532e-06, "logits/chosen": -1.3715388774871826, "logits/rejected": -0.9282493591308594, "logps/chosen": -1106.976806640625, "logps/rejected": -2604.638427734375, "loss": 0.0195, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.100738525390625, "rewards/margins": 15.058537483215332, "rewards/rejected": -24.159276962280273, "step": 940 }, { "epoch": 0.67, "grad_norm": 20.75, "learning_rate": 1.4767796150887725e-06, "logits/chosen": -1.47031569480896, "logits/rejected": -1.1085163354873657, "logps/chosen": -1109.52490234375, "logps/rejected": -2674.32861328125, "loss": 0.0245, "rewards/accuracies": 1.0, "rewards/chosen": -9.363363265991211, "rewards/margins": 15.712671279907227, "rewards/rejected": -25.076034545898438, "step": 950 }, { "epoch": 0.68, "grad_norm": 0.306640625, "learning_rate": 1.4208482560977848e-06, "logits/chosen": -1.4269269704818726, "logits/rejected": -0.9111806750297546, "logps/chosen": -1008.8807373046875, "logps/rejected": -2636.96923828125, "loss": 0.0117, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.473782539367676, "rewards/margins": 16.409137725830078, "rewards/rejected": -24.882923126220703, "step": 960 }, { "epoch": 0.69, "grad_norm": 0.86328125, "learning_rate": 1.3655730747041608e-06, "logits/chosen": -1.4213557243347168, "logits/rejected": -0.8652811050415039, "logps/chosen": -1088.4149169921875, "logps/rejected": -2928.28369140625, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -9.118019104003906, "rewards/margins": 18.76186180114746, "rewards/rejected": -27.8798828125, "step": 970 }, { "epoch": 0.69, "grad_norm": 0.62109375, "learning_rate": 1.310987680950166e-06, "logits/chosen": -1.4652445316314697, "logits/rejected": -1.0953832864761353, "logps/chosen": -1127.3375244140625, "logps/rejected": -3079.28955078125, "loss": 0.0839, "rewards/accuracies": 1.0, "rewards/chosen": -9.71268081665039, "rewards/margins": 19.30825424194336, "rewards/rejected": -29.02093505859375, "step": 980 }, { "epoch": 0.7, "grad_norm": 6.1875, "learning_rate": 1.2571252654530835e-06, "logits/chosen": -1.368370532989502, "logits/rejected": -1.0655570030212402, "logps/chosen": -1173.7587890625, "logps/rejected": -2970.58642578125, "loss": 0.0085, "rewards/accuracies": 1.0, "rewards/chosen": -9.98849105834961, "rewards/margins": 17.761144638061523, "rewards/rejected": -27.749637603759766, "step": 990 }, { "epoch": 0.71, "grad_norm": 0.57421875, "learning_rate": 1.2040185792236874e-06, "logits/chosen": -1.1881918907165527, "logits/rejected": -0.6709830164909363, "logps/chosen": -1286.3770751953125, "logps/rejected": -2953.1875, "loss": 0.0054, "rewards/accuracies": 1.0, "rewards/chosen": -11.300887107849121, "rewards/margins": 16.589113235473633, "rewards/rejected": -27.890003204345703, "step": 1000 }, { "epoch": 0.71, "eval_logits/chosen": -1.7660337686538696, "eval_logits/rejected": -1.7417700290679932, "eval_logps/chosen": -1295.65478515625, "eval_logps/rejected": -1279.202392578125, "eval_loss": 1.210878849029541, "eval_rewards/accuracies": 0.521769642829895, "eval_rewards/chosen": -9.117769241333008, "eval_rewards/margins": 0.37254810333251953, "eval_rewards/rejected": -9.490316390991211, "eval_runtime": 656.0444, "eval_samples_per_second": 8.679, "eval_steps_per_second": 0.271, "step": 1000 }, { "epoch": 0.71, "grad_norm": 0.7265625, "learning_rate": 1.1516999137520023e-06, "logits/chosen": -1.427712321281433, "logits/rejected": -1.0445911884307861, "logps/chosen": -1226.722412109375, "logps/rejected": -2748.0888671875, "loss": 0.0123, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.473367691040039, "rewards/margins": 15.225300788879395, "rewards/rejected": -25.698665618896484, "step": 1010 }, { "epoch": 0.72, "grad_norm": 35.0, "learning_rate": 1.1002010813724851e-06, "logits/chosen": -1.3768597841262817, "logits/rejected": -0.8904365301132202, "logps/chosen": -1194.642333984375, "logps/rejected": -2986.991455078125, "loss": 0.0421, "rewards/accuracies": 1.0, "rewards/chosen": -10.254484176635742, "rewards/margins": 18.088577270507812, "rewards/rejected": -28.343059539794922, "step": 1020 }, { "epoch": 0.73, "grad_norm": 1.4140625, "learning_rate": 1.0495533959205506e-06, "logits/chosen": -1.2557332515716553, "logits/rejected": -0.9599855542182922, "logps/chosen": -1217.221923828125, "logps/rejected": -2950.060546875, "loss": 0.0444, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -10.468454360961914, "rewards/margins": 17.50444984436035, "rewards/rejected": -27.9729061126709, "step": 1030 }, { "epoch": 0.73, "grad_norm": 0.06884765625, "learning_rate": 9.997876536922175e-07, "logits/chosen": -1.3843439817428589, "logits/rejected": -1.00477933883667, "logps/chosen": -1080.3974609375, "logps/rejected": -3273.93310546875, "loss": 0.008, "rewards/accuracies": 1.0, "rewards/chosen": -9.390206336975098, "rewards/margins": 21.491409301757812, "rewards/rejected": -30.881616592407227, "step": 1040 }, { "epoch": 0.74, "grad_norm": 1.296875, "learning_rate": 9.509341147184306e-07, "logits/chosen": -1.244557499885559, "logits/rejected": -0.7361448407173157, "logps/chosen": -1227.6064453125, "logps/rejected": -3095.39990234375, "loss": 0.0345, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.757524490356445, "rewards/margins": 18.573318481445312, "rewards/rejected": -29.33084487915039, "step": 1050 }, { "epoch": 0.75, "grad_norm": 0.275390625, "learning_rate": 9.030224843654739e-07, "logits/chosen": -1.511866569519043, "logits/rejected": -1.08823561668396, "logps/chosen": -1069.511474609375, "logps/rejected": -2755.186767578125, "loss": 0.0183, "rewards/accuracies": 1.0, "rewards/chosen": -9.144437789916992, "rewards/margins": 17.022851943969727, "rewards/rejected": -26.167287826538086, "step": 1060 }, { "epoch": 0.76, "grad_norm": 7.0, "learning_rate": 8.560818952726329e-07, "logits/chosen": -1.401003122329712, "logits/rejected": -1.0348355770111084, "logps/chosen": -1216.390869140625, "logps/rejected": -2738.28857421875, "loss": 0.0401, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -10.61055850982666, "rewards/margins": 15.326191902160645, "rewards/rejected": -25.936748504638672, "step": 1070 }, { "epoch": 0.76, "grad_norm": 1.8125, "learning_rate": 8.101408896381141e-07, "logits/chosen": -1.4727786779403687, "logits/rejected": -0.9919303059577942, "logps/chosen": -1113.8826904296875, "logps/rejected": -2898.84326171875, "loss": 0.0561, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.318387985229492, "rewards/margins": 18.01841926574707, "rewards/rejected": -27.336811065673828, "step": 1080 }, { "epoch": 0.77, "grad_norm": 3.984375, "learning_rate": 7.652274018639791e-07, "logits/chosen": -1.4749600887298584, "logits/rejected": -1.1055461168289185, "logps/chosen": -1062.56103515625, "logps/rejected": -2858.611572265625, "loss": 0.0176, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.10958480834961, "rewards/margins": 17.9027042388916, "rewards/rejected": -27.01228904724121, "step": 1090 }, { "epoch": 0.78, "grad_norm": 0.00131988525390625, "learning_rate": 7.213687415706416e-07, "logits/chosen": -1.4862587451934814, "logits/rejected": -1.1032100915908813, "logps/chosen": -987.9553833007812, "logps/rejected": -2840.3037109375, "loss": 0.0221, "rewards/accuracies": 1.0, "rewards/chosen": -8.071586608886719, "rewards/margins": 18.485403060913086, "rewards/rejected": -26.556987762451172, "step": 1100 }, { "epoch": 0.78, "eval_logits/chosen": -1.7971382141113281, "eval_logits/rejected": -1.7736546993255615, "eval_logps/chosen": -1209.494873046875, "eval_logps/rejected": -1187.998291015625, "eval_loss": 1.1414741277694702, "eval_rewards/accuracies": 0.5133426785469055, "eval_rewards/chosen": -8.256168365478516, "eval_rewards/margins": 0.32210543751716614, "eval_rewards/rejected": -8.57827377319336, "eval_runtime": 655.5822, "eval_samples_per_second": 8.685, "eval_steps_per_second": 0.272, "step": 1100 }, { "epoch": 0.78, "grad_norm": 0.00022411346435546875, "learning_rate": 6.785915769912763e-07, "logits/chosen": -1.3605607748031616, "logits/rejected": -0.9404409527778625, "logps/chosen": -1157.4788818359375, "logps/rejected": -3032.787841796875, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -9.797929763793945, "rewards/margins": 18.825809478759766, "rewards/rejected": -28.62373924255371, "step": 1110 }, { "epoch": 0.79, "grad_norm": 0.0283203125, "learning_rate": 6.369219187562064e-07, "logits/chosen": -1.2791340351104736, "logits/rejected": -0.8162029981613159, "logps/chosen": -1043.4237060546875, "logps/rejected": -2922.19189453125, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -8.613737106323242, "rewards/margins": 18.991708755493164, "rewards/rejected": -27.605443954467773, "step": 1120 }, { "epoch": 0.8, "grad_norm": 2.5, "learning_rate": 5.963851040771639e-07, "logits/chosen": -1.3994640111923218, "logits/rejected": -0.952297568321228, "logps/chosen": -1053.916748046875, "logps/rejected": -2627.59521484375, "loss": 0.0041, "rewards/accuracies": 1.0, "rewards/chosen": -8.838054656982422, "rewards/margins": 15.89689826965332, "rewards/rejected": -24.73495101928711, "step": 1130 }, { "epoch": 0.81, "grad_norm": 0.01434326171875, "learning_rate": 5.570057813410043e-07, "logits/chosen": -1.3779122829437256, "logits/rejected": -1.0176897048950195, "logps/chosen": -1081.74462890625, "logps/rejected": -2848.0478515625, "loss": 0.0692, "rewards/accuracies": 1.0, "rewards/chosen": -9.363855361938477, "rewards/margins": 17.53445053100586, "rewards/rejected": -26.898305892944336, "step": 1140 }, { "epoch": 0.81, "grad_norm": 36.25, "learning_rate": 5.188078951222745e-07, "logits/chosen": -1.4497394561767578, "logits/rejected": -1.0479614734649658, "logps/chosen": -1025.534423828125, "logps/rejected": -2559.9140625, "loss": 0.0827, "rewards/accuracies": 1.0, "rewards/chosen": -8.517609596252441, "rewards/margins": 15.235819816589355, "rewards/rejected": -23.753429412841797, "step": 1150 }, { "epoch": 0.82, "grad_norm": 0.00750732421875, "learning_rate": 4.818146716237248e-07, "logits/chosen": -1.38028085231781, "logits/rejected": -1.0845643281936646, "logps/chosen": -1096.662841796875, "logps/rejected": -2663.427734375, "loss": 0.0308, "rewards/accuracies": 1.0, "rewards/chosen": -9.4973783493042, "rewards/margins": 15.601678848266602, "rewards/rejected": -25.099056243896484, "step": 1160 }, { "epoch": 0.83, "grad_norm": 10.25, "learning_rate": 4.4604860455363415e-07, "logits/chosen": -1.4428985118865967, "logits/rejected": -0.9759352803230286, "logps/chosen": -1115.9844970703125, "logps/rejected": -2557.877685546875, "loss": 0.0415, "rewards/accuracies": 0.949999988079071, "rewards/chosen": -9.554030418395996, "rewards/margins": 14.485267639160156, "rewards/rejected": -24.039297103881836, "step": 1170 }, { "epoch": 0.83, "grad_norm": 29.75, "learning_rate": 4.1153144144851746e-07, "logits/chosen": -1.4641423225402832, "logits/rejected": -1.1280659437179565, "logps/chosen": -1110.2255859375, "logps/rejected": -2608.10986328125, "loss": 0.0179, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -9.455390930175781, "rewards/margins": 14.743484497070312, "rewards/rejected": -24.198875427246094, "step": 1180 }, { "epoch": 0.84, "grad_norm": 10.9375, "learning_rate": 3.7828417044955465e-07, "logits/chosen": -1.5029444694519043, "logits/rejected": -1.0327703952789307, "logps/chosen": -1037.093994140625, "logps/rejected": -2658.862548828125, "loss": 0.0092, "rewards/accuracies": 1.0, "rewards/chosen": -8.763093948364258, "rewards/margins": 16.484668731689453, "rewards/rejected": -25.24776268005371, "step": 1190 }, { "epoch": 0.85, "grad_norm": 0.462890625, "learning_rate": 3.463270075407585e-07, "logits/chosen": -1.3929235935211182, "logits/rejected": -0.9737609028816223, "logps/chosen": -955.1953125, "logps/rejected": -2510.16259765625, "loss": 0.0523, "rewards/accuracies": 1.0, "rewards/chosen": -8.147527694702148, "rewards/margins": 15.567843437194824, "rewards/rejected": -23.71537208557129, "step": 1200 }, { "epoch": 0.85, "eval_logits/chosen": -1.800801157951355, "eval_logits/rejected": -1.777402400970459, "eval_logps/chosen": -1208.180419921875, "eval_logps/rejected": -1186.5596923828125, "eval_loss": 1.1390780210494995, "eval_rewards/accuracies": 0.5126404762268066, "eval_rewards/chosen": -8.243023872375488, "eval_rewards/margins": 0.3208653926849365, "eval_rewards/rejected": -8.563888549804688, "eval_runtime": 655.9309, "eval_samples_per_second": 8.681, "eval_steps_per_second": 0.271, "step": 1200 }, { "epoch": 0.85, "grad_norm": 27.625, "learning_rate": 3.1567938425665995e-07, "logits/chosen": -1.4732145071029663, "logits/rejected": -1.0730016231536865, "logps/chosen": -1028.572998046875, "logps/rejected": -2395.628173828125, "loss": 0.014, "rewards/accuracies": 1.0, "rewards/chosen": -8.665718078613281, "rewards/margins": 13.804449081420898, "rewards/rejected": -22.470169067382812, "step": 1210 }, { "epoch": 0.86, "grad_norm": 0.0478515625, "learning_rate": 2.8635993586697555e-07, "logits/chosen": -1.4387049674987793, "logits/rejected": -1.0289270877838135, "logps/chosen": -1012.2488403320312, "logps/rejected": -2634.938232421875, "loss": 0.0226, "rewards/accuracies": 1.0, "rewards/chosen": -8.564096450805664, "rewards/margins": 16.44196128845215, "rewards/rejected": -25.006057739257812, "step": 1220 }, { "epoch": 0.87, "grad_norm": 0.0033111572265625, "learning_rate": 2.583864900454386e-07, "logits/chosen": -1.277785062789917, "logits/rejected": -0.8318522572517395, "logps/chosen": -1057.027587890625, "logps/rejected": -2829.741943359375, "loss": 0.0059, "rewards/accuracies": 1.0, "rewards/chosen": -9.079306602478027, "rewards/margins": 17.6206111907959, "rewards/rejected": -26.699920654296875, "step": 1230 }, { "epoch": 0.88, "grad_norm": 0.0247802734375, "learning_rate": 2.317760560296975e-07, "logits/chosen": -1.4619300365447998, "logits/rejected": -1.1385523080825806, "logps/chosen": -939.02880859375, "logps/rejected": -2747.52001953125, "loss": 0.0037, "rewards/accuracies": 1.0, "rewards/chosen": -7.453832149505615, "rewards/margins": 18.143102645874023, "rewards/rejected": -25.596935272216797, "step": 1240 }, { "epoch": 0.88, "grad_norm": 0.1181640625, "learning_rate": 2.065448142788537e-07, "logits/chosen": -1.4030250310897827, "logits/rejected": -1.065373182296753, "logps/chosen": -1141.380126953125, "logps/rejected": -2657.07177734375, "loss": 0.0469, "rewards/accuracies": 1.0, "rewards/chosen": -9.86433219909668, "rewards/margins": 15.284082412719727, "rewards/rejected": -25.14841651916504, "step": 1250 }, { "epoch": 0.89, "grad_norm": 3.09375, "learning_rate": 1.8270810663494591e-07, "logits/chosen": -1.447638750076294, "logits/rejected": -1.1313003301620483, "logps/chosen": -1005.6617431640625, "logps/rejected": -2633.3125, "loss": 0.0105, "rewards/accuracies": 1.0, "rewards/chosen": -8.672689437866211, "rewards/margins": 16.14659881591797, "rewards/rejected": -24.819286346435547, "step": 1260 }, { "epoch": 0.9, "grad_norm": 0.953125, "learning_rate": 1.602804269943503e-07, "logits/chosen": -1.3622468709945679, "logits/rejected": -0.7782484292984009, "logps/chosen": -1040.588623046875, "logps/rejected": -2571.0732421875, "loss": 0.0053, "rewards/accuracies": 1.0, "rewards/chosen": -8.804241180419922, "rewards/margins": 15.194506645202637, "rewards/rejected": -23.998748779296875, "step": 1270 }, { "epoch": 0.9, "grad_norm": 0.357421875, "learning_rate": 1.3927541249477732e-07, "logits/chosen": -1.443404197692871, "logits/rejected": -0.9848993420600891, "logps/chosen": -1129.6878662109375, "logps/rejected": -2819.124267578125, "loss": 0.0178, "rewards/accuracies": 1.0, "rewards/chosen": -9.561059951782227, "rewards/margins": 16.899639129638672, "rewards/rejected": -26.460702896118164, "step": 1280 }, { "epoch": 0.91, "grad_norm": 4.9375, "learning_rate": 1.197058352232147e-07, "logits/chosen": -1.4344743490219116, "logits/rejected": -1.0912712812423706, "logps/chosen": -1007.9430541992188, "logps/rejected": -2606.56591796875, "loss": 0.0128, "rewards/accuracies": 1.0, "rewards/chosen": -8.628483772277832, "rewards/margins": 15.96386432647705, "rewards/rejected": -24.592348098754883, "step": 1290 }, { "epoch": 0.92, "grad_norm": 2.71875, "learning_rate": 1.0158359444987054e-07, "logits/chosen": -1.334395170211792, "logits/rejected": -1.0543233156204224, "logps/chosen": -995.3160400390625, "logps/rejected": -2497.846435546875, "loss": 0.0089, "rewards/accuracies": 1.0, "rewards/chosen": -8.33879280090332, "rewards/margins": 14.844099044799805, "rewards/rejected": -23.182891845703125, "step": 1300 }, { "epoch": 0.92, "eval_logits/chosen": -1.8012746572494507, "eval_logits/rejected": -1.7778656482696533, "eval_logps/chosen": -1207.9892578125, "eval_logps/rejected": -1186.5936279296875, "eval_loss": 1.1386491060256958, "eval_rewards/accuracies": 0.5126404762268066, "eval_rewards/chosen": -8.241113662719727, "eval_rewards/margins": 0.32311534881591797, "eval_rewards/rejected": -8.564229965209961, "eval_runtime": 655.9595, "eval_samples_per_second": 8.68, "eval_steps_per_second": 0.271, "step": 1300 }, { "epoch": 0.93, "grad_norm": 0.007476806640625, "learning_rate": 8.491970939282613e-08, "logits/chosen": -1.4563968181610107, "logits/rejected": -0.9708026051521301, "logps/chosen": -1012.9606323242188, "logps/rejected": -2704.7099609375, "loss": 0.036, "rewards/accuracies": 1.0, "rewards/chosen": -8.506877899169922, "rewards/margins": 16.914335250854492, "rewards/rejected": -25.421215057373047, "step": 1310 }, { "epoch": 0.93, "grad_norm": 0.70703125, "learning_rate": 6.972431251780931e-08, "logits/chosen": -1.4861555099487305, "logits/rejected": -1.0724611282348633, "logps/chosen": -1050.627197265625, "logps/rejected": -2748.00537109375, "loss": 0.0428, "rewards/accuracies": 1.0, "rewards/chosen": -8.896292686462402, "rewards/margins": 17.019859313964844, "rewards/rejected": -25.916152954101562, "step": 1320 }, { "epoch": 0.94, "grad_norm": 10.0625, "learning_rate": 5.600664337715167e-08, "logits/chosen": -1.5016971826553345, "logits/rejected": -1.0134552717208862, "logps/chosen": -1083.9617919921875, "logps/rejected": -2744.57275390625, "loss": 0.0124, "rewards/accuracies": 1.0, "rewards/chosen": -8.885272026062012, "rewards/margins": 16.872028350830078, "rewards/rejected": -25.75729751586914, "step": 1330 }, { "epoch": 0.95, "grad_norm": 0.83984375, "learning_rate": 4.3775042991686944e-08, "logits/chosen": -1.4656665325164795, "logits/rejected": -1.0450032949447632, "logps/chosen": -1093.253173828125, "logps/rejected": -2482.2109375, "loss": 0.0099, "rewards/accuracies": 1.0, "rewards/chosen": -9.25328540802002, "rewards/margins": 14.060190200805664, "rewards/rejected": -23.313474655151367, "step": 1340 }, { "epoch": 0.95, "grad_norm": 0.029296875, "learning_rate": 3.303694877899666e-08, "logits/chosen": -1.4124219417572021, "logits/rejected": -1.1058433055877686, "logps/chosen": -1101.9676513671875, "logps/rejected": -2878.31982421875, "loss": 0.0103, "rewards/accuracies": 1.0, "rewards/chosen": -9.427397727966309, "rewards/margins": 17.64303207397461, "rewards/rejected": -27.070430755615234, "step": 1350 }, { "epoch": 0.96, "grad_norm": 1.6953125, "learning_rate": 2.3798890031092037e-08, "logits/chosen": -1.4580352306365967, "logits/rejected": -0.9887199401855469, "logps/chosen": -1025.1748046875, "logps/rejected": -2862.871337890625, "loss": 0.0113, "rewards/accuracies": 1.0, "rewards/chosen": -8.556358337402344, "rewards/margins": 18.30660629272461, "rewards/rejected": -26.862966537475586, "step": 1360 }, { "epoch": 0.97, "grad_norm": 0.248046875, "learning_rate": 1.606648394428284e-08, "logits/chosen": -1.364495873451233, "logits/rejected": -0.9767102003097534, "logps/chosen": -1100.37646484375, "logps/rejected": -3104.2509765625, "loss": 0.0189, "rewards/accuracies": 1.0, "rewards/chosen": -9.444318771362305, "rewards/margins": 20.047155380249023, "rewards/rejected": -29.491479873657227, "step": 1370 }, { "epoch": 0.97, "grad_norm": 0.2392578125, "learning_rate": 9.844432203644228e-09, "logits/chosen": -1.482132911682129, "logits/rejected": -0.9816814661026001, "logps/chosen": -982.9803466796875, "logps/rejected": -2685.384033203125, "loss": 0.0061, "rewards/accuracies": 1.0, "rewards/chosen": -8.141470909118652, "rewards/margins": 16.952713012695312, "rewards/rejected": -25.09418296813965, "step": 1380 }, { "epoch": 0.98, "grad_norm": 0.1767578125, "learning_rate": 5.136518124159162e-09, "logits/chosen": -1.4492480754852295, "logits/rejected": -1.0385563373565674, "logps/chosen": -1007.1890869140625, "logps/rejected": -2742.18310546875, "loss": 0.0087, "rewards/accuracies": 1.0, "rewards/chosen": -8.491100311279297, "rewards/margins": 17.244041442871094, "rewards/rejected": -25.73514175415039, "step": 1390 }, { "epoch": 0.99, "grad_norm": 4.0625, "learning_rate": 1.945604350276631e-09, "logits/chosen": -1.4436314105987549, "logits/rejected": -1.0450704097747803, "logps/chosen": -1043.3038330078125, "logps/rejected": -2812.408935546875, "loss": 0.0165, "rewards/accuracies": 0.9750000238418579, "rewards/chosen": -8.866385459899902, "rewards/margins": 17.693334579467773, "rewards/rejected": -26.55971908569336, "step": 1400 }, { "epoch": 0.99, "eval_logits/chosen": -1.8010309934616089, "eval_logits/rejected": -1.7776145935058594, "eval_logps/chosen": -1208.0731201171875, "eval_logps/rejected": -1186.5670166015625, "eval_loss": 1.139304518699646, "eval_rewards/accuracies": 0.5098314881324768, "eval_rewards/chosen": -8.241950035095215, "eval_rewards/margins": 0.32201313972473145, "eval_rewards/rejected": -8.563963890075684, "eval_runtime": 655.4029, "eval_samples_per_second": 8.688, "eval_steps_per_second": 0.272, "step": 1400 }, { "epoch": 1.0, "grad_norm": 0.12353515625, "learning_rate": 2.7363111528233563e-10, "logits/chosen": -1.1875760555267334, "logits/rejected": -0.8251748085021973, "logps/chosen": -1076.190185546875, "logps/rejected": -2831.43017578125, "loss": 0.0046, "rewards/accuracies": 1.0, "rewards/chosen": -9.385587692260742, "rewards/margins": 17.486358642578125, "rewards/rejected": -26.871944427490234, "step": 1410 }, { "epoch": 1.0, "step": 1416, "total_flos": 0.0, "train_loss": 0.10576157405838307, "train_runtime": 15018.2697, "train_samples_per_second": 1.508, "train_steps_per_second": 0.094 } ], "logging_steps": 10, "max_steps": 1416, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }