{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 62.3482779515011, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0140018463134766, "logits/rejected": -0.9845958948135376, "logps/chosen": -0.27406683564186096, "logps/rejected": -0.2714424729347229, "loss": 3.0994, "rewards/accuracies": 0.4375, "rewards/chosen": -2.7406680583953857, "rewards/margins": -0.02624346688389778, "rewards/rejected": -2.7144248485565186, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 39.525628188076254, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0465514659881592, "logits/rejected": -0.9793618321418762, "logps/chosen": -0.29423215985298157, "logps/rejected": -0.2993616461753845, "loss": 3.1379, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.942321538925171, "rewards/margins": 0.051294513046741486, "rewards/rejected": -2.9936161041259766, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 51.851115967445885, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9649394750595093, "logits/rejected": -0.983955979347229, "logps/chosen": -0.26406729221343994, "logps/rejected": -0.3004179894924164, "loss": 3.2301, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.6406729221343994, "rewards/margins": 0.363506942987442, "rewards/rejected": -3.0041799545288086, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 99.02642749476678, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9679675102233887, "logits/rejected": -0.9419299960136414, "logps/chosen": -0.2775927186012268, "logps/rejected": -0.2915174961090088, "loss": 3.1605, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.7759270668029785, "rewards/margins": 0.13924789428710938, "rewards/rejected": -2.915174961090088, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 57.54560785330943, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.003973126411438, "logits/rejected": -0.9752557873725891, "logps/chosen": -0.2722216844558716, "logps/rejected": -0.2782929539680481, "loss": 3.3103, "rewards/accuracies": 0.5, "rewards/chosen": -2.7222166061401367, "rewards/margins": 0.060712575912475586, "rewards/rejected": -2.7829294204711914, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 52.02880771091385, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9953545331954956, "logits/rejected": -0.9509505033493042, "logps/chosen": -0.273654043674469, "logps/rejected": -0.27911943197250366, "loss": 2.9866, "rewards/accuracies": 0.4375, "rewards/chosen": -2.7365403175354004, "rewards/margins": 0.05465413257479668, "rewards/rejected": -2.791194438934326, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 62.86254815286924, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0501649379730225, "logits/rejected": -0.9741900563240051, "logps/chosen": -0.2949184775352478, "logps/rejected": -0.32086285948753357, "loss": 3.0094, "rewards/accuracies": 0.53125, "rewards/chosen": -2.9491848945617676, "rewards/margins": 0.2594442367553711, "rewards/rejected": -3.2086288928985596, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 65.11890009591963, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0010614395141602, "logits/rejected": -0.9576476216316223, "logps/chosen": -0.2806803584098816, "logps/rejected": -0.32674694061279297, "loss": 2.9254, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -2.8068039417266846, "rewards/margins": 0.46066540479660034, "rewards/rejected": -3.2674694061279297, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 36.14523632736934, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0506356954574585, "logits/rejected": -1.0073630809783936, "logps/chosen": -0.3041021227836609, "logps/rejected": -0.35804516077041626, "loss": 3.0106, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.0410218238830566, "rewards/margins": 0.5394296050071716, "rewards/rejected": -3.580451250076294, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 61.101997429137676, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.025564432144165, "logits/rejected": -0.9764531850814819, "logps/chosen": -0.3140087425708771, "logps/rejected": -0.3549434542655945, "loss": 3.1031, "rewards/accuracies": 0.46875, "rewards/chosen": -3.140087604522705, "rewards/margins": 0.40934714674949646, "rewards/rejected": -3.5494346618652344, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 128.15530312228304, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0674389600753784, "logits/rejected": -1.0326998233795166, "logps/chosen": -0.2986104488372803, "logps/rejected": -0.3673686683177948, "loss": 2.7756, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -2.9861044883728027, "rewards/margins": 0.6875823736190796, "rewards/rejected": -3.6736865043640137, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 59.025780817391194, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.108605980873108, "logits/rejected": -1.075398325920105, "logps/chosen": -0.3378816246986389, "logps/rejected": -0.3586636483669281, "loss": 2.9029, "rewards/accuracies": 0.5625, "rewards/chosen": -3.3788161277770996, "rewards/margins": 0.20782046020030975, "rewards/rejected": -3.5866363048553467, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 66.41947079309563, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0052762031555176, "logits/rejected": -0.9777056574821472, "logps/chosen": -0.4097859263420105, "logps/rejected": -0.4840938150882721, "loss": 2.9214, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.0978593826293945, "rewards/margins": 0.7430787682533264, "rewards/rejected": -4.840937614440918, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 42.264544083175664, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0273511409759521, "logits/rejected": -1.0030959844589233, "logps/chosen": -0.39483898878097534, "logps/rejected": -0.4872601628303528, "loss": 2.8869, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.948390245437622, "rewards/margins": 0.9242109060287476, "rewards/rejected": -4.872600555419922, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 71.26393008122545, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9822802543640137, "logits/rejected": -0.9115797281265259, "logps/chosen": -0.38268035650253296, "logps/rejected": -0.4444147050380707, "loss": 2.9837, "rewards/accuracies": 0.53125, "rewards/chosen": -3.826803207397461, "rewards/margins": 0.6173437237739563, "rewards/rejected": -4.444147109985352, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 50.319154680061054, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9707294702529907, "logits/rejected": -0.9569045305252075, "logps/chosen": -0.35631316900253296, "logps/rejected": -0.46392399072647095, "loss": 2.7881, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": -3.563131809234619, "rewards/margins": 1.0761077404022217, "rewards/rejected": -4.63923978805542, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 70.667078624953, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9739160537719727, "logits/rejected": -0.9520059823989868, "logps/chosen": -0.3384969234466553, "logps/rejected": -0.4023989737033844, "loss": 2.7081, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -3.3849689960479736, "rewards/margins": 0.6390206217765808, "rewards/rejected": -4.023990154266357, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 70.40191576067738, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0218126773834229, "logits/rejected": -0.9869282841682434, "logps/chosen": -0.43610191345214844, "logps/rejected": -0.5330775380134583, "loss": 2.9174, "rewards/accuracies": 0.5625, "rewards/chosen": -4.361019134521484, "rewards/margins": 0.9697564840316772, "rewards/rejected": -5.330776214599609, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 48.668822510411275, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.0933444499969482, "logits/rejected": -1.01079523563385, "logps/chosen": -0.4492688775062561, "logps/rejected": -0.49912723898887634, "loss": 2.7939, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -4.49268913269043, "rewards/margins": 0.49858370423316956, "rewards/rejected": -4.99127197265625, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 68.88980102518907, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9715415239334106, "logits/rejected": -0.9469987154006958, "logps/chosen": -0.4551132619380951, "logps/rejected": -0.49960607290267944, "loss": 3.0421, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -4.551133155822754, "rewards/margins": 0.44492778182029724, "rewards/rejected": -4.996060848236084, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 73.34499964545964, "learning_rate": 9.536793472839324e-07, "logits/chosen": -0.9907165765762329, "logits/rejected": -0.9372614622116089, "logps/chosen": -0.39754587411880493, "logps/rejected": -0.5162733793258667, "loss": 2.9277, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -3.975458860397339, "rewards/margins": 1.187274694442749, "rewards/rejected": -5.16273307800293, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 55.456582167351314, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9392507672309875, "logits/rejected": -0.8784140348434448, "logps/chosen": -0.4648202955722809, "logps/rejected": -0.5995782017707825, "loss": 2.788, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -4.648202896118164, "rewards/margins": 1.347578763961792, "rewards/rejected": -5.995781898498535, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 60.644976413076485, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.0021904706954956, "logits/rejected": -0.9409273266792297, "logps/chosen": -0.48944035172462463, "logps/rejected": -0.5491489171981812, "loss": 2.6847, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -4.89440393447876, "rewards/margins": 0.5970853567123413, "rewards/rejected": -5.491488933563232, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 66.42443726320764, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9655882716178894, "logits/rejected": -0.8733075857162476, "logps/chosen": -0.49609699845314026, "logps/rejected": -0.6988444328308105, "loss": 2.6175, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -4.9609694480896, "rewards/margins": 2.027474880218506, "rewards/rejected": -6.9884443283081055, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 80.7904476004964, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0375964641571045, "logits/rejected": -0.9925470352172852, "logps/chosen": -0.5534143447875977, "logps/rejected": -0.6445597410202026, "loss": 2.4088, "rewards/accuracies": 0.643750011920929, "rewards/chosen": -5.534143447875977, "rewards/margins": 0.9114534258842468, "rewards/rejected": -6.445597171783447, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 164.83298553765533, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0506072044372559, "logits/rejected": -1.0421117544174194, "logps/chosen": -0.5260264873504639, "logps/rejected": -0.809399425983429, "loss": 2.3687, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -5.260264873504639, "rewards/margins": 2.833728551864624, "rewards/rejected": -8.093994140625, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 110.42493262615906, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.0204308032989502, "logits/rejected": -0.9703726768493652, "logps/chosen": -0.5653955936431885, "logps/rejected": -0.6786874532699585, "loss": 2.4188, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": -5.653956413269043, "rewards/margins": 1.132918119430542, "rewards/rejected": -6.786874294281006, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 71.93596841122324, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.100914716720581, "logits/rejected": -1.0754241943359375, "logps/chosen": -0.6414980888366699, "logps/rejected": -0.7632189989089966, "loss": 2.4389, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -6.414980411529541, "rewards/margins": 1.217208981513977, "rewards/rejected": -7.6321892738342285, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 66.0213241480768, "learning_rate": 8.71572412738697e-07, "logits/chosen": -0.9927597045898438, "logits/rejected": -0.9650676846504211, "logps/chosen": -0.6458258032798767, "logps/rejected": -0.8493485450744629, "loss": 2.1456, "rewards/accuracies": 0.71875, "rewards/chosen": -6.458258152008057, "rewards/margins": 2.0352275371551514, "rewards/rejected": -8.493486404418945, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 69.48762102431438, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0225753784179688, "logits/rejected": -0.9999582171440125, "logps/chosen": -0.7057562470436096, "logps/rejected": -0.8774517774581909, "loss": 2.2075, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -7.057562351226807, "rewards/margins": 1.7169564962387085, "rewards/rejected": -8.774518966674805, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 67.84761069060667, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.0279462337493896, "logits/rejected": -1.0052425861358643, "logps/chosen": -0.7633088231086731, "logps/rejected": -1.005048155784607, "loss": 2.2792, "rewards/accuracies": 0.706250011920929, "rewards/chosen": -7.633088111877441, "rewards/margins": 2.4173941612243652, "rewards/rejected": -10.050481796264648, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 82.96608468035394, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.030012845993042, "logits/rejected": -1.012251377105713, "logps/chosen": -0.8448828458786011, "logps/rejected": -1.189296007156372, "loss": 2.2782, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.448829650878906, "rewards/margins": 3.444131851196289, "rewards/rejected": -11.892961502075195, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 64.3657602730046, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0416388511657715, "logits/rejected": -1.0139344930648804, "logps/chosen": -0.8959344625473022, "logps/rejected": -1.2197812795639038, "loss": 2.1737, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -8.959344863891602, "rewards/margins": 3.238468885421753, "rewards/rejected": -12.197813034057617, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 72.02799237773567, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0289143323898315, "logits/rejected": -1.0052926540374756, "logps/chosen": -0.87171471118927, "logps/rejected": -1.2460126876831055, "loss": 2.0202, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -8.717145919799805, "rewards/margins": 3.7429795265197754, "rewards/rejected": -12.460125923156738, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 98.19729820258998, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.0445234775543213, "logits/rejected": -0.9937236905097961, "logps/chosen": -0.9708870649337769, "logps/rejected": -1.1840471029281616, "loss": 1.8631, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -9.708871841430664, "rewards/margins": 2.1315996646881104, "rewards/rejected": -11.840471267700195, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 69.56663708781485, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.001516580581665, "logits/rejected": -0.9787738919258118, "logps/chosen": -0.9560983777046204, "logps/rejected": -1.2841722965240479, "loss": 1.8859, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -9.560983657836914, "rewards/margins": 3.2807374000549316, "rewards/rejected": -12.841720581054688, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 91.74349211425728, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.0450928211212158, "logits/rejected": -0.9868279695510864, "logps/chosen": -1.0066741704940796, "logps/rejected": -1.3705105781555176, "loss": 1.7618, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -10.066742897033691, "rewards/margins": 3.6383633613586426, "rewards/rejected": -13.705105781555176, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 88.77063833211469, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.0353386402130127, "logits/rejected": -1.0412102937698364, "logps/chosen": -1.136867642402649, "logps/rejected": -1.6356449127197266, "loss": 1.6848, "rewards/accuracies": 0.8125, "rewards/chosen": -11.36867618560791, "rewards/margins": 4.987773895263672, "rewards/rejected": -16.356449127197266, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 78.26789286604094, "learning_rate": 7.236565741578162e-07, "logits/chosen": -0.9953984022140503, "logits/rejected": -0.973209023475647, "logps/chosen": -1.1132943630218506, "logps/rejected": -1.454097867012024, "loss": 1.6983, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -11.132943153381348, "rewards/margins": 3.408036470413208, "rewards/rejected": -14.540979385375977, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 124.50175267795422, "learning_rate": 7.067792524832603e-07, "logits/chosen": -0.9921610951423645, "logits/rejected": -0.9790946245193481, "logps/chosen": -1.1536376476287842, "logps/rejected": -1.5450570583343506, "loss": 1.8334, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -11.536375045776367, "rewards/margins": 3.914196729660034, "rewards/rejected": -15.45057201385498, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 88.73588073000924, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.0303648710250854, "logits/rejected": -0.9800698161125183, "logps/chosen": -1.2688630819320679, "logps/rejected": -1.6606884002685547, "loss": 1.8744, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -12.688631057739258, "rewards/margins": 3.918254852294922, "rewards/rejected": -16.606884002685547, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 88.06916392468375, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.0542664527893066, "logits/rejected": -1.0434906482696533, "logps/chosen": -1.3758102655410767, "logps/rejected": -1.7928674221038818, "loss": 1.7772, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -13.75810432434082, "rewards/margins": 4.170571327209473, "rewards/rejected": -17.928674697875977, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 121.97560618316521, "learning_rate": 6.545084971874736e-07, "logits/chosen": -0.9826368093490601, "logits/rejected": -0.9663593173027039, "logps/chosen": -1.4412583112716675, "logps/rejected": -1.9161531925201416, "loss": 1.7058, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -14.412582397460938, "rewards/margins": 4.748946189880371, "rewards/rejected": -19.161529541015625, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 123.47452460404413, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.0570485591888428, "logits/rejected": -1.0392574071884155, "logps/chosen": -1.5018644332885742, "logps/rejected": -2.0092532634735107, "loss": 1.6513, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -15.018644332885742, "rewards/margins": 5.073886394500732, "rewards/rejected": -20.092533111572266, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 67.98272511740662, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.0517892837524414, "logits/rejected": -1.0197094678878784, "logps/chosen": -1.4916750192642212, "logps/rejected": -1.9748971462249756, "loss": 1.5538, "rewards/accuracies": 0.75, "rewards/chosen": -14.916749954223633, "rewards/margins": 4.832221508026123, "rewards/rejected": -19.74897003173828, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 91.18490250667737, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1277612447738647, "logits/rejected": -1.0792579650878906, "logps/chosen": -1.4870562553405762, "logps/rejected": -1.9353138208389282, "loss": 1.5151, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -14.870562553405762, "rewards/margins": 4.482577323913574, "rewards/rejected": -19.353137969970703, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 110.64166427250719, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.1586382389068604, "logits/rejected": -1.1309268474578857, "logps/chosen": -1.5005654096603394, "logps/rejected": -2.048241138458252, "loss": 1.5958, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -15.005655288696289, "rewards/margins": 5.476757049560547, "rewards/rejected": -20.482410430908203, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 105.27714488639405, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.1117291450500488, "logits/rejected": -1.113638162612915, "logps/chosen": -1.4002972841262817, "logps/rejected": -1.889203667640686, "loss": 1.5807, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -14.002973556518555, "rewards/margins": 4.889064788818359, "rewards/rejected": -18.892038345336914, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 124.65949242661881, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.2370043992996216, "logits/rejected": -1.1781737804412842, "logps/chosen": -1.4051058292388916, "logps/rejected": -1.9604175090789795, "loss": 1.4672, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.051058769226074, "rewards/margins": 5.553117752075195, "rewards/rejected": -19.604177474975586, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 144.8350321427832, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.1803662776947021, "logits/rejected": -1.1651079654693604, "logps/chosen": -1.4542481899261475, "logps/rejected": -2.0275795459747314, "loss": 1.4629, "rewards/accuracies": 0.84375, "rewards/chosen": -14.54248046875, "rewards/margins": 5.73331356048584, "rewards/rejected": -20.275793075561523, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 78.0251331894233, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.209343433380127, "logits/rejected": -1.182515263557434, "logps/chosen": -1.5509597063064575, "logps/rejected": -2.0222866535186768, "loss": 1.6056, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.50959587097168, "rewards/margins": 4.713270664215088, "rewards/rejected": -20.22286605834961, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 123.4068534133138, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.195039987564087, "logits/rejected": -1.1650830507278442, "logps/chosen": -1.4355518817901611, "logps/rejected": -1.9077056646347046, "loss": 1.3832, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.35551929473877, "rewards/margins": 4.721535682678223, "rewards/rejected": -19.077056884765625, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 86.09154207388711, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.1370768547058105, "logits/rejected": -1.1196085214614868, "logps/chosen": -1.5277538299560547, "logps/rejected": -2.003265857696533, "loss": 1.5697, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -15.277536392211914, "rewards/margins": 4.755120277404785, "rewards/rejected": -20.032657623291016, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 198.6225861486916, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.1517788171768188, "logits/rejected": -1.1230041980743408, "logps/chosen": -1.4101899862289429, "logps/rejected": -1.9304449558258057, "loss": 1.5201, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -14.101900100708008, "rewards/margins": 5.202548980712891, "rewards/rejected": -19.3044490814209, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 88.44447901867754, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.1258559226989746, "logits/rejected": -1.087894082069397, "logps/chosen": -1.4194262027740479, "logps/rejected": -1.82901930809021, "loss": 1.6626, "rewards/accuracies": 0.78125, "rewards/chosen": -14.19426155090332, "rewards/margins": 4.095931529998779, "rewards/rejected": -18.290193557739258, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 74.99606064357101, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2096599340438843, "logits/rejected": -1.183319091796875, "logps/chosen": -1.4303152561187744, "logps/rejected": -1.9751609563827515, "loss": 1.4679, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -14.303152084350586, "rewards/margins": 5.44845724105835, "rewards/rejected": -19.751609802246094, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 112.3023562652983, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.17905592918396, "logits/rejected": -1.1455281972885132, "logps/chosen": -1.5073192119598389, "logps/rejected": -2.059325695037842, "loss": 1.313, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -15.073190689086914, "rewards/margins": 5.520066738128662, "rewards/rejected": -20.593257904052734, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 86.7215875996648, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2023842334747314, "logits/rejected": -1.1609100103378296, "logps/chosen": -1.4881120920181274, "logps/rejected": -1.9453773498535156, "loss": 1.5443, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.881118774414062, "rewards/margins": 4.572653770446777, "rewards/rejected": -19.453771591186523, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 112.19728139908482, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.1330126523971558, "logits/rejected": -1.117620825767517, "logps/chosen": -1.5452992916107178, "logps/rejected": -2.0189967155456543, "loss": 1.3971, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.452993392944336, "rewards/margins": 4.736973285675049, "rewards/rejected": -20.18996810913086, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 103.78285178770669, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.0798468589782715, "logits/rejected": -1.0519963502883911, "logps/chosen": -1.5262010097503662, "logps/rejected": -1.9304592609405518, "loss": 1.8782, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -15.262011528015137, "rewards/margins": 4.042581558227539, "rewards/rejected": -19.30459213256836, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 89.28193531018944, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.1877957582473755, "logits/rejected": -1.141606092453003, "logps/chosen": -1.4767545461654663, "logps/rejected": -1.900796890258789, "loss": 1.5065, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -14.767545700073242, "rewards/margins": 4.240423679351807, "rewards/rejected": -19.007970809936523, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 118.8609587886762, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.2135640382766724, "logits/rejected": -1.1922129392623901, "logps/chosen": -1.5534937381744385, "logps/rejected": -2.036700487136841, "loss": 1.6677, "rewards/accuracies": 0.78125, "rewards/chosen": -15.534937858581543, "rewards/margins": 4.832065582275391, "rewards/rejected": -20.36700439453125, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 99.40008827127444, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2052314281463623, "logits/rejected": -1.1488008499145508, "logps/chosen": -1.4641424417495728, "logps/rejected": -1.947819709777832, "loss": 1.5204, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -14.641424179077148, "rewards/margins": 4.8367719650268555, "rewards/rejected": -19.478195190429688, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 111.94261532026978, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.1900420188903809, "logits/rejected": -1.184233546257019, "logps/chosen": -1.5346307754516602, "logps/rejected": -2.0948898792266846, "loss": 1.5013, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.346307754516602, "rewards/margins": 5.6025896072387695, "rewards/rejected": -20.948898315429688, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 92.96649699793859, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.2109119892120361, "logits/rejected": -1.1595691442489624, "logps/chosen": -1.6573654413223267, "logps/rejected": -2.3071436882019043, "loss": 1.4311, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.57365608215332, "rewards/margins": 6.497782230377197, "rewards/rejected": -23.07143783569336, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 81.17346063961124, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.117084264755249, "logits/rejected": -1.0746078491210938, "logps/chosen": -1.5377912521362305, "logps/rejected": -1.992598533630371, "loss": 1.5151, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.377914428710938, "rewards/margins": 4.548072814941406, "rewards/rejected": -19.92598533630371, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 143.05041993788979, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.1712327003479004, "logits/rejected": -1.151883602142334, "logps/chosen": -1.4956694841384888, "logps/rejected": -2.041016101837158, "loss": 1.335, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -14.956695556640625, "rewards/margins": 5.453465938568115, "rewards/rejected": -20.410160064697266, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 114.52568542094356, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.1757316589355469, "logits/rejected": -1.1399040222167969, "logps/chosen": -1.5182468891143799, "logps/rejected": -2.0508017539978027, "loss": 1.5897, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.182469367980957, "rewards/margins": 5.3255486488342285, "rewards/rejected": -20.50801658630371, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 91.55669201238004, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2101690769195557, "logits/rejected": -1.185270071029663, "logps/chosen": -1.5700366497039795, "logps/rejected": -2.17747163772583, "loss": 1.5591, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.70036506652832, "rewards/margins": 6.074349403381348, "rewards/rejected": -21.774715423583984, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 163.06372328196355, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.191392183303833, "logits/rejected": -1.1933467388153076, "logps/chosen": -1.5097607374191284, "logps/rejected": -1.995661735534668, "loss": 1.5515, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.097607612609863, "rewards/margins": 4.859010696411133, "rewards/rejected": -19.956619262695312, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 116.64866987709867, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1239954233169556, "logits/rejected": -1.1377310752868652, "logps/chosen": -1.4421604871749878, "logps/rejected": -1.9545857906341553, "loss": 1.2498, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -14.421605110168457, "rewards/margins": 5.1242547035217285, "rewards/rejected": -19.545862197875977, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 96.86652927133838, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.1753429174423218, "logits/rejected": -1.1725155115127563, "logps/chosen": -1.4693684577941895, "logps/rejected": -2.0869216918945312, "loss": 1.4734, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -14.693684577941895, "rewards/margins": 6.175534248352051, "rewards/rejected": -20.869220733642578, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 102.14193626103624, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2227611541748047, "logits/rejected": -1.1525650024414062, "logps/chosen": -1.5389500856399536, "logps/rejected": -2.149737596511841, "loss": 1.4471, "rewards/accuracies": 0.8125, "rewards/chosen": -15.389498710632324, "rewards/margins": 6.107874393463135, "rewards/rejected": -21.497373580932617, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 117.88219247560441, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.136879324913025, "logits/rejected": -1.1233110427856445, "logps/chosen": -1.5077215433120728, "logps/rejected": -1.9501771926879883, "loss": 1.5365, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.077214241027832, "rewards/margins": 4.424557685852051, "rewards/rejected": -19.501771926879883, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 146.62372943161364, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.1779382228851318, "logits/rejected": -1.180654764175415, "logps/chosen": -1.6213643550872803, "logps/rejected": -2.1708412170410156, "loss": 1.528, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.213642120361328, "rewards/margins": 5.494766712188721, "rewards/rejected": -21.708412170410156, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 138.00637470724254, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.1884950399398804, "logits/rejected": -1.1677783727645874, "logps/chosen": -1.466828465461731, "logps/rejected": -1.9588056802749634, "loss": 1.5783, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -14.668286323547363, "rewards/margins": 4.919771194458008, "rewards/rejected": -19.588054656982422, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 103.64385074010784, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.1786987781524658, "logits/rejected": -1.1550320386886597, "logps/chosen": -1.5073349475860596, "logps/rejected": -2.1525402069091797, "loss": 1.292, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.073351860046387, "rewards/margins": 6.45205020904541, "rewards/rejected": -21.525402069091797, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 91.3096727719287, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.1919111013412476, "logits/rejected": -1.1740847826004028, "logps/chosen": -1.580262541770935, "logps/rejected": -2.1456198692321777, "loss": 1.4583, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.80262565612793, "rewards/margins": 5.6535725593566895, "rewards/rejected": -21.45619773864746, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 83.70090604282484, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.1921932697296143, "logits/rejected": -1.1703288555145264, "logps/chosen": -1.5114152431488037, "logps/rejected": -2.060832977294922, "loss": 1.4804, "rewards/accuracies": 0.8125, "rewards/chosen": -15.114153861999512, "rewards/margins": 5.494177341461182, "rewards/rejected": -20.60832977294922, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 78.24949652820646, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2033644914627075, "logits/rejected": -1.1850937604904175, "logps/chosen": -1.5316810607910156, "logps/rejected": -2.022660732269287, "loss": 1.3196, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -15.316810607910156, "rewards/margins": 4.909798622131348, "rewards/rejected": -20.226608276367188, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.3681788444519043, "eval_logits/rejected": -1.3776241540908813, "eval_logps/chosen": -1.531295895576477, "eval_logps/rejected": -2.054961919784546, "eval_loss": 1.3846672773361206, "eval_rewards/accuracies": 0.8292682766914368, "eval_rewards/chosen": -15.312957763671875, "eval_rewards/margins": 5.236661434173584, "eval_rewards/rejected": -20.549619674682617, "eval_runtime": 94.8838, "eval_samples_per_second": 20.667, "eval_steps_per_second": 1.296, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 117.65350933148179, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.166261911392212, "logits/rejected": -1.183724284172058, "logps/chosen": -1.5941721200942993, "logps/rejected": -2.0773017406463623, "loss": 1.3853, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.94172191619873, "rewards/margins": 4.831295967102051, "rewards/rejected": -20.77301597595215, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 165.1398306935317, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.1748878955841064, "logits/rejected": -1.1613463163375854, "logps/chosen": -1.5498403310775757, "logps/rejected": -2.065404176712036, "loss": 1.5604, "rewards/accuracies": 0.84375, "rewards/chosen": -15.49840259552002, "rewards/margins": 5.155638694763184, "rewards/rejected": -20.654041290283203, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 96.86053857753426, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.1749510765075684, "logits/rejected": -1.1216485500335693, "logps/chosen": -1.4522340297698975, "logps/rejected": -2.062474012374878, "loss": 1.6792, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -14.5223388671875, "rewards/margins": 6.102401256561279, "rewards/rejected": -20.624740600585938, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 106.60182495249761, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.2051994800567627, "logits/rejected": -1.1551568508148193, "logps/chosen": -1.4368181228637695, "logps/rejected": -1.9974851608276367, "loss": 1.4114, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -14.368182182312012, "rewards/margins": 5.606671333312988, "rewards/rejected": -19.974851608276367, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 133.4506452029166, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.1993720531463623, "logits/rejected": -1.1902838945388794, "logps/chosen": -1.5988143682479858, "logps/rejected": -2.0698258876800537, "loss": 1.6235, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -15.988142013549805, "rewards/margins": 4.710117340087891, "rewards/rejected": -20.698259353637695, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 135.55475022461232, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.1921052932739258, "logits/rejected": -1.1760125160217285, "logps/chosen": -1.6189323663711548, "logps/rejected": -2.1575050354003906, "loss": 1.5124, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -16.1893253326416, "rewards/margins": 5.385725498199463, "rewards/rejected": -21.57505226135254, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 107.7429496590572, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.145105004310608, "logits/rejected": -1.1043111085891724, "logps/chosen": -1.4653263092041016, "logps/rejected": -2.004653215408325, "loss": 1.4369, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -14.6532621383667, "rewards/margins": 5.393268585205078, "rewards/rejected": -20.046531677246094, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 99.21866608543166, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.1760567426681519, "logits/rejected": -1.1167289018630981, "logps/chosen": -1.5557719469070435, "logps/rejected": -2.0364768505096436, "loss": 1.4144, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.557719230651855, "rewards/margins": 4.807051181793213, "rewards/rejected": -20.364770889282227, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 90.97863869904057, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1209274530410767, "logits/rejected": -1.1030616760253906, "logps/chosen": -1.5600204467773438, "logps/rejected": -2.162205934524536, "loss": 1.2667, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.60020637512207, "rewards/margins": 6.021853446960449, "rewards/rejected": -21.622058868408203, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 111.38924570455498, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.2349797487258911, "logits/rejected": -1.2146203517913818, "logps/chosen": -1.5040500164031982, "logps/rejected": -2.0357797145843506, "loss": 1.4976, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.040499687194824, "rewards/margins": 5.31729793548584, "rewards/rejected": -20.357799530029297, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 109.67244088589509, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.1971113681793213, "logits/rejected": -1.162461519241333, "logps/chosen": -1.4996418952941895, "logps/rejected": -2.1262269020080566, "loss": 1.1322, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -14.996419906616211, "rewards/margins": 6.265848636627197, "rewards/rejected": -21.262269973754883, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 102.11437295080724, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.1955955028533936, "logits/rejected": -1.1833515167236328, "logps/chosen": -1.5992295742034912, "logps/rejected": -2.1679768562316895, "loss": 1.3852, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.99229621887207, "rewards/margins": 5.687474727630615, "rewards/rejected": -21.67976951599121, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 101.68998572629944, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.179431438446045, "logits/rejected": -1.184579610824585, "logps/chosen": -1.569451093673706, "logps/rejected": -2.143585681915283, "loss": 1.6178, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.694511413574219, "rewards/margins": 5.741344451904297, "rewards/rejected": -21.435855865478516, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.9788420639405668, "train_runtime": 11440.9622, "train_samples_per_second": 5.233, "train_steps_per_second": 0.041 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }