{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9982631930527722, "eval_steps": 400, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01068804275217101, "grad_norm": 48.927791324930695, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0180829763412476, "logits/rejected": -0.9883173704147339, "logps/chosen": -0.2738715410232544, "logps/rejected": -0.2716783285140991, "loss": 3.0574, "rewards/accuracies": 0.4375, "rewards/chosen": -2.738715648651123, "rewards/margins": -0.021932203322649002, "rewards/rejected": -2.716783046722412, "step": 5 }, { "epoch": 0.02137608550434202, "grad_norm": 39.813279548661036, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0492197275161743, "logits/rejected": -0.9815438985824585, "logps/chosen": -0.2942040264606476, "logps/rejected": -0.29975026845932007, "loss": 3.0033, "rewards/accuracies": 0.5062500238418579, "rewards/chosen": -2.942039966583252, "rewards/margins": 0.055462419986724854, "rewards/rejected": -2.997502326965332, "step": 10 }, { "epoch": 0.03206412825651302, "grad_norm": 54.64580630838249, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -0.9780637621879578, "logits/rejected": -0.9978879690170288, "logps/chosen": -0.2642993927001953, "logps/rejected": -0.3006458878517151, "loss": 2.9877, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -2.642993688583374, "rewards/margins": 0.363465279340744, "rewards/rejected": -3.0064589977264404, "step": 15 }, { "epoch": 0.04275217100868404, "grad_norm": 78.63474777212464, "learning_rate": 4.25531914893617e-07, "logits/chosen": -0.9655851125717163, "logits/rejected": -0.9391099810600281, "logps/chosen": -0.2776910662651062, "logps/rejected": -0.291360080242157, "loss": 2.9252, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -2.7769107818603516, "rewards/margins": 0.13669000566005707, "rewards/rejected": -2.9136006832122803, "step": 20 }, { "epoch": 0.053440213760855046, "grad_norm": 53.858972431024775, "learning_rate": 5.319148936170212e-07, "logits/chosen": -1.0097562074661255, "logits/rejected": -0.9812997579574585, "logps/chosen": -0.2714676260948181, "logps/rejected": -0.27822521328926086, "loss": 3.0821, "rewards/accuracies": 0.4937500059604645, "rewards/chosen": -2.7146763801574707, "rewards/margins": 0.06757592409849167, "rewards/rejected": -2.782252073287964, "step": 25 }, { "epoch": 0.06412825651302605, "grad_norm": 44.312475927746796, "learning_rate": 6.382978723404255e-07, "logits/chosen": -0.9986146688461304, "logits/rejected": -0.9536568522453308, "logps/chosen": -0.27314493060112, "logps/rejected": -0.27925461530685425, "loss": 2.937, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": -2.731449604034424, "rewards/margins": 0.06109660863876343, "rewards/rejected": -2.792546033859253, "step": 30 }, { "epoch": 0.07481629926519706, "grad_norm": 55.321940182511284, "learning_rate": 7.446808510638297e-07, "logits/chosen": -1.0669300556182861, "logits/rejected": -0.9896968603134155, "logps/chosen": -0.29428571462631226, "logps/rejected": -0.3205253481864929, "loss": 2.905, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -2.942857265472412, "rewards/margins": 0.26239633560180664, "rewards/rejected": -3.2052536010742188, "step": 35 }, { "epoch": 0.08550434201736808, "grad_norm": 53.68098989474069, "learning_rate": 8.51063829787234e-07, "logits/chosen": -1.0166269540786743, "logits/rejected": -0.9719806909561157, "logps/chosen": -0.2796934247016907, "logps/rejected": -0.32216984033584595, "loss": 2.916, "rewards/accuracies": 0.59375, "rewards/chosen": -2.796934127807617, "rewards/margins": 0.42476367950439453, "rewards/rejected": -3.221698045730591, "step": 40 }, { "epoch": 0.09619238476953908, "grad_norm": 36.765236755711314, "learning_rate": 9.574468085106384e-07, "logits/chosen": -1.0554900169372559, "logits/rejected": -1.0124839544296265, "logps/chosen": -0.3013826012611389, "logps/rejected": -0.3502373695373535, "loss": 2.9447, "rewards/accuracies": 0.550000011920929, "rewards/chosen": -3.0138256549835205, "rewards/margins": 0.4885478913784027, "rewards/rejected": -3.5023739337921143, "step": 45 }, { "epoch": 0.10688042752171009, "grad_norm": 72.12342853911701, "learning_rate": 9.998741174712533e-07, "logits/chosen": -1.033050298690796, "logits/rejected": -0.9839521646499634, "logps/chosen": -0.3049773573875427, "logps/rejected": -0.3382193446159363, "loss": 2.976, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": -3.0497734546661377, "rewards/margins": 0.33241981267929077, "rewards/rejected": -3.382193088531494, "step": 50 }, { "epoch": 0.11756847027388109, "grad_norm": 67.04896260966717, "learning_rate": 9.991050648838675e-07, "logits/chosen": -1.0543005466461182, "logits/rejected": -1.0193541049957275, "logps/chosen": -0.2847168445587158, "logps/rejected": -0.34575051069259644, "loss": 2.7924, "rewards/accuracies": 0.6187499761581421, "rewards/chosen": -2.847168445587158, "rewards/margins": 0.6103365421295166, "rewards/rejected": -3.457504987716675, "step": 55 }, { "epoch": 0.1282565130260521, "grad_norm": 53.20515583895435, "learning_rate": 9.97637968732563e-07, "logits/chosen": -1.1007188558578491, "logits/rejected": -1.066847801208496, "logps/chosen": -0.32495683431625366, "logps/rejected": -0.3465155363082886, "loss": 2.8738, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -3.249568462371826, "rewards/margins": 0.215586856007576, "rewards/rejected": -3.4651551246643066, "step": 60 }, { "epoch": 0.13894455577822312, "grad_norm": 54.54015013992033, "learning_rate": 9.954748808839674e-07, "logits/chosen": -1.0070468187332153, "logits/rejected": -0.9784091711044312, "logps/chosen": -0.37832310795783997, "logps/rejected": -0.43590840697288513, "loss": 2.7895, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": -3.783231258392334, "rewards/margins": 0.5758528113365173, "rewards/rejected": -4.359084129333496, "step": 65 }, { "epoch": 0.14963259853039412, "grad_norm": 38.242775225934125, "learning_rate": 9.926188266120295e-07, "logits/chosen": -1.0234776735305786, "logits/rejected": -0.9988471269607544, "logps/chosen": -0.3544539511203766, "logps/rejected": -0.4332161545753479, "loss": 2.8516, "rewards/accuracies": 0.581250011920929, "rewards/chosen": -3.5445396900177, "rewards/margins": 0.7876222729682922, "rewards/rejected": -4.332161903381348, "step": 70 }, { "epoch": 0.16032064128256512, "grad_norm": 70.74640041136536, "learning_rate": 9.890738003669027e-07, "logits/chosen": -0.9775687456130981, "logits/rejected": -0.9074035882949829, "logps/chosen": -0.3704521059989929, "logps/rejected": -0.42546525597572327, "loss": 2.7815, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": -3.704521656036377, "rewards/margins": 0.5501310229301453, "rewards/rejected": -4.254652500152588, "step": 75 }, { "epoch": 0.17100868403473615, "grad_norm": 46.9909884312478, "learning_rate": 9.848447601883433e-07, "logits/chosen": -0.9548114538192749, "logits/rejected": -0.94190514087677, "logps/chosen": -0.35945671796798706, "logps/rejected": -0.4592272639274597, "loss": 2.7108, "rewards/accuracies": 0.606249988079071, "rewards/chosen": -3.594567060470581, "rewards/margins": 0.9977054595947266, "rewards/rejected": -4.592272758483887, "step": 80 }, { "epoch": 0.18169672678690715, "grad_norm": 54.79418392154241, "learning_rate": 9.799376207714444e-07, "logits/chosen": -0.9647032618522644, "logits/rejected": -0.9432573318481445, "logps/chosen": -0.3421172797679901, "logps/rejected": -0.4004732072353363, "loss": 2.6569, "rewards/accuracies": 0.612500011920929, "rewards/chosen": -3.421172618865967, "rewards/margins": 0.5835592746734619, "rewards/rejected": -4.00473165512085, "step": 85 }, { "epoch": 0.19238476953907815, "grad_norm": 63.87918692389446, "learning_rate": 9.743592451943998e-07, "logits/chosen": -1.0345466136932373, "logits/rejected": -0.9992335438728333, "logps/chosen": -0.4216434061527252, "logps/rejected": -0.5047457218170166, "loss": 2.8483, "rewards/accuracies": 0.59375, "rewards/chosen": -4.216434001922607, "rewards/margins": 0.8310235142707825, "rewards/rejected": -5.047457695007324, "step": 90 }, { "epoch": 0.20307281229124916, "grad_norm": 53.84055400604519, "learning_rate": 9.681174353198686e-07, "logits/chosen": -1.0933572053909302, "logits/rejected": -1.012095332145691, "logps/chosen": -0.4486677050590515, "logps/rejected": -0.4948577880859375, "loss": 2.7206, "rewards/accuracies": 0.5687500238418579, "rewards/chosen": -4.4866766929626465, "rewards/margins": 0.46190088987350464, "rewards/rejected": -4.948577404022217, "step": 95 }, { "epoch": 0.21376085504342018, "grad_norm": 69.39656295840837, "learning_rate": 9.612209208833646e-07, "logits/chosen": -0.9949450492858887, "logits/rejected": -0.9710448384284973, "logps/chosen": -0.42737340927124023, "logps/rejected": -0.511344850063324, "loss": 2.7353, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": -4.273734092712402, "rewards/margins": 0.8397142291069031, "rewards/rejected": -5.113448143005371, "step": 100 }, { "epoch": 0.22444889779559118, "grad_norm": 63.49627205534197, "learning_rate": 9.536793472839324e-07, "logits/chosen": -1.00840425491333, "logits/rejected": -0.9560264348983765, "logps/chosen": -0.4261465072631836, "logps/rejected": -0.5318101644515991, "loss": 2.6988, "rewards/accuracies": 0.699999988079071, "rewards/chosen": -4.261464595794678, "rewards/margins": 1.0566365718841553, "rewards/rejected": -5.318101406097412, "step": 105 }, { "epoch": 0.23513694054776219, "grad_norm": 67.07988857179406, "learning_rate": 9.455032620941839e-07, "logits/chosen": -0.9696318507194519, "logits/rejected": -0.9108623266220093, "logps/chosen": -0.48374947905540466, "logps/rejected": -0.6151714324951172, "loss": 2.6096, "rewards/accuracies": 0.6812499761581421, "rewards/chosen": -4.837494373321533, "rewards/margins": 1.3142198324203491, "rewards/rejected": -6.151714324951172, "step": 110 }, { "epoch": 0.2458249832999332, "grad_norm": 80.4417839343177, "learning_rate": 9.367041003085648e-07, "logits/chosen": -1.033552646636963, "logits/rejected": -0.9741662740707397, "logps/chosen": -0.5227991938591003, "logps/rejected": -0.5981119275093079, "loss": 2.4723, "rewards/accuracies": 0.637499988079071, "rewards/chosen": -5.227993011474609, "rewards/margins": 0.7531263828277588, "rewards/rejected": -5.981118202209473, "step": 115 }, { "epoch": 0.2565130260521042, "grad_norm": 67.69889462049662, "learning_rate": 9.272941683504808e-07, "logits/chosen": -0.9916391372680664, "logits/rejected": -0.9028812646865845, "logps/chosen": -0.5420633554458618, "logps/rejected": -0.7466092705726624, "loss": 2.377, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -5.420632839202881, "rewards/margins": 2.0454587936401367, "rewards/rejected": -7.466092109680176, "step": 120 }, { "epoch": 0.26720106880427524, "grad_norm": 64.90166370238528, "learning_rate": 9.172866268606513e-07, "logits/chosen": -1.0659786462783813, "logits/rejected": -1.0236841440200806, "logps/chosen": -0.6124440431594849, "logps/rejected": -0.7124758958816528, "loss": 2.2955, "rewards/accuracies": 0.675000011920929, "rewards/chosen": -6.1244401931762695, "rewards/margins": 1.0003182888031006, "rewards/rejected": -7.124758720397949, "step": 125 }, { "epoch": 0.27788911155644624, "grad_norm": 75.48258438787046, "learning_rate": 9.066954722907638e-07, "logits/chosen": -1.0887296199798584, "logits/rejected": -1.0823543071746826, "logps/chosen": -0.6110976934432983, "logps/rejected": -0.8805627822875977, "loss": 2.1296, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -6.110977649688721, "rewards/margins": 2.694650888442993, "rewards/rejected": -8.805627822875977, "step": 130 }, { "epoch": 0.28857715430861725, "grad_norm": 62.13046213587147, "learning_rate": 8.955355173281707e-07, "logits/chosen": -1.061156153678894, "logits/rejected": -1.0147919654846191, "logps/chosen": -0.7112447023391724, "logps/rejected": -0.8724945783615112, "loss": 2.1133, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -7.1124467849731445, "rewards/margins": 1.6124988794326782, "rewards/rejected": -8.724946975708008, "step": 135 }, { "epoch": 0.29926519706078825, "grad_norm": 79.80676489486827, "learning_rate": 8.838223701790055e-07, "logits/chosen": -1.1300795078277588, "logits/rejected": -1.1087987422943115, "logps/chosen": -0.8216513395309448, "logps/rejected": -0.9944013357162476, "loss": 2.0323, "rewards/accuracies": 0.75, "rewards/chosen": -8.216513633728027, "rewards/margins": 1.7274997234344482, "rewards/rejected": -9.944013595581055, "step": 140 }, { "epoch": 0.30995323981295925, "grad_norm": 119.11117858285472, "learning_rate": 8.71572412738697e-07, "logits/chosen": -1.0456678867340088, "logits/rejected": -1.0206925868988037, "logps/chosen": -0.8874173164367676, "logps/rejected": -1.1297991275787354, "loss": 2.0077, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -8.874174118041992, "rewards/margins": 2.423818588256836, "rewards/rejected": -11.297992706298828, "step": 145 }, { "epoch": 0.32064128256513025, "grad_norm": 80.60289814144, "learning_rate": 8.588027776804058e-07, "logits/chosen": -1.0754765272140503, "logits/rejected": -1.0576502084732056, "logps/chosen": -0.9953246116638184, "logps/rejected": -1.2399874925613403, "loss": 1.968, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -9.953246116638184, "rewards/margins": 2.4466278553009033, "rewards/rejected": -12.399874687194824, "step": 150 }, { "epoch": 0.33132932531730125, "grad_norm": 79.65950829440058, "learning_rate": 8.455313244934324e-07, "logits/chosen": -1.080444097518921, "logits/rejected": -1.0592705011367798, "logps/chosen": -1.0582973957061768, "logps/rejected": -1.3756240606307983, "loss": 1.9981, "rewards/accuracies": 0.731249988079071, "rewards/chosen": -10.582974433898926, "rewards/margins": 3.173267364501953, "rewards/rejected": -13.756240844726562, "step": 155 }, { "epoch": 0.3420173680694723, "grad_norm": 81.44098785800907, "learning_rate": 8.317766145051057e-07, "logits/chosen": -1.0870612859725952, "logits/rejected": -1.069802165031433, "logps/chosen": -1.1801505088806152, "logps/rejected": -1.5819157361984253, "loss": 1.9469, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -11.801506042480469, "rewards/margins": 4.017651557922363, "rewards/rejected": -15.819157600402832, "step": 160 }, { "epoch": 0.3527054108216433, "grad_norm": 61.9394419875011, "learning_rate": 8.175578849210894e-07, "logits/chosen": -1.0850841999053955, "logits/rejected": -1.061554193496704, "logps/chosen": -1.1361093521118164, "logps/rejected": -1.5122711658477783, "loss": 1.8308, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.361093521118164, "rewards/margins": 3.761617660522461, "rewards/rejected": -15.122709274291992, "step": 165 }, { "epoch": 0.3633934535738143, "grad_norm": 86.89706327407258, "learning_rate": 8.028950219204099e-07, "logits/chosen": -1.0892133712768555, "logits/rejected": -1.0675928592681885, "logps/chosen": -1.1062101125717163, "logps/rejected": -1.4951918125152588, "loss": 1.7802, "rewards/accuracies": 0.78125, "rewards/chosen": -11.062100410461426, "rewards/margins": 3.8898162841796875, "rewards/rejected": -14.951919555664062, "step": 170 }, { "epoch": 0.3740814963259853, "grad_norm": 96.87652305461658, "learning_rate": 7.878085328428368e-07, "logits/chosen": -1.1229137182235718, "logits/rejected": -1.0774867534637451, "logps/chosen": -1.1681886911392212, "logps/rejected": -1.4487732648849487, "loss": 1.6772, "rewards/accuracies": 0.768750011920929, "rewards/chosen": -11.681886672973633, "rewards/margins": 2.805846691131592, "rewards/rejected": -14.487733840942383, "step": 175 }, { "epoch": 0.3847695390781563, "grad_norm": 80.44938362402195, "learning_rate": 7.723195175075135e-07, "logits/chosen": -1.0816049575805664, "logits/rejected": -1.0617396831512451, "logps/chosen": -1.1198861598968506, "logps/rejected": -1.4944720268249512, "loss": 1.5945, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -11.198859214782715, "rewards/margins": 3.7458598613739014, "rewards/rejected": -14.944720268249512, "step": 180 }, { "epoch": 0.3954575818303273, "grad_norm": 89.9964846943623, "learning_rate": 7.564496387029531e-07, "logits/chosen": -1.1142748594284058, "logits/rejected": -1.061927080154419, "logps/chosen": -1.1488279104232788, "logps/rejected": -1.5771600008010864, "loss": 1.6746, "rewards/accuracies": 0.84375, "rewards/chosen": -11.488279342651367, "rewards/margins": 4.283320426940918, "rewards/rejected": -15.771600723266602, "step": 185 }, { "epoch": 0.4061456245824983, "grad_norm": 91.4567322928116, "learning_rate": 7.402210918896689e-07, "logits/chosen": -1.113872766494751, "logits/rejected": -1.1223859786987305, "logps/chosen": -1.2559322118759155, "logps/rejected": -1.7311124801635742, "loss": 1.5468, "rewards/accuracies": 0.84375, "rewards/chosen": -12.55932331085205, "rewards/margins": 4.751800060272217, "rewards/rejected": -17.31112289428711, "step": 190 }, { "epoch": 0.4168336673346693, "grad_norm": 71.15679417803156, "learning_rate": 7.236565741578162e-07, "logits/chosen": -1.0601518154144287, "logits/rejected": -1.043198823928833, "logps/chosen": -1.2675104141235352, "logps/rejected": -1.6440922021865845, "loss": 1.6056, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -12.675103187561035, "rewards/margins": 3.7658183574676514, "rewards/rejected": -16.440921783447266, "step": 195 }, { "epoch": 0.42752171008684037, "grad_norm": 88.98069899942548, "learning_rate": 7.067792524832603e-07, "logits/chosen": -1.0721577405929565, "logits/rejected": -1.0621263980865479, "logps/chosen": -1.3113422393798828, "logps/rejected": -1.726875901222229, "loss": 1.5055, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -13.113421440124512, "rewards/margins": 4.155338287353516, "rewards/rejected": -17.268760681152344, "step": 200 }, { "epoch": 0.43820975283901137, "grad_norm": 96.85728294484134, "learning_rate": 6.896127313264642e-07, "logits/chosen": -1.10856032371521, "logits/rejected": -1.059822916984558, "logps/chosen": -1.3952258825302124, "logps/rejected": -1.8415533304214478, "loss": 1.716, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -13.952260971069336, "rewards/margins": 4.4632720947265625, "rewards/rejected": -18.415531158447266, "step": 205 }, { "epoch": 0.44889779559118237, "grad_norm": 98.7584341258845, "learning_rate": 6.721810196195174e-07, "logits/chosen": -1.134487271308899, "logits/rejected": -1.1236417293548584, "logps/chosen": -1.4038760662078857, "logps/rejected": -1.8213703632354736, "loss": 1.5993, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -14.0387601852417, "rewards/margins": 4.174942970275879, "rewards/rejected": -18.213703155517578, "step": 210 }, { "epoch": 0.45958583834335337, "grad_norm": 116.36934325190856, "learning_rate": 6.545084971874736e-07, "logits/chosen": -1.083676815032959, "logits/rejected": -1.0672903060913086, "logps/chosen": -1.3948618173599243, "logps/rejected": -1.87642502784729, "loss": 1.4766, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -13.948617935180664, "rewards/margins": 4.8156328201293945, "rewards/rejected": -18.76424789428711, "step": 215 }, { "epoch": 0.47027388109552437, "grad_norm": 96.57054428988462, "learning_rate": 6.3661988065096e-07, "logits/chosen": -1.1386303901672363, "logits/rejected": -1.1223524808883667, "logps/chosen": -1.477141261100769, "logps/rejected": -1.971549391746521, "loss": 1.431, "rewards/accuracies": 0.737500011920929, "rewards/chosen": -14.77141284942627, "rewards/margins": 4.9440813064575195, "rewards/rejected": -19.71549415588379, "step": 220 }, { "epoch": 0.48096192384769537, "grad_norm": 72.99627339556893, "learning_rate": 6.185401888577487e-07, "logits/chosen": -1.1213773488998413, "logits/rejected": -1.0908575057983398, "logps/chosen": -1.5149943828582764, "logps/rejected": -1.969143271446228, "loss": 1.4658, "rewards/accuracies": 0.7562500238418579, "rewards/chosen": -15.149943351745605, "rewards/margins": 4.541489601135254, "rewards/rejected": -19.69143295288086, "step": 225 }, { "epoch": 0.4916499665998664, "grad_norm": 75.07337643391894, "learning_rate": 6.002947078916364e-07, "logits/chosen": -1.1983073949813843, "logits/rejected": -1.150994896888733, "logps/chosen": -1.4561713933944702, "logps/rejected": -1.9137779474258423, "loss": 1.3907, "rewards/accuracies": 0.78125, "rewards/chosen": -14.561712265014648, "rewards/margins": 4.5760674476623535, "rewards/rejected": -19.137781143188477, "step": 230 }, { "epoch": 0.5023380093520374, "grad_norm": 89.06305062801928, "learning_rate": 5.819089557075688e-07, "logits/chosen": -1.227797031402588, "logits/rejected": -1.2002477645874023, "logps/chosen": -1.4925800561904907, "logps/rejected": -1.9937610626220703, "loss": 1.4023, "rewards/accuracies": 0.793749988079071, "rewards/chosen": -14.925801277160645, "rewards/margins": 5.011811256408691, "rewards/rejected": -19.937610626220703, "step": 235 }, { "epoch": 0.5130260521042084, "grad_norm": 89.0732695289788, "learning_rate": 5.634086464424742e-07, "logits/chosen": -1.189410924911499, "logits/rejected": -1.1908595561981201, "logps/chosen": -1.4204081296920776, "logps/rejected": -1.9320650100708008, "loss": 1.4327, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.204083442687988, "rewards/margins": 5.116568088531494, "rewards/rejected": -19.320650100708008, "step": 240 }, { "epoch": 0.5237140948563794, "grad_norm": 99.61038425380444, "learning_rate": 5.448196544517167e-07, "logits/chosen": -1.2636008262634277, "logits/rejected": -1.2102385759353638, "logps/chosen": -1.492004156112671, "logps/rejected": -2.0921199321746826, "loss": 1.3755, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -14.920039176940918, "rewards/margins": 6.001158714294434, "rewards/rejected": -20.921199798583984, "step": 245 }, { "epoch": 0.5344021376085505, "grad_norm": 143.41066987990183, "learning_rate": 5.26167978121472e-07, "logits/chosen": -1.2205616235733032, "logits/rejected": -1.2053756713867188, "logps/chosen": -1.5569369792938232, "logps/rejected": -2.1403331756591797, "loss": 1.3485, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.569369316101074, "rewards/margins": 5.833963394165039, "rewards/rejected": -21.403331756591797, "step": 250 }, { "epoch": 0.5450901803607214, "grad_norm": 71.93551703878607, "learning_rate": 5.074797035076318e-07, "logits/chosen": -1.2752165794372559, "logits/rejected": -1.2500503063201904, "logps/chosen": -1.6620187759399414, "logps/rejected": -2.134455442428589, "loss": 1.4857, "rewards/accuracies": 0.8125, "rewards/chosen": -16.620187759399414, "rewards/margins": 4.724367141723633, "rewards/rejected": -21.344552993774414, "step": 255 }, { "epoch": 0.5557782231128925, "grad_norm": 85.67142749873541, "learning_rate": 4.887809678520975e-07, "logits/chosen": -1.2445859909057617, "logits/rejected": -1.216204047203064, "logps/chosen": -1.5793449878692627, "logps/rejected": -2.078167676925659, "loss": 1.4255, "rewards/accuracies": 0.84375, "rewards/chosen": -15.793449401855469, "rewards/margins": 4.988225936889648, "rewards/rejected": -20.781675338745117, "step": 260 }, { "epoch": 0.5664662658650634, "grad_norm": 74.44253878678798, "learning_rate": 4.700979230274829e-07, "logits/chosen": -1.1956579685211182, "logits/rejected": -1.1797969341278076, "logps/chosen": -1.6723514795303345, "logps/rejected": -2.195023536682129, "loss": 1.3414, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.723514556884766, "rewards/margins": 5.22672176361084, "rewards/rejected": -21.95023536682129, "step": 265 }, { "epoch": 0.5771543086172345, "grad_norm": 114.96460787224315, "learning_rate": 4.514566989613559e-07, "logits/chosen": -1.1912486553192139, "logits/rejected": -1.1648938655853271, "logps/chosen": -1.463266134262085, "logps/rejected": -2.001335620880127, "loss": 1.3473, "rewards/accuracies": 0.84375, "rewards/chosen": -14.632661819458008, "rewards/margins": 5.3806915283203125, "rewards/rejected": -20.013355255126953, "step": 270 }, { "epoch": 0.5878423513694054, "grad_norm": 75.32343278326546, "learning_rate": 4.328833670911724e-07, "logits/chosen": -1.1563775539398193, "logits/rejected": -1.1241414546966553, "logps/chosen": -1.4626271724700928, "logps/rejected": -1.924564003944397, "loss": 1.4306, "rewards/accuracies": 0.8062499761581421, "rewards/chosen": -14.626272201538086, "rewards/margins": 4.619367599487305, "rewards/rejected": -19.24563980102539, "step": 275 }, { "epoch": 0.5985303941215765, "grad_norm": 112.30854407154642, "learning_rate": 4.144039039010124e-07, "logits/chosen": -1.2617356777191162, "logits/rejected": -1.2384282350540161, "logps/chosen": -1.5061413049697876, "logps/rejected": -2.0556976795196533, "loss": 1.3243, "rewards/accuracies": 0.800000011920929, "rewards/chosen": -15.06141185760498, "rewards/margins": 5.495565891265869, "rewards/rejected": -20.556978225708008, "step": 280 }, { "epoch": 0.6092184368737475, "grad_norm": 102.49061452491978, "learning_rate": 3.960441545911204e-07, "logits/chosen": -1.2426598072052002, "logits/rejected": -1.2112630605697632, "logps/chosen": -1.5387237071990967, "logps/rejected": -2.120283842086792, "loss": 1.0685, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -15.387234687805176, "rewards/margins": 5.815601825714111, "rewards/rejected": -21.202838897705078, "step": 285 }, { "epoch": 0.6199064796259185, "grad_norm": 86.79253258499234, "learning_rate": 3.778297969310529e-07, "logits/chosen": -1.2638859748840332, "logits/rejected": -1.2220103740692139, "logps/chosen": -1.564584493637085, "logps/rejected": -2.053191661834717, "loss": 1.3472, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.645845413208008, "rewards/margins": 4.886073589324951, "rewards/rejected": -20.531917572021484, "step": 290 }, { "epoch": 0.6305945223780896, "grad_norm": 93.44397121318542, "learning_rate": 3.5978630534699865e-07, "logits/chosen": -1.2045689821243286, "logits/rejected": -1.191235899925232, "logps/chosen": -1.5795795917510986, "logps/rejected": -2.093400239944458, "loss": 1.1752, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -15.795794486999512, "rewards/margins": 5.138205528259277, "rewards/rejected": -20.934001922607422, "step": 295 }, { "epoch": 0.6412825651302605, "grad_norm": 83.42376671175532, "learning_rate": 3.4193891529348795e-07, "logits/chosen": -1.128404974937439, "logits/rejected": -1.1020969152450562, "logps/chosen": -1.6557916402816772, "logps/rejected": -2.1081161499023438, "loss": 1.574, "rewards/accuracies": 0.762499988079071, "rewards/chosen": -16.557918548583984, "rewards/margins": 4.523244857788086, "rewards/rejected": -21.081159591674805, "step": 300 }, { "epoch": 0.6519706078824316, "grad_norm": 87.28007107027204, "learning_rate": 3.243125879593286e-07, "logits/chosen": -1.235114574432373, "logits/rejected": -1.19254469871521, "logps/chosen": -1.6206077337265015, "logps/rejected": -2.079169750213623, "loss": 1.3167, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -16.206077575683594, "rewards/margins": 4.585621356964111, "rewards/rejected": -20.791696548461914, "step": 305 }, { "epoch": 0.6626586506346025, "grad_norm": 105.45685254547827, "learning_rate": 3.069319753571269e-07, "logits/chosen": -1.267155408859253, "logits/rejected": -1.2484853267669678, "logps/chosen": -1.6359084844589233, "logps/rejected": -2.1494529247283936, "loss": 1.3629, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -16.359085083007812, "rewards/margins": 5.135441780090332, "rewards/rejected": -21.49452781677246, "step": 310 }, { "epoch": 0.6733466933867736, "grad_norm": 87.29974596975983, "learning_rate": 2.898213858452173e-07, "logits/chosen": -1.2641007900238037, "logits/rejected": -1.2104285955429077, "logps/chosen": -1.6273491382598877, "logps/rejected": -2.1474812030792236, "loss": 1.3491, "rewards/accuracies": 0.84375, "rewards/chosen": -16.273488998413086, "rewards/margins": 5.201323509216309, "rewards/rejected": -21.47481346130371, "step": 315 }, { "epoch": 0.6840347361389446, "grad_norm": 93.70048699997521, "learning_rate": 2.730047501302266e-07, "logits/chosen": -1.247004747390747, "logits/rejected": -1.2437224388122559, "logps/chosen": -1.6495912075042725, "logps/rejected": -2.273390293121338, "loss": 1.2651, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -16.495912551879883, "rewards/margins": 6.237987518310547, "rewards/rejected": -22.73390007019043, "step": 320 }, { "epoch": 0.6947227788911156, "grad_norm": 76.81018981722117, "learning_rate": 2.5650558779781635e-07, "logits/chosen": -1.26289701461792, "logits/rejected": -1.2167103290557861, "logps/chosen": -1.7046712636947632, "logps/rejected": -2.372957944869995, "loss": 1.3105, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -17.046714782714844, "rewards/margins": 6.682864189147949, "rewards/rejected": -23.72957992553711, "step": 325 }, { "epoch": 0.7054108216432866, "grad_norm": 73.64401812634293, "learning_rate": 2.403469744184154e-07, "logits/chosen": -1.178143858909607, "logits/rejected": -1.1377698183059692, "logps/chosen": -1.6760982275009155, "logps/rejected": -2.156362533569336, "loss": 1.3515, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.760982513427734, "rewards/margins": 4.802641868591309, "rewards/rejected": -21.56362533569336, "step": 330 }, { "epoch": 0.7160988643954576, "grad_norm": 100.69110505698991, "learning_rate": 2.2455150927394878e-07, "logits/chosen": -1.2217228412628174, "logits/rejected": -1.20427405834198, "logps/chosen": -1.6592464447021484, "logps/rejected": -2.2141623497009277, "loss": 1.1861, "rewards/accuracies": 0.84375, "rewards/chosen": -16.592464447021484, "rewards/margins": 5.549159049987793, "rewards/rejected": -22.141624450683594, "step": 335 }, { "epoch": 0.7267869071476286, "grad_norm": 96.51234191429023, "learning_rate": 2.0914128375069722e-07, "logits/chosen": -1.2305556535720825, "logits/rejected": -1.1986171007156372, "logps/chosen": -1.5974411964416504, "logps/rejected": -2.150116443634033, "loss": 1.3127, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.974411010742188, "rewards/margins": 5.526752948760986, "rewards/rejected": -21.501163482666016, "step": 340 }, { "epoch": 0.7374749498997996, "grad_norm": 82.01592774884807, "learning_rate": 1.9413785044249676e-07, "logits/chosen": -1.2591183185577393, "logits/rejected": -1.2364073991775513, "logps/chosen": -1.667109727859497, "logps/rejected": -2.311634063720703, "loss": 1.3624, "rewards/accuracies": 0.875, "rewards/chosen": -16.671098709106445, "rewards/margins": 6.445242404937744, "rewards/rejected": -23.1163387298584, "step": 345 }, { "epoch": 0.7481629926519706, "grad_norm": 122.79704197237824, "learning_rate": 1.7956219300748792e-07, "logits/chosen": -1.2474735975265503, "logits/rejected": -1.2506452798843384, "logps/chosen": -1.5353481769561768, "logps/rejected": -2.0822863578796387, "loss": 1.2838, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -15.353483200073242, "rewards/margins": 5.4693803787231445, "rewards/rejected": -20.822864532470703, "step": 350 }, { "epoch": 0.7588510354041417, "grad_norm": 96.25560337558127, "learning_rate": 1.6543469682057104e-07, "logits/chosen": -1.1812379360198975, "logits/rejected": -1.1956241130828857, "logps/chosen": -1.5455963611602783, "logps/rejected": -2.1094608306884766, "loss": 1.1903, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -15.455963134765625, "rewards/margins": 5.638647079467773, "rewards/rejected": -21.0946102142334, "step": 355 }, { "epoch": 0.7695390781563126, "grad_norm": 79.54472628433167, "learning_rate": 1.5177512046261666e-07, "logits/chosen": -1.2227225303649902, "logits/rejected": -1.223512053489685, "logps/chosen": -1.562652349472046, "logps/rejected": -2.2054429054260254, "loss": 1.3011, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -15.626523971557617, "rewards/margins": 6.4279046058654785, "rewards/rejected": -22.054428100585938, "step": 360 }, { "epoch": 0.7802271209084837, "grad_norm": 67.31957818166626, "learning_rate": 1.3860256808630427e-07, "logits/chosen": -1.2667648792266846, "logits/rejected": -1.204973816871643, "logps/chosen": -1.611985206604004, "logps/rejected": -2.2405993938446045, "loss": 1.2638, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.119850158691406, "rewards/margins": 6.286141872406006, "rewards/rejected": -22.405994415283203, "step": 365 }, { "epoch": 0.7909151636606546, "grad_norm": 96.4652631691847, "learning_rate": 1.2593546269723647e-07, "logits/chosen": -1.1908237934112549, "logits/rejected": -1.1797075271606445, "logps/chosen": -1.5888497829437256, "logps/rejected": -2.0819642543792725, "loss": 1.2368, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -15.888498306274414, "rewards/margins": 4.931147575378418, "rewards/rejected": -20.819644927978516, "step": 370 }, { "epoch": 0.8016032064128257, "grad_norm": 77.32657538767864, "learning_rate": 1.1379152038770029e-07, "logits/chosen": -1.22549307346344, "logits/rejected": -1.2306774854660034, "logps/chosen": -1.7054897546768188, "logps/rejected": -2.2909984588623047, "loss": 1.2394, "rewards/accuracies": 0.84375, "rewards/chosen": -17.05489730834961, "rewards/margins": 5.855085372924805, "rewards/rejected": -22.909982681274414, "step": 375 }, { "epoch": 0.8122912491649966, "grad_norm": 122.77103138361475, "learning_rate": 1.0218772555910954e-07, "logits/chosen": -1.2489427328109741, "logits/rejected": -1.2302041053771973, "logps/chosen": -1.59738028049469, "logps/rejected": -2.1246509552001953, "loss": 1.3954, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -15.973803520202637, "rewards/margins": 5.272706031799316, "rewards/rejected": -21.246509552001953, "step": 380 }, { "epoch": 0.8229792919171677, "grad_norm": 73.11130573539627, "learning_rate": 9.114030716778432e-08, "logits/chosen": -1.231930136680603, "logits/rejected": -1.2102787494659424, "logps/chosen": -1.6145036220550537, "logps/rejected": -2.3103325366973877, "loss": 1.0955, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.145038604736328, "rewards/margins": 6.958285331726074, "rewards/rejected": -23.103322982788086, "step": 385 }, { "epoch": 0.8336673346693386, "grad_norm": 80.68579596437256, "learning_rate": 8.066471602728803e-08, "logits/chosen": -1.2316021919250488, "logits/rejected": -1.2188332080841064, "logps/chosen": -1.6731784343719482, "logps/rejected": -2.2686033248901367, "loss": 1.2377, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -16.73178482055664, "rewards/margins": 5.95424747467041, "rewards/rejected": -22.686031341552734, "step": 390 }, { "epoch": 0.8443553774215097, "grad_norm": 77.88673283635482, "learning_rate": 7.077560319906694e-08, "logits/chosen": -1.2400703430175781, "logits/rejected": -1.2209936380386353, "logps/chosen": -1.5812984704971313, "logps/rejected": -2.1461308002471924, "loss": 1.2574, "rewards/accuracies": 0.8187500238418579, "rewards/chosen": -15.812983512878418, "rewards/margins": 5.648324012756348, "rewards/rejected": -21.461307525634766, "step": 395 }, { "epoch": 0.8550434201736807, "grad_norm": 64.27634143705052, "learning_rate": 6.148679950161672e-08, "logits/chosen": -1.2457908391952515, "logits/rejected": -1.2307510375976562, "logps/chosen": -1.6303634643554688, "logps/rejected": -2.144191265106201, "loss": 1.2352, "rewards/accuracies": 0.8125, "rewards/chosen": -16.303632736206055, "rewards/margins": 5.138282775878906, "rewards/rejected": -21.441913604736328, "step": 400 }, { "epoch": 0.8550434201736807, "eval_logits/chosen": -1.4050133228302002, "eval_logits/rejected": -1.4148539304733276, "eval_logps/chosen": -1.6315457820892334, "eval_logps/rejected": -2.184220314025879, "eval_loss": 1.3035991191864014, "eval_rewards/accuracies": 0.8313007950782776, "eval_rewards/chosen": -16.315458297729492, "eval_rewards/margins": 5.526745319366455, "eval_rewards/rejected": -21.842201232910156, "eval_runtime": 114.1272, "eval_samples_per_second": 17.183, "eval_steps_per_second": 1.078, "step": 400 }, { "epoch": 0.8657314629258517, "grad_norm": 93.3112085508996, "learning_rate": 5.2811296166831666e-08, "logits/chosen": -1.2086267471313477, "logits/rejected": -1.2275283336639404, "logps/chosen": -1.705394983291626, "logps/rejected": -2.2604918479919434, "loss": 1.2335, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -17.053951263427734, "rewards/margins": 5.550968647003174, "rewards/rejected": -22.604917526245117, "step": 405 }, { "epoch": 0.8764195056780227, "grad_norm": 147.49347048623574, "learning_rate": 4.4761226670592066e-08, "logits/chosen": -1.224875569343567, "logits/rejected": -1.2125729322433472, "logps/chosen": -1.6484178304672241, "logps/rejected": -2.173166036605835, "loss": 1.3786, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.48417854309082, "rewards/margins": 5.247479438781738, "rewards/rejected": -21.731660842895508, "step": 410 }, { "epoch": 0.8871075484301937, "grad_norm": 72.56853127664434, "learning_rate": 3.734784976300165e-08, "logits/chosen": -1.216326355934143, "logits/rejected": -1.1681609153747559, "logps/chosen": -1.584081768989563, "logps/rejected": -2.2398409843444824, "loss": 1.3224, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -15.84081745147705, "rewards/margins": 6.557589530944824, "rewards/rejected": -22.398406982421875, "step": 415 }, { "epoch": 0.8977955911823647, "grad_norm": 96.60767749787689, "learning_rate": 3.058153372200695e-08, "logits/chosen": -1.252618432044983, "logits/rejected": -1.206176996231079, "logps/chosen": -1.533342719078064, "logps/rejected": -2.1350560188293457, "loss": 1.2256, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -15.333427429199219, "rewards/margins": 6.0171332359313965, "rewards/rejected": -21.350561141967773, "step": 420 }, { "epoch": 0.9084836339345357, "grad_norm": 102.43117197696006, "learning_rate": 2.4471741852423233e-08, "logits/chosen": -1.2422844171524048, "logits/rejected": -1.2342640161514282, "logps/chosen": -1.7160053253173828, "logps/rejected": -2.2498655319213867, "loss": 1.4539, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -17.160053253173828, "rewards/margins": 5.338602542877197, "rewards/rejected": -22.498653411865234, "step": 425 }, { "epoch": 0.9191716766867067, "grad_norm": 95.6241453357728, "learning_rate": 1.9027019250647036e-08, "logits/chosen": -1.232668161392212, "logits/rejected": -1.2184712886810303, "logps/chosen": -1.7022215127944946, "logps/rejected": -2.2985284328460693, "loss": 1.267, "rewards/accuracies": 0.875, "rewards/chosen": -17.022212982177734, "rewards/margins": 5.963072299957275, "rewards/rejected": -22.985288619995117, "step": 430 }, { "epoch": 0.9298597194388778, "grad_norm": 76.99966381399814, "learning_rate": 1.4254980853566246e-08, "logits/chosen": -1.182472825050354, "logits/rejected": -1.1473052501678467, "logps/chosen": -1.5595757961273193, "logps/rejected": -2.154953956604004, "loss": 1.2015, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -15.595758438110352, "rewards/margins": 5.953780174255371, "rewards/rejected": -21.54953956604004, "step": 435 }, { "epoch": 0.9405477621910487, "grad_norm": 84.23154902337001, "learning_rate": 1.016230078838226e-08, "logits/chosen": -1.2160289287567139, "logits/rejected": -1.1602892875671387, "logps/chosen": -1.6690679788589478, "logps/rejected": -2.205056667327881, "loss": 1.245, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.690677642822266, "rewards/margins": 5.359889984130859, "rewards/rejected": -22.050569534301758, "step": 440 }, { "epoch": 0.9512358049432198, "grad_norm": 75.27496517042923, "learning_rate": 6.754703038239329e-09, "logits/chosen": -1.1662180423736572, "logits/rejected": -1.1511404514312744, "logps/chosen": -1.6556246280670166, "logps/rejected": -2.3011534214019775, "loss": 1.1055, "rewards/accuracies": 0.84375, "rewards/chosen": -16.556243896484375, "rewards/margins": 6.4552903175354, "rewards/rejected": -23.011533737182617, "step": 445 }, { "epoch": 0.9619238476953907, "grad_norm": 92.06659067628235, "learning_rate": 4.036953436716895e-09, "logits/chosen": -1.278028130531311, "logits/rejected": -1.2591049671173096, "logps/chosen": -1.6093896627426147, "logps/rejected": -2.1693015098571777, "loss": 1.3206, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -16.093896865844727, "rewards/margins": 5.599120140075684, "rewards/rejected": -21.693017959594727, "step": 450 }, { "epoch": 0.9726118904475618, "grad_norm": 100.7331017689662, "learning_rate": 2.0128530023804656e-09, "logits/chosen": -1.2293764352798462, "logits/rejected": -1.1971036195755005, "logps/chosen": -1.6129881143569946, "logps/rejected": -2.264960765838623, "loss": 1.0669, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -16.129878997802734, "rewards/margins": 6.5197248458862305, "rewards/rejected": -22.649606704711914, "step": 455 }, { "epoch": 0.9832999331997327, "grad_norm": 88.7868280064186, "learning_rate": 6.852326227130833e-10, "logits/chosen": -1.241369366645813, "logits/rejected": -1.2309256792068481, "logps/chosen": -1.6784422397613525, "logps/rejected": -2.295506000518799, "loss": 1.1912, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": -16.784420013427734, "rewards/margins": 6.170637607574463, "rewards/rejected": -22.95505714416504, "step": 460 }, { "epoch": 0.9939879759519038, "grad_norm": 89.2084840240269, "learning_rate": 5.594909486328348e-11, "logits/chosen": -1.2126357555389404, "logits/rejected": -1.2189154624938965, "logps/chosen": -1.7046855688095093, "logps/rejected": -2.3294055461883545, "loss": 1.3123, "rewards/accuracies": 0.84375, "rewards/chosen": -17.046857833862305, "rewards/margins": 6.247200965881348, "rewards/rejected": -23.294055938720703, "step": 465 }, { "epoch": 0.9982631930527722, "step": 467, "total_flos": 0.0, "train_loss": 1.7982526555561662, "train_runtime": 17001.7268, "train_samples_per_second": 3.522, "train_steps_per_second": 0.027 } ], "logging_steps": 5, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }