{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9981298423724285, "eval_steps": 500, "global_step": 467, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0021373230029388193, "grad_norm": 2.51713228225708, "learning_rate": 1.0638297872340425e-08, "logits/chosen": -1.1381689310073853, "logits/rejected": -0.9913416504859924, "logps/chosen": -0.2839311957359314, "logps/rejected": -0.2955534756183624, "loss": 1.6097, "rewards/accuracies": 0.625, "rewards/chosen": -0.5678623914718628, "rewards/margins": 0.023244591429829597, "rewards/rejected": -0.5911069512367249, "step": 1 }, { "epoch": 0.004274646005877639, "grad_norm": 6.541850566864014, "learning_rate": 2.127659574468085e-08, "logits/chosen": -1.0311710834503174, "logits/rejected": -0.8901023864746094, "logps/chosen": -0.24952735006809235, "logps/rejected": -0.24253402650356293, "loss": 1.6096, "rewards/accuracies": 0.5, "rewards/chosen": -0.4990547001361847, "rewards/margins": -0.013986671343445778, "rewards/rejected": -0.48506805300712585, "step": 2 }, { "epoch": 0.006411969008816457, "grad_norm": 5.6596479415893555, "learning_rate": 3.191489361702127e-08, "logits/chosen": -0.9279628992080688, "logits/rejected": -0.8305555582046509, "logps/chosen": -0.2633163630962372, "logps/rejected": -0.26702702045440674, "loss": 1.6174, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5266327261924744, "rewards/margins": 0.007421246729791164, "rewards/rejected": -0.5340540409088135, "step": 3 }, { "epoch": 0.008549292011755277, "grad_norm": 3.8121635913848877, "learning_rate": 4.25531914893617e-08, "logits/chosen": -0.8504582047462463, "logits/rejected": -0.7527742981910706, "logps/chosen": -0.2771408259868622, "logps/rejected": -0.26471394300460815, "loss": 1.6393, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5542816519737244, "rewards/margins": -0.024853792041540146, "rewards/rejected": -0.5294278860092163, "step": 4 }, { "epoch": 0.010686615014694095, "grad_norm": 6.048301696777344, "learning_rate": 5.3191489361702123e-08, "logits/chosen": -1.156632661819458, "logits/rejected": -1.2128832340240479, "logps/chosen": -0.28773820400238037, "logps/rejected": -0.29937219619750977, "loss": 1.6108, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5754764080047607, "rewards/margins": 0.023267941549420357, "rewards/rejected": -0.5987443923950195, "step": 5 }, { "epoch": 0.012823938017632914, "grad_norm": 3.6442198753356934, "learning_rate": 6.382978723404254e-08, "logits/chosen": -1.0647015571594238, "logits/rejected": -1.031942367553711, "logps/chosen": -0.25931063294410706, "logps/rejected": -0.28003033995628357, "loss": 1.6079, "rewards/accuracies": 0.625, "rewards/chosen": -0.5186212658882141, "rewards/margins": 0.04143940657377243, "rewards/rejected": -0.5600606799125671, "step": 6 }, { "epoch": 0.014961261020571734, "grad_norm": 5.595146656036377, "learning_rate": 7.446808510638298e-08, "logits/chosen": -0.7785481810569763, "logits/rejected": -0.7654089331626892, "logps/chosen": -0.25532105565071106, "logps/rejected": -0.24814245104789734, "loss": 1.6092, "rewards/accuracies": 0.625, "rewards/chosen": -0.5106421113014221, "rewards/margins": -0.01435722503811121, "rewards/rejected": -0.4962849020957947, "step": 7 }, { "epoch": 0.017098584023510555, "grad_norm": 2.9471020698547363, "learning_rate": 8.51063829787234e-08, "logits/chosen": -1.0282069444656372, "logits/rejected": -1.0483824014663696, "logps/chosen": -0.24546000361442566, "logps/rejected": -0.2658500373363495, "loss": 1.5902, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4909200072288513, "rewards/margins": 0.04078003019094467, "rewards/rejected": -0.531700074672699, "step": 8 }, { "epoch": 0.01923590702644937, "grad_norm": 3.132836103439331, "learning_rate": 9.574468085106382e-08, "logits/chosen": -0.9889479875564575, "logits/rejected": -0.8638209104537964, "logps/chosen": -0.27614107728004456, "logps/rejected": -0.2566734254360199, "loss": 1.6173, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5522821545600891, "rewards/margins": -0.03893527761101723, "rewards/rejected": -0.5133468508720398, "step": 9 }, { "epoch": 0.02137323002938819, "grad_norm": 5.624292850494385, "learning_rate": 1.0638297872340425e-07, "logits/chosen": -1.0719839334487915, "logits/rejected": -1.0015329122543335, "logps/chosen": -0.32535240054130554, "logps/rejected": -0.31745338439941406, "loss": 1.6211, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6507048010826111, "rewards/margins": -0.015798063948750496, "rewards/rejected": -0.6349067687988281, "step": 10 }, { "epoch": 0.02351055303232701, "grad_norm": 5.1507039070129395, "learning_rate": 1.1702127659574468e-07, "logits/chosen": -0.9715439677238464, "logits/rejected": -0.8908199071884155, "logps/chosen": -0.2835432291030884, "logps/rejected": -0.2507440745830536, "loss": 1.612, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5670864582061768, "rewards/margins": -0.0655982717871666, "rewards/rejected": -0.5014881491661072, "step": 11 }, { "epoch": 0.02564787603526583, "grad_norm": 2.2926666736602783, "learning_rate": 1.2765957446808508e-07, "logits/chosen": -0.9799962639808655, "logits/rejected": -1.0184035301208496, "logps/chosen": -0.29446908831596375, "logps/rejected": -0.26765191555023193, "loss": 1.6202, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5889381766319275, "rewards/margins": -0.05363432317972183, "rewards/rejected": -0.5353038311004639, "step": 12 }, { "epoch": 0.027785199038204648, "grad_norm": 5.308409690856934, "learning_rate": 1.3829787234042553e-07, "logits/chosen": -0.8681848049163818, "logits/rejected": -0.8799771070480347, "logps/chosen": -0.3181426227092743, "logps/rejected": -0.3121987581253052, "loss": 1.6031, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6362852454185486, "rewards/margins": -0.011887717992067337, "rewards/rejected": -0.6243975162506104, "step": 13 }, { "epoch": 0.029922522041143467, "grad_norm": 4.573068618774414, "learning_rate": 1.4893617021276595e-07, "logits/chosen": -0.8867932558059692, "logits/rejected": -0.861649751663208, "logps/chosen": -0.312772661447525, "logps/rejected": -0.29462364315986633, "loss": 1.6226, "rewards/accuracies": 0.375, "rewards/chosen": -0.62554532289505, "rewards/margins": -0.03629804030060768, "rewards/rejected": -0.5892472863197327, "step": 14 }, { "epoch": 0.03205984504408229, "grad_norm": 4.1025872230529785, "learning_rate": 1.5957446808510638e-07, "logits/chosen": -1.1116752624511719, "logits/rejected": -0.9415389895439148, "logps/chosen": -0.27133169770240784, "logps/rejected": -0.29030919075012207, "loss": 1.5818, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5426633954048157, "rewards/margins": 0.037955012172460556, "rewards/rejected": -0.5806183815002441, "step": 15 }, { "epoch": 0.03419716804702111, "grad_norm": 3.307173728942871, "learning_rate": 1.702127659574468e-07, "logits/chosen": -0.9105625152587891, "logits/rejected": -0.8872620463371277, "logps/chosen": -0.2662544846534729, "logps/rejected": -0.28296971321105957, "loss": 1.6112, "rewards/accuracies": 0.5, "rewards/chosen": -0.5325089693069458, "rewards/margins": 0.03343046456575394, "rewards/rejected": -0.5659394264221191, "step": 16 }, { "epoch": 0.03633449104995993, "grad_norm": 6.173768997192383, "learning_rate": 1.8085106382978725e-07, "logits/chosen": -0.7553848028182983, "logits/rejected": -0.7946615815162659, "logps/chosen": -0.277927964925766, "logps/rejected": -0.28916075825691223, "loss": 1.5928, "rewards/accuracies": 0.5, "rewards/chosen": -0.555855929851532, "rewards/margins": 0.02246551401913166, "rewards/rejected": -0.5783215165138245, "step": 17 }, { "epoch": 0.03847181405289874, "grad_norm": 3.708397626876831, "learning_rate": 1.9148936170212765e-07, "logits/chosen": -1.0742344856262207, "logits/rejected": -1.1560362577438354, "logps/chosen": -0.2530558407306671, "logps/rejected": -0.2565101981163025, "loss": 1.6245, "rewards/accuracies": 0.5, "rewards/chosen": -0.5061116814613342, "rewards/margins": 0.006908770650625229, "rewards/rejected": -0.513020396232605, "step": 18 }, { "epoch": 0.04060913705583756, "grad_norm": 4.8654351234436035, "learning_rate": 2.0212765957446807e-07, "logits/chosen": -1.1306225061416626, "logits/rejected": -1.0444625616073608, "logps/chosen": -0.2724864184856415, "logps/rejected": -0.2817416787147522, "loss": 1.6247, "rewards/accuracies": 0.6875, "rewards/chosen": -0.544972836971283, "rewards/margins": 0.018510470166802406, "rewards/rejected": -0.5634833574295044, "step": 19 }, { "epoch": 0.04274646005877638, "grad_norm": 3.5271363258361816, "learning_rate": 2.127659574468085e-07, "logits/chosen": -1.0348137617111206, "logits/rejected": -1.0212081670761108, "logps/chosen": -0.2397567480802536, "logps/rejected": -0.23578569293022156, "loss": 1.6172, "rewards/accuracies": 0.5, "rewards/chosen": -0.4795134961605072, "rewards/margins": -0.007942091673612595, "rewards/rejected": -0.4715713858604431, "step": 20 }, { "epoch": 0.0448837830617152, "grad_norm": 7.901147842407227, "learning_rate": 2.2340425531914892e-07, "logits/chosen": -1.1679033041000366, "logits/rejected": -1.0415174961090088, "logps/chosen": -0.33534738421440125, "logps/rejected": -0.27388396859169006, "loss": 1.6502, "rewards/accuracies": 0.25, "rewards/chosen": -0.6706947684288025, "rewards/margins": -0.12292689830064774, "rewards/rejected": -0.5477679371833801, "step": 21 }, { "epoch": 0.04702110606465402, "grad_norm": 2.3991823196411133, "learning_rate": 2.3404255319148937e-07, "logits/chosen": -1.0736172199249268, "logits/rejected": -1.0771551132202148, "logps/chosen": -0.2646552622318268, "logps/rejected": -0.2733539938926697, "loss": 1.6048, "rewards/accuracies": 0.375, "rewards/chosen": -0.5293105244636536, "rewards/margins": 0.01739754155278206, "rewards/rejected": -0.5467079877853394, "step": 22 }, { "epoch": 0.04915842906759284, "grad_norm": 4.812252998352051, "learning_rate": 2.4468085106382976e-07, "logits/chosen": -0.8147614002227783, "logits/rejected": -0.9166449904441833, "logps/chosen": -0.28619590401649475, "logps/rejected": -0.2908383309841156, "loss": 1.5764, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5723918080329895, "rewards/margins": 0.009284832514822483, "rewards/rejected": -0.5816766619682312, "step": 23 }, { "epoch": 0.05129575207053166, "grad_norm": 5.214301109313965, "learning_rate": 2.5531914893617016e-07, "logits/chosen": -1.0316184759140015, "logits/rejected": -1.0412724018096924, "logps/chosen": -0.23989242315292358, "logps/rejected": -0.26728230714797974, "loss": 1.602, "rewards/accuracies": 0.5, "rewards/chosen": -0.47978484630584717, "rewards/margins": 0.0547797717154026, "rewards/rejected": -0.5345646142959595, "step": 24 }, { "epoch": 0.053433075073470476, "grad_norm": 3.372835636138916, "learning_rate": 2.659574468085106e-07, "logits/chosen": -1.0795375108718872, "logits/rejected": -0.9741866588592529, "logps/chosen": -0.28838473558425903, "logps/rejected": -0.32610005140304565, "loss": 1.6016, "rewards/accuracies": 0.625, "rewards/chosen": -0.5767694711685181, "rewards/margins": 0.07543070614337921, "rewards/rejected": -0.6522001028060913, "step": 25 }, { "epoch": 0.055570398076409296, "grad_norm": 3.9052999019622803, "learning_rate": 2.7659574468085106e-07, "logits/chosen": -1.2568001747131348, "logits/rejected": -1.1107139587402344, "logps/chosen": -0.30466389656066895, "logps/rejected": -0.2980763614177704, "loss": 1.6209, "rewards/accuracies": 0.5, "rewards/chosen": -0.6093277931213379, "rewards/margins": -0.0131750563159585, "rewards/rejected": -0.5961527228355408, "step": 26 }, { "epoch": 0.057707721079348115, "grad_norm": 3.9069981575012207, "learning_rate": 2.872340425531915e-07, "logits/chosen": -1.0098018646240234, "logits/rejected": -0.9794459342956543, "logps/chosen": -0.2699134051799774, "logps/rejected": -0.28315117955207825, "loss": 1.6203, "rewards/accuracies": 0.5, "rewards/chosen": -0.5398268103599548, "rewards/margins": 0.026475582271814346, "rewards/rejected": -0.5663023591041565, "step": 27 }, { "epoch": 0.059845044082286934, "grad_norm": 4.644921779632568, "learning_rate": 2.978723404255319e-07, "logits/chosen": -0.8839479088783264, "logits/rejected": -0.9320971965789795, "logps/chosen": -0.2668587565422058, "logps/rejected": -0.27507418394088745, "loss": 1.6176, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5337175130844116, "rewards/margins": 0.016430813819169998, "rewards/rejected": -0.5501483678817749, "step": 28 }, { "epoch": 0.061982367085225754, "grad_norm": 3.2341363430023193, "learning_rate": 3.085106382978723e-07, "logits/chosen": -1.0859841108322144, "logits/rejected": -1.0080296993255615, "logps/chosen": -0.2636515498161316, "logps/rejected": -0.2644122838973999, "loss": 1.6185, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5273030996322632, "rewards/margins": 0.0015215259045362473, "rewards/rejected": -0.5288245677947998, "step": 29 }, { "epoch": 0.06411969008816458, "grad_norm": 5.580157279968262, "learning_rate": 3.1914893617021275e-07, "logits/chosen": -1.170966386795044, "logits/rejected": -0.9350689053535461, "logps/chosen": -0.2749802768230438, "logps/rejected": -0.2526704668998718, "loss": 1.6164, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5499605536460876, "rewards/margins": -0.04461963474750519, "rewards/rejected": -0.5053409337997437, "step": 30 }, { "epoch": 0.06625701309110339, "grad_norm": 4.681908130645752, "learning_rate": 3.2978723404255315e-07, "logits/chosen": -1.0664238929748535, "logits/rejected": -0.9249334335327148, "logps/chosen": -0.26851335167884827, "logps/rejected": -0.3246592581272125, "loss": 1.5989, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5370267033576965, "rewards/margins": 0.11229176819324493, "rewards/rejected": -0.649318516254425, "step": 31 }, { "epoch": 0.06839433609404222, "grad_norm": 7.798113822937012, "learning_rate": 3.404255319148936e-07, "logits/chosen": -0.8868415951728821, "logits/rejected": -0.8269252777099609, "logps/chosen": -0.26608309149742126, "logps/rejected": -0.29178884625434875, "loss": 1.6, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5321661829948425, "rewards/margins": 0.051411453634500504, "rewards/rejected": -0.5835776925086975, "step": 32 }, { "epoch": 0.07053165909698103, "grad_norm": 3.8565964698791504, "learning_rate": 3.5106382978723405e-07, "logits/chosen": -1.075560450553894, "logits/rejected": -0.9206546545028687, "logps/chosen": -0.3033750355243683, "logps/rejected": -0.2647935748100281, "loss": 1.6255, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6067500710487366, "rewards/margins": -0.07716288417577744, "rewards/rejected": -0.5295871496200562, "step": 33 }, { "epoch": 0.07266898209991986, "grad_norm": 4.738920211791992, "learning_rate": 3.617021276595745e-07, "logits/chosen": -1.0078967809677124, "logits/rejected": -0.9841946363449097, "logps/chosen": -0.29649823904037476, "logps/rejected": -0.3331226706504822, "loss": 1.6, "rewards/accuracies": 0.625, "rewards/chosen": -0.5929964780807495, "rewards/margins": 0.07324886322021484, "rewards/rejected": -0.6662453413009644, "step": 34 }, { "epoch": 0.07480630510285867, "grad_norm": 15.601210594177246, "learning_rate": 3.7234042553191484e-07, "logits/chosen": -1.0271260738372803, "logits/rejected": -1.0070686340332031, "logps/chosen": -0.2500755488872528, "logps/rejected": -0.2826491892337799, "loss": 1.6131, "rewards/accuracies": 0.625, "rewards/chosen": -0.5001510977745056, "rewards/margins": 0.06514722108840942, "rewards/rejected": -0.5652983784675598, "step": 35 }, { "epoch": 0.07694362810579748, "grad_norm": 7.088011264801025, "learning_rate": 3.829787234042553e-07, "logits/chosen": -0.7224124670028687, "logits/rejected": -0.5971524119377136, "logps/chosen": -0.2726445198059082, "logps/rejected": -0.2940409481525421, "loss": 1.603, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5452890396118164, "rewards/margins": 0.04279275983572006, "rewards/rejected": -0.5880818963050842, "step": 36 }, { "epoch": 0.07908095110873631, "grad_norm": 4.70820426940918, "learning_rate": 3.9361702127659574e-07, "logits/chosen": -0.979728102684021, "logits/rejected": -0.9153163433074951, "logps/chosen": -0.27593153715133667, "logps/rejected": -0.26201528310775757, "loss": 1.6146, "rewards/accuracies": 0.5, "rewards/chosen": -0.5518630743026733, "rewards/margins": -0.02783256769180298, "rewards/rejected": -0.5240305662155151, "step": 37 }, { "epoch": 0.08121827411167512, "grad_norm": 8.7505464553833, "learning_rate": 4.0425531914893614e-07, "logits/chosen": -0.8443434238433838, "logits/rejected": -0.8855568170547485, "logps/chosen": -0.29990217089653015, "logps/rejected": -0.2905019521713257, "loss": 1.6493, "rewards/accuracies": 0.4375, "rewards/chosen": -0.5998043417930603, "rewards/margins": -0.0188005194067955, "rewards/rejected": -0.5810039043426514, "step": 38 }, { "epoch": 0.08335559711461395, "grad_norm": 5.359803676605225, "learning_rate": 4.148936170212766e-07, "logits/chosen": -1.0560702085494995, "logits/rejected": -1.1278265714645386, "logps/chosen": -0.25392618775367737, "logps/rejected": -0.2735791802406311, "loss": 1.5949, "rewards/accuracies": 0.5, "rewards/chosen": -0.5078523755073547, "rewards/margins": 0.039305973798036575, "rewards/rejected": -0.5471583604812622, "step": 39 }, { "epoch": 0.08549292011755276, "grad_norm": 3.1088671684265137, "learning_rate": 4.25531914893617e-07, "logits/chosen": -1.0592100620269775, "logits/rejected": -1.0815989971160889, "logps/chosen": -0.2885398864746094, "logps/rejected": -0.2929195761680603, "loss": 1.6321, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5770797729492188, "rewards/margins": 0.00875941477715969, "rewards/rejected": -0.5858391523361206, "step": 40 }, { "epoch": 0.08763024312049159, "grad_norm": 8.073966026306152, "learning_rate": 4.3617021276595744e-07, "logits/chosen": -1.0096590518951416, "logits/rejected": -0.8713966012001038, "logps/chosen": -0.30629193782806396, "logps/rejected": -0.33664122223854065, "loss": 1.5914, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6125838756561279, "rewards/margins": 0.06069856137037277, "rewards/rejected": -0.6732824444770813, "step": 41 }, { "epoch": 0.0897675661234304, "grad_norm": 5.209786891937256, "learning_rate": 4.4680851063829783e-07, "logits/chosen": -1.0377849340438843, "logits/rejected": -0.8914337754249573, "logps/chosen": -0.2845829427242279, "logps/rejected": -0.3244422674179077, "loss": 1.6121, "rewards/accuracies": 0.5, "rewards/chosen": -0.5691658854484558, "rewards/margins": 0.079718679189682, "rewards/rejected": -0.6488845348358154, "step": 42 }, { "epoch": 0.09190488912636922, "grad_norm": 4.667476177215576, "learning_rate": 4.574468085106383e-07, "logits/chosen": -0.7347361445426941, "logits/rejected": -0.7869642376899719, "logps/chosen": -0.3507947623729706, "logps/rejected": -0.27199897170066833, "loss": 1.6222, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7015895247459412, "rewards/margins": -0.1575915366411209, "rewards/rejected": -0.5439979434013367, "step": 43 }, { "epoch": 0.09404221212930804, "grad_norm": 14.311481475830078, "learning_rate": 4.6808510638297873e-07, "logits/chosen": -0.8943421840667725, "logits/rejected": -0.836614727973938, "logps/chosen": -0.4167774021625519, "logps/rejected": -0.430794894695282, "loss": 1.597, "rewards/accuracies": 0.5, "rewards/chosen": -0.8335548043251038, "rewards/margins": 0.028035037219524384, "rewards/rejected": -0.861589789390564, "step": 44 }, { "epoch": 0.09617953513224686, "grad_norm": 3.08385968208313, "learning_rate": 4.787234042553192e-07, "logits/chosen": -0.9741953015327454, "logits/rejected": -0.8605018258094788, "logps/chosen": -0.2905868887901306, "logps/rejected": -0.29014959931373596, "loss": 1.6179, "rewards/accuracies": 0.5, "rewards/chosen": -0.5811737775802612, "rewards/margins": -0.0008745882660150528, "rewards/rejected": -0.5802991986274719, "step": 45 }, { "epoch": 0.09831685813518568, "grad_norm": 3.141914129257202, "learning_rate": 4.893617021276595e-07, "logits/chosen": -0.8467612266540527, "logits/rejected": -0.8879311084747314, "logps/chosen": -0.2710065543651581, "logps/rejected": -0.28622525930404663, "loss": 1.6098, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5420131087303162, "rewards/margins": 0.03043745458126068, "rewards/rejected": -0.5724505186080933, "step": 46 }, { "epoch": 0.1004541811381245, "grad_norm": 5.874278545379639, "learning_rate": 5e-07, "logits/chosen": -0.9935128688812256, "logits/rejected": -1.0635360479354858, "logps/chosen": -0.2610815465450287, "logps/rejected": -0.2970622777938843, "loss": 1.5882, "rewards/accuracies": 0.5, "rewards/chosen": -0.5221630930900574, "rewards/margins": 0.07196150720119476, "rewards/rejected": -0.5941245555877686, "step": 47 }, { "epoch": 0.10259150414106331, "grad_norm": 3.674631118774414, "learning_rate": 4.999930062653174e-07, "logits/chosen": -0.7607293725013733, "logits/rejected": -0.9387491941452026, "logps/chosen": -0.30105069279670715, "logps/rejected": -0.29622718691825867, "loss": 1.6263, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6021013855934143, "rewards/margins": -0.009646959602832794, "rewards/rejected": -0.5924543738365173, "step": 48 }, { "epoch": 0.10472882714400214, "grad_norm": 3.2993836402893066, "learning_rate": 4.999720254525684e-07, "logits/chosen": -1.041825294494629, "logits/rejected": -0.8979977965354919, "logps/chosen": -0.3147028684616089, "logps/rejected": -0.32463401556015015, "loss": 1.5836, "rewards/accuracies": 0.5, "rewards/chosen": -0.6294057369232178, "rewards/margins": 0.019862275570631027, "rewards/rejected": -0.6492680311203003, "step": 49 }, { "epoch": 0.10686615014694095, "grad_norm": 3.6394598484039307, "learning_rate": 4.999370587356267e-07, "logits/chosen": -1.0319520235061646, "logits/rejected": -0.9399799108505249, "logps/chosen": -0.3198903501033783, "logps/rejected": -0.33650463819503784, "loss": 1.61, "rewards/accuracies": 0.375, "rewards/chosen": -0.6397807002067566, "rewards/margins": 0.03322865813970566, "rewards/rejected": -0.6730092763900757, "step": 50 }, { "epoch": 0.10900347314987978, "grad_norm": 3.5822248458862305, "learning_rate": 4.998881080708758e-07, "logits/chosen": -0.7624353170394897, "logits/rejected": -0.7781803011894226, "logps/chosen": -0.22195202112197876, "logps/rejected": -0.2529197931289673, "loss": 1.6014, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4439040422439575, "rewards/margins": 0.061935484409332275, "rewards/rejected": -0.5058395862579346, "step": 51 }, { "epoch": 0.11114079615281859, "grad_norm": 4.502132415771484, "learning_rate": 4.998251761970996e-07, "logits/chosen": -0.934096097946167, "logits/rejected": -0.9894377589225769, "logps/chosen": -0.3010854721069336, "logps/rejected": -0.2971184551715851, "loss": 1.6238, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6021709442138672, "rewards/margins": -0.00793398916721344, "rewards/rejected": -0.5942369103431702, "step": 52 }, { "epoch": 0.11327811915575742, "grad_norm": 14.495563507080078, "learning_rate": 4.997482666353286e-07, "logits/chosen": -0.9065138101577759, "logits/rejected": -0.8083285093307495, "logps/chosen": -0.2879031002521515, "logps/rejected": -0.30471161007881165, "loss": 1.6036, "rewards/accuracies": 0.625, "rewards/chosen": -0.575806200504303, "rewards/margins": 0.03361699730157852, "rewards/rejected": -0.6094232201576233, "step": 53 }, { "epoch": 0.11541544215869623, "grad_norm": 5.210042953491211, "learning_rate": 4.996573836886434e-07, "logits/chosen": -1.012821912765503, "logits/rejected": -0.935365617275238, "logps/chosen": -0.27059802412986755, "logps/rejected": -0.28305694460868835, "loss": 1.5922, "rewards/accuracies": 0.5, "rewards/chosen": -0.5411960482597351, "rewards/margins": 0.024917850270867348, "rewards/rejected": -0.5661138892173767, "step": 54 }, { "epoch": 0.11755276516163506, "grad_norm": 3.4929800033569336, "learning_rate": 4.995525324419337e-07, "logits/chosen": -1.03290593624115, "logits/rejected": -0.8397963047027588, "logps/chosen": -0.23197168111801147, "logps/rejected": -0.257206529378891, "loss": 1.6012, "rewards/accuracies": 0.75, "rewards/chosen": -0.46394336223602295, "rewards/margins": 0.05046967417001724, "rewards/rejected": -0.514413058757782, "step": 55 }, { "epoch": 0.11969008816457387, "grad_norm": 6.639918804168701, "learning_rate": 4.99433718761614e-07, "logits/chosen": -0.8676168918609619, "logits/rejected": -0.8751212954521179, "logps/chosen": -0.2813611626625061, "logps/rejected": -0.28943243622779846, "loss": 1.602, "rewards/accuracies": 0.5, "rewards/chosen": -0.5627223253250122, "rewards/margins": 0.01614254154264927, "rewards/rejected": -0.5788648724555969, "step": 56 }, { "epoch": 0.1218274111675127, "grad_norm": 3.159461736679077, "learning_rate": 4.993009492952949e-07, "logits/chosen": -0.9598115682601929, "logits/rejected": -0.9728808999061584, "logps/chosen": -0.2418256551027298, "logps/rejected": -0.27858078479766846, "loss": 1.6025, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4836513102054596, "rewards/margins": 0.07351024448871613, "rewards/rejected": -0.5571615695953369, "step": 57 }, { "epoch": 0.12396473417045151, "grad_norm": 3.1260619163513184, "learning_rate": 4.991542314714122e-07, "logits/chosen": -1.1715333461761475, "logits/rejected": -1.0372506380081177, "logps/chosen": -0.2886565625667572, "logps/rejected": -0.3048909306526184, "loss": 1.6142, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5773131251335144, "rewards/margins": 0.03246863931417465, "rewards/rejected": -0.6097818613052368, "step": 58 }, { "epoch": 0.12610205717339032, "grad_norm": 4.803882598876953, "learning_rate": 4.989935734988097e-07, "logits/chosen": -0.8652929663658142, "logits/rejected": -0.9138813018798828, "logps/chosen": -0.22791269421577454, "logps/rejected": -0.2620168924331665, "loss": 1.5999, "rewards/accuracies": 0.6875, "rewards/chosen": -0.4558253884315491, "rewards/margins": 0.06820837408304214, "rewards/rejected": -0.524033784866333, "step": 59 }, { "epoch": 0.12823938017632916, "grad_norm": 2.9545466899871826, "learning_rate": 4.988189843662815e-07, "logits/chosen": -0.9540647864341736, "logits/rejected": -0.9105108380317688, "logps/chosen": -0.28050848841667175, "logps/rejected": -0.2682150602340698, "loss": 1.6229, "rewards/accuracies": 0.5, "rewards/chosen": -0.5610169768333435, "rewards/margins": -0.024586813524365425, "rewards/rejected": -0.5364301204681396, "step": 60 }, { "epoch": 0.13037670317926797, "grad_norm": 5.4623260498046875, "learning_rate": 4.986304738420683e-07, "logits/chosen": -0.8594868779182434, "logits/rejected": -0.8749207854270935, "logps/chosen": -0.23750001192092896, "logps/rejected": -0.24768495559692383, "loss": 1.5863, "rewards/accuracies": 0.5625, "rewards/chosen": -0.4750000238418579, "rewards/margins": 0.020369907841086388, "rewards/rejected": -0.49536991119384766, "step": 61 }, { "epoch": 0.13251402618220678, "grad_norm": 5.195383548736572, "learning_rate": 4.984280524733107e-07, "logits/chosen": -0.8988451361656189, "logits/rejected": -1.0471916198730469, "logps/chosen": -0.2563616931438446, "logps/rejected": -0.264529824256897, "loss": 1.628, "rewards/accuracies": 0.5625, "rewards/chosen": -0.5127233862876892, "rewards/margins": 0.01633620262145996, "rewards/rejected": -0.529059648513794, "step": 62 }, { "epoch": 0.1346513491851456, "grad_norm": 3.365633249282837, "learning_rate": 4.982117315854593e-07, "logits/chosen": -0.9563354253768921, "logits/rejected": -1.143921971321106, "logps/chosen": -0.27564752101898193, "logps/rejected": -0.2891802191734314, "loss": 1.6243, "rewards/accuracies": 0.5, "rewards/chosen": -0.5512950420379639, "rewards/margins": 0.02706541307270527, "rewards/rejected": -0.5783604383468628, "step": 63 }, { "epoch": 0.13678867218808444, "grad_norm": 3.6298470497131348, "learning_rate": 4.979815232816416e-07, "logits/chosen": -0.9835873246192932, "logits/rejected": -0.8579452037811279, "logps/chosen": -0.2935434579849243, "logps/rejected": -0.26197710633277893, "loss": 1.6428, "rewards/accuracies": 0.3125, "rewards/chosen": -0.5870869159698486, "rewards/margins": -0.06313266605138779, "rewards/rejected": -0.5239542126655579, "step": 64 }, { "epoch": 0.13892599519102325, "grad_norm": 5.261904239654541, "learning_rate": 4.977374404419837e-07, "logits/chosen": -1.0193111896514893, "logits/rejected": -1.036008358001709, "logps/chosen": -0.27654433250427246, "logps/rejected": -0.25757479667663574, "loss": 1.5985, "rewards/accuracies": 0.25, "rewards/chosen": -0.6913608908653259, "rewards/margins": -0.047423895448446274, "rewards/rejected": -0.6439369320869446, "step": 65 }, { "epoch": 0.14106331819396206, "grad_norm": 3.326939582824707, "learning_rate": 4.974794967228907e-07, "logits/chosen": -1.0054104328155518, "logits/rejected": -0.9754442572593689, "logps/chosen": -0.2905897796154022, "logps/rejected": -0.32264938950538635, "loss": 1.6248, "rewards/accuracies": 0.5, "rewards/chosen": -0.7264744639396667, "rewards/margins": 0.08014895021915436, "rewards/rejected": -0.8066234588623047, "step": 66 }, { "epoch": 0.14320064119690087, "grad_norm": 5.669600486755371, "learning_rate": 4.972077065562821e-07, "logits/chosen": -0.9552958607673645, "logits/rejected": -1.0761511325836182, "logps/chosen": -0.3276459574699402, "logps/rejected": -0.32107335329055786, "loss": 1.6203, "rewards/accuracies": 0.5, "rewards/chosen": -0.8191148638725281, "rewards/margins": -0.01643138751387596, "rewards/rejected": -0.8026834726333618, "step": 67 }, { "epoch": 0.14533796419983971, "grad_norm": 3.257904052734375, "learning_rate": 4.969220851487844e-07, "logits/chosen": -0.9927914142608643, "logits/rejected": -0.9472739696502686, "logps/chosen": -0.3458186686038971, "logps/rejected": -0.34241756796836853, "loss": 1.6191, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8645466566085815, "rewards/margins": -0.008502773940563202, "rewards/rejected": -0.8560439348220825, "step": 68 }, { "epoch": 0.14747528720277853, "grad_norm": 5.560789585113525, "learning_rate": 4.966226484808803e-07, "logits/chosen": -0.9344061613082886, "logits/rejected": -0.8273663520812988, "logps/chosen": -0.2849215567111969, "logps/rejected": -0.31608855724334717, "loss": 1.6123, "rewards/accuracies": 0.375, "rewards/chosen": -0.7123039960861206, "rewards/margins": 0.0779174268245697, "rewards/rejected": -0.7902213335037231, "step": 69 }, { "epoch": 0.14961261020571734, "grad_norm": 3.70934796333313, "learning_rate": 4.963094133060148e-07, "logits/chosen": -0.9611161947250366, "logits/rejected": -0.8749902248382568, "logps/chosen": -0.2869144380092621, "logps/rejected": -0.23931002616882324, "loss": 1.6348, "rewards/accuracies": 0.25, "rewards/chosen": -0.7172860503196716, "rewards/margins": -0.11901099979877472, "rewards/rejected": -0.5982750654220581, "step": 70 }, { "epoch": 0.15174993320865615, "grad_norm": 3.771442413330078, "learning_rate": 4.959823971496574e-07, "logits/chosen": -1.0483484268188477, "logits/rejected": -0.9827014803886414, "logps/chosen": -0.3061015009880066, "logps/rejected": -0.3094024658203125, "loss": 1.5879, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7652537822723389, "rewards/margins": 0.00825244840234518, "rewards/rejected": -0.7735061645507812, "step": 71 }, { "epoch": 0.15388725621159496, "grad_norm": 3.6872432231903076, "learning_rate": 4.956416183083221e-07, "logits/chosen": -1.0115149021148682, "logits/rejected": -1.0020099878311157, "logps/chosen": -0.26311925053596497, "logps/rejected": -0.27171316742897034, "loss": 1.5697, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6577981114387512, "rewards/margins": 0.02148478478193283, "rewards/rejected": -0.6792829036712646, "step": 72 }, { "epoch": 0.1560245792145338, "grad_norm": 7.885510444641113, "learning_rate": 4.952870958485431e-07, "logits/chosen": -0.7439613938331604, "logits/rejected": -0.7543243169784546, "logps/chosen": -0.32277047634124756, "logps/rejected": -0.44049495458602905, "loss": 1.5719, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8069261312484741, "rewards/margins": 0.2943112254142761, "rewards/rejected": -1.1012372970581055, "step": 73 }, { "epoch": 0.15816190221747262, "grad_norm": 14.005949020385742, "learning_rate": 4.949188496058089e-07, "logits/chosen": -0.8661502599716187, "logits/rejected": -0.9138545989990234, "logps/chosen": -0.27060988545417786, "logps/rejected": -0.25004029273986816, "loss": 1.6381, "rewards/accuracies": 0.5, "rewards/chosen": -0.6765246987342834, "rewards/margins": -0.05142403393983841, "rewards/rejected": -0.6251006722450256, "step": 74 }, { "epoch": 0.16029922522041143, "grad_norm": 7.343827247619629, "learning_rate": 4.945369001834514e-07, "logits/chosen": -1.07318115234375, "logits/rejected": -1.0178194046020508, "logps/chosen": -0.2654929459095001, "logps/rejected": -0.29686206579208374, "loss": 1.5458, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6637323498725891, "rewards/margins": 0.07842274755239487, "rewards/rejected": -0.7421550750732422, "step": 75 }, { "epoch": 0.16243654822335024, "grad_norm": 4.01411247253418, "learning_rate": 4.941412689514941e-07, "logits/chosen": -1.162184238433838, "logits/rejected": -1.2236565351486206, "logps/chosen": -0.2647251486778259, "logps/rejected": -0.2977098226547241, "loss": 1.6206, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6618129014968872, "rewards/margins": 0.08246167004108429, "rewards/rejected": -0.7442746162414551, "step": 76 }, { "epoch": 0.16457387122628908, "grad_norm": 4.315869331359863, "learning_rate": 4.937319780454559e-07, "logits/chosen": -0.8569203019142151, "logits/rejected": -0.7959333062171936, "logps/chosen": -0.29102951288223267, "logps/rejected": -0.31862419843673706, "loss": 1.5992, "rewards/accuracies": 0.5625, "rewards/chosen": -0.727573812007904, "rewards/margins": 0.06898671388626099, "rewards/rejected": -0.7965604662895203, "step": 77 }, { "epoch": 0.1667111942292279, "grad_norm": 6.3516645431518555, "learning_rate": 4.933090503651128e-07, "logits/chosen": -0.9815778136253357, "logits/rejected": -0.9455960988998413, "logps/chosen": -0.290622353553772, "logps/rejected": -0.25920000672340393, "loss": 1.6019, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7265558838844299, "rewards/margins": -0.07855589687824249, "rewards/rejected": -0.6479999423027039, "step": 78 }, { "epoch": 0.1688485172321667, "grad_norm": 5.576763153076172, "learning_rate": 4.928725095732168e-07, "logits/chosen": -0.7572908401489258, "logits/rejected": -0.8643375039100647, "logps/chosen": -0.28876882791519165, "logps/rejected": -0.38018882274627686, "loss": 1.574, "rewards/accuracies": 0.5, "rewards/chosen": -0.721921980381012, "rewards/margins": 0.22855007648468018, "rewards/rejected": -0.9504721164703369, "step": 79 }, { "epoch": 0.17098584023510552, "grad_norm": 6.904773235321045, "learning_rate": 4.924223800941717e-07, "logits/chosen": -1.1600089073181152, "logits/rejected": -1.001929759979248, "logps/chosen": -0.3185364007949829, "logps/rejected": -0.2833505868911743, "loss": 1.5885, "rewards/accuracies": 0.5, "rewards/chosen": -0.796341061592102, "rewards/margins": -0.08796463906764984, "rewards/rejected": -0.708376407623291, "step": 80 }, { "epoch": 0.17312316323804436, "grad_norm": 5.542295932769775, "learning_rate": 4.919586871126667e-07, "logits/chosen": -1.1290327310562134, "logits/rejected": -1.0776805877685547, "logps/chosen": -0.28904592990875244, "logps/rejected": -0.32642093300819397, "loss": 1.5823, "rewards/accuracies": 0.5, "rewards/chosen": -0.7226147651672363, "rewards/margins": 0.09343745559453964, "rewards/rejected": -0.816052258014679, "step": 81 }, { "epoch": 0.17526048624098317, "grad_norm": 7.346787929534912, "learning_rate": 4.91481456572267e-07, "logits/chosen": -1.008028507232666, "logits/rejected": -0.7614388465881348, "logps/chosen": -0.276422917842865, "logps/rejected": -0.28925520181655884, "loss": 1.5471, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6910573244094849, "rewards/margins": 0.03208072483539581, "rewards/rejected": -0.7231380343437195, "step": 82 }, { "epoch": 0.17739780924392198, "grad_norm": 3.4823226928710938, "learning_rate": 4.909907151739633e-07, "logits/chosen": -0.8054043650627136, "logits/rejected": -0.8212348222732544, "logps/chosen": -0.25493186712265015, "logps/rejected": -0.2324959635734558, "loss": 1.6075, "rewards/accuracies": 0.375, "rewards/chosen": -0.637329638004303, "rewards/margins": -0.05608966201543808, "rewards/rejected": -0.5812399983406067, "step": 83 }, { "epoch": 0.1795351322468608, "grad_norm": 8.512069702148438, "learning_rate": 4.904864903746765e-07, "logits/chosen": -0.8016963601112366, "logits/rejected": -0.8472069501876831, "logps/chosen": -0.3017991781234741, "logps/rejected": -0.30026912689208984, "loss": 1.6386, "rewards/accuracies": 0.5, "rewards/chosen": -0.7544978857040405, "rewards/margins": -0.003825142979621887, "rewards/rejected": -0.7506727576255798, "step": 84 }, { "epoch": 0.18167245524979964, "grad_norm": 3.27510666847229, "learning_rate": 4.899688103857222e-07, "logits/chosen": -0.9057269096374512, "logits/rejected": -0.8979475498199463, "logps/chosen": -0.24959440529346466, "logps/rejected": -0.3118637502193451, "loss": 1.5594, "rewards/accuracies": 0.6875, "rewards/chosen": -0.623986005783081, "rewards/margins": 0.15567341446876526, "rewards/rejected": -0.7796593904495239, "step": 85 }, { "epoch": 0.18380977825273845, "grad_norm": 3.2452337741851807, "learning_rate": 4.894377041712326e-07, "logits/chosen": -0.6997116208076477, "logits/rejected": -0.6495150327682495, "logps/chosen": -0.2519880533218384, "logps/rejected": -0.30695033073425293, "loss": 1.585, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6299701929092407, "rewards/margins": 0.137405663728714, "rewards/rejected": -0.7673758268356323, "step": 86 }, { "epoch": 0.18594710125567726, "grad_norm": 6.7908430099487305, "learning_rate": 4.888932014465352e-07, "logits/chosen": -0.8975124359130859, "logits/rejected": -0.8113777041435242, "logps/chosen": -0.2837047576904297, "logps/rejected": -0.2963961958885193, "loss": 1.5927, "rewards/accuracies": 0.5, "rewards/chosen": -0.709261953830719, "rewards/margins": 0.03172856196761131, "rewards/rejected": -0.7409905195236206, "step": 87 }, { "epoch": 0.18808442425861607, "grad_norm": 4.965595722198486, "learning_rate": 4.883353326764906e-07, "logits/chosen": -0.8913217186927795, "logits/rejected": -0.8421756625175476, "logps/chosen": -0.25936800241470337, "logps/rejected": -0.45224201679229736, "loss": 1.5572, "rewards/accuracies": 0.6875, "rewards/chosen": -0.648419976234436, "rewards/margins": 0.4821849763393402, "rewards/rejected": -1.1306049823760986, "step": 88 }, { "epoch": 0.1902217472615549, "grad_norm": 5.781017780303955, "learning_rate": 4.877641290737883e-07, "logits/chosen": -0.9931791424751282, "logits/rejected": -0.9962902665138245, "logps/chosen": -0.2539224624633789, "logps/rejected": -0.2921288311481476, "loss": 1.6042, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6348061561584473, "rewards/margins": 0.0955159068107605, "rewards/rejected": -0.730322003364563, "step": 89 }, { "epoch": 0.19235907026449373, "grad_norm": 5.002528190612793, "learning_rate": 4.871796225971999e-07, "logits/chosen": -0.9850423336029053, "logits/rejected": -0.857207179069519, "logps/chosen": -0.27758607268333435, "logps/rejected": -0.3058236241340637, "loss": 1.5967, "rewards/accuracies": 0.625, "rewards/chosen": -0.6939651966094971, "rewards/margins": 0.0705939531326294, "rewards/rejected": -0.7645590901374817, "step": 90 }, { "epoch": 0.19449639326743254, "grad_norm": 5.870463848114014, "learning_rate": 4.86581845949791e-07, "logits/chosen": -0.949233889579773, "logits/rejected": -1.0075958967208862, "logps/chosen": -0.2556777596473694, "logps/rejected": -0.2905680537223816, "loss": 1.5559, "rewards/accuracies": 0.8125, "rewards/chosen": -0.6391944289207458, "rewards/margins": 0.08722572773694992, "rewards/rejected": -0.7264201045036316, "step": 91 }, { "epoch": 0.19663371627037135, "grad_norm": 4.762439727783203, "learning_rate": 4.859708325770919e-07, "logits/chosen": -1.1257095336914062, "logits/rejected": -1.173663854598999, "logps/chosen": -0.28581249713897705, "logps/rejected": -0.3704802989959717, "loss": 1.5915, "rewards/accuracies": 0.75, "rewards/chosen": -0.7145313024520874, "rewards/margins": 0.21166956424713135, "rewards/rejected": -0.926200807094574, "step": 92 }, { "epoch": 0.1987710392733102, "grad_norm": 5.9837117195129395, "learning_rate": 4.853466166652258e-07, "logits/chosen": -0.9948515295982361, "logits/rejected": -0.9665160179138184, "logps/chosen": -0.2551361620426178, "logps/rejected": -0.28811219334602356, "loss": 1.5882, "rewards/accuracies": 0.5, "rewards/chosen": -0.6378403902053833, "rewards/margins": 0.08244016021490097, "rewards/rejected": -0.7202805280685425, "step": 93 }, { "epoch": 0.200908362276249, "grad_norm": 4.58953857421875, "learning_rate": 4.847092331389964e-07, "logits/chosen": -0.7557870149612427, "logits/rejected": -0.7804038524627686, "logps/chosen": -0.26233580708503723, "logps/rejected": -0.28490206599235535, "loss": 1.6059, "rewards/accuracies": 0.625, "rewards/chosen": -0.6558394432067871, "rewards/margins": 0.056415725499391556, "rewards/rejected": -0.7122551798820496, "step": 94 }, { "epoch": 0.20304568527918782, "grad_norm": 4.404232978820801, "learning_rate": 4.840587176599343e-07, "logits/chosen": -1.1708656549453735, "logits/rejected": -1.1824274063110352, "logps/chosen": -0.3498944640159607, "logps/rejected": -0.3052523732185364, "loss": 1.5518, "rewards/accuracies": 0.375, "rewards/chosen": -0.8747361898422241, "rewards/margins": -0.1116051897406578, "rewards/rejected": -0.7631310224533081, "step": 95 }, { "epoch": 0.20518300828212663, "grad_norm": 2.755133628845215, "learning_rate": 4.833951066243004e-07, "logits/chosen": -0.9821409583091736, "logits/rejected": -0.9246101975440979, "logps/chosen": -0.29376041889190674, "logps/rejected": -0.2656431794166565, "loss": 1.6092, "rewards/accuracies": 0.375, "rewards/chosen": -0.7344010472297668, "rewards/margins": -0.07029299437999725, "rewards/rejected": -0.6641080379486084, "step": 96 }, { "epoch": 0.20732033128506547, "grad_norm": 8.961865425109863, "learning_rate": 4.82718437161051e-07, "logits/chosen": -0.9781126976013184, "logits/rejected": -1.0274431705474854, "logps/chosen": -0.2688814699649811, "logps/rejected": -0.25695058703422546, "loss": 1.6414, "rewards/accuracies": 0.5, "rewards/chosen": -0.6722037196159363, "rewards/margins": -0.029827285557985306, "rewards/rejected": -0.6423764228820801, "step": 97 }, { "epoch": 0.20945765428800428, "grad_norm": 3.496291160583496, "learning_rate": 4.820287471297597e-07, "logits/chosen": -1.110063076019287, "logits/rejected": -0.9798667430877686, "logps/chosen": -0.2772579789161682, "logps/rejected": -0.284493625164032, "loss": 1.6027, "rewards/accuracies": 0.5, "rewards/chosen": -0.6931449174880981, "rewards/margins": 0.01808912120759487, "rewards/rejected": -0.7112340927124023, "step": 98 }, { "epoch": 0.2115949772909431, "grad_norm": 9.36062240600586, "learning_rate": 4.813260751184992e-07, "logits/chosen": -1.0408313274383545, "logits/rejected": -0.9097151160240173, "logps/chosen": -0.2336195558309555, "logps/rejected": -0.28545060753822327, "loss": 1.5888, "rewards/accuracies": 0.8125, "rewards/chosen": -0.5840489268302917, "rewards/margins": 0.12957759201526642, "rewards/rejected": -0.7136265635490417, "step": 99 }, { "epoch": 0.2137323002938819, "grad_norm": 4.372857570648193, "learning_rate": 4.806104604416823e-07, "logits/chosen": -1.1981866359710693, "logits/rejected": -1.1812773942947388, "logps/chosen": -0.40545234084129333, "logps/rejected": -0.32747963070869446, "loss": 1.6366, "rewards/accuracies": 0.5, "rewards/chosen": -1.0136308670043945, "rewards/margins": -0.194931760430336, "rewards/rejected": -0.8186991214752197, "step": 100 }, { "epoch": 0.21586962329682075, "grad_norm": 6.457290172576904, "learning_rate": 4.798819431378626e-07, "logits/chosen": -0.9583615064620972, "logits/rejected": -0.9292630553245544, "logps/chosen": -0.2667827904224396, "logps/rejected": -0.3141520917415619, "loss": 1.5717, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6669570207595825, "rewards/margins": 0.11842326819896698, "rewards/rejected": -0.7853802442550659, "step": 101 }, { "epoch": 0.21800694629975956, "grad_norm": 4.259753704071045, "learning_rate": 4.79140563967494e-07, "logits/chosen": -0.9554131031036377, "logits/rejected": -0.9235316514968872, "logps/chosen": -0.2790910005569458, "logps/rejected": -0.29358065128326416, "loss": 1.5956, "rewards/accuracies": 0.5, "rewards/chosen": -0.6977274417877197, "rewards/margins": 0.03622422739863396, "rewards/rejected": -0.7339516878128052, "step": 102 }, { "epoch": 0.22014426930269837, "grad_norm": 7.2334675788879395, "learning_rate": 4.783863644106502e-07, "logits/chosen": -0.958928108215332, "logits/rejected": -0.9119776487350464, "logps/chosen": -0.2611943483352661, "logps/rejected": -0.29755640029907227, "loss": 1.5837, "rewards/accuracies": 0.75, "rewards/chosen": -0.6529859304428101, "rewards/margins": 0.09090512990951538, "rewards/rejected": -0.7438910007476807, "step": 103 }, { "epoch": 0.22228159230563718, "grad_norm": 3.2508132457733154, "learning_rate": 4.776193866647039e-07, "logits/chosen": -1.073838710784912, "logits/rejected": -0.9083616733551025, "logps/chosen": -0.2882213294506073, "logps/rejected": -0.2734883427619934, "loss": 1.606, "rewards/accuracies": 0.5, "rewards/chosen": -0.7205533385276794, "rewards/margins": -0.03683248162269592, "rewards/rejected": -0.6837208867073059, "step": 104 }, { "epoch": 0.224418915308576, "grad_norm": 5.259939193725586, "learning_rate": 4.768396736419662e-07, "logits/chosen": -0.9633040428161621, "logits/rejected": -0.9958257675170898, "logps/chosen": -0.2812567949295044, "logps/rejected": -0.3445313572883606, "loss": 1.6165, "rewards/accuracies": 0.8125, "rewards/chosen": -0.703141987323761, "rewards/margins": 0.1581864058971405, "rewards/rejected": -0.8613283634185791, "step": 105 }, { "epoch": 0.22655623831151483, "grad_norm": 4.128396511077881, "learning_rate": 4.7604726896728496e-07, "logits/chosen": -0.898779571056366, "logits/rejected": -0.8008460998535156, "logps/chosen": -0.3449317216873169, "logps/rejected": -0.3174844980239868, "loss": 1.581, "rewards/accuracies": 0.4375, "rewards/chosen": -0.862329363822937, "rewards/margins": -0.06861816346645355, "rewards/rejected": -0.793711245059967, "step": 106 }, { "epoch": 0.22869356131445365, "grad_norm": 7.073979377746582, "learning_rate": 4.752422169756047e-07, "logits/chosen": -0.7736971378326416, "logits/rejected": -0.7141239643096924, "logps/chosen": -0.27883169054985046, "logps/rejected": -0.28934141993522644, "loss": 1.6038, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6970791816711426, "rewards/margins": 0.02627432905137539, "rewards/rejected": -0.7233536243438721, "step": 107 }, { "epoch": 0.23083088431739246, "grad_norm": 3.607881784439087, "learning_rate": 4.744245627094858e-07, "logits/chosen": -0.7748329043388367, "logits/rejected": -0.7313745021820068, "logps/chosen": -0.3115028142929077, "logps/rejected": -0.3770483732223511, "loss": 1.6484, "rewards/accuracies": 0.75, "rewards/chosen": -0.7787570357322693, "rewards/margins": 0.1638639271259308, "rewards/rejected": -0.9426208734512329, "step": 108 }, { "epoch": 0.23296820732033127, "grad_norm": 6.025390148162842, "learning_rate": 4.735943519165842e-07, "logits/chosen": -0.8779905438423157, "logits/rejected": -0.9295673966407776, "logps/chosen": -0.2856750786304474, "logps/rejected": -0.3149481415748596, "loss": 1.6163, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7141878008842468, "rewards/margins": 0.07318252325057983, "rewards/rejected": -0.7873702645301819, "step": 109 }, { "epoch": 0.2351055303232701, "grad_norm": 10.654799461364746, "learning_rate": 4.7275163104709194e-07, "logits/chosen": -1.139617681503296, "logits/rejected": -1.037335753440857, "logps/chosen": -0.3125270903110504, "logps/rejected": -0.42321839928627014, "loss": 1.5969, "rewards/accuracies": 0.625, "rewards/chosen": -0.7813177704811096, "rewards/margins": 0.27672821283340454, "rewards/rejected": -1.0580458641052246, "step": 110 }, { "epoch": 0.23724285332620892, "grad_norm": 3.1225905418395996, "learning_rate": 4.718964472511385e-07, "logits/chosen": -0.7755342125892639, "logits/rejected": -0.9119763374328613, "logps/chosen": -0.26263684034347534, "logps/rejected": -0.2584255635738373, "loss": 1.6006, "rewards/accuracies": 0.5, "rewards/chosen": -0.6565921306610107, "rewards/margins": -0.010528111830353737, "rewards/rejected": -0.6460639834403992, "step": 111 }, { "epoch": 0.23938017632914774, "grad_norm": 3.907759189605713, "learning_rate": 4.710288483761524e-07, "logits/chosen": -0.805738091468811, "logits/rejected": -0.8327180743217468, "logps/chosen": -0.26873674988746643, "logps/rejected": -0.27950698137283325, "loss": 1.5569, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6718418598175049, "rewards/margins": 0.02692551538348198, "rewards/rejected": -0.6987674236297607, "step": 112 }, { "epoch": 0.24151749933208655, "grad_norm": 5.125892162322998, "learning_rate": 4.7014888296418447e-07, "logits/chosen": -0.8660019040107727, "logits/rejected": -0.7626081109046936, "logps/chosen": -0.27202802896499634, "logps/rejected": -0.3179852068424225, "loss": 1.5133, "rewards/accuracies": 0.4375, "rewards/chosen": -0.680070161819458, "rewards/margins": 0.11489284038543701, "rewards/rejected": -0.7949629426002502, "step": 113 }, { "epoch": 0.2436548223350254, "grad_norm": 3.331281900405884, "learning_rate": 4.692566002491916e-07, "logits/chosen": -0.9860325455665588, "logits/rejected": -1.0227984189987183, "logps/chosen": -0.277464359998703, "logps/rejected": -0.3393504023551941, "loss": 1.5764, "rewards/accuracies": 0.5, "rewards/chosen": -0.6936609745025635, "rewards/margins": 0.15471497178077698, "rewards/rejected": -0.8483759164810181, "step": 114 }, { "epoch": 0.2457921453379642, "grad_norm": 4.2767205238342285, "learning_rate": 4.683520501542824e-07, "logits/chosen": -1.1069515943527222, "logits/rejected": -0.9956479668617249, "logps/chosen": -0.26621949672698975, "logps/rejected": -0.2311069816350937, "loss": 1.6116, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6655487418174744, "rewards/margins": -0.08778128772974014, "rewards/rejected": -0.5777674317359924, "step": 115 }, { "epoch": 0.24792946834090301, "grad_norm": 4.450298309326172, "learning_rate": 4.6743528328892384e-07, "logits/chosen": -1.089507818222046, "logits/rejected": -1.0226225852966309, "logps/chosen": -0.31000208854675293, "logps/rejected": -0.3048384189605713, "loss": 1.5601, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7750052213668823, "rewards/margins": -0.012909159064292908, "rewards/rejected": -0.7620960474014282, "step": 116 }, { "epoch": 0.25006679134384185, "grad_norm": 8.30679988861084, "learning_rate": 4.6650635094610966e-07, "logits/chosen": -1.019626498222351, "logits/rejected": -1.0024278163909912, "logps/chosen": -0.27769044041633606, "logps/rejected": -0.31382811069488525, "loss": 1.603, "rewards/accuracies": 0.625, "rewards/chosen": -0.6942261457443237, "rewards/margins": 0.09034418314695358, "rewards/rejected": -0.7845702767372131, "step": 117 }, { "epoch": 0.25220411434678064, "grad_norm": 2.7810580730438232, "learning_rate": 4.655653050994906e-07, "logits/chosen": -0.8939322829246521, "logits/rejected": -0.9443778991699219, "logps/chosen": -0.3001169264316559, "logps/rejected": -0.27608948945999146, "loss": 1.6033, "rewards/accuracies": 0.5, "rewards/chosen": -0.7502923011779785, "rewards/margins": -0.06006866693496704, "rewards/rejected": -0.6902236938476562, "step": 118 }, { "epoch": 0.2543414373497195, "grad_norm": 7.52852201461792, "learning_rate": 4.646121984004665e-07, "logits/chosen": -1.0176833868026733, "logits/rejected": -0.9106737971305847, "logps/chosen": -0.2855750620365143, "logps/rejected": -0.2689260244369507, "loss": 1.6369, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7139376401901245, "rewards/margins": -0.041622575372457504, "rewards/rejected": -0.6723150610923767, "step": 119 }, { "epoch": 0.2564787603526583, "grad_norm": 5.235323429107666, "learning_rate": 4.636470841752404e-07, "logits/chosen": -0.894492506980896, "logits/rejected": -0.8580023050308228, "logps/chosen": -0.2390415519475937, "logps/rejected": -0.3226756751537323, "loss": 1.5698, "rewards/accuracies": 0.6875, "rewards/chosen": -0.5976038575172424, "rewards/margins": 0.20908531546592712, "rewards/rejected": -0.8066891431808472, "step": 120 }, { "epoch": 0.2586160833555971, "grad_norm": 7.31561803817749, "learning_rate": 4.626700164218349e-07, "logits/chosen": -1.1262331008911133, "logits/rejected": -1.1069376468658447, "logps/chosen": -0.32872867584228516, "logps/rejected": -0.4012628495693207, "loss": 1.5619, "rewards/accuracies": 0.75, "rewards/chosen": -0.8218216896057129, "rewards/margins": 0.18133553862571716, "rewards/rejected": -1.003157138824463, "step": 121 }, { "epoch": 0.26075340635853594, "grad_norm": 4.992301940917969, "learning_rate": 4.6168104980707103e-07, "logits/chosen": -0.947390079498291, "logits/rejected": -0.9287791848182678, "logps/chosen": -0.3660104274749756, "logps/rejected": -0.34243422746658325, "loss": 1.6722, "rewards/accuracies": 0.3125, "rewards/chosen": -0.9150261282920837, "rewards/margins": -0.058940548449754715, "rewards/rejected": -0.8560855388641357, "step": 122 }, { "epoch": 0.26289072936147473, "grad_norm": 9.114599227905273, "learning_rate": 4.606802396635098e-07, "logits/chosen": -1.0491164922714233, "logits/rejected": -1.0415245294570923, "logps/chosen": -0.2906876802444458, "logps/rejected": -0.2940623164176941, "loss": 1.6062, "rewards/accuracies": 0.5, "rewards/chosen": -0.7267192006111145, "rewards/margins": 0.008436577394604683, "rewards/rejected": -0.7351557612419128, "step": 123 }, { "epoch": 0.26502805236441357, "grad_norm": 13.20346736907959, "learning_rate": 4.59667641986356e-07, "logits/chosen": -0.9480360746383667, "logits/rejected": -0.9648789167404175, "logps/chosen": -0.30809280276298523, "logps/rejected": -0.39009833335876465, "loss": 1.5673, "rewards/accuracies": 0.6875, "rewards/chosen": -0.770232081413269, "rewards/margins": 0.20501384139060974, "rewards/rejected": -0.9752458930015564, "step": 124 }, { "epoch": 0.2671653753673524, "grad_norm": 10.775737762451172, "learning_rate": 4.5864331343032565e-07, "logits/chosen": -0.9860743880271912, "logits/rejected": -0.9669252634048462, "logps/chosen": -0.4254220724105835, "logps/rejected": -0.42529717087745667, "loss": 1.6036, "rewards/accuracies": 0.625, "rewards/chosen": -1.063555121421814, "rewards/margins": -0.0003122463822364807, "rewards/rejected": -1.0632429122924805, "step": 125 }, { "epoch": 0.2693026983702912, "grad_norm": 3.9531850814819336, "learning_rate": 4.576073113064759e-07, "logits/chosen": -0.9061692953109741, "logits/rejected": -1.030226707458496, "logps/chosen": -0.2965227961540222, "logps/rejected": -0.3571414351463318, "loss": 1.5663, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7413069605827332, "rewards/margins": 0.15154659748077393, "rewards/rejected": -0.8928536176681519, "step": 126 }, { "epoch": 0.27144002137323003, "grad_norm": 10.59150218963623, "learning_rate": 4.565596935789987e-07, "logits/chosen": -1.0731703042984009, "logits/rejected": -1.0575220584869385, "logps/chosen": -0.3338828682899475, "logps/rejected": -0.36648380756378174, "loss": 1.5818, "rewards/accuracies": 0.375, "rewards/chosen": -0.8347071409225464, "rewards/margins": 0.08150236308574677, "rewards/rejected": -0.9162094593048096, "step": 127 }, { "epoch": 0.2735773443761689, "grad_norm": 10.513736724853516, "learning_rate": 4.555005188619775e-07, "logits/chosen": -0.8747404217720032, "logits/rejected": -0.8733081817626953, "logps/chosen": -0.24837706983089447, "logps/rejected": -0.297157883644104, "loss": 1.5767, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6209426522254944, "rewards/margins": 0.12195204943418503, "rewards/rejected": -0.74289470911026, "step": 128 }, { "epoch": 0.27571466737910766, "grad_norm": 5.887775421142578, "learning_rate": 4.5442984641610784e-07, "logits/chosen": -1.126139760017395, "logits/rejected": -1.0465540885925293, "logps/chosen": -0.2965885102748871, "logps/rejected": -0.2864833474159241, "loss": 1.6037, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7414712905883789, "rewards/margins": -0.025262875482439995, "rewards/rejected": -0.7162083387374878, "step": 129 }, { "epoch": 0.2778519903820465, "grad_norm": 4.381558418273926, "learning_rate": 4.533477361453819e-07, "logits/chosen": -1.0439306497573853, "logits/rejected": -1.1324841976165771, "logps/chosen": -0.3036992847919464, "logps/rejected": -0.3638463020324707, "loss": 1.5841, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7592483162879944, "rewards/margins": 0.15036745369434357, "rewards/rejected": -0.9096157550811768, "step": 130 }, { "epoch": 0.2799893133849853, "grad_norm": 4.211307048797607, "learning_rate": 4.5225424859373684e-07, "logits/chosen": -0.9729929566383362, "logits/rejected": -0.971265435218811, "logps/chosen": -0.3391942083835602, "logps/rejected": -0.35309362411499023, "loss": 1.595, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8479855060577393, "rewards/margins": 0.03474842756986618, "rewards/rejected": -0.8827340602874756, "step": 131 }, { "epoch": 0.2821266363879241, "grad_norm": 9.722661018371582, "learning_rate": 4.511494449416671e-07, "logits/chosen": -0.8604239225387573, "logits/rejected": -0.790294885635376, "logps/chosen": -0.25934475660324097, "logps/rejected": -0.2542663812637329, "loss": 1.6546, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6483618021011353, "rewards/margins": -0.012695923447608948, "rewards/rejected": -0.6356659531593323, "step": 132 }, { "epoch": 0.28426395939086296, "grad_norm": 3.2997727394104004, "learning_rate": 4.500333870028016e-07, "logits/chosen": -1.0789867639541626, "logits/rejected": -1.073919653892517, "logps/chosen": -0.25591588020324707, "logps/rejected": -0.2593124806880951, "loss": 1.5489, "rewards/accuracies": 0.5, "rewards/chosen": -0.6397897601127625, "rewards/margins": 0.008491499349474907, "rewards/rejected": -0.6482812166213989, "step": 133 }, { "epoch": 0.28640128239380175, "grad_norm": 3.918221950531006, "learning_rate": 4.489061372204452e-07, "logits/chosen": -0.9510654211044312, "logits/rejected": -0.880722165107727, "logps/chosen": -0.2889711260795593, "logps/rejected": -0.32566970586776733, "loss": 1.5822, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7224277257919312, "rewards/margins": 0.09174646437168121, "rewards/rejected": -0.8141741752624512, "step": 134 }, { "epoch": 0.2885386053967406, "grad_norm": 9.736861228942871, "learning_rate": 4.4776775866408533e-07, "logits/chosen": -1.0732065439224243, "logits/rejected": -0.9681872725486755, "logps/chosen": -0.41927701234817505, "logps/rejected": -0.2924247086048126, "loss": 1.5615, "rewards/accuracies": 0.25, "rewards/chosen": -1.0481925010681152, "rewards/margins": -0.3171307146549225, "rewards/rejected": -0.7310618162155151, "step": 135 }, { "epoch": 0.29067592839967943, "grad_norm": 3.712836503982544, "learning_rate": 4.4661831502586244e-07, "logits/chosen": -0.9898865222930908, "logits/rejected": -0.958566427230835, "logps/chosen": -0.3362237811088562, "logps/rejected": -0.3830156624317169, "loss": 1.5408, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8405593633651733, "rewards/margins": 0.11697974801063538, "rewards/rejected": -0.9575392007827759, "step": 136 }, { "epoch": 0.2928132514026182, "grad_norm": 6.5986409187316895, "learning_rate": 4.4545787061700746e-07, "logits/chosen": -0.9952265620231628, "logits/rejected": -0.9618417024612427, "logps/chosen": -0.33093225955963135, "logps/rejected": -0.3158915042877197, "loss": 1.6151, "rewards/accuracies": 0.375, "rewards/chosen": -0.8273307085037231, "rewards/margins": -0.037601932883262634, "rewards/rejected": -0.7897287607192993, "step": 137 }, { "epoch": 0.29495057440555705, "grad_norm": 11.264966011047363, "learning_rate": 4.442864903642427e-07, "logits/chosen": -0.9705032706260681, "logits/rejected": -1.010439395904541, "logps/chosen": -0.29408299922943115, "logps/rejected": -0.32492977380752563, "loss": 1.6785, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7352074980735779, "rewards/margins": 0.07711698114871979, "rewards/rejected": -0.8123244047164917, "step": 138 }, { "epoch": 0.29708789740849584, "grad_norm": 3.294029951095581, "learning_rate": 4.4310423980614986e-07, "logits/chosen": -0.9771057963371277, "logits/rejected": -0.8812280893325806, "logps/chosen": -0.27679648995399475, "logps/rejected": -0.301949143409729, "loss": 1.574, "rewards/accuracies": 0.625, "rewards/chosen": -0.6919912099838257, "rewards/margins": 0.06288158893585205, "rewards/rejected": -0.7548727989196777, "step": 139 }, { "epoch": 0.2992252204114347, "grad_norm": 6.963750839233398, "learning_rate": 4.4191118508950277e-07, "logits/chosen": -0.9832889437675476, "logits/rejected": -1.041925311088562, "logps/chosen": -0.3261723518371582, "logps/rejected": -0.36972764134407043, "loss": 1.6004, "rewards/accuracies": 0.75, "rewards/chosen": -0.8154308199882507, "rewards/margins": 0.10888823121786118, "rewards/rejected": -0.9243191480636597, "step": 140 }, { "epoch": 0.3013625434143735, "grad_norm": 4.65049934387207, "learning_rate": 4.407073929655666e-07, "logits/chosen": -0.8786113858222961, "logits/rejected": -0.8743698000907898, "logps/chosen": -0.3489750027656555, "logps/rejected": -0.34578827023506165, "loss": 1.613, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8724376559257507, "rewards/margins": -0.007966993376612663, "rewards/rejected": -0.8644705414772034, "step": 141 }, { "epoch": 0.3034998664173123, "grad_norm": 5.086356163024902, "learning_rate": 4.394929307863632e-07, "logits/chosen": -1.178961157798767, "logits/rejected": -1.1221526861190796, "logps/chosen": -0.30651959776878357, "logps/rejected": -0.27971014380455017, "loss": 1.5804, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7662990093231201, "rewards/margins": -0.0670236200094223, "rewards/rejected": -0.6992753744125366, "step": 142 }, { "epoch": 0.30563718942025114, "grad_norm": 4.039379119873047, "learning_rate": 4.3826786650090273e-07, "logits/chosen": -1.1027284860610962, "logits/rejected": -1.0947867631912231, "logps/chosen": -0.3164759874343872, "logps/rejected": -0.4346773028373718, "loss": 1.5507, "rewards/accuracies": 0.375, "rewards/chosen": -0.791189968585968, "rewards/margins": 0.29550325870513916, "rewards/rejected": -1.086693286895752, "step": 143 }, { "epoch": 0.3077745124231899, "grad_norm": 5.900667190551758, "learning_rate": 4.370322686513817e-07, "logits/chosen": -0.8383625149726868, "logits/rejected": -0.7769290804862976, "logps/chosen": -0.2520799934864044, "logps/rejected": -0.23787729442119598, "loss": 1.5816, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6302000284194946, "rewards/margins": -0.03550675883889198, "rewards/rejected": -0.5946931838989258, "step": 144 }, { "epoch": 0.30991183542612877, "grad_norm": 5.484445095062256, "learning_rate": 4.357862063693485e-07, "logits/chosen": -0.9914720058441162, "logits/rejected": -1.0965267419815063, "logps/chosen": -0.2822697162628174, "logps/rejected": -0.3381388485431671, "loss": 1.5638, "rewards/accuracies": 0.625, "rewards/chosen": -0.7056742906570435, "rewards/margins": 0.13967278599739075, "rewards/rejected": -0.8453471064567566, "step": 145 }, { "epoch": 0.3120491584290676, "grad_norm": 2.739793539047241, "learning_rate": 4.345297493718352e-07, "logits/chosen": -0.9342893362045288, "logits/rejected": -0.8757031559944153, "logps/chosen": -0.512154221534729, "logps/rejected": -0.6000754237174988, "loss": 1.5743, "rewards/accuracies": 0.625, "rewards/chosen": -1.2803857326507568, "rewards/margins": 0.2198028415441513, "rewards/rejected": -1.5001884698867798, "step": 146 }, { "epoch": 0.3141864814320064, "grad_norm": 4.219600677490234, "learning_rate": 4.332629679574565e-07, "logits/chosen": -0.7380187511444092, "logits/rejected": -0.8239220380783081, "logps/chosen": -0.24688729643821716, "logps/rejected": -0.2952543795108795, "loss": 1.5583, "rewards/accuracies": 0.625, "rewards/chosen": -0.6172181963920593, "rewards/margins": 0.12091781944036484, "rewards/rejected": -0.7381359934806824, "step": 147 }, { "epoch": 0.31632380443494523, "grad_norm": 7.114157676696777, "learning_rate": 4.319859330024777e-07, "logits/chosen": -0.950808048248291, "logits/rejected": -0.873584508895874, "logps/chosen": -0.28023314476013184, "logps/rejected": -0.37878137826919556, "loss": 1.575, "rewards/accuracies": 0.5, "rewards/chosen": -0.7005828022956848, "rewards/margins": 0.2463706135749817, "rewards/rejected": -0.9469534754753113, "step": 148 }, { "epoch": 0.3184611274378841, "grad_norm": 5.033133029937744, "learning_rate": 4.3069871595684787e-07, "logits/chosen": -0.9993598461151123, "logits/rejected": -1.1495643854141235, "logps/chosen": -0.31961789727211, "logps/rejected": -0.4130839705467224, "loss": 1.6089, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7990447878837585, "rewards/margins": 0.2336651086807251, "rewards/rejected": -1.0327098369598389, "step": 149 }, { "epoch": 0.32059845044082286, "grad_norm": 3.7931158542633057, "learning_rate": 4.294013888402029e-07, "logits/chosen": -1.0581141710281372, "logits/rejected": -0.958967924118042, "logps/chosen": -0.30636316537857056, "logps/rejected": -0.31132641434669495, "loss": 1.6122, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7659078240394592, "rewards/margins": 0.012408185750246048, "rewards/rejected": -0.7783160209655762, "step": 150 }, { "epoch": 0.3227357734437617, "grad_norm": 4.442758560180664, "learning_rate": 4.280940242378362e-07, "logits/chosen": -0.9492220878601074, "logits/rejected": -0.9829614162445068, "logps/chosen": -0.26527076959609985, "logps/rejected": -0.5424583554267883, "loss": 1.5182, "rewards/accuracies": 0.625, "rewards/chosen": -0.6631768941879272, "rewards/margins": 0.6929690837860107, "rewards/rejected": -1.3561458587646484, "step": 151 }, { "epoch": 0.3248730964467005, "grad_norm": 4.347165107727051, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -0.9675495624542236, "logits/rejected": -0.9267060160636902, "logps/chosen": -0.28724977374076843, "logps/rejected": -0.27893343567848206, "loss": 1.6064, "rewards/accuracies": 0.5, "rewards/chosen": -0.7181244492530823, "rewards/margins": -0.02079082280397415, "rewards/rejected": -0.697333574295044, "step": 152 }, { "epoch": 0.3270104194496393, "grad_norm": 3.866643190383911, "learning_rate": 4.254494757209979e-07, "logits/chosen": -1.0312570333480835, "logits/rejected": -0.8400145173072815, "logps/chosen": -0.2714364230632782, "logps/rejected": -0.3370465636253357, "loss": 1.5993, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6785910725593567, "rewards/margins": 0.16402524709701538, "rewards/rejected": -0.8426163196563721, "step": 153 }, { "epoch": 0.32914774245257816, "grad_norm": 7.2947211265563965, "learning_rate": 4.2411243976869173e-07, "logits/chosen": -1.1030328273773193, "logits/rejected": -1.107038140296936, "logps/chosen": -0.31799790263175964, "logps/rejected": -0.3556910455226898, "loss": 1.5435, "rewards/accuracies": 0.5, "rewards/chosen": -0.7949947714805603, "rewards/margins": 0.09423284232616425, "rewards/rejected": -0.8892276287078857, "step": 154 }, { "epoch": 0.33128506545551695, "grad_norm": 6.094887733459473, "learning_rate": 4.227656622467162e-07, "logits/chosen": -0.9807777404785156, "logits/rejected": -0.9574925303459167, "logps/chosen": -0.36069509387016296, "logps/rejected": -0.411272257566452, "loss": 1.5455, "rewards/accuracies": 0.5, "rewards/chosen": -0.9017376899719238, "rewards/margins": 0.12644296884536743, "rewards/rejected": -1.028180718421936, "step": 155 }, { "epoch": 0.3334223884584558, "grad_norm": 7.840887069702148, "learning_rate": 4.2140921850710855e-07, "logits/chosen": -1.1150490045547485, "logits/rejected": -1.1116127967834473, "logps/chosen": -0.2742304801940918, "logps/rejected": -0.3083428740501404, "loss": 1.546, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6855762004852295, "rewards/margins": 0.08528101444244385, "rewards/rejected": -0.7708572149276733, "step": 156 }, { "epoch": 0.3355597114613946, "grad_norm": 15.734699249267578, "learning_rate": 4.200431844427298e-07, "logits/chosen": -0.9994797706604004, "logits/rejected": -1.077652931213379, "logps/chosen": -0.3408905565738678, "logps/rejected": -0.605131208896637, "loss": 1.5829, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8522265553474426, "rewards/margins": 0.6606015563011169, "rewards/rejected": -1.51282799243927, "step": 157 }, { "epoch": 0.3376970344643334, "grad_norm": 4.568877696990967, "learning_rate": 4.186676364830186e-07, "logits/chosen": -0.8166912794113159, "logits/rejected": -0.9158197641372681, "logps/chosen": -0.3100201189517975, "logps/rejected": -0.4257528781890869, "loss": 1.5949, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7750502824783325, "rewards/margins": 0.28933185338974, "rewards/rejected": -1.0643821954727173, "step": 158 }, { "epoch": 0.33983435746727225, "grad_norm": 7.091736793518066, "learning_rate": 4.172826515897145e-07, "logits/chosen": -0.9496626853942871, "logits/rejected": -0.8826749920845032, "logps/chosen": -0.2823619842529297, "logps/rejected": -0.25573766231536865, "loss": 1.6079, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7059049606323242, "rewards/margins": -0.06656082719564438, "rewards/rejected": -0.6393441557884216, "step": 159 }, { "epoch": 0.34197168047021104, "grad_norm": 9.935672760009766, "learning_rate": 4.158883072525528e-07, "logits/chosen": -1.139492392539978, "logits/rejected": -0.9911923408508301, "logps/chosen": -0.24080964922904968, "logps/rejected": -0.23250696063041687, "loss": 1.5373, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6020240783691406, "rewards/margins": -0.020756704732775688, "rewards/rejected": -0.5812674164772034, "step": 160 }, { "epoch": 0.3441090034731499, "grad_norm": 15.794203758239746, "learning_rate": 4.1448468148492814e-07, "logits/chosen": -1.019397258758545, "logits/rejected": -0.9881049394607544, "logps/chosen": -0.3946765065193176, "logps/rejected": -0.3796921670436859, "loss": 1.5431, "rewards/accuracies": 0.625, "rewards/chosen": -0.9866912364959717, "rewards/margins": -0.03746076300740242, "rewards/rejected": -0.9492304921150208, "step": 161 }, { "epoch": 0.3462463264760887, "grad_norm": 3.839250087738037, "learning_rate": 4.130718528195303e-07, "logits/chosen": -0.9311838746070862, "logits/rejected": -0.8956501483917236, "logps/chosen": -0.282693088054657, "logps/rejected": -0.2629316449165344, "loss": 1.5818, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7067327499389648, "rewards/margins": -0.04940361529588699, "rewards/rejected": -0.6573290824890137, "step": 162 }, { "epoch": 0.3483836494790275, "grad_norm": 5.865959644317627, "learning_rate": 4.1164990030394985e-07, "logits/chosen": -1.0395972728729248, "logits/rejected": -0.9770699143409729, "logps/chosen": -0.3128069043159485, "logps/rejected": -0.31642264127731323, "loss": 1.5692, "rewards/accuracies": 0.3125, "rewards/chosen": -0.7820172905921936, "rewards/margins": 0.009039390832185745, "rewards/rejected": -0.791056752204895, "step": 163 }, { "epoch": 0.35052097248196634, "grad_norm": 4.154603481292725, "learning_rate": 4.10218903496256e-07, "logits/chosen": -1.0948988199234009, "logits/rejected": -0.9907031059265137, "logps/chosen": -0.30839213728904724, "logps/rejected": -0.29299482703208923, "loss": 1.5829, "rewards/accuracies": 0.375, "rewards/chosen": -0.7709803581237793, "rewards/margins": -0.03849326819181442, "rewards/rejected": -0.7324870824813843, "step": 164 }, { "epoch": 0.3526582954849052, "grad_norm": 5.253880500793457, "learning_rate": 4.087789424605447e-07, "logits/chosen": -1.0539865493774414, "logits/rejected": -0.9663246870040894, "logps/chosen": -0.26886874437332153, "logps/rejected": -0.43172940611839294, "loss": 1.5157, "rewards/accuracies": 0.5625, "rewards/chosen": -0.672171950340271, "rewards/margins": 0.40715163946151733, "rewards/rejected": -1.0793235301971436, "step": 165 }, { "epoch": 0.35479561848784397, "grad_norm": 11.49240493774414, "learning_rate": 4.0733009776245937e-07, "logits/chosen": -0.9969057440757751, "logits/rejected": -1.0402690172195435, "logps/chosen": -0.3554040193557739, "logps/rejected": -0.396072655916214, "loss": 1.584, "rewards/accuracies": 0.5, "rewards/chosen": -0.8885101079940796, "rewards/margins": 0.10167157649993896, "rewards/rejected": -0.9901816248893738, "step": 166 }, { "epoch": 0.3569329414907828, "grad_norm": 5.116168975830078, "learning_rate": 4.058724504646834e-07, "logits/chosen": -0.9382141828536987, "logits/rejected": -0.8863942623138428, "logps/chosen": -0.261793315410614, "logps/rejected": -0.45277461409568787, "loss": 1.539, "rewards/accuracies": 0.4375, "rewards/chosen": -0.6544832587242126, "rewards/margins": 0.47745317220687866, "rewards/rejected": -1.1319365501403809, "step": 167 }, { "epoch": 0.3590702644937216, "grad_norm": 10.550248146057129, "learning_rate": 4.0440608212240445e-07, "logits/chosen": -1.0490831136703491, "logits/rejected": -1.1039912700653076, "logps/chosen": -0.3632212281227112, "logps/rejected": -0.3736804723739624, "loss": 1.5845, "rewards/accuracies": 0.625, "rewards/chosen": -0.9080529808998108, "rewards/margins": 0.026148155331611633, "rewards/rejected": -0.934201180934906, "step": 168 }, { "epoch": 0.36120758749666043, "grad_norm": 4.120011806488037, "learning_rate": 4.0293107477875156e-07, "logits/chosen": -0.914804220199585, "logits/rejected": -0.9306747317314148, "logps/chosen": -0.3597089350223541, "logps/rejected": -0.39882034063339233, "loss": 1.5235, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8992725014686584, "rewards/margins": 0.09777843952178955, "rewards/rejected": -0.9970508813858032, "step": 169 }, { "epoch": 0.36334491049959927, "grad_norm": 3.9872193336486816, "learning_rate": 4.0144751096020497e-07, "logits/chosen": -1.0519163608551025, "logits/rejected": -0.9880449175834656, "logps/chosen": -0.27723756432533264, "logps/rejected": -0.3831270933151245, "loss": 1.5744, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6930938959121704, "rewards/margins": 0.26472383737564087, "rewards/rejected": -0.9578177332878113, "step": 170 }, { "epoch": 0.36548223350253806, "grad_norm": 5.331676006317139, "learning_rate": 3.999554736719785e-07, "logits/chosen": -1.1113324165344238, "logits/rejected": -1.1892024278640747, "logps/chosen": -0.3108530640602112, "logps/rejected": -0.5784565806388855, "loss": 1.4846, "rewards/accuracies": 0.625, "rewards/chosen": -0.7771324515342712, "rewards/margins": 0.6690089702606201, "rewards/rejected": -1.4461414813995361, "step": 171 }, { "epoch": 0.3676195565054769, "grad_norm": 6.586511611938477, "learning_rate": 3.9845504639337535e-07, "logits/chosen": -1.2047513723373413, "logits/rejected": -1.1406968832015991, "logps/chosen": -0.3595273196697235, "logps/rejected": -0.32145068049430847, "loss": 1.5328, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8988182544708252, "rewards/margins": -0.0951915979385376, "rewards/rejected": -0.8036267757415771, "step": 172 }, { "epoch": 0.36975687950841574, "grad_norm": 6.111835479736328, "learning_rate": 3.9694631307311825e-07, "logits/chosen": -0.8004586696624756, "logits/rejected": -0.7772153615951538, "logps/chosen": -0.4090813100337982, "logps/rejected": -0.4898335635662079, "loss": 1.559, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0227031707763672, "rewards/margins": 0.20188063383102417, "rewards/rejected": -1.2245839834213257, "step": 173 }, { "epoch": 0.3718942025113545, "grad_norm": 7.0863189697265625, "learning_rate": 3.954293581246514e-07, "logits/chosen": -0.9679336547851562, "logits/rejected": -0.9125540256500244, "logps/chosen": -0.29369306564331055, "logps/rejected": -0.31403255462646484, "loss": 1.5375, "rewards/accuracies": 0.625, "rewards/chosen": -0.7342327237129211, "rewards/margins": 0.05084871128201485, "rewards/rejected": -0.7850814461708069, "step": 174 }, { "epoch": 0.37403152551429336, "grad_norm": 7.140958309173584, "learning_rate": 3.939042664214184e-07, "logits/chosen": -0.949452817440033, "logits/rejected": -1.0473122596740723, "logps/chosen": -0.2707624137401581, "logps/rejected": -0.32049351930618286, "loss": 1.5626, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6769061088562012, "rewards/margins": 0.12432771921157837, "rewards/rejected": -0.8012337684631348, "step": 175 }, { "epoch": 0.37616884851723215, "grad_norm": 7.0456695556640625, "learning_rate": 3.92371123292113e-07, "logits/chosen": -1.0727981328964233, "logits/rejected": -1.1329890489578247, "logps/chosen": -0.29705438017845154, "logps/rejected": -0.3278125524520874, "loss": 1.6107, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7426358461380005, "rewards/margins": 0.07689555734395981, "rewards/rejected": -0.8195314407348633, "step": 176 }, { "epoch": 0.378306171520171, "grad_norm": 5.836486339569092, "learning_rate": 3.908300145159055e-07, "logits/chosen": -0.9942230582237244, "logits/rejected": -1.0356171131134033, "logps/chosen": -0.31931719183921814, "logps/rejected": -0.33853164315223694, "loss": 1.5837, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7982929348945618, "rewards/margins": 0.0480361208319664, "rewards/rejected": -0.8463290929794312, "step": 177 }, { "epoch": 0.3804434945231098, "grad_norm": 8.505417823791504, "learning_rate": 3.8928102631764304e-07, "logits/chosen": -1.0212180614471436, "logits/rejected": -1.087773323059082, "logps/chosen": -0.3532945513725281, "logps/rejected": -0.5901373028755188, "loss": 1.5557, "rewards/accuracies": 0.6875, "rewards/chosen": -0.883236289024353, "rewards/margins": 0.5921069979667664, "rewards/rejected": -1.4753433465957642, "step": 178 }, { "epoch": 0.3825808175260486, "grad_norm": 6.116640090942383, "learning_rate": 3.877242453630256e-07, "logits/chosen": -1.2131381034851074, "logits/rejected": -1.0686910152435303, "logps/chosen": -0.3515666127204895, "logps/rejected": -0.3958896994590759, "loss": 1.5671, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8789165019989014, "rewards/margins": 0.11080773174762726, "rewards/rejected": -0.9897242784500122, "step": 179 }, { "epoch": 0.38471814052898745, "grad_norm": 6.355064868927002, "learning_rate": 3.8615975875375676e-07, "logits/chosen": -0.9339985847473145, "logits/rejected": -0.9060691595077515, "logps/chosen": -0.32276052236557007, "logps/rejected": -0.37401843070983887, "loss": 1.548, "rewards/accuracies": 0.75, "rewards/chosen": -0.8069013953208923, "rewards/margins": 0.1281447559595108, "rewards/rejected": -0.9350461959838867, "step": 180 }, { "epoch": 0.38685546353192624, "grad_norm": 6.534996509552002, "learning_rate": 3.8458765402267056e-07, "logits/chosen": -0.8938146233558655, "logits/rejected": -0.9069436192512512, "logps/chosen": -0.336931049823761, "logps/rejected": -0.4913772940635681, "loss": 1.5787, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8423275947570801, "rewards/margins": 0.3861156404018402, "rewards/rejected": -1.2284431457519531, "step": 181 }, { "epoch": 0.3889927865348651, "grad_norm": 10.956029891967773, "learning_rate": 3.8300801912883414e-07, "logits/chosen": -1.0703511238098145, "logits/rejected": -0.9989842176437378, "logps/chosen": -0.26583123207092285, "logps/rejected": -0.2977861762046814, "loss": 1.5609, "rewards/accuracies": 0.75, "rewards/chosen": -0.6645781397819519, "rewards/margins": 0.07988730072975159, "rewards/rejected": -0.7444654107093811, "step": 182 }, { "epoch": 0.3911301095378039, "grad_norm": 10.217528343200684, "learning_rate": 3.8142094245262615e-07, "logits/chosen": -1.145703673362732, "logits/rejected": -1.0282764434814453, "logps/chosen": -0.3538467586040497, "logps/rejected": -0.3405742645263672, "loss": 1.5855, "rewards/accuracies": 0.375, "rewards/chosen": -0.8846168518066406, "rewards/margins": -0.03318122401833534, "rewards/rejected": -0.8514357209205627, "step": 183 }, { "epoch": 0.3932674325407427, "grad_norm": 4.681653022766113, "learning_rate": 3.7982651279079227e-07, "logits/chosen": -1.2552436590194702, "logits/rejected": -1.259030818939209, "logps/chosen": -0.2886826992034912, "logps/rejected": -0.4662485718727112, "loss": 1.5609, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7217066884040833, "rewards/margins": 0.44391465187072754, "rewards/rejected": -1.1656213998794556, "step": 184 }, { "epoch": 0.39540475554368154, "grad_norm": 4.339652061462402, "learning_rate": 3.7822481935147655e-07, "logits/chosen": -1.0260683298110962, "logits/rejected": -1.015075922012329, "logps/chosen": -0.36714547872543335, "logps/rejected": -0.5204967260360718, "loss": 1.5682, "rewards/accuracies": 0.875, "rewards/chosen": -0.9178636074066162, "rewards/margins": 0.38337817788124084, "rewards/rejected": -1.3012418746948242, "step": 185 }, { "epoch": 0.3975420785466204, "grad_norm": 5.974206924438477, "learning_rate": 3.766159517492307e-07, "logits/chosen": -1.0455535650253296, "logits/rejected": -1.1319448947906494, "logps/chosen": -0.41289687156677246, "logps/rejected": -0.613991379737854, "loss": 1.5825, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0322421789169312, "rewards/margins": 0.5027362108230591, "rewards/rejected": -1.5349783897399902, "step": 186 }, { "epoch": 0.39967940154955917, "grad_norm": 8.767956733703613, "learning_rate": 3.75e-07, "logits/chosen": -1.0032697916030884, "logits/rejected": -0.9564570784568787, "logps/chosen": -0.31954333186149597, "logps/rejected": -0.4057242274284363, "loss": 1.6033, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7988582849502563, "rewards/margins": 0.21545226871967316, "rewards/rejected": -1.014310598373413, "step": 187 }, { "epoch": 0.401816724552498, "grad_norm": 4.35204553604126, "learning_rate": 3.7337705451608667e-07, "logits/chosen": -1.1166412830352783, "logits/rejected": -1.0849709510803223, "logps/chosen": -0.3008464574813843, "logps/rejected": -0.2960435450077057, "loss": 1.5105, "rewards/accuracies": 0.625, "rewards/chosen": -0.7521160840988159, "rewards/margins": -0.012007185257971287, "rewards/rejected": -0.7401089072227478, "step": 188 }, { "epoch": 0.4039540475554368, "grad_norm": 3.929826021194458, "learning_rate": 3.717472061010918e-07, "logits/chosen": -1.1040568351745605, "logits/rejected": -1.062517523765564, "logps/chosen": -0.3373297154903412, "logps/rejected": -0.5283687710762024, "loss": 1.5152, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8433243632316589, "rewards/margins": 0.47759759426116943, "rewards/rejected": -1.3209218978881836, "step": 189 }, { "epoch": 0.40609137055837563, "grad_norm": 4.574549198150635, "learning_rate": 3.7011054594483443e-07, "logits/chosen": -1.1240224838256836, "logits/rejected": -1.0487711429595947, "logps/chosen": -0.3029400706291199, "logps/rejected": -0.41601306200027466, "loss": 1.4632, "rewards/accuracies": 0.625, "rewards/chosen": -0.7573502063751221, "rewards/margins": 0.28268247842788696, "rewards/rejected": -1.0400326251983643, "step": 190 }, { "epoch": 0.40822869356131447, "grad_norm": 4.018647193908691, "learning_rate": 3.6846716561824967e-07, "logits/chosen": -0.80363529920578, "logits/rejected": -0.9596213102340698, "logps/chosen": -0.3076530694961548, "logps/rejected": -0.5633202195167542, "loss": 1.5163, "rewards/accuracies": 0.5, "rewards/chosen": -0.7691327333450317, "rewards/margins": 0.6391679048538208, "rewards/rejected": -1.408300518989563, "step": 191 }, { "epoch": 0.41036601656425326, "grad_norm": 7.332089424133301, "learning_rate": 3.668171570682655e-07, "logits/chosen": -0.9585205316543579, "logits/rejected": -0.9636404514312744, "logps/chosen": -0.33684462308883667, "logps/rejected": -0.3766506016254425, "loss": 1.5671, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8421115279197693, "rewards/margins": 0.09951499849557877, "rewards/rejected": -0.9416265487670898, "step": 192 }, { "epoch": 0.4125033395671921, "grad_norm": 8.853985786437988, "learning_rate": 3.6516061261265805e-07, "logits/chosen": -1.027462363243103, "logits/rejected": -0.9135668873786926, "logps/chosen": -0.3370886445045471, "logps/rejected": -0.3812939524650574, "loss": 1.5598, "rewards/accuracies": 0.375, "rewards/chosen": -0.8427215814590454, "rewards/margins": 0.11051319539546967, "rewards/rejected": -0.953234851360321, "step": 193 }, { "epoch": 0.41464066257013094, "grad_norm": 5.765879154205322, "learning_rate": 3.634976249348867e-07, "logits/chosen": -1.1132540702819824, "logits/rejected": -1.003641963005066, "logps/chosen": -0.3518536686897278, "logps/rejected": -0.5063703656196594, "loss": 1.5071, "rewards/accuracies": 0.75, "rewards/chosen": -0.8796342015266418, "rewards/margins": 0.38629183173179626, "rewards/rejected": -1.2659261226654053, "step": 194 }, { "epoch": 0.4167779855730697, "grad_norm": 17.148714065551758, "learning_rate": 3.618282870789081e-07, "logits/chosen": -1.041336178779602, "logits/rejected": -1.0308490991592407, "logps/chosen": -0.4422120749950409, "logps/rejected": -0.4290231466293335, "loss": 1.6783, "rewards/accuracies": 0.5, "rewards/chosen": -1.1055301427841187, "rewards/margins": -0.032972272485494614, "rewards/rejected": -1.072557806968689, "step": 195 }, { "epoch": 0.41891530857600856, "grad_norm": 4.9743332862854, "learning_rate": 3.601526924439709e-07, "logits/chosen": -0.9943188428878784, "logits/rejected": -1.029951810836792, "logps/chosen": -0.2909929156303406, "logps/rejected": -0.3154396116733551, "loss": 1.5771, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7274821996688843, "rewards/margins": 0.061116717755794525, "rewards/rejected": -0.7885990142822266, "step": 196 }, { "epoch": 0.42105263157894735, "grad_norm": 6.192495346069336, "learning_rate": 3.584709347793895e-07, "logits/chosen": -0.8082910776138306, "logits/rejected": -0.8116950988769531, "logps/chosen": -0.2856646478176117, "logps/rejected": -0.30446913838386536, "loss": 1.5157, "rewards/accuracies": 0.5, "rewards/chosen": -0.7141616344451904, "rewards/margins": 0.04701121151447296, "rewards/rejected": -0.7611728310585022, "step": 197 }, { "epoch": 0.4231899545818862, "grad_norm": 4.891373157501221, "learning_rate": 3.567831081792992e-07, "logits/chosen": -1.0285996198654175, "logits/rejected": -1.034073829650879, "logps/chosen": -0.3283870220184326, "logps/rejected": -0.5464656949043274, "loss": 1.4871, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8209674954414368, "rewards/margins": 0.5451965928077698, "rewards/rejected": -1.366164207458496, "step": 198 }, { "epoch": 0.425327277584825, "grad_norm": 13.869108200073242, "learning_rate": 3.550893070773914e-07, "logits/chosen": -1.0854626893997192, "logits/rejected": -1.0260361433029175, "logps/chosen": -0.39059579372406006, "logps/rejected": -0.4412023425102234, "loss": 1.6672, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9764894247055054, "rewards/margins": 0.12651631236076355, "rewards/rejected": -1.1030058860778809, "step": 199 }, { "epoch": 0.4274646005877638, "grad_norm": 29.342126846313477, "learning_rate": 3.5338962624163016e-07, "logits/chosen": -1.1286933422088623, "logits/rejected": -1.1019514799118042, "logps/chosen": -0.29572370648384094, "logps/rejected": -0.3438429832458496, "loss": 1.6118, "rewards/accuracies": 0.625, "rewards/chosen": -0.7393092513084412, "rewards/margins": 0.12029813230037689, "rewards/rejected": -0.8596073985099792, "step": 200 }, { "epoch": 0.42960192359070265, "grad_norm": 4.435629367828369, "learning_rate": 3.516841607689501e-07, "logits/chosen": -1.1759017705917358, "logits/rejected": -1.0626184940338135, "logps/chosen": -0.3442676067352295, "logps/rejected": -0.3576590120792389, "loss": 1.5321, "rewards/accuracies": 0.5, "rewards/chosen": -0.8606690168380737, "rewards/margins": 0.03347862884402275, "rewards/rejected": -0.8941476345062256, "step": 201 }, { "epoch": 0.4317392465936415, "grad_norm": 5.45989990234375, "learning_rate": 3.499730060799352e-07, "logits/chosen": -1.1944599151611328, "logits/rejected": -1.1447770595550537, "logps/chosen": -0.300496369600296, "logps/rejected": -0.3771470785140991, "loss": 1.4774, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7512409687042236, "rewards/margins": 0.19162678718566895, "rewards/rejected": -0.9428676962852478, "step": 202 }, { "epoch": 0.4338765695965803, "grad_norm": 4.396944046020508, "learning_rate": 3.482562579134809e-07, "logits/chosen": -0.9371283054351807, "logits/rejected": -0.9887581467628479, "logps/chosen": -0.34337079524993896, "logps/rejected": -0.31941717863082886, "loss": 1.5624, "rewards/accuracies": 0.375, "rewards/chosen": -0.8584270477294922, "rewards/margins": -0.05988417938351631, "rewards/rejected": -0.798542857170105, "step": 203 }, { "epoch": 0.4360138925995191, "grad_norm": 5.779623508453369, "learning_rate": 3.465340123214365e-07, "logits/chosen": -0.9840802550315857, "logits/rejected": -0.9649553298950195, "logps/chosen": -0.5713462829589844, "logps/rejected": -0.7279367446899414, "loss": 1.5474, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4283654689788818, "rewards/margins": 0.39147651195526123, "rewards/rejected": -1.8198421001434326, "step": 204 }, { "epoch": 0.4381512156024579, "grad_norm": 10.535792350769043, "learning_rate": 3.448063656632321e-07, "logits/chosen": -1.1214243173599243, "logits/rejected": -1.0236384868621826, "logps/chosen": -0.327178418636322, "logps/rejected": -0.3443678021430969, "loss": 1.5847, "rewards/accuracies": 0.375, "rewards/chosen": -0.8179460167884827, "rewards/margins": 0.042973555624485016, "rewards/rejected": -0.8609195351600647, "step": 205 }, { "epoch": 0.44028853860539674, "grad_norm": 5.442493915557861, "learning_rate": 3.430734146004863e-07, "logits/chosen": -1.1191673278808594, "logits/rejected": -0.9904736876487732, "logps/chosen": -0.2607005536556244, "logps/rejected": -0.2681718170642853, "loss": 1.542, "rewards/accuracies": 0.3125, "rewards/chosen": -0.6517513990402222, "rewards/margins": 0.01867814175784588, "rewards/rejected": -0.670429527759552, "step": 206 }, { "epoch": 0.4424258616083356, "grad_norm": 6.850170612335205, "learning_rate": 3.413352560915988e-07, "logits/chosen": -1.0275464057922363, "logits/rejected": -1.0052015781402588, "logps/chosen": -0.3867985010147095, "logps/rejected": -0.4938412010669708, "loss": 1.6312, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9669963121414185, "rewards/margins": 0.267606645822525, "rewards/rejected": -1.234602928161621, "step": 207 }, { "epoch": 0.44456318461127436, "grad_norm": 9.965657234191895, "learning_rate": 3.39591987386325e-07, "logits/chosen": -0.9659216403961182, "logits/rejected": -0.9130998253822327, "logps/chosen": -0.33372846245765686, "logps/rejected": -0.3092671036720276, "loss": 1.5355, "rewards/accuracies": 0.375, "rewards/chosen": -0.834321141242981, "rewards/margins": -0.061153292655944824, "rewards/rejected": -0.7731677889823914, "step": 208 }, { "epoch": 0.4467005076142132, "grad_norm": 5.595789909362793, "learning_rate": 3.378437060203357e-07, "logits/chosen": -1.2547951936721802, "logits/rejected": -1.1610562801361084, "logps/chosen": -0.34088316559791565, "logps/rejected": -0.34324803948402405, "loss": 1.6059, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8522078990936279, "rewards/margins": 0.00591224804520607, "rewards/rejected": -0.8581202030181885, "step": 209 }, { "epoch": 0.448837830617152, "grad_norm": 17.92057991027832, "learning_rate": 3.360905098097587e-07, "logits/chosen": -1.0579925775527954, "logits/rejected": -0.9834758043289185, "logps/chosen": -0.38748034834861755, "logps/rejected": -0.6860374808311462, "loss": 1.5363, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9687008857727051, "rewards/margins": 0.7463930249214172, "rewards/rejected": -1.715093970298767, "step": 210 }, { "epoch": 0.45097515362009083, "grad_norm": 6.570519924163818, "learning_rate": 3.343324968457075e-07, "logits/chosen": -1.0359179973602295, "logits/rejected": -0.9564209580421448, "logps/chosen": -0.38825637102127075, "logps/rejected": -0.3802332878112793, "loss": 1.5384, "rewards/accuracies": 0.5, "rewards/chosen": -0.970641016960144, "rewards/margins": -0.02005772665143013, "rewards/rejected": -0.950583279132843, "step": 211 }, { "epoch": 0.45311247662302967, "grad_norm": 5.518048286437988, "learning_rate": 3.325697654887918e-07, "logits/chosen": -0.998512327671051, "logits/rejected": -0.9381792545318604, "logps/chosen": -0.3794736862182617, "logps/rejected": -0.6236636638641357, "loss": 1.5091, "rewards/accuracies": 0.75, "rewards/chosen": -0.9486840963363647, "rewards/margins": 0.6104748249053955, "rewards/rejected": -1.5591590404510498, "step": 212 }, { "epoch": 0.45524979962596845, "grad_norm": 12.084184646606445, "learning_rate": 3.30802414363615e-07, "logits/chosen": -0.9403542280197144, "logits/rejected": -0.6737431287765503, "logps/chosen": -0.4244030714035034, "logps/rejected": -0.43834903836250305, "loss": 1.4581, "rewards/accuracies": 0.375, "rewards/chosen": -1.0610076189041138, "rewards/margins": 0.03486503288149834, "rewards/rejected": -1.0958726406097412, "step": 213 }, { "epoch": 0.4573871226289073, "grad_norm": 3.5294582843780518, "learning_rate": 3.2903054235325613e-07, "logits/chosen": -1.1825759410858154, "logits/rejected": -1.210655927658081, "logps/chosen": -0.3315200209617615, "logps/rejected": -0.46745753288269043, "loss": 1.5312, "rewards/accuracies": 0.625, "rewards/chosen": -0.8287999629974365, "rewards/margins": 0.33984383940696716, "rewards/rejected": -1.168643832206726, "step": 214 }, { "epoch": 0.45952444563184613, "grad_norm": 6.134922027587891, "learning_rate": 3.272542485937368e-07, "logits/chosen": -1.1072171926498413, "logits/rejected": -1.208855152130127, "logps/chosen": -0.4051734209060669, "logps/rejected": -0.6289750337600708, "loss": 1.5473, "rewards/accuracies": 0.6875, "rewards/chosen": -1.012933611869812, "rewards/margins": 0.5595039129257202, "rewards/rejected": -1.5724375247955322, "step": 215 }, { "epoch": 0.4616617686347849, "grad_norm": 6.10336446762085, "learning_rate": 3.2547363246847546e-07, "logits/chosen": -1.0125056505203247, "logits/rejected": -1.0291041135787964, "logps/chosen": -0.3960397243499756, "logps/rejected": -0.6897832751274109, "loss": 1.5091, "rewards/accuracies": 0.625, "rewards/chosen": -0.990099310874939, "rewards/margins": 0.7343588471412659, "rewards/rejected": -1.72445809841156, "step": 216 }, { "epoch": 0.46379909163772376, "grad_norm": 9.434686660766602, "learning_rate": 3.2368879360272606e-07, "logits/chosen": -1.0608569383621216, "logits/rejected": -1.0038235187530518, "logps/chosen": -0.4567071199417114, "logps/rejected": -0.42994168400764465, "loss": 1.616, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1417678594589233, "rewards/margins": -0.06691357493400574, "rewards/rejected": -1.0748542547225952, "step": 217 }, { "epoch": 0.46593641464066254, "grad_norm": 5.21524715423584, "learning_rate": 3.218998318580043e-07, "logits/chosen": -1.1354548931121826, "logits/rejected": -1.0435974597930908, "logps/chosen": -0.2741296589374542, "logps/rejected": -0.37144631147384644, "loss": 1.6029, "rewards/accuracies": 0.625, "rewards/chosen": -0.6853241920471191, "rewards/margins": 0.24329157173633575, "rewards/rejected": -0.9286156892776489, "step": 218 }, { "epoch": 0.4680737376436014, "grad_norm": 4.108745574951172, "learning_rate": 3.201068473265007e-07, "logits/chosen": -0.8878648281097412, "logits/rejected": -0.8645142316818237, "logps/chosen": -0.32466036081314087, "logps/rejected": -0.28847843408584595, "loss": 1.6023, "rewards/accuracies": 0.5625, "rewards/chosen": -0.811650812625885, "rewards/margins": -0.09045480191707611, "rewards/rejected": -0.7211960554122925, "step": 219 }, { "epoch": 0.4702110606465402, "grad_norm": 17.760408401489258, "learning_rate": 3.1830994032548e-07, "logits/chosen": -1.197770595550537, "logits/rejected": -1.0971354246139526, "logps/chosen": -0.44655174016952515, "logps/rejected": -0.5050027370452881, "loss": 1.6185, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1163791418075562, "rewards/margins": 0.14612753689289093, "rewards/rejected": -1.2625068426132202, "step": 220 }, { "epoch": 0.472348383649479, "grad_norm": 24.489158630371094, "learning_rate": 3.1650921139166874e-07, "logits/chosen": -0.9091489315032959, "logits/rejected": -0.9671614766120911, "logps/chosen": -0.2689306437969208, "logps/rejected": -0.2791651487350464, "loss": 1.6576, "rewards/accuracies": 0.5625, "rewards/chosen": -0.6723266243934631, "rewards/margins": 0.02558620274066925, "rewards/rejected": -0.6979128122329712, "step": 221 }, { "epoch": 0.47448570665241785, "grad_norm": 4.240891933441162, "learning_rate": 3.147047612756302e-07, "logits/chosen": -1.1410434246063232, "logits/rejected": -0.9494026303291321, "logps/chosen": -0.3623463809490204, "logps/rejected": -0.3546559810638428, "loss": 1.5634, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9058659672737122, "rewards/margins": -0.01922605186700821, "rewards/rejected": -0.8866399526596069, "step": 222 }, { "epoch": 0.4766230296553567, "grad_norm": 11.909400939941406, "learning_rate": 3.128966909361271e-07, "logits/chosen": -1.0778872966766357, "logits/rejected": -0.9947598576545715, "logps/chosen": -0.2876349687576294, "logps/rejected": -0.3500506281852722, "loss": 1.5763, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7190873622894287, "rewards/margins": 0.15603923797607422, "rewards/rejected": -0.8751266002655029, "step": 223 }, { "epoch": 0.4787603526582955, "grad_norm": 3.9968485832214355, "learning_rate": 3.110851015344735e-07, "logits/chosen": -1.043594241142273, "logits/rejected": -1.0751991271972656, "logps/chosen": -0.3403151333332062, "logps/rejected": -0.45080384612083435, "loss": 1.4964, "rewards/accuracies": 0.6875, "rewards/chosen": -0.850787878036499, "rewards/margins": 0.27622172236442566, "rewards/rejected": -1.127009630203247, "step": 224 }, { "epoch": 0.4808976756612343, "grad_norm": 4.30190372467041, "learning_rate": 3.0927009442887437e-07, "logits/chosen": -0.9305320978164673, "logits/rejected": -1.0111606121063232, "logps/chosen": -0.32919758558273315, "logps/rejected": -0.34503474831581116, "loss": 1.5875, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8229939341545105, "rewards/margins": 0.0395929217338562, "rewards/rejected": -0.8625868558883667, "step": 225 }, { "epoch": 0.4830349986641731, "grad_norm": 5.68215799331665, "learning_rate": 3.074517711687549e-07, "logits/chosen": -0.9502861499786377, "logits/rejected": -0.9219777584075928, "logps/chosen": -0.40744659304618835, "logps/rejected": -0.4551170766353607, "loss": 1.5318, "rewards/accuracies": 0.75, "rewards/chosen": -1.0186164379119873, "rewards/margins": 0.11917618662118912, "rewards/rejected": -1.137792706489563, "step": 226 }, { "epoch": 0.48517232166711194, "grad_norm": 5.924420356750488, "learning_rate": 3.056302334890786e-07, "logits/chosen": -1.0599088668823242, "logits/rejected": -0.9398927688598633, "logps/chosen": -0.2768517732620239, "logps/rejected": -0.3650413155555725, "loss": 1.5365, "rewards/accuracies": 0.625, "rewards/chosen": -0.692129373550415, "rewards/margins": 0.22047390043735504, "rewards/rejected": -0.9126032590866089, "step": 227 }, { "epoch": 0.4873096446700508, "grad_norm": 9.18790340423584, "learning_rate": 3.038055833046555e-07, "logits/chosen": -1.23221755027771, "logits/rejected": -1.1094015836715698, "logps/chosen": -0.3468588590621948, "logps/rejected": -0.533679723739624, "loss": 1.5544, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8671470880508423, "rewards/margins": 0.4670522212982178, "rewards/rejected": -1.3341991901397705, "step": 228 }, { "epoch": 0.48944696767298956, "grad_norm": 3.236159563064575, "learning_rate": 3.0197792270443976e-07, "logits/chosen": -1.101015567779541, "logits/rejected": -0.980370044708252, "logps/chosen": -0.5276182293891907, "logps/rejected": -0.2907797694206238, "loss": 1.6115, "rewards/accuracies": 0.5, "rewards/chosen": -1.3190455436706543, "rewards/margins": -0.592096209526062, "rewards/rejected": -0.7269493341445923, "step": 229 }, { "epoch": 0.4915842906759284, "grad_norm": 6.479002475738525, "learning_rate": 3.001473539458182e-07, "logits/chosen": -1.1139984130859375, "logits/rejected": -1.0145281553268433, "logps/chosen": -0.40499821305274963, "logps/rejected": -0.5032440423965454, "loss": 1.5857, "rewards/accuracies": 0.625, "rewards/chosen": -1.0124956369400024, "rewards/margins": 0.24561452865600586, "rewards/rejected": -1.2581101655960083, "step": 230 }, { "epoch": 0.49372161367886724, "grad_norm": 11.510492324829102, "learning_rate": 2.983139794488883e-07, "logits/chosen": -1.1703720092773438, "logits/rejected": -1.0775160789489746, "logps/chosen": -0.4314059615135193, "logps/rejected": -0.39874231815338135, "loss": 1.6011, "rewards/accuracies": 0.5, "rewards/chosen": -1.078514814376831, "rewards/margins": -0.08165915310382843, "rewards/rejected": -0.9968557953834534, "step": 231 }, { "epoch": 0.49585893668180603, "grad_norm": 5.559615612030029, "learning_rate": 2.964779017907287e-07, "logits/chosen": -1.0301462411880493, "logits/rejected": -1.0727837085723877, "logps/chosen": -0.40483570098876953, "logps/rejected": -0.45073747634887695, "loss": 1.5311, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0120892524719238, "rewards/margins": 0.114754319190979, "rewards/rejected": -1.1268435716629028, "step": 232 }, { "epoch": 0.49799625968474487, "grad_norm": 4.339590549468994, "learning_rate": 2.9463922369965915e-07, "logits/chosen": -0.9359559416770935, "logits/rejected": -0.9321252703666687, "logps/chosen": -0.35180893540382385, "logps/rejected": -0.536721408367157, "loss": 1.5723, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8795222640037537, "rewards/margins": 0.4622812271118164, "rewards/rejected": -1.3418035507202148, "step": 233 }, { "epoch": 0.5001335826876837, "grad_norm": 4.583502292633057, "learning_rate": 2.927980480494938e-07, "logits/chosen": -1.0992170572280884, "logits/rejected": -1.0070428848266602, "logps/chosen": -0.36073118448257446, "logps/rejected": -0.3927144706249237, "loss": 1.5751, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9018279314041138, "rewards/margins": 0.07995828986167908, "rewards/rejected": -0.9817862510681152, "step": 234 }, { "epoch": 0.5022709056906225, "grad_norm": 4.529286861419678, "learning_rate": 2.909544778537844e-07, "logits/chosen": -1.1656326055526733, "logits/rejected": -1.0929317474365234, "logps/chosen": -0.3711916506290436, "logps/rejected": -0.3863615393638611, "loss": 1.5221, "rewards/accuracies": 0.375, "rewards/chosen": -0.9279791116714478, "rewards/margins": 0.03792468085885048, "rewards/rejected": -0.9659038186073303, "step": 235 }, { "epoch": 0.5044082286935613, "grad_norm": 10.745676040649414, "learning_rate": 2.8910861626005773e-07, "logits/chosen": -1.058958649635315, "logits/rejected": -0.9348481297492981, "logps/chosen": -0.31637054681777954, "logps/rejected": -0.33828607201576233, "loss": 1.4931, "rewards/accuracies": 0.25, "rewards/chosen": -0.7909263968467712, "rewards/margins": 0.05478885397315025, "rewards/rejected": -0.8457151055335999, "step": 236 }, { "epoch": 0.5065455516965002, "grad_norm": 4.350003242492676, "learning_rate": 2.872605665440436e-07, "logits/chosen": -1.155067801475525, "logits/rejected": -1.044098138809204, "logps/chosen": -0.4006834626197815, "logps/rejected": -0.3987181484699249, "loss": 1.5417, "rewards/accuracies": 0.5, "rewards/chosen": -1.001708745956421, "rewards/margins": -0.004913315176963806, "rewards/rejected": -0.9967952966690063, "step": 237 }, { "epoch": 0.508682874699439, "grad_norm": 4.5963358879089355, "learning_rate": 2.8541043210389726e-07, "logits/chosen": -0.9011512994766235, "logits/rejected": -0.9799545407295227, "logps/chosen": -0.30112171173095703, "logps/rejected": -0.4484432339668274, "loss": 1.4859, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7528042793273926, "rewards/margins": 0.36830389499664307, "rewards/rejected": -1.1211082935333252, "step": 238 }, { "epoch": 0.5108201977023777, "grad_norm": 5.283090591430664, "learning_rate": 2.8355831645441387e-07, "logits/chosen": -1.2146248817443848, "logits/rejected": -1.2574325799942017, "logps/chosen": -0.3450472354888916, "logps/rejected": -0.4763634204864502, "loss": 1.4888, "rewards/accuracies": 0.6875, "rewards/chosen": -0.862618088722229, "rewards/margins": 0.32829049229621887, "rewards/rejected": -1.1909085512161255, "step": 239 }, { "epoch": 0.5129575207053166, "grad_norm": 6.43093729019165, "learning_rate": 2.817043232212371e-07, "logits/chosen": -1.2071186304092407, "logits/rejected": -1.1450533866882324, "logps/chosen": -0.3647967576980591, "logps/rejected": -0.4625674784183502, "loss": 1.5268, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9119919538497925, "rewards/margins": 0.24442686140537262, "rewards/rejected": -1.156418800354004, "step": 240 }, { "epoch": 0.5150948437082554, "grad_norm": 9.717195510864258, "learning_rate": 2.7984855613506106e-07, "logits/chosen": -1.1946227550506592, "logits/rejected": -1.1376502513885498, "logps/chosen": -0.29764774441719055, "logps/rejected": -0.302202045917511, "loss": 1.5322, "rewards/accuracies": 0.5, "rewards/chosen": -0.74411940574646, "rewards/margins": 0.01138581894338131, "rewards/rejected": -0.7555052042007446, "step": 241 }, { "epoch": 0.5172321667111942, "grad_norm": 6.479928493499756, "learning_rate": 2.7799111902582693e-07, "logits/chosen": -1.2251317501068115, "logits/rejected": -1.0719342231750488, "logps/chosen": -0.3156971037387848, "logps/rejected": -0.2400185763835907, "loss": 1.5725, "rewards/accuracies": 0.25, "rewards/chosen": -0.789242684841156, "rewards/margins": -0.18919625878334045, "rewards/rejected": -0.6000465154647827, "step": 242 }, { "epoch": 0.5193694897141331, "grad_norm": 8.900924682617188, "learning_rate": 2.761321158169134e-07, "logits/chosen": -1.0849740505218506, "logits/rejected": -1.1516170501708984, "logps/chosen": -0.35676899552345276, "logps/rejected": -0.5772523283958435, "loss": 1.4924, "rewards/accuracies": 0.375, "rewards/chosen": -0.8919224739074707, "rewards/margins": 0.5512083172798157, "rewards/rejected": -1.4431307315826416, "step": 243 }, { "epoch": 0.5215068127170719, "grad_norm": 5.021285533905029, "learning_rate": 2.74271650519322e-07, "logits/chosen": -1.1510225534439087, "logits/rejected": -1.1225014925003052, "logps/chosen": -0.3502144515514374, "logps/rejected": -0.48459944128990173, "loss": 1.5383, "rewards/accuracies": 0.8125, "rewards/chosen": -0.8755362033843994, "rewards/margins": 0.3359624445438385, "rewards/rejected": -1.2114986181259155, "step": 244 }, { "epoch": 0.5236441357200107, "grad_norm": 6.033867359161377, "learning_rate": 2.7240982722585837e-07, "logits/chosen": -1.0076422691345215, "logits/rejected": -1.0045421123504639, "logps/chosen": -0.3226780295372009, "logps/rejected": -0.37450891733169556, "loss": 1.5745, "rewards/accuracies": 0.625, "rewards/chosen": -0.8066950440406799, "rewards/margins": 0.1295773833990097, "rewards/rejected": -0.9362723231315613, "step": 245 }, { "epoch": 0.5257814587229495, "grad_norm": 7.610095977783203, "learning_rate": 2.705467501053076e-07, "logits/chosen": -1.3070695400238037, "logits/rejected": -1.360163688659668, "logps/chosen": -0.4132193624973297, "logps/rejected": -0.5460841059684753, "loss": 1.5482, "rewards/accuracies": 0.625, "rewards/chosen": -1.033048391342163, "rewards/margins": 0.3321617841720581, "rewards/rejected": -1.3652101755142212, "step": 246 }, { "epoch": 0.5279187817258884, "grad_norm": 5.6094770431518555, "learning_rate": 2.6868252339660607e-07, "logits/chosen": -0.9480774998664856, "logits/rejected": -0.9445351362228394, "logps/chosen": -0.5733252763748169, "logps/rejected": -1.0633982419967651, "loss": 1.5284, "rewards/accuracies": 0.875, "rewards/chosen": -1.433313250541687, "rewards/margins": 1.225182294845581, "rewards/rejected": -2.6584954261779785, "step": 247 }, { "epoch": 0.5300561047288271, "grad_norm": 16.856760025024414, "learning_rate": 2.6681725140300995e-07, "logits/chosen": -1.1925255060195923, "logits/rejected": -1.1129463911056519, "logps/chosen": -0.28040605783462524, "logps/rejected": -0.36458098888397217, "loss": 1.5368, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7010151743888855, "rewards/margins": 0.21043738722801208, "rewards/rejected": -0.9114525318145752, "step": 248 }, { "epoch": 0.5321934277317659, "grad_norm": 6.675515174865723, "learning_rate": 2.6495103848625854e-07, "logits/chosen": -1.2934060096740723, "logits/rejected": -1.17371666431427, "logps/chosen": -0.3621112108230591, "logps/rejected": -0.4810316562652588, "loss": 1.5586, "rewards/accuracies": 0.5, "rewards/chosen": -0.9052779674530029, "rewards/margins": 0.29730114340782166, "rewards/rejected": -1.2025790214538574, "step": 249 }, { "epoch": 0.5343307507347048, "grad_norm": 15.49481201171875, "learning_rate": 2.63083989060736e-07, "logits/chosen": -1.019038438796997, "logits/rejected": -0.9999558925628662, "logps/chosen": -0.431622713804245, "logps/rejected": -0.620360255241394, "loss": 1.55, "rewards/accuracies": 0.5, "rewards/chosen": -1.0790568590164185, "rewards/margins": 0.47184401750564575, "rewards/rejected": -1.550900936126709, "step": 250 }, { "epoch": 0.5364680737376436, "grad_norm": 4.874868392944336, "learning_rate": 2.6121620758762875e-07, "logits/chosen": -1.1522804498672485, "logits/rejected": -1.144692301750183, "logps/chosen": -0.40826401114463806, "logps/rejected": -0.4715278744697571, "loss": 1.5317, "rewards/accuracies": 0.625, "rewards/chosen": -1.0206599235534668, "rewards/margins": 0.15815965831279755, "rewards/rejected": -1.1788195371627808, "step": 251 }, { "epoch": 0.5386053967405824, "grad_norm": 4.113776206970215, "learning_rate": 2.593477985690815e-07, "logits/chosen": -1.0712709426879883, "logits/rejected": -1.1005451679229736, "logps/chosen": -0.5715100765228271, "logps/rejected": -0.6493417620658875, "loss": 1.5129, "rewards/accuracies": 0.5, "rewards/chosen": -1.4287753105163574, "rewards/margins": 0.19457919895648956, "rewards/rejected": -1.6233545541763306, "step": 252 }, { "epoch": 0.5407427197435213, "grad_norm": 17.1635799407959, "learning_rate": 2.574788665423496e-07, "logits/chosen": -0.9928967356681824, "logits/rejected": -0.9838371276855469, "logps/chosen": -0.3351861536502838, "logps/rejected": -0.3290242850780487, "loss": 1.5488, "rewards/accuracies": 0.25, "rewards/chosen": -0.8379653692245483, "rewards/margins": -0.01540469378232956, "rewards/rejected": -0.822560727596283, "step": 253 }, { "epoch": 0.5428800427464601, "grad_norm": 8.28346061706543, "learning_rate": 2.5560951607395126e-07, "logits/chosen": -1.1226708889007568, "logits/rejected": -1.0680346488952637, "logps/chosen": -0.3342251777648926, "logps/rejected": -0.3822442591190338, "loss": 1.5604, "rewards/accuracies": 0.625, "rewards/chosen": -0.8355628252029419, "rewards/margins": 0.12004776298999786, "rewards/rejected": -0.9556106925010681, "step": 254 }, { "epoch": 0.5450173657493989, "grad_norm": 12.173513412475586, "learning_rate": 2.537398517538159e-07, "logits/chosen": -1.1180171966552734, "logits/rejected": -1.1232236623764038, "logps/chosen": -0.3291173279285431, "logps/rejected": -0.5288177132606506, "loss": 1.4907, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8227933645248413, "rewards/margins": 0.49925094842910767, "rewards/rejected": -1.3220442533493042, "step": 255 }, { "epoch": 0.5471546887523377, "grad_norm": 4.996148109436035, "learning_rate": 2.518699781894332e-07, "logits/chosen": -1.0864285230636597, "logits/rejected": -1.0856531858444214, "logps/chosen": -0.46618932485580444, "logps/rejected": -0.9816129803657532, "loss": 1.5076, "rewards/accuracies": 0.75, "rewards/chosen": -1.165473222732544, "rewards/margins": 1.2885593175888062, "rewards/rejected": -2.4540326595306396, "step": 256 }, { "epoch": 0.5492920117552765, "grad_norm": 5.049304008483887, "learning_rate": 2.5e-07, "logits/chosen": -0.9920480251312256, "logits/rejected": -0.897991418838501, "logps/chosen": -0.3009772002696991, "logps/rejected": -0.3982135057449341, "loss": 1.6439, "rewards/accuracies": 0.4375, "rewards/chosen": -0.7524430155754089, "rewards/margins": 0.24309074878692627, "rewards/rejected": -0.9955337643623352, "step": 257 }, { "epoch": 0.5514293347582153, "grad_norm": 4.575491428375244, "learning_rate": 2.4813002181056676e-07, "logits/chosen": -1.0483980178833008, "logits/rejected": -1.040475845336914, "logps/chosen": -0.2760324478149414, "logps/rejected": -0.5634697675704956, "loss": 1.5932, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6900811791419983, "rewards/margins": 0.718593180179596, "rewards/rejected": -1.4086742401123047, "step": 258 }, { "epoch": 0.5535666577611541, "grad_norm": 4.031703948974609, "learning_rate": 2.4626014824618413e-07, "logits/chosen": -1.2272746562957764, "logits/rejected": -1.2073853015899658, "logps/chosen": -0.4353184700012207, "logps/rejected": -0.5070162415504456, "loss": 1.5153, "rewards/accuracies": 0.75, "rewards/chosen": -1.0882960557937622, "rewards/margins": 0.17924460768699646, "rewards/rejected": -1.2675405740737915, "step": 259 }, { "epoch": 0.555703980764093, "grad_norm": 8.027057647705078, "learning_rate": 2.4439048392604877e-07, "logits/chosen": -0.953754186630249, "logits/rejected": -0.9900184869766235, "logps/chosen": -0.2740909159183502, "logps/rejected": -0.3458973467350006, "loss": 1.5291, "rewards/accuracies": 0.6875, "rewards/chosen": -0.6852273344993591, "rewards/margins": 0.17951609194278717, "rewards/rejected": -0.8647434711456299, "step": 260 }, { "epoch": 0.5578413037670318, "grad_norm": 7.306129455566406, "learning_rate": 2.4252113345765043e-07, "logits/chosen": -0.9035928845405579, "logits/rejected": -0.8614873290061951, "logps/chosen": -0.2865443229675293, "logps/rejected": -0.32079729437828064, "loss": 1.5665, "rewards/accuracies": 0.625, "rewards/chosen": -0.7163608074188232, "rewards/margins": 0.0856325626373291, "rewards/rejected": -0.8019933104515076, "step": 261 }, { "epoch": 0.5599786267699706, "grad_norm": 3.2643260955810547, "learning_rate": 2.406522014309186e-07, "logits/chosen": -1.1808401346206665, "logits/rejected": -1.1874431371688843, "logps/chosen": -0.5122575163841248, "logps/rejected": -0.8233806490898132, "loss": 1.5851, "rewards/accuracies": 0.625, "rewards/chosen": -1.2806435823440552, "rewards/margins": 0.7778077721595764, "rewards/rejected": -2.0584514141082764, "step": 262 }, { "epoch": 0.5621159497729095, "grad_norm": 3.0484321117401123, "learning_rate": 2.3878379241237134e-07, "logits/chosen": -1.1015522480010986, "logits/rejected": -1.1043397188186646, "logps/chosen": -0.5216892957687378, "logps/rejected": -0.5477871298789978, "loss": 1.4888, "rewards/accuracies": 0.625, "rewards/chosen": -1.3042232990264893, "rewards/margins": 0.06524449586868286, "rewards/rejected": -1.3694677352905273, "step": 263 }, { "epoch": 0.5642532727758482, "grad_norm": 8.040013313293457, "learning_rate": 2.3691601093926402e-07, "logits/chosen": -1.0679914951324463, "logits/rejected": -1.041649580001831, "logps/chosen": -0.4239467978477478, "logps/rejected": -0.427889347076416, "loss": 1.6854, "rewards/accuracies": 0.5, "rewards/chosen": -1.0598669052124023, "rewards/margins": 0.009856484830379486, "rewards/rejected": -1.06972336769104, "step": 264 }, { "epoch": 0.566390595778787, "grad_norm": 10.049110412597656, "learning_rate": 2.3504896151374144e-07, "logits/chosen": -1.1767913103103638, "logits/rejected": -1.2240692377090454, "logps/chosen": -0.4159534275531769, "logps/rejected": -0.5419010519981384, "loss": 1.5352, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0398834943771362, "rewards/margins": 0.3148690462112427, "rewards/rejected": -1.3547526597976685, "step": 265 }, { "epoch": 0.5685279187817259, "grad_norm": 8.153444290161133, "learning_rate": 2.3318274859699008e-07, "logits/chosen": -1.063308596611023, "logits/rejected": -1.164639949798584, "logps/chosen": -0.2907513678073883, "logps/rejected": -0.5243133306503296, "loss": 1.6477, "rewards/accuracies": 0.6875, "rewards/chosen": -0.7268784046173096, "rewards/margins": 0.5839048624038696, "rewards/rejected": -1.3107832670211792, "step": 266 }, { "epoch": 0.5706652417846647, "grad_norm": 8.460691452026367, "learning_rate": 2.3131747660339394e-07, "logits/chosen": -1.2165307998657227, "logits/rejected": -1.1944361925125122, "logps/chosen": -0.5601080656051636, "logps/rejected": -0.47026118636131287, "loss": 1.5634, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4002699851989746, "rewards/margins": -0.2246171087026596, "rewards/rejected": -1.1756529808044434, "step": 267 }, { "epoch": 0.5728025647876035, "grad_norm": 13.028862953186035, "learning_rate": 2.2945324989469243e-07, "logits/chosen": -1.0125137567520142, "logits/rejected": -0.9787082672119141, "logps/chosen": -0.3832467794418335, "logps/rejected": -0.7757288217544556, "loss": 1.4993, "rewards/accuracies": 0.4375, "rewards/chosen": -0.958116888999939, "rewards/margins": 0.9812053442001343, "rewards/rejected": -1.9393221139907837, "step": 268 }, { "epoch": 0.5749398877905424, "grad_norm": 10.404675483703613, "learning_rate": 2.2759017277414164e-07, "logits/chosen": -1.1808825731277466, "logits/rejected": -1.1194167137145996, "logps/chosen": -0.43756401538848877, "logps/rejected": -0.3946504294872284, "loss": 1.6378, "rewards/accuracies": 0.3125, "rewards/chosen": -1.0939099788665771, "rewards/margins": -0.10728396475315094, "rewards/rejected": -0.9866260290145874, "step": 269 }, { "epoch": 0.5770772107934812, "grad_norm": 7.459733963012695, "learning_rate": 2.2572834948067795e-07, "logits/chosen": -0.9175713062286377, "logits/rejected": -0.9572230577468872, "logps/chosen": -0.2940235137939453, "logps/rejected": -0.3464244604110718, "loss": 1.6275, "rewards/accuracies": 0.375, "rewards/chosen": -0.7350587844848633, "rewards/margins": 0.13100232183933258, "rewards/rejected": -0.8660610914230347, "step": 270 }, { "epoch": 0.57921453379642, "grad_norm": 6.2825493812561035, "learning_rate": 2.2386788418308665e-07, "logits/chosen": -1.0154887437820435, "logits/rejected": -1.0528539419174194, "logps/chosen": -0.5251376628875732, "logps/rejected": -0.7547603845596313, "loss": 1.5214, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3128442764282227, "rewards/margins": 0.5740568041801453, "rewards/rejected": -1.8869010210037231, "step": 271 }, { "epoch": 0.5813518567993589, "grad_norm": 4.264716148376465, "learning_rate": 2.2200888097417302e-07, "logits/chosen": -1.043276071548462, "logits/rejected": -0.9186975955963135, "logps/chosen": -0.39481961727142334, "logps/rejected": -0.5242050886154175, "loss": 1.5337, "rewards/accuracies": 0.625, "rewards/chosen": -0.9870489835739136, "rewards/margins": 0.32346370816230774, "rewards/rejected": -1.310512661933899, "step": 272 }, { "epoch": 0.5834891798022976, "grad_norm": 4.6611199378967285, "learning_rate": 2.2015144386493895e-07, "logits/chosen": -0.9979356527328491, "logits/rejected": -0.9526849985122681, "logps/chosen": -0.39222562313079834, "logps/rejected": -0.46204763650894165, "loss": 1.4999, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9805639982223511, "rewards/margins": 0.1745550036430359, "rewards/rejected": -1.1551190614700317, "step": 273 }, { "epoch": 0.5856265028052364, "grad_norm": 4.630171298980713, "learning_rate": 2.1829567677876297e-07, "logits/chosen": -0.9676195979118347, "logits/rejected": -0.9666755199432373, "logps/chosen": -0.35061851143836975, "logps/rejected": -0.35894879698753357, "loss": 1.609, "rewards/accuracies": 0.125, "rewards/chosen": -0.8765462636947632, "rewards/margins": 0.020825695246458054, "rewards/rejected": -0.8973720073699951, "step": 274 }, { "epoch": 0.5877638258081752, "grad_norm": 11.871662139892578, "learning_rate": 2.164416835455862e-07, "logits/chosen": -0.7467477321624756, "logits/rejected": -0.6393258571624756, "logps/chosen": -0.502008318901062, "logps/rejected": -0.44732266664505005, "loss": 1.5655, "rewards/accuracies": 0.375, "rewards/chosen": -1.2550209760665894, "rewards/margins": -0.13671430945396423, "rewards/rejected": -1.1183066368103027, "step": 275 }, { "epoch": 0.5899011488111141, "grad_norm": 3.6022424697875977, "learning_rate": 2.1458956789610277e-07, "logits/chosen": -1.2034939527511597, "logits/rejected": -1.0202971696853638, "logps/chosen": -0.3793608248233795, "logps/rejected": -0.33719444274902344, "loss": 1.5627, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9484022259712219, "rewards/margins": -0.10541604459285736, "rewards/rejected": -0.8429861068725586, "step": 276 }, { "epoch": 0.5920384718140529, "grad_norm": 6.574893474578857, "learning_rate": 2.1273943345595635e-07, "logits/chosen": -1.2551283836364746, "logits/rejected": -1.2000017166137695, "logps/chosen": -0.4082186818122864, "logps/rejected": -0.6158214807510376, "loss": 1.5529, "rewards/accuracies": 0.5, "rewards/chosen": -1.0205466747283936, "rewards/margins": 0.51900714635849, "rewards/rejected": -1.5395537614822388, "step": 277 }, { "epoch": 0.5941757948169917, "grad_norm": 5.325026035308838, "learning_rate": 2.1089138373994222e-07, "logits/chosen": -1.0981683731079102, "logits/rejected": -1.093741774559021, "logps/chosen": -0.4150196313858032, "logps/rejected": -0.5339372754096985, "loss": 1.5688, "rewards/accuracies": 0.75, "rewards/chosen": -1.0375490188598633, "rewards/margins": 0.2972941994667053, "rewards/rejected": -1.3348432779312134, "step": 278 }, { "epoch": 0.5963131178199306, "grad_norm": 17.316631317138672, "learning_rate": 2.0904552214621556e-07, "logits/chosen": -1.1414576768875122, "logits/rejected": -1.1112793684005737, "logps/chosen": -0.6599245071411133, "logps/rejected": -0.3380126357078552, "loss": 1.6195, "rewards/accuracies": 0.5, "rewards/chosen": -1.6498112678527832, "rewards/margins": -0.8047796487808228, "rewards/rejected": -0.8450315594673157, "step": 279 }, { "epoch": 0.5984504408228694, "grad_norm": 4.380247116088867, "learning_rate": 2.072019519505062e-07, "logits/chosen": -0.9662964940071106, "logits/rejected": -0.9849826693534851, "logps/chosen": -0.36699962615966797, "logps/rejected": -0.3455093204975128, "loss": 1.5053, "rewards/accuracies": 0.375, "rewards/chosen": -0.9174990653991699, "rewards/margins": -0.053725723177194595, "rewards/rejected": -0.8637734055519104, "step": 280 }, { "epoch": 0.6005877638258081, "grad_norm": 7.828254699707031, "learning_rate": 2.0536077630034085e-07, "logits/chosen": -0.9694425463676453, "logits/rejected": -0.8208516240119934, "logps/chosen": -0.4557761251926422, "logps/rejected": -0.6782093048095703, "loss": 1.6184, "rewards/accuracies": 0.5, "rewards/chosen": -1.1394402980804443, "rewards/margins": 0.5560829043388367, "rewards/rejected": -1.6955231428146362, "step": 281 }, { "epoch": 0.602725086828747, "grad_norm": 7.952332496643066, "learning_rate": 2.0352209820927135e-07, "logits/chosen": -0.9816855192184448, "logits/rejected": -0.8845440149307251, "logps/chosen": -0.3230597972869873, "logps/rejected": -0.4072348475456238, "loss": 1.5012, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8076494932174683, "rewards/margins": 0.21043761074543, "rewards/rejected": -1.0180871486663818, "step": 282 }, { "epoch": 0.6048624098316858, "grad_norm": 8.255958557128906, "learning_rate": 2.0168602055111173e-07, "logits/chosen": -1.1203254461288452, "logits/rejected": -1.107031226158142, "logps/chosen": -0.6907448768615723, "logps/rejected": -0.5776211023330688, "loss": 1.6044, "rewards/accuracies": 0.5, "rewards/chosen": -1.7268621921539307, "rewards/margins": -0.28280916810035706, "rewards/rejected": -1.4440529346466064, "step": 283 }, { "epoch": 0.6069997328346246, "grad_norm": 11.462747573852539, "learning_rate": 1.998526460541818e-07, "logits/chosen": -1.0083472728729248, "logits/rejected": -0.9932087659835815, "logps/chosen": -0.5044468641281128, "logps/rejected": -0.4264739453792572, "loss": 1.6586, "rewards/accuracies": 0.5, "rewards/chosen": -1.2611171007156372, "rewards/margins": -0.19493228197097778, "rewards/rejected": -1.0661848783493042, "step": 284 }, { "epoch": 0.6091370558375635, "grad_norm": 4.765872478485107, "learning_rate": 1.980220772955602e-07, "logits/chosen": -1.035547137260437, "logits/rejected": -1.0857359170913696, "logps/chosen": -0.41963931918144226, "logps/rejected": -0.592042863368988, "loss": 1.5091, "rewards/accuracies": 0.5625, "rewards/chosen": -1.049098253250122, "rewards/margins": 0.43100887537002563, "rewards/rejected": -1.4801071882247925, "step": 285 }, { "epoch": 0.6112743788405023, "grad_norm": 7.985474586486816, "learning_rate": 1.961944166953445e-07, "logits/chosen": -0.8251385688781738, "logits/rejected": -0.9071054458618164, "logps/chosen": -0.3732144236564636, "logps/rejected": -0.4104728400707245, "loss": 1.5169, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9330360889434814, "rewards/margins": 0.09314604848623276, "rewards/rejected": -1.0261821746826172, "step": 286 }, { "epoch": 0.6134117018434411, "grad_norm": 7.316262722015381, "learning_rate": 1.9436976651092142e-07, "logits/chosen": -0.9544340372085571, "logits/rejected": -0.898868203163147, "logps/chosen": -0.35007724165916443, "logps/rejected": -0.495257705450058, "loss": 1.6104, "rewards/accuracies": 0.625, "rewards/chosen": -0.8751930594444275, "rewards/margins": 0.3629511594772339, "rewards/rejected": -1.2381441593170166, "step": 287 }, { "epoch": 0.6155490248463799, "grad_norm": 3.980193614959717, "learning_rate": 1.9254822883124517e-07, "logits/chosen": -1.2356715202331543, "logits/rejected": -1.1466394662857056, "logps/chosen": -0.418282151222229, "logps/rejected": -0.5082724094390869, "loss": 1.5123, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0457054376602173, "rewards/margins": 0.22497567534446716, "rewards/rejected": -1.2706811428070068, "step": 288 }, { "epoch": 0.6176863478493188, "grad_norm": 6.745012283325195, "learning_rate": 1.9072990557112564e-07, "logits/chosen": -1.2313592433929443, "logits/rejected": -1.1524139642715454, "logps/chosen": -0.34664469957351685, "logps/rejected": -0.5482650399208069, "loss": 1.5042, "rewards/accuracies": 0.625, "rewards/chosen": -0.8666118383407593, "rewards/margins": 0.504050612449646, "rewards/rejected": -1.3706625699996948, "step": 289 }, { "epoch": 0.6198236708522575, "grad_norm": 9.481700897216797, "learning_rate": 1.8891489846552644e-07, "logits/chosen": -1.081266164779663, "logits/rejected": -1.085394263267517, "logps/chosen": -0.37148210406303406, "logps/rejected": -0.5117133855819702, "loss": 1.5743, "rewards/accuracies": 0.5, "rewards/chosen": -0.928705096244812, "rewards/margins": 0.35057833790779114, "rewards/rejected": -1.2792835235595703, "step": 290 }, { "epoch": 0.6219609938551963, "grad_norm": 5.410580635070801, "learning_rate": 1.8710330906387286e-07, "logits/chosen": -1.0288105010986328, "logits/rejected": -1.0116032361984253, "logps/chosen": -0.3644832670688629, "logps/rejected": -0.4690641462802887, "loss": 1.4934, "rewards/accuracies": 0.625, "rewards/chosen": -0.9112080931663513, "rewards/margins": 0.261452317237854, "rewards/rejected": -1.1726603507995605, "step": 291 }, { "epoch": 0.6240983168581352, "grad_norm": 12.723451614379883, "learning_rate": 1.8529523872436977e-07, "logits/chosen": -1.0818991661071777, "logits/rejected": -1.0822491645812988, "logps/chosen": -0.2900945246219635, "logps/rejected": -0.44144943356513977, "loss": 1.6044, "rewards/accuracies": 0.8125, "rewards/chosen": -0.7252362370491028, "rewards/margins": 0.37838736176490784, "rewards/rejected": -1.103623628616333, "step": 292 }, { "epoch": 0.626235639861074, "grad_norm": 18.391582489013672, "learning_rate": 1.8349078860833124e-07, "logits/chosen": -1.2308346033096313, "logits/rejected": -1.1760648488998413, "logps/chosen": -0.6205483675003052, "logps/rejected": -0.3508188724517822, "loss": 1.6365, "rewards/accuracies": 0.375, "rewards/chosen": -1.5513708591461182, "rewards/margins": -0.6743236780166626, "rewards/rejected": -0.8770472407341003, "step": 293 }, { "epoch": 0.6283729628640128, "grad_norm": 4.619331359863281, "learning_rate": 1.8169005967452e-07, "logits/chosen": -1.2816352844238281, "logits/rejected": -1.2922104597091675, "logps/chosen": -0.3991982340812683, "logps/rejected": -0.504127562046051, "loss": 1.5704, "rewards/accuracies": 0.5, "rewards/chosen": -0.9979956150054932, "rewards/margins": 0.2623233199119568, "rewards/rejected": -1.2603188753128052, "step": 294 }, { "epoch": 0.6305102858669517, "grad_norm": 5.379226207733154, "learning_rate": 1.7989315267349933e-07, "logits/chosen": -1.0375932455062866, "logits/rejected": -0.987227737903595, "logps/chosen": -0.5034650564193726, "logps/rejected": -0.6317480206489563, "loss": 1.5085, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2586627006530762, "rewards/margins": 0.3207073211669922, "rewards/rejected": -1.5793699026107788, "step": 295 }, { "epoch": 0.6326476088698905, "grad_norm": 15.063772201538086, "learning_rate": 1.781001681419957e-07, "logits/chosen": -1.0075099468231201, "logits/rejected": -0.9762495160102844, "logps/chosen": -0.5012757778167725, "logps/rejected": -0.4941572844982147, "loss": 1.4992, "rewards/accuracies": 0.5, "rewards/chosen": -1.2531894445419312, "rewards/margins": -0.01779627427458763, "rewards/rejected": -1.2353932857513428, "step": 296 }, { "epoch": 0.6347849318728293, "grad_norm": 6.187438488006592, "learning_rate": 1.763112063972739e-07, "logits/chosen": -1.086329460144043, "logits/rejected": -1.003612995147705, "logps/chosen": -0.43468421697616577, "logps/rejected": -0.623386800289154, "loss": 1.5243, "rewards/accuracies": 0.625, "rewards/chosen": -1.0867105722427368, "rewards/margins": 0.4717563986778259, "rewards/rejected": -1.5584670305252075, "step": 297 }, { "epoch": 0.6369222548757681, "grad_norm": 16.858556747436523, "learning_rate": 1.745263675315245e-07, "logits/chosen": -0.9435803890228271, "logits/rejected": -0.9401760697364807, "logps/chosen": -0.41162610054016113, "logps/rejected": -0.7601633667945862, "loss": 1.5119, "rewards/accuracies": 0.625, "rewards/chosen": -1.0290653705596924, "rewards/margins": 0.871343195438385, "rewards/rejected": -1.9004085063934326, "step": 298 }, { "epoch": 0.6390595778787069, "grad_norm": 9.424429893493652, "learning_rate": 1.7274575140626315e-07, "logits/chosen": -1.1454746723175049, "logits/rejected": -1.0624852180480957, "logps/chosen": -0.5017335414886475, "logps/rejected": -0.5628975629806519, "loss": 1.6427, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2543339729309082, "rewards/margins": 0.1529100388288498, "rewards/rejected": -1.4072438478469849, "step": 299 }, { "epoch": 0.6411969008816457, "grad_norm": 5.96793794631958, "learning_rate": 1.7096945764674398e-07, "logits/chosen": -0.9207834005355835, "logits/rejected": -0.9202168583869934, "logps/chosen": -0.3956447243690491, "logps/rejected": -0.40050509572029114, "loss": 1.6262, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9891117811203003, "rewards/margins": 0.012150941416621208, "rewards/rejected": -1.0012627840042114, "step": 300 }, { "epoch": 0.6433342238845846, "grad_norm": 5.713474750518799, "learning_rate": 1.6919758563638502e-07, "logits/chosen": -0.9401556253433228, "logits/rejected": -0.8605173826217651, "logps/chosen": -0.4167335331439972, "logps/rejected": -0.5897922515869141, "loss": 1.527, "rewards/accuracies": 0.625, "rewards/chosen": -1.041833758354187, "rewards/margins": 0.4326467216014862, "rewards/rejected": -1.4744806289672852, "step": 301 }, { "epoch": 0.6454715468875234, "grad_norm": 11.05538272857666, "learning_rate": 1.674302345112083e-07, "logits/chosen": -1.0368751287460327, "logits/rejected": -1.1599576473236084, "logps/chosen": -0.42875924706459045, "logps/rejected": -0.7052878737449646, "loss": 1.4713, "rewards/accuracies": 0.625, "rewards/chosen": -1.071898102760315, "rewards/margins": 0.6913214325904846, "rewards/rejected": -1.7632195949554443, "step": 302 }, { "epoch": 0.6476088698904622, "grad_norm": 7.134030818939209, "learning_rate": 1.656675031542925e-07, "logits/chosen": -1.1293184757232666, "logits/rejected": -1.1065864562988281, "logps/chosen": -0.4180205166339874, "logps/rejected": -0.4501388669013977, "loss": 1.5318, "rewards/accuracies": 0.5, "rewards/chosen": -1.045051097869873, "rewards/margins": 0.08029599487781525, "rewards/rejected": -1.1253471374511719, "step": 303 }, { "epoch": 0.649746192893401, "grad_norm": 9.838216781616211, "learning_rate": 1.6390949019024118e-07, "logits/chosen": -1.2255228757858276, "logits/rejected": -1.0440919399261475, "logps/chosen": -0.34637248516082764, "logps/rejected": -0.332830548286438, "loss": 1.4995, "rewards/accuracies": 0.625, "rewards/chosen": -0.8659312725067139, "rewards/margins": -0.0338548980653286, "rewards/rejected": -0.8320763111114502, "step": 304 }, { "epoch": 0.6518835158963399, "grad_norm": 3.633300542831421, "learning_rate": 1.621562939796643e-07, "logits/chosen": -1.0595769882202148, "logits/rejected": -1.060139536857605, "logps/chosen": -0.4522903561592102, "logps/rejected": -0.7426398992538452, "loss": 1.5239, "rewards/accuracies": 0.4375, "rewards/chosen": -1.1307260990142822, "rewards/margins": 0.7258738279342651, "rewards/rejected": -1.8565996885299683, "step": 305 }, { "epoch": 0.6540208388992786, "grad_norm": 6.867697715759277, "learning_rate": 1.6040801261367493e-07, "logits/chosen": -1.1086713075637817, "logits/rejected": -1.1967618465423584, "logps/chosen": -0.42471063137054443, "logps/rejected": -0.4879637360572815, "loss": 1.4941, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0617766380310059, "rewards/margins": 0.15813271701335907, "rewards/rejected": -1.2199093103408813, "step": 306 }, { "epoch": 0.6561581619022174, "grad_norm": 9.760129928588867, "learning_rate": 1.5866474390840124e-07, "logits/chosen": -1.0829546451568604, "logits/rejected": -1.0814926624298096, "logps/chosen": -0.4132816791534424, "logps/rejected": -0.586501955986023, "loss": 1.5785, "rewards/accuracies": 0.5625, "rewards/chosen": -1.033204197883606, "rewards/margins": 0.43305063247680664, "rewards/rejected": -1.466254711151123, "step": 307 }, { "epoch": 0.6582954849051563, "grad_norm": 4.1318511962890625, "learning_rate": 1.569265853995137e-07, "logits/chosen": -0.9130831360816956, "logits/rejected": -1.0174808502197266, "logps/chosen": -0.3561224639415741, "logps/rejected": -0.47521257400512695, "loss": 1.4836, "rewards/accuracies": 0.5, "rewards/chosen": -0.8903061747550964, "rewards/margins": 0.2977251708507538, "rewards/rejected": -1.1880314350128174, "step": 308 }, { "epoch": 0.6604328079080951, "grad_norm": 5.264586925506592, "learning_rate": 1.5519363433676791e-07, "logits/chosen": -1.2580910921096802, "logits/rejected": -1.2599362134933472, "logps/chosen": -0.42455989122390747, "logps/rejected": -0.5628042221069336, "loss": 1.6118, "rewards/accuracies": 0.5, "rewards/chosen": -1.0613996982574463, "rewards/margins": 0.34561091661453247, "rewards/rejected": -1.4070106744766235, "step": 309 }, { "epoch": 0.6625701309110339, "grad_norm": 19.9966983795166, "learning_rate": 1.5346598767856345e-07, "logits/chosen": -0.8979520201683044, "logits/rejected": -0.9155081510543823, "logps/chosen": -0.335290789604187, "logps/rejected": -0.43495267629623413, "loss": 1.6585, "rewards/accuracies": 0.625, "rewards/chosen": -0.8382269740104675, "rewards/margins": 0.24915480613708496, "rewards/rejected": -1.0873818397521973, "step": 310 }, { "epoch": 0.6647074539139728, "grad_norm": 10.383206367492676, "learning_rate": 1.517437420865191e-07, "logits/chosen": -1.4863961935043335, "logits/rejected": -1.2771762609481812, "logps/chosen": -0.33609017729759216, "logps/rejected": -0.5989465713500977, "loss": 1.5329, "rewards/accuracies": 0.4375, "rewards/chosen": -0.8402254581451416, "rewards/margins": 0.6571409106254578, "rewards/rejected": -1.4973664283752441, "step": 311 }, { "epoch": 0.6668447769169116, "grad_norm": 4.85746431350708, "learning_rate": 1.500269939200648e-07, "logits/chosen": -1.1300700902938843, "logits/rejected": -1.1000648736953735, "logps/chosen": -0.4227021336555481, "logps/rejected": -0.3756105899810791, "loss": 1.572, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0567553043365479, "rewards/margins": -0.1177288144826889, "rewards/rejected": -0.939026415348053, "step": 312 }, { "epoch": 0.6689820999198504, "grad_norm": 15.065166473388672, "learning_rate": 1.4831583923104998e-07, "logits/chosen": -1.2687509059906006, "logits/rejected": -1.2387371063232422, "logps/chosen": -0.37272506952285767, "logps/rejected": -0.4628967344760895, "loss": 1.5731, "rewards/accuracies": 0.625, "rewards/chosen": -0.9318127632141113, "rewards/margins": 0.22542911767959595, "rewards/rejected": -1.157241940498352, "step": 313 }, { "epoch": 0.6711194229227893, "grad_norm": 4.742990970611572, "learning_rate": 1.4661037375836987e-07, "logits/chosen": -1.1166198253631592, "logits/rejected": -1.1708546876907349, "logps/chosen": -0.40323004126548767, "logps/rejected": -0.49593961238861084, "loss": 1.5854, "rewards/accuracies": 0.375, "rewards/chosen": -1.0080751180648804, "rewards/margins": 0.2317739725112915, "rewards/rejected": -1.2398490905761719, "step": 314 }, { "epoch": 0.673256745925728, "grad_norm": 6.135270595550537, "learning_rate": 1.4491069292260866e-07, "logits/chosen": -1.0454214811325073, "logits/rejected": -0.9797170162200928, "logps/chosen": -0.4790341258049011, "logps/rejected": -0.5334821343421936, "loss": 1.5961, "rewards/accuracies": 0.4375, "rewards/chosen": -1.1975852251052856, "rewards/margins": 0.13612008094787598, "rewards/rejected": -1.3337054252624512, "step": 315 }, { "epoch": 0.6753940689286668, "grad_norm": 5.529047012329102, "learning_rate": 1.432168918207009e-07, "logits/chosen": -0.9548214673995972, "logits/rejected": -1.037723183631897, "logps/chosen": -0.34955471754074097, "logps/rejected": -0.6267892122268677, "loss": 1.5352, "rewards/accuracies": 0.5, "rewards/chosen": -0.87388676404953, "rewards/margins": 0.6930862069129944, "rewards/rejected": -1.5669729709625244, "step": 316 }, { "epoch": 0.6775313919316056, "grad_norm": 4.692312240600586, "learning_rate": 1.4152906522061047e-07, "logits/chosen": -1.0882840156555176, "logits/rejected": -1.0136744976043701, "logps/chosen": -0.31677815318107605, "logps/rejected": -0.42752817273139954, "loss": 1.522, "rewards/accuracies": 0.625, "rewards/chosen": -0.7919453978538513, "rewards/margins": 0.2768751382827759, "rewards/rejected": -1.0688204765319824, "step": 317 }, { "epoch": 0.6796687149345445, "grad_norm": 10.370941162109375, "learning_rate": 1.3984730755602903e-07, "logits/chosen": -1.161927580833435, "logits/rejected": -1.0617276430130005, "logps/chosen": -0.534487783908844, "logps/rejected": -0.6021491289138794, "loss": 1.5201, "rewards/accuracies": 0.6875, "rewards/chosen": -1.3362195491790771, "rewards/margins": 0.1691533476114273, "rewards/rejected": -1.5053728818893433, "step": 318 }, { "epoch": 0.6818060379374833, "grad_norm": 9.610735893249512, "learning_rate": 1.381717129210918e-07, "logits/chosen": -1.1923787593841553, "logits/rejected": -1.2188538312911987, "logps/chosen": -0.375224769115448, "logps/rejected": -0.7198653817176819, "loss": 1.5779, "rewards/accuracies": 0.75, "rewards/chosen": -0.9380618929862976, "rewards/margins": 0.8616017699241638, "rewards/rejected": -1.7996635437011719, "step": 319 }, { "epoch": 0.6839433609404221, "grad_norm": 5.218975067138672, "learning_rate": 1.365023750651133e-07, "logits/chosen": -1.1556600332260132, "logits/rejected": -1.095563530921936, "logps/chosen": -0.37500467896461487, "logps/rejected": -0.4297294020652771, "loss": 1.5315, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9375116229057312, "rewards/margins": 0.1368117332458496, "rewards/rejected": -1.0743234157562256, "step": 320 }, { "epoch": 0.686080683943361, "grad_norm": 6.690799236297607, "learning_rate": 1.3483938738734195e-07, "logits/chosen": -0.8860509395599365, "logits/rejected": -0.8542050123214722, "logps/chosen": -0.3055468201637268, "logps/rejected": -0.3592091202735901, "loss": 1.5345, "rewards/accuracies": 0.5625, "rewards/chosen": -0.7638670802116394, "rewards/margins": 0.13415566086769104, "rewards/rejected": -0.8980227708816528, "step": 321 }, { "epoch": 0.6882180069462998, "grad_norm": 9.405998229980469, "learning_rate": 1.3318284293173449e-07, "logits/chosen": -0.9992817640304565, "logits/rejected": -0.9634179472923279, "logps/chosen": -0.4300232529640198, "logps/rejected": -0.4082144498825073, "loss": 1.5484, "rewards/accuracies": 0.5, "rewards/chosen": -1.075058102607727, "rewards/margins": -0.05452210083603859, "rewards/rejected": -1.0205360651016235, "step": 322 }, { "epoch": 0.6903553299492385, "grad_norm": 5.07054328918457, "learning_rate": 1.3153283438175034e-07, "logits/chosen": -1.0605663061141968, "logits/rejected": -1.0907377004623413, "logps/chosen": -0.3852520287036896, "logps/rejected": -0.4656079113483429, "loss": 1.5678, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9631301164627075, "rewards/margins": 0.2008897364139557, "rewards/rejected": -1.1640198230743408, "step": 323 }, { "epoch": 0.6924926529521774, "grad_norm": 7.885300159454346, "learning_rate": 1.2988945405516565e-07, "logits/chosen": -1.0609923601150513, "logits/rejected": -1.0959070920944214, "logps/chosen": -0.4137347638607025, "logps/rejected": -0.5460120439529419, "loss": 1.5155, "rewards/accuracies": 0.5625, "rewards/chosen": -1.034337043762207, "rewards/margins": 0.33069324493408203, "rewards/rejected": -1.3650301694869995, "step": 324 }, { "epoch": 0.6946299759551162, "grad_norm": 5.987451553344727, "learning_rate": 1.2825279389890818e-07, "logits/chosen": -0.9947149753570557, "logits/rejected": -1.0909423828125, "logps/chosen": -0.4138132333755493, "logps/rejected": -0.4551887512207031, "loss": 1.4521, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0345330238342285, "rewards/margins": 0.10343889147043228, "rewards/rejected": -1.1379718780517578, "step": 325 }, { "epoch": 0.696767298958055, "grad_norm": 5.460362434387207, "learning_rate": 1.2662294548391328e-07, "logits/chosen": -1.1488416194915771, "logits/rejected": -0.9424848556518555, "logps/chosen": -0.48600658774375916, "logps/rejected": -0.7618072628974915, "loss": 1.5489, "rewards/accuracies": 0.375, "rewards/chosen": -1.215016484260559, "rewards/margins": 0.6895018219947815, "rewards/rejected": -1.9045181274414062, "step": 326 }, { "epoch": 0.6989046219609939, "grad_norm": 10.205648422241211, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -0.9662617444992065, "logits/rejected": -1.1120105981826782, "logps/chosen": -0.40102994441986084, "logps/rejected": -0.6069145798683167, "loss": 1.5435, "rewards/accuracies": 0.75, "rewards/chosen": -1.0025748014450073, "rewards/margins": 0.5147115588188171, "rewards/rejected": -1.5172864198684692, "step": 327 }, { "epoch": 0.7010419449639327, "grad_norm": 8.57025146484375, "learning_rate": 1.2338404825076935e-07, "logits/chosen": -1.1456284523010254, "logits/rejected": -1.0507254600524902, "logps/chosen": -0.4316751956939697, "logps/rejected": -0.44297924637794495, "loss": 1.5203, "rewards/accuracies": 0.625, "rewards/chosen": -1.0791878700256348, "rewards/margins": 0.02826026827096939, "rewards/rejected": -1.1074482202529907, "step": 328 }, { "epoch": 0.7031792679668715, "grad_norm": 9.405667304992676, "learning_rate": 1.2177518064852345e-07, "logits/chosen": -1.1795152425765991, "logits/rejected": -1.0673104524612427, "logps/chosen": -0.37221118807792664, "logps/rejected": -0.4177697002887726, "loss": 1.5138, "rewards/accuracies": 0.4375, "rewards/chosen": -0.930527925491333, "rewards/margins": 0.11389636248350143, "rewards/rejected": -1.044424295425415, "step": 329 }, { "epoch": 0.7053165909698104, "grad_norm": 6.466906547546387, "learning_rate": 1.201734872092077e-07, "logits/chosen": -1.0346126556396484, "logits/rejected": -1.0439854860305786, "logps/chosen": -0.42790815234184265, "logps/rejected": -1.0795375108718872, "loss": 1.4512, "rewards/accuracies": 0.8125, "rewards/chosen": -1.0697704553604126, "rewards/margins": 1.6290733814239502, "rewards/rejected": -2.6988439559936523, "step": 330 }, { "epoch": 0.7074539139727491, "grad_norm": 4.390846252441406, "learning_rate": 1.185790575473738e-07, "logits/chosen": -1.1390395164489746, "logits/rejected": -1.1135765314102173, "logps/chosen": -0.5658525824546814, "logps/rejected": -0.5814335942268372, "loss": 1.4646, "rewards/accuracies": 0.375, "rewards/chosen": -1.4146316051483154, "rewards/margins": 0.03895253688097, "rewards/rejected": -1.45358407497406, "step": 331 }, { "epoch": 0.7095912369756879, "grad_norm": 5.300051212310791, "learning_rate": 1.1699198087116588e-07, "logits/chosen": -1.015911340713501, "logits/rejected": -1.11297607421875, "logps/chosen": -0.4078459143638611, "logps/rejected": -0.6018810868263245, "loss": 1.5814, "rewards/accuracies": 0.6875, "rewards/chosen": -1.019614815711975, "rewards/margins": 0.48508787155151367, "rewards/rejected": -1.5047025680541992, "step": 332 }, { "epoch": 0.7117285599786267, "grad_norm": 5.75014066696167, "learning_rate": 1.1541234597732947e-07, "logits/chosen": -1.0877783298492432, "logits/rejected": -1.0997726917266846, "logps/chosen": -0.3489132523536682, "logps/rejected": -0.43700623512268066, "loss": 1.4605, "rewards/accuracies": 0.625, "rewards/chosen": -0.8722831606864929, "rewards/margins": 0.22023235261440277, "rewards/rejected": -1.0925155878067017, "step": 333 }, { "epoch": 0.7138658829815656, "grad_norm": 5.7690229415893555, "learning_rate": 1.1384024124624322e-07, "logits/chosen": -0.9815250635147095, "logits/rejected": -0.9534754753112793, "logps/chosen": -0.42468035221099854, "logps/rejected": -0.42991912364959717, "loss": 1.5096, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0617008209228516, "rewards/margins": 0.013096902519464493, "rewards/rejected": -1.0747978687286377, "step": 334 }, { "epoch": 0.7160032059845044, "grad_norm": 6.530084609985352, "learning_rate": 1.1227575463697439e-07, "logits/chosen": -1.0770314931869507, "logits/rejected": -1.1593232154846191, "logps/chosen": -0.41134944558143616, "logps/rejected": -0.46647369861602783, "loss": 1.5333, "rewards/accuracies": 0.75, "rewards/chosen": -1.0283737182617188, "rewards/margins": 0.1378105729818344, "rewards/rejected": -1.1661843061447144, "step": 335 }, { "epoch": 0.7181405289874432, "grad_norm": 6.972837448120117, "learning_rate": 1.1071897368235694e-07, "logits/chosen": -1.0634236335754395, "logits/rejected": -1.1836671829223633, "logps/chosen": -0.35514572262763977, "logps/rejected": -0.6676814556121826, "loss": 1.4464, "rewards/accuracies": 0.5625, "rewards/chosen": -0.8878642320632935, "rewards/margins": 0.781339168548584, "rewards/rejected": -1.6692036390304565, "step": 336 }, { "epoch": 0.7202778519903821, "grad_norm": 6.366727828979492, "learning_rate": 1.0916998548409447e-07, "logits/chosen": -1.0254015922546387, "logits/rejected": -0.9457456469535828, "logps/chosen": -0.3946457803249359, "logps/rejected": -0.4669502377510071, "loss": 1.617, "rewards/accuracies": 0.5625, "rewards/chosen": -0.986614465713501, "rewards/margins": 0.1807611733675003, "rewards/rejected": -1.1673755645751953, "step": 337 }, { "epoch": 0.7224151749933209, "grad_norm": 6.886241436004639, "learning_rate": 1.0762887670788701e-07, "logits/chosen": -0.8809665441513062, "logits/rejected": -0.7807790637016296, "logps/chosen": -0.32795268297195435, "logps/rejected": -0.4252588152885437, "loss": 1.4383, "rewards/accuracies": 0.75, "rewards/chosen": -0.8198816776275635, "rewards/margins": 0.2432653307914734, "rewards/rejected": -1.063146948814392, "step": 338 }, { "epoch": 0.7245524979962596, "grad_norm": 4.194058418273926, "learning_rate": 1.0609573357858165e-07, "logits/chosen": -1.1363167762756348, "logits/rejected": -1.1581072807312012, "logps/chosen": -0.4632134437561035, "logps/rejected": -0.6518194079399109, "loss": 1.6039, "rewards/accuracies": 0.75, "rewards/chosen": -1.1580334901809692, "rewards/margins": 0.47151511907577515, "rewards/rejected": -1.6295486688613892, "step": 339 }, { "epoch": 0.7266898209991985, "grad_norm": 23.59259033203125, "learning_rate": 1.0457064187534861e-07, "logits/chosen": -1.0518946647644043, "logits/rejected": -0.9888642430305481, "logps/chosen": -0.407795786857605, "logps/rejected": -0.45109352469444275, "loss": 1.6088, "rewards/accuracies": 0.3125, "rewards/chosen": -1.0194894075393677, "rewards/margins": 0.1082444041967392, "rewards/rejected": -1.127733826637268, "step": 340 }, { "epoch": 0.7288271440021373, "grad_norm": 5.5164079666137695, "learning_rate": 1.0305368692688174e-07, "logits/chosen": -1.0163506269454956, "logits/rejected": -0.9497014284133911, "logps/chosen": -0.43188926577568054, "logps/rejected": -0.5361363291740417, "loss": 1.5212, "rewards/accuracies": 0.5, "rewards/chosen": -1.0797233581542969, "rewards/margins": 0.2606176435947418, "rewards/rejected": -1.3403409719467163, "step": 341 }, { "epoch": 0.7309644670050761, "grad_norm": 10.590493202209473, "learning_rate": 1.0154495360662463e-07, "logits/chosen": -0.7582399249076843, "logits/rejected": -0.7725558280944824, "logps/chosen": -0.3883350193500519, "logps/rejected": -0.4608793556690216, "loss": 1.4818, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9708375930786133, "rewards/margins": 0.18136097490787506, "rewards/rejected": -1.152198314666748, "step": 342 }, { "epoch": 0.733101790008015, "grad_norm": 5.3863677978515625, "learning_rate": 1.0004452632802158e-07, "logits/chosen": -1.0777875185012817, "logits/rejected": -1.0332427024841309, "logps/chosen": -0.6136234402656555, "logps/rejected": -0.6208893656730652, "loss": 1.4504, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5340585708618164, "rewards/margins": 0.018164925277233124, "rewards/rejected": -1.552223563194275, "step": 343 }, { "epoch": 0.7352391130109538, "grad_norm": 4.131237506866455, "learning_rate": 9.855248903979505e-08, "logits/chosen": -1.1028010845184326, "logits/rejected": -1.063793420791626, "logps/chosen": -0.4896419644355774, "logps/rejected": -0.5126334428787231, "loss": 1.496, "rewards/accuracies": 0.375, "rewards/chosen": -1.224104881286621, "rewards/margins": 0.05747878551483154, "rewards/rejected": -1.2815836668014526, "step": 344 }, { "epoch": 0.7373764360138926, "grad_norm": 6.84834623336792, "learning_rate": 9.706892522124838e-08, "logits/chosen": -1.0494110584259033, "logits/rejected": -0.994105339050293, "logps/chosen": -0.5251023173332214, "logps/rejected": -0.5692360401153564, "loss": 1.5667, "rewards/accuracies": 0.3125, "rewards/chosen": -1.312755823135376, "rewards/margins": 0.11033419519662857, "rewards/rejected": -1.4230899810791016, "step": 345 }, { "epoch": 0.7395137590168315, "grad_norm": 7.503670692443848, "learning_rate": 9.559391787759554e-08, "logits/chosen": -1.304071307182312, "logits/rejected": -1.1836615800857544, "logps/chosen": -0.5006979703903198, "logps/rejected": -0.4764450788497925, "loss": 1.5827, "rewards/accuracies": 0.5, "rewards/chosen": -1.2517449855804443, "rewards/margins": -0.06063230335712433, "rewards/rejected": -1.191112756729126, "step": 346 }, { "epoch": 0.7416510820197703, "grad_norm": 7.5821757316589355, "learning_rate": 9.412754953531663e-08, "logits/chosen": -1.0126221179962158, "logits/rejected": -1.0061360597610474, "logps/chosen": -0.536862850189209, "logps/rejected": -0.7791385650634766, "loss": 1.5476, "rewards/accuracies": 0.875, "rewards/chosen": -1.342157244682312, "rewards/margins": 0.6056893467903137, "rewards/rejected": -1.9478464126586914, "step": 347 }, { "epoch": 0.743788405022709, "grad_norm": 6.262937545776367, "learning_rate": 9.266990223754067e-08, "logits/chosen": -0.9866530895233154, "logits/rejected": -1.087868571281433, "logps/chosen": -0.35664424300193787, "logps/rejected": -0.7218388915061951, "loss": 1.5054, "rewards/accuracies": 0.625, "rewards/chosen": -0.8916106224060059, "rewards/margins": 0.9129866361618042, "rewards/rejected": -1.8045971393585205, "step": 348 }, { "epoch": 0.7459257280256478, "grad_norm": 4.314223766326904, "learning_rate": 9.12210575394553e-08, "logits/chosen": -1.0736174583435059, "logits/rejected": -1.0914644002914429, "logps/chosen": -0.4206693172454834, "logps/rejected": -0.4104337692260742, "loss": 1.58, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0516732931137085, "rewards/margins": -0.025588899850845337, "rewards/rejected": -1.0260844230651855, "step": 349 }, { "epoch": 0.7480630510285867, "grad_norm": 12.095142364501953, "learning_rate": 8.978109650374396e-08, "logits/chosen": -1.0621310472488403, "logits/rejected": -1.0466017723083496, "logps/chosen": -0.4226948618888855, "logps/rejected": -0.46644341945648193, "loss": 1.5575, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0567370653152466, "rewards/margins": 0.10937156528234482, "rewards/rejected": -1.16610848903656, "step": 350 }, { "epoch": 0.7502003740315255, "grad_norm": 6.449507236480713, "learning_rate": 8.835009969605011e-08, "logits/chosen": -1.1138099431991577, "logits/rejected": -1.0220037698745728, "logps/chosen": -0.3483428359031677, "logps/rejected": -0.3484461307525635, "loss": 1.5018, "rewards/accuracies": 0.5, "rewards/chosen": -0.8708571195602417, "rewards/margins": 0.0002581886947154999, "rewards/rejected": -0.8711153268814087, "step": 351 }, { "epoch": 0.7523376970344643, "grad_norm": 6.543509483337402, "learning_rate": 8.692814718046978e-08, "logits/chosen": -1.1011321544647217, "logits/rejected": -1.0573300123214722, "logps/chosen": -0.6217561364173889, "logps/rejected": -0.5529008507728577, "loss": 1.6158, "rewards/accuracies": 0.5625, "rewards/chosen": -1.55439031124115, "rewards/margins": -0.1721382439136505, "rewards/rejected": -1.3822520971298218, "step": 352 }, { "epoch": 0.7544750200374032, "grad_norm": 6.191348552703857, "learning_rate": 8.551531851507185e-08, "logits/chosen": -1.0794535875320435, "logits/rejected": -0.9287205934524536, "logps/chosen": -0.3870071768760681, "logps/rejected": -0.4184566140174866, "loss": 1.5651, "rewards/accuracies": 0.5, "rewards/chosen": -0.9675179719924927, "rewards/margins": 0.07862359285354614, "rewards/rejected": -1.046141505241394, "step": 353 }, { "epoch": 0.756612343040342, "grad_norm": 5.186205863952637, "learning_rate": 8.411169274744723e-08, "logits/chosen": -1.0084459781646729, "logits/rejected": -1.0072932243347168, "logps/chosen": -0.3501359224319458, "logps/rejected": -0.4932965040206909, "loss": 1.4669, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8753398656845093, "rewards/margins": 0.35790154337882996, "rewards/rejected": -1.233241319656372, "step": 354 }, { "epoch": 0.7587496660432808, "grad_norm": 8.382218360900879, "learning_rate": 8.271734841028552e-08, "logits/chosen": -1.0127713680267334, "logits/rejected": -1.0045888423919678, "logps/chosen": -0.4923154413700104, "logps/rejected": -0.4377010464668274, "loss": 1.5886, "rewards/accuracies": 0.375, "rewards/chosen": -1.2307885885238647, "rewards/margins": -0.13653598725795746, "rewards/rejected": -1.094252586364746, "step": 355 }, { "epoch": 0.7608869890462197, "grad_norm": 34.09469985961914, "learning_rate": 8.133236351698142e-08, "logits/chosen": -1.191988229751587, "logits/rejected": -1.0870414972305298, "logps/chosen": -0.5740368366241455, "logps/rejected": -0.943623960018158, "loss": 1.5604, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4350918531417847, "rewards/margins": 0.9239681959152222, "rewards/rejected": -2.359060049057007, "step": 356 }, { "epoch": 0.7630243120491584, "grad_norm": 15.886775970458984, "learning_rate": 7.99568155572701e-08, "logits/chosen": -1.2273132801055908, "logits/rejected": -1.1660319566726685, "logps/chosen": -0.5724566578865051, "logps/rejected": -0.6214060187339783, "loss": 1.5502, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4311414957046509, "rewards/margins": 0.12237339466810226, "rewards/rejected": -1.5535149574279785, "step": 357 }, { "epoch": 0.7651616350520972, "grad_norm": 5.216745376586914, "learning_rate": 7.859078149289144e-08, "logits/chosen": -1.0447226762771606, "logits/rejected": -1.0748844146728516, "logps/chosen": -0.3527478873729706, "logps/rejected": -0.45652708411216736, "loss": 1.4919, "rewards/accuracies": 0.625, "rewards/chosen": -0.8818696737289429, "rewards/margins": 0.2594480514526367, "rewards/rejected": -1.1413178443908691, "step": 358 }, { "epoch": 0.7672989580550361, "grad_norm": 7.554361820220947, "learning_rate": 7.723433775328384e-08, "logits/chosen": -1.0472806692123413, "logits/rejected": -1.2351243495941162, "logps/chosen": -0.486628919839859, "logps/rejected": -0.8915761709213257, "loss": 1.5042, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2165722846984863, "rewards/margins": 1.012368083000183, "rewards/rejected": -2.22894024848938, "step": 359 }, { "epoch": 0.7694362810579749, "grad_norm": 5.030363082885742, "learning_rate": 7.588756023130833e-08, "logits/chosen": -0.7821986079216003, "logits/rejected": -0.8699120283126831, "logps/chosen": -0.4785808324813843, "logps/rejected": -0.5831509828567505, "loss": 1.4942, "rewards/accuracies": 0.5, "rewards/chosen": -1.1964521408081055, "rewards/margins": 0.261425256729126, "rewards/rejected": -1.4578773975372314, "step": 360 }, { "epoch": 0.7715736040609137, "grad_norm": 7.105344295501709, "learning_rate": 7.455052427900213e-08, "logits/chosen": -1.215461254119873, "logits/rejected": -1.0217854976654053, "logps/chosen": -0.3912314176559448, "logps/rejected": -0.3650144636631012, "loss": 1.6164, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9780785441398621, "rewards/margins": -0.06554235517978668, "rewards/rejected": -0.9125362038612366, "step": 361 }, { "epoch": 0.7737109270638525, "grad_norm": 5.5884199142456055, "learning_rate": 7.322330470336313e-08, "logits/chosen": -1.0982252359390259, "logits/rejected": -1.0146585702896118, "logps/chosen": -0.32143884897232056, "logps/rejected": -0.495330274105072, "loss": 1.4397, "rewards/accuracies": 0.625, "rewards/chosen": -0.8035971522331238, "rewards/margins": 0.4347284734249115, "rewards/rejected": -1.2383257150650024, "step": 362 }, { "epoch": 0.7758482500667914, "grad_norm": 10.065922737121582, "learning_rate": 7.190597576216384e-08, "logits/chosen": -0.8641526699066162, "logits/rejected": -0.8766672015190125, "logps/chosen": -0.46004733443260193, "logps/rejected": -0.4904063940048218, "loss": 1.5026, "rewards/accuracies": 0.6875, "rewards/chosen": -1.150118350982666, "rewards/margins": 0.07589760422706604, "rewards/rejected": -1.2260159254074097, "step": 363 }, { "epoch": 0.7779855730697302, "grad_norm": 9.752578735351562, "learning_rate": 7.059861115979701e-08, "logits/chosen": -1.0331135988235474, "logits/rejected": -1.0849862098693848, "logps/chosen": -0.4356737434864044, "logps/rejected": -0.4790940582752228, "loss": 1.6159, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0891844034194946, "rewards/margins": 0.1085507869720459, "rewards/rejected": -1.1977351903915405, "step": 364 }, { "epoch": 0.7801228960726689, "grad_norm": 6.089657783508301, "learning_rate": 6.930128404315214e-08, "logits/chosen": -1.0164854526519775, "logits/rejected": -1.006756067276001, "logps/chosen": -0.6724580526351929, "logps/rejected": -0.6376264095306396, "loss": 1.4724, "rewards/accuracies": 0.375, "rewards/chosen": -1.6811450719833374, "rewards/margins": -0.08707903325557709, "rewards/rejected": -1.5940660238265991, "step": 365 }, { "epoch": 0.7822602190756078, "grad_norm": 4.81856107711792, "learning_rate": 6.801406699752229e-08, "logits/chosen": -1.1662445068359375, "logits/rejected": -1.0658493041992188, "logps/chosen": -0.4783725440502167, "logps/rejected": -0.4546273946762085, "loss": 1.6376, "rewards/accuracies": 0.375, "rewards/chosen": -1.1959314346313477, "rewards/margins": -0.059362899512052536, "rewards/rejected": -1.1365684270858765, "step": 366 }, { "epoch": 0.7843975420785466, "grad_norm": 9.418230056762695, "learning_rate": 6.673703204254347e-08, "logits/chosen": -1.2206920385360718, "logits/rejected": -1.2510498762130737, "logps/chosen": -0.594007670879364, "logps/rejected": -0.8274646997451782, "loss": 1.4951, "rewards/accuracies": 0.5, "rewards/chosen": -1.4850192070007324, "rewards/margins": 0.5836424827575684, "rewards/rejected": -2.068661689758301, "step": 367 }, { "epoch": 0.7865348650814854, "grad_norm": 10.291698455810547, "learning_rate": 6.547025062816486e-08, "logits/chosen": -0.8960355520248413, "logits/rejected": -0.9344096779823303, "logps/chosen": -0.36661607027053833, "logps/rejected": -0.43485212326049805, "loss": 1.554, "rewards/accuracies": 0.625, "rewards/chosen": -0.916540265083313, "rewards/margins": 0.17059014737606049, "rewards/rejected": -1.0871303081512451, "step": 368 }, { "epoch": 0.7886721880844243, "grad_norm": 4.118884086608887, "learning_rate": 6.42137936306514e-08, "logits/chosen": -1.0395543575286865, "logits/rejected": -0.9369036555290222, "logps/chosen": -0.3800487816333771, "logps/rejected": -0.35273048281669617, "loss": 1.5453, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9501218795776367, "rewards/margins": -0.06829574704170227, "rewards/rejected": -0.8818261623382568, "step": 369 }, { "epoch": 0.7908095110873631, "grad_norm": 10.194417953491211, "learning_rate": 6.296773134861824e-08, "logits/chosen": -1.06082284450531, "logits/rejected": -1.0715018510818481, "logps/chosen": -0.45035964250564575, "logps/rejected": -0.49578312039375305, "loss": 1.5399, "rewards/accuracies": 0.6875, "rewards/chosen": -1.125899076461792, "rewards/margins": 0.11355876922607422, "rewards/rejected": -1.2394579648971558, "step": 370 }, { "epoch": 0.7929468340903019, "grad_norm": 12.731304168701172, "learning_rate": 6.173213349909728e-08, "logits/chosen": -1.1646618843078613, "logits/rejected": -1.0550154447555542, "logps/chosen": -0.45987626910209656, "logps/rejected": -0.6932123303413391, "loss": 1.4858, "rewards/accuracies": 0.625, "rewards/chosen": -1.1496906280517578, "rewards/margins": 0.583340048789978, "rewards/rejected": -1.7330307960510254, "step": 371 }, { "epoch": 0.7950841570932408, "grad_norm": 4.002368927001953, "learning_rate": 6.050706921363672e-08, "logits/chosen": -1.1935923099517822, "logits/rejected": -1.2162138223648071, "logps/chosen": -0.39056870341300964, "logps/rejected": -0.5482085943222046, "loss": 1.472, "rewards/accuracies": 0.625, "rewards/chosen": -0.9764216542243958, "rewards/margins": 0.39409980177879333, "rewards/rejected": -1.3705215454101562, "step": 372 }, { "epoch": 0.7972214800961795, "grad_norm": 14.192221641540527, "learning_rate": 5.929260703443337e-08, "logits/chosen": -0.7887646555900574, "logits/rejected": -0.8968151211738586, "logps/chosen": -0.3553355634212494, "logps/rejected": -0.43609827756881714, "loss": 1.5724, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8883388638496399, "rewards/margins": 0.20190683007240295, "rewards/rejected": -1.0902457237243652, "step": 373 }, { "epoch": 0.7993588030991183, "grad_norm": 14.193215370178223, "learning_rate": 5.808881491049722e-08, "logits/chosen": -1.265504240989685, "logits/rejected": -1.3089518547058105, "logps/chosen": -0.5325222611427307, "logps/rejected": -0.530707836151123, "loss": 1.5907, "rewards/accuracies": 0.625, "rewards/chosen": -1.331305742263794, "rewards/margins": -0.004536189138889313, "rewards/rejected": -1.3267695903778076, "step": 374 }, { "epoch": 0.8014961261020572, "grad_norm": 8.60618782043457, "learning_rate": 5.6895760193850145e-08, "logits/chosen": -1.0075315237045288, "logits/rejected": -1.0355682373046875, "logps/chosen": -0.4078059792518616, "logps/rejected": -0.6130856275558472, "loss": 1.5543, "rewards/accuracies": 0.8125, "rewards/chosen": -1.019515037536621, "rewards/margins": 0.5131990313529968, "rewards/rejected": -1.5327140092849731, "step": 375 }, { "epoch": 0.803633449104996, "grad_norm": 8.175145149230957, "learning_rate": 5.571350963575727e-08, "logits/chosen": -1.1598341464996338, "logits/rejected": -0.9834379553794861, "logps/chosen": -0.4177122414112091, "logps/rejected": -0.35740387439727783, "loss": 1.5343, "rewards/accuracies": 0.1875, "rewards/chosen": -1.0442806482315063, "rewards/margins": -0.1507708877325058, "rewards/rejected": -0.8935096859931946, "step": 376 }, { "epoch": 0.8057707721079348, "grad_norm": 20.987957000732422, "learning_rate": 5.454212938299255e-08, "logits/chosen": -1.140153169631958, "logits/rejected": -1.0446147918701172, "logps/chosen": -0.5767669677734375, "logps/rejected": -0.4474826455116272, "loss": 1.5854, "rewards/accuracies": 0.3125, "rewards/chosen": -1.4419174194335938, "rewards/margins": -0.32321077585220337, "rewards/rejected": -1.1187067031860352, "step": 377 }, { "epoch": 0.8079080951108736, "grad_norm": 7.252152919769287, "learning_rate": 5.338168497413756e-08, "logits/chosen": -1.1144384145736694, "logits/rejected": -1.2679189443588257, "logps/chosen": -0.3241596817970276, "logps/rejected": -0.5525774955749512, "loss": 1.5213, "rewards/accuracies": 0.6875, "rewards/chosen": -0.8103991150856018, "rewards/margins": 0.5710446238517761, "rewards/rejected": -1.3814438581466675, "step": 378 }, { "epoch": 0.8100454181138125, "grad_norm": 5.948306083679199, "learning_rate": 5.223224133591475e-08, "logits/chosen": -1.212296724319458, "logits/rejected": -1.0987417697906494, "logps/chosen": -0.7626843452453613, "logps/rejected": -1.2524211406707764, "loss": 1.4664, "rewards/accuracies": 0.5625, "rewards/chosen": -1.9067108631134033, "rewards/margins": 1.2243417501449585, "rewards/rejected": -3.1310529708862305, "step": 379 }, { "epoch": 0.8121827411167513, "grad_norm": 5.803626537322998, "learning_rate": 5.109386277955477e-08, "logits/chosen": -1.1813595294952393, "logits/rejected": -1.1350992918014526, "logps/chosen": -0.45435211062431335, "logps/rejected": -0.5456478595733643, "loss": 1.4602, "rewards/accuracies": 0.4375, "rewards/chosen": -1.1358802318572998, "rewards/margins": 0.22823941707611084, "rewards/rejected": -1.364119529724121, "step": 380 }, { "epoch": 0.81432006411969, "grad_norm": 4.112217903137207, "learning_rate": 4.996661299719845e-08, "logits/chosen": -0.8679373264312744, "logits/rejected": -0.8792969584465027, "logps/chosen": -0.4242730736732483, "logps/rejected": -0.7051547765731812, "loss": 1.5246, "rewards/accuracies": 0.8125, "rewards/chosen": -1.060682773590088, "rewards/margins": 0.7022043466567993, "rewards/rejected": -1.7628870010375977, "step": 381 }, { "epoch": 0.8164573871226289, "grad_norm": 10.623425483703613, "learning_rate": 4.885055505833291e-08, "logits/chosen": -1.2642873525619507, "logits/rejected": -1.189084768295288, "logps/chosen": -0.4796138405799866, "logps/rejected": -0.5438456535339355, "loss": 1.5916, "rewards/accuracies": 0.5625, "rewards/chosen": -1.199034571647644, "rewards/margins": 0.16057954728603363, "rewards/rejected": -1.3596141338348389, "step": 382 }, { "epoch": 0.8185947101255677, "grad_norm": 5.901862144470215, "learning_rate": 4.774575140626316e-08, "logits/chosen": -1.0387252569198608, "logits/rejected": -0.9353397488594055, "logps/chosen": -0.43210557103157043, "logps/rejected": -0.474994421005249, "loss": 1.4878, "rewards/accuracies": 0.75, "rewards/chosen": -1.0802638530731201, "rewards/margins": 0.1072220653295517, "rewards/rejected": -1.187485933303833, "step": 383 }, { "epoch": 0.8207320331285065, "grad_norm": 10.615392684936523, "learning_rate": 4.6652263854618016e-08, "logits/chosen": -1.2227771282196045, "logits/rejected": -1.2380874156951904, "logps/chosen": -0.42436325550079346, "logps/rejected": -0.6415535807609558, "loss": 1.4763, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0609081983566284, "rewards/margins": 0.5429757833480835, "rewards/rejected": -1.6038841009140015, "step": 384 }, { "epoch": 0.8228693561314454, "grad_norm": 7.720395565032959, "learning_rate": 4.557015358389216e-08, "logits/chosen": -0.9876857995986938, "logits/rejected": -1.0623700618743896, "logps/chosen": -0.37730199098587036, "logps/rejected": -0.587026059627533, "loss": 1.4647, "rewards/accuracies": 0.625, "rewards/chosen": -0.9432549476623535, "rewards/margins": 0.5243102312088013, "rewards/rejected": -1.4675650596618652, "step": 385 }, { "epoch": 0.8250066791343842, "grad_norm": 8.550374984741211, "learning_rate": 4.449948113802254e-08, "logits/chosen": -1.203737735748291, "logits/rejected": -1.1764881610870361, "logps/chosen": -0.33095237612724304, "logps/rejected": -0.3797753155231476, "loss": 1.5067, "rewards/accuracies": 0.375, "rewards/chosen": -0.827380895614624, "rewards/margins": 0.12205736339092255, "rewards/rejected": -0.9494383335113525, "step": 386 }, { "epoch": 0.827144002137323, "grad_norm": 8.158326148986816, "learning_rate": 4.3440306421001324e-08, "logits/chosen": -1.0478994846343994, "logits/rejected": -1.0275698900222778, "logps/chosen": -0.4138807952404022, "logps/rejected": -0.5256016254425049, "loss": 1.4697, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0347020626068115, "rewards/margins": 0.2793022096157074, "rewards/rejected": -1.3140041828155518, "step": 387 }, { "epoch": 0.8292813251402619, "grad_norm": 4.749565124511719, "learning_rate": 4.2392688693524055e-08, "logits/chosen": -1.0925198793411255, "logits/rejected": -1.053438663482666, "logps/chosen": -0.4893158972263336, "logps/rejected": -0.7248774766921997, "loss": 1.4731, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2232897281646729, "rewards/margins": 0.5889038443565369, "rewards/rejected": -1.8121936321258545, "step": 388 }, { "epoch": 0.8314186481432007, "grad_norm": 17.479476928710938, "learning_rate": 4.1356686569674335e-08, "logits/chosen": -1.237557291984558, "logits/rejected": -1.1833033561706543, "logps/chosen": -0.6519225835800171, "logps/rejected": -0.6998899579048157, "loss": 1.5504, "rewards/accuracies": 0.5, "rewards/chosen": -1.6298065185546875, "rewards/margins": 0.1199183464050293, "rewards/rejected": -1.7497249841690063, "step": 389 }, { "epoch": 0.8335559711461394, "grad_norm": 6.035606384277344, "learning_rate": 4.0332358013644015e-08, "logits/chosen": -1.16561758518219, "logits/rejected": -1.1693965196609497, "logps/chosen": -0.6768723726272583, "logps/rejected": -1.003299593925476, "loss": 1.4768, "rewards/accuracies": 0.5625, "rewards/chosen": -1.692180871963501, "rewards/margins": 0.8160682916641235, "rewards/rejected": -2.508248805999756, "step": 390 }, { "epoch": 0.8356932941490782, "grad_norm": 6.278528213500977, "learning_rate": 3.9319760336490205e-08, "logits/chosen": -0.7999597191810608, "logits/rejected": -0.6943086385726929, "logps/chosen": -0.3784443140029907, "logps/rejected": -0.4581376612186432, "loss": 1.5857, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9461109042167664, "rewards/margins": 0.1992333084344864, "rewards/rejected": -1.1453441381454468, "step": 391 }, { "epoch": 0.8378306171520171, "grad_norm": 4.925440788269043, "learning_rate": 3.831895019292897e-08, "logits/chosen": -1.1565301418304443, "logits/rejected": -1.1625264883041382, "logps/chosen": -0.5009636878967285, "logps/rejected": -0.8770073056221008, "loss": 1.4676, "rewards/accuracies": 0.625, "rewards/chosen": -1.2524092197418213, "rewards/margins": 0.9401088953018188, "rewards/rejected": -2.1925179958343506, "step": 392 }, { "epoch": 0.8399679401549559, "grad_norm": 16.391368865966797, "learning_rate": 3.732998357816514e-08, "logits/chosen": -1.0616164207458496, "logits/rejected": -1.0247597694396973, "logps/chosen": -0.5045540928840637, "logps/rejected": -0.467951238155365, "loss": 1.6318, "rewards/accuracies": 0.5, "rewards/chosen": -1.261385202407837, "rewards/margins": -0.09150727093219757, "rewards/rejected": -1.1698780059814453, "step": 393 }, { "epoch": 0.8421052631578947, "grad_norm": 8.550324440002441, "learning_rate": 3.635291582475963e-08, "logits/chosen": -0.9775318503379822, "logits/rejected": -0.8906686902046204, "logps/chosen": -0.5394979119300842, "logps/rejected": -0.5499922037124634, "loss": 1.5031, "rewards/accuracies": 0.625, "rewards/chosen": -1.3487448692321777, "rewards/margins": 0.02623593807220459, "rewards/rejected": -1.3749808073043823, "step": 394 }, { "epoch": 0.8442425861608336, "grad_norm": 4.05560302734375, "learning_rate": 3.538780159953347e-08, "logits/chosen": -1.0048534870147705, "logits/rejected": -0.839844822883606, "logps/chosen": -0.4153073728084564, "logps/rejected": -0.35160398483276367, "loss": 1.5959, "rewards/accuracies": 0.3125, "rewards/chosen": -1.0382684469223022, "rewards/margins": -0.1592584103345871, "rewards/rejected": -0.8790099620819092, "step": 395 }, { "epoch": 0.8463799091637724, "grad_norm": 7.721834182739258, "learning_rate": 3.4434694900509345e-08, "logits/chosen": -1.1735496520996094, "logits/rejected": -1.150696039199829, "logps/chosen": -0.5112527012825012, "logps/rejected": -0.6088312864303589, "loss": 1.5405, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2781318426132202, "rewards/margins": 0.24394652247428894, "rewards/rejected": -1.522078275680542, "step": 396 }, { "epoch": 0.8485172321667112, "grad_norm": 5.097099781036377, "learning_rate": 3.349364905389032e-08, "logits/chosen": -1.2030649185180664, "logits/rejected": -1.08577561378479, "logps/chosen": -0.47224241495132446, "logps/rejected": -0.6983097791671753, "loss": 1.5699, "rewards/accuracies": 0.625, "rewards/chosen": -1.1806060075759888, "rewards/margins": 0.5651683807373047, "rewards/rejected": -1.745774507522583, "step": 397 }, { "epoch": 0.85065455516965, "grad_norm": 9.325733184814453, "learning_rate": 3.256471671107616e-08, "logits/chosen": -0.9340042471885681, "logits/rejected": -0.946205198764801, "logps/chosen": -0.7264373302459717, "logps/rejected": -0.6860026121139526, "loss": 1.5801, "rewards/accuracies": 0.5625, "rewards/chosen": -1.8160933256149292, "rewards/margins": -0.1010868027806282, "rewards/rejected": -1.7150065898895264, "step": 398 }, { "epoch": 0.8527918781725888, "grad_norm": 4.532627105712891, "learning_rate": 3.1647949845717585e-08, "logits/chosen": -0.8892905712127686, "logits/rejected": -0.8267837166786194, "logps/chosen": -0.4368503987789154, "logps/rejected": -0.5416733026504517, "loss": 1.4184, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0921260118484497, "rewards/margins": 0.2620573043823242, "rewards/rejected": -1.3541834354400635, "step": 399 }, { "epoch": 0.8549292011755276, "grad_norm": 3.611345052719116, "learning_rate": 3.074339975080836e-08, "logits/chosen": -0.9715834259986877, "logits/rejected": -0.9480459690093994, "logps/chosen": -0.685413122177124, "logps/rejected": -0.7482943534851074, "loss": 1.4719, "rewards/accuracies": 0.6875, "rewards/chosen": -1.71353280544281, "rewards/margins": 0.15720298886299133, "rewards/rejected": -1.870735764503479, "step": 400 }, { "epoch": 0.8570665241784665, "grad_norm": 49.35193634033203, "learning_rate": 2.98511170358155e-08, "logits/chosen": -0.959058940410614, "logits/rejected": -0.9538683295249939, "logps/chosen": -0.43192583322525024, "logps/rejected": -0.4624338746070862, "loss": 1.6139, "rewards/accuracies": 0.625, "rewards/chosen": -1.0798146724700928, "rewards/margins": 0.07627001404762268, "rewards/rejected": -1.156084656715393, "step": 401 }, { "epoch": 0.8592038471814053, "grad_norm": 11.570405006408691, "learning_rate": 2.8971151623847584e-08, "logits/chosen": -1.0528483390808105, "logits/rejected": -0.9815914630889893, "logps/chosen": -0.584007740020752, "logps/rejected": -0.6078099608421326, "loss": 1.5858, "rewards/accuracies": 0.6875, "rewards/chosen": -1.4600192308425903, "rewards/margins": 0.05950555205345154, "rewards/rejected": -1.5195249319076538, "step": 402 }, { "epoch": 0.8613411701843441, "grad_norm": 5.403092861175537, "learning_rate": 2.8103552748861475e-08, "logits/chosen": -1.052750825881958, "logits/rejected": -1.0424790382385254, "logps/chosen": -0.6250475645065308, "logps/rejected": -0.6444798111915588, "loss": 1.6129, "rewards/accuracies": 0.5625, "rewards/chosen": -1.5626189708709717, "rewards/margins": 0.04858069866895676, "rewards/rejected": -1.6111997365951538, "step": 403 }, { "epoch": 0.863478493187283, "grad_norm": 5.894848346710205, "learning_rate": 2.724836895290805e-08, "logits/chosen": -1.0178946256637573, "logits/rejected": -0.8687437772750854, "logps/chosen": -0.36898887157440186, "logps/rejected": -0.7522455453872681, "loss": 1.5337, "rewards/accuracies": 0.4375, "rewards/chosen": -0.9224721789360046, "rewards/margins": 0.958141565322876, "rewards/rejected": -1.8806138038635254, "step": 404 }, { "epoch": 0.8656158161902218, "grad_norm": 6.724638938903809, "learning_rate": 2.6405648083415833e-08, "logits/chosen": -1.1396384239196777, "logits/rejected": -1.0131150484085083, "logps/chosen": -0.6166858673095703, "logps/rejected": -0.5131340026855469, "loss": 1.5427, "rewards/accuracies": 0.375, "rewards/chosen": -1.5417147874832153, "rewards/margins": -0.258879691362381, "rewards/rejected": -1.2828351259231567, "step": 405 }, { "epoch": 0.8677531391931605, "grad_norm": 4.763136863708496, "learning_rate": 2.55754372905142e-08, "logits/chosen": -1.1291677951812744, "logits/rejected": -1.0440433025360107, "logps/chosen": -0.44505226612091064, "logps/rejected": -0.46291491389274597, "loss": 1.499, "rewards/accuracies": 0.5, "rewards/chosen": -1.1126307249069214, "rewards/margins": 0.04465658590197563, "rewards/rejected": -1.157287359237671, "step": 406 }, { "epoch": 0.8698904621960993, "grad_norm": 5.994953155517578, "learning_rate": 2.475778302439524e-08, "logits/chosen": -1.0867843627929688, "logits/rejected": -1.1085426807403564, "logps/chosen": -0.6850754022598267, "logps/rejected": -0.8681538105010986, "loss": 1.54, "rewards/accuracies": 0.5, "rewards/chosen": -1.7126885652542114, "rewards/margins": 0.4576959013938904, "rewards/rejected": -2.170384645462036, "step": 407 }, { "epoch": 0.8720277851990382, "grad_norm": 4.274337291717529, "learning_rate": 2.3952731032714973e-08, "logits/chosen": -0.8507957458496094, "logits/rejected": -0.8216973543167114, "logps/chosen": -0.352754145860672, "logps/rejected": -0.6458288431167603, "loss": 1.4806, "rewards/accuracies": 0.75, "rewards/chosen": -0.8818854093551636, "rewards/margins": 0.7326868176460266, "rewards/rejected": -1.6145721673965454, "step": 408 }, { "epoch": 0.874165108201977, "grad_norm": 19.687917709350586, "learning_rate": 2.3160326358033778e-08, "logits/chosen": -1.0179362297058105, "logits/rejected": -0.9422796964645386, "logps/chosen": -0.6053561568260193, "logps/rejected": -1.013503074645996, "loss": 1.4768, "rewards/accuracies": 0.625, "rewards/chosen": -1.5133905410766602, "rewards/margins": 1.0203672647476196, "rewards/rejected": -2.5337576866149902, "step": 409 }, { "epoch": 0.8763024312049158, "grad_norm": 12.98466968536377, "learning_rate": 2.2380613335296033e-08, "logits/chosen": -0.8576774597167969, "logits/rejected": -0.9601131677627563, "logps/chosen": -0.42103275656700134, "logps/rejected": -0.41525495052337646, "loss": 1.5959, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0525819063186646, "rewards/margins": -0.014444507658481598, "rewards/rejected": -1.0381373167037964, "step": 410 }, { "epoch": 0.8784397542078547, "grad_norm": 7.204395294189453, "learning_rate": 2.1613635589349756e-08, "logits/chosen": -0.9204460978507996, "logits/rejected": -0.9429717063903809, "logps/chosen": -0.3754885196685791, "logps/rejected": -0.4179832935333252, "loss": 1.5211, "rewards/accuracies": 0.625, "rewards/chosen": -0.9387211799621582, "rewards/margins": 0.10623697191476822, "rewards/rejected": -1.0449581146240234, "step": 411 }, { "epoch": 0.8805770772107935, "grad_norm": 14.507763862609863, "learning_rate": 2.085943603250595e-08, "logits/chosen": -0.9056552648544312, "logits/rejected": -0.8666099309921265, "logps/chosen": -0.4410094618797302, "logps/rejected": -0.6149858236312866, "loss": 1.5065, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1025235652923584, "rewards/margins": 0.43494072556495667, "rewards/rejected": -1.5374643802642822, "step": 412 }, { "epoch": 0.8827144002137323, "grad_norm": 4.710812568664551, "learning_rate": 2.0118056862137354e-08, "logits/chosen": -0.9734061360359192, "logits/rejected": -0.8919450044631958, "logps/chosen": -0.4355347156524658, "logps/rejected": -0.4121510982513428, "loss": 1.5909, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0888367891311646, "rewards/margins": -0.058458905667066574, "rewards/rejected": -1.030377745628357, "step": 413 }, { "epoch": 0.8848517232166712, "grad_norm": 6.681394100189209, "learning_rate": 1.938953955831771e-08, "logits/chosen": -1.089871883392334, "logits/rejected": -1.0652118921279907, "logps/chosen": -0.45682665705680847, "logps/rejected": -0.5210414528846741, "loss": 1.4889, "rewards/accuracies": 0.625, "rewards/chosen": -1.1420665979385376, "rewards/margins": 0.1605370044708252, "rewards/rejected": -1.3026037216186523, "step": 414 }, { "epoch": 0.88698904621961, "grad_norm": 10.179585456848145, "learning_rate": 1.8673924881500823e-08, "logits/chosen": -0.9976410269737244, "logits/rejected": -1.0120102167129517, "logps/chosen": -0.7260380387306213, "logps/rejected": -0.9380686283111572, "loss": 1.5325, "rewards/accuracies": 0.6875, "rewards/chosen": -1.8150951862335205, "rewards/margins": 0.5300765037536621, "rewards/rejected": -2.3451716899871826, "step": 415 }, { "epoch": 0.8891263692225487, "grad_norm": 6.529333591461182, "learning_rate": 1.797125287024029e-08, "logits/chosen": -1.0898345708847046, "logits/rejected": -1.1249827146530151, "logps/chosen": -0.5659042596817017, "logps/rejected": -0.8098978996276855, "loss": 1.4598, "rewards/accuracies": 0.75, "rewards/chosen": -1.414760947227478, "rewards/margins": 0.6099839210510254, "rewards/rejected": -2.024744749069214, "step": 416 }, { "epoch": 0.8912636922254876, "grad_norm": 18.293628692626953, "learning_rate": 1.7281562838948966e-08, "logits/chosen": -0.9206041693687439, "logits/rejected": -0.9363532066345215, "logps/chosen": -0.6177800893783569, "logps/rejected": -0.5964070558547974, "loss": 1.6686, "rewards/accuracies": 0.4375, "rewards/chosen": -1.5444501638412476, "rewards/margins": -0.05343271791934967, "rewards/rejected": -1.4910173416137695, "step": 417 }, { "epoch": 0.8934010152284264, "grad_norm": 6.754497528076172, "learning_rate": 1.6604893375699592e-08, "logits/chosen": -1.1073287725448608, "logits/rejected": -0.9858848452568054, "logps/chosen": -0.4562823176383972, "logps/rejected": -0.5044858455657959, "loss": 1.5667, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1407058238983154, "rewards/margins": 0.12050891667604446, "rewards/rejected": -1.2612147331237793, "step": 418 }, { "epoch": 0.8955383382313652, "grad_norm": 4.217410087585449, "learning_rate": 1.5941282340065697e-08, "logits/chosen": -1.1437349319458008, "logits/rejected": -1.2322208881378174, "logps/chosen": -0.4406435191631317, "logps/rejected": -0.6784199476242065, "loss": 1.4618, "rewards/accuracies": 0.5, "rewards/chosen": -1.1016088724136353, "rewards/margins": 0.5944410562515259, "rewards/rejected": -1.6960498094558716, "step": 419 }, { "epoch": 0.897675661234304, "grad_norm": 16.80787467956543, "learning_rate": 1.5290766861003475e-08, "logits/chosen": -0.9160170555114746, "logits/rejected": -0.8729619383811951, "logps/chosen": -0.3433056175708771, "logps/rejected": -0.37045544385910034, "loss": 1.6206, "rewards/accuracies": 0.25, "rewards/chosen": -0.8582640290260315, "rewards/margins": 0.06787460297346115, "rewards/rejected": -0.926138699054718, "step": 420 }, { "epoch": 0.8998129842372429, "grad_norm": 13.28708553314209, "learning_rate": 1.4653383334774228e-08, "logits/chosen": -1.0449622869491577, "logits/rejected": -1.0788357257843018, "logps/chosen": -0.5669997930526733, "logps/rejected": -0.7596959471702576, "loss": 1.5144, "rewards/accuracies": 0.5625, "rewards/chosen": -1.4174995422363281, "rewards/margins": 0.4817403554916382, "rewards/rejected": -1.8992400169372559, "step": 421 }, { "epoch": 0.9019503072401817, "grad_norm": 7.049681663513184, "learning_rate": 1.4029167422908105e-08, "logits/chosen": -1.148837924003601, "logits/rejected": -1.0951889753341675, "logps/chosen": -0.4990730583667755, "logps/rejected": -0.607469916343689, "loss": 1.5152, "rewards/accuracies": 0.5, "rewards/chosen": -1.2476826906204224, "rewards/margins": 0.2709922194480896, "rewards/rejected": -1.5186748504638672, "step": 422 }, { "epoch": 0.9040876302431204, "grad_norm": 6.772598743438721, "learning_rate": 1.3418154050208936e-08, "logits/chosen": -0.9771215319633484, "logits/rejected": -0.9951186776161194, "logps/chosen": -0.5036316514015198, "logps/rejected": -0.6173194646835327, "loss": 1.5277, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2590792179107666, "rewards/margins": 0.28421932458877563, "rewards/rejected": -1.543298602104187, "step": 423 }, { "epoch": 0.9062249532460593, "grad_norm": 5.654405117034912, "learning_rate": 1.2820377402800064e-08, "logits/chosen": -0.8762426972389221, "logits/rejected": -0.6818346381187439, "logps/chosen": -0.4185902774333954, "logps/rejected": -0.9549139738082886, "loss": 1.4405, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0464756488800049, "rewards/margins": 1.3408091068267822, "rewards/rejected": -2.387284755706787, "step": 424 }, { "epoch": 0.9083622762489981, "grad_norm": 9.220845222473145, "learning_rate": 1.2235870926211616e-08, "logits/chosen": -0.9336291551589966, "logits/rejected": -0.9068048596382141, "logps/chosen": -0.4764101505279541, "logps/rejected": -0.6432890295982361, "loss": 1.5286, "rewards/accuracies": 0.5, "rewards/chosen": -1.1910252571105957, "rewards/margins": 0.41719722747802734, "rewards/rejected": -1.608222484588623, "step": 425 }, { "epoch": 0.9104995992519369, "grad_norm": 6.728222846984863, "learning_rate": 1.1664667323509347e-08, "logits/chosen": -1.0641913414001465, "logits/rejected": -0.9187748432159424, "logps/chosen": -0.3940516710281372, "logps/rejected": -0.4065035283565521, "loss": 1.5201, "rewards/accuracies": 0.5, "rewards/chosen": -0.985129177570343, "rewards/margins": 0.031129609793424606, "rewards/rejected": -1.0162588357925415, "step": 426 }, { "epoch": 0.9126369222548758, "grad_norm": 7.619234561920166, "learning_rate": 1.1106798553464802e-08, "logits/chosen": -0.954318642616272, "logits/rejected": -0.908359706401825, "logps/chosen": -0.4075399935245514, "logps/rejected": -0.4785918891429901, "loss": 1.5005, "rewards/accuracies": 0.625, "rewards/chosen": -1.0188499689102173, "rewards/margins": 0.17762985825538635, "rewards/rejected": -1.1964799165725708, "step": 427 }, { "epoch": 0.9147742452578146, "grad_norm": 6.409261226654053, "learning_rate": 1.0562295828767387e-08, "logits/chosen": -1.0399943590164185, "logits/rejected": -1.0340969562530518, "logps/chosen": -0.3984678387641907, "logps/rejected": -0.5552449226379395, "loss": 1.4632, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9961696863174438, "rewards/margins": 0.39194267988204956, "rewards/rejected": -1.3881123065948486, "step": 428 }, { "epoch": 0.9169115682607534, "grad_norm": 10.060012817382812, "learning_rate": 1.0031189614277763e-08, "logits/chosen": -0.9579813480377197, "logits/rejected": -0.9383722543716431, "logps/chosen": -0.5487989187240601, "logps/rejected": -0.5818829536437988, "loss": 1.5268, "rewards/accuracies": 0.625, "rewards/chosen": -1.371997356414795, "rewards/margins": 0.0827101320028305, "rewards/rejected": -1.4547075033187866, "step": 429 }, { "epoch": 0.9190488912636923, "grad_norm": 6.2424139976501465, "learning_rate": 9.513509625323518e-09, "logits/chosen": -0.9296804666519165, "logits/rejected": -0.9260187745094299, "logps/chosen": -0.39111167192459106, "logps/rejected": -0.45873600244522095, "loss": 1.4814, "rewards/accuracies": 0.6875, "rewards/chosen": -0.9777792096138, "rewards/margins": 0.16906076669692993, "rewards/rejected": -1.14683997631073, "step": 430 }, { "epoch": 0.921186214266631, "grad_norm": 10.799667358398438, "learning_rate": 9.009284826036689e-09, "logits/chosen": -0.892406702041626, "logits/rejected": -0.9241263270378113, "logps/chosen": -0.5018429160118103, "logps/rejected": -0.6716207265853882, "loss": 1.4844, "rewards/accuracies": 0.625, "rewards/chosen": -1.2546073198318481, "rewards/margins": 0.42444440722465515, "rewards/rejected": -1.6790517568588257, "step": 431 }, { "epoch": 0.9233235372695698, "grad_norm": 4.4624104499816895, "learning_rate": 8.518543427732949e-09, "logits/chosen": -0.9802000522613525, "logits/rejected": -0.9559367895126343, "logps/chosen": -0.38735339045524597, "logps/rejected": -0.6876262426376343, "loss": 1.481, "rewards/accuracies": 0.8125, "rewards/chosen": -0.9683833718299866, "rewards/margins": 0.7506821155548096, "rewards/rejected": -1.7190656661987305, "step": 432 }, { "epoch": 0.9254608602725087, "grad_norm": 7.951170444488525, "learning_rate": 8.041312887333396e-09, "logits/chosen": -0.9957330226898193, "logits/rejected": -0.9468004703521729, "logps/chosen": -0.4312437176704407, "logps/rejected": -0.5361820459365845, "loss": 1.4571, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0781091451644897, "rewards/margins": 0.26234593987464905, "rewards/rejected": -1.3404550552368164, "step": 433 }, { "epoch": 0.9275981832754475, "grad_norm": 12.679637908935547, "learning_rate": 7.577619905828281e-09, "logits/chosen": -1.0287508964538574, "logits/rejected": -0.9482178688049316, "logps/chosen": -0.40496230125427246, "logps/rejected": -0.38896051049232483, "loss": 1.4673, "rewards/accuracies": 0.375, "rewards/chosen": -1.0124057531356812, "rewards/margins": -0.040004514157772064, "rewards/rejected": -0.9724011421203613, "step": 434 }, { "epoch": 0.9297355062783863, "grad_norm": 6.805285930633545, "learning_rate": 7.127490426783123e-09, "logits/chosen": -1.1413686275482178, "logits/rejected": -1.09022057056427, "logps/chosen": -0.5725710391998291, "logps/rejected": -0.6725842952728271, "loss": 1.5341, "rewards/accuracies": 0.4375, "rewards/chosen": -1.4314275979995728, "rewards/margins": 0.2500333786010742, "rewards/rejected": -1.6814608573913574, "step": 435 }, { "epoch": 0.9318728292813251, "grad_norm": 5.759010314941406, "learning_rate": 6.6909496348871445e-09, "logits/chosen": -1.1474663019180298, "logits/rejected": -1.1745803356170654, "logps/chosen": -0.6815481185913086, "logps/rejected": -0.7699655294418335, "loss": 1.5144, "rewards/accuracies": 0.625, "rewards/chosen": -1.7038702964782715, "rewards/margins": 0.2210434526205063, "rewards/rejected": -1.9249136447906494, "step": 436 }, { "epoch": 0.934010152284264, "grad_norm": 4.481965065002441, "learning_rate": 6.268021954544095e-09, "logits/chosen": -0.9412963390350342, "logits/rejected": -0.9516785144805908, "logps/chosen": -0.3796120285987854, "logps/rejected": -0.37879857420921326, "loss": 1.5547, "rewards/accuracies": 0.375, "rewards/chosen": -0.9490300416946411, "rewards/margins": -0.002033662050962448, "rewards/rejected": -0.9469964504241943, "step": 437 }, { "epoch": 0.9361474752872028, "grad_norm": 6.431197166442871, "learning_rate": 5.858731048505927e-09, "logits/chosen": -1.076103687286377, "logits/rejected": -1.0976781845092773, "logps/chosen": -0.4042336046695709, "logps/rejected": -0.612984836101532, "loss": 1.4394, "rewards/accuracies": 0.6875, "rewards/chosen": -1.0105839967727661, "rewards/margins": 0.5218778848648071, "rewards/rejected": -1.5324620008468628, "step": 438 }, { "epoch": 0.9382847982901416, "grad_norm": 11.778403282165527, "learning_rate": 5.463099816548577e-09, "logits/chosen": -1.085112452507019, "logits/rejected": -1.024551272392273, "logps/chosen": -0.3431437611579895, "logps/rejected": -0.5487081408500671, "loss": 1.5601, "rewards/accuracies": 0.375, "rewards/chosen": -0.8578594923019409, "rewards/margins": 0.5139108896255493, "rewards/rejected": -1.3717702627182007, "step": 439 }, { "epoch": 0.9404221212930804, "grad_norm": 4.792006492614746, "learning_rate": 5.08115039419113e-09, "logits/chosen": -0.9755135774612427, "logits/rejected": -0.8824871778488159, "logps/chosen": -0.3806512653827667, "logps/rejected": -0.5343747735023499, "loss": 1.52, "rewards/accuracies": 0.5625, "rewards/chosen": -0.9516281485557556, "rewards/margins": 0.38430866599082947, "rewards/rejected": -1.3359367847442627, "step": 440 }, { "epoch": 0.9425594442960192, "grad_norm": 6.083741188049316, "learning_rate": 4.712904151456864e-09, "logits/chosen": -0.9559481143951416, "logits/rejected": -0.8965126872062683, "logps/chosen": -0.4216436743736267, "logps/rejected": -0.4634767770767212, "loss": 1.4695, "rewards/accuracies": 0.625, "rewards/chosen": -1.0541093349456787, "rewards/margins": 0.1045827567577362, "rewards/rejected": -1.1586920022964478, "step": 441 }, { "epoch": 0.944696767298958, "grad_norm": 4.866091251373291, "learning_rate": 4.358381691677931e-09, "logits/chosen": -0.9368703961372375, "logits/rejected": -0.8826941251754761, "logps/chosen": -0.3301496207714081, "logps/rejected": -0.38065895438194275, "loss": 1.4942, "rewards/accuracies": 0.625, "rewards/chosen": -0.8253740072250366, "rewards/margins": 0.1262734830379486, "rewards/rejected": -0.9516474008560181, "step": 442 }, { "epoch": 0.9468340903018969, "grad_norm": 7.398251056671143, "learning_rate": 4.0176028503425826e-09, "logits/chosen": -1.079075574874878, "logits/rejected": -1.00128972530365, "logps/chosen": -0.45567309856414795, "logps/rejected": -0.39884287118911743, "loss": 1.5388, "rewards/accuracies": 0.3125, "rewards/chosen": -1.1391828060150146, "rewards/margins": -0.14207565784454346, "rewards/rejected": -0.9971071481704712, "step": 443 }, { "epoch": 0.9489714133048357, "grad_norm": 9.869709014892578, "learning_rate": 3.6905866939851983e-09, "logits/chosen": -1.1050983667373657, "logits/rejected": -1.0123190879821777, "logps/chosen": -0.44024163484573364, "logps/rejected": -0.3926200568675995, "loss": 1.4883, "rewards/accuracies": 0.5625, "rewards/chosen": -1.1006040573120117, "rewards/margins": -0.1190539002418518, "rewards/rejected": -0.9815501570701599, "step": 444 }, { "epoch": 0.9511087363077745, "grad_norm": 4.558749198913574, "learning_rate": 3.3773515191196646e-09, "logits/chosen": -1.0600441694259644, "logits/rejected": -1.0637009143829346, "logps/chosen": -0.4824512004852295, "logps/rejected": -0.594225287437439, "loss": 1.5854, "rewards/accuracies": 0.5625, "rewards/chosen": -1.2061281204223633, "rewards/margins": 0.2794351577758789, "rewards/rejected": -1.4855631589889526, "step": 445 }, { "epoch": 0.9532460593107134, "grad_norm": 7.49786376953125, "learning_rate": 3.077914851215585e-09, "logits/chosen": -1.06005859375, "logits/rejected": -1.0157767534255981, "logps/chosen": -0.5099815130233765, "logps/rejected": -0.6151185035705566, "loss": 1.4851, "rewards/accuracies": 0.5, "rewards/chosen": -1.274953842163086, "rewards/margins": 0.26284244656562805, "rewards/rejected": -1.5377962589263916, "step": 446 }, { "epoch": 0.9553833823136522, "grad_norm": 5.8320746421813965, "learning_rate": 2.7922934437178692e-09, "logits/chosen": -0.8938602209091187, "logits/rejected": -0.9562631249427795, "logps/chosen": -0.37319353222846985, "logps/rejected": -0.37881189584732056, "loss": 1.4395, "rewards/accuracies": 0.375, "rewards/chosen": -0.9329838752746582, "rewards/margins": 0.0140459556132555, "rewards/rejected": -0.9470298290252686, "step": 447 }, { "epoch": 0.957520705316591, "grad_norm": 3.9899652004241943, "learning_rate": 2.5205032771092592e-09, "logits/chosen": -0.9867920279502869, "logits/rejected": -0.9630347490310669, "logps/chosen": -0.39268139004707336, "logps/rejected": -0.635047435760498, "loss": 1.5503, "rewards/accuracies": 0.625, "rewards/chosen": -0.9817034006118774, "rewards/margins": 0.6059151291847229, "rewards/rejected": -1.5876185894012451, "step": 448 }, { "epoch": 0.9596580283195298, "grad_norm": 7.979334354400635, "learning_rate": 2.2625595580163247e-09, "logits/chosen": -1.1002980470657349, "logits/rejected": -1.0980544090270996, "logps/chosen": -0.7607989311218262, "logps/rejected": -0.9997137784957886, "loss": 1.5401, "rewards/accuracies": 0.6875, "rewards/chosen": -1.9019973278045654, "rewards/margins": 0.5972872972488403, "rewards/rejected": -2.4992847442626953, "step": 449 }, { "epoch": 0.9617953513224686, "grad_norm": 21.347997665405273, "learning_rate": 2.0184767183584474e-09, "logits/chosen": -0.8544862866401672, "logits/rejected": -0.8984670042991638, "logps/chosen": -0.5437809228897095, "logps/rejected": -0.6033496856689453, "loss": 1.4988, "rewards/accuracies": 0.5625, "rewards/chosen": -1.359452247619629, "rewards/margins": 0.14892183244228363, "rewards/rejected": -1.5083742141723633, "step": 450 }, { "epoch": 0.9639326743254074, "grad_norm": 8.341211318969727, "learning_rate": 1.7882684145406612e-09, "logits/chosen": -1.0166016817092896, "logits/rejected": -1.0338623523712158, "logps/chosen": -0.5460320711135864, "logps/rejected": -0.5169766545295715, "loss": 1.5717, "rewards/accuracies": 0.5625, "rewards/chosen": -1.3650802373886108, "rewards/margins": -0.07263859361410141, "rewards/rejected": -1.292441725730896, "step": 451 }, { "epoch": 0.9660699973283462, "grad_norm": 12.869359970092773, "learning_rate": 1.5719475266893489e-09, "logits/chosen": -1.0716265439987183, "logits/rejected": -1.0736249685287476, "logps/chosen": -0.4328761696815491, "logps/rejected": -0.5213409066200256, "loss": 1.5382, "rewards/accuracies": 0.5, "rewards/chosen": -1.0821905136108398, "rewards/margins": 0.22116179764270782, "rewards/rejected": -1.3033523559570312, "step": 452 }, { "epoch": 0.9682073203312851, "grad_norm": 5.998004913330078, "learning_rate": 1.3695261579316775e-09, "logits/chosen": -1.1411190032958984, "logits/rejected": -1.0635725259780884, "logps/chosen": -0.6178188323974609, "logps/rejected": -0.592136561870575, "loss": 1.6393, "rewards/accuracies": 0.3125, "rewards/chosen": -1.5445469617843628, "rewards/margins": -0.064205601811409, "rewards/rejected": -1.4803414344787598, "step": 453 }, { "epoch": 0.9703446433342239, "grad_norm": 7.896289348602295, "learning_rate": 1.1810156337183908e-09, "logits/chosen": -1.009376883506775, "logits/rejected": -0.9895673394203186, "logps/chosen": -0.7131325602531433, "logps/rejected": -0.6051285266876221, "loss": 1.5398, "rewards/accuracies": 0.5, "rewards/chosen": -1.7828314304351807, "rewards/margins": -0.2700101137161255, "rewards/rejected": -1.5128213167190552, "step": 454 }, { "epoch": 0.9724819663371627, "grad_norm": 6.323320388793945, "learning_rate": 1.0064265011902328e-09, "logits/chosen": -1.0121572017669678, "logits/rejected": -0.9521965980529785, "logps/chosen": -0.5518695712089539, "logps/rejected": -0.5250836610794067, "loss": 1.5786, "rewards/accuracies": 0.3125, "rewards/chosen": -1.379673957824707, "rewards/margins": -0.06696499139070511, "rewards/rejected": -1.312708854675293, "step": 455 }, { "epoch": 0.9746192893401016, "grad_norm": 6.093278408050537, "learning_rate": 8.457685285878091e-10, "logits/chosen": -0.9040694236755371, "logits/rejected": -1.0100067853927612, "logps/chosen": -0.6011037230491638, "logps/rejected": -0.9459134340286255, "loss": 1.4765, "rewards/accuracies": 0.625, "rewards/chosen": -1.502759337425232, "rewards/margins": 0.8620242476463318, "rewards/rejected": -2.364783525466919, "step": 456 }, { "epoch": 0.9767566123430403, "grad_norm": 7.644316673278809, "learning_rate": 6.990507047049676e-10, "logits/chosen": -1.1642239093780518, "logits/rejected": -1.327940583229065, "logps/chosen": -0.7490012645721436, "logps/rejected": -0.8202410340309143, "loss": 1.6319, "rewards/accuracies": 0.4375, "rewards/chosen": -1.8725032806396484, "rewards/margins": 0.1780991107225418, "rewards/rejected": -2.0506021976470947, "step": 457 }, { "epoch": 0.9788939353459791, "grad_norm": 4.500396728515625, "learning_rate": 5.662812383859794e-10, "logits/chosen": -1.057512640953064, "logits/rejected": -1.0183889865875244, "logps/chosen": -0.5618507862091064, "logps/rejected": -0.7458251714706421, "loss": 1.5519, "rewards/accuracies": 0.625, "rewards/chosen": -1.4046270847320557, "rewards/margins": 0.4599360227584839, "rewards/rejected": -1.86456298828125, "step": 458 }, { "epoch": 0.981031258348918, "grad_norm": 5.931386470794678, "learning_rate": 4.4746755806621126e-10, "logits/chosen": -1.0571941137313843, "logits/rejected": -1.1213597059249878, "logps/chosen": -0.642679750919342, "logps/rejected": -0.7841815948486328, "loss": 1.4134, "rewards/accuracies": 0.6875, "rewards/chosen": -1.6066994667053223, "rewards/margins": 0.3537544906139374, "rewards/rejected": -1.9604538679122925, "step": 459 }, { "epoch": 0.9831685813518568, "grad_norm": 9.15275764465332, "learning_rate": 3.4261631135654167e-10, "logits/chosen": -0.8693954348564148, "logits/rejected": -0.7475937008857727, "logps/chosen": -0.3651605546474457, "logps/rejected": -0.42595067620277405, "loss": 1.4887, "rewards/accuracies": 0.625, "rewards/chosen": -0.9129014015197754, "rewards/margins": 0.15197524428367615, "rewards/rejected": -1.064876675605774, "step": 460 }, { "epoch": 0.9853059043547956, "grad_norm": 11.78666877746582, "learning_rate": 2.5173336467135263e-10, "logits/chosen": -1.1248339414596558, "logits/rejected": -1.013832688331604, "logps/chosen": -0.48437923192977905, "logps/rejected": -0.5153346657752991, "loss": 1.487, "rewards/accuracies": 0.6875, "rewards/chosen": -1.2109479904174805, "rewards/margins": 0.07738865166902542, "rewards/rejected": -1.2883366346359253, "step": 461 }, { "epoch": 0.9874432273577345, "grad_norm": 5.332235336303711, "learning_rate": 1.7482380290034792e-10, "logits/chosen": -1.0759484767913818, "logits/rejected": -0.9958518147468567, "logps/chosen": -0.4139256775379181, "logps/rejected": -0.7975391745567322, "loss": 1.3972, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0348142385482788, "rewards/margins": 0.9590335488319397, "rewards/rejected": -1.9938479661941528, "step": 462 }, { "epoch": 0.9895805503606733, "grad_norm": 5.443465709686279, "learning_rate": 1.1189192912416933e-10, "logits/chosen": -1.0934535264968872, "logits/rejected": -1.0068289041519165, "logps/chosen": -0.48808667063713074, "logps/rejected": -0.6034483313560486, "loss": 1.4738, "rewards/accuracies": 0.5, "rewards/chosen": -1.2202166318893433, "rewards/margins": 0.2884041368961334, "rewards/rejected": -1.5086207389831543, "step": 463 }, { "epoch": 0.9917178733636121, "grad_norm": 5.87878942489624, "learning_rate": 6.294126437336733e-11, "logits/chosen": -1.0164133310317993, "logits/rejected": -0.9699276685714722, "logps/chosen": -0.4373021125793457, "logps/rejected": -0.5335453152656555, "loss": 1.4684, "rewards/accuracies": 0.4375, "rewards/chosen": -1.0932552814483643, "rewards/margins": 0.24060802161693573, "rewards/rejected": -1.3338632583618164, "step": 464 }, { "epoch": 0.9938551963665508, "grad_norm": 4.464337348937988, "learning_rate": 2.797454743164174e-11, "logits/chosen": -1.2042040824890137, "logits/rejected": -1.055449366569519, "logps/chosen": -0.3999456763267517, "logps/rejected": -0.4777381420135498, "loss": 1.5844, "rewards/accuracies": 0.625, "rewards/chosen": -0.9998641610145569, "rewards/margins": 0.19448117911815643, "rewards/rejected": -1.1943453550338745, "step": 465 }, { "epoch": 0.9959925193694897, "grad_norm": 9.290578842163086, "learning_rate": 6.993734682547714e-12, "logits/chosen": -0.8978444933891296, "logits/rejected": -0.8331681489944458, "logps/chosen": -0.521608293056488, "logps/rejected": -0.5517727136611938, "loss": 1.5965, "rewards/accuracies": 0.375, "rewards/chosen": -1.304020643234253, "rewards/margins": 0.07541122287511826, "rewards/rejected": -1.379431962966919, "step": 466 }, { "epoch": 0.9981298423724285, "grad_norm": 12.642468452453613, "learning_rate": 0.0, "logits/chosen": -0.8587179183959961, "logits/rejected": -0.8208239078521729, "logps/chosen": -0.4356221556663513, "logps/rejected": -0.39131855964660645, "loss": 1.4854, "rewards/accuracies": 0.5625, "rewards/chosen": -1.0890554189682007, "rewards/margins": -0.11075909435749054, "rewards/rejected": -0.9782962799072266, "step": 467 }, { "epoch": 0.9981298423724285, "step": 467, "total_flos": 0.0, "train_loss": 0.0, "train_runtime": 0.0036, "train_samples_per_second": 16407823.488, "train_steps_per_second": 127972.035 } ], "logging_steps": 1, "max_steps": 467, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 32, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }