simpo-baseline / trainer_state.json
ZefanW's picture
Model save
bba4123 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.9981298423724285,
"eval_steps": 500,
"global_step": 467,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0021373230029388193,
"grad_norm": 2.51713228225708,
"learning_rate": 1.0638297872340425e-08,
"logits/chosen": -1.1381689310073853,
"logits/rejected": -0.9913416504859924,
"logps/chosen": -0.2839311957359314,
"logps/rejected": -0.2955534756183624,
"loss": 1.6097,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5678623914718628,
"rewards/margins": 0.023244591429829597,
"rewards/rejected": -0.5911069512367249,
"step": 1
},
{
"epoch": 0.004274646005877639,
"grad_norm": 6.541850566864014,
"learning_rate": 2.127659574468085e-08,
"logits/chosen": -1.0311710834503174,
"logits/rejected": -0.8901023864746094,
"logps/chosen": -0.24952735006809235,
"logps/rejected": -0.24253402650356293,
"loss": 1.6096,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4990547001361847,
"rewards/margins": -0.013986671343445778,
"rewards/rejected": -0.48506805300712585,
"step": 2
},
{
"epoch": 0.006411969008816457,
"grad_norm": 5.6596479415893555,
"learning_rate": 3.191489361702127e-08,
"logits/chosen": -0.9279628992080688,
"logits/rejected": -0.8305555582046509,
"logps/chosen": -0.2633163630962372,
"logps/rejected": -0.26702702045440674,
"loss": 1.6174,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5266327261924744,
"rewards/margins": 0.007421246729791164,
"rewards/rejected": -0.5340540409088135,
"step": 3
},
{
"epoch": 0.008549292011755277,
"grad_norm": 3.8121635913848877,
"learning_rate": 4.25531914893617e-08,
"logits/chosen": -0.8504582047462463,
"logits/rejected": -0.7527742981910706,
"logps/chosen": -0.2771408259868622,
"logps/rejected": -0.26471394300460815,
"loss": 1.6393,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5542816519737244,
"rewards/margins": -0.024853792041540146,
"rewards/rejected": -0.5294278860092163,
"step": 4
},
{
"epoch": 0.010686615014694095,
"grad_norm": 6.048301696777344,
"learning_rate": 5.3191489361702123e-08,
"logits/chosen": -1.156632661819458,
"logits/rejected": -1.2128832340240479,
"logps/chosen": -0.28773820400238037,
"logps/rejected": -0.29937219619750977,
"loss": 1.6108,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5754764080047607,
"rewards/margins": 0.023267941549420357,
"rewards/rejected": -0.5987443923950195,
"step": 5
},
{
"epoch": 0.012823938017632914,
"grad_norm": 3.6442198753356934,
"learning_rate": 6.382978723404254e-08,
"logits/chosen": -1.0647015571594238,
"logits/rejected": -1.031942367553711,
"logps/chosen": -0.25931063294410706,
"logps/rejected": -0.28003033995628357,
"loss": 1.6079,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5186212658882141,
"rewards/margins": 0.04143940657377243,
"rewards/rejected": -0.5600606799125671,
"step": 6
},
{
"epoch": 0.014961261020571734,
"grad_norm": 5.595146656036377,
"learning_rate": 7.446808510638298e-08,
"logits/chosen": -0.7785481810569763,
"logits/rejected": -0.7654089331626892,
"logps/chosen": -0.25532105565071106,
"logps/rejected": -0.24814245104789734,
"loss": 1.6092,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5106421113014221,
"rewards/margins": -0.01435722503811121,
"rewards/rejected": -0.4962849020957947,
"step": 7
},
{
"epoch": 0.017098584023510555,
"grad_norm": 2.9471020698547363,
"learning_rate": 8.51063829787234e-08,
"logits/chosen": -1.0282069444656372,
"logits/rejected": -1.0483824014663696,
"logps/chosen": -0.24546000361442566,
"logps/rejected": -0.2658500373363495,
"loss": 1.5902,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4909200072288513,
"rewards/margins": 0.04078003019094467,
"rewards/rejected": -0.531700074672699,
"step": 8
},
{
"epoch": 0.01923590702644937,
"grad_norm": 3.132836103439331,
"learning_rate": 9.574468085106382e-08,
"logits/chosen": -0.9889479875564575,
"logits/rejected": -0.8638209104537964,
"logps/chosen": -0.27614107728004456,
"logps/rejected": -0.2566734254360199,
"loss": 1.6173,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5522821545600891,
"rewards/margins": -0.03893527761101723,
"rewards/rejected": -0.5133468508720398,
"step": 9
},
{
"epoch": 0.02137323002938819,
"grad_norm": 5.624292850494385,
"learning_rate": 1.0638297872340425e-07,
"logits/chosen": -1.0719839334487915,
"logits/rejected": -1.0015329122543335,
"logps/chosen": -0.32535240054130554,
"logps/rejected": -0.31745338439941406,
"loss": 1.6211,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6507048010826111,
"rewards/margins": -0.015798063948750496,
"rewards/rejected": -0.6349067687988281,
"step": 10
},
{
"epoch": 0.02351055303232701,
"grad_norm": 5.1507039070129395,
"learning_rate": 1.1702127659574468e-07,
"logits/chosen": -0.9715439677238464,
"logits/rejected": -0.8908199071884155,
"logps/chosen": -0.2835432291030884,
"logps/rejected": -0.2507440745830536,
"loss": 1.612,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5670864582061768,
"rewards/margins": -0.0655982717871666,
"rewards/rejected": -0.5014881491661072,
"step": 11
},
{
"epoch": 0.02564787603526583,
"grad_norm": 2.2926666736602783,
"learning_rate": 1.2765957446808508e-07,
"logits/chosen": -0.9799962639808655,
"logits/rejected": -1.0184035301208496,
"logps/chosen": -0.29446908831596375,
"logps/rejected": -0.26765191555023193,
"loss": 1.6202,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5889381766319275,
"rewards/margins": -0.05363432317972183,
"rewards/rejected": -0.5353038311004639,
"step": 12
},
{
"epoch": 0.027785199038204648,
"grad_norm": 5.308409690856934,
"learning_rate": 1.3829787234042553e-07,
"logits/chosen": -0.8681848049163818,
"logits/rejected": -0.8799771070480347,
"logps/chosen": -0.3181426227092743,
"logps/rejected": -0.3121987581253052,
"loss": 1.6031,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6362852454185486,
"rewards/margins": -0.011887717992067337,
"rewards/rejected": -0.6243975162506104,
"step": 13
},
{
"epoch": 0.029922522041143467,
"grad_norm": 4.573068618774414,
"learning_rate": 1.4893617021276595e-07,
"logits/chosen": -0.8867932558059692,
"logits/rejected": -0.861649751663208,
"logps/chosen": -0.312772661447525,
"logps/rejected": -0.29462364315986633,
"loss": 1.6226,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.62554532289505,
"rewards/margins": -0.03629804030060768,
"rewards/rejected": -0.5892472863197327,
"step": 14
},
{
"epoch": 0.03205984504408229,
"grad_norm": 4.1025872230529785,
"learning_rate": 1.5957446808510638e-07,
"logits/chosen": -1.1116752624511719,
"logits/rejected": -0.9415389895439148,
"logps/chosen": -0.27133169770240784,
"logps/rejected": -0.29030919075012207,
"loss": 1.5818,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5426633954048157,
"rewards/margins": 0.037955012172460556,
"rewards/rejected": -0.5806183815002441,
"step": 15
},
{
"epoch": 0.03419716804702111,
"grad_norm": 3.307173728942871,
"learning_rate": 1.702127659574468e-07,
"logits/chosen": -0.9105625152587891,
"logits/rejected": -0.8872620463371277,
"logps/chosen": -0.2662544846534729,
"logps/rejected": -0.28296971321105957,
"loss": 1.6112,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5325089693069458,
"rewards/margins": 0.03343046456575394,
"rewards/rejected": -0.5659394264221191,
"step": 16
},
{
"epoch": 0.03633449104995993,
"grad_norm": 6.173768997192383,
"learning_rate": 1.8085106382978725e-07,
"logits/chosen": -0.7553848028182983,
"logits/rejected": -0.7946615815162659,
"logps/chosen": -0.277927964925766,
"logps/rejected": -0.28916075825691223,
"loss": 1.5928,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.555855929851532,
"rewards/margins": 0.02246551401913166,
"rewards/rejected": -0.5783215165138245,
"step": 17
},
{
"epoch": 0.03847181405289874,
"grad_norm": 3.708397626876831,
"learning_rate": 1.9148936170212765e-07,
"logits/chosen": -1.0742344856262207,
"logits/rejected": -1.1560362577438354,
"logps/chosen": -0.2530558407306671,
"logps/rejected": -0.2565101981163025,
"loss": 1.6245,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5061116814613342,
"rewards/margins": 0.006908770650625229,
"rewards/rejected": -0.513020396232605,
"step": 18
},
{
"epoch": 0.04060913705583756,
"grad_norm": 4.8654351234436035,
"learning_rate": 2.0212765957446807e-07,
"logits/chosen": -1.1306225061416626,
"logits/rejected": -1.0444625616073608,
"logps/chosen": -0.2724864184856415,
"logps/rejected": -0.2817416787147522,
"loss": 1.6247,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.544972836971283,
"rewards/margins": 0.018510470166802406,
"rewards/rejected": -0.5634833574295044,
"step": 19
},
{
"epoch": 0.04274646005877638,
"grad_norm": 3.5271363258361816,
"learning_rate": 2.127659574468085e-07,
"logits/chosen": -1.0348137617111206,
"logits/rejected": -1.0212081670761108,
"logps/chosen": -0.2397567480802536,
"logps/rejected": -0.23578569293022156,
"loss": 1.6172,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.4795134961605072,
"rewards/margins": -0.007942091673612595,
"rewards/rejected": -0.4715713858604431,
"step": 20
},
{
"epoch": 0.0448837830617152,
"grad_norm": 7.901147842407227,
"learning_rate": 2.2340425531914892e-07,
"logits/chosen": -1.1679033041000366,
"logits/rejected": -1.0415174961090088,
"logps/chosen": -0.33534738421440125,
"logps/rejected": -0.27388396859169006,
"loss": 1.6502,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6706947684288025,
"rewards/margins": -0.12292689830064774,
"rewards/rejected": -0.5477679371833801,
"step": 21
},
{
"epoch": 0.04702110606465402,
"grad_norm": 2.3991823196411133,
"learning_rate": 2.3404255319148937e-07,
"logits/chosen": -1.0736172199249268,
"logits/rejected": -1.0771551132202148,
"logps/chosen": -0.2646552622318268,
"logps/rejected": -0.2733539938926697,
"loss": 1.6048,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.5293105244636536,
"rewards/margins": 0.01739754155278206,
"rewards/rejected": -0.5467079877853394,
"step": 22
},
{
"epoch": 0.04915842906759284,
"grad_norm": 4.812252998352051,
"learning_rate": 2.4468085106382976e-07,
"logits/chosen": -0.8147614002227783,
"logits/rejected": -0.9166449904441833,
"logps/chosen": -0.28619590401649475,
"logps/rejected": -0.2908383309841156,
"loss": 1.5764,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5723918080329895,
"rewards/margins": 0.009284832514822483,
"rewards/rejected": -0.5816766619682312,
"step": 23
},
{
"epoch": 0.05129575207053166,
"grad_norm": 5.214301109313965,
"learning_rate": 2.5531914893617016e-07,
"logits/chosen": -1.0316184759140015,
"logits/rejected": -1.0412724018096924,
"logps/chosen": -0.23989242315292358,
"logps/rejected": -0.26728230714797974,
"loss": 1.602,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.47978484630584717,
"rewards/margins": 0.0547797717154026,
"rewards/rejected": -0.5345646142959595,
"step": 24
},
{
"epoch": 0.053433075073470476,
"grad_norm": 3.372835636138916,
"learning_rate": 2.659574468085106e-07,
"logits/chosen": -1.0795375108718872,
"logits/rejected": -0.9741866588592529,
"logps/chosen": -0.28838473558425903,
"logps/rejected": -0.32610005140304565,
"loss": 1.6016,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5767694711685181,
"rewards/margins": 0.07543070614337921,
"rewards/rejected": -0.6522001028060913,
"step": 25
},
{
"epoch": 0.055570398076409296,
"grad_norm": 3.9052999019622803,
"learning_rate": 2.7659574468085106e-07,
"logits/chosen": -1.2568001747131348,
"logits/rejected": -1.1107139587402344,
"logps/chosen": -0.30466389656066895,
"logps/rejected": -0.2980763614177704,
"loss": 1.6209,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6093277931213379,
"rewards/margins": -0.0131750563159585,
"rewards/rejected": -0.5961527228355408,
"step": 26
},
{
"epoch": 0.057707721079348115,
"grad_norm": 3.9069981575012207,
"learning_rate": 2.872340425531915e-07,
"logits/chosen": -1.0098018646240234,
"logits/rejected": -0.9794459342956543,
"logps/chosen": -0.2699134051799774,
"logps/rejected": -0.28315117955207825,
"loss": 1.6203,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5398268103599548,
"rewards/margins": 0.026475582271814346,
"rewards/rejected": -0.5663023591041565,
"step": 27
},
{
"epoch": 0.059845044082286934,
"grad_norm": 4.644921779632568,
"learning_rate": 2.978723404255319e-07,
"logits/chosen": -0.8839479088783264,
"logits/rejected": -0.9320971965789795,
"logps/chosen": -0.2668587565422058,
"logps/rejected": -0.27507418394088745,
"loss": 1.6176,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5337175130844116,
"rewards/margins": 0.016430813819169998,
"rewards/rejected": -0.5501483678817749,
"step": 28
},
{
"epoch": 0.061982367085225754,
"grad_norm": 3.2341363430023193,
"learning_rate": 3.085106382978723e-07,
"logits/chosen": -1.0859841108322144,
"logits/rejected": -1.0080296993255615,
"logps/chosen": -0.2636515498161316,
"logps/rejected": -0.2644122838973999,
"loss": 1.6185,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5273030996322632,
"rewards/margins": 0.0015215259045362473,
"rewards/rejected": -0.5288245677947998,
"step": 29
},
{
"epoch": 0.06411969008816458,
"grad_norm": 5.580157279968262,
"learning_rate": 3.1914893617021275e-07,
"logits/chosen": -1.170966386795044,
"logits/rejected": -0.9350689053535461,
"logps/chosen": -0.2749802768230438,
"logps/rejected": -0.2526704668998718,
"loss": 1.6164,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5499605536460876,
"rewards/margins": -0.04461963474750519,
"rewards/rejected": -0.5053409337997437,
"step": 30
},
{
"epoch": 0.06625701309110339,
"grad_norm": 4.681908130645752,
"learning_rate": 3.2978723404255315e-07,
"logits/chosen": -1.0664238929748535,
"logits/rejected": -0.9249334335327148,
"logps/chosen": -0.26851335167884827,
"logps/rejected": -0.3246592581272125,
"loss": 1.5989,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5370267033576965,
"rewards/margins": 0.11229176819324493,
"rewards/rejected": -0.649318516254425,
"step": 31
},
{
"epoch": 0.06839433609404222,
"grad_norm": 7.798113822937012,
"learning_rate": 3.404255319148936e-07,
"logits/chosen": -0.8868415951728821,
"logits/rejected": -0.8269252777099609,
"logps/chosen": -0.26608309149742126,
"logps/rejected": -0.29178884625434875,
"loss": 1.6,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5321661829948425,
"rewards/margins": 0.051411453634500504,
"rewards/rejected": -0.5835776925086975,
"step": 32
},
{
"epoch": 0.07053165909698103,
"grad_norm": 3.8565964698791504,
"learning_rate": 3.5106382978723405e-07,
"logits/chosen": -1.075560450553894,
"logits/rejected": -0.9206546545028687,
"logps/chosen": -0.3033750355243683,
"logps/rejected": -0.2647935748100281,
"loss": 1.6255,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6067500710487366,
"rewards/margins": -0.07716288417577744,
"rewards/rejected": -0.5295871496200562,
"step": 33
},
{
"epoch": 0.07266898209991986,
"grad_norm": 4.738920211791992,
"learning_rate": 3.617021276595745e-07,
"logits/chosen": -1.0078967809677124,
"logits/rejected": -0.9841946363449097,
"logps/chosen": -0.29649823904037476,
"logps/rejected": -0.3331226706504822,
"loss": 1.6,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5929964780807495,
"rewards/margins": 0.07324886322021484,
"rewards/rejected": -0.6662453413009644,
"step": 34
},
{
"epoch": 0.07480630510285867,
"grad_norm": 15.601210594177246,
"learning_rate": 3.7234042553191484e-07,
"logits/chosen": -1.0271260738372803,
"logits/rejected": -1.0070686340332031,
"logps/chosen": -0.2500755488872528,
"logps/rejected": -0.2826491892337799,
"loss": 1.6131,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.5001510977745056,
"rewards/margins": 0.06514722108840942,
"rewards/rejected": -0.5652983784675598,
"step": 35
},
{
"epoch": 0.07694362810579748,
"grad_norm": 7.088011264801025,
"learning_rate": 3.829787234042553e-07,
"logits/chosen": -0.7224124670028687,
"logits/rejected": -0.5971524119377136,
"logps/chosen": -0.2726445198059082,
"logps/rejected": -0.2940409481525421,
"loss": 1.603,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5452890396118164,
"rewards/margins": 0.04279275983572006,
"rewards/rejected": -0.5880818963050842,
"step": 36
},
{
"epoch": 0.07908095110873631,
"grad_norm": 4.70820426940918,
"learning_rate": 3.9361702127659574e-07,
"logits/chosen": -0.979728102684021,
"logits/rejected": -0.9153163433074951,
"logps/chosen": -0.27593153715133667,
"logps/rejected": -0.26201528310775757,
"loss": 1.6146,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5518630743026733,
"rewards/margins": -0.02783256769180298,
"rewards/rejected": -0.5240305662155151,
"step": 37
},
{
"epoch": 0.08121827411167512,
"grad_norm": 8.7505464553833,
"learning_rate": 4.0425531914893614e-07,
"logits/chosen": -0.8443434238433838,
"logits/rejected": -0.8855568170547485,
"logps/chosen": -0.29990217089653015,
"logps/rejected": -0.2905019521713257,
"loss": 1.6493,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.5998043417930603,
"rewards/margins": -0.0188005194067955,
"rewards/rejected": -0.5810039043426514,
"step": 38
},
{
"epoch": 0.08335559711461395,
"grad_norm": 5.359803676605225,
"learning_rate": 4.148936170212766e-07,
"logits/chosen": -1.0560702085494995,
"logits/rejected": -1.1278265714645386,
"logps/chosen": -0.25392618775367737,
"logps/rejected": -0.2735791802406311,
"loss": 1.5949,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5078523755073547,
"rewards/margins": 0.039305973798036575,
"rewards/rejected": -0.5471583604812622,
"step": 39
},
{
"epoch": 0.08549292011755276,
"grad_norm": 3.1088671684265137,
"learning_rate": 4.25531914893617e-07,
"logits/chosen": -1.0592100620269775,
"logits/rejected": -1.0815989971160889,
"logps/chosen": -0.2885398864746094,
"logps/rejected": -0.2929195761680603,
"loss": 1.6321,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5770797729492188,
"rewards/margins": 0.00875941477715969,
"rewards/rejected": -0.5858391523361206,
"step": 40
},
{
"epoch": 0.08763024312049159,
"grad_norm": 8.073966026306152,
"learning_rate": 4.3617021276595744e-07,
"logits/chosen": -1.0096590518951416,
"logits/rejected": -0.8713966012001038,
"logps/chosen": -0.30629193782806396,
"logps/rejected": -0.33664122223854065,
"loss": 1.5914,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6125838756561279,
"rewards/margins": 0.06069856137037277,
"rewards/rejected": -0.6732824444770813,
"step": 41
},
{
"epoch": 0.0897675661234304,
"grad_norm": 5.209786891937256,
"learning_rate": 4.4680851063829783e-07,
"logits/chosen": -1.0377849340438843,
"logits/rejected": -0.8914337754249573,
"logps/chosen": -0.2845829427242279,
"logps/rejected": -0.3244422674179077,
"loss": 1.6121,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5691658854484558,
"rewards/margins": 0.079718679189682,
"rewards/rejected": -0.6488845348358154,
"step": 42
},
{
"epoch": 0.09190488912636922,
"grad_norm": 4.667476177215576,
"learning_rate": 4.574468085106383e-07,
"logits/chosen": -0.7347361445426941,
"logits/rejected": -0.7869642376899719,
"logps/chosen": -0.3507947623729706,
"logps/rejected": -0.27199897170066833,
"loss": 1.6222,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7015895247459412,
"rewards/margins": -0.1575915366411209,
"rewards/rejected": -0.5439979434013367,
"step": 43
},
{
"epoch": 0.09404221212930804,
"grad_norm": 14.311481475830078,
"learning_rate": 4.6808510638297873e-07,
"logits/chosen": -0.8943421840667725,
"logits/rejected": -0.836614727973938,
"logps/chosen": -0.4167774021625519,
"logps/rejected": -0.430794894695282,
"loss": 1.597,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8335548043251038,
"rewards/margins": 0.028035037219524384,
"rewards/rejected": -0.861589789390564,
"step": 44
},
{
"epoch": 0.09617953513224686,
"grad_norm": 3.08385968208313,
"learning_rate": 4.787234042553192e-07,
"logits/chosen": -0.9741953015327454,
"logits/rejected": -0.8605018258094788,
"logps/chosen": -0.2905868887901306,
"logps/rejected": -0.29014959931373596,
"loss": 1.6179,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5811737775802612,
"rewards/margins": -0.0008745882660150528,
"rewards/rejected": -0.5802991986274719,
"step": 45
},
{
"epoch": 0.09831685813518568,
"grad_norm": 3.141914129257202,
"learning_rate": 4.893617021276595e-07,
"logits/chosen": -0.8467612266540527,
"logits/rejected": -0.8879311084747314,
"logps/chosen": -0.2710065543651581,
"logps/rejected": -0.28622525930404663,
"loss": 1.6098,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5420131087303162,
"rewards/margins": 0.03043745458126068,
"rewards/rejected": -0.5724505186080933,
"step": 46
},
{
"epoch": 0.1004541811381245,
"grad_norm": 5.874278545379639,
"learning_rate": 5e-07,
"logits/chosen": -0.9935128688812256,
"logits/rejected": -1.0635360479354858,
"logps/chosen": -0.2610815465450287,
"logps/rejected": -0.2970622777938843,
"loss": 1.5882,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5221630930900574,
"rewards/margins": 0.07196150720119476,
"rewards/rejected": -0.5941245555877686,
"step": 47
},
{
"epoch": 0.10259150414106331,
"grad_norm": 3.674631118774414,
"learning_rate": 4.999930062653174e-07,
"logits/chosen": -0.7607293725013733,
"logits/rejected": -0.9387491941452026,
"logps/chosen": -0.30105069279670715,
"logps/rejected": -0.29622718691825867,
"loss": 1.6263,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6021013855934143,
"rewards/margins": -0.009646959602832794,
"rewards/rejected": -0.5924543738365173,
"step": 48
},
{
"epoch": 0.10472882714400214,
"grad_norm": 3.2993836402893066,
"learning_rate": 4.999720254525684e-07,
"logits/chosen": -1.041825294494629,
"logits/rejected": -0.8979977965354919,
"logps/chosen": -0.3147028684616089,
"logps/rejected": -0.32463401556015015,
"loss": 1.5836,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6294057369232178,
"rewards/margins": 0.019862275570631027,
"rewards/rejected": -0.6492680311203003,
"step": 49
},
{
"epoch": 0.10686615014694095,
"grad_norm": 3.6394598484039307,
"learning_rate": 4.999370587356267e-07,
"logits/chosen": -1.0319520235061646,
"logits/rejected": -0.9399799108505249,
"logps/chosen": -0.3198903501033783,
"logps/rejected": -0.33650463819503784,
"loss": 1.61,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.6397807002067566,
"rewards/margins": 0.03322865813970566,
"rewards/rejected": -0.6730092763900757,
"step": 50
},
{
"epoch": 0.10900347314987978,
"grad_norm": 3.5822248458862305,
"learning_rate": 4.998881080708758e-07,
"logits/chosen": -0.7624353170394897,
"logits/rejected": -0.7781803011894226,
"logps/chosen": -0.22195202112197876,
"logps/rejected": -0.2529197931289673,
"loss": 1.6014,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4439040422439575,
"rewards/margins": 0.061935484409332275,
"rewards/rejected": -0.5058395862579346,
"step": 51
},
{
"epoch": 0.11114079615281859,
"grad_norm": 4.502132415771484,
"learning_rate": 4.998251761970996e-07,
"logits/chosen": -0.934096097946167,
"logits/rejected": -0.9894377589225769,
"logps/chosen": -0.3010854721069336,
"logps/rejected": -0.2971184551715851,
"loss": 1.6238,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6021709442138672,
"rewards/margins": -0.00793398916721344,
"rewards/rejected": -0.5942369103431702,
"step": 52
},
{
"epoch": 0.11327811915575742,
"grad_norm": 14.495563507080078,
"learning_rate": 4.997482666353286e-07,
"logits/chosen": -0.9065138101577759,
"logits/rejected": -0.8083285093307495,
"logps/chosen": -0.2879031002521515,
"logps/rejected": -0.30471161007881165,
"loss": 1.6036,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.575806200504303,
"rewards/margins": 0.03361699730157852,
"rewards/rejected": -0.6094232201576233,
"step": 53
},
{
"epoch": 0.11541544215869623,
"grad_norm": 5.210042953491211,
"learning_rate": 4.996573836886434e-07,
"logits/chosen": -1.012821912765503,
"logits/rejected": -0.935365617275238,
"logps/chosen": -0.27059802412986755,
"logps/rejected": -0.28305694460868835,
"loss": 1.5922,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5411960482597351,
"rewards/margins": 0.024917850270867348,
"rewards/rejected": -0.5661138892173767,
"step": 54
},
{
"epoch": 0.11755276516163506,
"grad_norm": 3.4929800033569336,
"learning_rate": 4.995525324419337e-07,
"logits/chosen": -1.03290593624115,
"logits/rejected": -0.8397963047027588,
"logps/chosen": -0.23197168111801147,
"logps/rejected": -0.257206529378891,
"loss": 1.6012,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.46394336223602295,
"rewards/margins": 0.05046967417001724,
"rewards/rejected": -0.514413058757782,
"step": 55
},
{
"epoch": 0.11969008816457387,
"grad_norm": 6.639918804168701,
"learning_rate": 4.99433718761614e-07,
"logits/chosen": -0.8676168918609619,
"logits/rejected": -0.8751212954521179,
"logps/chosen": -0.2813611626625061,
"logps/rejected": -0.28943243622779846,
"loss": 1.602,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5627223253250122,
"rewards/margins": 0.01614254154264927,
"rewards/rejected": -0.5788648724555969,
"step": 56
},
{
"epoch": 0.1218274111675127,
"grad_norm": 3.159461736679077,
"learning_rate": 4.993009492952949e-07,
"logits/chosen": -0.9598115682601929,
"logits/rejected": -0.9728808999061584,
"logps/chosen": -0.2418256551027298,
"logps/rejected": -0.27858078479766846,
"loss": 1.6025,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4836513102054596,
"rewards/margins": 0.07351024448871613,
"rewards/rejected": -0.5571615695953369,
"step": 57
},
{
"epoch": 0.12396473417045151,
"grad_norm": 3.1260619163513184,
"learning_rate": 4.991542314714122e-07,
"logits/chosen": -1.1715333461761475,
"logits/rejected": -1.0372506380081177,
"logps/chosen": -0.2886565625667572,
"logps/rejected": -0.3048909306526184,
"loss": 1.6142,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5773131251335144,
"rewards/margins": 0.03246863931417465,
"rewards/rejected": -0.6097818613052368,
"step": 58
},
{
"epoch": 0.12610205717339032,
"grad_norm": 4.803882598876953,
"learning_rate": 4.989935734988097e-07,
"logits/chosen": -0.8652929663658142,
"logits/rejected": -0.9138813018798828,
"logps/chosen": -0.22791269421577454,
"logps/rejected": -0.2620168924331665,
"loss": 1.5999,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.4558253884315491,
"rewards/margins": 0.06820837408304214,
"rewards/rejected": -0.524033784866333,
"step": 59
},
{
"epoch": 0.12823938017632916,
"grad_norm": 2.9545466899871826,
"learning_rate": 4.988189843662815e-07,
"logits/chosen": -0.9540647864341736,
"logits/rejected": -0.9105108380317688,
"logps/chosen": -0.28050848841667175,
"logps/rejected": -0.2682150602340698,
"loss": 1.6229,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5610169768333435,
"rewards/margins": -0.024586813524365425,
"rewards/rejected": -0.5364301204681396,
"step": 60
},
{
"epoch": 0.13037670317926797,
"grad_norm": 5.4623260498046875,
"learning_rate": 4.986304738420683e-07,
"logits/chosen": -0.8594868779182434,
"logits/rejected": -0.8749207854270935,
"logps/chosen": -0.23750001192092896,
"logps/rejected": -0.24768495559692383,
"loss": 1.5863,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.4750000238418579,
"rewards/margins": 0.020369907841086388,
"rewards/rejected": -0.49536991119384766,
"step": 61
},
{
"epoch": 0.13251402618220678,
"grad_norm": 5.195383548736572,
"learning_rate": 4.984280524733107e-07,
"logits/chosen": -0.8988451361656189,
"logits/rejected": -1.0471916198730469,
"logps/chosen": -0.2563616931438446,
"logps/rejected": -0.264529824256897,
"loss": 1.628,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.5127233862876892,
"rewards/margins": 0.01633620262145996,
"rewards/rejected": -0.529059648513794,
"step": 62
},
{
"epoch": 0.1346513491851456,
"grad_norm": 3.365633249282837,
"learning_rate": 4.982117315854593e-07,
"logits/chosen": -0.9563354253768921,
"logits/rejected": -1.143921971321106,
"logps/chosen": -0.27564752101898193,
"logps/rejected": -0.2891802191734314,
"loss": 1.6243,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.5512950420379639,
"rewards/margins": 0.02706541307270527,
"rewards/rejected": -0.5783604383468628,
"step": 63
},
{
"epoch": 0.13678867218808444,
"grad_norm": 3.6298470497131348,
"learning_rate": 4.979815232816416e-07,
"logits/chosen": -0.9835873246192932,
"logits/rejected": -0.8579452037811279,
"logps/chosen": -0.2935434579849243,
"logps/rejected": -0.26197710633277893,
"loss": 1.6428,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.5870869159698486,
"rewards/margins": -0.06313266605138779,
"rewards/rejected": -0.5239542126655579,
"step": 64
},
{
"epoch": 0.13892599519102325,
"grad_norm": 5.261904239654541,
"learning_rate": 4.977374404419837e-07,
"logits/chosen": -1.0193111896514893,
"logits/rejected": -1.036008358001709,
"logps/chosen": -0.27654433250427246,
"logps/rejected": -0.25757479667663574,
"loss": 1.5985,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.6913608908653259,
"rewards/margins": -0.047423895448446274,
"rewards/rejected": -0.6439369320869446,
"step": 65
},
{
"epoch": 0.14106331819396206,
"grad_norm": 3.326939582824707,
"learning_rate": 4.974794967228907e-07,
"logits/chosen": -1.0054104328155518,
"logits/rejected": -0.9754442572593689,
"logps/chosen": -0.2905897796154022,
"logps/rejected": -0.32264938950538635,
"loss": 1.6248,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7264744639396667,
"rewards/margins": 0.08014895021915436,
"rewards/rejected": -0.8066234588623047,
"step": 66
},
{
"epoch": 0.14320064119690087,
"grad_norm": 5.669600486755371,
"learning_rate": 4.972077065562821e-07,
"logits/chosen": -0.9552958607673645,
"logits/rejected": -1.0761511325836182,
"logps/chosen": -0.3276459574699402,
"logps/rejected": -0.32107335329055786,
"loss": 1.6203,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8191148638725281,
"rewards/margins": -0.01643138751387596,
"rewards/rejected": -0.8026834726333618,
"step": 67
},
{
"epoch": 0.14533796419983971,
"grad_norm": 3.257904052734375,
"learning_rate": 4.969220851487844e-07,
"logits/chosen": -0.9927914142608643,
"logits/rejected": -0.9472739696502686,
"logps/chosen": -0.3458186686038971,
"logps/rejected": -0.34241756796836853,
"loss": 1.6191,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8645466566085815,
"rewards/margins": -0.008502773940563202,
"rewards/rejected": -0.8560439348220825,
"step": 68
},
{
"epoch": 0.14747528720277853,
"grad_norm": 5.560789585113525,
"learning_rate": 4.966226484808803e-07,
"logits/chosen": -0.9344061613082886,
"logits/rejected": -0.8273663520812988,
"logps/chosen": -0.2849215567111969,
"logps/rejected": -0.31608855724334717,
"loss": 1.6123,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7123039960861206,
"rewards/margins": 0.0779174268245697,
"rewards/rejected": -0.7902213335037231,
"step": 69
},
{
"epoch": 0.14961261020571734,
"grad_norm": 3.70934796333313,
"learning_rate": 4.963094133060148e-07,
"logits/chosen": -0.9611161947250366,
"logits/rejected": -0.8749902248382568,
"logps/chosen": -0.2869144380092621,
"logps/rejected": -0.23931002616882324,
"loss": 1.6348,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.7172860503196716,
"rewards/margins": -0.11901099979877472,
"rewards/rejected": -0.5982750654220581,
"step": 70
},
{
"epoch": 0.15174993320865615,
"grad_norm": 3.771442413330078,
"learning_rate": 4.959823971496574e-07,
"logits/chosen": -1.0483484268188477,
"logits/rejected": -0.9827014803886414,
"logps/chosen": -0.3061015009880066,
"logps/rejected": -0.3094024658203125,
"loss": 1.5879,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7652537822723389,
"rewards/margins": 0.00825244840234518,
"rewards/rejected": -0.7735061645507812,
"step": 71
},
{
"epoch": 0.15388725621159496,
"grad_norm": 3.6872432231903076,
"learning_rate": 4.956416183083221e-07,
"logits/chosen": -1.0115149021148682,
"logits/rejected": -1.0020099878311157,
"logps/chosen": -0.26311925053596497,
"logps/rejected": -0.27171316742897034,
"loss": 1.5697,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6577981114387512,
"rewards/margins": 0.02148478478193283,
"rewards/rejected": -0.6792829036712646,
"step": 72
},
{
"epoch": 0.1560245792145338,
"grad_norm": 7.885510444641113,
"learning_rate": 4.952870958485431e-07,
"logits/chosen": -0.7439613938331604,
"logits/rejected": -0.7543243169784546,
"logps/chosen": -0.32277047634124756,
"logps/rejected": -0.44049495458602905,
"loss": 1.5719,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8069261312484741,
"rewards/margins": 0.2943112254142761,
"rewards/rejected": -1.1012372970581055,
"step": 73
},
{
"epoch": 0.15816190221747262,
"grad_norm": 14.005949020385742,
"learning_rate": 4.949188496058089e-07,
"logits/chosen": -0.8661502599716187,
"logits/rejected": -0.9138545989990234,
"logps/chosen": -0.27060988545417786,
"logps/rejected": -0.25004029273986816,
"loss": 1.6381,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6765246987342834,
"rewards/margins": -0.05142403393983841,
"rewards/rejected": -0.6251006722450256,
"step": 74
},
{
"epoch": 0.16029922522041143,
"grad_norm": 7.343827247619629,
"learning_rate": 4.945369001834514e-07,
"logits/chosen": -1.07318115234375,
"logits/rejected": -1.0178194046020508,
"logps/chosen": -0.2654929459095001,
"logps/rejected": -0.29686206579208374,
"loss": 1.5458,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6637323498725891,
"rewards/margins": 0.07842274755239487,
"rewards/rejected": -0.7421550750732422,
"step": 75
},
{
"epoch": 0.16243654822335024,
"grad_norm": 4.01411247253418,
"learning_rate": 4.941412689514941e-07,
"logits/chosen": -1.162184238433838,
"logits/rejected": -1.2236565351486206,
"logps/chosen": -0.2647251486778259,
"logps/rejected": -0.2977098226547241,
"loss": 1.6206,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6618129014968872,
"rewards/margins": 0.08246167004108429,
"rewards/rejected": -0.7442746162414551,
"step": 76
},
{
"epoch": 0.16457387122628908,
"grad_norm": 4.315869331359863,
"learning_rate": 4.937319780454559e-07,
"logits/chosen": -0.8569203019142151,
"logits/rejected": -0.7959333062171936,
"logps/chosen": -0.29102951288223267,
"logps/rejected": -0.31862419843673706,
"loss": 1.5992,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.727573812007904,
"rewards/margins": 0.06898671388626099,
"rewards/rejected": -0.7965604662895203,
"step": 77
},
{
"epoch": 0.1667111942292279,
"grad_norm": 6.3516645431518555,
"learning_rate": 4.933090503651128e-07,
"logits/chosen": -0.9815778136253357,
"logits/rejected": -0.9455960988998413,
"logps/chosen": -0.290622353553772,
"logps/rejected": -0.25920000672340393,
"loss": 1.6019,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7265558838844299,
"rewards/margins": -0.07855589687824249,
"rewards/rejected": -0.6479999423027039,
"step": 78
},
{
"epoch": 0.1688485172321667,
"grad_norm": 5.576763153076172,
"learning_rate": 4.928725095732168e-07,
"logits/chosen": -0.7572908401489258,
"logits/rejected": -0.8643375039100647,
"logps/chosen": -0.28876882791519165,
"logps/rejected": -0.38018882274627686,
"loss": 1.574,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.721921980381012,
"rewards/margins": 0.22855007648468018,
"rewards/rejected": -0.9504721164703369,
"step": 79
},
{
"epoch": 0.17098584023510552,
"grad_norm": 6.904773235321045,
"learning_rate": 4.924223800941717e-07,
"logits/chosen": -1.1600089073181152,
"logits/rejected": -1.001929759979248,
"logps/chosen": -0.3185364007949829,
"logps/rejected": -0.2833505868911743,
"loss": 1.5885,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.796341061592102,
"rewards/margins": -0.08796463906764984,
"rewards/rejected": -0.708376407623291,
"step": 80
},
{
"epoch": 0.17312316323804436,
"grad_norm": 5.542295932769775,
"learning_rate": 4.919586871126667e-07,
"logits/chosen": -1.1290327310562134,
"logits/rejected": -1.0776805877685547,
"logps/chosen": -0.28904592990875244,
"logps/rejected": -0.32642093300819397,
"loss": 1.5823,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7226147651672363,
"rewards/margins": 0.09343745559453964,
"rewards/rejected": -0.816052258014679,
"step": 81
},
{
"epoch": 0.17526048624098317,
"grad_norm": 7.346787929534912,
"learning_rate": 4.91481456572267e-07,
"logits/chosen": -1.008028507232666,
"logits/rejected": -0.7614388465881348,
"logps/chosen": -0.276422917842865,
"logps/rejected": -0.28925520181655884,
"loss": 1.5471,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6910573244094849,
"rewards/margins": 0.03208072483539581,
"rewards/rejected": -0.7231380343437195,
"step": 82
},
{
"epoch": 0.17739780924392198,
"grad_norm": 3.4823226928710938,
"learning_rate": 4.909907151739633e-07,
"logits/chosen": -0.8054043650627136,
"logits/rejected": -0.8212348222732544,
"logps/chosen": -0.25493186712265015,
"logps/rejected": -0.2324959635734558,
"loss": 1.6075,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.637329638004303,
"rewards/margins": -0.05608966201543808,
"rewards/rejected": -0.5812399983406067,
"step": 83
},
{
"epoch": 0.1795351322468608,
"grad_norm": 8.512069702148438,
"learning_rate": 4.904864903746765e-07,
"logits/chosen": -0.8016963601112366,
"logits/rejected": -0.8472069501876831,
"logps/chosen": -0.3017991781234741,
"logps/rejected": -0.30026912689208984,
"loss": 1.6386,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7544978857040405,
"rewards/margins": -0.003825142979621887,
"rewards/rejected": -0.7506727576255798,
"step": 84
},
{
"epoch": 0.18167245524979964,
"grad_norm": 3.27510666847229,
"learning_rate": 4.899688103857222e-07,
"logits/chosen": -0.9057269096374512,
"logits/rejected": -0.8979475498199463,
"logps/chosen": -0.24959440529346466,
"logps/rejected": -0.3118637502193451,
"loss": 1.5594,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.623986005783081,
"rewards/margins": 0.15567341446876526,
"rewards/rejected": -0.7796593904495239,
"step": 85
},
{
"epoch": 0.18380977825273845,
"grad_norm": 3.2452337741851807,
"learning_rate": 4.894377041712326e-07,
"logits/chosen": -0.6997116208076477,
"logits/rejected": -0.6495150327682495,
"logps/chosen": -0.2519880533218384,
"logps/rejected": -0.30695033073425293,
"loss": 1.585,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6299701929092407,
"rewards/margins": 0.137405663728714,
"rewards/rejected": -0.7673758268356323,
"step": 86
},
{
"epoch": 0.18594710125567726,
"grad_norm": 6.7908430099487305,
"learning_rate": 4.888932014465352e-07,
"logits/chosen": -0.8975124359130859,
"logits/rejected": -0.8113777041435242,
"logps/chosen": -0.2837047576904297,
"logps/rejected": -0.2963961958885193,
"loss": 1.5927,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.709261953830719,
"rewards/margins": 0.03172856196761131,
"rewards/rejected": -0.7409905195236206,
"step": 87
},
{
"epoch": 0.18808442425861607,
"grad_norm": 4.965595722198486,
"learning_rate": 4.883353326764906e-07,
"logits/chosen": -0.8913217186927795,
"logits/rejected": -0.8421756625175476,
"logps/chosen": -0.25936800241470337,
"logps/rejected": -0.45224201679229736,
"loss": 1.5572,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.648419976234436,
"rewards/margins": 0.4821849763393402,
"rewards/rejected": -1.1306049823760986,
"step": 88
},
{
"epoch": 0.1902217472615549,
"grad_norm": 5.781017780303955,
"learning_rate": 4.877641290737883e-07,
"logits/chosen": -0.9931791424751282,
"logits/rejected": -0.9962902665138245,
"logps/chosen": -0.2539224624633789,
"logps/rejected": -0.2921288311481476,
"loss": 1.6042,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6348061561584473,
"rewards/margins": 0.0955159068107605,
"rewards/rejected": -0.730322003364563,
"step": 89
},
{
"epoch": 0.19235907026449373,
"grad_norm": 5.002528190612793,
"learning_rate": 4.871796225971999e-07,
"logits/chosen": -0.9850423336029053,
"logits/rejected": -0.857207179069519,
"logps/chosen": -0.27758607268333435,
"logps/rejected": -0.3058236241340637,
"loss": 1.5967,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6939651966094971,
"rewards/margins": 0.0705939531326294,
"rewards/rejected": -0.7645590901374817,
"step": 90
},
{
"epoch": 0.19449639326743254,
"grad_norm": 5.870463848114014,
"learning_rate": 4.86581845949791e-07,
"logits/chosen": -0.949233889579773,
"logits/rejected": -1.0075958967208862,
"logps/chosen": -0.2556777596473694,
"logps/rejected": -0.2905680537223816,
"loss": 1.5559,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.6391944289207458,
"rewards/margins": 0.08722572773694992,
"rewards/rejected": -0.7264201045036316,
"step": 91
},
{
"epoch": 0.19663371627037135,
"grad_norm": 4.762439727783203,
"learning_rate": 4.859708325770919e-07,
"logits/chosen": -1.1257095336914062,
"logits/rejected": -1.173663854598999,
"logps/chosen": -0.28581249713897705,
"logps/rejected": -0.3704802989959717,
"loss": 1.5915,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7145313024520874,
"rewards/margins": 0.21166956424713135,
"rewards/rejected": -0.926200807094574,
"step": 92
},
{
"epoch": 0.1987710392733102,
"grad_norm": 5.9837117195129395,
"learning_rate": 4.853466166652258e-07,
"logits/chosen": -0.9948515295982361,
"logits/rejected": -0.9665160179138184,
"logps/chosen": -0.2551361620426178,
"logps/rejected": -0.28811219334602356,
"loss": 1.5882,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6378403902053833,
"rewards/margins": 0.08244016021490097,
"rewards/rejected": -0.7202805280685425,
"step": 93
},
{
"epoch": 0.200908362276249,
"grad_norm": 4.58953857421875,
"learning_rate": 4.847092331389964e-07,
"logits/chosen": -0.7557870149612427,
"logits/rejected": -0.7804038524627686,
"logps/chosen": -0.26233580708503723,
"logps/rejected": -0.28490206599235535,
"loss": 1.6059,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6558394432067871,
"rewards/margins": 0.056415725499391556,
"rewards/rejected": -0.7122551798820496,
"step": 94
},
{
"epoch": 0.20304568527918782,
"grad_norm": 4.404232978820801,
"learning_rate": 4.840587176599343e-07,
"logits/chosen": -1.1708656549453735,
"logits/rejected": -1.1824274063110352,
"logps/chosen": -0.3498944640159607,
"logps/rejected": -0.3052523732185364,
"loss": 1.5518,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8747361898422241,
"rewards/margins": -0.1116051897406578,
"rewards/rejected": -0.7631310224533081,
"step": 95
},
{
"epoch": 0.20518300828212663,
"grad_norm": 2.755133628845215,
"learning_rate": 4.833951066243004e-07,
"logits/chosen": -0.9821409583091736,
"logits/rejected": -0.9246101975440979,
"logps/chosen": -0.29376041889190674,
"logps/rejected": -0.2656431794166565,
"loss": 1.6092,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7344010472297668,
"rewards/margins": -0.07029299437999725,
"rewards/rejected": -0.6641080379486084,
"step": 96
},
{
"epoch": 0.20732033128506547,
"grad_norm": 8.961865425109863,
"learning_rate": 4.82718437161051e-07,
"logits/chosen": -0.9781126976013184,
"logits/rejected": -1.0274431705474854,
"logps/chosen": -0.2688814699649811,
"logps/rejected": -0.25695058703422546,
"loss": 1.6414,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6722037196159363,
"rewards/margins": -0.029827285557985306,
"rewards/rejected": -0.6423764228820801,
"step": 97
},
{
"epoch": 0.20945765428800428,
"grad_norm": 3.496291160583496,
"learning_rate": 4.820287471297597e-07,
"logits/chosen": -1.110063076019287,
"logits/rejected": -0.9798667430877686,
"logps/chosen": -0.2772579789161682,
"logps/rejected": -0.284493625164032,
"loss": 1.6027,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6931449174880981,
"rewards/margins": 0.01808912120759487,
"rewards/rejected": -0.7112340927124023,
"step": 98
},
{
"epoch": 0.2115949772909431,
"grad_norm": 9.36062240600586,
"learning_rate": 4.813260751184992e-07,
"logits/chosen": -1.0408313274383545,
"logits/rejected": -0.9097151160240173,
"logps/chosen": -0.2336195558309555,
"logps/rejected": -0.28545060753822327,
"loss": 1.5888,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.5840489268302917,
"rewards/margins": 0.12957759201526642,
"rewards/rejected": -0.7136265635490417,
"step": 99
},
{
"epoch": 0.2137323002938819,
"grad_norm": 4.372857570648193,
"learning_rate": 4.806104604416823e-07,
"logits/chosen": -1.1981866359710693,
"logits/rejected": -1.1812773942947388,
"logps/chosen": -0.40545234084129333,
"logps/rejected": -0.32747963070869446,
"loss": 1.6366,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0136308670043945,
"rewards/margins": -0.194931760430336,
"rewards/rejected": -0.8186991214752197,
"step": 100
},
{
"epoch": 0.21586962329682075,
"grad_norm": 6.457290172576904,
"learning_rate": 4.798819431378626e-07,
"logits/chosen": -0.9583615064620972,
"logits/rejected": -0.9292630553245544,
"logps/chosen": -0.2667827904224396,
"logps/rejected": -0.3141520917415619,
"loss": 1.5717,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6669570207595825,
"rewards/margins": 0.11842326819896698,
"rewards/rejected": -0.7853802442550659,
"step": 101
},
{
"epoch": 0.21800694629975956,
"grad_norm": 4.259753704071045,
"learning_rate": 4.79140563967494e-07,
"logits/chosen": -0.9554131031036377,
"logits/rejected": -0.9235316514968872,
"logps/chosen": -0.2790910005569458,
"logps/rejected": -0.29358065128326416,
"loss": 1.5956,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6977274417877197,
"rewards/margins": 0.03622422739863396,
"rewards/rejected": -0.7339516878128052,
"step": 102
},
{
"epoch": 0.22014426930269837,
"grad_norm": 7.2334675788879395,
"learning_rate": 4.783863644106502e-07,
"logits/chosen": -0.958928108215332,
"logits/rejected": -0.9119776487350464,
"logps/chosen": -0.2611943483352661,
"logps/rejected": -0.29755640029907227,
"loss": 1.5837,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6529859304428101,
"rewards/margins": 0.09090512990951538,
"rewards/rejected": -0.7438910007476807,
"step": 103
},
{
"epoch": 0.22228159230563718,
"grad_norm": 3.2508132457733154,
"learning_rate": 4.776193866647039e-07,
"logits/chosen": -1.073838710784912,
"logits/rejected": -0.9083616733551025,
"logps/chosen": -0.2882213294506073,
"logps/rejected": -0.2734883427619934,
"loss": 1.606,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7205533385276794,
"rewards/margins": -0.03683248162269592,
"rewards/rejected": -0.6837208867073059,
"step": 104
},
{
"epoch": 0.224418915308576,
"grad_norm": 5.259939193725586,
"learning_rate": 4.768396736419662e-07,
"logits/chosen": -0.9633040428161621,
"logits/rejected": -0.9958257675170898,
"logps/chosen": -0.2812567949295044,
"logps/rejected": -0.3445313572883606,
"loss": 1.6165,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.703141987323761,
"rewards/margins": 0.1581864058971405,
"rewards/rejected": -0.8613283634185791,
"step": 105
},
{
"epoch": 0.22655623831151483,
"grad_norm": 4.128396511077881,
"learning_rate": 4.7604726896728496e-07,
"logits/chosen": -0.898779571056366,
"logits/rejected": -0.8008460998535156,
"logps/chosen": -0.3449317216873169,
"logps/rejected": -0.3174844980239868,
"loss": 1.581,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.862329363822937,
"rewards/margins": -0.06861816346645355,
"rewards/rejected": -0.793711245059967,
"step": 106
},
{
"epoch": 0.22869356131445365,
"grad_norm": 7.073979377746582,
"learning_rate": 4.752422169756047e-07,
"logits/chosen": -0.7736971378326416,
"logits/rejected": -0.7141239643096924,
"logps/chosen": -0.27883169054985046,
"logps/rejected": -0.28934141993522644,
"loss": 1.6038,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6970791816711426,
"rewards/margins": 0.02627432905137539,
"rewards/rejected": -0.7233536243438721,
"step": 107
},
{
"epoch": 0.23083088431739246,
"grad_norm": 3.607881784439087,
"learning_rate": 4.744245627094858e-07,
"logits/chosen": -0.7748329043388367,
"logits/rejected": -0.7313745021820068,
"logps/chosen": -0.3115028142929077,
"logps/rejected": -0.3770483732223511,
"loss": 1.6484,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.7787570357322693,
"rewards/margins": 0.1638639271259308,
"rewards/rejected": -0.9426208734512329,
"step": 108
},
{
"epoch": 0.23296820732033127,
"grad_norm": 6.025390148162842,
"learning_rate": 4.735943519165842e-07,
"logits/chosen": -0.8779905438423157,
"logits/rejected": -0.9295673966407776,
"logps/chosen": -0.2856750786304474,
"logps/rejected": -0.3149481415748596,
"loss": 1.6163,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7141878008842468,
"rewards/margins": 0.07318252325057983,
"rewards/rejected": -0.7873702645301819,
"step": 109
},
{
"epoch": 0.2351055303232701,
"grad_norm": 10.654799461364746,
"learning_rate": 4.7275163104709194e-07,
"logits/chosen": -1.139617681503296,
"logits/rejected": -1.037335753440857,
"logps/chosen": -0.3125270903110504,
"logps/rejected": -0.42321839928627014,
"loss": 1.5969,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7813177704811096,
"rewards/margins": 0.27672821283340454,
"rewards/rejected": -1.0580458641052246,
"step": 110
},
{
"epoch": 0.23724285332620892,
"grad_norm": 3.1225905418395996,
"learning_rate": 4.718964472511385e-07,
"logits/chosen": -0.7755342125892639,
"logits/rejected": -0.9119763374328613,
"logps/chosen": -0.26263684034347534,
"logps/rejected": -0.2584255635738373,
"loss": 1.6006,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6565921306610107,
"rewards/margins": -0.010528111830353737,
"rewards/rejected": -0.6460639834403992,
"step": 111
},
{
"epoch": 0.23938017632914774,
"grad_norm": 3.907759189605713,
"learning_rate": 4.710288483761524e-07,
"logits/chosen": -0.805738091468811,
"logits/rejected": -0.8327180743217468,
"logps/chosen": -0.26873674988746643,
"logps/rejected": -0.27950698137283325,
"loss": 1.5569,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6718418598175049,
"rewards/margins": 0.02692551538348198,
"rewards/rejected": -0.6987674236297607,
"step": 112
},
{
"epoch": 0.24151749933208655,
"grad_norm": 5.125892162322998,
"learning_rate": 4.7014888296418447e-07,
"logits/chosen": -0.8660019040107727,
"logits/rejected": -0.7626081109046936,
"logps/chosen": -0.27202802896499634,
"logps/rejected": -0.3179852068424225,
"loss": 1.5133,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.680070161819458,
"rewards/margins": 0.11489284038543701,
"rewards/rejected": -0.7949629426002502,
"step": 113
},
{
"epoch": 0.2436548223350254,
"grad_norm": 3.331281900405884,
"learning_rate": 4.692566002491916e-07,
"logits/chosen": -0.9860325455665588,
"logits/rejected": -1.0227984189987183,
"logps/chosen": -0.277464359998703,
"logps/rejected": -0.3393504023551941,
"loss": 1.5764,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6936609745025635,
"rewards/margins": 0.15471497178077698,
"rewards/rejected": -0.8483759164810181,
"step": 114
},
{
"epoch": 0.2457921453379642,
"grad_norm": 4.2767205238342285,
"learning_rate": 4.683520501542824e-07,
"logits/chosen": -1.1069515943527222,
"logits/rejected": -0.9956479668617249,
"logps/chosen": -0.26621949672698975,
"logps/rejected": -0.2311069816350937,
"loss": 1.6116,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6655487418174744,
"rewards/margins": -0.08778128772974014,
"rewards/rejected": -0.5777674317359924,
"step": 115
},
{
"epoch": 0.24792946834090301,
"grad_norm": 4.450298309326172,
"learning_rate": 4.6743528328892384e-07,
"logits/chosen": -1.089507818222046,
"logits/rejected": -1.0226225852966309,
"logps/chosen": -0.31000208854675293,
"logps/rejected": -0.3048384189605713,
"loss": 1.5601,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7750052213668823,
"rewards/margins": -0.012909159064292908,
"rewards/rejected": -0.7620960474014282,
"step": 116
},
{
"epoch": 0.25006679134384185,
"grad_norm": 8.30679988861084,
"learning_rate": 4.6650635094610966e-07,
"logits/chosen": -1.019626498222351,
"logits/rejected": -1.0024278163909912,
"logps/chosen": -0.27769044041633606,
"logps/rejected": -0.31382811069488525,
"loss": 1.603,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6942261457443237,
"rewards/margins": 0.09034418314695358,
"rewards/rejected": -0.7845702767372131,
"step": 117
},
{
"epoch": 0.25220411434678064,
"grad_norm": 2.7810580730438232,
"learning_rate": 4.655653050994906e-07,
"logits/chosen": -0.8939322829246521,
"logits/rejected": -0.9443778991699219,
"logps/chosen": -0.3001169264316559,
"logps/rejected": -0.27608948945999146,
"loss": 1.6033,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7502923011779785,
"rewards/margins": -0.06006866693496704,
"rewards/rejected": -0.6902236938476562,
"step": 118
},
{
"epoch": 0.2543414373497195,
"grad_norm": 7.52852201461792,
"learning_rate": 4.646121984004665e-07,
"logits/chosen": -1.0176833868026733,
"logits/rejected": -0.9106737971305847,
"logps/chosen": -0.2855750620365143,
"logps/rejected": -0.2689260244369507,
"loss": 1.6369,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7139376401901245,
"rewards/margins": -0.041622575372457504,
"rewards/rejected": -0.6723150610923767,
"step": 119
},
{
"epoch": 0.2564787603526583,
"grad_norm": 5.235323429107666,
"learning_rate": 4.636470841752404e-07,
"logits/chosen": -0.894492506980896,
"logits/rejected": -0.8580023050308228,
"logps/chosen": -0.2390415519475937,
"logps/rejected": -0.3226756751537323,
"loss": 1.5698,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.5976038575172424,
"rewards/margins": 0.20908531546592712,
"rewards/rejected": -0.8066891431808472,
"step": 120
},
{
"epoch": 0.2586160833555971,
"grad_norm": 7.31561803817749,
"learning_rate": 4.626700164218349e-07,
"logits/chosen": -1.1262331008911133,
"logits/rejected": -1.1069376468658447,
"logps/chosen": -0.32872867584228516,
"logps/rejected": -0.4012628495693207,
"loss": 1.5619,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8218216896057129,
"rewards/margins": 0.18133553862571716,
"rewards/rejected": -1.003157138824463,
"step": 121
},
{
"epoch": 0.26075340635853594,
"grad_norm": 4.992301940917969,
"learning_rate": 4.6168104980707103e-07,
"logits/chosen": -0.947390079498291,
"logits/rejected": -0.9287791848182678,
"logps/chosen": -0.3660104274749756,
"logps/rejected": -0.34243422746658325,
"loss": 1.6722,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.9150261282920837,
"rewards/margins": -0.058940548449754715,
"rewards/rejected": -0.8560855388641357,
"step": 122
},
{
"epoch": 0.26289072936147473,
"grad_norm": 9.114599227905273,
"learning_rate": 4.606802396635098e-07,
"logits/chosen": -1.0491164922714233,
"logits/rejected": -1.0415245294570923,
"logps/chosen": -0.2906876802444458,
"logps/rejected": -0.2940623164176941,
"loss": 1.6062,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7267192006111145,
"rewards/margins": 0.008436577394604683,
"rewards/rejected": -0.7351557612419128,
"step": 123
},
{
"epoch": 0.26502805236441357,
"grad_norm": 13.20346736907959,
"learning_rate": 4.59667641986356e-07,
"logits/chosen": -0.9480360746383667,
"logits/rejected": -0.9648789167404175,
"logps/chosen": -0.30809280276298523,
"logps/rejected": -0.39009833335876465,
"loss": 1.5673,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.770232081413269,
"rewards/margins": 0.20501384139060974,
"rewards/rejected": -0.9752458930015564,
"step": 124
},
{
"epoch": 0.2671653753673524,
"grad_norm": 10.775737762451172,
"learning_rate": 4.5864331343032565e-07,
"logits/chosen": -0.9860743880271912,
"logits/rejected": -0.9669252634048462,
"logps/chosen": -0.4254220724105835,
"logps/rejected": -0.42529717087745667,
"loss": 1.6036,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.063555121421814,
"rewards/margins": -0.0003122463822364807,
"rewards/rejected": -1.0632429122924805,
"step": 125
},
{
"epoch": 0.2693026983702912,
"grad_norm": 3.9531850814819336,
"learning_rate": 4.576073113064759e-07,
"logits/chosen": -0.9061692953109741,
"logits/rejected": -1.030226707458496,
"logps/chosen": -0.2965227961540222,
"logps/rejected": -0.3571414351463318,
"loss": 1.5663,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7413069605827332,
"rewards/margins": 0.15154659748077393,
"rewards/rejected": -0.8928536176681519,
"step": 126
},
{
"epoch": 0.27144002137323003,
"grad_norm": 10.59150218963623,
"learning_rate": 4.565596935789987e-07,
"logits/chosen": -1.0731703042984009,
"logits/rejected": -1.0575220584869385,
"logps/chosen": -0.3338828682899475,
"logps/rejected": -0.36648380756378174,
"loss": 1.5818,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8347071409225464,
"rewards/margins": 0.08150236308574677,
"rewards/rejected": -0.9162094593048096,
"step": 127
},
{
"epoch": 0.2735773443761689,
"grad_norm": 10.513736724853516,
"learning_rate": 4.555005188619775e-07,
"logits/chosen": -0.8747404217720032,
"logits/rejected": -0.8733081817626953,
"logps/chosen": -0.24837706983089447,
"logps/rejected": -0.297157883644104,
"loss": 1.5767,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6209426522254944,
"rewards/margins": 0.12195204943418503,
"rewards/rejected": -0.74289470911026,
"step": 128
},
{
"epoch": 0.27571466737910766,
"grad_norm": 5.887775421142578,
"learning_rate": 4.5442984641610784e-07,
"logits/chosen": -1.126139760017395,
"logits/rejected": -1.0465540885925293,
"logps/chosen": -0.2965885102748871,
"logps/rejected": -0.2864833474159241,
"loss": 1.6037,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7414712905883789,
"rewards/margins": -0.025262875482439995,
"rewards/rejected": -0.7162083387374878,
"step": 129
},
{
"epoch": 0.2778519903820465,
"grad_norm": 4.381558418273926,
"learning_rate": 4.533477361453819e-07,
"logits/chosen": -1.0439306497573853,
"logits/rejected": -1.1324841976165771,
"logps/chosen": -0.3036992847919464,
"logps/rejected": -0.3638463020324707,
"loss": 1.5841,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7592483162879944,
"rewards/margins": 0.15036745369434357,
"rewards/rejected": -0.9096157550811768,
"step": 130
},
{
"epoch": 0.2799893133849853,
"grad_norm": 4.211307048797607,
"learning_rate": 4.5225424859373684e-07,
"logits/chosen": -0.9729929566383362,
"logits/rejected": -0.971265435218811,
"logps/chosen": -0.3391942083835602,
"logps/rejected": -0.35309362411499023,
"loss": 1.595,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8479855060577393,
"rewards/margins": 0.03474842756986618,
"rewards/rejected": -0.8827340602874756,
"step": 131
},
{
"epoch": 0.2821266363879241,
"grad_norm": 9.722661018371582,
"learning_rate": 4.511494449416671e-07,
"logits/chosen": -0.8604239225387573,
"logits/rejected": -0.790294885635376,
"logps/chosen": -0.25934475660324097,
"logps/rejected": -0.2542663812637329,
"loss": 1.6546,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6483618021011353,
"rewards/margins": -0.012695923447608948,
"rewards/rejected": -0.6356659531593323,
"step": 132
},
{
"epoch": 0.28426395939086296,
"grad_norm": 3.2997727394104004,
"learning_rate": 4.500333870028016e-07,
"logits/chosen": -1.0789867639541626,
"logits/rejected": -1.073919653892517,
"logps/chosen": -0.25591588020324707,
"logps/rejected": -0.2593124806880951,
"loss": 1.5489,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.6397897601127625,
"rewards/margins": 0.008491499349474907,
"rewards/rejected": -0.6482812166213989,
"step": 133
},
{
"epoch": 0.28640128239380175,
"grad_norm": 3.918221950531006,
"learning_rate": 4.489061372204452e-07,
"logits/chosen": -0.9510654211044312,
"logits/rejected": -0.880722165107727,
"logps/chosen": -0.2889711260795593,
"logps/rejected": -0.32566970586776733,
"loss": 1.5822,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7224277257919312,
"rewards/margins": 0.09174646437168121,
"rewards/rejected": -0.8141741752624512,
"step": 134
},
{
"epoch": 0.2885386053967406,
"grad_norm": 9.736861228942871,
"learning_rate": 4.4776775866408533e-07,
"logits/chosen": -1.0732065439224243,
"logits/rejected": -0.9681872725486755,
"logps/chosen": -0.41927701234817505,
"logps/rejected": -0.2924247086048126,
"loss": 1.5615,
"rewards/accuracies": 0.25,
"rewards/chosen": -1.0481925010681152,
"rewards/margins": -0.3171307146549225,
"rewards/rejected": -0.7310618162155151,
"step": 135
},
{
"epoch": 0.29067592839967943,
"grad_norm": 3.712836503982544,
"learning_rate": 4.4661831502586244e-07,
"logits/chosen": -0.9898865222930908,
"logits/rejected": -0.958566427230835,
"logps/chosen": -0.3362237811088562,
"logps/rejected": -0.3830156624317169,
"loss": 1.5408,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8405593633651733,
"rewards/margins": 0.11697974801063538,
"rewards/rejected": -0.9575392007827759,
"step": 136
},
{
"epoch": 0.2928132514026182,
"grad_norm": 6.5986409187316895,
"learning_rate": 4.4545787061700746e-07,
"logits/chosen": -0.9952265620231628,
"logits/rejected": -0.9618417024612427,
"logps/chosen": -0.33093225955963135,
"logps/rejected": -0.3158915042877197,
"loss": 1.6151,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8273307085037231,
"rewards/margins": -0.037601932883262634,
"rewards/rejected": -0.7897287607192993,
"step": 137
},
{
"epoch": 0.29495057440555705,
"grad_norm": 11.264966011047363,
"learning_rate": 4.442864903642427e-07,
"logits/chosen": -0.9705032706260681,
"logits/rejected": -1.010439395904541,
"logps/chosen": -0.29408299922943115,
"logps/rejected": -0.32492977380752563,
"loss": 1.6785,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7352074980735779,
"rewards/margins": 0.07711698114871979,
"rewards/rejected": -0.8123244047164917,
"step": 138
},
{
"epoch": 0.29708789740849584,
"grad_norm": 3.294029951095581,
"learning_rate": 4.4310423980614986e-07,
"logits/chosen": -0.9771057963371277,
"logits/rejected": -0.8812280893325806,
"logps/chosen": -0.27679648995399475,
"logps/rejected": -0.301949143409729,
"loss": 1.574,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6919912099838257,
"rewards/margins": 0.06288158893585205,
"rewards/rejected": -0.7548727989196777,
"step": 139
},
{
"epoch": 0.2992252204114347,
"grad_norm": 6.963750839233398,
"learning_rate": 4.4191118508950277e-07,
"logits/chosen": -0.9832889437675476,
"logits/rejected": -1.041925311088562,
"logps/chosen": -0.3261723518371582,
"logps/rejected": -0.36972764134407043,
"loss": 1.6004,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8154308199882507,
"rewards/margins": 0.10888823121786118,
"rewards/rejected": -0.9243191480636597,
"step": 140
},
{
"epoch": 0.3013625434143735,
"grad_norm": 4.65049934387207,
"learning_rate": 4.407073929655666e-07,
"logits/chosen": -0.8786113858222961,
"logits/rejected": -0.8743698000907898,
"logps/chosen": -0.3489750027656555,
"logps/rejected": -0.34578827023506165,
"loss": 1.613,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8724376559257507,
"rewards/margins": -0.007966993376612663,
"rewards/rejected": -0.8644705414772034,
"step": 141
},
{
"epoch": 0.3034998664173123,
"grad_norm": 5.086356163024902,
"learning_rate": 4.394929307863632e-07,
"logits/chosen": -1.178961157798767,
"logits/rejected": -1.1221526861190796,
"logps/chosen": -0.30651959776878357,
"logps/rejected": -0.27971014380455017,
"loss": 1.5804,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7662990093231201,
"rewards/margins": -0.0670236200094223,
"rewards/rejected": -0.6992753744125366,
"step": 142
},
{
"epoch": 0.30563718942025114,
"grad_norm": 4.039379119873047,
"learning_rate": 4.3826786650090273e-07,
"logits/chosen": -1.1027284860610962,
"logits/rejected": -1.0947867631912231,
"logps/chosen": -0.3164759874343872,
"logps/rejected": -0.4346773028373718,
"loss": 1.5507,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.791189968585968,
"rewards/margins": 0.29550325870513916,
"rewards/rejected": -1.086693286895752,
"step": 143
},
{
"epoch": 0.3077745124231899,
"grad_norm": 5.900667190551758,
"learning_rate": 4.370322686513817e-07,
"logits/chosen": -0.8383625149726868,
"logits/rejected": -0.7769290804862976,
"logps/chosen": -0.2520799934864044,
"logps/rejected": -0.23787729442119598,
"loss": 1.5816,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6302000284194946,
"rewards/margins": -0.03550675883889198,
"rewards/rejected": -0.5946931838989258,
"step": 144
},
{
"epoch": 0.30991183542612877,
"grad_norm": 5.484445095062256,
"learning_rate": 4.357862063693485e-07,
"logits/chosen": -0.9914720058441162,
"logits/rejected": -1.0965267419815063,
"logps/chosen": -0.2822697162628174,
"logps/rejected": -0.3381388485431671,
"loss": 1.5638,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7056742906570435,
"rewards/margins": 0.13967278599739075,
"rewards/rejected": -0.8453471064567566,
"step": 145
},
{
"epoch": 0.3120491584290676,
"grad_norm": 2.739793539047241,
"learning_rate": 4.345297493718352e-07,
"logits/chosen": -0.9342893362045288,
"logits/rejected": -0.8757031559944153,
"logps/chosen": -0.512154221534729,
"logps/rejected": -0.6000754237174988,
"loss": 1.5743,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2803857326507568,
"rewards/margins": 0.2198028415441513,
"rewards/rejected": -1.5001884698867798,
"step": 146
},
{
"epoch": 0.3141864814320064,
"grad_norm": 4.219600677490234,
"learning_rate": 4.332629679574565e-07,
"logits/chosen": -0.7380187511444092,
"logits/rejected": -0.8239220380783081,
"logps/chosen": -0.24688729643821716,
"logps/rejected": -0.2952543795108795,
"loss": 1.5583,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6172181963920593,
"rewards/margins": 0.12091781944036484,
"rewards/rejected": -0.7381359934806824,
"step": 147
},
{
"epoch": 0.31632380443494523,
"grad_norm": 7.114157676696777,
"learning_rate": 4.319859330024777e-07,
"logits/chosen": -0.950808048248291,
"logits/rejected": -0.873584508895874,
"logps/chosen": -0.28023314476013184,
"logps/rejected": -0.37878137826919556,
"loss": 1.575,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7005828022956848,
"rewards/margins": 0.2463706135749817,
"rewards/rejected": -0.9469534754753113,
"step": 148
},
{
"epoch": 0.3184611274378841,
"grad_norm": 5.033133029937744,
"learning_rate": 4.3069871595684787e-07,
"logits/chosen": -0.9993598461151123,
"logits/rejected": -1.1495643854141235,
"logps/chosen": -0.31961789727211,
"logps/rejected": -0.4130839705467224,
"loss": 1.6089,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7990447878837585,
"rewards/margins": 0.2336651086807251,
"rewards/rejected": -1.0327098369598389,
"step": 149
},
{
"epoch": 0.32059845044082286,
"grad_norm": 3.7931158542633057,
"learning_rate": 4.294013888402029e-07,
"logits/chosen": -1.0581141710281372,
"logits/rejected": -0.958967924118042,
"logps/chosen": -0.30636316537857056,
"logps/rejected": -0.31132641434669495,
"loss": 1.6122,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7659078240394592,
"rewards/margins": 0.012408185750246048,
"rewards/rejected": -0.7783160209655762,
"step": 150
},
{
"epoch": 0.3227357734437617,
"grad_norm": 4.442758560180664,
"learning_rate": 4.280940242378362e-07,
"logits/chosen": -0.9492220878601074,
"logits/rejected": -0.9829614162445068,
"logps/chosen": -0.26527076959609985,
"logps/rejected": -0.5424583554267883,
"loss": 1.5182,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6631768941879272,
"rewards/margins": 0.6929690837860107,
"rewards/rejected": -1.3561458587646484,
"step": 151
},
{
"epoch": 0.3248730964467005,
"grad_norm": 4.347165107727051,
"learning_rate": 4.2677669529663686e-07,
"logits/chosen": -0.9675495624542236,
"logits/rejected": -0.9267060160636902,
"logps/chosen": -0.28724977374076843,
"logps/rejected": -0.27893343567848206,
"loss": 1.6064,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7181244492530823,
"rewards/margins": -0.02079082280397415,
"rewards/rejected": -0.697333574295044,
"step": 152
},
{
"epoch": 0.3270104194496393,
"grad_norm": 3.866643190383911,
"learning_rate": 4.254494757209979e-07,
"logits/chosen": -1.0312570333480835,
"logits/rejected": -0.8400145173072815,
"logps/chosen": -0.2714364230632782,
"logps/rejected": -0.3370465636253357,
"loss": 1.5993,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6785910725593567,
"rewards/margins": 0.16402524709701538,
"rewards/rejected": -0.8426163196563721,
"step": 153
},
{
"epoch": 0.32914774245257816,
"grad_norm": 7.2947211265563965,
"learning_rate": 4.2411243976869173e-07,
"logits/chosen": -1.1030328273773193,
"logits/rejected": -1.107038140296936,
"logps/chosen": -0.31799790263175964,
"logps/rejected": -0.3556910455226898,
"loss": 1.5435,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7949947714805603,
"rewards/margins": 0.09423284232616425,
"rewards/rejected": -0.8892276287078857,
"step": 154
},
{
"epoch": 0.33128506545551695,
"grad_norm": 6.094887733459473,
"learning_rate": 4.227656622467162e-07,
"logits/chosen": -0.9807777404785156,
"logits/rejected": -0.9574925303459167,
"logps/chosen": -0.36069509387016296,
"logps/rejected": -0.411272257566452,
"loss": 1.5455,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9017376899719238,
"rewards/margins": 0.12644296884536743,
"rewards/rejected": -1.028180718421936,
"step": 155
},
{
"epoch": 0.3334223884584558,
"grad_norm": 7.840887069702148,
"learning_rate": 4.2140921850710855e-07,
"logits/chosen": -1.1150490045547485,
"logits/rejected": -1.1116127967834473,
"logps/chosen": -0.2742304801940918,
"logps/rejected": -0.3083428740501404,
"loss": 1.546,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6855762004852295,
"rewards/margins": 0.08528101444244385,
"rewards/rejected": -0.7708572149276733,
"step": 156
},
{
"epoch": 0.3355597114613946,
"grad_norm": 15.734699249267578,
"learning_rate": 4.200431844427298e-07,
"logits/chosen": -0.9994797706604004,
"logits/rejected": -1.077652931213379,
"logps/chosen": -0.3408905565738678,
"logps/rejected": -0.605131208896637,
"loss": 1.5829,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8522265553474426,
"rewards/margins": 0.6606015563011169,
"rewards/rejected": -1.51282799243927,
"step": 157
},
{
"epoch": 0.3376970344643334,
"grad_norm": 4.568877696990967,
"learning_rate": 4.186676364830186e-07,
"logits/chosen": -0.8166912794113159,
"logits/rejected": -0.9158197641372681,
"logps/chosen": -0.3100201189517975,
"logps/rejected": -0.4257528781890869,
"loss": 1.5949,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7750502824783325,
"rewards/margins": 0.28933185338974,
"rewards/rejected": -1.0643821954727173,
"step": 158
},
{
"epoch": 0.33983435746727225,
"grad_norm": 7.091736793518066,
"learning_rate": 4.172826515897145e-07,
"logits/chosen": -0.9496626853942871,
"logits/rejected": -0.8826749920845032,
"logps/chosen": -0.2823619842529297,
"logps/rejected": -0.25573766231536865,
"loss": 1.6079,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7059049606323242,
"rewards/margins": -0.06656082719564438,
"rewards/rejected": -0.6393441557884216,
"step": 159
},
{
"epoch": 0.34197168047021104,
"grad_norm": 9.935672760009766,
"learning_rate": 4.158883072525528e-07,
"logits/chosen": -1.139492392539978,
"logits/rejected": -0.9911923408508301,
"logps/chosen": -0.24080964922904968,
"logps/rejected": -0.23250696063041687,
"loss": 1.5373,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6020240783691406,
"rewards/margins": -0.020756704732775688,
"rewards/rejected": -0.5812674164772034,
"step": 160
},
{
"epoch": 0.3441090034731499,
"grad_norm": 15.794203758239746,
"learning_rate": 4.1448468148492814e-07,
"logits/chosen": -1.019397258758545,
"logits/rejected": -0.9881049394607544,
"logps/chosen": -0.3946765065193176,
"logps/rejected": -0.3796921670436859,
"loss": 1.5431,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9866912364959717,
"rewards/margins": -0.03746076300740242,
"rewards/rejected": -0.9492304921150208,
"step": 161
},
{
"epoch": 0.3462463264760887,
"grad_norm": 3.839250087738037,
"learning_rate": 4.130718528195303e-07,
"logits/chosen": -0.9311838746070862,
"logits/rejected": -0.8956501483917236,
"logps/chosen": -0.282693088054657,
"logps/rejected": -0.2629316449165344,
"loss": 1.5818,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7067327499389648,
"rewards/margins": -0.04940361529588699,
"rewards/rejected": -0.6573290824890137,
"step": 162
},
{
"epoch": 0.3483836494790275,
"grad_norm": 5.865959644317627,
"learning_rate": 4.1164990030394985e-07,
"logits/chosen": -1.0395972728729248,
"logits/rejected": -0.9770699143409729,
"logps/chosen": -0.3128069043159485,
"logps/rejected": -0.31642264127731323,
"loss": 1.5692,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.7820172905921936,
"rewards/margins": 0.009039390832185745,
"rewards/rejected": -0.791056752204895,
"step": 163
},
{
"epoch": 0.35052097248196634,
"grad_norm": 4.154603481292725,
"learning_rate": 4.10218903496256e-07,
"logits/chosen": -1.0948988199234009,
"logits/rejected": -0.9907031059265137,
"logps/chosen": -0.30839213728904724,
"logps/rejected": -0.29299482703208923,
"loss": 1.5829,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7709803581237793,
"rewards/margins": -0.03849326819181442,
"rewards/rejected": -0.7324870824813843,
"step": 164
},
{
"epoch": 0.3526582954849052,
"grad_norm": 5.253880500793457,
"learning_rate": 4.087789424605447e-07,
"logits/chosen": -1.0539865493774414,
"logits/rejected": -0.9663246870040894,
"logps/chosen": -0.26886874437332153,
"logps/rejected": -0.43172940611839294,
"loss": 1.5157,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.672171950340271,
"rewards/margins": 0.40715163946151733,
"rewards/rejected": -1.0793235301971436,
"step": 165
},
{
"epoch": 0.35479561848784397,
"grad_norm": 11.49240493774414,
"learning_rate": 4.0733009776245937e-07,
"logits/chosen": -0.9969057440757751,
"logits/rejected": -1.0402690172195435,
"logps/chosen": -0.3554040193557739,
"logps/rejected": -0.396072655916214,
"loss": 1.584,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8885101079940796,
"rewards/margins": 0.10167157649993896,
"rewards/rejected": -0.9901816248893738,
"step": 166
},
{
"epoch": 0.3569329414907828,
"grad_norm": 5.116168975830078,
"learning_rate": 4.058724504646834e-07,
"logits/chosen": -0.9382141828536987,
"logits/rejected": -0.8863942623138428,
"logps/chosen": -0.261793315410614,
"logps/rejected": -0.45277461409568787,
"loss": 1.539,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.6544832587242126,
"rewards/margins": 0.47745317220687866,
"rewards/rejected": -1.1319365501403809,
"step": 167
},
{
"epoch": 0.3590702644937216,
"grad_norm": 10.550248146057129,
"learning_rate": 4.0440608212240445e-07,
"logits/chosen": -1.0490831136703491,
"logits/rejected": -1.1039912700653076,
"logps/chosen": -0.3632212281227112,
"logps/rejected": -0.3736804723739624,
"loss": 1.5845,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9080529808998108,
"rewards/margins": 0.026148155331611633,
"rewards/rejected": -0.934201180934906,
"step": 168
},
{
"epoch": 0.36120758749666043,
"grad_norm": 4.120011806488037,
"learning_rate": 4.0293107477875156e-07,
"logits/chosen": -0.914804220199585,
"logits/rejected": -0.9306747317314148,
"logps/chosen": -0.3597089350223541,
"logps/rejected": -0.39882034063339233,
"loss": 1.5235,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8992725014686584,
"rewards/margins": 0.09777843952178955,
"rewards/rejected": -0.9970508813858032,
"step": 169
},
{
"epoch": 0.36334491049959927,
"grad_norm": 3.9872193336486816,
"learning_rate": 4.0144751096020497e-07,
"logits/chosen": -1.0519163608551025,
"logits/rejected": -0.9880449175834656,
"logps/chosen": -0.27723756432533264,
"logps/rejected": -0.3831270933151245,
"loss": 1.5744,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6930938959121704,
"rewards/margins": 0.26472383737564087,
"rewards/rejected": -0.9578177332878113,
"step": 170
},
{
"epoch": 0.36548223350253806,
"grad_norm": 5.331676006317139,
"learning_rate": 3.999554736719785e-07,
"logits/chosen": -1.1113324165344238,
"logits/rejected": -1.1892024278640747,
"logps/chosen": -0.3108530640602112,
"logps/rejected": -0.5784565806388855,
"loss": 1.4846,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7771324515342712,
"rewards/margins": 0.6690089702606201,
"rewards/rejected": -1.4461414813995361,
"step": 171
},
{
"epoch": 0.3676195565054769,
"grad_norm": 6.586511611938477,
"learning_rate": 3.9845504639337535e-07,
"logits/chosen": -1.2047513723373413,
"logits/rejected": -1.1406968832015991,
"logps/chosen": -0.3595273196697235,
"logps/rejected": -0.32145068049430847,
"loss": 1.5328,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8988182544708252,
"rewards/margins": -0.0951915979385376,
"rewards/rejected": -0.8036267757415771,
"step": 172
},
{
"epoch": 0.36975687950841574,
"grad_norm": 6.111835479736328,
"learning_rate": 3.9694631307311825e-07,
"logits/chosen": -0.8004586696624756,
"logits/rejected": -0.7772153615951538,
"logps/chosen": -0.4090813100337982,
"logps/rejected": -0.4898335635662079,
"loss": 1.559,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0227031707763672,
"rewards/margins": 0.20188063383102417,
"rewards/rejected": -1.2245839834213257,
"step": 173
},
{
"epoch": 0.3718942025113545,
"grad_norm": 7.0863189697265625,
"learning_rate": 3.954293581246514e-07,
"logits/chosen": -0.9679336547851562,
"logits/rejected": -0.9125540256500244,
"logps/chosen": -0.29369306564331055,
"logps/rejected": -0.31403255462646484,
"loss": 1.5375,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7342327237129211,
"rewards/margins": 0.05084871128201485,
"rewards/rejected": -0.7850814461708069,
"step": 174
},
{
"epoch": 0.37403152551429336,
"grad_norm": 7.140958309173584,
"learning_rate": 3.939042664214184e-07,
"logits/chosen": -0.949452817440033,
"logits/rejected": -1.0473122596740723,
"logps/chosen": -0.2707624137401581,
"logps/rejected": -0.32049351930618286,
"loss": 1.5626,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6769061088562012,
"rewards/margins": 0.12432771921157837,
"rewards/rejected": -0.8012337684631348,
"step": 175
},
{
"epoch": 0.37616884851723215,
"grad_norm": 7.0456695556640625,
"learning_rate": 3.92371123292113e-07,
"logits/chosen": -1.0727981328964233,
"logits/rejected": -1.1329890489578247,
"logps/chosen": -0.29705438017845154,
"logps/rejected": -0.3278125524520874,
"loss": 1.6107,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7426358461380005,
"rewards/margins": 0.07689555734395981,
"rewards/rejected": -0.8195314407348633,
"step": 176
},
{
"epoch": 0.378306171520171,
"grad_norm": 5.836486339569092,
"learning_rate": 3.908300145159055e-07,
"logits/chosen": -0.9942230582237244,
"logits/rejected": -1.0356171131134033,
"logps/chosen": -0.31931719183921814,
"logps/rejected": -0.33853164315223694,
"loss": 1.5837,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7982929348945618,
"rewards/margins": 0.0480361208319664,
"rewards/rejected": -0.8463290929794312,
"step": 177
},
{
"epoch": 0.3804434945231098,
"grad_norm": 8.505417823791504,
"learning_rate": 3.8928102631764304e-07,
"logits/chosen": -1.0212180614471436,
"logits/rejected": -1.087773323059082,
"logps/chosen": -0.3532945513725281,
"logps/rejected": -0.5901373028755188,
"loss": 1.5557,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.883236289024353,
"rewards/margins": 0.5921069979667664,
"rewards/rejected": -1.4753433465957642,
"step": 178
},
{
"epoch": 0.3825808175260486,
"grad_norm": 6.116640090942383,
"learning_rate": 3.877242453630256e-07,
"logits/chosen": -1.2131381034851074,
"logits/rejected": -1.0686910152435303,
"logps/chosen": -0.3515666127204895,
"logps/rejected": -0.3958896994590759,
"loss": 1.5671,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8789165019989014,
"rewards/margins": 0.11080773174762726,
"rewards/rejected": -0.9897242784500122,
"step": 179
},
{
"epoch": 0.38471814052898745,
"grad_norm": 6.355064868927002,
"learning_rate": 3.8615975875375676e-07,
"logits/chosen": -0.9339985847473145,
"logits/rejected": -0.9060691595077515,
"logps/chosen": -0.32276052236557007,
"logps/rejected": -0.37401843070983887,
"loss": 1.548,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8069013953208923,
"rewards/margins": 0.1281447559595108,
"rewards/rejected": -0.9350461959838867,
"step": 180
},
{
"epoch": 0.38685546353192624,
"grad_norm": 6.534996509552002,
"learning_rate": 3.8458765402267056e-07,
"logits/chosen": -0.8938146233558655,
"logits/rejected": -0.9069436192512512,
"logps/chosen": -0.336931049823761,
"logps/rejected": -0.4913772940635681,
"loss": 1.5787,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8423275947570801,
"rewards/margins": 0.3861156404018402,
"rewards/rejected": -1.2284431457519531,
"step": 181
},
{
"epoch": 0.3889927865348651,
"grad_norm": 10.956029891967773,
"learning_rate": 3.8300801912883414e-07,
"logits/chosen": -1.0703511238098145,
"logits/rejected": -0.9989842176437378,
"logps/chosen": -0.26583123207092285,
"logps/rejected": -0.2977861762046814,
"loss": 1.5609,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.6645781397819519,
"rewards/margins": 0.07988730072975159,
"rewards/rejected": -0.7444654107093811,
"step": 182
},
{
"epoch": 0.3911301095378039,
"grad_norm": 10.217528343200684,
"learning_rate": 3.8142094245262615e-07,
"logits/chosen": -1.145703673362732,
"logits/rejected": -1.0282764434814453,
"logps/chosen": -0.3538467586040497,
"logps/rejected": -0.3405742645263672,
"loss": 1.5855,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8846168518066406,
"rewards/margins": -0.03318122401833534,
"rewards/rejected": -0.8514357209205627,
"step": 183
},
{
"epoch": 0.3932674325407427,
"grad_norm": 4.681653022766113,
"learning_rate": 3.7982651279079227e-07,
"logits/chosen": -1.2552436590194702,
"logits/rejected": -1.259030818939209,
"logps/chosen": -0.2886826992034912,
"logps/rejected": -0.4662485718727112,
"loss": 1.5609,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7217066884040833,
"rewards/margins": 0.44391465187072754,
"rewards/rejected": -1.1656213998794556,
"step": 184
},
{
"epoch": 0.39540475554368154,
"grad_norm": 4.339652061462402,
"learning_rate": 3.7822481935147655e-07,
"logits/chosen": -1.0260683298110962,
"logits/rejected": -1.015075922012329,
"logps/chosen": -0.36714547872543335,
"logps/rejected": -0.5204967260360718,
"loss": 1.5682,
"rewards/accuracies": 0.875,
"rewards/chosen": -0.9178636074066162,
"rewards/margins": 0.38337817788124084,
"rewards/rejected": -1.3012418746948242,
"step": 185
},
{
"epoch": 0.3975420785466204,
"grad_norm": 5.974206924438477,
"learning_rate": 3.766159517492307e-07,
"logits/chosen": -1.0455535650253296,
"logits/rejected": -1.1319448947906494,
"logps/chosen": -0.41289687156677246,
"logps/rejected": -0.613991379737854,
"loss": 1.5825,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0322421789169312,
"rewards/margins": 0.5027362108230591,
"rewards/rejected": -1.5349783897399902,
"step": 186
},
{
"epoch": 0.39967940154955917,
"grad_norm": 8.767956733703613,
"learning_rate": 3.75e-07,
"logits/chosen": -1.0032697916030884,
"logits/rejected": -0.9564570784568787,
"logps/chosen": -0.31954333186149597,
"logps/rejected": -0.4057242274284363,
"loss": 1.6033,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7988582849502563,
"rewards/margins": 0.21545226871967316,
"rewards/rejected": -1.014310598373413,
"step": 187
},
{
"epoch": 0.401816724552498,
"grad_norm": 4.35204553604126,
"learning_rate": 3.7337705451608667e-07,
"logits/chosen": -1.1166412830352783,
"logits/rejected": -1.0849709510803223,
"logps/chosen": -0.3008464574813843,
"logps/rejected": -0.2960435450077057,
"loss": 1.5105,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7521160840988159,
"rewards/margins": -0.012007185257971287,
"rewards/rejected": -0.7401089072227478,
"step": 188
},
{
"epoch": 0.4039540475554368,
"grad_norm": 3.929826021194458,
"learning_rate": 3.717472061010918e-07,
"logits/chosen": -1.1040568351745605,
"logits/rejected": -1.062517523765564,
"logps/chosen": -0.3373297154903412,
"logps/rejected": -0.5283687710762024,
"loss": 1.5152,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8433243632316589,
"rewards/margins": 0.47759759426116943,
"rewards/rejected": -1.3209218978881836,
"step": 189
},
{
"epoch": 0.40609137055837563,
"grad_norm": 4.574549198150635,
"learning_rate": 3.7011054594483443e-07,
"logits/chosen": -1.1240224838256836,
"logits/rejected": -1.0487711429595947,
"logps/chosen": -0.3029400706291199,
"logps/rejected": -0.41601306200027466,
"loss": 1.4632,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7573502063751221,
"rewards/margins": 0.28268247842788696,
"rewards/rejected": -1.0400326251983643,
"step": 190
},
{
"epoch": 0.40822869356131447,
"grad_norm": 4.018647193908691,
"learning_rate": 3.6846716561824967e-07,
"logits/chosen": -0.80363529920578,
"logits/rejected": -0.9596213102340698,
"logps/chosen": -0.3076530694961548,
"logps/rejected": -0.5633202195167542,
"loss": 1.5163,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7691327333450317,
"rewards/margins": 0.6391679048538208,
"rewards/rejected": -1.408300518989563,
"step": 191
},
{
"epoch": 0.41036601656425326,
"grad_norm": 7.332089424133301,
"learning_rate": 3.668171570682655e-07,
"logits/chosen": -0.9585205316543579,
"logits/rejected": -0.9636404514312744,
"logps/chosen": -0.33684462308883667,
"logps/rejected": -0.3766506016254425,
"loss": 1.5671,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8421115279197693,
"rewards/margins": 0.09951499849557877,
"rewards/rejected": -0.9416265487670898,
"step": 192
},
{
"epoch": 0.4125033395671921,
"grad_norm": 8.853985786437988,
"learning_rate": 3.6516061261265805e-07,
"logits/chosen": -1.027462363243103,
"logits/rejected": -0.9135668873786926,
"logps/chosen": -0.3370886445045471,
"logps/rejected": -0.3812939524650574,
"loss": 1.5598,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8427215814590454,
"rewards/margins": 0.11051319539546967,
"rewards/rejected": -0.953234851360321,
"step": 193
},
{
"epoch": 0.41464066257013094,
"grad_norm": 5.765879154205322,
"learning_rate": 3.634976249348867e-07,
"logits/chosen": -1.1132540702819824,
"logits/rejected": -1.003641963005066,
"logps/chosen": -0.3518536686897278,
"logps/rejected": -0.5063703656196594,
"loss": 1.5071,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8796342015266418,
"rewards/margins": 0.38629183173179626,
"rewards/rejected": -1.2659261226654053,
"step": 194
},
{
"epoch": 0.4167779855730697,
"grad_norm": 17.148714065551758,
"learning_rate": 3.618282870789081e-07,
"logits/chosen": -1.041336178779602,
"logits/rejected": -1.0308490991592407,
"logps/chosen": -0.4422120749950409,
"logps/rejected": -0.4290231466293335,
"loss": 1.6783,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1055301427841187,
"rewards/margins": -0.032972272485494614,
"rewards/rejected": -1.072557806968689,
"step": 195
},
{
"epoch": 0.41891530857600856,
"grad_norm": 4.9743332862854,
"learning_rate": 3.601526924439709e-07,
"logits/chosen": -0.9943188428878784,
"logits/rejected": -1.029951810836792,
"logps/chosen": -0.2909929156303406,
"logps/rejected": -0.3154396116733551,
"loss": 1.5771,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7274821996688843,
"rewards/margins": 0.061116717755794525,
"rewards/rejected": -0.7885990142822266,
"step": 196
},
{
"epoch": 0.42105263157894735,
"grad_norm": 6.192495346069336,
"learning_rate": 3.584709347793895e-07,
"logits/chosen": -0.8082910776138306,
"logits/rejected": -0.8116950988769531,
"logps/chosen": -0.2856646478176117,
"logps/rejected": -0.30446913838386536,
"loss": 1.5157,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.7141616344451904,
"rewards/margins": 0.04701121151447296,
"rewards/rejected": -0.7611728310585022,
"step": 197
},
{
"epoch": 0.4231899545818862,
"grad_norm": 4.891373157501221,
"learning_rate": 3.567831081792992e-07,
"logits/chosen": -1.0285996198654175,
"logits/rejected": -1.034073829650879,
"logps/chosen": -0.3283870220184326,
"logps/rejected": -0.5464656949043274,
"loss": 1.4871,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8209674954414368,
"rewards/margins": 0.5451965928077698,
"rewards/rejected": -1.366164207458496,
"step": 198
},
{
"epoch": 0.425327277584825,
"grad_norm": 13.869108200073242,
"learning_rate": 3.550893070773914e-07,
"logits/chosen": -1.0854626893997192,
"logits/rejected": -1.0260361433029175,
"logps/chosen": -0.39059579372406006,
"logps/rejected": -0.4412023425102234,
"loss": 1.6672,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9764894247055054,
"rewards/margins": 0.12651631236076355,
"rewards/rejected": -1.1030058860778809,
"step": 199
},
{
"epoch": 0.4274646005877638,
"grad_norm": 29.342126846313477,
"learning_rate": 3.5338962624163016e-07,
"logits/chosen": -1.1286933422088623,
"logits/rejected": -1.1019514799118042,
"logps/chosen": -0.29572370648384094,
"logps/rejected": -0.3438429832458496,
"loss": 1.6118,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7393092513084412,
"rewards/margins": 0.12029813230037689,
"rewards/rejected": -0.8596073985099792,
"step": 200
},
{
"epoch": 0.42960192359070265,
"grad_norm": 4.435629367828369,
"learning_rate": 3.516841607689501e-07,
"logits/chosen": -1.1759017705917358,
"logits/rejected": -1.0626184940338135,
"logps/chosen": -0.3442676067352295,
"logps/rejected": -0.3576590120792389,
"loss": 1.5321,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8606690168380737,
"rewards/margins": 0.03347862884402275,
"rewards/rejected": -0.8941476345062256,
"step": 201
},
{
"epoch": 0.4317392465936415,
"grad_norm": 5.45989990234375,
"learning_rate": 3.499730060799352e-07,
"logits/chosen": -1.1944599151611328,
"logits/rejected": -1.1447770595550537,
"logps/chosen": -0.300496369600296,
"logps/rejected": -0.3771470785140991,
"loss": 1.4774,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7512409687042236,
"rewards/margins": 0.19162678718566895,
"rewards/rejected": -0.9428676962852478,
"step": 202
},
{
"epoch": 0.4338765695965803,
"grad_norm": 4.396944046020508,
"learning_rate": 3.482562579134809e-07,
"logits/chosen": -0.9371283054351807,
"logits/rejected": -0.9887581467628479,
"logps/chosen": -0.34337079524993896,
"logps/rejected": -0.31941717863082886,
"loss": 1.5624,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8584270477294922,
"rewards/margins": -0.05988417938351631,
"rewards/rejected": -0.798542857170105,
"step": 203
},
{
"epoch": 0.4360138925995191,
"grad_norm": 5.779623508453369,
"learning_rate": 3.465340123214365e-07,
"logits/chosen": -0.9840802550315857,
"logits/rejected": -0.9649553298950195,
"logps/chosen": -0.5713462829589844,
"logps/rejected": -0.7279367446899414,
"loss": 1.5474,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4283654689788818,
"rewards/margins": 0.39147651195526123,
"rewards/rejected": -1.8198421001434326,
"step": 204
},
{
"epoch": 0.4381512156024579,
"grad_norm": 10.535792350769043,
"learning_rate": 3.448063656632321e-07,
"logits/chosen": -1.1214243173599243,
"logits/rejected": -1.0236384868621826,
"logps/chosen": -0.327178418636322,
"logps/rejected": -0.3443678021430969,
"loss": 1.5847,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8179460167884827,
"rewards/margins": 0.042973555624485016,
"rewards/rejected": -0.8609195351600647,
"step": 205
},
{
"epoch": 0.44028853860539674,
"grad_norm": 5.442493915557861,
"learning_rate": 3.430734146004863e-07,
"logits/chosen": -1.1191673278808594,
"logits/rejected": -0.9904736876487732,
"logps/chosen": -0.2607005536556244,
"logps/rejected": -0.2681718170642853,
"loss": 1.542,
"rewards/accuracies": 0.3125,
"rewards/chosen": -0.6517513990402222,
"rewards/margins": 0.01867814175784588,
"rewards/rejected": -0.670429527759552,
"step": 206
},
{
"epoch": 0.4424258616083356,
"grad_norm": 6.850170612335205,
"learning_rate": 3.413352560915988e-07,
"logits/chosen": -1.0275464057922363,
"logits/rejected": -1.0052015781402588,
"logps/chosen": -0.3867985010147095,
"logps/rejected": -0.4938412010669708,
"loss": 1.6312,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9669963121414185,
"rewards/margins": 0.267606645822525,
"rewards/rejected": -1.234602928161621,
"step": 207
},
{
"epoch": 0.44456318461127436,
"grad_norm": 9.965657234191895,
"learning_rate": 3.39591987386325e-07,
"logits/chosen": -0.9659216403961182,
"logits/rejected": -0.9130998253822327,
"logps/chosen": -0.33372846245765686,
"logps/rejected": -0.3092671036720276,
"loss": 1.5355,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.834321141242981,
"rewards/margins": -0.061153292655944824,
"rewards/rejected": -0.7731677889823914,
"step": 208
},
{
"epoch": 0.4467005076142132,
"grad_norm": 5.595789909362793,
"learning_rate": 3.378437060203357e-07,
"logits/chosen": -1.2547951936721802,
"logits/rejected": -1.1610562801361084,
"logps/chosen": -0.34088316559791565,
"logps/rejected": -0.34324803948402405,
"loss": 1.6059,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8522078990936279,
"rewards/margins": 0.00591224804520607,
"rewards/rejected": -0.8581202030181885,
"step": 209
},
{
"epoch": 0.448837830617152,
"grad_norm": 17.92057991027832,
"learning_rate": 3.360905098097587e-07,
"logits/chosen": -1.0579925775527954,
"logits/rejected": -0.9834758043289185,
"logps/chosen": -0.38748034834861755,
"logps/rejected": -0.6860374808311462,
"loss": 1.5363,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9687008857727051,
"rewards/margins": 0.7463930249214172,
"rewards/rejected": -1.715093970298767,
"step": 210
},
{
"epoch": 0.45097515362009083,
"grad_norm": 6.570519924163818,
"learning_rate": 3.343324968457075e-07,
"logits/chosen": -1.0359179973602295,
"logits/rejected": -0.9564209580421448,
"logps/chosen": -0.38825637102127075,
"logps/rejected": -0.3802332878112793,
"loss": 1.5384,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.970641016960144,
"rewards/margins": -0.02005772665143013,
"rewards/rejected": -0.950583279132843,
"step": 211
},
{
"epoch": 0.45311247662302967,
"grad_norm": 5.518048286437988,
"learning_rate": 3.325697654887918e-07,
"logits/chosen": -0.998512327671051,
"logits/rejected": -0.9381792545318604,
"logps/chosen": -0.3794736862182617,
"logps/rejected": -0.6236636638641357,
"loss": 1.5091,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9486840963363647,
"rewards/margins": 0.6104748249053955,
"rewards/rejected": -1.5591590404510498,
"step": 212
},
{
"epoch": 0.45524979962596845,
"grad_norm": 12.084184646606445,
"learning_rate": 3.30802414363615e-07,
"logits/chosen": -0.9403542280197144,
"logits/rejected": -0.6737431287765503,
"logps/chosen": -0.4244030714035034,
"logps/rejected": -0.43834903836250305,
"loss": 1.4581,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.0610076189041138,
"rewards/margins": 0.03486503288149834,
"rewards/rejected": -1.0958726406097412,
"step": 213
},
{
"epoch": 0.4573871226289073,
"grad_norm": 3.5294582843780518,
"learning_rate": 3.2903054235325613e-07,
"logits/chosen": -1.1825759410858154,
"logits/rejected": -1.210655927658081,
"logps/chosen": -0.3315200209617615,
"logps/rejected": -0.46745753288269043,
"loss": 1.5312,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8287999629974365,
"rewards/margins": 0.33984383940696716,
"rewards/rejected": -1.168643832206726,
"step": 214
},
{
"epoch": 0.45952444563184613,
"grad_norm": 6.134922027587891,
"learning_rate": 3.272542485937368e-07,
"logits/chosen": -1.1072171926498413,
"logits/rejected": -1.208855152130127,
"logps/chosen": -0.4051734209060669,
"logps/rejected": -0.6289750337600708,
"loss": 1.5473,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.012933611869812,
"rewards/margins": 0.5595039129257202,
"rewards/rejected": -1.5724375247955322,
"step": 215
},
{
"epoch": 0.4616617686347849,
"grad_norm": 6.10336446762085,
"learning_rate": 3.2547363246847546e-07,
"logits/chosen": -1.0125056505203247,
"logits/rejected": -1.0291041135787964,
"logps/chosen": -0.3960397243499756,
"logps/rejected": -0.6897832751274109,
"loss": 1.5091,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.990099310874939,
"rewards/margins": 0.7343588471412659,
"rewards/rejected": -1.72445809841156,
"step": 216
},
{
"epoch": 0.46379909163772376,
"grad_norm": 9.434686660766602,
"learning_rate": 3.2368879360272606e-07,
"logits/chosen": -1.0608569383621216,
"logits/rejected": -1.0038235187530518,
"logps/chosen": -0.4567071199417114,
"logps/rejected": -0.42994168400764465,
"loss": 1.616,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1417678594589233,
"rewards/margins": -0.06691357493400574,
"rewards/rejected": -1.0748542547225952,
"step": 217
},
{
"epoch": 0.46593641464066254,
"grad_norm": 5.21524715423584,
"learning_rate": 3.218998318580043e-07,
"logits/chosen": -1.1354548931121826,
"logits/rejected": -1.0435974597930908,
"logps/chosen": -0.2741296589374542,
"logps/rejected": -0.37144631147384644,
"loss": 1.6029,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.6853241920471191,
"rewards/margins": 0.24329157173633575,
"rewards/rejected": -0.9286156892776489,
"step": 218
},
{
"epoch": 0.4680737376436014,
"grad_norm": 4.108745574951172,
"learning_rate": 3.201068473265007e-07,
"logits/chosen": -0.8878648281097412,
"logits/rejected": -0.8645142316818237,
"logps/chosen": -0.32466036081314087,
"logps/rejected": -0.28847843408584595,
"loss": 1.6023,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.811650812625885,
"rewards/margins": -0.09045480191707611,
"rewards/rejected": -0.7211960554122925,
"step": 219
},
{
"epoch": 0.4702110606465402,
"grad_norm": 17.760408401489258,
"learning_rate": 3.1830994032548e-07,
"logits/chosen": -1.197770595550537,
"logits/rejected": -1.0971354246139526,
"logps/chosen": -0.44655174016952515,
"logps/rejected": -0.5050027370452881,
"loss": 1.6185,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1163791418075562,
"rewards/margins": 0.14612753689289093,
"rewards/rejected": -1.2625068426132202,
"step": 220
},
{
"epoch": 0.472348383649479,
"grad_norm": 24.489158630371094,
"learning_rate": 3.1650921139166874e-07,
"logits/chosen": -0.9091489315032959,
"logits/rejected": -0.9671614766120911,
"logps/chosen": -0.2689306437969208,
"logps/rejected": -0.2791651487350464,
"loss": 1.6576,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.6723266243934631,
"rewards/margins": 0.02558620274066925,
"rewards/rejected": -0.6979128122329712,
"step": 221
},
{
"epoch": 0.47448570665241785,
"grad_norm": 4.240891933441162,
"learning_rate": 3.147047612756302e-07,
"logits/chosen": -1.1410434246063232,
"logits/rejected": -0.9494026303291321,
"logps/chosen": -0.3623463809490204,
"logps/rejected": -0.3546559810638428,
"loss": 1.5634,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9058659672737122,
"rewards/margins": -0.01922605186700821,
"rewards/rejected": -0.8866399526596069,
"step": 222
},
{
"epoch": 0.4766230296553567,
"grad_norm": 11.909400939941406,
"learning_rate": 3.128966909361271e-07,
"logits/chosen": -1.0778872966766357,
"logits/rejected": -0.9947598576545715,
"logps/chosen": -0.2876349687576294,
"logps/rejected": -0.3500506281852722,
"loss": 1.5763,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7190873622894287,
"rewards/margins": 0.15603923797607422,
"rewards/rejected": -0.8751266002655029,
"step": 223
},
{
"epoch": 0.4787603526582955,
"grad_norm": 3.9968485832214355,
"learning_rate": 3.110851015344735e-07,
"logits/chosen": -1.043594241142273,
"logits/rejected": -1.0751991271972656,
"logps/chosen": -0.3403151333332062,
"logps/rejected": -0.45080384612083435,
"loss": 1.4964,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.850787878036499,
"rewards/margins": 0.27622172236442566,
"rewards/rejected": -1.127009630203247,
"step": 224
},
{
"epoch": 0.4808976756612343,
"grad_norm": 4.30190372467041,
"learning_rate": 3.0927009442887437e-07,
"logits/chosen": -0.9305320978164673,
"logits/rejected": -1.0111606121063232,
"logps/chosen": -0.32919758558273315,
"logps/rejected": -0.34503474831581116,
"loss": 1.5875,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8229939341545105,
"rewards/margins": 0.0395929217338562,
"rewards/rejected": -0.8625868558883667,
"step": 225
},
{
"epoch": 0.4830349986641731,
"grad_norm": 5.68215799331665,
"learning_rate": 3.074517711687549e-07,
"logits/chosen": -0.9502861499786377,
"logits/rejected": -0.9219777584075928,
"logps/chosen": -0.40744659304618835,
"logps/rejected": -0.4551170766353607,
"loss": 1.5318,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0186164379119873,
"rewards/margins": 0.11917618662118912,
"rewards/rejected": -1.137792706489563,
"step": 226
},
{
"epoch": 0.48517232166711194,
"grad_norm": 5.924420356750488,
"learning_rate": 3.056302334890786e-07,
"logits/chosen": -1.0599088668823242,
"logits/rejected": -0.9398927688598633,
"logps/chosen": -0.2768517732620239,
"logps/rejected": -0.3650413155555725,
"loss": 1.5365,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.692129373550415,
"rewards/margins": 0.22047390043735504,
"rewards/rejected": -0.9126032590866089,
"step": 227
},
{
"epoch": 0.4873096446700508,
"grad_norm": 9.18790340423584,
"learning_rate": 3.038055833046555e-07,
"logits/chosen": -1.23221755027771,
"logits/rejected": -1.1094015836715698,
"logps/chosen": -0.3468588590621948,
"logps/rejected": -0.533679723739624,
"loss": 1.5544,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8671470880508423,
"rewards/margins": 0.4670522212982178,
"rewards/rejected": -1.3341991901397705,
"step": 228
},
{
"epoch": 0.48944696767298956,
"grad_norm": 3.236159563064575,
"learning_rate": 3.0197792270443976e-07,
"logits/chosen": -1.101015567779541,
"logits/rejected": -0.980370044708252,
"logps/chosen": -0.5276182293891907,
"logps/rejected": -0.2907797694206238,
"loss": 1.6115,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.3190455436706543,
"rewards/margins": -0.592096209526062,
"rewards/rejected": -0.7269493341445923,
"step": 229
},
{
"epoch": 0.4915842906759284,
"grad_norm": 6.479002475738525,
"learning_rate": 3.001473539458182e-07,
"logits/chosen": -1.1139984130859375,
"logits/rejected": -1.0145281553268433,
"logps/chosen": -0.40499821305274963,
"logps/rejected": -0.5032440423965454,
"loss": 1.5857,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0124956369400024,
"rewards/margins": 0.24561452865600586,
"rewards/rejected": -1.2581101655960083,
"step": 230
},
{
"epoch": 0.49372161367886724,
"grad_norm": 11.510492324829102,
"learning_rate": 2.983139794488883e-07,
"logits/chosen": -1.1703720092773438,
"logits/rejected": -1.0775160789489746,
"logps/chosen": -0.4314059615135193,
"logps/rejected": -0.39874231815338135,
"loss": 1.6011,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.078514814376831,
"rewards/margins": -0.08165915310382843,
"rewards/rejected": -0.9968557953834534,
"step": 231
},
{
"epoch": 0.49585893668180603,
"grad_norm": 5.559615612030029,
"learning_rate": 2.964779017907287e-07,
"logits/chosen": -1.0301462411880493,
"logits/rejected": -1.0727837085723877,
"logps/chosen": -0.40483570098876953,
"logps/rejected": -0.45073747634887695,
"loss": 1.5311,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0120892524719238,
"rewards/margins": 0.114754319190979,
"rewards/rejected": -1.1268435716629028,
"step": 232
},
{
"epoch": 0.49799625968474487,
"grad_norm": 4.339590549468994,
"learning_rate": 2.9463922369965915e-07,
"logits/chosen": -0.9359559416770935,
"logits/rejected": -0.9321252703666687,
"logps/chosen": -0.35180893540382385,
"logps/rejected": -0.536721408367157,
"loss": 1.5723,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8795222640037537,
"rewards/margins": 0.4622812271118164,
"rewards/rejected": -1.3418035507202148,
"step": 233
},
{
"epoch": 0.5001335826876837,
"grad_norm": 4.583502292633057,
"learning_rate": 2.927980480494938e-07,
"logits/chosen": -1.0992170572280884,
"logits/rejected": -1.0070428848266602,
"logps/chosen": -0.36073118448257446,
"logps/rejected": -0.3927144706249237,
"loss": 1.5751,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9018279314041138,
"rewards/margins": 0.07995828986167908,
"rewards/rejected": -0.9817862510681152,
"step": 234
},
{
"epoch": 0.5022709056906225,
"grad_norm": 4.529286861419678,
"learning_rate": 2.909544778537844e-07,
"logits/chosen": -1.1656326055526733,
"logits/rejected": -1.0929317474365234,
"logps/chosen": -0.3711916506290436,
"logps/rejected": -0.3863615393638611,
"loss": 1.5221,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9279791116714478,
"rewards/margins": 0.03792468085885048,
"rewards/rejected": -0.9659038186073303,
"step": 235
},
{
"epoch": 0.5044082286935613,
"grad_norm": 10.745676040649414,
"learning_rate": 2.8910861626005773e-07,
"logits/chosen": -1.058958649635315,
"logits/rejected": -0.9348481297492981,
"logps/chosen": -0.31637054681777954,
"logps/rejected": -0.33828607201576233,
"loss": 1.4931,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.7909263968467712,
"rewards/margins": 0.05478885397315025,
"rewards/rejected": -0.8457151055335999,
"step": 236
},
{
"epoch": 0.5065455516965002,
"grad_norm": 4.350003242492676,
"learning_rate": 2.872605665440436e-07,
"logits/chosen": -1.155067801475525,
"logits/rejected": -1.044098138809204,
"logps/chosen": -0.4006834626197815,
"logps/rejected": -0.3987181484699249,
"loss": 1.5417,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.001708745956421,
"rewards/margins": -0.004913315176963806,
"rewards/rejected": -0.9967952966690063,
"step": 237
},
{
"epoch": 0.508682874699439,
"grad_norm": 4.5963358879089355,
"learning_rate": 2.8541043210389726e-07,
"logits/chosen": -0.9011512994766235,
"logits/rejected": -0.9799545407295227,
"logps/chosen": -0.30112171173095703,
"logps/rejected": -0.4484432339668274,
"loss": 1.4859,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7528042793273926,
"rewards/margins": 0.36830389499664307,
"rewards/rejected": -1.1211082935333252,
"step": 238
},
{
"epoch": 0.5108201977023777,
"grad_norm": 5.283090591430664,
"learning_rate": 2.8355831645441387e-07,
"logits/chosen": -1.2146248817443848,
"logits/rejected": -1.2574325799942017,
"logps/chosen": -0.3450472354888916,
"logps/rejected": -0.4763634204864502,
"loss": 1.4888,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.862618088722229,
"rewards/margins": 0.32829049229621887,
"rewards/rejected": -1.1909085512161255,
"step": 239
},
{
"epoch": 0.5129575207053166,
"grad_norm": 6.43093729019165,
"learning_rate": 2.817043232212371e-07,
"logits/chosen": -1.2071186304092407,
"logits/rejected": -1.1450533866882324,
"logps/chosen": -0.3647967576980591,
"logps/rejected": -0.4625674784183502,
"loss": 1.5268,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9119919538497925,
"rewards/margins": 0.24442686140537262,
"rewards/rejected": -1.156418800354004,
"step": 240
},
{
"epoch": 0.5150948437082554,
"grad_norm": 9.717195510864258,
"learning_rate": 2.7984855613506106e-07,
"logits/chosen": -1.1946227550506592,
"logits/rejected": -1.1376502513885498,
"logps/chosen": -0.29764774441719055,
"logps/rejected": -0.302202045917511,
"loss": 1.5322,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.74411940574646,
"rewards/margins": 0.01138581894338131,
"rewards/rejected": -0.7555052042007446,
"step": 241
},
{
"epoch": 0.5172321667111942,
"grad_norm": 6.479928493499756,
"learning_rate": 2.7799111902582693e-07,
"logits/chosen": -1.2251317501068115,
"logits/rejected": -1.0719342231750488,
"logps/chosen": -0.3156971037387848,
"logps/rejected": -0.2400185763835907,
"loss": 1.5725,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.789242684841156,
"rewards/margins": -0.18919625878334045,
"rewards/rejected": -0.6000465154647827,
"step": 242
},
{
"epoch": 0.5193694897141331,
"grad_norm": 8.900924682617188,
"learning_rate": 2.761321158169134e-07,
"logits/chosen": -1.0849740505218506,
"logits/rejected": -1.1516170501708984,
"logps/chosen": -0.35676899552345276,
"logps/rejected": -0.5772523283958435,
"loss": 1.4924,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8919224739074707,
"rewards/margins": 0.5512083172798157,
"rewards/rejected": -1.4431307315826416,
"step": 243
},
{
"epoch": 0.5215068127170719,
"grad_norm": 5.021285533905029,
"learning_rate": 2.74271650519322e-07,
"logits/chosen": -1.1510225534439087,
"logits/rejected": -1.1225014925003052,
"logps/chosen": -0.3502144515514374,
"logps/rejected": -0.48459944128990173,
"loss": 1.5383,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.8755362033843994,
"rewards/margins": 0.3359624445438385,
"rewards/rejected": -1.2114986181259155,
"step": 244
},
{
"epoch": 0.5236441357200107,
"grad_norm": 6.033867359161377,
"learning_rate": 2.7240982722585837e-07,
"logits/chosen": -1.0076422691345215,
"logits/rejected": -1.0045421123504639,
"logps/chosen": -0.3226780295372009,
"logps/rejected": -0.37450891733169556,
"loss": 1.5745,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8066950440406799,
"rewards/margins": 0.1295773833990097,
"rewards/rejected": -0.9362723231315613,
"step": 245
},
{
"epoch": 0.5257814587229495,
"grad_norm": 7.610095977783203,
"learning_rate": 2.705467501053076e-07,
"logits/chosen": -1.3070695400238037,
"logits/rejected": -1.360163688659668,
"logps/chosen": -0.4132193624973297,
"logps/rejected": -0.5460841059684753,
"loss": 1.5482,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.033048391342163,
"rewards/margins": 0.3321617841720581,
"rewards/rejected": -1.3652101755142212,
"step": 246
},
{
"epoch": 0.5279187817258884,
"grad_norm": 5.6094770431518555,
"learning_rate": 2.6868252339660607e-07,
"logits/chosen": -0.9480774998664856,
"logits/rejected": -0.9445351362228394,
"logps/chosen": -0.5733252763748169,
"logps/rejected": -1.0633982419967651,
"loss": 1.5284,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.433313250541687,
"rewards/margins": 1.225182294845581,
"rewards/rejected": -2.6584954261779785,
"step": 247
},
{
"epoch": 0.5300561047288271,
"grad_norm": 16.856760025024414,
"learning_rate": 2.6681725140300995e-07,
"logits/chosen": -1.1925255060195923,
"logits/rejected": -1.1129463911056519,
"logps/chosen": -0.28040605783462524,
"logps/rejected": -0.36458098888397217,
"loss": 1.5368,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7010151743888855,
"rewards/margins": 0.21043738722801208,
"rewards/rejected": -0.9114525318145752,
"step": 248
},
{
"epoch": 0.5321934277317659,
"grad_norm": 6.675515174865723,
"learning_rate": 2.6495103848625854e-07,
"logits/chosen": -1.2934060096740723,
"logits/rejected": -1.17371666431427,
"logps/chosen": -0.3621112108230591,
"logps/rejected": -0.4810316562652588,
"loss": 1.5586,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9052779674530029,
"rewards/margins": 0.29730114340782166,
"rewards/rejected": -1.2025790214538574,
"step": 249
},
{
"epoch": 0.5343307507347048,
"grad_norm": 15.49481201171875,
"learning_rate": 2.63083989060736e-07,
"logits/chosen": -1.019038438796997,
"logits/rejected": -0.9999558925628662,
"logps/chosen": -0.431622713804245,
"logps/rejected": -0.620360255241394,
"loss": 1.55,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0790568590164185,
"rewards/margins": 0.47184401750564575,
"rewards/rejected": -1.550900936126709,
"step": 250
},
{
"epoch": 0.5364680737376436,
"grad_norm": 4.874868392944336,
"learning_rate": 2.6121620758762875e-07,
"logits/chosen": -1.1522804498672485,
"logits/rejected": -1.144692301750183,
"logps/chosen": -0.40826401114463806,
"logps/rejected": -0.4715278744697571,
"loss": 1.5317,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0206599235534668,
"rewards/margins": 0.15815965831279755,
"rewards/rejected": -1.1788195371627808,
"step": 251
},
{
"epoch": 0.5386053967405824,
"grad_norm": 4.113776206970215,
"learning_rate": 2.593477985690815e-07,
"logits/chosen": -1.0712709426879883,
"logits/rejected": -1.1005451679229736,
"logps/chosen": -0.5715100765228271,
"logps/rejected": -0.6493417620658875,
"loss": 1.5129,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4287753105163574,
"rewards/margins": 0.19457919895648956,
"rewards/rejected": -1.6233545541763306,
"step": 252
},
{
"epoch": 0.5407427197435213,
"grad_norm": 17.1635799407959,
"learning_rate": 2.574788665423496e-07,
"logits/chosen": -0.9928967356681824,
"logits/rejected": -0.9838371276855469,
"logps/chosen": -0.3351861536502838,
"logps/rejected": -0.3290242850780487,
"loss": 1.5488,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.8379653692245483,
"rewards/margins": -0.01540469378232956,
"rewards/rejected": -0.822560727596283,
"step": 253
},
{
"epoch": 0.5428800427464601,
"grad_norm": 8.28346061706543,
"learning_rate": 2.5560951607395126e-07,
"logits/chosen": -1.1226708889007568,
"logits/rejected": -1.0680346488952637,
"logps/chosen": -0.3342251777648926,
"logps/rejected": -0.3822442591190338,
"loss": 1.5604,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8355628252029419,
"rewards/margins": 0.12004776298999786,
"rewards/rejected": -0.9556106925010681,
"step": 254
},
{
"epoch": 0.5450173657493989,
"grad_norm": 12.173513412475586,
"learning_rate": 2.537398517538159e-07,
"logits/chosen": -1.1180171966552734,
"logits/rejected": -1.1232236623764038,
"logps/chosen": -0.3291173279285431,
"logps/rejected": -0.5288177132606506,
"loss": 1.4907,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8227933645248413,
"rewards/margins": 0.49925094842910767,
"rewards/rejected": -1.3220442533493042,
"step": 255
},
{
"epoch": 0.5471546887523377,
"grad_norm": 4.996148109436035,
"learning_rate": 2.518699781894332e-07,
"logits/chosen": -1.0864285230636597,
"logits/rejected": -1.0856531858444214,
"logps/chosen": -0.46618932485580444,
"logps/rejected": -0.9816129803657532,
"loss": 1.5076,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.165473222732544,
"rewards/margins": 1.2885593175888062,
"rewards/rejected": -2.4540326595306396,
"step": 256
},
{
"epoch": 0.5492920117552765,
"grad_norm": 5.049304008483887,
"learning_rate": 2.5e-07,
"logits/chosen": -0.9920480251312256,
"logits/rejected": -0.897991418838501,
"logps/chosen": -0.3009772002696991,
"logps/rejected": -0.3982135057449341,
"loss": 1.6439,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.7524430155754089,
"rewards/margins": 0.24309074878692627,
"rewards/rejected": -0.9955337643623352,
"step": 257
},
{
"epoch": 0.5514293347582153,
"grad_norm": 4.575491428375244,
"learning_rate": 2.4813002181056676e-07,
"logits/chosen": -1.0483980178833008,
"logits/rejected": -1.040475845336914,
"logps/chosen": -0.2760324478149414,
"logps/rejected": -0.5634697675704956,
"loss": 1.5932,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6900811791419983,
"rewards/margins": 0.718593180179596,
"rewards/rejected": -1.4086742401123047,
"step": 258
},
{
"epoch": 0.5535666577611541,
"grad_norm": 4.031703948974609,
"learning_rate": 2.4626014824618413e-07,
"logits/chosen": -1.2272746562957764,
"logits/rejected": -1.2073853015899658,
"logps/chosen": -0.4353184700012207,
"logps/rejected": -0.5070162415504456,
"loss": 1.5153,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0882960557937622,
"rewards/margins": 0.17924460768699646,
"rewards/rejected": -1.2675405740737915,
"step": 259
},
{
"epoch": 0.555703980764093,
"grad_norm": 8.027057647705078,
"learning_rate": 2.4439048392604877e-07,
"logits/chosen": -0.953754186630249,
"logits/rejected": -0.9900184869766235,
"logps/chosen": -0.2740909159183502,
"logps/rejected": -0.3458973467350006,
"loss": 1.5291,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.6852273344993591,
"rewards/margins": 0.17951609194278717,
"rewards/rejected": -0.8647434711456299,
"step": 260
},
{
"epoch": 0.5578413037670318,
"grad_norm": 7.306129455566406,
"learning_rate": 2.4252113345765043e-07,
"logits/chosen": -0.9035928845405579,
"logits/rejected": -0.8614873290061951,
"logps/chosen": -0.2865443229675293,
"logps/rejected": -0.32079729437828064,
"loss": 1.5665,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7163608074188232,
"rewards/margins": 0.0856325626373291,
"rewards/rejected": -0.8019933104515076,
"step": 261
},
{
"epoch": 0.5599786267699706,
"grad_norm": 3.2643260955810547,
"learning_rate": 2.406522014309186e-07,
"logits/chosen": -1.1808401346206665,
"logits/rejected": -1.1874431371688843,
"logps/chosen": -0.5122575163841248,
"logps/rejected": -0.8233806490898132,
"loss": 1.5851,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2806435823440552,
"rewards/margins": 0.7778077721595764,
"rewards/rejected": -2.0584514141082764,
"step": 262
},
{
"epoch": 0.5621159497729095,
"grad_norm": 3.0484321117401123,
"learning_rate": 2.3878379241237134e-07,
"logits/chosen": -1.1015522480010986,
"logits/rejected": -1.1043397188186646,
"logps/chosen": -0.5216892957687378,
"logps/rejected": -0.5477871298789978,
"loss": 1.4888,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3042232990264893,
"rewards/margins": 0.06524449586868286,
"rewards/rejected": -1.3694677352905273,
"step": 263
},
{
"epoch": 0.5642532727758482,
"grad_norm": 8.040013313293457,
"learning_rate": 2.3691601093926402e-07,
"logits/chosen": -1.0679914951324463,
"logits/rejected": -1.041649580001831,
"logps/chosen": -0.4239467978477478,
"logps/rejected": -0.427889347076416,
"loss": 1.6854,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0598669052124023,
"rewards/margins": 0.009856484830379486,
"rewards/rejected": -1.06972336769104,
"step": 264
},
{
"epoch": 0.566390595778787,
"grad_norm": 10.049110412597656,
"learning_rate": 2.3504896151374144e-07,
"logits/chosen": -1.1767913103103638,
"logits/rejected": -1.2240692377090454,
"logps/chosen": -0.4159534275531769,
"logps/rejected": -0.5419010519981384,
"loss": 1.5352,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0398834943771362,
"rewards/margins": 0.3148690462112427,
"rewards/rejected": -1.3547526597976685,
"step": 265
},
{
"epoch": 0.5685279187817259,
"grad_norm": 8.153444290161133,
"learning_rate": 2.3318274859699008e-07,
"logits/chosen": -1.063308596611023,
"logits/rejected": -1.164639949798584,
"logps/chosen": -0.2907513678073883,
"logps/rejected": -0.5243133306503296,
"loss": 1.6477,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.7268784046173096,
"rewards/margins": 0.5839048624038696,
"rewards/rejected": -1.3107832670211792,
"step": 266
},
{
"epoch": 0.5706652417846647,
"grad_norm": 8.460691452026367,
"learning_rate": 2.3131747660339394e-07,
"logits/chosen": -1.2165307998657227,
"logits/rejected": -1.1944361925125122,
"logps/chosen": -0.5601080656051636,
"logps/rejected": -0.47026118636131287,
"loss": 1.5634,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4002699851989746,
"rewards/margins": -0.2246171087026596,
"rewards/rejected": -1.1756529808044434,
"step": 267
},
{
"epoch": 0.5728025647876035,
"grad_norm": 13.028862953186035,
"learning_rate": 2.2945324989469243e-07,
"logits/chosen": -1.0125137567520142,
"logits/rejected": -0.9787082672119141,
"logps/chosen": -0.3832467794418335,
"logps/rejected": -0.7757288217544556,
"loss": 1.4993,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.958116888999939,
"rewards/margins": 0.9812053442001343,
"rewards/rejected": -1.9393221139907837,
"step": 268
},
{
"epoch": 0.5749398877905424,
"grad_norm": 10.404675483703613,
"learning_rate": 2.2759017277414164e-07,
"logits/chosen": -1.1808825731277466,
"logits/rejected": -1.1194167137145996,
"logps/chosen": -0.43756401538848877,
"logps/rejected": -0.3946504294872284,
"loss": 1.6378,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.0939099788665771,
"rewards/margins": -0.10728396475315094,
"rewards/rejected": -0.9866260290145874,
"step": 269
},
{
"epoch": 0.5770772107934812,
"grad_norm": 7.459733963012695,
"learning_rate": 2.2572834948067795e-07,
"logits/chosen": -0.9175713062286377,
"logits/rejected": -0.9572230577468872,
"logps/chosen": -0.2940235137939453,
"logps/rejected": -0.3464244604110718,
"loss": 1.6275,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.7350587844848633,
"rewards/margins": 0.13100232183933258,
"rewards/rejected": -0.8660610914230347,
"step": 270
},
{
"epoch": 0.57921453379642,
"grad_norm": 6.2825493812561035,
"learning_rate": 2.2386788418308665e-07,
"logits/chosen": -1.0154887437820435,
"logits/rejected": -1.0528539419174194,
"logps/chosen": -0.5251376628875732,
"logps/rejected": -0.7547603845596313,
"loss": 1.5214,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3128442764282227,
"rewards/margins": 0.5740568041801453,
"rewards/rejected": -1.8869010210037231,
"step": 271
},
{
"epoch": 0.5813518567993589,
"grad_norm": 4.264716148376465,
"learning_rate": 2.2200888097417302e-07,
"logits/chosen": -1.043276071548462,
"logits/rejected": -0.9186975955963135,
"logps/chosen": -0.39481961727142334,
"logps/rejected": -0.5242050886154175,
"loss": 1.5337,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9870489835739136,
"rewards/margins": 0.32346370816230774,
"rewards/rejected": -1.310512661933899,
"step": 272
},
{
"epoch": 0.5834891798022976,
"grad_norm": 4.6611199378967285,
"learning_rate": 2.2015144386493895e-07,
"logits/chosen": -0.9979356527328491,
"logits/rejected": -0.9526849985122681,
"logps/chosen": -0.39222562313079834,
"logps/rejected": -0.46204763650894165,
"loss": 1.4999,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9805639982223511,
"rewards/margins": 0.1745550036430359,
"rewards/rejected": -1.1551190614700317,
"step": 273
},
{
"epoch": 0.5856265028052364,
"grad_norm": 4.630171298980713,
"learning_rate": 2.1829567677876297e-07,
"logits/chosen": -0.9676195979118347,
"logits/rejected": -0.9666755199432373,
"logps/chosen": -0.35061851143836975,
"logps/rejected": -0.35894879698753357,
"loss": 1.609,
"rewards/accuracies": 0.125,
"rewards/chosen": -0.8765462636947632,
"rewards/margins": 0.020825695246458054,
"rewards/rejected": -0.8973720073699951,
"step": 274
},
{
"epoch": 0.5877638258081752,
"grad_norm": 11.871662139892578,
"learning_rate": 2.164416835455862e-07,
"logits/chosen": -0.7467477321624756,
"logits/rejected": -0.6393258571624756,
"logps/chosen": -0.502008318901062,
"logps/rejected": -0.44732266664505005,
"loss": 1.5655,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.2550209760665894,
"rewards/margins": -0.13671430945396423,
"rewards/rejected": -1.1183066368103027,
"step": 275
},
{
"epoch": 0.5899011488111141,
"grad_norm": 3.6022424697875977,
"learning_rate": 2.1458956789610277e-07,
"logits/chosen": -1.2034939527511597,
"logits/rejected": -1.0202971696853638,
"logps/chosen": -0.3793608248233795,
"logps/rejected": -0.33719444274902344,
"loss": 1.5627,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9484022259712219,
"rewards/margins": -0.10541604459285736,
"rewards/rejected": -0.8429861068725586,
"step": 276
},
{
"epoch": 0.5920384718140529,
"grad_norm": 6.574893474578857,
"learning_rate": 2.1273943345595635e-07,
"logits/chosen": -1.2551283836364746,
"logits/rejected": -1.2000017166137695,
"logps/chosen": -0.4082186818122864,
"logps/rejected": -0.6158214807510376,
"loss": 1.5529,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0205466747283936,
"rewards/margins": 0.51900714635849,
"rewards/rejected": -1.5395537614822388,
"step": 277
},
{
"epoch": 0.5941757948169917,
"grad_norm": 5.325026035308838,
"learning_rate": 2.1089138373994222e-07,
"logits/chosen": -1.0981683731079102,
"logits/rejected": -1.093741774559021,
"logps/chosen": -0.4150196313858032,
"logps/rejected": -0.5339372754096985,
"loss": 1.5688,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0375490188598633,
"rewards/margins": 0.2972941994667053,
"rewards/rejected": -1.3348432779312134,
"step": 278
},
{
"epoch": 0.5963131178199306,
"grad_norm": 17.316631317138672,
"learning_rate": 2.0904552214621556e-07,
"logits/chosen": -1.1414576768875122,
"logits/rejected": -1.1112793684005737,
"logps/chosen": -0.6599245071411133,
"logps/rejected": -0.3380126357078552,
"loss": 1.6195,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6498112678527832,
"rewards/margins": -0.8047796487808228,
"rewards/rejected": -0.8450315594673157,
"step": 279
},
{
"epoch": 0.5984504408228694,
"grad_norm": 4.380247116088867,
"learning_rate": 2.072019519505062e-07,
"logits/chosen": -0.9662964940071106,
"logits/rejected": -0.9849826693534851,
"logps/chosen": -0.36699962615966797,
"logps/rejected": -0.3455093204975128,
"loss": 1.5053,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9174990653991699,
"rewards/margins": -0.053725723177194595,
"rewards/rejected": -0.8637734055519104,
"step": 280
},
{
"epoch": 0.6005877638258081,
"grad_norm": 7.828254699707031,
"learning_rate": 2.0536077630034085e-07,
"logits/chosen": -0.9694425463676453,
"logits/rejected": -0.8208516240119934,
"logps/chosen": -0.4557761251926422,
"logps/rejected": -0.6782093048095703,
"loss": 1.6184,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1394402980804443,
"rewards/margins": 0.5560829043388367,
"rewards/rejected": -1.6955231428146362,
"step": 281
},
{
"epoch": 0.602725086828747,
"grad_norm": 7.952332496643066,
"learning_rate": 2.0352209820927135e-07,
"logits/chosen": -0.9816855192184448,
"logits/rejected": -0.8845440149307251,
"logps/chosen": -0.3230597972869873,
"logps/rejected": -0.4072348475456238,
"loss": 1.5012,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8076494932174683,
"rewards/margins": 0.21043761074543,
"rewards/rejected": -1.0180871486663818,
"step": 282
},
{
"epoch": 0.6048624098316858,
"grad_norm": 8.255958557128906,
"learning_rate": 2.0168602055111173e-07,
"logits/chosen": -1.1203254461288452,
"logits/rejected": -1.107031226158142,
"logps/chosen": -0.6907448768615723,
"logps/rejected": -0.5776211023330688,
"loss": 1.6044,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7268621921539307,
"rewards/margins": -0.28280916810035706,
"rewards/rejected": -1.4440529346466064,
"step": 283
},
{
"epoch": 0.6069997328346246,
"grad_norm": 11.462747573852539,
"learning_rate": 1.998526460541818e-07,
"logits/chosen": -1.0083472728729248,
"logits/rejected": -0.9932087659835815,
"logps/chosen": -0.5044468641281128,
"logps/rejected": -0.4264739453792572,
"loss": 1.6586,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2611171007156372,
"rewards/margins": -0.19493228197097778,
"rewards/rejected": -1.0661848783493042,
"step": 284
},
{
"epoch": 0.6091370558375635,
"grad_norm": 4.765872478485107,
"learning_rate": 1.980220772955602e-07,
"logits/chosen": -1.035547137260437,
"logits/rejected": -1.0857359170913696,
"logps/chosen": -0.41963931918144226,
"logps/rejected": -0.592042863368988,
"loss": 1.5091,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.049098253250122,
"rewards/margins": 0.43100887537002563,
"rewards/rejected": -1.4801071882247925,
"step": 285
},
{
"epoch": 0.6112743788405023,
"grad_norm": 7.985474586486816,
"learning_rate": 1.961944166953445e-07,
"logits/chosen": -0.8251385688781738,
"logits/rejected": -0.9071054458618164,
"logps/chosen": -0.3732144236564636,
"logps/rejected": -0.4104728400707245,
"loss": 1.5169,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9330360889434814,
"rewards/margins": 0.09314604848623276,
"rewards/rejected": -1.0261821746826172,
"step": 286
},
{
"epoch": 0.6134117018434411,
"grad_norm": 7.316262722015381,
"learning_rate": 1.9436976651092142e-07,
"logits/chosen": -0.9544340372085571,
"logits/rejected": -0.898868203163147,
"logps/chosen": -0.35007724165916443,
"logps/rejected": -0.495257705450058,
"loss": 1.6104,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8751930594444275,
"rewards/margins": 0.3629511594772339,
"rewards/rejected": -1.2381441593170166,
"step": 287
},
{
"epoch": 0.6155490248463799,
"grad_norm": 3.980193614959717,
"learning_rate": 1.9254822883124517e-07,
"logits/chosen": -1.2356715202331543,
"logits/rejected": -1.1466394662857056,
"logps/chosen": -0.418282151222229,
"logps/rejected": -0.5082724094390869,
"loss": 1.5123,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0457054376602173,
"rewards/margins": 0.22497567534446716,
"rewards/rejected": -1.2706811428070068,
"step": 288
},
{
"epoch": 0.6176863478493188,
"grad_norm": 6.745012283325195,
"learning_rate": 1.9072990557112564e-07,
"logits/chosen": -1.2313592433929443,
"logits/rejected": -1.1524139642715454,
"logps/chosen": -0.34664469957351685,
"logps/rejected": -0.5482650399208069,
"loss": 1.5042,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8666118383407593,
"rewards/margins": 0.504050612449646,
"rewards/rejected": -1.3706625699996948,
"step": 289
},
{
"epoch": 0.6198236708522575,
"grad_norm": 9.481700897216797,
"learning_rate": 1.8891489846552644e-07,
"logits/chosen": -1.081266164779663,
"logits/rejected": -1.085394263267517,
"logps/chosen": -0.37148210406303406,
"logps/rejected": -0.5117133855819702,
"loss": 1.5743,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.928705096244812,
"rewards/margins": 0.35057833790779114,
"rewards/rejected": -1.2792835235595703,
"step": 290
},
{
"epoch": 0.6219609938551963,
"grad_norm": 5.410580635070801,
"learning_rate": 1.8710330906387286e-07,
"logits/chosen": -1.0288105010986328,
"logits/rejected": -1.0116032361984253,
"logps/chosen": -0.3644832670688629,
"logps/rejected": -0.4690641462802887,
"loss": 1.4934,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9112080931663513,
"rewards/margins": 0.261452317237854,
"rewards/rejected": -1.1726603507995605,
"step": 291
},
{
"epoch": 0.6240983168581352,
"grad_norm": 12.723451614379883,
"learning_rate": 1.8529523872436977e-07,
"logits/chosen": -1.0818991661071777,
"logits/rejected": -1.0822491645812988,
"logps/chosen": -0.2900945246219635,
"logps/rejected": -0.44144943356513977,
"loss": 1.6044,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.7252362370491028,
"rewards/margins": 0.37838736176490784,
"rewards/rejected": -1.103623628616333,
"step": 292
},
{
"epoch": 0.626235639861074,
"grad_norm": 18.391582489013672,
"learning_rate": 1.8349078860833124e-07,
"logits/chosen": -1.2308346033096313,
"logits/rejected": -1.1760648488998413,
"logps/chosen": -0.6205483675003052,
"logps/rejected": -0.3508188724517822,
"loss": 1.6365,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.5513708591461182,
"rewards/margins": -0.6743236780166626,
"rewards/rejected": -0.8770472407341003,
"step": 293
},
{
"epoch": 0.6283729628640128,
"grad_norm": 4.619331359863281,
"learning_rate": 1.8169005967452e-07,
"logits/chosen": -1.2816352844238281,
"logits/rejected": -1.2922104597091675,
"logps/chosen": -0.3991982340812683,
"logps/rejected": -0.504127562046051,
"loss": 1.5704,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9979956150054932,
"rewards/margins": 0.2623233199119568,
"rewards/rejected": -1.2603188753128052,
"step": 294
},
{
"epoch": 0.6305102858669517,
"grad_norm": 5.379226207733154,
"learning_rate": 1.7989315267349933e-07,
"logits/chosen": -1.0375932455062866,
"logits/rejected": -0.987227737903595,
"logps/chosen": -0.5034650564193726,
"logps/rejected": -0.6317480206489563,
"loss": 1.5085,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2586627006530762,
"rewards/margins": 0.3207073211669922,
"rewards/rejected": -1.5793699026107788,
"step": 295
},
{
"epoch": 0.6326476088698905,
"grad_norm": 15.063772201538086,
"learning_rate": 1.781001681419957e-07,
"logits/chosen": -1.0075099468231201,
"logits/rejected": -0.9762495160102844,
"logps/chosen": -0.5012757778167725,
"logps/rejected": -0.4941572844982147,
"loss": 1.4992,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2531894445419312,
"rewards/margins": -0.01779627427458763,
"rewards/rejected": -1.2353932857513428,
"step": 296
},
{
"epoch": 0.6347849318728293,
"grad_norm": 6.187438488006592,
"learning_rate": 1.763112063972739e-07,
"logits/chosen": -1.086329460144043,
"logits/rejected": -1.003612995147705,
"logps/chosen": -0.43468421697616577,
"logps/rejected": -0.623386800289154,
"loss": 1.5243,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0867105722427368,
"rewards/margins": 0.4717563986778259,
"rewards/rejected": -1.5584670305252075,
"step": 297
},
{
"epoch": 0.6369222548757681,
"grad_norm": 16.858556747436523,
"learning_rate": 1.745263675315245e-07,
"logits/chosen": -0.9435803890228271,
"logits/rejected": -0.9401760697364807,
"logps/chosen": -0.41162610054016113,
"logps/rejected": -0.7601633667945862,
"loss": 1.5119,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0290653705596924,
"rewards/margins": 0.871343195438385,
"rewards/rejected": -1.9004085063934326,
"step": 298
},
{
"epoch": 0.6390595778787069,
"grad_norm": 9.424429893493652,
"learning_rate": 1.7274575140626315e-07,
"logits/chosen": -1.1454746723175049,
"logits/rejected": -1.0624852180480957,
"logps/chosen": -0.5017335414886475,
"logps/rejected": -0.5628975629806519,
"loss": 1.6427,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2543339729309082,
"rewards/margins": 0.1529100388288498,
"rewards/rejected": -1.4072438478469849,
"step": 299
},
{
"epoch": 0.6411969008816457,
"grad_norm": 5.96793794631958,
"learning_rate": 1.7096945764674398e-07,
"logits/chosen": -0.9207834005355835,
"logits/rejected": -0.9202168583869934,
"logps/chosen": -0.3956447243690491,
"logps/rejected": -0.40050509572029114,
"loss": 1.6262,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9891117811203003,
"rewards/margins": 0.012150941416621208,
"rewards/rejected": -1.0012627840042114,
"step": 300
},
{
"epoch": 0.6433342238845846,
"grad_norm": 5.713474750518799,
"learning_rate": 1.6919758563638502e-07,
"logits/chosen": -0.9401556253433228,
"logits/rejected": -0.8605173826217651,
"logps/chosen": -0.4167335331439972,
"logps/rejected": -0.5897922515869141,
"loss": 1.527,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.041833758354187,
"rewards/margins": 0.4326467216014862,
"rewards/rejected": -1.4744806289672852,
"step": 301
},
{
"epoch": 0.6454715468875234,
"grad_norm": 11.05538272857666,
"learning_rate": 1.674302345112083e-07,
"logits/chosen": -1.0368751287460327,
"logits/rejected": -1.1599576473236084,
"logps/chosen": -0.42875924706459045,
"logps/rejected": -0.7052878737449646,
"loss": 1.4713,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.071898102760315,
"rewards/margins": 0.6913214325904846,
"rewards/rejected": -1.7632195949554443,
"step": 302
},
{
"epoch": 0.6476088698904622,
"grad_norm": 7.134030818939209,
"learning_rate": 1.656675031542925e-07,
"logits/chosen": -1.1293184757232666,
"logits/rejected": -1.1065864562988281,
"logps/chosen": -0.4180205166339874,
"logps/rejected": -0.4501388669013977,
"loss": 1.5318,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.045051097869873,
"rewards/margins": 0.08029599487781525,
"rewards/rejected": -1.1253471374511719,
"step": 303
},
{
"epoch": 0.649746192893401,
"grad_norm": 9.838216781616211,
"learning_rate": 1.6390949019024118e-07,
"logits/chosen": -1.2255228757858276,
"logits/rejected": -1.0440919399261475,
"logps/chosen": -0.34637248516082764,
"logps/rejected": -0.332830548286438,
"loss": 1.4995,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8659312725067139,
"rewards/margins": -0.0338548980653286,
"rewards/rejected": -0.8320763111114502,
"step": 304
},
{
"epoch": 0.6518835158963399,
"grad_norm": 3.633300542831421,
"learning_rate": 1.621562939796643e-07,
"logits/chosen": -1.0595769882202148,
"logits/rejected": -1.060139536857605,
"logps/chosen": -0.4522903561592102,
"logps/rejected": -0.7426398992538452,
"loss": 1.5239,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1307260990142822,
"rewards/margins": 0.7258738279342651,
"rewards/rejected": -1.8565996885299683,
"step": 305
},
{
"epoch": 0.6540208388992786,
"grad_norm": 6.867697715759277,
"learning_rate": 1.6040801261367493e-07,
"logits/chosen": -1.1086713075637817,
"logits/rejected": -1.1967618465423584,
"logps/chosen": -0.42471063137054443,
"logps/rejected": -0.4879637360572815,
"loss": 1.4941,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0617766380310059,
"rewards/margins": 0.15813271701335907,
"rewards/rejected": -1.2199093103408813,
"step": 306
},
{
"epoch": 0.6561581619022174,
"grad_norm": 9.760129928588867,
"learning_rate": 1.5866474390840124e-07,
"logits/chosen": -1.0829546451568604,
"logits/rejected": -1.0814926624298096,
"logps/chosen": -0.4132816791534424,
"logps/rejected": -0.586501955986023,
"loss": 1.5785,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.033204197883606,
"rewards/margins": 0.43305063247680664,
"rewards/rejected": -1.466254711151123,
"step": 307
},
{
"epoch": 0.6582954849051563,
"grad_norm": 4.1318511962890625,
"learning_rate": 1.569265853995137e-07,
"logits/chosen": -0.9130831360816956,
"logits/rejected": -1.0174808502197266,
"logps/chosen": -0.3561224639415741,
"logps/rejected": -0.47521257400512695,
"loss": 1.4836,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8903061747550964,
"rewards/margins": 0.2977251708507538,
"rewards/rejected": -1.1880314350128174,
"step": 308
},
{
"epoch": 0.6604328079080951,
"grad_norm": 5.264586925506592,
"learning_rate": 1.5519363433676791e-07,
"logits/chosen": -1.2580910921096802,
"logits/rejected": -1.2599362134933472,
"logps/chosen": -0.42455989122390747,
"logps/rejected": -0.5628042221069336,
"loss": 1.6118,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0613996982574463,
"rewards/margins": 0.34561091661453247,
"rewards/rejected": -1.4070106744766235,
"step": 309
},
{
"epoch": 0.6625701309110339,
"grad_norm": 19.9966983795166,
"learning_rate": 1.5346598767856345e-07,
"logits/chosen": -0.8979520201683044,
"logits/rejected": -0.9155081510543823,
"logps/chosen": -0.335290789604187,
"logps/rejected": -0.43495267629623413,
"loss": 1.6585,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8382269740104675,
"rewards/margins": 0.24915480613708496,
"rewards/rejected": -1.0873818397521973,
"step": 310
},
{
"epoch": 0.6647074539139728,
"grad_norm": 10.383206367492676,
"learning_rate": 1.517437420865191e-07,
"logits/chosen": -1.4863961935043335,
"logits/rejected": -1.2771762609481812,
"logps/chosen": -0.33609017729759216,
"logps/rejected": -0.5989465713500977,
"loss": 1.5329,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.8402254581451416,
"rewards/margins": 0.6571409106254578,
"rewards/rejected": -1.4973664283752441,
"step": 311
},
{
"epoch": 0.6668447769169116,
"grad_norm": 4.85746431350708,
"learning_rate": 1.500269939200648e-07,
"logits/chosen": -1.1300700902938843,
"logits/rejected": -1.1000648736953735,
"logps/chosen": -0.4227021336555481,
"logps/rejected": -0.3756105899810791,
"loss": 1.572,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0567553043365479,
"rewards/margins": -0.1177288144826889,
"rewards/rejected": -0.939026415348053,
"step": 312
},
{
"epoch": 0.6689820999198504,
"grad_norm": 15.065166473388672,
"learning_rate": 1.4831583923104998e-07,
"logits/chosen": -1.2687509059906006,
"logits/rejected": -1.2387371063232422,
"logps/chosen": -0.37272506952285767,
"logps/rejected": -0.4628967344760895,
"loss": 1.5731,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9318127632141113,
"rewards/margins": 0.22542911767959595,
"rewards/rejected": -1.157241940498352,
"step": 313
},
{
"epoch": 0.6711194229227893,
"grad_norm": 4.742990970611572,
"learning_rate": 1.4661037375836987e-07,
"logits/chosen": -1.1166198253631592,
"logits/rejected": -1.1708546876907349,
"logps/chosen": -0.40323004126548767,
"logps/rejected": -0.49593961238861084,
"loss": 1.5854,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.0080751180648804,
"rewards/margins": 0.2317739725112915,
"rewards/rejected": -1.2398490905761719,
"step": 314
},
{
"epoch": 0.673256745925728,
"grad_norm": 6.135270595550537,
"learning_rate": 1.4491069292260866e-07,
"logits/chosen": -1.0454214811325073,
"logits/rejected": -0.9797170162200928,
"logps/chosen": -0.4790341258049011,
"logps/rejected": -0.5334821343421936,
"loss": 1.5961,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1975852251052856,
"rewards/margins": 0.13612008094787598,
"rewards/rejected": -1.3337054252624512,
"step": 315
},
{
"epoch": 0.6753940689286668,
"grad_norm": 5.529047012329102,
"learning_rate": 1.432168918207009e-07,
"logits/chosen": -0.9548214673995972,
"logits/rejected": -1.037723183631897,
"logps/chosen": -0.34955471754074097,
"logps/rejected": -0.6267892122268677,
"loss": 1.5352,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.87388676404953,
"rewards/margins": 0.6930862069129944,
"rewards/rejected": -1.5669729709625244,
"step": 316
},
{
"epoch": 0.6775313919316056,
"grad_norm": 4.692312240600586,
"learning_rate": 1.4152906522061047e-07,
"logits/chosen": -1.0882840156555176,
"logits/rejected": -1.0136744976043701,
"logps/chosen": -0.31677815318107605,
"logps/rejected": -0.42752817273139954,
"loss": 1.522,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.7919453978538513,
"rewards/margins": 0.2768751382827759,
"rewards/rejected": -1.0688204765319824,
"step": 317
},
{
"epoch": 0.6796687149345445,
"grad_norm": 10.370941162109375,
"learning_rate": 1.3984730755602903e-07,
"logits/chosen": -1.161927580833435,
"logits/rejected": -1.0617276430130005,
"logps/chosen": -0.534487783908844,
"logps/rejected": -0.6021491289138794,
"loss": 1.5201,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.3362195491790771,
"rewards/margins": 0.1691533476114273,
"rewards/rejected": -1.5053728818893433,
"step": 318
},
{
"epoch": 0.6818060379374833,
"grad_norm": 9.610735893249512,
"learning_rate": 1.381717129210918e-07,
"logits/chosen": -1.1923787593841553,
"logits/rejected": -1.2188538312911987,
"logps/chosen": -0.375224769115448,
"logps/rejected": -0.7198653817176819,
"loss": 1.5779,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.9380618929862976,
"rewards/margins": 0.8616017699241638,
"rewards/rejected": -1.7996635437011719,
"step": 319
},
{
"epoch": 0.6839433609404221,
"grad_norm": 5.218975067138672,
"learning_rate": 1.365023750651133e-07,
"logits/chosen": -1.1556600332260132,
"logits/rejected": -1.095563530921936,
"logps/chosen": -0.37500467896461487,
"logps/rejected": -0.4297294020652771,
"loss": 1.5315,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9375116229057312,
"rewards/margins": 0.1368117332458496,
"rewards/rejected": -1.0743234157562256,
"step": 320
},
{
"epoch": 0.686080683943361,
"grad_norm": 6.690799236297607,
"learning_rate": 1.3483938738734195e-07,
"logits/chosen": -0.8860509395599365,
"logits/rejected": -0.8542050123214722,
"logps/chosen": -0.3055468201637268,
"logps/rejected": -0.3592091202735901,
"loss": 1.5345,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.7638670802116394,
"rewards/margins": 0.13415566086769104,
"rewards/rejected": -0.8980227708816528,
"step": 321
},
{
"epoch": 0.6882180069462998,
"grad_norm": 9.405998229980469,
"learning_rate": 1.3318284293173449e-07,
"logits/chosen": -0.9992817640304565,
"logits/rejected": -0.9634179472923279,
"logps/chosen": -0.4300232529640198,
"logps/rejected": -0.4082144498825073,
"loss": 1.5484,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.075058102607727,
"rewards/margins": -0.05452210083603859,
"rewards/rejected": -1.0205360651016235,
"step": 322
},
{
"epoch": 0.6903553299492385,
"grad_norm": 5.07054328918457,
"learning_rate": 1.3153283438175034e-07,
"logits/chosen": -1.0605663061141968,
"logits/rejected": -1.0907377004623413,
"logps/chosen": -0.3852520287036896,
"logps/rejected": -0.4656079113483429,
"loss": 1.5678,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9631301164627075,
"rewards/margins": 0.2008897364139557,
"rewards/rejected": -1.1640198230743408,
"step": 323
},
{
"epoch": 0.6924926529521774,
"grad_norm": 7.885300159454346,
"learning_rate": 1.2988945405516565e-07,
"logits/chosen": -1.0609923601150513,
"logits/rejected": -1.0959070920944214,
"logps/chosen": -0.4137347638607025,
"logps/rejected": -0.5460120439529419,
"loss": 1.5155,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.034337043762207,
"rewards/margins": 0.33069324493408203,
"rewards/rejected": -1.3650301694869995,
"step": 324
},
{
"epoch": 0.6946299759551162,
"grad_norm": 5.987451553344727,
"learning_rate": 1.2825279389890818e-07,
"logits/chosen": -0.9947149753570557,
"logits/rejected": -1.0909423828125,
"logps/chosen": -0.4138132333755493,
"logps/rejected": -0.4551887512207031,
"loss": 1.4521,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0345330238342285,
"rewards/margins": 0.10343889147043228,
"rewards/rejected": -1.1379718780517578,
"step": 325
},
{
"epoch": 0.696767298958055,
"grad_norm": 5.460362434387207,
"learning_rate": 1.2662294548391328e-07,
"logits/chosen": -1.1488416194915771,
"logits/rejected": -0.9424848556518555,
"logps/chosen": -0.48600658774375916,
"logps/rejected": -0.7618072628974915,
"loss": 1.5489,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.215016484260559,
"rewards/margins": 0.6895018219947815,
"rewards/rejected": -1.9045181274414062,
"step": 326
},
{
"epoch": 0.6989046219609939,
"grad_norm": 10.205648422241211,
"learning_rate": 1.2500000000000005e-07,
"logits/chosen": -0.9662617444992065,
"logits/rejected": -1.1120105981826782,
"logps/chosen": -0.40102994441986084,
"logps/rejected": -0.6069145798683167,
"loss": 1.5435,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0025748014450073,
"rewards/margins": 0.5147115588188171,
"rewards/rejected": -1.5172864198684692,
"step": 327
},
{
"epoch": 0.7010419449639327,
"grad_norm": 8.57025146484375,
"learning_rate": 1.2338404825076935e-07,
"logits/chosen": -1.1456284523010254,
"logits/rejected": -1.0507254600524902,
"logps/chosen": -0.4316751956939697,
"logps/rejected": -0.44297924637794495,
"loss": 1.5203,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0791878700256348,
"rewards/margins": 0.02826026827096939,
"rewards/rejected": -1.1074482202529907,
"step": 328
},
{
"epoch": 0.7031792679668715,
"grad_norm": 9.405667304992676,
"learning_rate": 1.2177518064852345e-07,
"logits/chosen": -1.1795152425765991,
"logits/rejected": -1.0673104524612427,
"logps/chosen": -0.37221118807792664,
"logps/rejected": -0.4177697002887726,
"loss": 1.5138,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.930527925491333,
"rewards/margins": 0.11389636248350143,
"rewards/rejected": -1.044424295425415,
"step": 329
},
{
"epoch": 0.7053165909698104,
"grad_norm": 6.466906547546387,
"learning_rate": 1.201734872092077e-07,
"logits/chosen": -1.0346126556396484,
"logits/rejected": -1.0439854860305786,
"logps/chosen": -0.42790815234184265,
"logps/rejected": -1.0795375108718872,
"loss": 1.4512,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.0697704553604126,
"rewards/margins": 1.6290733814239502,
"rewards/rejected": -2.6988439559936523,
"step": 330
},
{
"epoch": 0.7074539139727491,
"grad_norm": 4.390846252441406,
"learning_rate": 1.185790575473738e-07,
"logits/chosen": -1.1390395164489746,
"logits/rejected": -1.1135765314102173,
"logps/chosen": -0.5658525824546814,
"logps/rejected": -0.5814335942268372,
"loss": 1.4646,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.4146316051483154,
"rewards/margins": 0.03895253688097,
"rewards/rejected": -1.45358407497406,
"step": 331
},
{
"epoch": 0.7095912369756879,
"grad_norm": 5.300051212310791,
"learning_rate": 1.1699198087116588e-07,
"logits/chosen": -1.015911340713501,
"logits/rejected": -1.11297607421875,
"logps/chosen": -0.4078459143638611,
"logps/rejected": -0.6018810868263245,
"loss": 1.5814,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.019614815711975,
"rewards/margins": 0.48508787155151367,
"rewards/rejected": -1.5047025680541992,
"step": 332
},
{
"epoch": 0.7117285599786267,
"grad_norm": 5.75014066696167,
"learning_rate": 1.1541234597732947e-07,
"logits/chosen": -1.0877783298492432,
"logits/rejected": -1.0997726917266846,
"logps/chosen": -0.3489132523536682,
"logps/rejected": -0.43700623512268066,
"loss": 1.4605,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8722831606864929,
"rewards/margins": 0.22023235261440277,
"rewards/rejected": -1.0925155878067017,
"step": 333
},
{
"epoch": 0.7138658829815656,
"grad_norm": 5.7690229415893555,
"learning_rate": 1.1384024124624322e-07,
"logits/chosen": -0.9815250635147095,
"logits/rejected": -0.9534754753112793,
"logps/chosen": -0.42468035221099854,
"logps/rejected": -0.42991912364959717,
"loss": 1.5096,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0617008209228516,
"rewards/margins": 0.013096902519464493,
"rewards/rejected": -1.0747978687286377,
"step": 334
},
{
"epoch": 0.7160032059845044,
"grad_norm": 6.530084609985352,
"learning_rate": 1.1227575463697439e-07,
"logits/chosen": -1.0770314931869507,
"logits/rejected": -1.1593232154846191,
"logps/chosen": -0.41134944558143616,
"logps/rejected": -0.46647369861602783,
"loss": 1.5333,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0283737182617188,
"rewards/margins": 0.1378105729818344,
"rewards/rejected": -1.1661843061447144,
"step": 335
},
{
"epoch": 0.7181405289874432,
"grad_norm": 6.972837448120117,
"learning_rate": 1.1071897368235694e-07,
"logits/chosen": -1.0634236335754395,
"logits/rejected": -1.1836671829223633,
"logps/chosen": -0.35514572262763977,
"logps/rejected": -0.6676814556121826,
"loss": 1.4464,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.8878642320632935,
"rewards/margins": 0.781339168548584,
"rewards/rejected": -1.6692036390304565,
"step": 336
},
{
"epoch": 0.7202778519903821,
"grad_norm": 6.366727828979492,
"learning_rate": 1.0916998548409447e-07,
"logits/chosen": -1.0254015922546387,
"logits/rejected": -0.9457456469535828,
"logps/chosen": -0.3946457803249359,
"logps/rejected": -0.4669502377510071,
"loss": 1.617,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.986614465713501,
"rewards/margins": 0.1807611733675003,
"rewards/rejected": -1.1673755645751953,
"step": 337
},
{
"epoch": 0.7224151749933209,
"grad_norm": 6.886241436004639,
"learning_rate": 1.0762887670788701e-07,
"logits/chosen": -0.8809665441513062,
"logits/rejected": -0.7807790637016296,
"logps/chosen": -0.32795268297195435,
"logps/rejected": -0.4252588152885437,
"loss": 1.4383,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8198816776275635,
"rewards/margins": 0.2432653307914734,
"rewards/rejected": -1.063146948814392,
"step": 338
},
{
"epoch": 0.7245524979962596,
"grad_norm": 4.194058418273926,
"learning_rate": 1.0609573357858165e-07,
"logits/chosen": -1.1363167762756348,
"logits/rejected": -1.1581072807312012,
"logps/chosen": -0.4632134437561035,
"logps/rejected": -0.6518194079399109,
"loss": 1.6039,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.1580334901809692,
"rewards/margins": 0.47151511907577515,
"rewards/rejected": -1.6295486688613892,
"step": 339
},
{
"epoch": 0.7266898209991985,
"grad_norm": 23.59259033203125,
"learning_rate": 1.0457064187534861e-07,
"logits/chosen": -1.0518946647644043,
"logits/rejected": -0.9888642430305481,
"logps/chosen": -0.407795786857605,
"logps/rejected": -0.45109352469444275,
"loss": 1.6088,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.0194894075393677,
"rewards/margins": 0.1082444041967392,
"rewards/rejected": -1.127733826637268,
"step": 340
},
{
"epoch": 0.7288271440021373,
"grad_norm": 5.5164079666137695,
"learning_rate": 1.0305368692688174e-07,
"logits/chosen": -1.0163506269454956,
"logits/rejected": -0.9497014284133911,
"logps/chosen": -0.43188926577568054,
"logps/rejected": -0.5361363291740417,
"loss": 1.5212,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0797233581542969,
"rewards/margins": 0.2606176435947418,
"rewards/rejected": -1.3403409719467163,
"step": 341
},
{
"epoch": 0.7309644670050761,
"grad_norm": 10.590493202209473,
"learning_rate": 1.0154495360662463e-07,
"logits/chosen": -0.7582399249076843,
"logits/rejected": -0.7725558280944824,
"logps/chosen": -0.3883350193500519,
"logps/rejected": -0.4608793556690216,
"loss": 1.4818,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9708375930786133,
"rewards/margins": 0.18136097490787506,
"rewards/rejected": -1.152198314666748,
"step": 342
},
{
"epoch": 0.733101790008015,
"grad_norm": 5.3863677978515625,
"learning_rate": 1.0004452632802158e-07,
"logits/chosen": -1.0777875185012817,
"logits/rejected": -1.0332427024841309,
"logps/chosen": -0.6136234402656555,
"logps/rejected": -0.6208893656730652,
"loss": 1.4504,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5340585708618164,
"rewards/margins": 0.018164925277233124,
"rewards/rejected": -1.552223563194275,
"step": 343
},
{
"epoch": 0.7352391130109538,
"grad_norm": 4.131237506866455,
"learning_rate": 9.855248903979505e-08,
"logits/chosen": -1.1028010845184326,
"logits/rejected": -1.063793420791626,
"logps/chosen": -0.4896419644355774,
"logps/rejected": -0.5126334428787231,
"loss": 1.496,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.224104881286621,
"rewards/margins": 0.05747878551483154,
"rewards/rejected": -1.2815836668014526,
"step": 344
},
{
"epoch": 0.7373764360138926,
"grad_norm": 6.84834623336792,
"learning_rate": 9.706892522124838e-08,
"logits/chosen": -1.0494110584259033,
"logits/rejected": -0.994105339050293,
"logps/chosen": -0.5251023173332214,
"logps/rejected": -0.5692360401153564,
"loss": 1.5667,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.312755823135376,
"rewards/margins": 0.11033419519662857,
"rewards/rejected": -1.4230899810791016,
"step": 345
},
{
"epoch": 0.7395137590168315,
"grad_norm": 7.503670692443848,
"learning_rate": 9.559391787759554e-08,
"logits/chosen": -1.304071307182312,
"logits/rejected": -1.1836615800857544,
"logps/chosen": -0.5006979703903198,
"logps/rejected": -0.4764450788497925,
"loss": 1.5827,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2517449855804443,
"rewards/margins": -0.06063230335712433,
"rewards/rejected": -1.191112756729126,
"step": 346
},
{
"epoch": 0.7416510820197703,
"grad_norm": 7.5821757316589355,
"learning_rate": 9.412754953531663e-08,
"logits/chosen": -1.0126221179962158,
"logits/rejected": -1.0061360597610474,
"logps/chosen": -0.536862850189209,
"logps/rejected": -0.7791385650634766,
"loss": 1.5476,
"rewards/accuracies": 0.875,
"rewards/chosen": -1.342157244682312,
"rewards/margins": 0.6056893467903137,
"rewards/rejected": -1.9478464126586914,
"step": 347
},
{
"epoch": 0.743788405022709,
"grad_norm": 6.262937545776367,
"learning_rate": 9.266990223754067e-08,
"logits/chosen": -0.9866530895233154,
"logits/rejected": -1.087868571281433,
"logps/chosen": -0.35664424300193787,
"logps/rejected": -0.7218388915061951,
"loss": 1.5054,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8916106224060059,
"rewards/margins": 0.9129866361618042,
"rewards/rejected": -1.8045971393585205,
"step": 348
},
{
"epoch": 0.7459257280256478,
"grad_norm": 4.314223766326904,
"learning_rate": 9.12210575394553e-08,
"logits/chosen": -1.0736174583435059,
"logits/rejected": -1.0914644002914429,
"logps/chosen": -0.4206693172454834,
"logps/rejected": -0.4104337692260742,
"loss": 1.58,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0516732931137085,
"rewards/margins": -0.025588899850845337,
"rewards/rejected": -1.0260844230651855,
"step": 349
},
{
"epoch": 0.7480630510285867,
"grad_norm": 12.095142364501953,
"learning_rate": 8.978109650374396e-08,
"logits/chosen": -1.0621310472488403,
"logits/rejected": -1.0466017723083496,
"logps/chosen": -0.4226948618888855,
"logps/rejected": -0.46644341945648193,
"loss": 1.5575,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0567370653152466,
"rewards/margins": 0.10937156528234482,
"rewards/rejected": -1.16610848903656,
"step": 350
},
{
"epoch": 0.7502003740315255,
"grad_norm": 6.449507236480713,
"learning_rate": 8.835009969605011e-08,
"logits/chosen": -1.1138099431991577,
"logits/rejected": -1.0220037698745728,
"logps/chosen": -0.3483428359031677,
"logps/rejected": -0.3484461307525635,
"loss": 1.5018,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.8708571195602417,
"rewards/margins": 0.0002581886947154999,
"rewards/rejected": -0.8711153268814087,
"step": 351
},
{
"epoch": 0.7523376970344643,
"grad_norm": 6.543509483337402,
"learning_rate": 8.692814718046978e-08,
"logits/chosen": -1.1011321544647217,
"logits/rejected": -1.0573300123214722,
"logps/chosen": -0.6217561364173889,
"logps/rejected": -0.5529008507728577,
"loss": 1.6158,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.55439031124115,
"rewards/margins": -0.1721382439136505,
"rewards/rejected": -1.3822520971298218,
"step": 352
},
{
"epoch": 0.7544750200374032,
"grad_norm": 6.191348552703857,
"learning_rate": 8.551531851507185e-08,
"logits/chosen": -1.0794535875320435,
"logits/rejected": -0.9287205934524536,
"logps/chosen": -0.3870071768760681,
"logps/rejected": -0.4184566140174866,
"loss": 1.5651,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.9675179719924927,
"rewards/margins": 0.07862359285354614,
"rewards/rejected": -1.046141505241394,
"step": 353
},
{
"epoch": 0.756612343040342,
"grad_norm": 5.186205863952637,
"learning_rate": 8.411169274744723e-08,
"logits/chosen": -1.0084459781646729,
"logits/rejected": -1.0072932243347168,
"logps/chosen": -0.3501359224319458,
"logps/rejected": -0.4932965040206909,
"loss": 1.4669,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8753398656845093,
"rewards/margins": 0.35790154337882996,
"rewards/rejected": -1.233241319656372,
"step": 354
},
{
"epoch": 0.7587496660432808,
"grad_norm": 8.382218360900879,
"learning_rate": 8.271734841028552e-08,
"logits/chosen": -1.0127713680267334,
"logits/rejected": -1.0045888423919678,
"logps/chosen": -0.4923154413700104,
"logps/rejected": -0.4377010464668274,
"loss": 1.5886,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.2307885885238647,
"rewards/margins": -0.13653598725795746,
"rewards/rejected": -1.094252586364746,
"step": 355
},
{
"epoch": 0.7608869890462197,
"grad_norm": 34.09469985961914,
"learning_rate": 8.133236351698142e-08,
"logits/chosen": -1.191988229751587,
"logits/rejected": -1.0870414972305298,
"logps/chosen": -0.5740368366241455,
"logps/rejected": -0.943623960018158,
"loss": 1.5604,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4350918531417847,
"rewards/margins": 0.9239681959152222,
"rewards/rejected": -2.359060049057007,
"step": 356
},
{
"epoch": 0.7630243120491584,
"grad_norm": 15.886775970458984,
"learning_rate": 7.99568155572701e-08,
"logits/chosen": -1.2273132801055908,
"logits/rejected": -1.1660319566726685,
"logps/chosen": -0.5724566578865051,
"logps/rejected": -0.6214060187339783,
"loss": 1.5502,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4311414957046509,
"rewards/margins": 0.12237339466810226,
"rewards/rejected": -1.5535149574279785,
"step": 357
},
{
"epoch": 0.7651616350520972,
"grad_norm": 5.216745376586914,
"learning_rate": 7.859078149289144e-08,
"logits/chosen": -1.0447226762771606,
"logits/rejected": -1.0748844146728516,
"logps/chosen": -0.3527478873729706,
"logps/rejected": -0.45652708411216736,
"loss": 1.4919,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8818696737289429,
"rewards/margins": 0.2594480514526367,
"rewards/rejected": -1.1413178443908691,
"step": 358
},
{
"epoch": 0.7672989580550361,
"grad_norm": 7.554361820220947,
"learning_rate": 7.723433775328384e-08,
"logits/chosen": -1.0472806692123413,
"logits/rejected": -1.2351243495941162,
"logps/chosen": -0.486628919839859,
"logps/rejected": -0.8915761709213257,
"loss": 1.5042,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2165722846984863,
"rewards/margins": 1.012368083000183,
"rewards/rejected": -2.22894024848938,
"step": 359
},
{
"epoch": 0.7694362810579749,
"grad_norm": 5.030363082885742,
"learning_rate": 7.588756023130833e-08,
"logits/chosen": -0.7821986079216003,
"logits/rejected": -0.8699120283126831,
"logps/chosen": -0.4785808324813843,
"logps/rejected": -0.5831509828567505,
"loss": 1.4942,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1964521408081055,
"rewards/margins": 0.261425256729126,
"rewards/rejected": -1.4578773975372314,
"step": 360
},
{
"epoch": 0.7715736040609137,
"grad_norm": 7.105344295501709,
"learning_rate": 7.455052427900213e-08,
"logits/chosen": -1.215461254119873,
"logits/rejected": -1.0217854976654053,
"logps/chosen": -0.3912314176559448,
"logps/rejected": -0.3650144636631012,
"loss": 1.6164,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9780785441398621,
"rewards/margins": -0.06554235517978668,
"rewards/rejected": -0.9125362038612366,
"step": 361
},
{
"epoch": 0.7737109270638525,
"grad_norm": 5.5884199142456055,
"learning_rate": 7.322330470336313e-08,
"logits/chosen": -1.0982252359390259,
"logits/rejected": -1.0146585702896118,
"logps/chosen": -0.32143884897232056,
"logps/rejected": -0.495330274105072,
"loss": 1.4397,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8035971522331238,
"rewards/margins": 0.4347284734249115,
"rewards/rejected": -1.2383257150650024,
"step": 362
},
{
"epoch": 0.7758482500667914,
"grad_norm": 10.065922737121582,
"learning_rate": 7.190597576216384e-08,
"logits/chosen": -0.8641526699066162,
"logits/rejected": -0.8766672015190125,
"logps/chosen": -0.46004733443260193,
"logps/rejected": -0.4904063940048218,
"loss": 1.5026,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.150118350982666,
"rewards/margins": 0.07589760422706604,
"rewards/rejected": -1.2260159254074097,
"step": 363
},
{
"epoch": 0.7779855730697302,
"grad_norm": 9.752578735351562,
"learning_rate": 7.059861115979701e-08,
"logits/chosen": -1.0331135988235474,
"logits/rejected": -1.0849862098693848,
"logps/chosen": -0.4356737434864044,
"logps/rejected": -0.4790940582752228,
"loss": 1.6159,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0891844034194946,
"rewards/margins": 0.1085507869720459,
"rewards/rejected": -1.1977351903915405,
"step": 364
},
{
"epoch": 0.7801228960726689,
"grad_norm": 6.089657783508301,
"learning_rate": 6.930128404315214e-08,
"logits/chosen": -1.0164854526519775,
"logits/rejected": -1.006756067276001,
"logps/chosen": -0.6724580526351929,
"logps/rejected": -0.6376264095306396,
"loss": 1.4724,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.6811450719833374,
"rewards/margins": -0.08707903325557709,
"rewards/rejected": -1.5940660238265991,
"step": 365
},
{
"epoch": 0.7822602190756078,
"grad_norm": 4.81856107711792,
"learning_rate": 6.801406699752229e-08,
"logits/chosen": -1.1662445068359375,
"logits/rejected": -1.0658493041992188,
"logps/chosen": -0.4783725440502167,
"logps/rejected": -0.4546273946762085,
"loss": 1.6376,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.1959314346313477,
"rewards/margins": -0.059362899512052536,
"rewards/rejected": -1.1365684270858765,
"step": 366
},
{
"epoch": 0.7843975420785466,
"grad_norm": 9.418230056762695,
"learning_rate": 6.673703204254347e-08,
"logits/chosen": -1.2206920385360718,
"logits/rejected": -1.2510498762130737,
"logps/chosen": -0.594007670879364,
"logps/rejected": -0.8274646997451782,
"loss": 1.4951,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.4850192070007324,
"rewards/margins": 0.5836424827575684,
"rewards/rejected": -2.068661689758301,
"step": 367
},
{
"epoch": 0.7865348650814854,
"grad_norm": 10.291698455810547,
"learning_rate": 6.547025062816486e-08,
"logits/chosen": -0.8960355520248413,
"logits/rejected": -0.9344096779823303,
"logps/chosen": -0.36661607027053833,
"logps/rejected": -0.43485212326049805,
"loss": 1.554,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.916540265083313,
"rewards/margins": 0.17059014737606049,
"rewards/rejected": -1.0871303081512451,
"step": 368
},
{
"epoch": 0.7886721880844243,
"grad_norm": 4.118884086608887,
"learning_rate": 6.42137936306514e-08,
"logits/chosen": -1.0395543575286865,
"logits/rejected": -0.9369036555290222,
"logps/chosen": -0.3800487816333771,
"logps/rejected": -0.35273048281669617,
"loss": 1.5453,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9501218795776367,
"rewards/margins": -0.06829574704170227,
"rewards/rejected": -0.8818261623382568,
"step": 369
},
{
"epoch": 0.7908095110873631,
"grad_norm": 10.194417953491211,
"learning_rate": 6.296773134861824e-08,
"logits/chosen": -1.06082284450531,
"logits/rejected": -1.0715018510818481,
"logps/chosen": -0.45035964250564575,
"logps/rejected": -0.49578312039375305,
"loss": 1.5399,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.125899076461792,
"rewards/margins": 0.11355876922607422,
"rewards/rejected": -1.2394579648971558,
"step": 370
},
{
"epoch": 0.7929468340903019,
"grad_norm": 12.731304168701172,
"learning_rate": 6.173213349909728e-08,
"logits/chosen": -1.1646618843078613,
"logits/rejected": -1.0550154447555542,
"logps/chosen": -0.45987626910209656,
"logps/rejected": -0.6932123303413391,
"loss": 1.4858,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1496906280517578,
"rewards/margins": 0.583340048789978,
"rewards/rejected": -1.7330307960510254,
"step": 371
},
{
"epoch": 0.7950841570932408,
"grad_norm": 4.002368927001953,
"learning_rate": 6.050706921363672e-08,
"logits/chosen": -1.1935923099517822,
"logits/rejected": -1.2162138223648071,
"logps/chosen": -0.39056870341300964,
"logps/rejected": -0.5482085943222046,
"loss": 1.472,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9764216542243958,
"rewards/margins": 0.39409980177879333,
"rewards/rejected": -1.3705215454101562,
"step": 372
},
{
"epoch": 0.7972214800961795,
"grad_norm": 14.192221641540527,
"learning_rate": 5.929260703443337e-08,
"logits/chosen": -0.7887646555900574,
"logits/rejected": -0.8968151211738586,
"logps/chosen": -0.3553355634212494,
"logps/rejected": -0.43609827756881714,
"loss": 1.5724,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8883388638496399,
"rewards/margins": 0.20190683007240295,
"rewards/rejected": -1.0902457237243652,
"step": 373
},
{
"epoch": 0.7993588030991183,
"grad_norm": 14.193215370178223,
"learning_rate": 5.808881491049722e-08,
"logits/chosen": -1.265504240989685,
"logits/rejected": -1.3089518547058105,
"logps/chosen": -0.5325222611427307,
"logps/rejected": -0.530707836151123,
"loss": 1.5907,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.331305742263794,
"rewards/margins": -0.004536189138889313,
"rewards/rejected": -1.3267695903778076,
"step": 374
},
{
"epoch": 0.8014961261020572,
"grad_norm": 8.60618782043457,
"learning_rate": 5.6895760193850145e-08,
"logits/chosen": -1.0075315237045288,
"logits/rejected": -1.0355682373046875,
"logps/chosen": -0.4078059792518616,
"logps/rejected": -0.6130856275558472,
"loss": 1.5543,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.019515037536621,
"rewards/margins": 0.5131990313529968,
"rewards/rejected": -1.5327140092849731,
"step": 375
},
{
"epoch": 0.803633449104996,
"grad_norm": 8.175145149230957,
"learning_rate": 5.571350963575727e-08,
"logits/chosen": -1.1598341464996338,
"logits/rejected": -0.9834379553794861,
"logps/chosen": -0.4177122414112091,
"logps/rejected": -0.35740387439727783,
"loss": 1.5343,
"rewards/accuracies": 0.1875,
"rewards/chosen": -1.0442806482315063,
"rewards/margins": -0.1507708877325058,
"rewards/rejected": -0.8935096859931946,
"step": 376
},
{
"epoch": 0.8057707721079348,
"grad_norm": 20.987957000732422,
"learning_rate": 5.454212938299255e-08,
"logits/chosen": -1.140153169631958,
"logits/rejected": -1.0446147918701172,
"logps/chosen": -0.5767669677734375,
"logps/rejected": -0.4474826455116272,
"loss": 1.5854,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.4419174194335938,
"rewards/margins": -0.32321077585220337,
"rewards/rejected": -1.1187067031860352,
"step": 377
},
{
"epoch": 0.8079080951108736,
"grad_norm": 7.252152919769287,
"learning_rate": 5.338168497413756e-08,
"logits/chosen": -1.1144384145736694,
"logits/rejected": -1.2679189443588257,
"logps/chosen": -0.3241596817970276,
"logps/rejected": -0.5525774955749512,
"loss": 1.5213,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.8103991150856018,
"rewards/margins": 0.5710446238517761,
"rewards/rejected": -1.3814438581466675,
"step": 378
},
{
"epoch": 0.8100454181138125,
"grad_norm": 5.948306083679199,
"learning_rate": 5.223224133591475e-08,
"logits/chosen": -1.212296724319458,
"logits/rejected": -1.0987417697906494,
"logps/chosen": -0.7626843452453613,
"logps/rejected": -1.2524211406707764,
"loss": 1.4664,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.9067108631134033,
"rewards/margins": 1.2243417501449585,
"rewards/rejected": -3.1310529708862305,
"step": 379
},
{
"epoch": 0.8121827411167513,
"grad_norm": 5.803626537322998,
"learning_rate": 5.109386277955477e-08,
"logits/chosen": -1.1813595294952393,
"logits/rejected": -1.1350992918014526,
"logps/chosen": -0.45435211062431335,
"logps/rejected": -0.5456478595733643,
"loss": 1.4602,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.1358802318572998,
"rewards/margins": 0.22823941707611084,
"rewards/rejected": -1.364119529724121,
"step": 380
},
{
"epoch": 0.81432006411969,
"grad_norm": 4.112217903137207,
"learning_rate": 4.996661299719845e-08,
"logits/chosen": -0.8679373264312744,
"logits/rejected": -0.8792969584465027,
"logps/chosen": -0.4242730736732483,
"logps/rejected": -0.7051547765731812,
"loss": 1.5246,
"rewards/accuracies": 0.8125,
"rewards/chosen": -1.060682773590088,
"rewards/margins": 0.7022043466567993,
"rewards/rejected": -1.7628870010375977,
"step": 381
},
{
"epoch": 0.8164573871226289,
"grad_norm": 10.623425483703613,
"learning_rate": 4.885055505833291e-08,
"logits/chosen": -1.2642873525619507,
"logits/rejected": -1.189084768295288,
"logps/chosen": -0.4796138405799866,
"logps/rejected": -0.5438456535339355,
"loss": 1.5916,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.199034571647644,
"rewards/margins": 0.16057954728603363,
"rewards/rejected": -1.3596141338348389,
"step": 382
},
{
"epoch": 0.8185947101255677,
"grad_norm": 5.901862144470215,
"learning_rate": 4.774575140626316e-08,
"logits/chosen": -1.0387252569198608,
"logits/rejected": -0.9353397488594055,
"logps/chosen": -0.43210557103157043,
"logps/rejected": -0.474994421005249,
"loss": 1.4878,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.0802638530731201,
"rewards/margins": 0.1072220653295517,
"rewards/rejected": -1.187485933303833,
"step": 383
},
{
"epoch": 0.8207320331285065,
"grad_norm": 10.615392684936523,
"learning_rate": 4.6652263854618016e-08,
"logits/chosen": -1.2227771282196045,
"logits/rejected": -1.2380874156951904,
"logps/chosen": -0.42436325550079346,
"logps/rejected": -0.6415535807609558,
"loss": 1.4763,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0609081983566284,
"rewards/margins": 0.5429757833480835,
"rewards/rejected": -1.6038841009140015,
"step": 384
},
{
"epoch": 0.8228693561314454,
"grad_norm": 7.720395565032959,
"learning_rate": 4.557015358389216e-08,
"logits/chosen": -0.9876857995986938,
"logits/rejected": -1.0623700618743896,
"logps/chosen": -0.37730199098587036,
"logps/rejected": -0.587026059627533,
"loss": 1.4647,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9432549476623535,
"rewards/margins": 0.5243102312088013,
"rewards/rejected": -1.4675650596618652,
"step": 385
},
{
"epoch": 0.8250066791343842,
"grad_norm": 8.550374984741211,
"learning_rate": 4.449948113802254e-08,
"logits/chosen": -1.203737735748291,
"logits/rejected": -1.1764881610870361,
"logps/chosen": -0.33095237612724304,
"logps/rejected": -0.3797753155231476,
"loss": 1.5067,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.827380895614624,
"rewards/margins": 0.12205736339092255,
"rewards/rejected": -0.9494383335113525,
"step": 386
},
{
"epoch": 0.827144002137323,
"grad_norm": 8.158326148986816,
"learning_rate": 4.3440306421001324e-08,
"logits/chosen": -1.0478994846343994,
"logits/rejected": -1.0275698900222778,
"logps/chosen": -0.4138807952404022,
"logps/rejected": -0.5256016254425049,
"loss": 1.4697,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0347020626068115,
"rewards/margins": 0.2793022096157074,
"rewards/rejected": -1.3140041828155518,
"step": 387
},
{
"epoch": 0.8292813251402619,
"grad_norm": 4.749565124511719,
"learning_rate": 4.2392688693524055e-08,
"logits/chosen": -1.0925198793411255,
"logits/rejected": -1.053438663482666,
"logps/chosen": -0.4893158972263336,
"logps/rejected": -0.7248774766921997,
"loss": 1.4731,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2232897281646729,
"rewards/margins": 0.5889038443565369,
"rewards/rejected": -1.8121936321258545,
"step": 388
},
{
"epoch": 0.8314186481432007,
"grad_norm": 17.479476928710938,
"learning_rate": 4.1356686569674335e-08,
"logits/chosen": -1.237557291984558,
"logits/rejected": -1.1833033561706543,
"logps/chosen": -0.6519225835800171,
"logps/rejected": -0.6998899579048157,
"loss": 1.5504,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.6298065185546875,
"rewards/margins": 0.1199183464050293,
"rewards/rejected": -1.7497249841690063,
"step": 389
},
{
"epoch": 0.8335559711461394,
"grad_norm": 6.035606384277344,
"learning_rate": 4.0332358013644015e-08,
"logits/chosen": -1.16561758518219,
"logits/rejected": -1.1693965196609497,
"logps/chosen": -0.6768723726272583,
"logps/rejected": -1.003299593925476,
"loss": 1.4768,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.692180871963501,
"rewards/margins": 0.8160682916641235,
"rewards/rejected": -2.508248805999756,
"step": 390
},
{
"epoch": 0.8356932941490782,
"grad_norm": 6.278528213500977,
"learning_rate": 3.9319760336490205e-08,
"logits/chosen": -0.7999597191810608,
"logits/rejected": -0.6943086385726929,
"logps/chosen": -0.3784443140029907,
"logps/rejected": -0.4581376612186432,
"loss": 1.5857,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9461109042167664,
"rewards/margins": 0.1992333084344864,
"rewards/rejected": -1.1453441381454468,
"step": 391
},
{
"epoch": 0.8378306171520171,
"grad_norm": 4.925440788269043,
"learning_rate": 3.831895019292897e-08,
"logits/chosen": -1.1565301418304443,
"logits/rejected": -1.1625264883041382,
"logps/chosen": -0.5009636878967285,
"logps/rejected": -0.8770073056221008,
"loss": 1.4676,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2524092197418213,
"rewards/margins": 0.9401088953018188,
"rewards/rejected": -2.1925179958343506,
"step": 392
},
{
"epoch": 0.8399679401549559,
"grad_norm": 16.391368865966797,
"learning_rate": 3.732998357816514e-08,
"logits/chosen": -1.0616164207458496,
"logits/rejected": -1.0247597694396973,
"logps/chosen": -0.5045540928840637,
"logps/rejected": -0.467951238155365,
"loss": 1.6318,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.261385202407837,
"rewards/margins": -0.09150727093219757,
"rewards/rejected": -1.1698780059814453,
"step": 393
},
{
"epoch": 0.8421052631578947,
"grad_norm": 8.550324440002441,
"learning_rate": 3.635291582475963e-08,
"logits/chosen": -0.9775318503379822,
"logits/rejected": -0.8906686902046204,
"logps/chosen": -0.5394979119300842,
"logps/rejected": -0.5499922037124634,
"loss": 1.5031,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.3487448692321777,
"rewards/margins": 0.02623593807220459,
"rewards/rejected": -1.3749808073043823,
"step": 394
},
{
"epoch": 0.8442425861608336,
"grad_norm": 4.05560302734375,
"learning_rate": 3.538780159953347e-08,
"logits/chosen": -1.0048534870147705,
"logits/rejected": -0.839844822883606,
"logps/chosen": -0.4153073728084564,
"logps/rejected": -0.35160398483276367,
"loss": 1.5959,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.0382684469223022,
"rewards/margins": -0.1592584103345871,
"rewards/rejected": -0.8790099620819092,
"step": 395
},
{
"epoch": 0.8463799091637724,
"grad_norm": 7.721834182739258,
"learning_rate": 3.4434694900509345e-08,
"logits/chosen": -1.1735496520996094,
"logits/rejected": -1.150696039199829,
"logps/chosen": -0.5112527012825012,
"logps/rejected": -0.6088312864303589,
"loss": 1.5405,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2781318426132202,
"rewards/margins": 0.24394652247428894,
"rewards/rejected": -1.522078275680542,
"step": 396
},
{
"epoch": 0.8485172321667112,
"grad_norm": 5.097099781036377,
"learning_rate": 3.349364905389032e-08,
"logits/chosen": -1.2030649185180664,
"logits/rejected": -1.08577561378479,
"logps/chosen": -0.47224241495132446,
"logps/rejected": -0.6983097791671753,
"loss": 1.5699,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1806060075759888,
"rewards/margins": 0.5651683807373047,
"rewards/rejected": -1.745774507522583,
"step": 397
},
{
"epoch": 0.85065455516965,
"grad_norm": 9.325733184814453,
"learning_rate": 3.256471671107616e-08,
"logits/chosen": -0.9340042471885681,
"logits/rejected": -0.946205198764801,
"logps/chosen": -0.7264373302459717,
"logps/rejected": -0.6860026121139526,
"loss": 1.5801,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.8160933256149292,
"rewards/margins": -0.1010868027806282,
"rewards/rejected": -1.7150065898895264,
"step": 398
},
{
"epoch": 0.8527918781725888,
"grad_norm": 4.532627105712891,
"learning_rate": 3.1647949845717585e-08,
"logits/chosen": -0.8892905712127686,
"logits/rejected": -0.8267837166786194,
"logps/chosen": -0.4368503987789154,
"logps/rejected": -0.5416733026504517,
"loss": 1.4184,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0921260118484497,
"rewards/margins": 0.2620573043823242,
"rewards/rejected": -1.3541834354400635,
"step": 399
},
{
"epoch": 0.8549292011755276,
"grad_norm": 3.611345052719116,
"learning_rate": 3.074339975080836e-08,
"logits/chosen": -0.9715834259986877,
"logits/rejected": -0.9480459690093994,
"logps/chosen": -0.685413122177124,
"logps/rejected": -0.7482943534851074,
"loss": 1.4719,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.71353280544281,
"rewards/margins": 0.15720298886299133,
"rewards/rejected": -1.870735764503479,
"step": 400
},
{
"epoch": 0.8570665241784665,
"grad_norm": 49.35193634033203,
"learning_rate": 2.98511170358155e-08,
"logits/chosen": -0.959058940410614,
"logits/rejected": -0.9538683295249939,
"logps/chosen": -0.43192583322525024,
"logps/rejected": -0.4624338746070862,
"loss": 1.6139,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0798146724700928,
"rewards/margins": 0.07627001404762268,
"rewards/rejected": -1.156084656715393,
"step": 401
},
{
"epoch": 0.8592038471814053,
"grad_norm": 11.570405006408691,
"learning_rate": 2.8971151623847584e-08,
"logits/chosen": -1.0528483390808105,
"logits/rejected": -0.9815914630889893,
"logps/chosen": -0.584007740020752,
"logps/rejected": -0.6078099608421326,
"loss": 1.5858,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.4600192308425903,
"rewards/margins": 0.05950555205345154,
"rewards/rejected": -1.5195249319076538,
"step": 402
},
{
"epoch": 0.8613411701843441,
"grad_norm": 5.403092861175537,
"learning_rate": 2.8103552748861475e-08,
"logits/chosen": -1.052750825881958,
"logits/rejected": -1.0424790382385254,
"logps/chosen": -0.6250475645065308,
"logps/rejected": -0.6444798111915588,
"loss": 1.6129,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.5626189708709717,
"rewards/margins": 0.04858069866895676,
"rewards/rejected": -1.6111997365951538,
"step": 403
},
{
"epoch": 0.863478493187283,
"grad_norm": 5.894848346710205,
"learning_rate": 2.724836895290805e-08,
"logits/chosen": -1.0178946256637573,
"logits/rejected": -0.8687437772750854,
"logps/chosen": -0.36898887157440186,
"logps/rejected": -0.7522455453872681,
"loss": 1.5337,
"rewards/accuracies": 0.4375,
"rewards/chosen": -0.9224721789360046,
"rewards/margins": 0.958141565322876,
"rewards/rejected": -1.8806138038635254,
"step": 404
},
{
"epoch": 0.8656158161902218,
"grad_norm": 6.724638938903809,
"learning_rate": 2.6405648083415833e-08,
"logits/chosen": -1.1396384239196777,
"logits/rejected": -1.0131150484085083,
"logps/chosen": -0.6166858673095703,
"logps/rejected": -0.5131340026855469,
"loss": 1.5427,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.5417147874832153,
"rewards/margins": -0.258879691362381,
"rewards/rejected": -1.2828351259231567,
"step": 405
},
{
"epoch": 0.8677531391931605,
"grad_norm": 4.763136863708496,
"learning_rate": 2.55754372905142e-08,
"logits/chosen": -1.1291677951812744,
"logits/rejected": -1.0440433025360107,
"logps/chosen": -0.44505226612091064,
"logps/rejected": -0.46291491389274597,
"loss": 1.499,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1126307249069214,
"rewards/margins": 0.04465658590197563,
"rewards/rejected": -1.157287359237671,
"step": 406
},
{
"epoch": 0.8698904621960993,
"grad_norm": 5.994953155517578,
"learning_rate": 2.475778302439524e-08,
"logits/chosen": -1.0867843627929688,
"logits/rejected": -1.1085426807403564,
"logps/chosen": -0.6850754022598267,
"logps/rejected": -0.8681538105010986,
"loss": 1.54,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7126885652542114,
"rewards/margins": 0.4576959013938904,
"rewards/rejected": -2.170384645462036,
"step": 407
},
{
"epoch": 0.8720277851990382,
"grad_norm": 4.274337291717529,
"learning_rate": 2.3952731032714973e-08,
"logits/chosen": -0.8507957458496094,
"logits/rejected": -0.8216973543167114,
"logps/chosen": -0.352754145860672,
"logps/rejected": -0.6458288431167603,
"loss": 1.4806,
"rewards/accuracies": 0.75,
"rewards/chosen": -0.8818854093551636,
"rewards/margins": 0.7326868176460266,
"rewards/rejected": -1.6145721673965454,
"step": 408
},
{
"epoch": 0.874165108201977,
"grad_norm": 19.687917709350586,
"learning_rate": 2.3160326358033778e-08,
"logits/chosen": -1.0179362297058105,
"logits/rejected": -0.9422796964645386,
"logps/chosen": -0.6053561568260193,
"logps/rejected": -1.013503074645996,
"loss": 1.4768,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.5133905410766602,
"rewards/margins": 1.0203672647476196,
"rewards/rejected": -2.5337576866149902,
"step": 409
},
{
"epoch": 0.8763024312049158,
"grad_norm": 12.98466968536377,
"learning_rate": 2.2380613335296033e-08,
"logits/chosen": -0.8576774597167969,
"logits/rejected": -0.9601131677627563,
"logps/chosen": -0.42103275656700134,
"logps/rejected": -0.41525495052337646,
"loss": 1.5959,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0525819063186646,
"rewards/margins": -0.014444507658481598,
"rewards/rejected": -1.0381373167037964,
"step": 410
},
{
"epoch": 0.8784397542078547,
"grad_norm": 7.204395294189453,
"learning_rate": 2.1613635589349756e-08,
"logits/chosen": -0.9204460978507996,
"logits/rejected": -0.9429717063903809,
"logps/chosen": -0.3754885196685791,
"logps/rejected": -0.4179832935333252,
"loss": 1.5211,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9387211799621582,
"rewards/margins": 0.10623697191476822,
"rewards/rejected": -1.0449581146240234,
"step": 411
},
{
"epoch": 0.8805770772107935,
"grad_norm": 14.507763862609863,
"learning_rate": 2.085943603250595e-08,
"logits/chosen": -0.9056552648544312,
"logits/rejected": -0.8666099309921265,
"logps/chosen": -0.4410094618797302,
"logps/rejected": -0.6149858236312866,
"loss": 1.5065,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1025235652923584,
"rewards/margins": 0.43494072556495667,
"rewards/rejected": -1.5374643802642822,
"step": 412
},
{
"epoch": 0.8827144002137323,
"grad_norm": 4.710812568664551,
"learning_rate": 2.0118056862137354e-08,
"logits/chosen": -0.9734061360359192,
"logits/rejected": -0.8919450044631958,
"logps/chosen": -0.4355347156524658,
"logps/rejected": -0.4121510982513428,
"loss": 1.5909,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0888367891311646,
"rewards/margins": -0.058458905667066574,
"rewards/rejected": -1.030377745628357,
"step": 413
},
{
"epoch": 0.8848517232166712,
"grad_norm": 6.681394100189209,
"learning_rate": 1.938953955831771e-08,
"logits/chosen": -1.089871883392334,
"logits/rejected": -1.0652118921279907,
"logps/chosen": -0.45682665705680847,
"logps/rejected": -0.5210414528846741,
"loss": 1.4889,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.1420665979385376,
"rewards/margins": 0.1605370044708252,
"rewards/rejected": -1.3026037216186523,
"step": 414
},
{
"epoch": 0.88698904621961,
"grad_norm": 10.179585456848145,
"learning_rate": 1.8673924881500823e-08,
"logits/chosen": -0.9976410269737244,
"logits/rejected": -1.0120102167129517,
"logps/chosen": -0.7260380387306213,
"logps/rejected": -0.9380686283111572,
"loss": 1.5325,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.8150951862335205,
"rewards/margins": 0.5300765037536621,
"rewards/rejected": -2.3451716899871826,
"step": 415
},
{
"epoch": 0.8891263692225487,
"grad_norm": 6.529333591461182,
"learning_rate": 1.797125287024029e-08,
"logits/chosen": -1.0898345708847046,
"logits/rejected": -1.1249827146530151,
"logps/chosen": -0.5659042596817017,
"logps/rejected": -0.8098978996276855,
"loss": 1.4598,
"rewards/accuracies": 0.75,
"rewards/chosen": -1.414760947227478,
"rewards/margins": 0.6099839210510254,
"rewards/rejected": -2.024744749069214,
"step": 416
},
{
"epoch": 0.8912636922254876,
"grad_norm": 18.293628692626953,
"learning_rate": 1.7281562838948966e-08,
"logits/chosen": -0.9206041693687439,
"logits/rejected": -0.9363532066345215,
"logps/chosen": -0.6177800893783569,
"logps/rejected": -0.5964070558547974,
"loss": 1.6686,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.5444501638412476,
"rewards/margins": -0.05343271791934967,
"rewards/rejected": -1.4910173416137695,
"step": 417
},
{
"epoch": 0.8934010152284264,
"grad_norm": 6.754497528076172,
"learning_rate": 1.6604893375699592e-08,
"logits/chosen": -1.1073287725448608,
"logits/rejected": -0.9858848452568054,
"logps/chosen": -0.4562823176383972,
"logps/rejected": -0.5044858455657959,
"loss": 1.5667,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1407058238983154,
"rewards/margins": 0.12050891667604446,
"rewards/rejected": -1.2612147331237793,
"step": 418
},
{
"epoch": 0.8955383382313652,
"grad_norm": 4.217410087585449,
"learning_rate": 1.5941282340065697e-08,
"logits/chosen": -1.1437349319458008,
"logits/rejected": -1.2322208881378174,
"logps/chosen": -0.4406435191631317,
"logps/rejected": -0.6784199476242065,
"loss": 1.4618,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1016088724136353,
"rewards/margins": 0.5944410562515259,
"rewards/rejected": -1.6960498094558716,
"step": 419
},
{
"epoch": 0.897675661234304,
"grad_norm": 16.80787467956543,
"learning_rate": 1.5290766861003475e-08,
"logits/chosen": -0.9160170555114746,
"logits/rejected": -0.8729619383811951,
"logps/chosen": -0.3433056175708771,
"logps/rejected": -0.37045544385910034,
"loss": 1.6206,
"rewards/accuracies": 0.25,
"rewards/chosen": -0.8582640290260315,
"rewards/margins": 0.06787460297346115,
"rewards/rejected": -0.926138699054718,
"step": 420
},
{
"epoch": 0.8998129842372429,
"grad_norm": 13.28708553314209,
"learning_rate": 1.4653383334774228e-08,
"logits/chosen": -1.0449622869491577,
"logits/rejected": -1.0788357257843018,
"logps/chosen": -0.5669997930526733,
"logps/rejected": -0.7596959471702576,
"loss": 1.5144,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.4174995422363281,
"rewards/margins": 0.4817403554916382,
"rewards/rejected": -1.8992400169372559,
"step": 421
},
{
"epoch": 0.9019503072401817,
"grad_norm": 7.049681663513184,
"learning_rate": 1.4029167422908105e-08,
"logits/chosen": -1.148837924003601,
"logits/rejected": -1.0951889753341675,
"logps/chosen": -0.4990730583667755,
"logps/rejected": -0.607469916343689,
"loss": 1.5152,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2476826906204224,
"rewards/margins": 0.2709922194480896,
"rewards/rejected": -1.5186748504638672,
"step": 422
},
{
"epoch": 0.9040876302431204,
"grad_norm": 6.772598743438721,
"learning_rate": 1.3418154050208936e-08,
"logits/chosen": -0.9771215319633484,
"logits/rejected": -0.9951186776161194,
"logps/chosen": -0.5036316514015198,
"logps/rejected": -0.6173194646835327,
"loss": 1.5277,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2590792179107666,
"rewards/margins": 0.28421932458877563,
"rewards/rejected": -1.543298602104187,
"step": 423
},
{
"epoch": 0.9062249532460593,
"grad_norm": 5.654405117034912,
"learning_rate": 1.2820377402800064e-08,
"logits/chosen": -0.8762426972389221,
"logits/rejected": -0.6818346381187439,
"logps/chosen": -0.4185902774333954,
"logps/rejected": -0.9549139738082886,
"loss": 1.4405,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0464756488800049,
"rewards/margins": 1.3408091068267822,
"rewards/rejected": -2.387284755706787,
"step": 424
},
{
"epoch": 0.9083622762489981,
"grad_norm": 9.220845222473145,
"learning_rate": 1.2235870926211616e-08,
"logits/chosen": -0.9336291551589966,
"logits/rejected": -0.9068048596382141,
"logps/chosen": -0.4764101505279541,
"logps/rejected": -0.6432890295982361,
"loss": 1.5286,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.1910252571105957,
"rewards/margins": 0.41719722747802734,
"rewards/rejected": -1.608222484588623,
"step": 425
},
{
"epoch": 0.9104995992519369,
"grad_norm": 6.728222846984863,
"learning_rate": 1.1664667323509347e-08,
"logits/chosen": -1.0641913414001465,
"logits/rejected": -0.9187748432159424,
"logps/chosen": -0.3940516710281372,
"logps/rejected": -0.4065035283565521,
"loss": 1.5201,
"rewards/accuracies": 0.5,
"rewards/chosen": -0.985129177570343,
"rewards/margins": 0.031129609793424606,
"rewards/rejected": -1.0162588357925415,
"step": 426
},
{
"epoch": 0.9126369222548758,
"grad_norm": 7.619234561920166,
"learning_rate": 1.1106798553464802e-08,
"logits/chosen": -0.954318642616272,
"logits/rejected": -0.908359706401825,
"logps/chosen": -0.4075399935245514,
"logps/rejected": -0.4785918891429901,
"loss": 1.5005,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0188499689102173,
"rewards/margins": 0.17762985825538635,
"rewards/rejected": -1.1964799165725708,
"step": 427
},
{
"epoch": 0.9147742452578146,
"grad_norm": 6.409261226654053,
"learning_rate": 1.0562295828767387e-08,
"logits/chosen": -1.0399943590164185,
"logits/rejected": -1.0340969562530518,
"logps/chosen": -0.3984678387641907,
"logps/rejected": -0.5552449226379395,
"loss": 1.4632,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9961696863174438,
"rewards/margins": 0.39194267988204956,
"rewards/rejected": -1.3881123065948486,
"step": 428
},
{
"epoch": 0.9169115682607534,
"grad_norm": 10.060012817382812,
"learning_rate": 1.0031189614277763e-08,
"logits/chosen": -0.9579813480377197,
"logits/rejected": -0.9383722543716431,
"logps/chosen": -0.5487989187240601,
"logps/rejected": -0.5818829536437988,
"loss": 1.5268,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.371997356414795,
"rewards/margins": 0.0827101320028305,
"rewards/rejected": -1.4547075033187866,
"step": 429
},
{
"epoch": 0.9190488912636923,
"grad_norm": 6.2424139976501465,
"learning_rate": 9.513509625323518e-09,
"logits/chosen": -0.9296804666519165,
"logits/rejected": -0.9260187745094299,
"logps/chosen": -0.39111167192459106,
"logps/rejected": -0.45873600244522095,
"loss": 1.4814,
"rewards/accuracies": 0.6875,
"rewards/chosen": -0.9777792096138,
"rewards/margins": 0.16906076669692993,
"rewards/rejected": -1.14683997631073,
"step": 430
},
{
"epoch": 0.921186214266631,
"grad_norm": 10.799667358398438,
"learning_rate": 9.009284826036689e-09,
"logits/chosen": -0.892406702041626,
"logits/rejected": -0.9241263270378113,
"logps/chosen": -0.5018429160118103,
"logps/rejected": -0.6716207265853882,
"loss": 1.4844,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.2546073198318481,
"rewards/margins": 0.42444440722465515,
"rewards/rejected": -1.6790517568588257,
"step": 431
},
{
"epoch": 0.9233235372695698,
"grad_norm": 4.4624104499816895,
"learning_rate": 8.518543427732949e-09,
"logits/chosen": -0.9802000522613525,
"logits/rejected": -0.9559367895126343,
"logps/chosen": -0.38735339045524597,
"logps/rejected": -0.6876262426376343,
"loss": 1.481,
"rewards/accuracies": 0.8125,
"rewards/chosen": -0.9683833718299866,
"rewards/margins": 0.7506821155548096,
"rewards/rejected": -1.7190656661987305,
"step": 432
},
{
"epoch": 0.9254608602725087,
"grad_norm": 7.951170444488525,
"learning_rate": 8.041312887333396e-09,
"logits/chosen": -0.9957330226898193,
"logits/rejected": -0.9468004703521729,
"logps/chosen": -0.4312437176704407,
"logps/rejected": -0.5361820459365845,
"loss": 1.4571,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0781091451644897,
"rewards/margins": 0.26234593987464905,
"rewards/rejected": -1.3404550552368164,
"step": 433
},
{
"epoch": 0.9275981832754475,
"grad_norm": 12.679637908935547,
"learning_rate": 7.577619905828281e-09,
"logits/chosen": -1.0287508964538574,
"logits/rejected": -0.9482178688049316,
"logps/chosen": -0.40496230125427246,
"logps/rejected": -0.38896051049232483,
"loss": 1.4673,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.0124057531356812,
"rewards/margins": -0.040004514157772064,
"rewards/rejected": -0.9724011421203613,
"step": 434
},
{
"epoch": 0.9297355062783863,
"grad_norm": 6.805285930633545,
"learning_rate": 7.127490426783123e-09,
"logits/chosen": -1.1413686275482178,
"logits/rejected": -1.09022057056427,
"logps/chosen": -0.5725710391998291,
"logps/rejected": -0.6725842952728271,
"loss": 1.5341,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.4314275979995728,
"rewards/margins": 0.2500333786010742,
"rewards/rejected": -1.6814608573913574,
"step": 435
},
{
"epoch": 0.9318728292813251,
"grad_norm": 5.759010314941406,
"learning_rate": 6.6909496348871445e-09,
"logits/chosen": -1.1474663019180298,
"logits/rejected": -1.1745803356170654,
"logps/chosen": -0.6815481185913086,
"logps/rejected": -0.7699655294418335,
"loss": 1.5144,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.7038702964782715,
"rewards/margins": 0.2210434526205063,
"rewards/rejected": -1.9249136447906494,
"step": 436
},
{
"epoch": 0.934010152284264,
"grad_norm": 4.481965065002441,
"learning_rate": 6.268021954544095e-09,
"logits/chosen": -0.9412963390350342,
"logits/rejected": -0.9516785144805908,
"logps/chosen": -0.3796120285987854,
"logps/rejected": -0.37879857420921326,
"loss": 1.5547,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9490300416946411,
"rewards/margins": -0.002033662050962448,
"rewards/rejected": -0.9469964504241943,
"step": 437
},
{
"epoch": 0.9361474752872028,
"grad_norm": 6.431197166442871,
"learning_rate": 5.858731048505927e-09,
"logits/chosen": -1.076103687286377,
"logits/rejected": -1.0976781845092773,
"logps/chosen": -0.4042336046695709,
"logps/rejected": -0.612984836101532,
"loss": 1.4394,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.0105839967727661,
"rewards/margins": 0.5218778848648071,
"rewards/rejected": -1.5324620008468628,
"step": 438
},
{
"epoch": 0.9382847982901416,
"grad_norm": 11.778403282165527,
"learning_rate": 5.463099816548577e-09,
"logits/chosen": -1.085112452507019,
"logits/rejected": -1.024551272392273,
"logps/chosen": -0.3431437611579895,
"logps/rejected": -0.5487081408500671,
"loss": 1.5601,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.8578594923019409,
"rewards/margins": 0.5139108896255493,
"rewards/rejected": -1.3717702627182007,
"step": 439
},
{
"epoch": 0.9404221212930804,
"grad_norm": 4.792006492614746,
"learning_rate": 5.08115039419113e-09,
"logits/chosen": -0.9755135774612427,
"logits/rejected": -0.8824871778488159,
"logps/chosen": -0.3806512653827667,
"logps/rejected": -0.5343747735023499,
"loss": 1.52,
"rewards/accuracies": 0.5625,
"rewards/chosen": -0.9516281485557556,
"rewards/margins": 0.38430866599082947,
"rewards/rejected": -1.3359367847442627,
"step": 440
},
{
"epoch": 0.9425594442960192,
"grad_norm": 6.083741188049316,
"learning_rate": 4.712904151456864e-09,
"logits/chosen": -0.9559481143951416,
"logits/rejected": -0.8965126872062683,
"logps/chosen": -0.4216436743736267,
"logps/rejected": -0.4634767770767212,
"loss": 1.4695,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.0541093349456787,
"rewards/margins": 0.1045827567577362,
"rewards/rejected": -1.1586920022964478,
"step": 441
},
{
"epoch": 0.944696767298958,
"grad_norm": 4.866091251373291,
"learning_rate": 4.358381691677931e-09,
"logits/chosen": -0.9368703961372375,
"logits/rejected": -0.8826941251754761,
"logps/chosen": -0.3301496207714081,
"logps/rejected": -0.38065895438194275,
"loss": 1.4942,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.8253740072250366,
"rewards/margins": 0.1262734830379486,
"rewards/rejected": -0.9516474008560181,
"step": 442
},
{
"epoch": 0.9468340903018969,
"grad_norm": 7.398251056671143,
"learning_rate": 4.0176028503425826e-09,
"logits/chosen": -1.079075574874878,
"logits/rejected": -1.00128972530365,
"logps/chosen": -0.45567309856414795,
"logps/rejected": -0.39884287118911743,
"loss": 1.5388,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.1391828060150146,
"rewards/margins": -0.14207565784454346,
"rewards/rejected": -0.9971071481704712,
"step": 443
},
{
"epoch": 0.9489714133048357,
"grad_norm": 9.869709014892578,
"learning_rate": 3.6905866939851983e-09,
"logits/chosen": -1.1050983667373657,
"logits/rejected": -1.0123190879821777,
"logps/chosen": -0.44024163484573364,
"logps/rejected": -0.3926200568675995,
"loss": 1.4883,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.1006040573120117,
"rewards/margins": -0.1190539002418518,
"rewards/rejected": -0.9815501570701599,
"step": 444
},
{
"epoch": 0.9511087363077745,
"grad_norm": 4.558749198913574,
"learning_rate": 3.3773515191196646e-09,
"logits/chosen": -1.0600441694259644,
"logits/rejected": -1.0637009143829346,
"logps/chosen": -0.4824512004852295,
"logps/rejected": -0.594225287437439,
"loss": 1.5854,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.2061281204223633,
"rewards/margins": 0.2794351577758789,
"rewards/rejected": -1.4855631589889526,
"step": 445
},
{
"epoch": 0.9532460593107134,
"grad_norm": 7.49786376953125,
"learning_rate": 3.077914851215585e-09,
"logits/chosen": -1.06005859375,
"logits/rejected": -1.0157767534255981,
"logps/chosen": -0.5099815130233765,
"logps/rejected": -0.6151185035705566,
"loss": 1.4851,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.274953842163086,
"rewards/margins": 0.26284244656562805,
"rewards/rejected": -1.5377962589263916,
"step": 446
},
{
"epoch": 0.9553833823136522,
"grad_norm": 5.8320746421813965,
"learning_rate": 2.7922934437178692e-09,
"logits/chosen": -0.8938602209091187,
"logits/rejected": -0.9562631249427795,
"logps/chosen": -0.37319353222846985,
"logps/rejected": -0.37881189584732056,
"loss": 1.4395,
"rewards/accuracies": 0.375,
"rewards/chosen": -0.9329838752746582,
"rewards/margins": 0.0140459556132555,
"rewards/rejected": -0.9470298290252686,
"step": 447
},
{
"epoch": 0.957520705316591,
"grad_norm": 3.9899652004241943,
"learning_rate": 2.5205032771092592e-09,
"logits/chosen": -0.9867920279502869,
"logits/rejected": -0.9630347490310669,
"logps/chosen": -0.39268139004707336,
"logps/rejected": -0.635047435760498,
"loss": 1.5503,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9817034006118774,
"rewards/margins": 0.6059151291847229,
"rewards/rejected": -1.5876185894012451,
"step": 448
},
{
"epoch": 0.9596580283195298,
"grad_norm": 7.979334354400635,
"learning_rate": 2.2625595580163247e-09,
"logits/chosen": -1.1002980470657349,
"logits/rejected": -1.0980544090270996,
"logps/chosen": -0.7607989311218262,
"logps/rejected": -0.9997137784957886,
"loss": 1.5401,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.9019973278045654,
"rewards/margins": 0.5972872972488403,
"rewards/rejected": -2.4992847442626953,
"step": 449
},
{
"epoch": 0.9617953513224686,
"grad_norm": 21.347997665405273,
"learning_rate": 2.0184767183584474e-09,
"logits/chosen": -0.8544862866401672,
"logits/rejected": -0.8984670042991638,
"logps/chosen": -0.5437809228897095,
"logps/rejected": -0.6033496856689453,
"loss": 1.4988,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.359452247619629,
"rewards/margins": 0.14892183244228363,
"rewards/rejected": -1.5083742141723633,
"step": 450
},
{
"epoch": 0.9639326743254074,
"grad_norm": 8.341211318969727,
"learning_rate": 1.7882684145406612e-09,
"logits/chosen": -1.0166016817092896,
"logits/rejected": -1.0338623523712158,
"logps/chosen": -0.5460320711135864,
"logps/rejected": -0.5169766545295715,
"loss": 1.5717,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.3650802373886108,
"rewards/margins": -0.07263859361410141,
"rewards/rejected": -1.292441725730896,
"step": 451
},
{
"epoch": 0.9660699973283462,
"grad_norm": 12.869359970092773,
"learning_rate": 1.5719475266893489e-09,
"logits/chosen": -1.0716265439987183,
"logits/rejected": -1.0736249685287476,
"logps/chosen": -0.4328761696815491,
"logps/rejected": -0.5213409066200256,
"loss": 1.5382,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.0821905136108398,
"rewards/margins": 0.22116179764270782,
"rewards/rejected": -1.3033523559570312,
"step": 452
},
{
"epoch": 0.9682073203312851,
"grad_norm": 5.998004913330078,
"learning_rate": 1.3695261579316775e-09,
"logits/chosen": -1.1411190032958984,
"logits/rejected": -1.0635725259780884,
"logps/chosen": -0.6178188323974609,
"logps/rejected": -0.592136561870575,
"loss": 1.6393,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.5445469617843628,
"rewards/margins": -0.064205601811409,
"rewards/rejected": -1.4803414344787598,
"step": 453
},
{
"epoch": 0.9703446433342239,
"grad_norm": 7.896289348602295,
"learning_rate": 1.1810156337183908e-09,
"logits/chosen": -1.009376883506775,
"logits/rejected": -0.9895673394203186,
"logps/chosen": -0.7131325602531433,
"logps/rejected": -0.6051285266876221,
"loss": 1.5398,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.7828314304351807,
"rewards/margins": -0.2700101137161255,
"rewards/rejected": -1.5128213167190552,
"step": 454
},
{
"epoch": 0.9724819663371627,
"grad_norm": 6.323320388793945,
"learning_rate": 1.0064265011902328e-09,
"logits/chosen": -1.0121572017669678,
"logits/rejected": -0.9521965980529785,
"logps/chosen": -0.5518695712089539,
"logps/rejected": -0.5250836610794067,
"loss": 1.5786,
"rewards/accuracies": 0.3125,
"rewards/chosen": -1.379673957824707,
"rewards/margins": -0.06696499139070511,
"rewards/rejected": -1.312708854675293,
"step": 455
},
{
"epoch": 0.9746192893401016,
"grad_norm": 6.093278408050537,
"learning_rate": 8.457685285878091e-10,
"logits/chosen": -0.9040694236755371,
"logits/rejected": -1.0100067853927612,
"logps/chosen": -0.6011037230491638,
"logps/rejected": -0.9459134340286255,
"loss": 1.4765,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.502759337425232,
"rewards/margins": 0.8620242476463318,
"rewards/rejected": -2.364783525466919,
"step": 456
},
{
"epoch": 0.9767566123430403,
"grad_norm": 7.644316673278809,
"learning_rate": 6.990507047049676e-10,
"logits/chosen": -1.1642239093780518,
"logits/rejected": -1.327940583229065,
"logps/chosen": -0.7490012645721436,
"logps/rejected": -0.8202410340309143,
"loss": 1.6319,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.8725032806396484,
"rewards/margins": 0.1780991107225418,
"rewards/rejected": -2.0506021976470947,
"step": 457
},
{
"epoch": 0.9788939353459791,
"grad_norm": 4.500396728515625,
"learning_rate": 5.662812383859794e-10,
"logits/chosen": -1.057512640953064,
"logits/rejected": -1.0183889865875244,
"logps/chosen": -0.5618507862091064,
"logps/rejected": -0.7458251714706421,
"loss": 1.5519,
"rewards/accuracies": 0.625,
"rewards/chosen": -1.4046270847320557,
"rewards/margins": 0.4599360227584839,
"rewards/rejected": -1.86456298828125,
"step": 458
},
{
"epoch": 0.981031258348918,
"grad_norm": 5.931386470794678,
"learning_rate": 4.4746755806621126e-10,
"logits/chosen": -1.0571941137313843,
"logits/rejected": -1.1213597059249878,
"logps/chosen": -0.642679750919342,
"logps/rejected": -0.7841815948486328,
"loss": 1.4134,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.6066994667053223,
"rewards/margins": 0.3537544906139374,
"rewards/rejected": -1.9604538679122925,
"step": 459
},
{
"epoch": 0.9831685813518568,
"grad_norm": 9.15275764465332,
"learning_rate": 3.4261631135654167e-10,
"logits/chosen": -0.8693954348564148,
"logits/rejected": -0.7475937008857727,
"logps/chosen": -0.3651605546474457,
"logps/rejected": -0.42595067620277405,
"loss": 1.4887,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9129014015197754,
"rewards/margins": 0.15197524428367615,
"rewards/rejected": -1.064876675605774,
"step": 460
},
{
"epoch": 0.9853059043547956,
"grad_norm": 11.78666877746582,
"learning_rate": 2.5173336467135263e-10,
"logits/chosen": -1.1248339414596558,
"logits/rejected": -1.013832688331604,
"logps/chosen": -0.48437923192977905,
"logps/rejected": -0.5153346657752991,
"loss": 1.487,
"rewards/accuracies": 0.6875,
"rewards/chosen": -1.2109479904174805,
"rewards/margins": 0.07738865166902542,
"rewards/rejected": -1.2883366346359253,
"step": 461
},
{
"epoch": 0.9874432273577345,
"grad_norm": 5.332235336303711,
"learning_rate": 1.7482380290034792e-10,
"logits/chosen": -1.0759484767913818,
"logits/rejected": -0.9958518147468567,
"logps/chosen": -0.4139256775379181,
"logps/rejected": -0.7975391745567322,
"loss": 1.3972,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0348142385482788,
"rewards/margins": 0.9590335488319397,
"rewards/rejected": -1.9938479661941528,
"step": 462
},
{
"epoch": 0.9895805503606733,
"grad_norm": 5.443465709686279,
"learning_rate": 1.1189192912416933e-10,
"logits/chosen": -1.0934535264968872,
"logits/rejected": -1.0068289041519165,
"logps/chosen": -0.48808667063713074,
"logps/rejected": -0.6034483313560486,
"loss": 1.4738,
"rewards/accuracies": 0.5,
"rewards/chosen": -1.2202166318893433,
"rewards/margins": 0.2884041368961334,
"rewards/rejected": -1.5086207389831543,
"step": 463
},
{
"epoch": 0.9917178733636121,
"grad_norm": 5.87878942489624,
"learning_rate": 6.294126437336733e-11,
"logits/chosen": -1.0164133310317993,
"logits/rejected": -0.9699276685714722,
"logps/chosen": -0.4373021125793457,
"logps/rejected": -0.5335453152656555,
"loss": 1.4684,
"rewards/accuracies": 0.4375,
"rewards/chosen": -1.0932552814483643,
"rewards/margins": 0.24060802161693573,
"rewards/rejected": -1.3338632583618164,
"step": 464
},
{
"epoch": 0.9938551963665508,
"grad_norm": 4.464337348937988,
"learning_rate": 2.797454743164174e-11,
"logits/chosen": -1.2042040824890137,
"logits/rejected": -1.055449366569519,
"logps/chosen": -0.3999456763267517,
"logps/rejected": -0.4777381420135498,
"loss": 1.5844,
"rewards/accuracies": 0.625,
"rewards/chosen": -0.9998641610145569,
"rewards/margins": 0.19448117911815643,
"rewards/rejected": -1.1943453550338745,
"step": 465
},
{
"epoch": 0.9959925193694897,
"grad_norm": 9.290578842163086,
"learning_rate": 6.993734682547714e-12,
"logits/chosen": -0.8978444933891296,
"logits/rejected": -0.8331681489944458,
"logps/chosen": -0.521608293056488,
"logps/rejected": -0.5517727136611938,
"loss": 1.5965,
"rewards/accuracies": 0.375,
"rewards/chosen": -1.304020643234253,
"rewards/margins": 0.07541122287511826,
"rewards/rejected": -1.379431962966919,
"step": 466
},
{
"epoch": 0.9981298423724285,
"grad_norm": 12.642468452453613,
"learning_rate": 0.0,
"logits/chosen": -0.8587179183959961,
"logits/rejected": -0.8208239078521729,
"logps/chosen": -0.4356221556663513,
"logps/rejected": -0.39131855964660645,
"loss": 1.4854,
"rewards/accuracies": 0.5625,
"rewards/chosen": -1.0890554189682007,
"rewards/margins": -0.11075909435749054,
"rewards/rejected": -0.9782962799072266,
"step": 467
},
{
"epoch": 0.9981298423724285,
"step": 467,
"total_flos": 0.0,
"train_loss": 0.0,
"train_runtime": 0.0036,
"train_samples_per_second": 16407823.488,
"train_steps_per_second": 127972.035
}
],
"logging_steps": 1,
"max_steps": 467,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 32,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}