{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9995071463775259, "eval_steps": 400, "global_step": 507, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001971414489896501, "grad_norm": 6.2392770862642, "learning_rate": 9.803921568627451e-09, "logits/chosen": -1.594488501548767, "logits/rejected": -1.1860766410827637, "logps/chosen": -198.3888397216797, "logps/rejected": -269.352783203125, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.009857072449482503, "grad_norm": 5.49954498256661, "learning_rate": 4.901960784313725e-08, "logits/chosen": -1.645488977432251, "logits/rejected": -1.0096673965454102, "logps/chosen": -192.4307861328125, "logps/rejected": -247.57391357421875, "loss": 0.6931, "rewards/accuracies": 0.390625, "rewards/chosen": 0.00013264300650916994, "rewards/margins": 0.0001808845845516771, "rewards/rejected": -4.824160714633763e-05, "step": 5 }, { "epoch": 0.019714144898965006, "grad_norm": 4.196436716438617, "learning_rate": 9.80392156862745e-08, "logits/chosen": -1.6045820713043213, "logits/rejected": -1.0348637104034424, "logps/chosen": -184.26632690429688, "logps/rejected": -245.4076690673828, "loss": 0.6931, "rewards/accuracies": 0.48124998807907104, "rewards/chosen": 0.0013285436434671283, "rewards/margins": -0.0003174581506755203, "rewards/rejected": 0.001646001823246479, "step": 10 }, { "epoch": 0.02957121734844751, "grad_norm": 5.768149867251834, "learning_rate": 1.4705882352941175e-07, "logits/chosen": -1.8137686252593994, "logits/rejected": -1.135617971420288, "logps/chosen": -199.5909881591797, "logps/rejected": -266.2090759277344, "loss": 0.6924, "rewards/accuracies": 0.5562499761581421, "rewards/chosen": -0.0006634569144807756, "rewards/margins": 0.0016718091210350394, "rewards/rejected": -0.002335265977308154, "step": 15 }, { "epoch": 0.03942828979793001, "grad_norm": 5.9407046802470065, "learning_rate": 1.96078431372549e-07, "logits/chosen": -1.7376708984375, "logits/rejected": -1.1297136545181274, "logps/chosen": -189.01934814453125, "logps/rejected": -255.4130859375, "loss": 0.6901, "rewards/accuracies": 0.668749988079071, "rewards/chosen": -0.006191645748913288, "rewards/margins": 0.006456127855926752, "rewards/rejected": -0.012647772207856178, "step": 20 }, { "epoch": 0.04928536224741252, "grad_norm": 5.2689388633967456, "learning_rate": 2.4509803921568627e-07, "logits/chosen": -1.7063930034637451, "logits/rejected": -1.1289308071136475, "logps/chosen": -204.759765625, "logps/rejected": -266.6024169921875, "loss": 0.684, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": -0.017932727932929993, "rewards/margins": 0.019068485125899315, "rewards/rejected": -0.03700121492147446, "step": 25 }, { "epoch": 0.05914243469689502, "grad_norm": 6.582821307605904, "learning_rate": 2.941176470588235e-07, "logits/chosen": -1.5701669454574585, "logits/rejected": -1.041677474975586, "logps/chosen": -194.34347534179688, "logps/rejected": -276.304443359375, "loss": 0.673, "rewards/accuracies": 0.7437499761581421, "rewards/chosen": -0.02758154645562172, "rewards/margins": 0.045433152467012405, "rewards/rejected": -0.07301469147205353, "step": 30 }, { "epoch": 0.06899950714637752, "grad_norm": 10.279144076298133, "learning_rate": 3.431372549019608e-07, "logits/chosen": -1.4824097156524658, "logits/rejected": -0.9899765253067017, "logps/chosen": -198.76766967773438, "logps/rejected": -265.6862487792969, "loss": 0.6359, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -0.068024180829525, "rewards/margins": 0.12255563586950302, "rewards/rejected": -0.1905798316001892, "step": 35 }, { "epoch": 0.07885657959586002, "grad_norm": 9.399070867553222, "learning_rate": 3.92156862745098e-07, "logits/chosen": -1.7072070837020874, "logits/rejected": -1.1361684799194336, "logps/chosen": -204.9685516357422, "logps/rejected": -303.8945617675781, "loss": 0.5789, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": -0.17202235758304596, "rewards/margins": 0.32544782757759094, "rewards/rejected": -0.4974702000617981, "step": 40 }, { "epoch": 0.08871365204534254, "grad_norm": 13.971472653574747, "learning_rate": 4.4117647058823526e-07, "logits/chosen": -2.052572727203369, "logits/rejected": -1.6851530075073242, "logps/chosen": -326.47637939453125, "logps/rejected": -529.0755004882812, "loss": 0.5125, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": -1.31343674659729, "rewards/margins": 1.4102681875228882, "rewards/rejected": -2.7237050533294678, "step": 45 }, { "epoch": 0.09857072449482504, "grad_norm": 15.586119367213884, "learning_rate": 4.901960784313725e-07, "logits/chosen": -2.3275997638702393, "logits/rejected": -1.8939182758331299, "logps/chosen": -430.58563232421875, "logps/rejected": -688.560302734375, "loss": 0.491, "rewards/accuracies": 0.8125, "rewards/chosen": -2.3751883506774902, "rewards/margins": 2.010326862335205, "rewards/rejected": -4.385515213012695, "step": 50 }, { "epoch": 0.10842779694430754, "grad_norm": 25.434888651559017, "learning_rate": 4.999050767562379e-07, "logits/chosen": -2.086081027984619, "logits/rejected": -1.7880547046661377, "logps/chosen": -363.26824951171875, "logps/rejected": -565.7152099609375, "loss": 0.4485, "rewards/accuracies": 0.78125, "rewards/chosen": -1.765494704246521, "rewards/margins": 1.468611478805542, "rewards/rejected": -3.2341067790985107, "step": 55 }, { "epoch": 0.11828486939379004, "grad_norm": 33.28729029749558, "learning_rate": 4.99519574616467e-07, "logits/chosen": -2.2118542194366455, "logits/rejected": -1.929535150527954, "logps/chosen": -434.70794677734375, "logps/rejected": -744.1588745117188, "loss": 0.4177, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -2.4308719635009766, "rewards/margins": 2.5395278930664062, "rewards/rejected": -4.970398902893066, "step": 60 }, { "epoch": 0.12814194184327254, "grad_norm": 18.906078399540743, "learning_rate": 4.988380179235842e-07, "logits/chosen": -2.071911334991455, "logits/rejected": -1.7777721881866455, "logps/chosen": -411.3829040527344, "logps/rejected": -706.1156005859375, "loss": 0.3931, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -2.1888458728790283, "rewards/margins": 2.4369874000549316, "rewards/rejected": -4.625833511352539, "step": 65 }, { "epoch": 0.13799901429275505, "grad_norm": 29.037425921796533, "learning_rate": 4.978612153434526e-07, "logits/chosen": -2.3122410774230957, "logits/rejected": -2.039794683456421, "logps/chosen": -457.93646240234375, "logps/rejected": -924.6302490234375, "loss": 0.4394, "rewards/accuracies": 0.875, "rewards/chosen": -2.7360401153564453, "rewards/margins": 3.9763436317443848, "rewards/rejected": -6.712383270263672, "step": 70 }, { "epoch": 0.14785608674223755, "grad_norm": 12.920105460240974, "learning_rate": 4.965903258506806e-07, "logits/chosen": -2.1728882789611816, "logits/rejected": -1.9507039785385132, "logps/chosen": -443.1044006347656, "logps/rejected": -733.8536376953125, "loss": 0.4033, "rewards/accuracies": 0.8125, "rewards/chosen": -2.428837537765503, "rewards/margins": 2.4469170570373535, "rewards/rejected": -4.8757548332214355, "step": 75 }, { "epoch": 0.15771315919172005, "grad_norm": 13.726152345059775, "learning_rate": 4.950268573535011e-07, "logits/chosen": -2.0774412155151367, "logits/rejected": -1.877873420715332, "logps/chosen": -434.2333068847656, "logps/rejected": -697.5498657226562, "loss": 0.3896, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -2.339465618133545, "rewards/margins": 2.073235034942627, "rewards/rejected": -4.412700176239014, "step": 80 }, { "epoch": 0.16757023164120255, "grad_norm": 22.96957352147464, "learning_rate": 4.93172664904641e-07, "logits/chosen": -2.578918695449829, "logits/rejected": -2.313844680786133, "logps/chosen": -714.9383544921875, "logps/rejected": -1185.1837158203125, "loss": 0.3544, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -5.246799468994141, "rewards/margins": 4.040070533752441, "rewards/rejected": -9.286870002746582, "step": 85 }, { "epoch": 0.17742730409068508, "grad_norm": 16.12185087053667, "learning_rate": 4.910299485003033e-07, "logits/chosen": -2.3522980213165283, "logits/rejected": -2.1312594413757324, "logps/chosen": -548.0660400390625, "logps/rejected": -941.7537231445312, "loss": 0.326, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.534759044647217, "rewards/margins": 3.3071117401123047, "rewards/rejected": -6.841870307922363, "step": 90 }, { "epoch": 0.18728437654016758, "grad_norm": 38.45908394327309, "learning_rate": 4.886012504698769e-07, "logits/chosen": -2.29638671875, "logits/rejected": -2.0095603466033936, "logps/chosen": -526.743408203125, "logps/rejected": -906.7442626953125, "loss": 0.3562, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3386921882629395, "rewards/margins": 3.1595466136932373, "rewards/rejected": -6.498239040374756, "step": 95 }, { "epoch": 0.19714144898965008, "grad_norm": 23.37026509407894, "learning_rate": 4.858894524594652e-07, "logits/chosen": -2.509087085723877, "logits/rejected": -2.2394092082977295, "logps/chosen": -597.5819091796875, "logps/rejected": -1110.536376953125, "loss": 0.3208, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.074545383453369, "rewards/margins": 4.4778618812561035, "rewards/rejected": -8.552406311035156, "step": 100 }, { "epoch": 0.20699852143913258, "grad_norm": 17.538993246799073, "learning_rate": 4.828977720128198e-07, "logits/chosen": -2.368518114089966, "logits/rejected": -2.0958077907562256, "logps/chosen": -522.4010620117188, "logps/rejected": -853.4436645507812, "loss": 0.3199, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -3.3017165660858154, "rewards/margins": 2.8141353130340576, "rewards/rejected": -6.115852355957031, "step": 105 }, { "epoch": 0.21685559388861508, "grad_norm": 14.205208079234838, "learning_rate": 4.796297587537285e-07, "logits/chosen": -2.4165451526641846, "logits/rejected": -2.1057496070861816, "logps/chosen": -577.1276245117188, "logps/rejected": -963.6633911132812, "loss": 0.2935, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -3.8097610473632812, "rewards/margins": 3.394871950149536, "rewards/rejected": -7.2046332359313965, "step": 110 }, { "epoch": 0.22671266633809758, "grad_norm": 16.990565766105274, "learning_rate": 4.760892901743944e-07, "logits/chosen": -2.536337375640869, "logits/rejected": -2.2590508460998535, "logps/chosen": -760.9464111328125, "logps/rejected": -1193.0130615234375, "loss": 0.3468, "rewards/accuracies": 0.84375, "rewards/chosen": -5.601290702819824, "rewards/margins": 3.6806259155273438, "rewards/rejected": -9.281916618347168, "step": 115 }, { "epoch": 0.23656973878758009, "grad_norm": 14.381039222874595, "learning_rate": 4.7228056703479626e-07, "logits/chosen": -2.490741014480591, "logits/rejected": -2.1797027587890625, "logps/chosen": -651.6326293945312, "logps/rejected": -1045.991943359375, "loss": 0.3, "rewards/accuracies": 0.824999988079071, "rewards/chosen": -4.517138481140137, "rewards/margins": 3.300055742263794, "rewards/rejected": -7.817193508148193, "step": 120 }, { "epoch": 0.2464268112370626, "grad_norm": 13.845809576206165, "learning_rate": 4.6820810837849535e-07, "logits/chosen": -2.4549553394317627, "logits/rejected": -2.05999755859375, "logps/chosen": -606.1448974609375, "logps/rejected": -1030.6544189453125, "loss": 0.2987, "rewards/accuracies": 0.875, "rewards/chosen": -4.098814487457275, "rewards/margins": 3.521782636642456, "rewards/rejected": -7.620597839355469, "step": 125 }, { "epoch": 0.2562838836865451, "grad_norm": 30.897064038144002, "learning_rate": 4.63876746170797e-07, "logits/chosen": -2.3905959129333496, "logits/rejected": -2.17751145362854, "logps/chosen": -677.5357055664062, "logps/rejected": -1075.299072265625, "loss": 0.299, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -4.783341407775879, "rewards/margins": 3.367708683013916, "rewards/rejected": -8.151049613952637, "step": 130 }, { "epoch": 0.2661409561360276, "grad_norm": 24.59610522580519, "learning_rate": 4.592916195656321e-07, "logits/chosen": -2.686401844024658, "logits/rejected": -2.2882132530212402, "logps/chosen": -798.7413330078125, "logps/rejected": -1337.7080078125, "loss": 0.2956, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -6.00932502746582, "rewards/margins": 4.70266580581665, "rewards/rejected": -10.711990356445312, "step": 135 }, { "epoch": 0.2759980285855101, "grad_norm": 14.965471726804886, "learning_rate": 4.544581688079602e-07, "logits/chosen": -2.4349093437194824, "logits/rejected": -2.14192533493042, "logps/chosen": -705.6304321289062, "logps/rejected": -1094.288330078125, "loss": 0.2863, "rewards/accuracies": 0.875, "rewards/chosen": -5.049565315246582, "rewards/margins": 3.3041865825653076, "rewards/rejected": -8.353752136230469, "step": 140 }, { "epoch": 0.2858551010349926, "grad_norm": 18.293122078134097, "learning_rate": 4.493821287789272e-07, "logits/chosen": -2.5565428733825684, "logits/rejected": -2.1939361095428467, "logps/chosen": -744.5687255859375, "logps/rejected": -1154.7205810546875, "loss": 0.2788, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": -5.435603618621826, "rewards/margins": 3.5441932678222656, "rewards/rejected": -8.979796409606934, "step": 145 }, { "epoch": 0.2957121734844751, "grad_norm": 20.91998686803295, "learning_rate": 4.4406952219143934e-07, "logits/chosen": -2.5498974323272705, "logits/rejected": -2.22133731842041, "logps/chosen": -842.6162109375, "logps/rejected": -1307.8778076171875, "loss": 0.295, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -6.563225746154785, "rewards/margins": 4.067451477050781, "rewards/rejected": -10.630678176879883, "step": 150 }, { "epoch": 0.3055692459339576, "grad_norm": 16.16798121676358, "learning_rate": 4.38526652444224e-07, "logits/chosen": -2.5155484676361084, "logits/rejected": -2.1966238021850586, "logps/chosen": -806.795166015625, "logps/rejected": -1259.8773193359375, "loss": 0.2963, "rewards/accuracies": 0.856249988079071, "rewards/chosen": -6.04946231842041, "rewards/margins": 3.812061309814453, "rewards/rejected": -9.861523628234863, "step": 155 }, { "epoch": 0.3154263183834401, "grad_norm": 18.735794107249802, "learning_rate": 4.3276009614285824e-07, "logits/chosen": -2.464740037918091, "logits/rejected": -2.1177756786346436, "logps/chosen": -709.6548461914062, "logps/rejected": -1163.3350830078125, "loss": 0.2554, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.127840042114258, "rewards/margins": 3.861670970916748, "rewards/rejected": -8.989511489868164, "step": 160 }, { "epoch": 0.3252833908329226, "grad_norm": 22.02574974928147, "learning_rate": 4.2677669529663686e-07, "logits/chosen": -2.545640707015991, "logits/rejected": -2.2462990283966064, "logps/chosen": -783.61669921875, "logps/rejected": -1255.912353515625, "loss": 0.2591, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.863072395324707, "rewards/margins": 4.111878395080566, "rewards/rejected": -9.974950790405273, "step": 165 }, { "epoch": 0.3351404632824051, "grad_norm": 18.815167005575123, "learning_rate": 4.2058354920054043e-07, "logits/chosen": -2.5555951595306396, "logits/rejected": -2.2355425357818604, "logps/chosen": -801.5789184570312, "logps/rejected": -1247.4630126953125, "loss": 0.2675, "rewards/accuracies": 0.84375, "rewards/chosen": -6.109245777130127, "rewards/margins": 3.8287367820739746, "rewards/rejected": -9.937983512878418, "step": 170 }, { "epoch": 0.34499753573188763, "grad_norm": 15.86237856217812, "learning_rate": 4.141880060119336e-07, "logits/chosen": -2.541696786880493, "logits/rejected": -2.180537700653076, "logps/chosen": -784.6647338867188, "logps/rejected": -1234.05126953125, "loss": 0.2502, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.834389686584473, "rewards/margins": 3.9414896965026855, "rewards/rejected": -9.77587890625, "step": 175 }, { "epoch": 0.35485460818137016, "grad_norm": 16.77349624146522, "learning_rate": 4.0759765403198877e-07, "logits/chosen": -2.5138328075408936, "logits/rejected": -2.1284890174865723, "logps/chosen": -700.7459106445312, "logps/rejected": -1123.9356689453125, "loss": 0.2808, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.06411075592041, "rewards/margins": 3.7050411701202393, "rewards/rejected": -8.769152641296387, "step": 180 }, { "epoch": 0.36471168063085263, "grad_norm": 17.550598446162923, "learning_rate": 4.008203127021797e-07, "logits/chosen": -2.5796236991882324, "logits/rejected": -2.215527057647705, "logps/chosen": -717.72119140625, "logps/rejected": -1230.483642578125, "loss": 0.2249, "rewards/accuracies": 0.9437500238418579, "rewards/chosen": -5.206206798553467, "rewards/margins": 4.433660507202148, "rewards/rejected": -9.639867782592773, "step": 185 }, { "epoch": 0.37456875308033516, "grad_norm": 15.558233723999136, "learning_rate": 3.9386402332652754e-07, "logits/chosen": -2.6024489402770996, "logits/rejected": -2.3488709926605225, "logps/chosen": -900.3855590820312, "logps/rejected": -1392.37255859375, "loss": 0.2267, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -7.0901055335998535, "rewards/margins": 4.340862274169922, "rewards/rejected": -11.430967330932617, "step": 190 }, { "epoch": 0.38442582552981763, "grad_norm": 22.187048055147955, "learning_rate": 3.867370395306068e-07, "logits/chosen": -2.6506357192993164, "logits/rejected": -2.2959604263305664, "logps/chosen": -900.3836059570312, "logps/rejected": -1402.96630859375, "loss": 0.2693, "rewards/accuracies": 0.875, "rewards/chosen": -7.0995588302612305, "rewards/margins": 4.546249866485596, "rewards/rejected": -11.645808219909668, "step": 195 }, { "epoch": 0.39428289797930016, "grad_norm": 15.588853964432303, "learning_rate": 3.794478174686328e-07, "logits/chosen": -2.5797057151794434, "logits/rejected": -2.1939449310302734, "logps/chosen": -769.4427490234375, "logps/rejected": -1267.3035888671875, "loss": 0.2491, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.717661380767822, "rewards/margins": 4.36967658996582, "rewards/rejected": -10.087339401245117, "step": 200 }, { "epoch": 0.40413997042878264, "grad_norm": 15.604577624570162, "learning_rate": 3.720050057902495e-07, "logits/chosen": -2.4678874015808105, "logits/rejected": -2.166454553604126, "logps/chosen": -664.3575439453125, "logps/rejected": -1184.0172119140625, "loss": 0.2733, "rewards/accuracies": 0.84375, "rewards/chosen": -4.809238910675049, "rewards/margins": 4.560500144958496, "rewards/rejected": -9.369739532470703, "step": 205 }, { "epoch": 0.41399704287826516, "grad_norm": 16.104577186140947, "learning_rate": 3.644174353789204e-07, "logits/chosen": -2.470492124557495, "logits/rejected": -2.2408156394958496, "logps/chosen": -702.6835327148438, "logps/rejected": -1184.7685546875, "loss": 0.24, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.020668983459473, "rewards/margins": 4.07871150970459, "rewards/rejected": -9.099380493164062, "step": 210 }, { "epoch": 0.42385411532774764, "grad_norm": 42.39094832845099, "learning_rate": 3.566941088741009e-07, "logits/chosen": -2.465122699737549, "logits/rejected": -2.202960968017578, "logps/chosen": -784.384765625, "logps/rejected": -1312.956298828125, "loss": 0.2914, "rewards/accuracies": 0.8687499761581421, "rewards/chosen": -5.859043598175049, "rewards/margins": 4.713334083557129, "rewards/rejected": -10.572378158569336, "step": 215 }, { "epoch": 0.43371118777723017, "grad_norm": 20.944458558373373, "learning_rate": 3.488441899896217e-07, "logits/chosen": -2.487208843231201, "logits/rejected": -2.197640895843506, "logps/chosen": -729.4404296875, "logps/rejected": -1207.813232421875, "loss": 0.2843, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.363978862762451, "rewards/margins": 4.154356956481934, "rewards/rejected": -9.518336296081543, "step": 220 }, { "epoch": 0.44356826022671264, "grad_norm": 14.863684470935617, "learning_rate": 3.408769926409574e-07, "logits/chosen": -2.4418163299560547, "logits/rejected": -2.1561474800109863, "logps/chosen": -578.9898071289062, "logps/rejected": -913.1700439453125, "loss": 0.2547, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -3.821843385696411, "rewards/margins": 2.9205775260925293, "rewards/rejected": -6.742421627044678, "step": 225 }, { "epoch": 0.45342533267619517, "grad_norm": 17.477623835490277, "learning_rate": 3.3280196989428263e-07, "logits/chosen": -2.4349989891052246, "logits/rejected": -2.196359634399414, "logps/chosen": -682.2156982421875, "logps/rejected": -1149.534423828125, "loss": 0.2754, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -4.879435062408447, "rewards/margins": 3.9676411151885986, "rewards/rejected": -8.847076416015625, "step": 230 }, { "epoch": 0.4632824051256777, "grad_norm": 16.30612813668589, "learning_rate": 3.2462870275042367e-07, "logits/chosen": -2.5115764141082764, "logits/rejected": -2.3011107444763184, "logps/chosen": -744.306396484375, "logps/rejected": -1183.502685546875, "loss": 0.2276, "rewards/accuracies": 0.90625, "rewards/chosen": -5.445645332336426, "rewards/margins": 3.8912956714630127, "rewards/rejected": -9.33694076538086, "step": 235 }, { "epoch": 0.47313947757516017, "grad_norm": 18.35946878564802, "learning_rate": 3.1636688877701806e-07, "logits/chosen": -2.5281643867492676, "logits/rejected": -2.2399466037750244, "logps/chosen": -777.9661865234375, "logps/rejected": -1258.2623291015625, "loss": 0.2537, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.761153221130371, "rewards/margins": 4.262064456939697, "rewards/rejected": -10.023218154907227, "step": 240 }, { "epoch": 0.4829965500246427, "grad_norm": 22.711594265033526, "learning_rate": 3.080263306023669e-07, "logits/chosen": -2.43805193901062, "logits/rejected": -2.136569023132324, "logps/chosen": -738.892578125, "logps/rejected": -1253.740966796875, "loss": 0.2465, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.394611358642578, "rewards/margins": 4.49846076965332, "rewards/rejected": -9.893071174621582, "step": 245 }, { "epoch": 0.4928536224741252, "grad_norm": 22.523084393015623, "learning_rate": 2.996169242846328e-07, "logits/chosen": -2.456860065460205, "logits/rejected": -2.1488893032073975, "logps/chosen": -664.892822265625, "logps/rejected": -1094.06640625, "loss": 0.2643, "rewards/accuracies": 0.90625, "rewards/chosen": -4.737056732177734, "rewards/margins": 3.7822394371032715, "rewards/rejected": -8.519296646118164, "step": 250 }, { "epoch": 0.5027106949236076, "grad_norm": 21.85050975494629, "learning_rate": 2.911486475701835e-07, "logits/chosen": -2.3711659908294678, "logits/rejected": -2.104147434234619, "logps/chosen": -632.7847900390625, "logps/rejected": -1041.4219970703125, "loss": 0.2848, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -4.385097503662109, "rewards/margins": 3.560230255126953, "rewards/rejected": -7.9453277587890625, "step": 255 }, { "epoch": 0.5125677673730902, "grad_norm": 17.27564046380349, "learning_rate": 2.826315480550129e-07, "logits/chosen": -2.326019763946533, "logits/rejected": -2.0808887481689453, "logps/chosen": -590.845458984375, "logps/rejected": -1011.6871337890625, "loss": 0.2489, "rewards/accuracies": 0.90625, "rewards/chosen": -3.986447811126709, "rewards/margins": 3.542587995529175, "rewards/rejected": -7.529036045074463, "step": 260 }, { "epoch": 0.5224248398225727, "grad_norm": 19.212069446863243, "learning_rate": 2.740757312632854e-07, "logits/chosen": -2.414062261581421, "logits/rejected": -2.197702646255493, "logps/chosen": -744.2257080078125, "logps/rejected": -1205.133544921875, "loss": 0.2221, "rewards/accuracies": 0.875, "rewards/chosen": -5.464824676513672, "rewards/margins": 4.082161903381348, "rewards/rejected": -9.54698657989502, "step": 265 }, { "epoch": 0.5322819122720552, "grad_norm": 16.242036970306053, "learning_rate": 2.654913486571487e-07, "logits/chosen": -2.5215845108032227, "logits/rejected": -2.260974168777466, "logps/chosen": -794.4285888671875, "logps/rejected": -1301.2264404296875, "loss": 0.3103, "rewards/accuracies": 0.875, "rewards/chosen": -5.997513294219971, "rewards/margins": 4.493828773498535, "rewards/rejected": -10.491341590881348, "step": 270 }, { "epoch": 0.5421389847215377, "grad_norm": 14.856129841637888, "learning_rate": 2.5688858559204053e-07, "logits/chosen": -2.406275987625122, "logits/rejected": -2.168721914291382, "logps/chosen": -711.0574951171875, "logps/rejected": -1197.390380859375, "loss": 0.2365, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.203823566436768, "rewards/margins": 4.352889060974121, "rewards/rejected": -9.55671215057373, "step": 275 }, { "epoch": 0.5519960571710202, "grad_norm": 15.78620841198885, "learning_rate": 2.4827764923178246e-07, "logits/chosen": -2.5056891441345215, "logits/rejected": -2.270139694213867, "logps/chosen": -740.6078491210938, "logps/rejected": -1241.2222900390625, "loss": 0.2371, "rewards/accuracies": 0.9375, "rewards/chosen": -5.419187068939209, "rewards/margins": 4.417618751525879, "rewards/rejected": -9.83680534362793, "step": 280 }, { "epoch": 0.5618531296205027, "grad_norm": 17.093071523621635, "learning_rate": 2.3966875643779667e-07, "logits/chosen": -2.443941593170166, "logits/rejected": -2.2383456230163574, "logps/chosen": -725.4220581054688, "logps/rejected": -1253.807373046875, "loss": 0.2179, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.367009162902832, "rewards/margins": 4.663185119628906, "rewards/rejected": -10.030195236206055, "step": 285 }, { "epoch": 0.5717102020699852, "grad_norm": 25.373557062497504, "learning_rate": 2.3107212164681774e-07, "logits/chosen": -2.5970406532287598, "logits/rejected": -2.2234134674072266, "logps/chosen": -703.3094482421875, "logps/rejected": -1330.679931640625, "loss": 0.2351, "rewards/accuracies": 0.90625, "rewards/chosen": -5.1501851081848145, "rewards/margins": 5.526017665863037, "rewards/rejected": -10.676202774047852, "step": 290 }, { "epoch": 0.5815672745194678, "grad_norm": 39.54586447558642, "learning_rate": 2.2249794475148019e-07, "logits/chosen": -2.508376359939575, "logits/rejected": -2.293508768081665, "logps/chosen": -826.1845703125, "logps/rejected": -1299.66845703125, "loss": 0.2529, "rewards/accuracies": 0.831250011920929, "rewards/chosen": -6.255187034606934, "rewards/margins": 4.241654872894287, "rewards/rejected": -10.496840476989746, "step": 295 }, { "epoch": 0.5914243469689502, "grad_norm": 21.452851323361823, "learning_rate": 2.1395639899816332e-07, "logits/chosen": -2.580679416656494, "logits/rejected": -2.2998709678649902, "logps/chosen": -733.3718872070312, "logps/rejected": -1240.96533203125, "loss": 0.229, "rewards/accuracies": 0.875, "rewards/chosen": -5.495171070098877, "rewards/margins": 4.4129509925842285, "rewards/rejected": -9.908121109008789, "step": 300 }, { "epoch": 0.6012814194184327, "grad_norm": 16.276691413135083, "learning_rate": 2.0545761891645177e-07, "logits/chosen": -2.456111431121826, "logits/rejected": -2.243847608566284, "logps/chosen": -683.7113037109375, "logps/rejected": -1161.4580078125, "loss": 0.2226, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -4.874439239501953, "rewards/margins": 4.157763957977295, "rewards/rejected": -9.032203674316406, "step": 305 }, { "epoch": 0.6111384918679152, "grad_norm": 24.661935948628066, "learning_rate": 1.9701168829453305e-07, "logits/chosen": -2.6442089080810547, "logits/rejected": -2.186643600463867, "logps/chosen": -696.6754150390625, "logps/rejected": -1296.2703857421875, "loss": 0.2543, "rewards/accuracies": 0.875, "rewards/chosen": -5.091025352478027, "rewards/margins": 5.146512031555176, "rewards/rejected": -10.237536430358887, "step": 310 }, { "epoch": 0.6209955643173978, "grad_norm": 14.790443951524152, "learning_rate": 1.886286282148002e-07, "logits/chosen": -2.444152355194092, "logits/rejected": -2.1477932929992676, "logps/chosen": -734.7506713867188, "logps/rejected": -1255.331787109375, "loss": 0.2501, "rewards/accuracies": 0.90625, "rewards/chosen": -5.458280563354492, "rewards/margins": 4.611725807189941, "rewards/rejected": -10.070005416870117, "step": 315 }, { "epoch": 0.6308526367668802, "grad_norm": 16.271316784779167, "learning_rate": 1.8031838516385422e-07, "logits/chosen": -2.369560718536377, "logits/rejected": -2.1628785133361816, "logps/chosen": -670.2017822265625, "logps/rejected": -1158.4266357421875, "loss": 0.2539, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.8326568603515625, "rewards/margins": 4.299530982971191, "rewards/rejected": -9.132187843322754, "step": 320 }, { "epoch": 0.6407097092163627, "grad_norm": 17.744715641719303, "learning_rate": 1.7209081923101472e-07, "logits/chosen": -2.6445670127868652, "logits/rejected": -2.266472578048706, "logps/chosen": -690.3375854492188, "logps/rejected": -1188.698974609375, "loss": 0.205, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -4.986203193664551, "rewards/margins": 4.211085319519043, "rewards/rejected": -9.19728946685791, "step": 325 }, { "epoch": 0.6505667816658453, "grad_norm": 21.850943779213573, "learning_rate": 1.639556924093404e-07, "logits/chosen": -2.358119249343872, "logits/rejected": -2.1153066158294678, "logps/chosen": -764.6770629882812, "logps/rejected": -1240.2838134765625, "loss": 0.2799, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.743839263916016, "rewards/margins": 4.227473735809326, "rewards/rejected": -9.971312522888184, "step": 330 }, { "epoch": 0.6604238541153278, "grad_norm": 34.1719257406542, "learning_rate": 1.5592265701304114e-07, "logits/chosen": -2.375866413116455, "logits/rejected": -2.240598678588867, "logps/chosen": -763.527099609375, "logps/rejected": -1258.715576171875, "loss": 0.2564, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.7242560386657715, "rewards/margins": 4.458041667938232, "rewards/rejected": -10.18229866027832, "step": 335 }, { "epoch": 0.6702809265648102, "grad_norm": 18.71419612814259, "learning_rate": 1.4800124422502334e-07, "logits/chosen": -2.519636631011963, "logits/rejected": -2.2316250801086426, "logps/chosen": -762.00439453125, "logps/rejected": -1267.931396484375, "loss": 0.2514, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.641887664794922, "rewards/margins": 4.440293312072754, "rewards/rejected": -10.08218002319336, "step": 340 }, { "epoch": 0.6801379990142927, "grad_norm": 18.664999037354942, "learning_rate": 1.4020085278815743e-07, "logits/chosen": -2.458855628967285, "logits/rejected": -2.2174124717712402, "logps/chosen": -758.8146362304688, "logps/rejected": -1192.955322265625, "loss": 0.2308, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.576407432556152, "rewards/margins": 3.762500762939453, "rewards/rejected": -9.338907241821289, "step": 345 }, { "epoch": 0.6899950714637753, "grad_norm": 19.00593669045522, "learning_rate": 1.3253073785368545e-07, "logits/chosen": -2.4038822650909424, "logits/rejected": -2.114386796951294, "logps/chosen": -628.0557250976562, "logps/rejected": -1128.370849609375, "loss": 0.2821, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -4.437934875488281, "rewards/margins": 4.355624198913574, "rewards/rejected": -8.793559074401855, "step": 350 }, { "epoch": 0.6998521439132578, "grad_norm": 13.437480532125694, "learning_rate": 1.2500000000000005e-07, "logits/chosen": -2.367783308029175, "logits/rejected": -2.1017680168151855, "logps/chosen": -647.2433471679688, "logps/rejected": -1109.231689453125, "loss": 0.2264, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -4.586766242980957, "rewards/margins": 4.0332794189453125, "rewards/rejected": -8.62004566192627, "step": 355 }, { "epoch": 0.7097092163627403, "grad_norm": 23.214131611033924, "learning_rate": 1.1761757443482285e-07, "logits/chosen": -2.4149296283721924, "logits/rejected": -2.0817036628723145, "logps/chosen": -711.5889892578125, "logps/rejected": -1216.048095703125, "loss": 0.2471, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.251239776611328, "rewards/margins": 4.419920921325684, "rewards/rejected": -9.671161651611328, "step": 360 }, { "epoch": 0.7195662888122227, "grad_norm": 21.14502188501099, "learning_rate": 1.1039222039359644e-07, "logits/chosen": -2.5779356956481934, "logits/rejected": -2.228896141052246, "logps/chosen": -739.5020751953125, "logps/rejected": -1144.7041015625, "loss": 0.2331, "rewards/accuracies": 0.90625, "rewards/chosen": -5.435536861419678, "rewards/margins": 3.5643341541290283, "rewards/rejected": -8.999870300292969, "step": 365 }, { "epoch": 0.7294233612617053, "grad_norm": 24.352395974541345, "learning_rate": 1.0333251074666608e-07, "logits/chosen": -2.4502475261688232, "logits/rejected": -2.300096035003662, "logps/chosen": -781.7764282226562, "logps/rejected": -1243.557373046875, "loss": 0.2034, "rewards/accuracies": 0.925000011920929, "rewards/chosen": -5.88522481918335, "rewards/margins": 4.11319637298584, "rewards/rejected": -9.998421669006348, "step": 370 }, { "epoch": 0.7392804337111878, "grad_norm": 12.733858279084933, "learning_rate": 9.644682182758304e-08, "logits/chosen": -2.5493714809417725, "logits/rejected": -2.2471814155578613, "logps/chosen": -801.8941650390625, "logps/rejected": -1274.67529296875, "loss": 0.2314, "rewards/accuracies": 0.9375, "rewards/chosen": -6.090977668762207, "rewards/margins": 4.2742109298706055, "rewards/rejected": -10.365188598632812, "step": 375 }, { "epoch": 0.7491375061606703, "grad_norm": 30.453462939771114, "learning_rate": 8.974332349459992e-08, "logits/chosen": -2.3520667552948, "logits/rejected": -2.144470691680908, "logps/chosen": -808.9397583007812, "logps/rejected": -1302.123291015625, "loss": 0.2251, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -6.170880317687988, "rewards/margins": 4.420655727386475, "rewards/rejected": -10.591535568237305, "step": 380 }, { "epoch": 0.7589945786101527, "grad_norm": 25.800864953205974, "learning_rate": 8.322996943714672e-08, "logits/chosen": -2.4617538452148438, "logits/rejected": -2.170855760574341, "logps/chosen": -752.3043212890625, "logps/rejected": -1303.364013671875, "loss": 0.2474, "rewards/accuracies": 0.9125000238418579, "rewards/chosen": -5.609736919403076, "rewards/margins": 4.878790378570557, "rewards/rejected": -10.488527297973633, "step": 385 }, { "epoch": 0.7688516510596353, "grad_norm": 20.367448051714003, "learning_rate": 7.691448773879256e-08, "logits/chosen": -2.631474018096924, "logits/rejected": -2.1774039268493652, "logps/chosen": -788.4654541015625, "logps/rejected": -1410.991455078125, "loss": 0.219, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.944725036621094, "rewards/margins": 5.383862495422363, "rewards/rejected": -11.328587532043457, "step": 390 }, { "epoch": 0.7787087235091178, "grad_norm": 22.316993245593054, "learning_rate": 7.080437170788722e-08, "logits/chosen": -2.5305416584014893, "logits/rejected": -2.246816873550415, "logps/chosen": -782.3768310546875, "logps/rejected": -1279.210693359375, "loss": 0.267, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.886050701141357, "rewards/margins": 4.281658172607422, "rewards/rejected": -10.167708396911621, "step": 395 }, { "epoch": 0.7885657959586003, "grad_norm": 25.32693997023262, "learning_rate": 6.490687098676332e-08, "logits/chosen": -2.4314379692077637, "logits/rejected": -2.1938157081604004, "logps/chosen": -747.9923095703125, "logps/rejected": -1171.9027099609375, "loss": 0.2606, "rewards/accuracies": 0.893750011920929, "rewards/chosen": -5.560776710510254, "rewards/margins": 3.7204151153564453, "rewards/rejected": -9.2811918258667, "step": 400 }, { "epoch": 0.7885657959586003, "eval_logits/chosen": -2.784451484680176, "eval_logits/rejected": -2.6733083724975586, "eval_logps/chosen": -513.8394165039062, "eval_logps/rejected": -600.927978515625, "eval_loss": 0.5123496651649475, "eval_rewards/accuracies": 0.7782257795333862, "eval_rewards/chosen": -2.5094728469848633, "eval_rewards/margins": 0.760833203792572, "eval_rewards/rejected": -3.27030611038208, "eval_runtime": 327.294, "eval_samples_per_second": 6.037, "eval_steps_per_second": 0.379, "step": 400 }, { "epoch": 0.7984228684080829, "grad_norm": 16.407923464923826, "learning_rate": 5.9228982950048414e-08, "logits/chosen": -2.4307689666748047, "logits/rejected": -2.029819965362549, "logps/chosen": -701.4022827148438, "logps/rejected": -1147.330322265625, "loss": 0.227, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.128365993499756, "rewards/margins": 3.8555781841278076, "rewards/rejected": -8.9839448928833, "step": 405 }, { "epoch": 0.8082799408575653, "grad_norm": 23.480190565228476, "learning_rate": 5.3777444402291345e-08, "logits/chosen": -2.4188990592956543, "logits/rejected": -2.1691110134124756, "logps/chosen": -730.129638671875, "logps/rejected": -1147.34521484375, "loss": 0.2563, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.437844753265381, "rewards/margins": 3.608722686767578, "rewards/rejected": -9.0465669631958, "step": 410 }, { "epoch": 0.8181370133070478, "grad_norm": 14.849124963520776, "learning_rate": 4.855872358475546e-08, "logits/chosen": -2.4617886543273926, "logits/rejected": -2.174734592437744, "logps/chosen": -733.2481079101562, "logps/rejected": -1173.011962890625, "loss": 0.2203, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.399328708648682, "rewards/margins": 3.8578476905822754, "rewards/rejected": -9.257177352905273, "step": 415 }, { "epoch": 0.8279940857565303, "grad_norm": 14.728191064922497, "learning_rate": 4.357901250086107e-08, "logits/chosen": -2.605170488357544, "logits/rejected": -2.1935315132141113, "logps/chosen": -696.6317749023438, "logps/rejected": -1134.62890625, "loss": 0.2169, "rewards/accuracies": 0.875, "rewards/chosen": -5.047616481781006, "rewards/margins": 3.7440898418426514, "rewards/rejected": -8.791706085205078, "step": 420 }, { "epoch": 0.8378511582060129, "grad_norm": 21.656855077862126, "learning_rate": 3.884421956938377e-08, "logits/chosen": -2.443837881088257, "logits/rejected": -2.016244649887085, "logps/chosen": -734.0977783203125, "logps/rejected": -1261.663818359375, "loss": 0.2601, "rewards/accuracies": 0.9375, "rewards/chosen": -5.3997087478637695, "rewards/margins": 4.594438552856445, "rewards/rejected": -9.994146347045898, "step": 425 }, { "epoch": 0.8477082306554953, "grad_norm": 22.361665031765803, "learning_rate": 3.435996261412591e-08, "logits/chosen": -2.4327638149261475, "logits/rejected": -2.148250102996826, "logps/chosen": -736.1185913085938, "logps/rejected": -1234.297607421875, "loss": 0.2572, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.458459854125977, "rewards/margins": 4.413580894470215, "rewards/rejected": -9.872041702270508, "step": 430 }, { "epoch": 0.8575653031049778, "grad_norm": 34.61319327349877, "learning_rate": 3.013156219837776e-08, "logits/chosen": -2.418109655380249, "logits/rejected": -2.1507232189178467, "logps/chosen": -754.8319091796875, "logps/rejected": -1258.600830078125, "loss": 0.2446, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.640649795532227, "rewards/margins": 4.45644998550415, "rewards/rejected": -10.097099304199219, "step": 435 }, { "epoch": 0.8674223755544603, "grad_norm": 17.275605835829435, "learning_rate": 2.6164035312078447e-08, "logits/chosen": -2.610421657562256, "logits/rejected": -2.200122356414795, "logps/chosen": -775.4208984375, "logps/rejected": -1280.5159912109375, "loss": 0.2162, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.812338829040527, "rewards/margins": 4.107577800750732, "rewards/rejected": -9.919916152954102, "step": 440 }, { "epoch": 0.8772794480039429, "grad_norm": 19.69767066811732, "learning_rate": 2.2462089419165776e-08, "logits/chosen": -2.454554319381714, "logits/rejected": -2.129283905029297, "logps/chosen": -772.8093872070312, "logps/rejected": -1197.3636474609375, "loss": 0.2623, "rewards/accuracies": 0.862500011920929, "rewards/chosen": -5.787473201751709, "rewards/margins": 3.7656428813934326, "rewards/rejected": -9.553116798400879, "step": 445 }, { "epoch": 0.8871365204534253, "grad_norm": 16.68762717345223, "learning_rate": 1.9030116872178314e-08, "logits/chosen": -2.3323419094085693, "logits/rejected": -2.1572489738464355, "logps/chosen": -730.7281494140625, "logps/rejected": -1192.001220703125, "loss": 0.2348, "rewards/accuracies": 0.875, "rewards/chosen": -5.397282600402832, "rewards/margins": 4.056326866149902, "rewards/rejected": -9.453609466552734, "step": 450 }, { "epoch": 0.8969935929029078, "grad_norm": 20.379506675051896, "learning_rate": 1.5872189700736337e-08, "logits/chosen": -2.3889849185943604, "logits/rejected": -2.237183094024658, "logps/chosen": -754.2752075195312, "logps/rejected": -1230.703125, "loss": 0.1963, "rewards/accuracies": 0.9375, "rewards/chosen": -5.620961666107178, "rewards/margins": 4.211625099182129, "rewards/rejected": -9.832587242126465, "step": 455 }, { "epoch": 0.9068506653523903, "grad_norm": 17.181830991927878, "learning_rate": 1.2992054780085692e-08, "logits/chosen": -2.495082139968872, "logits/rejected": -2.1834959983825684, "logps/chosen": -710.3396606445312, "logps/rejected": -1224.693603515625, "loss": 0.2459, "rewards/accuracies": 0.90625, "rewards/chosen": -5.253153324127197, "rewards/margins": 4.523493766784668, "rewards/rejected": -9.776647567749023, "step": 460 }, { "epoch": 0.9167077378018729, "grad_norm": 17.458158491525015, "learning_rate": 1.0393129385436823e-08, "logits/chosen": -2.5279664993286133, "logits/rejected": -2.2738842964172363, "logps/chosen": -760.11962890625, "logps/rejected": -1247.168212890625, "loss": 0.2254, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -5.765726566314697, "rewards/margins": 4.349237442016602, "rewards/rejected": -10.11496353149414, "step": 465 }, { "epoch": 0.9265648102513554, "grad_norm": 22.242802721359375, "learning_rate": 8.078497137373242e-09, "logits/chosen": -2.6163723468780518, "logits/rejected": -2.2263712882995605, "logps/chosen": -774.3194580078125, "logps/rejected": -1314.293212890625, "loss": 0.2375, "rewards/accuracies": 0.8999999761581421, "rewards/chosen": -5.78206205368042, "rewards/margins": 4.660614490509033, "rewards/rejected": -10.442676544189453, "step": 470 }, { "epoch": 0.9364218827008378, "grad_norm": 17.50981352291082, "learning_rate": 6.0509043431410945e-09, "logits/chosen": -2.4221930503845215, "logits/rejected": -2.220930814743042, "logps/chosen": -804.204345703125, "logps/rejected": -1225.1212158203125, "loss": 0.2554, "rewards/accuracies": 0.9312499761581421, "rewards/chosen": -6.063734531402588, "rewards/margins": 3.7845940589904785, "rewards/rejected": -9.848328590393066, "step": 475 }, { "epoch": 0.9462789551503203, "grad_norm": 19.77591605111257, "learning_rate": 4.312756738160145e-09, "logits/chosen": -2.5149528980255127, "logits/rejected": -2.154731512069702, "logps/chosen": -768.4055786132812, "logps/rejected": -1274.7755126953125, "loss": 0.2458, "rewards/accuracies": 0.887499988079071, "rewards/chosen": -5.735711574554443, "rewards/margins": 4.444643974304199, "rewards/rejected": -10.180355072021484, "step": 480 }, { "epoch": 0.9561360275998029, "grad_norm": 16.377470184235065, "learning_rate": 2.8661166316229223e-09, "logits/chosen": -2.3629002571105957, "logits/rejected": -2.151808738708496, "logps/chosen": -723.3502197265625, "logps/rejected": -1148.914794921875, "loss": 0.2446, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.385800361633301, "rewards/margins": 3.7826755046844482, "rewards/rejected": -9.168476104736328, "step": 485 }, { "epoch": 0.9659931000492854, "grad_norm": 15.391953903269371, "learning_rate": 1.7127004595681727e-09, "logits/chosen": -2.5323967933654785, "logits/rejected": -2.1350226402282715, "logps/chosen": -750.980712890625, "logps/rejected": -1350.3404541015625, "loss": 0.2446, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.573115348815918, "rewards/margins": 5.145482063293457, "rewards/rejected": -10.718597412109375, "step": 490 }, { "epoch": 0.9758501724987678, "grad_norm": 16.723832751333937, "learning_rate": 8.538767483325383e-10, "logits/chosen": -2.532517433166504, "logits/rejected": -2.180654525756836, "logps/chosen": -762.9144287109375, "logps/rejected": -1284.332763671875, "loss": 0.2173, "rewards/accuracies": 0.918749988079071, "rewards/chosen": -5.670698165893555, "rewards/margins": 4.554699897766113, "rewards/rejected": -10.225398063659668, "step": 495 }, { "epoch": 0.9857072449482503, "grad_norm": 26.735630621648028, "learning_rate": 2.9066449079634404e-10, "logits/chosen": -2.5005085468292236, "logits/rejected": -2.2136847972869873, "logps/chosen": -738.4940185546875, "logps/rejected": -1214.4185791015625, "loss": 0.2165, "rewards/accuracies": 0.9375, "rewards/chosen": -5.457156658172607, "rewards/margins": 4.181014060974121, "rewards/rejected": -9.63817024230957, "step": 500 }, { "epoch": 0.9955643173977329, "grad_norm": 17.938477728126337, "learning_rate": 2.3731937350224273e-11, "logits/chosen": -2.449402093887329, "logits/rejected": -2.1034648418426514, "logps/chosen": -783.5701293945312, "logps/rejected": -1268.8685302734375, "loss": 0.2476, "rewards/accuracies": 0.8812500238418579, "rewards/chosen": -5.870804786682129, "rewards/margins": 4.206555366516113, "rewards/rejected": -10.077360153198242, "step": 505 }, { "epoch": 0.9995071463775259, "step": 507, "total_flos": 0.0, "train_loss": 0.30356378627011527, "train_runtime": 18867.8748, "train_samples_per_second": 3.441, "train_steps_per_second": 0.027 } ], "logging_steps": 5, "max_steps": 507, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }