{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 684, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 1.972968290218204, "learning_rate": 7.246376811594203e-09, "logits/chosen": -2.8746490478515625, "logits/rejected": -2.840811252593994, "logps/chosen": -227.73272705078125, "logps/rejected": -174.71890258789062, "loss": 0.6931, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.01, "grad_norm": 1.8679339461073734, "learning_rate": 7.246376811594203e-08, "logits/chosen": -2.769803047180176, "logits/rejected": -2.7261266708374023, "logps/chosen": -244.6114959716797, "logps/rejected": -240.41116333007812, "loss": 0.6931, "rewards/accuracies": 0.3611111044883728, "rewards/chosen": 0.00021203850337769836, "rewards/margins": 0.00021335652854759246, "rewards/margins_max": 0.0021964015904814005, "rewards/margins_min": -0.0016354921972379088, "rewards/margins_std": 0.0016972733428701758, "rewards/rejected": -1.318060753874306e-06, "step": 10 }, { "epoch": 0.03, "grad_norm": 2.1095097251535906, "learning_rate": 1.4492753623188405e-07, "logits/chosen": -2.8978943824768066, "logits/rejected": -2.859205961227417, "logps/chosen": -317.23260498046875, "logps/rejected": -239.8469696044922, "loss": 0.6931, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.00018418056424707174, "rewards/margins": 9.037666313815862e-05, "rewards/margins_max": 0.003943216986954212, "rewards/margins_min": -0.003490231465548277, "rewards/margins_std": 0.0033192276023328304, "rewards/rejected": 9.380385745316744e-05, "step": 20 }, { "epoch": 0.04, "grad_norm": 2.08467778375872, "learning_rate": 2.1739130434782607e-07, "logits/chosen": -2.811413288116455, "logits/rejected": -2.7905545234680176, "logps/chosen": -264.98712158203125, "logps/rejected": -223.62734985351562, "loss": 0.693, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.00012606059317477047, "rewards/margins": -7.177007319114637e-06, "rewards/margins_max": 0.0028896895237267017, "rewards/margins_min": -0.0032867384143173695, "rewards/margins_std": 0.0027837478555738926, "rewards/rejected": 0.000133237597765401, "step": 30 }, { "epoch": 0.06, "grad_norm": 1.940398970016681, "learning_rate": 2.898550724637681e-07, "logits/chosen": -2.8652420043945312, "logits/rejected": -2.83544659614563, "logps/chosen": -305.147216796875, "logps/rejected": -267.1144714355469, "loss": 0.6928, "rewards/accuracies": 0.5625, "rewards/chosen": 0.0004788968653883785, "rewards/margins": 0.0008589730714447796, "rewards/margins_max": 0.0041023739613592625, "rewards/margins_min": -0.002046389738097787, "rewards/margins_std": 0.0028562676161527634, "rewards/rejected": -0.0003800761769525707, "step": 40 }, { "epoch": 0.07, "grad_norm": 1.6929010917265717, "learning_rate": 3.6231884057971015e-07, "logits/chosen": -2.8695337772369385, "logits/rejected": -2.837745189666748, "logps/chosen": -260.9767150878906, "logps/rejected": -235.2560577392578, "loss": 0.6924, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0011892963666468859, "rewards/margins": 0.0010751936351880431, "rewards/margins_max": 0.004216945730149746, "rewards/margins_min": -0.002206298988312483, "rewards/margins_std": 0.0029160729609429836, "rewards/rejected": 0.00011410261504352093, "step": 50 }, { "epoch": 0.09, "grad_norm": 1.6771532679917263, "learning_rate": 4.3478260869565214e-07, "logits/chosen": -2.8912997245788574, "logits/rejected": -2.8439412117004395, "logps/chosen": -281.3103942871094, "logps/rejected": -248.8180389404297, "loss": 0.6919, "rewards/accuracies": 0.75, "rewards/chosen": 0.0023077281657606363, "rewards/margins": 0.0024695510510355234, "rewards/margins_max": 0.006759033538401127, "rewards/margins_min": -0.0015218419721350074, "rewards/margins_std": 0.00368516705930233, "rewards/rejected": -0.0001618233509361744, "step": 60 }, { "epoch": 0.1, "grad_norm": 1.9158671065853772, "learning_rate": 4.999967381905813e-07, "logits/chosen": -2.7889466285705566, "logits/rejected": -2.7540149688720703, "logps/chosen": -290.43804931640625, "logps/rejected": -212.5797882080078, "loss": 0.691, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.003653948428109288, "rewards/margins": 0.004532678984105587, "rewards/margins_max": 0.0123423608019948, "rewards/margins_min": -0.001225766958668828, "rewards/margins_std": 0.0061333803460001945, "rewards/rejected": -0.000878730439580977, "step": 70 }, { "epoch": 0.12, "grad_norm": 1.71294323830613, "learning_rate": 4.996054240392509e-07, "logits/chosen": -2.777132749557495, "logits/rejected": -2.7576847076416016, "logps/chosen": -265.20281982421875, "logps/rejected": -243.9671173095703, "loss": 0.6907, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.003907615318894386, "rewards/margins": 0.003999439999461174, "rewards/margins_max": 0.012583956122398376, "rewards/margins_min": -0.0027496658731251955, "rewards/margins_std": 0.00690504303202033, "rewards/rejected": -9.18240548344329e-05, "step": 80 }, { "epoch": 0.13, "grad_norm": 2.4508716900973826, "learning_rate": 4.985629178361649e-07, "logits/chosen": -2.889789581298828, "logits/rejected": -2.8571457862854004, "logps/chosen": -273.2809753417969, "logps/rejected": -246.13528442382812, "loss": 0.6894, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.0063544102013111115, "rewards/margins": 0.007208968047052622, "rewards/margins_max": 0.018225526437163353, "rewards/margins_min": -0.0013279046397656202, "rewards/margins_std": 0.008873926475644112, "rewards/rejected": -0.0008545577293261886, "step": 90 }, { "epoch": 0.15, "grad_norm": 1.8386412495416258, "learning_rate": 4.968719393609756e-07, "logits/chosen": -2.8424429893493652, "logits/rejected": -2.7764132022857666, "logps/chosen": -337.29986572265625, "logps/rejected": -243.8036651611328, "loss": 0.6876, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.010191970504820347, "rewards/margins": 0.012033768929541111, "rewards/margins_max": 0.029753312468528748, "rewards/margins_min": -0.0020332676358520985, "rewards/margins_std": 0.014395820908248425, "rewards/rejected": -0.0018417991232126951, "step": 100 }, { "epoch": 0.15, "eval_logits/chosen": -2.8052334785461426, "eval_logits/rejected": -2.7664124965667725, "eval_logps/chosen": -284.0943603515625, "eval_logps/rejected": -258.4696960449219, "eval_loss": 0.691686749458313, "eval_rewards/accuracies": 0.6079999804496765, "eval_rewards/chosen": 0.004990490153431892, "eval_rewards/margins": 0.0038980983663350344, "eval_rewards/margins_max": 0.023511478677392006, "eval_rewards/margins_min": -0.012509307824075222, "eval_rewards/margins_std": 0.011862216517329216, "eval_rewards/rejected": 0.0010923919035121799, "eval_runtime": 444.4734, "eval_samples_per_second": 4.5, "eval_steps_per_second": 0.281, "step": 100 }, { "epoch": 0.16, "grad_norm": 1.9556267546868065, "learning_rate": 4.945369001834514e-07, "logits/chosen": -2.8761301040649414, "logits/rejected": -2.8216347694396973, "logps/chosen": -287.6670837402344, "logps/rejected": -223.6521453857422, "loss": 0.6881, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.009855161421000957, "rewards/margins": 0.010875609703361988, "rewards/margins_max": 0.027963850647211075, "rewards/margins_min": -0.00028645730344578624, "rewards/margins_std": 0.012777927331626415, "rewards/rejected": -0.0010204474674537778, "step": 110 }, { "epoch": 0.18, "grad_norm": 1.9579674466888082, "learning_rate": 4.915638921541951e-07, "logits/chosen": -2.8273448944091797, "logits/rejected": -2.8115439414978027, "logps/chosen": -257.2424011230469, "logps/rejected": -237.67904663085938, "loss": 0.6866, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.00945218000560999, "rewards/margins": 0.012427609413862228, "rewards/margins_max": 0.033436521887779236, "rewards/margins_min": -0.0018920926377177238, "rewards/margins_std": 0.015944166108965874, "rewards/rejected": -0.0029754305724054575, "step": 120 }, { "epoch": 0.19, "grad_norm": 1.5966946942385376, "learning_rate": 4.879606715117018e-07, "logits/chosen": -2.878997325897217, "logits/rejected": -2.8253443241119385, "logps/chosen": -275.1449890136719, "logps/rejected": -235.9576873779297, "loss": 0.6859, "rewards/accuracies": 0.75, "rewards/chosen": 0.00955723412334919, "rewards/margins": 0.012582411989569664, "rewards/margins_max": 0.03178320452570915, "rewards/margins_min": -0.004044829867780209, "rewards/margins_std": 0.0158962644636631, "rewards/rejected": -0.003025178564712405, "step": 130 }, { "epoch": 0.2, "grad_norm": 1.612892528929156, "learning_rate": 4.837366386472174e-07, "logits/chosen": -2.8901007175445557, "logits/rejected": -2.827146053314209, "logps/chosen": -297.2471618652344, "logps/rejected": -254.73318481445312, "loss": 0.686, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.015631090849637985, "rewards/margins": 0.017191508784890175, "rewards/margins_max": 0.04147082567214966, "rewards/margins_min": -0.0013556934427469969, "rewards/margins_std": 0.019727854058146477, "rewards/rejected": -0.0015604153741151094, "step": 140 }, { "epoch": 0.22, "grad_norm": 1.7836254409467616, "learning_rate": 4.789028135801918e-07, "logits/chosen": -2.847933769226074, "logits/rejected": -2.8365511894226074, "logps/chosen": -297.95208740234375, "logps/rejected": -272.38201904296875, "loss": 0.684, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.016203617677092552, "rewards/margins": 0.018767360597848892, "rewards/margins_max": 0.05341927334666252, "rewards/margins_min": -0.011627629399299622, "rewards/margins_std": 0.029866989701986313, "rewards/rejected": -0.0025637417566031218, "step": 150 }, { "epoch": 0.23, "grad_norm": 8.25250622221843, "learning_rate": 4.7347180720830627e-07, "logits/chosen": -2.867987632751465, "logits/rejected": -2.7911014556884766, "logps/chosen": -300.35626220703125, "logps/rejected": -261.74908447265625, "loss": 0.6847, "rewards/accuracies": 0.8125, "rewards/chosen": 0.016810361295938492, "rewards/margins": 0.01963501051068306, "rewards/margins_max": 0.05201994627714157, "rewards/margins_min": -0.0029959846287965775, "rewards/margins_std": 0.024577533826231956, "rewards/rejected": -0.002824649680405855, "step": 160 }, { "epoch": 0.25, "grad_norm": 1.8565831100637469, "learning_rate": 4.6745778840708107e-07, "logits/chosen": -2.7958624362945557, "logits/rejected": -2.783989429473877, "logps/chosen": -295.93438720703125, "logps/rejected": -227.43661499023438, "loss": 0.6779, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.024501120671629906, "rewards/margins": 0.02991095557808876, "rewards/margins_max": 0.06729420274496078, "rewards/margins_min": -0.003415555926039815, "rewards/margins_std": 0.03072194755077362, "rewards/rejected": -0.005409830249845982, "step": 170 }, { "epoch": 0.26, "grad_norm": 1.8914594463949175, "learning_rate": 4.6087644706489703e-07, "logits/chosen": -2.8270645141601562, "logits/rejected": -2.787436008453369, "logps/chosen": -294.5668640136719, "logps/rejected": -255.82962036132812, "loss": 0.6816, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.01939496397972107, "rewards/margins": 0.02427856996655464, "rewards/margins_max": 0.059351809322834015, "rewards/margins_min": -0.008079716935753822, "rewards/margins_std": 0.03051997348666191, "rewards/rejected": -0.004883607849478722, "step": 180 }, { "epoch": 0.28, "grad_norm": 1.724680638811993, "learning_rate": 4.537449531498687e-07, "logits/chosen": -2.767610788345337, "logits/rejected": -2.719372272491455, "logps/chosen": -281.36297607421875, "logps/rejected": -229.79306030273438, "loss": 0.6809, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.02440481074154377, "rewards/margins": 0.026266787201166153, "rewards/margins_max": 0.06888638436794281, "rewards/margins_min": -0.009829925373196602, "rewards/margins_std": 0.034937743097543716, "rewards/rejected": -0.0018619761103764176, "step": 190 }, { "epoch": 0.29, "grad_norm": 1.587562818110368, "learning_rate": 4.4608191191535736e-07, "logits/chosen": -2.869741201400757, "logits/rejected": -2.8258328437805176, "logps/chosen": -275.5715637207031, "logps/rejected": -244.32763671875, "loss": 0.6792, "rewards/accuracies": 0.75, "rewards/chosen": 0.02022332139313221, "rewards/margins": 0.026959722861647606, "rewards/margins_max": 0.07024272531270981, "rewards/margins_min": -0.013307643122971058, "rewards/margins_std": 0.037044934928417206, "rewards/rejected": -0.006736403796821833, "step": 200 }, { "epoch": 0.29, "eval_logits/chosen": -2.7954251766204834, "eval_logits/rejected": -2.7569642066955566, "eval_logps/chosen": -283.2934875488281, "eval_logps/rejected": -258.5861511230469, "eval_loss": 0.6882554888725281, "eval_rewards/accuracies": 0.6370000243186951, "eval_rewards/chosen": 0.012999121099710464, "eval_rewards/margins": 0.013071166351437569, "eval_rewards/margins_max": 0.07187327742576599, "eval_rewards/margins_min": -0.03631452098488808, "eval_rewards/margins_std": 0.035920411348342896, "eval_rewards/rejected": -7.204585563158616e-05, "eval_runtime": 453.7395, "eval_samples_per_second": 4.408, "eval_steps_per_second": 0.275, "step": 200 }, { "epoch": 0.31, "grad_norm": 1.7443006213577599, "learning_rate": 4.379073153609896e-07, "logits/chosen": -2.8446342945098877, "logits/rejected": -2.8066139221191406, "logps/chosen": -298.4417724609375, "logps/rejected": -260.84881591796875, "loss": 0.6787, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.028581559658050537, "rewards/margins": 0.03288479894399643, "rewards/margins_max": 0.08662258088588715, "rewards/margins_min": -0.013487100601196289, "rewards/margins_std": 0.04499204084277153, "rewards/rejected": -0.0043032425455749035, "step": 210 }, { "epoch": 0.32, "grad_norm": 1.7530104764635095, "learning_rate": 4.292424900758128e-07, "logits/chosen": -2.8034873008728027, "logits/rejected": -2.775111675262451, "logps/chosen": -279.15478515625, "logps/rejected": -223.8057403564453, "loss": 0.6778, "rewards/accuracies": 0.875, "rewards/chosen": 0.028650784865021706, "rewards/margins": 0.03896017372608185, "rewards/margins_max": 0.08817549049854279, "rewards/margins_min": -0.0010216787923127413, "rewards/margins_std": 0.04009110480546951, "rewards/rejected": -0.010309383273124695, "step": 220 }, { "epoch": 0.34, "grad_norm": 1.6934833415236739, "learning_rate": 4.201100415996597e-07, "logits/chosen": -2.7763991355895996, "logits/rejected": -2.7445781230926514, "logps/chosen": -263.20074462890625, "logps/rejected": -261.6719970703125, "loss": 0.6785, "rewards/accuracies": 0.75, "rewards/chosen": 0.030025389045476913, "rewards/margins": 0.031550828367471695, "rewards/margins_max": 0.09253410995006561, "rewards/margins_min": -0.01012241281569004, "rewards/margins_std": 0.04607797786593437, "rewards/rejected": -0.001525437692180276, "step": 230 }, { "epoch": 0.35, "grad_norm": 1.9254815601584274, "learning_rate": 4.1053379544787557e-07, "logits/chosen": -2.8438620567321777, "logits/rejected": -2.8013641834259033, "logps/chosen": -286.17901611328125, "logps/rejected": -249.131591796875, "loss": 0.6777, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.03425859659910202, "rewards/margins": 0.04053955897688866, "rewards/margins_max": 0.09026241302490234, "rewards/margins_min": -0.004998114425688982, "rewards/margins_std": 0.043195050209760666, "rewards/rejected": -0.006280961446464062, "step": 240 }, { "epoch": 0.37, "grad_norm": 1.773759451391351, "learning_rate": 4.0053873495326964e-07, "logits/chosen": -2.802912950515747, "logits/rejected": -2.7872378826141357, "logps/chosen": -243.8607177734375, "logps/rejected": -229.35678100585938, "loss": 0.674, "rewards/accuracies": 0.8125, "rewards/chosen": 0.026193741708993912, "rewards/margins": 0.03452175855636597, "rewards/margins_max": 0.08425874263048172, "rewards/margins_min": -0.007185367401689291, "rewards/margins_std": 0.04130570963025093, "rewards/rejected": -0.00832801777869463, "step": 250 }, { "epoch": 0.38, "grad_norm": 1.89562327580133, "learning_rate": 3.9015093608745143e-07, "logits/chosen": -2.8584208488464355, "logits/rejected": -2.7942659854888916, "logps/chosen": -283.54974365234375, "logps/rejected": -242.9059295654297, "loss": 0.6717, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.03558926284313202, "rewards/margins": 0.0444478802382946, "rewards/margins_max": 0.11205202341079712, "rewards/margins_min": -0.004177084192633629, "rewards/margins_std": 0.05269969254732132, "rewards/rejected": -0.008858618326485157, "step": 260 }, { "epoch": 0.39, "grad_norm": 2.30599597059098, "learning_rate": 3.79397499431599e-07, "logits/chosen": -2.8217787742614746, "logits/rejected": -2.7673909664154053, "logps/chosen": -291.3817138671875, "logps/rejected": -263.5201721191406, "loss": 0.672, "rewards/accuracies": 0.75, "rewards/chosen": 0.04040498286485672, "rewards/margins": 0.04550846666097641, "rewards/margins_max": 0.10735081136226654, "rewards/margins_min": -0.009063487872481346, "rewards/margins_std": 0.051175691187381744, "rewards/rejected": -0.00510348379611969, "step": 270 }, { "epoch": 0.41, "grad_norm": 1.6888761913438506, "learning_rate": 3.6830647947413694e-07, "logits/chosen": -2.88932728767395, "logits/rejected": -2.8418757915496826, "logps/chosen": -274.30218505859375, "logps/rejected": -248.6231231689453, "loss": 0.671, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.03000582382082939, "rewards/margins": 0.04024919122457504, "rewards/margins_max": 0.10416732728481293, "rewards/margins_min": -0.011732708662748337, "rewards/margins_std": 0.05244187265634537, "rewards/rejected": -0.0102433692663908, "step": 280 }, { "epoch": 0.42, "grad_norm": 1.6649865105801362, "learning_rate": 3.5690681141977837e-07, "logits/chosen": -2.822051525115967, "logits/rejected": -2.7766671180725098, "logps/chosen": -253.5765380859375, "logps/rejected": -206.1942901611328, "loss": 0.6729, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.031104344874620438, "rewards/margins": 0.04246490076184273, "rewards/margins_max": 0.12114210426807404, "rewards/margins_min": -0.018173199146986008, "rewards/margins_std": 0.06279204040765762, "rewards/rejected": -0.011360556818544865, "step": 290 }, { "epoch": 0.44, "grad_norm": 2.1922548116517397, "learning_rate": 3.4522823570088067e-07, "logits/chosen": -2.821171998977661, "logits/rejected": -2.808584451675415, "logps/chosen": -259.92913818359375, "logps/rejected": -254.540283203125, "loss": 0.6697, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.03274675831198692, "rewards/margins": 0.047333501279354095, "rewards/margins_max": 0.13525982201099396, "rewards/margins_min": -0.01479897927492857, "rewards/margins_std": 0.06737245619297028, "rewards/rejected": -0.01458674855530262, "step": 300 }, { "epoch": 0.44, "eval_logits/chosen": -2.786345958709717, "eval_logits/rejected": -2.7488794326782227, "eval_logps/chosen": -282.5645751953125, "eval_logps/rejected": -258.747314453125, "eval_loss": 0.6848979592323303, "eval_rewards/accuracies": 0.6169999837875366, "eval_rewards/chosen": 0.020288635045289993, "eval_rewards/margins": 0.02197239361703396, "eval_rewards/margins_max": 0.1184224858880043, "eval_rewards/margins_min": -0.059722770005464554, "eval_rewards/margins_std": 0.05925743281841278, "eval_rewards/rejected": -0.0016837569419294596, "eval_runtime": 440.5364, "eval_samples_per_second": 4.54, "eval_steps_per_second": 0.284, "step": 300 }, { "epoch": 0.45, "grad_norm": 1.884851957725623, "learning_rate": 3.3330122038805277e-07, "logits/chosen": -2.8432490825653076, "logits/rejected": -2.795261859893799, "logps/chosen": -272.4532470703125, "logps/rejected": -227.98348999023438, "loss": 0.6672, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.038232967257499695, "rewards/margins": 0.0511365607380867, "rewards/margins_max": 0.1235017329454422, "rewards/margins_min": -0.007081184536218643, "rewards/margins_std": 0.05993686243891716, "rewards/rejected": -0.012903591617941856, "step": 310 }, { "epoch": 0.47, "grad_norm": 1.732519789963906, "learning_rate": 3.2115688170243734e-07, "logits/chosen": -2.831395387649536, "logits/rejected": -2.8028111457824707, "logps/chosen": -270.9764709472656, "logps/rejected": -252.1440887451172, "loss": 0.6681, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.037659503519535065, "rewards/margins": 0.05491337180137634, "rewards/margins_max": 0.132755845785141, "rewards/margins_min": -0.010344445705413818, "rewards/margins_std": 0.06564854830503464, "rewards/rejected": -0.01725386641919613, "step": 320 }, { "epoch": 0.48, "grad_norm": 1.5242926911564552, "learning_rate": 3.088269028370435e-07, "logits/chosen": -2.8538591861724854, "logits/rejected": -2.8225364685058594, "logps/chosen": -295.54388427734375, "logps/rejected": -250.438232421875, "loss": 0.663, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.04337853938341141, "rewards/margins": 0.06583191454410553, "rewards/margins_max": 0.15647490322589874, "rewards/margins_min": -0.0003548143431544304, "rewards/margins_std": 0.07130275666713715, "rewards/rejected": -0.022453375160694122, "step": 330 }, { "epoch": 0.5, "grad_norm": 1.6819074997159351, "learning_rate": 2.9634345129891294e-07, "logits/chosen": -2.80195689201355, "logits/rejected": -2.765192985534668, "logps/chosen": -288.1705627441406, "logps/rejected": -258.40667724609375, "loss": 0.6647, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.04105386883020401, "rewards/margins": 0.06338892132043839, "rewards/margins_max": 0.1566300094127655, "rewards/margins_min": -0.008608223870396614, "rewards/margins_std": 0.0741976946592331, "rewards/rejected": -0.022335056215524673, "step": 340 }, { "epoch": 0.51, "grad_norm": 1.9298838156481428, "learning_rate": 2.8373909498776744e-07, "logits/chosen": -2.8505892753601074, "logits/rejected": -2.8334896564483643, "logps/chosen": -280.2142028808594, "logps/rejected": -279.97857666015625, "loss": 0.6686, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.03649730607867241, "rewards/margins": 0.05252464860677719, "rewards/margins_max": 0.12718316912651062, "rewards/margins_min": -0.00450880965217948, "rewards/margins_std": 0.061397988349199295, "rewards/rejected": -0.016027342528104782, "step": 350 }, { "epoch": 0.53, "grad_norm": 1.9549755637356359, "learning_rate": 2.710467172300768e-07, "logits/chosen": -2.756687879562378, "logits/rejected": -2.71925687789917, "logps/chosen": -318.5474548339844, "logps/rejected": -252.8844757080078, "loss": 0.6597, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.05029547959566116, "rewards/margins": 0.06931338459253311, "rewards/margins_max": 0.15269415080547333, "rewards/margins_min": 0.0053079272620379925, "rewards/margins_std": 0.06585155427455902, "rewards/rejected": -0.01901790127158165, "step": 360 }, { "epoch": 0.54, "grad_norm": 2.265857763501263, "learning_rate": 2.582994309902146e-07, "logits/chosen": -2.7980430126190186, "logits/rejected": -2.7501606941223145, "logps/chosen": -305.3988342285156, "logps/rejected": -258.4122314453125, "loss": 0.6649, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.04739305004477501, "rewards/margins": 0.06349317729473114, "rewards/margins_max": 0.17864832282066345, "rewards/margins_min": -0.03638879954814911, "rewards/margins_std": 0.09656090289354324, "rewards/rejected": -0.01610013097524643, "step": 370 }, { "epoch": 0.56, "grad_norm": 2.2730703155677743, "learning_rate": 2.455304924825151e-07, "logits/chosen": -2.813772678375244, "logits/rejected": -2.8081743717193604, "logps/chosen": -276.4326477050781, "logps/rejected": -260.1085510253906, "loss": 0.6546, "rewards/accuracies": 0.8125, "rewards/chosen": 0.044434770941734314, "rewards/margins": 0.07333989441394806, "rewards/margins_max": 0.16991741955280304, "rewards/margins_min": 0.007076957728713751, "rewards/margins_std": 0.07423131167888641, "rewards/rejected": -0.028905129060149193, "step": 380 }, { "epoch": 0.57, "grad_norm": 1.8322064736841432, "learning_rate": 2.3277321440960732e-07, "logits/chosen": -2.811725616455078, "logits/rejected": -2.7685177326202393, "logps/chosen": -268.05914306640625, "logps/rejected": -266.083740234375, "loss": 0.6614, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.04107924923300743, "rewards/margins": 0.06845887005329132, "rewards/margins_max": 0.18714329600334167, "rewards/margins_min": -0.010029973462224007, "rewards/margins_std": 0.08763924241065979, "rewards/rejected": -0.02737962268292904, "step": 390 }, { "epoch": 0.58, "grad_norm": 2.128975347395003, "learning_rate": 2.2006087905337698e-07, "logits/chosen": -2.8421759605407715, "logits/rejected": -2.816889762878418, "logps/chosen": -239.36984252929688, "logps/rejected": -236.58364868164062, "loss": 0.6571, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.04006009176373482, "rewards/margins": 0.07172206044197083, "rewards/margins_max": 0.17035157978534698, "rewards/margins_min": -0.0009761411929503083, "rewards/margins_std": 0.07894248515367508, "rewards/rejected": -0.03166196495294571, "step": 400 }, { "epoch": 0.58, "eval_logits/chosen": -2.7727160453796387, "eval_logits/rejected": -2.735231399536133, "eval_logps/chosen": -282.5451965332031, "eval_logps/rejected": -259.56536865234375, "eval_loss": 0.6818826198577881, "eval_rewards/accuracies": 0.6330000162124634, "eval_rewards/chosen": 0.02048237808048725, "eval_rewards/margins": 0.03034677356481552, "eval_rewards/margins_max": 0.15978190302848816, "eval_rewards/margins_min": -0.08066722005605698, "eval_rewards/margins_std": 0.08006121963262558, "eval_rewards/rejected": -0.00986439362168312, "eval_runtime": 428.8143, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 400 }, { "epoch": 0.6, "grad_norm": 1.464979648644022, "learning_rate": 2.0742665144529372e-07, "logits/chosen": -2.8245086669921875, "logits/rejected": -2.7860279083251953, "logps/chosen": -308.59234619140625, "logps/rejected": -286.37725830078125, "loss": 0.6545, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.05577210336923599, "rewards/margins": 0.08082688599824905, "rewards/margins_max": 0.21429844200611115, "rewards/margins_min": -0.014085543341934681, "rewards/margins_std": 0.10368019342422485, "rewards/rejected": -0.02505478635430336, "step": 410 }, { "epoch": 0.61, "grad_norm": 2.2565152886733024, "learning_rate": 1.9490349284263033e-07, "logits/chosen": -2.7606394290924072, "logits/rejected": -2.7328662872314453, "logps/chosen": -257.64727783203125, "logps/rejected": -247.62216186523438, "loss": 0.6651, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.034510333091020584, "rewards/margins": 0.05616886541247368, "rewards/margins_max": 0.16494083404541016, "rewards/margins_min": -0.04768489673733711, "rewards/margins_std": 0.09831468015909195, "rewards/rejected": -0.021658534184098244, "step": 420 }, { "epoch": 0.63, "grad_norm": 2.3425120835618545, "learning_rate": 1.8252407473630605e-07, "logits/chosen": -2.8338799476623535, "logits/rejected": -2.780625820159912, "logps/chosen": -296.00457763671875, "logps/rejected": -234.919189453125, "loss": 0.6593, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05103176832199097, "rewards/margins": 0.07940290123224258, "rewards/margins_max": 0.18009448051452637, "rewards/margins_min": -0.008073708042502403, "rewards/margins_std": 0.08305726200342178, "rewards/rejected": -0.028371136635541916, "step": 430 }, { "epoch": 0.64, "grad_norm": 1.9987304949927014, "learning_rate": 1.7032069361469764e-07, "logits/chosen": -2.7400341033935547, "logits/rejected": -2.7252862453460693, "logps/chosen": -228.4139404296875, "logps/rejected": -274.11822509765625, "loss": 0.6614, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.04276793450117111, "rewards/margins": 0.0704236626625061, "rewards/margins_max": 0.18232765793800354, "rewards/margins_min": -0.0089184595271945, "rewards/margins_std": 0.08663706481456757, "rewards/rejected": -0.02765573561191559, "step": 440 }, { "epoch": 0.66, "grad_norm": 1.5785697838984911, "learning_rate": 1.58325186705788e-07, "logits/chosen": -2.8510193824768066, "logits/rejected": -2.829686403274536, "logps/chosen": -269.9481506347656, "logps/rejected": -256.58563232421875, "loss": 0.6643, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.028526362031698227, "rewards/margins": 0.06000509113073349, "rewards/margins_max": 0.16429737210273743, "rewards/margins_min": -0.04647505283355713, "rewards/margins_std": 0.09461401402950287, "rewards/rejected": -0.03147872909903526, "step": 450 }, { "epoch": 0.67, "grad_norm": 1.737884201823339, "learning_rate": 1.4656884891747395e-07, "logits/chosen": -2.7824554443359375, "logits/rejected": -2.7381398677825928, "logps/chosen": -283.9850158691406, "logps/rejected": -239.4892120361328, "loss": 0.6607, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.037972342222929, "rewards/margins": 0.07290490716695786, "rewards/margins_max": 0.1753791719675064, "rewards/margins_min": -0.014705635607242584, "rewards/margins_std": 0.0863322764635086, "rewards/rejected": -0.03493257611989975, "step": 460 }, { "epoch": 0.69, "grad_norm": 1.8923794511223242, "learning_rate": 1.3508235119272466e-07, "logits/chosen": -2.8365092277526855, "logits/rejected": -2.7932674884796143, "logps/chosen": -324.05743408203125, "logps/rejected": -294.9861755371094, "loss": 0.6479, "rewards/accuracies": 0.75, "rewards/chosen": 0.05230081081390381, "rewards/margins": 0.09341531991958618, "rewards/margins_max": 0.25887981057167053, "rewards/margins_min": -0.017766449600458145, "rewards/margins_std": 0.12934701144695282, "rewards/rejected": -0.04111451655626297, "step": 470 }, { "epoch": 0.7, "grad_norm": 1.5554031948375129, "learning_rate": 1.2389566049259336e-07, "logits/chosen": -2.7813267707824707, "logits/rejected": -2.7599194049835205, "logps/chosen": -239.9017333984375, "logps/rejected": -236.3795928955078, "loss": 0.6554, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04591267183423042, "rewards/margins": 0.08855243027210236, "rewards/margins_max": 0.20911240577697754, "rewards/margins_min": -0.007775710429996252, "rewards/margins_std": 0.09931908547878265, "rewards/rejected": -0.042639754712581635, "step": 480 }, { "epoch": 0.72, "grad_norm": 2.494715637025419, "learning_rate": 1.1303796161583762e-07, "logits/chosen": -2.889634847640991, "logits/rejected": -2.8090715408325195, "logps/chosen": -317.6177062988281, "logps/rejected": -262.18414306640625, "loss": 0.6536, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.06907807290554047, "rewards/margins": 0.10660415887832642, "rewards/margins_max": 0.251437246799469, "rewards/margins_min": -0.01369224488735199, "rewards/margins_std": 0.11950767040252686, "rewards/rejected": -0.03752607852220535, "step": 490 }, { "epoch": 0.73, "grad_norm": 1.9249901065324473, "learning_rate": 1.0253758105911167e-07, "logits/chosen": -2.8541712760925293, "logits/rejected": -2.8100945949554443, "logps/chosen": -331.4628601074219, "logps/rejected": -291.27984619140625, "loss": 0.6508, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.07663814723491669, "rewards/margins": 0.11299224942922592, "rewards/margins_max": 0.2441258728504181, "rewards/margins_min": 0.010978538542985916, "rewards/margins_std": 0.10471439361572266, "rewards/rejected": -0.036354102194309235, "step": 500 }, { "epoch": 0.73, "eval_logits/chosen": -2.768547296524048, "eval_logits/rejected": -2.7314090728759766, "eval_logps/chosen": -283.3109436035156, "eval_logps/rejected": -260.8111267089844, "eval_loss": 0.6802051067352295, "eval_rewards/accuracies": 0.6269999742507935, "eval_rewards/chosen": 0.012824743054807186, "eval_rewards/margins": 0.035146910697221756, "eval_rewards/margins_max": 0.1843804121017456, "eval_rewards/margins_min": -0.09391897916793823, "eval_rewards/margins_std": 0.09255214780569077, "eval_rewards/rejected": -0.022322168573737144, "eval_runtime": 449.4803, "eval_samples_per_second": 4.45, "eval_steps_per_second": 0.278, "step": 500 }, { "epoch": 0.75, "grad_norm": 2.267072438870165, "learning_rate": 9.242191311637049e-08, "logits/chosen": -2.799065113067627, "logits/rejected": -2.7682583332061768, "logps/chosen": -279.16363525390625, "logps/rejected": -241.4091796875, "loss": 0.6585, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.043598730117082596, "rewards/margins": 0.08321089297533035, "rewards/margins_max": 0.2153284102678299, "rewards/margins_min": -0.01286692637950182, "rewards/margins_std": 0.10618630796670914, "rewards/rejected": -0.03961215913295746, "step": 510 }, { "epoch": 0.76, "grad_norm": 1.9414414953336778, "learning_rate": 8.271734841028552e-08, "logits/chosen": -2.8196640014648438, "logits/rejected": -2.747915029525757, "logps/chosen": -337.2265625, "logps/rejected": -270.56146240234375, "loss": 0.6494, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.054617054760456085, "rewards/margins": 0.09917386621236801, "rewards/margins_max": 0.24081799387931824, "rewards/margins_min": -0.0010756913106888533, "rewards/margins_std": 0.11212246119976044, "rewards/rejected": -0.04455682262778282, "step": 520 }, { "epoch": 0.77, "grad_norm": 1.952363376892767, "learning_rate": 7.344920504212243e-08, "logits/chosen": -2.8049850463867188, "logits/rejected": -2.7458760738372803, "logps/chosen": -263.7733154296875, "logps/rejected": -222.8297882080078, "loss": 0.6563, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.02564847469329834, "rewards/margins": 0.06961268186569214, "rewards/margins_max": 0.18673083186149597, "rewards/margins_min": -0.030092215165495872, "rewards/margins_std": 0.09981914609670639, "rewards/rejected": -0.0439642071723938, "step": 530 }, { "epoch": 0.79, "grad_norm": 1.7758645410300213, "learning_rate": 6.46416625397067e-08, "logits/chosen": -2.801713466644287, "logits/rejected": -2.745816230773926, "logps/chosen": -321.74554443359375, "logps/rejected": -301.52325439453125, "loss": 0.6518, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.055145345628261566, "rewards/margins": 0.0942390188574791, "rewards/margins_max": 0.2147323191165924, "rewards/margins_min": -0.01778433658182621, "rewards/margins_std": 0.10189126431941986, "rewards/rejected": -0.03909367322921753, "step": 540 }, { "epoch": 0.8, "grad_norm": 1.6203543941779939, "learning_rate": 5.6317698775795344e-08, "logits/chosen": -2.8398869037628174, "logits/rejected": -2.8026833534240723, "logps/chosen": -280.53826904296875, "logps/rejected": -275.3709411621094, "loss": 0.6538, "rewards/accuracies": 0.75, "rewards/chosen": 0.03587827831506729, "rewards/margins": 0.06856991350650787, "rewards/margins_max": 0.1956954300403595, "rewards/margins_min": -0.03170696645975113, "rewards/margins_std": 0.10253816843032837, "rewards/rejected": -0.03269163519144058, "step": 550 }, { "epoch": 0.82, "grad_norm": 1.8985615756457714, "learning_rate": 4.849903002143113e-08, "logits/chosen": -2.875913143157959, "logits/rejected": -2.8281137943267822, "logps/chosen": -329.9501037597656, "logps/rejected": -278.819580078125, "loss": 0.6436, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06364526599645615, "rewards/margins": 0.11146412044763565, "rewards/margins_max": 0.2525936961174011, "rewards/margins_min": -0.006486054509878159, "rewards/margins_std": 0.11787240207195282, "rewards/rejected": -0.047818850725889206, "step": 560 }, { "epoch": 0.83, "grad_norm": 1.7787852048841497, "learning_rate": 4.1206054290670535e-08, "logits/chosen": -2.827500104904175, "logits/rejected": -2.7943129539489746, "logps/chosen": -267.7633056640625, "logps/rejected": -264.48822021484375, "loss": 0.6537, "rewards/accuracies": 0.862500011920929, "rewards/chosen": 0.05329934507608414, "rewards/margins": 0.09477965533733368, "rewards/margins_max": 0.20895680785179138, "rewards/margins_min": 0.0013521288055926561, "rewards/margins_std": 0.0940418690443039, "rewards/rejected": -0.041480325162410736, "step": 570 }, { "epoch": 0.85, "grad_norm": 2.0202098422067625, "learning_rate": 3.44577981244944e-08, "logits/chosen": -2.7897348403930664, "logits/rejected": -2.777766704559326, "logps/chosen": -260.3923645019531, "logps/rejected": -262.2550964355469, "loss": 0.6591, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.034461263567209244, "rewards/margins": 0.07005412131547928, "rewards/margins_max": 0.2330094575881958, "rewards/margins_min": -0.05284310132265091, "rewards/margins_std": 0.1283414661884308, "rewards/rejected": -0.03559286147356033, "step": 580 }, { "epoch": 0.86, "grad_norm": 1.9903127787741515, "learning_rate": 2.8271866952734814e-08, "logits/chosen": -2.8326079845428467, "logits/rejected": -2.8090529441833496, "logps/chosen": -304.954345703125, "logps/rejected": -290.8445129394531, "loss": 0.6574, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.037466924637556076, "rewards/margins": 0.07924413681030273, "rewards/margins_max": 0.22350183129310608, "rewards/margins_min": -0.030017787590622902, "rewards/margins_std": 0.11654887348413467, "rewards/rejected": -0.04177721589803696, "step": 590 }, { "epoch": 0.88, "grad_norm": 2.150802586702874, "learning_rate": 2.2664399163518782e-08, "logits/chosen": -2.7674005031585693, "logits/rejected": -2.7506704330444336, "logps/chosen": -290.6200256347656, "logps/rejected": -253.0946044921875, "loss": 0.6444, "rewards/accuracies": 0.8125, "rewards/chosen": 0.06507185846567154, "rewards/margins": 0.10532490164041519, "rewards/margins_max": 0.24639275670051575, "rewards/margins_min": -0.00248835701495409, "rewards/margins_std": 0.11410228908061981, "rewards/rejected": -0.040253035724163055, "step": 600 }, { "epoch": 0.88, "eval_logits/chosen": -2.7678329944610596, "eval_logits/rejected": -2.7307965755462646, "eval_logps/chosen": -283.465576171875, "eval_logps/rejected": -261.1460266113281, "eval_loss": 0.6795856356620789, "eval_rewards/accuracies": 0.6230000257492065, "eval_rewards/chosen": 0.011278249323368073, "eval_rewards/margins": 0.036949291825294495, "eval_rewards/margins_max": 0.1936866044998169, "eval_rewards/margins_min": -0.09831613302230835, "eval_rewards/margins_std": 0.0972999855875969, "eval_rewards/rejected": -0.025671038776636124, "eval_runtime": 428.6504, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 600 }, { "epoch": 0.89, "grad_norm": 1.4413450481375123, "learning_rate": 1.7650024000056414e-08, "logits/chosen": -2.750030517578125, "logits/rejected": -2.7483627796173096, "logps/chosen": -243.78244018554688, "logps/rejected": -249.83358764648438, "loss": 0.6525, "rewards/accuracies": 0.75, "rewards/chosen": 0.04149278253316879, "rewards/margins": 0.08110538125038147, "rewards/margins_max": 0.20331616699695587, "rewards/margins_min": -0.0176301971077919, "rewards/margins_std": 0.09766165912151337, "rewards/rejected": -0.03961259126663208, "step": 610 }, { "epoch": 0.91, "grad_norm": 1.9882715371125113, "learning_rate": 1.3241823394615437e-08, "logits/chosen": -2.872929811477661, "logits/rejected": -2.835972309112549, "logps/chosen": -306.00555419921875, "logps/rejected": -293.34014892578125, "loss": 0.6513, "rewards/accuracies": 0.8374999761581421, "rewards/chosen": 0.05589998513460159, "rewards/margins": 0.10602303594350815, "rewards/margins_max": 0.22401413321495056, "rewards/margins_min": -0.0072759948670864105, "rewards/margins_std": 0.10588717460632324, "rewards/rejected": -0.05012305825948715, "step": 620 }, { "epoch": 0.92, "grad_norm": 2.1198299570085473, "learning_rate": 9.451297839253913e-09, "logits/chosen": -2.749361276626587, "logits/rejected": -2.7258260250091553, "logps/chosen": -277.8694763183594, "logps/rejected": -285.40814208984375, "loss": 0.6532, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05345848202705383, "rewards/margins": 0.10288135707378387, "rewards/margins_max": 0.24511775374412537, "rewards/margins_min": -0.012477993965148926, "rewards/margins_std": 0.11347142606973648, "rewards/rejected": -0.04942287132143974, "step": 630 }, { "epoch": 0.94, "grad_norm": 1.5752496491228836, "learning_rate": 6.288336382349463e-09, "logits/chosen": -2.767456293106079, "logits/rejected": -2.73591685295105, "logps/chosen": -247.0557098388672, "logps/rejected": -214.10733032226562, "loss": 0.6441, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.04774732142686844, "rewards/margins": 0.09429731965065002, "rewards/margins_max": 0.22121095657348633, "rewards/margins_min": 0.002260456560179591, "rewards/margins_std": 0.10257305949926376, "rewards/rejected": -0.046550001949071884, "step": 640 }, { "epoch": 0.95, "grad_norm": 2.9091264169866813, "learning_rate": 3.7611908292010665e-09, "logits/chosen": -2.8238768577575684, "logits/rejected": -2.7892489433288574, "logps/chosen": -310.2026672363281, "logps/rejected": -265.02740478515625, "loss": 0.6485, "rewards/accuracies": 0.75, "rewards/chosen": 0.030774693936109543, "rewards/margins": 0.08213461190462112, "rewards/margins_max": 0.2235296070575714, "rewards/margins_min": -0.03506668284535408, "rewards/margins_std": 0.11517021805047989, "rewards/rejected": -0.05135990306735039, "step": 650 }, { "epoch": 0.96, "grad_norm": 2.099534541726007, "learning_rate": 1.8764542140112527e-09, "logits/chosen": -2.767094135284424, "logits/rejected": -2.772709369659424, "logps/chosen": -222.73178100585938, "logps/rejected": -231.007080078125, "loss": 0.6568, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.018592417240142822, "rewards/margins": 0.06605438143014908, "rewards/margins_max": 0.18254120647907257, "rewards/margins_min": -0.025726070627570152, "rewards/margins_std": 0.09429889917373657, "rewards/rejected": -0.047461964190006256, "step": 660 }, { "epoch": 0.98, "grad_norm": 2.1873875180344093, "learning_rate": 6.390435994127752e-10, "logits/chosen": -2.793241024017334, "logits/rejected": -2.760741710662842, "logps/chosen": -281.3489074707031, "logps/rejected": -301.64892578125, "loss": 0.6538, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.04053753241896629, "rewards/margins": 0.08871294558048248, "rewards/margins_max": 0.2136339396238327, "rewards/margins_min": -0.008021386340260506, "rewards/margins_std": 0.1009400486946106, "rewards/rejected": -0.04817541316151619, "step": 670 }, { "epoch": 0.99, "grad_norm": 2.9786376127683343, "learning_rate": 5.2187248413465555e-11, "logits/chosen": -2.7850821018218994, "logits/rejected": -2.7602639198303223, "logps/chosen": -298.0012512207031, "logps/rejected": -265.384521484375, "loss": 0.6519, "rewards/accuracies": 0.824999988079071, "rewards/chosen": 0.05158674716949463, "rewards/margins": 0.10068739950656891, "rewards/margins_max": 0.258645623922348, "rewards/margins_min": -0.020790638402104378, "rewards/margins_std": 0.1269642412662506, "rewards/rejected": -0.04910064488649368, "step": 680 }, { "epoch": 1.0, "step": 684, "total_flos": 0.0, "train_loss": 0.6679948979651021, "train_runtime": 9131.6092, "train_samples_per_second": 1.198, "train_steps_per_second": 0.075 } ], "logging_steps": 10, "max_steps": 684, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }