{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 385, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0, "grad_norm": 145.0, "learning_rate": 1.282051282051282e-07, "logits/chosen": 88.18099975585938, "logits/rejected": 88.25153350830078, "logps/chosen": -29.073104858398438, "logps/rejected": -26.25731658935547, "loss": 0.3086, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.03, "grad_norm": 288.0, "learning_rate": 1.282051282051282e-06, "logits/chosen": 81.08262634277344, "logits/rejected": 80.7869873046875, "logps/chosen": -34.28562545776367, "logps/rejected": -33.03427505493164, "loss": 0.9415, "rewards/accuracies": 0.4861111044883728, "rewards/chosen": -0.039508190006017685, "rewards/margins": 0.02877185121178627, "rewards/rejected": -0.06828003376722336, "step": 10 }, { "epoch": 0.05, "grad_norm": 149.0, "learning_rate": 2.564102564102564e-06, "logits/chosen": 80.65168762207031, "logits/rejected": 80.53875732421875, "logps/chosen": -33.57862091064453, "logps/rejected": -30.82345199584961, "loss": 1.1289, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1001572236418724, "rewards/margins": 0.14152315258979797, "rewards/rejected": -0.041365914046764374, "step": 20 }, { "epoch": 0.08, "grad_norm": 234.0, "learning_rate": 3.846153846153847e-06, "logits/chosen": 82.32148742675781, "logits/rejected": 82.35160827636719, "logps/chosen": -33.95701599121094, "logps/rejected": -31.341405868530273, "loss": 1.3887, "rewards/accuracies": 0.4375, "rewards/chosen": 0.05022481083869934, "rewards/margins": -0.023957695811986923, "rewards/rejected": 0.07418251037597656, "step": 30 }, { "epoch": 0.1, "grad_norm": 636.0, "learning_rate": 4.999896948438434e-06, "logits/chosen": 80.53231811523438, "logits/rejected": 80.530029296875, "logps/chosen": -33.09763717651367, "logps/rejected": -33.38296890258789, "loss": 2.1646, "rewards/accuracies": 0.5, "rewards/chosen": 0.06877875328063965, "rewards/margins": 0.0817752406001091, "rewards/rejected": -0.0129964929074049, "step": 40 }, { "epoch": 0.13, "grad_norm": 386.0, "learning_rate": 4.987541037542187e-06, "logits/chosen": 78.10733032226562, "logits/rejected": 78.12706756591797, "logps/chosen": -31.247085571289062, "logps/rejected": -31.239765167236328, "loss": 1.232, "rewards/accuracies": 0.5625, "rewards/chosen": -0.03765222057700157, "rewards/margins": 0.11217772960662842, "rewards/rejected": -0.14982998371124268, "step": 50 }, { "epoch": 0.16, "grad_norm": 200.0, "learning_rate": 4.954691471941119e-06, "logits/chosen": 82.77879333496094, "logits/rejected": 82.82901763916016, "logps/chosen": -31.19879150390625, "logps/rejected": -29.63169288635254, "loss": 1.1982, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": -0.021271925419569016, "rewards/margins": 0.015784021466970444, "rewards/rejected": -0.03705594688653946, "step": 60 }, { "epoch": 0.18, "grad_norm": 832.0, "learning_rate": 4.901618883413549e-06, "logits/chosen": 83.74064636230469, "logits/rejected": 83.76899719238281, "logps/chosen": -30.796112060546875, "logps/rejected": -33.087310791015625, "loss": 2.0455, "rewards/accuracies": 0.44999998807907104, "rewards/chosen": -0.15038001537322998, "rewards/margins": -0.10573381185531616, "rewards/rejected": -0.04464619606733322, "step": 70 }, { "epoch": 0.21, "grad_norm": 432.0, "learning_rate": 4.828760511501322e-06, "logits/chosen": 81.53680419921875, "logits/rejected": 81.53197479248047, "logps/chosen": -31.50222396850586, "logps/rejected": -30.938213348388672, "loss": 2.2722, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.0024462162982672453, "rewards/margins": 0.16495725512504578, "rewards/rejected": -0.16251103579998016, "step": 80 }, { "epoch": 0.23, "grad_norm": 378.0, "learning_rate": 4.7367166013034295e-06, "logits/chosen": 78.6462173461914, "logits/rejected": 78.61238098144531, "logps/chosen": -32.54735565185547, "logps/rejected": -30.98733139038086, "loss": 1.7909, "rewards/accuracies": 0.5, "rewards/chosen": 0.0817408338189125, "rewards/margins": 0.11401765048503876, "rewards/rejected": -0.03227682039141655, "step": 90 }, { "epoch": 0.26, "grad_norm": 210.0, "learning_rate": 4.626245458345211e-06, "logits/chosen": 84.08260345458984, "logits/rejected": 84.1148910522461, "logps/chosen": -34.308895111083984, "logps/rejected": -31.754663467407227, "loss": 2.0775, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.003516471479088068, "rewards/margins": 0.00018945932970382273, "rewards/rejected": -0.0037059492897242308, "step": 100 }, { "epoch": 0.26, "eval_logits/chosen": 98.9691390991211, "eval_logits/rejected": 98.96244812011719, "eval_logps/chosen": -32.53602981567383, "eval_logps/rejected": -35.98977279663086, "eval_loss": 1.7336534261703491, "eval_rewards/accuracies": 0.4746677577495575, "eval_rewards/chosen": -0.08356913179159164, "eval_rewards/margins": -0.06264925748109818, "eval_rewards/rejected": -0.02091986872255802, "eval_runtime": 104.1909, "eval_samples_per_second": 3.292, "eval_steps_per_second": 0.413, "step": 100 }, { "epoch": 0.29, "grad_norm": 256.0, "learning_rate": 4.498257201263691e-06, "logits/chosen": 84.40672302246094, "logits/rejected": 84.30145263671875, "logps/chosen": -32.60901641845703, "logps/rejected": -32.70885467529297, "loss": 3.1161, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.37241029739379883, "rewards/margins": 0.4200804829597473, "rewards/rejected": -0.04767021909356117, "step": 110 }, { "epoch": 0.31, "grad_norm": 446.0, "learning_rate": 4.353806263777678e-06, "logits/chosen": 84.58103942871094, "logits/rejected": 84.6691665649414, "logps/chosen": -28.995365142822266, "logps/rejected": -35.473243713378906, "loss": 2.9665, "rewards/accuracies": 0.512499988079071, "rewards/chosen": -0.05920146033167839, "rewards/margins": -0.0465129017829895, "rewards/rejected": -0.012688541784882545, "step": 120 }, { "epoch": 0.34, "grad_norm": 208.0, "learning_rate": 4.1940827077152755e-06, "logits/chosen": 82.01698303222656, "logits/rejected": 82.04023742675781, "logps/chosen": -30.6518497467041, "logps/rejected": -32.01683807373047, "loss": 1.858, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.22995714843273163, "rewards/margins": 0.3066490590572357, "rewards/rejected": -0.0766918882727623, "step": 130 }, { "epoch": 0.36, "grad_norm": 392.0, "learning_rate": 4.0204024186666215e-06, "logits/chosen": 83.59390258789062, "logits/rejected": 83.58688354492188, "logps/chosen": -27.2325439453125, "logps/rejected": -32.482357025146484, "loss": 2.0491, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.19329962134361267, "rewards/margins": 0.24638020992279053, "rewards/rejected": -0.05308058112859726, "step": 140 }, { "epoch": 0.39, "grad_norm": 209.0, "learning_rate": 3.834196265035119e-06, "logits/chosen": 82.43211364746094, "logits/rejected": 82.41563415527344, "logps/chosen": -28.861217498779297, "logps/rejected": -32.51410675048828, "loss": 1.804, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4751041829586029, "rewards/margins": 0.42794743180274963, "rewards/rejected": 0.04715672880411148, "step": 150 }, { "epoch": 0.42, "grad_norm": 450.0, "learning_rate": 3.636998309800573e-06, "logits/chosen": 84.4583969116211, "logits/rejected": 84.46671295166016, "logps/chosen": -33.707855224609375, "logps/rejected": -29.813602447509766, "loss": 3.5567, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.38600218296051025, "rewards/margins": 0.2609595060348511, "rewards/rejected": 0.12504267692565918, "step": 160 }, { "epoch": 0.44, "grad_norm": 278.0, "learning_rate": 3.4304331721118078e-06, "logits/chosen": 85.22765350341797, "logits/rejected": 85.16728973388672, "logps/chosen": -31.05304527282715, "logps/rejected": -32.11688995361328, "loss": 1.9364, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1856154203414917, "rewards/margins": 0.2727610170841217, "rewards/rejected": -0.08714555948972702, "step": 170 }, { "epoch": 0.47, "grad_norm": 196.0, "learning_rate": 3.2162026428305436e-06, "logits/chosen": 82.93448638916016, "logits/rejected": 82.9182357788086, "logps/chosen": -30.893056869506836, "logps/rejected": -31.30733299255371, "loss": 1.6619, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.2736847400665283, "rewards/margins": 0.40879687666893005, "rewards/rejected": -0.13511209189891815, "step": 180 }, { "epoch": 0.49, "grad_norm": 130.0, "learning_rate": 2.996071664294641e-06, "logits/chosen": 84.62263488769531, "logits/rejected": 84.59950256347656, "logps/chosen": -30.79315757751465, "logps/rejected": -30.666366577148438, "loss": 2.2923, "rewards/accuracies": 0.5, "rewards/chosen": -0.02406691387295723, "rewards/margins": 0.15444278717041016, "rewards/rejected": -0.17850971221923828, "step": 190 }, { "epoch": 0.52, "grad_norm": 536.0, "learning_rate": 2.7718537898066833e-06, "logits/chosen": 79.89015197753906, "logits/rejected": 79.83697509765625, "logps/chosen": -34.30753707885742, "logps/rejected": -32.36396026611328, "loss": 3.1221, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.40067988634109497, "rewards/margins": 0.2841777205467224, "rewards/rejected": 0.11650214344263077, "step": 200 }, { "epoch": 0.52, "eval_logits/chosen": 99.16912841796875, "eval_logits/rejected": 99.15798950195312, "eval_logps/chosen": -32.52165603637695, "eval_logps/rejected": -36.10483932495117, "eval_loss": 1.716450572013855, "eval_rewards/accuracies": 0.5278239250183105, "eval_rewards/chosen": -0.0706300213932991, "eval_rewards/margins": 0.05385042726993561, "eval_rewards/rejected": -0.12448045611381531, "eval_runtime": 103.9441, "eval_samples_per_second": 3.3, "eval_steps_per_second": 0.414, "step": 200 }, { "epoch": 0.55, "grad_norm": 392.0, "learning_rate": 2.5453962426402006e-06, "logits/chosen": 82.42179870605469, "logits/rejected": 82.31734466552734, "logps/chosen": -33.38166427612305, "logps/rejected": -35.02565383911133, "loss": 2.7032, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.507649302482605, "rewards/margins": 0.5432528257369995, "rewards/rejected": -0.03560344874858856, "step": 210 }, { "epoch": 0.57, "grad_norm": 474.0, "learning_rate": 2.3185646976551794e-06, "logits/chosen": 84.43553161621094, "logits/rejected": 84.52201080322266, "logps/chosen": -31.842187881469727, "logps/rejected": -30.83688735961914, "loss": 3.6089, "rewards/accuracies": 0.625, "rewards/chosen": 0.09608902782201767, "rewards/margins": 0.1932254582643509, "rewards/rejected": -0.09713643789291382, "step": 220 }, { "epoch": 0.6, "grad_norm": 324.0, "learning_rate": 2.0932279108998323e-06, "logits/chosen": 81.6541976928711, "logits/rejected": 81.72249603271484, "logps/chosen": -32.656639099121094, "logps/rejected": -33.97739791870117, "loss": 1.7997, "rewards/accuracies": 0.5625, "rewards/chosen": 0.1341962367296219, "rewards/margins": 0.10292205959558487, "rewards/rejected": 0.03127415105700493, "step": 230 }, { "epoch": 0.62, "grad_norm": 328.0, "learning_rate": 1.8712423238279358e-06, "logits/chosen": 83.9974365234375, "logits/rejected": 84.2688217163086, "logps/chosen": -31.19537353515625, "logps/rejected": -31.57509994506836, "loss": 2.2222, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.443420946598053, "rewards/margins": 0.4821735918521881, "rewards/rejected": -0.038752567023038864, "step": 240 }, { "epoch": 0.65, "grad_norm": 404.0, "learning_rate": 1.6544367689701824e-06, "logits/chosen": 82.76817321777344, "logits/rejected": 82.85389709472656, "logps/chosen": -27.292465209960938, "logps/rejected": -29.717370986938477, "loss": 2.0726, "rewards/accuracies": 0.48750001192092896, "rewards/chosen": 0.27308744192123413, "rewards/margins": 0.10805711895227432, "rewards/rejected": 0.1650303304195404, "step": 250 }, { "epoch": 0.68, "grad_norm": 672.0, "learning_rate": 1.4445974030621963e-06, "logits/chosen": 79.98811340332031, "logits/rejected": 80.15504455566406, "logps/chosen": -31.048583984375, "logps/rejected": -35.97705841064453, "loss": 3.1081, "rewards/accuracies": 0.550000011920929, "rewards/chosen": 0.4611719250679016, "rewards/margins": 0.3490581512451172, "rewards/rejected": 0.112113818526268, "step": 260 }, { "epoch": 0.7, "grad_norm": 211.0, "learning_rate": 1.243452991757889e-06, "logits/chosen": 79.3856201171875, "logits/rejected": 79.41062927246094, "logps/chosen": -31.38250732421875, "logps/rejected": -31.722015380859375, "loss": 2.0801, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.3567788600921631, "rewards/margins": 0.5517110228538513, "rewards/rejected": -0.194932222366333, "step": 270 }, { "epoch": 0.73, "grad_norm": 498.0, "learning_rate": 1.0526606671603523e-06, "logits/chosen": 81.8171157836914, "logits/rejected": 81.62063598632812, "logps/chosen": -31.32455062866211, "logps/rejected": -29.448989868164062, "loss": 2.1832, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.43680915236473083, "rewards/margins": 0.2406042516231537, "rewards/rejected": 0.19620487093925476, "step": 280 }, { "epoch": 0.75, "grad_norm": 262.0, "learning_rate": 8.737922755071455e-07, "logits/chosen": 82.0579833984375, "logits/rejected": 81.96476745605469, "logps/chosen": -33.8171272277832, "logps/rejected": -31.962472915649414, "loss": 3.4215, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.25841274857521057, "rewards/margins": 0.2094600647687912, "rewards/rejected": 0.04895265772938728, "step": 290 }, { "epoch": 0.78, "grad_norm": 169.0, "learning_rate": 7.08321427484816e-07, "logits/chosen": 77.75779724121094, "logits/rejected": 77.80491638183594, "logps/chosen": -32.91708755493164, "logps/rejected": -29.0297908782959, "loss": 2.4404, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.4397595524787903, "rewards/margins": 0.37474992871284485, "rewards/rejected": 0.06500961631536484, "step": 300 }, { "epoch": 0.78, "eval_logits/chosen": 99.06168365478516, "eval_logits/rejected": 99.05675506591797, "eval_logps/chosen": -32.30947494506836, "eval_logps/rejected": -35.82986068725586, "eval_loss": 1.5173850059509277, "eval_rewards/accuracies": 0.510797381401062, "eval_rewards/chosen": 0.12033051997423172, "eval_rewards/margins": -0.0026713553816080093, "eval_rewards/rejected": 0.12300187349319458, "eval_runtime": 103.9278, "eval_samples_per_second": 3.3, "eval_steps_per_second": 0.414, "step": 300 }, { "epoch": 0.81, "grad_norm": 928.0, "learning_rate": 5.576113578589035e-07, "logits/chosen": 84.6921157836914, "logits/rejected": 84.73957824707031, "logps/chosen": -30.482025146484375, "logps/rejected": -32.063751220703125, "loss": 2.7405, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.3433563709259033, "rewards/margins": 0.22095146775245667, "rewards/rejected": 0.12240489572286606, "step": 310 }, { "epoch": 0.83, "grad_norm": 406.0, "learning_rate": 4.229036944380913e-07, "logits/chosen": 82.20662689208984, "logits/rejected": 82.20085906982422, "logps/chosen": -31.305639266967773, "logps/rejected": -28.786062240600586, "loss": 2.2147, "rewards/accuracies": 0.4749999940395355, "rewards/chosen": 0.2636275887489319, "rewards/margins": 0.14354394376277924, "rewards/rejected": 0.12008367478847504, "step": 320 }, { "epoch": 0.86, "grad_norm": 366.0, "learning_rate": 3.053082288996112e-07, "logits/chosen": 79.59921264648438, "logits/rejected": 79.63972473144531, "logps/chosen": -29.793676376342773, "logps/rejected": -32.53992462158203, "loss": 2.2293, "rewards/accuracies": 0.625, "rewards/chosen": 0.5030375719070435, "rewards/margins": 0.41758814454078674, "rewards/rejected": 0.08544941991567612, "step": 330 }, { "epoch": 0.88, "grad_norm": 300.0, "learning_rate": 2.0579377374915805e-07, "logits/chosen": 83.62577056884766, "logits/rejected": 83.62586975097656, "logps/chosen": -32.759605407714844, "logps/rejected": -33.222259521484375, "loss": 2.4595, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.40729111433029175, "rewards/margins": 0.2755037844181061, "rewards/rejected": 0.13178732991218567, "step": 340 }, { "epoch": 0.91, "grad_norm": 302.0, "learning_rate": 1.2518018074041684e-07, "logits/chosen": 82.8216781616211, "logits/rejected": 82.81562805175781, "logps/chosen": -33.33207702636719, "logps/rejected": -32.93452835083008, "loss": 1.9575, "rewards/accuracies": 0.5, "rewards/chosen": 0.2804979979991913, "rewards/margins": 0.13895203173160553, "rewards/rejected": 0.14154598116874695, "step": 350 }, { "epoch": 0.94, "grad_norm": 218.0, "learning_rate": 6.41315865106129e-08, "logits/chosen": 84.19699096679688, "logits/rejected": 84.23506164550781, "logps/chosen": -28.99674415588379, "logps/rejected": -31.437408447265625, "loss": 2.09, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.4999212324619293, "rewards/margins": 0.3087221086025238, "rewards/rejected": 0.19119907915592194, "step": 360 }, { "epoch": 0.96, "grad_norm": 508.0, "learning_rate": 2.3150941078050325e-08, "logits/chosen": 83.60005187988281, "logits/rejected": 83.62088012695312, "logps/chosen": -32.41377258300781, "logps/rejected": -34.801570892333984, "loss": 3.3754, "rewards/accuracies": 0.5, "rewards/chosen": 0.41615089774131775, "rewards/margins": 0.13912765681743622, "rewards/rejected": 0.2770232558250427, "step": 370 }, { "epoch": 0.99, "grad_norm": 194.0, "learning_rate": 2.575864278703266e-09, "logits/chosen": 77.62187194824219, "logits/rejected": 77.48942565917969, "logps/chosen": -30.352733612060547, "logps/rejected": -28.094280242919922, "loss": 2.3864, "rewards/accuracies": 0.512499988079071, "rewards/chosen": 0.20105357468128204, "rewards/margins": 0.14138910174369812, "rewards/rejected": 0.059664465487003326, "step": 380 }, { "epoch": 1.0, "step": 385, "total_flos": 0.0, "train_loss": 2.2499145250041765, "train_runtime": 2558.7273, "train_samples_per_second": 1.203, "train_steps_per_second": 0.15 } ], "logging_steps": 10, "max_steps": 385, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }