{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 3821, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "dpo_losses": 0.6931471824645996, "epoch": 0.0, "grad_norm": 1.6627631330555053, "learning_rate": 1.3054830287206268e-08, "logits/chosen": -2.909182548522949, "logits/rejected": -2.942319393157959, "logps/chosen": -202.1656494140625, "logps/rejected": -236.2765350341797, "loss": 0.6931, "positive_losses": 0.0, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/margins_max": 0.0, "rewards/margins_min": 0.0, "rewards/margins_std": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "dpo_losses": 0.693147599697113, "epoch": 0.0, "grad_norm": 18.8314142290279, "learning_rate": 1.3054830287206266e-07, "logits/chosen": -2.867999315261841, "logits/rejected": -2.786515474319458, "logps/chosen": -300.2493591308594, "logps/rejected": -226.55767822265625, "loss": 0.7004, "positive_losses": 0.07911261171102524, "rewards/accuracies": 0.4166666567325592, "rewards/chosen": -0.00020092699560336769, "rewards/margins": 6.209334628692886e-07, "rewards/margins_max": 0.002266084775328636, "rewards/margins_min": -0.0019263287540525198, "rewards/margins_std": 0.0019202136900275946, "rewards/rejected": -0.00020154794037807733, "step": 10 }, { "dpo_losses": 0.6929634809494019, "epoch": 0.01, "grad_norm": 12.176410970842737, "learning_rate": 2.610966057441253e-07, "logits/chosen": -2.8988351821899414, "logits/rejected": -2.820763111114502, "logps/chosen": -342.5530700683594, "logps/rejected": -237.41000366210938, "loss": 0.6975, "positive_losses": 0.036047983914613724, "rewards/accuracies": 0.5249999761581421, "rewards/chosen": 0.001374891260638833, "rewards/margins": 0.00037001215969212353, "rewards/margins_max": 0.0038178269751369953, "rewards/margins_min": -0.0023865108378231525, "rewards/margins_std": 0.002772086299955845, "rewards/rejected": 0.00100487913005054, "step": 20 }, { "dpo_losses": 0.6929342150688171, "epoch": 0.01, "grad_norm": 6.771810263556228, "learning_rate": 3.9164490861618804e-07, "logits/chosen": -2.7983055114746094, "logits/rejected": -2.801570177078247, "logps/chosen": -303.83001708984375, "logps/rejected": -266.1291809082031, "loss": 0.6949, "positive_losses": 0.0268110278993845, "rewards/accuracies": 0.5375000238418579, "rewards/chosen": 0.0040281894616782665, "rewards/margins": 0.0004303906171116978, "rewards/margins_max": 0.0038729298394173384, "rewards/margins_min": -0.003587177721783519, "rewards/margins_std": 0.003313865512609482, "rewards/rejected": 0.0035977992229163647, "step": 30 }, { "dpo_losses": 0.6930840611457825, "epoch": 0.01, "grad_norm": 5.635121542864576, "learning_rate": 5.221932114882506e-07, "logits/chosen": -2.8125195503234863, "logits/rejected": -2.8254590034484863, "logps/chosen": -280.2214660644531, "logps/rejected": -280.5213928222656, "loss": 0.6936, "positive_losses": 0.011600112542510033, "rewards/accuracies": 0.42500001192092896, "rewards/chosen": 0.0077796741388738155, "rewards/margins": 0.0001318950962740928, "rewards/margins_max": 0.00467295479029417, "rewards/margins_min": -0.004409942775964737, "rewards/margins_std": 0.003962748683989048, "rewards/rejected": 0.007647777907550335, "step": 40 }, { "dpo_losses": 0.6926494836807251, "epoch": 0.01, "grad_norm": 9.337876531115175, "learning_rate": 6.527415143603135e-07, "logits/chosen": -2.8853297233581543, "logits/rejected": -2.8575782775878906, "logps/chosen": -293.530029296875, "logps/rejected": -290.3891906738281, "loss": 0.6932, "positive_losses": 0.006702614016830921, "rewards/accuracies": 0.625, "rewards/chosen": 0.011776240542531013, "rewards/margins": 0.0010005722288042307, "rewards/margins_max": 0.005041410680860281, "rewards/margins_min": -0.0035116872750222683, "rewards/margins_std": 0.003799186320975423, "rewards/rejected": 0.010775668546557426, "step": 50 }, { "dpo_losses": 0.6921502351760864, "epoch": 0.02, "grad_norm": 1.9372752889946026, "learning_rate": 7.832898172323761e-07, "logits/chosen": -2.8227450847625732, "logits/rejected": -2.7623629570007324, "logps/chosen": -284.64166259765625, "logps/rejected": -250.4044952392578, "loss": 0.6927, "positive_losses": 0.002616500947624445, "rewards/accuracies": 0.625, "rewards/chosen": 0.014274273999035358, "rewards/margins": 0.002002457156777382, "rewards/margins_max": 0.006887891795486212, "rewards/margins_min": -0.0028654516208916903, "rewards/margins_std": 0.004471802152693272, "rewards/rejected": 0.012271817773580551, "step": 60 }, { "dpo_losses": 0.6922898292541504, "epoch": 0.02, "grad_norm": 1.6246085062137012, "learning_rate": 9.138381201044387e-07, "logits/chosen": -2.8563594818115234, "logits/rejected": -2.826786518096924, "logps/chosen": -247.6701202392578, "logps/rejected": -229.2803955078125, "loss": 0.6928, "positive_losses": 0.0, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.014947858639061451, "rewards/margins": 0.0017268791561946273, "rewards/margins_max": 0.00879730749875307, "rewards/margins_min": -0.005179594270884991, "rewards/margins_std": 0.0061830440536141396, "rewards/rejected": 0.013220980763435364, "step": 70 }, { "dpo_losses": 0.6920645236968994, "epoch": 0.02, "grad_norm": 2.4957603745365726, "learning_rate": 1.0443864229765013e-06, "logits/chosen": -2.818035125732422, "logits/rejected": -2.7780404090881348, "logps/chosen": -275.47320556640625, "logps/rejected": -225.15750122070312, "loss": 0.6933, "positive_losses": 0.022166062146425247, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.015697982162237167, "rewards/margins": 0.0021850476041436195, "rewards/margins_max": 0.010664868168532848, "rewards/margins_min": -0.0068687512539327145, "rewards/margins_std": 0.007743604481220245, "rewards/rejected": 0.013512934558093548, "step": 80 }, { "dpo_losses": 0.6902826428413391, "epoch": 0.02, "grad_norm": 1.8069263129269828, "learning_rate": 1.1749347258485642e-06, "logits/chosen": -2.8795130252838135, "logits/rejected": -2.8336520195007324, "logps/chosen": -322.0303039550781, "logps/rejected": -271.56866455078125, "loss": 0.6924, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.021155569702386856, "rewards/margins": 0.0057623423635959625, "rewards/margins_max": 0.01648578606545925, "rewards/margins_min": -0.004632280208170414, "rewards/margins_std": 0.009683581069111824, "rewards/rejected": 0.015393229201436043, "step": 90 }, { "dpo_losses": 0.6905539631843567, "epoch": 0.03, "grad_norm": 21.73417561464979, "learning_rate": 1.305483028720627e-06, "logits/chosen": -2.7189571857452393, "logits/rejected": -2.6742961406707764, "logps/chosen": -341.22723388671875, "logps/rejected": -240.46115112304688, "loss": 0.6921, "positive_losses": 0.00832443218678236, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.022471453994512558, "rewards/margins": 0.005237923469394445, "rewards/margins_max": 0.0194450281560421, "rewards/margins_min": -0.0072221914306283, "rewards/margins_std": 0.01184946671128273, "rewards/rejected": 0.017233530059456825, "step": 100 }, { "epoch": 0.03, "eval_dpo_losses": 0.690070629119873, "eval_logits/chosen": -2.8057563304901123, "eval_logits/rejected": -2.7667205333709717, "eval_logps/chosen": -282.11053466796875, "eval_logps/rejected": -256.71771240234375, "eval_loss": 0.691548228263855, "eval_positive_losses": 0.011959685944020748, "eval_rewards/accuracies": 0.6650000214576721, "eval_rewards/chosen": 0.024828700348734856, "eval_rewards/margins": 0.0062165395356714725, "eval_rewards/margins_max": 0.02886023372411728, "eval_rewards/margins_min": -0.013660137541592121, "eval_rewards/margins_std": 0.013916457071900368, "eval_rewards/rejected": 0.01861215941607952, "eval_runtime": 429.6898, "eval_samples_per_second": 4.655, "eval_steps_per_second": 0.291, "step": 100 }, { "dpo_losses": 0.692000687122345, "epoch": 0.03, "grad_norm": 2.1540912381314827, "learning_rate": 1.4360313315926894e-06, "logits/chosen": -2.8143553733825684, "logits/rejected": -2.8246326446533203, "logps/chosen": -253.93936157226562, "logps/rejected": -245.428955078125, "loss": 0.6911, "positive_losses": 0.0, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.023682493716478348, "rewards/margins": 0.002336515113711357, "rewards/margins_max": 0.016066711395978928, "rewards/margins_min": -0.011998656205832958, "rewards/margins_std": 0.012446084059774876, "rewards/rejected": 0.02134597860276699, "step": 110 }, { "dpo_losses": 0.691046953201294, "epoch": 0.03, "grad_norm": 11.008003444382771, "learning_rate": 1.5665796344647521e-06, "logits/chosen": -2.802347421646118, "logits/rejected": -2.789463996887207, "logps/chosen": -276.475830078125, "logps/rejected": -233.5188446044922, "loss": 0.7013, "positive_losses": 0.20269469916820526, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.027693506330251694, "rewards/margins": 0.00434782775118947, "rewards/margins_max": 0.020682280883193016, "rewards/margins_min": -0.01483757235109806, "rewards/margins_std": 0.016509367153048515, "rewards/rejected": 0.02334568090736866, "step": 120 }, { "dpo_losses": 0.6879526376724243, "epoch": 0.03, "grad_norm": 10.959459331569784, "learning_rate": 1.6971279373368146e-06, "logits/chosen": -2.821155071258545, "logits/rejected": -2.775268316268921, "logps/chosen": -260.90911865234375, "logps/rejected": -306.16436767578125, "loss": 0.6893, "positive_losses": 0.002742767333984375, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.03026561066508293, "rewards/margins": 0.011031229980289936, "rewards/margins_max": 0.04761187359690666, "rewards/margins_min": -0.01323478389531374, "rewards/margins_std": 0.028165534138679504, "rewards/rejected": 0.01923438161611557, "step": 130 }, { "dpo_losses": 0.6876500844955444, "epoch": 0.04, "grad_norm": 1.8016495669550654, "learning_rate": 1.8276762402088774e-06, "logits/chosen": -2.8401308059692383, "logits/rejected": -2.7858662605285645, "logps/chosen": -260.0283203125, "logps/rejected": -230.19107055664062, "loss": 0.7133, "positive_losses": 0.07800178229808807, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.03599924221634865, "rewards/margins": 0.01117384247481823, "rewards/margins_max": 0.03845587000250816, "rewards/margins_min": -0.01107207965105772, "rewards/margins_std": 0.022020744159817696, "rewards/rejected": 0.02482539974153042, "step": 140 }, { "dpo_losses": 0.688017725944519, "epoch": 0.04, "grad_norm": 1.8545720875157046, "learning_rate": 1.9582245430809403e-06, "logits/chosen": -2.809023141860962, "logits/rejected": -2.7906556129455566, "logps/chosen": -239.0895538330078, "logps/rejected": -254.45254516601562, "loss": 0.6917, "positive_losses": 0.011655425652861595, "rewards/accuracies": 0.6875, "rewards/chosen": 0.0437992587685585, "rewards/margins": 0.010411135852336884, "rewards/margins_max": 0.03449424356222153, "rewards/margins_min": -0.01029182504862547, "rewards/margins_std": 0.020365644246339798, "rewards/rejected": 0.03338811919093132, "step": 150 }, { "dpo_losses": 0.6856845021247864, "epoch": 0.04, "grad_norm": 9.222214904332864, "learning_rate": 2.0887728459530026e-06, "logits/chosen": -2.7487688064575195, "logits/rejected": -2.7678794860839844, "logps/chosen": -268.80438232421875, "logps/rejected": -257.0102844238281, "loss": 0.6852, "positive_losses": 0.0068878172896802425, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.05923125892877579, "rewards/margins": 0.015303397551178932, "rewards/margins_max": 0.0555412694811821, "rewards/margins_min": -0.020768599584698677, "rewards/margins_std": 0.03399623930454254, "rewards/rejected": 0.04392785578966141, "step": 160 }, { "dpo_losses": 0.6870883703231812, "epoch": 0.04, "grad_norm": 1.8542392873651312, "learning_rate": 2.2193211488250653e-06, "logits/chosen": -2.7857773303985596, "logits/rejected": -2.796518325805664, "logps/chosen": -264.02410888671875, "logps/rejected": -230.34439086914062, "loss": 0.69, "positive_losses": 0.02435150183737278, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.0670926421880722, "rewards/margins": 0.012564549222588539, "rewards/margins_max": 0.054188720881938934, "rewards/margins_min": -0.027178261429071426, "rewards/margins_std": 0.036464206874370575, "rewards/rejected": 0.054528094828128815, "step": 170 }, { "dpo_losses": 0.6825939416885376, "epoch": 0.05, "grad_norm": 8.9485055879907, "learning_rate": 2.3498694516971284e-06, "logits/chosen": -2.8339333534240723, "logits/rejected": -2.7744784355163574, "logps/chosen": -318.8066711425781, "logps/rejected": -278.94952392578125, "loss": 0.694, "positive_losses": 0.17801937460899353, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.07065500319004059, "rewards/margins": 0.021819163113832474, "rewards/margins_max": 0.07340516149997711, "rewards/margins_min": -0.028240054845809937, "rewards/margins_std": 0.04419630020856857, "rewards/rejected": 0.04883584380149841, "step": 180 }, { "dpo_losses": 0.6837826371192932, "epoch": 0.05, "grad_norm": 9.226703071007915, "learning_rate": 2.4804177545691907e-06, "logits/chosen": -2.834094524383545, "logits/rejected": -2.8156392574310303, "logps/chosen": -227.9156951904297, "logps/rejected": -219.6840362548828, "loss": 0.6868, "positive_losses": 0.027264881879091263, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.0788140743970871, "rewards/margins": 0.01936030387878418, "rewards/margins_max": 0.06731411814689636, "rewards/margins_min": -0.024441728368401527, "rewards/margins_std": 0.041044656187295914, "rewards/rejected": 0.059453777968883514, "step": 190 }, { "dpo_losses": 0.6796741485595703, "epoch": 0.05, "grad_norm": 2.037727006307245, "learning_rate": 2.610966057441254e-06, "logits/chosen": -2.8307013511657715, "logits/rejected": -2.7554831504821777, "logps/chosen": -307.99224853515625, "logps/rejected": -245.27273559570312, "loss": 0.6851, "positive_losses": 0.00517616281285882, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0933031439781189, "rewards/margins": 0.02771134115755558, "rewards/margins_max": 0.08080239593982697, "rewards/margins_min": -0.012679673731327057, "rewards/margins_std": 0.04240233451128006, "rewards/rejected": 0.06559181213378906, "step": 200 }, { "epoch": 0.05, "eval_dpo_losses": 0.6806924939155579, "eval_logits/chosen": -2.796081781387329, "eval_logits/rejected": -2.756462812423706, "eval_logps/chosen": -275.4477844238281, "eval_logps/rejected": -252.0205841064453, "eval_loss": 0.6925920248031616, "eval_positive_losses": 0.030853919684886932, "eval_rewards/accuracies": 0.675000011920929, "eval_rewards/chosen": 0.0914565846323967, "eval_rewards/margins": 0.025873009115457535, "eval_rewards/margins_max": 0.11261825263500214, "eval_rewards/margins_min": -0.0511879101395607, "eval_rewards/margins_std": 0.05379374697804451, "eval_rewards/rejected": 0.06558356434106827, "eval_runtime": 428.6917, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 200 }, { "dpo_losses": 0.6797536611557007, "epoch": 0.05, "grad_norm": 2.5291776297602424, "learning_rate": 2.741514360313316e-06, "logits/chosen": -2.7762606143951416, "logits/rejected": -2.74076509475708, "logps/chosen": -288.113525390625, "logps/rejected": -287.1617431640625, "loss": 0.6882, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.0990293100476265, "rewards/margins": 0.02833416685461998, "rewards/margins_max": 0.10787785053253174, "rewards/margins_min": -0.033592965453863144, "rewards/margins_std": 0.06290306150913239, "rewards/rejected": 0.07069514691829681, "step": 210 }, { "dpo_losses": 0.6818485856056213, "epoch": 0.06, "grad_norm": 1.783795528886745, "learning_rate": 2.872062663185379e-06, "logits/chosen": -2.7865102291107178, "logits/rejected": -2.750469207763672, "logps/chosen": -261.618408203125, "logps/rejected": -268.7855224609375, "loss": 0.68, "positive_losses": 7.43865966796875e-05, "rewards/accuracies": 0.625, "rewards/chosen": 0.10103417932987213, "rewards/margins": 0.023570220917463303, "rewards/margins_max": 0.07599742710590363, "rewards/margins_min": -0.033367056399583817, "rewards/margins_std": 0.0483870692551136, "rewards/rejected": 0.07746393978595734, "step": 220 }, { "dpo_losses": 0.67864590883255, "epoch": 0.06, "grad_norm": 1.8925200493782315, "learning_rate": 3.0026109660574416e-06, "logits/chosen": -2.8211140632629395, "logits/rejected": -2.8101022243499756, "logps/chosen": -275.81982421875, "logps/rejected": -252.9820098876953, "loss": 0.6815, "positive_losses": 0.05394439771771431, "rewards/accuracies": 0.6875, "rewards/chosen": 0.11983136087656021, "rewards/margins": 0.030398359522223473, "rewards/margins_max": 0.10901688039302826, "rewards/margins_min": -0.042515210807323456, "rewards/margins_std": 0.06782356649637222, "rewards/rejected": 0.08943299949169159, "step": 230 }, { "dpo_losses": 0.6751521229743958, "epoch": 0.06, "grad_norm": 2.024310774130299, "learning_rate": 3.1331592689295043e-06, "logits/chosen": -2.7420055866241455, "logits/rejected": -2.6680760383605957, "logps/chosen": -274.074462890625, "logps/rejected": -229.8125457763672, "loss": 0.6736, "positive_losses": 0.013821601867675781, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.10831008851528168, "rewards/margins": 0.037697240710258484, "rewards/margins_max": 0.11681525409221649, "rewards/margins_min": -0.035942185670137405, "rewards/margins_std": 0.0686887726187706, "rewards/rejected": 0.0706128478050232, "step": 240 }, { "dpo_losses": 0.6734244227409363, "epoch": 0.07, "grad_norm": 7.790031411103958, "learning_rate": 3.263707571801567e-06, "logits/chosen": -2.8077797889709473, "logits/rejected": -2.78965425491333, "logps/chosen": -258.5167541503906, "logps/rejected": -227.7363739013672, "loss": 0.6785, "positive_losses": 0.030743788927793503, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1030823141336441, "rewards/margins": 0.041508518159389496, "rewards/margins_max": 0.12676717340946198, "rewards/margins_min": -0.0373227559030056, "rewards/margins_std": 0.07170651108026505, "rewards/rejected": 0.061573781073093414, "step": 250 }, { "dpo_losses": 0.6674519777297974, "epoch": 0.07, "grad_norm": 21.425869914925933, "learning_rate": 3.3942558746736293e-06, "logits/chosen": -2.8157076835632324, "logits/rejected": -2.769394636154175, "logps/chosen": -313.24371337890625, "logps/rejected": -309.8602294921875, "loss": 0.6787, "positive_losses": 0.17785778641700745, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.11327938735485077, "rewards/margins": 0.05391327291727066, "rewards/margins_max": 0.14178535342216492, "rewards/margins_min": -0.02601126953959465, "rewards/margins_std": 0.07614488154649734, "rewards/rejected": 0.05936611816287041, "step": 260 }, { "dpo_losses": 0.6706933379173279, "epoch": 0.07, "grad_norm": 8.886625600473348, "learning_rate": 3.524804177545692e-06, "logits/chosen": -2.8070363998413086, "logits/rejected": -2.7220377922058105, "logps/chosen": -302.37078857421875, "logps/rejected": -283.0482482910156, "loss": 0.6814, "positive_losses": 0.23241576552391052, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.12265243381261826, "rewards/margins": 0.04733430594205856, "rewards/margins_max": 0.14325857162475586, "rewards/margins_min": -0.03970567509531975, "rewards/margins_std": 0.08187074214220047, "rewards/rejected": 0.0753181204199791, "step": 270 }, { "dpo_losses": 0.6768977642059326, "epoch": 0.07, "grad_norm": 17.157045106233873, "learning_rate": 3.6553524804177547e-06, "logits/chosen": -2.882512331008911, "logits/rejected": -2.8379597663879395, "logps/chosen": -280.17901611328125, "logps/rejected": -257.7303466796875, "loss": 0.6879, "positive_losses": 0.0861150249838829, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.13492551445960999, "rewards/margins": 0.034786418080329895, "rewards/margins_max": 0.13329659402370453, "rewards/margins_min": -0.062356531620025635, "rewards/margins_std": 0.08704538643360138, "rewards/rejected": 0.10013909637928009, "step": 280 }, { "dpo_losses": 0.6654553413391113, "epoch": 0.08, "grad_norm": 11.296721018141447, "learning_rate": 3.7859007832898174e-06, "logits/chosen": -2.845855236053467, "logits/rejected": -2.807690143585205, "logps/chosen": -284.0709533691406, "logps/rejected": -247.1073455810547, "loss": 0.6792, "positive_losses": 0.1357584446668625, "rewards/accuracies": 0.75, "rewards/chosen": 0.13916456699371338, "rewards/margins": 0.05807735398411751, "rewards/margins_max": 0.14726272225379944, "rewards/margins_min": -0.02012510970234871, "rewards/margins_std": 0.07666449248790741, "rewards/rejected": 0.08108720183372498, "step": 290 }, { "dpo_losses": 0.6774862408638, "epoch": 0.08, "grad_norm": 1.9556420548531384, "learning_rate": 3.9164490861618806e-06, "logits/chosen": -2.850562572479248, "logits/rejected": -2.7735071182250977, "logps/chosen": -267.1048278808594, "logps/rejected": -227.51370239257812, "loss": 0.6861, "positive_losses": 0.12432155758142471, "rewards/accuracies": 0.5625, "rewards/chosen": 0.13996531069278717, "rewards/margins": 0.03365673869848251, "rewards/margins_max": 0.1524609476327896, "rewards/margins_min": -0.049048103392124176, "rewards/margins_std": 0.0909699946641922, "rewards/rejected": 0.10630857944488525, "step": 300 }, { "epoch": 0.08, "eval_dpo_losses": 0.6716173887252808, "eval_logits/chosen": -2.778388261795044, "eval_logits/rejected": -2.738612174987793, "eval_logps/chosen": -268.2485046386719, "eval_logps/rejected": -246.83624267578125, "eval_loss": 0.6917684078216553, "eval_positive_losses": 0.07591178268194199, "eval_rewards/accuracies": 0.6840000152587891, "eval_rewards/chosen": 0.163449227809906, "eval_rewards/margins": 0.046022407710552216, "eval_rewards/margins_max": 0.19921058416366577, "eval_rewards/margins_min": -0.09013175964355469, "eval_rewards/margins_std": 0.09529687464237213, "eval_rewards/rejected": 0.11742684245109558, "eval_runtime": 428.6911, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 300 }, { "dpo_losses": 0.6766080856323242, "epoch": 0.08, "grad_norm": 2.34088203434741, "learning_rate": 4.046997389033943e-06, "logits/chosen": -2.8368277549743652, "logits/rejected": -2.798027992248535, "logps/chosen": -235.4011993408203, "logps/rejected": -237.29672241210938, "loss": 0.679, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.15762075781822205, "rewards/margins": 0.03504303842782974, "rewards/margins_max": 0.12610016763210297, "rewards/margins_min": -0.03666384145617485, "rewards/margins_std": 0.07294420897960663, "rewards/rejected": 0.1225777119398117, "step": 310 }, { "dpo_losses": 0.6741234064102173, "epoch": 0.08, "grad_norm": 5.629898312126322, "learning_rate": 4.177545691906005e-06, "logits/chosen": -2.8309075832366943, "logits/rejected": -2.7934060096740723, "logps/chosen": -266.61474609375, "logps/rejected": -242.709716796875, "loss": 0.6909, "positive_losses": 0.3503338694572449, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.16119489073753357, "rewards/margins": 0.04173852503299713, "rewards/margins_max": 0.16567359864711761, "rewards/margins_min": -0.06576590240001678, "rewards/margins_std": 0.10304777324199677, "rewards/rejected": 0.11945638805627823, "step": 320 }, { "dpo_losses": 0.6696040630340576, "epoch": 0.09, "grad_norm": 5.371980573094988, "learning_rate": 4.308093994778068e-06, "logits/chosen": -2.8499436378479004, "logits/rejected": -2.779639959335327, "logps/chosen": -272.63482666015625, "logps/rejected": -277.36822509765625, "loss": 0.6789, "positive_losses": 0.0, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18033604323863983, "rewards/margins": 0.05072510242462158, "rewards/margins_max": 0.16920921206474304, "rewards/margins_min": -0.046907443553209305, "rewards/margins_std": 0.0969356968998909, "rewards/rejected": 0.12961094081401825, "step": 330 }, { "dpo_losses": 0.6650840640068054, "epoch": 0.09, "grad_norm": 6.263659999111754, "learning_rate": 4.4386422976501306e-06, "logits/chosen": -2.8021037578582764, "logits/rejected": -2.789630174636841, "logps/chosen": -279.3359375, "logps/rejected": -256.07122802734375, "loss": 0.6814, "positive_losses": 0.13530464470386505, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18126890063285828, "rewards/margins": 0.059733688831329346, "rewards/margins_max": 0.17804959416389465, "rewards/margins_min": -0.033679597079753876, "rewards/margins_std": 0.09589993953704834, "rewards/rejected": 0.12153519690036774, "step": 340 }, { "dpo_losses": 0.6661222577095032, "epoch": 0.09, "grad_norm": 13.053108891501612, "learning_rate": 4.569190600522193e-06, "logits/chosen": -2.6898586750030518, "logits/rejected": -2.643160104751587, "logps/chosen": -262.5791320800781, "logps/rejected": -229.4337158203125, "loss": 0.6967, "positive_losses": 0.18669238686561584, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.16490976512432098, "rewards/margins": 0.05847515910863876, "rewards/margins_max": 0.19355596601963043, "rewards/margins_min": -0.07381604611873627, "rewards/margins_std": 0.11916041374206543, "rewards/rejected": 0.10643460601568222, "step": 350 }, { "dpo_losses": 0.6652417182922363, "epoch": 0.09, "grad_norm": 9.380150359357794, "learning_rate": 4.699738903394257e-06, "logits/chosen": -2.823025703430176, "logits/rejected": -2.8484368324279785, "logps/chosen": -273.8858947753906, "logps/rejected": -240.82003784179688, "loss": 0.6898, "positive_losses": 0.2703586518764496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.15409310162067413, "rewards/margins": 0.06112586334347725, "rewards/margins_max": 0.2065500020980835, "rewards/margins_min": -0.07230211794376373, "rewards/margins_std": 0.12428711354732513, "rewards/rejected": 0.09296722710132599, "step": 360 }, { "dpo_losses": 0.6730788946151733, "epoch": 0.1, "grad_norm": 8.600281165206487, "learning_rate": 4.8302872062663196e-06, "logits/chosen": -2.703054428100586, "logits/rejected": -2.6885857582092285, "logps/chosen": -260.34466552734375, "logps/rejected": -248.26809692382812, "loss": 0.7009, "positive_losses": 0.07516975700855255, "rewards/accuracies": 0.625, "rewards/chosen": 0.15891191363334656, "rewards/margins": 0.0432850606739521, "rewards/margins_max": 0.1486811488866806, "rewards/margins_min": -0.056220900267362595, "rewards/margins_std": 0.08996488898992538, "rewards/rejected": 0.11562683433294296, "step": 370 }, { "dpo_losses": 0.6764985918998718, "epoch": 0.1, "grad_norm": 6.9187812177303565, "learning_rate": 4.9608355091383814e-06, "logits/chosen": -2.75665545463562, "logits/rejected": -2.6705994606018066, "logps/chosen": -240.15225219726562, "logps/rejected": -255.17306518554688, "loss": 0.6798, "positive_losses": 0.07718296349048615, "rewards/accuracies": 0.5625, "rewards/chosen": 0.14909972250461578, "rewards/margins": 0.03865838795900345, "rewards/margins_max": 0.19323141872882843, "rewards/margins_min": -0.10901296138763428, "rewards/margins_std": 0.13268721103668213, "rewards/rejected": 0.11044134944677353, "step": 380 }, { "dpo_losses": 0.6542829275131226, "epoch": 0.1, "grad_norm": 2.1777332801987157, "learning_rate": 4.9999488562447675e-06, "logits/chosen": -2.831470251083374, "logits/rejected": -2.8036863803863525, "logps/chosen": -275.0520935058594, "logps/rejected": -252.9716339111328, "loss": 0.6914, "positive_losses": 0.4602828919887543, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.174590066075325, "rewards/margins": 0.0824054405093193, "rewards/margins_max": 0.1946953535079956, "rewards/margins_min": -0.023375285789370537, "rewards/margins_std": 0.10006286948919296, "rewards/rejected": 0.09218461066484451, "step": 390 }, { "dpo_losses": 0.678782045841217, "epoch": 0.1, "grad_norm": 9.510034180909672, "learning_rate": 4.999698361256577e-06, "logits/chosen": -2.7608649730682373, "logits/rejected": -2.7339279651641846, "logps/chosen": -239.4250030517578, "logps/rejected": -237.4720916748047, "loss": 0.7061, "positive_losses": 0.29985731840133667, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.12814535200595856, "rewards/margins": 0.03364657610654831, "rewards/margins_max": 0.1790953278541565, "rewards/margins_min": -0.10498541593551636, "rewards/margins_std": 0.12637025117874146, "rewards/rejected": 0.09449878334999084, "step": 400 }, { "epoch": 0.1, "eval_dpo_losses": 0.663825273513794, "eval_logits/chosen": -2.7626216411590576, "eval_logits/rejected": -2.724411725997925, "eval_logps/chosen": -268.4670715332031, "eval_logps/rejected": -248.8231658935547, "eval_loss": 0.6930261254310608, "eval_positive_losses": 0.17836187779903412, "eval_rewards/accuracies": 0.6980000138282776, "eval_rewards/chosen": 0.16126349568367004, "eval_rewards/margins": 0.0637059286236763, "eval_rewards/margins_max": 0.25521811842918396, "eval_rewards/margins_min": -0.11399037390947342, "eval_rewards/margins_std": 0.1221209466457367, "eval_rewards/rejected": 0.09755756705999374, "eval_runtime": 428.3692, "eval_samples_per_second": 4.669, "eval_steps_per_second": 0.292, "step": 400 }, { "dpo_losses": 0.6647549271583557, "epoch": 0.11, "grad_norm": 1.9490725781631402, "learning_rate": 4.999239142174581e-06, "logits/chosen": -2.7890048027038574, "logits/rejected": -2.7066380977630615, "logps/chosen": -265.97100830078125, "logps/rejected": -248.64566040039062, "loss": 0.699, "positive_losses": 0.39228931069374084, "rewards/accuracies": 0.6875, "rewards/chosen": 0.16712786257266998, "rewards/margins": 0.06104505807161331, "rewards/margins_max": 0.1883912980556488, "rewards/margins_min": -0.055216945707798004, "rewards/margins_std": 0.1096106767654419, "rewards/rejected": 0.10608279705047607, "step": 410 }, { "dpo_losses": 0.6541630029678345, "epoch": 0.11, "grad_norm": 4.891924700627691, "learning_rate": 4.99857123734344e-06, "logits/chosen": -2.795801877975464, "logits/rejected": -2.754772901535034, "logps/chosen": -276.148193359375, "logps/rejected": -230.367431640625, "loss": 0.6623, "positive_losses": 0.0, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1755593717098236, "rewards/margins": 0.08338401466608047, "rewards/margins_max": 0.21404854953289032, "rewards/margins_min": -0.033180076628923416, "rewards/margins_std": 0.10915927588939667, "rewards/rejected": 0.09217534959316254, "step": 420 }, { "dpo_losses": 0.6530717015266418, "epoch": 0.11, "grad_norm": 13.936764821732806, "learning_rate": 4.997694702533016e-06, "logits/chosen": -2.7750701904296875, "logits/rejected": -2.719933032989502, "logps/chosen": -335.82586669921875, "logps/rejected": -252.59884643554688, "loss": 0.6681, "positive_losses": 0.12311439216136932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.17032219469547272, "rewards/margins": 0.08721128106117249, "rewards/margins_max": 0.2493777722120285, "rewards/margins_min": -0.04909246042370796, "rewards/margins_std": 0.1305169314146042, "rewards/rejected": 0.08311090618371964, "step": 430 }, { "dpo_losses": 0.6612471342086792, "epoch": 0.12, "grad_norm": 1.9302026470311238, "learning_rate": 4.996609610933713e-06, "logits/chosen": -2.7909905910491943, "logits/rejected": -2.7806625366210938, "logps/chosen": -258.2890319824219, "logps/rejected": -246.836181640625, "loss": 0.675, "positive_losses": 0.2794044613838196, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1579817235469818, "rewards/margins": 0.06991832703351974, "rewards/margins_max": 0.23221509158611298, "rewards/margins_min": -0.07362373173236847, "rewards/margins_std": 0.13679808378219604, "rewards/rejected": 0.08806340396404266, "step": 440 }, { "dpo_losses": 0.6611306667327881, "epoch": 0.12, "grad_norm": 10.32640553068114, "learning_rate": 4.995316053150366e-06, "logits/chosen": -2.82472825050354, "logits/rejected": -2.786837339401245, "logps/chosen": -283.5440979003906, "logps/rejected": -259.4210205078125, "loss": 0.6761, "positive_losses": 0.13967056572437286, "rewards/accuracies": 0.6875, "rewards/chosen": 0.17699022591114044, "rewards/margins": 0.06885913759469986, "rewards/margins_max": 0.20350675284862518, "rewards/margins_min": -0.0634232759475708, "rewards/margins_std": 0.11859367042779922, "rewards/rejected": 0.10813107341527939, "step": 450 }, { "dpo_losses": 0.645112156867981, "epoch": 0.12, "grad_norm": 2.160228652154638, "learning_rate": 4.9938141371946815e-06, "logits/chosen": -2.7420597076416016, "logits/rejected": -2.7228333950042725, "logps/chosen": -261.9131774902344, "logps/rejected": -243.5559539794922, "loss": 0.6727, "positive_losses": 0.5096033811569214, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.17745108902454376, "rewards/margins": 0.10410416126251221, "rewards/margins_max": 0.24705567955970764, "rewards/margins_min": -0.0324539914727211, "rewards/margins_std": 0.1242934912443161, "rewards/rejected": 0.07334692776203156, "step": 460 }, { "dpo_losses": 0.6669226884841919, "epoch": 0.12, "grad_norm": 10.948951873085415, "learning_rate": 4.992103988476206e-06, "logits/chosen": -2.691591501235962, "logits/rejected": -2.732255458831787, "logps/chosen": -236.1201934814453, "logps/rejected": -237.4022979736328, "loss": 0.7036, "positive_losses": 0.3716261386871338, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.13690085709095, "rewards/margins": 0.058231890201568604, "rewards/margins_max": 0.18529482185840607, "rewards/margins_min": -0.0782652348279953, "rewards/margins_std": 0.12157370150089264, "rewards/rejected": 0.07866895943880081, "step": 470 }, { "dpo_losses": 0.66729736328125, "epoch": 0.13, "grad_norm": 6.018100457085463, "learning_rate": 4.990185749791866e-06, "logits/chosen": -2.7900872230529785, "logits/rejected": -2.7373461723327637, "logps/chosen": -274.8970031738281, "logps/rejected": -241.1278533935547, "loss": 0.681, "positive_losses": 0.15947015583515167, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17529059946537018, "rewards/margins": 0.05790730565786362, "rewards/margins_max": 0.201708123087883, "rewards/margins_min": -0.08258150517940521, "rewards/margins_std": 0.1270827353000641, "rewards/rejected": 0.11738328635692596, "step": 480 }, { "dpo_losses": 0.6618391275405884, "epoch": 0.13, "grad_norm": 14.341586612623543, "learning_rate": 4.9880595813140395e-06, "logits/chosen": -2.7179877758026123, "logits/rejected": -2.7014708518981934, "logps/chosen": -285.288330078125, "logps/rejected": -253.21340942382812, "loss": 0.6844, "positive_losses": 0.0, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19091898202896118, "rewards/margins": 0.06788579374551773, "rewards/margins_max": 0.21516986191272736, "rewards/margins_min": -0.047360509634017944, "rewards/margins_std": 0.11700733006000519, "rewards/rejected": 0.12303321063518524, "step": 490 }, { "dpo_losses": 0.6634654998779297, "epoch": 0.13, "grad_norm": 1.8399809767942166, "learning_rate": 4.985725660577184e-06, "logits/chosen": -2.754000425338745, "logits/rejected": -2.7245523929595947, "logps/chosen": -288.63446044921875, "logps/rejected": -259.36700439453125, "loss": 0.6898, "positive_losses": 0.0647762268781662, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19668762385845184, "rewards/margins": 0.0646195262670517, "rewards/margins_max": 0.20340470969676971, "rewards/margins_min": -0.06121315434575081, "rewards/margins_std": 0.11870823055505753, "rewards/rejected": 0.13206809759140015, "step": 500 }, { "epoch": 0.13, "eval_dpo_losses": 0.6642889976501465, "eval_logits/chosen": -2.7125039100646973, "eval_logits/rejected": -2.673779010772705, "eval_logps/chosen": -265.1497802734375, "eval_logps/rejected": -245.39649963378906, "eval_loss": 0.67903071641922, "eval_positive_losses": 0.06921113282442093, "eval_rewards/accuracies": 0.6959999799728394, "eval_rewards/chosen": 0.1944362074136734, "eval_rewards/margins": 0.06261204183101654, "eval_rewards/margins_max": 0.2531706988811493, "eval_rewards/margins_min": -0.10926476866006851, "eval_rewards/margins_std": 0.12040134519338608, "eval_rewards/rejected": 0.13182415068149567, "eval_runtime": 428.9817, "eval_samples_per_second": 4.662, "eval_steps_per_second": 0.291, "step": 500 }, { "dpo_losses": 0.6559068560600281, "epoch": 0.13, "grad_norm": 1.8444710307649992, "learning_rate": 4.983184182463009e-06, "logits/chosen": -2.6351840496063232, "logits/rejected": -2.620729684829712, "logps/chosen": -261.0209045410156, "logps/rejected": -240.55459594726562, "loss": 0.6704, "positive_losses": 0.1364610642194748, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.19576655328273773, "rewards/margins": 0.0826006755232811, "rewards/margins_max": 0.267486035823822, "rewards/margins_min": -0.06554356217384338, "rewards/margins_std": 0.14967602491378784, "rewards/rejected": 0.11316587030887604, "step": 510 }, { "dpo_losses": 0.653467059135437, "epoch": 0.14, "grad_norm": 1.7262342322179807, "learning_rate": 4.980435359184203e-06, "logits/chosen": -2.705235719680786, "logits/rejected": -2.660126209259033, "logps/chosen": -256.4479064941406, "logps/rejected": -239.2073211669922, "loss": 0.6789, "positive_losses": 0.117906853556633, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.17526809871196747, "rewards/margins": 0.08569283783435822, "rewards/margins_max": 0.23225037753582, "rewards/margins_min": -0.05254429578781128, "rewards/margins_std": 0.12837602198123932, "rewards/rejected": 0.08957526832818985, "step": 520 }, { "dpo_losses": 0.6582752466201782, "epoch": 0.14, "grad_norm": 2.0399299033594556, "learning_rate": 4.9774794202667236e-06, "logits/chosen": -2.7792890071868896, "logits/rejected": -2.761970043182373, "logps/chosen": -273.3455810546875, "logps/rejected": -257.6287841796875, "loss": 0.687, "positive_losses": 0.0915599837899208, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18177327513694763, "rewards/margins": 0.07460211217403412, "rewards/margins_max": 0.21025963127613068, "rewards/margins_min": -0.04098554700613022, "rewards/margins_std": 0.11133607476949692, "rewards/rejected": 0.1071711927652359, "step": 530 }, { "dpo_losses": 0.6663140654563904, "epoch": 0.14, "grad_norm": 1.940175393523082, "learning_rate": 4.974316612530615e-06, "logits/chosen": -2.7265803813934326, "logits/rejected": -2.7087855339050293, "logps/chosen": -248.11453247070312, "logps/rejected": -231.1879425048828, "loss": 0.6804, "positive_losses": 0.18669891357421875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19553527235984802, "rewards/margins": 0.060328125953674316, "rewards/margins_max": 0.21855533123016357, "rewards/margins_min": -0.09312988817691803, "rewards/margins_std": 0.13823029398918152, "rewards/rejected": 0.1352071613073349, "step": 540 }, { "dpo_losses": 0.6531503796577454, "epoch": 0.14, "grad_norm": 1.7015233339228084, "learning_rate": 4.970947200069416e-06, "logits/chosen": -2.828091621398926, "logits/rejected": -2.7830519676208496, "logps/chosen": -291.1979064941406, "logps/rejected": -248.87588500976562, "loss": 0.6786, "positive_losses": 0.040079496800899506, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.19328513741493225, "rewards/margins": 0.0860414132475853, "rewards/margins_max": 0.22265836596488953, "rewards/margins_min": -0.051799990236759186, "rewards/margins_std": 0.12175168842077255, "rewards/rejected": 0.10724373161792755, "step": 550 }, { "dpo_losses": 0.6719218492507935, "epoch": 0.15, "grad_norm": 8.428585061715763, "learning_rate": 4.967371464228096e-06, "logits/chosen": -2.766552448272705, "logits/rejected": -2.78574800491333, "logps/chosen": -279.3778991699219, "logps/rejected": -264.9951477050781, "loss": 0.6752, "positive_losses": 0.016327476128935814, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.2008594572544098, "rewards/margins": 0.04748000204563141, "rewards/margins_max": 0.1892395317554474, "rewards/margins_min": -0.09617707878351212, "rewards/margins_std": 0.12776336073875427, "rewards/rejected": 0.15337945520877838, "step": 560 }, { "dpo_losses": 0.6593919396400452, "epoch": 0.15, "grad_norm": 1.7794764205172144, "learning_rate": 4.963589703579569e-06, "logits/chosen": -2.7407279014587402, "logits/rejected": -2.7030673027038574, "logps/chosen": -236.3583984375, "logps/rejected": -221.271728515625, "loss": 0.6725, "positive_losses": 0.13829974830150604, "rewards/accuracies": 0.75, "rewards/chosen": 0.18503494560718536, "rewards/margins": 0.07332818955183029, "rewards/margins_max": 0.23580794036388397, "rewards/margins_min": -0.07303695380687714, "rewards/margins_std": 0.133176788687706, "rewards/rejected": 0.11170674860477448, "step": 570 }, { "dpo_losses": 0.6594555974006653, "epoch": 0.15, "grad_norm": 17.572357560786788, "learning_rate": 4.9596022338997615e-06, "logits/chosen": -2.803607702255249, "logits/rejected": -2.7698140144348145, "logps/chosen": -281.70526123046875, "logps/rejected": -243.89022827148438, "loss": 0.6838, "positive_losses": 0.23751580715179443, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.1932162642478943, "rewards/margins": 0.07334502786397934, "rewards/margins_max": 0.20450429618358612, "rewards/margins_min": -0.04019375517964363, "rewards/margins_std": 0.11187299340963364, "rewards/rejected": 0.11987121403217316, "step": 580 }, { "dpo_losses": 0.6541207432746887, "epoch": 0.15, "grad_norm": 1.6414851522387102, "learning_rate": 4.955409388141243e-06, "logits/chosen": -2.7634010314941406, "logits/rejected": -2.7144711017608643, "logps/chosen": -261.980712890625, "logps/rejected": -242.0223388671875, "loss": 0.6845, "positive_losses": 0.24174003303050995, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19224414229393005, "rewards/margins": 0.08591899275779724, "rewards/margins_max": 0.2521759569644928, "rewards/margins_min": -0.07810191065073013, "rewards/margins_std": 0.14326006174087524, "rewards/rejected": 0.10632514953613281, "step": 590 }, { "dpo_losses": 0.6603989601135254, "epoch": 0.16, "grad_norm": 1.6079385513407207, "learning_rate": 4.951011516405429e-06, "logits/chosen": -2.7776551246643066, "logits/rejected": -2.750886917114258, "logps/chosen": -305.2583312988281, "logps/rejected": -267.74456787109375, "loss": 0.6626, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.2138456404209137, "rewards/margins": 0.07189485430717468, "rewards/margins_max": 0.22072605788707733, "rewards/margins_min": -0.08298121392726898, "rewards/margins_std": 0.1358434110879898, "rewards/rejected": 0.141950786113739, "step": 600 }, { "epoch": 0.16, "eval_dpo_losses": 0.6581176519393921, "eval_logits/chosen": -2.7062556743621826, "eval_logits/rejected": -2.6704349517822266, "eval_logps/chosen": -265.4291687011719, "eval_logps/rejected": -247.12062072753906, "eval_loss": 0.6881689429283142, "eval_positive_losses": 0.1556549072265625, "eval_rewards/accuracies": 0.703000009059906, "eval_rewards/chosen": 0.1916424185037613, "eval_rewards/margins": 0.07705948501825333, "eval_rewards/margins_max": 0.3032918870449066, "eval_rewards/margins_min": -0.12884144484996796, "eval_rewards/margins_std": 0.14358200132846832, "eval_rewards/rejected": 0.11458291858434677, "eval_runtime": 428.688, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 600 }, { "dpo_losses": 0.641246497631073, "epoch": 0.16, "grad_norm": 9.136664890686701, "learning_rate": 4.946408985913344e-06, "logits/chosen": -2.7508132457733154, "logits/rejected": -2.67838716506958, "logps/chosen": -346.3443298339844, "logps/rejected": -260.87579345703125, "loss": 0.6637, "positive_losses": 0.1905239075422287, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22615596652030945, "rewards/margins": 0.11383726447820663, "rewards/margins_max": 0.2715848684310913, "rewards/margins_min": -0.056796569377183914, "rewards/margins_std": 0.15181510150432587, "rewards/rejected": 0.11231867223978043, "step": 610 }, { "dpo_losses": 0.6564712524414062, "epoch": 0.16, "grad_norm": 5.738761201422579, "learning_rate": 4.941602180974958e-06, "logits/chosen": -2.7260587215423584, "logits/rejected": -2.700836420059204, "logps/chosen": -279.70941162109375, "logps/rejected": -257.27618408203125, "loss": 0.6596, "positive_losses": 0.1551402062177658, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1692667454481125, "rewards/margins": 0.0832987055182457, "rewards/margins_max": 0.26042428612709045, "rewards/margins_min": -0.09459871798753738, "rewards/margins_std": 0.1611034870147705, "rewards/rejected": 0.0859680324792862, "step": 620 }, { "dpo_losses": 0.6489056348800659, "epoch": 0.16, "grad_norm": 1.8002591317465293, "learning_rate": 4.936591502957101e-06, "logits/chosen": -2.676663637161255, "logits/rejected": -2.656998634338379, "logps/chosen": -249.65414428710938, "logps/rejected": -261.31634521484375, "loss": 0.7204, "positive_losses": 0.271310418844223, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19132879376411438, "rewards/margins": 0.09678339958190918, "rewards/margins_max": 0.2735072672367096, "rewards/margins_min": -0.038702718913555145, "rewards/margins_std": 0.14061103761196136, "rewards/rejected": 0.0945453941822052, "step": 630 }, { "dpo_losses": 0.6572433114051819, "epoch": 0.17, "grad_norm": 10.31914588173438, "learning_rate": 4.931377370249946e-06, "logits/chosen": -2.6811020374298096, "logits/rejected": -2.6324188709259033, "logps/chosen": -293.49127197265625, "logps/rejected": -248.0328826904297, "loss": 0.6866, "positive_losses": 0.3294762074947357, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20909182727336884, "rewards/margins": 0.08217711001634598, "rewards/margins_max": 0.25599515438079834, "rewards/margins_min": -0.09916047751903534, "rewards/margins_std": 0.16033844649791718, "rewards/rejected": 0.12691470980644226, "step": 640 }, { "dpo_losses": 0.6632541418075562, "epoch": 0.17, "grad_norm": 9.821545361901988, "learning_rate": 4.925960218232073e-06, "logits/chosen": -2.667874574661255, "logits/rejected": -2.669560432434082, "logps/chosen": -240.31991577148438, "logps/rejected": -239.5518341064453, "loss": 0.6786, "positive_losses": 0.1409328430891037, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.1838376820087433, "rewards/margins": 0.06511926651000977, "rewards/margins_max": 0.20776009559631348, "rewards/margins_min": -0.06922709196805954, "rewards/margins_std": 0.12260917574167252, "rewards/rejected": 0.11871840804815292, "step": 650 }, { "dpo_losses": 0.6515302658081055, "epoch": 0.17, "grad_norm": 14.749942068212505, "learning_rate": 4.920340499234116e-06, "logits/chosen": -2.7395052909851074, "logits/rejected": -2.7221152782440186, "logps/chosen": -242.5193328857422, "logps/rejected": -246.7387237548828, "loss": 0.7121, "positive_losses": 0.5331507921218872, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21133026480674744, "rewards/margins": 0.09330625087022781, "rewards/margins_max": 0.2816157042980194, "rewards/margins_min": -0.10383447259664536, "rewards/margins_std": 0.17130884528160095, "rewards/rejected": 0.11802403628826141, "step": 660 }, { "dpo_losses": 0.6704595685005188, "epoch": 0.18, "grad_norm": 1.8198004399397298, "learning_rate": 4.914518682500995e-06, "logits/chosen": -2.7302098274230957, "logits/rejected": -2.672208309173584, "logps/chosen": -255.2490692138672, "logps/rejected": -227.4058837890625, "loss": 0.7089, "positive_losses": 0.39662402868270874, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1763489544391632, "rewards/margins": 0.05407591536641121, "rewards/margins_max": 0.22863993048667908, "rewards/margins_min": -0.11319337785243988, "rewards/margins_std": 0.15389451384544373, "rewards/rejected": 0.1222730427980423, "step": 670 }, { "dpo_losses": 0.660301148891449, "epoch": 0.18, "grad_norm": 1.801919639370543, "learning_rate": 4.9084952541527315e-06, "logits/chosen": -2.583868980407715, "logits/rejected": -2.563411235809326, "logps/chosen": -258.68084716796875, "logps/rejected": -232.2401123046875, "loss": 0.6765, "positive_losses": 0.26405686140060425, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20509466528892517, "rewards/margins": 0.07166479527950287, "rewards/margins_max": 0.22515162825584412, "rewards/margins_min": -0.06914964318275452, "rewards/margins_std": 0.12898924946784973, "rewards/rejected": 0.1334298849105835, "step": 680 }, { "dpo_losses": 0.6609671115875244, "epoch": 0.18, "grad_norm": 1.6964021336495965, "learning_rate": 4.902270717143858e-06, "logits/chosen": -2.653022289276123, "logits/rejected": -2.6309311389923096, "logps/chosen": -269.4786071777344, "logps/rejected": -230.60888671875, "loss": 0.6673, "positive_losses": 0.22941379249095917, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.17261944711208344, "rewards/margins": 0.07039458304643631, "rewards/margins_max": 0.20210058987140656, "rewards/margins_min": -0.06099690869450569, "rewards/margins_std": 0.11795171350240707, "rewards/rejected": 0.10222486406564713, "step": 690 }, { "dpo_losses": 0.6613507866859436, "epoch": 0.18, "grad_norm": 10.252681846540773, "learning_rate": 4.895845591221427e-06, "logits/chosen": -2.6758804321289062, "logits/rejected": -2.645129442214966, "logps/chosen": -255.1545867919922, "logps/rejected": -217.7654266357422, "loss": 0.6734, "positive_losses": 0.3122512698173523, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18215101957321167, "rewards/margins": 0.07005932182073593, "rewards/margins_max": 0.21361741423606873, "rewards/margins_min": -0.0689416378736496, "rewards/margins_std": 0.1241738423705101, "rewards/rejected": 0.11209169775247574, "step": 700 }, { "epoch": 0.18, "eval_dpo_losses": 0.6578969359397888, "eval_logits/chosen": -2.666285514831543, "eval_logits/rejected": -2.626643180847168, "eval_logps/chosen": -264.9039001464844, "eval_logps/rejected": -246.63800048828125, "eval_loss": 0.6858479380607605, "eval_positive_losses": 0.1192430704832077, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": 0.19689512252807617, "eval_rewards/margins": 0.0774858370423317, "eval_rewards/margins_max": 0.30344513058662415, "eval_rewards/margins_min": -0.12545056641101837, "eval_rewards/margins_std": 0.14276309311389923, "eval_rewards/rejected": 0.11940930038690567, "eval_runtime": 428.6344, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 700 }, { "dpo_losses": 0.6717836856842041, "epoch": 0.19, "grad_norm": 1.7301946900909437, "learning_rate": 4.8892204128816e-06, "logits/chosen": -2.6657819747924805, "logits/rejected": -2.655238628387451, "logps/chosen": -193.959228515625, "logps/rejected": -202.8900146484375, "loss": 0.667, "positive_losses": 0.04419295862317085, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17174428701400757, "rewards/margins": 0.04585854336619377, "rewards/margins_max": 0.15950840711593628, "rewards/margins_min": -0.058240942656993866, "rewards/margins_std": 0.09776998311281204, "rewards/rejected": 0.1258857697248459, "step": 710 }, { "dpo_losses": 0.656749963760376, "epoch": 0.19, "grad_norm": 2.0937206209313564, "learning_rate": 4.882395735324864e-06, "logits/chosen": -2.617654323577881, "logits/rejected": -2.5914931297302246, "logps/chosen": -304.9457702636719, "logps/rejected": -290.8948974609375, "loss": 0.6686, "positive_losses": 0.26902008056640625, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21568791568279266, "rewards/margins": 0.08263921737670898, "rewards/margins_max": 0.29247573018074036, "rewards/margins_min": -0.09150998294353485, "rewards/margins_std": 0.1693994104862213, "rewards/rejected": 0.13304868340492249, "step": 720 }, { "dpo_losses": 0.6433828473091125, "epoch": 0.19, "grad_norm": 2.074264267244222, "learning_rate": 4.87537212840983e-06, "logits/chosen": -2.724853515625, "logits/rejected": -2.6763432025909424, "logps/chosen": -306.9518127441406, "logps/rejected": -271.6395568847656, "loss": 0.6737, "positive_losses": 0.20783910155296326, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21737425029277802, "rewards/margins": 0.1093602180480957, "rewards/margins_max": 0.27868980169296265, "rewards/margins_min": -0.04861774295568466, "rewards/margins_std": 0.15080413222312927, "rewards/rejected": 0.10801403224468231, "step": 730 }, { "dpo_losses": 0.6573991179466248, "epoch": 0.19, "grad_norm": 4.600266175239933, "learning_rate": 4.8681501786056545e-06, "logits/chosen": -2.676056146621704, "logits/rejected": -2.7281899452209473, "logps/chosen": -250.2340850830078, "logps/rejected": -292.02325439453125, "loss": 0.6884, "positive_losses": 0.26742249727249146, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18083827197551727, "rewards/margins": 0.07916980236768723, "rewards/margins_max": 0.2478286474943161, "rewards/margins_min": -0.07943885773420334, "rewards/margins_std": 0.143642395734787, "rewards/rejected": 0.10166845470666885, "step": 740 }, { "dpo_losses": 0.656650960445404, "epoch": 0.2, "grad_norm": 2.024032210806157, "learning_rate": 4.860730488943068e-06, "logits/chosen": -2.694767713546753, "logits/rejected": -2.7098731994628906, "logps/chosen": -281.2164611816406, "logps/rejected": -278.75286865234375, "loss": 0.6814, "positive_losses": 0.14628362655639648, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21069040894508362, "rewards/margins": 0.0801982507109642, "rewards/margins_max": 0.2301901876926422, "rewards/margins_min": -0.06396922469139099, "rewards/margins_std": 0.13221822679042816, "rewards/rejected": 0.1304921805858612, "step": 750 }, { "dpo_losses": 0.6397830247879028, "epoch": 0.2, "grad_norm": 1.826267635670237, "learning_rate": 4.853113678964022e-06, "logits/chosen": -2.7526934146881104, "logits/rejected": -2.6893749237060547, "logps/chosen": -255.79043579101562, "logps/rejected": -229.5143585205078, "loss": 0.6488, "positive_losses": 0.0, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22755758464336395, "rewards/margins": 0.11574671417474747, "rewards/margins_max": 0.26273924112319946, "rewards/margins_min": -0.014345327392220497, "rewards/margins_std": 0.12416696548461914, "rewards/rejected": 0.11181087791919708, "step": 760 }, { "dpo_losses": 0.63740473985672, "epoch": 0.2, "grad_norm": 2.073612447801951, "learning_rate": 4.845300384669958e-06, "logits/chosen": -2.728501081466675, "logits/rejected": -2.6738760471343994, "logps/chosen": -297.43414306640625, "logps/rejected": -259.41802978515625, "loss": 0.6551, "positive_losses": 0.02582397498190403, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2262098491191864, "rewards/margins": 0.12691542506217957, "rewards/margins_max": 0.3481551706790924, "rewards/margins_min": -0.08779765665531158, "rewards/margins_std": 0.19468773901462555, "rewards/rejected": 0.09929443150758743, "step": 770 }, { "dpo_losses": 0.6497555375099182, "epoch": 0.2, "grad_norm": 10.939153753324211, "learning_rate": 4.837291258468701e-06, "logits/chosen": -2.7370822429656982, "logits/rejected": -2.697481393814087, "logps/chosen": -289.6043395996094, "logps/rejected": -253.1317596435547, "loss": 0.7112, "positive_losses": 0.41278189420700073, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20192649960517883, "rewards/margins": 0.0962577611207962, "rewards/margins_max": 0.28686782717704773, "rewards/margins_min": -0.08795861154794693, "rewards/margins_std": 0.16655433177947998, "rewards/rejected": 0.10566870868206024, "step": 780 }, { "dpo_losses": 0.6536123752593994, "epoch": 0.21, "grad_norm": 2.014352441105708, "learning_rate": 4.829086969119984e-06, "logits/chosen": -2.726776599884033, "logits/rejected": -2.7007291316986084, "logps/chosen": -251.7963104248047, "logps/rejected": -253.19546508789062, "loss": 0.7048, "positive_losses": 0.280552476644516, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.18638314306735992, "rewards/margins": 0.0886508971452713, "rewards/margins_max": 0.2712864279747009, "rewards/margins_min": -0.08036483079195023, "rewards/margins_std": 0.15787658095359802, "rewards/rejected": 0.09773224592208862, "step": 790 }, { "dpo_losses": 0.6422048807144165, "epoch": 0.21, "grad_norm": 9.471876175998979, "learning_rate": 4.820688201679605e-06, "logits/chosen": -2.7207765579223633, "logits/rejected": -2.687804698944092, "logps/chosen": -329.40106201171875, "logps/rejected": -263.5412902832031, "loss": 0.6609, "positive_losses": 0.2059955596923828, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2225791960954666, "rewards/margins": 0.11335919052362442, "rewards/margins_max": 0.341458797454834, "rewards/margins_min": -0.04586971923708916, "rewards/margins_std": 0.1757529079914093, "rewards/rejected": 0.109219990670681, "step": 800 }, { "epoch": 0.21, "eval_dpo_losses": 0.6530181169509888, "eval_logits/chosen": -2.7101621627807617, "eval_logits/rejected": -2.6689047813415527, "eval_logps/chosen": -264.641357421875, "eval_logps/rejected": -247.5410614013672, "eval_loss": 0.6883417963981628, "eval_positive_losses": 0.17954860627651215, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": 0.19952091574668884, "eval_rewards/margins": 0.0891420766711235, "eval_rewards/margins_max": 0.3443031907081604, "eval_rewards/margins_min": -0.1329580694437027, "eval_rewards/margins_std": 0.15895424783229828, "eval_rewards/rejected": 0.11037883907556534, "eval_runtime": 428.9257, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 800 }, { "dpo_losses": 0.644654393196106, "epoch": 0.21, "grad_norm": 32.865065781588285, "learning_rate": 4.8120956574422315e-06, "logits/chosen": -2.757279396057129, "logits/rejected": -2.706714153289795, "logps/chosen": -267.24017333984375, "logps/rejected": -268.2978820800781, "loss": 0.6791, "positive_losses": 0.171641543507576, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.20262300968170166, "rewards/margins": 0.10689127445220947, "rewards/margins_max": 0.3089839518070221, "rewards/margins_min": -0.08225008845329285, "rewards/margins_std": 0.17407983541488647, "rewards/rejected": 0.09573175013065338, "step": 810 }, { "dpo_losses": 0.6347582340240479, "epoch": 0.21, "grad_norm": 2.133849398460458, "learning_rate": 4.803310053882831e-06, "logits/chosen": -2.7158467769622803, "logits/rejected": -2.643524169921875, "logps/chosen": -241.708740234375, "logps/rejected": -211.9954833984375, "loss": 0.6992, "positive_losses": 0.7901833057403564, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19976899027824402, "rewards/margins": 0.1295097917318344, "rewards/margins_max": 0.3093356192111969, "rewards/margins_min": -0.023181414231657982, "rewards/margins_std": 0.1538805514574051, "rewards/rejected": 0.0702591985464096, "step": 820 }, { "dpo_losses": 0.6600489616394043, "epoch": 0.22, "grad_norm": 1.9906556466945862, "learning_rate": 4.794332124596775e-06, "logits/chosen": -2.73274302482605, "logits/rejected": -2.693387031555176, "logps/chosen": -310.03228759765625, "logps/rejected": -289.34808349609375, "loss": 0.7112, "positive_losses": 0.6938531994819641, "rewards/accuracies": 0.625, "rewards/chosen": 0.1901048719882965, "rewards/margins": 0.07919653505086899, "rewards/margins_max": 0.27788442373275757, "rewards/margins_min": -0.1256086230278015, "rewards/margins_std": 0.18551048636436462, "rewards/rejected": 0.11090834438800812, "step": 830 }, { "dpo_losses": 0.6633915901184082, "epoch": 0.22, "grad_norm": 13.501522402223179, "learning_rate": 4.785162619238575e-06, "logits/chosen": -2.7413976192474365, "logits/rejected": -2.703176259994507, "logps/chosen": -287.1868896484375, "logps/rejected": -249.82577514648438, "loss": 0.6835, "positive_losses": 0.25547829270362854, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19128718972206116, "rewards/margins": 0.06747514009475708, "rewards/margins_max": 0.252020925283432, "rewards/margins_min": -0.08707233518362045, "rewards/margins_std": 0.15036623179912567, "rewards/rejected": 0.12381205707788467, "step": 840 }, { "dpo_losses": 0.6518365740776062, "epoch": 0.22, "grad_norm": 16.315205901352137, "learning_rate": 4.775802303459288e-06, "logits/chosen": -2.728170394897461, "logits/rejected": -2.6817569732666016, "logps/chosen": -278.39508056640625, "logps/rejected": -261.9521179199219, "loss": 0.6593, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1942692995071411, "rewards/margins": 0.09106271713972092, "rewards/margins_max": 0.2513071298599243, "rewards/margins_min": -0.06500595808029175, "rewards/margins_std": 0.14592596888542175, "rewards/rejected": 0.1032065600156784, "step": 850 }, { "dpo_losses": 0.6661285161972046, "epoch": 0.23, "grad_norm": 11.161606472430845, "learning_rate": 4.766251958842589e-06, "logits/chosen": -2.7020115852355957, "logits/rejected": -2.7151379585266113, "logps/chosen": -207.96890258789062, "logps/rejected": -219.1763458251953, "loss": 0.6991, "positive_losses": 0.3840135633945465, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.17543333768844604, "rewards/margins": 0.06088130921125412, "rewards/margins_max": 0.22354106605052948, "rewards/margins_min": -0.08830691874027252, "rewards/margins_std": 0.13910332322120667, "rewards/rejected": 0.11455200612545013, "step": 860 }, { "dpo_losses": 0.6467432379722595, "epoch": 0.23, "grad_norm": 13.833133803311254, "learning_rate": 4.7565123828395066e-06, "logits/chosen": -2.6408803462982178, "logits/rejected": -2.634593963623047, "logps/chosen": -301.78521728515625, "logps/rejected": -235.86532592773438, "loss": 0.7119, "positive_losses": 0.9385786056518555, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21764317154884338, "rewards/margins": 0.1273972988128662, "rewards/margins_max": 0.4656582474708557, "rewards/margins_min": -0.10134243965148926, "rewards/margins_std": 0.2609899938106537, "rewards/rejected": 0.09024586528539658, "step": 870 }, { "dpo_losses": 0.6767698526382446, "epoch": 0.23, "grad_norm": 2.0617563467583744, "learning_rate": 4.746584388701831e-06, "logits/chosen": -2.6994736194610596, "logits/rejected": -2.697141647338867, "logps/chosen": -251.4209442138672, "logps/rejected": -290.01910400390625, "loss": 0.6942, "positive_losses": 0.050130270421504974, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.17999081313610077, "rewards/margins": 0.04156552627682686, "rewards/margins_max": 0.21716555953025818, "rewards/margins_min": -0.16058142483234406, "rewards/margins_std": 0.17269611358642578, "rewards/rejected": 0.13842527568340302, "step": 880 }, { "dpo_losses": 0.6655186414718628, "epoch": 0.23, "grad_norm": 7.147921717645752, "learning_rate": 4.736468805414218e-06, "logits/chosen": -2.6926393508911133, "logits/rejected": -2.6765496730804443, "logps/chosen": -291.87615966796875, "logps/rejected": -265.9792785644531, "loss": 0.7001, "positive_losses": 0.6028454303741455, "rewards/accuracies": 0.625, "rewards/chosen": 0.18655279278755188, "rewards/margins": 0.06399138271808624, "rewards/margins_max": 0.262087881565094, "rewards/margins_min": -0.11333304643630981, "rewards/margins_std": 0.16814734041690826, "rewards/rejected": 0.12256141752004623, "step": 890 }, { "dpo_losses": 0.6704162955284119, "epoch": 0.24, "grad_norm": 1.8413939859583113, "learning_rate": 4.7261664776249595e-06, "logits/chosen": -2.695251941680908, "logits/rejected": -2.6767027378082275, "logps/chosen": -244.72549438476562, "logps/rejected": -263.5853576660156, "loss": 0.6772, "positive_losses": 0.04858360439538956, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2000569850206375, "rewards/margins": 0.05427966266870499, "rewards/margins_max": 0.24951040744781494, "rewards/margins_min": -0.1446894109249115, "rewards/margins_std": 0.17368021607398987, "rewards/rejected": 0.14577731490135193, "step": 900 }, { "epoch": 0.24, "eval_dpo_losses": 0.6531457304954529, "eval_logits/chosen": -2.6914944648742676, "eval_logits/rejected": -2.651089668273926, "eval_logps/chosen": -264.3727722167969, "eval_logps/rejected": -247.279296875, "eval_loss": 0.6838503479957581, "eval_positive_losses": 0.1725376397371292, "eval_rewards/accuracies": 0.6880000233650208, "eval_rewards/chosen": 0.20220635831356049, "eval_rewards/margins": 0.08920986950397491, "eval_rewards/margins_max": 0.3503573536872864, "eval_rewards/margins_min": -0.1380314975976944, "eval_rewards/margins_std": 0.16318261623382568, "eval_rewards/rejected": 0.11299646645784378, "eval_runtime": 428.5468, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.292, "step": 900 }, { "dpo_losses": 0.6476281881332397, "epoch": 0.24, "grad_norm": 12.48126403471607, "learning_rate": 4.715678265575463e-06, "logits/chosen": -2.7070693969726562, "logits/rejected": -2.6706814765930176, "logps/chosen": -246.9974365234375, "logps/rejected": -209.4190673828125, "loss": 0.6851, "positive_losses": 0.1288740187883377, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2026553601026535, "rewards/margins": 0.10049331188201904, "rewards/margins_max": 0.2798111140727997, "rewards/margins_min": -0.07261396944522858, "rewards/margins_std": 0.1584080457687378, "rewards/rejected": 0.10216206312179565, "step": 910 }, { "dpo_losses": 0.6598332524299622, "epoch": 0.24, "grad_norm": 8.431915469591006, "learning_rate": 4.705005045028415e-06, "logits/chosen": -2.6534323692321777, "logits/rejected": -2.6535675525665283, "logps/chosen": -262.9512634277344, "logps/rejected": -258.3651428222656, "loss": 0.6877, "positive_losses": 0.18950042128562927, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.19684894382953644, "rewards/margins": 0.07885893434286118, "rewards/margins_max": 0.27929168939590454, "rewards/margins_min": -0.0989658385515213, "rewards/margins_std": 0.16988089680671692, "rewards/rejected": 0.11798999458551407, "step": 920 }, { "dpo_losses": 0.6389893293380737, "epoch": 0.24, "grad_norm": 2.0299211116551934, "learning_rate": 4.694147707194659e-06, "logits/chosen": -2.686340093612671, "logits/rejected": -2.643683433532715, "logps/chosen": -307.39239501953125, "logps/rejected": -261.57684326171875, "loss": 0.6759, "positive_losses": 0.27537959814071655, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2219519168138504, "rewards/margins": 0.1232791393995285, "rewards/margins_max": 0.3459230959415436, "rewards/margins_min": -0.12194018065929413, "rewards/margins_std": 0.21176505088806152, "rewards/rejected": 0.0986727625131607, "step": 930 }, { "dpo_losses": 0.660244345664978, "epoch": 0.25, "grad_norm": 19.834565111051244, "learning_rate": 4.683107158658782e-06, "logits/chosen": -2.737499713897705, "logits/rejected": -2.7289881706237793, "logps/chosen": -257.3228759765625, "logps/rejected": -261.7406921386719, "loss": 0.7249, "positive_losses": 0.3064068853855133, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19655682146549225, "rewards/margins": 0.0728425458073616, "rewards/margins_max": 0.2391335517168045, "rewards/margins_min": -0.08707662671804428, "rewards/margins_std": 0.1448540985584259, "rewards/rejected": 0.12371426820755005, "step": 940 }, { "dpo_losses": 0.6490105390548706, "epoch": 0.25, "grad_norm": 2.0333265987138796, "learning_rate": 4.671884321303407e-06, "logits/chosen": -2.732962131500244, "logits/rejected": -2.751864433288574, "logps/chosen": -247.425537109375, "logps/rejected": -275.36541748046875, "loss": 0.6792, "positive_losses": 0.39264431595802307, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1909867823123932, "rewards/margins": 0.09982354193925858, "rewards/margins_max": 0.3180977702140808, "rewards/margins_min": -0.10292227566242218, "rewards/margins_std": 0.1878553330898285, "rewards/rejected": 0.09116323292255402, "step": 950 }, { "dpo_losses": 0.6445702314376831, "epoch": 0.25, "grad_norm": 1.9802247147247922, "learning_rate": 4.660480132232224e-06, "logits/chosen": -2.7283406257629395, "logits/rejected": -2.6318576335906982, "logps/chosen": -310.75347900390625, "logps/rejected": -248.46890258789062, "loss": 0.678, "positive_losses": 0.43692511320114136, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.19033241271972656, "rewards/margins": 0.10816697031259537, "rewards/margins_max": 0.2987057566642761, "rewards/margins_min": -0.09911347925662994, "rewards/margins_std": 0.18026554584503174, "rewards/rejected": 0.0821654349565506, "step": 960 }, { "dpo_losses": 0.6424090266227722, "epoch": 0.25, "grad_norm": 13.668252310219486, "learning_rate": 4.6488955436917414e-06, "logits/chosen": -2.7777047157287598, "logits/rejected": -2.7694575786590576, "logps/chosen": -292.04925537109375, "logps/rejected": -263.6664123535156, "loss": 0.675, "positive_losses": 0.2722419798374176, "rewards/accuracies": 0.75, "rewards/chosen": 0.18903517723083496, "rewards/margins": 0.1125110536813736, "rewards/margins_max": 0.292854368686676, "rewards/margins_min": -0.0723922997713089, "rewards/margins_std": 0.16320811212062836, "rewards/rejected": 0.07652413845062256, "step": 970 }, { "dpo_losses": 0.652771532535553, "epoch": 0.26, "grad_norm": 5.038909957114555, "learning_rate": 4.6371315229917644e-06, "logits/chosen": -2.796520233154297, "logits/rejected": -2.785407543182373, "logps/chosen": -261.3905944824219, "logps/rejected": -223.49746704101562, "loss": 0.6749, "positive_losses": 0.2700274586677551, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.18703484535217285, "rewards/margins": 0.0883074700832367, "rewards/margins_max": 0.244595006108284, "rewards/margins_min": -0.06791295856237411, "rewards/margins_std": 0.1423143595457077, "rewards/rejected": 0.09872739017009735, "step": 980 }, { "dpo_losses": 0.6566593050956726, "epoch": 0.26, "grad_norm": 8.451568403786762, "learning_rate": 4.625189052424638e-06, "logits/chosen": -2.6437346935272217, "logits/rejected": -2.6022567749023438, "logps/chosen": -274.94598388671875, "logps/rejected": -258.070556640625, "loss": 0.838, "positive_losses": 3.820669174194336, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1613970845937729, "rewards/margins": 0.08419553935527802, "rewards/margins_max": 0.26956096291542053, "rewards/margins_min": -0.1393173336982727, "rewards/margins_std": 0.185796856880188, "rewards/rejected": 0.07720156013965607, "step": 990 }, { "dpo_losses": 0.6721663475036621, "epoch": 0.26, "grad_norm": 11.694826491864028, "learning_rate": 4.613069129183218e-06, "logits/chosen": -2.714327335357666, "logits/rejected": -2.7002182006835938, "logps/chosen": -252.7064666748047, "logps/rejected": -240.7244873046875, "loss": 0.6919, "positive_losses": 0.2932225465774536, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.18922224640846252, "rewards/margins": 0.05068572238087654, "rewards/margins_max": 0.24727725982666016, "rewards/margins_min": -0.12820494174957275, "rewards/margins_std": 0.16742168366909027, "rewards/rejected": 0.1385365128517151, "step": 1000 }, { "epoch": 0.26, "eval_dpo_losses": 0.6541987657546997, "eval_logits/chosen": -2.6966404914855957, "eval_logits/rejected": -2.6584105491638184, "eval_logps/chosen": -263.4407043457031, "eval_logps/rejected": -246.06858825683594, "eval_loss": 0.674384355545044, "eval_positive_losses": 0.12830524146556854, "eval_rewards/accuracies": 0.7009999752044678, "eval_rewards/chosen": 0.21152688562870026, "eval_rewards/margins": 0.08642362803220749, "eval_rewards/margins_max": 0.3385370969772339, "eval_rewards/margins_min": -0.13127902150154114, "eval_rewards/margins_std": 0.15737108886241913, "eval_rewards/rejected": 0.12510326504707336, "eval_runtime": 428.8127, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 1000 }, { "dpo_losses": 0.641662061214447, "epoch": 0.26, "grad_norm": 10.626894434167557, "learning_rate": 4.600772765277607e-06, "logits/chosen": -2.7660765647888184, "logits/rejected": -2.722968339920044, "logps/chosen": -266.34222412109375, "logps/rejected": -267.46661376953125, "loss": 0.702, "positive_losses": 0.22415924072265625, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20177514851093292, "rewards/margins": 0.1848360002040863, "rewards/margins_max": 0.6389064788818359, "rewards/margins_min": -0.03613171726465225, "rewards/margins_std": 0.31833499670028687, "rewards/rejected": 0.016939152032136917, "step": 1010 }, { "dpo_losses": 0.6729387044906616, "epoch": 0.27, "grad_norm": 11.603685096231283, "learning_rate": 4.588300987450652e-06, "logits/chosen": -2.752546787261963, "logits/rejected": -2.724853754043579, "logps/chosen": -264.8641357421875, "logps/rejected": -264.7480773925781, "loss": 0.6872, "positive_losses": 0.3430689871311188, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.1882530003786087, "rewards/margins": 0.046753257513046265, "rewards/margins_max": 0.20113499462604523, "rewards/margins_min": -0.11619193851947784, "rewards/margins_std": 0.14573058485984802, "rewards/rejected": 0.14149974286556244, "step": 1020 }, { "dpo_losses": 0.6509944796562195, "epoch": 0.27, "grad_norm": 2.139521962009652, "learning_rate": 4.5756548370922136e-06, "logits/chosen": -2.6962647438049316, "logits/rejected": -2.6954352855682373, "logps/chosen": -277.3344421386719, "logps/rejected": -281.9914855957031, "loss": 0.6698, "positive_losses": 0.0366668701171875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22435882687568665, "rewards/margins": 0.09524282068014145, "rewards/margins_max": 0.2957554757595062, "rewards/margins_min": -0.11324095726013184, "rewards/margins_std": 0.18168717622756958, "rewards/rejected": 0.1291159838438034, "step": 1030 }, { "dpo_losses": 0.6445959210395813, "epoch": 0.27, "grad_norm": 1.8515225820624515, "learning_rate": 4.562835370152206e-06, "logits/chosen": -2.6695055961608887, "logits/rejected": -2.6389050483703613, "logps/chosen": -254.07485961914062, "logps/rejected": -231.9303436279297, "loss": 0.6658, "positive_losses": 0.07041053473949432, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23283395171165466, "rewards/margins": 0.10802946984767914, "rewards/margins_max": 0.307711660861969, "rewards/margins_min": -0.05454155057668686, "rewards/margins_std": 0.165231853723526, "rewards/rejected": 0.12480449676513672, "step": 1040 }, { "dpo_losses": 0.6566747426986694, "epoch": 0.27, "grad_norm": 18.894892637256174, "learning_rate": 4.54984365705243e-06, "logits/chosen": -2.70576810836792, "logits/rejected": -2.6664857864379883, "logps/chosen": -295.703369140625, "logps/rejected": -297.4399719238281, "loss": 0.6703, "positive_losses": 0.26898860931396484, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.24375157058238983, "rewards/margins": 0.08427192270755768, "rewards/margins_max": 0.29443126916885376, "rewards/margins_min": -0.12378156185150146, "rewards/margins_std": 0.1837128847837448, "rewards/rejected": 0.15947964787483215, "step": 1050 }, { "dpo_losses": 0.6367157101631165, "epoch": 0.28, "grad_norm": 1.9230237799417447, "learning_rate": 4.536680782597191e-06, "logits/chosen": -2.7162742614746094, "logits/rejected": -2.691763401031494, "logps/chosen": -304.84722900390625, "logps/rejected": -283.64190673828125, "loss": 0.6475, "positive_losses": 0.0, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2298576384782791, "rewards/margins": 0.12448763847351074, "rewards/margins_max": 0.32968512177467346, "rewards/margins_min": -0.023798024281859398, "rewards/margins_std": 0.16265524923801422, "rewards/rejected": 0.10537000745534897, "step": 1060 }, { "dpo_losses": 0.6413639187812805, "epoch": 0.28, "grad_norm": 5.949947440291917, "learning_rate": 4.523347845882718e-06, "logits/chosen": -2.7673373222351074, "logits/rejected": -2.7692039012908936, "logps/chosen": -230.3128662109375, "logps/rejected": -231.18142700195312, "loss": 0.6945, "positive_losses": 0.36742934584617615, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.1888483613729477, "rewards/margins": 0.11430616676807404, "rewards/margins_max": 0.3059723377227783, "rewards/margins_min": -0.0899059846997261, "rewards/margins_std": 0.17463162541389465, "rewards/rejected": 0.07454220950603485, "step": 1070 }, { "dpo_losses": 0.6528059244155884, "epoch": 0.28, "grad_norm": 1.8483695010449583, "learning_rate": 4.50984596020539e-06, "logits/chosen": -2.7179977893829346, "logits/rejected": -2.603659152984619, "logps/chosen": -269.09881591796875, "logps/rejected": -245.3249969482422, "loss": 0.706, "positive_losses": 0.4951631426811218, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20286288857460022, "rewards/margins": 0.09366341680288315, "rewards/margins_max": 0.29101991653442383, "rewards/margins_min": -0.13122287392616272, "rewards/margins_std": 0.19200639426708221, "rewards/rejected": 0.10919946432113647, "step": 1080 }, { "dpo_losses": 0.6374953389167786, "epoch": 0.29, "grad_norm": 2.1313382038199618, "learning_rate": 4.4961762529687745e-06, "logits/chosen": -2.734699249267578, "logits/rejected": -2.702521800994873, "logps/chosen": -246.2765655517578, "logps/rejected": -222.8563690185547, "loss": 0.6458, "positive_losses": 0.11191920936107635, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.2176026999950409, "rewards/margins": 0.12468305975198746, "rewards/margins_max": 0.3307117819786072, "rewards/margins_min": -0.06565960496664047, "rewards/margins_std": 0.18037711083889008, "rewards/rejected": 0.09291966259479523, "step": 1090 }, { "dpo_losses": 0.6489697694778442, "epoch": 0.29, "grad_norm": 7.72971667640289, "learning_rate": 4.482339865589492e-06, "logits/chosen": -2.665496587753296, "logits/rejected": -2.6654245853424072, "logps/chosen": -279.18463134765625, "logps/rejected": -246.7305908203125, "loss": 0.6999, "positive_losses": 0.3637309968471527, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21064460277557373, "rewards/margins": 0.10365686565637589, "rewards/margins_max": 0.34722089767456055, "rewards/margins_min": -0.08580169826745987, "rewards/margins_std": 0.19276562333106995, "rewards/rejected": 0.10698773711919785, "step": 1100 }, { "epoch": 0.29, "eval_dpo_losses": 0.6483847498893738, "eval_logits/chosen": -2.710740089416504, "eval_logits/rejected": -2.676161766052246, "eval_logps/chosen": -263.6142883300781, "eval_logps/rejected": -247.6087646484375, "eval_loss": 0.6819132566452026, "eval_positive_losses": 0.20826822519302368, "eval_rewards/accuracies": 0.699999988079071, "eval_rewards/chosen": 0.20979158580303192, "eval_rewards/margins": 0.10008974373340607, "eval_rewards/margins_max": 0.3739788234233856, "eval_rewards/margins_min": -0.13883402943611145, "eval_rewards/margins_std": 0.17210878431797028, "eval_rewards/rejected": 0.10970184206962585, "eval_runtime": 428.8012, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 1100 }, { "dpo_losses": 0.6387425065040588, "epoch": 0.29, "grad_norm": 2.1639373412588756, "learning_rate": 4.468337953401909e-06, "logits/chosen": -2.7495522499084473, "logits/rejected": -2.7339322566986084, "logps/chosen": -265.6031799316406, "logps/rejected": -256.9521179199219, "loss": 0.6583, "positive_losses": 0.0828929916024208, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21731695532798767, "rewards/margins": 0.11999177932739258, "rewards/margins_max": 0.3173108994960785, "rewards/margins_min": -0.0539204403758049, "rewards/margins_std": 0.16713543236255646, "rewards/rejected": 0.09732518345117569, "step": 1110 }, { "dpo_losses": 0.6517058610916138, "epoch": 0.29, "grad_norm": 2.6773986828580862, "learning_rate": 4.45417168556166e-06, "logits/chosen": -2.747889757156372, "logits/rejected": -2.7046046257019043, "logps/chosen": -267.8252258300781, "logps/rejected": -222.594482421875, "loss": 0.6904, "positive_losses": 0.48121047019958496, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23185715079307556, "rewards/margins": 0.09255851805210114, "rewards/margins_max": 0.2650790512561798, "rewards/margins_min": -0.07168123126029968, "rewards/margins_std": 0.15016911923885345, "rewards/rejected": 0.13929861783981323, "step": 1120 }, { "dpo_losses": 0.6384804844856262, "epoch": 0.3, "grad_norm": 2.058088265886734, "learning_rate": 4.439842244948036e-06, "logits/chosen": -2.7641022205352783, "logits/rejected": -2.7230029106140137, "logps/chosen": -283.72637939453125, "logps/rejected": -247.41610717773438, "loss": 0.6573, "positive_losses": 0.11894264072179794, "rewards/accuracies": 0.8125, "rewards/chosen": 0.22092337906360626, "rewards/margins": 0.1186535581946373, "rewards/margins_max": 0.28430280089378357, "rewards/margins_min": -0.009220912121236324, "rewards/margins_std": 0.13258947432041168, "rewards/rejected": 0.10226980596780777, "step": 1130 }, { "dpo_losses": 0.6384243965148926, "epoch": 0.3, "grad_norm": 16.82012018004312, "learning_rate": 4.425350828065204e-06, "logits/chosen": -2.7302262783050537, "logits/rejected": -2.7000887393951416, "logps/chosen": -231.2483673095703, "logps/rejected": -217.3150634765625, "loss": 0.6655, "positive_losses": 0.3047451078891754, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20616674423217773, "rewards/margins": 0.12047781050205231, "rewards/margins_max": 0.3222366273403168, "rewards/margins_min": -0.0599980354309082, "rewards/margins_std": 0.16773714125156403, "rewards/rejected": 0.08568893373012543, "step": 1140 }, { "dpo_losses": 0.6288574934005737, "epoch": 0.3, "grad_norm": 2.1351976856129102, "learning_rate": 4.410698644942303e-06, "logits/chosen": -2.708707809448242, "logits/rejected": -2.708299160003662, "logps/chosen": -285.7297058105469, "logps/rejected": -244.5306854248047, "loss": 0.6578, "positive_losses": 0.21590347588062286, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23049168288707733, "rewards/margins": 0.1436266154050827, "rewards/margins_max": 0.38826459646224976, "rewards/margins_min": -0.05494864657521248, "rewards/margins_std": 0.19941997528076172, "rewards/rejected": 0.08686506748199463, "step": 1150 }, { "dpo_losses": 0.6439481377601624, "epoch": 0.3, "grad_norm": 2.0874043425611775, "learning_rate": 4.395886919032406e-06, "logits/chosen": -2.64715838432312, "logits/rejected": -2.6258010864257812, "logps/chosen": -215.3719482421875, "logps/rejected": -205.1241455078125, "loss": 0.682, "positive_losses": 0.7586047053337097, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18462257087230682, "rewards/margins": 0.10984931886196136, "rewards/margins_max": 0.3208647072315216, "rewards/margins_min": -0.06190117448568344, "rewards/margins_std": 0.16643205285072327, "rewards/rejected": 0.07477324455976486, "step": 1160 }, { "dpo_losses": 0.6412914395332336, "epoch": 0.31, "grad_norm": 2.0902804194786078, "learning_rate": 4.380916887110366e-06, "logits/chosen": -2.7267210483551025, "logits/rejected": -2.6894755363464355, "logps/chosen": -243.6680908203125, "logps/rejected": -254.6070556640625, "loss": 0.6945, "positive_losses": 0.32631784677505493, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.19491291046142578, "rewards/margins": 0.1161138266324997, "rewards/margins_max": 0.30505552887916565, "rewards/margins_min": -0.07447733730077744, "rewards/margins_std": 0.16869278252124786, "rewards/rejected": 0.07879908382892609, "step": 1170 }, { "dpo_losses": 0.64289790391922, "epoch": 0.31, "grad_norm": 1.883050927225588, "learning_rate": 4.365789799169539e-06, "logits/chosen": -2.7036330699920654, "logits/rejected": -2.6694464683532715, "logps/chosen": -243.7769775390625, "logps/rejected": -226.6832733154297, "loss": 0.6661, "positive_losses": 0.34683817625045776, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2082194834947586, "rewards/margins": 0.1138322576880455, "rewards/margins_max": 0.3273180425167084, "rewards/margins_min": -0.07047141343355179, "rewards/margins_std": 0.17807689309120178, "rewards/rejected": 0.0943872481584549, "step": 1180 }, { "dpo_losses": 0.6655398011207581, "epoch": 0.31, "grad_norm": 15.49620797317368, "learning_rate": 4.350506918317416e-06, "logits/chosen": -2.6926140785217285, "logits/rejected": -2.6838948726654053, "logps/chosen": -250.5306854248047, "logps/rejected": -232.6984405517578, "loss": 0.7552, "positive_losses": 1.3293288946151733, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1976785957813263, "rewards/margins": 0.06513278186321259, "rewards/margins_max": 0.24743323028087616, "rewards/margins_min": -0.12439664453268051, "rewards/margins_std": 0.16177868843078613, "rewards/rejected": 0.1325458139181137, "step": 1190 }, { "dpo_losses": 0.6657013297080994, "epoch": 0.31, "grad_norm": 1.881349620525999, "learning_rate": 4.335069520670149e-06, "logits/chosen": -2.696167469024658, "logits/rejected": -2.675083637237549, "logps/chosen": -259.9974670410156, "logps/rejected": -235.8877410888672, "loss": 0.6733, "positive_losses": 0.07209090888500214, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.22681312263011932, "rewards/margins": 0.06806282699108124, "rewards/margins_max": 0.27797073125839233, "rewards/margins_min": -0.13237114250659943, "rewards/margins_std": 0.18537095189094543, "rewards/rejected": 0.15875029563903809, "step": 1200 }, { "epoch": 0.31, "eval_dpo_losses": 0.6510069370269775, "eval_logits/chosen": -2.692028284072876, "eval_logits/rejected": -2.6589362621307373, "eval_logps/chosen": -262.9894714355469, "eval_logps/rejected": -246.43472290039062, "eval_loss": 0.6807742118835449, "eval_positive_losses": 0.19238094985485077, "eval_rewards/accuracies": 0.703000009059906, "eval_rewards/chosen": 0.21603924036026, "eval_rewards/margins": 0.09459712356328964, "eval_rewards/margins_max": 0.3759876787662506, "eval_rewards/margins_min": -0.14237606525421143, "eval_rewards/margins_std": 0.1725015491247177, "eval_rewards/rejected": 0.12144210934638977, "eval_runtime": 429.0994, "eval_samples_per_second": 4.661, "eval_steps_per_second": 0.291, "step": 1200 }, { "dpo_losses": 0.6611160039901733, "epoch": 0.32, "grad_norm": 1.7684502741946675, "learning_rate": 4.319478895246e-06, "logits/chosen": -2.7390599250793457, "logits/rejected": -2.710543155670166, "logps/chosen": -263.1366882324219, "logps/rejected": -229.7078094482422, "loss": 0.6533, "positive_losses": 0.026244735345244408, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23217184841632843, "rewards/margins": 0.07489676773548126, "rewards/margins_max": 0.30165398120880127, "rewards/margins_min": -0.12996122241020203, "rewards/margins_std": 0.19295385479927063, "rewards/rejected": 0.15727505087852478, "step": 1210 }, { "dpo_losses": 0.6445401906967163, "epoch": 0.32, "grad_norm": 1.9439185483414834, "learning_rate": 4.303736343857704e-06, "logits/chosen": -2.7497551441192627, "logits/rejected": -2.7256340980529785, "logps/chosen": -275.651123046875, "logps/rejected": -257.06024169921875, "loss": 0.7067, "positive_losses": 0.29237785935401917, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2161777913570404, "rewards/margins": 0.11061173677444458, "rewards/margins_max": 0.33916497230529785, "rewards/margins_min": -0.06498685479164124, "rewards/margins_std": 0.18643537163734436, "rewards/rejected": 0.10556602478027344, "step": 1220 }, { "dpo_losses": 0.6591473817825317, "epoch": 0.32, "grad_norm": 1.7793934834588527, "learning_rate": 4.287843181003772e-06, "logits/chosen": -2.669670820236206, "logits/rejected": -2.685515880584717, "logps/chosen": -206.4748077392578, "logps/rejected": -220.84518432617188, "loss": 0.6681, "positive_losses": 0.042365264147520065, "rewards/accuracies": 0.6875, "rewards/chosen": 0.18949341773986816, "rewards/margins": 0.07591966539621353, "rewards/margins_max": 0.2512756288051605, "rewards/margins_min": -0.07519426196813583, "rewards/margins_std": 0.1464376002550125, "rewards/rejected": 0.11357376724481583, "step": 1230 }, { "dpo_losses": 0.6370415091514587, "epoch": 0.32, "grad_norm": 1.8643633181659638, "learning_rate": 4.27180073375873e-06, "logits/chosen": -2.633789539337158, "logits/rejected": -2.663215160369873, "logps/chosen": -227.5106964111328, "logps/rejected": -215.61572265625, "loss": 0.667, "positive_losses": 0.28877371549606323, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21797780692577362, "rewards/margins": 0.12506039440631866, "rewards/margins_max": 0.32338911294937134, "rewards/margins_min": -0.024745440110564232, "rewards/margins_std": 0.15704648196697235, "rewards/rejected": 0.09291739761829376, "step": 1240 }, { "dpo_losses": 0.6495698690414429, "epoch": 0.33, "grad_norm": 9.81438840930156, "learning_rate": 4.255610341662304e-06, "logits/chosen": -2.630401611328125, "logits/rejected": -2.6542916297912598, "logps/chosen": -251.74429321289062, "logps/rejected": -267.33502197265625, "loss": 0.7337, "positive_losses": 1.2103960514068604, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19170698523521423, "rewards/margins": 0.1016342043876648, "rewards/margins_max": 0.31702300906181335, "rewards/margins_min": -0.09832672774791718, "rewards/margins_std": 0.1854063719511032, "rewards/rejected": 0.09007280319929123, "step": 1250 }, { "dpo_losses": 0.6237746477127075, "epoch": 0.33, "grad_norm": 17.469642684592714, "learning_rate": 4.2392733566075764e-06, "logits/chosen": -2.7168173789978027, "logits/rejected": -2.7174434661865234, "logps/chosen": -226.96176147460938, "logps/rejected": -213.95187377929688, "loss": 0.7315, "positive_losses": 0.5717722773551941, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20527131855487823, "rewards/margins": 0.1550363004207611, "rewards/margins_max": 0.35445767641067505, "rewards/margins_min": -0.054833848029375076, "rewards/margins_std": 0.18285340070724487, "rewards/rejected": 0.05023502558469772, "step": 1260 }, { "dpo_losses": 0.6449776887893677, "epoch": 0.33, "grad_norm": 11.343167544877923, "learning_rate": 4.2227911427280975e-06, "logits/chosen": -2.7697720527648926, "logits/rejected": -2.769394636154175, "logps/chosen": -289.546875, "logps/rejected": -288.5842590332031, "loss": 0.6956, "positive_losses": 0.005890273954719305, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2388238161802292, "rewards/margins": 0.10974308103322983, "rewards/margins_max": 0.3462539315223694, "rewards/margins_min": -0.07522787898778915, "rewards/margins_std": 0.18770408630371094, "rewards/rejected": 0.12908072769641876, "step": 1270 }, { "dpo_losses": 0.6495126485824585, "epoch": 0.33, "grad_norm": 10.64441083647489, "learning_rate": 4.206165076283983e-06, "logits/chosen": -2.660526752471924, "logits/rejected": -2.672694444656372, "logps/chosen": -257.87030029296875, "logps/rejected": -247.67324829101562, "loss": 0.6708, "positive_losses": 0.15419521927833557, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2141142189502716, "rewards/margins": 0.09941279888153076, "rewards/margins_max": 0.28904852271080017, "rewards/margins_min": -0.09349670261144638, "rewards/margins_std": 0.1741757094860077, "rewards/rejected": 0.11470142751932144, "step": 1280 }, { "dpo_losses": 0.6463828086853027, "epoch": 0.34, "grad_norm": 13.745553702271556, "learning_rate": 4.189396545546995e-06, "logits/chosen": -2.6681981086730957, "logits/rejected": -2.6637043952941895, "logps/chosen": -246.97756958007812, "logps/rejected": -245.38137817382812, "loss": 0.7208, "positive_losses": 0.5871347188949585, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21898198127746582, "rewards/margins": 0.11543124914169312, "rewards/margins_max": 0.3535284996032715, "rewards/margins_min": -0.09753818809986115, "rewards/margins_std": 0.20409724116325378, "rewards/rejected": 0.1035507470369339, "step": 1290 }, { "dpo_losses": 0.6657624840736389, "epoch": 0.34, "grad_norm": 1.8702001807220798, "learning_rate": 4.172486950684627e-06, "logits/chosen": -2.7049002647399902, "logits/rejected": -2.648792028427124, "logps/chosen": -180.62350463867188, "logps/rejected": -209.24960327148438, "loss": 0.6956, "positive_losses": 0.5532621145248413, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.18718722462654114, "rewards/margins": 0.06377638131380081, "rewards/margins_max": 0.24433700740337372, "rewards/margins_min": -0.1268782615661621, "rewards/margins_std": 0.16472597420215607, "rewards/rejected": 0.12341083586215973, "step": 1300 }, { "epoch": 0.34, "eval_dpo_losses": 0.6533894538879395, "eval_logits/chosen": -2.685936212539673, "eval_logits/rejected": -2.65126371383667, "eval_logps/chosen": -262.44921875, "eval_logps/rejected": -245.30075073242188, "eval_loss": 0.6717718839645386, "eval_positive_losses": 0.10075932741165161, "eval_rewards/accuracies": 0.7049999833106995, "eval_rewards/chosen": 0.22144196927547455, "eval_rewards/margins": 0.08866012841463089, "eval_rewards/margins_max": 0.348684698343277, "eval_rewards/margins_min": -0.13701532781124115, "eval_rewards/margins_std": 0.16302600502967834, "eval_rewards/rejected": 0.13278183341026306, "eval_runtime": 428.5934, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 1300 }, { "dpo_losses": 0.6607504487037659, "epoch": 0.34, "grad_norm": 15.357596122668163, "learning_rate": 4.155437703643182e-06, "logits/chosen": -2.685216188430786, "logits/rejected": -2.6406562328338623, "logps/chosen": -251.7557830810547, "logps/rejected": -239.54092407226562, "loss": 0.6782, "positive_losses": 0.31490644812583923, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20520582795143127, "rewards/margins": 0.07459048181772232, "rewards/margins_max": 0.258635938167572, "rewards/margins_min": -0.10191160440444946, "rewards/margins_std": 0.16240204870700836, "rewards/rejected": 0.13061535358428955, "step": 1310 }, { "dpo_losses": 0.6575291752815247, "epoch": 0.35, "grad_norm": 5.674113186685851, "learning_rate": 4.138250228029882e-06, "logits/chosen": -2.7346696853637695, "logits/rejected": -2.708397388458252, "logps/chosen": -265.3553466796875, "logps/rejected": -242.7547607421875, "loss": 0.6803, "positive_losses": 0.23917289078235626, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22491803765296936, "rewards/margins": 0.08349496126174927, "rewards/margins_max": 0.301153302192688, "rewards/margins_min": -0.14515478909015656, "rewards/margins_std": 0.1971009522676468, "rewards/rejected": 0.1414230614900589, "step": 1320 }, { "dpo_losses": 0.6387127637863159, "epoch": 0.35, "grad_norm": 1.8657015070731098, "learning_rate": 4.120925958993994e-06, "logits/chosen": -2.6975204944610596, "logits/rejected": -2.6977813243865967, "logps/chosen": -270.1856384277344, "logps/rejected": -263.95379638671875, "loss": 0.6965, "positive_losses": 0.14932651817798615, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23762302100658417, "rewards/margins": 0.1235288754105568, "rewards/margins_max": 0.33065515756607056, "rewards/margins_min": -0.08119857311248779, "rewards/margins_std": 0.18276996910572052, "rewards/rejected": 0.11409411579370499, "step": 1330 }, { "dpo_losses": 0.6390461325645447, "epoch": 0.35, "grad_norm": 2.1685616447160534, "learning_rate": 4.103466343106999e-06, "logits/chosen": -2.5646095275878906, "logits/rejected": -2.54118013381958, "logps/chosen": -325.34490966796875, "logps/rejected": -271.0790100097656, "loss": 0.6633, "positive_losses": 0.16528816521167755, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2389363795518875, "rewards/margins": 0.12369044870138168, "rewards/margins_max": 0.34082871675491333, "rewards/margins_min": -0.07639284431934357, "rewards/margins_std": 0.18984806537628174, "rewards/rejected": 0.11524595320224762, "step": 1340 }, { "dpo_losses": 0.6475824117660522, "epoch": 0.35, "grad_norm": 9.033524276385755, "learning_rate": 4.085872838241797e-06, "logits/chosen": -2.724543809890747, "logits/rejected": -2.679438591003418, "logps/chosen": -293.99957275390625, "logps/rejected": -282.8594970703125, "loss": 0.6783, "positive_losses": 0.3988454341888428, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22194123268127441, "rewards/margins": 0.10272666066884995, "rewards/margins_max": 0.32548457384109497, "rewards/margins_min": -0.10698683559894562, "rewards/margins_std": 0.18989944458007812, "rewards/rejected": 0.11921457946300507, "step": 1350 }, { "dpo_losses": 0.6559935808181763, "epoch": 0.36, "grad_norm": 2.234692851046713, "learning_rate": 4.06814691345098e-06, "logits/chosen": -2.663053035736084, "logits/rejected": -2.6754870414733887, "logps/chosen": -254.96728515625, "logps/rejected": -260.19183349609375, "loss": 0.6928, "positive_losses": 0.20500841736793518, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21983525156974792, "rewards/margins": 0.08531799167394638, "rewards/margins_max": 0.2903383672237396, "rewards/margins_min": -0.09972424060106277, "rewards/margins_std": 0.17287270724773407, "rewards/rejected": 0.13451728224754333, "step": 1360 }, { "dpo_losses": 0.6433361768722534, "epoch": 0.36, "grad_norm": 1.8750335054582856, "learning_rate": 4.050290048844171e-06, "logits/chosen": -2.746443271636963, "logits/rejected": -2.701589822769165, "logps/chosen": -276.8860168457031, "logps/rejected": -243.2723846435547, "loss": 0.6567, "positive_losses": 0.02892322465777397, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2512308657169342, "rewards/margins": 0.11041466891765594, "rewards/margins_max": 0.30071407556533813, "rewards/margins_min": -0.048938196152448654, "rewards/margins_std": 0.15528790652751923, "rewards/rejected": 0.14081618189811707, "step": 1370 }, { "dpo_losses": 0.6351478695869446, "epoch": 0.36, "grad_norm": 6.112838192810634, "learning_rate": 4.032303735464422e-06, "logits/chosen": -2.6798160076141357, "logits/rejected": -2.6576924324035645, "logps/chosen": -265.6146545410156, "logps/rejected": -233.17050170898438, "loss": 0.6659, "positive_losses": 0.32943230867385864, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2465706765651703, "rewards/margins": 0.12892796099185944, "rewards/margins_max": 0.3543568551540375, "rewards/margins_min": -0.05588211864233017, "rewards/margins_std": 0.18078508973121643, "rewards/rejected": 0.11764273792505264, "step": 1380 }, { "dpo_losses": 0.6396161913871765, "epoch": 0.36, "grad_norm": 2.034282240386281, "learning_rate": 4.014189475163727e-06, "logits/chosen": -2.740041494369507, "logits/rejected": -2.709094524383545, "logps/chosen": -235.23446655273438, "logps/rejected": -231.8503875732422, "loss": 0.6625, "positive_losses": 0.03492698818445206, "rewards/accuracies": 0.75, "rewards/chosen": 0.21176226437091827, "rewards/margins": 0.11928150802850723, "rewards/margins_max": 0.3145882785320282, "rewards/margins_min": -0.08566208183765411, "rewards/margins_std": 0.18053455650806427, "rewards/rejected": 0.09248076379299164, "step": 1390 }, { "dpo_losses": 0.6532957553863525, "epoch": 0.37, "grad_norm": 7.972277290724107, "learning_rate": 3.995948780477605e-06, "logits/chosen": -2.6942837238311768, "logits/rejected": -2.6299290657043457, "logps/chosen": -256.9607238769531, "logps/rejected": -224.931396484375, "loss": 0.7748, "positive_losses": 1.7606565952301025, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.19038155674934387, "rewards/margins": 0.0929260402917862, "rewards/margins_max": 0.28488069772720337, "rewards/margins_min": -0.11896850168704987, "rewards/margins_std": 0.17732271552085876, "rewards/rejected": 0.09745551645755768, "step": 1400 }, { "epoch": 0.37, "eval_dpo_losses": 0.6459010243415833, "eval_logits/chosen": -2.6662824153900146, "eval_logits/rejected": -2.6320106983184814, "eval_logps/chosen": -263.4083251953125, "eval_logps/rejected": -248.10313415527344, "eval_loss": 0.6954149007797241, "eval_positive_losses": 0.3216582238674164, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": 0.21185092628002167, "eval_rewards/margins": 0.10709292441606522, "eval_rewards/margins_max": 0.41416898369789124, "eval_rewards/margins_min": -0.15775814652442932, "eval_rewards/margins_std": 0.19059261679649353, "eval_rewards/rejected": 0.10475799441337585, "eval_runtime": 428.6816, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 1400 }, { "dpo_losses": 0.6509705781936646, "epoch": 0.37, "grad_norm": 1.9059275995495557, "learning_rate": 3.977583174498816e-06, "logits/chosen": -2.6256604194641113, "logits/rejected": -2.6049373149871826, "logps/chosen": -212.65060424804688, "logps/rejected": -201.88674926757812, "loss": 0.6732, "positive_losses": 0.1541658341884613, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1950497031211853, "rewards/margins": 0.09519018232822418, "rewards/margins_max": 0.27731990814208984, "rewards/margins_min": -0.10081164538860321, "rewards/margins_std": 0.1688801795244217, "rewards/rejected": 0.09985951334238052, "step": 1410 }, { "dpo_losses": 0.6548116207122803, "epoch": 0.37, "grad_norm": 10.047181025422862, "learning_rate": 3.959094190750172e-06, "logits/chosen": -2.691422939300537, "logits/rejected": -2.6133222579956055, "logps/chosen": -220.44363403320312, "logps/rejected": -170.9870147705078, "loss": 0.6833, "positive_losses": 0.18265528976917267, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20632381737232208, "rewards/margins": 0.08463943004608154, "rewards/margins_max": 0.2522267699241638, "rewards/margins_min": -0.07335247099399567, "rewards/margins_std": 0.1462160348892212, "rewards/rejected": 0.12168438732624054, "step": 1420 }, { "dpo_losses": 0.6528488397598267, "epoch": 0.37, "grad_norm": 2.4787016574071807, "learning_rate": 3.9404833730564975e-06, "logits/chosen": -2.701068162918091, "logits/rejected": -2.6969940662384033, "logps/chosen": -249.27658081054688, "logps/rejected": -228.20816040039062, "loss": 0.6885, "positive_losses": 0.3932468295097351, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.1905629187822342, "rewards/margins": 0.09067387878894806, "rewards/margins_max": 0.2638342082500458, "rewards/margins_min": -0.08423193544149399, "rewards/margins_std": 0.15701726078987122, "rewards/rejected": 0.09988904744386673, "step": 1430 }, { "dpo_losses": 0.6618974804878235, "epoch": 0.38, "grad_norm": 11.535012724513463, "learning_rate": 3.921752275415712e-06, "logits/chosen": -2.6624839305877686, "logits/rejected": -2.643702983856201, "logps/chosen": -298.94354248046875, "logps/rejected": -297.4173278808594, "loss": 0.6928, "positive_losses": 0.797075629234314, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19255423545837402, "rewards/margins": 0.07184568792581558, "rewards/margins_max": 0.2585648000240326, "rewards/margins_min": -0.11981566995382309, "rewards/margins_std": 0.17012974619865417, "rewards/rejected": 0.12070856243371964, "step": 1440 }, { "dpo_losses": 0.6448982954025269, "epoch": 0.38, "grad_norm": 8.439593472204347, "learning_rate": 3.902902461869079e-06, "logits/chosen": -2.649231195449829, "logits/rejected": -2.593291759490967, "logps/chosen": -303.04290771484375, "logps/rejected": -246.13558959960938, "loss": 0.6792, "positive_losses": 0.49538594484329224, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23056527972221375, "rewards/margins": 0.1090080514550209, "rewards/margins_max": 0.3267473578453064, "rewards/margins_min": -0.08372115343809128, "rewards/margins_std": 0.186322882771492, "rewards/rejected": 0.12155723571777344, "step": 1450 }, { "dpo_losses": 0.6358802318572998, "epoch": 0.38, "grad_norm": 1.9473062126498197, "learning_rate": 3.883935506370605e-06, "logits/chosen": -2.67279314994812, "logits/rejected": -2.6045124530792236, "logps/chosen": -278.34991455078125, "logps/rejected": -251.8113555908203, "loss": 0.6532, "positive_losses": 0.1108587235212326, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.21871885657310486, "rewards/margins": 0.12610134482383728, "rewards/margins_max": 0.30578070878982544, "rewards/margins_min": -0.058588337153196335, "rewards/margins_std": 0.1628783792257309, "rewards/rejected": 0.09261750429868698, "step": 1460 }, { "dpo_losses": 0.6454759240150452, "epoch": 0.38, "grad_norm": 9.845102756132542, "learning_rate": 3.864852992655617e-06, "logits/chosen": -2.720557689666748, "logits/rejected": -2.696348190307617, "logps/chosen": -268.27154541015625, "logps/rejected": -224.34774780273438, "loss": 0.667, "positive_losses": 0.5282651782035828, "rewards/accuracies": 0.625, "rewards/chosen": 0.19762755930423737, "rewards/margins": 0.11043057590723038, "rewards/margins_max": 0.36109811067581177, "rewards/margins_min": -0.10932507365942001, "rewards/margins_std": 0.21296295523643494, "rewards/rejected": 0.08719699084758759, "step": 1470 }, { "dpo_losses": 0.6457245945930481, "epoch": 0.39, "grad_norm": 1.9399777842753911, "learning_rate": 3.845656514108516e-06, "logits/chosen": -2.7073025703430176, "logits/rejected": -2.652066469192505, "logps/chosen": -236.2076873779297, "logps/rejected": -241.01461791992188, "loss": 0.6768, "positive_losses": 0.49394041299819946, "rewards/accuracies": 0.75, "rewards/chosen": 0.1909310668706894, "rewards/margins": 0.10538061708211899, "rewards/margins_max": 0.2829916775226593, "rewards/margins_min": -0.07284554839134216, "rewards/margins_std": 0.16062533855438232, "rewards/rejected": 0.0855504721403122, "step": 1480 }, { "dpo_losses": 0.6333507299423218, "epoch": 0.39, "grad_norm": 9.817224616454585, "learning_rate": 3.826347673629738e-06, "logits/chosen": -2.655913829803467, "logits/rejected": -2.6235640048980713, "logps/chosen": -268.8856506347656, "logps/rejected": -251.4858856201172, "loss": 0.7036, "positive_losses": 0.48341161012649536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21629247069358826, "rewards/margins": 0.13244296610355377, "rewards/margins_max": 0.3207319676876068, "rewards/margins_min": -0.05463407188653946, "rewards/margins_std": 0.16572895646095276, "rewards/rejected": 0.08384953439235687, "step": 1490 }, { "dpo_losses": 0.6569206118583679, "epoch": 0.39, "grad_norm": 1.6572605145314065, "learning_rate": 3.8069280835019062e-06, "logits/chosen": -2.712667226791382, "logits/rejected": -2.6612064838409424, "logps/chosen": -221.9389190673828, "logps/rejected": -192.12229919433594, "loss": 0.6702, "positive_losses": 0.10241594165563583, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.20521406829357147, "rewards/margins": 0.08377285301685333, "rewards/margins_max": 0.3187033534049988, "rewards/margins_min": -0.08312635123729706, "rewards/margins_std": 0.18237851560115814, "rewards/rejected": 0.12144124507904053, "step": 1500 }, { "epoch": 0.39, "eval_dpo_losses": 0.6497886776924133, "eval_logits/chosen": -2.654142141342163, "eval_logits/rejected": -2.617938756942749, "eval_logps/chosen": -262.27630615234375, "eval_logps/rejected": -246.00479125976562, "eval_loss": 0.6790736317634583, "eval_positive_losses": 0.1719692498445511, "eval_rewards/accuracies": 0.6959999799728394, "eval_rewards/chosen": 0.22317123413085938, "eval_rewards/margins": 0.09742990136146545, "eval_rewards/margins_max": 0.37969255447387695, "eval_rewards/margins_min": -0.1462269127368927, "eval_rewards/margins_std": 0.17573675513267517, "eval_rewards/rejected": 0.12574131786823273, "eval_runtime": 428.4499, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 1500 }, { "dpo_losses": 0.661389172077179, "epoch": 0.4, "grad_norm": 2.2530888442102732, "learning_rate": 3.7873993652552077e-06, "logits/chosen": -2.6906399726867676, "logits/rejected": -2.6586453914642334, "logps/chosen": -281.95599365234375, "logps/rejected": -251.7514190673828, "loss": 0.6692, "positive_losses": 0.20750923454761505, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.213405579328537, "rewards/margins": 0.07286909222602844, "rewards/margins_max": 0.2439979612827301, "rewards/margins_min": -0.10484246164560318, "rewards/margins_std": 0.15867802500724792, "rewards/rejected": 0.14053651690483093, "step": 1510 }, { "dpo_losses": 0.6586933732032776, "epoch": 0.4, "grad_norm": 2.0913466139033154, "learning_rate": 3.7677631495319953e-06, "logits/chosen": -2.666205644607544, "logits/rejected": -2.636078119277954, "logps/chosen": -241.21786499023438, "logps/rejected": -223.1501007080078, "loss": 0.6818, "positive_losses": 0.31700435280799866, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19468605518341064, "rewards/margins": 0.07912818342447281, "rewards/margins_max": 0.27547183632850647, "rewards/margins_min": -0.11301781237125397, "rewards/margins_std": 0.17428424954414368, "rewards/rejected": 0.11555787175893784, "step": 1520 }, { "dpo_losses": 0.6458944082260132, "epoch": 0.4, "grad_norm": 1.6951047005288509, "learning_rate": 3.748021075950633e-06, "logits/chosen": -2.68874454498291, "logits/rejected": -2.6798384189605713, "logps/chosen": -257.79766845703125, "logps/rejected": -248.316162109375, "loss": 0.6755, "positive_losses": 0.196380615234375, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.21627548336982727, "rewards/margins": 0.10481035709381104, "rewards/margins_max": 0.28972405195236206, "rewards/margins_min": -0.0763392299413681, "rewards/margins_std": 0.1624285727739334, "rewards/rejected": 0.11146511882543564, "step": 1530 }, { "dpo_losses": 0.6477961540222168, "epoch": 0.4, "grad_norm": 17.738464905595166, "learning_rate": 3.7281747929685824e-06, "logits/chosen": -2.701960802078247, "logits/rejected": -2.6405043601989746, "logps/chosen": -326.67962646484375, "logps/rejected": -293.8638610839844, "loss": 0.6941, "positive_losses": 0.19925327599048615, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23924434185028076, "rewards/margins": 0.10458407551050186, "rewards/margins_max": 0.3132040500640869, "rewards/margins_min": -0.07730933278799057, "rewards/margins_std": 0.1736002266407013, "rewards/rejected": 0.1346602737903595, "step": 1540 }, { "dpo_losses": 0.6386266946792603, "epoch": 0.41, "grad_norm": 2.537412230378379, "learning_rate": 3.7082259577447604e-06, "logits/chosen": -2.6471686363220215, "logits/rejected": -2.684368848800659, "logps/chosen": -260.3028869628906, "logps/rejected": -259.9427490234375, "loss": 0.6656, "positive_losses": 0.0027763366233557463, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24805012345314026, "rewards/margins": 0.12799425423145294, "rewards/margins_max": 0.3592818081378937, "rewards/margins_min": -0.05324137210845947, "rewards/margins_std": 0.18737933039665222, "rewards/rejected": 0.12005583941936493, "step": 1550 }, { "dpo_losses": 0.6476603746414185, "epoch": 0.41, "grad_norm": 2.173682908025813, "learning_rate": 3.6881762360011688e-06, "logits/chosen": -2.7004570960998535, "logits/rejected": -2.6530659198760986, "logps/chosen": -254.0734405517578, "logps/rejected": -227.4460906982422, "loss": 0.7017, "positive_losses": 0.29716262221336365, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2482289969921112, "rewards/margins": 0.10765968263149261, "rewards/margins_max": 0.38754507899284363, "rewards/margins_min": -0.09375838190317154, "rewards/margins_std": 0.2146865576505661, "rewards/rejected": 0.14056932926177979, "step": 1560 }, { "dpo_losses": 0.6519032716751099, "epoch": 0.41, "grad_norm": 35.962333471438384, "learning_rate": 3.668027301883802e-06, "logits/chosen": -2.7290425300598145, "logits/rejected": -2.725869655609131, "logps/chosen": -292.5254821777344, "logps/rejected": -254.58016967773438, "loss": 0.6684, "positive_losses": 0.279030978679657, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22284607589244843, "rewards/margins": 0.09153153747320175, "rewards/margins_max": 0.25660672783851624, "rewards/margins_min": -0.09409938752651215, "rewards/margins_std": 0.15535183250904083, "rewards/rejected": 0.13131454586982727, "step": 1570 }, { "dpo_losses": 0.6673418283462524, "epoch": 0.41, "grad_norm": 8.517334260797579, "learning_rate": 3.64778083782286e-06, "logits/chosen": -2.678079605102539, "logits/rejected": -2.6898694038391113, "logps/chosen": -274.681396484375, "logps/rejected": -243.666015625, "loss": 0.685, "positive_losses": 0.13846245408058167, "rewards/accuracies": 0.625, "rewards/chosen": 0.20553827285766602, "rewards/margins": 0.06004505604505539, "rewards/margins_max": 0.22803637385368347, "rewards/margins_min": -0.07718309760093689, "rewards/margins_std": 0.13624307513237, "rewards/rejected": 0.14549322426319122, "step": 1580 }, { "dpo_losses": 0.6696632504463196, "epoch": 0.42, "grad_norm": 13.95096775649537, "learning_rate": 3.627438534392268e-06, "logits/chosen": -2.7030322551727295, "logits/rejected": -2.734532117843628, "logps/chosen": -265.6767578125, "logps/rejected": -254.0251922607422, "loss": 0.6826, "positive_losses": 0.3200332522392273, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.19224755465984344, "rewards/margins": 0.05710170790553093, "rewards/margins_max": 0.2669149935245514, "rewards/margins_min": -0.14468397200107574, "rewards/margins_std": 0.18533547222614288, "rewards/rejected": 0.13514584302902222, "step": 1590 }, { "dpo_losses": 0.6428429484367371, "epoch": 0.42, "grad_norm": 1.6166960480894232, "learning_rate": 3.607002090168506e-06, "logits/chosen": -2.67470121383667, "logits/rejected": -2.6375515460968018, "logps/chosen": -241.6584930419922, "logps/rejected": -214.53713989257812, "loss": 0.7212, "positive_losses": 0.14834651350975037, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.23246422410011292, "rewards/margins": 0.11045853793621063, "rewards/margins_max": 0.29041144251823425, "rewards/margins_min": -0.058310650289058685, "rewards/margins_std": 0.15310747921466827, "rewards/rejected": 0.12200568616390228, "step": 1600 }, { "epoch": 0.42, "eval_dpo_losses": 0.6517577767372131, "eval_logits/chosen": -2.653696298599243, "eval_logits/rejected": -2.6207213401794434, "eval_logps/chosen": -262.1661682128906, "eval_logps/rejected": -245.42872619628906, "eval_loss": 0.6791109442710876, "eval_positive_losses": 0.13293030858039856, "eval_rewards/accuracies": 0.6949999928474426, "eval_rewards/chosen": 0.22427266836166382, "eval_rewards/margins": 0.09277059137821198, "eval_rewards/margins_max": 0.3671208918094635, "eval_rewards/margins_min": -0.14220982789993286, "eval_rewards/margins_std": 0.1705854833126068, "eval_rewards/rejected": 0.13150209188461304, "eval_runtime": 428.4713, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 1600 }, { "dpo_losses": 0.6560760140419006, "epoch": 0.42, "grad_norm": 1.8541471573164028, "learning_rate": 3.586473211588787e-06, "logits/chosen": -2.6674115657806396, "logits/rejected": -2.659879207611084, "logps/chosen": -264.8671569824219, "logps/rejected": -232.6522674560547, "loss": 0.6672, "positive_losses": 0.14955882728099823, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2132745087146759, "rewards/margins": 0.08606938272714615, "rewards/margins_max": 0.3227910101413727, "rewards/margins_min": -0.09505396336317062, "rewards/margins_std": 0.18823081254959106, "rewards/rejected": 0.12720511853694916, "step": 1610 }, { "dpo_losses": 0.6431070566177368, "epoch": 0.42, "grad_norm": 2.162558573552497, "learning_rate": 3.5658536128085623e-06, "logits/chosen": -2.606340169906616, "logits/rejected": -2.5996594429016113, "logps/chosen": -260.520263671875, "logps/rejected": -267.6850280761719, "loss": 0.6523, "positive_losses": 0.17370910942554474, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.27634936571121216, "rewards/margins": 0.11630578339099884, "rewards/margins_max": 0.3538152277469635, "rewards/margins_min": -0.05678463727235794, "rewards/margins_std": 0.18289852142333984, "rewards/rejected": 0.1600435972213745, "step": 1620 }, { "dpo_losses": 0.6393071413040161, "epoch": 0.43, "grad_norm": 7.2998853596085835, "learning_rate": 3.545145015558399e-06, "logits/chosen": -2.7133536338806152, "logits/rejected": -2.672772169113159, "logps/chosen": -268.8191223144531, "logps/rejected": -256.1548767089844, "loss": 0.6615, "positive_losses": 0.20378609001636505, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22124388813972473, "rewards/margins": 0.11874544620513916, "rewards/margins_max": 0.29263168573379517, "rewards/margins_min": -0.0503699965775013, "rewards/margins_std": 0.15689268708229065, "rewards/rejected": 0.10249841213226318, "step": 1630 }, { "dpo_losses": 0.6422899961471558, "epoch": 0.43, "grad_norm": 2.1822109040555757, "learning_rate": 3.5243491490002056e-06, "logits/chosen": -2.733675479888916, "logits/rejected": -2.701655149459839, "logps/chosen": -271.9985046386719, "logps/rejected": -226.31838989257812, "loss": 0.6698, "positive_losses": 0.16527938842773438, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2348925620317459, "rewards/margins": 0.11396696418523788, "rewards/margins_max": 0.3119576573371887, "rewards/margins_min": -0.07116124778985977, "rewards/margins_std": 0.17077691853046417, "rewards/rejected": 0.12092562019824982, "step": 1640 }, { "dpo_losses": 0.633284866809845, "epoch": 0.43, "grad_norm": 2.213317021957103, "learning_rate": 3.503467749582857e-06, "logits/chosen": -2.6859681606292725, "logits/rejected": -2.6332907676696777, "logps/chosen": -308.0093994140625, "logps/rejected": -257.30621337890625, "loss": 0.6717, "positive_losses": 0.4384998381137848, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.22693462669849396, "rewards/margins": 0.13720186054706573, "rewards/margins_max": 0.34494930505752563, "rewards/margins_min": -0.08562783896923065, "rewards/margins_std": 0.1938478946685791, "rewards/rejected": 0.08973275125026703, "step": 1650 }, { "dpo_losses": 0.6396130323410034, "epoch": 0.43, "grad_norm": 11.193349321394455, "learning_rate": 3.4825025608971947e-06, "logits/chosen": -2.6242432594299316, "logits/rejected": -2.5996615886688232, "logps/chosen": -235.93539428710938, "logps/rejected": -259.59381103515625, "loss": 0.6509, "positive_losses": 0.05194082111120224, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.21370768547058105, "rewards/margins": 0.12144899368286133, "rewards/margins_max": 0.3482202887535095, "rewards/margins_min": -0.07061970233917236, "rewards/margins_std": 0.18398067355155945, "rewards/rejected": 0.09225870668888092, "step": 1660 }, { "dpo_losses": 0.6389883756637573, "epoch": 0.44, "grad_norm": 1.7358764472890016, "learning_rate": 3.4614553335304407e-06, "logits/chosen": -2.6740899085998535, "logits/rejected": -2.633568048477173, "logps/chosen": -239.4064483642578, "logps/rejected": -223.352783203125, "loss": 0.6775, "positive_losses": 0.4708629548549652, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22561807930469513, "rewards/margins": 0.12099581956863403, "rewards/margins_max": 0.32260701060295105, "rewards/margins_min": -0.09542026370763779, "rewards/margins_std": 0.1840066909790039, "rewards/rejected": 0.1046222448348999, "step": 1670 }, { "dpo_losses": 0.6516619920730591, "epoch": 0.44, "grad_norm": 14.78354876912118, "learning_rate": 3.4403278249200222e-06, "logits/chosen": -2.6399035453796387, "logits/rejected": -2.6173157691955566, "logps/chosen": -245.27780151367188, "logps/rejected": -262.12042236328125, "loss": 0.688, "positive_losses": 0.7937184572219849, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.1986553966999054, "rewards/margins": 0.09551330655813217, "rewards/margins_max": 0.31856924295425415, "rewards/margins_min": -0.12135110050439835, "rewards/margins_std": 0.19662317633628845, "rewards/rejected": 0.10314206779003143, "step": 1680 }, { "dpo_losses": 0.6412166953086853, "epoch": 0.44, "grad_norm": 8.56541983391079, "learning_rate": 3.4191217992068293e-06, "logits/chosen": -2.6228997707366943, "logits/rejected": -2.630502939224243, "logps/chosen": -234.009521484375, "logps/rejected": -250.3644256591797, "loss": 0.694, "positive_losses": 0.6924245953559875, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.19170401990413666, "rewards/margins": 0.11980365216732025, "rewards/margins_max": 0.3731180429458618, "rewards/margins_min": -0.08363162726163864, "rewards/margins_std": 0.20621831715106964, "rewards/rejected": 0.07190034538507462, "step": 1690 }, { "dpo_losses": 0.6487213373184204, "epoch": 0.44, "grad_norm": 12.29537966408457, "learning_rate": 3.3978390270879056e-06, "logits/chosen": -2.717733860015869, "logits/rejected": -2.726158618927002, "logps/chosen": -226.20413208007812, "logps/rejected": -250.90145874023438, "loss": 0.6612, "positive_losses": 0.14080695807933807, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20010197162628174, "rewards/margins": 0.10068812221288681, "rewards/margins_max": 0.29419422149658203, "rewards/margins_min": -0.10057322680950165, "rewards/margins_std": 0.17789433896541595, "rewards/rejected": 0.09941386431455612, "step": 1700 }, { "epoch": 0.44, "eval_dpo_losses": 0.6476919054985046, "eval_logits/chosen": -2.677135705947876, "eval_logits/rejected": -2.643777370452881, "eval_logps/chosen": -262.121337890625, "eval_logps/rejected": -246.36651611328125, "eval_loss": 0.6768782734870911, "eval_positive_losses": 0.20542214810848236, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": 0.22472086548805237, "eval_rewards/margins": 0.10259683430194855, "eval_rewards/margins_max": 0.39832353591918945, "eval_rewards/margins_min": -0.14721530675888062, "eval_rewards/margins_std": 0.1822042167186737, "eval_rewards/rejected": 0.12212403863668442, "eval_runtime": 428.5819, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.292, "step": 1700 }, { "dpo_losses": 0.66423499584198, "epoch": 0.45, "grad_norm": 13.25207445711773, "learning_rate": 3.3764812856685995e-06, "logits/chosen": -2.764681339263916, "logits/rejected": -2.7417492866516113, "logps/chosen": -242.6545867919922, "logps/rejected": -229.55502319335938, "loss": 0.6751, "positive_losses": 0.13716831803321838, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.23364511132240295, "rewards/margins": 0.06816355884075165, "rewards/margins_max": 0.26030972599983215, "rewards/margins_min": -0.1095590814948082, "rewards/margins_std": 0.16339609026908875, "rewards/rejected": 0.16548150777816772, "step": 1710 }, { "dpo_losses": 0.6592116951942444, "epoch": 0.45, "grad_norm": 2.1630670895411916, "learning_rate": 3.3550503583141726e-06, "logits/chosen": -2.6876742839813232, "logits/rejected": -2.682015895843506, "logps/chosen": -236.06680297851562, "logps/rejected": -248.79696655273438, "loss": 0.6741, "positive_losses": 0.32171955704689026, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.21035310626029968, "rewards/margins": 0.07852686196565628, "rewards/margins_max": 0.31927719712257385, "rewards/margins_min": -0.11590731143951416, "rewards/margins_std": 0.19199693202972412, "rewards/rejected": 0.1318262219429016, "step": 1720 }, { "dpo_losses": 0.6501371264457703, "epoch": 0.45, "grad_norm": 13.86982259294493, "learning_rate": 3.3335480345008907e-06, "logits/chosen": -2.6857552528381348, "logits/rejected": -2.651797294616699, "logps/chosen": -251.1685333251953, "logps/rejected": -258.6827392578125, "loss": 0.7051, "positive_losses": 0.014325427822768688, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20184862613677979, "rewards/margins": 0.0961037278175354, "rewards/margins_max": 0.2953731417655945, "rewards/margins_min": -0.06370477378368378, "rewards/margins_std": 0.1600179672241211, "rewards/rejected": 0.10574488341808319, "step": 1730 }, { "dpo_losses": 0.6431703567504883, "epoch": 0.46, "grad_norm": 18.05753709662793, "learning_rate": 3.3119761096666055e-06, "logits/chosen": -2.621642827987671, "logits/rejected": -2.5782694816589355, "logps/chosen": -246.7869873046875, "logps/rejected": -207.5820770263672, "loss": 0.6929, "positive_losses": 0.09281005710363388, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.2293844223022461, "rewards/margins": 0.1122862845659256, "rewards/margins_max": 0.33559125661849976, "rewards/margins_min": -0.0727875754237175, "rewards/margins_std": 0.1830786168575287, "rewards/rejected": 0.11709816753864288, "step": 1740 }, { "dpo_losses": 0.6214326620101929, "epoch": 0.46, "grad_norm": 6.536976033615429, "learning_rate": 3.290336385060832e-06, "logits/chosen": -2.729367733001709, "logits/rejected": -2.7126047611236572, "logps/chosen": -293.42327880859375, "logps/rejected": -291.1454772949219, "loss": 0.6481, "positive_losses": 0.07654953002929688, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.23736996948719025, "rewards/margins": 0.16105449199676514, "rewards/margins_max": 0.3831517994403839, "rewards/margins_min": -0.06268024444580078, "rewards/margins_std": 0.19824610650539398, "rewards/rejected": 0.07631546258926392, "step": 1750 }, { "dpo_losses": 0.6360501050949097, "epoch": 0.46, "grad_norm": 2.286240644816451, "learning_rate": 3.268630667594348e-06, "logits/chosen": -2.637633800506592, "logits/rejected": -2.6077706813812256, "logps/chosen": -246.01553344726562, "logps/rejected": -240.3081817626953, "loss": 0.6629, "positive_losses": 0.5037437081336975, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23212666809558868, "rewards/margins": 0.13249792158603668, "rewards/margins_max": 0.38482505083084106, "rewards/margins_min": -0.09228018671274185, "rewards/margins_std": 0.21233367919921875, "rewards/rejected": 0.09962873160839081, "step": 1760 }, { "dpo_losses": 0.6503337621688843, "epoch": 0.46, "grad_norm": 15.131264138165896, "learning_rate": 3.2468607696883147e-06, "logits/chosen": -2.6927237510681152, "logits/rejected": -2.672182559967041, "logps/chosen": -256.45245361328125, "logps/rejected": -251.2952880859375, "loss": 0.7108, "positive_losses": 0.34177929162979126, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2186545878648758, "rewards/margins": 0.09831682592630386, "rewards/margins_max": 0.2875586152076721, "rewards/margins_min": -0.07265286892652512, "rewards/margins_std": 0.16652391850948334, "rewards/rejected": 0.12033774703741074, "step": 1770 }, { "dpo_losses": 0.6352652311325073, "epoch": 0.47, "grad_norm": 2.1339962088382296, "learning_rate": 3.225028509122944e-06, "logits/chosen": -2.695920467376709, "logits/rejected": -2.5993239879608154, "logps/chosen": -300.1233215332031, "logps/rejected": -226.9390869140625, "loss": 0.6747, "positive_losses": 0.35415419936180115, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.24287322163581848, "rewards/margins": 0.12968754768371582, "rewards/margins_max": 0.32654038071632385, "rewards/margins_min": -0.07334260642528534, "rewards/margins_std": 0.17713679373264313, "rewards/rejected": 0.11318568885326385, "step": 1780 }, { "dpo_losses": 0.655597984790802, "epoch": 0.47, "grad_norm": 2.0714766773970688, "learning_rate": 3.2031357088857083e-06, "logits/chosen": -2.7241082191467285, "logits/rejected": -2.651494026184082, "logps/chosen": -301.7291259765625, "logps/rejected": -247.71572875976562, "loss": 0.696, "positive_losses": 0.4892311096191406, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.23131561279296875, "rewards/margins": 0.0859755203127861, "rewards/margins_max": 0.29585233330726624, "rewards/margins_min": -0.09423918277025223, "rewards/margins_std": 0.17703907191753387, "rewards/rejected": 0.14534008502960205, "step": 1790 }, { "dpo_losses": 0.6512483954429626, "epoch": 0.47, "grad_norm": 2.2204190276306317, "learning_rate": 3.181184197019127e-06, "logits/chosen": -2.6714210510253906, "logits/rejected": -2.6079258918762207, "logps/chosen": -290.9697570800781, "logps/rejected": -245.71176147460938, "loss": 0.6934, "positive_losses": 0.30568617582321167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21041937172412872, "rewards/margins": 0.0918908417224884, "rewards/margins_max": 0.2533586919307709, "rewards/margins_min": -0.056008823215961456, "rewards/margins_std": 0.13672399520874023, "rewards/rejected": 0.11852853000164032, "step": 1800 }, { "epoch": 0.47, "eval_dpo_losses": 0.6485722661018372, "eval_logits/chosen": -2.6494243144989014, "eval_logits/rejected": -2.6153383255004883, "eval_logps/chosen": -261.53656005859375, "eval_logps/rejected": -245.57456970214844, "eval_loss": 0.6708742380142212, "eval_positive_losses": 0.1500679850578308, "eval_rewards/accuracies": 0.7039999961853027, "eval_rewards/chosen": 0.23056870698928833, "eval_rewards/margins": 0.10052523761987686, "eval_rewards/margins_max": 0.39074593782424927, "eval_rewards/margins_min": -0.14597617089748383, "eval_rewards/margins_std": 0.17966297268867493, "eval_rewards/rejected": 0.13004347681999207, "eval_runtime": 428.425, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 1800 }, { "dpo_losses": 0.6462545990943909, "epoch": 0.47, "grad_norm": 2.196685036142017, "learning_rate": 3.159175806468126e-06, "logits/chosen": -2.682133436203003, "logits/rejected": -2.6218066215515137, "logps/chosen": -298.1648254394531, "logps/rejected": -280.821044921875, "loss": 0.6852, "positive_losses": 0.3974171280860901, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22966495156288147, "rewards/margins": 0.10593409836292267, "rewards/margins_max": 0.31627365946769714, "rewards/margins_min": -0.10758032649755478, "rewards/margins_std": 0.18766090273857117, "rewards/rejected": 0.12373087555170059, "step": 1810 }, { "dpo_losses": 0.6489313244819641, "epoch": 0.48, "grad_norm": 8.783862176811004, "learning_rate": 3.1371123749269804e-06, "logits/chosen": -2.627868413925171, "logits/rejected": -2.600464344024658, "logps/chosen": -256.3890380859375, "logps/rejected": -267.42205810546875, "loss": 0.6858, "positive_losses": 0.5902347564697266, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2368176430463791, "rewards/margins": 0.1038227528333664, "rewards/margins_max": 0.3395024836063385, "rewards/margins_min": -0.08281396329402924, "rewards/margins_std": 0.18659810721874237, "rewards/rejected": 0.1329948753118515, "step": 1820 }, { "dpo_losses": 0.6479700803756714, "epoch": 0.48, "grad_norm": 1.855646321683957, "learning_rate": 3.114995744685877e-06, "logits/chosen": -2.6669769287109375, "logits/rejected": -2.6285557746887207, "logps/chosen": -285.3905334472656, "logps/rejected": -316.7983703613281, "loss": 0.667, "positive_losses": 0.38233718276023865, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2400607168674469, "rewards/margins": 0.10475423187017441, "rewards/margins_max": 0.3386301100254059, "rewards/margins_min": -0.10469480603933334, "rewards/margins_std": 0.20079728960990906, "rewards/rejected": 0.1353064626455307, "step": 1830 }, { "dpo_losses": 0.6492260694503784, "epoch": 0.48, "grad_norm": 12.057583296059217, "learning_rate": 3.0928277624770743e-06, "logits/chosen": -2.6672568321228027, "logits/rejected": -2.609797239303589, "logps/chosen": -239.1560821533203, "logps/rejected": -218.8496551513672, "loss": 0.6617, "positive_losses": 0.17789649963378906, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22713598608970642, "rewards/margins": 0.09814493358135223, "rewards/margins_max": 0.28700849413871765, "rewards/margins_min": -0.07425049692392349, "rewards/margins_std": 0.16490700840950012, "rewards/rejected": 0.1289910525083542, "step": 1840 }, { "dpo_losses": 0.642635703086853, "epoch": 0.48, "grad_norm": 1.9425340903296642, "learning_rate": 3.070610279320708e-06, "logits/chosen": -2.657158136367798, "logits/rejected": -2.5925612449645996, "logps/chosen": -241.08383178710938, "logps/rejected": -232.469482421875, "loss": 0.6485, "positive_losses": 0.008465195074677467, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21421754360198975, "rewards/margins": 0.11229304224252701, "rewards/margins_max": 0.27615246176719666, "rewards/margins_min": -0.06589009612798691, "rewards/margins_std": 0.1505574882030487, "rewards/rejected": 0.10192450135946274, "step": 1850 }, { "dpo_losses": 0.6225242018699646, "epoch": 0.49, "grad_norm": 9.38683010103534, "learning_rate": 3.0483451503702264e-06, "logits/chosen": -2.6227405071258545, "logits/rejected": -2.5655102729797363, "logps/chosen": -283.06121826171875, "logps/rejected": -276.5240783691406, "loss": 0.6669, "positive_losses": 0.42248469591140747, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2341805249452591, "rewards/margins": 0.16139110922813416, "rewards/margins_max": 0.44353175163269043, "rewards/margins_min": -0.05831465870141983, "rewards/margins_std": 0.2237849235534668, "rewards/rejected": 0.07278943061828613, "step": 1860 }, { "dpo_losses": 0.6268504858016968, "epoch": 0.49, "grad_norm": 1.9373683269426913, "learning_rate": 3.0260342347574916e-06, "logits/chosen": -2.6286182403564453, "logits/rejected": -2.597712755203247, "logps/chosen": -266.64581298828125, "logps/rejected": -258.6505432128906, "loss": 0.6369, "positive_losses": 0.0, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22858329117298126, "rewards/margins": 0.1482369750738144, "rewards/margins_max": 0.35187166929244995, "rewards/margins_min": -0.049382638186216354, "rewards/margins_std": 0.1775979846715927, "rewards/rejected": 0.08034632354974747, "step": 1870 }, { "dpo_losses": 0.629361629486084, "epoch": 0.49, "grad_norm": 24.776993644558654, "learning_rate": 3.0036793954375358e-06, "logits/chosen": -2.6223738193511963, "logits/rejected": -2.5900847911834717, "logps/chosen": -243.8868408203125, "logps/rejected": -231.40481567382812, "loss": 0.7135, "positive_losses": 0.5055671334266663, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2147538959980011, "rewards/margins": 0.1427079141139984, "rewards/margins_max": 0.341435968875885, "rewards/margins_min": -0.05848371237516403, "rewards/margins_std": 0.18279311060905457, "rewards/rejected": 0.07204596698284149, "step": 1880 }, { "dpo_losses": 0.6442877650260925, "epoch": 0.49, "grad_norm": 2.237086111820016, "learning_rate": 2.981282499033009e-06, "logits/chosen": -2.7022206783294678, "logits/rejected": -2.651426315307617, "logps/chosen": -290.08770751953125, "logps/rejected": -277.43798828125, "loss": 0.7144, "positive_losses": 0.5254039764404297, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.20076048374176025, "rewards/margins": 0.11146055161952972, "rewards/margins_max": 0.3246122896671295, "rewards/margins_min": -0.08855988085269928, "rewards/margins_std": 0.18590357899665833, "rewards/rejected": 0.08929993212223053, "step": 1890 }, { "dpo_losses": 0.6295750737190247, "epoch": 0.5, "grad_norm": 1.7334249145204979, "learning_rate": 2.9588454156783163e-06, "logits/chosen": -2.6986260414123535, "logits/rejected": -2.6652133464813232, "logps/chosen": -274.7660827636719, "logps/rejected": -249.05197143554688, "loss": 0.671, "positive_losses": 0.5682666897773743, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2159028947353363, "rewards/margins": 0.1410866677761078, "rewards/margins_max": 0.34820178151130676, "rewards/margins_min": -0.046055570244789124, "rewards/margins_std": 0.17434677481651306, "rewards/rejected": 0.07481620460748672, "step": 1900 }, { "epoch": 0.5, "eval_dpo_losses": 0.646479070186615, "eval_logits/chosen": -2.623149871826172, "eval_logits/rejected": -2.5887410640716553, "eval_logps/chosen": -262.097900390625, "eval_logps/rejected": -246.6335906982422, "eval_loss": 0.6769080758094788, "eval_positive_losses": 0.21011905372142792, "eval_rewards/accuracies": 0.703000009059906, "eval_rewards/chosen": 0.22495508193969727, "eval_rewards/margins": 0.10550175607204437, "eval_rewards/margins_max": 0.4050651788711548, "eval_rewards/margins_min": -0.14816665649414062, "eval_rewards/margins_std": 0.1861076056957245, "eval_rewards/rejected": 0.1194533109664917, "eval_runtime": 428.7476, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 1900 }, { "dpo_losses": 0.6271580457687378, "epoch": 0.5, "grad_norm": 14.350673625168044, "learning_rate": 2.9363700188634597e-06, "logits/chosen": -2.6323440074920654, "logits/rejected": -2.6393771171569824, "logps/chosen": -255.4196319580078, "logps/rejected": -255.7123260498047, "loss": 0.6551, "positive_losses": 0.06551642715930939, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2607203423976898, "rewards/margins": 0.1465071439743042, "rewards/margins_max": 0.3591081202030182, "rewards/margins_min": -0.03199191763997078, "rewards/margins_std": 0.17112192511558533, "rewards/rejected": 0.11421322822570801, "step": 1910 }, { "dpo_losses": 0.6591442227363586, "epoch": 0.5, "grad_norm": 2.0389221844681584, "learning_rate": 2.9138581852776053e-06, "logits/chosen": -2.6276705265045166, "logits/rejected": -2.611521005630493, "logps/chosen": -239.8134765625, "logps/rejected": -213.1717987060547, "loss": 0.671, "positive_losses": 0.23264846205711365, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2048894464969635, "rewards/margins": 0.07798510789871216, "rewards/margins_max": 0.2764508128166199, "rewards/margins_min": -0.10032937675714493, "rewards/margins_std": 0.16354572772979736, "rewards/rejected": 0.12690433859825134, "step": 1920 }, { "dpo_losses": 0.6257726550102234, "epoch": 0.51, "grad_norm": 8.777957393992192, "learning_rate": 2.8913117946523805e-06, "logits/chosen": -2.7061777114868164, "logits/rejected": -2.637718677520752, "logps/chosen": -280.39398193359375, "logps/rejected": -235.650634765625, "loss": 0.6681, "positive_losses": 0.009366607293486595, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.25092214345932007, "rewards/margins": 0.14841283857822418, "rewards/margins_max": 0.35846710205078125, "rewards/margins_min": -0.04116084426641464, "rewards/margins_std": 0.17491276562213898, "rewards/rejected": 0.10250934213399887, "step": 1930 }, { "dpo_losses": 0.64067143201828, "epoch": 0.51, "grad_norm": 20.00317052506146, "learning_rate": 2.8687327296049126e-06, "logits/chosen": -2.682300090789795, "logits/rejected": -2.646540641784668, "logps/chosen": -273.55609130859375, "logps/rejected": -255.3865966796875, "loss": 0.6773, "positive_losses": 0.5298584699630737, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20646294951438904, "rewards/margins": 0.11747118085622787, "rewards/margins_max": 0.30065709352493286, "rewards/margins_min": -0.07594355195760727, "rewards/margins_std": 0.1690727174282074, "rewards/rejected": 0.08899179100990295, "step": 1940 }, { "dpo_losses": 0.6489737033843994, "epoch": 0.51, "grad_norm": 1.9299680683479012, "learning_rate": 2.8461228754806376e-06, "logits/chosen": -2.676379680633545, "logits/rejected": -2.6793432235717773, "logps/chosen": -251.88558959960938, "logps/rejected": -239.60043334960938, "loss": 0.6656, "positive_losses": 0.3307304382324219, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22858497500419617, "rewards/margins": 0.09980324655771255, "rewards/margins_max": 0.3188799023628235, "rewards/margins_min": -0.07784005254507065, "rewards/margins_std": 0.1788029968738556, "rewards/rejected": 0.1287817358970642, "step": 1950 }, { "dpo_losses": 0.6377055644989014, "epoch": 0.51, "grad_norm": 2.0972809127219167, "learning_rate": 2.823484120195865e-06, "logits/chosen": -2.652918577194214, "logits/rejected": -2.6294288635253906, "logps/chosen": -273.92279052734375, "logps/rejected": -232.54562377929688, "loss": 0.6669, "positive_losses": 0.013454246334731579, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2532370388507843, "rewards/margins": 0.12490354478359222, "rewards/margins_max": 0.3485172390937805, "rewards/margins_min": -0.06277976185083389, "rewards/margins_std": 0.18805760145187378, "rewards/rejected": 0.12833350896835327, "step": 1960 }, { "dpo_losses": 0.6538165807723999, "epoch": 0.52, "grad_norm": 8.628220109094144, "learning_rate": 2.8008183540801486e-06, "logits/chosen": -2.6856980323791504, "logits/rejected": -2.674312114715576, "logps/chosen": -233.84164428710938, "logps/rejected": -219.39346313476562, "loss": 0.6946, "positive_losses": 0.5838509798049927, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2157512903213501, "rewards/margins": 0.08980618417263031, "rewards/margins_max": 0.28088387846946716, "rewards/margins_min": -0.10212485492229462, "rewards/margins_std": 0.16968229413032532, "rewards/rejected": 0.1259451061487198, "step": 1970 }, { "dpo_losses": 0.6721614599227905, "epoch": 0.52, "grad_norm": 1.9585796634150767, "learning_rate": 2.7781274697184353e-06, "logits/chosen": -2.690422773361206, "logits/rejected": -2.6423134803771973, "logps/chosen": -221.5077667236328, "logps/rejected": -226.5823211669922, "loss": 0.6894, "positive_losses": 0.07679557800292969, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.2107527256011963, "rewards/margins": 0.0497070848941803, "rewards/margins_max": 0.23413977026939392, "rewards/margins_min": -0.10646041482686996, "rewards/margins_std": 0.1523529440164566, "rewards/rejected": 0.161045640707016, "step": 1980 }, { "dpo_losses": 0.6687083840370178, "epoch": 0.52, "grad_norm": 1.8160063272567628, "learning_rate": 2.7554133617930397e-06, "logits/chosen": -2.6545233726501465, "logits/rejected": -2.565333127975464, "logps/chosen": -248.9022979736328, "logps/rejected": -216.00942993164062, "loss": 0.6734, "positive_losses": 0.223857119679451, "rewards/accuracies": 0.574999988079071, "rewards/chosen": 0.20214338600635529, "rewards/margins": 0.057341255247592926, "rewards/margins_max": 0.22431108355522156, "rewards/margins_min": -0.11150351911783218, "rewards/margins_std": 0.15205100178718567, "rewards/rejected": 0.14480213820934296, "step": 1990 }, { "dpo_losses": 0.6533797979354858, "epoch": 0.52, "grad_norm": 10.195717335113752, "learning_rate": 2.7326779269254363e-06, "logits/chosen": -2.6570885181427, "logits/rejected": -2.6376636028289795, "logps/chosen": -226.8413543701172, "logps/rejected": -214.5694122314453, "loss": 0.6552, "positive_losses": 0.07695265114307404, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24027195572853088, "rewards/margins": 0.09057092666625977, "rewards/margins_max": 0.27771997451782227, "rewards/margins_min": -0.1029549241065979, "rewards/margins_std": 0.1712425947189331, "rewards/rejected": 0.14970099925994873, "step": 2000 }, { "epoch": 0.52, "eval_dpo_losses": 0.6438818573951721, "eval_logits/chosen": -2.6499390602111816, "eval_logits/rejected": -2.614983558654785, "eval_logps/chosen": -262.0489807128906, "eval_logps/rejected": -247.1804656982422, "eval_loss": 0.6781352162361145, "eval_positive_losses": 0.22603482007980347, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": 0.22544459998607635, "eval_rewards/margins": 0.11145983636379242, "eval_rewards/margins_max": 0.41781821846961975, "eval_rewards/margins_min": -0.15051095187664032, "eval_rewards/margins_std": 0.19022272527217865, "eval_rewards/rejected": 0.11398474872112274, "eval_runtime": 428.9229, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 2000 }, { "dpo_losses": 0.6339729428291321, "epoch": 0.53, "grad_norm": 2.038824425607107, "learning_rate": 2.7099230635178954e-06, "logits/chosen": -2.6144392490386963, "logits/rejected": -2.6250531673431396, "logps/chosen": -241.5415496826172, "logps/rejected": -239.8206024169922, "loss": 0.6455, "positive_losses": 0.0629568099975586, "rewards/accuracies": 0.8500000238418579, "rewards/chosen": 0.22847697138786316, "rewards/margins": 0.1312393844127655, "rewards/margins_max": 0.3371210992336273, "rewards/margins_min": -0.03498242050409317, "rewards/margins_std": 0.1663881540298462, "rewards/rejected": 0.09723760187625885, "step": 2010 }, { "dpo_losses": 0.6432933211326599, "epoch": 0.53, "grad_norm": 15.628763218696921, "learning_rate": 2.6871506715949608e-06, "logits/chosen": -2.7466046810150146, "logits/rejected": -2.711158037185669, "logps/chosen": -295.12078857421875, "logps/rejected": -277.30511474609375, "loss": 0.6963, "positive_losses": 0.6019003987312317, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21844974160194397, "rewards/margins": 0.11679448932409286, "rewards/margins_max": 0.3955059051513672, "rewards/margins_min": -0.10713322460651398, "rewards/margins_std": 0.21821625530719757, "rewards/rejected": 0.10165522992610931, "step": 2020 }, { "dpo_losses": 0.6454865336418152, "epoch": 0.53, "grad_norm": 2.105398541414066, "learning_rate": 2.6643626526448063e-06, "logits/chosen": -2.712730646133423, "logits/rejected": -2.6918509006500244, "logps/chosen": -250.91152954101562, "logps/rejected": -256.67120361328125, "loss": 0.7062, "positive_losses": 0.41354599595069885, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.20078325271606445, "rewards/margins": 0.11357339471578598, "rewards/margins_max": 0.3843469023704529, "rewards/margins_min": -0.14561566710472107, "rewards/margins_std": 0.23204609751701355, "rewards/rejected": 0.08720986545085907, "step": 2030 }, { "dpo_losses": 0.6228288412094116, "epoch": 0.53, "grad_norm": 2.0229868736878354, "learning_rate": 2.6415609094604562e-06, "logits/chosen": -2.566884994506836, "logits/rejected": -2.5930848121643066, "logps/chosen": -285.4487609863281, "logps/rejected": -215.4723663330078, "loss": 0.645, "positive_losses": 0.16471806168556213, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23081424832344055, "rewards/margins": 0.1602402776479721, "rewards/margins_max": 0.43092918395996094, "rewards/margins_min": -0.03616338595747948, "rewards/margins_std": 0.21078911423683167, "rewards/rejected": 0.07057399302721024, "step": 2040 }, { "dpo_losses": 0.646896243095398, "epoch": 0.54, "grad_norm": 29.26735387645572, "learning_rate": 2.618747345980904e-06, "logits/chosen": -2.6859145164489746, "logits/rejected": -2.648469924926758, "logps/chosen": -238.9740447998047, "logps/rejected": -247.914794921875, "loss": 0.7102, "positive_losses": 0.5972229242324829, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20252947509288788, "rewards/margins": 0.10697458684444427, "rewards/margins_max": 0.32459062337875366, "rewards/margins_min": -0.10777413845062256, "rewards/margins_std": 0.19482077658176422, "rewards/rejected": 0.0955548956990242, "step": 2050 }, { "dpo_losses": 0.6469524502754211, "epoch": 0.54, "grad_norm": 2.274483980616763, "learning_rate": 2.595923867132136e-06, "logits/chosen": -2.722360610961914, "logits/rejected": -2.6759095191955566, "logps/chosen": -281.6043395996094, "logps/rejected": -256.77325439453125, "loss": 0.6667, "positive_losses": 0.2896057963371277, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2594970166683197, "rewards/margins": 0.10757414996623993, "rewards/margins_max": 0.3386196792125702, "rewards/margins_min": -0.09805265814065933, "rewards/margins_std": 0.19371601939201355, "rewards/rejected": 0.15192286670207977, "step": 2060 }, { "dpo_losses": 0.6544901132583618, "epoch": 0.54, "grad_norm": 1.9052299521922782, "learning_rate": 2.5730923786680672e-06, "logits/chosen": -2.6944234371185303, "logits/rejected": -2.6886143684387207, "logps/chosen": -239.3451690673828, "logps/rejected": -263.98858642578125, "loss": 0.6564, "positive_losses": 0.08744988590478897, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22319169342517853, "rewards/margins": 0.08794328570365906, "rewards/margins_max": 0.2805837392807007, "rewards/margins_min": -0.08538836240768433, "rewards/margins_std": 0.16383443772792816, "rewards/rejected": 0.13524839282035828, "step": 2070 }, { "dpo_losses": 0.6472769975662231, "epoch": 0.54, "grad_norm": 2.0715989056914497, "learning_rate": 2.5502547870114137e-06, "logits/chosen": -2.714325189590454, "logits/rejected": -2.6771862506866455, "logps/chosen": -245.0335235595703, "logps/rejected": -234.88351440429688, "loss": 0.668, "positive_losses": 0.32111629843711853, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23139286041259766, "rewards/margins": 0.10783247649669647, "rewards/margins_max": 0.3527871370315552, "rewards/margins_min": -0.13813123106956482, "rewards/margins_std": 0.21896696090698242, "rewards/rejected": 0.12356036901473999, "step": 2080 }, { "dpo_losses": 0.6535965204238892, "epoch": 0.55, "grad_norm": 2.2094415247288826, "learning_rate": 2.527412999094507e-06, "logits/chosen": -2.6340432167053223, "logits/rejected": -2.633737087249756, "logps/chosen": -245.63644409179688, "logps/rejected": -246.65625, "loss": 0.6834, "positive_losses": 0.2089000642299652, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.19841374456882477, "rewards/margins": 0.09049098938703537, "rewards/margins_max": 0.2716861963272095, "rewards/margins_min": -0.08254183828830719, "rewards/margins_std": 0.1597171425819397, "rewards/rejected": 0.1079227551817894, "step": 2090 }, { "dpo_losses": 0.6478793621063232, "epoch": 0.55, "grad_norm": 1.6665961091683164, "learning_rate": 2.504568922200064e-06, "logits/chosen": -2.7214608192443848, "logits/rejected": -2.694786787033081, "logps/chosen": -225.6134796142578, "logps/rejected": -187.5884552001953, "loss": 0.6727, "positive_losses": 0.16175270080566406, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.20994910597801208, "rewards/margins": 0.1009642630815506, "rewards/margins_max": 0.2725215256214142, "rewards/margins_min": -0.04913107305765152, "rewards/margins_std": 0.14418932795524597, "rewards/rejected": 0.10898486524820328, "step": 2100 }, { "epoch": 0.55, "eval_dpo_losses": 0.6420684456825256, "eval_logits/chosen": -2.6598217487335205, "eval_logits/rejected": -2.6245787143707275, "eval_logps/chosen": -262.303466796875, "eval_logps/rejected": -247.8636932373047, "eval_loss": 0.6811562180519104, "eval_positive_losses": 0.2672227621078491, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": 0.2228996902704239, "eval_rewards/margins": 0.11574731022119522, "eval_rewards/margins_max": 0.43427106738090515, "eval_rewards/margins_min": -0.15023651719093323, "eval_rewards/margins_std": 0.19497303664684296, "eval_rewards/rejected": 0.10715239495038986, "eval_runtime": 428.9886, "eval_samples_per_second": 4.662, "eval_steps_per_second": 0.291, "step": 2100 }, { "dpo_losses": 0.6544997096061707, "epoch": 0.55, "grad_norm": 1.7715668075544146, "learning_rate": 2.4817244638019333e-06, "logits/chosen": -2.6845505237579346, "logits/rejected": -2.667048454284668, "logps/chosen": -235.6198272705078, "logps/rejected": -235.6031951904297, "loss": 0.7309, "positive_losses": 0.020835304632782936, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22199173271656036, "rewards/margins": 0.09037148952484131, "rewards/margins_max": 0.3368614614009857, "rewards/margins_min": -0.10964126884937286, "rewards/margins_std": 0.19660863280296326, "rewards/rejected": 0.13162024319171906, "step": 2110 }, { "dpo_losses": 0.6359599828720093, "epoch": 0.55, "grad_norm": 22.723289823232573, "learning_rate": 2.4588815314058155e-06, "logits/chosen": -2.654115676879883, "logits/rejected": -2.6176581382751465, "logps/chosen": -226.5947723388672, "logps/rejected": -234.5386199951172, "loss": 0.6832, "positive_losses": 0.0, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.22623100876808167, "rewards/margins": 0.12508252263069153, "rewards/margins_max": 0.30088773369789124, "rewards/margins_min": -0.037920646369457245, "rewards/margins_std": 0.15172867476940155, "rewards/rejected": 0.10114847123622894, "step": 2120 }, { "dpo_losses": 0.6571123003959656, "epoch": 0.56, "grad_norm": 11.54615610076033, "learning_rate": 2.4360420323899922e-06, "logits/chosen": -2.695882797241211, "logits/rejected": -2.690129518508911, "logps/chosen": -243.5088348388672, "logps/rejected": -253.06777954101562, "loss": 0.69, "positive_losses": 0.507931113243103, "rewards/accuracies": 0.6875, "rewards/chosen": 0.195848286151886, "rewards/margins": 0.08386779576539993, "rewards/margins_max": 0.2921691834926605, "rewards/margins_min": -0.13089944422245026, "rewards/margins_std": 0.1906213015317917, "rewards/rejected": 0.11198048293590546, "step": 2130 }, { "dpo_losses": 0.6368788480758667, "epoch": 0.56, "grad_norm": 22.020147634733714, "learning_rate": 2.4132078738460585e-06, "logits/chosen": -2.7201437950134277, "logits/rejected": -2.7059519290924072, "logps/chosen": -256.1385803222656, "logps/rejected": -251.4710235595703, "loss": 0.6769, "positive_losses": 0.3843521177768707, "rewards/accuracies": 0.75, "rewards/chosen": 0.22043637931346893, "rewards/margins": 0.12601637840270996, "rewards/margins_max": 0.32969871163368225, "rewards/margins_min": -0.06888232380151749, "rewards/margins_std": 0.17837919294834137, "rewards/rejected": 0.09442003071308136, "step": 2140 }, { "dpo_losses": 0.6462651491165161, "epoch": 0.56, "grad_norm": 1.9838140100200745, "learning_rate": 2.3903809624196826e-06, "logits/chosen": -2.676684856414795, "logits/rejected": -2.646646499633789, "logps/chosen": -260.8387756347656, "logps/rejected": -224.59097290039062, "loss": 0.6724, "positive_losses": 0.4286876618862152, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22429773211479187, "rewards/margins": 0.10722216218709946, "rewards/margins_max": 0.2981399595737457, "rewards/margins_min": -0.1043846383690834, "rewards/margins_std": 0.18404658138751984, "rewards/rejected": 0.11707557737827301, "step": 2150 }, { "dpo_losses": 0.6493538022041321, "epoch": 0.57, "grad_norm": 2.0189123035296164, "learning_rate": 2.3675632041513978e-06, "logits/chosen": -2.605515956878662, "logits/rejected": -2.600921392440796, "logps/chosen": -247.6452178955078, "logps/rejected": -241.02883911132812, "loss": 0.6817, "positive_losses": 0.27517661452293396, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1930980533361435, "rewards/margins": 0.09897392988204956, "rewards/margins_max": 0.3034440279006958, "rewards/margins_min": -0.09545283764600754, "rewards/margins_std": 0.17565584182739258, "rewards/rejected": 0.09412412345409393, "step": 2160 }, { "dpo_losses": 0.6231532692909241, "epoch": 0.57, "grad_norm": 15.191125507659171, "learning_rate": 2.3447565043174533e-06, "logits/chosen": -2.577331066131592, "logits/rejected": -2.61116361618042, "logps/chosen": -252.9422149658203, "logps/rejected": -253.1884307861328, "loss": 0.6757, "positive_losses": 0.49209538102149963, "rewards/accuracies": 0.75, "rewards/chosen": 0.23586344718933105, "rewards/margins": 0.15678806602954865, "rewards/margins_max": 0.38855940103530884, "rewards/margins_min": -0.0624086856842041, "rewards/margins_std": 0.20114819705486298, "rewards/rejected": 0.07907537370920181, "step": 2170 }, { "dpo_losses": 0.635786235332489, "epoch": 0.57, "grad_norm": 6.3786834846092, "learning_rate": 2.321962767270724e-06, "logits/chosen": -2.751986503601074, "logits/rejected": -2.7088913917541504, "logps/chosen": -288.1539611816406, "logps/rejected": -250.8312225341797, "loss": 0.6849, "positive_losses": 0.41348037123680115, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.22203759849071503, "rewards/margins": 0.12962044775485992, "rewards/margins_max": 0.3543454110622406, "rewards/margins_min": -0.08103072643280029, "rewards/margins_std": 0.19302402436733246, "rewards/rejected": 0.0924171507358551, "step": 2180 }, { "dpo_losses": 0.6522166132926941, "epoch": 0.57, "grad_norm": 2.1124788407389032, "learning_rate": 2.299183896281692e-06, "logits/chosen": -2.7143988609313965, "logits/rejected": -2.7166504859924316, "logps/chosen": -251.7370147705078, "logps/rejected": -271.95721435546875, "loss": 0.6656, "positive_losses": 0.07898597419261932, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20387010276317596, "rewards/margins": 0.09346799552440643, "rewards/margins_max": 0.30877038836479187, "rewards/margins_min": -0.10738486051559448, "rewards/margins_std": 0.1863938271999359, "rewards/rejected": 0.11040210723876953, "step": 2190 }, { "dpo_losses": 0.6498124599456787, "epoch": 0.58, "grad_norm": 8.628097641351273, "learning_rate": 2.2764217933795297e-06, "logits/chosen": -2.7102949619293213, "logits/rejected": -2.65270733833313, "logps/chosen": -271.67999267578125, "logps/rejected": -237.8876495361328, "loss": 0.6657, "positive_losses": 0.31769466400146484, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22908155620098114, "rewards/margins": 0.09972299635410309, "rewards/margins_max": 0.3029990792274475, "rewards/margins_min": -0.124956414103508, "rewards/margins_std": 0.18858322501182556, "rewards/rejected": 0.12935855984687805, "step": 2200 }, { "epoch": 0.58, "eval_dpo_losses": 0.6416943073272705, "eval_logits/chosen": -2.6534764766693115, "eval_logits/rejected": -2.619741201400757, "eval_logps/chosen": -261.8956604003906, "eval_logps/rejected": -247.55897521972656, "eval_loss": 0.680858314037323, "eval_positive_losses": 0.26072269678115845, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.22697755694389343, "eval_rewards/margins": 0.11677798628807068, "eval_rewards/margins_max": 0.4373670518398285, "eval_rewards/margins_min": -0.15177133679389954, "eval_rewards/margins_std": 0.19642306864261627, "eval_rewards/rejected": 0.11019958555698395, "eval_runtime": 428.7127, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 2200 }, { "dpo_losses": 0.640792965888977, "epoch": 0.58, "grad_norm": 14.102969112671586, "learning_rate": 2.2536783591932786e-06, "logits/chosen": -2.669931411743164, "logits/rejected": -2.6289846897125244, "logps/chosen": -239.69937133789062, "logps/rejected": -248.9862823486328, "loss": 0.6628, "positive_losses": 0.3990330100059509, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2111925631761551, "rewards/margins": 0.11656135320663452, "rewards/margins_max": 0.3115245997905731, "rewards/margins_min": -0.07762470841407776, "rewards/margins_std": 0.17548440396785736, "rewards/rejected": 0.09463120996952057, "step": 2210 }, { "dpo_losses": 0.6432579755783081, "epoch": 0.58, "grad_norm": 20.01822061258544, "learning_rate": 2.230955492793149e-06, "logits/chosen": -2.7320353984832764, "logits/rejected": -2.720430374145508, "logps/chosen": -301.41864013671875, "logps/rejected": -265.033447265625, "loss": 0.6998, "positive_losses": 0.4360194206237793, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21062934398651123, "rewards/margins": 0.11473493278026581, "rewards/margins_max": 0.35814404487609863, "rewards/margins_min": -0.12162482738494873, "rewards/margins_std": 0.21247203648090363, "rewards/rejected": 0.09589441865682602, "step": 2220 }, { "dpo_losses": 0.6411441564559937, "epoch": 0.58, "grad_norm": 21.36363373097859, "learning_rate": 2.208255091531947e-06, "logits/chosen": -2.624075412750244, "logits/rejected": -2.6277029514312744, "logps/chosen": -228.22860717773438, "logps/rejected": -228.2174072265625, "loss": 0.6653, "positive_losses": 0.06467847526073456, "rewards/accuracies": 0.75, "rewards/chosen": 0.2375207245349884, "rewards/margins": 0.11537857353687286, "rewards/margins_max": 0.31860530376434326, "rewards/margins_min": -0.060957975685596466, "rewards/margins_std": 0.16837990283966064, "rewards/rejected": 0.12214212119579315, "step": 2230 }, { "dpo_losses": 0.6498722434043884, "epoch": 0.59, "grad_norm": 7.689554575705533, "learning_rate": 2.1855790508866435e-06, "logits/chosen": -2.5988476276397705, "logits/rejected": -2.547642469406128, "logps/chosen": -262.65704345703125, "logps/rejected": -244.81527709960938, "loss": 0.7201, "positive_losses": 0.19701528549194336, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.26206567883491516, "rewards/margins": 0.11004292964935303, "rewards/margins_max": 0.38525325059890747, "rewards/margins_min": -0.16038253903388977, "rewards/margins_std": 0.24645563960075378, "rewards/rejected": 0.15202273428440094, "step": 2240 }, { "dpo_losses": 0.6424070596694946, "epoch": 0.59, "grad_norm": 6.49877947572252, "learning_rate": 2.162929264300107e-06, "logits/chosen": -2.70169997215271, "logits/rejected": -2.7117209434509277, "logps/chosen": -261.92449951171875, "logps/rejected": -240.8101348876953, "loss": 0.7243, "positive_losses": 0.7694869041442871, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21866443753242493, "rewards/margins": 0.11571111530065536, "rewards/margins_max": 0.36487776041030884, "rewards/margins_min": -0.08591816574335098, "rewards/margins_std": 0.20337148010730743, "rewards/rejected": 0.10295332968235016, "step": 2250 }, { "dpo_losses": 0.6471494436264038, "epoch": 0.59, "grad_norm": 12.778357591235745, "learning_rate": 2.1403076230230006e-06, "logits/chosen": -2.5875096321105957, "logits/rejected": -2.5508904457092285, "logps/chosen": -247.1914520263672, "logps/rejected": -239.9813995361328, "loss": 0.6934, "positive_losses": 0.33063822984695435, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21310050785541534, "rewards/margins": 0.10168435424566269, "rewards/margins_max": 0.2952573299407959, "rewards/margins_min": -0.08863938599824905, "rewards/margins_std": 0.16796045005321503, "rewards/rejected": 0.11141613870859146, "step": 2260 }, { "dpo_losses": 0.631346583366394, "epoch": 0.59, "grad_norm": 2.099881886384941, "learning_rate": 2.11771601595586e-06, "logits/chosen": -2.680393934249878, "logits/rejected": -2.623697519302368, "logps/chosen": -242.8871612548828, "logps/rejected": -234.7235565185547, "loss": 0.6365, "positive_losses": 0.029846955090761185, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2638316750526428, "rewards/margins": 0.14046084880828857, "rewards/margins_max": 0.39030706882476807, "rewards/margins_min": -0.07516772300004959, "rewards/margins_std": 0.20723383128643036, "rewards/rejected": 0.12337078899145126, "step": 2270 }, { "dpo_losses": 0.6454527378082275, "epoch": 0.6, "grad_norm": 8.151043746671727, "learning_rate": 2.0951563294913737e-06, "logits/chosen": -2.6282997131347656, "logits/rejected": -2.615326404571533, "logps/chosen": -261.466064453125, "logps/rejected": -272.07440185546875, "loss": 0.684, "positive_losses": 0.5209732055664062, "rewards/accuracies": 0.6875, "rewards/chosen": 0.23491840064525604, "rewards/margins": 0.10922648757696152, "rewards/margins_max": 0.31910452246665955, "rewards/margins_min": -0.11001088470220566, "rewards/margins_std": 0.19210968911647797, "rewards/rejected": 0.12569192051887512, "step": 2280 }, { "dpo_losses": 0.6411945819854736, "epoch": 0.6, "grad_norm": 1.6197063345883989, "learning_rate": 2.0726304473568693e-06, "logits/chosen": -2.669771671295166, "logits/rejected": -2.616565704345703, "logps/chosen": -236.112060546875, "logps/rejected": -211.77685546875, "loss": 0.6947, "positive_losses": 0.9044864773750305, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23386506736278534, "rewards/margins": 0.11773782968521118, "rewards/margins_max": 0.35969018936157227, "rewards/margins_min": -0.07756470143795013, "rewards/margins_std": 0.1970023512840271, "rewards/rejected": 0.11612723767757416, "step": 2290 }, { "dpo_losses": 0.6478989720344543, "epoch": 0.6, "grad_norm": 1.9365441210128405, "learning_rate": 2.050140250457023e-06, "logits/chosen": -2.69016432762146, "logits/rejected": -2.6887741088867188, "logps/chosen": -264.53436279296875, "logps/rejected": -269.4958801269531, "loss": 0.7128, "positive_losses": 0.5848304629325867, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2253313958644867, "rewards/margins": 0.1076095849275589, "rewards/margins_max": 0.34349751472473145, "rewards/margins_min": -0.08968096226453781, "rewards/margins_std": 0.1942981779575348, "rewards/rejected": 0.1177218109369278, "step": 2300 }, { "epoch": 0.6, "eval_dpo_losses": 0.6414274573326111, "eval_logits/chosen": -2.666247606277466, "eval_logits/rejected": -2.63419771194458, "eval_logps/chosen": -261.9747619628906, "eval_logps/rejected": -247.71238708496094, "eval_loss": 0.6833491325378418, "eval_positive_losses": 0.2781386375427246, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": 0.2261866331100464, "eval_rewards/margins": 0.11752131581306458, "eval_rewards/margins_max": 0.4381586015224457, "eval_rewards/margins_min": -0.15121419727802277, "eval_rewards/margins_std": 0.19746553897857666, "eval_rewards/rejected": 0.10866532474756241, "eval_runtime": 428.6796, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 2300 }, { "dpo_losses": 0.6425520777702332, "epoch": 0.6, "grad_norm": 1.8545017653986802, "learning_rate": 2.0276876167168042e-06, "logits/chosen": -2.6687874794006348, "logits/rejected": -2.633638381958008, "logps/chosen": -291.30621337890625, "logps/rejected": -232.53659057617188, "loss": 0.6689, "positive_losses": 0.40098267793655396, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2242567092180252, "rewards/margins": 0.11582033336162567, "rewards/margins_max": 0.37259894609451294, "rewards/margins_min": -0.09212379157543182, "rewards/margins_std": 0.2043609619140625, "rewards/rejected": 0.10843636095523834, "step": 2310 }, { "dpo_losses": 0.6488217711448669, "epoch": 0.61, "grad_norm": 10.778491211794645, "learning_rate": 2.0052744209246682e-06, "logits/chosen": -2.7158865928649902, "logits/rejected": -2.6738638877868652, "logps/chosen": -240.2007293701172, "logps/rejected": -221.02197265625, "loss": 0.6874, "positive_losses": 0.5481477975845337, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.22526594996452332, "rewards/margins": 0.10251255333423615, "rewards/margins_max": 0.337367981672287, "rewards/margins_min": -0.1220068708062172, "rewards/margins_std": 0.2029140442609787, "rewards/rejected": 0.12275341898202896, "step": 2320 }, { "dpo_losses": 0.6290943622589111, "epoch": 0.61, "grad_norm": 4.815377700268198, "learning_rate": 1.9829025345760127e-06, "logits/chosen": -2.696500778198242, "logits/rejected": -2.692378520965576, "logps/chosen": -279.5434875488281, "logps/rejected": -288.99658203125, "loss": 0.6759, "positive_losses": 0.5212033987045288, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2523339092731476, "rewards/margins": 0.1457194983959198, "rewards/margins_max": 0.42099323868751526, "rewards/margins_min": -0.05689948797225952, "rewards/margins_std": 0.21575328707695007, "rewards/rejected": 0.10661438852548599, "step": 2330 }, { "dpo_losses": 0.6482885479927063, "epoch": 0.61, "grad_norm": 5.37540239114238, "learning_rate": 1.9605738257169115e-06, "logits/chosen": -2.640395164489746, "logits/rejected": -2.6198418140411377, "logps/chosen": -271.88134765625, "logps/rejected": -268.78326416015625, "loss": 0.6778, "positive_losses": 0.2157512605190277, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.21400196850299835, "rewards/margins": 0.10478683561086655, "rewards/margins_max": 0.36149492859840393, "rewards/margins_min": -0.12646926939487457, "rewards/margins_std": 0.21732476353645325, "rewards/rejected": 0.10921511799097061, "step": 2340 }, { "dpo_losses": 0.6575223207473755, "epoch": 0.62, "grad_norm": 9.852750798798407, "learning_rate": 1.9382901587881275e-06, "logits/chosen": -2.709843158721924, "logits/rejected": -2.6917917728424072, "logps/chosen": -266.71832275390625, "logps/rejected": -236.25942993164062, "loss": 0.6679, "positive_losses": 0.13655433058738708, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.2228725403547287, "rewards/margins": 0.08450976759195328, "rewards/margins_max": 0.31864067912101746, "rewards/margins_min": -0.13753117620944977, "rewards/margins_std": 0.20050354301929474, "rewards/rejected": 0.13836278021335602, "step": 2350 }, { "dpo_losses": 0.6405150294303894, "epoch": 0.62, "grad_norm": 7.315606134089701, "learning_rate": 1.916053394469437e-06, "logits/chosen": -2.698080062866211, "logits/rejected": -2.653700590133667, "logps/chosen": -263.4848937988281, "logps/rejected": -207.7606658935547, "loss": 0.6813, "positive_losses": 0.5083738565444946, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21712832152843475, "rewards/margins": 0.12794360518455505, "rewards/margins_max": 0.3849547505378723, "rewards/margins_min": -0.1235085278749466, "rewards/margins_std": 0.2276526689529419, "rewards/rejected": 0.08918474614620209, "step": 2360 }, { "dpo_losses": 0.6614426970481873, "epoch": 0.62, "grad_norm": 2.3666701554107026, "learning_rate": 1.8938653895242604e-06, "logits/chosen": -2.7283883094787598, "logits/rejected": -2.7107996940612793, "logps/chosen": -227.27963256835938, "logps/rejected": -239.0881805419922, "loss": 0.6799, "positive_losses": 0.3466273248195648, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.20778174698352814, "rewards/margins": 0.0777909979224205, "rewards/margins_max": 0.26601895689964294, "rewards/margins_min": -0.12979359924793243, "rewards/margins_std": 0.1801576465368271, "rewards/rejected": 0.12999074161052704, "step": 2370 }, { "dpo_losses": 0.6346568465232849, "epoch": 0.62, "grad_norm": 1.9285349009854211, "learning_rate": 1.8717279966446267e-06, "logits/chosen": -2.716540813446045, "logits/rejected": -2.6619017124176025, "logps/chosen": -276.6706848144531, "logps/rejected": -232.4741973876953, "loss": 0.6656, "positive_losses": 0.23119506239891052, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23178791999816895, "rewards/margins": 0.13141080737113953, "rewards/margins_max": 0.3396896719932556, "rewards/margins_min": -0.07753165811300278, "rewards/margins_std": 0.1872069537639618, "rewards/rejected": 0.1003771424293518, "step": 2380 }, { "dpo_losses": 0.6504985094070435, "epoch": 0.63, "grad_norm": 12.739750950174063, "learning_rate": 1.8496430642964698e-06, "logits/chosen": -2.7091147899627686, "logits/rejected": -2.698914051055908, "logps/chosen": -267.83734130859375, "logps/rejected": -262.9453430175781, "loss": 0.6635, "positive_losses": 0.24009115993976593, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21650449931621552, "rewards/margins": 0.09888532012701035, "rewards/margins_max": 0.3336879014968872, "rewards/margins_min": -0.14185991883277893, "rewards/margins_std": 0.2064811885356903, "rewards/rejected": 0.11761917918920517, "step": 2390 }, { "dpo_losses": 0.653568685054779, "epoch": 0.63, "grad_norm": 1.9055943846878478, "learning_rate": 1.827612436565286e-06, "logits/chosen": -2.696230173110962, "logits/rejected": -2.6393418312072754, "logps/chosen": -218.92666625976562, "logps/rejected": -252.3378143310547, "loss": 0.664, "positive_losses": 0.29177045822143555, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22582077980041504, "rewards/margins": 0.0924738198518753, "rewards/margins_max": 0.3166094422340393, "rewards/margins_min": -0.10669116675853729, "rewards/margins_std": 0.19182102382183075, "rewards/rejected": 0.13334695994853973, "step": 2400 }, { "epoch": 0.63, "eval_dpo_losses": 0.6416491866111755, "eval_logits/chosen": -2.670564889907837, "eval_logits/rejected": -2.637544631958008, "eval_logps/chosen": -261.88232421875, "eval_logps/rejected": -247.55894470214844, "eval_loss": 0.6815658807754517, "eval_positive_losses": 0.2634168565273285, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": 0.22711116075515747, "eval_rewards/margins": 0.1169111505150795, "eval_rewards/margins_max": 0.43684303760528564, "eval_rewards/margins_min": -0.1508060246706009, "eval_rewards/margins_std": 0.19625326991081238, "eval_rewards/rejected": 0.11020002514123917, "eval_runtime": 428.8578, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.291, "step": 2400 }, { "dpo_losses": 0.6515271067619324, "epoch": 0.63, "grad_norm": 13.319168641319113, "learning_rate": 1.8056379530021492e-06, "logits/chosen": -2.639566659927368, "logits/rejected": -2.6303904056549072, "logps/chosen": -228.5304718017578, "logps/rejected": -228.3441162109375, "loss": 0.6496, "positive_losses": 0.049842070788145065, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21713919937610626, "rewards/margins": 0.09499481320381165, "rewards/margins_max": 0.3225991725921631, "rewards/margins_min": -0.10366543382406235, "rewards/margins_std": 0.1886659562587738, "rewards/rejected": 0.12214437872171402, "step": 2410 }, { "dpo_losses": 0.6352885961532593, "epoch": 0.63, "grad_norm": 9.397242088746662, "learning_rate": 1.7837214484701154e-06, "logits/chosen": -2.7199771404266357, "logits/rejected": -2.6601598262786865, "logps/chosen": -312.45355224609375, "logps/rejected": -259.98956298828125, "loss": 0.6816, "positive_losses": 0.5449883341789246, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2680310308933258, "rewards/margins": 0.13462026417255402, "rewards/margins_max": 0.3901945948600769, "rewards/margins_min": -0.10415732860565186, "rewards/margins_std": 0.21482273936271667, "rewards/rejected": 0.1334107667207718, "step": 2420 }, { "dpo_losses": 0.6505361795425415, "epoch": 0.64, "grad_norm": 5.197291314292016, "learning_rate": 1.7618647529910043e-06, "logits/chosen": -2.7230987548828125, "logits/rejected": -2.6814982891082764, "logps/chosen": -287.58319091796875, "logps/rejected": -275.0035095214844, "loss": 0.6817, "positive_losses": 0.019449805840849876, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21414928138256073, "rewards/margins": 0.09958560764789581, "rewards/margins_max": 0.3731870651245117, "rewards/margins_min": -0.12462493032217026, "rewards/margins_std": 0.21799974143505096, "rewards/rejected": 0.11456366628408432, "step": 2430 }, { "dpo_losses": 0.6427281498908997, "epoch": 0.64, "grad_norm": 9.174269927287428, "learning_rate": 1.7400696915925996e-06, "logits/chosen": -2.633579730987549, "logits/rejected": -2.5776102542877197, "logps/chosen": -250.72915649414062, "logps/rejected": -227.14639282226562, "loss": 0.654, "positive_losses": 0.13230650126934052, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23319904506206512, "rewards/margins": 0.11510088294744492, "rewards/margins_max": 0.3420729637145996, "rewards/margins_min": -0.07432746142148972, "rewards/margins_std": 0.1844799965620041, "rewards/rejected": 0.1180981770157814, "step": 2440 }, { "dpo_losses": 0.6590006947517395, "epoch": 0.64, "grad_norm": 1.8929680421633466, "learning_rate": 1.718338084156254e-06, "logits/chosen": -2.720773220062256, "logits/rejected": -2.7224438190460205, "logps/chosen": -257.0924987792969, "logps/rejected": -280.6599426269531, "loss": 0.6886, "positive_losses": 0.19687971472740173, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.21660399436950684, "rewards/margins": 0.07879406213760376, "rewards/margins_max": 0.24527005851268768, "rewards/margins_min": -0.09501490741968155, "rewards/margins_std": 0.14944718778133392, "rewards/rejected": 0.13780996203422546, "step": 2450 }, { "dpo_losses": 0.6474701166152954, "epoch": 0.64, "grad_norm": 14.933123725600801, "learning_rate": 1.6966717452649372e-06, "logits/chosen": -2.7045950889587402, "logits/rejected": -2.69558048248291, "logps/chosen": -251.3594207763672, "logps/rejected": -238.4080810546875, "loss": 0.6758, "positive_losses": 0.19091586768627167, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2184688150882721, "rewards/margins": 0.10467857122421265, "rewards/margins_max": 0.3353387713432312, "rewards/margins_min": -0.09618758410215378, "rewards/margins_std": 0.1976177990436554, "rewards/rejected": 0.11379025131464005, "step": 2460 }, { "dpo_losses": 0.6434329748153687, "epoch": 0.65, "grad_norm": 2.026333419334601, "learning_rate": 1.6750724840517103e-06, "logits/chosen": -2.6674137115478516, "logits/rejected": -2.6695876121520996, "logps/chosen": -278.2698974609375, "logps/rejected": -266.6101379394531, "loss": 0.6952, "positive_losses": 0.4136360287666321, "rewards/accuracies": 0.75, "rewards/chosen": 0.22013676166534424, "rewards/margins": 0.11484777927398682, "rewards/margins_max": 0.31252869963645935, "rewards/margins_min": -0.07297607511281967, "rewards/margins_std": 0.1760813444852829, "rewards/rejected": 0.10528896003961563, "step": 2470 }, { "dpo_losses": 0.6468105912208557, "epoch": 0.65, "grad_norm": 15.874665504596074, "learning_rate": 1.6535421040486686e-06, "logits/chosen": -2.684587001800537, "logits/rejected": -2.678257465362549, "logps/chosen": -260.3267517089844, "logps/rejected": -248.08920288085938, "loss": 0.6747, "positive_losses": 0.22333469986915588, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22517025470733643, "rewards/margins": 0.10742548853158951, "rewards/margins_max": 0.32099416851997375, "rewards/margins_min": -0.10793408006429672, "rewards/margins_std": 0.1870633363723755, "rewards/rejected": 0.11774475872516632, "step": 2480 }, { "dpo_losses": 0.6184683442115784, "epoch": 0.65, "grad_norm": 11.685645297983053, "learning_rate": 1.6320824030363458e-06, "logits/chosen": -2.6181864738464355, "logits/rejected": -2.547217607498169, "logps/chosen": -272.03216552734375, "logps/rejected": -237.7306671142578, "loss": 0.6428, "positive_losses": 0.1731315404176712, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.25566861033439636, "rewards/margins": 0.1680397093296051, "rewards/margins_max": 0.40417972207069397, "rewards/margins_min": -0.07985541224479675, "rewards/margins_std": 0.21474552154541016, "rewards/rejected": 0.08762890100479126, "step": 2490 }, { "dpo_losses": 0.6308268904685974, "epoch": 0.65, "grad_norm": 11.818517373396437, "learning_rate": 1.6106951728936028e-06, "logits/chosen": -2.6593117713928223, "logits/rejected": -2.681063175201416, "logps/chosen": -230.7357940673828, "logps/rejected": -251.7682647705078, "loss": 0.6854, "positive_losses": 0.7284355163574219, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.23075993359088898, "rewards/margins": 0.13796807825565338, "rewards/margins_max": 0.3351263403892517, "rewards/margins_min": -0.04487986862659454, "rewards/margins_std": 0.17346806824207306, "rewards/rejected": 0.09279186278581619, "step": 2500 }, { "epoch": 0.65, "eval_dpo_losses": 0.6403778195381165, "eval_logits/chosen": -2.6642088890075684, "eval_logits/rejected": -2.6317145824432373, "eval_logps/chosen": -261.55877685546875, "eval_logps/rejected": -247.54385375976562, "eval_loss": 0.6813825368881226, "eval_positive_losses": 0.25728633999824524, "eval_rewards/accuracies": 0.7179999947547913, "eval_rewards/chosen": 0.23034650087356567, "eval_rewards/margins": 0.1199958547949791, "eval_rewards/margins_max": 0.44315144419670105, "eval_rewards/margins_min": -0.15268249809741974, "eval_rewards/margins_std": 0.19928030669689178, "eval_rewards/rejected": 0.11035063862800598, "eval_runtime": 428.7586, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 2500 }, { "dpo_losses": 0.6385301351547241, "epoch": 0.66, "grad_norm": 2.0337771614010047, "learning_rate": 1.5893821994479996e-06, "logits/chosen": -2.6357200145721436, "logits/rejected": -2.591759204864502, "logps/chosen": -238.2237548828125, "logps/rejected": -235.53396606445312, "loss": 0.6456, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2237323820590973, "rewards/margins": 0.12865914404392242, "rewards/margins_max": 0.4088048040866852, "rewards/margins_min": -0.11632319539785385, "rewards/margins_std": 0.23324613273143768, "rewards/rejected": 0.09507322311401367, "step": 2510 }, { "dpo_losses": 0.6342465281486511, "epoch": 0.66, "grad_norm": 12.936001455897884, "learning_rate": 1.5681452623266868e-06, "logits/chosen": -2.7126686573028564, "logits/rejected": -2.6774439811706543, "logps/chosen": -236.6406707763672, "logps/rejected": -256.8985900878906, "loss": 0.6883, "positive_losses": 0.389979749917984, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21572761237621307, "rewards/margins": 0.1364932507276535, "rewards/margins_max": 0.40666407346725464, "rewards/margins_min": -0.0939478650689125, "rewards/margins_std": 0.23144134879112244, "rewards/rejected": 0.07923434674739838, "step": 2520 }, { "dpo_losses": 0.6177478432655334, "epoch": 0.66, "grad_norm": 1.8569364826060555, "learning_rate": 1.5469861348078014e-06, "logits/chosen": -2.637803316116333, "logits/rejected": -2.647839307785034, "logps/chosen": -255.9474639892578, "logps/rejected": -269.5167541503906, "loss": 0.6458, "positive_losses": 0.29298895597457886, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2587506175041199, "rewards/margins": 0.17714685201644897, "rewards/margins_max": 0.4911496043205261, "rewards/margins_min": -0.054872214794158936, "rewards/margins_std": 0.24489715695381165, "rewards/rejected": 0.08160378038883209, "step": 2530 }, { "dpo_losses": 0.6516502499580383, "epoch": 0.66, "grad_norm": 19.06675957788269, "learning_rate": 1.5259065836724035e-06, "logits/chosen": -2.665034532546997, "logits/rejected": -2.6555521488189697, "logps/chosen": -231.2336883544922, "logps/rejected": -279.32928466796875, "loss": 0.6898, "positive_losses": 0.47542038559913635, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.1883983165025711, "rewards/margins": 0.095589280128479, "rewards/margins_max": 0.3161161541938782, "rewards/margins_min": -0.11133667081594467, "rewards/margins_std": 0.1936117708683014, "rewards/rejected": 0.0928090289235115, "step": 2540 }, { "dpo_losses": 0.6406174898147583, "epoch": 0.67, "grad_norm": 9.215663380250275, "learning_rate": 1.5049083690569456e-06, "logits/chosen": -2.664921522140503, "logits/rejected": -2.6189188957214355, "logps/chosen": -226.3057403564453, "logps/rejected": -233.3822784423828, "loss": 0.6725, "positive_losses": 0.36033034324645996, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2260403335094452, "rewards/margins": 0.11755865812301636, "rewards/margins_max": 0.3342980146408081, "rewards/margins_min": -0.0829990804195404, "rewards/margins_std": 0.18444745242595673, "rewards/rejected": 0.10848164558410645, "step": 2550 }, { "dpo_losses": 0.6350597739219666, "epoch": 0.67, "grad_norm": 2.0562404087387915, "learning_rate": 1.4839932443063057e-06, "logits/chosen": -2.5878238677978516, "logits/rejected": -2.5637125968933105, "logps/chosen": -214.47384643554688, "logps/rejected": -194.06439208984375, "loss": 0.6625, "positive_losses": 0.4961133897304535, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23396773636341095, "rewards/margins": 0.13020317256450653, "rewards/margins_max": 0.3193449378013611, "rewards/margins_min": -0.06943968683481216, "rewards/margins_std": 0.1697998195886612, "rewards/rejected": 0.10376455634832382, "step": 2560 }, { "dpo_losses": 0.63968825340271, "epoch": 0.67, "grad_norm": 12.910559278883106, "learning_rate": 1.4631629558273803e-06, "logits/chosen": -2.690162181854248, "logits/rejected": -2.6756958961486816, "logps/chosen": -277.5194396972656, "logps/rejected": -259.59454345703125, "loss": 0.6729, "positive_losses": 0.13807010650634766, "rewards/accuracies": 0.6875, "rewards/chosen": 0.21853511035442352, "rewards/margins": 0.1262436956167221, "rewards/margins_max": 0.35763314366340637, "rewards/margins_min": -0.1083545833826065, "rewards/margins_std": 0.20638838410377502, "rewards/rejected": 0.09229140728712082, "step": 2570 }, { "dpo_losses": 0.6387574672698975, "epoch": 0.68, "grad_norm": 1.9039663800749447, "learning_rate": 1.4424192429432657e-06, "logits/chosen": -2.6643788814544678, "logits/rejected": -2.617077350616455, "logps/chosen": -248.69882202148438, "logps/rejected": -255.11123657226562, "loss": 0.6666, "positive_losses": 0.4080939292907715, "rewards/accuracies": 0.75, "rewards/chosen": 0.23328952491283417, "rewards/margins": 0.12091660499572754, "rewards/margins_max": 0.3187905550003052, "rewards/margins_min": -0.08352819085121155, "rewards/margins_std": 0.1809152215719223, "rewards/rejected": 0.11237289011478424, "step": 2580 }, { "dpo_losses": 0.6367157697677612, "epoch": 0.68, "grad_norm": 3.798827622911278, "learning_rate": 1.421763837748016e-06, "logits/chosen": -2.6870176792144775, "logits/rejected": -2.661057710647583, "logps/chosen": -265.65948486328125, "logps/rejected": -227.1802520751953, "loss": 0.7059, "positive_losses": 0.6233920454978943, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22595369815826416, "rewards/margins": 0.1299133151769638, "rewards/margins_max": 0.3707989454269409, "rewards/margins_min": -0.10667713731527328, "rewards/margins_std": 0.21823760867118835, "rewards/rejected": 0.09604041278362274, "step": 2590 }, { "dpo_losses": 0.6473874449729919, "epoch": 0.68, "grad_norm": 9.802999963713875, "learning_rate": 1.401198464962021e-06, "logits/chosen": -2.6622397899627686, "logits/rejected": -2.649526596069336, "logps/chosen": -243.1597442626953, "logps/rejected": -274.5718994140625, "loss": 0.6744, "positive_losses": 0.3092424273490906, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.19757656753063202, "rewards/margins": 0.10350263118743896, "rewards/margins_max": 0.31503498554229736, "rewards/margins_min": -0.08336089551448822, "rewards/margins_std": 0.18119773268699646, "rewards/rejected": 0.09407395124435425, "step": 2600 }, { "epoch": 0.68, "eval_dpo_losses": 0.6418951749801636, "eval_logits/chosen": -2.6558468341827393, "eval_logits/rejected": -2.623983383178711, "eval_logps/chosen": -261.6072692871094, "eval_logps/rejected": -247.284423828125, "eval_loss": 0.6808555722236633, "eval_positive_losses": 0.27308306097984314, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": 0.22986145317554474, "eval_rewards/margins": 0.11691611260175705, "eval_rewards/margins_max": 0.44820886850357056, "eval_rewards/margins_min": -0.15670141577720642, "eval_rewards/margins_std": 0.20118042826652527, "eval_rewards/rejected": 0.11294533312320709, "eval_runtime": 428.4217, "eval_samples_per_second": 4.668, "eval_steps_per_second": 0.292, "step": 2600 }, { "dpo_losses": 0.6365216374397278, "epoch": 0.68, "grad_norm": 12.037343094893478, "learning_rate": 1.3807248417879896e-06, "logits/chosen": -2.65614652633667, "logits/rejected": -2.612067461013794, "logps/chosen": -268.8036804199219, "logps/rejected": -212.86721801757812, "loss": 0.6814, "positive_losses": 0.25806960463523865, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2531015872955322, "rewards/margins": 0.12736979126930237, "rewards/margins_max": 0.3798461854457855, "rewards/margins_min": -0.07106635719537735, "rewards/margins_std": 0.2040456086397171, "rewards/rejected": 0.12573178112506866, "step": 2610 }, { "dpo_losses": 0.6571398973464966, "epoch": 0.69, "grad_norm": 2.213316267916755, "learning_rate": 1.3603446777675665e-06, "logits/chosen": -2.658865451812744, "logits/rejected": -2.6524603366851807, "logps/chosen": -247.210693359375, "logps/rejected": -243.6188201904297, "loss": 0.6811, "positive_losses": 0.2383430451154709, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.21061749756336212, "rewards/margins": 0.08448885381221771, "rewards/margins_max": 0.3074760437011719, "rewards/margins_min": -0.1139925867319107, "rewards/margins_std": 0.19386586546897888, "rewards/rejected": 0.1261286437511444, "step": 2620 }, { "dpo_losses": 0.6430662870407104, "epoch": 0.69, "grad_norm": 37.12075976679441, "learning_rate": 1.3400596746385817e-06, "logits/chosen": -2.6104116439819336, "logits/rejected": -2.6521754264831543, "logps/chosen": -261.8434143066406, "logps/rejected": -260.4698791503906, "loss": 0.7344, "positive_losses": 1.191888451576233, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.20310619473457336, "rewards/margins": 0.11507274210453033, "rewards/margins_max": 0.33129122853279114, "rewards/margins_min": -0.11089511960744858, "rewards/margins_std": 0.19794295728206635, "rewards/rejected": 0.08803346008062363, "step": 2630 }, { "dpo_losses": 0.6499592661857605, "epoch": 0.69, "grad_norm": 7.908026333615997, "learning_rate": 1.3198715261929587e-06, "logits/chosen": -2.7350478172302246, "logits/rejected": -2.721505641937256, "logps/chosen": -295.0231628417969, "logps/rejected": -249.0230255126953, "loss": 0.7168, "positive_losses": 0.8153203725814819, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.21393021941184998, "rewards/margins": 0.10020557790994644, "rewards/margins_max": 0.34818652272224426, "rewards/margins_min": -0.10275671631097794, "rewards/margins_std": 0.2075171023607254, "rewards/rejected": 0.11372464895248413, "step": 2640 }, { "dpo_losses": 0.6230269074440002, "epoch": 0.69, "grad_norm": 1.8490655858986154, "learning_rate": 1.2997819181352823e-06, "logits/chosen": -2.667567253112793, "logits/rejected": -2.6267752647399902, "logps/chosen": -239.09170532226562, "logps/rejected": -242.45584106445312, "loss": 0.6465, "positive_losses": 0.1859421283006668, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.23573580384254456, "rewards/margins": 0.159610778093338, "rewards/margins_max": 0.3993149399757385, "rewards/margins_min": -0.028125818818807602, "rewards/margins_std": 0.1948395073413849, "rewards/rejected": 0.07612505555152893, "step": 2650 }, { "dpo_losses": 0.6473650932312012, "epoch": 0.7, "grad_norm": 2.2647962435595193, "learning_rate": 1.2797925279420454e-06, "logits/chosen": -2.732039451599121, "logits/rejected": -2.704516887664795, "logps/chosen": -291.14703369140625, "logps/rejected": -232.1072235107422, "loss": 0.6475, "positive_losses": 0.025724029168486595, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24492976069450378, "rewards/margins": 0.10577981173992157, "rewards/margins_max": 0.3053321838378906, "rewards/margins_min": -0.08668453246355057, "rewards/margins_std": 0.1828915774822235, "rewards/rejected": 0.13914994895458221, "step": 2660 }, { "dpo_losses": 0.6515626907348633, "epoch": 0.7, "grad_norm": 8.190568887517635, "learning_rate": 1.2599050247215764e-06, "logits/chosen": -2.765254020690918, "logits/rejected": -2.6876344680786133, "logps/chosen": -254.70364379882812, "logps/rejected": -222.0720672607422, "loss": 0.6889, "positive_losses": 0.5075147747993469, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20974647998809814, "rewards/margins": 0.0958796963095665, "rewards/margins_max": 0.30540916323661804, "rewards/margins_min": -0.08624430000782013, "rewards/margins_std": 0.17755314707756042, "rewards/rejected": 0.11386678367853165, "step": 2670 }, { "dpo_losses": 0.6387694478034973, "epoch": 0.7, "grad_norm": 10.467895108517743, "learning_rate": 1.2401210690746705e-06, "logits/chosen": -2.7085320949554443, "logits/rejected": -2.7124011516571045, "logps/chosen": -298.4921875, "logps/rejected": -335.75726318359375, "loss": 0.7204, "positive_losses": 1.109521508216858, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2240915596485138, "rewards/margins": 0.13182318210601807, "rewards/margins_max": 0.4292375147342682, "rewards/margins_min": -0.1588866263628006, "rewards/margins_std": 0.25979083776474, "rewards/rejected": 0.09226836264133453, "step": 2680 }, { "dpo_losses": 0.6363158226013184, "epoch": 0.7, "grad_norm": 2.255248517977151, "learning_rate": 1.2204423129559306e-06, "logits/chosen": -2.579045534133911, "logits/rejected": -2.5741171836853027, "logps/chosen": -197.7860565185547, "logps/rejected": -222.0952911376953, "loss": 0.6736, "positive_losses": 0.1473967581987381, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21976570785045624, "rewards/margins": 0.12878528237342834, "rewards/margins_max": 0.34714943170547485, "rewards/margins_min": -0.07536034286022186, "rewards/margins_std": 0.19248996675014496, "rewards/rejected": 0.0909804105758667, "step": 2690 }, { "dpo_losses": 0.6511543989181519, "epoch": 0.71, "grad_norm": 16.728722566036126, "learning_rate": 1.20087039953583e-06, "logits/chosen": -2.6526553630828857, "logits/rejected": -2.609686851501465, "logps/chosen": -238.56045532226562, "logps/rejected": -214.05307006835938, "loss": 0.667, "positive_losses": 0.11517200618982315, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.23147904872894287, "rewards/margins": 0.09473319351673126, "rewards/margins_max": 0.2892138361930847, "rewards/margins_min": -0.11468710750341415, "rewards/margins_std": 0.17651180922985077, "rewards/rejected": 0.13674584031105042, "step": 2700 }, { "epoch": 0.71, "eval_dpo_losses": 0.6441443562507629, "eval_logits/chosen": -2.6651101112365723, "eval_logits/rejected": -2.63289213180542, "eval_logps/chosen": -260.949951171875, "eval_logps/rejected": -246.05718994140625, "eval_loss": 0.6720408797264099, "eval_positive_losses": 0.18110305070877075, "eval_rewards/accuracies": 0.7129999995231628, "eval_rewards/chosen": 0.2364349663257599, "eval_rewards/margins": 0.1112174391746521, "eval_rewards/margins_max": 0.425150603055954, "eval_rewards/margins_min": -0.15084302425384521, "eval_rewards/margins_std": 0.19235800206661224, "eval_rewards/rejected": 0.1252175271511078, "eval_runtime": 428.5175, "eval_samples_per_second": 4.667, "eval_steps_per_second": 0.292, "step": 2700 }, { "dpo_losses": 0.6572734117507935, "epoch": 0.71, "grad_norm": 11.484244790928516, "learning_rate": 1.181406963063507e-06, "logits/chosen": -2.7521791458129883, "logits/rejected": -2.7197654247283936, "logps/chosen": -265.80706787109375, "logps/rejected": -277.2874450683594, "loss": 0.7076, "positive_losses": 0.5719951391220093, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.22318899631500244, "rewards/margins": 0.08136919885873795, "rewards/margins_max": 0.27184703946113586, "rewards/margins_min": -0.0953093096613884, "rewards/margins_std": 0.16653461754322052, "rewards/rejected": 0.1418198198080063, "step": 2710 }, { "dpo_losses": 0.6453554630279541, "epoch": 0.71, "grad_norm": 2.6880729419427656, "learning_rate": 1.1620536287303052e-06, "logits/chosen": -2.6446657180786133, "logits/rejected": -2.588442087173462, "logps/chosen": -228.0830078125, "logps/rejected": -238.17141723632812, "loss": 0.6691, "positive_losses": 0.015170765109360218, "rewards/accuracies": 0.8125, "rewards/chosen": 0.24276229739189148, "rewards/margins": 0.10786803811788559, "rewards/margins_max": 0.29676342010498047, "rewards/margins_min": -0.08386511355638504, "rewards/margins_std": 0.17123156785964966, "rewards/rejected": 0.13489428162574768, "step": 2720 }, { "dpo_losses": 0.6461043357849121, "epoch": 0.71, "grad_norm": 16.544314898742407, "learning_rate": 1.1428120125340717e-06, "logits/chosen": -2.671823501586914, "logits/rejected": -2.669869899749756, "logps/chosen": -254.216064453125, "logps/rejected": -246.4410858154297, "loss": 0.6622, "positive_losses": 0.0366668701171875, "rewards/accuracies": 0.6875, "rewards/chosen": 0.27612707018852234, "rewards/margins": 0.11478084325790405, "rewards/margins_max": 0.4099472165107727, "rewards/margins_min": -0.11208458989858627, "rewards/margins_std": 0.23278048634529114, "rewards/rejected": 0.16134625673294067, "step": 2730 }, { "dpo_losses": 0.6565698385238647, "epoch": 0.72, "grad_norm": 1.8381845561486416, "learning_rate": 1.123683721144223e-06, "logits/chosen": -2.6716742515563965, "logits/rejected": -2.692115545272827, "logps/chosen": -224.25033569335938, "logps/rejected": -240.08926391601562, "loss": 0.7195, "positive_losses": 0.2249153107404709, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.22329013049602509, "rewards/margins": 0.0855129212141037, "rewards/margins_max": 0.319058358669281, "rewards/margins_min": -0.09195182472467422, "rewards/margins_std": 0.18861567974090576, "rewards/rejected": 0.1377771943807602, "step": 2740 }, { "dpo_losses": 0.6327452063560486, "epoch": 0.72, "grad_norm": 12.747731324854005, "learning_rate": 1.1046703517675848e-06, "logits/chosen": -2.7317569255828857, "logits/rejected": -2.698331356048584, "logps/chosen": -258.23480224609375, "logps/rejected": -234.5063018798828, "loss": 0.6745, "positive_losses": 0.1679396629333496, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2581270933151245, "rewards/margins": 0.13834264874458313, "rewards/margins_max": 0.35900428891181946, "rewards/margins_min": -0.04750121384859085, "rewards/margins_std": 0.1870662271976471, "rewards/rejected": 0.11978445202112198, "step": 2750 }, { "dpo_losses": 0.6246061325073242, "epoch": 0.72, "grad_norm": 10.10358339703976, "learning_rate": 1.085773492015028e-06, "logits/chosen": -2.6803040504455566, "logits/rejected": -2.6235201358795166, "logps/chosen": -304.90557861328125, "logps/rejected": -279.1253356933594, "loss": 0.642, "positive_losses": 0.04808397218585014, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.272233784198761, "rewards/margins": 0.15535393357276917, "rewards/margins_max": 0.4002731740474701, "rewards/margins_min": -0.05857878923416138, "rewards/margins_std": 0.20337903499603271, "rewards/rejected": 0.11687986552715302, "step": 2760 }, { "dpo_losses": 0.6474908590316772, "epoch": 0.72, "grad_norm": 4.702800143862722, "learning_rate": 1.0669947197689034e-06, "logits/chosen": -2.7744927406311035, "logits/rejected": -2.7323365211486816, "logps/chosen": -283.83074951171875, "logps/rejected": -252.76382446289062, "loss": 0.675, "positive_losses": 0.05157585069537163, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23826198279857635, "rewards/margins": 0.10503115504980087, "rewards/margins_max": 0.34306079149246216, "rewards/margins_min": -0.10593054443597794, "rewards/margins_std": 0.1956198662519455, "rewards/rejected": 0.13323083519935608, "step": 2770 }, { "dpo_losses": 0.6510823965072632, "epoch": 0.73, "grad_norm": 9.12463127331858, "learning_rate": 1.048335603051291e-06, "logits/chosen": -2.683255434036255, "logits/rejected": -2.6377549171447754, "logps/chosen": -298.9820556640625, "logps/rejected": -228.7309112548828, "loss": 0.6745, "positive_losses": 0.32691827416419983, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.21623727679252625, "rewards/margins": 0.0958595797419548, "rewards/margins_max": 0.31551045179367065, "rewards/margins_min": -0.10179316997528076, "rewards/margins_std": 0.18623696267604828, "rewards/rejected": 0.12037769705057144, "step": 2780 }, { "dpo_losses": 0.6339425444602966, "epoch": 0.73, "grad_norm": 1.5157367625765046, "learning_rate": 1.0297976998930665e-06, "logits/chosen": -2.7739503383636475, "logits/rejected": -2.743335247039795, "logps/chosen": -301.68414306640625, "logps/rejected": -250.2188720703125, "loss": 0.653, "positive_losses": 0.09758367389440536, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2672674357891083, "rewards/margins": 0.13677000999450684, "rewards/margins_max": 0.38596731424331665, "rewards/margins_min": -0.07417190074920654, "rewards/margins_std": 0.20014214515686035, "rewards/rejected": 0.13049742579460144, "step": 2790 }, { "dpo_losses": 0.6524044275283813, "epoch": 0.73, "grad_norm": 9.800673434415943, "learning_rate": 1.0113825582038078e-06, "logits/chosen": -2.7075295448303223, "logits/rejected": -2.6866707801818848, "logps/chosen": -244.5875701904297, "logps/rejected": -244.8672637939453, "loss": 0.689, "positive_losses": 0.4609023928642273, "rewards/accuracies": 0.6875, "rewards/chosen": 0.20752203464508057, "rewards/margins": 0.09320969134569168, "rewards/margins_max": 0.2969111204147339, "rewards/margins_min": -0.11049803346395493, "rewards/margins_std": 0.18351614475250244, "rewards/rejected": 0.11431236565113068, "step": 2800 }, { "epoch": 0.73, "eval_dpo_losses": 0.6423235535621643, "eval_logits/chosen": -2.669100046157837, "eval_logits/rejected": -2.636967182159424, "eval_logps/chosen": -261.0170593261719, "eval_logps/rejected": -246.5806121826172, "eval_loss": 0.6738886833190918, "eval_positive_losses": 0.20811626315116882, "eval_rewards/accuracies": 0.7080000042915344, "eval_rewards/chosen": 0.23576387763023376, "eval_rewards/margins": 0.11578075587749481, "eval_rewards/margins_max": 0.4364437162876129, "eval_rewards/margins_min": -0.15528635680675507, "eval_rewards/margins_std": 0.19841812551021576, "eval_rewards/rejected": 0.11998309940099716, "eval_runtime": 428.7708, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 2800 }, { "dpo_losses": 0.6166631579399109, "epoch": 0.74, "grad_norm": 2.2194723108690946, "learning_rate": 9.930917156425477e-07, "logits/chosen": -2.6861419677734375, "logits/rejected": -2.6787948608398438, "logps/chosen": -244.9552764892578, "logps/rejected": -220.2777557373047, "loss": 0.6685, "positive_losses": 0.17078809440135956, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2651708722114563, "rewards/margins": 0.1798083782196045, "rewards/margins_max": 0.41993579268455505, "rewards/margins_min": -0.0417616032063961, "rewards/margins_std": 0.20407752692699432, "rewards/rejected": 0.08536247909069061, "step": 2810 }, { "dpo_losses": 0.6557396650314331, "epoch": 0.74, "grad_norm": 2.4051517110387843, "learning_rate": 9.749266994893756e-07, "logits/chosen": -2.674410820007324, "logits/rejected": -2.6722211837768555, "logps/chosen": -241.9083709716797, "logps/rejected": -248.44680786132812, "loss": 0.6602, "positive_losses": 0.2690662443637848, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2257826030254364, "rewards/margins": 0.08534611016511917, "rewards/margins_max": 0.2758987545967102, "rewards/margins_min": -0.10468705743551254, "rewards/margins_std": 0.1666567027568817, "rewards/rejected": 0.14043651521205902, "step": 2820 }, { "dpo_losses": 0.6404080986976624, "epoch": 0.74, "grad_norm": 2.0355323839303203, "learning_rate": 9.56889026517913e-07, "logits/chosen": -2.6610190868377686, "logits/rejected": -2.6477789878845215, "logps/chosen": -276.48504638671875, "logps/rejected": -363.28704833984375, "loss": 0.6464, "positive_losses": 0.06721305847167969, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23350541293621063, "rewards/margins": 0.11772487312555313, "rewards/margins_max": 0.33200541138648987, "rewards/margins_min": -0.07340795546770096, "rewards/margins_std": 0.17831745743751526, "rewards/rejected": 0.11578056961297989, "step": 2830 }, { "dpo_losses": 0.6309880018234253, "epoch": 0.74, "grad_norm": 1.8766274700622423, "learning_rate": 9.389802028686617e-07, "logits/chosen": -2.648667573928833, "logits/rejected": -2.616184949874878, "logps/chosen": -259.9830627441406, "logps/rejected": -231.4760284423828, "loss": 0.6637, "positive_losses": 0.0, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23486320674419403, "rewards/margins": 0.1399315893650055, "rewards/margins_max": 0.3641683757305145, "rewards/margins_min": -0.05851022154092789, "rewards/margins_std": 0.18994159996509552, "rewards/rejected": 0.09493163973093033, "step": 2840 }, { "dpo_losses": 0.6386110186576843, "epoch": 0.75, "grad_norm": 2.6656721068465044, "learning_rate": 9.212017239232427e-07, "logits/chosen": -2.721648693084717, "logits/rejected": -2.7082343101501465, "logps/chosen": -261.38629150390625, "logps/rejected": -249.54165649414062, "loss": 0.6969, "positive_losses": 0.3316573202610016, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22004885971546173, "rewards/margins": 0.12660430371761322, "rewards/margins_max": 0.3868912160396576, "rewards/margins_min": -0.1485016644001007, "rewards/margins_std": 0.23475301265716553, "rewards/rejected": 0.09344454109668732, "step": 2850 }, { "dpo_losses": 0.637594997882843, "epoch": 0.75, "grad_norm": 1.9614135650641793, "learning_rate": 9.03555074179533e-07, "logits/chosen": -2.699233055114746, "logits/rejected": -2.6669087409973145, "logps/chosen": -227.62094116210938, "logps/rejected": -217.5689697265625, "loss": 0.6443, "positive_losses": 0.12064529955387115, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.22545281052589417, "rewards/margins": 0.12351174652576447, "rewards/margins_max": 0.3114795386791229, "rewards/margins_min": -0.07401446998119354, "rewards/margins_std": 0.17692972719669342, "rewards/rejected": 0.1019410490989685, "step": 2860 }, { "dpo_losses": 0.6284499764442444, "epoch": 0.75, "grad_norm": 13.736236341981504, "learning_rate": 8.860417271277067e-07, "logits/chosen": -2.652283191680908, "logits/rejected": -2.6217527389526367, "logps/chosen": -287.107421875, "logps/rejected": -272.94329833984375, "loss": 0.6551, "positive_losses": 0.1835586577653885, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.24038609862327576, "rewards/margins": 0.14933030307292938, "rewards/margins_max": 0.3877665400505066, "rewards/margins_min": -0.09269267320632935, "rewards/margins_std": 0.2160751074552536, "rewards/rejected": 0.09105581045150757, "step": 2870 }, { "dpo_losses": 0.6409178376197815, "epoch": 0.75, "grad_norm": 11.681985298637304, "learning_rate": 8.686631451272029e-07, "logits/chosen": -2.6817407608032227, "logits/rejected": -2.604240894317627, "logps/chosen": -277.89312744140625, "logps/rejected": -249.20162963867188, "loss": 0.6897, "positive_losses": 0.4857429563999176, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21931703388690948, "rewards/margins": 0.12401922792196274, "rewards/margins_max": 0.35402682423591614, "rewards/margins_min": -0.12312940508127213, "rewards/margins_std": 0.2164493352174759, "rewards/rejected": 0.09529776871204376, "step": 2880 }, { "dpo_losses": 0.6496797800064087, "epoch": 0.76, "grad_norm": 1.9715463065103491, "learning_rate": 8.514207792846168e-07, "logits/chosen": -2.6438345909118652, "logits/rejected": -2.590409755706787, "logps/chosen": -269.50665283203125, "logps/rejected": -239.47177124023438, "loss": 0.6592, "positive_losses": 0.19409985840320587, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2361488789319992, "rewards/margins": 0.10511632263660431, "rewards/margins_max": 0.3536381423473358, "rewards/margins_min": -0.1464562565088272, "rewards/margins_std": 0.2224140465259552, "rewards/rejected": 0.1310325711965561, "step": 2890 }, { "dpo_losses": 0.6307598352432251, "epoch": 0.76, "grad_norm": 14.209094850300765, "learning_rate": 8.343160693325356e-07, "logits/chosen": -2.6957051753997803, "logits/rejected": -2.6865906715393066, "logps/chosen": -279.3338317871094, "logps/rejected": -236.14297485351562, "loss": 0.6882, "positive_losses": 0.5050802230834961, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23673968017101288, "rewards/margins": 0.15022286772727966, "rewards/margins_max": 0.4690360128879547, "rewards/margins_min": -0.08646819740533829, "rewards/margins_std": 0.24568140506744385, "rewards/rejected": 0.08651680499315262, "step": 2900 }, { "epoch": 0.76, "eval_dpo_losses": 0.636917233467102, "eval_logits/chosen": -2.6701102256774902, "eval_logits/rejected": -2.6382365226745605, "eval_logps/chosen": -262.1392517089844, "eval_logps/rejected": -249.01136779785156, "eval_loss": 0.6874231696128845, "eval_positive_losses": 0.3545902371406555, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": 0.22454147040843964, "eval_rewards/margins": 0.12886586785316467, "eval_rewards/margins_max": 0.4703871011734009, "eval_rewards/margins_min": -0.16210775077342987, "eval_rewards/margins_std": 0.21217867732048035, "eval_rewards/rejected": 0.09567559510469437, "eval_runtime": 428.7383, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 2900 }, { "dpo_losses": 0.6331465840339661, "epoch": 0.76, "grad_norm": 9.449500531918837, "learning_rate": 8.173504435093174e-07, "logits/chosen": -2.6724600791931152, "logits/rejected": -2.628342628479004, "logps/chosen": -235.9564666748047, "logps/rejected": -237.97354125976562, "loss": 0.6751, "positive_losses": 0.3460685610771179, "rewards/accuracies": 0.75, "rewards/chosen": 0.22341866791248322, "rewards/margins": 0.13468225300312042, "rewards/margins_max": 0.3398105204105377, "rewards/margins_min": -0.058129072189331055, "rewards/margins_std": 0.18074364960193634, "rewards/rejected": 0.0887364000082016, "step": 2910 }, { "dpo_losses": 0.6350184082984924, "epoch": 0.76, "grad_norm": 22.87685699429666, "learning_rate": 8.00525318439836e-07, "logits/chosen": -2.6834559440612793, "logits/rejected": -2.6547510623931885, "logps/chosen": -234.9221649169922, "logps/rejected": -204.93490600585938, "loss": 0.6889, "positive_losses": 1.0345876216888428, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23099103569984436, "rewards/margins": 0.13446852564811707, "rewards/margins_max": 0.3792273700237274, "rewards/margins_min": -0.09443271905183792, "rewards/margins_std": 0.20637774467468262, "rewards/rejected": 0.09652251750230789, "step": 2920 }, { "dpo_losses": 0.6325119733810425, "epoch": 0.77, "grad_norm": 19.525373334648993, "learning_rate": 7.838420990171927e-07, "logits/chosen": -2.5970053672790527, "logits/rejected": -2.6252169609069824, "logps/chosen": -259.844482421875, "logps/rejected": -229.298095703125, "loss": 0.718, "positive_losses": 0.6484737396240234, "rewards/accuracies": 0.75, "rewards/chosen": 0.24847297370433807, "rewards/margins": 0.14355938136577606, "rewards/margins_max": 0.40601831674575806, "rewards/margins_min": -0.060882069170475006, "rewards/margins_std": 0.21191605925559998, "rewards/rejected": 0.10491357743740082, "step": 2930 }, { "dpo_losses": 0.6477078199386597, "epoch": 0.77, "grad_norm": 1.7704771094966083, "learning_rate": 7.673021782854084e-07, "logits/chosen": -2.6433920860290527, "logits/rejected": -2.653783082962036, "logps/chosen": -238.2683868408203, "logps/rejected": -237.9255828857422, "loss": 0.7021, "positive_losses": 0.6647624969482422, "rewards/accuracies": 0.6875, "rewards/chosen": 0.24048829078674316, "rewards/margins": 0.10512743145227432, "rewards/margins_max": 0.34509819746017456, "rewards/margins_min": -0.11889245361089706, "rewards/margins_std": 0.2046825885772705, "rewards/rejected": 0.13536086678504944, "step": 2940 }, { "dpo_losses": 0.6352402567863464, "epoch": 0.77, "grad_norm": 12.84443072443328, "learning_rate": 7.509069373231039e-07, "logits/chosen": -2.730562210083008, "logits/rejected": -2.6819634437561035, "logps/chosen": -249.3621368408203, "logps/rejected": -271.73895263671875, "loss": 0.6927, "positive_losses": 1.0396728515625, "rewards/accuracies": 0.75, "rewards/chosen": 0.2244911938905716, "rewards/margins": 0.13533872365951538, "rewards/margins_max": 0.40829578042030334, "rewards/margins_min": -0.0888959988951683, "rewards/margins_std": 0.21971866488456726, "rewards/rejected": 0.08915245532989502, "step": 2950 }, { "dpo_losses": 0.6449676156044006, "epoch": 0.77, "grad_norm": 1.8810142698907721, "learning_rate": 7.346577451281822e-07, "logits/chosen": -2.678873062133789, "logits/rejected": -2.6712539196014404, "logps/chosen": -258.7846374511719, "logps/rejected": -227.93331909179688, "loss": 0.6706, "positive_losses": 0.7027614712715149, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.23514370620250702, "rewards/margins": 0.11439553648233414, "rewards/margins_max": 0.36421626806259155, "rewards/margins_min": -0.12222512811422348, "rewards/margins_std": 0.2136216163635254, "rewards/rejected": 0.12074816226959229, "step": 2960 }, { "dpo_losses": 0.6398079991340637, "epoch": 0.78, "grad_norm": 7.455034886975086, "learning_rate": 7.185559585035138e-07, "logits/chosen": -2.673152208328247, "logits/rejected": -2.712742567062378, "logps/chosen": -248.77304077148438, "logps/rejected": -242.112548828125, "loss": 0.6837, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24148698151111603, "rewards/margins": 0.11927878856658936, "rewards/margins_max": 0.31742143630981445, "rewards/margins_min": -0.05262741446495056, "rewards/margins_std": 0.16800831258296967, "rewards/rejected": 0.12220821529626846, "step": 2970 }, { "dpo_losses": 0.6365830898284912, "epoch": 0.78, "grad_norm": 2.035208146793626, "learning_rate": 7.026029219436504e-07, "logits/chosen": -2.6321146488189697, "logits/rejected": -2.6523303985595703, "logps/chosen": -279.9559631347656, "logps/rejected": -289.53558349609375, "loss": 0.6555, "positive_losses": 0.28735655546188354, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23514989018440247, "rewards/margins": 0.1361285001039505, "rewards/margins_max": 0.4135875105857849, "rewards/margins_min": -0.0913558229804039, "rewards/margins_std": 0.22293679416179657, "rewards/rejected": 0.09902138262987137, "step": 2980 }, { "dpo_losses": 0.6480584740638733, "epoch": 0.78, "grad_norm": 2.2782672560327266, "learning_rate": 6.867999675225523e-07, "logits/chosen": -2.6375298500061035, "logits/rejected": -2.5830719470977783, "logps/chosen": -280.14892578125, "logps/rejected": -272.3431091308594, "loss": 0.6824, "positive_losses": 0.3097159266471863, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23243474960327148, "rewards/margins": 0.11371596157550812, "rewards/margins_max": 0.3422084450721741, "rewards/margins_min": -0.16615024209022522, "rewards/margins_std": 0.23152053356170654, "rewards/rejected": 0.11871878057718277, "step": 2990 }, { "dpo_losses": 0.6527405977249146, "epoch": 0.79, "grad_norm": 2.4298409661892855, "learning_rate": 6.711484147823663e-07, "logits/chosen": -2.69260311126709, "logits/rejected": -2.67319917678833, "logps/chosen": -222.75436401367188, "logps/rejected": -241.01513671875, "loss": 0.6643, "positive_losses": 0.06721165031194687, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.20959754288196564, "rewards/margins": 0.09568390995264053, "rewards/margins_max": 0.3374756872653961, "rewards/margins_min": -0.08535850048065186, "rewards/margins_std": 0.1930762231349945, "rewards/rejected": 0.11391359567642212, "step": 3000 }, { "epoch": 0.79, "eval_dpo_losses": 0.6398637294769287, "eval_logits/chosen": -2.6685869693756104, "eval_logits/rejected": -2.6371400356292725, "eval_logps/chosen": -261.2201232910156, "eval_logps/rejected": -247.35943603515625, "eval_loss": 0.6773857474327087, "eval_positive_losses": 0.23619309067726135, "eval_rewards/accuracies": 0.7160000205039978, "eval_rewards/chosen": 0.23373311758041382, "eval_rewards/margins": 0.12153854221105576, "eval_rewards/margins_max": 0.449297696352005, "eval_rewards/margins_min": -0.1537606120109558, "eval_rewards/margins_std": 0.20275598764419556, "eval_rewards/rejected": 0.11219460517168045, "eval_runtime": 428.7984, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 3000 }, { "dpo_losses": 0.6306964755058289, "epoch": 0.79, "grad_norm": 2.052630304807198, "learning_rate": 6.556495706232413e-07, "logits/chosen": -2.734579563140869, "logits/rejected": -2.7351388931274414, "logps/chosen": -270.08966064453125, "logps/rejected": -248.0271453857422, "loss": 0.6622, "positive_losses": 0.3427295684814453, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.24841323494911194, "rewards/margins": 0.14205755293369293, "rewards/margins_max": 0.3683268427848816, "rewards/margins_min": -0.06113971024751663, "rewards/margins_std": 0.19455572962760925, "rewards/rejected": 0.1063556894659996, "step": 3010 }, { "dpo_losses": 0.6338121891021729, "epoch": 0.79, "grad_norm": 20.259890554571196, "learning_rate": 6.403047291942057e-07, "logits/chosen": -2.702024459838867, "logits/rejected": -2.6492066383361816, "logps/chosen": -241.46847534179688, "logps/rejected": -209.68701171875, "loss": 0.6902, "positive_losses": 0.352372944355011, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23364338278770447, "rewards/margins": 0.13513147830963135, "rewards/margins_max": 0.3587791919708252, "rewards/margins_min": -0.09530764818191528, "rewards/margins_std": 0.20471616089344025, "rewards/rejected": 0.09851191192865372, "step": 3020 }, { "dpo_losses": 0.6391339898109436, "epoch": 0.79, "grad_norm": 1.9289190329788246, "learning_rate": 6.251151717851023e-07, "logits/chosen": -2.6582083702087402, "logits/rejected": -2.6450939178466797, "logps/chosen": -253.9155731201172, "logps/rejected": -280.8744201660156, "loss": 0.6917, "positive_losses": 0.3619529604911804, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22244131565093994, "rewards/margins": 0.12662717700004578, "rewards/margins_max": 0.3937821686267853, "rewards/margins_min": -0.08315359055995941, "rewards/margins_std": 0.21088910102844238, "rewards/rejected": 0.09581412374973297, "step": 3030 }, { "dpo_losses": 0.6252659559249878, "epoch": 0.8, "grad_norm": 2.418152834489774, "learning_rate": 6.100821667196041e-07, "logits/chosen": -2.6874794960021973, "logits/rejected": -2.629295825958252, "logps/chosen": -286.81243896484375, "logps/rejected": -278.2958679199219, "loss": 0.6466, "positive_losses": 0.2067296952009201, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25894415378570557, "rewards/margins": 0.15300555527210236, "rewards/margins_max": 0.37397339940071106, "rewards/margins_min": -0.05536312982439995, "rewards/margins_std": 0.1934482306241989, "rewards/rejected": 0.1059386283159256, "step": 3040 }, { "dpo_losses": 0.6366590857505798, "epoch": 0.8, "grad_norm": 71.14259654362321, "learning_rate": 5.952069692493062e-07, "logits/chosen": -2.674673557281494, "logits/rejected": -2.6568939685821533, "logps/chosen": -247.89779663085938, "logps/rejected": -240.1614990234375, "loss": 0.6911, "positive_losses": 0.13297691941261292, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22278237342834473, "rewards/margins": 0.1301979124546051, "rewards/margins_max": 0.3397059142589569, "rewards/margins_min": -0.08934374153614044, "rewards/margins_std": 0.1898353546857834, "rewards/rejected": 0.09258445352315903, "step": 3050 }, { "dpo_losses": 0.6470397710800171, "epoch": 0.8, "grad_norm": 15.52598706730308, "learning_rate": 5.80490821448918e-07, "logits/chosen": -2.691991090774536, "logits/rejected": -2.6771504878997803, "logps/chosen": -283.14068603515625, "logps/rejected": -268.2895202636719, "loss": 0.6563, "positive_losses": 0.2283284217119217, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.24039585888385773, "rewards/margins": 0.1061919704079628, "rewards/margins_max": 0.33132392168045044, "rewards/margins_min": -0.1168690174818039, "rewards/margins_std": 0.2001001089811325, "rewards/rejected": 0.13420389592647552, "step": 3060 }, { "dpo_losses": 0.6548877954483032, "epoch": 0.8, "grad_norm": 10.717360870832172, "learning_rate": 5.659349521125459e-07, "logits/chosen": -2.6247153282165527, "logits/rejected": -2.6121203899383545, "logps/chosen": -224.9630126953125, "logps/rejected": -248.85867309570312, "loss": 0.6744, "positive_losses": 0.5918839573860168, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.20065799355506897, "rewards/margins": 0.08835332840681076, "rewards/margins_max": 0.3009774684906006, "rewards/margins_min": -0.09773066639900208, "rewards/margins_std": 0.1786937266588211, "rewards/rejected": 0.11230464279651642, "step": 3070 }, { "dpo_losses": 0.6685213446617126, "epoch": 0.81, "grad_norm": 2.037438692446002, "learning_rate": 5.5154057665109e-07, "logits/chosen": -2.645376682281494, "logits/rejected": -2.65083909034729, "logps/chosen": -230.00491333007812, "logps/rejected": -269.43231201171875, "loss": 0.6797, "positive_losses": 0.6703279614448547, "rewards/accuracies": 0.5874999761581421, "rewards/chosen": 0.20277495682239532, "rewards/margins": 0.06283371150493622, "rewards/margins_max": 0.28540587425231934, "rewards/margins_min": -0.15911726653575897, "rewards/margins_std": 0.19872137904167175, "rewards/rejected": 0.1399412453174591, "step": 3080 }, { "dpo_losses": 0.6519734859466553, "epoch": 0.81, "grad_norm": 6.232355770763503, "learning_rate": 5.373088969907586e-07, "logits/chosen": -2.6775002479553223, "logits/rejected": -2.6286909580230713, "logps/chosen": -255.8141632080078, "logps/rejected": -230.0997772216797, "loss": 0.7097, "positive_losses": 0.9905219078063965, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.20397773385047913, "rewards/margins": 0.09516613930463791, "rewards/margins_max": 0.3103027641773224, "rewards/margins_min": -0.08702187240123749, "rewards/margins_std": 0.180120050907135, "rewards/rejected": 0.10881157964468002, "step": 3090 }, { "dpo_losses": 0.668020486831665, "epoch": 0.81, "grad_norm": 13.349306330000353, "learning_rate": 5.23241101472709e-07, "logits/chosen": -2.692614793777466, "logits/rejected": -2.685544013977051, "logps/chosen": -262.1580810546875, "logps/rejected": -271.2661437988281, "loss": 0.6877, "positive_losses": 0.2822890281677246, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.23586949706077576, "rewards/margins": 0.06439534574747086, "rewards/margins_max": 0.29085201025009155, "rewards/margins_min": -0.15065248310565948, "rewards/margins_std": 0.1968853771686554, "rewards/rejected": 0.1714741438627243, "step": 3100 }, { "epoch": 0.81, "eval_dpo_losses": 0.641436755657196, "eval_logits/chosen": -2.6650614738464355, "eval_logits/rejected": -2.6329939365386963, "eval_logps/chosen": -260.8720397949219, "eval_logps/rejected": -246.6224365234375, "eval_loss": 0.6719857454299927, "eval_positive_losses": 0.18758077919483185, "eval_rewards/accuracies": 0.7120000123977661, "eval_rewards/chosen": 0.23721392452716827, "eval_rewards/margins": 0.11764894425868988, "eval_rewards/margins_max": 0.4372667968273163, "eval_rewards/margins_min": -0.1502349078655243, "eval_rewards/margins_std": 0.19791251420974731, "eval_rewards/rejected": 0.11956498771905899, "eval_runtime": 428.8563, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.291, "step": 3100 }, { "dpo_losses": 0.6547427177429199, "epoch": 0.81, "grad_norm": 2.2292396913730514, "learning_rate": 5.09338364753818e-07, "logits/chosen": -2.718360662460327, "logits/rejected": -2.650073766708374, "logps/chosen": -266.08343505859375, "logps/rejected": -252.248291015625, "loss": 0.6584, "positive_losses": 0.1940586119890213, "rewards/accuracies": 0.6875, "rewards/chosen": 0.22189071774482727, "rewards/margins": 0.09172861278057098, "rewards/margins_max": 0.341278612613678, "rewards/margins_min": -0.1299332082271576, "rewards/margins_std": 0.20829057693481445, "rewards/rejected": 0.1301620900630951, "step": 3110 }, { "dpo_losses": 0.6420370936393738, "epoch": 0.82, "grad_norm": 1.9294826383380466, "learning_rate": 4.956018477086005e-07, "logits/chosen": -2.63110613822937, "logits/rejected": -2.600900888442993, "logps/chosen": -271.1282043457031, "logps/rejected": -247.82284545898438, "loss": 0.6596, "positive_losses": 0.41998806595802307, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.23537835478782654, "rewards/margins": 0.11625852435827255, "rewards/margins_max": 0.34567388892173767, "rewards/margins_min": -0.101532481610775, "rewards/margins_std": 0.1985291987657547, "rewards/rejected": 0.11911983788013458, "step": 3120 }, { "dpo_losses": 0.6431052684783936, "epoch": 0.82, "grad_norm": 1.909705485260958, "learning_rate": 4.820326973322764e-07, "logits/chosen": -2.719957113265991, "logits/rejected": -2.6817431449890137, "logps/chosen": -241.52163696289062, "logps/rejected": -210.27688598632812, "loss": 0.646, "positive_losses": 0.17644834518432617, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22382204234600067, "rewards/margins": 0.11636801064014435, "rewards/margins_max": 0.36100926995277405, "rewards/margins_min": -0.10877721011638641, "rewards/margins_std": 0.21270017325878143, "rewards/rejected": 0.10745406150817871, "step": 3130 }, { "dpo_losses": 0.6377115845680237, "epoch": 0.82, "grad_norm": 7.663100560411211, "learning_rate": 4.686320466449981e-07, "logits/chosen": -2.6265625953674316, "logits/rejected": -2.6530814170837402, "logps/chosen": -215.73171997070312, "logps/rejected": -240.04525756835938, "loss": 0.6885, "positive_losses": 0.2875320315361023, "rewards/accuracies": 0.75, "rewards/chosen": 0.2346370667219162, "rewards/margins": 0.12181933969259262, "rewards/margins_max": 0.28717726469039917, "rewards/margins_min": -0.040038738399744034, "rewards/margins_std": 0.148724764585495, "rewards/rejected": 0.11281770467758179, "step": 3140 }, { "dpo_losses": 0.6270794868469238, "epoch": 0.82, "grad_norm": 2.220914832284377, "learning_rate": 4.554010145972418e-07, "logits/chosen": -2.5903825759887695, "logits/rejected": -2.5763096809387207, "logps/chosen": -277.8771667480469, "logps/rejected": -295.5439758300781, "loss": 0.648, "positive_losses": 0.12219295650720596, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.254260778427124, "rewards/margins": 0.1561306267976761, "rewards/margins_max": 0.4178551733493805, "rewards/margins_min": -0.06652072072029114, "rewards/margins_std": 0.21075372397899628, "rewards/rejected": 0.09813012927770615, "step": 3150 }, { "dpo_losses": 0.6211727857589722, "epoch": 0.83, "grad_norm": 2.471957766452978, "learning_rate": 4.4234070597637455e-07, "logits/chosen": -2.724518299102783, "logits/rejected": -2.6921210289001465, "logps/chosen": -278.99884033203125, "logps/rejected": -245.81069946289062, "loss": 0.6627, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.26552876830101013, "rewards/margins": 0.16143757104873657, "rewards/margins_max": 0.3843764662742615, "rewards/margins_min": -0.042003024369478226, "rewards/margins_std": 0.19313038885593414, "rewards/rejected": 0.10409118980169296, "step": 3160 }, { "dpo_losses": 0.6412732005119324, "epoch": 0.83, "grad_norm": 12.902440134555217, "learning_rate": 4.2945221131440783e-07, "logits/chosen": -2.705014705657959, "logits/rejected": -2.6872849464416504, "logps/chosen": -272.9225158691406, "logps/rejected": -254.9892120361328, "loss": 0.6618, "positive_losses": 0.40812358260154724, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.21921618282794952, "rewards/margins": 0.11757165193557739, "rewards/margins_max": 0.3251839876174927, "rewards/margins_min": -0.08082378655672073, "rewards/margins_std": 0.17831583321094513, "rewards/rejected": 0.10164451599121094, "step": 3170 }, { "dpo_losses": 0.6384933590888977, "epoch": 0.83, "grad_norm": 14.754409389587469, "learning_rate": 4.167366067969381e-07, "logits/chosen": -2.636754035949707, "logits/rejected": -2.616779088973999, "logps/chosen": -268.90966796875, "logps/rejected": -220.1052703857422, "loss": 0.6983, "positive_losses": 0.3922765851020813, "rewards/accuracies": 0.7875000238418579, "rewards/chosen": 0.2557048797607422, "rewards/margins": 0.12198550999164581, "rewards/margins_max": 0.3047846257686615, "rewards/margins_min": -0.09232550114393234, "rewards/margins_std": 0.17760710418224335, "rewards/rejected": 0.13371935486793518, "step": 3180 }, { "dpo_losses": 0.6359082460403442, "epoch": 0.83, "grad_norm": 1.9901359017131461, "learning_rate": 4.041949541732826e-07, "logits/chosen": -2.6384222507476807, "logits/rejected": -2.602496862411499, "logps/chosen": -258.841064453125, "logps/rejected": -236.75448608398438, "loss": 0.6607, "positive_losses": 0.26552096009254456, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23902972042560577, "rewards/margins": 0.12980027496814728, "rewards/margins_max": 0.3550037741661072, "rewards/margins_min": -0.1000526174902916, "rewards/margins_std": 0.2099500447511673, "rewards/rejected": 0.1092294305562973, "step": 3190 }, { "dpo_losses": 0.6393564343452454, "epoch": 0.84, "grad_norm": 1.7672084358519964, "learning_rate": 3.9182830066782614e-07, "logits/chosen": -2.636000156402588, "logits/rejected": -2.613868474960327, "logps/chosen": -248.4848175048828, "logps/rejected": -240.05886840820312, "loss": 0.6513, "positive_losses": 0.18637581169605255, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22722773253917694, "rewards/margins": 0.12229988723993301, "rewards/margins_max": 0.36984676122665405, "rewards/margins_min": -0.09386143088340759, "rewards/margins_std": 0.20374366641044617, "rewards/rejected": 0.10492783784866333, "step": 3200 }, { "epoch": 0.84, "eval_dpo_losses": 0.638157308101654, "eval_logits/chosen": -2.663109064102173, "eval_logits/rejected": -2.6310267448425293, "eval_logps/chosen": -261.3907470703125, "eval_logps/rejected": -247.9314727783203, "eval_loss": 0.6780828833580017, "eval_positive_losses": 0.2525976002216339, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": 0.23202690482139587, "eval_rewards/margins": 0.12555211782455444, "eval_rewards/margins_max": 0.4573511779308319, "eval_rewards/margins_min": -0.1548658013343811, "eval_rewards/margins_std": 0.2060714066028595, "eval_rewards/rejected": 0.10647477209568024, "eval_runtime": 428.6061, "eval_samples_per_second": 4.666, "eval_steps_per_second": 0.292, "step": 3200 }, { "dpo_losses": 0.6356785893440247, "epoch": 0.84, "grad_norm": 2.100458585531086, "learning_rate": 3.796376788925771e-07, "logits/chosen": -2.6860642433166504, "logits/rejected": -2.666347026824951, "logps/chosen": -230.35360717773438, "logps/rejected": -223.0608367919922, "loss": 0.6731, "positive_losses": 0.2336856871843338, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2298533171415329, "rewards/margins": 0.13223880529403687, "rewards/margins_max": 0.3856995403766632, "rewards/margins_min": -0.08395050466060638, "rewards/margins_std": 0.21135418117046356, "rewards/rejected": 0.09761451184749603, "step": 3210 }, { "dpo_losses": 0.637304425239563, "epoch": 0.84, "grad_norm": 2.287149613024027, "learning_rate": 3.676241067609465e-07, "logits/chosen": -2.624802827835083, "logits/rejected": -2.61322021484375, "logps/chosen": -268.00994873046875, "logps/rejected": -264.14605712890625, "loss": 0.6847, "positive_losses": 0.34495869278907776, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.22617018222808838, "rewards/margins": 0.12562856078147888, "rewards/margins_max": 0.3497055470943451, "rewards/margins_min": -0.05278503894805908, "rewards/margins_std": 0.1777449995279312, "rewards/rejected": 0.1005416065454483, "step": 3220 }, { "dpo_losses": 0.6588651537895203, "epoch": 0.85, "grad_norm": 2.3462180811926627, "learning_rate": 3.5578858740274976e-07, "logits/chosen": -2.6383557319641113, "logits/rejected": -2.623720407485962, "logps/chosen": -294.2330322265625, "logps/rejected": -310.3564758300781, "loss": 0.7223, "positive_losses": 0.767656683921814, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.21732446551322937, "rewards/margins": 0.0910581424832344, "rewards/margins_max": 0.4103250503540039, "rewards/margins_min": -0.1528368890285492, "rewards/margins_std": 0.24970689415931702, "rewards/rejected": 0.12626633048057556, "step": 3230 }, { "dpo_losses": 0.6268896460533142, "epoch": 0.85, "grad_norm": 2.3769887327537815, "learning_rate": 3.44132109080447e-07, "logits/chosen": -2.7066702842712402, "logits/rejected": -2.660546064376831, "logps/chosen": -231.73080444335938, "logps/rejected": -249.92276000976562, "loss": 0.6778, "positive_losses": 0.19026947021484375, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.2194831669330597, "rewards/margins": 0.15953031182289124, "rewards/margins_max": 0.40868711471557617, "rewards/margins_min": -0.06091107800602913, "rewards/margins_std": 0.21540050208568573, "rewards/rejected": 0.05995287373661995, "step": 3240 }, { "dpo_losses": 0.65413498878479, "epoch": 0.85, "grad_norm": 12.254538804601198, "learning_rate": 3.3265564510662344e-07, "logits/chosen": -2.7262778282165527, "logits/rejected": -2.710273027420044, "logps/chosen": -271.7200622558594, "logps/rejected": -253.6293487548828, "loss": 0.6957, "positive_losses": 0.43155670166015625, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2163349837064743, "rewards/margins": 0.08986008912324905, "rewards/margins_max": 0.2767840027809143, "rewards/margins_min": -0.10123058408498764, "rewards/margins_std": 0.16752156615257263, "rewards/rejected": 0.12647488713264465, "step": 3250 }, { "dpo_losses": 0.6490314602851868, "epoch": 0.85, "grad_norm": 7.718883188928705, "learning_rate": 3.213601537627195e-07, "logits/chosen": -2.716717481613159, "logits/rejected": -2.7288060188293457, "logps/chosen": -244.274658203125, "logps/rejected": -284.3211364746094, "loss": 0.6627, "positive_losses": 0.1870754212141037, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.20126838982105255, "rewards/margins": 0.10119322687387466, "rewards/margins_max": 0.3067261576652527, "rewards/margins_min": -0.10198497772216797, "rewards/margins_std": 0.18057967722415924, "rewards/rejected": 0.1000751480460167, "step": 3260 }, { "dpo_losses": 0.6421646475791931, "epoch": 0.86, "grad_norm": 12.581409703097165, "learning_rate": 3.1024657821901063e-07, "logits/chosen": -2.694148302078247, "logits/rejected": -2.644860029220581, "logps/chosen": -236.10885620117188, "logps/rejected": -255.47769165039062, "loss": 0.6745, "positive_losses": 0.27171993255615234, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.2141837626695633, "rewards/margins": 0.11888556182384491, "rewards/margins_max": 0.3949454724788666, "rewards/margins_min": -0.13334409892559052, "rewards/margins_std": 0.23849141597747803, "rewards/rejected": 0.09529820829629898, "step": 3270 }, { "dpo_losses": 0.652772068977356, "epoch": 0.86, "grad_norm": 14.022586586301134, "learning_rate": 2.9931584645585654e-07, "logits/chosen": -2.7820725440979004, "logits/rejected": -2.6916279792785645, "logps/chosen": -235.06948852539062, "logps/rejected": -235.7620391845703, "loss": 0.6705, "positive_losses": 0.40381184220314026, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.22431237995624542, "rewards/margins": 0.09758533537387848, "rewards/margins_max": 0.35281312465667725, "rewards/margins_min": -0.13170775771141052, "rewards/margins_std": 0.21209947764873505, "rewards/rejected": 0.12672704458236694, "step": 3280 }, { "dpo_losses": 0.6143797039985657, "epoch": 0.86, "grad_norm": 2.1908171434796997, "learning_rate": 2.885688711862136e-07, "logits/chosen": -2.625082492828369, "logits/rejected": -2.6306636333465576, "logps/chosen": -269.0477294921875, "logps/rejected": -266.344970703125, "loss": 0.6651, "positive_losses": 0.40293607115745544, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.24958916008472443, "rewards/margins": 0.17949099838733673, "rewards/margins_max": 0.42584124207496643, "rewards/margins_min": -0.0823826938867569, "rewards/margins_std": 0.22891190648078918, "rewards/rejected": 0.0700981467962265, "step": 3290 }, { "dpo_losses": 0.6400938034057617, "epoch": 0.86, "grad_norm": 8.819274895374097, "learning_rate": 2.7800654977942486e-07, "logits/chosen": -2.6938107013702393, "logits/rejected": -2.6867642402648926, "logps/chosen": -310.67657470703125, "logps/rejected": -296.18853759765625, "loss": 0.6681, "positive_losses": 0.24154730141162872, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2511255145072937, "rewards/margins": 0.12273173034191132, "rewards/margins_max": 0.3658311069011688, "rewards/margins_min": -0.14342817664146423, "rewards/margins_std": 0.2238069772720337, "rewards/rejected": 0.12839379906654358, "step": 3300 }, { "epoch": 0.86, "eval_dpo_losses": 0.6388716101646423, "eval_logits/chosen": -2.6669511795043945, "eval_logits/rejected": -2.6348111629486084, "eval_logps/chosen": -261.18914794921875, "eval_logps/rejected": -247.5554962158203, "eval_loss": 0.6756910681724548, "eval_positive_losses": 0.23082859814167023, "eval_rewards/accuracies": 0.7170000076293945, "eval_rewards/chosen": 0.23404254019260406, "eval_rewards/margins": 0.12380820512771606, "eval_rewards/margins_max": 0.45279812812805176, "eval_rewards/margins_min": -0.15328605473041534, "eval_rewards/margins_std": 0.20409417152404785, "eval_rewards/rejected": 0.1102343201637268, "eval_runtime": 428.9258, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 3300 }, { "dpo_losses": 0.6238885521888733, "epoch": 0.87, "grad_norm": 14.668852961032188, "learning_rate": 2.6762976418628797e-07, "logits/chosen": -2.717496395111084, "logits/rejected": -2.6723551750183105, "logps/chosen": -251.7400665283203, "logps/rejected": -255.1749725341797, "loss": 0.6844, "positive_losses": 0.19081421196460724, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.227542445063591, "rewards/margins": 0.159857377409935, "rewards/margins_max": 0.44299978017807007, "rewards/margins_min": -0.0689382404088974, "rewards/margins_std": 0.23011159896850586, "rewards/rejected": 0.06768506020307541, "step": 3310 }, { "dpo_losses": 0.6380771398544312, "epoch": 0.87, "grad_norm": 2.9331369666638416, "learning_rate": 2.5743938086541354e-07, "logits/chosen": -2.6246352195739746, "logits/rejected": -2.6102166175842285, "logps/chosen": -277.8871765136719, "logps/rejected": -267.2280578613281, "loss": 0.6743, "positive_losses": 0.29624366760253906, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2643848657608032, "rewards/margins": 0.13570401072502136, "rewards/margins_max": 0.42715948820114136, "rewards/margins_min": -0.08788014948368073, "rewards/margins_std": 0.23491962254047394, "rewards/rejected": 0.12868084013462067, "step": 3320 }, { "dpo_losses": 0.6462224721908569, "epoch": 0.87, "grad_norm": 2.3618876233738932, "learning_rate": 2.4743625071087574e-07, "logits/chosen": -2.745776653289795, "logits/rejected": -2.7600514888763428, "logps/chosen": -279.7750549316406, "logps/rejected": -276.932373046875, "loss": 0.6482, "positive_losses": 0.015347003936767578, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.25319600105285645, "rewards/margins": 0.10687966644763947, "rewards/margins_max": 0.31104451417922974, "rewards/margins_min": -0.09214670956134796, "rewards/margins_std": 0.17904457449913025, "rewards/rejected": 0.14631633460521698, "step": 3330 }, { "dpo_losses": 0.6348574161529541, "epoch": 0.87, "grad_norm": 2.050415704028374, "learning_rate": 2.3762120898116498e-07, "logits/chosen": -2.666491985321045, "logits/rejected": -2.6815104484558105, "logps/chosen": -243.59573364257812, "logps/rejected": -295.98736572265625, "loss": 0.6482, "positive_losses": 0.1339561492204666, "rewards/accuracies": 0.75, "rewards/chosen": 0.2198420763015747, "rewards/margins": 0.13329023122787476, "rewards/margins_max": 0.35944879055023193, "rewards/margins_min": -0.08258643001317978, "rewards/margins_std": 0.19745826721191406, "rewards/rejected": 0.08655181527137756, "step": 3340 }, { "dpo_losses": 0.6417210698127747, "epoch": 0.88, "grad_norm": 7.076805008506167, "learning_rate": 2.2799507522944048e-07, "logits/chosen": -2.7321081161499023, "logits/rejected": -2.6608481407165527, "logps/chosen": -252.4512481689453, "logps/rejected": -222.0477752685547, "loss": 0.6647, "positive_losses": 0.2921985983848572, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21575334668159485, "rewards/margins": 0.12168721109628677, "rewards/margins_max": 0.37518253922462463, "rewards/margins_min": -0.10554458200931549, "rewards/margins_std": 0.21923920512199402, "rewards/rejected": 0.09406615793704987, "step": 3350 }, { "dpo_losses": 0.622350811958313, "epoch": 0.88, "grad_norm": 1.7405315106965984, "learning_rate": 2.1855865323510056e-07, "logits/chosen": -2.627683401107788, "logits/rejected": -2.6018080711364746, "logps/chosen": -270.21295166015625, "logps/rejected": -278.2595520019531, "loss": 0.6677, "positive_losses": 0.20549389719963074, "rewards/accuracies": 0.8125, "rewards/chosen": 0.23529359698295593, "rewards/margins": 0.1640116572380066, "rewards/margins_max": 0.421354204416275, "rewards/margins_min": -0.051111988723278046, "rewards/margins_std": 0.2088313102722168, "rewards/rejected": 0.07128194719552994, "step": 3360 }, { "dpo_losses": 0.6479249000549316, "epoch": 0.88, "grad_norm": 2.1059422570958986, "learning_rate": 2.0931273093666575e-07, "logits/chosen": -2.7212464809417725, "logits/rejected": -2.722151041030884, "logps/chosen": -237.88265991210938, "logps/rejected": -243.563720703125, "loss": 0.6629, "positive_losses": 0.0947032943367958, "rewards/accuracies": 0.675000011920929, "rewards/chosen": 0.22334372997283936, "rewards/margins": 0.10537412017583847, "rewards/margins_max": 0.35444560647010803, "rewards/margins_min": -0.1045054942369461, "rewards/margins_std": 0.2078150510787964, "rewards/rejected": 0.11796959489583969, "step": 3370 }, { "dpo_losses": 0.6481832265853882, "epoch": 0.88, "grad_norm": 1.8210267069752408, "learning_rate": 2.002580803659873e-07, "logits/chosen": -2.749992847442627, "logits/rejected": -2.6664257049560547, "logps/chosen": -231.4402313232422, "logps/rejected": -220.24569702148438, "loss": 0.7084, "positive_losses": 0.00621795654296875, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21355018019676208, "rewards/margins": 0.10223261266946793, "rewards/margins_max": 0.3133842349052429, "rewards/margins_min": -0.06899925321340561, "rewards/margins_std": 0.17496258020401, "rewards/rejected": 0.11131759732961655, "step": 3380 }, { "dpo_losses": 0.6434181928634644, "epoch": 0.89, "grad_norm": 1.9408715354115598, "learning_rate": 1.913954575837826e-07, "logits/chosen": -2.648083209991455, "logits/rejected": -2.6244101524353027, "logps/chosen": -255.4722900390625, "logps/rejected": -243.4903106689453, "loss": 0.6574, "positive_losses": 0.25248631834983826, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.21928679943084717, "rewards/margins": 0.12077753245830536, "rewards/margins_max": 0.4127538204193115, "rewards/margins_min": -0.14272555708885193, "rewards/margins_std": 0.24786868691444397, "rewards/rejected": 0.09850926697254181, "step": 3390 }, { "dpo_losses": 0.6385716199874878, "epoch": 0.89, "grad_norm": 1.895414103452244, "learning_rate": 1.827256026165028e-07, "logits/chosen": -2.6520824432373047, "logits/rejected": -2.6471376419067383, "logps/chosen": -215.4574737548828, "logps/rejected": -237.3244171142578, "loss": 0.6522, "positive_losses": 0.17541389167308807, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.21729853749275208, "rewards/margins": 0.12453562021255493, "rewards/margins_max": 0.38050615787506104, "rewards/margins_min": -0.06733690947294235, "rewards/margins_std": 0.20129939913749695, "rewards/rejected": 0.09276290982961655, "step": 3400 }, { "epoch": 0.89, "eval_dpo_losses": 0.6378921270370483, "eval_logits/chosen": -2.6654551029205322, "eval_logits/rejected": -2.633190870285034, "eval_logps/chosen": -261.2841491699219, "eval_logps/rejected": -247.886962890625, "eval_loss": 0.6781357526779175, "eval_positive_losses": 0.24831658601760864, "eval_rewards/accuracies": 0.718999981880188, "eval_rewards/chosen": 0.233092799782753, "eval_rewards/margins": 0.1261732429265976, "eval_rewards/margins_max": 0.459024041891098, "eval_rewards/margins_min": -0.15361490845680237, "eval_rewards/margins_std": 0.20637626945972443, "eval_rewards/rejected": 0.1069195568561554, "eval_runtime": 428.688, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 3400 }, { "dpo_losses": 0.642066240310669, "epoch": 0.89, "grad_norm": 1.6204006689760988, "learning_rate": 1.7424923939454274e-07, "logits/chosen": -2.5819950103759766, "logits/rejected": -2.594771146774292, "logps/chosen": -227.9528350830078, "logps/rejected": -247.6559600830078, "loss": 0.6824, "positive_losses": 0.69378662109375, "rewards/accuracies": 0.6875, "rewards/chosen": 0.1847458928823471, "rewards/margins": 0.11629591137170792, "rewards/margins_max": 0.34145644307136536, "rewards/margins_min": -0.07148783653974533, "rewards/margins_std": 0.184322789311409, "rewards/rejected": 0.06844998151063919, "step": 3410 }, { "dpo_losses": 0.6082772016525269, "epoch": 0.9, "grad_norm": 2.387517767235919, "learning_rate": 1.6596707569179304e-07, "logits/chosen": -2.6335902214050293, "logits/rejected": -2.6150095462799072, "logps/chosen": -285.0335693359375, "logps/rejected": -256.03875732421875, "loss": 0.6254, "positive_losses": 0.09616050869226456, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2746448516845703, "rewards/margins": 0.1927504539489746, "rewards/margins_max": 0.4615437388420105, "rewards/margins_min": -0.05438561365008354, "rewards/margins_std": 0.2323668897151947, "rewards/rejected": 0.0818944051861763, "step": 3420 }, { "dpo_losses": 0.6296011805534363, "epoch": 0.9, "grad_norm": 2.0117223054428535, "learning_rate": 1.578798030665385e-07, "logits/chosen": -2.657029151916504, "logits/rejected": -2.6547393798828125, "logps/chosen": -258.6382141113281, "logps/rejected": -250.00637817382812, "loss": 0.6506, "positive_losses": 0.04038505628705025, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2152319699525833, "rewards/margins": 0.1408066600561142, "rewards/margins_max": 0.3357832729816437, "rewards/margins_min": -0.04050174355506897, "rewards/margins_std": 0.16783852875232697, "rewards/rejected": 0.07442530244588852, "step": 3430 }, { "dpo_losses": 0.6399334073066711, "epoch": 0.9, "grad_norm": 1.7339047318481389, "learning_rate": 1.499880968037165e-07, "logits/chosen": -2.655935764312744, "logits/rejected": -2.6524109840393066, "logps/chosen": -285.5038146972656, "logps/rejected": -275.9583740234375, "loss": 0.6592, "positive_losses": 0.25177130103111267, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2417500913143158, "rewards/margins": 0.12439272552728653, "rewards/margins_max": 0.35841020941734314, "rewards/margins_min": -0.10624537616968155, "rewards/margins_std": 0.20926275849342346, "rewards/rejected": 0.11735733598470688, "step": 3440 }, { "dpo_losses": 0.6234074234962463, "epoch": 0.9, "grad_norm": 12.280097577331748, "learning_rate": 1.4229261585852805e-07, "logits/chosen": -2.700300693511963, "logits/rejected": -2.6871867179870605, "logps/chosen": -284.65289306640625, "logps/rejected": -256.893310546875, "loss": 0.6828, "positive_losses": 0.6932986974716187, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.2574659287929535, "rewards/margins": 0.1591106802225113, "rewards/margins_max": 0.37911471724510193, "rewards/margins_min": -0.05546389892697334, "rewards/margins_std": 0.18983301520347595, "rewards/rejected": 0.0983552634716034, "step": 3450 }, { "dpo_losses": 0.6086649894714355, "epoch": 0.91, "grad_norm": 2.3081388848365116, "learning_rate": 1.3479400280141886e-07, "logits/chosen": -2.711552858352661, "logits/rejected": -2.64066743850708, "logps/chosen": -281.89971923828125, "logps/rejected": -220.6728057861328, "loss": 0.6489, "positive_losses": 0.29107895493507385, "rewards/accuracies": 0.8125, "rewards/chosen": 0.2659539580345154, "rewards/margins": 0.19101370871067047, "rewards/margins_max": 0.44433078169822693, "rewards/margins_min": -0.012200175784528255, "rewards/margins_std": 0.20038139820098877, "rewards/rejected": 0.0749402642250061, "step": 3460 }, { "dpo_losses": 0.6380258202552795, "epoch": 0.91, "grad_norm": 2.273957602663457, "learning_rate": 1.2749288376442044e-07, "logits/chosen": -2.6947264671325684, "logits/rejected": -2.6187143325805664, "logps/chosen": -267.95416259765625, "logps/rejected": -256.015869140625, "loss": 0.6999, "positive_losses": 0.40557432174682617, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.20190012454986572, "rewards/margins": 0.12539361417293549, "rewards/margins_max": 0.347222238779068, "rewards/margins_min": -0.07464499771595001, "rewards/margins_std": 0.19334430992603302, "rewards/rejected": 0.07650649547576904, "step": 3470 }, { "dpo_losses": 0.6449288129806519, "epoch": 0.91, "grad_norm": 2.064857984559453, "learning_rate": 1.203898683888713e-07, "logits/chosen": -2.742762327194214, "logits/rejected": -2.6796300411224365, "logps/chosen": -265.0412292480469, "logps/rejected": -270.024169921875, "loss": 0.6877, "positive_losses": 0.49461060762405396, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.2326090782880783, "rewards/margins": 0.1122850775718689, "rewards/margins_max": 0.33024439215660095, "rewards/margins_min": -0.09869068115949631, "rewards/margins_std": 0.1922878623008728, "rewards/rejected": 0.12032399326562881, "step": 3480 }, { "dpo_losses": 0.6597029566764832, "epoch": 0.91, "grad_norm": 9.678912682115064, "learning_rate": 1.1348554977451132e-07, "logits/chosen": -2.6579794883728027, "logits/rejected": -2.684399127960205, "logps/chosen": -228.6675567626953, "logps/rejected": -215.63818359375, "loss": 0.681, "positive_losses": 0.15204429626464844, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2136567384004593, "rewards/margins": 0.07788977771997452, "rewards/margins_max": 0.2724088728427887, "rewards/margins_min": -0.11260491609573364, "rewards/margins_std": 0.1709081381559372, "rewards/rejected": 0.13576695322990417, "step": 3490 }, { "dpo_losses": 0.6369072198867798, "epoch": 0.92, "grad_norm": 46.67250271589927, "learning_rate": 1.0678050442995802e-07, "logits/chosen": -2.733651638031006, "logits/rejected": -2.712925434112549, "logps/chosen": -287.21612548828125, "logps/rejected": -235.900390625, "loss": 0.7096, "positive_losses": 0.04894094541668892, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22435179352760315, "rewards/margins": 0.12650364637374878, "rewards/margins_max": 0.36697354912757874, "rewards/margins_min": -0.06886914372444153, "rewards/margins_std": 0.19230186939239502, "rewards/rejected": 0.09784816205501556, "step": 3500 }, { "epoch": 0.92, "eval_dpo_losses": 0.6372407674789429, "eval_logits/chosen": -2.667452573776245, "eval_logits/rejected": -2.635446071624756, "eval_logps/chosen": -261.3742370605469, "eval_logps/rejected": -248.14076232910156, "eval_loss": 0.6797913312911987, "eval_positive_losses": 0.26923131942749023, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": 0.23219171166419983, "eval_rewards/margins": 0.1278100609779358, "eval_rewards/margins_max": 0.46456432342529297, "eval_rewards/margins_min": -0.15515924990177155, "eval_rewards/margins_std": 0.20863020420074463, "eval_rewards/rejected": 0.10438163578510284, "eval_runtime": 428.813, "eval_samples_per_second": 4.664, "eval_steps_per_second": 0.292, "step": 3500 }, { "dpo_losses": 0.639677882194519, "epoch": 0.92, "grad_norm": 2.2602722074930117, "learning_rate": 1.0027529222456755e-07, "logits/chosen": -2.6378064155578613, "logits/rejected": -2.628966808319092, "logps/chosen": -268.2995910644531, "logps/rejected": -248.23825073242188, "loss": 0.7001, "positive_losses": 0.7789055705070496, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.22079963982105255, "rewards/margins": 0.12557564675807953, "rewards/margins_max": 0.3839789628982544, "rewards/margins_min": -0.08118332922458649, "rewards/margins_std": 0.211937814950943, "rewards/rejected": 0.09522400796413422, "step": 3510 }, { "dpo_losses": 0.6374799013137817, "epoch": 0.92, "grad_norm": 12.08746758252504, "learning_rate": 9.397045634168766e-08, "logits/chosen": -2.6791863441467285, "logits/rejected": -2.6992931365966797, "logps/chosen": -250.8428497314453, "logps/rejected": -250.5262451171875, "loss": 0.711, "positive_losses": 0.5547275543212891, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.18645942211151123, "rewards/margins": 0.1275174468755722, "rewards/margins_max": 0.383579283952713, "rewards/margins_min": -0.09520339965820312, "rewards/margins_std": 0.20841717720031738, "rewards/rejected": 0.05894196778535843, "step": 3520 }, { "dpo_losses": 0.6348574161529541, "epoch": 0.92, "grad_norm": 8.536143035062093, "learning_rate": 8.78665232332998e-08, "logits/chosen": -2.7510018348693848, "logits/rejected": -2.730635166168213, "logps/chosen": -273.70770263671875, "logps/rejected": -265.7099609375, "loss": 0.6385, "positive_losses": 0.04913749545812607, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.25052696466445923, "rewards/margins": 0.13073930144309998, "rewards/margins_max": 0.3736670911312103, "rewards/margins_min": -0.044946182519197464, "rewards/margins_std": 0.18753521144390106, "rewards/rejected": 0.11978765577077866, "step": 3530 }, { "dpo_losses": 0.6442808508872986, "epoch": 0.93, "grad_norm": 15.932070057693137, "learning_rate": 8.196400257606208e-08, "logits/chosen": -2.633051633834839, "logits/rejected": -2.5638608932495117, "logps/chosen": -281.0552978515625, "logps/rejected": -298.94964599609375, "loss": 0.7265, "positive_losses": 0.8619983792304993, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.2538689970970154, "rewards/margins": 0.1262858361005783, "rewards/margins_max": 0.4654213786125183, "rewards/margins_min": -0.13996709883213043, "rewards/margins_std": 0.275529682636261, "rewards/rejected": 0.12758317589759827, "step": 3540 }, { "dpo_losses": 0.6315485835075378, "epoch": 0.93, "grad_norm": 2.1180908978402395, "learning_rate": 7.626338722875076e-08, "logits/chosen": -2.6209847927093506, "logits/rejected": -2.6487958431243896, "logps/chosen": -238.2965850830078, "logps/rejected": -258.9308166503906, "loss": 0.662, "positive_losses": 0.09130015224218369, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.2700076103210449, "rewards/margins": 0.14941179752349854, "rewards/margins_max": 0.41716593503952026, "rewards/margins_min": -0.09211661666631699, "rewards/margins_std": 0.23184537887573242, "rewards/rejected": 0.12059581279754639, "step": 3550 }, { "dpo_losses": 0.6292354464530945, "epoch": 0.93, "grad_norm": 10.541543067740294, "learning_rate": 7.076515319110688e-08, "logits/chosen": -2.6589627265930176, "logits/rejected": -2.5958473682403564, "logps/chosen": -290.36212158203125, "logps/rejected": -261.944091796875, "loss": 0.6614, "positive_losses": 0.29073411226272583, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.2517632246017456, "rewards/margins": 0.14822056889533997, "rewards/margins_max": 0.42206496000289917, "rewards/margins_min": -0.09494920074939728, "rewards/margins_std": 0.23037895560264587, "rewards/rejected": 0.10354267060756683, "step": 3560 }, { "dpo_losses": 0.6255845427513123, "epoch": 0.93, "grad_norm": 9.517378722195478, "learning_rate": 6.54697595640899e-08, "logits/chosen": -2.6677193641662598, "logits/rejected": -2.641484260559082, "logps/chosen": -300.04791259765625, "logps/rejected": -249.2549285888672, "loss": 0.6987, "positive_losses": 0.5968934297561646, "rewards/accuracies": 0.7749999761581421, "rewards/chosen": 0.22224600613117218, "rewards/margins": 0.1552506685256958, "rewards/margins_max": 0.44209718704223633, "rewards/margins_min": -0.10256730020046234, "rewards/margins_std": 0.23920206725597382, "rewards/rejected": 0.06699535250663757, "step": 3570 }, { "dpo_losses": 0.6264487504959106, "epoch": 0.94, "grad_norm": 6.846464829235522, "learning_rate": 6.037764851154426e-08, "logits/chosen": -2.715301990509033, "logits/rejected": -2.6565797328948975, "logps/chosen": -265.61834716796875, "logps/rejected": -247.1842498779297, "loss": 0.6937, "positive_losses": 0.5297044515609741, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.22866220772266388, "rewards/margins": 0.1546943038702011, "rewards/margins_max": 0.39230793714523315, "rewards/margins_min": -0.1014304980635643, "rewards/margins_std": 0.22269447147846222, "rewards/rejected": 0.07396790385246277, "step": 3580 }, { "dpo_losses": 0.6250921487808228, "epoch": 0.94, "grad_norm": 5.132066460539605, "learning_rate": 5.548924522327748e-08, "logits/chosen": -2.6867854595184326, "logits/rejected": -2.66359806060791, "logps/chosen": -249.00961303710938, "logps/rejected": -220.60806274414062, "loss": 0.6867, "positive_losses": 0.6274803876876831, "rewards/accuracies": 0.75, "rewards/chosen": 0.23753464221954346, "rewards/margins": 0.1585163176059723, "rewards/margins_max": 0.44630104303359985, "rewards/margins_min": -0.09158410876989365, "rewards/margins_std": 0.23935404419898987, "rewards/rejected": 0.07901832461357117, "step": 3590 }, { "dpo_losses": 0.646395206451416, "epoch": 0.94, "grad_norm": 2.6248304221464736, "learning_rate": 5.0804957879556915e-08, "logits/chosen": -2.6485066413879395, "logits/rejected": -2.606511354446411, "logps/chosen": -237.2054901123047, "logps/rejected": -249.83096313476562, "loss": 0.6554, "positive_losses": 0.0, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22109758853912354, "rewards/margins": 0.10693392902612686, "rewards/margins_max": 0.31272679567337036, "rewards/margins_min": -0.12216305732727051, "rewards/margins_std": 0.19301414489746094, "rewards/rejected": 0.11416369676589966, "step": 3600 }, { "epoch": 0.94, "eval_dpo_losses": 0.6379253268241882, "eval_logits/chosen": -2.6683716773986816, "eval_logits/rejected": -2.6362881660461426, "eval_logps/chosen": -261.234375, "eval_logps/rejected": -247.8321990966797, "eval_loss": 0.6779402494430542, "eval_positive_losses": 0.251412034034729, "eval_rewards/accuracies": 0.7200000286102295, "eval_rewards/chosen": 0.23359021544456482, "eval_rewards/margins": 0.12612289190292358, "eval_rewards/margins_max": 0.4599488079547882, "eval_rewards/margins_min": -0.15298037230968475, "eval_rewards/margins_std": 0.20651289820671082, "eval_rewards/rejected": 0.10746732354164124, "eval_runtime": 428.8856, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 3600 }, { "dpo_losses": 0.6407705545425415, "epoch": 0.94, "grad_norm": 1.9936227452913986, "learning_rate": 4.632517761702815e-08, "logits/chosen": -2.6707515716552734, "logits/rejected": -2.6665282249450684, "logps/chosen": -305.3087463378906, "logps/rejected": -247.710205078125, "loss": 0.6606, "positive_losses": 0.02645111083984375, "rewards/accuracies": 0.75, "rewards/chosen": 0.23321452736854553, "rewards/margins": 0.11667405068874359, "rewards/margins_max": 0.29690879583358765, "rewards/margins_min": -0.07151924818754196, "rewards/margins_std": 0.16342324018478394, "rewards/rejected": 0.11654046922922134, "step": 3610 }, { "dpo_losses": 0.6456506848335266, "epoch": 0.95, "grad_norm": 2.721072640831352, "learning_rate": 4.205027849605359e-08, "logits/chosen": -2.710562229156494, "logits/rejected": -2.6717777252197266, "logps/chosen": -262.54815673828125, "logps/rejected": -237.6497802734375, "loss": 0.6557, "positive_losses": 0.031458280980587006, "rewards/accuracies": 0.737500011920929, "rewards/chosen": 0.23188185691833496, "rewards/margins": 0.1048072949051857, "rewards/margins_max": 0.2976217567920685, "rewards/margins_min": -0.06104566901922226, "rewards/margins_std": 0.16042517125606537, "rewards/rejected": 0.12707456946372986, "step": 3620 }, { "dpo_losses": 0.6386553049087524, "epoch": 0.95, "grad_norm": 10.008769576967202, "learning_rate": 3.798061746947995e-08, "logits/chosen": -2.6221561431884766, "logits/rejected": -2.5709946155548096, "logps/chosen": -228.15969848632812, "logps/rejected": -274.2277526855469, "loss": 0.6887, "positive_losses": 0.8732093572616577, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.19012576341629028, "rewards/margins": 0.12254045903682709, "rewards/margins_max": 0.34004732966423035, "rewards/margins_min": -0.06443891674280167, "rewards/margins_std": 0.18650075793266296, "rewards/rejected": 0.0675852820277214, "step": 3630 }, { "dpo_losses": 0.6430724263191223, "epoch": 0.95, "grad_norm": 1.7010971035385118, "learning_rate": 3.411653435283158e-08, "logits/chosen": -2.657015562057495, "logits/rejected": -2.626798629760742, "logps/chosen": -234.0613250732422, "logps/rejected": -210.7574005126953, "loss": 0.6809, "positive_losses": 0.08847179263830185, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.22986125946044922, "rewards/margins": 0.11668477207422256, "rewards/margins_max": 0.3718630373477936, "rewards/margins_min": -0.11003968864679337, "rewards/margins_std": 0.2165330946445465, "rewards/rejected": 0.11317648738622665, "step": 3640 }, { "dpo_losses": 0.6503810286521912, "epoch": 0.96, "grad_norm": 1.8738600762565447, "learning_rate": 3.04583517959367e-08, "logits/chosen": -2.672093391418457, "logits/rejected": -2.673926830291748, "logps/chosen": -272.6725769042969, "logps/rejected": -283.348388671875, "loss": 0.6581, "positive_losses": 0.20909519493579865, "rewards/accuracies": 0.6000000238418579, "rewards/chosen": 0.2302829772233963, "rewards/margins": 0.1008058562874794, "rewards/margins_max": 0.35259950160980225, "rewards/margins_min": -0.12621888518333435, "rewards/margins_std": 0.21480269730091095, "rewards/rejected": 0.1294771134853363, "step": 3650 }, { "dpo_losses": 0.6644676923751831, "epoch": 0.96, "grad_norm": 2.2468968290878837, "learning_rate": 2.7006375255985984e-08, "logits/chosen": -2.818138599395752, "logits/rejected": -2.7806057929992676, "logps/chosen": -262.3272705078125, "logps/rejected": -261.388671875, "loss": 0.7127, "positive_losses": 0.4840957522392273, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2029964029788971, "rewards/margins": 0.0679345428943634, "rewards/margins_max": 0.2564147114753723, "rewards/margins_min": -0.13476568460464478, "rewards/margins_std": 0.17046482861042023, "rewards/rejected": 0.1350618600845337, "step": 3660 }, { "dpo_losses": 0.6274867057800293, "epoch": 0.96, "grad_norm": 1.8861153152948316, "learning_rate": 2.3760892972027328e-08, "logits/chosen": -2.67265248298645, "logits/rejected": -2.644329786300659, "logps/chosen": -275.19708251953125, "logps/rejected": -284.86480712890625, "loss": 0.658, "positive_losses": 0.20399093627929688, "rewards/accuracies": 0.7250000238418579, "rewards/chosen": 0.24722471833229065, "rewards/margins": 0.1497470587491989, "rewards/margins_max": 0.4041268825531006, "rewards/margins_min": -0.06888915598392487, "rewards/margins_std": 0.2147652804851532, "rewards/rejected": 0.09747765958309174, "step": 3670 }, { "dpo_losses": 0.6478864550590515, "epoch": 0.96, "grad_norm": 5.029289669290955, "learning_rate": 2.072217594089765e-08, "logits/chosen": -2.676222562789917, "logits/rejected": -2.641787052154541, "logps/chosen": -265.0032043457031, "logps/rejected": -257.2332763671875, "loss": 0.6828, "positive_losses": 0.23020200431346893, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.2107507288455963, "rewards/margins": 0.10467696189880371, "rewards/margins_max": 0.3455045521259308, "rewards/margins_min": -0.12821164727210999, "rewards/margins_std": 0.2098625898361206, "rewards/rejected": 0.10607375204563141, "step": 3680 }, { "dpo_losses": 0.6204084157943726, "epoch": 0.97, "grad_norm": 10.972239056237635, "learning_rate": 1.789047789459375e-08, "logits/chosen": -2.6845951080322266, "logits/rejected": -2.65423583984375, "logps/chosen": -324.8729553222656, "logps/rejected": -261.7614440917969, "loss": 0.6469, "positive_losses": 0.0, "rewards/accuracies": 0.75, "rewards/chosen": 0.2780536711215973, "rewards/margins": 0.16375882923603058, "rewards/margins_max": 0.36409690976142883, "rewards/margins_min": -0.06626104563474655, "rewards/margins_std": 0.1893640011548996, "rewards/rejected": 0.11429482698440552, "step": 3690 }, { "dpo_losses": 0.6478589177131653, "epoch": 0.97, "grad_norm": 7.9739929902768525, "learning_rate": 1.5266035279088708e-08, "logits/chosen": -2.769407272338867, "logits/rejected": -2.7307915687561035, "logps/chosen": -267.03485107421875, "logps/rejected": -241.7327117919922, "loss": 0.7134, "positive_losses": 0.8700395822525024, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.23777230083942413, "rewards/margins": 0.10296879708766937, "rewards/margins_max": 0.3077225089073181, "rewards/margins_min": -0.10617302358150482, "rewards/margins_std": 0.18426969647407532, "rewards/rejected": 0.13480350375175476, "step": 3700 }, { "epoch": 0.97, "eval_dpo_losses": 0.6379401683807373, "eval_logits/chosen": -2.6680033206939697, "eval_logits/rejected": -2.6359734535217285, "eval_logps/chosen": -261.2256774902344, "eval_logps/rejected": -247.81829833984375, "eval_loss": 0.6779002547264099, "eval_positive_losses": 0.2483443021774292, "eval_rewards/accuracies": 0.722000002861023, "eval_rewards/chosen": 0.233677476644516, "eval_rewards/margins": 0.12607111036777496, "eval_rewards/margins_max": 0.4593754708766937, "eval_rewards/margins_min": -0.15287671983242035, "eval_rewards/margins_std": 0.20637017488479614, "eval_rewards/rejected": 0.10760633647441864, "eval_runtime": 428.7157, "eval_samples_per_second": 4.665, "eval_steps_per_second": 0.292, "step": 3700 }, { "dpo_losses": 0.647309422492981, "epoch": 0.97, "grad_norm": 7.387525622328266, "learning_rate": 1.2849067234584623e-08, "logits/chosen": -2.639650821685791, "logits/rejected": -2.5958681106567383, "logps/chosen": -255.32797241210938, "logps/rejected": -237.5811767578125, "loss": 0.6627, "positive_losses": 0.16466370224952698, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.23427970707416534, "rewards/margins": 0.10695306956768036, "rewards/margins_max": 0.31495481729507446, "rewards/margins_min": -0.11774158477783203, "rewards/margins_std": 0.19992010295391083, "rewards/rejected": 0.1273266226053238, "step": 3710 }, { "dpo_losses": 0.6346789598464966, "epoch": 0.97, "grad_norm": 10.090532575451604, "learning_rate": 1.0639775577218625e-08, "logits/chosen": -2.6735434532165527, "logits/rejected": -2.6736886501312256, "logps/chosen": -239.50125122070312, "logps/rejected": -239.09219360351562, "loss": 0.6685, "positive_losses": 0.6132221221923828, "rewards/accuracies": 0.6875, "rewards/chosen": 0.2345018833875656, "rewards/margins": 0.13293547928333282, "rewards/margins_max": 0.36295372247695923, "rewards/margins_min": -0.0947188287973404, "rewards/margins_std": 0.20821118354797363, "rewards/rejected": 0.10156641155481339, "step": 3720 }, { "dpo_losses": 0.6413782835006714, "epoch": 0.98, "grad_norm": 15.554879770283303, "learning_rate": 8.638344782207486e-09, "logits/chosen": -2.6854450702667236, "logits/rejected": -2.6753978729248047, "logps/chosen": -299.7012939453125, "logps/rejected": -269.0886535644531, "loss": 0.7551, "positive_losses": 0.954269528388977, "rewards/accuracies": 0.612500011920929, "rewards/chosen": 0.24659478664398193, "rewards/margins": 0.12330418825149536, "rewards/margins_max": 0.4121350646018982, "rewards/margins_min": -0.12201066315174103, "rewards/margins_std": 0.23730938136577606, "rewards/rejected": 0.12329061329364777, "step": 3730 }, { "dpo_losses": 0.618949830532074, "epoch": 0.98, "grad_norm": 11.43429170230489, "learning_rate": 6.84494196844715e-09, "logits/chosen": -2.6758453845977783, "logits/rejected": -2.634490728378296, "logps/chosen": -281.9029235839844, "logps/rejected": -265.92926025390625, "loss": 0.6541, "positive_losses": 0.4289817810058594, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.25217822194099426, "rewards/margins": 0.1817954182624817, "rewards/margins_max": 0.4762052893638611, "rewards/margins_min": -0.04849465936422348, "rewards/margins_std": 0.23775847256183624, "rewards/rejected": 0.07038280367851257, "step": 3740 }, { "dpo_losses": 0.6470276117324829, "epoch": 0.98, "grad_norm": 2.344804043583945, "learning_rate": 5.259716884556121e-09, "logits/chosen": -2.6333367824554443, "logits/rejected": -2.6696276664733887, "logps/chosen": -269.58636474609375, "logps/rejected": -258.2654724121094, "loss": 0.6969, "positive_losses": 1.0564213991165161, "rewards/accuracies": 0.7124999761581421, "rewards/chosen": 0.23361241817474365, "rewards/margins": 0.10669457912445068, "rewards/margins_max": 0.3290795683860779, "rewards/margins_min": -0.11579986661672592, "rewards/margins_std": 0.19672229886054993, "rewards/rejected": 0.12691782414913177, "step": 3750 }, { "dpo_losses": 0.6406267285346985, "epoch": 0.98, "grad_norm": 1.9499945616034942, "learning_rate": 3.882801896372967e-09, "logits/chosen": -2.6625890731811523, "logits/rejected": -2.6816468238830566, "logps/chosen": -233.4760284423828, "logps/rejected": -241.8442840576172, "loss": 0.6636, "positive_losses": 0.33517932891845703, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.21011845767498016, "rewards/margins": 0.12152941524982452, "rewards/margins_max": 0.34125444293022156, "rewards/margins_min": -0.08021117746829987, "rewards/margins_std": 0.1876247674226761, "rewards/rejected": 0.08858904987573624, "step": 3760 }, { "dpo_losses": 0.6517797708511353, "epoch": 0.99, "grad_norm": 16.602540412374584, "learning_rate": 2.7143119759026614e-09, "logits/chosen": -2.6886088848114014, "logits/rejected": -2.629971981048584, "logps/chosen": -284.85333251953125, "logps/rejected": -292.6435241699219, "loss": 0.6772, "positive_losses": 0.469033420085907, "rewards/accuracies": 0.699999988079071, "rewards/chosen": 0.25503087043762207, "rewards/margins": 0.09973520040512085, "rewards/margins_max": 0.3132280707359314, "rewards/margins_min": -0.11939004808664322, "rewards/margins_std": 0.19445772469043732, "rewards/rejected": 0.15529564023017883, "step": 3770 }, { "dpo_losses": 0.6563539505004883, "epoch": 0.99, "grad_norm": 13.373401581985298, "learning_rate": 1.754344691717591e-09, "logits/chosen": -2.6870341300964355, "logits/rejected": -2.6425023078918457, "logps/chosen": -249.67434692382812, "logps/rejected": -257.3709411621094, "loss": 0.6629, "positive_losses": 0.17134341597557068, "rewards/accuracies": 0.625, "rewards/chosen": 0.20479460060596466, "rewards/margins": 0.09673847258090973, "rewards/margins_max": 0.37108755111694336, "rewards/margins_min": -0.22099053859710693, "rewards/margins_std": 0.26931631565093994, "rewards/rejected": 0.10805612802505493, "step": 3780 }, { "dpo_losses": 0.6493461728096008, "epoch": 0.99, "grad_norm": 17.939112786816548, "learning_rate": 1.0029802008096335e-09, "logits/chosen": -2.703834056854248, "logits/rejected": -2.6924235820770264, "logps/chosen": -215.004150390625, "logps/rejected": -188.3095703125, "loss": 0.7397, "positive_losses": 1.0388996601104736, "rewards/accuracies": 0.762499988079071, "rewards/chosen": 0.19507798552513123, "rewards/margins": 0.09886033087968826, "rewards/margins_max": 0.2661459147930145, "rewards/margins_min": -0.07485532015562057, "rewards/margins_std": 0.15068285167217255, "rewards/rejected": 0.09621763974428177, "step": 3790 }, { "dpo_losses": 0.6362841725349426, "epoch": 0.99, "grad_norm": 10.420088630636137, "learning_rate": 4.602812418974534e-10, "logits/chosen": -2.6196417808532715, "logits/rejected": -2.6174726486206055, "logps/chosen": -262.04779052734375, "logps/rejected": -199.59432983398438, "loss": 0.6563, "positive_losses": 0.0, "rewards/accuracies": 0.800000011920929, "rewards/chosen": 0.25773847103118896, "rewards/margins": 0.13278800249099731, "rewards/margins_max": 0.40468335151672363, "rewards/margins_min": -0.05471289902925491, "rewards/margins_std": 0.20865869522094727, "rewards/rejected": 0.12495045363903046, "step": 3800 }, { "epoch": 0.99, "eval_dpo_losses": 0.6379905939102173, "eval_logits/chosen": -2.666172981262207, "eval_logits/rejected": -2.633877992630005, "eval_logps/chosen": -261.2152404785156, "eval_logps/rejected": -247.79693603515625, "eval_loss": 0.6776851415634155, "eval_positive_losses": 0.24759767949581146, "eval_rewards/accuracies": 0.7239999771118164, "eval_rewards/chosen": 0.23378188908100128, "eval_rewards/margins": 0.12596195936203003, "eval_rewards/margins_max": 0.4592114984989166, "eval_rewards/margins_min": -0.1531001180410385, "eval_rewards/margins_std": 0.20633676648139954, "eval_rewards/rejected": 0.10781992226839066, "eval_runtime": 428.9455, "eval_samples_per_second": 4.663, "eval_steps_per_second": 0.291, "step": 3800 }, { "dpo_losses": 0.644007682800293, "epoch": 1.0, "grad_norm": 1.7586445513403168, "learning_rate": 1.2629313018819312e-10, "logits/chosen": -2.693336009979248, "logits/rejected": -2.6638665199279785, "logps/chosen": -259.37847900390625, "logps/rejected": -234.46383666992188, "loss": 0.6643, "positive_losses": 0.09393195807933807, "rewards/accuracies": 0.6625000238418579, "rewards/chosen": 0.22506313025951385, "rewards/margins": 0.11026908457279205, "rewards/margins_max": 0.3049668073654175, "rewards/margins_min": -0.07908110320568085, "rewards/margins_std": 0.17724165320396423, "rewards/rejected": 0.11479403078556061, "step": 3810 }, { "dpo_losses": 0.6355063915252686, "epoch": 1.0, "grad_norm": 6.5856032824079005, "learning_rate": 1.0437535929996855e-12, "logits/chosen": -2.634423017501831, "logits/rejected": -2.624018430709839, "logps/chosen": -307.9532470703125, "logps/rejected": -207.0972900390625, "loss": 0.677, "positive_losses": 0.70243901014328, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.23277465999126434, "rewards/margins": 0.1376054733991623, "rewards/margins_max": 0.380585640668869, "rewards/margins_min": -0.09022089093923569, "rewards/margins_std": 0.21244606375694275, "rewards/rejected": 0.09516917914152145, "step": 3820 }, { "epoch": 1.0, "step": 3821, "total_flos": 0.0, "train_loss": 0.6803070375102567, "train_runtime": 46122.1645, "train_samples_per_second": 1.325, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 3821, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }