{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 73, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 29.326078015981345, "learning_rate": 6.25e-08, "logps/chosen": -47.87165832519531, "logps/rejected": -35.03704071044922, "loss": 0.6939, "losses/dpo": 0.7437427639961243, "losses/sft": 0.2519839406013489, "losses/total": 0.7437427639961243, "ref_logps/chosen": -47.90069580078125, "ref_logps/rejected": -35.07575225830078, "rewards/accuracies": 0.4609375, "rewards/chosen": 0.0029037208296358585, "rewards/margins": -0.0009674869943410158, "rewards/rejected": 0.0038712075911462307, "step": 1 }, { "epoch": 0.03, "grad_norm": 25.98987817588094, "learning_rate": 1.25e-07, "logps/chosen": -46.03837966918945, "logps/rejected": -34.79166030883789, "loss": 0.6937, "losses/dpo": 0.711306095123291, "losses/sft": 0.21511156857013702, "losses/total": 0.711306095123291, "ref_logps/chosen": -46.05853271484375, "ref_logps/rejected": -34.81706237792969, "rewards/accuracies": 0.5, "rewards/chosen": 0.0020157406106591225, "rewards/margins": -0.000524366507306695, "rewards/rejected": 0.002540107350796461, "step": 2 }, { "epoch": 0.04, "grad_norm": 43.145173675858224, "learning_rate": 1.875e-07, "logps/chosen": -41.797569274902344, "logps/rejected": -31.708539962768555, "loss": 0.693, "losses/dpo": 0.7042351365089417, "losses/sft": 0.18763618171215057, "losses/total": 0.7042351365089417, "ref_logps/chosen": -41.833030700683594, "ref_logps/rejected": -31.735107421875, "rewards/accuracies": 0.5234375, "rewards/chosen": 0.003545756684616208, "rewards/margins": 0.0008889732416719198, "rewards/rejected": 0.0026567834429442883, "step": 3 }, { "epoch": 0.05, "grad_norm": 31.32790996670384, "learning_rate": 2.5e-07, "logps/chosen": -42.71172332763672, "logps/rejected": -32.757808685302734, "loss": 0.6927, "losses/dpo": 0.6976655125617981, "losses/sft": 0.17784112691879272, "losses/total": 0.6976655125617981, "ref_logps/chosen": -42.72623062133789, "ref_logps/rejected": -32.75667190551758, "rewards/accuracies": 0.5, "rewards/chosen": 0.0014508566819131374, "rewards/margins": 0.0015643269289284945, "rewards/rejected": -0.00011346983956173062, "step": 4 }, { "epoch": 0.07, "grad_norm": 35.10577986645193, "learning_rate": 3.1249999999999997e-07, "logps/chosen": -45.85194396972656, "logps/rejected": -34.628639221191406, "loss": 0.689, "losses/dpo": 0.7395577430725098, "losses/sft": 0.17383158206939697, "losses/total": 0.7395577430725098, "ref_logps/chosen": -45.91680145263672, "ref_logps/rejected": -34.60468673706055, "rewards/accuracies": 0.5703125, "rewards/chosen": 0.006485694088041782, "rewards/margins": 0.008881103247404099, "rewards/rejected": -0.002395408693701029, "step": 5 }, { "epoch": 0.08, "grad_norm": 28.260278751569523, "learning_rate": 3.75e-07, "logps/chosen": -42.09749221801758, "logps/rejected": -32.70561599731445, "loss": 0.6932, "losses/dpo": 0.6590798497200012, "losses/sft": 0.18368251621723175, "losses/total": 0.6590798497200012, "ref_logps/chosen": -42.06741714477539, "ref_logps/rejected": -32.67097473144531, "rewards/accuracies": 0.484375, "rewards/chosen": -0.0030076471157372, "rewards/margins": 0.0004564363043755293, "rewards/rejected": -0.0034640836529433727, "step": 6 }, { "epoch": 0.1, "grad_norm": 45.257780534421805, "learning_rate": 4.375e-07, "logps/chosen": -48.16801834106445, "logps/rejected": -35.98320770263672, "loss": 0.6931, "losses/dpo": 0.674820065498352, "losses/sft": 0.17130310833454132, "losses/total": 0.674820065498352, "ref_logps/chosen": -48.16166687011719, "ref_logps/rejected": -35.96845245361328, "rewards/accuracies": 0.515625, "rewards/chosen": -0.0006352070486173034, "rewards/margins": 0.0008399828802794218, "rewards/rejected": -0.0014751903945580125, "step": 7 }, { "epoch": 0.11, "grad_norm": 37.963707614132204, "learning_rate": 5e-07, "logps/chosen": -46.631561279296875, "logps/rejected": -34.54258728027344, "loss": 0.6911, "losses/dpo": 0.6616916060447693, "losses/sft": 0.15279927849769592, "losses/total": 0.6616916060447693, "ref_logps/chosen": -46.690643310546875, "ref_logps/rejected": -34.551368713378906, "rewards/accuracies": 0.5546875, "rewards/chosen": 0.005908225197345018, "rewards/margins": 0.005030112341046333, "rewards/rejected": 0.0008781132637523115, "step": 8 }, { "epoch": 0.12, "grad_norm": 23.24345634411509, "learning_rate": 4.997080567080816e-07, "logps/chosen": -45.053184509277344, "logps/rejected": -35.14673614501953, "loss": 0.6888, "losses/dpo": 0.645126461982727, "losses/sft": 0.1863231658935547, "losses/total": 0.645126461982727, "ref_logps/chosen": -45.13517379760742, "ref_logps/rejected": -35.132957458496094, "rewards/accuracies": 0.5390625, "rewards/chosen": 0.008199075236916542, "rewards/margins": 0.009576688520610332, "rewards/rejected": -0.0013776118867099285, "step": 9 }, { "epoch": 0.14, "grad_norm": 27.949597341892236, "learning_rate": 4.988329086794122e-07, "logps/chosen": -46.718475341796875, "logps/rejected": -36.01044464111328, "loss": 0.6845, "losses/dpo": 0.6536989212036133, "losses/sft": 0.16235677897930145, "losses/total": 0.6536989212036133, "ref_logps/chosen": -46.86553192138672, "ref_logps/rejected": -35.97478103637695, "rewards/accuracies": 0.6171875, "rewards/chosen": 0.0147053562104702, "rewards/margins": 0.018271632492542267, "rewards/rejected": -0.0035662769805639982, "step": 10 }, { "epoch": 0.15, "grad_norm": 40.316536183472955, "learning_rate": 4.973765998627628e-07, "logps/chosen": -45.7076416015625, "logps/rejected": -32.744361877441406, "loss": 0.6758, "losses/dpo": 0.639275848865509, "losses/sft": 0.19072100520133972, "losses/total": 0.639275848865509, "ref_logps/chosen": -45.953941345214844, "ref_logps/rejected": -32.63063430786133, "rewards/accuracies": 0.734375, "rewards/chosen": 0.024630192667245865, "rewards/margins": 0.036002762615680695, "rewards/rejected": -0.01137256994843483, "step": 11 }, { "epoch": 0.16, "grad_norm": 31.231333750699285, "learning_rate": 4.953425315348533e-07, "logps/chosen": -48.346229553222656, "logps/rejected": -35.44029235839844, "loss": 0.6735, "losses/dpo": 0.7411879301071167, "losses/sft": 0.30462783575057983, "losses/total": 0.7411879301071167, "ref_logps/chosen": -48.579471588134766, "ref_logps/rejected": -35.26258087158203, "rewards/accuracies": 0.75, "rewards/chosen": 0.023324450477957726, "rewards/margins": 0.04109576344490051, "rewards/rejected": -0.017771316692233086, "step": 12 }, { "epoch": 0.18, "grad_norm": 24.02378939332813, "learning_rate": 4.92735454356513e-07, "logps/chosen": -43.760799407958984, "logps/rejected": -32.20792007446289, "loss": 0.6771, "losses/dpo": 0.7643380761146545, "losses/sft": 0.15294401347637177, "losses/total": 0.7643380761146545, "ref_logps/chosen": -43.909759521484375, "ref_logps/rejected": -32.016273498535156, "rewards/accuracies": 0.65625, "rewards/chosen": 0.01489595789462328, "rewards/margins": 0.034060731530189514, "rewards/rejected": -0.01916477642953396, "step": 13 }, { "epoch": 0.19, "grad_norm": 33.47814491109199, "learning_rate": 4.895614572772916e-07, "logps/chosen": -45.79880905151367, "logps/rejected": -34.85653305053711, "loss": 0.6669, "losses/dpo": 0.7224411368370056, "losses/sft": 0.2095840573310852, "losses/total": 0.7224411368370056, "ref_logps/chosen": -46.07813262939453, "ref_logps/rejected": -34.58377456665039, "rewards/accuracies": 0.734375, "rewards/chosen": 0.02793230675160885, "rewards/margins": 0.055208105593919754, "rewards/rejected": -0.027275800704956055, "step": 14 }, { "epoch": 0.21, "grad_norm": 47.78782257013143, "learning_rate": 4.858279533144357e-07, "logps/chosen": -47.91066360473633, "logps/rejected": -36.8038330078125, "loss": 0.6545, "losses/dpo": 0.5712046027183533, "losses/sft": 0.20200778543949127, "losses/total": 0.5712046027183533, "ref_logps/chosen": -48.32217788696289, "ref_logps/rejected": -36.395023345947266, "rewards/accuracies": 0.765625, "rewards/chosen": 0.04115153104066849, "rewards/margins": 0.08203274011611938, "rewards/rejected": -0.040881212800741196, "step": 15 }, { "epoch": 0.22, "grad_norm": 246.97737804069968, "learning_rate": 4.815436622394441e-07, "logps/chosen": -46.90559387207031, "logps/rejected": -36.626888275146484, "loss": 0.6465, "losses/dpo": 0.7274478077888489, "losses/sft": 0.26765260100364685, "losses/total": 0.7274478077888489, "ref_logps/chosen": -47.21229934692383, "ref_logps/rejected": -35.93655776977539, "rewards/accuracies": 0.78125, "rewards/chosen": 0.03067046031355858, "rewards/margins": 0.09970355033874512, "rewards/rejected": -0.06903309375047684, "step": 16 }, { "epoch": 0.23, "grad_norm": 23.079239827774252, "learning_rate": 4.767185902126363e-07, "logps/chosen": -48.87858200073242, "logps/rejected": -36.90644073486328, "loss": 0.633, "losses/dpo": 0.6357161998748779, "losses/sft": 0.1839471459388733, "losses/total": 0.6357161998748779, "ref_logps/chosen": -49.40204620361328, "ref_logps/rejected": -36.11450958251953, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.05234625190496445, "rewards/margins": 0.13153919577598572, "rewards/rejected": -0.07919295132160187, "step": 17 }, { "epoch": 0.25, "grad_norm": 25.63300252359878, "learning_rate": 4.7136400641330245e-07, "logps/chosen": -46.71650695800781, "logps/rejected": -37.09510040283203, "loss": 0.6297, "losses/dpo": 0.6393631100654602, "losses/sft": 0.21227942407131195, "losses/total": 0.6393631100654602, "ref_logps/chosen": -46.991477966308594, "ref_logps/rejected": -35.969173431396484, "rewards/accuracies": 0.8203125, "rewards/chosen": 0.02749716117978096, "rewards/margins": 0.14008952677249908, "rewards/rejected": -0.11259236931800842, "step": 18 }, { "epoch": 0.26, "grad_norm": 26.311859157755837, "learning_rate": 4.6549241672001225e-07, "logps/chosen": -43.63357162475586, "logps/rejected": -34.979026794433594, "loss": 0.6077, "losses/dpo": 0.5548383593559265, "losses/sft": 0.19493867456912994, "losses/total": 0.5548383593559265, "ref_logps/chosen": -44.03193664550781, "ref_logps/rejected": -33.485252380371094, "rewards/accuracies": 0.8515625, "rewards/chosen": 0.03983645513653755, "rewards/margins": 0.18921390175819397, "rewards/rejected": -0.14937745034694672, "step": 19 }, { "epoch": 0.27, "grad_norm": 28.714173620781665, "learning_rate": 4.591175345025566e-07, "logps/chosen": -46.371559143066406, "logps/rejected": -35.243812561035156, "loss": 0.609, "losses/dpo": 0.6410955190658569, "losses/sft": 0.16183941066265106, "losses/total": 0.6410955190658569, "ref_logps/chosen": -46.70909881591797, "ref_logps/rejected": -33.71453857421875, "rewards/accuracies": 0.828125, "rewards/chosen": 0.03375420719385147, "rewards/margins": 0.18668171763420105, "rewards/rejected": -0.15292751789093018, "step": 20 }, { "epoch": 0.29, "grad_norm": 26.549036618365495, "learning_rate": 4.5225424859373684e-07, "logps/chosen": -41.521549224853516, "logps/rejected": -34.770103454589844, "loss": 0.5963, "losses/dpo": 0.7364767789840698, "losses/sft": 0.17622552812099457, "losses/total": 0.7364767789840698, "ref_logps/chosen": -41.7501106262207, "ref_logps/rejected": -32.80527114868164, "rewards/accuracies": 0.859375, "rewards/chosen": 0.02285606414079666, "rewards/margins": 0.21933907270431519, "rewards/rejected": -0.19648301601409912, "step": 21 }, { "epoch": 0.3, "grad_norm": 33.26960463303905, "learning_rate": 4.4491858851580553e-07, "logps/chosen": -45.94141387939453, "logps/rejected": -36.16654968261719, "loss": 0.5887, "losses/dpo": 0.495862752199173, "losses/sft": 0.17526012659072876, "losses/total": 0.495862752199173, "ref_logps/chosen": -46.16797637939453, "ref_logps/rejected": -33.92024612426758, "rewards/accuracies": 0.84375, "rewards/chosen": 0.02265631966292858, "rewards/margins": 0.2472866028547287, "rewards/rejected": -0.22463028132915497, "step": 22 }, { "epoch": 0.32, "grad_norm": 38.94504011639214, "learning_rate": 4.3712768704277524e-07, "logps/chosen": -43.17596435546875, "logps/rejected": -35.83791732788086, "loss": 0.5549, "losses/dpo": 0.6368575692176819, "losses/sft": 0.20419813692569733, "losses/total": 0.6368575692176819, "ref_logps/chosen": -43.439910888671875, "ref_logps/rejected": -32.738441467285156, "rewards/accuracies": 0.8828125, "rewards/chosen": 0.026394736021757126, "rewards/margins": 0.3363422751426697, "rewards/rejected": -0.30994755029678345, "step": 23 }, { "epoch": 0.33, "grad_norm": 28.33928817647071, "learning_rate": 4.2889974018603024e-07, "logps/chosen": -48.73534393310547, "logps/rejected": -40.98769760131836, "loss": 0.5358, "losses/dpo": 0.6388107538223267, "losses/sft": 0.21662825345993042, "losses/total": 0.6388107538223267, "ref_logps/chosen": -48.840187072753906, "ref_logps/rejected": -37.24340057373047, "rewards/accuracies": 0.890625, "rewards/chosen": 0.010484418831765652, "rewards/margins": 0.38491398096084595, "rewards/rejected": -0.3744295537471771, "step": 24 }, { "epoch": 0.34, "grad_norm": 31.571769897086057, "learning_rate": 4.2025396469669926e-07, "logps/chosen": -49.65196228027344, "logps/rejected": -39.15043258666992, "loss": 0.5317, "losses/dpo": 0.4821869134902954, "losses/sft": 0.2129327803850174, "losses/total": 0.4821869134902954, "ref_logps/chosen": -49.09580993652344, "ref_logps/rejected": -34.47374725341797, "rewards/accuracies": 0.8828125, "rewards/chosen": -0.05561504885554314, "rewards/margins": 0.41205331683158875, "rewards/rejected": -0.467668354511261, "step": 25 }, { "epoch": 0.36, "grad_norm": 20.54896163205101, "learning_rate": 4.112105531840426e-07, "logps/chosen": -50.22370529174805, "logps/rejected": -38.49211120605469, "loss": 0.5133, "losses/dpo": 0.6953214406967163, "losses/sft": 0.1770307421684265, "losses/total": 0.6953214406967163, "ref_logps/chosen": -49.23892593383789, "ref_logps/rejected": -32.732269287109375, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.09847792983055115, "rewards/margins": 0.4775061011314392, "rewards/rejected": -0.575984001159668, "step": 26 }, { "epoch": 0.37, "grad_norm": 24.210290197713302, "learning_rate": 4.017906269546778e-07, "logps/chosen": -48.78424072265625, "logps/rejected": -39.4119758605957, "loss": 0.5025, "losses/dpo": 0.2536649703979492, "losses/sft": 0.17507979273796082, "losses/total": 0.2536649703979492, "ref_logps/chosen": -47.147621154785156, "ref_logps/rejected": -32.35851287841797, "rewards/accuracies": 0.84375, "rewards/chosen": -0.16366226971149445, "rewards/margins": 0.5416839718818665, "rewards/rejected": -0.7053462266921997, "step": 27 }, { "epoch": 0.38, "grad_norm": 25.054325101536794, "learning_rate": 3.920161866827889e-07, "logps/chosen": -46.48284912109375, "logps/rejected": -40.55732727050781, "loss": 0.5225, "losses/dpo": 0.6159500479698181, "losses/sft": 0.18471354246139526, "losses/total": 0.6159500479698181, "ref_logps/chosen": -44.64717102050781, "ref_logps/rejected": -34.08299255371094, "rewards/accuracies": 0.84375, "rewards/chosen": -0.18356791138648987, "rewards/margins": 0.46386560797691345, "rewards/rejected": -0.6474335193634033, "step": 28 }, { "epoch": 0.4, "grad_norm": 25.059885652690767, "learning_rate": 3.8191006102653317e-07, "logps/chosen": -50.65240478515625, "logps/rejected": -44.85976028442383, "loss": 0.4509, "losses/dpo": 0.5429763793945312, "losses/sft": 0.19810011982917786, "losses/total": 0.5429763793945312, "ref_logps/chosen": -47.85638427734375, "ref_logps/rejected": -35.169281005859375, "rewards/accuracies": 0.9140625, "rewards/chosen": -0.27960240840911865, "rewards/margins": 0.6894451975822449, "rewards/rejected": -0.9690475463867188, "step": 29 }, { "epoch": 0.41, "grad_norm": 19.99856582783424, "learning_rate": 3.7149585331065145e-07, "logps/chosen": -49.85383605957031, "logps/rejected": -45.81809997558594, "loss": 0.4332, "losses/dpo": 0.29431843757629395, "losses/sft": 0.18581561744213104, "losses/total": 0.29431843757629395, "ref_logps/chosen": -46.770938873291016, "ref_logps/rejected": -34.5809326171875, "rewards/accuracies": 0.8671875, "rewards/chosen": -0.3082895576953888, "rewards/margins": 0.8154268264770508, "rewards/rejected": -1.1237163543701172, "step": 30 }, { "epoch": 0.42, "grad_norm": 34.79633257386577, "learning_rate": 3.6079788639981036e-07, "logps/chosen": -52.836326599121094, "logps/rejected": -46.93244934082031, "loss": 0.4604, "losses/dpo": 0.8810983300209045, "losses/sft": 0.23828193545341492, "losses/total": 0.8810983300209045, "ref_logps/chosen": -49.11648178100586, "ref_logps/rejected": -36.381752014160156, "rewards/accuracies": 0.8984375, "rewards/chosen": -0.3719848394393921, "rewards/margins": 0.6830847263336182, "rewards/rejected": -1.0550695657730103, "step": 31 }, { "epoch": 0.44, "grad_norm": 23.026509844905394, "learning_rate": 3.498411458914238e-07, "logps/chosen": -50.38003921508789, "logps/rejected": -45.10429763793945, "loss": 0.4393, "losses/dpo": 0.15313033759593964, "losses/sft": 0.19763650000095367, "losses/total": 0.15313033759593964, "ref_logps/chosen": -46.028076171875, "ref_logps/rejected": -33.00657272338867, "rewards/accuracies": 0.875, "rewards/chosen": -0.4351964592933655, "rewards/margins": 0.7745760679244995, "rewards/rejected": -1.2097725868225098, "step": 32 }, { "epoch": 0.45, "grad_norm": 18.317574609447647, "learning_rate": 3.3865122176063385e-07, "logps/chosen": -51.4942512512207, "logps/rejected": -49.96583557128906, "loss": 0.4075, "losses/dpo": 0.1953999102115631, "losses/sft": 0.29790106415748596, "losses/total": 0.1953999102115631, "ref_logps/chosen": -45.6589469909668, "ref_logps/rejected": -34.858577728271484, "rewards/accuracies": 0.8515625, "rewards/chosen": -0.5835303068161011, "rewards/margins": 0.9271953105926514, "rewards/rejected": -1.510725736618042, "step": 33 }, { "epoch": 0.47, "grad_norm": 19.255871137244554, "learning_rate": 3.272542485937368e-07, "logps/chosen": -50.351234436035156, "logps/rejected": -48.89935302734375, "loss": 0.3959, "losses/dpo": 0.4281933605670929, "losses/sft": 0.19774244725704193, "losses/total": 0.4281933605670929, "ref_logps/chosen": -43.48761749267578, "ref_logps/rejected": -32.255577087402344, "rewards/accuracies": 0.859375, "rewards/chosen": -0.68636155128479, "rewards/margins": 0.9780160188674927, "rewards/rejected": -1.6643775701522827, "step": 34 }, { "epoch": 0.48, "grad_norm": 17.53385145494046, "learning_rate": 3.1567684454964674e-07, "logps/chosen": -49.46981430053711, "logps/rejected": -49.80710220336914, "loss": 0.4011, "losses/dpo": 0.5663512945175171, "losses/sft": 0.24904295802116394, "losses/total": 0.5663512945175171, "ref_logps/chosen": -42.88325500488281, "ref_logps/rejected": -33.13590621948242, "rewards/accuracies": 0.890625, "rewards/chosen": -0.6586559414863586, "rewards/margins": 1.0084636211395264, "rewards/rejected": -1.6671196222305298, "step": 35 }, { "epoch": 0.49, "grad_norm": 157.5390863725062, "learning_rate": 3.0394604919195157e-07, "logps/chosen": -50.14772415161133, "logps/rejected": -49.97753143310547, "loss": 0.4132, "losses/dpo": 0.6134005784988403, "losses/sft": 0.1941785216331482, "losses/total": 0.6134005784988403, "ref_logps/chosen": -42.886375427246094, "ref_logps/rejected": -32.889442443847656, "rewards/accuracies": 0.859375, "rewards/chosen": -0.7261347770690918, "rewards/margins": 0.9826743006706238, "rewards/rejected": -1.7088091373443604, "step": 36 }, { "epoch": 0.51, "grad_norm": 30.744138000924785, "learning_rate": 2.920892603367596e-07, "logps/chosen": -52.53690719604492, "logps/rejected": -51.7293701171875, "loss": 0.4345, "losses/dpo": 0.39982184767723083, "losses/sft": 0.16318069398403168, "losses/total": 0.39982184767723083, "ref_logps/chosen": -44.043270111083984, "ref_logps/rejected": -33.67184066772461, "rewards/accuracies": 0.8203125, "rewards/chosen": -0.8493636250495911, "rewards/margins": 0.956389307975769, "rewards/rejected": -1.8057528734207153, "step": 37 }, { "epoch": 0.52, "grad_norm": 18.608606064784283, "learning_rate": 2.801341700638307e-07, "logps/chosen": -54.247406005859375, "logps/rejected": -51.46720886230469, "loss": 0.4308, "losses/dpo": 0.7559365630149841, "losses/sft": 0.20898960530757904, "losses/total": 0.7559365630149841, "ref_logps/chosen": -47.05962371826172, "ref_logps/rejected": -34.95857238769531, "rewards/accuracies": 0.828125, "rewards/chosen": -0.7187784910202026, "rewards/margins": 0.9320851564407349, "rewards/rejected": -1.6508636474609375, "step": 38 }, { "epoch": 0.53, "grad_norm": 47.98397942977545, "learning_rate": 2.681087000404406e-07, "logps/chosen": -53.239768981933594, "logps/rejected": -52.34550476074219, "loss": 0.3907, "losses/dpo": 0.31572413444519043, "losses/sft": 0.18499067425727844, "losses/total": 0.31572413444519043, "ref_logps/chosen": -45.19135284423828, "ref_logps/rejected": -33.13307189941406, "rewards/accuracies": 0.90625, "rewards/chosen": -0.8048416972160339, "rewards/margins": 1.1164013147354126, "rewards/rejected": -1.9212429523468018, "step": 39 }, { "epoch": 0.55, "grad_norm": 21.523748609052035, "learning_rate": 2.5604093630903305e-07, "logps/chosen": -53.806236267089844, "logps/rejected": -54.13373565673828, "loss": 0.3678, "losses/dpo": 0.6854045391082764, "losses/sft": 0.21097487211227417, "losses/total": 0.6854045391082764, "ref_logps/chosen": -44.96014404296875, "ref_logps/rejected": -34.04387664794922, "rewards/accuracies": 0.890625, "rewards/chosen": -0.8846092224121094, "rewards/margins": 1.1243770122528076, "rewards/rejected": -2.008985996246338, "step": 40 }, { "epoch": 0.56, "grad_norm": 20.63046978113073, "learning_rate": 2.43959063690967e-07, "logps/chosen": -56.91130065917969, "logps/rejected": -54.714378356933594, "loss": 0.3872, "losses/dpo": 0.1204671785235405, "losses/sft": 0.17937365174293518, "losses/total": 0.1204671785235405, "ref_logps/chosen": -47.74310302734375, "ref_logps/rejected": -34.866615295410156, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9168204069137573, "rewards/margins": 1.0679559707641602, "rewards/rejected": -1.984776258468628, "step": 41 }, { "epoch": 0.58, "grad_norm": 27.841791874606287, "learning_rate": 2.3189129995955942e-07, "logps/chosen": -56.37548065185547, "logps/rejected": -55.140594482421875, "loss": 0.3703, "losses/dpo": 0.6694349646568298, "losses/sft": 0.15415219962596893, "losses/total": 0.6694349646568298, "ref_logps/chosen": -46.114707946777344, "ref_logps/rejected": -33.19464111328125, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.0260775089263916, "rewards/margins": 1.16851806640625, "rewards/rejected": -2.1945955753326416, "step": 42 }, { "epoch": 0.59, "grad_norm": 20.157417684445996, "learning_rate": 2.1986582993616925e-07, "logps/chosen": -55.861724853515625, "logps/rejected": -55.27591323852539, "loss": 0.4096, "losses/dpo": 0.253600537776947, "losses/sft": 0.25442296266555786, "losses/total": 0.253600537776947, "ref_logps/chosen": -46.024993896484375, "ref_logps/rejected": -34.88616180419922, "rewards/accuracies": 0.859375, "rewards/chosen": -0.9836731553077698, "rewards/margins": 1.0553019046783447, "rewards/rejected": -2.038975238800049, "step": 43 }, { "epoch": 0.6, "grad_norm": 22.91868411351925, "learning_rate": 2.0791073966324034e-07, "logps/chosen": -56.3699836730957, "logps/rejected": -58.20032501220703, "loss": 0.3645, "losses/dpo": 0.05803808197379112, "losses/sft": 0.16261443495750427, "losses/total": 0.05803808197379112, "ref_logps/chosen": -46.18814468383789, "ref_logps/rejected": -35.7181396484375, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.018183708190918, "rewards/margins": 1.230034351348877, "rewards/rejected": -2.248218059539795, "step": 44 }, { "epoch": 0.62, "grad_norm": 24.665726952614282, "learning_rate": 1.960539508080485e-07, "logps/chosen": -55.33811569213867, "logps/rejected": -56.2475700378418, "loss": 0.4363, "losses/dpo": 0.6756047606468201, "losses/sft": 0.1989610195159912, "losses/total": 0.6756047606468201, "ref_logps/chosen": -42.876373291015625, "ref_logps/rejected": -33.306602478027344, "rewards/accuracies": 0.8125, "rewards/chosen": -1.2461739778518677, "rewards/margins": 1.0479230880737305, "rewards/rejected": -2.2940969467163086, "step": 45 }, { "epoch": 0.63, "grad_norm": 35.04495782063734, "learning_rate": 1.8432315545035327e-07, "logps/chosen": -59.337791442871094, "logps/rejected": -60.82359313964844, "loss": 0.3701, "losses/dpo": 0.24237556755542755, "losses/sft": 0.14872561395168304, "losses/total": 0.24237556755542755, "ref_logps/chosen": -46.916419982910156, "ref_logps/rejected": -36.144935607910156, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.2421373128890991, "rewards/margins": 1.2257287502288818, "rewards/rejected": -2.4678661823272705, "step": 46 }, { "epoch": 0.64, "grad_norm": 18.874251761700755, "learning_rate": 1.7274575140626315e-07, "logps/chosen": -60.359886169433594, "logps/rejected": -56.043479919433594, "loss": 0.3903, "losses/dpo": 0.6876823902130127, "losses/sft": 0.163571298122406, "losses/total": 0.6876823902130127, "ref_logps/chosen": -49.23930358886719, "ref_logps/rejected": -34.02153778076172, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.1120576858520508, "rewards/margins": 1.0901365280151367, "rewards/rejected": -2.2021942138671875, "step": 47 }, { "epoch": 0.66, "grad_norm": 29.114539057876968, "learning_rate": 1.6134877823936607e-07, "logps/chosen": -60.98393249511719, "logps/rejected": -58.489444732666016, "loss": 0.4011, "losses/dpo": 0.03265048563480377, "losses/sft": 0.14689283072948456, "losses/total": 0.03265048563480377, "ref_logps/chosen": -49.34606170654297, "ref_logps/rejected": -36.67803955078125, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.1637871265411377, "rewards/margins": 1.0173530578613281, "rewards/rejected": -2.181140184402466, "step": 48 }, { "epoch": 0.67, "grad_norm": 21.107662898541907, "learning_rate": 1.5015885410857614e-07, "logps/chosen": -60.81307601928711, "logps/rejected": -59.90397262573242, "loss": 0.3897, "losses/dpo": 0.33075177669525146, "losses/sft": 0.214824840426445, "losses/total": 0.33075177669525146, "ref_logps/chosen": -46.25496292114258, "ref_logps/rejected": -33.91436004638672, "rewards/accuracies": 0.859375, "rewards/chosen": -1.4558112621307373, "rewards/margins": 1.143149971961975, "rewards/rejected": -2.598961114883423, "step": 49 }, { "epoch": 0.68, "grad_norm": 26.95108201172052, "learning_rate": 1.392021136001897e-07, "logps/chosen": -56.23418426513672, "logps/rejected": -56.328125, "loss": 0.3964, "losses/dpo": 0.03794693946838379, "losses/sft": 0.19881302118301392, "losses/total": 0.03794693946838379, "ref_logps/chosen": -42.96794891357422, "ref_logps/rejected": -32.164451599121094, "rewards/accuracies": 0.875, "rewards/chosen": -1.3266233205795288, "rewards/margins": 1.089743971824646, "rewards/rejected": -2.416367530822754, "step": 50 }, { "epoch": 0.7, "grad_norm": 33.76828619344551, "learning_rate": 1.2850414668934847e-07, "logps/chosen": -61.50416946411133, "logps/rejected": -59.79325485229492, "loss": 0.3827, "losses/dpo": 0.5413109660148621, "losses/sft": 0.30467280745506287, "losses/total": 0.5413109660148621, "ref_logps/chosen": -48.96829605102539, "ref_logps/rejected": -35.99717330932617, "rewards/accuracies": 0.9375, "rewards/chosen": -1.2535876035690308, "rewards/margins": 1.1260210275650024, "rewards/rejected": -2.379608631134033, "step": 51 }, { "epoch": 0.71, "grad_norm": 16.559964106722745, "learning_rate": 1.1808993897346678e-07, "logps/chosen": -58.611270904541016, "logps/rejected": -58.919395446777344, "loss": 0.3796, "losses/dpo": 0.3290981352329254, "losses/sft": 0.19547075033187866, "losses/total": 0.3290981352329254, "ref_logps/chosen": -46.96087646484375, "ref_logps/rejected": -36.086090087890625, "rewards/accuracies": 0.90625, "rewards/chosen": -1.1650400161743164, "rewards/margins": 1.1182900667190552, "rewards/rejected": -2.283329963684082, "step": 52 }, { "epoch": 0.73, "grad_norm": 25.26391431571928, "learning_rate": 1.0798381331721107e-07, "logps/chosen": -58.2769775390625, "logps/rejected": -57.12656021118164, "loss": 0.3707, "losses/dpo": 0.3912191092967987, "losses/sft": 0.20826196670532227, "losses/total": 0.3912191092967987, "ref_logps/chosen": -46.01140213012695, "ref_logps/rejected": -32.54326629638672, "rewards/accuracies": 0.859375, "rewards/chosen": -1.226557731628418, "rewards/margins": 1.2317723035812378, "rewards/rejected": -2.4583301544189453, "step": 53 }, { "epoch": 0.74, "grad_norm": 18.669814077600197, "learning_rate": 9.82093730453222e-08, "logps/chosen": -57.36506271362305, "logps/rejected": -57.83528137207031, "loss": 0.4249, "losses/dpo": 0.28024712204933167, "losses/sft": 0.21661897003650665, "losses/total": 0.28024712204933167, "ref_logps/chosen": -44.405941009521484, "ref_logps/rejected": -34.53661346435547, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.295912265777588, "rewards/margins": 1.0339548587799072, "rewards/rejected": -2.329867124557495, "step": 54 }, { "epoch": 0.75, "grad_norm": 17.65819121351904, "learning_rate": 8.87894468159574e-08, "logps/chosen": -60.354469299316406, "logps/rejected": -60.50645065307617, "loss": 0.3985, "losses/dpo": 0.9817911386489868, "losses/sft": 0.1904633343219757, "losses/total": 0.9817911386489868, "ref_logps/chosen": -46.499290466308594, "ref_logps/rejected": -34.763404846191406, "rewards/accuracies": 0.8359375, "rewards/chosen": -1.3855178356170654, "rewards/margins": 1.1887872219085693, "rewards/rejected": -2.5743050575256348, "step": 55 }, { "epoch": 0.77, "grad_norm": 23.90292670438398, "learning_rate": 7.974603530330067e-08, "logps/chosen": -55.58333206176758, "logps/rejected": -55.52084732055664, "loss": 0.3777, "losses/dpo": 0.04075286537408829, "losses/sft": 0.22049269080162048, "losses/total": 0.04075286537408829, "ref_logps/chosen": -43.25560760498047, "ref_logps/rejected": -31.006759643554688, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.2327725887298584, "rewards/margins": 1.2186365127563477, "rewards/rejected": -2.451408863067627, "step": 56 }, { "epoch": 0.78, "grad_norm": 28.08593658686289, "learning_rate": 7.110025981396975e-08, "logps/chosen": -58.75514221191406, "logps/rejected": -58.784584045410156, "loss": 0.4449, "losses/dpo": 0.4793856143951416, "losses/sft": 0.20940393209457397, "losses/total": 0.4793856143951416, "ref_logps/chosen": -45.29600524902344, "ref_logps/rejected": -34.97162628173828, "rewards/accuracies": 0.8046875, "rewards/chosen": -1.3459134101867676, "rewards/margins": 1.0353822708129883, "rewards/rejected": -2.381295680999756, "step": 57 }, { "epoch": 0.79, "grad_norm": 24.077339089176505, "learning_rate": 6.28723129572247e-08, "logps/chosen": -55.75697326660156, "logps/rejected": -56.72669219970703, "loss": 0.3567, "losses/dpo": 0.21238191425800323, "losses/sft": 0.1661817878484726, "losses/total": 0.21238191425800323, "ref_logps/chosen": -44.3855094909668, "ref_logps/rejected": -32.21479797363281, "rewards/accuracies": 0.890625, "rewards/chosen": -1.137147068977356, "rewards/margins": 1.314042568206787, "rewards/rejected": -2.4511895179748535, "step": 58 }, { "epoch": 0.81, "grad_norm": 43.46612828134844, "learning_rate": 5.508141148419443e-08, "logps/chosen": -61.76049041748047, "logps/rejected": -62.041648864746094, "loss": 0.3688, "losses/dpo": 0.27996987104415894, "losses/sft": 0.1737639456987381, "losses/total": 0.27996987104415894, "ref_logps/chosen": -49.25553894042969, "ref_logps/rejected": -36.210182189941406, "rewards/accuracies": 0.84375, "rewards/chosen": -1.250495195388794, "rewards/margins": 1.3326513767242432, "rewards/rejected": -2.583146572113037, "step": 59 }, { "epoch": 0.82, "grad_norm": 22.779198271573037, "learning_rate": 4.774575140626316e-08, "logps/chosen": -55.46681594848633, "logps/rejected": -57.17453384399414, "loss": 0.3531, "losses/dpo": 0.046613942831754684, "losses/sft": 0.20427729189395905, "losses/total": 0.046613942831754684, "ref_logps/chosen": -42.29081726074219, "ref_logps/rejected": -30.75497817993164, "rewards/accuracies": 0.8984375, "rewards/chosen": -1.3175995349884033, "rewards/margins": 1.3243558406829834, "rewards/rejected": -2.6419553756713867, "step": 60 }, { "epoch": 0.84, "grad_norm": 20.59368424342303, "learning_rate": 4.0882465497443313e-08, "logps/chosen": -58.52223587036133, "logps/rejected": -56.04042053222656, "loss": 0.3923, "losses/dpo": 0.26003214716911316, "losses/sft": 0.17392012476921082, "losses/total": 0.26003214716911316, "ref_logps/chosen": -48.404632568359375, "ref_logps/rejected": -34.86602783203125, "rewards/accuracies": 0.890625, "rewards/chosen": -1.0117601156234741, "rewards/margins": 1.1056792736053467, "rewards/rejected": -2.1174392700195312, "step": 61 }, { "epoch": 0.85, "grad_norm": 23.660376428219948, "learning_rate": 3.450758327998768e-08, "logps/chosen": -60.401039123535156, "logps/rejected": -60.10982131958008, "loss": 0.3902, "losses/dpo": 0.01773645170032978, "losses/sft": 0.17717282474040985, "losses/total": 0.01773645170032978, "ref_logps/chosen": -48.241943359375, "ref_logps/rejected": -34.582366943359375, "rewards/accuracies": 0.890625, "rewards/chosen": -1.215909719467163, "rewards/margins": 1.3368357419967651, "rewards/rejected": -2.5527453422546387, "step": 62 }, { "epoch": 0.86, "grad_norm": 86.96881294099092, "learning_rate": 2.863599358669755e-08, "logps/chosen": -56.905418395996094, "logps/rejected": -56.808746337890625, "loss": 0.3944, "losses/dpo": 0.15065120160579681, "losses/sft": 0.22477349638938904, "losses/total": 0.15065120160579681, "ref_logps/chosen": -44.15583038330078, "ref_logps/rejected": -33.21840286254883, "rewards/accuracies": 0.828125, "rewards/chosen": -1.2749593257904053, "rewards/margins": 1.0840749740600586, "rewards/rejected": -2.359034299850464, "step": 63 }, { "epoch": 0.88, "grad_norm": 18.8337077576639, "learning_rate": 2.3281409787363648e-08, "logps/chosen": -57.604774475097656, "logps/rejected": -57.78453063964844, "loss": 0.3863, "losses/dpo": 0.41682732105255127, "losses/sft": 0.16616390645503998, "losses/total": 0.41682732105255127, "ref_logps/chosen": -43.315818786621094, "ref_logps/rejected": -31.524248123168945, "rewards/accuracies": 0.8671875, "rewards/chosen": -1.4288955926895142, "rewards/margins": 1.1971325874328613, "rewards/rejected": -2.626028537750244, "step": 64 }, { "epoch": 0.89, "grad_norm": 374.1054719017444, "learning_rate": 1.845633776055591e-08, "logps/chosen": -57.63691711425781, "logps/rejected": -58.3455810546875, "loss": 0.3882, "losses/dpo": 0.26508828997612, "losses/sft": 0.2718198001384735, "losses/total": 0.26508828997612, "ref_logps/chosen": -44.429481506347656, "ref_logps/rejected": -33.13744354248047, "rewards/accuracies": 0.875, "rewards/chosen": -1.3207435607910156, "rewards/margins": 1.2000699043273926, "rewards/rejected": -2.520813465118408, "step": 65 }, { "epoch": 0.9, "grad_norm": 26.70970124014032, "learning_rate": 1.4172046685564209e-08, "logps/chosen": -58.663551330566406, "logps/rejected": -58.07282257080078, "loss": 0.3962, "losses/dpo": 0.08177483081817627, "losses/sft": 0.18531636893749237, "losses/total": 0.08177483081817627, "ref_logps/chosen": -45.821983337402344, "ref_logps/rejected": -33.62261199951172, "rewards/accuracies": 0.875, "rewards/chosen": -1.2841567993164062, "rewards/margins": 1.1608643531799316, "rewards/rejected": -2.445021390914917, "step": 66 }, { "epoch": 0.92, "grad_norm": 25.593261462625442, "learning_rate": 1.0438542722708444e-08, "logps/chosen": -59.08097839355469, "logps/rejected": -59.16502380371094, "loss": 0.3836, "losses/dpo": 0.02788337506353855, "losses/sft": 0.19819076359272003, "losses/total": 0.02788337506353855, "ref_logps/chosen": -45.94892883300781, "ref_logps/rejected": -33.597511291503906, "rewards/accuracies": 0.8828125, "rewards/chosen": -1.3132052421569824, "rewards/margins": 1.2435462474822998, "rewards/rejected": -2.556751251220703, "step": 67 }, { "epoch": 0.93, "grad_norm": 25.28796063034412, "learning_rate": 7.2645456434869965e-09, "logps/chosen": -57.95222473144531, "logps/rejected": -58.91720199584961, "loss": 0.3915, "losses/dpo": 1.2907841205596924, "losses/sft": 0.20458956062793732, "losses/total": 1.2907841205596924, "ref_logps/chosen": -45.50114440917969, "ref_logps/rejected": -35.063446044921875, "rewards/accuracies": 0.890625, "rewards/chosen": -1.2451080083847046, "rewards/margins": 1.140267252922058, "rewards/rejected": -2.385375499725342, "step": 68 }, { "epoch": 0.95, "grad_norm": 30.554099185463503, "learning_rate": 4.657468465146641e-09, "logps/chosen": -57.99516296386719, "logps/rejected": -55.496768951416016, "loss": 0.3752, "losses/dpo": 0.20264464616775513, "losses/sft": 0.17493540048599243, "losses/total": 0.20264464616775513, "ref_logps/chosen": -47.58026123046875, "ref_logps/rejected": -33.345062255859375, "rewards/accuracies": 0.890625, "rewards/chosen": -1.041489839553833, "rewards/margins": 1.1736811399459839, "rewards/rejected": -2.2151710987091064, "step": 69 }, { "epoch": 0.96, "grad_norm": 21.555895701368716, "learning_rate": 2.6234001372372193e-09, "logps/chosen": -55.79784393310547, "logps/rejected": -54.85697555541992, "loss": 0.4513, "losses/dpo": 0.6288288235664368, "losses/sft": 0.25858786702156067, "losses/total": 0.6288288235664368, "ref_logps/chosen": -42.008121490478516, "ref_logps/rejected": -31.47281265258789, "rewards/accuracies": 0.828125, "rewards/chosen": -1.3789721727371216, "rewards/margins": 0.9594441056251526, "rewards/rejected": -2.338416337966919, "step": 70 }, { "epoch": 0.97, "grad_norm": 21.73384383499147, "learning_rate": 1.167091320587843e-09, "logps/chosen": -56.99696350097656, "logps/rejected": -59.2013053894043, "loss": 0.3554, "losses/dpo": 0.09169570356607437, "losses/sft": 0.20991858839988708, "losses/total": 0.09169570356607437, "ref_logps/chosen": -42.36278533935547, "ref_logps/rejected": -31.79424476623535, "rewards/accuracies": 0.890625, "rewards/chosen": -1.463417887687683, "rewards/margins": 1.2772881984710693, "rewards/rejected": -2.740705966949463, "step": 71 }, { "epoch": 0.99, "grad_norm": 30.958564799186906, "learning_rate": 2.9194329191833953e-10, "logps/chosen": -58.35291290283203, "logps/rejected": -56.74859619140625, "loss": 0.3706, "losses/dpo": 0.3077165484428406, "losses/sft": 0.17356029152870178, "losses/total": 0.3077165484428406, "ref_logps/chosen": -44.90869903564453, "ref_logps/rejected": -31.324697494506836, "rewards/accuracies": 0.890625, "rewards/chosen": -1.34442138671875, "rewards/margins": 1.197968602180481, "rewards/rejected": -2.5423898696899414, "step": 72 }, { "epoch": 1.0, "grad_norm": 20.514487251091158, "learning_rate": 0.0, "logps/chosen": -55.3281135559082, "logps/rejected": -54.42873764038086, "loss": 0.4185, "losses/dpo": 0.45331382751464844, "losses/sft": 0.16170088946819305, "losses/total": 0.45331382751464844, "ref_logps/chosen": -42.832916259765625, "ref_logps/rejected": -31.545093536376953, "rewards/accuracies": 0.875, "rewards/chosen": -1.2495195865631104, "rewards/margins": 1.0388449430465698, "rewards/rejected": -2.2883644104003906, "step": 73 }, { "epoch": 1.0, "step": 73, "total_flos": 0.0, "train_loss": 0.4880054197082781, "train_runtime": 1195.1879, "train_samples_per_second": 7.883, "train_steps_per_second": 0.061 } ], "logging_steps": 1.0, "max_steps": 73, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }