{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.985781990521327, "eval_steps": 50, "global_step": 315, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0947867298578199, "grad_norm": 56.951628924108704, "learning_rate": 1.5624999999999999e-07, "logits/chosen": -2.8022689819335938, "logits/rejected": -2.699367046356201, "logps/chosen": -354.14007568359375, "logps/rejected": -648.7852783203125, "loss": 0.6846, "rewards/accuracies": 0.637499988079071, "rewards/chosen": 0.0030409712344408035, "rewards/margins": 0.015484926290810108, "rewards/rejected": -0.01244395412504673, "step": 10 }, { "epoch": 0.1895734597156398, "grad_norm": 16.911922497415656, "learning_rate": 3.1249999999999997e-07, "logits/chosen": -2.8449482917785645, "logits/rejected": -2.7297720909118652, "logps/chosen": -361.7726135253906, "logps/rejected": -731.9713134765625, "loss": 0.4488, "rewards/accuracies": 1.0, "rewards/chosen": 0.11448182910680771, "rewards/margins": 0.7460837364196777, "rewards/rejected": -0.6316019892692566, "step": 20 }, { "epoch": 0.2843601895734597, "grad_norm": 2.8879981399804886, "learning_rate": 4.6874999999999996e-07, "logits/chosen": -2.924880027770996, "logits/rejected": -2.7608063220977783, "logps/chosen": -344.0640869140625, "logps/rejected": -1062.529541015625, "loss": 0.1128, "rewards/accuracies": 1.0, "rewards/chosen": 0.4154191017150879, "rewards/margins": 4.946678638458252, "rewards/rejected": -4.531259536743164, "step": 30 }, { "epoch": 0.3791469194312796, "grad_norm": 0.7967945507055681, "learning_rate": 4.990147841143461e-07, "logits/chosen": -2.9928297996520996, "logits/rejected": -2.858860969543457, "logps/chosen": -369.7523193359375, "logps/rejected": -2523.788818359375, "loss": 0.0145, "rewards/accuracies": 1.0, "rewards/chosen": -0.17358417809009552, "rewards/margins": 18.882659912109375, "rewards/rejected": -19.056243896484375, "step": 40 }, { "epoch": 0.47393364928909953, "grad_norm": 0.08242657747458541, "learning_rate": 4.950256493879794e-07, "logits/chosen": -3.1458115577697754, "logits/rejected": -3.068504810333252, "logps/chosen": -445.88641357421875, "logps/rejected": -3839.385498046875, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -0.8680551648139954, "rewards/margins": 31.354045867919922, "rewards/rejected": -32.22209930419922, "step": 50 }, { "epoch": 0.47393364928909953, "eval_logits/chosen": -3.214230537414551, "eval_logits/rejected": -3.0434162616729736, "eval_logps/chosen": -511.5262451171875, "eval_logps/rejected": -4356.53564453125, "eval_loss": 0.006651720497757196, "eval_rewards/accuracies": 0.9939516186714172, "eval_rewards/chosen": -1.4454454183578491, "eval_rewards/margins": 35.71202850341797, "eval_rewards/rejected": -37.157470703125, "eval_runtime": 194.5294, "eval_samples_per_second": 20.074, "eval_steps_per_second": 0.319, "step": 50 }, { "epoch": 0.5687203791469194, "grad_norm": 0.08904936739654302, "learning_rate": 4.88020090697132e-07, "logits/chosen": -3.2791202068328857, "logits/rejected": -3.141754150390625, "logps/chosen": -564.9468383789062, "logps/rejected": -4684.3271484375, "loss": 0.004, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9347045421600342, "rewards/margins": 38.56499099731445, "rewards/rejected": -40.49969482421875, "step": 60 }, { "epoch": 0.6635071090047393, "grad_norm": 1.511268095124282, "learning_rate": 4.780843509929904e-07, "logits/chosen": -3.2914862632751465, "logits/rejected": -3.0883309841156006, "logps/chosen": -603.4210205078125, "logps/rejected": -4877.28662109375, "loss": 0.0021, "rewards/accuracies": 1.0, "rewards/chosen": -2.6363024711608887, "rewards/margins": 39.97002410888672, "rewards/rejected": -42.606327056884766, "step": 70 }, { "epoch": 0.7582938388625592, "grad_norm": 0.22202350824430725, "learning_rate": 4.6534074564712217e-07, "logits/chosen": -3.417383909225464, "logits/rejected": -3.290362596511841, "logps/chosen": -600.4118041992188, "logps/rejected": -5436.11376953125, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -2.5593833923339844, "rewards/margins": 45.55999755859375, "rewards/rejected": -48.11937713623047, "step": 80 }, { "epoch": 0.8530805687203792, "grad_norm": 2.0861019684034874, "learning_rate": 4.4994615667026846e-07, "logits/chosen": -3.4805240631103516, "logits/rejected": -3.3906772136688232, "logps/chosen": -624.0176391601562, "logps/rejected": -5296.82275390625, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.676025867462158, "rewards/margins": 44.0660285949707, "rewards/rejected": -46.7420539855957, "step": 90 }, { "epoch": 0.9478672985781991, "grad_norm": 2.8965011668216905, "learning_rate": 4.320901013934887e-07, "logits/chosen": -3.4210407733917236, "logits/rejected": -3.3643829822540283, "logps/chosen": -556.0076904296875, "logps/rejected": -4813.1806640625, "loss": 0.0036, "rewards/accuracies": 1.0, "rewards/chosen": -2.0869507789611816, "rewards/margins": 39.87181854248047, "rewards/rejected": -41.95877456665039, "step": 100 }, { "epoch": 0.9478672985781991, "eval_logits/chosen": -3.4104061126708984, "eval_logits/rejected": -3.2429261207580566, "eval_logps/chosen": -570.0164184570312, "eval_logps/rejected": -4765.2841796875, "eval_loss": 0.0052900416776537895, "eval_rewards/accuracies": 0.9939516186714172, "eval_rewards/chosen": -2.0303473472595215, "eval_rewards/margins": 39.21460723876953, "eval_rewards/rejected": -41.24495315551758, "eval_runtime": 192.2337, "eval_samples_per_second": 20.314, "eval_steps_per_second": 0.323, "step": 100 }, { "epoch": 1.042654028436019, "grad_norm": 1.2489542878599509, "learning_rate": 4.119923993874379e-07, "logits/chosen": -3.4639148712158203, "logits/rejected": -3.4126315116882324, "logps/chosen": -549.92138671875, "logps/rejected": -5150.29638671875, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.9557552337646484, "rewards/margins": 43.08815002441406, "rewards/rejected": -45.04390335083008, "step": 110 }, { "epoch": 1.1374407582938388, "grad_norm": 0.919711694376481, "learning_rate": 3.899004663415083e-07, "logits/chosen": -3.455725908279419, "logits/rejected": -3.3397490978240967, "logps/chosen": -534.6444702148438, "logps/rejected": -5193.822265625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.8104517459869385, "rewards/margins": 43.72606658935547, "rewards/rejected": -45.53651809692383, "step": 120 }, { "epoch": 1.2322274881516588, "grad_norm": 0.03772744312797018, "learning_rate": 3.6608626821692824e-07, "logits/chosen": -3.503054141998291, "logits/rejected": -3.4913394451141357, "logps/chosen": -509.2953186035156, "logps/rejected": -5831.84228515625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6301825046539307, "rewards/margins": 49.84960174560547, "rewards/rejected": -51.47977828979492, "step": 130 }, { "epoch": 1.3270142180094786, "grad_norm": 0.00011722006953608906, "learning_rate": 3.408429731701635e-07, "logits/chosen": -3.636444091796875, "logits/rejected": -3.614245891571045, "logps/chosen": -664.00341796875, "logps/rejected": -5503.0537109375, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -3.150538682937622, "rewards/margins": 45.41934585571289, "rewards/rejected": -48.56988525390625, "step": 140 }, { "epoch": 1.4218009478672986, "grad_norm": 0.0013414969188062405, "learning_rate": 3.144813424636031e-07, "logits/chosen": -3.788306713104248, "logits/rejected": -3.686079740524292, "logps/chosen": -791.1682739257812, "logps/rejected": -5721.5634765625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -4.1505842208862305, "rewards/margins": 46.614662170410156, "rewards/rejected": -50.7652473449707, "step": 150 }, { "epoch": 1.4218009478672986, "eval_logits/chosen": -3.5867350101470947, "eval_logits/rejected": -3.5067942142486572, "eval_logps/chosen": -561.57568359375, "eval_logps/rejected": -5161.087890625, "eval_loss": 0.006992733106017113, "eval_rewards/accuracies": 0.9939516186714172, "eval_rewards/chosen": -1.9459394216537476, "eval_rewards/margins": 43.25704574584961, "eval_rewards/rejected": -45.2029914855957, "eval_runtime": 191.7726, "eval_samples_per_second": 20.363, "eval_steps_per_second": 0.323, "step": 150 }, { "epoch": 1.5165876777251186, "grad_norm": 0.0004138099071654368, "learning_rate": 2.8732590479375165e-07, "logits/chosen": -3.556847333908081, "logits/rejected": -3.5835862159729004, "logps/chosen": -528.8604736328125, "logps/rejected": -5157.8740234375, "loss": 0.0003, "rewards/accuracies": 1.0, "rewards/chosen": -1.7568155527114868, "rewards/margins": 43.957759857177734, "rewards/rejected": -45.714576721191406, "step": 160 }, { "epoch": 1.6113744075829384, "grad_norm": 0.0016286137021698196, "learning_rate": 2.597109611334169e-07, "logits/chosen": -3.579390287399292, "logits/rejected": -3.6478075981140137, "logps/chosen": -520.5675048828125, "logps/rejected": -5432.5673828125, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.6220014095306396, "rewards/margins": 46.55379867553711, "rewards/rejected": -48.17579650878906, "step": 170 }, { "epoch": 1.7061611374407581, "grad_norm": 0.00799320909391895, "learning_rate": 2.3197646927086694e-07, "logits/chosen": -3.5350117683410645, "logits/rejected": -3.6110050678253174, "logps/chosen": -534.5997314453125, "logps/rejected": -5420.73583984375, "loss": 0.0075, "rewards/accuracies": 1.0, "rewards/chosen": -1.6480720043182373, "rewards/margins": 46.55036163330078, "rewards/rejected": -48.19843292236328, "step": 180 }, { "epoch": 1.8009478672985781, "grad_norm": 0.0014081828819370304, "learning_rate": 2.0446385870993467e-07, "logits/chosen": -3.5267558097839355, "logits/rejected": -3.5355076789855957, "logps/chosen": -524.6720581054688, "logps/rejected": -5069.0888671875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4387648105621338, "rewards/margins": 43.29344177246094, "rewards/rejected": -44.73220443725586, "step": 190 }, { "epoch": 1.8957345971563981, "grad_norm": 0.018000801767423476, "learning_rate": 1.775118274523545e-07, "logits/chosen": -3.5183377265930176, "logits/rejected": -3.5119102001190186, "logps/chosen": -486.629150390625, "logps/rejected": -5021.52490234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3421844244003296, "rewards/margins": 42.632965087890625, "rewards/rejected": -43.97514724731445, "step": 200 }, { "epoch": 1.8957345971563981, "eval_logits/chosen": -3.502014398574829, "eval_logits/rejected": -3.422856092453003, "eval_logps/chosen": -512.3704223632812, "eval_logps/rejected": -5067.64501953125, "eval_loss": 0.004733214620500803, "eval_rewards/accuracies": 0.9959677457809448, "eval_rewards/chosen": -1.4538869857788086, "eval_rewards/margins": 42.814674377441406, "eval_rewards/rejected": -44.26856231689453, "eval_runtime": 194.1121, "eval_samples_per_second": 20.117, "eval_steps_per_second": 0.319, "step": 200 }, { "epoch": 1.9905213270142181, "grad_norm": 0.9312964869423628, "learning_rate": 1.514521724066537e-07, "logits/chosen": -3.540240526199341, "logits/rejected": -3.5632777214050293, "logps/chosen": -531.4307861328125, "logps/rejected": -5061.63818359375, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.486299753189087, "rewards/margins": 42.955726623535156, "rewards/rejected": -44.44202423095703, "step": 210 }, { "epoch": 2.085308056872038, "grad_norm": 0.029566978048640967, "learning_rate": 1.266057047539568e-07, "logits/chosen": -3.5052971839904785, "logits/rejected": -3.5332977771759033, "logps/chosen": -477.3848571777344, "logps/rejected": -5269.00390625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4086942672729492, "rewards/margins": 45.027523040771484, "rewards/rejected": -46.43621826171875, "step": 220 }, { "epoch": 2.1800947867298577, "grad_norm": 0.0005556188331340245, "learning_rate": 1.032783005551884e-07, "logits/chosen": -3.5509438514709473, "logits/rejected": -3.5611331462860107, "logps/chosen": -473.364501953125, "logps/rejected": -4865.369140625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.312354326248169, "rewards/margins": 41.259403228759766, "rewards/rejected": -42.57175827026367, "step": 230 }, { "epoch": 2.2748815165876777, "grad_norm": 0.005629678669869344, "learning_rate": 8.175713521924976e-08, "logits/chosen": -3.5678086280822754, "logits/rejected": -3.5121123790740967, "logps/chosen": -496.83258056640625, "logps/rejected": -5081.9599609375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4451147317886353, "rewards/margins": 43.37391662597656, "rewards/rejected": -44.81903839111328, "step": 240 }, { "epoch": 2.3696682464454977, "grad_norm": 0.0012113886351427462, "learning_rate": 6.230714818829733e-08, "logits/chosen": -3.530911922454834, "logits/rejected": -3.5102057456970215, "logps/chosen": -484.5502014160156, "logps/rejected": -5412.3271484375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.327695608139038, "rewards/margins": 46.98969268798828, "rewards/rejected": -48.31739044189453, "step": 250 }, { "epoch": 2.3696682464454977, "eval_logits/chosen": -3.5510308742523193, "eval_logits/rejected": -3.444518566131592, "eval_logps/chosen": -512.2269287109375, "eval_logps/rejected": -5116.15771484375, "eval_loss": 0.005008448380976915, "eval_rewards/accuracies": 0.9959677457809448, "eval_rewards/chosen": -1.4524519443511963, "eval_rewards/margins": 43.301239013671875, "eval_rewards/rejected": -44.753692626953125, "eval_runtime": 192.1218, "eval_samples_per_second": 20.326, "eval_steps_per_second": 0.323, "step": 250 }, { "epoch": 2.4644549763033177, "grad_norm": 0.005272804838769864, "learning_rate": 4.516778136213037e-08, "logits/chosen": -3.5464816093444824, "logits/rejected": -3.532754898071289, "logps/chosen": -474.98077392578125, "logps/rejected": -5214.1748046875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3641650676727295, "rewards/margins": 44.85725021362305, "rewards/rejected": -46.22141647338867, "step": 260 }, { "epoch": 2.5592417061611377, "grad_norm": 0.01760309981671165, "learning_rate": 3.055003141378948e-08, "logits/chosen": -3.5305237770080566, "logits/rejected": -3.543522357940674, "logps/chosen": -502.1796875, "logps/rejected": -5842.8251953125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.2657973766326904, "rewards/margins": 50.87003707885742, "rewards/rejected": -52.135841369628906, "step": 270 }, { "epoch": 2.654028436018957, "grad_norm": 0.01345213655983596, "learning_rate": 1.8633852284264508e-08, "logits/chosen": -3.5437607765197754, "logits/rejected": -3.537663221359253, "logps/chosen": -519.03759765625, "logps/rejected": -5507.5615234375, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3319001197814941, "rewards/margins": 47.102291107177734, "rewards/rejected": -48.4341926574707, "step": 280 }, { "epoch": 2.748815165876777, "grad_norm": 5.9654408780918595e-05, "learning_rate": 9.56593983327919e-09, "logits/chosen": -3.5722999572753906, "logits/rejected": -3.5434532165527344, "logps/chosen": -525.2794189453125, "logps/rejected": -5359.7451171875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4624111652374268, "rewards/margins": 46.21337890625, "rewards/rejected": -47.675785064697266, "step": 290 }, { "epoch": 2.843601895734597, "grad_norm": 0.0012624104591569302, "learning_rate": 3.4579259185321398e-09, "logits/chosen": -3.5550761222839355, "logits/rejected": -3.541923999786377, "logps/chosen": -513.0765380859375, "logps/rejected": -5235.28759765625, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.3970950841903687, "rewards/margins": 45.181175231933594, "rewards/rejected": -46.578269958496094, "step": 300 }, { "epoch": 2.843601895734597, "eval_logits/chosen": -3.5504369735717773, "eval_logits/rejected": -3.444122552871704, "eval_logps/chosen": -512.8049926757812, "eval_logps/rejected": -5128.248046875, "eval_loss": 0.004975645802915096, "eval_rewards/accuracies": 0.9959677457809448, "eval_rewards/chosen": -1.4582326412200928, "eval_rewards/margins": 43.41635513305664, "eval_rewards/rejected": -44.87459182739258, "eval_runtime": 192.6295, "eval_samples_per_second": 20.272, "eval_steps_per_second": 0.322, "step": 300 }, { "epoch": 2.938388625592417, "grad_norm": 0.0005953504074610172, "learning_rate": 3.850041354441502e-10, "logits/chosen": -3.5716750621795654, "logits/rejected": -3.5102698802948, "logps/chosen": -509.0469665527344, "logps/rejected": -4801.1611328125, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -1.4760140180587769, "rewards/margins": 40.71800994873047, "rewards/rejected": -42.19402313232422, "step": 310 }, { "epoch": 2.985781990521327, "step": 315, "total_flos": 0.0, "train_loss": 0.04083177362173292, "train_runtime": 9033.5209, "train_samples_per_second": 4.483, "train_steps_per_second": 0.035 } ], "logging_steps": 10, "max_steps": 315, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }