{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 100, "global_step": 352, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.028409090909090908, "grad_norm": 58.42705245846632, "learning_rate": 1.3888888888888888e-07, "logits/chosen": -2.8592312335968018, "logits/rejected": -2.642709732055664, "logps/chosen": -390.5020446777344, "logps/rejected": -607.8412475585938, "loss": 0.6868, "rewards/accuracies": 0.6499999761581421, "rewards/chosen": 0.002577512990683317, "rewards/margins": 0.013913804665207863, "rewards/rejected": -0.011336291208863258, "step": 10 }, { "epoch": 0.056818181818181816, "grad_norm": 18.51862119745116, "learning_rate": 2.7777777777777776e-07, "logits/chosen": -2.84271502494812, "logits/rejected": -2.694936513900757, "logps/chosen": -328.5304870605469, "logps/rejected": -774.9099731445312, "loss": 0.4819, "rewards/accuracies": 0.987500011920929, "rewards/chosen": 0.08614631742238998, "rewards/margins": 0.6608496904373169, "rewards/rejected": -0.5747033357620239, "step": 20 }, { "epoch": 0.08522727272727272, "grad_norm": 3.947671256913515, "learning_rate": 4.1666666666666667e-07, "logits/chosen": -2.8618407249450684, "logits/rejected": -2.6804850101470947, "logps/chosen": -294.7425842285156, "logps/rejected": -1098.802978515625, "loss": 0.1417, "rewards/accuracies": 1.0, "rewards/chosen": 0.37539467215538025, "rewards/margins": 4.5838799476623535, "rewards/rejected": -4.208485126495361, "step": 30 }, { "epoch": 0.11363636363636363, "grad_norm": 1.2673223440201191, "learning_rate": 4.998023493068254e-07, "logits/chosen": -2.8695826530456543, "logits/rejected": -2.690202236175537, "logps/chosen": -310.9261169433594, "logps/rejected": -2008.798583984375, "loss": 0.0271, "rewards/accuracies": 1.0, "rewards/chosen": 0.3582938015460968, "rewards/margins": 13.367321968078613, "rewards/rejected": -13.009028434753418, "step": 40 }, { "epoch": 0.14204545454545456, "grad_norm": 0.0479749771589853, "learning_rate": 4.975823666181255e-07, "logits/chosen": -2.8763322830200195, "logits/rejected": -2.6664085388183594, "logps/chosen": -403.7674255371094, "logps/rejected": -3682.93896484375, "loss": 0.0033, "rewards/accuracies": 1.0, "rewards/chosen": -0.4113216996192932, "rewards/margins": 29.878662109375, "rewards/rejected": -30.28998374938965, "step": 50 }, { "epoch": 0.17045454545454544, "grad_norm": 0.6274546818497669, "learning_rate": 4.929173350101024e-07, "logits/chosen": -3.0023722648620605, "logits/rejected": -2.7470011711120605, "logps/chosen": -463.00946044921875, "logps/rejected": -4437.8525390625, "loss": 0.0031, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0311436653137207, "rewards/margins": 36.9188346862793, "rewards/rejected": -37.949981689453125, "step": 60 }, { "epoch": 0.19886363636363635, "grad_norm": 0.0844518740673388, "learning_rate": 4.858533249305336e-07, "logits/chosen": -3.005385398864746, "logits/rejected": -2.6852545738220215, "logps/chosen": -471.344970703125, "logps/rejected": -4398.6142578125, "loss": 0.0034, "rewards/accuracies": 1.0, "rewards/chosen": -0.9218431711196899, "rewards/margins": 36.93693923950195, "rewards/rejected": -37.858787536621094, "step": 70 }, { "epoch": 0.22727272727272727, "grad_norm": 5.427351151177568, "learning_rate": 4.764600984163808e-07, "logits/chosen": -3.0055181980133057, "logits/rejected": -2.524444103240967, "logps/chosen": -475.0348205566406, "logps/rejected": -5109.64990234375, "loss": 0.0027, "rewards/accuracies": 1.0, "rewards/chosen": -0.6871398687362671, "rewards/margins": 43.67203903198242, "rewards/rejected": -44.35917663574219, "step": 80 }, { "epoch": 0.2556818181818182, "grad_norm": 0.009091790561130925, "learning_rate": 4.6483042014491527e-07, "logits/chosen": -3.004645824432373, "logits/rejected": -2.3897948265075684, "logps/chosen": -470.91943359375, "logps/rejected": -4687.4931640625, "loss": 0.0112, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.0880612134933472, "rewards/margins": 39.41301727294922, "rewards/rejected": -40.50108337402344, "step": 90 }, { "epoch": 0.2840909090909091, "grad_norm": 0.055190493723617784, "learning_rate": 4.510791413176912e-07, "logits/chosen": -2.8832428455352783, "logits/rejected": -1.8006477355957031, "logps/chosen": -491.24505615234375, "logps/rejected": -5191.5498046875, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.2177506685256958, "rewards/margins": 43.63993453979492, "rewards/rejected": -44.857688903808594, "step": 100 }, { "epoch": 0.2840909090909091, "eval_logits/chosen": -2.7411134243011475, "eval_logits/rejected": -1.4200084209442139, "eval_logps/chosen": -499.48809814453125, "eval_logps/rejected": -5114.40576171875, "eval_loss": 0.0009676189511083066, "eval_rewards/accuracies": 0.9979838728904724, "eval_rewards/chosen": -1.3250634670257568, "eval_rewards/margins": 43.411109924316406, "eval_rewards/rejected": -44.73617172241211, "eval_runtime": 196.2044, "eval_samples_per_second": 19.903, "eval_steps_per_second": 0.316, "step": 100 }, { "epoch": 0.3125, "grad_norm": 0.03671469805558989, "learning_rate": 4.353420654246546e-07, "logits/chosen": -2.5657219886779785, "logits/rejected": -1.2966344356536865, "logps/chosen": -516.1082763671875, "logps/rejected": -4920.09814453125, "loss": 0.0032, "rewards/accuracies": 1.0, "rewards/chosen": -1.4068708419799805, "rewards/margins": 41.80142593383789, "rewards/rejected": -43.20829391479492, "step": 110 }, { "epoch": 0.3409090909090909, "grad_norm": 0.046355425167055216, "learning_rate": 4.177746070897592e-07, "logits/chosen": -2.7508440017700195, "logits/rejected": -1.5980149507522583, "logps/chosen": -527.7090454101562, "logps/rejected": -5251.87744140625, "loss": 0.0014, "rewards/accuracies": 1.0, "rewards/chosen": -1.4619219303131104, "rewards/margins": 44.68457794189453, "rewards/rejected": -46.1464958190918, "step": 120 }, { "epoch": 0.3693181818181818, "grad_norm": 0.22007447567081792, "learning_rate": 3.9855025724292763e-07, "logits/chosen": -2.9421451091766357, "logits/rejected": -1.7615553140640259, "logps/chosen": -534.1954345703125, "logps/rejected": -5053.0048828125, "loss": 0.0022, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.6374390125274658, "rewards/margins": 42.48723220825195, "rewards/rejected": -44.124671936035156, "step": 130 }, { "epoch": 0.3977272727272727, "grad_norm": 2.5714609658759042, "learning_rate": 3.7785886977585555e-07, "logits/chosen": -2.755537271499634, "logits/rejected": -1.0718333721160889, "logps/chosen": -519.0208129882812, "logps/rejected": -5410.51708984375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.5298444032669067, "rewards/margins": 46.15542984008789, "rewards/rejected": -47.68526840209961, "step": 140 }, { "epoch": 0.42613636363636365, "grad_norm": 0.3002663453248257, "learning_rate": 3.5590478660213206e-07, "logits/chosen": -2.406147003173828, "logits/rejected": -0.27999475598335266, "logps/chosen": -545.5628051757812, "logps/rejected": -5466.4716796875, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.67499577999115, "rewards/margins": 45.86994171142578, "rewards/rejected": -47.544944763183594, "step": 150 }, { "epoch": 0.45454545454545453, "grad_norm": 0.5345626100174099, "learning_rate": 3.3290481963801696e-07, "logits/chosen": -2.146878242492676, "logits/rejected": 0.38504794239997864, "logps/chosen": -508.6912536621094, "logps/rejected": -5496.20166015625, "loss": 0.0001, "rewards/accuracies": 1.0, "rewards/chosen": -1.3855329751968384, "rewards/margins": 47.411582946777344, "rewards/rejected": -48.797119140625, "step": 160 }, { "epoch": 0.48295454545454547, "grad_norm": 0.011551400933576704, "learning_rate": 3.0908610963322626e-07, "logits/chosen": -2.115241289138794, "logits/rejected": 0.22601358592510223, "logps/chosen": -550.0446166992188, "logps/rejected": -5791.59521484375, "loss": 0.0042, "rewards/accuracies": 1.0, "rewards/chosen": -1.6242635250091553, "rewards/margins": 49.00857925415039, "rewards/rejected": -50.632843017578125, "step": 170 }, { "epoch": 0.5113636363636364, "grad_norm": 0.0026766351641471543, "learning_rate": 2.846838829972671e-07, "logits/chosen": -2.1634111404418945, "logits/rejected": 0.14969149231910706, "logps/chosen": -528.2894287109375, "logps/rejected": -5540.1259765625, "loss": 0.0026, "rewards/accuracies": 1.0, "rewards/chosen": -1.6738510131835938, "rewards/margins": 47.597564697265625, "rewards/rejected": -49.27141571044922, "step": 180 }, { "epoch": 0.5397727272727273, "grad_norm": 0.032545430377750574, "learning_rate": 2.5993912877423147e-07, "logits/chosen": -2.0492312908172607, "logits/rejected": 0.02273269183933735, "logps/chosen": -494.49713134765625, "logps/rejected": -5951.44970703125, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.4888134002685547, "rewards/margins": 51.109580993652344, "rewards/rejected": -52.5984001159668, "step": 190 }, { "epoch": 0.5681818181818182, "grad_norm": 0.33060341294021806, "learning_rate": 2.3509621870754504e-07, "logits/chosen": -1.8956499099731445, "logits/rejected": 0.8085635304450989, "logps/chosen": -514.856201171875, "logps/rejected": -5108.5361328125, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.4972646236419678, "rewards/margins": 43.27967071533203, "rewards/rejected": -44.77693557739258, "step": 200 }, { "epoch": 0.5681818181818182, "eval_logits/chosen": -1.8780713081359863, "eval_logits/rejected": 0.7428802251815796, "eval_logps/chosen": -529.2301635742188, "eval_logps/rejected": -5479.81494140625, "eval_loss": 0.0004189308965578675, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.6224839687347412, "eval_rewards/margins": 46.76777648925781, "eval_rewards/rejected": -48.390262603759766, "eval_runtime": 194.4022, "eval_samples_per_second": 20.087, "eval_steps_per_second": 0.319, "step": 200 }, { "epoch": 0.5965909090909091, "grad_norm": 0.2151284325761924, "learning_rate": 2.1040049389819624e-07, "logits/chosen": -1.7524973154067993, "logits/rejected": 0.9734399914741516, "logps/chosen": -544.6936645507812, "logps/rejected": -5410.6865234375, "loss": 0.0007, "rewards/accuracies": 1.0, "rewards/chosen": -1.5405043363571167, "rewards/margins": 46.24732208251953, "rewards/rejected": -47.78782272338867, "step": 210 }, { "epoch": 0.625, "grad_norm": 2.3334722364043823, "learning_rate": 1.8609584188988133e-07, "logits/chosen": -1.2117726802825928, "logits/rejected": 0.9446122050285339, "logps/chosen": -568.7277221679688, "logps/rejected": -5055.0498046875, "loss": 0.0018, "rewards/accuracies": 1.0, "rewards/chosen": -2.0412721633911133, "rewards/margins": 42.01286697387695, "rewards/rejected": -44.05413055419922, "step": 220 }, { "epoch": 0.6534090909090909, "grad_norm": 70.71977531846203, "learning_rate": 1.624222881090439e-07, "logits/chosen": -1.3626362085342407, "logits/rejected": 0.9213559031486511, "logps/chosen": -592.7008056640625, "logps/rejected": -5952.9228515625, "loss": 0.0147, "rewards/accuracies": 0.9937499761581421, "rewards/chosen": -1.9803613424301147, "rewards/margins": 50.69305419921875, "rewards/rejected": -52.67341995239258, "step": 230 }, { "epoch": 0.6818181818181818, "grad_norm": 0.049503027606470004, "learning_rate": 1.3961362544602212e-07, "logits/chosen": -1.4228966236114502, "logits/rejected": 1.1803163290023804, "logps/chosen": -560.458251953125, "logps/rejected": -5114.12890625, "loss": 0.001, "rewards/accuracies": 1.0, "rewards/chosen": -1.8444585800170898, "rewards/margins": 43.356101989746094, "rewards/rejected": -45.20056915283203, "step": 240 }, { "epoch": 0.7102272727272727, "grad_norm": 0.31518289932805543, "learning_rate": 1.1789510538684522e-07, "logits/chosen": -1.6409775018692017, "logits/rejected": 0.999941349029541, "logps/chosen": -527.280517578125, "logps/rejected": -6313.17236328125, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.7524694204330444, "rewards/margins": 54.08344268798828, "rewards/rejected": -55.835906982421875, "step": 250 }, { "epoch": 0.7386363636363636, "grad_norm": 0.19426521854792267, "learning_rate": 9.748121349736891e-08, "logits/chosen": -1.6752300262451172, "logits/rejected": 0.9494975805282593, "logps/chosen": -575.4473266601562, "logps/rejected": -5758.55859375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.9161920547485352, "rewards/margins": 48.88811111450195, "rewards/rejected": -50.80430221557617, "step": 260 }, { "epoch": 0.7670454545454546, "grad_norm": 0.00641608946538556, "learning_rate": 7.857355122839673e-08, "logits/chosen": -1.775024652481079, "logits/rejected": 0.8411375880241394, "logps/chosen": -561.006103515625, "logps/rejected": -5389.9775390625, "loss": 0.0005, "rewards/accuracies": 1.0, "rewards/chosen": -1.801565408706665, "rewards/margins": 45.40861129760742, "rewards/rejected": -47.21017837524414, "step": 270 }, { "epoch": 0.7954545454545454, "grad_norm": 0.07974689002553936, "learning_rate": 6.135884496044244e-08, "logits/chosen": -1.6470428705215454, "logits/rejected": 1.1842314004898071, "logps/chosen": -544.4002685546875, "logps/rejected": -5383.25, "loss": 0.0015, "rewards/accuracies": 1.0, "rewards/chosen": -1.6522743701934814, "rewards/margins": 45.73133087158203, "rewards/rejected": -47.38361358642578, "step": 280 }, { "epoch": 0.8238636363636364, "grad_norm": 0.0651111213625187, "learning_rate": 4.600710195020982e-08, "logits/chosen": -1.5382473468780518, "logits/rejected": 1.2690740823745728, "logps/chosen": -565.579833984375, "logps/rejected": -5386.55859375, "loss": 0.0025, "rewards/accuracies": 1.0, "rewards/chosen": -1.7643659114837646, "rewards/margins": 45.688636779785156, "rewards/rejected": -47.4530029296875, "step": 290 }, { "epoch": 0.8522727272727273, "grad_norm": 0.7977586596409562, "learning_rate": 3.2669931390104374e-08, "logits/chosen": -1.57468581199646, "logits/rejected": 1.145819902420044, "logps/chosen": -520.630859375, "logps/rejected": -5860.75927734375, "loss": 0.0008, "rewards/accuracies": 1.0, "rewards/chosen": -1.6437629461288452, "rewards/margins": 50.755577087402344, "rewards/rejected": -52.39934158325195, "step": 300 }, { "epoch": 0.8522727272727273, "eval_logits/chosen": -1.4805512428283691, "eval_logits/rejected": 1.2551480531692505, "eval_logps/chosen": -540.1784057617188, "eval_logps/rejected": -5602.4775390625, "eval_loss": 0.0003319734532851726, "eval_rewards/accuracies": 1.0, "eval_rewards/chosen": -1.7319667339324951, "eval_rewards/margins": 47.88492202758789, "eval_rewards/rejected": -49.61688995361328, "eval_runtime": 195.4681, "eval_samples_per_second": 19.978, "eval_steps_per_second": 0.317, "step": 300 }, { "epoch": 0.8806818181818182, "grad_norm": 0.1653589500509016, "learning_rate": 2.147904716149135e-08, "logits/chosen": -1.4495469331741333, "logits/rejected": 1.214980959892273, "logps/chosen": -526.7190551757812, "logps/rejected": -5688.666015625, "loss": 0.0012, "rewards/accuracies": 1.0, "rewards/chosen": -1.6465237140655518, "rewards/margins": 48.23511505126953, "rewards/rejected": -49.88164520263672, "step": 310 }, { "epoch": 0.9090909090909091, "grad_norm": 0.028900316674960968, "learning_rate": 1.254496706805433e-08, "logits/chosen": -1.584967851638794, "logits/rejected": 1.173344373703003, "logps/chosen": -558.8123779296875, "logps/rejected": -5757.21240234375, "loss": 0.0004, "rewards/accuracies": 1.0, "rewards/chosen": -1.761392593383789, "rewards/margins": 49.29091262817383, "rewards/rejected": -51.052303314208984, "step": 320 }, { "epoch": 0.9375, "grad_norm": 0.1098326915964376, "learning_rate": 5.955921395237318e-09, "logits/chosen": -1.5144588947296143, "logits/rejected": 1.1384176015853882, "logps/chosen": -516.386962890625, "logps/rejected": -5688.7119140625, "loss": 0.0002, "rewards/accuracies": 1.0, "rewards/chosen": -1.6111881732940674, "rewards/margins": 48.798397064208984, "rewards/rejected": -50.40958786010742, "step": 330 }, { "epoch": 0.9659090909090909, "grad_norm": 0.17586461845284926, "learning_rate": 1.7769815745066474e-09, "logits/chosen": -1.7140228748321533, "logits/rejected": 1.0387569665908813, "logps/chosen": -531.6962890625, "logps/rejected": -5153.82958984375, "loss": 0.0006, "rewards/accuracies": 1.0, "rewards/chosen": -1.6271555423736572, "rewards/margins": 43.823490142822266, "rewards/rejected": -45.45064163208008, "step": 340 }, { "epoch": 0.9943181818181818, "grad_norm": 0.8169470643038225, "learning_rate": 4.9417557483610875e-11, "logits/chosen": -1.4863841533660889, "logits/rejected": 1.150782823562622, "logps/chosen": -551.8464965820312, "logps/rejected": -5518.20068359375, "loss": 0.0009, "rewards/accuracies": 1.0, "rewards/chosen": -1.775787353515625, "rewards/margins": 46.930641174316406, "rewards/rejected": -48.706424713134766, "step": 350 }, { "epoch": 1.0, "step": 352, "total_flos": 0.0, "train_loss": 0.03994455389971563, "train_runtime": 9328.4885, "train_samples_per_second": 4.824, "train_steps_per_second": 0.038 } ], "logging_steps": 10, "max_steps": 352, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }