{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0265486725663717, "eval_steps": 500, "global_step": 29, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.035398230088495575, "grad_norm": NaN, "learning_rate": 0.0, "logits/chosen": -3.0738143920898438, "logits/rejected": -3.0764384269714355, "logps/chosen": -48.15470886230469, "logps/rejected": -73.88392639160156, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 1 }, { "epoch": 0.07079646017699115, "grad_norm": NaN, "learning_rate": 0.0, "logits/chosen": -3.094489812850952, "logits/rejected": -3.090228319168091, "logps/chosen": -51.06940841674805, "logps/rejected": -76.33265686035156, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 2 }, { "epoch": 0.10619469026548672, "grad_norm": NaN, "learning_rate": 0.0, "logits/chosen": -3.0312275886535645, "logits/rejected": -3.030165910720825, "logps/chosen": -47.49669647216797, "logps/rejected": -76.91014099121094, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 3 }, { "epoch": 0.1415929203539823, "grad_norm": NaN, "learning_rate": 0.0, "logits/chosen": -3.0501389503479004, "logits/rejected": -3.054299831390381, "logps/chosen": -52.20546340942383, "logps/rejected": -76.84318542480469, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 4 }, { "epoch": 0.17699115044247787, "grad_norm": 39.514007568359375, "learning_rate": 2e-05, "logits/chosen": -3.059661865234375, "logits/rejected": -3.0544543266296387, "logps/chosen": -54.500221252441406, "logps/rejected": -80.73619079589844, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 5 }, { "epoch": 0.21238938053097345, "grad_norm": 35.310333251953125, "learning_rate": 4e-05, "logits/chosen": -3.0174481868743896, "logits/rejected": -3.0225086212158203, "logps/chosen": -45.962432861328125, "logps/rejected": -69.7489013671875, "loss": 2.7726, "rewards/accuracies": 0.0, "rewards/chosen": 0.0, "rewards/margins": 0.0, "rewards/rejected": 0.0, "step": 6 }, { "epoch": 0.24778761061946902, "grad_norm": 33.619834899902344, "learning_rate": 6e-05, "logits/chosen": -3.0760531425476074, "logits/rejected": -3.075507402420044, "logps/chosen": -52.68150329589844, "logps/rejected": -82.66320037841797, "loss": 2.3724, "rewards/accuracies": 0.984375, "rewards/chosen": 0.033591579645872116, "rewards/margins": 0.2135639488697052, "rewards/rejected": -0.17997236549854279, "step": 7 }, { "epoch": 0.2831858407079646, "grad_norm": 26.519262313842773, "learning_rate": 8e-05, "logits/chosen": -3.141781806945801, "logits/rejected": -3.111323595046997, "logps/chosen": -49.25138854980469, "logps/rejected": -83.14276885986328, "loss": 1.6547, "rewards/accuracies": 1.0, "rewards/chosen": 0.08978669345378876, "rewards/margins": 0.6971186399459839, "rewards/rejected": -0.6073319315910339, "step": 8 }, { "epoch": 0.3185840707964602, "grad_norm": 17.522594451904297, "learning_rate": 0.0001, "logits/chosen": -3.1494789123535156, "logits/rejected": -3.107832431793213, "logps/chosen": -47.360626220703125, "logps/rejected": -87.75389099121094, "loss": 0.9735, "rewards/accuracies": 0.953125, "rewards/chosen": 0.19129924476146698, 
"rewards/margins": 1.4335689544677734, "rewards/rejected": -1.24226975440979, "step": 9 }, { "epoch": 0.35398230088495575, "grad_norm": 7.8468241691589355, "learning_rate": 0.00012, "logits/chosen": -3.105159044265747, "logits/rejected": -3.0585837364196777, "logps/chosen": -42.66419982910156, "logps/rejected": -92.25341796875, "loss": 0.5496, "rewards/accuracies": 0.90625, "rewards/chosen": 0.2652404308319092, "rewards/margins": 2.590460777282715, "rewards/rejected": -2.3252203464508057, "step": 10 }, { "epoch": 0.3893805309734513, "grad_norm": 2.123518228530884, "learning_rate": 0.00014, "logits/chosen": -3.239974021911621, "logits/rejected": -3.114036798477173, "logps/chosen": -46.743621826171875, "logps/rejected": -125.77220153808594, "loss": 0.0432, "rewards/accuracies": 1.0, "rewards/chosen": 0.3305644392967224, "rewards/margins": 5.1967926025390625, "rewards/rejected": -4.866228103637695, "step": 11 }, { "epoch": 0.4247787610619469, "grad_norm": 0.18487615883350372, "learning_rate": 0.00016, "logits/chosen": -3.2138984203338623, "logits/rejected": -2.992119312286377, "logps/chosen": -54.21443557739258, "logps/rejected": -160.70346069335938, "loss": 0.132, "rewards/accuracies": 0.953125, "rewards/chosen": 0.07433129847049713, "rewards/margins": 8.329842567443848, "rewards/rejected": -8.255511283874512, "step": 12 }, { "epoch": 0.46017699115044247, "grad_norm": 0.3735005855560303, "learning_rate": 0.00018, "logits/chosen": -3.314460277557373, "logits/rejected": -2.955761194229126, "logps/chosen": -52.52698516845703, "logps/rejected": -211.6153564453125, "loss": 0.0467, "rewards/accuracies": 0.984375, "rewards/chosen": -0.11871516704559326, "rewards/margins": 13.215730667114258, "rewards/rejected": -13.33444595336914, "step": 13 }, { "epoch": 0.49557522123893805, "grad_norm": 0.000926459557376802, "learning_rate": 0.0002, "logits/chosen": -3.378767490386963, "logits/rejected": -2.9119057655334473, "logps/chosen": -59.550537109375, "logps/rejected": -289.6356201171875, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -0.4433783292770386, "rewards/margins": 20.33984375, "rewards/rejected": -20.78322410583496, "step": 14 }, { "epoch": 0.5309734513274337, "grad_norm": 0.018438709899783134, "learning_rate": 0.00019863613034027224, "logits/chosen": -3.4022810459136963, "logits/rejected": -2.9307518005371094, "logps/chosen": -75.22628784179688, "logps/rejected": -371.2684326171875, "loss": 0.0867, "rewards/accuracies": 0.96875, "rewards/chosen": -2.2902748584747314, "rewards/margins": 27.06856918334961, "rewards/rejected": -29.358842849731445, "step": 15 }, { "epoch": 0.5663716814159292, "grad_norm": 0.00022576068295165896, "learning_rate": 0.00019458172417006347, "logits/chosen": -3.4614078998565674, "logits/rejected": -3.028236150741577, "logps/chosen": -111.99066162109375, "logps/rejected": -499.0533142089844, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -6.10006856918335, "rewards/margins": 36.00136947631836, "rewards/rejected": -42.101436614990234, "step": 16 }, { "epoch": 0.6017699115044248, "grad_norm": 0.0, "learning_rate": 0.0001879473751206489, "logits/chosen": -3.4879064559936523, "logits/rejected": -3.1346540451049805, "logps/chosen": -181.54132080078125, "logps/rejected": -627.1567993164062, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -12.846463203430176, "rewards/margins": 42.00312805175781, "rewards/rejected": -54.84959411621094, "step": 17 }, { "epoch": 0.6371681415929203, "grad_norm": 0.0, "learning_rate": 
0.00017891405093963938, "logits/chosen": -3.5802273750305176, "logits/rejected": -3.2139182090759277, "logps/chosen": -298.01898193359375, "logps/rejected": -730.3427734375, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -24.727779388427734, "rewards/margins": 40.633914947509766, "rewards/rejected": -65.3616943359375, "step": 18 }, { "epoch": 0.672566371681416, "grad_norm": 0.0006348241004161537, "learning_rate": 0.00016772815716257412, "logits/chosen": -3.6362297534942627, "logits/rejected": -3.3277578353881836, "logps/chosen": -441.85595703125, "logps/rejected": -857.678466796875, "loss": 0.0, "rewards/accuracies": 1.0, "rewards/chosen": -39.0582275390625, "rewards/margins": 38.73958969116211, "rewards/rejected": -77.79782104492188, "step": 19 }, { "epoch": 0.7079646017699115, "grad_norm": 5.561949728871696e-05, "learning_rate": 0.00015469481581224272, "logits/chosen": -3.7110390663146973, "logits/rejected": -3.481760025024414, "logps/chosen": -620.394287109375, "logps/rejected": -953.712646484375, "loss": 0.0866, "rewards/accuracies": 0.96875, "rewards/chosen": -56.605865478515625, "rewards/margins": 30.84559440612793, "rewards/rejected": -87.45145416259766, "step": 20 }, { "epoch": 0.7433628318584071, "grad_norm": 0.006302627269178629, "learning_rate": 0.00014016954246529696, "logits/chosen": -3.705810070037842, "logits/rejected": -3.480112314224243, "logps/chosen": -626.589111328125, "logps/rejected": -926.1605224609375, "loss": 0.2166, "rewards/accuracies": 0.921875, "rewards/chosen": -57.85133743286133, "rewards/margins": 27.39826202392578, "rewards/rejected": -85.24958801269531, "step": 21 }, { "epoch": 0.7787610619469026, "grad_norm": 0.19819259643554688, "learning_rate": 0.00012454854871407994, "logits/chosen": -3.771535873413086, "logits/rejected": -3.57301664352417, "logps/chosen": -856.1668701171875, "logps/rejected": -1113.4576416015625, "loss": 0.0434, "rewards/accuracies": 0.984375, "rewards/chosen": -80.38790130615234, "rewards/margins": 23.13334846496582, "rewards/rejected": -103.52125549316406, "step": 22 }, { "epoch": 0.8141592920353983, "grad_norm": 0.00016238712123595178, "learning_rate": 0.00010825793454723325, "logits/chosen": -3.9207518100738525, "logits/rejected": -3.6793296337127686, "logps/chosen": -834.5657958984375, "logps/rejected": -1170.9774169921875, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -78.40864562988281, "rewards/margins": 31.1939697265625, "rewards/rejected": -109.60260772705078, "step": 23 }, { "epoch": 0.8495575221238938, "grad_norm": 0.0, "learning_rate": 9.174206545276677e-05, "logits/chosen": -4.076857089996338, "logits/rejected": -3.751131534576416, "logps/chosen": -736.5623168945312, "logps/rejected": -1179.674072265625, "loss": 0.13, "rewards/accuracies": 0.953125, "rewards/chosen": -69.1473159790039, "rewards/margins": 41.747955322265625, "rewards/rejected": -110.89527893066406, "step": 24 }, { "epoch": 0.8849557522123894, "grad_norm": 0.0, "learning_rate": 7.54514512859201e-05, "logits/chosen": -4.139317035675049, "logits/rejected": -3.7883429527282715, "logps/chosen": -723.9154663085938, "logps/rejected": -1208.33154296875, "loss": 0.13, "rewards/accuracies": 0.953125, "rewards/chosen": -67.75996398925781, "rewards/margins": 45.80049133300781, "rewards/rejected": -113.56045532226562, "step": 25 }, { "epoch": 0.9203539823008849, "grad_norm": 0.0, "learning_rate": 5.983045753470308e-05, "logits/chosen": -4.219160079956055, "logits/rejected": -3.8395681381225586, "logps/chosen": -831.74560546875, 
"logps/rejected": -1357.1932373046875, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -78.22254943847656, "rewards/margins": 49.974510192871094, "rewards/rejected": -128.19705200195312, "step": 26 }, { "epoch": 0.9557522123893806, "grad_norm": 1.5598791378579335e-06, "learning_rate": 4.530518418775733e-05, "logits/chosen": -4.2556562423706055, "logits/rejected": -3.8611326217651367, "logps/chosen": -830.6559448242188, "logps/rejected": -1364.437744140625, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -78.08842468261719, "rewards/margins": 50.610897064208984, "rewards/rejected": -128.69931030273438, "step": 27 }, { "epoch": 0.9911504424778761, "grad_norm": 0.0023740511387586594, "learning_rate": 3.227184283742591e-05, "logits/chosen": -4.302487373352051, "logits/rejected": -3.8732571601867676, "logps/chosen": -825.9217529296875, "logps/rejected": -1350.9755859375, "loss": 0.0433, "rewards/accuracies": 0.984375, "rewards/chosen": -77.16596984863281, "rewards/margins": 49.86052703857422, "rewards/rejected": -127.02650451660156, "step": 28 }, { "epoch": 1.0265486725663717, "grad_norm": 0.0, "learning_rate": 2.1085949060360654e-05, "logits/chosen": -4.38674259185791, "logits/rejected": -3.9807748794555664, "logps/chosen": -908.9441528320312, "logps/rejected": -1436.7275390625, "loss": 0.13, "rewards/accuracies": 0.953125, "rewards/chosen": -85.50328063964844, "rewards/margins": 50.298500061035156, "rewards/rejected": -135.8017578125, "step": 29 } ], "logging_steps": 1, "max_steps": 29, "num_input_tokens_seen": 0, "num_train_epochs": 2, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }