{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.992, "eval_steps": 200, "global_step": 62, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 359.8308330892111, "learning_rate": 7.142857142857142e-08, "logits/generated": -3.0665292739868164, "logits/real": -2.8827080726623535, "logps/generated": -260.97100830078125, "logps/real": -154.1412811279297, "loss": 0.8446, "rewards/accuracies": 0.0, "rewards/generated": 0.0, "rewards/margins": 0.0, "rewards/real": 0.0, "step": 1 }, { "epoch": 0.16, "grad_norm": 14.829707235996306, "learning_rate": 4.727272727272727e-07, "logits/generated": -3.019892454147339, "logits/real": -2.6373720169067383, "logps/generated": -281.0638732910156, "logps/real": -134.65234375, "loss": 0.2963, "rewards/accuracies": 0.8888888955116272, "rewards/generated": -3.8116445541381836, "rewards/margins": 5.251079082489014, "rewards/real": 1.4394347667694092, "step": 10 }, { "epoch": 0.32, "grad_norm": 1.2891537123618362, "learning_rate": 3.818181818181818e-07, "logits/generated": -3.0050318241119385, "logits/real": -2.69225811958313, "logps/generated": -310.45654296875, "logps/real": -124.5722885131836, "loss": 0.1097, "rewards/accuracies": 1.0, "rewards/generated": -7.0854997634887695, "rewards/margins": 10.399068832397461, "rewards/real": 3.313570499420166, "step": 20 }, { "epoch": 0.48, "grad_norm": 1.1473555762335221, "learning_rate": 2.909090909090909e-07, "logits/generated": -3.005507707595825, "logits/real": -2.8163156509399414, "logps/generated": -333.9879150390625, "logps/real": -110.40803527832031, "loss": 0.1041, "rewards/accuracies": 1.0, "rewards/generated": -8.524462699890137, "rewards/margins": 12.605262756347656, "rewards/real": 4.080801486968994, "step": 30 }, { "epoch": 0.64, "grad_norm": 1.345996994857361, "learning_rate": 2e-07, "logits/generated": -3.0229580402374268, "logits/real": -2.770540952682495, "logps/generated": -343.5229797363281, "logps/real": -96.39772033691406, "loss": 0.1087, "rewards/accuracies": 1.0, "rewards/generated": -9.872434616088867, "rewards/margins": 13.813318252563477, "rewards/real": 3.9408836364746094, "step": 40 }, { "epoch": 0.8, "grad_norm": 1.2324237659279247, "learning_rate": 1.0909090909090908e-07, "logits/generated": -2.993199586868286, "logits/real": -2.768155574798584, "logps/generated": -336.2699279785156, "logps/real": -97.20591735839844, "loss": 0.0994, "rewards/accuracies": 1.0, "rewards/generated": -9.705252647399902, "rewards/margins": 13.57763671875, "rewards/real": 3.872384548187256, "step": 50 }, { "epoch": 0.96, "grad_norm": 2.978037812560663, "learning_rate": 1.818181818181818e-08, "logits/generated": -3.0415966510772705, "logits/real": -2.78061580657959, "logps/generated": -337.62353515625, "logps/real": -109.2850341796875, "loss": 0.1025, "rewards/accuracies": 1.0, "rewards/generated": -9.384114265441895, "rewards/margins": 13.61937141418457, "rewards/real": 4.235257625579834, "step": 60 }, { "epoch": 0.992, "step": 62, "total_flos": 0.0, "train_loss": 0.1436174963751147, "train_runtime": 742.9089, "train_samples_per_second": 2.692, "train_steps_per_second": 0.083 } ], "logging_steps": 10, "max_steps": 62, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }