{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.2, "eval_steps": 50, "global_step": 1000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.01, "grad_norm": 14.438831329345703, "learning_rate": 0.00015, "loss": 1.116, "step": 50 }, { "epoch": 0.01, "eval_loss": 0.8588868975639343, "eval_runtime": 680.2426, "eval_samples_per_second": 8.601, "eval_steps_per_second": 1.076, "step": 50 }, { "epoch": 0.02, "grad_norm": 1.0438328981399536, "learning_rate": 0.0003, "loss": 0.723, "step": 100 }, { "epoch": 0.02, "eval_loss": 0.8119707703590393, "eval_runtime": 681.2671, "eval_samples_per_second": 8.588, "eval_steps_per_second": 1.074, "step": 100 }, { "epoch": 0.03, "grad_norm": 0.8140527009963989, "learning_rate": 0.0002833333333333333, "loss": 0.6987, "step": 150 }, { "epoch": 0.03, "eval_loss": 0.7736836075782776, "eval_runtime": 681.2882, "eval_samples_per_second": 8.588, "eval_steps_per_second": 1.074, "step": 150 }, { "epoch": 0.04, "grad_norm": 0.7538554668426514, "learning_rate": 0.0002666666666666666, "loss": 0.6701, "step": 200 }, { "epoch": 0.04, "eval_loss": 0.7494252324104309, "eval_runtime": 681.1585, "eval_samples_per_second": 8.59, "eval_steps_per_second": 1.075, "step": 200 }, { "epoch": 0.05, "grad_norm": 0.8082383871078491, "learning_rate": 0.00025, "loss": 0.6585, "step": 250 }, { "epoch": 0.05, "eval_loss": 0.7359848022460938, "eval_runtime": 681.3996, "eval_samples_per_second": 8.587, "eval_steps_per_second": 1.074, "step": 250 }, { "epoch": 0.06, "grad_norm": 0.7390263080596924, "learning_rate": 0.0002333333333333333, "loss": 0.6451, "step": 300 }, { "epoch": 0.06, "eval_loss": 0.7315810322761536, "eval_runtime": 680.7597, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.075, "step": 300 }, { "epoch": 0.07, "grad_norm": 0.719605565071106, "learning_rate": 0.00021666666666666666, "loss": 0.6382, "step": 350 }, { "epoch": 0.07, "eval_loss": 0.6773383617401123, "eval_runtime": 680.8905, "eval_samples_per_second": 8.593, "eval_steps_per_second": 1.075, "step": 350 }, { "epoch": 0.08, "grad_norm": 0.8071795701980591, "learning_rate": 0.00019999999999999998, "loss": 0.6304, "step": 400 }, { "epoch": 0.08, "eval_loss": 0.6790196299552917, "eval_runtime": 680.4041, "eval_samples_per_second": 8.599, "eval_steps_per_second": 1.076, "step": 400 }, { "epoch": 0.09, "grad_norm": 0.6562775373458862, "learning_rate": 0.00018333333333333334, "loss": 0.6236, "step": 450 }, { "epoch": 0.09, "eval_loss": 0.667582631111145, "eval_runtime": 680.9279, "eval_samples_per_second": 8.593, "eval_steps_per_second": 1.075, "step": 450 }, { "epoch": 0.1, "grad_norm": 0.6858498454093933, "learning_rate": 0.00016666666666666666, "loss": 0.6127, "step": 500 }, { "epoch": 0.1, "eval_loss": 0.6611046195030212, "eval_runtime": 681.3448, "eval_samples_per_second": 8.587, "eval_steps_per_second": 1.074, "step": 500 }, { "epoch": 0.11, "grad_norm": 0.7050228714942932, "learning_rate": 0.00015, "loss": 0.6109, "step": 550 }, { "epoch": 0.11, "eval_loss": 0.6624142527580261, "eval_runtime": 680.8132, "eval_samples_per_second": 8.594, "eval_steps_per_second": 1.075, "step": 550 }, { "epoch": 0.12, "grad_norm": 0.7099040746688843, "learning_rate": 0.0001333333333333333, "loss": 0.6074, "step": 600 }, { "epoch": 0.12, "eval_loss": 0.6503908634185791, "eval_runtime": 680.9458, "eval_samples_per_second": 8.592, "eval_steps_per_second": 1.075, "step": 600 }, { "epoch": 0.13, "grad_norm": 0.7423191070556641, "learning_rate": 0.00011666666666666665, "loss": 0.5972, "step": 650 }, { "epoch": 0.13, "eval_loss": 0.6401746273040771, "eval_runtime": 680.6123, "eval_samples_per_second": 8.597, "eval_steps_per_second": 1.076, "step": 650 }, { "epoch": 0.14, "grad_norm": 0.6712120175361633, "learning_rate": 9.999999999999999e-05, "loss": 0.5912, "step": 700 }, { "epoch": 0.14, "eval_loss": 0.6332426071166992, "eval_runtime": 680.7596, "eval_samples_per_second": 8.595, "eval_steps_per_second": 1.075, "step": 700 }, { "epoch": 0.15, "grad_norm": 0.7194417715072632, "learning_rate": 8.333333333333333e-05, "loss": 0.5934, "step": 750 }, { "epoch": 0.15, "eval_loss": 0.6242749094963074, "eval_runtime": 679.731, "eval_samples_per_second": 8.608, "eval_steps_per_second": 1.077, "step": 750 }, { "epoch": 0.16, "grad_norm": 0.7845382690429688, "learning_rate": 6.666666666666666e-05, "loss": 0.5908, "step": 800 }, { "epoch": 0.16, "eval_loss": 0.6116130352020264, "eval_runtime": 681.1204, "eval_samples_per_second": 8.59, "eval_steps_per_second": 1.075, "step": 800 }, { "epoch": 0.17, "grad_norm": 0.7342799305915833, "learning_rate": 4.9999999999999996e-05, "loss": 0.5824, "step": 850 }, { "epoch": 0.17, "eval_loss": 0.6023569703102112, "eval_runtime": 680.663, "eval_samples_per_second": 8.596, "eval_steps_per_second": 1.075, "step": 850 }, { "epoch": 0.18, "grad_norm": 0.6745529174804688, "learning_rate": 3.333333333333333e-05, "loss": 0.5823, "step": 900 }, { "epoch": 0.18, "eval_loss": 0.5980194211006165, "eval_runtime": 680.2295, "eval_samples_per_second": 8.602, "eval_steps_per_second": 1.076, "step": 900 }, { "epoch": 0.19, "grad_norm": 0.7388492226600647, "learning_rate": 1.6666666666666664e-05, "loss": 0.5876, "step": 950 }, { "epoch": 0.19, "eval_loss": 0.5954298973083496, "eval_runtime": 680.8083, "eval_samples_per_second": 8.594, "eval_steps_per_second": 1.075, "step": 950 }, { "epoch": 0.2, "grad_norm": 0.8207566738128662, "learning_rate": 0.0, "loss": 0.5748, "step": 1000 }, { "epoch": 0.2, "eval_loss": 0.5945320725440979, "eval_runtime": 680.8349, "eval_samples_per_second": 8.594, "eval_steps_per_second": 1.075, "step": 1000 } ], "logging_steps": 50, "max_steps": 1000, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.187924717080576e+16, "train_batch_size": 10, "trial_name": null, "trial_params": null }