{ "best_metric": 0.6293959021568298, "best_model_checkpoint": "experiments/checkpoint-250", "epoch": 22.58823529411765, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.75, "learning_rate": 2.9999999999999997e-05, "loss": 3.2279, "step": 10 }, { "epoch": 1.51, "learning_rate": 5.9999999999999995e-05, "loss": 3.0585, "step": 20 }, { "epoch": 2.26, "learning_rate": 8.699999999999999e-05, "loss": 2.6599, "step": 30 }, { "epoch": 3.01, "learning_rate": 0.00011399999999999999, "loss": 1.9787, "step": 40 }, { "epoch": 3.76, "learning_rate": 0.00014099999999999998, "loss": 1.378, "step": 50 }, { "epoch": 3.76, "eval_loss": 1.1715341806411743, "eval_runtime": 12.5908, "eval_samples_per_second": 15.885, "eval_steps_per_second": 1.986, "step": 50 }, { "epoch": 4.52, "learning_rate": 0.00017099999999999998, "loss": 1.1071, "step": 60 }, { "epoch": 5.27, "learning_rate": 0.000201, "loss": 0.9966, "step": 70 }, { "epoch": 6.02, "learning_rate": 0.00023099999999999998, "loss": 0.9398, "step": 80 }, { "epoch": 6.78, "learning_rate": 0.000261, "loss": 0.8755, "step": 90 }, { "epoch": 7.53, "learning_rate": 0.00029099999999999997, "loss": 0.8274, "step": 100 }, { "epoch": 7.53, "eval_loss": 0.7494703531265259, "eval_runtime": 12.5869, "eval_samples_per_second": 15.89, "eval_steps_per_second": 1.986, "step": 100 }, { "epoch": 8.28, "learning_rate": 0.0002895, "loss": 0.761, "step": 110 }, { "epoch": 9.04, "learning_rate": 0.0002745, "loss": 0.7358, "step": 120 }, { "epoch": 9.79, "learning_rate": 0.00025949999999999997, "loss": 0.7155, "step": 130 }, { "epoch": 10.54, "learning_rate": 0.0002445, "loss": 0.6811, "step": 140 }, { "epoch": 11.29, "learning_rate": 0.0002295, "loss": 0.6712, "step": 150 }, { "epoch": 11.29, "eval_loss": 0.6565232872962952, "eval_runtime": 12.5854, "eval_samples_per_second": 15.891, "eval_steps_per_second": 1.986, "step": 150 }, { "epoch": 12.05, "learning_rate": 0.00021449999999999998, "loss": 0.6562, "step": 160 }, { "epoch": 12.8, "learning_rate": 0.0001995, "loss": 0.6367, "step": 170 }, { "epoch": 13.55, "learning_rate": 0.00018449999999999999, "loss": 0.6173, "step": 180 }, { "epoch": 14.31, "learning_rate": 0.00016949999999999997, "loss": 0.6267, "step": 190 }, { "epoch": 15.06, "learning_rate": 0.0001545, "loss": 0.6039, "step": 200 }, { "epoch": 15.06, "eval_loss": 0.6357369422912598, "eval_runtime": 12.6199, "eval_samples_per_second": 15.848, "eval_steps_per_second": 1.981, "step": 200 }, { "epoch": 15.81, "learning_rate": 0.0001395, "loss": 0.5949, "step": 210 }, { "epoch": 16.56, "learning_rate": 0.0001245, "loss": 0.5957, "step": 220 }, { "epoch": 17.32, "learning_rate": 0.00010949999999999999, "loss": 0.5625, "step": 230 }, { "epoch": 18.07, "learning_rate": 9.449999999999999e-05, "loss": 0.5897, "step": 240 }, { "epoch": 18.82, "learning_rate": 7.95e-05, "loss": 0.5747, "step": 250 }, { "epoch": 18.82, "eval_loss": 0.6293959021568298, "eval_runtime": 12.5951, "eval_samples_per_second": 15.879, "eval_steps_per_second": 1.985, "step": 250 }, { "epoch": 19.58, "learning_rate": 6.45e-05, "loss": 0.5588, "step": 260 }, { "epoch": 20.33, "learning_rate": 4.95e-05, "loss": 0.5501, "step": 270 }, { "epoch": 21.08, "learning_rate": 3.45e-05, "loss": 0.552, "step": 280 }, { "epoch": 21.84, "learning_rate": 1.95e-05, "loss": 0.5523, "step": 290 }, { "epoch": 22.59, "learning_rate": 4.499999999999999e-06, "loss": 0.5447, "step": 300 }, { "epoch": 22.59, "eval_loss": 0.633611798286438, "eval_runtime": 12.6, "eval_samples_per_second": 15.873, "eval_steps_per_second": 1.984, "step": 300 } ], "max_steps": 300, "num_train_epochs": 24, "total_flos": 2.0721681158111232e+17, "trial_name": null, "trial_params": null }