{ "best_metric": 0.9545454382896423, "best_model_checkpoint": "/kaggle/working/ckpts/checkpoint-2453", "epoch": 14.0, "eval_steps": 500, "global_step": 3122, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.45, "grad_norm": 2.3771965503692627, "learning_rate": 9.701046337817639e-06, "loss": 1.5285, "step": 100 }, { "epoch": 0.9, "grad_norm": 2.858564615249634, "learning_rate": 9.402092675635277e-06, "loss": 1.1628, "step": 200 }, { "epoch": 1.0, "eval_accuracy": 0.7727272510528564, "eval_loss": 0.712559700012207, "eval_runtime": 9.2054, "eval_samples_per_second": 21.509, "eval_steps_per_second": 5.432, "step": 223 }, { "epoch": 1.35, "grad_norm": 12.418754577636719, "learning_rate": 9.106128550074738e-06, "loss": 0.8286, "step": 300 }, { "epoch": 1.79, "grad_norm": 7.768007755279541, "learning_rate": 8.807174887892378e-06, "loss": 0.6562, "step": 400 }, { "epoch": 2.0, "eval_accuracy": 0.8484848737716675, "eval_loss": 0.5068599581718445, "eval_runtime": 9.5278, "eval_samples_per_second": 20.781, "eval_steps_per_second": 5.248, "step": 446 }, { "epoch": 2.24, "grad_norm": 9.381482124328613, "learning_rate": 8.51121076233184e-06, "loss": 0.5053, "step": 500 }, { "epoch": 2.69, "grad_norm": 2.293752908706665, "learning_rate": 8.212257100149478e-06, "loss": 0.4199, "step": 600 }, { "epoch": 3.0, "eval_accuracy": 0.8989899158477783, "eval_loss": 0.356963574886322, "eval_runtime": 9.2472, "eval_samples_per_second": 21.412, "eval_steps_per_second": 5.407, "step": 669 }, { "epoch": 3.14, "grad_norm": 23.293209075927734, "learning_rate": 7.916292974588939e-06, "loss": 0.3121, "step": 700 }, { "epoch": 3.59, "grad_norm": 3.8754687309265137, "learning_rate": 7.617339312406578e-06, "loss": 0.325, "step": 800 }, { "epoch": 4.0, "eval_accuracy": 0.939393937587738, "eval_loss": 0.20920716226100922, "eval_runtime": 9.2568, "eval_samples_per_second": 21.39, "eval_steps_per_second": 5.401, "step": 892 }, { "epoch": 4.04, "grad_norm": 62.81392288208008, "learning_rate": 7.318385650224216e-06, "loss": 0.2896, "step": 900 }, { "epoch": 4.48, "grad_norm": 35.08163833618164, "learning_rate": 7.019431988041854e-06, "loss": 0.2535, "step": 1000 }, { "epoch": 4.93, "grad_norm": 14.269490242004395, "learning_rate": 6.720478325859492e-06, "loss": 0.2217, "step": 1100 }, { "epoch": 5.0, "eval_accuracy": 0.9444444179534912, "eval_loss": 0.23924072086811066, "eval_runtime": 9.2044, "eval_samples_per_second": 21.511, "eval_steps_per_second": 5.432, "step": 1115 }, { "epoch": 5.38, "grad_norm": 0.41719043254852295, "learning_rate": 6.421524663677131e-06, "loss": 0.2165, "step": 1200 }, { "epoch": 5.83, "grad_norm": 1.484471321105957, "learning_rate": 6.1225710014947695e-06, "loss": 0.1831, "step": 1300 }, { "epoch": 6.0, "eval_accuracy": 0.9292929172515869, "eval_loss": 0.27538299560546875, "eval_runtime": 9.1435, "eval_samples_per_second": 21.655, "eval_steps_per_second": 5.468, "step": 1338 }, { "epoch": 6.28, "grad_norm": 0.09743738174438477, "learning_rate": 5.823617339312408e-06, "loss": 0.2059, "step": 1400 }, { "epoch": 6.73, "grad_norm": 0.3065042793750763, "learning_rate": 5.524663677130046e-06, "loss": 0.1598, "step": 1500 }, { "epoch": 7.0, "eval_accuracy": 0.9343434572219849, "eval_loss": 0.3294394910335541, "eval_runtime": 9.1064, "eval_samples_per_second": 21.743, "eval_steps_per_second": 5.491, "step": 1561 }, { "epoch": 7.17, "grad_norm": 0.05342373996973038, "learning_rate": 5.228699551569507e-06, "loss": 0.1455, "step": 1600 }, { "epoch": 7.62, "grad_norm": 1.5460679531097412, "learning_rate": 4.929745889387145e-06, "loss": 0.1676, "step": 1700 }, { "epoch": 8.0, "eval_accuracy": 0.9494949579238892, "eval_loss": 0.2668905258178711, "eval_runtime": 9.2118, "eval_samples_per_second": 21.494, "eval_steps_per_second": 5.428, "step": 1784 }, { "epoch": 8.07, "grad_norm": 17.537992477416992, "learning_rate": 4.630792227204783e-06, "loss": 0.1762, "step": 1800 }, { "epoch": 8.52, "grad_norm": 0.20349286496639252, "learning_rate": 4.3318385650224224e-06, "loss": 0.1566, "step": 1900 }, { "epoch": 8.97, "grad_norm": 15.300110816955566, "learning_rate": 4.03288490284006e-06, "loss": 0.1597, "step": 2000 }, { "epoch": 9.0, "eval_accuracy": 0.9292929172515869, "eval_loss": 0.34383586049079895, "eval_runtime": 9.179, "eval_samples_per_second": 21.571, "eval_steps_per_second": 5.447, "step": 2007 }, { "epoch": 9.42, "grad_norm": 0.4512959420681, "learning_rate": 3.7339312406576984e-06, "loss": 0.1416, "step": 2100 }, { "epoch": 9.87, "grad_norm": 0.7455862760543823, "learning_rate": 3.4349775784753366e-06, "loss": 0.1132, "step": 2200 }, { "epoch": 10.0, "eval_accuracy": 0.9444444179534912, "eval_loss": 0.31586208939552307, "eval_runtime": 9.1631, "eval_samples_per_second": 21.608, "eval_steps_per_second": 5.457, "step": 2230 }, { "epoch": 10.31, "grad_norm": 0.25966259837150574, "learning_rate": 3.136023916292975e-06, "loss": 0.1654, "step": 2300 }, { "epoch": 10.76, "grad_norm": 0.45347365736961365, "learning_rate": 2.8370702541106134e-06, "loss": 0.1224, "step": 2400 }, { "epoch": 11.0, "eval_accuracy": 0.9545454382896423, "eval_loss": 0.29796990752220154, "eval_runtime": 9.1354, "eval_samples_per_second": 21.674, "eval_steps_per_second": 5.473, "step": 2453 }, { "epoch": 11.21, "grad_norm": 27.043094635009766, "learning_rate": 2.538116591928251e-06, "loss": 0.1021, "step": 2500 }, { "epoch": 11.66, "grad_norm": 72.37726593017578, "learning_rate": 2.2391629297458894e-06, "loss": 0.095, "step": 2600 }, { "epoch": 12.0, "eval_accuracy": 0.9444444179534912, "eval_loss": 0.2970119118690491, "eval_runtime": 9.1388, "eval_samples_per_second": 21.666, "eval_steps_per_second": 5.471, "step": 2676 }, { "epoch": 12.11, "grad_norm": 0.6068007946014404, "learning_rate": 1.940209267563528e-06, "loss": 0.1307, "step": 2700 }, { "epoch": 12.56, "grad_norm": 4.567564964294434, "learning_rate": 1.641255605381166e-06, "loss": 0.1087, "step": 2800 }, { "epoch": 13.0, "eval_accuracy": 0.9343434572219849, "eval_loss": 0.34486597776412964, "eval_runtime": 9.3094, "eval_samples_per_second": 21.269, "eval_steps_per_second": 5.371, "step": 2899 }, { "epoch": 13.0, "grad_norm": 41.62958908081055, "learning_rate": 1.3423019431988044e-06, "loss": 0.0917, "step": 2900 }, { "epoch": 13.45, "grad_norm": 0.026164406910538673, "learning_rate": 1.0433482810164425e-06, "loss": 0.0904, "step": 3000 }, { "epoch": 13.9, "grad_norm": 52.47389221191406, "learning_rate": 7.443946188340807e-07, "loss": 0.1254, "step": 3100 }, { "epoch": 14.0, "eval_accuracy": 0.9444444179534912, "eval_loss": 0.31978654861450195, "eval_runtime": 9.2595, "eval_samples_per_second": 21.384, "eval_steps_per_second": 5.4, "step": 3122 }, { "epoch": 14.0, "step": 3122, "total_flos": 7.9842219974856e+17, "train_loss": 0.30023268478028214, "train_runtime": 1742.0026, "train_samples_per_second": 15.336, "train_steps_per_second": 1.92 }, { "epoch": 14.0, "eval_accuracy": 0.9545454382896423, "eval_loss": 0.29796990752220154, "eval_runtime": 8.9823, "eval_samples_per_second": 22.043, "eval_steps_per_second": 5.566, "step": 3122 } ], "logging_steps": 100, "max_steps": 3345, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 500, "total_flos": 7.9842219974856e+17, "train_batch_size": 4, "trial_name": null, "trial_params": null }