{ "best_metric": 0.288575142621994, "best_model_checkpoint": "./fine-tuned/checkpoint-2000", "epoch": 3.99667497921862, "eval_steps": 100, "global_step": 2404, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0831255195344971, "grad_norm": 81058.0859375, "learning_rate": 4.896006655574044e-05, "loss": 1.2803, "step": 50 }, { "epoch": 0.1662510390689942, "grad_norm": 80198.9375, "learning_rate": 4.792013311148087e-05, "loss": 0.6108, "step": 100 }, { "epoch": 0.1662510390689942, "eval_loss": 0.4418589174747467, "eval_runtime": 38.3971, "eval_samples_per_second": 12.866, "eval_steps_per_second": 1.615, "step": 100 }, { "epoch": 0.24937655860349128, "grad_norm": 70821.9921875, "learning_rate": 4.68801996672213e-05, "loss": 0.5174, "step": 150 }, { "epoch": 0.3325020781379884, "grad_norm": 75381.640625, "learning_rate": 4.5840266222961734e-05, "loss": 0.4743, "step": 200 }, { "epoch": 0.3325020781379884, "eval_loss": 0.3926239609718323, "eval_runtime": 38.3601, "eval_samples_per_second": 12.878, "eval_steps_per_second": 1.616, "step": 200 }, { "epoch": 0.41562759767248547, "grad_norm": 87610.4453125, "learning_rate": 4.480033277870216e-05, "loss": 0.4777, "step": 250 }, { "epoch": 0.49875311720698257, "grad_norm": 74392.109375, "learning_rate": 4.3760399334442597e-05, "loss": 0.4536, "step": 300 }, { "epoch": 0.49875311720698257, "eval_loss": 0.36712247133255005, "eval_runtime": 38.3686, "eval_samples_per_second": 12.875, "eval_steps_per_second": 1.616, "step": 300 }, { "epoch": 0.5818786367414797, "grad_norm": 152670.1875, "learning_rate": 4.272046589018303e-05, "loss": 0.4386, "step": 350 }, { "epoch": 0.6650041562759768, "grad_norm": 59777.48046875, "learning_rate": 4.1680532445923466e-05, "loss": 0.449, "step": 400 }, { "epoch": 0.6650041562759768, "eval_loss": 0.3487951159477234, "eval_runtime": 38.1805, "eval_samples_per_second": 12.939, "eval_steps_per_second": 1.624, "step": 400 }, { "epoch": 0.7481296758104738, "grad_norm": 60153.67578125, "learning_rate": 4.06405990016639e-05, "loss": 0.4116, "step": 450 }, { "epoch": 0.8312551953449709, "grad_norm": 60111.30078125, "learning_rate": 3.960066555740433e-05, "loss": 0.4145, "step": 500 }, { "epoch": 0.8312551953449709, "eval_loss": 0.3375319242477417, "eval_runtime": 38.2231, "eval_samples_per_second": 12.924, "eval_steps_per_second": 1.622, "step": 500 }, { "epoch": 0.914380714879468, "grad_norm": 60626.75, "learning_rate": 3.856073211314476e-05, "loss": 0.405, "step": 550 }, { "epoch": 0.9975062344139651, "grad_norm": 51910.04296875, "learning_rate": 3.752079866888519e-05, "loss": 0.4186, "step": 600 }, { "epoch": 0.9975062344139651, "eval_loss": 0.3285529315471649, "eval_runtime": 38.3187, "eval_samples_per_second": 12.892, "eval_steps_per_second": 1.618, "step": 600 }, { "epoch": 1.0806317539484622, "grad_norm": 48182.8046875, "learning_rate": 3.6480865224625625e-05, "loss": 0.3925, "step": 650 }, { "epoch": 1.1637572734829593, "grad_norm": 51930.34765625, "learning_rate": 3.544093178036606e-05, "loss": 0.3705, "step": 700 }, { "epoch": 1.1637572734829593, "eval_loss": 0.3227428197860718, "eval_runtime": 38.1185, "eval_samples_per_second": 12.96, "eval_steps_per_second": 1.627, "step": 700 }, { "epoch": 1.2468827930174564, "grad_norm": 60283.91015625, "learning_rate": 3.4400998336106495e-05, "loss": 0.3823, "step": 750 }, { "epoch": 1.3300083125519535, "grad_norm": 55843.62109375, "learning_rate": 3.336106489184692e-05, "loss": 0.3763, "step": 800 }, { "epoch": 1.3300083125519535, "eval_loss": 0.3174193501472473, "eval_runtime": 38.4517, "eval_samples_per_second": 12.847, "eval_steps_per_second": 1.612, "step": 800 }, { "epoch": 1.4131338320864506, "grad_norm": 40623.88671875, "learning_rate": 3.232113144758736e-05, "loss": 0.3509, "step": 850 }, { "epoch": 1.4962593516209477, "grad_norm": 57212.0546875, "learning_rate": 3.128119800332779e-05, "loss": 0.3624, "step": 900 }, { "epoch": 1.4962593516209477, "eval_loss": 0.31218209862709045, "eval_runtime": 38.1558, "eval_samples_per_second": 12.947, "eval_steps_per_second": 1.625, "step": 900 }, { "epoch": 1.5793848711554448, "grad_norm": 49043.25390625, "learning_rate": 3.0241264559068223e-05, "loss": 0.3656, "step": 950 }, { "epoch": 1.6625103906899419, "grad_norm": 50702.92578125, "learning_rate": 2.9201331114808654e-05, "loss": 0.3741, "step": 1000 }, { "epoch": 1.6625103906899419, "eval_loss": 0.3063213527202606, "eval_runtime": 38.2912, "eval_samples_per_second": 12.901, "eval_steps_per_second": 1.619, "step": 1000 }, { "epoch": 1.745635910224439, "grad_norm": 57559.86328125, "learning_rate": 2.816139767054909e-05, "loss": 0.3649, "step": 1050 }, { "epoch": 1.828761429758936, "grad_norm": 54892.80078125, "learning_rate": 2.7121464226289517e-05, "loss": 0.3518, "step": 1100 }, { "epoch": 1.828761429758936, "eval_loss": 0.30387237668037415, "eval_runtime": 38.3202, "eval_samples_per_second": 12.891, "eval_steps_per_second": 1.618, "step": 1100 }, { "epoch": 1.9118869492934332, "grad_norm": 56018.88671875, "learning_rate": 2.608153078202995e-05, "loss": 0.3515, "step": 1150 }, { "epoch": 1.9950124688279303, "grad_norm": 46875.38671875, "learning_rate": 2.5041597337770382e-05, "loss": 0.3459, "step": 1200 }, { "epoch": 1.9950124688279303, "eval_loss": 0.29989051818847656, "eval_runtime": 38.3688, "eval_samples_per_second": 12.875, "eval_steps_per_second": 1.616, "step": 1200 }, { "epoch": 2.0781379883624274, "grad_norm": 46399.90234375, "learning_rate": 2.4001663893510817e-05, "loss": 0.3293, "step": 1250 }, { "epoch": 2.1612635078969245, "grad_norm": 57348.53515625, "learning_rate": 2.296173044925125e-05, "loss": 0.3387, "step": 1300 }, { "epoch": 2.1612635078969245, "eval_loss": 0.29858091473579407, "eval_runtime": 38.3821, "eval_samples_per_second": 12.871, "eval_steps_per_second": 1.615, "step": 1300 }, { "epoch": 2.2443890274314215, "grad_norm": 32873.6796875, "learning_rate": 2.1921797004991683e-05, "loss": 0.3366, "step": 1350 }, { "epoch": 2.3275145469659186, "grad_norm": 47216.03125, "learning_rate": 2.0881863560732114e-05, "loss": 0.3528, "step": 1400 }, { "epoch": 2.3275145469659186, "eval_loss": 0.29636165499687195, "eval_runtime": 38.2663, "eval_samples_per_second": 12.91, "eval_steps_per_second": 1.62, "step": 1400 }, { "epoch": 2.4106400665004157, "grad_norm": 52153.40234375, "learning_rate": 1.9841930116472545e-05, "loss": 0.3379, "step": 1450 }, { "epoch": 2.493765586034913, "grad_norm": 57533.625, "learning_rate": 1.880199667221298e-05, "loss": 0.3175, "step": 1500 }, { "epoch": 2.493765586034913, "eval_loss": 0.29390034079551697, "eval_runtime": 38.2441, "eval_samples_per_second": 12.917, "eval_steps_per_second": 1.621, "step": 1500 }, { "epoch": 2.57689110556941, "grad_norm": 42211.703125, "learning_rate": 1.776206322795341e-05, "loss": 0.3199, "step": 1550 }, { "epoch": 2.660016625103907, "grad_norm": 41404.1953125, "learning_rate": 1.6722129783693842e-05, "loss": 0.3321, "step": 1600 }, { "epoch": 2.660016625103907, "eval_loss": 0.2914765477180481, "eval_runtime": 38.3949, "eval_samples_per_second": 12.866, "eval_steps_per_second": 1.615, "step": 1600 }, { "epoch": 2.743142144638404, "grad_norm": 65816.5546875, "learning_rate": 1.5682196339434277e-05, "loss": 0.3314, "step": 1650 }, { "epoch": 2.826267664172901, "grad_norm": 80084.921875, "learning_rate": 1.464226289517471e-05, "loss": 0.3163, "step": 1700 }, { "epoch": 2.826267664172901, "eval_loss": 0.29080235958099365, "eval_runtime": 38.4641, "eval_samples_per_second": 12.843, "eval_steps_per_second": 1.612, "step": 1700 }, { "epoch": 2.9093931837073983, "grad_norm": 41843.58203125, "learning_rate": 1.3602329450915141e-05, "loss": 0.3334, "step": 1750 }, { "epoch": 2.9925187032418954, "grad_norm": 46427.6796875, "learning_rate": 1.2562396006655574e-05, "loss": 0.3196, "step": 1800 }, { "epoch": 2.9925187032418954, "eval_loss": 0.2890726923942566, "eval_runtime": 38.2139, "eval_samples_per_second": 12.927, "eval_steps_per_second": 1.622, "step": 1800 }, { "epoch": 3.0756442227763925, "grad_norm": 45502.953125, "learning_rate": 1.1522462562396007e-05, "loss": 0.3226, "step": 1850 }, { "epoch": 3.1587697423108896, "grad_norm": 57836.49609375, "learning_rate": 1.048252911813644e-05, "loss": 0.3216, "step": 1900 }, { "epoch": 3.1587697423108896, "eval_loss": 0.2888147532939911, "eval_runtime": 38.2133, "eval_samples_per_second": 12.927, "eval_steps_per_second": 1.622, "step": 1900 }, { "epoch": 3.2418952618453867, "grad_norm": 51613.7734375, "learning_rate": 9.442595673876873e-06, "loss": 0.3169, "step": 1950 }, { "epoch": 3.3250207813798838, "grad_norm": 53367.69921875, "learning_rate": 8.402662229617304e-06, "loss": 0.3183, "step": 2000 }, { "epoch": 3.3250207813798838, "eval_loss": 0.288575142621994, "eval_runtime": 38.3741, "eval_samples_per_second": 12.873, "eval_steps_per_second": 1.616, "step": 2000 }, { "epoch": 3.408146300914381, "grad_norm": 48414.92578125, "learning_rate": 7.362728785357738e-06, "loss": 0.3053, "step": 2050 }, { "epoch": 3.491271820448878, "grad_norm": 56748.72265625, "learning_rate": 6.32279534109817e-06, "loss": 0.3253, "step": 2100 }, { "epoch": 3.491271820448878, "eval_loss": 0.28712254762649536, "eval_runtime": 38.3593, "eval_samples_per_second": 12.878, "eval_steps_per_second": 1.616, "step": 2100 }, { "epoch": 3.574397339983375, "grad_norm": 52447.25390625, "learning_rate": 5.282861896838603e-06, "loss": 0.3188, "step": 2150 }, { "epoch": 3.657522859517872, "grad_norm": 47730.953125, "learning_rate": 4.242928452579035e-06, "loss": 0.3041, "step": 2200 }, { "epoch": 3.657522859517872, "eval_loss": 0.2862567901611328, "eval_runtime": 38.2735, "eval_samples_per_second": 12.907, "eval_steps_per_second": 1.62, "step": 2200 }, { "epoch": 3.7406483790523692, "grad_norm": 48164.21484375, "learning_rate": 3.2029950083194676e-06, "loss": 0.3013, "step": 2250 }, { "epoch": 3.8237738985868663, "grad_norm": 44376.41015625, "learning_rate": 2.1630615640599005e-06, "loss": 0.3166, "step": 2300 }, { "epoch": 3.8237738985868663, "eval_loss": 0.2858003079891205, "eval_runtime": 38.3193, "eval_samples_per_second": 12.892, "eval_steps_per_second": 1.618, "step": 2300 }, { "epoch": 3.9068994181213634, "grad_norm": 53910.06640625, "learning_rate": 1.1231281198003328e-06, "loss": 0.3034, "step": 2350 }, { "epoch": 3.9900249376558605, "grad_norm": 36512.5546875, "learning_rate": 8.319467554076539e-08, "loss": 0.2983, "step": 2400 }, { "epoch": 3.9900249376558605, "eval_loss": 0.2859017550945282, "eval_runtime": 38.2723, "eval_samples_per_second": 12.908, "eval_steps_per_second": 1.62, "step": 2400 } ], "logging_steps": 50, "max_steps": 2404, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.342112942882816e+16, "train_batch_size": 8, "trial_name": null, "trial_params": null }