{ "best_metric": 0.31690406799316406, "best_model_checkpoint": "/kaggle/working/hubert-agum960-amharic/checkpoint-3500", "epoch": 5.0, "eval_steps": 500, "global_step": 3760, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.13297872340425532, "grad_norm": 2.615074396133423, "learning_rate": 9.73404255319149e-06, "loss": 1.5743, "step": 100 }, { "epoch": 0.26595744680851063, "grad_norm": 2.4918744564056396, "learning_rate": 9.46808510638298e-06, "loss": 1.3461, "step": 200 }, { "epoch": 0.39893617021276595, "grad_norm": 2.8701541423797607, "learning_rate": 9.20212765957447e-06, "loss": 1.0072, "step": 300 }, { "epoch": 0.5319148936170213, "grad_norm": 12.351414680480957, "learning_rate": 8.938829787234043e-06, "loss": 0.7362, "step": 400 }, { "epoch": 0.6648936170212766, "grad_norm": 8.005541801452637, "learning_rate": 8.672872340425533e-06, "loss": 0.5275, "step": 500 }, { "epoch": 0.6648936170212766, "eval_accuracy": 0.8482810258865356, "eval_loss": 0.4739724397659302, "eval_runtime": 120.9505, "eval_samples_per_second": 11.062, "eval_steps_per_second": 1.389, "step": 500 }, { "epoch": 0.7978723404255319, "grad_norm": 4.672068119049072, "learning_rate": 8.406914893617022e-06, "loss": 0.4446, "step": 600 }, { "epoch": 0.9308510638297872, "grad_norm": 7.660475254058838, "learning_rate": 8.140957446808512e-06, "loss": 0.3683, "step": 700 }, { "epoch": 1.0638297872340425, "grad_norm": 3.2801108360290527, "learning_rate": 7.877659574468086e-06, "loss": 0.3025, "step": 800 }, { "epoch": 1.196808510638298, "grad_norm": 11.70166015625, "learning_rate": 7.61436170212766e-06, "loss": 0.2542, "step": 900 }, { "epoch": 1.3297872340425532, "grad_norm": 10.051458358764648, "learning_rate": 7.348404255319149e-06, "loss": 0.2923, "step": 1000 }, { "epoch": 1.3297872340425532, "eval_accuracy": 0.8998505473136902, "eval_loss": 0.3727114200592041, "eval_runtime": 121.4022, "eval_samples_per_second": 11.021, "eval_steps_per_second": 1.384, "step": 1000 }, { "epoch": 1.4627659574468086, "grad_norm": 2.711136817932129, "learning_rate": 7.0824468085106394e-06, "loss": 0.2151, "step": 1100 }, { "epoch": 1.5957446808510638, "grad_norm": 3.505892038345337, "learning_rate": 6.816489361702127e-06, "loss": 0.2303, "step": 1200 }, { "epoch": 1.728723404255319, "grad_norm": 0.5282111763954163, "learning_rate": 6.550531914893618e-06, "loss": 0.2323, "step": 1300 }, { "epoch": 1.8617021276595744, "grad_norm": 6.366661071777344, "learning_rate": 6.284574468085107e-06, "loss": 0.2029, "step": 1400 }, { "epoch": 1.9946808510638299, "grad_norm": 17.721065521240234, "learning_rate": 6.018617021276596e-06, "loss": 0.2168, "step": 1500 }, { "epoch": 1.9946808510638299, "eval_accuracy": 0.9073243737220764, "eval_loss": 0.41751202940940857, "eval_runtime": 120.4054, "eval_samples_per_second": 11.112, "eval_steps_per_second": 1.395, "step": 1500 }, { "epoch": 2.127659574468085, "grad_norm": 29.569236755371094, "learning_rate": 5.752659574468086e-06, "loss": 0.181, "step": 1600 }, { "epoch": 2.2606382978723403, "grad_norm": 3.7976977825164795, "learning_rate": 5.4867021276595745e-06, "loss": 0.1902, "step": 1700 }, { "epoch": 2.393617021276596, "grad_norm": 5.339010715484619, "learning_rate": 5.220744680851064e-06, "loss": 0.1711, "step": 1800 }, { "epoch": 2.526595744680851, "grad_norm": 20.87778091430664, "learning_rate": 4.954787234042554e-06, "loss": 0.1889, "step": 1900 }, { "epoch": 2.6595744680851063, "grad_norm": 8.609652519226074, "learning_rate": 4.6888297872340425e-06, "loss": 0.1442, "step": 2000 }, { "epoch": 2.6595744680851063, "eval_accuracy": 0.9312406778335571, "eval_loss": 0.3470732569694519, "eval_runtime": 121.4455, "eval_samples_per_second": 11.017, "eval_steps_per_second": 1.383, "step": 2000 }, { "epoch": 2.7925531914893615, "grad_norm": 10.679058074951172, "learning_rate": 4.422872340425532e-06, "loss": 0.1706, "step": 2100 }, { "epoch": 2.925531914893617, "grad_norm": 0.32079407572746277, "learning_rate": 4.156914893617022e-06, "loss": 0.1333, "step": 2200 }, { "epoch": 3.0585106382978724, "grad_norm": 0.19917072355747223, "learning_rate": 3.890957446808511e-06, "loss": 0.1414, "step": 2300 }, { "epoch": 3.1914893617021276, "grad_norm": 3.178858757019043, "learning_rate": 3.625e-06, "loss": 0.1217, "step": 2400 }, { "epoch": 3.324468085106383, "grad_norm": 4.0875020027160645, "learning_rate": 3.3590425531914896e-06, "loss": 0.1341, "step": 2500 }, { "epoch": 3.324468085106383, "eval_accuracy": 0.927503764629364, "eval_loss": 0.3641144931316376, "eval_runtime": 120.3344, "eval_samples_per_second": 11.119, "eval_steps_per_second": 1.396, "step": 2500 }, { "epoch": 3.4574468085106385, "grad_norm": 2.2575433254241943, "learning_rate": 3.0957446808510637e-06, "loss": 0.1179, "step": 2600 }, { "epoch": 3.5904255319148937, "grad_norm": 2.2771668434143066, "learning_rate": 2.8297872340425537e-06, "loss": 0.1247, "step": 2700 }, { "epoch": 3.723404255319149, "grad_norm": 0.28244930505752563, "learning_rate": 2.563829787234043e-06, "loss": 0.1186, "step": 2800 }, { "epoch": 3.8563829787234045, "grad_norm": 10.92738151550293, "learning_rate": 2.297872340425532e-06, "loss": 0.1487, "step": 2900 }, { "epoch": 3.9893617021276597, "grad_norm": 3.5330650806427, "learning_rate": 2.0319148936170213e-06, "loss": 0.1136, "step": 3000 }, { "epoch": 3.9893617021276597, "eval_accuracy": 0.9431988000869751, "eval_loss": 0.3285652995109558, "eval_runtime": 121.4946, "eval_samples_per_second": 11.013, "eval_steps_per_second": 1.383, "step": 3000 }, { "epoch": 4.122340425531915, "grad_norm": 0.24657931923866272, "learning_rate": 1.7659574468085109e-06, "loss": 0.1157, "step": 3100 }, { "epoch": 4.25531914893617, "grad_norm": 4.8149800300598145, "learning_rate": 1.5e-06, "loss": 0.1127, "step": 3200 }, { "epoch": 4.388297872340425, "grad_norm": 12.910594940185547, "learning_rate": 1.2340425531914894e-06, "loss": 0.1159, "step": 3300 }, { "epoch": 4.5212765957446805, "grad_norm": 0.07863473147153854, "learning_rate": 9.680851063829788e-07, "loss": 0.1177, "step": 3400 }, { "epoch": 4.654255319148936, "grad_norm": 8.249691009521484, "learning_rate": 7.021276595744682e-07, "loss": 0.1102, "step": 3500 }, { "epoch": 4.654255319148936, "eval_accuracy": 0.939461886882782, "eval_loss": 0.31690406799316406, "eval_runtime": 121.4032, "eval_samples_per_second": 11.021, "eval_steps_per_second": 1.384, "step": 3500 }, { "epoch": 4.787234042553192, "grad_norm": 0.02278612181544304, "learning_rate": 4.361702127659575e-07, "loss": 0.0957, "step": 3600 }, { "epoch": 4.920212765957447, "grad_norm": 11.251379013061523, "learning_rate": 1.7021276595744683e-07, "loss": 0.1042, "step": 3700 }, { "epoch": 5.0, "step": 3760, "total_flos": 7.814523952420126e+18, "train_loss": 0.29202749221882923, "train_runtime": 8842.7154, "train_samples_per_second": 6.795, "train_steps_per_second": 0.425 } ], "logging_steps": 100, "max_steps": 3760, "num_input_tokens_seen": 0, "num_train_epochs": 5, "save_steps": 500, "stateful_callbacks": { "EarlyStoppingCallback": { "args": { "early_stopping_patience": 3, "early_stopping_threshold": 0.0 }, "attributes": { "early_stopping_patience_counter": 0 } }, "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.814523952420126e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }