{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.8615384615384616, "eval_steps": 500, "global_step": 96, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.08205128205128205, "grad_norm": 11.399080276489258, "learning_rate": 0.00019583333333333334, "loss": 75.6247, "step": 2 }, { "epoch": 0.1641025641025641, "grad_norm": 17.320436477661133, "learning_rate": 0.00019166666666666667, "loss": 71.7205, "step": 4 }, { "epoch": 0.24615384615384617, "grad_norm": null, "learning_rate": 0.00018958333333333332, "loss": 65.3071, "step": 6 }, { "epoch": 0.3282051282051282, "grad_norm": 17.457279205322266, "learning_rate": 0.00018541666666666668, "loss": 63.1094, "step": 8 }, { "epoch": 0.41025641025641024, "grad_norm": 20.812185287475586, "learning_rate": 0.00018125000000000001, "loss": 60.4665, "step": 10 }, { "epoch": 0.49230769230769234, "grad_norm": 23.849903106689453, "learning_rate": 0.00017708333333333335, "loss": 56.8296, "step": 12 }, { "epoch": 0.5743589743589743, "grad_norm": 14.324117660522461, "learning_rate": 0.00017291666666666668, "loss": 55.1421, "step": 14 }, { "epoch": 0.6564102564102564, "grad_norm": 8.510538101196289, "learning_rate": 0.00016875, "loss": 54.8105, "step": 16 }, { "epoch": 0.7384615384615385, "grad_norm": 6.031883239746094, "learning_rate": 0.00016458333333333334, "loss": 52.9411, "step": 18 }, { "epoch": 0.8205128205128205, "grad_norm": 6.2261061668396, "learning_rate": 0.00016041666666666667, "loss": 53.788, "step": 20 }, { "epoch": 0.9025641025641026, "grad_norm": 8.355509757995605, "learning_rate": 0.00015625, "loss": 52.0403, "step": 22 }, { "epoch": 0.9846153846153847, "grad_norm": 10.010218620300293, "learning_rate": 0.00015208333333333333, "loss": 53.6758, "step": 24 }, { "epoch": 1.041025641025641, "grad_norm": 5.320664882659912, "learning_rate": 0.0001479166666666667, "loss": 35.4336, "step": 26 }, { "epoch": 1.123076923076923, "grad_norm": 
6.9148478507995605, "learning_rate": 0.00014375, "loss": 52.6291, "step": 28 }, { "epoch": 1.205128205128205, "grad_norm": 4.67191743850708, "learning_rate": 0.00013958333333333333, "loss": 52.2121, "step": 30 }, { "epoch": 1.287179487179487, "grad_norm": 7.0961689949035645, "learning_rate": 0.0001354166666666667, "loss": 51.0249, "step": 32 }, { "epoch": 1.3692307692307693, "grad_norm": 11.30848217010498, "learning_rate": 0.00013125000000000002, "loss": 53.0805, "step": 34 }, { "epoch": 1.4512820512820512, "grad_norm": 6.0200419425964355, "learning_rate": 0.00012708333333333332, "loss": 52.6946, "step": 36 }, { "epoch": 1.5333333333333332, "grad_norm": 5.744126319885254, "learning_rate": 0.00012291666666666668, "loss": 51.4448, "step": 38 }, { "epoch": 1.6153846153846154, "grad_norm": 4.301318645477295, "learning_rate": 0.00011875, "loss": 51.0796, "step": 40 }, { "epoch": 1.6974358974358974, "grad_norm": 6.739460468292236, "learning_rate": 0.00011458333333333333, "loss": 52.3609, "step": 42 }, { "epoch": 1.7794871794871794, "grad_norm": 3.347414493560791, "learning_rate": 0.00011041666666666668, "loss": 51.6531, "step": 44 }, { "epoch": 1.8615384615384616, "grad_norm": 7.933114528656006, "learning_rate": 0.00010625000000000001, "loss": 51.9316, "step": 46 }, { "epoch": 1.9435897435897436, "grad_norm": 4.68740177154541, "learning_rate": 0.00010208333333333333, "loss": 53.1193, "step": 48 }, { "epoch": 2.0, "grad_norm": 3.4068479537963867, "learning_rate": 9.791666666666667e-05, "loss": 35.3235, "step": 50 }, { "epoch": 2.082051282051282, "grad_norm": 8.956518173217773, "learning_rate": 9.375e-05, "loss": 51.9865, "step": 52 }, { "epoch": 2.164102564102564, "grad_norm": 6.310459613800049, "learning_rate": 8.958333333333335e-05, "loss": 51.7456, "step": 54 }, { "epoch": 2.246153846153846, "grad_norm": 8.6781644821167, "learning_rate": 8.541666666666666e-05, "loss": 52.5304, "step": 56 }, { "epoch": 2.3282051282051284, "grad_norm": 3.7787671089172363, 
"learning_rate": 8.125000000000001e-05, "loss": 51.84, "step": 58 }, { "epoch": 2.41025641025641, "grad_norm": 7.715843200683594, "learning_rate": 7.708333333333334e-05, "loss": 51.1438, "step": 60 }, { "epoch": 2.4923076923076923, "grad_norm": 9.50344181060791, "learning_rate": 7.291666666666667e-05, "loss": 50.2017, "step": 62 }, { "epoch": 2.574358974358974, "grad_norm": 5.091357231140137, "learning_rate": 6.875e-05, "loss": 52.1048, "step": 64 }, { "epoch": 2.6564102564102563, "grad_norm": 6.4015421867370605, "learning_rate": 6.458333333333334e-05, "loss": 52.0365, "step": 66 }, { "epoch": 2.7384615384615385, "grad_norm": 3.156503438949585, "learning_rate": 6.041666666666667e-05, "loss": 51.8669, "step": 68 }, { "epoch": 2.8205128205128203, "grad_norm": 5.185201644897461, "learning_rate": 5.6250000000000005e-05, "loss": 50.7857, "step": 70 }, { "epoch": 2.9025641025641025, "grad_norm": 3.295457601547241, "learning_rate": 5.208333333333334e-05, "loss": 51.0927, "step": 72 }, { "epoch": 2.9846153846153847, "grad_norm": 5.918073654174805, "learning_rate": 4.791666666666667e-05, "loss": 52.483, "step": 74 }, { "epoch": 3.041025641025641, "grad_norm": 6.10349702835083, "learning_rate": 4.375e-05, "loss": 35.2321, "step": 76 }, { "epoch": 3.123076923076923, "grad_norm": 4.493152618408203, "learning_rate": 3.958333333333333e-05, "loss": 51.6482, "step": 78 }, { "epoch": 3.2051282051282053, "grad_norm": 3.8696682453155518, "learning_rate": 3.541666666666667e-05, "loss": 52.3785, "step": 80 }, { "epoch": 3.287179487179487, "grad_norm": 4.229553699493408, "learning_rate": 3.125e-05, "loss": 51.059, "step": 82 }, { "epoch": 3.3692307692307693, "grad_norm": 5.5054779052734375, "learning_rate": 2.7083333333333332e-05, "loss": 52.1417, "step": 84 }, { "epoch": 3.4512820512820515, "grad_norm": 8.53302001953125, "learning_rate": 2.2916666666666667e-05, "loss": 51.1943, "step": 86 }, { "epoch": 3.533333333333333, "grad_norm": 6.278346538543701, "learning_rate": 
1.8750000000000002e-05, "loss": 51.0593, "step": 88 }, { "epoch": 3.6153846153846154, "grad_norm": 3.9308125972747803, "learning_rate": 1.4583333333333335e-05, "loss": 51.8184, "step": 90 }, { "epoch": 3.6974358974358976, "grad_norm": 4.1439971923828125, "learning_rate": 1.0416666666666668e-05, "loss": 52.4379, "step": 92 }, { "epoch": 3.7794871794871794, "grad_norm": 5.464842796325684, "learning_rate": 6.25e-06, "loss": 51.1109, "step": 94 }, { "epoch": 3.8615384615384616, "grad_norm": 2.955758571624756, "learning_rate": 2.0833333333333334e-06, "loss": 50.9587, "step": 96 }, { "epoch": 3.8615384615384616, "step": 96, "total_flos": 943340021512272.0, "train_loss": 52.71458212534586, "train_runtime": 1936.9788, "train_samples_per_second": 0.805, "train_steps_per_second": 0.05 } ], "logging_steps": 2, "max_steps": 96, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 943340021512272.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }