{ "best_metric": 0.8222477064220184, "best_model_checkpoint": "tiny-bert-sst2/run-0/checkpoint-1000", "epoch": 4.743833017077799, "eval_steps": 100, "global_step": 2500, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18975332068311196, "grad_norm": 1.6602848768234253, "learning_rate": 0.0001435562651011864, "loss": 0.816, "step": 100 }, { "epoch": 0.18975332068311196, "eval_accuracy": 0.7454128440366973, "eval_loss": 0.6589266061782837, "eval_runtime": 2.4211, "eval_samples_per_second": 360.162, "eval_steps_per_second": 2.891, "step": 100 }, { "epoch": 0.3795066413662239, "grad_norm": 3.0019118785858154, "learning_rate": 0.00015910431045764318, "loss": 0.5566, "step": 200 }, { "epoch": 0.3795066413662239, "eval_accuracy": 0.7878440366972477, "eval_loss": 0.5643345713615417, "eval_runtime": 2.456, "eval_samples_per_second": 355.042, "eval_steps_per_second": 2.85, "step": 200 }, { "epoch": 0.5692599620493358, "grad_norm": 2.3228344917297363, "learning_rate": 0.00015825291603624946, "loss": 0.4541, "step": 300 }, { "epoch": 0.5692599620493358, "eval_accuracy": 0.7981651376146789, "eval_loss": 0.5387669205665588, "eval_runtime": 2.4565, "eval_samples_per_second": 354.981, "eval_steps_per_second": 2.85, "step": 300 }, { "epoch": 0.7590132827324478, "grad_norm": 3.891537666320801, "learning_rate": 0.00015679611440991184, "loss": 0.4115, "step": 400 }, { "epoch": 0.7590132827324478, "eval_accuracy": 0.8096330275229358, "eval_loss": 0.5092660784721375, "eval_runtime": 2.4451, "eval_samples_per_second": 356.629, "eval_steps_per_second": 2.863, "step": 400 }, { "epoch": 0.9487666034155597, "grad_norm": 3.581057548522949, "learning_rate": 0.0001547451293926331, "loss": 0.3816, "step": 500 }, { "epoch": 0.9487666034155597, "eval_accuracy": 0.8153669724770642, "eval_loss": 0.5169694423675537, "eval_runtime": 2.444, "eval_samples_per_second": 356.797, "eval_steps_per_second": 2.864, "step": 500 }, { "epoch": 1.1385199240986716, "grad_norm": 2.517819881439209, "learning_rate": 0.00015211576263780258, "loss": 0.3336, "step": 600 }, { "epoch": 1.1385199240986716, "eval_accuracy": 0.8107798165137615, "eval_loss": 0.528367280960083, "eval_runtime": 2.4308, "eval_samples_per_second": 358.732, "eval_steps_per_second": 2.88, "step": 600 }, { "epoch": 1.3282732447817835, "grad_norm": 1.765392780303955, "learning_rate": 0.00014892827189559552, "loss": 0.2972, "step": 700 }, { "epoch": 1.3282732447817835, "eval_accuracy": 0.8142201834862385, "eval_loss": 0.5347566604614258, "eval_runtime": 2.4351, "eval_samples_per_second": 358.094, "eval_steps_per_second": 2.875, "step": 700 }, { "epoch": 1.5180265654648957, "grad_norm": 3.401026725769043, "learning_rate": 0.00014520721493872392, "loss": 0.282, "step": 800 }, { "epoch": 1.5180265654648957, "eval_accuracy": 0.8245412844036697, "eval_loss": 0.5643708109855652, "eval_runtime": 2.5012, "eval_samples_per_second": 348.628, "eval_steps_per_second": 2.799, "step": 800 }, { "epoch": 1.7077798861480076, "grad_norm": 3.1617627143859863, "learning_rate": 0.0001409812603590005, "loss": 0.2792, "step": 900 }, { "epoch": 1.7077798861480076, "eval_accuracy": 0.8256880733944955, "eval_loss": 0.5595521926879883, "eval_runtime": 2.4305, "eval_samples_per_second": 358.773, "eval_steps_per_second": 2.88, "step": 900 }, { "epoch": 1.8975332068311195, "grad_norm": 5.014317035675049, "learning_rate": 0.00013628296669241911, "loss": 0.2794, "step": 1000 }, { "epoch": 1.8975332068311195, "eval_accuracy": 0.8222477064220184, "eval_loss": 0.5535959601402283, "eval_runtime": 2.4299, "eval_samples_per_second": 358.861, "eval_steps_per_second": 2.881, "step": 1000 }, { "epoch": 2.0872865275142316, "grad_norm": 2.2261252403259277, "learning_rate": 0.0001311485315744647, "loss": 0.2433, "step": 1100 }, { "epoch": 2.0872865275142316, "eval_accuracy": 0.823394495412844, "eval_loss": 0.5890715718269348, "eval_runtime": 2.4516, "eval_samples_per_second": 355.679, "eval_steps_per_second": 2.855, "step": 1100 }, { "epoch": 2.2770398481973433, "grad_norm": 2.7825124263763428, "learning_rate": 0.00012561751285826656, "loss": 0.2166, "step": 1200 }, { "epoch": 2.2770398481973433, "eval_accuracy": 0.8165137614678899, "eval_loss": 0.6120957136154175, "eval_runtime": 2.4345, "eval_samples_per_second": 358.192, "eval_steps_per_second": 2.875, "step": 1200 }, { "epoch": 2.4667931688804554, "grad_norm": 1.830217957496643, "learning_rate": 0.00011973252384421784, "loss": 0.2272, "step": 1300 }, { "epoch": 2.4667931688804554, "eval_accuracy": 0.8211009174311926, "eval_loss": 0.6535155177116394, "eval_runtime": 2.4697, "eval_samples_per_second": 353.075, "eval_steps_per_second": 2.834, "step": 1300 }, { "epoch": 2.656546489563567, "grad_norm": 3.0946052074432373, "learning_rate": 0.00011353890496914119, "loss": 0.2213, "step": 1400 }, { "epoch": 2.656546489563567, "eval_accuracy": 0.8211009174311926, "eval_loss": 0.6385691165924072, "eval_runtime": 2.4251, "eval_samples_per_second": 359.567, "eval_steps_per_second": 2.886, "step": 1400 }, { "epoch": 2.846299810246679, "grad_norm": 2.369713544845581, "learning_rate": 0.00010708437448444503, "loss": 0.2243, "step": 1500 }, { "epoch": 2.846299810246679, "eval_accuracy": 0.8165137614678899, "eval_loss": 0.6380329728126526, "eval_runtime": 2.4214, "eval_samples_per_second": 360.121, "eval_steps_per_second": 2.891, "step": 1500 }, { "epoch": 3.0360531309297913, "grad_norm": 2.249871015548706, "learning_rate": 0.00010041866081459412, "loss": 0.2022, "step": 1600 }, { "epoch": 3.0360531309297913, "eval_accuracy": 0.8256880733944955, "eval_loss": 0.631977379322052, "eval_runtime": 2.5009, "eval_samples_per_second": 348.677, "eval_steps_per_second": 2.799, "step": 1600 }, { "epoch": 3.225806451612903, "grad_norm": 1.9010361433029175, "learning_rate": 9.359311942835884e-05, "loss": 0.1692, "step": 1700 }, { "epoch": 3.225806451612903, "eval_accuracy": 0.8188073394495413, "eval_loss": 0.6348777413368225, "eval_runtime": 2.4316, "eval_samples_per_second": 358.616, "eval_steps_per_second": 2.879, "step": 1700 }, { "epoch": 3.415559772296015, "grad_norm": 2.311347246170044, "learning_rate": 8.666033717462946e-05, "loss": 0.1771, "step": 1800 }, { "epoch": 3.415559772296015, "eval_accuracy": 0.8142201834862385, "eval_loss": 0.6402046084403992, "eval_runtime": 2.4446, "eval_samples_per_second": 356.707, "eval_steps_per_second": 2.863, "step": 1800 }, { "epoch": 3.6053130929791273, "grad_norm": 3.018489360809326, "learning_rate": 7.967372713115845e-05, "loss": 0.1895, "step": 1900 }, { "epoch": 3.6053130929791273, "eval_accuracy": 0.8165137614678899, "eval_loss": 0.6510571837425232, "eval_runtime": 2.4399, "eval_samples_per_second": 357.39, "eval_steps_per_second": 2.869, "step": 1900 }, { "epoch": 3.795066413662239, "grad_norm": 2.140141010284424, "learning_rate": 7.268711708768742e-05, "loss": 0.1813, "step": 2000 }, { "epoch": 3.795066413662239, "eval_accuracy": 0.8130733944954128, "eval_loss": 0.6792920231819153, "eval_runtime": 2.4818, "eval_samples_per_second": 351.356, "eval_steps_per_second": 2.821, "step": 2000 }, { "epoch": 3.984819734345351, "grad_norm": 3.712886095046997, "learning_rate": 6.575433483395807e-05, "loss": 0.18, "step": 2100 }, { "epoch": 3.984819734345351, "eval_accuracy": 0.8314220183486238, "eval_loss": 0.6877826452255249, "eval_runtime": 2.4747, "eval_samples_per_second": 352.367, "eval_steps_per_second": 2.829, "step": 2100 }, { "epoch": 4.174573055028463, "grad_norm": 4.451997756958008, "learning_rate": 5.892879344772279e-05, "loss": 0.1531, "step": 2200 }, { "epoch": 4.174573055028463, "eval_accuracy": 0.8188073394495413, "eval_loss": 0.7583606243133545, "eval_runtime": 2.4344, "eval_samples_per_second": 358.204, "eval_steps_per_second": 2.875, "step": 2200 }, { "epoch": 4.364326375711575, "grad_norm": 2.2267160415649414, "learning_rate": 5.226307977787188e-05, "loss": 0.1562, "step": 2300 }, { "epoch": 4.364326375711575, "eval_accuracy": 0.8176605504587156, "eval_loss": 0.7094950675964355, "eval_runtime": 2.4247, "eval_samples_per_second": 359.631, "eval_steps_per_second": 2.887, "step": 2300 }, { "epoch": 4.554079696394687, "grad_norm": 2.2096240520477295, "learning_rate": 4.580854929317572e-05, "loss": 0.1576, "step": 2400 }, { "epoch": 4.554079696394687, "eval_accuracy": 0.8222477064220184, "eval_loss": 0.7315683364868164, "eval_runtime": 2.511, "eval_samples_per_second": 347.269, "eval_steps_per_second": 2.788, "step": 2400 }, { "epoch": 4.743833017077799, "grad_norm": 2.03861403465271, "learning_rate": 3.9614930418099035e-05, "loss": 0.1571, "step": 2500 }, { "epoch": 4.743833017077799, "eval_accuracy": 0.8165137614678899, "eval_loss": 0.7373071908950806, "eval_runtime": 2.4391, "eval_samples_per_second": 357.503, "eval_steps_per_second": 2.87, "step": 2500 } ], "logging_steps": 100, "max_steps": 3689, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 38388922255320.0, "train_batch_size": 128, "trial_name": null, "trial_params": { "alpha": 0.9409528267353703, "learning_rate": 0.0001593474542623169, "num_train_epochs": 7, "temperature": 4 } }