{
  "best_metric": 0.8222477064220184,
  "best_model_checkpoint": "tiny-bert-sst2/run-0/checkpoint-1000",
  "epoch": 4.743833017077799,
  "eval_steps": 100,
  "global_step": 2500,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.18975332068311196,
      "grad_norm": 1.6602848768234253,
      "learning_rate": 0.0001435562651011864,
      "loss": 0.816,
      "step": 100
    },
    {
      "epoch": 0.18975332068311196,
      "eval_accuracy": 0.7454128440366973,
      "eval_loss": 0.6589266061782837,
      "eval_runtime": 2.4211,
      "eval_samples_per_second": 360.162,
      "eval_steps_per_second": 2.891,
      "step": 100
    },
    {
      "epoch": 0.3795066413662239,
      "grad_norm": 3.0019118785858154,
      "learning_rate": 0.00015910431045764318,
      "loss": 0.5566,
      "step": 200
    },
    {
      "epoch": 0.3795066413662239,
      "eval_accuracy": 0.7878440366972477,
      "eval_loss": 0.5643345713615417,
      "eval_runtime": 2.456,
      "eval_samples_per_second": 355.042,
      "eval_steps_per_second": 2.85,
      "step": 200
    },
    {
      "epoch": 0.5692599620493358,
      "grad_norm": 2.3228344917297363,
      "learning_rate": 0.00015825291603624946,
      "loss": 0.4541,
      "step": 300
    },
    {
      "epoch": 0.5692599620493358,
      "eval_accuracy": 0.7981651376146789,
      "eval_loss": 0.5387669205665588,
      "eval_runtime": 2.4565,
      "eval_samples_per_second": 354.981,
      "eval_steps_per_second": 2.85,
      "step": 300
    },
    {
      "epoch": 0.7590132827324478,
      "grad_norm": 3.891537666320801,
      "learning_rate": 0.00015679611440991184,
      "loss": 0.4115,
      "step": 400
    },
    {
      "epoch": 0.7590132827324478,
      "eval_accuracy": 0.8096330275229358,
      "eval_loss": 0.5092660784721375,
      "eval_runtime": 2.4451,
      "eval_samples_per_second": 356.629,
      "eval_steps_per_second": 2.863,
      "step": 400
    },
    {
      "epoch": 0.9487666034155597,
      "grad_norm": 3.581057548522949,
      "learning_rate": 0.0001547451293926331,
      "loss": 0.3816,
      "step": 500
    },
    {
      "epoch": 0.9487666034155597,
      "eval_accuracy": 0.8153669724770642,
      "eval_loss": 0.5169694423675537,
      "eval_runtime": 2.444,
      "eval_samples_per_second": 356.797,
      "eval_steps_per_second": 2.864,
      "step": 500
    },
    {
      "epoch": 1.1385199240986716,
      "grad_norm": 2.517819881439209,
      "learning_rate": 0.00015211576263780258,
      "loss": 0.3336,
      "step": 600
    },
    {
      "epoch": 1.1385199240986716,
      "eval_accuracy": 0.8107798165137615,
      "eval_loss": 0.528367280960083,
      "eval_runtime": 2.4308,
      "eval_samples_per_second": 358.732,
      "eval_steps_per_second": 2.88,
      "step": 600
    },
    {
      "epoch": 1.3282732447817835,
      "grad_norm": 1.765392780303955,
      "learning_rate": 0.00014892827189559552,
      "loss": 0.2972,
      "step": 700
    },
    {
      "epoch": 1.3282732447817835,
      "eval_accuracy": 0.8142201834862385,
      "eval_loss": 0.5347566604614258,
      "eval_runtime": 2.4351,
      "eval_samples_per_second": 358.094,
      "eval_steps_per_second": 2.875,
      "step": 700
    },
    {
      "epoch": 1.5180265654648957,
      "grad_norm": 3.401026725769043,
      "learning_rate": 0.00014520721493872392,
      "loss": 0.282,
      "step": 800
    },
    {
      "epoch": 1.5180265654648957,
      "eval_accuracy": 0.8245412844036697,
      "eval_loss": 0.5643708109855652,
      "eval_runtime": 2.5012,
      "eval_samples_per_second": 348.628,
      "eval_steps_per_second": 2.799,
      "step": 800
    },
    {
      "epoch": 1.7077798861480076,
      "grad_norm": 3.1617627143859863,
      "learning_rate": 0.0001409812603590005,
      "loss": 0.2792,
      "step": 900
    },
    {
      "epoch": 1.7077798861480076,
      "eval_accuracy": 0.8256880733944955,
      "eval_loss": 0.5595521926879883,
      "eval_runtime": 2.4305,
      "eval_samples_per_second": 358.773,
      "eval_steps_per_second": 2.88,
      "step": 900
    },
    {
      "epoch": 1.8975332068311195,
      "grad_norm": 5.014317035675049,
      "learning_rate": 0.00013628296669241911,
      "loss": 0.2794,
      "step": 1000
    },
    {
      "epoch": 1.8975332068311195,
      "eval_accuracy": 0.8222477064220184,
      "eval_loss": 0.5535959601402283,
      "eval_runtime": 2.4299,
      "eval_samples_per_second": 358.861,
      "eval_steps_per_second": 2.881,
      "step": 1000
    },
    {
      "epoch": 2.0872865275142316,
      "grad_norm": 2.2261252403259277,
      "learning_rate": 0.0001311485315744647,
      "loss": 0.2433,
      "step": 1100
    },
    {
      "epoch": 2.0872865275142316,
      "eval_accuracy": 0.823394495412844,
      "eval_loss": 0.5890715718269348,
      "eval_runtime": 2.4516,
      "eval_samples_per_second": 355.679,
      "eval_steps_per_second": 2.855,
      "step": 1100
    },
    {
      "epoch": 2.2770398481973433,
      "grad_norm": 2.7825124263763428,
      "learning_rate": 0.00012561751285826656,
      "loss": 0.2166,
      "step": 1200
    },
    {
      "epoch": 2.2770398481973433,
      "eval_accuracy": 0.8165137614678899,
      "eval_loss": 0.6120957136154175,
      "eval_runtime": 2.4345,
      "eval_samples_per_second": 358.192,
      "eval_steps_per_second": 2.875,
      "step": 1200
    },
    {
      "epoch": 2.4667931688804554,
      "grad_norm": 1.830217957496643,
      "learning_rate": 0.00011973252384421784,
      "loss": 0.2272,
      "step": 1300
    },
    {
      "epoch": 2.4667931688804554,
      "eval_accuracy": 0.8211009174311926,
      "eval_loss": 0.6535155177116394,
      "eval_runtime": 2.4697,
      "eval_samples_per_second": 353.075,
      "eval_steps_per_second": 2.834,
      "step": 1300
    },
    {
      "epoch": 2.656546489563567,
      "grad_norm": 3.0946052074432373,
      "learning_rate": 0.00011353890496914119,
      "loss": 0.2213,
      "step": 1400
    },
    {
      "epoch": 2.656546489563567,
      "eval_accuracy": 0.8211009174311926,
      "eval_loss": 0.6385691165924072,
      "eval_runtime": 2.4251,
      "eval_samples_per_second": 359.567,
      "eval_steps_per_second": 2.886,
      "step": 1400
    },
    {
      "epoch": 2.846299810246679,
      "grad_norm": 2.369713544845581,
      "learning_rate": 0.00010708437448444503,
      "loss": 0.2243,
      "step": 1500
    },
    {
      "epoch": 2.846299810246679,
      "eval_accuracy": 0.8165137614678899,
      "eval_loss": 0.6380329728126526,
      "eval_runtime": 2.4214,
      "eval_samples_per_second": 360.121,
      "eval_steps_per_second": 2.891,
      "step": 1500
    },
    {
      "epoch": 3.0360531309297913,
      "grad_norm": 2.249871015548706,
      "learning_rate": 0.00010041866081459412,
      "loss": 0.2022,
      "step": 1600
    },
    {
      "epoch": 3.0360531309297913,
      "eval_accuracy": 0.8256880733944955,
      "eval_loss": 0.631977379322052,
      "eval_runtime": 2.5009,
      "eval_samples_per_second": 348.677,
      "eval_steps_per_second": 2.799,
      "step": 1600
    },
    {
      "epoch": 3.225806451612903,
      "grad_norm": 1.9010361433029175,
      "learning_rate": 9.359311942835884e-05,
      "loss": 0.1692,
      "step": 1700
    },
    {
      "epoch": 3.225806451612903,
      "eval_accuracy": 0.8188073394495413,
      "eval_loss": 0.6348777413368225,
      "eval_runtime": 2.4316,
      "eval_samples_per_second": 358.616,
      "eval_steps_per_second": 2.879,
      "step": 1700
    },
    {
      "epoch": 3.415559772296015,
      "grad_norm": 2.311347246170044,
      "learning_rate": 8.666033717462946e-05,
      "loss": 0.1771,
      "step": 1800
    },
    {
      "epoch": 3.415559772296015,
      "eval_accuracy": 0.8142201834862385,
      "eval_loss": 0.6402046084403992,
      "eval_runtime": 2.4446,
      "eval_samples_per_second": 356.707,
      "eval_steps_per_second": 2.863,
      "step": 1800
    },
    {
      "epoch": 3.6053130929791273,
      "grad_norm": 3.018489360809326,
      "learning_rate": 7.967372713115845e-05,
      "loss": 0.1895,
      "step": 1900
    },
    {
      "epoch": 3.6053130929791273,
      "eval_accuracy": 0.8165137614678899,
      "eval_loss": 0.6510571837425232,
      "eval_runtime": 2.4399,
      "eval_samples_per_second": 357.39,
      "eval_steps_per_second": 2.869,
      "step": 1900
    },
    {
      "epoch": 3.795066413662239,
      "grad_norm": 2.140141010284424,
      "learning_rate": 7.268711708768742e-05,
      "loss": 0.1813,
      "step": 2000
    },
    {
      "epoch": 3.795066413662239,
      "eval_accuracy": 0.8130733944954128,
      "eval_loss": 0.6792920231819153,
      "eval_runtime": 2.4818,
      "eval_samples_per_second": 351.356,
      "eval_steps_per_second": 2.821,
      "step": 2000
    },
    {
      "epoch": 3.984819734345351,
      "grad_norm": 3.712886095046997,
      "learning_rate": 6.575433483395807e-05,
      "loss": 0.18,
      "step": 2100
    },
    {
      "epoch": 3.984819734345351,
      "eval_accuracy": 0.8314220183486238,
      "eval_loss": 0.6877826452255249,
      "eval_runtime": 2.4747,
      "eval_samples_per_second": 352.367,
      "eval_steps_per_second": 2.829,
      "step": 2100
    },
    {
      "epoch": 4.174573055028463,
      "grad_norm": 4.451997756958008,
      "learning_rate": 5.892879344772279e-05,
      "loss": 0.1531,
      "step": 2200
    },
    {
      "epoch": 4.174573055028463,
      "eval_accuracy": 0.8188073394495413,
      "eval_loss": 0.7583606243133545,
      "eval_runtime": 2.4344,
      "eval_samples_per_second": 358.204,
      "eval_steps_per_second": 2.875,
      "step": 2200
    },
    {
      "epoch": 4.364326375711575,
      "grad_norm": 2.2267160415649414,
      "learning_rate": 5.226307977787188e-05,
      "loss": 0.1562,
      "step": 2300
    },
    {
      "epoch": 4.364326375711575,
      "eval_accuracy": 0.8176605504587156,
      "eval_loss": 0.7094950675964355,
      "eval_runtime": 2.4247,
      "eval_samples_per_second": 359.631,
      "eval_steps_per_second": 2.887,
      "step": 2300
    },
    {
      "epoch": 4.554079696394687,
      "grad_norm": 2.2096240520477295,
      "learning_rate": 4.580854929317572e-05,
      "loss": 0.1576,
      "step": 2400
    },
    {
      "epoch": 4.554079696394687,
      "eval_accuracy": 0.8222477064220184,
      "eval_loss": 0.7315683364868164,
      "eval_runtime": 2.511,
      "eval_samples_per_second": 347.269,
      "eval_steps_per_second": 2.788,
      "step": 2400
    },
    {
      "epoch": 4.743833017077799,
      "grad_norm": 2.03861403465271,
      "learning_rate": 3.9614930418099035e-05,
      "loss": 0.1571,
      "step": 2500
    },
    {
      "epoch": 4.743833017077799,
      "eval_accuracy": 0.8165137614678899,
      "eval_loss": 0.7373071908950806,
      "eval_runtime": 2.4391,
      "eval_samples_per_second": 357.503,
      "eval_steps_per_second": 2.87,
      "step": 2500
    }
  ],
  "logging_steps": 100,
  "max_steps": 3689,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 7,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 38388922255320.0,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": {
    "alpha": 0.9409528267353703,
    "learning_rate": 0.0001593474542623169,
    "num_train_epochs": 7,
    "temperature": 4
  }
}