tiny-bert-sst2 / run-0 /checkpoint-2000 /trainer_state.json
jason-zhoou's picture
Training in progress, step 500
6925bae verified
raw
history blame
9.55 kB
{
"best_metric": 0.8222477064220184,
"best_model_checkpoint": "tiny-bert-sst2/run-0/checkpoint-1000",
"epoch": 3.795066413662239,
"eval_steps": 100,
"global_step": 2000,
"is_hyper_param_search": true,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.18975332068311196,
"grad_norm": 1.6602848768234253,
"learning_rate": 0.0001435562651011864,
"loss": 0.816,
"step": 100
},
{
"epoch": 0.18975332068311196,
"eval_accuracy": 0.7454128440366973,
"eval_loss": 0.6589266061782837,
"eval_runtime": 2.4211,
"eval_samples_per_second": 360.162,
"eval_steps_per_second": 2.891,
"step": 100
},
{
"epoch": 0.3795066413662239,
"grad_norm": 3.0019118785858154,
"learning_rate": 0.00015910431045764318,
"loss": 0.5566,
"step": 200
},
{
"epoch": 0.3795066413662239,
"eval_accuracy": 0.7878440366972477,
"eval_loss": 0.5643345713615417,
"eval_runtime": 2.456,
"eval_samples_per_second": 355.042,
"eval_steps_per_second": 2.85,
"step": 200
},
{
"epoch": 0.5692599620493358,
"grad_norm": 2.3228344917297363,
"learning_rate": 0.00015825291603624946,
"loss": 0.4541,
"step": 300
},
{
"epoch": 0.5692599620493358,
"eval_accuracy": 0.7981651376146789,
"eval_loss": 0.5387669205665588,
"eval_runtime": 2.4565,
"eval_samples_per_second": 354.981,
"eval_steps_per_second": 2.85,
"step": 300
},
{
"epoch": 0.7590132827324478,
"grad_norm": 3.891537666320801,
"learning_rate": 0.00015679611440991184,
"loss": 0.4115,
"step": 400
},
{
"epoch": 0.7590132827324478,
"eval_accuracy": 0.8096330275229358,
"eval_loss": 0.5092660784721375,
"eval_runtime": 2.4451,
"eval_samples_per_second": 356.629,
"eval_steps_per_second": 2.863,
"step": 400
},
{
"epoch": 0.9487666034155597,
"grad_norm": 3.581057548522949,
"learning_rate": 0.0001547451293926331,
"loss": 0.3816,
"step": 500
},
{
"epoch": 0.9487666034155597,
"eval_accuracy": 0.8153669724770642,
"eval_loss": 0.5169694423675537,
"eval_runtime": 2.444,
"eval_samples_per_second": 356.797,
"eval_steps_per_second": 2.864,
"step": 500
},
{
"epoch": 1.1385199240986716,
"grad_norm": 2.517819881439209,
"learning_rate": 0.00015211576263780258,
"loss": 0.3336,
"step": 600
},
{
"epoch": 1.1385199240986716,
"eval_accuracy": 0.8107798165137615,
"eval_loss": 0.528367280960083,
"eval_runtime": 2.4308,
"eval_samples_per_second": 358.732,
"eval_steps_per_second": 2.88,
"step": 600
},
{
"epoch": 1.3282732447817835,
"grad_norm": 1.765392780303955,
"learning_rate": 0.00014892827189559552,
"loss": 0.2972,
"step": 700
},
{
"epoch": 1.3282732447817835,
"eval_accuracy": 0.8142201834862385,
"eval_loss": 0.5347566604614258,
"eval_runtime": 2.4351,
"eval_samples_per_second": 358.094,
"eval_steps_per_second": 2.875,
"step": 700
},
{
"epoch": 1.5180265654648957,
"grad_norm": 3.401026725769043,
"learning_rate": 0.00014520721493872392,
"loss": 0.282,
"step": 800
},
{
"epoch": 1.5180265654648957,
"eval_accuracy": 0.8245412844036697,
"eval_loss": 0.5643708109855652,
"eval_runtime": 2.5012,
"eval_samples_per_second": 348.628,
"eval_steps_per_second": 2.799,
"step": 800
},
{
"epoch": 1.7077798861480076,
"grad_norm": 3.1617627143859863,
"learning_rate": 0.0001409812603590005,
"loss": 0.2792,
"step": 900
},
{
"epoch": 1.7077798861480076,
"eval_accuracy": 0.8256880733944955,
"eval_loss": 0.5595521926879883,
"eval_runtime": 2.4305,
"eval_samples_per_second": 358.773,
"eval_steps_per_second": 2.88,
"step": 900
},
{
"epoch": 1.8975332068311195,
"grad_norm": 5.014317035675049,
"learning_rate": 0.00013628296669241911,
"loss": 0.2794,
"step": 1000
},
{
"epoch": 1.8975332068311195,
"eval_accuracy": 0.8222477064220184,
"eval_loss": 0.5535959601402283,
"eval_runtime": 2.4299,
"eval_samples_per_second": 358.861,
"eval_steps_per_second": 2.881,
"step": 1000
},
{
"epoch": 2.0872865275142316,
"grad_norm": 2.2261252403259277,
"learning_rate": 0.0001311485315744647,
"loss": 0.2433,
"step": 1100
},
{
"epoch": 2.0872865275142316,
"eval_accuracy": 0.823394495412844,
"eval_loss": 0.5890715718269348,
"eval_runtime": 2.4516,
"eval_samples_per_second": 355.679,
"eval_steps_per_second": 2.855,
"step": 1100
},
{
"epoch": 2.2770398481973433,
"grad_norm": 2.7825124263763428,
"learning_rate": 0.00012561751285826656,
"loss": 0.2166,
"step": 1200
},
{
"epoch": 2.2770398481973433,
"eval_accuracy": 0.8165137614678899,
"eval_loss": 0.6120957136154175,
"eval_runtime": 2.4345,
"eval_samples_per_second": 358.192,
"eval_steps_per_second": 2.875,
"step": 1200
},
{
"epoch": 2.4667931688804554,
"grad_norm": 1.830217957496643,
"learning_rate": 0.00011973252384421784,
"loss": 0.2272,
"step": 1300
},
{
"epoch": 2.4667931688804554,
"eval_accuracy": 0.8211009174311926,
"eval_loss": 0.6535155177116394,
"eval_runtime": 2.4697,
"eval_samples_per_second": 353.075,
"eval_steps_per_second": 2.834,
"step": 1300
},
{
"epoch": 2.656546489563567,
"grad_norm": 3.0946052074432373,
"learning_rate": 0.00011353890496914119,
"loss": 0.2213,
"step": 1400
},
{
"epoch": 2.656546489563567,
"eval_accuracy": 0.8211009174311926,
"eval_loss": 0.6385691165924072,
"eval_runtime": 2.4251,
"eval_samples_per_second": 359.567,
"eval_steps_per_second": 2.886,
"step": 1400
},
{
"epoch": 2.846299810246679,
"grad_norm": 2.369713544845581,
"learning_rate": 0.00010708437448444503,
"loss": 0.2243,
"step": 1500
},
{
"epoch": 2.846299810246679,
"eval_accuracy": 0.8165137614678899,
"eval_loss": 0.6380329728126526,
"eval_runtime": 2.4214,
"eval_samples_per_second": 360.121,
"eval_steps_per_second": 2.891,
"step": 1500
},
{
"epoch": 3.0360531309297913,
"grad_norm": 2.249871015548706,
"learning_rate": 0.00010041866081459412,
"loss": 0.2022,
"step": 1600
},
{
"epoch": 3.0360531309297913,
"eval_accuracy": 0.8256880733944955,
"eval_loss": 0.631977379322052,
"eval_runtime": 2.5009,
"eval_samples_per_second": 348.677,
"eval_steps_per_second": 2.799,
"step": 1600
},
{
"epoch": 3.225806451612903,
"grad_norm": 1.9010361433029175,
"learning_rate": 9.359311942835884e-05,
"loss": 0.1692,
"step": 1700
},
{
"epoch": 3.225806451612903,
"eval_accuracy": 0.8188073394495413,
"eval_loss": 0.6348777413368225,
"eval_runtime": 2.4316,
"eval_samples_per_second": 358.616,
"eval_steps_per_second": 2.879,
"step": 1700
},
{
"epoch": 3.415559772296015,
"grad_norm": 2.311347246170044,
"learning_rate": 8.666033717462946e-05,
"loss": 0.1771,
"step": 1800
},
{
"epoch": 3.415559772296015,
"eval_accuracy": 0.8142201834862385,
"eval_loss": 0.6402046084403992,
"eval_runtime": 2.4446,
"eval_samples_per_second": 356.707,
"eval_steps_per_second": 2.863,
"step": 1800
},
{
"epoch": 3.6053130929791273,
"grad_norm": 3.018489360809326,
"learning_rate": 7.967372713115845e-05,
"loss": 0.1895,
"step": 1900
},
{
"epoch": 3.6053130929791273,
"eval_accuracy": 0.8165137614678899,
"eval_loss": 0.6510571837425232,
"eval_runtime": 2.4399,
"eval_samples_per_second": 357.39,
"eval_steps_per_second": 2.869,
"step": 1900
},
{
"epoch": 3.795066413662239,
"grad_norm": 2.140141010284424,
"learning_rate": 7.268711708768742e-05,
"loss": 0.1813,
"step": 2000
},
{
"epoch": 3.795066413662239,
"eval_accuracy": 0.8130733944954128,
"eval_loss": 0.6792920231819153,
"eval_runtime": 2.4818,
"eval_samples_per_second": 351.356,
"eval_steps_per_second": 2.821,
"step": 2000
}
],
"logging_steps": 100,
"max_steps": 3689,
"num_input_tokens_seen": 0,
"num_train_epochs": 7,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 30741729246300.0,
"train_batch_size": 128,
"trial_name": null,
"trial_params": {
"alpha": 0.9409528267353703,
"learning_rate": 0.0001593474542623169,
"num_train_epochs": 7,
"temperature": 4
}
}