|
{ |
|
"best_metric": 0.8188073394495413, |
|
"best_model_checkpoint": "tiny-bert-sst2/run-3/checkpoint-1000", |
|
"epoch": 5.0, |
|
"eval_steps": 100, |
|
"global_step": 2635, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.18975332068311196, |
|
"grad_norm": 7.5281982421875, |
|
"learning_rate": 0.00022048666969254715, |
|
"loss": 1.6888, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18975332068311196, |
|
"eval_accuracy": 0.7740825688073395, |
|
"eval_loss": 1.2137372493743896, |
|
"eval_runtime": 2.512, |
|
"eval_samples_per_second": 347.137, |
|
"eval_steps_per_second": 2.787, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.3795066413662239, |
|
"grad_norm": 8.994091033935547, |
|
"learning_rate": 0.00022849311940488983, |
|
"loss": 1.0387, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.3795066413662239, |
|
"eval_accuracy": 0.7958715596330275, |
|
"eval_loss": 1.0297660827636719, |
|
"eval_runtime": 2.4479, |
|
"eval_samples_per_second": 356.218, |
|
"eval_steps_per_second": 2.86, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.5692599620493358, |
|
"grad_norm": 4.1480183601379395, |
|
"learning_rate": 0.00022592984315022126, |
|
"loss": 0.7891, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.5692599620493358, |
|
"eval_accuracy": 0.7935779816513762, |
|
"eval_loss": 0.9698419570922852, |
|
"eval_runtime": 2.4189, |
|
"eval_samples_per_second": 360.492, |
|
"eval_steps_per_second": 2.894, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.7590132827324478, |
|
"grad_norm": 6.172699451446533, |
|
"learning_rate": 0.00022165433603986336, |
|
"loss": 0.6767, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.7590132827324478, |
|
"eval_accuracy": 0.8222477064220184, |
|
"eval_loss": 0.9135227799415588, |
|
"eval_runtime": 2.4368, |
|
"eval_samples_per_second": 357.851, |
|
"eval_steps_per_second": 2.873, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.9487666034155597, |
|
"grad_norm": 6.589296340942383, |
|
"learning_rate": 0.00021573238588267377, |
|
"loss": 0.617, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.9487666034155597, |
|
"eval_accuracy": 0.8073394495412844, |
|
"eval_loss": 0.9335416555404663, |
|
"eval_runtime": 2.4972, |
|
"eval_samples_per_second": 349.185, |
|
"eval_steps_per_second": 2.803, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.1385199240986716, |
|
"grad_norm": 4.637527942657471, |
|
"learning_rate": 0.00020825511453038828, |
|
"loss": 0.5142, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.1385199240986716, |
|
"eval_accuracy": 0.8073394495412844, |
|
"eval_loss": 1.0174880027770996, |
|
"eval_runtime": 2.431, |
|
"eval_samples_per_second": 358.694, |
|
"eval_steps_per_second": 2.879, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 1.3282732447817835, |
|
"grad_norm": 3.807858467102051, |
|
"learning_rate": 0.00019933757577330664, |
|
"loss": 0.4511, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.3282732447817835, |
|
"eval_accuracy": 0.8142201834862385, |
|
"eval_loss": 0.9702788591384888, |
|
"eval_runtime": 2.4299, |
|
"eval_samples_per_second": 358.858, |
|
"eval_steps_per_second": 2.881, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 1.5180265654648957, |
|
"grad_norm": 5.257256984710693, |
|
"learning_rate": 0.000189116984991988, |
|
"loss": 0.4292, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.5180265654648957, |
|
"eval_accuracy": 0.8027522935779816, |
|
"eval_loss": 1.0670255422592163, |
|
"eval_runtime": 2.4833, |
|
"eval_samples_per_second": 351.149, |
|
"eval_steps_per_second": 2.819, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 1.7077798861480076, |
|
"grad_norm": 4.167674541473389, |
|
"learning_rate": 0.0001777506078055455, |
|
"loss": 0.4201, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.7077798861480076, |
|
"eval_accuracy": 0.8165137614678899, |
|
"eval_loss": 0.966061532497406, |
|
"eval_runtime": 2.4276, |
|
"eval_samples_per_second": 359.207, |
|
"eval_steps_per_second": 2.884, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 1.8975332068311195, |
|
"grad_norm": 7.251418590545654, |
|
"learning_rate": 0.00016541334020419587, |
|
"loss": 0.4173, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 1.8975332068311195, |
|
"eval_accuracy": 0.8188073394495413, |
|
"eval_loss": 0.9646121263504028, |
|
"eval_runtime": 2.4466, |
|
"eval_samples_per_second": 356.409, |
|
"eval_steps_per_second": 2.861, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 2.0872865275142316, |
|
"grad_norm": 3.3101117610931396, |
|
"learning_rate": 0.0001522950174008934, |
|
"loss": 0.3537, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.0872865275142316, |
|
"eval_accuracy": 0.8165137614678899, |
|
"eval_loss": 1.00448477268219, |
|
"eval_runtime": 2.4388, |
|
"eval_samples_per_second": 357.559, |
|
"eval_steps_per_second": 2.87, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 2.2770398481973433, |
|
"grad_norm": 4.73748254776001, |
|
"learning_rate": 0.00013859749281111374, |
|
"loss": 0.3095, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.2770398481973433, |
|
"eval_accuracy": 0.8153669724770642, |
|
"eval_loss": 1.0533220767974854, |
|
"eval_runtime": 2.4451, |
|
"eval_samples_per_second": 356.635, |
|
"eval_steps_per_second": 2.863, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 2.4667931688804554, |
|
"grad_norm": 2.5525803565979004, |
|
"learning_rate": 0.00012453153210692205, |
|
"loss": 0.3254, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.4667931688804554, |
|
"eval_accuracy": 0.8130733944954128, |
|
"eval_loss": 1.1290571689605713, |
|
"eval_runtime": 2.463, |
|
"eval_samples_per_second": 354.04, |
|
"eval_steps_per_second": 2.842, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 2.656546489563567, |
|
"grad_norm": 5.130295276641846, |
|
"learning_rate": 0.00011031357013693877, |
|
"loss": 0.3135, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.656546489563567, |
|
"eval_accuracy": 0.8073394495412844, |
|
"eval_loss": 1.0808497667312622, |
|
"eval_runtime": 2.434, |
|
"eval_samples_per_second": 358.263, |
|
"eval_steps_per_second": 2.876, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 2.846299810246679, |
|
"grad_norm": 2.434037685394287, |
|
"learning_rate": 9.616238061391467e-05, |
|
"loss": 0.3162, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 2.846299810246679, |
|
"eval_accuracy": 0.8096330275229358, |
|
"eval_loss": 1.1154645681381226, |
|
"eval_runtime": 2.4873, |
|
"eval_samples_per_second": 350.575, |
|
"eval_steps_per_second": 2.814, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 3.0360531309297913, |
|
"grad_norm": 2.9498291015625, |
|
"learning_rate": 8.229570981388642e-05, |
|
"loss": 0.2907, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.0360531309297913, |
|
"eval_accuracy": 0.823394495412844, |
|
"eval_loss": 1.059111475944519, |
|
"eval_runtime": 2.4279, |
|
"eval_samples_per_second": 359.161, |
|
"eval_steps_per_second": 2.883, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"grad_norm": 2.830878496170044, |
|
"learning_rate": 6.892692608463966e-05, |
|
"loss": 0.2325, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.225806451612903, |
|
"eval_accuracy": 0.8153669724770642, |
|
"eval_loss": 1.0795413255691528, |
|
"eval_runtime": 2.4332, |
|
"eval_samples_per_second": 358.383, |
|
"eval_steps_per_second": 2.877, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 3.415559772296015, |
|
"grad_norm": 2.6231229305267334, |
|
"learning_rate": 5.62617367179498e-05, |
|
"loss": 0.2423, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.415559772296015, |
|
"eval_accuracy": 0.8211009174311926, |
|
"eval_loss": 1.0745121240615845, |
|
"eval_runtime": 2.5016, |
|
"eval_samples_per_second": 348.574, |
|
"eval_steps_per_second": 2.798, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 3.6053130929791273, |
|
"grad_norm": 3.7570250034332275, |
|
"learning_rate": 4.4495022703532034e-05, |
|
"loss": 0.2608, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.6053130929791273, |
|
"eval_accuracy": 0.8142201834862385, |
|
"eval_loss": 1.0897266864776611, |
|
"eval_runtime": 2.4393, |
|
"eval_samples_per_second": 357.481, |
|
"eval_steps_per_second": 2.87, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 3.795066413662239, |
|
"grad_norm": 3.726491689682007, |
|
"learning_rate": 3.3807840068772395e-05, |
|
"loss": 0.2518, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.795066413662239, |
|
"eval_accuracy": 0.8153669724770642, |
|
"eval_loss": 1.0948952436447144, |
|
"eval_runtime": 2.4397, |
|
"eval_samples_per_second": 357.425, |
|
"eval_steps_per_second": 2.869, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 3.984819734345351, |
|
"grad_norm": 3.4763576984405518, |
|
"learning_rate": 2.436463394503288e-05, |
|
"loss": 0.2529, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 3.984819734345351, |
|
"eval_accuracy": 0.8176605504587156, |
|
"eval_loss": 1.1064103841781616, |
|
"eval_runtime": 2.5075, |
|
"eval_samples_per_second": 347.752, |
|
"eval_steps_per_second": 2.792, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 4.174573055028463, |
|
"grad_norm": 6.195148468017578, |
|
"learning_rate": 1.631070822807034e-05, |
|
"loss": 0.2251, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.174573055028463, |
|
"eval_accuracy": 0.8142201834862385, |
|
"eval_loss": 1.1260789632797241, |
|
"eval_runtime": 2.4849, |
|
"eval_samples_per_second": 350.92, |
|
"eval_steps_per_second": 2.817, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 4.364326375711575, |
|
"grad_norm": 2.506267547607422, |
|
"learning_rate": 9.769989767248965e-06, |
|
"loss": 0.221, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.364326375711575, |
|
"eval_accuracy": 0.8130733944954128, |
|
"eval_loss": 1.1184971332550049, |
|
"eval_runtime": 2.448, |
|
"eval_samples_per_second": 356.205, |
|
"eval_steps_per_second": 2.859, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 4.554079696394687, |
|
"grad_norm": 3.0686843395233154, |
|
"learning_rate": 4.843121486269946e-06, |
|
"loss": 0.2266, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.554079696394687, |
|
"eval_accuracy": 0.8188073394495413, |
|
"eval_loss": 1.1176307201385498, |
|
"eval_runtime": 2.4651, |
|
"eval_samples_per_second": 353.741, |
|
"eval_steps_per_second": 2.84, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 4.743833017077799, |
|
"grad_norm": 3.0805752277374268, |
|
"learning_rate": 1.6059137768293674e-06, |
|
"loss": 0.218, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.743833017077799, |
|
"eval_accuracy": 0.8153669724770642, |
|
"eval_loss": 1.1191787719726562, |
|
"eval_runtime": 2.4416, |
|
"eval_samples_per_second": 357.14, |
|
"eval_steps_per_second": 2.867, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 4.933586337760911, |
|
"grad_norm": 2.7237091064453125, |
|
"learning_rate": 1.0817799382007904e-07, |
|
"loss": 0.2257, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 4.933586337760911, |
|
"eval_accuracy": 0.8119266055045872, |
|
"eval_loss": 1.118699550628662, |
|
"eval_runtime": 2.4777, |
|
"eval_samples_per_second": 351.943, |
|
"eval_steps_per_second": 2.825, |
|
"step": 2600 |
|
} |
|
], |
|
"logging_steps": 100, |
|
"max_steps": 2635, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 5, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 39911918594520.0, |
|
"train_batch_size": 128, |
|
"trial_name": null, |
|
"trial_params": { |
|
"alpha": 0.7363133901666971, |
|
"learning_rate": 0.00022930613648024903, |
|
"num_train_epochs": 5, |
|
"temperature": 13, |
|
"warmup_ratio": 0.03930811279168249 |
|
} |
|
} |
|
|