|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 13.0, |
|
"eval_steps": 500, |
|
"global_step": 4134, |
|
"is_hyper_param_search": true, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.9968553459119497, |
|
"grad_norm": 0.3114970326423645, |
|
"learning_rate": 0.00018516884358804785, |
|
"loss": 0.285, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_accuracy": 0.8787096774193548, |
|
"eval_loss": 0.06010065972805023, |
|
"eval_runtime": 2.5938, |
|
"eval_samples_per_second": 1195.147, |
|
"eval_steps_per_second": 25.06, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.9937106918238994, |
|
"grad_norm": 0.2971077263355255, |
|
"learning_rate": 0.00016979066087455266, |
|
"loss": 0.0516, |
|
"step": 634 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_accuracy": 0.9296774193548387, |
|
"eval_loss": 0.03248392790555954, |
|
"eval_runtime": 2.6049, |
|
"eval_samples_per_second": 1190.073, |
|
"eval_steps_per_second": 24.953, |
|
"step": 636 |
|
}, |
|
{ |
|
"epoch": 2.990566037735849, |
|
"grad_norm": 0.24367552995681763, |
|
"learning_rate": 0.00015441247816105745, |
|
"loss": 0.0306, |
|
"step": 951 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_accuracy": 0.9354838709677419, |
|
"eval_loss": 0.02789517492055893, |
|
"eval_runtime": 2.6186, |
|
"eval_samples_per_second": 1183.836, |
|
"eval_steps_per_second": 24.822, |
|
"step": 954 |
|
}, |
|
{ |
|
"epoch": 3.9874213836477987, |
|
"grad_norm": 0.1115935891866684, |
|
"learning_rate": 0.00013903429544756226, |
|
"loss": 0.0247, |
|
"step": 1268 |
|
}, |
|
{ |
|
"epoch": 4.0, |
|
"eval_accuracy": 0.9432258064516129, |
|
"eval_loss": 0.022768640890717506, |
|
"eval_runtime": 2.6313, |
|
"eval_samples_per_second": 1178.135, |
|
"eval_steps_per_second": 24.703, |
|
"step": 1272 |
|
}, |
|
{ |
|
"epoch": 4.984276729559748, |
|
"grad_norm": 0.09492151439189911, |
|
"learning_rate": 0.00012365611273406705, |
|
"loss": 0.0211, |
|
"step": 1585 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"eval_accuracy": 0.9416129032258065, |
|
"eval_loss": 0.021043915301561356, |
|
"eval_runtime": 2.6325, |
|
"eval_samples_per_second": 1177.607, |
|
"eval_steps_per_second": 24.692, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 5.981132075471698, |
|
"grad_norm": 0.1175580695271492, |
|
"learning_rate": 0.00010827793002057186, |
|
"loss": 0.0195, |
|
"step": 1902 |
|
}, |
|
{ |
|
"epoch": 6.0, |
|
"eval_accuracy": 0.9429032258064516, |
|
"eval_loss": 0.020259153097867966, |
|
"eval_runtime": 2.6508, |
|
"eval_samples_per_second": 1169.473, |
|
"eval_steps_per_second": 24.521, |
|
"step": 1908 |
|
}, |
|
{ |
|
"epoch": 6.977987421383648, |
|
"grad_norm": 0.11239363253116608, |
|
"learning_rate": 9.289974730707666e-05, |
|
"loss": 0.0179, |
|
"step": 2219 |
|
}, |
|
{ |
|
"epoch": 7.0, |
|
"eval_accuracy": 0.9454838709677419, |
|
"eval_loss": 0.01921679638326168, |
|
"eval_runtime": 2.6313, |
|
"eval_samples_per_second": 1178.137, |
|
"eval_steps_per_second": 24.703, |
|
"step": 2226 |
|
}, |
|
{ |
|
"epoch": 7.9748427672955975, |
|
"grad_norm": 0.06852811574935913, |
|
"learning_rate": 7.752156459358147e-05, |
|
"loss": 0.0166, |
|
"step": 2536 |
|
}, |
|
{ |
|
"epoch": 8.0, |
|
"eval_accuracy": 0.944516129032258, |
|
"eval_loss": 0.01865958236157894, |
|
"eval_runtime": 2.6643, |
|
"eval_samples_per_second": 1163.539, |
|
"eval_steps_per_second": 24.397, |
|
"step": 2544 |
|
}, |
|
{ |
|
"epoch": 8.971698113207546, |
|
"grad_norm": 0.06738817691802979, |
|
"learning_rate": 6.214338188008627e-05, |
|
"loss": 0.0158, |
|
"step": 2853 |
|
}, |
|
{ |
|
"epoch": 9.0, |
|
"eval_accuracy": 0.944516129032258, |
|
"eval_loss": 0.017864950001239777, |
|
"eval_runtime": 2.6486, |
|
"eval_samples_per_second": 1170.45, |
|
"eval_steps_per_second": 24.542, |
|
"step": 2862 |
|
}, |
|
{ |
|
"epoch": 9.968553459119496, |
|
"grad_norm": 0.062413159757852554, |
|
"learning_rate": 4.6765199166591074e-05, |
|
"loss": 0.015, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"eval_accuracy": 0.9467741935483871, |
|
"eval_loss": 0.017261695116758347, |
|
"eval_runtime": 2.652, |
|
"eval_samples_per_second": 1168.93, |
|
"eval_steps_per_second": 24.51, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 10.965408805031446, |
|
"grad_norm": 0.05933304503560066, |
|
"learning_rate": 3.138701645309588e-05, |
|
"loss": 0.0143, |
|
"step": 3487 |
|
}, |
|
{ |
|
"epoch": 11.0, |
|
"eval_accuracy": 0.9464516129032258, |
|
"eval_loss": 0.016805030405521393, |
|
"eval_runtime": 2.653, |
|
"eval_samples_per_second": 1168.48, |
|
"eval_steps_per_second": 24.5, |
|
"step": 3498 |
|
}, |
|
{ |
|
"epoch": 11.962264150943396, |
|
"grad_norm": 0.05458727478981018, |
|
"learning_rate": 1.6008833739600677e-05, |
|
"loss": 0.0138, |
|
"step": 3804 |
|
}, |
|
{ |
|
"epoch": 12.0, |
|
"eval_accuracy": 0.9458064516129032, |
|
"eval_loss": 0.016528310254216194, |
|
"eval_runtime": 2.6453, |
|
"eval_samples_per_second": 1171.873, |
|
"eval_steps_per_second": 24.572, |
|
"step": 3816 |
|
}, |
|
{ |
|
"epoch": 12.959119496855346, |
|
"grad_norm": 0.05435788258910179, |
|
"learning_rate": 6.306510261054813e-07, |
|
"loss": 0.0133, |
|
"step": 4121 |
|
} |
|
], |
|
"logging_steps": 317, |
|
"max_steps": 4134, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 13, |
|
"save_steps": 1000000000.0, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1072409708161512.0, |
|
"train_batch_size": 48, |
|
"trial_name": null, |
|
"trial_params": { |
|
"alpha": 0.3513465070451599, |
|
"fp16": false, |
|
"learning_rate": 0.00018909828459685892, |
|
"lr_scheduler": "cosine", |
|
"num_train_epochs": 13, |
|
"temperature": 9, |
|
"warmup_steps": 236, |
|
"weight_decay": 0.09716219849882443 |
|
} |
|
} |
|
|