|
{ |
|
"best_metric": 0.4858, |
|
"best_model_checkpoint": "runs/cocoruta2-llama3-1-8b-regex-only-valid/checkpoint-350", |
|
"epoch": 0.9992542878448919, |
|
"eval_steps": 25, |
|
"global_step": 670, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.037285607755406416, |
|
"grad_norm": 0.5538629931679852, |
|
"learning_rate": 7.46268656716418e-05, |
|
"loss": 0.8987, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.037285607755406416, |
|
"eval_loss": 0.6527890563011169, |
|
"eval_runtime": 4.7198, |
|
"eval_samples_per_second": 1.059, |
|
"eval_steps_per_second": 0.636, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.07457121551081283, |
|
"grad_norm": 0.2341721772038344, |
|
"learning_rate": 0.0001492537313432836, |
|
"loss": 0.6394, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.07457121551081283, |
|
"eval_loss": 0.5486425757408142, |
|
"eval_runtime": 4.7295, |
|
"eval_samples_per_second": 1.057, |
|
"eval_steps_per_second": 0.634, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.11185682326621924, |
|
"grad_norm": 0.19912920431393782, |
|
"learning_rate": 0.00019991315351855748, |
|
"loss": 0.6007, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.11185682326621924, |
|
"eval_loss": 0.5331323146820068, |
|
"eval_runtime": 4.6724, |
|
"eval_samples_per_second": 1.07, |
|
"eval_steps_per_second": 0.642, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.14914243102162567, |
|
"grad_norm": 0.22498739924316974, |
|
"learning_rate": 0.0001985256759242359, |
|
"loss": 0.5696, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.14914243102162567, |
|
"eval_loss": 0.49856358766555786, |
|
"eval_runtime": 4.7327, |
|
"eval_samples_per_second": 1.056, |
|
"eval_steps_per_second": 0.634, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.18642803877703207, |
|
"grad_norm": 0.14018935803156768, |
|
"learning_rate": 0.00019546910545535558, |
|
"loss": 0.5523, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.18642803877703207, |
|
"eval_loss": 0.48138752579689026, |
|
"eval_runtime": 4.6757, |
|
"eval_samples_per_second": 1.069, |
|
"eval_steps_per_second": 0.642, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.22371364653243847, |
|
"grad_norm": 0.16571925714609734, |
|
"learning_rate": 0.00019079522252288386, |
|
"loss": 0.5779, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.22371364653243847, |
|
"eval_loss": 0.472788006067276, |
|
"eval_runtime": 4.7083, |
|
"eval_samples_per_second": 1.062, |
|
"eval_steps_per_second": 0.637, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.2609992542878449, |
|
"grad_norm": 0.14926948884435004, |
|
"learning_rate": 0.00018458320592590975, |
|
"loss": 0.5351, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.2609992542878449, |
|
"eval_loss": 0.467672735452652, |
|
"eval_runtime": 4.6956, |
|
"eval_samples_per_second": 1.065, |
|
"eval_steps_per_second": 0.639, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.29828486204325133, |
|
"grad_norm": 0.13621513300078802, |
|
"learning_rate": 0.00017693829150820068, |
|
"loss": 0.5251, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.29828486204325133, |
|
"eval_loss": 0.4527561664581299, |
|
"eval_runtime": 4.7174, |
|
"eval_samples_per_second": 1.06, |
|
"eval_steps_per_second": 0.636, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.33557046979865773, |
|
"grad_norm": 0.14343687732505184, |
|
"learning_rate": 0.00016798998939045895, |
|
"loss": 0.5301, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.33557046979865773, |
|
"eval_loss": 0.45074066519737244, |
|
"eval_runtime": 4.7651, |
|
"eval_samples_per_second": 1.049, |
|
"eval_steps_per_second": 0.63, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.37285607755406414, |
|
"grad_norm": 0.14384455598548593, |
|
"learning_rate": 0.00015788988997959114, |
|
"loss": 0.5554, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.37285607755406414, |
|
"eval_loss": 0.45150551199913025, |
|
"eval_runtime": 4.7234, |
|
"eval_samples_per_second": 1.059, |
|
"eval_steps_per_second": 0.635, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.41014168530947054, |
|
"grad_norm": 0.11661007763800872, |
|
"learning_rate": 0.0001468090959227082, |
|
"loss": 0.5198, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.41014168530947054, |
|
"eval_loss": 0.4460136294364929, |
|
"eval_runtime": 4.7242, |
|
"eval_samples_per_second": 1.058, |
|
"eval_steps_per_second": 0.635, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 0.44742729306487694, |
|
"grad_norm": 0.1321962416990341, |
|
"learning_rate": 0.0001349353235103232, |
|
"loss": 0.5484, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.44742729306487694, |
|
"eval_loss": 0.4421899914741516, |
|
"eval_runtime": 4.7511, |
|
"eval_samples_per_second": 1.052, |
|
"eval_steps_per_second": 0.631, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.48471290082028334, |
|
"grad_norm": 0.13900994098830932, |
|
"learning_rate": 0.0001224697226329772, |
|
"loss": 0.5223, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.48471290082028334, |
|
"eval_loss": 0.4396364092826843, |
|
"eval_runtime": 4.705, |
|
"eval_samples_per_second": 1.063, |
|
"eval_steps_per_second": 0.638, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 0.5219985085756897, |
|
"grad_norm": 0.15825647169367713, |
|
"learning_rate": 0.00010962346916341903, |
|
"loss": 0.4858, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5219985085756897, |
|
"eval_loss": 0.43353357911109924, |
|
"eval_runtime": 4.7849, |
|
"eval_samples_per_second": 1.045, |
|
"eval_steps_per_second": 0.627, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 0.5592841163310962, |
|
"grad_norm": 0.11937908855087626, |
|
"learning_rate": 9.661418749173467e-05, |
|
"loss": 0.5051, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5592841163310962, |
|
"eval_loss": 0.42787012457847595, |
|
"eval_runtime": 4.7427, |
|
"eval_samples_per_second": 1.054, |
|
"eval_steps_per_second": 0.633, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 0.5965697240865027, |
|
"grad_norm": 0.1236665974221879, |
|
"learning_rate": 8.366226381814697e-05, |
|
"loss": 0.489, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.5965697240865027, |
|
"eval_loss": 0.4265735149383545, |
|
"eval_runtime": 4.7586, |
|
"eval_samples_per_second": 1.051, |
|
"eval_steps_per_second": 0.63, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.633855331841909, |
|
"grad_norm": 0.1340681564316269, |
|
"learning_rate": 7.09871126588481e-05, |
|
"loss": 0.4992, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.633855331841909, |
|
"eval_loss": 0.41966643929481506, |
|
"eval_runtime": 4.725, |
|
"eval_samples_per_second": 1.058, |
|
"eval_steps_per_second": 0.635, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 0.6711409395973155, |
|
"grad_norm": 0.14955138964358772, |
|
"learning_rate": 5.880345981282876e-05, |
|
"loss": 0.4985, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.6711409395973155, |
|
"eval_loss": 0.41805362701416016, |
|
"eval_runtime": 4.6928, |
|
"eval_samples_per_second": 1.065, |
|
"eval_steps_per_second": 0.639, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 0.7084265473527218, |
|
"grad_norm": 0.13281003649358977, |
|
"learning_rate": 4.7317704758809946e-05, |
|
"loss": 0.511, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7084265473527218, |
|
"eval_loss": 0.4160347878932953, |
|
"eval_runtime": 4.6998, |
|
"eval_samples_per_second": 1.064, |
|
"eval_steps_per_second": 0.638, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 0.7457121551081283, |
|
"grad_norm": 0.13212294340504707, |
|
"learning_rate": 3.672442410577965e-05, |
|
"loss": 0.5103, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7457121551081283, |
|
"eval_loss": 0.41526561975479126, |
|
"eval_runtime": 4.7196, |
|
"eval_samples_per_second": 1.059, |
|
"eval_steps_per_second": 0.636, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.7829977628635347, |
|
"grad_norm": 0.13255473695033904, |
|
"learning_rate": 2.7203075331094017e-05, |
|
"loss": 0.4953, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.7829977628635347, |
|
"eval_loss": 0.41363996267318726, |
|
"eval_runtime": 4.6994, |
|
"eval_samples_per_second": 1.064, |
|
"eval_steps_per_second": 0.638, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 0.8202833706189411, |
|
"grad_norm": 0.11553089711715571, |
|
"learning_rate": 1.89149566470915e-05, |
|
"loss": 0.5017, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8202833706189411, |
|
"eval_loss": 0.4126836359500885, |
|
"eval_runtime": 4.7366, |
|
"eval_samples_per_second": 1.056, |
|
"eval_steps_per_second": 0.633, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 0.8575689783743475, |
|
"grad_norm": 0.1205660312265152, |
|
"learning_rate": 1.2000474498175552e-05, |
|
"loss": 0.488, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8575689783743475, |
|
"eval_loss": 0.41148170828819275, |
|
"eval_runtime": 4.6986, |
|
"eval_samples_per_second": 1.064, |
|
"eval_steps_per_second": 0.638, |
|
"step": 575 |
|
}, |
|
{ |
|
"epoch": 0.8948545861297539, |
|
"grad_norm": 0.11357983134630366, |
|
"learning_rate": 6.576764978849004e-06, |
|
"loss": 0.4862, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.8948545861297539, |
|
"eval_loss": 0.4110669493675232, |
|
"eval_runtime": 4.7813, |
|
"eval_samples_per_second": 1.046, |
|
"eval_steps_per_second": 0.627, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.9321401938851603, |
|
"grad_norm": 0.11689127563900782, |
|
"learning_rate": 2.735709467518699e-06, |
|
"loss": 0.4866, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9321401938851603, |
|
"eval_loss": 0.41068965196609497, |
|
"eval_runtime": 4.7375, |
|
"eval_samples_per_second": 1.055, |
|
"eval_steps_per_second": 0.633, |
|
"step": 625 |
|
}, |
|
{ |
|
"epoch": 0.9694258016405667, |
|
"grad_norm": 0.12155258793935453, |
|
"learning_rate": 5.42378092601481e-07, |
|
"loss": 0.4902, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9694258016405667, |
|
"eval_loss": 0.4103812277317047, |
|
"eval_runtime": 4.6763, |
|
"eval_samples_per_second": 1.069, |
|
"eval_steps_per_second": 0.642, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 0.9992542878448919, |
|
"step": 670, |
|
"total_flos": 7.44129634982953e+16, |
|
"train_loss": 0.0, |
|
"train_runtime": 12.3404, |
|
"train_samples_per_second": 434.426, |
|
"train_steps_per_second": 54.293 |
|
} |
|
], |
|
"logging_steps": 25, |
|
"max_steps": 670, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 1, |
|
"save_steps": 25, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 7.44129634982953e+16, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|