{ "best_metric": 0.4858, "best_model_checkpoint": "runs/cocoruta2-llama3-1-8b-regex-only-valid/checkpoint-350", "epoch": 0.9992542878448919, "eval_steps": 25, "global_step": 670, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.037285607755406416, "grad_norm": 0.5538629931679852, "learning_rate": 7.46268656716418e-05, "loss": 0.8987, "step": 25 }, { "epoch": 0.037285607755406416, "eval_loss": 0.6527890563011169, "eval_runtime": 4.7198, "eval_samples_per_second": 1.059, "eval_steps_per_second": 0.636, "step": 25 }, { "epoch": 0.07457121551081283, "grad_norm": 0.2341721772038344, "learning_rate": 0.0001492537313432836, "loss": 0.6394, "step": 50 }, { "epoch": 0.07457121551081283, "eval_loss": 0.5486425757408142, "eval_runtime": 4.7295, "eval_samples_per_second": 1.057, "eval_steps_per_second": 0.634, "step": 50 }, { "epoch": 0.11185682326621924, "grad_norm": 0.19912920431393782, "learning_rate": 0.00019991315351855748, "loss": 0.6007, "step": 75 }, { "epoch": 0.11185682326621924, "eval_loss": 0.5331323146820068, "eval_runtime": 4.6724, "eval_samples_per_second": 1.07, "eval_steps_per_second": 0.642, "step": 75 }, { "epoch": 0.14914243102162567, "grad_norm": 0.22498739924316974, "learning_rate": 0.0001985256759242359, "loss": 0.5696, "step": 100 }, { "epoch": 0.14914243102162567, "eval_loss": 0.49856358766555786, "eval_runtime": 4.7327, "eval_samples_per_second": 1.056, "eval_steps_per_second": 0.634, "step": 100 }, { "epoch": 0.18642803877703207, "grad_norm": 0.14018935803156768, "learning_rate": 0.00019546910545535558, "loss": 0.5523, "step": 125 }, { "epoch": 0.18642803877703207, "eval_loss": 0.48138752579689026, "eval_runtime": 4.6757, "eval_samples_per_second": 1.069, "eval_steps_per_second": 0.642, "step": 125 }, { "epoch": 0.22371364653243847, "grad_norm": 0.16571925714609734, "learning_rate": 0.00019079522252288386, "loss": 0.5779, "step": 150 }, { "epoch": 0.22371364653243847, "eval_loss": 0.472788006067276, "eval_runtime": 4.7083, "eval_samples_per_second": 1.062, "eval_steps_per_second": 0.637, "step": 150 }, { "epoch": 0.2609992542878449, "grad_norm": 0.14926948884435004, "learning_rate": 0.00018458320592590975, "loss": 0.5351, "step": 175 }, { "epoch": 0.2609992542878449, "eval_loss": 0.467672735452652, "eval_runtime": 4.6956, "eval_samples_per_second": 1.065, "eval_steps_per_second": 0.639, "step": 175 }, { "epoch": 0.29828486204325133, "grad_norm": 0.13621513300078802, "learning_rate": 0.00017693829150820068, "loss": 0.5251, "step": 200 }, { "epoch": 0.29828486204325133, "eval_loss": 0.4527561664581299, "eval_runtime": 4.7174, "eval_samples_per_second": 1.06, "eval_steps_per_second": 0.636, "step": 200 }, { "epoch": 0.33557046979865773, "grad_norm": 0.14343687732505184, "learning_rate": 0.00016798998939045895, "loss": 0.5301, "step": 225 }, { "epoch": 0.33557046979865773, "eval_loss": 0.45074066519737244, "eval_runtime": 4.7651, "eval_samples_per_second": 1.049, "eval_steps_per_second": 0.63, "step": 225 }, { "epoch": 0.37285607755406414, "grad_norm": 0.14384455598548593, "learning_rate": 0.00015788988997959114, "loss": 0.5554, "step": 250 }, { "epoch": 0.37285607755406414, "eval_loss": 0.45150551199913025, "eval_runtime": 4.7234, "eval_samples_per_second": 1.059, "eval_steps_per_second": 0.635, "step": 250 }, { "epoch": 0.41014168530947054, "grad_norm": 0.11661007763800872, "learning_rate": 0.0001468090959227082, "loss": 0.5198, "step": 275 }, { "epoch": 0.41014168530947054, "eval_loss": 0.4460136294364929, "eval_runtime": 4.7242, "eval_samples_per_second": 1.058, "eval_steps_per_second": 0.635, "step": 275 }, { "epoch": 0.44742729306487694, "grad_norm": 0.1321962416990341, "learning_rate": 0.0001349353235103232, "loss": 0.5484, "step": 300 }, { "epoch": 0.44742729306487694, "eval_loss": 0.4421899914741516, "eval_runtime": 4.7511, "eval_samples_per_second": 1.052, "eval_steps_per_second": 0.631, "step": 300 }, { "epoch": 0.48471290082028334, "grad_norm": 0.13900994098830932, "learning_rate": 0.0001224697226329772, "loss": 0.5223, "step": 325 }, { "epoch": 0.48471290082028334, "eval_loss": 0.4396364092826843, "eval_runtime": 4.705, "eval_samples_per_second": 1.063, "eval_steps_per_second": 0.638, "step": 325 }, { "epoch": 0.5219985085756897, "grad_norm": 0.15825647169367713, "learning_rate": 0.00010962346916341903, "loss": 0.4858, "step": 350 }, { "epoch": 0.5219985085756897, "eval_loss": 0.43353357911109924, "eval_runtime": 4.7849, "eval_samples_per_second": 1.045, "eval_steps_per_second": 0.627, "step": 350 }, { "epoch": 0.5592841163310962, "grad_norm": 0.11937908855087626, "learning_rate": 9.661418749173467e-05, "loss": 0.5051, "step": 375 }, { "epoch": 0.5592841163310962, "eval_loss": 0.42787012457847595, "eval_runtime": 4.7427, "eval_samples_per_second": 1.054, "eval_steps_per_second": 0.633, "step": 375 }, { "epoch": 0.5965697240865027, "grad_norm": 0.1236665974221879, "learning_rate": 8.366226381814697e-05, "loss": 0.489, "step": 400 }, { "epoch": 0.5965697240865027, "eval_loss": 0.4265735149383545, "eval_runtime": 4.7586, "eval_samples_per_second": 1.051, "eval_steps_per_second": 0.63, "step": 400 }, { "epoch": 0.633855331841909, "grad_norm": 0.1340681564316269, "learning_rate": 7.09871126588481e-05, "loss": 0.4992, "step": 425 }, { "epoch": 0.633855331841909, "eval_loss": 0.41966643929481506, "eval_runtime": 4.725, "eval_samples_per_second": 1.058, "eval_steps_per_second": 0.635, "step": 425 }, { "epoch": 0.6711409395973155, "grad_norm": 0.14955138964358772, "learning_rate": 5.880345981282876e-05, "loss": 0.4985, "step": 450 }, { "epoch": 0.6711409395973155, "eval_loss": 0.41805362701416016, "eval_runtime": 4.6928, "eval_samples_per_second": 1.065, "eval_steps_per_second": 0.639, "step": 450 }, { "epoch": 0.7084265473527218, "grad_norm": 0.13281003649358977, "learning_rate": 4.7317704758809946e-05, "loss": 0.511, "step": 475 }, { "epoch": 0.7084265473527218, "eval_loss": 0.4160347878932953, "eval_runtime": 4.6998, "eval_samples_per_second": 1.064, "eval_steps_per_second": 0.638, "step": 475 }, { "epoch": 0.7457121551081283, "grad_norm": 0.13212294340504707, "learning_rate": 3.672442410577965e-05, "loss": 0.5103, "step": 500 }, { "epoch": 0.7457121551081283, "eval_loss": 0.41526561975479126, "eval_runtime": 4.7196, "eval_samples_per_second": 1.059, "eval_steps_per_second": 0.636, "step": 500 }, { "epoch": 0.7829977628635347, "grad_norm": 0.13255473695033904, "learning_rate": 2.7203075331094017e-05, "loss": 0.4953, "step": 525 }, { "epoch": 0.7829977628635347, "eval_loss": 0.41363996267318726, "eval_runtime": 4.6994, "eval_samples_per_second": 1.064, "eval_steps_per_second": 0.638, "step": 525 }, { "epoch": 0.8202833706189411, "grad_norm": 0.11553089711715571, "learning_rate": 1.89149566470915e-05, "loss": 0.5017, "step": 550 }, { "epoch": 0.8202833706189411, "eval_loss": 0.4126836359500885, "eval_runtime": 4.7366, "eval_samples_per_second": 1.056, "eval_steps_per_second": 0.633, "step": 550 }, { "epoch": 0.8575689783743475, "grad_norm": 0.1205660312265152, "learning_rate": 1.2000474498175552e-05, "loss": 0.488, "step": 575 }, { "epoch": 0.8575689783743475, "eval_loss": 0.41148170828819275, "eval_runtime": 4.6986, "eval_samples_per_second": 1.064, "eval_steps_per_second": 0.638, "step": 575 }, { "epoch": 0.8948545861297539, "grad_norm": 0.11357983134630366, "learning_rate": 6.576764978849004e-06, "loss": 0.4862, "step": 600 }, { "epoch": 0.8948545861297539, "eval_loss": 0.4110669493675232, "eval_runtime": 4.7813, "eval_samples_per_second": 1.046, "eval_steps_per_second": 0.627, "step": 600 }, { "epoch": 0.9321401938851603, "grad_norm": 0.11689127563900782, "learning_rate": 2.735709467518699e-06, "loss": 0.4866, "step": 625 }, { "epoch": 0.9321401938851603, "eval_loss": 0.41068965196609497, "eval_runtime": 4.7375, "eval_samples_per_second": 1.055, "eval_steps_per_second": 0.633, "step": 625 }, { "epoch": 0.9694258016405667, "grad_norm": 0.12155258793935453, "learning_rate": 5.42378092601481e-07, "loss": 0.4902, "step": 650 }, { "epoch": 0.9694258016405667, "eval_loss": 0.4103812277317047, "eval_runtime": 4.6763, "eval_samples_per_second": 1.069, "eval_steps_per_second": 0.642, "step": 650 }, { "epoch": 0.9992542878448919, "step": 670, "total_flos": 7.44129634982953e+16, "train_loss": 0.0, "train_runtime": 12.3404, "train_samples_per_second": 434.426, "train_steps_per_second": 54.293 } ], "logging_steps": 25, "max_steps": 670, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 7.44129634982953e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }