{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 500, "global_step": 5088, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.381134957075119, "learning_rate": 0.00019797823879361723, "loss": 0.2559, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.89, "eval_loss": 0.058846741914749146, "eval_runtime": 2.6122, "eval_samples_per_second": 1186.735, "eval_steps_per_second": 24.883, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.21276384592056274, "learning_rate": 0.00018482395212466384, "loss": 0.0471, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9338709677419355, "eval_loss": 0.031198587268590927, "eval_runtime": 2.6181, "eval_samples_per_second": 1184.043, "eval_steps_per_second": 24.827, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.2707769274711609, "learning_rate": 0.00017166966545571045, "loss": 0.0304, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.9393548387096774, "eval_loss": 0.027465904131531715, "eval_runtime": 2.6194, "eval_samples_per_second": 1183.467, "eval_steps_per_second": 24.815, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.1308950036764145, "learning_rate": 0.00015851537878675706, "loss": 0.0243, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.9416129032258065, "eval_loss": 0.023094674572348595, "eval_runtime": 2.6149, "eval_samples_per_second": 1185.525, "eval_steps_per_second": 24.858, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.0777081847190857, "learning_rate": 0.00014536109211780367, "loss": 0.0209, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9435483870967742, "eval_loss": 0.02354435622692108, "eval_runtime": 2.6158, "eval_samples_per_second": 1185.084, "eval_steps_per_second": 24.849, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.0858517661690712, "learning_rate": 0.00013220680544885025, "loss": 0.019, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9451612903225807, "eval_loss": 0.02090289629995823, "eval_runtime": 2.6518, "eval_samples_per_second": 1169.015, "eval_steps_per_second": 24.512, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.08468086272478104, "learning_rate": 0.00011905251877989686, "loss": 0.0176, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.947741935483871, "eval_loss": 0.01986338384449482, "eval_runtime": 2.6294, "eval_samples_per_second": 1178.961, "eval_steps_per_second": 24.72, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.06428312510251999, "learning_rate": 0.00010589823211094344, "loss": 0.0166, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.9470967741935484, "eval_loss": 0.019629037007689476, "eval_runtime": 2.6518, "eval_samples_per_second": 1169.031, "eval_steps_per_second": 24.512, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.07834560424089432, "learning_rate": 9.274394544199005e-05, "loss": 0.0173, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9438709677419355, "eval_loss": 0.020628534257411957, "eval_runtime": 2.6779, "eval_samples_per_second": 1157.602, "eval_steps_per_second": 24.272, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.06184606999158859, "learning_rate": 7.958965877303666e-05, "loss": 0.0155, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9480645161290323, "eval_loss": 0.018417367711663246, "eval_runtime": 2.6615, "eval_samples_per_second": 1164.775, "eval_steps_per_second": 24.423, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.0588119812309742, "learning_rate": 6.643537210408326e-05, "loss": 0.0145, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9496774193548387, "eval_loss": 0.017711853608489037, "eval_runtime": 2.6508, "eval_samples_per_second": 1169.454, "eval_steps_per_second": 24.521, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.05123499035835266, "learning_rate": 5.328108543512986e-05, "loss": 0.0139, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9483870967741935, "eval_loss": 0.017743250355124474, "eval_runtime": 2.6626, "eval_samples_per_second": 1164.291, "eval_steps_per_second": 24.413, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.05427718907594681, "learning_rate": 4.0126798766176454e-05, "loss": 0.0132, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9493548387096774, "eval_loss": 0.01712816022336483, "eval_runtime": 2.6545, "eval_samples_per_second": 1167.815, "eval_steps_per_second": 24.486, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.0513085275888443, "learning_rate": 2.6972512097223057e-05, "loss": 0.0128, "step": 4438 }, { "epoch": 14.0, "eval_accuracy": 0.9503225806451613, "eval_loss": 0.016615070402622223, "eval_runtime": 2.6323, "eval_samples_per_second": 1177.661, "eval_steps_per_second": 24.693, "step": 4452 }, { "epoch": 14.952830188679245, "grad_norm": 0.04954610392451286, "learning_rate": 1.3818225428269658e-05, "loss": 0.0123, "step": 4755 }, { "epoch": 15.0, "eval_accuracy": 0.9509677419354838, "eval_loss": 0.01620567962527275, "eval_runtime": 2.6507, "eval_samples_per_second": 1169.516, "eval_steps_per_second": 24.522, "step": 4770 }, { "epoch": 15.949685534591195, "grad_norm": 0.05111037567257881, "learning_rate": 6.639387593162599e-07, "loss": 0.012, "step": 5072 } ], "logging_steps": 317, "max_steps": 5088, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1319419533829824.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.33188902619419924, "fp16": false, "learning_rate": 0.00020387069528254905, "lr_scheduler": "cosine", "num_train_epochs": 16, "temperature": 11, "warmup_steps": 175, "weight_decay": 0.15922831030843926 } }