{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "eval_steps": 500, "global_step": 4452, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.47506219148635864, "learning_rate": 9.46923688245487e-06, "loss": 0.4817, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.33064516129032256, "eval_loss": 0.3162553906440735, "eval_runtime": 2.713, "eval_samples_per_second": 1142.656, "eval_steps_per_second": 23.959, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.5554618239402771, "learning_rate": 8.743300221816854e-06, "loss": 0.2672, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.6270967741935484, "eval_loss": 0.18112796545028687, "eval_runtime": 2.624, "eval_samples_per_second": 1181.409, "eval_steps_per_second": 24.771, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.6009519100189209, "learning_rate": 8.017363561178839e-06, "loss": 0.1758, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.7761290322580645, "eval_loss": 0.12200149148702621, "eval_runtime": 2.6358, "eval_samples_per_second": 1176.11, "eval_steps_per_second": 24.66, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.5635930299758911, "learning_rate": 7.291426900540823e-06, "loss": 0.1322, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.8258064516129032, "eval_loss": 0.09276342391967773, "eval_runtime": 2.6813, "eval_samples_per_second": 1156.15, "eval_steps_per_second": 24.242, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.3592129349708557, "learning_rate": 6.5654902399028074e-06, "loss": 0.1073, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.8583870967741936, "eval_loss": 0.07522393763065338, "eval_runtime": 2.6613, "eval_samples_per_second": 1164.858, "eval_steps_per_second": 24.424, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.5057026147842407, "learning_rate": 5.839553579264793e-06, "loss": 0.0919, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.8806451612903226, "eval_loss": 0.06308811157941818, "eval_runtime": 2.6307, "eval_samples_per_second": 1178.407, "eval_steps_per_second": 24.709, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.35764896869659424, "learning_rate": 5.1136169186267775e-06, "loss": 0.0813, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.8877419354838709, "eval_loss": 0.05563005059957504, "eval_runtime": 2.6575, "eval_samples_per_second": 1166.531, "eval_steps_per_second": 24.46, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.3608967363834381, "learning_rate": 4.387680257988761e-06, "loss": 0.0733, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.8977419354838709, "eval_loss": 0.05038844048976898, "eval_runtime": 2.6676, "eval_samples_per_second": 1162.09, "eval_steps_per_second": 24.366, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.43566691875457764, "learning_rate": 3.661743597350746e-06, "loss": 0.0679, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9019354838709678, "eval_loss": 0.045959629118442535, "eval_runtime": 2.69, "eval_samples_per_second": 1152.43, "eval_steps_per_second": 24.164, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.3841456472873688, "learning_rate": 2.935806936712731e-06, "loss": 0.0636, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9038709677419355, "eval_loss": 0.04300825670361519, "eval_runtime": 2.6725, "eval_samples_per_second": 1159.945, "eval_steps_per_second": 24.321, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.3714827597141266, "learning_rate": 2.2098702760747155e-06, "loss": 0.0605, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.905483870967742, "eval_loss": 0.04122327268123627, "eval_runtime": 2.6669, "eval_samples_per_second": 1162.413, "eval_steps_per_second": 24.373, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.28599628806114197, "learning_rate": 1.4839336154367003e-06, "loss": 0.0584, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9080645161290323, "eval_loss": 0.039800167083740234, "eval_runtime": 2.6582, "eval_samples_per_second": 1166.221, "eval_steps_per_second": 24.453, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.3374980390071869, "learning_rate": 7.579969547986848e-07, "loss": 0.0568, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9103225806451613, "eval_loss": 0.03894847258925438, "eval_runtime": 2.6631, "eval_samples_per_second": 1164.046, "eval_steps_per_second": 24.407, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.2531619071960449, "learning_rate": 3.206029416066945e-08, "loss": 0.0562, "step": 4438 } ], "logging_steps": 317, "max_steps": 4452, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1234918384253064.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.6907680425307066, "fp16": true, "learning_rate": 1.015395316488631e-05, "lr_scheduler": "cosine", "num_train_epochs": 14.721187464037747, "temperature": 13, "warmup_steps": 18, "weight_decay": 0.2870223156122959 } }