{ "best_metric": null, "best_model_checkpoint": null, "epoch": 14.0, "eval_steps": 500, "global_step": 4452, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.3429218530654907, "learning_rate": 0.00019453796448455724, "loss": 0.2573, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.8848387096774194, "eval_loss": 0.05875122919678688, "eval_runtime": 2.6188, "eval_samples_per_second": 1183.755, "eval_steps_per_second": 24.821, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.2231501191854477, "learning_rate": 0.00017962417131851016, "loss": 0.0475, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9248387096774193, "eval_loss": 0.033762380480766296, "eval_runtime": 2.6039, "eval_samples_per_second": 1190.545, "eval_steps_per_second": 24.963, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.20701543986797333, "learning_rate": 0.00016471037815246308, "loss": 0.0305, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.9387096774193548, "eval_loss": 0.025485960766673088, "eval_runtime": 2.6072, "eval_samples_per_second": 1189.026, "eval_steps_per_second": 24.931, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.2078278660774231, "learning_rate": 0.00014979658498641602, "loss": 0.0242, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.9412903225806452, "eval_loss": 0.024921948090195656, "eval_runtime": 2.6163, "eval_samples_per_second": 1184.865, "eval_steps_per_second": 24.844, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.09209390729665756, "learning_rate": 0.00013488279182036894, "loss": 0.0213, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9454838709677419, "eval_loss": 0.021658778190612793, "eval_runtime": 2.6077, "eval_samples_per_second": 1188.789, "eval_steps_per_second": 24.926, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.07631496340036392, "learning_rate": 0.00011996899865432188, "loss": 0.0191, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9435483870967742, "eval_loss": 0.021196924149990082, "eval_runtime": 2.6236, "eval_samples_per_second": 1181.597, "eval_steps_per_second": 24.775, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.06425660103559494, "learning_rate": 0.0001050552054882748, "loss": 0.0176, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.9493548387096774, "eval_loss": 0.019918320700526237, "eval_runtime": 2.6075, "eval_samples_per_second": 1188.867, "eval_steps_per_second": 24.928, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.0663381814956665, "learning_rate": 9.014141232222773e-05, "loss": 0.0165, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.9490322580645161, "eval_loss": 0.019041651859879494, "eval_runtime": 2.6053, "eval_samples_per_second": 1189.869, "eval_steps_per_second": 24.949, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.0676058828830719, "learning_rate": 7.522761915618066e-05, "loss": 0.0156, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9487096774193549, "eval_loss": 0.018456541001796722, "eval_runtime": 2.6235, "eval_samples_per_second": 1181.605, "eval_steps_per_second": 24.776, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.062172286212444305, "learning_rate": 6.031382599013358e-05, "loss": 0.0148, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.947741935483871, "eval_loss": 0.01798408292233944, "eval_runtime": 2.6268, "eval_samples_per_second": 1180.13, "eval_steps_per_second": 24.745, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.0584622323513031, "learning_rate": 4.5400032824086506e-05, "loss": 0.0141, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9503225806451613, "eval_loss": 0.017367394641041756, "eval_runtime": 2.6152, "eval_samples_per_second": 1185.368, "eval_steps_per_second": 24.854, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.05319051072001457, "learning_rate": 3.048623965803944e-05, "loss": 0.0135, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9493548387096774, "eval_loss": 0.017015676945447922, "eval_runtime": 2.5922, "eval_samples_per_second": 1195.915, "eval_steps_per_second": 25.076, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.05101251229643822, "learning_rate": 1.5572446491992368e-05, "loss": 0.013, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9493548387096774, "eval_loss": 0.016792386770248413, "eval_runtime": 2.6169, "eval_samples_per_second": 1184.591, "eval_steps_per_second": 24.838, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.04865499958395958, "learning_rate": 6.586533259452965e-07, "loss": 0.0126, "step": 4438 } ], "logging_steps": 317, "max_steps": 4452, "num_input_tokens_seen": 0, "num_train_epochs": 14, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1154143441198440.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.34251596991370203, "fp16": false, "learning_rate": 0.00020126563774242704, "lr_scheduler": "cosine", "num_train_epochs": 14, "temperature": 10, "warmup_steps": 174, "weight_decay": 0.1646286765476401 } }