{ "best_metric": null, "best_model_checkpoint": null, "epoch": 16.0, "eval_steps": 500, "global_step": 5088, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.3145744502544403, "learning_rate": 0.00013773896055969116, "loss": 0.2562, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.8893548387096775, "eval_loss": 0.05541645735502243, "eval_runtime": 2.6126, "eval_samples_per_second": 1186.566, "eval_steps_per_second": 24.88, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.2050488293170929, "learning_rate": 0.00012858715789831575, "loss": 0.0488, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9303225806451613, "eval_loss": 0.030376020818948746, "eval_runtime": 2.6008, "eval_samples_per_second": 1191.932, "eval_steps_per_second": 24.992, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.20619478821754456, "learning_rate": 0.00011943535523694032, "loss": 0.0305, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.9383870967741935, "eval_loss": 0.023558245971798897, "eval_runtime": 2.6497, "eval_samples_per_second": 1169.948, "eval_steps_per_second": 24.531, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.13936841487884521, "learning_rate": 0.00011028355257556492, "loss": 0.0246, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.9390322580645162, "eval_loss": 0.02127992734313011, "eval_runtime": 2.6543, "eval_samples_per_second": 1167.921, "eval_steps_per_second": 24.489, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.09692877531051636, "learning_rate": 0.0001011317499141895, "loss": 0.0215, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9403225806451613, "eval_loss": 0.01949879713356495, "eval_runtime": 2.6309, "eval_samples_per_second": 1178.314, "eval_steps_per_second": 24.707, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.09292752295732498, "learning_rate": 9.197994725281409e-05, "loss": 0.0197, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9435483870967742, "eval_loss": 0.01897437684237957, "eval_runtime": 2.6528, "eval_samples_per_second": 1168.577, "eval_steps_per_second": 24.502, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.07617080211639404, "learning_rate": 8.282814459143868e-05, "loss": 0.0183, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.9441935483870968, "eval_loss": 0.018397612497210503, "eval_runtime": 2.6546, "eval_samples_per_second": 1167.789, "eval_steps_per_second": 24.486, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.07391500473022461, "learning_rate": 7.367634193006326e-05, "loss": 0.0171, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.9461290322580646, "eval_loss": 0.017443224787712097, "eval_runtime": 2.6528, "eval_samples_per_second": 1168.563, "eval_steps_per_second": 24.502, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.09221338480710983, "learning_rate": 6.452453926868784e-05, "loss": 0.0162, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9435483870967742, "eval_loss": 0.017002489417791367, "eval_runtime": 2.6461, "eval_samples_per_second": 1171.522, "eval_steps_per_second": 24.564, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.07394809275865555, "learning_rate": 5.537273660731243e-05, "loss": 0.0155, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9429032258064516, "eval_loss": 0.01677101105451584, "eval_runtime": 2.635, "eval_samples_per_second": 1176.456, "eval_steps_per_second": 24.668, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.0590691976249218, "learning_rate": 4.622093394593702e-05, "loss": 0.0149, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9464516129032258, "eval_loss": 0.016378937289118767, "eval_runtime": 2.6318, "eval_samples_per_second": 1177.923, "eval_steps_per_second": 24.698, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.05711551755666733, "learning_rate": 3.7069131284561605e-05, "loss": 0.0143, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9461290322580646, "eval_loss": 0.015853669494390488, "eval_runtime": 2.6492, "eval_samples_per_second": 1170.169, "eval_steps_per_second": 24.536, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.05840861052274704, "learning_rate": 2.7917328623186196e-05, "loss": 0.0137, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9461290322580646, "eval_loss": 0.015562719665467739, "eval_runtime": 2.6419, "eval_samples_per_second": 1173.405, "eval_steps_per_second": 24.604, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.0555766336619854, "learning_rate": 1.8765525961810784e-05, "loss": 0.0133, "step": 4438 }, { "epoch": 14.0, "eval_accuracy": 0.9467741935483871, "eval_loss": 0.015005242079496384, "eval_runtime": 2.6572, "eval_samples_per_second": 1166.643, "eval_steps_per_second": 24.462, "step": 4452 }, { "epoch": 14.952830188679245, "grad_norm": 0.05531783401966095, "learning_rate": 9.61372330043537e-06, "loss": 0.0129, "step": 4755 }, { "epoch": 15.0, "eval_accuracy": 0.9461290322580646, "eval_loss": 0.01490978617221117, "eval_runtime": 2.6507, "eval_samples_per_second": 1169.499, "eval_steps_per_second": 24.522, "step": 4770 }, { "epoch": 15.949685534591195, "grad_norm": 0.05344949662685394, "learning_rate": 4.6192063905995774e-07, "loss": 0.0125, "step": 5072 } ], "logging_steps": 317, "max_steps": 5088, "num_input_tokens_seen": 0, "num_train_epochs": 16, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1319419533829824.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.8496454567338295, "fp16": false, "learning_rate": 0.0001430799179488219, "lr_scheduler": "cosine", "num_train_epochs": 16, "temperature": 12, "warmup_steps": 132, "weight_decay": 0.20337985521529534 } }