{ "best_metric": null, "best_model_checkpoint": null, "epoch": 15.0, "eval_steps": 500, "global_step": 4770, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.24409309029579163, "learning_rate": 0.00014590873969244836, "loss": 0.1983, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.8987096774193548, "eval_loss": 0.04829428717494011, "eval_runtime": 5.4639, "eval_samples_per_second": 567.356, "eval_steps_per_second": 11.896, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.2151748687028885, "learning_rate": 0.0001411733693365559, "loss": 0.0453, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9312903225806451, "eval_loss": 0.03353011608123779, "eval_runtime": 5.4346, "eval_samples_per_second": 570.424, "eval_steps_per_second": 11.96, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.1853768676519394, "learning_rate": 0.00013350993615255516, "loss": 0.0302, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.9380645161290323, "eval_loss": 0.024508632719516754, "eval_runtime": 5.4066, "eval_samples_per_second": 573.375, "eval_steps_per_second": 12.022, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.12661713361740112, "learning_rate": 0.00012325127343070825, "loss": 0.0241, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.9390322580645162, "eval_loss": 0.022521397098898888, "eval_runtime": 5.4552, "eval_samples_per_second": 568.268, "eval_steps_per_second": 11.915, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.07092136144638062, "learning_rate": 0.00011084292879770157, "loss": 0.0211, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9432258064516129, "eval_loss": 0.020574456080794334, "eval_runtime": 5.4348, "eval_samples_per_second": 570.399, "eval_steps_per_second": 11.96, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.08544526994228363, "learning_rate": 9.682381347931996e-05, "loss": 0.0191, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9425806451612904, "eval_loss": 0.020394276827573776, "eval_runtime": 5.3752, "eval_samples_per_second": 576.72, "eval_steps_per_second": 12.093, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.07930737733840942, "learning_rate": 8.180279665546596e-05, "loss": 0.0178, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.942258064516129, "eval_loss": 0.019140785560011864, "eval_runtime": 5.4099, "eval_samples_per_second": 573.027, "eval_steps_per_second": 12.015, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.07150762528181076, "learning_rate": 6.643226144641048e-05, "loss": 0.0166, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.9429032258064516, "eval_loss": 0.018344789743423462, "eval_runtime": 5.4691, "eval_samples_per_second": 566.817, "eval_steps_per_second": 11.885, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.07256153225898743, "learning_rate": 5.137977102963379e-05, "loss": 0.0156, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9435483870967742, "eval_loss": 0.018101442605257034, "eval_runtime": 5.4494, "eval_samples_per_second": 568.87, "eval_steps_per_second": 11.928, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.0548202246427536, "learning_rate": 3.729907546620694e-05, "loss": 0.0149, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9470967741935484, "eval_loss": 0.01727716624736786, "eval_runtime": 5.4078, "eval_samples_per_second": 573.243, "eval_steps_per_second": 12.02, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.06869664788246155, "learning_rate": 2.480171844954125e-05, "loss": 0.0141, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9451612903225807, "eval_loss": 0.016744110733270645, "eval_runtime": 5.3758, "eval_samples_per_second": 576.654, "eval_steps_per_second": 12.091, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.058046165853738785, "learning_rate": 1.4430477133891922e-05, "loss": 0.0137, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9464516129032258, "eval_loss": 0.016505638137459755, "eval_runtime": 5.4235, "eval_samples_per_second": 571.587, "eval_steps_per_second": 11.985, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.07068239897489548, "learning_rate": 6.6357885868641785e-06, "loss": 0.0133, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9464516129032258, "eval_loss": 0.016218330711126328, "eval_runtime": 5.4782, "eval_samples_per_second": 565.88, "eval_steps_per_second": 11.865, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.055857669562101364, "learning_rate": 1.7561866970409828e-06, "loss": 0.0131, "step": 4438 }, { "epoch": 14.0, "eval_accuracy": 0.9467741935483871, "eval_loss": 0.01611991599202156, "eval_runtime": 5.4357, "eval_samples_per_second": 570.308, "eval_steps_per_second": 11.958, "step": 4452 }, { "epoch": 14.952830188679245, "grad_norm": 0.04982820898294449, "learning_rate": 3.599188350283289e-09, "loss": 0.0129, "step": 4755 } ], "logging_steps": 317, "max_steps": 4770, "num_input_tokens_seen": 0, "num_train_epochs": 15, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1236796014635076.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.8539116398349643, "learning_rate": 0.00014751038366731588, "lr_scheduler_type": "cosine", "num_train_epochs": 15, "temperature": 10.068159940358916, "weight_decay": 0.299840395098859 } }