{ "best_metric": null, "best_model_checkpoint": null, "epoch": 20.0, "eval_steps": 500, "global_step": 6360, "is_hyper_param_search": true, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.9968553459119497, "grad_norm": 0.3776308596134186, "learning_rate": 0.00038685123471569015, "loss": 0.1977, "step": 317 }, { "epoch": 1.0, "eval_accuracy": 0.8580645161290322, "eval_loss": 0.06473647803068161, "eval_runtime": 2.6632, "eval_samples_per_second": 1164.002, "eval_steps_per_second": 24.406, "step": 318 }, { "epoch": 1.9937106918238994, "grad_norm": 0.25335493683815, "learning_rate": 0.0003665580291216353, "loss": 0.0555, "step": 634 }, { "epoch": 2.0, "eval_accuracy": 0.9019354838709678, "eval_loss": 0.05631404370069504, "eval_runtime": 2.6288, "eval_samples_per_second": 1179.25, "eval_steps_per_second": 24.726, "step": 636 }, { "epoch": 2.990566037735849, "grad_norm": 0.6674047112464905, "learning_rate": 0.0003462648235275804, "loss": 0.0401, "step": 951 }, { "epoch": 3.0, "eval_accuracy": 0.8874193548387097, "eval_loss": 0.05496141314506531, "eval_runtime": 2.6441, "eval_samples_per_second": 1172.424, "eval_steps_per_second": 24.583, "step": 954 }, { "epoch": 3.9874213836477987, "grad_norm": 0.38346582651138306, "learning_rate": 0.00032597161793352546, "loss": 0.0399, "step": 1268 }, { "epoch": 4.0, "eval_accuracy": 0.8932258064516129, "eval_loss": 0.0530259795486927, "eval_runtime": 2.6688, "eval_samples_per_second": 1161.578, "eval_steps_per_second": 24.356, "step": 1272 }, { "epoch": 4.984276729559748, "grad_norm": 0.10067661851644516, "learning_rate": 0.00030567841233947055, "loss": 0.0336, "step": 1585 }, { "epoch": 5.0, "eval_accuracy": 0.9161290322580645, "eval_loss": 0.04296305030584335, "eval_runtime": 2.6372, "eval_samples_per_second": 1175.495, "eval_steps_per_second": 24.647, "step": 1590 }, { "epoch": 5.981132075471698, "grad_norm": 0.22013570368289948, "learning_rate": 0.0002853852067454157, "loss": 0.0287, "step": 1902 }, { "epoch": 6.0, "eval_accuracy": 0.9116129032258065, "eval_loss": 0.048792969435453415, "eval_runtime": 2.649, "eval_samples_per_second": 1170.263, "eval_steps_per_second": 24.538, "step": 1908 }, { "epoch": 6.977987421383648, "grad_norm": 0.08923449367284775, "learning_rate": 0.0002650920011513608, "loss": 0.0255, "step": 2219 }, { "epoch": 7.0, "eval_accuracy": 0.912258064516129, "eval_loss": 0.044845063239336014, "eval_runtime": 2.6493, "eval_samples_per_second": 1170.106, "eval_steps_per_second": 24.534, "step": 2226 }, { "epoch": 7.9748427672955975, "grad_norm": 0.16984495520591736, "learning_rate": 0.00024479879555730586, "loss": 0.0245, "step": 2536 }, { "epoch": 8.0, "eval_accuracy": 0.917741935483871, "eval_loss": 0.04449814185500145, "eval_runtime": 2.6634, "eval_samples_per_second": 1163.909, "eval_steps_per_second": 24.405, "step": 2544 }, { "epoch": 8.971698113207546, "grad_norm": 0.5007947087287903, "learning_rate": 0.00022450558996325092, "loss": 0.0244, "step": 2853 }, { "epoch": 9.0, "eval_accuracy": 0.9183870967741935, "eval_loss": 0.04302350431680679, "eval_runtime": 2.643, "eval_samples_per_second": 1172.907, "eval_steps_per_second": 24.593, "step": 2862 }, { "epoch": 9.968553459119496, "grad_norm": 0.06756994128227234, "learning_rate": 0.00020421238436919606, "loss": 0.0219, "step": 3170 }, { "epoch": 10.0, "eval_accuracy": 0.9238709677419354, "eval_loss": 0.03625302389264107, "eval_runtime": 2.6402, "eval_samples_per_second": 1174.161, "eval_steps_per_second": 24.62, "step": 3180 }, { "epoch": 10.965408805031446, "grad_norm": 0.7381525635719299, "learning_rate": 0.00018391917877514115, "loss": 0.0199, "step": 3487 }, { "epoch": 11.0, "eval_accuracy": 0.9287096774193548, "eval_loss": 0.03676296025514603, "eval_runtime": 2.6489, "eval_samples_per_second": 1170.314, "eval_steps_per_second": 24.539, "step": 3498 }, { "epoch": 11.962264150943396, "grad_norm": 0.051013097167015076, "learning_rate": 0.00016362597318108623, "loss": 0.0184, "step": 3804 }, { "epoch": 12.0, "eval_accuracy": 0.9264516129032258, "eval_loss": 0.03529633954167366, "eval_runtime": 2.6771, "eval_samples_per_second": 1157.95, "eval_steps_per_second": 24.28, "step": 3816 }, { "epoch": 12.959119496855346, "grad_norm": 0.07709548622369766, "learning_rate": 0.00014333276758703132, "loss": 0.0171, "step": 4121 }, { "epoch": 13.0, "eval_accuracy": 0.9306451612903226, "eval_loss": 0.03273295238614082, "eval_runtime": 2.6553, "eval_samples_per_second": 1167.463, "eval_steps_per_second": 24.479, "step": 4134 }, { "epoch": 13.955974842767295, "grad_norm": 0.13161151111125946, "learning_rate": 0.0001230395619929764, "loss": 0.016, "step": 4438 }, { "epoch": 14.0, "eval_accuracy": 0.9309677419354838, "eval_loss": 0.03127666935324669, "eval_runtime": 2.6519, "eval_samples_per_second": 1168.993, "eval_steps_per_second": 24.511, "step": 4452 }, { "epoch": 14.952830188679245, "grad_norm": 0.05098772048950195, "learning_rate": 0.00010274635639892152, "loss": 0.0151, "step": 4755 }, { "epoch": 15.0, "eval_accuracy": 0.9348387096774193, "eval_loss": 0.030557256191968918, "eval_runtime": 2.6688, "eval_samples_per_second": 1161.571, "eval_steps_per_second": 24.356, "step": 4770 }, { "epoch": 15.949685534591195, "grad_norm": 0.04677645117044449, "learning_rate": 8.245315080486662e-05, "loss": 0.0143, "step": 5072 }, { "epoch": 16.0, "eval_accuracy": 0.9338709677419355, "eval_loss": 0.03013915941119194, "eval_runtime": 2.6585, "eval_samples_per_second": 1166.065, "eval_steps_per_second": 24.45, "step": 5088 }, { "epoch": 16.946540880503143, "grad_norm": 0.05656523257493973, "learning_rate": 6.21599452108117e-05, "loss": 0.0138, "step": 5389 }, { "epoch": 17.0, "eval_accuracy": 0.9332258064516129, "eval_loss": 0.028866084292531013, "eval_runtime": 2.6651, "eval_samples_per_second": 1163.197, "eval_steps_per_second": 24.39, "step": 5406 }, { "epoch": 17.943396226415093, "grad_norm": 0.047965776175260544, "learning_rate": 4.1866739616756805e-05, "loss": 0.0132, "step": 5706 }, { "epoch": 18.0, "eval_accuracy": 0.9329032258064516, "eval_loss": 0.02866896614432335, "eval_runtime": 2.6577, "eval_samples_per_second": 1166.402, "eval_steps_per_second": 24.457, "step": 5724 }, { "epoch": 18.940251572327043, "grad_norm": 0.04392002522945404, "learning_rate": 2.15735340227019e-05, "loss": 0.0128, "step": 6023 }, { "epoch": 19.0, "eval_accuracy": 0.9338709677419355, "eval_loss": 0.02806813083589077, "eval_runtime": 2.6852, "eval_samples_per_second": 1154.456, "eval_steps_per_second": 24.206, "step": 6042 }, { "epoch": 19.937106918238992, "grad_norm": 0.043970681726932526, "learning_rate": 1.280328428646997e-06, "loss": 0.0124, "step": 6340 } ], "logging_steps": 317, "max_steps": 6360, "num_input_tokens_seen": 0, "num_train_epochs": 20, "save_steps": 1000000000.0, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1648461417340164.0, "train_batch_size": 48, "trial_name": null, "trial_params": { "alpha": 0.040278370777020456, "fp16": true, "learning_rate": 0.0004035595207095335, "lr_scheduler": "linear", "num_train_epochs": 20, "temperature": 19, "warmup_steps": 56, "weight_decay": 0.14418853843556578 } }