|
{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 20.0,
  "eval_steps": 500,
  "global_step": 6360,
  "is_hyper_param_search": true,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.9968553459119497,
      "grad_norm": 0.3776308596134186,
      "learning_rate": 0.00038685123471569015,
      "loss": 0.1977,
      "step": 317
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.8580645161290322,
      "eval_loss": 0.06473647803068161,
      "eval_runtime": 2.6632,
      "eval_samples_per_second": 1164.002,
      "eval_steps_per_second": 24.406,
      "step": 318
    },
    {
      "epoch": 1.9937106918238994,
      "grad_norm": 0.25335493683815,
      "learning_rate": 0.0003665580291216353,
      "loss": 0.0555,
      "step": 634
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.9019354838709678,
      "eval_loss": 0.05631404370069504,
      "eval_runtime": 2.6288,
      "eval_samples_per_second": 1179.25,
      "eval_steps_per_second": 24.726,
      "step": 636
    },
    {
      "epoch": 2.990566037735849,
      "grad_norm": 0.6674047112464905,
      "learning_rate": 0.0003462648235275804,
      "loss": 0.0401,
      "step": 951
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.8874193548387097,
      "eval_loss": 0.05496141314506531,
      "eval_runtime": 2.6441,
      "eval_samples_per_second": 1172.424,
      "eval_steps_per_second": 24.583,
      "step": 954
    },
    {
      "epoch": 3.9874213836477987,
      "grad_norm": 0.38346582651138306,
      "learning_rate": 0.00032597161793352546,
      "loss": 0.0399,
      "step": 1268
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.8932258064516129,
      "eval_loss": 0.0530259795486927,
      "eval_runtime": 2.6688,
      "eval_samples_per_second": 1161.578,
      "eval_steps_per_second": 24.356,
      "step": 1272
    },
    {
      "epoch": 4.984276729559748,
      "grad_norm": 0.10067661851644516,
      "learning_rate": 0.00030567841233947055,
      "loss": 0.0336,
      "step": 1585
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.9161290322580645,
      "eval_loss": 0.04296305030584335,
      "eval_runtime": 2.6372,
      "eval_samples_per_second": 1175.495,
      "eval_steps_per_second": 24.647,
      "step": 1590
    },
    {
      "epoch": 5.981132075471698,
      "grad_norm": 0.22013570368289948,
      "learning_rate": 0.0002853852067454157,
      "loss": 0.0287,
      "step": 1902
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.9116129032258065,
      "eval_loss": 0.048792969435453415,
      "eval_runtime": 2.649,
      "eval_samples_per_second": 1170.263,
      "eval_steps_per_second": 24.538,
      "step": 1908
    },
    {
      "epoch": 6.977987421383648,
      "grad_norm": 0.08923449367284775,
      "learning_rate": 0.0002650920011513608,
      "loss": 0.0255,
      "step": 2219
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.912258064516129,
      "eval_loss": 0.044845063239336014,
      "eval_runtime": 2.6493,
      "eval_samples_per_second": 1170.106,
      "eval_steps_per_second": 24.534,
      "step": 2226
    },
    {
      "epoch": 7.9748427672955975,
      "grad_norm": 0.16984495520591736,
      "learning_rate": 0.00024479879555730586,
      "loss": 0.0245,
      "step": 2536
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.917741935483871,
      "eval_loss": 0.04449814185500145,
      "eval_runtime": 2.6634,
      "eval_samples_per_second": 1163.909,
      "eval_steps_per_second": 24.405,
      "step": 2544
    },
    {
      "epoch": 8.971698113207546,
      "grad_norm": 0.5007947087287903,
      "learning_rate": 0.00022450558996325092,
      "loss": 0.0244,
      "step": 2853
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.9183870967741935,
      "eval_loss": 0.04302350431680679,
      "eval_runtime": 2.643,
      "eval_samples_per_second": 1172.907,
      "eval_steps_per_second": 24.593,
      "step": 2862
    },
    {
      "epoch": 9.968553459119496,
      "grad_norm": 0.06756994128227234,
      "learning_rate": 0.00020421238436919606,
      "loss": 0.0219,
      "step": 3170
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.9238709677419354,
      "eval_loss": 0.03625302389264107,
      "eval_runtime": 2.6402,
      "eval_samples_per_second": 1174.161,
      "eval_steps_per_second": 24.62,
      "step": 3180
    },
    {
      "epoch": 10.965408805031446,
      "grad_norm": 0.7381525635719299,
      "learning_rate": 0.00018391917877514115,
      "loss": 0.0199,
      "step": 3487
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.9287096774193548,
      "eval_loss": 0.03676296025514603,
      "eval_runtime": 2.6489,
      "eval_samples_per_second": 1170.314,
      "eval_steps_per_second": 24.539,
      "step": 3498
    },
    {
      "epoch": 11.962264150943396,
      "grad_norm": 0.051013097167015076,
      "learning_rate": 0.00016362597318108623,
      "loss": 0.0184,
      "step": 3804
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.9264516129032258,
      "eval_loss": 0.03529633954167366,
      "eval_runtime": 2.6771,
      "eval_samples_per_second": 1157.95,
      "eval_steps_per_second": 24.28,
      "step": 3816
    },
    {
      "epoch": 12.959119496855346,
      "grad_norm": 0.07709548622369766,
      "learning_rate": 0.00014333276758703132,
      "loss": 0.0171,
      "step": 4121
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.9306451612903226,
      "eval_loss": 0.03273295238614082,
      "eval_runtime": 2.6553,
      "eval_samples_per_second": 1167.463,
      "eval_steps_per_second": 24.479,
      "step": 4134
    },
    {
      "epoch": 13.955974842767295,
      "grad_norm": 0.13161151111125946,
      "learning_rate": 0.0001230395619929764,
      "loss": 0.016,
      "step": 4438
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.9309677419354838,
      "eval_loss": 0.03127666935324669,
      "eval_runtime": 2.6519,
      "eval_samples_per_second": 1168.993,
      "eval_steps_per_second": 24.511,
      "step": 4452
    },
    {
      "epoch": 14.952830188679245,
      "grad_norm": 0.05098772048950195,
      "learning_rate": 0.00010274635639892152,
      "loss": 0.0151,
      "step": 4755
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.9348387096774193,
      "eval_loss": 0.030557256191968918,
      "eval_runtime": 2.6688,
      "eval_samples_per_second": 1161.571,
      "eval_steps_per_second": 24.356,
      "step": 4770
    },
    {
      "epoch": 15.949685534591195,
      "grad_norm": 0.04677645117044449,
      "learning_rate": 8.245315080486662e-05,
      "loss": 0.0143,
      "step": 5072
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.9338709677419355,
      "eval_loss": 0.03013915941119194,
      "eval_runtime": 2.6585,
      "eval_samples_per_second": 1166.065,
      "eval_steps_per_second": 24.45,
      "step": 5088
    },
    {
      "epoch": 16.946540880503143,
      "grad_norm": 0.05656523257493973,
      "learning_rate": 6.21599452108117e-05,
      "loss": 0.0138,
      "step": 5389
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.9332258064516129,
      "eval_loss": 0.028866084292531013,
      "eval_runtime": 2.6651,
      "eval_samples_per_second": 1163.197,
      "eval_steps_per_second": 24.39,
      "step": 5406
    },
    {
      "epoch": 17.943396226415093,
      "grad_norm": 0.047965776175260544,
      "learning_rate": 4.1866739616756805e-05,
      "loss": 0.0132,
      "step": 5706
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.9329032258064516,
      "eval_loss": 0.02866896614432335,
      "eval_runtime": 2.6577,
      "eval_samples_per_second": 1166.402,
      "eval_steps_per_second": 24.457,
      "step": 5724
    },
    {
      "epoch": 18.940251572327043,
      "grad_norm": 0.04392002522945404,
      "learning_rate": 2.15735340227019e-05,
      "loss": 0.0128,
      "step": 6023
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.9338709677419355,
      "eval_loss": 0.02806813083589077,
      "eval_runtime": 2.6852,
      "eval_samples_per_second": 1154.456,
      "eval_steps_per_second": 24.206,
      "step": 6042
    },
    {
      "epoch": 19.937106918238992,
      "grad_norm": 0.043970681726932526,
      "learning_rate": 1.280328428646997e-06,
      "loss": 0.0124,
      "step": 6340
    }
  ],
  "logging_steps": 317,
  "max_steps": 6360,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 20,
  "save_steps": 1000000000.0,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 1648461417340164.0,
  "train_batch_size": 48,
  "trial_name": null,
  "trial_params": {
    "alpha": 0.040278370777020456,
    "fp16": true,
    "learning_rate": 0.0004035595207095335,
    "lr_scheduler": "linear",
    "num_train_epochs": 20,
    "temperature": 19,
    "warmup_steps": 56,
    "weight_decay": 0.14418853843556578
  }
}
|
|