{
  "best_metric": 0.14178870618343353,
  "best_model_checkpoint": "t5/checkpoint-63492",
  "epoch": 22.0,
  "eval_steps": 500,
  "global_step": 107448,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 1.0,
      "grad_norm": 0.4002791941165924,
      "learning_rate": 0.00019750000000000003,
      "loss": 0.3509,
      "step": 4884
    },
    {
      "epoch": 1.0,
      "eval_accuracy": 0.05755138360609973,
      "eval_loss": 0.21836575865745544,
      "eval_runtime": 1442.1861,
      "eval_samples_per_second": 79.482,
      "eval_steps_per_second": 0.311,
      "step": 4884
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.3467308580875397,
      "learning_rate": 0.000195,
      "loss": 0.2125,
      "step": 9768
    },
    {
      "epoch": 2.0,
      "eval_accuracy": 0.077276058205674,
      "eval_loss": 0.18824860453605652,
      "eval_runtime": 1440.8058,
      "eval_samples_per_second": 79.558,
      "eval_steps_per_second": 0.311,
      "step": 9768
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.3095360994338989,
      "learning_rate": 0.00019250000000000002,
      "loss": 0.1842,
      "step": 14652
    },
    {
      "epoch": 3.0,
      "eval_accuracy": 0.09321457235579439,
      "eval_loss": 0.17374937236309052,
      "eval_runtime": 1440.3978,
      "eval_samples_per_second": 79.581,
      "eval_steps_per_second": 0.311,
      "step": 14652
    },
    {
      "epoch": 4.0,
      "grad_norm": 0.26522552967071533,
      "learning_rate": 0.00019,
      "loss": 0.1669,
      "step": 19536
    },
    {
      "epoch": 4.0,
      "eval_accuracy": 0.10587291063265519,
      "eval_loss": 0.16474950313568115,
      "eval_runtime": 1439.905,
      "eval_samples_per_second": 79.608,
      "eval_steps_per_second": 0.311,
      "step": 19536
    },
    {
      "epoch": 5.0,
      "grad_norm": 0.22116659581661224,
      "learning_rate": 0.0001875,
      "loss": 0.1544,
      "step": 24420
    },
    {
      "epoch": 5.0,
      "eval_accuracy": 0.11401228321178071,
      "eval_loss": 0.15819723904132843,
      "eval_runtime": 1441.587,
      "eval_samples_per_second": 79.515,
      "eval_steps_per_second": 0.311,
      "step": 24420
    },
    {
      "epoch": 6.0,
      "grad_norm": 0.2789860963821411,
      "learning_rate": 0.00018500000000000002,
      "loss": 0.1444,
      "step": 29304
    },
    {
      "epoch": 6.0,
      "eval_accuracy": 0.12168056670272534,
      "eval_loss": 0.15368323028087616,
      "eval_runtime": 1440.4412,
      "eval_samples_per_second": 79.578,
      "eval_steps_per_second": 0.311,
      "step": 29304
    },
    {
      "epoch": 7.0,
      "grad_norm": 0.24488751590251923,
      "learning_rate": 0.0001825,
      "loss": 0.1359,
      "step": 34188
    },
    {
      "epoch": 7.0,
      "eval_accuracy": 0.12852880622535506,
      "eval_loss": 0.15017201006412506,
      "eval_runtime": 1439.8603,
      "eval_samples_per_second": 79.611,
      "eval_steps_per_second": 0.311,
      "step": 34188
    },
    {
      "epoch": 8.0,
      "grad_norm": 0.2019621729850769,
      "learning_rate": 0.00018,
      "loss": 0.1284,
      "step": 39072
    },
    {
      "epoch": 8.0,
      "eval_accuracy": 0.1320968698747252,
      "eval_loss": 0.1474558413028717,
      "eval_runtime": 1440.5681,
      "eval_samples_per_second": 79.571,
      "eval_steps_per_second": 0.311,
      "step": 39072
    },
    {
      "epoch": 9.0,
      "grad_norm": 0.24134604632854462,
      "learning_rate": 0.0001775,
      "loss": 0.1218,
      "step": 43956
    },
    {
      "epoch": 9.0,
      "eval_accuracy": 0.13765397634085913,
      "eval_loss": 0.14563630521297455,
      "eval_runtime": 1439.521,
      "eval_samples_per_second": 79.629,
      "eval_steps_per_second": 0.311,
      "step": 43956
    },
    {
      "epoch": 10.0,
      "grad_norm": 0.21248851716518402,
      "learning_rate": 0.000175,
      "loss": 0.1156,
      "step": 48840
    },
    {
      "epoch": 10.0,
      "eval_accuracy": 0.14216421816659106,
      "eval_loss": 0.14398989081382751,
      "eval_runtime": 1439.8066,
      "eval_samples_per_second": 79.613,
      "eval_steps_per_second": 0.311,
      "step": 48840
    },
    {
      "epoch": 11.0,
      "grad_norm": 0.18349188566207886,
      "learning_rate": 0.00017250000000000002,
      "loss": 0.11,
      "step": 53724
    },
    {
      "epoch": 11.0,
      "eval_accuracy": 0.14525246885577695,
      "eval_loss": 0.14398634433746338,
      "eval_runtime": 1441.3594,
      "eval_samples_per_second": 79.528,
      "eval_steps_per_second": 0.311,
      "step": 53724
    },
    {
      "epoch": 12.0,
      "grad_norm": 0.19695980846881866,
      "learning_rate": 0.00017,
      "loss": 0.1049,
      "step": 58608
    },
    {
      "epoch": 12.0,
      "eval_accuracy": 0.1470757581044771,
      "eval_loss": 0.14296095073223114,
      "eval_runtime": 1438.3626,
      "eval_samples_per_second": 79.693,
      "eval_steps_per_second": 0.311,
      "step": 58608
    },
    {
      "epoch": 13.0,
      "grad_norm": 0.19672971963882446,
      "learning_rate": 0.0001675,
      "loss": 0.1001,
      "step": 63492
    },
    {
      "epoch": 13.0,
      "eval_accuracy": 0.14978888229751894,
      "eval_loss": 0.14178870618343353,
      "eval_runtime": 1440.3056,
      "eval_samples_per_second": 79.586,
      "eval_steps_per_second": 0.311,
      "step": 63492
    },
    {
      "epoch": 14.0,
      "grad_norm": 0.192398801445961,
      "learning_rate": 0.000165,
      "loss": 0.0956,
      "step": 68376
    },
    {
      "epoch": 14.0,
      "eval_accuracy": 0.15094915727396446,
      "eval_loss": 0.1433423012495041,
      "eval_runtime": 1438.7215,
      "eval_samples_per_second": 79.674,
      "eval_steps_per_second": 0.311,
      "step": 68376
    },
    {
      "epoch": 15.0,
      "grad_norm": 0.17484577000141144,
      "learning_rate": 0.00016250000000000002,
      "loss": 0.0914,
      "step": 73260
    },
    {
      "epoch": 15.0,
      "eval_accuracy": 0.15243221551453398,
      "eval_loss": 0.14287354052066803,
      "eval_runtime": 1440.6167,
      "eval_samples_per_second": 79.569,
      "eval_steps_per_second": 0.311,
      "step": 73260
    },
    {
      "epoch": 16.0,
      "grad_norm": 0.16681896150112152,
      "learning_rate": 0.00016,
      "loss": 0.0874,
      "step": 78144
    },
    {
      "epoch": 16.0,
      "eval_accuracy": 0.15390654988310012,
      "eval_loss": 0.14442311227321625,
      "eval_runtime": 1440.0503,
      "eval_samples_per_second": 79.6,
      "eval_steps_per_second": 0.311,
      "step": 78144
    },
    {
      "epoch": 17.0,
      "grad_norm": 0.19815337657928467,
      "learning_rate": 0.0001575,
      "loss": 0.0837,
      "step": 83028
    },
    {
      "epoch": 17.0,
      "eval_accuracy": 0.1557036675157902,
      "eval_loss": 0.14355292916297913,
      "eval_runtime": 1439.7841,
      "eval_samples_per_second": 79.615,
      "eval_steps_per_second": 0.311,
      "step": 83028
    },
    {
      "epoch": 18.0,
      "grad_norm": 0.17696212232112885,
      "learning_rate": 0.000155,
      "loss": 0.0802,
      "step": 87912
    },
    {
      "epoch": 18.0,
      "eval_accuracy": 0.15677670377220226,
      "eval_loss": 0.1449592113494873,
      "eval_runtime": 1439.963,
      "eval_samples_per_second": 79.605,
      "eval_steps_per_second": 0.311,
      "step": 87912
    },
    {
      "epoch": 19.0,
      "grad_norm": 0.1683954894542694,
      "learning_rate": 0.0001525,
      "loss": 0.0769,
      "step": 92796
    },
    {
      "epoch": 19.0,
      "eval_accuracy": 0.15593921205988065,
      "eval_loss": 0.14668723940849304,
      "eval_runtime": 1441.0988,
      "eval_samples_per_second": 79.542,
      "eval_steps_per_second": 0.311,
      "step": 92796
    },
    {
      "epoch": 20.0,
      "grad_norm": 0.20112481713294983,
      "learning_rate": 0.00015000000000000001,
      "loss": 0.0738,
      "step": 97680
    },
    {
      "epoch": 20.0,
      "eval_accuracy": 0.15597410754789406,
      "eval_loss": 0.1498769223690033,
      "eval_runtime": 1441.1799,
      "eval_samples_per_second": 79.538,
      "eval_steps_per_second": 0.311,
      "step": 97680
    },
    {
      "epoch": 21.0,
      "grad_norm": 0.16424116492271423,
      "learning_rate": 0.0001475,
      "loss": 0.0709,
      "step": 102564
    },
    {
      "epoch": 21.0,
      "eval_accuracy": 0.1572041735003664,
      "eval_loss": 0.14935219287872314,
      "eval_runtime": 1439.3479,
      "eval_samples_per_second": 79.639,
      "eval_steps_per_second": 0.311,
      "step": 102564
    },
    {
      "epoch": 22.0,
      "grad_norm": 0.16410428285598755,
      "learning_rate": 0.000145,
      "loss": 0.068,
      "step": 107448
    },
    {
      "epoch": 22.0,
      "eval_accuracy": 0.157518232892487,
      "eval_loss": 0.15158186852931976,
      "eval_runtime": 1441.5687,
      "eval_samples_per_second": 79.516,
      "eval_steps_per_second": 0.311,
      "step": 107448
    }
  ],
  "logging_steps": 500,
  "max_steps": 390720,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 80,
  "save_steps": 500,
  "total_flos": 3.224876944629031e+18,
  "train_batch_size": 256,
  "trial_name": null,
  "trial_params": null
}