{ "best_metric": 0.14178870618343353, "best_model_checkpoint": "t5/checkpoint-63492", "epoch": 22.0, "eval_steps": 500, "global_step": 107448, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 1.0, "grad_norm": 0.4002791941165924, "learning_rate": 0.00019750000000000003, "loss": 0.3509, "step": 4884 }, { "epoch": 1.0, "eval_accuracy": 0.05755138360609973, "eval_loss": 0.21836575865745544, "eval_runtime": 1442.1861, "eval_samples_per_second": 79.482, "eval_steps_per_second": 0.311, "step": 4884 }, { "epoch": 2.0, "grad_norm": 0.3467308580875397, "learning_rate": 0.000195, "loss": 0.2125, "step": 9768 }, { "epoch": 2.0, "eval_accuracy": 0.077276058205674, "eval_loss": 0.18824860453605652, "eval_runtime": 1440.8058, "eval_samples_per_second": 79.558, "eval_steps_per_second": 0.311, "step": 9768 }, { "epoch": 3.0, "grad_norm": 0.3095360994338989, "learning_rate": 0.00019250000000000002, "loss": 0.1842, "step": 14652 }, { "epoch": 3.0, "eval_accuracy": 0.09321457235579439, "eval_loss": 0.17374937236309052, "eval_runtime": 1440.3978, "eval_samples_per_second": 79.581, "eval_steps_per_second": 0.311, "step": 14652 }, { "epoch": 4.0, "grad_norm": 0.26522552967071533, "learning_rate": 0.00019, "loss": 0.1669, "step": 19536 }, { "epoch": 4.0, "eval_accuracy": 0.10587291063265519, "eval_loss": 0.16474950313568115, "eval_runtime": 1439.905, "eval_samples_per_second": 79.608, "eval_steps_per_second": 0.311, "step": 19536 }, { "epoch": 5.0, "grad_norm": 0.22116659581661224, "learning_rate": 0.0001875, "loss": 0.1544, "step": 24420 }, { "epoch": 5.0, "eval_accuracy": 0.11401228321178071, "eval_loss": 0.15819723904132843, "eval_runtime": 1441.587, "eval_samples_per_second": 79.515, "eval_steps_per_second": 0.311, "step": 24420 }, { "epoch": 6.0, "grad_norm": 0.2789860963821411, "learning_rate": 0.00018500000000000002, "loss": 0.1444, "step": 29304 }, { "epoch": 6.0, "eval_accuracy": 0.12168056670272534, "eval_loss": 0.15368323028087616, "eval_runtime": 1440.4412, "eval_samples_per_second": 79.578, "eval_steps_per_second": 0.311, "step": 29304 }, { "epoch": 7.0, "grad_norm": 0.24488751590251923, "learning_rate": 0.0001825, "loss": 0.1359, "step": 34188 }, { "epoch": 7.0, "eval_accuracy": 0.12852880622535506, "eval_loss": 0.15017201006412506, "eval_runtime": 1439.8603, "eval_samples_per_second": 79.611, "eval_steps_per_second": 0.311, "step": 34188 }, { "epoch": 8.0, "grad_norm": 0.2019621729850769, "learning_rate": 0.00018, "loss": 0.1284, "step": 39072 }, { "epoch": 8.0, "eval_accuracy": 0.1320968698747252, "eval_loss": 0.1474558413028717, "eval_runtime": 1440.5681, "eval_samples_per_second": 79.571, "eval_steps_per_second": 0.311, "step": 39072 }, { "epoch": 9.0, "grad_norm": 0.24134604632854462, "learning_rate": 0.0001775, "loss": 0.1218, "step": 43956 }, { "epoch": 9.0, "eval_accuracy": 0.13765397634085913, "eval_loss": 0.14563630521297455, "eval_runtime": 1439.521, "eval_samples_per_second": 79.629, "eval_steps_per_second": 0.311, "step": 43956 }, { "epoch": 10.0, "grad_norm": 0.21248851716518402, "learning_rate": 0.000175, "loss": 0.1156, "step": 48840 }, { "epoch": 10.0, "eval_accuracy": 0.14216421816659106, "eval_loss": 0.14398989081382751, "eval_runtime": 1439.8066, "eval_samples_per_second": 79.613, "eval_steps_per_second": 0.311, "step": 48840 }, { "epoch": 11.0, "grad_norm": 0.18349188566207886, "learning_rate": 0.00017250000000000002, "loss": 0.11, "step": 53724 }, { "epoch": 11.0, "eval_accuracy": 0.14525246885577695, "eval_loss": 0.14398634433746338, "eval_runtime": 1441.3594, "eval_samples_per_second": 79.528, "eval_steps_per_second": 0.311, "step": 53724 }, { "epoch": 12.0, "grad_norm": 0.19695980846881866, "learning_rate": 0.00017, "loss": 0.1049, "step": 58608 }, { "epoch": 12.0, "eval_accuracy": 0.1470757581044771, "eval_loss": 0.14296095073223114, "eval_runtime": 1438.3626, "eval_samples_per_second": 79.693, "eval_steps_per_second": 0.311, "step": 58608 }, { "epoch": 13.0, "grad_norm": 0.19672971963882446, "learning_rate": 0.0001675, "loss": 0.1001, "step": 63492 }, { "epoch": 13.0, "eval_accuracy": 0.14978888229751894, "eval_loss": 0.14178870618343353, "eval_runtime": 1440.3056, "eval_samples_per_second": 79.586, "eval_steps_per_second": 0.311, "step": 63492 }, { "epoch": 14.0, "grad_norm": 0.192398801445961, "learning_rate": 0.000165, "loss": 0.0956, "step": 68376 }, { "epoch": 14.0, "eval_accuracy": 0.15094915727396446, "eval_loss": 0.1433423012495041, "eval_runtime": 1438.7215, "eval_samples_per_second": 79.674, "eval_steps_per_second": 0.311, "step": 68376 }, { "epoch": 15.0, "grad_norm": 0.17484577000141144, "learning_rate": 0.00016250000000000002, "loss": 0.0914, "step": 73260 }, { "epoch": 15.0, "eval_accuracy": 0.15243221551453398, "eval_loss": 0.14287354052066803, "eval_runtime": 1440.6167, "eval_samples_per_second": 79.569, "eval_steps_per_second": 0.311, "step": 73260 }, { "epoch": 16.0, "grad_norm": 0.16681896150112152, "learning_rate": 0.00016, "loss": 0.0874, "step": 78144 }, { "epoch": 16.0, "eval_accuracy": 0.15390654988310012, "eval_loss": 0.14442311227321625, "eval_runtime": 1440.0503, "eval_samples_per_second": 79.6, "eval_steps_per_second": 0.311, "step": 78144 }, { "epoch": 17.0, "grad_norm": 0.19815337657928467, "learning_rate": 0.0001575, "loss": 0.0837, "step": 83028 }, { "epoch": 17.0, "eval_accuracy": 0.1557036675157902, "eval_loss": 0.14355292916297913, "eval_runtime": 1439.7841, "eval_samples_per_second": 79.615, "eval_steps_per_second": 0.311, "step": 83028 }, { "epoch": 18.0, "grad_norm": 0.17696212232112885, "learning_rate": 0.000155, "loss": 0.0802, "step": 87912 }, { "epoch": 18.0, "eval_accuracy": 0.15677670377220226, "eval_loss": 0.1449592113494873, "eval_runtime": 1439.963, "eval_samples_per_second": 79.605, "eval_steps_per_second": 0.311, "step": 87912 }, { "epoch": 19.0, "grad_norm": 0.1683954894542694, "learning_rate": 0.0001525, "loss": 0.0769, "step": 92796 }, { "epoch": 19.0, "eval_accuracy": 0.15593921205988065, "eval_loss": 0.14668723940849304, "eval_runtime": 1441.0988, "eval_samples_per_second": 79.542, "eval_steps_per_second": 0.311, "step": 92796 }, { "epoch": 20.0, "grad_norm": 0.20112481713294983, "learning_rate": 0.00015000000000000001, "loss": 0.0738, "step": 97680 }, { "epoch": 20.0, "eval_accuracy": 0.15597410754789406, "eval_loss": 0.1498769223690033, "eval_runtime": 1441.1799, "eval_samples_per_second": 79.538, "eval_steps_per_second": 0.311, "step": 97680 }, { "epoch": 21.0, "grad_norm": 0.16424116492271423, "learning_rate": 0.0001475, "loss": 0.0709, "step": 102564 }, { "epoch": 21.0, "eval_accuracy": 0.1572041735003664, "eval_loss": 0.14935219287872314, "eval_runtime": 1439.3479, "eval_samples_per_second": 79.639, "eval_steps_per_second": 0.311, "step": 102564 }, { "epoch": 22.0, "grad_norm": 0.16410428285598755, "learning_rate": 0.000145, "loss": 0.068, "step": 107448 }, { "epoch": 22.0, "eval_accuracy": 0.157518232892487, "eval_loss": 0.15158186852931976, "eval_runtime": 1441.5687, "eval_samples_per_second": 79.516, "eval_steps_per_second": 0.311, "step": 107448 } ], "logging_steps": 500, "max_steps": 390720, "num_input_tokens_seen": 0, "num_train_epochs": 80, "save_steps": 500, "total_flos": 3.224876944629031e+18, "train_batch_size": 256, "trial_name": null, "trial_params": null }