|
{ |
|
"best_metric": 4.805818557739258, |
|
"best_model_checkpoint": "/users/hr1171/scratch/T5LAA/checkpoint-200000", |
|
"epoch": 4.10692, |
|
"eval_steps": 1000, |
|
"global_step": 200000, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0025, |
|
"grad_norm": 0.42752909660339355, |
|
"learning_rate": 4.9875000000000006e-05, |
|
"loss": 9.2783, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"grad_norm": 0.6895659565925598, |
|
"learning_rate": 4.975e-05, |
|
"loss": 8.7605, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.005, |
|
"eval_accuracy": 0.033912700987924906, |
|
"eval_loss": 8.507349967956543, |
|
"eval_runtime": 91.5333, |
|
"eval_samples_per_second": 39.253, |
|
"eval_steps_per_second": 2.458, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.0075, |
|
"grad_norm": 1.0673288106918335, |
|
"learning_rate": 4.962500000000001e-05, |
|
"loss": 8.3738, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 0.6839419603347778, |
|
"learning_rate": 4.9500000000000004e-05, |
|
"loss": 8.0954, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"eval_accuracy": 0.03104558957478346, |
|
"eval_loss": 8.017916679382324, |
|
"eval_runtime": 91.927, |
|
"eval_samples_per_second": 39.085, |
|
"eval_steps_per_second": 2.448, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.0125, |
|
"grad_norm": 0.950947105884552, |
|
"learning_rate": 4.937525e-05, |
|
"loss": 7.8869, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"grad_norm": 1.0953532457351685, |
|
"learning_rate": 4.925025e-05, |
|
"loss": 7.7188, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.015, |
|
"eval_accuracy": 0.03081947497059626, |
|
"eval_loss": 7.6839141845703125, |
|
"eval_runtime": 90.9808, |
|
"eval_samples_per_second": 39.492, |
|
"eval_steps_per_second": 2.473, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.0175, |
|
"grad_norm": 0.5422408580780029, |
|
"learning_rate": 4.912525e-05, |
|
"loss": 7.5854, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 3.3019630908966064, |
|
"learning_rate": 4.900025e-05, |
|
"loss": 7.4459, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_accuracy": 0.032928381396291694, |
|
"eval_loss": 7.432972431182861, |
|
"eval_runtime": 92.5405, |
|
"eval_samples_per_second": 38.826, |
|
"eval_steps_per_second": 2.431, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.0225, |
|
"grad_norm": 16.127534866333008, |
|
"learning_rate": 4.88755e-05, |
|
"loss": 7.3365, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"grad_norm": 1.8640549182891846, |
|
"learning_rate": 4.875050000000001e-05, |
|
"loss": 7.2526, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.025, |
|
"eval_accuracy": 0.032260649460460894, |
|
"eval_loss": 7.2563862800598145, |
|
"eval_runtime": 91.289, |
|
"eval_samples_per_second": 39.359, |
|
"eval_steps_per_second": 2.465, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.0275, |
|
"grad_norm": 1.704673171043396, |
|
"learning_rate": 4.862575e-05, |
|
"loss": 7.1831, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 2.844064235687256, |
|
"learning_rate": 4.850075e-05, |
|
"loss": 7.1018, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"eval_accuracy": 0.033544686544888576, |
|
"eval_loss": 7.128678321838379, |
|
"eval_runtime": 94.2173, |
|
"eval_samples_per_second": 38.135, |
|
"eval_steps_per_second": 2.388, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.0325, |
|
"grad_norm": 1.2851128578186035, |
|
"learning_rate": 4.837575e-05, |
|
"loss": 7.0525, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"grad_norm": 0.74603670835495, |
|
"learning_rate": 4.825075e-05, |
|
"loss": 7.014, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.035, |
|
"eval_accuracy": 0.03406153936155956, |
|
"eval_loss": 7.0242919921875, |
|
"eval_runtime": 91.5738, |
|
"eval_samples_per_second": 39.236, |
|
"eval_steps_per_second": 2.457, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.0375, |
|
"grad_norm": 0.937613308429718, |
|
"learning_rate": 4.812575e-05, |
|
"loss": 6.9613, |
|
"step": 7500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 2.0788276195526123, |
|
"learning_rate": 4.800075e-05, |
|
"loss": 6.9585, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"eval_accuracy": 0.03162039956436886, |
|
"eval_loss": 6.953730583190918, |
|
"eval_runtime": 98.804, |
|
"eval_samples_per_second": 36.365, |
|
"eval_steps_per_second": 2.277, |
|
"step": 8000 |
|
}, |
|
{ |
|
"epoch": 0.0425, |
|
"grad_norm": 0.7946870923042297, |
|
"learning_rate": 4.787575e-05, |
|
"loss": 6.9196, |
|
"step": 8500 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"grad_norm": 1.7380784749984741, |
|
"learning_rate": 4.775075e-05, |
|
"loss": 6.9082, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.045, |
|
"eval_accuracy": 0.032857907650823, |
|
"eval_loss": 6.873142242431641, |
|
"eval_runtime": 100.6418, |
|
"eval_samples_per_second": 35.701, |
|
"eval_steps_per_second": 2.236, |
|
"step": 9000 |
|
}, |
|
{ |
|
"epoch": 0.0475, |
|
"grad_norm": 1.8795133829116821, |
|
"learning_rate": 4.7625750000000004e-05, |
|
"loss": 6.8796, |
|
"step": 9500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 1.129756212234497, |
|
"learning_rate": 4.7501e-05, |
|
"loss": 6.8857, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"eval_accuracy": 0.032590025788220955, |
|
"eval_loss": 6.822448253631592, |
|
"eval_runtime": 96.3312, |
|
"eval_samples_per_second": 37.298, |
|
"eval_steps_per_second": 2.336, |
|
"step": 10000 |
|
}, |
|
{ |
|
"epoch": 0.0525, |
|
"grad_norm": 0.6531652212142944, |
|
"learning_rate": 4.7376e-05, |
|
"loss": 6.8436, |
|
"step": 10500 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"grad_norm": 1.2178057432174683, |
|
"learning_rate": 4.7251e-05, |
|
"loss": 6.8166, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.055, |
|
"eval_accuracy": 0.032403501647221764, |
|
"eval_loss": 6.821002006530762, |
|
"eval_runtime": 95.9352, |
|
"eval_samples_per_second": 37.452, |
|
"eval_steps_per_second": 2.345, |
|
"step": 11000 |
|
}, |
|
{ |
|
"epoch": 0.0575, |
|
"grad_norm": 1.195275902748108, |
|
"learning_rate": 4.7126e-05, |
|
"loss": 6.8214, |
|
"step": 11500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 1.0131785869598389, |
|
"learning_rate": 4.700125e-05, |
|
"loss": 6.8225, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"eval_accuracy": 0.033441832970420755, |
|
"eval_loss": 6.764980316162109, |
|
"eval_runtime": 94.995, |
|
"eval_samples_per_second": 37.823, |
|
"eval_steps_per_second": 2.369, |
|
"step": 12000 |
|
}, |
|
{ |
|
"epoch": 0.0625, |
|
"grad_norm": 2.522235870361328, |
|
"learning_rate": 4.687625000000001e-05, |
|
"loss": 6.7924, |
|
"step": 12500 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"grad_norm": 0.894263505935669, |
|
"learning_rate": 4.6751250000000004e-05, |
|
"loss": 6.791, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.065, |
|
"eval_accuracy": 0.03224772473880158, |
|
"eval_loss": 6.734119415283203, |
|
"eval_runtime": 117.5831, |
|
"eval_samples_per_second": 30.557, |
|
"eval_steps_per_second": 1.914, |
|
"step": 13000 |
|
}, |
|
{ |
|
"epoch": 0.0675, |
|
"grad_norm": 2.9750237464904785, |
|
"learning_rate": 4.662625000000001e-05, |
|
"loss": 6.7703, |
|
"step": 13500 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 1.03080153465271, |
|
"learning_rate": 4.6501250000000005e-05, |
|
"loss": 6.7786, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"eval_accuracy": 0.03289967490923784, |
|
"eval_loss": 6.726984024047852, |
|
"eval_runtime": 103.0616, |
|
"eval_samples_per_second": 34.863, |
|
"eval_steps_per_second": 2.183, |
|
"step": 14000 |
|
}, |
|
{ |
|
"epoch": 0.0725, |
|
"grad_norm": 2.6716787815093994, |
|
"learning_rate": 4.63765e-05, |
|
"loss": 6.7565, |
|
"step": 14500 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"grad_norm": 0.8023720383644104, |
|
"learning_rate": 4.62515e-05, |
|
"loss": 6.7516, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.075, |
|
"eval_accuracy": 0.03358686195240845, |
|
"eval_loss": 6.673779010772705, |
|
"eval_runtime": 97.0967, |
|
"eval_samples_per_second": 37.004, |
|
"eval_steps_per_second": 2.317, |
|
"step": 15000 |
|
}, |
|
{ |
|
"epoch": 0.0775, |
|
"grad_norm": 0.9706649780273438, |
|
"learning_rate": 4.61265e-05, |
|
"loss": 6.7337, |
|
"step": 15500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 2.0966403484344482, |
|
"learning_rate": 4.60015e-05, |
|
"loss": 6.7343, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"eval_accuracy": 0.03371406842347646, |
|
"eval_loss": 6.695716857910156, |
|
"eval_runtime": 95.0805, |
|
"eval_samples_per_second": 37.789, |
|
"eval_steps_per_second": 2.366, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.0825, |
|
"grad_norm": 0.998198390007019, |
|
"learning_rate": 4.58765e-05, |
|
"loss": 6.7044, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"grad_norm": 0.5809659957885742, |
|
"learning_rate": 4.5751500000000004e-05, |
|
"loss": 6.7027, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.085, |
|
"eval_accuracy": 0.03325081918926622, |
|
"eval_loss": 6.647298336029053, |
|
"eval_runtime": 111.1392, |
|
"eval_samples_per_second": 32.329, |
|
"eval_steps_per_second": 2.024, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.0875, |
|
"grad_norm": 1.1927660703659058, |
|
"learning_rate": 4.56265e-05, |
|
"loss": 6.7058, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 1.1725599765777588, |
|
"learning_rate": 4.5501500000000005e-05, |
|
"loss": 6.6741, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"eval_accuracy": 0.034510367327392044, |
|
"eval_loss": 6.625380039215088, |
|
"eval_runtime": 91.7433, |
|
"eval_samples_per_second": 39.164, |
|
"eval_steps_per_second": 2.452, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.0925, |
|
"grad_norm": 1.0063527822494507, |
|
"learning_rate": 4.537675e-05, |
|
"loss": 6.7041, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"grad_norm": 2.5089831352233887, |
|
"learning_rate": 4.525175e-05, |
|
"loss": 6.6426, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.095, |
|
"eval_accuracy": 0.03386848483487988, |
|
"eval_loss": 6.642553806304932, |
|
"eval_runtime": 95.8961, |
|
"eval_samples_per_second": 37.468, |
|
"eval_steps_per_second": 2.346, |
|
"step": 19000 |
|
}, |
|
{ |
|
"epoch": 0.0975, |
|
"grad_norm": 1.1984361410140991, |
|
"learning_rate": 4.512675e-05, |
|
"loss": 6.6729, |
|
"step": 19500 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 1.813727855682373, |
|
"learning_rate": 4.500175e-05, |
|
"loss": 6.6475, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"eval_accuracy": 0.03304606438824231, |
|
"eval_loss": 6.6046462059021, |
|
"eval_runtime": 95.7736, |
|
"eval_samples_per_second": 37.516, |
|
"eval_steps_per_second": 2.349, |
|
"step": 20000 |
|
}, |
|
{ |
|
"epoch": 0.1025, |
|
"grad_norm": 1.4854774475097656, |
|
"learning_rate": 4.4877000000000004e-05, |
|
"loss": 6.6445, |
|
"step": 20500 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"grad_norm": 1.47989022731781, |
|
"learning_rate": 4.4752e-05, |
|
"loss": 6.6649, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.105, |
|
"eval_accuracy": 0.034226567649693784, |
|
"eval_loss": 6.570390224456787, |
|
"eval_runtime": 99.1817, |
|
"eval_samples_per_second": 36.226, |
|
"eval_steps_per_second": 2.269, |
|
"step": 21000 |
|
}, |
|
{ |
|
"epoch": 0.1075, |
|
"grad_norm": 1.1667526960372925, |
|
"learning_rate": 4.4627000000000005e-05, |
|
"loss": 6.6509, |
|
"step": 21500 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"grad_norm": 1.3919010162353516, |
|
"learning_rate": 4.4502e-05, |
|
"loss": 6.619, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.11, |
|
"eval_accuracy": 0.032384726788390335, |
|
"eval_loss": 6.571103096008301, |
|
"eval_runtime": 95.7384, |
|
"eval_samples_per_second": 37.529, |
|
"eval_steps_per_second": 2.35, |
|
"step": 22000 |
|
}, |
|
{ |
|
"epoch": 0.1125, |
|
"grad_norm": 2.1141815185546875, |
|
"learning_rate": 4.437725e-05, |
|
"loss": 6.6248, |
|
"step": 22500 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"grad_norm": 2.2613210678100586, |
|
"learning_rate": 4.425225e-05, |
|
"loss": 6.6216, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.115, |
|
"eval_accuracy": 0.03199086290203537, |
|
"eval_loss": 6.581330299377441, |
|
"eval_runtime": 91.9194, |
|
"eval_samples_per_second": 39.089, |
|
"eval_steps_per_second": 2.448, |
|
"step": 23000 |
|
}, |
|
{ |
|
"epoch": 0.1175, |
|
"grad_norm": 1.3900034427642822, |
|
"learning_rate": 4.4127250000000003e-05, |
|
"loss": 6.5989, |
|
"step": 23500 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"grad_norm": 1.3075273036956787, |
|
"learning_rate": 4.400225e-05, |
|
"loss": 6.5812, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.12, |
|
"eval_accuracy": 0.033095314380249395, |
|
"eval_loss": 6.547011375427246, |
|
"eval_runtime": 94.3249, |
|
"eval_samples_per_second": 38.092, |
|
"eval_steps_per_second": 2.385, |
|
"step": 24000 |
|
}, |
|
{ |
|
"epoch": 0.1225, |
|
"grad_norm": 1.2836869955062866, |
|
"learning_rate": 4.387750000000001e-05, |
|
"loss": 6.597, |
|
"step": 24500 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"grad_norm": 1.1456223726272583, |
|
"learning_rate": 4.375275e-05, |
|
"loss": 6.5995, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.125, |
|
"eval_accuracy": 0.033780188578491493, |
|
"eval_loss": 6.51839542388916, |
|
"eval_runtime": 93.2456, |
|
"eval_samples_per_second": 38.533, |
|
"eval_steps_per_second": 2.413, |
|
"step": 25000 |
|
}, |
|
{ |
|
"epoch": 0.1275, |
|
"grad_norm": 1.2860591411590576, |
|
"learning_rate": 4.362775e-05, |
|
"loss": 6.5882, |
|
"step": 25500 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"grad_norm": 1.200269103050232, |
|
"learning_rate": 4.350275e-05, |
|
"loss": 6.5891, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.13, |
|
"eval_accuracy": 0.03334578188103678, |
|
"eval_loss": 6.508150100708008, |
|
"eval_runtime": 90.9178, |
|
"eval_samples_per_second": 39.519, |
|
"eval_steps_per_second": 2.475, |
|
"step": 26000 |
|
}, |
|
{ |
|
"epoch": 0.1325, |
|
"grad_norm": 1.4053945541381836, |
|
"learning_rate": 4.337775e-05, |
|
"loss": 6.5754, |
|
"step": 26500 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"grad_norm": 1.9518871307373047, |
|
"learning_rate": 4.3252750000000004e-05, |
|
"loss": 6.5767, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.135, |
|
"eval_accuracy": 0.03279600503655996, |
|
"eval_loss": 6.481350898742676, |
|
"eval_runtime": 93.0531, |
|
"eval_samples_per_second": 38.612, |
|
"eval_steps_per_second": 2.418, |
|
"step": 27000 |
|
}, |
|
{ |
|
"epoch": 0.1375, |
|
"grad_norm": 0.9082052111625671, |
|
"learning_rate": 4.312775e-05, |
|
"loss": 6.5661, |
|
"step": 27500 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"grad_norm": 1.2394309043884277, |
|
"learning_rate": 4.3002750000000004e-05, |
|
"loss": 6.5387, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.14, |
|
"eval_accuracy": 0.03242513354978841, |
|
"eval_loss": 6.50333833694458, |
|
"eval_runtime": 91.851, |
|
"eval_samples_per_second": 39.118, |
|
"eval_steps_per_second": 2.45, |
|
"step": 28000 |
|
}, |
|
{ |
|
"epoch": 0.1425, |
|
"grad_norm": 1.4448879957199097, |
|
"learning_rate": 4.287775e-05, |
|
"loss": 6.541, |
|
"step": 28500 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"grad_norm": 1.214563250541687, |
|
"learning_rate": 4.2752750000000005e-05, |
|
"loss": 6.5427, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.145, |
|
"eval_accuracy": 0.03188379178681556, |
|
"eval_loss": 6.480025291442871, |
|
"eval_runtime": 103.8976, |
|
"eval_samples_per_second": 34.582, |
|
"eval_steps_per_second": 2.166, |
|
"step": 29000 |
|
}, |
|
{ |
|
"epoch": 0.1475, |
|
"grad_norm": 0.9117876291275024, |
|
"learning_rate": 4.2628e-05, |
|
"loss": 6.5505, |
|
"step": 29500 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"grad_norm": 1.4780124425888062, |
|
"learning_rate": 4.2503e-05, |
|
"loss": 6.5139, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.15, |
|
"eval_accuracy": 0.03140081534586215, |
|
"eval_loss": 6.477231025695801, |
|
"eval_runtime": 97.6245, |
|
"eval_samples_per_second": 36.804, |
|
"eval_steps_per_second": 2.305, |
|
"step": 30000 |
|
}, |
|
{ |
|
"epoch": 0.1525, |
|
"grad_norm": 0.9774089455604553, |
|
"learning_rate": 4.2378e-05, |
|
"loss": 6.5255, |
|
"step": 30500 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"grad_norm": 1.5147466659545898, |
|
"learning_rate": 4.225325e-05, |
|
"loss": 6.5186, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.155, |
|
"eval_accuracy": 0.03227180553599841, |
|
"eval_loss": 6.446476936340332, |
|
"eval_runtime": 92.5361, |
|
"eval_samples_per_second": 38.828, |
|
"eval_steps_per_second": 2.431, |
|
"step": 31000 |
|
}, |
|
{ |
|
"epoch": 0.1575, |
|
"grad_norm": 1.4620882272720337, |
|
"learning_rate": 4.2128250000000004e-05, |
|
"loss": 6.5019, |
|
"step": 31500 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"grad_norm": 1.446492314338684, |
|
"learning_rate": 4.200325e-05, |
|
"loss": 6.5233, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.16, |
|
"eval_accuracy": 0.03262907205260226, |
|
"eval_loss": 6.422818660736084, |
|
"eval_runtime": 91.1245, |
|
"eval_samples_per_second": 39.43, |
|
"eval_steps_per_second": 2.469, |
|
"step": 32000 |
|
}, |
|
{ |
|
"epoch": 0.1625, |
|
"grad_norm": 1.022921085357666, |
|
"learning_rate": 4.1878250000000005e-05, |
|
"loss": 6.4829, |
|
"step": 32500 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"grad_norm": 1.4783859252929688, |
|
"learning_rate": 4.175325e-05, |
|
"loss": 6.4659, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.165, |
|
"eval_accuracy": 0.031788829095045004, |
|
"eval_loss": 6.436858654022217, |
|
"eval_runtime": 95.3969, |
|
"eval_samples_per_second": 37.664, |
|
"eval_steps_per_second": 2.359, |
|
"step": 33000 |
|
}, |
|
{ |
|
"epoch": 0.1675, |
|
"grad_norm": 3.6760544776916504, |
|
"learning_rate": 4.162825e-05, |
|
"loss": 6.5373, |
|
"step": 33500 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"grad_norm": 2.7687697410583496, |
|
"learning_rate": 4.15035e-05, |
|
"loss": 6.4819, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.17, |
|
"eval_accuracy": 0.033804269375688326, |
|
"eval_loss": 6.397607803344727, |
|
"eval_runtime": 94.8411, |
|
"eval_samples_per_second": 37.884, |
|
"eval_steps_per_second": 2.372, |
|
"step": 34000 |
|
}, |
|
{ |
|
"epoch": 0.1725, |
|
"grad_norm": 1.5380724668502808, |
|
"learning_rate": 4.1378500000000004e-05, |
|
"loss": 6.4747, |
|
"step": 34500 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"grad_norm": 1.8788790702819824, |
|
"learning_rate": 4.12535e-05, |
|
"loss": 6.4735, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.175, |
|
"eval_accuracy": 0.033044159692418834, |
|
"eval_loss": 6.411598205566406, |
|
"eval_runtime": 92.6637, |
|
"eval_samples_per_second": 38.775, |
|
"eval_steps_per_second": 2.428, |
|
"step": 35000 |
|
}, |
|
{ |
|
"epoch": 0.1775, |
|
"grad_norm": 1.3821016550064087, |
|
"learning_rate": 4.1128500000000004e-05, |
|
"loss": 6.4538, |
|
"step": 35500 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"grad_norm": 2.0021932125091553, |
|
"learning_rate": 4.10035e-05, |
|
"loss": 6.4659, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.18, |
|
"eval_accuracy": 0.031257418960294575, |
|
"eval_loss": 6.4191389083862305, |
|
"eval_runtime": 102.9081, |
|
"eval_samples_per_second": 34.915, |
|
"eval_steps_per_second": 2.186, |
|
"step": 36000 |
|
}, |
|
{ |
|
"epoch": 0.1825, |
|
"grad_norm": 0.9058569073677063, |
|
"learning_rate": 4.0878500000000005e-05, |
|
"loss": 6.4517, |
|
"step": 36500 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"grad_norm": 1.922741174697876, |
|
"learning_rate": 4.07535e-05, |
|
"loss": 6.443, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.185, |
|
"eval_accuracy": 0.03225167018015021, |
|
"eval_loss": 6.378974437713623, |
|
"eval_runtime": 105.2589, |
|
"eval_samples_per_second": 34.135, |
|
"eval_steps_per_second": 2.138, |
|
"step": 37000 |
|
}, |
|
{ |
|
"epoch": 0.1875, |
|
"grad_norm": 2.298774480819702, |
|
"learning_rate": 4.0628500000000006e-05, |
|
"loss": 6.4497, |
|
"step": 37500 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"grad_norm": 1.7634875774383545, |
|
"learning_rate": 4.050375e-05, |
|
"loss": 6.448, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.19, |
|
"eval_accuracy": 0.0315727821687819, |
|
"eval_loss": 6.390995979309082, |
|
"eval_runtime": 97.37, |
|
"eval_samples_per_second": 36.9, |
|
"eval_steps_per_second": 2.311, |
|
"step": 38000 |
|
}, |
|
{ |
|
"epoch": 0.1925, |
|
"grad_norm": 1.2913744449615479, |
|
"learning_rate": 4.0378750000000004e-05, |
|
"loss": 6.4289, |
|
"step": 38500 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"grad_norm": 1.5835742950439453, |
|
"learning_rate": 4.025375e-05, |
|
"loss": 6.421, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.195, |
|
"eval_accuracy": 0.0321972502994794, |
|
"eval_loss": 6.371878147125244, |
|
"eval_runtime": 96.1284, |
|
"eval_samples_per_second": 37.377, |
|
"eval_steps_per_second": 2.341, |
|
"step": 39000 |
|
}, |
|
{ |
|
"epoch": 0.1975, |
|
"grad_norm": 1.9730321168899536, |
|
"learning_rate": 4.0128750000000004e-05, |
|
"loss": 6.4357, |
|
"step": 39500 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 1.8616057634353638, |
|
"learning_rate": 4.0004000000000005e-05, |
|
"loss": 6.4127, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"eval_accuracy": 0.03198569301337165, |
|
"eval_loss": 6.374399662017822, |
|
"eval_runtime": 92.9371, |
|
"eval_samples_per_second": 38.661, |
|
"eval_steps_per_second": 2.421, |
|
"step": 40000 |
|
}, |
|
{ |
|
"epoch": 0.2025, |
|
"grad_norm": 1.4953457117080688, |
|
"learning_rate": 3.9879e-05, |
|
"loss": 6.4233, |
|
"step": 40500 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"grad_norm": 1.2381442785263062, |
|
"learning_rate": 3.9754e-05, |
|
"loss": 6.4213, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.205, |
|
"eval_accuracy": 0.03152353217677482, |
|
"eval_loss": 6.381062030792236, |
|
"eval_runtime": 130.2985, |
|
"eval_samples_per_second": 27.575, |
|
"eval_steps_per_second": 1.727, |
|
"step": 41000 |
|
}, |
|
{ |
|
"epoch": 0.2075, |
|
"grad_norm": 1.2648102045059204, |
|
"learning_rate": 3.9628999999999996e-05, |
|
"loss": 6.4195, |
|
"step": 41500 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"grad_norm": 1.322369933128357, |
|
"learning_rate": 3.9504250000000004e-05, |
|
"loss": 6.42, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.21, |
|
"eval_accuracy": 0.031080010149307745, |
|
"eval_loss": 6.351558685302734, |
|
"eval_runtime": 97.1612, |
|
"eval_samples_per_second": 36.98, |
|
"eval_steps_per_second": 2.316, |
|
"step": 42000 |
|
}, |
|
{ |
|
"epoch": 0.2125, |
|
"grad_norm": 1.8557090759277344, |
|
"learning_rate": 3.937925e-05, |
|
"loss": 6.4013, |
|
"step": 42500 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"grad_norm": 2.21213436126709, |
|
"learning_rate": 3.9254250000000005e-05, |
|
"loss": 6.414, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.215, |
|
"eval_accuracy": 0.03100681540980551, |
|
"eval_loss": 6.333887100219727, |
|
"eval_runtime": 96.5646, |
|
"eval_samples_per_second": 37.208, |
|
"eval_steps_per_second": 2.33, |
|
"step": 43000 |
|
}, |
|
{ |
|
"epoch": 0.2175, |
|
"grad_norm": 1.817885160446167, |
|
"learning_rate": 3.912925e-05, |
|
"loss": 6.4259, |
|
"step": 43500 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 1.3436046838760376, |
|
"learning_rate": 3.90045e-05, |
|
"loss": 6.3899, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"eval_accuracy": 0.03152108328214463, |
|
"eval_loss": 6.350229263305664, |
|
"eval_runtime": 110.9053, |
|
"eval_samples_per_second": 32.397, |
|
"eval_steps_per_second": 2.029, |
|
"step": 44000 |
|
}, |
|
{ |
|
"epoch": 0.2225, |
|
"grad_norm": 1.5731582641601562, |
|
"learning_rate": 3.887975e-05, |
|
"loss": 6.3756, |
|
"step": 44500 |
|
}, |
|
{ |
|
"epoch": 1.00173, |
|
"grad_norm": 1.9111318588256836, |
|
"learning_rate": 3.875475e-05, |
|
"loss": 6.3715, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.00173, |
|
"eval_accuracy": 0.03136843551686302, |
|
"eval_loss": 6.31905460357666, |
|
"eval_runtime": 98.2753, |
|
"eval_samples_per_second": 36.561, |
|
"eval_steps_per_second": 2.289, |
|
"step": 45000 |
|
}, |
|
{ |
|
"epoch": 1.00423, |
|
"grad_norm": 1.8441720008850098, |
|
"learning_rate": 3.8629750000000004e-05, |
|
"loss": 6.3367, |
|
"step": 45500 |
|
}, |
|
{ |
|
"epoch": 1.00673, |
|
"grad_norm": 1.122545838356018, |
|
"learning_rate": 3.850475e-05, |
|
"loss": 6.3588, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.00673, |
|
"eval_accuracy": 0.03153237540738382, |
|
"eval_loss": 6.308679103851318, |
|
"eval_runtime": 103.4383, |
|
"eval_samples_per_second": 34.736, |
|
"eval_steps_per_second": 2.175, |
|
"step": 46000 |
|
}, |
|
{ |
|
"epoch": 1.00923, |
|
"grad_norm": 1.5169119834899902, |
|
"learning_rate": 3.8379750000000005e-05, |
|
"loss": 6.3561, |
|
"step": 46500 |
|
}, |
|
{ |
|
"epoch": 1.01173, |
|
"grad_norm": 2.325814723968506, |
|
"learning_rate": 3.825475e-05, |
|
"loss": 6.3802, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.01173, |
|
"eval_accuracy": 0.03152312402766979, |
|
"eval_loss": 6.2924675941467285, |
|
"eval_runtime": 92.1003, |
|
"eval_samples_per_second": 39.012, |
|
"eval_steps_per_second": 2.443, |
|
"step": 47000 |
|
}, |
|
{ |
|
"epoch": 1.01423, |
|
"grad_norm": 2.1370677947998047, |
|
"learning_rate": 3.8129750000000005e-05, |
|
"loss": 6.3738, |
|
"step": 47500 |
|
}, |
|
{ |
|
"epoch": 1.01673, |
|
"grad_norm": 1.6703208684921265, |
|
"learning_rate": 3.8005e-05, |
|
"loss": 6.3708, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.01673, |
|
"eval_accuracy": 0.03175032702947041, |
|
"eval_loss": 6.304358959197998, |
|
"eval_runtime": 92.2936, |
|
"eval_samples_per_second": 38.93, |
|
"eval_steps_per_second": 2.438, |
|
"step": 48000 |
|
}, |
|
{ |
|
"epoch": 1.01923, |
|
"grad_norm": 2.00714111328125, |
|
"learning_rate": 3.788e-05, |
|
"loss": 6.3374, |
|
"step": 48500 |
|
}, |
|
{ |
|
"epoch": 1.02173, |
|
"grad_norm": 1.4548686742782593, |
|
"learning_rate": 3.7755e-05, |
|
"loss": 6.3189, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.02173, |
|
"eval_accuracy": 0.030815393479545948, |
|
"eval_loss": 6.318645477294922, |
|
"eval_runtime": 91.5251, |
|
"eval_samples_per_second": 39.257, |
|
"eval_steps_per_second": 2.458, |
|
"step": 49000 |
|
}, |
|
{ |
|
"epoch": 1.02423, |
|
"grad_norm": 1.719914436340332, |
|
"learning_rate": 3.7630000000000004e-05, |
|
"loss": 6.3527, |
|
"step": 49500 |
|
}, |
|
{ |
|
"epoch": 1.02673, |
|
"grad_norm": 1.7514491081237793, |
|
"learning_rate": 3.7505e-05, |
|
"loss": 6.3545, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.02673, |
|
"eval_accuracy": 0.030726689074052533, |
|
"eval_loss": 6.3023600578308105, |
|
"eval_runtime": 92.8566, |
|
"eval_samples_per_second": 38.694, |
|
"eval_steps_per_second": 2.423, |
|
"step": 50000 |
|
}, |
|
{ |
|
"epoch": 1.02923, |
|
"grad_norm": 1.5919264554977417, |
|
"learning_rate": 3.7380000000000005e-05, |
|
"loss": 6.3373, |
|
"step": 50500 |
|
}, |
|
{ |
|
"epoch": 1.03173, |
|
"grad_norm": 2.918543815612793, |
|
"learning_rate": 3.7255e-05, |
|
"loss": 6.3255, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.03173, |
|
"eval_accuracy": 0.030562341034426697, |
|
"eval_loss": 6.30160665512085, |
|
"eval_runtime": 92.2685, |
|
"eval_samples_per_second": 38.941, |
|
"eval_steps_per_second": 2.439, |
|
"step": 51000 |
|
}, |
|
{ |
|
"epoch": 1.03423, |
|
"grad_norm": 2.352147340774536, |
|
"learning_rate": 3.713025e-05, |
|
"loss": 6.3399, |
|
"step": 51500 |
|
}, |
|
{ |
|
"epoch": 1.03673, |
|
"grad_norm": 2.9179019927978516, |
|
"learning_rate": 3.700525e-05, |
|
"loss": 6.3162, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.03673, |
|
"eval_accuracy": 0.03165182704545624, |
|
"eval_loss": 6.283206462860107, |
|
"eval_runtime": 91.4548, |
|
"eval_samples_per_second": 39.287, |
|
"eval_steps_per_second": 2.46, |
|
"step": 52000 |
|
}, |
|
{ |
|
"epoch": 1.03923, |
|
"grad_norm": 1.9093166589736938, |
|
"learning_rate": 3.6880249999999996e-05, |
|
"loss": 6.2978, |
|
"step": 52500 |
|
}, |
|
{ |
|
"epoch": 1.04173, |
|
"grad_norm": 1.792311668395996, |
|
"learning_rate": 3.675525e-05, |
|
"loss": 6.309, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.04173, |
|
"eval_accuracy": 0.030514859688541417, |
|
"eval_loss": 6.273383140563965, |
|
"eval_runtime": 99.4034, |
|
"eval_samples_per_second": 36.146, |
|
"eval_steps_per_second": 2.264, |
|
"step": 53000 |
|
}, |
|
{ |
|
"epoch": 1.04423, |
|
"grad_norm": 1.311917781829834, |
|
"learning_rate": 3.663025e-05, |
|
"loss": 6.303, |
|
"step": 53500 |
|
}, |
|
{ |
|
"epoch": 1.04673, |
|
"grad_norm": 1.8375986814498901, |
|
"learning_rate": 3.6505500000000005e-05, |
|
"loss": 6.314, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.04673, |
|
"eval_accuracy": 0.03121020971381265, |
|
"eval_loss": 6.250477313995361, |
|
"eval_runtime": 91.4556, |
|
"eval_samples_per_second": 39.287, |
|
"eval_steps_per_second": 2.46, |
|
"step": 54000 |
|
}, |
|
{ |
|
"epoch": 1.04923, |
|
"grad_norm": 1.672000527381897, |
|
"learning_rate": 3.63805e-05, |
|
"loss": 6.296, |
|
"step": 54500 |
|
}, |
|
{ |
|
"epoch": 1.05173, |
|
"grad_norm": 1.5362963676452637, |
|
"learning_rate": 3.6255500000000005e-05, |
|
"loss": 6.293, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.05173, |
|
"eval_accuracy": 0.031686927868488916, |
|
"eval_loss": 6.259158611297607, |
|
"eval_runtime": 90.5801, |
|
"eval_samples_per_second": 39.667, |
|
"eval_steps_per_second": 2.484, |
|
"step": 55000 |
|
}, |
|
{ |
|
"epoch": 1.05423, |
|
"grad_norm": 1.3914135694503784, |
|
"learning_rate": 3.61305e-05, |
|
"loss": 6.2675, |
|
"step": 55500 |
|
}, |
|
{ |
|
"epoch": 1.05673, |
|
"grad_norm": 1.923341989517212, |
|
"learning_rate": 3.6005500000000006e-05, |
|
"loss": 6.2813, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.05673, |
|
"eval_accuracy": 0.0307902242847357, |
|
"eval_loss": 6.2271199226379395, |
|
"eval_runtime": 108.1752, |
|
"eval_samples_per_second": 33.215, |
|
"eval_steps_per_second": 2.08, |
|
"step": 56000 |
|
}, |
|
{ |
|
"epoch": 1.05923, |
|
"grad_norm": 1.8110476732254028, |
|
"learning_rate": 3.58805e-05, |
|
"loss": 6.2753, |
|
"step": 56500 |
|
}, |
|
{ |
|
"epoch": 1.06173, |
|
"grad_norm": 3.4441418647766113, |
|
"learning_rate": 3.575575e-05, |
|
"loss": 6.2781, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.06173, |
|
"eval_accuracy": 0.030471595883408125, |
|
"eval_loss": 6.252005100250244, |
|
"eval_runtime": 119.6594, |
|
"eval_samples_per_second": 30.027, |
|
"eval_steps_per_second": 1.88, |
|
"step": 57000 |
|
}, |
|
{ |
|
"epoch": 1.06423, |
|
"grad_norm": 2.464296340942383, |
|
"learning_rate": 3.563075e-05, |
|
"loss": 6.2624, |
|
"step": 57500 |
|
}, |
|
{ |
|
"epoch": 1.06673, |
|
"grad_norm": 1.7239878177642822, |
|
"learning_rate": 3.5505750000000005e-05, |
|
"loss": 6.2625, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.06673, |
|
"eval_accuracy": 0.03089743144965719, |
|
"eval_loss": 6.220044136047363, |
|
"eval_runtime": 91.2098, |
|
"eval_samples_per_second": 39.393, |
|
"eval_steps_per_second": 2.467, |
|
"step": 58000 |
|
}, |
|
{ |
|
"epoch": 1.06923, |
|
"grad_norm": 3.2937629222869873, |
|
"learning_rate": 3.538075e-05, |
|
"loss": 6.2822, |
|
"step": 58500 |
|
}, |
|
{ |
|
"epoch": 1.07173, |
|
"grad_norm": 2.1419010162353516, |
|
"learning_rate": 3.5255750000000005e-05, |
|
"loss": 6.2638, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.07173, |
|
"eval_accuracy": 0.03013310422563571, |
|
"eval_loss": 6.199001789093018, |
|
"eval_runtime": 92.4858, |
|
"eval_samples_per_second": 38.849, |
|
"eval_steps_per_second": 2.433, |
|
"step": 59000 |
|
}, |
|
{ |
|
"epoch": 1.07423, |
|
"grad_norm": 1.3166654109954834, |
|
"learning_rate": 3.5131e-05, |
|
"loss": 6.2721, |
|
"step": 59500 |
|
}, |
|
{ |
|
"epoch": 1.07673, |
|
"grad_norm": 2.0733723640441895, |
|
"learning_rate": 3.5005999999999997e-05, |
|
"loss": 6.2455, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.07673, |
|
"eval_accuracy": 0.031115110972340414, |
|
"eval_loss": 6.203488349914551, |
|
"eval_runtime": 110.8283, |
|
"eval_samples_per_second": 32.42, |
|
"eval_steps_per_second": 2.03, |
|
"step": 60000 |
|
}, |
|
{ |
|
"epoch": 1.07923, |
|
"grad_norm": 1.416002631187439, |
|
"learning_rate": 3.4881e-05, |
|
"loss": 6.2489, |
|
"step": 60500 |
|
}, |
|
{ |
|
"epoch": 1.08173, |
|
"grad_norm": 2.517613172531128, |
|
"learning_rate": 3.4756e-05, |
|
"loss": 6.253, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.08173, |
|
"eval_accuracy": 0.03135415029818693, |
|
"eval_loss": 6.215968608856201, |
|
"eval_runtime": 101.5668, |
|
"eval_samples_per_second": 35.376, |
|
"eval_steps_per_second": 2.215, |
|
"step": 61000 |
|
}, |
|
{ |
|
"epoch": 1.08423, |
|
"grad_norm": 1.8757721185684204, |
|
"learning_rate": 3.4631e-05, |
|
"loss": 6.2412, |
|
"step": 61500 |
|
}, |
|
{ |
|
"epoch": 1.08673, |
|
"grad_norm": 1.971893072128296, |
|
"learning_rate": 3.4506e-05, |
|
"loss": 6.2408, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.08673, |
|
"eval_accuracy": 0.030135144971160866, |
|
"eval_loss": 6.208563804626465, |
|
"eval_runtime": 92.1622, |
|
"eval_samples_per_second": 38.986, |
|
"eval_steps_per_second": 2.441, |
|
"step": 62000 |
|
}, |
|
{ |
|
"epoch": 1.08923, |
|
"grad_norm": 1.688067078590393, |
|
"learning_rate": 3.4381e-05, |
|
"loss": 6.2355, |
|
"step": 62500 |
|
}, |
|
{ |
|
"epoch": 1.09173, |
|
"grad_norm": 1.7897340059280396, |
|
"learning_rate": 3.4256e-05, |
|
"loss": 6.2332, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.09173, |
|
"eval_accuracy": 0.029759647794532298, |
|
"eval_loss": 6.192452907562256, |
|
"eval_runtime": 91.4457, |
|
"eval_samples_per_second": 39.291, |
|
"eval_steps_per_second": 2.46, |
|
"step": 63000 |
|
}, |
|
{ |
|
"epoch": 1.09423, |
|
"grad_norm": 1.7778252363204956, |
|
"learning_rate": 3.4131250000000006e-05, |
|
"loss": 6.2307, |
|
"step": 63500 |
|
}, |
|
{ |
|
"epoch": 1.09673, |
|
"grad_norm": 2.530224561691284, |
|
"learning_rate": 3.400625e-05, |
|
"loss": 6.2182, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.09673, |
|
"eval_accuracy": 0.030409149070338377, |
|
"eval_loss": 6.166359901428223, |
|
"eval_runtime": 93.8927, |
|
"eval_samples_per_second": 38.267, |
|
"eval_steps_per_second": 2.396, |
|
"step": 64000 |
|
}, |
|
{ |
|
"epoch": 1.09923, |
|
"grad_norm": 2.602126121520996, |
|
"learning_rate": 3.388125e-05, |
|
"loss": 6.2163, |
|
"step": 64500 |
|
}, |
|
{ |
|
"epoch": 1.10173, |
|
"grad_norm": 2.1342318058013916, |
|
"learning_rate": 3.37565e-05, |
|
"loss": 6.2301, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.10173, |
|
"eval_accuracy": 0.030321941211563407, |
|
"eval_loss": 6.158266067504883, |
|
"eval_runtime": 97.9208, |
|
"eval_samples_per_second": 36.693, |
|
"eval_steps_per_second": 2.298, |
|
"step": 65000 |
|
}, |
|
{ |
|
"epoch": 1.10423, |
|
"grad_norm": 1.733469843864441, |
|
"learning_rate": 3.363175e-05, |
|
"loss": 6.2318, |
|
"step": 65500 |
|
}, |
|
{ |
|
"epoch": 1.10673, |
|
"grad_norm": 3.459373950958252, |
|
"learning_rate": 3.3506750000000006e-05, |
|
"loss": 6.2379, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.10673, |
|
"eval_accuracy": 0.030465745746236016, |
|
"eval_loss": 6.1884002685546875, |
|
"eval_runtime": 95.0294, |
|
"eval_samples_per_second": 37.809, |
|
"eval_steps_per_second": 2.368, |
|
"step": 66000 |
|
}, |
|
{ |
|
"epoch": 1.10923, |
|
"grad_norm": 2.1125833988189697, |
|
"learning_rate": 3.338175e-05, |
|
"loss": 6.2044, |
|
"step": 66500 |
|
}, |
|
{ |
|
"epoch": 1.11173, |
|
"grad_norm": 1.7863088846206665, |
|
"learning_rate": 3.3256750000000006e-05, |
|
"loss": 6.2211, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.11173, |
|
"eval_accuracy": 0.03108939757872346, |
|
"eval_loss": 6.161408424377441, |
|
"eval_runtime": 94.8294, |
|
"eval_samples_per_second": 37.889, |
|
"eval_steps_per_second": 2.373, |
|
"step": 67000 |
|
}, |
|
{ |
|
"epoch": 1.11423, |
|
"grad_norm": 2.1072041988372803, |
|
"learning_rate": 3.3131750000000003e-05, |
|
"loss": 6.2086, |
|
"step": 67500 |
|
}, |
|
{ |
|
"epoch": 1.11673, |
|
"grad_norm": 1.647073745727539, |
|
"learning_rate": 3.300675e-05, |
|
"loss": 6.2018, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.11673, |
|
"eval_accuracy": 0.030671997093978373, |
|
"eval_loss": 6.1607513427734375, |
|
"eval_runtime": 91.8853, |
|
"eval_samples_per_second": 39.103, |
|
"eval_steps_per_second": 2.449, |
|
"step": 68000 |
|
}, |
|
{ |
|
"epoch": 1.11923, |
|
"grad_norm": 2.174481153488159, |
|
"learning_rate": 3.288175e-05, |
|
"loss": 6.1837, |
|
"step": 68500 |
|
}, |
|
{ |
|
"epoch": 1.12173, |
|
"grad_norm": 1.5667694807052612, |
|
"learning_rate": 3.275675e-05, |
|
"loss": 6.1969, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.12173, |
|
"eval_accuracy": 0.030524519217360487, |
|
"eval_loss": 6.133298397064209, |
|
"eval_runtime": 140.1649, |
|
"eval_samples_per_second": 25.634, |
|
"eval_steps_per_second": 1.605, |
|
"step": 69000 |
|
}, |
|
{ |
|
"epoch": 1.12423, |
|
"grad_norm": 2.3138535022735596, |
|
"learning_rate": 3.263175e-05, |
|
"loss": 6.2001, |
|
"step": 69500 |
|
}, |
|
{ |
|
"epoch": 1.12673, |
|
"grad_norm": 2.0583817958831787, |
|
"learning_rate": 3.2507000000000006e-05, |
|
"loss": 6.1989, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.12673, |
|
"eval_accuracy": 0.030227250619196205, |
|
"eval_loss": 6.131419658660889, |
|
"eval_runtime": 92.9886, |
|
"eval_samples_per_second": 38.639, |
|
"eval_steps_per_second": 2.42, |
|
"step": 70000 |
|
}, |
|
{ |
|
"epoch": 1.12923, |
|
"grad_norm": 2.3746538162231445, |
|
"learning_rate": 3.2382e-05, |
|
"loss": 6.2039, |
|
"step": 70500 |
|
}, |
|
{ |
|
"epoch": 1.1317300000000001, |
|
"grad_norm": 2.146286964416504, |
|
"learning_rate": 3.2257000000000006e-05, |
|
"loss": 6.2058, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.1317300000000001, |
|
"eval_accuracy": 0.029887942663213724, |
|
"eval_loss": 6.151033878326416, |
|
"eval_runtime": 115.2908, |
|
"eval_samples_per_second": 31.165, |
|
"eval_steps_per_second": 1.952, |
|
"step": 71000 |
|
}, |
|
{ |
|
"epoch": 1.13423, |
|
"grad_norm": 1.7566348314285278, |
|
"learning_rate": 3.2132e-05, |
|
"loss": 6.1963, |
|
"step": 71500 |
|
}, |
|
{ |
|
"epoch": 1.13673, |
|
"grad_norm": 1.7382845878601074, |
|
"learning_rate": 3.2007e-05, |
|
"loss": 6.1994, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.13673, |
|
"eval_accuracy": 0.02953489368736187, |
|
"eval_loss": 6.144207954406738, |
|
"eval_runtime": 106.6363, |
|
"eval_samples_per_second": 33.694, |
|
"eval_steps_per_second": 2.11, |
|
"step": 72000 |
|
}, |
|
{ |
|
"epoch": 1.13923, |
|
"grad_norm": 2.3577260971069336, |
|
"learning_rate": 3.188225e-05, |
|
"loss": 6.1737, |
|
"step": 72500 |
|
}, |
|
{ |
|
"epoch": 1.14173, |
|
"grad_norm": 1.917486310005188, |
|
"learning_rate": 3.175725e-05, |
|
"loss": 6.1715, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.14173, |
|
"eval_accuracy": 0.029909030366973663, |
|
"eval_loss": 6.13955545425415, |
|
"eval_runtime": 92.2027, |
|
"eval_samples_per_second": 38.968, |
|
"eval_steps_per_second": 2.44, |
|
"step": 73000 |
|
}, |
|
{ |
|
"epoch": 1.14423, |
|
"grad_norm": 2.7271180152893066, |
|
"learning_rate": 3.163225e-05, |
|
"loss": 6.1803, |
|
"step": 73500 |
|
}, |
|
{ |
|
"epoch": 1.14673, |
|
"grad_norm": 1.5343501567840576, |
|
"learning_rate": 3.150725e-05, |
|
"loss": 6.1849, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.14673, |
|
"eval_accuracy": 0.029956919861963974, |
|
"eval_loss": 6.108343601226807, |
|
"eval_runtime": 94.132, |
|
"eval_samples_per_second": 38.17, |
|
"eval_steps_per_second": 2.39, |
|
"step": 74000 |
|
}, |
|
{ |
|
"epoch": 1.14923, |
|
"grad_norm": 2.063617706298828, |
|
"learning_rate": 3.138225e-05, |
|
"loss": 6.1601, |
|
"step": 74500 |
|
}, |
|
{ |
|
"epoch": 1.15173, |
|
"grad_norm": 1.7743293046951294, |
|
"learning_rate": 3.1257500000000004e-05, |
|
"loss": 6.1709, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.15173, |
|
"eval_accuracy": 0.030241399788170614, |
|
"eval_loss": 6.083706855773926, |
|
"eval_runtime": 91.1389, |
|
"eval_samples_per_second": 39.423, |
|
"eval_steps_per_second": 2.469, |
|
"step": 75000 |
|
}, |
|
{ |
|
"epoch": 1.15423, |
|
"grad_norm": 2.292107343673706, |
|
"learning_rate": 3.11325e-05, |
|
"loss": 6.1532, |
|
"step": 75500 |
|
}, |
|
{ |
|
"epoch": 1.15673, |
|
"grad_norm": 1.8466393947601318, |
|
"learning_rate": 3.10075e-05, |
|
"loss": 6.1669, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.15673, |
|
"eval_accuracy": 0.02916497454850206, |
|
"eval_loss": 6.0924787521362305, |
|
"eval_runtime": 93.9311, |
|
"eval_samples_per_second": 38.251, |
|
"eval_steps_per_second": 2.395, |
|
"step": 76000 |
|
}, |
|
{ |
|
"epoch": 1.15923, |
|
"grad_norm": 2.8982222080230713, |
|
"learning_rate": 3.08825e-05, |
|
"loss": 6.1602, |
|
"step": 76500 |
|
}, |
|
{ |
|
"epoch": 1.16173, |
|
"grad_norm": 2.5072736740112305, |
|
"learning_rate": 3.07575e-05, |
|
"loss": 6.16, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.16173, |
|
"eval_accuracy": 0.029235040144865724, |
|
"eval_loss": 6.0938825607299805, |
|
"eval_runtime": 98.3356, |
|
"eval_samples_per_second": 36.538, |
|
"eval_steps_per_second": 2.288, |
|
"step": 77000 |
|
}, |
|
{ |
|
"epoch": 1.1642299999999999, |
|
"grad_norm": 1.8945947885513306, |
|
"learning_rate": 3.06325e-05, |
|
"loss": 6.1295, |
|
"step": 77500 |
|
}, |
|
{ |
|
"epoch": 1.16673, |
|
"grad_norm": 3.0348196029663086, |
|
"learning_rate": 3.0507750000000003e-05, |
|
"loss": 6.1637, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.16673, |
|
"eval_accuracy": 0.029682371563979754, |
|
"eval_loss": 6.0949811935424805, |
|
"eval_runtime": 95.4626, |
|
"eval_samples_per_second": 37.638, |
|
"eval_steps_per_second": 2.357, |
|
"step": 78000 |
|
}, |
|
{ |
|
"epoch": 1.16923, |
|
"grad_norm": 1.6715943813323975, |
|
"learning_rate": 3.038275e-05, |
|
"loss": 6.1729, |
|
"step": 78500 |
|
}, |
|
{ |
|
"epoch": 1.17173, |
|
"grad_norm": 1.6270267963409424, |
|
"learning_rate": 3.025775e-05, |
|
"loss": 6.1446, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.17173, |
|
"eval_accuracy": 0.029339254216350318, |
|
"eval_loss": 6.08974552154541, |
|
"eval_runtime": 91.6584, |
|
"eval_samples_per_second": 39.2, |
|
"eval_steps_per_second": 2.455, |
|
"step": 79000 |
|
}, |
|
{ |
|
"epoch": 1.17423, |
|
"grad_norm": 2.177884817123413, |
|
"learning_rate": 3.0132750000000004e-05, |
|
"loss": 6.1339, |
|
"step": 79500 |
|
}, |
|
{ |
|
"epoch": 1.17673, |
|
"grad_norm": 2.0329620838165283, |
|
"learning_rate": 3.000775e-05, |
|
"loss": 6.1231, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.17673, |
|
"eval_accuracy": 0.02976223273886416, |
|
"eval_loss": 6.078031063079834, |
|
"eval_runtime": 98.9799, |
|
"eval_samples_per_second": 36.3, |
|
"eval_steps_per_second": 2.273, |
|
"step": 80000 |
|
}, |
|
{ |
|
"epoch": 1.17923, |
|
"grad_norm": 2.5211639404296875, |
|
"learning_rate": 2.9883e-05, |
|
"loss": 6.1297, |
|
"step": 80500 |
|
}, |
|
{ |
|
"epoch": 1.18173, |
|
"grad_norm": 3.6579315662384033, |
|
"learning_rate": 2.9758000000000002e-05, |
|
"loss": 6.1287, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.18173, |
|
"eval_accuracy": 0.029048379954164854, |
|
"eval_loss": 6.091179370880127, |
|
"eval_runtime": 94.4808, |
|
"eval_samples_per_second": 38.029, |
|
"eval_steps_per_second": 2.381, |
|
"step": 81000 |
|
}, |
|
{ |
|
"epoch": 1.18423, |
|
"grad_norm": 2.1335196495056152, |
|
"learning_rate": 2.9633e-05, |
|
"loss": 6.1302, |
|
"step": 81500 |
|
}, |
|
{ |
|
"epoch": 1.18673, |
|
"grad_norm": 1.8634554147720337, |
|
"learning_rate": 2.9508000000000003e-05, |
|
"loss": 6.1196, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.18673, |
|
"eval_accuracy": 0.028964029139125106, |
|
"eval_loss": 6.08493709564209, |
|
"eval_runtime": 102.0825, |
|
"eval_samples_per_second": 35.197, |
|
"eval_steps_per_second": 2.204, |
|
"step": 82000 |
|
}, |
|
{ |
|
"epoch": 1.18923, |
|
"grad_norm": 3.3065264225006104, |
|
"learning_rate": 2.938325e-05, |
|
"loss": 6.1363, |
|
"step": 82500 |
|
}, |
|
{ |
|
"epoch": 1.19173, |
|
"grad_norm": 2.0532631874084473, |
|
"learning_rate": 2.9258250000000004e-05, |
|
"loss": 6.1136, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.19173, |
|
"eval_accuracy": 0.029438298399171184, |
|
"eval_loss": 6.061553478240967, |
|
"eval_runtime": 114.3059, |
|
"eval_samples_per_second": 31.433, |
|
"eval_steps_per_second": 1.968, |
|
"step": 83000 |
|
}, |
|
{ |
|
"epoch": 1.1942300000000001, |
|
"grad_norm": 1.6184016466140747, |
|
"learning_rate": 2.913325e-05, |
|
"loss": 6.1084, |
|
"step": 83500 |
|
}, |
|
{ |
|
"epoch": 1.19673, |
|
"grad_norm": 3.0933752059936523, |
|
"learning_rate": 2.9008250000000005e-05, |
|
"loss": 6.1135, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.19673, |
|
"eval_accuracy": 0.02910620107737759, |
|
"eval_loss": 6.051595211029053, |
|
"eval_runtime": 97.7568, |
|
"eval_samples_per_second": 36.754, |
|
"eval_steps_per_second": 2.302, |
|
"step": 84000 |
|
}, |
|
{ |
|
"epoch": 1.19923, |
|
"grad_norm": 2.0492806434631348, |
|
"learning_rate": 2.888325e-05, |
|
"loss": 6.1135, |
|
"step": 84500 |
|
}, |
|
{ |
|
"epoch": 1.20173, |
|
"grad_norm": 2.1396772861480713, |
|
"learning_rate": 2.87585e-05, |
|
"loss": 6.1157, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.20173, |
|
"eval_accuracy": 0.029641148504371616, |
|
"eval_loss": 6.051669120788574, |
|
"eval_runtime": 135.3455, |
|
"eval_samples_per_second": 26.547, |
|
"eval_steps_per_second": 1.662, |
|
"step": 85000 |
|
}, |
|
{ |
|
"epoch": 1.20423, |
|
"grad_norm": 1.9825798273086548, |
|
"learning_rate": 2.8633500000000003e-05, |
|
"loss": 6.1003, |
|
"step": 85500 |
|
}, |
|
{ |
|
"epoch": 1.20673, |
|
"grad_norm": 4.120855331420898, |
|
"learning_rate": 2.85085e-05, |
|
"loss": 6.1102, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.20673, |
|
"eval_accuracy": 0.029109058121112804, |
|
"eval_loss": 6.062215328216553, |
|
"eval_runtime": 93.9666, |
|
"eval_samples_per_second": 38.237, |
|
"eval_steps_per_second": 2.394, |
|
"step": 86000 |
|
}, |
|
{ |
|
"epoch": 1.20923, |
|
"grad_norm": 1.7419639825820923, |
|
"learning_rate": 2.83835e-05, |
|
"loss": 6.1052, |
|
"step": 86500 |
|
}, |
|
{ |
|
"epoch": 1.21173, |
|
"grad_norm": 1.9040395021438599, |
|
"learning_rate": 2.825875e-05, |
|
"loss": 6.1218, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.21173, |
|
"eval_accuracy": 0.02853969011959449, |
|
"eval_loss": 6.063874244689941, |
|
"eval_runtime": 91.0034, |
|
"eval_samples_per_second": 39.482, |
|
"eval_steps_per_second": 2.472, |
|
"step": 87000 |
|
}, |
|
{ |
|
"epoch": 1.21423, |
|
"grad_norm": 3.0082874298095703, |
|
"learning_rate": 2.813375e-05, |
|
"loss": 6.1101, |
|
"step": 87500 |
|
}, |
|
{ |
|
"epoch": 1.21673, |
|
"grad_norm": 1.843441367149353, |
|
"learning_rate": 2.8008750000000002e-05, |
|
"loss": 6.1104, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.21673, |
|
"eval_accuracy": 0.029038720425345787, |
|
"eval_loss": 6.0515055656433105, |
|
"eval_runtime": 92.5288, |
|
"eval_samples_per_second": 38.831, |
|
"eval_steps_per_second": 2.432, |
|
"step": 88000 |
|
}, |
|
{ |
|
"epoch": 1.21923, |
|
"grad_norm": 1.9573761224746704, |
|
"learning_rate": 2.7883750000000002e-05, |
|
"loss": 6.1118, |
|
"step": 88500 |
|
}, |
|
{ |
|
"epoch": 1.22173, |
|
"grad_norm": 2.1925711631774902, |
|
"learning_rate": 2.7759000000000003e-05, |
|
"loss": 6.0777, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 1.22173, |
|
"eval_accuracy": 0.02947503181862398, |
|
"eval_loss": 6.0190510749816895, |
|
"eval_runtime": 91.9724, |
|
"eval_samples_per_second": 39.066, |
|
"eval_steps_per_second": 2.446, |
|
"step": 89000 |
|
}, |
|
{ |
|
"epoch": 2.00096, |
|
"grad_norm": 2.6323490142822266, |
|
"learning_rate": 2.7634e-05, |
|
"loss": 6.1035, |
|
"step": 89500 |
|
}, |
|
{ |
|
"epoch": 2.00346, |
|
"grad_norm": 2.907351016998291, |
|
"learning_rate": 2.7509e-05, |
|
"loss": 6.051, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.00346, |
|
"eval_accuracy": 0.028725125862980264, |
|
"eval_loss": 6.004823207855225, |
|
"eval_runtime": 111.8414, |
|
"eval_samples_per_second": 32.126, |
|
"eval_steps_per_second": 2.012, |
|
"step": 90000 |
|
}, |
|
{ |
|
"epoch": 2.00596, |
|
"grad_norm": 2.1557071208953857, |
|
"learning_rate": 2.7383999999999997e-05, |
|
"loss": 6.0777, |
|
"step": 90500 |
|
}, |
|
{ |
|
"epoch": 2.00846, |
|
"grad_norm": 2.466111183166504, |
|
"learning_rate": 2.7259e-05, |
|
"loss": 6.065, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 2.00846, |
|
"eval_accuracy": 0.0288474345447879, |
|
"eval_loss": 6.030221462249756, |
|
"eval_runtime": 91.3043, |
|
"eval_samples_per_second": 39.352, |
|
"eval_steps_per_second": 2.464, |
|
"step": 91000 |
|
}, |
|
{ |
|
"epoch": 2.01096, |
|
"grad_norm": 2.554461717605591, |
|
"learning_rate": 2.713425e-05, |
|
"loss": 6.1054, |
|
"step": 91500 |
|
}, |
|
{ |
|
"epoch": 2.01346, |
|
"grad_norm": 2.1461994647979736, |
|
"learning_rate": 2.7009250000000002e-05, |
|
"loss": 6.0941, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 2.01346, |
|
"eval_accuracy": 0.028430578258849523, |
|
"eval_loss": 6.029834747314453, |
|
"eval_runtime": 137.2429, |
|
"eval_samples_per_second": 26.18, |
|
"eval_steps_per_second": 1.639, |
|
"step": 92000 |
|
}, |
|
{ |
|
"epoch": 2.01596, |
|
"grad_norm": 1.8053221702575684, |
|
"learning_rate": 2.688425e-05, |
|
"loss": 6.0738, |
|
"step": 92500 |
|
}, |
|
{ |
|
"epoch": 2.01846, |
|
"grad_norm": 1.9942346811294556, |
|
"learning_rate": 2.6759250000000003e-05, |
|
"loss": 6.0833, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 2.01846, |
|
"eval_accuracy": 0.02874675776554691, |
|
"eval_loss": 6.014132022857666, |
|
"eval_runtime": 96.8636, |
|
"eval_samples_per_second": 37.093, |
|
"eval_steps_per_second": 2.323, |
|
"step": 93000 |
|
}, |
|
{ |
|
"epoch": 2.02096, |
|
"grad_norm": 1.6957967281341553, |
|
"learning_rate": 2.663425e-05, |
|
"loss": 6.0429, |
|
"step": 93500 |
|
}, |
|
{ |
|
"epoch": 2.02346, |
|
"grad_norm": 2.633817672729492, |
|
"learning_rate": 2.6509500000000004e-05, |
|
"loss": 6.0816, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 2.02346, |
|
"eval_accuracy": 0.0281086846646817, |
|
"eval_loss": 6.0137176513671875, |
|
"eval_runtime": 97.3569, |
|
"eval_samples_per_second": 36.905, |
|
"eval_steps_per_second": 2.311, |
|
"step": 94000 |
|
}, |
|
{ |
|
"epoch": 2.02596, |
|
"grad_norm": 2.664257049560547, |
|
"learning_rate": 2.6384750000000002e-05, |
|
"loss": 6.0638, |
|
"step": 94500 |
|
}, |
|
{ |
|
"epoch": 2.02846, |
|
"grad_norm": 2.147036552429199, |
|
"learning_rate": 2.625975e-05, |
|
"loss": 6.0771, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 2.02846, |
|
"eval_accuracy": 0.029002667254401378, |
|
"eval_loss": 6.028487205505371, |
|
"eval_runtime": 97.1346, |
|
"eval_samples_per_second": 36.99, |
|
"eval_steps_per_second": 2.316, |
|
"step": 95000 |
|
}, |
|
{ |
|
"epoch": 2.03096, |
|
"grad_norm": 2.5634896755218506, |
|
"learning_rate": 2.6134750000000002e-05, |
|
"loss": 6.0474, |
|
"step": 95500 |
|
}, |
|
{ |
|
"epoch": 2.03346, |
|
"grad_norm": 2.277717113494873, |
|
"learning_rate": 2.600975e-05, |
|
"loss": 6.0646, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 2.03346, |
|
"eval_accuracy": 0.027694413323075186, |
|
"eval_loss": 6.009863376617432, |
|
"eval_runtime": 92.8998, |
|
"eval_samples_per_second": 38.676, |
|
"eval_steps_per_second": 2.422, |
|
"step": 96000 |
|
}, |
|
{ |
|
"epoch": 2.03596, |
|
"grad_norm": 2.0405683517456055, |
|
"learning_rate": 2.5884750000000003e-05, |
|
"loss": 6.0532, |
|
"step": 96500 |
|
}, |
|
{ |
|
"epoch": 2.03846, |
|
"grad_norm": 1.5538651943206787, |
|
"learning_rate": 2.575975e-05, |
|
"loss": 6.0421, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 2.03846, |
|
"eval_accuracy": 0.029365511808773982, |
|
"eval_loss": 6.003146648406982, |
|
"eval_runtime": 112.494, |
|
"eval_samples_per_second": 31.939, |
|
"eval_steps_per_second": 2.0, |
|
"step": 97000 |
|
}, |
|
{ |
|
"epoch": 2.04096, |
|
"grad_norm": 2.2698843479156494, |
|
"learning_rate": 2.563475e-05, |
|
"loss": 6.0434, |
|
"step": 97500 |
|
}, |
|
{ |
|
"epoch": 2.04346, |
|
"grad_norm": 3.3169667720794678, |
|
"learning_rate": 2.5509749999999997e-05, |
|
"loss": 6.0477, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 2.04346, |
|
"eval_accuracy": 0.027985151535558972, |
|
"eval_loss": 5.9978766441345215, |
|
"eval_runtime": 110.0502, |
|
"eval_samples_per_second": 32.649, |
|
"eval_steps_per_second": 2.045, |
|
"step": 98000 |
|
}, |
|
{ |
|
"epoch": 2.04596, |
|
"grad_norm": 1.6990258693695068, |
|
"learning_rate": 2.5385000000000002e-05, |
|
"loss": 6.0594, |
|
"step": 98500 |
|
}, |
|
{ |
|
"epoch": 2.04846, |
|
"grad_norm": 2.433408260345459, |
|
"learning_rate": 2.526e-05, |
|
"loss": 6.0317, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 2.04846, |
|
"eval_accuracy": 0.028621047841197345, |
|
"eval_loss": 5.987879276275635, |
|
"eval_runtime": 97.8847, |
|
"eval_samples_per_second": 36.706, |
|
"eval_steps_per_second": 2.299, |
|
"step": 99000 |
|
}, |
|
{ |
|
"epoch": 2.05096, |
|
"grad_norm": 3.094221830368042, |
|
"learning_rate": 2.5135000000000002e-05, |
|
"loss": 6.0438, |
|
"step": 99500 |
|
}, |
|
{ |
|
"epoch": 2.05346, |
|
"grad_norm": 2.064706802368164, |
|
"learning_rate": 2.501e-05, |
|
"loss": 6.0236, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.05346, |
|
"eval_accuracy": 0.028613837207008466, |
|
"eval_loss": 5.978915691375732, |
|
"eval_runtime": 116.882, |
|
"eval_samples_per_second": 30.74, |
|
"eval_steps_per_second": 1.925, |
|
"step": 100000 |
|
}, |
|
{ |
|
"epoch": 2.05596, |
|
"grad_norm": 2.8103480339050293, |
|
"learning_rate": 2.488525e-05, |
|
"loss": 6.0153, |
|
"step": 100500 |
|
}, |
|
{ |
|
"epoch": 2.05846, |
|
"grad_norm": 3.619741678237915, |
|
"learning_rate": 2.476025e-05, |
|
"loss": 6.0245, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 2.05846, |
|
"eval_accuracy": 0.028634244662260017, |
|
"eval_loss": 5.98130989074707, |
|
"eval_runtime": 100.1482, |
|
"eval_samples_per_second": 35.877, |
|
"eval_steps_per_second": 2.247, |
|
"step": 101000 |
|
}, |
|
{ |
|
"epoch": 2.06096, |
|
"grad_norm": 2.7197930812835693, |
|
"learning_rate": 2.463525e-05, |
|
"loss": 6.0433, |
|
"step": 101500 |
|
}, |
|
{ |
|
"epoch": 2.06346, |
|
"grad_norm": 2.0345962047576904, |
|
"learning_rate": 2.451025e-05, |
|
"loss": 6.0046, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 2.06346, |
|
"eval_accuracy": 0.02719443066941215, |
|
"eval_loss": 5.959959030151367, |
|
"eval_runtime": 99.9834, |
|
"eval_samples_per_second": 35.936, |
|
"eval_steps_per_second": 2.25, |
|
"step": 102000 |
|
}, |
|
{ |
|
"epoch": 2.06596, |
|
"grad_norm": 3.054705858230591, |
|
"learning_rate": 2.43855e-05, |
|
"loss": 6.0308, |
|
"step": 102500 |
|
}, |
|
{ |
|
"epoch": 2.06846, |
|
"grad_norm": 3.065950632095337, |
|
"learning_rate": 2.42605e-05, |
|
"loss": 6.0089, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 2.06846, |
|
"eval_accuracy": 0.028201470561225427, |
|
"eval_loss": 5.969558238983154, |
|
"eval_runtime": 91.0097, |
|
"eval_samples_per_second": 39.479, |
|
"eval_steps_per_second": 2.472, |
|
"step": 103000 |
|
}, |
|
{ |
|
"epoch": 2.07096, |
|
"grad_norm": 2.385056495666504, |
|
"learning_rate": 2.413575e-05, |
|
"loss": 6.0279, |
|
"step": 103500 |
|
}, |
|
{ |
|
"epoch": 2.07346, |
|
"grad_norm": 2.1604409217834473, |
|
"learning_rate": 2.401075e-05, |
|
"loss": 6.0268, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 2.07346, |
|
"eval_accuracy": 0.028443775079912192, |
|
"eval_loss": 5.9631242752075195, |
|
"eval_runtime": 95.1215, |
|
"eval_samples_per_second": 37.773, |
|
"eval_steps_per_second": 2.365, |
|
"step": 104000 |
|
}, |
|
{ |
|
"epoch": 2.07596, |
|
"grad_norm": 2.4640963077545166, |
|
"learning_rate": 2.388575e-05, |
|
"loss": 6.0192, |
|
"step": 104500 |
|
}, |
|
{ |
|
"epoch": 2.07846, |
|
"grad_norm": 3.9500372409820557, |
|
"learning_rate": 2.376075e-05, |
|
"loss": 6.015, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 2.07846, |
|
"eval_accuracy": 0.02789304588752363, |
|
"eval_loss": 5.986013412475586, |
|
"eval_runtime": 95.12, |
|
"eval_samples_per_second": 37.773, |
|
"eval_steps_per_second": 2.365, |
|
"step": 105000 |
|
}, |
|
{ |
|
"epoch": 2.08096, |
|
"grad_norm": 2.237582206726074, |
|
"learning_rate": 2.363575e-05, |
|
"loss": 6.0236, |
|
"step": 105500 |
|
}, |
|
{ |
|
"epoch": 2.08346, |
|
"grad_norm": 2.649489641189575, |
|
"learning_rate": 2.3511000000000002e-05, |
|
"loss": 5.9978, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 2.08346, |
|
"eval_accuracy": 0.02822813630275412, |
|
"eval_loss": 5.95936393737793, |
|
"eval_runtime": 93.2702, |
|
"eval_samples_per_second": 38.522, |
|
"eval_steps_per_second": 2.412, |
|
"step": 106000 |
|
}, |
|
{ |
|
"epoch": 2.08596, |
|
"grad_norm": 2.1337761878967285, |
|
"learning_rate": 2.3386000000000003e-05, |
|
"loss": 6.0018, |
|
"step": 106500 |
|
}, |
|
{ |
|
"epoch": 2.08846, |
|
"grad_norm": 3.1514151096343994, |
|
"learning_rate": 2.3261000000000003e-05, |
|
"loss": 6.0095, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 2.08846, |
|
"eval_accuracy": 0.028024197799940274, |
|
"eval_loss": 5.966735363006592, |
|
"eval_runtime": 105.6735, |
|
"eval_samples_per_second": 34.001, |
|
"eval_steps_per_second": 2.129, |
|
"step": 107000 |
|
}, |
|
{ |
|
"epoch": 2.09096, |
|
"grad_norm": 2.5281171798706055, |
|
"learning_rate": 2.3136000000000003e-05, |
|
"loss": 5.9958, |
|
"step": 107500 |
|
}, |
|
{ |
|
"epoch": 2.09346, |
|
"grad_norm": 2.90545392036438, |
|
"learning_rate": 2.3011e-05, |
|
"loss": 6.008, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 2.09346, |
|
"eval_accuracy": 0.02748897827354289, |
|
"eval_loss": 5.956141471862793, |
|
"eval_runtime": 104.8321, |
|
"eval_samples_per_second": 34.274, |
|
"eval_steps_per_second": 2.146, |
|
"step": 108000 |
|
}, |
|
{ |
|
"epoch": 2.09596, |
|
"grad_norm": 2.385185956954956, |
|
"learning_rate": 2.2886e-05, |
|
"loss": 5.9762, |
|
"step": 108500 |
|
}, |
|
{ |
|
"epoch": 2.09846, |
|
"grad_norm": 3.710510015487671, |
|
"learning_rate": 2.2761e-05, |
|
"loss": 5.9912, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 2.09846, |
|
"eval_accuracy": 0.027843251696709842, |
|
"eval_loss": 5.974761962890625, |
|
"eval_runtime": 92.291, |
|
"eval_samples_per_second": 38.931, |
|
"eval_steps_per_second": 2.438, |
|
"step": 109000 |
|
}, |
|
{ |
|
"epoch": 2.10096, |
|
"grad_norm": 2.344508171081543, |
|
"learning_rate": 2.2636e-05, |
|
"loss": 6.007, |
|
"step": 109500 |
|
}, |
|
{ |
|
"epoch": 2.10346, |
|
"grad_norm": 2.5263593196868896, |
|
"learning_rate": 2.251125e-05, |
|
"loss": 6.0, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 2.10346, |
|
"eval_accuracy": 0.027863114953154685, |
|
"eval_loss": 5.951306343078613, |
|
"eval_runtime": 107.5477, |
|
"eval_samples_per_second": 33.408, |
|
"eval_steps_per_second": 2.092, |
|
"step": 110000 |
|
}, |
|
{ |
|
"epoch": 2.10596, |
|
"grad_norm": 4.0800981521606445, |
|
"learning_rate": 2.238625e-05, |
|
"loss": 6.0135, |
|
"step": 110500 |
|
}, |
|
{ |
|
"epoch": 2.10846, |
|
"grad_norm": 2.456308126449585, |
|
"learning_rate": 2.226125e-05, |
|
"loss": 5.9981, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 2.10846, |
|
"eval_accuracy": 0.02768094440260916, |
|
"eval_loss": 5.935766220092773, |
|
"eval_runtime": 92.4942, |
|
"eval_samples_per_second": 38.846, |
|
"eval_steps_per_second": 2.433, |
|
"step": 111000 |
|
}, |
|
{ |
|
"epoch": 2.11096, |
|
"grad_norm": 2.2109882831573486, |
|
"learning_rate": 2.213625e-05, |
|
"loss": 5.9899, |
|
"step": 111500 |
|
}, |
|
{ |
|
"epoch": 2.11346, |
|
"grad_norm": 2.52247953414917, |
|
"learning_rate": 2.20115e-05, |
|
"loss": 5.9877, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 2.11346, |
|
"eval_accuracy": 0.02789957627320413, |
|
"eval_loss": 5.934952259063721, |
|
"eval_runtime": 107.9372, |
|
"eval_samples_per_second": 33.288, |
|
"eval_steps_per_second": 2.085, |
|
"step": 112000 |
|
}, |
|
{ |
|
"epoch": 2.11596, |
|
"grad_norm": 1.6865503787994385, |
|
"learning_rate": 2.18865e-05, |
|
"loss": 5.9807, |
|
"step": 112500 |
|
}, |
|
{ |
|
"epoch": 2.11846, |
|
"grad_norm": 2.4976794719696045, |
|
"learning_rate": 2.17615e-05, |
|
"loss": 5.9726, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 2.11846, |
|
"eval_accuracy": 0.027755091490023136, |
|
"eval_loss": 5.934043884277344, |
|
"eval_runtime": 98.7369, |
|
"eval_samples_per_second": 36.39, |
|
"eval_steps_per_second": 2.279, |
|
"step": 113000 |
|
}, |
|
{ |
|
"epoch": 2.12096, |
|
"grad_norm": 2.376749038696289, |
|
"learning_rate": 2.1636500000000002e-05, |
|
"loss": 5.9716, |
|
"step": 113500 |
|
}, |
|
{ |
|
"epoch": 2.12346, |
|
"grad_norm": 4.487443447113037, |
|
"learning_rate": 2.1511500000000002e-05, |
|
"loss": 5.9696, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 2.12346, |
|
"eval_accuracy": 0.027384900251759974, |
|
"eval_loss": 5.924759864807129, |
|
"eval_runtime": 92.999, |
|
"eval_samples_per_second": 38.635, |
|
"eval_steps_per_second": 2.419, |
|
"step": 114000 |
|
}, |
|
{ |
|
"epoch": 2.12596, |
|
"grad_norm": 3.5680646896362305, |
|
"learning_rate": 2.138675e-05, |
|
"loss": 5.9932, |
|
"step": 114500 |
|
}, |
|
{ |
|
"epoch": 2.12846, |
|
"grad_norm": 2.196323871612549, |
|
"learning_rate": 2.126175e-05, |
|
"loss": 5.9842, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 2.12846, |
|
"eval_accuracy": 0.027340003850206556, |
|
"eval_loss": 5.95149564743042, |
|
"eval_runtime": 101.0386, |
|
"eval_samples_per_second": 35.561, |
|
"eval_steps_per_second": 2.227, |
|
"step": 115000 |
|
}, |
|
{ |
|
"epoch": 2.13096, |
|
"grad_norm": 2.3321759700775146, |
|
"learning_rate": 2.1136750000000004e-05, |
|
"loss": 5.9865, |
|
"step": 115500 |
|
}, |
|
{ |
|
"epoch": 2.13346, |
|
"grad_norm": 2.0492184162139893, |
|
"learning_rate": 2.101175e-05, |
|
"loss": 5.9919, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 2.13346, |
|
"eval_accuracy": 0.027651693716748603, |
|
"eval_loss": 5.923666477203369, |
|
"eval_runtime": 106.1587, |
|
"eval_samples_per_second": 33.846, |
|
"eval_steps_per_second": 2.119, |
|
"step": 116000 |
|
}, |
|
{ |
|
"epoch": 2.13596, |
|
"grad_norm": 2.516122817993164, |
|
"learning_rate": 2.088675e-05, |
|
"loss": 5.9827, |
|
"step": 116500 |
|
}, |
|
{ |
|
"epoch": 2.13846, |
|
"grad_norm": 2.3729827404022217, |
|
"learning_rate": 2.0762000000000002e-05, |
|
"loss": 5.972, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 2.13846, |
|
"eval_accuracy": 0.02696409852447296, |
|
"eval_loss": 5.927760601043701, |
|
"eval_runtime": 122.2666, |
|
"eval_samples_per_second": 29.387, |
|
"eval_steps_per_second": 1.84, |
|
"step": 117000 |
|
}, |
|
{ |
|
"epoch": 2.14096, |
|
"grad_norm": 2.6471846103668213, |
|
"learning_rate": 2.0637e-05, |
|
"loss": 5.9663, |
|
"step": 117500 |
|
}, |
|
{ |
|
"epoch": 2.14346, |
|
"grad_norm": 4.273701190948486, |
|
"learning_rate": 2.0512e-05, |
|
"loss": 5.9715, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 2.14346, |
|
"eval_accuracy": 0.026757439027625573, |
|
"eval_loss": 5.910974502563477, |
|
"eval_runtime": 99.6535, |
|
"eval_samples_per_second": 36.055, |
|
"eval_steps_per_second": 2.258, |
|
"step": 118000 |
|
}, |
|
{ |
|
"epoch": 2.14596, |
|
"grad_norm": 3.1654696464538574, |
|
"learning_rate": 2.0387e-05, |
|
"loss": 5.9718, |
|
"step": 118500 |
|
}, |
|
{ |
|
"epoch": 2.14846, |
|
"grad_norm": 2.6283257007598877, |
|
"learning_rate": 2.026225e-05, |
|
"loss": 5.9727, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 2.14846, |
|
"eval_accuracy": 0.027529385034940963, |
|
"eval_loss": 5.913906574249268, |
|
"eval_runtime": 91.3308, |
|
"eval_samples_per_second": 39.341, |
|
"eval_steps_per_second": 2.464, |
|
"step": 119000 |
|
}, |
|
{ |
|
"epoch": 2.15096, |
|
"grad_norm": 2.8760488033294678, |
|
"learning_rate": 2.013725e-05, |
|
"loss": 5.9587, |
|
"step": 119500 |
|
}, |
|
{ |
|
"epoch": 2.15346, |
|
"grad_norm": 2.699265956878662, |
|
"learning_rate": 2.001225e-05, |
|
"loss": 5.9427, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.15346, |
|
"eval_accuracy": 0.02733360951422774, |
|
"eval_loss": 5.927834987640381, |
|
"eval_runtime": 91.7396, |
|
"eval_samples_per_second": 39.165, |
|
"eval_steps_per_second": 2.453, |
|
"step": 120000 |
|
}, |
|
{ |
|
"epoch": 2.15596, |
|
"grad_norm": 2.398200511932373, |
|
"learning_rate": 1.9887250000000002e-05, |
|
"loss": 5.9724, |
|
"step": 120500 |
|
}, |
|
{ |
|
"epoch": 2.15846, |
|
"grad_norm": 2.2844276428222656, |
|
"learning_rate": 1.9762250000000002e-05, |
|
"loss": 5.9514, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.15846, |
|
"eval_accuracy": 0.02694165032369625, |
|
"eval_loss": 5.9226508140563965, |
|
"eval_runtime": 92.7015, |
|
"eval_samples_per_second": 38.759, |
|
"eval_steps_per_second": 2.427, |
|
"step": 121000 |
|
}, |
|
{ |
|
"epoch": 2.16096, |
|
"grad_norm": 3.6655936241149902, |
|
"learning_rate": 1.96375e-05, |
|
"loss": 5.9701, |
|
"step": 121500 |
|
}, |
|
{ |
|
"epoch": 2.16346, |
|
"grad_norm": 2.5577406883239746, |
|
"learning_rate": 1.95125e-05, |
|
"loss": 5.9217, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.16346, |
|
"eval_accuracy": 0.02731524280450134, |
|
"eval_loss": 5.9304680824279785, |
|
"eval_runtime": 91.7545, |
|
"eval_samples_per_second": 39.159, |
|
"eval_steps_per_second": 2.452, |
|
"step": 122000 |
|
}, |
|
{ |
|
"epoch": 2.16596, |
|
"grad_norm": 3.617988109588623, |
|
"learning_rate": 1.93875e-05, |
|
"loss": 5.9587, |
|
"step": 122500 |
|
}, |
|
{ |
|
"epoch": 2.16846, |
|
"grad_norm": 2.8228862285614014, |
|
"learning_rate": 1.92625e-05, |
|
"loss": 5.9862, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.16846, |
|
"eval_accuracy": 0.02671240657637048, |
|
"eval_loss": 5.909170150756836, |
|
"eval_runtime": 92.8579, |
|
"eval_samples_per_second": 38.694, |
|
"eval_steps_per_second": 2.423, |
|
"step": 123000 |
|
}, |
|
{ |
|
"epoch": 2.17096, |
|
"grad_norm": 5.492008209228516, |
|
"learning_rate": 1.91375e-05, |
|
"loss": 5.9472, |
|
"step": 123500 |
|
}, |
|
{ |
|
"epoch": 2.17346, |
|
"grad_norm": 2.1293392181396484, |
|
"learning_rate": 1.9012750000000002e-05, |
|
"loss": 5.9388, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.17346, |
|
"eval_accuracy": 0.0270023284906442, |
|
"eval_loss": 5.889882564544678, |
|
"eval_runtime": 122.3828, |
|
"eval_samples_per_second": 29.359, |
|
"eval_steps_per_second": 1.838, |
|
"step": 124000 |
|
}, |
|
{ |
|
"epoch": 2.17596, |
|
"grad_norm": 2.5894041061401367, |
|
"learning_rate": 1.8887750000000002e-05, |
|
"loss": 5.9259, |
|
"step": 124500 |
|
}, |
|
{ |
|
"epoch": 2.17846, |
|
"grad_norm": 3.7750794887542725, |
|
"learning_rate": 1.8762750000000003e-05, |
|
"loss": 5.9429, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.17846, |
|
"eval_accuracy": 0.02674982024433166, |
|
"eval_loss": 5.895012855529785, |
|
"eval_runtime": 95.7852, |
|
"eval_samples_per_second": 37.511, |
|
"eval_steps_per_second": 2.349, |
|
"step": 125000 |
|
}, |
|
{ |
|
"epoch": 2.18096, |
|
"grad_norm": 2.4733965396881104, |
|
"learning_rate": 1.8638e-05, |
|
"loss": 5.9506, |
|
"step": 125500 |
|
}, |
|
{ |
|
"epoch": 2.18346, |
|
"grad_norm": 2.5394270420074463, |
|
"learning_rate": 1.8513e-05, |
|
"loss": 5.9317, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.18346, |
|
"eval_accuracy": 0.026840429345648554, |
|
"eval_loss": 5.910998344421387, |
|
"eval_runtime": 95.7395, |
|
"eval_samples_per_second": 37.529, |
|
"eval_steps_per_second": 2.35, |
|
"step": 126000 |
|
}, |
|
{ |
|
"epoch": 2.18596, |
|
"grad_norm": 2.7636115550994873, |
|
"learning_rate": 1.8388e-05, |
|
"loss": 5.9325, |
|
"step": 126500 |
|
}, |
|
{ |
|
"epoch": 2.18846, |
|
"grad_norm": 3.0258498191833496, |
|
"learning_rate": 1.8263e-05, |
|
"loss": 5.9367, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.18846, |
|
"eval_accuracy": 0.026817436946065136, |
|
"eval_loss": 5.86806583404541, |
|
"eval_runtime": 93.0877, |
|
"eval_samples_per_second": 38.598, |
|
"eval_steps_per_second": 2.417, |
|
"step": 127000 |
|
}, |
|
{ |
|
"epoch": 2.19096, |
|
"grad_norm": 2.4709484577178955, |
|
"learning_rate": 1.8138e-05, |
|
"loss": 5.9368, |
|
"step": 127500 |
|
}, |
|
{ |
|
"epoch": 2.19346, |
|
"grad_norm": 4.170524597167969, |
|
"learning_rate": 1.8013000000000002e-05, |
|
"loss": 5.9273, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.19346, |
|
"eval_accuracy": 0.027440544579745874, |
|
"eval_loss": 5.880221366882324, |
|
"eval_runtime": 92.6132, |
|
"eval_samples_per_second": 38.796, |
|
"eval_steps_per_second": 2.429, |
|
"step": 128000 |
|
}, |
|
{ |
|
"epoch": 2.19596, |
|
"grad_norm": 3.9754855632781982, |
|
"learning_rate": 1.7888000000000002e-05, |
|
"loss": 5.9188, |
|
"step": 128500 |
|
}, |
|
{ |
|
"epoch": 2.19846, |
|
"grad_norm": 2.676684856414795, |
|
"learning_rate": 1.7763000000000003e-05, |
|
"loss": 5.934, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.19846, |
|
"eval_accuracy": 0.026782064023629114, |
|
"eval_loss": 5.897346496582031, |
|
"eval_runtime": 92.8542, |
|
"eval_samples_per_second": 38.695, |
|
"eval_steps_per_second": 2.423, |
|
"step": 129000 |
|
}, |
|
{ |
|
"epoch": 2.20096, |
|
"grad_norm": 3.0156376361846924, |
|
"learning_rate": 1.763825e-05, |
|
"loss": 5.9191, |
|
"step": 129500 |
|
}, |
|
{ |
|
"epoch": 2.20346, |
|
"grad_norm": 2.913367509841919, |
|
"learning_rate": 1.751325e-05, |
|
"loss": 5.9229, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.20346, |
|
"eval_accuracy": 0.027015253212303518, |
|
"eval_loss": 5.891612529754639, |
|
"eval_runtime": 91.7301, |
|
"eval_samples_per_second": 39.169, |
|
"eval_steps_per_second": 2.453, |
|
"step": 130000 |
|
}, |
|
{ |
|
"epoch": 2.20596, |
|
"grad_norm": 3.0923585891723633, |
|
"learning_rate": 1.738825e-05, |
|
"loss": 5.9225, |
|
"step": 130500 |
|
}, |
|
{ |
|
"epoch": 2.20846, |
|
"grad_norm": 2.9327263832092285, |
|
"learning_rate": 1.726325e-05, |
|
"loss": 5.942, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.20846, |
|
"eval_accuracy": 0.026604791262343958, |
|
"eval_loss": 5.896469593048096, |
|
"eval_runtime": 97.1482, |
|
"eval_samples_per_second": 36.985, |
|
"eval_steps_per_second": 2.316, |
|
"step": 131000 |
|
}, |
|
{ |
|
"epoch": 2.21096, |
|
"grad_norm": 2.434720993041992, |
|
"learning_rate": 1.7138500000000002e-05, |
|
"loss": 5.9244, |
|
"step": 131500 |
|
}, |
|
{ |
|
"epoch": 2.21346, |
|
"grad_norm": 3.1938490867614746, |
|
"learning_rate": 1.7013500000000002e-05, |
|
"loss": 5.9224, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.21346, |
|
"eval_accuracy": 0.02676478571151613, |
|
"eval_loss": 5.8799638748168945, |
|
"eval_runtime": 101.684, |
|
"eval_samples_per_second": 35.335, |
|
"eval_steps_per_second": 2.213, |
|
"step": 132000 |
|
}, |
|
{ |
|
"epoch": 2.21596, |
|
"grad_norm": 2.5575222969055176, |
|
"learning_rate": 1.6888500000000003e-05, |
|
"loss": 5.933, |
|
"step": 132500 |
|
}, |
|
{ |
|
"epoch": 2.21846, |
|
"grad_norm": 2.610675573348999, |
|
"learning_rate": 1.6763500000000003e-05, |
|
"loss": 5.936, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.21846, |
|
"eval_accuracy": 0.026946139963851593, |
|
"eval_loss": 5.869258880615234, |
|
"eval_runtime": 94.0477, |
|
"eval_samples_per_second": 38.204, |
|
"eval_steps_per_second": 2.392, |
|
"step": 133000 |
|
}, |
|
{ |
|
"epoch": 2.22096, |
|
"grad_norm": 2.5007147789001465, |
|
"learning_rate": 1.663875e-05, |
|
"loss": 5.9006, |
|
"step": 133500 |
|
}, |
|
{ |
|
"epoch": 3.00019, |
|
"grad_norm": 2.4350554943084717, |
|
"learning_rate": 1.651375e-05, |
|
"loss": 5.9129, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 3.00019, |
|
"eval_accuracy": 0.02651254956460694, |
|
"eval_loss": 5.850106239318848, |
|
"eval_runtime": 90.9885, |
|
"eval_samples_per_second": 39.489, |
|
"eval_steps_per_second": 2.473, |
|
"step": 134000 |
|
}, |
|
{ |
|
"epoch": 3.00269, |
|
"grad_norm": 2.9564504623413086, |
|
"learning_rate": 1.638875e-05, |
|
"loss": 5.8888, |
|
"step": 134500 |
|
}, |
|
{ |
|
"epoch": 3.00519, |
|
"grad_norm": 3.2191567420959473, |
|
"learning_rate": 1.6263749999999998e-05, |
|
"loss": 5.8787, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 3.00519, |
|
"eval_accuracy": 0.0267069645883034, |
|
"eval_loss": 5.870194911956787, |
|
"eval_runtime": 93.5455, |
|
"eval_samples_per_second": 38.409, |
|
"eval_steps_per_second": 2.405, |
|
"step": 135000 |
|
}, |
|
{ |
|
"epoch": 3.00769, |
|
"grad_norm": 3.1718389987945557, |
|
"learning_rate": 1.613875e-05, |
|
"loss": 5.8943, |
|
"step": 135500 |
|
}, |
|
{ |
|
"epoch": 3.01019, |
|
"grad_norm": 3.5635807514190674, |
|
"learning_rate": 1.601425e-05, |
|
"loss": 5.9171, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 3.01019, |
|
"eval_accuracy": 0.0268892711885506, |
|
"eval_loss": 5.844875812530518, |
|
"eval_runtime": 92.3095, |
|
"eval_samples_per_second": 38.923, |
|
"eval_steps_per_second": 2.437, |
|
"step": 136000 |
|
}, |
|
{ |
|
"epoch": 3.01269, |
|
"grad_norm": 4.3343305587768555, |
|
"learning_rate": 1.588925e-05, |
|
"loss": 5.9144, |
|
"step": 136500 |
|
}, |
|
{ |
|
"epoch": 3.01519, |
|
"grad_norm": 2.6254384517669678, |
|
"learning_rate": 1.576425e-05, |
|
"loss": 5.8931, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 3.01519, |
|
"eval_accuracy": 0.026976070898220537, |
|
"eval_loss": 5.845653057098389, |
|
"eval_runtime": 92.0247, |
|
"eval_samples_per_second": 39.044, |
|
"eval_steps_per_second": 2.445, |
|
"step": 137000 |
|
}, |
|
{ |
|
"epoch": 3.01769, |
|
"grad_norm": 3.994947910308838, |
|
"learning_rate": 1.563925e-05, |
|
"loss": 5.9248, |
|
"step": 137500 |
|
}, |
|
{ |
|
"epoch": 3.02019, |
|
"grad_norm": 3.4500911235809326, |
|
"learning_rate": 1.5514249999999998e-05, |
|
"loss": 5.8612, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 3.02019, |
|
"eval_accuracy": 0.026333100008094957, |
|
"eval_loss": 5.863004684448242, |
|
"eval_runtime": 97.9674, |
|
"eval_samples_per_second": 36.675, |
|
"eval_steps_per_second": 2.297, |
|
"step": 138000 |
|
}, |
|
{ |
|
"epoch": 3.02269, |
|
"grad_norm": 2.8210179805755615, |
|
"learning_rate": 1.538925e-05, |
|
"loss": 5.8873, |
|
"step": 138500 |
|
}, |
|
{ |
|
"epoch": 3.02519, |
|
"grad_norm": 3.462085723876953, |
|
"learning_rate": 1.526425e-05, |
|
"loss": 5.8897, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 3.02519, |
|
"eval_accuracy": 0.02670546804158495, |
|
"eval_loss": 5.849697589874268, |
|
"eval_runtime": 91.473, |
|
"eval_samples_per_second": 39.279, |
|
"eval_steps_per_second": 2.46, |
|
"step": 139000 |
|
}, |
|
{ |
|
"epoch": 3.02769, |
|
"grad_norm": 3.4429280757904053, |
|
"learning_rate": 1.5139250000000002e-05, |
|
"loss": 5.8915, |
|
"step": 139500 |
|
}, |
|
{ |
|
"epoch": 3.03019, |
|
"grad_norm": 2.4032459259033203, |
|
"learning_rate": 1.50145e-05, |
|
"loss": 5.8772, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 3.03019, |
|
"eval_accuracy": 0.026319903187032288, |
|
"eval_loss": 5.817691802978516, |
|
"eval_runtime": 116.7881, |
|
"eval_samples_per_second": 30.765, |
|
"eval_steps_per_second": 1.927, |
|
"step": 140000 |
|
}, |
|
{ |
|
"epoch": 3.03269, |
|
"grad_norm": 2.4531261920928955, |
|
"learning_rate": 1.48895e-05, |
|
"loss": 5.8735, |
|
"step": 140500 |
|
}, |
|
{ |
|
"epoch": 3.03519, |
|
"grad_norm": 2.7715256214141846, |
|
"learning_rate": 1.47645e-05, |
|
"loss": 5.8774, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 3.03519, |
|
"eval_accuracy": 0.026607376206675824, |
|
"eval_loss": 5.821176052093506, |
|
"eval_runtime": 92.8442, |
|
"eval_samples_per_second": 38.699, |
|
"eval_steps_per_second": 2.423, |
|
"step": 141000 |
|
}, |
|
{ |
|
"epoch": 3.03769, |
|
"grad_norm": 3.399043560028076, |
|
"learning_rate": 1.4639750000000002e-05, |
|
"loss": 5.8561, |
|
"step": 141500 |
|
}, |
|
{ |
|
"epoch": 3.04019, |
|
"grad_norm": 3.1285316944122314, |
|
"learning_rate": 1.4514750000000002e-05, |
|
"loss": 5.8694, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 3.04019, |
|
"eval_accuracy": 0.026714855471000665, |
|
"eval_loss": 5.837355613708496, |
|
"eval_runtime": 98.834, |
|
"eval_samples_per_second": 36.354, |
|
"eval_steps_per_second": 2.277, |
|
"step": 142000 |
|
}, |
|
{ |
|
"epoch": 3.04269, |
|
"grad_norm": 3.5642755031585693, |
|
"learning_rate": 1.4389750000000002e-05, |
|
"loss": 5.8642, |
|
"step": 142500 |
|
}, |
|
{ |
|
"epoch": 3.04519, |
|
"grad_norm": 2.953099489212036, |
|
"learning_rate": 1.4264750000000001e-05, |
|
"loss": 5.8561, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 3.04519, |
|
"eval_accuracy": 0.02667281611318247, |
|
"eval_loss": 5.792807102203369, |
|
"eval_runtime": 93.9281, |
|
"eval_samples_per_second": 38.253, |
|
"eval_steps_per_second": 2.395, |
|
"step": 143000 |
|
}, |
|
{ |
|
"epoch": 3.04769, |
|
"grad_norm": 2.8684558868408203, |
|
"learning_rate": 1.4139750000000001e-05, |
|
"loss": 5.8442, |
|
"step": 143500 |
|
}, |
|
{ |
|
"epoch": 3.05019, |
|
"grad_norm": 2.2189698219299316, |
|
"learning_rate": 1.4014750000000002e-05, |
|
"loss": 5.8658, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 3.05019, |
|
"eval_accuracy": 0.02694872490818346, |
|
"eval_loss": 5.79358434677124, |
|
"eval_runtime": 93.4176, |
|
"eval_samples_per_second": 38.462, |
|
"eval_steps_per_second": 2.409, |
|
"step": 144000 |
|
}, |
|
{ |
|
"epoch": 3.05269, |
|
"grad_norm": 2.659515619277954, |
|
"learning_rate": 1.3889750000000002e-05, |
|
"loss": 5.8347, |
|
"step": 144500 |
|
}, |
|
{ |
|
"epoch": 3.05519, |
|
"grad_norm": 2.62570858001709, |
|
"learning_rate": 1.376475e-05, |
|
"loss": 5.8295, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 3.05519, |
|
"eval_accuracy": 0.026485203574569863, |
|
"eval_loss": 5.795611381530762, |
|
"eval_runtime": 94.2389, |
|
"eval_samples_per_second": 38.127, |
|
"eval_steps_per_second": 2.388, |
|
"step": 145000 |
|
}, |
|
{ |
|
"epoch": 3.05769, |
|
"grad_norm": 3.243245840072632, |
|
"learning_rate": 1.364e-05, |
|
"loss": 5.8247, |
|
"step": 145500 |
|
}, |
|
{ |
|
"epoch": 3.06019, |
|
"grad_norm": 3.164580821990967, |
|
"learning_rate": 1.3515e-05, |
|
"loss": 5.8444, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 3.06019, |
|
"eval_accuracy": 0.026442620017944955, |
|
"eval_loss": 5.792357444763184, |
|
"eval_runtime": 91.4103, |
|
"eval_samples_per_second": 39.306, |
|
"eval_steps_per_second": 2.461, |
|
"step": 146000 |
|
}, |
|
{ |
|
"epoch": 3.06269, |
|
"grad_norm": 2.9992387294769287, |
|
"learning_rate": 1.339e-05, |
|
"loss": 5.8187, |
|
"step": 146500 |
|
}, |
|
{ |
|
"epoch": 3.06519, |
|
"grad_norm": 2.9133353233337402, |
|
"learning_rate": 1.3265e-05, |
|
"loss": 5.8318, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 3.06519, |
|
"eval_accuracy": 0.026461258827074705, |
|
"eval_loss": 5.765110015869141, |
|
"eval_runtime": 93.7438, |
|
"eval_samples_per_second": 38.328, |
|
"eval_steps_per_second": 2.4, |
|
"step": 147000 |
|
}, |
|
{ |
|
"epoch": 3.06769, |
|
"grad_norm": 3.1825642585754395, |
|
"learning_rate": 1.314e-05, |
|
"loss": 5.8185, |
|
"step": 147500 |
|
}, |
|
{ |
|
"epoch": 3.07019, |
|
"grad_norm": 3.5604467391967773, |
|
"learning_rate": 1.3015e-05, |
|
"loss": 5.8323, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 3.07019, |
|
"eval_accuracy": 0.026824783629955697, |
|
"eval_loss": 5.770131587982178, |
|
"eval_runtime": 92.029, |
|
"eval_samples_per_second": 39.042, |
|
"eval_steps_per_second": 2.445, |
|
"step": 148000 |
|
}, |
|
{ |
|
"epoch": 3.07269, |
|
"grad_norm": 3.883007049560547, |
|
"learning_rate": 1.2889999999999999e-05, |
|
"loss": 5.8111, |
|
"step": 148500 |
|
}, |
|
{ |
|
"epoch": 3.07519, |
|
"grad_norm": 8.257863998413086, |
|
"learning_rate": 1.2765250000000001e-05, |
|
"loss": 5.8239, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 3.07519, |
|
"eval_accuracy": 0.026383438397715453, |
|
"eval_loss": 5.779338359832764, |
|
"eval_runtime": 92.1302, |
|
"eval_samples_per_second": 38.999, |
|
"eval_steps_per_second": 2.442, |
|
"step": 149000 |
|
}, |
|
{ |
|
"epoch": 3.07769, |
|
"grad_norm": 2.9781956672668457, |
|
"learning_rate": 1.2640250000000002e-05, |
|
"loss": 5.8073, |
|
"step": 149500 |
|
}, |
|
{ |
|
"epoch": 3.08019, |
|
"grad_norm": 3.369654893875122, |
|
"learning_rate": 1.251525e-05, |
|
"loss": 5.8057, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 3.08019, |
|
"eval_accuracy": 0.027368438237857055, |
|
"eval_loss": 5.7676262855529785, |
|
"eval_runtime": 94.7379, |
|
"eval_samples_per_second": 37.926, |
|
"eval_steps_per_second": 2.375, |
|
"step": 150000 |
|
}, |
|
{ |
|
"epoch": 3.08269, |
|
"grad_norm": 2.6309428215026855, |
|
"learning_rate": 1.239025e-05, |
|
"loss": 5.798, |
|
"step": 150500 |
|
}, |
|
{ |
|
"epoch": 3.08519, |
|
"grad_norm": 4.712452411651611, |
|
"learning_rate": 1.2265250000000001e-05, |
|
"loss": 5.7818, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 3.08519, |
|
"eval_accuracy": 0.026996750452875445, |
|
"eval_loss": 5.756935119628906, |
|
"eval_runtime": 93.4756, |
|
"eval_samples_per_second": 38.438, |
|
"eval_steps_per_second": 2.407, |
|
"step": 151000 |
|
}, |
|
{ |
|
"epoch": 3.08769, |
|
"grad_norm": 3.6165318489074707, |
|
"learning_rate": 1.21405e-05, |
|
"loss": 5.7959, |
|
"step": 151500 |
|
}, |
|
{ |
|
"epoch": 3.09019, |
|
"grad_norm": 3.728891134262085, |
|
"learning_rate": 1.20155e-05, |
|
"loss": 5.773, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 3.09019, |
|
"eval_accuracy": 0.026722338204592903, |
|
"eval_loss": 5.740837097167969, |
|
"eval_runtime": 94.7277, |
|
"eval_samples_per_second": 37.93, |
|
"eval_steps_per_second": 2.375, |
|
"step": 152000 |
|
}, |
|
{ |
|
"epoch": 3.09269, |
|
"grad_norm": 3.545217990875244, |
|
"learning_rate": 1.18905e-05, |
|
"loss": 5.7891, |
|
"step": 152500 |
|
}, |
|
{ |
|
"epoch": 3.09519, |
|
"grad_norm": 3.074216842651367, |
|
"learning_rate": 1.17655e-05, |
|
"loss": 5.7491, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 3.09519, |
|
"eval_accuracy": 0.027355241416794383, |
|
"eval_loss": 5.720602989196777, |
|
"eval_runtime": 97.4705, |
|
"eval_samples_per_second": 36.862, |
|
"eval_steps_per_second": 2.308, |
|
"step": 153000 |
|
}, |
|
{ |
|
"epoch": 3.09769, |
|
"grad_norm": 4.110429763793945, |
|
"learning_rate": 1.16405e-05, |
|
"loss": 5.7724, |
|
"step": 153500 |
|
}, |
|
{ |
|
"epoch": 3.10019, |
|
"grad_norm": 3.007507801055908, |
|
"learning_rate": 1.151575e-05, |
|
"loss": 5.7655, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 3.10019, |
|
"eval_accuracy": 0.026848184178644142, |
|
"eval_loss": 5.7095208168029785, |
|
"eval_runtime": 98.9114, |
|
"eval_samples_per_second": 36.325, |
|
"eval_steps_per_second": 2.275, |
|
"step": 154000 |
|
}, |
|
{ |
|
"epoch": 3.10269, |
|
"grad_norm": 3.1194117069244385, |
|
"learning_rate": 1.139075e-05, |
|
"loss": 5.7622, |
|
"step": 154500 |
|
}, |
|
{ |
|
"epoch": 3.10519, |
|
"grad_norm": 4.109334945678711, |
|
"learning_rate": 1.126575e-05, |
|
"loss": 5.7706, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 3.10519, |
|
"eval_accuracy": 0.02715552045473252, |
|
"eval_loss": 5.707860469818115, |
|
"eval_runtime": 126.3537, |
|
"eval_samples_per_second": 28.436, |
|
"eval_steps_per_second": 1.781, |
|
"step": 155000 |
|
}, |
|
{ |
|
"epoch": 3.10769, |
|
"grad_norm": 3.767932415008545, |
|
"learning_rate": 1.1140750000000002e-05, |
|
"loss": 5.7639, |
|
"step": 155500 |
|
}, |
|
{ |
|
"epoch": 3.11019, |
|
"grad_norm": 4.084874629974365, |
|
"learning_rate": 1.1016e-05, |
|
"loss": 5.7379, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 3.11019, |
|
"eval_accuracy": 0.027283815323413948, |
|
"eval_loss": 5.691872596740723, |
|
"eval_runtime": 93.2144, |
|
"eval_samples_per_second": 38.546, |
|
"eval_steps_per_second": 2.414, |
|
"step": 156000 |
|
}, |
|
{ |
|
"epoch": 3.11269, |
|
"grad_norm": 6.118876934051514, |
|
"learning_rate": 1.0891000000000001e-05, |
|
"loss": 5.7411, |
|
"step": 156500 |
|
}, |
|
{ |
|
"epoch": 3.11519, |
|
"grad_norm": 4.562450885772705, |
|
"learning_rate": 1.0766000000000002e-05, |
|
"loss": 5.7374, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 3.11519, |
|
"eval_accuracy": 0.027440680629447548, |
|
"eval_loss": 5.667840957641602, |
|
"eval_runtime": 91.1779, |
|
"eval_samples_per_second": 39.406, |
|
"eval_steps_per_second": 2.468, |
|
"step": 157000 |
|
}, |
|
{ |
|
"epoch": 3.11769, |
|
"grad_norm": 3.47501277923584, |
|
"learning_rate": 1.0641e-05, |
|
"loss": 5.7177, |
|
"step": 157500 |
|
}, |
|
{ |
|
"epoch": 3.12019, |
|
"grad_norm": 3.262887477874756, |
|
"learning_rate": 1.0516e-05, |
|
"loss": 5.7077, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 3.12019, |
|
"eval_accuracy": 0.026955663442968986, |
|
"eval_loss": 5.6481852531433105, |
|
"eval_runtime": 93.6492, |
|
"eval_samples_per_second": 38.367, |
|
"eval_steps_per_second": 2.403, |
|
"step": 158000 |
|
}, |
|
{ |
|
"epoch": 3.12269, |
|
"grad_norm": 3.8364920616149902, |
|
"learning_rate": 1.0391250000000002e-05, |
|
"loss": 5.7026, |
|
"step": 158500 |
|
}, |
|
{ |
|
"epoch": 3.12519, |
|
"grad_norm": 3.216892719268799, |
|
"learning_rate": 1.026625e-05, |
|
"loss": 5.7176, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 3.12519, |
|
"eval_accuracy": 0.027355649565899413, |
|
"eval_loss": 5.614231109619141, |
|
"eval_runtime": 128.5228, |
|
"eval_samples_per_second": 27.956, |
|
"eval_steps_per_second": 1.751, |
|
"step": 159000 |
|
}, |
|
{ |
|
"epoch": 3.12769, |
|
"grad_norm": 3.7914490699768066, |
|
"learning_rate": 1.014125e-05, |
|
"loss": 5.7055, |
|
"step": 159500 |
|
}, |
|
{ |
|
"epoch": 3.13019, |
|
"grad_norm": 3.515538215637207, |
|
"learning_rate": 1.0016250000000001e-05, |
|
"loss": 5.7077, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 3.13019, |
|
"eval_accuracy": 0.027464081178135997, |
|
"eval_loss": 5.629947185516357, |
|
"eval_runtime": 97.5707, |
|
"eval_samples_per_second": 36.825, |
|
"eval_steps_per_second": 2.306, |
|
"step": 160000 |
|
}, |
|
{ |
|
"epoch": 3.13269, |
|
"grad_norm": 4.886687278747559, |
|
"learning_rate": 9.8915e-06, |
|
"loss": 5.6998, |
|
"step": 160500 |
|
}, |
|
{ |
|
"epoch": 3.13519, |
|
"grad_norm": 3.584073305130005, |
|
"learning_rate": 9.7665e-06, |
|
"loss": 5.6882, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 3.13519, |
|
"eval_accuracy": 0.027459183388875624, |
|
"eval_loss": 5.591392993927002, |
|
"eval_runtime": 93.7876, |
|
"eval_samples_per_second": 38.31, |
|
"eval_steps_per_second": 2.399, |
|
"step": 161000 |
|
}, |
|
{ |
|
"epoch": 3.13769, |
|
"grad_norm": 2.614473581314087, |
|
"learning_rate": 9.6415e-06, |
|
"loss": 5.6896, |
|
"step": 161500 |
|
}, |
|
{ |
|
"epoch": 3.14019, |
|
"grad_norm": 3.612001419067383, |
|
"learning_rate": 9.516500000000001e-06, |
|
"loss": 5.6513, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 3.14019, |
|
"eval_accuracy": 0.02719810401135743, |
|
"eval_loss": 5.585655689239502, |
|
"eval_runtime": 127.1026, |
|
"eval_samples_per_second": 28.268, |
|
"eval_steps_per_second": 1.77, |
|
"step": 162000 |
|
}, |
|
{ |
|
"epoch": 3.14269, |
|
"grad_norm": 3.9107935428619385, |
|
"learning_rate": 9.39175e-06, |
|
"loss": 5.657, |
|
"step": 162500 |
|
}, |
|
{ |
|
"epoch": 3.14519, |
|
"grad_norm": 3.753208637237549, |
|
"learning_rate": 9.26675e-06, |
|
"loss": 5.6516, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 3.14519, |
|
"eval_accuracy": 0.027440544579745874, |
|
"eval_loss": 5.5584001541137695, |
|
"eval_runtime": 97.5764, |
|
"eval_samples_per_second": 36.822, |
|
"eval_steps_per_second": 2.306, |
|
"step": 163000 |
|
}, |
|
{ |
|
"epoch": 3.14769, |
|
"grad_norm": 3.1370689868927, |
|
"learning_rate": 9.141750000000001e-06, |
|
"loss": 5.6678, |
|
"step": 163500 |
|
}, |
|
{ |
|
"epoch": 3.15019, |
|
"grad_norm": 3.1537926197052, |
|
"learning_rate": 9.01675e-06, |
|
"loss": 5.6158, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 3.15019, |
|
"eval_accuracy": 0.028059434672674622, |
|
"eval_loss": 5.522320747375488, |
|
"eval_runtime": 92.5826, |
|
"eval_samples_per_second": 38.809, |
|
"eval_steps_per_second": 2.43, |
|
"step": 164000 |
|
}, |
|
{ |
|
"epoch": 3.1526899999999998, |
|
"grad_norm": 3.2936768531799316, |
|
"learning_rate": 8.89175e-06, |
|
"loss": 5.6305, |
|
"step": 164500 |
|
}, |
|
{ |
|
"epoch": 3.15519, |
|
"grad_norm": 5.041363716125488, |
|
"learning_rate": 8.767000000000001e-06, |
|
"loss": 5.6235, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 3.15519, |
|
"eval_accuracy": 0.027728289698792763, |
|
"eval_loss": 5.527610778808594, |
|
"eval_runtime": 92.1615, |
|
"eval_samples_per_second": 38.986, |
|
"eval_steps_per_second": 2.441, |
|
"step": 165000 |
|
}, |
|
{ |
|
"epoch": 3.15769, |
|
"grad_norm": 4.752361297607422, |
|
"learning_rate": 8.642e-06, |
|
"loss": 5.5997, |
|
"step": 165500 |
|
}, |
|
{ |
|
"epoch": 3.16019, |
|
"grad_norm": 4.261384963989258, |
|
"learning_rate": 8.517e-06, |
|
"loss": 5.6308, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 3.16019, |
|
"eval_accuracy": 0.02816977098073468, |
|
"eval_loss": 5.499186038970947, |
|
"eval_runtime": 92.5045, |
|
"eval_samples_per_second": 38.841, |
|
"eval_steps_per_second": 2.432, |
|
"step": 166000 |
|
}, |
|
{ |
|
"epoch": 3.16269, |
|
"grad_norm": 4.297583103179932, |
|
"learning_rate": 8.392e-06, |
|
"loss": 5.5752, |
|
"step": 166500 |
|
}, |
|
{ |
|
"epoch": 3.16519, |
|
"grad_norm": 7.3237624168396, |
|
"learning_rate": 8.26725e-06, |
|
"loss": 5.5782, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 3.16519, |
|
"eval_accuracy": 0.027606525215791832, |
|
"eval_loss": 5.48903226852417, |
|
"eval_runtime": 92.0464, |
|
"eval_samples_per_second": 39.035, |
|
"eval_steps_per_second": 2.444, |
|
"step": 167000 |
|
}, |
|
{ |
|
"epoch": 3.16769, |
|
"grad_norm": 3.426760196685791, |
|
"learning_rate": 8.14225e-06, |
|
"loss": 5.6192, |
|
"step": 167500 |
|
}, |
|
{ |
|
"epoch": 3.17019, |
|
"grad_norm": 3.8471386432647705, |
|
"learning_rate": 8.01725e-06, |
|
"loss": 5.5723, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 3.17019, |
|
"eval_accuracy": 0.027942976128039095, |
|
"eval_loss": 5.443605422973633, |
|
"eval_runtime": 90.8581, |
|
"eval_samples_per_second": 39.545, |
|
"eval_steps_per_second": 2.476, |
|
"step": 168000 |
|
}, |
|
{ |
|
"epoch": 3.1726900000000002, |
|
"grad_norm": 3.767125368118286, |
|
"learning_rate": 7.89225e-06, |
|
"loss": 5.5569, |
|
"step": 168500 |
|
}, |
|
{ |
|
"epoch": 3.17519, |
|
"grad_norm": 4.584527015686035, |
|
"learning_rate": 7.767250000000001e-06, |
|
"loss": 5.5417, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 3.17519, |
|
"eval_accuracy": 0.028352213630683562, |
|
"eval_loss": 5.416599750518799, |
|
"eval_runtime": 91.2582, |
|
"eval_samples_per_second": 39.372, |
|
"eval_steps_per_second": 2.466, |
|
"step": 169000 |
|
}, |
|
{ |
|
"epoch": 3.17769, |
|
"grad_norm": 5.043390274047852, |
|
"learning_rate": 7.642500000000002e-06, |
|
"loss": 5.535, |
|
"step": 169500 |
|
}, |
|
{ |
|
"epoch": 3.18019, |
|
"grad_norm": 7.203840732574463, |
|
"learning_rate": 7.517500000000001e-06, |
|
"loss": 5.5346, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 3.18019, |
|
"eval_accuracy": 0.028451802012311136, |
|
"eval_loss": 5.403586387634277, |
|
"eval_runtime": 96.555, |
|
"eval_samples_per_second": 37.212, |
|
"eval_steps_per_second": 2.33, |
|
"step": 170000 |
|
}, |
|
{ |
|
"epoch": 3.18269, |
|
"grad_norm": 5.778980731964111, |
|
"learning_rate": 7.392500000000001e-06, |
|
"loss": 5.521, |
|
"step": 170500 |
|
}, |
|
{ |
|
"epoch": 3.18519, |
|
"grad_norm": 3.5962352752685547, |
|
"learning_rate": 7.2675e-06, |
|
"loss": 5.5068, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 3.18519, |
|
"eval_accuracy": 0.028494929767742753, |
|
"eval_loss": 5.36637544631958, |
|
"eval_runtime": 92.7171, |
|
"eval_samples_per_second": 38.752, |
|
"eval_steps_per_second": 2.427, |
|
"step": 171000 |
|
}, |
|
{ |
|
"epoch": 3.18769, |
|
"grad_norm": 3.231135606765747, |
|
"learning_rate": 7.142500000000001e-06, |
|
"loss": 5.5068, |
|
"step": 171500 |
|
}, |
|
{ |
|
"epoch": 3.19019, |
|
"grad_norm": 4.50712776184082, |
|
"learning_rate": 7.017750000000001e-06, |
|
"loss": 5.5024, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 3.19019, |
|
"eval_accuracy": 0.0285840423223412, |
|
"eval_loss": 5.33723783493042, |
|
"eval_runtime": 91.3348, |
|
"eval_samples_per_second": 39.339, |
|
"eval_steps_per_second": 2.463, |
|
"step": 172000 |
|
}, |
|
{ |
|
"epoch": 3.19269, |
|
"grad_norm": 5.674015522003174, |
|
"learning_rate": 6.89275e-06, |
|
"loss": 5.479, |
|
"step": 172500 |
|
}, |
|
{ |
|
"epoch": 3.19519, |
|
"grad_norm": 5.207662582397461, |
|
"learning_rate": 6.7677500000000006e-06, |
|
"loss": 5.4611, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 3.19519, |
|
"eval_accuracy": 0.02859955198833238, |
|
"eval_loss": 5.306539058685303, |
|
"eval_runtime": 92.148, |
|
"eval_samples_per_second": 38.992, |
|
"eval_steps_per_second": 2.442, |
|
"step": 173000 |
|
}, |
|
{ |
|
"epoch": 3.19769, |
|
"grad_norm": 4.15488338470459, |
|
"learning_rate": 6.64275e-06, |
|
"loss": 5.4725, |
|
"step": 173500 |
|
}, |
|
{ |
|
"epoch": 3.20019, |
|
"grad_norm": 3.6545050144195557, |
|
"learning_rate": 6.51775e-06, |
|
"loss": 5.4352, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 3.20019, |
|
"eval_accuracy": 0.02851234412955741, |
|
"eval_loss": 5.30513858795166, |
|
"eval_runtime": 128.4128, |
|
"eval_samples_per_second": 27.98, |
|
"eval_steps_per_second": 1.752, |
|
"step": 174000 |
|
}, |
|
{ |
|
"epoch": 3.20269, |
|
"grad_norm": 4.31083869934082, |
|
"learning_rate": 6.3930000000000005e-06, |
|
"loss": 5.4433, |
|
"step": 174500 |
|
}, |
|
{ |
|
"epoch": 3.20519, |
|
"grad_norm": 5.675565719604492, |
|
"learning_rate": 6.268e-06, |
|
"loss": 5.4305, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 3.20519, |
|
"eval_accuracy": 0.02904035302176591, |
|
"eval_loss": 5.271829128265381, |
|
"eval_runtime": 91.0491, |
|
"eval_samples_per_second": 39.462, |
|
"eval_steps_per_second": 2.471, |
|
"step": 175000 |
|
}, |
|
{ |
|
"epoch": 3.20769, |
|
"grad_norm": 4.5083231925964355, |
|
"learning_rate": 6.143e-06, |
|
"loss": 5.4444, |
|
"step": 175500 |
|
}, |
|
{ |
|
"epoch": 3.21019, |
|
"grad_norm": 5.508623123168945, |
|
"learning_rate": 6.018e-06, |
|
"loss": 5.4244, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 3.21019, |
|
"eval_accuracy": 0.02856485931440474, |
|
"eval_loss": 5.234050750732422, |
|
"eval_runtime": 97.7428, |
|
"eval_samples_per_second": 36.76, |
|
"eval_steps_per_second": 2.302, |
|
"step": 176000 |
|
}, |
|
{ |
|
"epoch": 3.21269, |
|
"grad_norm": 5.6364030838012695, |
|
"learning_rate": 5.893000000000001e-06, |
|
"loss": 5.3918, |
|
"step": 176500 |
|
}, |
|
{ |
|
"epoch": 3.2151899999999998, |
|
"grad_norm": 4.249551773071289, |
|
"learning_rate": 5.76825e-06, |
|
"loss": 5.406, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 3.2151899999999998, |
|
"eval_accuracy": 0.028736417988219458, |
|
"eval_loss": 5.197048664093018, |
|
"eval_runtime": 101.4416, |
|
"eval_samples_per_second": 35.419, |
|
"eval_steps_per_second": 2.218, |
|
"step": 177000 |
|
}, |
|
{ |
|
"epoch": 3.21769, |
|
"grad_norm": 6.131099224090576, |
|
"learning_rate": 5.64325e-06, |
|
"loss": 5.4104, |
|
"step": 177500 |
|
}, |
|
{ |
|
"epoch": 3.22019, |
|
"grad_norm": 5.6621479988098145, |
|
"learning_rate": 5.518250000000001e-06, |
|
"loss": 5.3693, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 3.22019, |
|
"eval_accuracy": 0.028811245324141817, |
|
"eval_loss": 5.188289642333984, |
|
"eval_runtime": 93.1736, |
|
"eval_samples_per_second": 38.562, |
|
"eval_steps_per_second": 2.415, |
|
"step": 178000 |
|
}, |
|
{ |
|
"epoch": 3.22269, |
|
"grad_norm": 6.5256242752075195, |
|
"learning_rate": 5.39325e-06, |
|
"loss": 5.3581, |
|
"step": 178500 |
|
}, |
|
{ |
|
"epoch": 4.00192, |
|
"grad_norm": 4.139052867889404, |
|
"learning_rate": 5.2685000000000005e-06, |
|
"loss": 5.3414, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 4.00192, |
|
"eval_accuracy": 0.028710568544900825, |
|
"eval_loss": 5.1566362380981445, |
|
"eval_runtime": 128.5676, |
|
"eval_samples_per_second": 27.946, |
|
"eval_steps_per_second": 1.75, |
|
"step": 179000 |
|
}, |
|
{ |
|
"epoch": 4.00442, |
|
"grad_norm": 5.124391078948975, |
|
"learning_rate": 5.143500000000001e-06, |
|
"loss": 5.3228, |
|
"step": 179500 |
|
}, |
|
{ |
|
"epoch": 4.00692, |
|
"grad_norm": 4.564643859863281, |
|
"learning_rate": 5.0185e-06, |
|
"loss": 5.3252, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 4.00692, |
|
"eval_accuracy": 0.029084841274214297, |
|
"eval_loss": 5.121004104614258, |
|
"eval_runtime": 99.0466, |
|
"eval_samples_per_second": 36.276, |
|
"eval_steps_per_second": 2.272, |
|
"step": 180000 |
|
}, |
|
{ |
|
"epoch": 4.00942, |
|
"grad_norm": 6.144300937652588, |
|
"learning_rate": 4.8935e-06, |
|
"loss": 5.326, |
|
"step": 180500 |
|
}, |
|
{ |
|
"epoch": 4.01192, |
|
"grad_norm": 4.762115001678467, |
|
"learning_rate": 4.768750000000001e-06, |
|
"loss": 5.3302, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 4.01192, |
|
"eval_accuracy": 0.02900389170171647, |
|
"eval_loss": 5.1127095222473145, |
|
"eval_runtime": 93.1042, |
|
"eval_samples_per_second": 38.591, |
|
"eval_steps_per_second": 2.417, |
|
"step": 181000 |
|
}, |
|
{ |
|
"epoch": 4.01442, |
|
"grad_norm": 4.9249420166015625, |
|
"learning_rate": 4.64375e-06, |
|
"loss": 5.322, |
|
"step": 181500 |
|
}, |
|
{ |
|
"epoch": 4.01692, |
|
"grad_norm": 5.8505988121032715, |
|
"learning_rate": 4.519e-06, |
|
"loss": 5.3112, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 4.01692, |
|
"eval_accuracy": 0.028904303320088896, |
|
"eval_loss": 5.079184055328369, |
|
"eval_runtime": 92.4255, |
|
"eval_samples_per_second": 38.875, |
|
"eval_steps_per_second": 2.434, |
|
"step": 182000 |
|
}, |
|
{ |
|
"epoch": 4.01942, |
|
"grad_norm": 5.5884599685668945, |
|
"learning_rate": 4.394000000000001e-06, |
|
"loss": 5.2634, |
|
"step": 182500 |
|
}, |
|
{ |
|
"epoch": 4.02192, |
|
"grad_norm": 6.884908199310303, |
|
"learning_rate": 4.269e-06, |
|
"loss": 5.2651, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 4.02192, |
|
"eval_accuracy": 0.02909871834378535, |
|
"eval_loss": 5.043324947357178, |
|
"eval_runtime": 100.4033, |
|
"eval_samples_per_second": 35.786, |
|
"eval_steps_per_second": 2.241, |
|
"step": 183000 |
|
}, |
|
{ |
|
"epoch": 4.02442, |
|
"grad_norm": 4.412216663360596, |
|
"learning_rate": 4.144e-06, |
|
"loss": 5.2793, |
|
"step": 183500 |
|
}, |
|
{ |
|
"epoch": 4.02692, |
|
"grad_norm": 4.486749172210693, |
|
"learning_rate": 4.019e-06, |
|
"loss": 5.2623, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 4.02692, |
|
"eval_accuracy": 0.028819816455347466, |
|
"eval_loss": 5.025639533996582, |
|
"eval_runtime": 94.773, |
|
"eval_samples_per_second": 37.912, |
|
"eval_steps_per_second": 2.374, |
|
"step": 184000 |
|
}, |
|
{ |
|
"epoch": 4.02942, |
|
"grad_norm": 5.047926425933838, |
|
"learning_rate": 3.894e-06, |
|
"loss": 5.2454, |
|
"step": 184500 |
|
}, |
|
{ |
|
"epoch": 4.03192, |
|
"grad_norm": 7.453557968139648, |
|
"learning_rate": 3.7690000000000003e-06, |
|
"loss": 5.2297, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 4.03192, |
|
"eval_accuracy": 0.02869369838189287, |
|
"eval_loss": 5.029138565063477, |
|
"eval_runtime": 92.8613, |
|
"eval_samples_per_second": 38.692, |
|
"eval_steps_per_second": 2.423, |
|
"step": 185000 |
|
}, |
|
{ |
|
"epoch": 4.03442, |
|
"grad_norm": 5.0118021965026855, |
|
"learning_rate": 3.644e-06, |
|
"loss": 5.2391, |
|
"step": 185500 |
|
}, |
|
{ |
|
"epoch": 4.03692, |
|
"grad_norm": 5.173340320587158, |
|
"learning_rate": 3.5192500000000002e-06, |
|
"loss": 5.1991, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 4.03692, |
|
"eval_accuracy": 0.028796688006062374, |
|
"eval_loss": 4.970343589782715, |
|
"eval_runtime": 95.8003, |
|
"eval_samples_per_second": 37.505, |
|
"eval_steps_per_second": 2.349, |
|
"step": 186000 |
|
}, |
|
{ |
|
"epoch": 4.03942, |
|
"grad_norm": 4.592197895050049, |
|
"learning_rate": 3.39425e-06, |
|
"loss": 5.203, |
|
"step": 186500 |
|
}, |
|
{ |
|
"epoch": 4.04192, |
|
"grad_norm": 5.571292400360107, |
|
"learning_rate": 3.26925e-06, |
|
"loss": 5.1883, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 4.04192, |
|
"eval_accuracy": 0.02865968595647362, |
|
"eval_loss": 4.975839614868164, |
|
"eval_runtime": 92.6873, |
|
"eval_samples_per_second": 38.765, |
|
"eval_steps_per_second": 2.428, |
|
"step": 187000 |
|
}, |
|
{ |
|
"epoch": 4.04442, |
|
"grad_norm": 4.220412731170654, |
|
"learning_rate": 3.14425e-06, |
|
"loss": 5.1943, |
|
"step": 187500 |
|
}, |
|
{ |
|
"epoch": 4.04692, |
|
"grad_norm": 8.616243362426758, |
|
"learning_rate": 3.0195e-06, |
|
"loss": 5.1854, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 4.04692, |
|
"eval_accuracy": 0.028208953294817662, |
|
"eval_loss": 4.942821502685547, |
|
"eval_runtime": 94.4959, |
|
"eval_samples_per_second": 38.023, |
|
"eval_steps_per_second": 2.381, |
|
"step": 188000 |
|
}, |
|
{ |
|
"epoch": 4.04942, |
|
"grad_norm": 5.9760894775390625, |
|
"learning_rate": 2.8945e-06, |
|
"loss": 5.1704, |
|
"step": 188500 |
|
}, |
|
{ |
|
"epoch": 4.05192, |
|
"grad_norm": 6.06154203414917, |
|
"learning_rate": 2.7695000000000003e-06, |
|
"loss": 5.1636, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 4.05192, |
|
"eval_accuracy": 0.02843751679363505, |
|
"eval_loss": 4.911832332611084, |
|
"eval_runtime": 91.2834, |
|
"eval_samples_per_second": 39.361, |
|
"eval_steps_per_second": 2.465, |
|
"step": 189000 |
|
}, |
|
{ |
|
"epoch": 4.05442, |
|
"grad_norm": 7.606237411499023, |
|
"learning_rate": 2.6445000000000003e-06, |
|
"loss": 5.1407, |
|
"step": 189500 |
|
}, |
|
{ |
|
"epoch": 4.05692, |
|
"grad_norm": 4.682966709136963, |
|
"learning_rate": 2.5197500000000003e-06, |
|
"loss": 5.1356, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 4.05692, |
|
"eval_accuracy": 0.028226095557228967, |
|
"eval_loss": 4.904683589935303, |
|
"eval_runtime": 92.6319, |
|
"eval_samples_per_second": 38.788, |
|
"eval_steps_per_second": 2.429, |
|
"step": 190000 |
|
}, |
|
{ |
|
"epoch": 4.05942, |
|
"grad_norm": 5.441736221313477, |
|
"learning_rate": 2.3947500000000002e-06, |
|
"loss": 5.1346, |
|
"step": 190500 |
|
}, |
|
{ |
|
"epoch": 4.06192, |
|
"grad_norm": 4.829955101013184, |
|
"learning_rate": 2.26975e-06, |
|
"loss": 5.1329, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 4.06192, |
|
"eval_accuracy": 0.028335071368272256, |
|
"eval_loss": 4.874863147735596, |
|
"eval_runtime": 91.0476, |
|
"eval_samples_per_second": 39.463, |
|
"eval_steps_per_second": 2.471, |
|
"step": 191000 |
|
}, |
|
{ |
|
"epoch": 4.06442, |
|
"grad_norm": 5.842422962188721, |
|
"learning_rate": 2.14475e-06, |
|
"loss": 5.1127, |
|
"step": 191500 |
|
}, |
|
{ |
|
"epoch": 4.06692, |
|
"grad_norm": 5.372358322143555, |
|
"learning_rate": 2.01975e-06, |
|
"loss": 5.107, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 4.06692, |
|
"eval_accuracy": 0.02805358453550251, |
|
"eval_loss": 4.877078533172607, |
|
"eval_runtime": 92.2372, |
|
"eval_samples_per_second": 38.954, |
|
"eval_steps_per_second": 2.439, |
|
"step": 192000 |
|
}, |
|
{ |
|
"epoch": 4.06942, |
|
"grad_norm": 5.863542556762695, |
|
"learning_rate": 1.8950000000000003e-06, |
|
"loss": 5.1301, |
|
"step": 192500 |
|
}, |
|
{ |
|
"epoch": 4.07192, |
|
"grad_norm": 8.768465042114258, |
|
"learning_rate": 1.7700000000000002e-06, |
|
"loss": 5.1159, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 4.07192, |
|
"eval_accuracy": 0.027984607336752263, |
|
"eval_loss": 4.856239318847656, |
|
"eval_runtime": 100.1894, |
|
"eval_samples_per_second": 35.862, |
|
"eval_steps_per_second": 2.246, |
|
"step": 193000 |
|
}, |
|
{ |
|
"epoch": 4.07442, |
|
"grad_norm": 4.413668632507324, |
|
"learning_rate": 1.645e-06, |
|
"loss": 5.1087, |
|
"step": 193500 |
|
}, |
|
{ |
|
"epoch": 4.07692, |
|
"grad_norm": 7.255640983581543, |
|
"learning_rate": 1.52e-06, |
|
"loss": 5.0892, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 4.07692, |
|
"eval_accuracy": 0.027881481662881083, |
|
"eval_loss": 4.846496105194092, |
|
"eval_runtime": 95.8083, |
|
"eval_samples_per_second": 37.502, |
|
"eval_steps_per_second": 2.348, |
|
"step": 194000 |
|
}, |
|
{ |
|
"epoch": 4.07942, |
|
"grad_norm": 5.159695148468018, |
|
"learning_rate": 1.3952500000000001e-06, |
|
"loss": 5.0909, |
|
"step": 194500 |
|
}, |
|
{ |
|
"epoch": 4.08192, |
|
"grad_norm": 4.329514026641846, |
|
"learning_rate": 1.27025e-06, |
|
"loss": 5.083, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 4.08192, |
|
"eval_accuracy": 0.02791671853561543, |
|
"eval_loss": 4.825800895690918, |
|
"eval_runtime": 92.6653, |
|
"eval_samples_per_second": 38.774, |
|
"eval_steps_per_second": 2.428, |
|
"step": 195000 |
|
}, |
|
{ |
|
"epoch": 4.08442, |
|
"grad_norm": 6.477020263671875, |
|
"learning_rate": 1.14525e-06, |
|
"loss": 5.0699, |
|
"step": 195500 |
|
}, |
|
{ |
|
"epoch": 4.08692, |
|
"grad_norm": 4.648099899291992, |
|
"learning_rate": 1.02025e-06, |
|
"loss": 5.0824, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 4.08692, |
|
"eval_accuracy": 0.028006647388423938, |
|
"eval_loss": 4.821605682373047, |
|
"eval_runtime": 103.4797, |
|
"eval_samples_per_second": 34.722, |
|
"eval_steps_per_second": 2.174, |
|
"step": 196000 |
|
}, |
|
{ |
|
"epoch": 4.08942, |
|
"grad_norm": 5.26840877532959, |
|
"learning_rate": 8.952500000000001e-07, |
|
"loss": 5.0674, |
|
"step": 196500 |
|
}, |
|
{ |
|
"epoch": 4.09192, |
|
"grad_norm": 5.00206184387207, |
|
"learning_rate": 7.7075e-07, |
|
"loss": 5.0774, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 4.09192, |
|
"eval_accuracy": 0.027867604593310027, |
|
"eval_loss": 4.817193984985352, |
|
"eval_runtime": 103.2721, |
|
"eval_samples_per_second": 34.792, |
|
"eval_steps_per_second": 2.179, |
|
"step": 197000 |
|
}, |
|
{ |
|
"epoch": 4.09442, |
|
"grad_norm": 8.953470230102539, |
|
"learning_rate": 6.4575e-07, |
|
"loss": 5.0661, |
|
"step": 197500 |
|
}, |
|
{ |
|
"epoch": 4.09692, |
|
"grad_norm": 4.821086406707764, |
|
"learning_rate": 5.207500000000001e-07, |
|
"loss": 5.0567, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 4.09692, |
|
"eval_accuracy": 0.027846652939251766, |
|
"eval_loss": 4.811811923980713, |
|
"eval_runtime": 129.0966, |
|
"eval_samples_per_second": 27.832, |
|
"eval_steps_per_second": 1.743, |
|
"step": 198000 |
|
}, |
|
{ |
|
"epoch": 4.09942, |
|
"grad_norm": 5.411968231201172, |
|
"learning_rate": 3.9575000000000003e-07, |
|
"loss": 5.0657, |
|
"step": 198500 |
|
}, |
|
{ |
|
"epoch": 4.10192, |
|
"grad_norm": 5.903570652008057, |
|
"learning_rate": 2.7075e-07, |
|
"loss": 5.0657, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 4.10192, |
|
"eval_accuracy": 0.02782066744623146, |
|
"eval_loss": 4.807706832885742, |
|
"eval_runtime": 126.5816, |
|
"eval_samples_per_second": 28.385, |
|
"eval_steps_per_second": 1.778, |
|
"step": 199000 |
|
}, |
|
{ |
|
"epoch": 4.10442, |
|
"grad_norm": 4.718256950378418, |
|
"learning_rate": 1.4575000000000002e-07, |
|
"loss": 5.0755, |
|
"step": 199500 |
|
}, |
|
{ |
|
"epoch": 4.10692, |
|
"grad_norm": 5.640545845031738, |
|
"learning_rate": 2.075e-08, |
|
"loss": 5.0751, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 4.10692, |
|
"eval_accuracy": 0.02785753691538593, |
|
"eval_loss": 4.805818557739258, |
|
"eval_runtime": 112.4779, |
|
"eval_samples_per_second": 31.944, |
|
"eval_steps_per_second": 2.0, |
|
"step": 200000 |
|
}, |
|
{ |
|
"epoch": 4.10692, |
|
"step": 200000, |
|
"total_flos": 1.1895955925447475e+18, |
|
"train_loss": 6.0653790368652345, |
|
"train_runtime": 98053.9397, |
|
"train_samples_per_second": 32.635, |
|
"train_steps_per_second": 2.04 |
|
} |
|
], |
|
"logging_steps": 500, |
|
"max_steps": 200000, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.1895955925447475e+18, |
|
"train_batch_size": 8, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|