T5LAA / trainer_state.json
hrezaei's picture
End of training
1847c1b
{
"best_metric": 4.805818557739258,
"best_model_checkpoint": "/users/hr1171/scratch/T5LAA/checkpoint-200000",
"epoch": 4.10692,
"eval_steps": 1000,
"global_step": 200000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0025,
"grad_norm": 0.42752909660339355,
"learning_rate": 4.9875000000000006e-05,
"loss": 9.2783,
"step": 500
},
{
"epoch": 0.005,
"grad_norm": 0.6895659565925598,
"learning_rate": 4.975e-05,
"loss": 8.7605,
"step": 1000
},
{
"epoch": 0.005,
"eval_accuracy": 0.033912700987924906,
"eval_loss": 8.507349967956543,
"eval_runtime": 91.5333,
"eval_samples_per_second": 39.253,
"eval_steps_per_second": 2.458,
"step": 1000
},
{
"epoch": 0.0075,
"grad_norm": 1.0673288106918335,
"learning_rate": 4.962500000000001e-05,
"loss": 8.3738,
"step": 1500
},
{
"epoch": 0.01,
"grad_norm": 0.6839419603347778,
"learning_rate": 4.9500000000000004e-05,
"loss": 8.0954,
"step": 2000
},
{
"epoch": 0.01,
"eval_accuracy": 0.03104558957478346,
"eval_loss": 8.017916679382324,
"eval_runtime": 91.927,
"eval_samples_per_second": 39.085,
"eval_steps_per_second": 2.448,
"step": 2000
},
{
"epoch": 0.0125,
"grad_norm": 0.950947105884552,
"learning_rate": 4.937525e-05,
"loss": 7.8869,
"step": 2500
},
{
"epoch": 0.015,
"grad_norm": 1.0953532457351685,
"learning_rate": 4.925025e-05,
"loss": 7.7188,
"step": 3000
},
{
"epoch": 0.015,
"eval_accuracy": 0.03081947497059626,
"eval_loss": 7.6839141845703125,
"eval_runtime": 90.9808,
"eval_samples_per_second": 39.492,
"eval_steps_per_second": 2.473,
"step": 3000
},
{
"epoch": 0.0175,
"grad_norm": 0.5422408580780029,
"learning_rate": 4.912525e-05,
"loss": 7.5854,
"step": 3500
},
{
"epoch": 0.02,
"grad_norm": 3.3019630908966064,
"learning_rate": 4.900025e-05,
"loss": 7.4459,
"step": 4000
},
{
"epoch": 0.02,
"eval_accuracy": 0.032928381396291694,
"eval_loss": 7.432972431182861,
"eval_runtime": 92.5405,
"eval_samples_per_second": 38.826,
"eval_steps_per_second": 2.431,
"step": 4000
},
{
"epoch": 0.0225,
"grad_norm": 16.127534866333008,
"learning_rate": 4.88755e-05,
"loss": 7.3365,
"step": 4500
},
{
"epoch": 0.025,
"grad_norm": 1.8640549182891846,
"learning_rate": 4.875050000000001e-05,
"loss": 7.2526,
"step": 5000
},
{
"epoch": 0.025,
"eval_accuracy": 0.032260649460460894,
"eval_loss": 7.2563862800598145,
"eval_runtime": 91.289,
"eval_samples_per_second": 39.359,
"eval_steps_per_second": 2.465,
"step": 5000
},
{
"epoch": 0.0275,
"grad_norm": 1.704673171043396,
"learning_rate": 4.862575e-05,
"loss": 7.1831,
"step": 5500
},
{
"epoch": 0.03,
"grad_norm": 2.844064235687256,
"learning_rate": 4.850075e-05,
"loss": 7.1018,
"step": 6000
},
{
"epoch": 0.03,
"eval_accuracy": 0.033544686544888576,
"eval_loss": 7.128678321838379,
"eval_runtime": 94.2173,
"eval_samples_per_second": 38.135,
"eval_steps_per_second": 2.388,
"step": 6000
},
{
"epoch": 0.0325,
"grad_norm": 1.2851128578186035,
"learning_rate": 4.837575e-05,
"loss": 7.0525,
"step": 6500
},
{
"epoch": 0.035,
"grad_norm": 0.74603670835495,
"learning_rate": 4.825075e-05,
"loss": 7.014,
"step": 7000
},
{
"epoch": 0.035,
"eval_accuracy": 0.03406153936155956,
"eval_loss": 7.0242919921875,
"eval_runtime": 91.5738,
"eval_samples_per_second": 39.236,
"eval_steps_per_second": 2.457,
"step": 7000
},
{
"epoch": 0.0375,
"grad_norm": 0.937613308429718,
"learning_rate": 4.812575e-05,
"loss": 6.9613,
"step": 7500
},
{
"epoch": 0.04,
"grad_norm": 2.0788276195526123,
"learning_rate": 4.800075e-05,
"loss": 6.9585,
"step": 8000
},
{
"epoch": 0.04,
"eval_accuracy": 0.03162039956436886,
"eval_loss": 6.953730583190918,
"eval_runtime": 98.804,
"eval_samples_per_second": 36.365,
"eval_steps_per_second": 2.277,
"step": 8000
},
{
"epoch": 0.0425,
"grad_norm": 0.7946870923042297,
"learning_rate": 4.787575e-05,
"loss": 6.9196,
"step": 8500
},
{
"epoch": 0.045,
"grad_norm": 1.7380784749984741,
"learning_rate": 4.775075e-05,
"loss": 6.9082,
"step": 9000
},
{
"epoch": 0.045,
"eval_accuracy": 0.032857907650823,
"eval_loss": 6.873142242431641,
"eval_runtime": 100.6418,
"eval_samples_per_second": 35.701,
"eval_steps_per_second": 2.236,
"step": 9000
},
{
"epoch": 0.0475,
"grad_norm": 1.8795133829116821,
"learning_rate": 4.7625750000000004e-05,
"loss": 6.8796,
"step": 9500
},
{
"epoch": 0.05,
"grad_norm": 1.129756212234497,
"learning_rate": 4.7501e-05,
"loss": 6.8857,
"step": 10000
},
{
"epoch": 0.05,
"eval_accuracy": 0.032590025788220955,
"eval_loss": 6.822448253631592,
"eval_runtime": 96.3312,
"eval_samples_per_second": 37.298,
"eval_steps_per_second": 2.336,
"step": 10000
},
{
"epoch": 0.0525,
"grad_norm": 0.6531652212142944,
"learning_rate": 4.7376e-05,
"loss": 6.8436,
"step": 10500
},
{
"epoch": 0.055,
"grad_norm": 1.2178057432174683,
"learning_rate": 4.7251e-05,
"loss": 6.8166,
"step": 11000
},
{
"epoch": 0.055,
"eval_accuracy": 0.032403501647221764,
"eval_loss": 6.821002006530762,
"eval_runtime": 95.9352,
"eval_samples_per_second": 37.452,
"eval_steps_per_second": 2.345,
"step": 11000
},
{
"epoch": 0.0575,
"grad_norm": 1.195275902748108,
"learning_rate": 4.7126e-05,
"loss": 6.8214,
"step": 11500
},
{
"epoch": 0.06,
"grad_norm": 1.0131785869598389,
"learning_rate": 4.700125e-05,
"loss": 6.8225,
"step": 12000
},
{
"epoch": 0.06,
"eval_accuracy": 0.033441832970420755,
"eval_loss": 6.764980316162109,
"eval_runtime": 94.995,
"eval_samples_per_second": 37.823,
"eval_steps_per_second": 2.369,
"step": 12000
},
{
"epoch": 0.0625,
"grad_norm": 2.522235870361328,
"learning_rate": 4.687625000000001e-05,
"loss": 6.7924,
"step": 12500
},
{
"epoch": 0.065,
"grad_norm": 0.894263505935669,
"learning_rate": 4.6751250000000004e-05,
"loss": 6.791,
"step": 13000
},
{
"epoch": 0.065,
"eval_accuracy": 0.03224772473880158,
"eval_loss": 6.734119415283203,
"eval_runtime": 117.5831,
"eval_samples_per_second": 30.557,
"eval_steps_per_second": 1.914,
"step": 13000
},
{
"epoch": 0.0675,
"grad_norm": 2.9750237464904785,
"learning_rate": 4.662625000000001e-05,
"loss": 6.7703,
"step": 13500
},
{
"epoch": 0.07,
"grad_norm": 1.03080153465271,
"learning_rate": 4.6501250000000005e-05,
"loss": 6.7786,
"step": 14000
},
{
"epoch": 0.07,
"eval_accuracy": 0.03289967490923784,
"eval_loss": 6.726984024047852,
"eval_runtime": 103.0616,
"eval_samples_per_second": 34.863,
"eval_steps_per_second": 2.183,
"step": 14000
},
{
"epoch": 0.0725,
"grad_norm": 2.6716787815093994,
"learning_rate": 4.63765e-05,
"loss": 6.7565,
"step": 14500
},
{
"epoch": 0.075,
"grad_norm": 0.8023720383644104,
"learning_rate": 4.62515e-05,
"loss": 6.7516,
"step": 15000
},
{
"epoch": 0.075,
"eval_accuracy": 0.03358686195240845,
"eval_loss": 6.673779010772705,
"eval_runtime": 97.0967,
"eval_samples_per_second": 37.004,
"eval_steps_per_second": 2.317,
"step": 15000
},
{
"epoch": 0.0775,
"grad_norm": 0.9706649780273438,
"learning_rate": 4.61265e-05,
"loss": 6.7337,
"step": 15500
},
{
"epoch": 0.08,
"grad_norm": 2.0966403484344482,
"learning_rate": 4.60015e-05,
"loss": 6.7343,
"step": 16000
},
{
"epoch": 0.08,
"eval_accuracy": 0.03371406842347646,
"eval_loss": 6.695716857910156,
"eval_runtime": 95.0805,
"eval_samples_per_second": 37.789,
"eval_steps_per_second": 2.366,
"step": 16000
},
{
"epoch": 0.0825,
"grad_norm": 0.998198390007019,
"learning_rate": 4.58765e-05,
"loss": 6.7044,
"step": 16500
},
{
"epoch": 0.085,
"grad_norm": 0.5809659957885742,
"learning_rate": 4.5751500000000004e-05,
"loss": 6.7027,
"step": 17000
},
{
"epoch": 0.085,
"eval_accuracy": 0.03325081918926622,
"eval_loss": 6.647298336029053,
"eval_runtime": 111.1392,
"eval_samples_per_second": 32.329,
"eval_steps_per_second": 2.024,
"step": 17000
},
{
"epoch": 0.0875,
"grad_norm": 1.1927660703659058,
"learning_rate": 4.56265e-05,
"loss": 6.7058,
"step": 17500
},
{
"epoch": 0.09,
"grad_norm": 1.1725599765777588,
"learning_rate": 4.5501500000000005e-05,
"loss": 6.6741,
"step": 18000
},
{
"epoch": 0.09,
"eval_accuracy": 0.034510367327392044,
"eval_loss": 6.625380039215088,
"eval_runtime": 91.7433,
"eval_samples_per_second": 39.164,
"eval_steps_per_second": 2.452,
"step": 18000
},
{
"epoch": 0.0925,
"grad_norm": 1.0063527822494507,
"learning_rate": 4.537675e-05,
"loss": 6.7041,
"step": 18500
},
{
"epoch": 0.095,
"grad_norm": 2.5089831352233887,
"learning_rate": 4.525175e-05,
"loss": 6.6426,
"step": 19000
},
{
"epoch": 0.095,
"eval_accuracy": 0.03386848483487988,
"eval_loss": 6.642553806304932,
"eval_runtime": 95.8961,
"eval_samples_per_second": 37.468,
"eval_steps_per_second": 2.346,
"step": 19000
},
{
"epoch": 0.0975,
"grad_norm": 1.1984361410140991,
"learning_rate": 4.512675e-05,
"loss": 6.6729,
"step": 19500
},
{
"epoch": 0.1,
"grad_norm": 1.813727855682373,
"learning_rate": 4.500175e-05,
"loss": 6.6475,
"step": 20000
},
{
"epoch": 0.1,
"eval_accuracy": 0.03304606438824231,
"eval_loss": 6.6046462059021,
"eval_runtime": 95.7736,
"eval_samples_per_second": 37.516,
"eval_steps_per_second": 2.349,
"step": 20000
},
{
"epoch": 0.1025,
"grad_norm": 1.4854774475097656,
"learning_rate": 4.4877000000000004e-05,
"loss": 6.6445,
"step": 20500
},
{
"epoch": 0.105,
"grad_norm": 1.47989022731781,
"learning_rate": 4.4752e-05,
"loss": 6.6649,
"step": 21000
},
{
"epoch": 0.105,
"eval_accuracy": 0.034226567649693784,
"eval_loss": 6.570390224456787,
"eval_runtime": 99.1817,
"eval_samples_per_second": 36.226,
"eval_steps_per_second": 2.269,
"step": 21000
},
{
"epoch": 0.1075,
"grad_norm": 1.1667526960372925,
"learning_rate": 4.4627000000000005e-05,
"loss": 6.6509,
"step": 21500
},
{
"epoch": 0.11,
"grad_norm": 1.3919010162353516,
"learning_rate": 4.4502e-05,
"loss": 6.619,
"step": 22000
},
{
"epoch": 0.11,
"eval_accuracy": 0.032384726788390335,
"eval_loss": 6.571103096008301,
"eval_runtime": 95.7384,
"eval_samples_per_second": 37.529,
"eval_steps_per_second": 2.35,
"step": 22000
},
{
"epoch": 0.1125,
"grad_norm": 2.1141815185546875,
"learning_rate": 4.437725e-05,
"loss": 6.6248,
"step": 22500
},
{
"epoch": 0.115,
"grad_norm": 2.2613210678100586,
"learning_rate": 4.425225e-05,
"loss": 6.6216,
"step": 23000
},
{
"epoch": 0.115,
"eval_accuracy": 0.03199086290203537,
"eval_loss": 6.581330299377441,
"eval_runtime": 91.9194,
"eval_samples_per_second": 39.089,
"eval_steps_per_second": 2.448,
"step": 23000
},
{
"epoch": 0.1175,
"grad_norm": 1.3900034427642822,
"learning_rate": 4.4127250000000003e-05,
"loss": 6.5989,
"step": 23500
},
{
"epoch": 0.12,
"grad_norm": 1.3075273036956787,
"learning_rate": 4.400225e-05,
"loss": 6.5812,
"step": 24000
},
{
"epoch": 0.12,
"eval_accuracy": 0.033095314380249395,
"eval_loss": 6.547011375427246,
"eval_runtime": 94.3249,
"eval_samples_per_second": 38.092,
"eval_steps_per_second": 2.385,
"step": 24000
},
{
"epoch": 0.1225,
"grad_norm": 1.2836869955062866,
"learning_rate": 4.387750000000001e-05,
"loss": 6.597,
"step": 24500
},
{
"epoch": 0.125,
"grad_norm": 1.1456223726272583,
"learning_rate": 4.375275e-05,
"loss": 6.5995,
"step": 25000
},
{
"epoch": 0.125,
"eval_accuracy": 0.033780188578491493,
"eval_loss": 6.51839542388916,
"eval_runtime": 93.2456,
"eval_samples_per_second": 38.533,
"eval_steps_per_second": 2.413,
"step": 25000
},
{
"epoch": 0.1275,
"grad_norm": 1.2860591411590576,
"learning_rate": 4.362775e-05,
"loss": 6.5882,
"step": 25500
},
{
"epoch": 0.13,
"grad_norm": 1.200269103050232,
"learning_rate": 4.350275e-05,
"loss": 6.5891,
"step": 26000
},
{
"epoch": 0.13,
"eval_accuracy": 0.03334578188103678,
"eval_loss": 6.508150100708008,
"eval_runtime": 90.9178,
"eval_samples_per_second": 39.519,
"eval_steps_per_second": 2.475,
"step": 26000
},
{
"epoch": 0.1325,
"grad_norm": 1.4053945541381836,
"learning_rate": 4.337775e-05,
"loss": 6.5754,
"step": 26500
},
{
"epoch": 0.135,
"grad_norm": 1.9518871307373047,
"learning_rate": 4.3252750000000004e-05,
"loss": 6.5767,
"step": 27000
},
{
"epoch": 0.135,
"eval_accuracy": 0.03279600503655996,
"eval_loss": 6.481350898742676,
"eval_runtime": 93.0531,
"eval_samples_per_second": 38.612,
"eval_steps_per_second": 2.418,
"step": 27000
},
{
"epoch": 0.1375,
"grad_norm": 0.9082052111625671,
"learning_rate": 4.312775e-05,
"loss": 6.5661,
"step": 27500
},
{
"epoch": 0.14,
"grad_norm": 1.2394309043884277,
"learning_rate": 4.3002750000000004e-05,
"loss": 6.5387,
"step": 28000
},
{
"epoch": 0.14,
"eval_accuracy": 0.03242513354978841,
"eval_loss": 6.50333833694458,
"eval_runtime": 91.851,
"eval_samples_per_second": 39.118,
"eval_steps_per_second": 2.45,
"step": 28000
},
{
"epoch": 0.1425,
"grad_norm": 1.4448879957199097,
"learning_rate": 4.287775e-05,
"loss": 6.541,
"step": 28500
},
{
"epoch": 0.145,
"grad_norm": 1.214563250541687,
"learning_rate": 4.2752750000000005e-05,
"loss": 6.5427,
"step": 29000
},
{
"epoch": 0.145,
"eval_accuracy": 0.03188379178681556,
"eval_loss": 6.480025291442871,
"eval_runtime": 103.8976,
"eval_samples_per_second": 34.582,
"eval_steps_per_second": 2.166,
"step": 29000
},
{
"epoch": 0.1475,
"grad_norm": 0.9117876291275024,
"learning_rate": 4.2628e-05,
"loss": 6.5505,
"step": 29500
},
{
"epoch": 0.15,
"grad_norm": 1.4780124425888062,
"learning_rate": 4.2503e-05,
"loss": 6.5139,
"step": 30000
},
{
"epoch": 0.15,
"eval_accuracy": 0.03140081534586215,
"eval_loss": 6.477231025695801,
"eval_runtime": 97.6245,
"eval_samples_per_second": 36.804,
"eval_steps_per_second": 2.305,
"step": 30000
},
{
"epoch": 0.1525,
"grad_norm": 0.9774089455604553,
"learning_rate": 4.2378e-05,
"loss": 6.5255,
"step": 30500
},
{
"epoch": 0.155,
"grad_norm": 1.5147466659545898,
"learning_rate": 4.225325e-05,
"loss": 6.5186,
"step": 31000
},
{
"epoch": 0.155,
"eval_accuracy": 0.03227180553599841,
"eval_loss": 6.446476936340332,
"eval_runtime": 92.5361,
"eval_samples_per_second": 38.828,
"eval_steps_per_second": 2.431,
"step": 31000
},
{
"epoch": 0.1575,
"grad_norm": 1.4620882272720337,
"learning_rate": 4.2128250000000004e-05,
"loss": 6.5019,
"step": 31500
},
{
"epoch": 0.16,
"grad_norm": 1.446492314338684,
"learning_rate": 4.200325e-05,
"loss": 6.5233,
"step": 32000
},
{
"epoch": 0.16,
"eval_accuracy": 0.03262907205260226,
"eval_loss": 6.422818660736084,
"eval_runtime": 91.1245,
"eval_samples_per_second": 39.43,
"eval_steps_per_second": 2.469,
"step": 32000
},
{
"epoch": 0.1625,
"grad_norm": 1.022921085357666,
"learning_rate": 4.1878250000000005e-05,
"loss": 6.4829,
"step": 32500
},
{
"epoch": 0.165,
"grad_norm": 1.4783859252929688,
"learning_rate": 4.175325e-05,
"loss": 6.4659,
"step": 33000
},
{
"epoch": 0.165,
"eval_accuracy": 0.031788829095045004,
"eval_loss": 6.436858654022217,
"eval_runtime": 95.3969,
"eval_samples_per_second": 37.664,
"eval_steps_per_second": 2.359,
"step": 33000
},
{
"epoch": 0.1675,
"grad_norm": 3.6760544776916504,
"learning_rate": 4.162825e-05,
"loss": 6.5373,
"step": 33500
},
{
"epoch": 0.17,
"grad_norm": 2.7687697410583496,
"learning_rate": 4.15035e-05,
"loss": 6.4819,
"step": 34000
},
{
"epoch": 0.17,
"eval_accuracy": 0.033804269375688326,
"eval_loss": 6.397607803344727,
"eval_runtime": 94.8411,
"eval_samples_per_second": 37.884,
"eval_steps_per_second": 2.372,
"step": 34000
},
{
"epoch": 0.1725,
"grad_norm": 1.5380724668502808,
"learning_rate": 4.1378500000000004e-05,
"loss": 6.4747,
"step": 34500
},
{
"epoch": 0.175,
"grad_norm": 1.8788790702819824,
"learning_rate": 4.12535e-05,
"loss": 6.4735,
"step": 35000
},
{
"epoch": 0.175,
"eval_accuracy": 0.033044159692418834,
"eval_loss": 6.411598205566406,
"eval_runtime": 92.6637,
"eval_samples_per_second": 38.775,
"eval_steps_per_second": 2.428,
"step": 35000
},
{
"epoch": 0.1775,
"grad_norm": 1.3821016550064087,
"learning_rate": 4.1128500000000004e-05,
"loss": 6.4538,
"step": 35500
},
{
"epoch": 0.18,
"grad_norm": 2.0021932125091553,
"learning_rate": 4.10035e-05,
"loss": 6.4659,
"step": 36000
},
{
"epoch": 0.18,
"eval_accuracy": 0.031257418960294575,
"eval_loss": 6.4191389083862305,
"eval_runtime": 102.9081,
"eval_samples_per_second": 34.915,
"eval_steps_per_second": 2.186,
"step": 36000
},
{
"epoch": 0.1825,
"grad_norm": 0.9058569073677063,
"learning_rate": 4.0878500000000005e-05,
"loss": 6.4517,
"step": 36500
},
{
"epoch": 0.185,
"grad_norm": 1.922741174697876,
"learning_rate": 4.07535e-05,
"loss": 6.443,
"step": 37000
},
{
"epoch": 0.185,
"eval_accuracy": 0.03225167018015021,
"eval_loss": 6.378974437713623,
"eval_runtime": 105.2589,
"eval_samples_per_second": 34.135,
"eval_steps_per_second": 2.138,
"step": 37000
},
{
"epoch": 0.1875,
"grad_norm": 2.298774480819702,
"learning_rate": 4.0628500000000006e-05,
"loss": 6.4497,
"step": 37500
},
{
"epoch": 0.19,
"grad_norm": 1.7634875774383545,
"learning_rate": 4.050375e-05,
"loss": 6.448,
"step": 38000
},
{
"epoch": 0.19,
"eval_accuracy": 0.0315727821687819,
"eval_loss": 6.390995979309082,
"eval_runtime": 97.37,
"eval_samples_per_second": 36.9,
"eval_steps_per_second": 2.311,
"step": 38000
},
{
"epoch": 0.1925,
"grad_norm": 1.2913744449615479,
"learning_rate": 4.0378750000000004e-05,
"loss": 6.4289,
"step": 38500
},
{
"epoch": 0.195,
"grad_norm": 1.5835742950439453,
"learning_rate": 4.025375e-05,
"loss": 6.421,
"step": 39000
},
{
"epoch": 0.195,
"eval_accuracy": 0.0321972502994794,
"eval_loss": 6.371878147125244,
"eval_runtime": 96.1284,
"eval_samples_per_second": 37.377,
"eval_steps_per_second": 2.341,
"step": 39000
},
{
"epoch": 0.1975,
"grad_norm": 1.9730321168899536,
"learning_rate": 4.0128750000000004e-05,
"loss": 6.4357,
"step": 39500
},
{
"epoch": 0.2,
"grad_norm": 1.8616057634353638,
"learning_rate": 4.0004000000000005e-05,
"loss": 6.4127,
"step": 40000
},
{
"epoch": 0.2,
"eval_accuracy": 0.03198569301337165,
"eval_loss": 6.374399662017822,
"eval_runtime": 92.9371,
"eval_samples_per_second": 38.661,
"eval_steps_per_second": 2.421,
"step": 40000
},
{
"epoch": 0.2025,
"grad_norm": 1.4953457117080688,
"learning_rate": 3.9879e-05,
"loss": 6.4233,
"step": 40500
},
{
"epoch": 0.205,
"grad_norm": 1.2381442785263062,
"learning_rate": 3.9754e-05,
"loss": 6.4213,
"step": 41000
},
{
"epoch": 0.205,
"eval_accuracy": 0.03152353217677482,
"eval_loss": 6.381062030792236,
"eval_runtime": 130.2985,
"eval_samples_per_second": 27.575,
"eval_steps_per_second": 1.727,
"step": 41000
},
{
"epoch": 0.2075,
"grad_norm": 1.2648102045059204,
"learning_rate": 3.9628999999999996e-05,
"loss": 6.4195,
"step": 41500
},
{
"epoch": 0.21,
"grad_norm": 1.322369933128357,
"learning_rate": 3.9504250000000004e-05,
"loss": 6.42,
"step": 42000
},
{
"epoch": 0.21,
"eval_accuracy": 0.031080010149307745,
"eval_loss": 6.351558685302734,
"eval_runtime": 97.1612,
"eval_samples_per_second": 36.98,
"eval_steps_per_second": 2.316,
"step": 42000
},
{
"epoch": 0.2125,
"grad_norm": 1.8557090759277344,
"learning_rate": 3.937925e-05,
"loss": 6.4013,
"step": 42500
},
{
"epoch": 0.215,
"grad_norm": 2.21213436126709,
"learning_rate": 3.9254250000000005e-05,
"loss": 6.414,
"step": 43000
},
{
"epoch": 0.215,
"eval_accuracy": 0.03100681540980551,
"eval_loss": 6.333887100219727,
"eval_runtime": 96.5646,
"eval_samples_per_second": 37.208,
"eval_steps_per_second": 2.33,
"step": 43000
},
{
"epoch": 0.2175,
"grad_norm": 1.817885160446167,
"learning_rate": 3.912925e-05,
"loss": 6.4259,
"step": 43500
},
{
"epoch": 0.22,
"grad_norm": 1.3436046838760376,
"learning_rate": 3.90045e-05,
"loss": 6.3899,
"step": 44000
},
{
"epoch": 0.22,
"eval_accuracy": 0.03152108328214463,
"eval_loss": 6.350229263305664,
"eval_runtime": 110.9053,
"eval_samples_per_second": 32.397,
"eval_steps_per_second": 2.029,
"step": 44000
},
{
"epoch": 0.2225,
"grad_norm": 1.5731582641601562,
"learning_rate": 3.887975e-05,
"loss": 6.3756,
"step": 44500
},
{
"epoch": 1.00173,
"grad_norm": 1.9111318588256836,
"learning_rate": 3.875475e-05,
"loss": 6.3715,
"step": 45000
},
{
"epoch": 1.00173,
"eval_accuracy": 0.03136843551686302,
"eval_loss": 6.31905460357666,
"eval_runtime": 98.2753,
"eval_samples_per_second": 36.561,
"eval_steps_per_second": 2.289,
"step": 45000
},
{
"epoch": 1.00423,
"grad_norm": 1.8441720008850098,
"learning_rate": 3.8629750000000004e-05,
"loss": 6.3367,
"step": 45500
},
{
"epoch": 1.00673,
"grad_norm": 1.122545838356018,
"learning_rate": 3.850475e-05,
"loss": 6.3588,
"step": 46000
},
{
"epoch": 1.00673,
"eval_accuracy": 0.03153237540738382,
"eval_loss": 6.308679103851318,
"eval_runtime": 103.4383,
"eval_samples_per_second": 34.736,
"eval_steps_per_second": 2.175,
"step": 46000
},
{
"epoch": 1.00923,
"grad_norm": 1.5169119834899902,
"learning_rate": 3.8379750000000005e-05,
"loss": 6.3561,
"step": 46500
},
{
"epoch": 1.01173,
"grad_norm": 2.325814723968506,
"learning_rate": 3.825475e-05,
"loss": 6.3802,
"step": 47000
},
{
"epoch": 1.01173,
"eval_accuracy": 0.03152312402766979,
"eval_loss": 6.2924675941467285,
"eval_runtime": 92.1003,
"eval_samples_per_second": 39.012,
"eval_steps_per_second": 2.443,
"step": 47000
},
{
"epoch": 1.01423,
"grad_norm": 2.1370677947998047,
"learning_rate": 3.8129750000000005e-05,
"loss": 6.3738,
"step": 47500
},
{
"epoch": 1.01673,
"grad_norm": 1.6703208684921265,
"learning_rate": 3.8005e-05,
"loss": 6.3708,
"step": 48000
},
{
"epoch": 1.01673,
"eval_accuracy": 0.03175032702947041,
"eval_loss": 6.304358959197998,
"eval_runtime": 92.2936,
"eval_samples_per_second": 38.93,
"eval_steps_per_second": 2.438,
"step": 48000
},
{
"epoch": 1.01923,
"grad_norm": 2.00714111328125,
"learning_rate": 3.788e-05,
"loss": 6.3374,
"step": 48500
},
{
"epoch": 1.02173,
"grad_norm": 1.4548686742782593,
"learning_rate": 3.7755e-05,
"loss": 6.3189,
"step": 49000
},
{
"epoch": 1.02173,
"eval_accuracy": 0.030815393479545948,
"eval_loss": 6.318645477294922,
"eval_runtime": 91.5251,
"eval_samples_per_second": 39.257,
"eval_steps_per_second": 2.458,
"step": 49000
},
{
"epoch": 1.02423,
"grad_norm": 1.719914436340332,
"learning_rate": 3.7630000000000004e-05,
"loss": 6.3527,
"step": 49500
},
{
"epoch": 1.02673,
"grad_norm": 1.7514491081237793,
"learning_rate": 3.7505e-05,
"loss": 6.3545,
"step": 50000
},
{
"epoch": 1.02673,
"eval_accuracy": 0.030726689074052533,
"eval_loss": 6.3023600578308105,
"eval_runtime": 92.8566,
"eval_samples_per_second": 38.694,
"eval_steps_per_second": 2.423,
"step": 50000
},
{
"epoch": 1.02923,
"grad_norm": 1.5919264554977417,
"learning_rate": 3.7380000000000005e-05,
"loss": 6.3373,
"step": 50500
},
{
"epoch": 1.03173,
"grad_norm": 2.918543815612793,
"learning_rate": 3.7255e-05,
"loss": 6.3255,
"step": 51000
},
{
"epoch": 1.03173,
"eval_accuracy": 0.030562341034426697,
"eval_loss": 6.30160665512085,
"eval_runtime": 92.2685,
"eval_samples_per_second": 38.941,
"eval_steps_per_second": 2.439,
"step": 51000
},
{
"epoch": 1.03423,
"grad_norm": 2.352147340774536,
"learning_rate": 3.713025e-05,
"loss": 6.3399,
"step": 51500
},
{
"epoch": 1.03673,
"grad_norm": 2.9179019927978516,
"learning_rate": 3.700525e-05,
"loss": 6.3162,
"step": 52000
},
{
"epoch": 1.03673,
"eval_accuracy": 0.03165182704545624,
"eval_loss": 6.283206462860107,
"eval_runtime": 91.4548,
"eval_samples_per_second": 39.287,
"eval_steps_per_second": 2.46,
"step": 52000
},
{
"epoch": 1.03923,
"grad_norm": 1.9093166589736938,
"learning_rate": 3.6880249999999996e-05,
"loss": 6.2978,
"step": 52500
},
{
"epoch": 1.04173,
"grad_norm": 1.792311668395996,
"learning_rate": 3.675525e-05,
"loss": 6.309,
"step": 53000
},
{
"epoch": 1.04173,
"eval_accuracy": 0.030514859688541417,
"eval_loss": 6.273383140563965,
"eval_runtime": 99.4034,
"eval_samples_per_second": 36.146,
"eval_steps_per_second": 2.264,
"step": 53000
},
{
"epoch": 1.04423,
"grad_norm": 1.311917781829834,
"learning_rate": 3.663025e-05,
"loss": 6.303,
"step": 53500
},
{
"epoch": 1.04673,
"grad_norm": 1.8375986814498901,
"learning_rate": 3.6505500000000005e-05,
"loss": 6.314,
"step": 54000
},
{
"epoch": 1.04673,
"eval_accuracy": 0.03121020971381265,
"eval_loss": 6.250477313995361,
"eval_runtime": 91.4556,
"eval_samples_per_second": 39.287,
"eval_steps_per_second": 2.46,
"step": 54000
},
{
"epoch": 1.04923,
"grad_norm": 1.672000527381897,
"learning_rate": 3.63805e-05,
"loss": 6.296,
"step": 54500
},
{
"epoch": 1.05173,
"grad_norm": 1.5362963676452637,
"learning_rate": 3.6255500000000005e-05,
"loss": 6.293,
"step": 55000
},
{
"epoch": 1.05173,
"eval_accuracy": 0.031686927868488916,
"eval_loss": 6.259158611297607,
"eval_runtime": 90.5801,
"eval_samples_per_second": 39.667,
"eval_steps_per_second": 2.484,
"step": 55000
},
{
"epoch": 1.05423,
"grad_norm": 1.3914135694503784,
"learning_rate": 3.61305e-05,
"loss": 6.2675,
"step": 55500
},
{
"epoch": 1.05673,
"grad_norm": 1.923341989517212,
"learning_rate": 3.6005500000000006e-05,
"loss": 6.2813,
"step": 56000
},
{
"epoch": 1.05673,
"eval_accuracy": 0.0307902242847357,
"eval_loss": 6.2271199226379395,
"eval_runtime": 108.1752,
"eval_samples_per_second": 33.215,
"eval_steps_per_second": 2.08,
"step": 56000
},
{
"epoch": 1.05923,
"grad_norm": 1.8110476732254028,
"learning_rate": 3.58805e-05,
"loss": 6.2753,
"step": 56500
},
{
"epoch": 1.06173,
"grad_norm": 3.4441418647766113,
"learning_rate": 3.575575e-05,
"loss": 6.2781,
"step": 57000
},
{
"epoch": 1.06173,
"eval_accuracy": 0.030471595883408125,
"eval_loss": 6.252005100250244,
"eval_runtime": 119.6594,
"eval_samples_per_second": 30.027,
"eval_steps_per_second": 1.88,
"step": 57000
},
{
"epoch": 1.06423,
"grad_norm": 2.464296340942383,
"learning_rate": 3.563075e-05,
"loss": 6.2624,
"step": 57500
},
{
"epoch": 1.06673,
"grad_norm": 1.7239878177642822,
"learning_rate": 3.5505750000000005e-05,
"loss": 6.2625,
"step": 58000
},
{
"epoch": 1.06673,
"eval_accuracy": 0.03089743144965719,
"eval_loss": 6.220044136047363,
"eval_runtime": 91.2098,
"eval_samples_per_second": 39.393,
"eval_steps_per_second": 2.467,
"step": 58000
},
{
"epoch": 1.06923,
"grad_norm": 3.2937629222869873,
"learning_rate": 3.538075e-05,
"loss": 6.2822,
"step": 58500
},
{
"epoch": 1.07173,
"grad_norm": 2.1419010162353516,
"learning_rate": 3.5255750000000005e-05,
"loss": 6.2638,
"step": 59000
},
{
"epoch": 1.07173,
"eval_accuracy": 0.03013310422563571,
"eval_loss": 6.199001789093018,
"eval_runtime": 92.4858,
"eval_samples_per_second": 38.849,
"eval_steps_per_second": 2.433,
"step": 59000
},
{
"epoch": 1.07423,
"grad_norm": 1.3166654109954834,
"learning_rate": 3.5131e-05,
"loss": 6.2721,
"step": 59500
},
{
"epoch": 1.07673,
"grad_norm": 2.0733723640441895,
"learning_rate": 3.5005999999999997e-05,
"loss": 6.2455,
"step": 60000
},
{
"epoch": 1.07673,
"eval_accuracy": 0.031115110972340414,
"eval_loss": 6.203488349914551,
"eval_runtime": 110.8283,
"eval_samples_per_second": 32.42,
"eval_steps_per_second": 2.03,
"step": 60000
},
{
"epoch": 1.07923,
"grad_norm": 1.416002631187439,
"learning_rate": 3.4881e-05,
"loss": 6.2489,
"step": 60500
},
{
"epoch": 1.08173,
"grad_norm": 2.517613172531128,
"learning_rate": 3.4756e-05,
"loss": 6.253,
"step": 61000
},
{
"epoch": 1.08173,
"eval_accuracy": 0.03135415029818693,
"eval_loss": 6.215968608856201,
"eval_runtime": 101.5668,
"eval_samples_per_second": 35.376,
"eval_steps_per_second": 2.215,
"step": 61000
},
{
"epoch": 1.08423,
"grad_norm": 1.8757721185684204,
"learning_rate": 3.4631e-05,
"loss": 6.2412,
"step": 61500
},
{
"epoch": 1.08673,
"grad_norm": 1.971893072128296,
"learning_rate": 3.4506e-05,
"loss": 6.2408,
"step": 62000
},
{
"epoch": 1.08673,
"eval_accuracy": 0.030135144971160866,
"eval_loss": 6.208563804626465,
"eval_runtime": 92.1622,
"eval_samples_per_second": 38.986,
"eval_steps_per_second": 2.441,
"step": 62000
},
{
"epoch": 1.08923,
"grad_norm": 1.688067078590393,
"learning_rate": 3.4381e-05,
"loss": 6.2355,
"step": 62500
},
{
"epoch": 1.09173,
"grad_norm": 1.7897340059280396,
"learning_rate": 3.4256e-05,
"loss": 6.2332,
"step": 63000
},
{
"epoch": 1.09173,
"eval_accuracy": 0.029759647794532298,
"eval_loss": 6.192452907562256,
"eval_runtime": 91.4457,
"eval_samples_per_second": 39.291,
"eval_steps_per_second": 2.46,
"step": 63000
},
{
"epoch": 1.09423,
"grad_norm": 1.7778252363204956,
"learning_rate": 3.4131250000000006e-05,
"loss": 6.2307,
"step": 63500
},
{
"epoch": 1.09673,
"grad_norm": 2.530224561691284,
"learning_rate": 3.400625e-05,
"loss": 6.2182,
"step": 64000
},
{
"epoch": 1.09673,
"eval_accuracy": 0.030409149070338377,
"eval_loss": 6.166359901428223,
"eval_runtime": 93.8927,
"eval_samples_per_second": 38.267,
"eval_steps_per_second": 2.396,
"step": 64000
},
{
"epoch": 1.09923,
"grad_norm": 2.602126121520996,
"learning_rate": 3.388125e-05,
"loss": 6.2163,
"step": 64500
},
{
"epoch": 1.10173,
"grad_norm": 2.1342318058013916,
"learning_rate": 3.37565e-05,
"loss": 6.2301,
"step": 65000
},
{
"epoch": 1.10173,
"eval_accuracy": 0.030321941211563407,
"eval_loss": 6.158266067504883,
"eval_runtime": 97.9208,
"eval_samples_per_second": 36.693,
"eval_steps_per_second": 2.298,
"step": 65000
},
{
"epoch": 1.10423,
"grad_norm": 1.733469843864441,
"learning_rate": 3.363175e-05,
"loss": 6.2318,
"step": 65500
},
{
"epoch": 1.10673,
"grad_norm": 3.459373950958252,
"learning_rate": 3.3506750000000006e-05,
"loss": 6.2379,
"step": 66000
},
{
"epoch": 1.10673,
"eval_accuracy": 0.030465745746236016,
"eval_loss": 6.1884002685546875,
"eval_runtime": 95.0294,
"eval_samples_per_second": 37.809,
"eval_steps_per_second": 2.368,
"step": 66000
},
{
"epoch": 1.10923,
"grad_norm": 2.1125833988189697,
"learning_rate": 3.338175e-05,
"loss": 6.2044,
"step": 66500
},
{
"epoch": 1.11173,
"grad_norm": 1.7863088846206665,
"learning_rate": 3.3256750000000006e-05,
"loss": 6.2211,
"step": 67000
},
{
"epoch": 1.11173,
"eval_accuracy": 0.03108939757872346,
"eval_loss": 6.161408424377441,
"eval_runtime": 94.8294,
"eval_samples_per_second": 37.889,
"eval_steps_per_second": 2.373,
"step": 67000
},
{
"epoch": 1.11423,
"grad_norm": 2.1072041988372803,
"learning_rate": 3.3131750000000003e-05,
"loss": 6.2086,
"step": 67500
},
{
"epoch": 1.11673,
"grad_norm": 1.647073745727539,
"learning_rate": 3.300675e-05,
"loss": 6.2018,
"step": 68000
},
{
"epoch": 1.11673,
"eval_accuracy": 0.030671997093978373,
"eval_loss": 6.1607513427734375,
"eval_runtime": 91.8853,
"eval_samples_per_second": 39.103,
"eval_steps_per_second": 2.449,
"step": 68000
},
{
"epoch": 1.11923,
"grad_norm": 2.174481153488159,
"learning_rate": 3.288175e-05,
"loss": 6.1837,
"step": 68500
},
{
"epoch": 1.12173,
"grad_norm": 1.5667694807052612,
"learning_rate": 3.275675e-05,
"loss": 6.1969,
"step": 69000
},
{
"epoch": 1.12173,
"eval_accuracy": 0.030524519217360487,
"eval_loss": 6.133298397064209,
"eval_runtime": 140.1649,
"eval_samples_per_second": 25.634,
"eval_steps_per_second": 1.605,
"step": 69000
},
{
"epoch": 1.12423,
"grad_norm": 2.3138535022735596,
"learning_rate": 3.263175e-05,
"loss": 6.2001,
"step": 69500
},
{
"epoch": 1.12673,
"grad_norm": 2.0583817958831787,
"learning_rate": 3.2507000000000006e-05,
"loss": 6.1989,
"step": 70000
},
{
"epoch": 1.12673,
"eval_accuracy": 0.030227250619196205,
"eval_loss": 6.131419658660889,
"eval_runtime": 92.9886,
"eval_samples_per_second": 38.639,
"eval_steps_per_second": 2.42,
"step": 70000
},
{
"epoch": 1.12923,
"grad_norm": 2.3746538162231445,
"learning_rate": 3.2382e-05,
"loss": 6.2039,
"step": 70500
},
{
"epoch": 1.1317300000000001,
"grad_norm": 2.146286964416504,
"learning_rate": 3.2257000000000006e-05,
"loss": 6.2058,
"step": 71000
},
{
"epoch": 1.1317300000000001,
"eval_accuracy": 0.029887942663213724,
"eval_loss": 6.151033878326416,
"eval_runtime": 115.2908,
"eval_samples_per_second": 31.165,
"eval_steps_per_second": 1.952,
"step": 71000
},
{
"epoch": 1.13423,
"grad_norm": 1.7566348314285278,
"learning_rate": 3.2132e-05,
"loss": 6.1963,
"step": 71500
},
{
"epoch": 1.13673,
"grad_norm": 1.7382845878601074,
"learning_rate": 3.2007e-05,
"loss": 6.1994,
"step": 72000
},
{
"epoch": 1.13673,
"eval_accuracy": 0.02953489368736187,
"eval_loss": 6.144207954406738,
"eval_runtime": 106.6363,
"eval_samples_per_second": 33.694,
"eval_steps_per_second": 2.11,
"step": 72000
},
{
"epoch": 1.13923,
"grad_norm": 2.3577260971069336,
"learning_rate": 3.188225e-05,
"loss": 6.1737,
"step": 72500
},
{
"epoch": 1.14173,
"grad_norm": 1.917486310005188,
"learning_rate": 3.175725e-05,
"loss": 6.1715,
"step": 73000
},
{
"epoch": 1.14173,
"eval_accuracy": 0.029909030366973663,
"eval_loss": 6.13955545425415,
"eval_runtime": 92.2027,
"eval_samples_per_second": 38.968,
"eval_steps_per_second": 2.44,
"step": 73000
},
{
"epoch": 1.14423,
"grad_norm": 2.7271180152893066,
"learning_rate": 3.163225e-05,
"loss": 6.1803,
"step": 73500
},
{
"epoch": 1.14673,
"grad_norm": 1.5343501567840576,
"learning_rate": 3.150725e-05,
"loss": 6.1849,
"step": 74000
},
{
"epoch": 1.14673,
"eval_accuracy": 0.029956919861963974,
"eval_loss": 6.108343601226807,
"eval_runtime": 94.132,
"eval_samples_per_second": 38.17,
"eval_steps_per_second": 2.39,
"step": 74000
},
{
"epoch": 1.14923,
"grad_norm": 2.063617706298828,
"learning_rate": 3.138225e-05,
"loss": 6.1601,
"step": 74500
},
{
"epoch": 1.15173,
"grad_norm": 1.7743293046951294,
"learning_rate": 3.1257500000000004e-05,
"loss": 6.1709,
"step": 75000
},
{
"epoch": 1.15173,
"eval_accuracy": 0.030241399788170614,
"eval_loss": 6.083706855773926,
"eval_runtime": 91.1389,
"eval_samples_per_second": 39.423,
"eval_steps_per_second": 2.469,
"step": 75000
},
{
"epoch": 1.15423,
"grad_norm": 2.292107343673706,
"learning_rate": 3.11325e-05,
"loss": 6.1532,
"step": 75500
},
{
"epoch": 1.15673,
"grad_norm": 1.8466393947601318,
"learning_rate": 3.10075e-05,
"loss": 6.1669,
"step": 76000
},
{
"epoch": 1.15673,
"eval_accuracy": 0.02916497454850206,
"eval_loss": 6.0924787521362305,
"eval_runtime": 93.9311,
"eval_samples_per_second": 38.251,
"eval_steps_per_second": 2.395,
"step": 76000
},
{
"epoch": 1.15923,
"grad_norm": 2.8982222080230713,
"learning_rate": 3.08825e-05,
"loss": 6.1602,
"step": 76500
},
{
"epoch": 1.16173,
"grad_norm": 2.5072736740112305,
"learning_rate": 3.07575e-05,
"loss": 6.16,
"step": 77000
},
{
"epoch": 1.16173,
"eval_accuracy": 0.029235040144865724,
"eval_loss": 6.0938825607299805,
"eval_runtime": 98.3356,
"eval_samples_per_second": 36.538,
"eval_steps_per_second": 2.288,
"step": 77000
},
{
"epoch": 1.1642299999999999,
"grad_norm": 1.8945947885513306,
"learning_rate": 3.06325e-05,
"loss": 6.1295,
"step": 77500
},
{
"epoch": 1.16673,
"grad_norm": 3.0348196029663086,
"learning_rate": 3.0507750000000003e-05,
"loss": 6.1637,
"step": 78000
},
{
"epoch": 1.16673,
"eval_accuracy": 0.029682371563979754,
"eval_loss": 6.0949811935424805,
"eval_runtime": 95.4626,
"eval_samples_per_second": 37.638,
"eval_steps_per_second": 2.357,
"step": 78000
},
{
"epoch": 1.16923,
"grad_norm": 1.6715943813323975,
"learning_rate": 3.038275e-05,
"loss": 6.1729,
"step": 78500
},
{
"epoch": 1.17173,
"grad_norm": 1.6270267963409424,
"learning_rate": 3.025775e-05,
"loss": 6.1446,
"step": 79000
},
{
"epoch": 1.17173,
"eval_accuracy": 0.029339254216350318,
"eval_loss": 6.08974552154541,
"eval_runtime": 91.6584,
"eval_samples_per_second": 39.2,
"eval_steps_per_second": 2.455,
"step": 79000
},
{
"epoch": 1.17423,
"grad_norm": 2.177884817123413,
"learning_rate": 3.0132750000000004e-05,
"loss": 6.1339,
"step": 79500
},
{
"epoch": 1.17673,
"grad_norm": 2.0329620838165283,
"learning_rate": 3.000775e-05,
"loss": 6.1231,
"step": 80000
},
{
"epoch": 1.17673,
"eval_accuracy": 0.02976223273886416,
"eval_loss": 6.078031063079834,
"eval_runtime": 98.9799,
"eval_samples_per_second": 36.3,
"eval_steps_per_second": 2.273,
"step": 80000
},
{
"epoch": 1.17923,
"grad_norm": 2.5211639404296875,
"learning_rate": 2.9883e-05,
"loss": 6.1297,
"step": 80500
},
{
"epoch": 1.18173,
"grad_norm": 3.6579315662384033,
"learning_rate": 2.9758000000000002e-05,
"loss": 6.1287,
"step": 81000
},
{
"epoch": 1.18173,
"eval_accuracy": 0.029048379954164854,
"eval_loss": 6.091179370880127,
"eval_runtime": 94.4808,
"eval_samples_per_second": 38.029,
"eval_steps_per_second": 2.381,
"step": 81000
},
{
"epoch": 1.18423,
"grad_norm": 2.1335196495056152,
"learning_rate": 2.9633e-05,
"loss": 6.1302,
"step": 81500
},
{
"epoch": 1.18673,
"grad_norm": 1.8634554147720337,
"learning_rate": 2.9508000000000003e-05,
"loss": 6.1196,
"step": 82000
},
{
"epoch": 1.18673,
"eval_accuracy": 0.028964029139125106,
"eval_loss": 6.08493709564209,
"eval_runtime": 102.0825,
"eval_samples_per_second": 35.197,
"eval_steps_per_second": 2.204,
"step": 82000
},
{
"epoch": 1.18923,
"grad_norm": 3.3065264225006104,
"learning_rate": 2.938325e-05,
"loss": 6.1363,
"step": 82500
},
{
"epoch": 1.19173,
"grad_norm": 2.0532631874084473,
"learning_rate": 2.9258250000000004e-05,
"loss": 6.1136,
"step": 83000
},
{
"epoch": 1.19173,
"eval_accuracy": 0.029438298399171184,
"eval_loss": 6.061553478240967,
"eval_runtime": 114.3059,
"eval_samples_per_second": 31.433,
"eval_steps_per_second": 1.968,
"step": 83000
},
{
"epoch": 1.1942300000000001,
"grad_norm": 1.6184016466140747,
"learning_rate": 2.913325e-05,
"loss": 6.1084,
"step": 83500
},
{
"epoch": 1.19673,
"grad_norm": 3.0933752059936523,
"learning_rate": 2.9008250000000005e-05,
"loss": 6.1135,
"step": 84000
},
{
"epoch": 1.19673,
"eval_accuracy": 0.02910620107737759,
"eval_loss": 6.051595211029053,
"eval_runtime": 97.7568,
"eval_samples_per_second": 36.754,
"eval_steps_per_second": 2.302,
"step": 84000
},
{
"epoch": 1.19923,
"grad_norm": 2.0492806434631348,
"learning_rate": 2.888325e-05,
"loss": 6.1135,
"step": 84500
},
{
"epoch": 1.20173,
"grad_norm": 2.1396772861480713,
"learning_rate": 2.87585e-05,
"loss": 6.1157,
"step": 85000
},
{
"epoch": 1.20173,
"eval_accuracy": 0.029641148504371616,
"eval_loss": 6.051669120788574,
"eval_runtime": 135.3455,
"eval_samples_per_second": 26.547,
"eval_steps_per_second": 1.662,
"step": 85000
},
{
"epoch": 1.20423,
"grad_norm": 1.9825798273086548,
"learning_rate": 2.8633500000000003e-05,
"loss": 6.1003,
"step": 85500
},
{
"epoch": 1.20673,
"grad_norm": 4.120855331420898,
"learning_rate": 2.85085e-05,
"loss": 6.1102,
"step": 86000
},
{
"epoch": 1.20673,
"eval_accuracy": 0.029109058121112804,
"eval_loss": 6.062215328216553,
"eval_runtime": 93.9666,
"eval_samples_per_second": 38.237,
"eval_steps_per_second": 2.394,
"step": 86000
},
{
"epoch": 1.20923,
"grad_norm": 1.7419639825820923,
"learning_rate": 2.83835e-05,
"loss": 6.1052,
"step": 86500
},
{
"epoch": 1.21173,
"grad_norm": 1.9040395021438599,
"learning_rate": 2.825875e-05,
"loss": 6.1218,
"step": 87000
},
{
"epoch": 1.21173,
"eval_accuracy": 0.02853969011959449,
"eval_loss": 6.063874244689941,
"eval_runtime": 91.0034,
"eval_samples_per_second": 39.482,
"eval_steps_per_second": 2.472,
"step": 87000
},
{
"epoch": 1.21423,
"grad_norm": 3.0082874298095703,
"learning_rate": 2.813375e-05,
"loss": 6.1101,
"step": 87500
},
{
"epoch": 1.21673,
"grad_norm": 1.843441367149353,
"learning_rate": 2.8008750000000002e-05,
"loss": 6.1104,
"step": 88000
},
{
"epoch": 1.21673,
"eval_accuracy": 0.029038720425345787,
"eval_loss": 6.0515055656433105,
"eval_runtime": 92.5288,
"eval_samples_per_second": 38.831,
"eval_steps_per_second": 2.432,
"step": 88000
},
{
"epoch": 1.21923,
"grad_norm": 1.9573761224746704,
"learning_rate": 2.7883750000000002e-05,
"loss": 6.1118,
"step": 88500
},
{
"epoch": 1.22173,
"grad_norm": 2.1925711631774902,
"learning_rate": 2.7759000000000003e-05,
"loss": 6.0777,
"step": 89000
},
{
"epoch": 1.22173,
"eval_accuracy": 0.02947503181862398,
"eval_loss": 6.0190510749816895,
"eval_runtime": 91.9724,
"eval_samples_per_second": 39.066,
"eval_steps_per_second": 2.446,
"step": 89000
},
{
"epoch": 2.00096,
"grad_norm": 2.6323490142822266,
"learning_rate": 2.7634e-05,
"loss": 6.1035,
"step": 89500
},
{
"epoch": 2.00346,
"grad_norm": 2.907351016998291,
"learning_rate": 2.7509e-05,
"loss": 6.051,
"step": 90000
},
{
"epoch": 2.00346,
"eval_accuracy": 0.028725125862980264,
"eval_loss": 6.004823207855225,
"eval_runtime": 111.8414,
"eval_samples_per_second": 32.126,
"eval_steps_per_second": 2.012,
"step": 90000
},
{
"epoch": 2.00596,
"grad_norm": 2.1557071208953857,
"learning_rate": 2.7383999999999997e-05,
"loss": 6.0777,
"step": 90500
},
{
"epoch": 2.00846,
"grad_norm": 2.466111183166504,
"learning_rate": 2.7259e-05,
"loss": 6.065,
"step": 91000
},
{
"epoch": 2.00846,
"eval_accuracy": 0.0288474345447879,
"eval_loss": 6.030221462249756,
"eval_runtime": 91.3043,
"eval_samples_per_second": 39.352,
"eval_steps_per_second": 2.464,
"step": 91000
},
{
"epoch": 2.01096,
"grad_norm": 2.554461717605591,
"learning_rate": 2.713425e-05,
"loss": 6.1054,
"step": 91500
},
{
"epoch": 2.01346,
"grad_norm": 2.1461994647979736,
"learning_rate": 2.7009250000000002e-05,
"loss": 6.0941,
"step": 92000
},
{
"epoch": 2.01346,
"eval_accuracy": 0.028430578258849523,
"eval_loss": 6.029834747314453,
"eval_runtime": 137.2429,
"eval_samples_per_second": 26.18,
"eval_steps_per_second": 1.639,
"step": 92000
},
{
"epoch": 2.01596,
"grad_norm": 1.8053221702575684,
"learning_rate": 2.688425e-05,
"loss": 6.0738,
"step": 92500
},
{
"epoch": 2.01846,
"grad_norm": 1.9942346811294556,
"learning_rate": 2.6759250000000003e-05,
"loss": 6.0833,
"step": 93000
},
{
"epoch": 2.01846,
"eval_accuracy": 0.02874675776554691,
"eval_loss": 6.014132022857666,
"eval_runtime": 96.8636,
"eval_samples_per_second": 37.093,
"eval_steps_per_second": 2.323,
"step": 93000
},
{
"epoch": 2.02096,
"grad_norm": 1.6957967281341553,
"learning_rate": 2.663425e-05,
"loss": 6.0429,
"step": 93500
},
{
"epoch": 2.02346,
"grad_norm": 2.633817672729492,
"learning_rate": 2.6509500000000004e-05,
"loss": 6.0816,
"step": 94000
},
{
"epoch": 2.02346,
"eval_accuracy": 0.0281086846646817,
"eval_loss": 6.0137176513671875,
"eval_runtime": 97.3569,
"eval_samples_per_second": 36.905,
"eval_steps_per_second": 2.311,
"step": 94000
},
{
"epoch": 2.02596,
"grad_norm": 2.664257049560547,
"learning_rate": 2.6384750000000002e-05,
"loss": 6.0638,
"step": 94500
},
{
"epoch": 2.02846,
"grad_norm": 2.147036552429199,
"learning_rate": 2.625975e-05,
"loss": 6.0771,
"step": 95000
},
{
"epoch": 2.02846,
"eval_accuracy": 0.029002667254401378,
"eval_loss": 6.028487205505371,
"eval_runtime": 97.1346,
"eval_samples_per_second": 36.99,
"eval_steps_per_second": 2.316,
"step": 95000
},
{
"epoch": 2.03096,
"grad_norm": 2.5634896755218506,
"learning_rate": 2.6134750000000002e-05,
"loss": 6.0474,
"step": 95500
},
{
"epoch": 2.03346,
"grad_norm": 2.277717113494873,
"learning_rate": 2.600975e-05,
"loss": 6.0646,
"step": 96000
},
{
"epoch": 2.03346,
"eval_accuracy": 0.027694413323075186,
"eval_loss": 6.009863376617432,
"eval_runtime": 92.8998,
"eval_samples_per_second": 38.676,
"eval_steps_per_second": 2.422,
"step": 96000
},
{
"epoch": 2.03596,
"grad_norm": 2.0405683517456055,
"learning_rate": 2.5884750000000003e-05,
"loss": 6.0532,
"step": 96500
},
{
"epoch": 2.03846,
"grad_norm": 1.5538651943206787,
"learning_rate": 2.575975e-05,
"loss": 6.0421,
"step": 97000
},
{
"epoch": 2.03846,
"eval_accuracy": 0.029365511808773982,
"eval_loss": 6.003146648406982,
"eval_runtime": 112.494,
"eval_samples_per_second": 31.939,
"eval_steps_per_second": 2.0,
"step": 97000
},
{
"epoch": 2.04096,
"grad_norm": 2.2698843479156494,
"learning_rate": 2.563475e-05,
"loss": 6.0434,
"step": 97500
},
{
"epoch": 2.04346,
"grad_norm": 3.3169667720794678,
"learning_rate": 2.5509749999999997e-05,
"loss": 6.0477,
"step": 98000
},
{
"epoch": 2.04346,
"eval_accuracy": 0.027985151535558972,
"eval_loss": 5.9978766441345215,
"eval_runtime": 110.0502,
"eval_samples_per_second": 32.649,
"eval_steps_per_second": 2.045,
"step": 98000
},
{
"epoch": 2.04596,
"grad_norm": 1.6990258693695068,
"learning_rate": 2.5385000000000002e-05,
"loss": 6.0594,
"step": 98500
},
{
"epoch": 2.04846,
"grad_norm": 2.433408260345459,
"learning_rate": 2.526e-05,
"loss": 6.0317,
"step": 99000
},
{
"epoch": 2.04846,
"eval_accuracy": 0.028621047841197345,
"eval_loss": 5.987879276275635,
"eval_runtime": 97.8847,
"eval_samples_per_second": 36.706,
"eval_steps_per_second": 2.299,
"step": 99000
},
{
"epoch": 2.05096,
"grad_norm": 3.094221830368042,
"learning_rate": 2.5135000000000002e-05,
"loss": 6.0438,
"step": 99500
},
{
"epoch": 2.05346,
"grad_norm": 2.064706802368164,
"learning_rate": 2.501e-05,
"loss": 6.0236,
"step": 100000
},
{
"epoch": 2.05346,
"eval_accuracy": 0.028613837207008466,
"eval_loss": 5.978915691375732,
"eval_runtime": 116.882,
"eval_samples_per_second": 30.74,
"eval_steps_per_second": 1.925,
"step": 100000
},
{
"epoch": 2.05596,
"grad_norm": 2.8103480339050293,
"learning_rate": 2.488525e-05,
"loss": 6.0153,
"step": 100500
},
{
"epoch": 2.05846,
"grad_norm": 3.619741678237915,
"learning_rate": 2.476025e-05,
"loss": 6.0245,
"step": 101000
},
{
"epoch": 2.05846,
"eval_accuracy": 0.028634244662260017,
"eval_loss": 5.98130989074707,
"eval_runtime": 100.1482,
"eval_samples_per_second": 35.877,
"eval_steps_per_second": 2.247,
"step": 101000
},
{
"epoch": 2.06096,
"grad_norm": 2.7197930812835693,
"learning_rate": 2.463525e-05,
"loss": 6.0433,
"step": 101500
},
{
"epoch": 2.06346,
"grad_norm": 2.0345962047576904,
"learning_rate": 2.451025e-05,
"loss": 6.0046,
"step": 102000
},
{
"epoch": 2.06346,
"eval_accuracy": 0.02719443066941215,
"eval_loss": 5.959959030151367,
"eval_runtime": 99.9834,
"eval_samples_per_second": 35.936,
"eval_steps_per_second": 2.25,
"step": 102000
},
{
"epoch": 2.06596,
"grad_norm": 3.054705858230591,
"learning_rate": 2.43855e-05,
"loss": 6.0308,
"step": 102500
},
{
"epoch": 2.06846,
"grad_norm": 3.065950632095337,
"learning_rate": 2.42605e-05,
"loss": 6.0089,
"step": 103000
},
{
"epoch": 2.06846,
"eval_accuracy": 0.028201470561225427,
"eval_loss": 5.969558238983154,
"eval_runtime": 91.0097,
"eval_samples_per_second": 39.479,
"eval_steps_per_second": 2.472,
"step": 103000
},
{
"epoch": 2.07096,
"grad_norm": 2.385056495666504,
"learning_rate": 2.413575e-05,
"loss": 6.0279,
"step": 103500
},
{
"epoch": 2.07346,
"grad_norm": 2.1604409217834473,
"learning_rate": 2.401075e-05,
"loss": 6.0268,
"step": 104000
},
{
"epoch": 2.07346,
"eval_accuracy": 0.028443775079912192,
"eval_loss": 5.9631242752075195,
"eval_runtime": 95.1215,
"eval_samples_per_second": 37.773,
"eval_steps_per_second": 2.365,
"step": 104000
},
{
"epoch": 2.07596,
"grad_norm": 2.4640963077545166,
"learning_rate": 2.388575e-05,
"loss": 6.0192,
"step": 104500
},
{
"epoch": 2.07846,
"grad_norm": 3.9500372409820557,
"learning_rate": 2.376075e-05,
"loss": 6.015,
"step": 105000
},
{
"epoch": 2.07846,
"eval_accuracy": 0.02789304588752363,
"eval_loss": 5.986013412475586,
"eval_runtime": 95.12,
"eval_samples_per_second": 37.773,
"eval_steps_per_second": 2.365,
"step": 105000
},
{
"epoch": 2.08096,
"grad_norm": 2.237582206726074,
"learning_rate": 2.363575e-05,
"loss": 6.0236,
"step": 105500
},
{
"epoch": 2.08346,
"grad_norm": 2.649489641189575,
"learning_rate": 2.3511000000000002e-05,
"loss": 5.9978,
"step": 106000
},
{
"epoch": 2.08346,
"eval_accuracy": 0.02822813630275412,
"eval_loss": 5.95936393737793,
"eval_runtime": 93.2702,
"eval_samples_per_second": 38.522,
"eval_steps_per_second": 2.412,
"step": 106000
},
{
"epoch": 2.08596,
"grad_norm": 2.1337761878967285,
"learning_rate": 2.3386000000000003e-05,
"loss": 6.0018,
"step": 106500
},
{
"epoch": 2.08846,
"grad_norm": 3.1514151096343994,
"learning_rate": 2.3261000000000003e-05,
"loss": 6.0095,
"step": 107000
},
{
"epoch": 2.08846,
"eval_accuracy": 0.028024197799940274,
"eval_loss": 5.966735363006592,
"eval_runtime": 105.6735,
"eval_samples_per_second": 34.001,
"eval_steps_per_second": 2.129,
"step": 107000
},
{
"epoch": 2.09096,
"grad_norm": 2.5281171798706055,
"learning_rate": 2.3136000000000003e-05,
"loss": 5.9958,
"step": 107500
},
{
"epoch": 2.09346,
"grad_norm": 2.90545392036438,
"learning_rate": 2.3011e-05,
"loss": 6.008,
"step": 108000
},
{
"epoch": 2.09346,
"eval_accuracy": 0.02748897827354289,
"eval_loss": 5.956141471862793,
"eval_runtime": 104.8321,
"eval_samples_per_second": 34.274,
"eval_steps_per_second": 2.146,
"step": 108000
},
{
"epoch": 2.09596,
"grad_norm": 2.385185956954956,
"learning_rate": 2.2886e-05,
"loss": 5.9762,
"step": 108500
},
{
"epoch": 2.09846,
"grad_norm": 3.710510015487671,
"learning_rate": 2.2761e-05,
"loss": 5.9912,
"step": 109000
},
{
"epoch": 2.09846,
"eval_accuracy": 0.027843251696709842,
"eval_loss": 5.974761962890625,
"eval_runtime": 92.291,
"eval_samples_per_second": 38.931,
"eval_steps_per_second": 2.438,
"step": 109000
},
{
"epoch": 2.10096,
"grad_norm": 2.344508171081543,
"learning_rate": 2.2636e-05,
"loss": 6.007,
"step": 109500
},
{
"epoch": 2.10346,
"grad_norm": 2.5263593196868896,
"learning_rate": 2.251125e-05,
"loss": 6.0,
"step": 110000
},
{
"epoch": 2.10346,
"eval_accuracy": 0.027863114953154685,
"eval_loss": 5.951306343078613,
"eval_runtime": 107.5477,
"eval_samples_per_second": 33.408,
"eval_steps_per_second": 2.092,
"step": 110000
},
{
"epoch": 2.10596,
"grad_norm": 4.0800981521606445,
"learning_rate": 2.238625e-05,
"loss": 6.0135,
"step": 110500
},
{
"epoch": 2.10846,
"grad_norm": 2.456308126449585,
"learning_rate": 2.226125e-05,
"loss": 5.9981,
"step": 111000
},
{
"epoch": 2.10846,
"eval_accuracy": 0.02768094440260916,
"eval_loss": 5.935766220092773,
"eval_runtime": 92.4942,
"eval_samples_per_second": 38.846,
"eval_steps_per_second": 2.433,
"step": 111000
},
{
"epoch": 2.11096,
"grad_norm": 2.2109882831573486,
"learning_rate": 2.213625e-05,
"loss": 5.9899,
"step": 111500
},
{
"epoch": 2.11346,
"grad_norm": 2.52247953414917,
"learning_rate": 2.20115e-05,
"loss": 5.9877,
"step": 112000
},
{
"epoch": 2.11346,
"eval_accuracy": 0.02789957627320413,
"eval_loss": 5.934952259063721,
"eval_runtime": 107.9372,
"eval_samples_per_second": 33.288,
"eval_steps_per_second": 2.085,
"step": 112000
},
{
"epoch": 2.11596,
"grad_norm": 1.6865503787994385,
"learning_rate": 2.18865e-05,
"loss": 5.9807,
"step": 112500
},
{
"epoch": 2.11846,
"grad_norm": 2.4976794719696045,
"learning_rate": 2.17615e-05,
"loss": 5.9726,
"step": 113000
},
{
"epoch": 2.11846,
"eval_accuracy": 0.027755091490023136,
"eval_loss": 5.934043884277344,
"eval_runtime": 98.7369,
"eval_samples_per_second": 36.39,
"eval_steps_per_second": 2.279,
"step": 113000
},
{
"epoch": 2.12096,
"grad_norm": 2.376749038696289,
"learning_rate": 2.1636500000000002e-05,
"loss": 5.9716,
"step": 113500
},
{
"epoch": 2.12346,
"grad_norm": 4.487443447113037,
"learning_rate": 2.1511500000000002e-05,
"loss": 5.9696,
"step": 114000
},
{
"epoch": 2.12346,
"eval_accuracy": 0.027384900251759974,
"eval_loss": 5.924759864807129,
"eval_runtime": 92.999,
"eval_samples_per_second": 38.635,
"eval_steps_per_second": 2.419,
"step": 114000
},
{
"epoch": 2.12596,
"grad_norm": 3.5680646896362305,
"learning_rate": 2.138675e-05,
"loss": 5.9932,
"step": 114500
},
{
"epoch": 2.12846,
"grad_norm": 2.196323871612549,
"learning_rate": 2.126175e-05,
"loss": 5.9842,
"step": 115000
},
{
"epoch": 2.12846,
"eval_accuracy": 0.027340003850206556,
"eval_loss": 5.95149564743042,
"eval_runtime": 101.0386,
"eval_samples_per_second": 35.561,
"eval_steps_per_second": 2.227,
"step": 115000
},
{
"epoch": 2.13096,
"grad_norm": 2.3321759700775146,
"learning_rate": 2.1136750000000004e-05,
"loss": 5.9865,
"step": 115500
},
{
"epoch": 2.13346,
"grad_norm": 2.0492184162139893,
"learning_rate": 2.101175e-05,
"loss": 5.9919,
"step": 116000
},
{
"epoch": 2.13346,
"eval_accuracy": 0.027651693716748603,
"eval_loss": 5.923666477203369,
"eval_runtime": 106.1587,
"eval_samples_per_second": 33.846,
"eval_steps_per_second": 2.119,
"step": 116000
},
{
"epoch": 2.13596,
"grad_norm": 2.516122817993164,
"learning_rate": 2.088675e-05,
"loss": 5.9827,
"step": 116500
},
{
"epoch": 2.13846,
"grad_norm": 2.3729827404022217,
"learning_rate": 2.0762000000000002e-05,
"loss": 5.972,
"step": 117000
},
{
"epoch": 2.13846,
"eval_accuracy": 0.02696409852447296,
"eval_loss": 5.927760601043701,
"eval_runtime": 122.2666,
"eval_samples_per_second": 29.387,
"eval_steps_per_second": 1.84,
"step": 117000
},
{
"epoch": 2.14096,
"grad_norm": 2.6471846103668213,
"learning_rate": 2.0637e-05,
"loss": 5.9663,
"step": 117500
},
{
"epoch": 2.14346,
"grad_norm": 4.273701190948486,
"learning_rate": 2.0512e-05,
"loss": 5.9715,
"step": 118000
},
{
"epoch": 2.14346,
"eval_accuracy": 0.026757439027625573,
"eval_loss": 5.910974502563477,
"eval_runtime": 99.6535,
"eval_samples_per_second": 36.055,
"eval_steps_per_second": 2.258,
"step": 118000
},
{
"epoch": 2.14596,
"grad_norm": 3.1654696464538574,
"learning_rate": 2.0387e-05,
"loss": 5.9718,
"step": 118500
},
{
"epoch": 2.14846,
"grad_norm": 2.6283257007598877,
"learning_rate": 2.026225e-05,
"loss": 5.9727,
"step": 119000
},
{
"epoch": 2.14846,
"eval_accuracy": 0.027529385034940963,
"eval_loss": 5.913906574249268,
"eval_runtime": 91.3308,
"eval_samples_per_second": 39.341,
"eval_steps_per_second": 2.464,
"step": 119000
},
{
"epoch": 2.15096,
"grad_norm": 2.8760488033294678,
"learning_rate": 2.013725e-05,
"loss": 5.9587,
"step": 119500
},
{
"epoch": 2.15346,
"grad_norm": 2.699265956878662,
"learning_rate": 2.001225e-05,
"loss": 5.9427,
"step": 120000
},
{
"epoch": 2.15346,
"eval_accuracy": 0.02733360951422774,
"eval_loss": 5.927834987640381,
"eval_runtime": 91.7396,
"eval_samples_per_second": 39.165,
"eval_steps_per_second": 2.453,
"step": 120000
},
{
"epoch": 2.15596,
"grad_norm": 2.398200511932373,
"learning_rate": 1.9887250000000002e-05,
"loss": 5.9724,
"step": 120500
},
{
"epoch": 2.15846,
"grad_norm": 2.2844276428222656,
"learning_rate": 1.9762250000000002e-05,
"loss": 5.9514,
"step": 121000
},
{
"epoch": 2.15846,
"eval_accuracy": 0.02694165032369625,
"eval_loss": 5.9226508140563965,
"eval_runtime": 92.7015,
"eval_samples_per_second": 38.759,
"eval_steps_per_second": 2.427,
"step": 121000
},
{
"epoch": 2.16096,
"grad_norm": 3.6655936241149902,
"learning_rate": 1.96375e-05,
"loss": 5.9701,
"step": 121500
},
{
"epoch": 2.16346,
"grad_norm": 2.5577406883239746,
"learning_rate": 1.95125e-05,
"loss": 5.9217,
"step": 122000
},
{
"epoch": 2.16346,
"eval_accuracy": 0.02731524280450134,
"eval_loss": 5.9304680824279785,
"eval_runtime": 91.7545,
"eval_samples_per_second": 39.159,
"eval_steps_per_second": 2.452,
"step": 122000
},
{
"epoch": 2.16596,
"grad_norm": 3.617988109588623,
"learning_rate": 1.93875e-05,
"loss": 5.9587,
"step": 122500
},
{
"epoch": 2.16846,
"grad_norm": 2.8228862285614014,
"learning_rate": 1.92625e-05,
"loss": 5.9862,
"step": 123000
},
{
"epoch": 2.16846,
"eval_accuracy": 0.02671240657637048,
"eval_loss": 5.909170150756836,
"eval_runtime": 92.8579,
"eval_samples_per_second": 38.694,
"eval_steps_per_second": 2.423,
"step": 123000
},
{
"epoch": 2.17096,
"grad_norm": 5.492008209228516,
"learning_rate": 1.91375e-05,
"loss": 5.9472,
"step": 123500
},
{
"epoch": 2.17346,
"grad_norm": 2.1293392181396484,
"learning_rate": 1.9012750000000002e-05,
"loss": 5.9388,
"step": 124000
},
{
"epoch": 2.17346,
"eval_accuracy": 0.0270023284906442,
"eval_loss": 5.889882564544678,
"eval_runtime": 122.3828,
"eval_samples_per_second": 29.359,
"eval_steps_per_second": 1.838,
"step": 124000
},
{
"epoch": 2.17596,
"grad_norm": 2.5894041061401367,
"learning_rate": 1.8887750000000002e-05,
"loss": 5.9259,
"step": 124500
},
{
"epoch": 2.17846,
"grad_norm": 3.7750794887542725,
"learning_rate": 1.8762750000000003e-05,
"loss": 5.9429,
"step": 125000
},
{
"epoch": 2.17846,
"eval_accuracy": 0.02674982024433166,
"eval_loss": 5.895012855529785,
"eval_runtime": 95.7852,
"eval_samples_per_second": 37.511,
"eval_steps_per_second": 2.349,
"step": 125000
},
{
"epoch": 2.18096,
"grad_norm": 2.4733965396881104,
"learning_rate": 1.8638e-05,
"loss": 5.9506,
"step": 125500
},
{
"epoch": 2.18346,
"grad_norm": 2.5394270420074463,
"learning_rate": 1.8513e-05,
"loss": 5.9317,
"step": 126000
},
{
"epoch": 2.18346,
"eval_accuracy": 0.026840429345648554,
"eval_loss": 5.910998344421387,
"eval_runtime": 95.7395,
"eval_samples_per_second": 37.529,
"eval_steps_per_second": 2.35,
"step": 126000
},
{
"epoch": 2.18596,
"grad_norm": 2.7636115550994873,
"learning_rate": 1.8388e-05,
"loss": 5.9325,
"step": 126500
},
{
"epoch": 2.18846,
"grad_norm": 3.0258498191833496,
"learning_rate": 1.8263e-05,
"loss": 5.9367,
"step": 127000
},
{
"epoch": 2.18846,
"eval_accuracy": 0.026817436946065136,
"eval_loss": 5.86806583404541,
"eval_runtime": 93.0877,
"eval_samples_per_second": 38.598,
"eval_steps_per_second": 2.417,
"step": 127000
},
{
"epoch": 2.19096,
"grad_norm": 2.4709484577178955,
"learning_rate": 1.8138e-05,
"loss": 5.9368,
"step": 127500
},
{
"epoch": 2.19346,
"grad_norm": 4.170524597167969,
"learning_rate": 1.8013000000000002e-05,
"loss": 5.9273,
"step": 128000
},
{
"epoch": 2.19346,
"eval_accuracy": 0.027440544579745874,
"eval_loss": 5.880221366882324,
"eval_runtime": 92.6132,
"eval_samples_per_second": 38.796,
"eval_steps_per_second": 2.429,
"step": 128000
},
{
"epoch": 2.19596,
"grad_norm": 3.9754855632781982,
"learning_rate": 1.7888000000000002e-05,
"loss": 5.9188,
"step": 128500
},
{
"epoch": 2.19846,
"grad_norm": 2.676684856414795,
"learning_rate": 1.7763000000000003e-05,
"loss": 5.934,
"step": 129000
},
{
"epoch": 2.19846,
"eval_accuracy": 0.026782064023629114,
"eval_loss": 5.897346496582031,
"eval_runtime": 92.8542,
"eval_samples_per_second": 38.695,
"eval_steps_per_second": 2.423,
"step": 129000
},
{
"epoch": 2.20096,
"grad_norm": 3.0156376361846924,
"learning_rate": 1.763825e-05,
"loss": 5.9191,
"step": 129500
},
{
"epoch": 2.20346,
"grad_norm": 2.913367509841919,
"learning_rate": 1.751325e-05,
"loss": 5.9229,
"step": 130000
},
{
"epoch": 2.20346,
"eval_accuracy": 0.027015253212303518,
"eval_loss": 5.891612529754639,
"eval_runtime": 91.7301,
"eval_samples_per_second": 39.169,
"eval_steps_per_second": 2.453,
"step": 130000
},
{
"epoch": 2.20596,
"grad_norm": 3.0923585891723633,
"learning_rate": 1.738825e-05,
"loss": 5.9225,
"step": 130500
},
{
"epoch": 2.20846,
"grad_norm": 2.9327263832092285,
"learning_rate": 1.726325e-05,
"loss": 5.942,
"step": 131000
},
{
"epoch": 2.20846,
"eval_accuracy": 0.026604791262343958,
"eval_loss": 5.896469593048096,
"eval_runtime": 97.1482,
"eval_samples_per_second": 36.985,
"eval_steps_per_second": 2.316,
"step": 131000
},
{
"epoch": 2.21096,
"grad_norm": 2.434720993041992,
"learning_rate": 1.7138500000000002e-05,
"loss": 5.9244,
"step": 131500
},
{
"epoch": 2.21346,
"grad_norm": 3.1938490867614746,
"learning_rate": 1.7013500000000002e-05,
"loss": 5.9224,
"step": 132000
},
{
"epoch": 2.21346,
"eval_accuracy": 0.02676478571151613,
"eval_loss": 5.8799638748168945,
"eval_runtime": 101.684,
"eval_samples_per_second": 35.335,
"eval_steps_per_second": 2.213,
"step": 132000
},
{
"epoch": 2.21596,
"grad_norm": 2.5575222969055176,
"learning_rate": 1.6888500000000003e-05,
"loss": 5.933,
"step": 132500
},
{
"epoch": 2.21846,
"grad_norm": 2.610675573348999,
"learning_rate": 1.6763500000000003e-05,
"loss": 5.936,
"step": 133000
},
{
"epoch": 2.21846,
"eval_accuracy": 0.026946139963851593,
"eval_loss": 5.869258880615234,
"eval_runtime": 94.0477,
"eval_samples_per_second": 38.204,
"eval_steps_per_second": 2.392,
"step": 133000
},
{
"epoch": 2.22096,
"grad_norm": 2.5007147789001465,
"learning_rate": 1.663875e-05,
"loss": 5.9006,
"step": 133500
},
{
"epoch": 3.00019,
"grad_norm": 2.4350554943084717,
"learning_rate": 1.651375e-05,
"loss": 5.9129,
"step": 134000
},
{
"epoch": 3.00019,
"eval_accuracy": 0.02651254956460694,
"eval_loss": 5.850106239318848,
"eval_runtime": 90.9885,
"eval_samples_per_second": 39.489,
"eval_steps_per_second": 2.473,
"step": 134000
},
{
"epoch": 3.00269,
"grad_norm": 2.9564504623413086,
"learning_rate": 1.638875e-05,
"loss": 5.8888,
"step": 134500
},
{
"epoch": 3.00519,
"grad_norm": 3.2191567420959473,
"learning_rate": 1.6263749999999998e-05,
"loss": 5.8787,
"step": 135000
},
{
"epoch": 3.00519,
"eval_accuracy": 0.0267069645883034,
"eval_loss": 5.870194911956787,
"eval_runtime": 93.5455,
"eval_samples_per_second": 38.409,
"eval_steps_per_second": 2.405,
"step": 135000
},
{
"epoch": 3.00769,
"grad_norm": 3.1718389987945557,
"learning_rate": 1.613875e-05,
"loss": 5.8943,
"step": 135500
},
{
"epoch": 3.01019,
"grad_norm": 3.5635807514190674,
"learning_rate": 1.601425e-05,
"loss": 5.9171,
"step": 136000
},
{
"epoch": 3.01019,
"eval_accuracy": 0.0268892711885506,
"eval_loss": 5.844875812530518,
"eval_runtime": 92.3095,
"eval_samples_per_second": 38.923,
"eval_steps_per_second": 2.437,
"step": 136000
},
{
"epoch": 3.01269,
"grad_norm": 4.3343305587768555,
"learning_rate": 1.588925e-05,
"loss": 5.9144,
"step": 136500
},
{
"epoch": 3.01519,
"grad_norm": 2.6254384517669678,
"learning_rate": 1.576425e-05,
"loss": 5.8931,
"step": 137000
},
{
"epoch": 3.01519,
"eval_accuracy": 0.026976070898220537,
"eval_loss": 5.845653057098389,
"eval_runtime": 92.0247,
"eval_samples_per_second": 39.044,
"eval_steps_per_second": 2.445,
"step": 137000
},
{
"epoch": 3.01769,
"grad_norm": 3.994947910308838,
"learning_rate": 1.563925e-05,
"loss": 5.9248,
"step": 137500
},
{
"epoch": 3.02019,
"grad_norm": 3.4500911235809326,
"learning_rate": 1.5514249999999998e-05,
"loss": 5.8612,
"step": 138000
},
{
"epoch": 3.02019,
"eval_accuracy": 0.026333100008094957,
"eval_loss": 5.863004684448242,
"eval_runtime": 97.9674,
"eval_samples_per_second": 36.675,
"eval_steps_per_second": 2.297,
"step": 138000
},
{
"epoch": 3.02269,
"grad_norm": 2.8210179805755615,
"learning_rate": 1.538925e-05,
"loss": 5.8873,
"step": 138500
},
{
"epoch": 3.02519,
"grad_norm": 3.462085723876953,
"learning_rate": 1.526425e-05,
"loss": 5.8897,
"step": 139000
},
{
"epoch": 3.02519,
"eval_accuracy": 0.02670546804158495,
"eval_loss": 5.849697589874268,
"eval_runtime": 91.473,
"eval_samples_per_second": 39.279,
"eval_steps_per_second": 2.46,
"step": 139000
},
{
"epoch": 3.02769,
"grad_norm": 3.4429280757904053,
"learning_rate": 1.5139250000000002e-05,
"loss": 5.8915,
"step": 139500
},
{
"epoch": 3.03019,
"grad_norm": 2.4032459259033203,
"learning_rate": 1.50145e-05,
"loss": 5.8772,
"step": 140000
},
{
"epoch": 3.03019,
"eval_accuracy": 0.026319903187032288,
"eval_loss": 5.817691802978516,
"eval_runtime": 116.7881,
"eval_samples_per_second": 30.765,
"eval_steps_per_second": 1.927,
"step": 140000
},
{
"epoch": 3.03269,
"grad_norm": 2.4531261920928955,
"learning_rate": 1.48895e-05,
"loss": 5.8735,
"step": 140500
},
{
"epoch": 3.03519,
"grad_norm": 2.7715256214141846,
"learning_rate": 1.47645e-05,
"loss": 5.8774,
"step": 141000
},
{
"epoch": 3.03519,
"eval_accuracy": 0.026607376206675824,
"eval_loss": 5.821176052093506,
"eval_runtime": 92.8442,
"eval_samples_per_second": 38.699,
"eval_steps_per_second": 2.423,
"step": 141000
},
{
"epoch": 3.03769,
"grad_norm": 3.399043560028076,
"learning_rate": 1.4639750000000002e-05,
"loss": 5.8561,
"step": 141500
},
{
"epoch": 3.04019,
"grad_norm": 3.1285316944122314,
"learning_rate": 1.4514750000000002e-05,
"loss": 5.8694,
"step": 142000
},
{
"epoch": 3.04019,
"eval_accuracy": 0.026714855471000665,
"eval_loss": 5.837355613708496,
"eval_runtime": 98.834,
"eval_samples_per_second": 36.354,
"eval_steps_per_second": 2.277,
"step": 142000
},
{
"epoch": 3.04269,
"grad_norm": 3.5642755031585693,
"learning_rate": 1.4389750000000002e-05,
"loss": 5.8642,
"step": 142500
},
{
"epoch": 3.04519,
"grad_norm": 2.953099489212036,
"learning_rate": 1.4264750000000001e-05,
"loss": 5.8561,
"step": 143000
},
{
"epoch": 3.04519,
"eval_accuracy": 0.02667281611318247,
"eval_loss": 5.792807102203369,
"eval_runtime": 93.9281,
"eval_samples_per_second": 38.253,
"eval_steps_per_second": 2.395,
"step": 143000
},
{
"epoch": 3.04769,
"grad_norm": 2.8684558868408203,
"learning_rate": 1.4139750000000001e-05,
"loss": 5.8442,
"step": 143500
},
{
"epoch": 3.05019,
"grad_norm": 2.2189698219299316,
"learning_rate": 1.4014750000000002e-05,
"loss": 5.8658,
"step": 144000
},
{
"epoch": 3.05019,
"eval_accuracy": 0.02694872490818346,
"eval_loss": 5.79358434677124,
"eval_runtime": 93.4176,
"eval_samples_per_second": 38.462,
"eval_steps_per_second": 2.409,
"step": 144000
},
{
"epoch": 3.05269,
"grad_norm": 2.659515619277954,
"learning_rate": 1.3889750000000002e-05,
"loss": 5.8347,
"step": 144500
},
{
"epoch": 3.05519,
"grad_norm": 2.62570858001709,
"learning_rate": 1.376475e-05,
"loss": 5.8295,
"step": 145000
},
{
"epoch": 3.05519,
"eval_accuracy": 0.026485203574569863,
"eval_loss": 5.795611381530762,
"eval_runtime": 94.2389,
"eval_samples_per_second": 38.127,
"eval_steps_per_second": 2.388,
"step": 145000
},
{
"epoch": 3.05769,
"grad_norm": 3.243245840072632,
"learning_rate": 1.364e-05,
"loss": 5.8247,
"step": 145500
},
{
"epoch": 3.06019,
"grad_norm": 3.164580821990967,
"learning_rate": 1.3515e-05,
"loss": 5.8444,
"step": 146000
},
{
"epoch": 3.06019,
"eval_accuracy": 0.026442620017944955,
"eval_loss": 5.792357444763184,
"eval_runtime": 91.4103,
"eval_samples_per_second": 39.306,
"eval_steps_per_second": 2.461,
"step": 146000
},
{
"epoch": 3.06269,
"grad_norm": 2.9992387294769287,
"learning_rate": 1.339e-05,
"loss": 5.8187,
"step": 146500
},
{
"epoch": 3.06519,
"grad_norm": 2.9133353233337402,
"learning_rate": 1.3265e-05,
"loss": 5.8318,
"step": 147000
},
{
"epoch": 3.06519,
"eval_accuracy": 0.026461258827074705,
"eval_loss": 5.765110015869141,
"eval_runtime": 93.7438,
"eval_samples_per_second": 38.328,
"eval_steps_per_second": 2.4,
"step": 147000
},
{
"epoch": 3.06769,
"grad_norm": 3.1825642585754395,
"learning_rate": 1.314e-05,
"loss": 5.8185,
"step": 147500
},
{
"epoch": 3.07019,
"grad_norm": 3.5604467391967773,
"learning_rate": 1.3015e-05,
"loss": 5.8323,
"step": 148000
},
{
"epoch": 3.07019,
"eval_accuracy": 0.026824783629955697,
"eval_loss": 5.770131587982178,
"eval_runtime": 92.029,
"eval_samples_per_second": 39.042,
"eval_steps_per_second": 2.445,
"step": 148000
},
{
"epoch": 3.07269,
"grad_norm": 3.883007049560547,
"learning_rate": 1.2889999999999999e-05,
"loss": 5.8111,
"step": 148500
},
{
"epoch": 3.07519,
"grad_norm": 8.257863998413086,
"learning_rate": 1.2765250000000001e-05,
"loss": 5.8239,
"step": 149000
},
{
"epoch": 3.07519,
"eval_accuracy": 0.026383438397715453,
"eval_loss": 5.779338359832764,
"eval_runtime": 92.1302,
"eval_samples_per_second": 38.999,
"eval_steps_per_second": 2.442,
"step": 149000
},
{
"epoch": 3.07769,
"grad_norm": 2.9781956672668457,
"learning_rate": 1.2640250000000002e-05,
"loss": 5.8073,
"step": 149500
},
{
"epoch": 3.08019,
"grad_norm": 3.369654893875122,
"learning_rate": 1.251525e-05,
"loss": 5.8057,
"step": 150000
},
{
"epoch": 3.08019,
"eval_accuracy": 0.027368438237857055,
"eval_loss": 5.7676262855529785,
"eval_runtime": 94.7379,
"eval_samples_per_second": 37.926,
"eval_steps_per_second": 2.375,
"step": 150000
},
{
"epoch": 3.08269,
"grad_norm": 2.6309428215026855,
"learning_rate": 1.239025e-05,
"loss": 5.798,
"step": 150500
},
{
"epoch": 3.08519,
"grad_norm": 4.712452411651611,
"learning_rate": 1.2265250000000001e-05,
"loss": 5.7818,
"step": 151000
},
{
"epoch": 3.08519,
"eval_accuracy": 0.026996750452875445,
"eval_loss": 5.756935119628906,
"eval_runtime": 93.4756,
"eval_samples_per_second": 38.438,
"eval_steps_per_second": 2.407,
"step": 151000
},
{
"epoch": 3.08769,
"grad_norm": 3.6165318489074707,
"learning_rate": 1.21405e-05,
"loss": 5.7959,
"step": 151500
},
{
"epoch": 3.09019,
"grad_norm": 3.728891134262085,
"learning_rate": 1.20155e-05,
"loss": 5.773,
"step": 152000
},
{
"epoch": 3.09019,
"eval_accuracy": 0.026722338204592903,
"eval_loss": 5.740837097167969,
"eval_runtime": 94.7277,
"eval_samples_per_second": 37.93,
"eval_steps_per_second": 2.375,
"step": 152000
},
{
"epoch": 3.09269,
"grad_norm": 3.545217990875244,
"learning_rate": 1.18905e-05,
"loss": 5.7891,
"step": 152500
},
{
"epoch": 3.09519,
"grad_norm": 3.074216842651367,
"learning_rate": 1.17655e-05,
"loss": 5.7491,
"step": 153000
},
{
"epoch": 3.09519,
"eval_accuracy": 0.027355241416794383,
"eval_loss": 5.720602989196777,
"eval_runtime": 97.4705,
"eval_samples_per_second": 36.862,
"eval_steps_per_second": 2.308,
"step": 153000
},
{
"epoch": 3.09769,
"grad_norm": 4.110429763793945,
"learning_rate": 1.16405e-05,
"loss": 5.7724,
"step": 153500
},
{
"epoch": 3.10019,
"grad_norm": 3.007507801055908,
"learning_rate": 1.151575e-05,
"loss": 5.7655,
"step": 154000
},
{
"epoch": 3.10019,
"eval_accuracy": 0.026848184178644142,
"eval_loss": 5.7095208168029785,
"eval_runtime": 98.9114,
"eval_samples_per_second": 36.325,
"eval_steps_per_second": 2.275,
"step": 154000
},
{
"epoch": 3.10269,
"grad_norm": 3.1194117069244385,
"learning_rate": 1.139075e-05,
"loss": 5.7622,
"step": 154500
},
{
"epoch": 3.10519,
"grad_norm": 4.109334945678711,
"learning_rate": 1.126575e-05,
"loss": 5.7706,
"step": 155000
},
{
"epoch": 3.10519,
"eval_accuracy": 0.02715552045473252,
"eval_loss": 5.707860469818115,
"eval_runtime": 126.3537,
"eval_samples_per_second": 28.436,
"eval_steps_per_second": 1.781,
"step": 155000
},
{
"epoch": 3.10769,
"grad_norm": 3.767932415008545,
"learning_rate": 1.1140750000000002e-05,
"loss": 5.7639,
"step": 155500
},
{
"epoch": 3.11019,
"grad_norm": 4.084874629974365,
"learning_rate": 1.1016e-05,
"loss": 5.7379,
"step": 156000
},
{
"epoch": 3.11019,
"eval_accuracy": 0.027283815323413948,
"eval_loss": 5.691872596740723,
"eval_runtime": 93.2144,
"eval_samples_per_second": 38.546,
"eval_steps_per_second": 2.414,
"step": 156000
},
{
"epoch": 3.11269,
"grad_norm": 6.118876934051514,
"learning_rate": 1.0891000000000001e-05,
"loss": 5.7411,
"step": 156500
},
{
"epoch": 3.11519,
"grad_norm": 4.562450885772705,
"learning_rate": 1.0766000000000002e-05,
"loss": 5.7374,
"step": 157000
},
{
"epoch": 3.11519,
"eval_accuracy": 0.027440680629447548,
"eval_loss": 5.667840957641602,
"eval_runtime": 91.1779,
"eval_samples_per_second": 39.406,
"eval_steps_per_second": 2.468,
"step": 157000
},
{
"epoch": 3.11769,
"grad_norm": 3.47501277923584,
"learning_rate": 1.0641e-05,
"loss": 5.7177,
"step": 157500
},
{
"epoch": 3.12019,
"grad_norm": 3.262887477874756,
"learning_rate": 1.0516e-05,
"loss": 5.7077,
"step": 158000
},
{
"epoch": 3.12019,
"eval_accuracy": 0.026955663442968986,
"eval_loss": 5.6481852531433105,
"eval_runtime": 93.6492,
"eval_samples_per_second": 38.367,
"eval_steps_per_second": 2.403,
"step": 158000
},
{
"epoch": 3.12269,
"grad_norm": 3.8364920616149902,
"learning_rate": 1.0391250000000002e-05,
"loss": 5.7026,
"step": 158500
},
{
"epoch": 3.12519,
"grad_norm": 3.216892719268799,
"learning_rate": 1.026625e-05,
"loss": 5.7176,
"step": 159000
},
{
"epoch": 3.12519,
"eval_accuracy": 0.027355649565899413,
"eval_loss": 5.614231109619141,
"eval_runtime": 128.5228,
"eval_samples_per_second": 27.956,
"eval_steps_per_second": 1.751,
"step": 159000
},
{
"epoch": 3.12769,
"grad_norm": 3.7914490699768066,
"learning_rate": 1.014125e-05,
"loss": 5.7055,
"step": 159500
},
{
"epoch": 3.13019,
"grad_norm": 3.515538215637207,
"learning_rate": 1.0016250000000001e-05,
"loss": 5.7077,
"step": 160000
},
{
"epoch": 3.13019,
"eval_accuracy": 0.027464081178135997,
"eval_loss": 5.629947185516357,
"eval_runtime": 97.5707,
"eval_samples_per_second": 36.825,
"eval_steps_per_second": 2.306,
"step": 160000
},
{
"epoch": 3.13269,
"grad_norm": 4.886687278747559,
"learning_rate": 9.8915e-06,
"loss": 5.6998,
"step": 160500
},
{
"epoch": 3.13519,
"grad_norm": 3.584073305130005,
"learning_rate": 9.7665e-06,
"loss": 5.6882,
"step": 161000
},
{
"epoch": 3.13519,
"eval_accuracy": 0.027459183388875624,
"eval_loss": 5.591392993927002,
"eval_runtime": 93.7876,
"eval_samples_per_second": 38.31,
"eval_steps_per_second": 2.399,
"step": 161000
},
{
"epoch": 3.13769,
"grad_norm": 2.614473581314087,
"learning_rate": 9.6415e-06,
"loss": 5.6896,
"step": 161500
},
{
"epoch": 3.14019,
"grad_norm": 3.612001419067383,
"learning_rate": 9.516500000000001e-06,
"loss": 5.6513,
"step": 162000
},
{
"epoch": 3.14019,
"eval_accuracy": 0.02719810401135743,
"eval_loss": 5.585655689239502,
"eval_runtime": 127.1026,
"eval_samples_per_second": 28.268,
"eval_steps_per_second": 1.77,
"step": 162000
},
{
"epoch": 3.14269,
"grad_norm": 3.9107935428619385,
"learning_rate": 9.39175e-06,
"loss": 5.657,
"step": 162500
},
{
"epoch": 3.14519,
"grad_norm": 3.753208637237549,
"learning_rate": 9.26675e-06,
"loss": 5.6516,
"step": 163000
},
{
"epoch": 3.14519,
"eval_accuracy": 0.027440544579745874,
"eval_loss": 5.5584001541137695,
"eval_runtime": 97.5764,
"eval_samples_per_second": 36.822,
"eval_steps_per_second": 2.306,
"step": 163000
},
{
"epoch": 3.14769,
"grad_norm": 3.1370689868927,
"learning_rate": 9.141750000000001e-06,
"loss": 5.6678,
"step": 163500
},
{
"epoch": 3.15019,
"grad_norm": 3.1537926197052,
"learning_rate": 9.01675e-06,
"loss": 5.6158,
"step": 164000
},
{
"epoch": 3.15019,
"eval_accuracy": 0.028059434672674622,
"eval_loss": 5.522320747375488,
"eval_runtime": 92.5826,
"eval_samples_per_second": 38.809,
"eval_steps_per_second": 2.43,
"step": 164000
},
{
"epoch": 3.1526899999999998,
"grad_norm": 3.2936768531799316,
"learning_rate": 8.89175e-06,
"loss": 5.6305,
"step": 164500
},
{
"epoch": 3.15519,
"grad_norm": 5.041363716125488,
"learning_rate": 8.767000000000001e-06,
"loss": 5.6235,
"step": 165000
},
{
"epoch": 3.15519,
"eval_accuracy": 0.027728289698792763,
"eval_loss": 5.527610778808594,
"eval_runtime": 92.1615,
"eval_samples_per_second": 38.986,
"eval_steps_per_second": 2.441,
"step": 165000
},
{
"epoch": 3.15769,
"grad_norm": 4.752361297607422,
"learning_rate": 8.642e-06,
"loss": 5.5997,
"step": 165500
},
{
"epoch": 3.16019,
"grad_norm": 4.261384963989258,
"learning_rate": 8.517e-06,
"loss": 5.6308,
"step": 166000
},
{
"epoch": 3.16019,
"eval_accuracy": 0.02816977098073468,
"eval_loss": 5.499186038970947,
"eval_runtime": 92.5045,
"eval_samples_per_second": 38.841,
"eval_steps_per_second": 2.432,
"step": 166000
},
{
"epoch": 3.16269,
"grad_norm": 4.297583103179932,
"learning_rate": 8.392e-06,
"loss": 5.5752,
"step": 166500
},
{
"epoch": 3.16519,
"grad_norm": 7.3237624168396,
"learning_rate": 8.26725e-06,
"loss": 5.5782,
"step": 167000
},
{
"epoch": 3.16519,
"eval_accuracy": 0.027606525215791832,
"eval_loss": 5.48903226852417,
"eval_runtime": 92.0464,
"eval_samples_per_second": 39.035,
"eval_steps_per_second": 2.444,
"step": 167000
},
{
"epoch": 3.16769,
"grad_norm": 3.426760196685791,
"learning_rate": 8.14225e-06,
"loss": 5.6192,
"step": 167500
},
{
"epoch": 3.17019,
"grad_norm": 3.8471386432647705,
"learning_rate": 8.01725e-06,
"loss": 5.5723,
"step": 168000
},
{
"epoch": 3.17019,
"eval_accuracy": 0.027942976128039095,
"eval_loss": 5.443605422973633,
"eval_runtime": 90.8581,
"eval_samples_per_second": 39.545,
"eval_steps_per_second": 2.476,
"step": 168000
},
{
"epoch": 3.1726900000000002,
"grad_norm": 3.767125368118286,
"learning_rate": 7.89225e-06,
"loss": 5.5569,
"step": 168500
},
{
"epoch": 3.17519,
"grad_norm": 4.584527015686035,
"learning_rate": 7.767250000000001e-06,
"loss": 5.5417,
"step": 169000
},
{
"epoch": 3.17519,
"eval_accuracy": 0.028352213630683562,
"eval_loss": 5.416599750518799,
"eval_runtime": 91.2582,
"eval_samples_per_second": 39.372,
"eval_steps_per_second": 2.466,
"step": 169000
},
{
"epoch": 3.17769,
"grad_norm": 5.043390274047852,
"learning_rate": 7.642500000000002e-06,
"loss": 5.535,
"step": 169500
},
{
"epoch": 3.18019,
"grad_norm": 7.203840732574463,
"learning_rate": 7.517500000000001e-06,
"loss": 5.5346,
"step": 170000
},
{
"epoch": 3.18019,
"eval_accuracy": 0.028451802012311136,
"eval_loss": 5.403586387634277,
"eval_runtime": 96.555,
"eval_samples_per_second": 37.212,
"eval_steps_per_second": 2.33,
"step": 170000
},
{
"epoch": 3.18269,
"grad_norm": 5.778980731964111,
"learning_rate": 7.392500000000001e-06,
"loss": 5.521,
"step": 170500
},
{
"epoch": 3.18519,
"grad_norm": 3.5962352752685547,
"learning_rate": 7.2675e-06,
"loss": 5.5068,
"step": 171000
},
{
"epoch": 3.18519,
"eval_accuracy": 0.028494929767742753,
"eval_loss": 5.36637544631958,
"eval_runtime": 92.7171,
"eval_samples_per_second": 38.752,
"eval_steps_per_second": 2.427,
"step": 171000
},
{
"epoch": 3.18769,
"grad_norm": 3.231135606765747,
"learning_rate": 7.142500000000001e-06,
"loss": 5.5068,
"step": 171500
},
{
"epoch": 3.19019,
"grad_norm": 4.50712776184082,
"learning_rate": 7.017750000000001e-06,
"loss": 5.5024,
"step": 172000
},
{
"epoch": 3.19019,
"eval_accuracy": 0.0285840423223412,
"eval_loss": 5.33723783493042,
"eval_runtime": 91.3348,
"eval_samples_per_second": 39.339,
"eval_steps_per_second": 2.463,
"step": 172000
},
{
"epoch": 3.19269,
"grad_norm": 5.674015522003174,
"learning_rate": 6.89275e-06,
"loss": 5.479,
"step": 172500
},
{
"epoch": 3.19519,
"grad_norm": 5.207662582397461,
"learning_rate": 6.7677500000000006e-06,
"loss": 5.4611,
"step": 173000
},
{
"epoch": 3.19519,
"eval_accuracy": 0.02859955198833238,
"eval_loss": 5.306539058685303,
"eval_runtime": 92.148,
"eval_samples_per_second": 38.992,
"eval_steps_per_second": 2.442,
"step": 173000
},
{
"epoch": 3.19769,
"grad_norm": 4.15488338470459,
"learning_rate": 6.64275e-06,
"loss": 5.4725,
"step": 173500
},
{
"epoch": 3.20019,
"grad_norm": 3.6545050144195557,
"learning_rate": 6.51775e-06,
"loss": 5.4352,
"step": 174000
},
{
"epoch": 3.20019,
"eval_accuracy": 0.02851234412955741,
"eval_loss": 5.30513858795166,
"eval_runtime": 128.4128,
"eval_samples_per_second": 27.98,
"eval_steps_per_second": 1.752,
"step": 174000
},
{
"epoch": 3.20269,
"grad_norm": 4.31083869934082,
"learning_rate": 6.3930000000000005e-06,
"loss": 5.4433,
"step": 174500
},
{
"epoch": 3.20519,
"grad_norm": 5.675565719604492,
"learning_rate": 6.268e-06,
"loss": 5.4305,
"step": 175000
},
{
"epoch": 3.20519,
"eval_accuracy": 0.02904035302176591,
"eval_loss": 5.271829128265381,
"eval_runtime": 91.0491,
"eval_samples_per_second": 39.462,
"eval_steps_per_second": 2.471,
"step": 175000
},
{
"epoch": 3.20769,
"grad_norm": 4.5083231925964355,
"learning_rate": 6.143e-06,
"loss": 5.4444,
"step": 175500
},
{
"epoch": 3.21019,
"grad_norm": 5.508623123168945,
"learning_rate": 6.018e-06,
"loss": 5.4244,
"step": 176000
},
{
"epoch": 3.21019,
"eval_accuracy": 0.02856485931440474,
"eval_loss": 5.234050750732422,
"eval_runtime": 97.7428,
"eval_samples_per_second": 36.76,
"eval_steps_per_second": 2.302,
"step": 176000
},
{
"epoch": 3.21269,
"grad_norm": 5.6364030838012695,
"learning_rate": 5.893000000000001e-06,
"loss": 5.3918,
"step": 176500
},
{
"epoch": 3.2151899999999998,
"grad_norm": 4.249551773071289,
"learning_rate": 5.76825e-06,
"loss": 5.406,
"step": 177000
},
{
"epoch": 3.2151899999999998,
"eval_accuracy": 0.028736417988219458,
"eval_loss": 5.197048664093018,
"eval_runtime": 101.4416,
"eval_samples_per_second": 35.419,
"eval_steps_per_second": 2.218,
"step": 177000
},
{
"epoch": 3.21769,
"grad_norm": 6.131099224090576,
"learning_rate": 5.64325e-06,
"loss": 5.4104,
"step": 177500
},
{
"epoch": 3.22019,
"grad_norm": 5.6621479988098145,
"learning_rate": 5.518250000000001e-06,
"loss": 5.3693,
"step": 178000
},
{
"epoch": 3.22019,
"eval_accuracy": 0.028811245324141817,
"eval_loss": 5.188289642333984,
"eval_runtime": 93.1736,
"eval_samples_per_second": 38.562,
"eval_steps_per_second": 2.415,
"step": 178000
},
{
"epoch": 3.22269,
"grad_norm": 6.5256242752075195,
"learning_rate": 5.39325e-06,
"loss": 5.3581,
"step": 178500
},
{
"epoch": 4.00192,
"grad_norm": 4.139052867889404,
"learning_rate": 5.2685000000000005e-06,
"loss": 5.3414,
"step": 179000
},
{
"epoch": 4.00192,
"eval_accuracy": 0.028710568544900825,
"eval_loss": 5.1566362380981445,
"eval_runtime": 128.5676,
"eval_samples_per_second": 27.946,
"eval_steps_per_second": 1.75,
"step": 179000
},
{
"epoch": 4.00442,
"grad_norm": 5.124391078948975,
"learning_rate": 5.143500000000001e-06,
"loss": 5.3228,
"step": 179500
},
{
"epoch": 4.00692,
"grad_norm": 4.564643859863281,
"learning_rate": 5.0185e-06,
"loss": 5.3252,
"step": 180000
},
{
"epoch": 4.00692,
"eval_accuracy": 0.029084841274214297,
"eval_loss": 5.121004104614258,
"eval_runtime": 99.0466,
"eval_samples_per_second": 36.276,
"eval_steps_per_second": 2.272,
"step": 180000
},
{
"epoch": 4.00942,
"grad_norm": 6.144300937652588,
"learning_rate": 4.8935e-06,
"loss": 5.326,
"step": 180500
},
{
"epoch": 4.01192,
"grad_norm": 4.762115001678467,
"learning_rate": 4.768750000000001e-06,
"loss": 5.3302,
"step": 181000
},
{
"epoch": 4.01192,
"eval_accuracy": 0.02900389170171647,
"eval_loss": 5.1127095222473145,
"eval_runtime": 93.1042,
"eval_samples_per_second": 38.591,
"eval_steps_per_second": 2.417,
"step": 181000
},
{
"epoch": 4.01442,
"grad_norm": 4.9249420166015625,
"learning_rate": 4.64375e-06,
"loss": 5.322,
"step": 181500
},
{
"epoch": 4.01692,
"grad_norm": 5.8505988121032715,
"learning_rate": 4.519e-06,
"loss": 5.3112,
"step": 182000
},
{
"epoch": 4.01692,
"eval_accuracy": 0.028904303320088896,
"eval_loss": 5.079184055328369,
"eval_runtime": 92.4255,
"eval_samples_per_second": 38.875,
"eval_steps_per_second": 2.434,
"step": 182000
},
{
"epoch": 4.01942,
"grad_norm": 5.5884599685668945,
"learning_rate": 4.394000000000001e-06,
"loss": 5.2634,
"step": 182500
},
{
"epoch": 4.02192,
"grad_norm": 6.884908199310303,
"learning_rate": 4.269e-06,
"loss": 5.2651,
"step": 183000
},
{
"epoch": 4.02192,
"eval_accuracy": 0.02909871834378535,
"eval_loss": 5.043324947357178,
"eval_runtime": 100.4033,
"eval_samples_per_second": 35.786,
"eval_steps_per_second": 2.241,
"step": 183000
},
{
"epoch": 4.02442,
"grad_norm": 4.412216663360596,
"learning_rate": 4.144e-06,
"loss": 5.2793,
"step": 183500
},
{
"epoch": 4.02692,
"grad_norm": 4.486749172210693,
"learning_rate": 4.019e-06,
"loss": 5.2623,
"step": 184000
},
{
"epoch": 4.02692,
"eval_accuracy": 0.028819816455347466,
"eval_loss": 5.025639533996582,
"eval_runtime": 94.773,
"eval_samples_per_second": 37.912,
"eval_steps_per_second": 2.374,
"step": 184000
},
{
"epoch": 4.02942,
"grad_norm": 5.047926425933838,
"learning_rate": 3.894e-06,
"loss": 5.2454,
"step": 184500
},
{
"epoch": 4.03192,
"grad_norm": 7.453557968139648,
"learning_rate": 3.7690000000000003e-06,
"loss": 5.2297,
"step": 185000
},
{
"epoch": 4.03192,
"eval_accuracy": 0.02869369838189287,
"eval_loss": 5.029138565063477,
"eval_runtime": 92.8613,
"eval_samples_per_second": 38.692,
"eval_steps_per_second": 2.423,
"step": 185000
},
{
"epoch": 4.03442,
"grad_norm": 5.0118021965026855,
"learning_rate": 3.644e-06,
"loss": 5.2391,
"step": 185500
},
{
"epoch": 4.03692,
"grad_norm": 5.173340320587158,
"learning_rate": 3.5192500000000002e-06,
"loss": 5.1991,
"step": 186000
},
{
"epoch": 4.03692,
"eval_accuracy": 0.028796688006062374,
"eval_loss": 4.970343589782715,
"eval_runtime": 95.8003,
"eval_samples_per_second": 37.505,
"eval_steps_per_second": 2.349,
"step": 186000
},
{
"epoch": 4.03942,
"grad_norm": 4.592197895050049,
"learning_rate": 3.39425e-06,
"loss": 5.203,
"step": 186500
},
{
"epoch": 4.04192,
"grad_norm": 5.571292400360107,
"learning_rate": 3.26925e-06,
"loss": 5.1883,
"step": 187000
},
{
"epoch": 4.04192,
"eval_accuracy": 0.02865968595647362,
"eval_loss": 4.975839614868164,
"eval_runtime": 92.6873,
"eval_samples_per_second": 38.765,
"eval_steps_per_second": 2.428,
"step": 187000
},
{
"epoch": 4.04442,
"grad_norm": 4.220412731170654,
"learning_rate": 3.14425e-06,
"loss": 5.1943,
"step": 187500
},
{
"epoch": 4.04692,
"grad_norm": 8.616243362426758,
"learning_rate": 3.0195e-06,
"loss": 5.1854,
"step": 188000
},
{
"epoch": 4.04692,
"eval_accuracy": 0.028208953294817662,
"eval_loss": 4.942821502685547,
"eval_runtime": 94.4959,
"eval_samples_per_second": 38.023,
"eval_steps_per_second": 2.381,
"step": 188000
},
{
"epoch": 4.04942,
"grad_norm": 5.9760894775390625,
"learning_rate": 2.8945e-06,
"loss": 5.1704,
"step": 188500
},
{
"epoch": 4.05192,
"grad_norm": 6.06154203414917,
"learning_rate": 2.7695000000000003e-06,
"loss": 5.1636,
"step": 189000
},
{
"epoch": 4.05192,
"eval_accuracy": 0.02843751679363505,
"eval_loss": 4.911832332611084,
"eval_runtime": 91.2834,
"eval_samples_per_second": 39.361,
"eval_steps_per_second": 2.465,
"step": 189000
},
{
"epoch": 4.05442,
"grad_norm": 7.606237411499023,
"learning_rate": 2.6445000000000003e-06,
"loss": 5.1407,
"step": 189500
},
{
"epoch": 4.05692,
"grad_norm": 4.682966709136963,
"learning_rate": 2.5197500000000003e-06,
"loss": 5.1356,
"step": 190000
},
{
"epoch": 4.05692,
"eval_accuracy": 0.028226095557228967,
"eval_loss": 4.904683589935303,
"eval_runtime": 92.6319,
"eval_samples_per_second": 38.788,
"eval_steps_per_second": 2.429,
"step": 190000
},
{
"epoch": 4.05942,
"grad_norm": 5.441736221313477,
"learning_rate": 2.3947500000000002e-06,
"loss": 5.1346,
"step": 190500
},
{
"epoch": 4.06192,
"grad_norm": 4.829955101013184,
"learning_rate": 2.26975e-06,
"loss": 5.1329,
"step": 191000
},
{
"epoch": 4.06192,
"eval_accuracy": 0.028335071368272256,
"eval_loss": 4.874863147735596,
"eval_runtime": 91.0476,
"eval_samples_per_second": 39.463,
"eval_steps_per_second": 2.471,
"step": 191000
},
{
"epoch": 4.06442,
"grad_norm": 5.842422962188721,
"learning_rate": 2.14475e-06,
"loss": 5.1127,
"step": 191500
},
{
"epoch": 4.06692,
"grad_norm": 5.372358322143555,
"learning_rate": 2.01975e-06,
"loss": 5.107,
"step": 192000
},
{
"epoch": 4.06692,
"eval_accuracy": 0.02805358453550251,
"eval_loss": 4.877078533172607,
"eval_runtime": 92.2372,
"eval_samples_per_second": 38.954,
"eval_steps_per_second": 2.439,
"step": 192000
},
{
"epoch": 4.06942,
"grad_norm": 5.863542556762695,
"learning_rate": 1.8950000000000003e-06,
"loss": 5.1301,
"step": 192500
},
{
"epoch": 4.07192,
"grad_norm": 8.768465042114258,
"learning_rate": 1.7700000000000002e-06,
"loss": 5.1159,
"step": 193000
},
{
"epoch": 4.07192,
"eval_accuracy": 0.027984607336752263,
"eval_loss": 4.856239318847656,
"eval_runtime": 100.1894,
"eval_samples_per_second": 35.862,
"eval_steps_per_second": 2.246,
"step": 193000
},
{
"epoch": 4.07442,
"grad_norm": 4.413668632507324,
"learning_rate": 1.645e-06,
"loss": 5.1087,
"step": 193500
},
{
"epoch": 4.07692,
"grad_norm": 7.255640983581543,
"learning_rate": 1.52e-06,
"loss": 5.0892,
"step": 194000
},
{
"epoch": 4.07692,
"eval_accuracy": 0.027881481662881083,
"eval_loss": 4.846496105194092,
"eval_runtime": 95.8083,
"eval_samples_per_second": 37.502,
"eval_steps_per_second": 2.348,
"step": 194000
},
{
"epoch": 4.07942,
"grad_norm": 5.159695148468018,
"learning_rate": 1.3952500000000001e-06,
"loss": 5.0909,
"step": 194500
},
{
"epoch": 4.08192,
"grad_norm": 4.329514026641846,
"learning_rate": 1.27025e-06,
"loss": 5.083,
"step": 195000
},
{
"epoch": 4.08192,
"eval_accuracy": 0.02791671853561543,
"eval_loss": 4.825800895690918,
"eval_runtime": 92.6653,
"eval_samples_per_second": 38.774,
"eval_steps_per_second": 2.428,
"step": 195000
},
{
"epoch": 4.08442,
"grad_norm": 6.477020263671875,
"learning_rate": 1.14525e-06,
"loss": 5.0699,
"step": 195500
},
{
"epoch": 4.08692,
"grad_norm": 4.648099899291992,
"learning_rate": 1.02025e-06,
"loss": 5.0824,
"step": 196000
},
{
"epoch": 4.08692,
"eval_accuracy": 0.028006647388423938,
"eval_loss": 4.821605682373047,
"eval_runtime": 103.4797,
"eval_samples_per_second": 34.722,
"eval_steps_per_second": 2.174,
"step": 196000
},
{
"epoch": 4.08942,
"grad_norm": 5.26840877532959,
"learning_rate": 8.952500000000001e-07,
"loss": 5.0674,
"step": 196500
},
{
"epoch": 4.09192,
"grad_norm": 5.00206184387207,
"learning_rate": 7.7075e-07,
"loss": 5.0774,
"step": 197000
},
{
"epoch": 4.09192,
"eval_accuracy": 0.027867604593310027,
"eval_loss": 4.817193984985352,
"eval_runtime": 103.2721,
"eval_samples_per_second": 34.792,
"eval_steps_per_second": 2.179,
"step": 197000
},
{
"epoch": 4.09442,
"grad_norm": 8.953470230102539,
"learning_rate": 6.4575e-07,
"loss": 5.0661,
"step": 197500
},
{
"epoch": 4.09692,
"grad_norm": 4.821086406707764,
"learning_rate": 5.207500000000001e-07,
"loss": 5.0567,
"step": 198000
},
{
"epoch": 4.09692,
"eval_accuracy": 0.027846652939251766,
"eval_loss": 4.811811923980713,
"eval_runtime": 129.0966,
"eval_samples_per_second": 27.832,
"eval_steps_per_second": 1.743,
"step": 198000
},
{
"epoch": 4.09942,
"grad_norm": 5.411968231201172,
"learning_rate": 3.9575000000000003e-07,
"loss": 5.0657,
"step": 198500
},
{
"epoch": 4.10192,
"grad_norm": 5.903570652008057,
"learning_rate": 2.7075e-07,
"loss": 5.0657,
"step": 199000
},
{
"epoch": 4.10192,
"eval_accuracy": 0.02782066744623146,
"eval_loss": 4.807706832885742,
"eval_runtime": 126.5816,
"eval_samples_per_second": 28.385,
"eval_steps_per_second": 1.778,
"step": 199000
},
{
"epoch": 4.10442,
"grad_norm": 4.718256950378418,
"learning_rate": 1.4575000000000002e-07,
"loss": 5.0755,
"step": 199500
},
{
"epoch": 4.10692,
"grad_norm": 5.640545845031738,
"learning_rate": 2.075e-08,
"loss": 5.0751,
"step": 200000
},
{
"epoch": 4.10692,
"eval_accuracy": 0.02785753691538593,
"eval_loss": 4.805818557739258,
"eval_runtime": 112.4779,
"eval_samples_per_second": 31.944,
"eval_steps_per_second": 2.0,
"step": 200000
},
{
"epoch": 4.10692,
"step": 200000,
"total_flos": 1.1895955925447475e+18,
"train_loss": 6.0653790368652345,
"train_runtime": 98053.9397,
"train_samples_per_second": 32.635,
"train_steps_per_second": 2.04
}
],
"logging_steps": 500,
"max_steps": 200000,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.1895955925447475e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}