|
{ |
|
"best_metric": 0.825, |
|
"best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v8-n0-m1/checkpoint-1020", |
|
"epoch": 49.0004, |
|
"eval_steps": 500, |
|
"global_step": 2500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.004, |
|
"grad_norm": 3.163785934448242, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 0.6803, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.008, |
|
"grad_norm": 1.8414620161056519, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.6811, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.012, |
|
"grad_norm": 2.732853412628174, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.6724, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.016, |
|
"grad_norm": 2.302961826324463, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.6625, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 2.289227247238159, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.6826, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 0.6156964302062988, |
|
"eval_runtime": 16.4829, |
|
"eval_samples_per_second": 4.854, |
|
"eval_steps_per_second": 1.213, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 1.0036, |
|
"grad_norm": 2.539074420928955, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.653, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 1.0076, |
|
"grad_norm": 4.338445663452148, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.6605, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.0116, |
|
"grad_norm": 4.448472023010254, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.6647, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0156, |
|
"grad_norm": 8.590893745422363, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.665, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0196, |
|
"grad_norm": 4.512553691864014, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.6549, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 0.5930413603782654, |
|
"eval_runtime": 15.1042, |
|
"eval_samples_per_second": 5.297, |
|
"eval_steps_per_second": 1.324, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 2.0032, |
|
"grad_norm": 9.520898818969727, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.6033, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 2.0072, |
|
"grad_norm": 7.96126651763916, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.5796, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 2.0112, |
|
"grad_norm": 10.471980094909668, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.6731, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 2.0152, |
|
"grad_norm": 17.355741500854492, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.6853, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.0192, |
|
"grad_norm": 7.118047714233398, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6486, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.0204, |
|
"eval_accuracy": 0.5125, |
|
"eval_loss": 0.6796671152114868, |
|
"eval_runtime": 15.1483, |
|
"eval_samples_per_second": 5.281, |
|
"eval_steps_per_second": 1.32, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 3.0028, |
|
"grad_norm": 9.448248863220215, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.627, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 3.0068, |
|
"grad_norm": 10.044306755065918, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.5988, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 3.0108, |
|
"grad_norm": 11.673822402954102, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.5388, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 3.0148, |
|
"grad_norm": 13.179671287536621, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.4922, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 3.0188, |
|
"grad_norm": 17.30994987487793, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5595, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 3.0204, |
|
"eval_accuracy": 0.6625, |
|
"eval_loss": 0.49017828702926636, |
|
"eval_runtime": 14.3197, |
|
"eval_samples_per_second": 5.587, |
|
"eval_steps_per_second": 1.397, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 4.0024, |
|
"grad_norm": 12.384488105773926, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.5512, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 4.0064, |
|
"grad_norm": 16.830211639404297, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.5532, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 4.0104, |
|
"grad_norm": 25.703161239624023, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.4503, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 4.0144, |
|
"grad_norm": 31.191774368286133, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.4281, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 4.0184, |
|
"grad_norm": 7.301267147064209, |
|
"learning_rate": 1e-05, |
|
"loss": 0.5586, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 4.0204, |
|
"eval_accuracy": 0.475, |
|
"eval_loss": 0.8194777369499207, |
|
"eval_runtime": 15.5636, |
|
"eval_samples_per_second": 5.14, |
|
"eval_steps_per_second": 1.285, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 5.002, |
|
"grad_norm": 6.1059746742248535, |
|
"learning_rate": 9.955555555555556e-06, |
|
"loss": 0.4667, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 5.006, |
|
"grad_norm": 11.250345230102539, |
|
"learning_rate": 9.911111111111113e-06, |
|
"loss": 0.3866, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 5.01, |
|
"grad_norm": 11.811776161193848, |
|
"learning_rate": 9.866666666666668e-06, |
|
"loss": 0.3093, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 5.014, |
|
"grad_norm": 2.898249387741089, |
|
"learning_rate": 9.822222222222223e-06, |
|
"loss": 0.4655, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 5.018, |
|
"grad_norm": 39.88842010498047, |
|
"learning_rate": 9.777777777777779e-06, |
|
"loss": 0.4565, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 5.0204, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 0.5872394442558289, |
|
"eval_runtime": 15.5445, |
|
"eval_samples_per_second": 5.147, |
|
"eval_steps_per_second": 1.287, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 6.0016, |
|
"grad_norm": 18.96799659729004, |
|
"learning_rate": 9.733333333333334e-06, |
|
"loss": 0.6334, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 6.0056, |
|
"grad_norm": 26.40775489807129, |
|
"learning_rate": 9.688888888888889e-06, |
|
"loss": 0.5505, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 6.0096, |
|
"grad_norm": 23.987438201904297, |
|
"learning_rate": 9.644444444444444e-06, |
|
"loss": 0.4702, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 6.0136, |
|
"grad_norm": 11.874549865722656, |
|
"learning_rate": 9.600000000000001e-06, |
|
"loss": 0.5567, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 6.0176, |
|
"grad_norm": 1.870789647102356, |
|
"learning_rate": 9.555555555555556e-06, |
|
"loss": 0.3697, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 6.0204, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 0.5016795992851257, |
|
"eval_runtime": 15.1939, |
|
"eval_samples_per_second": 5.265, |
|
"eval_steps_per_second": 1.316, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 7.0012, |
|
"grad_norm": 1.9288091659545898, |
|
"learning_rate": 9.511111111111112e-06, |
|
"loss": 0.3384, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 7.0052, |
|
"grad_norm": 47.43910598754883, |
|
"learning_rate": 9.466666666666667e-06, |
|
"loss": 0.4776, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 7.0092, |
|
"grad_norm": 24.75432586669922, |
|
"learning_rate": 9.422222222222222e-06, |
|
"loss": 0.3272, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 7.0132, |
|
"grad_norm": 6.029924392700195, |
|
"learning_rate": 9.377777777777779e-06, |
|
"loss": 0.6358, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 7.0172, |
|
"grad_norm": 36.66233825683594, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.6201, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 7.0204, |
|
"eval_accuracy": 0.7, |
|
"eval_loss": 0.6555034518241882, |
|
"eval_runtime": 15.1733, |
|
"eval_samples_per_second": 5.272, |
|
"eval_steps_per_second": 1.318, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 8.0008, |
|
"grad_norm": 13.244248390197754, |
|
"learning_rate": 9.28888888888889e-06, |
|
"loss": 0.2307, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 8.0048, |
|
"grad_norm": 0.1510910540819168, |
|
"learning_rate": 9.244444444444445e-06, |
|
"loss": 0.2198, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 8.0088, |
|
"grad_norm": 48.222145080566406, |
|
"learning_rate": 9.200000000000002e-06, |
|
"loss": 0.1961, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 8.0128, |
|
"grad_norm": 12.182064056396484, |
|
"learning_rate": 9.155555555555557e-06, |
|
"loss": 0.4113, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 8.0168, |
|
"grad_norm": 90.49524688720703, |
|
"learning_rate": 9.111111111111112e-06, |
|
"loss": 0.4333, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 8.0204, |
|
"eval_accuracy": 0.6125, |
|
"eval_loss": 1.2277292013168335, |
|
"eval_runtime": 15.8873, |
|
"eval_samples_per_second": 5.035, |
|
"eval_steps_per_second": 1.259, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 9.0004, |
|
"grad_norm": 8.979384422302246, |
|
"learning_rate": 9.066666666666667e-06, |
|
"loss": 0.2832, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 9.0044, |
|
"grad_norm": 124.57083892822266, |
|
"learning_rate": 9.022222222222223e-06, |
|
"loss": 0.4698, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 9.0084, |
|
"grad_norm": 1.4499937295913696, |
|
"learning_rate": 8.977777777777778e-06, |
|
"loss": 0.622, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 9.0124, |
|
"grad_norm": 204.4443359375, |
|
"learning_rate": 8.933333333333333e-06, |
|
"loss": 0.4289, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 9.0164, |
|
"grad_norm": 13.056779861450195, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.3958, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 9.0204, |
|
"grad_norm": 3.7403643131256104, |
|
"learning_rate": 8.844444444444445e-06, |
|
"loss": 0.2148, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 9.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 0.8114517331123352, |
|
"eval_runtime": 14.7863, |
|
"eval_samples_per_second": 5.41, |
|
"eval_steps_per_second": 1.353, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 10.004, |
|
"grad_norm": 0.3079785704612732, |
|
"learning_rate": 8.8e-06, |
|
"loss": 0.4126, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 10.008, |
|
"grad_norm": 10.519120216369629, |
|
"learning_rate": 8.755555555555556e-06, |
|
"loss": 0.2208, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 10.012, |
|
"grad_norm": 37.70737075805664, |
|
"learning_rate": 8.711111111111111e-06, |
|
"loss": 0.4088, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 10.016, |
|
"grad_norm": 27.64992332458496, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.6548, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"grad_norm": 1.9343359470367432, |
|
"learning_rate": 8.622222222222223e-06, |
|
"loss": 0.9458, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 10.0204, |
|
"eval_accuracy": 0.6625, |
|
"eval_loss": 0.9872623682022095, |
|
"eval_runtime": 14.8048, |
|
"eval_samples_per_second": 5.404, |
|
"eval_steps_per_second": 1.351, |
|
"step": 561 |
|
}, |
|
{ |
|
"epoch": 11.0036, |
|
"grad_norm": 127.26286315917969, |
|
"learning_rate": 8.577777777777778e-06, |
|
"loss": 0.4077, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 11.0076, |
|
"grad_norm": 48.36285400390625, |
|
"learning_rate": 8.533333333333335e-06, |
|
"loss": 0.315, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 11.0116, |
|
"grad_norm": 1.3444968461990356, |
|
"learning_rate": 8.48888888888889e-06, |
|
"loss": 0.3529, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 11.0156, |
|
"grad_norm": 1.425925374031067, |
|
"learning_rate": 8.444444444444446e-06, |
|
"loss": 0.2537, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 11.0196, |
|
"grad_norm": 0.11205915361642838, |
|
"learning_rate": 8.400000000000001e-06, |
|
"loss": 0.0651, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 11.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 1.0840221643447876, |
|
"eval_runtime": 16.1692, |
|
"eval_samples_per_second": 4.948, |
|
"eval_steps_per_second": 1.237, |
|
"step": 612 |
|
}, |
|
{ |
|
"epoch": 12.0032, |
|
"grad_norm": 1.6690884828567505, |
|
"learning_rate": 8.355555555555556e-06, |
|
"loss": 0.0219, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 12.0072, |
|
"grad_norm": 0.04994206875562668, |
|
"learning_rate": 8.311111111111111e-06, |
|
"loss": 0.3325, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 12.0112, |
|
"grad_norm": 1.1334573030471802, |
|
"learning_rate": 8.266666666666667e-06, |
|
"loss": 0.1766, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 12.0152, |
|
"grad_norm": 84.09977722167969, |
|
"learning_rate": 8.222222222222222e-06, |
|
"loss": 0.3474, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 12.0192, |
|
"grad_norm": 0.11686452478170395, |
|
"learning_rate": 8.177777777777779e-06, |
|
"loss": 0.5756, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 12.0204, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 1.0489223003387451, |
|
"eval_runtime": 14.837, |
|
"eval_samples_per_second": 5.392, |
|
"eval_steps_per_second": 1.348, |
|
"step": 663 |
|
}, |
|
{ |
|
"epoch": 13.0028, |
|
"grad_norm": 0.4879148304462433, |
|
"learning_rate": 8.133333333333334e-06, |
|
"loss": 0.1627, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 13.0068, |
|
"grad_norm": 10.051454544067383, |
|
"learning_rate": 8.08888888888889e-06, |
|
"loss": 0.1414, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 13.0108, |
|
"grad_norm": 317.5495910644531, |
|
"learning_rate": 8.044444444444444e-06, |
|
"loss": 0.4045, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 13.0148, |
|
"grad_norm": 12.827858924865723, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.2213, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 13.0188, |
|
"grad_norm": 7.5428338050842285, |
|
"learning_rate": 7.955555555555557e-06, |
|
"loss": 0.354, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 13.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.1601030826568604, |
|
"eval_runtime": 14.793, |
|
"eval_samples_per_second": 5.408, |
|
"eval_steps_per_second": 1.352, |
|
"step": 714 |
|
}, |
|
{ |
|
"epoch": 14.0024, |
|
"grad_norm": 1.0718170404434204, |
|
"learning_rate": 7.911111111111112e-06, |
|
"loss": 0.1341, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 14.0064, |
|
"grad_norm": 204.27011108398438, |
|
"learning_rate": 7.866666666666667e-06, |
|
"loss": 0.2009, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 14.0104, |
|
"grad_norm": 623.6522827148438, |
|
"learning_rate": 7.822222222222224e-06, |
|
"loss": 0.2302, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 14.0144, |
|
"grad_norm": 53.07473373413086, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.4521, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 14.0184, |
|
"grad_norm": 0.032519057393074036, |
|
"learning_rate": 7.733333333333334e-06, |
|
"loss": 0.2888, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 14.0204, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 1.8143768310546875, |
|
"eval_runtime": 15.8798, |
|
"eval_samples_per_second": 5.038, |
|
"eval_steps_per_second": 1.259, |
|
"step": 765 |
|
}, |
|
{ |
|
"epoch": 15.002, |
|
"grad_norm": 457.6705322265625, |
|
"learning_rate": 7.68888888888889e-06, |
|
"loss": 0.2657, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 15.006, |
|
"grad_norm": 431.8014221191406, |
|
"learning_rate": 7.644444444444445e-06, |
|
"loss": 0.0887, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 15.01, |
|
"grad_norm": 0.013111516833305359, |
|
"learning_rate": 7.600000000000001e-06, |
|
"loss": 0.0183, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 15.014, |
|
"grad_norm": 0.08553914725780487, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 0.0991, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 15.018, |
|
"grad_norm": 0.11689701676368713, |
|
"learning_rate": 7.511111111111111e-06, |
|
"loss": 0.2449, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 15.0204, |
|
"eval_accuracy": 0.7125, |
|
"eval_loss": 1.3988301753997803, |
|
"eval_runtime": 14.9719, |
|
"eval_samples_per_second": 5.343, |
|
"eval_steps_per_second": 1.336, |
|
"step": 816 |
|
}, |
|
{ |
|
"epoch": 16.0016, |
|
"grad_norm": 0.12991830706596375, |
|
"learning_rate": 7.4666666666666675e-06, |
|
"loss": 0.009, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 16.0056, |
|
"grad_norm": 185.4541015625, |
|
"learning_rate": 7.422222222222223e-06, |
|
"loss": 0.2772, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 16.0096, |
|
"grad_norm": 0.7622888088226318, |
|
"learning_rate": 7.377777777777778e-06, |
|
"loss": 0.1701, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 16.0136, |
|
"grad_norm": 0.5893406867980957, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.1631, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 16.0176, |
|
"grad_norm": 0.7813571691513062, |
|
"learning_rate": 7.28888888888889e-06, |
|
"loss": 0.1326, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 16.0204, |
|
"eval_accuracy": 0.7125, |
|
"eval_loss": 1.715152382850647, |
|
"eval_runtime": 15.1438, |
|
"eval_samples_per_second": 5.283, |
|
"eval_steps_per_second": 1.321, |
|
"step": 867 |
|
}, |
|
{ |
|
"epoch": 17.0012, |
|
"grad_norm": 0.07987383008003235, |
|
"learning_rate": 7.244444444444445e-06, |
|
"loss": 0.0021, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 17.0052, |
|
"grad_norm": 17.579423904418945, |
|
"learning_rate": 7.2000000000000005e-06, |
|
"loss": 0.2835, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 17.0092, |
|
"grad_norm": 0.05407591536641121, |
|
"learning_rate": 7.155555555555556e-06, |
|
"loss": 0.0557, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 17.0132, |
|
"grad_norm": 129.4159393310547, |
|
"learning_rate": 7.111111111111112e-06, |
|
"loss": 0.147, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 17.0172, |
|
"grad_norm": 0.6726216673851013, |
|
"learning_rate": 7.066666666666667e-06, |
|
"loss": 0.0018, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 17.0204, |
|
"eval_accuracy": 0.6375, |
|
"eval_loss": 2.1475367546081543, |
|
"eval_runtime": 15.0746, |
|
"eval_samples_per_second": 5.307, |
|
"eval_steps_per_second": 1.327, |
|
"step": 918 |
|
}, |
|
{ |
|
"epoch": 18.0008, |
|
"grad_norm": 0.06342015415430069, |
|
"learning_rate": 7.022222222222222e-06, |
|
"loss": 0.2025, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 18.0048, |
|
"grad_norm": 0.03912360593676567, |
|
"learning_rate": 6.977777777777779e-06, |
|
"loss": 0.0017, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 18.0088, |
|
"grad_norm": 0.015669086948037148, |
|
"learning_rate": 6.9333333333333344e-06, |
|
"loss": 0.1518, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 18.0128, |
|
"grad_norm": 0.03878331929445267, |
|
"learning_rate": 6.88888888888889e-06, |
|
"loss": 0.6683, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 18.0168, |
|
"grad_norm": 0.009163687005639076, |
|
"learning_rate": 6.844444444444445e-06, |
|
"loss": 0.3631, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 18.0204, |
|
"eval_accuracy": 0.65, |
|
"eval_loss": 1.8957328796386719, |
|
"eval_runtime": 14.5069, |
|
"eval_samples_per_second": 5.515, |
|
"eval_steps_per_second": 1.379, |
|
"step": 969 |
|
}, |
|
{ |
|
"epoch": 19.0004, |
|
"grad_norm": 0.030928779393434525, |
|
"learning_rate": 6.800000000000001e-06, |
|
"loss": 0.1663, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 19.0044, |
|
"grad_norm": 0.8263186812400818, |
|
"learning_rate": 6.755555555555556e-06, |
|
"loss": 0.2357, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 19.0084, |
|
"grad_norm": 0.09255637228488922, |
|
"learning_rate": 6.711111111111111e-06, |
|
"loss": 0.0141, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 19.0124, |
|
"grad_norm": 0.10079475492238998, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.001, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 19.0164, |
|
"grad_norm": 0.1445166915655136, |
|
"learning_rate": 6.6222222222222236e-06, |
|
"loss": 0.1313, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 19.0204, |
|
"grad_norm": 0.0298333577811718, |
|
"learning_rate": 6.577777777777779e-06, |
|
"loss": 0.1252, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 19.0204, |
|
"eval_accuracy": 0.825, |
|
"eval_loss": 1.124619960784912, |
|
"eval_runtime": 14.4479, |
|
"eval_samples_per_second": 5.537, |
|
"eval_steps_per_second": 1.384, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 20.004, |
|
"grad_norm": 0.19592173397541046, |
|
"learning_rate": 6.533333333333334e-06, |
|
"loss": 0.0141, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 20.008, |
|
"grad_norm": 42.12409591674805, |
|
"learning_rate": 6.488888888888889e-06, |
|
"loss": 0.1769, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 20.012, |
|
"grad_norm": 482.6688537597656, |
|
"learning_rate": 6.444444444444445e-06, |
|
"loss": 0.1202, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 20.016, |
|
"grad_norm": 0.5874069333076477, |
|
"learning_rate": 6.4000000000000006e-06, |
|
"loss": 0.0007, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 20.02, |
|
"grad_norm": 0.018259378150105476, |
|
"learning_rate": 6.355555555555556e-06, |
|
"loss": 0.0943, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 20.0204, |
|
"eval_accuracy": 0.6625, |
|
"eval_loss": 1.9498172998428345, |
|
"eval_runtime": 14.4129, |
|
"eval_samples_per_second": 5.551, |
|
"eval_steps_per_second": 1.388, |
|
"step": 1071 |
|
}, |
|
{ |
|
"epoch": 21.0036, |
|
"grad_norm": 0.00311831571161747, |
|
"learning_rate": 6.311111111111111e-06, |
|
"loss": 0.2018, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 21.0076, |
|
"grad_norm": 7.55759859085083, |
|
"learning_rate": 6.266666666666668e-06, |
|
"loss": 0.333, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 21.0116, |
|
"grad_norm": 0.13464294373989105, |
|
"learning_rate": 6.222222222222223e-06, |
|
"loss": 0.2933, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 21.0156, |
|
"grad_norm": 0.013299187645316124, |
|
"learning_rate": 6.177777777777778e-06, |
|
"loss": 0.0908, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 21.0196, |
|
"grad_norm": 28.606412887573242, |
|
"learning_rate": 6.133333333333334e-06, |
|
"loss": 0.3488, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 21.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.3456709384918213, |
|
"eval_runtime": 15.9369, |
|
"eval_samples_per_second": 5.02, |
|
"eval_steps_per_second": 1.255, |
|
"step": 1122 |
|
}, |
|
{ |
|
"epoch": 22.0032, |
|
"grad_norm": 2.039261817932129, |
|
"learning_rate": 6.08888888888889e-06, |
|
"loss": 0.2154, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 22.0072, |
|
"grad_norm": 6.409753799438477, |
|
"learning_rate": 6.044444444444445e-06, |
|
"loss": 0.0051, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 22.0112, |
|
"grad_norm": 0.010469136759638786, |
|
"learning_rate": 6e-06, |
|
"loss": 0.1893, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 22.0152, |
|
"grad_norm": 0.022801605984568596, |
|
"learning_rate": 5.955555555555555e-06, |
|
"loss": 0.0428, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 22.0192, |
|
"grad_norm": 0.014003835618495941, |
|
"learning_rate": 5.911111111111112e-06, |
|
"loss": 0.0008, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 22.0204, |
|
"eval_accuracy": 0.7125, |
|
"eval_loss": 1.7872467041015625, |
|
"eval_runtime": 14.9328, |
|
"eval_samples_per_second": 5.357, |
|
"eval_steps_per_second": 1.339, |
|
"step": 1173 |
|
}, |
|
{ |
|
"epoch": 23.0028, |
|
"grad_norm": 336.1969299316406, |
|
"learning_rate": 5.8666666666666675e-06, |
|
"loss": 0.5488, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 23.0068, |
|
"grad_norm": 0.009750437922775745, |
|
"learning_rate": 5.822222222222223e-06, |
|
"loss": 0.3029, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 23.0108, |
|
"grad_norm": 0.15668730437755585, |
|
"learning_rate": 5.777777777777778e-06, |
|
"loss": 0.6396, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 23.0148, |
|
"grad_norm": 0.019450828433036804, |
|
"learning_rate": 5.733333333333334e-06, |
|
"loss": 0.0974, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 23.0188, |
|
"grad_norm": 6.186660289764404, |
|
"learning_rate": 5.688888888888889e-06, |
|
"loss": 0.009, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 23.0204, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 1.5437147617340088, |
|
"eval_runtime": 14.8598, |
|
"eval_samples_per_second": 5.384, |
|
"eval_steps_per_second": 1.346, |
|
"step": 1224 |
|
}, |
|
{ |
|
"epoch": 24.0024, |
|
"grad_norm": 0.012686701491475105, |
|
"learning_rate": 5.6444444444444445e-06, |
|
"loss": 0.0007, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 24.0064, |
|
"grad_norm": 0.008507036603987217, |
|
"learning_rate": 5.600000000000001e-06, |
|
"loss": 0.0796, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 24.0104, |
|
"grad_norm": 0.05275079980492592, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.0045, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 24.0144, |
|
"grad_norm": 0.020087506622076035, |
|
"learning_rate": 5.511111111111112e-06, |
|
"loss": 0.0007, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 24.0184, |
|
"grad_norm": 0.0037845964543521404, |
|
"learning_rate": 5.466666666666667e-06, |
|
"loss": 0.0274, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 24.0204, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 1.9865350723266602, |
|
"eval_runtime": 15.8645, |
|
"eval_samples_per_second": 5.043, |
|
"eval_steps_per_second": 1.261, |
|
"step": 1275 |
|
}, |
|
{ |
|
"epoch": 25.002, |
|
"grad_norm": 0.008797760121524334, |
|
"learning_rate": 5.422222222222223e-06, |
|
"loss": 0.1231, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 25.006, |
|
"grad_norm": 0.012985019944608212, |
|
"learning_rate": 5.3777777777777784e-06, |
|
"loss": 0.0003, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 25.01, |
|
"grad_norm": 0.02057729661464691, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.058, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 25.014, |
|
"grad_norm": 1.6732549667358398, |
|
"learning_rate": 5.288888888888889e-06, |
|
"loss": 0.002, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 25.018, |
|
"grad_norm": 0.006808862090110779, |
|
"learning_rate": 5.244444444444445e-06, |
|
"loss": 0.0004, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 25.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 1.5100300312042236, |
|
"eval_runtime": 15.0066, |
|
"eval_samples_per_second": 5.331, |
|
"eval_steps_per_second": 1.333, |
|
"step": 1326 |
|
}, |
|
{ |
|
"epoch": 26.0016, |
|
"grad_norm": 0.011416507884860039, |
|
"learning_rate": 5.2e-06, |
|
"loss": 0.0003, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 26.0056, |
|
"grad_norm": 0.006314845755696297, |
|
"learning_rate": 5.155555555555556e-06, |
|
"loss": 0.0236, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 26.0096, |
|
"grad_norm": 1.8917161226272583, |
|
"learning_rate": 5.1111111111111115e-06, |
|
"loss": 0.0011, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 26.0136, |
|
"grad_norm": 0.015401429496705532, |
|
"learning_rate": 5.0666666666666676e-06, |
|
"loss": 0.0003, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 26.0176, |
|
"grad_norm": 0.2598150372505188, |
|
"learning_rate": 5.022222222222223e-06, |
|
"loss": 0.1007, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 26.0204, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 1.959010124206543, |
|
"eval_runtime": 15.1089, |
|
"eval_samples_per_second": 5.295, |
|
"eval_steps_per_second": 1.324, |
|
"step": 1377 |
|
}, |
|
{ |
|
"epoch": 27.0012, |
|
"grad_norm": 0.025213167071342468, |
|
"learning_rate": 4.977777777777778e-06, |
|
"loss": 0.0352, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 27.0052, |
|
"grad_norm": 0.17898155748844147, |
|
"learning_rate": 4.933333333333334e-06, |
|
"loss": 0.0003, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 27.0092, |
|
"grad_norm": 0.011180482804775238, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 0.3099, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 27.0132, |
|
"grad_norm": 0.003275972092524171, |
|
"learning_rate": 4.8444444444444446e-06, |
|
"loss": 0.0006, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 27.0172, |
|
"grad_norm": 0.10510570555925369, |
|
"learning_rate": 4.800000000000001e-06, |
|
"loss": 0.0006, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 27.0204, |
|
"eval_accuracy": 0.7125, |
|
"eval_loss": 1.8345705270767212, |
|
"eval_runtime": 15.6323, |
|
"eval_samples_per_second": 5.118, |
|
"eval_steps_per_second": 1.279, |
|
"step": 1428 |
|
}, |
|
{ |
|
"epoch": 28.0008, |
|
"grad_norm": 0.00974931288510561, |
|
"learning_rate": 4.755555555555556e-06, |
|
"loss": 0.0003, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 28.0048, |
|
"grad_norm": 0.01707894168794155, |
|
"learning_rate": 4.711111111111111e-06, |
|
"loss": 0.0002, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 28.0088, |
|
"grad_norm": 0.007560590747743845, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0004, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 28.0128, |
|
"grad_norm": 66.74208068847656, |
|
"learning_rate": 4.622222222222222e-06, |
|
"loss": 0.1921, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 28.0168, |
|
"grad_norm": 0.0025084693916141987, |
|
"learning_rate": 4.5777777777777785e-06, |
|
"loss": 0.0006, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 28.0204, |
|
"eval_accuracy": 0.825, |
|
"eval_loss": 1.4668537378311157, |
|
"eval_runtime": 14.5998, |
|
"eval_samples_per_second": 5.48, |
|
"eval_steps_per_second": 1.37, |
|
"step": 1479 |
|
}, |
|
{ |
|
"epoch": 29.0004, |
|
"grad_norm": 0.3155474364757538, |
|
"learning_rate": 4.533333333333334e-06, |
|
"loss": 0.0002, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 29.0044, |
|
"grad_norm": 0.024095896631479263, |
|
"learning_rate": 4.488888888888889e-06, |
|
"loss": 0.0011, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 29.0084, |
|
"grad_norm": 0.00578249292448163, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.0002, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 29.0124, |
|
"grad_norm": 0.009000943042337894, |
|
"learning_rate": 4.4e-06, |
|
"loss": 0.0008, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 29.0164, |
|
"grad_norm": 0.3360608220100403, |
|
"learning_rate": 4.3555555555555555e-06, |
|
"loss": 0.0004, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 29.0204, |
|
"grad_norm": 0.007300488650798798, |
|
"learning_rate": 4.3111111111111115e-06, |
|
"loss": 0.0001, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 29.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.5396068096160889, |
|
"eval_runtime": 14.8018, |
|
"eval_samples_per_second": 5.405, |
|
"eval_steps_per_second": 1.351, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 30.004, |
|
"grad_norm": 0.0051333606243133545, |
|
"learning_rate": 4.266666666666668e-06, |
|
"loss": 0.0002, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 30.008, |
|
"grad_norm": 0.006649728864431381, |
|
"learning_rate": 4.222222222222223e-06, |
|
"loss": 0.0002, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 30.012, |
|
"grad_norm": 0.004679904319345951, |
|
"learning_rate": 4.177777777777778e-06, |
|
"loss": 0.0002, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 30.016, |
|
"grad_norm": 220.38076782226562, |
|
"learning_rate": 4.133333333333333e-06, |
|
"loss": 0.0777, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 30.02, |
|
"grad_norm": 0.004013615660369396, |
|
"learning_rate": 4.088888888888889e-06, |
|
"loss": 0.0002, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 30.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.571619987487793, |
|
"eval_runtime": 14.813, |
|
"eval_samples_per_second": 5.401, |
|
"eval_steps_per_second": 1.35, |
|
"step": 1581 |
|
}, |
|
{ |
|
"epoch": 31.0036, |
|
"grad_norm": 0.049216415733098984, |
|
"learning_rate": 4.044444444444445e-06, |
|
"loss": 0.0002, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 31.0076, |
|
"grad_norm": 0.01534576527774334, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0001, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 31.0116, |
|
"grad_norm": 0.002917769132182002, |
|
"learning_rate": 3.955555555555556e-06, |
|
"loss": 0.0002, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 31.0156, |
|
"grad_norm": 0.002222651382908225, |
|
"learning_rate": 3.911111111111112e-06, |
|
"loss": 0.0002, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 31.0196, |
|
"grad_norm": 0.008118602447211742, |
|
"learning_rate": 3.866666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 31.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 1.6614097356796265, |
|
"eval_runtime": 14.4379, |
|
"eval_samples_per_second": 5.541, |
|
"eval_steps_per_second": 1.385, |
|
"step": 1632 |
|
}, |
|
{ |
|
"epoch": 32.0032, |
|
"grad_norm": 0.004146672319620848, |
|
"learning_rate": 3.8222222222222224e-06, |
|
"loss": 0.0001, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 32.0072, |
|
"grad_norm": 0.0034797696862369776, |
|
"learning_rate": 3.777777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 32.0112, |
|
"grad_norm": 0.0042143468745052814, |
|
"learning_rate": 3.7333333333333337e-06, |
|
"loss": 0.0001, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 32.0152, |
|
"grad_norm": 0.054984163492918015, |
|
"learning_rate": 3.688888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 32.0192, |
|
"grad_norm": 0.004838942550122738, |
|
"learning_rate": 3.644444444444445e-06, |
|
"loss": 0.0002, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 32.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 1.6355606317520142, |
|
"eval_runtime": 15.4442, |
|
"eval_samples_per_second": 5.18, |
|
"eval_steps_per_second": 1.295, |
|
"step": 1683 |
|
}, |
|
{ |
|
"epoch": 33.0028, |
|
"grad_norm": 0.3461035192012787, |
|
"learning_rate": 3.6000000000000003e-06, |
|
"loss": 0.0001, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 33.0068, |
|
"grad_norm": 0.002831398043781519, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 0.0001, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 33.0108, |
|
"grad_norm": 0.06873564422130585, |
|
"learning_rate": 3.511111111111111e-06, |
|
"loss": 0.0001, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 33.0148, |
|
"grad_norm": 0.006833823397755623, |
|
"learning_rate": 3.4666666666666672e-06, |
|
"loss": 0.0002, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 33.0188, |
|
"grad_norm": 0.004828931763768196, |
|
"learning_rate": 3.4222222222222224e-06, |
|
"loss": 0.0001, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 33.0204, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 1.5730502605438232, |
|
"eval_runtime": 14.4769, |
|
"eval_samples_per_second": 5.526, |
|
"eval_steps_per_second": 1.382, |
|
"step": 1734 |
|
}, |
|
{ |
|
"epoch": 34.0024, |
|
"grad_norm": 0.02358504943549633, |
|
"learning_rate": 3.377777777777778e-06, |
|
"loss": 0.0003, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 34.0064, |
|
"grad_norm": 0.0021904546301811934, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0237, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 34.0104, |
|
"grad_norm": 0.004002240486443043, |
|
"learning_rate": 3.2888888888888894e-06, |
|
"loss": 0.0002, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 34.0144, |
|
"grad_norm": 0.00806827750056982, |
|
"learning_rate": 3.2444444444444446e-06, |
|
"loss": 0.0001, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 34.0184, |
|
"grad_norm": 0.004369418602436781, |
|
"learning_rate": 3.2000000000000003e-06, |
|
"loss": 0.0001, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 34.0204, |
|
"eval_accuracy": 0.725, |
|
"eval_loss": 2.0019965171813965, |
|
"eval_runtime": 14.7093, |
|
"eval_samples_per_second": 5.439, |
|
"eval_steps_per_second": 1.36, |
|
"step": 1785 |
|
}, |
|
{ |
|
"epoch": 35.002, |
|
"grad_norm": 0.004029570147395134, |
|
"learning_rate": 3.1555555555555555e-06, |
|
"loss": 0.0001, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 35.006, |
|
"grad_norm": 0.0854596495628357, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 0.0001, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 35.01, |
|
"grad_norm": 0.00881748553365469, |
|
"learning_rate": 3.066666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 35.014, |
|
"grad_norm": 0.007664634846150875, |
|
"learning_rate": 3.0222222222222225e-06, |
|
"loss": 0.0001, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 35.018, |
|
"grad_norm": 0.004043503198772669, |
|
"learning_rate": 2.9777777777777777e-06, |
|
"loss": 0.0001, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 35.0204, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 1.888606071472168, |
|
"eval_runtime": 16.1561, |
|
"eval_samples_per_second": 4.952, |
|
"eval_steps_per_second": 1.238, |
|
"step": 1836 |
|
}, |
|
{ |
|
"epoch": 36.0016, |
|
"grad_norm": 0.027049187570810318, |
|
"learning_rate": 2.9333333333333338e-06, |
|
"loss": 0.0001, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 36.0056, |
|
"grad_norm": 0.011712036095559597, |
|
"learning_rate": 2.888888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 36.0096, |
|
"grad_norm": 0.0028739357367157936, |
|
"learning_rate": 2.8444444444444446e-06, |
|
"loss": 0.0001, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 36.0136, |
|
"grad_norm": 0.0029218129348009825, |
|
"learning_rate": 2.8000000000000003e-06, |
|
"loss": 0.0001, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 36.0176, |
|
"grad_norm": 0.002428996842354536, |
|
"learning_rate": 2.755555555555556e-06, |
|
"loss": 0.0001, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 36.0204, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 1.8363139629364014, |
|
"eval_runtime": 14.1077, |
|
"eval_samples_per_second": 5.671, |
|
"eval_steps_per_second": 1.418, |
|
"step": 1887 |
|
}, |
|
{ |
|
"epoch": 37.0012, |
|
"grad_norm": 0.001762293977662921, |
|
"learning_rate": 2.7111111111111116e-06, |
|
"loss": 0.0001, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 37.0052, |
|
"grad_norm": 0.010973138734698296, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 37.0092, |
|
"grad_norm": 0.007034891285002232, |
|
"learning_rate": 2.6222222222222225e-06, |
|
"loss": 0.0001, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 37.0132, |
|
"grad_norm": 0.04831545799970627, |
|
"learning_rate": 2.577777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 37.0172, |
|
"grad_norm": 0.003924284130334854, |
|
"learning_rate": 2.5333333333333338e-06, |
|
"loss": 0.0001, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 37.0204, |
|
"eval_accuracy": 0.7625, |
|
"eval_loss": 1.6848043203353882, |
|
"eval_runtime": 14.1172, |
|
"eval_samples_per_second": 5.667, |
|
"eval_steps_per_second": 1.417, |
|
"step": 1938 |
|
}, |
|
{ |
|
"epoch": 38.0008, |
|
"grad_norm": 0.003881203942000866, |
|
"learning_rate": 2.488888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 38.0048, |
|
"grad_norm": 0.002467320766299963, |
|
"learning_rate": 2.4444444444444447e-06, |
|
"loss": 0.0001, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 38.0088, |
|
"grad_norm": 0.004267244599759579, |
|
"learning_rate": 2.4000000000000003e-06, |
|
"loss": 0.0001, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 38.0128, |
|
"grad_norm": 0.003334041452035308, |
|
"learning_rate": 2.3555555555555555e-06, |
|
"loss": 0.1433, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 38.0168, |
|
"grad_norm": 0.0034292838536202908, |
|
"learning_rate": 2.311111111111111e-06, |
|
"loss": 0.0001, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 38.0204, |
|
"eval_accuracy": 0.75, |
|
"eval_loss": 1.7187621593475342, |
|
"eval_runtime": 14.1135, |
|
"eval_samples_per_second": 5.668, |
|
"eval_steps_per_second": 1.417, |
|
"step": 1989 |
|
}, |
|
{ |
|
"epoch": 39.0004, |
|
"grad_norm": 0.002690413035452366, |
|
"learning_rate": 2.266666666666667e-06, |
|
"loss": 0.0142, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 39.0044, |
|
"grad_norm": 0.003135059028863907, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.0001, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 39.0084, |
|
"grad_norm": 0.003898640163242817, |
|
"learning_rate": 2.1777777777777777e-06, |
|
"loss": 0.0001, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 39.0124, |
|
"grad_norm": 0.0050819204188883305, |
|
"learning_rate": 2.133333333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 39.0164, |
|
"grad_norm": 0.0032317114528268576, |
|
"learning_rate": 2.088888888888889e-06, |
|
"loss": 0.0412, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 39.0204, |
|
"grad_norm": 0.00116757582873106, |
|
"learning_rate": 2.0444444444444447e-06, |
|
"loss": 0.0001, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 39.0204, |
|
"eval_accuracy": 0.8, |
|
"eval_loss": 1.5820459127426147, |
|
"eval_runtime": 15.2596, |
|
"eval_samples_per_second": 5.243, |
|
"eval_steps_per_second": 1.311, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 40.004, |
|
"grad_norm": 0.00318012572824955, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0001, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 40.008, |
|
"grad_norm": 0.0023195173125714064, |
|
"learning_rate": 1.955555555555556e-06, |
|
"loss": 0.0008, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 40.012, |
|
"grad_norm": 0.002880761167034507, |
|
"learning_rate": 1.9111111111111112e-06, |
|
"loss": 0.0001, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 40.016, |
|
"grad_norm": 88.72091674804688, |
|
"learning_rate": 1.8666666666666669e-06, |
|
"loss": 0.0023, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 40.02, |
|
"grad_norm": 0.0025204592384397984, |
|
"learning_rate": 1.8222222222222225e-06, |
|
"loss": 0.0001, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 40.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.6061248779296875, |
|
"eval_runtime": 14.5072, |
|
"eval_samples_per_second": 5.515, |
|
"eval_steps_per_second": 1.379, |
|
"step": 2091 |
|
}, |
|
{ |
|
"epoch": 41.0036, |
|
"grad_norm": 0.0022217093501240015, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 0.167, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 41.0076, |
|
"grad_norm": 0.0016050190897658467, |
|
"learning_rate": 1.7333333333333336e-06, |
|
"loss": 0.0001, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 41.0116, |
|
"grad_norm": 0.0031738209072500467, |
|
"learning_rate": 1.688888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 41.0156, |
|
"grad_norm": 0.0036317266058176756, |
|
"learning_rate": 1.6444444444444447e-06, |
|
"loss": 0.0001, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 41.0196, |
|
"grad_norm": 0.002648336812853813, |
|
"learning_rate": 1.6000000000000001e-06, |
|
"loss": 0.0001, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 41.0204, |
|
"eval_accuracy": 0.7, |
|
"eval_loss": 2.2816524505615234, |
|
"eval_runtime": 14.2804, |
|
"eval_samples_per_second": 5.602, |
|
"eval_steps_per_second": 1.401, |
|
"step": 2142 |
|
}, |
|
{ |
|
"epoch": 42.0032, |
|
"grad_norm": 0.0022142785601317883, |
|
"learning_rate": 1.5555555555555558e-06, |
|
"loss": 0.0001, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 42.0072, |
|
"grad_norm": 0.006730781402438879, |
|
"learning_rate": 1.5111111111111112e-06, |
|
"loss": 0.0001, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 42.0112, |
|
"grad_norm": 0.0027248021215200424, |
|
"learning_rate": 1.4666666666666669e-06, |
|
"loss": 0.0001, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 42.0152, |
|
"grad_norm": 0.0026180455461144447, |
|
"learning_rate": 1.4222222222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 42.0192, |
|
"grad_norm": 0.0017348204273730516, |
|
"learning_rate": 1.377777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 42.0204, |
|
"eval_accuracy": 0.725, |
|
"eval_loss": 2.101508617401123, |
|
"eval_runtime": 15.3729, |
|
"eval_samples_per_second": 5.204, |
|
"eval_steps_per_second": 1.301, |
|
"step": 2193 |
|
}, |
|
{ |
|
"epoch": 43.0028, |
|
"grad_norm": 0.0021387911401689053, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0001, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 43.0068, |
|
"grad_norm": 0.0030123190954327583, |
|
"learning_rate": 1.288888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 43.0108, |
|
"grad_norm": 0.0043581160716712475, |
|
"learning_rate": 1.2444444444444445e-06, |
|
"loss": 0.0001, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 43.0148, |
|
"grad_norm": 0.0023331050761044025, |
|
"learning_rate": 1.2000000000000002e-06, |
|
"loss": 0.0001, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 43.0188, |
|
"grad_norm": 0.0014669563388451934, |
|
"learning_rate": 1.1555555555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 43.0204, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 1.6356258392333984, |
|
"eval_runtime": 14.3954, |
|
"eval_samples_per_second": 5.557, |
|
"eval_steps_per_second": 1.389, |
|
"step": 2244 |
|
}, |
|
{ |
|
"epoch": 44.0024, |
|
"grad_norm": 0.0022818814031779766, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.2123, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 44.0064, |
|
"grad_norm": 0.004321521148085594, |
|
"learning_rate": 1.066666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 44.0104, |
|
"grad_norm": 0.002616771264001727, |
|
"learning_rate": 1.0222222222222223e-06, |
|
"loss": 0.0001, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 44.0144, |
|
"grad_norm": 0.008771556429564953, |
|
"learning_rate": 9.77777777777778e-07, |
|
"loss": 0.0001, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 44.0184, |
|
"grad_norm": 0.007163883652538061, |
|
"learning_rate": 9.333333333333334e-07, |
|
"loss": 0.0001, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 44.0204, |
|
"eval_accuracy": 0.8125, |
|
"eval_loss": 1.5849277973175049, |
|
"eval_runtime": 14.7946, |
|
"eval_samples_per_second": 5.407, |
|
"eval_steps_per_second": 1.352, |
|
"step": 2295 |
|
}, |
|
{ |
|
"epoch": 45.002, |
|
"grad_norm": 0.001609973143786192, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 0.0001, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 45.006, |
|
"grad_norm": 0.0015221175272017717, |
|
"learning_rate": 8.444444444444445e-07, |
|
"loss": 0.0001, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 45.01, |
|
"grad_norm": 0.00561766279861331, |
|
"learning_rate": 8.000000000000001e-07, |
|
"loss": 0.0002, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 45.014, |
|
"grad_norm": 0.005561948753893375, |
|
"learning_rate": 7.555555555555556e-07, |
|
"loss": 0.0001, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 45.018, |
|
"grad_norm": 0.0020426807459443808, |
|
"learning_rate": 7.111111111111112e-07, |
|
"loss": 0.0001, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 45.0204, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 1.6463369131088257, |
|
"eval_runtime": 15.349, |
|
"eval_samples_per_second": 5.212, |
|
"eval_steps_per_second": 1.303, |
|
"step": 2346 |
|
}, |
|
{ |
|
"epoch": 46.0016, |
|
"grad_norm": 0.011843581683933735, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.0001, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 46.0056, |
|
"grad_norm": 0.003159413579851389, |
|
"learning_rate": 6.222222222222223e-07, |
|
"loss": 0.0001, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 46.0096, |
|
"grad_norm": 0.021830186247825623, |
|
"learning_rate": 5.777777777777778e-07, |
|
"loss": 0.0001, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 46.0136, |
|
"grad_norm": 0.0015857354737818241, |
|
"learning_rate": 5.333333333333335e-07, |
|
"loss": 0.0001, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 46.0176, |
|
"grad_norm": 0.0021360372193157673, |
|
"learning_rate": 4.88888888888889e-07, |
|
"loss": 0.0001, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 46.0204, |
|
"eval_accuracy": 0.775, |
|
"eval_loss": 1.664137601852417, |
|
"eval_runtime": 14.3393, |
|
"eval_samples_per_second": 5.579, |
|
"eval_steps_per_second": 1.395, |
|
"step": 2397 |
|
}, |
|
{ |
|
"epoch": 47.0012, |
|
"grad_norm": 7.805647850036621, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 0.0007, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 47.0052, |
|
"grad_norm": 0.002305046422407031, |
|
"learning_rate": 4.0000000000000003e-07, |
|
"loss": 0.0001, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 47.0092, |
|
"grad_norm": 0.0024446428287774324, |
|
"learning_rate": 3.555555555555556e-07, |
|
"loss": 0.0001, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 47.0132, |
|
"grad_norm": 0.0018226341344416142, |
|
"learning_rate": 3.111111111111111e-07, |
|
"loss": 0.0001, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 47.0172, |
|
"grad_norm": 0.0018344988347962499, |
|
"learning_rate": 2.666666666666667e-07, |
|
"loss": 0.0001, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 47.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.612348198890686, |
|
"eval_runtime": 14.4481, |
|
"eval_samples_per_second": 5.537, |
|
"eval_steps_per_second": 1.384, |
|
"step": 2448 |
|
}, |
|
{ |
|
"epoch": 48.0008, |
|
"grad_norm": 0.0016147164860740304, |
|
"learning_rate": 2.2222222222222224e-07, |
|
"loss": 0.0001, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 48.0048, |
|
"grad_norm": 0.0015180219197645783, |
|
"learning_rate": 1.777777777777778e-07, |
|
"loss": 0.0001, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 48.0088, |
|
"grad_norm": 0.0027082718443125486, |
|
"learning_rate": 1.3333333333333336e-07, |
|
"loss": 0.0001, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 48.0128, |
|
"grad_norm": 0.0015189133118838072, |
|
"learning_rate": 8.88888888888889e-08, |
|
"loss": 0.0001, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 48.0168, |
|
"grad_norm": 0.0014700506580993533, |
|
"learning_rate": 4.444444444444445e-08, |
|
"loss": 0.0001, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 48.0204, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.6145384311676025, |
|
"eval_runtime": 14.6364, |
|
"eval_samples_per_second": 5.466, |
|
"eval_steps_per_second": 1.366, |
|
"step": 2499 |
|
}, |
|
{ |
|
"epoch": 49.0004, |
|
"grad_norm": 0.005640446674078703, |
|
"learning_rate": 0.0, |
|
"loss": 0.0001, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 49.0004, |
|
"eval_accuracy": 0.7875, |
|
"eval_loss": 1.6145387887954712, |
|
"eval_runtime": 15.4845, |
|
"eval_samples_per_second": 5.166, |
|
"eval_steps_per_second": 1.292, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 49.0004, |
|
"step": 2500, |
|
"total_flos": 4.34799425740686e+19, |
|
"train_loss": 0.1747144501608098, |
|
"train_runtime": 4798.9949, |
|
"train_samples_per_second": 2.084, |
|
"train_steps_per_second": 0.521 |
|
}, |
|
{ |
|
"epoch": 49.0004, |
|
"eval_accuracy": 0.5753424657534246, |
|
"eval_loss": 2.5945777893066406, |
|
"eval_runtime": 15.5022, |
|
"eval_samples_per_second": 4.709, |
|
"eval_steps_per_second": 1.226, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 49.0004, |
|
"eval_accuracy": 0.5753424657534246, |
|
"eval_loss": 2.5945777893066406, |
|
"eval_runtime": 14.1475, |
|
"eval_samples_per_second": 5.16, |
|
"eval_steps_per_second": 1.343, |
|
"step": 2500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 2500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 4.34799425740686e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|