|
{ |
|
"best_metric": 0.330095499753952, |
|
"best_model_checkpoint": "vit-weldclassifyv4/checkpoint-1000", |
|
"epoch": 10.0, |
|
"eval_steps": 100, |
|
"global_step": 1560, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0641025641025641, |
|
"grad_norm": 1.5280327796936035, |
|
"learning_rate": 0.00019871794871794874, |
|
"loss": 1.263, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.1282051282051282, |
|
"grad_norm": 1.8514498472213745, |
|
"learning_rate": 0.00019743589743589744, |
|
"loss": 1.1159, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.19230769230769232, |
|
"grad_norm": 1.2513030767440796, |
|
"learning_rate": 0.00019615384615384615, |
|
"loss": 1.164, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2564102564102564, |
|
"grad_norm": 2.280437707901001, |
|
"learning_rate": 0.00019487179487179487, |
|
"loss": 0.9528, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.32051282051282054, |
|
"grad_norm": 1.246656060218811, |
|
"learning_rate": 0.0001935897435897436, |
|
"loss": 1.0313, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.38461538461538464, |
|
"grad_norm": 1.9358172416687012, |
|
"learning_rate": 0.00019230769230769233, |
|
"loss": 1.038, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.44871794871794873, |
|
"grad_norm": 2.415847063064575, |
|
"learning_rate": 0.00019102564102564104, |
|
"loss": 0.9088, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.5128205128205128, |
|
"grad_norm": 2.1359400749206543, |
|
"learning_rate": 0.00018974358974358974, |
|
"loss": 0.8265, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5769230769230769, |
|
"grad_norm": 2.2672863006591797, |
|
"learning_rate": 0.00018846153846153847, |
|
"loss": 0.8904, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"grad_norm": 2.525250196456909, |
|
"learning_rate": 0.0001871794871794872, |
|
"loss": 0.8207, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6410256410256411, |
|
"eval_accuracy": 0.564748201438849, |
|
"eval_loss": 1.0335605144500732, |
|
"eval_runtime": 2.4754, |
|
"eval_samples_per_second": 112.303, |
|
"eval_steps_per_second": 14.139, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.7051282051282052, |
|
"grad_norm": 1.6615636348724365, |
|
"learning_rate": 0.0001858974358974359, |
|
"loss": 0.9254, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7692307692307693, |
|
"grad_norm": 3.1376407146453857, |
|
"learning_rate": 0.00018461538461538463, |
|
"loss": 0.8125, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 1.8494981527328491, |
|
"learning_rate": 0.00018333333333333334, |
|
"loss": 0.6967, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8974358974358975, |
|
"grad_norm": 2.7374682426452637, |
|
"learning_rate": 0.00018205128205128207, |
|
"loss": 0.7031, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9615384615384616, |
|
"grad_norm": 2.8239476680755615, |
|
"learning_rate": 0.00018076923076923077, |
|
"loss": 0.7871, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 1.0256410256410255, |
|
"grad_norm": 2.239936590194702, |
|
"learning_rate": 0.0001794871794871795, |
|
"loss": 0.8064, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.0897435897435896, |
|
"grad_norm": 2.60086727142334, |
|
"learning_rate": 0.00017820512820512823, |
|
"loss": 0.5349, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.1538461538461537, |
|
"grad_norm": 3.485903024673462, |
|
"learning_rate": 0.00017692307692307693, |
|
"loss": 0.5896, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.217948717948718, |
|
"grad_norm": 3.2817769050598145, |
|
"learning_rate": 0.00017564102564102566, |
|
"loss": 0.729, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.282051282051282, |
|
"grad_norm": 4.110422611236572, |
|
"learning_rate": 0.00017435897435897436, |
|
"loss": 0.6506, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.282051282051282, |
|
"eval_accuracy": 0.579136690647482, |
|
"eval_loss": 1.1981723308563232, |
|
"eval_runtime": 2.8799, |
|
"eval_samples_per_second": 96.532, |
|
"eval_steps_per_second": 12.153, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.3461538461538463, |
|
"grad_norm": 2.515904664993286, |
|
"learning_rate": 0.0001730769230769231, |
|
"loss": 0.8231, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.4102564102564101, |
|
"grad_norm": 2.2017431259155273, |
|
"learning_rate": 0.0001717948717948718, |
|
"loss": 0.6216, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4743589743589745, |
|
"grad_norm": 2.253706693649292, |
|
"learning_rate": 0.00017051282051282053, |
|
"loss": 0.5126, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.5384615384615383, |
|
"grad_norm": 3.4106080532073975, |
|
"learning_rate": 0.00016923076923076923, |
|
"loss": 0.471, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.6025641025641026, |
|
"grad_norm": 1.884901523590088, |
|
"learning_rate": 0.00016794871794871796, |
|
"loss": 0.4521, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 3.056332588195801, |
|
"learning_rate": 0.0001666666666666667, |
|
"loss": 0.3808, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.7307692307692308, |
|
"grad_norm": 3.2792108058929443, |
|
"learning_rate": 0.0001653846153846154, |
|
"loss": 0.3568, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7948717948717947, |
|
"grad_norm": 3.657397985458374, |
|
"learning_rate": 0.0001641025641025641, |
|
"loss": 0.4942, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.858974358974359, |
|
"grad_norm": 3.565870523452759, |
|
"learning_rate": 0.00016282051282051282, |
|
"loss": 0.5117, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"grad_norm": 1.8648629188537598, |
|
"learning_rate": 0.00016153846153846155, |
|
"loss": 0.5324, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9230769230769231, |
|
"eval_accuracy": 0.7769784172661871, |
|
"eval_loss": 0.605965256690979, |
|
"eval_runtime": 2.4146, |
|
"eval_samples_per_second": 115.134, |
|
"eval_steps_per_second": 14.495, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9871794871794872, |
|
"grad_norm": 5.665286540985107, |
|
"learning_rate": 0.00016025641025641028, |
|
"loss": 0.5566, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.051282051282051, |
|
"grad_norm": 0.6339452266693115, |
|
"learning_rate": 0.00015897435897435896, |
|
"loss": 0.3058, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.1153846153846154, |
|
"grad_norm": 2.2966978549957275, |
|
"learning_rate": 0.0001576923076923077, |
|
"loss": 0.2831, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.1794871794871793, |
|
"grad_norm": 2.510307550430298, |
|
"learning_rate": 0.00015641025641025642, |
|
"loss": 0.2055, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.2435897435897436, |
|
"grad_norm": 1.1059399843215942, |
|
"learning_rate": 0.00015512820512820515, |
|
"loss": 0.2886, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.3076923076923075, |
|
"grad_norm": 6.055357933044434, |
|
"learning_rate": 0.00015384615384615385, |
|
"loss": 0.3756, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.371794871794872, |
|
"grad_norm": 0.5536957383155823, |
|
"learning_rate": 0.00015256410256410255, |
|
"loss": 0.412, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.435897435897436, |
|
"grad_norm": 5.276978969573975, |
|
"learning_rate": 0.00015128205128205128, |
|
"loss": 0.1798, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.5, |
|
"grad_norm": 2.25166916847229, |
|
"learning_rate": 0.00015000000000000001, |
|
"loss": 0.1813, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"grad_norm": 4.955526351928711, |
|
"learning_rate": 0.00014871794871794872, |
|
"loss": 0.2486, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.564102564102564, |
|
"eval_accuracy": 0.7517985611510791, |
|
"eval_loss": 0.729444682598114, |
|
"eval_runtime": 4.0175, |
|
"eval_samples_per_second": 69.198, |
|
"eval_steps_per_second": 8.712, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.628205128205128, |
|
"grad_norm": 5.6987690925598145, |
|
"learning_rate": 0.00014743589743589745, |
|
"loss": 0.5142, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6923076923076925, |
|
"grad_norm": 0.589967668056488, |
|
"learning_rate": 0.00014615384615384615, |
|
"loss": 0.2685, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7564102564102564, |
|
"grad_norm": 2.2702548503875732, |
|
"learning_rate": 0.00014487179487179488, |
|
"loss": 0.3104, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.8205128205128203, |
|
"grad_norm": 4.440503120422363, |
|
"learning_rate": 0.0001435897435897436, |
|
"loss": 0.2192, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.8846153846153846, |
|
"grad_norm": 1.690927267074585, |
|
"learning_rate": 0.0001423076923076923, |
|
"loss": 0.2557, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.948717948717949, |
|
"grad_norm": 9.020477294921875, |
|
"learning_rate": 0.00014102564102564104, |
|
"loss": 0.3725, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 3.0128205128205128, |
|
"grad_norm": 1.131715178489685, |
|
"learning_rate": 0.00013974358974358974, |
|
"loss": 0.3586, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 3.076923076923077, |
|
"grad_norm": 2.3979876041412354, |
|
"learning_rate": 0.00013846153846153847, |
|
"loss": 0.1712, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 3.141025641025641, |
|
"grad_norm": 1.2889968156814575, |
|
"learning_rate": 0.00013717948717948718, |
|
"loss": 0.1988, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 3.2051282051282053, |
|
"grad_norm": 2.893319606781006, |
|
"learning_rate": 0.0001358974358974359, |
|
"loss": 0.1366, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.2051282051282053, |
|
"eval_accuracy": 0.841726618705036, |
|
"eval_loss": 0.4832339882850647, |
|
"eval_runtime": 2.588, |
|
"eval_samples_per_second": 107.42, |
|
"eval_steps_per_second": 13.524, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 3.269230769230769, |
|
"grad_norm": 3.6555581092834473, |
|
"learning_rate": 0.00013461538461538464, |
|
"loss": 0.1222, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 3.3333333333333335, |
|
"grad_norm": 0.1904444396495819, |
|
"learning_rate": 0.00013333333333333334, |
|
"loss": 0.1654, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 3.3974358974358974, |
|
"grad_norm": 4.902673244476318, |
|
"learning_rate": 0.00013205128205128204, |
|
"loss": 0.198, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 3.4615384615384617, |
|
"grad_norm": 0.30183860659599304, |
|
"learning_rate": 0.00013076923076923077, |
|
"loss": 0.2074, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 3.5256410256410255, |
|
"grad_norm": 4.17673397064209, |
|
"learning_rate": 0.0001294871794871795, |
|
"loss": 0.1021, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 3.58974358974359, |
|
"grad_norm": 1.6145508289337158, |
|
"learning_rate": 0.00012820512820512823, |
|
"loss": 0.1074, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 3.6538461538461537, |
|
"grad_norm": 4.717573165893555, |
|
"learning_rate": 0.00012692307692307693, |
|
"loss": 0.1201, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 3.717948717948718, |
|
"grad_norm": 1.2709864377975464, |
|
"learning_rate": 0.00012564102564102564, |
|
"loss": 0.0544, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 3.782051282051282, |
|
"grad_norm": 3.7621912956237793, |
|
"learning_rate": 0.00012435897435897437, |
|
"loss": 0.2016, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"grad_norm": 12.426462173461914, |
|
"learning_rate": 0.0001230769230769231, |
|
"loss": 0.3124, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.8461538461538463, |
|
"eval_accuracy": 0.762589928057554, |
|
"eval_loss": 0.8676345348358154, |
|
"eval_runtime": 3.0816, |
|
"eval_samples_per_second": 90.213, |
|
"eval_steps_per_second": 11.358, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 3.91025641025641, |
|
"grad_norm": 10.93652057647705, |
|
"learning_rate": 0.00012179487179487179, |
|
"loss": 0.2992, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 3.9743589743589745, |
|
"grad_norm": 0.6971213221549988, |
|
"learning_rate": 0.00012051282051282052, |
|
"loss": 0.1864, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 4.038461538461538, |
|
"grad_norm": 6.531364917755127, |
|
"learning_rate": 0.00011923076923076923, |
|
"loss": 0.1929, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 4.102564102564102, |
|
"grad_norm": 0.8437137007713318, |
|
"learning_rate": 0.00011794871794871796, |
|
"loss": 0.0816, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 4.166666666666667, |
|
"grad_norm": 9.23108196258545, |
|
"learning_rate": 0.00011666666666666668, |
|
"loss": 0.0654, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 4.230769230769231, |
|
"grad_norm": 1.1913517713546753, |
|
"learning_rate": 0.00011538461538461538, |
|
"loss": 0.0974, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 4.294871794871795, |
|
"grad_norm": 8.05540657043457, |
|
"learning_rate": 0.0001141025641025641, |
|
"loss": 0.0466, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 4.358974358974359, |
|
"grad_norm": 0.1012343019247055, |
|
"learning_rate": 0.00011282051282051283, |
|
"loss": 0.0641, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 4.423076923076923, |
|
"grad_norm": 7.817044734954834, |
|
"learning_rate": 0.00011153846153846154, |
|
"loss": 0.1442, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 4.487179487179487, |
|
"grad_norm": 6.788941860198975, |
|
"learning_rate": 0.00011025641025641027, |
|
"loss": 0.0296, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.487179487179487, |
|
"eval_accuracy": 0.8884892086330936, |
|
"eval_loss": 0.4233308434486389, |
|
"eval_runtime": 2.523, |
|
"eval_samples_per_second": 110.184, |
|
"eval_steps_per_second": 13.872, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 4.551282051282051, |
|
"grad_norm": 4.860511302947998, |
|
"learning_rate": 0.00010897435897435896, |
|
"loss": 0.0231, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 4.615384615384615, |
|
"grad_norm": 0.9598804116249084, |
|
"learning_rate": 0.0001076923076923077, |
|
"loss": 0.0207, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 4.67948717948718, |
|
"grad_norm": 12.745481491088867, |
|
"learning_rate": 0.00010641025641025641, |
|
"loss": 0.1516, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 4.743589743589744, |
|
"grad_norm": 7.983795166015625, |
|
"learning_rate": 0.00010512820512820514, |
|
"loss": 0.1638, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 4.8076923076923075, |
|
"grad_norm": 3.051384449005127, |
|
"learning_rate": 0.00010384615384615386, |
|
"loss": 0.0477, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 4.871794871794872, |
|
"grad_norm": 0.10625698417425156, |
|
"learning_rate": 0.00010256410256410256, |
|
"loss": 0.0719, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 4.935897435897436, |
|
"grad_norm": 0.04624614119529724, |
|
"learning_rate": 0.00010128205128205129, |
|
"loss": 0.1069, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 5.0, |
|
"grad_norm": 0.08277003467082977, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0152, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 5.064102564102564, |
|
"grad_norm": 0.09980784356594086, |
|
"learning_rate": 9.871794871794872e-05, |
|
"loss": 0.0719, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"grad_norm": 0.09162779897451401, |
|
"learning_rate": 9.743589743589744e-05, |
|
"loss": 0.0723, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.128205128205128, |
|
"eval_accuracy": 0.8848920863309353, |
|
"eval_loss": 0.4469863176345825, |
|
"eval_runtime": 2.7699, |
|
"eval_samples_per_second": 100.363, |
|
"eval_steps_per_second": 12.636, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 5.1923076923076925, |
|
"grad_norm": 0.05171818658709526, |
|
"learning_rate": 9.615384615384617e-05, |
|
"loss": 0.0192, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 5.256410256410256, |
|
"grad_norm": 0.05209165811538696, |
|
"learning_rate": 9.487179487179487e-05, |
|
"loss": 0.0394, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 5.32051282051282, |
|
"grad_norm": 0.960054874420166, |
|
"learning_rate": 9.35897435897436e-05, |
|
"loss": 0.0129, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 5.384615384615385, |
|
"grad_norm": 0.09233374148607254, |
|
"learning_rate": 9.230769230769232e-05, |
|
"loss": 0.0138, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 5.448717948717949, |
|
"grad_norm": 0.09635169804096222, |
|
"learning_rate": 9.102564102564103e-05, |
|
"loss": 0.0096, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 5.512820512820513, |
|
"grad_norm": 1.3777004480361938, |
|
"learning_rate": 8.974358974358975e-05, |
|
"loss": 0.0412, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 5.576923076923077, |
|
"grad_norm": 0.03339802846312523, |
|
"learning_rate": 8.846153846153847e-05, |
|
"loss": 0.0424, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 5.641025641025641, |
|
"grad_norm": 0.032307617366313934, |
|
"learning_rate": 8.717948717948718e-05, |
|
"loss": 0.0161, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 5.705128205128205, |
|
"grad_norm": 0.03049471788108349, |
|
"learning_rate": 8.58974358974359e-05, |
|
"loss": 0.0388, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"grad_norm": 0.05182625725865364, |
|
"learning_rate": 8.461538461538461e-05, |
|
"loss": 0.0342, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.769230769230769, |
|
"eval_accuracy": 0.9172661870503597, |
|
"eval_loss": 0.3406282067298889, |
|
"eval_runtime": 2.3863, |
|
"eval_samples_per_second": 116.5, |
|
"eval_steps_per_second": 14.667, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 5.833333333333333, |
|
"grad_norm": 0.2365674525499344, |
|
"learning_rate": 8.333333333333334e-05, |
|
"loss": 0.0837, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 5.897435897435898, |
|
"grad_norm": 0.031284429132938385, |
|
"learning_rate": 8.205128205128205e-05, |
|
"loss": 0.0691, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 5.961538461538462, |
|
"grad_norm": 10.787687301635742, |
|
"learning_rate": 8.076923076923078e-05, |
|
"loss": 0.0524, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 6.0256410256410255, |
|
"grad_norm": 0.027590090408921242, |
|
"learning_rate": 7.948717948717948e-05, |
|
"loss": 0.0086, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 6.089743589743589, |
|
"grad_norm": 0.04675084725022316, |
|
"learning_rate": 7.820512820512821e-05, |
|
"loss": 0.0066, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 6.153846153846154, |
|
"grad_norm": 0.032889507710933685, |
|
"learning_rate": 7.692307692307693e-05, |
|
"loss": 0.0432, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 6.217948717948718, |
|
"grad_norm": 0.1580750048160553, |
|
"learning_rate": 7.564102564102564e-05, |
|
"loss": 0.006, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 6.282051282051282, |
|
"grad_norm": 0.024286190047860146, |
|
"learning_rate": 7.435897435897436e-05, |
|
"loss": 0.0123, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 6.346153846153846, |
|
"grad_norm": 0.02685542032122612, |
|
"learning_rate": 7.307692307692307e-05, |
|
"loss": 0.0059, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 6.410256410256411, |
|
"grad_norm": 0.9080101251602173, |
|
"learning_rate": 7.17948717948718e-05, |
|
"loss": 0.0055, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.410256410256411, |
|
"eval_accuracy": 0.920863309352518, |
|
"eval_loss": 0.330095499753952, |
|
"eval_runtime": 3.1626, |
|
"eval_samples_per_second": 87.904, |
|
"eval_steps_per_second": 11.067, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 6.4743589743589745, |
|
"grad_norm": 0.156268909573555, |
|
"learning_rate": 7.051282051282052e-05, |
|
"loss": 0.0051, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 6.538461538461538, |
|
"grad_norm": 0.025522593408823013, |
|
"learning_rate": 6.923076923076924e-05, |
|
"loss": 0.0175, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 6.602564102564102, |
|
"grad_norm": 0.025892965495586395, |
|
"learning_rate": 6.794871794871795e-05, |
|
"loss": 0.0133, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 6.666666666666667, |
|
"grad_norm": 0.02324897050857544, |
|
"learning_rate": 6.666666666666667e-05, |
|
"loss": 0.0051, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 6.730769230769231, |
|
"grad_norm": 0.20136423408985138, |
|
"learning_rate": 6.538461538461539e-05, |
|
"loss": 0.0318, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 6.794871794871795, |
|
"grad_norm": 0.11247438937425613, |
|
"learning_rate": 6.410256410256412e-05, |
|
"loss": 0.0331, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 6.858974358974359, |
|
"grad_norm": 0.10950164496898651, |
|
"learning_rate": 6.282051282051282e-05, |
|
"loss": 0.0055, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 6.923076923076923, |
|
"grad_norm": 1.537802815437317, |
|
"learning_rate": 6.153846153846155e-05, |
|
"loss": 0.0055, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 6.987179487179487, |
|
"grad_norm": 0.023923929780721664, |
|
"learning_rate": 6.025641025641026e-05, |
|
"loss": 0.0044, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 7.051282051282051, |
|
"grad_norm": 0.02083686552941799, |
|
"learning_rate": 5.897435897435898e-05, |
|
"loss": 0.0048, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.051282051282051, |
|
"eval_accuracy": 0.9172661870503597, |
|
"eval_loss": 0.3471122980117798, |
|
"eval_runtime": 2.4758, |
|
"eval_samples_per_second": 112.287, |
|
"eval_steps_per_second": 14.137, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 7.115384615384615, |
|
"grad_norm": 0.020538046956062317, |
|
"learning_rate": 5.769230769230769e-05, |
|
"loss": 0.0042, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 7.17948717948718, |
|
"grad_norm": 0.01733437366783619, |
|
"learning_rate": 5.6410256410256414e-05, |
|
"loss": 0.0039, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 7.243589743589744, |
|
"grad_norm": 0.01968984678387642, |
|
"learning_rate": 5.512820512820514e-05, |
|
"loss": 0.0038, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 7.3076923076923075, |
|
"grad_norm": 0.019213447347283363, |
|
"learning_rate": 5.384615384615385e-05, |
|
"loss": 0.0036, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 7.371794871794872, |
|
"grad_norm": 0.017935629934072495, |
|
"learning_rate": 5.256410256410257e-05, |
|
"loss": 0.004, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 7.435897435897436, |
|
"grad_norm": 0.01726532354950905, |
|
"learning_rate": 5.128205128205128e-05, |
|
"loss": 0.0038, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 7.5, |
|
"grad_norm": 0.01753012090921402, |
|
"learning_rate": 5e-05, |
|
"loss": 0.0038, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 7.564102564102564, |
|
"grad_norm": 0.018105851486325264, |
|
"learning_rate": 4.871794871794872e-05, |
|
"loss": 0.0036, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 7.628205128205128, |
|
"grad_norm": 0.019911447539925575, |
|
"learning_rate": 4.7435897435897435e-05, |
|
"loss": 0.0037, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"grad_norm": 0.023634430021047592, |
|
"learning_rate": 4.615384615384616e-05, |
|
"loss": 0.0036, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 7.6923076923076925, |
|
"eval_accuracy": 0.9136690647482014, |
|
"eval_loss": 0.33460894227027893, |
|
"eval_runtime": 2.4621, |
|
"eval_samples_per_second": 112.91, |
|
"eval_steps_per_second": 14.215, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 7.756410256410256, |
|
"grad_norm": 0.017936883494257927, |
|
"learning_rate": 4.4871794871794874e-05, |
|
"loss": 0.0034, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 7.82051282051282, |
|
"grad_norm": 0.01885095238685608, |
|
"learning_rate": 4.358974358974359e-05, |
|
"loss": 0.0034, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 7.884615384615385, |
|
"grad_norm": 0.017711780965328217, |
|
"learning_rate": 4.230769230769231e-05, |
|
"loss": 0.0033, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 7.948717948717949, |
|
"grad_norm": 0.014750463888049126, |
|
"learning_rate": 4.1025641025641023e-05, |
|
"loss": 0.0034, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 8.012820512820513, |
|
"grad_norm": 0.014598443172872066, |
|
"learning_rate": 3.974358974358974e-05, |
|
"loss": 0.0031, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 8.076923076923077, |
|
"grad_norm": 0.01595359854400158, |
|
"learning_rate": 3.846153846153846e-05, |
|
"loss": 0.0032, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 8.14102564102564, |
|
"grad_norm": 0.01710698939859867, |
|
"learning_rate": 3.717948717948718e-05, |
|
"loss": 0.0032, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 8.205128205128204, |
|
"grad_norm": 0.015550950542092323, |
|
"learning_rate": 3.58974358974359e-05, |
|
"loss": 0.0031, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 8.26923076923077, |
|
"grad_norm": 0.015512553043663502, |
|
"learning_rate": 3.461538461538462e-05, |
|
"loss": 0.0031, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"grad_norm": 0.01687728427350521, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 0.003, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.333333333333334, |
|
"eval_accuracy": 0.9136690647482014, |
|
"eval_loss": 0.34976544976234436, |
|
"eval_runtime": 2.3973, |
|
"eval_samples_per_second": 115.962, |
|
"eval_steps_per_second": 14.599, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 8.397435897435898, |
|
"grad_norm": 0.017616627737879753, |
|
"learning_rate": 3.205128205128206e-05, |
|
"loss": 0.0031, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 8.461538461538462, |
|
"grad_norm": 0.017452212050557137, |
|
"learning_rate": 3.0769230769230774e-05, |
|
"loss": 0.0031, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 8.525641025641026, |
|
"grad_norm": 0.017976053059101105, |
|
"learning_rate": 2.948717948717949e-05, |
|
"loss": 0.0032, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 8.58974358974359, |
|
"grad_norm": 0.014091568998992443, |
|
"learning_rate": 2.8205128205128207e-05, |
|
"loss": 0.0032, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 8.653846153846153, |
|
"grad_norm": 0.015703728422522545, |
|
"learning_rate": 2.6923076923076923e-05, |
|
"loss": 0.003, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 8.717948717948717, |
|
"grad_norm": 0.01781061850488186, |
|
"learning_rate": 2.564102564102564e-05, |
|
"loss": 0.003, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 8.782051282051283, |
|
"grad_norm": 0.01647392474114895, |
|
"learning_rate": 2.435897435897436e-05, |
|
"loss": 0.0031, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 8.846153846153847, |
|
"grad_norm": 0.013498615473508835, |
|
"learning_rate": 2.307692307692308e-05, |
|
"loss": 0.0028, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 8.91025641025641, |
|
"grad_norm": 0.016175739467144012, |
|
"learning_rate": 2.1794871794871795e-05, |
|
"loss": 0.003, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 8.974358974358974, |
|
"grad_norm": 0.015950603410601616, |
|
"learning_rate": 2.0512820512820512e-05, |
|
"loss": 0.003, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 8.974358974358974, |
|
"eval_accuracy": 0.9100719424460432, |
|
"eval_loss": 0.354926198720932, |
|
"eval_runtime": 2.4324, |
|
"eval_samples_per_second": 114.29, |
|
"eval_steps_per_second": 14.389, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 9.038461538461538, |
|
"grad_norm": 0.014547958970069885, |
|
"learning_rate": 1.923076923076923e-05, |
|
"loss": 0.0029, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 9.102564102564102, |
|
"grad_norm": 0.012941875495016575, |
|
"learning_rate": 1.794871794871795e-05, |
|
"loss": 0.0028, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 9.166666666666666, |
|
"grad_norm": 0.016635097563266754, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 0.003, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 9.23076923076923, |
|
"grad_norm": 0.018657604232430458, |
|
"learning_rate": 1.5384615384615387e-05, |
|
"loss": 0.0029, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 9.294871794871796, |
|
"grad_norm": 0.015006115660071373, |
|
"learning_rate": 1.4102564102564104e-05, |
|
"loss": 0.0031, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 9.35897435897436, |
|
"grad_norm": 0.01575641520321369, |
|
"learning_rate": 1.282051282051282e-05, |
|
"loss": 0.0027, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 9.423076923076923, |
|
"grad_norm": 0.013228046707808971, |
|
"learning_rate": 1.153846153846154e-05, |
|
"loss": 0.0027, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 9.487179487179487, |
|
"grad_norm": 0.013002932071685791, |
|
"learning_rate": 1.0256410256410256e-05, |
|
"loss": 0.0027, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 9.551282051282051, |
|
"grad_norm": 0.014644928276538849, |
|
"learning_rate": 8.974358974358976e-06, |
|
"loss": 0.0029, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"grad_norm": 0.01448275987058878, |
|
"learning_rate": 7.692307692307694e-06, |
|
"loss": 0.0027, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.615384615384615, |
|
"eval_accuracy": 0.9136690647482014, |
|
"eval_loss": 0.3569168150424957, |
|
"eval_runtime": 2.7846, |
|
"eval_samples_per_second": 99.835, |
|
"eval_steps_per_second": 12.569, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 9.679487179487179, |
|
"grad_norm": 0.012788016349077225, |
|
"learning_rate": 6.41025641025641e-06, |
|
"loss": 0.0028, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 9.743589743589745, |
|
"grad_norm": 0.014576306566596031, |
|
"learning_rate": 5.128205128205128e-06, |
|
"loss": 0.0026, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 9.807692307692308, |
|
"grad_norm": 0.014699741266667843, |
|
"learning_rate": 3.846153846153847e-06, |
|
"loss": 0.0029, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 9.871794871794872, |
|
"grad_norm": 0.015379097312688828, |
|
"learning_rate": 2.564102564102564e-06, |
|
"loss": 0.0026, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 9.935897435897436, |
|
"grad_norm": 0.012021095491945744, |
|
"learning_rate": 1.282051282051282e-06, |
|
"loss": 0.0027, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"grad_norm": 0.014515766873955727, |
|
"learning_rate": 0.0, |
|
"loss": 0.0028, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 10.0, |
|
"step": 1560, |
|
"total_flos": 1.9334597982400512e+18, |
|
"train_loss": 0.20911514231314263, |
|
"train_runtime": 603.1201, |
|
"train_samples_per_second": 41.368, |
|
"train_steps_per_second": 2.587 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 1560, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 10, |
|
"save_steps": 100, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 1.9334597982400512e+18, |
|
"train_batch_size": 16, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|