{ "best_metric": 0.330095499753952, "best_model_checkpoint": "vit-weldclassifyv4/checkpoint-1000", "epoch": 10.0, "eval_steps": 100, "global_step": 1560, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0641025641025641, "grad_norm": 1.5280327796936035, "learning_rate": 0.00019871794871794874, "loss": 1.263, "step": 10 }, { "epoch": 0.1282051282051282, "grad_norm": 1.8514498472213745, "learning_rate": 0.00019743589743589744, "loss": 1.1159, "step": 20 }, { "epoch": 0.19230769230769232, "grad_norm": 1.2513030767440796, "learning_rate": 0.00019615384615384615, "loss": 1.164, "step": 30 }, { "epoch": 0.2564102564102564, "grad_norm": 2.280437707901001, "learning_rate": 0.00019487179487179487, "loss": 0.9528, "step": 40 }, { "epoch": 0.32051282051282054, "grad_norm": 1.246656060218811, "learning_rate": 0.0001935897435897436, "loss": 1.0313, "step": 50 }, { "epoch": 0.38461538461538464, "grad_norm": 1.9358172416687012, "learning_rate": 0.00019230769230769233, "loss": 1.038, "step": 60 }, { "epoch": 0.44871794871794873, "grad_norm": 2.415847063064575, "learning_rate": 0.00019102564102564104, "loss": 0.9088, "step": 70 }, { "epoch": 0.5128205128205128, "grad_norm": 2.1359400749206543, "learning_rate": 0.00018974358974358974, "loss": 0.8265, "step": 80 }, { "epoch": 0.5769230769230769, "grad_norm": 2.2672863006591797, "learning_rate": 0.00018846153846153847, "loss": 0.8904, "step": 90 }, { "epoch": 0.6410256410256411, "grad_norm": 2.525250196456909, "learning_rate": 0.0001871794871794872, "loss": 0.8207, "step": 100 }, { "epoch": 0.6410256410256411, "eval_accuracy": 0.564748201438849, "eval_loss": 1.0335605144500732, "eval_runtime": 2.4754, "eval_samples_per_second": 112.303, "eval_steps_per_second": 14.139, "step": 100 }, { "epoch": 0.7051282051282052, "grad_norm": 1.6615636348724365, "learning_rate": 0.0001858974358974359, "loss": 0.9254, "step": 110 }, { "epoch": 0.7692307692307693, "grad_norm": 3.1376407146453857, "learning_rate": 0.00018461538461538463, "loss": 0.8125, "step": 120 }, { "epoch": 0.8333333333333334, "grad_norm": 1.8494981527328491, "learning_rate": 0.00018333333333333334, "loss": 0.6967, "step": 130 }, { "epoch": 0.8974358974358975, "grad_norm": 2.7374682426452637, "learning_rate": 0.00018205128205128207, "loss": 0.7031, "step": 140 }, { "epoch": 0.9615384615384616, "grad_norm": 2.8239476680755615, "learning_rate": 0.00018076923076923077, "loss": 0.7871, "step": 150 }, { "epoch": 1.0256410256410255, "grad_norm": 2.239936590194702, "learning_rate": 0.0001794871794871795, "loss": 0.8064, "step": 160 }, { "epoch": 1.0897435897435896, "grad_norm": 2.60086727142334, "learning_rate": 0.00017820512820512823, "loss": 0.5349, "step": 170 }, { "epoch": 1.1538461538461537, "grad_norm": 3.485903024673462, "learning_rate": 0.00017692307692307693, "loss": 0.5896, "step": 180 }, { "epoch": 1.217948717948718, "grad_norm": 3.2817769050598145, "learning_rate": 0.00017564102564102566, "loss": 0.729, "step": 190 }, { "epoch": 1.282051282051282, "grad_norm": 4.110422611236572, "learning_rate": 0.00017435897435897436, "loss": 0.6506, "step": 200 }, { "epoch": 1.282051282051282, "eval_accuracy": 0.579136690647482, "eval_loss": 1.1981723308563232, "eval_runtime": 2.8799, "eval_samples_per_second": 96.532, "eval_steps_per_second": 12.153, "step": 200 }, { "epoch": 1.3461538461538463, "grad_norm": 2.515904664993286, "learning_rate": 0.0001730769230769231, "loss": 0.8231, "step": 210 }, { "epoch": 1.4102564102564101, "grad_norm": 2.2017431259155273, "learning_rate": 0.0001717948717948718, "loss": 0.6216, "step": 220 }, { "epoch": 1.4743589743589745, "grad_norm": 2.253706693649292, "learning_rate": 0.00017051282051282053, "loss": 0.5126, "step": 230 }, { "epoch": 1.5384615384615383, "grad_norm": 3.4106080532073975, "learning_rate": 0.00016923076923076923, "loss": 0.471, "step": 240 }, { "epoch": 1.6025641025641026, "grad_norm": 1.884901523590088, "learning_rate": 0.00016794871794871796, "loss": 0.4521, "step": 250 }, { "epoch": 1.6666666666666665, "grad_norm": 3.056332588195801, "learning_rate": 0.0001666666666666667, "loss": 0.3808, "step": 260 }, { "epoch": 1.7307692307692308, "grad_norm": 3.2792108058929443, "learning_rate": 0.0001653846153846154, "loss": 0.3568, "step": 270 }, { "epoch": 1.7948717948717947, "grad_norm": 3.657397985458374, "learning_rate": 0.0001641025641025641, "loss": 0.4942, "step": 280 }, { "epoch": 1.858974358974359, "grad_norm": 3.565870523452759, "learning_rate": 0.00016282051282051282, "loss": 0.5117, "step": 290 }, { "epoch": 1.9230769230769231, "grad_norm": 1.8648629188537598, "learning_rate": 0.00016153846153846155, "loss": 0.5324, "step": 300 }, { "epoch": 1.9230769230769231, "eval_accuracy": 0.7769784172661871, "eval_loss": 0.605965256690979, "eval_runtime": 2.4146, "eval_samples_per_second": 115.134, "eval_steps_per_second": 14.495, "step": 300 }, { "epoch": 1.9871794871794872, "grad_norm": 5.665286540985107, "learning_rate": 0.00016025641025641028, "loss": 0.5566, "step": 310 }, { "epoch": 2.051282051282051, "grad_norm": 0.6339452266693115, "learning_rate": 0.00015897435897435896, "loss": 0.3058, "step": 320 }, { "epoch": 2.1153846153846154, "grad_norm": 2.2966978549957275, "learning_rate": 0.0001576923076923077, "loss": 0.2831, "step": 330 }, { "epoch": 2.1794871794871793, "grad_norm": 2.510307550430298, "learning_rate": 0.00015641025641025642, "loss": 0.2055, "step": 340 }, { "epoch": 2.2435897435897436, "grad_norm": 1.1059399843215942, "learning_rate": 0.00015512820512820515, "loss": 0.2886, "step": 350 }, { "epoch": 2.3076923076923075, "grad_norm": 6.055357933044434, "learning_rate": 0.00015384615384615385, "loss": 0.3756, "step": 360 }, { "epoch": 2.371794871794872, "grad_norm": 0.5536957383155823, "learning_rate": 0.00015256410256410255, "loss": 0.412, "step": 370 }, { "epoch": 2.435897435897436, "grad_norm": 5.276978969573975, "learning_rate": 0.00015128205128205128, "loss": 0.1798, "step": 380 }, { "epoch": 2.5, "grad_norm": 2.25166916847229, "learning_rate": 0.00015000000000000001, "loss": 0.1813, "step": 390 }, { "epoch": 2.564102564102564, "grad_norm": 4.955526351928711, "learning_rate": 0.00014871794871794872, "loss": 0.2486, "step": 400 }, { "epoch": 2.564102564102564, "eval_accuracy": 0.7517985611510791, "eval_loss": 0.729444682598114, "eval_runtime": 4.0175, "eval_samples_per_second": 69.198, "eval_steps_per_second": 8.712, "step": 400 }, { "epoch": 2.628205128205128, "grad_norm": 5.6987690925598145, "learning_rate": 0.00014743589743589745, "loss": 0.5142, "step": 410 }, { "epoch": 2.6923076923076925, "grad_norm": 0.589967668056488, "learning_rate": 0.00014615384615384615, "loss": 0.2685, "step": 420 }, { "epoch": 2.7564102564102564, "grad_norm": 2.2702548503875732, "learning_rate": 0.00014487179487179488, "loss": 0.3104, "step": 430 }, { "epoch": 2.8205128205128203, "grad_norm": 4.440503120422363, "learning_rate": 0.0001435897435897436, "loss": 0.2192, "step": 440 }, { "epoch": 2.8846153846153846, "grad_norm": 1.690927267074585, "learning_rate": 0.0001423076923076923, "loss": 0.2557, "step": 450 }, { "epoch": 2.948717948717949, "grad_norm": 9.020477294921875, "learning_rate": 0.00014102564102564104, "loss": 0.3725, "step": 460 }, { "epoch": 3.0128205128205128, "grad_norm": 1.131715178489685, "learning_rate": 0.00013974358974358974, "loss": 0.3586, "step": 470 }, { "epoch": 3.076923076923077, "grad_norm": 2.3979876041412354, "learning_rate": 0.00013846153846153847, "loss": 0.1712, "step": 480 }, { "epoch": 3.141025641025641, "grad_norm": 1.2889968156814575, "learning_rate": 0.00013717948717948718, "loss": 0.1988, "step": 490 }, { "epoch": 3.2051282051282053, "grad_norm": 2.893319606781006, "learning_rate": 0.0001358974358974359, "loss": 0.1366, "step": 500 }, { "epoch": 3.2051282051282053, "eval_accuracy": 0.841726618705036, "eval_loss": 0.4832339882850647, "eval_runtime": 2.588, "eval_samples_per_second": 107.42, "eval_steps_per_second": 13.524, "step": 500 }, { "epoch": 3.269230769230769, "grad_norm": 3.6555581092834473, "learning_rate": 0.00013461538461538464, "loss": 0.1222, "step": 510 }, { "epoch": 3.3333333333333335, "grad_norm": 0.1904444396495819, "learning_rate": 0.00013333333333333334, "loss": 0.1654, "step": 520 }, { "epoch": 3.3974358974358974, "grad_norm": 4.902673244476318, "learning_rate": 0.00013205128205128204, "loss": 0.198, "step": 530 }, { "epoch": 3.4615384615384617, "grad_norm": 0.30183860659599304, "learning_rate": 0.00013076923076923077, "loss": 0.2074, "step": 540 }, { "epoch": 3.5256410256410255, "grad_norm": 4.17673397064209, "learning_rate": 0.0001294871794871795, "loss": 0.1021, "step": 550 }, { "epoch": 3.58974358974359, "grad_norm": 1.6145508289337158, "learning_rate": 0.00012820512820512823, "loss": 0.1074, "step": 560 }, { "epoch": 3.6538461538461537, "grad_norm": 4.717573165893555, "learning_rate": 0.00012692307692307693, "loss": 0.1201, "step": 570 }, { "epoch": 3.717948717948718, "grad_norm": 1.2709864377975464, "learning_rate": 0.00012564102564102564, "loss": 0.0544, "step": 580 }, { "epoch": 3.782051282051282, "grad_norm": 3.7621912956237793, "learning_rate": 0.00012435897435897437, "loss": 0.2016, "step": 590 }, { "epoch": 3.8461538461538463, "grad_norm": 12.426462173461914, "learning_rate": 0.0001230769230769231, "loss": 0.3124, "step": 600 }, { "epoch": 3.8461538461538463, "eval_accuracy": 0.762589928057554, "eval_loss": 0.8676345348358154, "eval_runtime": 3.0816, "eval_samples_per_second": 90.213, "eval_steps_per_second": 11.358, "step": 600 }, { "epoch": 3.91025641025641, "grad_norm": 10.93652057647705, "learning_rate": 0.00012179487179487179, "loss": 0.2992, "step": 610 }, { "epoch": 3.9743589743589745, "grad_norm": 0.6971213221549988, "learning_rate": 0.00012051282051282052, "loss": 0.1864, "step": 620 }, { "epoch": 4.038461538461538, "grad_norm": 6.531364917755127, "learning_rate": 0.00011923076923076923, "loss": 0.1929, "step": 630 }, { "epoch": 4.102564102564102, "grad_norm": 0.8437137007713318, "learning_rate": 0.00011794871794871796, "loss": 0.0816, "step": 640 }, { "epoch": 4.166666666666667, "grad_norm": 9.23108196258545, "learning_rate": 0.00011666666666666668, "loss": 0.0654, "step": 650 }, { "epoch": 4.230769230769231, "grad_norm": 1.1913517713546753, "learning_rate": 0.00011538461538461538, "loss": 0.0974, "step": 660 }, { "epoch": 4.294871794871795, "grad_norm": 8.05540657043457, "learning_rate": 0.0001141025641025641, "loss": 0.0466, "step": 670 }, { "epoch": 4.358974358974359, "grad_norm": 0.1012343019247055, "learning_rate": 0.00011282051282051283, "loss": 0.0641, "step": 680 }, { "epoch": 4.423076923076923, "grad_norm": 7.817044734954834, "learning_rate": 0.00011153846153846154, "loss": 0.1442, "step": 690 }, { "epoch": 4.487179487179487, "grad_norm": 6.788941860198975, "learning_rate": 0.00011025641025641027, "loss": 0.0296, "step": 700 }, { "epoch": 4.487179487179487, "eval_accuracy": 0.8884892086330936, "eval_loss": 0.4233308434486389, "eval_runtime": 2.523, "eval_samples_per_second": 110.184, "eval_steps_per_second": 13.872, "step": 700 }, { "epoch": 4.551282051282051, "grad_norm": 4.860511302947998, "learning_rate": 0.00010897435897435896, "loss": 0.0231, "step": 710 }, { "epoch": 4.615384615384615, "grad_norm": 0.9598804116249084, "learning_rate": 0.0001076923076923077, "loss": 0.0207, "step": 720 }, { "epoch": 4.67948717948718, "grad_norm": 12.745481491088867, "learning_rate": 0.00010641025641025641, "loss": 0.1516, "step": 730 }, { "epoch": 4.743589743589744, "grad_norm": 7.983795166015625, "learning_rate": 0.00010512820512820514, "loss": 0.1638, "step": 740 }, { "epoch": 4.8076923076923075, "grad_norm": 3.051384449005127, "learning_rate": 0.00010384615384615386, "loss": 0.0477, "step": 750 }, { "epoch": 4.871794871794872, "grad_norm": 0.10625698417425156, "learning_rate": 0.00010256410256410256, "loss": 0.0719, "step": 760 }, { "epoch": 4.935897435897436, "grad_norm": 0.04624614119529724, "learning_rate": 0.00010128205128205129, "loss": 0.1069, "step": 770 }, { "epoch": 5.0, "grad_norm": 0.08277003467082977, "learning_rate": 0.0001, "loss": 0.0152, "step": 780 }, { "epoch": 5.064102564102564, "grad_norm": 0.09980784356594086, "learning_rate": 9.871794871794872e-05, "loss": 0.0719, "step": 790 }, { "epoch": 5.128205128205128, "grad_norm": 0.09162779897451401, "learning_rate": 9.743589743589744e-05, "loss": 0.0723, "step": 800 }, { "epoch": 5.128205128205128, "eval_accuracy": 0.8848920863309353, "eval_loss": 0.4469863176345825, "eval_runtime": 2.7699, "eval_samples_per_second": 100.363, "eval_steps_per_second": 12.636, "step": 800 }, { "epoch": 5.1923076923076925, "grad_norm": 0.05171818658709526, "learning_rate": 9.615384615384617e-05, "loss": 0.0192, "step": 810 }, { "epoch": 5.256410256410256, "grad_norm": 0.05209165811538696, "learning_rate": 9.487179487179487e-05, "loss": 0.0394, "step": 820 }, { "epoch": 5.32051282051282, "grad_norm": 0.960054874420166, "learning_rate": 9.35897435897436e-05, "loss": 0.0129, "step": 830 }, { "epoch": 5.384615384615385, "grad_norm": 0.09233374148607254, "learning_rate": 9.230769230769232e-05, "loss": 0.0138, "step": 840 }, { "epoch": 5.448717948717949, "grad_norm": 0.09635169804096222, "learning_rate": 9.102564102564103e-05, "loss": 0.0096, "step": 850 }, { "epoch": 5.512820512820513, "grad_norm": 1.3777004480361938, "learning_rate": 8.974358974358975e-05, "loss": 0.0412, "step": 860 }, { "epoch": 5.576923076923077, "grad_norm": 0.03339802846312523, "learning_rate": 8.846153846153847e-05, "loss": 0.0424, "step": 870 }, { "epoch": 5.641025641025641, "grad_norm": 0.032307617366313934, "learning_rate": 8.717948717948718e-05, "loss": 0.0161, "step": 880 }, { "epoch": 5.705128205128205, "grad_norm": 0.03049471788108349, "learning_rate": 8.58974358974359e-05, "loss": 0.0388, "step": 890 }, { "epoch": 5.769230769230769, "grad_norm": 0.05182625725865364, "learning_rate": 8.461538461538461e-05, "loss": 0.0342, "step": 900 }, { "epoch": 5.769230769230769, "eval_accuracy": 0.9172661870503597, "eval_loss": 0.3406282067298889, "eval_runtime": 2.3863, "eval_samples_per_second": 116.5, "eval_steps_per_second": 14.667, "step": 900 }, { "epoch": 5.833333333333333, "grad_norm": 0.2365674525499344, "learning_rate": 8.333333333333334e-05, "loss": 0.0837, "step": 910 }, { "epoch": 5.897435897435898, "grad_norm": 0.031284429132938385, "learning_rate": 8.205128205128205e-05, "loss": 0.0691, "step": 920 }, { "epoch": 5.961538461538462, "grad_norm": 10.787687301635742, "learning_rate": 8.076923076923078e-05, "loss": 0.0524, "step": 930 }, { "epoch": 6.0256410256410255, "grad_norm": 0.027590090408921242, "learning_rate": 7.948717948717948e-05, "loss": 0.0086, "step": 940 }, { "epoch": 6.089743589743589, "grad_norm": 0.04675084725022316, "learning_rate": 7.820512820512821e-05, "loss": 0.0066, "step": 950 }, { "epoch": 6.153846153846154, "grad_norm": 0.032889507710933685, "learning_rate": 7.692307692307693e-05, "loss": 0.0432, "step": 960 }, { "epoch": 6.217948717948718, "grad_norm": 0.1580750048160553, "learning_rate": 7.564102564102564e-05, "loss": 0.006, "step": 970 }, { "epoch": 6.282051282051282, "grad_norm": 0.024286190047860146, "learning_rate": 7.435897435897436e-05, "loss": 0.0123, "step": 980 }, { "epoch": 6.346153846153846, "grad_norm": 0.02685542032122612, "learning_rate": 7.307692307692307e-05, "loss": 0.0059, "step": 990 }, { "epoch": 6.410256410256411, "grad_norm": 0.9080101251602173, "learning_rate": 7.17948717948718e-05, "loss": 0.0055, "step": 1000 }, { "epoch": 6.410256410256411, "eval_accuracy": 0.920863309352518, "eval_loss": 0.330095499753952, "eval_runtime": 3.1626, "eval_samples_per_second": 87.904, "eval_steps_per_second": 11.067, "step": 1000 }, { "epoch": 6.4743589743589745, "grad_norm": 0.156268909573555, "learning_rate": 7.051282051282052e-05, "loss": 0.0051, "step": 1010 }, { "epoch": 6.538461538461538, "grad_norm": 0.025522593408823013, "learning_rate": 6.923076923076924e-05, "loss": 0.0175, "step": 1020 }, { "epoch": 6.602564102564102, "grad_norm": 0.025892965495586395, "learning_rate": 6.794871794871795e-05, "loss": 0.0133, "step": 1030 }, { "epoch": 6.666666666666667, "grad_norm": 0.02324897050857544, "learning_rate": 6.666666666666667e-05, "loss": 0.0051, "step": 1040 }, { "epoch": 6.730769230769231, "grad_norm": 0.20136423408985138, "learning_rate": 6.538461538461539e-05, "loss": 0.0318, "step": 1050 }, { "epoch": 6.794871794871795, "grad_norm": 0.11247438937425613, "learning_rate": 6.410256410256412e-05, "loss": 0.0331, "step": 1060 }, { "epoch": 6.858974358974359, "grad_norm": 0.10950164496898651, "learning_rate": 6.282051282051282e-05, "loss": 0.0055, "step": 1070 }, { "epoch": 6.923076923076923, "grad_norm": 1.537802815437317, "learning_rate": 6.153846153846155e-05, "loss": 0.0055, "step": 1080 }, { "epoch": 6.987179487179487, "grad_norm": 0.023923929780721664, "learning_rate": 6.025641025641026e-05, "loss": 0.0044, "step": 1090 }, { "epoch": 7.051282051282051, "grad_norm": 0.02083686552941799, "learning_rate": 5.897435897435898e-05, "loss": 0.0048, "step": 1100 }, { "epoch": 7.051282051282051, "eval_accuracy": 0.9172661870503597, "eval_loss": 0.3471122980117798, "eval_runtime": 2.4758, "eval_samples_per_second": 112.287, "eval_steps_per_second": 14.137, "step": 1100 }, { "epoch": 7.115384615384615, "grad_norm": 0.020538046956062317, "learning_rate": 5.769230769230769e-05, "loss": 0.0042, "step": 1110 }, { "epoch": 7.17948717948718, "grad_norm": 0.01733437366783619, "learning_rate": 5.6410256410256414e-05, "loss": 0.0039, "step": 1120 }, { "epoch": 7.243589743589744, "grad_norm": 0.01968984678387642, "learning_rate": 5.512820512820514e-05, "loss": 0.0038, "step": 1130 }, { "epoch": 7.3076923076923075, "grad_norm": 0.019213447347283363, "learning_rate": 5.384615384615385e-05, "loss": 0.0036, "step": 1140 }, { "epoch": 7.371794871794872, "grad_norm": 0.017935629934072495, "learning_rate": 5.256410256410257e-05, "loss": 0.004, "step": 1150 }, { "epoch": 7.435897435897436, "grad_norm": 0.01726532354950905, "learning_rate": 5.128205128205128e-05, "loss": 0.0038, "step": 1160 }, { "epoch": 7.5, "grad_norm": 0.01753012090921402, "learning_rate": 5e-05, "loss": 0.0038, "step": 1170 }, { "epoch": 7.564102564102564, "grad_norm": 0.018105851486325264, "learning_rate": 4.871794871794872e-05, "loss": 0.0036, "step": 1180 }, { "epoch": 7.628205128205128, "grad_norm": 0.019911447539925575, "learning_rate": 4.7435897435897435e-05, "loss": 0.0037, "step": 1190 }, { "epoch": 7.6923076923076925, "grad_norm": 0.023634430021047592, "learning_rate": 4.615384615384616e-05, "loss": 0.0036, "step": 1200 }, { "epoch": 7.6923076923076925, "eval_accuracy": 0.9136690647482014, "eval_loss": 0.33460894227027893, "eval_runtime": 2.4621, "eval_samples_per_second": 112.91, "eval_steps_per_second": 14.215, "step": 1200 }, { "epoch": 7.756410256410256, "grad_norm": 0.017936883494257927, "learning_rate": 4.4871794871794874e-05, "loss": 0.0034, "step": 1210 }, { "epoch": 7.82051282051282, "grad_norm": 0.01885095238685608, "learning_rate": 4.358974358974359e-05, "loss": 0.0034, "step": 1220 }, { "epoch": 7.884615384615385, "grad_norm": 0.017711780965328217, "learning_rate": 4.230769230769231e-05, "loss": 0.0033, "step": 1230 }, { "epoch": 7.948717948717949, "grad_norm": 0.014750463888049126, "learning_rate": 4.1025641025641023e-05, "loss": 0.0034, "step": 1240 }, { "epoch": 8.012820512820513, "grad_norm": 0.014598443172872066, "learning_rate": 3.974358974358974e-05, "loss": 0.0031, "step": 1250 }, { "epoch": 8.076923076923077, "grad_norm": 0.01595359854400158, "learning_rate": 3.846153846153846e-05, "loss": 0.0032, "step": 1260 }, { "epoch": 8.14102564102564, "grad_norm": 0.01710698939859867, "learning_rate": 3.717948717948718e-05, "loss": 0.0032, "step": 1270 }, { "epoch": 8.205128205128204, "grad_norm": 0.015550950542092323, "learning_rate": 3.58974358974359e-05, "loss": 0.0031, "step": 1280 }, { "epoch": 8.26923076923077, "grad_norm": 0.015512553043663502, "learning_rate": 3.461538461538462e-05, "loss": 0.0031, "step": 1290 }, { "epoch": 8.333333333333334, "grad_norm": 0.01687728427350521, "learning_rate": 3.3333333333333335e-05, "loss": 0.003, "step": 1300 }, { "epoch": 8.333333333333334, "eval_accuracy": 0.9136690647482014, "eval_loss": 0.34976544976234436, "eval_runtime": 2.3973, "eval_samples_per_second": 115.962, "eval_steps_per_second": 14.599, "step": 1300 }, { "epoch": 8.397435897435898, "grad_norm": 0.017616627737879753, "learning_rate": 3.205128205128206e-05, "loss": 0.0031, "step": 1310 }, { "epoch": 8.461538461538462, "grad_norm": 0.017452212050557137, "learning_rate": 3.0769230769230774e-05, "loss": 0.0031, "step": 1320 }, { "epoch": 8.525641025641026, "grad_norm": 0.017976053059101105, "learning_rate": 2.948717948717949e-05, "loss": 0.0032, "step": 1330 }, { "epoch": 8.58974358974359, "grad_norm": 0.014091568998992443, "learning_rate": 2.8205128205128207e-05, "loss": 0.0032, "step": 1340 }, { "epoch": 8.653846153846153, "grad_norm": 0.015703728422522545, "learning_rate": 2.6923076923076923e-05, "loss": 0.003, "step": 1350 }, { "epoch": 8.717948717948717, "grad_norm": 0.01781061850488186, "learning_rate": 2.564102564102564e-05, "loss": 0.003, "step": 1360 }, { "epoch": 8.782051282051283, "grad_norm": 0.01647392474114895, "learning_rate": 2.435897435897436e-05, "loss": 0.0031, "step": 1370 }, { "epoch": 8.846153846153847, "grad_norm": 0.013498615473508835, "learning_rate": 2.307692307692308e-05, "loss": 0.0028, "step": 1380 }, { "epoch": 8.91025641025641, "grad_norm": 0.016175739467144012, "learning_rate": 2.1794871794871795e-05, "loss": 0.003, "step": 1390 }, { "epoch": 8.974358974358974, "grad_norm": 0.015950603410601616, "learning_rate": 2.0512820512820512e-05, "loss": 0.003, "step": 1400 }, { "epoch": 8.974358974358974, "eval_accuracy": 0.9100719424460432, "eval_loss": 0.354926198720932, "eval_runtime": 2.4324, "eval_samples_per_second": 114.29, "eval_steps_per_second": 14.389, "step": 1400 }, { "epoch": 9.038461538461538, "grad_norm": 0.014547958970069885, "learning_rate": 1.923076923076923e-05, "loss": 0.0029, "step": 1410 }, { "epoch": 9.102564102564102, "grad_norm": 0.012941875495016575, "learning_rate": 1.794871794871795e-05, "loss": 0.0028, "step": 1420 }, { "epoch": 9.166666666666666, "grad_norm": 0.016635097563266754, "learning_rate": 1.6666666666666667e-05, "loss": 0.003, "step": 1430 }, { "epoch": 9.23076923076923, "grad_norm": 0.018657604232430458, "learning_rate": 1.5384615384615387e-05, "loss": 0.0029, "step": 1440 }, { "epoch": 9.294871794871796, "grad_norm": 0.015006115660071373, "learning_rate": 1.4102564102564104e-05, "loss": 0.0031, "step": 1450 }, { "epoch": 9.35897435897436, "grad_norm": 0.01575641520321369, "learning_rate": 1.282051282051282e-05, "loss": 0.0027, "step": 1460 }, { "epoch": 9.423076923076923, "grad_norm": 0.013228046707808971, "learning_rate": 1.153846153846154e-05, "loss": 0.0027, "step": 1470 }, { "epoch": 9.487179487179487, "grad_norm": 0.013002932071685791, "learning_rate": 1.0256410256410256e-05, "loss": 0.0027, "step": 1480 }, { "epoch": 9.551282051282051, "grad_norm": 0.014644928276538849, "learning_rate": 8.974358974358976e-06, "loss": 0.0029, "step": 1490 }, { "epoch": 9.615384615384615, "grad_norm": 0.01448275987058878, "learning_rate": 7.692307692307694e-06, "loss": 0.0027, "step": 1500 }, { "epoch": 9.615384615384615, "eval_accuracy": 0.9136690647482014, "eval_loss": 0.3569168150424957, "eval_runtime": 2.7846, "eval_samples_per_second": 99.835, "eval_steps_per_second": 12.569, "step": 1500 }, { "epoch": 9.679487179487179, "grad_norm": 0.012788016349077225, "learning_rate": 6.41025641025641e-06, "loss": 0.0028, "step": 1510 }, { "epoch": 9.743589743589745, "grad_norm": 0.014576306566596031, "learning_rate": 5.128205128205128e-06, "loss": 0.0026, "step": 1520 }, { "epoch": 9.807692307692308, "grad_norm": 0.014699741266667843, "learning_rate": 3.846153846153847e-06, "loss": 0.0029, "step": 1530 }, { "epoch": 9.871794871794872, "grad_norm": 0.015379097312688828, "learning_rate": 2.564102564102564e-06, "loss": 0.0026, "step": 1540 }, { "epoch": 9.935897435897436, "grad_norm": 0.012021095491945744, "learning_rate": 1.282051282051282e-06, "loss": 0.0027, "step": 1550 }, { "epoch": 10.0, "grad_norm": 0.014515766873955727, "learning_rate": 0.0, "loss": 0.0028, "step": 1560 }, { "epoch": 10.0, "step": 1560, "total_flos": 1.9334597982400512e+18, "train_loss": 0.20911514231314263, "train_runtime": 603.1201, "train_samples_per_second": 41.368, "train_steps_per_second": 2.587 } ], "logging_steps": 10, "max_steps": 1560, "num_input_tokens_seen": 0, "num_train_epochs": 10, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.9334597982400512e+18, "train_batch_size": 16, "trial_name": null, "trial_params": null }