{ "best_metric": 0.9779411764705882, "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-papsmear\\checkpoint-2448", "epoch": 99.34640522875817, "eval_steps": 500, "global_step": 3800, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.26143790849673204, "grad_norm": 19.404264450073242, "learning_rate": 1.3157894736842106e-06, "loss": 1.8243, "step": 10 }, { "epoch": 0.5228758169934641, "grad_norm": 9.874568939208984, "learning_rate": 2.631578947368421e-06, "loss": 1.7542, "step": 20 }, { "epoch": 0.7843137254901961, "grad_norm": 13.61699390411377, "learning_rate": 3.9473684210526315e-06, "loss": 1.7081, "step": 30 }, { "epoch": 0.9934640522875817, "eval_accuracy": 0.2867647058823529, "eval_loss": 1.6642274856567383, "eval_runtime": 19.1091, "eval_samples_per_second": 7.117, "eval_steps_per_second": 0.89, "step": 38 }, { "epoch": 1.0457516339869282, "grad_norm": 17.95810317993164, "learning_rate": 5.263157894736842e-06, "loss": 1.6316, "step": 40 }, { "epoch": 1.3071895424836601, "grad_norm": 11.760519027709961, "learning_rate": 6.578947368421053e-06, "loss": 1.6191, "step": 50 }, { "epoch": 1.5686274509803921, "grad_norm": 12.139671325683594, "learning_rate": 7.894736842105263e-06, "loss": 1.514, "step": 60 }, { "epoch": 1.8300653594771243, "grad_norm": 11.897443771362305, "learning_rate": 9.210526315789474e-06, "loss": 1.4025, "step": 70 }, { "epoch": 1.9869281045751634, "eval_accuracy": 0.4632352941176471, "eval_loss": 1.3760590553283691, "eval_runtime": 16.8545, "eval_samples_per_second": 8.069, "eval_steps_per_second": 1.009, "step": 76 }, { "epoch": 2.0915032679738563, "grad_norm": 14.211647987365723, "learning_rate": 1.0526315789473684e-05, "loss": 1.341, "step": 80 }, { "epoch": 2.3529411764705883, "grad_norm": 21.328588485717773, "learning_rate": 1.1842105263157895e-05, "loss": 1.2617, "step": 90 }, { "epoch": 2.6143790849673203, "grad_norm": 24.131996154785156, "learning_rate": 1.3157894736842106e-05, "loss": 1.1608, "step": 100 }, { "epoch": 2.8758169934640523, "grad_norm": 23.461227416992188, "learning_rate": 1.4473684210526317e-05, "loss": 1.0918, "step": 110 }, { "epoch": 2.980392156862745, "eval_accuracy": 0.5514705882352942, "eval_loss": 1.0276451110839844, "eval_runtime": 17.5433, "eval_samples_per_second": 7.752, "eval_steps_per_second": 0.969, "step": 114 }, { "epoch": 3.1372549019607843, "grad_norm": 44.0300407409668, "learning_rate": 1.5789473684210526e-05, "loss": 0.9044, "step": 120 }, { "epoch": 3.3986928104575163, "grad_norm": 23.61319923400879, "learning_rate": 1.7105263157894737e-05, "loss": 0.9409, "step": 130 }, { "epoch": 3.6601307189542482, "grad_norm": 27.572128295898438, "learning_rate": 1.8421052631578947e-05, "loss": 0.9152, "step": 140 }, { "epoch": 3.9215686274509802, "grad_norm": 20.785051345825195, "learning_rate": 1.9736842105263158e-05, "loss": 0.8051, "step": 150 }, { "epoch": 4.0, "eval_accuracy": 0.6691176470588235, "eval_loss": 0.7678546905517578, "eval_runtime": 17.2269, "eval_samples_per_second": 7.895, "eval_steps_per_second": 0.987, "step": 153 }, { "epoch": 4.183006535947713, "grad_norm": 32.00216293334961, "learning_rate": 2.105263157894737e-05, "loss": 0.7821, "step": 160 }, { "epoch": 4.444444444444445, "grad_norm": 23.564285278320312, "learning_rate": 2.236842105263158e-05, "loss": 0.8036, "step": 170 }, { "epoch": 4.705882352941177, "grad_norm": 21.403562545776367, "learning_rate": 2.368421052631579e-05, "loss": 0.7355, "step": 180 }, { "epoch": 4.967320261437909, "grad_norm": 31.243640899658203, "learning_rate": 2.5e-05, "loss": 0.635, "step": 190 }, { "epoch": 4.993464052287582, "eval_accuracy": 0.7867647058823529, "eval_loss": 0.5927847623825073, "eval_runtime": 17.4003, "eval_samples_per_second": 7.816, "eval_steps_per_second": 0.977, "step": 191 }, { "epoch": 5.228758169934641, "grad_norm": 23.90205192565918, "learning_rate": 2.6315789473684212e-05, "loss": 0.6363, "step": 200 }, { "epoch": 5.490196078431373, "grad_norm": 23.38309669494629, "learning_rate": 2.7631578947368426e-05, "loss": 0.6285, "step": 210 }, { "epoch": 5.751633986928105, "grad_norm": 41.387149810791016, "learning_rate": 2.8947368421052634e-05, "loss": 0.6051, "step": 220 }, { "epoch": 5.9869281045751634, "eval_accuracy": 0.75, "eval_loss": 0.695731520652771, "eval_runtime": 17.5363, "eval_samples_per_second": 7.755, "eval_steps_per_second": 0.969, "step": 229 }, { "epoch": 6.0130718954248366, "grad_norm": 33.84821319580078, "learning_rate": 3.0263157894736844e-05, "loss": 0.6503, "step": 230 }, { "epoch": 6.2745098039215685, "grad_norm": 18.2890682220459, "learning_rate": 3.157894736842105e-05, "loss": 0.4905, "step": 240 }, { "epoch": 6.5359477124183005, "grad_norm": 25.626060485839844, "learning_rate": 3.289473684210527e-05, "loss": 0.5262, "step": 250 }, { "epoch": 6.7973856209150325, "grad_norm": 28.431270599365234, "learning_rate": 3.421052631578947e-05, "loss": 0.5539, "step": 260 }, { "epoch": 6.980392156862745, "eval_accuracy": 0.7941176470588235, "eval_loss": 0.5016477108001709, "eval_runtime": 17.3512, "eval_samples_per_second": 7.838, "eval_steps_per_second": 0.98, "step": 267 }, { "epoch": 7.0588235294117645, "grad_norm": 21.074764251708984, "learning_rate": 3.5526315789473684e-05, "loss": 0.4807, "step": 270 }, { "epoch": 7.3202614379084965, "grad_norm": 21.632251739501953, "learning_rate": 3.6842105263157895e-05, "loss": 0.4704, "step": 280 }, { "epoch": 7.5816993464052285, "grad_norm": 41.86575698852539, "learning_rate": 3.815789473684211e-05, "loss": 0.5141, "step": 290 }, { "epoch": 7.8431372549019605, "grad_norm": 20.23293685913086, "learning_rate": 3.9473684210526316e-05, "loss": 0.4683, "step": 300 }, { "epoch": 8.0, "eval_accuracy": 0.8235294117647058, "eval_loss": 0.4732811748981476, "eval_runtime": 17.0473, "eval_samples_per_second": 7.978, "eval_steps_per_second": 0.997, "step": 306 }, { "epoch": 8.104575163398692, "grad_norm": 67.42210388183594, "learning_rate": 4.078947368421053e-05, "loss": 0.451, "step": 310 }, { "epoch": 8.366013071895425, "grad_norm": 22.807098388671875, "learning_rate": 4.210526315789474e-05, "loss": 0.4019, "step": 320 }, { "epoch": 8.627450980392156, "grad_norm": 31.961091995239258, "learning_rate": 4.342105263157895e-05, "loss": 0.4663, "step": 330 }, { "epoch": 8.88888888888889, "grad_norm": 26.965513229370117, "learning_rate": 4.473684210526316e-05, "loss": 0.4153, "step": 340 }, { "epoch": 8.993464052287582, "eval_accuracy": 0.8529411764705882, "eval_loss": 0.4834950268268585, "eval_runtime": 16.944, "eval_samples_per_second": 8.026, "eval_steps_per_second": 1.003, "step": 344 }, { "epoch": 9.15032679738562, "grad_norm": 21.733226776123047, "learning_rate": 4.605263157894737e-05, "loss": 0.473, "step": 350 }, { "epoch": 9.411764705882353, "grad_norm": 17.1552734375, "learning_rate": 4.736842105263158e-05, "loss": 0.3912, "step": 360 }, { "epoch": 9.673202614379084, "grad_norm": 39.66945266723633, "learning_rate": 4.868421052631579e-05, "loss": 0.465, "step": 370 }, { "epoch": 9.934640522875817, "grad_norm": 24.060779571533203, "learning_rate": 5e-05, "loss": 0.3954, "step": 380 }, { "epoch": 9.986928104575163, "eval_accuracy": 0.8308823529411765, "eval_loss": 0.5431119203567505, "eval_runtime": 16.9702, "eval_samples_per_second": 8.014, "eval_steps_per_second": 1.002, "step": 382 }, { "epoch": 10.196078431372548, "grad_norm": 22.754186630249023, "learning_rate": 4.985380116959065e-05, "loss": 0.309, "step": 390 }, { "epoch": 10.457516339869281, "grad_norm": 25.09243392944336, "learning_rate": 4.970760233918128e-05, "loss": 0.2985, "step": 400 }, { "epoch": 10.718954248366012, "grad_norm": 32.95780563354492, "learning_rate": 4.956140350877193e-05, "loss": 0.3551, "step": 410 }, { "epoch": 10.980392156862745, "grad_norm": 24.594146728515625, "learning_rate": 4.941520467836258e-05, "loss": 0.3524, "step": 420 }, { "epoch": 10.980392156862745, "eval_accuracy": 0.8235294117647058, "eval_loss": 0.4060741364955902, "eval_runtime": 16.9787, "eval_samples_per_second": 8.01, "eval_steps_per_second": 1.001, "step": 420 }, { "epoch": 11.241830065359476, "grad_norm": 34.58118438720703, "learning_rate": 4.926900584795322e-05, "loss": 0.3015, "step": 430 }, { "epoch": 11.50326797385621, "grad_norm": 17.467493057250977, "learning_rate": 4.912280701754386e-05, "loss": 0.332, "step": 440 }, { "epoch": 11.764705882352942, "grad_norm": 11.450825691223145, "learning_rate": 4.8976608187134504e-05, "loss": 0.3546, "step": 450 }, { "epoch": 12.0, "eval_accuracy": 0.8382352941176471, "eval_loss": 0.4924784302711487, "eval_runtime": 17.0509, "eval_samples_per_second": 7.976, "eval_steps_per_second": 0.997, "step": 459 }, { "epoch": 12.026143790849673, "grad_norm": 22.95159912109375, "learning_rate": 4.883040935672515e-05, "loss": 0.3362, "step": 460 }, { "epoch": 12.287581699346406, "grad_norm": 15.78369140625, "learning_rate": 4.868421052631579e-05, "loss": 0.2589, "step": 470 }, { "epoch": 12.549019607843137, "grad_norm": 18.571977615356445, "learning_rate": 4.853801169590643e-05, "loss": 0.2588, "step": 480 }, { "epoch": 12.81045751633987, "grad_norm": 10.237850189208984, "learning_rate": 4.839181286549708e-05, "loss": 0.2922, "step": 490 }, { "epoch": 12.993464052287582, "eval_accuracy": 0.875, "eval_loss": 0.36371880769729614, "eval_runtime": 16.7827, "eval_samples_per_second": 8.104, "eval_steps_per_second": 1.013, "step": 497 }, { "epoch": 13.071895424836601, "grad_norm": 14.183631896972656, "learning_rate": 4.824561403508772e-05, "loss": 0.2683, "step": 500 }, { "epoch": 13.333333333333334, "grad_norm": 15.362314224243164, "learning_rate": 4.8099415204678366e-05, "loss": 0.2178, "step": 510 }, { "epoch": 13.594771241830065, "grad_norm": 31.49340057373047, "learning_rate": 4.7953216374269006e-05, "loss": 0.2095, "step": 520 }, { "epoch": 13.856209150326798, "grad_norm": 39.85598373413086, "learning_rate": 4.780701754385965e-05, "loss": 0.2342, "step": 530 }, { "epoch": 13.986928104575163, "eval_accuracy": 0.8970588235294118, "eval_loss": 0.32859814167022705, "eval_runtime": 16.8467, "eval_samples_per_second": 8.073, "eval_steps_per_second": 1.009, "step": 535 }, { "epoch": 14.117647058823529, "grad_norm": 22.395517349243164, "learning_rate": 4.7660818713450294e-05, "loss": 0.2927, "step": 540 }, { "epoch": 14.379084967320262, "grad_norm": 15.716471672058105, "learning_rate": 4.751461988304094e-05, "loss": 0.2419, "step": 550 }, { "epoch": 14.640522875816993, "grad_norm": 13.827138900756836, "learning_rate": 4.736842105263158e-05, "loss": 0.2215, "step": 560 }, { "epoch": 14.901960784313726, "grad_norm": 8.343385696411133, "learning_rate": 4.722222222222222e-05, "loss": 0.2083, "step": 570 }, { "epoch": 14.980392156862745, "eval_accuracy": 0.8823529411764706, "eval_loss": 0.327125608921051, "eval_runtime": 17.1905, "eval_samples_per_second": 7.911, "eval_steps_per_second": 0.989, "step": 573 }, { "epoch": 15.163398692810457, "grad_norm": 27.369592666625977, "learning_rate": 4.707602339181287e-05, "loss": 0.1837, "step": 580 }, { "epoch": 15.42483660130719, "grad_norm": 4.707042217254639, "learning_rate": 4.6929824561403515e-05, "loss": 0.1872, "step": 590 }, { "epoch": 15.686274509803921, "grad_norm": 19.026412963867188, "learning_rate": 4.678362573099415e-05, "loss": 0.2063, "step": 600 }, { "epoch": 15.947712418300654, "grad_norm": 39.22539138793945, "learning_rate": 4.6637426900584796e-05, "loss": 0.2704, "step": 610 }, { "epoch": 16.0, "eval_accuracy": 0.8823529411764706, "eval_loss": 0.3700261414051056, "eval_runtime": 17.2498, "eval_samples_per_second": 7.884, "eval_steps_per_second": 0.986, "step": 612 }, { "epoch": 16.209150326797385, "grad_norm": 4.610194683074951, "learning_rate": 4.649122807017544e-05, "loss": 0.1895, "step": 620 }, { "epoch": 16.470588235294116, "grad_norm": 27.570838928222656, "learning_rate": 4.634502923976608e-05, "loss": 0.1492, "step": 630 }, { "epoch": 16.73202614379085, "grad_norm": 13.742429733276367, "learning_rate": 4.619883040935672e-05, "loss": 0.1698, "step": 640 }, { "epoch": 16.99346405228758, "grad_norm": 16.786169052124023, "learning_rate": 4.605263157894737e-05, "loss": 0.1871, "step": 650 }, { "epoch": 16.99346405228758, "eval_accuracy": 0.8970588235294118, "eval_loss": 0.34471678733825684, "eval_runtime": 16.7473, "eval_samples_per_second": 8.121, "eval_steps_per_second": 1.015, "step": 650 }, { "epoch": 17.254901960784313, "grad_norm": 15.884855270385742, "learning_rate": 4.590643274853802e-05, "loss": 0.1335, "step": 660 }, { "epoch": 17.516339869281047, "grad_norm": 17.3248348236084, "learning_rate": 4.576023391812866e-05, "loss": 0.1399, "step": 670 }, { "epoch": 17.77777777777778, "grad_norm": 16.090543746948242, "learning_rate": 4.56140350877193e-05, "loss": 0.226, "step": 680 }, { "epoch": 17.986928104575163, "eval_accuracy": 0.8602941176470589, "eval_loss": 0.4279506206512451, "eval_runtime": 16.8179, "eval_samples_per_second": 8.087, "eval_steps_per_second": 1.011, "step": 688 }, { "epoch": 18.03921568627451, "grad_norm": 17.314950942993164, "learning_rate": 4.5467836257309945e-05, "loss": 0.2657, "step": 690 }, { "epoch": 18.30065359477124, "grad_norm": 26.111413955688477, "learning_rate": 4.5321637426900585e-05, "loss": 0.1238, "step": 700 }, { "epoch": 18.562091503267975, "grad_norm": 34.5568962097168, "learning_rate": 4.517543859649123e-05, "loss": 0.3426, "step": 710 }, { "epoch": 18.823529411764707, "grad_norm": 27.506118774414062, "learning_rate": 4.502923976608187e-05, "loss": 0.245, "step": 720 }, { "epoch": 18.980392156862745, "eval_accuracy": 0.8088235294117647, "eval_loss": 0.6445416212081909, "eval_runtime": 16.6042, "eval_samples_per_second": 8.191, "eval_steps_per_second": 1.024, "step": 726 }, { "epoch": 19.084967320261438, "grad_norm": 8.742308616638184, "learning_rate": 4.488304093567251e-05, "loss": 0.1876, "step": 730 }, { "epoch": 19.34640522875817, "grad_norm": 37.74170684814453, "learning_rate": 4.473684210526316e-05, "loss": 0.1044, "step": 740 }, { "epoch": 19.607843137254903, "grad_norm": 17.85502815246582, "learning_rate": 4.4590643274853806e-05, "loss": 0.1637, "step": 750 }, { "epoch": 19.869281045751634, "grad_norm": 13.413275718688965, "learning_rate": 4.4444444444444447e-05, "loss": 0.1545, "step": 760 }, { "epoch": 20.0, "eval_accuracy": 0.8602941176470589, "eval_loss": 0.41802164912223816, "eval_runtime": 16.9375, "eval_samples_per_second": 8.03, "eval_steps_per_second": 1.004, "step": 765 }, { "epoch": 20.130718954248366, "grad_norm": 24.223968505859375, "learning_rate": 4.429824561403509e-05, "loss": 0.1333, "step": 770 }, { "epoch": 20.392156862745097, "grad_norm": 22.863794326782227, "learning_rate": 4.4152046783625734e-05, "loss": 0.1223, "step": 780 }, { "epoch": 20.65359477124183, "grad_norm": 20.22460174560547, "learning_rate": 4.400584795321638e-05, "loss": 0.1906, "step": 790 }, { "epoch": 20.915032679738562, "grad_norm": 6.557627201080322, "learning_rate": 4.3859649122807014e-05, "loss": 0.0981, "step": 800 }, { "epoch": 20.99346405228758, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.32080766558647156, "eval_runtime": 17.4044, "eval_samples_per_second": 7.814, "eval_steps_per_second": 0.977, "step": 803 }, { "epoch": 21.176470588235293, "grad_norm": 11.885444641113281, "learning_rate": 4.371345029239766e-05, "loss": 0.1654, "step": 810 }, { "epoch": 21.437908496732025, "grad_norm": 16.748071670532227, "learning_rate": 4.356725146198831e-05, "loss": 0.1706, "step": 820 }, { "epoch": 21.69934640522876, "grad_norm": 25.410442352294922, "learning_rate": 4.342105263157895e-05, "loss": 0.1121, "step": 830 }, { "epoch": 21.96078431372549, "grad_norm": 24.631742477416992, "learning_rate": 4.327485380116959e-05, "loss": 0.1455, "step": 840 }, { "epoch": 21.986928104575163, "eval_accuracy": 0.8602941176470589, "eval_loss": 0.425643652677536, "eval_runtime": 20.0595, "eval_samples_per_second": 6.78, "eval_steps_per_second": 0.847, "step": 841 }, { "epoch": 22.22222222222222, "grad_norm": 9.926827430725098, "learning_rate": 4.3128654970760236e-05, "loss": 0.144, "step": 850 }, { "epoch": 22.483660130718953, "grad_norm": 32.22057342529297, "learning_rate": 4.298245614035088e-05, "loss": 0.1328, "step": 860 }, { "epoch": 22.745098039215687, "grad_norm": 6.770218849182129, "learning_rate": 4.283625730994152e-05, "loss": 0.2405, "step": 870 }, { "epoch": 22.980392156862745, "eval_accuracy": 0.8970588235294118, "eval_loss": 0.34735360741615295, "eval_runtime": 36.4621, "eval_samples_per_second": 3.73, "eval_steps_per_second": 0.466, "step": 879 }, { "epoch": 23.00653594771242, "grad_norm": 18.301342010498047, "learning_rate": 4.269005847953216e-05, "loss": 0.1407, "step": 880 }, { "epoch": 23.26797385620915, "grad_norm": 25.70302963256836, "learning_rate": 4.254385964912281e-05, "loss": 0.1403, "step": 890 }, { "epoch": 23.529411764705884, "grad_norm": 6.829775333404541, "learning_rate": 4.239766081871345e-05, "loss": 0.1278, "step": 900 }, { "epoch": 23.790849673202615, "grad_norm": 15.183685302734375, "learning_rate": 4.22514619883041e-05, "loss": 0.1549, "step": 910 }, { "epoch": 24.0, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.39403286576271057, "eval_runtime": 30.2513, "eval_samples_per_second": 4.496, "eval_steps_per_second": 0.562, "step": 918 }, { "epoch": 24.052287581699346, "grad_norm": 76.56197357177734, "learning_rate": 4.210526315789474e-05, "loss": 0.2019, "step": 920 }, { "epoch": 24.313725490196077, "grad_norm": 10.338065147399902, "learning_rate": 4.195906432748538e-05, "loss": 0.1341, "step": 930 }, { "epoch": 24.575163398692812, "grad_norm": 10.710972785949707, "learning_rate": 4.1812865497076025e-05, "loss": 0.1207, "step": 940 }, { "epoch": 24.836601307189543, "grad_norm": 19.086135864257812, "learning_rate": 4.166666666666667e-05, "loss": 0.1721, "step": 950 }, { "epoch": 24.99346405228758, "eval_accuracy": 0.8823529411764706, "eval_loss": 0.4279385805130005, "eval_runtime": 29.9969, "eval_samples_per_second": 4.534, "eval_steps_per_second": 0.567, "step": 956 }, { "epoch": 25.098039215686274, "grad_norm": 6.991425514221191, "learning_rate": 4.152046783625731e-05, "loss": 0.0729, "step": 960 }, { "epoch": 25.359477124183005, "grad_norm": 8.979483604431152, "learning_rate": 4.137426900584795e-05, "loss": 0.1826, "step": 970 }, { "epoch": 25.62091503267974, "grad_norm": 11.570904731750488, "learning_rate": 4.12280701754386e-05, "loss": 0.1492, "step": 980 }, { "epoch": 25.88235294117647, "grad_norm": 14.8778076171875, "learning_rate": 4.1081871345029247e-05, "loss": 0.1378, "step": 990 }, { "epoch": 25.986928104575163, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.387086421251297, "eval_runtime": 29.0075, "eval_samples_per_second": 4.688, "eval_steps_per_second": 0.586, "step": 994 }, { "epoch": 26.143790849673202, "grad_norm": 11.985469818115234, "learning_rate": 4.093567251461988e-05, "loss": 0.1122, "step": 1000 }, { "epoch": 26.405228758169933, "grad_norm": 22.02225685119629, "learning_rate": 4.078947368421053e-05, "loss": 0.1172, "step": 1010 }, { "epoch": 26.666666666666668, "grad_norm": 1.2671743631362915, "learning_rate": 4.0643274853801174e-05, "loss": 0.0891, "step": 1020 }, { "epoch": 26.9281045751634, "grad_norm": 10.896835327148438, "learning_rate": 4.0497076023391814e-05, "loss": 0.0924, "step": 1030 }, { "epoch": 26.980392156862745, "eval_accuracy": 0.8455882352941176, "eval_loss": 0.7301138639450073, "eval_runtime": 28.9067, "eval_samples_per_second": 4.705, "eval_steps_per_second": 0.588, "step": 1032 }, { "epoch": 27.18954248366013, "grad_norm": 7.8527960777282715, "learning_rate": 4.0350877192982455e-05, "loss": 0.1348, "step": 1040 }, { "epoch": 27.45098039215686, "grad_norm": 2.1555140018463135, "learning_rate": 4.02046783625731e-05, "loss": 0.0675, "step": 1050 }, { "epoch": 27.712418300653596, "grad_norm": 7.751283645629883, "learning_rate": 4.005847953216375e-05, "loss": 0.0916, "step": 1060 }, { "epoch": 27.973856209150327, "grad_norm": 33.804786682128906, "learning_rate": 3.991228070175439e-05, "loss": 0.1325, "step": 1070 }, { "epoch": 28.0, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.3712061643600464, "eval_runtime": 28.0451, "eval_samples_per_second": 4.849, "eval_steps_per_second": 0.606, "step": 1071 }, { "epoch": 28.235294117647058, "grad_norm": 7.706085205078125, "learning_rate": 3.976608187134503e-05, "loss": 0.0879, "step": 1080 }, { "epoch": 28.49673202614379, "grad_norm": 4.338534355163574, "learning_rate": 3.9619883040935676e-05, "loss": 0.1017, "step": 1090 }, { "epoch": 28.758169934640524, "grad_norm": 9.544697761535645, "learning_rate": 3.9473684210526316e-05, "loss": 0.1426, "step": 1100 }, { "epoch": 28.99346405228758, "eval_accuracy": 0.8602941176470589, "eval_loss": 0.440034419298172, "eval_runtime": 30.1321, "eval_samples_per_second": 4.513, "eval_steps_per_second": 0.564, "step": 1109 }, { "epoch": 29.019607843137255, "grad_norm": 0.3841346502304077, "learning_rate": 3.932748538011696e-05, "loss": 0.0981, "step": 1110 }, { "epoch": 29.281045751633986, "grad_norm": 9.533553123474121, "learning_rate": 3.9181286549707604e-05, "loss": 0.0926, "step": 1120 }, { "epoch": 29.54248366013072, "grad_norm": 26.160850524902344, "learning_rate": 3.9035087719298244e-05, "loss": 0.083, "step": 1130 }, { "epoch": 29.80392156862745, "grad_norm": 18.309621810913086, "learning_rate": 3.888888888888889e-05, "loss": 0.0866, "step": 1140 }, { "epoch": 29.986928104575163, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.27793076634407043, "eval_runtime": 29.3246, "eval_samples_per_second": 4.638, "eval_steps_per_second": 0.58, "step": 1147 }, { "epoch": 30.065359477124183, "grad_norm": 24.974849700927734, "learning_rate": 3.874269005847954e-05, "loss": 0.11, "step": 1150 }, { "epoch": 30.326797385620914, "grad_norm": 3.7421281337738037, "learning_rate": 3.859649122807018e-05, "loss": 0.0712, "step": 1160 }, { "epoch": 30.58823529411765, "grad_norm": 10.041555404663086, "learning_rate": 3.845029239766082e-05, "loss": 0.0702, "step": 1170 }, { "epoch": 30.84967320261438, "grad_norm": 37.238948822021484, "learning_rate": 3.8304093567251465e-05, "loss": 0.0659, "step": 1180 }, { "epoch": 30.980392156862745, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.3207360804080963, "eval_runtime": 34.3274, "eval_samples_per_second": 3.962, "eval_steps_per_second": 0.495, "step": 1185 }, { "epoch": 31.11111111111111, "grad_norm": 13.073234558105469, "learning_rate": 3.815789473684211e-05, "loss": 0.0547, "step": 1190 }, { "epoch": 31.372549019607842, "grad_norm": 3.1763381958007812, "learning_rate": 3.8011695906432746e-05, "loss": 0.0727, "step": 1200 }, { "epoch": 31.633986928104576, "grad_norm": 1.5747133493423462, "learning_rate": 3.786549707602339e-05, "loss": 0.1023, "step": 1210 }, { "epoch": 31.895424836601308, "grad_norm": 12.335155487060547, "learning_rate": 3.771929824561404e-05, "loss": 0.1175, "step": 1220 }, { "epoch": 32.0, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.43389689922332764, "eval_runtime": 32.183, "eval_samples_per_second": 4.226, "eval_steps_per_second": 0.528, "step": 1224 }, { "epoch": 32.15686274509804, "grad_norm": 2.676323413848877, "learning_rate": 3.757309941520468e-05, "loss": 0.129, "step": 1230 }, { "epoch": 32.41830065359477, "grad_norm": 0.5916957259178162, "learning_rate": 3.742690058479532e-05, "loss": 0.0585, "step": 1240 }, { "epoch": 32.6797385620915, "grad_norm": 11.02872085571289, "learning_rate": 3.728070175438597e-05, "loss": 0.045, "step": 1250 }, { "epoch": 32.94117647058823, "grad_norm": 44.40802001953125, "learning_rate": 3.713450292397661e-05, "loss": 0.0455, "step": 1260 }, { "epoch": 32.99346405228758, "eval_accuracy": 0.9264705882352942, "eval_loss": 0.4536753296852112, "eval_runtime": 32.0477, "eval_samples_per_second": 4.244, "eval_steps_per_second": 0.53, "step": 1262 }, { "epoch": 33.20261437908497, "grad_norm": 0.4168817400932312, "learning_rate": 3.6988304093567254e-05, "loss": 0.0625, "step": 1270 }, { "epoch": 33.4640522875817, "grad_norm": 7.689728260040283, "learning_rate": 3.6842105263157895e-05, "loss": 0.1613, "step": 1280 }, { "epoch": 33.72549019607843, "grad_norm": 9.364749908447266, "learning_rate": 3.669590643274854e-05, "loss": 0.1001, "step": 1290 }, { "epoch": 33.98692810457516, "grad_norm": 14.09304428100586, "learning_rate": 3.654970760233918e-05, "loss": 0.1006, "step": 1300 }, { "epoch": 33.98692810457516, "eval_accuracy": 0.875, "eval_loss": 0.6521199345588684, "eval_runtime": 33.7228, "eval_samples_per_second": 4.033, "eval_steps_per_second": 0.504, "step": 1300 }, { "epoch": 34.248366013071895, "grad_norm": 14.115684509277344, "learning_rate": 3.640350877192983e-05, "loss": 0.1592, "step": 1310 }, { "epoch": 34.509803921568626, "grad_norm": 2.2361948490142822, "learning_rate": 3.625730994152047e-05, "loss": 0.0785, "step": 1320 }, { "epoch": 34.77124183006536, "grad_norm": 15.101175308227539, "learning_rate": 3.611111111111111e-05, "loss": 0.033, "step": 1330 }, { "epoch": 34.98039215686274, "eval_accuracy": 0.9044117647058824, "eval_loss": 0.5615760087966919, "eval_runtime": 20.5904, "eval_samples_per_second": 6.605, "eval_steps_per_second": 0.826, "step": 1338 }, { "epoch": 35.032679738562095, "grad_norm": 74.07561492919922, "learning_rate": 3.5964912280701756e-05, "loss": 0.1336, "step": 1340 }, { "epoch": 35.294117647058826, "grad_norm": 40.868961334228516, "learning_rate": 3.5818713450292403e-05, "loss": 0.1209, "step": 1350 }, { "epoch": 35.55555555555556, "grad_norm": 11.251754760742188, "learning_rate": 3.5672514619883044e-05, "loss": 0.0658, "step": 1360 }, { "epoch": 35.81699346405229, "grad_norm": 20.791095733642578, "learning_rate": 3.5526315789473684e-05, "loss": 0.0979, "step": 1370 }, { "epoch": 36.0, "eval_accuracy": 0.9191176470588235, "eval_loss": 0.3717995882034302, "eval_runtime": 21.531, "eval_samples_per_second": 6.316, "eval_steps_per_second": 0.79, "step": 1377 }, { "epoch": 36.07843137254902, "grad_norm": 13.336127281188965, "learning_rate": 3.538011695906433e-05, "loss": 0.0712, "step": 1380 }, { "epoch": 36.33986928104575, "grad_norm": 7.379011154174805, "learning_rate": 3.523391812865498e-05, "loss": 0.0826, "step": 1390 }, { "epoch": 36.60130718954248, "grad_norm": 1.9048967361450195, "learning_rate": 3.508771929824561e-05, "loss": 0.0791, "step": 1400 }, { "epoch": 36.86274509803921, "grad_norm": 32.38518142700195, "learning_rate": 3.494152046783626e-05, "loss": 0.1045, "step": 1410 }, { "epoch": 36.99346405228758, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.25290319323539734, "eval_runtime": 22.9294, "eval_samples_per_second": 5.931, "eval_steps_per_second": 0.741, "step": 1415 }, { "epoch": 37.12418300653595, "grad_norm": 14.719789505004883, "learning_rate": 3.4795321637426905e-05, "loss": 0.0977, "step": 1420 }, { "epoch": 37.38562091503268, "grad_norm": 21.388763427734375, "learning_rate": 3.4649122807017546e-05, "loss": 0.0374, "step": 1430 }, { "epoch": 37.64705882352941, "grad_norm": 7.066629886627197, "learning_rate": 3.4502923976608186e-05, "loss": 0.0819, "step": 1440 }, { "epoch": 37.908496732026144, "grad_norm": 4.583933353424072, "learning_rate": 3.435672514619883e-05, "loss": 0.0815, "step": 1450 }, { "epoch": 37.98692810457516, "eval_accuracy": 0.9338235294117647, "eval_loss": 0.3510648012161255, "eval_runtime": 21.3875, "eval_samples_per_second": 6.359, "eval_steps_per_second": 0.795, "step": 1453 }, { "epoch": 38.169934640522875, "grad_norm": 14.378546714782715, "learning_rate": 3.421052631578947e-05, "loss": 0.1109, "step": 1460 }, { "epoch": 38.431372549019606, "grad_norm": 4.1210408210754395, "learning_rate": 3.406432748538012e-05, "loss": 0.052, "step": 1470 }, { "epoch": 38.69281045751634, "grad_norm": 18.48431396484375, "learning_rate": 3.391812865497076e-05, "loss": 0.0932, "step": 1480 }, { "epoch": 38.95424836601307, "grad_norm": 30.51089859008789, "learning_rate": 3.377192982456141e-05, "loss": 0.0761, "step": 1490 }, { "epoch": 38.98039215686274, "eval_accuracy": 0.9338235294117647, "eval_loss": 0.31144019961357117, "eval_runtime": 32.6124, "eval_samples_per_second": 4.17, "eval_steps_per_second": 0.521, "step": 1491 }, { "epoch": 39.21568627450981, "grad_norm": 29.487356185913086, "learning_rate": 3.362573099415205e-05, "loss": 0.0995, "step": 1500 }, { "epoch": 39.47712418300654, "grad_norm": 4.752898216247559, "learning_rate": 3.3479532163742695e-05, "loss": 0.0986, "step": 1510 }, { "epoch": 39.73856209150327, "grad_norm": 23.433902740478516, "learning_rate": 3.3333333333333335e-05, "loss": 0.0908, "step": 1520 }, { "epoch": 40.0, "grad_norm": 8.154867172241211, "learning_rate": 3.3187134502923975e-05, "loss": 0.0747, "step": 1530 }, { "epoch": 40.0, "eval_accuracy": 0.9338235294117647, "eval_loss": 0.2836870849132538, "eval_runtime": 33.717, "eval_samples_per_second": 4.034, "eval_steps_per_second": 0.504, "step": 1530 }, { "epoch": 40.26143790849673, "grad_norm": 66.09915924072266, "learning_rate": 3.304093567251462e-05, "loss": 0.0746, "step": 1540 }, { "epoch": 40.52287581699346, "grad_norm": 8.447415351867676, "learning_rate": 3.289473684210527e-05, "loss": 0.0809, "step": 1550 }, { "epoch": 40.78431372549019, "grad_norm": 11.7717866897583, "learning_rate": 3.274853801169591e-05, "loss": 0.0545, "step": 1560 }, { "epoch": 40.99346405228758, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.42687493562698364, "eval_runtime": 30.8285, "eval_samples_per_second": 4.412, "eval_steps_per_second": 0.551, "step": 1568 }, { "epoch": 41.04575163398693, "grad_norm": 2.3586502075195312, "learning_rate": 3.260233918128655e-05, "loss": 0.058, "step": 1570 }, { "epoch": 41.30718954248366, "grad_norm": 31.519433975219727, "learning_rate": 3.24561403508772e-05, "loss": 0.0838, "step": 1580 }, { "epoch": 41.568627450980394, "grad_norm": 0.15550392866134644, "learning_rate": 3.230994152046784e-05, "loss": 0.0853, "step": 1590 }, { "epoch": 41.830065359477125, "grad_norm": 6.823671340942383, "learning_rate": 3.216374269005848e-05, "loss": 0.0796, "step": 1600 }, { "epoch": 41.98692810457516, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.23307542502880096, "eval_runtime": 33.1415, "eval_samples_per_second": 4.104, "eval_steps_per_second": 0.513, "step": 1606 }, { "epoch": 42.091503267973856, "grad_norm": 11.52629566192627, "learning_rate": 3.2017543859649124e-05, "loss": 0.0903, "step": 1610 }, { "epoch": 42.35294117647059, "grad_norm": 11.996484756469727, "learning_rate": 3.187134502923977e-05, "loss": 0.0595, "step": 1620 }, { "epoch": 42.61437908496732, "grad_norm": 1.5475754737854004, "learning_rate": 3.172514619883041e-05, "loss": 0.0993, "step": 1630 }, { "epoch": 42.87581699346405, "grad_norm": 18.27874755859375, "learning_rate": 3.157894736842105e-05, "loss": 0.055, "step": 1640 }, { "epoch": 42.98039215686274, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.28995171189308167, "eval_runtime": 31.1656, "eval_samples_per_second": 4.364, "eval_steps_per_second": 0.545, "step": 1644 }, { "epoch": 43.13725490196079, "grad_norm": 1.7079222202301025, "learning_rate": 3.14327485380117e-05, "loss": 0.0851, "step": 1650 }, { "epoch": 43.39869281045752, "grad_norm": 0.0829237625002861, "learning_rate": 3.128654970760234e-05, "loss": 0.061, "step": 1660 }, { "epoch": 43.66013071895425, "grad_norm": 2.6961874961853027, "learning_rate": 3.1140350877192986e-05, "loss": 0.0205, "step": 1670 }, { "epoch": 43.92156862745098, "grad_norm": 3.1870129108428955, "learning_rate": 3.0994152046783626e-05, "loss": 0.0706, "step": 1680 }, { "epoch": 44.0, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.3367806077003479, "eval_runtime": 25.249, "eval_samples_per_second": 5.386, "eval_steps_per_second": 0.673, "step": 1683 }, { "epoch": 44.18300653594771, "grad_norm": 10.678839683532715, "learning_rate": 3.084795321637427e-05, "loss": 0.0555, "step": 1690 }, { "epoch": 44.44444444444444, "grad_norm": 0.1511285901069641, "learning_rate": 3.0701754385964913e-05, "loss": 0.0463, "step": 1700 }, { "epoch": 44.705882352941174, "grad_norm": 19.222854614257812, "learning_rate": 3.055555555555556e-05, "loss": 0.0783, "step": 1710 }, { "epoch": 44.967320261437905, "grad_norm": 12.824193954467773, "learning_rate": 3.0409356725146197e-05, "loss": 0.0505, "step": 1720 }, { "epoch": 44.99346405228758, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.3779818117618561, "eval_runtime": 19.0793, "eval_samples_per_second": 7.128, "eval_steps_per_second": 0.891, "step": 1721 }, { "epoch": 45.22875816993464, "grad_norm": 18.495044708251953, "learning_rate": 3.0263157894736844e-05, "loss": 0.0679, "step": 1730 }, { "epoch": 45.490196078431374, "grad_norm": 22.039566040039062, "learning_rate": 3.0116959064327488e-05, "loss": 0.0618, "step": 1740 }, { "epoch": 45.751633986928105, "grad_norm": 0.6790270209312439, "learning_rate": 2.997076023391813e-05, "loss": 0.0698, "step": 1750 }, { "epoch": 45.98692810457516, "eval_accuracy": 0.9191176470588235, "eval_loss": 0.48222464323043823, "eval_runtime": 33.9657, "eval_samples_per_second": 4.004, "eval_steps_per_second": 0.501, "step": 1759 }, { "epoch": 46.01307189542484, "grad_norm": 48.15066909790039, "learning_rate": 2.9824561403508772e-05, "loss": 0.0745, "step": 1760 }, { "epoch": 46.27450980392157, "grad_norm": 48.96921920776367, "learning_rate": 2.9678362573099415e-05, "loss": 0.11, "step": 1770 }, { "epoch": 46.5359477124183, "grad_norm": 16.973966598510742, "learning_rate": 2.9532163742690062e-05, "loss": 0.0183, "step": 1780 }, { "epoch": 46.79738562091503, "grad_norm": 11.563841819763184, "learning_rate": 2.9385964912280706e-05, "loss": 0.0275, "step": 1790 }, { "epoch": 46.98039215686274, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.34339553117752075, "eval_runtime": 33.4784, "eval_samples_per_second": 4.062, "eval_steps_per_second": 0.508, "step": 1797 }, { "epoch": 47.05882352941177, "grad_norm": 18.660812377929688, "learning_rate": 2.9239766081871346e-05, "loss": 0.0307, "step": 1800 }, { "epoch": 47.3202614379085, "grad_norm": 19.048458099365234, "learning_rate": 2.909356725146199e-05, "loss": 0.036, "step": 1810 }, { "epoch": 47.58169934640523, "grad_norm": 0.8519901037216187, "learning_rate": 2.8947368421052634e-05, "loss": 0.0491, "step": 1820 }, { "epoch": 47.84313725490196, "grad_norm": 0.9929773211479187, "learning_rate": 2.8801169590643277e-05, "loss": 0.0641, "step": 1830 }, { "epoch": 48.0, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.3386637568473816, "eval_runtime": 33.9575, "eval_samples_per_second": 4.005, "eval_steps_per_second": 0.501, "step": 1836 }, { "epoch": 48.10457516339869, "grad_norm": 27.548429489135742, "learning_rate": 2.8654970760233917e-05, "loss": 0.0634, "step": 1840 }, { "epoch": 48.36601307189542, "grad_norm": 0.4367322027683258, "learning_rate": 2.850877192982456e-05, "loss": 0.0756, "step": 1850 }, { "epoch": 48.627450980392155, "grad_norm": 18.30873680114746, "learning_rate": 2.8362573099415208e-05, "loss": 0.0134, "step": 1860 }, { "epoch": 48.888888888888886, "grad_norm": 0.011559017933905125, "learning_rate": 2.821637426900585e-05, "loss": 0.0484, "step": 1870 }, { "epoch": 48.99346405228758, "eval_accuracy": 0.9191176470588235, "eval_loss": 0.5349822640419006, "eval_runtime": 38.4788, "eval_samples_per_second": 3.534, "eval_steps_per_second": 0.442, "step": 1874 }, { "epoch": 49.150326797385624, "grad_norm": 2.1214957237243652, "learning_rate": 2.8070175438596492e-05, "loss": 0.088, "step": 1880 }, { "epoch": 49.411764705882355, "grad_norm": 27.645193099975586, "learning_rate": 2.7923976608187135e-05, "loss": 0.0621, "step": 1890 }, { "epoch": 49.673202614379086, "grad_norm": 1.3699434995651245, "learning_rate": 2.777777777777778e-05, "loss": 0.0528, "step": 1900 }, { "epoch": 49.93464052287582, "grad_norm": 8.130342483520508, "learning_rate": 2.7631578947368426e-05, "loss": 0.0388, "step": 1910 }, { "epoch": 49.98692810457516, "eval_accuracy": 0.9117647058823529, "eval_loss": 0.382554292678833, "eval_runtime": 33.8716, "eval_samples_per_second": 4.015, "eval_steps_per_second": 0.502, "step": 1912 }, { "epoch": 50.19607843137255, "grad_norm": 47.961002349853516, "learning_rate": 2.7485380116959063e-05, "loss": 0.0941, "step": 1920 }, { "epoch": 50.45751633986928, "grad_norm": 36.82217025756836, "learning_rate": 2.733918128654971e-05, "loss": 0.0863, "step": 1930 }, { "epoch": 50.71895424836601, "grad_norm": 5.911373615264893, "learning_rate": 2.7192982456140354e-05, "loss": 0.0324, "step": 1940 }, { "epoch": 50.98039215686274, "grad_norm": 24.99283790588379, "learning_rate": 2.7046783625730997e-05, "loss": 0.0347, "step": 1950 }, { "epoch": 50.98039215686274, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.3738501965999603, "eval_runtime": 30.759, "eval_samples_per_second": 4.421, "eval_steps_per_second": 0.553, "step": 1950 }, { "epoch": 51.24183006535948, "grad_norm": 70.3333969116211, "learning_rate": 2.6900584795321637e-05, "loss": 0.0428, "step": 1960 }, { "epoch": 51.50326797385621, "grad_norm": 13.072953224182129, "learning_rate": 2.675438596491228e-05, "loss": 0.0505, "step": 1970 }, { "epoch": 51.76470588235294, "grad_norm": 39.30720520019531, "learning_rate": 2.6608187134502928e-05, "loss": 0.1046, "step": 1980 }, { "epoch": 52.0, "eval_accuracy": 0.9117647058823529, "eval_loss": 0.3074805736541748, "eval_runtime": 33.894, "eval_samples_per_second": 4.013, "eval_steps_per_second": 0.502, "step": 1989 }, { "epoch": 52.02614379084967, "grad_norm": 23.061525344848633, "learning_rate": 2.6461988304093572e-05, "loss": 0.0566, "step": 1990 }, { "epoch": 52.287581699346404, "grad_norm": 2.5243396759033203, "learning_rate": 2.6315789473684212e-05, "loss": 0.0605, "step": 2000 }, { "epoch": 52.549019607843135, "grad_norm": 11.470220565795898, "learning_rate": 2.6169590643274856e-05, "loss": 0.0767, "step": 2010 }, { "epoch": 52.810457516339866, "grad_norm": 0.23322105407714844, "learning_rate": 2.60233918128655e-05, "loss": 0.0298, "step": 2020 }, { "epoch": 52.99346405228758, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.3557595908641815, "eval_runtime": 25.1218, "eval_samples_per_second": 5.414, "eval_steps_per_second": 0.677, "step": 2027 }, { "epoch": 53.071895424836605, "grad_norm": 4.624847412109375, "learning_rate": 2.5877192982456143e-05, "loss": 0.0563, "step": 2030 }, { "epoch": 53.333333333333336, "grad_norm": 0.25727781653404236, "learning_rate": 2.5730994152046783e-05, "loss": 0.0977, "step": 2040 }, { "epoch": 53.59477124183007, "grad_norm": 0.22140049934387207, "learning_rate": 2.5584795321637427e-05, "loss": 0.0199, "step": 2050 }, { "epoch": 53.8562091503268, "grad_norm": 0.9178116321563721, "learning_rate": 2.5438596491228074e-05, "loss": 0.0478, "step": 2060 }, { "epoch": 53.98692810457516, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.30555427074432373, "eval_runtime": 37.1043, "eval_samples_per_second": 3.665, "eval_steps_per_second": 0.458, "step": 2065 }, { "epoch": 54.11764705882353, "grad_norm": 19.221540451049805, "learning_rate": 2.5292397660818717e-05, "loss": 0.0289, "step": 2070 }, { "epoch": 54.37908496732026, "grad_norm": 1.848120093345642, "learning_rate": 2.5146198830409358e-05, "loss": 0.095, "step": 2080 }, { "epoch": 54.64052287581699, "grad_norm": 10.04775619506836, "learning_rate": 2.5e-05, "loss": 0.0218, "step": 2090 }, { "epoch": 54.90196078431372, "grad_norm": 0.047169651836156845, "learning_rate": 2.485380116959064e-05, "loss": 0.0285, "step": 2100 }, { "epoch": 54.98039215686274, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.28512153029441833, "eval_runtime": 32.4012, "eval_samples_per_second": 4.197, "eval_steps_per_second": 0.525, "step": 2103 }, { "epoch": 55.16339869281046, "grad_norm": 2.4437642097473145, "learning_rate": 2.470760233918129e-05, "loss": 0.0029, "step": 2110 }, { "epoch": 55.42483660130719, "grad_norm": 14.518400192260742, "learning_rate": 2.456140350877193e-05, "loss": 0.0621, "step": 2120 }, { "epoch": 55.68627450980392, "grad_norm": 2.9272749423980713, "learning_rate": 2.4415204678362576e-05, "loss": 0.0129, "step": 2130 }, { "epoch": 55.947712418300654, "grad_norm": 19.935407638549805, "learning_rate": 2.4269005847953216e-05, "loss": 0.0407, "step": 2140 }, { "epoch": 56.0, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.32225164771080017, "eval_runtime": 33.148, "eval_samples_per_second": 4.103, "eval_steps_per_second": 0.513, "step": 2142 }, { "epoch": 56.209150326797385, "grad_norm": 32.69438934326172, "learning_rate": 2.412280701754386e-05, "loss": 0.0161, "step": 2150 }, { "epoch": 56.470588235294116, "grad_norm": 0.04998353496193886, "learning_rate": 2.3976608187134503e-05, "loss": 0.0446, "step": 2160 }, { "epoch": 56.73202614379085, "grad_norm": 0.830470085144043, "learning_rate": 2.3830409356725147e-05, "loss": 0.1066, "step": 2170 }, { "epoch": 56.99346405228758, "grad_norm": 21.04816436767578, "learning_rate": 2.368421052631579e-05, "loss": 0.0459, "step": 2180 }, { "epoch": 56.99346405228758, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.45745787024497986, "eval_runtime": 31.4986, "eval_samples_per_second": 4.318, "eval_steps_per_second": 0.54, "step": 2180 }, { "epoch": 57.254901960784316, "grad_norm": 6.693302631378174, "learning_rate": 2.3538011695906434e-05, "loss": 0.0569, "step": 2190 }, { "epoch": 57.51633986928105, "grad_norm": 12.218875885009766, "learning_rate": 2.3391812865497074e-05, "loss": 0.0455, "step": 2200 }, { "epoch": 57.77777777777778, "grad_norm": 56.21259689331055, "learning_rate": 2.324561403508772e-05, "loss": 0.0409, "step": 2210 }, { "epoch": 57.98692810457516, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.29300644993782043, "eval_runtime": 31.4287, "eval_samples_per_second": 4.327, "eval_steps_per_second": 0.541, "step": 2218 }, { "epoch": 58.03921568627451, "grad_norm": 0.48025286197662354, "learning_rate": 2.309941520467836e-05, "loss": 0.0526, "step": 2220 }, { "epoch": 58.30065359477124, "grad_norm": 6.530683994293213, "learning_rate": 2.295321637426901e-05, "loss": 0.0791, "step": 2230 }, { "epoch": 58.56209150326797, "grad_norm": 35.76517105102539, "learning_rate": 2.280701754385965e-05, "loss": 0.033, "step": 2240 }, { "epoch": 58.8235294117647, "grad_norm": 4.9538679122924805, "learning_rate": 2.2660818713450292e-05, "loss": 0.0743, "step": 2250 }, { "epoch": 58.98039215686274, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.4032076299190521, "eval_runtime": 34.2283, "eval_samples_per_second": 3.973, "eval_steps_per_second": 0.497, "step": 2256 }, { "epoch": 59.08496732026144, "grad_norm": 8.96496868133545, "learning_rate": 2.2514619883040936e-05, "loss": 0.0358, "step": 2260 }, { "epoch": 59.34640522875817, "grad_norm": 10.487314224243164, "learning_rate": 2.236842105263158e-05, "loss": 0.0805, "step": 2270 }, { "epoch": 59.6078431372549, "grad_norm": 3.922236442565918, "learning_rate": 2.2222222222222223e-05, "loss": 0.0096, "step": 2280 }, { "epoch": 59.869281045751634, "grad_norm": 5.181495666503906, "learning_rate": 2.2076023391812867e-05, "loss": 0.0346, "step": 2290 }, { "epoch": 60.0, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.37382781505584717, "eval_runtime": 37.1282, "eval_samples_per_second": 3.663, "eval_steps_per_second": 0.458, "step": 2295 }, { "epoch": 60.130718954248366, "grad_norm": 0.059666648507118225, "learning_rate": 2.1929824561403507e-05, "loss": 0.0551, "step": 2300 }, { "epoch": 60.3921568627451, "grad_norm": 0.5856298804283142, "learning_rate": 2.1783625730994154e-05, "loss": 0.0331, "step": 2310 }, { "epoch": 60.65359477124183, "grad_norm": 5.777927875518799, "learning_rate": 2.1637426900584794e-05, "loss": 0.0112, "step": 2320 }, { "epoch": 60.91503267973856, "grad_norm": 13.134035110473633, "learning_rate": 2.149122807017544e-05, "loss": 0.0302, "step": 2330 }, { "epoch": 60.99346405228758, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.3597317337989807, "eval_runtime": 31.126, "eval_samples_per_second": 4.369, "eval_steps_per_second": 0.546, "step": 2333 }, { "epoch": 61.1764705882353, "grad_norm": 28.286643981933594, "learning_rate": 2.134502923976608e-05, "loss": 0.0311, "step": 2340 }, { "epoch": 61.43790849673203, "grad_norm": 6.936996936798096, "learning_rate": 2.1198830409356725e-05, "loss": 0.139, "step": 2350 }, { "epoch": 61.69934640522876, "grad_norm": 1.0503500699996948, "learning_rate": 2.105263157894737e-05, "loss": 0.0666, "step": 2360 }, { "epoch": 61.96078431372549, "grad_norm": 5.756121635437012, "learning_rate": 2.0906432748538013e-05, "loss": 0.0488, "step": 2370 }, { "epoch": 61.98692810457516, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.2594568133354187, "eval_runtime": 34.9133, "eval_samples_per_second": 3.895, "eval_steps_per_second": 0.487, "step": 2371 }, { "epoch": 62.22222222222222, "grad_norm": 17.791810989379883, "learning_rate": 2.0760233918128656e-05, "loss": 0.0294, "step": 2380 }, { "epoch": 62.48366013071895, "grad_norm": 0.014880876056849957, "learning_rate": 2.06140350877193e-05, "loss": 0.0516, "step": 2390 }, { "epoch": 62.745098039215684, "grad_norm": 33.730533599853516, "learning_rate": 2.046783625730994e-05, "loss": 0.0562, "step": 2400 }, { "epoch": 62.98039215686274, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.3763536512851715, "eval_runtime": 35.0422, "eval_samples_per_second": 3.881, "eval_steps_per_second": 0.485, "step": 2409 }, { "epoch": 63.00653594771242, "grad_norm": 58.39078903198242, "learning_rate": 2.0321637426900587e-05, "loss": 0.0751, "step": 2410 }, { "epoch": 63.26797385620915, "grad_norm": 0.0864597037434578, "learning_rate": 2.0175438596491227e-05, "loss": 0.0393, "step": 2420 }, { "epoch": 63.529411764705884, "grad_norm": 18.966829299926758, "learning_rate": 2.0029239766081874e-05, "loss": 0.0251, "step": 2430 }, { "epoch": 63.790849673202615, "grad_norm": 25.66364288330078, "learning_rate": 1.9883040935672515e-05, "loss": 0.0216, "step": 2440 }, { "epoch": 64.0, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.2643776834011078, "eval_runtime": 17.3782, "eval_samples_per_second": 7.826, "eval_steps_per_second": 0.978, "step": 2448 }, { "epoch": 64.05228758169935, "grad_norm": 1.6527997255325317, "learning_rate": 1.9736842105263158e-05, "loss": 0.054, "step": 2450 }, { "epoch": 64.31372549019608, "grad_norm": 0.06280579417943954, "learning_rate": 1.9590643274853802e-05, "loss": 0.0287, "step": 2460 }, { "epoch": 64.57516339869281, "grad_norm": 1.6318433284759521, "learning_rate": 1.9444444444444445e-05, "loss": 0.0399, "step": 2470 }, { "epoch": 64.83660130718954, "grad_norm": 1.7933380603790283, "learning_rate": 1.929824561403509e-05, "loss": 0.0219, "step": 2480 }, { "epoch": 64.99346405228758, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.30917930603027344, "eval_runtime": 17.1251, "eval_samples_per_second": 7.942, "eval_steps_per_second": 0.993, "step": 2486 }, { "epoch": 65.09803921568627, "grad_norm": 10.366903305053711, "learning_rate": 1.9152046783625733e-05, "loss": 0.0539, "step": 2490 }, { "epoch": 65.359477124183, "grad_norm": 0.2696276307106018, "learning_rate": 1.9005847953216373e-05, "loss": 0.0123, "step": 2500 }, { "epoch": 65.62091503267973, "grad_norm": 2.0707309246063232, "learning_rate": 1.885964912280702e-05, "loss": 0.0209, "step": 2510 }, { "epoch": 65.88235294117646, "grad_norm": 0.026714438572525978, "learning_rate": 1.871345029239766e-05, "loss": 0.0272, "step": 2520 }, { "epoch": 65.98692810457516, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.2898404896259308, "eval_runtime": 17.5281, "eval_samples_per_second": 7.759, "eval_steps_per_second": 0.97, "step": 2524 }, { "epoch": 66.14379084967321, "grad_norm": 0.15798357129096985, "learning_rate": 1.8567251461988304e-05, "loss": 0.0091, "step": 2530 }, { "epoch": 66.40522875816994, "grad_norm": 85.56695556640625, "learning_rate": 1.8421052631578947e-05, "loss": 0.0221, "step": 2540 }, { "epoch": 66.66666666666667, "grad_norm": 25.615230560302734, "learning_rate": 1.827485380116959e-05, "loss": 0.0645, "step": 2550 }, { "epoch": 66.9281045751634, "grad_norm": 22.72310447692871, "learning_rate": 1.8128654970760235e-05, "loss": 0.027, "step": 2560 }, { "epoch": 66.98039215686275, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.2693423628807068, "eval_runtime": 23.0579, "eval_samples_per_second": 5.898, "eval_steps_per_second": 0.737, "step": 2562 }, { "epoch": 67.18954248366013, "grad_norm": 24.883161544799805, "learning_rate": 1.7982456140350878e-05, "loss": 0.0293, "step": 2570 }, { "epoch": 67.45098039215686, "grad_norm": 6.90622615814209, "learning_rate": 1.7836257309941522e-05, "loss": 0.022, "step": 2580 }, { "epoch": 67.7124183006536, "grad_norm": 48.23540115356445, "learning_rate": 1.7690058479532165e-05, "loss": 0.0509, "step": 2590 }, { "epoch": 67.97385620915033, "grad_norm": 0.07863592356443405, "learning_rate": 1.7543859649122806e-05, "loss": 0.0397, "step": 2600 }, { "epoch": 68.0, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.38426852226257324, "eval_runtime": 23.971, "eval_samples_per_second": 5.674, "eval_steps_per_second": 0.709, "step": 2601 }, { "epoch": 68.23529411764706, "grad_norm": 4.26972770690918, "learning_rate": 1.7397660818713453e-05, "loss": 0.0409, "step": 2610 }, { "epoch": 68.49673202614379, "grad_norm": 1.8150982856750488, "learning_rate": 1.7251461988304093e-05, "loss": 0.0315, "step": 2620 }, { "epoch": 68.75816993464052, "grad_norm": 13.07569694519043, "learning_rate": 1.7105263157894737e-05, "loss": 0.0154, "step": 2630 }, { "epoch": 68.99346405228758, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.30511775612831116, "eval_runtime": 23.3134, "eval_samples_per_second": 5.834, "eval_steps_per_second": 0.729, "step": 2639 }, { "epoch": 69.01960784313725, "grad_norm": 0.576351523399353, "learning_rate": 1.695906432748538e-05, "loss": 0.0387, "step": 2640 }, { "epoch": 69.28104575163398, "grad_norm": 0.867915153503418, "learning_rate": 1.6812865497076024e-05, "loss": 0.0178, "step": 2650 }, { "epoch": 69.54248366013071, "grad_norm": 20.2279052734375, "learning_rate": 1.6666666666666667e-05, "loss": 0.0392, "step": 2660 }, { "epoch": 69.80392156862744, "grad_norm": 0.04353189095854759, "learning_rate": 1.652046783625731e-05, "loss": 0.0004, "step": 2670 }, { "epoch": 69.98692810457516, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.39089399576187134, "eval_runtime": 23.3469, "eval_samples_per_second": 5.825, "eval_steps_per_second": 0.728, "step": 2677 }, { "epoch": 70.06535947712419, "grad_norm": 77.49730682373047, "learning_rate": 1.6374269005847955e-05, "loss": 0.0467, "step": 2680 }, { "epoch": 70.32679738562092, "grad_norm": 49.50137710571289, "learning_rate": 1.62280701754386e-05, "loss": 0.0228, "step": 2690 }, { "epoch": 70.58823529411765, "grad_norm": 0.5024857521057129, "learning_rate": 1.608187134502924e-05, "loss": 0.0045, "step": 2700 }, { "epoch": 70.84967320261438, "grad_norm": 3.8934128284454346, "learning_rate": 1.5935672514619886e-05, "loss": 0.0651, "step": 2710 }, { "epoch": 70.98039215686275, "eval_accuracy": 0.9485294117647058, "eval_loss": 0.29772186279296875, "eval_runtime": 25.8712, "eval_samples_per_second": 5.257, "eval_steps_per_second": 0.657, "step": 2715 }, { "epoch": 71.11111111111111, "grad_norm": 7.867006778717041, "learning_rate": 1.5789473684210526e-05, "loss": 0.008, "step": 2720 }, { "epoch": 71.37254901960785, "grad_norm": 13.64209270477295, "learning_rate": 1.564327485380117e-05, "loss": 0.0757, "step": 2730 }, { "epoch": 71.63398692810458, "grad_norm": 6.453034400939941, "learning_rate": 1.5497076023391813e-05, "loss": 0.0214, "step": 2740 }, { "epoch": 71.89542483660131, "grad_norm": 0.1501288115978241, "learning_rate": 1.5350877192982457e-05, "loss": 0.016, "step": 2750 }, { "epoch": 72.0, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.2694728374481201, "eval_runtime": 20.9056, "eval_samples_per_second": 6.505, "eval_steps_per_second": 0.813, "step": 2754 }, { "epoch": 72.15686274509804, "grad_norm": 0.034015778452157974, "learning_rate": 1.5204678362573099e-05, "loss": 0.012, "step": 2760 }, { "epoch": 72.41830065359477, "grad_norm": 11.159213066101074, "learning_rate": 1.5058479532163744e-05, "loss": 0.0444, "step": 2770 }, { "epoch": 72.6797385620915, "grad_norm": 2.5402066707611084, "learning_rate": 1.4912280701754386e-05, "loss": 0.0359, "step": 2780 }, { "epoch": 72.94117647058823, "grad_norm": 0.016565600410103798, "learning_rate": 1.4766081871345031e-05, "loss": 0.0351, "step": 2790 }, { "epoch": 72.99346405228758, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.2720423936843872, "eval_runtime": 22.3116, "eval_samples_per_second": 6.095, "eval_steps_per_second": 0.762, "step": 2792 }, { "epoch": 73.20261437908496, "grad_norm": 79.11601257324219, "learning_rate": 1.4619883040935673e-05, "loss": 0.044, "step": 2800 }, { "epoch": 73.4640522875817, "grad_norm": 5.53911018371582, "learning_rate": 1.4473684210526317e-05, "loss": 0.0298, "step": 2810 }, { "epoch": 73.72549019607843, "grad_norm": 0.40750911831855774, "learning_rate": 1.4327485380116959e-05, "loss": 0.011, "step": 2820 }, { "epoch": 73.98692810457516, "grad_norm": 0.9360626339912415, "learning_rate": 1.4181286549707604e-05, "loss": 0.0206, "step": 2830 }, { "epoch": 73.98692810457516, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.25490206480026245, "eval_runtime": 22.7726, "eval_samples_per_second": 5.972, "eval_steps_per_second": 0.747, "step": 2830 }, { "epoch": 74.2483660130719, "grad_norm": 6.835451602935791, "learning_rate": 1.4035087719298246e-05, "loss": 0.0109, "step": 2840 }, { "epoch": 74.50980392156863, "grad_norm": 0.1265513300895691, "learning_rate": 1.388888888888889e-05, "loss": 0.0436, "step": 2850 }, { "epoch": 74.77124183006536, "grad_norm": 0.20871244370937347, "learning_rate": 1.3742690058479531e-05, "loss": 0.0109, "step": 2860 }, { "epoch": 74.98039215686275, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.24122387170791626, "eval_runtime": 19.4498, "eval_samples_per_second": 6.992, "eval_steps_per_second": 0.874, "step": 2868 }, { "epoch": 75.0326797385621, "grad_norm": 24.267925262451172, "learning_rate": 1.3596491228070177e-05, "loss": 0.0207, "step": 2870 }, { "epoch": 75.29411764705883, "grad_norm": 9.061148643493652, "learning_rate": 1.3450292397660819e-05, "loss": 0.0105, "step": 2880 }, { "epoch": 75.55555555555556, "grad_norm": 1.2824314832687378, "learning_rate": 1.3304093567251464e-05, "loss": 0.0182, "step": 2890 }, { "epoch": 75.81699346405229, "grad_norm": 0.003347081132233143, "learning_rate": 1.3157894736842106e-05, "loss": 0.0012, "step": 2900 }, { "epoch": 76.0, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.34939995408058167, "eval_runtime": 20.8219, "eval_samples_per_second": 6.532, "eval_steps_per_second": 0.816, "step": 2907 }, { "epoch": 76.07843137254902, "grad_norm": 5.410060882568359, "learning_rate": 1.301169590643275e-05, "loss": 0.0214, "step": 2910 }, { "epoch": 76.33986928104575, "grad_norm": 0.6613653898239136, "learning_rate": 1.2865497076023392e-05, "loss": 0.0261, "step": 2920 }, { "epoch": 76.60130718954248, "grad_norm": 1.0403037071228027, "learning_rate": 1.2719298245614037e-05, "loss": 0.0555, "step": 2930 }, { "epoch": 76.86274509803921, "grad_norm": 15.238615036010742, "learning_rate": 1.2573099415204679e-05, "loss": 0.0418, "step": 2940 }, { "epoch": 76.99346405228758, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.37292152643203735, "eval_runtime": 20.8077, "eval_samples_per_second": 6.536, "eval_steps_per_second": 0.817, "step": 2945 }, { "epoch": 77.12418300653594, "grad_norm": 31.79336166381836, "learning_rate": 1.242690058479532e-05, "loss": 0.0302, "step": 2950 }, { "epoch": 77.38562091503267, "grad_norm": 0.0776483416557312, "learning_rate": 1.2280701754385964e-05, "loss": 0.0094, "step": 2960 }, { "epoch": 77.6470588235294, "grad_norm": 63.487571716308594, "learning_rate": 1.2134502923976608e-05, "loss": 0.0473, "step": 2970 }, { "epoch": 77.90849673202614, "grad_norm": 0.09107412397861481, "learning_rate": 1.1988304093567252e-05, "loss": 0.0165, "step": 2980 }, { "epoch": 77.98692810457516, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.347072571516037, "eval_runtime": 17.8737, "eval_samples_per_second": 7.609, "eval_steps_per_second": 0.951, "step": 2983 }, { "epoch": 78.16993464052288, "grad_norm": 36.47078323364258, "learning_rate": 1.1842105263157895e-05, "loss": 0.0176, "step": 2990 }, { "epoch": 78.43137254901961, "grad_norm": 0.0024324676487594843, "learning_rate": 1.1695906432748537e-05, "loss": 0.0317, "step": 3000 }, { "epoch": 78.69281045751634, "grad_norm": 26.059871673583984, "learning_rate": 1.154970760233918e-05, "loss": 0.0699, "step": 3010 }, { "epoch": 78.95424836601308, "grad_norm": 38.14042282104492, "learning_rate": 1.1403508771929824e-05, "loss": 0.0163, "step": 3020 }, { "epoch": 78.98039215686275, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.29730716347694397, "eval_runtime": 18.5858, "eval_samples_per_second": 7.317, "eval_steps_per_second": 0.915, "step": 3021 }, { "epoch": 79.2156862745098, "grad_norm": 87.14070129394531, "learning_rate": 1.1257309941520468e-05, "loss": 0.0556, "step": 3030 }, { "epoch": 79.47712418300654, "grad_norm": 3.418160915374756, "learning_rate": 1.1111111111111112e-05, "loss": 0.0073, "step": 3040 }, { "epoch": 79.73856209150327, "grad_norm": 22.285499572753906, "learning_rate": 1.0964912280701754e-05, "loss": 0.0249, "step": 3050 }, { "epoch": 80.0, "grad_norm": 35.9242057800293, "learning_rate": 1.0818713450292397e-05, "loss": 0.0202, "step": 3060 }, { "epoch": 80.0, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.3729775846004486, "eval_runtime": 19.8789, "eval_samples_per_second": 6.841, "eval_steps_per_second": 0.855, "step": 3060 }, { "epoch": 80.26143790849673, "grad_norm": 15.128210067749023, "learning_rate": 1.067251461988304e-05, "loss": 0.0628, "step": 3070 }, { "epoch": 80.52287581699346, "grad_norm": 29.2634220123291, "learning_rate": 1.0526315789473684e-05, "loss": 0.0244, "step": 3080 }, { "epoch": 80.7843137254902, "grad_norm": 79.84837341308594, "learning_rate": 1.0380116959064328e-05, "loss": 0.0368, "step": 3090 }, { "epoch": 80.99346405228758, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.2876713275909424, "eval_runtime": 19.4821, "eval_samples_per_second": 6.981, "eval_steps_per_second": 0.873, "step": 3098 }, { "epoch": 81.04575163398692, "grad_norm": 2.7281501293182373, "learning_rate": 1.023391812865497e-05, "loss": 0.0238, "step": 3100 }, { "epoch": 81.30718954248366, "grad_norm": 0.0004346697241999209, "learning_rate": 1.0087719298245614e-05, "loss": 0.0305, "step": 3110 }, { "epoch": 81.56862745098039, "grad_norm": 0.03860533982515335, "learning_rate": 9.941520467836257e-06, "loss": 0.0136, "step": 3120 }, { "epoch": 81.83006535947712, "grad_norm": 0.4280990958213806, "learning_rate": 9.795321637426901e-06, "loss": 0.0374, "step": 3130 }, { "epoch": 81.98692810457516, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.41433659195899963, "eval_runtime": 19.9936, "eval_samples_per_second": 6.802, "eval_steps_per_second": 0.85, "step": 3136 }, { "epoch": 82.09150326797386, "grad_norm": 31.7745418548584, "learning_rate": 9.649122807017545e-06, "loss": 0.0105, "step": 3140 }, { "epoch": 82.3529411764706, "grad_norm": 2.9742166996002197, "learning_rate": 9.502923976608186e-06, "loss": 0.0361, "step": 3150 }, { "epoch": 82.61437908496733, "grad_norm": 3.588392734527588, "learning_rate": 9.35672514619883e-06, "loss": 0.0648, "step": 3160 }, { "epoch": 82.87581699346406, "grad_norm": 0.4829164147377014, "learning_rate": 9.210526315789474e-06, "loss": 0.0296, "step": 3170 }, { "epoch": 82.98039215686275, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.2895439565181732, "eval_runtime": 17.9847, "eval_samples_per_second": 7.562, "eval_steps_per_second": 0.945, "step": 3174 }, { "epoch": 83.13725490196079, "grad_norm": 22.893632888793945, "learning_rate": 9.064327485380117e-06, "loss": 0.0115, "step": 3180 }, { "epoch": 83.39869281045752, "grad_norm": 0.021368976682424545, "learning_rate": 8.918128654970761e-06, "loss": 0.0269, "step": 3190 }, { "epoch": 83.66013071895425, "grad_norm": 0.06225317716598511, "learning_rate": 8.771929824561403e-06, "loss": 0.0024, "step": 3200 }, { "epoch": 83.92156862745098, "grad_norm": 0.05705859139561653, "learning_rate": 8.625730994152046e-06, "loss": 0.0405, "step": 3210 }, { "epoch": 84.0, "eval_accuracy": 0.9558823529411765, "eval_loss": 0.29270094633102417, "eval_runtime": 19.1133, "eval_samples_per_second": 7.115, "eval_steps_per_second": 0.889, "step": 3213 }, { "epoch": 84.18300653594771, "grad_norm": 24.514904022216797, "learning_rate": 8.47953216374269e-06, "loss": 0.0098, "step": 3220 }, { "epoch": 84.44444444444444, "grad_norm": 0.596236526966095, "learning_rate": 8.333333333333334e-06, "loss": 0.0035, "step": 3230 }, { "epoch": 84.70588235294117, "grad_norm": 0.050445396453142166, "learning_rate": 8.187134502923977e-06, "loss": 0.005, "step": 3240 }, { "epoch": 84.9673202614379, "grad_norm": 0.07400578260421753, "learning_rate": 8.04093567251462e-06, "loss": 0.0097, "step": 3250 }, { "epoch": 84.99346405228758, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.317930668592453, "eval_runtime": 18.575, "eval_samples_per_second": 7.322, "eval_steps_per_second": 0.915, "step": 3251 }, { "epoch": 85.22875816993464, "grad_norm": 12.950275421142578, "learning_rate": 7.894736842105263e-06, "loss": 0.0026, "step": 3260 }, { "epoch": 85.49019607843137, "grad_norm": 16.546571731567383, "learning_rate": 7.748538011695907e-06, "loss": 0.0257, "step": 3270 }, { "epoch": 85.7516339869281, "grad_norm": 0.6142169237136841, "learning_rate": 7.602339181286549e-06, "loss": 0.0182, "step": 3280 }, { "epoch": 85.98692810457516, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.30465030670166016, "eval_runtime": 18.7827, "eval_samples_per_second": 7.241, "eval_steps_per_second": 0.905, "step": 3289 }, { "epoch": 86.01307189542484, "grad_norm": 0.09201680123806, "learning_rate": 7.456140350877193e-06, "loss": 0.0086, "step": 3290 }, { "epoch": 86.27450980392157, "grad_norm": 0.6810176372528076, "learning_rate": 7.3099415204678366e-06, "loss": 0.0033, "step": 3300 }, { "epoch": 86.5359477124183, "grad_norm": 7.0328474044799805, "learning_rate": 7.163742690058479e-06, "loss": 0.023, "step": 3310 }, { "epoch": 86.79738562091504, "grad_norm": 0.5138120055198669, "learning_rate": 7.017543859649123e-06, "loss": 0.0207, "step": 3320 }, { "epoch": 86.98039215686275, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.3018016815185547, "eval_runtime": 17.5979, "eval_samples_per_second": 7.728, "eval_steps_per_second": 0.966, "step": 3327 }, { "epoch": 87.05882352941177, "grad_norm": 0.11021004617214203, "learning_rate": 6.871345029239766e-06, "loss": 0.0711, "step": 3330 }, { "epoch": 87.3202614379085, "grad_norm": 0.03013734146952629, "learning_rate": 6.725146198830409e-06, "loss": 0.0424, "step": 3340 }, { "epoch": 87.58169934640523, "grad_norm": 69.32197570800781, "learning_rate": 6.578947368421053e-06, "loss": 0.0269, "step": 3350 }, { "epoch": 87.84313725490196, "grad_norm": 0.45887792110443115, "learning_rate": 6.432748538011696e-06, "loss": 0.0207, "step": 3360 }, { "epoch": 88.0, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.332051545381546, "eval_runtime": 17.8575, "eval_samples_per_second": 7.616, "eval_steps_per_second": 0.952, "step": 3366 }, { "epoch": 88.10457516339869, "grad_norm": 0.007120809052139521, "learning_rate": 6.286549707602339e-06, "loss": 0.0047, "step": 3370 }, { "epoch": 88.36601307189542, "grad_norm": 0.051657985895872116, "learning_rate": 6.140350877192982e-06, "loss": 0.0224, "step": 3380 }, { "epoch": 88.62745098039215, "grad_norm": 0.6093434691429138, "learning_rate": 5.994152046783626e-06, "loss": 0.0052, "step": 3390 }, { "epoch": 88.88888888888889, "grad_norm": 25.99680519104004, "learning_rate": 5.8479532163742686e-06, "loss": 0.003, "step": 3400 }, { "epoch": 88.99346405228758, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.30860844254493713, "eval_runtime": 18.245, "eval_samples_per_second": 7.454, "eval_steps_per_second": 0.932, "step": 3404 }, { "epoch": 89.15032679738562, "grad_norm": 31.555145263671875, "learning_rate": 5.701754385964912e-06, "loss": 0.0329, "step": 3410 }, { "epoch": 89.41176470588235, "grad_norm": 18.486536026000977, "learning_rate": 5.555555555555556e-06, "loss": 0.029, "step": 3420 }, { "epoch": 89.67320261437908, "grad_norm": 0.33306655287742615, "learning_rate": 5.409356725146199e-06, "loss": 0.0098, "step": 3430 }, { "epoch": 89.93464052287581, "grad_norm": 2.643474578857422, "learning_rate": 5.263157894736842e-06, "loss": 0.0157, "step": 3440 }, { "epoch": 89.98692810457516, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.2947893440723419, "eval_runtime": 18.1316, "eval_samples_per_second": 7.501, "eval_steps_per_second": 0.938, "step": 3442 }, { "epoch": 90.19607843137256, "grad_norm": 6.317154407501221, "learning_rate": 5.116959064327485e-06, "loss": 0.008, "step": 3450 }, { "epoch": 90.45751633986929, "grad_norm": 1.63987398147583, "learning_rate": 4.970760233918129e-06, "loss": 0.0219, "step": 3460 }, { "epoch": 90.71895424836602, "grad_norm": 8.074739456176758, "learning_rate": 4.824561403508772e-06, "loss": 0.0188, "step": 3470 }, { "epoch": 90.98039215686275, "grad_norm": 0.2915269136428833, "learning_rate": 4.678362573099415e-06, "loss": 0.0428, "step": 3480 }, { "epoch": 90.98039215686275, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.3174949586391449, "eval_runtime": 17.8483, "eval_samples_per_second": 7.62, "eval_steps_per_second": 0.952, "step": 3480 }, { "epoch": 91.24183006535948, "grad_norm": 0.3356679677963257, "learning_rate": 4.532163742690059e-06, "loss": 0.0161, "step": 3490 }, { "epoch": 91.50326797385621, "grad_norm": 1.1951477527618408, "learning_rate": 4.3859649122807014e-06, "loss": 0.0205, "step": 3500 }, { "epoch": 91.76470588235294, "grad_norm": 0.05076509341597557, "learning_rate": 4.239766081871345e-06, "loss": 0.0189, "step": 3510 }, { "epoch": 92.0, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.3239772915840149, "eval_runtime": 17.301, "eval_samples_per_second": 7.861, "eval_steps_per_second": 0.983, "step": 3519 }, { "epoch": 92.02614379084967, "grad_norm": 1.3812580108642578, "learning_rate": 4.093567251461989e-06, "loss": 0.0212, "step": 3520 }, { "epoch": 92.2875816993464, "grad_norm": 0.3320296108722687, "learning_rate": 3.9473684210526315e-06, "loss": 0.0073, "step": 3530 }, { "epoch": 92.54901960784314, "grad_norm": 0.009532331489026546, "learning_rate": 3.8011695906432747e-06, "loss": 0.0053, "step": 3540 }, { "epoch": 92.81045751633987, "grad_norm": 0.5157586932182312, "learning_rate": 3.6549707602339183e-06, "loss": 0.0046, "step": 3550 }, { "epoch": 92.99346405228758, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.341442346572876, "eval_runtime": 18.8672, "eval_samples_per_second": 7.208, "eval_steps_per_second": 0.901, "step": 3557 }, { "epoch": 93.0718954248366, "grad_norm": 61.38653564453125, "learning_rate": 3.5087719298245615e-06, "loss": 0.0246, "step": 3560 }, { "epoch": 93.33333333333333, "grad_norm": 0.477070152759552, "learning_rate": 3.3625730994152047e-06, "loss": 0.0639, "step": 3570 }, { "epoch": 93.59477124183006, "grad_norm": 68.3900375366211, "learning_rate": 3.216374269005848e-06, "loss": 0.0255, "step": 3580 }, { "epoch": 93.85620915032679, "grad_norm": 0.3444403111934662, "learning_rate": 3.070175438596491e-06, "loss": 0.0057, "step": 3590 }, { "epoch": 93.98692810457516, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.33292174339294434, "eval_runtime": 17.7377, "eval_samples_per_second": 7.667, "eval_steps_per_second": 0.958, "step": 3595 }, { "epoch": 94.11764705882354, "grad_norm": 0.04389649257063866, "learning_rate": 2.9239766081871343e-06, "loss": 0.0058, "step": 3600 }, { "epoch": 94.37908496732027, "grad_norm": 0.5849317908287048, "learning_rate": 2.777777777777778e-06, "loss": 0.0586, "step": 3610 }, { "epoch": 94.640522875817, "grad_norm": 0.019542796537280083, "learning_rate": 2.631578947368421e-06, "loss": 0.001, "step": 3620 }, { "epoch": 94.90196078431373, "grad_norm": 0.002426290884613991, "learning_rate": 2.4853801169590643e-06, "loss": 0.0165, "step": 3630 }, { "epoch": 94.98039215686275, "eval_accuracy": 0.9632352941176471, "eval_loss": 0.32402223348617554, "eval_runtime": 17.5747, "eval_samples_per_second": 7.738, "eval_steps_per_second": 0.967, "step": 3633 }, { "epoch": 95.16339869281046, "grad_norm": 2.353595495223999, "learning_rate": 2.3391812865497075e-06, "loss": 0.0009, "step": 3640 }, { "epoch": 95.42483660130719, "grad_norm": 0.7732095718383789, "learning_rate": 2.1929824561403507e-06, "loss": 0.0273, "step": 3650 }, { "epoch": 95.68627450980392, "grad_norm": 0.006318532861769199, "learning_rate": 2.0467836257309943e-06, "loss": 0.0219, "step": 3660 }, { "epoch": 95.94771241830065, "grad_norm": 0.12237526476383209, "learning_rate": 1.9005847953216373e-06, "loss": 0.006, "step": 3670 }, { "epoch": 96.0, "eval_accuracy": 0.9705882352941176, "eval_loss": 0.3180083632469177, "eval_runtime": 18.1825, "eval_samples_per_second": 7.48, "eval_steps_per_second": 0.935, "step": 3672 }, { "epoch": 96.20915032679738, "grad_norm": 4.133842468261719, "learning_rate": 1.7543859649122807e-06, "loss": 0.0876, "step": 3680 }, { "epoch": 96.47058823529412, "grad_norm": 14.3917236328125, "learning_rate": 1.608187134502924e-06, "loss": 0.0033, "step": 3690 }, { "epoch": 96.73202614379085, "grad_norm": 0.6327334642410278, "learning_rate": 1.4619883040935671e-06, "loss": 0.0045, "step": 3700 }, { "epoch": 96.99346405228758, "grad_norm": 0.47620221972465515, "learning_rate": 1.3157894736842106e-06, "loss": 0.0172, "step": 3710 }, { "epoch": 96.99346405228758, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.3103199303150177, "eval_runtime": 17.4264, "eval_samples_per_second": 7.804, "eval_steps_per_second": 0.976, "step": 3710 }, { "epoch": 97.25490196078431, "grad_norm": 43.838233947753906, "learning_rate": 1.1695906432748538e-06, "loss": 0.0047, "step": 3720 }, { "epoch": 97.51633986928104, "grad_norm": 0.001560373231768608, "learning_rate": 1.0233918128654972e-06, "loss": 0.0032, "step": 3730 }, { "epoch": 97.77777777777777, "grad_norm": 0.00045679722097702324, "learning_rate": 8.771929824561404e-07, "loss": 0.0109, "step": 3740 }, { "epoch": 97.98692810457516, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.3034810721874237, "eval_runtime": 18.06, "eval_samples_per_second": 7.53, "eval_steps_per_second": 0.941, "step": 3748 }, { "epoch": 98.03921568627452, "grad_norm": 0.0029410182032734156, "learning_rate": 7.309941520467836e-07, "loss": 0.0093, "step": 3750 }, { "epoch": 98.30065359477125, "grad_norm": 0.060371335595846176, "learning_rate": 5.847953216374269e-07, "loss": 0.0147, "step": 3760 }, { "epoch": 98.56209150326798, "grad_norm": 0.0018022909061983228, "learning_rate": 4.385964912280702e-07, "loss": 0.0325, "step": 3770 }, { "epoch": 98.82352941176471, "grad_norm": 0.866423487663269, "learning_rate": 2.9239766081871344e-07, "loss": 0.0172, "step": 3780 }, { "epoch": 98.98039215686275, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.3034467101097107, "eval_runtime": 20.5056, "eval_samples_per_second": 6.632, "eval_steps_per_second": 0.829, "step": 3786 }, { "epoch": 99.08496732026144, "grad_norm": 0.015289215371012688, "learning_rate": 1.4619883040935672e-07, "loss": 0.0003, "step": 3790 }, { "epoch": 99.34640522875817, "grad_norm": 0.3536844849586487, "learning_rate": 0.0, "loss": 0.0219, "step": 3800 }, { "epoch": 99.34640522875817, "eval_accuracy": 0.9779411764705882, "eval_loss": 0.3036399185657501, "eval_runtime": 18.1299, "eval_samples_per_second": 7.501, "eval_steps_per_second": 0.938, "step": 3800 }, { "epoch": 99.34640522875817, "step": 3800, "total_flos": 3.0228260830838784e+18, "train_loss": 0.1524556069365874, "train_runtime": 23400.6351, "train_samples_per_second": 5.231, "train_steps_per_second": 0.162 } ], "logging_steps": 10, "max_steps": 3800, "num_input_tokens_seen": 0, "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.0228260830838784e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }