{
"best_metric": 0.9779411764705882,
"best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-papsmear\\checkpoint-2448",
"epoch": 99.34640522875817,
"eval_steps": 500,
"global_step": 3800,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.26143790849673204,
"grad_norm": 19.404264450073242,
"learning_rate": 1.3157894736842106e-06,
"loss": 1.8243,
"step": 10
},
{
"epoch": 0.5228758169934641,
"grad_norm": 9.874568939208984,
"learning_rate": 2.631578947368421e-06,
"loss": 1.7542,
"step": 20
},
{
"epoch": 0.7843137254901961,
"grad_norm": 13.61699390411377,
"learning_rate": 3.9473684210526315e-06,
"loss": 1.7081,
"step": 30
},
{
"epoch": 0.9934640522875817,
"eval_accuracy": 0.2867647058823529,
"eval_loss": 1.6642274856567383,
"eval_runtime": 19.1091,
"eval_samples_per_second": 7.117,
"eval_steps_per_second": 0.89,
"step": 38
},
{
"epoch": 1.0457516339869282,
"grad_norm": 17.95810317993164,
"learning_rate": 5.263157894736842e-06,
"loss": 1.6316,
"step": 40
},
{
"epoch": 1.3071895424836601,
"grad_norm": 11.760519027709961,
"learning_rate": 6.578947368421053e-06,
"loss": 1.6191,
"step": 50
},
{
"epoch": 1.5686274509803921,
"grad_norm": 12.139671325683594,
"learning_rate": 7.894736842105263e-06,
"loss": 1.514,
"step": 60
},
{
"epoch": 1.8300653594771243,
"grad_norm": 11.897443771362305,
"learning_rate": 9.210526315789474e-06,
"loss": 1.4025,
"step": 70
},
{
"epoch": 1.9869281045751634,
"eval_accuracy": 0.4632352941176471,
"eval_loss": 1.3760590553283691,
"eval_runtime": 16.8545,
"eval_samples_per_second": 8.069,
"eval_steps_per_second": 1.009,
"step": 76
},
{
"epoch": 2.0915032679738563,
"grad_norm": 14.211647987365723,
"learning_rate": 1.0526315789473684e-05,
"loss": 1.341,
"step": 80
},
{
"epoch": 2.3529411764705883,
"grad_norm": 21.328588485717773,
"learning_rate": 1.1842105263157895e-05,
"loss": 1.2617,
"step": 90
},
{
"epoch": 2.6143790849673203,
"grad_norm": 24.131996154785156,
"learning_rate": 1.3157894736842106e-05,
"loss": 1.1608,
"step": 100
},
{
"epoch": 2.8758169934640523,
"grad_norm": 23.461227416992188,
"learning_rate": 1.4473684210526317e-05,
"loss": 1.0918,
"step": 110
},
{
"epoch": 2.980392156862745,
"eval_accuracy": 0.5514705882352942,
"eval_loss": 1.0276451110839844,
"eval_runtime": 17.5433,
"eval_samples_per_second": 7.752,
"eval_steps_per_second": 0.969,
"step": 114
},
{
"epoch": 3.1372549019607843,
"grad_norm": 44.0300407409668,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.9044,
"step": 120
},
{
"epoch": 3.3986928104575163,
"grad_norm": 23.61319923400879,
"learning_rate": 1.7105263157894737e-05,
"loss": 0.9409,
"step": 130
},
{
"epoch": 3.6601307189542482,
"grad_norm": 27.572128295898438,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.9152,
"step": 140
},
{
"epoch": 3.9215686274509802,
"grad_norm": 20.785051345825195,
"learning_rate": 1.9736842105263158e-05,
"loss": 0.8051,
"step": 150
},
{
"epoch": 4.0,
"eval_accuracy": 0.6691176470588235,
"eval_loss": 0.7678546905517578,
"eval_runtime": 17.2269,
"eval_samples_per_second": 7.895,
"eval_steps_per_second": 0.987,
"step": 153
},
{
"epoch": 4.183006535947713,
"grad_norm": 32.00216293334961,
"learning_rate": 2.105263157894737e-05,
"loss": 0.7821,
"step": 160
},
{
"epoch": 4.444444444444445,
"grad_norm": 23.564285278320312,
"learning_rate": 2.236842105263158e-05,
"loss": 0.8036,
"step": 170
},
{
"epoch": 4.705882352941177,
"grad_norm": 21.403562545776367,
"learning_rate": 2.368421052631579e-05,
"loss": 0.7355,
"step": 180
},
{
"epoch": 4.967320261437909,
"grad_norm": 31.243640899658203,
"learning_rate": 2.5e-05,
"loss": 0.635,
"step": 190
},
{
"epoch": 4.993464052287582,
"eval_accuracy": 0.7867647058823529,
"eval_loss": 0.5927847623825073,
"eval_runtime": 17.4003,
"eval_samples_per_second": 7.816,
"eval_steps_per_second": 0.977,
"step": 191
},
{
"epoch": 5.228758169934641,
"grad_norm": 23.90205192565918,
"learning_rate": 2.6315789473684212e-05,
"loss": 0.6363,
"step": 200
},
{
"epoch": 5.490196078431373,
"grad_norm": 23.38309669494629,
"learning_rate": 2.7631578947368426e-05,
"loss": 0.6285,
"step": 210
},
{
"epoch": 5.751633986928105,
"grad_norm": 41.387149810791016,
"learning_rate": 2.8947368421052634e-05,
"loss": 0.6051,
"step": 220
},
{
"epoch": 5.9869281045751634,
"eval_accuracy": 0.75,
"eval_loss": 0.695731520652771,
"eval_runtime": 17.5363,
"eval_samples_per_second": 7.755,
"eval_steps_per_second": 0.969,
"step": 229
},
{
"epoch": 6.0130718954248366,
"grad_norm": 33.84821319580078,
"learning_rate": 3.0263157894736844e-05,
"loss": 0.6503,
"step": 230
},
{
"epoch": 6.2745098039215685,
"grad_norm": 18.2890682220459,
"learning_rate": 3.157894736842105e-05,
"loss": 0.4905,
"step": 240
},
{
"epoch": 6.5359477124183005,
"grad_norm": 25.626060485839844,
"learning_rate": 3.289473684210527e-05,
"loss": 0.5262,
"step": 250
},
{
"epoch": 6.7973856209150325,
"grad_norm": 28.431270599365234,
"learning_rate": 3.421052631578947e-05,
"loss": 0.5539,
"step": 260
},
{
"epoch": 6.980392156862745,
"eval_accuracy": 0.7941176470588235,
"eval_loss": 0.5016477108001709,
"eval_runtime": 17.3512,
"eval_samples_per_second": 7.838,
"eval_steps_per_second": 0.98,
"step": 267
},
{
"epoch": 7.0588235294117645,
"grad_norm": 21.074764251708984,
"learning_rate": 3.5526315789473684e-05,
"loss": 0.4807,
"step": 270
},
{
"epoch": 7.3202614379084965,
"grad_norm": 21.632251739501953,
"learning_rate": 3.6842105263157895e-05,
"loss": 0.4704,
"step": 280
},
{
"epoch": 7.5816993464052285,
"grad_norm": 41.86575698852539,
"learning_rate": 3.815789473684211e-05,
"loss": 0.5141,
"step": 290
},
{
"epoch": 7.8431372549019605,
"grad_norm": 20.23293685913086,
"learning_rate": 3.9473684210526316e-05,
"loss": 0.4683,
"step": 300
},
{
"epoch": 8.0,
"eval_accuracy": 0.8235294117647058,
"eval_loss": 0.4732811748981476,
"eval_runtime": 17.0473,
"eval_samples_per_second": 7.978,
"eval_steps_per_second": 0.997,
"step": 306
},
{
"epoch": 8.104575163398692,
"grad_norm": 67.42210388183594,
"learning_rate": 4.078947368421053e-05,
"loss": 0.451,
"step": 310
},
{
"epoch": 8.366013071895425,
"grad_norm": 22.807098388671875,
"learning_rate": 4.210526315789474e-05,
"loss": 0.4019,
"step": 320
},
{
"epoch": 8.627450980392156,
"grad_norm": 31.961091995239258,
"learning_rate": 4.342105263157895e-05,
"loss": 0.4663,
"step": 330
},
{
"epoch": 8.88888888888889,
"grad_norm": 26.965513229370117,
"learning_rate": 4.473684210526316e-05,
"loss": 0.4153,
"step": 340
},
{
"epoch": 8.993464052287582,
"eval_accuracy": 0.8529411764705882,
"eval_loss": 0.4834950268268585,
"eval_runtime": 16.944,
"eval_samples_per_second": 8.026,
"eval_steps_per_second": 1.003,
"step": 344
},
{
"epoch": 9.15032679738562,
"grad_norm": 21.733226776123047,
"learning_rate": 4.605263157894737e-05,
"loss": 0.473,
"step": 350
},
{
"epoch": 9.411764705882353,
"grad_norm": 17.1552734375,
"learning_rate": 4.736842105263158e-05,
"loss": 0.3912,
"step": 360
},
{
"epoch": 9.673202614379084,
"grad_norm": 39.66945266723633,
"learning_rate": 4.868421052631579e-05,
"loss": 0.465,
"step": 370
},
{
"epoch": 9.934640522875817,
"grad_norm": 24.060779571533203,
"learning_rate": 5e-05,
"loss": 0.3954,
"step": 380
},
{
"epoch": 9.986928104575163,
"eval_accuracy": 0.8308823529411765,
"eval_loss": 0.5431119203567505,
"eval_runtime": 16.9702,
"eval_samples_per_second": 8.014,
"eval_steps_per_second": 1.002,
"step": 382
},
{
"epoch": 10.196078431372548,
"grad_norm": 22.754186630249023,
"learning_rate": 4.985380116959065e-05,
"loss": 0.309,
"step": 390
},
{
"epoch": 10.457516339869281,
"grad_norm": 25.09243392944336,
"learning_rate": 4.970760233918128e-05,
"loss": 0.2985,
"step": 400
},
{
"epoch": 10.718954248366012,
"grad_norm": 32.95780563354492,
"learning_rate": 4.956140350877193e-05,
"loss": 0.3551,
"step": 410
},
{
"epoch": 10.980392156862745,
"grad_norm": 24.594146728515625,
"learning_rate": 4.941520467836258e-05,
"loss": 0.3524,
"step": 420
},
{
"epoch": 10.980392156862745,
"eval_accuracy": 0.8235294117647058,
"eval_loss": 0.4060741364955902,
"eval_runtime": 16.9787,
"eval_samples_per_second": 8.01,
"eval_steps_per_second": 1.001,
"step": 420
},
{
"epoch": 11.241830065359476,
"grad_norm": 34.58118438720703,
"learning_rate": 4.926900584795322e-05,
"loss": 0.3015,
"step": 430
},
{
"epoch": 11.50326797385621,
"grad_norm": 17.467493057250977,
"learning_rate": 4.912280701754386e-05,
"loss": 0.332,
"step": 440
},
{
"epoch": 11.764705882352942,
"grad_norm": 11.450825691223145,
"learning_rate": 4.8976608187134504e-05,
"loss": 0.3546,
"step": 450
},
{
"epoch": 12.0,
"eval_accuracy": 0.8382352941176471,
"eval_loss": 0.4924784302711487,
"eval_runtime": 17.0509,
"eval_samples_per_second": 7.976,
"eval_steps_per_second": 0.997,
"step": 459
},
{
"epoch": 12.026143790849673,
"grad_norm": 22.95159912109375,
"learning_rate": 4.883040935672515e-05,
"loss": 0.3362,
"step": 460
},
{
"epoch": 12.287581699346406,
"grad_norm": 15.78369140625,
"learning_rate": 4.868421052631579e-05,
"loss": 0.2589,
"step": 470
},
{
"epoch": 12.549019607843137,
"grad_norm": 18.571977615356445,
"learning_rate": 4.853801169590643e-05,
"loss": 0.2588,
"step": 480
},
{
"epoch": 12.81045751633987,
"grad_norm": 10.237850189208984,
"learning_rate": 4.839181286549708e-05,
"loss": 0.2922,
"step": 490
},
{
"epoch": 12.993464052287582,
"eval_accuracy": 0.875,
"eval_loss": 0.36371880769729614,
"eval_runtime": 16.7827,
"eval_samples_per_second": 8.104,
"eval_steps_per_second": 1.013,
"step": 497
},
{
"epoch": 13.071895424836601,
"grad_norm": 14.183631896972656,
"learning_rate": 4.824561403508772e-05,
"loss": 0.2683,
"step": 500
},
{
"epoch": 13.333333333333334,
"grad_norm": 15.362314224243164,
"learning_rate": 4.8099415204678366e-05,
"loss": 0.2178,
"step": 510
},
{
"epoch": 13.594771241830065,
"grad_norm": 31.49340057373047,
"learning_rate": 4.7953216374269006e-05,
"loss": 0.2095,
"step": 520
},
{
"epoch": 13.856209150326798,
"grad_norm": 39.85598373413086,
"learning_rate": 4.780701754385965e-05,
"loss": 0.2342,
"step": 530
},
{
"epoch": 13.986928104575163,
"eval_accuracy": 0.8970588235294118,
"eval_loss": 0.32859814167022705,
"eval_runtime": 16.8467,
"eval_samples_per_second": 8.073,
"eval_steps_per_second": 1.009,
"step": 535
},
{
"epoch": 14.117647058823529,
"grad_norm": 22.395517349243164,
"learning_rate": 4.7660818713450294e-05,
"loss": 0.2927,
"step": 540
},
{
"epoch": 14.379084967320262,
"grad_norm": 15.716471672058105,
"learning_rate": 4.751461988304094e-05,
"loss": 0.2419,
"step": 550
},
{
"epoch": 14.640522875816993,
"grad_norm": 13.827138900756836,
"learning_rate": 4.736842105263158e-05,
"loss": 0.2215,
"step": 560
},
{
"epoch": 14.901960784313726,
"grad_norm": 8.343385696411133,
"learning_rate": 4.722222222222222e-05,
"loss": 0.2083,
"step": 570
},
{
"epoch": 14.980392156862745,
"eval_accuracy": 0.8823529411764706,
"eval_loss": 0.327125608921051,
"eval_runtime": 17.1905,
"eval_samples_per_second": 7.911,
"eval_steps_per_second": 0.989,
"step": 573
},
{
"epoch": 15.163398692810457,
"grad_norm": 27.369592666625977,
"learning_rate": 4.707602339181287e-05,
"loss": 0.1837,
"step": 580
},
{
"epoch": 15.42483660130719,
"grad_norm": 4.707042217254639,
"learning_rate": 4.6929824561403515e-05,
"loss": 0.1872,
"step": 590
},
{
"epoch": 15.686274509803921,
"grad_norm": 19.026412963867188,
"learning_rate": 4.678362573099415e-05,
"loss": 0.2063,
"step": 600
},
{
"epoch": 15.947712418300654,
"grad_norm": 39.22539138793945,
"learning_rate": 4.6637426900584796e-05,
"loss": 0.2704,
"step": 610
},
{
"epoch": 16.0,
"eval_accuracy": 0.8823529411764706,
"eval_loss": 0.3700261414051056,
"eval_runtime": 17.2498,
"eval_samples_per_second": 7.884,
"eval_steps_per_second": 0.986,
"step": 612
},
{
"epoch": 16.209150326797385,
"grad_norm": 4.610194683074951,
"learning_rate": 4.649122807017544e-05,
"loss": 0.1895,
"step": 620
},
{
"epoch": 16.470588235294116,
"grad_norm": 27.570838928222656,
"learning_rate": 4.634502923976608e-05,
"loss": 0.1492,
"step": 630
},
{
"epoch": 16.73202614379085,
"grad_norm": 13.742429733276367,
"learning_rate": 4.619883040935672e-05,
"loss": 0.1698,
"step": 640
},
{
"epoch": 16.99346405228758,
"grad_norm": 16.786169052124023,
"learning_rate": 4.605263157894737e-05,
"loss": 0.1871,
"step": 650
},
{
"epoch": 16.99346405228758,
"eval_accuracy": 0.8970588235294118,
"eval_loss": 0.34471678733825684,
"eval_runtime": 16.7473,
"eval_samples_per_second": 8.121,
"eval_steps_per_second": 1.015,
"step": 650
},
{
"epoch": 17.254901960784313,
"grad_norm": 15.884855270385742,
"learning_rate": 4.590643274853802e-05,
"loss": 0.1335,
"step": 660
},
{
"epoch": 17.516339869281047,
"grad_norm": 17.3248348236084,
"learning_rate": 4.576023391812866e-05,
"loss": 0.1399,
"step": 670
},
{
"epoch": 17.77777777777778,
"grad_norm": 16.090543746948242,
"learning_rate": 4.56140350877193e-05,
"loss": 0.226,
"step": 680
},
{
"epoch": 17.986928104575163,
"eval_accuracy": 0.8602941176470589,
"eval_loss": 0.4279506206512451,
"eval_runtime": 16.8179,
"eval_samples_per_second": 8.087,
"eval_steps_per_second": 1.011,
"step": 688
},
{
"epoch": 18.03921568627451,
"grad_norm": 17.314950942993164,
"learning_rate": 4.5467836257309945e-05,
"loss": 0.2657,
"step": 690
},
{
"epoch": 18.30065359477124,
"grad_norm": 26.111413955688477,
"learning_rate": 4.5321637426900585e-05,
"loss": 0.1238,
"step": 700
},
{
"epoch": 18.562091503267975,
"grad_norm": 34.5568962097168,
"learning_rate": 4.517543859649123e-05,
"loss": 0.3426,
"step": 710
},
{
"epoch": 18.823529411764707,
"grad_norm": 27.506118774414062,
"learning_rate": 4.502923976608187e-05,
"loss": 0.245,
"step": 720
},
{
"epoch": 18.980392156862745,
"eval_accuracy": 0.8088235294117647,
"eval_loss": 0.6445416212081909,
"eval_runtime": 16.6042,
"eval_samples_per_second": 8.191,
"eval_steps_per_second": 1.024,
"step": 726
},
{
"epoch": 19.084967320261438,
"grad_norm": 8.742308616638184,
"learning_rate": 4.488304093567251e-05,
"loss": 0.1876,
"step": 730
},
{
"epoch": 19.34640522875817,
"grad_norm": 37.74170684814453,
"learning_rate": 4.473684210526316e-05,
"loss": 0.1044,
"step": 740
},
{
"epoch": 19.607843137254903,
"grad_norm": 17.85502815246582,
"learning_rate": 4.4590643274853806e-05,
"loss": 0.1637,
"step": 750
},
{
"epoch": 19.869281045751634,
"grad_norm": 13.413275718688965,
"learning_rate": 4.4444444444444447e-05,
"loss": 0.1545,
"step": 760
},
{
"epoch": 20.0,
"eval_accuracy": 0.8602941176470589,
"eval_loss": 0.41802164912223816,
"eval_runtime": 16.9375,
"eval_samples_per_second": 8.03,
"eval_steps_per_second": 1.004,
"step": 765
},
{
"epoch": 20.130718954248366,
"grad_norm": 24.223968505859375,
"learning_rate": 4.429824561403509e-05,
"loss": 0.1333,
"step": 770
},
{
"epoch": 20.392156862745097,
"grad_norm": 22.863794326782227,
"learning_rate": 4.4152046783625734e-05,
"loss": 0.1223,
"step": 780
},
{
"epoch": 20.65359477124183,
"grad_norm": 20.22460174560547,
"learning_rate": 4.400584795321638e-05,
"loss": 0.1906,
"step": 790
},
{
"epoch": 20.915032679738562,
"grad_norm": 6.557627201080322,
"learning_rate": 4.3859649122807014e-05,
"loss": 0.0981,
"step": 800
},
{
"epoch": 20.99346405228758,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.32080766558647156,
"eval_runtime": 17.4044,
"eval_samples_per_second": 7.814,
"eval_steps_per_second": 0.977,
"step": 803
},
{
"epoch": 21.176470588235293,
"grad_norm": 11.885444641113281,
"learning_rate": 4.371345029239766e-05,
"loss": 0.1654,
"step": 810
},
{
"epoch": 21.437908496732025,
"grad_norm": 16.748071670532227,
"learning_rate": 4.356725146198831e-05,
"loss": 0.1706,
"step": 820
},
{
"epoch": 21.69934640522876,
"grad_norm": 25.410442352294922,
"learning_rate": 4.342105263157895e-05,
"loss": 0.1121,
"step": 830
},
{
"epoch": 21.96078431372549,
"grad_norm": 24.631742477416992,
"learning_rate": 4.327485380116959e-05,
"loss": 0.1455,
"step": 840
},
{
"epoch": 21.986928104575163,
"eval_accuracy": 0.8602941176470589,
"eval_loss": 0.425643652677536,
"eval_runtime": 20.0595,
"eval_samples_per_second": 6.78,
"eval_steps_per_second": 0.847,
"step": 841
},
{
"epoch": 22.22222222222222,
"grad_norm": 9.926827430725098,
"learning_rate": 4.3128654970760236e-05,
"loss": 0.144,
"step": 850
},
{
"epoch": 22.483660130718953,
"grad_norm": 32.22057342529297,
"learning_rate": 4.298245614035088e-05,
"loss": 0.1328,
"step": 860
},
{
"epoch": 22.745098039215687,
"grad_norm": 6.770218849182129,
"learning_rate": 4.283625730994152e-05,
"loss": 0.2405,
"step": 870
},
{
"epoch": 22.980392156862745,
"eval_accuracy": 0.8970588235294118,
"eval_loss": 0.34735360741615295,
"eval_runtime": 36.4621,
"eval_samples_per_second": 3.73,
"eval_steps_per_second": 0.466,
"step": 879
},
{
"epoch": 23.00653594771242,
"grad_norm": 18.301342010498047,
"learning_rate": 4.269005847953216e-05,
"loss": 0.1407,
"step": 880
},
{
"epoch": 23.26797385620915,
"grad_norm": 25.70302963256836,
"learning_rate": 4.254385964912281e-05,
"loss": 0.1403,
"step": 890
},
{
"epoch": 23.529411764705884,
"grad_norm": 6.829775333404541,
"learning_rate": 4.239766081871345e-05,
"loss": 0.1278,
"step": 900
},
{
"epoch": 23.790849673202615,
"grad_norm": 15.183685302734375,
"learning_rate": 4.22514619883041e-05,
"loss": 0.1549,
"step": 910
},
{
"epoch": 24.0,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.39403286576271057,
"eval_runtime": 30.2513,
"eval_samples_per_second": 4.496,
"eval_steps_per_second": 0.562,
"step": 918
},
{
"epoch": 24.052287581699346,
"grad_norm": 76.56197357177734,
"learning_rate": 4.210526315789474e-05,
"loss": 0.2019,
"step": 920
},
{
"epoch": 24.313725490196077,
"grad_norm": 10.338065147399902,
"learning_rate": 4.195906432748538e-05,
"loss": 0.1341,
"step": 930
},
{
"epoch": 24.575163398692812,
"grad_norm": 10.710972785949707,
"learning_rate": 4.1812865497076025e-05,
"loss": 0.1207,
"step": 940
},
{
"epoch": 24.836601307189543,
"grad_norm": 19.086135864257812,
"learning_rate": 4.166666666666667e-05,
"loss": 0.1721,
"step": 950
},
{
"epoch": 24.99346405228758,
"eval_accuracy": 0.8823529411764706,
"eval_loss": 0.4279385805130005,
"eval_runtime": 29.9969,
"eval_samples_per_second": 4.534,
"eval_steps_per_second": 0.567,
"step": 956
},
{
"epoch": 25.098039215686274,
"grad_norm": 6.991425514221191,
"learning_rate": 4.152046783625731e-05,
"loss": 0.0729,
"step": 960
},
{
"epoch": 25.359477124183005,
"grad_norm": 8.979483604431152,
"learning_rate": 4.137426900584795e-05,
"loss": 0.1826,
"step": 970
},
{
"epoch": 25.62091503267974,
"grad_norm": 11.570904731750488,
"learning_rate": 4.12280701754386e-05,
"loss": 0.1492,
"step": 980
},
{
"epoch": 25.88235294117647,
"grad_norm": 14.8778076171875,
"learning_rate": 4.1081871345029247e-05,
"loss": 0.1378,
"step": 990
},
{
"epoch": 25.986928104575163,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.387086421251297,
"eval_runtime": 29.0075,
"eval_samples_per_second": 4.688,
"eval_steps_per_second": 0.586,
"step": 994
},
{
"epoch": 26.143790849673202,
"grad_norm": 11.985469818115234,
"learning_rate": 4.093567251461988e-05,
"loss": 0.1122,
"step": 1000
},
{
"epoch": 26.405228758169933,
"grad_norm": 22.02225685119629,
"learning_rate": 4.078947368421053e-05,
"loss": 0.1172,
"step": 1010
},
{
"epoch": 26.666666666666668,
"grad_norm": 1.2671743631362915,
"learning_rate": 4.0643274853801174e-05,
"loss": 0.0891,
"step": 1020
},
{
"epoch": 26.9281045751634,
"grad_norm": 10.896835327148438,
"learning_rate": 4.0497076023391814e-05,
"loss": 0.0924,
"step": 1030
},
{
"epoch": 26.980392156862745,
"eval_accuracy": 0.8455882352941176,
"eval_loss": 0.7301138639450073,
"eval_runtime": 28.9067,
"eval_samples_per_second": 4.705,
"eval_steps_per_second": 0.588,
"step": 1032
},
{
"epoch": 27.18954248366013,
"grad_norm": 7.8527960777282715,
"learning_rate": 4.0350877192982455e-05,
"loss": 0.1348,
"step": 1040
},
{
"epoch": 27.45098039215686,
"grad_norm": 2.1555140018463135,
"learning_rate": 4.02046783625731e-05,
"loss": 0.0675,
"step": 1050
},
{
"epoch": 27.712418300653596,
"grad_norm": 7.751283645629883,
"learning_rate": 4.005847953216375e-05,
"loss": 0.0916,
"step": 1060
},
{
"epoch": 27.973856209150327,
"grad_norm": 33.804786682128906,
"learning_rate": 3.991228070175439e-05,
"loss": 0.1325,
"step": 1070
},
{
"epoch": 28.0,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.3712061643600464,
"eval_runtime": 28.0451,
"eval_samples_per_second": 4.849,
"eval_steps_per_second": 0.606,
"step": 1071
},
{
"epoch": 28.235294117647058,
"grad_norm": 7.706085205078125,
"learning_rate": 3.976608187134503e-05,
"loss": 0.0879,
"step": 1080
},
{
"epoch": 28.49673202614379,
"grad_norm": 4.338534355163574,
"learning_rate": 3.9619883040935676e-05,
"loss": 0.1017,
"step": 1090
},
{
"epoch": 28.758169934640524,
"grad_norm": 9.544697761535645,
"learning_rate": 3.9473684210526316e-05,
"loss": 0.1426,
"step": 1100
},
{
"epoch": 28.99346405228758,
"eval_accuracy": 0.8602941176470589,
"eval_loss": 0.440034419298172,
"eval_runtime": 30.1321,
"eval_samples_per_second": 4.513,
"eval_steps_per_second": 0.564,
"step": 1109
},
{
"epoch": 29.019607843137255,
"grad_norm": 0.3841346502304077,
"learning_rate": 3.932748538011696e-05,
"loss": 0.0981,
"step": 1110
},
{
"epoch": 29.281045751633986,
"grad_norm": 9.533553123474121,
"learning_rate": 3.9181286549707604e-05,
"loss": 0.0926,
"step": 1120
},
{
"epoch": 29.54248366013072,
"grad_norm": 26.160850524902344,
"learning_rate": 3.9035087719298244e-05,
"loss": 0.083,
"step": 1130
},
{
"epoch": 29.80392156862745,
"grad_norm": 18.309621810913086,
"learning_rate": 3.888888888888889e-05,
"loss": 0.0866,
"step": 1140
},
{
"epoch": 29.986928104575163,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.27793076634407043,
"eval_runtime": 29.3246,
"eval_samples_per_second": 4.638,
"eval_steps_per_second": 0.58,
"step": 1147
},
{
"epoch": 30.065359477124183,
"grad_norm": 24.974849700927734,
"learning_rate": 3.874269005847954e-05,
"loss": 0.11,
"step": 1150
},
{
"epoch": 30.326797385620914,
"grad_norm": 3.7421281337738037,
"learning_rate": 3.859649122807018e-05,
"loss": 0.0712,
"step": 1160
},
{
"epoch": 30.58823529411765,
"grad_norm": 10.041555404663086,
"learning_rate": 3.845029239766082e-05,
"loss": 0.0702,
"step": 1170
},
{
"epoch": 30.84967320261438,
"grad_norm": 37.238948822021484,
"learning_rate": 3.8304093567251465e-05,
"loss": 0.0659,
"step": 1180
},
{
"epoch": 30.980392156862745,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.3207360804080963,
"eval_runtime": 34.3274,
"eval_samples_per_second": 3.962,
"eval_steps_per_second": 0.495,
"step": 1185
},
{
"epoch": 31.11111111111111,
"grad_norm": 13.073234558105469,
"learning_rate": 3.815789473684211e-05,
"loss": 0.0547,
"step": 1190
},
{
"epoch": 31.372549019607842,
"grad_norm": 3.1763381958007812,
"learning_rate": 3.8011695906432746e-05,
"loss": 0.0727,
"step": 1200
},
{
"epoch": 31.633986928104576,
"grad_norm": 1.5747133493423462,
"learning_rate": 3.786549707602339e-05,
"loss": 0.1023,
"step": 1210
},
{
"epoch": 31.895424836601308,
"grad_norm": 12.335155487060547,
"learning_rate": 3.771929824561404e-05,
"loss": 0.1175,
"step": 1220
},
{
"epoch": 32.0,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.43389689922332764,
"eval_runtime": 32.183,
"eval_samples_per_second": 4.226,
"eval_steps_per_second": 0.528,
"step": 1224
},
{
"epoch": 32.15686274509804,
"grad_norm": 2.676323413848877,
"learning_rate": 3.757309941520468e-05,
"loss": 0.129,
"step": 1230
},
{
"epoch": 32.41830065359477,
"grad_norm": 0.5916957259178162,
"learning_rate": 3.742690058479532e-05,
"loss": 0.0585,
"step": 1240
},
{
"epoch": 32.6797385620915,
"grad_norm": 11.02872085571289,
"learning_rate": 3.728070175438597e-05,
"loss": 0.045,
"step": 1250
},
{
"epoch": 32.94117647058823,
"grad_norm": 44.40802001953125,
"learning_rate": 3.713450292397661e-05,
"loss": 0.0455,
"step": 1260
},
{
"epoch": 32.99346405228758,
"eval_accuracy": 0.9264705882352942,
"eval_loss": 0.4536753296852112,
"eval_runtime": 32.0477,
"eval_samples_per_second": 4.244,
"eval_steps_per_second": 0.53,
"step": 1262
},
{
"epoch": 33.20261437908497,
"grad_norm": 0.4168817400932312,
"learning_rate": 3.6988304093567254e-05,
"loss": 0.0625,
"step": 1270
},
{
"epoch": 33.4640522875817,
"grad_norm": 7.689728260040283,
"learning_rate": 3.6842105263157895e-05,
"loss": 0.1613,
"step": 1280
},
{
"epoch": 33.72549019607843,
"grad_norm": 9.364749908447266,
"learning_rate": 3.669590643274854e-05,
"loss": 0.1001,
"step": 1290
},
{
"epoch": 33.98692810457516,
"grad_norm": 14.09304428100586,
"learning_rate": 3.654970760233918e-05,
"loss": 0.1006,
"step": 1300
},
{
"epoch": 33.98692810457516,
"eval_accuracy": 0.875,
"eval_loss": 0.6521199345588684,
"eval_runtime": 33.7228,
"eval_samples_per_second": 4.033,
"eval_steps_per_second": 0.504,
"step": 1300
},
{
"epoch": 34.248366013071895,
"grad_norm": 14.115684509277344,
"learning_rate": 3.640350877192983e-05,
"loss": 0.1592,
"step": 1310
},
{
"epoch": 34.509803921568626,
"grad_norm": 2.2361948490142822,
"learning_rate": 3.625730994152047e-05,
"loss": 0.0785,
"step": 1320
},
{
"epoch": 34.77124183006536,
"grad_norm": 15.101175308227539,
"learning_rate": 3.611111111111111e-05,
"loss": 0.033,
"step": 1330
},
{
"epoch": 34.98039215686274,
"eval_accuracy": 0.9044117647058824,
"eval_loss": 0.5615760087966919,
"eval_runtime": 20.5904,
"eval_samples_per_second": 6.605,
"eval_steps_per_second": 0.826,
"step": 1338
},
{
"epoch": 35.032679738562095,
"grad_norm": 74.07561492919922,
"learning_rate": 3.5964912280701756e-05,
"loss": 0.1336,
"step": 1340
},
{
"epoch": 35.294117647058826,
"grad_norm": 40.868961334228516,
"learning_rate": 3.5818713450292403e-05,
"loss": 0.1209,
"step": 1350
},
{
"epoch": 35.55555555555556,
"grad_norm": 11.251754760742188,
"learning_rate": 3.5672514619883044e-05,
"loss": 0.0658,
"step": 1360
},
{
"epoch": 35.81699346405229,
"grad_norm": 20.791095733642578,
"learning_rate": 3.5526315789473684e-05,
"loss": 0.0979,
"step": 1370
},
{
"epoch": 36.0,
"eval_accuracy": 0.9191176470588235,
"eval_loss": 0.3717995882034302,
"eval_runtime": 21.531,
"eval_samples_per_second": 6.316,
"eval_steps_per_second": 0.79,
"step": 1377
},
{
"epoch": 36.07843137254902,
"grad_norm": 13.336127281188965,
"learning_rate": 3.538011695906433e-05,
"loss": 0.0712,
"step": 1380
},
{
"epoch": 36.33986928104575,
"grad_norm": 7.379011154174805,
"learning_rate": 3.523391812865498e-05,
"loss": 0.0826,
"step": 1390
},
{
"epoch": 36.60130718954248,
"grad_norm": 1.9048967361450195,
"learning_rate": 3.508771929824561e-05,
"loss": 0.0791,
"step": 1400
},
{
"epoch": 36.86274509803921,
"grad_norm": 32.38518142700195,
"learning_rate": 3.494152046783626e-05,
"loss": 0.1045,
"step": 1410
},
{
"epoch": 36.99346405228758,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.25290319323539734,
"eval_runtime": 22.9294,
"eval_samples_per_second": 5.931,
"eval_steps_per_second": 0.741,
"step": 1415
},
{
"epoch": 37.12418300653595,
"grad_norm": 14.719789505004883,
"learning_rate": 3.4795321637426905e-05,
"loss": 0.0977,
"step": 1420
},
{
"epoch": 37.38562091503268,
"grad_norm": 21.388763427734375,
"learning_rate": 3.4649122807017546e-05,
"loss": 0.0374,
"step": 1430
},
{
"epoch": 37.64705882352941,
"grad_norm": 7.066629886627197,
"learning_rate": 3.4502923976608186e-05,
"loss": 0.0819,
"step": 1440
},
{
"epoch": 37.908496732026144,
"grad_norm": 4.583933353424072,
"learning_rate": 3.435672514619883e-05,
"loss": 0.0815,
"step": 1450
},
{
"epoch": 37.98692810457516,
"eval_accuracy": 0.9338235294117647,
"eval_loss": 0.3510648012161255,
"eval_runtime": 21.3875,
"eval_samples_per_second": 6.359,
"eval_steps_per_second": 0.795,
"step": 1453
},
{
"epoch": 38.169934640522875,
"grad_norm": 14.378546714782715,
"learning_rate": 3.421052631578947e-05,
"loss": 0.1109,
"step": 1460
},
{
"epoch": 38.431372549019606,
"grad_norm": 4.1210408210754395,
"learning_rate": 3.406432748538012e-05,
"loss": 0.052,
"step": 1470
},
{
"epoch": 38.69281045751634,
"grad_norm": 18.48431396484375,
"learning_rate": 3.391812865497076e-05,
"loss": 0.0932,
"step": 1480
},
{
"epoch": 38.95424836601307,
"grad_norm": 30.51089859008789,
"learning_rate": 3.377192982456141e-05,
"loss": 0.0761,
"step": 1490
},
{
"epoch": 38.98039215686274,
"eval_accuracy": 0.9338235294117647,
"eval_loss": 0.31144019961357117,
"eval_runtime": 32.6124,
"eval_samples_per_second": 4.17,
"eval_steps_per_second": 0.521,
"step": 1491
},
{
"epoch": 39.21568627450981,
"grad_norm": 29.487356185913086,
"learning_rate": 3.362573099415205e-05,
"loss": 0.0995,
"step": 1500
},
{
"epoch": 39.47712418300654,
"grad_norm": 4.752898216247559,
"learning_rate": 3.3479532163742695e-05,
"loss": 0.0986,
"step": 1510
},
{
"epoch": 39.73856209150327,
"grad_norm": 23.433902740478516,
"learning_rate": 3.3333333333333335e-05,
"loss": 0.0908,
"step": 1520
},
{
"epoch": 40.0,
"grad_norm": 8.154867172241211,
"learning_rate": 3.3187134502923975e-05,
"loss": 0.0747,
"step": 1530
},
{
"epoch": 40.0,
"eval_accuracy": 0.9338235294117647,
"eval_loss": 0.2836870849132538,
"eval_runtime": 33.717,
"eval_samples_per_second": 4.034,
"eval_steps_per_second": 0.504,
"step": 1530
},
{
"epoch": 40.26143790849673,
"grad_norm": 66.09915924072266,
"learning_rate": 3.304093567251462e-05,
"loss": 0.0746,
"step": 1540
},
{
"epoch": 40.52287581699346,
"grad_norm": 8.447415351867676,
"learning_rate": 3.289473684210527e-05,
"loss": 0.0809,
"step": 1550
},
{
"epoch": 40.78431372549019,
"grad_norm": 11.7717866897583,
"learning_rate": 3.274853801169591e-05,
"loss": 0.0545,
"step": 1560
},
{
"epoch": 40.99346405228758,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.42687493562698364,
"eval_runtime": 30.8285,
"eval_samples_per_second": 4.412,
"eval_steps_per_second": 0.551,
"step": 1568
},
{
"epoch": 41.04575163398693,
"grad_norm": 2.3586502075195312,
"learning_rate": 3.260233918128655e-05,
"loss": 0.058,
"step": 1570
},
{
"epoch": 41.30718954248366,
"grad_norm": 31.519433975219727,
"learning_rate": 3.24561403508772e-05,
"loss": 0.0838,
"step": 1580
},
{
"epoch": 41.568627450980394,
"grad_norm": 0.15550392866134644,
"learning_rate": 3.230994152046784e-05,
"loss": 0.0853,
"step": 1590
},
{
"epoch": 41.830065359477125,
"grad_norm": 6.823671340942383,
"learning_rate": 3.216374269005848e-05,
"loss": 0.0796,
"step": 1600
},
{
"epoch": 41.98692810457516,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.23307542502880096,
"eval_runtime": 33.1415,
"eval_samples_per_second": 4.104,
"eval_steps_per_second": 0.513,
"step": 1606
},
{
"epoch": 42.091503267973856,
"grad_norm": 11.52629566192627,
"learning_rate": 3.2017543859649124e-05,
"loss": 0.0903,
"step": 1610
},
{
"epoch": 42.35294117647059,
"grad_norm": 11.996484756469727,
"learning_rate": 3.187134502923977e-05,
"loss": 0.0595,
"step": 1620
},
{
"epoch": 42.61437908496732,
"grad_norm": 1.5475754737854004,
"learning_rate": 3.172514619883041e-05,
"loss": 0.0993,
"step": 1630
},
{
"epoch": 42.87581699346405,
"grad_norm": 18.27874755859375,
"learning_rate": 3.157894736842105e-05,
"loss": 0.055,
"step": 1640
},
{
"epoch": 42.98039215686274,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.28995171189308167,
"eval_runtime": 31.1656,
"eval_samples_per_second": 4.364,
"eval_steps_per_second": 0.545,
"step": 1644
},
{
"epoch": 43.13725490196079,
"grad_norm": 1.7079222202301025,
"learning_rate": 3.14327485380117e-05,
"loss": 0.0851,
"step": 1650
},
{
"epoch": 43.39869281045752,
"grad_norm": 0.0829237625002861,
"learning_rate": 3.128654970760234e-05,
"loss": 0.061,
"step": 1660
},
{
"epoch": 43.66013071895425,
"grad_norm": 2.6961874961853027,
"learning_rate": 3.1140350877192986e-05,
"loss": 0.0205,
"step": 1670
},
{
"epoch": 43.92156862745098,
"grad_norm": 3.1870129108428955,
"learning_rate": 3.0994152046783626e-05,
"loss": 0.0706,
"step": 1680
},
{
"epoch": 44.0,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.3367806077003479,
"eval_runtime": 25.249,
"eval_samples_per_second": 5.386,
"eval_steps_per_second": 0.673,
"step": 1683
},
{
"epoch": 44.18300653594771,
"grad_norm": 10.678839683532715,
"learning_rate": 3.084795321637427e-05,
"loss": 0.0555,
"step": 1690
},
{
"epoch": 44.44444444444444,
"grad_norm": 0.1511285901069641,
"learning_rate": 3.0701754385964913e-05,
"loss": 0.0463,
"step": 1700
},
{
"epoch": 44.705882352941174,
"grad_norm": 19.222854614257812,
"learning_rate": 3.055555555555556e-05,
"loss": 0.0783,
"step": 1710
},
{
"epoch": 44.967320261437905,
"grad_norm": 12.824193954467773,
"learning_rate": 3.0409356725146197e-05,
"loss": 0.0505,
"step": 1720
},
{
"epoch": 44.99346405228758,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.3779818117618561,
"eval_runtime": 19.0793,
"eval_samples_per_second": 7.128,
"eval_steps_per_second": 0.891,
"step": 1721
},
{
"epoch": 45.22875816993464,
"grad_norm": 18.495044708251953,
"learning_rate": 3.0263157894736844e-05,
"loss": 0.0679,
"step": 1730
},
{
"epoch": 45.490196078431374,
"grad_norm": 22.039566040039062,
"learning_rate": 3.0116959064327488e-05,
"loss": 0.0618,
"step": 1740
},
{
"epoch": 45.751633986928105,
"grad_norm": 0.6790270209312439,
"learning_rate": 2.997076023391813e-05,
"loss": 0.0698,
"step": 1750
},
{
"epoch": 45.98692810457516,
"eval_accuracy": 0.9191176470588235,
"eval_loss": 0.48222464323043823,
"eval_runtime": 33.9657,
"eval_samples_per_second": 4.004,
"eval_steps_per_second": 0.501,
"step": 1759
},
{
"epoch": 46.01307189542484,
"grad_norm": 48.15066909790039,
"learning_rate": 2.9824561403508772e-05,
"loss": 0.0745,
"step": 1760
},
{
"epoch": 46.27450980392157,
"grad_norm": 48.96921920776367,
"learning_rate": 2.9678362573099415e-05,
"loss": 0.11,
"step": 1770
},
{
"epoch": 46.5359477124183,
"grad_norm": 16.973966598510742,
"learning_rate": 2.9532163742690062e-05,
"loss": 0.0183,
"step": 1780
},
{
"epoch": 46.79738562091503,
"grad_norm": 11.563841819763184,
"learning_rate": 2.9385964912280706e-05,
"loss": 0.0275,
"step": 1790
},
{
"epoch": 46.98039215686274,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.34339553117752075,
"eval_runtime": 33.4784,
"eval_samples_per_second": 4.062,
"eval_steps_per_second": 0.508,
"step": 1797
},
{
"epoch": 47.05882352941177,
"grad_norm": 18.660812377929688,
"learning_rate": 2.9239766081871346e-05,
"loss": 0.0307,
"step": 1800
},
{
"epoch": 47.3202614379085,
"grad_norm": 19.048458099365234,
"learning_rate": 2.909356725146199e-05,
"loss": 0.036,
"step": 1810
},
{
"epoch": 47.58169934640523,
"grad_norm": 0.8519901037216187,
"learning_rate": 2.8947368421052634e-05,
"loss": 0.0491,
"step": 1820
},
{
"epoch": 47.84313725490196,
"grad_norm": 0.9929773211479187,
"learning_rate": 2.8801169590643277e-05,
"loss": 0.0641,
"step": 1830
},
{
"epoch": 48.0,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.3386637568473816,
"eval_runtime": 33.9575,
"eval_samples_per_second": 4.005,
"eval_steps_per_second": 0.501,
"step": 1836
},
{
"epoch": 48.10457516339869,
"grad_norm": 27.548429489135742,
"learning_rate": 2.8654970760233917e-05,
"loss": 0.0634,
"step": 1840
},
{
"epoch": 48.36601307189542,
"grad_norm": 0.4367322027683258,
"learning_rate": 2.850877192982456e-05,
"loss": 0.0756,
"step": 1850
},
{
"epoch": 48.627450980392155,
"grad_norm": 18.30873680114746,
"learning_rate": 2.8362573099415208e-05,
"loss": 0.0134,
"step": 1860
},
{
"epoch": 48.888888888888886,
"grad_norm": 0.011559017933905125,
"learning_rate": 2.821637426900585e-05,
"loss": 0.0484,
"step": 1870
},
{
"epoch": 48.99346405228758,
"eval_accuracy": 0.9191176470588235,
"eval_loss": 0.5349822640419006,
"eval_runtime": 38.4788,
"eval_samples_per_second": 3.534,
"eval_steps_per_second": 0.442,
"step": 1874
},
{
"epoch": 49.150326797385624,
"grad_norm": 2.1214957237243652,
"learning_rate": 2.8070175438596492e-05,
"loss": 0.088,
"step": 1880
},
{
"epoch": 49.411764705882355,
"grad_norm": 27.645193099975586,
"learning_rate": 2.7923976608187135e-05,
"loss": 0.0621,
"step": 1890
},
{
"epoch": 49.673202614379086,
"grad_norm": 1.3699434995651245,
"learning_rate": 2.777777777777778e-05,
"loss": 0.0528,
"step": 1900
},
{
"epoch": 49.93464052287582,
"grad_norm": 8.130342483520508,
"learning_rate": 2.7631578947368426e-05,
"loss": 0.0388,
"step": 1910
},
{
"epoch": 49.98692810457516,
"eval_accuracy": 0.9117647058823529,
"eval_loss": 0.382554292678833,
"eval_runtime": 33.8716,
"eval_samples_per_second": 4.015,
"eval_steps_per_second": 0.502,
"step": 1912
},
{
"epoch": 50.19607843137255,
"grad_norm": 47.961002349853516,
"learning_rate": 2.7485380116959063e-05,
"loss": 0.0941,
"step": 1920
},
{
"epoch": 50.45751633986928,
"grad_norm": 36.82217025756836,
"learning_rate": 2.733918128654971e-05,
"loss": 0.0863,
"step": 1930
},
{
"epoch": 50.71895424836601,
"grad_norm": 5.911373615264893,
"learning_rate": 2.7192982456140354e-05,
"loss": 0.0324,
"step": 1940
},
{
"epoch": 50.98039215686274,
"grad_norm": 24.99283790588379,
"learning_rate": 2.7046783625730997e-05,
"loss": 0.0347,
"step": 1950
},
{
"epoch": 50.98039215686274,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.3738501965999603,
"eval_runtime": 30.759,
"eval_samples_per_second": 4.421,
"eval_steps_per_second": 0.553,
"step": 1950
},
{
"epoch": 51.24183006535948,
"grad_norm": 70.3333969116211,
"learning_rate": 2.6900584795321637e-05,
"loss": 0.0428,
"step": 1960
},
{
"epoch": 51.50326797385621,
"grad_norm": 13.072953224182129,
"learning_rate": 2.675438596491228e-05,
"loss": 0.0505,
"step": 1970
},
{
"epoch": 51.76470588235294,
"grad_norm": 39.30720520019531,
"learning_rate": 2.6608187134502928e-05,
"loss": 0.1046,
"step": 1980
},
{
"epoch": 52.0,
"eval_accuracy": 0.9117647058823529,
"eval_loss": 0.3074805736541748,
"eval_runtime": 33.894,
"eval_samples_per_second": 4.013,
"eval_steps_per_second": 0.502,
"step": 1989
},
{
"epoch": 52.02614379084967,
"grad_norm": 23.061525344848633,
"learning_rate": 2.6461988304093572e-05,
"loss": 0.0566,
"step": 1990
},
{
"epoch": 52.287581699346404,
"grad_norm": 2.5243396759033203,
"learning_rate": 2.6315789473684212e-05,
"loss": 0.0605,
"step": 2000
},
{
"epoch": 52.549019607843135,
"grad_norm": 11.470220565795898,
"learning_rate": 2.6169590643274856e-05,
"loss": 0.0767,
"step": 2010
},
{
"epoch": 52.810457516339866,
"grad_norm": 0.23322105407714844,
"learning_rate": 2.60233918128655e-05,
"loss": 0.0298,
"step": 2020
},
{
"epoch": 52.99346405228758,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.3557595908641815,
"eval_runtime": 25.1218,
"eval_samples_per_second": 5.414,
"eval_steps_per_second": 0.677,
"step": 2027
},
{
"epoch": 53.071895424836605,
"grad_norm": 4.624847412109375,
"learning_rate": 2.5877192982456143e-05,
"loss": 0.0563,
"step": 2030
},
{
"epoch": 53.333333333333336,
"grad_norm": 0.25727781653404236,
"learning_rate": 2.5730994152046783e-05,
"loss": 0.0977,
"step": 2040
},
{
"epoch": 53.59477124183007,
"grad_norm": 0.22140049934387207,
"learning_rate": 2.5584795321637427e-05,
"loss": 0.0199,
"step": 2050
},
{
"epoch": 53.8562091503268,
"grad_norm": 0.9178116321563721,
"learning_rate": 2.5438596491228074e-05,
"loss": 0.0478,
"step": 2060
},
{
"epoch": 53.98692810457516,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.30555427074432373,
"eval_runtime": 37.1043,
"eval_samples_per_second": 3.665,
"eval_steps_per_second": 0.458,
"step": 2065
},
{
"epoch": 54.11764705882353,
"grad_norm": 19.221540451049805,
"learning_rate": 2.5292397660818717e-05,
"loss": 0.0289,
"step": 2070
},
{
"epoch": 54.37908496732026,
"grad_norm": 1.848120093345642,
"learning_rate": 2.5146198830409358e-05,
"loss": 0.095,
"step": 2080
},
{
"epoch": 54.64052287581699,
"grad_norm": 10.04775619506836,
"learning_rate": 2.5e-05,
"loss": 0.0218,
"step": 2090
},
{
"epoch": 54.90196078431372,
"grad_norm": 0.047169651836156845,
"learning_rate": 2.485380116959064e-05,
"loss": 0.0285,
"step": 2100
},
{
"epoch": 54.98039215686274,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.28512153029441833,
"eval_runtime": 32.4012,
"eval_samples_per_second": 4.197,
"eval_steps_per_second": 0.525,
"step": 2103
},
{
"epoch": 55.16339869281046,
"grad_norm": 2.4437642097473145,
"learning_rate": 2.470760233918129e-05,
"loss": 0.0029,
"step": 2110
},
{
"epoch": 55.42483660130719,
"grad_norm": 14.518400192260742,
"learning_rate": 2.456140350877193e-05,
"loss": 0.0621,
"step": 2120
},
{
"epoch": 55.68627450980392,
"grad_norm": 2.9272749423980713,
"learning_rate": 2.4415204678362576e-05,
"loss": 0.0129,
"step": 2130
},
{
"epoch": 55.947712418300654,
"grad_norm": 19.935407638549805,
"learning_rate": 2.4269005847953216e-05,
"loss": 0.0407,
"step": 2140
},
{
"epoch": 56.0,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.32225164771080017,
"eval_runtime": 33.148,
"eval_samples_per_second": 4.103,
"eval_steps_per_second": 0.513,
"step": 2142
},
{
"epoch": 56.209150326797385,
"grad_norm": 32.69438934326172,
"learning_rate": 2.412280701754386e-05,
"loss": 0.0161,
"step": 2150
},
{
"epoch": 56.470588235294116,
"grad_norm": 0.04998353496193886,
"learning_rate": 2.3976608187134503e-05,
"loss": 0.0446,
"step": 2160
},
{
"epoch": 56.73202614379085,
"grad_norm": 0.830470085144043,
"learning_rate": 2.3830409356725147e-05,
"loss": 0.1066,
"step": 2170
},
{
"epoch": 56.99346405228758,
"grad_norm": 21.04816436767578,
"learning_rate": 2.368421052631579e-05,
"loss": 0.0459,
"step": 2180
},
{
"epoch": 56.99346405228758,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.45745787024497986,
"eval_runtime": 31.4986,
"eval_samples_per_second": 4.318,
"eval_steps_per_second": 0.54,
"step": 2180
},
{
"epoch": 57.254901960784316,
"grad_norm": 6.693302631378174,
"learning_rate": 2.3538011695906434e-05,
"loss": 0.0569,
"step": 2190
},
{
"epoch": 57.51633986928105,
"grad_norm": 12.218875885009766,
"learning_rate": 2.3391812865497074e-05,
"loss": 0.0455,
"step": 2200
},
{
"epoch": 57.77777777777778,
"grad_norm": 56.21259689331055,
"learning_rate": 2.324561403508772e-05,
"loss": 0.0409,
"step": 2210
},
{
"epoch": 57.98692810457516,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.29300644993782043,
"eval_runtime": 31.4287,
"eval_samples_per_second": 4.327,
"eval_steps_per_second": 0.541,
"step": 2218
},
{
"epoch": 58.03921568627451,
"grad_norm": 0.48025286197662354,
"learning_rate": 2.309941520467836e-05,
"loss": 0.0526,
"step": 2220
},
{
"epoch": 58.30065359477124,
"grad_norm": 6.530683994293213,
"learning_rate": 2.295321637426901e-05,
"loss": 0.0791,
"step": 2230
},
{
"epoch": 58.56209150326797,
"grad_norm": 35.76517105102539,
"learning_rate": 2.280701754385965e-05,
"loss": 0.033,
"step": 2240
},
{
"epoch": 58.8235294117647,
"grad_norm": 4.9538679122924805,
"learning_rate": 2.2660818713450292e-05,
"loss": 0.0743,
"step": 2250
},
{
"epoch": 58.98039215686274,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.4032076299190521,
"eval_runtime": 34.2283,
"eval_samples_per_second": 3.973,
"eval_steps_per_second": 0.497,
"step": 2256
},
{
"epoch": 59.08496732026144,
"grad_norm": 8.96496868133545,
"learning_rate": 2.2514619883040936e-05,
"loss": 0.0358,
"step": 2260
},
{
"epoch": 59.34640522875817,
"grad_norm": 10.487314224243164,
"learning_rate": 2.236842105263158e-05,
"loss": 0.0805,
"step": 2270
},
{
"epoch": 59.6078431372549,
"grad_norm": 3.922236442565918,
"learning_rate": 2.2222222222222223e-05,
"loss": 0.0096,
"step": 2280
},
{
"epoch": 59.869281045751634,
"grad_norm": 5.181495666503906,
"learning_rate": 2.2076023391812867e-05,
"loss": 0.0346,
"step": 2290
},
{
"epoch": 60.0,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.37382781505584717,
"eval_runtime": 37.1282,
"eval_samples_per_second": 3.663,
"eval_steps_per_second": 0.458,
"step": 2295
},
{
"epoch": 60.130718954248366,
"grad_norm": 0.059666648507118225,
"learning_rate": 2.1929824561403507e-05,
"loss": 0.0551,
"step": 2300
},
{
"epoch": 60.3921568627451,
"grad_norm": 0.5856298804283142,
"learning_rate": 2.1783625730994154e-05,
"loss": 0.0331,
"step": 2310
},
{
"epoch": 60.65359477124183,
"grad_norm": 5.777927875518799,
"learning_rate": 2.1637426900584794e-05,
"loss": 0.0112,
"step": 2320
},
{
"epoch": 60.91503267973856,
"grad_norm": 13.134035110473633,
"learning_rate": 2.149122807017544e-05,
"loss": 0.0302,
"step": 2330
},
{
"epoch": 60.99346405228758,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.3597317337989807,
"eval_runtime": 31.126,
"eval_samples_per_second": 4.369,
"eval_steps_per_second": 0.546,
"step": 2333
},
{
"epoch": 61.1764705882353,
"grad_norm": 28.286643981933594,
"learning_rate": 2.134502923976608e-05,
"loss": 0.0311,
"step": 2340
},
{
"epoch": 61.43790849673203,
"grad_norm": 6.936996936798096,
"learning_rate": 2.1198830409356725e-05,
"loss": 0.139,
"step": 2350
},
{
"epoch": 61.69934640522876,
"grad_norm": 1.0503500699996948,
"learning_rate": 2.105263157894737e-05,
"loss": 0.0666,
"step": 2360
},
{
"epoch": 61.96078431372549,
"grad_norm": 5.756121635437012,
"learning_rate": 2.0906432748538013e-05,
"loss": 0.0488,
"step": 2370
},
{
"epoch": 61.98692810457516,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.2594568133354187,
"eval_runtime": 34.9133,
"eval_samples_per_second": 3.895,
"eval_steps_per_second": 0.487,
"step": 2371
},
{
"epoch": 62.22222222222222,
"grad_norm": 17.791810989379883,
"learning_rate": 2.0760233918128656e-05,
"loss": 0.0294,
"step": 2380
},
{
"epoch": 62.48366013071895,
"grad_norm": 0.014880876056849957,
"learning_rate": 2.06140350877193e-05,
"loss": 0.0516,
"step": 2390
},
{
"epoch": 62.745098039215684,
"grad_norm": 33.730533599853516,
"learning_rate": 2.046783625730994e-05,
"loss": 0.0562,
"step": 2400
},
{
"epoch": 62.98039215686274,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.3763536512851715,
"eval_runtime": 35.0422,
"eval_samples_per_second": 3.881,
"eval_steps_per_second": 0.485,
"step": 2409
},
{
"epoch": 63.00653594771242,
"grad_norm": 58.39078903198242,
"learning_rate": 2.0321637426900587e-05,
"loss": 0.0751,
"step": 2410
},
{
"epoch": 63.26797385620915,
"grad_norm": 0.0864597037434578,
"learning_rate": 2.0175438596491227e-05,
"loss": 0.0393,
"step": 2420
},
{
"epoch": 63.529411764705884,
"grad_norm": 18.966829299926758,
"learning_rate": 2.0029239766081874e-05,
"loss": 0.0251,
"step": 2430
},
{
"epoch": 63.790849673202615,
"grad_norm": 25.66364288330078,
"learning_rate": 1.9883040935672515e-05,
"loss": 0.0216,
"step": 2440
},
{
"epoch": 64.0,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.2643776834011078,
"eval_runtime": 17.3782,
"eval_samples_per_second": 7.826,
"eval_steps_per_second": 0.978,
"step": 2448
},
{
"epoch": 64.05228758169935,
"grad_norm": 1.6527997255325317,
"learning_rate": 1.9736842105263158e-05,
"loss": 0.054,
"step": 2450
},
{
"epoch": 64.31372549019608,
"grad_norm": 0.06280579417943954,
"learning_rate": 1.9590643274853802e-05,
"loss": 0.0287,
"step": 2460
},
{
"epoch": 64.57516339869281,
"grad_norm": 1.6318433284759521,
"learning_rate": 1.9444444444444445e-05,
"loss": 0.0399,
"step": 2470
},
{
"epoch": 64.83660130718954,
"grad_norm": 1.7933380603790283,
"learning_rate": 1.929824561403509e-05,
"loss": 0.0219,
"step": 2480
},
{
"epoch": 64.99346405228758,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.30917930603027344,
"eval_runtime": 17.1251,
"eval_samples_per_second": 7.942,
"eval_steps_per_second": 0.993,
"step": 2486
},
{
"epoch": 65.09803921568627,
"grad_norm": 10.366903305053711,
"learning_rate": 1.9152046783625733e-05,
"loss": 0.0539,
"step": 2490
},
{
"epoch": 65.359477124183,
"grad_norm": 0.2696276307106018,
"learning_rate": 1.9005847953216373e-05,
"loss": 0.0123,
"step": 2500
},
{
"epoch": 65.62091503267973,
"grad_norm": 2.0707309246063232,
"learning_rate": 1.885964912280702e-05,
"loss": 0.0209,
"step": 2510
},
{
"epoch": 65.88235294117646,
"grad_norm": 0.026714438572525978,
"learning_rate": 1.871345029239766e-05,
"loss": 0.0272,
"step": 2520
},
{
"epoch": 65.98692810457516,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.2898404896259308,
"eval_runtime": 17.5281,
"eval_samples_per_second": 7.759,
"eval_steps_per_second": 0.97,
"step": 2524
},
{
"epoch": 66.14379084967321,
"grad_norm": 0.15798357129096985,
"learning_rate": 1.8567251461988304e-05,
"loss": 0.0091,
"step": 2530
},
{
"epoch": 66.40522875816994,
"grad_norm": 85.56695556640625,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.0221,
"step": 2540
},
{
"epoch": 66.66666666666667,
"grad_norm": 25.615230560302734,
"learning_rate": 1.827485380116959e-05,
"loss": 0.0645,
"step": 2550
},
{
"epoch": 66.9281045751634,
"grad_norm": 22.72310447692871,
"learning_rate": 1.8128654970760235e-05,
"loss": 0.027,
"step": 2560
},
{
"epoch": 66.98039215686275,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.2693423628807068,
"eval_runtime": 23.0579,
"eval_samples_per_second": 5.898,
"eval_steps_per_second": 0.737,
"step": 2562
},
{
"epoch": 67.18954248366013,
"grad_norm": 24.883161544799805,
"learning_rate": 1.7982456140350878e-05,
"loss": 0.0293,
"step": 2570
},
{
"epoch": 67.45098039215686,
"grad_norm": 6.90622615814209,
"learning_rate": 1.7836257309941522e-05,
"loss": 0.022,
"step": 2580
},
{
"epoch": 67.7124183006536,
"grad_norm": 48.23540115356445,
"learning_rate": 1.7690058479532165e-05,
"loss": 0.0509,
"step": 2590
},
{
"epoch": 67.97385620915033,
"grad_norm": 0.07863592356443405,
"learning_rate": 1.7543859649122806e-05,
"loss": 0.0397,
"step": 2600
},
{
"epoch": 68.0,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.38426852226257324,
"eval_runtime": 23.971,
"eval_samples_per_second": 5.674,
"eval_steps_per_second": 0.709,
"step": 2601
},
{
"epoch": 68.23529411764706,
"grad_norm": 4.26972770690918,
"learning_rate": 1.7397660818713453e-05,
"loss": 0.0409,
"step": 2610
},
{
"epoch": 68.49673202614379,
"grad_norm": 1.8150982856750488,
"learning_rate": 1.7251461988304093e-05,
"loss": 0.0315,
"step": 2620
},
{
"epoch": 68.75816993464052,
"grad_norm": 13.07569694519043,
"learning_rate": 1.7105263157894737e-05,
"loss": 0.0154,
"step": 2630
},
{
"epoch": 68.99346405228758,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.30511775612831116,
"eval_runtime": 23.3134,
"eval_samples_per_second": 5.834,
"eval_steps_per_second": 0.729,
"step": 2639
},
{
"epoch": 69.01960784313725,
"grad_norm": 0.576351523399353,
"learning_rate": 1.695906432748538e-05,
"loss": 0.0387,
"step": 2640
},
{
"epoch": 69.28104575163398,
"grad_norm": 0.867915153503418,
"learning_rate": 1.6812865497076024e-05,
"loss": 0.0178,
"step": 2650
},
{
"epoch": 69.54248366013071,
"grad_norm": 20.2279052734375,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0392,
"step": 2660
},
{
"epoch": 69.80392156862744,
"grad_norm": 0.04353189095854759,
"learning_rate": 1.652046783625731e-05,
"loss": 0.0004,
"step": 2670
},
{
"epoch": 69.98692810457516,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.39089399576187134,
"eval_runtime": 23.3469,
"eval_samples_per_second": 5.825,
"eval_steps_per_second": 0.728,
"step": 2677
},
{
"epoch": 70.06535947712419,
"grad_norm": 77.49730682373047,
"learning_rate": 1.6374269005847955e-05,
"loss": 0.0467,
"step": 2680
},
{
"epoch": 70.32679738562092,
"grad_norm": 49.50137710571289,
"learning_rate": 1.62280701754386e-05,
"loss": 0.0228,
"step": 2690
},
{
"epoch": 70.58823529411765,
"grad_norm": 0.5024857521057129,
"learning_rate": 1.608187134502924e-05,
"loss": 0.0045,
"step": 2700
},
{
"epoch": 70.84967320261438,
"grad_norm": 3.8934128284454346,
"learning_rate": 1.5935672514619886e-05,
"loss": 0.0651,
"step": 2710
},
{
"epoch": 70.98039215686275,
"eval_accuracy": 0.9485294117647058,
"eval_loss": 0.29772186279296875,
"eval_runtime": 25.8712,
"eval_samples_per_second": 5.257,
"eval_steps_per_second": 0.657,
"step": 2715
},
{
"epoch": 71.11111111111111,
"grad_norm": 7.867006778717041,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.008,
"step": 2720
},
{
"epoch": 71.37254901960785,
"grad_norm": 13.64209270477295,
"learning_rate": 1.564327485380117e-05,
"loss": 0.0757,
"step": 2730
},
{
"epoch": 71.63398692810458,
"grad_norm": 6.453034400939941,
"learning_rate": 1.5497076023391813e-05,
"loss": 0.0214,
"step": 2740
},
{
"epoch": 71.89542483660131,
"grad_norm": 0.1501288115978241,
"learning_rate": 1.5350877192982457e-05,
"loss": 0.016,
"step": 2750
},
{
"epoch": 72.0,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.2694728374481201,
"eval_runtime": 20.9056,
"eval_samples_per_second": 6.505,
"eval_steps_per_second": 0.813,
"step": 2754
},
{
"epoch": 72.15686274509804,
"grad_norm": 0.034015778452157974,
"learning_rate": 1.5204678362573099e-05,
"loss": 0.012,
"step": 2760
},
{
"epoch": 72.41830065359477,
"grad_norm": 11.159213066101074,
"learning_rate": 1.5058479532163744e-05,
"loss": 0.0444,
"step": 2770
},
{
"epoch": 72.6797385620915,
"grad_norm": 2.5402066707611084,
"learning_rate": 1.4912280701754386e-05,
"loss": 0.0359,
"step": 2780
},
{
"epoch": 72.94117647058823,
"grad_norm": 0.016565600410103798,
"learning_rate": 1.4766081871345031e-05,
"loss": 0.0351,
"step": 2790
},
{
"epoch": 72.99346405228758,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.2720423936843872,
"eval_runtime": 22.3116,
"eval_samples_per_second": 6.095,
"eval_steps_per_second": 0.762,
"step": 2792
},
{
"epoch": 73.20261437908496,
"grad_norm": 79.11601257324219,
"learning_rate": 1.4619883040935673e-05,
"loss": 0.044,
"step": 2800
},
{
"epoch": 73.4640522875817,
"grad_norm": 5.53911018371582,
"learning_rate": 1.4473684210526317e-05,
"loss": 0.0298,
"step": 2810
},
{
"epoch": 73.72549019607843,
"grad_norm": 0.40750911831855774,
"learning_rate": 1.4327485380116959e-05,
"loss": 0.011,
"step": 2820
},
{
"epoch": 73.98692810457516,
"grad_norm": 0.9360626339912415,
"learning_rate": 1.4181286549707604e-05,
"loss": 0.0206,
"step": 2830
},
{
"epoch": 73.98692810457516,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.25490206480026245,
"eval_runtime": 22.7726,
"eval_samples_per_second": 5.972,
"eval_steps_per_second": 0.747,
"step": 2830
},
{
"epoch": 74.2483660130719,
"grad_norm": 6.835451602935791,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.0109,
"step": 2840
},
{
"epoch": 74.50980392156863,
"grad_norm": 0.1265513300895691,
"learning_rate": 1.388888888888889e-05,
"loss": 0.0436,
"step": 2850
},
{
"epoch": 74.77124183006536,
"grad_norm": 0.20871244370937347,
"learning_rate": 1.3742690058479531e-05,
"loss": 0.0109,
"step": 2860
},
{
"epoch": 74.98039215686275,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.24122387170791626,
"eval_runtime": 19.4498,
"eval_samples_per_second": 6.992,
"eval_steps_per_second": 0.874,
"step": 2868
},
{
"epoch": 75.0326797385621,
"grad_norm": 24.267925262451172,
"learning_rate": 1.3596491228070177e-05,
"loss": 0.0207,
"step": 2870
},
{
"epoch": 75.29411764705883,
"grad_norm": 9.061148643493652,
"learning_rate": 1.3450292397660819e-05,
"loss": 0.0105,
"step": 2880
},
{
"epoch": 75.55555555555556,
"grad_norm": 1.2824314832687378,
"learning_rate": 1.3304093567251464e-05,
"loss": 0.0182,
"step": 2890
},
{
"epoch": 75.81699346405229,
"grad_norm": 0.003347081132233143,
"learning_rate": 1.3157894736842106e-05,
"loss": 0.0012,
"step": 2900
},
{
"epoch": 76.0,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.34939995408058167,
"eval_runtime": 20.8219,
"eval_samples_per_second": 6.532,
"eval_steps_per_second": 0.816,
"step": 2907
},
{
"epoch": 76.07843137254902,
"grad_norm": 5.410060882568359,
"learning_rate": 1.301169590643275e-05,
"loss": 0.0214,
"step": 2910
},
{
"epoch": 76.33986928104575,
"grad_norm": 0.6613653898239136,
"learning_rate": 1.2865497076023392e-05,
"loss": 0.0261,
"step": 2920
},
{
"epoch": 76.60130718954248,
"grad_norm": 1.0403037071228027,
"learning_rate": 1.2719298245614037e-05,
"loss": 0.0555,
"step": 2930
},
{
"epoch": 76.86274509803921,
"grad_norm": 15.238615036010742,
"learning_rate": 1.2573099415204679e-05,
"loss": 0.0418,
"step": 2940
},
{
"epoch": 76.99346405228758,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.37292152643203735,
"eval_runtime": 20.8077,
"eval_samples_per_second": 6.536,
"eval_steps_per_second": 0.817,
"step": 2945
},
{
"epoch": 77.12418300653594,
"grad_norm": 31.79336166381836,
"learning_rate": 1.242690058479532e-05,
"loss": 0.0302,
"step": 2950
},
{
"epoch": 77.38562091503267,
"grad_norm": 0.0776483416557312,
"learning_rate": 1.2280701754385964e-05,
"loss": 0.0094,
"step": 2960
},
{
"epoch": 77.6470588235294,
"grad_norm": 63.487571716308594,
"learning_rate": 1.2134502923976608e-05,
"loss": 0.0473,
"step": 2970
},
{
"epoch": 77.90849673202614,
"grad_norm": 0.09107412397861481,
"learning_rate": 1.1988304093567252e-05,
"loss": 0.0165,
"step": 2980
},
{
"epoch": 77.98692810457516,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.347072571516037,
"eval_runtime": 17.8737,
"eval_samples_per_second": 7.609,
"eval_steps_per_second": 0.951,
"step": 2983
},
{
"epoch": 78.16993464052288,
"grad_norm": 36.47078323364258,
"learning_rate": 1.1842105263157895e-05,
"loss": 0.0176,
"step": 2990
},
{
"epoch": 78.43137254901961,
"grad_norm": 0.0024324676487594843,
"learning_rate": 1.1695906432748537e-05,
"loss": 0.0317,
"step": 3000
},
{
"epoch": 78.69281045751634,
"grad_norm": 26.059871673583984,
"learning_rate": 1.154970760233918e-05,
"loss": 0.0699,
"step": 3010
},
{
"epoch": 78.95424836601308,
"grad_norm": 38.14042282104492,
"learning_rate": 1.1403508771929824e-05,
"loss": 0.0163,
"step": 3020
},
{
"epoch": 78.98039215686275,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.29730716347694397,
"eval_runtime": 18.5858,
"eval_samples_per_second": 7.317,
"eval_steps_per_second": 0.915,
"step": 3021
},
{
"epoch": 79.2156862745098,
"grad_norm": 87.14070129394531,
"learning_rate": 1.1257309941520468e-05,
"loss": 0.0556,
"step": 3030
},
{
"epoch": 79.47712418300654,
"grad_norm": 3.418160915374756,
"learning_rate": 1.1111111111111112e-05,
"loss": 0.0073,
"step": 3040
},
{
"epoch": 79.73856209150327,
"grad_norm": 22.285499572753906,
"learning_rate": 1.0964912280701754e-05,
"loss": 0.0249,
"step": 3050
},
{
"epoch": 80.0,
"grad_norm": 35.9242057800293,
"learning_rate": 1.0818713450292397e-05,
"loss": 0.0202,
"step": 3060
},
{
"epoch": 80.0,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.3729775846004486,
"eval_runtime": 19.8789,
"eval_samples_per_second": 6.841,
"eval_steps_per_second": 0.855,
"step": 3060
},
{
"epoch": 80.26143790849673,
"grad_norm": 15.128210067749023,
"learning_rate": 1.067251461988304e-05,
"loss": 0.0628,
"step": 3070
},
{
"epoch": 80.52287581699346,
"grad_norm": 29.2634220123291,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.0244,
"step": 3080
},
{
"epoch": 80.7843137254902,
"grad_norm": 79.84837341308594,
"learning_rate": 1.0380116959064328e-05,
"loss": 0.0368,
"step": 3090
},
{
"epoch": 80.99346405228758,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.2876713275909424,
"eval_runtime": 19.4821,
"eval_samples_per_second": 6.981,
"eval_steps_per_second": 0.873,
"step": 3098
},
{
"epoch": 81.04575163398692,
"grad_norm": 2.7281501293182373,
"learning_rate": 1.023391812865497e-05,
"loss": 0.0238,
"step": 3100
},
{
"epoch": 81.30718954248366,
"grad_norm": 0.0004346697241999209,
"learning_rate": 1.0087719298245614e-05,
"loss": 0.0305,
"step": 3110
},
{
"epoch": 81.56862745098039,
"grad_norm": 0.03860533982515335,
"learning_rate": 9.941520467836257e-06,
"loss": 0.0136,
"step": 3120
},
{
"epoch": 81.83006535947712,
"grad_norm": 0.4280990958213806,
"learning_rate": 9.795321637426901e-06,
"loss": 0.0374,
"step": 3130
},
{
"epoch": 81.98692810457516,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.41433659195899963,
"eval_runtime": 19.9936,
"eval_samples_per_second": 6.802,
"eval_steps_per_second": 0.85,
"step": 3136
},
{
"epoch": 82.09150326797386,
"grad_norm": 31.7745418548584,
"learning_rate": 9.649122807017545e-06,
"loss": 0.0105,
"step": 3140
},
{
"epoch": 82.3529411764706,
"grad_norm": 2.9742166996002197,
"learning_rate": 9.502923976608186e-06,
"loss": 0.0361,
"step": 3150
},
{
"epoch": 82.61437908496733,
"grad_norm": 3.588392734527588,
"learning_rate": 9.35672514619883e-06,
"loss": 0.0648,
"step": 3160
},
{
"epoch": 82.87581699346406,
"grad_norm": 0.4829164147377014,
"learning_rate": 9.210526315789474e-06,
"loss": 0.0296,
"step": 3170
},
{
"epoch": 82.98039215686275,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.2895439565181732,
"eval_runtime": 17.9847,
"eval_samples_per_second": 7.562,
"eval_steps_per_second": 0.945,
"step": 3174
},
{
"epoch": 83.13725490196079,
"grad_norm": 22.893632888793945,
"learning_rate": 9.064327485380117e-06,
"loss": 0.0115,
"step": 3180
},
{
"epoch": 83.39869281045752,
"grad_norm": 0.021368976682424545,
"learning_rate": 8.918128654970761e-06,
"loss": 0.0269,
"step": 3190
},
{
"epoch": 83.66013071895425,
"grad_norm": 0.06225317716598511,
"learning_rate": 8.771929824561403e-06,
"loss": 0.0024,
"step": 3200
},
{
"epoch": 83.92156862745098,
"grad_norm": 0.05705859139561653,
"learning_rate": 8.625730994152046e-06,
"loss": 0.0405,
"step": 3210
},
{
"epoch": 84.0,
"eval_accuracy": 0.9558823529411765,
"eval_loss": 0.29270094633102417,
"eval_runtime": 19.1133,
"eval_samples_per_second": 7.115,
"eval_steps_per_second": 0.889,
"step": 3213
},
{
"epoch": 84.18300653594771,
"grad_norm": 24.514904022216797,
"learning_rate": 8.47953216374269e-06,
"loss": 0.0098,
"step": 3220
},
{
"epoch": 84.44444444444444,
"grad_norm": 0.596236526966095,
"learning_rate": 8.333333333333334e-06,
"loss": 0.0035,
"step": 3230
},
{
"epoch": 84.70588235294117,
"grad_norm": 0.050445396453142166,
"learning_rate": 8.187134502923977e-06,
"loss": 0.005,
"step": 3240
},
{
"epoch": 84.9673202614379,
"grad_norm": 0.07400578260421753,
"learning_rate": 8.04093567251462e-06,
"loss": 0.0097,
"step": 3250
},
{
"epoch": 84.99346405228758,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.317930668592453,
"eval_runtime": 18.575,
"eval_samples_per_second": 7.322,
"eval_steps_per_second": 0.915,
"step": 3251
},
{
"epoch": 85.22875816993464,
"grad_norm": 12.950275421142578,
"learning_rate": 7.894736842105263e-06,
"loss": 0.0026,
"step": 3260
},
{
"epoch": 85.49019607843137,
"grad_norm": 16.546571731567383,
"learning_rate": 7.748538011695907e-06,
"loss": 0.0257,
"step": 3270
},
{
"epoch": 85.7516339869281,
"grad_norm": 0.6142169237136841,
"learning_rate": 7.602339181286549e-06,
"loss": 0.0182,
"step": 3280
},
{
"epoch": 85.98692810457516,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.30465030670166016,
"eval_runtime": 18.7827,
"eval_samples_per_second": 7.241,
"eval_steps_per_second": 0.905,
"step": 3289
},
{
"epoch": 86.01307189542484,
"grad_norm": 0.09201680123806,
"learning_rate": 7.456140350877193e-06,
"loss": 0.0086,
"step": 3290
},
{
"epoch": 86.27450980392157,
"grad_norm": 0.6810176372528076,
"learning_rate": 7.3099415204678366e-06,
"loss": 0.0033,
"step": 3300
},
{
"epoch": 86.5359477124183,
"grad_norm": 7.0328474044799805,
"learning_rate": 7.163742690058479e-06,
"loss": 0.023,
"step": 3310
},
{
"epoch": 86.79738562091504,
"grad_norm": 0.5138120055198669,
"learning_rate": 7.017543859649123e-06,
"loss": 0.0207,
"step": 3320
},
{
"epoch": 86.98039215686275,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.3018016815185547,
"eval_runtime": 17.5979,
"eval_samples_per_second": 7.728,
"eval_steps_per_second": 0.966,
"step": 3327
},
{
"epoch": 87.05882352941177,
"grad_norm": 0.11021004617214203,
"learning_rate": 6.871345029239766e-06,
"loss": 0.0711,
"step": 3330
},
{
"epoch": 87.3202614379085,
"grad_norm": 0.03013734146952629,
"learning_rate": 6.725146198830409e-06,
"loss": 0.0424,
"step": 3340
},
{
"epoch": 87.58169934640523,
"grad_norm": 69.32197570800781,
"learning_rate": 6.578947368421053e-06,
"loss": 0.0269,
"step": 3350
},
{
"epoch": 87.84313725490196,
"grad_norm": 0.45887792110443115,
"learning_rate": 6.432748538011696e-06,
"loss": 0.0207,
"step": 3360
},
{
"epoch": 88.0,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.332051545381546,
"eval_runtime": 17.8575,
"eval_samples_per_second": 7.616,
"eval_steps_per_second": 0.952,
"step": 3366
},
{
"epoch": 88.10457516339869,
"grad_norm": 0.007120809052139521,
"learning_rate": 6.286549707602339e-06,
"loss": 0.0047,
"step": 3370
},
{
"epoch": 88.36601307189542,
"grad_norm": 0.051657985895872116,
"learning_rate": 6.140350877192982e-06,
"loss": 0.0224,
"step": 3380
},
{
"epoch": 88.62745098039215,
"grad_norm": 0.6093434691429138,
"learning_rate": 5.994152046783626e-06,
"loss": 0.0052,
"step": 3390
},
{
"epoch": 88.88888888888889,
"grad_norm": 25.99680519104004,
"learning_rate": 5.8479532163742686e-06,
"loss": 0.003,
"step": 3400
},
{
"epoch": 88.99346405228758,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.30860844254493713,
"eval_runtime": 18.245,
"eval_samples_per_second": 7.454,
"eval_steps_per_second": 0.932,
"step": 3404
},
{
"epoch": 89.15032679738562,
"grad_norm": 31.555145263671875,
"learning_rate": 5.701754385964912e-06,
"loss": 0.0329,
"step": 3410
},
{
"epoch": 89.41176470588235,
"grad_norm": 18.486536026000977,
"learning_rate": 5.555555555555556e-06,
"loss": 0.029,
"step": 3420
},
{
"epoch": 89.67320261437908,
"grad_norm": 0.33306655287742615,
"learning_rate": 5.409356725146199e-06,
"loss": 0.0098,
"step": 3430
},
{
"epoch": 89.93464052287581,
"grad_norm": 2.643474578857422,
"learning_rate": 5.263157894736842e-06,
"loss": 0.0157,
"step": 3440
},
{
"epoch": 89.98692810457516,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.2947893440723419,
"eval_runtime": 18.1316,
"eval_samples_per_second": 7.501,
"eval_steps_per_second": 0.938,
"step": 3442
},
{
"epoch": 90.19607843137256,
"grad_norm": 6.317154407501221,
"learning_rate": 5.116959064327485e-06,
"loss": 0.008,
"step": 3450
},
{
"epoch": 90.45751633986929,
"grad_norm": 1.63987398147583,
"learning_rate": 4.970760233918129e-06,
"loss": 0.0219,
"step": 3460
},
{
"epoch": 90.71895424836602,
"grad_norm": 8.074739456176758,
"learning_rate": 4.824561403508772e-06,
"loss": 0.0188,
"step": 3470
},
{
"epoch": 90.98039215686275,
"grad_norm": 0.2915269136428833,
"learning_rate": 4.678362573099415e-06,
"loss": 0.0428,
"step": 3480
},
{
"epoch": 90.98039215686275,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.3174949586391449,
"eval_runtime": 17.8483,
"eval_samples_per_second": 7.62,
"eval_steps_per_second": 0.952,
"step": 3480
},
{
"epoch": 91.24183006535948,
"grad_norm": 0.3356679677963257,
"learning_rate": 4.532163742690059e-06,
"loss": 0.0161,
"step": 3490
},
{
"epoch": 91.50326797385621,
"grad_norm": 1.1951477527618408,
"learning_rate": 4.3859649122807014e-06,
"loss": 0.0205,
"step": 3500
},
{
"epoch": 91.76470588235294,
"grad_norm": 0.05076509341597557,
"learning_rate": 4.239766081871345e-06,
"loss": 0.0189,
"step": 3510
},
{
"epoch": 92.0,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.3239772915840149,
"eval_runtime": 17.301,
"eval_samples_per_second": 7.861,
"eval_steps_per_second": 0.983,
"step": 3519
},
{
"epoch": 92.02614379084967,
"grad_norm": 1.3812580108642578,
"learning_rate": 4.093567251461989e-06,
"loss": 0.0212,
"step": 3520
},
{
"epoch": 92.2875816993464,
"grad_norm": 0.3320296108722687,
"learning_rate": 3.9473684210526315e-06,
"loss": 0.0073,
"step": 3530
},
{
"epoch": 92.54901960784314,
"grad_norm": 0.009532331489026546,
"learning_rate": 3.8011695906432747e-06,
"loss": 0.0053,
"step": 3540
},
{
"epoch": 92.81045751633987,
"grad_norm": 0.5157586932182312,
"learning_rate": 3.6549707602339183e-06,
"loss": 0.0046,
"step": 3550
},
{
"epoch": 92.99346405228758,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.341442346572876,
"eval_runtime": 18.8672,
"eval_samples_per_second": 7.208,
"eval_steps_per_second": 0.901,
"step": 3557
},
{
"epoch": 93.0718954248366,
"grad_norm": 61.38653564453125,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.0246,
"step": 3560
},
{
"epoch": 93.33333333333333,
"grad_norm": 0.477070152759552,
"learning_rate": 3.3625730994152047e-06,
"loss": 0.0639,
"step": 3570
},
{
"epoch": 93.59477124183006,
"grad_norm": 68.3900375366211,
"learning_rate": 3.216374269005848e-06,
"loss": 0.0255,
"step": 3580
},
{
"epoch": 93.85620915032679,
"grad_norm": 0.3444403111934662,
"learning_rate": 3.070175438596491e-06,
"loss": 0.0057,
"step": 3590
},
{
"epoch": 93.98692810457516,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.33292174339294434,
"eval_runtime": 17.7377,
"eval_samples_per_second": 7.667,
"eval_steps_per_second": 0.958,
"step": 3595
},
{
"epoch": 94.11764705882354,
"grad_norm": 0.04389649257063866,
"learning_rate": 2.9239766081871343e-06,
"loss": 0.0058,
"step": 3600
},
{
"epoch": 94.37908496732027,
"grad_norm": 0.5849317908287048,
"learning_rate": 2.777777777777778e-06,
"loss": 0.0586,
"step": 3610
},
{
"epoch": 94.640522875817,
"grad_norm": 0.019542796537280083,
"learning_rate": 2.631578947368421e-06,
"loss": 0.001,
"step": 3620
},
{
"epoch": 94.90196078431373,
"grad_norm": 0.002426290884613991,
"learning_rate": 2.4853801169590643e-06,
"loss": 0.0165,
"step": 3630
},
{
"epoch": 94.98039215686275,
"eval_accuracy": 0.9632352941176471,
"eval_loss": 0.32402223348617554,
"eval_runtime": 17.5747,
"eval_samples_per_second": 7.738,
"eval_steps_per_second": 0.967,
"step": 3633
},
{
"epoch": 95.16339869281046,
"grad_norm": 2.353595495223999,
"learning_rate": 2.3391812865497075e-06,
"loss": 0.0009,
"step": 3640
},
{
"epoch": 95.42483660130719,
"grad_norm": 0.7732095718383789,
"learning_rate": 2.1929824561403507e-06,
"loss": 0.0273,
"step": 3650
},
{
"epoch": 95.68627450980392,
"grad_norm": 0.006318532861769199,
"learning_rate": 2.0467836257309943e-06,
"loss": 0.0219,
"step": 3660
},
{
"epoch": 95.94771241830065,
"grad_norm": 0.12237526476383209,
"learning_rate": 1.9005847953216373e-06,
"loss": 0.006,
"step": 3670
},
{
"epoch": 96.0,
"eval_accuracy": 0.9705882352941176,
"eval_loss": 0.3180083632469177,
"eval_runtime": 18.1825,
"eval_samples_per_second": 7.48,
"eval_steps_per_second": 0.935,
"step": 3672
},
{
"epoch": 96.20915032679738,
"grad_norm": 4.133842468261719,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.0876,
"step": 3680
},
{
"epoch": 96.47058823529412,
"grad_norm": 14.3917236328125,
"learning_rate": 1.608187134502924e-06,
"loss": 0.0033,
"step": 3690
},
{
"epoch": 96.73202614379085,
"grad_norm": 0.6327334642410278,
"learning_rate": 1.4619883040935671e-06,
"loss": 0.0045,
"step": 3700
},
{
"epoch": 96.99346405228758,
"grad_norm": 0.47620221972465515,
"learning_rate": 1.3157894736842106e-06,
"loss": 0.0172,
"step": 3710
},
{
"epoch": 96.99346405228758,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.3103199303150177,
"eval_runtime": 17.4264,
"eval_samples_per_second": 7.804,
"eval_steps_per_second": 0.976,
"step": 3710
},
{
"epoch": 97.25490196078431,
"grad_norm": 43.838233947753906,
"learning_rate": 1.1695906432748538e-06,
"loss": 0.0047,
"step": 3720
},
{
"epoch": 97.51633986928104,
"grad_norm": 0.001560373231768608,
"learning_rate": 1.0233918128654972e-06,
"loss": 0.0032,
"step": 3730
},
{
"epoch": 97.77777777777777,
"grad_norm": 0.00045679722097702324,
"learning_rate": 8.771929824561404e-07,
"loss": 0.0109,
"step": 3740
},
{
"epoch": 97.98692810457516,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.3034810721874237,
"eval_runtime": 18.06,
"eval_samples_per_second": 7.53,
"eval_steps_per_second": 0.941,
"step": 3748
},
{
"epoch": 98.03921568627452,
"grad_norm": 0.0029410182032734156,
"learning_rate": 7.309941520467836e-07,
"loss": 0.0093,
"step": 3750
},
{
"epoch": 98.30065359477125,
"grad_norm": 0.060371335595846176,
"learning_rate": 5.847953216374269e-07,
"loss": 0.0147,
"step": 3760
},
{
"epoch": 98.56209150326798,
"grad_norm": 0.0018022909061983228,
"learning_rate": 4.385964912280702e-07,
"loss": 0.0325,
"step": 3770
},
{
"epoch": 98.82352941176471,
"grad_norm": 0.866423487663269,
"learning_rate": 2.9239766081871344e-07,
"loss": 0.0172,
"step": 3780
},
{
"epoch": 98.98039215686275,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.3034467101097107,
"eval_runtime": 20.5056,
"eval_samples_per_second": 6.632,
"eval_steps_per_second": 0.829,
"step": 3786
},
{
"epoch": 99.08496732026144,
"grad_norm": 0.015289215371012688,
"learning_rate": 1.4619883040935672e-07,
"loss": 0.0003,
"step": 3790
},
{
"epoch": 99.34640522875817,
"grad_norm": 0.3536844849586487,
"learning_rate": 0.0,
"loss": 0.0219,
"step": 3800
},
{
"epoch": 99.34640522875817,
"eval_accuracy": 0.9779411764705882,
"eval_loss": 0.3036399185657501,
"eval_runtime": 18.1299,
"eval_samples_per_second": 7.501,
"eval_steps_per_second": 0.938,
"step": 3800
},
{
"epoch": 99.34640522875817,
"step": 3800,
"total_flos": 3.0228260830838784e+18,
"train_loss": 0.1524556069365874,
"train_runtime": 23400.6351,
"train_samples_per_second": 5.231,
"train_steps_per_second": 0.162
}
],
"logging_steps": 10,
"max_steps": 3800,
"num_input_tokens_seen": 0,
"num_train_epochs": 100,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 3.0228260830838784e+18,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}