Mayank1996's picture
End of training
777a3d1 verified
{
"best_metric": 0.9375,
"best_model_checkpoint": "videomae-base-finetuned-ucf101-subset_fhbh/checkpoint-638",
"epoch": 24.0020350877193,
"eval_steps": 500,
"global_step": 1450,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0003508771929824561,
"grad_norm": 5.952354907989502,
"learning_rate": 1.7543859649122808e-07,
"loss": 0.745,
"step": 10
},
{
"epoch": 0.0007017543859649122,
"grad_norm": 4.616081714630127,
"learning_rate": 3.5087719298245616e-07,
"loss": 0.6751,
"step": 20
},
{
"epoch": 0.0010526315789473684,
"grad_norm": 14.299074172973633,
"learning_rate": 5.263157894736843e-07,
"loss": 0.7311,
"step": 30
},
{
"epoch": 0.0014035087719298245,
"grad_norm": 9.126326560974121,
"learning_rate": 7.017543859649123e-07,
"loss": 0.6957,
"step": 40
},
{
"epoch": 0.0017543859649122807,
"grad_norm": 6.692790985107422,
"learning_rate": 8.771929824561404e-07,
"loss": 0.7533,
"step": 50
},
{
"epoch": 0.0020350877192982456,
"eval_accuracy": 0.5416666666666666,
"eval_loss": 0.677791178226471,
"eval_runtime": 78.366,
"eval_samples_per_second": 0.613,
"eval_steps_per_second": 0.153,
"step": 58
},
{
"epoch": 1.0000701754385966,
"grad_norm": 6.502946853637695,
"learning_rate": 1.0526315789473685e-06,
"loss": 0.7694,
"step": 60
},
{
"epoch": 1.0004210526315789,
"grad_norm": 11.516799926757812,
"learning_rate": 1.2280701754385965e-06,
"loss": 0.7382,
"step": 70
},
{
"epoch": 1.0007719298245614,
"grad_norm": 7.619742393493652,
"learning_rate": 1.4035087719298246e-06,
"loss": 0.6912,
"step": 80
},
{
"epoch": 1.001122807017544,
"grad_norm": 5.542720794677734,
"learning_rate": 1.5789473684210528e-06,
"loss": 0.7054,
"step": 90
},
{
"epoch": 1.0014736842105263,
"grad_norm": 7.172524929046631,
"learning_rate": 1.7543859649122807e-06,
"loss": 0.7533,
"step": 100
},
{
"epoch": 1.0018245614035088,
"grad_norm": 6.668615341186523,
"learning_rate": 1.929824561403509e-06,
"loss": 0.7229,
"step": 110
},
{
"epoch": 1.0020350877192983,
"eval_accuracy": 0.5416666666666666,
"eval_loss": 0.663836658000946,
"eval_runtime": 77.9477,
"eval_samples_per_second": 0.616,
"eval_steps_per_second": 0.154,
"step": 116
},
{
"epoch": 2.000140350877193,
"grad_norm": 4.909543991088867,
"learning_rate": 2.105263157894737e-06,
"loss": 0.6922,
"step": 120
},
{
"epoch": 2.0004912280701754,
"grad_norm": 9.0471830368042,
"learning_rate": 2.2807017543859652e-06,
"loss": 0.6736,
"step": 130
},
{
"epoch": 2.0008421052631578,
"grad_norm": 6.69089412689209,
"learning_rate": 2.456140350877193e-06,
"loss": 0.6865,
"step": 140
},
{
"epoch": 2.0011929824561405,
"grad_norm": 9.476597785949707,
"learning_rate": 2.631578947368421e-06,
"loss": 0.6844,
"step": 150
},
{
"epoch": 2.001543859649123,
"grad_norm": 7.067219257354736,
"learning_rate": 2.8070175438596493e-06,
"loss": 0.6768,
"step": 160
},
{
"epoch": 2.001894736842105,
"grad_norm": 5.748457908630371,
"learning_rate": 2.9824561403508774e-06,
"loss": 0.6827,
"step": 170
},
{
"epoch": 2.0020350877192983,
"eval_accuracy": 0.6041666666666666,
"eval_loss": 0.6515334248542786,
"eval_runtime": 77.9754,
"eval_samples_per_second": 0.616,
"eval_steps_per_second": 0.154,
"step": 174
},
{
"epoch": 3.0002105263157897,
"grad_norm": 8.415090560913086,
"learning_rate": 3.1578947368421056e-06,
"loss": 0.7035,
"step": 180
},
{
"epoch": 3.000561403508772,
"grad_norm": 7.755239963531494,
"learning_rate": 3.3333333333333333e-06,
"loss": 0.712,
"step": 190
},
{
"epoch": 3.0009122807017543,
"grad_norm": 11.437898635864258,
"learning_rate": 3.5087719298245615e-06,
"loss": 0.6409,
"step": 200
},
{
"epoch": 3.0012631578947366,
"grad_norm": 6.896209239959717,
"learning_rate": 3.6842105263157892e-06,
"loss": 0.6862,
"step": 210
},
{
"epoch": 3.0016140350877194,
"grad_norm": 5.764392852783203,
"learning_rate": 3.859649122807018e-06,
"loss": 0.6459,
"step": 220
},
{
"epoch": 3.0019649122807017,
"grad_norm": 8.806387901306152,
"learning_rate": 4.035087719298246e-06,
"loss": 0.7322,
"step": 230
},
{
"epoch": 3.0020350877192983,
"eval_accuracy": 0.75,
"eval_loss": 0.6666872501373291,
"eval_runtime": 78.325,
"eval_samples_per_second": 0.613,
"eval_steps_per_second": 0.153,
"step": 232
},
{
"epoch": 4.000280701754386,
"grad_norm": 11.507173538208008,
"learning_rate": 4.210526315789474e-06,
"loss": 0.6937,
"step": 240
},
{
"epoch": 4.0006315789473685,
"grad_norm": 7.351099491119385,
"learning_rate": 4.3859649122807014e-06,
"loss": 0.6292,
"step": 250
},
{
"epoch": 4.000982456140351,
"grad_norm": 4.936241149902344,
"learning_rate": 4.5614035087719304e-06,
"loss": 0.6041,
"step": 260
},
{
"epoch": 4.001333333333333,
"grad_norm": 10.265213012695312,
"learning_rate": 4.736842105263159e-06,
"loss": 0.6616,
"step": 270
},
{
"epoch": 4.0016842105263155,
"grad_norm": 14.022355079650879,
"learning_rate": 4.912280701754386e-06,
"loss": 0.6489,
"step": 280
},
{
"epoch": 4.002035087719298,
"grad_norm": 14.538658142089844,
"learning_rate": 5.087719298245614e-06,
"loss": 0.6552,
"step": 290
},
{
"epoch": 4.002035087719298,
"eval_accuracy": 0.75,
"eval_loss": 0.6378026604652405,
"eval_runtime": 78.4115,
"eval_samples_per_second": 0.612,
"eval_steps_per_second": 0.153,
"step": 290
},
{
"epoch": 5.000350877192982,
"grad_norm": 6.908311367034912,
"learning_rate": 5.263157894736842e-06,
"loss": 0.6183,
"step": 300
},
{
"epoch": 5.000701754385965,
"grad_norm": 6.211957931518555,
"learning_rate": 5.43859649122807e-06,
"loss": 0.5759,
"step": 310
},
{
"epoch": 5.001052631578947,
"grad_norm": 4.951029300689697,
"learning_rate": 5.6140350877192985e-06,
"loss": 0.6144,
"step": 320
},
{
"epoch": 5.00140350877193,
"grad_norm": 8.593265533447266,
"learning_rate": 5.789473684210527e-06,
"loss": 0.5619,
"step": 330
},
{
"epoch": 5.0017543859649125,
"grad_norm": 19.80694007873535,
"learning_rate": 5.964912280701755e-06,
"loss": 0.4691,
"step": 340
},
{
"epoch": 5.002035087719298,
"eval_accuracy": 0.75,
"eval_loss": 0.5537357926368713,
"eval_runtime": 80.2663,
"eval_samples_per_second": 0.598,
"eval_steps_per_second": 0.15,
"step": 348
},
{
"epoch": 6.000070175438596,
"grad_norm": 19.27092170715332,
"learning_rate": 6.140350877192982e-06,
"loss": 0.5575,
"step": 350
},
{
"epoch": 6.000421052631579,
"grad_norm": 14.520448684692383,
"learning_rate": 6.315789473684211e-06,
"loss": 0.5209,
"step": 360
},
{
"epoch": 6.000771929824562,
"grad_norm": 13.577587127685547,
"learning_rate": 6.4912280701754385e-06,
"loss": 0.4873,
"step": 370
},
{
"epoch": 6.001122807017544,
"grad_norm": 2.4672834873199463,
"learning_rate": 6.666666666666667e-06,
"loss": 0.3996,
"step": 380
},
{
"epoch": 6.001473684210526,
"grad_norm": 29.06943702697754,
"learning_rate": 6.842105263157896e-06,
"loss": 0.58,
"step": 390
},
{
"epoch": 6.001824561403509,
"grad_norm": 10.214743614196777,
"learning_rate": 7.017543859649123e-06,
"loss": 0.6845,
"step": 400
},
{
"epoch": 6.002035087719298,
"eval_accuracy": 0.7083333333333334,
"eval_loss": 0.6998243927955627,
"eval_runtime": 81.5316,
"eval_samples_per_second": 0.589,
"eval_steps_per_second": 0.147,
"step": 406
},
{
"epoch": 7.000140350877193,
"grad_norm": 72.12657928466797,
"learning_rate": 7.192982456140351e-06,
"loss": 0.6733,
"step": 410
},
{
"epoch": 7.000491228070175,
"grad_norm": 5.446975231170654,
"learning_rate": 7.3684210526315784e-06,
"loss": 0.2873,
"step": 420
},
{
"epoch": 7.000842105263158,
"grad_norm": 9.24228286743164,
"learning_rate": 7.5438596491228074e-06,
"loss": 0.4578,
"step": 430
},
{
"epoch": 7.00119298245614,
"grad_norm": 1.2333711385726929,
"learning_rate": 7.719298245614036e-06,
"loss": 0.3516,
"step": 440
},
{
"epoch": 7.001543859649122,
"grad_norm": 6.666906833648682,
"learning_rate": 7.894736842105263e-06,
"loss": 0.5434,
"step": 450
},
{
"epoch": 7.001894736842106,
"grad_norm": 18.284526824951172,
"learning_rate": 8.070175438596492e-06,
"loss": 0.6754,
"step": 460
},
{
"epoch": 7.002035087719298,
"eval_accuracy": 0.875,
"eval_loss": 0.36466991901397705,
"eval_runtime": 80.8792,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.148,
"step": 464
},
{
"epoch": 8.00021052631579,
"grad_norm": 8.833359718322754,
"learning_rate": 8.245614035087721e-06,
"loss": 0.4877,
"step": 470
},
{
"epoch": 8.000561403508772,
"grad_norm": 10.950183868408203,
"learning_rate": 8.421052631578948e-06,
"loss": 0.3044,
"step": 480
},
{
"epoch": 8.000912280701755,
"grad_norm": 2.037674903869629,
"learning_rate": 8.596491228070176e-06,
"loss": 0.2232,
"step": 490
},
{
"epoch": 8.001263157894737,
"grad_norm": 78.8741455078125,
"learning_rate": 8.771929824561403e-06,
"loss": 0.1771,
"step": 500
},
{
"epoch": 8.00161403508772,
"grad_norm": 90.6770248413086,
"learning_rate": 8.947368421052632e-06,
"loss": 1.1209,
"step": 510
},
{
"epoch": 8.001964912280702,
"grad_norm": 39.13031768798828,
"learning_rate": 9.122807017543861e-06,
"loss": 0.8425,
"step": 520
},
{
"epoch": 8.002035087719298,
"eval_accuracy": 0.5416666666666666,
"eval_loss": 0.6199241876602173,
"eval_runtime": 81.9922,
"eval_samples_per_second": 0.585,
"eval_steps_per_second": 0.146,
"step": 522
},
{
"epoch": 9.000280701754386,
"grad_norm": 17.226152420043945,
"learning_rate": 9.298245614035088e-06,
"loss": 0.7695,
"step": 530
},
{
"epoch": 9.000631578947369,
"grad_norm": 12.632246971130371,
"learning_rate": 9.473684210526317e-06,
"loss": 0.5423,
"step": 540
},
{
"epoch": 9.00098245614035,
"grad_norm": 7.4788336753845215,
"learning_rate": 9.649122807017545e-06,
"loss": 0.6734,
"step": 550
},
{
"epoch": 9.001333333333333,
"grad_norm": 32.823486328125,
"learning_rate": 9.824561403508772e-06,
"loss": 0.4033,
"step": 560
},
{
"epoch": 9.001684210526316,
"grad_norm": 5.6088480949401855,
"learning_rate": 1e-05,
"loss": 0.2009,
"step": 570
},
{
"epoch": 9.002035087719298,
"grad_norm": 0.8267044425010681,
"learning_rate": 1.0175438596491228e-05,
"loss": 0.2276,
"step": 580
},
{
"epoch": 9.002035087719298,
"eval_accuracy": 0.7291666666666666,
"eval_loss": 0.9983854293823242,
"eval_runtime": 81.8828,
"eval_samples_per_second": 0.586,
"eval_steps_per_second": 0.147,
"step": 580
},
{
"epoch": 10.000350877192982,
"grad_norm": 0.6918083429336548,
"learning_rate": 1.0350877192982457e-05,
"loss": 0.4027,
"step": 590
},
{
"epoch": 10.000701754385965,
"grad_norm": 12.070817947387695,
"learning_rate": 1.0526315789473684e-05,
"loss": 0.1868,
"step": 600
},
{
"epoch": 10.001052631578947,
"grad_norm": 11.899476051330566,
"learning_rate": 1.0701754385964913e-05,
"loss": 0.8328,
"step": 610
},
{
"epoch": 10.00140350877193,
"grad_norm": 18.76070213317871,
"learning_rate": 1.087719298245614e-05,
"loss": 0.4753,
"step": 620
},
{
"epoch": 10.001754385964912,
"grad_norm": 15.813506126403809,
"learning_rate": 1.1052631578947368e-05,
"loss": 0.3953,
"step": 630
},
{
"epoch": 10.002035087719298,
"eval_accuracy": 0.9375,
"eval_loss": 0.3595670759677887,
"eval_runtime": 84.5422,
"eval_samples_per_second": 0.568,
"eval_steps_per_second": 0.142,
"step": 638
},
{
"epoch": 11.000070175438596,
"grad_norm": 2.381981372833252,
"learning_rate": 1.1228070175438597e-05,
"loss": 0.3252,
"step": 640
},
{
"epoch": 11.000421052631578,
"grad_norm": 8.495650291442871,
"learning_rate": 1.1403508771929824e-05,
"loss": 0.2205,
"step": 650
},
{
"epoch": 11.00077192982456,
"grad_norm": 0.5458263754844666,
"learning_rate": 1.1578947368421053e-05,
"loss": 0.4623,
"step": 660
},
{
"epoch": 11.001122807017543,
"grad_norm": 35.78744888305664,
"learning_rate": 1.1754385964912282e-05,
"loss": 0.4652,
"step": 670
},
{
"epoch": 11.001473684210527,
"grad_norm": 69.58731842041016,
"learning_rate": 1.192982456140351e-05,
"loss": 0.2175,
"step": 680
},
{
"epoch": 11.00182456140351,
"grad_norm": 80.09464263916016,
"learning_rate": 1.2105263157894737e-05,
"loss": 0.3255,
"step": 690
},
{
"epoch": 11.002035087719298,
"eval_accuracy": 0.9166666666666666,
"eval_loss": 0.39784160256385803,
"eval_runtime": 82.0895,
"eval_samples_per_second": 0.585,
"eval_steps_per_second": 0.146,
"step": 696
},
{
"epoch": 12.000140350877192,
"grad_norm": 0.08766458928585052,
"learning_rate": 1.2280701754385964e-05,
"loss": 0.0288,
"step": 700
},
{
"epoch": 12.000491228070176,
"grad_norm": 10.239900588989258,
"learning_rate": 1.2456140350877193e-05,
"loss": 0.2648,
"step": 710
},
{
"epoch": 12.000842105263159,
"grad_norm": 5.331236839294434,
"learning_rate": 1.2631578947368422e-05,
"loss": 0.3223,
"step": 720
},
{
"epoch": 12.001192982456141,
"grad_norm": 0.24060657620429993,
"learning_rate": 1.2807017543859651e-05,
"loss": 0.2808,
"step": 730
},
{
"epoch": 12.001543859649123,
"grad_norm": 0.31760913133621216,
"learning_rate": 1.2982456140350877e-05,
"loss": 0.2207,
"step": 740
},
{
"epoch": 12.001894736842106,
"grad_norm": 70.13704681396484,
"learning_rate": 1.3157894736842106e-05,
"loss": 0.2524,
"step": 750
},
{
"epoch": 12.002035087719298,
"eval_accuracy": 0.9375,
"eval_loss": 0.3351368010044098,
"eval_runtime": 80.8837,
"eval_samples_per_second": 0.593,
"eval_steps_per_second": 0.148,
"step": 754
},
{
"epoch": 13.00021052631579,
"grad_norm": 0.32135623693466187,
"learning_rate": 1.3333333333333333e-05,
"loss": 0.2225,
"step": 760
},
{
"epoch": 13.000561403508772,
"grad_norm": 21.094276428222656,
"learning_rate": 1.3508771929824562e-05,
"loss": 0.5212,
"step": 770
},
{
"epoch": 13.000912280701755,
"grad_norm": 0.08428701013326645,
"learning_rate": 1.3684210526315791e-05,
"loss": 0.4246,
"step": 780
},
{
"epoch": 13.001263157894737,
"grad_norm": 0.18355534970760345,
"learning_rate": 1.3859649122807017e-05,
"loss": 0.0793,
"step": 790
},
{
"epoch": 13.00161403508772,
"grad_norm": 8.33340072631836,
"learning_rate": 1.4035087719298246e-05,
"loss": 0.3384,
"step": 800
},
{
"epoch": 13.001964912280702,
"grad_norm": 0.7141004204750061,
"learning_rate": 1.4210526315789475e-05,
"loss": 0.5978,
"step": 810
},
{
"epoch": 13.002035087719298,
"eval_accuracy": 0.9375,
"eval_loss": 0.23082482814788818,
"eval_runtime": 81.747,
"eval_samples_per_second": 0.587,
"eval_steps_per_second": 0.147,
"step": 812
},
{
"epoch": 14.000280701754386,
"grad_norm": 0.15585492551326752,
"learning_rate": 1.4385964912280702e-05,
"loss": 0.122,
"step": 820
},
{
"epoch": 14.000631578947369,
"grad_norm": 49.04802322387695,
"learning_rate": 1.4561403508771931e-05,
"loss": 0.522,
"step": 830
},
{
"epoch": 14.00098245614035,
"grad_norm": 0.3657858967781067,
"learning_rate": 1.4736842105263157e-05,
"loss": 0.0476,
"step": 840
},
{
"epoch": 14.001333333333333,
"grad_norm": 0.05123307183384895,
"learning_rate": 1.4912280701754386e-05,
"loss": 0.2268,
"step": 850
},
{
"epoch": 14.001684210526316,
"grad_norm": 0.08785073459148407,
"learning_rate": 1.5087719298245615e-05,
"loss": 0.4392,
"step": 860
},
{
"epoch": 14.002035087719298,
"grad_norm": 0.33805736899375916,
"learning_rate": 1.5263157894736842e-05,
"loss": 0.1542,
"step": 870
},
{
"epoch": 14.002035087719298,
"eval_accuracy": 0.8958333333333334,
"eval_loss": 0.5762323141098022,
"eval_runtime": 82.832,
"eval_samples_per_second": 0.579,
"eval_steps_per_second": 0.145,
"step": 870
},
{
"epoch": 15.000350877192982,
"grad_norm": 0.06892251968383789,
"learning_rate": 1.543859649122807e-05,
"loss": 0.1377,
"step": 880
},
{
"epoch": 15.000701754385965,
"grad_norm": 0.07005161046981812,
"learning_rate": 1.56140350877193e-05,
"loss": 0.0053,
"step": 890
},
{
"epoch": 15.001052631578947,
"grad_norm": 0.03198734670877457,
"learning_rate": 1.5789473684210526e-05,
"loss": 0.5775,
"step": 900
},
{
"epoch": 15.00140350877193,
"grad_norm": 171.48255920410156,
"learning_rate": 1.5964912280701755e-05,
"loss": 0.3737,
"step": 910
},
{
"epoch": 15.001754385964912,
"grad_norm": 0.4068077504634857,
"learning_rate": 1.6140350877192984e-05,
"loss": 0.3073,
"step": 920
},
{
"epoch": 15.002035087719298,
"eval_accuracy": 0.8958333333333334,
"eval_loss": 0.33416375517845154,
"eval_runtime": 83.3591,
"eval_samples_per_second": 0.576,
"eval_steps_per_second": 0.144,
"step": 928
},
{
"epoch": 16.000070175438598,
"grad_norm": 0.3335668444633484,
"learning_rate": 1.6315789473684213e-05,
"loss": 0.7197,
"step": 930
},
{
"epoch": 16.00042105263158,
"grad_norm": 1.4757983684539795,
"learning_rate": 1.6491228070175442e-05,
"loss": 0.2539,
"step": 940
},
{
"epoch": 16.000771929824563,
"grad_norm": 0.17356331646442413,
"learning_rate": 1.6666666666666667e-05,
"loss": 0.0063,
"step": 950
},
{
"epoch": 16.001122807017545,
"grad_norm": 0.1452503204345703,
"learning_rate": 1.6842105263157896e-05,
"loss": 0.5967,
"step": 960
},
{
"epoch": 16.001473684210527,
"grad_norm": 0.1030503362417221,
"learning_rate": 1.7017543859649125e-05,
"loss": 0.6578,
"step": 970
},
{
"epoch": 16.00182456140351,
"grad_norm": 12.400784492492676,
"learning_rate": 1.719298245614035e-05,
"loss": 0.5518,
"step": 980
},
{
"epoch": 16.0020350877193,
"eval_accuracy": 0.8541666666666666,
"eval_loss": 0.4223368465900421,
"eval_runtime": 83.4362,
"eval_samples_per_second": 0.575,
"eval_steps_per_second": 0.144,
"step": 986
},
{
"epoch": 17.000140350877192,
"grad_norm": 0.28909754753112793,
"learning_rate": 1.736842105263158e-05,
"loss": 0.2008,
"step": 990
},
{
"epoch": 17.000491228070175,
"grad_norm": 0.21579360961914062,
"learning_rate": 1.7543859649122806e-05,
"loss": 0.3298,
"step": 1000
},
{
"epoch": 17.000842105263157,
"grad_norm": 0.10615105926990509,
"learning_rate": 1.7719298245614035e-05,
"loss": 0.004,
"step": 1010
},
{
"epoch": 17.00119298245614,
"grad_norm": 0.046201951801776886,
"learning_rate": 1.7894736842105264e-05,
"loss": 0.3526,
"step": 1020
},
{
"epoch": 17.00154385964912,
"grad_norm": 0.06010481342673302,
"learning_rate": 1.8070175438596493e-05,
"loss": 0.3399,
"step": 1030
},
{
"epoch": 17.001894736842104,
"grad_norm": 8.584966659545898,
"learning_rate": 1.8245614035087722e-05,
"loss": 0.6157,
"step": 1040
},
{
"epoch": 17.0020350877193,
"eval_accuracy": 0.9375,
"eval_loss": 0.17038817703723907,
"eval_runtime": 83.7401,
"eval_samples_per_second": 0.573,
"eval_steps_per_second": 0.143,
"step": 1044
},
{
"epoch": 18.00021052631579,
"grad_norm": 0.774956464767456,
"learning_rate": 1.8421052631578947e-05,
"loss": 0.1596,
"step": 1050
},
{
"epoch": 18.000561403508772,
"grad_norm": 0.36749064922332764,
"learning_rate": 1.8596491228070176e-05,
"loss": 0.2122,
"step": 1060
},
{
"epoch": 18.000912280701755,
"grad_norm": 0.06645552814006805,
"learning_rate": 1.8771929824561405e-05,
"loss": 0.2568,
"step": 1070
},
{
"epoch": 18.001263157894737,
"grad_norm": 0.021599041298031807,
"learning_rate": 1.8947368421052634e-05,
"loss": 0.283,
"step": 1080
},
{
"epoch": 18.00161403508772,
"grad_norm": 113.25637817382812,
"learning_rate": 1.9122807017543863e-05,
"loss": 0.3591,
"step": 1090
},
{
"epoch": 18.0019649122807,
"grad_norm": 0.21973834931850433,
"learning_rate": 1.929824561403509e-05,
"loss": 0.2544,
"step": 1100
},
{
"epoch": 18.0020350877193,
"eval_accuracy": 0.9166666666666666,
"eval_loss": 0.35440635681152344,
"eval_runtime": 82.2034,
"eval_samples_per_second": 0.584,
"eval_steps_per_second": 0.146,
"step": 1102
},
{
"epoch": 19.000280701754384,
"grad_norm": 0.06097158417105675,
"learning_rate": 1.9473684210526315e-05,
"loss": 0.3663,
"step": 1110
},
{
"epoch": 19.000631578947367,
"grad_norm": 25.72997283935547,
"learning_rate": 1.9649122807017544e-05,
"loss": 0.8104,
"step": 1120
},
{
"epoch": 19.000982456140353,
"grad_norm": 0.5115303993225098,
"learning_rate": 1.9824561403508773e-05,
"loss": 0.2474,
"step": 1130
},
{
"epoch": 19.001333333333335,
"grad_norm": 0.27492067217826843,
"learning_rate": 2e-05,
"loss": 0.3686,
"step": 1140
},
{
"epoch": 19.001684210526317,
"grad_norm": 22.944690704345703,
"learning_rate": 2.0175438596491227e-05,
"loss": 0.2315,
"step": 1150
},
{
"epoch": 19.0020350877193,
"grad_norm": 0.11991500854492188,
"learning_rate": 2.0350877192982456e-05,
"loss": 0.4036,
"step": 1160
},
{
"epoch": 19.0020350877193,
"eval_accuracy": 0.9166666666666666,
"eval_loss": 0.25051262974739075,
"eval_runtime": 80.7899,
"eval_samples_per_second": 0.594,
"eval_steps_per_second": 0.149,
"step": 1160
},
{
"epoch": 20.000350877192982,
"grad_norm": 0.44547587633132935,
"learning_rate": 2.0526315789473685e-05,
"loss": 0.1078,
"step": 1170
},
{
"epoch": 20.000701754385965,
"grad_norm": 76.07775115966797,
"learning_rate": 2.0701754385964914e-05,
"loss": 0.4915,
"step": 1180
},
{
"epoch": 20.001052631578947,
"grad_norm": 0.349282830953598,
"learning_rate": 2.0877192982456143e-05,
"loss": 0.2929,
"step": 1190
},
{
"epoch": 20.00140350877193,
"grad_norm": 8.304322242736816,
"learning_rate": 2.105263157894737e-05,
"loss": 0.219,
"step": 1200
},
{
"epoch": 20.00175438596491,
"grad_norm": 0.08941491693258286,
"learning_rate": 2.1228070175438598e-05,
"loss": 0.2382,
"step": 1210
},
{
"epoch": 20.0020350877193,
"eval_accuracy": 0.9375,
"eval_loss": 0.3155660927295685,
"eval_runtime": 82.6296,
"eval_samples_per_second": 0.581,
"eval_steps_per_second": 0.145,
"step": 1218
},
{
"epoch": 21.000070175438598,
"grad_norm": 6.294134140014648,
"learning_rate": 2.1403508771929827e-05,
"loss": 0.2611,
"step": 1220
},
{
"epoch": 21.00042105263158,
"grad_norm": 0.11261521279811859,
"learning_rate": 2.1578947368421053e-05,
"loss": 0.1969,
"step": 1230
},
{
"epoch": 21.000771929824563,
"grad_norm": 0.2796896696090698,
"learning_rate": 2.175438596491228e-05,
"loss": 0.2955,
"step": 1240
},
{
"epoch": 21.001122807017545,
"grad_norm": 0.07930008322000504,
"learning_rate": 2.1929824561403507e-05,
"loss": 0.013,
"step": 1250
},
{
"epoch": 21.001473684210527,
"grad_norm": 5.909428119659424,
"learning_rate": 2.2105263157894736e-05,
"loss": 0.3568,
"step": 1260
},
{
"epoch": 21.00182456140351,
"grad_norm": 168.33380126953125,
"learning_rate": 2.2280701754385965e-05,
"loss": 0.6751,
"step": 1270
},
{
"epoch": 21.0020350877193,
"eval_accuracy": 0.9375,
"eval_loss": 0.259630411863327,
"eval_runtime": 82.1271,
"eval_samples_per_second": 0.584,
"eval_steps_per_second": 0.146,
"step": 1276
},
{
"epoch": 22.000140350877192,
"grad_norm": 0.22503353655338287,
"learning_rate": 2.2456140350877194e-05,
"loss": 0.3249,
"step": 1280
},
{
"epoch": 22.000491228070175,
"grad_norm": 0.2562604248523712,
"learning_rate": 2.2631578947368423e-05,
"loss": 0.2267,
"step": 1290
},
{
"epoch": 22.000842105263157,
"grad_norm": 0.6118970513343811,
"learning_rate": 2.280701754385965e-05,
"loss": 0.7495,
"step": 1300
},
{
"epoch": 22.00119298245614,
"grad_norm": 0.2397994101047516,
"learning_rate": 2.2982456140350878e-05,
"loss": 0.0388,
"step": 1310
},
{
"epoch": 22.00154385964912,
"grad_norm": 0.10384727269411087,
"learning_rate": 2.3157894736842107e-05,
"loss": 0.3285,
"step": 1320
},
{
"epoch": 22.001894736842104,
"grad_norm": 0.0419117733836174,
"learning_rate": 2.3333333333333336e-05,
"loss": 0.2848,
"step": 1330
},
{
"epoch": 22.0020350877193,
"eval_accuracy": 0.8125,
"eval_loss": 0.822706937789917,
"eval_runtime": 83.5818,
"eval_samples_per_second": 0.574,
"eval_steps_per_second": 0.144,
"step": 1334
},
{
"epoch": 23.00021052631579,
"grad_norm": 121.5499038696289,
"learning_rate": 2.3508771929824565e-05,
"loss": 0.5364,
"step": 1340
},
{
"epoch": 23.000561403508772,
"grad_norm": 0.10266309231519699,
"learning_rate": 2.368421052631579e-05,
"loss": 0.8097,
"step": 1350
},
{
"epoch": 23.000912280701755,
"grad_norm": 9.736127853393555,
"learning_rate": 2.385964912280702e-05,
"loss": 0.6052,
"step": 1360
},
{
"epoch": 23.001263157894737,
"grad_norm": 4.3637471199035645,
"learning_rate": 2.4035087719298245e-05,
"loss": 0.3504,
"step": 1370
},
{
"epoch": 23.00161403508772,
"grad_norm": 0.19882246851921082,
"learning_rate": 2.4210526315789474e-05,
"loss": 0.3784,
"step": 1380
},
{
"epoch": 23.0019649122807,
"grad_norm": 0.27082210779190063,
"learning_rate": 2.4385964912280703e-05,
"loss": 0.1225,
"step": 1390
},
{
"epoch": 23.0020350877193,
"eval_accuracy": 0.9375,
"eval_loss": 0.2921377420425415,
"eval_runtime": 82.9556,
"eval_samples_per_second": 0.579,
"eval_steps_per_second": 0.145,
"step": 1392
},
{
"epoch": 24.000280701754384,
"grad_norm": 0.2171986997127533,
"learning_rate": 2.456140350877193e-05,
"loss": 0.1094,
"step": 1400
},
{
"epoch": 24.000631578947367,
"grad_norm": 0.21692253649234772,
"learning_rate": 2.4736842105263158e-05,
"loss": 0.3332,
"step": 1410
},
{
"epoch": 24.000982456140353,
"grad_norm": 0.3834693729877472,
"learning_rate": 2.4912280701754387e-05,
"loss": 0.2847,
"step": 1420
},
{
"epoch": 24.001333333333335,
"grad_norm": 0.08816500753164291,
"learning_rate": 2.5087719298245616e-05,
"loss": 0.1147,
"step": 1430
},
{
"epoch": 24.001684210526317,
"grad_norm": 0.21103212237358093,
"learning_rate": 2.5263157894736845e-05,
"loss": 0.4283,
"step": 1440
},
{
"epoch": 24.0020350877193,
"grad_norm": 0.27631059288978577,
"learning_rate": 2.5438596491228074e-05,
"loss": 0.616,
"step": 1450
},
{
"epoch": 24.0020350877193,
"eval_accuracy": 0.9375,
"eval_loss": 0.2928893566131592,
"eval_runtime": 81.6066,
"eval_samples_per_second": 0.588,
"eval_steps_per_second": 0.147,
"step": 1450
},
{
"epoch": 24.0020350877193,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.2618250548839569,
"eval_runtime": 88.9045,
"eval_samples_per_second": 0.574,
"eval_steps_per_second": 0.146,
"step": 1450
},
{
"epoch": 24.0020350877193,
"eval_accuracy": 0.9411764705882353,
"eval_loss": 0.2618250548839569,
"eval_runtime": 89.4759,
"eval_samples_per_second": 0.57,
"eval_steps_per_second": 0.145,
"step": 1450
}
],
"logging_steps": 10,
"max_steps": 28500,
"num_input_tokens_seen": 0,
"num_train_epochs": 9223372036854775807,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.164871389462528e+18,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}