{ "best_metric": 0.9375, "best_model_checkpoint": "videomae-base-finetuned-ucf101-subset_fhbh/checkpoint-638", "epoch": 24.0020350877193, "eval_steps": 500, "global_step": 1450, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003508771929824561, "grad_norm": 5.952354907989502, "learning_rate": 1.7543859649122808e-07, "loss": 0.745, "step": 10 }, { "epoch": 0.0007017543859649122, "grad_norm": 4.616081714630127, "learning_rate": 3.5087719298245616e-07, "loss": 0.6751, "step": 20 }, { "epoch": 0.0010526315789473684, "grad_norm": 14.299074172973633, "learning_rate": 5.263157894736843e-07, "loss": 0.7311, "step": 30 }, { "epoch": 0.0014035087719298245, "grad_norm": 9.126326560974121, "learning_rate": 7.017543859649123e-07, "loss": 0.6957, "step": 40 }, { "epoch": 0.0017543859649122807, "grad_norm": 6.692790985107422, "learning_rate": 8.771929824561404e-07, "loss": 0.7533, "step": 50 }, { "epoch": 0.0020350877192982456, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.677791178226471, "eval_runtime": 78.366, "eval_samples_per_second": 0.613, "eval_steps_per_second": 0.153, "step": 58 }, { "epoch": 1.0000701754385966, "grad_norm": 6.502946853637695, "learning_rate": 1.0526315789473685e-06, "loss": 0.7694, "step": 60 }, { "epoch": 1.0004210526315789, "grad_norm": 11.516799926757812, "learning_rate": 1.2280701754385965e-06, "loss": 0.7382, "step": 70 }, { "epoch": 1.0007719298245614, "grad_norm": 7.619742393493652, "learning_rate": 1.4035087719298246e-06, "loss": 0.6912, "step": 80 }, { "epoch": 1.001122807017544, "grad_norm": 5.542720794677734, "learning_rate": 1.5789473684210528e-06, "loss": 0.7054, "step": 90 }, { "epoch": 1.0014736842105263, "grad_norm": 7.172524929046631, "learning_rate": 1.7543859649122807e-06, "loss": 0.7533, "step": 100 }, { "epoch": 1.0018245614035088, "grad_norm": 6.668615341186523, "learning_rate": 1.929824561403509e-06, "loss": 0.7229, "step": 110 }, { "epoch": 1.0020350877192983, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.663836658000946, "eval_runtime": 77.9477, "eval_samples_per_second": 0.616, "eval_steps_per_second": 0.154, "step": 116 }, { "epoch": 2.000140350877193, "grad_norm": 4.909543991088867, "learning_rate": 2.105263157894737e-06, "loss": 0.6922, "step": 120 }, { "epoch": 2.0004912280701754, "grad_norm": 9.0471830368042, "learning_rate": 2.2807017543859652e-06, "loss": 0.6736, "step": 130 }, { "epoch": 2.0008421052631578, "grad_norm": 6.69089412689209, "learning_rate": 2.456140350877193e-06, "loss": 0.6865, "step": 140 }, { "epoch": 2.0011929824561405, "grad_norm": 9.476597785949707, "learning_rate": 2.631578947368421e-06, "loss": 0.6844, "step": 150 }, { "epoch": 2.001543859649123, "grad_norm": 7.067219257354736, "learning_rate": 2.8070175438596493e-06, "loss": 0.6768, "step": 160 }, { "epoch": 2.001894736842105, "grad_norm": 5.748457908630371, "learning_rate": 2.9824561403508774e-06, "loss": 0.6827, "step": 170 }, { "epoch": 2.0020350877192983, "eval_accuracy": 0.6041666666666666, "eval_loss": 0.6515334248542786, "eval_runtime": 77.9754, "eval_samples_per_second": 0.616, "eval_steps_per_second": 0.154, "step": 174 }, { "epoch": 3.0002105263157897, "grad_norm": 8.415090560913086, "learning_rate": 3.1578947368421056e-06, "loss": 0.7035, "step": 180 }, { "epoch": 3.000561403508772, "grad_norm": 7.755239963531494, "learning_rate": 3.3333333333333333e-06, "loss": 0.712, "step": 190 }, { "epoch": 3.0009122807017543, "grad_norm": 11.437898635864258, "learning_rate": 3.5087719298245615e-06, "loss": 0.6409, "step": 200 }, { "epoch": 3.0012631578947366, "grad_norm": 6.896209239959717, "learning_rate": 3.6842105263157892e-06, "loss": 0.6862, "step": 210 }, { "epoch": 3.0016140350877194, "grad_norm": 5.764392852783203, "learning_rate": 3.859649122807018e-06, "loss": 0.6459, "step": 220 }, { "epoch": 3.0019649122807017, "grad_norm": 8.806387901306152, "learning_rate": 4.035087719298246e-06, "loss": 0.7322, "step": 230 }, { "epoch": 3.0020350877192983, "eval_accuracy": 0.75, "eval_loss": 0.6666872501373291, "eval_runtime": 78.325, "eval_samples_per_second": 0.613, "eval_steps_per_second": 0.153, "step": 232 }, { "epoch": 4.000280701754386, "grad_norm": 11.507173538208008, "learning_rate": 4.210526315789474e-06, "loss": 0.6937, "step": 240 }, { "epoch": 4.0006315789473685, "grad_norm": 7.351099491119385, "learning_rate": 4.3859649122807014e-06, "loss": 0.6292, "step": 250 }, { "epoch": 4.000982456140351, "grad_norm": 4.936241149902344, "learning_rate": 4.5614035087719304e-06, "loss": 0.6041, "step": 260 }, { "epoch": 4.001333333333333, "grad_norm": 10.265213012695312, "learning_rate": 4.736842105263159e-06, "loss": 0.6616, "step": 270 }, { "epoch": 4.0016842105263155, "grad_norm": 14.022355079650879, "learning_rate": 4.912280701754386e-06, "loss": 0.6489, "step": 280 }, { "epoch": 4.002035087719298, "grad_norm": 14.538658142089844, "learning_rate": 5.087719298245614e-06, "loss": 0.6552, "step": 290 }, { "epoch": 4.002035087719298, "eval_accuracy": 0.75, "eval_loss": 0.6378026604652405, "eval_runtime": 78.4115, "eval_samples_per_second": 0.612, "eval_steps_per_second": 0.153, "step": 290 }, { "epoch": 5.000350877192982, "grad_norm": 6.908311367034912, "learning_rate": 5.263157894736842e-06, "loss": 0.6183, "step": 300 }, { "epoch": 5.000701754385965, "grad_norm": 6.211957931518555, "learning_rate": 5.43859649122807e-06, "loss": 0.5759, "step": 310 }, { "epoch": 5.001052631578947, "grad_norm": 4.951029300689697, "learning_rate": 5.6140350877192985e-06, "loss": 0.6144, "step": 320 }, { "epoch": 5.00140350877193, "grad_norm": 8.593265533447266, "learning_rate": 5.789473684210527e-06, "loss": 0.5619, "step": 330 }, { "epoch": 5.0017543859649125, "grad_norm": 19.80694007873535, "learning_rate": 5.964912280701755e-06, "loss": 0.4691, "step": 340 }, { "epoch": 5.002035087719298, "eval_accuracy": 0.75, "eval_loss": 0.5537357926368713, "eval_runtime": 80.2663, "eval_samples_per_second": 0.598, "eval_steps_per_second": 0.15, "step": 348 }, { "epoch": 6.000070175438596, "grad_norm": 19.27092170715332, "learning_rate": 6.140350877192982e-06, "loss": 0.5575, "step": 350 }, { "epoch": 6.000421052631579, "grad_norm": 14.520448684692383, "learning_rate": 6.315789473684211e-06, "loss": 0.5209, "step": 360 }, { "epoch": 6.000771929824562, "grad_norm": 13.577587127685547, "learning_rate": 6.4912280701754385e-06, "loss": 0.4873, "step": 370 }, { "epoch": 6.001122807017544, "grad_norm": 2.4672834873199463, "learning_rate": 6.666666666666667e-06, "loss": 0.3996, "step": 380 }, { "epoch": 6.001473684210526, "grad_norm": 29.06943702697754, "learning_rate": 6.842105263157896e-06, "loss": 0.58, "step": 390 }, { "epoch": 6.001824561403509, "grad_norm": 10.214743614196777, "learning_rate": 7.017543859649123e-06, "loss": 0.6845, "step": 400 }, { "epoch": 6.002035087719298, "eval_accuracy": 0.7083333333333334, "eval_loss": 0.6998243927955627, "eval_runtime": 81.5316, "eval_samples_per_second": 0.589, "eval_steps_per_second": 0.147, "step": 406 }, { "epoch": 7.000140350877193, "grad_norm": 72.12657928466797, "learning_rate": 7.192982456140351e-06, "loss": 0.6733, "step": 410 }, { "epoch": 7.000491228070175, "grad_norm": 5.446975231170654, "learning_rate": 7.3684210526315784e-06, "loss": 0.2873, "step": 420 }, { "epoch": 7.000842105263158, "grad_norm": 9.24228286743164, "learning_rate": 7.5438596491228074e-06, "loss": 0.4578, "step": 430 }, { "epoch": 7.00119298245614, "grad_norm": 1.2333711385726929, "learning_rate": 7.719298245614036e-06, "loss": 0.3516, "step": 440 }, { "epoch": 7.001543859649122, "grad_norm": 6.666906833648682, "learning_rate": 7.894736842105263e-06, "loss": 0.5434, "step": 450 }, { "epoch": 7.001894736842106, "grad_norm": 18.284526824951172, "learning_rate": 8.070175438596492e-06, "loss": 0.6754, "step": 460 }, { "epoch": 7.002035087719298, "eval_accuracy": 0.875, "eval_loss": 0.36466991901397705, "eval_runtime": 80.8792, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.148, "step": 464 }, { "epoch": 8.00021052631579, "grad_norm": 8.833359718322754, "learning_rate": 8.245614035087721e-06, "loss": 0.4877, "step": 470 }, { "epoch": 8.000561403508772, "grad_norm": 10.950183868408203, "learning_rate": 8.421052631578948e-06, "loss": 0.3044, "step": 480 }, { "epoch": 8.000912280701755, "grad_norm": 2.037674903869629, "learning_rate": 8.596491228070176e-06, "loss": 0.2232, "step": 490 }, { "epoch": 8.001263157894737, "grad_norm": 78.8741455078125, "learning_rate": 8.771929824561403e-06, "loss": 0.1771, "step": 500 }, { "epoch": 8.00161403508772, "grad_norm": 90.6770248413086, "learning_rate": 8.947368421052632e-06, "loss": 1.1209, "step": 510 }, { "epoch": 8.001964912280702, "grad_norm": 39.13031768798828, "learning_rate": 9.122807017543861e-06, "loss": 0.8425, "step": 520 }, { "epoch": 8.002035087719298, "eval_accuracy": 0.5416666666666666, "eval_loss": 0.6199241876602173, "eval_runtime": 81.9922, "eval_samples_per_second": 0.585, "eval_steps_per_second": 0.146, "step": 522 }, { "epoch": 9.000280701754386, "grad_norm": 17.226152420043945, "learning_rate": 9.298245614035088e-06, "loss": 0.7695, "step": 530 }, { "epoch": 9.000631578947369, "grad_norm": 12.632246971130371, "learning_rate": 9.473684210526317e-06, "loss": 0.5423, "step": 540 }, { "epoch": 9.00098245614035, "grad_norm": 7.4788336753845215, "learning_rate": 9.649122807017545e-06, "loss": 0.6734, "step": 550 }, { "epoch": 9.001333333333333, "grad_norm": 32.823486328125, "learning_rate": 9.824561403508772e-06, "loss": 0.4033, "step": 560 }, { "epoch": 9.001684210526316, "grad_norm": 5.6088480949401855, "learning_rate": 1e-05, "loss": 0.2009, "step": 570 }, { "epoch": 9.002035087719298, "grad_norm": 0.8267044425010681, "learning_rate": 1.0175438596491228e-05, "loss": 0.2276, "step": 580 }, { "epoch": 9.002035087719298, "eval_accuracy": 0.7291666666666666, "eval_loss": 0.9983854293823242, "eval_runtime": 81.8828, "eval_samples_per_second": 0.586, "eval_steps_per_second": 0.147, "step": 580 }, { "epoch": 10.000350877192982, "grad_norm": 0.6918083429336548, "learning_rate": 1.0350877192982457e-05, "loss": 0.4027, "step": 590 }, { "epoch": 10.000701754385965, "grad_norm": 12.070817947387695, "learning_rate": 1.0526315789473684e-05, "loss": 0.1868, "step": 600 }, { "epoch": 10.001052631578947, "grad_norm": 11.899476051330566, "learning_rate": 1.0701754385964913e-05, "loss": 0.8328, "step": 610 }, { "epoch": 10.00140350877193, "grad_norm": 18.76070213317871, "learning_rate": 1.087719298245614e-05, "loss": 0.4753, "step": 620 }, { "epoch": 10.001754385964912, "grad_norm": 15.813506126403809, "learning_rate": 1.1052631578947368e-05, "loss": 0.3953, "step": 630 }, { "epoch": 10.002035087719298, "eval_accuracy": 0.9375, "eval_loss": 0.3595670759677887, "eval_runtime": 84.5422, "eval_samples_per_second": 0.568, "eval_steps_per_second": 0.142, "step": 638 }, { "epoch": 11.000070175438596, "grad_norm": 2.381981372833252, "learning_rate": 1.1228070175438597e-05, "loss": 0.3252, "step": 640 }, { "epoch": 11.000421052631578, "grad_norm": 8.495650291442871, "learning_rate": 1.1403508771929824e-05, "loss": 0.2205, "step": 650 }, { "epoch": 11.00077192982456, "grad_norm": 0.5458263754844666, "learning_rate": 1.1578947368421053e-05, "loss": 0.4623, "step": 660 }, { "epoch": 11.001122807017543, "grad_norm": 35.78744888305664, "learning_rate": 1.1754385964912282e-05, "loss": 0.4652, "step": 670 }, { "epoch": 11.001473684210527, "grad_norm": 69.58731842041016, "learning_rate": 1.192982456140351e-05, "loss": 0.2175, "step": 680 }, { "epoch": 11.00182456140351, "grad_norm": 80.09464263916016, "learning_rate": 1.2105263157894737e-05, "loss": 0.3255, "step": 690 }, { "epoch": 11.002035087719298, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.39784160256385803, "eval_runtime": 82.0895, "eval_samples_per_second": 0.585, "eval_steps_per_second": 0.146, "step": 696 }, { "epoch": 12.000140350877192, "grad_norm": 0.08766458928585052, "learning_rate": 1.2280701754385964e-05, "loss": 0.0288, "step": 700 }, { "epoch": 12.000491228070176, "grad_norm": 10.239900588989258, "learning_rate": 1.2456140350877193e-05, "loss": 0.2648, "step": 710 }, { "epoch": 12.000842105263159, "grad_norm": 5.331236839294434, "learning_rate": 1.2631578947368422e-05, "loss": 0.3223, "step": 720 }, { "epoch": 12.001192982456141, "grad_norm": 0.24060657620429993, "learning_rate": 1.2807017543859651e-05, "loss": 0.2808, "step": 730 }, { "epoch": 12.001543859649123, "grad_norm": 0.31760913133621216, "learning_rate": 1.2982456140350877e-05, "loss": 0.2207, "step": 740 }, { "epoch": 12.001894736842106, "grad_norm": 70.13704681396484, "learning_rate": 1.3157894736842106e-05, "loss": 0.2524, "step": 750 }, { "epoch": 12.002035087719298, "eval_accuracy": 0.9375, "eval_loss": 0.3351368010044098, "eval_runtime": 80.8837, "eval_samples_per_second": 0.593, "eval_steps_per_second": 0.148, "step": 754 }, { "epoch": 13.00021052631579, "grad_norm": 0.32135623693466187, "learning_rate": 1.3333333333333333e-05, "loss": 0.2225, "step": 760 }, { "epoch": 13.000561403508772, "grad_norm": 21.094276428222656, "learning_rate": 1.3508771929824562e-05, "loss": 0.5212, "step": 770 }, { "epoch": 13.000912280701755, "grad_norm": 0.08428701013326645, "learning_rate": 1.3684210526315791e-05, "loss": 0.4246, "step": 780 }, { "epoch": 13.001263157894737, "grad_norm": 0.18355534970760345, "learning_rate": 1.3859649122807017e-05, "loss": 0.0793, "step": 790 }, { "epoch": 13.00161403508772, "grad_norm": 8.33340072631836, "learning_rate": 1.4035087719298246e-05, "loss": 0.3384, "step": 800 }, { "epoch": 13.001964912280702, "grad_norm": 0.7141004204750061, "learning_rate": 1.4210526315789475e-05, "loss": 0.5978, "step": 810 }, { "epoch": 13.002035087719298, "eval_accuracy": 0.9375, "eval_loss": 0.23082482814788818, "eval_runtime": 81.747, "eval_samples_per_second": 0.587, "eval_steps_per_second": 0.147, "step": 812 }, { "epoch": 14.000280701754386, "grad_norm": 0.15585492551326752, "learning_rate": 1.4385964912280702e-05, "loss": 0.122, "step": 820 }, { "epoch": 14.000631578947369, "grad_norm": 49.04802322387695, "learning_rate": 1.4561403508771931e-05, "loss": 0.522, "step": 830 }, { "epoch": 14.00098245614035, "grad_norm": 0.3657858967781067, "learning_rate": 1.4736842105263157e-05, "loss": 0.0476, "step": 840 }, { "epoch": 14.001333333333333, "grad_norm": 0.05123307183384895, "learning_rate": 1.4912280701754386e-05, "loss": 0.2268, "step": 850 }, { "epoch": 14.001684210526316, "grad_norm": 0.08785073459148407, "learning_rate": 1.5087719298245615e-05, "loss": 0.4392, "step": 860 }, { "epoch": 14.002035087719298, "grad_norm": 0.33805736899375916, "learning_rate": 1.5263157894736842e-05, "loss": 0.1542, "step": 870 }, { "epoch": 14.002035087719298, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.5762323141098022, "eval_runtime": 82.832, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.145, "step": 870 }, { "epoch": 15.000350877192982, "grad_norm": 0.06892251968383789, "learning_rate": 1.543859649122807e-05, "loss": 0.1377, "step": 880 }, { "epoch": 15.000701754385965, "grad_norm": 0.07005161046981812, "learning_rate": 1.56140350877193e-05, "loss": 0.0053, "step": 890 }, { "epoch": 15.001052631578947, "grad_norm": 0.03198734670877457, "learning_rate": 1.5789473684210526e-05, "loss": 0.5775, "step": 900 }, { "epoch": 15.00140350877193, "grad_norm": 171.48255920410156, "learning_rate": 1.5964912280701755e-05, "loss": 0.3737, "step": 910 }, { "epoch": 15.001754385964912, "grad_norm": 0.4068077504634857, "learning_rate": 1.6140350877192984e-05, "loss": 0.3073, "step": 920 }, { "epoch": 15.002035087719298, "eval_accuracy": 0.8958333333333334, "eval_loss": 0.33416375517845154, "eval_runtime": 83.3591, "eval_samples_per_second": 0.576, "eval_steps_per_second": 0.144, "step": 928 }, { "epoch": 16.000070175438598, "grad_norm": 0.3335668444633484, "learning_rate": 1.6315789473684213e-05, "loss": 0.7197, "step": 930 }, { "epoch": 16.00042105263158, "grad_norm": 1.4757983684539795, "learning_rate": 1.6491228070175442e-05, "loss": 0.2539, "step": 940 }, { "epoch": 16.000771929824563, "grad_norm": 0.17356331646442413, "learning_rate": 1.6666666666666667e-05, "loss": 0.0063, "step": 950 }, { "epoch": 16.001122807017545, "grad_norm": 0.1452503204345703, "learning_rate": 1.6842105263157896e-05, "loss": 0.5967, "step": 960 }, { "epoch": 16.001473684210527, "grad_norm": 0.1030503362417221, "learning_rate": 1.7017543859649125e-05, "loss": 0.6578, "step": 970 }, { "epoch": 16.00182456140351, "grad_norm": 12.400784492492676, "learning_rate": 1.719298245614035e-05, "loss": 0.5518, "step": 980 }, { "epoch": 16.0020350877193, "eval_accuracy": 0.8541666666666666, "eval_loss": 0.4223368465900421, "eval_runtime": 83.4362, "eval_samples_per_second": 0.575, "eval_steps_per_second": 0.144, "step": 986 }, { "epoch": 17.000140350877192, "grad_norm": 0.28909754753112793, "learning_rate": 1.736842105263158e-05, "loss": 0.2008, "step": 990 }, { "epoch": 17.000491228070175, "grad_norm": 0.21579360961914062, "learning_rate": 1.7543859649122806e-05, "loss": 0.3298, "step": 1000 }, { "epoch": 17.000842105263157, "grad_norm": 0.10615105926990509, "learning_rate": 1.7719298245614035e-05, "loss": 0.004, "step": 1010 }, { "epoch": 17.00119298245614, "grad_norm": 0.046201951801776886, "learning_rate": 1.7894736842105264e-05, "loss": 0.3526, "step": 1020 }, { "epoch": 17.00154385964912, "grad_norm": 0.06010481342673302, "learning_rate": 1.8070175438596493e-05, "loss": 0.3399, "step": 1030 }, { "epoch": 17.001894736842104, "grad_norm": 8.584966659545898, "learning_rate": 1.8245614035087722e-05, "loss": 0.6157, "step": 1040 }, { "epoch": 17.0020350877193, "eval_accuracy": 0.9375, "eval_loss": 0.17038817703723907, "eval_runtime": 83.7401, "eval_samples_per_second": 0.573, "eval_steps_per_second": 0.143, "step": 1044 }, { "epoch": 18.00021052631579, "grad_norm": 0.774956464767456, "learning_rate": 1.8421052631578947e-05, "loss": 0.1596, "step": 1050 }, { "epoch": 18.000561403508772, "grad_norm": 0.36749064922332764, "learning_rate": 1.8596491228070176e-05, "loss": 0.2122, "step": 1060 }, { "epoch": 18.000912280701755, "grad_norm": 0.06645552814006805, "learning_rate": 1.8771929824561405e-05, "loss": 0.2568, "step": 1070 }, { "epoch": 18.001263157894737, "grad_norm": 0.021599041298031807, "learning_rate": 1.8947368421052634e-05, "loss": 0.283, "step": 1080 }, { "epoch": 18.00161403508772, "grad_norm": 113.25637817382812, "learning_rate": 1.9122807017543863e-05, "loss": 0.3591, "step": 1090 }, { "epoch": 18.0019649122807, "grad_norm": 0.21973834931850433, "learning_rate": 1.929824561403509e-05, "loss": 0.2544, "step": 1100 }, { "epoch": 18.0020350877193, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.35440635681152344, "eval_runtime": 82.2034, "eval_samples_per_second": 0.584, "eval_steps_per_second": 0.146, "step": 1102 }, { "epoch": 19.000280701754384, "grad_norm": 0.06097158417105675, "learning_rate": 1.9473684210526315e-05, "loss": 0.3663, "step": 1110 }, { "epoch": 19.000631578947367, "grad_norm": 25.72997283935547, "learning_rate": 1.9649122807017544e-05, "loss": 0.8104, "step": 1120 }, { "epoch": 19.000982456140353, "grad_norm": 0.5115303993225098, "learning_rate": 1.9824561403508773e-05, "loss": 0.2474, "step": 1130 }, { "epoch": 19.001333333333335, "grad_norm": 0.27492067217826843, "learning_rate": 2e-05, "loss": 0.3686, "step": 1140 }, { "epoch": 19.001684210526317, "grad_norm": 22.944690704345703, "learning_rate": 2.0175438596491227e-05, "loss": 0.2315, "step": 1150 }, { "epoch": 19.0020350877193, "grad_norm": 0.11991500854492188, "learning_rate": 2.0350877192982456e-05, "loss": 0.4036, "step": 1160 }, { "epoch": 19.0020350877193, "eval_accuracy": 0.9166666666666666, "eval_loss": 0.25051262974739075, "eval_runtime": 80.7899, "eval_samples_per_second": 0.594, "eval_steps_per_second": 0.149, "step": 1160 }, { "epoch": 20.000350877192982, "grad_norm": 0.44547587633132935, "learning_rate": 2.0526315789473685e-05, "loss": 0.1078, "step": 1170 }, { "epoch": 20.000701754385965, "grad_norm": 76.07775115966797, "learning_rate": 2.0701754385964914e-05, "loss": 0.4915, "step": 1180 }, { "epoch": 20.001052631578947, "grad_norm": 0.349282830953598, "learning_rate": 2.0877192982456143e-05, "loss": 0.2929, "step": 1190 }, { "epoch": 20.00140350877193, "grad_norm": 8.304322242736816, "learning_rate": 2.105263157894737e-05, "loss": 0.219, "step": 1200 }, { "epoch": 20.00175438596491, "grad_norm": 0.08941491693258286, "learning_rate": 2.1228070175438598e-05, "loss": 0.2382, "step": 1210 }, { "epoch": 20.0020350877193, "eval_accuracy": 0.9375, "eval_loss": 0.3155660927295685, "eval_runtime": 82.6296, "eval_samples_per_second": 0.581, "eval_steps_per_second": 0.145, "step": 1218 }, { "epoch": 21.000070175438598, "grad_norm": 6.294134140014648, "learning_rate": 2.1403508771929827e-05, "loss": 0.2611, "step": 1220 }, { "epoch": 21.00042105263158, "grad_norm": 0.11261521279811859, "learning_rate": 2.1578947368421053e-05, "loss": 0.1969, "step": 1230 }, { "epoch": 21.000771929824563, "grad_norm": 0.2796896696090698, "learning_rate": 2.175438596491228e-05, "loss": 0.2955, "step": 1240 }, { "epoch": 21.001122807017545, "grad_norm": 0.07930008322000504, "learning_rate": 2.1929824561403507e-05, "loss": 0.013, "step": 1250 }, { "epoch": 21.001473684210527, "grad_norm": 5.909428119659424, "learning_rate": 2.2105263157894736e-05, "loss": 0.3568, "step": 1260 }, { "epoch": 21.00182456140351, "grad_norm": 168.33380126953125, "learning_rate": 2.2280701754385965e-05, "loss": 0.6751, "step": 1270 }, { "epoch": 21.0020350877193, "eval_accuracy": 0.9375, "eval_loss": 0.259630411863327, "eval_runtime": 82.1271, "eval_samples_per_second": 0.584, "eval_steps_per_second": 0.146, "step": 1276 }, { "epoch": 22.000140350877192, "grad_norm": 0.22503353655338287, "learning_rate": 2.2456140350877194e-05, "loss": 0.3249, "step": 1280 }, { "epoch": 22.000491228070175, "grad_norm": 0.2562604248523712, "learning_rate": 2.2631578947368423e-05, "loss": 0.2267, "step": 1290 }, { "epoch": 22.000842105263157, "grad_norm": 0.6118970513343811, "learning_rate": 2.280701754385965e-05, "loss": 0.7495, "step": 1300 }, { "epoch": 22.00119298245614, "grad_norm": 0.2397994101047516, "learning_rate": 2.2982456140350878e-05, "loss": 0.0388, "step": 1310 }, { "epoch": 22.00154385964912, "grad_norm": 0.10384727269411087, "learning_rate": 2.3157894736842107e-05, "loss": 0.3285, "step": 1320 }, { "epoch": 22.001894736842104, "grad_norm": 0.0419117733836174, "learning_rate": 2.3333333333333336e-05, "loss": 0.2848, "step": 1330 }, { "epoch": 22.0020350877193, "eval_accuracy": 0.8125, "eval_loss": 0.822706937789917, "eval_runtime": 83.5818, "eval_samples_per_second": 0.574, "eval_steps_per_second": 0.144, "step": 1334 }, { "epoch": 23.00021052631579, "grad_norm": 121.5499038696289, "learning_rate": 2.3508771929824565e-05, "loss": 0.5364, "step": 1340 }, { "epoch": 23.000561403508772, "grad_norm": 0.10266309231519699, "learning_rate": 2.368421052631579e-05, "loss": 0.8097, "step": 1350 }, { "epoch": 23.000912280701755, "grad_norm": 9.736127853393555, "learning_rate": 2.385964912280702e-05, "loss": 0.6052, "step": 1360 }, { "epoch": 23.001263157894737, "grad_norm": 4.3637471199035645, "learning_rate": 2.4035087719298245e-05, "loss": 0.3504, "step": 1370 }, { "epoch": 23.00161403508772, "grad_norm": 0.19882246851921082, "learning_rate": 2.4210526315789474e-05, "loss": 0.3784, "step": 1380 }, { "epoch": 23.0019649122807, "grad_norm": 0.27082210779190063, "learning_rate": 2.4385964912280703e-05, "loss": 0.1225, "step": 1390 }, { "epoch": 23.0020350877193, "eval_accuracy": 0.9375, "eval_loss": 0.2921377420425415, "eval_runtime": 82.9556, "eval_samples_per_second": 0.579, "eval_steps_per_second": 0.145, "step": 1392 }, { "epoch": 24.000280701754384, "grad_norm": 0.2171986997127533, "learning_rate": 2.456140350877193e-05, "loss": 0.1094, "step": 1400 }, { "epoch": 24.000631578947367, "grad_norm": 0.21692253649234772, "learning_rate": 2.4736842105263158e-05, "loss": 0.3332, "step": 1410 }, { "epoch": 24.000982456140353, "grad_norm": 0.3834693729877472, "learning_rate": 2.4912280701754387e-05, "loss": 0.2847, "step": 1420 }, { "epoch": 24.001333333333335, "grad_norm": 0.08816500753164291, "learning_rate": 2.5087719298245616e-05, "loss": 0.1147, "step": 1430 }, { "epoch": 24.001684210526317, "grad_norm": 0.21103212237358093, "learning_rate": 2.5263157894736845e-05, "loss": 0.4283, "step": 1440 }, { "epoch": 24.0020350877193, "grad_norm": 0.27631059288978577, "learning_rate": 2.5438596491228074e-05, "loss": 0.616, "step": 1450 }, { "epoch": 24.0020350877193, "eval_accuracy": 0.9375, "eval_loss": 0.2928893566131592, "eval_runtime": 81.6066, "eval_samples_per_second": 0.588, "eval_steps_per_second": 0.147, "step": 1450 }, { "epoch": 24.0020350877193, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.2618250548839569, "eval_runtime": 88.9045, "eval_samples_per_second": 0.574, "eval_steps_per_second": 0.146, "step": 1450 }, { "epoch": 24.0020350877193, "eval_accuracy": 0.9411764705882353, "eval_loss": 0.2618250548839569, "eval_runtime": 89.4759, "eval_samples_per_second": 0.57, "eval_steps_per_second": 0.145, "step": 1450 } ], "logging_steps": 10, "max_steps": 28500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.164871389462528e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }