diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,488 +1,4637 @@ { - "best_metric": 0.9297520661157025, - "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-wsdmhar\\checkpoint-530", - "epoch": 10.0, + "best_metric": 0.9683195592286501, + "best_model_checkpoint": "swin-tiny-patch4-window7-224-finetuned-wsdmhar\\checkpoint-4187", + "epoch": 100.0, "eval_steps": 500, - "global_step": 530, + "global_step": 5300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.18867924528301888, - "grad_norm": 5.205289840698242, - "learning_rate": 9.433962264150944e-06, - "loss": 1.7479, + "grad_norm": 9.041044235229492, + "learning_rate": 9.433962264150943e-07, + "loss": 1.4928, "step": 10 }, { "epoch": 0.37735849056603776, - "grad_norm": 4.392179012298584, - "learning_rate": 1.8867924528301888e-05, - "loss": 1.4506, + "grad_norm": 7.276942253112793, + "learning_rate": 1.8867924528301887e-06, + "loss": 1.4202, "step": 20 }, { "epoch": 0.5660377358490566, - "grad_norm": 6.868739604949951, - "learning_rate": 2.830188679245283e-05, - "loss": 1.0283, + "grad_norm": 8.077420234680176, + "learning_rate": 2.830188679245283e-06, + "loss": 1.3564, "step": 30 }, { "epoch": 0.7547169811320755, - "grad_norm": 9.729569435119629, - "learning_rate": 3.7735849056603776e-05, - "loss": 0.8064, + "grad_norm": 7.560600280761719, + "learning_rate": 3.7735849056603773e-06, + "loss": 1.1939, "step": 40 }, { "epoch": 0.9433962264150944, - "grad_norm": 12.286840438842773, - "learning_rate": 4.716981132075472e-05, - "loss": 0.7388, + "grad_norm": 5.84022855758667, + "learning_rate": 4.716981132075472e-06, + "loss": 1.0624, "step": 50 }, { "epoch": 1.0, - "eval_accuracy": 0.7117768595041323, - "eval_loss": 0.6308234930038452, - "eval_runtime": 24.0663, - "eval_samples_per_second": 120.667, - "eval_steps_per_second": 3.781, + "eval_accuracy": 0.6091597796143251, + "eval_loss": 0.8878794312477112, + "eval_runtime": 12.739, + "eval_samples_per_second": 227.961, + "eval_steps_per_second": 7.143, "step": 53 }, { "epoch": 1.1320754716981132, - "grad_norm": 9.05235481262207, - "learning_rate": 4.9266247379454926e-05, - "loss": 0.6616, + "grad_norm": 9.758731842041016, + "learning_rate": 5.660377358490566e-06, + "loss": 0.9299, "step": 60 }, { "epoch": 1.320754716981132, - "grad_norm": 10.109884262084961, - "learning_rate": 4.8218029350104823e-05, - "loss": 0.6363, + "grad_norm": 7.677206993103027, + "learning_rate": 6.60377358490566e-06, + "loss": 0.8724, "step": 70 }, { "epoch": 1.509433962264151, - "grad_norm": 10.793712615966797, - "learning_rate": 4.716981132075472e-05, - "loss": 0.554, + "grad_norm": 8.157505989074707, + "learning_rate": 7.547169811320755e-06, + "loss": 0.7971, "step": 80 }, { "epoch": 1.6981132075471699, - "grad_norm": 17.802772521972656, - "learning_rate": 4.612159329140461e-05, - "loss": 0.5516, + "grad_norm": 9.804584503173828, + "learning_rate": 8.49056603773585e-06, + "loss": 0.7717, "step": 90 }, { "epoch": 1.8867924528301887, - "grad_norm": 11.082262992858887, - "learning_rate": 4.5073375262054504e-05, - "loss": 0.5099, + "grad_norm": 8.317395210266113, + "learning_rate": 9.433962264150944e-06, + "loss": 0.6893, "step": 100 }, { "epoch": 2.0, - "eval_accuracy": 0.8484848484848485, - "eval_loss": 0.36690881848335266, - "eval_runtime": 13.3175, - "eval_samples_per_second": 218.059, - "eval_steps_per_second": 6.833, + "eval_accuracy": 0.7090220385674931, + "eval_loss": 0.6601111888885498, + "eval_runtime": 13.0303, + "eval_samples_per_second": 222.865, + "eval_steps_per_second": 6.984, "step": 106 }, { "epoch": 2.0754716981132075, - "grad_norm": 6.183718204498291, - "learning_rate": 4.402515723270441e-05, - "loss": 0.4778, + "grad_norm": 5.714374542236328, + "learning_rate": 1.0377358490566038e-05, + "loss": 0.6923, "step": 110 }, { "epoch": 2.2641509433962264, - "grad_norm": 13.359389305114746, - "learning_rate": 4.29769392033543e-05, - "loss": 0.4449, + "grad_norm": 8.416643142700195, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.6475, "step": 120 }, { "epoch": 2.452830188679245, - "grad_norm": 6.166327953338623, - "learning_rate": 4.192872117400419e-05, - "loss": 0.4243, + "grad_norm": 8.871334075927734, + "learning_rate": 1.2264150943396227e-05, + "loss": 0.6156, "step": 130 }, { "epoch": 2.641509433962264, - "grad_norm": 6.538959980010986, - "learning_rate": 4.088050314465409e-05, - "loss": 0.4265, + "grad_norm": 10.934247970581055, + "learning_rate": 1.320754716981132e-05, + "loss": 0.597, "step": 140 }, { "epoch": 2.830188679245283, - "grad_norm": 11.39398193359375, - "learning_rate": 3.983228511530399e-05, - "loss": 0.4319, + "grad_norm": 12.311450004577637, + "learning_rate": 1.4150943396226415e-05, + "loss": 0.6152, "step": 150 }, { "epoch": 3.0, - "eval_accuracy": 0.8684573002754821, - "eval_loss": 0.33240532875061035, - "eval_runtime": 14.2442, - "eval_samples_per_second": 203.873, - "eval_steps_per_second": 6.389, + "eval_accuracy": 0.7854683195592287, + "eval_loss": 0.5114469528198242, + "eval_runtime": 13.0985, + "eval_samples_per_second": 221.706, + "eval_steps_per_second": 6.947, "step": 159 }, { "epoch": 3.018867924528302, - "grad_norm": 9.0410795211792, - "learning_rate": 3.878406708595388e-05, - "loss": 0.3985, + "grad_norm": 8.85444164276123, + "learning_rate": 1.509433962264151e-05, + "loss": 0.5785, "step": 160 }, { "epoch": 3.207547169811321, - "grad_norm": 6.786131381988525, - "learning_rate": 3.7735849056603776e-05, - "loss": 0.3912, + "grad_norm": 7.401244640350342, + "learning_rate": 1.6037735849056604e-05, + "loss": 0.5343, "step": 170 }, { "epoch": 3.3962264150943398, - "grad_norm": 8.235289573669434, - "learning_rate": 3.6687631027253674e-05, - "loss": 0.3496, + "grad_norm": 6.694042205810547, + "learning_rate": 1.69811320754717e-05, + "loss": 0.5322, "step": 180 }, { "epoch": 3.5849056603773586, - "grad_norm": 6.866382122039795, - "learning_rate": 3.5639412997903565e-05, - "loss": 0.3696, + "grad_norm": 7.124511241912842, + "learning_rate": 1.7924528301886792e-05, + "loss": 0.5533, "step": 190 }, { "epoch": 3.7735849056603774, - "grad_norm": 8.17050552368164, - "learning_rate": 3.4591194968553456e-05, - "loss": 0.3203, + "grad_norm": 11.249773025512695, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.4865, "step": 200 }, { "epoch": 3.9622641509433962, - "grad_norm": 17.23760414123535, - "learning_rate": 3.354297693920336e-05, - "loss": 0.4002, + "grad_norm": 19.239667892456055, + "learning_rate": 1.9811320754716984e-05, + "loss": 0.5456, "step": 210 }, { "epoch": 4.0, - "eval_accuracy": 0.9028925619834711, - "eval_loss": 0.2758175730705261, - "eval_runtime": 14.4978, - "eval_samples_per_second": 200.306, - "eval_steps_per_second": 6.277, + "eval_accuracy": 0.8422865013774105, + "eval_loss": 0.3818637728691101, + "eval_runtime": 13.2694, + "eval_samples_per_second": 218.849, + "eval_steps_per_second": 6.858, "step": 212 }, { "epoch": 4.150943396226415, - "grad_norm": 17.225961685180664, - "learning_rate": 3.249475890985325e-05, - "loss": 0.3484, + "grad_norm": 11.330594062805176, + "learning_rate": 2.0754716981132076e-05, + "loss": 0.5114, "step": 220 }, { "epoch": 4.339622641509434, - "grad_norm": 12.645174980163574, - "learning_rate": 3.144654088050314e-05, - "loss": 0.353, + "grad_norm": 11.731520652770996, + "learning_rate": 2.1698113207547172e-05, + "loss": 0.4923, "step": 230 }, { "epoch": 4.528301886792453, - "grad_norm": 5.725615978240967, - "learning_rate": 3.0398322851153044e-05, - "loss": 0.2883, + "grad_norm": 6.535953521728516, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.4251, "step": 240 }, { "epoch": 4.716981132075472, - "grad_norm": 5.914320468902588, - "learning_rate": 2.935010482180294e-05, - "loss": 0.3067, + "grad_norm": 5.095239639282227, + "learning_rate": 2.358490566037736e-05, + "loss": 0.4388, "step": 250 }, { "epoch": 4.90566037735849, - "grad_norm": 6.547543525695801, - "learning_rate": 2.830188679245283e-05, - "loss": 0.3589, + "grad_norm": 8.532243728637695, + "learning_rate": 2.4528301886792453e-05, + "loss": 0.4673, "step": 260 }, { "epoch": 5.0, - "eval_accuracy": 0.9132231404958677, - "eval_loss": 0.2503372132778168, - "eval_runtime": 14.406, - "eval_samples_per_second": 201.582, - "eval_steps_per_second": 6.317, + "eval_accuracy": 0.871900826446281, + "eval_loss": 0.32669946551322937, + "eval_runtime": 13.3969, + "eval_samples_per_second": 216.767, + "eval_steps_per_second": 6.793, "step": 265 }, { "epoch": 5.09433962264151, - "grad_norm": 8.123590469360352, - "learning_rate": 2.7253668763102725e-05, - "loss": 0.3272, + "grad_norm": 7.914596080780029, + "learning_rate": 2.547169811320755e-05, + "loss": 0.4331, "step": 270 }, { "epoch": 5.283018867924528, - "grad_norm": 5.585710048675537, - "learning_rate": 2.6205450733752623e-05, - "loss": 0.3535, + "grad_norm": 7.141151428222656, + "learning_rate": 2.641509433962264e-05, + "loss": 0.4553, "step": 280 }, { "epoch": 5.471698113207547, - "grad_norm": 8.529882431030273, - "learning_rate": 2.5157232704402517e-05, - "loss": 0.2797, + "grad_norm": 9.838526725769043, + "learning_rate": 2.7358490566037738e-05, + "loss": 0.3864, "step": 290 }, { "epoch": 5.660377358490566, - "grad_norm": 6.009942054748535, - "learning_rate": 2.4109014675052412e-05, - "loss": 0.3004, + "grad_norm": 6.91007137298584, + "learning_rate": 2.830188679245283e-05, + "loss": 0.4015, "step": 300 }, { "epoch": 5.849056603773585, - "grad_norm": 4.354741096496582, - "learning_rate": 2.3060796645702306e-05, - "loss": 0.3096, + "grad_norm": 6.716864585876465, + "learning_rate": 2.9245283018867926e-05, + "loss": 0.4166, "step": 310 }, { "epoch": 6.0, - "eval_accuracy": 0.9135674931129476, - "eval_loss": 0.24190402030944824, - "eval_runtime": 14.5262, - "eval_samples_per_second": 199.915, - "eval_steps_per_second": 6.265, + "eval_accuracy": 0.9039256198347108, + "eval_loss": 0.2804345488548279, + "eval_runtime": 13.5503, + "eval_samples_per_second": 214.312, + "eval_steps_per_second": 6.716, "step": 318 }, { "epoch": 6.037735849056604, - "grad_norm": 6.793937683105469, - "learning_rate": 2.2012578616352204e-05, - "loss": 0.3024, + "grad_norm": 19.735078811645508, + "learning_rate": 3.018867924528302e-05, + "loss": 0.4004, "step": 320 }, { "epoch": 6.226415094339623, - "grad_norm": 4.6357221603393555, - "learning_rate": 2.0964360587002095e-05, - "loss": 0.3013, + "grad_norm": 14.484492301940918, + "learning_rate": 3.113207547169811e-05, + "loss": 0.4114, "step": 330 }, { "epoch": 6.415094339622642, - "grad_norm": 5.533959865570068, - "learning_rate": 1.9916142557651993e-05, - "loss": 0.311, + "grad_norm": 8.806564331054688, + "learning_rate": 3.207547169811321e-05, + "loss": 0.398, "step": 340 }, { "epoch": 6.60377358490566, - "grad_norm": 5.0147600173950195, - "learning_rate": 1.8867924528301888e-05, - "loss": 0.2865, + "grad_norm": 11.519726753234863, + "learning_rate": 3.30188679245283e-05, + "loss": 0.413, "step": 350 }, { "epoch": 6.7924528301886795, - "grad_norm": 4.817899227142334, - "learning_rate": 1.7819706498951782e-05, - "loss": 0.2813, + "grad_norm": 4.552971363067627, + "learning_rate": 3.39622641509434e-05, + "loss": 0.3775, "step": 360 }, { "epoch": 6.981132075471698, - "grad_norm": 6.032566547393799, - "learning_rate": 1.677148846960168e-05, - "loss": 0.2708, + "grad_norm": 6.50352668762207, + "learning_rate": 3.490566037735849e-05, + "loss": 0.3757, "step": 370 }, { "epoch": 7.0, - "eval_accuracy": 0.9232093663911846, - "eval_loss": 0.22773109376430511, - "eval_runtime": 15.0479, - "eval_samples_per_second": 192.984, - "eval_steps_per_second": 6.047, + "eval_accuracy": 0.8994490358126722, + "eval_loss": 0.2881180942058563, + "eval_runtime": 13.6033, + "eval_samples_per_second": 213.478, + "eval_steps_per_second": 6.69, "step": 371 }, { "epoch": 7.169811320754717, - "grad_norm": 5.316783428192139, - "learning_rate": 1.572327044025157e-05, - "loss": 0.3009, + "grad_norm": 5.162815093994141, + "learning_rate": 3.5849056603773584e-05, + "loss": 0.3744, "step": 380 }, { "epoch": 7.3584905660377355, - "grad_norm": 5.2730607986450195, - "learning_rate": 1.467505241090147e-05, - "loss": 0.2422, + "grad_norm": 4.919179916381836, + "learning_rate": 3.679245283018868e-05, + "loss": 0.3084, "step": 390 }, { "epoch": 7.547169811320755, - "grad_norm": 5.739044189453125, - "learning_rate": 1.3626834381551362e-05, - "loss": 0.2667, + "grad_norm": 8.160808563232422, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.3388, "step": 400 }, { "epoch": 7.735849056603773, - "grad_norm": 5.689853668212891, - "learning_rate": 1.2578616352201259e-05, - "loss": 0.2833, + "grad_norm": 7.450716972351074, + "learning_rate": 3.867924528301887e-05, + "loss": 0.3578, "step": 410 }, { "epoch": 7.9245283018867925, - "grad_norm": 6.43410062789917, - "learning_rate": 1.1530398322851153e-05, - "loss": 0.261, + "grad_norm": 7.976863861083984, + "learning_rate": 3.962264150943397e-05, + "loss": 0.3798, "step": 420 }, { "epoch": 8.0, - "eval_accuracy": 0.925275482093664, - "eval_loss": 0.21677668392658234, - "eval_runtime": 14.7132, - "eval_samples_per_second": 197.373, - "eval_steps_per_second": 6.185, + "eval_accuracy": 0.903236914600551, + "eval_loss": 0.26347047090530396, + "eval_runtime": 13.5658, + "eval_samples_per_second": 214.068, + "eval_steps_per_second": 6.708, "step": 424 }, { "epoch": 8.11320754716981, - "grad_norm": 5.5042643547058105, - "learning_rate": 1.0482180293501048e-05, - "loss": 0.2675, + "grad_norm": 6.626104354858398, + "learning_rate": 4.0566037735849064e-05, + "loss": 0.3583, "step": 430 }, { "epoch": 8.30188679245283, - "grad_norm": 4.880862236022949, - "learning_rate": 9.433962264150944e-06, - "loss": 0.2413, + "grad_norm": 6.100863933563232, + "learning_rate": 4.150943396226415e-05, + "loss": 0.3201, "step": 440 }, { "epoch": 8.49056603773585, - "grad_norm": 3.9594244956970215, - "learning_rate": 8.38574423480084e-06, - "loss": 0.2577, + "grad_norm": 5.2761969566345215, + "learning_rate": 4.245283018867925e-05, + "loss": 0.3452, "step": 450 }, { "epoch": 8.679245283018869, - "grad_norm": 6.740973949432373, - "learning_rate": 7.337526205450735e-06, - "loss": 0.2595, + "grad_norm": 4.848881244659424, + "learning_rate": 4.3396226415094345e-05, + "loss": 0.3602, "step": 460 }, { "epoch": 8.867924528301886, - "grad_norm": 7.188548564910889, - "learning_rate": 6.289308176100629e-06, - "loss": 0.2526, + "grad_norm": 10.011419296264648, + "learning_rate": 4.433962264150944e-05, + "loss": 0.3303, "step": 470 }, { "epoch": 9.0, - "eval_accuracy": 0.9245867768595041, - "eval_loss": 0.20991356670856476, - "eval_runtime": 14.793, - "eval_samples_per_second": 196.31, - "eval_steps_per_second": 6.152, + "eval_accuracy": 0.9073691460055097, + "eval_loss": 0.27034133672714233, + "eval_runtime": 13.7374, + "eval_samples_per_second": 211.393, + "eval_steps_per_second": 6.624, "step": 477 }, { "epoch": 9.056603773584905, - "grad_norm": 8.5458402633667, - "learning_rate": 5.241090146750524e-06, - "loss": 0.2465, + "grad_norm": 8.56981372833252, + "learning_rate": 4.528301886792453e-05, + "loss": 0.31, "step": 480 }, { "epoch": 9.245283018867925, - "grad_norm": 4.695093154907227, - "learning_rate": 4.19287211740042e-06, - "loss": 0.2172, + "grad_norm": 7.121793270111084, + "learning_rate": 4.6226415094339625e-05, + "loss": 0.3098, "step": 490 }, { "epoch": 9.433962264150944, - "grad_norm": 3.67899751663208, - "learning_rate": 3.1446540880503146e-06, - "loss": 0.2635, + "grad_norm": 4.458034515380859, + "learning_rate": 4.716981132075472e-05, + "loss": 0.3495, "step": 500 }, { "epoch": 9.622641509433961, - "grad_norm": 4.685281276702881, - "learning_rate": 2.09643605870021e-06, - "loss": 0.254, + "grad_norm": 6.7864484786987305, + "learning_rate": 4.811320754716982e-05, + "loss": 0.3383, "step": 510 }, { "epoch": 9.81132075471698, - "grad_norm": 5.518146991729736, - "learning_rate": 1.048218029350105e-06, - "loss": 0.2574, + "grad_norm": 6.478758811950684, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.316, "step": 520 }, { "epoch": 10.0, - "grad_norm": 5.643453598022461, - "learning_rate": 0.0, - "loss": 0.2767, + "grad_norm": 6.849891185760498, + "learning_rate": 5e-05, + "loss": 0.3346, "step": 530 }, { "epoch": 10.0, - "eval_accuracy": 0.9297520661157025, - "eval_loss": 0.19896750152111053, - "eval_runtime": 15.5471, - "eval_samples_per_second": 186.787, - "eval_steps_per_second": 5.853, + "eval_accuracy": 0.9004820936639119, + "eval_loss": 0.25653818249702454, + "eval_runtime": 13.7209, + "eval_samples_per_second": 211.649, + "eval_steps_per_second": 6.632, "step": 530 }, { - "epoch": 10.0, - "step": 530, - "total_flos": 1.6836842977571635e+18, - "train_loss": 0.419760063009442, - "train_runtime": 917.7632, - "train_samples_per_second": 73.799, - "train_steps_per_second": 0.577 + "epoch": 10.18867924528302, + "grad_norm": 5.824264049530029, + "learning_rate": 4.989517819706499e-05, + "loss": 0.3262, + "step": 540 + }, + { + "epoch": 10.377358490566039, + "grad_norm": 9.913765907287598, + "learning_rate": 4.979035639412998e-05, + "loss": 0.2917, + "step": 550 + }, + { + "epoch": 10.566037735849056, + "grad_norm": 8.706583976745605, + "learning_rate": 4.968553459119497e-05, + "loss": 0.325, + "step": 560 + }, + { + "epoch": 10.754716981132075, + "grad_norm": 4.388184070587158, + "learning_rate": 4.958071278825996e-05, + "loss": 0.2934, + "step": 570 + }, + { + "epoch": 10.943396226415095, + "grad_norm": 7.9404988288879395, + "learning_rate": 4.947589098532495e-05, + "loss": 0.2971, + "step": 580 + }, + { + "epoch": 11.0, + "eval_accuracy": 0.931129476584022, + "eval_loss": 0.21819160878658295, + "eval_runtime": 13.8344, + "eval_samples_per_second": 209.911, + "eval_steps_per_second": 6.578, + "step": 583 + }, + { + "epoch": 11.132075471698114, + "grad_norm": 4.344425678253174, + "learning_rate": 4.937106918238994e-05, + "loss": 0.25, + "step": 590 + }, + { + "epoch": 11.320754716981131, + "grad_norm": 7.321628570556641, + "learning_rate": 4.9266247379454926e-05, + "loss": 0.3084, + "step": 600 + }, + { + "epoch": 11.50943396226415, + "grad_norm": 12.875089645385742, + "learning_rate": 4.916142557651992e-05, + "loss": 0.3069, + "step": 610 + }, + { + "epoch": 11.69811320754717, + "grad_norm": 6.05894660949707, + "learning_rate": 4.9056603773584906e-05, + "loss": 0.2905, + "step": 620 + }, + { + "epoch": 11.88679245283019, + "grad_norm": 10.023378372192383, + "learning_rate": 4.8951781970649894e-05, + "loss": 0.2992, + "step": 630 + }, + { + "epoch": 12.0, + "eval_accuracy": 0.9256198347107438, + "eval_loss": 0.22397693991661072, + "eval_runtime": 13.8485, + "eval_samples_per_second": 209.698, + "eval_steps_per_second": 6.571, + "step": 636 + }, + { + "epoch": 12.075471698113208, + "grad_norm": 6.025568962097168, + "learning_rate": 4.884696016771489e-05, + "loss": 0.2936, + "step": 640 + }, + { + "epoch": 12.264150943396226, + "grad_norm": 5.965641021728516, + "learning_rate": 4.8742138364779875e-05, + "loss": 0.266, + "step": 650 + }, + { + "epoch": 12.452830188679245, + "grad_norm": 4.047191143035889, + "learning_rate": 4.863731656184486e-05, + "loss": 0.2696, + "step": 660 + }, + { + "epoch": 12.641509433962264, + "grad_norm": 4.262639999389648, + "learning_rate": 4.8532494758909855e-05, + "loss": 0.2642, + "step": 670 + }, + { + "epoch": 12.830188679245284, + "grad_norm": 4.320269584655762, + "learning_rate": 4.842767295597484e-05, + "loss": 0.2637, + "step": 680 + }, + { + "epoch": 13.0, + "eval_accuracy": 0.9238980716253443, + "eval_loss": 0.2130652368068695, + "eval_runtime": 13.8031, + "eval_samples_per_second": 210.388, + "eval_steps_per_second": 6.593, + "step": 689 + }, + { + "epoch": 13.018867924528301, + "grad_norm": 6.210031032562256, + "learning_rate": 4.8322851153039836e-05, + "loss": 0.2755, + "step": 690 + }, + { + "epoch": 13.20754716981132, + "grad_norm": 5.736047744750977, + "learning_rate": 4.8218029350104823e-05, + "loss": 0.2373, + "step": 700 + }, + { + "epoch": 13.39622641509434, + "grad_norm": 5.341845989227295, + "learning_rate": 4.811320754716982e-05, + "loss": 0.2462, + "step": 710 + }, + { + "epoch": 13.584905660377359, + "grad_norm": 4.025936126708984, + "learning_rate": 4.8008385744234804e-05, + "loss": 0.2491, + "step": 720 + }, + { + "epoch": 13.773584905660378, + "grad_norm": 4.850243091583252, + "learning_rate": 4.79035639412998e-05, + "loss": 0.2437, + "step": 730 + }, + { + "epoch": 13.962264150943396, + "grad_norm": 4.280818939208984, + "learning_rate": 4.7798742138364785e-05, + "loss": 0.2653, + "step": 740 + }, + { + "epoch": 14.0, + "eval_accuracy": 0.9397382920110193, + "eval_loss": 0.18005970120429993, + "eval_runtime": 13.9135, + "eval_samples_per_second": 208.718, + "eval_steps_per_second": 6.54, + "step": 742 + }, + { + "epoch": 14.150943396226415, + "grad_norm": 4.664927959442139, + "learning_rate": 4.769392033542977e-05, + "loss": 0.2444, + "step": 750 + }, + { + "epoch": 14.339622641509434, + "grad_norm": 4.106526851654053, + "learning_rate": 4.7589098532494766e-05, + "loss": 0.2468, + "step": 760 + }, + { + "epoch": 14.528301886792454, + "grad_norm": 5.82990837097168, + "learning_rate": 4.7484276729559753e-05, + "loss": 0.2534, + "step": 770 + }, + { + "epoch": 14.716981132075471, + "grad_norm": 5.992727279663086, + "learning_rate": 4.737945492662474e-05, + "loss": 0.2247, + "step": 780 + }, + { + "epoch": 14.90566037735849, + "grad_norm": 5.288634300231934, + "learning_rate": 4.7274633123689734e-05, + "loss": 0.2472, + "step": 790 + }, + { + "epoch": 15.0, + "eval_accuracy": 0.9376721763085399, + "eval_loss": 0.18071921169757843, + "eval_runtime": 13.77, + "eval_samples_per_second": 210.893, + "eval_steps_per_second": 6.609, + "step": 795 + }, + { + "epoch": 15.09433962264151, + "grad_norm": 5.116296768188477, + "learning_rate": 4.716981132075472e-05, + "loss": 0.2706, + "step": 800 + }, + { + "epoch": 15.283018867924529, + "grad_norm": 4.876150131225586, + "learning_rate": 4.706498951781971e-05, + "loss": 0.2174, + "step": 810 + }, + { + "epoch": 15.471698113207546, + "grad_norm": 4.544086456298828, + "learning_rate": 4.69601677148847e-05, + "loss": 0.1944, + "step": 820 + }, + { + "epoch": 15.660377358490566, + "grad_norm": 7.460907459259033, + "learning_rate": 4.685534591194969e-05, + "loss": 0.2432, + "step": 830 + }, + { + "epoch": 15.849056603773585, + "grad_norm": 5.503137111663818, + "learning_rate": 4.6750524109014677e-05, + "loss": 0.2263, + "step": 840 + }, + { + "epoch": 16.0, + "eval_accuracy": 0.946625344352617, + "eval_loss": 0.16115374863147736, + "eval_runtime": 13.8807, + "eval_samples_per_second": 209.211, + "eval_steps_per_second": 6.556, + "step": 848 + }, + { + "epoch": 16.037735849056602, + "grad_norm": 3.875622272491455, + "learning_rate": 4.664570230607967e-05, + "loss": 0.2365, + "step": 850 + }, + { + "epoch": 16.22641509433962, + "grad_norm": 4.338544845581055, + "learning_rate": 4.654088050314466e-05, + "loss": 0.1973, + "step": 860 + }, + { + "epoch": 16.41509433962264, + "grad_norm": 6.0123724937438965, + "learning_rate": 4.6436058700209645e-05, + "loss": 0.2341, + "step": 870 + }, + { + "epoch": 16.60377358490566, + "grad_norm": 3.352548837661743, + "learning_rate": 4.633123689727464e-05, + "loss": 0.195, + "step": 880 + }, + { + "epoch": 16.79245283018868, + "grad_norm": 3.6743788719177246, + "learning_rate": 4.6226415094339625e-05, + "loss": 0.2149, + "step": 890 + }, + { + "epoch": 16.9811320754717, + "grad_norm": 3.634098768234253, + "learning_rate": 4.612159329140461e-05, + "loss": 0.1786, + "step": 900 + }, + { + "epoch": 17.0, + "eval_accuracy": 0.9418044077134986, + "eval_loss": 0.17354166507720947, + "eval_runtime": 13.8691, + "eval_samples_per_second": 209.387, + "eval_steps_per_second": 6.561, + "step": 901 + }, + { + "epoch": 17.169811320754718, + "grad_norm": 4.430752277374268, + "learning_rate": 4.6016771488469606e-05, + "loss": 0.205, + "step": 910 + }, + { + "epoch": 17.358490566037737, + "grad_norm": 3.114032030105591, + "learning_rate": 4.5911949685534594e-05, + "loss": 0.2002, + "step": 920 + }, + { + "epoch": 17.547169811320753, + "grad_norm": 5.479780197143555, + "learning_rate": 4.580712788259958e-05, + "loss": 0.2224, + "step": 930 + }, + { + "epoch": 17.735849056603772, + "grad_norm": 5.793611526489258, + "learning_rate": 4.570230607966457e-05, + "loss": 0.1961, + "step": 940 + }, + { + "epoch": 17.92452830188679, + "grad_norm": 3.505262613296509, + "learning_rate": 4.559748427672956e-05, + "loss": 0.2103, + "step": 950 + }, + { + "epoch": 18.0, + "eval_accuracy": 0.9462809917355371, + "eval_loss": 0.1786232590675354, + "eval_runtime": 14.016, + "eval_samples_per_second": 207.192, + "eval_steps_per_second": 6.493, + "step": 954 + }, + { + "epoch": 18.11320754716981, + "grad_norm": 3.520742177963257, + "learning_rate": 4.549266247379455e-05, + "loss": 0.2158, + "step": 960 + }, + { + "epoch": 18.30188679245283, + "grad_norm": 5.575334072113037, + "learning_rate": 4.5387840670859536e-05, + "loss": 0.2004, + "step": 970 + }, + { + "epoch": 18.49056603773585, + "grad_norm": 4.227255344390869, + "learning_rate": 4.528301886792453e-05, + "loss": 0.173, + "step": 980 + }, + { + "epoch": 18.67924528301887, + "grad_norm": 4.518486022949219, + "learning_rate": 4.517819706498952e-05, + "loss": 0.1898, + "step": 990 + }, + { + "epoch": 18.867924528301888, + "grad_norm": 3.9365193843841553, + "learning_rate": 4.5073375262054504e-05, + "loss": 0.1725, + "step": 1000 + }, + { + "epoch": 19.0, + "eval_accuracy": 0.9473140495867769, + "eval_loss": 0.16307222843170166, + "eval_runtime": 14.0184, + "eval_samples_per_second": 207.157, + "eval_steps_per_second": 6.491, + "step": 1007 + }, + { + "epoch": 19.056603773584907, + "grad_norm": 4.218902587890625, + "learning_rate": 4.49685534591195e-05, + "loss": 0.1953, + "step": 1010 + }, + { + "epoch": 19.245283018867923, + "grad_norm": 5.483488082885742, + "learning_rate": 4.4863731656184485e-05, + "loss": 0.1648, + "step": 1020 + }, + { + "epoch": 19.433962264150942, + "grad_norm": 5.378849029541016, + "learning_rate": 4.475890985324948e-05, + "loss": 0.1807, + "step": 1030 + }, + { + "epoch": 19.62264150943396, + "grad_norm": 5.601091384887695, + "learning_rate": 4.4654088050314466e-05, + "loss": 0.2101, + "step": 1040 + }, + { + "epoch": 19.81132075471698, + "grad_norm": 2.9287290573120117, + "learning_rate": 4.454926624737946e-05, + "loss": 0.1628, + "step": 1050 + }, + { + "epoch": 20.0, + "grad_norm": 6.378795146942139, + "learning_rate": 4.4444444444444447e-05, + "loss": 0.1787, + "step": 1060 + }, + { + "epoch": 20.0, + "eval_accuracy": 0.953168044077135, + "eval_loss": 0.14394132792949677, + "eval_runtime": 13.9963, + "eval_samples_per_second": 207.483, + "eval_steps_per_second": 6.502, + "step": 1060 + }, + { + "epoch": 20.18867924528302, + "grad_norm": 5.156091213226318, + "learning_rate": 4.433962264150944e-05, + "loss": 0.1518, + "step": 1070 + }, + { + "epoch": 20.37735849056604, + "grad_norm": 5.145476818084717, + "learning_rate": 4.423480083857443e-05, + "loss": 0.1779, + "step": 1080 + }, + { + "epoch": 20.566037735849058, + "grad_norm": 4.119344711303711, + "learning_rate": 4.4129979035639415e-05, + "loss": 0.2038, + "step": 1090 + }, + { + "epoch": 20.754716981132077, + "grad_norm": 4.134652137756348, + "learning_rate": 4.402515723270441e-05, + "loss": 0.1713, + "step": 1100 + }, + { + "epoch": 20.943396226415093, + "grad_norm": 5.130320072174072, + "learning_rate": 4.3920335429769396e-05, + "loss": 0.1924, + "step": 1110 + }, + { + "epoch": 21.0, + "eval_accuracy": 0.9504132231404959, + "eval_loss": 0.13879624009132385, + "eval_runtime": 13.8513, + "eval_samples_per_second": 209.655, + "eval_steps_per_second": 6.57, + "step": 1113 + }, + { + "epoch": 21.132075471698112, + "grad_norm": 5.733437538146973, + "learning_rate": 4.381551362683438e-05, + "loss": 0.164, + "step": 1120 + }, + { + "epoch": 21.32075471698113, + "grad_norm": 3.7996554374694824, + "learning_rate": 4.3710691823899376e-05, + "loss": 0.1479, + "step": 1130 + }, + { + "epoch": 21.50943396226415, + "grad_norm": 4.9020891189575195, + "learning_rate": 4.3605870020964364e-05, + "loss": 0.1657, + "step": 1140 + }, + { + "epoch": 21.69811320754717, + "grad_norm": 4.601658344268799, + "learning_rate": 4.350104821802935e-05, + "loss": 0.1644, + "step": 1150 + }, + { + "epoch": 21.88679245283019, + "grad_norm": 5.638570785522461, + "learning_rate": 4.3396226415094345e-05, + "loss": 0.1662, + "step": 1160 + }, + { + "epoch": 22.0, + "eval_accuracy": 0.9507575757575758, + "eval_loss": 0.1469692885875702, + "eval_runtime": 13.9412, + "eval_samples_per_second": 208.304, + "eval_steps_per_second": 6.527, + "step": 1166 + }, + { + "epoch": 22.07547169811321, + "grad_norm": 7.325222015380859, + "learning_rate": 4.329140461215933e-05, + "loss": 0.1926, + "step": 1170 + }, + { + "epoch": 22.264150943396228, + "grad_norm": 5.337314128875732, + "learning_rate": 4.318658280922432e-05, + "loss": 0.1908, + "step": 1180 + }, + { + "epoch": 22.452830188679247, + "grad_norm": 3.660973072052002, + "learning_rate": 4.308176100628931e-05, + "loss": 0.1663, + "step": 1190 + }, + { + "epoch": 22.641509433962263, + "grad_norm": 7.075264930725098, + "learning_rate": 4.29769392033543e-05, + "loss": 0.1778, + "step": 1200 + }, + { + "epoch": 22.830188679245282, + "grad_norm": 2.9561445713043213, + "learning_rate": 4.287211740041929e-05, + "loss": 0.1724, + "step": 1210 + }, + { + "epoch": 23.0, + "eval_accuracy": 0.949724517906336, + "eval_loss": 0.15376034379005432, + "eval_runtime": 13.9076, + "eval_samples_per_second": 208.807, + "eval_steps_per_second": 6.543, + "step": 1219 + }, + { + "epoch": 23.0188679245283, + "grad_norm": 6.19976806640625, + "learning_rate": 4.276729559748428e-05, + "loss": 0.1867, + "step": 1220 + }, + { + "epoch": 23.20754716981132, + "grad_norm": 5.15887451171875, + "learning_rate": 4.266247379454927e-05, + "loss": 0.1809, + "step": 1230 + }, + { + "epoch": 23.39622641509434, + "grad_norm": 5.114108562469482, + "learning_rate": 4.2557651991614255e-05, + "loss": 0.1636, + "step": 1240 + }, + { + "epoch": 23.58490566037736, + "grad_norm": 5.130180358886719, + "learning_rate": 4.245283018867925e-05, + "loss": 0.1835, + "step": 1250 + }, + { + "epoch": 23.77358490566038, + "grad_norm": 5.375781536102295, + "learning_rate": 4.2348008385744236e-05, + "loss": 0.1764, + "step": 1260 + }, + { + "epoch": 23.962264150943398, + "grad_norm": 3.6640334129333496, + "learning_rate": 4.224318658280922e-05, + "loss": 0.1633, + "step": 1270 + }, + { + "epoch": 24.0, + "eval_accuracy": 0.9383608815426997, + "eval_loss": 0.17309589684009552, + "eval_runtime": 22.6597, + "eval_samples_per_second": 128.157, + "eval_steps_per_second": 4.016, + "step": 1272 + }, + { + "epoch": 24.150943396226417, + "grad_norm": 4.2943196296691895, + "learning_rate": 4.213836477987422e-05, + "loss": 0.1458, + "step": 1280 + }, + { + "epoch": 24.339622641509433, + "grad_norm": 4.514562129974365, + "learning_rate": 4.2033542976939204e-05, + "loss": 0.159, + "step": 1290 + }, + { + "epoch": 24.528301886792452, + "grad_norm": 7.0057454109191895, + "learning_rate": 4.192872117400419e-05, + "loss": 0.1507, + "step": 1300 + }, + { + "epoch": 24.71698113207547, + "grad_norm": 5.325743198394775, + "learning_rate": 4.1823899371069185e-05, + "loss": 0.201, + "step": 1310 + }, + { + "epoch": 24.90566037735849, + "grad_norm": 4.204189300537109, + "learning_rate": 4.171907756813417e-05, + "loss": 0.174, + "step": 1320 + }, + { + "epoch": 25.0, + "eval_accuracy": 0.9538567493112947, + "eval_loss": 0.15548603236675262, + "eval_runtime": 20.4244, + "eval_samples_per_second": 142.183, + "eval_steps_per_second": 4.455, + "step": 1325 + }, + { + "epoch": 25.09433962264151, + "grad_norm": 5.918984413146973, + "learning_rate": 4.161425576519916e-05, + "loss": 0.1604, + "step": 1330 + }, + { + "epoch": 25.28301886792453, + "grad_norm": 3.3942575454711914, + "learning_rate": 4.150943396226415e-05, + "loss": 0.1297, + "step": 1340 + }, + { + "epoch": 25.471698113207548, + "grad_norm": 2.6150972843170166, + "learning_rate": 4.140461215932914e-05, + "loss": 0.1407, + "step": 1350 + }, + { + "epoch": 25.660377358490567, + "grad_norm": 4.893429756164551, + "learning_rate": 4.129979035639413e-05, + "loss": 0.1637, + "step": 1360 + }, + { + "epoch": 25.849056603773583, + "grad_norm": 5.928979396820068, + "learning_rate": 4.119496855345912e-05, + "loss": 0.1657, + "step": 1370 + }, + { + "epoch": 26.0, + "eval_accuracy": 0.9493801652892562, + "eval_loss": 0.15420162677764893, + "eval_runtime": 20.5055, + "eval_samples_per_second": 141.62, + "eval_steps_per_second": 4.438, + "step": 1378 + }, + { + "epoch": 26.037735849056602, + "grad_norm": 3.1796255111694336, + "learning_rate": 4.109014675052411e-05, + "loss": 0.1548, + "step": 1380 + }, + { + "epoch": 26.22641509433962, + "grad_norm": 3.422128438949585, + "learning_rate": 4.09853249475891e-05, + "loss": 0.1379, + "step": 1390 + }, + { + "epoch": 26.41509433962264, + "grad_norm": 4.308591365814209, + "learning_rate": 4.088050314465409e-05, + "loss": 0.1588, + "step": 1400 + }, + { + "epoch": 26.60377358490566, + "grad_norm": 5.928549766540527, + "learning_rate": 4.077568134171908e-05, + "loss": 0.1793, + "step": 1410 + }, + { + "epoch": 26.79245283018868, + "grad_norm": 4.973033905029297, + "learning_rate": 4.067085953878407e-05, + "loss": 0.114, + "step": 1420 + }, + { + "epoch": 26.9811320754717, + "grad_norm": 3.7284395694732666, + "learning_rate": 4.0566037735849064e-05, + "loss": 0.1513, + "step": 1430 + }, + { + "epoch": 27.0, + "eval_accuracy": 0.9507575757575758, + "eval_loss": 0.15260200202465057, + "eval_runtime": 20.5592, + "eval_samples_per_second": 141.251, + "eval_steps_per_second": 4.426, + "step": 1431 + }, + { + "epoch": 27.169811320754718, + "grad_norm": 4.743962287902832, + "learning_rate": 4.046121593291405e-05, + "loss": 0.1396, + "step": 1440 + }, + { + "epoch": 27.358490566037737, + "grad_norm": 5.549553871154785, + "learning_rate": 4.035639412997904e-05, + "loss": 0.1556, + "step": 1450 + }, + { + "epoch": 27.547169811320753, + "grad_norm": 4.103055953979492, + "learning_rate": 4.025157232704403e-05, + "loss": 0.1448, + "step": 1460 + }, + { + "epoch": 27.735849056603772, + "grad_norm": 3.5048537254333496, + "learning_rate": 4.014675052410902e-05, + "loss": 0.1536, + "step": 1470 + }, + { + "epoch": 27.92452830188679, + "grad_norm": 3.7681641578674316, + "learning_rate": 4.0041928721174006e-05, + "loss": 0.126, + "step": 1480 + }, + { + "epoch": 28.0, + "eval_accuracy": 0.9511019283746557, + "eval_loss": 0.15600712597370148, + "eval_runtime": 20.7331, + "eval_samples_per_second": 140.066, + "eval_steps_per_second": 4.389, + "step": 1484 + }, + { + "epoch": 28.11320754716981, + "grad_norm": 3.522587299346924, + "learning_rate": 3.9937106918239e-05, + "loss": 0.1438, + "step": 1490 + }, + { + "epoch": 28.30188679245283, + "grad_norm": 3.7148783206939697, + "learning_rate": 3.983228511530399e-05, + "loss": 0.154, + "step": 1500 + }, + { + "epoch": 28.49056603773585, + "grad_norm": 7.454770565032959, + "learning_rate": 3.9727463312368974e-05, + "loss": 0.1783, + "step": 1510 + }, + { + "epoch": 28.67924528301887, + "grad_norm": 3.996530532836914, + "learning_rate": 3.962264150943397e-05, + "loss": 0.1263, + "step": 1520 + }, + { + "epoch": 28.867924528301888, + "grad_norm": 5.041396141052246, + "learning_rate": 3.9517819706498955e-05, + "loss": 0.1508, + "step": 1530 + }, + { + "epoch": 29.0, + "eval_accuracy": 0.9480027548209367, + "eval_loss": 0.16071830689907074, + "eval_runtime": 21.0223, + "eval_samples_per_second": 138.139, + "eval_steps_per_second": 4.329, + "step": 1537 + }, + { + "epoch": 29.056603773584907, + "grad_norm": 3.525146007537842, + "learning_rate": 3.941299790356394e-05, + "loss": 0.1421, + "step": 1540 + }, + { + "epoch": 29.245283018867923, + "grad_norm": 5.157039642333984, + "learning_rate": 3.9308176100628936e-05, + "loss": 0.1276, + "step": 1550 + }, + { + "epoch": 29.433962264150942, + "grad_norm": 6.565103054046631, + "learning_rate": 3.920335429769392e-05, + "loss": 0.1662, + "step": 1560 + }, + { + "epoch": 29.62264150943396, + "grad_norm": 3.140432357788086, + "learning_rate": 3.909853249475891e-05, + "loss": 0.1446, + "step": 1570 + }, + { + "epoch": 29.81132075471698, + "grad_norm": 3.157646656036377, + "learning_rate": 3.8993710691823904e-05, + "loss": 0.1227, + "step": 1580 + }, + { + "epoch": 30.0, + "grad_norm": 3.9616310596466064, + "learning_rate": 3.888888888888889e-05, + "loss": 0.1368, + "step": 1590 + }, + { + "epoch": 30.0, + "eval_accuracy": 0.9435261707988981, + "eval_loss": 0.17288929224014282, + "eval_runtime": 20.8012, + "eval_samples_per_second": 139.608, + "eval_steps_per_second": 4.375, + "step": 1590 + }, + { + "epoch": 30.18867924528302, + "grad_norm": 2.845510482788086, + "learning_rate": 3.878406708595388e-05, + "loss": 0.1336, + "step": 1600 + }, + { + "epoch": 30.37735849056604, + "grad_norm": 3.7001242637634277, + "learning_rate": 3.867924528301887e-05, + "loss": 0.1202, + "step": 1610 + }, + { + "epoch": 30.566037735849058, + "grad_norm": 2.4213449954986572, + "learning_rate": 3.857442348008386e-05, + "loss": 0.1529, + "step": 1620 + }, + { + "epoch": 30.754716981132077, + "grad_norm": 3.6384825706481934, + "learning_rate": 3.8469601677148846e-05, + "loss": 0.1128, + "step": 1630 + }, + { + "epoch": 30.943396226415093, + "grad_norm": 9.870887756347656, + "learning_rate": 3.836477987421384e-05, + "loss": 0.1166, + "step": 1640 + }, + { + "epoch": 31.0, + "eval_accuracy": 0.953168044077135, + "eval_loss": 0.155534565448761, + "eval_runtime": 20.9232, + "eval_samples_per_second": 138.793, + "eval_steps_per_second": 4.349, + "step": 1643 + }, + { + "epoch": 31.132075471698112, + "grad_norm": 7.167893886566162, + "learning_rate": 3.825995807127883e-05, + "loss": 0.1234, + "step": 1650 + }, + { + "epoch": 31.32075471698113, + "grad_norm": 5.442743301391602, + "learning_rate": 3.8155136268343814e-05, + "loss": 0.1565, + "step": 1660 + }, + { + "epoch": 31.50943396226415, + "grad_norm": 4.499869346618652, + "learning_rate": 3.805031446540881e-05, + "loss": 0.1191, + "step": 1670 + }, + { + "epoch": 31.69811320754717, + "grad_norm": 4.722647666931152, + "learning_rate": 3.7945492662473795e-05, + "loss": 0.1239, + "step": 1680 + }, + { + "epoch": 31.88679245283019, + "grad_norm": 3.9234910011291504, + "learning_rate": 3.784067085953878e-05, + "loss": 0.1076, + "step": 1690 + }, + { + "epoch": 32.0, + "eval_accuracy": 0.9579889807162535, + "eval_loss": 0.14003022015094757, + "eval_runtime": 21.2564, + "eval_samples_per_second": 136.618, + "eval_steps_per_second": 4.281, + "step": 1696 + }, + { + "epoch": 32.075471698113205, + "grad_norm": 5.3656721115112305, + "learning_rate": 3.7735849056603776e-05, + "loss": 0.1086, + "step": 1700 + }, + { + "epoch": 32.264150943396224, + "grad_norm": 2.3153514862060547, + "learning_rate": 3.763102725366876e-05, + "loss": 0.1109, + "step": 1710 + }, + { + "epoch": 32.45283018867924, + "grad_norm": 6.487193584442139, + "learning_rate": 3.752620545073376e-05, + "loss": 0.1296, + "step": 1720 + }, + { + "epoch": 32.64150943396226, + "grad_norm": 4.362462997436523, + "learning_rate": 3.7421383647798744e-05, + "loss": 0.1078, + "step": 1730 + }, + { + "epoch": 32.83018867924528, + "grad_norm": 4.543455123901367, + "learning_rate": 3.731656184486374e-05, + "loss": 0.1189, + "step": 1740 + }, + { + "epoch": 33.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.14192205667495728, + "eval_runtime": 21.093, + "eval_samples_per_second": 137.676, + "eval_steps_per_second": 4.314, + "step": 1749 + }, + { + "epoch": 33.0188679245283, + "grad_norm": 3.2465994358062744, + "learning_rate": 3.7211740041928725e-05, + "loss": 0.1553, + "step": 1750 + }, + { + "epoch": 33.20754716981132, + "grad_norm": 3.4110634326934814, + "learning_rate": 3.710691823899371e-05, + "loss": 0.1123, + "step": 1760 + }, + { + "epoch": 33.39622641509434, + "grad_norm": 6.8291802406311035, + "learning_rate": 3.7002096436058706e-05, + "loss": 0.1288, + "step": 1770 + }, + { + "epoch": 33.58490566037736, + "grad_norm": 5.650381565093994, + "learning_rate": 3.689727463312369e-05, + "loss": 0.1498, + "step": 1780 + }, + { + "epoch": 33.77358490566038, + "grad_norm": 4.716341018676758, + "learning_rate": 3.679245283018868e-05, + "loss": 0.137, + "step": 1790 + }, + { + "epoch": 33.9622641509434, + "grad_norm": 4.079151153564453, + "learning_rate": 3.6687631027253674e-05, + "loss": 0.1512, + "step": 1800 + }, + { + "epoch": 34.0, + "eval_accuracy": 0.9579889807162535, + "eval_loss": 0.13637615740299225, + "eval_runtime": 21.2019, + "eval_samples_per_second": 136.969, + "eval_steps_per_second": 4.292, + "step": 1802 + }, + { + "epoch": 34.15094339622642, + "grad_norm": 3.646527051925659, + "learning_rate": 3.658280922431866e-05, + "loss": 0.1018, + "step": 1810 + }, + { + "epoch": 34.339622641509436, + "grad_norm": 6.170238971710205, + "learning_rate": 3.647798742138365e-05, + "loss": 0.1085, + "step": 1820 + }, + { + "epoch": 34.528301886792455, + "grad_norm": 3.559018611907959, + "learning_rate": 3.637316561844864e-05, + "loss": 0.115, + "step": 1830 + }, + { + "epoch": 34.716981132075475, + "grad_norm": 5.245954990386963, + "learning_rate": 3.626834381551363e-05, + "loss": 0.1183, + "step": 1840 + }, + { + "epoch": 34.905660377358494, + "grad_norm": 4.156854629516602, + "learning_rate": 3.6163522012578616e-05, + "loss": 0.1323, + "step": 1850 + }, + { + "epoch": 35.0, + "eval_accuracy": 0.9538567493112947, + "eval_loss": 0.14969290792942047, + "eval_runtime": 21.2793, + "eval_samples_per_second": 136.471, + "eval_steps_per_second": 4.276, + "step": 1855 + }, + { + "epoch": 35.094339622641506, + "grad_norm": 4.309362888336182, + "learning_rate": 3.605870020964361e-05, + "loss": 0.1268, + "step": 1860 + }, + { + "epoch": 35.283018867924525, + "grad_norm": 3.494779348373413, + "learning_rate": 3.59538784067086e-05, + "loss": 0.1339, + "step": 1870 + }, + { + "epoch": 35.471698113207545, + "grad_norm": 4.567333221435547, + "learning_rate": 3.5849056603773584e-05, + "loss": 0.1003, + "step": 1880 + }, + { + "epoch": 35.660377358490564, + "grad_norm": 5.073373317718506, + "learning_rate": 3.574423480083858e-05, + "loss": 0.1413, + "step": 1890 + }, + { + "epoch": 35.84905660377358, + "grad_norm": 3.0305075645446777, + "learning_rate": 3.5639412997903565e-05, + "loss": 0.1031, + "step": 1900 + }, + { + "epoch": 36.0, + "eval_accuracy": 0.9579889807162535, + "eval_loss": 0.14369449019432068, + "eval_runtime": 21.3446, + "eval_samples_per_second": 136.053, + "eval_steps_per_second": 4.263, + "step": 1908 + }, + { + "epoch": 36.0377358490566, + "grad_norm": 2.8232624530792236, + "learning_rate": 3.553459119496855e-05, + "loss": 0.1213, + "step": 1910 + }, + { + "epoch": 36.22641509433962, + "grad_norm": 3.2020962238311768, + "learning_rate": 3.5429769392033546e-05, + "loss": 0.0914, + "step": 1920 + }, + { + "epoch": 36.41509433962264, + "grad_norm": 4.236616134643555, + "learning_rate": 3.532494758909853e-05, + "loss": 0.1012, + "step": 1930 + }, + { + "epoch": 36.60377358490566, + "grad_norm": 4.817173480987549, + "learning_rate": 3.522012578616352e-05, + "loss": 0.1082, + "step": 1940 + }, + { + "epoch": 36.79245283018868, + "grad_norm": 3.9018845558166504, + "learning_rate": 3.5115303983228514e-05, + "loss": 0.1074, + "step": 1950 + }, + { + "epoch": 36.9811320754717, + "grad_norm": 5.009905815124512, + "learning_rate": 3.50104821802935e-05, + "loss": 0.1215, + "step": 1960 + }, + { + "epoch": 37.0, + "eval_accuracy": 0.9559228650137741, + "eval_loss": 0.14596055448055267, + "eval_runtime": 21.2599, + "eval_samples_per_second": 136.595, + "eval_steps_per_second": 4.28, + "step": 1961 + }, + { + "epoch": 37.16981132075472, + "grad_norm": 2.7128424644470215, + "learning_rate": 3.490566037735849e-05, + "loss": 0.1129, + "step": 1970 + }, + { + "epoch": 37.35849056603774, + "grad_norm": 4.401316165924072, + "learning_rate": 3.480083857442348e-05, + "loss": 0.1205, + "step": 1980 + }, + { + "epoch": 37.54716981132076, + "grad_norm": 5.2666778564453125, + "learning_rate": 3.469601677148847e-05, + "loss": 0.127, + "step": 1990 + }, + { + "epoch": 37.735849056603776, + "grad_norm": 2.7217955589294434, + "learning_rate": 3.4591194968553456e-05, + "loss": 0.1068, + "step": 2000 + }, + { + "epoch": 37.924528301886795, + "grad_norm": 3.7162227630615234, + "learning_rate": 3.448637316561845e-05, + "loss": 0.1069, + "step": 2010 + }, + { + "epoch": 38.0, + "eval_accuracy": 0.9600550964187328, + "eval_loss": 0.13623014092445374, + "eval_runtime": 21.3213, + "eval_samples_per_second": 136.202, + "eval_steps_per_second": 4.268, + "step": 2014 + }, + { + "epoch": 38.113207547169814, + "grad_norm": 4.025697708129883, + "learning_rate": 3.438155136268344e-05, + "loss": 0.1095, + "step": 2020 + }, + { + "epoch": 38.301886792452834, + "grad_norm": 9.188610076904297, + "learning_rate": 3.4276729559748424e-05, + "loss": 0.1233, + "step": 2030 + }, + { + "epoch": 38.490566037735846, + "grad_norm": 4.473904609680176, + "learning_rate": 3.417190775681342e-05, + "loss": 0.1043, + "step": 2040 + }, + { + "epoch": 38.679245283018865, + "grad_norm": 3.2655704021453857, + "learning_rate": 3.4067085953878405e-05, + "loss": 0.0906, + "step": 2050 + }, + { + "epoch": 38.867924528301884, + "grad_norm": 5.039525985717773, + "learning_rate": 3.39622641509434e-05, + "loss": 0.129, + "step": 2060 + }, + { + "epoch": 39.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.1490471512079239, + "eval_runtime": 21.334, + "eval_samples_per_second": 136.12, + "eval_steps_per_second": 4.265, + "step": 2067 + }, + { + "epoch": 39.056603773584904, + "grad_norm": 3.194096326828003, + "learning_rate": 3.3857442348008386e-05, + "loss": 0.0893, + "step": 2070 + }, + { + "epoch": 39.24528301886792, + "grad_norm": 1.9600547552108765, + "learning_rate": 3.375262054507338e-05, + "loss": 0.103, + "step": 2080 + }, + { + "epoch": 39.43396226415094, + "grad_norm": 3.6588046550750732, + "learning_rate": 3.364779874213837e-05, + "loss": 0.1004, + "step": 2090 + }, + { + "epoch": 39.62264150943396, + "grad_norm": 5.536744594573975, + "learning_rate": 3.354297693920336e-05, + "loss": 0.1163, + "step": 2100 + }, + { + "epoch": 39.81132075471698, + "grad_norm": 2.2924985885620117, + "learning_rate": 3.343815513626835e-05, + "loss": 0.0862, + "step": 2110 + }, + { + "epoch": 40.0, + "grad_norm": 4.559634685516357, + "learning_rate": 3.3333333333333335e-05, + "loss": 0.1202, + "step": 2120 + }, + { + "epoch": 40.0, + "eval_accuracy": 0.9545454545454546, + "eval_loss": 0.16163212060928345, + "eval_runtime": 21.3342, + "eval_samples_per_second": 136.119, + "eval_steps_per_second": 4.265, + "step": 2120 + }, + { + "epoch": 40.18867924528302, + "grad_norm": 3.6690125465393066, + "learning_rate": 3.322851153039833e-05, + "loss": 0.111, + "step": 2130 + }, + { + "epoch": 40.37735849056604, + "grad_norm": 4.344234943389893, + "learning_rate": 3.3123689727463316e-05, + "loss": 0.1072, + "step": 2140 + }, + { + "epoch": 40.56603773584906, + "grad_norm": 3.5178353786468506, + "learning_rate": 3.30188679245283e-05, + "loss": 0.1036, + "step": 2150 + }, + { + "epoch": 40.75471698113208, + "grad_norm": 4.753892421722412, + "learning_rate": 3.29140461215933e-05, + "loss": 0.1173, + "step": 2160 + }, + { + "epoch": 40.943396226415096, + "grad_norm": 3.2936408519744873, + "learning_rate": 3.2809224318658284e-05, + "loss": 0.1011, + "step": 2170 + }, + { + "epoch": 41.0, + "eval_accuracy": 0.9569559228650137, + "eval_loss": 0.15179601311683655, + "eval_runtime": 21.3331, + "eval_samples_per_second": 136.127, + "eval_steps_per_second": 4.266, + "step": 2173 + }, + { + "epoch": 41.132075471698116, + "grad_norm": 2.5210537910461426, + "learning_rate": 3.270440251572327e-05, + "loss": 0.0855, + "step": 2180 + }, + { + "epoch": 41.320754716981135, + "grad_norm": 4.329328536987305, + "learning_rate": 3.2599580712788265e-05, + "loss": 0.1184, + "step": 2190 + }, + { + "epoch": 41.509433962264154, + "grad_norm": 7.0313801765441895, + "learning_rate": 3.249475890985325e-05, + "loss": 0.1234, + "step": 2200 + }, + { + "epoch": 41.698113207547166, + "grad_norm": 5.2681884765625, + "learning_rate": 3.238993710691824e-05, + "loss": 0.1292, + "step": 2210 + }, + { + "epoch": 41.886792452830186, + "grad_norm": 3.91233229637146, + "learning_rate": 3.228511530398323e-05, + "loss": 0.1092, + "step": 2220 + }, + { + "epoch": 42.0, + "eval_accuracy": 0.9617768595041323, + "eval_loss": 0.13080263137817383, + "eval_runtime": 21.5397, + "eval_samples_per_second": 134.821, + "eval_steps_per_second": 4.225, + "step": 2226 + }, + { + "epoch": 42.075471698113205, + "grad_norm": 3.9293324947357178, + "learning_rate": 3.218029350104822e-05, + "loss": 0.0893, + "step": 2230 + }, + { + "epoch": 42.264150943396224, + "grad_norm": 3.51786732673645, + "learning_rate": 3.207547169811321e-05, + "loss": 0.09, + "step": 2240 + }, + { + "epoch": 42.45283018867924, + "grad_norm": 3.4958715438842773, + "learning_rate": 3.19706498951782e-05, + "loss": 0.1196, + "step": 2250 + }, + { + "epoch": 42.64150943396226, + "grad_norm": 3.3699843883514404, + "learning_rate": 3.186582809224319e-05, + "loss": 0.1042, + "step": 2260 + }, + { + "epoch": 42.83018867924528, + "grad_norm": 3.706667423248291, + "learning_rate": 3.1761006289308175e-05, + "loss": 0.1163, + "step": 2270 + }, + { + "epoch": 43.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.14582620561122894, + "eval_runtime": 21.7777, + "eval_samples_per_second": 133.347, + "eval_steps_per_second": 4.179, + "step": 2279 + }, + { + "epoch": 43.0188679245283, + "grad_norm": 3.969252347946167, + "learning_rate": 3.165618448637317e-05, + "loss": 0.0884, + "step": 2280 + }, + { + "epoch": 43.20754716981132, + "grad_norm": 5.795734882354736, + "learning_rate": 3.1551362683438156e-05, + "loss": 0.1031, + "step": 2290 + }, + { + "epoch": 43.39622641509434, + "grad_norm": 2.936450481414795, + "learning_rate": 3.144654088050314e-05, + "loss": 0.1086, + "step": 2300 + }, + { + "epoch": 43.58490566037736, + "grad_norm": 2.310685634613037, + "learning_rate": 3.134171907756814e-05, + "loss": 0.1055, + "step": 2310 + }, + { + "epoch": 43.77358490566038, + "grad_norm": 5.497471809387207, + "learning_rate": 3.1236897274633124e-05, + "loss": 0.0898, + "step": 2320 + }, + { + "epoch": 43.9622641509434, + "grad_norm": 3.115891456604004, + "learning_rate": 3.113207547169811e-05, + "loss": 0.1074, + "step": 2330 + }, + { + "epoch": 44.0, + "eval_accuracy": 0.9548898071625345, + "eval_loss": 0.14139670133590698, + "eval_runtime": 21.5889, + "eval_samples_per_second": 134.513, + "eval_steps_per_second": 4.215, + "step": 2332 + }, + { + "epoch": 44.15094339622642, + "grad_norm": 3.430103063583374, + "learning_rate": 3.1027253668763105e-05, + "loss": 0.0852, + "step": 2340 + }, + { + "epoch": 44.339622641509436, + "grad_norm": 3.0305774211883545, + "learning_rate": 3.092243186582809e-05, + "loss": 0.0818, + "step": 2350 + }, + { + "epoch": 44.528301886792455, + "grad_norm": 2.8462817668914795, + "learning_rate": 3.081761006289308e-05, + "loss": 0.0893, + "step": 2360 + }, + { + "epoch": 44.716981132075475, + "grad_norm": 2.0366451740264893, + "learning_rate": 3.071278825995807e-05, + "loss": 0.088, + "step": 2370 + }, + { + "epoch": 44.905660377358494, + "grad_norm": 2.5457682609558105, + "learning_rate": 3.060796645702306e-05, + "loss": 0.0814, + "step": 2380 + }, + { + "epoch": 45.0, + "eval_accuracy": 0.9579889807162535, + "eval_loss": 0.15091215074062347, + "eval_runtime": 21.5296, + "eval_samples_per_second": 134.884, + "eval_steps_per_second": 4.227, + "step": 2385 + }, + { + "epoch": 45.094339622641506, + "grad_norm": 2.5898990631103516, + "learning_rate": 3.050314465408805e-05, + "loss": 0.1118, + "step": 2390 + }, + { + "epoch": 45.283018867924525, + "grad_norm": 4.282632350921631, + "learning_rate": 3.0398322851153044e-05, + "loss": 0.0861, + "step": 2400 + }, + { + "epoch": 45.471698113207545, + "grad_norm": 3.0017223358154297, + "learning_rate": 3.029350104821803e-05, + "loss": 0.0882, + "step": 2410 + }, + { + "epoch": 45.660377358490564, + "grad_norm": 5.122268199920654, + "learning_rate": 3.018867924528302e-05, + "loss": 0.0847, + "step": 2420 + }, + { + "epoch": 45.84905660377358, + "grad_norm": 3.8469204902648926, + "learning_rate": 3.0083857442348012e-05, + "loss": 0.0985, + "step": 2430 + }, + { + "epoch": 46.0, + "eval_accuracy": 0.9628099173553719, + "eval_loss": 0.12866026163101196, + "eval_runtime": 21.7086, + "eval_samples_per_second": 133.772, + "eval_steps_per_second": 4.192, + "step": 2438 + }, + { + "epoch": 46.0377358490566, + "grad_norm": 3.621717691421509, + "learning_rate": 2.9979035639413e-05, + "loss": 0.0899, + "step": 2440 + }, + { + "epoch": 46.22641509433962, + "grad_norm": 5.262154579162598, + "learning_rate": 2.9874213836477987e-05, + "loss": 0.1056, + "step": 2450 + }, + { + "epoch": 46.41509433962264, + "grad_norm": 3.9734673500061035, + "learning_rate": 2.976939203354298e-05, + "loss": 0.0903, + "step": 2460 + }, + { + "epoch": 46.60377358490566, + "grad_norm": 5.937262535095215, + "learning_rate": 2.9664570230607968e-05, + "loss": 0.0728, + "step": 2470 + }, + { + "epoch": 46.79245283018868, + "grad_norm": 5.732816219329834, + "learning_rate": 2.9559748427672958e-05, + "loss": 0.1082, + "step": 2480 + }, + { + "epoch": 46.9811320754717, + "grad_norm": 4.877685070037842, + "learning_rate": 2.945492662473795e-05, + "loss": 0.0863, + "step": 2490 + }, + { + "epoch": 47.0, + "eval_accuracy": 0.962465564738292, + "eval_loss": 0.12769892811775208, + "eval_runtime": 21.6316, + "eval_samples_per_second": 134.248, + "eval_steps_per_second": 4.207, + "step": 2491 + }, + { + "epoch": 47.16981132075472, + "grad_norm": 4.740653991699219, + "learning_rate": 2.935010482180294e-05, + "loss": 0.0924, + "step": 2500 + }, + { + "epoch": 47.35849056603774, + "grad_norm": 3.5254249572753906, + "learning_rate": 2.9245283018867926e-05, + "loss": 0.0829, + "step": 2510 + }, + { + "epoch": 47.54716981132076, + "grad_norm": 2.8752875328063965, + "learning_rate": 2.9140461215932913e-05, + "loss": 0.0917, + "step": 2520 + }, + { + "epoch": 47.735849056603776, + "grad_norm": 3.466445207595825, + "learning_rate": 2.9035639412997907e-05, + "loss": 0.0683, + "step": 2530 + }, + { + "epoch": 47.924528301886795, + "grad_norm": 4.896220684051514, + "learning_rate": 2.8930817610062894e-05, + "loss": 0.0932, + "step": 2540 + }, + { + "epoch": 48.0, + "eval_accuracy": 0.9559228650137741, + "eval_loss": 0.14526014029979706, + "eval_runtime": 21.6957, + "eval_samples_per_second": 133.851, + "eval_steps_per_second": 4.194, + "step": 2544 + }, + { + "epoch": 48.113207547169814, + "grad_norm": 2.655581474304199, + "learning_rate": 2.882599580712788e-05, + "loss": 0.0709, + "step": 2550 + }, + { + "epoch": 48.301886792452834, + "grad_norm": 3.7453079223632812, + "learning_rate": 2.8721174004192875e-05, + "loss": 0.0856, + "step": 2560 + }, + { + "epoch": 48.490566037735846, + "grad_norm": 4.565659999847412, + "learning_rate": 2.8616352201257862e-05, + "loss": 0.0737, + "step": 2570 + }, + { + "epoch": 48.679245283018865, + "grad_norm": 2.7404532432556152, + "learning_rate": 2.851153039832285e-05, + "loss": 0.0843, + "step": 2580 + }, + { + "epoch": 48.867924528301884, + "grad_norm": 2.5788304805755615, + "learning_rate": 2.8406708595387843e-05, + "loss": 0.0863, + "step": 2590 + }, + { + "epoch": 49.0, + "eval_accuracy": 0.9566115702479339, + "eval_loss": 0.15200072526931763, + "eval_runtime": 21.6506, + "eval_samples_per_second": 134.13, + "eval_steps_per_second": 4.203, + "step": 2597 + }, + { + "epoch": 49.056603773584904, + "grad_norm": 2.0969185829162598, + "learning_rate": 2.830188679245283e-05, + "loss": 0.0779, + "step": 2600 + }, + { + "epoch": 49.24528301886792, + "grad_norm": 4.123626232147217, + "learning_rate": 2.8197064989517817e-05, + "loss": 0.1003, + "step": 2610 + }, + { + "epoch": 49.43396226415094, + "grad_norm": 3.9485299587249756, + "learning_rate": 2.809224318658281e-05, + "loss": 0.0862, + "step": 2620 + }, + { + "epoch": 49.62264150943396, + "grad_norm": 3.080941915512085, + "learning_rate": 2.7987421383647798e-05, + "loss": 0.072, + "step": 2630 + }, + { + "epoch": 49.81132075471698, + "grad_norm": 3.656919002532959, + "learning_rate": 2.788259958071279e-05, + "loss": 0.0945, + "step": 2640 + }, + { + "epoch": 50.0, + "grad_norm": 2.570844888687134, + "learning_rate": 2.777777777777778e-05, + "loss": 0.0887, + "step": 2650 + }, + { + "epoch": 50.0, + "eval_accuracy": 0.9655647382920111, + "eval_loss": 0.12789078056812286, + "eval_runtime": 21.6593, + "eval_samples_per_second": 134.076, + "eval_steps_per_second": 4.201, + "step": 2650 + }, + { + "epoch": 50.18867924528302, + "grad_norm": 2.957178831100464, + "learning_rate": 2.767295597484277e-05, + "loss": 0.0693, + "step": 2660 + }, + { + "epoch": 50.37735849056604, + "grad_norm": 4.276056289672852, + "learning_rate": 2.7568134171907757e-05, + "loss": 0.0892, + "step": 2670 + }, + { + "epoch": 50.56603773584906, + "grad_norm": 4.443928241729736, + "learning_rate": 2.746331236897275e-05, + "loss": 0.081, + "step": 2680 + }, + { + "epoch": 50.75471698113208, + "grad_norm": 2.7647054195404053, + "learning_rate": 2.7358490566037738e-05, + "loss": 0.0606, + "step": 2690 + }, + { + "epoch": 50.943396226415096, + "grad_norm": 5.863194942474365, + "learning_rate": 2.7253668763102725e-05, + "loss": 0.0744, + "step": 2700 + }, + { + "epoch": 51.0, + "eval_accuracy": 0.9566115702479339, + "eval_loss": 0.15517625212669373, + "eval_runtime": 21.6847, + "eval_samples_per_second": 133.919, + "eval_steps_per_second": 4.196, + "step": 2703 + }, + { + "epoch": 51.132075471698116, + "grad_norm": 3.68581485748291, + "learning_rate": 2.714884696016772e-05, + "loss": 0.0757, + "step": 2710 + }, + { + "epoch": 51.320754716981135, + "grad_norm": 6.279058933258057, + "learning_rate": 2.7044025157232706e-05, + "loss": 0.0839, + "step": 2720 + }, + { + "epoch": 51.509433962264154, + "grad_norm": 3.6847403049468994, + "learning_rate": 2.6939203354297693e-05, + "loss": 0.0754, + "step": 2730 + }, + { + "epoch": 51.698113207547166, + "grad_norm": 3.514678716659546, + "learning_rate": 2.6834381551362687e-05, + "loss": 0.0717, + "step": 2740 + }, + { + "epoch": 51.886792452830186, + "grad_norm": 5.7339768409729, + "learning_rate": 2.6729559748427674e-05, + "loss": 0.0928, + "step": 2750 + }, + { + "epoch": 52.0, + "eval_accuracy": 0.9621212121212122, + "eval_loss": 0.14646016061306, + "eval_runtime": 21.7451, + "eval_samples_per_second": 133.547, + "eval_steps_per_second": 4.185, + "step": 2756 + }, + { + "epoch": 52.075471698113205, + "grad_norm": 2.6166908740997314, + "learning_rate": 2.662473794549266e-05, + "loss": 0.0622, + "step": 2760 + }, + { + "epoch": 52.264150943396224, + "grad_norm": 4.80858850479126, + "learning_rate": 2.6519916142557655e-05, + "loss": 0.0869, + "step": 2770 + }, + { + "epoch": 52.45283018867924, + "grad_norm": 4.928915500640869, + "learning_rate": 2.641509433962264e-05, + "loss": 0.0796, + "step": 2780 + }, + { + "epoch": 52.64150943396226, + "grad_norm": 4.719991207122803, + "learning_rate": 2.631027253668763e-05, + "loss": 0.0789, + "step": 2790 + }, + { + "epoch": 52.83018867924528, + "grad_norm": 3.452692747116089, + "learning_rate": 2.6205450733752623e-05, + "loss": 0.0776, + "step": 2800 + }, + { + "epoch": 53.0, + "eval_accuracy": 0.9583333333333334, + "eval_loss": 0.15753231942653656, + "eval_runtime": 21.7056, + "eval_samples_per_second": 133.79, + "eval_steps_per_second": 4.192, + "step": 2809 + }, + { + "epoch": 53.0188679245283, + "grad_norm": 3.827479362487793, + "learning_rate": 2.610062893081761e-05, + "loss": 0.0614, + "step": 2810 + }, + { + "epoch": 53.20754716981132, + "grad_norm": 3.628530979156494, + "learning_rate": 2.59958071278826e-05, + "loss": 0.0836, + "step": 2820 + }, + { + "epoch": 53.39622641509434, + "grad_norm": 5.494470119476318, + "learning_rate": 2.589098532494759e-05, + "loss": 0.07, + "step": 2830 + }, + { + "epoch": 53.58490566037736, + "grad_norm": 2.920034646987915, + "learning_rate": 2.578616352201258e-05, + "loss": 0.0861, + "step": 2840 + }, + { + "epoch": 53.77358490566038, + "grad_norm": 2.367824077606201, + "learning_rate": 2.5681341719077568e-05, + "loss": 0.0824, + "step": 2850 + }, + { + "epoch": 53.9622641509434, + "grad_norm": 4.855032444000244, + "learning_rate": 2.5576519916142562e-05, + "loss": 0.088, + "step": 2860 + }, + { + "epoch": 54.0, + "eval_accuracy": 0.956267217630854, + "eval_loss": 0.16139821708202362, + "eval_runtime": 21.715, + "eval_samples_per_second": 133.733, + "eval_steps_per_second": 4.191, + "step": 2862 + }, + { + "epoch": 54.15094339622642, + "grad_norm": 4.724045753479004, + "learning_rate": 2.547169811320755e-05, + "loss": 0.076, + "step": 2870 + }, + { + "epoch": 54.339622641509436, + "grad_norm": 3.425045967102051, + "learning_rate": 2.5366876310272536e-05, + "loss": 0.0882, + "step": 2880 + }, + { + "epoch": 54.528301886792455, + "grad_norm": 2.366727590560913, + "learning_rate": 2.526205450733753e-05, + "loss": 0.0492, + "step": 2890 + }, + { + "epoch": 54.716981132075475, + "grad_norm": 3.3081116676330566, + "learning_rate": 2.5157232704402517e-05, + "loss": 0.0755, + "step": 2900 + }, + { + "epoch": 54.905660377358494, + "grad_norm": 4.172368049621582, + "learning_rate": 2.5052410901467504e-05, + "loss": 0.0909, + "step": 2910 + }, + { + "epoch": 55.0, + "eval_accuracy": 0.9638429752066116, + "eval_loss": 0.13122335076332092, + "eval_runtime": 22.1096, + "eval_samples_per_second": 131.346, + "eval_steps_per_second": 4.116, + "step": 2915 + }, + { + "epoch": 55.094339622641506, + "grad_norm": 4.725098609924316, + "learning_rate": 2.4947589098532495e-05, + "loss": 0.0612, + "step": 2920 + }, + { + "epoch": 55.283018867924525, + "grad_norm": 2.416428327560425, + "learning_rate": 2.4842767295597485e-05, + "loss": 0.067, + "step": 2930 + }, + { + "epoch": 55.471698113207545, + "grad_norm": 4.181457042694092, + "learning_rate": 2.4737945492662476e-05, + "loss": 0.0771, + "step": 2940 + }, + { + "epoch": 55.660377358490564, + "grad_norm": 5.096282005310059, + "learning_rate": 2.4633123689727463e-05, + "loss": 0.0747, + "step": 2950 + }, + { + "epoch": 55.84905660377358, + "grad_norm": 5.836996555328369, + "learning_rate": 2.4528301886792453e-05, + "loss": 0.089, + "step": 2960 + }, + { + "epoch": 56.0, + "eval_accuracy": 0.9652203856749312, + "eval_loss": 0.13570785522460938, + "eval_runtime": 21.9675, + "eval_samples_per_second": 132.195, + "eval_steps_per_second": 4.142, + "step": 2968 + }, + { + "epoch": 56.0377358490566, + "grad_norm": 3.5151453018188477, + "learning_rate": 2.4423480083857444e-05, + "loss": 0.0653, + "step": 2970 + }, + { + "epoch": 56.22641509433962, + "grad_norm": 5.056783676147461, + "learning_rate": 2.431865828092243e-05, + "loss": 0.0768, + "step": 2980 + }, + { + "epoch": 56.41509433962264, + "grad_norm": 2.0527374744415283, + "learning_rate": 2.421383647798742e-05, + "loss": 0.0694, + "step": 2990 + }, + { + "epoch": 56.60377358490566, + "grad_norm": 3.363852024078369, + "learning_rate": 2.4109014675052412e-05, + "loss": 0.0763, + "step": 3000 + }, + { + "epoch": 56.79245283018868, + "grad_norm": 2.2541282176971436, + "learning_rate": 2.4004192872117402e-05, + "loss": 0.0717, + "step": 3010 + }, + { + "epoch": 56.9811320754717, + "grad_norm": 1.5099104642868042, + "learning_rate": 2.3899371069182393e-05, + "loss": 0.0587, + "step": 3020 + }, + { + "epoch": 57.0, + "eval_accuracy": 0.9614325068870524, + "eval_loss": 0.15099208056926727, + "eval_runtime": 21.4231, + "eval_samples_per_second": 135.554, + "eval_steps_per_second": 4.248, + "step": 3021 + }, + { + "epoch": 57.16981132075472, + "grad_norm": 3.0756702423095703, + "learning_rate": 2.3794549266247383e-05, + "loss": 0.0931, + "step": 3030 + }, + { + "epoch": 57.35849056603774, + "grad_norm": 5.203828811645508, + "learning_rate": 2.368972746331237e-05, + "loss": 0.077, + "step": 3040 + }, + { + "epoch": 57.54716981132076, + "grad_norm": 3.4697303771972656, + "learning_rate": 2.358490566037736e-05, + "loss": 0.0773, + "step": 3050 + }, + { + "epoch": 57.735849056603776, + "grad_norm": 5.211211681365967, + "learning_rate": 2.348008385744235e-05, + "loss": 0.0829, + "step": 3060 + }, + { + "epoch": 57.924528301886795, + "grad_norm": 3.6697630882263184, + "learning_rate": 2.3375262054507338e-05, + "loss": 0.0931, + "step": 3070 + }, + { + "epoch": 58.0, + "eval_accuracy": 0.9579889807162535, + "eval_loss": 0.14657209813594818, + "eval_runtime": 22.1099, + "eval_samples_per_second": 131.344, + "eval_steps_per_second": 4.116, + "step": 3074 + }, + { + "epoch": 58.113207547169814, + "grad_norm": 5.555974960327148, + "learning_rate": 2.327044025157233e-05, + "loss": 0.0583, + "step": 3080 + }, + { + "epoch": 58.301886792452834, + "grad_norm": 4.070455551147461, + "learning_rate": 2.316561844863732e-05, + "loss": 0.0746, + "step": 3090 + }, + { + "epoch": 58.490566037735846, + "grad_norm": 2.739751100540161, + "learning_rate": 2.3060796645702306e-05, + "loss": 0.0698, + "step": 3100 + }, + { + "epoch": 58.679245283018865, + "grad_norm": 4.419424057006836, + "learning_rate": 2.2955974842767297e-05, + "loss": 0.0669, + "step": 3110 + }, + { + "epoch": 58.867924528301884, + "grad_norm": 2.998007297515869, + "learning_rate": 2.2851153039832284e-05, + "loss": 0.0878, + "step": 3120 + }, + { + "epoch": 59.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.14993391931056976, + "eval_runtime": 21.3446, + "eval_samples_per_second": 136.053, + "eval_steps_per_second": 4.263, + "step": 3127 + }, + { + "epoch": 59.056603773584904, + "grad_norm": 3.094158411026001, + "learning_rate": 2.2746331236897274e-05, + "loss": 0.0679, + "step": 3130 + }, + { + "epoch": 59.24528301886792, + "grad_norm": 3.320544958114624, + "learning_rate": 2.2641509433962265e-05, + "loss": 0.066, + "step": 3140 + }, + { + "epoch": 59.43396226415094, + "grad_norm": 5.472837924957275, + "learning_rate": 2.2536687631027252e-05, + "loss": 0.071, + "step": 3150 + }, + { + "epoch": 59.62264150943396, + "grad_norm": 2.2281177043914795, + "learning_rate": 2.2431865828092242e-05, + "loss": 0.075, + "step": 3160 + }, + { + "epoch": 59.81132075471698, + "grad_norm": 4.7084503173828125, + "learning_rate": 2.2327044025157233e-05, + "loss": 0.0763, + "step": 3170 + }, + { + "epoch": 60.0, + "grad_norm": 4.050378322601318, + "learning_rate": 2.2222222222222223e-05, + "loss": 0.0725, + "step": 3180 + }, + { + "epoch": 60.0, + "eval_accuracy": 0.9597107438016529, + "eval_loss": 0.15237364172935486, + "eval_runtime": 21.4189, + "eval_samples_per_second": 135.581, + "eval_steps_per_second": 4.249, + "step": 3180 + }, + { + "epoch": 60.18867924528302, + "grad_norm": 5.035741329193115, + "learning_rate": 2.2117400419287214e-05, + "loss": 0.0754, + "step": 3190 + }, + { + "epoch": 60.37735849056604, + "grad_norm": 2.6131255626678467, + "learning_rate": 2.2012578616352204e-05, + "loss": 0.0727, + "step": 3200 + }, + { + "epoch": 60.56603773584906, + "grad_norm": 2.404343366622925, + "learning_rate": 2.190775681341719e-05, + "loss": 0.055, + "step": 3210 + }, + { + "epoch": 60.75471698113208, + "grad_norm": 3.9234628677368164, + "learning_rate": 2.1802935010482182e-05, + "loss": 0.0671, + "step": 3220 + }, + { + "epoch": 60.943396226415096, + "grad_norm": 2.465709686279297, + "learning_rate": 2.1698113207547172e-05, + "loss": 0.0543, + "step": 3230 + }, + { + "epoch": 61.0, + "eval_accuracy": 0.9583333333333334, + "eval_loss": 0.15427254140377045, + "eval_runtime": 21.2569, + "eval_samples_per_second": 136.615, + "eval_steps_per_second": 4.281, + "step": 3233 + }, + { + "epoch": 61.132075471698116, + "grad_norm": 4.535091876983643, + "learning_rate": 2.159329140461216e-05, + "loss": 0.0529, + "step": 3240 + }, + { + "epoch": 61.320754716981135, + "grad_norm": 2.738173484802246, + "learning_rate": 2.148846960167715e-05, + "loss": 0.0566, + "step": 3250 + }, + { + "epoch": 61.509433962264154, + "grad_norm": 3.950739860534668, + "learning_rate": 2.138364779874214e-05, + "loss": 0.0729, + "step": 3260 + }, + { + "epoch": 61.698113207547166, + "grad_norm": 2.284546375274658, + "learning_rate": 2.1278825995807127e-05, + "loss": 0.0555, + "step": 3270 + }, + { + "epoch": 61.886792452830186, + "grad_norm": 4.242305278778076, + "learning_rate": 2.1174004192872118e-05, + "loss": 0.0773, + "step": 3280 + }, + { + "epoch": 62.0, + "eval_accuracy": 0.9634986225895317, + "eval_loss": 0.15126247704029083, + "eval_runtime": 21.4863, + "eval_samples_per_second": 135.156, + "eval_steps_per_second": 4.235, + "step": 3286 + }, + { + "epoch": 62.075471698113205, + "grad_norm": 5.880760669708252, + "learning_rate": 2.106918238993711e-05, + "loss": 0.0663, + "step": 3290 + }, + { + "epoch": 62.264150943396224, + "grad_norm": 4.1386237144470215, + "learning_rate": 2.0964360587002095e-05, + "loss": 0.0643, + "step": 3300 + }, + { + "epoch": 62.45283018867924, + "grad_norm": 3.798180341720581, + "learning_rate": 2.0859538784067086e-05, + "loss": 0.0789, + "step": 3310 + }, + { + "epoch": 62.64150943396226, + "grad_norm": 2.6862704753875732, + "learning_rate": 2.0754716981132076e-05, + "loss": 0.0838, + "step": 3320 + }, + { + "epoch": 62.83018867924528, + "grad_norm": 4.914183616638184, + "learning_rate": 2.0649895178197063e-05, + "loss": 0.0626, + "step": 3330 + }, + { + "epoch": 63.0, + "eval_accuracy": 0.9600550964187328, + "eval_loss": 0.1511116325855255, + "eval_runtime": 21.3341, + "eval_samples_per_second": 136.12, + "eval_steps_per_second": 4.265, + "step": 3339 + }, + { + "epoch": 63.0188679245283, + "grad_norm": 3.25014591217041, + "learning_rate": 2.0545073375262054e-05, + "loss": 0.0534, + "step": 3340 + }, + { + "epoch": 63.20754716981132, + "grad_norm": 3.0906035900115967, + "learning_rate": 2.0440251572327044e-05, + "loss": 0.0598, + "step": 3350 + }, + { + "epoch": 63.39622641509434, + "grad_norm": 3.2928597927093506, + "learning_rate": 2.0335429769392035e-05, + "loss": 0.0704, + "step": 3360 + }, + { + "epoch": 63.58490566037736, + "grad_norm": 2.6541659832000732, + "learning_rate": 2.0230607966457025e-05, + "loss": 0.0541, + "step": 3370 + }, + { + "epoch": 63.77358490566038, + "grad_norm": 4.403809070587158, + "learning_rate": 2.0125786163522016e-05, + "loss": 0.0563, + "step": 3380 + }, + { + "epoch": 63.9622641509434, + "grad_norm": 4.669613361358643, + "learning_rate": 2.0020964360587003e-05, + "loss": 0.0649, + "step": 3390 + }, + { + "epoch": 64.0, + "eval_accuracy": 0.959366391184573, + "eval_loss": 0.1467009335756302, + "eval_runtime": 21.2656, + "eval_samples_per_second": 136.558, + "eval_steps_per_second": 4.279, + "step": 3392 + }, + { + "epoch": 64.15094339622641, + "grad_norm": 2.8066558837890625, + "learning_rate": 1.9916142557651993e-05, + "loss": 0.0746, + "step": 3400 + }, + { + "epoch": 64.33962264150944, + "grad_norm": 5.994083881378174, + "learning_rate": 1.9811320754716984e-05, + "loss": 0.0704, + "step": 3410 + }, + { + "epoch": 64.52830188679245, + "grad_norm": 4.9809441566467285, + "learning_rate": 1.970649895178197e-05, + "loss": 0.0773, + "step": 3420 + }, + { + "epoch": 64.71698113207547, + "grad_norm": 2.497436285018921, + "learning_rate": 1.960167714884696e-05, + "loss": 0.0781, + "step": 3430 + }, + { + "epoch": 64.90566037735849, + "grad_norm": 3.954669952392578, + "learning_rate": 1.9496855345911952e-05, + "loss": 0.0705, + "step": 3440 + }, + { + "epoch": 65.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.1443195939064026, + "eval_runtime": 21.2667, + "eval_samples_per_second": 136.551, + "eval_steps_per_second": 4.279, + "step": 3445 + }, + { + "epoch": 65.09433962264151, + "grad_norm": 3.244377374649048, + "learning_rate": 1.939203354297694e-05, + "loss": 0.0515, + "step": 3450 + }, + { + "epoch": 65.28301886792453, + "grad_norm": 2.0447616577148438, + "learning_rate": 1.928721174004193e-05, + "loss": 0.0602, + "step": 3460 + }, + { + "epoch": 65.47169811320755, + "grad_norm": 9.920838356018066, + "learning_rate": 1.918238993710692e-05, + "loss": 0.072, + "step": 3470 + }, + { + "epoch": 65.66037735849056, + "grad_norm": 2.986557960510254, + "learning_rate": 1.9077568134171907e-05, + "loss": 0.0571, + "step": 3480 + }, + { + "epoch": 65.84905660377359, + "grad_norm": 3.244969606399536, + "learning_rate": 1.8972746331236897e-05, + "loss": 0.0737, + "step": 3490 + }, + { + "epoch": 66.0, + "eval_accuracy": 0.9607438016528925, + "eval_loss": 0.13612627983093262, + "eval_runtime": 21.1938, + "eval_samples_per_second": 137.021, + "eval_steps_per_second": 4.294, + "step": 3498 + }, + { + "epoch": 66.0377358490566, + "grad_norm": 3.0969536304473877, + "learning_rate": 1.8867924528301888e-05, + "loss": 0.0581, + "step": 3500 + }, + { + "epoch": 66.22641509433963, + "grad_norm": 5.167777061462402, + "learning_rate": 1.876310272536688e-05, + "loss": 0.0744, + "step": 3510 + }, + { + "epoch": 66.41509433962264, + "grad_norm": 3.00007700920105, + "learning_rate": 1.865828092243187e-05, + "loss": 0.0543, + "step": 3520 + }, + { + "epoch": 66.60377358490567, + "grad_norm": 2.6348941326141357, + "learning_rate": 1.8553459119496856e-05, + "loss": 0.0567, + "step": 3530 + }, + { + "epoch": 66.79245283018868, + "grad_norm": 5.631946563720703, + "learning_rate": 1.8448637316561846e-05, + "loss": 0.0779, + "step": 3540 + }, + { + "epoch": 66.98113207547169, + "grad_norm": 1.5974416732788086, + "learning_rate": 1.8343815513626837e-05, + "loss": 0.0518, + "step": 3550 + }, + { + "epoch": 67.0, + "eval_accuracy": 0.959366391184573, + "eval_loss": 0.14412498474121094, + "eval_runtime": 21.2408, + "eval_samples_per_second": 136.718, + "eval_steps_per_second": 4.284, + "step": 3551 + }, + { + "epoch": 67.16981132075472, + "grad_norm": 2.0217528343200684, + "learning_rate": 1.8238993710691824e-05, + "loss": 0.0645, + "step": 3560 + }, + { + "epoch": 67.35849056603773, + "grad_norm": 3.986748695373535, + "learning_rate": 1.8134171907756814e-05, + "loss": 0.0647, + "step": 3570 + }, + { + "epoch": 67.54716981132076, + "grad_norm": 4.671311855316162, + "learning_rate": 1.8029350104821805e-05, + "loss": 0.0666, + "step": 3580 + }, + { + "epoch": 67.73584905660377, + "grad_norm": 2.0372939109802246, + "learning_rate": 1.7924528301886792e-05, + "loss": 0.0554, + "step": 3590 + }, + { + "epoch": 67.9245283018868, + "grad_norm": 2.52120041847229, + "learning_rate": 1.7819706498951782e-05, + "loss": 0.0502, + "step": 3600 + }, + { + "epoch": 68.0, + "eval_accuracy": 0.9590220385674931, + "eval_loss": 0.1534823328256607, + "eval_runtime": 21.3789, + "eval_samples_per_second": 135.835, + "eval_steps_per_second": 4.257, + "step": 3604 + }, + { + "epoch": 68.11320754716981, + "grad_norm": 1.7292485237121582, + "learning_rate": 1.7714884696016773e-05, + "loss": 0.0696, + "step": 3610 + }, + { + "epoch": 68.30188679245283, + "grad_norm": 2.2730910778045654, + "learning_rate": 1.761006289308176e-05, + "loss": 0.0608, + "step": 3620 + }, + { + "epoch": 68.49056603773585, + "grad_norm": 3.564232110977173, + "learning_rate": 1.750524109014675e-05, + "loss": 0.0604, + "step": 3630 + }, + { + "epoch": 68.67924528301887, + "grad_norm": 2.5112924575805664, + "learning_rate": 1.740041928721174e-05, + "loss": 0.0415, + "step": 3640 + }, + { + "epoch": 68.86792452830188, + "grad_norm": 5.021323204040527, + "learning_rate": 1.7295597484276728e-05, + "loss": 0.0701, + "step": 3650 + }, + { + "epoch": 69.0, + "eval_accuracy": 0.9662534435261708, + "eval_loss": 0.1362384557723999, + "eval_runtime": 21.2026, + "eval_samples_per_second": 136.964, + "eval_steps_per_second": 4.292, + "step": 3657 + }, + { + "epoch": 69.05660377358491, + "grad_norm": 5.013925075531006, + "learning_rate": 1.719077568134172e-05, + "loss": 0.0744, + "step": 3660 + }, + { + "epoch": 69.24528301886792, + "grad_norm": 3.5540971755981445, + "learning_rate": 1.708595387840671e-05, + "loss": 0.0568, + "step": 3670 + }, + { + "epoch": 69.43396226415095, + "grad_norm": 3.4611597061157227, + "learning_rate": 1.69811320754717e-05, + "loss": 0.0513, + "step": 3680 + }, + { + "epoch": 69.62264150943396, + "grad_norm": 2.5300846099853516, + "learning_rate": 1.687631027253669e-05, + "loss": 0.0442, + "step": 3690 + }, + { + "epoch": 69.81132075471699, + "grad_norm": 2.6349620819091797, + "learning_rate": 1.677148846960168e-05, + "loss": 0.054, + "step": 3700 + }, + { + "epoch": 70.0, + "grad_norm": 3.122040033340454, + "learning_rate": 1.6666666666666667e-05, + "loss": 0.0826, + "step": 3710 + }, + { + "epoch": 70.0, + "eval_accuracy": 0.9610881542699724, + "eval_loss": 0.1492019146680832, + "eval_runtime": 21.2837, + "eval_samples_per_second": 136.442, + "eval_steps_per_second": 4.276, + "step": 3710 + }, + { + "epoch": 70.18867924528301, + "grad_norm": 3.5501227378845215, + "learning_rate": 1.6561844863731658e-05, + "loss": 0.0627, + "step": 3720 + }, + { + "epoch": 70.37735849056604, + "grad_norm": 2.6497879028320312, + "learning_rate": 1.645702306079665e-05, + "loss": 0.0461, + "step": 3730 + }, + { + "epoch": 70.56603773584905, + "grad_norm": 2.9843809604644775, + "learning_rate": 1.6352201257861635e-05, + "loss": 0.0484, + "step": 3740 + }, + { + "epoch": 70.75471698113208, + "grad_norm": 8.867347717285156, + "learning_rate": 1.6247379454926626e-05, + "loss": 0.0595, + "step": 3750 + }, + { + "epoch": 70.94339622641509, + "grad_norm": 4.957089900970459, + "learning_rate": 1.6142557651991616e-05, + "loss": 0.0715, + "step": 3760 + }, + { + "epoch": 71.0, + "eval_accuracy": 0.962465564738292, + "eval_loss": 0.16146376729011536, + "eval_runtime": 21.2899, + "eval_samples_per_second": 136.403, + "eval_steps_per_second": 4.274, + "step": 3763 + }, + { + "epoch": 71.13207547169812, + "grad_norm": 3.982633590698242, + "learning_rate": 1.6037735849056604e-05, + "loss": 0.0534, + "step": 3770 + }, + { + "epoch": 71.32075471698113, + "grad_norm": 4.253650188446045, + "learning_rate": 1.5932914046121594e-05, + "loss": 0.0677, + "step": 3780 + }, + { + "epoch": 71.50943396226415, + "grad_norm": 4.608425140380859, + "learning_rate": 1.5828092243186584e-05, + "loss": 0.0495, + "step": 3790 + }, + { + "epoch": 71.69811320754717, + "grad_norm": 5.12533712387085, + "learning_rate": 1.572327044025157e-05, + "loss": 0.0633, + "step": 3800 + }, + { + "epoch": 71.88679245283019, + "grad_norm": 4.220004558563232, + "learning_rate": 1.5618448637316562e-05, + "loss": 0.0635, + "step": 3810 + }, + { + "epoch": 72.0, + "eval_accuracy": 0.9641873278236914, + "eval_loss": 0.14879465103149414, + "eval_runtime": 21.2409, + "eval_samples_per_second": 136.717, + "eval_steps_per_second": 4.284, + "step": 3816 + }, + { + "epoch": 72.0754716981132, + "grad_norm": 1.841488242149353, + "learning_rate": 1.5513626834381552e-05, + "loss": 0.0608, + "step": 3820 + }, + { + "epoch": 72.26415094339623, + "grad_norm": 2.0446391105651855, + "learning_rate": 1.540880503144654e-05, + "loss": 0.0465, + "step": 3830 + }, + { + "epoch": 72.45283018867924, + "grad_norm": 3.4776628017425537, + "learning_rate": 1.530398322851153e-05, + "loss": 0.0575, + "step": 3840 + }, + { + "epoch": 72.64150943396227, + "grad_norm": 2.3915700912475586, + "learning_rate": 1.5199161425576522e-05, + "loss": 0.0564, + "step": 3850 + }, + { + "epoch": 72.83018867924528, + "grad_norm": 1.4522980451583862, + "learning_rate": 1.509433962264151e-05, + "loss": 0.0522, + "step": 3860 + }, + { + "epoch": 73.0, + "eval_accuracy": 0.9621212121212122, + "eval_loss": 0.14563634991645813, + "eval_runtime": 21.397, + "eval_samples_per_second": 135.72, + "eval_steps_per_second": 4.253, + "step": 3869 + }, + { + "epoch": 73.01886792452831, + "grad_norm": 2.048356056213379, + "learning_rate": 1.49895178197065e-05, + "loss": 0.0565, + "step": 3870 + }, + { + "epoch": 73.20754716981132, + "grad_norm": 2.7343058586120605, + "learning_rate": 1.488469601677149e-05, + "loss": 0.0403, + "step": 3880 + }, + { + "epoch": 73.39622641509433, + "grad_norm": 1.833511471748352, + "learning_rate": 1.4779874213836479e-05, + "loss": 0.0502, + "step": 3890 + }, + { + "epoch": 73.58490566037736, + "grad_norm": 2.4152145385742188, + "learning_rate": 1.467505241090147e-05, + "loss": 0.0539, + "step": 3900 + }, + { + "epoch": 73.77358490566037, + "grad_norm": 2.6949825286865234, + "learning_rate": 1.4570230607966457e-05, + "loss": 0.0499, + "step": 3910 + }, + { + "epoch": 73.9622641509434, + "grad_norm": 3.1027348041534424, + "learning_rate": 1.4465408805031447e-05, + "loss": 0.0485, + "step": 3920 + }, + { + "epoch": 74.0, + "eval_accuracy": 0.9645316804407713, + "eval_loss": 0.1386471837759018, + "eval_runtime": 21.3052, + "eval_samples_per_second": 136.304, + "eval_steps_per_second": 4.271, + "step": 3922 + }, + { + "epoch": 74.15094339622641, + "grad_norm": 1.7940607070922852, + "learning_rate": 1.4360587002096438e-05, + "loss": 0.0541, + "step": 3930 + }, + { + "epoch": 74.33962264150944, + "grad_norm": 2.581839084625244, + "learning_rate": 1.4255765199161425e-05, + "loss": 0.044, + "step": 3940 + }, + { + "epoch": 74.52830188679245, + "grad_norm": 1.7485500574111938, + "learning_rate": 1.4150943396226415e-05, + "loss": 0.043, + "step": 3950 + }, + { + "epoch": 74.71698113207547, + "grad_norm": 2.436922550201416, + "learning_rate": 1.4046121593291406e-05, + "loss": 0.0601, + "step": 3960 + }, + { + "epoch": 74.90566037735849, + "grad_norm": 2.460512638092041, + "learning_rate": 1.3941299790356394e-05, + "loss": 0.0629, + "step": 3970 + }, + { + "epoch": 75.0, + "eval_accuracy": 0.9631542699724518, + "eval_loss": 0.14631205797195435, + "eval_runtime": 21.2256, + "eval_samples_per_second": 136.816, + "eval_steps_per_second": 4.287, + "step": 3975 + }, + { + "epoch": 75.09433962264151, + "grad_norm": 2.925687551498413, + "learning_rate": 1.3836477987421385e-05, + "loss": 0.0535, + "step": 3980 + }, + { + "epoch": 75.28301886792453, + "grad_norm": 1.9862323999404907, + "learning_rate": 1.3731656184486375e-05, + "loss": 0.0586, + "step": 3990 + }, + { + "epoch": 75.47169811320755, + "grad_norm": 1.5170574188232422, + "learning_rate": 1.3626834381551362e-05, + "loss": 0.0471, + "step": 4000 + }, + { + "epoch": 75.66037735849056, + "grad_norm": 4.97707986831665, + "learning_rate": 1.3522012578616353e-05, + "loss": 0.0529, + "step": 4010 + }, + { + "epoch": 75.84905660377359, + "grad_norm": 1.9036014080047607, + "learning_rate": 1.3417190775681343e-05, + "loss": 0.0568, + "step": 4020 + }, + { + "epoch": 76.0, + "eval_accuracy": 0.9621212121212122, + "eval_loss": 0.14720916748046875, + "eval_runtime": 21.3087, + "eval_samples_per_second": 136.282, + "eval_steps_per_second": 4.271, + "step": 4028 + }, + { + "epoch": 76.0377358490566, + "grad_norm": 3.9243173599243164, + "learning_rate": 1.331236897274633e-05, + "loss": 0.0531, + "step": 4030 + }, + { + "epoch": 76.22641509433963, + "grad_norm": 3.3605711460113525, + "learning_rate": 1.320754716981132e-05, + "loss": 0.0625, + "step": 4040 + }, + { + "epoch": 76.41509433962264, + "grad_norm": 3.699404716491699, + "learning_rate": 1.3102725366876311e-05, + "loss": 0.0522, + "step": 4050 + }, + { + "epoch": 76.60377358490567, + "grad_norm": 4.595207691192627, + "learning_rate": 1.29979035639413e-05, + "loss": 0.0611, + "step": 4060 + }, + { + "epoch": 76.79245283018868, + "grad_norm": 6.970567226409912, + "learning_rate": 1.289308176100629e-05, + "loss": 0.0641, + "step": 4070 + }, + { + "epoch": 76.98113207547169, + "grad_norm": 5.223586559295654, + "learning_rate": 1.2788259958071281e-05, + "loss": 0.0556, + "step": 4080 + }, + { + "epoch": 77.0, + "eval_accuracy": 0.9659090909090909, + "eval_loss": 0.14402107894420624, + "eval_runtime": 21.2908, + "eval_samples_per_second": 136.397, + "eval_steps_per_second": 4.274, + "step": 4081 + }, + { + "epoch": 77.16981132075472, + "grad_norm": 1.7077960968017578, + "learning_rate": 1.2683438155136268e-05, + "loss": 0.0477, + "step": 4090 + }, + { + "epoch": 77.35849056603773, + "grad_norm": 3.9550302028656006, + "learning_rate": 1.2578616352201259e-05, + "loss": 0.0631, + "step": 4100 + }, + { + "epoch": 77.54716981132076, + "grad_norm": 3.574674129486084, + "learning_rate": 1.2473794549266247e-05, + "loss": 0.0357, + "step": 4110 + }, + { + "epoch": 77.73584905660377, + "grad_norm": 3.119210958480835, + "learning_rate": 1.2368972746331238e-05, + "loss": 0.0382, + "step": 4120 + }, + { + "epoch": 77.9245283018868, + "grad_norm": 1.9549647569656372, + "learning_rate": 1.2264150943396227e-05, + "loss": 0.0547, + "step": 4130 + }, + { + "epoch": 78.0, + "eval_accuracy": 0.9634986225895317, + "eval_loss": 0.14210809767246246, + "eval_runtime": 21.3339, + "eval_samples_per_second": 136.121, + "eval_steps_per_second": 4.266, + "step": 4134 + }, + { + "epoch": 78.11320754716981, + "grad_norm": 3.379382610321045, + "learning_rate": 1.2159329140461215e-05, + "loss": 0.0632, + "step": 4140 + }, + { + "epoch": 78.30188679245283, + "grad_norm": 5.461263656616211, + "learning_rate": 1.2054507337526206e-05, + "loss": 0.0689, + "step": 4150 + }, + { + "epoch": 78.49056603773585, + "grad_norm": 4.160185813903809, + "learning_rate": 1.1949685534591196e-05, + "loss": 0.054, + "step": 4160 + }, + { + "epoch": 78.67924528301887, + "grad_norm": 2.50945782661438, + "learning_rate": 1.1844863731656185e-05, + "loss": 0.0302, + "step": 4170 + }, + { + "epoch": 78.86792452830188, + "grad_norm": 3.265209197998047, + "learning_rate": 1.1740041928721176e-05, + "loss": 0.0527, + "step": 4180 + }, + { + "epoch": 79.0, + "eval_accuracy": 0.9683195592286501, + "eval_loss": 0.14441226422786713, + "eval_runtime": 21.2921, + "eval_samples_per_second": 136.389, + "eval_steps_per_second": 4.274, + "step": 4187 + }, + { + "epoch": 79.05660377358491, + "grad_norm": 1.1379858255386353, + "learning_rate": 1.1635220125786164e-05, + "loss": 0.0419, + "step": 4190 + }, + { + "epoch": 79.24528301886792, + "grad_norm": 5.1714043617248535, + "learning_rate": 1.1530398322851153e-05, + "loss": 0.053, + "step": 4200 + }, + { + "epoch": 79.43396226415095, + "grad_norm": 3.441499710083008, + "learning_rate": 1.1425576519916142e-05, + "loss": 0.0588, + "step": 4210 + }, + { + "epoch": 79.62264150943396, + "grad_norm": 1.575990915298462, + "learning_rate": 1.1320754716981132e-05, + "loss": 0.0333, + "step": 4220 + }, + { + "epoch": 79.81132075471699, + "grad_norm": 1.0692986249923706, + "learning_rate": 1.1215932914046121e-05, + "loss": 0.0375, + "step": 4230 + }, + { + "epoch": 80.0, + "grad_norm": 6.881436824798584, + "learning_rate": 1.1111111111111112e-05, + "loss": 0.054, + "step": 4240 + }, + { + "epoch": 80.0, + "eval_accuracy": 0.9628099173553719, + "eval_loss": 0.1463625133037567, + "eval_runtime": 21.276, + "eval_samples_per_second": 136.492, + "eval_steps_per_second": 4.277, + "step": 4240 + }, + { + "epoch": 80.18867924528301, + "grad_norm": 3.086602210998535, + "learning_rate": 1.1006289308176102e-05, + "loss": 0.0614, + "step": 4250 + }, + { + "epoch": 80.37735849056604, + "grad_norm": 1.642980933189392, + "learning_rate": 1.0901467505241091e-05, + "loss": 0.048, + "step": 4260 + }, + { + "epoch": 80.56603773584905, + "grad_norm": 5.067837715148926, + "learning_rate": 1.079664570230608e-05, + "loss": 0.0461, + "step": 4270 + }, + { + "epoch": 80.75471698113208, + "grad_norm": 3.674088478088379, + "learning_rate": 1.069182389937107e-05, + "loss": 0.0529, + "step": 4280 + }, + { + "epoch": 80.94339622641509, + "grad_norm": 3.663996934890747, + "learning_rate": 1.0587002096436059e-05, + "loss": 0.0641, + "step": 4290 + }, + { + "epoch": 81.0, + "eval_accuracy": 0.9634986225895317, + "eval_loss": 0.1491348147392273, + "eval_runtime": 21.3519, + "eval_samples_per_second": 136.007, + "eval_steps_per_second": 4.262, + "step": 4293 + }, + { + "epoch": 81.13207547169812, + "grad_norm": 3.4041941165924072, + "learning_rate": 1.0482180293501048e-05, + "loss": 0.0511, + "step": 4300 + }, + { + "epoch": 81.32075471698113, + "grad_norm": 5.157493591308594, + "learning_rate": 1.0377358490566038e-05, + "loss": 0.0471, + "step": 4310 + }, + { + "epoch": 81.50943396226415, + "grad_norm": 5.192855358123779, + "learning_rate": 1.0272536687631027e-05, + "loss": 0.0491, + "step": 4320 + }, + { + "epoch": 81.69811320754717, + "grad_norm": 2.236807346343994, + "learning_rate": 1.0167714884696017e-05, + "loss": 0.0358, + "step": 4330 + }, + { + "epoch": 81.88679245283019, + "grad_norm": 4.674067497253418, + "learning_rate": 1.0062893081761008e-05, + "loss": 0.0546, + "step": 4340 + }, + { + "epoch": 82.0, + "eval_accuracy": 0.9610881542699724, + "eval_loss": 0.15290114283561707, + "eval_runtime": 21.4172, + "eval_samples_per_second": 135.592, + "eval_steps_per_second": 4.249, + "step": 4346 + }, + { + "epoch": 82.0754716981132, + "grad_norm": 2.6720902919769287, + "learning_rate": 9.958071278825997e-06, + "loss": 0.0493, + "step": 4350 + }, + { + "epoch": 82.26415094339623, + "grad_norm": 3.191582441329956, + "learning_rate": 9.853249475890985e-06, + "loss": 0.0479, + "step": 4360 + }, + { + "epoch": 82.45283018867924, + "grad_norm": 6.1738481521606445, + "learning_rate": 9.748427672955976e-06, + "loss": 0.0517, + "step": 4370 + }, + { + "epoch": 82.64150943396227, + "grad_norm": 2.8287763595581055, + "learning_rate": 9.643605870020965e-06, + "loss": 0.0598, + "step": 4380 + }, + { + "epoch": 82.83018867924528, + "grad_norm": 2.7232823371887207, + "learning_rate": 9.538784067085953e-06, + "loss": 0.059, + "step": 4390 + }, + { + "epoch": 83.0, + "eval_accuracy": 0.9652203856749312, + "eval_loss": 0.14617380499839783, + "eval_runtime": 21.3846, + "eval_samples_per_second": 135.798, + "eval_steps_per_second": 4.255, + "step": 4399 + }, + { + "epoch": 83.01886792452831, + "grad_norm": 3.0003349781036377, + "learning_rate": 9.433962264150944e-06, + "loss": 0.061, + "step": 4400 + }, + { + "epoch": 83.20754716981132, + "grad_norm": 4.4709038734436035, + "learning_rate": 9.329140461215934e-06, + "loss": 0.0468, + "step": 4410 + }, + { + "epoch": 83.39622641509433, + "grad_norm": 3.809194564819336, + "learning_rate": 9.224318658280923e-06, + "loss": 0.048, + "step": 4420 + }, + { + "epoch": 83.58490566037736, + "grad_norm": 4.134964942932129, + "learning_rate": 9.119496855345912e-06, + "loss": 0.0593, + "step": 4430 + }, + { + "epoch": 83.77358490566037, + "grad_norm": 6.407557964324951, + "learning_rate": 9.014675052410902e-06, + "loss": 0.058, + "step": 4440 + }, + { + "epoch": 83.9622641509434, + "grad_norm": 2.055232048034668, + "learning_rate": 8.909853249475891e-06, + "loss": 0.0485, + "step": 4450 + }, + { + "epoch": 84.0, + "eval_accuracy": 0.9631542699724518, + "eval_loss": 0.15668189525604248, + "eval_runtime": 21.3505, + "eval_samples_per_second": 136.016, + "eval_steps_per_second": 4.262, + "step": 4452 + }, + { + "epoch": 84.15094339622641, + "grad_norm": 2.221158266067505, + "learning_rate": 8.80503144654088e-06, + "loss": 0.0555, + "step": 4460 + }, + { + "epoch": 84.33962264150944, + "grad_norm": 5.540987968444824, + "learning_rate": 8.70020964360587e-06, + "loss": 0.0507, + "step": 4470 + }, + { + "epoch": 84.52830188679245, + "grad_norm": 2.087411642074585, + "learning_rate": 8.59538784067086e-06, + "loss": 0.0485, + "step": 4480 + }, + { + "epoch": 84.71698113207547, + "grad_norm": 6.342270374298096, + "learning_rate": 8.49056603773585e-06, + "loss": 0.0533, + "step": 4490 + }, + { + "epoch": 84.90566037735849, + "grad_norm": 1.974360466003418, + "learning_rate": 8.38574423480084e-06, + "loss": 0.0388, + "step": 4500 + }, + { + "epoch": 85.0, + "eval_accuracy": 0.9621212121212122, + "eval_loss": 0.15479591488838196, + "eval_runtime": 21.3928, + "eval_samples_per_second": 135.747, + "eval_steps_per_second": 4.254, + "step": 4505 + }, + { + "epoch": 85.09433962264151, + "grad_norm": 5.822177410125732, + "learning_rate": 8.280922431865829e-06, + "loss": 0.0541, + "step": 4510 + }, + { + "epoch": 85.28301886792453, + "grad_norm": 6.9071149826049805, + "learning_rate": 8.176100628930818e-06, + "loss": 0.0382, + "step": 4520 + }, + { + "epoch": 85.47169811320755, + "grad_norm": 3.2203402519226074, + "learning_rate": 8.071278825995808e-06, + "loss": 0.0481, + "step": 4530 + }, + { + "epoch": 85.66037735849056, + "grad_norm": 2.526183843612671, + "learning_rate": 7.966457023060797e-06, + "loss": 0.0346, + "step": 4540 + }, + { + "epoch": 85.84905660377359, + "grad_norm": 1.8161990642547607, + "learning_rate": 7.861635220125786e-06, + "loss": 0.0421, + "step": 4550 + }, + { + "epoch": 86.0, + "eval_accuracy": 0.9621212121212122, + "eval_loss": 0.1483514904975891, + "eval_runtime": 21.3246, + "eval_samples_per_second": 136.181, + "eval_steps_per_second": 4.267, + "step": 4558 + }, + { + "epoch": 86.0377358490566, + "grad_norm": 4.167572021484375, + "learning_rate": 7.756813417190776e-06, + "loss": 0.0558, + "step": 4560 + }, + { + "epoch": 86.22641509433963, + "grad_norm": 7.080474853515625, + "learning_rate": 7.651991614255765e-06, + "loss": 0.0374, + "step": 4570 + }, + { + "epoch": 86.41509433962264, + "grad_norm": 3.921416759490967, + "learning_rate": 7.547169811320755e-06, + "loss": 0.0557, + "step": 4580 + }, + { + "epoch": 86.60377358490567, + "grad_norm": 2.932072639465332, + "learning_rate": 7.442348008385745e-06, + "loss": 0.0492, + "step": 4590 + }, + { + "epoch": 86.79245283018868, + "grad_norm": 3.091217041015625, + "learning_rate": 7.337526205450735e-06, + "loss": 0.0348, + "step": 4600 + }, + { + "epoch": 86.98113207547169, + "grad_norm": 3.1368813514709473, + "learning_rate": 7.2327044025157235e-06, + "loss": 0.0375, + "step": 4610 + }, + { + "epoch": 87.0, + "eval_accuracy": 0.9597107438016529, + "eval_loss": 0.1680881232023239, + "eval_runtime": 21.2539, + "eval_samples_per_second": 136.634, + "eval_steps_per_second": 4.282, + "step": 4611 + }, + { + "epoch": 87.16981132075472, + "grad_norm": 3.3242502212524414, + "learning_rate": 7.127882599580712e-06, + "loss": 0.0363, + "step": 4620 + }, + { + "epoch": 87.35849056603773, + "grad_norm": 5.762176990509033, + "learning_rate": 7.023060796645703e-06, + "loss": 0.0449, + "step": 4630 + }, + { + "epoch": 87.54716981132076, + "grad_norm": 2.077052593231201, + "learning_rate": 6.918238993710692e-06, + "loss": 0.0457, + "step": 4640 + }, + { + "epoch": 87.73584905660377, + "grad_norm": 2.2283737659454346, + "learning_rate": 6.813417190775681e-06, + "loss": 0.0397, + "step": 4650 + }, + { + "epoch": 87.9245283018868, + "grad_norm": 2.103003740310669, + "learning_rate": 6.708595387840672e-06, + "loss": 0.0376, + "step": 4660 + }, + { + "epoch": 88.0, + "eval_accuracy": 0.9631542699724518, + "eval_loss": 0.15125274658203125, + "eval_runtime": 21.3299, + "eval_samples_per_second": 136.147, + "eval_steps_per_second": 4.266, + "step": 4664 + }, + { + "epoch": 88.11320754716981, + "grad_norm": 9.418115615844727, + "learning_rate": 6.60377358490566e-06, + "loss": 0.0458, + "step": 4670 + }, + { + "epoch": 88.30188679245283, + "grad_norm": 2.476033926010132, + "learning_rate": 6.49895178197065e-06, + "loss": 0.0396, + "step": 4680 + }, + { + "epoch": 88.49056603773585, + "grad_norm": 2.3755970001220703, + "learning_rate": 6.3941299790356405e-06, + "loss": 0.0384, + "step": 4690 + }, + { + "epoch": 88.67924528301887, + "grad_norm": 2.3153653144836426, + "learning_rate": 6.289308176100629e-06, + "loss": 0.0357, + "step": 4700 + }, + { + "epoch": 88.86792452830188, + "grad_norm": 5.5892720222473145, + "learning_rate": 6.184486373165619e-06, + "loss": 0.0514, + "step": 4710 + }, + { + "epoch": 89.0, + "eval_accuracy": 0.9641873278236914, + "eval_loss": 0.148544043302536, + "eval_runtime": 21.3145, + "eval_samples_per_second": 136.245, + "eval_steps_per_second": 4.269, + "step": 4717 + }, + { + "epoch": 89.05660377358491, + "grad_norm": 3.5253398418426514, + "learning_rate": 6.079664570230608e-06, + "loss": 0.0286, + "step": 4720 + }, + { + "epoch": 89.24528301886792, + "grad_norm": 5.891650676727295, + "learning_rate": 5.974842767295598e-06, + "loss": 0.0607, + "step": 4730 + }, + { + "epoch": 89.43396226415095, + "grad_norm": 2.1808536052703857, + "learning_rate": 5.870020964360588e-06, + "loss": 0.0404, + "step": 4740 + }, + { + "epoch": 89.62264150943396, + "grad_norm": 6.388125896453857, + "learning_rate": 5.7651991614255766e-06, + "loss": 0.0469, + "step": 4750 + }, + { + "epoch": 89.81132075471699, + "grad_norm": 2.3668999671936035, + "learning_rate": 5.660377358490566e-06, + "loss": 0.0421, + "step": 4760 + }, + { + "epoch": 90.0, + "grad_norm": 3.1038739681243896, + "learning_rate": 5.555555555555556e-06, + "loss": 0.0598, + "step": 4770 + }, + { + "epoch": 90.0, + "eval_accuracy": 0.9638429752066116, + "eval_loss": 0.15414932370185852, + "eval_runtime": 21.2451, + "eval_samples_per_second": 136.69, + "eval_steps_per_second": 4.283, + "step": 4770 + }, + { + "epoch": 90.18867924528301, + "grad_norm": 3.297988176345825, + "learning_rate": 5.4507337526205454e-06, + "loss": 0.0397, + "step": 4780 + }, + { + "epoch": 90.37735849056604, + "grad_norm": 4.2701897621154785, + "learning_rate": 5.345911949685535e-06, + "loss": 0.0406, + "step": 4790 + }, + { + "epoch": 90.56603773584905, + "grad_norm": 3.7925121784210205, + "learning_rate": 5.241090146750524e-06, + "loss": 0.0458, + "step": 4800 + }, + { + "epoch": 90.75471698113208, + "grad_norm": 4.8097686767578125, + "learning_rate": 5.1362683438155135e-06, + "loss": 0.0555, + "step": 4810 + }, + { + "epoch": 90.94339622641509, + "grad_norm": 3.2213995456695557, + "learning_rate": 5.031446540880504e-06, + "loss": 0.0431, + "step": 4820 + }, + { + "epoch": 91.0, + "eval_accuracy": 0.9628099173553719, + "eval_loss": 0.14735093712806702, + "eval_runtime": 21.3261, + "eval_samples_per_second": 136.171, + "eval_steps_per_second": 4.267, + "step": 4823 + }, + { + "epoch": 91.13207547169812, + "grad_norm": 3.0545461177825928, + "learning_rate": 4.926624737945493e-06, + "loss": 0.0415, + "step": 4830 + }, + { + "epoch": 91.32075471698113, + "grad_norm": 3.0625436305999756, + "learning_rate": 4.821802935010482e-06, + "loss": 0.0454, + "step": 4840 + }, + { + "epoch": 91.50943396226415, + "grad_norm": 3.4635112285614014, + "learning_rate": 4.716981132075472e-06, + "loss": 0.0276, + "step": 4850 + }, + { + "epoch": 91.69811320754717, + "grad_norm": 2.14255428314209, + "learning_rate": 4.612159329140462e-06, + "loss": 0.0341, + "step": 4860 + }, + { + "epoch": 91.88679245283019, + "grad_norm": 5.496433258056641, + "learning_rate": 4.507337526205451e-06, + "loss": 0.0432, + "step": 4870 + }, + { + "epoch": 92.0, + "eval_accuracy": 0.9645316804407713, + "eval_loss": 0.14980562031269073, + "eval_runtime": 21.3199, + "eval_samples_per_second": 136.211, + "eval_steps_per_second": 4.268, + "step": 4876 + }, + { + "epoch": 92.0754716981132, + "grad_norm": 4.054795265197754, + "learning_rate": 4.40251572327044e-06, + "loss": 0.0529, + "step": 4880 + }, + { + "epoch": 92.26415094339623, + "grad_norm": 5.012380123138428, + "learning_rate": 4.29769392033543e-06, + "loss": 0.0492, + "step": 4890 + }, + { + "epoch": 92.45283018867924, + "grad_norm": 0.6688435673713684, + "learning_rate": 4.19287211740042e-06, + "loss": 0.052, + "step": 4900 + }, + { + "epoch": 92.64150943396227, + "grad_norm": 3.9398372173309326, + "learning_rate": 4.088050314465409e-06, + "loss": 0.0468, + "step": 4910 + }, + { + "epoch": 92.83018867924528, + "grad_norm": 1.439302682876587, + "learning_rate": 3.9832285115303985e-06, + "loss": 0.0391, + "step": 4920 + }, + { + "epoch": 93.0, + "eval_accuracy": 0.9645316804407713, + "eval_loss": 0.15064947307109833, + "eval_runtime": 21.2878, + "eval_samples_per_second": 136.416, + "eval_steps_per_second": 4.275, + "step": 4929 + }, + { + "epoch": 93.01886792452831, + "grad_norm": 2.215027093887329, + "learning_rate": 3.878406708595388e-06, + "loss": 0.0322, + "step": 4930 + }, + { + "epoch": 93.20754716981132, + "grad_norm": 4.157674312591553, + "learning_rate": 3.7735849056603773e-06, + "loss": 0.0481, + "step": 4940 + }, + { + "epoch": 93.39622641509433, + "grad_norm": 2.998509645462036, + "learning_rate": 3.6687631027253674e-06, + "loss": 0.0424, + "step": 4950 + }, + { + "epoch": 93.58490566037736, + "grad_norm": 2.701420783996582, + "learning_rate": 3.563941299790356e-06, + "loss": 0.0347, + "step": 4960 + }, + { + "epoch": 93.77358490566037, + "grad_norm": 2.793337821960449, + "learning_rate": 3.459119496855346e-06, + "loss": 0.0448, + "step": 4970 + }, + { + "epoch": 93.9622641509434, + "grad_norm": 2.3559460639953613, + "learning_rate": 3.354297693920336e-06, + "loss": 0.0408, + "step": 4980 + }, + { + "epoch": 94.0, + "eval_accuracy": 0.9641873278236914, + "eval_loss": 0.1462460607290268, + "eval_runtime": 21.3706, + "eval_samples_per_second": 135.888, + "eval_steps_per_second": 4.258, + "step": 4982 + }, + { + "epoch": 94.15094339622641, + "grad_norm": 1.9885700941085815, + "learning_rate": 3.249475890985325e-06, + "loss": 0.0373, + "step": 4990 + }, + { + "epoch": 94.33962264150944, + "grad_norm": 4.4361443519592285, + "learning_rate": 3.1446540880503146e-06, + "loss": 0.0598, + "step": 5000 + }, + { + "epoch": 94.52830188679245, + "grad_norm": 3.5327351093292236, + "learning_rate": 3.039832285115304e-06, + "loss": 0.0388, + "step": 5010 + }, + { + "epoch": 94.71698113207547, + "grad_norm": 0.833363950252533, + "learning_rate": 2.935010482180294e-06, + "loss": 0.0333, + "step": 5020 + }, + { + "epoch": 94.90566037735849, + "grad_norm": 3.4921486377716064, + "learning_rate": 2.830188679245283e-06, + "loss": 0.0335, + "step": 5030 + }, + { + "epoch": 95.0, + "eval_accuracy": 0.9652203856749312, + "eval_loss": 0.15087169408798218, + "eval_runtime": 21.4165, + "eval_samples_per_second": 135.597, + "eval_steps_per_second": 4.249, + "step": 5035 + }, + { + "epoch": 95.09433962264151, + "grad_norm": 4.499444484710693, + "learning_rate": 2.7253668763102727e-06, + "loss": 0.0394, + "step": 5040 + }, + { + "epoch": 95.28301886792453, + "grad_norm": 4.537922382354736, + "learning_rate": 2.620545073375262e-06, + "loss": 0.0428, + "step": 5050 + }, + { + "epoch": 95.47169811320755, + "grad_norm": 1.5033186674118042, + "learning_rate": 2.515723270440252e-06, + "loss": 0.0363, + "step": 5060 + }, + { + "epoch": 95.66037735849056, + "grad_norm": 0.8121969103813171, + "learning_rate": 2.410901467505241e-06, + "loss": 0.0298, + "step": 5070 + }, + { + "epoch": 95.84905660377359, + "grad_norm": 1.9728448390960693, + "learning_rate": 2.306079664570231e-06, + "loss": 0.0447, + "step": 5080 + }, + { + "epoch": 96.0, + "eval_accuracy": 0.9634986225895317, + "eval_loss": 0.15081696212291718, + "eval_runtime": 21.3522, + "eval_samples_per_second": 136.005, + "eval_steps_per_second": 4.262, + "step": 5088 + }, + { + "epoch": 96.0377358490566, + "grad_norm": 2.9549922943115234, + "learning_rate": 2.20125786163522e-06, + "loss": 0.0513, + "step": 5090 + }, + { + "epoch": 96.22641509433963, + "grad_norm": 4.062204837799072, + "learning_rate": 2.09643605870021e-06, + "loss": 0.0449, + "step": 5100 + }, + { + "epoch": 96.41509433962264, + "grad_norm": 2.071183443069458, + "learning_rate": 1.9916142557651992e-06, + "loss": 0.04, + "step": 5110 + }, + { + "epoch": 96.60377358490567, + "grad_norm": 0.9506546258926392, + "learning_rate": 1.8867924528301887e-06, + "loss": 0.0321, + "step": 5120 + }, + { + "epoch": 96.79245283018868, + "grad_norm": 2.33597469329834, + "learning_rate": 1.781970649895178e-06, + "loss": 0.024, + "step": 5130 + }, + { + "epoch": 96.98113207547169, + "grad_norm": 3.253387451171875, + "learning_rate": 1.677148846960168e-06, + "loss": 0.0477, + "step": 5140 + }, + { + "epoch": 97.0, + "eval_accuracy": 0.9634986225895317, + "eval_loss": 0.1510278433561325, + "eval_runtime": 21.3381, + "eval_samples_per_second": 136.095, + "eval_steps_per_second": 4.265, + "step": 5141 + }, + { + "epoch": 97.16981132075472, + "grad_norm": 2.858771324157715, + "learning_rate": 1.5723270440251573e-06, + "loss": 0.0386, + "step": 5150 + }, + { + "epoch": 97.35849056603773, + "grad_norm": 6.633811950683594, + "learning_rate": 1.467505241090147e-06, + "loss": 0.0419, + "step": 5160 + }, + { + "epoch": 97.54716981132076, + "grad_norm": 3.115361452102661, + "learning_rate": 1.3626834381551364e-06, + "loss": 0.0291, + "step": 5170 + }, + { + "epoch": 97.73584905660377, + "grad_norm": 3.6518330574035645, + "learning_rate": 1.257861635220126e-06, + "loss": 0.0372, + "step": 5180 + }, + { + "epoch": 97.9245283018868, + "grad_norm": 3.8466641902923584, + "learning_rate": 1.1530398322851154e-06, + "loss": 0.0504, + "step": 5190 + }, + { + "epoch": 98.0, + "eval_accuracy": 0.9641873278236914, + "eval_loss": 0.15101788938045502, + "eval_runtime": 21.5013, + "eval_samples_per_second": 135.061, + "eval_steps_per_second": 4.232, + "step": 5194 + }, + { + "epoch": 98.11320754716981, + "grad_norm": 4.916390895843506, + "learning_rate": 1.048218029350105e-06, + "loss": 0.0474, + "step": 5200 + }, + { + "epoch": 98.30188679245283, + "grad_norm": 3.228938102722168, + "learning_rate": 9.433962264150943e-07, + "loss": 0.0401, + "step": 5210 + }, + { + "epoch": 98.49056603773585, + "grad_norm": 1.5599745512008667, + "learning_rate": 8.38574423480084e-07, + "loss": 0.0439, + "step": 5220 + }, + { + "epoch": 98.67924528301887, + "grad_norm": 2.1779186725616455, + "learning_rate": 7.337526205450735e-07, + "loss": 0.0411, + "step": 5230 + }, + { + "epoch": 98.86792452830188, + "grad_norm": 3.2559657096862793, + "learning_rate": 6.28930817610063e-07, + "loss": 0.0406, + "step": 5240 + }, + { + "epoch": 99.0, + "eval_accuracy": 0.9648760330578512, + "eval_loss": 0.14794644713401794, + "eval_runtime": 21.5282, + "eval_samples_per_second": 134.893, + "eval_steps_per_second": 4.227, + "step": 5247 + }, + { + "epoch": 99.05660377358491, + "grad_norm": 2.303928852081299, + "learning_rate": 5.241090146750525e-07, + "loss": 0.0461, + "step": 5250 + }, + { + "epoch": 99.24528301886792, + "grad_norm": 5.052129745483398, + "learning_rate": 4.19287211740042e-07, + "loss": 0.0398, + "step": 5260 + }, + { + "epoch": 99.43396226415095, + "grad_norm": 5.838912010192871, + "learning_rate": 3.144654088050315e-07, + "loss": 0.0436, + "step": 5270 + }, + { + "epoch": 99.62264150943396, + "grad_norm": 1.3886405229568481, + "learning_rate": 2.09643605870021e-07, + "loss": 0.0281, + "step": 5280 + }, + { + "epoch": 99.81132075471699, + "grad_norm": 2.893517017364502, + "learning_rate": 1.048218029350105e-07, + "loss": 0.0495, + "step": 5290 + }, + { + "epoch": 100.0, + "grad_norm": 3.087257146835327, + "learning_rate": 0.0, + "loss": 0.0343, + "step": 5300 + }, + { + "epoch": 100.0, + "eval_accuracy": 0.9645316804407713, + "eval_loss": 0.1479804962873459, + "eval_runtime": 21.646, + "eval_samples_per_second": 134.159, + "eval_steps_per_second": 4.204, + "step": 5300 + }, + { + "epoch": 100.0, + "step": 5300, + "total_flos": 1.6838724615023002e+19, + "train_loss": 0.14425156983845638, + "train_runtime": 35971.5646, + "train_samples_per_second": 18.829, + "train_steps_per_second": 0.147 } ], "logging_steps": 10, - "max_steps": 530, + "max_steps": 5300, "num_input_tokens_seen": 0, - "num_train_epochs": 10, + "num_train_epochs": 100, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { @@ -496,7 +4645,7 @@ "attributes": {} } }, - "total_flos": 1.6836842977571635e+18, + "total_flos": 1.6838724615023002e+19, "train_batch_size": 32, "trial_name": null, "trial_params": null