diff --git "a/trainer_state.json" "b/trainer_state.json" --- "a/trainer_state.json" +++ "b/trainer_state.json" @@ -1,2243 +1,2943 @@ { - "best_metric": 0.825, - "best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v8-n0-m1/checkpoint-1020", - "epoch": 49.0004, + "best_metric": 0.6875, + "best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v8-n0-m1/checkpoint-560", + "epoch": 49.02, "eval_steps": 500, - "global_step": 2500, + "global_step": 3500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { - "epoch": 0.004, - "grad_norm": 3.163785934448242, - "learning_rate": 4.0000000000000003e-07, - "loss": 0.6803, + "epoch": 0.002857142857142857, + "grad_norm": 1.2938215732574463, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.7187, "step": 10 }, { - "epoch": 0.008, - "grad_norm": 1.8414620161056519, - "learning_rate": 8.000000000000001e-07, - "loss": 0.6811, + "epoch": 0.005714285714285714, + "grad_norm": 2.850792407989502, + "learning_rate": 5.714285714285715e-07, + "loss": 0.6988, "step": 20 }, { - "epoch": 0.012, - "grad_norm": 2.732853412628174, - "learning_rate": 1.2000000000000002e-06, - "loss": 0.6724, + "epoch": 0.008571428571428572, + "grad_norm": 2.5619375705718994, + "learning_rate": 8.571428571428572e-07, + "loss": 0.6938, "step": 30 }, { - "epoch": 0.016, - "grad_norm": 2.302961826324463, - "learning_rate": 1.6000000000000001e-06, - "loss": 0.6625, + "epoch": 0.011428571428571429, + "grad_norm": 1.4017411470413208, + "learning_rate": 1.142857142857143e-06, + "loss": 0.68, "step": 40 }, { - "epoch": 0.02, - "grad_norm": 2.289227247238159, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.6826, + "epoch": 0.014285714285714285, + "grad_norm": 3.408830404281616, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.6665, "step": 50 }, { - "epoch": 0.0204, - "eval_accuracy": 0.7625, - "eval_loss": 0.6156964302062988, - "eval_runtime": 16.4829, - "eval_samples_per_second": 4.854, - "eval_steps_per_second": 1.213, - "step": 51 + "epoch": 0.017142857142857144, + "grad_norm": 2.4238297939300537, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.6997, + "step": 60 }, { - "epoch": 1.0036, - "grad_norm": 2.539074420928955, - "learning_rate": 2.4000000000000003e-06, - "loss": 0.653, - "step": 60 + "epoch": 0.02, + "grad_norm": 4.75000524520874, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.6769, + "step": 70 }, { - "epoch": 1.0076, - "grad_norm": 4.338445663452148, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.6605, + "epoch": 0.02, + "eval_accuracy": 0.59375, + "eval_loss": 0.6814303398132324, + "eval_runtime": 6.5003, + "eval_samples_per_second": 4.923, + "eval_steps_per_second": 1.231, "step": 70 }, { - "epoch": 1.0116, - "grad_norm": 4.448472023010254, - "learning_rate": 3.2000000000000003e-06, - "loss": 0.6647, + "epoch": 1.002857142857143, + "grad_norm": 5.1099066734313965, + "learning_rate": 2.285714285714286e-06, + "loss": 0.6718, "step": 80 }, { - "epoch": 1.0156, - "grad_norm": 8.590893745422363, - "learning_rate": 3.6000000000000003e-06, - "loss": 0.665, + "epoch": 1.0057142857142858, + "grad_norm": 2.6992363929748535, + "learning_rate": 2.571428571428571e-06, + "loss": 0.6841, "step": 90 }, { - "epoch": 1.0196, - "grad_norm": 4.512553691864014, - "learning_rate": 4.000000000000001e-06, - "loss": 0.6549, + "epoch": 1.0085714285714287, + "grad_norm": 3.9713478088378906, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.6237, "step": 100 }, { - "epoch": 1.0204, - "eval_accuracy": 0.7625, - "eval_loss": 0.5930413603782654, - "eval_runtime": 15.1042, - "eval_samples_per_second": 5.297, - "eval_steps_per_second": 1.324, - "step": 102 - }, - { - "epoch": 2.0032, - "grad_norm": 9.520898818969727, - "learning_rate": 4.4e-06, - "loss": 0.6033, + "epoch": 1.0114285714285713, + "grad_norm": 9.976508140563965, + "learning_rate": 3.142857142857143e-06, + "loss": 0.7209, "step": 110 }, { - "epoch": 2.0072, - "grad_norm": 7.96126651763916, - "learning_rate": 4.800000000000001e-06, - "loss": 0.5796, + "epoch": 1.0142857142857142, + "grad_norm": 5.838253974914551, + "learning_rate": 3.428571428571429e-06, + "loss": 0.5925, "step": 120 }, { - "epoch": 2.0112, - "grad_norm": 10.471980094909668, - "learning_rate": 5.2e-06, - "loss": 0.6731, + "epoch": 1.0171428571428571, + "grad_norm": 6.639209747314453, + "learning_rate": 3.7142857142857146e-06, + "loss": 0.5364, "step": 130 }, { - "epoch": 2.0152, - "grad_norm": 17.355741500854492, - "learning_rate": 5.600000000000001e-06, - "loss": 0.6853, + "epoch": 1.02, + "grad_norm": 6.5769548416137695, + "learning_rate": 4.000000000000001e-06, + "loss": 0.7223, "step": 140 }, { - "epoch": 2.0192, - "grad_norm": 7.118047714233398, - "learning_rate": 6e-06, - "loss": 0.6486, - "step": 150 + "epoch": 1.02, + "eval_accuracy": 0.59375, + "eval_loss": 0.6952990889549255, + "eval_runtime": 6.0984, + "eval_samples_per_second": 5.247, + "eval_steps_per_second": 1.312, + "step": 140 }, { - "epoch": 2.0204, - "eval_accuracy": 0.5125, - "eval_loss": 0.6796671152114868, - "eval_runtime": 15.1483, - "eval_samples_per_second": 5.281, - "eval_steps_per_second": 1.32, - "step": 153 + "epoch": 2.0028571428571427, + "grad_norm": 4.371922969818115, + "learning_rate": 4.2857142857142855e-06, + "loss": 0.6335, + "step": 150 }, { - "epoch": 3.0028, - "grad_norm": 9.448248863220215, - "learning_rate": 6.4000000000000006e-06, - "loss": 0.627, + "epoch": 2.005714285714286, + "grad_norm": 7.266854763031006, + "learning_rate": 4.571428571428572e-06, + "loss": 0.6511, "step": 160 }, { - "epoch": 3.0068, - "grad_norm": 10.044306755065918, - "learning_rate": 6.800000000000001e-06, - "loss": 0.5988, + "epoch": 2.0085714285714285, + "grad_norm": 5.1674323081970215, + "learning_rate": 4.857142857142858e-06, + "loss": 0.6216, "step": 170 }, { - "epoch": 3.0108, - "grad_norm": 11.673822402954102, - "learning_rate": 7.2000000000000005e-06, - "loss": 0.5388, + "epoch": 2.0114285714285716, + "grad_norm": 7.353259563446045, + "learning_rate": 5.142857142857142e-06, + "loss": 0.6537, "step": 180 }, { - "epoch": 3.0148, - "grad_norm": 13.179671287536621, - "learning_rate": 7.600000000000001e-06, - "loss": 0.4922, + "epoch": 2.0142857142857142, + "grad_norm": 4.876795291900635, + "learning_rate": 5.428571428571429e-06, + "loss": 0.6154, "step": 190 }, { - "epoch": 3.0188, - "grad_norm": 17.30994987487793, - "learning_rate": 8.000000000000001e-06, - "loss": 0.5595, + "epoch": 2.0171428571428573, + "grad_norm": 6.403975486755371, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.6548, "step": 200 }, { - "epoch": 3.0204, - "eval_accuracy": 0.6625, - "eval_loss": 0.49017828702926636, - "eval_runtime": 14.3197, - "eval_samples_per_second": 5.587, - "eval_steps_per_second": 1.397, - "step": 204 + "epoch": 2.02, + "grad_norm": 8.78445053100586, + "learning_rate": 6e-06, + "loss": 0.6628, + "step": 210 }, { - "epoch": 4.0024, - "grad_norm": 12.384488105773926, - "learning_rate": 8.400000000000001e-06, - "loss": 0.5512, + "epoch": 2.02, + "eval_accuracy": 0.625, + "eval_loss": 0.6335489749908447, + "eval_runtime": 6.2455, + "eval_samples_per_second": 5.124, + "eval_steps_per_second": 1.281, "step": 210 }, { - "epoch": 4.0064, - "grad_norm": 16.830211639404297, - "learning_rate": 8.8e-06, - "loss": 0.5532, + "epoch": 3.0028571428571427, + "grad_norm": 8.260648727416992, + "learning_rate": 6.285714285714286e-06, + "loss": 0.5933, "step": 220 }, { - "epoch": 4.0104, - "grad_norm": 25.703161239624023, - "learning_rate": 9.200000000000002e-06, - "loss": 0.4503, + "epoch": 3.005714285714286, + "grad_norm": 9.865010261535645, + "learning_rate": 6.571428571428572e-06, + "loss": 0.6037, "step": 230 }, { - "epoch": 4.0144, - "grad_norm": 31.191774368286133, - "learning_rate": 9.600000000000001e-06, - "loss": 0.4281, + "epoch": 3.0085714285714285, + "grad_norm": 15.651702880859375, + "learning_rate": 6.857142857142858e-06, + "loss": 0.5401, "step": 240 }, { - "epoch": 4.0184, - "grad_norm": 7.301267147064209, - "learning_rate": 1e-05, - "loss": 0.5586, + "epoch": 3.0114285714285716, + "grad_norm": 14.37714958190918, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.7525, "step": 250 }, { - "epoch": 4.0204, - "eval_accuracy": 0.475, - "eval_loss": 0.8194777369499207, - "eval_runtime": 15.5636, - "eval_samples_per_second": 5.14, - "eval_steps_per_second": 1.285, - "step": 255 - }, - { - "epoch": 5.002, - "grad_norm": 6.1059746742248535, - "learning_rate": 9.955555555555556e-06, - "loss": 0.4667, + "epoch": 3.0142857142857142, + "grad_norm": 9.037586212158203, + "learning_rate": 7.428571428571429e-06, + "loss": 0.546, "step": 260 }, { - "epoch": 5.006, - "grad_norm": 11.250345230102539, - "learning_rate": 9.911111111111113e-06, - "loss": 0.3866, + "epoch": 3.0171428571428573, + "grad_norm": 9.574410438537598, + "learning_rate": 7.714285714285716e-06, + "loss": 0.6234, "step": 270 }, { - "epoch": 5.01, - "grad_norm": 11.811776161193848, - "learning_rate": 9.866666666666668e-06, - "loss": 0.3093, + "epoch": 3.02, + "grad_norm": 6.128604412078857, + "learning_rate": 8.000000000000001e-06, + "loss": 0.5096, "step": 280 }, { - "epoch": 5.014, - "grad_norm": 2.898249387741089, - "learning_rate": 9.822222222222223e-06, - "loss": 0.4655, - "step": 290 + "epoch": 3.02, + "eval_accuracy": 0.625, + "eval_loss": 0.6513711214065552, + "eval_runtime": 5.9941, + "eval_samples_per_second": 5.339, + "eval_steps_per_second": 1.335, + "step": 280 }, { - "epoch": 5.018, - "grad_norm": 39.88842010498047, - "learning_rate": 9.777777777777779e-06, - "loss": 0.4565, - "step": 300 + "epoch": 4.002857142857143, + "grad_norm": 10.744050025939941, + "learning_rate": 8.285714285714287e-06, + "loss": 0.5184, + "step": 290 }, { - "epoch": 5.0204, - "eval_accuracy": 0.75, - "eval_loss": 0.5872394442558289, - "eval_runtime": 15.5445, - "eval_samples_per_second": 5.147, - "eval_steps_per_second": 1.287, - "step": 306 + "epoch": 4.005714285714285, + "grad_norm": 6.656492233276367, + "learning_rate": 8.571428571428571e-06, + "loss": 0.5196, + "step": 300 }, { - "epoch": 6.0016, - "grad_norm": 18.96799659729004, - "learning_rate": 9.733333333333334e-06, - "loss": 0.6334, + "epoch": 4.008571428571429, + "grad_norm": 11.752025604248047, + "learning_rate": 8.857142857142858e-06, + "loss": 0.628, "step": 310 }, { - "epoch": 6.0056, - "grad_norm": 26.40775489807129, - "learning_rate": 9.688888888888889e-06, - "loss": 0.5505, + "epoch": 4.011428571428572, + "grad_norm": 13.207033157348633, + "learning_rate": 9.142857142857144e-06, + "loss": 0.5296, "step": 320 }, { - "epoch": 6.0096, - "grad_norm": 23.987438201904297, - "learning_rate": 9.644444444444444e-06, - "loss": 0.4702, + "epoch": 4.014285714285714, + "grad_norm": 15.192891120910645, + "learning_rate": 9.42857142857143e-06, + "loss": 0.5027, "step": 330 }, { - "epoch": 6.0136, - "grad_norm": 11.874549865722656, - "learning_rate": 9.600000000000001e-06, - "loss": 0.5567, + "epoch": 4.017142857142857, + "grad_norm": 7.084335803985596, + "learning_rate": 9.714285714285715e-06, + "loss": 0.5569, "step": 340 }, { - "epoch": 6.0176, - "grad_norm": 1.870789647102356, - "learning_rate": 9.555555555555556e-06, - "loss": 0.3697, + "epoch": 4.02, + "grad_norm": 16.267013549804688, + "learning_rate": 1e-05, + "loss": 0.4739, "step": 350 }, { - "epoch": 6.0204, - "eval_accuracy": 0.775, - "eval_loss": 0.5016795992851257, - "eval_runtime": 15.1939, - "eval_samples_per_second": 5.265, - "eval_steps_per_second": 1.316, - "step": 357 + "epoch": 4.02, + "eval_accuracy": 0.65625, + "eval_loss": 0.635766327381134, + "eval_runtime": 6.5473, + "eval_samples_per_second": 4.888, + "eval_steps_per_second": 1.222, + "step": 350 }, { - "epoch": 7.0012, - "grad_norm": 1.9288091659545898, - "learning_rate": 9.511111111111112e-06, - "loss": 0.3384, + "epoch": 5.002857142857143, + "grad_norm": 54.40953826904297, + "learning_rate": 9.968253968253969e-06, + "loss": 0.5697, "step": 360 }, { - "epoch": 7.0052, - "grad_norm": 47.43910598754883, - "learning_rate": 9.466666666666667e-06, - "loss": 0.4776, + "epoch": 5.005714285714285, + "grad_norm": 5.579638481140137, + "learning_rate": 9.936507936507937e-06, + "loss": 0.6431, "step": 370 }, { - "epoch": 7.0092, - "grad_norm": 24.75432586669922, - "learning_rate": 9.422222222222222e-06, - "loss": 0.3272, + "epoch": 5.008571428571429, + "grad_norm": 6.157373428344727, + "learning_rate": 9.904761904761906e-06, + "loss": 0.507, "step": 380 }, { - "epoch": 7.0132, - "grad_norm": 6.029924392700195, - "learning_rate": 9.377777777777779e-06, - "loss": 0.6358, + "epoch": 5.011428571428572, + "grad_norm": 10.279753684997559, + "learning_rate": 9.873015873015874e-06, + "loss": 0.5867, "step": 390 }, { - "epoch": 7.0172, - "grad_norm": 36.66233825683594, - "learning_rate": 9.333333333333334e-06, - "loss": 0.6201, + "epoch": 5.014285714285714, + "grad_norm": 9.118820190429688, + "learning_rate": 9.841269841269842e-06, + "loss": 0.6229, "step": 400 }, { - "epoch": 7.0204, - "eval_accuracy": 0.7, - "eval_loss": 0.6555034518241882, - "eval_runtime": 15.1733, - "eval_samples_per_second": 5.272, - "eval_steps_per_second": 1.318, - "step": 408 + "epoch": 5.017142857142857, + "grad_norm": 11.799158096313477, + "learning_rate": 9.80952380952381e-06, + "loss": 0.4608, + "step": 410 }, { - "epoch": 8.0008, - "grad_norm": 13.244248390197754, - "learning_rate": 9.28888888888889e-06, - "loss": 0.2307, - "step": 410 + "epoch": 5.02, + "grad_norm": 24.01024055480957, + "learning_rate": 9.777777777777779e-06, + "loss": 0.4554, + "step": 420 }, { - "epoch": 8.0048, - "grad_norm": 0.1510910540819168, - "learning_rate": 9.244444444444445e-06, - "loss": 0.2198, + "epoch": 5.02, + "eval_accuracy": 0.65625, + "eval_loss": 0.627189040184021, + "eval_runtime": 6.0832, + "eval_samples_per_second": 5.26, + "eval_steps_per_second": 1.315, "step": 420 }, { - "epoch": 8.0088, - "grad_norm": 48.222145080566406, - "learning_rate": 9.200000000000002e-06, - "loss": 0.1961, + "epoch": 6.002857142857143, + "grad_norm": 10.022480964660645, + "learning_rate": 9.746031746031747e-06, + "loss": 0.6578, "step": 430 }, { - "epoch": 8.0128, - "grad_norm": 12.182064056396484, - "learning_rate": 9.155555555555557e-06, - "loss": 0.4113, + "epoch": 6.005714285714285, + "grad_norm": 7.889081954956055, + "learning_rate": 9.714285714285715e-06, + "loss": 0.4533, "step": 440 }, { - "epoch": 8.0168, - "grad_norm": 90.49524688720703, - "learning_rate": 9.111111111111112e-06, - "loss": 0.4333, + "epoch": 6.008571428571429, + "grad_norm": 6.938020706176758, + "learning_rate": 9.682539682539683e-06, + "loss": 0.4677, "step": 450 }, { - "epoch": 8.0204, - "eval_accuracy": 0.6125, - "eval_loss": 1.2277292013168335, - "eval_runtime": 15.8873, - "eval_samples_per_second": 5.035, - "eval_steps_per_second": 1.259, - "step": 459 - }, - { - "epoch": 9.0004, - "grad_norm": 8.979384422302246, - "learning_rate": 9.066666666666667e-06, - "loss": 0.2832, + "epoch": 6.011428571428572, + "grad_norm": 1.124375343322754, + "learning_rate": 9.650793650793652e-06, + "loss": 0.3756, "step": 460 }, { - "epoch": 9.0044, - "grad_norm": 124.57083892822266, - "learning_rate": 9.022222222222223e-06, - "loss": 0.4698, + "epoch": 6.014285714285714, + "grad_norm": 42.59530258178711, + "learning_rate": 9.61904761904762e-06, + "loss": 1.2108, "step": 470 }, { - "epoch": 9.0084, - "grad_norm": 1.4499937295913696, - "learning_rate": 8.977777777777778e-06, - "loss": 0.622, + "epoch": 6.017142857142857, + "grad_norm": 10.104456901550293, + "learning_rate": 9.587301587301588e-06, + "loss": 0.612, "step": 480 }, { - "epoch": 9.0124, - "grad_norm": 204.4443359375, - "learning_rate": 8.933333333333333e-06, - "loss": 0.4289, + "epoch": 6.02, + "grad_norm": 17.34273910522461, + "learning_rate": 9.555555555555556e-06, + "loss": 0.4818, "step": 490 }, { - "epoch": 9.0164, - "grad_norm": 13.056779861450195, - "learning_rate": 8.888888888888888e-06, - "loss": 0.3958, - "step": 500 + "epoch": 6.02, + "eval_accuracy": 0.59375, + "eval_loss": 0.7726800441741943, + "eval_runtime": 6.0971, + "eval_samples_per_second": 5.248, + "eval_steps_per_second": 1.312, + "step": 490 }, { - "epoch": 9.0204, - "grad_norm": 3.7403643131256104, - "learning_rate": 8.844444444444445e-06, - "loss": 0.2148, - "step": 510 + "epoch": 7.002857142857143, + "grad_norm": 14.020530700683594, + "learning_rate": 9.523809523809525e-06, + "loss": 0.4584, + "step": 500 }, { - "epoch": 9.0204, - "eval_accuracy": 0.7625, - "eval_loss": 0.8114517331123352, - "eval_runtime": 14.7863, - "eval_samples_per_second": 5.41, - "eval_steps_per_second": 1.353, + "epoch": 7.005714285714285, + "grad_norm": 28.93553924560547, + "learning_rate": 9.492063492063493e-06, + "loss": 0.3662, "step": 510 }, { - "epoch": 10.004, - "grad_norm": 0.3079785704612732, - "learning_rate": 8.8e-06, - "loss": 0.4126, + "epoch": 7.008571428571429, + "grad_norm": 12.310239791870117, + "learning_rate": 9.460317460317461e-06, + "loss": 0.5758, "step": 520 }, { - "epoch": 10.008, - "grad_norm": 10.519120216369629, - "learning_rate": 8.755555555555556e-06, - "loss": 0.2208, + "epoch": 7.011428571428572, + "grad_norm": 4.042015552520752, + "learning_rate": 9.42857142857143e-06, + "loss": 0.4398, "step": 530 }, { - "epoch": 10.012, - "grad_norm": 37.70737075805664, - "learning_rate": 8.711111111111111e-06, - "loss": 0.4088, + "epoch": 7.014285714285714, + "grad_norm": 39.30393600463867, + "learning_rate": 9.396825396825398e-06, + "loss": 0.3887, "step": 540 }, { - "epoch": 10.016, - "grad_norm": 27.64992332458496, - "learning_rate": 8.666666666666668e-06, - "loss": 0.6548, + "epoch": 7.017142857142857, + "grad_norm": 10.91474723815918, + "learning_rate": 9.365079365079366e-06, + "loss": 0.3307, "step": 550 }, { - "epoch": 10.02, - "grad_norm": 1.9343359470367432, - "learning_rate": 8.622222222222223e-06, - "loss": 0.9458, + "epoch": 7.02, + "grad_norm": 27.856639862060547, + "learning_rate": 9.333333333333334e-06, + "loss": 0.4129, "step": 560 }, { - "epoch": 10.0204, - "eval_accuracy": 0.6625, - "eval_loss": 0.9872623682022095, - "eval_runtime": 14.8048, - "eval_samples_per_second": 5.404, - "eval_steps_per_second": 1.351, - "step": 561 + "epoch": 7.02, + "eval_accuracy": 0.6875, + "eval_loss": 0.8221972584724426, + "eval_runtime": 6.3706, + "eval_samples_per_second": 5.023, + "eval_steps_per_second": 1.256, + "step": 560 }, { - "epoch": 11.0036, - "grad_norm": 127.26286315917969, - "learning_rate": 8.577777777777778e-06, - "loss": 0.4077, + "epoch": 8.002857142857144, + "grad_norm": 25.90720558166504, + "learning_rate": 9.301587301587303e-06, + "loss": 0.3737, "step": 570 }, { - "epoch": 11.0076, - "grad_norm": 48.36285400390625, - "learning_rate": 8.533333333333335e-06, - "loss": 0.315, + "epoch": 8.005714285714285, + "grad_norm": 18.878738403320312, + "learning_rate": 9.26984126984127e-06, + "loss": 0.3757, "step": 580 }, { - "epoch": 11.0116, - "grad_norm": 1.3444968461990356, - "learning_rate": 8.48888888888889e-06, - "loss": 0.3529, + "epoch": 8.008571428571429, + "grad_norm": 16.471302032470703, + "learning_rate": 9.238095238095239e-06, + "loss": 0.466, "step": 590 }, { - "epoch": 11.0156, - "grad_norm": 1.425925374031067, - "learning_rate": 8.444444444444446e-06, - "loss": 0.2537, + "epoch": 8.01142857142857, + "grad_norm": 2.8103766441345215, + "learning_rate": 9.206349206349207e-06, + "loss": 0.45, "step": 600 }, { - "epoch": 11.0196, - "grad_norm": 0.11205915361642838, - "learning_rate": 8.400000000000001e-06, - "loss": 0.0651, + "epoch": 8.014285714285714, + "grad_norm": 35.921138763427734, + "learning_rate": 9.174603174603176e-06, + "loss": 0.6889, "step": 610 }, { - "epoch": 11.0204, - "eval_accuracy": 0.7625, - "eval_loss": 1.0840221643447876, - "eval_runtime": 16.1692, - "eval_samples_per_second": 4.948, - "eval_steps_per_second": 1.237, - "step": 612 + "epoch": 8.017142857142858, + "grad_norm": 1.5162020921707153, + "learning_rate": 9.142857142857144e-06, + "loss": 0.3012, + "step": 620 }, { - "epoch": 12.0032, - "grad_norm": 1.6690884828567505, - "learning_rate": 8.355555555555556e-06, - "loss": 0.0219, - "step": 620 + "epoch": 8.02, + "grad_norm": 31.143695831298828, + "learning_rate": 9.111111111111112e-06, + "loss": 0.6301, + "step": 630 }, { - "epoch": 12.0072, - "grad_norm": 0.04994206875562668, - "learning_rate": 8.311111111111111e-06, - "loss": 0.3325, + "epoch": 8.02, + "eval_accuracy": 0.625, + "eval_loss": 0.8040816783905029, + "eval_runtime": 5.9175, + "eval_samples_per_second": 5.408, + "eval_steps_per_second": 1.352, "step": 630 }, { - "epoch": 12.0112, - "grad_norm": 1.1334573030471802, - "learning_rate": 8.266666666666667e-06, - "loss": 0.1766, + "epoch": 9.002857142857144, + "grad_norm": 74.01934814453125, + "learning_rate": 9.07936507936508e-06, + "loss": 0.605, "step": 640 }, { - "epoch": 12.0152, - "grad_norm": 84.09977722167969, - "learning_rate": 8.222222222222222e-06, - "loss": 0.3474, + "epoch": 9.005714285714285, + "grad_norm": 46.429351806640625, + "learning_rate": 9.047619047619049e-06, + "loss": 0.3164, "step": 650 }, { - "epoch": 12.0192, - "grad_norm": 0.11686452478170395, - "learning_rate": 8.177777777777779e-06, - "loss": 0.5756, + "epoch": 9.008571428571429, + "grad_norm": 43.667686462402344, + "learning_rate": 9.015873015873017e-06, + "loss": 0.2476, "step": 660 }, { - "epoch": 12.0204, - "eval_accuracy": 0.8, - "eval_loss": 1.0489223003387451, - "eval_runtime": 14.837, - "eval_samples_per_second": 5.392, - "eval_steps_per_second": 1.348, - "step": 663 - }, - { - "epoch": 13.0028, - "grad_norm": 0.4879148304462433, - "learning_rate": 8.133333333333334e-06, - "loss": 0.1627, + "epoch": 9.01142857142857, + "grad_norm": 16.798654556274414, + "learning_rate": 8.984126984126985e-06, + "loss": 0.4533, "step": 670 }, { - "epoch": 13.0068, - "grad_norm": 10.051454544067383, - "learning_rate": 8.08888888888889e-06, - "loss": 0.1414, + "epoch": 9.014285714285714, + "grad_norm": 21.254426956176758, + "learning_rate": 8.952380952380953e-06, + "loss": 0.3874, "step": 680 }, { - "epoch": 13.0108, - "grad_norm": 317.5495910644531, - "learning_rate": 8.044444444444444e-06, - "loss": 0.4045, + "epoch": 9.017142857142858, + "grad_norm": 14.007109642028809, + "learning_rate": 8.920634920634922e-06, + "loss": 0.3162, "step": 690 }, { - "epoch": 13.0148, - "grad_norm": 12.827858924865723, - "learning_rate": 8.000000000000001e-06, - "loss": 0.2213, + "epoch": 9.02, + "grad_norm": 1.4537030458450317, + "learning_rate": 8.888888888888888e-06, + "loss": 0.3809, "step": 700 }, { - "epoch": 13.0188, - "grad_norm": 7.5428338050842285, - "learning_rate": 7.955555555555557e-06, - "loss": 0.354, - "step": 710 + "epoch": 9.02, + "eval_accuracy": 0.625, + "eval_loss": 0.8720921874046326, + "eval_runtime": 5.9604, + "eval_samples_per_second": 5.369, + "eval_steps_per_second": 1.342, + "step": 700 }, { - "epoch": 13.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.1601030826568604, - "eval_runtime": 14.793, - "eval_samples_per_second": 5.408, - "eval_steps_per_second": 1.352, - "step": 714 + "epoch": 10.002857142857144, + "grad_norm": 8.765763282775879, + "learning_rate": 8.857142857142858e-06, + "loss": 0.2273, + "step": 710 }, { - "epoch": 14.0024, - "grad_norm": 1.0718170404434204, - "learning_rate": 7.911111111111112e-06, - "loss": 0.1341, + "epoch": 10.005714285714285, + "grad_norm": 0.38795459270477295, + "learning_rate": 8.825396825396827e-06, + "loss": 0.1653, "step": 720 }, { - "epoch": 14.0064, - "grad_norm": 204.27011108398438, - "learning_rate": 7.866666666666667e-06, - "loss": 0.2009, + "epoch": 10.008571428571429, + "grad_norm": 47.69906234741211, + "learning_rate": 8.793650793650795e-06, + "loss": 0.7768, "step": 730 }, { - "epoch": 14.0104, - "grad_norm": 623.6522827148438, - "learning_rate": 7.822222222222224e-06, - "loss": 0.2302, + "epoch": 10.01142857142857, + "grad_norm": 42.63213348388672, + "learning_rate": 8.761904761904763e-06, + "loss": 0.4008, "step": 740 }, { - "epoch": 14.0144, - "grad_norm": 53.07473373413086, - "learning_rate": 7.77777777777778e-06, - "loss": 0.4521, + "epoch": 10.014285714285714, + "grad_norm": 103.43374633789062, + "learning_rate": 8.730158730158731e-06, + "loss": 0.4466, "step": 750 }, { - "epoch": 14.0184, - "grad_norm": 0.032519057393074036, - "learning_rate": 7.733333333333334e-06, - "loss": 0.2888, + "epoch": 10.017142857142858, + "grad_norm": 0.7397039532661438, + "learning_rate": 8.6984126984127e-06, + "loss": 0.1774, "step": 760 }, { - "epoch": 14.0204, - "eval_accuracy": 0.625, - "eval_loss": 1.8143768310546875, - "eval_runtime": 15.8798, - "eval_samples_per_second": 5.038, - "eval_steps_per_second": 1.259, - "step": 765 + "epoch": 10.02, + "grad_norm": 62.768890380859375, + "learning_rate": 8.666666666666668e-06, + "loss": 0.8071, + "step": 770 }, { - "epoch": 15.002, - "grad_norm": 457.6705322265625, - "learning_rate": 7.68888888888889e-06, - "loss": 0.2657, + "epoch": 10.02, + "eval_accuracy": 0.625, + "eval_loss": 1.1091701984405518, + "eval_runtime": 5.949, + "eval_samples_per_second": 5.379, + "eval_steps_per_second": 1.345, "step": 770 }, { - "epoch": 15.006, - "grad_norm": 431.8014221191406, - "learning_rate": 7.644444444444445e-06, - "loss": 0.0887, + "epoch": 11.002857142857144, + "grad_norm": 25.553237915039062, + "learning_rate": 8.634920634920636e-06, + "loss": 0.2667, "step": 780 }, { - "epoch": 15.01, - "grad_norm": 0.013111516833305359, - "learning_rate": 7.600000000000001e-06, - "loss": 0.0183, + "epoch": 11.005714285714285, + "grad_norm": 17.7698974609375, + "learning_rate": 8.603174603174604e-06, + "loss": 0.4836, "step": 790 }, { - "epoch": 15.014, - "grad_norm": 0.08553914725780487, - "learning_rate": 7.555555555555556e-06, - "loss": 0.0991, + "epoch": 11.008571428571429, + "grad_norm": 31.987483978271484, + "learning_rate": 8.571428571428571e-06, + "loss": 0.1931, "step": 800 }, { - "epoch": 15.018, - "grad_norm": 0.11689701676368713, - "learning_rate": 7.511111111111111e-06, - "loss": 0.2449, + "epoch": 11.01142857142857, + "grad_norm": 48.23198318481445, + "learning_rate": 8.53968253968254e-06, + "loss": 0.1602, "step": 810 }, { - "epoch": 15.0204, - "eval_accuracy": 0.7125, - "eval_loss": 1.3988301753997803, - "eval_runtime": 14.9719, - "eval_samples_per_second": 5.343, - "eval_steps_per_second": 1.336, - "step": 816 - }, - { - "epoch": 16.0016, - "grad_norm": 0.12991830706596375, - "learning_rate": 7.4666666666666675e-06, - "loss": 0.009, + "epoch": 11.014285714285714, + "grad_norm": 83.9683609008789, + "learning_rate": 8.507936507936509e-06, + "loss": 0.4893, "step": 820 }, { - "epoch": 16.0056, - "grad_norm": 185.4541015625, - "learning_rate": 7.422222222222223e-06, - "loss": 0.2772, + "epoch": 11.017142857142858, + "grad_norm": 47.01933670043945, + "learning_rate": 8.476190476190477e-06, + "loss": 0.3692, "step": 830 }, { - "epoch": 16.0096, - "grad_norm": 0.7622888088226318, - "learning_rate": 7.377777777777778e-06, - "loss": 0.1701, + "epoch": 11.02, + "grad_norm": 0.9395814538002014, + "learning_rate": 8.444444444444446e-06, + "loss": 0.1888, "step": 840 }, { - "epoch": 16.0136, - "grad_norm": 0.5893406867980957, - "learning_rate": 7.333333333333333e-06, - "loss": 0.1631, - "step": 850 + "epoch": 11.02, + "eval_accuracy": 0.625, + "eval_loss": 1.155624270439148, + "eval_runtime": 5.9873, + "eval_samples_per_second": 5.345, + "eval_steps_per_second": 1.336, + "step": 840 }, { - "epoch": 16.0176, - "grad_norm": 0.7813571691513062, - "learning_rate": 7.28888888888889e-06, - "loss": 0.1326, - "step": 860 + "epoch": 12.002857142857144, + "grad_norm": 0.11803951114416122, + "learning_rate": 8.412698412698414e-06, + "loss": 0.1841, + "step": 850 }, { - "epoch": 16.0204, - "eval_accuracy": 0.7125, - "eval_loss": 1.715152382850647, - "eval_runtime": 15.1438, - "eval_samples_per_second": 5.283, - "eval_steps_per_second": 1.321, - "step": 867 + "epoch": 12.005714285714285, + "grad_norm": 43.479034423828125, + "learning_rate": 8.380952380952382e-06, + "loss": 0.2496, + "step": 860 }, { - "epoch": 17.0012, - "grad_norm": 0.07987383008003235, - "learning_rate": 7.244444444444445e-06, - "loss": 0.0021, + "epoch": 12.008571428571429, + "grad_norm": 47.395957946777344, + "learning_rate": 8.34920634920635e-06, + "loss": 0.4539, "step": 870 }, { - "epoch": 17.0052, - "grad_norm": 17.579423904418945, - "learning_rate": 7.2000000000000005e-06, - "loss": 0.2835, + "epoch": 12.01142857142857, + "grad_norm": 8.515253067016602, + "learning_rate": 8.317460317460319e-06, + "loss": 0.1964, "step": 880 }, { - "epoch": 17.0092, - "grad_norm": 0.05407591536641121, - "learning_rate": 7.155555555555556e-06, - "loss": 0.0557, + "epoch": 12.014285714285714, + "grad_norm": 37.08503723144531, + "learning_rate": 8.285714285714287e-06, + "loss": 0.1819, "step": 890 }, { - "epoch": 17.0132, - "grad_norm": 129.4159393310547, - "learning_rate": 7.111111111111112e-06, - "loss": 0.147, + "epoch": 12.017142857142858, + "grad_norm": 5.870359420776367, + "learning_rate": 8.253968253968254e-06, + "loss": 0.2055, "step": 900 }, { - "epoch": 17.0172, - "grad_norm": 0.6726216673851013, - "learning_rate": 7.066666666666667e-06, - "loss": 0.0018, + "epoch": 12.02, + "grad_norm": 46.779685974121094, + "learning_rate": 8.222222222222222e-06, + "loss": 0.3762, "step": 910 }, { - "epoch": 17.0204, - "eval_accuracy": 0.6375, - "eval_loss": 2.1475367546081543, - "eval_runtime": 15.0746, - "eval_samples_per_second": 5.307, - "eval_steps_per_second": 1.327, - "step": 918 + "epoch": 12.02, + "eval_accuracy": 0.625, + "eval_loss": 1.3499457836151123, + "eval_runtime": 6.0533, + "eval_samples_per_second": 5.286, + "eval_steps_per_second": 1.322, + "step": 910 }, { - "epoch": 18.0008, - "grad_norm": 0.06342015415430069, - "learning_rate": 7.022222222222222e-06, - "loss": 0.2025, + "epoch": 13.002857142857144, + "grad_norm": 5.09779167175293, + "learning_rate": 8.190476190476192e-06, + "loss": 0.0865, "step": 920 }, { - "epoch": 18.0048, - "grad_norm": 0.03912360593676567, - "learning_rate": 6.977777777777779e-06, - "loss": 0.0017, + "epoch": 13.005714285714285, + "grad_norm": 0.5768360495567322, + "learning_rate": 8.15873015873016e-06, + "loss": 0.282, "step": 930 }, { - "epoch": 18.0088, - "grad_norm": 0.015669086948037148, - "learning_rate": 6.9333333333333344e-06, - "loss": 0.1518, + "epoch": 13.008571428571429, + "grad_norm": 15.04159164428711, + "learning_rate": 8.126984126984128e-06, + "loss": 0.276, "step": 940 }, { - "epoch": 18.0128, - "grad_norm": 0.03878331929445267, - "learning_rate": 6.88888888888889e-06, - "loss": 0.6683, + "epoch": 13.01142857142857, + "grad_norm": 13.419623374938965, + "learning_rate": 8.095238095238097e-06, + "loss": 0.3172, "step": 950 }, { - "epoch": 18.0168, - "grad_norm": 0.009163687005639076, - "learning_rate": 6.844444444444445e-06, - "loss": 0.3631, + "epoch": 13.014285714285714, + "grad_norm": 50.46155548095703, + "learning_rate": 8.063492063492065e-06, + "loss": 0.2552, "step": 960 }, { - "epoch": 18.0204, - "eval_accuracy": 0.65, - "eval_loss": 1.8957328796386719, - "eval_runtime": 14.5069, - "eval_samples_per_second": 5.515, - "eval_steps_per_second": 1.379, - "step": 969 + "epoch": 13.017142857142858, + "grad_norm": 0.9787739515304565, + "learning_rate": 8.031746031746033e-06, + "loss": 0.1545, + "step": 970 }, { - "epoch": 19.0004, - "grad_norm": 0.030928779393434525, - "learning_rate": 6.800000000000001e-06, - "loss": 0.1663, - "step": 970 + "epoch": 13.02, + "grad_norm": 18.6726131439209, + "learning_rate": 8.000000000000001e-06, + "loss": 0.3502, + "step": 980 }, { - "epoch": 19.0044, - "grad_norm": 0.8263186812400818, - "learning_rate": 6.755555555555556e-06, - "loss": 0.2357, + "epoch": 13.02, + "eval_accuracy": 0.65625, + "eval_loss": 1.5333378314971924, + "eval_runtime": 5.7825, + "eval_samples_per_second": 5.534, + "eval_steps_per_second": 1.383, "step": 980 }, { - "epoch": 19.0084, - "grad_norm": 0.09255637228488922, - "learning_rate": 6.711111111111111e-06, - "loss": 0.0141, + "epoch": 14.002857142857144, + "grad_norm": 0.06214858964085579, + "learning_rate": 7.968253968253968e-06, + "loss": 0.332, "step": 990 }, { - "epoch": 19.0124, - "grad_norm": 0.10079475492238998, - "learning_rate": 6.666666666666667e-06, - "loss": 0.001, + "epoch": 14.005714285714285, + "grad_norm": 33.86188888549805, + "learning_rate": 7.936507936507936e-06, + "loss": 0.1485, "step": 1000 }, { - "epoch": 19.0164, - "grad_norm": 0.1445166915655136, - "learning_rate": 6.6222222222222236e-06, - "loss": 0.1313, + "epoch": 14.008571428571429, + "grad_norm": 24.82603645324707, + "learning_rate": 7.904761904761904e-06, + "loss": 0.0319, "step": 1010 }, { - "epoch": 19.0204, - "grad_norm": 0.0298333577811718, - "learning_rate": 6.577777777777779e-06, - "loss": 0.1252, - "step": 1020 - }, - { - "epoch": 19.0204, - "eval_accuracy": 0.825, - "eval_loss": 1.124619960784912, - "eval_runtime": 14.4479, - "eval_samples_per_second": 5.537, - "eval_steps_per_second": 1.384, + "epoch": 14.01142857142857, + "grad_norm": 0.01239538099616766, + "learning_rate": 7.873015873015873e-06, + "loss": 0.0538, "step": 1020 }, { - "epoch": 20.004, - "grad_norm": 0.19592173397541046, - "learning_rate": 6.533333333333334e-06, - "loss": 0.0141, + "epoch": 14.014285714285714, + "grad_norm": 8.576937675476074, + "learning_rate": 7.841269841269843e-06, + "loss": 0.0159, "step": 1030 }, { - "epoch": 20.008, - "grad_norm": 42.12409591674805, - "learning_rate": 6.488888888888889e-06, - "loss": 0.1769, + "epoch": 14.017142857142858, + "grad_norm": 0.08205872029066086, + "learning_rate": 7.809523809523811e-06, + "loss": 0.2445, "step": 1040 }, { - "epoch": 20.012, - "grad_norm": 482.6688537597656, - "learning_rate": 6.444444444444445e-06, - "loss": 0.1202, + "epoch": 14.02, + "grad_norm": 0.10681977868080139, + "learning_rate": 7.77777777777778e-06, + "loss": 0.1027, "step": 1050 }, { - "epoch": 20.016, - "grad_norm": 0.5874069333076477, - "learning_rate": 6.4000000000000006e-06, - "loss": 0.0007, - "step": 1060 + "epoch": 14.02, + "eval_accuracy": 0.65625, + "eval_loss": 1.6248817443847656, + "eval_runtime": 5.8213, + "eval_samples_per_second": 5.497, + "eval_steps_per_second": 1.374, + "step": 1050 }, { - "epoch": 20.02, - "grad_norm": 0.018259378150105476, - "learning_rate": 6.355555555555556e-06, - "loss": 0.0943, - "step": 1070 + "epoch": 15.002857142857144, + "grad_norm": 0.050746768712997437, + "learning_rate": 7.746031746031747e-06, + "loss": 0.2031, + "step": 1060 }, { - "epoch": 20.0204, - "eval_accuracy": 0.6625, - "eval_loss": 1.9498172998428345, - "eval_runtime": 14.4129, - "eval_samples_per_second": 5.551, - "eval_steps_per_second": 1.388, - "step": 1071 + "epoch": 15.005714285714285, + "grad_norm": 0.08122105151414871, + "learning_rate": 7.714285714285716e-06, + "loss": 0.2621, + "step": 1070 }, { - "epoch": 21.0036, - "grad_norm": 0.00311831571161747, - "learning_rate": 6.311111111111111e-06, - "loss": 0.2018, + "epoch": 15.008571428571429, + "grad_norm": 23.472549438476562, + "learning_rate": 7.682539682539684e-06, + "loss": 0.4242, "step": 1080 }, { - "epoch": 21.0076, - "grad_norm": 7.55759859085083, - "learning_rate": 6.266666666666668e-06, - "loss": 0.333, + "epoch": 15.01142857142857, + "grad_norm": 9.292193412780762, + "learning_rate": 7.65079365079365e-06, + "loss": 0.1143, "step": 1090 }, { - "epoch": 21.0116, - "grad_norm": 0.13464294373989105, - "learning_rate": 6.222222222222223e-06, - "loss": 0.2933, + "epoch": 15.014285714285714, + "grad_norm": 2.8956408500671387, + "learning_rate": 7.61904761904762e-06, + "loss": 0.2507, "step": 1100 }, { - "epoch": 21.0156, - "grad_norm": 0.013299187645316124, - "learning_rate": 6.177777777777778e-06, - "loss": 0.0908, + "epoch": 15.017142857142858, + "grad_norm": 0.09770918637514114, + "learning_rate": 7.587301587301588e-06, + "loss": 0.2032, "step": 1110 }, { - "epoch": 21.0196, - "grad_norm": 28.606412887573242, - "learning_rate": 6.133333333333334e-06, - "loss": 0.3488, + "epoch": 15.02, + "grad_norm": 0.08947720378637314, + "learning_rate": 7.555555555555556e-06, + "loss": 0.177, "step": 1120 }, { - "epoch": 21.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.3456709384918213, - "eval_runtime": 15.9369, - "eval_samples_per_second": 5.02, - "eval_steps_per_second": 1.255, - "step": 1122 + "epoch": 15.02, + "eval_accuracy": 0.65625, + "eval_loss": 1.3757655620574951, + "eval_runtime": 6.1961, + "eval_samples_per_second": 5.165, + "eval_steps_per_second": 1.291, + "step": 1120 }, { - "epoch": 22.0032, - "grad_norm": 2.039261817932129, - "learning_rate": 6.08888888888889e-06, - "loss": 0.2154, + "epoch": 16.002857142857142, + "grad_norm": 235.26527404785156, + "learning_rate": 7.523809523809524e-06, + "loss": 0.1035, "step": 1130 }, { - "epoch": 22.0072, - "grad_norm": 6.409753799438477, - "learning_rate": 6.044444444444445e-06, - "loss": 0.0051, + "epoch": 16.005714285714287, + "grad_norm": 136.0679473876953, + "learning_rate": 7.492063492063493e-06, + "loss": 0.1338, "step": 1140 }, { - "epoch": 22.0112, - "grad_norm": 0.010469136759638786, - "learning_rate": 6e-06, - "loss": 0.1893, + "epoch": 16.00857142857143, + "grad_norm": 3.926299571990967, + "learning_rate": 7.460317460317461e-06, + "loss": 0.1069, "step": 1150 }, { - "epoch": 22.0152, - "grad_norm": 0.022801605984568596, - "learning_rate": 5.955555555555555e-06, - "loss": 0.0428, + "epoch": 16.01142857142857, + "grad_norm": 0.044299498200416565, + "learning_rate": 7.428571428571429e-06, + "loss": 0.0227, "step": 1160 }, { - "epoch": 22.0192, - "grad_norm": 0.014003835618495941, - "learning_rate": 5.911111111111112e-06, - "loss": 0.0008, + "epoch": 16.014285714285716, + "grad_norm": 0.04774919152259827, + "learning_rate": 7.3968253968253975e-06, + "loss": 0.0556, "step": 1170 }, { - "epoch": 22.0204, - "eval_accuracy": 0.7125, - "eval_loss": 1.7872467041015625, - "eval_runtime": 14.9328, - "eval_samples_per_second": 5.357, - "eval_steps_per_second": 1.339, - "step": 1173 + "epoch": 16.017142857142858, + "grad_norm": 0.24615606665611267, + "learning_rate": 7.3650793650793666e-06, + "loss": 0.1964, + "step": 1180 }, { - "epoch": 23.0028, - "grad_norm": 336.1969299316406, - "learning_rate": 5.8666666666666675e-06, - "loss": 0.5488, - "step": 1180 + "epoch": 16.02, + "grad_norm": 0.14104069769382477, + "learning_rate": 7.333333333333333e-06, + "loss": 0.0998, + "step": 1190 }, { - "epoch": 23.0068, - "grad_norm": 0.009750437922775745, - "learning_rate": 5.822222222222223e-06, - "loss": 0.3029, + "epoch": 16.02, + "eval_accuracy": 0.65625, + "eval_loss": 1.9513726234436035, + "eval_runtime": 5.7043, + "eval_samples_per_second": 5.61, + "eval_steps_per_second": 1.402, "step": 1190 }, { - "epoch": 23.0108, - "grad_norm": 0.15668730437755585, - "learning_rate": 5.777777777777778e-06, - "loss": 0.6396, + "epoch": 17.002857142857142, + "grad_norm": 6.165931701660156, + "learning_rate": 7.301587301587301e-06, + "loss": 0.1262, "step": 1200 }, { - "epoch": 23.0148, - "grad_norm": 0.019450828433036804, - "learning_rate": 5.733333333333334e-06, - "loss": 0.0974, + "epoch": 17.005714285714287, + "grad_norm": 89.7344741821289, + "learning_rate": 7.2698412698412705e-06, + "loss": 0.0378, "step": 1210 }, { - "epoch": 23.0188, - "grad_norm": 6.186660289764404, - "learning_rate": 5.688888888888889e-06, - "loss": 0.009, + "epoch": 17.00857142857143, + "grad_norm": 0.02539096027612686, + "learning_rate": 7.238095238095239e-06, + "loss": 0.0044, "step": 1220 }, { - "epoch": 23.0204, - "eval_accuracy": 0.75, - "eval_loss": 1.5437147617340088, - "eval_runtime": 14.8598, - "eval_samples_per_second": 5.384, - "eval_steps_per_second": 1.346, - "step": 1224 - }, - { - "epoch": 24.0024, - "grad_norm": 0.012686701491475105, - "learning_rate": 5.6444444444444445e-06, - "loss": 0.0007, + "epoch": 17.01142857142857, + "grad_norm": 0.14466840028762817, + "learning_rate": 7.206349206349207e-06, + "loss": 0.2566, "step": 1230 }, { - "epoch": 24.0064, - "grad_norm": 0.008507036603987217, - "learning_rate": 5.600000000000001e-06, - "loss": 0.0796, + "epoch": 17.014285714285716, + "grad_norm": 112.21966552734375, + "learning_rate": 7.174603174603175e-06, + "loss": 0.0551, "step": 1240 }, { - "epoch": 24.0104, - "grad_norm": 0.05275079980492592, - "learning_rate": 5.555555555555557e-06, - "loss": 0.0045, + "epoch": 17.017142857142858, + "grad_norm": 158.01324462890625, + "learning_rate": 7.1428571428571436e-06, + "loss": 0.0398, "step": 1250 }, { - "epoch": 24.0144, - "grad_norm": 0.020087506622076035, - "learning_rate": 5.511111111111112e-06, - "loss": 0.0007, + "epoch": 17.02, + "grad_norm": 0.04698384925723076, + "learning_rate": 7.111111111111112e-06, + "loss": 0.1749, "step": 1260 }, { - "epoch": 24.0184, - "grad_norm": 0.0037845964543521404, - "learning_rate": 5.466666666666667e-06, - "loss": 0.0274, - "step": 1270 + "epoch": 17.02, + "eval_accuracy": 0.65625, + "eval_loss": 1.9119518995285034, + "eval_runtime": 5.6445, + "eval_samples_per_second": 5.669, + "eval_steps_per_second": 1.417, + "step": 1260 }, { - "epoch": 24.0204, - "eval_accuracy": 0.6875, - "eval_loss": 1.9865350723266602, - "eval_runtime": 15.8645, - "eval_samples_per_second": 5.043, - "eval_steps_per_second": 1.261, - "step": 1275 + "epoch": 18.002857142857142, + "grad_norm": 0.007913816720247269, + "learning_rate": 7.07936507936508e-06, + "loss": 0.0817, + "step": 1270 }, { - "epoch": 25.002, - "grad_norm": 0.008797760121524334, - "learning_rate": 5.422222222222223e-06, - "loss": 0.1231, + "epoch": 18.005714285714287, + "grad_norm": 12.556783676147461, + "learning_rate": 7.047619047619048e-06, + "loss": 0.0342, "step": 1280 }, { - "epoch": 25.006, - "grad_norm": 0.012985019944608212, - "learning_rate": 5.3777777777777784e-06, - "loss": 0.0003, + "epoch": 18.00857142857143, + "grad_norm": 0.12866567075252533, + "learning_rate": 7.015873015873016e-06, + "loss": 0.1667, "step": 1290 }, { - "epoch": 25.01, - "grad_norm": 0.02057729661464691, - "learning_rate": 5.333333333333334e-06, - "loss": 0.058, + "epoch": 18.01142857142857, + "grad_norm": 0.007662401534616947, + "learning_rate": 6.984126984126984e-06, + "loss": 0.1154, "step": 1300 }, { - "epoch": 25.014, - "grad_norm": 1.6732549667358398, - "learning_rate": 5.288888888888889e-06, - "loss": 0.002, + "epoch": 18.014285714285716, + "grad_norm": 8.587801933288574, + "learning_rate": 6.952380952380952e-06, + "loss": 0.0235, "step": 1310 }, { - "epoch": 25.018, - "grad_norm": 0.006808862090110779, - "learning_rate": 5.244444444444445e-06, - "loss": 0.0004, + "epoch": 18.017142857142858, + "grad_norm": 0.016857486218214035, + "learning_rate": 6.920634920634921e-06, + "loss": 0.0947, "step": 1320 }, { - "epoch": 25.0204, - "eval_accuracy": 0.7625, - "eval_loss": 1.5100300312042236, - "eval_runtime": 15.0066, - "eval_samples_per_second": 5.331, - "eval_steps_per_second": 1.333, - "step": 1326 + "epoch": 18.02, + "grad_norm": 0.017974289134144783, + "learning_rate": 6.88888888888889e-06, + "loss": 0.0145, + "step": 1330 }, { - "epoch": 26.0016, - "grad_norm": 0.011416507884860039, - "learning_rate": 5.2e-06, - "loss": 0.0003, + "epoch": 18.02, + "eval_accuracy": 0.625, + "eval_loss": 2.1035962104797363, + "eval_runtime": 6.0525, + "eval_samples_per_second": 5.287, + "eval_steps_per_second": 1.322, "step": 1330 }, { - "epoch": 26.0056, - "grad_norm": 0.006314845755696297, - "learning_rate": 5.155555555555556e-06, - "loss": 0.0236, + "epoch": 19.002857142857142, + "grad_norm": 0.1701594740152359, + "learning_rate": 6.857142857142858e-06, + "loss": 0.1449, "step": 1340 }, { - "epoch": 26.0096, - "grad_norm": 1.8917161226272583, - "learning_rate": 5.1111111111111115e-06, - "loss": 0.0011, + "epoch": 19.005714285714287, + "grad_norm": 0.035870511084795, + "learning_rate": 6.825396825396826e-06, + "loss": 0.0272, "step": 1350 }, { - "epoch": 26.0136, - "grad_norm": 0.015401429496705532, - "learning_rate": 5.0666666666666676e-06, - "loss": 0.0003, + "epoch": 19.00857142857143, + "grad_norm": 0.00805743969976902, + "learning_rate": 6.7936507936507944e-06, + "loss": 0.0075, "step": 1360 }, { - "epoch": 26.0176, - "grad_norm": 0.2598150372505188, - "learning_rate": 5.022222222222223e-06, - "loss": 0.1007, + "epoch": 19.01142857142857, + "grad_norm": 0.008254293352365494, + "learning_rate": 6.761904761904763e-06, + "loss": 0.073, "step": 1370 }, { - "epoch": 26.0204, - "eval_accuracy": 0.6875, - "eval_loss": 1.959010124206543, - "eval_runtime": 15.1089, - "eval_samples_per_second": 5.295, - "eval_steps_per_second": 1.324, - "step": 1377 - }, - { - "epoch": 27.0012, - "grad_norm": 0.025213167071342468, - "learning_rate": 4.977777777777778e-06, - "loss": 0.0352, + "epoch": 19.014285714285716, + "grad_norm": 126.25316619873047, + "learning_rate": 6.730158730158731e-06, + "loss": 0.1744, "step": 1380 }, { - "epoch": 27.0052, - "grad_norm": 0.17898155748844147, - "learning_rate": 4.933333333333334e-06, + "epoch": 19.017142857142858, + "grad_norm": 0.012624472379684448, + "learning_rate": 6.698412698412698e-06, "loss": 0.0003, "step": 1390 }, { - "epoch": 27.0092, - "grad_norm": 0.011180482804775238, - "learning_rate": 4.888888888888889e-06, - "loss": 0.3099, + "epoch": 19.02, + "grad_norm": 0.017210116609930992, + "learning_rate": 6.666666666666667e-06, + "loss": 0.0038, "step": 1400 }, { - "epoch": 27.0132, - "grad_norm": 0.003275972092524171, - "learning_rate": 4.8444444444444446e-06, - "loss": 0.0006, - "step": 1410 + "epoch": 19.02, + "eval_accuracy": 0.625, + "eval_loss": 2.0288360118865967, + "eval_runtime": 5.8437, + "eval_samples_per_second": 5.476, + "eval_steps_per_second": 1.369, + "step": 1400 }, { - "epoch": 27.0172, - "grad_norm": 0.10510570555925369, - "learning_rate": 4.800000000000001e-06, - "loss": 0.0006, - "step": 1420 + "epoch": 20.002857142857142, + "grad_norm": 0.012508122250437737, + "learning_rate": 6.634920634920635e-06, + "loss": 0.005, + "step": 1410 }, { - "epoch": 27.0204, - "eval_accuracy": 0.7125, - "eval_loss": 1.8345705270767212, - "eval_runtime": 15.6323, - "eval_samples_per_second": 5.118, - "eval_steps_per_second": 1.279, - "step": 1428 + "epoch": 20.005714285714287, + "grad_norm": 0.07273050397634506, + "learning_rate": 6.603174603174603e-06, + "loss": 0.0004, + "step": 1420 }, { - "epoch": 28.0008, - "grad_norm": 0.00974931288510561, - "learning_rate": 4.755555555555556e-06, - "loss": 0.0003, + "epoch": 20.00857142857143, + "grad_norm": 0.00631905160844326, + "learning_rate": 6.571428571428572e-06, + "loss": 0.0036, "step": 1430 }, { - "epoch": 28.0048, - "grad_norm": 0.01707894168794155, - "learning_rate": 4.711111111111111e-06, - "loss": 0.0002, + "epoch": 20.01142857142857, + "grad_norm": 7.109875679016113, + "learning_rate": 6.5396825396825405e-06, + "loss": 0.0589, "step": 1440 }, { - "epoch": 28.0088, - "grad_norm": 0.007560590747743845, - "learning_rate": 4.666666666666667e-06, - "loss": 0.0004, + "epoch": 20.014285714285716, + "grad_norm": 0.1848406195640564, + "learning_rate": 6.507936507936509e-06, + "loss": 0.0003, "step": 1450 }, { - "epoch": 28.0128, - "grad_norm": 66.74208068847656, - "learning_rate": 4.622222222222222e-06, - "loss": 0.1921, + "epoch": 20.017142857142858, + "grad_norm": 0.005294440779834986, + "learning_rate": 6.476190476190477e-06, + "loss": 0.0004, "step": 1460 }, { - "epoch": 28.0168, - "grad_norm": 0.0025084693916141987, - "learning_rate": 4.5777777777777785e-06, - "loss": 0.0006, + "epoch": 20.02, + "grad_norm": 0.020306957885622978, + "learning_rate": 6.444444444444445e-06, + "loss": 0.1262, "step": 1470 }, { - "epoch": 28.0204, - "eval_accuracy": 0.825, - "eval_loss": 1.4668537378311157, - "eval_runtime": 14.5998, - "eval_samples_per_second": 5.48, - "eval_steps_per_second": 1.37, - "step": 1479 + "epoch": 20.02, + "eval_accuracy": 0.6875, + "eval_loss": 2.019291400909424, + "eval_runtime": 5.6553, + "eval_samples_per_second": 5.658, + "eval_steps_per_second": 1.415, + "step": 1470 }, { - "epoch": 29.0004, - "grad_norm": 0.3155474364757538, - "learning_rate": 4.533333333333334e-06, - "loss": 0.0002, + "epoch": 21.002857142857142, + "grad_norm": 0.020932432264089584, + "learning_rate": 6.412698412698414e-06, + "loss": 0.123, "step": 1480 }, { - "epoch": 29.0044, - "grad_norm": 0.024095896631479263, - "learning_rate": 4.488888888888889e-06, - "loss": 0.0011, + "epoch": 21.005714285714287, + "grad_norm": 0.0034719400573521852, + "learning_rate": 6.380952380952381e-06, + "loss": 0.0006, "step": 1490 }, { - "epoch": 29.0084, - "grad_norm": 0.00578249292448163, - "learning_rate": 4.444444444444444e-06, - "loss": 0.0002, + "epoch": 21.00857142857143, + "grad_norm": 0.40151557326316833, + "learning_rate": 6.349206349206349e-06, + "loss": 0.0427, "step": 1500 }, { - "epoch": 29.0124, - "grad_norm": 0.009000943042337894, - "learning_rate": 4.4e-06, - "loss": 0.0008, + "epoch": 21.01142857142857, + "grad_norm": 0.008790241554379463, + "learning_rate": 6.3174603174603175e-06, + "loss": 0.1958, "step": 1510 }, { - "epoch": 29.0164, - "grad_norm": 0.3360608220100403, - "learning_rate": 4.3555555555555555e-06, - "loss": 0.0004, + "epoch": 21.014285714285716, + "grad_norm": 0.012369256466627121, + "learning_rate": 6.285714285714286e-06, + "loss": 0.0894, "step": 1520 }, { - "epoch": 29.0204, - "grad_norm": 0.007300488650798798, - "learning_rate": 4.3111111111111115e-06, - "loss": 0.0001, + "epoch": 21.017142857142858, + "grad_norm": 0.016859106719493866, + "learning_rate": 6.253968253968254e-06, + "loss": 0.0003, "step": 1530 }, { - "epoch": 29.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.5396068096160889, - "eval_runtime": 14.8018, - "eval_samples_per_second": 5.405, - "eval_steps_per_second": 1.351, - "step": 1530 + "epoch": 21.02, + "grad_norm": 0.007720629218965769, + "learning_rate": 6.222222222222223e-06, + "loss": 0.0203, + "step": 1540 }, { - "epoch": 30.004, - "grad_norm": 0.0051333606243133545, - "learning_rate": 4.266666666666668e-06, - "loss": 0.0002, + "epoch": 21.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.1937031745910645, + "eval_runtime": 5.7777, + "eval_samples_per_second": 5.539, + "eval_steps_per_second": 1.385, "step": 1540 }, { - "epoch": 30.008, - "grad_norm": 0.006649728864431381, - "learning_rate": 4.222222222222223e-06, + "epoch": 22.002857142857142, + "grad_norm": 0.029361654072999954, + "learning_rate": 6.1904761904761914e-06, "loss": 0.0002, "step": 1550 }, { - "epoch": 30.012, - "grad_norm": 0.004679904319345951, - "learning_rate": 4.177777777777778e-06, - "loss": 0.0002, + "epoch": 22.005714285714287, + "grad_norm": 0.013734079897403717, + "learning_rate": 6.15873015873016e-06, + "loss": 0.0253, "step": 1560 }, { - "epoch": 30.016, - "grad_norm": 220.38076782226562, - "learning_rate": 4.133333333333333e-06, - "loss": 0.0777, + "epoch": 22.00857142857143, + "grad_norm": 0.00359090743586421, + "learning_rate": 6.126984126984128e-06, + "loss": 0.0003, "step": 1570 }, { - "epoch": 30.02, - "grad_norm": 0.004013615660369396, - "learning_rate": 4.088888888888889e-06, - "loss": 0.0002, + "epoch": 22.01142857142857, + "grad_norm": 0.0032050181180238724, + "learning_rate": 6.095238095238096e-06, + "loss": 0.1542, "step": 1580 }, { - "epoch": 30.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.571619987487793, - "eval_runtime": 14.813, - "eval_samples_per_second": 5.401, - "eval_steps_per_second": 1.35, - "step": 1581 - }, - { - "epoch": 31.0036, - "grad_norm": 0.049216415733098984, - "learning_rate": 4.044444444444445e-06, - "loss": 0.0002, + "epoch": 22.014285714285716, + "grad_norm": 0.009580553509294987, + "learning_rate": 6.063492063492064e-06, + "loss": 0.0003, "step": 1590 }, { - "epoch": 31.0076, - "grad_norm": 0.01534576527774334, - "learning_rate": 4.000000000000001e-06, - "loss": 0.0001, + "epoch": 22.017142857142858, + "grad_norm": 0.00456986203789711, + "learning_rate": 6.031746031746032e-06, + "loss": 0.0819, "step": 1600 }, { - "epoch": 31.0116, - "grad_norm": 0.002917769132182002, - "learning_rate": 3.955555555555556e-06, + "epoch": 22.02, + "grad_norm": 0.004042602144181728, + "learning_rate": 6e-06, "loss": 0.0002, "step": 1610 }, { - "epoch": 31.0156, - "grad_norm": 0.002222651382908225, - "learning_rate": 3.911111111111112e-06, - "loss": 0.0002, - "step": 1620 + "epoch": 22.02, + "eval_accuracy": 0.625, + "eval_loss": 2.292170524597168, + "eval_runtime": 5.6653, + "eval_samples_per_second": 5.648, + "eval_steps_per_second": 1.412, + "step": 1610 }, { - "epoch": 31.0196, - "grad_norm": 0.008118602447211742, - "learning_rate": 3.866666666666667e-06, - "loss": 0.0001, - "step": 1630 + "epoch": 23.002857142857142, + "grad_norm": 0.009900301694869995, + "learning_rate": 5.968253968253968e-06, + "loss": 0.0094, + "step": 1620 }, { - "epoch": 31.0204, - "eval_accuracy": 0.7625, - "eval_loss": 1.6614097356796265, - "eval_runtime": 14.4379, - "eval_samples_per_second": 5.541, - "eval_steps_per_second": 1.385, - "step": 1632 + "epoch": 23.005714285714287, + "grad_norm": 0.006440621335059404, + "learning_rate": 5.936507936507937e-06, + "loss": 0.0002, + "step": 1630 }, { - "epoch": 32.0032, - "grad_norm": 0.004146672319620848, - "learning_rate": 3.8222222222222224e-06, - "loss": 0.0001, + "epoch": 23.00857142857143, + "grad_norm": 0.0025428766384720802, + "learning_rate": 5.904761904761905e-06, + "loss": 0.1124, "step": 1640 }, { - "epoch": 32.0072, - "grad_norm": 0.0034797696862369776, - "learning_rate": 3.777777777777778e-06, - "loss": 0.0001, + "epoch": 23.01142857142857, + "grad_norm": 0.0017241127789020538, + "learning_rate": 5.873015873015874e-06, + "loss": 0.0002, "step": 1650 }, { - "epoch": 32.0112, - "grad_norm": 0.0042143468745052814, - "learning_rate": 3.7333333333333337e-06, - "loss": 0.0001, + "epoch": 23.014285714285716, + "grad_norm": 0.001817885902710259, + "learning_rate": 5.841269841269842e-06, + "loss": 0.0366, "step": 1660 }, { - "epoch": 32.0152, - "grad_norm": 0.054984163492918015, - "learning_rate": 3.688888888888889e-06, - "loss": 0.0001, + "epoch": 23.017142857142858, + "grad_norm": 0.016778158023953438, + "learning_rate": 5.8095238095238106e-06, + "loss": 0.0002, "step": 1670 }, { - "epoch": 32.0192, - "grad_norm": 0.004838942550122738, - "learning_rate": 3.644444444444445e-06, - "loss": 0.0002, + "epoch": 23.02, + "grad_norm": 0.003240015124902129, + "learning_rate": 5.777777777777778e-06, + "loss": 0.0017, "step": 1680 }, { - "epoch": 32.0204, - "eval_accuracy": 0.7625, - "eval_loss": 1.6355606317520142, - "eval_runtime": 15.4442, - "eval_samples_per_second": 5.18, - "eval_steps_per_second": 1.295, - "step": 1683 + "epoch": 23.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.1568727493286133, + "eval_runtime": 6.0202, + "eval_samples_per_second": 5.315, + "eval_steps_per_second": 1.329, + "step": 1680 }, { - "epoch": 33.0028, - "grad_norm": 0.3461035192012787, - "learning_rate": 3.6000000000000003e-06, - "loss": 0.0001, + "epoch": 24.002857142857142, + "grad_norm": 0.040594782680273056, + "learning_rate": 5.746031746031746e-06, + "loss": 0.0002, "step": 1690 }, { - "epoch": 33.0068, - "grad_norm": 0.002831398043781519, - "learning_rate": 3.555555555555556e-06, - "loss": 0.0001, + "epoch": 24.005714285714287, + "grad_norm": 0.009448004886507988, + "learning_rate": 5.7142857142857145e-06, + "loss": 0.0003, "step": 1700 }, { - "epoch": 33.0108, - "grad_norm": 0.06873564422130585, - "learning_rate": 3.511111111111111e-06, + "epoch": 24.00857142857143, + "grad_norm": 0.0061890422366559505, + "learning_rate": 5.682539682539683e-06, "loss": 0.0001, "step": 1710 }, { - "epoch": 33.0148, - "grad_norm": 0.006833823397755623, - "learning_rate": 3.4666666666666672e-06, + "epoch": 24.01142857142857, + "grad_norm": 0.004276420455425978, + "learning_rate": 5.650793650793651e-06, "loss": 0.0002, "step": 1720 }, { - "epoch": 33.0188, - "grad_norm": 0.004828931763768196, - "learning_rate": 3.4222222222222224e-06, - "loss": 0.0001, + "epoch": 24.014285714285716, + "grad_norm": 0.004243628121912479, + "learning_rate": 5.619047619047619e-06, + "loss": 0.0002, "step": 1730 }, { - "epoch": 33.0204, - "eval_accuracy": 0.8, - "eval_loss": 1.5730502605438232, - "eval_runtime": 14.4769, - "eval_samples_per_second": 5.526, - "eval_steps_per_second": 1.382, - "step": 1734 + "epoch": 24.017142857142858, + "grad_norm": 0.017200473695993423, + "learning_rate": 5.5873015873015876e-06, + "loss": 0.0002, + "step": 1740 }, { - "epoch": 34.0024, - "grad_norm": 0.02358504943549633, - "learning_rate": 3.377777777777778e-06, - "loss": 0.0003, - "step": 1740 + "epoch": 24.02, + "grad_norm": 0.011262903921306133, + "learning_rate": 5.555555555555557e-06, + "loss": 0.0049, + "step": 1750 }, { - "epoch": 34.0064, - "grad_norm": 0.0021904546301811934, - "learning_rate": 3.3333333333333333e-06, - "loss": 0.0237, + "epoch": 24.02, + "eval_accuracy": 0.625, + "eval_loss": 2.257319450378418, + "eval_runtime": 5.9421, + "eval_samples_per_second": 5.385, + "eval_steps_per_second": 1.346, "step": 1750 }, { - "epoch": 34.0104, - "grad_norm": 0.004002240486443043, - "learning_rate": 3.2888888888888894e-06, - "loss": 0.0002, + "epoch": 25.002857142857142, + "grad_norm": 0.0024505627807229757, + "learning_rate": 5.523809523809525e-06, + "loss": 0.0001, "step": 1760 }, { - "epoch": 34.0144, - "grad_norm": 0.00806827750056982, - "learning_rate": 3.2444444444444446e-06, - "loss": 0.0001, + "epoch": 25.005714285714287, + "grad_norm": 0.004473875276744366, + "learning_rate": 5.492063492063493e-06, + "loss": 0.0611, "step": 1770 }, { - "epoch": 34.0184, - "grad_norm": 0.004369418602436781, - "learning_rate": 3.2000000000000003e-06, - "loss": 0.0001, + "epoch": 25.00857142857143, + "grad_norm": 0.002111697569489479, + "learning_rate": 5.460317460317461e-06, + "loss": 0.3776, "step": 1780 }, { - "epoch": 34.0204, - "eval_accuracy": 0.725, - "eval_loss": 2.0019965171813965, - "eval_runtime": 14.7093, - "eval_samples_per_second": 5.439, - "eval_steps_per_second": 1.36, - "step": 1785 - }, - { - "epoch": 35.002, - "grad_norm": 0.004029570147395134, - "learning_rate": 3.1555555555555555e-06, - "loss": 0.0001, + "epoch": 25.01142857142857, + "grad_norm": 0.0120729710906744, + "learning_rate": 5.428571428571429e-06, + "loss": 0.0003, "step": 1790 }, { - "epoch": 35.006, - "grad_norm": 0.0854596495628357, - "learning_rate": 3.1111111111111116e-06, + "epoch": 25.014285714285716, + "grad_norm": 0.005537941120564938, + "learning_rate": 5.396825396825397e-06, "loss": 0.0001, "step": 1800 }, { - "epoch": 35.01, - "grad_norm": 0.00881748553365469, - "learning_rate": 3.066666666666667e-06, - "loss": 0.0001, + "epoch": 25.017142857142858, + "grad_norm": 0.1099364161491394, + "learning_rate": 5.365079365079365e-06, + "loss": 0.0007, "step": 1810 }, { - "epoch": 35.014, - "grad_norm": 0.007664634846150875, - "learning_rate": 3.0222222222222225e-06, - "loss": 0.0001, + "epoch": 25.02, + "grad_norm": 0.001969733275473118, + "learning_rate": 5.333333333333334e-06, + "loss": 0.0231, "step": 1820 }, { - "epoch": 35.018, - "grad_norm": 0.004043503198772669, - "learning_rate": 2.9777777777777777e-06, - "loss": 0.0001, - "step": 1830 + "epoch": 25.02, + "eval_accuracy": 0.6875, + "eval_loss": 2.146007537841797, + "eval_runtime": 5.7856, + "eval_samples_per_second": 5.531, + "eval_steps_per_second": 1.383, + "step": 1820 }, { - "epoch": 35.0204, - "eval_accuracy": 0.75, - "eval_loss": 1.888606071472168, - "eval_runtime": 16.1561, - "eval_samples_per_second": 4.952, - "eval_steps_per_second": 1.238, - "step": 1836 + "epoch": 26.002857142857142, + "grad_norm": 0.1284661889076233, + "learning_rate": 5.301587301587302e-06, + "loss": 0.0007, + "step": 1830 }, { - "epoch": 36.0016, - "grad_norm": 0.027049187570810318, - "learning_rate": 2.9333333333333338e-06, - "loss": 0.0001, + "epoch": 26.005714285714287, + "grad_norm": 0.027212299406528473, + "learning_rate": 5.26984126984127e-06, + "loss": 0.0002, "step": 1840 }, { - "epoch": 36.0056, - "grad_norm": 0.011712036095559597, - "learning_rate": 2.888888888888889e-06, - "loss": 0.0001, + "epoch": 26.00857142857143, + "grad_norm": 113.98649597167969, + "learning_rate": 5.2380952380952384e-06, + "loss": 0.0977, "step": 1850 }, { - "epoch": 36.0096, - "grad_norm": 0.0028739357367157936, - "learning_rate": 2.8444444444444446e-06, - "loss": 0.0001, + "epoch": 26.01142857142857, + "grad_norm": 0.0038598247338086367, + "learning_rate": 5.2063492063492076e-06, + "loss": 0.0002, "step": 1860 }, { - "epoch": 36.0136, - "grad_norm": 0.0029218129348009825, - "learning_rate": 2.8000000000000003e-06, - "loss": 0.0001, + "epoch": 26.014285714285716, + "grad_norm": 0.011346804909408092, + "learning_rate": 5.174603174603176e-06, + "loss": 0.0593, "step": 1870 }, { - "epoch": 36.0176, - "grad_norm": 0.002428996842354536, - "learning_rate": 2.755555555555556e-06, - "loss": 0.0001, + "epoch": 26.017142857142858, + "grad_norm": 60.034358978271484, + "learning_rate": 5.142857142857142e-06, + "loss": 0.0086, "step": 1880 }, { - "epoch": 36.0204, - "eval_accuracy": 0.75, - "eval_loss": 1.8363139629364014, - "eval_runtime": 14.1077, - "eval_samples_per_second": 5.671, - "eval_steps_per_second": 1.418, - "step": 1887 + "epoch": 26.02, + "grad_norm": 0.0061082011088728905, + "learning_rate": 5.1111111111111115e-06, + "loss": 0.0001, + "step": 1890 }, { - "epoch": 37.0012, - "grad_norm": 0.001762293977662921, - "learning_rate": 2.7111111111111116e-06, - "loss": 0.0001, + "epoch": 26.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.356595277786255, + "eval_runtime": 6.1468, + "eval_samples_per_second": 5.206, + "eval_steps_per_second": 1.301, "step": 1890 }, { - "epoch": 37.0052, - "grad_norm": 0.010973138734698296, - "learning_rate": 2.666666666666667e-06, + "epoch": 27.002857142857142, + "grad_norm": 0.005633851513266563, + "learning_rate": 5.07936507936508e-06, "loss": 0.0001, "step": 1900 }, { - "epoch": 37.0092, - "grad_norm": 0.007034891285002232, - "learning_rate": 2.6222222222222225e-06, - "loss": 0.0001, + "epoch": 27.005714285714287, + "grad_norm": 0.011841390281915665, + "learning_rate": 5.047619047619048e-06, + "loss": 0.1327, "step": 1910 }, { - "epoch": 37.0132, - "grad_norm": 0.04831545799970627, - "learning_rate": 2.577777777777778e-06, - "loss": 0.0001, + "epoch": 27.00857142857143, + "grad_norm": 0.005733752157539129, + "learning_rate": 5.015873015873016e-06, + "loss": 0.0002, "step": 1920 }, { - "epoch": 37.0172, - "grad_norm": 0.003924284130334854, - "learning_rate": 2.5333333333333338e-06, - "loss": 0.0001, + "epoch": 27.01142857142857, + "grad_norm": 0.0031420367304235697, + "learning_rate": 4.9841269841269845e-06, + "loss": 0.0028, "step": 1930 }, { - "epoch": 37.0204, - "eval_accuracy": 0.7625, - "eval_loss": 1.6848043203353882, - "eval_runtime": 14.1172, - "eval_samples_per_second": 5.667, - "eval_steps_per_second": 1.417, - "step": 1938 - }, - { - "epoch": 38.0008, - "grad_norm": 0.003881203942000866, - "learning_rate": 2.488888888888889e-06, - "loss": 0.0001, + "epoch": 27.014285714285716, + "grad_norm": 0.0029769607353955507, + "learning_rate": 4.952380952380953e-06, + "loss": 0.0419, "step": 1940 }, { - "epoch": 38.0048, - "grad_norm": 0.002467320766299963, - "learning_rate": 2.4444444444444447e-06, + "epoch": 27.017142857142858, + "grad_norm": 0.0036415501963347197, + "learning_rate": 4.920634920634921e-06, "loss": 0.0001, "step": 1950 }, { - "epoch": 38.0088, - "grad_norm": 0.004267244599759579, - "learning_rate": 2.4000000000000003e-06, + "epoch": 27.02, + "grad_norm": 0.0069373250007629395, + "learning_rate": 4.888888888888889e-06, "loss": 0.0001, "step": 1960 }, { - "epoch": 38.0128, - "grad_norm": 0.003334041452035308, - "learning_rate": 2.3555555555555555e-06, - "loss": 0.1433, - "step": 1970 + "epoch": 27.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.3822171688079834, + "eval_runtime": 5.7966, + "eval_samples_per_second": 5.52, + "eval_steps_per_second": 1.38, + "step": 1960 }, { - "epoch": 38.0168, - "grad_norm": 0.0034292838536202908, - "learning_rate": 2.311111111111111e-06, - "loss": 0.0001, - "step": 1980 + "epoch": 28.002857142857142, + "grad_norm": 0.12265493720769882, + "learning_rate": 4.857142857142858e-06, + "loss": 0.0006, + "step": 1970 }, { - "epoch": 38.0204, - "eval_accuracy": 0.75, - "eval_loss": 1.7187621593475342, - "eval_runtime": 14.1135, - "eval_samples_per_second": 5.668, - "eval_steps_per_second": 1.417, - "step": 1989 + "epoch": 28.005714285714287, + "grad_norm": 0.016870826482772827, + "learning_rate": 4.825396825396826e-06, + "loss": 0.0393, + "step": 1980 }, { - "epoch": 39.0004, - "grad_norm": 0.002690413035452366, - "learning_rate": 2.266666666666667e-06, - "loss": 0.0142, + "epoch": 28.00857142857143, + "grad_norm": 0.02534836158156395, + "learning_rate": 4.793650793650794e-06, + "loss": 0.0001, "step": 1990 }, { - "epoch": 39.0044, - "grad_norm": 0.003135059028863907, - "learning_rate": 2.222222222222222e-06, - "loss": 0.0001, + "epoch": 28.01142857142857, + "grad_norm": 0.0017878487706184387, + "learning_rate": 4.761904761904762e-06, + "loss": 0.0021, "step": 2000 }, { - "epoch": 39.0084, - "grad_norm": 0.003898640163242817, - "learning_rate": 2.1777777777777777e-06, + "epoch": 28.014285714285716, + "grad_norm": 0.004311624448746443, + "learning_rate": 4.730158730158731e-06, "loss": 0.0001, "step": 2010 }, { - "epoch": 39.0124, - "grad_norm": 0.0050819204188883305, - "learning_rate": 2.133333333333334e-06, + "epoch": 28.017142857142858, + "grad_norm": 0.0015392231289297342, + "learning_rate": 4.698412698412699e-06, "loss": 0.0001, "step": 2020 }, { - "epoch": 39.0164, - "grad_norm": 0.0032317114528268576, - "learning_rate": 2.088888888888889e-06, - "loss": 0.0412, + "epoch": 28.02, + "grad_norm": 0.07122951745986938, + "learning_rate": 4.666666666666667e-06, + "loss": 0.0001, "step": 2030 }, { - "epoch": 39.0204, - "grad_norm": 0.00116757582873106, - "learning_rate": 2.0444444444444447e-06, - "loss": 0.0001, - "step": 2040 + "epoch": 28.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.3177592754364014, + "eval_runtime": 5.8041, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 1.378, + "step": 2030 }, { - "epoch": 39.0204, - "eval_accuracy": 0.8, - "eval_loss": 1.5820459127426147, - "eval_runtime": 15.2596, - "eval_samples_per_second": 5.243, - "eval_steps_per_second": 1.311, + "epoch": 29.002857142857142, + "grad_norm": 0.004755318630486727, + "learning_rate": 4.634920634920635e-06, + "loss": 0.0001, "step": 2040 }, { - "epoch": 40.004, - "grad_norm": 0.00318012572824955, - "learning_rate": 2.0000000000000003e-06, - "loss": 0.0001, + "epoch": 29.005714285714287, + "grad_norm": 0.0016361831221729517, + "learning_rate": 4.603174603174604e-06, + "loss": 0.0343, "step": 2050 }, { - "epoch": 40.008, - "grad_norm": 0.0023195173125714064, - "learning_rate": 1.955555555555556e-06, - "loss": 0.0008, + "epoch": 29.00857142857143, + "grad_norm": 0.009776725433766842, + "learning_rate": 4.571428571428572e-06, + "loss": 0.0057, "step": 2060 }, { - "epoch": 40.012, - "grad_norm": 0.002880761167034507, - "learning_rate": 1.9111111111111112e-06, + "epoch": 29.01142857142857, + "grad_norm": 0.014050081372261047, + "learning_rate": 4.53968253968254e-06, "loss": 0.0001, "step": 2070 }, { - "epoch": 40.016, - "grad_norm": 88.72091674804688, - "learning_rate": 1.8666666666666669e-06, - "loss": 0.0023, + "epoch": 29.014285714285716, + "grad_norm": 0.027094116434454918, + "learning_rate": 4.5079365079365085e-06, + "loss": 0.0002, "step": 2080 }, { - "epoch": 40.02, - "grad_norm": 0.0025204592384397984, - "learning_rate": 1.8222222222222225e-06, + "epoch": 29.017142857142858, + "grad_norm": 0.0022450664546340704, + "learning_rate": 4.476190476190477e-06, "loss": 0.0001, "step": 2090 }, { - "epoch": 40.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.6061248779296875, - "eval_runtime": 14.5072, - "eval_samples_per_second": 5.515, - "eval_steps_per_second": 1.379, - "step": 2091 + "epoch": 29.02, + "grad_norm": 0.0033212972339242697, + "learning_rate": 4.444444444444444e-06, + "loss": 0.0004, + "step": 2100 }, { - "epoch": 41.0036, - "grad_norm": 0.0022217093501240015, - "learning_rate": 1.777777777777778e-06, - "loss": 0.167, + "epoch": 29.02, + "eval_accuracy": 0.625, + "eval_loss": 2.5491509437561035, + "eval_runtime": 5.8042, + "eval_samples_per_second": 5.513, + "eval_steps_per_second": 1.378, "step": 2100 }, { - "epoch": 41.0076, - "grad_norm": 0.0016050190897658467, - "learning_rate": 1.7333333333333336e-06, - "loss": 0.0001, + "epoch": 30.002857142857142, + "grad_norm": 1.13490629196167, + "learning_rate": 4.412698412698413e-06, + "loss": 0.0002, "step": 2110 }, { - "epoch": 41.0116, - "grad_norm": 0.0031738209072500467, - "learning_rate": 1.688888888888889e-06, - "loss": 0.0001, + "epoch": 30.005714285714287, + "grad_norm": 0.002449661260470748, + "learning_rate": 4.3809523809523815e-06, + "loss": 0.0002, "step": 2120 }, { - "epoch": 41.0156, - "grad_norm": 0.0036317266058176756, - "learning_rate": 1.6444444444444447e-06, + "epoch": 30.00857142857143, + "grad_norm": 0.020857322961091995, + "learning_rate": 4.34920634920635e-06, "loss": 0.0001, "step": 2130 }, { - "epoch": 41.0196, - "grad_norm": 0.002648336812853813, - "learning_rate": 1.6000000000000001e-06, + "epoch": 30.01142857142857, + "grad_norm": 0.005406759679317474, + "learning_rate": 4.317460317460318e-06, "loss": 0.0001, "step": 2140 }, { - "epoch": 41.0204, - "eval_accuracy": 0.7, - "eval_loss": 2.2816524505615234, - "eval_runtime": 14.2804, - "eval_samples_per_second": 5.602, - "eval_steps_per_second": 1.401, - "step": 2142 - }, - { - "epoch": 42.0032, - "grad_norm": 0.0022142785601317883, - "learning_rate": 1.5555555555555558e-06, + "epoch": 30.014285714285716, + "grad_norm": 0.2224954515695572, + "learning_rate": 4.2857142857142855e-06, "loss": 0.0001, "step": 2150 }, { - "epoch": 42.0072, - "grad_norm": 0.006730781402438879, - "learning_rate": 1.5111111111111112e-06, + "epoch": 30.017142857142858, + "grad_norm": 0.014697935432195663, + "learning_rate": 4.2539682539682546e-06, "loss": 0.0001, "step": 2160 }, { - "epoch": 42.0112, - "grad_norm": 0.0027248021215200424, - "learning_rate": 1.4666666666666669e-06, - "loss": 0.0001, + "epoch": 30.02, + "grad_norm": 0.003210916882380843, + "learning_rate": 4.222222222222223e-06, + "loss": 0.0003, "step": 2170 }, { - "epoch": 42.0152, - "grad_norm": 0.0026180455461144447, - "learning_rate": 1.4222222222222223e-06, - "loss": 0.0001, - "step": 2180 + "epoch": 30.02, + "eval_accuracy": 0.625, + "eval_loss": 2.764781951904297, + "eval_runtime": 6.1993, + "eval_samples_per_second": 5.162, + "eval_steps_per_second": 1.29, + "step": 2170 }, { - "epoch": 42.0192, - "grad_norm": 0.0017348204273730516, - "learning_rate": 1.377777777777778e-06, - "loss": 0.0001, - "step": 2190 + "epoch": 31.002857142857142, + "grad_norm": 20.94349479675293, + "learning_rate": 4.190476190476191e-06, + "loss": 0.0014, + "step": 2180 }, { - "epoch": 42.0204, - "eval_accuracy": 0.725, - "eval_loss": 2.101508617401123, - "eval_runtime": 15.3729, - "eval_samples_per_second": 5.204, - "eval_steps_per_second": 1.301, - "step": 2193 + "epoch": 31.005714285714287, + "grad_norm": 0.0062758903950452805, + "learning_rate": 4.158730158730159e-06, + "loss": 0.052, + "step": 2190 }, { - "epoch": 43.0028, - "grad_norm": 0.0021387911401689053, - "learning_rate": 1.3333333333333334e-06, - "loss": 0.0001, + "epoch": 31.00857142857143, + "grad_norm": 324.1405944824219, + "learning_rate": 4.126984126984127e-06, + "loss": 0.0889, "step": 2200 }, { - "epoch": 43.0068, - "grad_norm": 0.0030123190954327583, - "learning_rate": 1.288888888888889e-06, - "loss": 0.0001, + "epoch": 31.01142857142857, + "grad_norm": 5.652071475982666, + "learning_rate": 4.095238095238096e-06, + "loss": 0.0005, "step": 2210 }, { - "epoch": 43.0108, - "grad_norm": 0.0043581160716712475, - "learning_rate": 1.2444444444444445e-06, - "loss": 0.0001, + "epoch": 31.014285714285716, + "grad_norm": 0.000993481487967074, + "learning_rate": 4.063492063492064e-06, + "loss": 0.0002, "step": 2220 }, { - "epoch": 43.0148, - "grad_norm": 0.0023331050761044025, - "learning_rate": 1.2000000000000002e-06, + "epoch": 31.017142857142858, + "grad_norm": 0.007423575036227703, + "learning_rate": 4.031746031746032e-06, "loss": 0.0001, "step": 2230 }, { - "epoch": 43.0188, - "grad_norm": 0.0014669563388451934, - "learning_rate": 1.1555555555555556e-06, + "epoch": 31.02, + "grad_norm": 0.0035263928584754467, + "learning_rate": 4.000000000000001e-06, "loss": 0.0001, "step": 2240 }, { - "epoch": 43.0204, - "eval_accuracy": 0.775, - "eval_loss": 1.6356258392333984, - "eval_runtime": 14.3954, - "eval_samples_per_second": 5.557, - "eval_steps_per_second": 1.389, - "step": 2244 + "epoch": 31.02, + "eval_accuracy": 0.625, + "eval_loss": 2.3948616981506348, + "eval_runtime": 5.739, + "eval_samples_per_second": 5.576, + "eval_steps_per_second": 1.394, + "step": 2240 }, { - "epoch": 44.0024, - "grad_norm": 0.0022818814031779766, - "learning_rate": 1.111111111111111e-06, - "loss": 0.2123, + "epoch": 32.002857142857145, + "grad_norm": 0.002942159539088607, + "learning_rate": 3.968253968253968e-06, + "loss": 0.0001, "step": 2250 }, { - "epoch": 44.0064, - "grad_norm": 0.004321521148085594, - "learning_rate": 1.066666666666667e-06, - "loss": 0.0001, + "epoch": 32.005714285714284, + "grad_norm": 0.015976279973983765, + "learning_rate": 3.936507936507936e-06, + "loss": 0.0002, "step": 2260 }, { - "epoch": 44.0104, - "grad_norm": 0.002616771264001727, - "learning_rate": 1.0222222222222223e-06, + "epoch": 32.00857142857143, + "grad_norm": 0.0022569282446056604, + "learning_rate": 3.9047619047619055e-06, "loss": 0.0001, "step": 2270 }, { - "epoch": 44.0144, - "grad_norm": 0.008771556429564953, - "learning_rate": 9.77777777777778e-07, + "epoch": 32.011428571428574, + "grad_norm": 0.001404234440997243, + "learning_rate": 3.873015873015874e-06, "loss": 0.0001, "step": 2280 }, { - "epoch": 44.0184, - "grad_norm": 0.007163883652538061, - "learning_rate": 9.333333333333334e-07, + "epoch": 32.01428571428571, + "grad_norm": 0.03129139170050621, + "learning_rate": 3.841269841269842e-06, "loss": 0.0001, "step": 2290 }, { - "epoch": 44.0204, - "eval_accuracy": 0.8125, - "eval_loss": 1.5849277973175049, - "eval_runtime": 14.7946, - "eval_samples_per_second": 5.407, - "eval_steps_per_second": 1.352, - "step": 2295 - }, - { - "epoch": 45.002, - "grad_norm": 0.001609973143786192, - "learning_rate": 8.88888888888889e-07, + "epoch": 32.01714285714286, + "grad_norm": 0.04605305194854736, + "learning_rate": 3.80952380952381e-06, "loss": 0.0001, "step": 2300 }, { - "epoch": 45.006, - "grad_norm": 0.0015221175272017717, - "learning_rate": 8.444444444444445e-07, + "epoch": 32.02, + "grad_norm": 0.0029006798285990953, + "learning_rate": 3.777777777777778e-06, "loss": 0.0001, "step": 2310 }, { - "epoch": 45.01, - "grad_norm": 0.00561766279861331, - "learning_rate": 8.000000000000001e-07, - "loss": 0.0002, + "epoch": 32.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.410731792449951, + "eval_runtime": 5.8109, + "eval_samples_per_second": 5.507, + "eval_steps_per_second": 1.377, + "step": 2310 + }, + { + "epoch": 33.002857142857145, + "grad_norm": 0.004220298025757074, + "learning_rate": 3.7460317460317463e-06, + "loss": 0.0001, "step": 2320 }, { - "epoch": 45.014, - "grad_norm": 0.005561948753893375, - "learning_rate": 7.555555555555556e-07, + "epoch": 33.005714285714284, + "grad_norm": 0.05123414844274521, + "learning_rate": 3.7142857142857146e-06, "loss": 0.0001, "step": 2330 }, { - "epoch": 45.018, - "grad_norm": 0.0020426807459443808, - "learning_rate": 7.111111111111112e-07, + "epoch": 33.00857142857143, + "grad_norm": 0.0025845293421298265, + "learning_rate": 3.6825396825396833e-06, "loss": 0.0001, "step": 2340 }, { - "epoch": 45.0204, - "eval_accuracy": 0.775, - "eval_loss": 1.6463369131088257, - "eval_runtime": 15.349, - "eval_samples_per_second": 5.212, - "eval_steps_per_second": 1.303, - "step": 2346 - }, - { - "epoch": 46.0016, - "grad_norm": 0.011843581683933735, - "learning_rate": 6.666666666666667e-07, + "epoch": 33.011428571428574, + "grad_norm": 0.005027332808822393, + "learning_rate": 3.6507936507936507e-06, "loss": 0.0001, "step": 2350 }, { - "epoch": 46.0056, - "grad_norm": 0.003159413579851389, - "learning_rate": 6.222222222222223e-07, - "loss": 0.0001, + "epoch": 33.01428571428571, + "grad_norm": 0.008934518322348595, + "learning_rate": 3.6190476190476194e-06, + "loss": 0.0002, "step": 2360 }, { - "epoch": 46.0096, - "grad_norm": 0.021830186247825623, - "learning_rate": 5.777777777777778e-07, - "loss": 0.0001, + "epoch": 33.01714285714286, + "grad_norm": 0.0008886617142707109, + "learning_rate": 3.5873015873015877e-06, + "loss": 0.1871, "step": 2370 }, { - "epoch": 46.0136, - "grad_norm": 0.0015857354737818241, - "learning_rate": 5.333333333333335e-07, + "epoch": 33.02, + "grad_norm": 0.03134104609489441, + "learning_rate": 3.555555555555556e-06, "loss": 0.0001, "step": 2380 }, { - "epoch": 46.0176, - "grad_norm": 0.0021360372193157673, - "learning_rate": 4.88888888888889e-07, - "loss": 0.0001, - "step": 2390 - }, - { - "epoch": 46.0204, - "eval_accuracy": 0.775, - "eval_loss": 1.664137601852417, - "eval_runtime": 14.3393, + "epoch": 33.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.6098861694335938, + "eval_runtime": 5.7355, "eval_samples_per_second": 5.579, "eval_steps_per_second": 1.395, - "step": 2397 + "step": 2380 }, { - "epoch": 47.0012, - "grad_norm": 7.805647850036621, - "learning_rate": 4.444444444444445e-07, - "loss": 0.0007, - "step": 2400 + "epoch": 34.002857142857145, + "grad_norm": 0.006479092873632908, + "learning_rate": 3.523809523809524e-06, + "loss": 0.0208, + "step": 2390 }, { - "epoch": 47.0052, - "grad_norm": 0.002305046422407031, - "learning_rate": 4.0000000000000003e-07, + "epoch": 34.005714285714284, + "grad_norm": 0.006957986857742071, + "learning_rate": 3.492063492063492e-06, "loss": 0.0001, + "step": 2400 + }, + { + "epoch": 34.00857142857143, + "grad_norm": 0.0024119375739246607, + "learning_rate": 3.4603174603174607e-06, + "loss": 0.0002, "step": 2410 }, { - "epoch": 47.0092, - "grad_norm": 0.0024446428287774324, - "learning_rate": 3.555555555555556e-07, + "epoch": 34.011428571428574, + "grad_norm": 0.0025062367785722017, + "learning_rate": 3.428571428571429e-06, "loss": 0.0001, "step": 2420 }, { - "epoch": 47.0132, - "grad_norm": 0.0018226341344416142, - "learning_rate": 3.111111111111111e-07, - "loss": 0.0001, + "epoch": 34.01428571428571, + "grad_norm": 0.0034686909057199955, + "learning_rate": 3.3968253968253972e-06, + "loss": 0.0002, "step": 2430 }, { - "epoch": 47.0172, - "grad_norm": 0.0018344988347962499, - "learning_rate": 2.666666666666667e-07, - "loss": 0.0001, + "epoch": 34.01714285714286, + "grad_norm": 0.0022994480095803738, + "learning_rate": 3.3650793650793655e-06, + "loss": 0.0002, "step": 2440 }, { - "epoch": 47.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.612348198890686, - "eval_runtime": 14.4481, - "eval_samples_per_second": 5.537, - "eval_steps_per_second": 1.384, - "step": 2448 + "epoch": 34.02, + "grad_norm": 0.004486434161663055, + "learning_rate": 3.3333333333333333e-06, + "loss": 0.0001, + "step": 2450 }, { - "epoch": 48.0008, - "grad_norm": 0.0016147164860740304, - "learning_rate": 2.2222222222222224e-07, - "loss": 0.0001, + "epoch": 34.02, + "eval_accuracy": 0.5625, + "eval_loss": 2.8574094772338867, + "eval_runtime": 5.9889, + "eval_samples_per_second": 5.343, + "eval_steps_per_second": 1.336, "step": 2450 }, { - "epoch": 48.0048, - "grad_norm": 0.0015180219197645783, - "learning_rate": 1.777777777777778e-07, - "loss": 0.0001, + "epoch": 35.002857142857145, + "grad_norm": 0.00257964339107275, + "learning_rate": 3.3015873015873016e-06, + "loss": 0.2215, "step": 2460 }, { - "epoch": 48.0088, - "grad_norm": 0.0027082718443125486, - "learning_rate": 1.3333333333333336e-07, + "epoch": 35.005714285714284, + "grad_norm": 0.00513617554679513, + "learning_rate": 3.2698412698412703e-06, "loss": 0.0001, "step": 2470 }, { - "epoch": 48.0128, - "grad_norm": 0.0015189133118838072, - "learning_rate": 8.88888888888889e-08, + "epoch": 35.00857142857143, + "grad_norm": 0.0013633773196488619, + "learning_rate": 3.2380952380952385e-06, "loss": 0.0001, "step": 2480 }, { - "epoch": 48.0168, - "grad_norm": 0.0014700506580993533, - "learning_rate": 4.444444444444445e-08, + "epoch": 35.011428571428574, + "grad_norm": 0.036525338888168335, + "learning_rate": 3.206349206349207e-06, "loss": 0.0001, "step": 2490 }, { - "epoch": 48.0204, - "eval_accuracy": 0.7875, - "eval_loss": 1.6145384311676025, - "eval_runtime": 14.6364, - "eval_samples_per_second": 5.466, - "eval_steps_per_second": 1.366, - "step": 2499 + "epoch": 35.01428571428571, + "grad_norm": 0.0019644717685878277, + "learning_rate": 3.1746031746031746e-06, + "loss": 0.0001, + "step": 2500 + }, + { + "epoch": 35.01714285714286, + "grad_norm": 0.0009433199884369969, + "learning_rate": 3.142857142857143e-06, + "loss": 0.0001, + "step": 2510 }, { - "epoch": 49.0004, - "grad_norm": 0.005640446674078703, - "learning_rate": 0.0, + "epoch": 35.02, + "grad_norm": 0.0059710158966481686, + "learning_rate": 3.1111111111111116e-06, "loss": 0.0001, - "step": 2500 + "step": 2520 }, { - "epoch": 49.0004, - "eval_accuracy": 0.7875, - "eval_loss": 1.6145387887954712, - "eval_runtime": 15.4845, - "eval_samples_per_second": 5.166, - "eval_steps_per_second": 1.292, - "step": 2500 + "epoch": 35.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.5807838439941406, + "eval_runtime": 5.9217, + "eval_samples_per_second": 5.404, + "eval_steps_per_second": 1.351, + "step": 2520 }, { - "epoch": 49.0004, - "step": 2500, - "total_flos": 4.34799425740686e+19, - "train_loss": 0.1747144501608098, - "train_runtime": 4798.9949, - "train_samples_per_second": 2.084, - "train_steps_per_second": 0.521 + "epoch": 36.002857142857145, + "grad_norm": 0.004809876438230276, + "learning_rate": 3.07936507936508e-06, + "loss": 0.0001, + "step": 2530 }, { - "epoch": 49.0004, - "eval_accuracy": 0.5753424657534246, - "eval_loss": 2.5945777893066406, - "eval_runtime": 15.5022, - "eval_samples_per_second": 4.709, - "eval_steps_per_second": 1.226, - "step": 2500 + "epoch": 36.005714285714284, + "grad_norm": 0.002548061078414321, + "learning_rate": 3.047619047619048e-06, + "loss": 0.0001, + "step": 2540 }, { - "epoch": 49.0004, - "eval_accuracy": 0.5753424657534246, - "eval_loss": 2.5945777893066406, - "eval_runtime": 14.1475, - "eval_samples_per_second": 5.16, - "eval_steps_per_second": 1.343, - "step": 2500 + "epoch": 36.00857142857143, + "grad_norm": 0.0013359219301491976, + "learning_rate": 3.015873015873016e-06, + "loss": 0.0, + "step": 2550 + }, + { + "epoch": 36.011428571428574, + "grad_norm": 0.001209500478580594, + "learning_rate": 2.984126984126984e-06, + "loss": 0.0001, + "step": 2560 + }, + { + "epoch": 36.01428571428571, + "grad_norm": 0.0029701353050768375, + "learning_rate": 2.9523809523809525e-06, + "loss": 0.0001, + "step": 2570 + }, + { + "epoch": 36.01714285714286, + "grad_norm": 0.0033097926061600447, + "learning_rate": 2.920634920634921e-06, + "loss": 0.0001, + "step": 2580 + }, + { + "epoch": 36.02, + "grad_norm": 0.002935264492407441, + "learning_rate": 2.888888888888889e-06, + "loss": 0.0001, + "step": 2590 + }, + { + "epoch": 36.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.624629497528076, + "eval_runtime": 5.6636, + "eval_samples_per_second": 5.65, + "eval_steps_per_second": 1.413, + "step": 2590 + }, + { + "epoch": 37.002857142857145, + "grad_norm": 0.001033996231853962, + "learning_rate": 2.8571428571428573e-06, + "loss": 0.0001, + "step": 2600 + }, + { + "epoch": 37.005714285714284, + "grad_norm": 0.0017455056076869369, + "learning_rate": 2.8253968253968255e-06, + "loss": 0.0461, + "step": 2610 + }, + { + "epoch": 37.00857142857143, + "grad_norm": 0.004716221243143082, + "learning_rate": 2.7936507936507938e-06, + "loss": 0.0001, + "step": 2620 + }, + { + "epoch": 37.011428571428574, + "grad_norm": 0.0009876637486740947, + "learning_rate": 2.7619047619047625e-06, + "loss": 0.0008, + "step": 2630 + }, + { + "epoch": 37.01428571428571, + "grad_norm": 0.0018166368827223778, + "learning_rate": 2.7301587301587303e-06, + "loss": 0.0001, + "step": 2640 + }, + { + "epoch": 37.01714285714286, + "grad_norm": 0.0022690363693982363, + "learning_rate": 2.6984126984126986e-06, + "loss": 0.0001, + "step": 2650 + }, + { + "epoch": 37.02, + "grad_norm": 0.0020156537648290396, + "learning_rate": 2.666666666666667e-06, + "loss": 0.0001, + "step": 2660 + }, + { + "epoch": 37.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.705111503601074, + "eval_runtime": 5.6718, + "eval_samples_per_second": 5.642, + "eval_steps_per_second": 1.41, + "step": 2660 + }, + { + "epoch": 38.002857142857145, + "grad_norm": 0.0032058602664619684, + "learning_rate": 2.634920634920635e-06, + "loss": 0.0001, + "step": 2670 + }, + { + "epoch": 38.005714285714284, + "grad_norm": 0.0012342449044808745, + "learning_rate": 2.6031746031746038e-06, + "loss": 0.0001, + "step": 2680 + }, + { + "epoch": 38.00857142857143, + "grad_norm": 0.27555274963378906, + "learning_rate": 2.571428571428571e-06, + "loss": 0.079, + "step": 2690 + }, + { + "epoch": 38.011428571428574, + "grad_norm": 0.12426193058490753, + "learning_rate": 2.53968253968254e-06, + "loss": 0.0001, + "step": 2700 + }, + { + "epoch": 38.01428571428571, + "grad_norm": 0.001230777706950903, + "learning_rate": 2.507936507936508e-06, + "loss": 0.0, + "step": 2710 + }, + { + "epoch": 38.01714285714286, + "grad_norm": 0.0010537246707826853, + "learning_rate": 2.4761904761904764e-06, + "loss": 0.0002, + "step": 2720 + }, + { + "epoch": 38.02, + "grad_norm": 0.006648702081292868, + "learning_rate": 2.4444444444444447e-06, + "loss": 0.0001, + "step": 2730 + }, + { + "epoch": 38.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.504610776901245, + "eval_runtime": 5.9071, + "eval_samples_per_second": 5.417, + "eval_steps_per_second": 1.354, + "step": 2730 + }, + { + "epoch": 39.002857142857145, + "grad_norm": 0.002145690843462944, + "learning_rate": 2.412698412698413e-06, + "loss": 0.0001, + "step": 2740 + }, + { + "epoch": 39.005714285714284, + "grad_norm": 0.002901807427406311, + "learning_rate": 2.380952380952381e-06, + "loss": 0.0001, + "step": 2750 + }, + { + "epoch": 39.00857142857143, + "grad_norm": 0.0011529697803780437, + "learning_rate": 2.3492063492063494e-06, + "loss": 0.0, + "step": 2760 + }, + { + "epoch": 39.011428571428574, + "grad_norm": 0.005594365298748016, + "learning_rate": 2.3174603174603177e-06, + "loss": 0.0001, + "step": 2770 + }, + { + "epoch": 39.01428571428571, + "grad_norm": 0.0025502736680209637, + "learning_rate": 2.285714285714286e-06, + "loss": 0.0, + "step": 2780 + }, + { + "epoch": 39.01714285714286, + "grad_norm": 0.0013142352690920234, + "learning_rate": 2.2539682539682542e-06, + "loss": 0.0001, + "step": 2790 + }, + { + "epoch": 39.02, + "grad_norm": 0.0032077666837722063, + "learning_rate": 2.222222222222222e-06, + "loss": 0.0001, + "step": 2800 + }, + { + "epoch": 39.02, + "eval_accuracy": 0.59375, + "eval_loss": 2.5002870559692383, + "eval_runtime": 5.6574, + "eval_samples_per_second": 5.656, + "eval_steps_per_second": 1.414, + "step": 2800 + }, + { + "epoch": 40.002857142857145, + "grad_norm": 0.005300651304423809, + "learning_rate": 2.1904761904761908e-06, + "loss": 0.0001, + "step": 2810 + }, + { + "epoch": 40.005714285714284, + "grad_norm": 0.0013714683009311557, + "learning_rate": 2.158730158730159e-06, + "loss": 0.0001, + "step": 2820 + }, + { + "epoch": 40.00857142857143, + "grad_norm": 0.001886402373202145, + "learning_rate": 2.1269841269841273e-06, + "loss": 0.0001, + "step": 2830 + }, + { + "epoch": 40.011428571428574, + "grad_norm": 0.0023008284624665976, + "learning_rate": 2.0952380952380955e-06, + "loss": 0.0001, + "step": 2840 + }, + { + "epoch": 40.01428571428571, + "grad_norm": 0.03095676377415657, + "learning_rate": 2.0634920634920634e-06, + "loss": 0.0, + "step": 2850 + }, + { + "epoch": 40.01714285714286, + "grad_norm": 0.002867933129891753, + "learning_rate": 2.031746031746032e-06, + "loss": 0.0, + "step": 2860 + }, + { + "epoch": 40.02, + "grad_norm": 0.0008751653949730098, + "learning_rate": 2.0000000000000003e-06, + "loss": 0.0, + "step": 2870 + }, + { + "epoch": 40.02, + "eval_accuracy": 0.625, + "eval_loss": 2.545984983444214, + "eval_runtime": 5.6867, + "eval_samples_per_second": 5.627, + "eval_steps_per_second": 1.407, + "step": 2870 + }, + { + "epoch": 41.002857142857145, + "grad_norm": 0.004483176860958338, + "learning_rate": 1.968253968253968e-06, + "loss": 0.0001, + "step": 2880 + }, + { + "epoch": 41.005714285714284, + "grad_norm": 0.012557575479149818, + "learning_rate": 1.936507936507937e-06, + "loss": 0.0001, + "step": 2890 + }, + { + "epoch": 41.00857142857143, + "grad_norm": 0.0009115163120441139, + "learning_rate": 1.904761904761905e-06, + "loss": 0.0, + "step": 2900 + }, + { + "epoch": 41.011428571428574, + "grad_norm": 0.0017849624855443835, + "learning_rate": 1.8730158730158732e-06, + "loss": 0.0, + "step": 2910 + }, + { + "epoch": 41.01428571428571, + "grad_norm": 0.002865022048354149, + "learning_rate": 1.8412698412698416e-06, + "loss": 0.0, + "step": 2920 + }, + { + "epoch": 41.01714285714286, + "grad_norm": 0.0014787889085710049, + "learning_rate": 1.8095238095238097e-06, + "loss": 0.0001, + "step": 2930 + }, + { + "epoch": 41.02, + "grad_norm": 0.0009846773464232683, + "learning_rate": 1.777777777777778e-06, + "loss": 0.0, + "step": 2940 + }, + { + "epoch": 41.02, + "eval_accuracy": 0.625, + "eval_loss": 2.5397086143493652, + "eval_runtime": 5.9883, + "eval_samples_per_second": 5.344, + "eval_steps_per_second": 1.336, + "step": 2940 + }, + { + "epoch": 42.002857142857145, + "grad_norm": 0.0012540535535663366, + "learning_rate": 1.746031746031746e-06, + "loss": 0.0, + "step": 2950 + }, + { + "epoch": 42.005714285714284, + "grad_norm": 0.00243058567866683, + "learning_rate": 1.7142857142857145e-06, + "loss": 0.0, + "step": 2960 + }, + { + "epoch": 42.00857142857143, + "grad_norm": 0.002351740375161171, + "learning_rate": 1.6825396825396827e-06, + "loss": 0.0, + "step": 2970 + }, + { + "epoch": 42.011428571428574, + "grad_norm": 0.0010893039871007204, + "learning_rate": 1.6507936507936508e-06, + "loss": 0.0001, + "step": 2980 + }, + { + "epoch": 42.01428571428571, + "grad_norm": 0.0027362005785107613, + "learning_rate": 1.6190476190476193e-06, + "loss": 0.0, + "step": 2990 + }, + { + "epoch": 42.01714285714286, + "grad_norm": 0.002978693228214979, + "learning_rate": 1.5873015873015873e-06, + "loss": 0.0001, + "step": 3000 + }, + { + "epoch": 42.02, + "grad_norm": 0.001713828998617828, + "learning_rate": 1.5555555555555558e-06, + "loss": 0.0, + "step": 3010 + }, + { + "epoch": 42.02, + "eval_accuracy": 0.625, + "eval_loss": 2.538372278213501, + "eval_runtime": 5.6509, + "eval_samples_per_second": 5.663, + "eval_steps_per_second": 1.416, + "step": 3010 + }, + { + "epoch": 43.002857142857145, + "grad_norm": 0.0017983241705223918, + "learning_rate": 1.523809523809524e-06, + "loss": 0.0, + "step": 3020 + }, + { + "epoch": 43.005714285714284, + "grad_norm": 0.009900640696287155, + "learning_rate": 1.492063492063492e-06, + "loss": 0.0, + "step": 3030 + }, + { + "epoch": 43.00857142857143, + "grad_norm": 0.004079794976860285, + "learning_rate": 1.4603174603174606e-06, + "loss": 0.0001, + "step": 3040 + }, + { + "epoch": 43.011428571428574, + "grad_norm": 0.0023102271370589733, + "learning_rate": 1.4285714285714286e-06, + "loss": 0.0001, + "step": 3050 + }, + { + "epoch": 43.01428571428571, + "grad_norm": 0.002042073756456375, + "learning_rate": 1.3968253968253969e-06, + "loss": 0.0, + "step": 3060 + }, + { + "epoch": 43.01714285714286, + "grad_norm": 0.0017096186056733131, + "learning_rate": 1.3650793650793652e-06, + "loss": 0.0, + "step": 3070 + }, + { + "epoch": 43.02, + "grad_norm": 0.003803923726081848, + "learning_rate": 1.3333333333333334e-06, + "loss": 0.0, + "step": 3080 + }, + { + "epoch": 43.02, + "eval_accuracy": 0.625, + "eval_loss": 2.484891891479492, + "eval_runtime": 6.1299, + "eval_samples_per_second": 5.22, + "eval_steps_per_second": 1.305, + "step": 3080 + }, + { + "epoch": 44.002857142857145, + "grad_norm": 0.0019901215564459562, + "learning_rate": 1.3015873015873019e-06, + "loss": 0.0, + "step": 3090 + }, + { + "epoch": 44.005714285714284, + "grad_norm": 0.0011514016659930348, + "learning_rate": 1.26984126984127e-06, + "loss": 0.0001, + "step": 3100 + }, + { + "epoch": 44.00857142857143, + "grad_norm": 0.0025108291301876307, + "learning_rate": 1.2380952380952382e-06, + "loss": 0.0, + "step": 3110 + }, + { + "epoch": 44.011428571428574, + "grad_norm": 0.0018219811609014869, + "learning_rate": 1.2063492063492065e-06, + "loss": 0.0, + "step": 3120 + }, + { + "epoch": 44.01428571428571, + "grad_norm": 0.0013160904636606574, + "learning_rate": 1.1746031746031747e-06, + "loss": 0.0, + "step": 3130 + }, + { + "epoch": 44.01714285714286, + "grad_norm": 7.966428756713867, + "learning_rate": 1.142857142857143e-06, + "loss": 0.0005, + "step": 3140 + }, + { + "epoch": 44.02, + "grad_norm": 0.001393563929013908, + "learning_rate": 1.111111111111111e-06, + "loss": 0.0, + "step": 3150 + }, + { + "epoch": 44.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.5847256183624268, + "eval_runtime": 5.9398, + "eval_samples_per_second": 5.387, + "eval_steps_per_second": 1.347, + "step": 3150 + }, + { + "epoch": 45.002857142857145, + "grad_norm": 0.007060057949274778, + "learning_rate": 1.0793650793650795e-06, + "loss": 0.0001, + "step": 3160 + }, + { + "epoch": 45.005714285714284, + "grad_norm": 0.0010020129848271608, + "learning_rate": 1.0476190476190478e-06, + "loss": 0.0, + "step": 3170 + }, + { + "epoch": 45.00857142857143, + "grad_norm": 0.0028252785559743643, + "learning_rate": 1.015873015873016e-06, + "loss": 0.0, + "step": 3180 + }, + { + "epoch": 45.011428571428574, + "grad_norm": 0.012641221284866333, + "learning_rate": 9.84126984126984e-07, + "loss": 0.0001, + "step": 3190 + }, + { + "epoch": 45.01428571428571, + "grad_norm": 0.0022382563911378384, + "learning_rate": 9.523809523809525e-07, + "loss": 0.0001, + "step": 3200 + }, + { + "epoch": 45.01714285714286, + "grad_norm": 0.006219483446329832, + "learning_rate": 9.206349206349208e-07, + "loss": 0.0001, + "step": 3210 + }, + { + "epoch": 45.02, + "grad_norm": 0.0017200283473357558, + "learning_rate": 8.88888888888889e-07, + "loss": 0.0, + "step": 3220 + }, + { + "epoch": 45.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.5829355716705322, + "eval_runtime": 5.6602, + "eval_samples_per_second": 5.653, + "eval_steps_per_second": 1.413, + "step": 3220 + }, + { + "epoch": 46.002857142857145, + "grad_norm": 0.0015408931067213416, + "learning_rate": 8.571428571428572e-07, + "loss": 0.0, + "step": 3230 + }, + { + "epoch": 46.005714285714284, + "grad_norm": 0.0016143143875524402, + "learning_rate": 8.253968253968254e-07, + "loss": 0.0, + "step": 3240 + }, + { + "epoch": 46.00857142857143, + "grad_norm": 0.0014364662347361445, + "learning_rate": 7.936507936507937e-07, + "loss": 0.0, + "step": 3250 + }, + { + "epoch": 46.011428571428574, + "grad_norm": 0.0040867868810892105, + "learning_rate": 7.61904761904762e-07, + "loss": 0.0, + "step": 3260 + }, + { + "epoch": 46.01428571428571, + "grad_norm": 0.0010911135468631983, + "learning_rate": 7.301587301587303e-07, + "loss": 0.0001, + "step": 3270 + }, + { + "epoch": 46.01714285714286, + "grad_norm": 0.0011279195314273238, + "learning_rate": 6.984126984126984e-07, + "loss": 0.0, + "step": 3280 + }, + { + "epoch": 46.02, + "grad_norm": 0.0029850241262465715, + "learning_rate": 6.666666666666667e-07, + "loss": 0.0, + "step": 3290 + }, + { + "epoch": 46.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.580873966217041, + "eval_runtime": 5.6883, + "eval_samples_per_second": 5.626, + "eval_steps_per_second": 1.406, + "step": 3290 + }, + { + "epoch": 47.002857142857145, + "grad_norm": 0.003009574254974723, + "learning_rate": 6.34920634920635e-07, + "loss": 0.0001, + "step": 3300 + }, + { + "epoch": 47.005714285714284, + "grad_norm": 0.006839941721409559, + "learning_rate": 6.031746031746032e-07, + "loss": 0.0, + "step": 3310 + }, + { + "epoch": 47.00857142857143, + "grad_norm": 0.001524322316981852, + "learning_rate": 5.714285714285715e-07, + "loss": 0.0, + "step": 3320 + }, + { + "epoch": 47.011428571428574, + "grad_norm": 0.0027361048851162195, + "learning_rate": 5.396825396825398e-07, + "loss": 0.0, + "step": 3330 + }, + { + "epoch": 47.01428571428571, + "grad_norm": 0.0032709892839193344, + "learning_rate": 5.07936507936508e-07, + "loss": 0.0, + "step": 3340 + }, + { + "epoch": 47.01714285714286, + "grad_norm": 0.0012358782114461064, + "learning_rate": 4.7619047619047623e-07, + "loss": 0.0, + "step": 3350 + }, + { + "epoch": 47.02, + "grad_norm": 0.0044230008497834206, + "learning_rate": 4.444444444444445e-07, + "loss": 0.0, + "step": 3360 + }, + { + "epoch": 47.02, + "eval_accuracy": 0.625, + "eval_loss": 2.5756099224090576, + "eval_runtime": 5.7994, + "eval_samples_per_second": 5.518, + "eval_steps_per_second": 1.379, + "step": 3360 + }, + { + "epoch": 48.002857142857145, + "grad_norm": 0.0034374285023659468, + "learning_rate": 4.126984126984127e-07, + "loss": 0.0, + "step": 3370 + }, + { + "epoch": 48.005714285714284, + "grad_norm": 0.010423140600323677, + "learning_rate": 3.80952380952381e-07, + "loss": 0.045, + "step": 3380 + }, + { + "epoch": 48.00857142857143, + "grad_norm": 0.002047063549980521, + "learning_rate": 3.492063492063492e-07, + "loss": 0.0, + "step": 3390 + }, + { + "epoch": 48.011428571428574, + "grad_norm": 0.00131814437918365, + "learning_rate": 3.174603174603175e-07, + "loss": 0.0, + "step": 3400 + }, + { + "epoch": 48.01428571428571, + "grad_norm": 0.004262310452759266, + "learning_rate": 2.8571428571428575e-07, + "loss": 0.0, + "step": 3410 + }, + { + "epoch": 48.01714285714286, + "grad_norm": 0.005764085799455643, + "learning_rate": 2.53968253968254e-07, + "loss": 0.0, + "step": 3420 + }, + { + "epoch": 48.02, + "grad_norm": 0.0010528825223445892, + "learning_rate": 2.2222222222222224e-07, + "loss": 0.0001, + "step": 3430 + }, + { + "epoch": 48.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.474351644515991, + "eval_runtime": 5.6522, + "eval_samples_per_second": 5.662, + "eval_steps_per_second": 1.415, + "step": 3430 + }, + { + "epoch": 49.002857142857145, + "grad_norm": 0.0024832114577293396, + "learning_rate": 1.904761904761905e-07, + "loss": 0.0, + "step": 3440 + }, + { + "epoch": 49.005714285714284, + "grad_norm": 0.0020221523009240627, + "learning_rate": 1.5873015873015874e-07, + "loss": 0.0, + "step": 3450 + }, + { + "epoch": 49.00857142857143, + "grad_norm": 0.0294681116938591, + "learning_rate": 1.26984126984127e-07, + "loss": 0.0, + "step": 3460 + }, + { + "epoch": 49.011428571428574, + "grad_norm": 0.0013974281027913094, + "learning_rate": 9.523809523809525e-08, + "loss": 0.0001, + "step": 3470 + }, + { + "epoch": 49.01428571428571, + "grad_norm": 0.002278068568557501, + "learning_rate": 6.34920634920635e-08, + "loss": 0.0, + "step": 3480 + }, + { + "epoch": 49.01714285714286, + "grad_norm": 0.0014251351822167635, + "learning_rate": 3.174603174603175e-08, + "loss": 0.0, + "step": 3490 + }, + { + "epoch": 49.02, + "grad_norm": 0.001036540837958455, + "learning_rate": 0.0, + "loss": 0.0, + "step": 3500 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.65625, + "eval_loss": 2.4720282554626465, + "eval_runtime": 7.6256, + "eval_samples_per_second": 4.196, + "eval_steps_per_second": 1.049, + "step": 3500 + }, + { + "epoch": 49.02, + "step": 3500, + "total_flos": 6.147436841415475e+19, + "train_loss": 0.15784767912905331, + "train_runtime": 5935.2708, + "train_samples_per_second": 2.359, + "train_steps_per_second": 0.59 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.7906976744186046, + "eval_loss": 0.4680953025817871, + "eval_runtime": 8.841, + "eval_samples_per_second": 4.864, + "eval_steps_per_second": 1.244, + "step": 3500 + }, + { + "epoch": 49.02, + "eval_accuracy": 0.7906976744186046, + "eval_loss": 0.46809521317481995, + "eval_runtime": 8.11, + "eval_samples_per_second": 5.302, + "eval_steps_per_second": 1.356, + "step": 3500 } ], "logging_steps": 10, - "max_steps": 2500, + "max_steps": 3500, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 500, @@ -2253,7 +2953,7 @@ "attributes": {} } }, - "total_flos": 4.34799425740686e+19, + "total_flos": 6.147436841415475e+19, "train_batch_size": 4, "trial_name": null, "trial_params": null