|
{ |
|
"best_metric": 0.6875, |
|
"best_model_checkpoint": "MAE-CT-CPC-Dicotomized-v8-n0-m1/checkpoint-560", |
|
"epoch": 49.02, |
|
"eval_steps": 500, |
|
"global_step": 3500, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.002857142857142857, |
|
"grad_norm": 1.2938215732574463, |
|
"learning_rate": 2.8571428571428575e-07, |
|
"loss": 0.7187, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.005714285714285714, |
|
"grad_norm": 2.850792407989502, |
|
"learning_rate": 5.714285714285715e-07, |
|
"loss": 0.6988, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.008571428571428572, |
|
"grad_norm": 2.5619375705718994, |
|
"learning_rate": 8.571428571428572e-07, |
|
"loss": 0.6938, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.011428571428571429, |
|
"grad_norm": 1.4017411470413208, |
|
"learning_rate": 1.142857142857143e-06, |
|
"loss": 0.68, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.014285714285714285, |
|
"grad_norm": 3.408830404281616, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 0.6665, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.017142857142857144, |
|
"grad_norm": 2.4238297939300537, |
|
"learning_rate": 1.7142857142857145e-06, |
|
"loss": 0.6997, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 4.75000524520874, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.6769, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 0.6814303398132324, |
|
"eval_runtime": 6.5003, |
|
"eval_samples_per_second": 4.923, |
|
"eval_steps_per_second": 1.231, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 1.002857142857143, |
|
"grad_norm": 5.1099066734313965, |
|
"learning_rate": 2.285714285714286e-06, |
|
"loss": 0.6718, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 1.0057142857142858, |
|
"grad_norm": 2.6992363929748535, |
|
"learning_rate": 2.571428571428571e-06, |
|
"loss": 0.6841, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 1.0085714285714287, |
|
"grad_norm": 3.9713478088378906, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.6237, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 1.0114285714285713, |
|
"grad_norm": 9.976508140563965, |
|
"learning_rate": 3.142857142857143e-06, |
|
"loss": 0.7209, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 1.0142857142857142, |
|
"grad_norm": 5.838253974914551, |
|
"learning_rate": 3.428571428571429e-06, |
|
"loss": 0.5925, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 1.0171428571428571, |
|
"grad_norm": 6.639209747314453, |
|
"learning_rate": 3.7142857142857146e-06, |
|
"loss": 0.5364, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"grad_norm": 6.5769548416137695, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.7223, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 1.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 0.6952990889549255, |
|
"eval_runtime": 6.0984, |
|
"eval_samples_per_second": 5.247, |
|
"eval_steps_per_second": 1.312, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 2.0028571428571427, |
|
"grad_norm": 4.371922969818115, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 0.6335, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 2.005714285714286, |
|
"grad_norm": 7.266854763031006, |
|
"learning_rate": 4.571428571428572e-06, |
|
"loss": 0.6511, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 2.0085714285714285, |
|
"grad_norm": 5.1674323081970215, |
|
"learning_rate": 4.857142857142858e-06, |
|
"loss": 0.6216, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 2.0114285714285716, |
|
"grad_norm": 7.353259563446045, |
|
"learning_rate": 5.142857142857142e-06, |
|
"loss": 0.6537, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 2.0142857142857142, |
|
"grad_norm": 4.876795291900635, |
|
"learning_rate": 5.428571428571429e-06, |
|
"loss": 0.6154, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 2.0171428571428573, |
|
"grad_norm": 6.403975486755371, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.6548, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"grad_norm": 8.78445053100586, |
|
"learning_rate": 6e-06, |
|
"loss": 0.6628, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 2.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 0.6335489749908447, |
|
"eval_runtime": 6.2455, |
|
"eval_samples_per_second": 5.124, |
|
"eval_steps_per_second": 1.281, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 3.0028571428571427, |
|
"grad_norm": 8.260648727416992, |
|
"learning_rate": 6.285714285714286e-06, |
|
"loss": 0.5933, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 3.005714285714286, |
|
"grad_norm": 9.865010261535645, |
|
"learning_rate": 6.571428571428572e-06, |
|
"loss": 0.6037, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 3.0085714285714285, |
|
"grad_norm": 15.651702880859375, |
|
"learning_rate": 6.857142857142858e-06, |
|
"loss": 0.5401, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 3.0114285714285716, |
|
"grad_norm": 14.37714958190918, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.7525, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 3.0142857142857142, |
|
"grad_norm": 9.037586212158203, |
|
"learning_rate": 7.428571428571429e-06, |
|
"loss": 0.546, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 3.0171428571428573, |
|
"grad_norm": 9.574410438537598, |
|
"learning_rate": 7.714285714285716e-06, |
|
"loss": 0.6234, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"grad_norm": 6.128604412078857, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.5096, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 3.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 0.6513711214065552, |
|
"eval_runtime": 5.9941, |
|
"eval_samples_per_second": 5.339, |
|
"eval_steps_per_second": 1.335, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 4.002857142857143, |
|
"grad_norm": 10.744050025939941, |
|
"learning_rate": 8.285714285714287e-06, |
|
"loss": 0.5184, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 4.005714285714285, |
|
"grad_norm": 6.656492233276367, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.5196, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 4.008571428571429, |
|
"grad_norm": 11.752025604248047, |
|
"learning_rate": 8.857142857142858e-06, |
|
"loss": 0.628, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 4.011428571428572, |
|
"grad_norm": 13.207033157348633, |
|
"learning_rate": 9.142857142857144e-06, |
|
"loss": 0.5296, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 4.014285714285714, |
|
"grad_norm": 15.192891120910645, |
|
"learning_rate": 9.42857142857143e-06, |
|
"loss": 0.5027, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 4.017142857142857, |
|
"grad_norm": 7.084335803985596, |
|
"learning_rate": 9.714285714285715e-06, |
|
"loss": 0.5569, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"grad_norm": 16.267013549804688, |
|
"learning_rate": 1e-05, |
|
"loss": 0.4739, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 4.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 0.635766327381134, |
|
"eval_runtime": 6.5473, |
|
"eval_samples_per_second": 4.888, |
|
"eval_steps_per_second": 1.222, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 5.002857142857143, |
|
"grad_norm": 54.40953826904297, |
|
"learning_rate": 9.968253968253969e-06, |
|
"loss": 0.5697, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 5.005714285714285, |
|
"grad_norm": 5.579638481140137, |
|
"learning_rate": 9.936507936507937e-06, |
|
"loss": 0.6431, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 5.008571428571429, |
|
"grad_norm": 6.157373428344727, |
|
"learning_rate": 9.904761904761906e-06, |
|
"loss": 0.507, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 5.011428571428572, |
|
"grad_norm": 10.279753684997559, |
|
"learning_rate": 9.873015873015874e-06, |
|
"loss": 0.5867, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 5.014285714285714, |
|
"grad_norm": 9.118820190429688, |
|
"learning_rate": 9.841269841269842e-06, |
|
"loss": 0.6229, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 5.017142857142857, |
|
"grad_norm": 11.799158096313477, |
|
"learning_rate": 9.80952380952381e-06, |
|
"loss": 0.4608, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"grad_norm": 24.01024055480957, |
|
"learning_rate": 9.777777777777779e-06, |
|
"loss": 0.4554, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 5.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 0.627189040184021, |
|
"eval_runtime": 6.0832, |
|
"eval_samples_per_second": 5.26, |
|
"eval_steps_per_second": 1.315, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 6.002857142857143, |
|
"grad_norm": 10.022480964660645, |
|
"learning_rate": 9.746031746031747e-06, |
|
"loss": 0.6578, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 6.005714285714285, |
|
"grad_norm": 7.889081954956055, |
|
"learning_rate": 9.714285714285715e-06, |
|
"loss": 0.4533, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 6.008571428571429, |
|
"grad_norm": 6.938020706176758, |
|
"learning_rate": 9.682539682539683e-06, |
|
"loss": 0.4677, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 6.011428571428572, |
|
"grad_norm": 1.124375343322754, |
|
"learning_rate": 9.650793650793652e-06, |
|
"loss": 0.3756, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 6.014285714285714, |
|
"grad_norm": 42.59530258178711, |
|
"learning_rate": 9.61904761904762e-06, |
|
"loss": 1.2108, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 6.017142857142857, |
|
"grad_norm": 10.104456901550293, |
|
"learning_rate": 9.587301587301588e-06, |
|
"loss": 0.612, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"grad_norm": 17.34273910522461, |
|
"learning_rate": 9.555555555555556e-06, |
|
"loss": 0.4818, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 6.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 0.7726800441741943, |
|
"eval_runtime": 6.0971, |
|
"eval_samples_per_second": 5.248, |
|
"eval_steps_per_second": 1.312, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 7.002857142857143, |
|
"grad_norm": 14.020530700683594, |
|
"learning_rate": 9.523809523809525e-06, |
|
"loss": 0.4584, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 7.005714285714285, |
|
"grad_norm": 28.93553924560547, |
|
"learning_rate": 9.492063492063493e-06, |
|
"loss": 0.3662, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 7.008571428571429, |
|
"grad_norm": 12.310239791870117, |
|
"learning_rate": 9.460317460317461e-06, |
|
"loss": 0.5758, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 7.011428571428572, |
|
"grad_norm": 4.042015552520752, |
|
"learning_rate": 9.42857142857143e-06, |
|
"loss": 0.4398, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 7.014285714285714, |
|
"grad_norm": 39.30393600463867, |
|
"learning_rate": 9.396825396825398e-06, |
|
"loss": 0.3887, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 7.017142857142857, |
|
"grad_norm": 10.91474723815918, |
|
"learning_rate": 9.365079365079366e-06, |
|
"loss": 0.3307, |
|
"step": 550 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"grad_norm": 27.856639862060547, |
|
"learning_rate": 9.333333333333334e-06, |
|
"loss": 0.4129, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 7.02, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 0.8221972584724426, |
|
"eval_runtime": 6.3706, |
|
"eval_samples_per_second": 5.023, |
|
"eval_steps_per_second": 1.256, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 8.002857142857144, |
|
"grad_norm": 25.90720558166504, |
|
"learning_rate": 9.301587301587303e-06, |
|
"loss": 0.3737, |
|
"step": 570 |
|
}, |
|
{ |
|
"epoch": 8.005714285714285, |
|
"grad_norm": 18.878738403320312, |
|
"learning_rate": 9.26984126984127e-06, |
|
"loss": 0.3757, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 8.008571428571429, |
|
"grad_norm": 16.471302032470703, |
|
"learning_rate": 9.238095238095239e-06, |
|
"loss": 0.466, |
|
"step": 590 |
|
}, |
|
{ |
|
"epoch": 8.01142857142857, |
|
"grad_norm": 2.8103766441345215, |
|
"learning_rate": 9.206349206349207e-06, |
|
"loss": 0.45, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 8.014285714285714, |
|
"grad_norm": 35.921138763427734, |
|
"learning_rate": 9.174603174603176e-06, |
|
"loss": 0.6889, |
|
"step": 610 |
|
}, |
|
{ |
|
"epoch": 8.017142857142858, |
|
"grad_norm": 1.5162020921707153, |
|
"learning_rate": 9.142857142857144e-06, |
|
"loss": 0.3012, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"grad_norm": 31.143695831298828, |
|
"learning_rate": 9.111111111111112e-06, |
|
"loss": 0.6301, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 8.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 0.8040816783905029, |
|
"eval_runtime": 5.9175, |
|
"eval_samples_per_second": 5.408, |
|
"eval_steps_per_second": 1.352, |
|
"step": 630 |
|
}, |
|
{ |
|
"epoch": 9.002857142857144, |
|
"grad_norm": 74.01934814453125, |
|
"learning_rate": 9.07936507936508e-06, |
|
"loss": 0.605, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 9.005714285714285, |
|
"grad_norm": 46.429351806640625, |
|
"learning_rate": 9.047619047619049e-06, |
|
"loss": 0.3164, |
|
"step": 650 |
|
}, |
|
{ |
|
"epoch": 9.008571428571429, |
|
"grad_norm": 43.667686462402344, |
|
"learning_rate": 9.015873015873017e-06, |
|
"loss": 0.2476, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 9.01142857142857, |
|
"grad_norm": 16.798654556274414, |
|
"learning_rate": 8.984126984126985e-06, |
|
"loss": 0.4533, |
|
"step": 670 |
|
}, |
|
{ |
|
"epoch": 9.014285714285714, |
|
"grad_norm": 21.254426956176758, |
|
"learning_rate": 8.952380952380953e-06, |
|
"loss": 0.3874, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 9.017142857142858, |
|
"grad_norm": 14.007109642028809, |
|
"learning_rate": 8.920634920634922e-06, |
|
"loss": 0.3162, |
|
"step": 690 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"grad_norm": 1.4537030458450317, |
|
"learning_rate": 8.888888888888888e-06, |
|
"loss": 0.3809, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 9.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 0.8720921874046326, |
|
"eval_runtime": 5.9604, |
|
"eval_samples_per_second": 5.369, |
|
"eval_steps_per_second": 1.342, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 10.002857142857144, |
|
"grad_norm": 8.765763282775879, |
|
"learning_rate": 8.857142857142858e-06, |
|
"loss": 0.2273, |
|
"step": 710 |
|
}, |
|
{ |
|
"epoch": 10.005714285714285, |
|
"grad_norm": 0.38795459270477295, |
|
"learning_rate": 8.825396825396827e-06, |
|
"loss": 0.1653, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 10.008571428571429, |
|
"grad_norm": 47.69906234741211, |
|
"learning_rate": 8.793650793650795e-06, |
|
"loss": 0.7768, |
|
"step": 730 |
|
}, |
|
{ |
|
"epoch": 10.01142857142857, |
|
"grad_norm": 42.63213348388672, |
|
"learning_rate": 8.761904761904763e-06, |
|
"loss": 0.4008, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 10.014285714285714, |
|
"grad_norm": 103.43374633789062, |
|
"learning_rate": 8.730158730158731e-06, |
|
"loss": 0.4466, |
|
"step": 750 |
|
}, |
|
{ |
|
"epoch": 10.017142857142858, |
|
"grad_norm": 0.7397039532661438, |
|
"learning_rate": 8.6984126984127e-06, |
|
"loss": 0.1774, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"grad_norm": 62.768890380859375, |
|
"learning_rate": 8.666666666666668e-06, |
|
"loss": 0.8071, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 10.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 1.1091701984405518, |
|
"eval_runtime": 5.949, |
|
"eval_samples_per_second": 5.379, |
|
"eval_steps_per_second": 1.345, |
|
"step": 770 |
|
}, |
|
{ |
|
"epoch": 11.002857142857144, |
|
"grad_norm": 25.553237915039062, |
|
"learning_rate": 8.634920634920636e-06, |
|
"loss": 0.2667, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 11.005714285714285, |
|
"grad_norm": 17.7698974609375, |
|
"learning_rate": 8.603174603174604e-06, |
|
"loss": 0.4836, |
|
"step": 790 |
|
}, |
|
{ |
|
"epoch": 11.008571428571429, |
|
"grad_norm": 31.987483978271484, |
|
"learning_rate": 8.571428571428571e-06, |
|
"loss": 0.1931, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 11.01142857142857, |
|
"grad_norm": 48.23198318481445, |
|
"learning_rate": 8.53968253968254e-06, |
|
"loss": 0.1602, |
|
"step": 810 |
|
}, |
|
{ |
|
"epoch": 11.014285714285714, |
|
"grad_norm": 83.9683609008789, |
|
"learning_rate": 8.507936507936509e-06, |
|
"loss": 0.4893, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 11.017142857142858, |
|
"grad_norm": 47.01933670043945, |
|
"learning_rate": 8.476190476190477e-06, |
|
"loss": 0.3692, |
|
"step": 830 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"grad_norm": 0.9395814538002014, |
|
"learning_rate": 8.444444444444446e-06, |
|
"loss": 0.1888, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 11.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 1.155624270439148, |
|
"eval_runtime": 5.9873, |
|
"eval_samples_per_second": 5.345, |
|
"eval_steps_per_second": 1.336, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 12.002857142857144, |
|
"grad_norm": 0.11803951114416122, |
|
"learning_rate": 8.412698412698414e-06, |
|
"loss": 0.1841, |
|
"step": 850 |
|
}, |
|
{ |
|
"epoch": 12.005714285714285, |
|
"grad_norm": 43.479034423828125, |
|
"learning_rate": 8.380952380952382e-06, |
|
"loss": 0.2496, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 12.008571428571429, |
|
"grad_norm": 47.395957946777344, |
|
"learning_rate": 8.34920634920635e-06, |
|
"loss": 0.4539, |
|
"step": 870 |
|
}, |
|
{ |
|
"epoch": 12.01142857142857, |
|
"grad_norm": 8.515253067016602, |
|
"learning_rate": 8.317460317460319e-06, |
|
"loss": 0.1964, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 12.014285714285714, |
|
"grad_norm": 37.08503723144531, |
|
"learning_rate": 8.285714285714287e-06, |
|
"loss": 0.1819, |
|
"step": 890 |
|
}, |
|
{ |
|
"epoch": 12.017142857142858, |
|
"grad_norm": 5.870359420776367, |
|
"learning_rate": 8.253968253968254e-06, |
|
"loss": 0.2055, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 12.02, |
|
"grad_norm": 46.779685974121094, |
|
"learning_rate": 8.222222222222222e-06, |
|
"loss": 0.3762, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 12.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 1.3499457836151123, |
|
"eval_runtime": 6.0533, |
|
"eval_samples_per_second": 5.286, |
|
"eval_steps_per_second": 1.322, |
|
"step": 910 |
|
}, |
|
{ |
|
"epoch": 13.002857142857144, |
|
"grad_norm": 5.09779167175293, |
|
"learning_rate": 8.190476190476192e-06, |
|
"loss": 0.0865, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 13.005714285714285, |
|
"grad_norm": 0.5768360495567322, |
|
"learning_rate": 8.15873015873016e-06, |
|
"loss": 0.282, |
|
"step": 930 |
|
}, |
|
{ |
|
"epoch": 13.008571428571429, |
|
"grad_norm": 15.04159164428711, |
|
"learning_rate": 8.126984126984128e-06, |
|
"loss": 0.276, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 13.01142857142857, |
|
"grad_norm": 13.419623374938965, |
|
"learning_rate": 8.095238095238097e-06, |
|
"loss": 0.3172, |
|
"step": 950 |
|
}, |
|
{ |
|
"epoch": 13.014285714285714, |
|
"grad_norm": 50.46155548095703, |
|
"learning_rate": 8.063492063492065e-06, |
|
"loss": 0.2552, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 13.017142857142858, |
|
"grad_norm": 0.9787739515304565, |
|
"learning_rate": 8.031746031746033e-06, |
|
"loss": 0.1545, |
|
"step": 970 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"grad_norm": 18.6726131439209, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 0.3502, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 13.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 1.5333378314971924, |
|
"eval_runtime": 5.7825, |
|
"eval_samples_per_second": 5.534, |
|
"eval_steps_per_second": 1.383, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 14.002857142857144, |
|
"grad_norm": 0.06214858964085579, |
|
"learning_rate": 7.968253968253968e-06, |
|
"loss": 0.332, |
|
"step": 990 |
|
}, |
|
{ |
|
"epoch": 14.005714285714285, |
|
"grad_norm": 33.86188888549805, |
|
"learning_rate": 7.936507936507936e-06, |
|
"loss": 0.1485, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 14.008571428571429, |
|
"grad_norm": 24.82603645324707, |
|
"learning_rate": 7.904761904761904e-06, |
|
"loss": 0.0319, |
|
"step": 1010 |
|
}, |
|
{ |
|
"epoch": 14.01142857142857, |
|
"grad_norm": 0.01239538099616766, |
|
"learning_rate": 7.873015873015873e-06, |
|
"loss": 0.0538, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 14.014285714285714, |
|
"grad_norm": 8.576937675476074, |
|
"learning_rate": 7.841269841269843e-06, |
|
"loss": 0.0159, |
|
"step": 1030 |
|
}, |
|
{ |
|
"epoch": 14.017142857142858, |
|
"grad_norm": 0.08205872029066086, |
|
"learning_rate": 7.809523809523811e-06, |
|
"loss": 0.2445, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 14.02, |
|
"grad_norm": 0.10681977868080139, |
|
"learning_rate": 7.77777777777778e-06, |
|
"loss": 0.1027, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 14.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 1.6248817443847656, |
|
"eval_runtime": 5.8213, |
|
"eval_samples_per_second": 5.497, |
|
"eval_steps_per_second": 1.374, |
|
"step": 1050 |
|
}, |
|
{ |
|
"epoch": 15.002857142857144, |
|
"grad_norm": 0.050746768712997437, |
|
"learning_rate": 7.746031746031747e-06, |
|
"loss": 0.2031, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 15.005714285714285, |
|
"grad_norm": 0.08122105151414871, |
|
"learning_rate": 7.714285714285716e-06, |
|
"loss": 0.2621, |
|
"step": 1070 |
|
}, |
|
{ |
|
"epoch": 15.008571428571429, |
|
"grad_norm": 23.472549438476562, |
|
"learning_rate": 7.682539682539684e-06, |
|
"loss": 0.4242, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 15.01142857142857, |
|
"grad_norm": 9.292193412780762, |
|
"learning_rate": 7.65079365079365e-06, |
|
"loss": 0.1143, |
|
"step": 1090 |
|
}, |
|
{ |
|
"epoch": 15.014285714285714, |
|
"grad_norm": 2.8956408500671387, |
|
"learning_rate": 7.61904761904762e-06, |
|
"loss": 0.2507, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 15.017142857142858, |
|
"grad_norm": 0.09770918637514114, |
|
"learning_rate": 7.587301587301588e-06, |
|
"loss": 0.2032, |
|
"step": 1110 |
|
}, |
|
{ |
|
"epoch": 15.02, |
|
"grad_norm": 0.08947720378637314, |
|
"learning_rate": 7.555555555555556e-06, |
|
"loss": 0.177, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 15.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 1.3757655620574951, |
|
"eval_runtime": 6.1961, |
|
"eval_samples_per_second": 5.165, |
|
"eval_steps_per_second": 1.291, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 16.002857142857142, |
|
"grad_norm": 235.26527404785156, |
|
"learning_rate": 7.523809523809524e-06, |
|
"loss": 0.1035, |
|
"step": 1130 |
|
}, |
|
{ |
|
"epoch": 16.005714285714287, |
|
"grad_norm": 136.0679473876953, |
|
"learning_rate": 7.492063492063493e-06, |
|
"loss": 0.1338, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 16.00857142857143, |
|
"grad_norm": 3.926299571990967, |
|
"learning_rate": 7.460317460317461e-06, |
|
"loss": 0.1069, |
|
"step": 1150 |
|
}, |
|
{ |
|
"epoch": 16.01142857142857, |
|
"grad_norm": 0.044299498200416565, |
|
"learning_rate": 7.428571428571429e-06, |
|
"loss": 0.0227, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 16.014285714285716, |
|
"grad_norm": 0.04774919152259827, |
|
"learning_rate": 7.3968253968253975e-06, |
|
"loss": 0.0556, |
|
"step": 1170 |
|
}, |
|
{ |
|
"epoch": 16.017142857142858, |
|
"grad_norm": 0.24615606665611267, |
|
"learning_rate": 7.3650793650793666e-06, |
|
"loss": 0.1964, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"grad_norm": 0.14104069769382477, |
|
"learning_rate": 7.333333333333333e-06, |
|
"loss": 0.0998, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 16.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 1.9513726234436035, |
|
"eval_runtime": 5.7043, |
|
"eval_samples_per_second": 5.61, |
|
"eval_steps_per_second": 1.402, |
|
"step": 1190 |
|
}, |
|
{ |
|
"epoch": 17.002857142857142, |
|
"grad_norm": 6.165931701660156, |
|
"learning_rate": 7.301587301587301e-06, |
|
"loss": 0.1262, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 17.005714285714287, |
|
"grad_norm": 89.7344741821289, |
|
"learning_rate": 7.2698412698412705e-06, |
|
"loss": 0.0378, |
|
"step": 1210 |
|
}, |
|
{ |
|
"epoch": 17.00857142857143, |
|
"grad_norm": 0.02539096027612686, |
|
"learning_rate": 7.238095238095239e-06, |
|
"loss": 0.0044, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 17.01142857142857, |
|
"grad_norm": 0.14466840028762817, |
|
"learning_rate": 7.206349206349207e-06, |
|
"loss": 0.2566, |
|
"step": 1230 |
|
}, |
|
{ |
|
"epoch": 17.014285714285716, |
|
"grad_norm": 112.21966552734375, |
|
"learning_rate": 7.174603174603175e-06, |
|
"loss": 0.0551, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 17.017142857142858, |
|
"grad_norm": 158.01324462890625, |
|
"learning_rate": 7.1428571428571436e-06, |
|
"loss": 0.0398, |
|
"step": 1250 |
|
}, |
|
{ |
|
"epoch": 17.02, |
|
"grad_norm": 0.04698384925723076, |
|
"learning_rate": 7.111111111111112e-06, |
|
"loss": 0.1749, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 17.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 1.9119518995285034, |
|
"eval_runtime": 5.6445, |
|
"eval_samples_per_second": 5.669, |
|
"eval_steps_per_second": 1.417, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 18.002857142857142, |
|
"grad_norm": 0.007913816720247269, |
|
"learning_rate": 7.07936507936508e-06, |
|
"loss": 0.0817, |
|
"step": 1270 |
|
}, |
|
{ |
|
"epoch": 18.005714285714287, |
|
"grad_norm": 12.556783676147461, |
|
"learning_rate": 7.047619047619048e-06, |
|
"loss": 0.0342, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 18.00857142857143, |
|
"grad_norm": 0.12866567075252533, |
|
"learning_rate": 7.015873015873016e-06, |
|
"loss": 0.1667, |
|
"step": 1290 |
|
}, |
|
{ |
|
"epoch": 18.01142857142857, |
|
"grad_norm": 0.007662401534616947, |
|
"learning_rate": 6.984126984126984e-06, |
|
"loss": 0.1154, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 18.014285714285716, |
|
"grad_norm": 8.587801933288574, |
|
"learning_rate": 6.952380952380952e-06, |
|
"loss": 0.0235, |
|
"step": 1310 |
|
}, |
|
{ |
|
"epoch": 18.017142857142858, |
|
"grad_norm": 0.016857486218214035, |
|
"learning_rate": 6.920634920634921e-06, |
|
"loss": 0.0947, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"grad_norm": 0.017974289134144783, |
|
"learning_rate": 6.88888888888889e-06, |
|
"loss": 0.0145, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 18.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.1035962104797363, |
|
"eval_runtime": 6.0525, |
|
"eval_samples_per_second": 5.287, |
|
"eval_steps_per_second": 1.322, |
|
"step": 1330 |
|
}, |
|
{ |
|
"epoch": 19.002857142857142, |
|
"grad_norm": 0.1701594740152359, |
|
"learning_rate": 6.857142857142858e-06, |
|
"loss": 0.1449, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 19.005714285714287, |
|
"grad_norm": 0.035870511084795, |
|
"learning_rate": 6.825396825396826e-06, |
|
"loss": 0.0272, |
|
"step": 1350 |
|
}, |
|
{ |
|
"epoch": 19.00857142857143, |
|
"grad_norm": 0.00805743969976902, |
|
"learning_rate": 6.7936507936507944e-06, |
|
"loss": 0.0075, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 19.01142857142857, |
|
"grad_norm": 0.008254293352365494, |
|
"learning_rate": 6.761904761904763e-06, |
|
"loss": 0.073, |
|
"step": 1370 |
|
}, |
|
{ |
|
"epoch": 19.014285714285716, |
|
"grad_norm": 126.25316619873047, |
|
"learning_rate": 6.730158730158731e-06, |
|
"loss": 0.1744, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 19.017142857142858, |
|
"grad_norm": 0.012624472379684448, |
|
"learning_rate": 6.698412698412698e-06, |
|
"loss": 0.0003, |
|
"step": 1390 |
|
}, |
|
{ |
|
"epoch": 19.02, |
|
"grad_norm": 0.017210116609930992, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 0.0038, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 19.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.0288360118865967, |
|
"eval_runtime": 5.8437, |
|
"eval_samples_per_second": 5.476, |
|
"eval_steps_per_second": 1.369, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 20.002857142857142, |
|
"grad_norm": 0.012508122250437737, |
|
"learning_rate": 6.634920634920635e-06, |
|
"loss": 0.005, |
|
"step": 1410 |
|
}, |
|
{ |
|
"epoch": 20.005714285714287, |
|
"grad_norm": 0.07273050397634506, |
|
"learning_rate": 6.603174603174603e-06, |
|
"loss": 0.0004, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 20.00857142857143, |
|
"grad_norm": 0.00631905160844326, |
|
"learning_rate": 6.571428571428572e-06, |
|
"loss": 0.0036, |
|
"step": 1430 |
|
}, |
|
{ |
|
"epoch": 20.01142857142857, |
|
"grad_norm": 7.109875679016113, |
|
"learning_rate": 6.5396825396825405e-06, |
|
"loss": 0.0589, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 20.014285714285716, |
|
"grad_norm": 0.1848406195640564, |
|
"learning_rate": 6.507936507936509e-06, |
|
"loss": 0.0003, |
|
"step": 1450 |
|
}, |
|
{ |
|
"epoch": 20.017142857142858, |
|
"grad_norm": 0.005294440779834986, |
|
"learning_rate": 6.476190476190477e-06, |
|
"loss": 0.0004, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 20.02, |
|
"grad_norm": 0.020306957885622978, |
|
"learning_rate": 6.444444444444445e-06, |
|
"loss": 0.1262, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 20.02, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 2.019291400909424, |
|
"eval_runtime": 5.6553, |
|
"eval_samples_per_second": 5.658, |
|
"eval_steps_per_second": 1.415, |
|
"step": 1470 |
|
}, |
|
{ |
|
"epoch": 21.002857142857142, |
|
"grad_norm": 0.020932432264089584, |
|
"learning_rate": 6.412698412698414e-06, |
|
"loss": 0.123, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 21.005714285714287, |
|
"grad_norm": 0.0034719400573521852, |
|
"learning_rate": 6.380952380952381e-06, |
|
"loss": 0.0006, |
|
"step": 1490 |
|
}, |
|
{ |
|
"epoch": 21.00857142857143, |
|
"grad_norm": 0.40151557326316833, |
|
"learning_rate": 6.349206349206349e-06, |
|
"loss": 0.0427, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 21.01142857142857, |
|
"grad_norm": 0.008790241554379463, |
|
"learning_rate": 6.3174603174603175e-06, |
|
"loss": 0.1958, |
|
"step": 1510 |
|
}, |
|
{ |
|
"epoch": 21.014285714285716, |
|
"grad_norm": 0.012369256466627121, |
|
"learning_rate": 6.285714285714286e-06, |
|
"loss": 0.0894, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 21.017142857142858, |
|
"grad_norm": 0.016859106719493866, |
|
"learning_rate": 6.253968253968254e-06, |
|
"loss": 0.0003, |
|
"step": 1530 |
|
}, |
|
{ |
|
"epoch": 21.02, |
|
"grad_norm": 0.007720629218965769, |
|
"learning_rate": 6.222222222222223e-06, |
|
"loss": 0.0203, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 21.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.1937031745910645, |
|
"eval_runtime": 5.7777, |
|
"eval_samples_per_second": 5.539, |
|
"eval_steps_per_second": 1.385, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 22.002857142857142, |
|
"grad_norm": 0.029361654072999954, |
|
"learning_rate": 6.1904761904761914e-06, |
|
"loss": 0.0002, |
|
"step": 1550 |
|
}, |
|
{ |
|
"epoch": 22.005714285714287, |
|
"grad_norm": 0.013734079897403717, |
|
"learning_rate": 6.15873015873016e-06, |
|
"loss": 0.0253, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 22.00857142857143, |
|
"grad_norm": 0.00359090743586421, |
|
"learning_rate": 6.126984126984128e-06, |
|
"loss": 0.0003, |
|
"step": 1570 |
|
}, |
|
{ |
|
"epoch": 22.01142857142857, |
|
"grad_norm": 0.0032050181180238724, |
|
"learning_rate": 6.095238095238096e-06, |
|
"loss": 0.1542, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 22.014285714285716, |
|
"grad_norm": 0.009580553509294987, |
|
"learning_rate": 6.063492063492064e-06, |
|
"loss": 0.0003, |
|
"step": 1590 |
|
}, |
|
{ |
|
"epoch": 22.017142857142858, |
|
"grad_norm": 0.00456986203789711, |
|
"learning_rate": 6.031746031746032e-06, |
|
"loss": 0.0819, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 22.02, |
|
"grad_norm": 0.004042602144181728, |
|
"learning_rate": 6e-06, |
|
"loss": 0.0002, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 22.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.292170524597168, |
|
"eval_runtime": 5.6653, |
|
"eval_samples_per_second": 5.648, |
|
"eval_steps_per_second": 1.412, |
|
"step": 1610 |
|
}, |
|
{ |
|
"epoch": 23.002857142857142, |
|
"grad_norm": 0.009900301694869995, |
|
"learning_rate": 5.968253968253968e-06, |
|
"loss": 0.0094, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 23.005714285714287, |
|
"grad_norm": 0.006440621335059404, |
|
"learning_rate": 5.936507936507937e-06, |
|
"loss": 0.0002, |
|
"step": 1630 |
|
}, |
|
{ |
|
"epoch": 23.00857142857143, |
|
"grad_norm": 0.0025428766384720802, |
|
"learning_rate": 5.904761904761905e-06, |
|
"loss": 0.1124, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 23.01142857142857, |
|
"grad_norm": 0.0017241127789020538, |
|
"learning_rate": 5.873015873015874e-06, |
|
"loss": 0.0002, |
|
"step": 1650 |
|
}, |
|
{ |
|
"epoch": 23.014285714285716, |
|
"grad_norm": 0.001817885902710259, |
|
"learning_rate": 5.841269841269842e-06, |
|
"loss": 0.0366, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 23.017142857142858, |
|
"grad_norm": 0.016778158023953438, |
|
"learning_rate": 5.8095238095238106e-06, |
|
"loss": 0.0002, |
|
"step": 1670 |
|
}, |
|
{ |
|
"epoch": 23.02, |
|
"grad_norm": 0.003240015124902129, |
|
"learning_rate": 5.777777777777778e-06, |
|
"loss": 0.0017, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 23.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.1568727493286133, |
|
"eval_runtime": 6.0202, |
|
"eval_samples_per_second": 5.315, |
|
"eval_steps_per_second": 1.329, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 24.002857142857142, |
|
"grad_norm": 0.040594782680273056, |
|
"learning_rate": 5.746031746031746e-06, |
|
"loss": 0.0002, |
|
"step": 1690 |
|
}, |
|
{ |
|
"epoch": 24.005714285714287, |
|
"grad_norm": 0.009448004886507988, |
|
"learning_rate": 5.7142857142857145e-06, |
|
"loss": 0.0003, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 24.00857142857143, |
|
"grad_norm": 0.0061890422366559505, |
|
"learning_rate": 5.682539682539683e-06, |
|
"loss": 0.0001, |
|
"step": 1710 |
|
}, |
|
{ |
|
"epoch": 24.01142857142857, |
|
"grad_norm": 0.004276420455425978, |
|
"learning_rate": 5.650793650793651e-06, |
|
"loss": 0.0002, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 24.014285714285716, |
|
"grad_norm": 0.004243628121912479, |
|
"learning_rate": 5.619047619047619e-06, |
|
"loss": 0.0002, |
|
"step": 1730 |
|
}, |
|
{ |
|
"epoch": 24.017142857142858, |
|
"grad_norm": 0.017200473695993423, |
|
"learning_rate": 5.5873015873015876e-06, |
|
"loss": 0.0002, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 24.02, |
|
"grad_norm": 0.011262903921306133, |
|
"learning_rate": 5.555555555555557e-06, |
|
"loss": 0.0049, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 24.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.257319450378418, |
|
"eval_runtime": 5.9421, |
|
"eval_samples_per_second": 5.385, |
|
"eval_steps_per_second": 1.346, |
|
"step": 1750 |
|
}, |
|
{ |
|
"epoch": 25.002857142857142, |
|
"grad_norm": 0.0024505627807229757, |
|
"learning_rate": 5.523809523809525e-06, |
|
"loss": 0.0001, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 25.005714285714287, |
|
"grad_norm": 0.004473875276744366, |
|
"learning_rate": 5.492063492063493e-06, |
|
"loss": 0.0611, |
|
"step": 1770 |
|
}, |
|
{ |
|
"epoch": 25.00857142857143, |
|
"grad_norm": 0.002111697569489479, |
|
"learning_rate": 5.460317460317461e-06, |
|
"loss": 0.3776, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 25.01142857142857, |
|
"grad_norm": 0.0120729710906744, |
|
"learning_rate": 5.428571428571429e-06, |
|
"loss": 0.0003, |
|
"step": 1790 |
|
}, |
|
{ |
|
"epoch": 25.014285714285716, |
|
"grad_norm": 0.005537941120564938, |
|
"learning_rate": 5.396825396825397e-06, |
|
"loss": 0.0001, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 25.017142857142858, |
|
"grad_norm": 0.1099364161491394, |
|
"learning_rate": 5.365079365079365e-06, |
|
"loss": 0.0007, |
|
"step": 1810 |
|
}, |
|
{ |
|
"epoch": 25.02, |
|
"grad_norm": 0.001969733275473118, |
|
"learning_rate": 5.333333333333334e-06, |
|
"loss": 0.0231, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 25.02, |
|
"eval_accuracy": 0.6875, |
|
"eval_loss": 2.146007537841797, |
|
"eval_runtime": 5.7856, |
|
"eval_samples_per_second": 5.531, |
|
"eval_steps_per_second": 1.383, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 26.002857142857142, |
|
"grad_norm": 0.1284661889076233, |
|
"learning_rate": 5.301587301587302e-06, |
|
"loss": 0.0007, |
|
"step": 1830 |
|
}, |
|
{ |
|
"epoch": 26.005714285714287, |
|
"grad_norm": 0.027212299406528473, |
|
"learning_rate": 5.26984126984127e-06, |
|
"loss": 0.0002, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 26.00857142857143, |
|
"grad_norm": 113.98649597167969, |
|
"learning_rate": 5.2380952380952384e-06, |
|
"loss": 0.0977, |
|
"step": 1850 |
|
}, |
|
{ |
|
"epoch": 26.01142857142857, |
|
"grad_norm": 0.0038598247338086367, |
|
"learning_rate": 5.2063492063492076e-06, |
|
"loss": 0.0002, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 26.014285714285716, |
|
"grad_norm": 0.011346804909408092, |
|
"learning_rate": 5.174603174603176e-06, |
|
"loss": 0.0593, |
|
"step": 1870 |
|
}, |
|
{ |
|
"epoch": 26.017142857142858, |
|
"grad_norm": 60.034358978271484, |
|
"learning_rate": 5.142857142857142e-06, |
|
"loss": 0.0086, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 26.02, |
|
"grad_norm": 0.0061082011088728905, |
|
"learning_rate": 5.1111111111111115e-06, |
|
"loss": 0.0001, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 26.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.356595277786255, |
|
"eval_runtime": 6.1468, |
|
"eval_samples_per_second": 5.206, |
|
"eval_steps_per_second": 1.301, |
|
"step": 1890 |
|
}, |
|
{ |
|
"epoch": 27.002857142857142, |
|
"grad_norm": 0.005633851513266563, |
|
"learning_rate": 5.07936507936508e-06, |
|
"loss": 0.0001, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 27.005714285714287, |
|
"grad_norm": 0.011841390281915665, |
|
"learning_rate": 5.047619047619048e-06, |
|
"loss": 0.1327, |
|
"step": 1910 |
|
}, |
|
{ |
|
"epoch": 27.00857142857143, |
|
"grad_norm": 0.005733752157539129, |
|
"learning_rate": 5.015873015873016e-06, |
|
"loss": 0.0002, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 27.01142857142857, |
|
"grad_norm": 0.0031420367304235697, |
|
"learning_rate": 4.9841269841269845e-06, |
|
"loss": 0.0028, |
|
"step": 1930 |
|
}, |
|
{ |
|
"epoch": 27.014285714285716, |
|
"grad_norm": 0.0029769607353955507, |
|
"learning_rate": 4.952380952380953e-06, |
|
"loss": 0.0419, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 27.017142857142858, |
|
"grad_norm": 0.0036415501963347197, |
|
"learning_rate": 4.920634920634921e-06, |
|
"loss": 0.0001, |
|
"step": 1950 |
|
}, |
|
{ |
|
"epoch": 27.02, |
|
"grad_norm": 0.0069373250007629395, |
|
"learning_rate": 4.888888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 27.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.3822171688079834, |
|
"eval_runtime": 5.7966, |
|
"eval_samples_per_second": 5.52, |
|
"eval_steps_per_second": 1.38, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 28.002857142857142, |
|
"grad_norm": 0.12265493720769882, |
|
"learning_rate": 4.857142857142858e-06, |
|
"loss": 0.0006, |
|
"step": 1970 |
|
}, |
|
{ |
|
"epoch": 28.005714285714287, |
|
"grad_norm": 0.016870826482772827, |
|
"learning_rate": 4.825396825396826e-06, |
|
"loss": 0.0393, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 28.00857142857143, |
|
"grad_norm": 0.02534836158156395, |
|
"learning_rate": 4.793650793650794e-06, |
|
"loss": 0.0001, |
|
"step": 1990 |
|
}, |
|
{ |
|
"epoch": 28.01142857142857, |
|
"grad_norm": 0.0017878487706184387, |
|
"learning_rate": 4.761904761904762e-06, |
|
"loss": 0.0021, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 28.014285714285716, |
|
"grad_norm": 0.004311624448746443, |
|
"learning_rate": 4.730158730158731e-06, |
|
"loss": 0.0001, |
|
"step": 2010 |
|
}, |
|
{ |
|
"epoch": 28.017142857142858, |
|
"grad_norm": 0.0015392231289297342, |
|
"learning_rate": 4.698412698412699e-06, |
|
"loss": 0.0001, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 28.02, |
|
"grad_norm": 0.07122951745986938, |
|
"learning_rate": 4.666666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 28.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.3177592754364014, |
|
"eval_runtime": 5.8041, |
|
"eval_samples_per_second": 5.513, |
|
"eval_steps_per_second": 1.378, |
|
"step": 2030 |
|
}, |
|
{ |
|
"epoch": 29.002857142857142, |
|
"grad_norm": 0.004755318630486727, |
|
"learning_rate": 4.634920634920635e-06, |
|
"loss": 0.0001, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 29.005714285714287, |
|
"grad_norm": 0.0016361831221729517, |
|
"learning_rate": 4.603174603174604e-06, |
|
"loss": 0.0343, |
|
"step": 2050 |
|
}, |
|
{ |
|
"epoch": 29.00857142857143, |
|
"grad_norm": 0.009776725433766842, |
|
"learning_rate": 4.571428571428572e-06, |
|
"loss": 0.0057, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 29.01142857142857, |
|
"grad_norm": 0.014050081372261047, |
|
"learning_rate": 4.53968253968254e-06, |
|
"loss": 0.0001, |
|
"step": 2070 |
|
}, |
|
{ |
|
"epoch": 29.014285714285716, |
|
"grad_norm": 0.027094116434454918, |
|
"learning_rate": 4.5079365079365085e-06, |
|
"loss": 0.0002, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 29.017142857142858, |
|
"grad_norm": 0.0022450664546340704, |
|
"learning_rate": 4.476190476190477e-06, |
|
"loss": 0.0001, |
|
"step": 2090 |
|
}, |
|
{ |
|
"epoch": 29.02, |
|
"grad_norm": 0.0033212972339242697, |
|
"learning_rate": 4.444444444444444e-06, |
|
"loss": 0.0004, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 29.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.5491509437561035, |
|
"eval_runtime": 5.8042, |
|
"eval_samples_per_second": 5.513, |
|
"eval_steps_per_second": 1.378, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 30.002857142857142, |
|
"grad_norm": 1.13490629196167, |
|
"learning_rate": 4.412698412698413e-06, |
|
"loss": 0.0002, |
|
"step": 2110 |
|
}, |
|
{ |
|
"epoch": 30.005714285714287, |
|
"grad_norm": 0.002449661260470748, |
|
"learning_rate": 4.3809523809523815e-06, |
|
"loss": 0.0002, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 30.00857142857143, |
|
"grad_norm": 0.020857322961091995, |
|
"learning_rate": 4.34920634920635e-06, |
|
"loss": 0.0001, |
|
"step": 2130 |
|
}, |
|
{ |
|
"epoch": 30.01142857142857, |
|
"grad_norm": 0.005406759679317474, |
|
"learning_rate": 4.317460317460318e-06, |
|
"loss": 0.0001, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 30.014285714285716, |
|
"grad_norm": 0.2224954515695572, |
|
"learning_rate": 4.2857142857142855e-06, |
|
"loss": 0.0001, |
|
"step": 2150 |
|
}, |
|
{ |
|
"epoch": 30.017142857142858, |
|
"grad_norm": 0.014697935432195663, |
|
"learning_rate": 4.2539682539682546e-06, |
|
"loss": 0.0001, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 30.02, |
|
"grad_norm": 0.003210916882380843, |
|
"learning_rate": 4.222222222222223e-06, |
|
"loss": 0.0003, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 30.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.764781951904297, |
|
"eval_runtime": 6.1993, |
|
"eval_samples_per_second": 5.162, |
|
"eval_steps_per_second": 1.29, |
|
"step": 2170 |
|
}, |
|
{ |
|
"epoch": 31.002857142857142, |
|
"grad_norm": 20.94349479675293, |
|
"learning_rate": 4.190476190476191e-06, |
|
"loss": 0.0014, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 31.005714285714287, |
|
"grad_norm": 0.0062758903950452805, |
|
"learning_rate": 4.158730158730159e-06, |
|
"loss": 0.052, |
|
"step": 2190 |
|
}, |
|
{ |
|
"epoch": 31.00857142857143, |
|
"grad_norm": 324.1405944824219, |
|
"learning_rate": 4.126984126984127e-06, |
|
"loss": 0.0889, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 31.01142857142857, |
|
"grad_norm": 5.652071475982666, |
|
"learning_rate": 4.095238095238096e-06, |
|
"loss": 0.0005, |
|
"step": 2210 |
|
}, |
|
{ |
|
"epoch": 31.014285714285716, |
|
"grad_norm": 0.000993481487967074, |
|
"learning_rate": 4.063492063492064e-06, |
|
"loss": 0.0002, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 31.017142857142858, |
|
"grad_norm": 0.007423575036227703, |
|
"learning_rate": 4.031746031746032e-06, |
|
"loss": 0.0001, |
|
"step": 2230 |
|
}, |
|
{ |
|
"epoch": 31.02, |
|
"grad_norm": 0.0035263928584754467, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 0.0001, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 31.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.3948616981506348, |
|
"eval_runtime": 5.739, |
|
"eval_samples_per_second": 5.576, |
|
"eval_steps_per_second": 1.394, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 32.002857142857145, |
|
"grad_norm": 0.002942159539088607, |
|
"learning_rate": 3.968253968253968e-06, |
|
"loss": 0.0001, |
|
"step": 2250 |
|
}, |
|
{ |
|
"epoch": 32.005714285714284, |
|
"grad_norm": 0.015976279973983765, |
|
"learning_rate": 3.936507936507936e-06, |
|
"loss": 0.0002, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 32.00857142857143, |
|
"grad_norm": 0.0022569282446056604, |
|
"learning_rate": 3.9047619047619055e-06, |
|
"loss": 0.0001, |
|
"step": 2270 |
|
}, |
|
{ |
|
"epoch": 32.011428571428574, |
|
"grad_norm": 0.001404234440997243, |
|
"learning_rate": 3.873015873015874e-06, |
|
"loss": 0.0001, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 32.01428571428571, |
|
"grad_norm": 0.03129139170050621, |
|
"learning_rate": 3.841269841269842e-06, |
|
"loss": 0.0001, |
|
"step": 2290 |
|
}, |
|
{ |
|
"epoch": 32.01714285714286, |
|
"grad_norm": 0.04605305194854736, |
|
"learning_rate": 3.80952380952381e-06, |
|
"loss": 0.0001, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 32.02, |
|
"grad_norm": 0.0029006798285990953, |
|
"learning_rate": 3.777777777777778e-06, |
|
"loss": 0.0001, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 32.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.410731792449951, |
|
"eval_runtime": 5.8109, |
|
"eval_samples_per_second": 5.507, |
|
"eval_steps_per_second": 1.377, |
|
"step": 2310 |
|
}, |
|
{ |
|
"epoch": 33.002857142857145, |
|
"grad_norm": 0.004220298025757074, |
|
"learning_rate": 3.7460317460317463e-06, |
|
"loss": 0.0001, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 33.005714285714284, |
|
"grad_norm": 0.05123414844274521, |
|
"learning_rate": 3.7142857142857146e-06, |
|
"loss": 0.0001, |
|
"step": 2330 |
|
}, |
|
{ |
|
"epoch": 33.00857142857143, |
|
"grad_norm": 0.0025845293421298265, |
|
"learning_rate": 3.6825396825396833e-06, |
|
"loss": 0.0001, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 33.011428571428574, |
|
"grad_norm": 0.005027332808822393, |
|
"learning_rate": 3.6507936507936507e-06, |
|
"loss": 0.0001, |
|
"step": 2350 |
|
}, |
|
{ |
|
"epoch": 33.01428571428571, |
|
"grad_norm": 0.008934518322348595, |
|
"learning_rate": 3.6190476190476194e-06, |
|
"loss": 0.0002, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 33.01714285714286, |
|
"grad_norm": 0.0008886617142707109, |
|
"learning_rate": 3.5873015873015877e-06, |
|
"loss": 0.1871, |
|
"step": 2370 |
|
}, |
|
{ |
|
"epoch": 33.02, |
|
"grad_norm": 0.03134104609489441, |
|
"learning_rate": 3.555555555555556e-06, |
|
"loss": 0.0001, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 33.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.6098861694335938, |
|
"eval_runtime": 5.7355, |
|
"eval_samples_per_second": 5.579, |
|
"eval_steps_per_second": 1.395, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 34.002857142857145, |
|
"grad_norm": 0.006479092873632908, |
|
"learning_rate": 3.523809523809524e-06, |
|
"loss": 0.0208, |
|
"step": 2390 |
|
}, |
|
{ |
|
"epoch": 34.005714285714284, |
|
"grad_norm": 0.006957986857742071, |
|
"learning_rate": 3.492063492063492e-06, |
|
"loss": 0.0001, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 34.00857142857143, |
|
"grad_norm": 0.0024119375739246607, |
|
"learning_rate": 3.4603174603174607e-06, |
|
"loss": 0.0002, |
|
"step": 2410 |
|
}, |
|
{ |
|
"epoch": 34.011428571428574, |
|
"grad_norm": 0.0025062367785722017, |
|
"learning_rate": 3.428571428571429e-06, |
|
"loss": 0.0001, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 34.01428571428571, |
|
"grad_norm": 0.0034686909057199955, |
|
"learning_rate": 3.3968253968253972e-06, |
|
"loss": 0.0002, |
|
"step": 2430 |
|
}, |
|
{ |
|
"epoch": 34.01714285714286, |
|
"grad_norm": 0.0022994480095803738, |
|
"learning_rate": 3.3650793650793655e-06, |
|
"loss": 0.0002, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 34.02, |
|
"grad_norm": 0.004486434161663055, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 0.0001, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 34.02, |
|
"eval_accuracy": 0.5625, |
|
"eval_loss": 2.8574094772338867, |
|
"eval_runtime": 5.9889, |
|
"eval_samples_per_second": 5.343, |
|
"eval_steps_per_second": 1.336, |
|
"step": 2450 |
|
}, |
|
{ |
|
"epoch": 35.002857142857145, |
|
"grad_norm": 0.00257964339107275, |
|
"learning_rate": 3.3015873015873016e-06, |
|
"loss": 0.2215, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 35.005714285714284, |
|
"grad_norm": 0.00513617554679513, |
|
"learning_rate": 3.2698412698412703e-06, |
|
"loss": 0.0001, |
|
"step": 2470 |
|
}, |
|
{ |
|
"epoch": 35.00857142857143, |
|
"grad_norm": 0.0013633773196488619, |
|
"learning_rate": 3.2380952380952385e-06, |
|
"loss": 0.0001, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 35.011428571428574, |
|
"grad_norm": 0.036525338888168335, |
|
"learning_rate": 3.206349206349207e-06, |
|
"loss": 0.0001, |
|
"step": 2490 |
|
}, |
|
{ |
|
"epoch": 35.01428571428571, |
|
"grad_norm": 0.0019644717685878277, |
|
"learning_rate": 3.1746031746031746e-06, |
|
"loss": 0.0001, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 35.01714285714286, |
|
"grad_norm": 0.0009433199884369969, |
|
"learning_rate": 3.142857142857143e-06, |
|
"loss": 0.0001, |
|
"step": 2510 |
|
}, |
|
{ |
|
"epoch": 35.02, |
|
"grad_norm": 0.0059710158966481686, |
|
"learning_rate": 3.1111111111111116e-06, |
|
"loss": 0.0001, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 35.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.5807838439941406, |
|
"eval_runtime": 5.9217, |
|
"eval_samples_per_second": 5.404, |
|
"eval_steps_per_second": 1.351, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 36.002857142857145, |
|
"grad_norm": 0.004809876438230276, |
|
"learning_rate": 3.07936507936508e-06, |
|
"loss": 0.0001, |
|
"step": 2530 |
|
}, |
|
{ |
|
"epoch": 36.005714285714284, |
|
"grad_norm": 0.002548061078414321, |
|
"learning_rate": 3.047619047619048e-06, |
|
"loss": 0.0001, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 36.00857142857143, |
|
"grad_norm": 0.0013359219301491976, |
|
"learning_rate": 3.015873015873016e-06, |
|
"loss": 0.0, |
|
"step": 2550 |
|
}, |
|
{ |
|
"epoch": 36.011428571428574, |
|
"grad_norm": 0.001209500478580594, |
|
"learning_rate": 2.984126984126984e-06, |
|
"loss": 0.0001, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 36.01428571428571, |
|
"grad_norm": 0.0029701353050768375, |
|
"learning_rate": 2.9523809523809525e-06, |
|
"loss": 0.0001, |
|
"step": 2570 |
|
}, |
|
{ |
|
"epoch": 36.01714285714286, |
|
"grad_norm": 0.0033097926061600447, |
|
"learning_rate": 2.920634920634921e-06, |
|
"loss": 0.0001, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 36.02, |
|
"grad_norm": 0.002935264492407441, |
|
"learning_rate": 2.888888888888889e-06, |
|
"loss": 0.0001, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 36.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.624629497528076, |
|
"eval_runtime": 5.6636, |
|
"eval_samples_per_second": 5.65, |
|
"eval_steps_per_second": 1.413, |
|
"step": 2590 |
|
}, |
|
{ |
|
"epoch": 37.002857142857145, |
|
"grad_norm": 0.001033996231853962, |
|
"learning_rate": 2.8571428571428573e-06, |
|
"loss": 0.0001, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 37.005714285714284, |
|
"grad_norm": 0.0017455056076869369, |
|
"learning_rate": 2.8253968253968255e-06, |
|
"loss": 0.0461, |
|
"step": 2610 |
|
}, |
|
{ |
|
"epoch": 37.00857142857143, |
|
"grad_norm": 0.004716221243143082, |
|
"learning_rate": 2.7936507936507938e-06, |
|
"loss": 0.0001, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 37.011428571428574, |
|
"grad_norm": 0.0009876637486740947, |
|
"learning_rate": 2.7619047619047625e-06, |
|
"loss": 0.0008, |
|
"step": 2630 |
|
}, |
|
{ |
|
"epoch": 37.01428571428571, |
|
"grad_norm": 0.0018166368827223778, |
|
"learning_rate": 2.7301587301587303e-06, |
|
"loss": 0.0001, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 37.01714285714286, |
|
"grad_norm": 0.0022690363693982363, |
|
"learning_rate": 2.6984126984126986e-06, |
|
"loss": 0.0001, |
|
"step": 2650 |
|
}, |
|
{ |
|
"epoch": 37.02, |
|
"grad_norm": 0.0020156537648290396, |
|
"learning_rate": 2.666666666666667e-06, |
|
"loss": 0.0001, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 37.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.705111503601074, |
|
"eval_runtime": 5.6718, |
|
"eval_samples_per_second": 5.642, |
|
"eval_steps_per_second": 1.41, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 38.002857142857145, |
|
"grad_norm": 0.0032058602664619684, |
|
"learning_rate": 2.634920634920635e-06, |
|
"loss": 0.0001, |
|
"step": 2670 |
|
}, |
|
{ |
|
"epoch": 38.005714285714284, |
|
"grad_norm": 0.0012342449044808745, |
|
"learning_rate": 2.6031746031746038e-06, |
|
"loss": 0.0001, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 38.00857142857143, |
|
"grad_norm": 0.27555274963378906, |
|
"learning_rate": 2.571428571428571e-06, |
|
"loss": 0.079, |
|
"step": 2690 |
|
}, |
|
{ |
|
"epoch": 38.011428571428574, |
|
"grad_norm": 0.12426193058490753, |
|
"learning_rate": 2.53968253968254e-06, |
|
"loss": 0.0001, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 38.01428571428571, |
|
"grad_norm": 0.001230777706950903, |
|
"learning_rate": 2.507936507936508e-06, |
|
"loss": 0.0, |
|
"step": 2710 |
|
}, |
|
{ |
|
"epoch": 38.01714285714286, |
|
"grad_norm": 0.0010537246707826853, |
|
"learning_rate": 2.4761904761904764e-06, |
|
"loss": 0.0002, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 38.02, |
|
"grad_norm": 0.006648702081292868, |
|
"learning_rate": 2.4444444444444447e-06, |
|
"loss": 0.0001, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 38.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.504610776901245, |
|
"eval_runtime": 5.9071, |
|
"eval_samples_per_second": 5.417, |
|
"eval_steps_per_second": 1.354, |
|
"step": 2730 |
|
}, |
|
{ |
|
"epoch": 39.002857142857145, |
|
"grad_norm": 0.002145690843462944, |
|
"learning_rate": 2.412698412698413e-06, |
|
"loss": 0.0001, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 39.005714285714284, |
|
"grad_norm": 0.002901807427406311, |
|
"learning_rate": 2.380952380952381e-06, |
|
"loss": 0.0001, |
|
"step": 2750 |
|
}, |
|
{ |
|
"epoch": 39.00857142857143, |
|
"grad_norm": 0.0011529697803780437, |
|
"learning_rate": 2.3492063492063494e-06, |
|
"loss": 0.0, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 39.011428571428574, |
|
"grad_norm": 0.005594365298748016, |
|
"learning_rate": 2.3174603174603177e-06, |
|
"loss": 0.0001, |
|
"step": 2770 |
|
}, |
|
{ |
|
"epoch": 39.01428571428571, |
|
"grad_norm": 0.0025502736680209637, |
|
"learning_rate": 2.285714285714286e-06, |
|
"loss": 0.0, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 39.01714285714286, |
|
"grad_norm": 0.0013142352690920234, |
|
"learning_rate": 2.2539682539682542e-06, |
|
"loss": 0.0001, |
|
"step": 2790 |
|
}, |
|
{ |
|
"epoch": 39.02, |
|
"grad_norm": 0.0032077666837722063, |
|
"learning_rate": 2.222222222222222e-06, |
|
"loss": 0.0001, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 39.02, |
|
"eval_accuracy": 0.59375, |
|
"eval_loss": 2.5002870559692383, |
|
"eval_runtime": 5.6574, |
|
"eval_samples_per_second": 5.656, |
|
"eval_steps_per_second": 1.414, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 40.002857142857145, |
|
"grad_norm": 0.005300651304423809, |
|
"learning_rate": 2.1904761904761908e-06, |
|
"loss": 0.0001, |
|
"step": 2810 |
|
}, |
|
{ |
|
"epoch": 40.005714285714284, |
|
"grad_norm": 0.0013714683009311557, |
|
"learning_rate": 2.158730158730159e-06, |
|
"loss": 0.0001, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 40.00857142857143, |
|
"grad_norm": 0.001886402373202145, |
|
"learning_rate": 2.1269841269841273e-06, |
|
"loss": 0.0001, |
|
"step": 2830 |
|
}, |
|
{ |
|
"epoch": 40.011428571428574, |
|
"grad_norm": 0.0023008284624665976, |
|
"learning_rate": 2.0952380952380955e-06, |
|
"loss": 0.0001, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 40.01428571428571, |
|
"grad_norm": 0.03095676377415657, |
|
"learning_rate": 2.0634920634920634e-06, |
|
"loss": 0.0, |
|
"step": 2850 |
|
}, |
|
{ |
|
"epoch": 40.01714285714286, |
|
"grad_norm": 0.002867933129891753, |
|
"learning_rate": 2.031746031746032e-06, |
|
"loss": 0.0, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 40.02, |
|
"grad_norm": 0.0008751653949730098, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 0.0, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 40.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.545984983444214, |
|
"eval_runtime": 5.6867, |
|
"eval_samples_per_second": 5.627, |
|
"eval_steps_per_second": 1.407, |
|
"step": 2870 |
|
}, |
|
{ |
|
"epoch": 41.002857142857145, |
|
"grad_norm": 0.004483176860958338, |
|
"learning_rate": 1.968253968253968e-06, |
|
"loss": 0.0001, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 41.005714285714284, |
|
"grad_norm": 0.012557575479149818, |
|
"learning_rate": 1.936507936507937e-06, |
|
"loss": 0.0001, |
|
"step": 2890 |
|
}, |
|
{ |
|
"epoch": 41.00857142857143, |
|
"grad_norm": 0.0009115163120441139, |
|
"learning_rate": 1.904761904761905e-06, |
|
"loss": 0.0, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 41.011428571428574, |
|
"grad_norm": 0.0017849624855443835, |
|
"learning_rate": 1.8730158730158732e-06, |
|
"loss": 0.0, |
|
"step": 2910 |
|
}, |
|
{ |
|
"epoch": 41.01428571428571, |
|
"grad_norm": 0.002865022048354149, |
|
"learning_rate": 1.8412698412698416e-06, |
|
"loss": 0.0, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 41.01714285714286, |
|
"grad_norm": 0.0014787889085710049, |
|
"learning_rate": 1.8095238095238097e-06, |
|
"loss": 0.0001, |
|
"step": 2930 |
|
}, |
|
{ |
|
"epoch": 41.02, |
|
"grad_norm": 0.0009846773464232683, |
|
"learning_rate": 1.777777777777778e-06, |
|
"loss": 0.0, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 41.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.5397086143493652, |
|
"eval_runtime": 5.9883, |
|
"eval_samples_per_second": 5.344, |
|
"eval_steps_per_second": 1.336, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 42.002857142857145, |
|
"grad_norm": 0.0012540535535663366, |
|
"learning_rate": 1.746031746031746e-06, |
|
"loss": 0.0, |
|
"step": 2950 |
|
}, |
|
{ |
|
"epoch": 42.005714285714284, |
|
"grad_norm": 0.00243058567866683, |
|
"learning_rate": 1.7142857142857145e-06, |
|
"loss": 0.0, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 42.00857142857143, |
|
"grad_norm": 0.002351740375161171, |
|
"learning_rate": 1.6825396825396827e-06, |
|
"loss": 0.0, |
|
"step": 2970 |
|
}, |
|
{ |
|
"epoch": 42.011428571428574, |
|
"grad_norm": 0.0010893039871007204, |
|
"learning_rate": 1.6507936507936508e-06, |
|
"loss": 0.0001, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 42.01428571428571, |
|
"grad_norm": 0.0027362005785107613, |
|
"learning_rate": 1.6190476190476193e-06, |
|
"loss": 0.0, |
|
"step": 2990 |
|
}, |
|
{ |
|
"epoch": 42.01714285714286, |
|
"grad_norm": 0.002978693228214979, |
|
"learning_rate": 1.5873015873015873e-06, |
|
"loss": 0.0001, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 42.02, |
|
"grad_norm": 0.001713828998617828, |
|
"learning_rate": 1.5555555555555558e-06, |
|
"loss": 0.0, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 42.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.538372278213501, |
|
"eval_runtime": 5.6509, |
|
"eval_samples_per_second": 5.663, |
|
"eval_steps_per_second": 1.416, |
|
"step": 3010 |
|
}, |
|
{ |
|
"epoch": 43.002857142857145, |
|
"grad_norm": 0.0017983241705223918, |
|
"learning_rate": 1.523809523809524e-06, |
|
"loss": 0.0, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 43.005714285714284, |
|
"grad_norm": 0.009900640696287155, |
|
"learning_rate": 1.492063492063492e-06, |
|
"loss": 0.0, |
|
"step": 3030 |
|
}, |
|
{ |
|
"epoch": 43.00857142857143, |
|
"grad_norm": 0.004079794976860285, |
|
"learning_rate": 1.4603174603174606e-06, |
|
"loss": 0.0001, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 43.011428571428574, |
|
"grad_norm": 0.0023102271370589733, |
|
"learning_rate": 1.4285714285714286e-06, |
|
"loss": 0.0001, |
|
"step": 3050 |
|
}, |
|
{ |
|
"epoch": 43.01428571428571, |
|
"grad_norm": 0.002042073756456375, |
|
"learning_rate": 1.3968253968253969e-06, |
|
"loss": 0.0, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 43.01714285714286, |
|
"grad_norm": 0.0017096186056733131, |
|
"learning_rate": 1.3650793650793652e-06, |
|
"loss": 0.0, |
|
"step": 3070 |
|
}, |
|
{ |
|
"epoch": 43.02, |
|
"grad_norm": 0.003803923726081848, |
|
"learning_rate": 1.3333333333333334e-06, |
|
"loss": 0.0, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 43.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.484891891479492, |
|
"eval_runtime": 6.1299, |
|
"eval_samples_per_second": 5.22, |
|
"eval_steps_per_second": 1.305, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 44.002857142857145, |
|
"grad_norm": 0.0019901215564459562, |
|
"learning_rate": 1.3015873015873019e-06, |
|
"loss": 0.0, |
|
"step": 3090 |
|
}, |
|
{ |
|
"epoch": 44.005714285714284, |
|
"grad_norm": 0.0011514016659930348, |
|
"learning_rate": 1.26984126984127e-06, |
|
"loss": 0.0001, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 44.00857142857143, |
|
"grad_norm": 0.0025108291301876307, |
|
"learning_rate": 1.2380952380952382e-06, |
|
"loss": 0.0, |
|
"step": 3110 |
|
}, |
|
{ |
|
"epoch": 44.011428571428574, |
|
"grad_norm": 0.0018219811609014869, |
|
"learning_rate": 1.2063492063492065e-06, |
|
"loss": 0.0, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 44.01428571428571, |
|
"grad_norm": 0.0013160904636606574, |
|
"learning_rate": 1.1746031746031747e-06, |
|
"loss": 0.0, |
|
"step": 3130 |
|
}, |
|
{ |
|
"epoch": 44.01714285714286, |
|
"grad_norm": 7.966428756713867, |
|
"learning_rate": 1.142857142857143e-06, |
|
"loss": 0.0005, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 44.02, |
|
"grad_norm": 0.001393563929013908, |
|
"learning_rate": 1.111111111111111e-06, |
|
"loss": 0.0, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 44.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.5847256183624268, |
|
"eval_runtime": 5.9398, |
|
"eval_samples_per_second": 5.387, |
|
"eval_steps_per_second": 1.347, |
|
"step": 3150 |
|
}, |
|
{ |
|
"epoch": 45.002857142857145, |
|
"grad_norm": 0.007060057949274778, |
|
"learning_rate": 1.0793650793650795e-06, |
|
"loss": 0.0001, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 45.005714285714284, |
|
"grad_norm": 0.0010020129848271608, |
|
"learning_rate": 1.0476190476190478e-06, |
|
"loss": 0.0, |
|
"step": 3170 |
|
}, |
|
{ |
|
"epoch": 45.00857142857143, |
|
"grad_norm": 0.0028252785559743643, |
|
"learning_rate": 1.015873015873016e-06, |
|
"loss": 0.0, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 45.011428571428574, |
|
"grad_norm": 0.012641221284866333, |
|
"learning_rate": 9.84126984126984e-07, |
|
"loss": 0.0001, |
|
"step": 3190 |
|
}, |
|
{ |
|
"epoch": 45.01428571428571, |
|
"grad_norm": 0.0022382563911378384, |
|
"learning_rate": 9.523809523809525e-07, |
|
"loss": 0.0001, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 45.01714285714286, |
|
"grad_norm": 0.006219483446329832, |
|
"learning_rate": 9.206349206349208e-07, |
|
"loss": 0.0001, |
|
"step": 3210 |
|
}, |
|
{ |
|
"epoch": 45.02, |
|
"grad_norm": 0.0017200283473357558, |
|
"learning_rate": 8.88888888888889e-07, |
|
"loss": 0.0, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 45.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.5829355716705322, |
|
"eval_runtime": 5.6602, |
|
"eval_samples_per_second": 5.653, |
|
"eval_steps_per_second": 1.413, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 46.002857142857145, |
|
"grad_norm": 0.0015408931067213416, |
|
"learning_rate": 8.571428571428572e-07, |
|
"loss": 0.0, |
|
"step": 3230 |
|
}, |
|
{ |
|
"epoch": 46.005714285714284, |
|
"grad_norm": 0.0016143143875524402, |
|
"learning_rate": 8.253968253968254e-07, |
|
"loss": 0.0, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 46.00857142857143, |
|
"grad_norm": 0.0014364662347361445, |
|
"learning_rate": 7.936507936507937e-07, |
|
"loss": 0.0, |
|
"step": 3250 |
|
}, |
|
{ |
|
"epoch": 46.011428571428574, |
|
"grad_norm": 0.0040867868810892105, |
|
"learning_rate": 7.61904761904762e-07, |
|
"loss": 0.0, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 46.01428571428571, |
|
"grad_norm": 0.0010911135468631983, |
|
"learning_rate": 7.301587301587303e-07, |
|
"loss": 0.0001, |
|
"step": 3270 |
|
}, |
|
{ |
|
"epoch": 46.01714285714286, |
|
"grad_norm": 0.0011279195314273238, |
|
"learning_rate": 6.984126984126984e-07, |
|
"loss": 0.0, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 46.02, |
|
"grad_norm": 0.0029850241262465715, |
|
"learning_rate": 6.666666666666667e-07, |
|
"loss": 0.0, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 46.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.580873966217041, |
|
"eval_runtime": 5.6883, |
|
"eval_samples_per_second": 5.626, |
|
"eval_steps_per_second": 1.406, |
|
"step": 3290 |
|
}, |
|
{ |
|
"epoch": 47.002857142857145, |
|
"grad_norm": 0.003009574254974723, |
|
"learning_rate": 6.34920634920635e-07, |
|
"loss": 0.0001, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 47.005714285714284, |
|
"grad_norm": 0.006839941721409559, |
|
"learning_rate": 6.031746031746032e-07, |
|
"loss": 0.0, |
|
"step": 3310 |
|
}, |
|
{ |
|
"epoch": 47.00857142857143, |
|
"grad_norm": 0.001524322316981852, |
|
"learning_rate": 5.714285714285715e-07, |
|
"loss": 0.0, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 47.011428571428574, |
|
"grad_norm": 0.0027361048851162195, |
|
"learning_rate": 5.396825396825398e-07, |
|
"loss": 0.0, |
|
"step": 3330 |
|
}, |
|
{ |
|
"epoch": 47.01428571428571, |
|
"grad_norm": 0.0032709892839193344, |
|
"learning_rate": 5.07936507936508e-07, |
|
"loss": 0.0, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 47.01714285714286, |
|
"grad_norm": 0.0012358782114461064, |
|
"learning_rate": 4.7619047619047623e-07, |
|
"loss": 0.0, |
|
"step": 3350 |
|
}, |
|
{ |
|
"epoch": 47.02, |
|
"grad_norm": 0.0044230008497834206, |
|
"learning_rate": 4.444444444444445e-07, |
|
"loss": 0.0, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 47.02, |
|
"eval_accuracy": 0.625, |
|
"eval_loss": 2.5756099224090576, |
|
"eval_runtime": 5.7994, |
|
"eval_samples_per_second": 5.518, |
|
"eval_steps_per_second": 1.379, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 48.002857142857145, |
|
"grad_norm": 0.0034374285023659468, |
|
"learning_rate": 4.126984126984127e-07, |
|
"loss": 0.0, |
|
"step": 3370 |
|
}, |
|
{ |
|
"epoch": 48.005714285714284, |
|
"grad_norm": 0.010423140600323677, |
|
"learning_rate": 3.80952380952381e-07, |
|
"loss": 0.045, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 48.00857142857143, |
|
"grad_norm": 0.002047063549980521, |
|
"learning_rate": 3.492063492063492e-07, |
|
"loss": 0.0, |
|
"step": 3390 |
|
}, |
|
{ |
|
"epoch": 48.011428571428574, |
|
"grad_norm": 0.00131814437918365, |
|
"learning_rate": 3.174603174603175e-07, |
|
"loss": 0.0, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 48.01428571428571, |
|
"grad_norm": 0.004262310452759266, |
|
"learning_rate": 2.8571428571428575e-07, |
|
"loss": 0.0, |
|
"step": 3410 |
|
}, |
|
{ |
|
"epoch": 48.01714285714286, |
|
"grad_norm": 0.005764085799455643, |
|
"learning_rate": 2.53968253968254e-07, |
|
"loss": 0.0, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 48.02, |
|
"grad_norm": 0.0010528825223445892, |
|
"learning_rate": 2.2222222222222224e-07, |
|
"loss": 0.0001, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 48.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.474351644515991, |
|
"eval_runtime": 5.6522, |
|
"eval_samples_per_second": 5.662, |
|
"eval_steps_per_second": 1.415, |
|
"step": 3430 |
|
}, |
|
{ |
|
"epoch": 49.002857142857145, |
|
"grad_norm": 0.0024832114577293396, |
|
"learning_rate": 1.904761904761905e-07, |
|
"loss": 0.0, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 49.005714285714284, |
|
"grad_norm": 0.0020221523009240627, |
|
"learning_rate": 1.5873015873015874e-07, |
|
"loss": 0.0, |
|
"step": 3450 |
|
}, |
|
{ |
|
"epoch": 49.00857142857143, |
|
"grad_norm": 0.0294681116938591, |
|
"learning_rate": 1.26984126984127e-07, |
|
"loss": 0.0, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 49.011428571428574, |
|
"grad_norm": 0.0013974281027913094, |
|
"learning_rate": 9.523809523809525e-08, |
|
"loss": 0.0001, |
|
"step": 3470 |
|
}, |
|
{ |
|
"epoch": 49.01428571428571, |
|
"grad_norm": 0.002278068568557501, |
|
"learning_rate": 6.34920634920635e-08, |
|
"loss": 0.0, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 49.01714285714286, |
|
"grad_norm": 0.0014251351822167635, |
|
"learning_rate": 3.174603174603175e-08, |
|
"loss": 0.0, |
|
"step": 3490 |
|
}, |
|
{ |
|
"epoch": 49.02, |
|
"grad_norm": 0.001036540837958455, |
|
"learning_rate": 0.0, |
|
"loss": 0.0, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 49.02, |
|
"eval_accuracy": 0.65625, |
|
"eval_loss": 2.4720282554626465, |
|
"eval_runtime": 7.6256, |
|
"eval_samples_per_second": 4.196, |
|
"eval_steps_per_second": 1.049, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 49.02, |
|
"step": 3500, |
|
"total_flos": 6.147436841415475e+19, |
|
"train_loss": 0.15784767912905331, |
|
"train_runtime": 5935.2708, |
|
"train_samples_per_second": 2.359, |
|
"train_steps_per_second": 0.59 |
|
}, |
|
{ |
|
"epoch": 49.02, |
|
"eval_accuracy": 0.7906976744186046, |
|
"eval_loss": 0.4680953025817871, |
|
"eval_runtime": 8.841, |
|
"eval_samples_per_second": 4.864, |
|
"eval_steps_per_second": 1.244, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 49.02, |
|
"eval_accuracy": 0.7906976744186046, |
|
"eval_loss": 0.46809521317481995, |
|
"eval_runtime": 8.11, |
|
"eval_samples_per_second": 5.302, |
|
"eval_steps_per_second": 1.356, |
|
"step": 3500 |
|
} |
|
], |
|
"logging_steps": 10, |
|
"max_steps": 3500, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 9223372036854775807, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 6.147436841415475e+19, |
|
"train_batch_size": 4, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|