{
  "best_metric": 1.4024385213851929,
  "best_model_checkpoint": "./outputs/202410/no_safetensors/checkpoint/checkpoint-110000",
  "epoch": 3.5091956039749252,
  "eval_steps": 10000,
  "global_step": 110000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    { "epoch": 0.01595074410221237, "grad_norm": 108751.9921875, "learning_rate": 5e-06, "loss": 9.707, "step": 500 },
    { "epoch": 0.03190148820442474, "grad_norm": 44635.21484375, "learning_rate": 1e-05, "loss": 7.8592, "step": 1000 },
    { "epoch": 0.047852232306637106, "grad_norm": 56782.453125, "learning_rate": 1.5e-05, "loss": 7.2609, "step": 1500 },
    { "epoch": 0.06380297640884948, "grad_norm": 96195.9140625, "learning_rate": 2e-05, "loss": 7.1215, "step": 2000 },
    { "epoch": 0.07975372051106185, "grad_norm": 108495.3828125, "learning_rate": 2.5e-05, "loss": 7.0039, "step": 2500 },
    { "epoch": 0.09570446461327421, "grad_norm": 69317.0625, "learning_rate": 3e-05, "loss": 6.8924, "step": 3000 },
    { "epoch": 0.11165520871548658, "grad_norm": 112126.8203125, "learning_rate": 3.5e-05, "loss": 6.7924, "step": 3500 },
    { "epoch": 0.12760595281769896, "grad_norm": 94684.1640625, "learning_rate": 4e-05, "loss": 6.6991, "step": 4000 },
    { "epoch": 0.14355669691991133, "grad_norm": 99549.90625, "learning_rate": 4.5e-05, "loss": 6.6077, "step": 4500 },
    { "epoch": 0.1595074410221237, "grad_norm": 100152.6171875, "learning_rate": 5e-05, "loss": 6.4957, "step": 5000 },
    { "epoch": 0.17545818512433606, "grad_norm": 165780.328125, "learning_rate": 5.500000000000001e-05, "loss": 6.3014, "step": 5500 },
    { "epoch": 0.19140892922654842, "grad_norm": 132947.90625, "learning_rate": 6e-05, "loss": 5.9146, "step": 6000 },
    { "epoch": 0.2073596733287608, "grad_norm": 146624.34375, "learning_rate": 6.500000000000001e-05, "loss": 5.5084, "step": 6500 },
    { "epoch": 0.22331041743097316, "grad_norm": 145966.1875, "learning_rate": 7e-05, "loss": 5.1444, "step": 7000 },
    { "epoch": 0.23926116153318552, "grad_norm": 127509.96875, "learning_rate": 7.500000000000001e-05, "loss": 4.5906, "step": 7500 },
    { "epoch": 0.2552119056353979, "grad_norm": 97949.8828125, "learning_rate": 8e-05, "loss": 3.9702, "step": 8000 },
    { "epoch": 0.2711626497376103, "grad_norm": 89478.4296875, "learning_rate": 8.5e-05, "loss": 3.5577, "step": 8500 },
    { "epoch": 0.28711339383982265, "grad_norm": 94187.8984375, "learning_rate": 9e-05, "loss": 3.302, "step": 9000 },
    { "epoch": 0.303064137942035, "grad_norm": 83401.4765625, "learning_rate": 9.5e-05, "loss": 3.122, "step": 9500 },
    { "epoch": 0.3190148820442474, "grad_norm": 80432.59375, "learning_rate": 0.0001, "loss": 2.9716, "step": 10000 },
    { "epoch": 0.3190148820442474, "eval_loss": 2.7994558811187744, "eval_runtime": 4941.5974, "eval_samples_per_second": 202.986, "eval_steps_per_second": 1.586, "step": 10000 },
    { "epoch": 0.33496562614645975, "grad_norm": 71825.7421875, "learning_rate": 9.994949494949496e-05, "loss": 2.8485, "step": 10500 },
    { "epoch": 0.3509163702486721, "grad_norm": 76352.0625, "learning_rate": 9.98989898989899e-05, "loss": 2.7498, "step": 11000 },
    { "epoch": 0.3668671143508845, "grad_norm": 74371.5703125, "learning_rate": 9.984848484848486e-05, "loss": 2.6733, "step": 11500 },
    { "epoch": 0.38281785845309685, "grad_norm": 68137.9609375, "learning_rate": 9.97979797979798e-05, "loss": 2.5987, "step": 12000 },
    { "epoch": 0.3987686025553092, "grad_norm": 64699.58984375, "learning_rate": 9.974747474747475e-05, "loss": 2.5405, "step": 12500 },
    { "epoch": 0.4147193466575216, "grad_norm": 69361.3046875, "learning_rate": 9.96969696969697e-05, "loss": 2.4881, "step": 13000 },
    { "epoch": 0.43067009075973395, "grad_norm": 69834.671875, "learning_rate": 9.964646464646466e-05, "loss": 2.446, "step": 13500 },
    { "epoch": 0.4466208348619463, "grad_norm": 57975.98828125, "learning_rate": 9.95959595959596e-05, "loss": 2.3991, "step": 14000 },
    { "epoch": 0.4625715789641587, "grad_norm": 67055.9375, "learning_rate": 9.954545454545455e-05, "loss": 2.3637, "step": 14500 },
    { "epoch": 0.47852232306637105, "grad_norm": 62929.25390625, "learning_rate": 9.94949494949495e-05, "loss": 2.3294, "step": 15000 },
    { "epoch": 0.4944730671685834, "grad_norm": 65925.8828125, "learning_rate": 9.944444444444446e-05, "loss": 2.3016, "step": 15500 },
    { "epoch": 0.5104238112707958, "grad_norm": 59559.88671875, "learning_rate": 9.939393939393939e-05, "loss": 2.2717, "step": 16000 },
    { "epoch": 0.5263745553730081, "grad_norm": 63280.40234375, "learning_rate": 9.934343434343435e-05, "loss": 2.2462, "step": 16500 },
    { "epoch": 0.5423252994752206, "grad_norm": 61513.0859375, "learning_rate": 9.92929292929293e-05, "loss": 2.2214, "step": 17000 },
    { "epoch": 0.5582760435774329, "grad_norm": 63240.92578125, "learning_rate": 9.924242424242425e-05, "loss": 2.2013, "step": 17500 },
    { "epoch": 0.5742267876796453, "grad_norm": 61634.92578125, "learning_rate": 9.919191919191919e-05, "loss": 2.1798, "step": 18000 },
    { "epoch": 0.5901775317818576, "grad_norm": 56590.8203125, "learning_rate": 9.914141414141415e-05, "loss": 2.1557, "step": 18500 },
    { "epoch": 0.60612827588407, "grad_norm": 59655.97265625, "learning_rate": 9.909090909090911e-05, "loss": 2.14, "step": 19000 },
    { "epoch": 0.6220790199862823, "grad_norm": 60380.125, "learning_rate": 9.904040404040404e-05, "loss": 2.123, "step": 19500 },
    { "epoch": 0.6380297640884948, "grad_norm": 56893.875, "learning_rate": 9.8989898989899e-05, "loss": 2.1062, "step": 20000 },
    { "epoch": 0.6380297640884948, "eval_loss": 2.007786512374878, "eval_runtime": 4985.2285, "eval_samples_per_second": 201.209, "eval_steps_per_second": 1.572, "step": 20000 },
    { "epoch": 0.6539805081907071, "grad_norm": 60312.33203125, "learning_rate": 9.893939393939395e-05, "loss": 2.0889, "step": 20500 },
    { "epoch": 0.6699312522929195, "grad_norm": 58157.28125, "learning_rate": 9.888888888888889e-05, "loss": 2.0776, "step": 21000 },
    { "epoch": 0.6858819963951318, "grad_norm": 58377.234375, "learning_rate": 9.883838383838384e-05, "loss": 2.0646, "step": 21500 },
    { "epoch": 0.7018327404973442, "grad_norm": 57768.578125, "learning_rate": 9.87878787878788e-05, "loss": 2.0493, "step": 22000 },
    { "epoch": 0.7177834845995565, "grad_norm": 56948.41796875, "learning_rate": 9.873737373737374e-05, "loss": 2.0339, "step": 22500 },
    { "epoch": 0.733734228701769, "grad_norm": 58923.421875, "learning_rate": 9.868686868686869e-05, "loss": 2.0256, "step": 23000 },
    { "epoch": 0.7496849728039813, "grad_norm": 60381.23828125, "learning_rate": 9.863636363636364e-05, "loss": 2.0108, "step": 23500 },
    { "epoch": 0.7656357169061937, "grad_norm": 55685.12890625, "learning_rate": 9.85858585858586e-05, "loss": 2.0011, "step": 24000 },
    { "epoch": 0.781586461008406, "grad_norm": 55655.4609375, "learning_rate": 9.853535353535353e-05, "loss": 1.9899, "step": 24500 },
    { "epoch": 0.7975372051106184, "grad_norm": 57943.70703125, "learning_rate": 9.848484848484849e-05, "loss": 1.9793, "step": 25000 },
    { "epoch": 0.8134879492128307, "grad_norm": 56282.34375, "learning_rate": 9.843434343434344e-05, "loss": 1.9666, "step": 25500 },
    { "epoch": 0.8294386933150432, "grad_norm": 56147.5390625, "learning_rate": 9.838383838383838e-05, "loss": 1.9575, "step": 26000 },
    { "epoch": 0.8453894374172555, "grad_norm": 54766.921875, "learning_rate": 9.833333333333333e-05, "loss": 1.9508, "step": 26500 },
    { "epoch": 0.8613401815194679, "grad_norm": 57243.0390625, "learning_rate": 9.828282828282829e-05, "loss": 1.9406, "step": 27000 },
    { "epoch": 0.8772909256216802, "grad_norm": 57537.25, "learning_rate": 9.823232323232325e-05, "loss": 1.9295, "step": 27500 },
    { "epoch": 0.8932416697238926, "grad_norm": 56535.60546875, "learning_rate": 9.818181818181818e-05, "loss": 1.9232, "step": 28000 },
    { "epoch": 0.9091924138261049, "grad_norm": 60243.4375, "learning_rate": 9.813131313131314e-05, "loss": 1.9139, "step": 28500 },
    { "epoch": 0.9251431579283174, "grad_norm": 54113.95703125, "learning_rate": 9.808080808080809e-05, "loss": 1.9067, "step": 29000 },
    { "epoch": 0.9410939020305297, "grad_norm": 56388.44921875, "learning_rate": 9.803030303030303e-05, "loss": 1.8986, "step": 29500 },
    { "epoch": 0.9570446461327421, "grad_norm": 55065.8203125, "learning_rate": 9.797979797979798e-05, "loss": 1.8912, "step": 30000 },
    { "epoch": 0.9570446461327421, "eval_loss": 1.7991323471069336, "eval_runtime": 4913.8629, "eval_samples_per_second": 204.132, "eval_steps_per_second": 1.595, "step": 30000 },
    { "epoch": 0.9729953902349545, "grad_norm": 55303.02734375, "learning_rate": 9.792929292929294e-05, "loss": 1.8815, "step": 30500 },
    { "epoch": 0.9889461343371668, "grad_norm": 61240.37109375, "learning_rate": 9.787878787878789e-05, "loss": 1.876, "step": 31000 },
    { "epoch": 1.0048968784393792, "grad_norm": 54453.8359375, "learning_rate": 9.782828282828283e-05, "loss": 1.8677, "step": 31500 },
    { "epoch": 1.0208476225415917, "grad_norm": 53728.48828125, "learning_rate": 9.777777777777778e-05, "loss": 1.8582, "step": 32000 },
    { "epoch": 1.0367983666438039, "grad_norm": 57865.578125, "learning_rate": 9.772727272727274e-05, "loss": 1.8532, "step": 32500 },
    { "epoch": 1.0527491107460163, "grad_norm": 57046.4609375, "learning_rate": 9.767676767676767e-05, "loss": 1.8473, "step": 33000 },
    { "epoch": 1.0686998548482287, "grad_norm": 57173.7265625, "learning_rate": 9.762626262626263e-05, "loss": 1.8394, "step": 33500 },
    { "epoch": 1.0846505989504411, "grad_norm": 55721.00390625, "learning_rate": 9.757575757575758e-05, "loss": 1.8346, "step": 34000 },
    { "epoch": 1.1006013430526533, "grad_norm": 56324.05859375, "learning_rate": 9.752525252525253e-05, "loss": 1.8319, "step": 34500 },
    { "epoch": 1.1165520871548658, "grad_norm": 55807.09765625, "learning_rate": 9.747474747474747e-05, "loss": 1.8231, "step": 35000 },
    { "epoch": 1.1325028312570782, "grad_norm": 53290.69140625, "learning_rate": 9.742424242424243e-05, "loss": 1.815, "step": 35500 },
    { "epoch": 1.1484535753592906, "grad_norm": 55672.96875, "learning_rate": 9.737373737373738e-05, "loss": 1.8117, "step": 36000 },
    { "epoch": 1.1644043194615028, "grad_norm": 53640.85546875, "learning_rate": 9.732323232323232e-05, "loss": 1.8069, "step": 36500 },
    { "epoch": 1.1803550635637152, "grad_norm": 58046.828125, "learning_rate": 9.727272727272728e-05, "loss": 1.801, "step": 37000 },
    { "epoch": 1.1963058076659276, "grad_norm": 57318.890625, "learning_rate": 9.722222222222223e-05, "loss": 1.7943, "step": 37500 },
    { "epoch": 1.21225655176814, "grad_norm": 57652.68359375, "learning_rate": 9.717171717171718e-05, "loss": 1.7907, "step": 38000 },
    { "epoch": 1.2282072958703523, "grad_norm": 54140.81640625, "learning_rate": 9.712121212121212e-05, "loss": 1.7837, "step": 38500 },
    { "epoch": 1.2441580399725647, "grad_norm": 55963.265625, "learning_rate": 9.707070707070708e-05, "loss": 1.7804, "step": 39000 },
    { "epoch": 1.260108784074777, "grad_norm": 56717.23828125, "learning_rate": 9.702020202020202e-05, "loss": 1.7779, "step": 39500 },
    { "epoch": 1.2760595281769895, "grad_norm": 54491.91796875, "learning_rate": 9.696969696969698e-05, "loss": 1.7704, "step": 40000 },
    { "epoch": 1.2760595281769895, "eval_loss": 1.684906005859375, "eval_runtime": 4951.5896, "eval_samples_per_second": 202.576, "eval_steps_per_second": 1.583, "step": 40000 },
    { "epoch": 1.2920262230233042, "grad_norm": 53837.6640625, "learning_rate": 9.691919191919192e-05, "loss": 1.7651, "step": 40500 },
    { "epoch": 1.3079769671255164, "grad_norm": 54601.4765625, "learning_rate": 9.686868686868688e-05, "loss": 1.759, "step": 41000 },
    { "epoch": 1.3239277112277288, "grad_norm": 52770.5078125, "learning_rate": 9.681818181818181e-05, "loss": 1.7568, "step": 41500 },
    { "epoch": 1.3398784553299412, "grad_norm": 56085.59765625, "learning_rate": 9.676767676767677e-05, "loss": 1.7522, "step": 42000 },
    { "epoch": 1.3558291994321534, "grad_norm": 55068.359375, "learning_rate": 9.671717171717172e-05, "loss": 1.7506, "step": 42500 },
    { "epoch": 1.3717799435343658, "grad_norm": 57091.0546875, "learning_rate": 9.666666666666667e-05, "loss": 1.7419, "step": 43000 },
    { "epoch": 1.3877306876365783, "grad_norm": 52158.63671875, "learning_rate": 9.661616161616161e-05, "loss": 1.7395, "step": 43500 },
    { "epoch": 1.4036814317387907, "grad_norm": 54727.2890625, "learning_rate": 9.656565656565657e-05, "loss": 1.7351, "step": 44000 },
    { "epoch": 1.419632175841003, "grad_norm": 59289.56640625, "learning_rate": 9.651515151515152e-05, "loss": 1.731, "step": 44500 },
    { "epoch": 1.4355829199432153, "grad_norm": 57592.5546875, "learning_rate": 9.646464646464647e-05, "loss": 1.7268, "step": 45000 },
    { "epoch": 1.4515336640454277, "grad_norm": 56016.44921875, "learning_rate": 9.641414141414143e-05, "loss": 1.723, "step": 45500 },
    { "epoch": 1.4674844081476401, "grad_norm": 55521.9765625, "learning_rate": 9.636363636363637e-05, "loss": 1.7171, "step": 46000 },
    { "epoch": 1.4834351522498523, "grad_norm": 56532.88671875, "learning_rate": 9.631313131313132e-05, "loss": 1.7146, "step": 46500 },
    { "epoch": 1.4993858963520648, "grad_norm": 53762.78515625, "learning_rate": 9.626262626262627e-05, "loss": 1.7107, "step": 47000 },
    { "epoch": 1.5153366404542772, "grad_norm": 54982.08203125, "learning_rate": 9.621212121212123e-05, "loss": 1.7085, "step": 47500 },
    { "epoch": 1.5312873845564896, "grad_norm": 57135.71484375, "learning_rate": 9.616161616161616e-05, "loss": 1.7063, "step": 48000 },
    { "epoch": 1.547238128658702, "grad_norm": 55469.52734375, "learning_rate": 9.611111111111112e-05, "loss": 1.7021, "step": 48500 },
    { "epoch": 1.5631888727609144, "grad_norm": 55527.140625, "learning_rate": 9.606060606060606e-05, "loss": 1.6959, "step": 49000 },
    { "epoch": 1.5791396168631266, "grad_norm": 55039.515625, "learning_rate": 9.601010101010101e-05, "loss": 1.6953, "step": 49500 },
    { "epoch": 1.595090360965339, "grad_norm": 58584.22265625, "learning_rate": 9.595959595959596e-05, "loss": 1.6916, "step": 50000 },
    { "epoch": 1.595090360965339, "eval_loss": 1.60930597782135, "eval_runtime": 4918.9878, "eval_samples_per_second": 203.919, "eval_steps_per_second": 1.593, "step": 50000 },
    { "epoch": 1.6110411050675513, "grad_norm": 55009.78125, "learning_rate": 9.590909090909092e-05, "loss": 1.685, "step": 50500 },
    { "epoch": 1.6269918491697637, "grad_norm": 53817.55078125, "learning_rate": 9.585858585858586e-05, "loss": 1.684, "step": 51000 },
    { "epoch": 1.6429425932719761, "grad_norm": 53155.3359375, "learning_rate": 9.580808080808081e-05, "loss": 1.6813, "step": 51500 },
    { "epoch": 1.6588933373741885, "grad_norm": 55288.33984375, "learning_rate": 9.575757575757576e-05, "loss": 1.6776, "step": 52000 },
    { "epoch": 1.674844081476401, "grad_norm": 58538.25, "learning_rate": 9.570707070707072e-05, "loss": 1.6746, "step": 52500 },
    { "epoch": 1.6907948255786134, "grad_norm": 54227.25390625, "learning_rate": 9.565656565656566e-05, "loss": 1.6715, "step": 53000 },
    { "epoch": 1.7067455696808256, "grad_norm": 56011.28125, "learning_rate": 9.560606060606061e-05, "loss": 1.6701, "step": 53500 },
    { "epoch": 1.722696313783038, "grad_norm": 51203.140625, "learning_rate": 9.555555555555557e-05, "loss": 1.6695, "step": 54000 },
    { "epoch": 1.7386470578852502, "grad_norm": 53641.3203125, "learning_rate": 9.550505050505051e-05, "loss": 1.6613, "step": 54500 },
    { "epoch": 1.7545978019874626, "grad_norm": 55869.0234375, "learning_rate": 9.545454545454546e-05, "loss": 1.6598, "step": 55000 },
    { "epoch": 1.770548546089675, "grad_norm": 54982.5546875, "learning_rate": 9.540404040404041e-05, "loss": 1.6595, "step": 55500 },
    { "epoch": 1.7864992901918875, "grad_norm": 55537.86328125, "learning_rate": 9.535353535353537e-05, "loss": 1.6544, "step": 56000 },
    { "epoch": 1.8024500342941, "grad_norm": 52897.34375, "learning_rate": 9.53030303030303e-05, "loss": 1.6512, "step": 56500 },
    { "epoch": 1.8184007783963123, "grad_norm": 55306.94921875, "learning_rate": 9.525252525252526e-05, "loss": 1.6493, "step": 57000 },
    { "epoch": 1.8343515224985245, "grad_norm": 55051.29296875, "learning_rate": 9.52020202020202e-05, "loss": 1.6453, "step": 57500 },
    { "epoch": 1.850302266600737, "grad_norm": 54118.21484375, "learning_rate": 9.515151515151515e-05, "loss": 1.6441, "step": 58000 },
    { "epoch": 1.8662530107029491, "grad_norm": 56088.76171875, "learning_rate": 9.51010101010101e-05, "loss": 1.6429, "step": 58500 },
    { "epoch": 1.8822037548051616, "grad_norm": 52600.8359375, "learning_rate": 9.505050505050506e-05, "loss": 1.636, "step": 59000 },
    { "epoch": 1.898154498907374, "grad_norm": 55023.9375, "learning_rate": 9.5e-05, "loss": 1.6333, "step": 59500 },
    { "epoch": 1.9141052430095864, "grad_norm": 54295.30859375, "learning_rate": 9.494949494949495e-05, "loss": 1.6335, "step": 60000 },
    { "epoch": 1.9141052430095864, "eval_loss": 1.5532509088516235, "eval_runtime": 4919.5325, "eval_samples_per_second": 203.896, "eval_steps_per_second": 1.593, "step": 60000 },
    { "epoch": 1.9300559871117988, "grad_norm": 55173.84765625, "learning_rate": 9.48989898989899e-05, "loss": 1.63, "step": 60500 },
    { "epoch": 1.9460067312140112, "grad_norm": 53612.4921875, "learning_rate": 9.484848484848486e-05, "loss": 1.6286, "step": 61000 },
    { "epoch": 1.9619574753162237, "grad_norm": 55060.9140625, "learning_rate": 9.47979797979798e-05, "loss": 1.6263, "step": 61500 },
    { "epoch": 1.9779082194184359, "grad_norm": 54993.5625, "learning_rate": 9.474747474747475e-05, "loss": 1.6231, "step": 62000 },
    { "epoch": 1.9938589635206483, "grad_norm": 55949.11328125, "learning_rate": 9.469696969696971e-05, "loss": 1.6228, "step": 62500 },
    { "epoch": 2.0098097076228605, "grad_norm": 54883.53515625, "learning_rate": 9.464646464646464e-05, "loss": 1.6197, "step": 63000 },
    { "epoch": 2.025760451725073, "grad_norm": 57476.609375, "learning_rate": 9.45959595959596e-05, "loss": 1.6145, "step": 63500 },
    { "epoch": 2.0417111958272853, "grad_norm": 55516.81640625, "learning_rate": 9.454545454545455e-05, "loss": 1.6126, "step": 64000 },
    { "epoch": 2.0576619399294978, "grad_norm": 53755.71484375, "learning_rate": 9.449494949494951e-05, "loss": 1.6109, "step": 64500 },
    { "epoch": 2.07361268403171, "grad_norm": 54177.87890625, "learning_rate": 9.444444444444444e-05, "loss": 1.6105, "step": 65000 },
    { "epoch": 2.0895634281339226, "grad_norm": 56828.4765625, "learning_rate": 9.43939393939394e-05, "loss": 1.6075, "step": 65500 },
    { "epoch": 2.105514172236135, "grad_norm": 55464.76171875, "learning_rate": 9.434343434343435e-05, "loss": 1.606, "step": 66000 },
    { "epoch": 2.121464916338347, "grad_norm": 56928.1015625, "learning_rate": 9.42929292929293e-05, "loss": 1.6023, "step": 66500 },
    { "epoch": 2.1374156604405594, "grad_norm": 51041.0, "learning_rate": 9.424242424242424e-05, "loss": 1.6, "step": 67000 },
    { "epoch": 2.153366404542772, "grad_norm": 56379.828125, "learning_rate": 9.41919191919192e-05, "loss": 1.6003, "step": 67500 },
    { "epoch": 2.1693171486449843, "grad_norm": 55763.93359375, "learning_rate": 9.414141414141415e-05, "loss": 1.596, "step": 68000 },
    { "epoch": 2.1852678927471967, "grad_norm": 53785.359375, "learning_rate": 9.40909090909091e-05, "loss": 1.5932, "step": 68500 },
    { "epoch": 2.201218636849409, "grad_norm": 56003.0703125, "learning_rate": 9.404040404040404e-05, "loss": 1.5928, "step": 69000 },
    { "epoch": 2.2171693809516215, "grad_norm": 55811.99609375, "learning_rate": 9.3989898989899e-05, "loss": 1.5919, "step": 69500 },
    { "epoch": 2.233120125053834, "grad_norm": 56307.57421875, "learning_rate": 9.393939393939395e-05, "loss": 1.5869, "step": 70000 },
    { "epoch": 2.233120125053834, "eval_loss": 1.5101096630096436, "eval_runtime": 4917.8928, "eval_samples_per_second": 203.964, "eval_steps_per_second": 1.594, "step": 70000 },
    { "epoch": 2.249070869156046, "grad_norm": 53560.9296875, "learning_rate": 9.388888888888889e-05, "loss": 1.5883, "step": 70500 },
    { "epoch": 2.2650216132582583, "grad_norm": 56536.40625, "learning_rate": 9.383838383838385e-05, "loss": 1.585, "step": 71000 },
    { "epoch": 2.2809723573604708, "grad_norm": 54454.96875, "learning_rate": 9.378787878787879e-05, "loss": 1.582, "step": 71500 },
    { "epoch": 2.296923101462683, "grad_norm": 55888.09375, "learning_rate": 9.373737373737375e-05, "loss": 1.5776, "step": 72000 },
    { "epoch": 2.3128738455648956, "grad_norm": 55370.46484375, "learning_rate": 9.368686868686869e-05, "loss": 1.5781, "step": 72500 },
    { "epoch": 2.328824589667108, "grad_norm": 56668.328125, "learning_rate": 9.363636363636364e-05, "loss": 1.5779, "step": 73000 },
    { "epoch": 2.3447753337693205, "grad_norm": 56674.85546875, "learning_rate": 9.358585858585858e-05, "loss": 1.5724, "step": 73500 },
    { "epoch": 2.360726077871533, "grad_norm": 59070.734375, "learning_rate": 9.353535353535354e-05, "loss": 1.5737, "step": 74000 },
    { "epoch": 2.376676821973745, "grad_norm": 55701.37109375, "learning_rate": 9.348484848484849e-05, "loss": 1.5702, "step": 74500 },
    { "epoch": 2.3926275660759573, "grad_norm": 54837.890625, "learning_rate": 9.343434343434344e-05, "loss": 1.5691, "step": 75000 },
    { "epoch": 2.4085783101781697, "grad_norm": 55847.98046875, "learning_rate": 9.338383838383838e-05, "loss": 1.5692, "step": 75500 },
    { "epoch": 2.424529054280382, "grad_norm": 53633.03125, "learning_rate": 9.333333333333334e-05, "loss": 1.5639, "step": 76000 },
    { "epoch": 2.4404797983825945, "grad_norm": 55944.12890625, "learning_rate": 9.328282828282829e-05, "loss": 1.5664, "step": 76500 },
    { "epoch": 2.456430542484807, "grad_norm": 53979.30859375, "learning_rate": 9.323232323232324e-05, "loss": 1.5635, "step": 77000 },
    { "epoch": 2.4723812865870194, "grad_norm": 56014.97265625, "learning_rate": 9.318181818181818e-05, "loss": 1.5601, "step": 77500 },
    { "epoch": 2.488332030689232, "grad_norm": 55291.9140625, "learning_rate": 9.313131313131314e-05, "loss": 1.5618, "step": 78000 },
    { "epoch": 2.5042827747914442, "grad_norm": 53215.24609375, "learning_rate": 9.308080808080809e-05, "loss": 1.5576, "step": 78500 },
    { "epoch": 2.520233518893656, "grad_norm": 56197.3203125, "learning_rate": 9.303030303030303e-05, "loss": 1.554, "step": 79000 },
    { "epoch": 2.5361842629958686, "grad_norm": 55106.9765625, "learning_rate": 9.2979797979798e-05, "loss": 1.5545, "step": 79500 },
    { "epoch": 2.552135007098081, "grad_norm": 54552.08984375, "learning_rate": 9.292929292929293e-05, "loss": 1.5529, "step": 80000 },
    { "epoch": 2.552135007098081, "eval_loss": 1.4744161367416382, "eval_runtime": 4899.8032, "eval_samples_per_second": 204.717, "eval_steps_per_second": 1.599, "step": 80000 },
    { "epoch": 2.5681017019443955, "grad_norm": 56671.71875, "learning_rate": 9.287878787878789e-05, "loss": 1.5516, "step": 80500 },
    { "epoch": 2.5840524460466083, "grad_norm": 54676.5078125, "learning_rate": 9.282828282828283e-05, "loss": 1.5501, "step": 81000 },
    { "epoch": 2.6000031901488203, "grad_norm": 56587.88671875, "learning_rate": 9.277777777777778e-05, "loss": 1.549, "step": 81500 },
    { "epoch": 2.6159539342510327, "grad_norm": 56696.6484375, "learning_rate": 9.272727272727273e-05, "loss": 1.5448, "step": 82000 },
    { "epoch": 2.631904678353245, "grad_norm": 55145.359375, "learning_rate": 9.267676767676769e-05, "loss": 1.5439, "step": 82500 },
    { "epoch": 2.6478554224554576, "grad_norm": 55258.8828125, "learning_rate": 9.262626262626263e-05, "loss": 1.5461, "step": 83000 },
    { "epoch": 2.66380616655767, "grad_norm": 55989.4921875, "learning_rate": 9.257575757575758e-05, "loss": 1.5423, "step": 83500 },
    { "epoch": 2.6797569106598824, "grad_norm": 54569.66015625, "learning_rate": 9.252525252525253e-05, "loss": 1.5432, "step": 84000 },
    { "epoch": 2.695707654762095, "grad_norm": 56487.32421875, "learning_rate": 9.247474747474749e-05, "loss": 1.5406, "step": 84500 },
    { "epoch": 2.711658398864307, "grad_norm": 55735.38671875, "learning_rate": 9.242424242424242e-05, "loss": 1.5385, "step": 85000 },
    { "epoch": 2.7276091429665192, "grad_norm": 55246.734375, "learning_rate": 9.237373737373738e-05, "loss": 1.5385, "step": 85500 },
    { "epoch": 2.7435598870687317, "grad_norm": 54426.76171875, "learning_rate": 9.232323232323232e-05, "loss": 1.5354, "step": 86000 },
    { "epoch": 2.759510631170944, "grad_norm": 56496.05859375, "learning_rate": 9.227272727272727e-05, "loss": 1.5354, "step": 86500 },
    { "epoch": 2.7754613752731565, "grad_norm": 54483.66796875, "learning_rate": 9.222222222222223e-05, "loss": 1.5314, "step": 87000 },
    { "epoch": 2.791412119375369, "grad_norm": 56842.0, "learning_rate": 9.217171717171718e-05, "loss": 1.5307, "step": 87500 },
    { "epoch": 2.8073628634775813, "grad_norm": 55707.21875, "learning_rate": 9.212121212121214e-05, "loss": 1.5299, "step": 88000 },
    { "epoch": 2.8233136075797933, "grad_norm": 54965.58984375, "learning_rate": 9.207070707070707e-05, "loss": 1.5292, "step": 88500 },
    { "epoch": 2.839264351682006, "grad_norm": 54280.4140625, "learning_rate": 9.202020202020203e-05, "loss": 1.5281, "step": 89000 },
    { "epoch": 2.855215095784218, "grad_norm": 56317.328125, "learning_rate": 9.196969696969698e-05, "loss": 1.5292, "step": 89500 },
    { "epoch": 2.8711658398864306, "grad_norm": 56429.265625, "learning_rate": 9.191919191919192e-05, "loss": 1.5248, "step": 90000 },
    { "epoch": 2.8711658398864306, "eval_loss": 1.4486085176467896, "eval_runtime": 4949.3767, "eval_samples_per_second": 202.667, "eval_steps_per_second": 1.583, "step": 90000 },
    { "epoch": 2.887116583988643, "grad_norm": 60500.609375, "learning_rate": 9.186868686868687e-05, "loss": 1.5244, "step": 90500 },
    { "epoch": 2.9030673280908554, "grad_norm": 56278.77734375, "learning_rate": 9.181818181818183e-05, "loss": 1.5228, "step": 91000 },
    { "epoch": 2.919018072193068, "grad_norm": 55179.0859375, "learning_rate": 9.176767676767677e-05, "loss": 1.5228, "step": 91500 },
    { "epoch": 2.9349688162952803, "grad_norm": 58535.69140625, "learning_rate": 9.171717171717172e-05, "loss": 1.5207, "step": 92000 },
    { "epoch": 2.9509195603974927, "grad_norm": 56315.96484375, "learning_rate": 9.166666666666667e-05, "loss": 1.5205, "step": 92500 },
    { "epoch": 2.9668703044997047, "grad_norm": 57173.87109375, "learning_rate": 9.161616161616163e-05, "loss": 1.5165, "step": 93000 },
    { "epoch": 2.9828210486019175, "grad_norm": 58435.5, "learning_rate": 9.156565656565656e-05, "loss": 1.516, "step": 93500 },
    { "epoch": 2.9987717927041295, "grad_norm": 55689.07421875, "learning_rate": 9.151515151515152e-05, "loss": 1.5155, "step": 94000 },
    { "epoch": 3.014722536806342, "grad_norm": 55691.28515625, "learning_rate": 9.146464646464647e-05, "loss": 1.5124, "step": 94500 },
    { "epoch": 3.0306732809085544, "grad_norm": 55951.5546875, "learning_rate": 9.141414141414141e-05, "loss": 1.5098, "step": 95000 },
    { "epoch": 3.046624025010767, "grad_norm": 56781.6328125, "learning_rate": 9.136363636363637e-05, "loss": 1.5107, "step": 95500 },
    { "epoch": 3.062574769112979, "grad_norm": 56111.234375, "learning_rate": 9.131313131313132e-05, "loss": 1.5072, "step": 96000 },
    { "epoch": 3.0785255132151916, "grad_norm": 55225.51171875, "learning_rate": 9.126262626262627e-05, "loss": 1.5089, "step": 96500 },
    { "epoch": 3.094476257317404, "grad_norm": 57983.26171875, "learning_rate": 9.121212121212121e-05, "loss": 1.5068, "step": 97000 },
    { "epoch": 3.110427001419616, "grad_norm": 55611.953125, "learning_rate": 9.116161616161617e-05, "loss": 1.5039, "step": 97500 },
    { "epoch": 3.1263777455218285, "grad_norm": 56500.25390625, "learning_rate": 9.111111111111112e-05, "loss": 1.5023, "step": 98000 },
    { "epoch": 3.142328489624041, "grad_norm": 55070.7578125, "learning_rate": 9.106060606060606e-05, "loss": 1.5025, "step": 98500 },
    { "epoch": 3.1582792337262533, "grad_norm": 56306.3203125, "learning_rate": 9.101010101010101e-05, "loss": 1.501, "step": 99000 },
    { "epoch": 3.1742299778284657, "grad_norm": 56296.40234375, "learning_rate": 9.095959595959597e-05, "loss": 1.4999, "step": 99500 },
    { "epoch": 3.190180721930678, "grad_norm": 56137.7265625, "learning_rate": 9.090909090909092e-05, "loss": 1.5015, "step": 100000 },
    { "epoch": 3.190180721930678, "eval_loss": 1.4243189096450806, "eval_runtime": 4946.6072, "eval_samples_per_second": 202.78, "eval_steps_per_second": 1.584, "step": 100000 },
    { "epoch": 3.2061314660328906, "grad_norm": 57145.66796875, "learning_rate": 9.085858585858586e-05, "loss": 1.4976, "step": 100500 },
    { "epoch": 3.222082210135103, "grad_norm": 55889.25, "learning_rate": 9.080808080808081e-05, "loss": 1.4946, "step": 101000 },
    { "epoch": 3.238032954237315, "grad_norm": 54433.96875, "learning_rate": 9.075757575757577e-05, "loss": 1.4985, "step": 101500 },
    { "epoch": 3.2539836983395274, "grad_norm": 59956.953125, "learning_rate": 9.07070707070707e-05, "loss": 1.4926, "step": 102000 },
    { "epoch": 3.26993444244174, "grad_norm": 55148.5703125, "learning_rate": 9.065656565656566e-05, "loss": 1.4935, "step": 102500 },
    { "epoch": 3.2858851865439522, "grad_norm": 58131.62890625, "learning_rate": 9.060606060606061e-05, "loss": 1.4936, "step": 103000 },
    { "epoch": 3.3018359306461647, "grad_norm": 61794.17578125, "learning_rate": 9.055555555555556e-05, "loss": 1.4922, "step": 103500 },
    { "epoch": 3.317786674748377, "grad_norm": 56916.46875, "learning_rate": 9.050505050505052e-05, "loss": 1.4915, "step": 104000 },
    { "epoch": 3.3337374188505895, "grad_norm": 56791.9765625, "learning_rate": 9.045454545454546e-05, "loss": 1.4885, "step": 104500 },
    { "epoch": 3.349688162952802, "grad_norm": 59157.26171875, "learning_rate": 9.040404040404041e-05, "loss": 1.4892, "step": 105000 },
    { "epoch": 3.365638907055014, "grad_norm": 57222.37890625, "learning_rate": 9.035353535353535e-05, "loss": 1.4896, "step": 105500 },
    { "epoch": 3.3815896511572263, "grad_norm": 59154.6171875, "learning_rate": 9.030303030303031e-05, "loss": 1.4846, "step": 106000 },
    { "epoch": 3.3975403952594387, "grad_norm": 55996.48046875, "learning_rate": 9.025252525252526e-05, "loss": 1.4866, "step": 106500 },
    { "epoch": 3.413491139361651, "grad_norm": 58967.2578125, "learning_rate": 9.02020202020202e-05, "loss": 1.4846, "step": 107000 },
    { "epoch": 3.4294418834638636, "grad_norm": 58440.421875, "learning_rate": 9.015151515151515e-05, "loss": 1.4851, "step": 107500 },
    { "epoch": 3.445392627566076, "grad_norm": 55482.94140625, "learning_rate": 9.010101010101011e-05, "loss": 1.4862, "step": 108000 },
    { "epoch": 3.4613433716682884, "grad_norm": 59205.22265625, "learning_rate": 9.005050505050505e-05, "loss": 1.4829, "step": 108500 },
    { "epoch": 3.477294115770501, "grad_norm": 54196.10546875, "learning_rate": 9e-05, "loss": 1.482, "step": 109000 },
    { "epoch": 3.4932448598727133, "grad_norm": 54923.30078125, "learning_rate": 8.994949494949495e-05, "loss": 1.4797, "step": 109500 },
    { "epoch": 3.5091956039749252, "grad_norm": 54844.94140625, "learning_rate": 8.98989898989899e-05, "loss": 1.4788, "step": 110000 },
    { "epoch": 3.5091956039749252, "eval_loss": 1.4024385213851929, "eval_runtime": 4946.3399, "eval_samples_per_second": 202.791, "eval_steps_per_second": 1.584, "step": 110000 }
  ],
  "logging_steps": 500,
  "max_steps": 1000000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 32,
  "save_steps": 10000,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 7.413478046369151e+18,
  "train_batch_size": 128,
  "trial_name": null,
  "trial_params": null
}