{
"best_metric": 1.4024385213851929,
"best_model_checkpoint": "./outputs/202410/no_safetensors/checkpoint/checkpoint-110000",
"epoch": 3.5091956039749252,
"eval_steps": 10000,
"global_step": 110000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.01595074410221237,
"grad_norm": 108751.9921875,
"learning_rate": 5e-06,
"loss": 9.707,
"step": 500
},
{
"epoch": 0.03190148820442474,
"grad_norm": 44635.21484375,
"learning_rate": 1e-05,
"loss": 7.8592,
"step": 1000
},
{
"epoch": 0.047852232306637106,
"grad_norm": 56782.453125,
"learning_rate": 1.5e-05,
"loss": 7.2609,
"step": 1500
},
{
"epoch": 0.06380297640884948,
"grad_norm": 96195.9140625,
"learning_rate": 2e-05,
"loss": 7.1215,
"step": 2000
},
{
"epoch": 0.07975372051106185,
"grad_norm": 108495.3828125,
"learning_rate": 2.5e-05,
"loss": 7.0039,
"step": 2500
},
{
"epoch": 0.09570446461327421,
"grad_norm": 69317.0625,
"learning_rate": 3e-05,
"loss": 6.8924,
"step": 3000
},
{
"epoch": 0.11165520871548658,
"grad_norm": 112126.8203125,
"learning_rate": 3.5e-05,
"loss": 6.7924,
"step": 3500
},
{
"epoch": 0.12760595281769896,
"grad_norm": 94684.1640625,
"learning_rate": 4e-05,
"loss": 6.6991,
"step": 4000
},
{
"epoch": 0.14355669691991133,
"grad_norm": 99549.90625,
"learning_rate": 4.5e-05,
"loss": 6.6077,
"step": 4500
},
{
"epoch": 0.1595074410221237,
"grad_norm": 100152.6171875,
"learning_rate": 5e-05,
"loss": 6.4957,
"step": 5000
},
{
"epoch": 0.17545818512433606,
"grad_norm": 165780.328125,
"learning_rate": 5.500000000000001e-05,
"loss": 6.3014,
"step": 5500
},
{
"epoch": 0.19140892922654842,
"grad_norm": 132947.90625,
"learning_rate": 6e-05,
"loss": 5.9146,
"step": 6000
},
{
"epoch": 0.2073596733287608,
"grad_norm": 146624.34375,
"learning_rate": 6.500000000000001e-05,
"loss": 5.5084,
"step": 6500
},
{
"epoch": 0.22331041743097316,
"grad_norm": 145966.1875,
"learning_rate": 7e-05,
"loss": 5.1444,
"step": 7000
},
{
"epoch": 0.23926116153318552,
"grad_norm": 127509.96875,
"learning_rate": 7.500000000000001e-05,
"loss": 4.5906,
"step": 7500
},
{
"epoch": 0.2552119056353979,
"grad_norm": 97949.8828125,
"learning_rate": 8e-05,
"loss": 3.9702,
"step": 8000
},
{
"epoch": 0.2711626497376103,
"grad_norm": 89478.4296875,
"learning_rate": 8.5e-05,
"loss": 3.5577,
"step": 8500
},
{
"epoch": 0.28711339383982265,
"grad_norm": 94187.8984375,
"learning_rate": 9e-05,
"loss": 3.302,
"step": 9000
},
{
"epoch": 0.303064137942035,
"grad_norm": 83401.4765625,
"learning_rate": 9.5e-05,
"loss": 3.122,
"step": 9500
},
{
"epoch": 0.3190148820442474,
"grad_norm": 80432.59375,
"learning_rate": 0.0001,
"loss": 2.9716,
"step": 10000
},
{
"epoch": 0.3190148820442474,
"eval_loss": 2.7994558811187744,
"eval_runtime": 4941.5974,
"eval_samples_per_second": 202.986,
"eval_steps_per_second": 1.586,
"step": 10000
},
{
"epoch": 0.33496562614645975,
"grad_norm": 71825.7421875,
"learning_rate": 9.994949494949496e-05,
"loss": 2.8485,
"step": 10500
},
{
"epoch": 0.3509163702486721,
"grad_norm": 76352.0625,
"learning_rate": 9.98989898989899e-05,
"loss": 2.7498,
"step": 11000
},
{
"epoch": 0.3668671143508845,
"grad_norm": 74371.5703125,
"learning_rate": 9.984848484848486e-05,
"loss": 2.6733,
"step": 11500
},
{
"epoch": 0.38281785845309685,
"grad_norm": 68137.9609375,
"learning_rate": 9.97979797979798e-05,
"loss": 2.5987,
"step": 12000
},
{
"epoch": 0.3987686025553092,
"grad_norm": 64699.58984375,
"learning_rate": 9.974747474747475e-05,
"loss": 2.5405,
"step": 12500
},
{
"epoch": 0.4147193466575216,
"grad_norm": 69361.3046875,
"learning_rate": 9.96969696969697e-05,
"loss": 2.4881,
"step": 13000
},
{
"epoch": 0.43067009075973395,
"grad_norm": 69834.671875,
"learning_rate": 9.964646464646466e-05,
"loss": 2.446,
"step": 13500
},
{
"epoch": 0.4466208348619463,
"grad_norm": 57975.98828125,
"learning_rate": 9.95959595959596e-05,
"loss": 2.3991,
"step": 14000
},
{
"epoch": 0.4625715789641587,
"grad_norm": 67055.9375,
"learning_rate": 9.954545454545455e-05,
"loss": 2.3637,
"step": 14500
},
{
"epoch": 0.47852232306637105,
"grad_norm": 62929.25390625,
"learning_rate": 9.94949494949495e-05,
"loss": 2.3294,
"step": 15000
},
{
"epoch": 0.4944730671685834,
"grad_norm": 65925.8828125,
"learning_rate": 9.944444444444446e-05,
"loss": 2.3016,
"step": 15500
},
{
"epoch": 0.5104238112707958,
"grad_norm": 59559.88671875,
"learning_rate": 9.939393939393939e-05,
"loss": 2.2717,
"step": 16000
},
{
"epoch": 0.5263745553730081,
"grad_norm": 63280.40234375,
"learning_rate": 9.934343434343435e-05,
"loss": 2.2462,
"step": 16500
},
{
"epoch": 0.5423252994752206,
"grad_norm": 61513.0859375,
"learning_rate": 9.92929292929293e-05,
"loss": 2.2214,
"step": 17000
},
{
"epoch": 0.5582760435774329,
"grad_norm": 63240.92578125,
"learning_rate": 9.924242424242425e-05,
"loss": 2.2013,
"step": 17500
},
{
"epoch": 0.5742267876796453,
"grad_norm": 61634.92578125,
"learning_rate": 9.919191919191919e-05,
"loss": 2.1798,
"step": 18000
},
{
"epoch": 0.5901775317818576,
"grad_norm": 56590.8203125,
"learning_rate": 9.914141414141415e-05,
"loss": 2.1557,
"step": 18500
},
{
"epoch": 0.60612827588407,
"grad_norm": 59655.97265625,
"learning_rate": 9.909090909090911e-05,
"loss": 2.14,
"step": 19000
},
{
"epoch": 0.6220790199862823,
"grad_norm": 60380.125,
"learning_rate": 9.904040404040404e-05,
"loss": 2.123,
"step": 19500
},
{
"epoch": 0.6380297640884948,
"grad_norm": 56893.875,
"learning_rate": 9.8989898989899e-05,
"loss": 2.1062,
"step": 20000
},
{
"epoch": 0.6380297640884948,
"eval_loss": 2.007786512374878,
"eval_runtime": 4985.2285,
"eval_samples_per_second": 201.209,
"eval_steps_per_second": 1.572,
"step": 20000
},
{
"epoch": 0.6539805081907071,
"grad_norm": 60312.33203125,
"learning_rate": 9.893939393939395e-05,
"loss": 2.0889,
"step": 20500
},
{
"epoch": 0.6699312522929195,
"grad_norm": 58157.28125,
"learning_rate": 9.888888888888889e-05,
"loss": 2.0776,
"step": 21000
},
{
"epoch": 0.6858819963951318,
"grad_norm": 58377.234375,
"learning_rate": 9.883838383838384e-05,
"loss": 2.0646,
"step": 21500
},
{
"epoch": 0.7018327404973442,
"grad_norm": 57768.578125,
"learning_rate": 9.87878787878788e-05,
"loss": 2.0493,
"step": 22000
},
{
"epoch": 0.7177834845995565,
"grad_norm": 56948.41796875,
"learning_rate": 9.873737373737374e-05,
"loss": 2.0339,
"step": 22500
},
{
"epoch": 0.733734228701769,
"grad_norm": 58923.421875,
"learning_rate": 9.868686868686869e-05,
"loss": 2.0256,
"step": 23000
},
{
"epoch": 0.7496849728039813,
"grad_norm": 60381.23828125,
"learning_rate": 9.863636363636364e-05,
"loss": 2.0108,
"step": 23500
},
{
"epoch": 0.7656357169061937,
"grad_norm": 55685.12890625,
"learning_rate": 9.85858585858586e-05,
"loss": 2.0011,
"step": 24000
},
{
"epoch": 0.781586461008406,
"grad_norm": 55655.4609375,
"learning_rate": 9.853535353535353e-05,
"loss": 1.9899,
"step": 24500
},
{
"epoch": 0.7975372051106184,
"grad_norm": 57943.70703125,
"learning_rate": 9.848484848484849e-05,
"loss": 1.9793,
"step": 25000
},
{
"epoch": 0.8134879492128307,
"grad_norm": 56282.34375,
"learning_rate": 9.843434343434344e-05,
"loss": 1.9666,
"step": 25500
},
{
"epoch": 0.8294386933150432,
"grad_norm": 56147.5390625,
"learning_rate": 9.838383838383838e-05,
"loss": 1.9575,
"step": 26000
},
{
"epoch": 0.8453894374172555,
"grad_norm": 54766.921875,
"learning_rate": 9.833333333333333e-05,
"loss": 1.9508,
"step": 26500
},
{
"epoch": 0.8613401815194679,
"grad_norm": 57243.0390625,
"learning_rate": 9.828282828282829e-05,
"loss": 1.9406,
"step": 27000
},
{
"epoch": 0.8772909256216802,
"grad_norm": 57537.25,
"learning_rate": 9.823232323232325e-05,
"loss": 1.9295,
"step": 27500
},
{
"epoch": 0.8932416697238926,
"grad_norm": 56535.60546875,
"learning_rate": 9.818181818181818e-05,
"loss": 1.9232,
"step": 28000
},
{
"epoch": 0.9091924138261049,
"grad_norm": 60243.4375,
"learning_rate": 9.813131313131314e-05,
"loss": 1.9139,
"step": 28500
},
{
"epoch": 0.9251431579283174,
"grad_norm": 54113.95703125,
"learning_rate": 9.808080808080809e-05,
"loss": 1.9067,
"step": 29000
},
{
"epoch": 0.9410939020305297,
"grad_norm": 56388.44921875,
"learning_rate": 9.803030303030303e-05,
"loss": 1.8986,
"step": 29500
},
{
"epoch": 0.9570446461327421,
"grad_norm": 55065.8203125,
"learning_rate": 9.797979797979798e-05,
"loss": 1.8912,
"step": 30000
},
{
"epoch": 0.9570446461327421,
"eval_loss": 1.7991323471069336,
"eval_runtime": 4913.8629,
"eval_samples_per_second": 204.132,
"eval_steps_per_second": 1.595,
"step": 30000
},
{
"epoch": 0.9729953902349545,
"grad_norm": 55303.02734375,
"learning_rate": 9.792929292929294e-05,
"loss": 1.8815,
"step": 30500
},
{
"epoch": 0.9889461343371668,
"grad_norm": 61240.37109375,
"learning_rate": 9.787878787878789e-05,
"loss": 1.876,
"step": 31000
},
{
"epoch": 1.0048968784393792,
"grad_norm": 54453.8359375,
"learning_rate": 9.782828282828283e-05,
"loss": 1.8677,
"step": 31500
},
{
"epoch": 1.0208476225415917,
"grad_norm": 53728.48828125,
"learning_rate": 9.777777777777778e-05,
"loss": 1.8582,
"step": 32000
},
{
"epoch": 1.0367983666438039,
"grad_norm": 57865.578125,
"learning_rate": 9.772727272727274e-05,
"loss": 1.8532,
"step": 32500
},
{
"epoch": 1.0527491107460163,
"grad_norm": 57046.4609375,
"learning_rate": 9.767676767676767e-05,
"loss": 1.8473,
"step": 33000
},
{
"epoch": 1.0686998548482287,
"grad_norm": 57173.7265625,
"learning_rate": 9.762626262626263e-05,
"loss": 1.8394,
"step": 33500
},
{
"epoch": 1.0846505989504411,
"grad_norm": 55721.00390625,
"learning_rate": 9.757575757575758e-05,
"loss": 1.8346,
"step": 34000
},
{
"epoch": 1.1006013430526533,
"grad_norm": 56324.05859375,
"learning_rate": 9.752525252525253e-05,
"loss": 1.8319,
"step": 34500
},
{
"epoch": 1.1165520871548658,
"grad_norm": 55807.09765625,
"learning_rate": 9.747474747474747e-05,
"loss": 1.8231,
"step": 35000
},
{
"epoch": 1.1325028312570782,
"grad_norm": 53290.69140625,
"learning_rate": 9.742424242424243e-05,
"loss": 1.815,
"step": 35500
},
{
"epoch": 1.1484535753592906,
"grad_norm": 55672.96875,
"learning_rate": 9.737373737373738e-05,
"loss": 1.8117,
"step": 36000
},
{
"epoch": 1.1644043194615028,
"grad_norm": 53640.85546875,
"learning_rate": 9.732323232323232e-05,
"loss": 1.8069,
"step": 36500
},
{
"epoch": 1.1803550635637152,
"grad_norm": 58046.828125,
"learning_rate": 9.727272727272728e-05,
"loss": 1.801,
"step": 37000
},
{
"epoch": 1.1963058076659276,
"grad_norm": 57318.890625,
"learning_rate": 9.722222222222223e-05,
"loss": 1.7943,
"step": 37500
},
{
"epoch": 1.21225655176814,
"grad_norm": 57652.68359375,
"learning_rate": 9.717171717171718e-05,
"loss": 1.7907,
"step": 38000
},
{
"epoch": 1.2282072958703523,
"grad_norm": 54140.81640625,
"learning_rate": 9.712121212121212e-05,
"loss": 1.7837,
"step": 38500
},
{
"epoch": 1.2441580399725647,
"grad_norm": 55963.265625,
"learning_rate": 9.707070707070708e-05,
"loss": 1.7804,
"step": 39000
},
{
"epoch": 1.260108784074777,
"grad_norm": 56717.23828125,
"learning_rate": 9.702020202020202e-05,
"loss": 1.7779,
"step": 39500
},
{
"epoch": 1.2760595281769895,
"grad_norm": 54491.91796875,
"learning_rate": 9.696969696969698e-05,
"loss": 1.7704,
"step": 40000
},
{
"epoch": 1.2760595281769895,
"eval_loss": 1.684906005859375,
"eval_runtime": 4951.5896,
"eval_samples_per_second": 202.576,
"eval_steps_per_second": 1.583,
"step": 40000
},
{
"epoch": 1.2920262230233042,
"grad_norm": 53837.6640625,
"learning_rate": 9.691919191919192e-05,
"loss": 1.7651,
"step": 40500
},
{
"epoch": 1.3079769671255164,
"grad_norm": 54601.4765625,
"learning_rate": 9.686868686868688e-05,
"loss": 1.759,
"step": 41000
},
{
"epoch": 1.3239277112277288,
"grad_norm": 52770.5078125,
"learning_rate": 9.681818181818181e-05,
"loss": 1.7568,
"step": 41500
},
{
"epoch": 1.3398784553299412,
"grad_norm": 56085.59765625,
"learning_rate": 9.676767676767677e-05,
"loss": 1.7522,
"step": 42000
},
{
"epoch": 1.3558291994321534,
"grad_norm": 55068.359375,
"learning_rate": 9.671717171717172e-05,
"loss": 1.7506,
"step": 42500
},
{
"epoch": 1.3717799435343658,
"grad_norm": 57091.0546875,
"learning_rate": 9.666666666666667e-05,
"loss": 1.7419,
"step": 43000
},
{
"epoch": 1.3877306876365783,
"grad_norm": 52158.63671875,
"learning_rate": 9.661616161616161e-05,
"loss": 1.7395,
"step": 43500
},
{
"epoch": 1.4036814317387907,
"grad_norm": 54727.2890625,
"learning_rate": 9.656565656565657e-05,
"loss": 1.7351,
"step": 44000
},
{
"epoch": 1.419632175841003,
"grad_norm": 59289.56640625,
"learning_rate": 9.651515151515152e-05,
"loss": 1.731,
"step": 44500
},
{
"epoch": 1.4355829199432153,
"grad_norm": 57592.5546875,
"learning_rate": 9.646464646464647e-05,
"loss": 1.7268,
"step": 45000
},
{
"epoch": 1.4515336640454277,
"grad_norm": 56016.44921875,
"learning_rate": 9.641414141414143e-05,
"loss": 1.723,
"step": 45500
},
{
"epoch": 1.4674844081476401,
"grad_norm": 55521.9765625,
"learning_rate": 9.636363636363637e-05,
"loss": 1.7171,
"step": 46000
},
{
"epoch": 1.4834351522498523,
"grad_norm": 56532.88671875,
"learning_rate": 9.631313131313132e-05,
"loss": 1.7146,
"step": 46500
},
{
"epoch": 1.4993858963520648,
"grad_norm": 53762.78515625,
"learning_rate": 9.626262626262627e-05,
"loss": 1.7107,
"step": 47000
},
{
"epoch": 1.5153366404542772,
"grad_norm": 54982.08203125,
"learning_rate": 9.621212121212123e-05,
"loss": 1.7085,
"step": 47500
},
{
"epoch": 1.5312873845564896,
"grad_norm": 57135.71484375,
"learning_rate": 9.616161616161616e-05,
"loss": 1.7063,
"step": 48000
},
{
"epoch": 1.547238128658702,
"grad_norm": 55469.52734375,
"learning_rate": 9.611111111111112e-05,
"loss": 1.7021,
"step": 48500
},
{
"epoch": 1.5631888727609144,
"grad_norm": 55527.140625,
"learning_rate": 9.606060606060606e-05,
"loss": 1.6959,
"step": 49000
},
{
"epoch": 1.5791396168631266,
"grad_norm": 55039.515625,
"learning_rate": 9.601010101010101e-05,
"loss": 1.6953,
"step": 49500
},
{
"epoch": 1.595090360965339,
"grad_norm": 58584.22265625,
"learning_rate": 9.595959595959596e-05,
"loss": 1.6916,
"step": 50000
},
{
"epoch": 1.595090360965339,
"eval_loss": 1.60930597782135,
"eval_runtime": 4918.9878,
"eval_samples_per_second": 203.919,
"eval_steps_per_second": 1.593,
"step": 50000
},
{
"epoch": 1.6110411050675513,
"grad_norm": 55009.78125,
"learning_rate": 9.590909090909092e-05,
"loss": 1.685,
"step": 50500
},
{
"epoch": 1.6269918491697637,
"grad_norm": 53817.55078125,
"learning_rate": 9.585858585858586e-05,
"loss": 1.684,
"step": 51000
},
{
"epoch": 1.6429425932719761,
"grad_norm": 53155.3359375,
"learning_rate": 9.580808080808081e-05,
"loss": 1.6813,
"step": 51500
},
{
"epoch": 1.6588933373741885,
"grad_norm": 55288.33984375,
"learning_rate": 9.575757575757576e-05,
"loss": 1.6776,
"step": 52000
},
{
"epoch": 1.674844081476401,
"grad_norm": 58538.25,
"learning_rate": 9.570707070707072e-05,
"loss": 1.6746,
"step": 52500
},
{
"epoch": 1.6907948255786134,
"grad_norm": 54227.25390625,
"learning_rate": 9.565656565656566e-05,
"loss": 1.6715,
"step": 53000
},
{
"epoch": 1.7067455696808256,
"grad_norm": 56011.28125,
"learning_rate": 9.560606060606061e-05,
"loss": 1.6701,
"step": 53500
},
{
"epoch": 1.722696313783038,
"grad_norm": 51203.140625,
"learning_rate": 9.555555555555557e-05,
"loss": 1.6695,
"step": 54000
},
{
"epoch": 1.7386470578852502,
"grad_norm": 53641.3203125,
"learning_rate": 9.550505050505051e-05,
"loss": 1.6613,
"step": 54500
},
{
"epoch": 1.7545978019874626,
"grad_norm": 55869.0234375,
"learning_rate": 9.545454545454546e-05,
"loss": 1.6598,
"step": 55000
},
{
"epoch": 1.770548546089675,
"grad_norm": 54982.5546875,
"learning_rate": 9.540404040404041e-05,
"loss": 1.6595,
"step": 55500
},
{
"epoch": 1.7864992901918875,
"grad_norm": 55537.86328125,
"learning_rate": 9.535353535353537e-05,
"loss": 1.6544,
"step": 56000
},
{
"epoch": 1.8024500342941,
"grad_norm": 52897.34375,
"learning_rate": 9.53030303030303e-05,
"loss": 1.6512,
"step": 56500
},
{
"epoch": 1.8184007783963123,
"grad_norm": 55306.94921875,
"learning_rate": 9.525252525252526e-05,
"loss": 1.6493,
"step": 57000
},
{
"epoch": 1.8343515224985245,
"grad_norm": 55051.29296875,
"learning_rate": 9.52020202020202e-05,
"loss": 1.6453,
"step": 57500
},
{
"epoch": 1.850302266600737,
"grad_norm": 54118.21484375,
"learning_rate": 9.515151515151515e-05,
"loss": 1.6441,
"step": 58000
},
{
"epoch": 1.8662530107029491,
"grad_norm": 56088.76171875,
"learning_rate": 9.51010101010101e-05,
"loss": 1.6429,
"step": 58500
},
{
"epoch": 1.8822037548051616,
"grad_norm": 52600.8359375,
"learning_rate": 9.505050505050506e-05,
"loss": 1.636,
"step": 59000
},
{
"epoch": 1.898154498907374,
"grad_norm": 55023.9375,
"learning_rate": 9.5e-05,
"loss": 1.6333,
"step": 59500
},
{
"epoch": 1.9141052430095864,
"grad_norm": 54295.30859375,
"learning_rate": 9.494949494949495e-05,
"loss": 1.6335,
"step": 60000
},
{
"epoch": 1.9141052430095864,
"eval_loss": 1.5532509088516235,
"eval_runtime": 4919.5325,
"eval_samples_per_second": 203.896,
"eval_steps_per_second": 1.593,
"step": 60000
},
{
"epoch": 1.9300559871117988,
"grad_norm": 55173.84765625,
"learning_rate": 9.48989898989899e-05,
"loss": 1.63,
"step": 60500
},
{
"epoch": 1.9460067312140112,
"grad_norm": 53612.4921875,
"learning_rate": 9.484848484848486e-05,
"loss": 1.6286,
"step": 61000
},
{
"epoch": 1.9619574753162237,
"grad_norm": 55060.9140625,
"learning_rate": 9.47979797979798e-05,
"loss": 1.6263,
"step": 61500
},
{
"epoch": 1.9779082194184359,
"grad_norm": 54993.5625,
"learning_rate": 9.474747474747475e-05,
"loss": 1.6231,
"step": 62000
},
{
"epoch": 1.9938589635206483,
"grad_norm": 55949.11328125,
"learning_rate": 9.469696969696971e-05,
"loss": 1.6228,
"step": 62500
},
{
"epoch": 2.0098097076228605,
"grad_norm": 54883.53515625,
"learning_rate": 9.464646464646464e-05,
"loss": 1.6197,
"step": 63000
},
{
"epoch": 2.025760451725073,
"grad_norm": 57476.609375,
"learning_rate": 9.45959595959596e-05,
"loss": 1.6145,
"step": 63500
},
{
"epoch": 2.0417111958272853,
"grad_norm": 55516.81640625,
"learning_rate": 9.454545454545455e-05,
"loss": 1.6126,
"step": 64000
},
{
"epoch": 2.0576619399294978,
"grad_norm": 53755.71484375,
"learning_rate": 9.449494949494951e-05,
"loss": 1.6109,
"step": 64500
},
{
"epoch": 2.07361268403171,
"grad_norm": 54177.87890625,
"learning_rate": 9.444444444444444e-05,
"loss": 1.6105,
"step": 65000
},
{
"epoch": 2.0895634281339226,
"grad_norm": 56828.4765625,
"learning_rate": 9.43939393939394e-05,
"loss": 1.6075,
"step": 65500
},
{
"epoch": 2.105514172236135,
"grad_norm": 55464.76171875,
"learning_rate": 9.434343434343435e-05,
"loss": 1.606,
"step": 66000
},
{
"epoch": 2.121464916338347,
"grad_norm": 56928.1015625,
"learning_rate": 9.42929292929293e-05,
"loss": 1.6023,
"step": 66500
},
{
"epoch": 2.1374156604405594,
"grad_norm": 51041.0,
"learning_rate": 9.424242424242424e-05,
"loss": 1.6,
"step": 67000
},
{
"epoch": 2.153366404542772,
"grad_norm": 56379.828125,
"learning_rate": 9.41919191919192e-05,
"loss": 1.6003,
"step": 67500
},
{
"epoch": 2.1693171486449843,
"grad_norm": 55763.93359375,
"learning_rate": 9.414141414141415e-05,
"loss": 1.596,
"step": 68000
},
{
"epoch": 2.1852678927471967,
"grad_norm": 53785.359375,
"learning_rate": 9.40909090909091e-05,
"loss": 1.5932,
"step": 68500
},
{
"epoch": 2.201218636849409,
"grad_norm": 56003.0703125,
"learning_rate": 9.404040404040404e-05,
"loss": 1.5928,
"step": 69000
},
{
"epoch": 2.2171693809516215,
"grad_norm": 55811.99609375,
"learning_rate": 9.3989898989899e-05,
"loss": 1.5919,
"step": 69500
},
{
"epoch": 2.233120125053834,
"grad_norm": 56307.57421875,
"learning_rate": 9.393939393939395e-05,
"loss": 1.5869,
"step": 70000
},
{
"epoch": 2.233120125053834,
"eval_loss": 1.5101096630096436,
"eval_runtime": 4917.8928,
"eval_samples_per_second": 203.964,
"eval_steps_per_second": 1.594,
"step": 70000
},
{
"epoch": 2.249070869156046,
"grad_norm": 53560.9296875,
"learning_rate": 9.388888888888889e-05,
"loss": 1.5883,
"step": 70500
},
{
"epoch": 2.2650216132582583,
"grad_norm": 56536.40625,
"learning_rate": 9.383838383838385e-05,
"loss": 1.585,
"step": 71000
},
{
"epoch": 2.2809723573604708,
"grad_norm": 54454.96875,
"learning_rate": 9.378787878787879e-05,
"loss": 1.582,
"step": 71500
},
{
"epoch": 2.296923101462683,
"grad_norm": 55888.09375,
"learning_rate": 9.373737373737375e-05,
"loss": 1.5776,
"step": 72000
},
{
"epoch": 2.3128738455648956,
"grad_norm": 55370.46484375,
"learning_rate": 9.368686868686869e-05,
"loss": 1.5781,
"step": 72500
},
{
"epoch": 2.328824589667108,
"grad_norm": 56668.328125,
"learning_rate": 9.363636363636364e-05,
"loss": 1.5779,
"step": 73000
},
{
"epoch": 2.3447753337693205,
"grad_norm": 56674.85546875,
"learning_rate": 9.358585858585858e-05,
"loss": 1.5724,
"step": 73500
},
{
"epoch": 2.360726077871533,
"grad_norm": 59070.734375,
"learning_rate": 9.353535353535354e-05,
"loss": 1.5737,
"step": 74000
},
{
"epoch": 2.376676821973745,
"grad_norm": 55701.37109375,
"learning_rate": 9.348484848484849e-05,
"loss": 1.5702,
"step": 74500
},
{
"epoch": 2.3926275660759573,
"grad_norm": 54837.890625,
"learning_rate": 9.343434343434344e-05,
"loss": 1.5691,
"step": 75000
},
{
"epoch": 2.4085783101781697,
"grad_norm": 55847.98046875,
"learning_rate": 9.338383838383838e-05,
"loss": 1.5692,
"step": 75500
},
{
"epoch": 2.424529054280382,
"grad_norm": 53633.03125,
"learning_rate": 9.333333333333334e-05,
"loss": 1.5639,
"step": 76000
},
{
"epoch": 2.4404797983825945,
"grad_norm": 55944.12890625,
"learning_rate": 9.328282828282829e-05,
"loss": 1.5664,
"step": 76500
},
{
"epoch": 2.456430542484807,
"grad_norm": 53979.30859375,
"learning_rate": 9.323232323232324e-05,
"loss": 1.5635,
"step": 77000
},
{
"epoch": 2.4723812865870194,
"grad_norm": 56014.97265625,
"learning_rate": 9.318181818181818e-05,
"loss": 1.5601,
"step": 77500
},
{
"epoch": 2.488332030689232,
"grad_norm": 55291.9140625,
"learning_rate": 9.313131313131314e-05,
"loss": 1.5618,
"step": 78000
},
{
"epoch": 2.5042827747914442,
"grad_norm": 53215.24609375,
"learning_rate": 9.308080808080809e-05,
"loss": 1.5576,
"step": 78500
},
{
"epoch": 2.520233518893656,
"grad_norm": 56197.3203125,
"learning_rate": 9.303030303030303e-05,
"loss": 1.554,
"step": 79000
},
{
"epoch": 2.5361842629958686,
"grad_norm": 55106.9765625,
"learning_rate": 9.2979797979798e-05,
"loss": 1.5545,
"step": 79500
},
{
"epoch": 2.552135007098081,
"grad_norm": 54552.08984375,
"learning_rate": 9.292929292929293e-05,
"loss": 1.5529,
"step": 80000
},
{
"epoch": 2.552135007098081,
"eval_loss": 1.4744161367416382,
"eval_runtime": 4899.8032,
"eval_samples_per_second": 204.717,
"eval_steps_per_second": 1.599,
"step": 80000
},
{
"epoch": 2.5681017019443955,
"grad_norm": 56671.71875,
"learning_rate": 9.287878787878789e-05,
"loss": 1.5516,
"step": 80500
},
{
"epoch": 2.5840524460466083,
"grad_norm": 54676.5078125,
"learning_rate": 9.282828282828283e-05,
"loss": 1.5501,
"step": 81000
},
{
"epoch": 2.6000031901488203,
"grad_norm": 56587.88671875,
"learning_rate": 9.277777777777778e-05,
"loss": 1.549,
"step": 81500
},
{
"epoch": 2.6159539342510327,
"grad_norm": 56696.6484375,
"learning_rate": 9.272727272727273e-05,
"loss": 1.5448,
"step": 82000
},
{
"epoch": 2.631904678353245,
"grad_norm": 55145.359375,
"learning_rate": 9.267676767676769e-05,
"loss": 1.5439,
"step": 82500
},
{
"epoch": 2.6478554224554576,
"grad_norm": 55258.8828125,
"learning_rate": 9.262626262626263e-05,
"loss": 1.5461,
"step": 83000
},
{
"epoch": 2.66380616655767,
"grad_norm": 55989.4921875,
"learning_rate": 9.257575757575758e-05,
"loss": 1.5423,
"step": 83500
},
{
"epoch": 2.6797569106598824,
"grad_norm": 54569.66015625,
"learning_rate": 9.252525252525253e-05,
"loss": 1.5432,
"step": 84000
},
{
"epoch": 2.695707654762095,
"grad_norm": 56487.32421875,
"learning_rate": 9.247474747474749e-05,
"loss": 1.5406,
"step": 84500
},
{
"epoch": 2.711658398864307,
"grad_norm": 55735.38671875,
"learning_rate": 9.242424242424242e-05,
"loss": 1.5385,
"step": 85000
},
{
"epoch": 2.7276091429665192,
"grad_norm": 55246.734375,
"learning_rate": 9.237373737373738e-05,
"loss": 1.5385,
"step": 85500
},
{
"epoch": 2.7435598870687317,
"grad_norm": 54426.76171875,
"learning_rate": 9.232323232323232e-05,
"loss": 1.5354,
"step": 86000
},
{
"epoch": 2.759510631170944,
"grad_norm": 56496.05859375,
"learning_rate": 9.227272727272727e-05,
"loss": 1.5354,
"step": 86500
},
{
"epoch": 2.7754613752731565,
"grad_norm": 54483.66796875,
"learning_rate": 9.222222222222223e-05,
"loss": 1.5314,
"step": 87000
},
{
"epoch": 2.791412119375369,
"grad_norm": 56842.0,
"learning_rate": 9.217171717171718e-05,
"loss": 1.5307,
"step": 87500
},
{
"epoch": 2.8073628634775813,
"grad_norm": 55707.21875,
"learning_rate": 9.212121212121214e-05,
"loss": 1.5299,
"step": 88000
},
{
"epoch": 2.8233136075797933,
"grad_norm": 54965.58984375,
"learning_rate": 9.207070707070707e-05,
"loss": 1.5292,
"step": 88500
},
{
"epoch": 2.839264351682006,
"grad_norm": 54280.4140625,
"learning_rate": 9.202020202020203e-05,
"loss": 1.5281,
"step": 89000
},
{
"epoch": 2.855215095784218,
"grad_norm": 56317.328125,
"learning_rate": 9.196969696969698e-05,
"loss": 1.5292,
"step": 89500
},
{
"epoch": 2.8711658398864306,
"grad_norm": 56429.265625,
"learning_rate": 9.191919191919192e-05,
"loss": 1.5248,
"step": 90000
},
{
"epoch": 2.8711658398864306,
"eval_loss": 1.4486085176467896,
"eval_runtime": 4949.3767,
"eval_samples_per_second": 202.667,
"eval_steps_per_second": 1.583,
"step": 90000
},
{
"epoch": 2.887116583988643,
"grad_norm": 60500.609375,
"learning_rate": 9.186868686868687e-05,
"loss": 1.5244,
"step": 90500
},
{
"epoch": 2.9030673280908554,
"grad_norm": 56278.77734375,
"learning_rate": 9.181818181818183e-05,
"loss": 1.5228,
"step": 91000
},
{
"epoch": 2.919018072193068,
"grad_norm": 55179.0859375,
"learning_rate": 9.176767676767677e-05,
"loss": 1.5228,
"step": 91500
},
{
"epoch": 2.9349688162952803,
"grad_norm": 58535.69140625,
"learning_rate": 9.171717171717172e-05,
"loss": 1.5207,
"step": 92000
},
{
"epoch": 2.9509195603974927,
"grad_norm": 56315.96484375,
"learning_rate": 9.166666666666667e-05,
"loss": 1.5205,
"step": 92500
},
{
"epoch": 2.9668703044997047,
"grad_norm": 57173.87109375,
"learning_rate": 9.161616161616163e-05,
"loss": 1.5165,
"step": 93000
},
{
"epoch": 2.9828210486019175,
"grad_norm": 58435.5,
"learning_rate": 9.156565656565656e-05,
"loss": 1.516,
"step": 93500
},
{
"epoch": 2.9987717927041295,
"grad_norm": 55689.07421875,
"learning_rate": 9.151515151515152e-05,
"loss": 1.5155,
"step": 94000
},
{
"epoch": 3.014722536806342,
"grad_norm": 55691.28515625,
"learning_rate": 9.146464646464647e-05,
"loss": 1.5124,
"step": 94500
},
{
"epoch": 3.0306732809085544,
"grad_norm": 55951.5546875,
"learning_rate": 9.141414141414141e-05,
"loss": 1.5098,
"step": 95000
},
{
"epoch": 3.046624025010767,
"grad_norm": 56781.6328125,
"learning_rate": 9.136363636363637e-05,
"loss": 1.5107,
"step": 95500
},
{
"epoch": 3.062574769112979,
"grad_norm": 56111.234375,
"learning_rate": 9.131313131313132e-05,
"loss": 1.5072,
"step": 96000
},
{
"epoch": 3.0785255132151916,
"grad_norm": 55225.51171875,
"learning_rate": 9.126262626262627e-05,
"loss": 1.5089,
"step": 96500
},
{
"epoch": 3.094476257317404,
"grad_norm": 57983.26171875,
"learning_rate": 9.121212121212121e-05,
"loss": 1.5068,
"step": 97000
},
{
"epoch": 3.110427001419616,
"grad_norm": 55611.953125,
"learning_rate": 9.116161616161617e-05,
"loss": 1.5039,
"step": 97500
},
{
"epoch": 3.1263777455218285,
"grad_norm": 56500.25390625,
"learning_rate": 9.111111111111112e-05,
"loss": 1.5023,
"step": 98000
},
{
"epoch": 3.142328489624041,
"grad_norm": 55070.7578125,
"learning_rate": 9.106060606060606e-05,
"loss": 1.5025,
"step": 98500
},
{
"epoch": 3.1582792337262533,
"grad_norm": 56306.3203125,
"learning_rate": 9.101010101010101e-05,
"loss": 1.501,
"step": 99000
},
{
"epoch": 3.1742299778284657,
"grad_norm": 56296.40234375,
"learning_rate": 9.095959595959597e-05,
"loss": 1.4999,
"step": 99500
},
{
"epoch": 3.190180721930678,
"grad_norm": 56137.7265625,
"learning_rate": 9.090909090909092e-05,
"loss": 1.5015,
"step": 100000
},
{
"epoch": 3.190180721930678,
"eval_loss": 1.4243189096450806,
"eval_runtime": 4946.6072,
"eval_samples_per_second": 202.78,
"eval_steps_per_second": 1.584,
"step": 100000
},
{
"epoch": 3.2061314660328906,
"grad_norm": 57145.66796875,
"learning_rate": 9.085858585858586e-05,
"loss": 1.4976,
"step": 100500
},
{
"epoch": 3.222082210135103,
"grad_norm": 55889.25,
"learning_rate": 9.080808080808081e-05,
"loss": 1.4946,
"step": 101000
},
{
"epoch": 3.238032954237315,
"grad_norm": 54433.96875,
"learning_rate": 9.075757575757577e-05,
"loss": 1.4985,
"step": 101500
},
{
"epoch": 3.2539836983395274,
"grad_norm": 59956.953125,
"learning_rate": 9.07070707070707e-05,
"loss": 1.4926,
"step": 102000
},
{
"epoch": 3.26993444244174,
"grad_norm": 55148.5703125,
"learning_rate": 9.065656565656566e-05,
"loss": 1.4935,
"step": 102500
},
{
"epoch": 3.2858851865439522,
"grad_norm": 58131.62890625,
"learning_rate": 9.060606060606061e-05,
"loss": 1.4936,
"step": 103000
},
{
"epoch": 3.3018359306461647,
"grad_norm": 61794.17578125,
"learning_rate": 9.055555555555556e-05,
"loss": 1.4922,
"step": 103500
},
{
"epoch": 3.317786674748377,
"grad_norm": 56916.46875,
"learning_rate": 9.050505050505052e-05,
"loss": 1.4915,
"step": 104000
},
{
"epoch": 3.3337374188505895,
"grad_norm": 56791.9765625,
"learning_rate": 9.045454545454546e-05,
"loss": 1.4885,
"step": 104500
},
{
"epoch": 3.349688162952802,
"grad_norm": 59157.26171875,
"learning_rate": 9.040404040404041e-05,
"loss": 1.4892,
"step": 105000
},
{
"epoch": 3.365638907055014,
"grad_norm": 57222.37890625,
"learning_rate": 9.035353535353535e-05,
"loss": 1.4896,
"step": 105500
},
{
"epoch": 3.3815896511572263,
"grad_norm": 59154.6171875,
"learning_rate": 9.030303030303031e-05,
"loss": 1.4846,
"step": 106000
},
{
"epoch": 3.3975403952594387,
"grad_norm": 55996.48046875,
"learning_rate": 9.025252525252526e-05,
"loss": 1.4866,
"step": 106500
},
{
"epoch": 3.413491139361651,
"grad_norm": 58967.2578125,
"learning_rate": 9.02020202020202e-05,
"loss": 1.4846,
"step": 107000
},
{
"epoch": 3.4294418834638636,
"grad_norm": 58440.421875,
"learning_rate": 9.015151515151515e-05,
"loss": 1.4851,
"step": 107500
},
{
"epoch": 3.445392627566076,
"grad_norm": 55482.94140625,
"learning_rate": 9.010101010101011e-05,
"loss": 1.4862,
"step": 108000
},
{
"epoch": 3.4613433716682884,
"grad_norm": 59205.22265625,
"learning_rate": 9.005050505050505e-05,
"loss": 1.4829,
"step": 108500
},
{
"epoch": 3.477294115770501,
"grad_norm": 54196.10546875,
"learning_rate": 9e-05,
"loss": 1.482,
"step": 109000
},
{
"epoch": 3.4932448598727133,
"grad_norm": 54923.30078125,
"learning_rate": 8.994949494949495e-05,
"loss": 1.4797,
"step": 109500
},
{
"epoch": 3.5091956039749252,
"grad_norm": 54844.94140625,
"learning_rate": 8.98989898989899e-05,
"loss": 1.4788,
"step": 110000
},
{
"epoch": 3.5091956039749252,
"eval_loss": 1.4024385213851929,
"eval_runtime": 4946.3399,
"eval_samples_per_second": 202.791,
"eval_steps_per_second": 1.584,
"step": 110000
}
],
"logging_steps": 500,
"max_steps": 1000000,
"num_input_tokens_seen": 0,
"num_train_epochs": 32,
"save_steps": 10000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 7.413478046369151e+18,
"train_batch_size": 128,
"trial_name": null,
"trial_params": null
}