|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 1.9814814814814814, |
|
"eval_steps": 68, |
|
"global_step": 540, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.003703703703703704, |
|
"grad_norm": 0.40625306963920593, |
|
"learning_rate": 1.0000000000000002e-06, |
|
"loss": 1.3427, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.003703703703703704, |
|
"eval_loss": 1.3691776990890503, |
|
"eval_runtime": 80.493, |
|
"eval_samples_per_second": 3.615, |
|
"eval_steps_per_second": 0.46, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.007407407407407408, |
|
"grad_norm": 0.6216382384300232, |
|
"learning_rate": 2.0000000000000003e-06, |
|
"loss": 1.3914, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.011111111111111112, |
|
"grad_norm": 0.4203539788722992, |
|
"learning_rate": 3e-06, |
|
"loss": 1.3421, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.014814814814814815, |
|
"grad_norm": 0.48187777400016785, |
|
"learning_rate": 4.000000000000001e-06, |
|
"loss": 1.3913, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.018518518518518517, |
|
"grad_norm": 0.4104997515678406, |
|
"learning_rate": 5e-06, |
|
"loss": 1.3264, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.022222222222222223, |
|
"grad_norm": 0.5217423439025879, |
|
"learning_rate": 6e-06, |
|
"loss": 1.3418, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.025925925925925925, |
|
"grad_norm": 0.40824779868125916, |
|
"learning_rate": 7e-06, |
|
"loss": 1.3761, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.02962962962962963, |
|
"grad_norm": 0.41881611943244934, |
|
"learning_rate": 8.000000000000001e-06, |
|
"loss": 1.3631, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.03333333333333333, |
|
"grad_norm": 0.43708905577659607, |
|
"learning_rate": 9e-06, |
|
"loss": 1.3911, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.037037037037037035, |
|
"grad_norm": 0.48373478651046753, |
|
"learning_rate": 1e-05, |
|
"loss": 1.3813, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.040740740740740744, |
|
"grad_norm": 0.428241491317749, |
|
"learning_rate": 9.999912161129377e-06, |
|
"loss": 1.3825, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.044444444444444446, |
|
"grad_norm": 0.4543517827987671, |
|
"learning_rate": 9.999648647603774e-06, |
|
"loss": 1.3413, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.04814814814814815, |
|
"grad_norm": 0.48931288719177246, |
|
"learning_rate": 9.999209468681885e-06, |
|
"loss": 1.4078, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.05185185185185185, |
|
"grad_norm": 0.47361329197883606, |
|
"learning_rate": 9.998594639794502e-06, |
|
"loss": 1.3926, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.05555555555555555, |
|
"grad_norm": 0.46920689940452576, |
|
"learning_rate": 9.997804182543973e-06, |
|
"loss": 1.3043, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.05925925925925926, |
|
"grad_norm": 0.44550788402557373, |
|
"learning_rate": 9.996838124703448e-06, |
|
"loss": 1.3535, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.06296296296296296, |
|
"grad_norm": 0.4951707720756531, |
|
"learning_rate": 9.995696500215899e-06, |
|
"loss": 1.3355, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.06666666666666667, |
|
"grad_norm": 0.5006001591682434, |
|
"learning_rate": 9.994379349192927e-06, |
|
"loss": 1.3064, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.07037037037037037, |
|
"grad_norm": 0.45947596430778503, |
|
"learning_rate": 9.992886717913358e-06, |
|
"loss": 1.394, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.07407407407407407, |
|
"grad_norm": 0.49364641308784485, |
|
"learning_rate": 9.991218658821609e-06, |
|
"loss": 1.3043, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.07777777777777778, |
|
"grad_norm": 0.47694772481918335, |
|
"learning_rate": 9.989375230525849e-06, |
|
"loss": 1.3287, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.08148148148148149, |
|
"grad_norm": 0.5253634452819824, |
|
"learning_rate": 9.987356497795944e-06, |
|
"loss": 1.3046, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.08518518518518518, |
|
"grad_norm": 0.5501742362976074, |
|
"learning_rate": 9.985162531561174e-06, |
|
"loss": 1.3499, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.08888888888888889, |
|
"grad_norm": 0.5258708000183105, |
|
"learning_rate": 9.982793408907747e-06, |
|
"loss": 1.2779, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.09259259259259259, |
|
"grad_norm": 0.4966470003128052, |
|
"learning_rate": 9.980249213076085e-06, |
|
"loss": 1.2702, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.0962962962962963, |
|
"grad_norm": 0.4991610050201416, |
|
"learning_rate": 9.977530033457906e-06, |
|
"loss": 1.3286, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 0.5212219953536987, |
|
"learning_rate": 9.97463596559307e-06, |
|
"loss": 1.2978, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.1037037037037037, |
|
"grad_norm": 0.4977610409259796, |
|
"learning_rate": 9.971567111166246e-06, |
|
"loss": 1.3247, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.10740740740740741, |
|
"grad_norm": 0.5000190734863281, |
|
"learning_rate": 9.968323578003312e-06, |
|
"loss": 1.3017, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1111111111111111, |
|
"grad_norm": 0.476797878742218, |
|
"learning_rate": 9.964905480067585e-06, |
|
"loss": 1.2287, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.11481481481481481, |
|
"grad_norm": 0.5062195062637329, |
|
"learning_rate": 9.961312937455812e-06, |
|
"loss": 1.2521, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.11851851851851852, |
|
"grad_norm": 0.5346536636352539, |
|
"learning_rate": 9.957546076393944e-06, |
|
"loss": 1.2907, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.12222222222222222, |
|
"grad_norm": 0.5018014311790466, |
|
"learning_rate": 9.95360502923271e-06, |
|
"loss": 1.273, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.1259259259259259, |
|
"grad_norm": 0.4412826895713806, |
|
"learning_rate": 9.949489934442966e-06, |
|
"loss": 1.202, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.12962962962962962, |
|
"grad_norm": 0.47726863622665405, |
|
"learning_rate": 9.945200936610821e-06, |
|
"loss": 1.1432, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.13333333333333333, |
|
"grad_norm": 0.4887215197086334, |
|
"learning_rate": 9.940738186432565e-06, |
|
"loss": 1.1524, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.13703703703703704, |
|
"grad_norm": 0.4492252469062805, |
|
"learning_rate": 9.936101840709373e-06, |
|
"loss": 1.1903, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.14074074074074075, |
|
"grad_norm": 0.43920594453811646, |
|
"learning_rate": 9.931292062341793e-06, |
|
"loss": 1.1942, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.14444444444444443, |
|
"grad_norm": 0.44488102197647095, |
|
"learning_rate": 9.926309020324025e-06, |
|
"loss": 1.1919, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.14814814814814814, |
|
"grad_norm": 0.5044857263565063, |
|
"learning_rate": 9.921152889737985e-06, |
|
"loss": 1.1351, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.15185185185185185, |
|
"grad_norm": 0.45221227407455444, |
|
"learning_rate": 9.915823851747143e-06, |
|
"loss": 1.1624, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.15555555555555556, |
|
"grad_norm": 0.5037719011306763, |
|
"learning_rate": 9.910322093590177e-06, |
|
"loss": 1.1718, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.15925925925925927, |
|
"grad_norm": 0.44602254033088684, |
|
"learning_rate": 9.90464780857437e-06, |
|
"loss": 1.1546, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.16296296296296298, |
|
"grad_norm": 0.44312745332717896, |
|
"learning_rate": 9.898801196068839e-06, |
|
"loss": 1.2048, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.16666666666666666, |
|
"grad_norm": 0.5689204931259155, |
|
"learning_rate": 9.892782461497521e-06, |
|
"loss": 1.2042, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.17037037037037037, |
|
"grad_norm": 0.47574153542518616, |
|
"learning_rate": 9.886591816331953e-06, |
|
"loss": 1.072, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.17407407407407408, |
|
"grad_norm": 0.5947781801223755, |
|
"learning_rate": 9.880229478083849e-06, |
|
"loss": 1.1788, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.17777777777777778, |
|
"grad_norm": 0.45822006464004517, |
|
"learning_rate": 9.87369567029745e-06, |
|
"loss": 1.1901, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.1814814814814815, |
|
"grad_norm": 0.4415622055530548, |
|
"learning_rate": 9.866990622541677e-06, |
|
"loss": 1.1071, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.18518518518518517, |
|
"grad_norm": 0.49463754892349243, |
|
"learning_rate": 9.860114570402055e-06, |
|
"loss": 1.1492, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.18888888888888888, |
|
"grad_norm": 0.5251724720001221, |
|
"learning_rate": 9.853067755472447e-06, |
|
"loss": 1.102, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.1925925925925926, |
|
"grad_norm": 0.4823416471481323, |
|
"learning_rate": 9.845850425346563e-06, |
|
"loss": 1.1561, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.1962962962962963, |
|
"grad_norm": 0.5142261385917664, |
|
"learning_rate": 9.838462833609249e-06, |
|
"loss": 1.2041, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.5137107372283936, |
|
"learning_rate": 9.830905239827592e-06, |
|
"loss": 1.0813, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.2037037037037037, |
|
"grad_norm": 0.41644176840782166, |
|
"learning_rate": 9.823177909541795e-06, |
|
"loss": 1.0974, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.2074074074074074, |
|
"grad_norm": 0.40043726563453674, |
|
"learning_rate": 9.815281114255841e-06, |
|
"loss": 1.1076, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.2111111111111111, |
|
"grad_norm": 0.43805867433547974, |
|
"learning_rate": 9.807215131427966e-06, |
|
"loss": 1.0959, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.21481481481481482, |
|
"grad_norm": 0.5732157230377197, |
|
"learning_rate": 9.798980244460892e-06, |
|
"loss": 1.0742, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.21851851851851853, |
|
"grad_norm": 0.44811880588531494, |
|
"learning_rate": 9.790576742691895e-06, |
|
"loss": 1.0058, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.2222222222222222, |
|
"grad_norm": 0.44900447130203247, |
|
"learning_rate": 9.782004921382612e-06, |
|
"loss": 1.0982, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.22592592592592592, |
|
"grad_norm": 0.521683394908905, |
|
"learning_rate": 9.773265081708687e-06, |
|
"loss": 1.1294, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.22962962962962963, |
|
"grad_norm": 0.48734819889068604, |
|
"learning_rate": 9.764357530749178e-06, |
|
"loss": 1.0575, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.23333333333333334, |
|
"grad_norm": 0.47888699173927307, |
|
"learning_rate": 9.755282581475769e-06, |
|
"loss": 1.0333, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.23703703703703705, |
|
"grad_norm": 0.45292389392852783, |
|
"learning_rate": 9.74604055274178e-06, |
|
"loss": 1.0786, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.24074074074074073, |
|
"grad_norm": 0.46524283289909363, |
|
"learning_rate": 9.736631769270958e-06, |
|
"loss": 1.0708, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.24444444444444444, |
|
"grad_norm": 0.4456775486469269, |
|
"learning_rate": 9.727056561646067e-06, |
|
"loss": 1.0512, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.24814814814814815, |
|
"grad_norm": 0.461698055267334, |
|
"learning_rate": 9.717315266297277e-06, |
|
"loss": 1.1535, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.2518518518518518, |
|
"grad_norm": 0.5552849173545837, |
|
"learning_rate": 9.707408225490343e-06, |
|
"loss": 1.1064, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.2518518518518518, |
|
"eval_loss": 1.065529227256775, |
|
"eval_runtime": 80.9702, |
|
"eval_samples_per_second": 3.594, |
|
"eval_steps_per_second": 0.457, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.25555555555555554, |
|
"grad_norm": 0.541875422000885, |
|
"learning_rate": 9.697335787314573e-06, |
|
"loss": 1.0527, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.25925925925925924, |
|
"grad_norm": 0.4617699384689331, |
|
"learning_rate": 9.687098305670606e-06, |
|
"loss": 1.0456, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.26296296296296295, |
|
"grad_norm": 0.4448198080062866, |
|
"learning_rate": 9.676696140257969e-06, |
|
"loss": 1.0364, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.26666666666666666, |
|
"grad_norm": 0.3684210479259491, |
|
"learning_rate": 9.66612965656245e-06, |
|
"loss": 1.1162, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.27037037037037037, |
|
"grad_norm": 0.6021161079406738, |
|
"learning_rate": 9.655399225843244e-06, |
|
"loss": 1.0799, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.2740740740740741, |
|
"grad_norm": 0.575809895992279, |
|
"learning_rate": 9.644505225119922e-06, |
|
"loss": 1.0222, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.2777777777777778, |
|
"grad_norm": 0.5453614592552185, |
|
"learning_rate": 9.633448037159167e-06, |
|
"loss": 1.0339, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.2814814814814815, |
|
"grad_norm": 0.5681980848312378, |
|
"learning_rate": 9.622228050461345e-06, |
|
"loss": 1.0622, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.2851851851851852, |
|
"grad_norm": 0.4109339714050293, |
|
"learning_rate": 9.610845659246833e-06, |
|
"loss": 1.0395, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.28888888888888886, |
|
"grad_norm": 0.4249359667301178, |
|
"learning_rate": 9.599301263442194e-06, |
|
"loss": 1.0346, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.29259259259259257, |
|
"grad_norm": 0.5109196901321411, |
|
"learning_rate": 9.587595268666099e-06, |
|
"loss": 1.0834, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.2962962962962963, |
|
"grad_norm": 0.512137770652771, |
|
"learning_rate": 9.575728086215093e-06, |
|
"loss": 1.0438, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.3, |
|
"grad_norm": 0.5844932198524475, |
|
"learning_rate": 9.56370013304914e-06, |
|
"loss": 0.9966, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.3037037037037037, |
|
"grad_norm": 0.4886794984340668, |
|
"learning_rate": 9.551511831776966e-06, |
|
"loss": 1.0461, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.3074074074074074, |
|
"grad_norm": 0.4917876124382019, |
|
"learning_rate": 9.53916361064122e-06, |
|
"loss": 1.0121, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.3111111111111111, |
|
"grad_norm": 0.48174771666526794, |
|
"learning_rate": 9.526655903503423e-06, |
|
"loss": 1.0579, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.3148148148148148, |
|
"grad_norm": 0.5147380232810974, |
|
"learning_rate": 9.513989149828718e-06, |
|
"loss": 1.0065, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.31851851851851853, |
|
"grad_norm": 0.4484403431415558, |
|
"learning_rate": 9.501163794670445e-06, |
|
"loss": 1.0089, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.32222222222222224, |
|
"grad_norm": 0.45849668979644775, |
|
"learning_rate": 9.488180288654485e-06, |
|
"loss": 1.0262, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.32592592592592595, |
|
"grad_norm": 0.571622908115387, |
|
"learning_rate": 9.475039087963443e-06, |
|
"loss": 1.0129, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.3296296296296296, |
|
"grad_norm": 0.5279180407524109, |
|
"learning_rate": 9.461740654320608e-06, |
|
"loss": 1.03, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.3333333333333333, |
|
"grad_norm": 0.47328171133995056, |
|
"learning_rate": 9.448285454973739e-06, |
|
"loss": 0.9805, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.337037037037037, |
|
"grad_norm": 0.4972725212574005, |
|
"learning_rate": 9.434673962678638e-06, |
|
"loss": 0.976, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.34074074074074073, |
|
"grad_norm": 0.5977814793586731, |
|
"learning_rate": 9.420906655682553e-06, |
|
"loss": 0.989, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.34444444444444444, |
|
"grad_norm": 0.5420663356781006, |
|
"learning_rate": 9.40698401770736e-06, |
|
"loss": 1.0225, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.34814814814814815, |
|
"grad_norm": 0.410198450088501, |
|
"learning_rate": 9.392906537932582e-06, |
|
"loss": 1.0393, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.35185185185185186, |
|
"grad_norm": 0.5001354217529297, |
|
"learning_rate": 9.378674710978185e-06, |
|
"loss": 0.9712, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.35555555555555557, |
|
"grad_norm": 0.5929519534111023, |
|
"learning_rate": 9.364289036887214e-06, |
|
"loss": 1.0759, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.3592592592592593, |
|
"grad_norm": 0.5323709845542908, |
|
"learning_rate": 9.349750021108212e-06, |
|
"loss": 1.0619, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.362962962962963, |
|
"grad_norm": 0.5360124707221985, |
|
"learning_rate": 9.335058174477472e-06, |
|
"loss": 0.9957, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.36666666666666664, |
|
"grad_norm": 0.5704509019851685, |
|
"learning_rate": 9.320214013201079e-06, |
|
"loss": 1.0591, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.37037037037037035, |
|
"grad_norm": 0.4351862967014313, |
|
"learning_rate": 9.305218058836778e-06, |
|
"loss": 1.014, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.37407407407407406, |
|
"grad_norm": 0.48397883772850037, |
|
"learning_rate": 9.290070838275649e-06, |
|
"loss": 1.0094, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.37777777777777777, |
|
"grad_norm": 0.5487049221992493, |
|
"learning_rate": 9.274772883723587e-06, |
|
"loss": 0.9604, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.3814814814814815, |
|
"grad_norm": 0.4735201895236969, |
|
"learning_rate": 9.259324732682615e-06, |
|
"loss": 0.9577, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.3851851851851852, |
|
"grad_norm": 0.5162625312805176, |
|
"learning_rate": 9.24372692793199e-06, |
|
"loss": 1.0095, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.3888888888888889, |
|
"grad_norm": 0.4944085478782654, |
|
"learning_rate": 9.22798001750913e-06, |
|
"loss": 1.0086, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.3925925925925926, |
|
"grad_norm": 0.5985198616981506, |
|
"learning_rate": 9.21208455469037e-06, |
|
"loss": 0.9878, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.3962962962962963, |
|
"grad_norm": 0.6551868915557861, |
|
"learning_rate": 9.196041097971509e-06, |
|
"loss": 1.0079, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.4953964352607727, |
|
"learning_rate": 9.179850211048193e-06, |
|
"loss": 1.0403, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.40370370370370373, |
|
"grad_norm": 0.46935591101646423, |
|
"learning_rate": 9.163512462796113e-06, |
|
"loss": 1.0443, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.4074074074074074, |
|
"grad_norm": 0.48214173316955566, |
|
"learning_rate": 9.14702842725101e-06, |
|
"loss": 0.9952, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.4111111111111111, |
|
"grad_norm": 0.5411708354949951, |
|
"learning_rate": 9.13039868358851e-06, |
|
"loss": 1.0634, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.4148148148148148, |
|
"grad_norm": 0.68564373254776, |
|
"learning_rate": 9.113623816103775e-06, |
|
"loss": 0.9307, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.4185185185185185, |
|
"grad_norm": 0.536626398563385, |
|
"learning_rate": 9.09670441419097e-06, |
|
"loss": 1.0535, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.4222222222222222, |
|
"grad_norm": 0.485929012298584, |
|
"learning_rate": 9.079641072322555e-06, |
|
"loss": 1.0176, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.42592592592592593, |
|
"grad_norm": 0.5539782047271729, |
|
"learning_rate": 9.062434390028407e-06, |
|
"loss": 0.9906, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.42962962962962964, |
|
"grad_norm": 0.49939635396003723, |
|
"learning_rate": 9.045084971874738e-06, |
|
"loss": 0.9586, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.43333333333333335, |
|
"grad_norm": 0.48620209097862244, |
|
"learning_rate": 9.027593427442867e-06, |
|
"loss": 1.0209, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.43703703703703706, |
|
"grad_norm": 0.4806266725063324, |
|
"learning_rate": 9.009960371307798e-06, |
|
"loss": 1.0185, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.44074074074074077, |
|
"grad_norm": 0.6763521432876587, |
|
"learning_rate": 8.992186423016626e-06, |
|
"loss": 1.0247, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.4444444444444444, |
|
"grad_norm": 0.5310172438621521, |
|
"learning_rate": 8.974272207066767e-06, |
|
"loss": 1.006, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.44814814814814813, |
|
"grad_norm": 0.5065312385559082, |
|
"learning_rate": 8.956218352884022e-06, |
|
"loss": 0.9535, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.45185185185185184, |
|
"grad_norm": 0.5911722183227539, |
|
"learning_rate": 8.938025494800454e-06, |
|
"loss": 0.9698, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.45555555555555555, |
|
"grad_norm": 0.60561203956604, |
|
"learning_rate": 8.919694272032108e-06, |
|
"loss": 1.0081, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.45925925925925926, |
|
"grad_norm": 0.5998137593269348, |
|
"learning_rate": 8.901225328656543e-06, |
|
"loss": 1.0332, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.46296296296296297, |
|
"grad_norm": 0.6571759581565857, |
|
"learning_rate": 8.882619313590212e-06, |
|
"loss": 1.0501, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.4666666666666667, |
|
"grad_norm": 0.5181518793106079, |
|
"learning_rate": 8.863876880565656e-06, |
|
"loss": 0.9653, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.4703703703703704, |
|
"grad_norm": 0.5412523746490479, |
|
"learning_rate": 8.844998688108535e-06, |
|
"loss": 0.999, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.4740740740740741, |
|
"grad_norm": 0.5652058124542236, |
|
"learning_rate": 8.825985399514488e-06, |
|
"loss": 0.9647, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.4777777777777778, |
|
"grad_norm": 0.52536940574646, |
|
"learning_rate": 8.806837682825835e-06, |
|
"loss": 0.9694, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.48148148148148145, |
|
"grad_norm": 0.6217904686927795, |
|
"learning_rate": 8.787556210808101e-06, |
|
"loss": 1.0241, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.48518518518518516, |
|
"grad_norm": 0.43509605526924133, |
|
"learning_rate": 8.768141660926375e-06, |
|
"loss": 0.9598, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.4888888888888889, |
|
"grad_norm": 0.5001434087753296, |
|
"learning_rate": 8.748594715321512e-06, |
|
"loss": 0.9697, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.4925925925925926, |
|
"grad_norm": 0.6269538402557373, |
|
"learning_rate": 8.728916060786162e-06, |
|
"loss": 1.0074, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.4962962962962963, |
|
"grad_norm": 0.6777300834655762, |
|
"learning_rate": 8.70910638874064e-06, |
|
"loss": 0.9968, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.5, |
|
"grad_norm": 0.5371289849281311, |
|
"learning_rate": 8.689166395208638e-06, |
|
"loss": 0.9684, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.5037037037037037, |
|
"grad_norm": 0.6136884093284607, |
|
"learning_rate": 8.669096780792754e-06, |
|
"loss": 1.0297, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5037037037037037, |
|
"eval_loss": 0.9753141403198242, |
|
"eval_runtime": 81.1717, |
|
"eval_samples_per_second": 3.585, |
|
"eval_steps_per_second": 0.456, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.5074074074074074, |
|
"grad_norm": 0.5171265602111816, |
|
"learning_rate": 8.6488982506499e-06, |
|
"loss": 0.962, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.5111111111111111, |
|
"grad_norm": 0.6454190611839294, |
|
"learning_rate": 8.628571514466502e-06, |
|
"loss": 0.9555, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.5148148148148148, |
|
"grad_norm": 0.5578838586807251, |
|
"learning_rate": 8.608117286433583e-06, |
|
"loss": 0.9079, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.5185185185185185, |
|
"grad_norm": 0.5714731216430664, |
|
"learning_rate": 8.587536285221656e-06, |
|
"loss": 0.9894, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.5222222222222223, |
|
"grad_norm": 0.5244677066802979, |
|
"learning_rate": 8.566829233955484e-06, |
|
"loss": 0.9735, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.5259259259259259, |
|
"grad_norm": 0.4161701798439026, |
|
"learning_rate": 8.545996860188668e-06, |
|
"loss": 0.9945, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.5296296296296297, |
|
"grad_norm": 0.6657142639160156, |
|
"learning_rate": 8.525039895878078e-06, |
|
"loss": 0.982, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.5333333333333333, |
|
"grad_norm": 0.7206271886825562, |
|
"learning_rate": 8.503959077358143e-06, |
|
"loss": 0.9977, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.5370370370370371, |
|
"grad_norm": 0.7977305054664612, |
|
"learning_rate": 8.482755145314987e-06, |
|
"loss": 0.9605, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.5407407407407407, |
|
"grad_norm": 0.8049225211143494, |
|
"learning_rate": 8.46142884476038e-06, |
|
"loss": 0.999, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.5444444444444444, |
|
"grad_norm": 0.49984222650527954, |
|
"learning_rate": 8.439980925005587e-06, |
|
"loss": 0.9595, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.5481481481481482, |
|
"grad_norm": 0.48655927181243896, |
|
"learning_rate": 8.418412139635026e-06, |
|
"loss": 0.9481, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.5518518518518518, |
|
"grad_norm": 0.5527738332748413, |
|
"learning_rate": 8.396723246479798e-06, |
|
"loss": 0.9665, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.5555555555555556, |
|
"grad_norm": 0.6328939199447632, |
|
"learning_rate": 8.374915007591053e-06, |
|
"loss": 1.0021, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.5592592592592592, |
|
"grad_norm": 0.6932883262634277, |
|
"learning_rate": 8.352988189213223e-06, |
|
"loss": 0.9991, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.562962962962963, |
|
"grad_norm": 0.5916227698326111, |
|
"learning_rate": 8.330943561757092e-06, |
|
"loss": 0.9661, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.5666666666666667, |
|
"grad_norm": 0.471822589635849, |
|
"learning_rate": 8.308781899772731e-06, |
|
"loss": 0.9396, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.5703703703703704, |
|
"grad_norm": 0.5403897166252136, |
|
"learning_rate": 8.286503981922284e-06, |
|
"loss": 0.9444, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 0.5740740740740741, |
|
"grad_norm": 0.5560125708580017, |
|
"learning_rate": 8.264110590952609e-06, |
|
"loss": 0.9487, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 0.5777777777777777, |
|
"grad_norm": 0.6282420754432678, |
|
"learning_rate": 8.241602513667775e-06, |
|
"loss": 1.0124, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 0.5814814814814815, |
|
"grad_norm": 0.4911057949066162, |
|
"learning_rate": 8.218980540901417e-06, |
|
"loss": 0.971, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 0.5851851851851851, |
|
"grad_norm": 0.6368396878242493, |
|
"learning_rate": 8.19624546748895e-06, |
|
"loss": 1.0181, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 0.5888888888888889, |
|
"grad_norm": 0.6642744541168213, |
|
"learning_rate": 8.173398092239647e-06, |
|
"loss": 1.0051, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 0.5925925925925926, |
|
"grad_norm": 0.554905116558075, |
|
"learning_rate": 8.150439217908557e-06, |
|
"loss": 0.9329, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.5962962962962963, |
|
"grad_norm": 0.5215203762054443, |
|
"learning_rate": 8.12736965116832e-06, |
|
"loss": 0.9506, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.4904837906360626, |
|
"learning_rate": 8.104190202580811e-06, |
|
"loss": 0.9864, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 0.6037037037037037, |
|
"grad_norm": 0.570766806602478, |
|
"learning_rate": 8.080901686568664e-06, |
|
"loss": 0.9379, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 0.6074074074074074, |
|
"grad_norm": 0.687227725982666, |
|
"learning_rate": 8.057504921386661e-06, |
|
"loss": 0.9714, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 0.6111111111111112, |
|
"grad_norm": 0.6017288565635681, |
|
"learning_rate": 8.034000729092967e-06, |
|
"loss": 0.9709, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 0.6148148148148148, |
|
"grad_norm": 0.6062106490135193, |
|
"learning_rate": 8.010389935520269e-06, |
|
"loss": 1.0362, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 0.6185185185185185, |
|
"grad_norm": 0.5548331141471863, |
|
"learning_rate": 7.986673370246743e-06, |
|
"loss": 0.9581, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 0.6222222222222222, |
|
"grad_norm": 0.5252346396446228, |
|
"learning_rate": 7.962851866566912e-06, |
|
"loss": 0.9669, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 0.6259259259259259, |
|
"grad_norm": 0.7005597352981567, |
|
"learning_rate": 7.938926261462366e-06, |
|
"loss": 0.987, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 0.6296296296296297, |
|
"grad_norm": 0.5916934609413147, |
|
"learning_rate": 7.914897395572362e-06, |
|
"loss": 0.9433, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 0.6333333333333333, |
|
"grad_norm": 0.6202555298805237, |
|
"learning_rate": 7.890766113164272e-06, |
|
"loss": 0.9833, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 0.6370370370370371, |
|
"grad_norm": 0.5578716397285461, |
|
"learning_rate": 7.866533262103937e-06, |
|
"loss": 0.9479, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 0.6407407407407407, |
|
"grad_norm": 0.6666351556777954, |
|
"learning_rate": 7.842199693825863e-06, |
|
"loss": 0.9383, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 0.6444444444444445, |
|
"grad_norm": 0.5507566332817078, |
|
"learning_rate": 7.817766263303312e-06, |
|
"loss": 0.9767, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 0.6481481481481481, |
|
"grad_norm": 0.6183774471282959, |
|
"learning_rate": 7.793233829018263e-06, |
|
"loss": 0.9078, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 0.6518518518518519, |
|
"grad_norm": 0.499009370803833, |
|
"learning_rate": 7.768603252931243e-06, |
|
"loss": 0.9563, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 0.6555555555555556, |
|
"grad_norm": 0.629336416721344, |
|
"learning_rate": 7.743875400451047e-06, |
|
"loss": 0.911, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 0.6592592592592592, |
|
"grad_norm": 0.5423790812492371, |
|
"learning_rate": 7.719051140404327e-06, |
|
"loss": 0.9434, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 0.662962962962963, |
|
"grad_norm": 0.6060659289360046, |
|
"learning_rate": 7.69413134500507e-06, |
|
"loss": 0.95, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 0.6666666666666666, |
|
"grad_norm": 0.5223778486251831, |
|
"learning_rate": 7.669116889823955e-06, |
|
"loss": 0.9474, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.6703703703703704, |
|
"grad_norm": 0.6271294355392456, |
|
"learning_rate": 7.644008653757571e-06, |
|
"loss": 0.9652, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 0.674074074074074, |
|
"grad_norm": 0.5973348617553711, |
|
"learning_rate": 7.6188075189975644e-06, |
|
"loss": 0.9333, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 0.6777777777777778, |
|
"grad_norm": 0.5119736790657043, |
|
"learning_rate": 7.593514370999617e-06, |
|
"loss": 0.9253, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 0.6814814814814815, |
|
"grad_norm": 0.6887508630752563, |
|
"learning_rate": 7.568130098452352e-06, |
|
"loss": 0.9344, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 0.6851851851851852, |
|
"grad_norm": 0.5387381911277771, |
|
"learning_rate": 7.542655593246103e-06, |
|
"loss": 0.9645, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 0.6888888888888889, |
|
"grad_norm": 0.5810338854789734, |
|
"learning_rate": 7.517091750441576e-06, |
|
"loss": 0.9406, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 0.6925925925925925, |
|
"grad_norm": 0.6561952829360962, |
|
"learning_rate": 7.491439468238404e-06, |
|
"loss": 0.9363, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 0.6962962962962963, |
|
"grad_norm": 0.7444878220558167, |
|
"learning_rate": 7.465699647943586e-06, |
|
"loss": 0.945, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 0.7, |
|
"grad_norm": 0.6265509724617004, |
|
"learning_rate": 7.43987319393982e-06, |
|
"loss": 0.9576, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 0.7037037037037037, |
|
"grad_norm": 0.6139175295829773, |
|
"learning_rate": 7.413961013653725e-06, |
|
"loss": 0.9697, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 0.7074074074074074, |
|
"grad_norm": 0.5767727494239807, |
|
"learning_rate": 7.387964017523964e-06, |
|
"loss": 0.9721, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 0.7111111111111111, |
|
"grad_norm": 0.757271945476532, |
|
"learning_rate": 7.361883118969248e-06, |
|
"loss": 1.0013, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 0.7148148148148148, |
|
"grad_norm": 0.6246291995048523, |
|
"learning_rate": 7.335719234356245e-06, |
|
"loss": 0.9418, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 0.7185185185185186, |
|
"grad_norm": 0.4833630621433258, |
|
"learning_rate": 7.309473282967387e-06, |
|
"loss": 0.9435, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 0.7222222222222222, |
|
"grad_norm": 0.5289487242698669, |
|
"learning_rate": 7.283146186968566e-06, |
|
"loss": 0.9617, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 0.725925925925926, |
|
"grad_norm": 0.6008256673812866, |
|
"learning_rate": 7.256738871376733e-06, |
|
"loss": 0.8983, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 0.7296296296296296, |
|
"grad_norm": 0.5227617621421814, |
|
"learning_rate": 7.230252264027398e-06, |
|
"loss": 0.8768, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 0.7333333333333333, |
|
"grad_norm": 0.6785119771957397, |
|
"learning_rate": 7.203687295542032e-06, |
|
"loss": 0.9393, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 0.737037037037037, |
|
"grad_norm": 0.6053286790847778, |
|
"learning_rate": 7.1770448992953676e-06, |
|
"loss": 0.9125, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 0.7407407407407407, |
|
"grad_norm": 0.7238445281982422, |
|
"learning_rate": 7.1503260113826035e-06, |
|
"loss": 0.9305, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.7444444444444445, |
|
"grad_norm": 0.6719542741775513, |
|
"learning_rate": 7.123531570586515e-06, |
|
"loss": 0.9643, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 0.7481481481481481, |
|
"grad_norm": 0.5546441674232483, |
|
"learning_rate": 7.09666251834447e-06, |
|
"loss": 0.9663, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 0.7518518518518519, |
|
"grad_norm": 0.5350282192230225, |
|
"learning_rate": 7.069719798715347e-06, |
|
"loss": 0.9041, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"grad_norm": 0.5801582932472229, |
|
"learning_rate": 7.042704358346375e-06, |
|
"loss": 0.9444, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.7555555555555555, |
|
"eval_loss": 0.9426867961883545, |
|
"eval_runtime": 81.1055, |
|
"eval_samples_per_second": 3.588, |
|
"eval_steps_per_second": 0.456, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 0.7592592592592593, |
|
"grad_norm": 0.7228114008903503, |
|
"learning_rate": 7.015617146439863e-06, |
|
"loss": 0.931, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 0.762962962962963, |
|
"grad_norm": 0.5295515656471252, |
|
"learning_rate": 6.988459114719849e-06, |
|
"loss": 0.9457, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 0.7666666666666667, |
|
"grad_norm": 0.5533620119094849, |
|
"learning_rate": 6.9612312173986675e-06, |
|
"loss": 0.9407, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 0.7703703703703704, |
|
"grad_norm": 0.6508337259292603, |
|
"learning_rate": 6.933934411143419e-06, |
|
"loss": 0.9176, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 0.774074074074074, |
|
"grad_norm": 0.644389808177948, |
|
"learning_rate": 6.906569655042357e-06, |
|
"loss": 0.9796, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 0.7777777777777778, |
|
"grad_norm": 0.5943438410758972, |
|
"learning_rate": 6.879137910571191e-06, |
|
"loss": 0.9508, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 0.7814814814814814, |
|
"grad_norm": 0.5512163639068604, |
|
"learning_rate": 6.8516401415593005e-06, |
|
"loss": 0.9066, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 0.7851851851851852, |
|
"grad_norm": 0.5512770414352417, |
|
"learning_rate": 6.824077314155877e-06, |
|
"loss": 0.9169, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 0.7888888888888889, |
|
"grad_norm": 0.7245272397994995, |
|
"learning_rate": 6.7964503967959705e-06, |
|
"loss": 0.9563, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 0.7925925925925926, |
|
"grad_norm": 0.704143762588501, |
|
"learning_rate": 6.768760360166471e-06, |
|
"loss": 0.9662, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 0.7962962962962963, |
|
"grad_norm": 0.5439050197601318, |
|
"learning_rate": 6.741008177171995e-06, |
|
"loss": 0.9609, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.6104442477226257, |
|
"learning_rate": 6.713194822900707e-06, |
|
"loss": 0.9313, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 0.8037037037037037, |
|
"grad_norm": 0.7294436693191528, |
|
"learning_rate": 6.6853212745900585e-06, |
|
"loss": 0.933, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 0.8074074074074075, |
|
"grad_norm": 0.5400619506835938, |
|
"learning_rate": 6.657388511592453e-06, |
|
"loss": 0.9367, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 0.8111111111111111, |
|
"grad_norm": 0.8623405694961548, |
|
"learning_rate": 6.62939751534083e-06, |
|
"loss": 0.9719, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 0.8148148148148148, |
|
"grad_norm": 0.6787410378456116, |
|
"learning_rate": 6.601349269314188e-06, |
|
"loss": 0.9882, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.8185185185185185, |
|
"grad_norm": 0.6689869165420532, |
|
"learning_rate": 6.573244759003033e-06, |
|
"loss": 0.9445, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 0.8222222222222222, |
|
"grad_norm": 0.7502297759056091, |
|
"learning_rate": 6.545084971874738e-06, |
|
"loss": 0.9276, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 0.825925925925926, |
|
"grad_norm": 2.460090160369873, |
|
"learning_rate": 6.516870897338864e-06, |
|
"loss": 0.9684, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 0.8296296296296296, |
|
"grad_norm": 0.8110550045967102, |
|
"learning_rate": 6.488603526712391e-06, |
|
"loss": 0.9212, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 0.8333333333333334, |
|
"grad_norm": 0.5253615975379944, |
|
"learning_rate": 6.46028385318488e-06, |
|
"loss": 0.9385, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 0.837037037037037, |
|
"grad_norm": 0.5551905632019043, |
|
"learning_rate": 6.431912871783587e-06, |
|
"loss": 0.9331, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 0.8407407407407408, |
|
"grad_norm": 0.6484084129333496, |
|
"learning_rate": 6.4034915793385e-06, |
|
"loss": 0.9936, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 0.8444444444444444, |
|
"grad_norm": 0.6521108746528625, |
|
"learning_rate": 6.3750209744473105e-06, |
|
"loss": 0.974, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 0.8481481481481481, |
|
"grad_norm": 0.7478381395339966, |
|
"learning_rate": 6.346502057440327e-06, |
|
"loss": 0.9569, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 0.8518518518518519, |
|
"grad_norm": 0.6053647398948669, |
|
"learning_rate": 6.3179358303453386e-06, |
|
"loss": 0.8928, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 0.8555555555555555, |
|
"grad_norm": 0.7461119890213013, |
|
"learning_rate": 6.289323296852393e-06, |
|
"loss": 0.9121, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 0.8592592592592593, |
|
"grad_norm": 0.6154372692108154, |
|
"learning_rate": 6.260665462278544e-06, |
|
"loss": 0.952, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 0.8629629629629629, |
|
"grad_norm": 0.710970938205719, |
|
"learning_rate": 6.231963333532516e-06, |
|
"loss": 0.9365, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 0.8666666666666667, |
|
"grad_norm": 0.6357712149620056, |
|
"learning_rate": 6.203217919079343e-06, |
|
"loss": 0.8836, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 0.8703703703703703, |
|
"grad_norm": 0.6976805329322815, |
|
"learning_rate": 6.17443022890492e-06, |
|
"loss": 0.9757, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 0.8740740740740741, |
|
"grad_norm": 0.6872934699058533, |
|
"learning_rate": 6.145601274480521e-06, |
|
"loss": 0.9814, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 0.8777777777777778, |
|
"grad_norm": 0.7947030663490295, |
|
"learning_rate": 6.116732068727271e-06, |
|
"loss": 0.9016, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 0.8814814814814815, |
|
"grad_norm": 0.63334721326828, |
|
"learning_rate": 6.08782362598054e-06, |
|
"loss": 0.9679, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 0.8851851851851852, |
|
"grad_norm": 0.5451921820640564, |
|
"learning_rate": 6.058876961954308e-06, |
|
"loss": 0.9511, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 0.8888888888888888, |
|
"grad_norm": 0.5797951221466064, |
|
"learning_rate": 6.029893093705492e-06, |
|
"loss": 0.9553, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.8925925925925926, |
|
"grad_norm": 0.5836870074272156, |
|
"learning_rate": 6.0008730395981905e-06, |
|
"loss": 0.9562, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 0.8962962962962963, |
|
"grad_norm": 0.6153254508972168, |
|
"learning_rate": 5.971817819267914e-06, |
|
"loss": 0.9199, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 0.9, |
|
"grad_norm": 0.6756653785705566, |
|
"learning_rate": 5.9427284535857585e-06, |
|
"loss": 0.9599, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 0.9037037037037037, |
|
"grad_norm": 0.6547468304634094, |
|
"learning_rate": 5.9136059646225375e-06, |
|
"loss": 0.9485, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 0.9074074074074074, |
|
"grad_norm": 0.7384520769119263, |
|
"learning_rate": 5.884451375612865e-06, |
|
"loss": 0.927, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 0.9111111111111111, |
|
"grad_norm": 0.6480386853218079, |
|
"learning_rate": 5.855265710919211e-06, |
|
"loss": 1.0039, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 0.9148148148148149, |
|
"grad_norm": 0.5494263768196106, |
|
"learning_rate": 5.826049995995905e-06, |
|
"loss": 0.9706, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 0.9185185185185185, |
|
"grad_norm": 0.5438244342803955, |
|
"learning_rate": 5.796805257353109e-06, |
|
"loss": 0.963, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 0.9222222222222223, |
|
"grad_norm": 0.6168299317359924, |
|
"learning_rate": 5.767532522520746e-06, |
|
"loss": 0.9594, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 0.9259259259259259, |
|
"grad_norm": 0.6753399968147278, |
|
"learning_rate": 5.738232820012407e-06, |
|
"loss": 0.9181, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 0.9296296296296296, |
|
"grad_norm": 0.5123042464256287, |
|
"learning_rate": 5.7089071792892e-06, |
|
"loss": 0.9216, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 0.9333333333333333, |
|
"grad_norm": 0.7598656415939331, |
|
"learning_rate": 5.679556630723592e-06, |
|
"loss": 0.9725, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 0.937037037037037, |
|
"grad_norm": 0.6306942701339722, |
|
"learning_rate": 5.6501822055631976e-06, |
|
"loss": 0.9041, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 0.9407407407407408, |
|
"grad_norm": 0.7515453696250916, |
|
"learning_rate": 5.620784935894548e-06, |
|
"loss": 0.9192, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 0.9444444444444444, |
|
"grad_norm": 0.6113058924674988, |
|
"learning_rate": 5.591365854606829e-06, |
|
"loss": 0.949, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 0.9481481481481482, |
|
"grad_norm": 0.6589618921279907, |
|
"learning_rate": 5.561925995355595e-06, |
|
"loss": 0.9384, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 0.9518518518518518, |
|
"grad_norm": 0.7518366575241089, |
|
"learning_rate": 5.532466392526439e-06, |
|
"loss": 0.8959, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 0.9555555555555556, |
|
"grad_norm": 0.5112090110778809, |
|
"learning_rate": 5.5029880811986546e-06, |
|
"loss": 0.9214, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 0.9592592592592593, |
|
"grad_norm": 0.6436278820037842, |
|
"learning_rate": 5.4734920971088766e-06, |
|
"loss": 0.9165, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 0.9629629629629629, |
|
"grad_norm": 0.685821533203125, |
|
"learning_rate": 5.443979476614674e-06, |
|
"loss": 0.9114, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.9666666666666667, |
|
"grad_norm": 0.5555897951126099, |
|
"learning_rate": 5.4144512566581495e-06, |
|
"loss": 0.9791, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 0.9703703703703703, |
|
"grad_norm": 0.6167283058166504, |
|
"learning_rate": 5.384908474729501e-06, |
|
"loss": 0.9029, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 0.9740740740740741, |
|
"grad_norm": 0.6644378304481506, |
|
"learning_rate": 5.3553521688305655e-06, |
|
"loss": 0.9659, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 0.9777777777777777, |
|
"grad_norm": 0.6106395721435547, |
|
"learning_rate": 5.325783377438357e-06, |
|
"loss": 0.9161, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 0.9814814814814815, |
|
"grad_norm": 0.6115413904190063, |
|
"learning_rate": 5.296203139468572e-06, |
|
"loss": 0.8719, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 0.9851851851851852, |
|
"grad_norm": 0.8100462555885315, |
|
"learning_rate": 5.266612494239088e-06, |
|
"loss": 0.9013, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 0.9888888888888889, |
|
"grad_norm": 0.7386695742607117, |
|
"learning_rate": 5.23701248143345e-06, |
|
"loss": 0.9151, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 0.9925925925925926, |
|
"grad_norm": 0.5981118679046631, |
|
"learning_rate": 5.207404141064334e-06, |
|
"loss": 0.9077, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 0.9962962962962963, |
|
"grad_norm": 0.5839532613754272, |
|
"learning_rate": 5.177788513437013e-06, |
|
"loss": 0.9564, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.4894520938396454, |
|
"learning_rate": 5.148166639112799e-06, |
|
"loss": 0.9273, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.0037037037037038, |
|
"grad_norm": 0.6211138963699341, |
|
"learning_rate": 5.118539558872489e-06, |
|
"loss": 0.9478, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.0074074074074073, |
|
"grad_norm": 0.7439696192741394, |
|
"learning_rate": 5.088908313679788e-06, |
|
"loss": 0.9341, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.0074074074074073, |
|
"eval_loss": 0.9261184930801392, |
|
"eval_runtime": 80.9898, |
|
"eval_samples_per_second": 3.593, |
|
"eval_steps_per_second": 0.457, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.011111111111111, |
|
"grad_norm": 0.6589562296867371, |
|
"learning_rate": 5.059273944644742e-06, |
|
"loss": 0.9316, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.0148148148148148, |
|
"grad_norm": 0.5672058463096619, |
|
"learning_rate": 5.029637492987153e-06, |
|
"loss": 0.9235, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"grad_norm": 0.6068680882453918, |
|
"learning_rate": 5e-06, |
|
"loss": 0.9136, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.0037037037037038, |
|
"grad_norm": 0.7259117960929871, |
|
"learning_rate": 4.970362507012848e-06, |
|
"loss": 0.8627, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.0074074074074073, |
|
"grad_norm": 0.665239691734314, |
|
"learning_rate": 4.940726055355259e-06, |
|
"loss": 0.9385, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.011111111111111, |
|
"grad_norm": 0.71152263879776, |
|
"learning_rate": 4.911091686320213e-06, |
|
"loss": 0.9532, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.0148148148148148, |
|
"grad_norm": 0.7714909911155701, |
|
"learning_rate": 4.881460441127513e-06, |
|
"loss": 0.8689, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.0185185185185186, |
|
"grad_norm": 0.6783362030982971, |
|
"learning_rate": 4.8518333608872015e-06, |
|
"loss": 0.948, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.0222222222222221, |
|
"grad_norm": 0.5598512291908264, |
|
"learning_rate": 4.822211486562989e-06, |
|
"loss": 0.953, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.025925925925926, |
|
"grad_norm": 0.7532334327697754, |
|
"learning_rate": 4.792595858935668e-06, |
|
"loss": 0.9703, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.0296296296296297, |
|
"grad_norm": 0.7283293604850769, |
|
"learning_rate": 4.7629875185665505e-06, |
|
"loss": 0.9526, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.0333333333333334, |
|
"grad_norm": 0.6575984358787537, |
|
"learning_rate": 4.733387505760913e-06, |
|
"loss": 0.9042, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.037037037037037, |
|
"grad_norm": 0.5753719210624695, |
|
"learning_rate": 4.703796860531429e-06, |
|
"loss": 0.9009, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.0407407407407407, |
|
"grad_norm": 0.7370662689208984, |
|
"learning_rate": 4.674216622561645e-06, |
|
"loss": 0.8645, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.0444444444444445, |
|
"grad_norm": 0.602418839931488, |
|
"learning_rate": 4.644647831169435e-06, |
|
"loss": 0.9141, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.048148148148148, |
|
"grad_norm": 0.7609613537788391, |
|
"learning_rate": 4.6150915252705005e-06, |
|
"loss": 0.8668, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.0518518518518518, |
|
"grad_norm": 0.8010672330856323, |
|
"learning_rate": 4.585548743341851e-06, |
|
"loss": 0.9242, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.0555555555555556, |
|
"grad_norm": 0.6908420324325562, |
|
"learning_rate": 4.556020523385326e-06, |
|
"loss": 0.9566, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.0592592592592593, |
|
"grad_norm": 0.7219347357749939, |
|
"learning_rate": 4.526507902891124e-06, |
|
"loss": 0.8987, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.0629629629629629, |
|
"grad_norm": 0.5726153254508972, |
|
"learning_rate": 4.497011918801347e-06, |
|
"loss": 0.9259, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.0666666666666667, |
|
"grad_norm": 0.7002944350242615, |
|
"learning_rate": 4.467533607473563e-06, |
|
"loss": 0.9171, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.0703703703703704, |
|
"grad_norm": 0.7401637434959412, |
|
"learning_rate": 4.438074004644407e-06, |
|
"loss": 0.9147, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.074074074074074, |
|
"grad_norm": 0.7317702770233154, |
|
"learning_rate": 4.408634145393172e-06, |
|
"loss": 0.8777, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.0777777777777777, |
|
"grad_norm": 0.586495041847229, |
|
"learning_rate": 4.379215064105454e-06, |
|
"loss": 0.8734, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.0814814814814815, |
|
"grad_norm": 0.7603331804275513, |
|
"learning_rate": 4.349817794436805e-06, |
|
"loss": 0.9757, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.0851851851851853, |
|
"grad_norm": 0.7039903402328491, |
|
"learning_rate": 4.32044336927641e-06, |
|
"loss": 0.9117, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.0888888888888888, |
|
"grad_norm": 0.7265645265579224, |
|
"learning_rate": 4.2910928207108005e-06, |
|
"loss": 0.9547, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.0925925925925926, |
|
"grad_norm": 0.5854629278182983, |
|
"learning_rate": 4.261767179987595e-06, |
|
"loss": 0.9309, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.0962962962962963, |
|
"grad_norm": 0.7084276676177979, |
|
"learning_rate": 4.232467477479255e-06, |
|
"loss": 0.9414, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.1, |
|
"grad_norm": 0.7032147645950317, |
|
"learning_rate": 4.203194742646893e-06, |
|
"loss": 0.846, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.1037037037037036, |
|
"grad_norm": 0.7182865142822266, |
|
"learning_rate": 4.173950004004097e-06, |
|
"loss": 0.9737, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.1074074074074074, |
|
"grad_norm": 0.6024776697158813, |
|
"learning_rate": 4.1447342890807905e-06, |
|
"loss": 0.8605, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.1111111111111112, |
|
"grad_norm": 0.717693567276001, |
|
"learning_rate": 4.115548624387136e-06, |
|
"loss": 0.8731, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.1148148148148147, |
|
"grad_norm": 0.8089867830276489, |
|
"learning_rate": 4.086394035377463e-06, |
|
"loss": 0.9019, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.1185185185185185, |
|
"grad_norm": 0.5785974860191345, |
|
"learning_rate": 4.057271546414242e-06, |
|
"loss": 0.9574, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.1222222222222222, |
|
"grad_norm": 0.7001700401306152, |
|
"learning_rate": 4.028182180732088e-06, |
|
"loss": 0.8993, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.125925925925926, |
|
"grad_norm": 0.7361912131309509, |
|
"learning_rate": 3.99912696040181e-06, |
|
"loss": 0.9711, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 1.1296296296296295, |
|
"grad_norm": 0.7708266973495483, |
|
"learning_rate": 3.970106906294509e-06, |
|
"loss": 0.9195, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 1.1333333333333333, |
|
"grad_norm": 0.5702573657035828, |
|
"learning_rate": 3.9411230380456925e-06, |
|
"loss": 0.9393, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 1.137037037037037, |
|
"grad_norm": 0.6527413725852966, |
|
"learning_rate": 3.912176374019462e-06, |
|
"loss": 0.9125, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 1.1407407407407408, |
|
"grad_norm": 0.6216891407966614, |
|
"learning_rate": 3.88326793127273e-06, |
|
"loss": 0.8595, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 1.1444444444444444, |
|
"grad_norm": 0.7108457684516907, |
|
"learning_rate": 3.85439872551948e-06, |
|
"loss": 0.945, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 1.1481481481481481, |
|
"grad_norm": 0.564195990562439, |
|
"learning_rate": 3.825569771095082e-06, |
|
"loss": 0.9172, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 1.151851851851852, |
|
"grad_norm": 0.7456059455871582, |
|
"learning_rate": 3.796782080920659e-06, |
|
"loss": 0.9229, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 1.1555555555555554, |
|
"grad_norm": 0.6403030157089233, |
|
"learning_rate": 3.768036666467486e-06, |
|
"loss": 1.0, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 1.1592592592592592, |
|
"grad_norm": 0.6477362513542175, |
|
"learning_rate": 3.7393345377214584e-06, |
|
"loss": 0.9649, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 1.162962962962963, |
|
"grad_norm": 0.7265921831130981, |
|
"learning_rate": 3.7106767031476075e-06, |
|
"loss": 0.9558, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 1.1666666666666667, |
|
"grad_norm": 0.6614460349082947, |
|
"learning_rate": 3.682064169654663e-06, |
|
"loss": 0.9338, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 1.1703703703703703, |
|
"grad_norm": 0.8571596145629883, |
|
"learning_rate": 3.6534979425596747e-06, |
|
"loss": 0.8639, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 1.174074074074074, |
|
"grad_norm": 0.7662659883499146, |
|
"learning_rate": 3.6249790255526916e-06, |
|
"loss": 0.9099, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 1.1777777777777778, |
|
"grad_norm": 0.6332697868347168, |
|
"learning_rate": 3.5965084206615012e-06, |
|
"loss": 0.966, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 1.1814814814814816, |
|
"grad_norm": 0.5719053149223328, |
|
"learning_rate": 3.568087128216414e-06, |
|
"loss": 0.9005, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 1.1851851851851851, |
|
"grad_norm": 0.7472560405731201, |
|
"learning_rate": 3.539716146815122e-06, |
|
"loss": 0.8842, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 1.1888888888888889, |
|
"grad_norm": 0.661870002746582, |
|
"learning_rate": 3.511396473287611e-06, |
|
"loss": 0.9547, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 1.1925925925925926, |
|
"grad_norm": 0.8332524299621582, |
|
"learning_rate": 3.483129102661137e-06, |
|
"loss": 1.0097, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 1.1962962962962962, |
|
"grad_norm": 0.7124307155609131, |
|
"learning_rate": 3.4549150281252635e-06, |
|
"loss": 0.9479, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.6653727889060974, |
|
"learning_rate": 3.4267552409969694e-06, |
|
"loss": 0.9566, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 1.2037037037037037, |
|
"grad_norm": 0.7246274948120117, |
|
"learning_rate": 3.398650730685813e-06, |
|
"loss": 0.8731, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 1.2074074074074075, |
|
"grad_norm": 0.7679101824760437, |
|
"learning_rate": 3.3706024846591717e-06, |
|
"loss": 0.8851, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 1.211111111111111, |
|
"grad_norm": 0.6830713152885437, |
|
"learning_rate": 3.3426114884075488e-06, |
|
"loss": 0.9429, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 1.2148148148148148, |
|
"grad_norm": 0.686126172542572, |
|
"learning_rate": 3.3146787254099424e-06, |
|
"loss": 0.9363, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 1.2185185185185186, |
|
"grad_norm": 1.0472172498703003, |
|
"learning_rate": 3.2868051770992935e-06, |
|
"loss": 0.8628, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 1.2222222222222223, |
|
"grad_norm": 0.6732811331748962, |
|
"learning_rate": 3.258991822828007e-06, |
|
"loss": 0.9343, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 1.2259259259259259, |
|
"grad_norm": 0.6006411910057068, |
|
"learning_rate": 3.2312396398335312e-06, |
|
"loss": 0.8932, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 1.2296296296296296, |
|
"grad_norm": 0.6865448355674744, |
|
"learning_rate": 3.2035496032040303e-06, |
|
"loss": 0.9113, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 1.2333333333333334, |
|
"grad_norm": 0.7750067114830017, |
|
"learning_rate": 3.175922685844125e-06, |
|
"loss": 0.8964, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 1.237037037037037, |
|
"grad_norm": 0.6137946248054504, |
|
"learning_rate": 3.1483598584407003e-06, |
|
"loss": 0.9198, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 1.2407407407407407, |
|
"grad_norm": 0.5940172672271729, |
|
"learning_rate": 3.1208620894288105e-06, |
|
"loss": 0.8925, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.2407407407407407, |
|
"eval_loss": 0.9176353812217712, |
|
"eval_runtime": 80.9941, |
|
"eval_samples_per_second": 3.593, |
|
"eval_steps_per_second": 0.457, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 1.2444444444444445, |
|
"grad_norm": 0.8746694326400757, |
|
"learning_rate": 3.093430344957643e-06, |
|
"loss": 0.9542, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 1.2481481481481482, |
|
"grad_norm": 0.7152467370033264, |
|
"learning_rate": 3.0660655888565827e-06, |
|
"loss": 0.9122, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 1.2518518518518518, |
|
"grad_norm": 0.665104866027832, |
|
"learning_rate": 3.038768782601335e-06, |
|
"loss": 0.8695, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 1.2555555555555555, |
|
"grad_norm": 0.6397359371185303, |
|
"learning_rate": 3.0115408852801535e-06, |
|
"loss": 0.9026, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 1.2592592592592593, |
|
"grad_norm": 0.6641426682472229, |
|
"learning_rate": 2.98438285356014e-06, |
|
"loss": 0.9131, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 1.262962962962963, |
|
"grad_norm": 0.7378568053245544, |
|
"learning_rate": 2.9572956416536267e-06, |
|
"loss": 0.9778, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 1.2666666666666666, |
|
"grad_norm": 0.7851204872131348, |
|
"learning_rate": 2.930280201284654e-06, |
|
"loss": 0.9295, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 1.2703703703703704, |
|
"grad_norm": 0.7360734939575195, |
|
"learning_rate": 2.9033374816555338e-06, |
|
"loss": 0.8333, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 1.2740740740740741, |
|
"grad_norm": 0.5486617088317871, |
|
"learning_rate": 2.8764684294134872e-06, |
|
"loss": 0.8636, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 1.2777777777777777, |
|
"grad_norm": 0.6200026273727417, |
|
"learning_rate": 2.8496739886173994e-06, |
|
"loss": 0.9163, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 1.2814814814814814, |
|
"grad_norm": 0.7656910419464111, |
|
"learning_rate": 2.822955100704634e-06, |
|
"loss": 0.8811, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 1.2851851851851852, |
|
"grad_norm": 0.8108608722686768, |
|
"learning_rate": 2.7963127044579697e-06, |
|
"loss": 0.9206, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 1.2888888888888888, |
|
"grad_norm": 0.7808861136436462, |
|
"learning_rate": 2.769747735972605e-06, |
|
"loss": 0.9116, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 1.2925925925925925, |
|
"grad_norm": 0.6127861142158508, |
|
"learning_rate": 2.743261128623269e-06, |
|
"loss": 0.8986, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 1.2962962962962963, |
|
"grad_norm": 0.8103310465812683, |
|
"learning_rate": 2.716853813031435e-06, |
|
"loss": 0.8832, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 1.3, |
|
"grad_norm": 0.658495306968689, |
|
"learning_rate": 2.6905267170326143e-06, |
|
"loss": 0.9457, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 1.3037037037037038, |
|
"grad_norm": 0.6721301078796387, |
|
"learning_rate": 2.6642807656437565e-06, |
|
"loss": 0.9182, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 1.3074074074074074, |
|
"grad_norm": 0.6494591236114502, |
|
"learning_rate": 2.6381168810307536e-06, |
|
"loss": 0.9245, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 1.3111111111111111, |
|
"grad_norm": 0.6653662919998169, |
|
"learning_rate": 2.612035982476039e-06, |
|
"loss": 0.9654, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 1.3148148148148149, |
|
"grad_norm": 0.6556596159934998, |
|
"learning_rate": 2.5860389863462765e-06, |
|
"loss": 0.9552, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 1.3185185185185184, |
|
"grad_norm": 0.7767282724380493, |
|
"learning_rate": 2.5601268060601816e-06, |
|
"loss": 0.901, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 1.3222222222222222, |
|
"grad_norm": 0.6174845099449158, |
|
"learning_rate": 2.534300352056416e-06, |
|
"loss": 0.8979, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 1.325925925925926, |
|
"grad_norm": 0.5829298496246338, |
|
"learning_rate": 2.508560531761597e-06, |
|
"loss": 0.9774, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 1.3296296296296295, |
|
"grad_norm": 0.6260789632797241, |
|
"learning_rate": 2.4829082495584244e-06, |
|
"loss": 0.9693, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 1.3333333333333333, |
|
"grad_norm": 0.6920310854911804, |
|
"learning_rate": 2.457344406753899e-06, |
|
"loss": 0.9345, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 1.337037037037037, |
|
"grad_norm": 0.6177324652671814, |
|
"learning_rate": 2.4318699015476495e-06, |
|
"loss": 0.9274, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 1.3407407407407408, |
|
"grad_norm": 0.6547250151634216, |
|
"learning_rate": 2.4064856290003863e-06, |
|
"loss": 0.9309, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 1.3444444444444446, |
|
"grad_norm": 0.7775738835334778, |
|
"learning_rate": 2.3811924810024385e-06, |
|
"loss": 0.9607, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 1.348148148148148, |
|
"grad_norm": 0.8030884861946106, |
|
"learning_rate": 2.35599134624243e-06, |
|
"loss": 0.9343, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 1.3518518518518519, |
|
"grad_norm": 0.5836490988731384, |
|
"learning_rate": 2.330883110176049e-06, |
|
"loss": 0.9088, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 1.3555555555555556, |
|
"grad_norm": 0.7231821417808533, |
|
"learning_rate": 2.3058686549949306e-06, |
|
"loss": 0.8505, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 1.3592592592592592, |
|
"grad_norm": 0.7363606095314026, |
|
"learning_rate": 2.2809488595956746e-06, |
|
"loss": 0.9975, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 1.362962962962963, |
|
"grad_norm": 0.7326072454452515, |
|
"learning_rate": 2.256124599548957e-06, |
|
"loss": 0.9272, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 1.3666666666666667, |
|
"grad_norm": 0.6802873015403748, |
|
"learning_rate": 2.2313967470687593e-06, |
|
"loss": 0.9038, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 1.3703703703703702, |
|
"grad_norm": 0.6956616640090942, |
|
"learning_rate": 2.2067661709817384e-06, |
|
"loss": 0.9062, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 1.374074074074074, |
|
"grad_norm": 0.5167267322540283, |
|
"learning_rate": 2.18223373669669e-06, |
|
"loss": 0.8963, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 1.3777777777777778, |
|
"grad_norm": 0.5965335965156555, |
|
"learning_rate": 2.157800306174139e-06, |
|
"loss": 0.9253, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 1.3814814814814815, |
|
"grad_norm": 0.6725478768348694, |
|
"learning_rate": 2.1334667378960642e-06, |
|
"loss": 0.9349, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 1.3851851851851853, |
|
"grad_norm": 0.6209405064582825, |
|
"learning_rate": 2.1092338868357305e-06, |
|
"loss": 0.9129, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 1.3888888888888888, |
|
"grad_norm": 0.7127699255943298, |
|
"learning_rate": 2.0851026044276405e-06, |
|
"loss": 0.9502, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 1.3925925925925926, |
|
"grad_norm": 0.7374861836433411, |
|
"learning_rate": 2.061073738537635e-06, |
|
"loss": 0.913, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 1.3962962962962964, |
|
"grad_norm": 0.6551845669746399, |
|
"learning_rate": 2.0371481334330913e-06, |
|
"loss": 0.9493, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.7371388077735901, |
|
"learning_rate": 2.013326629753259e-06, |
|
"loss": 0.9285, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 1.4037037037037037, |
|
"grad_norm": 0.7790321111679077, |
|
"learning_rate": 1.9896100644797316e-06, |
|
"loss": 0.8598, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 1.4074074074074074, |
|
"grad_norm": 0.6032355427742004, |
|
"learning_rate": 1.9659992709070346e-06, |
|
"loss": 0.9298, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 1.411111111111111, |
|
"grad_norm": 0.7076795101165771, |
|
"learning_rate": 1.9424950786133414e-06, |
|
"loss": 0.92, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 1.4148148148148147, |
|
"grad_norm": 0.8255873322486877, |
|
"learning_rate": 1.919098313431335e-06, |
|
"loss": 0.8778, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 1.4185185185185185, |
|
"grad_norm": 0.7901713848114014, |
|
"learning_rate": 1.8958097974191909e-06, |
|
"loss": 0.8844, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 1.4222222222222223, |
|
"grad_norm": 0.7087342739105225, |
|
"learning_rate": 1.8726303488316822e-06, |
|
"loss": 0.9575, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 1.425925925925926, |
|
"grad_norm": 0.7450980544090271, |
|
"learning_rate": 1.8495607820914451e-06, |
|
"loss": 0.811, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 1.4296296296296296, |
|
"grad_norm": 0.5872119665145874, |
|
"learning_rate": 1.826601907760357e-06, |
|
"loss": 0.9075, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 1.4333333333333333, |
|
"grad_norm": 0.7893672585487366, |
|
"learning_rate": 1.8037545325110506e-06, |
|
"loss": 0.9549, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 1.4370370370370371, |
|
"grad_norm": 0.7330056428909302, |
|
"learning_rate": 1.781019459098584e-06, |
|
"loss": 0.9366, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 1.4407407407407407, |
|
"grad_norm": 0.6859740614891052, |
|
"learning_rate": 1.7583974863322272e-06, |
|
"loss": 0.9284, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 1.4444444444444444, |
|
"grad_norm": 0.7870423793792725, |
|
"learning_rate": 1.7358894090473928e-06, |
|
"loss": 0.8698, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 1.4481481481481482, |
|
"grad_norm": 0.6686491370201111, |
|
"learning_rate": 1.7134960180777171e-06, |
|
"loss": 0.9149, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 1.4518518518518517, |
|
"grad_norm": 0.7235841751098633, |
|
"learning_rate": 1.6912181002272714e-06, |
|
"loss": 0.9068, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 1.4555555555555555, |
|
"grad_norm": 0.6145541667938232, |
|
"learning_rate": 1.6690564382429104e-06, |
|
"loss": 0.8985, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 1.4592592592592593, |
|
"grad_norm": 0.5945561528205872, |
|
"learning_rate": 1.6470118107867777e-06, |
|
"loss": 0.9318, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 1.462962962962963, |
|
"grad_norm": 0.6769958734512329, |
|
"learning_rate": 1.6250849924089485e-06, |
|
"loss": 0.9207, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 1.4666666666666668, |
|
"grad_norm": 0.6360178589820862, |
|
"learning_rate": 1.6032767535202042e-06, |
|
"loss": 0.9344, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 1.4703703703703703, |
|
"grad_norm": 0.6406002640724182, |
|
"learning_rate": 1.581587860364977e-06, |
|
"loss": 0.8903, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 1.474074074074074, |
|
"grad_norm": 0.7576456069946289, |
|
"learning_rate": 1.560019074994416e-06, |
|
"loss": 0.8949, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 1.4777777777777779, |
|
"grad_norm": 0.8080588579177856, |
|
"learning_rate": 1.5385711552396227e-06, |
|
"loss": 0.9252, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 1.4814814814814814, |
|
"grad_norm": 0.6083511114120483, |
|
"learning_rate": 1.5172448546850166e-06, |
|
"loss": 0.9096, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 1.4851851851851852, |
|
"grad_norm": 0.7563885450363159, |
|
"learning_rate": 1.4960409226418576e-06, |
|
"loss": 0.966, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 1.488888888888889, |
|
"grad_norm": 0.8337453603744507, |
|
"learning_rate": 1.4749601041219246e-06, |
|
"loss": 0.933, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 1.4925925925925925, |
|
"grad_norm": 0.5826141238212585, |
|
"learning_rate": 1.4540031398113335e-06, |
|
"loss": 0.896, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.4925925925925925, |
|
"eval_loss": 0.9128310084342957, |
|
"eval_runtime": 80.6954, |
|
"eval_samples_per_second": 3.606, |
|
"eval_steps_per_second": 0.459, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 1.4962962962962962, |
|
"grad_norm": 0.64705890417099, |
|
"learning_rate": 1.4331707660445155e-06, |
|
"loss": 0.8723, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 1.5, |
|
"grad_norm": 0.6053957939147949, |
|
"learning_rate": 1.4124637147783431e-06, |
|
"loss": 0.8731, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 1.5037037037037035, |
|
"grad_norm": 0.8161659836769104, |
|
"learning_rate": 1.3918827135664186e-06, |
|
"loss": 0.9542, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 1.5074074074074075, |
|
"grad_norm": 0.8037695288658142, |
|
"learning_rate": 1.371428485533498e-06, |
|
"loss": 0.9263, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 1.511111111111111, |
|
"grad_norm": 0.6966097950935364, |
|
"learning_rate": 1.3511017493501005e-06, |
|
"loss": 0.9611, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 1.5148148148148148, |
|
"grad_norm": 0.7274028062820435, |
|
"learning_rate": 1.3309032192072463e-06, |
|
"loss": 0.8968, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 1.5185185185185186, |
|
"grad_norm": 0.657966136932373, |
|
"learning_rate": 1.3108336047913633e-06, |
|
"loss": 0.9025, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 1.5222222222222221, |
|
"grad_norm": 0.7330816388130188, |
|
"learning_rate": 1.29089361125936e-06, |
|
"loss": 0.9422, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 1.525925925925926, |
|
"grad_norm": 0.6839099526405334, |
|
"learning_rate": 1.2710839392138386e-06, |
|
"loss": 0.9604, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 1.5296296296296297, |
|
"grad_norm": 0.7069361805915833, |
|
"learning_rate": 1.251405284678488e-06, |
|
"loss": 0.9125, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 1.5333333333333332, |
|
"grad_norm": 0.6964563131332397, |
|
"learning_rate": 1.2318583390736256e-06, |
|
"loss": 0.899, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 1.5370370370370372, |
|
"grad_norm": 0.6390750408172607, |
|
"learning_rate": 1.2124437891918995e-06, |
|
"loss": 0.8699, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 1.5407407407407407, |
|
"grad_norm": 0.7302431464195251, |
|
"learning_rate": 1.1931623171741653e-06, |
|
"loss": 0.9124, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 1.5444444444444443, |
|
"grad_norm": 0.6087197065353394, |
|
"learning_rate": 1.1740146004855141e-06, |
|
"loss": 0.8754, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 1.5481481481481483, |
|
"grad_norm": 0.768294632434845, |
|
"learning_rate": 1.1550013118914665e-06, |
|
"loss": 0.9578, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 1.5518518518518518, |
|
"grad_norm": 0.6789171099662781, |
|
"learning_rate": 1.1361231194343436e-06, |
|
"loss": 0.9235, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 1.5555555555555556, |
|
"grad_norm": 0.9581536054611206, |
|
"learning_rate": 1.1173806864097885e-06, |
|
"loss": 0.8839, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 1.5592592592592593, |
|
"grad_norm": 0.5669332146644592, |
|
"learning_rate": 1.0987746713434578e-06, |
|
"loss": 0.9126, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 1.5629629629629629, |
|
"grad_norm": 0.8494063019752502, |
|
"learning_rate": 1.080305727967893e-06, |
|
"loss": 0.8771, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 1.5666666666666667, |
|
"grad_norm": 0.7187017202377319, |
|
"learning_rate": 1.0619745051995473e-06, |
|
"loss": 0.9504, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 1.5703703703703704, |
|
"grad_norm": 0.5970302820205688, |
|
"learning_rate": 1.043781647115979e-06, |
|
"loss": 0.8693, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 1.574074074074074, |
|
"grad_norm": 0.6068917512893677, |
|
"learning_rate": 1.0257277929332332e-06, |
|
"loss": 0.9556, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 1.5777777777777777, |
|
"grad_norm": 0.5248574018478394, |
|
"learning_rate": 1.0078135769833758e-06, |
|
"loss": 0.9034, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 1.5814814814814815, |
|
"grad_norm": 0.593900740146637, |
|
"learning_rate": 9.900396286922025e-07, |
|
"loss": 0.9028, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 1.585185185185185, |
|
"grad_norm": 0.6070235371589661, |
|
"learning_rate": 9.72406572557133e-07, |
|
"loss": 0.8641, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 1.588888888888889, |
|
"grad_norm": 0.6419976353645325, |
|
"learning_rate": 9.549150281252633e-07, |
|
"loss": 0.9095, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 1.5925925925925926, |
|
"grad_norm": 0.8620632290840149, |
|
"learning_rate": 9.375656099715935e-07, |
|
"loss": 0.974, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 1.5962962962962963, |
|
"grad_norm": 0.6101662516593933, |
|
"learning_rate": 9.203589276774438e-07, |
|
"loss": 0.8868, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.6099417209625244, |
|
"learning_rate": 9.032955858090319e-07, |
|
"loss": 0.8978, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 1.6037037037037036, |
|
"grad_norm": 0.8809055685997009, |
|
"learning_rate": 8.86376183896226e-07, |
|
"loss": 0.8905, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 1.6074074074074074, |
|
"grad_norm": 0.5985243320465088, |
|
"learning_rate": 8.696013164114902e-07, |
|
"loss": 0.8914, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 1.6111111111111112, |
|
"grad_norm": 0.6197245121002197, |
|
"learning_rate": 8.529715727489912e-07, |
|
"loss": 0.8962, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 1.6148148148148147, |
|
"grad_norm": 0.6981231570243835, |
|
"learning_rate": 8.364875372038878e-07, |
|
"loss": 0.9588, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 1.6185185185185185, |
|
"grad_norm": 0.9516739249229431, |
|
"learning_rate": 8.201497889518073e-07, |
|
"loss": 0.894, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 1.6222222222222222, |
|
"grad_norm": 0.7582102417945862, |
|
"learning_rate": 8.039589020284926e-07, |
|
"loss": 0.8848, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 1.6259259259259258, |
|
"grad_norm": 0.7806898355484009, |
|
"learning_rate": 7.879154453096305e-07, |
|
"loss": 0.9589, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 1.6296296296296298, |
|
"grad_norm": 0.7488991022109985, |
|
"learning_rate": 7.720199824908692e-07, |
|
"loss": 0.876, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 1.6333333333333333, |
|
"grad_norm": 0.7136239409446716, |
|
"learning_rate": 7.562730720680111e-07, |
|
"loss": 0.9316, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 1.637037037037037, |
|
"grad_norm": 0.7333770990371704, |
|
"learning_rate": 7.406752673173851e-07, |
|
"loss": 0.8628, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 1.6407407407407408, |
|
"grad_norm": 0.8281632661819458, |
|
"learning_rate": 7.25227116276413e-07, |
|
"loss": 0.9111, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 1.6444444444444444, |
|
"grad_norm": 0.742244303226471, |
|
"learning_rate": 7.099291617243526e-07, |
|
"loss": 1.0076, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 1.6481481481481481, |
|
"grad_norm": 0.9548508524894714, |
|
"learning_rate": 6.947819411632223e-07, |
|
"loss": 0.8449, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 1.651851851851852, |
|
"grad_norm": 0.6826335787773132, |
|
"learning_rate": 6.797859867989226e-07, |
|
"loss": 0.9281, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 1.6555555555555554, |
|
"grad_norm": 0.7542606592178345, |
|
"learning_rate": 6.649418255225298e-07, |
|
"loss": 0.9343, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 1.6592592592592592, |
|
"grad_norm": 0.6692271828651428, |
|
"learning_rate": 6.502499788917893e-07, |
|
"loss": 0.9217, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 1.662962962962963, |
|
"grad_norm": 0.6740605235099792, |
|
"learning_rate": 6.357109631127889e-07, |
|
"loss": 0.9304, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 1.6666666666666665, |
|
"grad_norm": 0.7319871187210083, |
|
"learning_rate": 6.213252890218163e-07, |
|
"loss": 0.937, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 1.6703703703703705, |
|
"grad_norm": 0.5654203295707703, |
|
"learning_rate": 6.07093462067419e-07, |
|
"loss": 0.8914, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 1.674074074074074, |
|
"grad_norm": 0.5713678002357483, |
|
"learning_rate": 5.930159822926407e-07, |
|
"loss": 0.9257, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 1.6777777777777778, |
|
"grad_norm": 0.838940441608429, |
|
"learning_rate": 5.79093344317449e-07, |
|
"loss": 0.9104, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 1.6814814814814816, |
|
"grad_norm": 0.6449588537216187, |
|
"learning_rate": 5.653260373213632e-07, |
|
"loss": 0.8805, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 1.6851851851851851, |
|
"grad_norm": 0.7371458411216736, |
|
"learning_rate": 5.517145450262639e-07, |
|
"loss": 0.8835, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 1.6888888888888889, |
|
"grad_norm": 0.679885983467102, |
|
"learning_rate": 5.382593456793933e-07, |
|
"loss": 0.9306, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 1.6925925925925926, |
|
"grad_norm": 0.5046345591545105, |
|
"learning_rate": 5.249609120365579e-07, |
|
"loss": 0.8928, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 1.6962962962962962, |
|
"grad_norm": 0.8400017023086548, |
|
"learning_rate": 5.118197113455164e-07, |
|
"loss": 0.9142, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 1.7, |
|
"grad_norm": 0.6406440734863281, |
|
"learning_rate": 4.988362053295564e-07, |
|
"loss": 0.8868, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 1.7037037037037037, |
|
"grad_norm": 0.633682906627655, |
|
"learning_rate": 4.860108501712824e-07, |
|
"loss": 0.913, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 1.7074074074074073, |
|
"grad_norm": 0.5694250464439392, |
|
"learning_rate": 4.733440964965791e-07, |
|
"loss": 0.9226, |
|
"step": 466 |
|
}, |
|
{ |
|
"epoch": 1.7111111111111112, |
|
"grad_norm": 0.7359141707420349, |
|
"learning_rate": 4.6083638935878025e-07, |
|
"loss": 0.9148, |
|
"step": 467 |
|
}, |
|
{ |
|
"epoch": 1.7148148148148148, |
|
"grad_norm": 0.669040322303772, |
|
"learning_rate": 4.484881682230341e-07, |
|
"loss": 0.8438, |
|
"step": 468 |
|
}, |
|
{ |
|
"epoch": 1.7185185185185186, |
|
"grad_norm": 0.6235198974609375, |
|
"learning_rate": 4.3629986695086166e-07, |
|
"loss": 0.9097, |
|
"step": 469 |
|
}, |
|
{ |
|
"epoch": 1.7222222222222223, |
|
"grad_norm": 0.617144763469696, |
|
"learning_rate": 4.242719137849077e-07, |
|
"loss": 0.8961, |
|
"step": 470 |
|
}, |
|
{ |
|
"epoch": 1.7259259259259259, |
|
"grad_norm": 0.6972277164459229, |
|
"learning_rate": 4.124047313339025e-07, |
|
"loss": 0.8856, |
|
"step": 471 |
|
}, |
|
{ |
|
"epoch": 1.7296296296296296, |
|
"grad_norm": 0.7551361322402954, |
|
"learning_rate": 4.00698736557808e-07, |
|
"loss": 0.9143, |
|
"step": 472 |
|
}, |
|
{ |
|
"epoch": 1.7333333333333334, |
|
"grad_norm": 0.7269377708435059, |
|
"learning_rate": 3.891543407531673e-07, |
|
"loss": 0.9145, |
|
"step": 473 |
|
}, |
|
{ |
|
"epoch": 1.737037037037037, |
|
"grad_norm": 0.7635181546211243, |
|
"learning_rate": 3.777719495386567e-07, |
|
"loss": 0.8855, |
|
"step": 474 |
|
}, |
|
{ |
|
"epoch": 1.7407407407407407, |
|
"grad_norm": 0.745471715927124, |
|
"learning_rate": 3.665519628408332e-07, |
|
"loss": 0.9049, |
|
"step": 475 |
|
}, |
|
{ |
|
"epoch": 1.7444444444444445, |
|
"grad_norm": 0.983215868473053, |
|
"learning_rate": 3.5549477488007853e-07, |
|
"loss": 0.8904, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.7444444444444445, |
|
"eval_loss": 0.910611093044281, |
|
"eval_runtime": 80.7891, |
|
"eval_samples_per_second": 3.602, |
|
"eval_steps_per_second": 0.458, |
|
"step": 476 |
|
}, |
|
{ |
|
"epoch": 1.748148148148148, |
|
"grad_norm": 0.645391047000885, |
|
"learning_rate": 3.4460077415675473e-07, |
|
"loss": 0.9156, |
|
"step": 477 |
|
}, |
|
{ |
|
"epoch": 1.751851851851852, |
|
"grad_norm": 0.7084014415740967, |
|
"learning_rate": 3.3387034343755063e-07, |
|
"loss": 0.9417, |
|
"step": 478 |
|
}, |
|
{ |
|
"epoch": 1.7555555555555555, |
|
"grad_norm": 0.6383021473884583, |
|
"learning_rate": 3.2330385974203184e-07, |
|
"loss": 0.9339, |
|
"step": 479 |
|
}, |
|
{ |
|
"epoch": 1.7592592592592593, |
|
"grad_norm": 0.6533625721931458, |
|
"learning_rate": 3.1290169432939556e-07, |
|
"loss": 0.9548, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 1.762962962962963, |
|
"grad_norm": 0.5707213878631592, |
|
"learning_rate": 3.0266421268542734e-07, |
|
"loss": 0.9544, |
|
"step": 481 |
|
}, |
|
{ |
|
"epoch": 1.7666666666666666, |
|
"grad_norm": 0.6648502945899963, |
|
"learning_rate": 2.925917745096568e-07, |
|
"loss": 0.8525, |
|
"step": 482 |
|
}, |
|
{ |
|
"epoch": 1.7703703703703704, |
|
"grad_norm": 0.6798570156097412, |
|
"learning_rate": 2.826847337027222e-07, |
|
"loss": 0.9217, |
|
"step": 483 |
|
}, |
|
{ |
|
"epoch": 1.7740740740740741, |
|
"grad_norm": 0.709642231464386, |
|
"learning_rate": 2.7294343835393366e-07, |
|
"loss": 0.8996, |
|
"step": 484 |
|
}, |
|
{ |
|
"epoch": 1.7777777777777777, |
|
"grad_norm": 0.7643037438392639, |
|
"learning_rate": 2.6336823072904305e-07, |
|
"loss": 0.8625, |
|
"step": 485 |
|
}, |
|
{ |
|
"epoch": 1.7814814814814814, |
|
"grad_norm": 0.7004448771476746, |
|
"learning_rate": 2.539594472582213e-07, |
|
"loss": 0.9118, |
|
"step": 486 |
|
}, |
|
{ |
|
"epoch": 1.7851851851851852, |
|
"grad_norm": 0.6062957048416138, |
|
"learning_rate": 2.447174185242324e-07, |
|
"loss": 0.8849, |
|
"step": 487 |
|
}, |
|
{ |
|
"epoch": 1.7888888888888888, |
|
"grad_norm": 0.6031033396720886, |
|
"learning_rate": 2.3564246925082358e-07, |
|
"loss": 0.924, |
|
"step": 488 |
|
}, |
|
{ |
|
"epoch": 1.7925925925925927, |
|
"grad_norm": 0.6818556189537048, |
|
"learning_rate": 2.2673491829131365e-07, |
|
"loss": 0.9206, |
|
"step": 489 |
|
}, |
|
{ |
|
"epoch": 1.7962962962962963, |
|
"grad_norm": 0.7448561191558838, |
|
"learning_rate": 2.179950786173879e-07, |
|
"loss": 0.8549, |
|
"step": 490 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.603404700756073, |
|
"learning_rate": 2.0942325730810565e-07, |
|
"loss": 0.919, |
|
"step": 491 |
|
}, |
|
{ |
|
"epoch": 1.8037037037037038, |
|
"grad_norm": 0.7165398001670837, |
|
"learning_rate": 2.01019755539108e-07, |
|
"loss": 0.9021, |
|
"step": 492 |
|
}, |
|
{ |
|
"epoch": 1.8074074074074074, |
|
"grad_norm": 0.7593845725059509, |
|
"learning_rate": 1.9278486857203683e-07, |
|
"loss": 0.9153, |
|
"step": 493 |
|
}, |
|
{ |
|
"epoch": 1.8111111111111111, |
|
"grad_norm": 0.6313470602035522, |
|
"learning_rate": 1.8471888574415953e-07, |
|
"loss": 0.9106, |
|
"step": 494 |
|
}, |
|
{ |
|
"epoch": 1.8148148148148149, |
|
"grad_norm": 0.6173641085624695, |
|
"learning_rate": 1.7682209045820687e-07, |
|
"loss": 0.8888, |
|
"step": 495 |
|
}, |
|
{ |
|
"epoch": 1.8185185185185184, |
|
"grad_norm": 0.76194167137146, |
|
"learning_rate": 1.690947601724091e-07, |
|
"loss": 0.9064, |
|
"step": 496 |
|
}, |
|
{ |
|
"epoch": 1.8222222222222222, |
|
"grad_norm": 0.675755500793457, |
|
"learning_rate": 1.6153716639075223e-07, |
|
"loss": 0.9266, |
|
"step": 497 |
|
}, |
|
{ |
|
"epoch": 1.825925925925926, |
|
"grad_norm": 0.7498816847801208, |
|
"learning_rate": 1.5414957465343883e-07, |
|
"loss": 0.8579, |
|
"step": 498 |
|
}, |
|
{ |
|
"epoch": 1.8296296296296295, |
|
"grad_norm": 0.656910240650177, |
|
"learning_rate": 1.4693224452755284e-07, |
|
"loss": 0.821, |
|
"step": 499 |
|
}, |
|
{ |
|
"epoch": 1.8333333333333335, |
|
"grad_norm": 0.6735762357711792, |
|
"learning_rate": 1.3988542959794627e-07, |
|
"loss": 0.8731, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 1.837037037037037, |
|
"grad_norm": 0.6537667512893677, |
|
"learning_rate": 1.330093774583252e-07, |
|
"loss": 0.9249, |
|
"step": 501 |
|
}, |
|
{ |
|
"epoch": 1.8407407407407408, |
|
"grad_norm": 0.6112355589866638, |
|
"learning_rate": 1.2630432970255014e-07, |
|
"loss": 0.8936, |
|
"step": 502 |
|
}, |
|
{ |
|
"epoch": 1.8444444444444446, |
|
"grad_norm": 0.7084822058677673, |
|
"learning_rate": 1.1977052191615158e-07, |
|
"loss": 0.9221, |
|
"step": 503 |
|
}, |
|
{ |
|
"epoch": 1.848148148148148, |
|
"grad_norm": 0.652979850769043, |
|
"learning_rate": 1.1340818366804728e-07, |
|
"loss": 0.9073, |
|
"step": 504 |
|
}, |
|
{ |
|
"epoch": 1.8518518518518519, |
|
"grad_norm": 0.6980672478675842, |
|
"learning_rate": 1.0721753850247984e-07, |
|
"loss": 0.9294, |
|
"step": 505 |
|
}, |
|
{ |
|
"epoch": 1.8555555555555556, |
|
"grad_norm": 0.7224528789520264, |
|
"learning_rate": 1.0119880393116177e-07, |
|
"loss": 0.8842, |
|
"step": 506 |
|
}, |
|
{ |
|
"epoch": 1.8592592592592592, |
|
"grad_norm": 0.6340327262878418, |
|
"learning_rate": 9.535219142563168e-08, |
|
"loss": 0.9598, |
|
"step": 507 |
|
}, |
|
{ |
|
"epoch": 1.862962962962963, |
|
"grad_norm": 0.686582624912262, |
|
"learning_rate": 8.967790640982466e-08, |
|
"loss": 0.9344, |
|
"step": 508 |
|
}, |
|
{ |
|
"epoch": 1.8666666666666667, |
|
"grad_norm": 0.5959629416465759, |
|
"learning_rate": 8.417614825285636e-08, |
|
"loss": 0.9026, |
|
"step": 509 |
|
}, |
|
{ |
|
"epoch": 1.8703703703703702, |
|
"grad_norm": 0.6575542092323303, |
|
"learning_rate": 7.884711026201586e-08, |
|
"loss": 0.9288, |
|
"step": 510 |
|
}, |
|
{ |
|
"epoch": 1.8740740740740742, |
|
"grad_norm": 0.5704898238182068, |
|
"learning_rate": 7.369097967597493e-08, |
|
"loss": 0.8636, |
|
"step": 511 |
|
}, |
|
{ |
|
"epoch": 1.8777777777777778, |
|
"grad_norm": 0.6155747771263123, |
|
"learning_rate": 6.870793765820783e-08, |
|
"loss": 0.8362, |
|
"step": 512 |
|
}, |
|
{ |
|
"epoch": 1.8814814814814815, |
|
"grad_norm": 0.6208741664886475, |
|
"learning_rate": 6.389815929062848e-08, |
|
"loss": 0.9179, |
|
"step": 513 |
|
}, |
|
{ |
|
"epoch": 1.8851851851851853, |
|
"grad_norm": 0.7014544010162354, |
|
"learning_rate": 5.92618135674361e-08, |
|
"loss": 0.9333, |
|
"step": 514 |
|
}, |
|
{ |
|
"epoch": 1.8888888888888888, |
|
"grad_norm": 0.814078152179718, |
|
"learning_rate": 5.479906338917984e-08, |
|
"loss": 0.9186, |
|
"step": 515 |
|
}, |
|
{ |
|
"epoch": 1.8925925925925926, |
|
"grad_norm": 0.7297834753990173, |
|
"learning_rate": 5.0510065557034526e-08, |
|
"loss": 0.8992, |
|
"step": 516 |
|
}, |
|
{ |
|
"epoch": 1.8962962962962964, |
|
"grad_norm": 0.6444849371910095, |
|
"learning_rate": 4.639497076728949e-08, |
|
"loss": 0.94, |
|
"step": 517 |
|
}, |
|
{ |
|
"epoch": 1.9, |
|
"grad_norm": 0.5319604873657227, |
|
"learning_rate": 4.245392360605727e-08, |
|
"loss": 0.9396, |
|
"step": 518 |
|
}, |
|
{ |
|
"epoch": 1.9037037037037037, |
|
"grad_norm": 0.9374611973762512, |
|
"learning_rate": 3.86870625441893e-08, |
|
"loss": 0.9718, |
|
"step": 519 |
|
}, |
|
{ |
|
"epoch": 1.9074074074074074, |
|
"grad_norm": 0.5568841695785522, |
|
"learning_rate": 3.5094519932415417e-08, |
|
"loss": 0.89, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 1.911111111111111, |
|
"grad_norm": 0.9112274646759033, |
|
"learning_rate": 3.167642199668863e-08, |
|
"loss": 0.925, |
|
"step": 521 |
|
}, |
|
{ |
|
"epoch": 1.914814814814815, |
|
"grad_norm": 0.655830979347229, |
|
"learning_rate": 2.843288883375539e-08, |
|
"loss": 0.9135, |
|
"step": 522 |
|
}, |
|
{ |
|
"epoch": 1.9185185185185185, |
|
"grad_norm": 0.5499829649925232, |
|
"learning_rate": 2.5364034406930026e-08, |
|
"loss": 0.8902, |
|
"step": 523 |
|
}, |
|
{ |
|
"epoch": 1.9222222222222223, |
|
"grad_norm": 0.9093420505523682, |
|
"learning_rate": 2.2469966542096323e-08, |
|
"loss": 0.971, |
|
"step": 524 |
|
}, |
|
{ |
|
"epoch": 1.925925925925926, |
|
"grad_norm": 0.8075233101844788, |
|
"learning_rate": 1.975078692391552e-08, |
|
"loss": 0.9288, |
|
"step": 525 |
|
}, |
|
{ |
|
"epoch": 1.9296296296296296, |
|
"grad_norm": 0.6721240282058716, |
|
"learning_rate": 1.7206591092253642e-08, |
|
"loss": 0.8983, |
|
"step": 526 |
|
}, |
|
{ |
|
"epoch": 1.9333333333333333, |
|
"grad_norm": 0.6682837605476379, |
|
"learning_rate": 1.4837468438826385e-08, |
|
"loss": 0.9423, |
|
"step": 527 |
|
}, |
|
{ |
|
"epoch": 1.9370370370370371, |
|
"grad_norm": 0.653581440448761, |
|
"learning_rate": 1.264350220405719e-08, |
|
"loss": 0.9542, |
|
"step": 528 |
|
}, |
|
{ |
|
"epoch": 1.9407407407407407, |
|
"grad_norm": 0.5496450066566467, |
|
"learning_rate": 1.0624769474152363e-08, |
|
"loss": 0.9059, |
|
"step": 529 |
|
}, |
|
{ |
|
"epoch": 1.9444444444444444, |
|
"grad_norm": 0.7037910223007202, |
|
"learning_rate": 8.781341178393244e-09, |
|
"loss": 0.8928, |
|
"step": 530 |
|
}, |
|
{ |
|
"epoch": 1.9481481481481482, |
|
"grad_norm": 0.6391336917877197, |
|
"learning_rate": 7.1132820866431915e-09, |
|
"loss": 0.8936, |
|
"step": 531 |
|
}, |
|
{ |
|
"epoch": 1.9518518518518517, |
|
"grad_norm": 0.7979388236999512, |
|
"learning_rate": 5.620650807073857e-09, |
|
"loss": 0.8871, |
|
"step": 532 |
|
}, |
|
{ |
|
"epoch": 1.9555555555555557, |
|
"grad_norm": 0.6291653513908386, |
|
"learning_rate": 4.303499784102383e-09, |
|
"loss": 0.8815, |
|
"step": 533 |
|
}, |
|
{ |
|
"epoch": 1.9592592592592593, |
|
"grad_norm": 0.7071843147277832, |
|
"learning_rate": 3.1618752965534295e-09, |
|
"loss": 0.8984, |
|
"step": 534 |
|
}, |
|
{ |
|
"epoch": 1.9629629629629628, |
|
"grad_norm": 0.5879070162773132, |
|
"learning_rate": 2.19581745602826e-09, |
|
"loss": 0.849, |
|
"step": 535 |
|
}, |
|
{ |
|
"epoch": 1.9666666666666668, |
|
"grad_norm": 0.743624746799469, |
|
"learning_rate": 1.4053602054991954e-09, |
|
"loss": 0.879, |
|
"step": 536 |
|
}, |
|
{ |
|
"epoch": 1.9703703703703703, |
|
"grad_norm": 0.5870293974876404, |
|
"learning_rate": 7.905313181150176e-10, |
|
"loss": 0.9257, |
|
"step": 537 |
|
}, |
|
{ |
|
"epoch": 1.974074074074074, |
|
"grad_norm": 0.7187138199806213, |
|
"learning_rate": 3.513523962256349e-10, |
|
"loss": 0.9768, |
|
"step": 538 |
|
}, |
|
{ |
|
"epoch": 1.9777777777777779, |
|
"grad_norm": 0.6711537837982178, |
|
"learning_rate": 8.783887062324692e-11, |
|
"loss": 0.9182, |
|
"step": 539 |
|
}, |
|
{ |
|
"epoch": 1.9814814814814814, |
|
"grad_norm": 0.66741943359375, |
|
"learning_rate": 0.0, |
|
"loss": 0.8763, |
|
"step": 540 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 540, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 2, |
|
"save_steps": 135, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 2.9871043243514266e+18, |
|
"train_batch_size": 1, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|