|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 0.26293817132021, |
|
"eval_steps": 500, |
|
"global_step": 18580, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 58.19491958618164, |
|
"learning_rate": 1.6666666666666667e-06, |
|
"loss": 4.5462, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 51.19196319580078, |
|
"learning_rate": 3.3333333333333333e-06, |
|
"loss": 4.6693, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 45.4248161315918, |
|
"learning_rate": 5e-06, |
|
"loss": 4.6065, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 57.08290100097656, |
|
"learning_rate": 6.666666666666667e-06, |
|
"loss": 4.4395, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 40.65673828125, |
|
"learning_rate": 8.333333333333334e-06, |
|
"loss": 4.4641, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 40.7547492980957, |
|
"learning_rate": 1e-05, |
|
"loss": 4.4638, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 40.71052169799805, |
|
"learning_rate": 1.1666666666666668e-05, |
|
"loss": 4.3721, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 32.69596862792969, |
|
"learning_rate": 1.3333333333333333e-05, |
|
"loss": 4.3784, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 27.53285026550293, |
|
"learning_rate": 1.5e-05, |
|
"loss": 4.3627, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 39.0136833190918, |
|
"learning_rate": 1.6666666666666667e-05, |
|
"loss": 4.2018, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 39.9036750793457, |
|
"learning_rate": 1.8333333333333333e-05, |
|
"loss": 4.1214, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 26.16208267211914, |
|
"learning_rate": 2e-05, |
|
"loss": 4.0551, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 35.66220474243164, |
|
"learning_rate": 2.1666666666666667e-05, |
|
"loss": 4.0599, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 22.310619354248047, |
|
"learning_rate": 2.3333333333333336e-05, |
|
"loss": 4.181, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 31.29083824157715, |
|
"learning_rate": 2.5e-05, |
|
"loss": 4.0389, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 18.66942596435547, |
|
"learning_rate": 2.6666666666666667e-05, |
|
"loss": 4.0888, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 0.0, |
|
"grad_norm": 47.483428955078125, |
|
"learning_rate": 2.8333333333333335e-05, |
|
"loss": 4.0918, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 51.05717468261719, |
|
"learning_rate": 3e-05, |
|
"loss": 3.9807, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 67.01704406738281, |
|
"learning_rate": 3.1666666666666666e-05, |
|
"loss": 4.0331, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 40.98155975341797, |
|
"learning_rate": 3.3333333333333335e-05, |
|
"loss": 4.039, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 29.619321823120117, |
|
"learning_rate": 3.5e-05, |
|
"loss": 4.077, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 41.605018615722656, |
|
"learning_rate": 3.6666666666666666e-05, |
|
"loss": 4.044, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 34.36818313598633, |
|
"learning_rate": 3.8333333333333334e-05, |
|
"loss": 3.974, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 26.917036056518555, |
|
"learning_rate": 4e-05, |
|
"loss": 4.0088, |
|
"step": 480 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 25.219558715820312, |
|
"learning_rate": 4.166666666666667e-05, |
|
"loss": 3.8768, |
|
"step": 500 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 24.45106315612793, |
|
"learning_rate": 4.3333333333333334e-05, |
|
"loss": 3.8979, |
|
"step": 520 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 39.479461669921875, |
|
"learning_rate": 4.5e-05, |
|
"loss": 3.9241, |
|
"step": 540 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 46.96614456176758, |
|
"learning_rate": 4.666666666666667e-05, |
|
"loss": 3.8796, |
|
"step": 560 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 31.622241973876953, |
|
"learning_rate": 4.8333333333333334e-05, |
|
"loss": 3.9045, |
|
"step": 580 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 146.8946990966797, |
|
"learning_rate": 5e-05, |
|
"loss": 3.941, |
|
"step": 600 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 29.78015899658203, |
|
"learning_rate": 4.9999868880914903e-05, |
|
"loss": 3.9279, |
|
"step": 620 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 44.591156005859375, |
|
"learning_rate": 4.999947552503497e-05, |
|
"loss": 3.8695, |
|
"step": 640 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 35.80597686767578, |
|
"learning_rate": 4.9998819936486327e-05, |
|
"loss": 3.9277, |
|
"step": 660 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 35.00313186645508, |
|
"learning_rate": 4.99979021221458e-05, |
|
"loss": 3.881, |
|
"step": 680 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 28.8647403717041, |
|
"learning_rate": 4.999672209164081e-05, |
|
"loss": 3.8286, |
|
"step": 700 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 33.56174087524414, |
|
"learning_rate": 4.999527985734932e-05, |
|
"loss": 3.8631, |
|
"step": 720 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 63.59539794921875, |
|
"learning_rate": 4.999357543439969e-05, |
|
"loss": 3.8931, |
|
"step": 740 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 54.89167785644531, |
|
"learning_rate": 4.999160884067051e-05, |
|
"loss": 3.8953, |
|
"step": 760 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 33.9933967590332, |
|
"learning_rate": 4.998938009679042e-05, |
|
"loss": 3.9113, |
|
"step": 780 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 56.342620849609375, |
|
"learning_rate": 4.998688922613788e-05, |
|
"loss": 3.8079, |
|
"step": 800 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 35.17020797729492, |
|
"learning_rate": 4.998413625484095e-05, |
|
"loss": 3.8289, |
|
"step": 820 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 36.69993209838867, |
|
"learning_rate": 4.998112121177699e-05, |
|
"loss": 3.9726, |
|
"step": 840 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 41.2137565612793, |
|
"learning_rate": 4.997784412857239e-05, |
|
"loss": 3.8602, |
|
"step": 860 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 49.4541130065918, |
|
"learning_rate": 4.99743050396022e-05, |
|
"loss": 3.8549, |
|
"step": 880 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 40.87107849121094, |
|
"learning_rate": 4.997050398198977e-05, |
|
"loss": 3.7832, |
|
"step": 900 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 31.820924758911133, |
|
"learning_rate": 4.9966440995606415e-05, |
|
"loss": 3.8991, |
|
"step": 920 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 37.09877395629883, |
|
"learning_rate": 4.9962116123070924e-05, |
|
"loss": 3.9486, |
|
"step": 940 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 40.25444412231445, |
|
"learning_rate": 4.995752940974918e-05, |
|
"loss": 3.848, |
|
"step": 960 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 38.95152282714844, |
|
"learning_rate": 4.9952680903753627e-05, |
|
"loss": 3.723, |
|
"step": 980 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 52.44506072998047, |
|
"learning_rate": 4.9947570655942796e-05, |
|
"loss": 3.864, |
|
"step": 1000 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 59.793373107910156, |
|
"learning_rate": 4.994219871992077e-05, |
|
"loss": 3.794, |
|
"step": 1020 |
|
}, |
|
{ |
|
"epoch": 0.01, |
|
"grad_norm": 40.9141960144043, |
|
"learning_rate": 4.993656515203662e-05, |
|
"loss": 3.8384, |
|
"step": 1040 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 33.75545883178711, |
|
"learning_rate": 4.99306700113838e-05, |
|
"loss": 3.8811, |
|
"step": 1060 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 30.463613510131836, |
|
"learning_rate": 4.9924513359799554e-05, |
|
"loss": 3.7411, |
|
"step": 1080 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 36.24667739868164, |
|
"learning_rate": 4.991809526186424e-05, |
|
"loss": 3.8915, |
|
"step": 1100 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 35.77268600463867, |
|
"learning_rate": 4.991141578490066e-05, |
|
"loss": 3.7547, |
|
"step": 1120 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 43.09757995605469, |
|
"learning_rate": 4.990447499897339e-05, |
|
"loss": 3.8161, |
|
"step": 1140 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 67.45648956298828, |
|
"learning_rate": 4.989727297688797e-05, |
|
"loss": 3.9635, |
|
"step": 1160 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 31.597640991210938, |
|
"learning_rate": 4.98898097941902e-05, |
|
"loss": 3.8912, |
|
"step": 1180 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 41.68192672729492, |
|
"learning_rate": 4.988208552916535e-05, |
|
"loss": 3.8112, |
|
"step": 1200 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 36.489810943603516, |
|
"learning_rate": 4.9874100262837296e-05, |
|
"loss": 3.7838, |
|
"step": 1220 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 31.755823135375977, |
|
"learning_rate": 4.986585407896772e-05, |
|
"loss": 3.8385, |
|
"step": 1240 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 84.64984130859375, |
|
"learning_rate": 4.985734706405516e-05, |
|
"loss": 3.8727, |
|
"step": 1260 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 32.23849868774414, |
|
"learning_rate": 4.98485793073342e-05, |
|
"loss": 3.8013, |
|
"step": 1280 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 25.90882110595703, |
|
"learning_rate": 4.983955090077444e-05, |
|
"loss": 3.7387, |
|
"step": 1300 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 43.255313873291016, |
|
"learning_rate": 4.9830261939079614e-05, |
|
"loss": 3.8756, |
|
"step": 1320 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 35.833404541015625, |
|
"learning_rate": 4.982071251968652e-05, |
|
"loss": 3.7124, |
|
"step": 1340 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 29.098703384399414, |
|
"learning_rate": 4.981090274276406e-05, |
|
"loss": 3.8525, |
|
"step": 1360 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 35.16478729248047, |
|
"learning_rate": 4.980083271121214e-05, |
|
"loss": 3.8262, |
|
"step": 1380 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 32.62320327758789, |
|
"learning_rate": 4.9790502530660635e-05, |
|
"loss": 3.8903, |
|
"step": 1400 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 48.55181884765625, |
|
"learning_rate": 4.977991230946824e-05, |
|
"loss": 3.7363, |
|
"step": 1420 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 46.640403747558594, |
|
"learning_rate": 4.976906215872138e-05, |
|
"loss": 3.9682, |
|
"step": 1440 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 32.13254928588867, |
|
"learning_rate": 4.9757952192232985e-05, |
|
"loss": 3.6851, |
|
"step": 1460 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 34.074649810791016, |
|
"learning_rate": 4.9746582526541355e-05, |
|
"loss": 3.7781, |
|
"step": 1480 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 37.383548736572266, |
|
"learning_rate": 4.9734953280908904e-05, |
|
"loss": 3.7182, |
|
"step": 1500 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 45.83818435668945, |
|
"learning_rate": 4.972306457732091e-05, |
|
"loss": 3.7685, |
|
"step": 1520 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 35.88654327392578, |
|
"learning_rate": 4.9710916540484265e-05, |
|
"loss": 3.7627, |
|
"step": 1540 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 29.5416202545166, |
|
"learning_rate": 4.96985092978261e-05, |
|
"loss": 3.8022, |
|
"step": 1560 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 31.974184036254883, |
|
"learning_rate": 4.968584297949255e-05, |
|
"loss": 3.792, |
|
"step": 1580 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 32.32705307006836, |
|
"learning_rate": 4.967291771834727e-05, |
|
"loss": 3.7238, |
|
"step": 1600 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 29.011735916137695, |
|
"learning_rate": 4.9659733649970155e-05, |
|
"loss": 3.7215, |
|
"step": 1620 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 33.73636245727539, |
|
"learning_rate": 4.9646290912655834e-05, |
|
"loss": 3.8132, |
|
"step": 1640 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 38.57840347290039, |
|
"learning_rate": 4.9632589647412265e-05, |
|
"loss": 3.8606, |
|
"step": 1660 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 33.149078369140625, |
|
"learning_rate": 4.9618629997959235e-05, |
|
"loss": 3.7518, |
|
"step": 1680 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 58.5382194519043, |
|
"learning_rate": 4.960441211072686e-05, |
|
"loss": 3.7482, |
|
"step": 1700 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 31.86609649658203, |
|
"learning_rate": 4.958993613485405e-05, |
|
"loss": 3.7683, |
|
"step": 1720 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 28.98000144958496, |
|
"learning_rate": 4.9575202222186945e-05, |
|
"loss": 3.8361, |
|
"step": 1740 |
|
}, |
|
{ |
|
"epoch": 0.02, |
|
"grad_norm": 37.06975555419922, |
|
"learning_rate": 4.956021052727731e-05, |
|
"loss": 3.7297, |
|
"step": 1760 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 44.01863479614258, |
|
"learning_rate": 4.954496120738094e-05, |
|
"loss": 3.8244, |
|
"step": 1780 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 31.08086585998535, |
|
"learning_rate": 4.9529454422455976e-05, |
|
"loss": 3.8144, |
|
"step": 1800 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 36.80121994018555, |
|
"learning_rate": 4.951369033516127e-05, |
|
"loss": 3.7668, |
|
"step": 1820 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 24.225065231323242, |
|
"learning_rate": 4.949766911085461e-05, |
|
"loss": 3.7929, |
|
"step": 1840 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 33.50989532470703, |
|
"learning_rate": 4.948139091759108e-05, |
|
"loss": 3.7897, |
|
"step": 1860 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 26.35730743408203, |
|
"learning_rate": 4.9464855926121225e-05, |
|
"loss": 3.8618, |
|
"step": 1880 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 36.487464904785156, |
|
"learning_rate": 4.944806430988927e-05, |
|
"loss": 3.7205, |
|
"step": 1900 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 35.87200164794922, |
|
"learning_rate": 4.943101624503132e-05, |
|
"loss": 3.8324, |
|
"step": 1920 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 26.013994216918945, |
|
"learning_rate": 4.941371191037354e-05, |
|
"loss": 3.6997, |
|
"step": 1940 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 42.59685134887695, |
|
"learning_rate": 4.939615148743017e-05, |
|
"loss": 3.7085, |
|
"step": 1960 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 65.71659851074219, |
|
"learning_rate": 4.9378335160401766e-05, |
|
"loss": 3.8939, |
|
"step": 1980 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 25.612024307250977, |
|
"learning_rate": 4.936026311617316e-05, |
|
"loss": 3.7231, |
|
"step": 2000 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 28.377412796020508, |
|
"learning_rate": 4.9341935544311536e-05, |
|
"loss": 3.7476, |
|
"step": 2020 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 29.760807037353516, |
|
"learning_rate": 4.9323352637064455e-05, |
|
"loss": 3.8374, |
|
"step": 2040 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 35.875770568847656, |
|
"learning_rate": 4.9304514589357834e-05, |
|
"loss": 3.7073, |
|
"step": 2060 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 26.299306869506836, |
|
"learning_rate": 4.928542159879386e-05, |
|
"loss": 3.736, |
|
"step": 2080 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 40.1691780090332, |
|
"learning_rate": 4.926607386564898e-05, |
|
"loss": 3.7416, |
|
"step": 2100 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 35.2581901550293, |
|
"learning_rate": 4.924647159287176e-05, |
|
"loss": 3.7917, |
|
"step": 2120 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 24.038591384887695, |
|
"learning_rate": 4.9226614986080763e-05, |
|
"loss": 3.7164, |
|
"step": 2140 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 41.96257019042969, |
|
"learning_rate": 4.92065042535624e-05, |
|
"loss": 3.8562, |
|
"step": 2160 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 37.07769775390625, |
|
"learning_rate": 4.918613960626873e-05, |
|
"loss": 3.845, |
|
"step": 2180 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 35.35500717163086, |
|
"learning_rate": 4.916552125781528e-05, |
|
"loss": 3.679, |
|
"step": 2200 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 28.356767654418945, |
|
"learning_rate": 4.914464942447876e-05, |
|
"loss": 3.6217, |
|
"step": 2220 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 32.50172805786133, |
|
"learning_rate": 4.912352432519484e-05, |
|
"loss": 3.8185, |
|
"step": 2240 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 36.33710861206055, |
|
"learning_rate": 4.910214618155579e-05, |
|
"loss": 3.7401, |
|
"step": 2260 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 42.05067443847656, |
|
"learning_rate": 4.908051521780824e-05, |
|
"loss": 3.6782, |
|
"step": 2280 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 37.84385299682617, |
|
"learning_rate": 4.9058631660850765e-05, |
|
"loss": 3.7863, |
|
"step": 2300 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 28.022615432739258, |
|
"learning_rate": 4.90364957402315e-05, |
|
"loss": 3.7804, |
|
"step": 2320 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 38.274173736572266, |
|
"learning_rate": 4.9014107688145804e-05, |
|
"loss": 3.6898, |
|
"step": 2340 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 29.532123565673828, |
|
"learning_rate": 4.899146773943374e-05, |
|
"loss": 3.7521, |
|
"step": 2360 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 48.601417541503906, |
|
"learning_rate": 4.896857613157765e-05, |
|
"loss": 3.646, |
|
"step": 2380 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 31.142457962036133, |
|
"learning_rate": 4.894543310469968e-05, |
|
"loss": 3.7694, |
|
"step": 2400 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 39.75430679321289, |
|
"learning_rate": 4.8922038901559224e-05, |
|
"loss": 3.7673, |
|
"step": 2420 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 46.01137924194336, |
|
"learning_rate": 4.8898393767550405e-05, |
|
"loss": 3.7022, |
|
"step": 2440 |
|
}, |
|
{ |
|
"epoch": 0.03, |
|
"grad_norm": 26.171249389648438, |
|
"learning_rate": 4.887449795069948e-05, |
|
"loss": 3.7917, |
|
"step": 2460 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.24589538574219, |
|
"learning_rate": 4.885035170166228e-05, |
|
"loss": 3.7352, |
|
"step": 2480 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 31.69544219970703, |
|
"learning_rate": 4.882595527372152e-05, |
|
"loss": 3.694, |
|
"step": 2500 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 35.99808883666992, |
|
"learning_rate": 4.880130892278419e-05, |
|
"loss": 3.7636, |
|
"step": 2520 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 31.871978759765625, |
|
"learning_rate": 4.877641290737884e-05, |
|
"loss": 3.7472, |
|
"step": 2540 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 35.04158401489258, |
|
"learning_rate": 4.87512674886529e-05, |
|
"loss": 3.7445, |
|
"step": 2560 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.71685791015625, |
|
"learning_rate": 4.872587293036991e-05, |
|
"loss": 3.7141, |
|
"step": 2580 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 26.907012939453125, |
|
"learning_rate": 4.870022949890676e-05, |
|
"loss": 3.748, |
|
"step": 2600 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 26.9509334564209, |
|
"learning_rate": 4.867433746325093e-05, |
|
"loss": 3.7635, |
|
"step": 2620 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 26.85176658630371, |
|
"learning_rate": 4.8648197094997616e-05, |
|
"loss": 3.824, |
|
"step": 2640 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 22.88348960876465, |
|
"learning_rate": 4.8621808668346906e-05, |
|
"loss": 3.7504, |
|
"step": 2660 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 27.76841163635254, |
|
"learning_rate": 4.859517246010091e-05, |
|
"loss": 3.8228, |
|
"step": 2680 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 41.46321487426758, |
|
"learning_rate": 4.856828874966086e-05, |
|
"loss": 3.6509, |
|
"step": 2700 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 28.96099090576172, |
|
"learning_rate": 4.854115781902414e-05, |
|
"loss": 3.7377, |
|
"step": 2720 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 38.632015228271484, |
|
"learning_rate": 4.851377995278138e-05, |
|
"loss": 3.8471, |
|
"step": 2740 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 32.76665496826172, |
|
"learning_rate": 4.8486155438113454e-05, |
|
"loss": 3.731, |
|
"step": 2760 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 30.798906326293945, |
|
"learning_rate": 4.845828456478842e-05, |
|
"loss": 3.6953, |
|
"step": 2780 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 35.173606872558594, |
|
"learning_rate": 4.8430167625158595e-05, |
|
"loss": 3.6521, |
|
"step": 2800 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 50.02262496948242, |
|
"learning_rate": 4.840180491415733e-05, |
|
"loss": 3.6999, |
|
"step": 2820 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 33.76813507080078, |
|
"learning_rate": 4.837319672929607e-05, |
|
"loss": 3.7118, |
|
"step": 2840 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 24.56015396118164, |
|
"learning_rate": 4.834434337066112e-05, |
|
"loss": 3.7094, |
|
"step": 2860 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 39.17055892944336, |
|
"learning_rate": 4.8315245140910556e-05, |
|
"loss": 3.799, |
|
"step": 2880 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 29.631614685058594, |
|
"learning_rate": 4.828590234527106e-05, |
|
"loss": 3.7785, |
|
"step": 2900 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 46.83203125, |
|
"learning_rate": 4.825631529153466e-05, |
|
"loss": 3.6311, |
|
"step": 2920 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 34.5321044921875, |
|
"learning_rate": 4.822648429005554e-05, |
|
"loss": 3.7288, |
|
"step": 2940 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 19.74892234802246, |
|
"learning_rate": 4.819640965374681e-05, |
|
"loss": 3.6749, |
|
"step": 2960 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 51.736480712890625, |
|
"learning_rate": 4.8166091698077164e-05, |
|
"loss": 3.8733, |
|
"step": 2980 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 24.50010871887207, |
|
"learning_rate": 4.813553074106761e-05, |
|
"loss": 3.7634, |
|
"step": 3000 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 29.08304214477539, |
|
"learning_rate": 4.810472710328812e-05, |
|
"loss": 3.7277, |
|
"step": 3020 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 55.230377197265625, |
|
"learning_rate": 4.80736811078543e-05, |
|
"loss": 3.7238, |
|
"step": 3040 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 19.770660400390625, |
|
"learning_rate": 4.804239308042392e-05, |
|
"loss": 3.7202, |
|
"step": 3060 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 28.955581665039062, |
|
"learning_rate": 4.8010863349193605e-05, |
|
"loss": 3.7079, |
|
"step": 3080 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 32.6827278137207, |
|
"learning_rate": 4.7979092244895305e-05, |
|
"loss": 3.7488, |
|
"step": 3100 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 28.665210723876953, |
|
"learning_rate": 4.794708010079289e-05, |
|
"loss": 3.6798, |
|
"step": 3120 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 31.36636734008789, |
|
"learning_rate": 4.791482725267857e-05, |
|
"loss": 3.7233, |
|
"step": 3140 |
|
}, |
|
{ |
|
"epoch": 0.04, |
|
"grad_norm": 28.98109245300293, |
|
"learning_rate": 4.7882334038869495e-05, |
|
"loss": 3.8137, |
|
"step": 3160 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.13091278076172, |
|
"learning_rate": 4.784960080020408e-05, |
|
"loss": 3.756, |
|
"step": 3180 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 43.819313049316406, |
|
"learning_rate": 4.781662788003851e-05, |
|
"loss": 3.7371, |
|
"step": 3200 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.864599227905273, |
|
"learning_rate": 4.7783415624243124e-05, |
|
"loss": 3.604, |
|
"step": 3220 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 38.96342468261719, |
|
"learning_rate": 4.7749964381198765e-05, |
|
"loss": 3.7482, |
|
"step": 3240 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 28.412094116210938, |
|
"learning_rate": 4.7716274501793144e-05, |
|
"loss": 3.6766, |
|
"step": 3260 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.93290328979492, |
|
"learning_rate": 4.768234633941716e-05, |
|
"loss": 3.6659, |
|
"step": 3280 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 34.64625930786133, |
|
"learning_rate": 4.764818024996117e-05, |
|
"loss": 3.6739, |
|
"step": 3300 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.466495513916016, |
|
"learning_rate": 4.76137765918113e-05, |
|
"loss": 3.7524, |
|
"step": 3320 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 33.156776428222656, |
|
"learning_rate": 4.7579135725845635e-05, |
|
"loss": 3.7571, |
|
"step": 3340 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 48.48731994628906, |
|
"learning_rate": 4.7544258015430463e-05, |
|
"loss": 3.6783, |
|
"step": 3360 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.641870498657227, |
|
"learning_rate": 4.750914382641648e-05, |
|
"loss": 3.7549, |
|
"step": 3380 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 31.7097110748291, |
|
"learning_rate": 4.747379352713489e-05, |
|
"loss": 3.6388, |
|
"step": 3400 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 45.476951599121094, |
|
"learning_rate": 4.7438207488393616e-05, |
|
"loss": 3.7421, |
|
"step": 3420 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 42.40350341796875, |
|
"learning_rate": 4.740238608347336e-05, |
|
"loss": 3.771, |
|
"step": 3440 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 26.54286003112793, |
|
"learning_rate": 4.736632968812373e-05, |
|
"loss": 3.6409, |
|
"step": 3460 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 33.44880676269531, |
|
"learning_rate": 4.733003868055923e-05, |
|
"loss": 3.6977, |
|
"step": 3480 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.746978759765625, |
|
"learning_rate": 4.7293513441455364e-05, |
|
"loss": 3.6403, |
|
"step": 3500 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.616453170776367, |
|
"learning_rate": 4.72567543539446e-05, |
|
"loss": 3.7039, |
|
"step": 3520 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 28.486270904541016, |
|
"learning_rate": 4.721976180361238e-05, |
|
"loss": 3.6331, |
|
"step": 3540 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 31.4039363861084, |
|
"learning_rate": 4.718253617849306e-05, |
|
"loss": 3.6498, |
|
"step": 3560 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 22.35509490966797, |
|
"learning_rate": 4.714507786906581e-05, |
|
"loss": 3.709, |
|
"step": 3580 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.957500457763672, |
|
"learning_rate": 4.710738726825059e-05, |
|
"loss": 3.7159, |
|
"step": 3600 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 27.019580841064453, |
|
"learning_rate": 4.706946477140396e-05, |
|
"loss": 3.6971, |
|
"step": 3620 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 32.743896484375, |
|
"learning_rate": 4.703131077631497e-05, |
|
"loss": 3.5543, |
|
"step": 3640 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.018753051757812, |
|
"learning_rate": 4.699292568320097e-05, |
|
"loss": 3.6811, |
|
"step": 3660 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 27.54176139831543, |
|
"learning_rate": 4.695430989470343e-05, |
|
"loss": 3.6593, |
|
"step": 3680 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 30.283519744873047, |
|
"learning_rate": 4.69154638158837e-05, |
|
"loss": 3.551, |
|
"step": 3700 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 26.505075454711914, |
|
"learning_rate": 4.687638785421875e-05, |
|
"loss": 3.7794, |
|
"step": 3720 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 26.94403839111328, |
|
"learning_rate": 4.683708241959694e-05, |
|
"loss": 3.6415, |
|
"step": 3740 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 31.006845474243164, |
|
"learning_rate": 4.679754792431368e-05, |
|
"loss": 3.6741, |
|
"step": 3760 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 60.343318939208984, |
|
"learning_rate": 4.675778478306712e-05, |
|
"loss": 3.6502, |
|
"step": 3780 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 52.47261047363281, |
|
"learning_rate": 4.671779341295378e-05, |
|
"loss": 3.6878, |
|
"step": 3800 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 34.15403747558594, |
|
"learning_rate": 4.6677574233464226e-05, |
|
"loss": 3.7464, |
|
"step": 3820 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 21.71308135986328, |
|
"learning_rate": 4.663712766647862e-05, |
|
"loss": 3.6239, |
|
"step": 3840 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 25.242189407348633, |
|
"learning_rate": 4.65964541362623e-05, |
|
"loss": 3.8114, |
|
"step": 3860 |
|
}, |
|
{ |
|
"epoch": 0.05, |
|
"grad_norm": 35.03647232055664, |
|
"learning_rate": 4.655555406946135e-05, |
|
"loss": 3.654, |
|
"step": 3880 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 54.89191818237305, |
|
"learning_rate": 4.6514427895098134e-05, |
|
"loss": 3.6936, |
|
"step": 3900 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 24.903459548950195, |
|
"learning_rate": 4.647307604456674e-05, |
|
"loss": 3.8267, |
|
"step": 3920 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 33.852054595947266, |
|
"learning_rate": 4.643149895162854e-05, |
|
"loss": 3.661, |
|
"step": 3940 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.687713623046875, |
|
"learning_rate": 4.6389697052407534e-05, |
|
"loss": 3.67, |
|
"step": 3960 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.4704647064209, |
|
"learning_rate": 4.6347670785385884e-05, |
|
"loss": 3.7182, |
|
"step": 3980 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 24.089828491210938, |
|
"learning_rate": 4.630542059139924e-05, |
|
"loss": 3.5781, |
|
"step": 4000 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 34.60494613647461, |
|
"learning_rate": 4.626294691363213e-05, |
|
"loss": 3.7001, |
|
"step": 4020 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 53.43947219848633, |
|
"learning_rate": 4.622025019761336e-05, |
|
"loss": 3.6048, |
|
"step": 4040 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.322486877441406, |
|
"learning_rate": 4.617733089121127e-05, |
|
"loss": 3.6201, |
|
"step": 4060 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 47.170005798339844, |
|
"learning_rate": 4.613418944462907e-05, |
|
"loss": 3.7443, |
|
"step": 4080 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.616161346435547, |
|
"learning_rate": 4.6090826310400116e-05, |
|
"loss": 3.7685, |
|
"step": 4100 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 24.628185272216797, |
|
"learning_rate": 4.6047241943383176e-05, |
|
"loss": 3.6677, |
|
"step": 4120 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 38.79618453979492, |
|
"learning_rate": 4.600343680075764e-05, |
|
"loss": 3.744, |
|
"step": 4140 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 37.38518524169922, |
|
"learning_rate": 4.595941134201871e-05, |
|
"loss": 3.7101, |
|
"step": 4160 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.248828887939453, |
|
"learning_rate": 4.5915166028972624e-05, |
|
"loss": 3.7209, |
|
"step": 4180 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 45.65785217285156, |
|
"learning_rate": 4.587070132573178e-05, |
|
"loss": 3.7605, |
|
"step": 4200 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 24.220314025878906, |
|
"learning_rate": 4.582601769870988e-05, |
|
"loss": 3.6609, |
|
"step": 4220 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 27.00070571899414, |
|
"learning_rate": 4.578111561661702e-05, |
|
"loss": 3.6754, |
|
"step": 4240 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 75.85283660888672, |
|
"learning_rate": 4.573599555045479e-05, |
|
"loss": 3.6605, |
|
"step": 4260 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.803096771240234, |
|
"learning_rate": 4.569065797351135e-05, |
|
"loss": 3.6287, |
|
"step": 4280 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 26.4781494140625, |
|
"learning_rate": 4.5645103361356415e-05, |
|
"loss": 3.6301, |
|
"step": 4300 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 47.245357513427734, |
|
"learning_rate": 4.5599332191836316e-05, |
|
"loss": 3.6776, |
|
"step": 4320 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 26.005104064941406, |
|
"learning_rate": 4.555334494506896e-05, |
|
"loss": 3.6756, |
|
"step": 4340 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 35.15077590942383, |
|
"learning_rate": 4.5507142103438794e-05, |
|
"loss": 3.7022, |
|
"step": 4360 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 29.038782119750977, |
|
"learning_rate": 4.546072415159179e-05, |
|
"loss": 3.6325, |
|
"step": 4380 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.944393157958984, |
|
"learning_rate": 4.541409157643027e-05, |
|
"loss": 3.6343, |
|
"step": 4400 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.153432846069336, |
|
"learning_rate": 4.536724486710791e-05, |
|
"loss": 3.7739, |
|
"step": 4420 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 40.95075225830078, |
|
"learning_rate": 4.53201845150245e-05, |
|
"loss": 3.6558, |
|
"step": 4440 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.37499237060547, |
|
"learning_rate": 4.5272911013820876e-05, |
|
"loss": 3.6093, |
|
"step": 4460 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 23.894237518310547, |
|
"learning_rate": 4.522542485937369e-05, |
|
"loss": 3.6415, |
|
"step": 4480 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 69.29508209228516, |
|
"learning_rate": 4.517772654979023e-05, |
|
"loss": 3.696, |
|
"step": 4500 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 31.464527130126953, |
|
"learning_rate": 4.5129816585403206e-05, |
|
"loss": 3.7147, |
|
"step": 4520 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 30.76380729675293, |
|
"learning_rate": 4.508169546876547e-05, |
|
"loss": 3.6428, |
|
"step": 4540 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 27.94367027282715, |
|
"learning_rate": 4.503336370464476e-05, |
|
"loss": 3.7018, |
|
"step": 4560 |
|
}, |
|
{ |
|
"epoch": 0.06, |
|
"grad_norm": 22.166793823242188, |
|
"learning_rate": 4.49848218000184e-05, |
|
"loss": 3.7018, |
|
"step": 4580 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.058921813964844, |
|
"learning_rate": 4.493607026406802e-05, |
|
"loss": 3.7035, |
|
"step": 4600 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.55988311767578, |
|
"learning_rate": 4.488710960817416e-05, |
|
"loss": 3.7725, |
|
"step": 4620 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 23.51280403137207, |
|
"learning_rate": 4.4837940345910925e-05, |
|
"loss": 3.7238, |
|
"step": 4640 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 37.3757209777832, |
|
"learning_rate": 4.4788562993040614e-05, |
|
"loss": 3.701, |
|
"step": 4660 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 38.56554412841797, |
|
"learning_rate": 4.473897806750829e-05, |
|
"loss": 3.7174, |
|
"step": 4680 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.553325653076172, |
|
"learning_rate": 4.4689186089436366e-05, |
|
"loss": 3.627, |
|
"step": 4700 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.66290283203125, |
|
"learning_rate": 4.463918758111912e-05, |
|
"loss": 3.6307, |
|
"step": 4720 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.957775115966797, |
|
"learning_rate": 4.4588983067017257e-05, |
|
"loss": 3.6157, |
|
"step": 4740 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.32748794555664, |
|
"learning_rate": 4.4538573073752365e-05, |
|
"loss": 3.5961, |
|
"step": 4760 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.597824096679688, |
|
"learning_rate": 4.448795813010142e-05, |
|
"loss": 3.5881, |
|
"step": 4780 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 26.248044967651367, |
|
"learning_rate": 4.443713876699124e-05, |
|
"loss": 3.6057, |
|
"step": 4800 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 25.942325592041016, |
|
"learning_rate": 4.4386115517492874e-05, |
|
"loss": 3.6286, |
|
"step": 4820 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 42.028316497802734, |
|
"learning_rate": 4.43348889168161e-05, |
|
"loss": 3.6306, |
|
"step": 4840 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.317644119262695, |
|
"learning_rate": 4.4283459502303695e-05, |
|
"loss": 3.5992, |
|
"step": 4860 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 43.174903869628906, |
|
"learning_rate": 4.4231827813425885e-05, |
|
"loss": 3.6493, |
|
"step": 4880 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 33.58101272583008, |
|
"learning_rate": 4.417999439177466e-05, |
|
"loss": 3.6843, |
|
"step": 4900 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.096824645996094, |
|
"learning_rate": 4.412795978105807e-05, |
|
"loss": 3.6134, |
|
"step": 4920 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.04353713989258, |
|
"learning_rate": 4.4075724527094584e-05, |
|
"loss": 3.5916, |
|
"step": 4940 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 28.97658920288086, |
|
"learning_rate": 4.402328917780728e-05, |
|
"loss": 3.6362, |
|
"step": 4960 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.05881118774414, |
|
"learning_rate": 4.397065428321817e-05, |
|
"loss": 3.7566, |
|
"step": 4980 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 27.057044982910156, |
|
"learning_rate": 4.391782039544238e-05, |
|
"loss": 3.4967, |
|
"step": 5000 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 22.590089797973633, |
|
"learning_rate": 4.386478806868241e-05, |
|
"loss": 3.6759, |
|
"step": 5020 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.77460479736328, |
|
"learning_rate": 4.3811557859222254e-05, |
|
"loss": 3.6893, |
|
"step": 5040 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.440248489379883, |
|
"learning_rate": 4.375813032542164e-05, |
|
"loss": 3.7167, |
|
"step": 5060 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 42.91717529296875, |
|
"learning_rate": 4.3704506027710105e-05, |
|
"loss": 3.5893, |
|
"step": 5080 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 34.991634368896484, |
|
"learning_rate": 4.365068552858115e-05, |
|
"loss": 3.5482, |
|
"step": 5100 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 37.62036895751953, |
|
"learning_rate": 4.3596669392586365e-05, |
|
"loss": 3.5972, |
|
"step": 5120 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 29.56283950805664, |
|
"learning_rate": 4.354245818632944e-05, |
|
"loss": 3.6804, |
|
"step": 5140 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 35.37843322753906, |
|
"learning_rate": 4.348805247846027e-05, |
|
"loss": 3.6491, |
|
"step": 5160 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 39.210906982421875, |
|
"learning_rate": 4.343345283966901e-05, |
|
"loss": 3.6268, |
|
"step": 5180 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 26.60144805908203, |
|
"learning_rate": 4.337865984268001e-05, |
|
"loss": 3.6277, |
|
"step": 5200 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 32.668052673339844, |
|
"learning_rate": 4.33236740622459e-05, |
|
"loss": 3.6159, |
|
"step": 5220 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 43.837833404541016, |
|
"learning_rate": 4.326849607514148e-05, |
|
"loss": 3.5939, |
|
"step": 5240 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 20.860111236572266, |
|
"learning_rate": 4.321312646015775e-05, |
|
"loss": 3.624, |
|
"step": 5260 |
|
}, |
|
{ |
|
"epoch": 0.07, |
|
"grad_norm": 24.005277633666992, |
|
"learning_rate": 4.3157565798095753e-05, |
|
"loss": 3.6098, |
|
"step": 5280 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.65524673461914, |
|
"learning_rate": 4.3101814671760546e-05, |
|
"loss": 3.6969, |
|
"step": 5300 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 40.98033905029297, |
|
"learning_rate": 4.304587366595506e-05, |
|
"loss": 3.8225, |
|
"step": 5320 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 28.647207260131836, |
|
"learning_rate": 4.298974336747397e-05, |
|
"loss": 3.6742, |
|
"step": 5340 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 20.806941986083984, |
|
"learning_rate": 4.2933424365097564e-05, |
|
"loss": 3.5679, |
|
"step": 5360 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 22.459196090698242, |
|
"learning_rate": 4.287691724958551e-05, |
|
"loss": 3.6389, |
|
"step": 5380 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.558490753173828, |
|
"learning_rate": 4.2820222613670736e-05, |
|
"loss": 3.6654, |
|
"step": 5400 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 20.315793991088867, |
|
"learning_rate": 4.276334105205312e-05, |
|
"loss": 3.5976, |
|
"step": 5420 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.125396728515625, |
|
"learning_rate": 4.2706273161393327e-05, |
|
"loss": 3.5712, |
|
"step": 5440 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 25.103483200073242, |
|
"learning_rate": 4.2649019540306545e-05, |
|
"loss": 3.616, |
|
"step": 5460 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.65394401550293, |
|
"learning_rate": 4.2591580789356156e-05, |
|
"loss": 3.6587, |
|
"step": 5480 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.216896057128906, |
|
"learning_rate": 4.253395751104748e-05, |
|
"loss": 3.7161, |
|
"step": 5500 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 28.144855499267578, |
|
"learning_rate": 4.247615030982144e-05, |
|
"loss": 3.6847, |
|
"step": 5520 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 23.597564697265625, |
|
"learning_rate": 4.241815979204822e-05, |
|
"loss": 3.6556, |
|
"step": 5540 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 41.00291061401367, |
|
"learning_rate": 4.2359986566020906e-05, |
|
"loss": 3.7665, |
|
"step": 5560 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 37.05702209472656, |
|
"learning_rate": 4.230163124194913e-05, |
|
"loss": 3.5916, |
|
"step": 5580 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 28.161930084228516, |
|
"learning_rate": 4.224309443195261e-05, |
|
"loss": 3.6887, |
|
"step": 5600 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.685361862182617, |
|
"learning_rate": 4.2184376750054786e-05, |
|
"loss": 3.5724, |
|
"step": 5620 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 38.13533020019531, |
|
"learning_rate": 4.2125478812176364e-05, |
|
"loss": 3.664, |
|
"step": 5640 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 20.385272979736328, |
|
"learning_rate": 4.206640123612884e-05, |
|
"loss": 3.73, |
|
"step": 5660 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.926259994506836, |
|
"learning_rate": 4.200714464160804e-05, |
|
"loss": 3.6472, |
|
"step": 5680 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 19.820131301879883, |
|
"learning_rate": 4.194770965018758e-05, |
|
"loss": 3.6226, |
|
"step": 5700 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.318801879882812, |
|
"learning_rate": 4.188809688531241e-05, |
|
"loss": 3.635, |
|
"step": 5720 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 18.304567337036133, |
|
"learning_rate": 4.182830697229223e-05, |
|
"loss": 3.625, |
|
"step": 5740 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 24.25802230834961, |
|
"learning_rate": 4.176834053829492e-05, |
|
"loss": 3.5844, |
|
"step": 5760 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 53.09843444824219, |
|
"learning_rate": 4.170819821234001e-05, |
|
"loss": 3.7058, |
|
"step": 5780 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 39.87876510620117, |
|
"learning_rate": 4.164788062529203e-05, |
|
"loss": 3.725, |
|
"step": 5800 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 32.36482620239258, |
|
"learning_rate": 4.1587388409853935e-05, |
|
"loss": 3.5355, |
|
"step": 5820 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 28.59760284423828, |
|
"learning_rate": 4.1526722200560445e-05, |
|
"loss": 3.6528, |
|
"step": 5840 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 21.3729305267334, |
|
"learning_rate": 4.146588263377137e-05, |
|
"loss": 3.6428, |
|
"step": 5860 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 20.160661697387695, |
|
"learning_rate": 4.140487034766499e-05, |
|
"loss": 3.6116, |
|
"step": 5880 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 31.58021354675293, |
|
"learning_rate": 4.134368598223132e-05, |
|
"loss": 3.6302, |
|
"step": 5900 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 30.793672561645508, |
|
"learning_rate": 4.128233017926538e-05, |
|
"loss": 3.5663, |
|
"step": 5920 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 22.589147567749023, |
|
"learning_rate": 4.122080358236055e-05, |
|
"loss": 3.6292, |
|
"step": 5940 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 32.27565383911133, |
|
"learning_rate": 4.1159106836901674e-05, |
|
"loss": 3.5806, |
|
"step": 5960 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 37.15829849243164, |
|
"learning_rate": 4.109724059005844e-05, |
|
"loss": 3.5662, |
|
"step": 5980 |
|
}, |
|
{ |
|
"epoch": 0.08, |
|
"grad_norm": 38.23238754272461, |
|
"learning_rate": 4.10352054907785e-05, |
|
"loss": 3.6842, |
|
"step": 6000 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.37531089782715, |
|
"learning_rate": 4.0973002189780694e-05, |
|
"loss": 3.6153, |
|
"step": 6020 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.309982299804688, |
|
"learning_rate": 4.0910631339548206e-05, |
|
"loss": 3.6502, |
|
"step": 6040 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.007654190063477, |
|
"learning_rate": 4.084809359432175e-05, |
|
"loss": 3.7203, |
|
"step": 6060 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.977094650268555, |
|
"learning_rate": 4.0785389610092686e-05, |
|
"loss": 3.5413, |
|
"step": 6080 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.397930145263672, |
|
"learning_rate": 4.072252004459611e-05, |
|
"loss": 3.5612, |
|
"step": 6100 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 26.012800216674805, |
|
"learning_rate": 4.065948555730405e-05, |
|
"loss": 3.6385, |
|
"step": 6120 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 29.745574951171875, |
|
"learning_rate": 4.0596286809418435e-05, |
|
"loss": 3.6646, |
|
"step": 6140 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 30.76190185546875, |
|
"learning_rate": 4.053292446386422e-05, |
|
"loss": 3.6622, |
|
"step": 6160 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 27.577564239501953, |
|
"learning_rate": 4.046939918528243e-05, |
|
"loss": 3.701, |
|
"step": 6180 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.610410690307617, |
|
"learning_rate": 4.0405711640023186e-05, |
|
"loss": 3.5977, |
|
"step": 6200 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 28.61423110961914, |
|
"learning_rate": 4.034186249613869e-05, |
|
"loss": 3.7307, |
|
"step": 6220 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 44.62327575683594, |
|
"learning_rate": 4.027785242337626e-05, |
|
"loss": 3.7055, |
|
"step": 6240 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 32.20371627807617, |
|
"learning_rate": 4.0213682093171254e-05, |
|
"loss": 3.6186, |
|
"step": 6260 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 32.36015701293945, |
|
"learning_rate": 4.014935217864009e-05, |
|
"loss": 3.5798, |
|
"step": 6280 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 36.1356201171875, |
|
"learning_rate": 4.008486335457312e-05, |
|
"loss": 3.6395, |
|
"step": 6300 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 20.485820770263672, |
|
"learning_rate": 4.0020216297427594e-05, |
|
"loss": 3.6075, |
|
"step": 6320 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 21.503564834594727, |
|
"learning_rate": 3.995541168532055e-05, |
|
"loss": 3.6099, |
|
"step": 6340 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 29.125812530517578, |
|
"learning_rate": 3.9890450198021704e-05, |
|
"loss": 3.6665, |
|
"step": 6360 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 24.479976654052734, |
|
"learning_rate": 3.982533251694632e-05, |
|
"loss": 3.7168, |
|
"step": 6380 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 36.184410095214844, |
|
"learning_rate": 3.976005932514807e-05, |
|
"loss": 3.5771, |
|
"step": 6400 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 28.156030654907227, |
|
"learning_rate": 3.969463130731183e-05, |
|
"loss": 3.6353, |
|
"step": 6420 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.22379493713379, |
|
"learning_rate": 3.962904914974656e-05, |
|
"loss": 3.5015, |
|
"step": 6440 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 31.427339553833008, |
|
"learning_rate": 3.9563313540378055e-05, |
|
"loss": 3.5712, |
|
"step": 6460 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 19.2696590423584, |
|
"learning_rate": 3.949742516874175e-05, |
|
"loss": 3.5929, |
|
"step": 6480 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 23.234111785888672, |
|
"learning_rate": 3.943138472597549e-05, |
|
"loss": 3.6166, |
|
"step": 6500 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 26.726085662841797, |
|
"learning_rate": 3.936519290481226e-05, |
|
"loss": 3.6748, |
|
"step": 6520 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 34.712257385253906, |
|
"learning_rate": 3.929885039957296e-05, |
|
"loss": 3.64, |
|
"step": 6540 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.96158218383789, |
|
"learning_rate": 3.923235790615907e-05, |
|
"loss": 3.6119, |
|
"step": 6560 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 34.04408264160156, |
|
"learning_rate": 3.916571612204537e-05, |
|
"loss": 3.6881, |
|
"step": 6580 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 20.656030654907227, |
|
"learning_rate": 3.909892574627266e-05, |
|
"loss": 3.6462, |
|
"step": 6600 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 23.8648738861084, |
|
"learning_rate": 3.9031987479440367e-05, |
|
"loss": 3.597, |
|
"step": 6620 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 34.48773193359375, |
|
"learning_rate": 3.896490202369924e-05, |
|
"loss": 3.5781, |
|
"step": 6640 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 26.09375762939453, |
|
"learning_rate": 3.8897670082743955e-05, |
|
"loss": 3.5463, |
|
"step": 6660 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 25.49962615966797, |
|
"learning_rate": 3.883029236180577e-05, |
|
"loss": 3.637, |
|
"step": 6680 |
|
}, |
|
{ |
|
"epoch": 0.09, |
|
"grad_norm": 37.70731735229492, |
|
"learning_rate": 3.876276956764509e-05, |
|
"loss": 3.6515, |
|
"step": 6700 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 53.345558166503906, |
|
"learning_rate": 3.8695102408544076e-05, |
|
"loss": 3.521, |
|
"step": 6720 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 34.147884368896484, |
|
"learning_rate": 3.862729159429921e-05, |
|
"loss": 3.6443, |
|
"step": 6740 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 29.45001220703125, |
|
"learning_rate": 3.855933783621384e-05, |
|
"loss": 3.5976, |
|
"step": 6760 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 29.933969497680664, |
|
"learning_rate": 3.849124184709073e-05, |
|
"loss": 3.6396, |
|
"step": 6780 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 38.85334014892578, |
|
"learning_rate": 3.84230043412246e-05, |
|
"loss": 3.6518, |
|
"step": 6800 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 34.85492706298828, |
|
"learning_rate": 3.835462603439458e-05, |
|
"loss": 3.6577, |
|
"step": 6820 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 29.77360725402832, |
|
"learning_rate": 3.828610764385676e-05, |
|
"loss": 3.6026, |
|
"step": 6840 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 38.89609909057617, |
|
"learning_rate": 3.821744988833663e-05, |
|
"loss": 3.6144, |
|
"step": 6860 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 25.664960861206055, |
|
"learning_rate": 3.814865348802157e-05, |
|
"loss": 3.5826, |
|
"step": 6880 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 31.955894470214844, |
|
"learning_rate": 3.807971916455325e-05, |
|
"loss": 3.6973, |
|
"step": 6900 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 23.378131866455078, |
|
"learning_rate": 3.8010647641020115e-05, |
|
"loss": 3.6875, |
|
"step": 6920 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 45.89334487915039, |
|
"learning_rate": 3.794143964194976e-05, |
|
"loss": 3.5457, |
|
"step": 6940 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 32.45075988769531, |
|
"learning_rate": 3.787209589330134e-05, |
|
"loss": 3.5719, |
|
"step": 6960 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 32.06966018676758, |
|
"learning_rate": 3.7802617122457975e-05, |
|
"loss": 3.6324, |
|
"step": 6980 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 27.93364715576172, |
|
"learning_rate": 3.773300405821908e-05, |
|
"loss": 3.6093, |
|
"step": 7000 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 23.111515045166016, |
|
"learning_rate": 3.766325743079277e-05, |
|
"loss": 3.5292, |
|
"step": 7020 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 24.405742645263672, |
|
"learning_rate": 3.759337797178816e-05, |
|
"loss": 3.5969, |
|
"step": 7040 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 37.218467712402344, |
|
"learning_rate": 3.752336641420772e-05, |
|
"loss": 3.653, |
|
"step": 7060 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 32.396522521972656, |
|
"learning_rate": 3.745322349243954e-05, |
|
"loss": 3.6483, |
|
"step": 7080 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 35.53373336791992, |
|
"learning_rate": 3.7382949942249694e-05, |
|
"loss": 3.6356, |
|
"step": 7100 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 33.19758987426758, |
|
"learning_rate": 3.731254650077446e-05, |
|
"loss": 3.6017, |
|
"step": 7120 |
|
}, |
|
{ |
|
"epoch": 0.1, |
|
"grad_norm": 36.60466003417969, |
|
"learning_rate": 3.7242013906512626e-05, |
|
"loss": 3.6246, |
|
"step": 7140 |
|
}, |
|
{
"epoch": 0.1,
"grad_norm": 21.257328033447266,
"learning_rate": 3.717135289931774e-05,
"loss": 3.6046,
"step": 7160
},
{
"epoch": 0.1,
"grad_norm": 25.697444915771484,
"learning_rate": 3.7100564220390326e-05,
"loss": 3.6154,
"step": 7180
},
{
"epoch": 0.1,
"grad_norm": 28.491622924804688,
"learning_rate": 3.702964861227013e-05,
"loss": 3.6983,
"step": 7200
},
{
"epoch": 0.1,
"grad_norm": 26.819791793823242,
"learning_rate": 3.695860681882832e-05,
"loss": 3.5722,
"step": 7220
},
{
"epoch": 0.1,
"grad_norm": 25.864788055419922,
"learning_rate": 3.6887439585259694e-05,
"loss": 3.6825,
"step": 7240
},
{
"epoch": 0.1,
"grad_norm": 22.492717742919922,
"learning_rate": 3.681614765807486e-05,
"loss": 3.6377,
"step": 7260
},
{
"epoch": 0.1,
"grad_norm": 31.227336883544922,
"learning_rate": 3.6744731785092395e-05,
"loss": 3.5476,
"step": 7280
},
{
"epoch": 0.1,
"grad_norm": 29.010467529296875,
"learning_rate": 3.6673192715431015e-05,
"loss": 3.6285,
"step": 7300
},
{
"epoch": 0.1,
"grad_norm": 38.40274429321289,
"learning_rate": 3.6601531199501714e-05,
"loss": 3.6941,
"step": 7320
},
{
"epoch": 0.1,
"grad_norm": 26.361167907714844,
"learning_rate": 3.652974798899988e-05,
"loss": 3.5772,
"step": 7340
},
{
"epoch": 0.1,
"grad_norm": 30.241390228271484,
"learning_rate": 3.645784383689742e-05,
"loss": 3.5177,
"step": 7360
},
{
"epoch": 0.1,
"grad_norm": 43.579349517822266,
"learning_rate": 3.6385819497434876e-05,
"loss": 3.7467,
"step": 7380
},
{
"epoch": 0.1,
"grad_norm": 47.42546081542969,
"learning_rate": 3.631367572611348e-05,
"loss": 3.6651,
"step": 7400
},
{
"epoch": 0.11,
"grad_norm": 29.63494110107422,
"learning_rate": 3.6241413279687254e-05,
"loss": 3.6368,
"step": 7420
},
{
"epoch": 0.11,
"grad_norm": 23.63068389892578,
"learning_rate": 3.616903291615506e-05,
"loss": 3.4684,
"step": 7440
},
{
"epoch": 0.11,
"grad_norm": 24.25609588623047,
"learning_rate": 3.6096535394752676e-05,
"loss": 3.6177,
"step": 7460
},
{
"epoch": 0.11,
"grad_norm": 23.829919815063477,
"learning_rate": 3.6023921475944794e-05,
"loss": 3.6008,
"step": 7480
},
{
"epoch": 0.11,
"grad_norm": 23.879764556884766,
"learning_rate": 3.595119192141706e-05,
"loss": 3.6926,
"step": 7500
},
{
"epoch": 0.11,
"grad_norm": 22.195941925048828,
"learning_rate": 3.5878347494068084e-05,
"loss": 3.6049,
"step": 7520
},
{
"epoch": 0.11,
"grad_norm": 48.3228645324707,
"learning_rate": 3.580538895800144e-05,
"loss": 3.64,
"step": 7540
},
{
"epoch": 0.11,
"grad_norm": 33.77362823486328,
"learning_rate": 3.5732317078517654e-05,
"loss": 3.573,
"step": 7560
},
{
"epoch": 0.11,
"grad_norm": 29.266658782958984,
"learning_rate": 3.565913262210615e-05,
"loss": 3.6385,
"step": 7580
},
{
"epoch": 0.11,
"grad_norm": 42.985694885253906,
"learning_rate": 3.5585836356437264e-05,
"loss": 3.5987,
"step": 7600
},
{
"epoch": 0.11,
"grad_norm": 28.579496383666992,
"learning_rate": 3.551242905035412e-05,
"loss": 3.6161,
"step": 7620
},
{
"epoch": 0.11,
"grad_norm": 27.196502685546875,
"learning_rate": 3.5438911473864634e-05,
"loss": 3.5763,
"step": 7640
},
{
"epoch": 0.11,
"grad_norm": 28.27582359313965,
"learning_rate": 3.5365284398133405e-05,
"loss": 3.6452,
"step": 7660
},
{
"epoch": 0.11,
"grad_norm": 27.310009002685547,
"learning_rate": 3.52915485954736e-05,
"loss": 3.6718,
"step": 7680
},
{
"epoch": 0.11,
"grad_norm": 18.603565216064453,
"learning_rate": 3.521770483933891e-05,
"loss": 3.7397,
"step": 7700
},
{
"epoch": 0.11,
"grad_norm": 26.25426483154297,
"learning_rate": 3.514375390431539e-05,
"loss": 3.6665,
"step": 7720
},
{
"epoch": 0.11,
"grad_norm": 29.20294952392578,
"learning_rate": 3.506969656611335e-05,
"loss": 3.551,
"step": 7740
},
{
"epoch": 0.11,
"grad_norm": 37.7564697265625,
"learning_rate": 3.4995533601559226e-05,
"loss": 3.58,
"step": 7760
},
{
"epoch": 0.11,
"grad_norm": 25.87001609802246,
"learning_rate": 3.4921265788587435e-05,
"loss": 3.5855,
"step": 7780
},
{
"epoch": 0.11,
"grad_norm": 26.17401123046875,
"learning_rate": 3.484689390623218e-05,
"loss": 3.5951,
"step": 7800
},
{
"epoch": 0.11,
"grad_norm": 29.20701026916504,
"learning_rate": 3.4772418734619324e-05,
"loss": 3.6267,
"step": 7820
},
{
"epoch": 0.11,
"grad_norm": 60.92488098144531,
"learning_rate": 3.4697841054958165e-05,
"loss": 3.5733,
"step": 7840
},
{
"epoch": 0.11,
"grad_norm": 23.196178436279297,
"learning_rate": 3.462316164953328e-05,
"loss": 3.6283,
"step": 7860
},
{
"epoch": 0.11,
"grad_norm": 23.13970184326172,
"learning_rate": 3.45483813016963e-05,
"loss": 3.6558,
"step": 7880
},
{
"epoch": 0.11,
"grad_norm": 36.5677375793457,
"learning_rate": 3.447350079585767e-05,
"loss": 3.8141,
"step": 7900
},
{
"epoch": 0.11,
"grad_norm": 24.820940017700195,
"learning_rate": 3.4398520917478476e-05,
"loss": 3.6439,
"step": 7920
},
{
"epoch": 0.11,
"grad_norm": 19.9990291595459,
"learning_rate": 3.4323442453062174e-05,
"loss": 3.601,
"step": 7940
},
{
"epoch": 0.11,
"grad_norm": 20.419004440307617,
"learning_rate": 3.42482661901463e-05,
"loss": 3.4856,
"step": 7960
},
{
"epoch": 0.11,
"grad_norm": 24.06426429748535,
"learning_rate": 3.417299291729431e-05,
"loss": 3.679,
"step": 7980
},
{
"epoch": 0.11,
"grad_norm": 23.68332862854004,
"learning_rate": 3.409762342408719e-05,
"loss": 3.6538,
"step": 8000
},
{
"epoch": 0.11,
"grad_norm": 22.80304527282715,
"learning_rate": 3.402215850111528e-05,
"loss": 3.6685,
"step": 8020
},
{
"epoch": 0.11,
"grad_norm": 30.03902244567871,
"learning_rate": 3.3946598939969896e-05,
"loss": 3.633,
"step": 8040
},
{
"epoch": 0.11,
"grad_norm": 31.799922943115234,
"learning_rate": 3.38709455332351e-05,
"loss": 3.5756,
"step": 8060
},
{
"epoch": 0.11,
"grad_norm": 29.18169403076172,
"learning_rate": 3.379519907447931e-05,
"loss": 3.5886,
"step": 8080
},
{
"epoch": 0.11,
"grad_norm": 34.412113189697266,
"learning_rate": 3.3719360358247054e-05,
"loss": 3.5254,
"step": 8100
},
{
"epoch": 0.11,
"grad_norm": 38.046695709228516,
"learning_rate": 3.3643430180050574e-05,
"loss": 3.6677,
"step": 8120
},
{
"epoch": 0.12,
"grad_norm": 23.16988182067871,
"learning_rate": 3.35674093363615e-05,
"loss": 3.5864,
"step": 8140
},
{
"epoch": 0.12,
"grad_norm": 59.21152114868164,
"learning_rate": 3.349129862460251e-05,
"loss": 3.4903,
"step": 8160
},
{
"epoch": 0.12,
"grad_norm": 21.080909729003906,
"learning_rate": 3.341509884313897e-05,
"loss": 3.5803,
"step": 8180
},
{
"epoch": 0.12,
"grad_norm": 26.221805572509766,
"learning_rate": 3.333881079127052e-05,
"loss": 3.5238,
"step": 8200
},
{
"epoch": 0.12,
"grad_norm": 21.8948917388916,
"learning_rate": 3.326243526922272e-05,
"loss": 3.5498,
"step": 8220
},
{
"epoch": 0.12,
"grad_norm": 29.98341178894043,
"learning_rate": 3.3185973078138664e-05,
"loss": 3.6218,
"step": 8240
},
{
"epoch": 0.12,
"grad_norm": 21.86969757080078,
"learning_rate": 3.310942502007056e-05,
"loss": 3.5104,
"step": 8260
},
{
"epoch": 0.12,
"grad_norm": 29.3415584564209,
"learning_rate": 3.303279189797131e-05,
"loss": 3.5253,
"step": 8280
},
{
"epoch": 0.12,
"grad_norm": 30.171510696411133,
"learning_rate": 3.29560745156861e-05,
"loss": 3.6886,
"step": 8300
},
{
"epoch": 0.12,
"grad_norm": 24.074813842773438,
"learning_rate": 3.287927367794397e-05,
"loss": 3.6401,
"step": 8320
},
{
"epoch": 0.12,
"grad_norm": 25.059324264526367,
"learning_rate": 3.2802390190349366e-05,
"loss": 3.5847,
"step": 8340
},
{
"epoch": 0.12,
"grad_norm": 19.766672134399414,
"learning_rate": 3.272542485937369e-05,
"loss": 3.5812,
"step": 8360
},
{
"epoch": 0.12,
"grad_norm": 25.08376693725586,
"learning_rate": 3.264837849234685e-05,
"loss": 3.55,
"step": 8380
},
{
"epoch": 0.12,
"grad_norm": 27.044347763061523,
"learning_rate": 3.2571251897448765e-05,
"loss": 3.5347,
"step": 8400
},
{
"epoch": 0.12,
"grad_norm": 23.3479061126709,
"learning_rate": 3.249404588370094e-05,
"loss": 3.5016,
"step": 8420
},
{
"epoch": 0.12,
"grad_norm": 25.586896896362305,
"learning_rate": 3.241676126095792e-05,
"loss": 3.537,
"step": 8440
},
{
"epoch": 0.12,
"grad_norm": 31.54664421081543,
"learning_rate": 3.233939883989882e-05,
"loss": 3.6093,
"step": 8460
},
{
"epoch": 0.12,
"grad_norm": 44.6853141784668,
"learning_rate": 3.226195943201883e-05,
"loss": 3.6135,
"step": 8480
},
{
"epoch": 0.12,
"grad_norm": 43.322757720947266,
"learning_rate": 3.218444384962071e-05,
"loss": 3.6048,
"step": 8500
},
{
"epoch": 0.12,
"grad_norm": 19.633960723876953,
"learning_rate": 3.210685290580622e-05,
"loss": 3.5767,
"step": 8520
},
{
"epoch": 0.12,
"grad_norm": 23.640382766723633,
"learning_rate": 3.202918741446764e-05,
"loss": 3.5961,
"step": 8540
},
{
"epoch": 0.12,
"grad_norm": 46.06730270385742,
"learning_rate": 3.1951448190279255e-05,
"loss": 3.5757,
"step": 8560
},
{
"epoch": 0.12,
"grad_norm": 24.966190338134766,
"learning_rate": 3.187363604868872e-05,
"loss": 3.5488,
"step": 8580
},
{
"epoch": 0.12,
"grad_norm": 61.91409683227539,
"learning_rate": 3.1795751805908573e-05,
"loss": 3.6554,
"step": 8600
},
{
"epoch": 0.12,
"grad_norm": 80.80062103271484,
"learning_rate": 3.171779627890769e-05,
"loss": 3.6226,
"step": 8620
},
{
"epoch": 0.12,
"grad_norm": 20.951128005981445,
"learning_rate": 3.163977028540263e-05,
"loss": 3.6122,
"step": 8640
},
{
"epoch": 0.12,
"grad_norm": 19.875343322753906,
"learning_rate": 3.156167464384917e-05,
"loss": 3.5637,
"step": 8660
},
{
"epoch": 0.12,
"grad_norm": 21.2697811126709,
"learning_rate": 3.1483510173433626e-05,
"loss": 3.537,
"step": 8680
},
{
"epoch": 0.12,
"grad_norm": 22.24051856994629,
"learning_rate": 3.1405277694064305e-05,
"loss": 3.5661,
"step": 8700
},
{
"epoch": 0.12,
"grad_norm": 21.55095863342285,
"learning_rate": 3.1326978026362904e-05,
"loss": 3.5573,
"step": 8720
},
{
"epoch": 0.12,
"grad_norm": 32.11522674560547,
"learning_rate": 3.124861199165588e-05,
"loss": 3.5995,
"step": 8740
},
{
"epoch": 0.12,
"grad_norm": 22.775867462158203,
"learning_rate": 3.117018041196585e-05,
"loss": 3.6436,
"step": 8760
},
{
"epoch": 0.12,
"grad_norm": 23.462509155273438,
"learning_rate": 3.109168411000299e-05,
"loss": 3.601,
"step": 8780
},
{
"epoch": 0.12,
"grad_norm": 23.43865203857422,
"learning_rate": 3.101312390915634e-05,
"loss": 3.6081,
"step": 8800
},
{
"epoch": 0.12,
"grad_norm": 27.17888832092285,
"learning_rate": 3.0934500633485255e-05,
"loss": 3.6257,
"step": 8820
},
{
"epoch": 0.13,
"grad_norm": 31.697662353515625,
"learning_rate": 3.0855815107710666e-05,
"loss": 3.5902,
"step": 8840
},
{
"epoch": 0.13,
"grad_norm": 37.07548904418945,
"learning_rate": 3.0777068157206536e-05,
"loss": 3.6514,
"step": 8860
},
{
"epoch": 0.13,
"grad_norm": 20.554109573364258,
"learning_rate": 3.069826060799109e-05,
"loss": 3.5068,
"step": 8880
},
{
"epoch": 0.13,
"grad_norm": 22.490015029907227,
"learning_rate": 3.061939328671824e-05,
"loss": 3.6488,
"step": 8900
},
{
"epoch": 0.13,
"grad_norm": 25.30253791809082,
"learning_rate": 3.0540467020668864e-05,
"loss": 3.5931,
"step": 8920
},
{
"epoch": 0.13,
"grad_norm": 22.97053337097168,
"learning_rate": 3.0461482637742135e-05,
"loss": 3.5475,
"step": 8940
},
{
"epoch": 0.13,
"grad_norm": 22.68851661682129,
"learning_rate": 3.0382440966446875e-05,
"loss": 3.619,
"step": 8960
},
{
"epoch": 0.13,
"grad_norm": 28.575305938720703,
"learning_rate": 3.03033428358928e-05,
"loss": 3.5188,
"step": 8980
},
{
"epoch": 0.13,
"grad_norm": 21.909072875976562,
"learning_rate": 3.0224189075781884e-05,
"loss": 3.5988,
"step": 9000
},
{
"epoch": 0.13,
"grad_norm": 36.04661560058594,
"learning_rate": 3.014498051639959e-05,
"loss": 3.569,
"step": 9020
},
{
"epoch": 0.13,
"grad_norm": 157.5665740966797,
"learning_rate": 3.0065717988606257e-05,
"loss": 3.6474,
"step": 9040
},
{
"epoch": 0.13,
"grad_norm": 37.57924270629883,
"learning_rate": 2.9986402323828272e-05,
"loss": 3.5874,
"step": 9060
},
{
"epoch": 0.13,
"grad_norm": 26.661418914794922,
"learning_rate": 2.990703435404944e-05,
"loss": 3.5982,
"step": 9080
},
{
"epoch": 0.13,
"grad_norm": 38.95368957519531,
"learning_rate": 2.9827614911802203e-05,
"loss": 3.4998,
"step": 9100
},
{
"epoch": 0.13,
"grad_norm": 34.97966003417969,
"learning_rate": 2.9748144830158924e-05,
"loss": 3.5486,
"step": 9120
},
{
"epoch": 0.13,
"grad_norm": 27.682832717895508,
"learning_rate": 2.9668624942723162e-05,
"loss": 3.6144,
"step": 9140
},
{
"epoch": 0.13,
"grad_norm": 29.238054275512695,
"learning_rate": 2.9589056083620902e-05,
"loss": 3.6442,
"step": 9160
},
{
"epoch": 0.13,
"grad_norm": 31.821439743041992,
"learning_rate": 2.9509439087491835e-05,
"loss": 3.6221,
"step": 9180
},
{
"epoch": 0.13,
"grad_norm": 21.238704681396484,
"learning_rate": 2.9429774789480575e-05,
"loss": 3.6278,
"step": 9200
},
{
"epoch": 0.13,
"grad_norm": 28.50370216369629,
"learning_rate": 2.9350064025227897e-05,
"loss": 3.6592,
"step": 9220
},
{
"epoch": 0.13,
"grad_norm": 22.938095092773438,
"learning_rate": 2.927030763086201e-05,
"loss": 3.519,
"step": 9240
},
{
"epoch": 0.13,
"grad_norm": 27.523639678955078,
"learning_rate": 2.9190506442989752e-05,
"loss": 3.5285,
"step": 9260
},
{
"epoch": 0.13,
"grad_norm": 27.63553237915039,
"learning_rate": 2.9110661298687824e-05,
"loss": 3.5641,
"step": 9280
},
{
"epoch": 0.13,
"grad_norm": 43.626129150390625,
"learning_rate": 2.9030773035493997e-05,
"loss": 3.5764,
"step": 9300
},
{
"epoch": 0.13,
"grad_norm": 20.081253051757812,
"learning_rate": 2.8950842491398357e-05,
"loss": 3.6518,
"step": 9320
},
{
"epoch": 0.13,
"grad_norm": 20.68466567993164,
"learning_rate": 2.8870870504834496e-05,
"loss": 3.6206,
"step": 9340
},
{
"epoch": 0.13,
"grad_norm": 72.61478424072266,
"learning_rate": 2.8790857914670698e-05,
"loss": 3.5108,
"step": 9360
},
{
"epoch": 0.13,
"grad_norm": 23.662805557250977,
"learning_rate": 2.871080556020118e-05,
"loss": 3.6223,
"step": 9380
},
{
"epoch": 0.13,
"grad_norm": 25.221176147460938,
"learning_rate": 2.863071428113726e-05,
"loss": 3.644,
"step": 9400
},
{
"epoch": 0.13,
"grad_norm": 42.87479782104492,
"learning_rate": 2.8550584917598554e-05,
"loss": 3.7027,
"step": 9420
},
{
"epoch": 0.13,
"grad_norm": 30.936325073242188,
"learning_rate": 2.8470418310104173e-05,
"loss": 3.5493,
"step": 9440
},
{
"epoch": 0.13,
"grad_norm": 24.074983596801758,
"learning_rate": 2.8390215299563884e-05,
"loss": 3.4781,
"step": 9460
},
{
"epoch": 0.13,
"grad_norm": 22.818313598632812,
"learning_rate": 2.8309976727269332e-05,
"loss": 3.5558,
"step": 9480
},
{
"epoch": 0.13,
"grad_norm": 33.634605407714844,
"learning_rate": 2.8229703434885163e-05,
"loss": 3.5958,
"step": 9500
},
{
"epoch": 0.13,
"grad_norm": 29.69095802307129,
"learning_rate": 2.814939626444023e-05,
"loss": 3.5682,
"step": 9520
},
{
"epoch": 0.14,
"grad_norm": 29.696638107299805,
"learning_rate": 2.8069056058318755e-05,
"loss": 3.5676,
"step": 9540
},
{
"epoch": 0.14,
"grad_norm": 34.828269958496094,
"learning_rate": 2.7988683659251474e-05,
"loss": 3.482,
"step": 9560
},
{
"epoch": 0.14,
"grad_norm": 21.408599853515625,
"learning_rate": 2.7908279910306835e-05,
"loss": 3.5189,
"step": 9580
},
{
"epoch": 0.14,
"grad_norm": 21.67983627319336,
"learning_rate": 2.782784565488211e-05,
"loss": 3.5703,
"step": 9600
},
{
"epoch": 0.14,
"grad_norm": 24.80797576904297,
"learning_rate": 2.7747381736694572e-05,
"loss": 3.512,
"step": 9620
},
{
"epoch": 0.14,
"grad_norm": 35.9987678527832,
"learning_rate": 2.766688899977266e-05,
"loss": 3.5937,
"step": 9640
},
{
"epoch": 0.14,
"grad_norm": 24.59494400024414,
"learning_rate": 2.7586368288447095e-05,
"loss": 3.5829,
"step": 9660
},
{
"epoch": 0.14,
"grad_norm": 26.161178588867188,
"learning_rate": 2.7505820447342028e-05,
"loss": 3.5978,
"step": 9680
},
{
"epoch": 0.14,
"grad_norm": 17.551490783691406,
"learning_rate": 2.7425246321366203e-05,
"loss": 3.527,
"step": 9700
},
{
"epoch": 0.14,
"grad_norm": 31.15974998474121,
"learning_rate": 2.7344646755704078e-05,
"loss": 3.6422,
"step": 9720
},
{
"epoch": 0.14,
"grad_norm": 24.26822280883789,
"learning_rate": 2.7264022595806948e-05,
"loss": 3.5971,
"step": 9740
},
{
"epoch": 0.14,
"grad_norm": 24.76336669921875,
"learning_rate": 2.71833746873841e-05,
"loss": 3.5972,
"step": 9760
},
{
"epoch": 0.14,
"grad_norm": 18.041610717773438,
"learning_rate": 2.7102703876393944e-05,
"loss": 3.5832,
"step": 9780
},
{
"epoch": 0.14,
"grad_norm": 45.17101287841797,
"learning_rate": 2.7022011009035107e-05,
"loss": 3.5754,
"step": 9800
},
{
"epoch": 0.14,
"grad_norm": 30.87299156188965,
"learning_rate": 2.6941296931737585e-05,
"loss": 3.6022,
"step": 9820
},
{
"epoch": 0.14,
"grad_norm": 32.659786224365234,
"learning_rate": 2.686056249115385e-05,
"loss": 3.5947,
"step": 9840
},
{
"epoch": 0.14,
"grad_norm": 25.66715431213379,
"learning_rate": 2.6779808534149987e-05,
"loss": 3.5997,
"step": 9860
},
{
"epoch": 0.14,
"grad_norm": 24.735023498535156,
"learning_rate": 2.6699035907796792e-05,
"loss": 3.5619,
"step": 9880
},
{
"epoch": 0.14,
"grad_norm": 24.357643127441406,
"learning_rate": 2.6618245459360897e-05,
"loss": 3.6028,
"step": 9900
},
{
"epoch": 0.14,
"grad_norm": 29.255617141723633,
"learning_rate": 2.6537438036295875e-05,
"loss": 3.5231,
"step": 9920
},
{
"epoch": 0.14,
"grad_norm": 19.508926391601562,
"learning_rate": 2.6456614486233343e-05,
"loss": 3.5555,
"step": 9940
},
{
"epoch": 0.14,
"grad_norm": 29.48908042907715,
"learning_rate": 2.6375775656974123e-05,
"loss": 3.5376,
"step": 9960
},
{
"epoch": 0.14,
"grad_norm": 23.857908248901367,
"learning_rate": 2.629492239647926e-05,
"loss": 3.5641,
"step": 9980
},
{
"epoch": 0.14,
"grad_norm": 39.08363342285156,
"learning_rate": 2.621405555286121e-05,
"loss": 3.5957,
"step": 10000
},
{
"epoch": 0.14,
"grad_norm": 30.3124942779541,
"learning_rate": 2.6133175974374892e-05,
"loss": 3.5933,
"step": 10020
},
{
"epoch": 0.14,
"grad_norm": 23.010902404785156,
"learning_rate": 2.6052284509408804e-05,
"loss": 3.573,
"step": 10040
},
{
"epoch": 0.14,
"grad_norm": 34.93478775024414,
"learning_rate": 2.5971382006476154e-05,
"loss": 3.5641,
"step": 10060
},
{
"epoch": 0.14,
"grad_norm": 28.346033096313477,
"learning_rate": 2.5890469314205897e-05,
"loss": 3.5833,
"step": 10080
},
{
"epoch": 0.14,
"grad_norm": 41.69817352294922,
"learning_rate": 2.5809547281333902e-05,
"loss": 3.5718,
"step": 10100
},
{
"epoch": 0.14,
"grad_norm": 36.409114837646484,
"learning_rate": 2.5728616756693997e-05,
"loss": 3.5675,
"step": 10120
},
{
"epoch": 0.14,
"grad_norm": 23.812952041625977,
"learning_rate": 2.564767858920909e-05,
"loss": 3.6445,
"step": 10140
},
{
"epoch": 0.14,
"grad_norm": 31.33305549621582,
"learning_rate": 2.556673362788225e-05,
"loss": 3.5669,
"step": 10160
},
{
"epoch": 0.14,
"grad_norm": 34.65876770019531,
"learning_rate": 2.5485782721787837e-05,
"loss": 3.53,
"step": 10180
},
{
"epoch": 0.14,
"grad_norm": 21.920583724975586,
"learning_rate": 2.540482672006254e-05,
"loss": 3.5825,
"step": 10200
},
{
"epoch": 0.14,
"grad_norm": 48.019004821777344,
"learning_rate": 2.5323866471896512e-05,
"loss": 3.5733,
"step": 10220
},
{
"epoch": 0.14,
"grad_norm": 52.00071334838867,
"learning_rate": 2.5242902826524434e-05,
"loss": 3.5487,
"step": 10240
},
{
"epoch": 0.15,
"grad_norm": 47.33055877685547,
"learning_rate": 2.5161936633216653e-05,
"loss": 3.5076,
"step": 10260
},
{
"epoch": 0.15,
"grad_norm": 20.109745025634766,
"learning_rate": 2.5080968741270223e-05,
"loss": 3.5991,
"step": 10280
},
{
"epoch": 0.15,
"grad_norm": 23.458494186401367,
"learning_rate": 2.5e-05,
"loss": 3.6357,
"step": 10300
},
{
"epoch": 0.15,
"grad_norm": 23.68842315673828,
"learning_rate": 2.4919031258729786e-05,
"loss": 3.5449,
"step": 10320
},
{
"epoch": 0.15,
"grad_norm": 21.289154052734375,
"learning_rate": 2.4838063366783353e-05,
"loss": 3.6704,
"step": 10340
},
{
"epoch": 0.15,
"grad_norm": 23.132051467895508,
"learning_rate": 2.4757097173475572e-05,
"loss": 3.6327,
"step": 10360
},
{
"epoch": 0.15,
"grad_norm": 29.875104904174805,
"learning_rate": 2.4676133528103497e-05,
"loss": 3.5294,
"step": 10380
},
{
"epoch": 0.15,
"grad_norm": 23.41105079650879,
"learning_rate": 2.4595173279937464e-05,
"loss": 3.5995,
"step": 10400
},
{
"epoch": 0.15,
"grad_norm": 22.15860939025879,
"learning_rate": 2.451421727821217e-05,
"loss": 3.6109,
"step": 10420
},
{
"epoch": 0.15,
"grad_norm": 28.534278869628906,
"learning_rate": 2.443326637211775e-05,
"loss": 3.6389,
"step": 10440
},
{
"epoch": 0.15,
"grad_norm": 26.33950424194336,
"learning_rate": 2.435232141079092e-05,
"loss": 3.6083,
"step": 10460
},
{
"epoch": 0.15,
"grad_norm": 19.027633666992188,
"learning_rate": 2.4271383243306016e-05,
"loss": 3.5256,
"step": 10480
},
{
"epoch": 0.15,
"grad_norm": 28.898550033569336,
"learning_rate": 2.419045271866611e-05,
"loss": 3.61,
"step": 10500
},
{
"epoch": 0.15,
"grad_norm": 35.347347259521484,
"learning_rate": 2.410953068579411e-05,
"loss": 3.616,
"step": 10520
},
{
"epoch": 0.15,
"grad_norm": 23.184894561767578,
"learning_rate": 2.402861799352386e-05,
"loss": 3.6263,
"step": 10540
},
{
"epoch": 0.15,
"grad_norm": 32.66107177734375,
"learning_rate": 2.3947715490591206e-05,
"loss": 3.5446,
"step": 10560
},
{
"epoch": 0.15,
"grad_norm": 20.614028930664062,
"learning_rate": 2.3866824025625124e-05,
"loss": 3.5989,
"step": 10580
},
{
"epoch": 0.15,
"grad_norm": 25.750699996948242,
"learning_rate": 2.3785944447138802e-05,
"loss": 3.5197,
"step": 10600
},
{
"epoch": 0.15,
"grad_norm": 23.97648048400879,
"learning_rate": 2.370507760352074e-05,
"loss": 3.6399,
"step": 10620
},
{
"epoch": 0.15,
"grad_norm": 19.600095748901367,
"learning_rate": 2.362422434302588e-05,
"loss": 3.5295,
"step": 10640
},
{
"epoch": 0.15,
"grad_norm": 27.21882438659668,
"learning_rate": 2.3543385513766656e-05,
"loss": 3.512,
"step": 10660
},
{
"epoch": 0.15,
"grad_norm": 27.75621795654297,
"learning_rate": 2.3462561963704134e-05,
"loss": 3.5351,
"step": 10680
},
{
"epoch": 0.15,
"grad_norm": 27.200828552246094,
"learning_rate": 2.338175454063911e-05,
"loss": 3.5038,
"step": 10700
},
{
"epoch": 0.15,
"grad_norm": 27.96784782409668,
"learning_rate": 2.3300964092203207e-05,
"loss": 3.6097,
"step": 10720
},
{
"epoch": 0.15,
"grad_norm": 28.206979751586914,
"learning_rate": 2.3220191465850015e-05,
"loss": 3.5254,
"step": 10740
},
{
"epoch": 0.15,
"grad_norm": 22.781152725219727,
"learning_rate": 2.3139437508846155e-05,
"loss": 3.5857,
"step": 10760
},
{
"epoch": 0.15,
"grad_norm": 23.07236099243164,
"learning_rate": 2.305870306826242e-05,
"loss": 3.4872,
"step": 10780
},
{
"epoch": 0.15,
"grad_norm": 22.408714294433594,
"learning_rate": 2.29779889909649e-05,
"loss": 3.5115,
"step": 10800
},
{
"epoch": 0.15,
"grad_norm": 23.98442268371582,
"learning_rate": 2.289729612360606e-05,
"loss": 3.6297,
"step": 10820
},
{
"epoch": 0.15,
"grad_norm": 29.503135681152344,
"learning_rate": 2.2816625312615903e-05,
"loss": 3.6209,
"step": 10840
},
{
"epoch": 0.15,
"grad_norm": 30.787010192871094,
"learning_rate": 2.2735977404193058e-05,
"loss": 3.4921,
"step": 10860
},
{
"epoch": 0.15,
"grad_norm": 24.088376998901367,
"learning_rate": 2.2655353244295928e-05,
"loss": 3.5582,
"step": 10880
},
{
"epoch": 0.15,
"grad_norm": 25.253761291503906,
"learning_rate": 2.25747536786338e-05,
"loss": 3.5297,
"step": 10900
},
{
"epoch": 0.15,
"grad_norm": 24.333845138549805,
"learning_rate": 2.2494179552657978e-05,
"loss": 3.6105,
"step": 10920
},
{
"epoch": 0.15,
"grad_norm": 32.39179229736328,
"learning_rate": 2.241363171155291e-05,
"loss": 3.6122,
"step": 10940
},
{
"epoch": 0.16,
"grad_norm": 31.885894775390625,
"learning_rate": 2.2333111000227342e-05,
"loss": 3.6358,
"step": 10960
},
{
"epoch": 0.16,
"grad_norm": 22.056386947631836,
"learning_rate": 2.225261826330543e-05,
"loss": 3.5181,
"step": 10980
},
{
"epoch": 0.16,
"grad_norm": 23.47673225402832,
"learning_rate": 2.2172154345117894e-05,
"loss": 3.4853,
"step": 11000
},
{
"epoch": 0.16,
"grad_norm": 23.548656463623047,
"learning_rate": 2.2091720089693168e-05,
"loss": 3.5468,
"step": 11020
},
{
"epoch": 0.16,
"grad_norm": 16.66544532775879,
"learning_rate": 2.201131634074853e-05,
"loss": 3.626,
"step": 11040
},
{
"epoch": 0.16,
"grad_norm": 30.556697845458984,
"learning_rate": 2.1930943941681254e-05,
"loss": 3.5565,
"step": 11060
},
{
"epoch": 0.16,
"grad_norm": 62.914642333984375,
"learning_rate": 2.1850603735559778e-05,
"loss": 3.554,
"step": 11080
},
{
"epoch": 0.16,
"grad_norm": 25.617481231689453,
"learning_rate": 2.177029656511485e-05,
"loss": 3.5449,
"step": 11100
},
{
"epoch": 0.16,
"grad_norm": 33.26127243041992,
"learning_rate": 2.169002327273068e-05,
"loss": 3.6071,
"step": 11120
},
{
"epoch": 0.16,
"grad_norm": 21.895418167114258,
"learning_rate": 2.160978470043612e-05,
"loss": 3.4622,
"step": 11140
},
{
"epoch": 0.16,
"grad_norm": 25.30924415588379,
"learning_rate": 2.152958168989584e-05,
"loss": 3.5169,
"step": 11160
},
{
"epoch": 0.16,
"grad_norm": 28.7779541015625,
"learning_rate": 2.1449415082401455e-05,
"loss": 3.5817,
"step": 11180
},
{
"epoch": 0.16,
"grad_norm": 24.38544273376465,
"learning_rate": 2.136928571886275e-05,
"loss": 3.5433,
"step": 11200
},
{
"epoch": 0.16,
"grad_norm": 36.38949966430664,
"learning_rate": 2.1289194439798818e-05,
"loss": 3.5653,
"step": 11220
},
{
"epoch": 0.16,
"grad_norm": 36.11268615722656,
"learning_rate": 2.12091420853293e-05,
"loss": 3.4839,
"step": 11240
},
{
"epoch": 0.16,
"grad_norm": 18.36191749572754,
"learning_rate": 2.1129129495165507e-05,
"loss": 3.5532,
"step": 11260
},
{
"epoch": 0.16,
"grad_norm": 27.239763259887695,
"learning_rate": 2.1049157508601642e-05,
"loss": 3.5536,
"step": 11280
},
{
"epoch": 0.16,
"grad_norm": 25.459758758544922,
"learning_rate": 2.0969226964506006e-05,
"loss": 3.4878,
"step": 11300
},
{
"epoch": 0.16,
"grad_norm": 28.359439849853516,
"learning_rate": 2.0889338701312185e-05,
"loss": 3.563,
"step": 11320
},
{
"epoch": 0.16,
"grad_norm": 25.177392959594727,
"learning_rate": 2.0809493557010247e-05,
"loss": 3.6313,
"step": 11340
},
{
"epoch": 0.16,
"grad_norm": 26.633609771728516,
"learning_rate": 2.072969236913799e-05,
"loss": 3.6034,
"step": 11360
},
{
"epoch": 0.16,
"grad_norm": 19.589900970458984,
"learning_rate": 2.0649935974772105e-05,
"loss": 3.6429,
"step": 11380
},
{
"epoch": 0.16,
"grad_norm": 35.16368865966797,
"learning_rate": 2.0570225210519434e-05,
"loss": 3.5154,
"step": 11400
},
{
"epoch": 0.16,
"grad_norm": 37.59727478027344,
"learning_rate": 2.0490560912508168e-05,
"loss": 3.5652,
"step": 11420
},
{
"epoch": 0.16,
"grad_norm": 37.76837158203125,
"learning_rate": 2.04109439163791e-05,
"loss": 3.6911,
"step": 11440
},
{
"epoch": 0.16,
"grad_norm": 27.86673355102539,
"learning_rate": 2.0331375057276844e-05,
"loss": 3.4824,
"step": 11460
},
{
"epoch": 0.16,
"grad_norm": 51.12165832519531,
"learning_rate": 2.025185516984108e-05,
"loss": 3.558,
"step": 11480
},
{
"epoch": 0.16,
"grad_norm": 22.489160537719727,
"learning_rate": 2.0172385088197803e-05,
"loss": 3.5595,
"step": 11500
},
{
"epoch": 0.16,
"grad_norm": 19.6495304107666,
"learning_rate": 2.0092965645950564e-05,
"loss": 3.5679,
"step": 11520
},
{
"epoch": 0.16,
"grad_norm": 19.997142791748047,
"learning_rate": 2.001359767617173e-05,
"loss": 3.5332,
"step": 11540
},
{
"epoch": 0.16,
"grad_norm": 34.29532241821289,
"learning_rate": 1.9934282011393753e-05,
"loss": 3.4848,
"step": 11560
},
{
"epoch": 0.16,
"grad_norm": 20.737041473388672,
"learning_rate": 1.985501948360041e-05,
"loss": 3.4874,
"step": 11580
},
{
"epoch": 0.16,
"grad_norm": 30.11549186706543,
"learning_rate": 1.9775810924218125e-05,
"loss": 3.5166,
"step": 11600
},
{
"epoch": 0.16,
"grad_norm": 23.56212615966797,
"learning_rate": 1.9696657164107202e-05,
"loss": 3.652,
"step": 11620
},
{
"epoch": 0.16,
"grad_norm": 20.44150733947754,
"learning_rate": 1.9617559033553128e-05,
"loss": 3.5137,
"step": 11640
},
{
"epoch": 0.17,
"grad_norm": 33.37120819091797,
"learning_rate": 1.9538517362257868e-05,
"loss": 3.5163,
"step": 11660
},
{
"epoch": 0.17,
"grad_norm": 29.839820861816406,
"learning_rate": 1.945953297933115e-05,
"loss": 3.5979,
"step": 11680
},
{
"epoch": 0.17,
"grad_norm": 25.600812911987305,
"learning_rate": 1.9380606713281775e-05,
"loss": 3.6111,
"step": 11700
},
{
"epoch": 0.17,
"grad_norm": 40.76740264892578,
"learning_rate": 1.9301739392008923e-05,
"loss": 3.6727,
"step": 11720
},
{
"epoch": 0.17,
"grad_norm": 25.436763763427734,
"learning_rate": 1.9222931842793473e-05,
"loss": 3.6145,
"step": 11740
},
{
"epoch": 0.17,
"grad_norm": 19.53345489501953,
"learning_rate": 1.9144184892289337e-05,
"loss": 3.5486,
"step": 11760
},
{
"epoch": 0.17,
"grad_norm": 21.103118896484375,
"learning_rate": 1.9065499366514757e-05,
"loss": 3.5796,
"step": 11780
},
{
"epoch": 0.17,
"grad_norm": 26.135894775390625,
"learning_rate": 1.8986876090843667e-05,
"loss": 3.5905,
"step": 11800
},
{
"epoch": 0.17,
"grad_norm": 32.71371841430664,
"learning_rate": 1.8908315889997007e-05,
"loss": 3.531,
"step": 11820
},
{
"epoch": 0.17,
"grad_norm": 23.510149002075195,
"learning_rate": 1.882981958803414e-05,
"loss": 3.5597,
"step": 11840
},
{
"epoch": 0.17,
"grad_norm": 23.804306030273438,
"learning_rate": 1.8751388008344117e-05,
"loss": 3.5755,
"step": 11860
},
{
"epoch": 0.17,
"grad_norm": 33.7330436706543,
"learning_rate": 1.8673021973637095e-05,
"loss": 3.5092,
"step": 11880
},
{
"epoch": 0.17,
"grad_norm": 20.88456153869629,
"learning_rate": 1.859472230593569e-05,
"loss": 3.6001,
"step": 11900
},
{
"epoch": 0.17,
"grad_norm": 28.640546798706055,
"learning_rate": 1.8516489826566376e-05,
"loss": 3.5419,
"step": 11920
},
{
"epoch": 0.17,
"grad_norm": 25.2142391204834,
"learning_rate": 1.8438325356150826e-05,
"loss": 3.465,
"step": 11940
},
{
"epoch": 0.17,
"grad_norm": 27.663267135620117,
"learning_rate": 1.836022971459737e-05,
"loss": 3.5017,
"step": 11960
},
{
"epoch": 0.17,
"grad_norm": 31.913984298706055,
"learning_rate": 1.828220372109232e-05,
"loss": 3.5187,
"step": 11980
},
{
"epoch": 0.17,
"grad_norm": 29.825590133666992,
"learning_rate": 1.820424819409143e-05,
"loss": 3.5469,
"step": 12000
},
{
"epoch": 0.17,
"grad_norm": 18.72800636291504,
"learning_rate": 1.8126363951311287e-05,
"loss": 3.5486,
"step": 12020
},
{
"epoch": 0.17,
"grad_norm": 30.366409301757812,
"learning_rate": 1.804855180972075e-05,
"loss": 3.5487,
"step": 12040
},
{
"epoch": 0.17,
"grad_norm": 25.00339698791504,
"learning_rate": 1.797081258553236e-05,
"loss": 3.4778,
"step": 12060
},
{
"epoch": 0.17,
"grad_norm": 29.204017639160156,
"learning_rate": 1.7893147094193786e-05,
"loss": 3.446,
"step": 12080
},
{
"epoch": 0.17,
"grad_norm": 28.64485740661621,
"learning_rate": 1.7815556150379298e-05,
"loss": 3.5421,
"step": 12100
},
{
"epoch": 0.17,
"grad_norm": 31.4785213470459,
"learning_rate": 1.7738040567981166e-05,
"loss": 3.5075,
"step": 12120
},
{
"epoch": 0.17,
"grad_norm": 28.798315048217773,
"learning_rate": 1.766060116010118e-05,
"loss": 3.5049,
"step": 12140
},
{
"epoch": 0.17,
"grad_norm": 27.112850189208984,
"learning_rate": 1.7583238739042086e-05,
"loss": 3.5939,
"step": 12160
},
{
"epoch": 0.17,
"grad_norm": 24.396697998046875,
"learning_rate": 1.7505954116299063e-05,
"loss": 3.4596,
"step": 12180
},
{
"epoch": 0.17,
"grad_norm": 18.46675682067871,
"learning_rate": 1.7428748102551237e-05,
"loss": 3.4861,
"step": 12200
},
{
"epoch": 0.17,
"grad_norm": 25.615234375,
"learning_rate": 1.7351621507653157e-05,
"loss": 3.5211,
"step": 12220
},
{
"epoch": 0.17,
"grad_norm": 23.890607833862305,
"learning_rate": 1.7274575140626318e-05,
"loss": 3.5255,
"step": 12240
},
{
"epoch": 0.17,
"grad_norm": 19.983030319213867,
"learning_rate": 1.7197609809650643e-05,
"loss": 3.5567,
"step": 12260
},
{
"epoch": 0.17,
"grad_norm": 24.144041061401367,
"learning_rate": 1.712072632205604e-05,
"loss": 3.5586,
"step": 12280
},
{
"epoch": 0.17,
"grad_norm": 35.812835693359375,
"learning_rate": 1.704392548431391e-05,
"loss": 3.5274,
"step": 12300
},
{
"epoch": 0.17,
"grad_norm": 18.759809494018555,
"learning_rate": 1.6967208102028697e-05,
"loss": 3.5823,
"step": 12320
},
{
"epoch": 0.17,
"grad_norm": 19.85857391357422,
"learning_rate": 1.6890574979929448e-05,
"loss": 3.5583,
"step": 12340
},
{
"epoch": 0.17,
"grad_norm": 21.088096618652344,
"learning_rate": 1.6814026921861335e-05,
"loss": 3.5084,
"step": 12360
},
{
"epoch": 0.18,
"grad_norm": 21.760805130004883,
"learning_rate": 1.6737564730777284e-05,
"loss": 3.4753,
"step": 12380
},
{
"epoch": 0.18,
"grad_norm": 25.510221481323242,
"learning_rate": 1.666118920872949e-05,
"loss": 3.6024,
"step": 12400
},
{
"epoch": 0.18,
"grad_norm": 29.43688201904297,
"learning_rate": 1.658490115686104e-05,
"loss": 3.647,
"step": 12420
},
{
"epoch": 0.18,
"grad_norm": 19.17232322692871,
"learning_rate": 1.6508701375397487e-05,
"loss": 3.5505,
"step": 12440
},
{
"epoch": 0.18,
"grad_norm": 27.04405975341797,
"learning_rate": 1.64325906636385e-05,
"loss": 3.5158,
"step": 12460
},
{
"epoch": 0.18,
"grad_norm": 34.61522674560547,
"learning_rate": 1.635656981994943e-05,
"loss": 3.5723,
"step": 12480
},
{
"epoch": 0.18,
"grad_norm": 22.05956268310547,
"learning_rate": 1.6280639641752942e-05,
"loss": 3.5133,
"step": 12500
},
{
"epoch": 0.18,
"grad_norm": 26.21821403503418,
"learning_rate": 1.6204800925520685e-05,
"loss": 3.4956,
"step": 12520
},
{
"epoch": 0.18,
"grad_norm": 16.636159896850586,
"learning_rate": 1.6129054466764904e-05,
"loss": 3.5843,
"step": 12540
},
{
"epoch": 0.18,
"grad_norm": 27.356168746948242,
"learning_rate": 1.60534010600301e-05,
"loss": 3.5189,
"step": 12560
},
{
"epoch": 0.18,
"grad_norm": 19.394620895385742,
"learning_rate": 1.5977841498884723e-05,
"loss": 3.5838,
"step": 12580
},
{
"epoch": 0.18,
"grad_norm": 31.54738426208496,
"learning_rate": 1.5902376575912815e-05,
"loss": 3.6633,
"step": 12600
},
{
"epoch": 0.18,
"grad_norm": 23.533172607421875,
"learning_rate": 1.5827007082705698e-05,
"loss": 3.5234,
"step": 12620
},
{
"epoch": 0.18,
"grad_norm": 25.580156326293945,
"learning_rate": 1.5751733809853704e-05,
"loss": 3.5478,
"step": 12640
},
{
"epoch": 0.18,
"grad_norm": 28.20244789123535,
"learning_rate": 1.5676557546937838e-05,
"loss": 3.49,
"step": 12660
},
{
"epoch": 0.18,
"grad_norm": 26.89322280883789,
"learning_rate": 1.5601479082521526e-05,
"loss": 3.5238,
"step": 12680
},
{
"epoch": 0.18,
"grad_norm": 25.817209243774414,
"learning_rate": 1.552649920414233e-05,
"loss": 3.5417,
"step": 12700
},
{
"epoch": 0.18,
"grad_norm": 30.55599594116211,
"learning_rate": 1.545161869830371e-05,
"loss": 3.5908,
"step": 12720
},
{
"epoch": 0.18,
"grad_norm": 49.497894287109375,
"learning_rate": 1.5376838350466725e-05,
"loss": 3.6647,
"step": 12740
},
{
"epoch": 0.18,
"grad_norm": 17.040536880493164,
"learning_rate": 1.5302158945041838e-05,
"loss": 3.5271,
"step": 12760
},
{
"epoch": 0.18,
"grad_norm": 31.21510124206543,
"learning_rate": 1.5227581265380685e-05,
"loss": 3.4708,
"step": 12780
},
{
"epoch": 0.18,
"grad_norm": 26.194982528686523,
"learning_rate": 1.5153106093767827e-05,
"loss": 3.5831,
"step": 12800
},
{
"epoch": 0.18,
"grad_norm": 21.868547439575195,
"learning_rate": 1.5078734211412573e-05,
"loss": 3.532,
"step": 12820
},
{
"epoch": 0.18,
"grad_norm": 27.728635787963867,
"learning_rate": 1.5004466398440775e-05,
"loss": 3.5432,
"step": 12840
},
{
"epoch": 0.18,
"grad_norm": 22.69495964050293,
"learning_rate": 1.493030343388666e-05,
"loss": 3.5464,
"step": 12860
},
{
"epoch": 0.18,
"grad_norm": 22.85190773010254,
"learning_rate": 1.4856246095684622e-05,
"loss": 3.5686,
"step": 12880
},
{
"epoch": 0.18,
"grad_norm": 18.40359115600586,
"learning_rate": 1.4782295160661103e-05,
"loss": 3.4922,
"step": 12900
},
{
"epoch": 0.18,
"grad_norm": 27.058635711669922,
"learning_rate": 1.4708451404526407e-05,
"loss": 3.5231,
"step": 12920
},
{
"epoch": 0.18,
"grad_norm": 28.491573333740234,
"learning_rate": 1.4634715601866606e-05,
"loss": 3.502,
"step": 12940
},
{
"epoch": 0.18,
"grad_norm": 26.94320297241211,
"learning_rate": 1.4561088526135375e-05,
"loss": 3.5746,
"step": 12960
},
{
"epoch": 0.18,
"grad_norm": 26.503097534179688,
"learning_rate": 1.4487570949645888e-05,
"loss": 3.5195,
"step": 12980
},
{
"epoch": 0.18,
"grad_norm": 32.38998794555664,
"learning_rate": 1.4414163643562755e-05,
"loss": 3.5195,
"step": 13000
},
{
"epoch": 0.18,
"grad_norm": 29.240285873413086,
"learning_rate": 1.434086737789386e-05,
"loss": 3.6301,
"step": 13020
},
{
"epoch": 0.18,
"grad_norm": 19.996788024902344,
"learning_rate": 1.4267682921482356e-05,
"loss": 3.5252,
"step": 13040
},
{
"epoch": 0.18,
"grad_norm": 26.969528198242188,
"learning_rate": 1.419461104199856e-05,
"loss": 3.546,
"step": 13060
},
{
"epoch": 0.19,
"grad_norm": 39.526832580566406,
"learning_rate": 1.412165250593192e-05,
"loss": 3.5464,
"step": 13080
},
{
"epoch": 0.19,
"grad_norm": 22.60038948059082,
"learning_rate": 1.4048808078582942e-05,
"loss": 3.475,
"step": 13100
},
{
"epoch": 0.19,
"grad_norm": 20.97502899169922,
"learning_rate": 1.3976078524055203e-05,
"loss": 3.5398,
"step": 13120
},
{
"epoch": 0.19,
"grad_norm": 16.191055297851562,
"learning_rate": 1.3903464605247325e-05,
"loss": 3.4869,
"step": 13140
},
{
"epoch": 0.19,
"grad_norm": 22.308320999145508,
"learning_rate": 1.3830967083844942e-05,
"loss": 3.4316,
"step": 13160
},
{
"epoch": 0.19,
"grad_norm": 23.294443130493164,
"learning_rate": 1.375858672031276e-05,
"loss": 3.6033,
"step": 13180
},
{
"epoch": 0.19,
"grad_norm": 24.432270050048828,
"learning_rate": 1.368632427388653e-05,
"loss": 3.4829,
"step": 13200
},
{
"epoch": 0.19,
"grad_norm": 24.131166458129883,
"learning_rate": 1.3614180502565135e-05,
"loss": 3.5721,
"step": 13220
},
{
"epoch": 0.19,
"grad_norm": 36.706668853759766,
"learning_rate": 1.3542156163102582e-05,
"loss": 3.4877,
"step": 13240
},
{
"epoch": 0.19,
"grad_norm": 22.797359466552734,
"learning_rate": 1.3470252011000123e-05,
"loss": 3.539,
"step": 13260
},
{
"epoch": 0.19,
"grad_norm": 26.030719757080078,
"learning_rate": 1.3398468800498293e-05,
"loss": 3.5415,
"step": 13280
},
{
"epoch": 0.19,
"grad_norm": 24.123130798339844,
"learning_rate": 1.3326807284568984e-05,
"loss": 3.5354,
"step": 13300
},
{
"epoch": 0.19,
"grad_norm": 24.965229034423828,
"learning_rate": 1.3255268214907613e-05,
"loss": 3.387,
"step": 13320
},
{
"epoch": 0.19,
"grad_norm": 27.477920532226562,
"learning_rate": 1.3183852341925145e-05,
"loss": 3.5484,
"step": 13340
},
{
"epoch": 0.19,
"grad_norm": 27.259984970092773,
"learning_rate": 1.3112560414740315e-05,
"loss": 3.5104,
"step": 13360
},
{
"epoch": 0.19,
"grad_norm": 34.4488525390625,
"learning_rate": 1.3041393181171688e-05,
"loss": 3.5881,
"step": 13380
},
{
"epoch": 0.19,
"grad_norm": 30.281177520751953,
"learning_rate": 1.2970351387729873e-05,
"loss": 3.5851,
"step": 13400
},
{
"epoch": 0.19,
"grad_norm": 21.918861389160156,
"learning_rate": 1.2899435779609682e-05,
"loss": 3.5427,
"step": 13420
},
{
"epoch": 0.19,
"grad_norm": 30.275283813476562,
"learning_rate": 1.2828647100682261e-05,
"loss": 3.6322,
"step": 13440
},
{
"epoch": 0.19,
"grad_norm": 57.30770492553711,
"learning_rate": 1.275798609348738e-05,
"loss": 3.5871,
"step": 13460
},
{
"epoch": 0.19,
"grad_norm": 21.865861892700195,
"learning_rate": 1.2687453499225545e-05,
"loss": 3.5117,
"step": 13480
},
{
"epoch": 0.19,
"grad_norm": 19.6708927154541,
"learning_rate": 1.2617050057750322e-05,
"loss": 3.5015,
"step": 13500
},
{
"epoch": 0.19,
"grad_norm": 27.376646041870117,
"learning_rate": 1.2546776507560468e-05,
"loss": 3.5206,
"step": 13520
},
{
"epoch": 0.19,
"grad_norm": 44.89965057373047,
"learning_rate": 1.2476633585792286e-05,
"loss": 3.5766,
"step": 13540
},
{
"epoch": 0.19,
"grad_norm": 37.562286376953125,
"learning_rate": 1.2406622028211844e-05,
"loss": 3.5488,
"step": 13560
},
{
"epoch": 0.19,
"grad_norm": 41.833892822265625,
"learning_rate": 1.2336742569207235e-05,
"loss": 3.6429,
"step": 13580
},
{
"epoch": 0.19,
"grad_norm": 44.58812713623047,
"learning_rate": 1.2266995941780934e-05,
"loss": 3.5362,
"step": 13600
},
{
"epoch": 0.19,
"grad_norm": 26.543933868408203,
"learning_rate": 1.2197382877542041e-05,
"loss": 3.5761,
"step": 13620
},
{
"epoch": 0.19,
"grad_norm": 25.108768463134766,
"learning_rate": 1.2127904106698666e-05,
"loss": 3.4656,
"step": 13640
},
{
"epoch": 0.19,
"grad_norm": 22.213363647460938,
"learning_rate": 1.2058560358050241e-05,
"loss": 3.5438,
"step": 13660
},
{
"epoch": 0.19,
"grad_norm": 25.67693519592285,
"learning_rate": 1.1989352358979888e-05,
"loss": 3.5508,
"step": 13680
},
{
"epoch": 0.19,
"grad_norm": 23.043434143066406,
"learning_rate": 1.1920280835446748e-05,
"loss": 3.5901,
"step": 13700
},
{
"epoch": 0.19,
"grad_norm": 25.011388778686523,
"learning_rate": 1.1851346511978425e-05,
"loss": 3.5773,
"step": 13720
},
{
"epoch": 0.19,
"grad_norm": 29.659713745117188,
"learning_rate": 1.1782550111663369e-05,
"loss": 3.5795,
"step": 13740
},
{
"epoch": 0.19,
"grad_norm": 28.714496612548828,
"learning_rate": 1.1713892356143239e-05,
"loss": 3.5942,
"step": 13760
},
{
"epoch": 0.2,
"grad_norm": 29.76102066040039,
"learning_rate": 1.1645373965605425e-05,
"loss": 3.5008,
"step": 13780
},
{
"epoch": 0.2,
"grad_norm": 36.479854583740234,
"learning_rate": 1.1576995658775405e-05,
"loss": 3.4347,
"step": 13800
},
{
"epoch": 0.2,
"grad_norm": 22.23845100402832,
"learning_rate": 1.1508758152909273e-05,
"loss": 3.559,
"step": 13820
},
{
"epoch": 0.2,
"grad_norm": 52.79865646362305,
"learning_rate": 1.1440662163786167e-05,
"loss": 3.5128,
"step": 13840
},
{
"epoch": 0.2,
"grad_norm": 23.489912033081055,
"learning_rate": 1.1372708405700793e-05,
"loss": 3.6525,
"step": 13860
},
{
"epoch": 0.2,
"grad_norm": 22.640663146972656,
"learning_rate": 1.1304897591455928e-05,
"loss": 3.5387,
"step": 13880
},
{
"epoch": 0.2,
"grad_norm": 23.30765724182129,
"learning_rate": 1.1237230432354912e-05,
"loss": 3.5714,
"step": 13900
},
{
"epoch": 0.2,
"grad_norm": 27.42571258544922,
"learning_rate": 1.1169707638194238e-05,
"loss": 3.6333,
"step": 13920
},
{
"epoch": 0.2,
"grad_norm": 17.366491317749023,
"learning_rate": 1.1102329917256046e-05,
"loss": 3.5651,
"step": 13940
},
{
"epoch": 0.2,
"grad_norm": 17.73781967163086,
"learning_rate": 1.103509797630077e-05,
"loss": 3.6944,
"step": 13960
},
{
"epoch": 0.2,
"grad_norm": 23.699871063232422,
"learning_rate": 1.0968012520559634e-05,
"loss": 3.5914,
"step": 13980
},
{
"epoch": 0.2,
"grad_norm": 21.75518798828125,
"learning_rate": 1.0901074253727336e-05,
"loss": 3.592,
"step": 14000
},
{
"epoch": 0.2,
"grad_norm": 26.731660842895508,
"learning_rate": 1.083428387795463e-05,
"loss": 3.5461,
"step": 14020
},
{
"epoch": 0.2,
"grad_norm": 20.746625900268555,
"learning_rate": 1.0767642093840932e-05,
"loss": 3.5951,
"step": 14040
},
{
"epoch": 0.2,
"grad_norm": 29.204326629638672,
"learning_rate": 1.0701149600427044e-05,
"loss": 3.591,
"step": 14060
},
{
"epoch": 0.2,
"grad_norm": 23.31056022644043,
"learning_rate": 1.0634807095187737e-05,
"loss": 3.4382,
"step": 14080
},
{
"epoch": 0.2,
"grad_norm": 20.306123733520508,
"learning_rate": 1.0568615274024522e-05,
"loss": 3.5539,
"step": 14100
},
{
"epoch": 0.2,
"grad_norm": 19.775318145751953,
"learning_rate": 1.0502574831258259e-05,
"loss": 3.5532,
"step": 14120
},
{
"epoch": 0.2,
"grad_norm": 23.407949447631836,
"learning_rate": 1.043668645962195e-05,
"loss": 3.4811,
"step": 14140
},
{
"epoch": 0.2,
"grad_norm": 27.759410858154297,
"learning_rate": 1.0370950850253449e-05,
"loss": 3.6196,
"step": 14160
},
{
"epoch": 0.2,
"grad_norm": 35.55162048339844,
"learning_rate": 1.0305368692688174e-05,
"loss": 3.4774,
"step": 14180
},
{
"epoch": 0.2,
"grad_norm": 23.29683494567871,
"learning_rate": 1.0239940674851941e-05,
"loss": 3.5437,
"step": 14200
},
{
"epoch": 0.2,
"grad_norm": 23.486143112182617,
"learning_rate": 1.0174667483053682e-05,
"loss": 3.671,
"step": 14220
},
{
"epoch": 0.2,
"grad_norm": 19.907581329345703,
"learning_rate": 1.0109549801978305e-05,
"loss": 3.4272,
"step": 14240
},
{
"epoch": 0.2,
"grad_norm": 35.4050407409668,
"learning_rate": 1.0044588314679451e-05,
"loss": 3.5397,
"step": 14260
},
{
"epoch": 0.2,
"grad_norm": 28.344934463500977,
"learning_rate": 9.979783702572412e-06,
"loss": 3.5157,
"step": 14280
},
{
"epoch": 0.2,
"grad_norm": 27.65180015563965,
"learning_rate": 9.915136645426884e-06,
"loss": 3.5073,
"step": 14300
},
{
"epoch": 0.2,
"grad_norm": 21.25212860107422,
"learning_rate": 9.850647821359918e-06,
"loss": 3.5119,
"step": 14320
},
{
"epoch": 0.2,
"grad_norm": 25.691951751708984,
"learning_rate": 9.786317906828747e-06,
"loss": 3.6237,
"step": 14340
},
{
"epoch": 0.2,
"grad_norm": 32.24767303466797,
"learning_rate": 9.722147576623743e-06,
"loss": 3.5211,
"step": 14360
},
{
"epoch": 0.2,
"grad_norm": 20.08036231994629,
"learning_rate": 9.658137503861314e-06,
"loss": 3.4558,
"step": 14380
},
{
"epoch": 0.2,
"grad_norm": 22.380619049072266,
"learning_rate": 9.594288359976817e-06,
"loss": 3.4814,
"step": 14400
},
{
"epoch": 0.2,
"grad_norm": 28.68309211730957,
"learning_rate": 9.530600814717575e-06,
"loss": 3.5701,
"step": 14420
},
{
"epoch": 0.2,
"grad_norm": 22.603858947753906,
"learning_rate": 9.467075536135787e-06,
"loss": 3.5527,
"step": 14440
},
{
"epoch": 0.2,
"grad_norm": 20.50914192199707,
"learning_rate": 9.403713190581576e-06,
"loss": 3.4903,
"step": 14460
},
{
"epoch": 0.2,
"grad_norm": 30.767824172973633,
"learning_rate": 9.340514442695952e-06,
"loss": 3.5184,
"step": 14480
},
{
"epoch": 0.21,
"grad_norm": 23.36814308166504,
"learning_rate": 9.277479955403887e-06,
"loss": 3.4903,
"step": 14500
},
{
"epoch": 0.21,
"grad_norm": 34.33296203613281,
"learning_rate": 9.214610389907327e-06,
"loss": 3.5716,
"step": 14520
},
{
"epoch": 0.21,
"grad_norm": 21.11824607849121,
"learning_rate": 9.15190640567825e-06,
"loss": 3.6187,
"step": 14540
},
{
"epoch": 0.21,
"grad_norm": 38.17007064819336,
"learning_rate": 9.0893686604518e-06,
"loss": 3.5029,
"step": 14560
},
{
"epoch": 0.21,
"grad_norm": 22.7441349029541,
"learning_rate": 9.026997810219312e-06,
"loss": 3.552,
"step": 14580
},
{
"epoch": 0.21,
"grad_norm": 27.61566734313965,
"learning_rate": 8.964794509221508e-06,
"loss": 3.5794,
"step": 14600
},
{
"epoch": 0.21,
"grad_norm": 28.213449478149414,
"learning_rate": 8.902759409941566e-06,
"loss": 3.6239,
"step": 14620
},
{
"epoch": 0.21,
"grad_norm": 99.79093933105469,
"learning_rate": 8.840893163098331e-06,
"loss": 3.5571,
"step": 14640
},
{
"epoch": 0.21,
"grad_norm": 22.567502975463867,
"learning_rate": 8.779196417639466e-06,
"loss": 3.6038,
"step": 14660
},
{
"epoch": 0.21,
"grad_norm": 16.769285202026367,
"learning_rate": 8.71766982073462e-06,
"loss": 3.5192,
"step": 14680
},
{
"epoch": 0.21,
"grad_norm": 21.841182708740234,
"learning_rate": 8.656314017768693e-06,
"loss": 3.4728,
"step": 14700
},
{
"epoch": 0.21,
"grad_norm": 29.540746688842773,
"learning_rate": 8.595129652335019e-06,
"loss": 3.4656,
"step": 14720
},
{
"epoch": 0.21,
"grad_norm": 31.932985305786133,
"learning_rate": 8.534117366228644e-06,
"loss": 3.5597,
"step": 14740
},
{
"epoch": 0.21,
"grad_norm": 22.49396324157715,
"learning_rate": 8.47327779943957e-06,
"loss": 3.5653,
"step": 14760
},
{
"epoch": 0.21,
"grad_norm": 26.135406494140625,
"learning_rate": 8.412611590146069e-06,
"loss": 3.5669,
"step": 14780
},
{
"epoch": 0.21,
"grad_norm": 26.18607521057129,
"learning_rate": 8.352119374707978e-06,
"loss": 3.4971,
"step": 14800
},
{
"epoch": 0.21,
"grad_norm": 32.66505813598633,
"learning_rate": 8.29180178766e-06,
"loss": 3.6041,
"step": 14820
},
{
"epoch": 0.21,
"grad_norm": 22.13516616821289,
"learning_rate": 8.23165946170509e-06,
"loss": 3.4845,
"step": 14840
},
{
"epoch": 0.21,
"grad_norm": 19.47613525390625,
"learning_rate": 8.171693027707772e-06,
"loss": 3.582,
"step": 14860
},
{
"epoch": 0.21,
"grad_norm": 28.822246551513672,
"learning_rate": 8.111903114687591e-06,
"loss": 3.5498,
"step": 14880
},
{
"epoch": 0.21,
"grad_norm": 27.130552291870117,
"learning_rate": 8.052290349812419e-06,
"loss": 3.5724,
"step": 14900
},
{
"epoch": 0.21,
"grad_norm": 20.185958862304688,
"learning_rate": 7.992855358391967e-06,
"loss": 3.5115,
"step": 14920
},
{
"epoch": 0.21,
"grad_norm": 19.644073486328125,
"learning_rate": 7.933598763871155e-06,
"loss": 3.5116,
"step": 14940
},
{
"epoch": 0.21,
"grad_norm": 28.123699188232422,
"learning_rate": 7.87452118782363e-06,
"loss": 3.5057,
"step": 14960
},
{
"epoch": 0.21,
"grad_norm": 18.76272964477539,
"learning_rate": 7.815623249945214e-06,
"loss": 3.579,
"step": 14980
},
{
"epoch": 0.21,
"grad_norm": 24.90170669555664,
"learning_rate": 7.756905568047393e-06,
"loss": 3.4875,
"step": 15000
},
{
"epoch": 0.21,
"grad_norm": 28.07253074645996,
"learning_rate": 7.698368758050877e-06,
"loss": 3.4413,
"step": 15020
},
{
"epoch": 0.21,
"grad_norm": 25.81130027770996,
"learning_rate": 7.640013433979093e-06,
"loss": 3.5166,
"step": 15040
},
{
"epoch": 0.21,
"grad_norm": 19.91429901123047,
"learning_rate": 7.58184020795179e-06,
"loss": 3.6086,
"step": 15060
},
{
"epoch": 0.21,
"grad_norm": 22.920394897460938,
"learning_rate": 7.523849690178567e-06,
"loss": 3.4341,
"step": 15080
},
{
"epoch": 0.21,
"grad_norm": 32.82981872558594,
"learning_rate": 7.466042488952521e-06,
"loss": 3.5264,
"step": 15100
},
{
"epoch": 0.21,
"grad_norm": 23.146589279174805,
"learning_rate": 7.408419210643847e-06,
"loss": 3.4571,
"step": 15120
},
{
"epoch": 0.21,
"grad_norm": 16.78236198425293,
"learning_rate": 7.350980459693455e-06,
"loss": 3.5377,
"step": 15140
},
{
"epoch": 0.21,
"grad_norm": 19.41143035888672,
"learning_rate": 7.293726838606674e-06,
"loss": 3.5262,
"step": 15160
},
{
"epoch": 0.21,
"grad_norm": 35.83265686035156,
"learning_rate": 7.236658947946886e-06,
"loss": 3.5389,
"step": 15180
},
{
"epoch": 0.22,
"grad_norm": 18.50286102294922,
"learning_rate": 7.179777386329276e-06,
"loss": 3.4822,
"step": 15200
},
{
"epoch": 0.22,
"grad_norm": 22.09296226501465,
"learning_rate": 7.123082750414486e-06,
"loss": 3.6018,
"step": 15220
},
{
"epoch": 0.22,
"grad_norm": 20.790420532226562,
"learning_rate": 7.066575634902436e-06,
"loss": 3.5642,
"step": 15240
},
{
"epoch": 0.22,
"grad_norm": 29.84691047668457,
"learning_rate": 7.010256632526035e-06,
"loss": 3.6224,
"step": 15260
},
{
"epoch": 0.22,
"grad_norm": 45.16112518310547,
"learning_rate": 6.9541263340449496e-06,
"loss": 3.5078,
"step": 15280
},
{
"epoch": 0.22,
"grad_norm": 24.578792572021484,
"learning_rate": 6.898185328239468e-06,
"loss": 3.571,
"step": 15300
},
{
"epoch": 0.22,
"grad_norm": 28.100582122802734,
"learning_rate": 6.842434201904255e-06,
"loss": 3.4775,
"step": 15320
},
{
"epoch": 0.22,
"grad_norm": 31.31978988647461,
"learning_rate": 6.786873539842259e-06,
"loss": 3.586,
"step": 15340
},
{
"epoch": 0.22,
"grad_norm": 25.426467895507812,
"learning_rate": 6.731503924858518e-06,
"loss": 3.6732,
"step": 15360
},
{
"epoch": 0.22,
"grad_norm": 29.726364135742188,
"learning_rate": 6.676325937754102e-06,
"loss": 3.4458,
"step": 15380
},
{
"epoch": 0.22,
"grad_norm": 35.88907241821289,
"learning_rate": 6.621340157319997e-06,
"loss": 3.5081,
"step": 15400
},
{
"epoch": 0.22,
"grad_norm": 18.98563003540039,
"learning_rate": 6.566547160330999e-06,
"loss": 3.4117,
"step": 15420
},
{
"epoch": 0.22,
"grad_norm": 23.43938446044922,
"learning_rate": 6.511947521539738e-06,
"loss": 3.5529,
"step": 15440
},
{
"epoch": 0.22,
"grad_norm": 27.628211975097656,
"learning_rate": 6.457541813670564e-06,
"loss": 3.6043,
"step": 15460
},
{
"epoch": 0.22,
"grad_norm": 32.65241241455078,
"learning_rate": 6.403330607413643e-06,
"loss": 3.5273,
"step": 15480
},
{
"epoch": 0.22,
"grad_norm": 23.956153869628906,
"learning_rate": 6.349314471418849e-06,
"loss": 3.62,
"step": 15500
},
{
"epoch": 0.22,
"grad_norm": 26.069808959960938,
"learning_rate": 6.295493972289904e-06,
"loss": 3.5688,
"step": 15520
},
{
"epoch": 0.22,
"grad_norm": 39.316566467285156,
"learning_rate": 6.241869674578363e-06,
"loss": 3.5178,
"step": 15540
},
{
"epoch": 0.22,
"grad_norm": 27.55044174194336,
"learning_rate": 6.188442140777742e-06,
"loss": 3.4732,
"step": 15560
},
{
"epoch": 0.22,
"grad_norm": 20.663110733032227,
"learning_rate": 6.1352119313175945e-06,
"loss": 3.471,
"step": 15580
},
{
"epoch": 0.22,
"grad_norm": 24.35353660583496,
"learning_rate": 6.082179604557617e-06,
"loss": 3.503,
"step": 15600
},
{
"epoch": 0.22,
"grad_norm": 21.658618927001953,
"learning_rate": 6.029345716781837e-06,
"loss": 3.5414,
"step": 15620
},
{
"epoch": 0.22,
"grad_norm": 29.32859230041504,
"learning_rate": 5.9767108221927216e-06,
"loss": 3.4492,
"step": 15640
},
{
"epoch": 0.22,
"grad_norm": 29.497888565063477,
|
"learning_rate": 5.924275472905424e-06, |
|
"loss": 3.6211, |
|
"step": 15660 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 22.027616500854492, |
|
"learning_rate": 5.872040218941929e-06, |
|
"loss": 3.6381, |
|
"step": 15680 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 25.56600570678711, |
|
"learning_rate": 5.820005608225346e-06, |
|
"loss": 3.6468, |
|
"step": 15700 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 20.85995864868164, |
|
"learning_rate": 5.768172186574122e-06, |
|
"loss": 3.5111, |
|
"step": 15720 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 27.828828811645508, |
|
"learning_rate": 5.716540497696307e-06, |
|
"loss": 3.4975, |
|
"step": 15740 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 28.20557403564453, |
|
"learning_rate": 5.665111083183905e-06, |
|
"loss": 3.5542, |
|
"step": 15760 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 27.977331161499023, |
|
"learning_rate": 5.613884482507123e-06, |
|
"loss": 3.5096, |
|
"step": 15780 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 15.945253372192383, |
|
"learning_rate": 5.562861233008774e-06, |
|
"loss": 3.4329, |
|
"step": 15800 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 19.057100296020508, |
|
"learning_rate": 5.512041869898585e-06, |
|
"loss": 3.5043, |
|
"step": 15820 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 19.960477828979492, |
|
"learning_rate": 5.46142692624764e-06, |
|
"loss": 3.4124, |
|
"step": 15840 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 20.818775177001953, |
|
"learning_rate": 5.411016932982752e-06, |
|
"loss": 3.4409, |
|
"step": 15860 |
|
}, |
|
{ |
|
"epoch": 0.22, |
|
"grad_norm": 19.86461639404297, |
|
"learning_rate": 5.360812418880884e-06, |
|
"loss": 3.6115, |
|
"step": 15880 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 26.129798889160156, |
|
"learning_rate": 5.310813910563644e-06, |
|
"loss": 3.5875, |
|
"step": 15900 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 17.876537322998047, |
|
"learning_rate": 5.261021932491714e-06, |
|
"loss": 3.5214, |
|
"step": 15920 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 37.67085266113281, |
|
"learning_rate": 5.2114370069593965e-06, |
|
"loss": 3.6228, |
|
"step": 15940 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 21.973346710205078, |
|
"learning_rate": 5.162059654089083e-06, |
|
"loss": 3.457, |
|
"step": 15960 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 30.133270263671875, |
|
"learning_rate": 5.112890391825845e-06, |
|
"loss": 3.4729, |
|
"step": 15980 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.22100830078125, |
|
"learning_rate": 5.063929735931985e-06, |
|
"loss": 3.5727, |
|
"step": 16000 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 25.7775821685791, |
|
"learning_rate": 5.015178199981602e-06, |
|
"loss": 3.5195, |
|
"step": 16020 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 21.86203384399414, |
|
"learning_rate": 4.966636295355253e-06, |
|
"loss": 3.5248, |
|
"step": 16040 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 31.954734802246094, |
|
"learning_rate": 4.918304531234533e-06, |
|
"loss": 3.5392, |
|
"step": 16060 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 19.251066207885742, |
|
"learning_rate": 4.870183414596794e-06, |
|
"loss": 3.5204, |
|
"step": 16080 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 30.7813777923584, |
|
"learning_rate": 4.8222734502097665e-06, |
|
"loss": 3.5081, |
|
"step": 16100 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.208667755126953, |
|
"learning_rate": 4.7745751406263165e-06, |
|
"loss": 3.5487, |
|
"step": 16120 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.639707565307617, |
|
"learning_rate": 4.727088986179129e-06, |
|
"loss": 3.5998, |
|
"step": 16140 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 23.774940490722656, |
|
"learning_rate": 4.679815484975505e-06, |
|
"loss": 3.4195, |
|
"step": 16160 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 20.040874481201172, |
|
"learning_rate": 4.6327551328920945e-06, |
|
"loss": 3.5555, |
|
"step": 16180 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 32.899959564208984, |
|
"learning_rate": 4.585908423569724e-06, |
|
"loss": 3.5204, |
|
"step": 16200 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 29.705387115478516, |
|
"learning_rate": 4.539275848408217e-06, |
|
"loss": 3.5667, |
|
"step": 16220 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 19.238882064819336, |
|
"learning_rate": 4.492857896561204e-06, |
|
"loss": 3.4192, |
|
"step": 16240 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 18.87932777404785, |
|
"learning_rate": 4.446655054931051e-06, |
|
"loss": 3.4987, |
|
"step": 16260 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 25.21925163269043, |
|
"learning_rate": 4.4006678081636884e-06, |
|
"loss": 3.6039, |
|
"step": 16280 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 42.43282699584961, |
|
"learning_rate": 4.35489663864359e-06, |
|
"loss": 3.5736, |
|
"step": 16300 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 34.624874114990234, |
|
"learning_rate": 4.309342026488653e-06, |
|
"loss": 3.4077, |
|
"step": 16320 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 28.791912078857422, |
|
"learning_rate": 4.264004449545206e-06, |
|
"loss": 3.511, |
|
"step": 16340 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 21.95167350769043, |
|
"learning_rate": 4.218884383382987e-06, |
|
"loss": 3.4688, |
|
"step": 16360 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 29.78521156311035, |
|
"learning_rate": 4.173982301290122e-06, |
|
"loss": 3.4808, |
|
"step": 16380 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.426288604736328, |
|
"learning_rate": 4.129298674268225e-06, |
|
"loss": 3.5356, |
|
"step": 16400 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 23.966203689575195, |
|
"learning_rate": 4.084833971027379e-06, |
|
"loss": 3.5471, |
|
"step": 16420 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 26.148141860961914, |
|
"learning_rate": 4.040588657981301e-06, |
|
"loss": 3.4811, |
|
"step": 16440 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 29.300769805908203, |
|
"learning_rate": 3.99656319924237e-06, |
|
"loss": 3.5584, |
|
"step": 16460 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 27.95845603942871, |
|
"learning_rate": 3.952758056616826e-06, |
|
"loss": 3.5451, |
|
"step": 16480 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 21.04642677307129, |
|
"learning_rate": 3.90917368959989e-06, |
|
"loss": 3.5002, |
|
"step": 16500 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 38.936370849609375, |
|
"learning_rate": 3.865810555370936e-06, |
|
"loss": 3.4524, |
|
"step": 16520 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 24.49747085571289, |
|
"learning_rate": 3.822669108788738e-06, |
|
"loss": 3.4887, |
|
"step": 16540 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 36.478424072265625, |
|
"learning_rate": 3.7797498023866396e-06, |
|
"loss": 3.5807, |
|
"step": 16560 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 19.034469604492188, |
|
"learning_rate": 3.737053086367873e-06, |
|
"loss": 3.5806, |
|
"step": 16580 |
|
}, |
|
{ |
|
"epoch": 0.23, |
|
"grad_norm": 26.125469207763672, |
|
"learning_rate": 3.694579408600771e-06, |
|
"loss": 3.4561, |
|
"step": 16600 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 30.372791290283203, |
|
"learning_rate": 3.6523292146141227e-06, |
|
"loss": 3.5875, |
|
"step": 16620 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 25.58843421936035, |
|
"learning_rate": 3.6103029475924726e-06, |
|
"loss": 3.498, |
|
"step": 16640 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 27.708513259887695, |
|
"learning_rate": 3.56850104837147e-06, |
|
"loss": 3.5339, |
|
"step": 16660 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 28.574857711791992, |
|
"learning_rate": 3.5269239554332563e-06, |
|
"loss": 3.5488, |
|
"step": 16680 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 37.658775329589844, |
|
"learning_rate": 3.4855721049018688e-06, |
|
"loss": 3.5008, |
|
"step": 16700 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 31.66521644592285, |
|
"learning_rate": 3.4444459305386507e-06, |
|
"loss": 3.4864, |
|
"step": 16720 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 18.813400268554688, |
|
"learning_rate": 3.403545863737706e-06, |
|
"loss": 3.5685, |
|
"step": 16740 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 24.066808700561523, |
|
"learning_rate": 3.3628723335213885e-06, |
|
"loss": 3.5549, |
|
"step": 16760 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 23.029767990112305, |
|
"learning_rate": 3.322425766535778e-06, |
|
"loss": 3.4389, |
|
"step": 16780 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 25.6854190826416, |
|
"learning_rate": 3.2822065870462217e-06, |
|
"loss": 3.4405, |
|
"step": 16800 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 26.14007568359375, |
|
"learning_rate": 3.2422152169328922e-06, |
|
"loss": 3.5291, |
|
"step": 16820 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 22.264413833618164, |
|
"learning_rate": 3.2024520756863243e-06, |
|
"loss": 3.613, |
|
"step": 16840 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 32.82612991333008, |
|
"learning_rate": 3.1629175804030658e-06, |
|
"loss": 3.4603, |
|
"step": 16860 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 32.21546173095703, |
|
"learning_rate": 3.1236121457812544e-06, |
|
"loss": 3.5886, |
|
"step": 16880 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 18.181438446044922, |
|
"learning_rate": 3.08453618411631e-06, |
|
"loss": 3.4568, |
|
"step": 16900 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 23.75358772277832, |
|
"learning_rate": 3.0456901052965724e-06, |
|
"loss": 3.5491, |
|
"step": 16920 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 17.43839454650879, |
|
"learning_rate": 3.0070743167990273e-06, |
|
"loss": 3.5776, |
|
"step": 16940 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 20.617218017578125, |
|
"learning_rate": 2.9686892236850337e-06, |
|
"loss": 3.539, |
|
"step": 16960 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 19.83597755432129, |
|
"learning_rate": 2.93053522859604e-06, |
|
"loss": 3.3575, |
|
"step": 16980 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 18.4063720703125, |
|
"learning_rate": 2.892612731749414e-06, |
|
"loss": 3.3658, |
|
"step": 17000 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 23.77143096923828, |
|
"learning_rate": 2.85492213093419e-06, |
|
"loss": 3.4393, |
|
"step": 17020 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 22.04786491394043, |
|
"learning_rate": 2.8174638215069493e-06, |
|
"loss": 3.5262, |
|
"step": 17040 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 24.277013778686523, |
|
"learning_rate": 2.780238196387619e-06, |
|
"loss": 3.4419, |
|
"step": 17060 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 30.128318786621094, |
|
"learning_rate": 2.743245646055398e-06, |
|
"loss": 3.5387, |
|
"step": 17080 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 26.737049102783203, |
|
"learning_rate": 2.7064865585446434e-06, |
|
"loss": 3.4134, |
|
"step": 17100 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 26.093942642211914, |
|
"learning_rate": 2.6699613194407725e-06, |
|
"loss": 3.5691, |
|
"step": 17120 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 30.150657653808594, |
|
"learning_rate": 2.6336703118762766e-06, |
|
"loss": 3.4658, |
|
"step": 17140 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 21.17641258239746, |
|
"learning_rate": 2.597613916526637e-06, |
|
"loss": 3.4942, |
|
"step": 17160 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 28.02484130859375, |
|
"learning_rate": 2.5617925116063924e-06, |
|
"loss": 3.4448, |
|
"step": 17180 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 24.14384651184082, |
|
"learning_rate": 2.52620647286512e-06, |
|
"loss": 3.5448, |
|
"step": 17200 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 25.69285774230957, |
|
"learning_rate": 2.4908561735835306e-06, |
|
"loss": 3.5668, |
|
"step": 17220 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 19.125316619873047, |
|
"learning_rate": 2.4557419845695427e-06, |
|
"loss": 3.5204, |
|
"step": 17240 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 23.64023208618164, |
|
"learning_rate": 2.420864274154372e-06, |
|
"loss": 3.4345, |
|
"step": 17260 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 24.39943504333496, |
|
"learning_rate": 2.3862234081887036e-06, |
|
"loss": 3.5515, |
|
"step": 17280 |
|
}, |
|
{ |
|
"epoch": 0.24, |
|
"grad_norm": 24.969629287719727, |
|
"learning_rate": 2.351819750038828e-06, |
|
"loss": 3.4973, |
|
"step": 17300 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 18.182634353637695, |
|
"learning_rate": 2.317653660582844e-06, |
|
"loss": 3.6065, |
|
"step": 17320 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 27.141944885253906, |
|
"learning_rate": 2.2837254982068567e-06, |
|
"loss": 3.5106, |
|
"step": 17340 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.061452865600586, |
|
"learning_rate": 2.250035618801241e-06, |
|
"loss": 3.4274, |
|
"step": 17360 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 39.53497314453125, |
|
"learning_rate": 2.2165843757568805e-06, |
|
"loss": 3.4597, |
|
"step": 17380 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 14.67487907409668, |
|
"learning_rate": 2.183372119961499e-06, |
|
"loss": 3.5732, |
|
"step": 17400 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 14.984709739685059, |
|
"learning_rate": 2.15039919979593e-06, |
|
"loss": 3.4735, |
|
"step": 17420 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 30.988039016723633, |
|
"learning_rate": 2.117665961130513e-06, |
|
"loss": 3.4269, |
|
"step": 17440 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 23.8664493560791, |
|
"learning_rate": 2.0851727473214315e-06, |
|
"loss": 3.4997, |
|
"step": 17460 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 24.271230697631836, |
|
"learning_rate": 2.05291989920712e-06, |
|
"loss": 3.5919, |
|
"step": 17480 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 32.17240524291992, |
|
"learning_rate": 2.020907755104698e-06, |
|
"loss": 3.4734, |
|
"step": 17500 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 24.72242546081543, |
|
"learning_rate": 1.9891366508064003e-06, |
|
"loss": 3.5043, |
|
"step": 17520 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 29.70708656311035, |
|
"learning_rate": 1.957606919576088e-06, |
|
"loss": 3.4543, |
|
"step": 17540 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 29.549745559692383, |
|
"learning_rate": 1.926318892145712e-06, |
|
"loss": 3.4355, |
|
"step": 17560 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 25.912363052368164, |
|
"learning_rate": 1.8952728967118804e-06, |
|
"loss": 3.4614, |
|
"step": 17580 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 22.835115432739258, |
|
"learning_rate": 1.864469258932397e-06, |
|
"loss": 3.5498, |
|
"step": 17600 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.103981018066406, |
|
"learning_rate": 1.8339083019228404e-06, |
|
"loss": 3.5791, |
|
"step": 17620 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 24.153532028198242, |
|
"learning_rate": 1.803590346253195e-06, |
|
"loss": 3.495, |
|
"step": 17640 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 19.70048713684082, |
|
"learning_rate": 1.7735157099444593e-06, |
|
"loss": 3.5439, |
|
"step": 17660 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 23.056358337402344, |
|
"learning_rate": 1.7436847084653456e-06, |
|
"loss": 3.4222, |
|
"step": 17680 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 25.633689880371094, |
|
"learning_rate": 1.7140976547289438e-06, |
|
"loss": 3.5387, |
|
"step": 17700 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 30.34889030456543, |
|
"learning_rate": 1.6847548590894435e-06, |
|
"loss": 3.5579, |
|
"step": 17720 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 19.06514549255371, |
|
"learning_rate": 1.6556566293388892e-06, |
|
"loss": 3.4082, |
|
"step": 17740 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 16.91566276550293, |
|
"learning_rate": 1.626803270703936e-06, |
|
"loss": 3.5513, |
|
"step": 17760 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 26.17884635925293, |
|
"learning_rate": 1.5981950858426714e-06, |
|
"loss": 3.5068, |
|
"step": 17780 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 35.77287292480469, |
|
"learning_rate": 1.5698323748414124e-06, |
|
"loss": 3.4825, |
|
"step": 17800 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 25.684925079345703, |
|
"learning_rate": 1.5417154352115742e-06, |
|
"loss": 3.5529, |
|
"step": 17820 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 23.964488983154297, |
|
"learning_rate": 1.5138445618865544e-06, |
|
"loss": 3.549, |
|
"step": 17840 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.69983673095703, |
|
"learning_rate": 1.4862200472186199e-06, |
|
"loss": 3.5607, |
|
"step": 17860 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 24.382530212402344, |
|
"learning_rate": 1.458842180975864e-06, |
|
"loss": 3.4468, |
|
"step": 17880 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.305166244506836, |
|
"learning_rate": 1.4317112503391432e-06, |
|
"loss": 3.5468, |
|
"step": 17900 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.76270294189453, |
|
"learning_rate": 1.4048275398990896e-06, |
|
"loss": 3.5828, |
|
"step": 17920 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 31.468564987182617, |
|
"learning_rate": 1.3781913316530948e-06, |
|
"loss": 3.6117, |
|
"step": 17940 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 22.185617446899414, |
|
"learning_rate": 1.351802905002386e-06, |
|
"loss": 3.4663, |
|
"step": 17960 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 27.454687118530273, |
|
"learning_rate": 1.32566253674907e-06, |
|
"loss": 3.4419, |
|
"step": 17980 |
|
}, |
|
{ |
|
"epoch": 0.25, |
|
"grad_norm": 20.76512336730957, |
|
"learning_rate": 1.2997705010932393e-06, |
|
"loss": 3.5315, |
|
"step": 18000 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 27.795419692993164, |
|
"learning_rate": 1.274127069630096e-06, |
|
"loss": 3.5435, |
|
"step": 18020 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 45.871864318847656, |
|
"learning_rate": 1.2487325113471032e-06, |
|
"loss": 3.3871, |
|
"step": 18040 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 15.510208129882812, |
|
"learning_rate": 1.2235870926211619e-06, |
|
"loss": 3.5862, |
|
"step": 18060 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 26.943269729614258, |
|
"learning_rate": 1.1986910772158104e-06, |
|
"loss": 3.5032, |
|
"step": 18080 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 28.423053741455078, |
|
"learning_rate": 1.1740447262784781e-06, |
|
"loss": 3.4936, |
|
"step": 18100 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 25.210853576660156, |
|
"learning_rate": 1.1496482983377189e-06, |
|
"loss": 3.4515, |
|
"step": 18120 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 32.88740921020508, |
|
"learning_rate": 1.125502049300517e-06, |
|
"loss": 3.5196, |
|
"step": 18140 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 20.562488555908203, |
|
"learning_rate": 1.1016062324496008e-06, |
|
"loss": 3.4467, |
|
"step": 18160 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 21.112634658813477, |
|
"learning_rate": 1.0779610984407773e-06, |
|
"loss": 3.5286, |
|
"step": 18180 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 29.323238372802734, |
|
"learning_rate": 1.0545668953003241e-06, |
|
"loss": 3.4971, |
|
"step": 18200 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 24.024930953979492, |
|
"learning_rate": 1.0314238684223515e-06, |
|
"loss": 3.5919, |
|
"step": 18220 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 29.396581649780273, |
|
"learning_rate": 1.0085322605662666e-06, |
|
"loss": 3.4255, |
|
"step": 18240 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 19.502662658691406, |
|
"learning_rate": 9.858923118542002e-07, |
|
"loss": 3.464, |
|
"step": 18260 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 20.03078269958496, |
|
"learning_rate": 9.635042597685023e-07, |
|
"loss": 3.4305, |
|
"step": 18280 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 18.905967712402344, |
|
"learning_rate": 9.413683391492456e-07, |
|
"loss": 3.6401, |
|
"step": 18300 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 23.61101531982422, |
|
"learning_rate": 9.194847821917623e-07, |
|
"loss": 3.5543, |
|
"step": 18320 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 18.563806533813477, |
|
"learning_rate": 8.978538184442137e-07, |
|
"loss": 3.4395, |
|
"step": 18340 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 21.695003509521484, |
|
"learning_rate": 8.764756748051662e-07, |
|
"loss": 3.4193, |
|
"step": 18360 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 21.57720947265625, |
|
"learning_rate": 8.553505755212382e-07, |
|
"loss": 3.5357, |
|
"step": 18380 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 30.37428855895996, |
|
"learning_rate": 8.344787421847217e-07, |
|
"loss": 3.5414, |
|
"step": 18400 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 42.314064025878906, |
|
"learning_rate": 8.138603937312722e-07, |
|
"loss": 3.5528, |
|
"step": 18420 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 22.21116065979004, |
|
"learning_rate": 7.934957464376058e-07, |
|
"loss": 3.6419, |
|
"step": 18440 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 26.877450942993164, |
|
"learning_rate": 7.733850139192395e-07, |
|
"loss": 3.5869, |
|
"step": 18460 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 21.281030654907227, |
|
"learning_rate": 7.535284071282455e-07, |
|
"loss": 3.6047, |
|
"step": 18480 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 20.147789001464844, |
|
"learning_rate": 7.339261343510206e-07, |
|
"loss": 3.4247, |
|
"step": 18500 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 19.394601821899414, |
|
"learning_rate": 7.145784012061424e-07, |
|
"loss": 3.5844, |
|
"step": 18520 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 22.156579971313477, |
|
"learning_rate": 6.954854106421715e-07, |
|
"loss": 3.5348, |
|
"step": 18540 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 28.641721725463867, |
|
"learning_rate": 6.766473629355452e-07, |
|
"loss": 3.5451, |
|
"step": 18560 |
|
}, |
|
{ |
|
"epoch": 0.26, |
|
"grad_norm": 19.47591209411621, |
|
"learning_rate": 6.580644556884702e-07, |
|
"loss": 3.5458, |
|
"step": 18580 |
|
} |
  ],
  "logging_steps": 20,
  "max_steps": 20000,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 1,
  "save_steps": 20,
  "total_flos": 4.037882943504384e+16,
  "train_batch_size": 8,
  "trial_name": null,
  "trial_params": null
}