|
{ |
|
"best_metric": null, |
|
"best_model_checkpoint": null, |
|
"epoch": 3.0, |
|
"eval_steps": 500, |
|
"global_step": 465, |
|
"is_hyper_param_search": false, |
|
"is_local_process_zero": true, |
|
"is_world_process_zero": true, |
|
"log_history": [ |
|
{ |
|
"epoch": 0.0064516129032258064, |
|
"grad_norm": 0.15169227525554954, |
|
"learning_rate": 4.255319148936171e-06, |
|
"loss": 0.1803, |
|
"step": 1 |
|
}, |
|
{ |
|
"epoch": 0.012903225806451613, |
|
"grad_norm": 0.1108587617034759, |
|
"learning_rate": 8.510638297872341e-06, |
|
"loss": 0.1441, |
|
"step": 2 |
|
}, |
|
{ |
|
"epoch": 0.01935483870967742, |
|
"grad_norm": 0.08345105115839851, |
|
"learning_rate": 1.2765957446808511e-05, |
|
"loss": 0.1118, |
|
"step": 3 |
|
}, |
|
{ |
|
"epoch": 0.025806451612903226, |
|
"grad_norm": 0.1426234633379846, |
|
"learning_rate": 1.7021276595744682e-05, |
|
"loss": 0.1313, |
|
"step": 4 |
|
}, |
|
{ |
|
"epoch": 0.03225806451612903, |
|
"grad_norm": 0.14217530777409554, |
|
"learning_rate": 2.1276595744680852e-05, |
|
"loss": 0.1195, |
|
"step": 5 |
|
}, |
|
{ |
|
"epoch": 0.03870967741935484, |
|
"grad_norm": 0.1459141077043439, |
|
"learning_rate": 2.5531914893617022e-05, |
|
"loss": 0.1453, |
|
"step": 6 |
|
}, |
|
{ |
|
"epoch": 0.04516129032258064, |
|
"grad_norm": 0.10195797727709278, |
|
"learning_rate": 2.9787234042553192e-05, |
|
"loss": 0.1119, |
|
"step": 7 |
|
}, |
|
{ |
|
"epoch": 0.05161290322580645, |
|
"grad_norm": 0.13553076939195616, |
|
"learning_rate": 3.4042553191489365e-05, |
|
"loss": 0.1791, |
|
"step": 8 |
|
}, |
|
{ |
|
"epoch": 0.05806451612903226, |
|
"grad_norm": 0.15732711391586654, |
|
"learning_rate": 3.829787234042553e-05, |
|
"loss": 0.1282, |
|
"step": 9 |
|
}, |
|
{ |
|
"epoch": 0.06451612903225806, |
|
"grad_norm": 0.1192108991071351, |
|
"learning_rate": 4.2553191489361704e-05, |
|
"loss": 0.1288, |
|
"step": 10 |
|
}, |
|
{ |
|
"epoch": 0.07096774193548387, |
|
"grad_norm": 0.1534053229135478, |
|
"learning_rate": 4.680851063829788e-05, |
|
"loss": 0.15, |
|
"step": 11 |
|
}, |
|
{ |
|
"epoch": 0.07741935483870968, |
|
"grad_norm": 0.11808236646609899, |
|
"learning_rate": 5.1063829787234044e-05, |
|
"loss": 0.1098, |
|
"step": 12 |
|
}, |
|
{ |
|
"epoch": 0.08387096774193549, |
|
"grad_norm": 0.1127350100020763, |
|
"learning_rate": 5.531914893617022e-05, |
|
"loss": 0.1214, |
|
"step": 13 |
|
}, |
|
{ |
|
"epoch": 0.09032258064516129, |
|
"grad_norm": 0.13802666747672132, |
|
"learning_rate": 5.9574468085106384e-05, |
|
"loss": 0.1555, |
|
"step": 14 |
|
}, |
|
{ |
|
"epoch": 0.0967741935483871, |
|
"grad_norm": 0.19187355993638458, |
|
"learning_rate": 6.382978723404256e-05, |
|
"loss": 0.1474, |
|
"step": 15 |
|
}, |
|
{ |
|
"epoch": 0.1032258064516129, |
|
"grad_norm": 0.16430813087649648, |
|
"learning_rate": 6.808510638297873e-05, |
|
"loss": 0.1461, |
|
"step": 16 |
|
}, |
|
{ |
|
"epoch": 0.10967741935483871, |
|
"grad_norm": 0.13789502094335715, |
|
"learning_rate": 7.23404255319149e-05, |
|
"loss": 0.1226, |
|
"step": 17 |
|
}, |
|
{ |
|
"epoch": 0.11612903225806452, |
|
"grad_norm": 0.173980091579583, |
|
"learning_rate": 7.659574468085106e-05, |
|
"loss": 0.1558, |
|
"step": 18 |
|
}, |
|
{ |
|
"epoch": 0.12258064516129032, |
|
"grad_norm": 0.1418687450951512, |
|
"learning_rate": 8.085106382978723e-05, |
|
"loss": 0.1379, |
|
"step": 19 |
|
}, |
|
{ |
|
"epoch": 0.12903225806451613, |
|
"grad_norm": 0.1765388069700401, |
|
"learning_rate": 8.510638297872341e-05, |
|
"loss": 0.149, |
|
"step": 20 |
|
}, |
|
{ |
|
"epoch": 0.13548387096774195, |
|
"grad_norm": 0.17659794510341198, |
|
"learning_rate": 8.936170212765958e-05, |
|
"loss": 0.1101, |
|
"step": 21 |
|
}, |
|
{ |
|
"epoch": 0.14193548387096774, |
|
"grad_norm": 0.22633201272355616, |
|
"learning_rate": 9.361702127659576e-05, |
|
"loss": 0.1656, |
|
"step": 22 |
|
}, |
|
{ |
|
"epoch": 0.14838709677419354, |
|
"grad_norm": 0.16004849032165383, |
|
"learning_rate": 9.787234042553192e-05, |
|
"loss": 0.1616, |
|
"step": 23 |
|
}, |
|
{ |
|
"epoch": 0.15483870967741936, |
|
"grad_norm": 0.16981049690586422, |
|
"learning_rate": 0.00010212765957446809, |
|
"loss": 0.1321, |
|
"step": 24 |
|
}, |
|
{ |
|
"epoch": 0.16129032258064516, |
|
"grad_norm": 0.13581353274905067, |
|
"learning_rate": 0.00010638297872340425, |
|
"loss": 0.1122, |
|
"step": 25 |
|
}, |
|
{ |
|
"epoch": 0.16774193548387098, |
|
"grad_norm": 0.2089185781750498, |
|
"learning_rate": 0.00011063829787234043, |
|
"loss": 0.1527, |
|
"step": 26 |
|
}, |
|
{ |
|
"epoch": 0.17419354838709677, |
|
"grad_norm": 0.16125655225492164, |
|
"learning_rate": 0.00011489361702127661, |
|
"loss": 0.1263, |
|
"step": 27 |
|
}, |
|
{ |
|
"epoch": 0.18064516129032257, |
|
"grad_norm": 0.13753328983503424, |
|
"learning_rate": 0.00011914893617021277, |
|
"loss": 0.1224, |
|
"step": 28 |
|
}, |
|
{ |
|
"epoch": 0.1870967741935484, |
|
"grad_norm": 0.19089590529877826, |
|
"learning_rate": 0.00012340425531914893, |
|
"loss": 0.168, |
|
"step": 29 |
|
}, |
|
{ |
|
"epoch": 0.1935483870967742, |
|
"grad_norm": 0.15419681175807715, |
|
"learning_rate": 0.00012765957446808513, |
|
"loss": 0.138, |
|
"step": 30 |
|
}, |
|
{ |
|
"epoch": 0.2, |
|
"grad_norm": 0.18178665024271073, |
|
"learning_rate": 0.00013191489361702127, |
|
"loss": 0.1955, |
|
"step": 31 |
|
}, |
|
{ |
|
"epoch": 0.2064516129032258, |
|
"grad_norm": 0.11760523051627117, |
|
"learning_rate": 0.00013617021276595746, |
|
"loss": 0.1367, |
|
"step": 32 |
|
}, |
|
{ |
|
"epoch": 0.2129032258064516, |
|
"grad_norm": 0.10853469531105876, |
|
"learning_rate": 0.00014042553191489363, |
|
"loss": 0.1208, |
|
"step": 33 |
|
}, |
|
{ |
|
"epoch": 0.21935483870967742, |
|
"grad_norm": 0.12890024443339382, |
|
"learning_rate": 0.0001446808510638298, |
|
"loss": 0.142, |
|
"step": 34 |
|
}, |
|
{ |
|
"epoch": 0.22580645161290322, |
|
"grad_norm": 0.12692879603118554, |
|
"learning_rate": 0.00014893617021276596, |
|
"loss": 0.1268, |
|
"step": 35 |
|
}, |
|
{ |
|
"epoch": 0.23225806451612904, |
|
"grad_norm": 0.1529612097417899, |
|
"learning_rate": 0.00015319148936170213, |
|
"loss": 0.1194, |
|
"step": 36 |
|
}, |
|
{ |
|
"epoch": 0.23870967741935484, |
|
"grad_norm": 0.129432114467059, |
|
"learning_rate": 0.00015744680851063832, |
|
"loss": 0.128, |
|
"step": 37 |
|
}, |
|
{ |
|
"epoch": 0.24516129032258063, |
|
"grad_norm": 0.11343138573627701, |
|
"learning_rate": 0.00016170212765957446, |
|
"loss": 0.102, |
|
"step": 38 |
|
}, |
|
{ |
|
"epoch": 0.25161290322580643, |
|
"grad_norm": 0.16577469112721435, |
|
"learning_rate": 0.00016595744680851065, |
|
"loss": 0.1691, |
|
"step": 39 |
|
}, |
|
{ |
|
"epoch": 0.25806451612903225, |
|
"grad_norm": 0.13614466586953358, |
|
"learning_rate": 0.00017021276595744682, |
|
"loss": 0.1348, |
|
"step": 40 |
|
}, |
|
{ |
|
"epoch": 0.2645161290322581, |
|
"grad_norm": 0.1524812917659128, |
|
"learning_rate": 0.00017446808510638298, |
|
"loss": 0.1281, |
|
"step": 41 |
|
}, |
|
{ |
|
"epoch": 0.2709677419354839, |
|
"grad_norm": 0.14519269708506746, |
|
"learning_rate": 0.00017872340425531915, |
|
"loss": 0.1349, |
|
"step": 42 |
|
}, |
|
{ |
|
"epoch": 0.27741935483870966, |
|
"grad_norm": 0.18300481897670345, |
|
"learning_rate": 0.00018297872340425532, |
|
"loss": 0.1694, |
|
"step": 43 |
|
}, |
|
{ |
|
"epoch": 0.2838709677419355, |
|
"grad_norm": 0.11929331561330575, |
|
"learning_rate": 0.0001872340425531915, |
|
"loss": 0.1331, |
|
"step": 44 |
|
}, |
|
{ |
|
"epoch": 0.2903225806451613, |
|
"grad_norm": 0.12604932986068976, |
|
"learning_rate": 0.00019148936170212768, |
|
"loss": 0.0949, |
|
"step": 45 |
|
}, |
|
{ |
|
"epoch": 0.2967741935483871, |
|
"grad_norm": 0.14226790630565908, |
|
"learning_rate": 0.00019574468085106384, |
|
"loss": 0.1477, |
|
"step": 46 |
|
}, |
|
{ |
|
"epoch": 0.3032258064516129, |
|
"grad_norm": 0.1465980952326029, |
|
"learning_rate": 0.0002, |
|
"loss": 0.1408, |
|
"step": 47 |
|
}, |
|
{ |
|
"epoch": 0.3096774193548387, |
|
"grad_norm": 0.14535685898764863, |
|
"learning_rate": 0.0001999971756719333, |
|
"loss": 0.1474, |
|
"step": 48 |
|
}, |
|
{ |
|
"epoch": 0.3161290322580645, |
|
"grad_norm": 0.1559112597851861, |
|
"learning_rate": 0.00019998870284726968, |
|
"loss": 0.1568, |
|
"step": 49 |
|
}, |
|
{ |
|
"epoch": 0.3225806451612903, |
|
"grad_norm": 0.14392375991483491, |
|
"learning_rate": 0.00019997458200460993, |
|
"loss": 0.1526, |
|
"step": 50 |
|
}, |
|
{ |
|
"epoch": 0.32903225806451614, |
|
"grad_norm": 0.18920444771524633, |
|
"learning_rate": 0.00019995481394159188, |
|
"loss": 0.1384, |
|
"step": 51 |
|
}, |
|
{ |
|
"epoch": 0.33548387096774196, |
|
"grad_norm": 0.17943388616343298, |
|
"learning_rate": 0.0001999293997748454, |
|
"loss": 0.1555, |
|
"step": 52 |
|
}, |
|
{ |
|
"epoch": 0.3419354838709677, |
|
"grad_norm": 0.16752646291327727, |
|
"learning_rate": 0.00019989834093992945, |
|
"loss": 0.1634, |
|
"step": 53 |
|
}, |
|
{ |
|
"epoch": 0.34838709677419355, |
|
"grad_norm": 0.1484740777328073, |
|
"learning_rate": 0.00019986163919125075, |
|
"loss": 0.124, |
|
"step": 54 |
|
}, |
|
{ |
|
"epoch": 0.3548387096774194, |
|
"grad_norm": 0.15268630472434244, |
|
"learning_rate": 0.00019981929660196492, |
|
"loss": 0.1228, |
|
"step": 55 |
|
}, |
|
{ |
|
"epoch": 0.36129032258064514, |
|
"grad_norm": 0.19583236181584973, |
|
"learning_rate": 0.0001997713155638592, |
|
"loss": 0.1227, |
|
"step": 56 |
|
}, |
|
{ |
|
"epoch": 0.36774193548387096, |
|
"grad_norm": 0.1469999186659826, |
|
"learning_rate": 0.00019971769878721743, |
|
"loss": 0.1188, |
|
"step": 57 |
|
}, |
|
{ |
|
"epoch": 0.3741935483870968, |
|
"grad_norm": 0.10845193424043034, |
|
"learning_rate": 0.000199658449300667, |
|
"loss": 0.1177, |
|
"step": 58 |
|
}, |
|
{ |
|
"epoch": 0.38064516129032255, |
|
"grad_norm": 0.1845188187089657, |
|
"learning_rate": 0.00019959357045100764, |
|
"loss": 0.1726, |
|
"step": 59 |
|
}, |
|
{ |
|
"epoch": 0.3870967741935484, |
|
"grad_norm": 0.14769124665651473, |
|
"learning_rate": 0.00019952306590302247, |
|
"loss": 0.1634, |
|
"step": 60 |
|
}, |
|
{ |
|
"epoch": 0.3935483870967742, |
|
"grad_norm": 0.16408468392163889, |
|
"learning_rate": 0.00019944693963927092, |
|
"loss": 0.1584, |
|
"step": 61 |
|
}, |
|
{ |
|
"epoch": 0.4, |
|
"grad_norm": 0.15156206723802879, |
|
"learning_rate": 0.00019936519595986394, |
|
"loss": 0.1454, |
|
"step": 62 |
|
}, |
|
{ |
|
"epoch": 0.4064516129032258, |
|
"grad_norm": 0.15835599927161748, |
|
"learning_rate": 0.00019927783948222084, |
|
"loss": 0.1621, |
|
"step": 63 |
|
}, |
|
{ |
|
"epoch": 0.4129032258064516, |
|
"grad_norm": 0.1333411665662589, |
|
"learning_rate": 0.00019918487514080865, |
|
"loss": 0.1293, |
|
"step": 64 |
|
}, |
|
{ |
|
"epoch": 0.41935483870967744, |
|
"grad_norm": 0.1589874051481937, |
|
"learning_rate": 0.00019908630818686338, |
|
"loss": 0.1391, |
|
"step": 65 |
|
}, |
|
{ |
|
"epoch": 0.4258064516129032, |
|
"grad_norm": 0.12399538770065353, |
|
"learning_rate": 0.0001989821441880933, |
|
"loss": 0.1208, |
|
"step": 66 |
|
}, |
|
{ |
|
"epoch": 0.432258064516129, |
|
"grad_norm": 0.16949673743147955, |
|
"learning_rate": 0.00019887238902836448, |
|
"loss": 0.1139, |
|
"step": 67 |
|
}, |
|
{ |
|
"epoch": 0.43870967741935485, |
|
"grad_norm": 0.1875430265393267, |
|
"learning_rate": 0.00019875704890736853, |
|
"loss": 0.1856, |
|
"step": 68 |
|
}, |
|
{ |
|
"epoch": 0.44516129032258067, |
|
"grad_norm": 0.14927152386929934, |
|
"learning_rate": 0.00019863613034027224, |
|
"loss": 0.1516, |
|
"step": 69 |
|
}, |
|
{ |
|
"epoch": 0.45161290322580644, |
|
"grad_norm": 0.17501653378304205, |
|
"learning_rate": 0.0001985096401573497, |
|
"loss": 0.161, |
|
"step": 70 |
|
}, |
|
{ |
|
"epoch": 0.45806451612903226, |
|
"grad_norm": 0.17582762670350804, |
|
"learning_rate": 0.00019837758550359636, |
|
"loss": 0.1564, |
|
"step": 71 |
|
}, |
|
{ |
|
"epoch": 0.4645161290322581, |
|
"grad_norm": 0.15692049884405931, |
|
"learning_rate": 0.0001982399738383255, |
|
"loss": 0.1282, |
|
"step": 72 |
|
}, |
|
{ |
|
"epoch": 0.47096774193548385, |
|
"grad_norm": 0.14436392088728167, |
|
"learning_rate": 0.00019809681293474693, |
|
"loss": 0.1299, |
|
"step": 73 |
|
}, |
|
{ |
|
"epoch": 0.4774193548387097, |
|
"grad_norm": 0.19048463762976417, |
|
"learning_rate": 0.0001979481108795278, |
|
"loss": 0.1983, |
|
"step": 74 |
|
}, |
|
{ |
|
"epoch": 0.4838709677419355, |
|
"grad_norm": 0.13475205452089994, |
|
"learning_rate": 0.00019779387607233586, |
|
"loss": 0.156, |
|
"step": 75 |
|
}, |
|
{ |
|
"epoch": 0.49032258064516127, |
|
"grad_norm": 0.14145398172929924, |
|
"learning_rate": 0.00019763411722536502, |
|
"loss": 0.1355, |
|
"step": 76 |
|
}, |
|
{ |
|
"epoch": 0.4967741935483871, |
|
"grad_norm": 0.14400838677113517, |
|
"learning_rate": 0.00019746884336284317, |
|
"loss": 0.1371, |
|
"step": 77 |
|
}, |
|
{ |
|
"epoch": 0.5032258064516129, |
|
"grad_norm": 0.13421206031331137, |
|
"learning_rate": 0.00019729806382052248, |
|
"loss": 0.1156, |
|
"step": 78 |
|
}, |
|
{ |
|
"epoch": 0.5096774193548387, |
|
"grad_norm": 0.11613915473665105, |
|
"learning_rate": 0.00019712178824515212, |
|
"loss": 0.1293, |
|
"step": 79 |
|
}, |
|
{ |
|
"epoch": 0.5161290322580645, |
|
"grad_norm": 0.12472786830602234, |
|
"learning_rate": 0.00019694002659393305, |
|
"loss": 0.1189, |
|
"step": 80 |
|
}, |
|
{ |
|
"epoch": 0.5225806451612903, |
|
"grad_norm": 0.11050712097688373, |
|
"learning_rate": 0.00019675278913395606, |
|
"loss": 0.12, |
|
"step": 81 |
|
}, |
|
{ |
|
"epoch": 0.5290322580645161, |
|
"grad_norm": 0.1303276052231771, |
|
"learning_rate": 0.0001965600864416213, |
|
"loss": 0.1428, |
|
"step": 82 |
|
}, |
|
{ |
|
"epoch": 0.535483870967742, |
|
"grad_norm": 0.13774570876008593, |
|
"learning_rate": 0.00019636192940204134, |
|
"loss": 0.1438, |
|
"step": 83 |
|
}, |
|
{ |
|
"epoch": 0.5419354838709678, |
|
"grad_norm": 0.14810394982940484, |
|
"learning_rate": 0.00019615832920842586, |
|
"loss": 0.1404, |
|
"step": 84 |
|
}, |
|
{ |
|
"epoch": 0.5483870967741935, |
|
"grad_norm": 0.144275852100491, |
|
"learning_rate": 0.00019594929736144976, |
|
"loss": 0.1357, |
|
"step": 85 |
|
}, |
|
{ |
|
"epoch": 0.5548387096774193, |
|
"grad_norm": 0.16879600090790034, |
|
"learning_rate": 0.0001957348456686032, |
|
"loss": 0.1578, |
|
"step": 86 |
|
}, |
|
{ |
|
"epoch": 0.5612903225806452, |
|
"grad_norm": 0.1588074767274021, |
|
"learning_rate": 0.00019551498624352496, |
|
"loss": 0.1228, |
|
"step": 87 |
|
}, |
|
{ |
|
"epoch": 0.567741935483871, |
|
"grad_norm": 0.15067916918276408, |
|
"learning_rate": 0.00019528973150531787, |
|
"loss": 0.1323, |
|
"step": 88 |
|
}, |
|
{ |
|
"epoch": 0.5741935483870968, |
|
"grad_norm": 0.16266341913656662, |
|
"learning_rate": 0.00019505909417784754, |
|
"loss": 0.13, |
|
"step": 89 |
|
}, |
|
{ |
|
"epoch": 0.5806451612903226, |
|
"grad_norm": 0.121529537729675, |
|
"learning_rate": 0.00019482308728902356, |
|
"loss": 0.1067, |
|
"step": 90 |
|
}, |
|
{ |
|
"epoch": 0.5870967741935483, |
|
"grad_norm": 0.1740468182649888, |
|
"learning_rate": 0.00019458172417006347, |
|
"loss": 0.1513, |
|
"step": 91 |
|
}, |
|
{ |
|
"epoch": 0.5935483870967742, |
|
"grad_norm": 0.14062538318374462, |
|
"learning_rate": 0.00019433501845473995, |
|
"loss": 0.1438, |
|
"step": 92 |
|
}, |
|
{ |
|
"epoch": 0.6, |
|
"grad_norm": 0.17690034130801896, |
|
"learning_rate": 0.00019408298407861042, |
|
"loss": 0.1356, |
|
"step": 93 |
|
}, |
|
{ |
|
"epoch": 0.6064516129032258, |
|
"grad_norm": 0.18789482750619546, |
|
"learning_rate": 0.00019382563527823026, |
|
"loss": 0.1758, |
|
"step": 94 |
|
}, |
|
{ |
|
"epoch": 0.6129032258064516, |
|
"grad_norm": 0.16993067663839118, |
|
"learning_rate": 0.00019356298659034817, |
|
"loss": 0.1599, |
|
"step": 95 |
|
}, |
|
{ |
|
"epoch": 0.6193548387096774, |
|
"grad_norm": 0.16495058136550117, |
|
"learning_rate": 0.00019329505285108542, |
|
"loss": 0.1283, |
|
"step": 96 |
|
}, |
|
{ |
|
"epoch": 0.6258064516129033, |
|
"grad_norm": 0.15497395377626808, |
|
"learning_rate": 0.00019302184919509755, |
|
"loss": 0.1493, |
|
"step": 97 |
|
}, |
|
{ |
|
"epoch": 0.632258064516129, |
|
"grad_norm": 0.13660921526912856, |
|
"learning_rate": 0.00019274339105471971, |
|
"loss": 0.1307, |
|
"step": 98 |
|
}, |
|
{ |
|
"epoch": 0.6387096774193548, |
|
"grad_norm": 0.18246231884688152, |
|
"learning_rate": 0.00019245969415909465, |
|
"loss": 0.1598, |
|
"step": 99 |
|
}, |
|
{ |
|
"epoch": 0.6451612903225806, |
|
"grad_norm": 0.11344749529118914, |
|
"learning_rate": 0.00019217077453328449, |
|
"loss": 0.1304, |
|
"step": 100 |
|
}, |
|
{ |
|
"epoch": 0.6516129032258065, |
|
"grad_norm": 0.11682283731326468, |
|
"learning_rate": 0.0001918766484973654, |
|
"loss": 0.0977, |
|
"step": 101 |
|
}, |
|
{ |
|
"epoch": 0.6580645161290323, |
|
"grad_norm": 0.14494532290577813, |
|
"learning_rate": 0.00019157733266550575, |
|
"loss": 0.1338, |
|
"step": 102 |
|
}, |
|
{ |
|
"epoch": 0.6645161290322581, |
|
"grad_norm": 0.12095275202026515, |
|
"learning_rate": 0.0001912728439450276, |
|
"loss": 0.1513, |
|
"step": 103 |
|
}, |
|
{ |
|
"epoch": 0.6709677419354839, |
|
"grad_norm": 0.13423702981009097, |
|
"learning_rate": 0.00019096319953545185, |
|
"loss": 0.1335, |
|
"step": 104 |
|
}, |
|
{ |
|
"epoch": 0.6774193548387096, |
|
"grad_norm": 0.1670132860946028, |
|
"learning_rate": 0.0001906484169275263, |
|
"loss": 0.1607, |
|
"step": 105 |
|
}, |
|
{ |
|
"epoch": 0.6838709677419355, |
|
"grad_norm": 0.14053854153152684, |
|
"learning_rate": 0.00019032851390223812, |
|
"loss": 0.1365, |
|
"step": 106 |
|
}, |
|
{ |
|
"epoch": 0.6903225806451613, |
|
"grad_norm": 0.1399807021991922, |
|
"learning_rate": 0.00019000350852980909, |
|
"loss": 0.1589, |
|
"step": 107 |
|
}, |
|
{ |
|
"epoch": 0.6967741935483871, |
|
"grad_norm": 0.15473299551506894, |
|
"learning_rate": 0.00018967341916867518, |
|
"loss": 0.166, |
|
"step": 108 |
|
}, |
|
{ |
|
"epoch": 0.7032258064516129, |
|
"grad_norm": 0.1536872117526864, |
|
"learning_rate": 0.00018933826446444933, |
|
"loss": 0.1657, |
|
"step": 109 |
|
}, |
|
{ |
|
"epoch": 0.7096774193548387, |
|
"grad_norm": 0.1282004975556196, |
|
"learning_rate": 0.0001889980633488683, |
|
"loss": 0.1212, |
|
"step": 110 |
|
}, |
|
{ |
|
"epoch": 0.7161290322580646, |
|
"grad_norm": 0.16458328388975405, |
|
"learning_rate": 0.00018865283503872324, |
|
"loss": 0.1655, |
|
"step": 111 |
|
}, |
|
{ |
|
"epoch": 0.7225806451612903, |
|
"grad_norm": 0.1505113828615181, |
|
"learning_rate": 0.00018830259903477426, |
|
"loss": 0.1571, |
|
"step": 112 |
|
}, |
|
{ |
|
"epoch": 0.7290322580645161, |
|
"grad_norm": 0.16575595088070239, |
|
"learning_rate": 0.0001879473751206489, |
|
"loss": 0.1504, |
|
"step": 113 |
|
}, |
|
{ |
|
"epoch": 0.7354838709677419, |
|
"grad_norm": 0.1484230902451611, |
|
"learning_rate": 0.0001875871833617246, |
|
"loss": 0.1498, |
|
"step": 114 |
|
}, |
|
{ |
|
"epoch": 0.7419354838709677, |
|
"grad_norm": 0.2059907492830938, |
|
"learning_rate": 0.0001872220441039952, |
|
"loss": 0.134, |
|
"step": 115 |
|
}, |
|
{ |
|
"epoch": 0.7483870967741936, |
|
"grad_norm": 0.1491671097000444, |
|
"learning_rate": 0.0001868519779729218, |
|
"loss": 0.1374, |
|
"step": 116 |
|
}, |
|
{ |
|
"epoch": 0.7548387096774194, |
|
"grad_norm": 0.1727675086328308, |
|
"learning_rate": 0.0001864770058722676, |
|
"loss": 0.1624, |
|
"step": 117 |
|
}, |
|
{ |
|
"epoch": 0.7612903225806451, |
|
"grad_norm": 0.16357318423846662, |
|
"learning_rate": 0.00018609714898291718, |
|
"loss": 0.1528, |
|
"step": 118 |
|
}, |
|
{ |
|
"epoch": 0.7677419354838709, |
|
"grad_norm": 0.1584052674932312, |
|
"learning_rate": 0.00018571242876167996, |
|
"loss": 0.1321, |
|
"step": 119 |
|
}, |
|
{ |
|
"epoch": 0.7741935483870968, |
|
"grad_norm": 0.21714396600094343, |
|
"learning_rate": 0.0001853228669400784, |
|
"loss": 0.1748, |
|
"step": 120 |
|
}, |
|
{ |
|
"epoch": 0.7806451612903226, |
|
"grad_norm": 0.13743651841776636, |
|
"learning_rate": 0.00018492848552312014, |
|
"loss": 0.1493, |
|
"step": 121 |
|
}, |
|
{ |
|
"epoch": 0.7870967741935484, |
|
"grad_norm": 0.1541126978032927, |
|
"learning_rate": 0.00018452930678805536, |
|
"loss": 0.1331, |
|
"step": 122 |
|
}, |
|
{ |
|
"epoch": 0.7935483870967742, |
|
"grad_norm": 0.1571822882709535, |
|
"learning_rate": 0.00018412535328311814, |
|
"loss": 0.1427, |
|
"step": 123 |
|
}, |
|
{ |
|
"epoch": 0.8, |
|
"grad_norm": 0.17825934340851243, |
|
"learning_rate": 0.00018371664782625287, |
|
"loss": 0.1871, |
|
"step": 124 |
|
}, |
|
{ |
|
"epoch": 0.8064516129032258, |
|
"grad_norm": 0.1745800846806893, |
|
"learning_rate": 0.00018330321350382544, |
|
"loss": 0.1672, |
|
"step": 125 |
|
}, |
|
{ |
|
"epoch": 0.8129032258064516, |
|
"grad_norm": 0.13970218689990957, |
|
"learning_rate": 0.00018288507366931905, |
|
"loss": 0.1715, |
|
"step": 126 |
|
}, |
|
{ |
|
"epoch": 0.8193548387096774, |
|
"grad_norm": 0.15100292169016535, |
|
"learning_rate": 0.00018246225194201517, |
|
"loss": 0.1411, |
|
"step": 127 |
|
}, |
|
{ |
|
"epoch": 0.8258064516129032, |
|
"grad_norm": 0.1538126122981586, |
|
"learning_rate": 0.00018203477220565912, |
|
"loss": 0.1516, |
|
"step": 128 |
|
}, |
|
{ |
|
"epoch": 0.832258064516129, |
|
"grad_norm": 0.15630735898296536, |
|
"learning_rate": 0.00018160265860711134, |
|
"loss": 0.1636, |
|
"step": 129 |
|
}, |
|
{ |
|
"epoch": 0.8387096774193549, |
|
"grad_norm": 0.1427718560771215, |
|
"learning_rate": 0.00018116593555498307, |
|
"loss": 0.1297, |
|
"step": 130 |
|
}, |
|
{ |
|
"epoch": 0.8451612903225807, |
|
"grad_norm": 0.11911885929622754, |
|
"learning_rate": 0.0001807246277182578, |
|
"loss": 0.1115, |
|
"step": 131 |
|
}, |
|
{ |
|
"epoch": 0.8516129032258064, |
|
"grad_norm": 0.13702745725576418, |
|
"learning_rate": 0.0001802787600248977, |
|
"loss": 0.157, |
|
"step": 132 |
|
}, |
|
{ |
|
"epoch": 0.8580645161290322, |
|
"grad_norm": 0.17220644805792673, |
|
"learning_rate": 0.0001798283576604356, |
|
"loss": 0.1561, |
|
"step": 133 |
|
}, |
|
{ |
|
"epoch": 0.864516129032258, |
|
"grad_norm": 0.196867226472361, |
|
"learning_rate": 0.0001793734460665523, |
|
"loss": 0.1657, |
|
"step": 134 |
|
}, |
|
{ |
|
"epoch": 0.8709677419354839, |
|
"grad_norm": 0.1759359784948508, |
|
"learning_rate": 0.00017891405093963938, |
|
"loss": 0.1909, |
|
"step": 135 |
|
}, |
|
{ |
|
"epoch": 0.8774193548387097, |
|
"grad_norm": 0.16063163300244113, |
|
"learning_rate": 0.0001784501982293479, |
|
"loss": 0.1552, |
|
"step": 136 |
|
}, |
|
{ |
|
"epoch": 0.8838709677419355, |
|
"grad_norm": 0.17034183209183734, |
|
"learning_rate": 0.00017798191413712243, |
|
"loss": 0.1502, |
|
"step": 137 |
|
}, |
|
{ |
|
"epoch": 0.8903225806451613, |
|
"grad_norm": 0.14694529779128243, |
|
"learning_rate": 0.0001775092251147211, |
|
"loss": 0.1277, |
|
"step": 138 |
|
}, |
|
{ |
|
"epoch": 0.896774193548387, |
|
"grad_norm": 0.13174927545002138, |
|
"learning_rate": 0.0001770321578627213, |
|
"loss": 0.1277, |
|
"step": 139 |
|
}, |
|
{ |
|
"epoch": 0.9032258064516129, |
|
"grad_norm": 0.15784079444219237, |
|
"learning_rate": 0.00017655073932901168, |
|
"loss": 0.1534, |
|
"step": 140 |
|
}, |
|
{ |
|
"epoch": 0.9096774193548387, |
|
"grad_norm": 0.19691370909219638, |
|
"learning_rate": 0.0001760649967072697, |
|
"loss": 0.1688, |
|
"step": 141 |
|
}, |
|
{ |
|
"epoch": 0.9161290322580645, |
|
"grad_norm": 0.15697904965484202, |
|
"learning_rate": 0.00017557495743542585, |
|
"loss": 0.1523, |
|
"step": 142 |
|
}, |
|
{ |
|
"epoch": 0.9225806451612903, |
|
"grad_norm": 0.1759980765103477, |
|
"learning_rate": 0.00017508064919411344, |
|
"loss": 0.15, |
|
"step": 143 |
|
}, |
|
{ |
|
"epoch": 0.9290322580645162, |
|
"grad_norm": 0.19254834048997346, |
|
"learning_rate": 0.00017458209990510527, |
|
"loss": 0.1474, |
|
"step": 144 |
|
}, |
|
{ |
|
"epoch": 0.9354838709677419, |
|
"grad_norm": 0.19220369870461818, |
|
"learning_rate": 0.00017407933772973637, |
|
"loss": 0.1678, |
|
"step": 145 |
|
}, |
|
{ |
|
"epoch": 0.9419354838709677, |
|
"grad_norm": 0.14789056250556576, |
|
"learning_rate": 0.00017357239106731317, |
|
"loss": 0.1634, |
|
"step": 146 |
|
}, |
|
{ |
|
"epoch": 0.9483870967741935, |
|
"grad_norm": 0.15823815803270533, |
|
"learning_rate": 0.00017306128855350942, |
|
"loss": 0.1744, |
|
"step": 147 |
|
}, |
|
{ |
|
"epoch": 0.9548387096774194, |
|
"grad_norm": 0.159128130793647, |
|
"learning_rate": 0.0001725460590587486, |
|
"loss": 0.1732, |
|
"step": 148 |
|
}, |
|
{ |
|
"epoch": 0.9612903225806452, |
|
"grad_norm": 0.1420932941022579, |
|
"learning_rate": 0.00017202673168657318, |
|
"loss": 0.1193, |
|
"step": 149 |
|
}, |
|
{ |
|
"epoch": 0.967741935483871, |
|
"grad_norm": 0.1581041276537875, |
|
"learning_rate": 0.0001715033357720006, |
|
"loss": 0.157, |
|
"step": 150 |
|
}, |
|
{ |
|
"epoch": 0.9741935483870968, |
|
"grad_norm": 0.13409040520330398, |
|
"learning_rate": 0.00017097590087986633, |
|
"loss": 0.1187, |
|
"step": 151 |
|
}, |
|
{ |
|
"epoch": 0.9806451612903225, |
|
"grad_norm": 0.15236261967510367, |
|
"learning_rate": 0.00017044445680315372, |
|
"loss": 0.1541, |
|
"step": 152 |
|
}, |
|
{ |
|
"epoch": 0.9870967741935484, |
|
"grad_norm": 0.15028826750982388, |
|
"learning_rate": 0.00016990903356131124, |
|
"loss": 0.1462, |
|
"step": 153 |
|
}, |
|
{ |
|
"epoch": 0.9935483870967742, |
|
"grad_norm": 0.14170487741522195, |
|
"learning_rate": 0.00016936966139855663, |
|
"loss": 0.1275, |
|
"step": 154 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"grad_norm": 0.14288119090775778, |
|
"learning_rate": 0.00016882637078216868, |
|
"loss": 0.1316, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0, |
|
"eval_loss": 0.14944089949131012, |
|
"eval_runtime": 27.6083, |
|
"eval_samples_per_second": 4.745, |
|
"eval_steps_per_second": 0.616, |
|
"step": 155 |
|
}, |
|
{ |
|
"epoch": 1.0064516129032257, |
|
"grad_norm": 0.08453811084356862, |
|
"learning_rate": 0.0001682791924007661, |
|
"loss": 0.0732, |
|
"step": 156 |
|
}, |
|
{ |
|
"epoch": 1.0129032258064516, |
|
"grad_norm": 0.1163343033490188, |
|
"learning_rate": 0.00016772815716257412, |
|
"loss": 0.0955, |
|
"step": 157 |
|
}, |
|
{ |
|
"epoch": 1.0193548387096774, |
|
"grad_norm": 0.12774969124655083, |
|
"learning_rate": 0.0001671732961936785, |
|
"loss": 0.1154, |
|
"step": 158 |
|
}, |
|
{ |
|
"epoch": 1.0258064516129033, |
|
"grad_norm": 0.10149504758995384, |
|
"learning_rate": 0.00016661464083626734, |
|
"loss": 0.0834, |
|
"step": 159 |
|
}, |
|
{ |
|
"epoch": 1.032258064516129, |
|
"grad_norm": 0.1558767636437416, |
|
"learning_rate": 0.00016605222264686086, |
|
"loss": 0.0978, |
|
"step": 160 |
|
}, |
|
{ |
|
"epoch": 1.038709677419355, |
|
"grad_norm": 0.14800857594022712, |
|
"learning_rate": 0.00016548607339452853, |
|
"loss": 0.0783, |
|
"step": 161 |
|
}, |
|
{ |
|
"epoch": 1.0451612903225806, |
|
"grad_norm": 0.10427437820954576, |
|
"learning_rate": 0.00016491622505909482, |
|
"loss": 0.0714, |
|
"step": 162 |
|
}, |
|
{ |
|
"epoch": 1.0516129032258064, |
|
"grad_norm": 0.16509881361577539, |
|
"learning_rate": 0.00016434270982933273, |
|
"loss": 0.0971, |
|
"step": 163 |
|
}, |
|
{ |
|
"epoch": 1.0580645161290323, |
|
"grad_norm": 0.15528612822142446, |
|
"learning_rate": 0.0001637655601011454, |
|
"loss": 0.0806, |
|
"step": 164 |
|
}, |
|
{ |
|
"epoch": 1.064516129032258, |
|
"grad_norm": 0.1578175598943513, |
|
"learning_rate": 0.00016318480847573642, |
|
"loss": 0.0962, |
|
"step": 165 |
|
}, |
|
{ |
|
"epoch": 1.070967741935484, |
|
"grad_norm": 0.2591491854389689, |
|
"learning_rate": 0.00016260048775776804, |
|
"loss": 0.1107, |
|
"step": 166 |
|
}, |
|
{ |
|
"epoch": 1.0774193548387097, |
|
"grad_norm": 0.1557337965418426, |
|
"learning_rate": 0.00016201263095350833, |
|
"loss": 0.0707, |
|
"step": 167 |
|
}, |
|
{ |
|
"epoch": 1.0838709677419356, |
|
"grad_norm": 0.24212581528206514, |
|
"learning_rate": 0.0001614212712689668, |
|
"loss": 0.0964, |
|
"step": 168 |
|
}, |
|
{ |
|
"epoch": 1.0903225806451613, |
|
"grad_norm": 0.12600203218602474, |
|
"learning_rate": 0.00016082644210801844, |
|
"loss": 0.0649, |
|
"step": 169 |
|
}, |
|
{ |
|
"epoch": 1.096774193548387, |
|
"grad_norm": 0.14377456483205683, |
|
"learning_rate": 0.00016022817707051724, |
|
"loss": 0.0778, |
|
"step": 170 |
|
}, |
|
{ |
|
"epoch": 1.103225806451613, |
|
"grad_norm": 0.1828183442232092, |
|
"learning_rate": 0.00015962650995039783, |
|
"loss": 0.1068, |
|
"step": 171 |
|
}, |
|
{ |
|
"epoch": 1.1096774193548387, |
|
"grad_norm": 0.1695078607562437, |
|
"learning_rate": 0.00015902147473376694, |
|
"loss": 0.0938, |
|
"step": 172 |
|
}, |
|
{ |
|
"epoch": 1.1161290322580646, |
|
"grad_norm": 0.10525262692642562, |
|
"learning_rate": 0.00015841310559698343, |
|
"loss": 0.0645, |
|
"step": 173 |
|
}, |
|
{ |
|
"epoch": 1.1225806451612903, |
|
"grad_norm": 0.15379971536703851, |
|
"learning_rate": 0.0001578014369047279, |
|
"loss": 0.0752, |
|
"step": 174 |
|
}, |
|
{ |
|
"epoch": 1.129032258064516, |
|
"grad_norm": 0.1352511385337785, |
|
"learning_rate": 0.00015718650320806142, |
|
"loss": 0.0803, |
|
"step": 175 |
|
}, |
|
{ |
|
"epoch": 1.135483870967742, |
|
"grad_norm": 0.1708537982318491, |
|
"learning_rate": 0.00015656833924247398, |
|
"loss": 0.0908, |
|
"step": 176 |
|
}, |
|
{ |
|
"epoch": 1.1419354838709677, |
|
"grad_norm": 0.16847128771716718, |
|
"learning_rate": 0.00015594697992592232, |
|
"loss": 0.0697, |
|
"step": 177 |
|
}, |
|
{ |
|
"epoch": 1.1483870967741936, |
|
"grad_norm": 0.14044376651199897, |
|
"learning_rate": 0.00015532246035685756, |
|
"loss": 0.0707, |
|
"step": 178 |
|
}, |
|
{ |
|
"epoch": 1.1548387096774193, |
|
"grad_norm": 0.13752732543534765, |
|
"learning_rate": 0.00015469481581224272, |
|
"loss": 0.0769, |
|
"step": 179 |
|
}, |
|
{ |
|
"epoch": 1.1612903225806452, |
|
"grad_norm": 0.20084535405957962, |
|
"learning_rate": 0.00015406408174555976, |
|
"loss": 0.1114, |
|
"step": 180 |
|
}, |
|
{ |
|
"epoch": 1.167741935483871, |
|
"grad_norm": 0.16882912932677738, |
|
"learning_rate": 0.0001534302937848073, |
|
"loss": 0.0977, |
|
"step": 181 |
|
}, |
|
{ |
|
"epoch": 1.1741935483870969, |
|
"grad_norm": 0.16088758279122345, |
|
"learning_rate": 0.00015279348773048786, |
|
"loss": 0.0862, |
|
"step": 182 |
|
}, |
|
{ |
|
"epoch": 1.1806451612903226, |
|
"grad_norm": 0.1610420842518173, |
|
"learning_rate": 0.00015215369955358566, |
|
"loss": 0.0909, |
|
"step": 183 |
|
}, |
|
{ |
|
"epoch": 1.1870967741935483, |
|
"grad_norm": 0.1567858759713509, |
|
"learning_rate": 0.0001515109653935348, |
|
"loss": 0.0988, |
|
"step": 184 |
|
}, |
|
{ |
|
"epoch": 1.1935483870967742, |
|
"grad_norm": 0.12400211503217344, |
|
"learning_rate": 0.00015086532155617784, |
|
"loss": 0.0813, |
|
"step": 185 |
|
}, |
|
{ |
|
"epoch": 1.2, |
|
"grad_norm": 0.12785181430507606, |
|
"learning_rate": 0.00015021680451171498, |
|
"loss": 0.0802, |
|
"step": 186 |
|
}, |
|
{ |
|
"epoch": 1.206451612903226, |
|
"grad_norm": 0.14298258948499543, |
|
"learning_rate": 0.00014956545089264407, |
|
"loss": 0.0843, |
|
"step": 187 |
|
}, |
|
{ |
|
"epoch": 1.2129032258064516, |
|
"grad_norm": 0.20148810623427083, |
|
"learning_rate": 0.0001489112974916912, |
|
"loss": 0.0942, |
|
"step": 188 |
|
}, |
|
{ |
|
"epoch": 1.2193548387096773, |
|
"grad_norm": 0.14657915199625932, |
|
"learning_rate": 0.00014825438125973264, |
|
"loss": 0.0829, |
|
"step": 189 |
|
}, |
|
{ |
|
"epoch": 1.2258064516129032, |
|
"grad_norm": 0.16147059815020365, |
|
"learning_rate": 0.00014759473930370736, |
|
"loss": 0.0756, |
|
"step": 190 |
|
}, |
|
{ |
|
"epoch": 1.232258064516129, |
|
"grad_norm": 0.17956881851269735, |
|
"learning_rate": 0.0001469324088845212, |
|
"loss": 0.1007, |
|
"step": 191 |
|
}, |
|
{ |
|
"epoch": 1.238709677419355, |
|
"grad_norm": 0.1578318532174886, |
|
"learning_rate": 0.00014626742741494206, |
|
"loss": 0.0983, |
|
"step": 192 |
|
}, |
|
{ |
|
"epoch": 1.2451612903225806, |
|
"grad_norm": 0.1649792762044239, |
|
"learning_rate": 0.00014559983245748638, |
|
"loss": 0.0905, |
|
"step": 193 |
|
}, |
|
{ |
|
"epoch": 1.2516129032258063, |
|
"grad_norm": 0.1455653603561888, |
|
"learning_rate": 0.00014492966172229777, |
|
"loss": 0.0791, |
|
"step": 194 |
|
}, |
|
{ |
|
"epoch": 1.2580645161290323, |
|
"grad_norm": 0.1312841252501999, |
|
"learning_rate": 0.00014425695306501658, |
|
"loss": 0.0613, |
|
"step": 195 |
|
}, |
|
{ |
|
"epoch": 1.2645161290322582, |
|
"grad_norm": 0.1361347125426188, |
|
"learning_rate": 0.00014358174448464154, |
|
"loss": 0.0773, |
|
"step": 196 |
|
}, |
|
{ |
|
"epoch": 1.270967741935484, |
|
"grad_norm": 0.17551022044504175, |
|
"learning_rate": 0.00014290407412138366, |
|
"loss": 0.0985, |
|
"step": 197 |
|
}, |
|
{ |
|
"epoch": 1.2774193548387096, |
|
"grad_norm": 0.1456899710914213, |
|
"learning_rate": 0.00014222398025451135, |
|
"loss": 0.0827, |
|
"step": 198 |
|
}, |
|
{ |
|
"epoch": 1.2838709677419355, |
|
"grad_norm": 0.19482064326323745, |
|
"learning_rate": 0.00014154150130018866, |
|
"loss": 0.0974, |
|
"step": 199 |
|
}, |
|
{ |
|
"epoch": 1.2903225806451613, |
|
"grad_norm": 0.13544624920378107, |
|
"learning_rate": 0.0001408566758093048, |
|
"loss": 0.0742, |
|
"step": 200 |
|
}, |
|
{ |
|
"epoch": 1.2967741935483872, |
|
"grad_norm": 0.12685823273525554, |
|
"learning_rate": 0.00014016954246529696, |
|
"loss": 0.0677, |
|
"step": 201 |
|
}, |
|
{ |
|
"epoch": 1.303225806451613, |
|
"grad_norm": 0.15457862761329938, |
|
"learning_rate": 0.00013948014008196487, |
|
"loss": 0.0815, |
|
"step": 202 |
|
}, |
|
{ |
|
"epoch": 1.3096774193548386, |
|
"grad_norm": 0.1482256655702007, |
|
"learning_rate": 0.0001387885076012785, |
|
"loss": 0.0961, |
|
"step": 203 |
|
}, |
|
{ |
|
"epoch": 1.3161290322580645, |
|
"grad_norm": 0.20896278322687534, |
|
"learning_rate": 0.00013809468409117846, |
|
"loss": 0.1049, |
|
"step": 204 |
|
}, |
|
{ |
|
"epoch": 1.3225806451612903, |
|
"grad_norm": 0.175019480492322, |
|
"learning_rate": 0.00013739870874336898, |
|
"loss": 0.0884, |
|
"step": 205 |
|
}, |
|
{ |
|
"epoch": 1.3290322580645162, |
|
"grad_norm": 0.14363356864891735, |
|
"learning_rate": 0.00013670062087110422, |
|
"loss": 0.0864, |
|
"step": 206 |
|
}, |
|
{ |
|
"epoch": 1.335483870967742, |
|
"grad_norm": 0.1397081258858762, |
|
"learning_rate": 0.00013600045990696762, |
|
"loss": 0.0797, |
|
"step": 207 |
|
}, |
|
{ |
|
"epoch": 1.3419354838709676, |
|
"grad_norm": 0.1667674455506937, |
|
"learning_rate": 0.0001352982654006444, |
|
"loss": 0.0977, |
|
"step": 208 |
|
}, |
|
{ |
|
"epoch": 1.3483870967741935, |
|
"grad_norm": 0.1409898972781143, |
|
"learning_rate": 0.00013459407701668763, |
|
"loss": 0.0923, |
|
"step": 209 |
|
}, |
|
{ |
|
"epoch": 1.3548387096774195, |
|
"grad_norm": 0.16290520924324786, |
|
"learning_rate": 0.00013388793453227767, |
|
"loss": 0.1033, |
|
"step": 210 |
|
}, |
|
{ |
|
"epoch": 1.3612903225806452, |
|
"grad_norm": 0.14967784709530377, |
|
"learning_rate": 0.0001331798778349752, |
|
"loss": 0.0843, |
|
"step": 211 |
|
}, |
|
{ |
|
"epoch": 1.367741935483871, |
|
"grad_norm": 0.14370370779202904, |
|
"learning_rate": 0.00013246994692046836, |
|
"loss": 0.0845, |
|
"step": 212 |
|
}, |
|
{ |
|
"epoch": 1.3741935483870968, |
|
"grad_norm": 0.15288920332301245, |
|
"learning_rate": 0.00013175818189031327, |
|
"loss": 0.0962, |
|
"step": 213 |
|
}, |
|
{ |
|
"epoch": 1.3806451612903226, |
|
"grad_norm": 0.16907810212754584, |
|
"learning_rate": 0.00013104462294966896, |
|
"loss": 0.1033, |
|
"step": 214 |
|
}, |
|
{ |
|
"epoch": 1.3870967741935485, |
|
"grad_norm": 0.17023880837738012, |
|
"learning_rate": 0.00013032931040502627, |
|
"loss": 0.1155, |
|
"step": 215 |
|
}, |
|
{ |
|
"epoch": 1.3935483870967742, |
|
"grad_norm": 0.12189251069561345, |
|
"learning_rate": 0.00012961228466193116, |
|
"loss": 0.0692, |
|
"step": 216 |
|
}, |
|
{ |
|
"epoch": 1.4, |
|
"grad_norm": 0.18108512316647296, |
|
"learning_rate": 0.00012889358622270223, |
|
"loss": 0.086, |
|
"step": 217 |
|
}, |
|
{ |
|
"epoch": 1.4064516129032258, |
|
"grad_norm": 0.15804455793477432, |
|
"learning_rate": 0.00012817325568414297, |
|
"loss": 0.0751, |
|
"step": 218 |
|
}, |
|
{ |
|
"epoch": 1.4129032258064516, |
|
"grad_norm": 0.13668879316730062, |
|
"learning_rate": 0.00012745133373524853, |
|
"loss": 0.0786, |
|
"step": 219 |
|
}, |
|
{ |
|
"epoch": 1.4193548387096775, |
|
"grad_norm": 0.18610949095399393, |
|
"learning_rate": 0.0001267278611549073, |
|
"loss": 0.0732, |
|
"step": 220 |
|
}, |
|
{ |
|
"epoch": 1.4258064516129032, |
|
"grad_norm": 0.186875270696186, |
|
"learning_rate": 0.00012600287880959763, |
|
"loss": 0.1051, |
|
"step": 221 |
|
}, |
|
{ |
|
"epoch": 1.432258064516129, |
|
"grad_norm": 0.15558801012557805, |
|
"learning_rate": 0.0001252764276510792, |
|
"loss": 0.0879, |
|
"step": 222 |
|
}, |
|
{ |
|
"epoch": 1.4387096774193548, |
|
"grad_norm": 0.16188457902393685, |
|
"learning_rate": 0.00012454854871407994, |
|
"loss": 0.0887, |
|
"step": 223 |
|
}, |
|
{ |
|
"epoch": 1.4451612903225808, |
|
"grad_norm": 0.1332789968563102, |
|
"learning_rate": 0.00012381928311397806, |
|
"loss": 0.0819, |
|
"step": 224 |
|
}, |
|
{ |
|
"epoch": 1.4516129032258065, |
|
"grad_norm": 0.13104439295337186, |
|
"learning_rate": 0.0001230886720444796, |
|
"loss": 0.0992, |
|
"step": 225 |
|
}, |
|
{ |
|
"epoch": 1.4580645161290322, |
|
"grad_norm": 0.1741921210517873, |
|
"learning_rate": 0.00012235675677529158, |
|
"loss": 0.1049, |
|
"step": 226 |
|
}, |
|
{ |
|
"epoch": 1.4645161290322581, |
|
"grad_norm": 0.17686059237450052, |
|
"learning_rate": 0.00012162357864979072, |
|
"loss": 0.0942, |
|
"step": 227 |
|
}, |
|
{ |
|
"epoch": 1.4709677419354839, |
|
"grad_norm": 0.14208255193217198, |
|
"learning_rate": 0.00012088917908268821, |
|
"loss": 0.0932, |
|
"step": 228 |
|
}, |
|
{ |
|
"epoch": 1.4774193548387098, |
|
"grad_norm": 0.16207984639669018, |
|
"learning_rate": 0.00012015359955769021, |
|
"loss": 0.0952, |
|
"step": 229 |
|
}, |
|
{ |
|
"epoch": 1.4838709677419355, |
|
"grad_norm": 0.10567516069189406, |
|
"learning_rate": 0.00011941688162515467, |
|
"loss": 0.0698, |
|
"step": 230 |
|
}, |
|
{ |
|
"epoch": 1.4903225806451612, |
|
"grad_norm": 0.14315458863752037, |
|
"learning_rate": 0.00011867906689974428, |
|
"loss": 0.0706, |
|
"step": 231 |
|
}, |
|
{ |
|
"epoch": 1.4967741935483871, |
|
"grad_norm": 0.14644874259664967, |
|
"learning_rate": 0.00011794019705807584, |
|
"loss": 0.0954, |
|
"step": 232 |
|
}, |
|
{ |
|
"epoch": 1.5032258064516129, |
|
"grad_norm": 0.1522258926853921, |
|
"learning_rate": 0.00011720031383636585, |
|
"loss": 0.1026, |
|
"step": 233 |
|
}, |
|
{ |
|
"epoch": 1.5096774193548388, |
|
"grad_norm": 0.2389589012648026, |
|
"learning_rate": 0.00011645945902807341, |
|
"loss": 0.0981, |
|
"step": 234 |
|
}, |
|
{ |
|
"epoch": 1.5161290322580645, |
|
"grad_norm": 0.1539160607687386, |
|
"learning_rate": 0.00011571767448153901, |
|
"loss": 0.074, |
|
"step": 235 |
|
}, |
|
{ |
|
"epoch": 1.5225806451612902, |
|
"grad_norm": 0.17340052794011998, |
|
"learning_rate": 0.00011497500209762102, |
|
"loss": 0.0943, |
|
"step": 236 |
|
}, |
|
{ |
|
"epoch": 1.5290322580645161, |
|
"grad_norm": 0.11294207936842038, |
|
"learning_rate": 0.00011423148382732853, |
|
"loss": 0.0644, |
|
"step": 237 |
|
}, |
|
{ |
|
"epoch": 1.535483870967742, |
|
"grad_norm": 0.14391081206665055, |
|
"learning_rate": 0.00011348716166945195, |
|
"loss": 0.0774, |
|
"step": 238 |
|
}, |
|
{ |
|
"epoch": 1.5419354838709678, |
|
"grad_norm": 0.16298875474891092, |
|
"learning_rate": 0.0001127420776681905, |
|
"loss": 0.0786, |
|
"step": 239 |
|
}, |
|
{ |
|
"epoch": 1.5483870967741935, |
|
"grad_norm": 0.1325076119120941, |
|
"learning_rate": 0.00011199627391077732, |
|
"loss": 0.0889, |
|
"step": 240 |
|
}, |
|
{ |
|
"epoch": 1.5548387096774192, |
|
"grad_norm": 0.10971482413094012, |
|
"learning_rate": 0.00011124979252510208, |
|
"loss": 0.0737, |
|
"step": 241 |
|
}, |
|
{ |
|
"epoch": 1.5612903225806452, |
|
"grad_norm": 0.14614299703062, |
|
"learning_rate": 0.0001105026756773314, |
|
"loss": 0.0682, |
|
"step": 242 |
|
}, |
|
{ |
|
"epoch": 1.567741935483871, |
|
"grad_norm": 0.22690104820583093, |
|
"learning_rate": 0.00010975496556952682, |
|
"loss": 0.1094, |
|
"step": 243 |
|
}, |
|
{ |
|
"epoch": 1.5741935483870968, |
|
"grad_norm": 0.2561727438038473, |
|
"learning_rate": 0.00010900670443726135, |
|
"loss": 0.0851, |
|
"step": 244 |
|
}, |
|
{ |
|
"epoch": 1.5806451612903225, |
|
"grad_norm": 0.15371305701947427, |
|
"learning_rate": 0.00010825793454723325, |
|
"loss": 0.0923, |
|
"step": 245 |
|
}, |
|
{ |
|
"epoch": 1.5870967741935482, |
|
"grad_norm": 0.16267574566743875, |
|
"learning_rate": 0.00010750869819487883, |
|
"loss": 0.1036, |
|
"step": 246 |
|
}, |
|
{ |
|
"epoch": 1.5935483870967742, |
|
"grad_norm": 0.15591155698404394, |
|
"learning_rate": 0.00010675903770198333, |
|
"loss": 0.0893, |
|
"step": 247 |
|
}, |
|
{ |
|
"epoch": 1.6, |
|
"grad_norm": 0.14338972962339533, |
|
"learning_rate": 0.00010600899541429004, |
|
"loss": 0.0837, |
|
"step": 248 |
|
}, |
|
{ |
|
"epoch": 1.6064516129032258, |
|
"grad_norm": 0.12387607320751257, |
|
"learning_rate": 0.00010525861369910877, |
|
"loss": 0.0755, |
|
"step": 249 |
|
}, |
|
{ |
|
"epoch": 1.6129032258064515, |
|
"grad_norm": 0.16606169294386383, |
|
"learning_rate": 0.00010450793494292224, |
|
"loss": 0.1043, |
|
"step": 250 |
|
}, |
|
{ |
|
"epoch": 1.6193548387096774, |
|
"grad_norm": 0.1795920159350681, |
|
"learning_rate": 0.00010375700154899208, |
|
"loss": 0.1008, |
|
"step": 251 |
|
}, |
|
{ |
|
"epoch": 1.6258064516129034, |
|
"grad_norm": 0.13025097291519463, |
|
"learning_rate": 0.00010300585593496348, |
|
"loss": 0.0851, |
|
"step": 252 |
|
}, |
|
{ |
|
"epoch": 1.632258064516129, |
|
"grad_norm": 0.14349816154023654, |
|
"learning_rate": 0.00010225454053046921, |
|
"loss": 0.0807, |
|
"step": 253 |
|
}, |
|
{ |
|
"epoch": 1.6387096774193548, |
|
"grad_norm": 0.16695448888874226, |
|
"learning_rate": 0.00010150309777473306, |
|
"loss": 0.1117, |
|
"step": 254 |
|
}, |
|
{ |
|
"epoch": 1.6451612903225805, |
|
"grad_norm": 0.15743712528269815, |
|
"learning_rate": 0.0001007515701141722, |
|
"loss": 0.086, |
|
"step": 255 |
|
}, |
|
{ |
|
"epoch": 1.6516129032258065, |
|
"grad_norm": 0.16704335210894908, |
|
"learning_rate": 0.0001, |
|
"loss": 0.0886, |
|
"step": 256 |
|
}, |
|
{ |
|
"epoch": 1.6580645161290324, |
|
"grad_norm": 0.10245531520994122, |
|
"learning_rate": 9.924842988582782e-05, |
|
"loss": 0.0678, |
|
"step": 257 |
|
}, |
|
{ |
|
"epoch": 1.664516129032258, |
|
"grad_norm": 0.1688033836669086, |
|
"learning_rate": 9.849690222526698e-05, |
|
"loss": 0.0958, |
|
"step": 258 |
|
}, |
|
{ |
|
"epoch": 1.6709677419354838, |
|
"grad_norm": 0.1338126992775965, |
|
"learning_rate": 9.77454594695308e-05, |
|
"loss": 0.0782, |
|
"step": 259 |
|
}, |
|
{ |
|
"epoch": 1.6774193548387095, |
|
"grad_norm": 0.1332820451604909, |
|
"learning_rate": 9.699414406503654e-05, |
|
"loss": 0.0856, |
|
"step": 260 |
|
}, |
|
{ |
|
"epoch": 1.6838709677419355, |
|
"grad_norm": 0.1449401720605745, |
|
"learning_rate": 9.624299845100795e-05, |
|
"loss": 0.0882, |
|
"step": 261 |
|
}, |
|
{ |
|
"epoch": 1.6903225806451614, |
|
"grad_norm": 0.19395703151279187, |
|
"learning_rate": 9.549206505707777e-05, |
|
"loss": 0.1148, |
|
"step": 262 |
|
}, |
|
{ |
|
"epoch": 1.696774193548387, |
|
"grad_norm": 0.13482277559178169, |
|
"learning_rate": 9.474138630089124e-05, |
|
"loss": 0.0644, |
|
"step": 263 |
|
}, |
|
{ |
|
"epoch": 1.7032258064516128, |
|
"grad_norm": 0.18530593840863338, |
|
"learning_rate": 9.399100458570997e-05, |
|
"loss": 0.1074, |
|
"step": 264 |
|
}, |
|
{ |
|
"epoch": 1.7096774193548387, |
|
"grad_norm": 0.1897995033582595, |
|
"learning_rate": 9.324096229801674e-05, |
|
"loss": 0.0867, |
|
"step": 265 |
|
}, |
|
{ |
|
"epoch": 1.7161290322580647, |
|
"grad_norm": 0.16472780150681127, |
|
"learning_rate": 9.249130180512118e-05, |
|
"loss": 0.0896, |
|
"step": 266 |
|
}, |
|
{ |
|
"epoch": 1.7225806451612904, |
|
"grad_norm": 0.15525608679066774, |
|
"learning_rate": 9.174206545276677e-05, |
|
"loss": 0.0865, |
|
"step": 267 |
|
}, |
|
{ |
|
"epoch": 1.729032258064516, |
|
"grad_norm": 0.15426999703424252, |
|
"learning_rate": 9.099329556273866e-05, |
|
"loss": 0.0853, |
|
"step": 268 |
|
}, |
|
{ |
|
"epoch": 1.7354838709677418, |
|
"grad_norm": 0.19103115451320254, |
|
"learning_rate": 9.024503443047319e-05, |
|
"loss": 0.0993, |
|
"step": 269 |
|
}, |
|
{ |
|
"epoch": 1.7419354838709677, |
|
"grad_norm": 0.12323460303269068, |
|
"learning_rate": 8.949732432266866e-05, |
|
"loss": 0.0723, |
|
"step": 270 |
|
}, |
|
{ |
|
"epoch": 1.7483870967741937, |
|
"grad_norm": 0.1339668030381976, |
|
"learning_rate": 8.875020747489794e-05, |
|
"loss": 0.0852, |
|
"step": 271 |
|
}, |
|
{ |
|
"epoch": 1.7548387096774194, |
|
"grad_norm": 0.1385020226698345, |
|
"learning_rate": 8.800372608922271e-05, |
|
"loss": 0.0773, |
|
"step": 272 |
|
}, |
|
{ |
|
"epoch": 1.761290322580645, |
|
"grad_norm": 0.21835470061774626, |
|
"learning_rate": 8.72579223318095e-05, |
|
"loss": 0.1167, |
|
"step": 273 |
|
}, |
|
{ |
|
"epoch": 1.7677419354838708, |
|
"grad_norm": 0.1425322245524885, |
|
"learning_rate": 8.651283833054809e-05, |
|
"loss": 0.0801, |
|
"step": 274 |
|
}, |
|
{ |
|
"epoch": 1.7741935483870968, |
|
"grad_norm": 0.1794495686810833, |
|
"learning_rate": 8.57685161726715e-05, |
|
"loss": 0.0784, |
|
"step": 275 |
|
}, |
|
{ |
|
"epoch": 1.7806451612903227, |
|
"grad_norm": 0.15688310999446023, |
|
"learning_rate": 8.5024997902379e-05, |
|
"loss": 0.1001, |
|
"step": 276 |
|
}, |
|
{ |
|
"epoch": 1.7870967741935484, |
|
"grad_norm": 0.15417421931142297, |
|
"learning_rate": 8.428232551846101e-05, |
|
"loss": 0.0898, |
|
"step": 277 |
|
}, |
|
{ |
|
"epoch": 1.793548387096774, |
|
"grad_norm": 0.17958092903510822, |
|
"learning_rate": 8.35405409719266e-05, |
|
"loss": 0.0921, |
|
"step": 278 |
|
}, |
|
{ |
|
"epoch": 1.8, |
|
"grad_norm": 0.165061388870699, |
|
"learning_rate": 8.279968616363418e-05, |
|
"loss": 0.0935, |
|
"step": 279 |
|
}, |
|
{ |
|
"epoch": 1.8064516129032258, |
|
"grad_norm": 0.1116000947584692, |
|
"learning_rate": 8.205980294192421e-05, |
|
"loss": 0.0684, |
|
"step": 280 |
|
}, |
|
{ |
|
"epoch": 1.8129032258064517, |
|
"grad_norm": 0.12370745355237868, |
|
"learning_rate": 8.132093310025571e-05, |
|
"loss": 0.0764, |
|
"step": 281 |
|
}, |
|
{ |
|
"epoch": 1.8193548387096774, |
|
"grad_norm": 0.1724137684896053, |
|
"learning_rate": 8.058311837484535e-05, |
|
"loss": 0.0969, |
|
"step": 282 |
|
}, |
|
{ |
|
"epoch": 1.8258064516129031, |
|
"grad_norm": 0.16125193051943912, |
|
"learning_rate": 7.984640044230983e-05, |
|
"loss": 0.0868, |
|
"step": 283 |
|
}, |
|
{ |
|
"epoch": 1.832258064516129, |
|
"grad_norm": 0.13929471780512187, |
|
"learning_rate": 7.911082091731181e-05, |
|
"loss": 0.0701, |
|
"step": 284 |
|
}, |
|
{ |
|
"epoch": 1.838709677419355, |
|
"grad_norm": 0.1172859356132756, |
|
"learning_rate": 7.837642135020929e-05, |
|
"loss": 0.0705, |
|
"step": 285 |
|
}, |
|
{ |
|
"epoch": 1.8451612903225807, |
|
"grad_norm": 0.1416371336298496, |
|
"learning_rate": 7.764324322470841e-05, |
|
"loss": 0.0683, |
|
"step": 286 |
|
}, |
|
{ |
|
"epoch": 1.8516129032258064, |
|
"grad_norm": 0.19198949443360017, |
|
"learning_rate": 7.691132795552043e-05, |
|
"loss": 0.0894, |
|
"step": 287 |
|
}, |
|
{ |
|
"epoch": 1.8580645161290321, |
|
"grad_norm": 0.19384526534149363, |
|
"learning_rate": 7.618071688602199e-05, |
|
"loss": 0.0954, |
|
"step": 288 |
|
}, |
|
{ |
|
"epoch": 1.864516129032258, |
|
"grad_norm": 0.21508649416468206, |
|
"learning_rate": 7.54514512859201e-05, |
|
"loss": 0.0986, |
|
"step": 289 |
|
}, |
|
{ |
|
"epoch": 1.870967741935484, |
|
"grad_norm": 0.1952670486171591, |
|
"learning_rate": 7.472357234892082e-05, |
|
"loss": 0.1049, |
|
"step": 290 |
|
}, |
|
{ |
|
"epoch": 1.8774193548387097, |
|
"grad_norm": 0.13259304301639457, |
|
"learning_rate": 7.399712119040238e-05, |
|
"loss": 0.0784, |
|
"step": 291 |
|
}, |
|
{ |
|
"epoch": 1.8838709677419354, |
|
"grad_norm": 0.1367442063153438, |
|
"learning_rate": 7.327213884509272e-05, |
|
"loss": 0.0828, |
|
"step": 292 |
|
}, |
|
{ |
|
"epoch": 1.8903225806451613, |
|
"grad_norm": 0.16389743747526248, |
|
"learning_rate": 7.254866626475152e-05, |
|
"loss": 0.0927, |
|
"step": 293 |
|
}, |
|
{ |
|
"epoch": 1.896774193548387, |
|
"grad_norm": 0.22089121957380548, |
|
"learning_rate": 7.182674431585704e-05, |
|
"loss": 0.1077, |
|
"step": 294 |
|
}, |
|
{ |
|
"epoch": 1.903225806451613, |
|
"grad_norm": 0.14882797154919192, |
|
"learning_rate": 7.110641377729778e-05, |
|
"loss": 0.0879, |
|
"step": 295 |
|
}, |
|
{ |
|
"epoch": 1.9096774193548387, |
|
"grad_norm": 0.12794357419440644, |
|
"learning_rate": 7.038771533806884e-05, |
|
"loss": 0.0719, |
|
"step": 296 |
|
}, |
|
{ |
|
"epoch": 1.9161290322580644, |
|
"grad_norm": 0.1846824594193936, |
|
"learning_rate": 6.967068959497376e-05, |
|
"loss": 0.0895, |
|
"step": 297 |
|
}, |
|
{ |
|
"epoch": 1.9225806451612903, |
|
"grad_norm": 0.17646276082900025, |
|
"learning_rate": 6.895537705033108e-05, |
|
"loss": 0.0996, |
|
"step": 298 |
|
}, |
|
{ |
|
"epoch": 1.9290322580645163, |
|
"grad_norm": 0.15604696338212207, |
|
"learning_rate": 6.824181810968675e-05, |
|
"loss": 0.087, |
|
"step": 299 |
|
}, |
|
{ |
|
"epoch": 1.935483870967742, |
|
"grad_norm": 0.15478000466177547, |
|
"learning_rate": 6.753005307953167e-05, |
|
"loss": 0.0831, |
|
"step": 300 |
|
}, |
|
{ |
|
"epoch": 1.9419354838709677, |
|
"grad_norm": 0.19642059718505833, |
|
"learning_rate": 6.682012216502484e-05, |
|
"loss": 0.1073, |
|
"step": 301 |
|
}, |
|
{ |
|
"epoch": 1.9483870967741934, |
|
"grad_norm": 0.17255606883063224, |
|
"learning_rate": 6.611206546772237e-05, |
|
"loss": 0.1018, |
|
"step": 302 |
|
}, |
|
{ |
|
"epoch": 1.9548387096774194, |
|
"grad_norm": 0.1393602083174125, |
|
"learning_rate": 6.54059229833124e-05, |
|
"loss": 0.0859, |
|
"step": 303 |
|
}, |
|
{ |
|
"epoch": 1.9612903225806453, |
|
"grad_norm": 0.16652643955266602, |
|
"learning_rate": 6.47017345993556e-05, |
|
"loss": 0.0899, |
|
"step": 304 |
|
}, |
|
{ |
|
"epoch": 1.967741935483871, |
|
"grad_norm": 0.16878660937501272, |
|
"learning_rate": 6.39995400930324e-05, |
|
"loss": 0.0998, |
|
"step": 305 |
|
}, |
|
{ |
|
"epoch": 1.9741935483870967, |
|
"grad_norm": 0.12969061293334938, |
|
"learning_rate": 6.329937912889582e-05, |
|
"loss": 0.0706, |
|
"step": 306 |
|
}, |
|
{ |
|
"epoch": 1.9806451612903224, |
|
"grad_norm": 0.15739894009886643, |
|
"learning_rate": 6.260129125663106e-05, |
|
"loss": 0.0888, |
|
"step": 307 |
|
}, |
|
{ |
|
"epoch": 1.9870967741935484, |
|
"grad_norm": 0.1752067977727585, |
|
"learning_rate": 6.190531590882159e-05, |
|
"loss": 0.0826, |
|
"step": 308 |
|
}, |
|
{ |
|
"epoch": 1.9935483870967743, |
|
"grad_norm": 0.21002280616145647, |
|
"learning_rate": 6.121149239872151e-05, |
|
"loss": 0.123, |
|
"step": 309 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"grad_norm": 0.13850366944373244, |
|
"learning_rate": 6.051985991803517e-05, |
|
"loss": 0.073, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0, |
|
"eval_loss": 0.16690203547477722, |
|
"eval_runtime": 25.4682, |
|
"eval_samples_per_second": 5.144, |
|
"eval_steps_per_second": 0.668, |
|
"step": 310 |
|
}, |
|
{ |
|
"epoch": 2.0064516129032257, |
|
"grad_norm": 0.1124990090834071, |
|
"learning_rate": 5.983045753470308e-05, |
|
"loss": 0.0485, |
|
"step": 311 |
|
}, |
|
{ |
|
"epoch": 2.0129032258064514, |
|
"grad_norm": 0.08246086368829486, |
|
"learning_rate": 5.9143324190695196e-05, |
|
"loss": 0.0411, |
|
"step": 312 |
|
}, |
|
{ |
|
"epoch": 2.0193548387096776, |
|
"grad_norm": 0.11824209839228844, |
|
"learning_rate": 5.845849869981137e-05, |
|
"loss": 0.0578, |
|
"step": 313 |
|
}, |
|
{ |
|
"epoch": 2.0258064516129033, |
|
"grad_norm": 0.09183702414760068, |
|
"learning_rate": 5.777601974548866e-05, |
|
"loss": 0.0405, |
|
"step": 314 |
|
}, |
|
{ |
|
"epoch": 2.032258064516129, |
|
"grad_norm": 0.10138990147374953, |
|
"learning_rate": 5.709592587861637e-05, |
|
"loss": 0.045, |
|
"step": 315 |
|
}, |
|
{ |
|
"epoch": 2.0387096774193547, |
|
"grad_norm": 0.13174518232765414, |
|
"learning_rate": 5.6418255515358486e-05, |
|
"loss": 0.0417, |
|
"step": 316 |
|
}, |
|
{ |
|
"epoch": 2.0451612903225804, |
|
"grad_norm": 0.11134581768568716, |
|
"learning_rate": 5.574304693498346e-05, |
|
"loss": 0.0351, |
|
"step": 317 |
|
}, |
|
{ |
|
"epoch": 2.0516129032258066, |
|
"grad_norm": 0.10372734844639428, |
|
"learning_rate": 5.507033827770225e-05, |
|
"loss": 0.0458, |
|
"step": 318 |
|
}, |
|
{ |
|
"epoch": 2.0580645161290323, |
|
"grad_norm": 0.09893496819685892, |
|
"learning_rate": 5.4400167542513636e-05, |
|
"loss": 0.0421, |
|
"step": 319 |
|
}, |
|
{ |
|
"epoch": 2.064516129032258, |
|
"grad_norm": 0.1544849048513997, |
|
"learning_rate": 5.3732572585057974e-05, |
|
"loss": 0.0449, |
|
"step": 320 |
|
}, |
|
{ |
|
"epoch": 2.0709677419354837, |
|
"grad_norm": 0.12211629106983518, |
|
"learning_rate": 5.306759111547881e-05, |
|
"loss": 0.0341, |
|
"step": 321 |
|
}, |
|
{ |
|
"epoch": 2.07741935483871, |
|
"grad_norm": 0.1519039459170261, |
|
"learning_rate": 5.240526069629265e-05, |
|
"loss": 0.0477, |
|
"step": 322 |
|
}, |
|
{ |
|
"epoch": 2.0838709677419356, |
|
"grad_norm": 0.10882973801143421, |
|
"learning_rate": 5.174561874026741e-05, |
|
"loss": 0.0376, |
|
"step": 323 |
|
}, |
|
{ |
|
"epoch": 2.0903225806451613, |
|
"grad_norm": 0.09785322895617321, |
|
"learning_rate": 5.108870250830882e-05, |
|
"loss": 0.0305, |
|
"step": 324 |
|
}, |
|
{ |
|
"epoch": 2.096774193548387, |
|
"grad_norm": 0.14990301799413153, |
|
"learning_rate": 5.0434549107355944e-05, |
|
"loss": 0.0409, |
|
"step": 325 |
|
}, |
|
{ |
|
"epoch": 2.1032258064516127, |
|
"grad_norm": 0.1915279670852206, |
|
"learning_rate": 4.978319548828504e-05, |
|
"loss": 0.045, |
|
"step": 326 |
|
}, |
|
{ |
|
"epoch": 2.109677419354839, |
|
"grad_norm": 0.15445162243894608, |
|
"learning_rate": 4.9134678443822166e-05, |
|
"loss": 0.0334, |
|
"step": 327 |
|
}, |
|
{ |
|
"epoch": 2.1161290322580646, |
|
"grad_norm": 0.34562522599625123, |
|
"learning_rate": 4.8489034606465225e-05, |
|
"loss": 0.0395, |
|
"step": 328 |
|
}, |
|
{ |
|
"epoch": 2.1225806451612903, |
|
"grad_norm": 0.3183911745576579, |
|
"learning_rate": 4.784630044641435e-05, |
|
"loss": 0.0546, |
|
"step": 329 |
|
}, |
|
{ |
|
"epoch": 2.129032258064516, |
|
"grad_norm": 0.1619196710614726, |
|
"learning_rate": 4.7206512269512124e-05, |
|
"loss": 0.0446, |
|
"step": 330 |
|
}, |
|
{ |
|
"epoch": 2.135483870967742, |
|
"grad_norm": 0.24657637060341897, |
|
"learning_rate": 4.65697062151927e-05, |
|
"loss": 0.0349, |
|
"step": 331 |
|
}, |
|
{ |
|
"epoch": 2.141935483870968, |
|
"grad_norm": 0.11723613581448042, |
|
"learning_rate": 4.593591825444028e-05, |
|
"loss": 0.0337, |
|
"step": 332 |
|
}, |
|
{ |
|
"epoch": 2.1483870967741936, |
|
"grad_norm": 0.12413977056977685, |
|
"learning_rate": 4.530518418775733e-05, |
|
"loss": 0.0355, |
|
"step": 333 |
|
}, |
|
{ |
|
"epoch": 2.1548387096774193, |
|
"grad_norm": 0.1368012687546921, |
|
"learning_rate": 4.4677539643142454e-05, |
|
"loss": 0.0369, |
|
"step": 334 |
|
}, |
|
{ |
|
"epoch": 2.161290322580645, |
|
"grad_norm": 0.132092425970093, |
|
"learning_rate": 4.40530200740777e-05, |
|
"loss": 0.0346, |
|
"step": 335 |
|
}, |
|
{ |
|
"epoch": 2.167741935483871, |
|
"grad_norm": 0.17255652388608525, |
|
"learning_rate": 4.343166075752605e-05, |
|
"loss": 0.0425, |
|
"step": 336 |
|
}, |
|
{ |
|
"epoch": 2.174193548387097, |
|
"grad_norm": 0.10537625462278236, |
|
"learning_rate": 4.281349679193861e-05, |
|
"loss": 0.0363, |
|
"step": 337 |
|
}, |
|
{ |
|
"epoch": 2.1806451612903226, |
|
"grad_norm": 0.08446560264535163, |
|
"learning_rate": 4.2198563095272116e-05, |
|
"loss": 0.0303, |
|
"step": 338 |
|
}, |
|
{ |
|
"epoch": 2.1870967741935483, |
|
"grad_norm": 0.11155624490505092, |
|
"learning_rate": 4.158689440301657e-05, |
|
"loss": 0.0424, |
|
"step": 339 |
|
}, |
|
{ |
|
"epoch": 2.193548387096774, |
|
"grad_norm": 0.10614591478252769, |
|
"learning_rate": 4.097852526623307e-05, |
|
"loss": 0.0374, |
|
"step": 340 |
|
}, |
|
{ |
|
"epoch": 2.2, |
|
"grad_norm": 0.15380955207075353, |
|
"learning_rate": 4.0373490049602204e-05, |
|
"loss": 0.0506, |
|
"step": 341 |
|
}, |
|
{ |
|
"epoch": 2.206451612903226, |
|
"grad_norm": 0.11905414571153279, |
|
"learning_rate": 3.977182292948283e-05, |
|
"loss": 0.0517, |
|
"step": 342 |
|
}, |
|
{ |
|
"epoch": 2.2129032258064516, |
|
"grad_norm": 0.14425425935781702, |
|
"learning_rate": 3.9173557891981573e-05, |
|
"loss": 0.0467, |
|
"step": 343 |
|
}, |
|
{ |
|
"epoch": 2.2193548387096773, |
|
"grad_norm": 0.12453230757386329, |
|
"learning_rate": 3.857872873103322e-05, |
|
"loss": 0.0435, |
|
"step": 344 |
|
}, |
|
{ |
|
"epoch": 2.225806451612903, |
|
"grad_norm": 0.10038267160538758, |
|
"learning_rate": 3.7987369046491684e-05, |
|
"loss": 0.0387, |
|
"step": 345 |
|
}, |
|
{ |
|
"epoch": 2.232258064516129, |
|
"grad_norm": 0.1438025015993235, |
|
"learning_rate": 3.7399512242231995e-05, |
|
"loss": 0.0346, |
|
"step": 346 |
|
}, |
|
{ |
|
"epoch": 2.238709677419355, |
|
"grad_norm": 0.26072179652296745, |
|
"learning_rate": 3.6815191524263624e-05, |
|
"loss": 0.0545, |
|
"step": 347 |
|
}, |
|
{ |
|
"epoch": 2.2451612903225806, |
|
"grad_norm": 0.08770511437142381, |
|
"learning_rate": 3.623443989885462e-05, |
|
"loss": 0.0327, |
|
"step": 348 |
|
}, |
|
{ |
|
"epoch": 2.2516129032258063, |
|
"grad_norm": 0.12301181296016188, |
|
"learning_rate": 3.565729017066729e-05, |
|
"loss": 0.039, |
|
"step": 349 |
|
}, |
|
{ |
|
"epoch": 2.258064516129032, |
|
"grad_norm": 0.13814667341858822, |
|
"learning_rate": 3.508377494090521e-05, |
|
"loss": 0.0356, |
|
"step": 350 |
|
}, |
|
{ |
|
"epoch": 2.264516129032258, |
|
"grad_norm": 0.12413533495298362, |
|
"learning_rate": 3.45139266054715e-05, |
|
"loss": 0.0415, |
|
"step": 351 |
|
}, |
|
{ |
|
"epoch": 2.270967741935484, |
|
"grad_norm": 0.11031185879435731, |
|
"learning_rate": 3.394777735313919e-05, |
|
"loss": 0.0363, |
|
"step": 352 |
|
}, |
|
{ |
|
"epoch": 2.2774193548387096, |
|
"grad_norm": 0.11353520726841541, |
|
"learning_rate": 3.338535916373266e-05, |
|
"loss": 0.0376, |
|
"step": 353 |
|
}, |
|
{ |
|
"epoch": 2.2838709677419353, |
|
"grad_norm": 0.23000368159173218, |
|
"learning_rate": 3.2826703806321525e-05, |
|
"loss": 0.0444, |
|
"step": 354 |
|
}, |
|
{ |
|
"epoch": 2.2903225806451615, |
|
"grad_norm": 0.10390022981327145, |
|
"learning_rate": 3.227184283742591e-05, |
|
"loss": 0.0308, |
|
"step": 355 |
|
}, |
|
{ |
|
"epoch": 2.296774193548387, |
|
"grad_norm": 0.14881153611607173, |
|
"learning_rate": 3.17208075992339e-05, |
|
"loss": 0.0428, |
|
"step": 356 |
|
}, |
|
{ |
|
"epoch": 2.303225806451613, |
|
"grad_norm": 0.22847373170988944, |
|
"learning_rate": 3.117362921783134e-05, |
|
"loss": 0.0449, |
|
"step": 357 |
|
}, |
|
{ |
|
"epoch": 2.3096774193548386, |
|
"grad_norm": 0.1261660214676178, |
|
"learning_rate": 3.063033860144339e-05, |
|
"loss": 0.0354, |
|
"step": 358 |
|
}, |
|
{ |
|
"epoch": 2.3161290322580643, |
|
"grad_norm": 0.16508383182402034, |
|
"learning_rate": 3.0090966438688772e-05, |
|
"loss": 0.0404, |
|
"step": 359 |
|
}, |
|
{ |
|
"epoch": 2.3225806451612905, |
|
"grad_norm": 0.14003607495149023, |
|
"learning_rate": 2.9555543196846292e-05, |
|
"loss": 0.0417, |
|
"step": 360 |
|
}, |
|
{ |
|
"epoch": 2.329032258064516, |
|
"grad_norm": 0.14067522061651452, |
|
"learning_rate": 2.9024099120133673e-05, |
|
"loss": 0.0325, |
|
"step": 361 |
|
}, |
|
{ |
|
"epoch": 2.335483870967742, |
|
"grad_norm": 0.17575236098692995, |
|
"learning_rate": 2.8496664227999415e-05, |
|
"loss": 0.0533, |
|
"step": 362 |
|
}, |
|
{ |
|
"epoch": 2.3419354838709676, |
|
"grad_norm": 0.11116834491408094, |
|
"learning_rate": 2.7973268313426837e-05, |
|
"loss": 0.0347, |
|
"step": 363 |
|
}, |
|
{ |
|
"epoch": 2.3483870967741938, |
|
"grad_norm": 0.16113939577201203, |
|
"learning_rate": 2.745394094125141e-05, |
|
"loss": 0.0472, |
|
"step": 364 |
|
}, |
|
{ |
|
"epoch": 2.3548387096774195, |
|
"grad_norm": 0.10955543396615161, |
|
"learning_rate": 2.6938711446490606e-05, |
|
"loss": 0.0357, |
|
"step": 365 |
|
}, |
|
{ |
|
"epoch": 2.361290322580645, |
|
"grad_norm": 0.17969798691111394, |
|
"learning_rate": 2.6427608932686843e-05, |
|
"loss": 0.0584, |
|
"step": 366 |
|
}, |
|
{ |
|
"epoch": 2.367741935483871, |
|
"grad_norm": 0.08542932983952008, |
|
"learning_rate": 2.5920662270263653e-05, |
|
"loss": 0.032, |
|
"step": 367 |
|
}, |
|
{ |
|
"epoch": 2.3741935483870966, |
|
"grad_norm": 0.14296192462496796, |
|
"learning_rate": 2.5417900094894744e-05, |
|
"loss": 0.0463, |
|
"step": 368 |
|
}, |
|
{ |
|
"epoch": 2.3806451612903228, |
|
"grad_norm": 0.11857064948287055, |
|
"learning_rate": 2.4919350805886577e-05, |
|
"loss": 0.0328, |
|
"step": 369 |
|
}, |
|
{ |
|
"epoch": 2.3870967741935485, |
|
"grad_norm": 0.1614783419773078, |
|
"learning_rate": 2.4425042564574184e-05, |
|
"loss": 0.0398, |
|
"step": 370 |
|
}, |
|
{ |
|
"epoch": 2.393548387096774, |
|
"grad_norm": 0.12389155943230613, |
|
"learning_rate": 2.3935003292730296e-05, |
|
"loss": 0.0389, |
|
"step": 371 |
|
}, |
|
{ |
|
"epoch": 2.4, |
|
"grad_norm": 0.11380344182842446, |
|
"learning_rate": 2.344926067098836e-05, |
|
"loss": 0.0412, |
|
"step": 372 |
|
}, |
|
{ |
|
"epoch": 2.4064516129032256, |
|
"grad_norm": 0.09336252258103142, |
|
"learning_rate": 2.2967842137278706e-05, |
|
"loss": 0.0311, |
|
"step": 373 |
|
}, |
|
{ |
|
"epoch": 2.412903225806452, |
|
"grad_norm": 0.1312469591783424, |
|
"learning_rate": 2.2490774885278908e-05, |
|
"loss": 0.0363, |
|
"step": 374 |
|
}, |
|
{ |
|
"epoch": 2.4193548387096775, |
|
"grad_norm": 0.1928829483525128, |
|
"learning_rate": 2.201808586287757e-05, |
|
"loss": 0.0447, |
|
"step": 375 |
|
}, |
|
{ |
|
"epoch": 2.425806451612903, |
|
"grad_norm": 0.1407497782508695, |
|
"learning_rate": 2.15498017706521e-05, |
|
"loss": 0.0425, |
|
"step": 376 |
|
}, |
|
{ |
|
"epoch": 2.432258064516129, |
|
"grad_norm": 0.12625194439970722, |
|
"learning_rate": 2.1085949060360654e-05, |
|
"loss": 0.0394, |
|
"step": 377 |
|
}, |
|
{ |
|
"epoch": 2.4387096774193546, |
|
"grad_norm": 0.1650711686265322, |
|
"learning_rate": 2.0626553933447734e-05, |
|
"loss": 0.0356, |
|
"step": 378 |
|
}, |
|
{ |
|
"epoch": 2.445161290322581, |
|
"grad_norm": 0.203794329158885, |
|
"learning_rate": 2.01716423395644e-05, |
|
"loss": 0.046, |
|
"step": 379 |
|
}, |
|
{ |
|
"epoch": 2.4516129032258065, |
|
"grad_norm": 0.0983865374506401, |
|
"learning_rate": 1.9721239975102313e-05, |
|
"loss": 0.0383, |
|
"step": 380 |
|
}, |
|
{ |
|
"epoch": 2.458064516129032, |
|
"grad_norm": 0.11475237817508949, |
|
"learning_rate": 1.9275372281742242e-05, |
|
"loss": 0.0341, |
|
"step": 381 |
|
}, |
|
{ |
|
"epoch": 2.464516129032258, |
|
"grad_norm": 0.11696076741124051, |
|
"learning_rate": 1.8834064445016953e-05, |
|
"loss": 0.0351, |
|
"step": 382 |
|
}, |
|
{ |
|
"epoch": 2.4709677419354836, |
|
"grad_norm": 0.11440255447742229, |
|
"learning_rate": 1.839734139288868e-05, |
|
"loss": 0.0344, |
|
"step": 383 |
|
}, |
|
{ |
|
"epoch": 2.47741935483871, |
|
"grad_norm": 0.12722166866147971, |
|
"learning_rate": 1.7965227794340877e-05, |
|
"loss": 0.0327, |
|
"step": 384 |
|
}, |
|
{ |
|
"epoch": 2.4838709677419355, |
|
"grad_norm": 0.11977054711252778, |
|
"learning_rate": 1.753774805798486e-05, |
|
"loss": 0.0374, |
|
"step": 385 |
|
}, |
|
{ |
|
"epoch": 2.490322580645161, |
|
"grad_norm": 0.19041811914083892, |
|
"learning_rate": 1.7114926330680957e-05, |
|
"loss": 0.0448, |
|
"step": 386 |
|
}, |
|
{ |
|
"epoch": 2.496774193548387, |
|
"grad_norm": 0.12908385372034334, |
|
"learning_rate": 1.6696786496174578e-05, |
|
"loss": 0.0421, |
|
"step": 387 |
|
}, |
|
{ |
|
"epoch": 2.5032258064516126, |
|
"grad_norm": 0.11345958947234494, |
|
"learning_rate": 1.6283352173747145e-05, |
|
"loss": 0.0337, |
|
"step": 388 |
|
}, |
|
{ |
|
"epoch": 2.509677419354839, |
|
"grad_norm": 0.13882462080577593, |
|
"learning_rate": 1.587464671688187e-05, |
|
"loss": 0.0399, |
|
"step": 389 |
|
}, |
|
{ |
|
"epoch": 2.5161290322580645, |
|
"grad_norm": 0.15241110627485502, |
|
"learning_rate": 1.5470693211944643e-05, |
|
"loss": 0.0331, |
|
"step": 390 |
|
}, |
|
{ |
|
"epoch": 2.52258064516129, |
|
"grad_norm": 0.10975692065298487, |
|
"learning_rate": 1.5071514476879878e-05, |
|
"loss": 0.0379, |
|
"step": 391 |
|
}, |
|
{ |
|
"epoch": 2.5290322580645164, |
|
"grad_norm": 0.1379096296948562, |
|
"learning_rate": 1.4677133059921632e-05, |
|
"loss": 0.0362, |
|
"step": 392 |
|
}, |
|
{ |
|
"epoch": 2.535483870967742, |
|
"grad_norm": 0.10190107251103887, |
|
"learning_rate": 1.4287571238320053e-05, |
|
"loss": 0.0371, |
|
"step": 393 |
|
}, |
|
{ |
|
"epoch": 2.541935483870968, |
|
"grad_norm": 0.0875699618751271, |
|
"learning_rate": 1.3902851017082864e-05, |
|
"loss": 0.0335, |
|
"step": 394 |
|
}, |
|
{ |
|
"epoch": 2.5483870967741935, |
|
"grad_norm": 0.13655405710076232, |
|
"learning_rate": 1.3522994127732414e-05, |
|
"loss": 0.0388, |
|
"step": 395 |
|
}, |
|
{ |
|
"epoch": 2.554838709677419, |
|
"grad_norm": 0.11073272357568731, |
|
"learning_rate": 1.3148022027078222e-05, |
|
"loss": 0.0366, |
|
"step": 396 |
|
}, |
|
{ |
|
"epoch": 2.5612903225806454, |
|
"grad_norm": 0.11393589351391524, |
|
"learning_rate": 1.2777955896004812e-05, |
|
"loss": 0.0375, |
|
"step": 397 |
|
}, |
|
{ |
|
"epoch": 2.567741935483871, |
|
"grad_norm": 0.1294793091961239, |
|
"learning_rate": 1.2412816638275404e-05, |
|
"loss": 0.0447, |
|
"step": 398 |
|
}, |
|
{ |
|
"epoch": 2.574193548387097, |
|
"grad_norm": 0.12252957295031998, |
|
"learning_rate": 1.2052624879351104e-05, |
|
"loss": 0.036, |
|
"step": 399 |
|
}, |
|
{ |
|
"epoch": 2.5806451612903225, |
|
"grad_norm": 0.10701768631052949, |
|
"learning_rate": 1.1697400965225747e-05, |
|
"loss": 0.0375, |
|
"step": 400 |
|
}, |
|
{ |
|
"epoch": 2.587096774193548, |
|
"grad_norm": 0.1826126683217801, |
|
"learning_rate": 1.134716496127679e-05, |
|
"loss": 0.0398, |
|
"step": 401 |
|
}, |
|
{ |
|
"epoch": 2.5935483870967744, |
|
"grad_norm": 0.12179126729794809, |
|
"learning_rate": 1.1001936651131717e-05, |
|
"loss": 0.0396, |
|
"step": 402 |
|
}, |
|
{ |
|
"epoch": 2.6, |
|
"grad_norm": 0.11972903116479075, |
|
"learning_rate": 1.0661735535550666e-05, |
|
"loss": 0.0336, |
|
"step": 403 |
|
}, |
|
{ |
|
"epoch": 2.606451612903226, |
|
"grad_norm": 0.09797391900877583, |
|
"learning_rate": 1.0326580831324817e-05, |
|
"loss": 0.0291, |
|
"step": 404 |
|
}, |
|
{ |
|
"epoch": 2.6129032258064515, |
|
"grad_norm": 0.08807259670414254, |
|
"learning_rate": 9.996491470190917e-06, |
|
"loss": 0.0318, |
|
"step": 405 |
|
}, |
|
{ |
|
"epoch": 2.6193548387096772, |
|
"grad_norm": 0.12986706205285126, |
|
"learning_rate": 9.671486097761917e-06, |
|
"loss": 0.0408, |
|
"step": 406 |
|
}, |
|
{ |
|
"epoch": 2.6258064516129034, |
|
"grad_norm": 0.1331746828106328, |
|
"learning_rate": 9.351583072473713e-06, |
|
"loss": 0.0427, |
|
"step": 407 |
|
}, |
|
{ |
|
"epoch": 2.632258064516129, |
|
"grad_norm": 0.14912986532560119, |
|
"learning_rate": 9.036800464548157e-06, |
|
"loss": 0.0464, |
|
"step": 408 |
|
}, |
|
{ |
|
"epoch": 2.638709677419355, |
|
"grad_norm": 0.1440292521357826, |
|
"learning_rate": 8.727156054972374e-06, |
|
"loss": 0.0448, |
|
"step": 409 |
|
}, |
|
{ |
|
"epoch": 2.6451612903225805, |
|
"grad_norm": 0.16571651133316453, |
|
"learning_rate": 8.422667334494249e-06, |
|
"loss": 0.0449, |
|
"step": 410 |
|
}, |
|
{ |
|
"epoch": 2.6516129032258062, |
|
"grad_norm": 0.15006924502881824, |
|
"learning_rate": 8.123351502634625e-06, |
|
"loss": 0.0371, |
|
"step": 411 |
|
}, |
|
{ |
|
"epoch": 2.6580645161290324, |
|
"grad_norm": 0.3766926409227814, |
|
"learning_rate": 7.82922546671555e-06, |
|
"loss": 0.0522, |
|
"step": 412 |
|
}, |
|
{ |
|
"epoch": 2.664516129032258, |
|
"grad_norm": 0.1693245533307095, |
|
"learning_rate": 7.54030584090537e-06, |
|
"loss": 0.0523, |
|
"step": 413 |
|
}, |
|
{ |
|
"epoch": 2.670967741935484, |
|
"grad_norm": 0.16933380856734512, |
|
"learning_rate": 7.256608945280319e-06, |
|
"loss": 0.0448, |
|
"step": 414 |
|
}, |
|
{ |
|
"epoch": 2.6774193548387095, |
|
"grad_norm": 0.12372656470631721, |
|
"learning_rate": 6.97815080490245e-06, |
|
"loss": 0.041, |
|
"step": 415 |
|
}, |
|
{ |
|
"epoch": 2.6838709677419352, |
|
"grad_norm": 0.1139014768011305, |
|
"learning_rate": 6.704947148914609e-06, |
|
"loss": 0.034, |
|
"step": 416 |
|
}, |
|
{ |
|
"epoch": 2.6903225806451614, |
|
"grad_norm": 0.14509781428626975, |
|
"learning_rate": 6.437013409651849e-06, |
|
"loss": 0.0289, |
|
"step": 417 |
|
}, |
|
{ |
|
"epoch": 2.696774193548387, |
|
"grad_norm": 0.1403482292705624, |
|
"learning_rate": 6.174364721769743e-06, |
|
"loss": 0.0493, |
|
"step": 418 |
|
}, |
|
{ |
|
"epoch": 2.703225806451613, |
|
"grad_norm": 0.12152856591733673, |
|
"learning_rate": 5.917015921389568e-06, |
|
"loss": 0.0343, |
|
"step": 419 |
|
}, |
|
{ |
|
"epoch": 2.709677419354839, |
|
"grad_norm": 0.10494471511565585, |
|
"learning_rate": 5.664981545260073e-06, |
|
"loss": 0.03, |
|
"step": 420 |
|
}, |
|
{ |
|
"epoch": 2.7161290322580647, |
|
"grad_norm": 0.2176951472197835, |
|
"learning_rate": 5.418275829936537e-06, |
|
"loss": 0.0486, |
|
"step": 421 |
|
}, |
|
{ |
|
"epoch": 2.7225806451612904, |
|
"grad_norm": 0.09677627092271344, |
|
"learning_rate": 5.176912710976467e-06, |
|
"loss": 0.0318, |
|
"step": 422 |
|
}, |
|
{ |
|
"epoch": 2.729032258064516, |
|
"grad_norm": 0.14416553291024004, |
|
"learning_rate": 4.940905822152453e-06, |
|
"loss": 0.0356, |
|
"step": 423 |
|
}, |
|
{ |
|
"epoch": 2.735483870967742, |
|
"grad_norm": 0.10141360651760384, |
|
"learning_rate": 4.710268494682146e-06, |
|
"loss": 0.0346, |
|
"step": 424 |
|
}, |
|
{ |
|
"epoch": 2.741935483870968, |
|
"grad_norm": 0.13463566674635727, |
|
"learning_rate": 4.485013756475076e-06, |
|
"loss": 0.0443, |
|
"step": 425 |
|
}, |
|
{ |
|
"epoch": 2.7483870967741937, |
|
"grad_norm": 0.26842778961550184, |
|
"learning_rate": 4.2651543313968145e-06, |
|
"loss": 0.0445, |
|
"step": 426 |
|
}, |
|
{ |
|
"epoch": 2.7548387096774194, |
|
"grad_norm": 0.16014595507517537, |
|
"learning_rate": 4.050702638550275e-06, |
|
"loss": 0.0469, |
|
"step": 427 |
|
}, |
|
{ |
|
"epoch": 2.761290322580645, |
|
"grad_norm": 0.2690022963645026, |
|
"learning_rate": 3.841670791574137e-06, |
|
"loss": 0.0423, |
|
"step": 428 |
|
}, |
|
{ |
|
"epoch": 2.767741935483871, |
|
"grad_norm": 0.2323526833237967, |
|
"learning_rate": 3.638070597958665e-06, |
|
"loss": 0.035, |
|
"step": 429 |
|
}, |
|
{ |
|
"epoch": 2.774193548387097, |
|
"grad_norm": 0.11643458974488918, |
|
"learning_rate": 3.4399135583787043e-06, |
|
"loss": 0.0401, |
|
"step": 430 |
|
}, |
|
{ |
|
"epoch": 2.7806451612903227, |
|
"grad_norm": 0.12278976372135998, |
|
"learning_rate": 3.2472108660439706e-06, |
|
"loss": 0.04, |
|
"step": 431 |
|
}, |
|
{ |
|
"epoch": 2.7870967741935484, |
|
"grad_norm": 0.11369002878299092, |
|
"learning_rate": 3.059973406066963e-06, |
|
"loss": 0.0335, |
|
"step": 432 |
|
}, |
|
{ |
|
"epoch": 2.793548387096774, |
|
"grad_norm": 0.1426764505452139, |
|
"learning_rate": 2.878211754847926e-06, |
|
"loss": 0.0431, |
|
"step": 433 |
|
}, |
|
{ |
|
"epoch": 2.8, |
|
"grad_norm": 0.17614309627774977, |
|
"learning_rate": 2.7019361794775156e-06, |
|
"loss": 0.0336, |
|
"step": 434 |
|
}, |
|
{ |
|
"epoch": 2.806451612903226, |
|
"grad_norm": 0.13119646329419238, |
|
"learning_rate": 2.5311566371568507e-06, |
|
"loss": 0.0388, |
|
"step": 435 |
|
}, |
|
{ |
|
"epoch": 2.8129032258064517, |
|
"grad_norm": 0.2274358694692015, |
|
"learning_rate": 2.365882774634998e-06, |
|
"loss": 0.0483, |
|
"step": 436 |
|
}, |
|
{ |
|
"epoch": 2.8193548387096774, |
|
"grad_norm": 0.16809646480030987, |
|
"learning_rate": 2.206123927664161e-06, |
|
"loss": 0.0428, |
|
"step": 437 |
|
}, |
|
{ |
|
"epoch": 2.825806451612903, |
|
"grad_norm": 0.08861174696327136, |
|
"learning_rate": 2.0518891204722168e-06, |
|
"loss": 0.0297, |
|
"step": 438 |
|
}, |
|
{ |
|
"epoch": 2.832258064516129, |
|
"grad_norm": 0.1516310249747229, |
|
"learning_rate": 1.903187065253076e-06, |
|
"loss": 0.0486, |
|
"step": 439 |
|
}, |
|
{ |
|
"epoch": 2.838709677419355, |
|
"grad_norm": 0.1154092981393085, |
|
"learning_rate": 1.7600261616745106e-06, |
|
"loss": 0.044, |
|
"step": 440 |
|
}, |
|
{ |
|
"epoch": 2.8451612903225807, |
|
"grad_norm": 0.14626098217303432, |
|
"learning_rate": 1.6224144964036681e-06, |
|
"loss": 0.045, |
|
"step": 441 |
|
}, |
|
{ |
|
"epoch": 2.8516129032258064, |
|
"grad_norm": 0.09272906884038755, |
|
"learning_rate": 1.4903598426503241e-06, |
|
"loss": 0.0305, |
|
"step": 442 |
|
}, |
|
{ |
|
"epoch": 2.858064516129032, |
|
"grad_norm": 0.1688562089326058, |
|
"learning_rate": 1.3638696597277679e-06, |
|
"loss": 0.0402, |
|
"step": 443 |
|
}, |
|
{ |
|
"epoch": 2.864516129032258, |
|
"grad_norm": 0.11628209549950047, |
|
"learning_rate": 1.2429510926314836e-06, |
|
"loss": 0.0309, |
|
"step": 444 |
|
}, |
|
{ |
|
"epoch": 2.870967741935484, |
|
"grad_norm": 0.14018105443781761, |
|
"learning_rate": 1.1276109716355287e-06, |
|
"loss": 0.0466, |
|
"step": 445 |
|
}, |
|
{ |
|
"epoch": 2.8774193548387097, |
|
"grad_norm": 0.12354234788520546, |
|
"learning_rate": 1.0178558119067315e-06, |
|
"loss": 0.0315, |
|
"step": 446 |
|
}, |
|
{ |
|
"epoch": 2.8838709677419354, |
|
"grad_norm": 0.15472531322652747, |
|
"learning_rate": 9.136918131366412e-07, |
|
"loss": 0.0436, |
|
"step": 447 |
|
}, |
|
{ |
|
"epoch": 2.8903225806451616, |
|
"grad_norm": 0.16727260037004013, |
|
"learning_rate": 8.151248591913518e-07, |
|
"loss": 0.0413, |
|
"step": 448 |
|
}, |
|
{ |
|
"epoch": 2.896774193548387, |
|
"grad_norm": 0.15358557638143366, |
|
"learning_rate": 7.221605177791691e-07, |
|
"loss": 0.0477, |
|
"step": 449 |
|
}, |
|
{ |
|
"epoch": 2.903225806451613, |
|
"grad_norm": 0.11899732193116695, |
|
"learning_rate": 6.348040401360833e-07, |
|
"loss": 0.0337, |
|
"step": 450 |
|
}, |
|
{ |
|
"epoch": 2.9096774193548387, |
|
"grad_norm": 0.19561410636845064, |
|
"learning_rate": 5.530603607290851e-07, |
|
"loss": 0.0556, |
|
"step": 451 |
|
}, |
|
{ |
|
"epoch": 2.9161290322580644, |
|
"grad_norm": 0.14981745610583072, |
|
"learning_rate": 4.76934096977566e-07, |
|
"loss": 0.0372, |
|
"step": 452 |
|
}, |
|
{ |
|
"epoch": 2.9225806451612906, |
|
"grad_norm": 0.1039480982798509, |
|
"learning_rate": 4.0642954899238197e-07, |
|
"loss": 0.0343, |
|
"step": 453 |
|
}, |
|
{ |
|
"epoch": 2.9290322580645163, |
|
"grad_norm": 0.11333580055218463, |
|
"learning_rate": 3.415506993330153e-07, |
|
"loss": 0.0388, |
|
"step": 454 |
|
}, |
|
{ |
|
"epoch": 2.935483870967742, |
|
"grad_norm": 0.10481858385197364, |
|
"learning_rate": 2.8230121278257637e-07, |
|
"loss": 0.0313, |
|
"step": 455 |
|
}, |
|
{ |
|
"epoch": 2.9419354838709677, |
|
"grad_norm": 0.15064675470229316, |
|
"learning_rate": 2.2868443614082469e-07, |
|
"loss": 0.0449, |
|
"step": 456 |
|
}, |
|
{ |
|
"epoch": 2.9483870967741934, |
|
"grad_norm": 0.13918413557689208, |
|
"learning_rate": 1.8070339803509807e-07, |
|
"loss": 0.0414, |
|
"step": 457 |
|
}, |
|
{ |
|
"epoch": 2.9548387096774196, |
|
"grad_norm": 0.15721864030858848, |
|
"learning_rate": 1.3836080874926049e-07, |
|
"loss": 0.0342, |
|
"step": 458 |
|
}, |
|
{ |
|
"epoch": 2.9612903225806453, |
|
"grad_norm": 0.15411456561887094, |
|
"learning_rate": 1.0165906007056914e-07, |
|
"loss": 0.0418, |
|
"step": 459 |
|
}, |
|
{ |
|
"epoch": 2.967741935483871, |
|
"grad_norm": 0.1513944041768792, |
|
"learning_rate": 7.060022515460451e-08, |
|
"loss": 0.0364, |
|
"step": 460 |
|
}, |
|
{ |
|
"epoch": 2.9741935483870967, |
|
"grad_norm": 0.22512284396579382, |
|
"learning_rate": 4.518605840815315e-08, |
|
"loss": 0.0447, |
|
"step": 461 |
|
}, |
|
{ |
|
"epoch": 2.9806451612903224, |
|
"grad_norm": 0.1529295717768871, |
|
"learning_rate": 2.5417995390086824e-08, |
|
"loss": 0.0449, |
|
"step": 462 |
|
}, |
|
{ |
|
"epoch": 2.9870967741935486, |
|
"grad_norm": 0.13942712092352366, |
|
"learning_rate": 1.129715273033849e-08, |
|
"loss": 0.0367, |
|
"step": 463 |
|
}, |
|
{ |
|
"epoch": 2.9935483870967743, |
|
"grad_norm": 0.13569899513050301, |
|
"learning_rate": 2.824328066730608e-09, |
|
"loss": 0.0398, |
|
"step": 464 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"grad_norm": 0.11004047385534296, |
|
"learning_rate": 0.0, |
|
"loss": 0.0305, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"eval_loss": 0.2107405662536621, |
|
"eval_runtime": 25.5157, |
|
"eval_samples_per_second": 5.134, |
|
"eval_steps_per_second": 0.666, |
|
"step": 465 |
|
}, |
|
{ |
|
"epoch": 3.0, |
|
"step": 465, |
|
"total_flos": 261374226563072.0, |
|
"train_loss": 0.09038178783751304, |
|
"train_runtime": 3542.1658, |
|
"train_samples_per_second": 2.097, |
|
"train_steps_per_second": 0.131 |
|
} |
|
], |
|
"logging_steps": 1, |
|
"max_steps": 465, |
|
"num_input_tokens_seen": 0, |
|
"num_train_epochs": 3, |
|
"save_steps": 500, |
|
"stateful_callbacks": { |
|
"TrainerControl": { |
|
"args": { |
|
"should_epoch_stop": false, |
|
"should_evaluate": false, |
|
"should_log": false, |
|
"should_save": true, |
|
"should_training_stop": true |
|
}, |
|
"attributes": {} |
|
} |
|
}, |
|
"total_flos": 261374226563072.0, |
|
"train_batch_size": 2, |
|
"trial_name": null, |
|
"trial_params": null |
|
} |
|
|