{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 2.385211687537269,
  "eval_steps": 500,
  "global_step": 1000,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.011926058437686345,
      "grad_norm": 0.5661849975585938,
      "learning_rate": 4.999804802810596e-05,
      "loss": 1.4421,
      "step": 5
    },
    {
      "epoch": 0.02385211687537269,
      "grad_norm": 0.5908129811286926,
      "learning_rate": 4.999219241723937e-05,
      "loss": 1.3306,
      "step": 10
    },
    {
      "epoch": 0.03577817531305903,
      "grad_norm": 0.6965810060501099,
      "learning_rate": 4.998243408179925e-05,
      "loss": 1.1873,
      "step": 15
    },
    {
      "epoch": 0.04770423375074538,
      "grad_norm": 0.5752276182174683,
      "learning_rate": 4.9968774545625344e-05,
      "loss": 1.0583,
      "step": 20
    },
    {
      "epoch": 0.05963029218843172,
      "grad_norm": 0.6488135457038879,
      "learning_rate": 4.9951215941760075e-05,
      "loss": 1.0045,
      "step": 25
    },
    {
      "epoch": 0.07155635062611806,
      "grad_norm": 0.7977299094200134,
      "learning_rate": 4.992976101211558e-05,
      "loss": 0.9836,
      "step": 30
    },
    {
      "epoch": 0.08348240906380441,
      "grad_norm": 0.8842712044715881,
      "learning_rate": 4.99044131070454e-05,
      "loss": 0.8925,
      "step": 35
    },
    {
      "epoch": 0.09540846750149076,
      "grad_norm": 0.798170804977417,
      "learning_rate": 4.987517618482142e-05,
      "loss": 0.8783,
      "step": 40
    },
    {
      "epoch": 0.1073345259391771,
      "grad_norm": 0.8196467757225037,
      "learning_rate": 4.984205481101565e-05,
      "loss": 0.8681,
      "step": 45
    },
    {
      "epoch": 0.11926058437686345,
      "grad_norm": 0.9973539113998413,
      "learning_rate": 4.980505415778738e-05,
      "loss": 0.8251,
      "step": 50
    },
    {
      "epoch": 0.13118664281454978,
      "grad_norm": 0.8836011290550232,
      "learning_rate": 4.97641800030754e-05,
      "loss": 0.8079,
      "step": 55
    },
    {
      "epoch": 0.14311270125223613,
      "grad_norm": 0.9255298972129822,
      "learning_rate": 4.971943872969582e-05,
      "loss": 0.7769,
      "step": 60
    },
    {
      "epoch": 0.15503875968992248,
      "grad_norm": 0.9712087512016296,
      "learning_rate": 4.967083732434529e-05,
      "loss": 0.777,
      "step": 65
    },
    {
      "epoch": 0.16696481812760883,
      "grad_norm": 0.7609415054321289,
      "learning_rate": 4.961838337650997e-05,
      "loss": 0.7303,
      "step": 70
    },
    {
      "epoch": 0.17889087656529518,
      "grad_norm": 0.8297658562660217,
      "learning_rate": 4.9562085077280443e-05,
      "loss": 0.7442,
      "step": 75
    },
    {
      "epoch": 0.19081693500298152,
      "grad_norm": 1.01498281955719,
      "learning_rate": 4.950195121807251e-05,
      "loss": 0.7077,
      "step": 80
    },
    {
      "epoch": 0.20274299344066785,
      "grad_norm": 0.9629709720611572,
      "learning_rate": 4.943799118925443e-05,
      "loss": 0.673,
      "step": 85
    },
    {
      "epoch": 0.2146690518783542,
      "grad_norm": 0.9121557474136353,
      "learning_rate": 4.937021497868047e-05,
      "loss": 0.7109,
      "step": 90
    },
    {
      "epoch": 0.22659511031604054,
      "grad_norm": 0.841573178768158,
      "learning_rate": 4.9298633170131304e-05,
      "loss": 0.6798,
      "step": 95
    },
    {
      "epoch": 0.2385211687537269,
      "grad_norm": 0.9693652391433716,
      "learning_rate": 4.922325694166119e-05,
      "loss": 0.6471,
      "step": 100
    },
    {
      "epoch": 0.2504472271914132,
      "grad_norm": 1.1198012828826904,
      "learning_rate": 4.9144098063852485e-05,
      "loss": 0.6456,
      "step": 105
    },
    {
      "epoch": 0.26237328562909956,
      "grad_norm": 1.1725982427597046,
      "learning_rate": 4.9061168897977564e-05,
      "loss": 0.6462,
      "step": 110
    },
    {
      "epoch": 0.2742993440667859,
      "grad_norm": 0.9855665564537048,
      "learning_rate": 4.8974482394068514e-05,
      "loss": 0.6594,
      "step": 115
    },
    {
      "epoch": 0.28622540250447226,
      "grad_norm": 0.8321542739868164,
      "learning_rate": 4.888405208889486e-05,
      "loss": 0.6372,
      "step": 120
    },
    {
      "epoch": 0.2981514609421586,
      "grad_norm": 0.9451864957809448,
      "learning_rate": 4.878989210384972e-05,
      "loss": 0.6594,
      "step": 125
    },
    {
      "epoch": 0.31007751937984496,
      "grad_norm": 1.1336216926574707,
      "learning_rate": 4.869201714274467e-05,
      "loss": 0.6544,
      "step": 130
    },
    {
      "epoch": 0.3220035778175313,
      "grad_norm": 0.8466928601264954,
      "learning_rate": 4.8590442489513543e-05,
      "loss": 0.6275,
      "step": 135
    },
    {
      "epoch": 0.33392963625521765,
      "grad_norm": 0.979365885257721,
      "learning_rate": 4.8485184005825815e-05,
      "loss": 0.6348,
      "step": 140
    },
    {
      "epoch": 0.345855694692904,
      "grad_norm": 0.9159572124481201,
      "learning_rate": 4.837625812860961e-05,
      "loss": 0.6045,
      "step": 145
    },
    {
      "epoch": 0.35778175313059035,
      "grad_norm": 1.0542089939117432,
      "learning_rate": 4.8263681867485e-05,
      "loss": 0.6487,
      "step": 150
    },
    {
      "epoch": 0.3697078115682767,
      "grad_norm": 1.0264719724655151,
      "learning_rate": 4.814747280210782e-05,
      "loss": 0.6026,
      "step": 155
    },
    {
      "epoch": 0.38163387000596305,
      "grad_norm": 0.9389889240264893,
      "learning_rate": 4.80276490794244e-05,
      "loss": 0.6083,
      "step": 160
    },
    {
      "epoch": 0.3935599284436494,
      "grad_norm": 1.1090476512908936,
      "learning_rate": 4.790422941083786e-05,
      "loss": 0.5957,
      "step": 165
    },
    {
      "epoch": 0.4054859868813357,
      "grad_norm": 1.0268605947494507,
      "learning_rate": 4.7777233069286154e-05,
      "loss": 0.6538,
      "step": 170
    },
    {
      "epoch": 0.41741204531902204,
      "grad_norm": 1.240955114364624,
      "learning_rate": 4.7646679886232414e-05,
      "loss": 0.62,
      "step": 175
    },
    {
      "epoch": 0.4293381037567084,
      "grad_norm": 1.2029043436050415,
      "learning_rate": 4.7512590248568163e-05,
      "loss": 0.6461,
      "step": 180
    },
    {
      "epoch": 0.44126416219439474,
      "grad_norm": 1.0567660331726074,
      "learning_rate": 4.7374985095429725e-05,
      "loss": 0.633,
      "step": 185
    },
    {
      "epoch": 0.4531902206320811,
      "grad_norm": 0.928354024887085,
      "learning_rate": 4.723388591492841e-05,
      "loss": 0.633,
      "step": 190
    },
    {
      "epoch": 0.46511627906976744,
      "grad_norm": 0.8453586101531982,
      "learning_rate": 4.708931474079499e-05,
      "loss": 0.6082,
      "step": 195
    },
    {
      "epoch": 0.4770423375074538,
      "grad_norm": 1.33816397190094,
      "learning_rate": 4.6941294148938954e-05,
      "loss": 0.6176,
      "step": 200
    },
    {
      "epoch": 0.48896839594514013,
      "grad_norm": 0.9532414078712463,
      "learning_rate": 4.678984725392309e-05,
      "loss": 0.6168,
      "step": 205
    },
    {
      "epoch": 0.5008944543828264,
      "grad_norm": 0.9017360210418701,
      "learning_rate": 4.6634997705354024e-05,
      "loss": 0.6454,
      "step": 210
    },
    {
      "epoch": 0.5128205128205128,
      "grad_norm": 1.1095216274261475,
      "learning_rate": 4.6476769684189065e-05,
      "loss": 0.5738,
      "step": 215
    },
    {
      "epoch": 0.5247465712581991,
      "grad_norm": 0.8017715811729431,
      "learning_rate": 4.631518789896023e-05,
      "loss": 0.5978,
      "step": 220
    },
    {
      "epoch": 0.5366726296958855,
      "grad_norm": 1.0233415365219116,
      "learning_rate": 4.6150277581915804e-05,
      "loss": 0.6375,
      "step": 225
    },
    {
      "epoch": 0.5485986881335718,
      "grad_norm": 0.9298461675643921,
      "learning_rate": 4.598206448508007e-05,
      "loss": 0.606,
      "step": 230
    },
    {
      "epoch": 0.5605247465712582,
      "grad_norm": 0.9300760626792908,
      "learning_rate": 4.581057487623204e-05,
      "loss": 0.6261,
      "step": 235
    },
    {
      "epoch": 0.5724508050089445,
      "grad_norm": 1.4629548788070679,
      "learning_rate": 4.5635835534803406e-05,
      "loss": 0.5877,
      "step": 240
    },
    {
      "epoch": 0.5843768634466309,
      "grad_norm": 0.916437029838562,
      "learning_rate": 4.545787374769686e-05,
      "loss": 0.5821,
      "step": 245
    },
    {
      "epoch": 0.5963029218843172,
      "grad_norm": 0.9588505625724792,
      "learning_rate": 4.527671730502491e-05,
      "loss": 0.6126,
      "step": 250
    },
    {
      "epoch": 0.6082289803220036,
      "grad_norm": 1.068697214126587,
      "learning_rate": 4.5092394495770335e-05,
      "loss": 0.6034,
      "step": 255
    },
    {
      "epoch": 0.6201550387596899,
      "grad_norm": 0.816047191619873,
      "learning_rate": 4.490493410336857e-05,
      "loss": 0.6088,
      "step": 260
    },
    {
      "epoch": 0.6320810971973763,
      "grad_norm": 0.8465267419815063,
      "learning_rate": 4.4714365401213e-05,
      "loss": 0.6067,
      "step": 265
    },
    {
      "epoch": 0.6440071556350626,
      "grad_norm": 0.9819373488426208,
      "learning_rate": 4.4520718148083665e-05,
      "loss": 0.6334,
      "step": 270
    },
    {
      "epoch": 0.655933214072749,
      "grad_norm": 0.7895939350128174,
      "learning_rate": 4.43240225835002e-05,
      "loss": 0.6093,
      "step": 275
    },
    {
      "epoch": 0.6678592725104353,
      "grad_norm": 1.0516682863235474,
      "learning_rate": 4.41243094229997e-05,
      "loss": 0.5939,
      "step": 280
    },
    {
      "epoch": 0.6797853309481217,
      "grad_norm": 0.8521412014961243,
      "learning_rate": 4.392160985334027e-05,
      "loss": 0.5483,
      "step": 285
    },
    {
      "epoch": 0.691711389385808,
      "grad_norm": 0.847909152507782,
      "learning_rate": 4.371595552763093e-05,
      "loss": 0.5946,
      "step": 290
    },
    {
      "epoch": 0.7036374478234944,
      "grad_norm": 1.1502934694290161,
      "learning_rate": 4.350737856038878e-05,
      "loss": 0.5651,
      "step": 295
    },
    {
      "epoch": 0.7155635062611807,
      "grad_norm": 1.0434402227401733,
      "learning_rate": 4.3295911522524044e-05,
      "loss": 0.5965,
      "step": 300
    },
    {
      "epoch": 0.727489564698867,
      "grad_norm": 1.1517246961593628,
      "learning_rate": 4.308158743625388e-05,
      "loss": 0.5751,
      "step": 305
    },
    {
      "epoch": 0.7394156231365534,
      "grad_norm": 1.1459076404571533,
      "learning_rate": 4.286443976994569e-05,
      "loss": 0.6043,
      "step": 310
    },
    {
      "epoch": 0.7513416815742398,
      "grad_norm": 1.0111889839172363,
      "learning_rate": 4.264450243289079e-05,
      "loss": 0.5948,
      "step": 315
    },
    {
      "epoch": 0.7632677400119261,
      "grad_norm": 1.0213602781295776,
      "learning_rate": 4.2421809770009225e-05,
      "loss": 0.5538,
      "step": 320
    },
    {
      "epoch": 0.7751937984496124,
      "grad_norm": 1.0203680992126465,
      "learning_rate": 4.219639655648651e-05,
      "loss": 0.5696,
      "step": 325
    },
    {
      "epoch": 0.7871198568872988,
      "grad_norm": 0.8697240352630615,
      "learning_rate": 4.196829799234321e-05,
      "loss": 0.5613,
      "step": 330
    },
    {
      "epoch": 0.7990459153249851,
      "grad_norm": 0.8935692310333252,
      "learning_rate": 4.173754969693826e-05,
      "loss": 0.5712,
      "step": 335
    },
    {
      "epoch": 0.8109719737626714,
      "grad_norm": 0.7936813831329346,
      "learning_rate": 4.1504187703406604e-05,
      "loss": 0.5693,
      "step": 340
    },
    {
      "epoch": 0.8228980322003577,
      "grad_norm": 1.0851740837097168,
      "learning_rate": 4.126824845303248e-05,
      "loss": 0.5911,
      "step": 345
    },
    {
      "epoch": 0.8348240906380441,
      "grad_norm": 0.9009485244750977,
      "learning_rate": 4.102976878955869e-05,
      "loss": 0.5551,
      "step": 350
    },
    {
      "epoch": 0.8467501490757304,
      "grad_norm": 1.1172417402267456,
      "learning_rate": 4.0788785953433286e-05,
      "loss": 0.564,
      "step": 355
    },
    {
      "epoch": 0.8586762075134168,
      "grad_norm": 1.2822695970535278,
      "learning_rate": 4.05453375759941e-05,
      "loss": 0.5766,
      "step": 360
    },
    {
      "epoch": 0.8706022659511031,
      "grad_norm": 1.327506422996521,
      "learning_rate": 4.0299461673592376e-05,
      "loss": 0.5289,
      "step": 365
    },
    {
      "epoch": 0.8825283243887895,
      "grad_norm": 1.0519084930419922,
      "learning_rate": 4.0051196641656185e-05,
      "loss": 0.5839,
      "step": 370
    },
    {
      "epoch": 0.8944543828264758,
      "grad_norm": 0.8493732810020447,
      "learning_rate": 3.980058124869469e-05,
      "loss": 0.583,
      "step": 375
    },
    {
      "epoch": 0.9063804412641622,
      "grad_norm": 0.9003544449806213,
      "learning_rate": 3.9547654630244156e-05,
      "loss": 0.5645,
      "step": 380
    },
    {
      "epoch": 0.9183064997018485,
      "grad_norm": 0.8397880792617798,
      "learning_rate": 3.929245628275662e-05,
      "loss": 0.6013,
      "step": 385
    },
    {
      "epoch": 0.9302325581395349,
      "grad_norm": 1.000611424446106,
      "learning_rate": 3.903502605743222e-05,
      "loss": 0.5657,
      "step": 390
    },
    {
      "epoch": 0.9421586165772212,
      "grad_norm": 0.9028176665306091,
      "learning_rate": 3.877540415399612e-05,
      "loss": 0.6077,
      "step": 395
    },
    {
      "epoch": 0.9540846750149076,
      "grad_norm": 0.8427708745002747,
      "learning_rate": 3.851363111442101e-05,
      "loss": 0.5553,
      "step": 400
    },
    {
      "epoch": 0.9660107334525939,
      "grad_norm": 1.1009222269058228,
      "learning_rate": 3.8249747816596136e-05,
      "loss": 0.5806,
      "step": 405
    },
    {
      "epoch": 0.9779367918902803,
      "grad_norm": 1.3395967483520508,
      "learning_rate": 3.7983795467943975e-05,
      "loss": 0.5463,
      "step": 410
    },
    {
      "epoch": 0.9898628503279666,
      "grad_norm": 0.856643557548523,
      "learning_rate": 3.77158155989853e-05,
      "loss": 0.6032,
      "step": 415
    },
    {
      "epoch": 1.0017889087656529,
      "grad_norm": 1.985956072807312,
      "learning_rate": 3.74458500568539e-05,
      "loss": 0.663,
      "step": 420
    },
    {
      "epoch": 1.0137149672033392,
      "grad_norm": 1.099455714225769,
      "learning_rate": 3.717394099876182e-05,
      "loss": 0.5532,
      "step": 425
    },
    {
      "epoch": 1.0256410256410255,
      "grad_norm": 0.8070193529129028,
      "learning_rate": 3.690013088541619e-05,
      "loss": 0.537,
      "step": 430
    },
    {
      "epoch": 1.037567084078712,
      "grad_norm": 1.1234451532363892,
      "learning_rate": 3.662446247438867e-05,
      "loss": 0.5192,
      "step": 435
    },
    {
      "epoch": 1.0494931425163982,
      "grad_norm": 1.0410645008087158,
      "learning_rate": 3.6346978813438464e-05,
      "loss": 0.5525,
      "step": 440
    },
    {
      "epoch": 1.0614192009540846,
      "grad_norm": 1.0676367282867432,
      "learning_rate": 3.606772323379017e-05,
      "loss": 0.5546,
      "step": 445
    },
    {
      "epoch": 1.073345259391771,
      "grad_norm": 1.1925772428512573,
      "learning_rate": 3.57867393433672e-05,
      "loss": 0.5298,
      "step": 450
    },
    {
      "epoch": 1.0852713178294573,
      "grad_norm": 1.0415118932724,
      "learning_rate": 3.55040710199821e-05,
      "loss": 0.5354,
      "step": 455
    },
    {
      "epoch": 1.0971973762671436,
      "grad_norm": 0.9862761497497559,
      "learning_rate": 3.521976240448468e-05,
      "loss": 0.5173,
      "step": 460
    },
    {
      "epoch": 1.10912343470483,
      "grad_norm": 1.0136168003082275,
      "learning_rate": 3.493385789386906e-05,
      "loss": 0.5424,
      "step": 465
    },
    {
      "epoch": 1.1210494931425163,
      "grad_norm": 0.999031126499176,
      "learning_rate": 3.464640213434079e-05,
      "loss": 0.574,
      "step": 470
    },
    {
      "epoch": 1.1329755515802027,
      "grad_norm": 1.0496968030929565,
      "learning_rate": 3.435744001434492e-05,
      "loss": 0.5152,
      "step": 475
    },
    {
      "epoch": 1.144901610017889,
      "grad_norm": 1.0987663269042969,
      "learning_rate": 3.40670166575564e-05,
      "loss": 0.5504,
      "step": 480
    },
    {
      "epoch": 1.1568276684555754,
      "grad_norm": 1.0069026947021484,
      "learning_rate": 3.3775177415833605e-05,
      "loss": 0.5547,
      "step": 485
    },
    {
      "epoch": 1.1687537268932617,
      "grad_norm": 1.194651484489441,
      "learning_rate": 3.348196786213633e-05,
      "loss": 0.5319,
      "step": 490
    },
    {
      "epoch": 1.180679785330948,
      "grad_norm": 0.9666855335235596,
      "learning_rate": 3.3187433783409216e-05,
      "loss": 0.5541,
      "step": 495
    },
    {
      "epoch": 1.1926058437686344,
      "grad_norm": 0.8826591372489929,
      "learning_rate": 3.289162117343173e-05,
      "loss": 0.5329,
      "step": 500
    },
    {
      "epoch": 1.2045319022063208,
      "grad_norm": 1.1290160417556763,
      "learning_rate": 3.259457622563593e-05,
      "loss": 0.5964,
      "step": 505
    },
    {
      "epoch": 1.2164579606440071,
      "grad_norm": 1.078009009361267,
      "learning_rate": 3.229634532589296e-05,
      "loss": 0.5315,
      "step": 510
    },
    {
      "epoch": 1.2283840190816935,
      "grad_norm": 1.1144967079162598,
      "learning_rate": 3.199697504526955e-05,
      "loss": 0.5698,
      "step": 515
    },
    {
      "epoch": 1.2403100775193798,
      "grad_norm": 1.3256494998931885,
      "learning_rate": 3.169651213275562e-05,
      "loss": 0.5234,
      "step": 520
    },
    {
      "epoch": 1.2522361359570662,
      "grad_norm": 1.0332247018814087,
      "learning_rate": 3.139500350796397e-05,
      "loss": 0.5386,
      "step": 525
    },
    {
      "epoch": 1.2641621943947525,
      "grad_norm": 1.2091331481933594,
      "learning_rate": 3.1092496253803546e-05,
      "loss": 0.5136,
      "step": 530
    },
    {
      "epoch": 1.2760882528324389,
      "grad_norm": 1.0742837190628052,
      "learning_rate": 3.078903760912695e-05,
      "loss": 0.5341,
      "step": 535
    },
    {
      "epoch": 1.2880143112701252,
      "grad_norm": 1.5551700592041016,
      "learning_rate": 3.048467496135384e-05,
      "loss": 0.5613,
      "step": 540
    },
    {
      "epoch": 1.2999403697078116,
      "grad_norm": 1.0054267644882202,
      "learning_rate": 3.017945583907092e-05,
      "loss": 0.5076,
      "step": 545
    },
    {
      "epoch": 1.311866428145498,
      "grad_norm": 1.081240177154541,
      "learning_rate": 2.9873427904610057e-05,
      "loss": 0.5115,
      "step": 550
    },
    {
      "epoch": 1.3237924865831843,
      "grad_norm": 1.120842456817627,
      "learning_rate": 2.956663894660539e-05,
      "loss": 0.5467,
      "step": 555
    },
    {
      "epoch": 1.3357185450208706,
      "grad_norm": 1.022741675376892,
      "learning_rate": 2.9259136872530812e-05,
      "loss": 0.523,
      "step": 560
    },
    {
      "epoch": 1.347644603458557,
      "grad_norm": 1.0805561542510986,
      "learning_rate": 2.8950969701218783e-05,
      "loss": 0.5541,
      "step": 565
    },
    {
      "epoch": 1.3595706618962433,
      "grad_norm": 1.1941672563552856,
      "learning_rate": 2.864218555536188e-05,
      "loss": 0.5303,
      "step": 570
    },
    {
      "epoch": 1.3714967203339297,
      "grad_norm": 1.0475640296936035,
      "learning_rate": 2.833283265399801e-05,
      "loss": 0.544,
      "step": 575
    },
    {
      "epoch": 1.383422778771616,
      "grad_norm": 1.258143424987793,
      "learning_rate": 2.8022959304980695e-05,
      "loss": 0.5596,
      "step": 580
    },
    {
      "epoch": 1.3953488372093024,
      "grad_norm": 1.044134259223938,
      "learning_rate": 2.7712613897435357e-05,
      "loss": 0.5003,
      "step": 585
    },
    {
      "epoch": 1.4072748956469887,
      "grad_norm": 1.100220799446106,
      "learning_rate": 2.7401844894203056e-05,
      "loss": 0.5629,
      "step": 590
    },
    {
      "epoch": 1.419200954084675,
      "grad_norm": 0.9259990453720093,
      "learning_rate": 2.7090700824272557e-05,
      "loss": 0.4516,
      "step": 595
    },
    {
      "epoch": 1.4311270125223614,
      "grad_norm": 0.8607438206672668,
      "learning_rate": 2.6779230275202243e-05,
      "loss": 0.538,
      "step": 600
    },
    {
      "epoch": 1.4430530709600478,
      "grad_norm": 1.2486215829849243,
      "learning_rate": 2.6467481885532704e-05,
      "loss": 0.5314,
      "step": 605
    },
    {
      "epoch": 1.454979129397734,
      "grad_norm": 1.1418797969818115,
      "learning_rate": 2.6155504337191516e-05,
      "loss": 0.4841,
      "step": 610
    },
    {
      "epoch": 1.4669051878354205,
      "grad_norm": 1.3612860441207886,
      "learning_rate": 2.5843346347891163e-05,
      "loss": 0.5133,
      "step": 615
    },
    {
      "epoch": 1.4788312462731068,
      "grad_norm": 1.1785390377044678,
      "learning_rate": 2.5531056663521362e-05,
      "loss": 0.5309,
      "step": 620
    },
    {
      "epoch": 1.4907573047107932,
      "grad_norm": 0.8908374905586243,
      "learning_rate": 2.521868405053706e-05,
      "loss": 0.5022,
      "step": 625
    },
    {
      "epoch": 1.5026833631484795,
      "grad_norm": 1.031840443611145,
      "learning_rate": 2.4906277288343123e-05,
      "loss": 0.5254,
      "step": 630
    },
    {
      "epoch": 1.5146094215861656,
      "grad_norm": 1.5957837104797363,
      "learning_rate": 2.459388516167711e-05,
      "loss": 0.5507,
      "step": 635
    },
    {
      "epoch": 1.5265354800238522,
      "grad_norm": 1.2121431827545166,
      "learning_rate": 2.428155645299111e-05,
      "loss": 0.4943,
      "step": 640
    },
    {
      "epoch": 1.5384615384615383,
      "grad_norm": 0.9426692724227905,
      "learning_rate": 2.3969339934834012e-05,
      "loss": 0.5111,
      "step": 645
    },
    {
      "epoch": 1.550387596899225,
      "grad_norm": 1.2159509658813477,
      "learning_rate": 2.3657284362235274e-05,
      "loss": 0.5302,
      "step": 650
    },
    {
      "epoch": 1.562313655336911,
      "grad_norm": 1.1508508920669556,
      "learning_rate": 2.3345438465091455e-05,
      "loss": 0.5299,
      "step": 655
    },
    {
      "epoch": 1.5742397137745976,
      "grad_norm": 1.2826497554779053,
      "learning_rate": 2.303385094055669e-05,
      "loss": 0.5453,
      "step": 660
    },
    {
      "epoch": 1.5861657722122837,
      "grad_norm": 1.520505666732788,
      "learning_rate": 2.2722570445438214e-05,
      "loss": 0.5804,
      "step": 665
    },
    {
      "epoch": 1.5980918306499703,
      "grad_norm": 1.1682236194610596,
      "learning_rate": 2.2411645588598232e-05,
      "loss": 0.5072,
      "step": 670
    },
    {
      "epoch": 1.6100178890876564,
      "grad_norm": 1.1223719120025635,
      "learning_rate": 2.2101124923363267e-05,
      "loss": 0.5287,
      "step": 675
    },
    {
      "epoch": 1.621943947525343,
      "grad_norm": 1.0129245519638062,
      "learning_rate": 2.1791056939942228e-05,
      "loss": 0.5343,
      "step": 680
    },
    {
      "epoch": 1.6338700059630291,
      "grad_norm": 1.00519597530365,
      "learning_rate": 2.1481490057854217e-05,
      "loss": 0.5329,
      "step": 685
    },
    {
      "epoch": 1.6457960644007157,
      "grad_norm": 1.1941732168197632,
      "learning_rate": 2.1172472618367483e-05,
      "loss": 0.5591,
      "step": 690
    },
    {
      "epoch": 1.6577221228384018,
      "grad_norm": 0.9398036599159241,
      "learning_rate": 2.0864052876950552e-05,
      "loss": 0.5254,
      "step": 695
    },
    {
      "epoch": 1.6696481812760884,
      "grad_norm": 0.8543565273284912,
      "learning_rate": 2.0556278995736782e-05,
      "loss": 0.5212,
      "step": 700
    },
    {
      "epoch": 1.6815742397137745,
      "grad_norm": 1.0366357564926147,
      "learning_rate": 2.024919903600344e-05,
      "loss": 0.5276,
      "step": 705
    },
    {
      "epoch": 1.693500298151461,
      "grad_norm": 1.2778667211532593,
      "learning_rate": 1.9942860950666574e-05,
      "loss": 0.5203,
      "step": 710
    },
    {
      "epoch": 1.7054263565891472,
      "grad_norm": 1.2391997575759888,
      "learning_rate": 1.9637312576792776e-05,
      "loss": 0.5291,
      "step": 715
    },
    {
      "epoch": 1.7173524150268338,
      "grad_norm": 1.0726789236068726,
      "learning_rate": 1.9332601628129128e-05,
      "loss": 0.5299,
      "step": 720
    },
    {
      "epoch": 1.72927847346452,
      "grad_norm": 1.1761037111282349,
      "learning_rate": 1.9028775687652217e-05,
      "loss": 0.5476,
      "step": 725
    },
    {
      "epoch": 1.7412045319022065,
      "grad_norm": 1.257358431816101,
      "learning_rate": 1.8725882200137762e-05,
      "loss": 0.5333,
      "step": 730
    },
    {
      "epoch": 1.7531305903398926,
      "grad_norm": 1.121039628982544,
      "learning_rate": 1.8423968464751722e-05,
      "loss": 0.4962,
      "step": 735
    },
    {
      "epoch": 1.7650566487775792,
      "grad_norm": 1.1800827980041504,
      "learning_rate": 1.812308162766418e-05,
      "loss": 0.4986,
      "step": 740
    },
    {
      "epoch": 1.7769827072152653,
      "grad_norm": 1.262669324874878,
      "learning_rate": 1.7823268674687077e-05,
      "loss": 0.5461,
      "step": 745
    },
    {
      "epoch": 1.7889087656529516,
      "grad_norm": 1.1524207592010498,
      "learning_rate": 1.7524576423937025e-05,
      "loss": 0.5392,
      "step": 750
    },
    {
      "epoch": 1.800834824090638,
      "grad_norm": 0.9975520968437195,
      "learning_rate": 1.7227051518524286e-05,
      "loss": 0.5162,
      "step": 755
    },
    {
      "epoch": 1.8127608825283243,
      "grad_norm": 1.1616145372390747,
      "learning_rate": 1.6930740419269132e-05,
      "loss": 0.5431,
      "step": 760
    },
    {
      "epoch": 1.8246869409660107,
      "grad_norm": 0.9507209658622742,
      "learning_rate": 1.6635689397446562e-05,
      "loss": 0.5037,
      "step": 765
    },
    {
      "epoch": 1.836612999403697,
      "grad_norm": 1.0409976243972778,
      "learning_rate": 1.6341944527560736e-05,
      "loss": 0.5164,
      "step": 770
    },
    {
      "epoch": 1.8485390578413834,
      "grad_norm": 0.8729709386825562,
      "learning_rate": 1.6049551680150047e-05,
      "loss": 0.495,
      "step": 775
    },
    {
      "epoch": 1.8604651162790697,
      "grad_norm": 1.223853588104248,
      "learning_rate": 1.5758556514624118e-05,
      "loss": 0.5834,
      "step": 780
    },
    {
      "epoch": 1.872391174716756,
      "grad_norm": 1.184877634048462,
      "learning_rate": 1.5469004472133696e-05,
      "loss": 0.5459,
      "step": 785
    },
    {
      "epoch": 1.8843172331544424,
      "grad_norm": 1.163509488105774,
      "learning_rate": 1.5180940768474689e-05,
      "loss": 0.516,
      "step": 790
    },
    {
      "epoch": 1.8962432915921288,
      "grad_norm": 1.2189282178878784,
      "learning_rate": 1.489441038702735e-05,
      "loss": 0.5519,
      "step": 795
    },
    {
      "epoch": 1.9081693500298151,
      "grad_norm": 1.1891826391220093,
      "learning_rate": 1.4609458071731796e-05,
      "loss": 0.5173,
      "step": 800
    },
    {
      "epoch": 1.9200954084675015,
      "grad_norm": 1.3610830307006836,
      "learning_rate": 1.4326128320100867e-05,
      "loss": 0.517,
      "step": 805
    },
    {
      "epoch": 1.9320214669051878,
      "grad_norm": 1.148544192314148,
      "learning_rate": 1.4044465376271532e-05,
      "loss": 0.4906,
      "step": 810
    },
    {
      "epoch": 1.9439475253428742,
      "grad_norm": 1.404956340789795,
      "learning_rate": 1.3764513224095762e-05,
      "loss": 0.5598,
      "step": 815
    },
    {
      "epoch": 1.9558735837805605,
      "grad_norm": 1.206635594367981,
      "learning_rate": 1.3486315580272202e-05,
      "loss": 0.5276,
      "step": 820
    },
    {
      "epoch": 1.9677996422182469,
      "grad_norm": 0.8971341848373413,
      "learning_rate": 1.320991588751938e-05,
      "loss": 0.4966,
      "step": 825
    },
    {
      "epoch": 1.9797257006559332,
      "grad_norm": 1.1437649726867676,
      "learning_rate": 1.2935357307791826e-05,
      "loss": 0.5133,
      "step": 830
    },
    {
      "epoch": 1.9916517590936196,
      "grad_norm": 1.2897300720214844,
      "learning_rate": 1.2662682715540031e-05,
      "loss": 0.4814,
      "step": 835
    },
    {
      "epoch": 2.0035778175313057,
      "grad_norm": 1.0928142070770264,
      "learning_rate": 1.2391934691015213e-05,
      "loss": 0.6167,
      "step": 840
    },
    {
      "epoch": 2.0155038759689923,
      "grad_norm": 1.1107832193374634,
      "learning_rate": 1.2123155513620108e-05,
      "loss": 0.4915,
      "step": 845
    },
    {
      "epoch": 2.0274299344066784,
      "grad_norm": 1.2382787466049194,
      "learning_rate": 1.1856387155306715e-05,
      "loss": 0.5116,
      "step": 850
    },
    {
      "epoch": 2.039355992844365,
      "grad_norm": 1.46440589427948,
      "learning_rate": 1.1591671274022035e-05,
      "loss": 0.4754,
      "step": 855
    },
    {
      "epoch": 2.051282051282051,
      "grad_norm": 1.4522420167922974,
      "learning_rate": 1.1329049207202904e-05,
      "loss": 0.4426,
      "step": 860
    },
    {
      "epoch": 2.0632081097197377,
      "grad_norm": 1.3376868963241577,
      "learning_rate": 1.1068561965320764e-05,
      "loss": 0.5026,
      "step": 865
    },
    {
      "epoch": 2.075134168157424,
      "grad_norm": 1.1326589584350586,
      "learning_rate": 1.0810250225477611e-05,
      "loss": 0.4872,
      "step": 870
    },
    {
      "epoch": 2.0870602265951104,
      "grad_norm": 1.5386810302734375,
      "learning_rate": 1.055415432505393e-05,
      "loss": 0.5022,
      "step": 875
    },
    {
      "epoch": 2.0989862850327965,
      "grad_norm": 1.2370398044586182,
      "learning_rate": 1.0300314255409704e-05,
      "loss": 0.5388,
      "step": 880
    },
    {
      "epoch": 2.110912343470483,
      "grad_norm": 1.2834645509719849,
      "learning_rate": 1.004876965563945e-05,
      "loss": 0.481,
      "step": 885
    },
    {
      "epoch": 2.122838401908169,
      "grad_norm": 1.3606491088867188,
      "learning_rate": 9.79955980638229e-06,
      "loss": 0.4829,
      "step": 890
    },
    {
      "epoch": 2.1347644603458558,
      "grad_norm": 1.3206753730773926,
      "learning_rate": 9.552723623687934e-06,
      "loss": 0.4902,
      "step": 895
    },
    {
      "epoch": 2.146690518783542,
      "grad_norm": 1.1643896102905273,
      "learning_rate": 9.308299652939666e-06,
      "loss": 0.5297,
      "step": 900
    },
    {
      "epoch": 2.1586165772212285,
      "grad_norm": 1.0991599559783936,
      "learning_rate": 9.066326062835179e-06,
      "loss": 0.4845,
      "step": 905
    },
    {
      "epoch": 2.1705426356589146,
      "grad_norm": 1.1832717657089233,
      "learning_rate": 8.826840639426218e-06,
      "loss": 0.5086,
      "step": 910
    },
    {
      "epoch": 2.182468694096601,
      "grad_norm": 1.385024070739746,
      "learning_rate": 8.589880780218049e-06,
      "loss": 0.5136,
      "step": 915
    },
    {
      "epoch": 2.1943947525342873,
      "grad_norm": 1.1657204627990723,
      "learning_rate": 8.355483488329471e-06,
      "loss": 0.5137,
      "step": 920
    },
    {
      "epoch": 2.206320810971974,
      "grad_norm": 1.0423134565353394,
      "learning_rate": 8.123685366714556e-06,
      "loss": 0.5032,
      "step": 925
    },
    {
      "epoch": 2.21824686940966,
      "grad_norm": 1.2326374053955078,
      "learning_rate": 7.89452261244677e-06,
      "loss": 0.4658,
      "step": 930
    },
    {
      "epoch": 2.2301729278473466,
      "grad_norm": 1.3008525371551514,
      "learning_rate": 7.66803101106657e-06,
      "loss": 0.5261,
      "step": 935
    },
    {
      "epoch": 2.2420989862850327,
      "grad_norm": 1.296058177947998,
      "learning_rate": 7.44424593099316e-06,
      "loss": 0.5216,
      "step": 940
    },
    {
      "epoch": 2.2540250447227193,
      "grad_norm": 1.3323789834976196,
      "learning_rate": 7.223202318001465e-06,
      "loss": 0.4949,
      "step": 945
    },
    {
      "epoch": 2.2659511031604054,
      "grad_norm": 1.2150744199752808,
      "learning_rate": 7.0049346897650745e-06,
      "loss": 0.464,
      "step": 950
    },
    {
      "epoch": 2.277877161598092,
      "grad_norm": 1.1837234497070312,
      "learning_rate": 6.789477130466057e-06,
      "loss": 0.5057,
      "step": 955
    },
    {
      "epoch": 2.289803220035778,
      "grad_norm": 1.0720417499542236,
      "learning_rate": 6.576863285472415e-06,
      "loss": 0.4875,
      "step": 960
    },
    {
      "epoch": 2.3017292784734646,
      "grad_norm": 1.4399152994155884,
      "learning_rate": 6.367126356084127e-06,
      "loss": 0.4753,
      "step": 965
    },
    {
      "epoch": 2.3136553369111508,
      "grad_norm": 1.3286051750183105,
      "learning_rate": 6.160299094348488e-06,
      "loss": 0.4725,
      "step": 970
    },
    {
      "epoch": 2.3255813953488373,
      "grad_norm": 1.3082605600357056,
      "learning_rate": 5.956413797945657e-06,
      "loss": 0.5095,
      "step": 975
    },
    {
      "epoch": 2.3375074537865235,
      "grad_norm": 1.4694650173187256,
      "learning_rate": 5.755502305145089e-06,
      "loss": 0.5099,
      "step": 980
    },
    {
      "epoch": 2.34943351222421,
      "grad_norm": 1.1760421991348267,
      "learning_rate": 5.557595989833747e-06,
      "loss": 0.4744,
      "step": 985
    },
    {
      "epoch": 2.361359570661896,
      "grad_norm": 1.5401874780654907,
      "learning_rate": 5.36272575661684e-06,
      "loss": 0.5126,
      "step": 990
    },
    {
      "epoch": 2.3732856290995827,
      "grad_norm": 1.114990234375,
      "learning_rate": 5.170922035991838e-06,
      "loss": 0.4636,
      "step": 995
    },
    {
      "epoch": 2.385211687537269,
      "grad_norm": 1.2573448419570923,
      "learning_rate": 4.9822147795964805e-06,
      "loss": 0.4817,
      "step": 1000
    }
  ],
  "logging_steps": 5,
  "max_steps": 1257,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 3,
  "save_steps": 100,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": false
      },
      "attributes": {}
    }
  },
  "total_flos": 3.48527209218048e+17,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}