{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 2.385211687537269,
"eval_steps": 500,
"global_step": 1000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.011926058437686345,
"grad_norm": 0.5661849975585938,
"learning_rate": 4.999804802810596e-05,
"loss": 1.4421,
"step": 5
},
{
"epoch": 0.02385211687537269,
"grad_norm": 0.5908129811286926,
"learning_rate": 4.999219241723937e-05,
"loss": 1.3306,
"step": 10
},
{
"epoch": 0.03577817531305903,
"grad_norm": 0.6965810060501099,
"learning_rate": 4.998243408179925e-05,
"loss": 1.1873,
"step": 15
},
{
"epoch": 0.04770423375074538,
"grad_norm": 0.5752276182174683,
"learning_rate": 4.9968774545625344e-05,
"loss": 1.0583,
"step": 20
},
{
"epoch": 0.05963029218843172,
"grad_norm": 0.6488135457038879,
"learning_rate": 4.9951215941760075e-05,
"loss": 1.0045,
"step": 25
},
{
"epoch": 0.07155635062611806,
"grad_norm": 0.7977299094200134,
"learning_rate": 4.992976101211558e-05,
"loss": 0.9836,
"step": 30
},
{
"epoch": 0.08348240906380441,
"grad_norm": 0.8842712044715881,
"learning_rate": 4.99044131070454e-05,
"loss": 0.8925,
"step": 35
},
{
"epoch": 0.09540846750149076,
"grad_norm": 0.798170804977417,
"learning_rate": 4.987517618482142e-05,
"loss": 0.8783,
"step": 40
},
{
"epoch": 0.1073345259391771,
"grad_norm": 0.8196467757225037,
"learning_rate": 4.984205481101565e-05,
"loss": 0.8681,
"step": 45
},
{
"epoch": 0.11926058437686345,
"grad_norm": 0.9973539113998413,
"learning_rate": 4.980505415778738e-05,
"loss": 0.8251,
"step": 50
},
{
"epoch": 0.13118664281454978,
"grad_norm": 0.8836011290550232,
"learning_rate": 4.97641800030754e-05,
"loss": 0.8079,
"step": 55
},
{
"epoch": 0.14311270125223613,
"grad_norm": 0.9255298972129822,
"learning_rate": 4.971943872969582e-05,
"loss": 0.7769,
"step": 60
},
{
"epoch": 0.15503875968992248,
"grad_norm": 0.9712087512016296,
"learning_rate": 4.967083732434529e-05,
"loss": 0.777,
"step": 65
},
{
"epoch": 0.16696481812760883,
"grad_norm": 0.7609415054321289,
"learning_rate": 4.961838337650997e-05,
"loss": 0.7303,
"step": 70
},
{
"epoch": 0.17889087656529518,
"grad_norm": 0.8297658562660217,
"learning_rate": 4.9562085077280443e-05,
"loss": 0.7442,
"step": 75
},
{
"epoch": 0.19081693500298152,
"grad_norm": 1.01498281955719,
"learning_rate": 4.950195121807251e-05,
"loss": 0.7077,
"step": 80
},
{
"epoch": 0.20274299344066785,
"grad_norm": 0.9629709720611572,
"learning_rate": 4.943799118925443e-05,
"loss": 0.673,
"step": 85
},
{
"epoch": 0.2146690518783542,
"grad_norm": 0.9121557474136353,
"learning_rate": 4.937021497868047e-05,
"loss": 0.7109,
"step": 90
},
{
"epoch": 0.22659511031604054,
"grad_norm": 0.841573178768158,
"learning_rate": 4.9298633170131304e-05,
"loss": 0.6798,
"step": 95
},
{
"epoch": 0.2385211687537269,
"grad_norm": 0.9693652391433716,
"learning_rate": 4.922325694166119e-05,
"loss": 0.6471,
"step": 100
},
{
"epoch": 0.2504472271914132,
"grad_norm": 1.1198012828826904,
"learning_rate": 4.9144098063852485e-05,
"loss": 0.6456,
"step": 105
},
{
"epoch": 0.26237328562909956,
"grad_norm": 1.1725982427597046,
"learning_rate": 4.9061168897977564e-05,
"loss": 0.6462,
"step": 110
},
{
"epoch": 0.2742993440667859,
"grad_norm": 0.9855665564537048,
"learning_rate": 4.8974482394068514e-05,
"loss": 0.6594,
"step": 115
},
{
"epoch": 0.28622540250447226,
"grad_norm": 0.8321542739868164,
"learning_rate": 4.888405208889486e-05,
"loss": 0.6372,
"step": 120
},
{
"epoch": 0.2981514609421586,
"grad_norm": 0.9451864957809448,
"learning_rate": 4.878989210384972e-05,
"loss": 0.6594,
"step": 125
},
{
"epoch": 0.31007751937984496,
"grad_norm": 1.1336216926574707,
"learning_rate": 4.869201714274467e-05,
"loss": 0.6544,
"step": 130
},
{
"epoch": 0.3220035778175313,
"grad_norm": 0.8466928601264954,
"learning_rate": 4.8590442489513543e-05,
"loss": 0.6275,
"step": 135
},
{
"epoch": 0.33392963625521765,
"grad_norm": 0.979365885257721,
"learning_rate": 4.8485184005825815e-05,
"loss": 0.6348,
"step": 140
},
{
"epoch": 0.345855694692904,
"grad_norm": 0.9159572124481201,
"learning_rate": 4.837625812860961e-05,
"loss": 0.6045,
"step": 145
},
{
"epoch": 0.35778175313059035,
"grad_norm": 1.0542089939117432,
"learning_rate": 4.8263681867485e-05,
"loss": 0.6487,
"step": 150
},
{
"epoch": 0.3697078115682767,
"grad_norm": 1.0264719724655151,
"learning_rate": 4.814747280210782e-05,
"loss": 0.6026,
"step": 155
},
{
"epoch": 0.38163387000596305,
"grad_norm": 0.9389889240264893,
"learning_rate": 4.80276490794244e-05,
"loss": 0.6083,
"step": 160
},
{
"epoch": 0.3935599284436494,
"grad_norm": 1.1090476512908936,
"learning_rate": 4.790422941083786e-05,
"loss": 0.5957,
"step": 165
},
{
"epoch": 0.4054859868813357,
"grad_norm": 1.0268605947494507,
"learning_rate": 4.7777233069286154e-05,
"loss": 0.6538,
"step": 170
},
{
"epoch": 0.41741204531902204,
"grad_norm": 1.240955114364624,
"learning_rate": 4.7646679886232414e-05,
"loss": 0.62,
"step": 175
},
{
"epoch": 0.4293381037567084,
"grad_norm": 1.2029043436050415,
"learning_rate": 4.7512590248568163e-05,
"loss": 0.6461,
"step": 180
},
{
"epoch": 0.44126416219439474,
"grad_norm": 1.0567660331726074,
"learning_rate": 4.7374985095429725e-05,
"loss": 0.633,
"step": 185
},
{
"epoch": 0.4531902206320811,
"grad_norm": 0.928354024887085,
"learning_rate": 4.723388591492841e-05,
"loss": 0.633,
"step": 190
},
{
"epoch": 0.46511627906976744,
"grad_norm": 0.8453586101531982,
"learning_rate": 4.708931474079499e-05,
"loss": 0.6082,
"step": 195
},
{
"epoch": 0.4770423375074538,
"grad_norm": 1.33816397190094,
"learning_rate": 4.6941294148938954e-05,
"loss": 0.6176,
"step": 200
},
{
"epoch": 0.48896839594514013,
"grad_norm": 0.9532414078712463,
"learning_rate": 4.678984725392309e-05,
"loss": 0.6168,
"step": 205
},
{
"epoch": 0.5008944543828264,
"grad_norm": 0.9017360210418701,
"learning_rate": 4.6634997705354024e-05,
"loss": 0.6454,
"step": 210
},
{
"epoch": 0.5128205128205128,
"grad_norm": 1.1095216274261475,
"learning_rate": 4.6476769684189065e-05,
"loss": 0.5738,
"step": 215
},
{
"epoch": 0.5247465712581991,
"grad_norm": 0.8017715811729431,
"learning_rate": 4.631518789896023e-05,
"loss": 0.5978,
"step": 220
},
{
"epoch": 0.5366726296958855,
"grad_norm": 1.0233415365219116,
"learning_rate": 4.6150277581915804e-05,
"loss": 0.6375,
"step": 225
},
{
"epoch": 0.5485986881335718,
"grad_norm": 0.9298461675643921,
"learning_rate": 4.598206448508007e-05,
"loss": 0.606,
"step": 230
},
{
"epoch": 0.5605247465712582,
"grad_norm": 0.9300760626792908,
"learning_rate": 4.581057487623204e-05,
"loss": 0.6261,
"step": 235
},
{
"epoch": 0.5724508050089445,
"grad_norm": 1.4629548788070679,
"learning_rate": 4.5635835534803406e-05,
"loss": 0.5877,
"step": 240
},
{
"epoch": 0.5843768634466309,
"grad_norm": 0.916437029838562,
"learning_rate": 4.545787374769686e-05,
"loss": 0.5821,
"step": 245
},
{
"epoch": 0.5963029218843172,
"grad_norm": 0.9588505625724792,
"learning_rate": 4.527671730502491e-05,
"loss": 0.6126,
"step": 250
},
{
"epoch": 0.6082289803220036,
"grad_norm": 1.068697214126587,
"learning_rate": 4.5092394495770335e-05,
"loss": 0.6034,
"step": 255
},
{
"epoch": 0.6201550387596899,
"grad_norm": 0.816047191619873,
"learning_rate": 4.490493410336857e-05,
"loss": 0.6088,
"step": 260
},
{
"epoch": 0.6320810971973763,
"grad_norm": 0.8465267419815063,
"learning_rate": 4.4714365401213e-05,
"loss": 0.6067,
"step": 265
},
{
"epoch": 0.6440071556350626,
"grad_norm": 0.9819373488426208,
"learning_rate": 4.4520718148083665e-05,
"loss": 0.6334,
"step": 270
},
{
"epoch": 0.655933214072749,
"grad_norm": 0.7895939350128174,
"learning_rate": 4.43240225835002e-05,
"loss": 0.6093,
"step": 275
},
{
"epoch": 0.6678592725104353,
"grad_norm": 1.0516682863235474,
"learning_rate": 4.41243094229997e-05,
"loss": 0.5939,
"step": 280
},
{
"epoch": 0.6797853309481217,
"grad_norm": 0.8521412014961243,
"learning_rate": 4.392160985334027e-05,
"loss": 0.5483,
"step": 285
},
{
"epoch": 0.691711389385808,
"grad_norm": 0.847909152507782,
"learning_rate": 4.371595552763093e-05,
"loss": 0.5946,
"step": 290
},
{
"epoch": 0.7036374478234944,
"grad_norm": 1.1502934694290161,
"learning_rate": 4.350737856038878e-05,
"loss": 0.5651,
"step": 295
},
{
"epoch": 0.7155635062611807,
"grad_norm": 1.0434402227401733,
"learning_rate": 4.3295911522524044e-05,
"loss": 0.5965,
"step": 300
},
{
"epoch": 0.727489564698867,
"grad_norm": 1.1517246961593628,
"learning_rate": 4.308158743625388e-05,
"loss": 0.5751,
"step": 305
},
{
"epoch": 0.7394156231365534,
"grad_norm": 1.1459076404571533,
"learning_rate": 4.286443976994569e-05,
"loss": 0.6043,
"step": 310
},
{
"epoch": 0.7513416815742398,
"grad_norm": 1.0111889839172363,
"learning_rate": 4.264450243289079e-05,
"loss": 0.5948,
"step": 315
},
{
"epoch": 0.7632677400119261,
"grad_norm": 1.0213602781295776,
"learning_rate": 4.2421809770009225e-05,
"loss": 0.5538,
"step": 320
},
{
"epoch": 0.7751937984496124,
"grad_norm": 1.0203680992126465,
"learning_rate": 4.219639655648651e-05,
"loss": 0.5696,
"step": 325
},
{
"epoch": 0.7871198568872988,
"grad_norm": 0.8697240352630615,
"learning_rate": 4.196829799234321e-05,
"loss": 0.5613,
"step": 330
},
{
"epoch": 0.7990459153249851,
"grad_norm": 0.8935692310333252,
"learning_rate": 4.173754969693826e-05,
"loss": 0.5712,
"step": 335
},
{
"epoch": 0.8109719737626714,
"grad_norm": 0.7936813831329346,
"learning_rate": 4.1504187703406604e-05,
"loss": 0.5693,
"step": 340
},
{
"epoch": 0.8228980322003577,
"grad_norm": 1.0851740837097168,
"learning_rate": 4.126824845303248e-05,
"loss": 0.5911,
"step": 345
},
{
"epoch": 0.8348240906380441,
"grad_norm": 0.9009485244750977,
"learning_rate": 4.102976878955869e-05,
"loss": 0.5551,
"step": 350
},
{
"epoch": 0.8467501490757304,
"grad_norm": 1.1172417402267456,
"learning_rate": 4.0788785953433286e-05,
"loss": 0.564,
"step": 355
},
{
"epoch": 0.8586762075134168,
"grad_norm": 1.2822695970535278,
"learning_rate": 4.05453375759941e-05,
"loss": 0.5766,
"step": 360
},
{
"epoch": 0.8706022659511031,
"grad_norm": 1.327506422996521,
"learning_rate": 4.0299461673592376e-05,
"loss": 0.5289,
"step": 365
},
{
"epoch": 0.8825283243887895,
"grad_norm": 1.0519084930419922,
"learning_rate": 4.0051196641656185e-05,
"loss": 0.5839,
"step": 370
},
{
"epoch": 0.8944543828264758,
"grad_norm": 0.8493732810020447,
"learning_rate": 3.980058124869469e-05,
"loss": 0.583,
"step": 375
},
{
"epoch": 0.9063804412641622,
"grad_norm": 0.9003544449806213,
"learning_rate": 3.9547654630244156e-05,
"loss": 0.5645,
"step": 380
},
{
"epoch": 0.9183064997018485,
"grad_norm": 0.8397880792617798,
"learning_rate": 3.929245628275662e-05,
"loss": 0.6013,
"step": 385
},
{
"epoch": 0.9302325581395349,
"grad_norm": 1.000611424446106,
"learning_rate": 3.903502605743222e-05,
"loss": 0.5657,
"step": 390
},
{
"epoch": 0.9421586165772212,
"grad_norm": 0.9028176665306091,
"learning_rate": 3.877540415399612e-05,
"loss": 0.6077,
"step": 395
},
{
"epoch": 0.9540846750149076,
"grad_norm": 0.8427708745002747,
"learning_rate": 3.851363111442101e-05,
"loss": 0.5553,
"step": 400
},
{
"epoch": 0.9660107334525939,
"grad_norm": 1.1009222269058228,
"learning_rate": 3.8249747816596136e-05,
"loss": 0.5806,
"step": 405
},
{
"epoch": 0.9779367918902803,
"grad_norm": 1.3395967483520508,
"learning_rate": 3.7983795467943975e-05,
"loss": 0.5463,
"step": 410
},
{
"epoch": 0.9898628503279666,
"grad_norm": 0.856643557548523,
"learning_rate": 3.77158155989853e-05,
"loss": 0.6032,
"step": 415
},
{
"epoch": 1.0017889087656529,
"grad_norm": 1.985956072807312,
"learning_rate": 3.74458500568539e-05,
"loss": 0.663,
"step": 420
},
{
"epoch": 1.0137149672033392,
"grad_norm": 1.099455714225769,
"learning_rate": 3.717394099876182e-05,
"loss": 0.5532,
"step": 425
},
{
"epoch": 1.0256410256410255,
"grad_norm": 0.8070193529129028,
"learning_rate": 3.690013088541619e-05,
"loss": 0.537,
"step": 430
},
{
"epoch": 1.037567084078712,
"grad_norm": 1.1234451532363892,
"learning_rate": 3.662446247438867e-05,
"loss": 0.5192,
"step": 435
},
{
"epoch": 1.0494931425163982,
"grad_norm": 1.0410645008087158,
"learning_rate": 3.6346978813438464e-05,
"loss": 0.5525,
"step": 440
},
{
"epoch": 1.0614192009540846,
"grad_norm": 1.0676367282867432,
"learning_rate": 3.606772323379017e-05,
"loss": 0.5546,
"step": 445
},
{
"epoch": 1.073345259391771,
"grad_norm": 1.1925772428512573,
"learning_rate": 3.57867393433672e-05,
"loss": 0.5298,
"step": 450
},
{
"epoch": 1.0852713178294573,
"grad_norm": 1.0415118932724,
"learning_rate": 3.55040710199821e-05,
"loss": 0.5354,
"step": 455
},
{
"epoch": 1.0971973762671436,
"grad_norm": 0.9862761497497559,
"learning_rate": 3.521976240448468e-05,
"loss": 0.5173,
"step": 460
},
{
"epoch": 1.10912343470483,
"grad_norm": 1.0136168003082275,
"learning_rate": 3.493385789386906e-05,
"loss": 0.5424,
"step": 465
},
{
"epoch": 1.1210494931425163,
"grad_norm": 0.999031126499176,
"learning_rate": 3.464640213434079e-05,
"loss": 0.574,
"step": 470
},
{
"epoch": 1.1329755515802027,
"grad_norm": 1.0496968030929565,
"learning_rate": 3.435744001434492e-05,
"loss": 0.5152,
"step": 475
},
{
"epoch": 1.144901610017889,
"grad_norm": 1.0987663269042969,
"learning_rate": 3.40670166575564e-05,
"loss": 0.5504,
"step": 480
},
{
"epoch": 1.1568276684555754,
"grad_norm": 1.0069026947021484,
"learning_rate": 3.3775177415833605e-05,
"loss": 0.5547,
"step": 485
},
{
"epoch": 1.1687537268932617,
"grad_norm": 1.194651484489441,
"learning_rate": 3.348196786213633e-05,
"loss": 0.5319,
"step": 490
},
{
"epoch": 1.180679785330948,
"grad_norm": 0.9666855335235596,
"learning_rate": 3.3187433783409216e-05,
"loss": 0.5541,
"step": 495
},
{
"epoch": 1.1926058437686344,
"grad_norm": 0.8826591372489929,
"learning_rate": 3.289162117343173e-05,
"loss": 0.5329,
"step": 500
},
{
"epoch": 1.2045319022063208,
"grad_norm": 1.1290160417556763,
"learning_rate": 3.259457622563593e-05,
"loss": 0.5964,
"step": 505
},
{
"epoch": 1.2164579606440071,
"grad_norm": 1.078009009361267,
"learning_rate": 3.229634532589296e-05,
"loss": 0.5315,
"step": 510
},
{
"epoch": 1.2283840190816935,
"grad_norm": 1.1144967079162598,
"learning_rate": 3.199697504526955e-05,
"loss": 0.5698,
"step": 515
},
{
"epoch": 1.2403100775193798,
"grad_norm": 1.3256494998931885,
"learning_rate": 3.169651213275562e-05,
"loss": 0.5234,
"step": 520
},
{
"epoch": 1.2522361359570662,
"grad_norm": 1.0332247018814087,
"learning_rate": 3.139500350796397e-05,
"loss": 0.5386,
"step": 525
},
{
"epoch": 1.2641621943947525,
"grad_norm": 1.2091331481933594,
"learning_rate": 3.1092496253803546e-05,
"loss": 0.5136,
"step": 530
},
{
"epoch": 1.2760882528324389,
"grad_norm": 1.0742837190628052,
"learning_rate": 3.078903760912695e-05,
"loss": 0.5341,
"step": 535
},
{
"epoch": 1.2880143112701252,
"grad_norm": 1.5551700592041016,
"learning_rate": 3.048467496135384e-05,
"loss": 0.5613,
"step": 540
},
{
"epoch": 1.2999403697078116,
"grad_norm": 1.0054267644882202,
"learning_rate": 3.017945583907092e-05,
"loss": 0.5076,
"step": 545
},
{
"epoch": 1.311866428145498,
"grad_norm": 1.081240177154541,
"learning_rate": 2.9873427904610057e-05,
"loss": 0.5115,
"step": 550
},
{
"epoch": 1.3237924865831843,
"grad_norm": 1.120842456817627,
"learning_rate": 2.956663894660539e-05,
"loss": 0.5467,
"step": 555
},
{
"epoch": 1.3357185450208706,
"grad_norm": 1.022741675376892,
"learning_rate": 2.9259136872530812e-05,
"loss": 0.523,
"step": 560
},
{
"epoch": 1.347644603458557,
"grad_norm": 1.0805561542510986,
"learning_rate": 2.8950969701218783e-05,
"loss": 0.5541,
"step": 565
},
{
"epoch": 1.3595706618962433,
"grad_norm": 1.1941672563552856,
"learning_rate": 2.864218555536188e-05,
"loss": 0.5303,
"step": 570
},
{
"epoch": 1.3714967203339297,
"grad_norm": 1.0475640296936035,
"learning_rate": 2.833283265399801e-05,
"loss": 0.544,
"step": 575
},
{
"epoch": 1.383422778771616,
"grad_norm": 1.258143424987793,
"learning_rate": 2.8022959304980695e-05,
"loss": 0.5596,
"step": 580
},
{
"epoch": 1.3953488372093024,
"grad_norm": 1.044134259223938,
"learning_rate": 2.7712613897435357e-05,
"loss": 0.5003,
"step": 585
},
{
"epoch": 1.4072748956469887,
"grad_norm": 1.100220799446106,
"learning_rate": 2.7401844894203056e-05,
"loss": 0.5629,
"step": 590
},
{
"epoch": 1.419200954084675,
"grad_norm": 0.9259990453720093,
"learning_rate": 2.7090700824272557e-05,
"loss": 0.4516,
"step": 595
},
{
"epoch": 1.4311270125223614,
"grad_norm": 0.8607438206672668,
"learning_rate": 2.6779230275202243e-05,
"loss": 0.538,
"step": 600
},
{
"epoch": 1.4430530709600478,
"grad_norm": 1.2486215829849243,
"learning_rate": 2.6467481885532704e-05,
"loss": 0.5314,
"step": 605
},
{
"epoch": 1.454979129397734,
"grad_norm": 1.1418797969818115,
"learning_rate": 2.6155504337191516e-05,
"loss": 0.4841,
"step": 610
},
{
"epoch": 1.4669051878354205,
"grad_norm": 1.3612860441207886,
"learning_rate": 2.5843346347891163e-05,
"loss": 0.5133,
"step": 615
},
{
"epoch": 1.4788312462731068,
"grad_norm": 1.1785390377044678,
"learning_rate": 2.5531056663521362e-05,
"loss": 0.5309,
"step": 620
},
{
"epoch": 1.4907573047107932,
"grad_norm": 0.8908374905586243,
"learning_rate": 2.521868405053706e-05,
"loss": 0.5022,
"step": 625
},
{
"epoch": 1.5026833631484795,
"grad_norm": 1.031840443611145,
"learning_rate": 2.4906277288343123e-05,
"loss": 0.5254,
"step": 630
},
{
"epoch": 1.5146094215861656,
"grad_norm": 1.5957837104797363,
"learning_rate": 2.459388516167711e-05,
"loss": 0.5507,
"step": 635
},
{
"epoch": 1.5265354800238522,
"grad_norm": 1.2121431827545166,
"learning_rate": 2.428155645299111e-05,
"loss": 0.4943,
"step": 640
},
{
"epoch": 1.5384615384615383,
"grad_norm": 0.9426692724227905,
"learning_rate": 2.3969339934834012e-05,
"loss": 0.5111,
"step": 645
},
{
"epoch": 1.550387596899225,
"grad_norm": 1.2159509658813477,
"learning_rate": 2.3657284362235274e-05,
"loss": 0.5302,
"step": 650
},
{
"epoch": 1.562313655336911,
"grad_norm": 1.1508508920669556,
"learning_rate": 2.3345438465091455e-05,
"loss": 0.5299,
"step": 655
},
{
"epoch": 1.5742397137745976,
"grad_norm": 1.2826497554779053,
"learning_rate": 2.303385094055669e-05,
"loss": 0.5453,
"step": 660
},
{
"epoch": 1.5861657722122837,
"grad_norm": 1.520505666732788,
"learning_rate": 2.2722570445438214e-05,
"loss": 0.5804,
"step": 665
},
{
"epoch": 1.5980918306499703,
"grad_norm": 1.1682236194610596,
"learning_rate": 2.2411645588598232e-05,
"loss": 0.5072,
"step": 670
},
{
"epoch": 1.6100178890876564,
"grad_norm": 1.1223719120025635,
"learning_rate": 2.2101124923363267e-05,
"loss": 0.5287,
"step": 675
},
{
"epoch": 1.621943947525343,
"grad_norm": 1.0129245519638062,
"learning_rate": 2.1791056939942228e-05,
"loss": 0.5343,
"step": 680
},
{
"epoch": 1.6338700059630291,
"grad_norm": 1.00519597530365,
"learning_rate": 2.1481490057854217e-05,
"loss": 0.5329,
"step": 685
},
{
"epoch": 1.6457960644007157,
"grad_norm": 1.1941732168197632,
"learning_rate": 2.1172472618367483e-05,
"loss": 0.5591,
"step": 690
},
{
"epoch": 1.6577221228384018,
"grad_norm": 0.9398036599159241,
"learning_rate": 2.0864052876950552e-05,
"loss": 0.5254,
"step": 695
},
{
"epoch": 1.6696481812760884,
"grad_norm": 0.8543565273284912,
"learning_rate": 2.0556278995736782e-05,
"loss": 0.5212,
"step": 700
},
{
"epoch": 1.6815742397137745,
"grad_norm": 1.0366357564926147,
"learning_rate": 2.024919903600344e-05,
"loss": 0.5276,
"step": 705
},
{
"epoch": 1.693500298151461,
"grad_norm": 1.2778667211532593,
"learning_rate": 1.9942860950666574e-05,
"loss": 0.5203,
"step": 710
},
{
"epoch": 1.7054263565891472,
"grad_norm": 1.2391997575759888,
"learning_rate": 1.9637312576792776e-05,
"loss": 0.5291,
"step": 715
},
{
"epoch": 1.7173524150268338,
"grad_norm": 1.0726789236068726,
"learning_rate": 1.9332601628129128e-05,
"loss": 0.5299,
"step": 720
},
{
"epoch": 1.72927847346452,
"grad_norm": 1.1761037111282349,
"learning_rate": 1.9028775687652217e-05,
"loss": 0.5476,
"step": 725
},
{
"epoch": 1.7412045319022065,
"grad_norm": 1.257358431816101,
"learning_rate": 1.8725882200137762e-05,
"loss": 0.5333,
"step": 730
},
{
"epoch": 1.7531305903398926,
"grad_norm": 1.121039628982544,
"learning_rate": 1.8423968464751722e-05,
"loss": 0.4962,
"step": 735
},
{
"epoch": 1.7650566487775792,
"grad_norm": 1.1800827980041504,
"learning_rate": 1.812308162766418e-05,
"loss": 0.4986,
"step": 740
},
{
"epoch": 1.7769827072152653,
"grad_norm": 1.262669324874878,
"learning_rate": 1.7823268674687077e-05,
"loss": 0.5461,
"step": 745
},
{
"epoch": 1.7889087656529516,
"grad_norm": 1.1524207592010498,
"learning_rate": 1.7524576423937025e-05,
"loss": 0.5392,
"step": 750
},
{
"epoch": 1.800834824090638,
"grad_norm": 0.9975520968437195,
"learning_rate": 1.7227051518524286e-05,
"loss": 0.5162,
"step": 755
},
{
"epoch": 1.8127608825283243,
"grad_norm": 1.1616145372390747,
"learning_rate": 1.6930740419269132e-05,
"loss": 0.5431,
"step": 760
},
{
"epoch": 1.8246869409660107,
"grad_norm": 0.9507209658622742,
"learning_rate": 1.6635689397446562e-05,
"loss": 0.5037,
"step": 765
},
{
"epoch": 1.836612999403697,
"grad_norm": 1.0409976243972778,
"learning_rate": 1.6341944527560736e-05,
"loss": 0.5164,
"step": 770
},
{
"epoch": 1.8485390578413834,
"grad_norm": 0.8729709386825562,
"learning_rate": 1.6049551680150047e-05,
"loss": 0.495,
"step": 775
},
{
"epoch": 1.8604651162790697,
"grad_norm": 1.223853588104248,
"learning_rate": 1.5758556514624118e-05,
"loss": 0.5834,
"step": 780
},
{
"epoch": 1.872391174716756,
"grad_norm": 1.184877634048462,
"learning_rate": 1.5469004472133696e-05,
"loss": 0.5459,
"step": 785
},
{
"epoch": 1.8843172331544424,
"grad_norm": 1.163509488105774,
"learning_rate": 1.5180940768474689e-05,
"loss": 0.516,
"step": 790
},
{
"epoch": 1.8962432915921288,
"grad_norm": 1.2189282178878784,
"learning_rate": 1.489441038702735e-05,
"loss": 0.5519,
"step": 795
},
{
"epoch": 1.9081693500298151,
"grad_norm": 1.1891826391220093,
"learning_rate": 1.4609458071731796e-05,
"loss": 0.5173,
"step": 800
},
{
"epoch": 1.9200954084675015,
"grad_norm": 1.3610830307006836,
"learning_rate": 1.4326128320100867e-05,
"loss": 0.517,
"step": 805
},
{
"epoch": 1.9320214669051878,
"grad_norm": 1.148544192314148,
"learning_rate": 1.4044465376271532e-05,
"loss": 0.4906,
"step": 810
},
{
"epoch": 1.9439475253428742,
"grad_norm": 1.404956340789795,
"learning_rate": 1.3764513224095762e-05,
"loss": 0.5598,
"step": 815
},
{
"epoch": 1.9558735837805605,
"grad_norm": 1.206635594367981,
"learning_rate": 1.3486315580272202e-05,
"loss": 0.5276,
"step": 820
},
{
"epoch": 1.9677996422182469,
"grad_norm": 0.8971341848373413,
"learning_rate": 1.320991588751938e-05,
"loss": 0.4966,
"step": 825
},
{
"epoch": 1.9797257006559332,
"grad_norm": 1.1437649726867676,
"learning_rate": 1.2935357307791826e-05,
"loss": 0.5133,
"step": 830
},
{
"epoch": 1.9916517590936196,
"grad_norm": 1.2897300720214844,
"learning_rate": 1.2662682715540031e-05,
"loss": 0.4814,
"step": 835
},
{
"epoch": 2.0035778175313057,
"grad_norm": 1.0928142070770264,
"learning_rate": 1.2391934691015213e-05,
"loss": 0.6167,
"step": 840
},
{
"epoch": 2.0155038759689923,
"grad_norm": 1.1107832193374634,
"learning_rate": 1.2123155513620108e-05,
"loss": 0.4915,
"step": 845
},
{
"epoch": 2.0274299344066784,
"grad_norm": 1.2382787466049194,
"learning_rate": 1.1856387155306715e-05,
"loss": 0.5116,
"step": 850
},
{
"epoch": 2.039355992844365,
"grad_norm": 1.46440589427948,
"learning_rate": 1.1591671274022035e-05,
"loss": 0.4754,
"step": 855
},
{
"epoch": 2.051282051282051,
"grad_norm": 1.4522420167922974,
"learning_rate": 1.1329049207202904e-05,
"loss": 0.4426,
"step": 860
},
{
"epoch": 2.0632081097197377,
"grad_norm": 1.3376868963241577,
"learning_rate": 1.1068561965320764e-05,
"loss": 0.5026,
"step": 865
},
{
"epoch": 2.075134168157424,
"grad_norm": 1.1326589584350586,
"learning_rate": 1.0810250225477611e-05,
"loss": 0.4872,
"step": 870
},
{
"epoch": 2.0870602265951104,
"grad_norm": 1.5386810302734375,
"learning_rate": 1.055415432505393e-05,
"loss": 0.5022,
"step": 875
},
{
"epoch": 2.0989862850327965,
"grad_norm": 1.2370398044586182,
"learning_rate": 1.0300314255409704e-05,
"loss": 0.5388,
"step": 880
},
{
"epoch": 2.110912343470483,
"grad_norm": 1.2834645509719849,
"learning_rate": 1.004876965563945e-05,
"loss": 0.481,
"step": 885
},
{
"epoch": 2.122838401908169,
"grad_norm": 1.3606491088867188,
"learning_rate": 9.79955980638229e-06,
"loss": 0.4829,
"step": 890
},
{
"epoch": 2.1347644603458558,
"grad_norm": 1.3206753730773926,
"learning_rate": 9.552723623687934e-06,
"loss": 0.4902,
"step": 895
},
{
"epoch": 2.146690518783542,
"grad_norm": 1.1643896102905273,
"learning_rate": 9.308299652939666e-06,
"loss": 0.5297,
"step": 900
},
{
"epoch": 2.1586165772212285,
"grad_norm": 1.0991599559783936,
"learning_rate": 9.066326062835179e-06,
"loss": 0.4845,
"step": 905
},
{
"epoch": 2.1705426356589146,
"grad_norm": 1.1832717657089233,
"learning_rate": 8.826840639426218e-06,
"loss": 0.5086,
"step": 910
},
{
"epoch": 2.182468694096601,
"grad_norm": 1.385024070739746,
"learning_rate": 8.589880780218049e-06,
"loss": 0.5136,
"step": 915
},
{
"epoch": 2.1943947525342873,
"grad_norm": 1.1657204627990723,
"learning_rate": 8.355483488329471e-06,
"loss": 0.5137,
"step": 920
},
{
"epoch": 2.206320810971974,
"grad_norm": 1.0423134565353394,
"learning_rate": 8.123685366714556e-06,
"loss": 0.5032,
"step": 925
},
{
"epoch": 2.21824686940966,
"grad_norm": 1.2326374053955078,
"learning_rate": 7.89452261244677e-06,
"loss": 0.4658,
"step": 930
},
{
"epoch": 2.2301729278473466,
"grad_norm": 1.3008525371551514,
"learning_rate": 7.66803101106657e-06,
"loss": 0.5261,
"step": 935
},
{
"epoch": 2.2420989862850327,
"grad_norm": 1.296058177947998,
"learning_rate": 7.44424593099316e-06,
"loss": 0.5216,
"step": 940
},
{
"epoch": 2.2540250447227193,
"grad_norm": 1.3323789834976196,
"learning_rate": 7.223202318001465e-06,
"loss": 0.4949,
"step": 945
},
{
"epoch": 2.2659511031604054,
"grad_norm": 1.2150744199752808,
"learning_rate": 7.0049346897650745e-06,
"loss": 0.464,
"step": 950
},
{
"epoch": 2.277877161598092,
"grad_norm": 1.1837234497070312,
"learning_rate": 6.789477130466057e-06,
"loss": 0.5057,
"step": 955
},
{
"epoch": 2.289803220035778,
"grad_norm": 1.0720417499542236,
"learning_rate": 6.576863285472415e-06,
"loss": 0.4875,
"step": 960
},
{
"epoch": 2.3017292784734646,
"grad_norm": 1.4399152994155884,
"learning_rate": 6.367126356084127e-06,
"loss": 0.4753,
"step": 965
},
{
"epoch": 2.3136553369111508,
"grad_norm": 1.3286051750183105,
"learning_rate": 6.160299094348488e-06,
"loss": 0.4725,
"step": 970
},
{
"epoch": 2.3255813953488373,
"grad_norm": 1.3082605600357056,
"learning_rate": 5.956413797945657e-06,
"loss": 0.5095,
"step": 975
},
{
"epoch": 2.3375074537865235,
"grad_norm": 1.4694650173187256,
"learning_rate": 5.755502305145089e-06,
"loss": 0.5099,
"step": 980
},
{
"epoch": 2.34943351222421,
"grad_norm": 1.1760421991348267,
"learning_rate": 5.557595989833747e-06,
"loss": 0.4744,
"step": 985
},
{
"epoch": 2.361359570661896,
"grad_norm": 1.5401874780654907,
"learning_rate": 5.36272575661684e-06,
"loss": 0.5126,
"step": 990
},
{
"epoch": 2.3732856290995827,
"grad_norm": 1.114990234375,
"learning_rate": 5.170922035991838e-06,
"loss": 0.4636,
"step": 995
},
{
"epoch": 2.385211687537269,
"grad_norm": 1.2573448419570923,
"learning_rate": 4.9822147795964805e-06,
"loss": 0.4817,
"step": 1000
}
],
"logging_steps": 5,
"max_steps": 1257,
"num_input_tokens_seen": 0,
"num_train_epochs": 3,
"save_steps": 100,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 3.48527209218048e+17,
"train_batch_size": 1,
"trial_name": null,
"trial_params": null
}