{
"best_metric": 0.3242824375629425,
"best_model_checkpoint": "/workspace/plateer_classifier_v0.1_result/checkpoint-110000",
"epoch": 0.6441270979878347,
"eval_steps": 55000,
"global_step": 110000,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0014640195241643742,
"grad_norm": 50.05304718017578,
"learning_rate": 4.880000000000001e-06,
"loss": 4.3958,
"step": 250
},
{
"epoch": 0.0029280390483287485,
"grad_norm": 48.363304138183594,
"learning_rate": 9.88e-06,
"loss": 1.6496,
"step": 500
},
{
"epoch": 0.004392058572493123,
"grad_norm": 54.546974182128906,
"learning_rate": 1.488e-05,
"loss": 0.8787,
"step": 750
},
{
"epoch": 0.005856078096657497,
"grad_norm": 50.317874908447266,
"learning_rate": 1.9880000000000003e-05,
"loss": 0.7721,
"step": 1000
},
{
"epoch": 0.007320097620821872,
"grad_norm": 62.48823928833008,
"learning_rate": 2.488e-05,
"loss": 0.7047,
"step": 1250
},
{
"epoch": 0.008784117144986246,
"grad_norm": 44.35001754760742,
"learning_rate": 2.9880000000000002e-05,
"loss": 0.6749,
"step": 1500
},
{
"epoch": 0.01024813666915062,
"grad_norm": 36.486793518066406,
"learning_rate": 3.4880000000000005e-05,
"loss": 0.6409,
"step": 1750
},
{
"epoch": 0.011712156193314994,
"grad_norm": 47.03588104248047,
"learning_rate": 3.988e-05,
"loss": 0.6406,
"step": 2000
},
{
"epoch": 0.013176175717479368,
"grad_norm": 31.227832794189453,
"learning_rate": 4.488e-05,
"loss": 0.6149,
"step": 2250
},
{
"epoch": 0.014640195241643743,
"grad_norm": 39.8408317565918,
"learning_rate": 4.9880000000000004e-05,
"loss": 0.5956,
"step": 2500
},
{
"epoch": 0.016104214765808117,
"grad_norm": 41.118736267089844,
"learning_rate": 5.4879999999999996e-05,
"loss": 0.5905,
"step": 2750
},
{
"epoch": 0.017568234289972492,
"grad_norm": 29.624338150024414,
"learning_rate": 5.988e-05,
"loss": 0.5608,
"step": 3000
},
{
"epoch": 0.019032253814136865,
"grad_norm": 22.993818283081055,
"learning_rate": 6.488e-05,
"loss": 0.5614,
"step": 3250
},
{
"epoch": 0.02049627333830124,
"grad_norm": 19.964269638061523,
"learning_rate": 6.988e-05,
"loss": 0.5569,
"step": 3500
},
{
"epoch": 0.021960292862465612,
"grad_norm": 36.538047790527344,
"learning_rate": 7.488e-05,
"loss": 0.5316,
"step": 3750
},
{
"epoch": 0.023424312386629988,
"grad_norm": 37.63505935668945,
"learning_rate": 7.988e-05,
"loss": 0.5364,
"step": 4000
},
{
"epoch": 0.024888331910794363,
"grad_norm": 25.934967041015625,
"learning_rate": 8.486000000000001e-05,
"loss": 0.5234,
"step": 4250
},
{
"epoch": 0.026352351434958735,
"grad_norm": 24.810028076171875,
"learning_rate": 8.986e-05,
"loss": 0.5155,
"step": 4500
},
{
"epoch": 0.02781637095912311,
"grad_norm": 32.76811981201172,
"learning_rate": 9.484e-05,
"loss": 0.5022,
"step": 4750
},
{
"epoch": 0.029280390483287486,
"grad_norm": 27.094772338867188,
"learning_rate": 9.984e-05,
"loss": 0.5023,
"step": 5000
},
{
"epoch": 0.029280390483287486,
"eval_accuracy": 0.8572352668691132,
"eval_loss": 0.5044249296188354,
"eval_runtime": 11541.1431,
"eval_samples_per_second": 210.432,
"eval_steps_per_second": 6.576,
"step": 5000
},
{
"epoch": 0.03074441000745186,
"grad_norm": 24.74563217163086,
"learning_rate": 0.00010484,
"loss": 0.5073,
"step": 5250
},
{
"epoch": 0.032208429531616234,
"grad_norm": 17.229019165039062,
"learning_rate": 0.00010984,
"loss": 0.4932,
"step": 5500
},
{
"epoch": 0.03367244905578061,
"grad_norm": 23.318979263305664,
"learning_rate": 0.00011484000000000002,
"loss": 0.504,
"step": 5750
},
{
"epoch": 0.035136468579944985,
"grad_norm": 22.271846771240234,
"learning_rate": 0.00011983999999999999,
"loss": 0.4817,
"step": 6000
},
{
"epoch": 0.036600488104109354,
"grad_norm": 24.304887771606445,
"learning_rate": 0.00012484,
"loss": 0.4966,
"step": 6250
},
{
"epoch": 0.03806450762827373,
"grad_norm": 23.76158905029297,
"learning_rate": 0.00012984000000000002,
"loss": 0.4899,
"step": 6500
},
{
"epoch": 0.039528527152438105,
"grad_norm": 20.765274047851562,
"learning_rate": 0.00013484,
"loss": 0.4773,
"step": 6750
},
{
"epoch": 0.04099254667660248,
"grad_norm": 12.793950080871582,
"learning_rate": 0.00013982000000000003,
"loss": 0.4781,
"step": 7000
},
{
"epoch": 0.042456566200766856,
"grad_norm": 14.128210067749023,
"learning_rate": 0.00014482,
"loss": 0.4687,
"step": 7250
},
{
"epoch": 0.043920585724931224,
"grad_norm": 22.348928451538086,
"learning_rate": 0.00014982,
"loss": 0.4722,
"step": 7500
},
{
"epoch": 0.0453846052490956,
"grad_norm": 17.29800796508789,
"learning_rate": 0.00015480000000000002,
"loss": 0.4692,
"step": 7750
},
{
"epoch": 0.046848624773259975,
"grad_norm": 11.0147066116333,
"learning_rate": 0.0001598,
"loss": 0.4689,
"step": 8000
},
{
"epoch": 0.04831264429742435,
"grad_norm": 11.713265419006348,
"learning_rate": 0.0001648,
"loss": 0.4788,
"step": 8250
},
{
"epoch": 0.049776663821588726,
"grad_norm": 12.367693901062012,
"learning_rate": 0.0001698,
"loss": 0.4697,
"step": 8500
},
{
"epoch": 0.0512406833457531,
"grad_norm": 8.11889934539795,
"learning_rate": 0.00017480000000000002,
"loss": 0.4696,
"step": 8750
},
{
"epoch": 0.05270470286991747,
"grad_norm": 12.321019172668457,
"learning_rate": 0.0001798,
"loss": 0.461,
"step": 9000
},
{
"epoch": 0.054168722394081846,
"grad_norm": 15.612183570861816,
"learning_rate": 0.00018480000000000002,
"loss": 0.4646,
"step": 9250
},
{
"epoch": 0.05563274191824622,
"grad_norm": 10.72978687286377,
"learning_rate": 0.0001898,
"loss": 0.4673,
"step": 9500
},
{
"epoch": 0.0570967614424106,
"grad_norm": 8.815441131591797,
"learning_rate": 0.0001948,
"loss": 0.4472,
"step": 9750
},
{
"epoch": 0.05856078096657497,
"grad_norm": 8.681705474853516,
"learning_rate": 0.0001998,
"loss": 0.4629,
"step": 10000
},
{
"epoch": 0.05856078096657497,
"eval_accuracy": 0.8688706572649133,
"eval_loss": 0.457188218832016,
"eval_runtime": 11537.8227,
"eval_samples_per_second": 210.492,
"eval_steps_per_second": 6.578,
"step": 10000
},
{
"epoch": 0.06002480049073934,
"grad_norm": 13.643828392028809,
"learning_rate": 0.0001997014219778306,
"loss": 0.456,
"step": 10250
},
{
"epoch": 0.06148882001490372,
"grad_norm": 13.211404800415039,
"learning_rate": 0.00019939040320473745,
"loss": 0.4666,
"step": 10500
},
{
"epoch": 0.06295283953906809,
"grad_norm": 11.1001615524292,
"learning_rate": 0.00019907938443164432,
"loss": 0.4495,
"step": 10750
},
{
"epoch": 0.06441685906323247,
"grad_norm": 8.222249984741211,
"learning_rate": 0.00019876836565855117,
"loss": 0.4483,
"step": 11000
},
{
"epoch": 0.06588087858739684,
"grad_norm": 13.589752197265625,
"learning_rate": 0.0001984585909605504,
"loss": 0.4438,
"step": 11250
},
{
"epoch": 0.06734489811156122,
"grad_norm": 9.988068580627441,
"learning_rate": 0.00019814757218745724,
"loss": 0.447,
"step": 11500
},
{
"epoch": 0.0688089176357256,
"grad_norm": 8.311960220336914,
"learning_rate": 0.0001978365534143641,
"loss": 0.4476,
"step": 11750
},
{
"epoch": 0.07027293715988997,
"grad_norm": 8.099685668945312,
"learning_rate": 0.00019752553464127094,
"loss": 0.4477,
"step": 12000
},
{
"epoch": 0.07173695668405435,
"grad_norm": 8.23130989074707,
"learning_rate": 0.00019721451586817782,
"loss": 0.4385,
"step": 12250
},
{
"epoch": 0.07320097620821871,
"grad_norm": 10.875362396240234,
"learning_rate": 0.00019690349709508467,
"loss": 0.4345,
"step": 12500
},
{
"epoch": 0.07466499573238308,
"grad_norm": 9.479572296142578,
"learning_rate": 0.00019659247832199152,
"loss": 0.4345,
"step": 12750
},
{
"epoch": 0.07612901525654746,
"grad_norm": 11.883151054382324,
"learning_rate": 0.0001962814595488984,
"loss": 0.4241,
"step": 13000
},
{
"epoch": 0.07759303478071183,
"grad_norm": 8.15208911895752,
"learning_rate": 0.00019597044077580524,
"loss": 0.4335,
"step": 13250
},
{
"epoch": 0.07905705430487621,
"grad_norm": 9.323240280151367,
"learning_rate": 0.0001956594220027121,
"loss": 0.4396,
"step": 13500
},
{
"epoch": 0.08052107382904058,
"grad_norm": 7.250824928283691,
"learning_rate": 0.00019534840322961897,
"loss": 0.4376,
"step": 13750
},
{
"epoch": 0.08198509335320496,
"grad_norm": 12.220071792602539,
"learning_rate": 0.0001950373844565258,
"loss": 0.4323,
"step": 14000
},
{
"epoch": 0.08344911287736934,
"grad_norm": 8.460916519165039,
"learning_rate": 0.00019472636568343266,
"loss": 0.4271,
"step": 14250
},
{
"epoch": 0.08491313240153371,
"grad_norm": 6.110500812530518,
"learning_rate": 0.0001944153469103395,
"loss": 0.4253,
"step": 14500
},
{
"epoch": 0.08637715192569809,
"grad_norm": 10.618386268615723,
"learning_rate": 0.00019410432813724636,
"loss": 0.427,
"step": 14750
},
{
"epoch": 0.08784117144986245,
"grad_norm": 9.827556610107422,
"learning_rate": 0.00019379330936415324,
"loss": 0.4254,
"step": 15000
},
{
"epoch": 0.08784117144986245,
"eval_accuracy": 0.877075711565186,
"eval_loss": 0.4201970100402832,
"eval_runtime": 11537.2443,
"eval_samples_per_second": 210.503,
"eval_steps_per_second": 6.578,
"step": 15000
},
{
"epoch": 0.0892994385846771,
"grad_norm": 10.84184455871582,
"learning_rate": 0.00019349020046898423,
"loss": 0.4211,
"step": 15250
},
{
"epoch": 0.09076336380737672,
"grad_norm": 7.9568657875061035,
"learning_rate": 0.00019317920297562402,
"loss": 0.4203,
"step": 15500
},
{
"epoch": 0.09222728903007635,
"grad_norm": 12.237702369689941,
"learning_rate": 0.00019286820548226384,
"loss": 0.4181,
"step": 15750
},
{
"epoch": 0.09369121425277596,
"grad_norm": 25.739120483398438,
"learning_rate": 0.00019255720798890363,
"loss": 0.4143,
"step": 16000
},
{
"epoch": 0.09515513947547559,
"grad_norm": 8.341870307922363,
"learning_rate": 0.00019224621049554342,
"loss": 0.4171,
"step": 16250
},
{
"epoch": 0.09661906469817522,
"grad_norm": 10.707802772521973,
"learning_rate": 0.0001919352130021832,
"loss": 0.4058,
"step": 16500
},
{
"epoch": 0.09808298992087484,
"grad_norm": 7.021149158477783,
"learning_rate": 0.00019162421550882302,
"loss": 0.4211,
"step": 16750
},
{
"epoch": 0.09954691514357447,
"grad_norm": 11.840470314025879,
"learning_rate": 0.0001913132180154628,
"loss": 0.4093,
"step": 17000
},
{
"epoch": 0.10101084036627409,
"grad_norm": 7.401727676391602,
"learning_rate": 0.0001910022205221026,
"loss": 0.4281,
"step": 17250
},
{
"epoch": 0.10247476558897371,
"grad_norm": 7.601231575012207,
"learning_rate": 0.00019069246701871584,
"loss": 0.4044,
"step": 17500
},
{
"epoch": 0.10393869081167334,
"grad_norm": 6.85632848739624,
"learning_rate": 0.00019038146952535563,
"loss": 0.4244,
"step": 17750
},
{
"epoch": 0.10540261603437297,
"grad_norm": 10.810693740844727,
"learning_rate": 0.00019007171602196887,
"loss": 0.4216,
"step": 18000
},
{
"epoch": 0.1068665412570726,
"grad_norm": 9.758743286132812,
"learning_rate": 0.00018976071852860865,
"loss": 0.417,
"step": 18250
},
{
"epoch": 0.10833046647977221,
"grad_norm": 10.75692367553711,
"learning_rate": 0.00018944972103524847,
"loss": 0.4143,
"step": 18500
},
{
"epoch": 0.10979439170247184,
"grad_norm": 10.375711441040039,
"learning_rate": 0.00018913872354188826,
"loss": 0.4075,
"step": 18750
},
{
"epoch": 0.11125831692517146,
"grad_norm": 8.414403915405273,
"learning_rate": 0.00018882772604852805,
"loss": 0.4148,
"step": 19000
},
{
"epoch": 0.11272224214787109,
"grad_norm": 9.86490249633789,
"learning_rate": 0.00018851672855516786,
"loss": 0.4074,
"step": 19250
},
{
"epoch": 0.11418616737057072,
"grad_norm": 7.522060394287109,
"learning_rate": 0.00018820573106180765,
"loss": 0.4106,
"step": 19500
},
{
"epoch": 0.11565009259327033,
"grad_norm": 7.423270225524902,
"learning_rate": 0.00018789473356844744,
"loss": 0.4034,
"step": 19750
},
{
"epoch": 0.11711401781596996,
"grad_norm": 8.761688232421875,
"learning_rate": 0.00018758373607508723,
"loss": 0.4025,
"step": 20000
},
{
"epoch": 0.11711401781596996,
"eval_accuracy": 0.8823756104911845,
"eval_loss": 0.4016551673412323,
"eval_runtime": 11547.1595,
"eval_samples_per_second": 210.336,
"eval_steps_per_second": 6.573,
"step": 20000
},
{
"epoch": 0.11857794303866959,
"grad_norm": 9.6015043258667,
"learning_rate": 0.0001872802025215677,
"loss": 0.4087,
"step": 20250
},
{
"epoch": 0.12004186826136921,
"grad_norm": 6.658656120300293,
"learning_rate": 0.00018696920502820748,
"loss": 0.408,
"step": 20500
},
{
"epoch": 0.12150579348406883,
"grad_norm": 6.935655117034912,
"learning_rate": 0.00018665820753484727,
"loss": 0.3983,
"step": 20750
},
{
"epoch": 0.12296971870676845,
"grad_norm": 7.918155193328857,
"learning_rate": 0.00018634721004148706,
"loss": 0.3994,
"step": 21000
},
{
"epoch": 0.12443364392946808,
"grad_norm": 7.246758937835693,
"learning_rate": 0.00018603621254812688,
"loss": 0.4111,
"step": 21250
},
{
"epoch": 0.1258975691521677,
"grad_norm": 8.375380516052246,
"learning_rate": 0.00018572521505476667,
"loss": 0.4006,
"step": 21500
},
{
"epoch": 0.12736149437486732,
"grad_norm": 6.993825435638428,
"learning_rate": 0.0001854154615513799,
"loss": 0.4113,
"step": 21750
},
{
"epoch": 0.12882541959756696,
"grad_norm": 8.703255653381348,
"learning_rate": 0.00018510446405801972,
"loss": 0.3977,
"step": 22000
},
{
"epoch": 0.13028934482026658,
"grad_norm": 6.940033912658691,
"learning_rate": 0.0001847934665646595,
"loss": 0.4005,
"step": 22250
},
{
"epoch": 0.1317532700429662,
"grad_norm": 6.712055683135986,
"learning_rate": 0.0001844824690712993,
"loss": 0.41,
"step": 22500
},
{
"epoch": 0.13321719526566583,
"grad_norm": 6.171209812164307,
"learning_rate": 0.0001841714715779391,
"loss": 0.3971,
"step": 22750
},
{
"epoch": 0.13468112048836545,
"grad_norm": 10.764921188354492,
"learning_rate": 0.0001838604740845789,
"loss": 0.4105,
"step": 23000
},
{
"epoch": 0.1361450457110651,
"grad_norm": 8.0676908493042,
"learning_rate": 0.0001835494765912187,
"loss": 0.3958,
"step": 23250
},
{
"epoch": 0.1376089709337647,
"grad_norm": 5.20599365234375,
"learning_rate": 0.00018323847909785848,
"loss": 0.3946,
"step": 23500
},
{
"epoch": 0.13907289615646432,
"grad_norm": 5.9439239501953125,
"learning_rate": 0.0001829274816044983,
"loss": 0.3951,
"step": 23750
},
{
"epoch": 0.14053682137916396,
"grad_norm": 9.821541786193848,
"learning_rate": 0.0001826164841111381,
"loss": 0.3906,
"step": 24000
},
{
"epoch": 0.14200074660186357,
"grad_norm": 6.659691333770752,
"learning_rate": 0.00018230673060775133,
"loss": 0.4009,
"step": 24250
},
{
"epoch": 0.1434646718245632,
"grad_norm": 6.624240398406982,
"learning_rate": 0.00018199573311439112,
"loss": 0.3975,
"step": 24500
},
{
"epoch": 0.14492859704726282,
"grad_norm": 7.993641376495361,
"learning_rate": 0.0001816847356210309,
"loss": 0.3925,
"step": 24750
},
{
"epoch": 0.14639252226996244,
"grad_norm": 6.6386613845825195,
"learning_rate": 0.0001813737381276707,
"loss": 0.3975,
"step": 25000
},
{
"epoch": 0.14785644749266208,
"grad_norm": 9.204560279846191,
"learning_rate": 0.0001810627406343105,
"loss": 0.3997,
"step": 25250
},
{
"epoch": 0.1493203727153617,
"grad_norm": 8.072566986083984,
"learning_rate": 0.0001807517431409503,
"loss": 0.4022,
"step": 25500
},
{
"epoch": 0.15078429793806133,
"grad_norm": 10.15225601196289,
"learning_rate": 0.0001804407456475901,
"loss": 0.392,
"step": 25750
},
{
"epoch": 0.15224822316076095,
"grad_norm": 7.751401901245117,
"learning_rate": 0.0001801297481542299,
"loss": 0.3946,
"step": 26000
},
{
"epoch": 0.15371214838346056,
"grad_norm": 8.481501579284668,
"learning_rate": 0.0001798187506608697,
"loss": 0.3883,
"step": 26250
},
{
"epoch": 0.1551760736061602,
"grad_norm": 9.861278533935547,
"learning_rate": 0.00017950775316750948,
"loss": 0.3824,
"step": 26500
},
{
"epoch": 0.15663999882885982,
"grad_norm": 6.405235290527344,
"learning_rate": 0.0001791967556741493,
"loss": 0.4006,
"step": 26750
},
{
"epoch": 0.15810392405155946,
"grad_norm": 9.90355110168457,
"learning_rate": 0.00017888575818078909,
"loss": 0.3881,
"step": 27000
},
{
"epoch": 0.15956784927425907,
"grad_norm": 9.354215621948242,
"learning_rate": 0.00017857476068742887,
"loss": 0.3965,
"step": 27250
},
{
"epoch": 0.16103177449695869,
"grad_norm": 9.162219047546387,
"learning_rate": 0.00017826376319406866,
"loss": 0.3933,
"step": 27500
},
{
"epoch": 0.16249569971965833,
"grad_norm": 6.755202770233154,
"learning_rate": 0.00017795276570070848,
"loss": 0.3874,
"step": 27750
},
{
"epoch": 0.16395962494235794,
"grad_norm": 8.385200500488281,
"learning_rate": 0.00017764176820734827,
"loss": 0.3873,
"step": 28000
},
{
"epoch": 0.16542355016505758,
"grad_norm": 6.508645057678223,
"learning_rate": 0.00017733077071398806,
"loss": 0.3895,
"step": 28250
},
{
"epoch": 0.1668874753877572,
"grad_norm": 8.241129875183105,
"learning_rate": 0.00017702226120057472,
"loss": 0.3912,
"step": 28500
},
{
"epoch": 0.1683514006104568,
"grad_norm": 7.879597187042236,
"learning_rate": 0.00017671126370721454,
"loss": 0.3929,
"step": 28750
},
{
"epoch": 0.16981532583315645,
"grad_norm": 12.0702486038208,
"learning_rate": 0.00017640026621385432,
"loss": 0.404,
"step": 29000
},
{
"epoch": 0.17127925105585606,
"grad_norm": 8.789772033691406,
"learning_rate": 0.0001760892687204941,
"loss": 0.3823,
"step": 29250
},
{
"epoch": 0.1727431762785557,
"grad_norm": 11.022305488586426,
"learning_rate": 0.00017577827122713393,
"loss": 0.3887,
"step": 29500
},
{
"epoch": 0.17420710150125532,
"grad_norm": 7.665167331695557,
"learning_rate": 0.00017546727373377372,
"loss": 0.394,
"step": 29750
},
{
"epoch": 0.17567102672395493,
"grad_norm": 11.05783748626709,
"learning_rate": 0.0001751562762404135,
"loss": 0.3938,
"step": 30000
},
{
"epoch": 0.17713495194665457,
"grad_norm": 8.389631271362305,
"learning_rate": 0.0001748452787470533,
"loss": 0.39,
"step": 30250
},
{
"epoch": 0.1785988771693542,
"grad_norm": 8.158947944641113,
"learning_rate": 0.0001745342812536931,
"loss": 0.3818,
"step": 30500
},
{
"epoch": 0.1800628023920538,
"grad_norm": 7.684356689453125,
"learning_rate": 0.0001742232837603329,
"loss": 0.3905,
"step": 30750
},
{
"epoch": 0.18152672761475344,
"grad_norm": 10.129668235778809,
"learning_rate": 0.00017391353025694614,
"loss": 0.3886,
"step": 31000
},
{
"epoch": 0.18299065283745305,
"grad_norm": 6.924737453460693,
"learning_rate": 0.00017360253276358593,
"loss": 0.3892,
"step": 31250
},
{
"epoch": 0.1844545780601527,
"grad_norm": 5.863354206085205,
"learning_rate": 0.00017329153527022572,
"loss": 0.3822,
"step": 31500
},
{
"epoch": 0.1859185032828523,
"grad_norm": 9.10240650177002,
"learning_rate": 0.00017298053777686553,
"loss": 0.3895,
"step": 31750
},
{
"epoch": 0.18738242850555192,
"grad_norm": 9.565494537353516,
"learning_rate": 0.00017266954028350532,
"loss": 0.383,
"step": 32000
},
{
"epoch": 0.18884635372825156,
"grad_norm": 8.238012313842773,
"learning_rate": 0.0001723585427901451,
"loss": 0.3854,
"step": 32250
},
{
"epoch": 0.19031027895095118,
"grad_norm": 9.350130081176758,
"learning_rate": 0.0001720475452967849,
"loss": 0.3922,
"step": 32500
},
{
"epoch": 0.19177420417365082,
"grad_norm": 6.337550163269043,
"learning_rate": 0.00017173654780342472,
"loss": 0.3778,
"step": 32750
},
{
"epoch": 0.19323812939635043,
"grad_norm": 8.421921730041504,
"learning_rate": 0.00017142679430003793,
"loss": 0.3929,
"step": 33000
},
{
"epoch": 0.19470205461905005,
"grad_norm": 8.888238906860352,
"learning_rate": 0.00017111579680667774,
"loss": 0.3844,
"step": 33250
},
{
"epoch": 0.1961659798417497,
"grad_norm": 10.774327278137207,
"learning_rate": 0.00017080479931331753,
"loss": 0.3804,
"step": 33500
},
{
"epoch": 0.1976299050644493,
"grad_norm": 7.07879114151001,
"learning_rate": 0.00017049380181995732,
"loss": 0.3954,
"step": 33750
},
{
"epoch": 0.19909383028714894,
"grad_norm": 7.102870941162109,
"learning_rate": 0.00017018280432659714,
"loss": 0.3815,
"step": 34000
},
{
"epoch": 0.20055775550984856,
"grad_norm": 5.815110206604004,
"learning_rate": 0.00016987180683323693,
"loss": 0.3907,
"step": 34250
},
{
"epoch": 0.20202168073254817,
"grad_norm": 7.749156475067139,
"learning_rate": 0.00016956080933987672,
"loss": 0.3798,
"step": 34500
},
{
"epoch": 0.2034856059552478,
"grad_norm": 7.0530476570129395,
"learning_rate": 0.0001692498118465165,
"loss": 0.3947,
"step": 34750
},
{
"epoch": 0.20494953117794742,
"grad_norm": 6.623088836669922,
"learning_rate": 0.00016893881435315632,
"loss": 0.3816,
"step": 35000
},
{
"epoch": 0.20641345640064707,
"grad_norm": 8.431561470031738,
"learning_rate": 0.0001686278168597961,
"loss": 0.3815,
"step": 35250
},
{
"epoch": 0.20787738162334668,
"grad_norm": 11.600255012512207,
"learning_rate": 0.00016831806335640935,
"loss": 0.3782,
"step": 35500
},
{
"epoch": 0.2093413068460463,
"grad_norm": 5.186095237731934,
"learning_rate": 0.00016800706586304914,
"loss": 0.3828,
"step": 35750
},
{
"epoch": 0.21080523206874593,
"grad_norm": 12.819711685180664,
"learning_rate": 0.00016769606836968895,
"loss": 0.3902,
"step": 36000
},
{
"epoch": 0.21226915729144555,
"grad_norm": 7.843264579772949,
"learning_rate": 0.00016738507087632874,
"loss": 0.3716,
"step": 36250
},
{
"epoch": 0.2137330825141452,
"grad_norm": 8.602349281311035,
"learning_rate": 0.00016707407338296853,
"loss": 0.3791,
"step": 36500
},
{
"epoch": 0.2151970077368448,
"grad_norm": 7.939485549926758,
"learning_rate": 0.00016676307588960832,
"loss": 0.3752,
"step": 36750
},
{
"epoch": 0.21666093295954442,
"grad_norm": 6.328729629516602,
"learning_rate": 0.00016645207839624814,
"loss": 0.3761,
"step": 37000
},
{
"epoch": 0.21812485818224406,
"grad_norm": 6.196065902709961,
"learning_rate": 0.00016614108090288793,
"loss": 0.3817,
"step": 37250
},
{
"epoch": 0.21958878340494367,
"grad_norm": 10.096115112304688,
"learning_rate": 0.00016583008340952771,
"loss": 0.3828,
"step": 37500
},
{
"epoch": 0.2210527086276433,
"grad_norm": 6.120075702667236,
"learning_rate": 0.0001655190859161675,
"loss": 0.3774,
"step": 37750
},
{
"epoch": 0.22251663385034293,
"grad_norm": 6.575611114501953,
"learning_rate": 0.00016520808842280732,
"loss": 0.3823,
"step": 38000
},
{
"epoch": 0.22398055907304254,
"grad_norm": 7.636918067932129,
"learning_rate": 0.0001648970909294471,
"loss": 0.3846,
"step": 38250
},
{
"epoch": 0.22544448429574218,
"grad_norm": 15.759072303771973,
"learning_rate": 0.00016458733742606037,
"loss": 0.3842,
"step": 38500
},
{
"epoch": 0.2269084095184418,
"grad_norm": 10.398168563842773,
"learning_rate": 0.0001642775839226736,
"loss": 0.3794,
"step": 38750
},
{
"epoch": 0.22837233474114144,
"grad_norm": 6.939914703369141,
"learning_rate": 0.0001639665864293134,
"loss": 0.3763,
"step": 39000
},
{
"epoch": 0.22983625996384105,
"grad_norm": 11.021454811096191,
"learning_rate": 0.0001636555889359532,
"loss": 0.368,
"step": 39250
},
{
"epoch": 0.23130018518654066,
"grad_norm": 7.381429195404053,
"learning_rate": 0.00016334459144259298,
"loss": 0.3783,
"step": 39500
},
{
"epoch": 0.2327641104092403,
"grad_norm": 9.803789138793945,
"learning_rate": 0.0001630335939492328,
"loss": 0.3828,
"step": 39750
},
{
"epoch": 0.23422803563193992,
"grad_norm": 7.722465991973877,
"learning_rate": 0.00016272259645587259,
"loss": 0.3764,
"step": 40000
},
{
"epoch": 0.23569196085463953,
"grad_norm": 8.471487998962402,
"learning_rate": 0.00016241159896251237,
"loss": 0.3879,
"step": 40250
},
{
"epoch": 0.23715588607733917,
"grad_norm": 9.46483039855957,
"learning_rate": 0.00016210060146915216,
"loss": 0.3772,
"step": 40500
},
{
"epoch": 0.2386198113000388,
"grad_norm": 11.850425720214844,
"learning_rate": 0.00016178960397579198,
"loss": 0.3688,
"step": 40750
},
{
"epoch": 0.24008373652273843,
"grad_norm": 7.718139171600342,
"learning_rate": 0.00016147860648243177,
"loss": 0.3728,
"step": 41000
},
{
"epoch": 0.24154766174543804,
"grad_norm": 7.039102077484131,
"learning_rate": 0.00016116760898907156,
"loss": 0.3718,
"step": 41250
},
{
"epoch": 0.24301158696813766,
"grad_norm": 6.891547679901123,
"learning_rate": 0.00016085661149571137,
"loss": 0.3713,
"step": 41500
},
{
"epoch": 0.2444755121908373,
"grad_norm": 8.54554271697998,
"learning_rate": 0.00016054561400235116,
"loss": 0.3818,
"step": 41750
},
{
"epoch": 0.2459394374135369,
"grad_norm": 6.554268836975098,
"learning_rate": 0.00016023461650899095,
"loss": 0.3706,
"step": 42000
},
{
"epoch": 0.24740336263623655,
"grad_norm": 6.389885902404785,
"learning_rate": 0.00015992361901563074,
"loss": 0.3577,
"step": 42250
},
{
"epoch": 0.24886728785893616,
"grad_norm": 6.833805561065674,
"learning_rate": 0.00015961262152227056,
"loss": 0.3722,
"step": 42500
},
{
"epoch": 0.2503312130816358,
"grad_norm": 9.135841369628906,
"learning_rate": 0.00015930162402891034,
"loss": 0.3747,
"step": 42750
},
{
"epoch": 0.2517951383043354,
"grad_norm": 7.466910362243652,
"learning_rate": 0.00015899187052552358,
"loss": 0.378,
"step": 43000
},
{
"epoch": 0.25325906352703503,
"grad_norm": 14.597432136535645,
"learning_rate": 0.00015868087303216337,
"loss": 0.3743,
"step": 43250
},
{
"epoch": 0.25472298874973465,
"grad_norm": 6.523279190063477,
"learning_rate": 0.00015836987553880316,
"loss": 0.3728,
"step": 43500
},
{
"epoch": 0.25618691397243426,
"grad_norm": 5.352029800415039,
"learning_rate": 0.00015805887804544298,
"loss": 0.367,
"step": 43750
},
{
"epoch": 0.25765083919513393,
"grad_norm": 8.408788681030273,
"learning_rate": 0.00015774788055208277,
"loss": 0.3694,
"step": 44000
},
{
"epoch": 0.25911476441783354,
"grad_norm": 7.64408016204834,
"learning_rate": 0.00015743688305872256,
"loss": 0.3664,
"step": 44250
},
{
"epoch": 0.26057868964053316,
"grad_norm": 4.888110637664795,
"learning_rate": 0.00015712588556536234,
"loss": 0.3637,
"step": 44500
},
{
"epoch": 0.26204261486323277,
"grad_norm": 5.068843841552734,
"learning_rate": 0.00015681488807200216,
"loss": 0.369,
"step": 44750
},
{
"epoch": 0.2635065400859324,
"grad_norm": 6.427637577056885,
"learning_rate": 0.00015650389057864195,
"loss": 0.3788,
"step": 45000
},
{
"epoch": 0.26497046530863205,
"grad_norm": 8.00766658782959,
"learning_rate": 0.00015619289308528174,
"loss": 0.3638,
"step": 45250
},
{
"epoch": 0.26643439053133167,
"grad_norm": 8.729680061340332,
"learning_rate": 0.00015588189559192155,
"loss": 0.3736,
"step": 45500
},
{
"epoch": 0.2678983157540313,
"grad_norm": 10.317773818969727,
"learning_rate": 0.00015557089809856134,
"loss": 0.3618,
"step": 45750
},
{
"epoch": 0.2693622409767309,
"grad_norm": 7.715869903564453,
"learning_rate": 0.00015525990060520113,
"loss": 0.3741,
"step": 46000
},
{
"epoch": 0.2708261661994305,
"grad_norm": 5.711330890655518,
"learning_rate": 0.00015494890311184092,
"loss": 0.3745,
"step": 46250
},
{
"epoch": 0.2722900914221302,
"grad_norm": 9.835432052612305,
"learning_rate": 0.00015463790561848074,
"loss": 0.3693,
"step": 46500
},
{
"epoch": 0.2737540166448298,
"grad_norm": 6.019217014312744,
"learning_rate": 0.00015432815211509395,
"loss": 0.3674,
"step": 46750
},
{
"epoch": 0.2752179418675294,
"grad_norm": 7.813283443450928,
"learning_rate": 0.00015401715462173376,
"loss": 0.3674,
"step": 47000
},
{
"epoch": 0.276681867090229,
"grad_norm": 7.319979190826416,
"learning_rate": 0.00015370615712837355,
"loss": 0.3675,
"step": 47250
},
{
"epoch": 0.27814579231292863,
"grad_norm": 8.74886703491211,
"learning_rate": 0.00015339515963501334,
"loss": 0.3633,
"step": 47500
},
{
"epoch": 0.2796097175356283,
"grad_norm": 9.456360816955566,
"learning_rate": 0.00015308416214165316,
"loss": 0.379,
"step": 47750
},
{
"epoch": 0.2810736427583279,
"grad_norm": 10.024221420288086,
"learning_rate": 0.00015277316464829295,
"loss": 0.375,
"step": 48000
},
{
"epoch": 0.2825375679810275,
"grad_norm": 6.477073669433594,
"learning_rate": 0.00015246216715493274,
"loss": 0.3634,
"step": 48250
},
{
"epoch": 0.28400149320372714,
"grad_norm": 8.587589263916016,
"learning_rate": 0.00015215116966157255,
"loss": 0.3693,
"step": 48500
},
{
"epoch": 0.28546541842642675,
"grad_norm": 10.675822257995605,
"learning_rate": 0.00015184017216821234,
"loss": 0.3668,
"step": 48750
},
{
"epoch": 0.2869293436491264,
"grad_norm": 10.77786636352539,
"learning_rate": 0.00015153041866482558,
"loss": 0.3711,
"step": 49000
},
{
"epoch": 0.28839326887182604,
"grad_norm": 7.768797874450684,
"learning_rate": 0.00015121942117146537,
"loss": 0.3692,
"step": 49250
},
{
"epoch": 0.28985719409452565,
"grad_norm": 6.11573600769043,
"learning_rate": 0.00015090842367810516,
"loss": 0.3618,
"step": 49500
},
{
"epoch": 0.29132111931722526,
"grad_norm": 7.369346618652344,
"learning_rate": 0.00015059742618474495,
"loss": 0.365,
"step": 49750
},
{
"epoch": 0.2927850445399249,
"grad_norm": 10.559876441955566,
"learning_rate": 0.00015028642869138476,
"loss": 0.369,
"step": 50000
},
{
"epoch": 0.29424896976262455,
"grad_norm": 6.763681888580322,
"learning_rate": 0.00014997543119802455,
"loss": 0.3723,
"step": 50250
},
{
"epoch": 0.29571289498532416,
"grad_norm": 14.075911521911621,
"learning_rate": 0.00014966443370466434,
"loss": 0.3656,
"step": 50500
},
{
"epoch": 0.2971768202080238,
"grad_norm": 7.817617893218994,
"learning_rate": 0.00014935343621130416,
"loss": 0.3745,
"step": 50750
},
{
"epoch": 0.2986407454307234,
"grad_norm": 5.018287181854248,
"learning_rate": 0.00014904243871794395,
"loss": 0.3664,
"step": 51000
},
{
"epoch": 0.300104670653423,
"grad_norm": 9.846301078796387,
"learning_rate": 0.00014873144122458373,
"loss": 0.3644,
"step": 51250
},
{
"epoch": 0.30156859587612267,
"grad_norm": 8.65786361694336,
"learning_rate": 0.00014842044373122352,
"loss": 0.3698,
"step": 51500
},
{
"epoch": 0.3030325210988223,
"grad_norm": 6.303979873657227,
"learning_rate": 0.00014810944623786334,
"loss": 0.3707,
"step": 51750
},
{
"epoch": 0.3044964463215219,
"grad_norm": 39.32520294189453,
"learning_rate": 0.00014779844874450313,
"loss": 0.3617,
"step": 52000
},
{
"epoch": 0.3059603715442215,
"grad_norm": 6.535865306854248,
"learning_rate": 0.00014748869524111637,
"loss": 0.3642,
"step": 52250
},
{
"epoch": 0.3074242967669211,
"grad_norm": 6.031300067901611,
"learning_rate": 0.00014717769774775616,
"loss": 0.363,
"step": 52500
},
{
"epoch": 0.3088882219896208,
"grad_norm": 7.255093097686768,
"learning_rate": 0.00014686670025439595,
"loss": 0.3594,
"step": 52750
},
{
"epoch": 0.3103521472123204,
"grad_norm": 7.491271018981934,
"learning_rate": 0.00014655570276103576,
"loss": 0.3697,
"step": 53000
},
{
"epoch": 0.31181607243502,
"grad_norm": 8.154767036437988,
"learning_rate": 0.00014624470526767555,
"loss": 0.3667,
"step": 53250
},
{
"epoch": 0.31327999765771963,
"grad_norm": 7.7836384773254395,
"learning_rate": 0.00014593370777431534,
"loss": 0.3756,
"step": 53500
},
{
"epoch": 0.31474392288041925,
"grad_norm": 7.439420223236084,
"learning_rate": 0.00014562271028095513,
"loss": 0.3734,
"step": 53750
},
{
"epoch": 0.3162078481031189,
"grad_norm": 7.654810428619385,
"learning_rate": 0.00014531171278759494,
"loss": 0.3689,
"step": 54000
},
{
"epoch": 0.31767177332581853,
"grad_norm": 4.918389320373535,
"learning_rate": 0.00014500195928420816,
"loss": 0.3688,
"step": 54250
},
{
"epoch": 0.31913569854851814,
"grad_norm": 6.2310895919799805,
"learning_rate": 0.00014469096179084797,
"loss": 0.3711,
"step": 54500
},
{
"epoch": 0.32059962377121776,
"grad_norm": 7.458713054656982,
"learning_rate": 0.00014437996429748776,
"loss": 0.3614,
"step": 54750
},
{
"epoch": 0.32206354899391737,
"grad_norm": 6.790125370025635,
"learning_rate": 0.00014406896680412755,
"loss": 0.3635,
"step": 55000
},
{
"epoch": 0.32206354899391737,
"eval_accuracy": 0.8905084935576763,
"eval_loss": 0.362331748008728,
"eval_runtime": 11551.2138,
"eval_samples_per_second": 210.262,
"eval_steps_per_second": 6.571,
"step": 55000
},
{
"epoch": 0.32352747421661704,
"grad_norm": 7.128218650817871,
"learning_rate": 0.00014375796931076737,
"loss": 0.357,
"step": 55250
},
{
"epoch": 0.32499139943931665,
"grad_norm": 4.943136692047119,
"learning_rate": 0.00014344697181740715,
"loss": 0.3576,
"step": 55500
},
{
"epoch": 0.32645532466201627,
"grad_norm": 7.633016109466553,
"learning_rate": 0.00014313597432404694,
"loss": 0.3655,
"step": 55750
},
{
"epoch": 0.3279192498847159,
"grad_norm": 9.49149227142334,
"learning_rate": 0.00014282497683068673,
"loss": 0.3687,
"step": 56000
},
{
"epoch": 0.3293831751074155,
"grad_norm": 7.4215521812438965,
"learning_rate": 0.00014251397933732655,
"loss": 0.3705,
"step": 56250
},
{
"epoch": 0.33084710033011516,
"grad_norm": 5.638499736785889,
"learning_rate": 0.00014220298184396634,
"loss": 0.3709,
"step": 56500
},
{
"epoch": 0.3323110255528148,
"grad_norm": 9.440450668334961,
"learning_rate": 0.00014189198435060613,
"loss": 0.35,
"step": 56750
},
{
"epoch": 0.3337749507755144,
"grad_norm": 7.706991195678711,
"learning_rate": 0.00014158098685724594,
"loss": 0.3601,
"step": 57000
},
{
"epoch": 0.335238875998214,
"grad_norm": 8.154605865478516,
"learning_rate": 0.00014126998936388573,
"loss": 0.3625,
"step": 57250
},
{
"epoch": 0.3367028012209136,
"grad_norm": 7.608438491821289,
"learning_rate": 0.00014095899187052552,
"loss": 0.3588,
"step": 57500
},
{
"epoch": 0.3381667264436133,
"grad_norm": 5.466573715209961,
"learning_rate": 0.00014064799437716534,
"loss": 0.3528,
"step": 57750
},
{
"epoch": 0.3396306516663129,
"grad_norm": 7.514803409576416,
"learning_rate": 0.00014033699688380512,
"loss": 0.3624,
"step": 58000
},
{
"epoch": 0.3410945768890125,
"grad_norm": 4.846391677856445,
"learning_rate": 0.00014002599939044491,
"loss": 0.3525,
"step": 58250
},
{
"epoch": 0.3425585021117121,
"grad_norm": 6.116271018981934,
"learning_rate": 0.0001397150018970847,
"loss": 0.3556,
"step": 58500
},
{
"epoch": 0.34402242733441174,
"grad_norm": 7.234938621520996,
"learning_rate": 0.00013940400440372452,
"loss": 0.3723,
"step": 58750
},
{
"epoch": 0.3454863525571114,
"grad_norm": 8.690266609191895,
"learning_rate": 0.0001390930069103643,
"loss": 0.3671,
"step": 59000
},
{
"epoch": 0.346950277779811,
"grad_norm": 5.558066368103027,
"learning_rate": 0.0001387820094170041,
"loss": 0.3563,
"step": 59250
},
{
"epoch": 0.34841420300251064,
"grad_norm": 5.277857303619385,
"learning_rate": 0.0001384710119236439,
"loss": 0.3633,
"step": 59500
},
{
"epoch": 0.34987812822521025,
"grad_norm": 4.810859680175781,
"learning_rate": 0.00013816125842025712,
"loss": 0.3615,
"step": 59750
},
{
"epoch": 0.35134205344790986,
"grad_norm": 6.860721111297607,
"learning_rate": 0.00013785026092689694,
"loss": 0.3561,
"step": 60000
},
{
"epoch": 0.35280597867060953,
"grad_norm": 6.673612117767334,
"learning_rate": 0.00013753926343353673,
"loss": 0.3513,
"step": 60250
},
{
"epoch": 0.35426990389330915,
"grad_norm": 6.9296956062316895,
"learning_rate": 0.00013722826594017652,
"loss": 0.3563,
"step": 60500
},
{
"epoch": 0.35573382911600876,
"grad_norm": 6.235531806945801,
"learning_rate": 0.0001369172684468163,
"loss": 0.3586,
"step": 60750
},
{
"epoch": 0.3571977543387084,
"grad_norm": 6.549998760223389,
"learning_rate": 0.00013660627095345612,
"loss": 0.3572,
"step": 61000
},
{
"epoch": 0.358661679561408,
"grad_norm": 6.800797939300537,
"learning_rate": 0.0001362952734600959,
"loss": 0.3687,
"step": 61250
},
{
"epoch": 0.3601256047841076,
"grad_norm": 5.545276641845703,
"learning_rate": 0.0001359842759667357,
"loss": 0.3539,
"step": 61500
},
{
"epoch": 0.36158953000680727,
"grad_norm": 8.63070011138916,
"learning_rate": 0.00013567327847337552,
"loss": 0.3605,
"step": 61750
},
{
"epoch": 0.3630534552295069,
"grad_norm": 5.199543476104736,
"learning_rate": 0.0001353622809800153,
"loss": 0.3559,
"step": 62000
},
{
"epoch": 0.3645173804522065,
"grad_norm": 27.297420501708984,
"learning_rate": 0.0001350512834866551,
"loss": 0.3676,
"step": 62250
},
{
"epoch": 0.3659813056749061,
"grad_norm": 8.235854148864746,
"learning_rate": 0.00013474152998326833,
"loss": 0.3583,
"step": 62500
},
{
"epoch": 0.3674452308976057,
"grad_norm": 6.224372386932373,
"learning_rate": 0.00013443053248990812,
"loss": 0.3623,
"step": 62750
},
{
"epoch": 0.3689091561203054,
"grad_norm": 8.013957977294922,
"learning_rate": 0.0001341195349965479,
"loss": 0.3619,
"step": 63000
},
{
"epoch": 0.370373081343005,
"grad_norm": 6.442314147949219,
"learning_rate": 0.00013380853750318773,
"loss": 0.3586,
"step": 63250
},
{
"epoch": 0.3718370065657046,
"grad_norm": 6.883063793182373,
"learning_rate": 0.00013349754000982752,
"loss": 0.3635,
"step": 63500
},
{
"epoch": 0.37330093178840423,
"grad_norm": 5.502562999725342,
"learning_rate": 0.0001331865425164673,
"loss": 0.3525,
"step": 63750
},
{
"epoch": 0.37476485701110385,
"grad_norm": 6.841543197631836,
"learning_rate": 0.00013287554502310712,
"loss": 0.3564,
"step": 64000
},
{
"epoch": 0.3762287822338035,
"grad_norm": 6.850903034210205,
"learning_rate": 0.0001325645475297469,
"loss": 0.3549,
"step": 64250
},
{
"epoch": 0.37769270745650313,
"grad_norm": 5.823826313018799,
"learning_rate": 0.00013225479402636015,
"loss": 0.3488,
"step": 64500
},
{
"epoch": 0.37915663267920274,
"grad_norm": 9.849250793457031,
"learning_rate": 0.00013194379653299997,
"loss": 0.3526,
"step": 64750
},
{
"epoch": 0.38062055790190236,
"grad_norm": 7.8498992919921875,
"learning_rate": 0.00013163279903963975,
"loss": 0.3596,
"step": 65000
},
{
"epoch": 0.38208448312460197,
"grad_norm": 7.845436096191406,
"learning_rate": 0.00013132180154627954,
"loss": 0.3497,
"step": 65250
},
{
"epoch": 0.38354840834730164,
"grad_norm": 10.533845901489258,
"learning_rate": 0.00013101080405291933,
"loss": 0.3523,
"step": 65500
},
{
"epoch": 0.38501233357000125,
"grad_norm": 9.09399127960205,
"learning_rate": 0.00013069980655955912,
"loss": 0.347,
"step": 65750
},
{
"epoch": 0.38647625879270087,
"grad_norm": 7.205333232879639,
"learning_rate": 0.00013038880906619894,
"loss": 0.355,
"step": 66000
},
{
"epoch": 0.3879401840154005,
"grad_norm": 6.770249843597412,
"learning_rate": 0.00013007781157283873,
"loss": 0.3549,
"step": 66250
},
{
"epoch": 0.3894041092381001,
"grad_norm": 8.14482593536377,
"learning_rate": 0.00012976681407947851,
"loss": 0.3537,
"step": 66500
},
{
"epoch": 0.39086803446079976,
"grad_norm": 5.998184680938721,
"learning_rate": 0.0001294558165861183,
"loss": 0.3562,
"step": 66750
},
{
"epoch": 0.3923319596834994,
"grad_norm": 5.583696365356445,
"learning_rate": 0.00012914481909275812,
"loss": 0.3499,
"step": 67000
},
{
"epoch": 0.393795884906199,
"grad_norm": 6.899207592010498,
"learning_rate": 0.0001288338215993979,
"loss": 0.3506,
"step": 67250
},
{
"epoch": 0.3952598101288986,
"grad_norm": 6.205395221710205,
"learning_rate": 0.0001285228241060377,
"loss": 0.3512,
"step": 67500
},
{
"epoch": 0.3967237353515982,
"grad_norm": 9.125551223754883,
"learning_rate": 0.0001282118266126775,
"loss": 0.3585,
"step": 67750
},
{
"epoch": 0.3981876605742979,
"grad_norm": 6.943772792816162,
"learning_rate": 0.0001279008291193173,
"loss": 0.362,
"step": 68000
},
{
"epoch": 0.3996515857969975,
"grad_norm": 6.106304168701172,
"learning_rate": 0.0001275898316259571,
"loss": 0.3545,
"step": 68250
},
{
"epoch": 0.4011155110196971,
"grad_norm": 6.197811126708984,
"learning_rate": 0.00012728007812257036,
"loss": 0.3524,
"step": 68500
},
{
"epoch": 0.4025794362423967,
"grad_norm": 8.07652759552002,
"learning_rate": 0.00012696908062921015,
"loss": 0.3467,
"step": 68750
},
{
"epoch": 0.40404336146509634,
"grad_norm": 7.444363117218018,
"learning_rate": 0.00012665808313584994,
"loss": 0.3541,
"step": 69000
},
{
"epoch": 0.405507286687796,
"grad_norm": 6.2395782470703125,
"learning_rate": 0.00012634708564248972,
"loss": 0.3488,
"step": 69250
},
{
"epoch": 0.4069712119104956,
"grad_norm": 7.489956378936768,
"learning_rate": 0.00012603608814912954,
"loss": 0.3595,
"step": 69500
},
{
"epoch": 0.40843513713319524,
"grad_norm": 6.762283802032471,
"learning_rate": 0.00012572509065576933,
"loss": 0.3555,
"step": 69750
},
{
"epoch": 0.40989906235589485,
"grad_norm": 10.423229217529297,
"learning_rate": 0.00012541409316240912,
"loss": 0.3474,
"step": 70000
},
{
"epoch": 0.41136298757859446,
"grad_norm": 7.812709331512451,
"learning_rate": 0.0001251030956690489,
"loss": 0.3588,
"step": 70250
},
{
"epoch": 0.41282691280129413,
"grad_norm": 8.506246566772461,
"learning_rate": 0.00012479334216566215,
"loss": 0.3473,
"step": 70500
},
{
"epoch": 0.41429083802399375,
"grad_norm": 6.0005784034729,
"learning_rate": 0.00012448234467230196,
"loss": 0.3423,
"step": 70750
},
{
"epoch": 0.41575476324669336,
"grad_norm": 7.6112494468688965,
"learning_rate": 0.00012417134717894175,
"loss": 0.3469,
"step": 71000
},
{
"epoch": 0.417218688469393,
"grad_norm": 6.460068225860596,
"learning_rate": 0.00012386034968558154,
"loss": 0.3514,
"step": 71250
},
{
"epoch": 0.4186826136920926,
"grad_norm": 25.509037017822266,
"learning_rate": 0.00012354935219222136,
"loss": 0.3538,
"step": 71500
},
{
"epoch": 0.42014653891479226,
"grad_norm": 5.778562068939209,
"learning_rate": 0.00012323835469886114,
"loss": 0.3409,
"step": 71750
},
{
"epoch": 0.42161046413749187,
"grad_norm": 10.19543170928955,
"learning_rate": 0.00012292735720550093,
"loss": 0.3487,
"step": 72000
},
{
"epoch": 0.4230743893601915,
"grad_norm": 7.6341633796691895,
"learning_rate": 0.00012261635971214072,
"loss": 0.3477,
"step": 72250
},
{
"epoch": 0.4245383145828911,
"grad_norm": 5.656210422515869,
"learning_rate": 0.00012230536221878054,
"loss": 0.353,
"step": 72500
},
{
"epoch": 0.4260022398055907,
"grad_norm": 7.81094217300415,
"learning_rate": 0.00012199436472542031,
"loss": 0.3589,
"step": 72750
},
{
"epoch": 0.4274661650282904,
"grad_norm": 5.924116611480713,
"learning_rate": 0.0001216833672320601,
"loss": 0.346,
"step": 73000
},
{
"epoch": 0.42893009025099,
"grad_norm": 6.293444633483887,
"learning_rate": 0.00012137236973869992,
"loss": 0.3496,
"step": 73250
},
{
"epoch": 0.4303940154736896,
"grad_norm": 9.766921997070312,
"learning_rate": 0.00012106137224533971,
"loss": 0.347,
"step": 73500
},
{
"epoch": 0.4318579406963892,
"grad_norm": 5.998900890350342,
"learning_rate": 0.0001207503747519795,
"loss": 0.3465,
"step": 73750
},
{
"epoch": 0.43332186591908883,
"grad_norm": 8.364704132080078,
"learning_rate": 0.00012043937725861929,
"loss": 0.3429,
"step": 74000
},
{
"epoch": 0.4347857911417885,
"grad_norm": 5.508989334106445,
"learning_rate": 0.0001201283797652591,
"loss": 0.355,
"step": 74250
},
{
"epoch": 0.4362497163644881,
"grad_norm": 6.357595443725586,
"learning_rate": 0.00011981738227189889,
"loss": 0.3504,
"step": 74500
},
{
"epoch": 0.43771364158718773,
"grad_norm": 8.691376686096191,
"learning_rate": 0.00011950762876851213,
"loss": 0.3471,
"step": 74750
},
{
"epoch": 0.43917756680988734,
"grad_norm": 11.246256828308105,
"learning_rate": 0.00011919663127515193,
"loss": 0.3487,
"step": 75000
},
{
"epoch": 0.44064149203258696,
"grad_norm": 6.3526811599731445,
"learning_rate": 0.00011888563378179172,
"loss": 0.3414,
"step": 75250
},
{
"epoch": 0.4421054172552866,
"grad_norm": 9.6268310546875,
"learning_rate": 0.00011857463628843152,
"loss": 0.3457,
"step": 75500
},
{
"epoch": 0.44356934247798624,
"grad_norm": 8.093045234680176,
"learning_rate": 0.00011826363879507131,
"loss": 0.3515,
"step": 75750
},
{
"epoch": 0.44503326770068585,
"grad_norm": 7.497385025024414,
"learning_rate": 0.00011795264130171111,
"loss": 0.3361,
"step": 76000
},
{
"epoch": 0.44649719292338547,
"grad_norm": 8.374622344970703,
"learning_rate": 0.00011764164380835092,
"loss": 0.3552,
"step": 76250
},
{
"epoch": 0.4479611181460851,
"grad_norm": 8.583603858947754,
"learning_rate": 0.0001173306463149907,
"loss": 0.3395,
"step": 76500
},
{
"epoch": 0.44942504336878475,
"grad_norm": 5.933279991149902,
"learning_rate": 0.0001170196488216305,
"loss": 0.3539,
"step": 76750
},
{
"epoch": 0.45088896859148436,
"grad_norm": 7.1400556564331055,
"learning_rate": 0.00011670989531824375,
"loss": 0.3556,
"step": 77000
},
{
"epoch": 0.452352893814184,
"grad_norm": 6.4177374839782715,
"learning_rate": 0.00011639889782488354,
"loss": 0.34,
"step": 77250
},
{
"epoch": 0.4538168190368836,
"grad_norm": 8.248872756958008,
"learning_rate": 0.00011608790033152333,
"loss": 0.3454,
"step": 77500
},
{
"epoch": 0.4552807442595832,
"grad_norm": 6.789691925048828,
"learning_rate": 0.00011577690283816314,
"loss": 0.3506,
"step": 77750
},
{
"epoch": 0.4567446694822829,
"grad_norm": 7.519604206085205,
"learning_rate": 0.00011546590534480293,
"loss": 0.3438,
"step": 78000
},
{
"epoch": 0.4582085947049825,
"grad_norm": 11.287620544433594,
"learning_rate": 0.00011515490785144272,
"loss": 0.3536,
"step": 78250
},
{
"epoch": 0.4596725199276821,
"grad_norm": 5.6864914894104,
"learning_rate": 0.00011484391035808254,
"loss": 0.348,
"step": 78500
},
{
"epoch": 0.4611364451503817,
"grad_norm": 7.405890941619873,
"learning_rate": 0.00011453291286472232,
"loss": 0.3395,
"step": 78750
},
{
"epoch": 0.4626003703730813,
"grad_norm": 5.379487991333008,
"learning_rate": 0.00011422315936133556,
"loss": 0.3463,
"step": 79000
},
{
"epoch": 0.46406429559578094,
"grad_norm": 7.769617080688477,
"learning_rate": 0.00011391216186797535,
"loss": 0.3458,
"step": 79250
},
{
"epoch": 0.4655282208184806,
"grad_norm": 9.26171875,
"learning_rate": 0.00011360116437461514,
"loss": 0.3394,
"step": 79500
},
{
"epoch": 0.4669921460411802,
"grad_norm": 9.037941932678223,
"learning_rate": 0.00011329016688125493,
"loss": 0.349,
"step": 79750
},
{
"epoch": 0.46845607126387984,
"grad_norm": 8.776792526245117,
"learning_rate": 0.00011297916938789475,
"loss": 0.3384,
"step": 80000
},
{
"epoch": 0.46991999648657945,
"grad_norm": 6.737313270568848,
"learning_rate": 0.00011266817189453454,
"loss": 0.3472,
"step": 80250
},
{
"epoch": 0.47138392170927906,
"grad_norm": 7.2374114990234375,
"learning_rate": 0.00011235717440117432,
"loss": 0.3434,
"step": 80500
},
{
"epoch": 0.47284784693197873,
"grad_norm": 6.939677715301514,
"learning_rate": 0.00011204617690781414,
"loss": 0.3451,
"step": 80750
},
{
"epoch": 0.47431177215467835,
"grad_norm": 4.702803611755371,
"learning_rate": 0.00011173517941445393,
"loss": 0.3508,
"step": 81000
},
{
"epoch": 0.47577569737737796,
"grad_norm": 7.359582901000977,
"learning_rate": 0.00011142418192109372,
"loss": 0.3415,
"step": 81250
},
{
"epoch": 0.4772396226000776,
"grad_norm": 8.404651641845703,
"learning_rate": 0.00011111442841770696,
"loss": 0.3438,
"step": 81500
},
{
"epoch": 0.4787035478227772,
"grad_norm": 6.176925182342529,
"learning_rate": 0.00011080343092434675,
"loss": 0.3484,
"step": 81750
},
{
"epoch": 0.48016747304547686,
"grad_norm": 8.614276885986328,
"learning_rate": 0.00011049243343098655,
"loss": 0.3525,
"step": 82000
},
{
"epoch": 0.48163139826817647,
"grad_norm": 5.756929874420166,
"learning_rate": 0.00011018143593762635,
"loss": 0.3432,
"step": 82250
},
{
"epoch": 0.4830953234908761,
"grad_norm": 7.686267852783203,
"learning_rate": 0.00010987043844426614,
"loss": 0.3508,
"step": 82500
},
{
"epoch": 0.4845592487135757,
"grad_norm": 6.590146541595459,
"learning_rate": 0.00010955944095090593,
"loss": 0.3357,
"step": 82750
},
{
"epoch": 0.4860231739362753,
"grad_norm": 7.363981246948242,
"learning_rate": 0.00010924968744751918,
"loss": 0.3469,
"step": 83000
},
{
"epoch": 0.487487099158975,
"grad_norm": 5.942411422729492,
"learning_rate": 0.00010893868995415897,
"loss": 0.3464,
"step": 83250
},
{
"epoch": 0.4889510243816746,
"grad_norm": 8.531744003295898,
"learning_rate": 0.00010862769246079879,
"loss": 0.3349,
"step": 83500
},
{
"epoch": 0.4904149496043742,
"grad_norm": 20.821125030517578,
"learning_rate": 0.00010831669496743858,
"loss": 0.3434,
"step": 83750
},
{
"epoch": 0.4918788748270738,
"grad_norm": 9.569067001342773,
"learning_rate": 0.00010800569747407836,
"loss": 0.3421,
"step": 84000
},
{
"epoch": 0.49334280004977343,
"grad_norm": 7.6851725578308105,
"learning_rate": 0.00010769469998071815,
"loss": 0.3407,
"step": 84250
},
{
"epoch": 0.4948067252724731,
"grad_norm": 9.591890335083008,
"learning_rate": 0.00010738370248735797,
"loss": 0.347,
"step": 84500
},
{
"epoch": 0.4962706504951727,
"grad_norm": 5.16259765625,
"learning_rate": 0.00010707270499399776,
"loss": 0.3383,
"step": 84750
},
{
"epoch": 0.49773457571787233,
"grad_norm": 4.6993794441223145,
"learning_rate": 0.00010676170750063755,
"loss": 0.3392,
"step": 85000
},
{
"epoch": 0.49919850094057194,
"grad_norm": 6.331507682800293,
"learning_rate": 0.00010645071000727735,
"loss": 0.351,
"step": 85250
},
{
"epoch": 0.5006624261632716,
"grad_norm": 7.329137325286865,
"learning_rate": 0.00010613971251391714,
"loss": 0.3486,
"step": 85500
},
{
"epoch": 0.5021263513859712,
"grad_norm": 6.907947540283203,
"learning_rate": 0.00010582871502055694,
"loss": 0.3443,
"step": 85750
},
{
"epoch": 0.5035902766086708,
"grad_norm": 4.780885696411133,
"learning_rate": 0.00010551771752719674,
"loss": 0.3401,
"step": 86000
},
{
"epoch": 0.5050542018313705,
"grad_norm": 9.042526245117188,
"learning_rate": 0.00010520672003383653,
"loss": 0.3402,
"step": 86250
},
{
"epoch": 0.5065181270540701,
"grad_norm": 5.397533416748047,
"learning_rate": 0.00010489572254047632,
"loss": 0.3392,
"step": 86500
},
{
"epoch": 0.5079820522767697,
"grad_norm": 7.72251033782959,
"learning_rate": 0.00010458472504711612,
"loss": 0.3337,
"step": 86750
},
{
"epoch": 0.5094459774994693,
"grad_norm": 7.379674434661865,
"learning_rate": 0.00010427497154372936,
"loss": 0.3457,
"step": 87000
},
{
"epoch": 0.510909902722169,
"grad_norm": 7.123027801513672,
"learning_rate": 0.00010396397405036915,
"loss": 0.3311,
"step": 87250
},
{
"epoch": 0.5123738279448685,
"grad_norm": 6.388451099395752,
"learning_rate": 0.00010365297655700897,
"loss": 0.3386,
"step": 87500
},
{
"epoch": 0.5138377531675682,
"grad_norm": 8.933717727661133,
"learning_rate": 0.00010334197906364876,
"loss": 0.3377,
"step": 87750
},
{
"epoch": 0.5153016783902679,
"grad_norm": 5.813757419586182,
"learning_rate": 0.000103032225560262,
"loss": 0.3368,
"step": 88000
},
{
"epoch": 0.5167656036129674,
"grad_norm": 10.707741737365723,
"learning_rate": 0.00010272122806690178,
"loss": 0.3429,
"step": 88250
},
{
"epoch": 0.5182295288356671,
"grad_norm": 7.433245658874512,
"learning_rate": 0.00010241023057354157,
"loss": 0.3457,
"step": 88500
},
{
"epoch": 0.5196934540583666,
"grad_norm": 6.408331394195557,
"learning_rate": 0.00010209923308018139,
"loss": 0.3409,
"step": 88750
},
{
"epoch": 0.5211573792810663,
"grad_norm": 7.5843987464904785,
"learning_rate": 0.00010178823558682118,
"loss": 0.3347,
"step": 89000
},
{
"epoch": 0.522621304503766,
"grad_norm": 9.049858093261719,
"learning_rate": 0.00010147723809346097,
"loss": 0.3392,
"step": 89250
},
{
"epoch": 0.5240852297264655,
"grad_norm": 8.207107543945312,
"learning_rate": 0.00010116624060010076,
"loss": 0.334,
"step": 89500
},
{
"epoch": 0.5255491549491652,
"grad_norm": 6.511790752410889,
"learning_rate": 0.00010085648709671401,
"loss": 0.3462,
"step": 89750
},
{
"epoch": 0.5270130801718648,
"grad_norm": 5.541443824768066,
"learning_rate": 0.0001005454896033538,
"loss": 0.3318,
"step": 90000
},
{
"epoch": 0.5284770053945644,
"grad_norm": 6.216821670532227,
"learning_rate": 0.0001002344921099936,
"loss": 0.338,
"step": 90250
},
{
"epoch": 0.5299409306172641,
"grad_norm": 5.138360977172852,
"learning_rate": 9.992349461663339e-05,
"loss": 0.3457,
"step": 90500
},
{
"epoch": 0.5314048558399637,
"grad_norm": 8.401073455810547,
"learning_rate": 9.961249712327319e-05,
"loss": 0.3523,
"step": 90750
},
{
"epoch": 0.5328687810626633,
"grad_norm": 8.749157905578613,
"learning_rate": 9.930149962991298e-05,
"loss": 0.3391,
"step": 91000
},
{
"epoch": 0.5343327062853629,
"grad_norm": 7.809004783630371,
"learning_rate": 9.899050213655278e-05,
"loss": 0.3422,
"step": 91250
},
{
"epoch": 0.5357966315080626,
"grad_norm": 7.649618148803711,
"learning_rate": 9.867950464319257e-05,
"loss": 0.3512,
"step": 91500
},
{
"epoch": 0.5372605567307622,
"grad_norm": 8.770468711853027,
"learning_rate": 9.836850714983237e-05,
"loss": 0.3367,
"step": 91750
},
{
"epoch": 0.5387244819534618,
"grad_norm": 8.32112979888916,
"learning_rate": 9.805750965647216e-05,
"loss": 0.3384,
"step": 92000
},
{
"epoch": 0.5401884071761615,
"grad_norm": 9.602888107299805,
"learning_rate": 9.774651216311197e-05,
"loss": 0.3344,
"step": 92250
},
{
"epoch": 0.541652332398861,
"grad_norm": 3.2295093536376953,
"learning_rate": 9.743551466975177e-05,
"loss": 0.3314,
"step": 92500
},
{
"epoch": 0.5431162576215607,
"grad_norm": 5.456012725830078,
"learning_rate": 9.712451717639156e-05,
"loss": 0.3313,
"step": 92750
},
{
"epoch": 0.5445801828442604,
"grad_norm": 7.777164936065674,
"learning_rate": 9.681351968303136e-05,
"loss": 0.3417,
"step": 93000
},
{
"epoch": 0.5460441080669599,
"grad_norm": 10.10175895690918,
"learning_rate": 9.650252218967115e-05,
"loss": 0.3357,
"step": 93250
},
{
"epoch": 0.5475080332896596,
"grad_norm": 8.296233177185059,
"learning_rate": 9.619152469631095e-05,
"loss": 0.3368,
"step": 93500
},
{
"epoch": 0.5489719585123591,
"grad_norm": 5.55683708190918,
"learning_rate": 9.588052720295075e-05,
"loss": 0.3338,
"step": 93750
},
{
"epoch": 0.5504358837350588,
"grad_norm": 5.92700719833374,
"learning_rate": 9.556952970959054e-05,
"loss": 0.3431,
"step": 94000
},
{
"epoch": 0.5518998089577585,
"grad_norm": 5.411899089813232,
"learning_rate": 9.525853221623034e-05,
"loss": 0.3393,
"step": 94250
},
{
"epoch": 0.553363734180458,
"grad_norm": 6.517271995544434,
"learning_rate": 9.494753472287013e-05,
"loss": 0.3332,
"step": 94500
},
{
"epoch": 0.5548276594031577,
"grad_norm": 9.099715232849121,
"learning_rate": 9.463653722950994e-05,
"loss": 0.3343,
"step": 94750
},
{
"epoch": 0.5562915846258573,
"grad_norm": 4.845067501068115,
"learning_rate": 9.432553973614972e-05,
"loss": 0.3344,
"step": 95000
},
{
"epoch": 0.5577555098485569,
"grad_norm": 8.56153392791748,
"learning_rate": 9.401454224278953e-05,
"loss": 0.33,
"step": 95250
},
{
"epoch": 0.5592194350712566,
"grad_norm": 7.1542439460754395,
"learning_rate": 9.370354474942933e-05,
"loss": 0.3186,
"step": 95500
},
{
"epoch": 0.5606833602939562,
"grad_norm": 7.00217342376709,
"learning_rate": 9.339254725606912e-05,
"loss": 0.335,
"step": 95750
},
{
"epoch": 0.5621472855166558,
"grad_norm": 7.365664482116699,
"learning_rate": 9.308279375268236e-05,
"loss": 0.3303,
"step": 96000
},
{
"epoch": 0.5636112107393554,
"grad_norm": 8.063042640686035,
"learning_rate": 9.277179625932215e-05,
"loss": 0.3441,
"step": 96250
},
{
"epoch": 0.565075135962055,
"grad_norm": 5.403791904449463,
"learning_rate": 9.246079876596195e-05,
"loss": 0.3318,
"step": 96500
},
{
"epoch": 0.5665390611847547,
"grad_norm": 5.911950588226318,
"learning_rate": 9.215104526257519e-05,
"loss": 0.3327,
"step": 96750
},
{
"epoch": 0.5680029864074543,
"grad_norm": 5.484018802642822,
"learning_rate": 9.184004776921499e-05,
"loss": 0.3384,
"step": 97000
},
{
"epoch": 0.569466911630154,
"grad_norm": 4.785627365112305,
"learning_rate": 9.152905027585478e-05,
"loss": 0.3437,
"step": 97250
},
{
"epoch": 0.5709308368528535,
"grad_norm": 7.17230749130249,
"learning_rate": 9.121805278249458e-05,
"loss": 0.3331,
"step": 97500
},
{
"epoch": 0.5723947620755532,
"grad_norm": 7.777104377746582,
"learning_rate": 9.090705528913437e-05,
"loss": 0.3371,
"step": 97750
},
{
"epoch": 0.5738586872982528,
"grad_norm": 6.8572001457214355,
"learning_rate": 9.059605779577417e-05,
"loss": 0.3397,
"step": 98000
},
{
"epoch": 0.5753226125209524,
"grad_norm": 9.132293701171875,
"learning_rate": 9.028506030241398e-05,
"loss": 0.3421,
"step": 98250
},
{
"epoch": 0.5767865377436521,
"grad_norm": 7.351444244384766,
"learning_rate": 8.997406280905376e-05,
"loss": 0.3315,
"step": 98500
},
{
"epoch": 0.5782504629663516,
"grad_norm": 5.444695949554443,
"learning_rate": 8.966306531569357e-05,
"loss": 0.3313,
"step": 98750
},
{
"epoch": 0.5797143881890513,
"grad_norm": 6.229501724243164,
"learning_rate": 8.935206782233336e-05,
"loss": 0.3321,
"step": 99000
},
{
"epoch": 0.581178313411751,
"grad_norm": 4.431236743927002,
"learning_rate": 8.904107032897316e-05,
"loss": 0.3326,
"step": 99250
},
{
"epoch": 0.5826422386344505,
"grad_norm": 4.78348445892334,
"learning_rate": 8.873007283561296e-05,
"loss": 0.3362,
"step": 99500
},
{
"epoch": 0.5841061638571502,
"grad_norm": 5.964051723480225,
"learning_rate": 8.841907534225275e-05,
"loss": 0.3408,
"step": 99750
},
{
"epoch": 0.5855700890798498,
"grad_norm": 5.310559272766113,
"learning_rate": 8.810807784889255e-05,
"loss": 0.3328,
"step": 100000
},
{
"epoch": 0.5870340143025494,
"grad_norm": 4.985818862915039,
"learning_rate": 8.779708035553234e-05,
"loss": 0.337,
"step": 100250
},
{
"epoch": 0.5884979395252491,
"grad_norm": 4.851356506347656,
"learning_rate": 8.748608286217213e-05,
"loss": 0.3314,
"step": 100500
},
{
"epoch": 0.5899618647479486,
"grad_norm": 6.863201141357422,
"learning_rate": 8.717508536881193e-05,
"loss": 0.3231,
"step": 100750
},
{
"epoch": 0.5914257899706483,
"grad_norm": 6.387337684631348,
"learning_rate": 8.686533186542517e-05,
"loss": 0.322,
"step": 101000
},
{
"epoch": 0.5928897151933479,
"grad_norm": 7.897363662719727,
"learning_rate": 8.655433437206496e-05,
"loss": 0.3361,
"step": 101250
},
{
"epoch": 0.5943536404160475,
"grad_norm": 5.876019477844238,
"learning_rate": 8.624333687870476e-05,
"loss": 0.3211,
"step": 101500
},
{
"epoch": 0.5958175656387472,
"grad_norm": 4.175768852233887,
"learning_rate": 8.593233938534457e-05,
"loss": 0.3317,
"step": 101750
},
{
"epoch": 0.5972814908614468,
"grad_norm": 6.496226787567139,
"learning_rate": 8.562134189198435e-05,
"loss": 0.3289,
"step": 102000
},
{
"epoch": 0.5987454160841464,
"grad_norm": 7.092103004455566,
"learning_rate": 8.531034439862416e-05,
"loss": 0.3329,
"step": 102250
},
{
"epoch": 0.600209341306846,
"grad_norm": 7.335963726043701,
"learning_rate": 8.499934690526395e-05,
"loss": 0.3305,
"step": 102500
},
{
"epoch": 0.6016732665295457,
"grad_norm": 6.620415687561035,
"learning_rate": 8.468834941190375e-05,
"loss": 0.3324,
"step": 102750
},
{
"epoch": 0.6031371917522453,
"grad_norm": 6.866759777069092,
"learning_rate": 8.437735191854355e-05,
"loss": 0.3395,
"step": 103000
},
{
"epoch": 0.6046011169749449,
"grad_norm": 7.7242045402526855,
"learning_rate": 8.406759841515678e-05,
"loss": 0.3368,
"step": 103250
},
{
"epoch": 0.6060650421976446,
"grad_norm": 6.402958869934082,
"learning_rate": 8.375660092179658e-05,
"loss": 0.3366,
"step": 103500
},
{
"epoch": 0.6075289674203441,
"grad_norm": 6.456150531768799,
"learning_rate": 8.344560342843637e-05,
"loss": 0.3372,
"step": 103750
},
{
"epoch": 0.6089928926430438,
"grad_norm": 7.6825971603393555,
"learning_rate": 8.313460593507617e-05,
"loss": 0.3331,
"step": 104000
},
{
"epoch": 0.6104568178657435,
"grad_norm": 11.974824905395508,
"learning_rate": 8.282360844171596e-05,
"loss": 0.3317,
"step": 104250
},
{
"epoch": 0.611920743088443,
"grad_norm": 5.445409774780273,
"learning_rate": 8.251261094835576e-05,
"loss": 0.3303,
"step": 104500
},
{
"epoch": 0.6133846683111427,
"grad_norm": 8.099034309387207,
"learning_rate": 8.220161345499555e-05,
"loss": 0.3317,
"step": 104750
},
{
"epoch": 0.6148485935338422,
"grad_norm": 21.789043426513672,
"learning_rate": 8.189061596163535e-05,
"loss": 0.3146,
"step": 105000
},
{
"epoch": 0.6163125187565419,
"grad_norm": 6.879361152648926,
"learning_rate": 8.158086245824859e-05,
"loss": 0.3346,
"step": 105250
},
{
"epoch": 0.6177764439792416,
"grad_norm": 5.477085113525391,
"learning_rate": 8.126986496488838e-05,
"loss": 0.3274,
"step": 105500
},
{
"epoch": 0.6192403692019411,
"grad_norm": 6.2816667556762695,
"learning_rate": 8.095886747152818e-05,
"loss": 0.3271,
"step": 105750
},
{
"epoch": 0.6207042944246408,
"grad_norm": 9.089285850524902,
"learning_rate": 8.064786997816797e-05,
"loss": 0.3351,
"step": 106000
},
{
"epoch": 0.6221682196473404,
"grad_norm": 6.114886283874512,
"learning_rate": 8.033687248480777e-05,
"loss": 0.3296,
"step": 106250
},
{
"epoch": 0.62363214487004,
"grad_norm": 7.2542548179626465,
"learning_rate": 8.002587499144756e-05,
"loss": 0.3246,
"step": 106500
},
{
"epoch": 0.6250960700927397,
"grad_norm": 5.58528995513916,
"learning_rate": 7.971487749808737e-05,
"loss": 0.3327,
"step": 106750
},
{
"epoch": 0.6265599953154393,
"grad_norm": 3.898178815841675,
"learning_rate": 7.940388000472715e-05,
"loss": 0.3291,
"step": 107000
},
{
"epoch": 0.6280239205381389,
"grad_norm": 5.644820690155029,
"learning_rate": 7.909288251136696e-05,
"loss": 0.3281,
"step": 107250
},
{
"epoch": 0.6294878457608385,
"grad_norm": 6.363776206970215,
"learning_rate": 7.878188501800676e-05,
"loss": 0.3304,
"step": 107500
},
{
"epoch": 0.6309517709835382,
"grad_norm": 5.209687232971191,
"learning_rate": 7.847213151462e-05,
"loss": 0.3224,
"step": 107750
},
{
"epoch": 0.6324156962062378,
"grad_norm": 6.911553382873535,
"learning_rate": 7.81611340212598e-05,
"loss": 0.3246,
"step": 108000
},
{
"epoch": 0.6338796214289374,
"grad_norm": 7.6557111740112305,
"learning_rate": 7.785013652789959e-05,
"loss": 0.322,
"step": 108250
},
{
"epoch": 0.6353435466516371,
"grad_norm": 7.857481002807617,
"learning_rate": 7.753913903453939e-05,
"loss": 0.3318,
"step": 108500
},
{
"epoch": 0.6368074718743366,
"grad_norm": 5.911120891571045,
"learning_rate": 7.722814154117918e-05,
"loss": 0.325,
"step": 108750
},
{
"epoch": 0.6382713970970363,
"grad_norm": 8.592209815979004,
"learning_rate": 7.691714404781898e-05,
"loss": 0.3209,
"step": 109000
},
{
"epoch": 0.639735322319736,
"grad_norm": 6.824602127075195,
"learning_rate": 7.660614655445879e-05,
"loss": 0.3331,
"step": 109250
},
{
"epoch": 0.6411992475424355,
"grad_norm": 6.813981056213379,
"learning_rate": 7.629514906109858e-05,
"loss": 0.3313,
"step": 109500
},
{
"epoch": 0.6426631727651352,
"grad_norm": 5.7169671058654785,
"learning_rate": 7.598539555771181e-05,
"loss": 0.3206,
"step": 109750
},
{
"epoch": 0.6441270979878347,
"grad_norm": 5.429720401763916,
"learning_rate": 7.56743980643516e-05,
"loss": 0.3192,
"step": 110000
},
{
"epoch": 0.6441270979878347,
"eval_accuracy": 0.8997983351325891,
"eval_loss": 0.3242824375629425,
"eval_runtime": 11546.6804,
"eval_samples_per_second": 210.345,
"eval_steps_per_second": 6.573,
"step": 110000
}
],
"logging_steps": 250,
"max_steps": 170773,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 55000,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 2.8505890873482936e+19,
"train_batch_size": 8,
"trial_name": null,
"trial_params": null
}