fr-1B-pretraining / trainer_state.json
moussaKam's picture
upload
f329741 verified
{
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 1.0,
"eval_steps": 500.0,
"global_step": 72699,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"epoch": 0.0006877673695649183,
"grad_norm": 0.19629216194152832,
"learning_rate": 0.0001,
"loss": 1.9938,
"step": 50
},
{
"epoch": 0.0013755347391298366,
"grad_norm": 0.24478936195373535,
"learning_rate": 0.0001,
"loss": 1.9596,
"step": 100
},
{
"epoch": 0.002063302108694755,
"grad_norm": 0.22170111536979675,
"learning_rate": 0.0001,
"loss": 1.9526,
"step": 150
},
{
"epoch": 0.0027510694782596733,
"grad_norm": 0.2311343252658844,
"learning_rate": 0.0001,
"loss": 1.9471,
"step": 200
},
{
"epoch": 0.003438836847824592,
"grad_norm": 0.20621128380298615,
"learning_rate": 0.0001,
"loss": 1.9435,
"step": 250
},
{
"epoch": 0.00412660421738951,
"grad_norm": 0.22248196601867676,
"learning_rate": 0.0001,
"loss": 1.9396,
"step": 300
},
{
"epoch": 0.004814371586954428,
"grad_norm": 0.20232965052127838,
"learning_rate": 0.0001,
"loss": 1.9362,
"step": 350
},
{
"epoch": 0.0055021389565193465,
"grad_norm": 0.21155332028865814,
"learning_rate": 0.0001,
"loss": 1.9285,
"step": 400
},
{
"epoch": 0.006189906326084265,
"grad_norm": 0.25176894664764404,
"learning_rate": 0.0001,
"loss": 1.9319,
"step": 450
},
{
"epoch": 0.006877673695649184,
"grad_norm": 0.21027377247810364,
"learning_rate": 0.0001,
"loss": 1.9304,
"step": 500
},
{
"epoch": 0.007565441065214102,
"grad_norm": 0.2434869110584259,
"learning_rate": 0.0001,
"loss": 1.93,
"step": 550
},
{
"epoch": 0.00825320843477902,
"grad_norm": 0.1908300668001175,
"learning_rate": 0.0001,
"loss": 1.9254,
"step": 600
},
{
"epoch": 0.008940975804343939,
"grad_norm": 0.2221110612154007,
"learning_rate": 0.0001,
"loss": 1.9226,
"step": 650
},
{
"epoch": 0.009628743173908856,
"grad_norm": 0.22620266675949097,
"learning_rate": 0.0001,
"loss": 1.9262,
"step": 700
},
{
"epoch": 0.010316510543473776,
"grad_norm": 0.21463032066822052,
"learning_rate": 0.0001,
"loss": 1.9201,
"step": 750
},
{
"epoch": 0.011004277913038693,
"grad_norm": 0.19383488595485687,
"learning_rate": 0.0001,
"loss": 1.9236,
"step": 800
},
{
"epoch": 0.011692045282603612,
"grad_norm": 0.22416023910045624,
"learning_rate": 0.0001,
"loss": 1.9186,
"step": 850
},
{
"epoch": 0.01237981265216853,
"grad_norm": 0.2285342961549759,
"learning_rate": 0.0001,
"loss": 1.9207,
"step": 900
},
{
"epoch": 0.013067580021733449,
"grad_norm": 0.20416004955768585,
"learning_rate": 0.0001,
"loss": 1.9171,
"step": 950
},
{
"epoch": 0.013755347391298368,
"grad_norm": 0.20697274804115295,
"learning_rate": 0.0001,
"loss": 1.9177,
"step": 1000
},
{
"epoch": 0.014443114760863286,
"grad_norm": 0.2317676991224289,
"learning_rate": 0.0001,
"loss": 1.9138,
"step": 1050
},
{
"epoch": 0.015130882130428205,
"grad_norm": 0.21276156604290009,
"learning_rate": 0.0001,
"loss": 1.9111,
"step": 1100
},
{
"epoch": 0.015818649499993124,
"grad_norm": 0.20574018359184265,
"learning_rate": 0.0001,
"loss": 1.9155,
"step": 1150
},
{
"epoch": 0.01650641686955804,
"grad_norm": 0.19410207867622375,
"learning_rate": 0.0001,
"loss": 1.9073,
"step": 1200
},
{
"epoch": 0.01719418423912296,
"grad_norm": 0.19570203125476837,
"learning_rate": 0.0001,
"loss": 1.9085,
"step": 1250
},
{
"epoch": 0.017881951608687878,
"grad_norm": 0.2081640362739563,
"learning_rate": 0.0001,
"loss": 1.9093,
"step": 1300
},
{
"epoch": 0.018569718978252797,
"grad_norm": 0.19721642136573792,
"learning_rate": 0.0001,
"loss": 1.9086,
"step": 1350
},
{
"epoch": 0.019257486347817713,
"grad_norm": 0.202309712767601,
"learning_rate": 0.0001,
"loss": 1.9039,
"step": 1400
},
{
"epoch": 0.019945253717382632,
"grad_norm": 0.22128838300704956,
"learning_rate": 0.0001,
"loss": 1.9067,
"step": 1450
},
{
"epoch": 0.02063302108694755,
"grad_norm": 0.25011196732521057,
"learning_rate": 0.0001,
"loss": 1.9055,
"step": 1500
},
{
"epoch": 0.02132078845651247,
"grad_norm": 0.20523639023303986,
"learning_rate": 0.0001,
"loss": 1.9039,
"step": 1550
},
{
"epoch": 0.022008555826077386,
"grad_norm": 0.2327890396118164,
"learning_rate": 0.0001,
"loss": 1.9059,
"step": 1600
},
{
"epoch": 0.022696323195642305,
"grad_norm": 0.22426384687423706,
"learning_rate": 0.0001,
"loss": 1.9033,
"step": 1650
},
{
"epoch": 0.023384090565207225,
"grad_norm": 0.2116124927997589,
"learning_rate": 0.0001,
"loss": 1.902,
"step": 1700
},
{
"epoch": 0.024071857934772144,
"grad_norm": 0.21172966063022614,
"learning_rate": 0.0001,
"loss": 1.9007,
"step": 1750
},
{
"epoch": 0.02475962530433706,
"grad_norm": 0.19443170726299286,
"learning_rate": 0.0001,
"loss": 1.9003,
"step": 1800
},
{
"epoch": 0.02544739267390198,
"grad_norm": 0.21195723116397858,
"learning_rate": 0.0001,
"loss": 1.9015,
"step": 1850
},
{
"epoch": 0.026135160043466898,
"grad_norm": 0.22141411900520325,
"learning_rate": 0.0001,
"loss": 1.8957,
"step": 1900
},
{
"epoch": 0.026822927413031817,
"grad_norm": 0.22995401918888092,
"learning_rate": 0.0001,
"loss": 1.8979,
"step": 1950
},
{
"epoch": 0.027510694782596736,
"grad_norm": 0.2246379405260086,
"learning_rate": 0.0001,
"loss": 1.8966,
"step": 2000
},
{
"epoch": 0.028198462152161652,
"grad_norm": 0.22695621848106384,
"learning_rate": 0.0001,
"loss": 1.895,
"step": 2050
},
{
"epoch": 0.02888622952172657,
"grad_norm": 0.19988253712654114,
"learning_rate": 0.0001,
"loss": 1.8934,
"step": 2100
},
{
"epoch": 0.02957399689129149,
"grad_norm": 0.21754223108291626,
"learning_rate": 0.0001,
"loss": 1.8972,
"step": 2150
},
{
"epoch": 0.03026176426085641,
"grad_norm": 0.19053423404693604,
"learning_rate": 0.0001,
"loss": 1.8912,
"step": 2200
},
{
"epoch": 0.030949531630421325,
"grad_norm": 0.21589875221252441,
"learning_rate": 0.0001,
"loss": 1.8935,
"step": 2250
},
{
"epoch": 0.03163729899998625,
"grad_norm": 0.2087436020374298,
"learning_rate": 0.0001,
"loss": 1.8923,
"step": 2300
},
{
"epoch": 0.03232506636955116,
"grad_norm": 0.2261374592781067,
"learning_rate": 0.0001,
"loss": 1.8914,
"step": 2350
},
{
"epoch": 0.03301283373911608,
"grad_norm": 0.1949523240327835,
"learning_rate": 0.0001,
"loss": 1.8905,
"step": 2400
},
{
"epoch": 0.033700601108681,
"grad_norm": 0.21544858813285828,
"learning_rate": 0.0001,
"loss": 1.8909,
"step": 2450
},
{
"epoch": 0.03438836847824592,
"grad_norm": 0.20145681500434875,
"learning_rate": 0.0001,
"loss": 1.8876,
"step": 2500
},
{
"epoch": 0.03507613584781084,
"grad_norm": 0.21707232296466827,
"learning_rate": 0.0001,
"loss": 1.8915,
"step": 2550
},
{
"epoch": 0.035763903217375756,
"grad_norm": 0.1982990950345993,
"learning_rate": 0.0001,
"loss": 1.888,
"step": 2600
},
{
"epoch": 0.036451670586940675,
"grad_norm": 0.2223712056875229,
"learning_rate": 0.0001,
"loss": 1.8868,
"step": 2650
},
{
"epoch": 0.037139437956505594,
"grad_norm": 0.19649413228034973,
"learning_rate": 0.0001,
"loss": 1.8869,
"step": 2700
},
{
"epoch": 0.03782720532607051,
"grad_norm": 0.22767962515354156,
"learning_rate": 0.0001,
"loss": 1.8901,
"step": 2750
},
{
"epoch": 0.038514972695635426,
"grad_norm": 0.19138416647911072,
"learning_rate": 0.0001,
"loss": 1.8916,
"step": 2800
},
{
"epoch": 0.039202740065200345,
"grad_norm": 0.19380460679531097,
"learning_rate": 0.0001,
"loss": 1.8889,
"step": 2850
},
{
"epoch": 0.039890507434765264,
"grad_norm": 0.19751518964767456,
"learning_rate": 0.0001,
"loss": 1.8868,
"step": 2900
},
{
"epoch": 0.04057827480433018,
"grad_norm": 0.21071408689022064,
"learning_rate": 0.0001,
"loss": 1.8862,
"step": 2950
},
{
"epoch": 0.0412660421738951,
"grad_norm": 0.19260670244693756,
"learning_rate": 0.0001,
"loss": 1.8827,
"step": 3000
},
{
"epoch": 0.04195380954346002,
"grad_norm": 0.19185714423656464,
"learning_rate": 0.0001,
"loss": 1.8866,
"step": 3050
},
{
"epoch": 0.04264157691302494,
"grad_norm": 0.24877017736434937,
"learning_rate": 0.0001,
"loss": 1.8854,
"step": 3100
},
{
"epoch": 0.04332934428258986,
"grad_norm": 0.1947249323129654,
"learning_rate": 0.0001,
"loss": 1.8842,
"step": 3150
},
{
"epoch": 0.04401711165215477,
"grad_norm": 0.20210722088813782,
"learning_rate": 0.0001,
"loss": 1.8837,
"step": 3200
},
{
"epoch": 0.04470487902171969,
"grad_norm": 0.22242394089698792,
"learning_rate": 0.0001,
"loss": 1.8817,
"step": 3250
},
{
"epoch": 0.04539264639128461,
"grad_norm": 0.2049330472946167,
"learning_rate": 0.0001,
"loss": 1.8845,
"step": 3300
},
{
"epoch": 0.04608041376084953,
"grad_norm": 0.19368599355220795,
"learning_rate": 0.0001,
"loss": 1.884,
"step": 3350
},
{
"epoch": 0.04676818113041445,
"grad_norm": 0.1886671483516693,
"learning_rate": 0.0001,
"loss": 1.883,
"step": 3400
},
{
"epoch": 0.04745594849997937,
"grad_norm": 0.19359445571899414,
"learning_rate": 0.0001,
"loss": 1.8824,
"step": 3450
},
{
"epoch": 0.04814371586954429,
"grad_norm": 0.195325568318367,
"learning_rate": 0.0001,
"loss": 1.8806,
"step": 3500
},
{
"epoch": 0.04883148323910921,
"grad_norm": 0.21584388613700867,
"learning_rate": 0.0001,
"loss": 1.879,
"step": 3550
},
{
"epoch": 0.04951925060867412,
"grad_norm": 0.19085532426834106,
"learning_rate": 0.0001,
"loss": 1.8817,
"step": 3600
},
{
"epoch": 0.05020701797823904,
"grad_norm": 0.2133578211069107,
"learning_rate": 0.0001,
"loss": 1.8797,
"step": 3650
},
{
"epoch": 0.05089478534780396,
"grad_norm": 0.19587628543376923,
"learning_rate": 0.0001,
"loss": 1.8806,
"step": 3700
},
{
"epoch": 0.051582552717368876,
"grad_norm": 0.22608409821987152,
"learning_rate": 0.0001,
"loss": 1.8803,
"step": 3750
},
{
"epoch": 0.052270320086933796,
"grad_norm": 0.20075012743473053,
"learning_rate": 0.0001,
"loss": 1.8773,
"step": 3800
},
{
"epoch": 0.052958087456498715,
"grad_norm": 0.2007540464401245,
"learning_rate": 0.0001,
"loss": 1.8775,
"step": 3850
},
{
"epoch": 0.053645854826063634,
"grad_norm": 0.20465299487113953,
"learning_rate": 0.0001,
"loss": 1.88,
"step": 3900
},
{
"epoch": 0.05433362219562855,
"grad_norm": 0.19921573996543884,
"learning_rate": 0.0001,
"loss": 1.8749,
"step": 3950
},
{
"epoch": 0.05502138956519347,
"grad_norm": 0.19196507334709167,
"learning_rate": 0.0001,
"loss": 1.8808,
"step": 4000
},
{
"epoch": 0.055709156934758385,
"grad_norm": 0.20529140532016754,
"learning_rate": 0.0001,
"loss": 1.8787,
"step": 4050
},
{
"epoch": 0.056396924304323304,
"grad_norm": 0.23082584142684937,
"learning_rate": 0.0001,
"loss": 1.8752,
"step": 4100
},
{
"epoch": 0.05708469167388822,
"grad_norm": 0.18597312271595,
"learning_rate": 0.0001,
"loss": 1.8793,
"step": 4150
},
{
"epoch": 0.05777245904345314,
"grad_norm": 0.23071937263011932,
"learning_rate": 0.0001,
"loss": 1.8782,
"step": 4200
},
{
"epoch": 0.05846022641301806,
"grad_norm": 0.19141189754009247,
"learning_rate": 0.0001,
"loss": 1.875,
"step": 4250
},
{
"epoch": 0.05914799378258298,
"grad_norm": 0.23278222978115082,
"learning_rate": 0.0001,
"loss": 1.8805,
"step": 4300
},
{
"epoch": 0.0598357611521479,
"grad_norm": 0.21169067919254303,
"learning_rate": 0.0001,
"loss": 1.8753,
"step": 4350
},
{
"epoch": 0.06052352852171282,
"grad_norm": 0.2010953575372696,
"learning_rate": 0.0001,
"loss": 1.8758,
"step": 4400
},
{
"epoch": 0.06121129589127773,
"grad_norm": 0.19260814785957336,
"learning_rate": 0.0001,
"loss": 1.8731,
"step": 4450
},
{
"epoch": 0.06189906326084265,
"grad_norm": 0.19751103222370148,
"learning_rate": 0.0001,
"loss": 1.8719,
"step": 4500
},
{
"epoch": 0.06258683063040757,
"grad_norm": 0.21297581493854523,
"learning_rate": 0.0001,
"loss": 1.875,
"step": 4550
},
{
"epoch": 0.0632745979999725,
"grad_norm": 0.2128158062696457,
"learning_rate": 0.0001,
"loss": 1.8711,
"step": 4600
},
{
"epoch": 0.06396236536953741,
"grad_norm": 0.18719784915447235,
"learning_rate": 0.0001,
"loss": 1.8741,
"step": 4650
},
{
"epoch": 0.06465013273910232,
"grad_norm": 0.2352721244096756,
"learning_rate": 0.0001,
"loss": 1.8717,
"step": 4700
},
{
"epoch": 0.06533790010866725,
"grad_norm": 0.22228975594043732,
"learning_rate": 0.0001,
"loss": 1.8707,
"step": 4750
},
{
"epoch": 0.06602566747823216,
"grad_norm": 0.18716222047805786,
"learning_rate": 0.0001,
"loss": 1.8705,
"step": 4800
},
{
"epoch": 0.06671343484779708,
"grad_norm": 0.22167149186134338,
"learning_rate": 0.0001,
"loss": 1.8739,
"step": 4850
},
{
"epoch": 0.067401202217362,
"grad_norm": 0.24794642627239227,
"learning_rate": 0.0001,
"loss": 1.8747,
"step": 4900
},
{
"epoch": 0.06808896958692692,
"grad_norm": 0.18762528896331787,
"learning_rate": 0.0001,
"loss": 1.8702,
"step": 4950
},
{
"epoch": 0.06877673695649184,
"grad_norm": 0.19063113629817963,
"learning_rate": 0.0001,
"loss": 1.8733,
"step": 5000
},
{
"epoch": 0.06946450432605676,
"grad_norm": 0.1940603107213974,
"learning_rate": 0.0001,
"loss": 1.8685,
"step": 5050
},
{
"epoch": 0.07015227169562167,
"grad_norm": 0.19752484560012817,
"learning_rate": 0.0001,
"loss": 1.8762,
"step": 5100
},
{
"epoch": 0.07084003906518659,
"grad_norm": 0.23486199975013733,
"learning_rate": 0.0001,
"loss": 1.8708,
"step": 5150
},
{
"epoch": 0.07152780643475151,
"grad_norm": 0.20315973460674286,
"learning_rate": 0.0001,
"loss": 1.8676,
"step": 5200
},
{
"epoch": 0.07221557380431642,
"grad_norm": 0.1925646960735321,
"learning_rate": 0.0001,
"loss": 1.8634,
"step": 5250
},
{
"epoch": 0.07290334117388135,
"grad_norm": 0.20540663599967957,
"learning_rate": 0.0001,
"loss": 1.8706,
"step": 5300
},
{
"epoch": 0.07359110854344626,
"grad_norm": 0.23649099469184875,
"learning_rate": 0.0001,
"loss": 1.8685,
"step": 5350
},
{
"epoch": 0.07427887591301119,
"grad_norm": 0.23272614181041718,
"learning_rate": 0.0001,
"loss": 1.8724,
"step": 5400
},
{
"epoch": 0.0749666432825761,
"grad_norm": 0.1887608915567398,
"learning_rate": 0.0001,
"loss": 1.8707,
"step": 5450
},
{
"epoch": 0.07565441065214101,
"grad_norm": 0.18964676558971405,
"learning_rate": 0.0001,
"loss": 1.8642,
"step": 5500
},
{
"epoch": 0.07634217802170594,
"grad_norm": 0.20009934902191162,
"learning_rate": 0.0001,
"loss": 1.8657,
"step": 5550
},
{
"epoch": 0.07702994539127085,
"grad_norm": 0.1821998506784439,
"learning_rate": 0.0001,
"loss": 1.8673,
"step": 5600
},
{
"epoch": 0.07771771276083578,
"grad_norm": 0.18905235826969147,
"learning_rate": 0.0001,
"loss": 1.8687,
"step": 5650
},
{
"epoch": 0.07840548013040069,
"grad_norm": 0.19986678659915924,
"learning_rate": 0.0001,
"loss": 1.8627,
"step": 5700
},
{
"epoch": 0.07909324749996562,
"grad_norm": 0.1904374659061432,
"learning_rate": 0.0001,
"loss": 1.8633,
"step": 5750
},
{
"epoch": 0.07978101486953053,
"grad_norm": 0.19536761939525604,
"learning_rate": 0.0001,
"loss": 1.8685,
"step": 5800
},
{
"epoch": 0.08046878223909545,
"grad_norm": 0.18209826946258545,
"learning_rate": 0.0001,
"loss": 1.8599,
"step": 5850
},
{
"epoch": 0.08115654960866037,
"grad_norm": 0.21385939419269562,
"learning_rate": 0.0001,
"loss": 1.866,
"step": 5900
},
{
"epoch": 0.08184431697822528,
"grad_norm": 0.20338542759418488,
"learning_rate": 0.0001,
"loss": 1.8669,
"step": 5950
},
{
"epoch": 0.0825320843477902,
"grad_norm": 0.19536232948303223,
"learning_rate": 0.0001,
"loss": 1.8644,
"step": 6000
},
{
"epoch": 0.08321985171735512,
"grad_norm": 0.18480873107910156,
"learning_rate": 0.0001,
"loss": 1.8668,
"step": 6050
},
{
"epoch": 0.08390761908692004,
"grad_norm": 0.18024863302707672,
"learning_rate": 0.0001,
"loss": 1.8638,
"step": 6100
},
{
"epoch": 0.08459538645648496,
"grad_norm": 0.18774175643920898,
"learning_rate": 0.0001,
"loss": 1.8652,
"step": 6150
},
{
"epoch": 0.08528315382604988,
"grad_norm": 0.2518685460090637,
"learning_rate": 0.0001,
"loss": 1.8649,
"step": 6200
},
{
"epoch": 0.0859709211956148,
"grad_norm": 0.20646634697914124,
"learning_rate": 0.0001,
"loss": 1.8658,
"step": 6250
},
{
"epoch": 0.08665868856517972,
"grad_norm": 0.19222316145896912,
"learning_rate": 0.0001,
"loss": 1.8642,
"step": 6300
},
{
"epoch": 0.08734645593474463,
"grad_norm": 0.19531960785388947,
"learning_rate": 0.0001,
"loss": 1.8641,
"step": 6350
},
{
"epoch": 0.08803422330430954,
"grad_norm": 0.18218673765659332,
"learning_rate": 0.0001,
"loss": 1.8599,
"step": 6400
},
{
"epoch": 0.08872199067387447,
"grad_norm": 0.18686556816101074,
"learning_rate": 0.0001,
"loss": 1.8588,
"step": 6450
},
{
"epoch": 0.08940975804343938,
"grad_norm": 0.20718005299568176,
"learning_rate": 0.0001,
"loss": 1.8595,
"step": 6500
},
{
"epoch": 0.09009752541300431,
"grad_norm": 0.17680206894874573,
"learning_rate": 0.0001,
"loss": 1.8625,
"step": 6550
},
{
"epoch": 0.09078529278256922,
"grad_norm": 0.25429028272628784,
"learning_rate": 0.0001,
"loss": 1.8635,
"step": 6600
},
{
"epoch": 0.09147306015213415,
"grad_norm": 0.19778478145599365,
"learning_rate": 0.0001,
"loss": 1.8618,
"step": 6650
},
{
"epoch": 0.09216082752169906,
"grad_norm": 0.21198226511478424,
"learning_rate": 0.0001,
"loss": 1.8613,
"step": 6700
},
{
"epoch": 0.09284859489126399,
"grad_norm": 0.1819111704826355,
"learning_rate": 0.0001,
"loss": 1.8601,
"step": 6750
},
{
"epoch": 0.0935363622608289,
"grad_norm": 0.2141820788383484,
"learning_rate": 0.0001,
"loss": 1.8598,
"step": 6800
},
{
"epoch": 0.09422412963039381,
"grad_norm": 0.20356012880802155,
"learning_rate": 0.0001,
"loss": 1.8619,
"step": 6850
},
{
"epoch": 0.09491189699995874,
"grad_norm": 0.18998335301876068,
"learning_rate": 0.0001,
"loss": 1.8597,
"step": 6900
},
{
"epoch": 0.09559966436952365,
"grad_norm": 0.19086682796478271,
"learning_rate": 0.0001,
"loss": 1.8622,
"step": 6950
},
{
"epoch": 0.09628743173908857,
"grad_norm": 0.2049364447593689,
"learning_rate": 0.0001,
"loss": 1.8617,
"step": 7000
},
{
"epoch": 0.09697519910865349,
"grad_norm": 0.19833974540233612,
"learning_rate": 0.0001,
"loss": 1.8609,
"step": 7050
},
{
"epoch": 0.09766296647821841,
"grad_norm": 0.19551745057106018,
"learning_rate": 0.0001,
"loss": 1.8581,
"step": 7100
},
{
"epoch": 0.09835073384778333,
"grad_norm": 0.1846143752336502,
"learning_rate": 0.0001,
"loss": 1.8569,
"step": 7150
},
{
"epoch": 0.09903850121734824,
"grad_norm": 0.1906626969575882,
"learning_rate": 0.0001,
"loss": 1.8614,
"step": 7200
},
{
"epoch": 0.09972626858691316,
"grad_norm": 0.19115209579467773,
"learning_rate": 0.0001,
"loss": 1.8633,
"step": 7250
},
{
"epoch": 0.10041403595647808,
"grad_norm": 0.18704906105995178,
"learning_rate": 0.0001,
"loss": 1.8601,
"step": 7300
},
{
"epoch": 0.101101803326043,
"grad_norm": 0.18635210394859314,
"learning_rate": 0.0001,
"loss": 1.8605,
"step": 7350
},
{
"epoch": 0.10178957069560791,
"grad_norm": 0.1947161853313446,
"learning_rate": 0.0001,
"loss": 1.861,
"step": 7400
},
{
"epoch": 0.10247733806517284,
"grad_norm": 0.22087708115577698,
"learning_rate": 0.0001,
"loss": 1.8553,
"step": 7450
},
{
"epoch": 0.10316510543473775,
"grad_norm": 0.1805039346218109,
"learning_rate": 0.0001,
"loss": 1.8591,
"step": 7500
},
{
"epoch": 0.10385287280430268,
"grad_norm": 0.19084776937961578,
"learning_rate": 0.0001,
"loss": 1.8561,
"step": 7550
},
{
"epoch": 0.10454064017386759,
"grad_norm": 0.20166590809822083,
"learning_rate": 0.0001,
"loss": 1.8584,
"step": 7600
},
{
"epoch": 0.1052284075434325,
"grad_norm": 0.1892371028661728,
"learning_rate": 0.0001,
"loss": 1.8526,
"step": 7650
},
{
"epoch": 0.10591617491299743,
"grad_norm": 0.22085241973400116,
"learning_rate": 0.0001,
"loss": 1.8561,
"step": 7700
},
{
"epoch": 0.10660394228256234,
"grad_norm": 0.186112642288208,
"learning_rate": 0.0001,
"loss": 1.8597,
"step": 7750
},
{
"epoch": 0.10729170965212727,
"grad_norm": 0.1959947943687439,
"learning_rate": 0.0001,
"loss": 1.8558,
"step": 7800
},
{
"epoch": 0.10797947702169218,
"grad_norm": 0.21492016315460205,
"learning_rate": 0.0001,
"loss": 1.8608,
"step": 7850
},
{
"epoch": 0.1086672443912571,
"grad_norm": 0.18600517511367798,
"learning_rate": 0.0001,
"loss": 1.8559,
"step": 7900
},
{
"epoch": 0.10935501176082202,
"grad_norm": 0.18841132521629333,
"learning_rate": 0.0001,
"loss": 1.8542,
"step": 7950
},
{
"epoch": 0.11004277913038694,
"grad_norm": 0.20758236944675446,
"learning_rate": 0.0001,
"loss": 1.8565,
"step": 8000
},
{
"epoch": 0.11073054649995186,
"grad_norm": 0.20206254720687866,
"learning_rate": 0.0001,
"loss": 1.8553,
"step": 8050
},
{
"epoch": 0.11141831386951677,
"grad_norm": 0.19620998203754425,
"learning_rate": 0.0001,
"loss": 1.8542,
"step": 8100
},
{
"epoch": 0.1121060812390817,
"grad_norm": 0.19747626781463623,
"learning_rate": 0.0001,
"loss": 1.8545,
"step": 8150
},
{
"epoch": 0.11279384860864661,
"grad_norm": 0.21328890323638916,
"learning_rate": 0.0001,
"loss": 1.8552,
"step": 8200
},
{
"epoch": 0.11348161597821153,
"grad_norm": 0.18296054005622864,
"learning_rate": 0.0001,
"loss": 1.8579,
"step": 8250
},
{
"epoch": 0.11416938334777645,
"grad_norm": 0.21098335087299347,
"learning_rate": 0.0001,
"loss": 1.8526,
"step": 8300
},
{
"epoch": 0.11485715071734137,
"grad_norm": 0.18666841089725494,
"learning_rate": 0.0001,
"loss": 1.8484,
"step": 8350
},
{
"epoch": 0.11554491808690628,
"grad_norm": 0.18522906303405762,
"learning_rate": 0.0001,
"loss": 1.8538,
"step": 8400
},
{
"epoch": 0.1162326854564712,
"grad_norm": 0.1890312135219574,
"learning_rate": 0.0001,
"loss": 1.8519,
"step": 8450
},
{
"epoch": 0.11692045282603612,
"grad_norm": 0.197422057390213,
"learning_rate": 0.0001,
"loss": 1.8513,
"step": 8500
},
{
"epoch": 0.11760822019560103,
"grad_norm": 0.21355442702770233,
"learning_rate": 0.0001,
"loss": 1.8561,
"step": 8550
},
{
"epoch": 0.11829598756516596,
"grad_norm": 0.18543662130832672,
"learning_rate": 0.0001,
"loss": 1.8538,
"step": 8600
},
{
"epoch": 0.11898375493473087,
"grad_norm": 0.20849215984344482,
"learning_rate": 0.0001,
"loss": 1.8527,
"step": 8650
},
{
"epoch": 0.1196715223042958,
"grad_norm": 0.2109488546848297,
"learning_rate": 0.0001,
"loss": 1.8496,
"step": 8700
},
{
"epoch": 0.12035928967386071,
"grad_norm": 0.20195640623569489,
"learning_rate": 0.0001,
"loss": 1.8499,
"step": 8750
},
{
"epoch": 0.12104705704342564,
"grad_norm": 0.1749362200498581,
"learning_rate": 0.0001,
"loss": 1.8559,
"step": 8800
},
{
"epoch": 0.12173482441299055,
"grad_norm": 0.20881310105323792,
"learning_rate": 0.0001,
"loss": 1.8536,
"step": 8850
},
{
"epoch": 0.12242259178255546,
"grad_norm": 0.1801750510931015,
"learning_rate": 0.0001,
"loss": 1.8507,
"step": 8900
},
{
"epoch": 0.12311035915212039,
"grad_norm": 0.1898815929889679,
"learning_rate": 0.0001,
"loss": 1.8493,
"step": 8950
},
{
"epoch": 0.1237981265216853,
"grad_norm": 0.19754734635353088,
"learning_rate": 0.0001,
"loss": 1.853,
"step": 9000
},
{
"epoch": 0.12448589389125023,
"grad_norm": 0.1855219006538391,
"learning_rate": 0.0001,
"loss": 1.8529,
"step": 9050
},
{
"epoch": 0.12517366126081514,
"grad_norm": 0.19341996312141418,
"learning_rate": 0.0001,
"loss": 1.8513,
"step": 9100
},
{
"epoch": 0.12586142863038005,
"grad_norm": 0.19776052236557007,
"learning_rate": 0.0001,
"loss": 1.8507,
"step": 9150
},
{
"epoch": 0.126549195999945,
"grad_norm": 0.185306116938591,
"learning_rate": 0.0001,
"loss": 1.851,
"step": 9200
},
{
"epoch": 0.1272369633695099,
"grad_norm": 0.19926750659942627,
"learning_rate": 0.0001,
"loss": 1.8504,
"step": 9250
},
{
"epoch": 0.12792473073907482,
"grad_norm": 0.21605028212070465,
"learning_rate": 0.0001,
"loss": 1.8502,
"step": 9300
},
{
"epoch": 0.12861249810863973,
"grad_norm": 0.18174859881401062,
"learning_rate": 0.0001,
"loss": 1.8505,
"step": 9350
},
{
"epoch": 0.12930026547820464,
"grad_norm": 0.19654984772205353,
"learning_rate": 0.0001,
"loss": 1.8517,
"step": 9400
},
{
"epoch": 0.12998803284776958,
"grad_norm": 0.1764276772737503,
"learning_rate": 0.0001,
"loss": 1.8483,
"step": 9450
},
{
"epoch": 0.1306758002173345,
"grad_norm": 0.17811571061611176,
"learning_rate": 0.0001,
"loss": 1.8469,
"step": 9500
},
{
"epoch": 0.1313635675868994,
"grad_norm": 0.20159000158309937,
"learning_rate": 0.0001,
"loss": 1.8455,
"step": 9550
},
{
"epoch": 0.13205133495646432,
"grad_norm": 0.1840062290430069,
"learning_rate": 0.0001,
"loss": 1.8511,
"step": 9600
},
{
"epoch": 0.13273910232602926,
"grad_norm": 0.190440833568573,
"learning_rate": 0.0001,
"loss": 1.8474,
"step": 9650
},
{
"epoch": 0.13342686969559417,
"grad_norm": 0.20033535361289978,
"learning_rate": 0.0001,
"loss": 1.8479,
"step": 9700
},
{
"epoch": 0.13411463706515908,
"grad_norm": 0.1811174899339676,
"learning_rate": 0.0001,
"loss": 1.8504,
"step": 9750
},
{
"epoch": 0.134802404434724,
"grad_norm": 0.2073344737291336,
"learning_rate": 0.0001,
"loss": 1.8507,
"step": 9800
},
{
"epoch": 0.1354901718042889,
"grad_norm": 0.21762603521347046,
"learning_rate": 0.0001,
"loss": 1.8499,
"step": 9850
},
{
"epoch": 0.13617793917385385,
"grad_norm": 0.1864607185125351,
"learning_rate": 0.0001,
"loss": 1.8471,
"step": 9900
},
{
"epoch": 0.13686570654341876,
"grad_norm": 0.17837654054164886,
"learning_rate": 0.0001,
"loss": 1.8485,
"step": 9950
},
{
"epoch": 0.13755347391298367,
"grad_norm": 0.20498532056808472,
"learning_rate": 0.0001,
"loss": 1.8497,
"step": 10000
},
{
"epoch": 0.13824124128254858,
"grad_norm": 0.18355566263198853,
"learning_rate": 0.0001,
"loss": 1.8458,
"step": 10050
},
{
"epoch": 0.13892900865211352,
"grad_norm": 0.2033490389585495,
"learning_rate": 0.0001,
"loss": 1.8451,
"step": 10100
},
{
"epoch": 0.13961677602167843,
"grad_norm": 0.1855219006538391,
"learning_rate": 0.0001,
"loss": 1.8475,
"step": 10150
},
{
"epoch": 0.14030454339124335,
"grad_norm": 0.18876652419567108,
"learning_rate": 0.0001,
"loss": 1.8473,
"step": 10200
},
{
"epoch": 0.14099231076080826,
"grad_norm": 0.1731424629688263,
"learning_rate": 0.0001,
"loss": 1.8475,
"step": 10250
},
{
"epoch": 0.14168007813037317,
"grad_norm": 0.186906635761261,
"learning_rate": 0.0001,
"loss": 1.8498,
"step": 10300
},
{
"epoch": 0.1423678454999381,
"grad_norm": 0.18285425007343292,
"learning_rate": 0.0001,
"loss": 1.8451,
"step": 10350
},
{
"epoch": 0.14305561286950302,
"grad_norm": 0.19545456767082214,
"learning_rate": 0.0001,
"loss": 1.8487,
"step": 10400
},
{
"epoch": 0.14374338023906794,
"grad_norm": 0.16256272792816162,
"learning_rate": 0.0001,
"loss": 1.8461,
"step": 10450
},
{
"epoch": 0.14443114760863285,
"grad_norm": 0.19637931883335114,
"learning_rate": 0.0001,
"loss": 1.8462,
"step": 10500
},
{
"epoch": 0.14511891497819776,
"grad_norm": 0.20408660173416138,
"learning_rate": 0.0001,
"loss": 1.8465,
"step": 10550
},
{
"epoch": 0.1458066823477627,
"grad_norm": 0.2140285223722458,
"learning_rate": 0.0001,
"loss": 1.8421,
"step": 10600
},
{
"epoch": 0.1464944497173276,
"grad_norm": 0.18366774916648865,
"learning_rate": 0.0001,
"loss": 1.8454,
"step": 10650
},
{
"epoch": 0.14718221708689253,
"grad_norm": 0.19011645019054413,
"learning_rate": 0.0001,
"loss": 1.8427,
"step": 10700
},
{
"epoch": 0.14786998445645744,
"grad_norm": 0.1923753321170807,
"learning_rate": 0.0001,
"loss": 1.8442,
"step": 10750
},
{
"epoch": 0.14855775182602238,
"grad_norm": 0.19208142161369324,
"learning_rate": 0.0001,
"loss": 1.8413,
"step": 10800
},
{
"epoch": 0.1492455191955873,
"grad_norm": 0.19608841836452484,
"learning_rate": 0.0001,
"loss": 1.8468,
"step": 10850
},
{
"epoch": 0.1499332865651522,
"grad_norm": 0.19484341144561768,
"learning_rate": 0.0001,
"loss": 1.849,
"step": 10900
},
{
"epoch": 0.15062105393471711,
"grad_norm": 0.18584389984607697,
"learning_rate": 0.0001,
"loss": 1.8416,
"step": 10950
},
{
"epoch": 0.15130882130428203,
"grad_norm": 0.1894279420375824,
"learning_rate": 0.0001,
"loss": 1.8454,
"step": 11000
},
{
"epoch": 0.15199658867384697,
"grad_norm": 0.19622810184955597,
"learning_rate": 0.0001,
"loss": 1.8449,
"step": 11050
},
{
"epoch": 0.15268435604341188,
"grad_norm": 0.18603233993053436,
"learning_rate": 0.0001,
"loss": 1.848,
"step": 11100
},
{
"epoch": 0.1533721234129768,
"grad_norm": 0.18146397173404694,
"learning_rate": 0.0001,
"loss": 1.8413,
"step": 11150
},
{
"epoch": 0.1540598907825417,
"grad_norm": 0.20820939540863037,
"learning_rate": 0.0001,
"loss": 1.844,
"step": 11200
},
{
"epoch": 0.15474765815210664,
"grad_norm": 0.18021373450756073,
"learning_rate": 0.0001,
"loss": 1.8434,
"step": 11250
},
{
"epoch": 0.15543542552167156,
"grad_norm": 0.19339635968208313,
"learning_rate": 0.0001,
"loss": 1.8405,
"step": 11300
},
{
"epoch": 0.15612319289123647,
"grad_norm": 0.1994727998971939,
"learning_rate": 0.0001,
"loss": 1.8403,
"step": 11350
},
{
"epoch": 0.15681096026080138,
"grad_norm": 0.1830483376979828,
"learning_rate": 0.0001,
"loss": 1.8415,
"step": 11400
},
{
"epoch": 0.1574987276303663,
"grad_norm": 0.17064842581748962,
"learning_rate": 0.0001,
"loss": 1.8433,
"step": 11450
},
{
"epoch": 0.15818649499993123,
"grad_norm": 0.19161944091320038,
"learning_rate": 0.0001,
"loss": 1.8428,
"step": 11500
},
{
"epoch": 0.15887426236949614,
"grad_norm": 0.21216394007205963,
"learning_rate": 0.0001,
"loss": 1.8432,
"step": 11550
},
{
"epoch": 0.15956202973906106,
"grad_norm": 0.1909138560295105,
"learning_rate": 0.0001,
"loss": 1.8429,
"step": 11600
},
{
"epoch": 0.16024979710862597,
"grad_norm": 0.20326951146125793,
"learning_rate": 0.0001,
"loss": 1.8419,
"step": 11650
},
{
"epoch": 0.1609375644781909,
"grad_norm": 0.19515758752822876,
"learning_rate": 0.0001,
"loss": 1.8448,
"step": 11700
},
{
"epoch": 0.16162533184775582,
"grad_norm": 0.2075706273317337,
"learning_rate": 0.0001,
"loss": 1.8439,
"step": 11750
},
{
"epoch": 0.16231309921732073,
"grad_norm": 0.21147705614566803,
"learning_rate": 0.0001,
"loss": 1.8433,
"step": 11800
},
{
"epoch": 0.16300086658688565,
"grad_norm": 0.18318484723567963,
"learning_rate": 0.0001,
"loss": 1.8383,
"step": 11850
},
{
"epoch": 0.16368863395645056,
"grad_norm": 0.18728312849998474,
"learning_rate": 0.0001,
"loss": 1.8426,
"step": 11900
},
{
"epoch": 0.1643764013260155,
"grad_norm": 0.20905287563800812,
"learning_rate": 0.0001,
"loss": 1.8421,
"step": 11950
},
{
"epoch": 0.1650641686955804,
"grad_norm": 0.18393969535827637,
"learning_rate": 0.0001,
"loss": 1.8408,
"step": 12000
},
{
"epoch": 0.16575193606514532,
"grad_norm": 0.18366305530071259,
"learning_rate": 0.0001,
"loss": 1.8365,
"step": 12050
},
{
"epoch": 0.16643970343471023,
"grad_norm": 0.19170603156089783,
"learning_rate": 0.0001,
"loss": 1.8416,
"step": 12100
},
{
"epoch": 0.16712747080427517,
"grad_norm": 0.172319233417511,
"learning_rate": 0.0001,
"loss": 1.8411,
"step": 12150
},
{
"epoch": 0.1678152381738401,
"grad_norm": 0.2174234390258789,
"learning_rate": 0.0001,
"loss": 1.8416,
"step": 12200
},
{
"epoch": 0.168503005543405,
"grad_norm": 0.20210625231266022,
"learning_rate": 0.0001,
"loss": 1.8422,
"step": 12250
},
{
"epoch": 0.1691907729129699,
"grad_norm": 0.1902657449245453,
"learning_rate": 0.0001,
"loss": 1.8369,
"step": 12300
},
{
"epoch": 0.16987854028253482,
"grad_norm": 0.18901073932647705,
"learning_rate": 0.0001,
"loss": 1.8415,
"step": 12350
},
{
"epoch": 0.17056630765209976,
"grad_norm": 0.17624430358409882,
"learning_rate": 0.0001,
"loss": 1.8373,
"step": 12400
},
{
"epoch": 0.17125407502166468,
"grad_norm": 0.1844191551208496,
"learning_rate": 0.0001,
"loss": 1.8391,
"step": 12450
},
{
"epoch": 0.1719418423912296,
"grad_norm": 0.19392350316047668,
"learning_rate": 0.0001,
"loss": 1.8416,
"step": 12500
},
{
"epoch": 0.1726296097607945,
"grad_norm": 0.18644706904888153,
"learning_rate": 0.0001,
"loss": 1.8409,
"step": 12550
},
{
"epoch": 0.17331737713035944,
"grad_norm": 0.19530895352363586,
"learning_rate": 0.0001,
"loss": 1.8381,
"step": 12600
},
{
"epoch": 0.17400514449992435,
"grad_norm": 0.18004032969474792,
"learning_rate": 0.0001,
"loss": 1.8419,
"step": 12650
},
{
"epoch": 0.17469291186948926,
"grad_norm": 0.20025117695331573,
"learning_rate": 0.0001,
"loss": 1.8379,
"step": 12700
},
{
"epoch": 0.17538067923905418,
"grad_norm": 0.17622490227222443,
"learning_rate": 0.0001,
"loss": 1.8364,
"step": 12750
},
{
"epoch": 0.1760684466086191,
"grad_norm": 0.19657030701637268,
"learning_rate": 0.0001,
"loss": 1.8364,
"step": 12800
},
{
"epoch": 0.17675621397818403,
"grad_norm": 0.19141744077205658,
"learning_rate": 0.0001,
"loss": 1.8388,
"step": 12850
},
{
"epoch": 0.17744398134774894,
"grad_norm": 0.23409488797187805,
"learning_rate": 0.0001,
"loss": 1.8392,
"step": 12900
},
{
"epoch": 0.17813174871731385,
"grad_norm": 0.19104769825935364,
"learning_rate": 0.0001,
"loss": 1.8407,
"step": 12950
},
{
"epoch": 0.17881951608687877,
"grad_norm": 0.1978139728307724,
"learning_rate": 0.0001,
"loss": 1.836,
"step": 13000
},
{
"epoch": 0.1795072834564437,
"grad_norm": 0.1839970201253891,
"learning_rate": 0.0001,
"loss": 1.8406,
"step": 13050
},
{
"epoch": 0.18019505082600862,
"grad_norm": 0.1969710737466812,
"learning_rate": 0.0001,
"loss": 1.8382,
"step": 13100
},
{
"epoch": 0.18088281819557353,
"grad_norm": 0.21036314964294434,
"learning_rate": 0.0001,
"loss": 1.8372,
"step": 13150
},
{
"epoch": 0.18157058556513844,
"grad_norm": 0.18064115941524506,
"learning_rate": 0.0001,
"loss": 1.8387,
"step": 13200
},
{
"epoch": 0.18225835293470335,
"grad_norm": 0.20280593633651733,
"learning_rate": 0.0001,
"loss": 1.8345,
"step": 13250
},
{
"epoch": 0.1829461203042683,
"grad_norm": 0.21196794509887695,
"learning_rate": 0.0001,
"loss": 1.8403,
"step": 13300
},
{
"epoch": 0.1836338876738332,
"grad_norm": 0.18529263138771057,
"learning_rate": 0.0001,
"loss": 1.8395,
"step": 13350
},
{
"epoch": 0.18432165504339812,
"grad_norm": 0.20009498298168182,
"learning_rate": 0.0001,
"loss": 1.8418,
"step": 13400
},
{
"epoch": 0.18500942241296303,
"grad_norm": 0.1844586879014969,
"learning_rate": 0.0001,
"loss": 1.8388,
"step": 13450
},
{
"epoch": 0.18569718978252797,
"grad_norm": 0.17497003078460693,
"learning_rate": 0.0001,
"loss": 1.8374,
"step": 13500
},
{
"epoch": 0.18638495715209288,
"grad_norm": 0.21536414325237274,
"learning_rate": 0.0001,
"loss": 1.834,
"step": 13550
},
{
"epoch": 0.1870727245216578,
"grad_norm": 0.20212842524051666,
"learning_rate": 0.0001,
"loss": 1.8361,
"step": 13600
},
{
"epoch": 0.1877604918912227,
"grad_norm": 0.21032044291496277,
"learning_rate": 0.0001,
"loss": 1.8352,
"step": 13650
},
{
"epoch": 0.18844825926078762,
"grad_norm": 0.17547431588172913,
"learning_rate": 0.0001,
"loss": 1.839,
"step": 13700
},
{
"epoch": 0.18913602663035256,
"grad_norm": 0.17463110387325287,
"learning_rate": 0.0001,
"loss": 1.8345,
"step": 13750
},
{
"epoch": 0.18982379399991747,
"grad_norm": 0.19794687628746033,
"learning_rate": 0.0001,
"loss": 1.8367,
"step": 13800
},
{
"epoch": 0.19051156136948239,
"grad_norm": 0.17595866322517395,
"learning_rate": 0.0001,
"loss": 1.8349,
"step": 13850
},
{
"epoch": 0.1911993287390473,
"grad_norm": 0.19087472558021545,
"learning_rate": 0.0001,
"loss": 1.8377,
"step": 13900
},
{
"epoch": 0.1918870961086122,
"grad_norm": 0.1895439624786377,
"learning_rate": 0.0001,
"loss": 1.8392,
"step": 13950
},
{
"epoch": 0.19257486347817715,
"grad_norm": 0.19558320939540863,
"learning_rate": 0.0001,
"loss": 1.8331,
"step": 14000
},
{
"epoch": 0.19326263084774206,
"grad_norm": 0.18495230376720428,
"learning_rate": 0.0001,
"loss": 1.8357,
"step": 14050
},
{
"epoch": 0.19395039821730697,
"grad_norm": 0.19197221100330353,
"learning_rate": 0.0001,
"loss": 1.8379,
"step": 14100
},
{
"epoch": 0.1946381655868719,
"grad_norm": 0.17729446291923523,
"learning_rate": 0.0001,
"loss": 1.8336,
"step": 14150
},
{
"epoch": 0.19532593295643683,
"grad_norm": 0.20683547854423523,
"learning_rate": 0.0001,
"loss": 1.8344,
"step": 14200
},
{
"epoch": 0.19601370032600174,
"grad_norm": 0.16708314418792725,
"learning_rate": 0.0001,
"loss": 1.8375,
"step": 14250
},
{
"epoch": 0.19670146769556665,
"grad_norm": 0.2065526694059372,
"learning_rate": 0.0001,
"loss": 1.8397,
"step": 14300
},
{
"epoch": 0.19738923506513156,
"grad_norm": 0.2007008045911789,
"learning_rate": 0.0001,
"loss": 1.8351,
"step": 14350
},
{
"epoch": 0.19807700243469648,
"grad_norm": 0.1773243397474289,
"learning_rate": 0.0001,
"loss": 1.8338,
"step": 14400
},
{
"epoch": 0.19876476980426142,
"grad_norm": 0.1875116229057312,
"learning_rate": 0.0001,
"loss": 1.8379,
"step": 14450
},
{
"epoch": 0.19945253717382633,
"grad_norm": 0.19387130439281464,
"learning_rate": 0.0001,
"loss": 1.8343,
"step": 14500
},
{
"epoch": 0.20014030454339124,
"grad_norm": 0.17164736986160278,
"learning_rate": 0.0001,
"loss": 1.8338,
"step": 14550
},
{
"epoch": 0.20082807191295615,
"grad_norm": 0.19135966897010803,
"learning_rate": 0.0001,
"loss": 1.8321,
"step": 14600
},
{
"epoch": 0.2015158392825211,
"grad_norm": 0.21152153611183167,
"learning_rate": 0.0001,
"loss": 1.8332,
"step": 14650
},
{
"epoch": 0.202203606652086,
"grad_norm": 0.19576500356197357,
"learning_rate": 0.0001,
"loss": 1.8338,
"step": 14700
},
{
"epoch": 0.20289137402165092,
"grad_norm": 0.21700510382652283,
"learning_rate": 0.0001,
"loss": 1.8381,
"step": 14750
},
{
"epoch": 0.20357914139121583,
"grad_norm": 0.18183092772960663,
"learning_rate": 0.0001,
"loss": 1.833,
"step": 14800
},
{
"epoch": 0.20426690876078074,
"grad_norm": 0.1678183525800705,
"learning_rate": 0.0001,
"loss": 1.8365,
"step": 14850
},
{
"epoch": 0.20495467613034568,
"grad_norm": 0.1790694147348404,
"learning_rate": 0.0001,
"loss": 1.8323,
"step": 14900
},
{
"epoch": 0.2056424434999106,
"grad_norm": 0.17274673283100128,
"learning_rate": 0.0001,
"loss": 1.8357,
"step": 14950
},
{
"epoch": 0.2063302108694755,
"grad_norm": 0.1773209273815155,
"learning_rate": 0.0001,
"loss": 1.8338,
"step": 15000
},
{
"epoch": 0.20701797823904042,
"grad_norm": 0.29811668395996094,
"learning_rate": 0.0001,
"loss": 1.8322,
"step": 15050
},
{
"epoch": 0.20770574560860536,
"grad_norm": 0.18590272963047028,
"learning_rate": 0.0001,
"loss": 1.8307,
"step": 15100
},
{
"epoch": 0.20839351297817027,
"grad_norm": 0.19656258821487427,
"learning_rate": 0.0001,
"loss": 1.8364,
"step": 15150
},
{
"epoch": 0.20908128034773518,
"grad_norm": 0.1760113537311554,
"learning_rate": 0.0001,
"loss": 1.8363,
"step": 15200
},
{
"epoch": 0.2097690477173001,
"grad_norm": 0.17442069947719574,
"learning_rate": 0.0001,
"loss": 1.8346,
"step": 15250
},
{
"epoch": 0.210456815086865,
"grad_norm": 0.2154201865196228,
"learning_rate": 0.0001,
"loss": 1.8359,
"step": 15300
},
{
"epoch": 0.21114458245642995,
"grad_norm": 0.18702222406864166,
"learning_rate": 0.0001,
"loss": 1.8333,
"step": 15350
},
{
"epoch": 0.21183234982599486,
"grad_norm": 0.222214013338089,
"learning_rate": 0.0001,
"loss": 1.8386,
"step": 15400
},
{
"epoch": 0.21252011719555977,
"grad_norm": 0.18646612763404846,
"learning_rate": 0.0001,
"loss": 1.8336,
"step": 15450
},
{
"epoch": 0.21320788456512468,
"grad_norm": 0.19032032787799835,
"learning_rate": 0.0001,
"loss": 1.8359,
"step": 15500
},
{
"epoch": 0.21389565193468962,
"grad_norm": 0.1962030827999115,
"learning_rate": 0.0001,
"loss": 1.8314,
"step": 15550
},
{
"epoch": 0.21458341930425454,
"grad_norm": 0.18067054450511932,
"learning_rate": 0.0001,
"loss": 1.8298,
"step": 15600
},
{
"epoch": 0.21527118667381945,
"grad_norm": 0.1977655440568924,
"learning_rate": 0.0001,
"loss": 1.8335,
"step": 15650
},
{
"epoch": 0.21595895404338436,
"grad_norm": 0.17689162492752075,
"learning_rate": 0.0001,
"loss": 1.834,
"step": 15700
},
{
"epoch": 0.21664672141294927,
"grad_norm": 0.189301997423172,
"learning_rate": 0.0001,
"loss": 1.8302,
"step": 15750
},
{
"epoch": 0.2173344887825142,
"grad_norm": 0.21416552364826202,
"learning_rate": 0.0001,
"loss": 1.833,
"step": 15800
},
{
"epoch": 0.21802225615207912,
"grad_norm": 0.17280973494052887,
"learning_rate": 0.0001,
"loss": 1.8325,
"step": 15850
},
{
"epoch": 0.21871002352164404,
"grad_norm": 0.2203332632780075,
"learning_rate": 0.0001,
"loss": 1.8315,
"step": 15900
},
{
"epoch": 0.21939779089120895,
"grad_norm": 0.17942380905151367,
"learning_rate": 0.0001,
"loss": 1.8313,
"step": 15950
},
{
"epoch": 0.2200855582607739,
"grad_norm": 0.2053511142730713,
"learning_rate": 0.0001,
"loss": 1.8322,
"step": 16000
},
{
"epoch": 0.2207733256303388,
"grad_norm": 0.18660666048526764,
"learning_rate": 0.0001,
"loss": 1.8315,
"step": 16050
},
{
"epoch": 0.2214610929999037,
"grad_norm": 0.20179618895053864,
"learning_rate": 0.0001,
"loss": 1.8309,
"step": 16100
},
{
"epoch": 0.22214886036946863,
"grad_norm": 0.1849927455186844,
"learning_rate": 0.0001,
"loss": 1.8349,
"step": 16150
},
{
"epoch": 0.22283662773903354,
"grad_norm": 0.16893066465854645,
"learning_rate": 0.0001,
"loss": 1.8333,
"step": 16200
},
{
"epoch": 0.22352439510859848,
"grad_norm": 0.1815815567970276,
"learning_rate": 0.0001,
"loss": 1.8277,
"step": 16250
},
{
"epoch": 0.2242121624781634,
"grad_norm": 0.17478667199611664,
"learning_rate": 0.0001,
"loss": 1.8324,
"step": 16300
},
{
"epoch": 0.2248999298477283,
"grad_norm": 0.20333503186702728,
"learning_rate": 0.0001,
"loss": 1.8299,
"step": 16350
},
{
"epoch": 0.22558769721729321,
"grad_norm": 0.19628338515758514,
"learning_rate": 0.0001,
"loss": 1.8322,
"step": 16400
},
{
"epoch": 0.22627546458685815,
"grad_norm": 0.19011887907981873,
"learning_rate": 0.0001,
"loss": 1.8301,
"step": 16450
},
{
"epoch": 0.22696323195642307,
"grad_norm": 0.19007809460163116,
"learning_rate": 0.0001,
"loss": 1.8306,
"step": 16500
},
{
"epoch": 0.22765099932598798,
"grad_norm": 0.18108965456485748,
"learning_rate": 0.0001,
"loss": 1.8304,
"step": 16550
},
{
"epoch": 0.2283387666955529,
"grad_norm": 0.16927501559257507,
"learning_rate": 0.0001,
"loss": 1.832,
"step": 16600
},
{
"epoch": 0.2290265340651178,
"grad_norm": 0.18328557908535004,
"learning_rate": 0.0001,
"loss": 1.8315,
"step": 16650
},
{
"epoch": 0.22971430143468274,
"grad_norm": 0.21978403627872467,
"learning_rate": 0.0001,
"loss": 1.8314,
"step": 16700
},
{
"epoch": 0.23040206880424766,
"grad_norm": 0.1928972601890564,
"learning_rate": 0.0001,
"loss": 1.8281,
"step": 16750
},
{
"epoch": 0.23108983617381257,
"grad_norm": 0.19355738162994385,
"learning_rate": 0.0001,
"loss": 1.8289,
"step": 16800
},
{
"epoch": 0.23177760354337748,
"grad_norm": 0.18013496696949005,
"learning_rate": 0.0001,
"loss": 1.831,
"step": 16850
},
{
"epoch": 0.2324653709129424,
"grad_norm": 0.1848910003900528,
"learning_rate": 0.0001,
"loss": 1.826,
"step": 16900
},
{
"epoch": 0.23315313828250733,
"grad_norm": 0.20185594260692596,
"learning_rate": 0.0001,
"loss": 1.8274,
"step": 16950
},
{
"epoch": 0.23384090565207225,
"grad_norm": 0.1898491382598877,
"learning_rate": 0.0001,
"loss": 1.8292,
"step": 17000
},
{
"epoch": 0.23452867302163716,
"grad_norm": 0.17610591650009155,
"learning_rate": 0.0001,
"loss": 1.831,
"step": 17050
},
{
"epoch": 0.23521644039120207,
"grad_norm": 0.2032867968082428,
"learning_rate": 0.0001,
"loss": 1.8306,
"step": 17100
},
{
"epoch": 0.235904207760767,
"grad_norm": 0.1812831312417984,
"learning_rate": 0.0001,
"loss": 1.8331,
"step": 17150
},
{
"epoch": 0.23659197513033192,
"grad_norm": 0.17079557478427887,
"learning_rate": 0.0001,
"loss": 1.8266,
"step": 17200
},
{
"epoch": 0.23727974249989683,
"grad_norm": 0.17599579691886902,
"learning_rate": 0.0001,
"loss": 1.8327,
"step": 17250
},
{
"epoch": 0.23796750986946175,
"grad_norm": 0.16692423820495605,
"learning_rate": 0.0001,
"loss": 1.8294,
"step": 17300
},
{
"epoch": 0.23865527723902666,
"grad_norm": 0.17235307395458221,
"learning_rate": 0.0001,
"loss": 1.8324,
"step": 17350
},
{
"epoch": 0.2393430446085916,
"grad_norm": 0.18419289588928223,
"learning_rate": 0.0001,
"loss": 1.8234,
"step": 17400
},
{
"epoch": 0.2400308119781565,
"grad_norm": 0.16880065202713013,
"learning_rate": 0.0001,
"loss": 1.8315,
"step": 17450
},
{
"epoch": 0.24071857934772142,
"grad_norm": 0.18046660721302032,
"learning_rate": 0.0001,
"loss": 1.8288,
"step": 17500
},
{
"epoch": 0.24140634671728634,
"grad_norm": 0.19775420427322388,
"learning_rate": 0.0001,
"loss": 1.8304,
"step": 17550
},
{
"epoch": 0.24209411408685128,
"grad_norm": 0.18596383929252625,
"learning_rate": 0.0001,
"loss": 1.8269,
"step": 17600
},
{
"epoch": 0.2427818814564162,
"grad_norm": 0.18525435030460358,
"learning_rate": 0.0001,
"loss": 1.8293,
"step": 17650
},
{
"epoch": 0.2434696488259811,
"grad_norm": 0.2105979025363922,
"learning_rate": 0.0001,
"loss": 1.8252,
"step": 17700
},
{
"epoch": 0.244157416195546,
"grad_norm": 0.18099245429039001,
"learning_rate": 0.0001,
"loss": 1.8271,
"step": 17750
},
{
"epoch": 0.24484518356511092,
"grad_norm": 0.17330291867256165,
"learning_rate": 0.0001,
"loss": 1.8261,
"step": 17800
},
{
"epoch": 0.24553295093467586,
"grad_norm": 0.17979152500629425,
"learning_rate": 0.0001,
"loss": 1.8304,
"step": 17850
},
{
"epoch": 0.24622071830424078,
"grad_norm": 0.19253650307655334,
"learning_rate": 0.0001,
"loss": 1.83,
"step": 17900
},
{
"epoch": 0.2469084856738057,
"grad_norm": 0.20440231263637543,
"learning_rate": 0.0001,
"loss": 1.8251,
"step": 17950
},
{
"epoch": 0.2475962530433706,
"grad_norm": 0.18242883682250977,
"learning_rate": 0.0001,
"loss": 1.8286,
"step": 18000
},
{
"epoch": 0.24828402041293554,
"grad_norm": 0.1742672622203827,
"learning_rate": 0.0001,
"loss": 1.8271,
"step": 18050
},
{
"epoch": 0.24897178778250045,
"grad_norm": 0.19099250435829163,
"learning_rate": 0.0001,
"loss": 1.8284,
"step": 18100
},
{
"epoch": 0.24965955515206537,
"grad_norm": 0.19839410483837128,
"learning_rate": 0.0001,
"loss": 1.8254,
"step": 18150
},
{
"epoch": 0.2503473225216303,
"grad_norm": 0.18187545239925385,
"learning_rate": 0.0001,
"loss": 1.8258,
"step": 18200
},
{
"epoch": 0.2510350898911952,
"grad_norm": 0.16419640183448792,
"learning_rate": 0.0001,
"loss": 1.825,
"step": 18250
},
{
"epoch": 0.2517228572607601,
"grad_norm": 0.1788015216588974,
"learning_rate": 0.0001,
"loss": 1.8257,
"step": 18300
},
{
"epoch": 0.25241062463032504,
"grad_norm": 0.2013292908668518,
"learning_rate": 0.0001,
"loss": 1.8345,
"step": 18350
},
{
"epoch": 0.25309839199989,
"grad_norm": 0.18886993825435638,
"learning_rate": 0.0001,
"loss": 1.8269,
"step": 18400
},
{
"epoch": 0.25378615936945487,
"grad_norm": 0.18426848948001862,
"learning_rate": 0.0001,
"loss": 1.8291,
"step": 18450
},
{
"epoch": 0.2544739267390198,
"grad_norm": 0.1836244910955429,
"learning_rate": 0.0001,
"loss": 1.8228,
"step": 18500
},
{
"epoch": 0.2551616941085847,
"grad_norm": 0.18584777414798737,
"learning_rate": 0.0001,
"loss": 1.8283,
"step": 18550
},
{
"epoch": 0.25584946147814963,
"grad_norm": 0.16920630633831024,
"learning_rate": 0.0001,
"loss": 1.8274,
"step": 18600
},
{
"epoch": 0.25653722884771457,
"grad_norm": 0.20111984014511108,
"learning_rate": 0.0001,
"loss": 1.8285,
"step": 18650
},
{
"epoch": 0.25722499621727946,
"grad_norm": 0.18769313395023346,
"learning_rate": 0.0001,
"loss": 1.8295,
"step": 18700
},
{
"epoch": 0.2579127635868444,
"grad_norm": 0.18159103393554688,
"learning_rate": 0.0001,
"loss": 1.8236,
"step": 18750
},
{
"epoch": 0.2586005309564093,
"grad_norm": 0.1929440200328827,
"learning_rate": 0.0001,
"loss": 1.8279,
"step": 18800
},
{
"epoch": 0.2592882983259742,
"grad_norm": 0.16436657309532166,
"learning_rate": 0.0001,
"loss": 1.823,
"step": 18850
},
{
"epoch": 0.25997606569553916,
"grad_norm": 0.1638740748167038,
"learning_rate": 0.0001,
"loss": 1.8251,
"step": 18900
},
{
"epoch": 0.26066383306510404,
"grad_norm": 0.18252821266651154,
"learning_rate": 0.0001,
"loss": 1.8251,
"step": 18950
},
{
"epoch": 0.261351600434669,
"grad_norm": 0.18031029403209686,
"learning_rate": 0.0001,
"loss": 1.8243,
"step": 19000
},
{
"epoch": 0.26203936780423387,
"grad_norm": 0.1770683377981186,
"learning_rate": 0.0001,
"loss": 1.8274,
"step": 19050
},
{
"epoch": 0.2627271351737988,
"grad_norm": 0.20250555872917175,
"learning_rate": 0.0001,
"loss": 1.8258,
"step": 19100
},
{
"epoch": 0.26341490254336375,
"grad_norm": 0.16491496562957764,
"learning_rate": 0.0001,
"loss": 1.8251,
"step": 19150
},
{
"epoch": 0.26410266991292863,
"grad_norm": 0.19582998752593994,
"learning_rate": 0.0001,
"loss": 1.824,
"step": 19200
},
{
"epoch": 0.2647904372824936,
"grad_norm": 0.17773911356925964,
"learning_rate": 0.0001,
"loss": 1.8195,
"step": 19250
},
{
"epoch": 0.2654782046520585,
"grad_norm": 0.18118888139724731,
"learning_rate": 0.0001,
"loss": 1.8239,
"step": 19300
},
{
"epoch": 0.2661659720216234,
"grad_norm": 0.15766191482543945,
"learning_rate": 0.0001,
"loss": 1.8232,
"step": 19350
},
{
"epoch": 0.26685373939118834,
"grad_norm": 0.17026937007904053,
"learning_rate": 0.0001,
"loss": 1.8223,
"step": 19400
},
{
"epoch": 0.2675415067607532,
"grad_norm": 0.18863512575626373,
"learning_rate": 0.0001,
"loss": 1.8257,
"step": 19450
},
{
"epoch": 0.26822927413031816,
"grad_norm": 0.18321500718593597,
"learning_rate": 0.0001,
"loss": 1.8238,
"step": 19500
},
{
"epoch": 0.2689170414998831,
"grad_norm": 0.20935237407684326,
"learning_rate": 0.0001,
"loss": 1.8229,
"step": 19550
},
{
"epoch": 0.269604808869448,
"grad_norm": 0.19490981101989746,
"learning_rate": 0.0001,
"loss": 1.8194,
"step": 19600
},
{
"epoch": 0.2702925762390129,
"grad_norm": 0.19290666282176971,
"learning_rate": 0.0001,
"loss": 1.8258,
"step": 19650
},
{
"epoch": 0.2709803436085778,
"grad_norm": 0.1819174438714981,
"learning_rate": 0.0001,
"loss": 1.8224,
"step": 19700
},
{
"epoch": 0.27166811097814275,
"grad_norm": 0.18501299619674683,
"learning_rate": 0.0001,
"loss": 1.8297,
"step": 19750
},
{
"epoch": 0.2723558783477077,
"grad_norm": 0.19111846387386322,
"learning_rate": 0.0001,
"loss": 1.8226,
"step": 19800
},
{
"epoch": 0.2730436457172726,
"grad_norm": 0.18800359964370728,
"learning_rate": 0.0001,
"loss": 1.8215,
"step": 19850
},
{
"epoch": 0.2737314130868375,
"grad_norm": 0.18408334255218506,
"learning_rate": 0.0001,
"loss": 1.8239,
"step": 19900
},
{
"epoch": 0.2744191804564024,
"grad_norm": 0.19500131905078888,
"learning_rate": 0.0001,
"loss": 1.8232,
"step": 19950
},
{
"epoch": 0.27510694782596734,
"grad_norm": 0.18263010680675507,
"learning_rate": 0.0001,
"loss": 1.8246,
"step": 20000
},
{
"epoch": 0.2757947151955323,
"grad_norm": 0.1732577383518219,
"learning_rate": 0.0001,
"loss": 1.8241,
"step": 20050
},
{
"epoch": 0.27648248256509717,
"grad_norm": 0.1958979219198227,
"learning_rate": 0.0001,
"loss": 1.8215,
"step": 20100
},
{
"epoch": 0.2771702499346621,
"grad_norm": 0.1755562722682953,
"learning_rate": 0.0001,
"loss": 1.8275,
"step": 20150
},
{
"epoch": 0.27785801730422705,
"grad_norm": 0.17292717099189758,
"learning_rate": 0.0001,
"loss": 1.8221,
"step": 20200
},
{
"epoch": 0.27854578467379193,
"grad_norm": 0.16997367143630981,
"learning_rate": 0.0001,
"loss": 1.8221,
"step": 20250
},
{
"epoch": 0.27923355204335687,
"grad_norm": 0.1903601735830307,
"learning_rate": 0.0001,
"loss": 1.8243,
"step": 20300
},
{
"epoch": 0.27992131941292175,
"grad_norm": 0.17447033524513245,
"learning_rate": 0.0001,
"loss": 1.8229,
"step": 20350
},
{
"epoch": 0.2806090867824867,
"grad_norm": 0.18861395120620728,
"learning_rate": 0.0001,
"loss": 1.8222,
"step": 20400
},
{
"epoch": 0.28129685415205163,
"grad_norm": 0.17015644907951355,
"learning_rate": 0.0001,
"loss": 1.8207,
"step": 20450
},
{
"epoch": 0.2819846215216165,
"grad_norm": 0.19356681406497955,
"learning_rate": 0.0001,
"loss": 1.8202,
"step": 20500
},
{
"epoch": 0.28267238889118146,
"grad_norm": 0.1988779753446579,
"learning_rate": 0.0001,
"loss": 1.8199,
"step": 20550
},
{
"epoch": 0.28336015626074634,
"grad_norm": 0.1967942714691162,
"learning_rate": 0.0001,
"loss": 1.8217,
"step": 20600
},
{
"epoch": 0.2840479236303113,
"grad_norm": 0.18917816877365112,
"learning_rate": 0.0001,
"loss": 1.8229,
"step": 20650
},
{
"epoch": 0.2847356909998762,
"grad_norm": 0.16583094000816345,
"learning_rate": 0.0001,
"loss": 1.8219,
"step": 20700
},
{
"epoch": 0.2854234583694411,
"grad_norm": 0.19918115437030792,
"learning_rate": 0.0001,
"loss": 1.8246,
"step": 20750
},
{
"epoch": 0.28611122573900605,
"grad_norm": 0.1981818974018097,
"learning_rate": 0.0001,
"loss": 1.8211,
"step": 20800
},
{
"epoch": 0.28679899310857093,
"grad_norm": 0.1838293969631195,
"learning_rate": 0.0001,
"loss": 1.8224,
"step": 20850
},
{
"epoch": 0.28748676047813587,
"grad_norm": 0.20068101584911346,
"learning_rate": 0.0001,
"loss": 1.82,
"step": 20900
},
{
"epoch": 0.2881745278477008,
"grad_norm": 0.17375263571739197,
"learning_rate": 0.0001,
"loss": 1.8195,
"step": 20950
},
{
"epoch": 0.2888622952172657,
"grad_norm": 0.16706246137619019,
"learning_rate": 0.0001,
"loss": 1.826,
"step": 21000
},
{
"epoch": 0.28955006258683064,
"grad_norm": 0.20021022856235504,
"learning_rate": 0.0001,
"loss": 1.8207,
"step": 21050
},
{
"epoch": 0.2902378299563955,
"grad_norm": 0.20570990443229675,
"learning_rate": 0.0001,
"loss": 1.8221,
"step": 21100
},
{
"epoch": 0.29092559732596046,
"grad_norm": 0.2043515294790268,
"learning_rate": 0.0001,
"loss": 1.8239,
"step": 21150
},
{
"epoch": 0.2916133646955254,
"grad_norm": 0.17122073471546173,
"learning_rate": 0.0001,
"loss": 1.8203,
"step": 21200
},
{
"epoch": 0.2923011320650903,
"grad_norm": 0.19589883089065552,
"learning_rate": 0.0001,
"loss": 1.8206,
"step": 21250
},
{
"epoch": 0.2929888994346552,
"grad_norm": 0.19675767421722412,
"learning_rate": 0.0001,
"loss": 1.8244,
"step": 21300
},
{
"epoch": 0.29367666680422017,
"grad_norm": 0.1788429468870163,
"learning_rate": 0.0001,
"loss": 1.8225,
"step": 21350
},
{
"epoch": 0.29436443417378505,
"grad_norm": 0.17564085125923157,
"learning_rate": 0.0001,
"loss": 1.8242,
"step": 21400
},
{
"epoch": 0.29505220154335,
"grad_norm": 0.1807086318731308,
"learning_rate": 0.0001,
"loss": 1.8245,
"step": 21450
},
{
"epoch": 0.2957399689129149,
"grad_norm": 0.1772526502609253,
"learning_rate": 0.0001,
"loss": 1.8231,
"step": 21500
},
{
"epoch": 0.2964277362824798,
"grad_norm": 0.1903577297925949,
"learning_rate": 0.0001,
"loss": 1.8209,
"step": 21550
},
{
"epoch": 0.29711550365204475,
"grad_norm": 0.17995303869247437,
"learning_rate": 0.0001,
"loss": 1.817,
"step": 21600
},
{
"epoch": 0.29780327102160964,
"grad_norm": 0.1937420666217804,
"learning_rate": 0.0001,
"loss": 1.8241,
"step": 21650
},
{
"epoch": 0.2984910383911746,
"grad_norm": 0.1729700267314911,
"learning_rate": 0.0001,
"loss": 1.822,
"step": 21700
},
{
"epoch": 0.29917880576073946,
"grad_norm": 0.16370828449726105,
"learning_rate": 0.0001,
"loss": 1.8217,
"step": 21750
},
{
"epoch": 0.2998665731303044,
"grad_norm": 0.17373540997505188,
"learning_rate": 0.0001,
"loss": 1.8191,
"step": 21800
},
{
"epoch": 0.30055434049986934,
"grad_norm": 0.19695748388767242,
"learning_rate": 0.0001,
"loss": 1.8236,
"step": 21850
},
{
"epoch": 0.30124210786943423,
"grad_norm": 0.20299525558948517,
"learning_rate": 0.0001,
"loss": 1.8181,
"step": 21900
},
{
"epoch": 0.30192987523899917,
"grad_norm": 0.5943254828453064,
"learning_rate": 0.0001,
"loss": 1.8207,
"step": 21950
},
{
"epoch": 0.30261764260856405,
"grad_norm": 0.1915915608406067,
"learning_rate": 0.0001,
"loss": 1.8245,
"step": 22000
},
{
"epoch": 0.303305409978129,
"grad_norm": 0.16212280094623566,
"learning_rate": 0.0001,
"loss": 1.8226,
"step": 22050
},
{
"epoch": 0.30399317734769393,
"grad_norm": 0.16871103644371033,
"learning_rate": 0.0001,
"loss": 1.8193,
"step": 22100
},
{
"epoch": 0.3046809447172588,
"grad_norm": 0.1811041682958603,
"learning_rate": 0.0001,
"loss": 1.8187,
"step": 22150
},
{
"epoch": 0.30536871208682376,
"grad_norm": 0.1868380606174469,
"learning_rate": 0.0001,
"loss": 1.8219,
"step": 22200
},
{
"epoch": 0.3060564794563887,
"grad_norm": 0.18134795129299164,
"learning_rate": 0.0001,
"loss": 1.8207,
"step": 22250
},
{
"epoch": 0.3067442468259536,
"grad_norm": 0.17329555749893188,
"learning_rate": 0.0001,
"loss": 1.8193,
"step": 22300
},
{
"epoch": 0.3074320141955185,
"grad_norm": 0.18371562659740448,
"learning_rate": 0.0001,
"loss": 1.821,
"step": 22350
},
{
"epoch": 0.3081197815650834,
"grad_norm": 0.17543677985668182,
"learning_rate": 0.0001,
"loss": 1.8182,
"step": 22400
},
{
"epoch": 0.30880754893464835,
"grad_norm": 0.18362955749034882,
"learning_rate": 0.0001,
"loss": 1.8187,
"step": 22450
},
{
"epoch": 0.3094953163042133,
"grad_norm": 0.20341430604457855,
"learning_rate": 0.0001,
"loss": 1.8198,
"step": 22500
},
{
"epoch": 0.31018308367377817,
"grad_norm": 0.1833573579788208,
"learning_rate": 0.0001,
"loss": 1.8167,
"step": 22550
},
{
"epoch": 0.3108708510433431,
"grad_norm": 0.1798466444015503,
"learning_rate": 0.0001,
"loss": 1.8204,
"step": 22600
},
{
"epoch": 0.311558618412908,
"grad_norm": 0.18346908688545227,
"learning_rate": 0.0001,
"loss": 1.8197,
"step": 22650
},
{
"epoch": 0.31224638578247293,
"grad_norm": 0.1842503696680069,
"learning_rate": 0.0001,
"loss": 1.822,
"step": 22700
},
{
"epoch": 0.3129341531520379,
"grad_norm": 0.1917971521615982,
"learning_rate": 0.0001,
"loss": 1.8205,
"step": 22750
},
{
"epoch": 0.31362192052160276,
"grad_norm": 0.18140938878059387,
"learning_rate": 0.0001,
"loss": 1.8187,
"step": 22800
},
{
"epoch": 0.3143096878911677,
"grad_norm": 0.17349034547805786,
"learning_rate": 0.0001,
"loss": 1.8204,
"step": 22850
},
{
"epoch": 0.3149974552607326,
"grad_norm": 0.17727358639240265,
"learning_rate": 0.0001,
"loss": 1.8203,
"step": 22900
},
{
"epoch": 0.3156852226302975,
"grad_norm": 0.1764019876718521,
"learning_rate": 0.0001,
"loss": 1.8197,
"step": 22950
},
{
"epoch": 0.31637298999986246,
"grad_norm": 0.18336281180381775,
"learning_rate": 0.0001,
"loss": 1.8168,
"step": 23000
},
{
"epoch": 0.31706075736942735,
"grad_norm": 0.15488466620445251,
"learning_rate": 0.0001,
"loss": 1.819,
"step": 23050
},
{
"epoch": 0.3177485247389923,
"grad_norm": 0.16988332569599152,
"learning_rate": 0.0001,
"loss": 1.8151,
"step": 23100
},
{
"epoch": 0.31843629210855723,
"grad_norm": 0.16344988346099854,
"learning_rate": 0.0001,
"loss": 1.819,
"step": 23150
},
{
"epoch": 0.3191240594781221,
"grad_norm": 0.17984721064567566,
"learning_rate": 0.0001,
"loss": 1.8182,
"step": 23200
},
{
"epoch": 0.31981182684768705,
"grad_norm": 0.19572113454341888,
"learning_rate": 0.0001,
"loss": 1.8158,
"step": 23250
},
{
"epoch": 0.32049959421725194,
"grad_norm": 0.21890446543693542,
"learning_rate": 0.0001,
"loss": 1.8158,
"step": 23300
},
{
"epoch": 0.3211873615868169,
"grad_norm": 0.1672099530696869,
"learning_rate": 0.0001,
"loss": 1.8183,
"step": 23350
},
{
"epoch": 0.3218751289563818,
"grad_norm": 0.18066146969795227,
"learning_rate": 0.0001,
"loss": 1.8194,
"step": 23400
},
{
"epoch": 0.3225628963259467,
"grad_norm": 0.1749303936958313,
"learning_rate": 0.0001,
"loss": 1.8192,
"step": 23450
},
{
"epoch": 0.32325066369551164,
"grad_norm": 0.1646299809217453,
"learning_rate": 0.0001,
"loss": 1.819,
"step": 23500
},
{
"epoch": 0.3239384310650765,
"grad_norm": 0.204520583152771,
"learning_rate": 0.0001,
"loss": 1.8166,
"step": 23550
},
{
"epoch": 0.32462619843464147,
"grad_norm": 0.166048064827919,
"learning_rate": 0.0001,
"loss": 1.8163,
"step": 23600
},
{
"epoch": 0.3253139658042064,
"grad_norm": 0.17722272872924805,
"learning_rate": 0.0001,
"loss": 1.8158,
"step": 23650
},
{
"epoch": 0.3260017331737713,
"grad_norm": 0.1896638125181198,
"learning_rate": 0.0001,
"loss": 1.8165,
"step": 23700
},
{
"epoch": 0.32668950054333623,
"grad_norm": 0.16389790177345276,
"learning_rate": 0.0001,
"loss": 1.8163,
"step": 23750
},
{
"epoch": 0.3273772679129011,
"grad_norm": 0.17973138391971588,
"learning_rate": 0.0001,
"loss": 1.8201,
"step": 23800
},
{
"epoch": 0.32806503528246606,
"grad_norm": 0.20095448195934296,
"learning_rate": 0.0001,
"loss": 1.8174,
"step": 23850
},
{
"epoch": 0.328752802652031,
"grad_norm": 0.18039678037166595,
"learning_rate": 0.0001,
"loss": 1.8179,
"step": 23900
},
{
"epoch": 0.3294405700215959,
"grad_norm": 0.1760893315076828,
"learning_rate": 0.0001,
"loss": 1.816,
"step": 23950
},
{
"epoch": 0.3301283373911608,
"grad_norm": 0.171057790517807,
"learning_rate": 0.0001,
"loss": 1.816,
"step": 24000
},
{
"epoch": 0.33081610476072576,
"grad_norm": 0.17639483511447906,
"learning_rate": 0.0001,
"loss": 1.8157,
"step": 24050
},
{
"epoch": 0.33150387213029064,
"grad_norm": 0.16385740041732788,
"learning_rate": 0.0001,
"loss": 1.8195,
"step": 24100
},
{
"epoch": 0.3321916394998556,
"grad_norm": 0.18215522170066833,
"learning_rate": 0.0001,
"loss": 1.8157,
"step": 24150
},
{
"epoch": 0.33287940686942047,
"grad_norm": 0.17613132297992706,
"learning_rate": 0.0001,
"loss": 1.8152,
"step": 24200
},
{
"epoch": 0.3335671742389854,
"grad_norm": 0.16723348200321198,
"learning_rate": 0.0001,
"loss": 1.8141,
"step": 24250
},
{
"epoch": 0.33425494160855035,
"grad_norm": 0.16092203557491302,
"learning_rate": 0.0001,
"loss": 1.8173,
"step": 24300
},
{
"epoch": 0.33494270897811523,
"grad_norm": 0.17928454279899597,
"learning_rate": 0.0001,
"loss": 1.8188,
"step": 24350
},
{
"epoch": 0.3356304763476802,
"grad_norm": 0.18230123817920685,
"learning_rate": 0.0001,
"loss": 1.8152,
"step": 24400
},
{
"epoch": 0.33631824371724506,
"grad_norm": 0.1699696034193039,
"learning_rate": 0.0001,
"loss": 1.8194,
"step": 24450
},
{
"epoch": 0.33700601108681,
"grad_norm": 0.1800839602947235,
"learning_rate": 0.0001,
"loss": 1.8126,
"step": 24500
},
{
"epoch": 0.33769377845637494,
"grad_norm": 0.19913671910762787,
"learning_rate": 0.0001,
"loss": 1.8148,
"step": 24550
},
{
"epoch": 0.3383815458259398,
"grad_norm": 0.16596053540706635,
"learning_rate": 0.0001,
"loss": 1.818,
"step": 24600
},
{
"epoch": 0.33906931319550476,
"grad_norm": 0.1894855797290802,
"learning_rate": 0.0001,
"loss": 1.8142,
"step": 24650
},
{
"epoch": 0.33975708056506965,
"grad_norm": 0.1800161600112915,
"learning_rate": 0.0001,
"loss": 1.8152,
"step": 24700
},
{
"epoch": 0.3404448479346346,
"grad_norm": 0.17433103919029236,
"learning_rate": 0.0001,
"loss": 1.815,
"step": 24750
},
{
"epoch": 0.3411326153041995,
"grad_norm": 0.18210847675800323,
"learning_rate": 0.0001,
"loss": 1.8168,
"step": 24800
},
{
"epoch": 0.3418203826737644,
"grad_norm": 0.17840790748596191,
"learning_rate": 0.0001,
"loss": 1.8159,
"step": 24850
},
{
"epoch": 0.34250815004332935,
"grad_norm": 0.18368154764175415,
"learning_rate": 0.0001,
"loss": 1.8171,
"step": 24900
},
{
"epoch": 0.34319591741289424,
"grad_norm": 0.17999804019927979,
"learning_rate": 0.0001,
"loss": 1.817,
"step": 24950
},
{
"epoch": 0.3438836847824592,
"grad_norm": 0.19299517571926117,
"learning_rate": 0.0001,
"loss": 1.8161,
"step": 25000
},
{
"epoch": 0.3445714521520241,
"grad_norm": 0.17866362631320953,
"learning_rate": 0.0001,
"loss": 1.8121,
"step": 25050
},
{
"epoch": 0.345259219521589,
"grad_norm": 0.16793055832386017,
"learning_rate": 0.0001,
"loss": 1.8137,
"step": 25100
},
{
"epoch": 0.34594698689115394,
"grad_norm": 0.18356679379940033,
"learning_rate": 0.0001,
"loss": 1.8158,
"step": 25150
},
{
"epoch": 0.3466347542607189,
"grad_norm": 0.18392959237098694,
"learning_rate": 0.0001,
"loss": 1.8135,
"step": 25200
},
{
"epoch": 0.34732252163028376,
"grad_norm": 0.18158595263957977,
"learning_rate": 0.0001,
"loss": 1.8168,
"step": 25250
},
{
"epoch": 0.3480102889998487,
"grad_norm": 0.1956174075603485,
"learning_rate": 0.0001,
"loss": 1.8137,
"step": 25300
},
{
"epoch": 0.3486980563694136,
"grad_norm": 0.17629751563072205,
"learning_rate": 0.0001,
"loss": 1.8161,
"step": 25350
},
{
"epoch": 0.34938582373897853,
"grad_norm": 0.1842150092124939,
"learning_rate": 0.0001,
"loss": 1.8112,
"step": 25400
},
{
"epoch": 0.35007359110854347,
"grad_norm": 0.18889479339122772,
"learning_rate": 0.0001,
"loss": 1.8152,
"step": 25450
},
{
"epoch": 0.35076135847810835,
"grad_norm": 0.16872894763946533,
"learning_rate": 0.0001,
"loss": 1.818,
"step": 25500
},
{
"epoch": 0.3514491258476733,
"grad_norm": 0.16502858698368073,
"learning_rate": 0.0001,
"loss": 1.8127,
"step": 25550
},
{
"epoch": 0.3521368932172382,
"grad_norm": 0.1778111755847931,
"learning_rate": 0.0001,
"loss": 1.8202,
"step": 25600
},
{
"epoch": 0.3528246605868031,
"grad_norm": 0.16866064071655273,
"learning_rate": 0.0001,
"loss": 1.8155,
"step": 25650
},
{
"epoch": 0.35351242795636806,
"grad_norm": 0.1845904141664505,
"learning_rate": 0.0001,
"loss": 1.8171,
"step": 25700
},
{
"epoch": 0.35420019532593294,
"grad_norm": 0.19138947129249573,
"learning_rate": 0.0001,
"loss": 1.8164,
"step": 25750
},
{
"epoch": 0.3548879626954979,
"grad_norm": 0.18222880363464355,
"learning_rate": 0.0001,
"loss": 1.8131,
"step": 25800
},
{
"epoch": 0.35557573006506277,
"grad_norm": 0.17819440364837646,
"learning_rate": 0.0001,
"loss": 1.8147,
"step": 25850
},
{
"epoch": 0.3562634974346277,
"grad_norm": 0.20162558555603027,
"learning_rate": 0.0001,
"loss": 1.8188,
"step": 25900
},
{
"epoch": 0.35695126480419265,
"grad_norm": 0.17715832591056824,
"learning_rate": 0.0001,
"loss": 1.813,
"step": 25950
},
{
"epoch": 0.35763903217375753,
"grad_norm": 0.16032275557518005,
"learning_rate": 0.0001,
"loss": 1.8135,
"step": 26000
},
{
"epoch": 0.35832679954332247,
"grad_norm": 0.17023804783821106,
"learning_rate": 0.0001,
"loss": 1.8168,
"step": 26050
},
{
"epoch": 0.3590145669128874,
"grad_norm": 0.19815494120121002,
"learning_rate": 0.0001,
"loss": 1.8123,
"step": 26100
},
{
"epoch": 0.3597023342824523,
"grad_norm": 0.19192709028720856,
"learning_rate": 0.0001,
"loss": 1.8164,
"step": 26150
},
{
"epoch": 0.36039010165201724,
"grad_norm": 0.18932852149009705,
"learning_rate": 0.0001,
"loss": 1.813,
"step": 26200
},
{
"epoch": 0.3610778690215821,
"grad_norm": 0.16477489471435547,
"learning_rate": 0.0001,
"loss": 1.8147,
"step": 26250
},
{
"epoch": 0.36176563639114706,
"grad_norm": 0.19172504544258118,
"learning_rate": 0.0001,
"loss": 1.814,
"step": 26300
},
{
"epoch": 0.362453403760712,
"grad_norm": 0.19087177515029907,
"learning_rate": 0.0001,
"loss": 1.8123,
"step": 26350
},
{
"epoch": 0.3631411711302769,
"grad_norm": 0.1714990735054016,
"learning_rate": 0.0001,
"loss": 1.8133,
"step": 26400
},
{
"epoch": 0.3638289384998418,
"grad_norm": 0.16309858858585358,
"learning_rate": 0.0001,
"loss": 1.8168,
"step": 26450
},
{
"epoch": 0.3645167058694067,
"grad_norm": 0.1791163831949234,
"learning_rate": 0.0001,
"loss": 1.818,
"step": 26500
},
{
"epoch": 0.36520447323897165,
"grad_norm": 0.17130139470100403,
"learning_rate": 0.0001,
"loss": 1.8182,
"step": 26550
},
{
"epoch": 0.3658922406085366,
"grad_norm": 0.17432111501693726,
"learning_rate": 0.0001,
"loss": 1.8177,
"step": 26600
},
{
"epoch": 0.3665800079781015,
"grad_norm": 0.15398447215557098,
"learning_rate": 0.0001,
"loss": 1.8161,
"step": 26650
},
{
"epoch": 0.3672677753476664,
"grad_norm": 0.2831607162952423,
"learning_rate": 0.0001,
"loss": 1.815,
"step": 26700
},
{
"epoch": 0.3679555427172313,
"grad_norm": 0.17564986646175385,
"learning_rate": 0.0001,
"loss": 1.8129,
"step": 26750
},
{
"epoch": 0.36864331008679624,
"grad_norm": 0.18288859724998474,
"learning_rate": 0.0001,
"loss": 1.813,
"step": 26800
},
{
"epoch": 0.3693310774563612,
"grad_norm": 0.1621311753988266,
"learning_rate": 0.0001,
"loss": 1.8069,
"step": 26850
},
{
"epoch": 0.37001884482592606,
"grad_norm": 0.16472625732421875,
"learning_rate": 0.0001,
"loss": 1.8136,
"step": 26900
},
{
"epoch": 0.370706612195491,
"grad_norm": 0.16450871527194977,
"learning_rate": 0.0001,
"loss": 1.8149,
"step": 26950
},
{
"epoch": 0.37139437956505594,
"grad_norm": 0.1769149899482727,
"learning_rate": 0.0001,
"loss": 1.8078,
"step": 27000
},
{
"epoch": 0.3720821469346208,
"grad_norm": 0.1917348951101303,
"learning_rate": 0.0001,
"loss": 1.8121,
"step": 27050
},
{
"epoch": 0.37276991430418577,
"grad_norm": 0.18277530372142792,
"learning_rate": 0.0001,
"loss": 1.812,
"step": 27100
},
{
"epoch": 0.37345768167375065,
"grad_norm": 0.1814720183610916,
"learning_rate": 0.0001,
"loss": 1.8092,
"step": 27150
},
{
"epoch": 0.3741454490433156,
"grad_norm": 0.17358410358428955,
"learning_rate": 0.0001,
"loss": 1.8118,
"step": 27200
},
{
"epoch": 0.37483321641288053,
"grad_norm": 0.18569444119930267,
"learning_rate": 0.0001,
"loss": 1.8115,
"step": 27250
},
{
"epoch": 0.3755209837824454,
"grad_norm": 0.15812502801418304,
"learning_rate": 0.0001,
"loss": 1.813,
"step": 27300
},
{
"epoch": 0.37620875115201036,
"grad_norm": 0.19051866233348846,
"learning_rate": 0.0001,
"loss": 1.8162,
"step": 27350
},
{
"epoch": 0.37689651852157524,
"grad_norm": 0.1646508276462555,
"learning_rate": 0.0001,
"loss": 1.8109,
"step": 27400
},
{
"epoch": 0.3775842858911402,
"grad_norm": 0.16069738566875458,
"learning_rate": 0.0001,
"loss": 1.8088,
"step": 27450
},
{
"epoch": 0.3782720532607051,
"grad_norm": 0.18708954751491547,
"learning_rate": 0.0001,
"loss": 1.809,
"step": 27500
},
{
"epoch": 0.37895982063027,
"grad_norm": 0.18674279749393463,
"learning_rate": 0.0001,
"loss": 1.8141,
"step": 27550
},
{
"epoch": 0.37964758799983495,
"grad_norm": 0.17408175766468048,
"learning_rate": 0.0001,
"loss": 1.8126,
"step": 27600
},
{
"epoch": 0.38033535536939983,
"grad_norm": 0.15924981236457825,
"learning_rate": 0.0001,
"loss": 1.8122,
"step": 27650
},
{
"epoch": 0.38102312273896477,
"grad_norm": 0.17203688621520996,
"learning_rate": 0.0001,
"loss": 1.8118,
"step": 27700
},
{
"epoch": 0.3817108901085297,
"grad_norm": 0.18587364256381989,
"learning_rate": 0.0001,
"loss": 1.8129,
"step": 27750
},
{
"epoch": 0.3823986574780946,
"grad_norm": 0.18941548466682434,
"learning_rate": 0.0001,
"loss": 1.8099,
"step": 27800
},
{
"epoch": 0.38308642484765953,
"grad_norm": 0.14958040416240692,
"learning_rate": 0.0001,
"loss": 1.8141,
"step": 27850
},
{
"epoch": 0.3837741922172244,
"grad_norm": 0.17599830031394958,
"learning_rate": 0.0001,
"loss": 1.8141,
"step": 27900
},
{
"epoch": 0.38446195958678936,
"grad_norm": 0.17611196637153625,
"learning_rate": 0.0001,
"loss": 1.8114,
"step": 27950
},
{
"epoch": 0.3851497269563543,
"grad_norm": 0.1823156625032425,
"learning_rate": 0.0001,
"loss": 1.8116,
"step": 28000
},
{
"epoch": 0.3858374943259192,
"grad_norm": 0.17287470400333405,
"learning_rate": 0.0001,
"loss": 1.812,
"step": 28050
},
{
"epoch": 0.3865252616954841,
"grad_norm": 0.17163801193237305,
"learning_rate": 0.0001,
"loss": 1.8102,
"step": 28100
},
{
"epoch": 0.38721302906504906,
"grad_norm": 0.16863061487674713,
"learning_rate": 0.0001,
"loss": 1.8085,
"step": 28150
},
{
"epoch": 0.38790079643461395,
"grad_norm": 0.1910269409418106,
"learning_rate": 0.0001,
"loss": 1.8128,
"step": 28200
},
{
"epoch": 0.3885885638041789,
"grad_norm": 0.16055557131767273,
"learning_rate": 0.0001,
"loss": 1.8122,
"step": 28250
},
{
"epoch": 0.3892763311737438,
"grad_norm": 0.17268548905849457,
"learning_rate": 0.0001,
"loss": 1.8084,
"step": 28300
},
{
"epoch": 0.3899640985433087,
"grad_norm": 0.16962352395057678,
"learning_rate": 0.0001,
"loss": 1.8131,
"step": 28350
},
{
"epoch": 0.39065186591287365,
"grad_norm": 0.1744450330734253,
"learning_rate": 0.0001,
"loss": 1.811,
"step": 28400
},
{
"epoch": 0.39133963328243854,
"grad_norm": 0.17569154500961304,
"learning_rate": 0.0001,
"loss": 1.8165,
"step": 28450
},
{
"epoch": 0.3920274006520035,
"grad_norm": 0.17034880816936493,
"learning_rate": 0.0001,
"loss": 1.8125,
"step": 28500
},
{
"epoch": 0.39271516802156836,
"grad_norm": 0.16873665153980255,
"learning_rate": 0.0001,
"loss": 1.8124,
"step": 28550
},
{
"epoch": 0.3934029353911333,
"grad_norm": 0.1771818846464157,
"learning_rate": 0.0001,
"loss": 1.8132,
"step": 28600
},
{
"epoch": 0.39409070276069824,
"grad_norm": 0.17641928791999817,
"learning_rate": 0.0001,
"loss": 1.8131,
"step": 28650
},
{
"epoch": 0.3947784701302631,
"grad_norm": 0.16521941125392914,
"learning_rate": 0.0001,
"loss": 1.8082,
"step": 28700
},
{
"epoch": 0.39546623749982807,
"grad_norm": 0.17453192174434662,
"learning_rate": 0.0001,
"loss": 1.813,
"step": 28750
},
{
"epoch": 0.39615400486939295,
"grad_norm": 0.17454297840595245,
"learning_rate": 0.0001,
"loss": 1.8129,
"step": 28800
},
{
"epoch": 0.3968417722389579,
"grad_norm": 0.155872642993927,
"learning_rate": 0.0001,
"loss": 1.8116,
"step": 28850
},
{
"epoch": 0.39752953960852283,
"grad_norm": 0.17079751193523407,
"learning_rate": 0.0001,
"loss": 1.8097,
"step": 28900
},
{
"epoch": 0.3982173069780877,
"grad_norm": 0.1715528666973114,
"learning_rate": 0.0001,
"loss": 1.8082,
"step": 28950
},
{
"epoch": 0.39890507434765266,
"grad_norm": 0.17352135479450226,
"learning_rate": 0.0001,
"loss": 1.8058,
"step": 29000
},
{
"epoch": 0.3995928417172176,
"grad_norm": 0.17056448757648468,
"learning_rate": 0.0001,
"loss": 1.8093,
"step": 29050
},
{
"epoch": 0.4002806090867825,
"grad_norm": 0.16389931738376617,
"learning_rate": 0.0001,
"loss": 1.8079,
"step": 29100
},
{
"epoch": 0.4009683764563474,
"grad_norm": 0.17660637199878693,
"learning_rate": 0.0001,
"loss": 1.8101,
"step": 29150
},
{
"epoch": 0.4016561438259123,
"grad_norm": 0.1871548742055893,
"learning_rate": 0.0001,
"loss": 1.8124,
"step": 29200
},
{
"epoch": 0.40234391119547724,
"grad_norm": 0.17292185127735138,
"learning_rate": 0.0001,
"loss": 1.8074,
"step": 29250
},
{
"epoch": 0.4030316785650422,
"grad_norm": 0.16299203038215637,
"learning_rate": 0.0001,
"loss": 1.81,
"step": 29300
},
{
"epoch": 0.40371944593460707,
"grad_norm": 0.20287854969501495,
"learning_rate": 0.0001,
"loss": 1.8141,
"step": 29350
},
{
"epoch": 0.404407213304172,
"grad_norm": 0.1632193922996521,
"learning_rate": 0.0001,
"loss": 1.8102,
"step": 29400
},
{
"epoch": 0.4050949806737369,
"grad_norm": 0.16991235315799713,
"learning_rate": 0.0001,
"loss": 1.8084,
"step": 29450
},
{
"epoch": 0.40578274804330183,
"grad_norm": 0.17448389530181885,
"learning_rate": 0.0001,
"loss": 1.8108,
"step": 29500
},
{
"epoch": 0.4064705154128668,
"grad_norm": 0.1706276535987854,
"learning_rate": 0.0001,
"loss": 1.8091,
"step": 29550
},
{
"epoch": 0.40715828278243166,
"grad_norm": 0.187569260597229,
"learning_rate": 0.0001,
"loss": 1.8077,
"step": 29600
},
{
"epoch": 0.4078460501519966,
"grad_norm": 0.18289169669151306,
"learning_rate": 0.0001,
"loss": 1.8062,
"step": 29650
},
{
"epoch": 0.4085338175215615,
"grad_norm": 0.17096656560897827,
"learning_rate": 0.0001,
"loss": 1.8117,
"step": 29700
},
{
"epoch": 0.4092215848911264,
"grad_norm": 0.18183813989162445,
"learning_rate": 0.0001,
"loss": 1.8108,
"step": 29750
},
{
"epoch": 0.40990935226069136,
"grad_norm": 0.18215380609035492,
"learning_rate": 0.0001,
"loss": 1.8113,
"step": 29800
},
{
"epoch": 0.41059711963025625,
"grad_norm": 0.19367296993732452,
"learning_rate": 0.0001,
"loss": 1.8111,
"step": 29850
},
{
"epoch": 0.4112848869998212,
"grad_norm": 0.18118008971214294,
"learning_rate": 0.0001,
"loss": 1.81,
"step": 29900
},
{
"epoch": 0.4119726543693861,
"grad_norm": 0.16475409269332886,
"learning_rate": 0.0001,
"loss": 1.8095,
"step": 29950
},
{
"epoch": 0.412660421738951,
"grad_norm": 0.19968119263648987,
"learning_rate": 0.0001,
"loss": 1.8078,
"step": 30000
},
{
"epoch": 0.41334818910851595,
"grad_norm": 0.2024579495191574,
"learning_rate": 0.0001,
"loss": 1.8097,
"step": 30050
},
{
"epoch": 0.41403595647808084,
"grad_norm": 0.1678769886493683,
"learning_rate": 0.0001,
"loss": 1.8099,
"step": 30100
},
{
"epoch": 0.4147237238476458,
"grad_norm": 0.19947120547294617,
"learning_rate": 0.0001,
"loss": 1.8124,
"step": 30150
},
{
"epoch": 0.4154114912172107,
"grad_norm": 0.1908283233642578,
"learning_rate": 0.0001,
"loss": 1.8094,
"step": 30200
},
{
"epoch": 0.4160992585867756,
"grad_norm": 0.16802892088890076,
"learning_rate": 0.0001,
"loss": 1.8029,
"step": 30250
},
{
"epoch": 0.41678702595634054,
"grad_norm": 0.1601232886314392,
"learning_rate": 0.0001,
"loss": 1.8078,
"step": 30300
},
{
"epoch": 0.4174747933259054,
"grad_norm": 0.16903936862945557,
"learning_rate": 0.0001,
"loss": 1.811,
"step": 30350
},
{
"epoch": 0.41816256069547036,
"grad_norm": 0.17131748795509338,
"learning_rate": 0.0001,
"loss": 1.8057,
"step": 30400
},
{
"epoch": 0.4188503280650353,
"grad_norm": 0.17509245872497559,
"learning_rate": 0.0001,
"loss": 1.8109,
"step": 30450
},
{
"epoch": 0.4195380954346002,
"grad_norm": 0.17135483026504517,
"learning_rate": 0.0001,
"loss": 1.8086,
"step": 30500
},
{
"epoch": 0.42022586280416513,
"grad_norm": 0.1780470460653305,
"learning_rate": 0.0001,
"loss": 1.8054,
"step": 30550
},
{
"epoch": 0.42091363017373,
"grad_norm": 0.16642825305461884,
"learning_rate": 0.0001,
"loss": 1.8101,
"step": 30600
},
{
"epoch": 0.42160139754329495,
"grad_norm": 0.17237281799316406,
"learning_rate": 0.0001,
"loss": 1.8131,
"step": 30650
},
{
"epoch": 0.4222891649128599,
"grad_norm": 0.1773928999900818,
"learning_rate": 0.0001,
"loss": 1.807,
"step": 30700
},
{
"epoch": 0.4229769322824248,
"grad_norm": 0.15655359625816345,
"learning_rate": 0.0001,
"loss": 1.8102,
"step": 30750
},
{
"epoch": 0.4236646996519897,
"grad_norm": 0.18366913497447968,
"learning_rate": 0.0001,
"loss": 1.8045,
"step": 30800
},
{
"epoch": 0.4243524670215546,
"grad_norm": 0.15379434823989868,
"learning_rate": 0.0001,
"loss": 1.808,
"step": 30850
},
{
"epoch": 0.42504023439111954,
"grad_norm": 0.17815300822257996,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 30900
},
{
"epoch": 0.4257280017606845,
"grad_norm": 0.17477139830589294,
"learning_rate": 0.0001,
"loss": 1.8106,
"step": 30950
},
{
"epoch": 0.42641576913024937,
"grad_norm": 0.18266303837299347,
"learning_rate": 0.0001,
"loss": 1.8089,
"step": 31000
},
{
"epoch": 0.4271035364998143,
"grad_norm": 0.17377638816833496,
"learning_rate": 0.0001,
"loss": 1.808,
"step": 31050
},
{
"epoch": 0.42779130386937925,
"grad_norm": 0.16105225682258606,
"learning_rate": 0.0001,
"loss": 1.8058,
"step": 31100
},
{
"epoch": 0.42847907123894413,
"grad_norm": 0.16976149380207062,
"learning_rate": 0.0001,
"loss": 1.8108,
"step": 31150
},
{
"epoch": 0.42916683860850907,
"grad_norm": 0.1994379609823227,
"learning_rate": 0.0001,
"loss": 1.8103,
"step": 31200
},
{
"epoch": 0.42985460597807396,
"grad_norm": 0.1827680766582489,
"learning_rate": 0.0001,
"loss": 1.8044,
"step": 31250
},
{
"epoch": 0.4305423733476389,
"grad_norm": 0.17883870005607605,
"learning_rate": 0.0001,
"loss": 1.8067,
"step": 31300
},
{
"epoch": 0.43123014071720384,
"grad_norm": 0.1809430867433548,
"learning_rate": 0.0001,
"loss": 1.8105,
"step": 31350
},
{
"epoch": 0.4319179080867687,
"grad_norm": 0.15287983417510986,
"learning_rate": 0.0001,
"loss": 1.8032,
"step": 31400
},
{
"epoch": 0.43260567545633366,
"grad_norm": 0.1845768690109253,
"learning_rate": 0.0001,
"loss": 1.8044,
"step": 31450
},
{
"epoch": 0.43329344282589854,
"grad_norm": 0.15448009967803955,
"learning_rate": 0.0001,
"loss": 1.8074,
"step": 31500
},
{
"epoch": 0.4339812101954635,
"grad_norm": 0.16838380694389343,
"learning_rate": 0.0001,
"loss": 1.806,
"step": 31550
},
{
"epoch": 0.4346689775650284,
"grad_norm": 0.16129769384860992,
"learning_rate": 0.0001,
"loss": 1.8103,
"step": 31600
},
{
"epoch": 0.4353567449345933,
"grad_norm": 0.16702227294445038,
"learning_rate": 0.0001,
"loss": 1.81,
"step": 31650
},
{
"epoch": 0.43604451230415825,
"grad_norm": 0.1646498441696167,
"learning_rate": 0.0001,
"loss": 1.806,
"step": 31700
},
{
"epoch": 0.43673227967372313,
"grad_norm": 0.1929212510585785,
"learning_rate": 0.0001,
"loss": 1.8073,
"step": 31750
},
{
"epoch": 0.4374200470432881,
"grad_norm": 0.1728442758321762,
"learning_rate": 0.0001,
"loss": 1.8062,
"step": 31800
},
{
"epoch": 0.438107814412853,
"grad_norm": 0.15660201013088226,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 31850
},
{
"epoch": 0.4387955817824179,
"grad_norm": 0.1685377061367035,
"learning_rate": 0.0001,
"loss": 1.8079,
"step": 31900
},
{
"epoch": 0.43948334915198284,
"grad_norm": 0.18124371767044067,
"learning_rate": 0.0001,
"loss": 1.8042,
"step": 31950
},
{
"epoch": 0.4401711165215478,
"grad_norm": 0.18348287045955658,
"learning_rate": 0.0001,
"loss": 1.809,
"step": 32000
},
{
"epoch": 0.44085888389111266,
"grad_norm": 0.17936021089553833,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 32050
},
{
"epoch": 0.4415466512606776,
"grad_norm": 0.17418572306632996,
"learning_rate": 0.0001,
"loss": 1.8075,
"step": 32100
},
{
"epoch": 0.4422344186302425,
"grad_norm": 0.16956304013729095,
"learning_rate": 0.0001,
"loss": 1.8077,
"step": 32150
},
{
"epoch": 0.4429221859998074,
"grad_norm": 0.18142879009246826,
"learning_rate": 0.0001,
"loss": 1.8053,
"step": 32200
},
{
"epoch": 0.44360995336937237,
"grad_norm": 0.17536590993404388,
"learning_rate": 0.0001,
"loss": 1.8055,
"step": 32250
},
{
"epoch": 0.44429772073893725,
"grad_norm": 0.18276521563529968,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 32300
},
{
"epoch": 0.4449854881085022,
"grad_norm": 0.15810468792915344,
"learning_rate": 0.0001,
"loss": 1.8012,
"step": 32350
},
{
"epoch": 0.4456732554780671,
"grad_norm": 0.17224664986133575,
"learning_rate": 0.0001,
"loss": 1.8077,
"step": 32400
},
{
"epoch": 0.446361022847632,
"grad_norm": 0.17988136410713196,
"learning_rate": 0.0001,
"loss": 1.806,
"step": 32450
},
{
"epoch": 0.44704879021719696,
"grad_norm": 0.16269569098949432,
"learning_rate": 0.0001,
"loss": 1.8072,
"step": 32500
},
{
"epoch": 0.44773655758676184,
"grad_norm": 0.1897774487733841,
"learning_rate": 0.0001,
"loss": 1.8034,
"step": 32550
},
{
"epoch": 0.4484243249563268,
"grad_norm": 0.17675265669822693,
"learning_rate": 0.0001,
"loss": 1.8053,
"step": 32600
},
{
"epoch": 0.44911209232589167,
"grad_norm": 0.1847987174987793,
"learning_rate": 0.0001,
"loss": 1.8065,
"step": 32650
},
{
"epoch": 0.4497998596954566,
"grad_norm": 0.16706308722496033,
"learning_rate": 0.0001,
"loss": 1.8072,
"step": 32700
},
{
"epoch": 0.45048762706502155,
"grad_norm": 0.19702313840389252,
"learning_rate": 0.0001,
"loss": 1.8092,
"step": 32750
},
{
"epoch": 0.45117539443458643,
"grad_norm": 0.17378373444080353,
"learning_rate": 0.0001,
"loss": 1.8069,
"step": 32800
},
{
"epoch": 0.45186316180415137,
"grad_norm": 0.15358635783195496,
"learning_rate": 0.0001,
"loss": 1.8042,
"step": 32850
},
{
"epoch": 0.4525509291737163,
"grad_norm": 0.16188420355319977,
"learning_rate": 0.0001,
"loss": 1.8046,
"step": 32900
},
{
"epoch": 0.4532386965432812,
"grad_norm": 0.15988096594810486,
"learning_rate": 0.0001,
"loss": 1.8048,
"step": 32950
},
{
"epoch": 0.45392646391284613,
"grad_norm": 0.17328138649463654,
"learning_rate": 0.0001,
"loss": 1.8031,
"step": 33000
},
{
"epoch": 0.454614231282411,
"grad_norm": 0.18192242085933685,
"learning_rate": 0.0001,
"loss": 1.8015,
"step": 33050
},
{
"epoch": 0.45530199865197596,
"grad_norm": 0.18269090354442596,
"learning_rate": 0.0001,
"loss": 1.8086,
"step": 33100
},
{
"epoch": 0.4559897660215409,
"grad_norm": 0.1573922038078308,
"learning_rate": 0.0001,
"loss": 1.8054,
"step": 33150
},
{
"epoch": 0.4566775333911058,
"grad_norm": 0.20478671789169312,
"learning_rate": 0.0001,
"loss": 1.8047,
"step": 33200
},
{
"epoch": 0.4573653007606707,
"grad_norm": 0.17149974405765533,
"learning_rate": 0.0001,
"loss": 1.8045,
"step": 33250
},
{
"epoch": 0.4580530681302356,
"grad_norm": 0.1575038731098175,
"learning_rate": 0.0001,
"loss": 1.8008,
"step": 33300
},
{
"epoch": 0.45874083549980055,
"grad_norm": 0.1684975028038025,
"learning_rate": 0.0001,
"loss": 1.8036,
"step": 33350
},
{
"epoch": 0.4594286028693655,
"grad_norm": 0.17977888882160187,
"learning_rate": 0.0001,
"loss": 1.8058,
"step": 33400
},
{
"epoch": 0.46011637023893037,
"grad_norm": 0.1595628559589386,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 33450
},
{
"epoch": 0.4608041376084953,
"grad_norm": 0.17325359582901,
"learning_rate": 0.0001,
"loss": 1.8036,
"step": 33500
},
{
"epoch": 0.4614919049780602,
"grad_norm": 0.1705903857946396,
"learning_rate": 0.0001,
"loss": 1.8048,
"step": 33550
},
{
"epoch": 0.46217967234762514,
"grad_norm": 0.1714329570531845,
"learning_rate": 0.0001,
"loss": 1.8042,
"step": 33600
},
{
"epoch": 0.4628674397171901,
"grad_norm": 0.17674137651920319,
"learning_rate": 0.0001,
"loss": 1.8067,
"step": 33650
},
{
"epoch": 0.46355520708675496,
"grad_norm": 0.1605982631444931,
"learning_rate": 0.0001,
"loss": 1.8059,
"step": 33700
},
{
"epoch": 0.4642429744563199,
"grad_norm": 0.17221522331237793,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 33750
},
{
"epoch": 0.4649307418258848,
"grad_norm": 0.17648015916347504,
"learning_rate": 0.0001,
"loss": 1.8065,
"step": 33800
},
{
"epoch": 0.4656185091954497,
"grad_norm": 0.16860069334506989,
"learning_rate": 0.0001,
"loss": 1.8064,
"step": 33850
},
{
"epoch": 0.46630627656501467,
"grad_norm": 0.19352567195892334,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 33900
},
{
"epoch": 0.46699404393457955,
"grad_norm": 0.1634499430656433,
"learning_rate": 0.0001,
"loss": 1.7994,
"step": 33950
},
{
"epoch": 0.4676818113041445,
"grad_norm": 0.1790640950202942,
"learning_rate": 0.0001,
"loss": 1.8075,
"step": 34000
},
{
"epoch": 0.46836957867370943,
"grad_norm": 0.16731584072113037,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 34050
},
{
"epoch": 0.4690573460432743,
"grad_norm": 0.17351976037025452,
"learning_rate": 0.0001,
"loss": 1.8066,
"step": 34100
},
{
"epoch": 0.46974511341283925,
"grad_norm": 0.18717612326145172,
"learning_rate": 0.0001,
"loss": 1.8048,
"step": 34150
},
{
"epoch": 0.47043288078240414,
"grad_norm": 0.18829597532749176,
"learning_rate": 0.0001,
"loss": 1.8018,
"step": 34200
},
{
"epoch": 0.4711206481519691,
"grad_norm": 0.16731028258800507,
"learning_rate": 0.0001,
"loss": 1.806,
"step": 34250
},
{
"epoch": 0.471808415521534,
"grad_norm": 0.17419900000095367,
"learning_rate": 0.0001,
"loss": 1.8009,
"step": 34300
},
{
"epoch": 0.4724961828910989,
"grad_norm": 0.16232840716838837,
"learning_rate": 0.0001,
"loss": 1.8048,
"step": 34350
},
{
"epoch": 0.47318395026066384,
"grad_norm": 0.1557988077402115,
"learning_rate": 0.0001,
"loss": 1.8021,
"step": 34400
},
{
"epoch": 0.47387171763022873,
"grad_norm": 0.18441712856292725,
"learning_rate": 0.0001,
"loss": 1.8066,
"step": 34450
},
{
"epoch": 0.47455948499979367,
"grad_norm": 0.1681167036294937,
"learning_rate": 0.0001,
"loss": 1.8035,
"step": 34500
},
{
"epoch": 0.4752472523693586,
"grad_norm": 0.1694021373987198,
"learning_rate": 0.0001,
"loss": 1.8056,
"step": 34550
},
{
"epoch": 0.4759350197389235,
"grad_norm": 0.16909408569335938,
"learning_rate": 0.0001,
"loss": 1.8012,
"step": 34600
},
{
"epoch": 0.47662278710848843,
"grad_norm": 0.18573597073554993,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 34650
},
{
"epoch": 0.4773105544780533,
"grad_norm": 0.1591121107339859,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 34700
},
{
"epoch": 0.47799832184761826,
"grad_norm": 0.16243012249469757,
"learning_rate": 0.0001,
"loss": 1.809,
"step": 34750
},
{
"epoch": 0.4786860892171832,
"grad_norm": 0.1876152753829956,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 34800
},
{
"epoch": 0.4793738565867481,
"grad_norm": 0.160101518034935,
"learning_rate": 0.0001,
"loss": 1.8041,
"step": 34850
},
{
"epoch": 0.480061623956313,
"grad_norm": 0.17508384585380554,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 34900
},
{
"epoch": 0.48074939132587796,
"grad_norm": 0.16169220209121704,
"learning_rate": 0.0001,
"loss": 1.8045,
"step": 34950
},
{
"epoch": 0.48143715869544285,
"grad_norm": 0.17065638303756714,
"learning_rate": 0.0001,
"loss": 1.8046,
"step": 35000
},
{
"epoch": 0.4821249260650078,
"grad_norm": 0.16137543320655823,
"learning_rate": 0.0001,
"loss": 1.8006,
"step": 35050
},
{
"epoch": 0.48281269343457267,
"grad_norm": 0.1716589331626892,
"learning_rate": 0.0001,
"loss": 1.8065,
"step": 35100
},
{
"epoch": 0.4835004608041376,
"grad_norm": 0.16750770807266235,
"learning_rate": 0.0001,
"loss": 1.8069,
"step": 35150
},
{
"epoch": 0.48418822817370255,
"grad_norm": 0.1668424755334854,
"learning_rate": 0.0001,
"loss": 1.8045,
"step": 35200
},
{
"epoch": 0.48487599554326744,
"grad_norm": 0.1577017605304718,
"learning_rate": 0.0001,
"loss": 1.8015,
"step": 35250
},
{
"epoch": 0.4855637629128324,
"grad_norm": 0.16916392743587494,
"learning_rate": 0.0001,
"loss": 1.8012,
"step": 35300
},
{
"epoch": 0.48625153028239726,
"grad_norm": 0.16878165304660797,
"learning_rate": 0.0001,
"loss": 1.8049,
"step": 35350
},
{
"epoch": 0.4869392976519622,
"grad_norm": 0.1834115982055664,
"learning_rate": 0.0001,
"loss": 1.8024,
"step": 35400
},
{
"epoch": 0.48762706502152714,
"grad_norm": 0.16310469806194305,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 35450
},
{
"epoch": 0.488314832391092,
"grad_norm": 0.17430266737937927,
"learning_rate": 0.0001,
"loss": 1.8017,
"step": 35500
},
{
"epoch": 0.48900259976065696,
"grad_norm": 0.20293480157852173,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 35550
},
{
"epoch": 0.48969036713022185,
"grad_norm": 0.16140292584896088,
"learning_rate": 0.0001,
"loss": 1.8012,
"step": 35600
},
{
"epoch": 0.4903781344997868,
"grad_norm": 0.15472573041915894,
"learning_rate": 0.0001,
"loss": 1.804,
"step": 35650
},
{
"epoch": 0.49106590186935173,
"grad_norm": 0.19431902468204498,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 35700
},
{
"epoch": 0.4917536692389166,
"grad_norm": 0.1693229377269745,
"learning_rate": 0.0001,
"loss": 1.8004,
"step": 35750
},
{
"epoch": 0.49244143660848155,
"grad_norm": 0.19499187171459198,
"learning_rate": 0.0001,
"loss": 1.8035,
"step": 35800
},
{
"epoch": 0.4931292039780465,
"grad_norm": 0.16046124696731567,
"learning_rate": 0.0001,
"loss": 1.8062,
"step": 35850
},
{
"epoch": 0.4938169713476114,
"grad_norm": 0.17743340134620667,
"learning_rate": 0.0001,
"loss": 1.8038,
"step": 35900
},
{
"epoch": 0.4945047387171763,
"grad_norm": 0.20568375289440155,
"learning_rate": 0.0001,
"loss": 1.8039,
"step": 35950
},
{
"epoch": 0.4951925060867412,
"grad_norm": 0.1706654578447342,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 36000
},
{
"epoch": 0.49588027345630614,
"grad_norm": 0.17956335842609406,
"learning_rate": 0.0001,
"loss": 1.8038,
"step": 36050
},
{
"epoch": 0.4965680408258711,
"grad_norm": 0.1683945506811142,
"learning_rate": 0.0001,
"loss": 1.801,
"step": 36100
},
{
"epoch": 0.49725580819543597,
"grad_norm": 0.16132575273513794,
"learning_rate": 0.0001,
"loss": 1.7994,
"step": 36150
},
{
"epoch": 0.4979435755650009,
"grad_norm": 0.15439482033252716,
"learning_rate": 0.0001,
"loss": 1.798,
"step": 36200
},
{
"epoch": 0.4986313429345658,
"grad_norm": 0.17427167296409607,
"learning_rate": 0.0001,
"loss": 1.803,
"step": 36250
},
{
"epoch": 0.49931911030413073,
"grad_norm": 0.1826677918434143,
"learning_rate": 0.0001,
"loss": 1.8041,
"step": 36300
},
{
"epoch": 0.5000068776736957,
"grad_norm": 0.1664198338985443,
"learning_rate": 0.0001,
"loss": 1.8007,
"step": 36350
},
{
"epoch": 0.5006946450432606,
"grad_norm": 0.19743186235427856,
"learning_rate": 0.0001,
"loss": 1.8009,
"step": 36400
},
{
"epoch": 0.5013824124128254,
"grad_norm": 0.17416580021381378,
"learning_rate": 0.0001,
"loss": 1.8033,
"step": 36450
},
{
"epoch": 0.5020701797823904,
"grad_norm": 0.16447678208351135,
"learning_rate": 0.0001,
"loss": 1.8027,
"step": 36500
},
{
"epoch": 0.5027579471519553,
"grad_norm": 0.19978569447994232,
"learning_rate": 0.0001,
"loss": 1.8027,
"step": 36550
},
{
"epoch": 0.5034457145215202,
"grad_norm": 0.1768701672554016,
"learning_rate": 0.0001,
"loss": 1.8018,
"step": 36600
},
{
"epoch": 0.5041334818910852,
"grad_norm": 0.17458416521549225,
"learning_rate": 0.0001,
"loss": 1.8031,
"step": 36650
},
{
"epoch": 0.5048212492606501,
"grad_norm": 0.15409618616104126,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 36700
},
{
"epoch": 0.505509016630215,
"grad_norm": 0.20529916882514954,
"learning_rate": 0.0001,
"loss": 1.8026,
"step": 36750
},
{
"epoch": 0.50619678399978,
"grad_norm": 0.1579432338476181,
"learning_rate": 0.0001,
"loss": 1.806,
"step": 36800
},
{
"epoch": 0.5068845513693448,
"grad_norm": 0.16803112626075745,
"learning_rate": 0.0001,
"loss": 1.8027,
"step": 36850
},
{
"epoch": 0.5075723187389097,
"grad_norm": 0.19382716715335846,
"learning_rate": 0.0001,
"loss": 1.8029,
"step": 36900
},
{
"epoch": 0.5082600861084746,
"grad_norm": 0.17823243141174316,
"learning_rate": 0.0001,
"loss": 1.8035,
"step": 36950
},
{
"epoch": 0.5089478534780396,
"grad_norm": 0.1742970496416092,
"learning_rate": 0.0001,
"loss": 1.8033,
"step": 37000
},
{
"epoch": 0.5096356208476045,
"grad_norm": 0.17236186563968658,
"learning_rate": 0.0001,
"loss": 1.8062,
"step": 37050
},
{
"epoch": 0.5103233882171694,
"grad_norm": 0.1705719381570816,
"learning_rate": 0.0001,
"loss": 1.8052,
"step": 37100
},
{
"epoch": 0.5110111555867344,
"grad_norm": 0.19941222667694092,
"learning_rate": 0.0001,
"loss": 1.8011,
"step": 37150
},
{
"epoch": 0.5116989229562993,
"grad_norm": 0.16263477504253387,
"learning_rate": 0.0001,
"loss": 1.8026,
"step": 37200
},
{
"epoch": 0.5123866903258641,
"grad_norm": 0.15199637413024902,
"learning_rate": 0.0001,
"loss": 1.8001,
"step": 37250
},
{
"epoch": 0.5130744576954291,
"grad_norm": 0.16797873377799988,
"learning_rate": 0.0001,
"loss": 1.8016,
"step": 37300
},
{
"epoch": 0.513762225064994,
"grad_norm": 0.16336190700531006,
"learning_rate": 0.0001,
"loss": 1.798,
"step": 37350
},
{
"epoch": 0.5144499924345589,
"grad_norm": 0.16497831046581268,
"learning_rate": 0.0001,
"loss": 1.8001,
"step": 37400
},
{
"epoch": 0.5151377598041239,
"grad_norm": 0.1712917536497116,
"learning_rate": 0.0001,
"loss": 1.804,
"step": 37450
},
{
"epoch": 0.5158255271736888,
"grad_norm": 0.16597513854503632,
"learning_rate": 0.0001,
"loss": 1.8019,
"step": 37500
},
{
"epoch": 0.5165132945432537,
"grad_norm": 0.15661810338497162,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 37550
},
{
"epoch": 0.5172010619128186,
"grad_norm": 0.17713536322116852,
"learning_rate": 0.0001,
"loss": 1.7973,
"step": 37600
},
{
"epoch": 0.5178888292823836,
"grad_norm": 0.15873874723911285,
"learning_rate": 0.0001,
"loss": 1.8003,
"step": 37650
},
{
"epoch": 0.5185765966519484,
"grad_norm": 0.1784040331840515,
"learning_rate": 0.0001,
"loss": 1.798,
"step": 37700
},
{
"epoch": 0.5192643640215133,
"grad_norm": 0.16135090589523315,
"learning_rate": 0.0001,
"loss": 1.8082,
"step": 37750
},
{
"epoch": 0.5199521313910783,
"grad_norm": 0.15565833449363708,
"learning_rate": 0.0001,
"loss": 1.8006,
"step": 37800
},
{
"epoch": 0.5206398987606432,
"grad_norm": 0.1711311787366867,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 37850
},
{
"epoch": 0.5213276661302081,
"grad_norm": 0.17314565181732178,
"learning_rate": 0.0001,
"loss": 1.7997,
"step": 37900
},
{
"epoch": 0.5220154334997731,
"grad_norm": 0.1723901331424713,
"learning_rate": 0.0001,
"loss": 1.8006,
"step": 37950
},
{
"epoch": 0.522703200869338,
"grad_norm": 0.15868623554706573,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 38000
},
{
"epoch": 0.5233909682389029,
"grad_norm": 0.17163942754268646,
"learning_rate": 0.0001,
"loss": 1.7991,
"step": 38050
},
{
"epoch": 0.5240787356084677,
"grad_norm": 0.17622709274291992,
"learning_rate": 0.0001,
"loss": 1.8027,
"step": 38100
},
{
"epoch": 0.5247665029780327,
"grad_norm": 0.1616000235080719,
"learning_rate": 0.0001,
"loss": 1.7993,
"step": 38150
},
{
"epoch": 0.5254542703475976,
"grad_norm": 0.1638936698436737,
"learning_rate": 0.0001,
"loss": 1.7978,
"step": 38200
},
{
"epoch": 0.5261420377171625,
"grad_norm": 0.1706729531288147,
"learning_rate": 0.0001,
"loss": 1.7999,
"step": 38250
},
{
"epoch": 0.5268298050867275,
"grad_norm": 0.2048814296722412,
"learning_rate": 0.0001,
"loss": 1.7987,
"step": 38300
},
{
"epoch": 0.5275175724562924,
"grad_norm": 0.15826106071472168,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 38350
},
{
"epoch": 0.5282053398258573,
"grad_norm": 0.16068226099014282,
"learning_rate": 0.0001,
"loss": 1.8032,
"step": 38400
},
{
"epoch": 0.5288931071954223,
"grad_norm": 0.17855240404605865,
"learning_rate": 0.0001,
"loss": 1.7994,
"step": 38450
},
{
"epoch": 0.5295808745649871,
"grad_norm": 0.16978466510772705,
"learning_rate": 0.0001,
"loss": 1.8022,
"step": 38500
},
{
"epoch": 0.530268641934552,
"grad_norm": 0.1745109260082245,
"learning_rate": 0.0001,
"loss": 1.8008,
"step": 38550
},
{
"epoch": 0.530956409304117,
"grad_norm": 0.1952807605266571,
"learning_rate": 0.0001,
"loss": 1.7977,
"step": 38600
},
{
"epoch": 0.5316441766736819,
"grad_norm": 0.1846735179424286,
"learning_rate": 0.0001,
"loss": 1.8033,
"step": 38650
},
{
"epoch": 0.5323319440432468,
"grad_norm": 0.17474836111068726,
"learning_rate": 0.0001,
"loss": 1.8034,
"step": 38700
},
{
"epoch": 0.5330197114128117,
"grad_norm": 0.1729106903076172,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 38750
},
{
"epoch": 0.5337074787823767,
"grad_norm": 0.18584811687469482,
"learning_rate": 0.0001,
"loss": 1.805,
"step": 38800
},
{
"epoch": 0.5343952461519416,
"grad_norm": 0.15596157312393188,
"learning_rate": 0.0001,
"loss": 1.8014,
"step": 38850
},
{
"epoch": 0.5350830135215064,
"grad_norm": 0.15528340637683868,
"learning_rate": 0.0001,
"loss": 1.7969,
"step": 38900
},
{
"epoch": 0.5357707808910714,
"grad_norm": 0.1738685965538025,
"learning_rate": 0.0001,
"loss": 1.8003,
"step": 38950
},
{
"epoch": 0.5364585482606363,
"grad_norm": 0.1620347946882248,
"learning_rate": 0.0001,
"loss": 1.796,
"step": 39000
},
{
"epoch": 0.5371463156302012,
"grad_norm": 0.1705981343984604,
"learning_rate": 0.0001,
"loss": 1.8008,
"step": 39050
},
{
"epoch": 0.5378340829997662,
"grad_norm": 0.16167068481445312,
"learning_rate": 0.0001,
"loss": 1.8037,
"step": 39100
},
{
"epoch": 0.5385218503693311,
"grad_norm": 0.15977101027965546,
"learning_rate": 0.0001,
"loss": 1.8043,
"step": 39150
},
{
"epoch": 0.539209617738896,
"grad_norm": 0.1699797809123993,
"learning_rate": 0.0001,
"loss": 1.8025,
"step": 39200
},
{
"epoch": 0.5398973851084609,
"grad_norm": 0.17108047008514404,
"learning_rate": 0.0001,
"loss": 1.7999,
"step": 39250
},
{
"epoch": 0.5405851524780259,
"grad_norm": 0.1756991147994995,
"learning_rate": 0.0001,
"loss": 1.8001,
"step": 39300
},
{
"epoch": 0.5412729198475907,
"grad_norm": 0.1716366708278656,
"learning_rate": 0.0001,
"loss": 1.7987,
"step": 39350
},
{
"epoch": 0.5419606872171556,
"grad_norm": 0.16876575350761414,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 39400
},
{
"epoch": 0.5426484545867206,
"grad_norm": 0.1650577336549759,
"learning_rate": 0.0001,
"loss": 1.8001,
"step": 39450
},
{
"epoch": 0.5433362219562855,
"grad_norm": 0.17242754995822906,
"learning_rate": 0.0001,
"loss": 1.8006,
"step": 39500
},
{
"epoch": 0.5440239893258504,
"grad_norm": 0.16941705346107483,
"learning_rate": 0.0001,
"loss": 1.7995,
"step": 39550
},
{
"epoch": 0.5447117566954154,
"grad_norm": 0.21036018431186676,
"learning_rate": 0.0001,
"loss": 1.802,
"step": 39600
},
{
"epoch": 0.5453995240649803,
"grad_norm": 0.16824571788311005,
"learning_rate": 0.0001,
"loss": 1.7992,
"step": 39650
},
{
"epoch": 0.5460872914345452,
"grad_norm": 0.162497416138649,
"learning_rate": 0.0001,
"loss": 1.7978,
"step": 39700
},
{
"epoch": 0.5467750588041101,
"grad_norm": 0.18297506868839264,
"learning_rate": 0.0001,
"loss": 1.7968,
"step": 39750
},
{
"epoch": 0.547462826173675,
"grad_norm": 0.15444135665893555,
"learning_rate": 0.0001,
"loss": 1.7942,
"step": 39800
},
{
"epoch": 0.5481505935432399,
"grad_norm": 0.17254306375980377,
"learning_rate": 0.0001,
"loss": 1.797,
"step": 39850
},
{
"epoch": 0.5488383609128048,
"grad_norm": 0.18030798435211182,
"learning_rate": 0.0001,
"loss": 1.8008,
"step": 39900
},
{
"epoch": 0.5495261282823698,
"grad_norm": 0.18069452047348022,
"learning_rate": 0.0001,
"loss": 1.7988,
"step": 39950
},
{
"epoch": 0.5502138956519347,
"grad_norm": 0.16256502270698547,
"learning_rate": 0.0001,
"loss": 1.8019,
"step": 40000
},
{
"epoch": 0.5509016630214996,
"grad_norm": 0.16416381299495697,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 40050
},
{
"epoch": 0.5515894303910646,
"grad_norm": 0.1743890941143036,
"learning_rate": 0.0001,
"loss": 1.7966,
"step": 40100
},
{
"epoch": 0.5522771977606294,
"grad_norm": 0.1875494122505188,
"learning_rate": 0.0001,
"loss": 1.799,
"step": 40150
},
{
"epoch": 0.5529649651301943,
"grad_norm": 0.18323060870170593,
"learning_rate": 0.0001,
"loss": 1.7968,
"step": 40200
},
{
"epoch": 0.5536527324997593,
"grad_norm": 0.1552455574274063,
"learning_rate": 0.0001,
"loss": 1.799,
"step": 40250
},
{
"epoch": 0.5543404998693242,
"grad_norm": 0.1685846745967865,
"learning_rate": 0.0001,
"loss": 1.7989,
"step": 40300
},
{
"epoch": 0.5550282672388891,
"grad_norm": 0.16371703147888184,
"learning_rate": 0.0001,
"loss": 1.7943,
"step": 40350
},
{
"epoch": 0.5557160346084541,
"grad_norm": 0.17993508279323578,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 40400
},
{
"epoch": 0.556403801978019,
"grad_norm": 0.17061980068683624,
"learning_rate": 0.0001,
"loss": 1.7954,
"step": 40450
},
{
"epoch": 0.5570915693475839,
"grad_norm": 0.17588096857070923,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 40500
},
{
"epoch": 0.5577793367171487,
"grad_norm": 0.16484741866588593,
"learning_rate": 0.0001,
"loss": 1.7959,
"step": 40550
},
{
"epoch": 0.5584671040867137,
"grad_norm": 0.1812593787908554,
"learning_rate": 0.0001,
"loss": 1.801,
"step": 40600
},
{
"epoch": 0.5591548714562786,
"grad_norm": 0.17755167186260223,
"learning_rate": 0.0001,
"loss": 1.797,
"step": 40650
},
{
"epoch": 0.5598426388258435,
"grad_norm": 0.16877087950706482,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 40700
},
{
"epoch": 0.5605304061954085,
"grad_norm": 0.15780018270015717,
"learning_rate": 0.0001,
"loss": 1.7967,
"step": 40750
},
{
"epoch": 0.5612181735649734,
"grad_norm": 0.15145239233970642,
"learning_rate": 0.0001,
"loss": 1.7988,
"step": 40800
},
{
"epoch": 0.5619059409345383,
"grad_norm": 0.18385986983776093,
"learning_rate": 0.0001,
"loss": 1.7965,
"step": 40850
},
{
"epoch": 0.5625937083041033,
"grad_norm": 0.15375161170959473,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 40900
},
{
"epoch": 0.5632814756736682,
"grad_norm": 0.15694858133792877,
"learning_rate": 0.0001,
"loss": 1.7989,
"step": 40950
},
{
"epoch": 0.563969243043233,
"grad_norm": 0.1538461446762085,
"learning_rate": 0.0001,
"loss": 1.7965,
"step": 41000
},
{
"epoch": 0.5646570104127979,
"grad_norm": 0.16211877763271332,
"learning_rate": 0.0001,
"loss": 1.7931,
"step": 41050
},
{
"epoch": 0.5653447777823629,
"grad_norm": 0.1737697869539261,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 41100
},
{
"epoch": 0.5660325451519278,
"grad_norm": 0.1610105037689209,
"learning_rate": 0.0001,
"loss": 1.798,
"step": 41150
},
{
"epoch": 0.5667203125214927,
"grad_norm": 0.1762542873620987,
"learning_rate": 0.0001,
"loss": 1.7991,
"step": 41200
},
{
"epoch": 0.5674080798910577,
"grad_norm": 0.16195493936538696,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 41250
},
{
"epoch": 0.5680958472606226,
"grad_norm": 0.18047676980495453,
"learning_rate": 0.0001,
"loss": 1.7962,
"step": 41300
},
{
"epoch": 0.5687836146301875,
"grad_norm": 0.18760687112808228,
"learning_rate": 0.0001,
"loss": 1.8,
"step": 41350
},
{
"epoch": 0.5694713819997524,
"grad_norm": 0.17012238502502441,
"learning_rate": 0.0001,
"loss": 1.7969,
"step": 41400
},
{
"epoch": 0.5701591493693173,
"grad_norm": 0.1699533313512802,
"learning_rate": 0.0001,
"loss": 1.7953,
"step": 41450
},
{
"epoch": 0.5708469167388822,
"grad_norm": 0.16422894597053528,
"learning_rate": 0.0001,
"loss": 1.7995,
"step": 41500
},
{
"epoch": 0.5715346841084472,
"grad_norm": 0.17526569962501526,
"learning_rate": 0.0001,
"loss": 1.7967,
"step": 41550
},
{
"epoch": 0.5722224514780121,
"grad_norm": 0.158601313829422,
"learning_rate": 0.0001,
"loss": 1.8006,
"step": 41600
},
{
"epoch": 0.572910218847577,
"grad_norm": 0.1562766283750534,
"learning_rate": 0.0001,
"loss": 1.7969,
"step": 41650
},
{
"epoch": 0.5735979862171419,
"grad_norm": 0.15490677952766418,
"learning_rate": 0.0001,
"loss": 1.8017,
"step": 41700
},
{
"epoch": 0.5742857535867069,
"grad_norm": 0.17004509270191193,
"learning_rate": 0.0001,
"loss": 1.7958,
"step": 41750
},
{
"epoch": 0.5749735209562717,
"grad_norm": 0.17213889956474304,
"learning_rate": 0.0001,
"loss": 1.797,
"step": 41800
},
{
"epoch": 0.5756612883258366,
"grad_norm": 0.17541930079460144,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 41850
},
{
"epoch": 0.5763490556954016,
"grad_norm": 0.18296034634113312,
"learning_rate": 0.0001,
"loss": 1.796,
"step": 41900
},
{
"epoch": 0.5770368230649665,
"grad_norm": 0.1777525097131729,
"learning_rate": 0.0001,
"loss": 1.7959,
"step": 41950
},
{
"epoch": 0.5777245904345314,
"grad_norm": 0.17678572237491608,
"learning_rate": 0.0001,
"loss": 1.7989,
"step": 42000
},
{
"epoch": 0.5784123578040964,
"grad_norm": 0.1763673573732376,
"learning_rate": 0.0001,
"loss": 1.8004,
"step": 42050
},
{
"epoch": 0.5791001251736613,
"grad_norm": 0.18608896434307098,
"learning_rate": 0.0001,
"loss": 1.7997,
"step": 42100
},
{
"epoch": 0.5797878925432262,
"grad_norm": 0.1691625863313675,
"learning_rate": 0.0001,
"loss": 1.7988,
"step": 42150
},
{
"epoch": 0.580475659912791,
"grad_norm": 0.1609441488981247,
"learning_rate": 0.0001,
"loss": 1.7993,
"step": 42200
},
{
"epoch": 0.581163427282356,
"grad_norm": 0.15776963531970978,
"learning_rate": 0.0001,
"loss": 1.7994,
"step": 42250
},
{
"epoch": 0.5818511946519209,
"grad_norm": 0.20214344561100006,
"learning_rate": 0.0001,
"loss": 1.7998,
"step": 42300
},
{
"epoch": 0.5825389620214858,
"grad_norm": 0.18112723529338837,
"learning_rate": 0.0001,
"loss": 1.8,
"step": 42350
},
{
"epoch": 0.5832267293910508,
"grad_norm": 0.1543450802564621,
"learning_rate": 0.0001,
"loss": 1.7982,
"step": 42400
},
{
"epoch": 0.5839144967606157,
"grad_norm": 0.15315985679626465,
"learning_rate": 0.0001,
"loss": 1.7995,
"step": 42450
},
{
"epoch": 0.5846022641301806,
"grad_norm": 0.16166909039020538,
"learning_rate": 0.0001,
"loss": 1.7995,
"step": 42500
},
{
"epoch": 0.5852900314997456,
"grad_norm": 0.15933014452457428,
"learning_rate": 0.0001,
"loss": 1.7968,
"step": 42550
},
{
"epoch": 0.5859777988693105,
"grad_norm": 0.15434689819812775,
"learning_rate": 0.0001,
"loss": 1.797,
"step": 42600
},
{
"epoch": 0.5866655662388753,
"grad_norm": 0.1875755488872528,
"learning_rate": 0.0001,
"loss": 1.7964,
"step": 42650
},
{
"epoch": 0.5873533336084403,
"grad_norm": 0.15559327602386475,
"learning_rate": 0.0001,
"loss": 1.7997,
"step": 42700
},
{
"epoch": 0.5880411009780052,
"grad_norm": 0.16149398684501648,
"learning_rate": 0.0001,
"loss": 1.7956,
"step": 42750
},
{
"epoch": 0.5887288683475701,
"grad_norm": 0.1777992695569992,
"learning_rate": 0.0001,
"loss": 1.7912,
"step": 42800
},
{
"epoch": 0.589416635717135,
"grad_norm": 0.15934714674949646,
"learning_rate": 0.0001,
"loss": 1.7989,
"step": 42850
},
{
"epoch": 0.5901044030867,
"grad_norm": 0.16847145557403564,
"learning_rate": 0.0001,
"loss": 1.7997,
"step": 42900
},
{
"epoch": 0.5907921704562649,
"grad_norm": 0.17410792410373688,
"learning_rate": 0.0001,
"loss": 1.7999,
"step": 42950
},
{
"epoch": 0.5914799378258297,
"grad_norm": 0.18102861940860748,
"learning_rate": 0.0001,
"loss": 1.7983,
"step": 43000
},
{
"epoch": 0.5921677051953947,
"grad_norm": 0.1682325005531311,
"learning_rate": 0.0001,
"loss": 1.7986,
"step": 43050
},
{
"epoch": 0.5928554725649596,
"grad_norm": 0.17732855677604675,
"learning_rate": 0.0001,
"loss": 1.8004,
"step": 43100
},
{
"epoch": 0.5935432399345245,
"grad_norm": 0.16327179968357086,
"learning_rate": 0.0001,
"loss": 1.7969,
"step": 43150
},
{
"epoch": 0.5942310073040895,
"grad_norm": 0.1582539677619934,
"learning_rate": 0.0001,
"loss": 1.798,
"step": 43200
},
{
"epoch": 0.5949187746736544,
"grad_norm": 0.14965754747390747,
"learning_rate": 0.0001,
"loss": 1.7986,
"step": 43250
},
{
"epoch": 0.5956065420432193,
"grad_norm": 0.1617211103439331,
"learning_rate": 0.0001,
"loss": 1.7938,
"step": 43300
},
{
"epoch": 0.5962943094127843,
"grad_norm": 0.17458325624465942,
"learning_rate": 0.0001,
"loss": 1.7978,
"step": 43350
},
{
"epoch": 0.5969820767823492,
"grad_norm": 0.1668146252632141,
"learning_rate": 0.0001,
"loss": 1.7983,
"step": 43400
},
{
"epoch": 0.597669844151914,
"grad_norm": 0.15414200723171234,
"learning_rate": 0.0001,
"loss": 1.7989,
"step": 43450
},
{
"epoch": 0.5983576115214789,
"grad_norm": 0.15912353992462158,
"learning_rate": 0.0001,
"loss": 1.7964,
"step": 43500
},
{
"epoch": 0.5990453788910439,
"grad_norm": 0.15936636924743652,
"learning_rate": 0.0001,
"loss": 1.7944,
"step": 43550
},
{
"epoch": 0.5997331462606088,
"grad_norm": 0.17340709269046783,
"learning_rate": 0.0001,
"loss": 1.7912,
"step": 43600
},
{
"epoch": 0.6004209136301737,
"grad_norm": 0.18960115313529968,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 43650
},
{
"epoch": 0.6011086809997387,
"grad_norm": 0.17091485857963562,
"learning_rate": 0.0001,
"loss": 1.7998,
"step": 43700
},
{
"epoch": 0.6017964483693036,
"grad_norm": 0.17222945392131805,
"learning_rate": 0.0001,
"loss": 1.8016,
"step": 43750
},
{
"epoch": 0.6024842157388685,
"grad_norm": 0.1608862429857254,
"learning_rate": 0.0001,
"loss": 1.794,
"step": 43800
},
{
"epoch": 0.6031719831084335,
"grad_norm": 0.16626954078674316,
"learning_rate": 0.0001,
"loss": 1.7971,
"step": 43850
},
{
"epoch": 0.6038597504779983,
"grad_norm": 0.1769898533821106,
"learning_rate": 0.0001,
"loss": 1.7992,
"step": 43900
},
{
"epoch": 0.6045475178475632,
"grad_norm": 0.1665075570344925,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 43950
},
{
"epoch": 0.6052352852171281,
"grad_norm": 0.1957935094833374,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 44000
},
{
"epoch": 0.6059230525866931,
"grad_norm": 0.20066794753074646,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 44050
},
{
"epoch": 0.606610819956258,
"grad_norm": 0.16102181375026703,
"learning_rate": 0.0001,
"loss": 1.7942,
"step": 44100
},
{
"epoch": 0.6072985873258229,
"grad_norm": 0.16587640345096588,
"learning_rate": 0.0001,
"loss": 1.7964,
"step": 44150
},
{
"epoch": 0.6079863546953879,
"grad_norm": 0.17338010668754578,
"learning_rate": 0.0001,
"loss": 1.7955,
"step": 44200
},
{
"epoch": 0.6086741220649527,
"grad_norm": 0.1979152411222458,
"learning_rate": 0.0001,
"loss": 1.7964,
"step": 44250
},
{
"epoch": 0.6093618894345176,
"grad_norm": 0.16478174924850464,
"learning_rate": 0.0001,
"loss": 1.8013,
"step": 44300
},
{
"epoch": 0.6100496568040826,
"grad_norm": 0.16508819162845612,
"learning_rate": 0.0001,
"loss": 1.7922,
"step": 44350
},
{
"epoch": 0.6107374241736475,
"grad_norm": 0.15964439511299133,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 44400
},
{
"epoch": 0.6114251915432124,
"grad_norm": 0.18116386234760284,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 44450
},
{
"epoch": 0.6121129589127774,
"grad_norm": 0.1808495819568634,
"learning_rate": 0.0001,
"loss": 1.7958,
"step": 44500
},
{
"epoch": 0.6128007262823423,
"grad_norm": 0.1634376347064972,
"learning_rate": 0.0001,
"loss": 1.7931,
"step": 44550
},
{
"epoch": 0.6134884936519072,
"grad_norm": 0.15140944719314575,
"learning_rate": 0.0001,
"loss": 1.7995,
"step": 44600
},
{
"epoch": 0.614176261021472,
"grad_norm": 0.15988072752952576,
"learning_rate": 0.0001,
"loss": 1.7957,
"step": 44650
},
{
"epoch": 0.614864028391037,
"grad_norm": 0.16280120611190796,
"learning_rate": 0.0001,
"loss": 1.7986,
"step": 44700
},
{
"epoch": 0.6155517957606019,
"grad_norm": 0.16643498837947845,
"learning_rate": 0.0001,
"loss": 1.7975,
"step": 44750
},
{
"epoch": 0.6162395631301668,
"grad_norm": 0.151467427611351,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 44800
},
{
"epoch": 0.6169273304997318,
"grad_norm": 0.1621852070093155,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 44850
},
{
"epoch": 0.6176150978692967,
"grad_norm": 0.1828535795211792,
"learning_rate": 0.0001,
"loss": 1.7939,
"step": 44900
},
{
"epoch": 0.6183028652388616,
"grad_norm": 0.1630941480398178,
"learning_rate": 0.0001,
"loss": 1.7987,
"step": 44950
},
{
"epoch": 0.6189906326084266,
"grad_norm": 0.1701328009366989,
"learning_rate": 0.0001,
"loss": 1.7955,
"step": 45000
},
{
"epoch": 0.6196783999779915,
"grad_norm": 0.16631458699703217,
"learning_rate": 0.0001,
"loss": 1.7985,
"step": 45050
},
{
"epoch": 0.6203661673475563,
"grad_norm": 0.17133264243602753,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 45100
},
{
"epoch": 0.6210539347171212,
"grad_norm": 0.19388112425804138,
"learning_rate": 0.0001,
"loss": 1.7944,
"step": 45150
},
{
"epoch": 0.6217417020866862,
"grad_norm": 0.1769258826971054,
"learning_rate": 0.0001,
"loss": 1.7937,
"step": 45200
},
{
"epoch": 0.6224294694562511,
"grad_norm": 0.21986328065395355,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 45250
},
{
"epoch": 0.623117236825816,
"grad_norm": 0.1711747795343399,
"learning_rate": 0.0001,
"loss": 1.7923,
"step": 45300
},
{
"epoch": 0.623805004195381,
"grad_norm": 0.1730772852897644,
"learning_rate": 0.0001,
"loss": 1.7976,
"step": 45350
},
{
"epoch": 0.6244927715649459,
"grad_norm": 0.16657279431819916,
"learning_rate": 0.0001,
"loss": 1.7958,
"step": 45400
},
{
"epoch": 0.6251805389345108,
"grad_norm": 0.15675725042819977,
"learning_rate": 0.0001,
"loss": 1.7931,
"step": 45450
},
{
"epoch": 0.6258683063040757,
"grad_norm": 0.17763769626617432,
"learning_rate": 0.0001,
"loss": 1.7972,
"step": 45500
},
{
"epoch": 0.6265560736736406,
"grad_norm": 0.1630527824163437,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 45550
},
{
"epoch": 0.6272438410432055,
"grad_norm": 0.16628991067409515,
"learning_rate": 0.0001,
"loss": 1.7959,
"step": 45600
},
{
"epoch": 0.6279316084127705,
"grad_norm": 0.1589209884405136,
"learning_rate": 0.0001,
"loss": 1.7949,
"step": 45650
},
{
"epoch": 0.6286193757823354,
"grad_norm": 0.17715197801589966,
"learning_rate": 0.0001,
"loss": 1.7971,
"step": 45700
},
{
"epoch": 0.6293071431519003,
"grad_norm": 0.1824561059474945,
"learning_rate": 0.0001,
"loss": 1.795,
"step": 45750
},
{
"epoch": 0.6299949105214652,
"grad_norm": 0.16866008937358856,
"learning_rate": 0.0001,
"loss": 1.7957,
"step": 45800
},
{
"epoch": 0.6306826778910302,
"grad_norm": 0.14337721467018127,
"learning_rate": 0.0001,
"loss": 1.7937,
"step": 45850
},
{
"epoch": 0.631370445260595,
"grad_norm": 0.15916399657726288,
"learning_rate": 0.0001,
"loss": 1.7938,
"step": 45900
},
{
"epoch": 0.6320582126301599,
"grad_norm": 0.1653524488210678,
"learning_rate": 0.0001,
"loss": 1.795,
"step": 45950
},
{
"epoch": 0.6327459799997249,
"grad_norm": 0.1588210016489029,
"learning_rate": 0.0001,
"loss": 1.7963,
"step": 46000
},
{
"epoch": 0.6334337473692898,
"grad_norm": 0.16008345782756805,
"learning_rate": 0.0001,
"loss": 1.7978,
"step": 46050
},
{
"epoch": 0.6341215147388547,
"grad_norm": 0.16054043173789978,
"learning_rate": 0.0001,
"loss": 1.7914,
"step": 46100
},
{
"epoch": 0.6348092821084197,
"grad_norm": 0.19745290279388428,
"learning_rate": 0.0001,
"loss": 1.7938,
"step": 46150
},
{
"epoch": 0.6354970494779846,
"grad_norm": 0.18955908715724945,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 46200
},
{
"epoch": 0.6361848168475495,
"grad_norm": 0.16962236166000366,
"learning_rate": 0.0001,
"loss": 1.7911,
"step": 46250
},
{
"epoch": 0.6368725842171145,
"grad_norm": 0.17200341820716858,
"learning_rate": 0.0001,
"loss": 1.7935,
"step": 46300
},
{
"epoch": 0.6375603515866793,
"grad_norm": 0.17781908810138702,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 46350
},
{
"epoch": 0.6382481189562442,
"grad_norm": 0.17602622509002686,
"learning_rate": 0.0001,
"loss": 1.7945,
"step": 46400
},
{
"epoch": 0.6389358863258091,
"grad_norm": 0.1686919629573822,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 46450
},
{
"epoch": 0.6396236536953741,
"grad_norm": 0.15013763308525085,
"learning_rate": 0.0001,
"loss": 1.7969,
"step": 46500
},
{
"epoch": 0.640311421064939,
"grad_norm": 0.16534103453159332,
"learning_rate": 0.0001,
"loss": 1.7943,
"step": 46550
},
{
"epoch": 0.6409991884345039,
"grad_norm": 0.16527748107910156,
"learning_rate": 0.0001,
"loss": 1.7904,
"step": 46600
},
{
"epoch": 0.6416869558040689,
"grad_norm": 0.15024395287036896,
"learning_rate": 0.0001,
"loss": 1.7944,
"step": 46650
},
{
"epoch": 0.6423747231736338,
"grad_norm": 0.17082852125167847,
"learning_rate": 0.0001,
"loss": 1.7942,
"step": 46700
},
{
"epoch": 0.6430624905431986,
"grad_norm": 0.1649017482995987,
"learning_rate": 0.0001,
"loss": 1.7936,
"step": 46750
},
{
"epoch": 0.6437502579127636,
"grad_norm": 0.16045525670051575,
"learning_rate": 0.0001,
"loss": 1.7913,
"step": 46800
},
{
"epoch": 0.6444380252823285,
"grad_norm": 0.18290746212005615,
"learning_rate": 0.0001,
"loss": 1.7898,
"step": 46850
},
{
"epoch": 0.6451257926518934,
"grad_norm": 0.14731939136981964,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 46900
},
{
"epoch": 0.6458135600214583,
"grad_norm": 0.16072627902030945,
"learning_rate": 0.0001,
"loss": 1.7933,
"step": 46950
},
{
"epoch": 0.6465013273910233,
"grad_norm": 0.14942970871925354,
"learning_rate": 0.0001,
"loss": 1.7944,
"step": 47000
},
{
"epoch": 0.6471890947605882,
"grad_norm": 0.14922235906124115,
"learning_rate": 0.0001,
"loss": 1.7953,
"step": 47050
},
{
"epoch": 0.647876862130153,
"grad_norm": 0.17120474576950073,
"learning_rate": 0.0001,
"loss": 1.7955,
"step": 47100
},
{
"epoch": 0.648564629499718,
"grad_norm": 0.17423823475837708,
"learning_rate": 0.0001,
"loss": 1.7919,
"step": 47150
},
{
"epoch": 0.6492523968692829,
"grad_norm": 0.1567763239145279,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 47200
},
{
"epoch": 0.6499401642388478,
"grad_norm": 0.15817411243915558,
"learning_rate": 0.0001,
"loss": 1.7928,
"step": 47250
},
{
"epoch": 0.6506279316084128,
"grad_norm": 0.1748141348361969,
"learning_rate": 0.0001,
"loss": 1.7884,
"step": 47300
},
{
"epoch": 0.6513156989779777,
"grad_norm": 0.2045951634645462,
"learning_rate": 0.0001,
"loss": 1.7978,
"step": 47350
},
{
"epoch": 0.6520034663475426,
"grad_norm": 0.17650052905082703,
"learning_rate": 0.0001,
"loss": 1.792,
"step": 47400
},
{
"epoch": 0.6526912337171076,
"grad_norm": 0.17905278503894806,
"learning_rate": 0.0001,
"loss": 1.7958,
"step": 47450
},
{
"epoch": 0.6533790010866725,
"grad_norm": 0.1599511355161667,
"learning_rate": 0.0001,
"loss": 1.7912,
"step": 47500
},
{
"epoch": 0.6540667684562373,
"grad_norm": 0.1584351658821106,
"learning_rate": 0.0001,
"loss": 1.7949,
"step": 47550
},
{
"epoch": 0.6547545358258022,
"grad_norm": 0.17251476645469666,
"learning_rate": 0.0001,
"loss": 1.7913,
"step": 47600
},
{
"epoch": 0.6554423031953672,
"grad_norm": 0.17718471586704254,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 47650
},
{
"epoch": 0.6561300705649321,
"grad_norm": 0.15196654200553894,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 47700
},
{
"epoch": 0.656817837934497,
"grad_norm": 0.17444145679473877,
"learning_rate": 0.0001,
"loss": 1.7907,
"step": 47750
},
{
"epoch": 0.657505605304062,
"grad_norm": 0.15149961411952972,
"learning_rate": 0.0001,
"loss": 1.7959,
"step": 47800
},
{
"epoch": 0.6581933726736269,
"grad_norm": 0.1591227501630783,
"learning_rate": 0.0001,
"loss": 1.7907,
"step": 47850
},
{
"epoch": 0.6588811400431918,
"grad_norm": 0.20135171711444855,
"learning_rate": 0.0001,
"loss": 1.7963,
"step": 47900
},
{
"epoch": 0.6595689074127568,
"grad_norm": 0.16523614525794983,
"learning_rate": 0.0001,
"loss": 1.7968,
"step": 47950
},
{
"epoch": 0.6602566747823216,
"grad_norm": 0.15842151641845703,
"learning_rate": 0.0001,
"loss": 1.7897,
"step": 48000
},
{
"epoch": 0.6609444421518865,
"grad_norm": 0.160832479596138,
"learning_rate": 0.0001,
"loss": 1.796,
"step": 48050
},
{
"epoch": 0.6616322095214515,
"grad_norm": 0.16063477098941803,
"learning_rate": 0.0001,
"loss": 1.7903,
"step": 48100
},
{
"epoch": 0.6623199768910164,
"grad_norm": 0.1595107465982437,
"learning_rate": 0.0001,
"loss": 1.7953,
"step": 48150
},
{
"epoch": 0.6630077442605813,
"grad_norm": 0.18313910067081451,
"learning_rate": 0.0001,
"loss": 1.7957,
"step": 48200
},
{
"epoch": 0.6636955116301462,
"grad_norm": 0.17561380565166473,
"learning_rate": 0.0001,
"loss": 1.7906,
"step": 48250
},
{
"epoch": 0.6643832789997112,
"grad_norm": 0.18327072262763977,
"learning_rate": 0.0001,
"loss": 1.7916,
"step": 48300
},
{
"epoch": 0.665071046369276,
"grad_norm": 0.16745221614837646,
"learning_rate": 0.0001,
"loss": 1.791,
"step": 48350
},
{
"epoch": 0.6657588137388409,
"grad_norm": 0.16286319494247437,
"learning_rate": 0.0001,
"loss": 1.7942,
"step": 48400
},
{
"epoch": 0.6664465811084059,
"grad_norm": 0.15864308178424835,
"learning_rate": 0.0001,
"loss": 1.7953,
"step": 48450
},
{
"epoch": 0.6671343484779708,
"grad_norm": 0.16778843104839325,
"learning_rate": 0.0001,
"loss": 1.7945,
"step": 48500
},
{
"epoch": 0.6678221158475357,
"grad_norm": 0.1448727399110794,
"learning_rate": 0.0001,
"loss": 1.7942,
"step": 48550
},
{
"epoch": 0.6685098832171007,
"grad_norm": 0.16745643317699432,
"learning_rate": 0.0001,
"loss": 1.7903,
"step": 48600
},
{
"epoch": 0.6691976505866656,
"grad_norm": 0.1633836030960083,
"learning_rate": 0.0001,
"loss": 1.7938,
"step": 48650
},
{
"epoch": 0.6698854179562305,
"grad_norm": 0.15037505328655243,
"learning_rate": 0.0001,
"loss": 1.7963,
"step": 48700
},
{
"epoch": 0.6705731853257954,
"grad_norm": 0.1707869917154312,
"learning_rate": 0.0001,
"loss": 1.7895,
"step": 48750
},
{
"epoch": 0.6712609526953603,
"grad_norm": 0.17392534017562866,
"learning_rate": 0.0001,
"loss": 1.7926,
"step": 48800
},
{
"epoch": 0.6719487200649252,
"grad_norm": 0.1588422805070877,
"learning_rate": 0.0001,
"loss": 1.7958,
"step": 48850
},
{
"epoch": 0.6726364874344901,
"grad_norm": 0.1751549243927002,
"learning_rate": 0.0001,
"loss": 1.7931,
"step": 48900
},
{
"epoch": 0.6733242548040551,
"grad_norm": 0.1722249686717987,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 48950
},
{
"epoch": 0.67401202217362,
"grad_norm": 0.1673288643360138,
"learning_rate": 0.0001,
"loss": 1.793,
"step": 49000
},
{
"epoch": 0.6746997895431849,
"grad_norm": 0.1552770733833313,
"learning_rate": 0.0001,
"loss": 1.7916,
"step": 49050
},
{
"epoch": 0.6753875569127499,
"grad_norm": 0.15788178145885468,
"learning_rate": 0.0001,
"loss": 1.7981,
"step": 49100
},
{
"epoch": 0.6760753242823148,
"grad_norm": 0.17959725856781006,
"learning_rate": 0.0001,
"loss": 1.7949,
"step": 49150
},
{
"epoch": 0.6767630916518796,
"grad_norm": 0.1584416925907135,
"learning_rate": 0.0001,
"loss": 1.7946,
"step": 49200
},
{
"epoch": 0.6774508590214446,
"grad_norm": 0.1645151674747467,
"learning_rate": 0.0001,
"loss": 1.7916,
"step": 49250
},
{
"epoch": 0.6781386263910095,
"grad_norm": 0.1522347778081894,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 49300
},
{
"epoch": 0.6788263937605744,
"grad_norm": 0.16095298528671265,
"learning_rate": 0.0001,
"loss": 1.7927,
"step": 49350
},
{
"epoch": 0.6795141611301393,
"grad_norm": 0.15317974984645844,
"learning_rate": 0.0001,
"loss": 1.7947,
"step": 49400
},
{
"epoch": 0.6802019284997043,
"grad_norm": 0.16854670643806458,
"learning_rate": 0.0001,
"loss": 1.7929,
"step": 49450
},
{
"epoch": 0.6808896958692692,
"grad_norm": 0.1702488660812378,
"learning_rate": 0.0001,
"loss": 1.791,
"step": 49500
},
{
"epoch": 0.6815774632388341,
"grad_norm": 0.16388344764709473,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 49550
},
{
"epoch": 0.682265230608399,
"grad_norm": 0.16601653397083282,
"learning_rate": 0.0001,
"loss": 1.7949,
"step": 49600
},
{
"epoch": 0.6829529979779639,
"grad_norm": 0.17910674214363098,
"learning_rate": 0.0001,
"loss": 1.7875,
"step": 49650
},
{
"epoch": 0.6836407653475288,
"grad_norm": 0.15689565241336823,
"learning_rate": 0.0001,
"loss": 1.7904,
"step": 49700
},
{
"epoch": 0.6843285327170938,
"grad_norm": 0.15473750233650208,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 49750
},
{
"epoch": 0.6850163000866587,
"grad_norm": 0.16794639825820923,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 49800
},
{
"epoch": 0.6857040674562236,
"grad_norm": 0.15183915197849274,
"learning_rate": 0.0001,
"loss": 1.7887,
"step": 49850
},
{
"epoch": 0.6863918348257885,
"grad_norm": 0.15028232336044312,
"learning_rate": 0.0001,
"loss": 1.7929,
"step": 49900
},
{
"epoch": 0.6870796021953535,
"grad_norm": 0.16230390965938568,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 49950
},
{
"epoch": 0.6877673695649184,
"grad_norm": 0.16958658397197723,
"learning_rate": 0.0001,
"loss": 1.7932,
"step": 50000
},
{
"epoch": 0.6884551369344832,
"grad_norm": 0.15662765502929688,
"learning_rate": 0.0001,
"loss": 1.7904,
"step": 50050
},
{
"epoch": 0.6891429043040482,
"grad_norm": 0.17507807910442352,
"learning_rate": 0.0001,
"loss": 1.795,
"step": 50100
},
{
"epoch": 0.6898306716736131,
"grad_norm": 0.16449585556983948,
"learning_rate": 0.0001,
"loss": 1.7888,
"step": 50150
},
{
"epoch": 0.690518439043178,
"grad_norm": 0.17615753412246704,
"learning_rate": 0.0001,
"loss": 1.7889,
"step": 50200
},
{
"epoch": 0.691206206412743,
"grad_norm": 0.16010646522045135,
"learning_rate": 0.0001,
"loss": 1.7932,
"step": 50250
},
{
"epoch": 0.6918939737823079,
"grad_norm": 0.14614787697792053,
"learning_rate": 0.0001,
"loss": 1.792,
"step": 50300
},
{
"epoch": 0.6925817411518728,
"grad_norm": 0.19960370659828186,
"learning_rate": 0.0001,
"loss": 1.7907,
"step": 50350
},
{
"epoch": 0.6932695085214378,
"grad_norm": 0.16230808198451996,
"learning_rate": 0.0001,
"loss": 1.7855,
"step": 50400
},
{
"epoch": 0.6939572758910026,
"grad_norm": 0.16344518959522247,
"learning_rate": 0.0001,
"loss": 1.791,
"step": 50450
},
{
"epoch": 0.6946450432605675,
"grad_norm": 0.16584964096546173,
"learning_rate": 0.0001,
"loss": 1.7916,
"step": 50500
},
{
"epoch": 0.6953328106301324,
"grad_norm": 0.15551120042800903,
"learning_rate": 0.0001,
"loss": 1.7948,
"step": 50550
},
{
"epoch": 0.6960205779996974,
"grad_norm": 0.1697503924369812,
"learning_rate": 0.0001,
"loss": 1.7917,
"step": 50600
},
{
"epoch": 0.6967083453692623,
"grad_norm": 0.15577536821365356,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 50650
},
{
"epoch": 0.6973961127388272,
"grad_norm": 0.17658278346061707,
"learning_rate": 0.0001,
"loss": 1.7884,
"step": 50700
},
{
"epoch": 0.6980838801083922,
"grad_norm": 0.16718824207782745,
"learning_rate": 0.0001,
"loss": 1.7936,
"step": 50750
},
{
"epoch": 0.6987716474779571,
"grad_norm": 0.16996939480304718,
"learning_rate": 0.0001,
"loss": 1.7919,
"step": 50800
},
{
"epoch": 0.6994594148475219,
"grad_norm": 0.15299175679683685,
"learning_rate": 0.0001,
"loss": 1.7919,
"step": 50850
},
{
"epoch": 0.7001471822170869,
"grad_norm": 0.1672915816307068,
"learning_rate": 0.0001,
"loss": 1.795,
"step": 50900
},
{
"epoch": 0.7008349495866518,
"grad_norm": 0.17287658154964447,
"learning_rate": 0.0001,
"loss": 1.7877,
"step": 50950
},
{
"epoch": 0.7015227169562167,
"grad_norm": 0.16447900235652924,
"learning_rate": 0.0001,
"loss": 1.7915,
"step": 51000
},
{
"epoch": 0.7022104843257817,
"grad_norm": 0.16016733646392822,
"learning_rate": 0.0001,
"loss": 1.7911,
"step": 51050
},
{
"epoch": 0.7028982516953466,
"grad_norm": 0.15329506993293762,
"learning_rate": 0.0001,
"loss": 1.7915,
"step": 51100
},
{
"epoch": 0.7035860190649115,
"grad_norm": 0.1695086658000946,
"learning_rate": 0.0001,
"loss": 1.7925,
"step": 51150
},
{
"epoch": 0.7042737864344764,
"grad_norm": 0.15667758882045746,
"learning_rate": 0.0001,
"loss": 1.7908,
"step": 51200
},
{
"epoch": 0.7049615538040414,
"grad_norm": 0.1636906862258911,
"learning_rate": 0.0001,
"loss": 1.7911,
"step": 51250
},
{
"epoch": 0.7056493211736062,
"grad_norm": 0.16701051592826843,
"learning_rate": 0.0001,
"loss": 1.7929,
"step": 51300
},
{
"epoch": 0.7063370885431711,
"grad_norm": 0.17164082825183868,
"learning_rate": 0.0001,
"loss": 1.7922,
"step": 51350
},
{
"epoch": 0.7070248559127361,
"grad_norm": 0.18162649869918823,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 51400
},
{
"epoch": 0.707712623282301,
"grad_norm": 0.1521824300289154,
"learning_rate": 0.0001,
"loss": 1.7937,
"step": 51450
},
{
"epoch": 0.7084003906518659,
"grad_norm": 0.168669655919075,
"learning_rate": 0.0001,
"loss": 1.7873,
"step": 51500
},
{
"epoch": 0.7090881580214309,
"grad_norm": 0.17441484332084656,
"learning_rate": 0.0001,
"loss": 1.79,
"step": 51550
},
{
"epoch": 0.7097759253909958,
"grad_norm": 0.1877586394548416,
"learning_rate": 0.0001,
"loss": 1.7927,
"step": 51600
},
{
"epoch": 0.7104636927605607,
"grad_norm": 0.16195935010910034,
"learning_rate": 0.0001,
"loss": 1.7962,
"step": 51650
},
{
"epoch": 0.7111514601301255,
"grad_norm": 0.16282670199871063,
"learning_rate": 0.0001,
"loss": 1.7939,
"step": 51700
},
{
"epoch": 0.7118392274996905,
"grad_norm": 0.15550565719604492,
"learning_rate": 0.0001,
"loss": 1.793,
"step": 51750
},
{
"epoch": 0.7125269948692554,
"grad_norm": 0.16963760554790497,
"learning_rate": 0.0001,
"loss": 1.7921,
"step": 51800
},
{
"epoch": 0.7132147622388203,
"grad_norm": 0.1632436364889145,
"learning_rate": 0.0001,
"loss": 1.7943,
"step": 51850
},
{
"epoch": 0.7139025296083853,
"grad_norm": 0.15533354878425598,
"learning_rate": 0.0001,
"loss": 1.7917,
"step": 51900
},
{
"epoch": 0.7145902969779502,
"grad_norm": 0.15280106663703918,
"learning_rate": 0.0001,
"loss": 1.7874,
"step": 51950
},
{
"epoch": 0.7152780643475151,
"grad_norm": 0.1561509668827057,
"learning_rate": 0.0001,
"loss": 1.7918,
"step": 52000
},
{
"epoch": 0.7159658317170801,
"grad_norm": 0.1560848206281662,
"learning_rate": 0.0001,
"loss": 1.7927,
"step": 52050
},
{
"epoch": 0.7166535990866449,
"grad_norm": 0.1706065684556961,
"learning_rate": 0.0001,
"loss": 1.7877,
"step": 52100
},
{
"epoch": 0.7173413664562098,
"grad_norm": 0.16388699412345886,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 52150
},
{
"epoch": 0.7180291338257748,
"grad_norm": 0.16502410173416138,
"learning_rate": 0.0001,
"loss": 1.7899,
"step": 52200
},
{
"epoch": 0.7187169011953397,
"grad_norm": 0.17022061347961426,
"learning_rate": 0.0001,
"loss": 1.7881,
"step": 52250
},
{
"epoch": 0.7194046685649046,
"grad_norm": 0.17903153598308563,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 52300
},
{
"epoch": 0.7200924359344695,
"grad_norm": 0.15719935297966003,
"learning_rate": 0.0001,
"loss": 1.7934,
"step": 52350
},
{
"epoch": 0.7207802033040345,
"grad_norm": 0.16321443021297455,
"learning_rate": 0.0001,
"loss": 1.7914,
"step": 52400
},
{
"epoch": 0.7214679706735994,
"grad_norm": 0.1724744439125061,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 52450
},
{
"epoch": 0.7221557380431642,
"grad_norm": 0.16059927642345428,
"learning_rate": 0.0001,
"loss": 1.7929,
"step": 52500
},
{
"epoch": 0.7228435054127292,
"grad_norm": 0.17748789489269257,
"learning_rate": 0.0001,
"loss": 1.7913,
"step": 52550
},
{
"epoch": 0.7235312727822941,
"grad_norm": 0.16190293431282043,
"learning_rate": 0.0001,
"loss": 1.7956,
"step": 52600
},
{
"epoch": 0.724219040151859,
"grad_norm": 0.1841738224029541,
"learning_rate": 0.0001,
"loss": 1.7899,
"step": 52650
},
{
"epoch": 0.724906807521424,
"grad_norm": 0.15971702337265015,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 52700
},
{
"epoch": 0.7255945748909889,
"grad_norm": 0.15894858539104462,
"learning_rate": 0.0001,
"loss": 1.7939,
"step": 52750
},
{
"epoch": 0.7262823422605538,
"grad_norm": 0.15041370689868927,
"learning_rate": 0.0001,
"loss": 1.7885,
"step": 52800
},
{
"epoch": 0.7269701096301187,
"grad_norm": 0.15757033228874207,
"learning_rate": 0.0001,
"loss": 1.7881,
"step": 52850
},
{
"epoch": 0.7276578769996837,
"grad_norm": 0.16385579109191895,
"learning_rate": 0.0001,
"loss": 1.7889,
"step": 52900
},
{
"epoch": 0.7283456443692485,
"grad_norm": 0.15629428625106812,
"learning_rate": 0.0001,
"loss": 1.7932,
"step": 52950
},
{
"epoch": 0.7290334117388134,
"grad_norm": 0.1573755145072937,
"learning_rate": 0.0001,
"loss": 1.7926,
"step": 53000
},
{
"epoch": 0.7297211791083784,
"grad_norm": 0.15800927579402924,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 53050
},
{
"epoch": 0.7304089464779433,
"grad_norm": 0.16997511684894562,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 53100
},
{
"epoch": 0.7310967138475082,
"grad_norm": 0.1457889825105667,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 53150
},
{
"epoch": 0.7317844812170732,
"grad_norm": 0.15250973403453827,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 53200
},
{
"epoch": 0.7324722485866381,
"grad_norm": 0.1561204344034195,
"learning_rate": 0.0001,
"loss": 1.7915,
"step": 53250
},
{
"epoch": 0.733160015956203,
"grad_norm": 0.17602892220020294,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 53300
},
{
"epoch": 0.7338477833257679,
"grad_norm": 0.15751750767230988,
"learning_rate": 0.0001,
"loss": 1.7924,
"step": 53350
},
{
"epoch": 0.7345355506953328,
"grad_norm": 0.1686706244945526,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 53400
},
{
"epoch": 0.7352233180648977,
"grad_norm": 0.15886232256889343,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 53450
},
{
"epoch": 0.7359110854344626,
"grad_norm": 0.1548243910074234,
"learning_rate": 0.0001,
"loss": 1.7887,
"step": 53500
},
{
"epoch": 0.7365988528040276,
"grad_norm": 0.16160327196121216,
"learning_rate": 0.0001,
"loss": 1.792,
"step": 53550
},
{
"epoch": 0.7372866201735925,
"grad_norm": 0.1588127613067627,
"learning_rate": 0.0001,
"loss": 1.791,
"step": 53600
},
{
"epoch": 0.7379743875431574,
"grad_norm": 0.1562395691871643,
"learning_rate": 0.0001,
"loss": 1.7876,
"step": 53650
},
{
"epoch": 0.7386621549127224,
"grad_norm": 0.1463010013103485,
"learning_rate": 0.0001,
"loss": 1.7903,
"step": 53700
},
{
"epoch": 0.7393499222822872,
"grad_norm": 0.1688784807920456,
"learning_rate": 0.0001,
"loss": 1.7874,
"step": 53750
},
{
"epoch": 0.7400376896518521,
"grad_norm": 0.16111525893211365,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 53800
},
{
"epoch": 0.7407254570214171,
"grad_norm": 0.15798266232013702,
"learning_rate": 0.0001,
"loss": 1.7901,
"step": 53850
},
{
"epoch": 0.741413224390982,
"grad_norm": 0.1544068306684494,
"learning_rate": 0.0001,
"loss": 1.79,
"step": 53900
},
{
"epoch": 0.7421009917605469,
"grad_norm": 0.16747315227985382,
"learning_rate": 0.0001,
"loss": 1.7923,
"step": 53950
},
{
"epoch": 0.7427887591301119,
"grad_norm": 0.20277969539165497,
"learning_rate": 0.0001,
"loss": 1.7932,
"step": 54000
},
{
"epoch": 0.7434765264996768,
"grad_norm": 0.1490595042705536,
"learning_rate": 0.0001,
"loss": 1.7899,
"step": 54050
},
{
"epoch": 0.7441642938692417,
"grad_norm": 0.15864817798137665,
"learning_rate": 0.0001,
"loss": 1.7838,
"step": 54100
},
{
"epoch": 0.7448520612388065,
"grad_norm": 0.17168639600276947,
"learning_rate": 0.0001,
"loss": 1.79,
"step": 54150
},
{
"epoch": 0.7455398286083715,
"grad_norm": 0.1612584888935089,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 54200
},
{
"epoch": 0.7462275959779364,
"grad_norm": 0.16638678312301636,
"learning_rate": 0.0001,
"loss": 1.7852,
"step": 54250
},
{
"epoch": 0.7469153633475013,
"grad_norm": 0.16757947206497192,
"learning_rate": 0.0001,
"loss": 1.7899,
"step": 54300
},
{
"epoch": 0.7476031307170663,
"grad_norm": 0.17740657925605774,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 54350
},
{
"epoch": 0.7482908980866312,
"grad_norm": 0.15608841180801392,
"learning_rate": 0.0001,
"loss": 1.7864,
"step": 54400
},
{
"epoch": 0.7489786654561961,
"grad_norm": 0.1486404538154602,
"learning_rate": 0.0001,
"loss": 1.7895,
"step": 54450
},
{
"epoch": 0.7496664328257611,
"grad_norm": 0.17158234119415283,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 54500
},
{
"epoch": 0.750354200195326,
"grad_norm": 0.1535918265581131,
"learning_rate": 0.0001,
"loss": 1.7858,
"step": 54550
},
{
"epoch": 0.7510419675648908,
"grad_norm": 0.17464052140712738,
"learning_rate": 0.0001,
"loss": 1.7884,
"step": 54600
},
{
"epoch": 0.7517297349344557,
"grad_norm": 0.15320485830307007,
"learning_rate": 0.0001,
"loss": 1.7909,
"step": 54650
},
{
"epoch": 0.7524175023040207,
"grad_norm": 0.16376914083957672,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 54700
},
{
"epoch": 0.7531052696735856,
"grad_norm": 0.17047230899333954,
"learning_rate": 0.0001,
"loss": 1.7886,
"step": 54750
},
{
"epoch": 0.7537930370431505,
"grad_norm": 0.1580251306295395,
"learning_rate": 0.0001,
"loss": 1.7904,
"step": 54800
},
{
"epoch": 0.7544808044127155,
"grad_norm": 0.16085964441299438,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 54850
},
{
"epoch": 0.7551685717822804,
"grad_norm": 0.1530008316040039,
"learning_rate": 0.0001,
"loss": 1.7909,
"step": 54900
},
{
"epoch": 0.7558563391518452,
"grad_norm": 0.18514500558376312,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 54950
},
{
"epoch": 0.7565441065214102,
"grad_norm": 0.16724203526973724,
"learning_rate": 0.0001,
"loss": 1.7895,
"step": 55000
},
{
"epoch": 0.7572318738909751,
"grad_norm": 0.17008638381958008,
"learning_rate": 0.0001,
"loss": 1.7909,
"step": 55050
},
{
"epoch": 0.75791964126054,
"grad_norm": 0.15402346849441528,
"learning_rate": 0.0001,
"loss": 1.7858,
"step": 55100
},
{
"epoch": 0.758607408630105,
"grad_norm": 0.1750432401895523,
"learning_rate": 0.0001,
"loss": 1.7898,
"step": 55150
},
{
"epoch": 0.7592951759996699,
"grad_norm": 0.18680183589458466,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 55200
},
{
"epoch": 0.7599829433692348,
"grad_norm": 0.16581743955612183,
"learning_rate": 0.0001,
"loss": 1.7902,
"step": 55250
},
{
"epoch": 0.7606707107387997,
"grad_norm": 0.16159740090370178,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 55300
},
{
"epoch": 0.7613584781083647,
"grad_norm": 0.14381587505340576,
"learning_rate": 0.0001,
"loss": 1.7918,
"step": 55350
},
{
"epoch": 0.7620462454779295,
"grad_norm": 0.15160152316093445,
"learning_rate": 0.0001,
"loss": 1.789,
"step": 55400
},
{
"epoch": 0.7627340128474944,
"grad_norm": 0.16748382151126862,
"learning_rate": 0.0001,
"loss": 1.7865,
"step": 55450
},
{
"epoch": 0.7634217802170594,
"grad_norm": 0.15434932708740234,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 55500
},
{
"epoch": 0.7641095475866243,
"grad_norm": 0.16281753778457642,
"learning_rate": 0.0001,
"loss": 1.7909,
"step": 55550
},
{
"epoch": 0.7647973149561892,
"grad_norm": 0.1581009328365326,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 55600
},
{
"epoch": 0.7654850823257542,
"grad_norm": 0.16244924068450928,
"learning_rate": 0.0001,
"loss": 1.7882,
"step": 55650
},
{
"epoch": 0.7661728496953191,
"grad_norm": 0.1727581024169922,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 55700
},
{
"epoch": 0.766860617064884,
"grad_norm": 0.15804524719715118,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 55750
},
{
"epoch": 0.7675483844344488,
"grad_norm": 0.16742980480194092,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 55800
},
{
"epoch": 0.7682361518040138,
"grad_norm": 0.15518859028816223,
"learning_rate": 0.0001,
"loss": 1.7877,
"step": 55850
},
{
"epoch": 0.7689239191735787,
"grad_norm": 0.14549891650676727,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 55900
},
{
"epoch": 0.7696116865431436,
"grad_norm": 0.15677410364151,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 55950
},
{
"epoch": 0.7702994539127086,
"grad_norm": 0.1627907007932663,
"learning_rate": 0.0001,
"loss": 1.7861,
"step": 56000
},
{
"epoch": 0.7709872212822735,
"grad_norm": 0.17789112031459808,
"learning_rate": 0.0001,
"loss": 1.7917,
"step": 56050
},
{
"epoch": 0.7716749886518384,
"grad_norm": 0.17732852697372437,
"learning_rate": 0.0001,
"loss": 1.7885,
"step": 56100
},
{
"epoch": 0.7723627560214034,
"grad_norm": 0.16175003349781036,
"learning_rate": 0.0001,
"loss": 1.7847,
"step": 56150
},
{
"epoch": 0.7730505233909682,
"grad_norm": 0.16384829580783844,
"learning_rate": 0.0001,
"loss": 1.7879,
"step": 56200
},
{
"epoch": 0.7737382907605331,
"grad_norm": 0.18334250152111053,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 56250
},
{
"epoch": 0.7744260581300981,
"grad_norm": 0.16775920987129211,
"learning_rate": 0.0001,
"loss": 1.7844,
"step": 56300
},
{
"epoch": 0.775113825499663,
"grad_norm": 0.15945740044116974,
"learning_rate": 0.0001,
"loss": 1.7867,
"step": 56350
},
{
"epoch": 0.7758015928692279,
"grad_norm": 0.16826015710830688,
"learning_rate": 0.0001,
"loss": 1.7874,
"step": 56400
},
{
"epoch": 0.7764893602387928,
"grad_norm": 0.16733418405056,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 56450
},
{
"epoch": 0.7771771276083578,
"grad_norm": 0.17716175317764282,
"learning_rate": 0.0001,
"loss": 1.7852,
"step": 56500
},
{
"epoch": 0.7778648949779227,
"grad_norm": 0.15145139396190643,
"learning_rate": 0.0001,
"loss": 1.7864,
"step": 56550
},
{
"epoch": 0.7785526623474875,
"grad_norm": 0.1650010645389557,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 56600
},
{
"epoch": 0.7792404297170525,
"grad_norm": 0.15676827728748322,
"learning_rate": 0.0001,
"loss": 1.7863,
"step": 56650
},
{
"epoch": 0.7799281970866174,
"grad_norm": 0.15251976251602173,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 56700
},
{
"epoch": 0.7806159644561823,
"grad_norm": 0.16107071936130524,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 56750
},
{
"epoch": 0.7813037318257473,
"grad_norm": 0.16008871793746948,
"learning_rate": 0.0001,
"loss": 1.7879,
"step": 56800
},
{
"epoch": 0.7819914991953122,
"grad_norm": 0.1748703122138977,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 56850
},
{
"epoch": 0.7826792665648771,
"grad_norm": 0.1847066432237625,
"learning_rate": 0.0001,
"loss": 1.7878,
"step": 56900
},
{
"epoch": 0.7833670339344421,
"grad_norm": 0.14105017483234406,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 56950
},
{
"epoch": 0.784054801304007,
"grad_norm": 0.1463741511106491,
"learning_rate": 0.0001,
"loss": 1.784,
"step": 57000
},
{
"epoch": 0.7847425686735718,
"grad_norm": 0.15982814133167267,
"learning_rate": 0.0001,
"loss": 1.7904,
"step": 57050
},
{
"epoch": 0.7854303360431367,
"grad_norm": 0.15282031893730164,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 57100
},
{
"epoch": 0.7861181034127017,
"grad_norm": 0.16466231644153595,
"learning_rate": 0.0001,
"loss": 1.7862,
"step": 57150
},
{
"epoch": 0.7868058707822666,
"grad_norm": 0.16176077723503113,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 57200
},
{
"epoch": 0.7874936381518315,
"grad_norm": 0.16768991947174072,
"learning_rate": 0.0001,
"loss": 1.791,
"step": 57250
},
{
"epoch": 0.7881814055213965,
"grad_norm": 0.15378397703170776,
"learning_rate": 0.0001,
"loss": 1.7889,
"step": 57300
},
{
"epoch": 0.7888691728909614,
"grad_norm": 0.16845440864562988,
"learning_rate": 0.0001,
"loss": 1.7865,
"step": 57350
},
{
"epoch": 0.7895569402605263,
"grad_norm": 0.16859596967697144,
"learning_rate": 0.0001,
"loss": 1.7893,
"step": 57400
},
{
"epoch": 0.7902447076300912,
"grad_norm": 0.17096339166164398,
"learning_rate": 0.0001,
"loss": 1.7842,
"step": 57450
},
{
"epoch": 0.7909324749996561,
"grad_norm": 0.19546246528625488,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 57500
},
{
"epoch": 0.791620242369221,
"grad_norm": 0.15690521895885468,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 57550
},
{
"epoch": 0.7923080097387859,
"grad_norm": 0.15288680791854858,
"learning_rate": 0.0001,
"loss": 1.7871,
"step": 57600
},
{
"epoch": 0.7929957771083509,
"grad_norm": 0.15947267413139343,
"learning_rate": 0.0001,
"loss": 1.7851,
"step": 57650
},
{
"epoch": 0.7936835444779158,
"grad_norm": 0.1813030242919922,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 57700
},
{
"epoch": 0.7943713118474807,
"grad_norm": 0.16709686815738678,
"learning_rate": 0.0001,
"loss": 1.7908,
"step": 57750
},
{
"epoch": 0.7950590792170457,
"grad_norm": 0.19110731780529022,
"learning_rate": 0.0001,
"loss": 1.7845,
"step": 57800
},
{
"epoch": 0.7957468465866105,
"grad_norm": 0.15795393288135529,
"learning_rate": 0.0001,
"loss": 1.7908,
"step": 57850
},
{
"epoch": 0.7964346139561754,
"grad_norm": 0.14493565261363983,
"learning_rate": 0.0001,
"loss": 1.7893,
"step": 57900
},
{
"epoch": 0.7971223813257404,
"grad_norm": 0.14182139933109283,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 57950
},
{
"epoch": 0.7978101486953053,
"grad_norm": 0.14074084162712097,
"learning_rate": 0.0001,
"loss": 1.7857,
"step": 58000
},
{
"epoch": 0.7984979160648702,
"grad_norm": 0.1791408807039261,
"learning_rate": 0.0001,
"loss": 1.7889,
"step": 58050
},
{
"epoch": 0.7991856834344352,
"grad_norm": 0.17944924533367157,
"learning_rate": 0.0001,
"loss": 1.7884,
"step": 58100
},
{
"epoch": 0.7998734508040001,
"grad_norm": 0.19336557388305664,
"learning_rate": 0.0001,
"loss": 1.786,
"step": 58150
},
{
"epoch": 0.800561218173565,
"grad_norm": 0.14197582006454468,
"learning_rate": 0.0001,
"loss": 1.7834,
"step": 58200
},
{
"epoch": 0.8012489855431298,
"grad_norm": 0.17862093448638916,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 58250
},
{
"epoch": 0.8019367529126948,
"grad_norm": 0.15174590051174164,
"learning_rate": 0.0001,
"loss": 1.7883,
"step": 58300
},
{
"epoch": 0.8026245202822597,
"grad_norm": 0.15902046859264374,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 58350
},
{
"epoch": 0.8033122876518246,
"grad_norm": 0.1593545824289322,
"learning_rate": 0.0001,
"loss": 1.7871,
"step": 58400
},
{
"epoch": 0.8040000550213896,
"grad_norm": 0.16780108213424683,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 58450
},
{
"epoch": 0.8046878223909545,
"grad_norm": 0.16704651713371277,
"learning_rate": 0.0001,
"loss": 1.7827,
"step": 58500
},
{
"epoch": 0.8053755897605194,
"grad_norm": 0.20908869802951813,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 58550
},
{
"epoch": 0.8060633571300844,
"grad_norm": 0.1484072208404541,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 58600
},
{
"epoch": 0.8067511244996493,
"grad_norm": 0.16092757880687714,
"learning_rate": 0.0001,
"loss": 1.7849,
"step": 58650
},
{
"epoch": 0.8074388918692141,
"grad_norm": 0.15798570215702057,
"learning_rate": 0.0001,
"loss": 1.7897,
"step": 58700
},
{
"epoch": 0.808126659238779,
"grad_norm": 0.15388993918895721,
"learning_rate": 0.0001,
"loss": 1.7874,
"step": 58750
},
{
"epoch": 0.808814426608344,
"grad_norm": 0.16136646270751953,
"learning_rate": 0.0001,
"loss": 1.7866,
"step": 58800
},
{
"epoch": 0.8095021939779089,
"grad_norm": 0.20280751585960388,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 58850
},
{
"epoch": 0.8101899613474738,
"grad_norm": 0.16941416263580322,
"learning_rate": 0.0001,
"loss": 1.7834,
"step": 58900
},
{
"epoch": 0.8108777287170388,
"grad_norm": 0.1597299724817276,
"learning_rate": 0.0001,
"loss": 1.7823,
"step": 58950
},
{
"epoch": 0.8115654960866037,
"grad_norm": 0.1581617146730423,
"learning_rate": 0.0001,
"loss": 1.7902,
"step": 59000
},
{
"epoch": 0.8122532634561686,
"grad_norm": 0.17084243893623352,
"learning_rate": 0.0001,
"loss": 1.7873,
"step": 59050
},
{
"epoch": 0.8129410308257335,
"grad_norm": 0.16124476492404938,
"learning_rate": 0.0001,
"loss": 1.7894,
"step": 59100
},
{
"epoch": 0.8136287981952984,
"grad_norm": 0.15042969584465027,
"learning_rate": 0.0001,
"loss": 1.7873,
"step": 59150
},
{
"epoch": 0.8143165655648633,
"grad_norm": 0.14492358267307281,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 59200
},
{
"epoch": 0.8150043329344283,
"grad_norm": 0.17020314931869507,
"learning_rate": 0.0001,
"loss": 1.7859,
"step": 59250
},
{
"epoch": 0.8156921003039932,
"grad_norm": 0.1630934178829193,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 59300
},
{
"epoch": 0.8163798676735581,
"grad_norm": 0.17032647132873535,
"learning_rate": 0.0001,
"loss": 1.7851,
"step": 59350
},
{
"epoch": 0.817067635043123,
"grad_norm": 0.15546603500843048,
"learning_rate": 0.0001,
"loss": 1.7866,
"step": 59400
},
{
"epoch": 0.817755402412688,
"grad_norm": 0.1688961237668991,
"learning_rate": 0.0001,
"loss": 1.7858,
"step": 59450
},
{
"epoch": 0.8184431697822528,
"grad_norm": 0.15222899615764618,
"learning_rate": 0.0001,
"loss": 1.7848,
"step": 59500
},
{
"epoch": 0.8191309371518177,
"grad_norm": 0.15309302508831024,
"learning_rate": 0.0001,
"loss": 1.7847,
"step": 59550
},
{
"epoch": 0.8198187045213827,
"grad_norm": 0.1601337045431137,
"learning_rate": 0.0001,
"loss": 1.7861,
"step": 59600
},
{
"epoch": 0.8205064718909476,
"grad_norm": 0.14973758161067963,
"learning_rate": 0.0001,
"loss": 1.7893,
"step": 59650
},
{
"epoch": 0.8211942392605125,
"grad_norm": 0.17928583920001984,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 59700
},
{
"epoch": 0.8218820066300775,
"grad_norm": 0.1628539264202118,
"learning_rate": 0.0001,
"loss": 1.7861,
"step": 59750
},
{
"epoch": 0.8225697739996424,
"grad_norm": 0.1617124229669571,
"learning_rate": 0.0001,
"loss": 1.7837,
"step": 59800
},
{
"epoch": 0.8232575413692073,
"grad_norm": 0.16710211336612701,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 59850
},
{
"epoch": 0.8239453087387723,
"grad_norm": 0.18266211450099945,
"learning_rate": 0.0001,
"loss": 1.7882,
"step": 59900
},
{
"epoch": 0.8246330761083371,
"grad_norm": 0.15460216999053955,
"learning_rate": 0.0001,
"loss": 1.7856,
"step": 59950
},
{
"epoch": 0.825320843477902,
"grad_norm": 0.19238495826721191,
"learning_rate": 0.0001,
"loss": 1.7867,
"step": 60000
},
{
"epoch": 0.8260086108474669,
"grad_norm": 0.17882536351680756,
"learning_rate": 0.0001,
"loss": 1.79,
"step": 60050
},
{
"epoch": 0.8266963782170319,
"grad_norm": 0.17022471129894257,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 60100
},
{
"epoch": 0.8273841455865968,
"grad_norm": 0.16253788769245148,
"learning_rate": 0.0001,
"loss": 1.7842,
"step": 60150
},
{
"epoch": 0.8280719129561617,
"grad_norm": 0.1684889793395996,
"learning_rate": 0.0001,
"loss": 1.7871,
"step": 60200
},
{
"epoch": 0.8287596803257267,
"grad_norm": 0.1623234748840332,
"learning_rate": 0.0001,
"loss": 1.7812,
"step": 60250
},
{
"epoch": 0.8294474476952916,
"grad_norm": 0.14207519590854645,
"learning_rate": 0.0001,
"loss": 1.7873,
"step": 60300
},
{
"epoch": 0.8301352150648564,
"grad_norm": 0.15550558269023895,
"learning_rate": 0.0001,
"loss": 1.7876,
"step": 60350
},
{
"epoch": 0.8308229824344214,
"grad_norm": 0.16578029096126556,
"learning_rate": 0.0001,
"loss": 1.7804,
"step": 60400
},
{
"epoch": 0.8315107498039863,
"grad_norm": 0.16406333446502686,
"learning_rate": 0.0001,
"loss": 1.7837,
"step": 60450
},
{
"epoch": 0.8321985171735512,
"grad_norm": 0.1568935364484787,
"learning_rate": 0.0001,
"loss": 1.786,
"step": 60500
},
{
"epoch": 0.8328862845431161,
"grad_norm": 0.17918673157691956,
"learning_rate": 0.0001,
"loss": 1.7877,
"step": 60550
},
{
"epoch": 0.8335740519126811,
"grad_norm": 0.14733350276947021,
"learning_rate": 0.0001,
"loss": 1.7821,
"step": 60600
},
{
"epoch": 0.834261819282246,
"grad_norm": 0.14916177093982697,
"learning_rate": 0.0001,
"loss": 1.7862,
"step": 60650
},
{
"epoch": 0.8349495866518108,
"grad_norm": 0.15052981674671173,
"learning_rate": 0.0001,
"loss": 1.7892,
"step": 60700
},
{
"epoch": 0.8356373540213758,
"grad_norm": 0.1831791251897812,
"learning_rate": 0.0001,
"loss": 1.7844,
"step": 60750
},
{
"epoch": 0.8363251213909407,
"grad_norm": 0.16115884482860565,
"learning_rate": 0.0001,
"loss": 1.7827,
"step": 60800
},
{
"epoch": 0.8370128887605056,
"grad_norm": 0.15721943974494934,
"learning_rate": 0.0001,
"loss": 1.7862,
"step": 60850
},
{
"epoch": 0.8377006561300706,
"grad_norm": 0.1528850942850113,
"learning_rate": 0.0001,
"loss": 1.7852,
"step": 60900
},
{
"epoch": 0.8383884234996355,
"grad_norm": 0.16134890913963318,
"learning_rate": 0.0001,
"loss": 1.7875,
"step": 60950
},
{
"epoch": 0.8390761908692004,
"grad_norm": 0.16336651146411896,
"learning_rate": 0.0001,
"loss": 1.7848,
"step": 61000
},
{
"epoch": 0.8397639582387654,
"grad_norm": 0.16578875482082367,
"learning_rate": 0.0001,
"loss": 1.7858,
"step": 61050
},
{
"epoch": 0.8404517256083303,
"grad_norm": 0.16235701739788055,
"learning_rate": 0.0001,
"loss": 1.7869,
"step": 61100
},
{
"epoch": 0.8411394929778951,
"grad_norm": 0.16650299727916718,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 61150
},
{
"epoch": 0.84182726034746,
"grad_norm": 0.148828387260437,
"learning_rate": 0.0001,
"loss": 1.7827,
"step": 61200
},
{
"epoch": 0.842515027717025,
"grad_norm": 0.1572546660900116,
"learning_rate": 0.0001,
"loss": 1.7846,
"step": 61250
},
{
"epoch": 0.8432027950865899,
"grad_norm": 0.15572214126586914,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 61300
},
{
"epoch": 0.8438905624561548,
"grad_norm": 0.18148384988307953,
"learning_rate": 0.0001,
"loss": 1.7829,
"step": 61350
},
{
"epoch": 0.8445783298257198,
"grad_norm": 0.16225239634513855,
"learning_rate": 0.0001,
"loss": 1.787,
"step": 61400
},
{
"epoch": 0.8452660971952847,
"grad_norm": 0.1546306014060974,
"learning_rate": 0.0001,
"loss": 1.7886,
"step": 61450
},
{
"epoch": 0.8459538645648496,
"grad_norm": 0.1589781790971756,
"learning_rate": 0.0001,
"loss": 1.7876,
"step": 61500
},
{
"epoch": 0.8466416319344146,
"grad_norm": 0.16938839852809906,
"learning_rate": 0.0001,
"loss": 1.7805,
"step": 61550
},
{
"epoch": 0.8473293993039794,
"grad_norm": 0.17635032534599304,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 61600
},
{
"epoch": 0.8480171666735443,
"grad_norm": 0.16436606645584106,
"learning_rate": 0.0001,
"loss": 1.7829,
"step": 61650
},
{
"epoch": 0.8487049340431092,
"grad_norm": 0.15410180389881134,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 61700
},
{
"epoch": 0.8493927014126742,
"grad_norm": 0.15711359679698944,
"learning_rate": 0.0001,
"loss": 1.7855,
"step": 61750
},
{
"epoch": 0.8500804687822391,
"grad_norm": 0.14257673919200897,
"learning_rate": 0.0001,
"loss": 1.7846,
"step": 61800
},
{
"epoch": 0.850768236151804,
"grad_norm": 0.1770082414150238,
"learning_rate": 0.0001,
"loss": 1.786,
"step": 61850
},
{
"epoch": 0.851456003521369,
"grad_norm": 0.14938481152057648,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 61900
},
{
"epoch": 0.8521437708909338,
"grad_norm": 0.16232655942440033,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 61950
},
{
"epoch": 0.8528315382604987,
"grad_norm": 0.14662796258926392,
"learning_rate": 0.0001,
"loss": 1.7846,
"step": 62000
},
{
"epoch": 0.8535193056300637,
"grad_norm": 0.15960827469825745,
"learning_rate": 0.0001,
"loss": 1.7868,
"step": 62050
},
{
"epoch": 0.8542070729996286,
"grad_norm": 0.1585722714662552,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 62100
},
{
"epoch": 0.8548948403691935,
"grad_norm": 0.15847063064575195,
"learning_rate": 0.0001,
"loss": 1.7861,
"step": 62150
},
{
"epoch": 0.8555826077387585,
"grad_norm": 0.1581469178199768,
"learning_rate": 0.0001,
"loss": 1.7872,
"step": 62200
},
{
"epoch": 0.8562703751083234,
"grad_norm": 0.18087923526763916,
"learning_rate": 0.0001,
"loss": 1.7837,
"step": 62250
},
{
"epoch": 0.8569581424778883,
"grad_norm": 0.15878331661224365,
"learning_rate": 0.0001,
"loss": 1.7865,
"step": 62300
},
{
"epoch": 0.8576459098474531,
"grad_norm": 0.1652536690235138,
"learning_rate": 0.0001,
"loss": 1.7864,
"step": 62350
},
{
"epoch": 0.8583336772170181,
"grad_norm": 0.16467753052711487,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 62400
},
{
"epoch": 0.859021444586583,
"grad_norm": 0.17342518270015717,
"learning_rate": 0.0001,
"loss": 1.7853,
"step": 62450
},
{
"epoch": 0.8597092119561479,
"grad_norm": 0.15487852692604065,
"learning_rate": 0.0001,
"loss": 1.7861,
"step": 62500
},
{
"epoch": 0.8603969793257129,
"grad_norm": 0.16185085475444794,
"learning_rate": 0.0001,
"loss": 1.7891,
"step": 62550
},
{
"epoch": 0.8610847466952778,
"grad_norm": 0.18629157543182373,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 62600
},
{
"epoch": 0.8617725140648427,
"grad_norm": 0.20009976625442505,
"learning_rate": 0.0001,
"loss": 1.7849,
"step": 62650
},
{
"epoch": 0.8624602814344077,
"grad_norm": 0.16432398557662964,
"learning_rate": 0.0001,
"loss": 1.786,
"step": 62700
},
{
"epoch": 0.8631480488039726,
"grad_norm": 0.16151119768619537,
"learning_rate": 0.0001,
"loss": 1.7838,
"step": 62750
},
{
"epoch": 0.8638358161735374,
"grad_norm": 0.16223236918449402,
"learning_rate": 0.0001,
"loss": 1.7857,
"step": 62800
},
{
"epoch": 0.8645235835431024,
"grad_norm": 0.15118102729320526,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 62850
},
{
"epoch": 0.8652113509126673,
"grad_norm": 0.15173585712909698,
"learning_rate": 0.0001,
"loss": 1.7858,
"step": 62900
},
{
"epoch": 0.8658991182822322,
"grad_norm": 0.1547808051109314,
"learning_rate": 0.0001,
"loss": 1.7806,
"step": 62950
},
{
"epoch": 0.8665868856517971,
"grad_norm": 0.1542670577764511,
"learning_rate": 0.0001,
"loss": 1.7816,
"step": 63000
},
{
"epoch": 0.8672746530213621,
"grad_norm": 0.16760842502117157,
"learning_rate": 0.0001,
"loss": 1.7845,
"step": 63050
},
{
"epoch": 0.867962420390927,
"grad_norm": 0.17703787982463837,
"learning_rate": 0.0001,
"loss": 1.788,
"step": 63100
},
{
"epoch": 0.8686501877604919,
"grad_norm": 0.1573743224143982,
"learning_rate": 0.0001,
"loss": 1.7792,
"step": 63150
},
{
"epoch": 0.8693379551300568,
"grad_norm": 0.1451522409915924,
"learning_rate": 0.0001,
"loss": 1.7854,
"step": 63200
},
{
"epoch": 0.8700257224996217,
"grad_norm": 0.17078782618045807,
"learning_rate": 0.0001,
"loss": 1.784,
"step": 63250
},
{
"epoch": 0.8707134898691866,
"grad_norm": 0.15471959114074707,
"learning_rate": 0.0001,
"loss": 1.7832,
"step": 63300
},
{
"epoch": 0.8714012572387516,
"grad_norm": 0.16724149882793427,
"learning_rate": 0.0001,
"loss": 1.7783,
"step": 63350
},
{
"epoch": 0.8720890246083165,
"grad_norm": 0.15160906314849854,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 63400
},
{
"epoch": 0.8727767919778814,
"grad_norm": 0.156820610165596,
"learning_rate": 0.0001,
"loss": 1.7856,
"step": 63450
},
{
"epoch": 0.8734645593474463,
"grad_norm": 0.16410048305988312,
"learning_rate": 0.0001,
"loss": 1.7845,
"step": 63500
},
{
"epoch": 0.8741523267170113,
"grad_norm": 0.16022023558616638,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 63550
},
{
"epoch": 0.8748400940865761,
"grad_norm": 0.1775195300579071,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 63600
},
{
"epoch": 0.875527861456141,
"grad_norm": 0.17621392011642456,
"learning_rate": 0.0001,
"loss": 1.7792,
"step": 63650
},
{
"epoch": 0.876215628825706,
"grad_norm": 0.17508172988891602,
"learning_rate": 0.0001,
"loss": 1.785,
"step": 63700
},
{
"epoch": 0.8769033961952709,
"grad_norm": 0.167220801115036,
"learning_rate": 0.0001,
"loss": 1.7838,
"step": 63750
},
{
"epoch": 0.8775911635648358,
"grad_norm": 0.22981862723827362,
"learning_rate": 0.0001,
"loss": 1.7885,
"step": 63800
},
{
"epoch": 0.8782789309344008,
"grad_norm": 0.17177161574363708,
"learning_rate": 0.0001,
"loss": 1.7846,
"step": 63850
},
{
"epoch": 0.8789666983039657,
"grad_norm": 0.16599243879318237,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 63900
},
{
"epoch": 0.8796544656735306,
"grad_norm": 0.17125064134597778,
"learning_rate": 0.0001,
"loss": 1.7839,
"step": 63950
},
{
"epoch": 0.8803422330430956,
"grad_norm": 0.17469707131385803,
"learning_rate": 0.0001,
"loss": 1.7797,
"step": 64000
},
{
"epoch": 0.8810300004126604,
"grad_norm": 0.16639864444732666,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 64050
},
{
"epoch": 0.8817177677822253,
"grad_norm": 0.16656282544136047,
"learning_rate": 0.0001,
"loss": 1.7816,
"step": 64100
},
{
"epoch": 0.8824055351517902,
"grad_norm": 0.14526651799678802,
"learning_rate": 0.0001,
"loss": 1.7817,
"step": 64150
},
{
"epoch": 0.8830933025213552,
"grad_norm": 0.1783958077430725,
"learning_rate": 0.0001,
"loss": 1.7828,
"step": 64200
},
{
"epoch": 0.8837810698909201,
"grad_norm": 0.16352634131908417,
"learning_rate": 0.0001,
"loss": 1.7807,
"step": 64250
},
{
"epoch": 0.884468837260485,
"grad_norm": 0.16130295395851135,
"learning_rate": 0.0001,
"loss": 1.7803,
"step": 64300
},
{
"epoch": 0.88515660463005,
"grad_norm": 0.16286851465702057,
"learning_rate": 0.0001,
"loss": 1.7866,
"step": 64350
},
{
"epoch": 0.8858443719996149,
"grad_norm": 0.16668406128883362,
"learning_rate": 0.0001,
"loss": 1.7805,
"step": 64400
},
{
"epoch": 0.8865321393691797,
"grad_norm": 0.16575850546360016,
"learning_rate": 0.0001,
"loss": 1.7803,
"step": 64450
},
{
"epoch": 0.8872199067387447,
"grad_norm": 0.16535095870494843,
"learning_rate": 0.0001,
"loss": 1.7795,
"step": 64500
},
{
"epoch": 0.8879076741083096,
"grad_norm": 0.14137853682041168,
"learning_rate": 0.0001,
"loss": 1.7854,
"step": 64550
},
{
"epoch": 0.8885954414778745,
"grad_norm": 0.14880156517028809,
"learning_rate": 0.0001,
"loss": 1.7862,
"step": 64600
},
{
"epoch": 0.8892832088474394,
"grad_norm": 0.17448197305202484,
"learning_rate": 0.0001,
"loss": 1.7847,
"step": 64650
},
{
"epoch": 0.8899709762170044,
"grad_norm": 0.1944260448217392,
"learning_rate": 0.0001,
"loss": 1.786,
"step": 64700
},
{
"epoch": 0.8906587435865693,
"grad_norm": 0.1693488508462906,
"learning_rate": 0.0001,
"loss": 1.7857,
"step": 64750
},
{
"epoch": 0.8913465109561342,
"grad_norm": 0.16250942647457123,
"learning_rate": 0.0001,
"loss": 1.7835,
"step": 64800
},
{
"epoch": 0.8920342783256991,
"grad_norm": 0.1573057919740677,
"learning_rate": 0.0001,
"loss": 1.782,
"step": 64850
},
{
"epoch": 0.892722045695264,
"grad_norm": 0.19034920632839203,
"learning_rate": 0.0001,
"loss": 1.782,
"step": 64900
},
{
"epoch": 0.8934098130648289,
"grad_norm": 0.13963682949543,
"learning_rate": 0.0001,
"loss": 1.7887,
"step": 64950
},
{
"epoch": 0.8940975804343939,
"grad_norm": 0.25064077973365784,
"learning_rate": 0.0001,
"loss": 1.7873,
"step": 65000
},
{
"epoch": 0.8947853478039588,
"grad_norm": 0.17574715614318848,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 65050
},
{
"epoch": 0.8954731151735237,
"grad_norm": 0.156754732131958,
"learning_rate": 0.0001,
"loss": 1.7807,
"step": 65100
},
{
"epoch": 0.8961608825430887,
"grad_norm": 0.17132636904716492,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 65150
},
{
"epoch": 0.8968486499126536,
"grad_norm": 0.15248049795627594,
"learning_rate": 0.0001,
"loss": 1.781,
"step": 65200
},
{
"epoch": 0.8975364172822184,
"grad_norm": 0.1603154093027115,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 65250
},
{
"epoch": 0.8982241846517833,
"grad_norm": 0.14862816035747528,
"learning_rate": 0.0001,
"loss": 1.7823,
"step": 65300
},
{
"epoch": 0.8989119520213483,
"grad_norm": 0.17050820589065552,
"learning_rate": 0.0001,
"loss": 1.7856,
"step": 65350
},
{
"epoch": 0.8995997193909132,
"grad_norm": 0.16287332773208618,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 65400
},
{
"epoch": 0.9002874867604781,
"grad_norm": 0.15486200153827667,
"learning_rate": 0.0001,
"loss": 1.7804,
"step": 65450
},
{
"epoch": 0.9009752541300431,
"grad_norm": 0.16483095288276672,
"learning_rate": 0.0001,
"loss": 1.7845,
"step": 65500
},
{
"epoch": 0.901663021499608,
"grad_norm": 0.15963926911354065,
"learning_rate": 0.0001,
"loss": 1.7865,
"step": 65550
},
{
"epoch": 0.9023507888691729,
"grad_norm": 0.14927932620048523,
"learning_rate": 0.0001,
"loss": 1.7814,
"step": 65600
},
{
"epoch": 0.9030385562387379,
"grad_norm": 0.15622937679290771,
"learning_rate": 0.0001,
"loss": 1.7841,
"step": 65650
},
{
"epoch": 0.9037263236083027,
"grad_norm": 0.14870509505271912,
"learning_rate": 0.0001,
"loss": 1.7865,
"step": 65700
},
{
"epoch": 0.9044140909778676,
"grad_norm": 0.16585543751716614,
"learning_rate": 0.0001,
"loss": 1.7803,
"step": 65750
},
{
"epoch": 0.9051018583474326,
"grad_norm": 0.16925722360610962,
"learning_rate": 0.0001,
"loss": 1.7905,
"step": 65800
},
{
"epoch": 0.9057896257169975,
"grad_norm": 0.16086918115615845,
"learning_rate": 0.0001,
"loss": 1.7818,
"step": 65850
},
{
"epoch": 0.9064773930865624,
"grad_norm": 0.17064189910888672,
"learning_rate": 0.0001,
"loss": 1.7829,
"step": 65900
},
{
"epoch": 0.9071651604561273,
"grad_norm": 0.1507936716079712,
"learning_rate": 0.0001,
"loss": 1.7826,
"step": 65950
},
{
"epoch": 0.9078529278256923,
"grad_norm": 0.16139142215251923,
"learning_rate": 0.0001,
"loss": 1.7832,
"step": 66000
},
{
"epoch": 0.9085406951952572,
"grad_norm": 0.14373824000358582,
"learning_rate": 0.0001,
"loss": 1.7834,
"step": 66050
},
{
"epoch": 0.909228462564822,
"grad_norm": 0.14268267154693604,
"learning_rate": 0.0001,
"loss": 1.7832,
"step": 66100
},
{
"epoch": 0.909916229934387,
"grad_norm": 0.14548690617084503,
"learning_rate": 0.0001,
"loss": 1.7827,
"step": 66150
},
{
"epoch": 0.9106039973039519,
"grad_norm": 0.1726326048374176,
"learning_rate": 0.0001,
"loss": 1.7799,
"step": 66200
},
{
"epoch": 0.9112917646735168,
"grad_norm": 0.1607373058795929,
"learning_rate": 0.0001,
"loss": 1.7809,
"step": 66250
},
{
"epoch": 0.9119795320430818,
"grad_norm": 0.14730975031852722,
"learning_rate": 0.0001,
"loss": 1.7791,
"step": 66300
},
{
"epoch": 0.9126672994126467,
"grad_norm": 0.1616540104150772,
"learning_rate": 0.0001,
"loss": 1.7791,
"step": 66350
},
{
"epoch": 0.9133550667822116,
"grad_norm": 0.16029463708400726,
"learning_rate": 0.0001,
"loss": 1.7828,
"step": 66400
},
{
"epoch": 0.9140428341517765,
"grad_norm": 0.15002845227718353,
"learning_rate": 0.0001,
"loss": 1.7812,
"step": 66450
},
{
"epoch": 0.9147306015213414,
"grad_norm": 0.14482907950878143,
"learning_rate": 0.0001,
"loss": 1.7802,
"step": 66500
},
{
"epoch": 0.9154183688909063,
"grad_norm": 0.17749476432800293,
"learning_rate": 0.0001,
"loss": 1.781,
"step": 66550
},
{
"epoch": 0.9161061362604712,
"grad_norm": 0.15776415169239044,
"learning_rate": 0.0001,
"loss": 1.7816,
"step": 66600
},
{
"epoch": 0.9167939036300362,
"grad_norm": 0.149980366230011,
"learning_rate": 0.0001,
"loss": 1.7756,
"step": 66650
},
{
"epoch": 0.9174816709996011,
"grad_norm": 0.16899780929088593,
"learning_rate": 0.0001,
"loss": 1.7814,
"step": 66700
},
{
"epoch": 0.918169438369166,
"grad_norm": 0.17424631118774414,
"learning_rate": 0.0001,
"loss": 1.7781,
"step": 66750
},
{
"epoch": 0.918857205738731,
"grad_norm": 0.1580991894006729,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 66800
},
{
"epoch": 0.9195449731082959,
"grad_norm": 0.16126061975955963,
"learning_rate": 0.0001,
"loss": 1.782,
"step": 66850
},
{
"epoch": 0.9202327404778607,
"grad_norm": 0.15646252036094666,
"learning_rate": 0.0001,
"loss": 1.7828,
"step": 66900
},
{
"epoch": 0.9209205078474257,
"grad_norm": 0.17129796743392944,
"learning_rate": 0.0001,
"loss": 1.7844,
"step": 66950
},
{
"epoch": 0.9216082752169906,
"grad_norm": 0.1756673902273178,
"learning_rate": 0.0001,
"loss": 1.7839,
"step": 67000
},
{
"epoch": 0.9222960425865555,
"grad_norm": 0.15259510278701782,
"learning_rate": 0.0001,
"loss": 1.7795,
"step": 67050
},
{
"epoch": 0.9229838099561204,
"grad_norm": 0.1639316827058792,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 67100
},
{
"epoch": 0.9236715773256854,
"grad_norm": 0.17190176248550415,
"learning_rate": 0.0001,
"loss": 1.78,
"step": 67150
},
{
"epoch": 0.9243593446952503,
"grad_norm": 0.16864174604415894,
"learning_rate": 0.0001,
"loss": 1.7852,
"step": 67200
},
{
"epoch": 0.9250471120648152,
"grad_norm": 0.15548075735569,
"learning_rate": 0.0001,
"loss": 1.7828,
"step": 67250
},
{
"epoch": 0.9257348794343802,
"grad_norm": 0.16301994025707245,
"learning_rate": 0.0001,
"loss": 1.7846,
"step": 67300
},
{
"epoch": 0.926422646803945,
"grad_norm": 0.1735038459300995,
"learning_rate": 0.0001,
"loss": 1.7798,
"step": 67350
},
{
"epoch": 0.9271104141735099,
"grad_norm": 0.1380920112133026,
"learning_rate": 0.0001,
"loss": 1.7806,
"step": 67400
},
{
"epoch": 0.9277981815430749,
"grad_norm": 0.15920446813106537,
"learning_rate": 0.0001,
"loss": 1.7792,
"step": 67450
},
{
"epoch": 0.9284859489126398,
"grad_norm": 0.17028312385082245,
"learning_rate": 0.0001,
"loss": 1.7888,
"step": 67500
},
{
"epoch": 0.9291737162822047,
"grad_norm": 0.1769266575574875,
"learning_rate": 0.0001,
"loss": 1.7814,
"step": 67550
},
{
"epoch": 0.9298614836517696,
"grad_norm": 0.1450556516647339,
"learning_rate": 0.0001,
"loss": 1.7817,
"step": 67600
},
{
"epoch": 0.9305492510213346,
"grad_norm": 0.16302357614040375,
"learning_rate": 0.0001,
"loss": 1.7813,
"step": 67650
},
{
"epoch": 0.9312370183908995,
"grad_norm": 0.1574389934539795,
"learning_rate": 0.0001,
"loss": 1.7776,
"step": 67700
},
{
"epoch": 0.9319247857604643,
"grad_norm": 0.14627063274383545,
"learning_rate": 0.0001,
"loss": 1.7826,
"step": 67750
},
{
"epoch": 0.9326125531300293,
"grad_norm": 0.18861928582191467,
"learning_rate": 0.0001,
"loss": 1.781,
"step": 67800
},
{
"epoch": 0.9333003204995942,
"grad_norm": 0.1549026519060135,
"learning_rate": 0.0001,
"loss": 1.7787,
"step": 67850
},
{
"epoch": 0.9339880878691591,
"grad_norm": 0.1620372235774994,
"learning_rate": 0.0001,
"loss": 1.7826,
"step": 67900
},
{
"epoch": 0.9346758552387241,
"grad_norm": 0.15894797444343567,
"learning_rate": 0.0001,
"loss": 1.7818,
"step": 67950
},
{
"epoch": 0.935363622608289,
"grad_norm": 0.19588086009025574,
"learning_rate": 0.0001,
"loss": 1.7835,
"step": 68000
},
{
"epoch": 0.9360513899778539,
"grad_norm": 0.1861431747674942,
"learning_rate": 0.0001,
"loss": 1.7815,
"step": 68050
},
{
"epoch": 0.9367391573474189,
"grad_norm": 0.16720125079154968,
"learning_rate": 0.0001,
"loss": 1.781,
"step": 68100
},
{
"epoch": 0.9374269247169837,
"grad_norm": 0.1603463739156723,
"learning_rate": 0.0001,
"loss": 1.7788,
"step": 68150
},
{
"epoch": 0.9381146920865486,
"grad_norm": 0.14092972874641418,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 68200
},
{
"epoch": 0.9388024594561135,
"grad_norm": 0.1622365266084671,
"learning_rate": 0.0001,
"loss": 1.7779,
"step": 68250
},
{
"epoch": 0.9394902268256785,
"grad_norm": 0.16566450893878937,
"learning_rate": 0.0001,
"loss": 1.7789,
"step": 68300
},
{
"epoch": 0.9401779941952434,
"grad_norm": 0.14181503653526306,
"learning_rate": 0.0001,
"loss": 1.7773,
"step": 68350
},
{
"epoch": 0.9408657615648083,
"grad_norm": 0.16675251722335815,
"learning_rate": 0.0001,
"loss": 1.7796,
"step": 68400
},
{
"epoch": 0.9415535289343733,
"grad_norm": 0.15481418371200562,
"learning_rate": 0.0001,
"loss": 1.7797,
"step": 68450
},
{
"epoch": 0.9422412963039382,
"grad_norm": 0.16480682790279388,
"learning_rate": 0.0001,
"loss": 1.7767,
"step": 68500
},
{
"epoch": 0.942929063673503,
"grad_norm": 0.13726095855236053,
"learning_rate": 0.0001,
"loss": 1.7799,
"step": 68550
},
{
"epoch": 0.943616831043068,
"grad_norm": 0.1498117446899414,
"learning_rate": 0.0001,
"loss": 1.7826,
"step": 68600
},
{
"epoch": 0.9443045984126329,
"grad_norm": 0.15102407336235046,
"learning_rate": 0.0001,
"loss": 1.7807,
"step": 68650
},
{
"epoch": 0.9449923657821978,
"grad_norm": 0.1596510410308838,
"learning_rate": 0.0001,
"loss": 1.7773,
"step": 68700
},
{
"epoch": 0.9456801331517628,
"grad_norm": 0.15061867237091064,
"learning_rate": 0.0001,
"loss": 1.7781,
"step": 68750
},
{
"epoch": 0.9463679005213277,
"grad_norm": 0.18302445113658905,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 68800
},
{
"epoch": 0.9470556678908926,
"grad_norm": 0.1563147008419037,
"learning_rate": 0.0001,
"loss": 1.7807,
"step": 68850
},
{
"epoch": 0.9477434352604575,
"grad_norm": 0.1559109389781952,
"learning_rate": 0.0001,
"loss": 1.779,
"step": 68900
},
{
"epoch": 0.9484312026300225,
"grad_norm": 0.1892656683921814,
"learning_rate": 0.0001,
"loss": 1.7815,
"step": 68950
},
{
"epoch": 0.9491189699995873,
"grad_norm": 0.16753901541233063,
"learning_rate": 0.0001,
"loss": 1.779,
"step": 69000
},
{
"epoch": 0.9498067373691522,
"grad_norm": 0.16571739315986633,
"learning_rate": 0.0001,
"loss": 1.781,
"step": 69050
},
{
"epoch": 0.9504945047387172,
"grad_norm": 0.15618735551834106,
"learning_rate": 0.0001,
"loss": 1.7801,
"step": 69100
},
{
"epoch": 0.9511822721082821,
"grad_norm": 0.15602505207061768,
"learning_rate": 0.0001,
"loss": 1.7782,
"step": 69150
},
{
"epoch": 0.951870039477847,
"grad_norm": 0.1441372036933899,
"learning_rate": 0.0001,
"loss": 1.7808,
"step": 69200
},
{
"epoch": 0.952557806847412,
"grad_norm": 0.16956308484077454,
"learning_rate": 0.0001,
"loss": 1.7805,
"step": 69250
},
{
"epoch": 0.9532455742169769,
"grad_norm": 0.1570560336112976,
"learning_rate": 0.0001,
"loss": 1.7829,
"step": 69300
},
{
"epoch": 0.9539333415865417,
"grad_norm": 0.13851186633110046,
"learning_rate": 0.0001,
"loss": 1.779,
"step": 69350
},
{
"epoch": 0.9546211089561066,
"grad_norm": 0.18309037387371063,
"learning_rate": 0.0001,
"loss": 1.7772,
"step": 69400
},
{
"epoch": 0.9553088763256716,
"grad_norm": 1.6850249767303467,
"learning_rate": 0.0001,
"loss": 1.7781,
"step": 69450
},
{
"epoch": 0.9559966436952365,
"grad_norm": 0.1578509509563446,
"learning_rate": 0.0001,
"loss": 1.7843,
"step": 69500
},
{
"epoch": 0.9566844110648014,
"grad_norm": 0.15330944955348969,
"learning_rate": 0.0001,
"loss": 1.7785,
"step": 69550
},
{
"epoch": 0.9573721784343664,
"grad_norm": 0.15504170954227448,
"learning_rate": 0.0001,
"loss": 1.7851,
"step": 69600
},
{
"epoch": 0.9580599458039313,
"grad_norm": 0.17802022397518158,
"learning_rate": 0.0001,
"loss": 1.7794,
"step": 69650
},
{
"epoch": 0.9587477131734962,
"grad_norm": 0.18508057296276093,
"learning_rate": 0.0001,
"loss": 1.7827,
"step": 69700
},
{
"epoch": 0.9594354805430612,
"grad_norm": 0.19704073667526245,
"learning_rate": 0.0001,
"loss": 1.7809,
"step": 69750
},
{
"epoch": 0.960123247912626,
"grad_norm": 0.17070503532886505,
"learning_rate": 0.0001,
"loss": 1.7791,
"step": 69800
},
{
"epoch": 0.9608110152821909,
"grad_norm": 0.1832980215549469,
"learning_rate": 0.0001,
"loss": 1.7798,
"step": 69850
},
{
"epoch": 0.9614987826517559,
"grad_norm": 0.15290822088718414,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 69900
},
{
"epoch": 0.9621865500213208,
"grad_norm": 0.1691426783800125,
"learning_rate": 0.0001,
"loss": 1.7792,
"step": 69950
},
{
"epoch": 0.9628743173908857,
"grad_norm": 0.1656666249036789,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 70000
},
{
"epoch": 0.9635620847604506,
"grad_norm": 0.15653489530086517,
"learning_rate": 0.0001,
"loss": 1.7811,
"step": 70050
},
{
"epoch": 0.9642498521300156,
"grad_norm": 0.15945695340633392,
"learning_rate": 0.0001,
"loss": 1.7789,
"step": 70100
},
{
"epoch": 0.9649376194995805,
"grad_norm": 0.173899307847023,
"learning_rate": 0.0001,
"loss": 1.782,
"step": 70150
},
{
"epoch": 0.9656253868691453,
"grad_norm": 0.13982714712619781,
"learning_rate": 0.0001,
"loss": 1.7796,
"step": 70200
},
{
"epoch": 0.9663131542387103,
"grad_norm": 0.16570891439914703,
"learning_rate": 0.0001,
"loss": 1.7814,
"step": 70250
},
{
"epoch": 0.9670009216082752,
"grad_norm": 0.1680910885334015,
"learning_rate": 0.0001,
"loss": 1.7797,
"step": 70300
},
{
"epoch": 0.9676886889778401,
"grad_norm": 0.18602094054222107,
"learning_rate": 0.0001,
"loss": 1.7799,
"step": 70350
},
{
"epoch": 0.9683764563474051,
"grad_norm": 0.15171028673648834,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 70400
},
{
"epoch": 0.96906422371697,
"grad_norm": 0.17273007333278656,
"learning_rate": 0.0001,
"loss": 1.779,
"step": 70450
},
{
"epoch": 0.9697519910865349,
"grad_norm": 0.1841355711221695,
"learning_rate": 0.0001,
"loss": 1.7849,
"step": 70500
},
{
"epoch": 0.9704397584560998,
"grad_norm": 0.14629191160202026,
"learning_rate": 0.0001,
"loss": 1.7822,
"step": 70550
},
{
"epoch": 0.9711275258256648,
"grad_norm": 0.19547376036643982,
"learning_rate": 0.0001,
"loss": 1.7805,
"step": 70600
},
{
"epoch": 0.9718152931952296,
"grad_norm": 0.1695117950439453,
"learning_rate": 0.0001,
"loss": 1.7808,
"step": 70650
},
{
"epoch": 0.9725030605647945,
"grad_norm": 0.15734167397022247,
"learning_rate": 0.0001,
"loss": 1.7826,
"step": 70700
},
{
"epoch": 0.9731908279343595,
"grad_norm": 0.15534259378910065,
"learning_rate": 0.0001,
"loss": 1.7784,
"step": 70750
},
{
"epoch": 0.9738785953039244,
"grad_norm": 0.17524221539497375,
"learning_rate": 0.0001,
"loss": 1.7802,
"step": 70800
},
{
"epoch": 0.9745663626734893,
"grad_norm": 0.16551004350185394,
"learning_rate": 0.0001,
"loss": 1.7774,
"step": 70850
},
{
"epoch": 0.9752541300430543,
"grad_norm": 0.18955057859420776,
"learning_rate": 0.0001,
"loss": 1.7771,
"step": 70900
},
{
"epoch": 0.9759418974126192,
"grad_norm": 0.1564190834760666,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 70950
},
{
"epoch": 0.976629664782184,
"grad_norm": 0.18080365657806396,
"learning_rate": 0.0001,
"loss": 1.7809,
"step": 71000
},
{
"epoch": 0.977317432151749,
"grad_norm": 0.17052794992923737,
"learning_rate": 0.0001,
"loss": 1.7785,
"step": 71050
},
{
"epoch": 0.9780051995213139,
"grad_norm": 0.15679985284805298,
"learning_rate": 0.0001,
"loss": 1.777,
"step": 71100
},
{
"epoch": 0.9786929668908788,
"grad_norm": 0.14611759781837463,
"learning_rate": 0.0001,
"loss": 1.7831,
"step": 71150
},
{
"epoch": 0.9793807342604437,
"grad_norm": 0.17994888126850128,
"learning_rate": 0.0001,
"loss": 1.7811,
"step": 71200
},
{
"epoch": 0.9800685016300087,
"grad_norm": 0.1523408442735672,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 71250
},
{
"epoch": 0.9807562689995736,
"grad_norm": 0.14828313887119293,
"learning_rate": 0.0001,
"loss": 1.7766,
"step": 71300
},
{
"epoch": 0.9814440363691385,
"grad_norm": 0.1424998790025711,
"learning_rate": 0.0001,
"loss": 1.7788,
"step": 71350
},
{
"epoch": 0.9821318037387035,
"grad_norm": 0.14312104880809784,
"learning_rate": 0.0001,
"loss": 1.7783,
"step": 71400
},
{
"epoch": 0.9828195711082683,
"grad_norm": 0.14697466790676117,
"learning_rate": 0.0001,
"loss": 1.7808,
"step": 71450
},
{
"epoch": 0.9835073384778332,
"grad_norm": 0.16363121569156647,
"learning_rate": 0.0001,
"loss": 1.7783,
"step": 71500
},
{
"epoch": 0.9841951058473982,
"grad_norm": 0.1542508453130722,
"learning_rate": 0.0001,
"loss": 1.7817,
"step": 71550
},
{
"epoch": 0.9848828732169631,
"grad_norm": 0.1389523297548294,
"learning_rate": 0.0001,
"loss": 1.7791,
"step": 71600
},
{
"epoch": 0.985570640586528,
"grad_norm": 0.15856057405471802,
"learning_rate": 0.0001,
"loss": 1.7833,
"step": 71650
},
{
"epoch": 0.986258407956093,
"grad_norm": 0.15098857879638672,
"learning_rate": 0.0001,
"loss": 1.7764,
"step": 71700
},
{
"epoch": 0.9869461753256579,
"grad_norm": 0.14318101108074188,
"learning_rate": 0.0001,
"loss": 1.7782,
"step": 71750
},
{
"epoch": 0.9876339426952228,
"grad_norm": 0.16459529101848602,
"learning_rate": 0.0001,
"loss": 1.7774,
"step": 71800
},
{
"epoch": 0.9883217100647876,
"grad_norm": 0.14705689251422882,
"learning_rate": 0.0001,
"loss": 1.7813,
"step": 71850
},
{
"epoch": 0.9890094774343526,
"grad_norm": 0.2091091424226761,
"learning_rate": 0.0001,
"loss": 1.7819,
"step": 71900
},
{
"epoch": 0.9896972448039175,
"grad_norm": 0.1711418330669403,
"learning_rate": 0.0001,
"loss": 1.7782,
"step": 71950
},
{
"epoch": 0.9903850121734824,
"grad_norm": 0.15255683660507202,
"learning_rate": 0.0001,
"loss": 1.7851,
"step": 72000
},
{
"epoch": 0.9910727795430474,
"grad_norm": 0.17501915991306305,
"learning_rate": 0.0001,
"loss": 1.7824,
"step": 72050
},
{
"epoch": 0.9917605469126123,
"grad_norm": 0.1605847328901291,
"learning_rate": 0.0001,
"loss": 1.7802,
"step": 72100
},
{
"epoch": 0.9924483142821772,
"grad_norm": 0.14898759126663208,
"learning_rate": 0.0001,
"loss": 1.7836,
"step": 72150
},
{
"epoch": 0.9931360816517422,
"grad_norm": 0.15966999530792236,
"learning_rate": 0.0001,
"loss": 1.7773,
"step": 72200
},
{
"epoch": 0.993823849021307,
"grad_norm": 0.14977654814720154,
"learning_rate": 0.0001,
"loss": 1.7764,
"step": 72250
},
{
"epoch": 0.9945116163908719,
"grad_norm": 0.16077259182929993,
"learning_rate": 0.0001,
"loss": 1.7789,
"step": 72300
},
{
"epoch": 0.9951993837604368,
"grad_norm": 0.1603011190891266,
"learning_rate": 0.0001,
"loss": 1.7756,
"step": 72350
},
{
"epoch": 0.9958871511300018,
"grad_norm": 0.17926956713199615,
"learning_rate": 0.0001,
"loss": 1.7805,
"step": 72400
},
{
"epoch": 0.9965749184995667,
"grad_norm": 0.15523836016654968,
"learning_rate": 0.0001,
"loss": 1.7816,
"step": 72450
},
{
"epoch": 0.9972626858691316,
"grad_norm": 0.15533694624900818,
"learning_rate": 0.0001,
"loss": 1.7817,
"step": 72500
},
{
"epoch": 0.9979504532386966,
"grad_norm": 0.17167145013809204,
"learning_rate": 0.0001,
"loss": 1.7793,
"step": 72550
},
{
"epoch": 0.9986382206082615,
"grad_norm": 0.1536383181810379,
"learning_rate": 0.0001,
"loss": 1.7792,
"step": 72600
},
{
"epoch": 0.9993259879778263,
"grad_norm": 0.15611621737480164,
"learning_rate": 0.0001,
"loss": 1.7798,
"step": 72650
}
],
"logging_steps": 50,
"max_steps": 72699,
"num_input_tokens_seen": 0,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 1.7804002887190855e+21,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}