{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500.0, "global_step": 72699, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0006877673695649183, "grad_norm": 0.19629216194152832, "learning_rate": 0.0001, "loss": 1.9938, "step": 50 }, { "epoch": 0.0013755347391298366, "grad_norm": 0.24478936195373535, "learning_rate": 0.0001, "loss": 1.9596, "step": 100 }, { "epoch": 0.002063302108694755, "grad_norm": 0.22170111536979675, "learning_rate": 0.0001, "loss": 1.9526, "step": 150 }, { "epoch": 0.0027510694782596733, "grad_norm": 0.2311343252658844, "learning_rate": 0.0001, "loss": 1.9471, "step": 200 }, { "epoch": 0.003438836847824592, "grad_norm": 0.20621128380298615, "learning_rate": 0.0001, "loss": 1.9435, "step": 250 }, { "epoch": 0.00412660421738951, "grad_norm": 0.22248196601867676, "learning_rate": 0.0001, "loss": 1.9396, "step": 300 }, { "epoch": 0.004814371586954428, "grad_norm": 0.20232965052127838, "learning_rate": 0.0001, "loss": 1.9362, "step": 350 }, { "epoch": 0.0055021389565193465, "grad_norm": 0.21155332028865814, "learning_rate": 0.0001, "loss": 1.9285, "step": 400 }, { "epoch": 0.006189906326084265, "grad_norm": 0.25176894664764404, "learning_rate": 0.0001, "loss": 1.9319, "step": 450 }, { "epoch": 0.006877673695649184, "grad_norm": 0.21027377247810364, "learning_rate": 0.0001, "loss": 1.9304, "step": 500 }, { "epoch": 0.007565441065214102, "grad_norm": 0.2434869110584259, "learning_rate": 0.0001, "loss": 1.93, "step": 550 }, { "epoch": 0.00825320843477902, "grad_norm": 0.1908300668001175, "learning_rate": 0.0001, "loss": 1.9254, "step": 600 }, { "epoch": 0.008940975804343939, "grad_norm": 0.2221110612154007, "learning_rate": 0.0001, "loss": 1.9226, "step": 650 }, { "epoch": 0.009628743173908856, "grad_norm": 0.22620266675949097, "learning_rate": 0.0001, "loss": 1.9262, "step": 700 }, { "epoch": 0.010316510543473776, "grad_norm": 0.21463032066822052, "learning_rate": 0.0001, "loss": 1.9201, "step": 750 }, { "epoch": 0.011004277913038693, "grad_norm": 0.19383488595485687, "learning_rate": 0.0001, "loss": 1.9236, "step": 800 }, { "epoch": 0.011692045282603612, "grad_norm": 0.22416023910045624, "learning_rate": 0.0001, "loss": 1.9186, "step": 850 }, { "epoch": 0.01237981265216853, "grad_norm": 0.2285342961549759, "learning_rate": 0.0001, "loss": 1.9207, "step": 900 }, { "epoch": 0.013067580021733449, "grad_norm": 0.20416004955768585, "learning_rate": 0.0001, "loss": 1.9171, "step": 950 }, { "epoch": 0.013755347391298368, "grad_norm": 0.20697274804115295, "learning_rate": 0.0001, "loss": 1.9177, "step": 1000 }, { "epoch": 0.014443114760863286, "grad_norm": 0.2317676991224289, "learning_rate": 0.0001, "loss": 1.9138, "step": 1050 }, { "epoch": 0.015130882130428205, "grad_norm": 0.21276156604290009, "learning_rate": 0.0001, "loss": 1.9111, "step": 1100 }, { "epoch": 0.015818649499993124, "grad_norm": 0.20574018359184265, "learning_rate": 0.0001, "loss": 1.9155, "step": 1150 }, { "epoch": 0.01650641686955804, "grad_norm": 0.19410207867622375, "learning_rate": 0.0001, "loss": 1.9073, "step": 1200 }, { "epoch": 0.01719418423912296, "grad_norm": 0.19570203125476837, "learning_rate": 0.0001, "loss": 1.9085, "step": 1250 }, { "epoch": 0.017881951608687878, "grad_norm": 0.2081640362739563, "learning_rate": 0.0001, "loss": 1.9093, "step": 1300 }, { "epoch": 0.018569718978252797, "grad_norm": 0.19721642136573792, "learning_rate": 0.0001, "loss": 1.9086, "step": 1350 }, { "epoch": 0.019257486347817713, "grad_norm": 0.202309712767601, "learning_rate": 0.0001, "loss": 1.9039, "step": 1400 }, { "epoch": 0.019945253717382632, "grad_norm": 0.22128838300704956, "learning_rate": 0.0001, "loss": 1.9067, "step": 1450 }, { "epoch": 0.02063302108694755, "grad_norm": 0.25011196732521057, "learning_rate": 0.0001, "loss": 1.9055, "step": 1500 }, { "epoch": 0.02132078845651247, "grad_norm": 0.20523639023303986, "learning_rate": 0.0001, "loss": 1.9039, "step": 1550 }, { "epoch": 0.022008555826077386, "grad_norm": 0.2327890396118164, "learning_rate": 0.0001, "loss": 1.9059, "step": 1600 }, { "epoch": 0.022696323195642305, "grad_norm": 0.22426384687423706, "learning_rate": 0.0001, "loss": 1.9033, "step": 1650 }, { "epoch": 0.023384090565207225, "grad_norm": 0.2116124927997589, "learning_rate": 0.0001, "loss": 1.902, "step": 1700 }, { "epoch": 0.024071857934772144, "grad_norm": 0.21172966063022614, "learning_rate": 0.0001, "loss": 1.9007, "step": 1750 }, { "epoch": 0.02475962530433706, "grad_norm": 0.19443170726299286, "learning_rate": 0.0001, "loss": 1.9003, "step": 1800 }, { "epoch": 0.02544739267390198, "grad_norm": 0.21195723116397858, "learning_rate": 0.0001, "loss": 1.9015, "step": 1850 }, { "epoch": 0.026135160043466898, "grad_norm": 0.22141411900520325, "learning_rate": 0.0001, "loss": 1.8957, "step": 1900 }, { "epoch": 0.026822927413031817, "grad_norm": 0.22995401918888092, "learning_rate": 0.0001, "loss": 1.8979, "step": 1950 }, { "epoch": 0.027510694782596736, "grad_norm": 0.2246379405260086, "learning_rate": 0.0001, "loss": 1.8966, "step": 2000 }, { "epoch": 0.028198462152161652, "grad_norm": 0.22695621848106384, "learning_rate": 0.0001, "loss": 1.895, "step": 2050 }, { "epoch": 0.02888622952172657, "grad_norm": 0.19988253712654114, "learning_rate": 0.0001, "loss": 1.8934, "step": 2100 }, { "epoch": 0.02957399689129149, "grad_norm": 0.21754223108291626, "learning_rate": 0.0001, "loss": 1.8972, "step": 2150 }, { "epoch": 0.03026176426085641, "grad_norm": 0.19053423404693604, "learning_rate": 0.0001, "loss": 1.8912, "step": 2200 }, { "epoch": 0.030949531630421325, "grad_norm": 0.21589875221252441, "learning_rate": 0.0001, "loss": 1.8935, "step": 2250 }, { "epoch": 0.03163729899998625, "grad_norm": 0.2087436020374298, "learning_rate": 0.0001, "loss": 1.8923, "step": 2300 }, { "epoch": 0.03232506636955116, "grad_norm": 0.2261374592781067, "learning_rate": 0.0001, "loss": 1.8914, "step": 2350 }, { "epoch": 0.03301283373911608, "grad_norm": 0.1949523240327835, "learning_rate": 0.0001, "loss": 1.8905, "step": 2400 }, { "epoch": 0.033700601108681, "grad_norm": 0.21544858813285828, "learning_rate": 0.0001, "loss": 1.8909, "step": 2450 }, { "epoch": 0.03438836847824592, "grad_norm": 0.20145681500434875, "learning_rate": 0.0001, "loss": 1.8876, "step": 2500 }, { "epoch": 0.03507613584781084, "grad_norm": 0.21707232296466827, "learning_rate": 0.0001, "loss": 1.8915, "step": 2550 }, { "epoch": 0.035763903217375756, "grad_norm": 0.1982990950345993, "learning_rate": 0.0001, "loss": 1.888, "step": 2600 }, { "epoch": 0.036451670586940675, "grad_norm": 0.2223712056875229, "learning_rate": 0.0001, "loss": 1.8868, "step": 2650 }, { "epoch": 0.037139437956505594, "grad_norm": 0.19649413228034973, "learning_rate": 0.0001, "loss": 1.8869, "step": 2700 }, { "epoch": 0.03782720532607051, "grad_norm": 0.22767962515354156, "learning_rate": 0.0001, "loss": 1.8901, "step": 2750 }, { "epoch": 0.038514972695635426, "grad_norm": 0.19138416647911072, "learning_rate": 0.0001, "loss": 1.8916, "step": 2800 }, { "epoch": 0.039202740065200345, "grad_norm": 0.19380460679531097, "learning_rate": 0.0001, "loss": 1.8889, "step": 2850 }, { "epoch": 0.039890507434765264, "grad_norm": 0.19751518964767456, "learning_rate": 0.0001, "loss": 1.8868, "step": 2900 }, { "epoch": 0.04057827480433018, "grad_norm": 0.21071408689022064, "learning_rate": 0.0001, "loss": 1.8862, "step": 2950 }, { "epoch": 0.0412660421738951, "grad_norm": 0.19260670244693756, "learning_rate": 0.0001, "loss": 1.8827, "step": 3000 }, { "epoch": 0.04195380954346002, "grad_norm": 0.19185714423656464, "learning_rate": 0.0001, "loss": 1.8866, "step": 3050 }, { "epoch": 0.04264157691302494, "grad_norm": 0.24877017736434937, "learning_rate": 0.0001, "loss": 1.8854, "step": 3100 }, { "epoch": 0.04332934428258986, "grad_norm": 0.1947249323129654, "learning_rate": 0.0001, "loss": 1.8842, "step": 3150 }, { "epoch": 0.04401711165215477, "grad_norm": 0.20210722088813782, "learning_rate": 0.0001, "loss": 1.8837, "step": 3200 }, { "epoch": 0.04470487902171969, "grad_norm": 0.22242394089698792, "learning_rate": 0.0001, "loss": 1.8817, "step": 3250 }, { "epoch": 0.04539264639128461, "grad_norm": 0.2049330472946167, "learning_rate": 0.0001, "loss": 1.8845, "step": 3300 }, { "epoch": 0.04608041376084953, "grad_norm": 0.19368599355220795, "learning_rate": 0.0001, "loss": 1.884, "step": 3350 }, { "epoch": 0.04676818113041445, "grad_norm": 0.1886671483516693, "learning_rate": 0.0001, "loss": 1.883, "step": 3400 }, { "epoch": 0.04745594849997937, "grad_norm": 0.19359445571899414, "learning_rate": 0.0001, "loss": 1.8824, "step": 3450 }, { "epoch": 0.04814371586954429, "grad_norm": 0.195325568318367, "learning_rate": 0.0001, "loss": 1.8806, "step": 3500 }, { "epoch": 0.04883148323910921, "grad_norm": 0.21584388613700867, "learning_rate": 0.0001, "loss": 1.879, "step": 3550 }, { "epoch": 0.04951925060867412, "grad_norm": 0.19085532426834106, "learning_rate": 0.0001, "loss": 1.8817, "step": 3600 }, { "epoch": 0.05020701797823904, "grad_norm": 0.2133578211069107, "learning_rate": 0.0001, "loss": 1.8797, "step": 3650 }, { "epoch": 0.05089478534780396, "grad_norm": 0.19587628543376923, "learning_rate": 0.0001, "loss": 1.8806, "step": 3700 }, { "epoch": 0.051582552717368876, "grad_norm": 0.22608409821987152, "learning_rate": 0.0001, "loss": 1.8803, "step": 3750 }, { "epoch": 0.052270320086933796, "grad_norm": 0.20075012743473053, "learning_rate": 0.0001, "loss": 1.8773, "step": 3800 }, { "epoch": 0.052958087456498715, "grad_norm": 0.2007540464401245, "learning_rate": 0.0001, "loss": 1.8775, "step": 3850 }, { "epoch": 0.053645854826063634, "grad_norm": 0.20465299487113953, "learning_rate": 0.0001, "loss": 1.88, "step": 3900 }, { "epoch": 0.05433362219562855, "grad_norm": 0.19921573996543884, "learning_rate": 0.0001, "loss": 1.8749, "step": 3950 }, { "epoch": 0.05502138956519347, "grad_norm": 0.19196507334709167, "learning_rate": 0.0001, "loss": 1.8808, "step": 4000 }, { "epoch": 0.055709156934758385, "grad_norm": 0.20529140532016754, "learning_rate": 0.0001, "loss": 1.8787, "step": 4050 }, { "epoch": 0.056396924304323304, "grad_norm": 0.23082584142684937, "learning_rate": 0.0001, "loss": 1.8752, "step": 4100 }, { "epoch": 0.05708469167388822, "grad_norm": 0.18597312271595, "learning_rate": 0.0001, "loss": 1.8793, "step": 4150 }, { "epoch": 0.05777245904345314, "grad_norm": 0.23071937263011932, "learning_rate": 0.0001, "loss": 1.8782, "step": 4200 }, { "epoch": 0.05846022641301806, "grad_norm": 0.19141189754009247, "learning_rate": 0.0001, "loss": 1.875, "step": 4250 }, { "epoch": 0.05914799378258298, "grad_norm": 0.23278222978115082, "learning_rate": 0.0001, "loss": 1.8805, "step": 4300 }, { "epoch": 0.0598357611521479, "grad_norm": 0.21169067919254303, "learning_rate": 0.0001, "loss": 1.8753, "step": 4350 }, { "epoch": 0.06052352852171282, "grad_norm": 0.2010953575372696, "learning_rate": 0.0001, "loss": 1.8758, "step": 4400 }, { "epoch": 0.06121129589127773, "grad_norm": 0.19260814785957336, "learning_rate": 0.0001, "loss": 1.8731, "step": 4450 }, { "epoch": 0.06189906326084265, "grad_norm": 0.19751103222370148, "learning_rate": 0.0001, "loss": 1.8719, "step": 4500 }, { "epoch": 0.06258683063040757, "grad_norm": 0.21297581493854523, "learning_rate": 0.0001, "loss": 1.875, "step": 4550 }, { "epoch": 0.0632745979999725, "grad_norm": 0.2128158062696457, "learning_rate": 0.0001, "loss": 1.8711, "step": 4600 }, { "epoch": 0.06396236536953741, "grad_norm": 0.18719784915447235, "learning_rate": 0.0001, "loss": 1.8741, "step": 4650 }, { "epoch": 0.06465013273910232, "grad_norm": 0.2352721244096756, "learning_rate": 0.0001, "loss": 1.8717, "step": 4700 }, { "epoch": 0.06533790010866725, "grad_norm": 0.22228975594043732, "learning_rate": 0.0001, "loss": 1.8707, "step": 4750 }, { "epoch": 0.06602566747823216, "grad_norm": 0.18716222047805786, "learning_rate": 0.0001, "loss": 1.8705, "step": 4800 }, { "epoch": 0.06671343484779708, "grad_norm": 0.22167149186134338, "learning_rate": 0.0001, "loss": 1.8739, "step": 4850 }, { "epoch": 0.067401202217362, "grad_norm": 0.24794642627239227, "learning_rate": 0.0001, "loss": 1.8747, "step": 4900 }, { "epoch": 0.06808896958692692, "grad_norm": 0.18762528896331787, "learning_rate": 0.0001, "loss": 1.8702, "step": 4950 }, { "epoch": 0.06877673695649184, "grad_norm": 0.19063113629817963, "learning_rate": 0.0001, "loss": 1.8733, "step": 5000 }, { "epoch": 0.06946450432605676, "grad_norm": 0.1940603107213974, "learning_rate": 0.0001, "loss": 1.8685, "step": 5050 }, { "epoch": 0.07015227169562167, "grad_norm": 0.19752484560012817, "learning_rate": 0.0001, "loss": 1.8762, "step": 5100 }, { "epoch": 0.07084003906518659, "grad_norm": 0.23486199975013733, "learning_rate": 0.0001, "loss": 1.8708, "step": 5150 }, { "epoch": 0.07152780643475151, "grad_norm": 0.20315973460674286, "learning_rate": 0.0001, "loss": 1.8676, "step": 5200 }, { "epoch": 0.07221557380431642, "grad_norm": 0.1925646960735321, "learning_rate": 0.0001, "loss": 1.8634, "step": 5250 }, { "epoch": 0.07290334117388135, "grad_norm": 0.20540663599967957, "learning_rate": 0.0001, "loss": 1.8706, "step": 5300 }, { "epoch": 0.07359110854344626, "grad_norm": 0.23649099469184875, "learning_rate": 0.0001, "loss": 1.8685, "step": 5350 }, { "epoch": 0.07427887591301119, "grad_norm": 0.23272614181041718, "learning_rate": 0.0001, "loss": 1.8724, "step": 5400 }, { "epoch": 0.0749666432825761, "grad_norm": 0.1887608915567398, "learning_rate": 0.0001, "loss": 1.8707, "step": 5450 }, { "epoch": 0.07565441065214101, "grad_norm": 0.18964676558971405, "learning_rate": 0.0001, "loss": 1.8642, "step": 5500 }, { "epoch": 0.07634217802170594, "grad_norm": 0.20009934902191162, "learning_rate": 0.0001, "loss": 1.8657, "step": 5550 }, { "epoch": 0.07702994539127085, "grad_norm": 0.1821998506784439, "learning_rate": 0.0001, "loss": 1.8673, "step": 5600 }, { "epoch": 0.07771771276083578, "grad_norm": 0.18905235826969147, "learning_rate": 0.0001, "loss": 1.8687, "step": 5650 }, { "epoch": 0.07840548013040069, "grad_norm": 0.19986678659915924, "learning_rate": 0.0001, "loss": 1.8627, "step": 5700 }, { "epoch": 0.07909324749996562, "grad_norm": 0.1904374659061432, "learning_rate": 0.0001, "loss": 1.8633, "step": 5750 }, { "epoch": 0.07978101486953053, "grad_norm": 0.19536761939525604, "learning_rate": 0.0001, "loss": 1.8685, "step": 5800 }, { "epoch": 0.08046878223909545, "grad_norm": 0.18209826946258545, "learning_rate": 0.0001, "loss": 1.8599, "step": 5850 }, { "epoch": 0.08115654960866037, "grad_norm": 0.21385939419269562, "learning_rate": 0.0001, "loss": 1.866, "step": 5900 }, { "epoch": 0.08184431697822528, "grad_norm": 0.20338542759418488, "learning_rate": 0.0001, "loss": 1.8669, "step": 5950 }, { "epoch": 0.0825320843477902, "grad_norm": 0.19536232948303223, "learning_rate": 0.0001, "loss": 1.8644, "step": 6000 }, { "epoch": 0.08321985171735512, "grad_norm": 0.18480873107910156, "learning_rate": 0.0001, "loss": 1.8668, "step": 6050 }, { "epoch": 0.08390761908692004, "grad_norm": 0.18024863302707672, "learning_rate": 0.0001, "loss": 1.8638, "step": 6100 }, { "epoch": 0.08459538645648496, "grad_norm": 0.18774175643920898, "learning_rate": 0.0001, "loss": 1.8652, "step": 6150 }, { "epoch": 0.08528315382604988, "grad_norm": 0.2518685460090637, "learning_rate": 0.0001, "loss": 1.8649, "step": 6200 }, { "epoch": 0.0859709211956148, "grad_norm": 0.20646634697914124, "learning_rate": 0.0001, "loss": 1.8658, "step": 6250 }, { "epoch": 0.08665868856517972, "grad_norm": 0.19222316145896912, "learning_rate": 0.0001, "loss": 1.8642, "step": 6300 }, { "epoch": 0.08734645593474463, "grad_norm": 0.19531960785388947, "learning_rate": 0.0001, "loss": 1.8641, "step": 6350 }, { "epoch": 0.08803422330430954, "grad_norm": 0.18218673765659332, "learning_rate": 0.0001, "loss": 1.8599, "step": 6400 }, { "epoch": 0.08872199067387447, "grad_norm": 0.18686556816101074, "learning_rate": 0.0001, "loss": 1.8588, "step": 6450 }, { "epoch": 0.08940975804343938, "grad_norm": 0.20718005299568176, "learning_rate": 0.0001, "loss": 1.8595, "step": 6500 }, { "epoch": 0.09009752541300431, "grad_norm": 0.17680206894874573, "learning_rate": 0.0001, "loss": 1.8625, "step": 6550 }, { "epoch": 0.09078529278256922, "grad_norm": 0.25429028272628784, "learning_rate": 0.0001, "loss": 1.8635, "step": 6600 }, { "epoch": 0.09147306015213415, "grad_norm": 0.19778478145599365, "learning_rate": 0.0001, "loss": 1.8618, "step": 6650 }, { "epoch": 0.09216082752169906, "grad_norm": 0.21198226511478424, "learning_rate": 0.0001, "loss": 1.8613, "step": 6700 }, { "epoch": 0.09284859489126399, "grad_norm": 0.1819111704826355, "learning_rate": 0.0001, "loss": 1.8601, "step": 6750 }, { "epoch": 0.0935363622608289, "grad_norm": 0.2141820788383484, "learning_rate": 0.0001, "loss": 1.8598, "step": 6800 }, { "epoch": 0.09422412963039381, "grad_norm": 0.20356012880802155, "learning_rate": 0.0001, "loss": 1.8619, "step": 6850 }, { "epoch": 0.09491189699995874, "grad_norm": 0.18998335301876068, "learning_rate": 0.0001, "loss": 1.8597, "step": 6900 }, { "epoch": 0.09559966436952365, "grad_norm": 0.19086682796478271, "learning_rate": 0.0001, "loss": 1.8622, "step": 6950 }, { "epoch": 0.09628743173908857, "grad_norm": 0.2049364447593689, "learning_rate": 0.0001, "loss": 1.8617, "step": 7000 }, { "epoch": 0.09697519910865349, "grad_norm": 0.19833974540233612, "learning_rate": 0.0001, "loss": 1.8609, "step": 7050 }, { "epoch": 0.09766296647821841, "grad_norm": 0.19551745057106018, "learning_rate": 0.0001, "loss": 1.8581, "step": 7100 }, { "epoch": 0.09835073384778333, "grad_norm": 0.1846143752336502, "learning_rate": 0.0001, "loss": 1.8569, "step": 7150 }, { "epoch": 0.09903850121734824, "grad_norm": 0.1906626969575882, "learning_rate": 0.0001, "loss": 1.8614, "step": 7200 }, { "epoch": 0.09972626858691316, "grad_norm": 0.19115209579467773, "learning_rate": 0.0001, "loss": 1.8633, "step": 7250 }, { "epoch": 0.10041403595647808, "grad_norm": 0.18704906105995178, "learning_rate": 0.0001, "loss": 1.8601, "step": 7300 }, { "epoch": 0.101101803326043, "grad_norm": 0.18635210394859314, "learning_rate": 0.0001, "loss": 1.8605, "step": 7350 }, { "epoch": 0.10178957069560791, "grad_norm": 0.1947161853313446, "learning_rate": 0.0001, "loss": 1.861, "step": 7400 }, { "epoch": 0.10247733806517284, "grad_norm": 0.22087708115577698, "learning_rate": 0.0001, "loss": 1.8553, "step": 7450 }, { "epoch": 0.10316510543473775, "grad_norm": 0.1805039346218109, "learning_rate": 0.0001, "loss": 1.8591, "step": 7500 }, { "epoch": 0.10385287280430268, "grad_norm": 0.19084776937961578, "learning_rate": 0.0001, "loss": 1.8561, "step": 7550 }, { "epoch": 0.10454064017386759, "grad_norm": 0.20166590809822083, "learning_rate": 0.0001, "loss": 1.8584, "step": 7600 }, { "epoch": 0.1052284075434325, "grad_norm": 0.1892371028661728, "learning_rate": 0.0001, "loss": 1.8526, "step": 7650 }, { "epoch": 0.10591617491299743, "grad_norm": 0.22085241973400116, "learning_rate": 0.0001, "loss": 1.8561, "step": 7700 }, { "epoch": 0.10660394228256234, "grad_norm": 0.186112642288208, "learning_rate": 0.0001, "loss": 1.8597, "step": 7750 }, { "epoch": 0.10729170965212727, "grad_norm": 0.1959947943687439, "learning_rate": 0.0001, "loss": 1.8558, "step": 7800 }, { "epoch": 0.10797947702169218, "grad_norm": 0.21492016315460205, "learning_rate": 0.0001, "loss": 1.8608, "step": 7850 }, { "epoch": 0.1086672443912571, "grad_norm": 0.18600517511367798, "learning_rate": 0.0001, "loss": 1.8559, "step": 7900 }, { "epoch": 0.10935501176082202, "grad_norm": 0.18841132521629333, "learning_rate": 0.0001, "loss": 1.8542, "step": 7950 }, { "epoch": 0.11004277913038694, "grad_norm": 0.20758236944675446, "learning_rate": 0.0001, "loss": 1.8565, "step": 8000 }, { "epoch": 0.11073054649995186, "grad_norm": 0.20206254720687866, "learning_rate": 0.0001, "loss": 1.8553, "step": 8050 }, { "epoch": 0.11141831386951677, "grad_norm": 0.19620998203754425, "learning_rate": 0.0001, "loss": 1.8542, "step": 8100 }, { "epoch": 0.1121060812390817, "grad_norm": 0.19747626781463623, "learning_rate": 0.0001, "loss": 1.8545, "step": 8150 }, { "epoch": 0.11279384860864661, "grad_norm": 0.21328890323638916, "learning_rate": 0.0001, "loss": 1.8552, "step": 8200 }, { "epoch": 0.11348161597821153, "grad_norm": 0.18296054005622864, "learning_rate": 0.0001, "loss": 1.8579, "step": 8250 }, { "epoch": 0.11416938334777645, "grad_norm": 0.21098335087299347, "learning_rate": 0.0001, "loss": 1.8526, "step": 8300 }, { "epoch": 0.11485715071734137, "grad_norm": 0.18666841089725494, "learning_rate": 0.0001, "loss": 1.8484, "step": 8350 }, { "epoch": 0.11554491808690628, "grad_norm": 0.18522906303405762, "learning_rate": 0.0001, "loss": 1.8538, "step": 8400 }, { "epoch": 0.1162326854564712, "grad_norm": 0.1890312135219574, "learning_rate": 0.0001, "loss": 1.8519, "step": 8450 }, { "epoch": 0.11692045282603612, "grad_norm": 0.197422057390213, "learning_rate": 0.0001, "loss": 1.8513, "step": 8500 }, { "epoch": 0.11760822019560103, "grad_norm": 0.21355442702770233, "learning_rate": 0.0001, "loss": 1.8561, "step": 8550 }, { "epoch": 0.11829598756516596, "grad_norm": 0.18543662130832672, "learning_rate": 0.0001, "loss": 1.8538, "step": 8600 }, { "epoch": 0.11898375493473087, "grad_norm": 0.20849215984344482, "learning_rate": 0.0001, "loss": 1.8527, "step": 8650 }, { "epoch": 0.1196715223042958, "grad_norm": 0.2109488546848297, "learning_rate": 0.0001, "loss": 1.8496, "step": 8700 }, { "epoch": 0.12035928967386071, "grad_norm": 0.20195640623569489, "learning_rate": 0.0001, "loss": 1.8499, "step": 8750 }, { "epoch": 0.12104705704342564, "grad_norm": 0.1749362200498581, "learning_rate": 0.0001, "loss": 1.8559, "step": 8800 }, { "epoch": 0.12173482441299055, "grad_norm": 0.20881310105323792, "learning_rate": 0.0001, "loss": 1.8536, "step": 8850 }, { "epoch": 0.12242259178255546, "grad_norm": 0.1801750510931015, "learning_rate": 0.0001, "loss": 1.8507, "step": 8900 }, { "epoch": 0.12311035915212039, "grad_norm": 0.1898815929889679, "learning_rate": 0.0001, "loss": 1.8493, "step": 8950 }, { "epoch": 0.1237981265216853, "grad_norm": 0.19754734635353088, "learning_rate": 0.0001, "loss": 1.853, "step": 9000 }, { "epoch": 0.12448589389125023, "grad_norm": 0.1855219006538391, "learning_rate": 0.0001, "loss": 1.8529, "step": 9050 }, { "epoch": 0.12517366126081514, "grad_norm": 0.19341996312141418, "learning_rate": 0.0001, "loss": 1.8513, "step": 9100 }, { "epoch": 0.12586142863038005, "grad_norm": 0.19776052236557007, "learning_rate": 0.0001, "loss": 1.8507, "step": 9150 }, { "epoch": 0.126549195999945, "grad_norm": 0.185306116938591, "learning_rate": 0.0001, "loss": 1.851, "step": 9200 }, { "epoch": 0.1272369633695099, "grad_norm": 0.19926750659942627, "learning_rate": 0.0001, "loss": 1.8504, "step": 9250 }, { "epoch": 0.12792473073907482, "grad_norm": 0.21605028212070465, "learning_rate": 0.0001, "loss": 1.8502, "step": 9300 }, { "epoch": 0.12861249810863973, "grad_norm": 0.18174859881401062, "learning_rate": 0.0001, "loss": 1.8505, "step": 9350 }, { "epoch": 0.12930026547820464, "grad_norm": 0.19654984772205353, "learning_rate": 0.0001, "loss": 1.8517, "step": 9400 }, { "epoch": 0.12998803284776958, "grad_norm": 0.1764276772737503, "learning_rate": 0.0001, "loss": 1.8483, "step": 9450 }, { "epoch": 0.1306758002173345, "grad_norm": 0.17811571061611176, "learning_rate": 0.0001, "loss": 1.8469, "step": 9500 }, { "epoch": 0.1313635675868994, "grad_norm": 0.20159000158309937, "learning_rate": 0.0001, "loss": 1.8455, "step": 9550 }, { "epoch": 0.13205133495646432, "grad_norm": 0.1840062290430069, "learning_rate": 0.0001, "loss": 1.8511, "step": 9600 }, { "epoch": 0.13273910232602926, "grad_norm": 0.190440833568573, "learning_rate": 0.0001, "loss": 1.8474, "step": 9650 }, { "epoch": 0.13342686969559417, "grad_norm": 0.20033535361289978, "learning_rate": 0.0001, "loss": 1.8479, "step": 9700 }, { "epoch": 0.13411463706515908, "grad_norm": 0.1811174899339676, "learning_rate": 0.0001, "loss": 1.8504, "step": 9750 }, { "epoch": 0.134802404434724, "grad_norm": 0.2073344737291336, "learning_rate": 0.0001, "loss": 1.8507, "step": 9800 }, { "epoch": 0.1354901718042889, "grad_norm": 0.21762603521347046, "learning_rate": 0.0001, "loss": 1.8499, "step": 9850 }, { "epoch": 0.13617793917385385, "grad_norm": 0.1864607185125351, "learning_rate": 0.0001, "loss": 1.8471, "step": 9900 }, { "epoch": 0.13686570654341876, "grad_norm": 0.17837654054164886, "learning_rate": 0.0001, "loss": 1.8485, "step": 9950 }, { "epoch": 0.13755347391298367, "grad_norm": 0.20498532056808472, "learning_rate": 0.0001, "loss": 1.8497, "step": 10000 }, { "epoch": 0.13824124128254858, "grad_norm": 0.18355566263198853, "learning_rate": 0.0001, "loss": 1.8458, "step": 10050 }, { "epoch": 0.13892900865211352, "grad_norm": 0.2033490389585495, "learning_rate": 0.0001, "loss": 1.8451, "step": 10100 }, { "epoch": 0.13961677602167843, "grad_norm": 0.1855219006538391, "learning_rate": 0.0001, "loss": 1.8475, "step": 10150 }, { "epoch": 0.14030454339124335, "grad_norm": 0.18876652419567108, "learning_rate": 0.0001, "loss": 1.8473, "step": 10200 }, { "epoch": 0.14099231076080826, "grad_norm": 0.1731424629688263, "learning_rate": 0.0001, "loss": 1.8475, "step": 10250 }, { "epoch": 0.14168007813037317, "grad_norm": 0.186906635761261, "learning_rate": 0.0001, "loss": 1.8498, "step": 10300 }, { "epoch": 0.1423678454999381, "grad_norm": 0.18285425007343292, "learning_rate": 0.0001, "loss": 1.8451, "step": 10350 }, { "epoch": 0.14305561286950302, "grad_norm": 0.19545456767082214, "learning_rate": 0.0001, "loss": 1.8487, "step": 10400 }, { "epoch": 0.14374338023906794, "grad_norm": 0.16256272792816162, "learning_rate": 0.0001, "loss": 1.8461, "step": 10450 }, { "epoch": 0.14443114760863285, "grad_norm": 0.19637931883335114, "learning_rate": 0.0001, "loss": 1.8462, "step": 10500 }, { "epoch": 0.14511891497819776, "grad_norm": 0.20408660173416138, "learning_rate": 0.0001, "loss": 1.8465, "step": 10550 }, { "epoch": 0.1458066823477627, "grad_norm": 0.2140285223722458, "learning_rate": 0.0001, "loss": 1.8421, "step": 10600 }, { "epoch": 0.1464944497173276, "grad_norm": 0.18366774916648865, "learning_rate": 0.0001, "loss": 1.8454, "step": 10650 }, { "epoch": 0.14718221708689253, "grad_norm": 0.19011645019054413, "learning_rate": 0.0001, "loss": 1.8427, "step": 10700 }, { "epoch": 0.14786998445645744, "grad_norm": 0.1923753321170807, "learning_rate": 0.0001, "loss": 1.8442, "step": 10750 }, { "epoch": 0.14855775182602238, "grad_norm": 0.19208142161369324, "learning_rate": 0.0001, "loss": 1.8413, "step": 10800 }, { "epoch": 0.1492455191955873, "grad_norm": 0.19608841836452484, "learning_rate": 0.0001, "loss": 1.8468, "step": 10850 }, { "epoch": 0.1499332865651522, "grad_norm": 0.19484341144561768, "learning_rate": 0.0001, "loss": 1.849, "step": 10900 }, { "epoch": 0.15062105393471711, "grad_norm": 0.18584389984607697, "learning_rate": 0.0001, "loss": 1.8416, "step": 10950 }, { "epoch": 0.15130882130428203, "grad_norm": 0.1894279420375824, "learning_rate": 0.0001, "loss": 1.8454, "step": 11000 }, { "epoch": 0.15199658867384697, "grad_norm": 0.19622810184955597, "learning_rate": 0.0001, "loss": 1.8449, "step": 11050 }, { "epoch": 0.15268435604341188, "grad_norm": 0.18603233993053436, "learning_rate": 0.0001, "loss": 1.848, "step": 11100 }, { "epoch": 0.1533721234129768, "grad_norm": 0.18146397173404694, "learning_rate": 0.0001, "loss": 1.8413, "step": 11150 }, { "epoch": 0.1540598907825417, "grad_norm": 0.20820939540863037, "learning_rate": 0.0001, "loss": 1.844, "step": 11200 }, { "epoch": 0.15474765815210664, "grad_norm": 0.18021373450756073, "learning_rate": 0.0001, "loss": 1.8434, "step": 11250 }, { "epoch": 0.15543542552167156, "grad_norm": 0.19339635968208313, "learning_rate": 0.0001, "loss": 1.8405, "step": 11300 }, { "epoch": 0.15612319289123647, "grad_norm": 0.1994727998971939, "learning_rate": 0.0001, "loss": 1.8403, "step": 11350 }, { "epoch": 0.15681096026080138, "grad_norm": 0.1830483376979828, "learning_rate": 0.0001, "loss": 1.8415, "step": 11400 }, { "epoch": 0.1574987276303663, "grad_norm": 0.17064842581748962, "learning_rate": 0.0001, "loss": 1.8433, "step": 11450 }, { "epoch": 0.15818649499993123, "grad_norm": 0.19161944091320038, "learning_rate": 0.0001, "loss": 1.8428, "step": 11500 }, { "epoch": 0.15887426236949614, "grad_norm": 0.21216394007205963, "learning_rate": 0.0001, "loss": 1.8432, "step": 11550 }, { "epoch": 0.15956202973906106, "grad_norm": 0.1909138560295105, "learning_rate": 0.0001, "loss": 1.8429, "step": 11600 }, { "epoch": 0.16024979710862597, "grad_norm": 0.20326951146125793, "learning_rate": 0.0001, "loss": 1.8419, "step": 11650 }, { "epoch": 0.1609375644781909, "grad_norm": 0.19515758752822876, "learning_rate": 0.0001, "loss": 1.8448, "step": 11700 }, { "epoch": 0.16162533184775582, "grad_norm": 0.2075706273317337, "learning_rate": 0.0001, "loss": 1.8439, "step": 11750 }, { "epoch": 0.16231309921732073, "grad_norm": 0.21147705614566803, "learning_rate": 0.0001, "loss": 1.8433, "step": 11800 }, { "epoch": 0.16300086658688565, "grad_norm": 0.18318484723567963, "learning_rate": 0.0001, "loss": 1.8383, "step": 11850 }, { "epoch": 0.16368863395645056, "grad_norm": 0.18728312849998474, "learning_rate": 0.0001, "loss": 1.8426, "step": 11900 }, { "epoch": 0.1643764013260155, "grad_norm": 0.20905287563800812, "learning_rate": 0.0001, "loss": 1.8421, "step": 11950 }, { "epoch": 0.1650641686955804, "grad_norm": 0.18393969535827637, "learning_rate": 0.0001, "loss": 1.8408, "step": 12000 }, { "epoch": 0.16575193606514532, "grad_norm": 0.18366305530071259, "learning_rate": 0.0001, "loss": 1.8365, "step": 12050 }, { "epoch": 0.16643970343471023, "grad_norm": 0.19170603156089783, "learning_rate": 0.0001, "loss": 1.8416, "step": 12100 }, { "epoch": 0.16712747080427517, "grad_norm": 0.172319233417511, "learning_rate": 0.0001, "loss": 1.8411, "step": 12150 }, { "epoch": 0.1678152381738401, "grad_norm": 0.2174234390258789, "learning_rate": 0.0001, "loss": 1.8416, "step": 12200 }, { "epoch": 0.168503005543405, "grad_norm": 0.20210625231266022, "learning_rate": 0.0001, "loss": 1.8422, "step": 12250 }, { "epoch": 0.1691907729129699, "grad_norm": 0.1902657449245453, "learning_rate": 0.0001, "loss": 1.8369, "step": 12300 }, { "epoch": 0.16987854028253482, "grad_norm": 0.18901073932647705, "learning_rate": 0.0001, "loss": 1.8415, "step": 12350 }, { "epoch": 0.17056630765209976, "grad_norm": 0.17624430358409882, "learning_rate": 0.0001, "loss": 1.8373, "step": 12400 }, { "epoch": 0.17125407502166468, "grad_norm": 0.1844191551208496, "learning_rate": 0.0001, "loss": 1.8391, "step": 12450 }, { "epoch": 0.1719418423912296, "grad_norm": 0.19392350316047668, "learning_rate": 0.0001, "loss": 1.8416, "step": 12500 }, { "epoch": 0.1726296097607945, "grad_norm": 0.18644706904888153, "learning_rate": 0.0001, "loss": 1.8409, "step": 12550 }, { "epoch": 0.17331737713035944, "grad_norm": 0.19530895352363586, "learning_rate": 0.0001, "loss": 1.8381, "step": 12600 }, { "epoch": 0.17400514449992435, "grad_norm": 0.18004032969474792, "learning_rate": 0.0001, "loss": 1.8419, "step": 12650 }, { "epoch": 0.17469291186948926, "grad_norm": 0.20025117695331573, "learning_rate": 0.0001, "loss": 1.8379, "step": 12700 }, { "epoch": 0.17538067923905418, "grad_norm": 0.17622490227222443, "learning_rate": 0.0001, "loss": 1.8364, "step": 12750 }, { "epoch": 0.1760684466086191, "grad_norm": 0.19657030701637268, "learning_rate": 0.0001, "loss": 1.8364, "step": 12800 }, { "epoch": 0.17675621397818403, "grad_norm": 0.19141744077205658, "learning_rate": 0.0001, "loss": 1.8388, "step": 12850 }, { "epoch": 0.17744398134774894, "grad_norm": 0.23409488797187805, "learning_rate": 0.0001, "loss": 1.8392, "step": 12900 }, { "epoch": 0.17813174871731385, "grad_norm": 0.19104769825935364, "learning_rate": 0.0001, "loss": 1.8407, "step": 12950 }, { "epoch": 0.17881951608687877, "grad_norm": 0.1978139728307724, "learning_rate": 0.0001, "loss": 1.836, "step": 13000 }, { "epoch": 0.1795072834564437, "grad_norm": 0.1839970201253891, "learning_rate": 0.0001, "loss": 1.8406, "step": 13050 }, { "epoch": 0.18019505082600862, "grad_norm": 0.1969710737466812, "learning_rate": 0.0001, "loss": 1.8382, "step": 13100 }, { "epoch": 0.18088281819557353, "grad_norm": 0.21036314964294434, "learning_rate": 0.0001, "loss": 1.8372, "step": 13150 }, { "epoch": 0.18157058556513844, "grad_norm": 0.18064115941524506, "learning_rate": 0.0001, "loss": 1.8387, "step": 13200 }, { "epoch": 0.18225835293470335, "grad_norm": 0.20280593633651733, "learning_rate": 0.0001, "loss": 1.8345, "step": 13250 }, { "epoch": 0.1829461203042683, "grad_norm": 0.21196794509887695, "learning_rate": 0.0001, "loss": 1.8403, "step": 13300 }, { "epoch": 0.1836338876738332, "grad_norm": 0.18529263138771057, "learning_rate": 0.0001, "loss": 1.8395, "step": 13350 }, { "epoch": 0.18432165504339812, "grad_norm": 0.20009498298168182, "learning_rate": 0.0001, "loss": 1.8418, "step": 13400 }, { "epoch": 0.18500942241296303, "grad_norm": 0.1844586879014969, "learning_rate": 0.0001, "loss": 1.8388, "step": 13450 }, { "epoch": 0.18569718978252797, "grad_norm": 0.17497003078460693, "learning_rate": 0.0001, "loss": 1.8374, "step": 13500 }, { "epoch": 0.18638495715209288, "grad_norm": 0.21536414325237274, "learning_rate": 0.0001, "loss": 1.834, "step": 13550 }, { "epoch": 0.1870727245216578, "grad_norm": 0.20212842524051666, "learning_rate": 0.0001, "loss": 1.8361, "step": 13600 }, { "epoch": 0.1877604918912227, "grad_norm": 0.21032044291496277, "learning_rate": 0.0001, "loss": 1.8352, "step": 13650 }, { "epoch": 0.18844825926078762, "grad_norm": 0.17547431588172913, "learning_rate": 0.0001, "loss": 1.839, "step": 13700 }, { "epoch": 0.18913602663035256, "grad_norm": 0.17463110387325287, "learning_rate": 0.0001, "loss": 1.8345, "step": 13750 }, { "epoch": 0.18982379399991747, "grad_norm": 0.19794687628746033, "learning_rate": 0.0001, "loss": 1.8367, "step": 13800 }, { "epoch": 0.19051156136948239, "grad_norm": 0.17595866322517395, "learning_rate": 0.0001, "loss": 1.8349, "step": 13850 }, { "epoch": 0.1911993287390473, "grad_norm": 0.19087472558021545, "learning_rate": 0.0001, "loss": 1.8377, "step": 13900 }, { "epoch": 0.1918870961086122, "grad_norm": 0.1895439624786377, "learning_rate": 0.0001, "loss": 1.8392, "step": 13950 }, { "epoch": 0.19257486347817715, "grad_norm": 0.19558320939540863, "learning_rate": 0.0001, "loss": 1.8331, "step": 14000 }, { "epoch": 0.19326263084774206, "grad_norm": 0.18495230376720428, "learning_rate": 0.0001, "loss": 1.8357, "step": 14050 }, { "epoch": 0.19395039821730697, "grad_norm": 0.19197221100330353, "learning_rate": 0.0001, "loss": 1.8379, "step": 14100 }, { "epoch": 0.1946381655868719, "grad_norm": 0.17729446291923523, "learning_rate": 0.0001, "loss": 1.8336, "step": 14150 }, { "epoch": 0.19532593295643683, "grad_norm": 0.20683547854423523, "learning_rate": 0.0001, "loss": 1.8344, "step": 14200 }, { "epoch": 0.19601370032600174, "grad_norm": 0.16708314418792725, "learning_rate": 0.0001, "loss": 1.8375, "step": 14250 }, { "epoch": 0.19670146769556665, "grad_norm": 0.2065526694059372, "learning_rate": 0.0001, "loss": 1.8397, "step": 14300 }, { "epoch": 0.19738923506513156, "grad_norm": 0.2007008045911789, "learning_rate": 0.0001, "loss": 1.8351, "step": 14350 }, { "epoch": 0.19807700243469648, "grad_norm": 0.1773243397474289, "learning_rate": 0.0001, "loss": 1.8338, "step": 14400 }, { "epoch": 0.19876476980426142, "grad_norm": 0.1875116229057312, "learning_rate": 0.0001, "loss": 1.8379, "step": 14450 }, { "epoch": 0.19945253717382633, "grad_norm": 0.19387130439281464, "learning_rate": 0.0001, "loss": 1.8343, "step": 14500 }, { "epoch": 0.20014030454339124, "grad_norm": 0.17164736986160278, "learning_rate": 0.0001, "loss": 1.8338, "step": 14550 }, { "epoch": 0.20082807191295615, "grad_norm": 0.19135966897010803, "learning_rate": 0.0001, "loss": 1.8321, "step": 14600 }, { "epoch": 0.2015158392825211, "grad_norm": 0.21152153611183167, "learning_rate": 0.0001, "loss": 1.8332, "step": 14650 }, { "epoch": 0.202203606652086, "grad_norm": 0.19576500356197357, "learning_rate": 0.0001, "loss": 1.8338, "step": 14700 }, { "epoch": 0.20289137402165092, "grad_norm": 0.21700510382652283, "learning_rate": 0.0001, "loss": 1.8381, "step": 14750 }, { "epoch": 0.20357914139121583, "grad_norm": 0.18183092772960663, "learning_rate": 0.0001, "loss": 1.833, "step": 14800 }, { "epoch": 0.20426690876078074, "grad_norm": 0.1678183525800705, "learning_rate": 0.0001, "loss": 1.8365, "step": 14850 }, { "epoch": 0.20495467613034568, "grad_norm": 0.1790694147348404, "learning_rate": 0.0001, "loss": 1.8323, "step": 14900 }, { "epoch": 0.2056424434999106, "grad_norm": 0.17274673283100128, "learning_rate": 0.0001, "loss": 1.8357, "step": 14950 }, { "epoch": 0.2063302108694755, "grad_norm": 0.1773209273815155, "learning_rate": 0.0001, "loss": 1.8338, "step": 15000 }, { "epoch": 0.20701797823904042, "grad_norm": 0.29811668395996094, "learning_rate": 0.0001, "loss": 1.8322, "step": 15050 }, { "epoch": 0.20770574560860536, "grad_norm": 0.18590272963047028, "learning_rate": 0.0001, "loss": 1.8307, "step": 15100 }, { "epoch": 0.20839351297817027, "grad_norm": 0.19656258821487427, "learning_rate": 0.0001, "loss": 1.8364, "step": 15150 }, { "epoch": 0.20908128034773518, "grad_norm": 0.1760113537311554, "learning_rate": 0.0001, "loss": 1.8363, "step": 15200 }, { "epoch": 0.2097690477173001, "grad_norm": 0.17442069947719574, "learning_rate": 0.0001, "loss": 1.8346, "step": 15250 }, { "epoch": 0.210456815086865, "grad_norm": 0.2154201865196228, "learning_rate": 0.0001, "loss": 1.8359, "step": 15300 }, { "epoch": 0.21114458245642995, "grad_norm": 0.18702222406864166, "learning_rate": 0.0001, "loss": 1.8333, "step": 15350 }, { "epoch": 0.21183234982599486, "grad_norm": 0.222214013338089, "learning_rate": 0.0001, "loss": 1.8386, "step": 15400 }, { "epoch": 0.21252011719555977, "grad_norm": 0.18646612763404846, "learning_rate": 0.0001, "loss": 1.8336, "step": 15450 }, { "epoch": 0.21320788456512468, "grad_norm": 0.19032032787799835, "learning_rate": 0.0001, "loss": 1.8359, "step": 15500 }, { "epoch": 0.21389565193468962, "grad_norm": 0.1962030827999115, "learning_rate": 0.0001, "loss": 1.8314, "step": 15550 }, { "epoch": 0.21458341930425454, "grad_norm": 0.18067054450511932, "learning_rate": 0.0001, "loss": 1.8298, "step": 15600 }, { "epoch": 0.21527118667381945, "grad_norm": 0.1977655440568924, "learning_rate": 0.0001, "loss": 1.8335, "step": 15650 }, { "epoch": 0.21595895404338436, "grad_norm": 0.17689162492752075, "learning_rate": 0.0001, "loss": 1.834, "step": 15700 }, { "epoch": 0.21664672141294927, "grad_norm": 0.189301997423172, "learning_rate": 0.0001, "loss": 1.8302, "step": 15750 }, { "epoch": 0.2173344887825142, "grad_norm": 0.21416552364826202, "learning_rate": 0.0001, "loss": 1.833, "step": 15800 }, { "epoch": 0.21802225615207912, "grad_norm": 0.17280973494052887, "learning_rate": 0.0001, "loss": 1.8325, "step": 15850 }, { "epoch": 0.21871002352164404, "grad_norm": 0.2203332632780075, "learning_rate": 0.0001, "loss": 1.8315, "step": 15900 }, { "epoch": 0.21939779089120895, "grad_norm": 0.17942380905151367, "learning_rate": 0.0001, "loss": 1.8313, "step": 15950 }, { "epoch": 0.2200855582607739, "grad_norm": 0.2053511142730713, "learning_rate": 0.0001, "loss": 1.8322, "step": 16000 }, { "epoch": 0.2207733256303388, "grad_norm": 0.18660666048526764, "learning_rate": 0.0001, "loss": 1.8315, "step": 16050 }, { "epoch": 0.2214610929999037, "grad_norm": 0.20179618895053864, "learning_rate": 0.0001, "loss": 1.8309, "step": 16100 }, { "epoch": 0.22214886036946863, "grad_norm": 0.1849927455186844, "learning_rate": 0.0001, "loss": 1.8349, "step": 16150 }, { "epoch": 0.22283662773903354, "grad_norm": 0.16893066465854645, "learning_rate": 0.0001, "loss": 1.8333, "step": 16200 }, { "epoch": 0.22352439510859848, "grad_norm": 0.1815815567970276, "learning_rate": 0.0001, "loss": 1.8277, "step": 16250 }, { "epoch": 0.2242121624781634, "grad_norm": 0.17478667199611664, "learning_rate": 0.0001, "loss": 1.8324, "step": 16300 }, { "epoch": 0.2248999298477283, "grad_norm": 0.20333503186702728, "learning_rate": 0.0001, "loss": 1.8299, "step": 16350 }, { "epoch": 0.22558769721729321, "grad_norm": 0.19628338515758514, "learning_rate": 0.0001, "loss": 1.8322, "step": 16400 }, { "epoch": 0.22627546458685815, "grad_norm": 0.19011887907981873, "learning_rate": 0.0001, "loss": 1.8301, "step": 16450 }, { "epoch": 0.22696323195642307, "grad_norm": 0.19007809460163116, "learning_rate": 0.0001, "loss": 1.8306, "step": 16500 }, { "epoch": 0.22765099932598798, "grad_norm": 0.18108965456485748, "learning_rate": 0.0001, "loss": 1.8304, "step": 16550 }, { "epoch": 0.2283387666955529, "grad_norm": 0.16927501559257507, "learning_rate": 0.0001, "loss": 1.832, "step": 16600 }, { "epoch": 0.2290265340651178, "grad_norm": 0.18328557908535004, "learning_rate": 0.0001, "loss": 1.8315, "step": 16650 }, { "epoch": 0.22971430143468274, "grad_norm": 0.21978403627872467, "learning_rate": 0.0001, "loss": 1.8314, "step": 16700 }, { "epoch": 0.23040206880424766, "grad_norm": 0.1928972601890564, "learning_rate": 0.0001, "loss": 1.8281, "step": 16750 }, { "epoch": 0.23108983617381257, "grad_norm": 0.19355738162994385, "learning_rate": 0.0001, "loss": 1.8289, "step": 16800 }, { "epoch": 0.23177760354337748, "grad_norm": 0.18013496696949005, "learning_rate": 0.0001, "loss": 1.831, "step": 16850 }, { "epoch": 0.2324653709129424, "grad_norm": 0.1848910003900528, "learning_rate": 0.0001, "loss": 1.826, "step": 16900 }, { "epoch": 0.23315313828250733, "grad_norm": 0.20185594260692596, "learning_rate": 0.0001, "loss": 1.8274, "step": 16950 }, { "epoch": 0.23384090565207225, "grad_norm": 0.1898491382598877, "learning_rate": 0.0001, "loss": 1.8292, "step": 17000 }, { "epoch": 0.23452867302163716, "grad_norm": 0.17610591650009155, "learning_rate": 0.0001, "loss": 1.831, "step": 17050 }, { "epoch": 0.23521644039120207, "grad_norm": 0.2032867968082428, "learning_rate": 0.0001, "loss": 1.8306, "step": 17100 }, { "epoch": 0.235904207760767, "grad_norm": 0.1812831312417984, "learning_rate": 0.0001, "loss": 1.8331, "step": 17150 }, { "epoch": 0.23659197513033192, "grad_norm": 0.17079557478427887, "learning_rate": 0.0001, "loss": 1.8266, "step": 17200 }, { "epoch": 0.23727974249989683, "grad_norm": 0.17599579691886902, "learning_rate": 0.0001, "loss": 1.8327, "step": 17250 }, { "epoch": 0.23796750986946175, "grad_norm": 0.16692423820495605, "learning_rate": 0.0001, "loss": 1.8294, "step": 17300 }, { "epoch": 0.23865527723902666, "grad_norm": 0.17235307395458221, "learning_rate": 0.0001, "loss": 1.8324, "step": 17350 }, { "epoch": 0.2393430446085916, "grad_norm": 0.18419289588928223, "learning_rate": 0.0001, "loss": 1.8234, "step": 17400 }, { "epoch": 0.2400308119781565, "grad_norm": 0.16880065202713013, "learning_rate": 0.0001, "loss": 1.8315, "step": 17450 }, { "epoch": 0.24071857934772142, "grad_norm": 0.18046660721302032, "learning_rate": 0.0001, "loss": 1.8288, "step": 17500 }, { "epoch": 0.24140634671728634, "grad_norm": 0.19775420427322388, "learning_rate": 0.0001, "loss": 1.8304, "step": 17550 }, { "epoch": 0.24209411408685128, "grad_norm": 0.18596383929252625, "learning_rate": 0.0001, "loss": 1.8269, "step": 17600 }, { "epoch": 0.2427818814564162, "grad_norm": 0.18525435030460358, "learning_rate": 0.0001, "loss": 1.8293, "step": 17650 }, { "epoch": 0.2434696488259811, "grad_norm": 0.2105979025363922, "learning_rate": 0.0001, "loss": 1.8252, "step": 17700 }, { "epoch": 0.244157416195546, "grad_norm": 0.18099245429039001, "learning_rate": 0.0001, "loss": 1.8271, "step": 17750 }, { "epoch": 0.24484518356511092, "grad_norm": 0.17330291867256165, "learning_rate": 0.0001, "loss": 1.8261, "step": 17800 }, { "epoch": 0.24553295093467586, "grad_norm": 0.17979152500629425, "learning_rate": 0.0001, "loss": 1.8304, "step": 17850 }, { "epoch": 0.24622071830424078, "grad_norm": 0.19253650307655334, "learning_rate": 0.0001, "loss": 1.83, "step": 17900 }, { "epoch": 0.2469084856738057, "grad_norm": 0.20440231263637543, "learning_rate": 0.0001, "loss": 1.8251, "step": 17950 }, { "epoch": 0.2475962530433706, "grad_norm": 0.18242883682250977, "learning_rate": 0.0001, "loss": 1.8286, "step": 18000 }, { "epoch": 0.24828402041293554, "grad_norm": 0.1742672622203827, "learning_rate": 0.0001, "loss": 1.8271, "step": 18050 }, { "epoch": 0.24897178778250045, "grad_norm": 0.19099250435829163, "learning_rate": 0.0001, "loss": 1.8284, "step": 18100 }, { "epoch": 0.24965955515206537, "grad_norm": 0.19839410483837128, "learning_rate": 0.0001, "loss": 1.8254, "step": 18150 }, { "epoch": 0.2503473225216303, "grad_norm": 0.18187545239925385, "learning_rate": 0.0001, "loss": 1.8258, "step": 18200 }, { "epoch": 0.2510350898911952, "grad_norm": 0.16419640183448792, "learning_rate": 0.0001, "loss": 1.825, "step": 18250 }, { "epoch": 0.2517228572607601, "grad_norm": 0.1788015216588974, "learning_rate": 0.0001, "loss": 1.8257, "step": 18300 }, { "epoch": 0.25241062463032504, "grad_norm": 0.2013292908668518, "learning_rate": 0.0001, "loss": 1.8345, "step": 18350 }, { "epoch": 0.25309839199989, "grad_norm": 0.18886993825435638, "learning_rate": 0.0001, "loss": 1.8269, "step": 18400 }, { "epoch": 0.25378615936945487, "grad_norm": 0.18426848948001862, "learning_rate": 0.0001, "loss": 1.8291, "step": 18450 }, { "epoch": 0.2544739267390198, "grad_norm": 0.1836244910955429, "learning_rate": 0.0001, "loss": 1.8228, "step": 18500 }, { "epoch": 0.2551616941085847, "grad_norm": 0.18584777414798737, "learning_rate": 0.0001, "loss": 1.8283, "step": 18550 }, { "epoch": 0.25584946147814963, "grad_norm": 0.16920630633831024, "learning_rate": 0.0001, "loss": 1.8274, "step": 18600 }, { "epoch": 0.25653722884771457, "grad_norm": 0.20111984014511108, "learning_rate": 0.0001, "loss": 1.8285, "step": 18650 }, { "epoch": 0.25722499621727946, "grad_norm": 0.18769313395023346, "learning_rate": 0.0001, "loss": 1.8295, "step": 18700 }, { "epoch": 0.2579127635868444, "grad_norm": 0.18159103393554688, "learning_rate": 0.0001, "loss": 1.8236, "step": 18750 }, { "epoch": 0.2586005309564093, "grad_norm": 0.1929440200328827, "learning_rate": 0.0001, "loss": 1.8279, "step": 18800 }, { "epoch": 0.2592882983259742, "grad_norm": 0.16436657309532166, "learning_rate": 0.0001, "loss": 1.823, "step": 18850 }, { "epoch": 0.25997606569553916, "grad_norm": 0.1638740748167038, "learning_rate": 0.0001, "loss": 1.8251, "step": 18900 }, { "epoch": 0.26066383306510404, "grad_norm": 0.18252821266651154, "learning_rate": 0.0001, "loss": 1.8251, "step": 18950 }, { "epoch": 0.261351600434669, "grad_norm": 0.18031029403209686, "learning_rate": 0.0001, "loss": 1.8243, "step": 19000 }, { "epoch": 0.26203936780423387, "grad_norm": 0.1770683377981186, "learning_rate": 0.0001, "loss": 1.8274, "step": 19050 }, { "epoch": 0.2627271351737988, "grad_norm": 0.20250555872917175, "learning_rate": 0.0001, "loss": 1.8258, "step": 19100 }, { "epoch": 0.26341490254336375, "grad_norm": 0.16491496562957764, "learning_rate": 0.0001, "loss": 1.8251, "step": 19150 }, { "epoch": 0.26410266991292863, "grad_norm": 0.19582998752593994, "learning_rate": 0.0001, "loss": 1.824, "step": 19200 }, { "epoch": 0.2647904372824936, "grad_norm": 0.17773911356925964, "learning_rate": 0.0001, "loss": 1.8195, "step": 19250 }, { "epoch": 0.2654782046520585, "grad_norm": 0.18118888139724731, "learning_rate": 0.0001, "loss": 1.8239, "step": 19300 }, { "epoch": 0.2661659720216234, "grad_norm": 0.15766191482543945, "learning_rate": 0.0001, "loss": 1.8232, "step": 19350 }, { "epoch": 0.26685373939118834, "grad_norm": 0.17026937007904053, "learning_rate": 0.0001, "loss": 1.8223, "step": 19400 }, { "epoch": 0.2675415067607532, "grad_norm": 0.18863512575626373, "learning_rate": 0.0001, "loss": 1.8257, "step": 19450 }, { "epoch": 0.26822927413031816, "grad_norm": 0.18321500718593597, "learning_rate": 0.0001, "loss": 1.8238, "step": 19500 }, { "epoch": 0.2689170414998831, "grad_norm": 0.20935237407684326, "learning_rate": 0.0001, "loss": 1.8229, "step": 19550 }, { "epoch": 0.269604808869448, "grad_norm": 0.19490981101989746, "learning_rate": 0.0001, "loss": 1.8194, "step": 19600 }, { "epoch": 0.2702925762390129, "grad_norm": 0.19290666282176971, "learning_rate": 0.0001, "loss": 1.8258, "step": 19650 }, { "epoch": 0.2709803436085778, "grad_norm": 0.1819174438714981, "learning_rate": 0.0001, "loss": 1.8224, "step": 19700 }, { "epoch": 0.27166811097814275, "grad_norm": 0.18501299619674683, "learning_rate": 0.0001, "loss": 1.8297, "step": 19750 }, { "epoch": 0.2723558783477077, "grad_norm": 0.19111846387386322, "learning_rate": 0.0001, "loss": 1.8226, "step": 19800 }, { "epoch": 0.2730436457172726, "grad_norm": 0.18800359964370728, "learning_rate": 0.0001, "loss": 1.8215, "step": 19850 }, { "epoch": 0.2737314130868375, "grad_norm": 0.18408334255218506, "learning_rate": 0.0001, "loss": 1.8239, "step": 19900 }, { "epoch": 0.2744191804564024, "grad_norm": 0.19500131905078888, "learning_rate": 0.0001, "loss": 1.8232, "step": 19950 }, { "epoch": 0.27510694782596734, "grad_norm": 0.18263010680675507, "learning_rate": 0.0001, "loss": 1.8246, "step": 20000 }, { "epoch": 0.2757947151955323, "grad_norm": 0.1732577383518219, "learning_rate": 0.0001, "loss": 1.8241, "step": 20050 }, { "epoch": 0.27648248256509717, "grad_norm": 0.1958979219198227, "learning_rate": 0.0001, "loss": 1.8215, "step": 20100 }, { "epoch": 0.2771702499346621, "grad_norm": 0.1755562722682953, "learning_rate": 0.0001, "loss": 1.8275, "step": 20150 }, { "epoch": 0.27785801730422705, "grad_norm": 0.17292717099189758, "learning_rate": 0.0001, "loss": 1.8221, "step": 20200 }, { "epoch": 0.27854578467379193, "grad_norm": 0.16997367143630981, "learning_rate": 0.0001, "loss": 1.8221, "step": 20250 }, { "epoch": 0.27923355204335687, "grad_norm": 0.1903601735830307, "learning_rate": 0.0001, "loss": 1.8243, "step": 20300 }, { "epoch": 0.27992131941292175, "grad_norm": 0.17447033524513245, "learning_rate": 0.0001, "loss": 1.8229, "step": 20350 }, { "epoch": 0.2806090867824867, "grad_norm": 0.18861395120620728, "learning_rate": 0.0001, "loss": 1.8222, "step": 20400 }, { "epoch": 0.28129685415205163, "grad_norm": 0.17015644907951355, "learning_rate": 0.0001, "loss": 1.8207, "step": 20450 }, { "epoch": 0.2819846215216165, "grad_norm": 0.19356681406497955, "learning_rate": 0.0001, "loss": 1.8202, "step": 20500 }, { "epoch": 0.28267238889118146, "grad_norm": 0.1988779753446579, "learning_rate": 0.0001, "loss": 1.8199, "step": 20550 }, { "epoch": 0.28336015626074634, "grad_norm": 0.1967942714691162, "learning_rate": 0.0001, "loss": 1.8217, "step": 20600 }, { "epoch": 0.2840479236303113, "grad_norm": 0.18917816877365112, "learning_rate": 0.0001, "loss": 1.8229, "step": 20650 }, { "epoch": 0.2847356909998762, "grad_norm": 0.16583094000816345, "learning_rate": 0.0001, "loss": 1.8219, "step": 20700 }, { "epoch": 0.2854234583694411, "grad_norm": 0.19918115437030792, "learning_rate": 0.0001, "loss": 1.8246, "step": 20750 }, { "epoch": 0.28611122573900605, "grad_norm": 0.1981818974018097, "learning_rate": 0.0001, "loss": 1.8211, "step": 20800 }, { "epoch": 0.28679899310857093, "grad_norm": 0.1838293969631195, "learning_rate": 0.0001, "loss": 1.8224, "step": 20850 }, { "epoch": 0.28748676047813587, "grad_norm": 0.20068101584911346, "learning_rate": 0.0001, "loss": 1.82, "step": 20900 }, { "epoch": 0.2881745278477008, "grad_norm": 0.17375263571739197, "learning_rate": 0.0001, "loss": 1.8195, "step": 20950 }, { "epoch": 0.2888622952172657, "grad_norm": 0.16706246137619019, "learning_rate": 0.0001, "loss": 1.826, "step": 21000 }, { "epoch": 0.28955006258683064, "grad_norm": 0.20021022856235504, "learning_rate": 0.0001, "loss": 1.8207, "step": 21050 }, { "epoch": 0.2902378299563955, "grad_norm": 0.20570990443229675, "learning_rate": 0.0001, "loss": 1.8221, "step": 21100 }, { "epoch": 0.29092559732596046, "grad_norm": 0.2043515294790268, "learning_rate": 0.0001, "loss": 1.8239, "step": 21150 }, { "epoch": 0.2916133646955254, "grad_norm": 0.17122073471546173, "learning_rate": 0.0001, "loss": 1.8203, "step": 21200 }, { "epoch": 0.2923011320650903, "grad_norm": 0.19589883089065552, "learning_rate": 0.0001, "loss": 1.8206, "step": 21250 }, { "epoch": 0.2929888994346552, "grad_norm": 0.19675767421722412, "learning_rate": 0.0001, "loss": 1.8244, "step": 21300 }, { "epoch": 0.29367666680422017, "grad_norm": 0.1788429468870163, "learning_rate": 0.0001, "loss": 1.8225, "step": 21350 }, { "epoch": 0.29436443417378505, "grad_norm": 0.17564085125923157, "learning_rate": 0.0001, "loss": 1.8242, "step": 21400 }, { "epoch": 0.29505220154335, "grad_norm": 0.1807086318731308, "learning_rate": 0.0001, "loss": 1.8245, "step": 21450 }, { "epoch": 0.2957399689129149, "grad_norm": 0.1772526502609253, "learning_rate": 0.0001, "loss": 1.8231, "step": 21500 }, { "epoch": 0.2964277362824798, "grad_norm": 0.1903577297925949, "learning_rate": 0.0001, "loss": 1.8209, "step": 21550 }, { "epoch": 0.29711550365204475, "grad_norm": 0.17995303869247437, "learning_rate": 0.0001, "loss": 1.817, "step": 21600 }, { "epoch": 0.29780327102160964, "grad_norm": 0.1937420666217804, "learning_rate": 0.0001, "loss": 1.8241, "step": 21650 }, { "epoch": 0.2984910383911746, "grad_norm": 0.1729700267314911, "learning_rate": 0.0001, "loss": 1.822, "step": 21700 }, { "epoch": 0.29917880576073946, "grad_norm": 0.16370828449726105, "learning_rate": 0.0001, "loss": 1.8217, "step": 21750 }, { "epoch": 0.2998665731303044, "grad_norm": 0.17373540997505188, "learning_rate": 0.0001, "loss": 1.8191, "step": 21800 }, { "epoch": 0.30055434049986934, "grad_norm": 0.19695748388767242, "learning_rate": 0.0001, "loss": 1.8236, "step": 21850 }, { "epoch": 0.30124210786943423, "grad_norm": 0.20299525558948517, "learning_rate": 0.0001, "loss": 1.8181, "step": 21900 }, { "epoch": 0.30192987523899917, "grad_norm": 0.5943254828453064, "learning_rate": 0.0001, "loss": 1.8207, "step": 21950 }, { "epoch": 0.30261764260856405, "grad_norm": 0.1915915608406067, "learning_rate": 0.0001, "loss": 1.8245, "step": 22000 }, { "epoch": 0.303305409978129, "grad_norm": 0.16212280094623566, "learning_rate": 0.0001, "loss": 1.8226, "step": 22050 }, { "epoch": 0.30399317734769393, "grad_norm": 0.16871103644371033, "learning_rate": 0.0001, "loss": 1.8193, "step": 22100 }, { "epoch": 0.3046809447172588, "grad_norm": 0.1811041682958603, "learning_rate": 0.0001, "loss": 1.8187, "step": 22150 }, { "epoch": 0.30536871208682376, "grad_norm": 0.1868380606174469, "learning_rate": 0.0001, "loss": 1.8219, "step": 22200 }, { "epoch": 0.3060564794563887, "grad_norm": 0.18134795129299164, "learning_rate": 0.0001, "loss": 1.8207, "step": 22250 }, { "epoch": 0.3067442468259536, "grad_norm": 0.17329555749893188, "learning_rate": 0.0001, "loss": 1.8193, "step": 22300 }, { "epoch": 0.3074320141955185, "grad_norm": 0.18371562659740448, "learning_rate": 0.0001, "loss": 1.821, "step": 22350 }, { "epoch": 0.3081197815650834, "grad_norm": 0.17543677985668182, "learning_rate": 0.0001, "loss": 1.8182, "step": 22400 }, { "epoch": 0.30880754893464835, "grad_norm": 0.18362955749034882, "learning_rate": 0.0001, "loss": 1.8187, "step": 22450 }, { "epoch": 0.3094953163042133, "grad_norm": 0.20341430604457855, "learning_rate": 0.0001, "loss": 1.8198, "step": 22500 }, { "epoch": 0.31018308367377817, "grad_norm": 0.1833573579788208, "learning_rate": 0.0001, "loss": 1.8167, "step": 22550 }, { "epoch": 0.3108708510433431, "grad_norm": 0.1798466444015503, "learning_rate": 0.0001, "loss": 1.8204, "step": 22600 }, { "epoch": 0.311558618412908, "grad_norm": 0.18346908688545227, "learning_rate": 0.0001, "loss": 1.8197, "step": 22650 }, { "epoch": 0.31224638578247293, "grad_norm": 0.1842503696680069, "learning_rate": 0.0001, "loss": 1.822, "step": 22700 }, { "epoch": 0.3129341531520379, "grad_norm": 0.1917971521615982, "learning_rate": 0.0001, "loss": 1.8205, "step": 22750 }, { "epoch": 0.31362192052160276, "grad_norm": 0.18140938878059387, "learning_rate": 0.0001, "loss": 1.8187, "step": 22800 }, { "epoch": 0.3143096878911677, "grad_norm": 0.17349034547805786, "learning_rate": 0.0001, "loss": 1.8204, "step": 22850 }, { "epoch": 0.3149974552607326, "grad_norm": 0.17727358639240265, "learning_rate": 0.0001, "loss": 1.8203, "step": 22900 }, { "epoch": 0.3156852226302975, "grad_norm": 0.1764019876718521, "learning_rate": 0.0001, "loss": 1.8197, "step": 22950 }, { "epoch": 0.31637298999986246, "grad_norm": 0.18336281180381775, "learning_rate": 0.0001, "loss": 1.8168, "step": 23000 }, { "epoch": 0.31706075736942735, "grad_norm": 0.15488466620445251, "learning_rate": 0.0001, "loss": 1.819, "step": 23050 }, { "epoch": 0.3177485247389923, "grad_norm": 0.16988332569599152, "learning_rate": 0.0001, "loss": 1.8151, "step": 23100 }, { "epoch": 0.31843629210855723, "grad_norm": 0.16344988346099854, "learning_rate": 0.0001, "loss": 1.819, "step": 23150 }, { "epoch": 0.3191240594781221, "grad_norm": 0.17984721064567566, "learning_rate": 0.0001, "loss": 1.8182, "step": 23200 }, { "epoch": 0.31981182684768705, "grad_norm": 0.19572113454341888, "learning_rate": 0.0001, "loss": 1.8158, "step": 23250 }, { "epoch": 0.32049959421725194, "grad_norm": 0.21890446543693542, "learning_rate": 0.0001, "loss": 1.8158, "step": 23300 }, { "epoch": 0.3211873615868169, "grad_norm": 0.1672099530696869, "learning_rate": 0.0001, "loss": 1.8183, "step": 23350 }, { "epoch": 0.3218751289563818, "grad_norm": 0.18066146969795227, "learning_rate": 0.0001, "loss": 1.8194, "step": 23400 }, { "epoch": 0.3225628963259467, "grad_norm": 0.1749303936958313, "learning_rate": 0.0001, "loss": 1.8192, "step": 23450 }, { "epoch": 0.32325066369551164, "grad_norm": 0.1646299809217453, "learning_rate": 0.0001, "loss": 1.819, "step": 23500 }, { "epoch": 0.3239384310650765, "grad_norm": 0.204520583152771, "learning_rate": 0.0001, "loss": 1.8166, "step": 23550 }, { "epoch": 0.32462619843464147, "grad_norm": 0.166048064827919, "learning_rate": 0.0001, "loss": 1.8163, "step": 23600 }, { "epoch": 0.3253139658042064, "grad_norm": 0.17722272872924805, "learning_rate": 0.0001, "loss": 1.8158, "step": 23650 }, { "epoch": 0.3260017331737713, "grad_norm": 0.1896638125181198, "learning_rate": 0.0001, "loss": 1.8165, "step": 23700 }, { "epoch": 0.32668950054333623, "grad_norm": 0.16389790177345276, "learning_rate": 0.0001, "loss": 1.8163, "step": 23750 }, { "epoch": 0.3273772679129011, "grad_norm": 0.17973138391971588, "learning_rate": 0.0001, "loss": 1.8201, "step": 23800 }, { "epoch": 0.32806503528246606, "grad_norm": 0.20095448195934296, "learning_rate": 0.0001, "loss": 1.8174, "step": 23850 }, { "epoch": 0.328752802652031, "grad_norm": 0.18039678037166595, "learning_rate": 0.0001, "loss": 1.8179, "step": 23900 }, { "epoch": 0.3294405700215959, "grad_norm": 0.1760893315076828, "learning_rate": 0.0001, "loss": 1.816, "step": 23950 }, { "epoch": 0.3301283373911608, "grad_norm": 0.171057790517807, "learning_rate": 0.0001, "loss": 1.816, "step": 24000 }, { "epoch": 0.33081610476072576, "grad_norm": 0.17639483511447906, "learning_rate": 0.0001, "loss": 1.8157, "step": 24050 }, { "epoch": 0.33150387213029064, "grad_norm": 0.16385740041732788, "learning_rate": 0.0001, "loss": 1.8195, "step": 24100 }, { "epoch": 0.3321916394998556, "grad_norm": 0.18215522170066833, "learning_rate": 0.0001, "loss": 1.8157, "step": 24150 }, { "epoch": 0.33287940686942047, "grad_norm": 0.17613132297992706, "learning_rate": 0.0001, "loss": 1.8152, "step": 24200 }, { "epoch": 0.3335671742389854, "grad_norm": 0.16723348200321198, "learning_rate": 0.0001, "loss": 1.8141, "step": 24250 }, { "epoch": 0.33425494160855035, "grad_norm": 0.16092203557491302, "learning_rate": 0.0001, "loss": 1.8173, "step": 24300 }, { "epoch": 0.33494270897811523, "grad_norm": 0.17928454279899597, "learning_rate": 0.0001, "loss": 1.8188, "step": 24350 }, { "epoch": 0.3356304763476802, "grad_norm": 0.18230123817920685, "learning_rate": 0.0001, "loss": 1.8152, "step": 24400 }, { "epoch": 0.33631824371724506, "grad_norm": 0.1699696034193039, "learning_rate": 0.0001, "loss": 1.8194, "step": 24450 }, { "epoch": 0.33700601108681, "grad_norm": 0.1800839602947235, "learning_rate": 0.0001, "loss": 1.8126, "step": 24500 }, { "epoch": 0.33769377845637494, "grad_norm": 0.19913671910762787, "learning_rate": 0.0001, "loss": 1.8148, "step": 24550 }, { "epoch": 0.3383815458259398, "grad_norm": 0.16596053540706635, "learning_rate": 0.0001, "loss": 1.818, "step": 24600 }, { "epoch": 0.33906931319550476, "grad_norm": 0.1894855797290802, "learning_rate": 0.0001, "loss": 1.8142, "step": 24650 }, { "epoch": 0.33975708056506965, "grad_norm": 0.1800161600112915, "learning_rate": 0.0001, "loss": 1.8152, "step": 24700 }, { "epoch": 0.3404448479346346, "grad_norm": 0.17433103919029236, "learning_rate": 0.0001, "loss": 1.815, "step": 24750 }, { "epoch": 0.3411326153041995, "grad_norm": 0.18210847675800323, "learning_rate": 0.0001, "loss": 1.8168, "step": 24800 }, { "epoch": 0.3418203826737644, "grad_norm": 0.17840790748596191, "learning_rate": 0.0001, "loss": 1.8159, "step": 24850 }, { "epoch": 0.34250815004332935, "grad_norm": 0.18368154764175415, "learning_rate": 0.0001, "loss": 1.8171, "step": 24900 }, { "epoch": 0.34319591741289424, "grad_norm": 0.17999804019927979, "learning_rate": 0.0001, "loss": 1.817, "step": 24950 }, { "epoch": 0.3438836847824592, "grad_norm": 0.19299517571926117, "learning_rate": 0.0001, "loss": 1.8161, "step": 25000 }, { "epoch": 0.3445714521520241, "grad_norm": 0.17866362631320953, "learning_rate": 0.0001, "loss": 1.8121, "step": 25050 }, { "epoch": 0.345259219521589, "grad_norm": 0.16793055832386017, "learning_rate": 0.0001, "loss": 1.8137, "step": 25100 }, { "epoch": 0.34594698689115394, "grad_norm": 0.18356679379940033, "learning_rate": 0.0001, "loss": 1.8158, "step": 25150 }, { "epoch": 0.3466347542607189, "grad_norm": 0.18392959237098694, "learning_rate": 0.0001, "loss": 1.8135, "step": 25200 }, { "epoch": 0.34732252163028376, "grad_norm": 0.18158595263957977, "learning_rate": 0.0001, "loss": 1.8168, "step": 25250 }, { "epoch": 0.3480102889998487, "grad_norm": 0.1956174075603485, "learning_rate": 0.0001, "loss": 1.8137, "step": 25300 }, { "epoch": 0.3486980563694136, "grad_norm": 0.17629751563072205, "learning_rate": 0.0001, "loss": 1.8161, "step": 25350 }, { "epoch": 0.34938582373897853, "grad_norm": 0.1842150092124939, "learning_rate": 0.0001, "loss": 1.8112, "step": 25400 }, { "epoch": 0.35007359110854347, "grad_norm": 0.18889479339122772, "learning_rate": 0.0001, "loss": 1.8152, "step": 25450 }, { "epoch": 0.35076135847810835, "grad_norm": 0.16872894763946533, "learning_rate": 0.0001, "loss": 1.818, "step": 25500 }, { "epoch": 0.3514491258476733, "grad_norm": 0.16502858698368073, "learning_rate": 0.0001, "loss": 1.8127, "step": 25550 }, { "epoch": 0.3521368932172382, "grad_norm": 0.1778111755847931, "learning_rate": 0.0001, "loss": 1.8202, "step": 25600 }, { "epoch": 0.3528246605868031, "grad_norm": 0.16866064071655273, "learning_rate": 0.0001, "loss": 1.8155, "step": 25650 }, { "epoch": 0.35351242795636806, "grad_norm": 0.1845904141664505, "learning_rate": 0.0001, "loss": 1.8171, "step": 25700 }, { "epoch": 0.35420019532593294, "grad_norm": 0.19138947129249573, "learning_rate": 0.0001, "loss": 1.8164, "step": 25750 }, { "epoch": 0.3548879626954979, "grad_norm": 0.18222880363464355, "learning_rate": 0.0001, "loss": 1.8131, "step": 25800 }, { "epoch": 0.35557573006506277, "grad_norm": 0.17819440364837646, "learning_rate": 0.0001, "loss": 1.8147, "step": 25850 }, { "epoch": 0.3562634974346277, "grad_norm": 0.20162558555603027, "learning_rate": 0.0001, "loss": 1.8188, "step": 25900 }, { "epoch": 0.35695126480419265, "grad_norm": 0.17715832591056824, "learning_rate": 0.0001, "loss": 1.813, "step": 25950 }, { "epoch": 0.35763903217375753, "grad_norm": 0.16032275557518005, "learning_rate": 0.0001, "loss": 1.8135, "step": 26000 }, { "epoch": 0.35832679954332247, "grad_norm": 0.17023804783821106, "learning_rate": 0.0001, "loss": 1.8168, "step": 26050 }, { "epoch": 0.3590145669128874, "grad_norm": 0.19815494120121002, "learning_rate": 0.0001, "loss": 1.8123, "step": 26100 }, { "epoch": 0.3597023342824523, "grad_norm": 0.19192709028720856, "learning_rate": 0.0001, "loss": 1.8164, "step": 26150 }, { "epoch": 0.36039010165201724, "grad_norm": 0.18932852149009705, "learning_rate": 0.0001, "loss": 1.813, "step": 26200 }, { "epoch": 0.3610778690215821, "grad_norm": 0.16477489471435547, "learning_rate": 0.0001, "loss": 1.8147, "step": 26250 }, { "epoch": 0.36176563639114706, "grad_norm": 0.19172504544258118, "learning_rate": 0.0001, "loss": 1.814, "step": 26300 }, { "epoch": 0.362453403760712, "grad_norm": 0.19087177515029907, "learning_rate": 0.0001, "loss": 1.8123, "step": 26350 }, { "epoch": 0.3631411711302769, "grad_norm": 0.1714990735054016, "learning_rate": 0.0001, "loss": 1.8133, "step": 26400 }, { "epoch": 0.3638289384998418, "grad_norm": 0.16309858858585358, "learning_rate": 0.0001, "loss": 1.8168, "step": 26450 }, { "epoch": 0.3645167058694067, "grad_norm": 0.1791163831949234, "learning_rate": 0.0001, "loss": 1.818, "step": 26500 }, { "epoch": 0.36520447323897165, "grad_norm": 0.17130139470100403, "learning_rate": 0.0001, "loss": 1.8182, "step": 26550 }, { "epoch": 0.3658922406085366, "grad_norm": 0.17432111501693726, "learning_rate": 0.0001, "loss": 1.8177, "step": 26600 }, { "epoch": 0.3665800079781015, "grad_norm": 0.15398447215557098, "learning_rate": 0.0001, "loss": 1.8161, "step": 26650 }, { "epoch": 0.3672677753476664, "grad_norm": 0.2831607162952423, "learning_rate": 0.0001, "loss": 1.815, "step": 26700 }, { "epoch": 0.3679555427172313, "grad_norm": 0.17564986646175385, "learning_rate": 0.0001, "loss": 1.8129, "step": 26750 }, { "epoch": 0.36864331008679624, "grad_norm": 0.18288859724998474, "learning_rate": 0.0001, "loss": 1.813, "step": 26800 }, { "epoch": 0.3693310774563612, "grad_norm": 0.1621311753988266, "learning_rate": 0.0001, "loss": 1.8069, "step": 26850 }, { "epoch": 0.37001884482592606, "grad_norm": 0.16472625732421875, "learning_rate": 0.0001, "loss": 1.8136, "step": 26900 }, { "epoch": 0.370706612195491, "grad_norm": 0.16450871527194977, "learning_rate": 0.0001, "loss": 1.8149, "step": 26950 }, { "epoch": 0.37139437956505594, "grad_norm": 0.1769149899482727, "learning_rate": 0.0001, "loss": 1.8078, "step": 27000 }, { "epoch": 0.3720821469346208, "grad_norm": 0.1917348951101303, "learning_rate": 0.0001, "loss": 1.8121, "step": 27050 }, { "epoch": 0.37276991430418577, "grad_norm": 0.18277530372142792, "learning_rate": 0.0001, "loss": 1.812, "step": 27100 }, { "epoch": 0.37345768167375065, "grad_norm": 0.1814720183610916, "learning_rate": 0.0001, "loss": 1.8092, "step": 27150 }, { "epoch": 0.3741454490433156, "grad_norm": 0.17358410358428955, "learning_rate": 0.0001, "loss": 1.8118, "step": 27200 }, { "epoch": 0.37483321641288053, "grad_norm": 0.18569444119930267, "learning_rate": 0.0001, "loss": 1.8115, "step": 27250 }, { "epoch": 0.3755209837824454, "grad_norm": 0.15812502801418304, "learning_rate": 0.0001, "loss": 1.813, "step": 27300 }, { "epoch": 0.37620875115201036, "grad_norm": 0.19051866233348846, "learning_rate": 0.0001, "loss": 1.8162, "step": 27350 }, { "epoch": 0.37689651852157524, "grad_norm": 0.1646508276462555, "learning_rate": 0.0001, "loss": 1.8109, "step": 27400 }, { "epoch": 0.3775842858911402, "grad_norm": 0.16069738566875458, "learning_rate": 0.0001, "loss": 1.8088, "step": 27450 }, { "epoch": 0.3782720532607051, "grad_norm": 0.18708954751491547, "learning_rate": 0.0001, "loss": 1.809, "step": 27500 }, { "epoch": 0.37895982063027, "grad_norm": 0.18674279749393463, "learning_rate": 0.0001, "loss": 1.8141, "step": 27550 }, { "epoch": 0.37964758799983495, "grad_norm": 0.17408175766468048, "learning_rate": 0.0001, "loss": 1.8126, "step": 27600 }, { "epoch": 0.38033535536939983, "grad_norm": 0.15924981236457825, "learning_rate": 0.0001, "loss": 1.8122, "step": 27650 }, { "epoch": 0.38102312273896477, "grad_norm": 0.17203688621520996, "learning_rate": 0.0001, "loss": 1.8118, "step": 27700 }, { "epoch": 0.3817108901085297, "grad_norm": 0.18587364256381989, "learning_rate": 0.0001, "loss": 1.8129, "step": 27750 }, { "epoch": 0.3823986574780946, "grad_norm": 0.18941548466682434, "learning_rate": 0.0001, "loss": 1.8099, "step": 27800 }, { "epoch": 0.38308642484765953, "grad_norm": 0.14958040416240692, "learning_rate": 0.0001, "loss": 1.8141, "step": 27850 }, { "epoch": 0.3837741922172244, "grad_norm": 0.17599830031394958, "learning_rate": 0.0001, "loss": 1.8141, "step": 27900 }, { "epoch": 0.38446195958678936, "grad_norm": 0.17611196637153625, "learning_rate": 0.0001, "loss": 1.8114, "step": 27950 }, { "epoch": 0.3851497269563543, "grad_norm": 0.1823156625032425, "learning_rate": 0.0001, "loss": 1.8116, "step": 28000 }, { "epoch": 0.3858374943259192, "grad_norm": 0.17287470400333405, "learning_rate": 0.0001, "loss": 1.812, "step": 28050 }, { "epoch": 0.3865252616954841, "grad_norm": 0.17163801193237305, "learning_rate": 0.0001, "loss": 1.8102, "step": 28100 }, { "epoch": 0.38721302906504906, "grad_norm": 0.16863061487674713, "learning_rate": 0.0001, "loss": 1.8085, "step": 28150 }, { "epoch": 0.38790079643461395, "grad_norm": 0.1910269409418106, "learning_rate": 0.0001, "loss": 1.8128, "step": 28200 }, { "epoch": 0.3885885638041789, "grad_norm": 0.16055557131767273, "learning_rate": 0.0001, "loss": 1.8122, "step": 28250 }, { "epoch": 0.3892763311737438, "grad_norm": 0.17268548905849457, "learning_rate": 0.0001, "loss": 1.8084, "step": 28300 }, { "epoch": 0.3899640985433087, "grad_norm": 0.16962352395057678, "learning_rate": 0.0001, "loss": 1.8131, "step": 28350 }, { "epoch": 0.39065186591287365, "grad_norm": 0.1744450330734253, "learning_rate": 0.0001, "loss": 1.811, "step": 28400 }, { "epoch": 0.39133963328243854, "grad_norm": 0.17569154500961304, "learning_rate": 0.0001, "loss": 1.8165, "step": 28450 }, { "epoch": 0.3920274006520035, "grad_norm": 0.17034880816936493, "learning_rate": 0.0001, "loss": 1.8125, "step": 28500 }, { "epoch": 0.39271516802156836, "grad_norm": 0.16873665153980255, "learning_rate": 0.0001, "loss": 1.8124, "step": 28550 }, { "epoch": 0.3934029353911333, "grad_norm": 0.1771818846464157, "learning_rate": 0.0001, "loss": 1.8132, "step": 28600 }, { "epoch": 0.39409070276069824, "grad_norm": 0.17641928791999817, "learning_rate": 0.0001, "loss": 1.8131, "step": 28650 }, { "epoch": 0.3947784701302631, "grad_norm": 0.16521941125392914, "learning_rate": 0.0001, "loss": 1.8082, "step": 28700 }, { "epoch": 0.39546623749982807, "grad_norm": 0.17453192174434662, "learning_rate": 0.0001, "loss": 1.813, "step": 28750 }, { "epoch": 0.39615400486939295, "grad_norm": 0.17454297840595245, "learning_rate": 0.0001, "loss": 1.8129, "step": 28800 }, { "epoch": 0.3968417722389579, "grad_norm": 0.155872642993927, "learning_rate": 0.0001, "loss": 1.8116, "step": 28850 }, { "epoch": 0.39752953960852283, "grad_norm": 0.17079751193523407, "learning_rate": 0.0001, "loss": 1.8097, "step": 28900 }, { "epoch": 0.3982173069780877, "grad_norm": 0.1715528666973114, "learning_rate": 0.0001, "loss": 1.8082, "step": 28950 }, { "epoch": 0.39890507434765266, "grad_norm": 0.17352135479450226, "learning_rate": 0.0001, "loss": 1.8058, "step": 29000 }, { "epoch": 0.3995928417172176, "grad_norm": 0.17056448757648468, "learning_rate": 0.0001, "loss": 1.8093, "step": 29050 }, { "epoch": 0.4002806090867825, "grad_norm": 0.16389931738376617, "learning_rate": 0.0001, "loss": 1.8079, "step": 29100 }, { "epoch": 0.4009683764563474, "grad_norm": 0.17660637199878693, "learning_rate": 0.0001, "loss": 1.8101, "step": 29150 }, { "epoch": 0.4016561438259123, "grad_norm": 0.1871548742055893, "learning_rate": 0.0001, "loss": 1.8124, "step": 29200 }, { "epoch": 0.40234391119547724, "grad_norm": 0.17292185127735138, "learning_rate": 0.0001, "loss": 1.8074, "step": 29250 }, { "epoch": 0.4030316785650422, "grad_norm": 0.16299203038215637, "learning_rate": 0.0001, "loss": 1.81, "step": 29300 }, { "epoch": 0.40371944593460707, "grad_norm": 0.20287854969501495, "learning_rate": 0.0001, "loss": 1.8141, "step": 29350 }, { "epoch": 0.404407213304172, "grad_norm": 0.1632193922996521, "learning_rate": 0.0001, "loss": 1.8102, "step": 29400 }, { "epoch": 0.4050949806737369, "grad_norm": 0.16991235315799713, "learning_rate": 0.0001, "loss": 1.8084, "step": 29450 }, { "epoch": 0.40578274804330183, "grad_norm": 0.17448389530181885, "learning_rate": 0.0001, "loss": 1.8108, "step": 29500 }, { "epoch": 0.4064705154128668, "grad_norm": 0.1706276535987854, "learning_rate": 0.0001, "loss": 1.8091, "step": 29550 }, { "epoch": 0.40715828278243166, "grad_norm": 0.187569260597229, "learning_rate": 0.0001, "loss": 1.8077, "step": 29600 }, { "epoch": 0.4078460501519966, "grad_norm": 0.18289169669151306, "learning_rate": 0.0001, "loss": 1.8062, "step": 29650 }, { "epoch": 0.4085338175215615, "grad_norm": 0.17096656560897827, "learning_rate": 0.0001, "loss": 1.8117, "step": 29700 }, { "epoch": 0.4092215848911264, "grad_norm": 0.18183813989162445, "learning_rate": 0.0001, "loss": 1.8108, "step": 29750 }, { "epoch": 0.40990935226069136, "grad_norm": 0.18215380609035492, "learning_rate": 0.0001, "loss": 1.8113, "step": 29800 }, { "epoch": 0.41059711963025625, "grad_norm": 0.19367296993732452, "learning_rate": 0.0001, "loss": 1.8111, "step": 29850 }, { "epoch": 0.4112848869998212, "grad_norm": 0.18118008971214294, "learning_rate": 0.0001, "loss": 1.81, "step": 29900 }, { "epoch": 0.4119726543693861, "grad_norm": 0.16475409269332886, "learning_rate": 0.0001, "loss": 1.8095, "step": 29950 }, { "epoch": 0.412660421738951, "grad_norm": 0.19968119263648987, "learning_rate": 0.0001, "loss": 1.8078, "step": 30000 }, { "epoch": 0.41334818910851595, "grad_norm": 0.2024579495191574, "learning_rate": 0.0001, "loss": 1.8097, "step": 30050 }, { "epoch": 0.41403595647808084, "grad_norm": 0.1678769886493683, "learning_rate": 0.0001, "loss": 1.8099, "step": 30100 }, { "epoch": 0.4147237238476458, "grad_norm": 0.19947120547294617, "learning_rate": 0.0001, "loss": 1.8124, "step": 30150 }, { "epoch": 0.4154114912172107, "grad_norm": 0.1908283233642578, "learning_rate": 0.0001, "loss": 1.8094, "step": 30200 }, { "epoch": 0.4160992585867756, "grad_norm": 0.16802892088890076, "learning_rate": 0.0001, "loss": 1.8029, "step": 30250 }, { "epoch": 0.41678702595634054, "grad_norm": 0.1601232886314392, "learning_rate": 0.0001, "loss": 1.8078, "step": 30300 }, { "epoch": 0.4174747933259054, "grad_norm": 0.16903936862945557, "learning_rate": 0.0001, "loss": 1.811, "step": 30350 }, { "epoch": 0.41816256069547036, "grad_norm": 0.17131748795509338, "learning_rate": 0.0001, "loss": 1.8057, "step": 30400 }, { "epoch": 0.4188503280650353, "grad_norm": 0.17509245872497559, "learning_rate": 0.0001, "loss": 1.8109, "step": 30450 }, { "epoch": 0.4195380954346002, "grad_norm": 0.17135483026504517, "learning_rate": 0.0001, "loss": 1.8086, "step": 30500 }, { "epoch": 0.42022586280416513, "grad_norm": 0.1780470460653305, "learning_rate": 0.0001, "loss": 1.8054, "step": 30550 }, { "epoch": 0.42091363017373, "grad_norm": 0.16642825305461884, "learning_rate": 0.0001, "loss": 1.8101, "step": 30600 }, { "epoch": 0.42160139754329495, "grad_norm": 0.17237281799316406, "learning_rate": 0.0001, "loss": 1.8131, "step": 30650 }, { "epoch": 0.4222891649128599, "grad_norm": 0.1773928999900818, "learning_rate": 0.0001, "loss": 1.807, "step": 30700 }, { "epoch": 0.4229769322824248, "grad_norm": 0.15655359625816345, "learning_rate": 0.0001, "loss": 1.8102, "step": 30750 }, { "epoch": 0.4236646996519897, "grad_norm": 0.18366913497447968, "learning_rate": 0.0001, "loss": 1.8045, "step": 30800 }, { "epoch": 0.4243524670215546, "grad_norm": 0.15379434823989868, "learning_rate": 0.0001, "loss": 1.808, "step": 30850 }, { "epoch": 0.42504023439111954, "grad_norm": 0.17815300822257996, "learning_rate": 0.0001, "loss": 1.8043, "step": 30900 }, { "epoch": 0.4257280017606845, "grad_norm": 0.17477139830589294, "learning_rate": 0.0001, "loss": 1.8106, "step": 30950 }, { "epoch": 0.42641576913024937, "grad_norm": 0.18266303837299347, "learning_rate": 0.0001, "loss": 1.8089, "step": 31000 }, { "epoch": 0.4271035364998143, "grad_norm": 0.17377638816833496, "learning_rate": 0.0001, "loss": 1.808, "step": 31050 }, { "epoch": 0.42779130386937925, "grad_norm": 0.16105225682258606, "learning_rate": 0.0001, "loss": 1.8058, "step": 31100 }, { "epoch": 0.42847907123894413, "grad_norm": 0.16976149380207062, "learning_rate": 0.0001, "loss": 1.8108, "step": 31150 }, { "epoch": 0.42916683860850907, "grad_norm": 0.1994379609823227, "learning_rate": 0.0001, "loss": 1.8103, "step": 31200 }, { "epoch": 0.42985460597807396, "grad_norm": 0.1827680766582489, "learning_rate": 0.0001, "loss": 1.8044, "step": 31250 }, { "epoch": 0.4305423733476389, "grad_norm": 0.17883870005607605, "learning_rate": 0.0001, "loss": 1.8067, "step": 31300 }, { "epoch": 0.43123014071720384, "grad_norm": 0.1809430867433548, "learning_rate": 0.0001, "loss": 1.8105, "step": 31350 }, { "epoch": 0.4319179080867687, "grad_norm": 0.15287983417510986, "learning_rate": 0.0001, "loss": 1.8032, "step": 31400 }, { "epoch": 0.43260567545633366, "grad_norm": 0.1845768690109253, "learning_rate": 0.0001, "loss": 1.8044, "step": 31450 }, { "epoch": 0.43329344282589854, "grad_norm": 0.15448009967803955, "learning_rate": 0.0001, "loss": 1.8074, "step": 31500 }, { "epoch": 0.4339812101954635, "grad_norm": 0.16838380694389343, "learning_rate": 0.0001, "loss": 1.806, "step": 31550 }, { "epoch": 0.4346689775650284, "grad_norm": 0.16129769384860992, "learning_rate": 0.0001, "loss": 1.8103, "step": 31600 }, { "epoch": 0.4353567449345933, "grad_norm": 0.16702227294445038, "learning_rate": 0.0001, "loss": 1.81, "step": 31650 }, { "epoch": 0.43604451230415825, "grad_norm": 0.1646498441696167, "learning_rate": 0.0001, "loss": 1.806, "step": 31700 }, { "epoch": 0.43673227967372313, "grad_norm": 0.1929212510585785, "learning_rate": 0.0001, "loss": 1.8073, "step": 31750 }, { "epoch": 0.4374200470432881, "grad_norm": 0.1728442758321762, "learning_rate": 0.0001, "loss": 1.8062, "step": 31800 }, { "epoch": 0.438107814412853, "grad_norm": 0.15660201013088226, "learning_rate": 0.0001, "loss": 1.8025, "step": 31850 }, { "epoch": 0.4387955817824179, "grad_norm": 0.1685377061367035, "learning_rate": 0.0001, "loss": 1.8079, "step": 31900 }, { "epoch": 0.43948334915198284, "grad_norm": 0.18124371767044067, "learning_rate": 0.0001, "loss": 1.8042, "step": 31950 }, { "epoch": 0.4401711165215478, "grad_norm": 0.18348287045955658, "learning_rate": 0.0001, "loss": 1.809, "step": 32000 }, { "epoch": 0.44085888389111266, "grad_norm": 0.17936021089553833, "learning_rate": 0.0001, "loss": 1.8052, "step": 32050 }, { "epoch": 0.4415466512606776, "grad_norm": 0.17418572306632996, "learning_rate": 0.0001, "loss": 1.8075, "step": 32100 }, { "epoch": 0.4422344186302425, "grad_norm": 0.16956304013729095, "learning_rate": 0.0001, "loss": 1.8077, "step": 32150 }, { "epoch": 0.4429221859998074, "grad_norm": 0.18142879009246826, "learning_rate": 0.0001, "loss": 1.8053, "step": 32200 }, { "epoch": 0.44360995336937237, "grad_norm": 0.17536590993404388, "learning_rate": 0.0001, "loss": 1.8055, "step": 32250 }, { "epoch": 0.44429772073893725, "grad_norm": 0.18276521563529968, "learning_rate": 0.0001, "loss": 1.8052, "step": 32300 }, { "epoch": 0.4449854881085022, "grad_norm": 0.15810468792915344, "learning_rate": 0.0001, "loss": 1.8012, "step": 32350 }, { "epoch": 0.4456732554780671, "grad_norm": 0.17224664986133575, "learning_rate": 0.0001, "loss": 1.8077, "step": 32400 }, { "epoch": 0.446361022847632, "grad_norm": 0.17988136410713196, "learning_rate": 0.0001, "loss": 1.806, "step": 32450 }, { "epoch": 0.44704879021719696, "grad_norm": 0.16269569098949432, "learning_rate": 0.0001, "loss": 1.8072, "step": 32500 }, { "epoch": 0.44773655758676184, "grad_norm": 0.1897774487733841, "learning_rate": 0.0001, "loss": 1.8034, "step": 32550 }, { "epoch": 0.4484243249563268, "grad_norm": 0.17675265669822693, "learning_rate": 0.0001, "loss": 1.8053, "step": 32600 }, { "epoch": 0.44911209232589167, "grad_norm": 0.1847987174987793, "learning_rate": 0.0001, "loss": 1.8065, "step": 32650 }, { "epoch": 0.4497998596954566, "grad_norm": 0.16706308722496033, "learning_rate": 0.0001, "loss": 1.8072, "step": 32700 }, { "epoch": 0.45048762706502155, "grad_norm": 0.19702313840389252, "learning_rate": 0.0001, "loss": 1.8092, "step": 32750 }, { "epoch": 0.45117539443458643, "grad_norm": 0.17378373444080353, "learning_rate": 0.0001, "loss": 1.8069, "step": 32800 }, { "epoch": 0.45186316180415137, "grad_norm": 0.15358635783195496, "learning_rate": 0.0001, "loss": 1.8042, "step": 32850 }, { "epoch": 0.4525509291737163, "grad_norm": 0.16188420355319977, "learning_rate": 0.0001, "loss": 1.8046, "step": 32900 }, { "epoch": 0.4532386965432812, "grad_norm": 0.15988096594810486, "learning_rate": 0.0001, "loss": 1.8048, "step": 32950 }, { "epoch": 0.45392646391284613, "grad_norm": 0.17328138649463654, "learning_rate": 0.0001, "loss": 1.8031, "step": 33000 }, { "epoch": 0.454614231282411, "grad_norm": 0.18192242085933685, "learning_rate": 0.0001, "loss": 1.8015, "step": 33050 }, { "epoch": 0.45530199865197596, "grad_norm": 0.18269090354442596, "learning_rate": 0.0001, "loss": 1.8086, "step": 33100 }, { "epoch": 0.4559897660215409, "grad_norm": 0.1573922038078308, "learning_rate": 0.0001, "loss": 1.8054, "step": 33150 }, { "epoch": 0.4566775333911058, "grad_norm": 0.20478671789169312, "learning_rate": 0.0001, "loss": 1.8047, "step": 33200 }, { "epoch": 0.4573653007606707, "grad_norm": 0.17149974405765533, "learning_rate": 0.0001, "loss": 1.8045, "step": 33250 }, { "epoch": 0.4580530681302356, "grad_norm": 0.1575038731098175, "learning_rate": 0.0001, "loss": 1.8008, "step": 33300 }, { "epoch": 0.45874083549980055, "grad_norm": 0.1684975028038025, "learning_rate": 0.0001, "loss": 1.8036, "step": 33350 }, { "epoch": 0.4594286028693655, "grad_norm": 0.17977888882160187, "learning_rate": 0.0001, "loss": 1.8058, "step": 33400 }, { "epoch": 0.46011637023893037, "grad_norm": 0.1595628559589386, "learning_rate": 0.0001, "loss": 1.8052, "step": 33450 }, { "epoch": 0.4608041376084953, "grad_norm": 0.17325359582901, "learning_rate": 0.0001, "loss": 1.8036, "step": 33500 }, { "epoch": 0.4614919049780602, "grad_norm": 0.1705903857946396, "learning_rate": 0.0001, "loss": 1.8048, "step": 33550 }, { "epoch": 0.46217967234762514, "grad_norm": 0.1714329570531845, "learning_rate": 0.0001, "loss": 1.8042, "step": 33600 }, { "epoch": 0.4628674397171901, "grad_norm": 0.17674137651920319, "learning_rate": 0.0001, "loss": 1.8067, "step": 33650 }, { "epoch": 0.46355520708675496, "grad_norm": 0.1605982631444931, "learning_rate": 0.0001, "loss": 1.8059, "step": 33700 }, { "epoch": 0.4642429744563199, "grad_norm": 0.17221522331237793, "learning_rate": 0.0001, "loss": 1.8025, "step": 33750 }, { "epoch": 0.4649307418258848, "grad_norm": 0.17648015916347504, "learning_rate": 0.0001, "loss": 1.8065, "step": 33800 }, { "epoch": 0.4656185091954497, "grad_norm": 0.16860069334506989, "learning_rate": 0.0001, "loss": 1.8064, "step": 33850 }, { "epoch": 0.46630627656501467, "grad_norm": 0.19352567195892334, "learning_rate": 0.0001, "loss": 1.8043, "step": 33900 }, { "epoch": 0.46699404393457955, "grad_norm": 0.1634499430656433, "learning_rate": 0.0001, "loss": 1.7994, "step": 33950 }, { "epoch": 0.4676818113041445, "grad_norm": 0.1790640950202942, "learning_rate": 0.0001, "loss": 1.8075, "step": 34000 }, { "epoch": 0.46836957867370943, "grad_norm": 0.16731584072113037, "learning_rate": 0.0001, "loss": 1.8022, "step": 34050 }, { "epoch": 0.4690573460432743, "grad_norm": 0.17351976037025452, "learning_rate": 0.0001, "loss": 1.8066, "step": 34100 }, { "epoch": 0.46974511341283925, "grad_norm": 0.18717612326145172, "learning_rate": 0.0001, "loss": 1.8048, "step": 34150 }, { "epoch": 0.47043288078240414, "grad_norm": 0.18829597532749176, "learning_rate": 0.0001, "loss": 1.8018, "step": 34200 }, { "epoch": 0.4711206481519691, "grad_norm": 0.16731028258800507, "learning_rate": 0.0001, "loss": 1.806, "step": 34250 }, { "epoch": 0.471808415521534, "grad_norm": 0.17419900000095367, "learning_rate": 0.0001, "loss": 1.8009, "step": 34300 }, { "epoch": 0.4724961828910989, "grad_norm": 0.16232840716838837, "learning_rate": 0.0001, "loss": 1.8048, "step": 34350 }, { "epoch": 0.47318395026066384, "grad_norm": 0.1557988077402115, "learning_rate": 0.0001, "loss": 1.8021, "step": 34400 }, { "epoch": 0.47387171763022873, "grad_norm": 0.18441712856292725, "learning_rate": 0.0001, "loss": 1.8066, "step": 34450 }, { "epoch": 0.47455948499979367, "grad_norm": 0.1681167036294937, "learning_rate": 0.0001, "loss": 1.8035, "step": 34500 }, { "epoch": 0.4752472523693586, "grad_norm": 0.1694021373987198, "learning_rate": 0.0001, "loss": 1.8056, "step": 34550 }, { "epoch": 0.4759350197389235, "grad_norm": 0.16909408569335938, "learning_rate": 0.0001, "loss": 1.8012, "step": 34600 }, { "epoch": 0.47662278710848843, "grad_norm": 0.18573597073554993, "learning_rate": 0.0001, "loss": 1.8025, "step": 34650 }, { "epoch": 0.4773105544780533, "grad_norm": 0.1591121107339859, "learning_rate": 0.0001, "loss": 1.8022, "step": 34700 }, { "epoch": 0.47799832184761826, "grad_norm": 0.16243012249469757, "learning_rate": 0.0001, "loss": 1.809, "step": 34750 }, { "epoch": 0.4786860892171832, "grad_norm": 0.1876152753829956, "learning_rate": 0.0001, "loss": 1.8043, "step": 34800 }, { "epoch": 0.4793738565867481, "grad_norm": 0.160101518034935, "learning_rate": 0.0001, "loss": 1.8041, "step": 34850 }, { "epoch": 0.480061623956313, "grad_norm": 0.17508384585380554, "learning_rate": 0.0001, "loss": 1.8025, "step": 34900 }, { "epoch": 0.48074939132587796, "grad_norm": 0.16169220209121704, "learning_rate": 0.0001, "loss": 1.8045, "step": 34950 }, { "epoch": 0.48143715869544285, "grad_norm": 0.17065638303756714, "learning_rate": 0.0001, "loss": 1.8046, "step": 35000 }, { "epoch": 0.4821249260650078, "grad_norm": 0.16137543320655823, "learning_rate": 0.0001, "loss": 1.8006, "step": 35050 }, { "epoch": 0.48281269343457267, "grad_norm": 0.1716589331626892, "learning_rate": 0.0001, "loss": 1.8065, "step": 35100 }, { "epoch": 0.4835004608041376, "grad_norm": 0.16750770807266235, "learning_rate": 0.0001, "loss": 1.8069, "step": 35150 }, { "epoch": 0.48418822817370255, "grad_norm": 0.1668424755334854, "learning_rate": 0.0001, "loss": 1.8045, "step": 35200 }, { "epoch": 0.48487599554326744, "grad_norm": 0.1577017605304718, "learning_rate": 0.0001, "loss": 1.8015, "step": 35250 }, { "epoch": 0.4855637629128324, "grad_norm": 0.16916392743587494, "learning_rate": 0.0001, "loss": 1.8012, "step": 35300 }, { "epoch": 0.48625153028239726, "grad_norm": 0.16878165304660797, "learning_rate": 0.0001, "loss": 1.8049, "step": 35350 }, { "epoch": 0.4869392976519622, "grad_norm": 0.1834115982055664, "learning_rate": 0.0001, "loss": 1.8024, "step": 35400 }, { "epoch": 0.48762706502152714, "grad_norm": 0.16310469806194305, "learning_rate": 0.0001, "loss": 1.8043, "step": 35450 }, { "epoch": 0.488314832391092, "grad_norm": 0.17430266737937927, "learning_rate": 0.0001, "loss": 1.8017, "step": 35500 }, { "epoch": 0.48900259976065696, "grad_norm": 0.20293480157852173, "learning_rate": 0.0001, "loss": 1.8022, "step": 35550 }, { "epoch": 0.48969036713022185, "grad_norm": 0.16140292584896088, "learning_rate": 0.0001, "loss": 1.8012, "step": 35600 }, { "epoch": 0.4903781344997868, "grad_norm": 0.15472573041915894, "learning_rate": 0.0001, "loss": 1.804, "step": 35650 }, { "epoch": 0.49106590186935173, "grad_norm": 0.19431902468204498, "learning_rate": 0.0001, "loss": 1.8013, "step": 35700 }, { "epoch": 0.4917536692389166, "grad_norm": 0.1693229377269745, "learning_rate": 0.0001, "loss": 1.8004, "step": 35750 }, { "epoch": 0.49244143660848155, "grad_norm": 0.19499187171459198, "learning_rate": 0.0001, "loss": 1.8035, "step": 35800 }, { "epoch": 0.4931292039780465, "grad_norm": 0.16046124696731567, "learning_rate": 0.0001, "loss": 1.8062, "step": 35850 }, { "epoch": 0.4938169713476114, "grad_norm": 0.17743340134620667, "learning_rate": 0.0001, "loss": 1.8038, "step": 35900 }, { "epoch": 0.4945047387171763, "grad_norm": 0.20568375289440155, "learning_rate": 0.0001, "loss": 1.8039, "step": 35950 }, { "epoch": 0.4951925060867412, "grad_norm": 0.1706654578447342, "learning_rate": 0.0001, "loss": 1.8022, "step": 36000 }, { "epoch": 0.49588027345630614, "grad_norm": 0.17956335842609406, "learning_rate": 0.0001, "loss": 1.8038, "step": 36050 }, { "epoch": 0.4965680408258711, "grad_norm": 0.1683945506811142, "learning_rate": 0.0001, "loss": 1.801, "step": 36100 }, { "epoch": 0.49725580819543597, "grad_norm": 0.16132575273513794, "learning_rate": 0.0001, "loss": 1.7994, "step": 36150 }, { "epoch": 0.4979435755650009, "grad_norm": 0.15439482033252716, "learning_rate": 0.0001, "loss": 1.798, "step": 36200 }, { "epoch": 0.4986313429345658, "grad_norm": 0.17427167296409607, "learning_rate": 0.0001, "loss": 1.803, "step": 36250 }, { "epoch": 0.49931911030413073, "grad_norm": 0.1826677918434143, "learning_rate": 0.0001, "loss": 1.8041, "step": 36300 }, { "epoch": 0.5000068776736957, "grad_norm": 0.1664198338985443, "learning_rate": 0.0001, "loss": 1.8007, "step": 36350 }, { "epoch": 0.5006946450432606, "grad_norm": 0.19743186235427856, "learning_rate": 0.0001, "loss": 1.8009, "step": 36400 }, { "epoch": 0.5013824124128254, "grad_norm": 0.17416580021381378, "learning_rate": 0.0001, "loss": 1.8033, "step": 36450 }, { "epoch": 0.5020701797823904, "grad_norm": 0.16447678208351135, "learning_rate": 0.0001, "loss": 1.8027, "step": 36500 }, { "epoch": 0.5027579471519553, "grad_norm": 0.19978569447994232, "learning_rate": 0.0001, "loss": 1.8027, "step": 36550 }, { "epoch": 0.5034457145215202, "grad_norm": 0.1768701672554016, "learning_rate": 0.0001, "loss": 1.8018, "step": 36600 }, { "epoch": 0.5041334818910852, "grad_norm": 0.17458416521549225, "learning_rate": 0.0001, "loss": 1.8031, "step": 36650 }, { "epoch": 0.5048212492606501, "grad_norm": 0.15409618616104126, "learning_rate": 0.0001, "loss": 1.8043, "step": 36700 }, { "epoch": 0.505509016630215, "grad_norm": 0.20529916882514954, "learning_rate": 0.0001, "loss": 1.8026, "step": 36750 }, { "epoch": 0.50619678399978, "grad_norm": 0.1579432338476181, "learning_rate": 0.0001, "loss": 1.806, "step": 36800 }, { "epoch": 0.5068845513693448, "grad_norm": 0.16803112626075745, "learning_rate": 0.0001, "loss": 1.8027, "step": 36850 }, { "epoch": 0.5075723187389097, "grad_norm": 0.19382716715335846, "learning_rate": 0.0001, "loss": 1.8029, "step": 36900 }, { "epoch": 0.5082600861084746, "grad_norm": 0.17823243141174316, "learning_rate": 0.0001, "loss": 1.8035, "step": 36950 }, { "epoch": 0.5089478534780396, "grad_norm": 0.1742970496416092, "learning_rate": 0.0001, "loss": 1.8033, "step": 37000 }, { "epoch": 0.5096356208476045, "grad_norm": 0.17236186563968658, "learning_rate": 0.0001, "loss": 1.8062, "step": 37050 }, { "epoch": 0.5103233882171694, "grad_norm": 0.1705719381570816, "learning_rate": 0.0001, "loss": 1.8052, "step": 37100 }, { "epoch": 0.5110111555867344, "grad_norm": 0.19941222667694092, "learning_rate": 0.0001, "loss": 1.8011, "step": 37150 }, { "epoch": 0.5116989229562993, "grad_norm": 0.16263477504253387, "learning_rate": 0.0001, "loss": 1.8026, "step": 37200 }, { "epoch": 0.5123866903258641, "grad_norm": 0.15199637413024902, "learning_rate": 0.0001, "loss": 1.8001, "step": 37250 }, { "epoch": 0.5130744576954291, "grad_norm": 0.16797873377799988, "learning_rate": 0.0001, "loss": 1.8016, "step": 37300 }, { "epoch": 0.513762225064994, "grad_norm": 0.16336190700531006, "learning_rate": 0.0001, "loss": 1.798, "step": 37350 }, { "epoch": 0.5144499924345589, "grad_norm": 0.16497831046581268, "learning_rate": 0.0001, "loss": 1.8001, "step": 37400 }, { "epoch": 0.5151377598041239, "grad_norm": 0.1712917536497116, "learning_rate": 0.0001, "loss": 1.804, "step": 37450 }, { "epoch": 0.5158255271736888, "grad_norm": 0.16597513854503632, "learning_rate": 0.0001, "loss": 1.8019, "step": 37500 }, { "epoch": 0.5165132945432537, "grad_norm": 0.15661810338497162, "learning_rate": 0.0001, "loss": 1.8043, "step": 37550 }, { "epoch": 0.5172010619128186, "grad_norm": 0.17713536322116852, "learning_rate": 0.0001, "loss": 1.7973, "step": 37600 }, { "epoch": 0.5178888292823836, "grad_norm": 0.15873874723911285, "learning_rate": 0.0001, "loss": 1.8003, "step": 37650 }, { "epoch": 0.5185765966519484, "grad_norm": 0.1784040331840515, "learning_rate": 0.0001, "loss": 1.798, "step": 37700 }, { "epoch": 0.5192643640215133, "grad_norm": 0.16135090589523315, "learning_rate": 0.0001, "loss": 1.8082, "step": 37750 }, { "epoch": 0.5199521313910783, "grad_norm": 0.15565833449363708, "learning_rate": 0.0001, "loss": 1.8006, "step": 37800 }, { "epoch": 0.5206398987606432, "grad_norm": 0.1711311787366867, "learning_rate": 0.0001, "loss": 1.7975, "step": 37850 }, { "epoch": 0.5213276661302081, "grad_norm": 0.17314565181732178, "learning_rate": 0.0001, "loss": 1.7997, "step": 37900 }, { "epoch": 0.5220154334997731, "grad_norm": 0.1723901331424713, "learning_rate": 0.0001, "loss": 1.8006, "step": 37950 }, { "epoch": 0.522703200869338, "grad_norm": 0.15868623554706573, "learning_rate": 0.0001, "loss": 1.8013, "step": 38000 }, { "epoch": 0.5233909682389029, "grad_norm": 0.17163942754268646, "learning_rate": 0.0001, "loss": 1.7991, "step": 38050 }, { "epoch": 0.5240787356084677, "grad_norm": 0.17622709274291992, "learning_rate": 0.0001, "loss": 1.8027, "step": 38100 }, { "epoch": 0.5247665029780327, "grad_norm": 0.1616000235080719, "learning_rate": 0.0001, "loss": 1.7993, "step": 38150 }, { "epoch": 0.5254542703475976, "grad_norm": 0.1638936698436737, "learning_rate": 0.0001, "loss": 1.7978, "step": 38200 }, { "epoch": 0.5261420377171625, "grad_norm": 0.1706729531288147, "learning_rate": 0.0001, "loss": 1.7999, "step": 38250 }, { "epoch": 0.5268298050867275, "grad_norm": 0.2048814296722412, "learning_rate": 0.0001, "loss": 1.7987, "step": 38300 }, { "epoch": 0.5275175724562924, "grad_norm": 0.15826106071472168, "learning_rate": 0.0001, "loss": 1.8022, "step": 38350 }, { "epoch": 0.5282053398258573, "grad_norm": 0.16068226099014282, "learning_rate": 0.0001, "loss": 1.8032, "step": 38400 }, { "epoch": 0.5288931071954223, "grad_norm": 0.17855240404605865, "learning_rate": 0.0001, "loss": 1.7994, "step": 38450 }, { "epoch": 0.5295808745649871, "grad_norm": 0.16978466510772705, "learning_rate": 0.0001, "loss": 1.8022, "step": 38500 }, { "epoch": 0.530268641934552, "grad_norm": 0.1745109260082245, "learning_rate": 0.0001, "loss": 1.8008, "step": 38550 }, { "epoch": 0.530956409304117, "grad_norm": 0.1952807605266571, "learning_rate": 0.0001, "loss": 1.7977, "step": 38600 }, { "epoch": 0.5316441766736819, "grad_norm": 0.1846735179424286, "learning_rate": 0.0001, "loss": 1.8033, "step": 38650 }, { "epoch": 0.5323319440432468, "grad_norm": 0.17474836111068726, "learning_rate": 0.0001, "loss": 1.8034, "step": 38700 }, { "epoch": 0.5330197114128117, "grad_norm": 0.1729106903076172, "learning_rate": 0.0001, "loss": 1.8043, "step": 38750 }, { "epoch": 0.5337074787823767, "grad_norm": 0.18584811687469482, "learning_rate": 0.0001, "loss": 1.805, "step": 38800 }, { "epoch": 0.5343952461519416, "grad_norm": 0.15596157312393188, "learning_rate": 0.0001, "loss": 1.8014, "step": 38850 }, { "epoch": 0.5350830135215064, "grad_norm": 0.15528340637683868, "learning_rate": 0.0001, "loss": 1.7969, "step": 38900 }, { "epoch": 0.5357707808910714, "grad_norm": 0.1738685965538025, "learning_rate": 0.0001, "loss": 1.8003, "step": 38950 }, { "epoch": 0.5364585482606363, "grad_norm": 0.1620347946882248, "learning_rate": 0.0001, "loss": 1.796, "step": 39000 }, { "epoch": 0.5371463156302012, "grad_norm": 0.1705981343984604, "learning_rate": 0.0001, "loss": 1.8008, "step": 39050 }, { "epoch": 0.5378340829997662, "grad_norm": 0.16167068481445312, "learning_rate": 0.0001, "loss": 1.8037, "step": 39100 }, { "epoch": 0.5385218503693311, "grad_norm": 0.15977101027965546, "learning_rate": 0.0001, "loss": 1.8043, "step": 39150 }, { "epoch": 0.539209617738896, "grad_norm": 0.1699797809123993, "learning_rate": 0.0001, "loss": 1.8025, "step": 39200 }, { "epoch": 0.5398973851084609, "grad_norm": 0.17108047008514404, "learning_rate": 0.0001, "loss": 1.7999, "step": 39250 }, { "epoch": 0.5405851524780259, "grad_norm": 0.1756991147994995, "learning_rate": 0.0001, "loss": 1.8001, "step": 39300 }, { "epoch": 0.5412729198475907, "grad_norm": 0.1716366708278656, "learning_rate": 0.0001, "loss": 1.7987, "step": 39350 }, { "epoch": 0.5419606872171556, "grad_norm": 0.16876575350761414, "learning_rate": 0.0001, "loss": 1.8013, "step": 39400 }, { "epoch": 0.5426484545867206, "grad_norm": 0.1650577336549759, "learning_rate": 0.0001, "loss": 1.8001, "step": 39450 }, { "epoch": 0.5433362219562855, "grad_norm": 0.17242754995822906, "learning_rate": 0.0001, "loss": 1.8006, "step": 39500 }, { "epoch": 0.5440239893258504, "grad_norm": 0.16941705346107483, "learning_rate": 0.0001, "loss": 1.7995, "step": 39550 }, { "epoch": 0.5447117566954154, "grad_norm": 0.21036018431186676, "learning_rate": 0.0001, "loss": 1.802, "step": 39600 }, { "epoch": 0.5453995240649803, "grad_norm": 0.16824571788311005, "learning_rate": 0.0001, "loss": 1.7992, "step": 39650 }, { "epoch": 0.5460872914345452, "grad_norm": 0.162497416138649, "learning_rate": 0.0001, "loss": 1.7978, "step": 39700 }, { "epoch": 0.5467750588041101, "grad_norm": 0.18297506868839264, "learning_rate": 0.0001, "loss": 1.7968, "step": 39750 }, { "epoch": 0.547462826173675, "grad_norm": 0.15444135665893555, "learning_rate": 0.0001, "loss": 1.7942, "step": 39800 }, { "epoch": 0.5481505935432399, "grad_norm": 0.17254306375980377, "learning_rate": 0.0001, "loss": 1.797, "step": 39850 }, { "epoch": 0.5488383609128048, "grad_norm": 0.18030798435211182, "learning_rate": 0.0001, "loss": 1.8008, "step": 39900 }, { "epoch": 0.5495261282823698, "grad_norm": 0.18069452047348022, "learning_rate": 0.0001, "loss": 1.7988, "step": 39950 }, { "epoch": 0.5502138956519347, "grad_norm": 0.16256502270698547, "learning_rate": 0.0001, "loss": 1.8019, "step": 40000 }, { "epoch": 0.5509016630214996, "grad_norm": 0.16416381299495697, "learning_rate": 0.0001, "loss": 1.7976, "step": 40050 }, { "epoch": 0.5515894303910646, "grad_norm": 0.1743890941143036, "learning_rate": 0.0001, "loss": 1.7966, "step": 40100 }, { "epoch": 0.5522771977606294, "grad_norm": 0.1875494122505188, "learning_rate": 0.0001, "loss": 1.799, "step": 40150 }, { "epoch": 0.5529649651301943, "grad_norm": 0.18323060870170593, "learning_rate": 0.0001, "loss": 1.7968, "step": 40200 }, { "epoch": 0.5536527324997593, "grad_norm": 0.1552455574274063, "learning_rate": 0.0001, "loss": 1.799, "step": 40250 }, { "epoch": 0.5543404998693242, "grad_norm": 0.1685846745967865, "learning_rate": 0.0001, "loss": 1.7989, "step": 40300 }, { "epoch": 0.5550282672388891, "grad_norm": 0.16371703147888184, "learning_rate": 0.0001, "loss": 1.7943, "step": 40350 }, { "epoch": 0.5557160346084541, "grad_norm": 0.17993508279323578, "learning_rate": 0.0001, "loss": 1.7972, "step": 40400 }, { "epoch": 0.556403801978019, "grad_norm": 0.17061980068683624, "learning_rate": 0.0001, "loss": 1.7954, "step": 40450 }, { "epoch": 0.5570915693475839, "grad_norm": 0.17588096857070923, "learning_rate": 0.0001, "loss": 1.7975, "step": 40500 }, { "epoch": 0.5577793367171487, "grad_norm": 0.16484741866588593, "learning_rate": 0.0001, "loss": 1.7959, "step": 40550 }, { "epoch": 0.5584671040867137, "grad_norm": 0.1812593787908554, "learning_rate": 0.0001, "loss": 1.801, "step": 40600 }, { "epoch": 0.5591548714562786, "grad_norm": 0.17755167186260223, "learning_rate": 0.0001, "loss": 1.797, "step": 40650 }, { "epoch": 0.5598426388258435, "grad_norm": 0.16877087950706482, "learning_rate": 0.0001, "loss": 1.7975, "step": 40700 }, { "epoch": 0.5605304061954085, "grad_norm": 0.15780018270015717, "learning_rate": 0.0001, "loss": 1.7967, "step": 40750 }, { "epoch": 0.5612181735649734, "grad_norm": 0.15145239233970642, "learning_rate": 0.0001, "loss": 1.7988, "step": 40800 }, { "epoch": 0.5619059409345383, "grad_norm": 0.18385986983776093, "learning_rate": 0.0001, "loss": 1.7965, "step": 40850 }, { "epoch": 0.5625937083041033, "grad_norm": 0.15375161170959473, "learning_rate": 0.0001, "loss": 1.7946, "step": 40900 }, { "epoch": 0.5632814756736682, "grad_norm": 0.15694858133792877, "learning_rate": 0.0001, "loss": 1.7989, "step": 40950 }, { "epoch": 0.563969243043233, "grad_norm": 0.1538461446762085, "learning_rate": 0.0001, "loss": 1.7965, "step": 41000 }, { "epoch": 0.5646570104127979, "grad_norm": 0.16211877763271332, "learning_rate": 0.0001, "loss": 1.7931, "step": 41050 }, { "epoch": 0.5653447777823629, "grad_norm": 0.1737697869539261, "learning_rate": 0.0001, "loss": 1.7972, "step": 41100 }, { "epoch": 0.5660325451519278, "grad_norm": 0.1610105037689209, "learning_rate": 0.0001, "loss": 1.798, "step": 41150 }, { "epoch": 0.5667203125214927, "grad_norm": 0.1762542873620987, "learning_rate": 0.0001, "loss": 1.7991, "step": 41200 }, { "epoch": 0.5674080798910577, "grad_norm": 0.16195493936538696, "learning_rate": 0.0001, "loss": 1.7972, "step": 41250 }, { "epoch": 0.5680958472606226, "grad_norm": 0.18047676980495453, "learning_rate": 0.0001, "loss": 1.7962, "step": 41300 }, { "epoch": 0.5687836146301875, "grad_norm": 0.18760687112808228, "learning_rate": 0.0001, "loss": 1.8, "step": 41350 }, { "epoch": 0.5694713819997524, "grad_norm": 0.17012238502502441, "learning_rate": 0.0001, "loss": 1.7969, "step": 41400 }, { "epoch": 0.5701591493693173, "grad_norm": 0.1699533313512802, "learning_rate": 0.0001, "loss": 1.7953, "step": 41450 }, { "epoch": 0.5708469167388822, "grad_norm": 0.16422894597053528, "learning_rate": 0.0001, "loss": 1.7995, "step": 41500 }, { "epoch": 0.5715346841084472, "grad_norm": 0.17526569962501526, "learning_rate": 0.0001, "loss": 1.7967, "step": 41550 }, { "epoch": 0.5722224514780121, "grad_norm": 0.158601313829422, "learning_rate": 0.0001, "loss": 1.8006, "step": 41600 }, { "epoch": 0.572910218847577, "grad_norm": 0.1562766283750534, "learning_rate": 0.0001, "loss": 1.7969, "step": 41650 }, { "epoch": 0.5735979862171419, "grad_norm": 0.15490677952766418, "learning_rate": 0.0001, "loss": 1.8017, "step": 41700 }, { "epoch": 0.5742857535867069, "grad_norm": 0.17004509270191193, "learning_rate": 0.0001, "loss": 1.7958, "step": 41750 }, { "epoch": 0.5749735209562717, "grad_norm": 0.17213889956474304, "learning_rate": 0.0001, "loss": 1.797, "step": 41800 }, { "epoch": 0.5756612883258366, "grad_norm": 0.17541930079460144, "learning_rate": 0.0001, "loss": 1.7972, "step": 41850 }, { "epoch": 0.5763490556954016, "grad_norm": 0.18296034634113312, "learning_rate": 0.0001, "loss": 1.796, "step": 41900 }, { "epoch": 0.5770368230649665, "grad_norm": 0.1777525097131729, "learning_rate": 0.0001, "loss": 1.7959, "step": 41950 }, { "epoch": 0.5777245904345314, "grad_norm": 0.17678572237491608, "learning_rate": 0.0001, "loss": 1.7989, "step": 42000 }, { "epoch": 0.5784123578040964, "grad_norm": 0.1763673573732376, "learning_rate": 0.0001, "loss": 1.8004, "step": 42050 }, { "epoch": 0.5791001251736613, "grad_norm": 0.18608896434307098, "learning_rate": 0.0001, "loss": 1.7997, "step": 42100 }, { "epoch": 0.5797878925432262, "grad_norm": 0.1691625863313675, "learning_rate": 0.0001, "loss": 1.7988, "step": 42150 }, { "epoch": 0.580475659912791, "grad_norm": 0.1609441488981247, "learning_rate": 0.0001, "loss": 1.7993, "step": 42200 }, { "epoch": 0.581163427282356, "grad_norm": 0.15776963531970978, "learning_rate": 0.0001, "loss": 1.7994, "step": 42250 }, { "epoch": 0.5818511946519209, "grad_norm": 0.20214344561100006, "learning_rate": 0.0001, "loss": 1.7998, "step": 42300 }, { "epoch": 0.5825389620214858, "grad_norm": 0.18112723529338837, "learning_rate": 0.0001, "loss": 1.8, "step": 42350 }, { "epoch": 0.5832267293910508, "grad_norm": 0.1543450802564621, "learning_rate": 0.0001, "loss": 1.7982, "step": 42400 }, { "epoch": 0.5839144967606157, "grad_norm": 0.15315985679626465, "learning_rate": 0.0001, "loss": 1.7995, "step": 42450 }, { "epoch": 0.5846022641301806, "grad_norm": 0.16166909039020538, "learning_rate": 0.0001, "loss": 1.7995, "step": 42500 }, { "epoch": 0.5852900314997456, "grad_norm": 0.15933014452457428, "learning_rate": 0.0001, "loss": 1.7968, "step": 42550 }, { "epoch": 0.5859777988693105, "grad_norm": 0.15434689819812775, "learning_rate": 0.0001, "loss": 1.797, "step": 42600 }, { "epoch": 0.5866655662388753, "grad_norm": 0.1875755488872528, "learning_rate": 0.0001, "loss": 1.7964, "step": 42650 }, { "epoch": 0.5873533336084403, "grad_norm": 0.15559327602386475, "learning_rate": 0.0001, "loss": 1.7997, "step": 42700 }, { "epoch": 0.5880411009780052, "grad_norm": 0.16149398684501648, "learning_rate": 0.0001, "loss": 1.7956, "step": 42750 }, { "epoch": 0.5887288683475701, "grad_norm": 0.1777992695569992, "learning_rate": 0.0001, "loss": 1.7912, "step": 42800 }, { "epoch": 0.589416635717135, "grad_norm": 0.15934714674949646, "learning_rate": 0.0001, "loss": 1.7989, "step": 42850 }, { "epoch": 0.5901044030867, "grad_norm": 0.16847145557403564, "learning_rate": 0.0001, "loss": 1.7997, "step": 42900 }, { "epoch": 0.5907921704562649, "grad_norm": 0.17410792410373688, "learning_rate": 0.0001, "loss": 1.7999, "step": 42950 }, { "epoch": 0.5914799378258297, "grad_norm": 0.18102861940860748, "learning_rate": 0.0001, "loss": 1.7983, "step": 43000 }, { "epoch": 0.5921677051953947, "grad_norm": 0.1682325005531311, "learning_rate": 0.0001, "loss": 1.7986, "step": 43050 }, { "epoch": 0.5928554725649596, "grad_norm": 0.17732855677604675, "learning_rate": 0.0001, "loss": 1.8004, "step": 43100 }, { "epoch": 0.5935432399345245, "grad_norm": 0.16327179968357086, "learning_rate": 0.0001, "loss": 1.7969, "step": 43150 }, { "epoch": 0.5942310073040895, "grad_norm": 0.1582539677619934, "learning_rate": 0.0001, "loss": 1.798, "step": 43200 }, { "epoch": 0.5949187746736544, "grad_norm": 0.14965754747390747, "learning_rate": 0.0001, "loss": 1.7986, "step": 43250 }, { "epoch": 0.5956065420432193, "grad_norm": 0.1617211103439331, "learning_rate": 0.0001, "loss": 1.7938, "step": 43300 }, { "epoch": 0.5962943094127843, "grad_norm": 0.17458325624465942, "learning_rate": 0.0001, "loss": 1.7978, "step": 43350 }, { "epoch": 0.5969820767823492, "grad_norm": 0.1668146252632141, "learning_rate": 0.0001, "loss": 1.7983, "step": 43400 }, { "epoch": 0.597669844151914, "grad_norm": 0.15414200723171234, "learning_rate": 0.0001, "loss": 1.7989, "step": 43450 }, { "epoch": 0.5983576115214789, "grad_norm": 0.15912353992462158, "learning_rate": 0.0001, "loss": 1.7964, "step": 43500 }, { "epoch": 0.5990453788910439, "grad_norm": 0.15936636924743652, "learning_rate": 0.0001, "loss": 1.7944, "step": 43550 }, { "epoch": 0.5997331462606088, "grad_norm": 0.17340709269046783, "learning_rate": 0.0001, "loss": 1.7912, "step": 43600 }, { "epoch": 0.6004209136301737, "grad_norm": 0.18960115313529968, "learning_rate": 0.0001, "loss": 1.7946, "step": 43650 }, { "epoch": 0.6011086809997387, "grad_norm": 0.17091485857963562, "learning_rate": 0.0001, "loss": 1.7998, "step": 43700 }, { "epoch": 0.6017964483693036, "grad_norm": 0.17222945392131805, "learning_rate": 0.0001, "loss": 1.8016, "step": 43750 }, { "epoch": 0.6024842157388685, "grad_norm": 0.1608862429857254, "learning_rate": 0.0001, "loss": 1.794, "step": 43800 }, { "epoch": 0.6031719831084335, "grad_norm": 0.16626954078674316, "learning_rate": 0.0001, "loss": 1.7971, "step": 43850 }, { "epoch": 0.6038597504779983, "grad_norm": 0.1769898533821106, "learning_rate": 0.0001, "loss": 1.7992, "step": 43900 }, { "epoch": 0.6045475178475632, "grad_norm": 0.1665075570344925, "learning_rate": 0.0001, "loss": 1.7976, "step": 43950 }, { "epoch": 0.6052352852171281, "grad_norm": 0.1957935094833374, "learning_rate": 0.0001, "loss": 1.7972, "step": 44000 }, { "epoch": 0.6059230525866931, "grad_norm": 0.20066794753074646, "learning_rate": 0.0001, "loss": 1.7976, "step": 44050 }, { "epoch": 0.606610819956258, "grad_norm": 0.16102181375026703, "learning_rate": 0.0001, "loss": 1.7942, "step": 44100 }, { "epoch": 0.6072985873258229, "grad_norm": 0.16587640345096588, "learning_rate": 0.0001, "loss": 1.7964, "step": 44150 }, { "epoch": 0.6079863546953879, "grad_norm": 0.17338010668754578, "learning_rate": 0.0001, "loss": 1.7955, "step": 44200 }, { "epoch": 0.6086741220649527, "grad_norm": 0.1979152411222458, "learning_rate": 0.0001, "loss": 1.7964, "step": 44250 }, { "epoch": 0.6093618894345176, "grad_norm": 0.16478174924850464, "learning_rate": 0.0001, "loss": 1.8013, "step": 44300 }, { "epoch": 0.6100496568040826, "grad_norm": 0.16508819162845612, "learning_rate": 0.0001, "loss": 1.7922, "step": 44350 }, { "epoch": 0.6107374241736475, "grad_norm": 0.15964439511299133, "learning_rate": 0.0001, "loss": 1.7975, "step": 44400 }, { "epoch": 0.6114251915432124, "grad_norm": 0.18116386234760284, "learning_rate": 0.0001, "loss": 1.7972, "step": 44450 }, { "epoch": 0.6121129589127774, "grad_norm": 0.1808495819568634, "learning_rate": 0.0001, "loss": 1.7958, "step": 44500 }, { "epoch": 0.6128007262823423, "grad_norm": 0.1634376347064972, "learning_rate": 0.0001, "loss": 1.7931, "step": 44550 }, { "epoch": 0.6134884936519072, "grad_norm": 0.15140944719314575, "learning_rate": 0.0001, "loss": 1.7995, "step": 44600 }, { "epoch": 0.614176261021472, "grad_norm": 0.15988072752952576, "learning_rate": 0.0001, "loss": 1.7957, "step": 44650 }, { "epoch": 0.614864028391037, "grad_norm": 0.16280120611190796, "learning_rate": 0.0001, "loss": 1.7986, "step": 44700 }, { "epoch": 0.6155517957606019, "grad_norm": 0.16643498837947845, "learning_rate": 0.0001, "loss": 1.7975, "step": 44750 }, { "epoch": 0.6162395631301668, "grad_norm": 0.151467427611351, "learning_rate": 0.0001, "loss": 1.7972, "step": 44800 }, { "epoch": 0.6169273304997318, "grad_norm": 0.1621852070093155, "learning_rate": 0.0001, "loss": 1.7948, "step": 44850 }, { "epoch": 0.6176150978692967, "grad_norm": 0.1828535795211792, "learning_rate": 0.0001, "loss": 1.7939, "step": 44900 }, { "epoch": 0.6183028652388616, "grad_norm": 0.1630941480398178, "learning_rate": 0.0001, "loss": 1.7987, "step": 44950 }, { "epoch": 0.6189906326084266, "grad_norm": 0.1701328009366989, "learning_rate": 0.0001, "loss": 1.7955, "step": 45000 }, { "epoch": 0.6196783999779915, "grad_norm": 0.16631458699703217, "learning_rate": 0.0001, "loss": 1.7985, "step": 45050 }, { "epoch": 0.6203661673475563, "grad_norm": 0.17133264243602753, "learning_rate": 0.0001, "loss": 1.7946, "step": 45100 }, { "epoch": 0.6210539347171212, "grad_norm": 0.19388112425804138, "learning_rate": 0.0001, "loss": 1.7944, "step": 45150 }, { "epoch": 0.6217417020866862, "grad_norm": 0.1769258826971054, "learning_rate": 0.0001, "loss": 1.7937, "step": 45200 }, { "epoch": 0.6224294694562511, "grad_norm": 0.21986328065395355, "learning_rate": 0.0001, "loss": 1.7946, "step": 45250 }, { "epoch": 0.623117236825816, "grad_norm": 0.1711747795343399, "learning_rate": 0.0001, "loss": 1.7923, "step": 45300 }, { "epoch": 0.623805004195381, "grad_norm": 0.1730772852897644, "learning_rate": 0.0001, "loss": 1.7976, "step": 45350 }, { "epoch": 0.6244927715649459, "grad_norm": 0.16657279431819916, "learning_rate": 0.0001, "loss": 1.7958, "step": 45400 }, { "epoch": 0.6251805389345108, "grad_norm": 0.15675725042819977, "learning_rate": 0.0001, "loss": 1.7931, "step": 45450 }, { "epoch": 0.6258683063040757, "grad_norm": 0.17763769626617432, "learning_rate": 0.0001, "loss": 1.7972, "step": 45500 }, { "epoch": 0.6265560736736406, "grad_norm": 0.1630527824163437, "learning_rate": 0.0001, "loss": 1.7948, "step": 45550 }, { "epoch": 0.6272438410432055, "grad_norm": 0.16628991067409515, "learning_rate": 0.0001, "loss": 1.7959, "step": 45600 }, { "epoch": 0.6279316084127705, "grad_norm": 0.1589209884405136, "learning_rate": 0.0001, "loss": 1.7949, "step": 45650 }, { "epoch": 0.6286193757823354, "grad_norm": 0.17715197801589966, "learning_rate": 0.0001, "loss": 1.7971, "step": 45700 }, { "epoch": 0.6293071431519003, "grad_norm": 0.1824561059474945, "learning_rate": 0.0001, "loss": 1.795, "step": 45750 }, { "epoch": 0.6299949105214652, "grad_norm": 0.16866008937358856, "learning_rate": 0.0001, "loss": 1.7957, "step": 45800 }, { "epoch": 0.6306826778910302, "grad_norm": 0.14337721467018127, "learning_rate": 0.0001, "loss": 1.7937, "step": 45850 }, { "epoch": 0.631370445260595, "grad_norm": 0.15916399657726288, "learning_rate": 0.0001, "loss": 1.7938, "step": 45900 }, { "epoch": 0.6320582126301599, "grad_norm": 0.1653524488210678, "learning_rate": 0.0001, "loss": 1.795, "step": 45950 }, { "epoch": 0.6327459799997249, "grad_norm": 0.1588210016489029, "learning_rate": 0.0001, "loss": 1.7963, "step": 46000 }, { "epoch": 0.6334337473692898, "grad_norm": 0.16008345782756805, "learning_rate": 0.0001, "loss": 1.7978, "step": 46050 }, { "epoch": 0.6341215147388547, "grad_norm": 0.16054043173789978, "learning_rate": 0.0001, "loss": 1.7914, "step": 46100 }, { "epoch": 0.6348092821084197, "grad_norm": 0.19745290279388428, "learning_rate": 0.0001, "loss": 1.7938, "step": 46150 }, { "epoch": 0.6354970494779846, "grad_norm": 0.18955908715724945, "learning_rate": 0.0001, "loss": 1.7948, "step": 46200 }, { "epoch": 0.6361848168475495, "grad_norm": 0.16962236166000366, "learning_rate": 0.0001, "loss": 1.7911, "step": 46250 }, { "epoch": 0.6368725842171145, "grad_norm": 0.17200341820716858, "learning_rate": 0.0001, "loss": 1.7935, "step": 46300 }, { "epoch": 0.6375603515866793, "grad_norm": 0.17781908810138702, "learning_rate": 0.0001, "loss": 1.7905, "step": 46350 }, { "epoch": 0.6382481189562442, "grad_norm": 0.17602622509002686, "learning_rate": 0.0001, "loss": 1.7945, "step": 46400 }, { "epoch": 0.6389358863258091, "grad_norm": 0.1686919629573822, "learning_rate": 0.0001, "loss": 1.7892, "step": 46450 }, { "epoch": 0.6396236536953741, "grad_norm": 0.15013763308525085, "learning_rate": 0.0001, "loss": 1.7969, "step": 46500 }, { "epoch": 0.640311421064939, "grad_norm": 0.16534103453159332, "learning_rate": 0.0001, "loss": 1.7943, "step": 46550 }, { "epoch": 0.6409991884345039, "grad_norm": 0.16527748107910156, "learning_rate": 0.0001, "loss": 1.7904, "step": 46600 }, { "epoch": 0.6416869558040689, "grad_norm": 0.15024395287036896, "learning_rate": 0.0001, "loss": 1.7944, "step": 46650 }, { "epoch": 0.6423747231736338, "grad_norm": 0.17082852125167847, "learning_rate": 0.0001, "loss": 1.7942, "step": 46700 }, { "epoch": 0.6430624905431986, "grad_norm": 0.1649017482995987, "learning_rate": 0.0001, "loss": 1.7936, "step": 46750 }, { "epoch": 0.6437502579127636, "grad_norm": 0.16045525670051575, "learning_rate": 0.0001, "loss": 1.7913, "step": 46800 }, { "epoch": 0.6444380252823285, "grad_norm": 0.18290746212005615, "learning_rate": 0.0001, "loss": 1.7898, "step": 46850 }, { "epoch": 0.6451257926518934, "grad_norm": 0.14731939136981964, "learning_rate": 0.0001, "loss": 1.7934, "step": 46900 }, { "epoch": 0.6458135600214583, "grad_norm": 0.16072627902030945, "learning_rate": 0.0001, "loss": 1.7933, "step": 46950 }, { "epoch": 0.6465013273910233, "grad_norm": 0.14942970871925354, "learning_rate": 0.0001, "loss": 1.7944, "step": 47000 }, { "epoch": 0.6471890947605882, "grad_norm": 0.14922235906124115, "learning_rate": 0.0001, "loss": 1.7953, "step": 47050 }, { "epoch": 0.647876862130153, "grad_norm": 0.17120474576950073, "learning_rate": 0.0001, "loss": 1.7955, "step": 47100 }, { "epoch": 0.648564629499718, "grad_norm": 0.17423823475837708, "learning_rate": 0.0001, "loss": 1.7919, "step": 47150 }, { "epoch": 0.6492523968692829, "grad_norm": 0.1567763239145279, "learning_rate": 0.0001, "loss": 1.7934, "step": 47200 }, { "epoch": 0.6499401642388478, "grad_norm": 0.15817411243915558, "learning_rate": 0.0001, "loss": 1.7928, "step": 47250 }, { "epoch": 0.6506279316084128, "grad_norm": 0.1748141348361969, "learning_rate": 0.0001, "loss": 1.7884, "step": 47300 }, { "epoch": 0.6513156989779777, "grad_norm": 0.2045951634645462, "learning_rate": 0.0001, "loss": 1.7978, "step": 47350 }, { "epoch": 0.6520034663475426, "grad_norm": 0.17650052905082703, "learning_rate": 0.0001, "loss": 1.792, "step": 47400 }, { "epoch": 0.6526912337171076, "grad_norm": 0.17905278503894806, "learning_rate": 0.0001, "loss": 1.7958, "step": 47450 }, { "epoch": 0.6533790010866725, "grad_norm": 0.1599511355161667, "learning_rate": 0.0001, "loss": 1.7912, "step": 47500 }, { "epoch": 0.6540667684562373, "grad_norm": 0.1584351658821106, "learning_rate": 0.0001, "loss": 1.7949, "step": 47550 }, { "epoch": 0.6547545358258022, "grad_norm": 0.17251476645469666, "learning_rate": 0.0001, "loss": 1.7913, "step": 47600 }, { "epoch": 0.6554423031953672, "grad_norm": 0.17718471586704254, "learning_rate": 0.0001, "loss": 1.7934, "step": 47650 }, { "epoch": 0.6561300705649321, "grad_norm": 0.15196654200553894, "learning_rate": 0.0001, "loss": 1.7894, "step": 47700 }, { "epoch": 0.656817837934497, "grad_norm": 0.17444145679473877, "learning_rate": 0.0001, "loss": 1.7907, "step": 47750 }, { "epoch": 0.657505605304062, "grad_norm": 0.15149961411952972, "learning_rate": 0.0001, "loss": 1.7959, "step": 47800 }, { "epoch": 0.6581933726736269, "grad_norm": 0.1591227501630783, "learning_rate": 0.0001, "loss": 1.7907, "step": 47850 }, { "epoch": 0.6588811400431918, "grad_norm": 0.20135171711444855, "learning_rate": 0.0001, "loss": 1.7963, "step": 47900 }, { "epoch": 0.6595689074127568, "grad_norm": 0.16523614525794983, "learning_rate": 0.0001, "loss": 1.7968, "step": 47950 }, { "epoch": 0.6602566747823216, "grad_norm": 0.15842151641845703, "learning_rate": 0.0001, "loss": 1.7897, "step": 48000 }, { "epoch": 0.6609444421518865, "grad_norm": 0.160832479596138, "learning_rate": 0.0001, "loss": 1.796, "step": 48050 }, { "epoch": 0.6616322095214515, "grad_norm": 0.16063477098941803, "learning_rate": 0.0001, "loss": 1.7903, "step": 48100 }, { "epoch": 0.6623199768910164, "grad_norm": 0.1595107465982437, "learning_rate": 0.0001, "loss": 1.7953, "step": 48150 }, { "epoch": 0.6630077442605813, "grad_norm": 0.18313910067081451, "learning_rate": 0.0001, "loss": 1.7957, "step": 48200 }, { "epoch": 0.6636955116301462, "grad_norm": 0.17561380565166473, "learning_rate": 0.0001, "loss": 1.7906, "step": 48250 }, { "epoch": 0.6643832789997112, "grad_norm": 0.18327072262763977, "learning_rate": 0.0001, "loss": 1.7916, "step": 48300 }, { "epoch": 0.665071046369276, "grad_norm": 0.16745221614837646, "learning_rate": 0.0001, "loss": 1.791, "step": 48350 }, { "epoch": 0.6657588137388409, "grad_norm": 0.16286319494247437, "learning_rate": 0.0001, "loss": 1.7942, "step": 48400 }, { "epoch": 0.6664465811084059, "grad_norm": 0.15864308178424835, "learning_rate": 0.0001, "loss": 1.7953, "step": 48450 }, { "epoch": 0.6671343484779708, "grad_norm": 0.16778843104839325, "learning_rate": 0.0001, "loss": 1.7945, "step": 48500 }, { "epoch": 0.6678221158475357, "grad_norm": 0.1448727399110794, "learning_rate": 0.0001, "loss": 1.7942, "step": 48550 }, { "epoch": 0.6685098832171007, "grad_norm": 0.16745643317699432, "learning_rate": 0.0001, "loss": 1.7903, "step": 48600 }, { "epoch": 0.6691976505866656, "grad_norm": 0.1633836030960083, "learning_rate": 0.0001, "loss": 1.7938, "step": 48650 }, { "epoch": 0.6698854179562305, "grad_norm": 0.15037505328655243, "learning_rate": 0.0001, "loss": 1.7963, "step": 48700 }, { "epoch": 0.6705731853257954, "grad_norm": 0.1707869917154312, "learning_rate": 0.0001, "loss": 1.7895, "step": 48750 }, { "epoch": 0.6712609526953603, "grad_norm": 0.17392534017562866, "learning_rate": 0.0001, "loss": 1.7926, "step": 48800 }, { "epoch": 0.6719487200649252, "grad_norm": 0.1588422805070877, "learning_rate": 0.0001, "loss": 1.7958, "step": 48850 }, { "epoch": 0.6726364874344901, "grad_norm": 0.1751549243927002, "learning_rate": 0.0001, "loss": 1.7931, "step": 48900 }, { "epoch": 0.6733242548040551, "grad_norm": 0.1722249686717987, "learning_rate": 0.0001, "loss": 1.7892, "step": 48950 }, { "epoch": 0.67401202217362, "grad_norm": 0.1673288643360138, "learning_rate": 0.0001, "loss": 1.793, "step": 49000 }, { "epoch": 0.6746997895431849, "grad_norm": 0.1552770733833313, "learning_rate": 0.0001, "loss": 1.7916, "step": 49050 }, { "epoch": 0.6753875569127499, "grad_norm": 0.15788178145885468, "learning_rate": 0.0001, "loss": 1.7981, "step": 49100 }, { "epoch": 0.6760753242823148, "grad_norm": 0.17959725856781006, "learning_rate": 0.0001, "loss": 1.7949, "step": 49150 }, { "epoch": 0.6767630916518796, "grad_norm": 0.1584416925907135, "learning_rate": 0.0001, "loss": 1.7946, "step": 49200 }, { "epoch": 0.6774508590214446, "grad_norm": 0.1645151674747467, "learning_rate": 0.0001, "loss": 1.7916, "step": 49250 }, { "epoch": 0.6781386263910095, "grad_norm": 0.1522347778081894, "learning_rate": 0.0001, "loss": 1.7891, "step": 49300 }, { "epoch": 0.6788263937605744, "grad_norm": 0.16095298528671265, "learning_rate": 0.0001, "loss": 1.7927, "step": 49350 }, { "epoch": 0.6795141611301393, "grad_norm": 0.15317974984645844, "learning_rate": 0.0001, "loss": 1.7947, "step": 49400 }, { "epoch": 0.6802019284997043, "grad_norm": 0.16854670643806458, "learning_rate": 0.0001, "loss": 1.7929, "step": 49450 }, { "epoch": 0.6808896958692692, "grad_norm": 0.1702488660812378, "learning_rate": 0.0001, "loss": 1.791, "step": 49500 }, { "epoch": 0.6815774632388341, "grad_norm": 0.16388344764709473, "learning_rate": 0.0001, "loss": 1.7894, "step": 49550 }, { "epoch": 0.682265230608399, "grad_norm": 0.16601653397083282, "learning_rate": 0.0001, "loss": 1.7949, "step": 49600 }, { "epoch": 0.6829529979779639, "grad_norm": 0.17910674214363098, "learning_rate": 0.0001, "loss": 1.7875, "step": 49650 }, { "epoch": 0.6836407653475288, "grad_norm": 0.15689565241336823, "learning_rate": 0.0001, "loss": 1.7904, "step": 49700 }, { "epoch": 0.6843285327170938, "grad_norm": 0.15473750233650208, "learning_rate": 0.0001, "loss": 1.7894, "step": 49750 }, { "epoch": 0.6850163000866587, "grad_norm": 0.16794639825820923, "learning_rate": 0.0001, "loss": 1.7934, "step": 49800 }, { "epoch": 0.6857040674562236, "grad_norm": 0.15183915197849274, "learning_rate": 0.0001, "loss": 1.7887, "step": 49850 }, { "epoch": 0.6863918348257885, "grad_norm": 0.15028232336044312, "learning_rate": 0.0001, "loss": 1.7929, "step": 49900 }, { "epoch": 0.6870796021953535, "grad_norm": 0.16230390965938568, "learning_rate": 0.0001, "loss": 1.7948, "step": 49950 }, { "epoch": 0.6877673695649184, "grad_norm": 0.16958658397197723, "learning_rate": 0.0001, "loss": 1.7932, "step": 50000 }, { "epoch": 0.6884551369344832, "grad_norm": 0.15662765502929688, "learning_rate": 0.0001, "loss": 1.7904, "step": 50050 }, { "epoch": 0.6891429043040482, "grad_norm": 0.17507807910442352, "learning_rate": 0.0001, "loss": 1.795, "step": 50100 }, { "epoch": 0.6898306716736131, "grad_norm": 0.16449585556983948, "learning_rate": 0.0001, "loss": 1.7888, "step": 50150 }, { "epoch": 0.690518439043178, "grad_norm": 0.17615753412246704, "learning_rate": 0.0001, "loss": 1.7889, "step": 50200 }, { "epoch": 0.691206206412743, "grad_norm": 0.16010646522045135, "learning_rate": 0.0001, "loss": 1.7932, "step": 50250 }, { "epoch": 0.6918939737823079, "grad_norm": 0.14614787697792053, "learning_rate": 0.0001, "loss": 1.792, "step": 50300 }, { "epoch": 0.6925817411518728, "grad_norm": 0.19960370659828186, "learning_rate": 0.0001, "loss": 1.7907, "step": 50350 }, { "epoch": 0.6932695085214378, "grad_norm": 0.16230808198451996, "learning_rate": 0.0001, "loss": 1.7855, "step": 50400 }, { "epoch": 0.6939572758910026, "grad_norm": 0.16344518959522247, "learning_rate": 0.0001, "loss": 1.791, "step": 50450 }, { "epoch": 0.6946450432605675, "grad_norm": 0.16584964096546173, "learning_rate": 0.0001, "loss": 1.7916, "step": 50500 }, { "epoch": 0.6953328106301324, "grad_norm": 0.15551120042800903, "learning_rate": 0.0001, "loss": 1.7948, "step": 50550 }, { "epoch": 0.6960205779996974, "grad_norm": 0.1697503924369812, "learning_rate": 0.0001, "loss": 1.7917, "step": 50600 }, { "epoch": 0.6967083453692623, "grad_norm": 0.15577536821365356, "learning_rate": 0.0001, "loss": 1.7905, "step": 50650 }, { "epoch": 0.6973961127388272, "grad_norm": 0.17658278346061707, "learning_rate": 0.0001, "loss": 1.7884, "step": 50700 }, { "epoch": 0.6980838801083922, "grad_norm": 0.16718824207782745, "learning_rate": 0.0001, "loss": 1.7936, "step": 50750 }, { "epoch": 0.6987716474779571, "grad_norm": 0.16996939480304718, "learning_rate": 0.0001, "loss": 1.7919, "step": 50800 }, { "epoch": 0.6994594148475219, "grad_norm": 0.15299175679683685, "learning_rate": 0.0001, "loss": 1.7919, "step": 50850 }, { "epoch": 0.7001471822170869, "grad_norm": 0.1672915816307068, "learning_rate": 0.0001, "loss": 1.795, "step": 50900 }, { "epoch": 0.7008349495866518, "grad_norm": 0.17287658154964447, "learning_rate": 0.0001, "loss": 1.7877, "step": 50950 }, { "epoch": 0.7015227169562167, "grad_norm": 0.16447900235652924, "learning_rate": 0.0001, "loss": 1.7915, "step": 51000 }, { "epoch": 0.7022104843257817, "grad_norm": 0.16016733646392822, "learning_rate": 0.0001, "loss": 1.7911, "step": 51050 }, { "epoch": 0.7028982516953466, "grad_norm": 0.15329506993293762, "learning_rate": 0.0001, "loss": 1.7915, "step": 51100 }, { "epoch": 0.7035860190649115, "grad_norm": 0.1695086658000946, "learning_rate": 0.0001, "loss": 1.7925, "step": 51150 }, { "epoch": 0.7042737864344764, "grad_norm": 0.15667758882045746, "learning_rate": 0.0001, "loss": 1.7908, "step": 51200 }, { "epoch": 0.7049615538040414, "grad_norm": 0.1636906862258911, "learning_rate": 0.0001, "loss": 1.7911, "step": 51250 }, { "epoch": 0.7056493211736062, "grad_norm": 0.16701051592826843, "learning_rate": 0.0001, "loss": 1.7929, "step": 51300 }, { "epoch": 0.7063370885431711, "grad_norm": 0.17164082825183868, "learning_rate": 0.0001, "loss": 1.7922, "step": 51350 }, { "epoch": 0.7070248559127361, "grad_norm": 0.18162649869918823, "learning_rate": 0.0001, "loss": 1.7892, "step": 51400 }, { "epoch": 0.707712623282301, "grad_norm": 0.1521824300289154, "learning_rate": 0.0001, "loss": 1.7937, "step": 51450 }, { "epoch": 0.7084003906518659, "grad_norm": 0.168669655919075, "learning_rate": 0.0001, "loss": 1.7873, "step": 51500 }, { "epoch": 0.7090881580214309, "grad_norm": 0.17441484332084656, "learning_rate": 0.0001, "loss": 1.79, "step": 51550 }, { "epoch": 0.7097759253909958, "grad_norm": 0.1877586394548416, "learning_rate": 0.0001, "loss": 1.7927, "step": 51600 }, { "epoch": 0.7104636927605607, "grad_norm": 0.16195935010910034, "learning_rate": 0.0001, "loss": 1.7962, "step": 51650 }, { "epoch": 0.7111514601301255, "grad_norm": 0.16282670199871063, "learning_rate": 0.0001, "loss": 1.7939, "step": 51700 }, { "epoch": 0.7118392274996905, "grad_norm": 0.15550565719604492, "learning_rate": 0.0001, "loss": 1.793, "step": 51750 }, { "epoch": 0.7125269948692554, "grad_norm": 0.16963760554790497, "learning_rate": 0.0001, "loss": 1.7921, "step": 51800 }, { "epoch": 0.7132147622388203, "grad_norm": 0.1632436364889145, "learning_rate": 0.0001, "loss": 1.7943, "step": 51850 }, { "epoch": 0.7139025296083853, "grad_norm": 0.15533354878425598, "learning_rate": 0.0001, "loss": 1.7917, "step": 51900 }, { "epoch": 0.7145902969779502, "grad_norm": 0.15280106663703918, "learning_rate": 0.0001, "loss": 1.7874, "step": 51950 }, { "epoch": 0.7152780643475151, "grad_norm": 0.1561509668827057, "learning_rate": 0.0001, "loss": 1.7918, "step": 52000 }, { "epoch": 0.7159658317170801, "grad_norm": 0.1560848206281662, "learning_rate": 0.0001, "loss": 1.7927, "step": 52050 }, { "epoch": 0.7166535990866449, "grad_norm": 0.1706065684556961, "learning_rate": 0.0001, "loss": 1.7877, "step": 52100 }, { "epoch": 0.7173413664562098, "grad_norm": 0.16388699412345886, "learning_rate": 0.0001, "loss": 1.7859, "step": 52150 }, { "epoch": 0.7180291338257748, "grad_norm": 0.16502410173416138, "learning_rate": 0.0001, "loss": 1.7899, "step": 52200 }, { "epoch": 0.7187169011953397, "grad_norm": 0.17022061347961426, "learning_rate": 0.0001, "loss": 1.7881, "step": 52250 }, { "epoch": 0.7194046685649046, "grad_norm": 0.17903153598308563, "learning_rate": 0.0001, "loss": 1.7892, "step": 52300 }, { "epoch": 0.7200924359344695, "grad_norm": 0.15719935297966003, "learning_rate": 0.0001, "loss": 1.7934, "step": 52350 }, { "epoch": 0.7207802033040345, "grad_norm": 0.16321443021297455, "learning_rate": 0.0001, "loss": 1.7914, "step": 52400 }, { "epoch": 0.7214679706735994, "grad_norm": 0.1724744439125061, "learning_rate": 0.0001, "loss": 1.7905, "step": 52450 }, { "epoch": 0.7221557380431642, "grad_norm": 0.16059927642345428, "learning_rate": 0.0001, "loss": 1.7929, "step": 52500 }, { "epoch": 0.7228435054127292, "grad_norm": 0.17748789489269257, "learning_rate": 0.0001, "loss": 1.7913, "step": 52550 }, { "epoch": 0.7235312727822941, "grad_norm": 0.16190293431282043, "learning_rate": 0.0001, "loss": 1.7956, "step": 52600 }, { "epoch": 0.724219040151859, "grad_norm": 0.1841738224029541, "learning_rate": 0.0001, "loss": 1.7899, "step": 52650 }, { "epoch": 0.724906807521424, "grad_norm": 0.15971702337265015, "learning_rate": 0.0001, "loss": 1.7891, "step": 52700 }, { "epoch": 0.7255945748909889, "grad_norm": 0.15894858539104462, "learning_rate": 0.0001, "loss": 1.7939, "step": 52750 }, { "epoch": 0.7262823422605538, "grad_norm": 0.15041370689868927, "learning_rate": 0.0001, "loss": 1.7885, "step": 52800 }, { "epoch": 0.7269701096301187, "grad_norm": 0.15757033228874207, "learning_rate": 0.0001, "loss": 1.7881, "step": 52850 }, { "epoch": 0.7276578769996837, "grad_norm": 0.16385579109191895, "learning_rate": 0.0001, "loss": 1.7889, "step": 52900 }, { "epoch": 0.7283456443692485, "grad_norm": 0.15629428625106812, "learning_rate": 0.0001, "loss": 1.7932, "step": 52950 }, { "epoch": 0.7290334117388134, "grad_norm": 0.1573755145072937, "learning_rate": 0.0001, "loss": 1.7926, "step": 53000 }, { "epoch": 0.7297211791083784, "grad_norm": 0.15800927579402924, "learning_rate": 0.0001, "loss": 1.789, "step": 53050 }, { "epoch": 0.7304089464779433, "grad_norm": 0.16997511684894562, "learning_rate": 0.0001, "loss": 1.7892, "step": 53100 }, { "epoch": 0.7310967138475082, "grad_norm": 0.1457889825105667, "learning_rate": 0.0001, "loss": 1.7892, "step": 53150 }, { "epoch": 0.7317844812170732, "grad_norm": 0.15250973403453827, "learning_rate": 0.0001, "loss": 1.7892, "step": 53200 }, { "epoch": 0.7324722485866381, "grad_norm": 0.1561204344034195, "learning_rate": 0.0001, "loss": 1.7915, "step": 53250 }, { "epoch": 0.733160015956203, "grad_norm": 0.17602892220020294, "learning_rate": 0.0001, "loss": 1.789, "step": 53300 }, { "epoch": 0.7338477833257679, "grad_norm": 0.15751750767230988, "learning_rate": 0.0001, "loss": 1.7924, "step": 53350 }, { "epoch": 0.7345355506953328, "grad_norm": 0.1686706244945526, "learning_rate": 0.0001, "loss": 1.7859, "step": 53400 }, { "epoch": 0.7352233180648977, "grad_norm": 0.15886232256889343, "learning_rate": 0.0001, "loss": 1.7891, "step": 53450 }, { "epoch": 0.7359110854344626, "grad_norm": 0.1548243910074234, "learning_rate": 0.0001, "loss": 1.7887, "step": 53500 }, { "epoch": 0.7365988528040276, "grad_norm": 0.16160327196121216, "learning_rate": 0.0001, "loss": 1.792, "step": 53550 }, { "epoch": 0.7372866201735925, "grad_norm": 0.1588127613067627, "learning_rate": 0.0001, "loss": 1.791, "step": 53600 }, { "epoch": 0.7379743875431574, "grad_norm": 0.1562395691871643, "learning_rate": 0.0001, "loss": 1.7876, "step": 53650 }, { "epoch": 0.7386621549127224, "grad_norm": 0.1463010013103485, "learning_rate": 0.0001, "loss": 1.7903, "step": 53700 }, { "epoch": 0.7393499222822872, "grad_norm": 0.1688784807920456, "learning_rate": 0.0001, "loss": 1.7874, "step": 53750 }, { "epoch": 0.7400376896518521, "grad_norm": 0.16111525893211365, "learning_rate": 0.0001, "loss": 1.7891, "step": 53800 }, { "epoch": 0.7407254570214171, "grad_norm": 0.15798266232013702, "learning_rate": 0.0001, "loss": 1.7901, "step": 53850 }, { "epoch": 0.741413224390982, "grad_norm": 0.1544068306684494, "learning_rate": 0.0001, "loss": 1.79, "step": 53900 }, { "epoch": 0.7421009917605469, "grad_norm": 0.16747315227985382, "learning_rate": 0.0001, "loss": 1.7923, "step": 53950 }, { "epoch": 0.7427887591301119, "grad_norm": 0.20277969539165497, "learning_rate": 0.0001, "loss": 1.7932, "step": 54000 }, { "epoch": 0.7434765264996768, "grad_norm": 0.1490595042705536, "learning_rate": 0.0001, "loss": 1.7899, "step": 54050 }, { "epoch": 0.7441642938692417, "grad_norm": 0.15864817798137665, "learning_rate": 0.0001, "loss": 1.7838, "step": 54100 }, { "epoch": 0.7448520612388065, "grad_norm": 0.17168639600276947, "learning_rate": 0.0001, "loss": 1.79, "step": 54150 }, { "epoch": 0.7455398286083715, "grad_norm": 0.1612584888935089, "learning_rate": 0.0001, "loss": 1.7894, "step": 54200 }, { "epoch": 0.7462275959779364, "grad_norm": 0.16638678312301636, "learning_rate": 0.0001, "loss": 1.7852, "step": 54250 }, { "epoch": 0.7469153633475013, "grad_norm": 0.16757947206497192, "learning_rate": 0.0001, "loss": 1.7899, "step": 54300 }, { "epoch": 0.7476031307170663, "grad_norm": 0.17740657925605774, "learning_rate": 0.0001, "loss": 1.7891, "step": 54350 }, { "epoch": 0.7482908980866312, "grad_norm": 0.15608841180801392, "learning_rate": 0.0001, "loss": 1.7864, "step": 54400 }, { "epoch": 0.7489786654561961, "grad_norm": 0.1486404538154602, "learning_rate": 0.0001, "loss": 1.7895, "step": 54450 }, { "epoch": 0.7496664328257611, "grad_norm": 0.17158234119415283, "learning_rate": 0.0001, "loss": 1.789, "step": 54500 }, { "epoch": 0.750354200195326, "grad_norm": 0.1535918265581131, "learning_rate": 0.0001, "loss": 1.7858, "step": 54550 }, { "epoch": 0.7510419675648908, "grad_norm": 0.17464052140712738, "learning_rate": 0.0001, "loss": 1.7884, "step": 54600 }, { "epoch": 0.7517297349344557, "grad_norm": 0.15320485830307007, "learning_rate": 0.0001, "loss": 1.7909, "step": 54650 }, { "epoch": 0.7524175023040207, "grad_norm": 0.16376914083957672, "learning_rate": 0.0001, "loss": 1.789, "step": 54700 }, { "epoch": 0.7531052696735856, "grad_norm": 0.17047230899333954, "learning_rate": 0.0001, "loss": 1.7886, "step": 54750 }, { "epoch": 0.7537930370431505, "grad_norm": 0.1580251306295395, "learning_rate": 0.0001, "loss": 1.7904, "step": 54800 }, { "epoch": 0.7544808044127155, "grad_norm": 0.16085964441299438, "learning_rate": 0.0001, "loss": 1.7872, "step": 54850 }, { "epoch": 0.7551685717822804, "grad_norm": 0.1530008316040039, "learning_rate": 0.0001, "loss": 1.7909, "step": 54900 }, { "epoch": 0.7558563391518452, "grad_norm": 0.18514500558376312, "learning_rate": 0.0001, "loss": 1.789, "step": 54950 }, { "epoch": 0.7565441065214102, "grad_norm": 0.16724203526973724, "learning_rate": 0.0001, "loss": 1.7895, "step": 55000 }, { "epoch": 0.7572318738909751, "grad_norm": 0.17008638381958008, "learning_rate": 0.0001, "loss": 1.7909, "step": 55050 }, { "epoch": 0.75791964126054, "grad_norm": 0.15402346849441528, "learning_rate": 0.0001, "loss": 1.7858, "step": 55100 }, { "epoch": 0.758607408630105, "grad_norm": 0.1750432401895523, "learning_rate": 0.0001, "loss": 1.7898, "step": 55150 }, { "epoch": 0.7592951759996699, "grad_norm": 0.18680183589458466, "learning_rate": 0.0001, "loss": 1.788, "step": 55200 }, { "epoch": 0.7599829433692348, "grad_norm": 0.16581743955612183, "learning_rate": 0.0001, "loss": 1.7902, "step": 55250 }, { "epoch": 0.7606707107387997, "grad_norm": 0.16159740090370178, "learning_rate": 0.0001, "loss": 1.7843, "step": 55300 }, { "epoch": 0.7613584781083647, "grad_norm": 0.14381587505340576, "learning_rate": 0.0001, "loss": 1.7918, "step": 55350 }, { "epoch": 0.7620462454779295, "grad_norm": 0.15160152316093445, "learning_rate": 0.0001, "loss": 1.789, "step": 55400 }, { "epoch": 0.7627340128474944, "grad_norm": 0.16748382151126862, "learning_rate": 0.0001, "loss": 1.7865, "step": 55450 }, { "epoch": 0.7634217802170594, "grad_norm": 0.15434932708740234, "learning_rate": 0.0001, "loss": 1.7894, "step": 55500 }, { "epoch": 0.7641095475866243, "grad_norm": 0.16281753778457642, "learning_rate": 0.0001, "loss": 1.7909, "step": 55550 }, { "epoch": 0.7647973149561892, "grad_norm": 0.1581009328365326, "learning_rate": 0.0001, "loss": 1.7872, "step": 55600 }, { "epoch": 0.7654850823257542, "grad_norm": 0.16244924068450928, "learning_rate": 0.0001, "loss": 1.7882, "step": 55650 }, { "epoch": 0.7661728496953191, "grad_norm": 0.1727581024169922, "learning_rate": 0.0001, "loss": 1.7894, "step": 55700 }, { "epoch": 0.766860617064884, "grad_norm": 0.15804524719715118, "learning_rate": 0.0001, "loss": 1.7883, "step": 55750 }, { "epoch": 0.7675483844344488, "grad_norm": 0.16742980480194092, "learning_rate": 0.0001, "loss": 1.7883, "step": 55800 }, { "epoch": 0.7682361518040138, "grad_norm": 0.15518859028816223, "learning_rate": 0.0001, "loss": 1.7877, "step": 55850 }, { "epoch": 0.7689239191735787, "grad_norm": 0.14549891650676727, "learning_rate": 0.0001, "loss": 1.7894, "step": 55900 }, { "epoch": 0.7696116865431436, "grad_norm": 0.15677410364151, "learning_rate": 0.0001, "loss": 1.7868, "step": 55950 }, { "epoch": 0.7702994539127086, "grad_norm": 0.1627907007932663, "learning_rate": 0.0001, "loss": 1.7861, "step": 56000 }, { "epoch": 0.7709872212822735, "grad_norm": 0.17789112031459808, "learning_rate": 0.0001, "loss": 1.7917, "step": 56050 }, { "epoch": 0.7716749886518384, "grad_norm": 0.17732852697372437, "learning_rate": 0.0001, "loss": 1.7885, "step": 56100 }, { "epoch": 0.7723627560214034, "grad_norm": 0.16175003349781036, "learning_rate": 0.0001, "loss": 1.7847, "step": 56150 }, { "epoch": 0.7730505233909682, "grad_norm": 0.16384829580783844, "learning_rate": 0.0001, "loss": 1.7879, "step": 56200 }, { "epoch": 0.7737382907605331, "grad_norm": 0.18334250152111053, "learning_rate": 0.0001, "loss": 1.7841, "step": 56250 }, { "epoch": 0.7744260581300981, "grad_norm": 0.16775920987129211, "learning_rate": 0.0001, "loss": 1.7844, "step": 56300 }, { "epoch": 0.775113825499663, "grad_norm": 0.15945740044116974, "learning_rate": 0.0001, "loss": 1.7867, "step": 56350 }, { "epoch": 0.7758015928692279, "grad_norm": 0.16826015710830688, "learning_rate": 0.0001, "loss": 1.7874, "step": 56400 }, { "epoch": 0.7764893602387928, "grad_norm": 0.16733418405056, "learning_rate": 0.0001, "loss": 1.7843, "step": 56450 }, { "epoch": 0.7771771276083578, "grad_norm": 0.17716175317764282, "learning_rate": 0.0001, "loss": 1.7852, "step": 56500 }, { "epoch": 0.7778648949779227, "grad_norm": 0.15145139396190643, "learning_rate": 0.0001, "loss": 1.7864, "step": 56550 }, { "epoch": 0.7785526623474875, "grad_norm": 0.1650010645389557, "learning_rate": 0.0001, "loss": 1.788, "step": 56600 }, { "epoch": 0.7792404297170525, "grad_norm": 0.15676827728748322, "learning_rate": 0.0001, "loss": 1.7863, "step": 56650 }, { "epoch": 0.7799281970866174, "grad_norm": 0.15251976251602173, "learning_rate": 0.0001, "loss": 1.7894, "step": 56700 }, { "epoch": 0.7806159644561823, "grad_norm": 0.16107071936130524, "learning_rate": 0.0001, "loss": 1.7872, "step": 56750 }, { "epoch": 0.7813037318257473, "grad_norm": 0.16008871793746948, "learning_rate": 0.0001, "loss": 1.7879, "step": 56800 }, { "epoch": 0.7819914991953122, "grad_norm": 0.1748703122138977, "learning_rate": 0.0001, "loss": 1.7883, "step": 56850 }, { "epoch": 0.7826792665648771, "grad_norm": 0.1847066432237625, "learning_rate": 0.0001, "loss": 1.7878, "step": 56900 }, { "epoch": 0.7833670339344421, "grad_norm": 0.14105017483234406, "learning_rate": 0.0001, "loss": 1.7872, "step": 56950 }, { "epoch": 0.784054801304007, "grad_norm": 0.1463741511106491, "learning_rate": 0.0001, "loss": 1.784, "step": 57000 }, { "epoch": 0.7847425686735718, "grad_norm": 0.15982814133167267, "learning_rate": 0.0001, "loss": 1.7904, "step": 57050 }, { "epoch": 0.7854303360431367, "grad_norm": 0.15282031893730164, "learning_rate": 0.0001, "loss": 1.788, "step": 57100 }, { "epoch": 0.7861181034127017, "grad_norm": 0.16466231644153595, "learning_rate": 0.0001, "loss": 1.7862, "step": 57150 }, { "epoch": 0.7868058707822666, "grad_norm": 0.16176077723503113, "learning_rate": 0.0001, "loss": 1.7883, "step": 57200 }, { "epoch": 0.7874936381518315, "grad_norm": 0.16768991947174072, "learning_rate": 0.0001, "loss": 1.791, "step": 57250 }, { "epoch": 0.7881814055213965, "grad_norm": 0.15378397703170776, "learning_rate": 0.0001, "loss": 1.7889, "step": 57300 }, { "epoch": 0.7888691728909614, "grad_norm": 0.16845440864562988, "learning_rate": 0.0001, "loss": 1.7865, "step": 57350 }, { "epoch": 0.7895569402605263, "grad_norm": 0.16859596967697144, "learning_rate": 0.0001, "loss": 1.7893, "step": 57400 }, { "epoch": 0.7902447076300912, "grad_norm": 0.17096339166164398, "learning_rate": 0.0001, "loss": 1.7842, "step": 57450 }, { "epoch": 0.7909324749996561, "grad_norm": 0.19546246528625488, "learning_rate": 0.0001, "loss": 1.7859, "step": 57500 }, { "epoch": 0.791620242369221, "grad_norm": 0.15690521895885468, "learning_rate": 0.0001, "loss": 1.7905, "step": 57550 }, { "epoch": 0.7923080097387859, "grad_norm": 0.15288680791854858, "learning_rate": 0.0001, "loss": 1.7871, "step": 57600 }, { "epoch": 0.7929957771083509, "grad_norm": 0.15947267413139343, "learning_rate": 0.0001, "loss": 1.7851, "step": 57650 }, { "epoch": 0.7936835444779158, "grad_norm": 0.1813030242919922, "learning_rate": 0.0001, "loss": 1.7833, "step": 57700 }, { "epoch": 0.7943713118474807, "grad_norm": 0.16709686815738678, "learning_rate": 0.0001, "loss": 1.7908, "step": 57750 }, { "epoch": 0.7950590792170457, "grad_norm": 0.19110731780529022, "learning_rate": 0.0001, "loss": 1.7845, "step": 57800 }, { "epoch": 0.7957468465866105, "grad_norm": 0.15795393288135529, "learning_rate": 0.0001, "loss": 1.7908, "step": 57850 }, { "epoch": 0.7964346139561754, "grad_norm": 0.14493565261363983, "learning_rate": 0.0001, "loss": 1.7893, "step": 57900 }, { "epoch": 0.7971223813257404, "grad_norm": 0.14182139933109283, "learning_rate": 0.0001, "loss": 1.7883, "step": 57950 }, { "epoch": 0.7978101486953053, "grad_norm": 0.14074084162712097, "learning_rate": 0.0001, "loss": 1.7857, "step": 58000 }, { "epoch": 0.7984979160648702, "grad_norm": 0.1791408807039261, "learning_rate": 0.0001, "loss": 1.7889, "step": 58050 }, { "epoch": 0.7991856834344352, "grad_norm": 0.17944924533367157, "learning_rate": 0.0001, "loss": 1.7884, "step": 58100 }, { "epoch": 0.7998734508040001, "grad_norm": 0.19336557388305664, "learning_rate": 0.0001, "loss": 1.786, "step": 58150 }, { "epoch": 0.800561218173565, "grad_norm": 0.14197582006454468, "learning_rate": 0.0001, "loss": 1.7834, "step": 58200 }, { "epoch": 0.8012489855431298, "grad_norm": 0.17862093448638916, "learning_rate": 0.0001, "loss": 1.7859, "step": 58250 }, { "epoch": 0.8019367529126948, "grad_norm": 0.15174590051174164, "learning_rate": 0.0001, "loss": 1.7883, "step": 58300 }, { "epoch": 0.8026245202822597, "grad_norm": 0.15902046859264374, "learning_rate": 0.0001, "loss": 1.7859, "step": 58350 }, { "epoch": 0.8033122876518246, "grad_norm": 0.1593545824289322, "learning_rate": 0.0001, "loss": 1.7871, "step": 58400 }, { "epoch": 0.8040000550213896, "grad_norm": 0.16780108213424683, "learning_rate": 0.0001, "loss": 1.7892, "step": 58450 }, { "epoch": 0.8046878223909545, "grad_norm": 0.16704651713371277, "learning_rate": 0.0001, "loss": 1.7827, "step": 58500 }, { "epoch": 0.8053755897605194, "grad_norm": 0.20908869802951813, "learning_rate": 0.0001, "loss": 1.7868, "step": 58550 }, { "epoch": 0.8060633571300844, "grad_norm": 0.1484072208404541, "learning_rate": 0.0001, "loss": 1.7905, "step": 58600 }, { "epoch": 0.8067511244996493, "grad_norm": 0.16092757880687714, "learning_rate": 0.0001, "loss": 1.7849, "step": 58650 }, { "epoch": 0.8074388918692141, "grad_norm": 0.15798570215702057, "learning_rate": 0.0001, "loss": 1.7897, "step": 58700 }, { "epoch": 0.808126659238779, "grad_norm": 0.15388993918895721, "learning_rate": 0.0001, "loss": 1.7874, "step": 58750 }, { "epoch": 0.808814426608344, "grad_norm": 0.16136646270751953, "learning_rate": 0.0001, "loss": 1.7866, "step": 58800 }, { "epoch": 0.8095021939779089, "grad_norm": 0.20280751585960388, "learning_rate": 0.0001, "loss": 1.7868, "step": 58850 }, { "epoch": 0.8101899613474738, "grad_norm": 0.16941416263580322, "learning_rate": 0.0001, "loss": 1.7834, "step": 58900 }, { "epoch": 0.8108777287170388, "grad_norm": 0.1597299724817276, "learning_rate": 0.0001, "loss": 1.7823, "step": 58950 }, { "epoch": 0.8115654960866037, "grad_norm": 0.1581617146730423, "learning_rate": 0.0001, "loss": 1.7902, "step": 59000 }, { "epoch": 0.8122532634561686, "grad_norm": 0.17084243893623352, "learning_rate": 0.0001, "loss": 1.7873, "step": 59050 }, { "epoch": 0.8129410308257335, "grad_norm": 0.16124476492404938, "learning_rate": 0.0001, "loss": 1.7894, "step": 59100 }, { "epoch": 0.8136287981952984, "grad_norm": 0.15042969584465027, "learning_rate": 0.0001, "loss": 1.7873, "step": 59150 }, { "epoch": 0.8143165655648633, "grad_norm": 0.14492358267307281, "learning_rate": 0.0001, "loss": 1.7836, "step": 59200 }, { "epoch": 0.8150043329344283, "grad_norm": 0.17020314931869507, "learning_rate": 0.0001, "loss": 1.7859, "step": 59250 }, { "epoch": 0.8156921003039932, "grad_norm": 0.1630934178829193, "learning_rate": 0.0001, "loss": 1.7841, "step": 59300 }, { "epoch": 0.8163798676735581, "grad_norm": 0.17032647132873535, "learning_rate": 0.0001, "loss": 1.7851, "step": 59350 }, { "epoch": 0.817067635043123, "grad_norm": 0.15546603500843048, "learning_rate": 0.0001, "loss": 1.7866, "step": 59400 }, { "epoch": 0.817755402412688, "grad_norm": 0.1688961237668991, "learning_rate": 0.0001, "loss": 1.7858, "step": 59450 }, { "epoch": 0.8184431697822528, "grad_norm": 0.15222899615764618, "learning_rate": 0.0001, "loss": 1.7848, "step": 59500 }, { "epoch": 0.8191309371518177, "grad_norm": 0.15309302508831024, "learning_rate": 0.0001, "loss": 1.7847, "step": 59550 }, { "epoch": 0.8198187045213827, "grad_norm": 0.1601337045431137, "learning_rate": 0.0001, "loss": 1.7861, "step": 59600 }, { "epoch": 0.8205064718909476, "grad_norm": 0.14973758161067963, "learning_rate": 0.0001, "loss": 1.7893, "step": 59650 }, { "epoch": 0.8211942392605125, "grad_norm": 0.17928583920001984, "learning_rate": 0.0001, "loss": 1.7841, "step": 59700 }, { "epoch": 0.8218820066300775, "grad_norm": 0.1628539264202118, "learning_rate": 0.0001, "loss": 1.7861, "step": 59750 }, { "epoch": 0.8225697739996424, "grad_norm": 0.1617124229669571, "learning_rate": 0.0001, "loss": 1.7837, "step": 59800 }, { "epoch": 0.8232575413692073, "grad_norm": 0.16710211336612701, "learning_rate": 0.0001, "loss": 1.7843, "step": 59850 }, { "epoch": 0.8239453087387723, "grad_norm": 0.18266211450099945, "learning_rate": 0.0001, "loss": 1.7882, "step": 59900 }, { "epoch": 0.8246330761083371, "grad_norm": 0.15460216999053955, "learning_rate": 0.0001, "loss": 1.7856, "step": 59950 }, { "epoch": 0.825320843477902, "grad_norm": 0.19238495826721191, "learning_rate": 0.0001, "loss": 1.7867, "step": 60000 }, { "epoch": 0.8260086108474669, "grad_norm": 0.17882536351680756, "learning_rate": 0.0001, "loss": 1.79, "step": 60050 }, { "epoch": 0.8266963782170319, "grad_norm": 0.17022471129894257, "learning_rate": 0.0001, "loss": 1.7843, "step": 60100 }, { "epoch": 0.8273841455865968, "grad_norm": 0.16253788769245148, "learning_rate": 0.0001, "loss": 1.7842, "step": 60150 }, { "epoch": 0.8280719129561617, "grad_norm": 0.1684889793395996, "learning_rate": 0.0001, "loss": 1.7871, "step": 60200 }, { "epoch": 0.8287596803257267, "grad_norm": 0.1623234748840332, "learning_rate": 0.0001, "loss": 1.7812, "step": 60250 }, { "epoch": 0.8294474476952916, "grad_norm": 0.14207519590854645, "learning_rate": 0.0001, "loss": 1.7873, "step": 60300 }, { "epoch": 0.8301352150648564, "grad_norm": 0.15550558269023895, "learning_rate": 0.0001, "loss": 1.7876, "step": 60350 }, { "epoch": 0.8308229824344214, "grad_norm": 0.16578029096126556, "learning_rate": 0.0001, "loss": 1.7804, "step": 60400 }, { "epoch": 0.8315107498039863, "grad_norm": 0.16406333446502686, "learning_rate": 0.0001, "loss": 1.7837, "step": 60450 }, { "epoch": 0.8321985171735512, "grad_norm": 0.1568935364484787, "learning_rate": 0.0001, "loss": 1.786, "step": 60500 }, { "epoch": 0.8328862845431161, "grad_norm": 0.17918673157691956, "learning_rate": 0.0001, "loss": 1.7877, "step": 60550 }, { "epoch": 0.8335740519126811, "grad_norm": 0.14733350276947021, "learning_rate": 0.0001, "loss": 1.7821, "step": 60600 }, { "epoch": 0.834261819282246, "grad_norm": 0.14916177093982697, "learning_rate": 0.0001, "loss": 1.7862, "step": 60650 }, { "epoch": 0.8349495866518108, "grad_norm": 0.15052981674671173, "learning_rate": 0.0001, "loss": 1.7892, "step": 60700 }, { "epoch": 0.8356373540213758, "grad_norm": 0.1831791251897812, "learning_rate": 0.0001, "loss": 1.7844, "step": 60750 }, { "epoch": 0.8363251213909407, "grad_norm": 0.16115884482860565, "learning_rate": 0.0001, "loss": 1.7827, "step": 60800 }, { "epoch": 0.8370128887605056, "grad_norm": 0.15721943974494934, "learning_rate": 0.0001, "loss": 1.7862, "step": 60850 }, { "epoch": 0.8377006561300706, "grad_norm": 0.1528850942850113, "learning_rate": 0.0001, "loss": 1.7852, "step": 60900 }, { "epoch": 0.8383884234996355, "grad_norm": 0.16134890913963318, "learning_rate": 0.0001, "loss": 1.7875, "step": 60950 }, { "epoch": 0.8390761908692004, "grad_norm": 0.16336651146411896, "learning_rate": 0.0001, "loss": 1.7848, "step": 61000 }, { "epoch": 0.8397639582387654, "grad_norm": 0.16578875482082367, "learning_rate": 0.0001, "loss": 1.7858, "step": 61050 }, { "epoch": 0.8404517256083303, "grad_norm": 0.16235701739788055, "learning_rate": 0.0001, "loss": 1.7869, "step": 61100 }, { "epoch": 0.8411394929778951, "grad_norm": 0.16650299727916718, "learning_rate": 0.0001, "loss": 1.7868, "step": 61150 }, { "epoch": 0.84182726034746, "grad_norm": 0.148828387260437, "learning_rate": 0.0001, "loss": 1.7827, "step": 61200 }, { "epoch": 0.842515027717025, "grad_norm": 0.1572546660900116, "learning_rate": 0.0001, "loss": 1.7846, "step": 61250 }, { "epoch": 0.8432027950865899, "grad_norm": 0.15572214126586914, "learning_rate": 0.0001, "loss": 1.788, "step": 61300 }, { "epoch": 0.8438905624561548, "grad_norm": 0.18148384988307953, "learning_rate": 0.0001, "loss": 1.7829, "step": 61350 }, { "epoch": 0.8445783298257198, "grad_norm": 0.16225239634513855, "learning_rate": 0.0001, "loss": 1.787, "step": 61400 }, { "epoch": 0.8452660971952847, "grad_norm": 0.1546306014060974, "learning_rate": 0.0001, "loss": 1.7886, "step": 61450 }, { "epoch": 0.8459538645648496, "grad_norm": 0.1589781790971756, "learning_rate": 0.0001, "loss": 1.7876, "step": 61500 }, { "epoch": 0.8466416319344146, "grad_norm": 0.16938839852809906, "learning_rate": 0.0001, "loss": 1.7805, "step": 61550 }, { "epoch": 0.8473293993039794, "grad_norm": 0.17635032534599304, "learning_rate": 0.0001, "loss": 1.7836, "step": 61600 }, { "epoch": 0.8480171666735443, "grad_norm": 0.16436606645584106, "learning_rate": 0.0001, "loss": 1.7829, "step": 61650 }, { "epoch": 0.8487049340431092, "grad_norm": 0.15410180389881134, "learning_rate": 0.0001, "loss": 1.7833, "step": 61700 }, { "epoch": 0.8493927014126742, "grad_norm": 0.15711359679698944, "learning_rate": 0.0001, "loss": 1.7855, "step": 61750 }, { "epoch": 0.8500804687822391, "grad_norm": 0.14257673919200897, "learning_rate": 0.0001, "loss": 1.7846, "step": 61800 }, { "epoch": 0.850768236151804, "grad_norm": 0.1770082414150238, "learning_rate": 0.0001, "loss": 1.786, "step": 61850 }, { "epoch": 0.851456003521369, "grad_norm": 0.14938481152057648, "learning_rate": 0.0001, "loss": 1.7841, "step": 61900 }, { "epoch": 0.8521437708909338, "grad_norm": 0.16232655942440033, "learning_rate": 0.0001, "loss": 1.7872, "step": 61950 }, { "epoch": 0.8528315382604987, "grad_norm": 0.14662796258926392, "learning_rate": 0.0001, "loss": 1.7846, "step": 62000 }, { "epoch": 0.8535193056300637, "grad_norm": 0.15960827469825745, "learning_rate": 0.0001, "loss": 1.7868, "step": 62050 }, { "epoch": 0.8542070729996286, "grad_norm": 0.1585722714662552, "learning_rate": 0.0001, "loss": 1.7833, "step": 62100 }, { "epoch": 0.8548948403691935, "grad_norm": 0.15847063064575195, "learning_rate": 0.0001, "loss": 1.7861, "step": 62150 }, { "epoch": 0.8555826077387585, "grad_norm": 0.1581469178199768, "learning_rate": 0.0001, "loss": 1.7872, "step": 62200 }, { "epoch": 0.8562703751083234, "grad_norm": 0.18087923526763916, "learning_rate": 0.0001, "loss": 1.7837, "step": 62250 }, { "epoch": 0.8569581424778883, "grad_norm": 0.15878331661224365, "learning_rate": 0.0001, "loss": 1.7865, "step": 62300 }, { "epoch": 0.8576459098474531, "grad_norm": 0.1652536690235138, "learning_rate": 0.0001, "loss": 1.7864, "step": 62350 }, { "epoch": 0.8583336772170181, "grad_norm": 0.16467753052711487, "learning_rate": 0.0001, "loss": 1.788, "step": 62400 }, { "epoch": 0.859021444586583, "grad_norm": 0.17342518270015717, "learning_rate": 0.0001, "loss": 1.7853, "step": 62450 }, { "epoch": 0.8597092119561479, "grad_norm": 0.15487852692604065, "learning_rate": 0.0001, "loss": 1.7861, "step": 62500 }, { "epoch": 0.8603969793257129, "grad_norm": 0.16185085475444794, "learning_rate": 0.0001, "loss": 1.7891, "step": 62550 }, { "epoch": 0.8610847466952778, "grad_norm": 0.18629157543182373, "learning_rate": 0.0001, "loss": 1.7836, "step": 62600 }, { "epoch": 0.8617725140648427, "grad_norm": 0.20009976625442505, "learning_rate": 0.0001, "loss": 1.7849, "step": 62650 }, { "epoch": 0.8624602814344077, "grad_norm": 0.16432398557662964, "learning_rate": 0.0001, "loss": 1.786, "step": 62700 }, { "epoch": 0.8631480488039726, "grad_norm": 0.16151119768619537, "learning_rate": 0.0001, "loss": 1.7838, "step": 62750 }, { "epoch": 0.8638358161735374, "grad_norm": 0.16223236918449402, "learning_rate": 0.0001, "loss": 1.7857, "step": 62800 }, { "epoch": 0.8645235835431024, "grad_norm": 0.15118102729320526, "learning_rate": 0.0001, "loss": 1.7824, "step": 62850 }, { "epoch": 0.8652113509126673, "grad_norm": 0.15173585712909698, "learning_rate": 0.0001, "loss": 1.7858, "step": 62900 }, { "epoch": 0.8658991182822322, "grad_norm": 0.1547808051109314, "learning_rate": 0.0001, "loss": 1.7806, "step": 62950 }, { "epoch": 0.8665868856517971, "grad_norm": 0.1542670577764511, "learning_rate": 0.0001, "loss": 1.7816, "step": 63000 }, { "epoch": 0.8672746530213621, "grad_norm": 0.16760842502117157, "learning_rate": 0.0001, "loss": 1.7845, "step": 63050 }, { "epoch": 0.867962420390927, "grad_norm": 0.17703787982463837, "learning_rate": 0.0001, "loss": 1.788, "step": 63100 }, { "epoch": 0.8686501877604919, "grad_norm": 0.1573743224143982, "learning_rate": 0.0001, "loss": 1.7792, "step": 63150 }, { "epoch": 0.8693379551300568, "grad_norm": 0.1451522409915924, "learning_rate": 0.0001, "loss": 1.7854, "step": 63200 }, { "epoch": 0.8700257224996217, "grad_norm": 0.17078782618045807, "learning_rate": 0.0001, "loss": 1.784, "step": 63250 }, { "epoch": 0.8707134898691866, "grad_norm": 0.15471959114074707, "learning_rate": 0.0001, "loss": 1.7832, "step": 63300 }, { "epoch": 0.8714012572387516, "grad_norm": 0.16724149882793427, "learning_rate": 0.0001, "loss": 1.7783, "step": 63350 }, { "epoch": 0.8720890246083165, "grad_norm": 0.15160906314849854, "learning_rate": 0.0001, "loss": 1.7843, "step": 63400 }, { "epoch": 0.8727767919778814, "grad_norm": 0.156820610165596, "learning_rate": 0.0001, "loss": 1.7856, "step": 63450 }, { "epoch": 0.8734645593474463, "grad_norm": 0.16410048305988312, "learning_rate": 0.0001, "loss": 1.7845, "step": 63500 }, { "epoch": 0.8741523267170113, "grad_norm": 0.16022023558616638, "learning_rate": 0.0001, "loss": 1.7801, "step": 63550 }, { "epoch": 0.8748400940865761, "grad_norm": 0.1775195300579071, "learning_rate": 0.0001, "loss": 1.7824, "step": 63600 }, { "epoch": 0.875527861456141, "grad_norm": 0.17621392011642456, "learning_rate": 0.0001, "loss": 1.7792, "step": 63650 }, { "epoch": 0.876215628825706, "grad_norm": 0.17508172988891602, "learning_rate": 0.0001, "loss": 1.785, "step": 63700 }, { "epoch": 0.8769033961952709, "grad_norm": 0.167220801115036, "learning_rate": 0.0001, "loss": 1.7838, "step": 63750 }, { "epoch": 0.8775911635648358, "grad_norm": 0.22981862723827362, "learning_rate": 0.0001, "loss": 1.7885, "step": 63800 }, { "epoch": 0.8782789309344008, "grad_norm": 0.17177161574363708, "learning_rate": 0.0001, "loss": 1.7846, "step": 63850 }, { "epoch": 0.8789666983039657, "grad_norm": 0.16599243879318237, "learning_rate": 0.0001, "loss": 1.7819, "step": 63900 }, { "epoch": 0.8796544656735306, "grad_norm": 0.17125064134597778, "learning_rate": 0.0001, "loss": 1.7839, "step": 63950 }, { "epoch": 0.8803422330430956, "grad_norm": 0.17469707131385803, "learning_rate": 0.0001, "loss": 1.7797, "step": 64000 }, { "epoch": 0.8810300004126604, "grad_norm": 0.16639864444732666, "learning_rate": 0.0001, "loss": 1.7833, "step": 64050 }, { "epoch": 0.8817177677822253, "grad_norm": 0.16656282544136047, "learning_rate": 0.0001, "loss": 1.7816, "step": 64100 }, { "epoch": 0.8824055351517902, "grad_norm": 0.14526651799678802, "learning_rate": 0.0001, "loss": 1.7817, "step": 64150 }, { "epoch": 0.8830933025213552, "grad_norm": 0.1783958077430725, "learning_rate": 0.0001, "loss": 1.7828, "step": 64200 }, { "epoch": 0.8837810698909201, "grad_norm": 0.16352634131908417, "learning_rate": 0.0001, "loss": 1.7807, "step": 64250 }, { "epoch": 0.884468837260485, "grad_norm": 0.16130295395851135, "learning_rate": 0.0001, "loss": 1.7803, "step": 64300 }, { "epoch": 0.88515660463005, "grad_norm": 0.16286851465702057, "learning_rate": 0.0001, "loss": 1.7866, "step": 64350 }, { "epoch": 0.8858443719996149, "grad_norm": 0.16668406128883362, "learning_rate": 0.0001, "loss": 1.7805, "step": 64400 }, { "epoch": 0.8865321393691797, "grad_norm": 0.16575850546360016, "learning_rate": 0.0001, "loss": 1.7803, "step": 64450 }, { "epoch": 0.8872199067387447, "grad_norm": 0.16535095870494843, "learning_rate": 0.0001, "loss": 1.7795, "step": 64500 }, { "epoch": 0.8879076741083096, "grad_norm": 0.14137853682041168, "learning_rate": 0.0001, "loss": 1.7854, "step": 64550 }, { "epoch": 0.8885954414778745, "grad_norm": 0.14880156517028809, "learning_rate": 0.0001, "loss": 1.7862, "step": 64600 }, { "epoch": 0.8892832088474394, "grad_norm": 0.17448197305202484, "learning_rate": 0.0001, "loss": 1.7847, "step": 64650 }, { "epoch": 0.8899709762170044, "grad_norm": 0.1944260448217392, "learning_rate": 0.0001, "loss": 1.786, "step": 64700 }, { "epoch": 0.8906587435865693, "grad_norm": 0.1693488508462906, "learning_rate": 0.0001, "loss": 1.7857, "step": 64750 }, { "epoch": 0.8913465109561342, "grad_norm": 0.16250942647457123, "learning_rate": 0.0001, "loss": 1.7835, "step": 64800 }, { "epoch": 0.8920342783256991, "grad_norm": 0.1573057919740677, "learning_rate": 0.0001, "loss": 1.782, "step": 64850 }, { "epoch": 0.892722045695264, "grad_norm": 0.19034920632839203, "learning_rate": 0.0001, "loss": 1.782, "step": 64900 }, { "epoch": 0.8934098130648289, "grad_norm": 0.13963682949543, "learning_rate": 0.0001, "loss": 1.7887, "step": 64950 }, { "epoch": 0.8940975804343939, "grad_norm": 0.25064077973365784, "learning_rate": 0.0001, "loss": 1.7873, "step": 65000 }, { "epoch": 0.8947853478039588, "grad_norm": 0.17574715614318848, "learning_rate": 0.0001, "loss": 1.7841, "step": 65050 }, { "epoch": 0.8954731151735237, "grad_norm": 0.156754732131958, "learning_rate": 0.0001, "loss": 1.7807, "step": 65100 }, { "epoch": 0.8961608825430887, "grad_norm": 0.17132636904716492, "learning_rate": 0.0001, "loss": 1.7801, "step": 65150 }, { "epoch": 0.8968486499126536, "grad_norm": 0.15248049795627594, "learning_rate": 0.0001, "loss": 1.781, "step": 65200 }, { "epoch": 0.8975364172822184, "grad_norm": 0.1603154093027115, "learning_rate": 0.0001, "loss": 1.7836, "step": 65250 }, { "epoch": 0.8982241846517833, "grad_norm": 0.14862816035747528, "learning_rate": 0.0001, "loss": 1.7823, "step": 65300 }, { "epoch": 0.8989119520213483, "grad_norm": 0.17050820589065552, "learning_rate": 0.0001, "loss": 1.7856, "step": 65350 }, { "epoch": 0.8995997193909132, "grad_norm": 0.16287332773208618, "learning_rate": 0.0001, "loss": 1.7833, "step": 65400 }, { "epoch": 0.9002874867604781, "grad_norm": 0.15486200153827667, "learning_rate": 0.0001, "loss": 1.7804, "step": 65450 }, { "epoch": 0.9009752541300431, "grad_norm": 0.16483095288276672, "learning_rate": 0.0001, "loss": 1.7845, "step": 65500 }, { "epoch": 0.901663021499608, "grad_norm": 0.15963926911354065, "learning_rate": 0.0001, "loss": 1.7865, "step": 65550 }, { "epoch": 0.9023507888691729, "grad_norm": 0.14927932620048523, "learning_rate": 0.0001, "loss": 1.7814, "step": 65600 }, { "epoch": 0.9030385562387379, "grad_norm": 0.15622937679290771, "learning_rate": 0.0001, "loss": 1.7841, "step": 65650 }, { "epoch": 0.9037263236083027, "grad_norm": 0.14870509505271912, "learning_rate": 0.0001, "loss": 1.7865, "step": 65700 }, { "epoch": 0.9044140909778676, "grad_norm": 0.16585543751716614, "learning_rate": 0.0001, "loss": 1.7803, "step": 65750 }, { "epoch": 0.9051018583474326, "grad_norm": 0.16925722360610962, "learning_rate": 0.0001, "loss": 1.7905, "step": 65800 }, { "epoch": 0.9057896257169975, "grad_norm": 0.16086918115615845, "learning_rate": 0.0001, "loss": 1.7818, "step": 65850 }, { "epoch": 0.9064773930865624, "grad_norm": 0.17064189910888672, "learning_rate": 0.0001, "loss": 1.7829, "step": 65900 }, { "epoch": 0.9071651604561273, "grad_norm": 0.1507936716079712, "learning_rate": 0.0001, "loss": 1.7826, "step": 65950 }, { "epoch": 0.9078529278256923, "grad_norm": 0.16139142215251923, "learning_rate": 0.0001, "loss": 1.7832, "step": 66000 }, { "epoch": 0.9085406951952572, "grad_norm": 0.14373824000358582, "learning_rate": 0.0001, "loss": 1.7834, "step": 66050 }, { "epoch": 0.909228462564822, "grad_norm": 0.14268267154693604, "learning_rate": 0.0001, "loss": 1.7832, "step": 66100 }, { "epoch": 0.909916229934387, "grad_norm": 0.14548690617084503, "learning_rate": 0.0001, "loss": 1.7827, "step": 66150 }, { "epoch": 0.9106039973039519, "grad_norm": 0.1726326048374176, "learning_rate": 0.0001, "loss": 1.7799, "step": 66200 }, { "epoch": 0.9112917646735168, "grad_norm": 0.1607373058795929, "learning_rate": 0.0001, "loss": 1.7809, "step": 66250 }, { "epoch": 0.9119795320430818, "grad_norm": 0.14730975031852722, "learning_rate": 0.0001, "loss": 1.7791, "step": 66300 }, { "epoch": 0.9126672994126467, "grad_norm": 0.1616540104150772, "learning_rate": 0.0001, "loss": 1.7791, "step": 66350 }, { "epoch": 0.9133550667822116, "grad_norm": 0.16029463708400726, "learning_rate": 0.0001, "loss": 1.7828, "step": 66400 }, { "epoch": 0.9140428341517765, "grad_norm": 0.15002845227718353, "learning_rate": 0.0001, "loss": 1.7812, "step": 66450 }, { "epoch": 0.9147306015213414, "grad_norm": 0.14482907950878143, "learning_rate": 0.0001, "loss": 1.7802, "step": 66500 }, { "epoch": 0.9154183688909063, "grad_norm": 0.17749476432800293, "learning_rate": 0.0001, "loss": 1.781, "step": 66550 }, { "epoch": 0.9161061362604712, "grad_norm": 0.15776415169239044, "learning_rate": 0.0001, "loss": 1.7816, "step": 66600 }, { "epoch": 0.9167939036300362, "grad_norm": 0.149980366230011, "learning_rate": 0.0001, "loss": 1.7756, "step": 66650 }, { "epoch": 0.9174816709996011, "grad_norm": 0.16899780929088593, "learning_rate": 0.0001, "loss": 1.7814, "step": 66700 }, { "epoch": 0.918169438369166, "grad_norm": 0.17424631118774414, "learning_rate": 0.0001, "loss": 1.7781, "step": 66750 }, { "epoch": 0.918857205738731, "grad_norm": 0.1580991894006729, "learning_rate": 0.0001, "loss": 1.7801, "step": 66800 }, { "epoch": 0.9195449731082959, "grad_norm": 0.16126061975955963, "learning_rate": 0.0001, "loss": 1.782, "step": 66850 }, { "epoch": 0.9202327404778607, "grad_norm": 0.15646252036094666, "learning_rate": 0.0001, "loss": 1.7828, "step": 66900 }, { "epoch": 0.9209205078474257, "grad_norm": 0.17129796743392944, "learning_rate": 0.0001, "loss": 1.7844, "step": 66950 }, { "epoch": 0.9216082752169906, "grad_norm": 0.1756673902273178, "learning_rate": 0.0001, "loss": 1.7839, "step": 67000 }, { "epoch": 0.9222960425865555, "grad_norm": 0.15259510278701782, "learning_rate": 0.0001, "loss": 1.7795, "step": 67050 }, { "epoch": 0.9229838099561204, "grad_norm": 0.1639316827058792, "learning_rate": 0.0001, "loss": 1.7843, "step": 67100 }, { "epoch": 0.9236715773256854, "grad_norm": 0.17190176248550415, "learning_rate": 0.0001, "loss": 1.78, "step": 67150 }, { "epoch": 0.9243593446952503, "grad_norm": 0.16864174604415894, "learning_rate": 0.0001, "loss": 1.7852, "step": 67200 }, { "epoch": 0.9250471120648152, "grad_norm": 0.15548075735569, "learning_rate": 0.0001, "loss": 1.7828, "step": 67250 }, { "epoch": 0.9257348794343802, "grad_norm": 0.16301994025707245, "learning_rate": 0.0001, "loss": 1.7846, "step": 67300 }, { "epoch": 0.926422646803945, "grad_norm": 0.1735038459300995, "learning_rate": 0.0001, "loss": 1.7798, "step": 67350 }, { "epoch": 0.9271104141735099, "grad_norm": 0.1380920112133026, "learning_rate": 0.0001, "loss": 1.7806, "step": 67400 }, { "epoch": 0.9277981815430749, "grad_norm": 0.15920446813106537, "learning_rate": 0.0001, "loss": 1.7792, "step": 67450 }, { "epoch": 0.9284859489126398, "grad_norm": 0.17028312385082245, "learning_rate": 0.0001, "loss": 1.7888, "step": 67500 }, { "epoch": 0.9291737162822047, "grad_norm": 0.1769266575574875, "learning_rate": 0.0001, "loss": 1.7814, "step": 67550 }, { "epoch": 0.9298614836517696, "grad_norm": 0.1450556516647339, "learning_rate": 0.0001, "loss": 1.7817, "step": 67600 }, { "epoch": 0.9305492510213346, "grad_norm": 0.16302357614040375, "learning_rate": 0.0001, "loss": 1.7813, "step": 67650 }, { "epoch": 0.9312370183908995, "grad_norm": 0.1574389934539795, "learning_rate": 0.0001, "loss": 1.7776, "step": 67700 }, { "epoch": 0.9319247857604643, "grad_norm": 0.14627063274383545, "learning_rate": 0.0001, "loss": 1.7826, "step": 67750 }, { "epoch": 0.9326125531300293, "grad_norm": 0.18861928582191467, "learning_rate": 0.0001, "loss": 1.781, "step": 67800 }, { "epoch": 0.9333003204995942, "grad_norm": 0.1549026519060135, "learning_rate": 0.0001, "loss": 1.7787, "step": 67850 }, { "epoch": 0.9339880878691591, "grad_norm": 0.1620372235774994, "learning_rate": 0.0001, "loss": 1.7826, "step": 67900 }, { "epoch": 0.9346758552387241, "grad_norm": 0.15894797444343567, "learning_rate": 0.0001, "loss": 1.7818, "step": 67950 }, { "epoch": 0.935363622608289, "grad_norm": 0.19588086009025574, "learning_rate": 0.0001, "loss": 1.7835, "step": 68000 }, { "epoch": 0.9360513899778539, "grad_norm": 0.1861431747674942, "learning_rate": 0.0001, "loss": 1.7815, "step": 68050 }, { "epoch": 0.9367391573474189, "grad_norm": 0.16720125079154968, "learning_rate": 0.0001, "loss": 1.781, "step": 68100 }, { "epoch": 0.9374269247169837, "grad_norm": 0.1603463739156723, "learning_rate": 0.0001, "loss": 1.7788, "step": 68150 }, { "epoch": 0.9381146920865486, "grad_norm": 0.14092972874641418, "learning_rate": 0.0001, "loss": 1.7824, "step": 68200 }, { "epoch": 0.9388024594561135, "grad_norm": 0.1622365266084671, "learning_rate": 0.0001, "loss": 1.7779, "step": 68250 }, { "epoch": 0.9394902268256785, "grad_norm": 0.16566450893878937, "learning_rate": 0.0001, "loss": 1.7789, "step": 68300 }, { "epoch": 0.9401779941952434, "grad_norm": 0.14181503653526306, "learning_rate": 0.0001, "loss": 1.7773, "step": 68350 }, { "epoch": 0.9408657615648083, "grad_norm": 0.16675251722335815, "learning_rate": 0.0001, "loss": 1.7796, "step": 68400 }, { "epoch": 0.9415535289343733, "grad_norm": 0.15481418371200562, "learning_rate": 0.0001, "loss": 1.7797, "step": 68450 }, { "epoch": 0.9422412963039382, "grad_norm": 0.16480682790279388, "learning_rate": 0.0001, "loss": 1.7767, "step": 68500 }, { "epoch": 0.942929063673503, "grad_norm": 0.13726095855236053, "learning_rate": 0.0001, "loss": 1.7799, "step": 68550 }, { "epoch": 0.943616831043068, "grad_norm": 0.1498117446899414, "learning_rate": 0.0001, "loss": 1.7826, "step": 68600 }, { "epoch": 0.9443045984126329, "grad_norm": 0.15102407336235046, "learning_rate": 0.0001, "loss": 1.7807, "step": 68650 }, { "epoch": 0.9449923657821978, "grad_norm": 0.1596510410308838, "learning_rate": 0.0001, "loss": 1.7773, "step": 68700 }, { "epoch": 0.9456801331517628, "grad_norm": 0.15061867237091064, "learning_rate": 0.0001, "loss": 1.7781, "step": 68750 }, { "epoch": 0.9463679005213277, "grad_norm": 0.18302445113658905, "learning_rate": 0.0001, "loss": 1.7801, "step": 68800 }, { "epoch": 0.9470556678908926, "grad_norm": 0.1563147008419037, "learning_rate": 0.0001, "loss": 1.7807, "step": 68850 }, { "epoch": 0.9477434352604575, "grad_norm": 0.1559109389781952, "learning_rate": 0.0001, "loss": 1.779, "step": 68900 }, { "epoch": 0.9484312026300225, "grad_norm": 0.1892656683921814, "learning_rate": 0.0001, "loss": 1.7815, "step": 68950 }, { "epoch": 0.9491189699995873, "grad_norm": 0.16753901541233063, "learning_rate": 0.0001, "loss": 1.779, "step": 69000 }, { "epoch": 0.9498067373691522, "grad_norm": 0.16571739315986633, "learning_rate": 0.0001, "loss": 1.781, "step": 69050 }, { "epoch": 0.9504945047387172, "grad_norm": 0.15618735551834106, "learning_rate": 0.0001, "loss": 1.7801, "step": 69100 }, { "epoch": 0.9511822721082821, "grad_norm": 0.15602505207061768, "learning_rate": 0.0001, "loss": 1.7782, "step": 69150 }, { "epoch": 0.951870039477847, "grad_norm": 0.1441372036933899, "learning_rate": 0.0001, "loss": 1.7808, "step": 69200 }, { "epoch": 0.952557806847412, "grad_norm": 0.16956308484077454, "learning_rate": 0.0001, "loss": 1.7805, "step": 69250 }, { "epoch": 0.9532455742169769, "grad_norm": 0.1570560336112976, "learning_rate": 0.0001, "loss": 1.7829, "step": 69300 }, { "epoch": 0.9539333415865417, "grad_norm": 0.13851186633110046, "learning_rate": 0.0001, "loss": 1.779, "step": 69350 }, { "epoch": 0.9546211089561066, "grad_norm": 0.18309037387371063, "learning_rate": 0.0001, "loss": 1.7772, "step": 69400 }, { "epoch": 0.9553088763256716, "grad_norm": 1.6850249767303467, "learning_rate": 0.0001, "loss": 1.7781, "step": 69450 }, { "epoch": 0.9559966436952365, "grad_norm": 0.1578509509563446, "learning_rate": 0.0001, "loss": 1.7843, "step": 69500 }, { "epoch": 0.9566844110648014, "grad_norm": 0.15330944955348969, "learning_rate": 0.0001, "loss": 1.7785, "step": 69550 }, { "epoch": 0.9573721784343664, "grad_norm": 0.15504170954227448, "learning_rate": 0.0001, "loss": 1.7851, "step": 69600 }, { "epoch": 0.9580599458039313, "grad_norm": 0.17802022397518158, "learning_rate": 0.0001, "loss": 1.7794, "step": 69650 }, { "epoch": 0.9587477131734962, "grad_norm": 0.18508057296276093, "learning_rate": 0.0001, "loss": 1.7827, "step": 69700 }, { "epoch": 0.9594354805430612, "grad_norm": 0.19704073667526245, "learning_rate": 0.0001, "loss": 1.7809, "step": 69750 }, { "epoch": 0.960123247912626, "grad_norm": 0.17070503532886505, "learning_rate": 0.0001, "loss": 1.7791, "step": 69800 }, { "epoch": 0.9608110152821909, "grad_norm": 0.1832980215549469, "learning_rate": 0.0001, "loss": 1.7798, "step": 69850 }, { "epoch": 0.9614987826517559, "grad_norm": 0.15290822088718414, "learning_rate": 0.0001, "loss": 1.7819, "step": 69900 }, { "epoch": 0.9621865500213208, "grad_norm": 0.1691426783800125, "learning_rate": 0.0001, "loss": 1.7792, "step": 69950 }, { "epoch": 0.9628743173908857, "grad_norm": 0.1656666249036789, "learning_rate": 0.0001, "loss": 1.7836, "step": 70000 }, { "epoch": 0.9635620847604506, "grad_norm": 0.15653489530086517, "learning_rate": 0.0001, "loss": 1.7811, "step": 70050 }, { "epoch": 0.9642498521300156, "grad_norm": 0.15945695340633392, "learning_rate": 0.0001, "loss": 1.7789, "step": 70100 }, { "epoch": 0.9649376194995805, "grad_norm": 0.173899307847023, "learning_rate": 0.0001, "loss": 1.782, "step": 70150 }, { "epoch": 0.9656253868691453, "grad_norm": 0.13982714712619781, "learning_rate": 0.0001, "loss": 1.7796, "step": 70200 }, { "epoch": 0.9663131542387103, "grad_norm": 0.16570891439914703, "learning_rate": 0.0001, "loss": 1.7814, "step": 70250 }, { "epoch": 0.9670009216082752, "grad_norm": 0.1680910885334015, "learning_rate": 0.0001, "loss": 1.7797, "step": 70300 }, { "epoch": 0.9676886889778401, "grad_norm": 0.18602094054222107, "learning_rate": 0.0001, "loss": 1.7799, "step": 70350 }, { "epoch": 0.9683764563474051, "grad_norm": 0.15171028673648834, "learning_rate": 0.0001, "loss": 1.7824, "step": 70400 }, { "epoch": 0.96906422371697, "grad_norm": 0.17273007333278656, "learning_rate": 0.0001, "loss": 1.779, "step": 70450 }, { "epoch": 0.9697519910865349, "grad_norm": 0.1841355711221695, "learning_rate": 0.0001, "loss": 1.7849, "step": 70500 }, { "epoch": 0.9704397584560998, "grad_norm": 0.14629191160202026, "learning_rate": 0.0001, "loss": 1.7822, "step": 70550 }, { "epoch": 0.9711275258256648, "grad_norm": 0.19547376036643982, "learning_rate": 0.0001, "loss": 1.7805, "step": 70600 }, { "epoch": 0.9718152931952296, "grad_norm": 0.1695117950439453, "learning_rate": 0.0001, "loss": 1.7808, "step": 70650 }, { "epoch": 0.9725030605647945, "grad_norm": 0.15734167397022247, "learning_rate": 0.0001, "loss": 1.7826, "step": 70700 }, { "epoch": 0.9731908279343595, "grad_norm": 0.15534259378910065, "learning_rate": 0.0001, "loss": 1.7784, "step": 70750 }, { "epoch": 0.9738785953039244, "grad_norm": 0.17524221539497375, "learning_rate": 0.0001, "loss": 1.7802, "step": 70800 }, { "epoch": 0.9745663626734893, "grad_norm": 0.16551004350185394, "learning_rate": 0.0001, "loss": 1.7774, "step": 70850 }, { "epoch": 0.9752541300430543, "grad_norm": 0.18955057859420776, "learning_rate": 0.0001, "loss": 1.7771, "step": 70900 }, { "epoch": 0.9759418974126192, "grad_norm": 0.1564190834760666, "learning_rate": 0.0001, "loss": 1.7836, "step": 70950 }, { "epoch": 0.976629664782184, "grad_norm": 0.18080365657806396, "learning_rate": 0.0001, "loss": 1.7809, "step": 71000 }, { "epoch": 0.977317432151749, "grad_norm": 0.17052794992923737, "learning_rate": 0.0001, "loss": 1.7785, "step": 71050 }, { "epoch": 0.9780051995213139, "grad_norm": 0.15679985284805298, "learning_rate": 0.0001, "loss": 1.777, "step": 71100 }, { "epoch": 0.9786929668908788, "grad_norm": 0.14611759781837463, "learning_rate": 0.0001, "loss": 1.7831, "step": 71150 }, { "epoch": 0.9793807342604437, "grad_norm": 0.17994888126850128, "learning_rate": 0.0001, "loss": 1.7811, "step": 71200 }, { "epoch": 0.9800685016300087, "grad_norm": 0.1523408442735672, "learning_rate": 0.0001, "loss": 1.7819, "step": 71250 }, { "epoch": 0.9807562689995736, "grad_norm": 0.14828313887119293, "learning_rate": 0.0001, "loss": 1.7766, "step": 71300 }, { "epoch": 0.9814440363691385, "grad_norm": 0.1424998790025711, "learning_rate": 0.0001, "loss": 1.7788, "step": 71350 }, { "epoch": 0.9821318037387035, "grad_norm": 0.14312104880809784, "learning_rate": 0.0001, "loss": 1.7783, "step": 71400 }, { "epoch": 0.9828195711082683, "grad_norm": 0.14697466790676117, "learning_rate": 0.0001, "loss": 1.7808, "step": 71450 }, { "epoch": 0.9835073384778332, "grad_norm": 0.16363121569156647, "learning_rate": 0.0001, "loss": 1.7783, "step": 71500 }, { "epoch": 0.9841951058473982, "grad_norm": 0.1542508453130722, "learning_rate": 0.0001, "loss": 1.7817, "step": 71550 }, { "epoch": 0.9848828732169631, "grad_norm": 0.1389523297548294, "learning_rate": 0.0001, "loss": 1.7791, "step": 71600 }, { "epoch": 0.985570640586528, "grad_norm": 0.15856057405471802, "learning_rate": 0.0001, "loss": 1.7833, "step": 71650 }, { "epoch": 0.986258407956093, "grad_norm": 0.15098857879638672, "learning_rate": 0.0001, "loss": 1.7764, "step": 71700 }, { "epoch": 0.9869461753256579, "grad_norm": 0.14318101108074188, "learning_rate": 0.0001, "loss": 1.7782, "step": 71750 }, { "epoch": 0.9876339426952228, "grad_norm": 0.16459529101848602, "learning_rate": 0.0001, "loss": 1.7774, "step": 71800 }, { "epoch": 0.9883217100647876, "grad_norm": 0.14705689251422882, "learning_rate": 0.0001, "loss": 1.7813, "step": 71850 }, { "epoch": 0.9890094774343526, "grad_norm": 0.2091091424226761, "learning_rate": 0.0001, "loss": 1.7819, "step": 71900 }, { "epoch": 0.9896972448039175, "grad_norm": 0.1711418330669403, "learning_rate": 0.0001, "loss": 1.7782, "step": 71950 }, { "epoch": 0.9903850121734824, "grad_norm": 0.15255683660507202, "learning_rate": 0.0001, "loss": 1.7851, "step": 72000 }, { "epoch": 0.9910727795430474, "grad_norm": 0.17501915991306305, "learning_rate": 0.0001, "loss": 1.7824, "step": 72050 }, { "epoch": 0.9917605469126123, "grad_norm": 0.1605847328901291, "learning_rate": 0.0001, "loss": 1.7802, "step": 72100 }, { "epoch": 0.9924483142821772, "grad_norm": 0.14898759126663208, "learning_rate": 0.0001, "loss": 1.7836, "step": 72150 }, { "epoch": 0.9931360816517422, "grad_norm": 0.15966999530792236, "learning_rate": 0.0001, "loss": 1.7773, "step": 72200 }, { "epoch": 0.993823849021307, "grad_norm": 0.14977654814720154, "learning_rate": 0.0001, "loss": 1.7764, "step": 72250 }, { "epoch": 0.9945116163908719, "grad_norm": 0.16077259182929993, "learning_rate": 0.0001, "loss": 1.7789, "step": 72300 }, { "epoch": 0.9951993837604368, "grad_norm": 0.1603011190891266, "learning_rate": 0.0001, "loss": 1.7756, "step": 72350 }, { "epoch": 0.9958871511300018, "grad_norm": 0.17926956713199615, "learning_rate": 0.0001, "loss": 1.7805, "step": 72400 }, { "epoch": 0.9965749184995667, "grad_norm": 0.15523836016654968, "learning_rate": 0.0001, "loss": 1.7816, "step": 72450 }, { "epoch": 0.9972626858691316, "grad_norm": 0.15533694624900818, "learning_rate": 0.0001, "loss": 1.7817, "step": 72500 }, { "epoch": 0.9979504532386966, "grad_norm": 0.17167145013809204, "learning_rate": 0.0001, "loss": 1.7793, "step": 72550 }, { "epoch": 0.9986382206082615, "grad_norm": 0.1536383181810379, "learning_rate": 0.0001, "loss": 1.7792, "step": 72600 }, { "epoch": 0.9993259879778263, "grad_norm": 0.15611621737480164, "learning_rate": 0.0001, "loss": 1.7798, "step": 72650 } ], "logging_steps": 50, "max_steps": 72699, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.7804002887190855e+21, "train_batch_size": 4, "trial_name": null, "trial_params": null }