diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10204 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500.0, + "global_step": 72699, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006877673695649183, + "grad_norm": 0.19629216194152832, + "learning_rate": 0.0001, + "loss": 1.9938, + "step": 50 + }, + { + "epoch": 0.0013755347391298366, + "grad_norm": 0.24478936195373535, + "learning_rate": 0.0001, + "loss": 1.9596, + "step": 100 + }, + { + "epoch": 0.002063302108694755, + "grad_norm": 0.22170111536979675, + "learning_rate": 0.0001, + "loss": 1.9526, + "step": 150 + }, + { + "epoch": 0.0027510694782596733, + "grad_norm": 0.2311343252658844, + "learning_rate": 0.0001, + "loss": 1.9471, + "step": 200 + }, + { + "epoch": 0.003438836847824592, + "grad_norm": 0.20621128380298615, + "learning_rate": 0.0001, + "loss": 1.9435, + "step": 250 + }, + { + "epoch": 0.00412660421738951, + "grad_norm": 0.22248196601867676, + "learning_rate": 0.0001, + "loss": 1.9396, + "step": 300 + }, + { + "epoch": 0.004814371586954428, + "grad_norm": 0.20232965052127838, + "learning_rate": 0.0001, + "loss": 1.9362, + "step": 350 + }, + { + "epoch": 0.0055021389565193465, + "grad_norm": 0.21155332028865814, + "learning_rate": 0.0001, + "loss": 1.9285, + "step": 400 + }, + { + "epoch": 0.006189906326084265, + "grad_norm": 0.25176894664764404, + "learning_rate": 0.0001, + "loss": 1.9319, + "step": 450 + }, + { + "epoch": 0.006877673695649184, + "grad_norm": 0.21027377247810364, + "learning_rate": 0.0001, + "loss": 1.9304, + "step": 500 + }, + { + "epoch": 0.007565441065214102, + "grad_norm": 0.2434869110584259, + "learning_rate": 0.0001, + "loss": 1.93, + "step": 550 + }, + { + "epoch": 0.00825320843477902, + "grad_norm": 0.1908300668001175, + "learning_rate": 0.0001, + "loss": 1.9254, + "step": 600 + }, + { + "epoch": 0.008940975804343939, + "grad_norm": 0.2221110612154007, + "learning_rate": 0.0001, + "loss": 1.9226, + "step": 650 + }, + { + "epoch": 0.009628743173908856, + "grad_norm": 0.22620266675949097, + "learning_rate": 0.0001, + "loss": 1.9262, + "step": 700 + }, + { + "epoch": 0.010316510543473776, + "grad_norm": 0.21463032066822052, + "learning_rate": 0.0001, + "loss": 1.9201, + "step": 750 + }, + { + "epoch": 0.011004277913038693, + "grad_norm": 0.19383488595485687, + "learning_rate": 0.0001, + "loss": 1.9236, + "step": 800 + }, + { + "epoch": 0.011692045282603612, + "grad_norm": 0.22416023910045624, + "learning_rate": 0.0001, + "loss": 1.9186, + "step": 850 + }, + { + "epoch": 0.01237981265216853, + "grad_norm": 0.2285342961549759, + "learning_rate": 0.0001, + "loss": 1.9207, + "step": 900 + }, + { + "epoch": 0.013067580021733449, + "grad_norm": 0.20416004955768585, + "learning_rate": 0.0001, + "loss": 1.9171, + "step": 950 + }, + { + "epoch": 0.013755347391298368, + "grad_norm": 0.20697274804115295, + "learning_rate": 0.0001, + "loss": 1.9177, + "step": 1000 + }, + { + "epoch": 0.014443114760863286, + "grad_norm": 0.2317676991224289, + "learning_rate": 0.0001, + "loss": 1.9138, + "step": 1050 + }, + { + "epoch": 0.015130882130428205, + "grad_norm": 0.21276156604290009, + "learning_rate": 0.0001, + "loss": 1.9111, + "step": 1100 + }, + { + "epoch": 0.015818649499993124, + "grad_norm": 0.20574018359184265, + "learning_rate": 0.0001, + "loss": 1.9155, + "step": 1150 + }, + { + "epoch": 0.01650641686955804, + "grad_norm": 0.19410207867622375, + "learning_rate": 0.0001, + "loss": 1.9073, + "step": 1200 + }, + { + "epoch": 0.01719418423912296, + "grad_norm": 0.19570203125476837, + "learning_rate": 0.0001, + "loss": 1.9085, + "step": 1250 + }, + { + "epoch": 0.017881951608687878, + "grad_norm": 0.2081640362739563, + "learning_rate": 0.0001, + "loss": 1.9093, + "step": 1300 + }, + { + "epoch": 0.018569718978252797, + "grad_norm": 0.19721642136573792, + "learning_rate": 0.0001, + "loss": 1.9086, + "step": 1350 + }, + { + "epoch": 0.019257486347817713, + "grad_norm": 0.202309712767601, + "learning_rate": 0.0001, + "loss": 1.9039, + "step": 1400 + }, + { + "epoch": 0.019945253717382632, + "grad_norm": 0.22128838300704956, + "learning_rate": 0.0001, + "loss": 1.9067, + "step": 1450 + }, + { + "epoch": 0.02063302108694755, + "grad_norm": 0.25011196732521057, + "learning_rate": 0.0001, + "loss": 1.9055, + "step": 1500 + }, + { + "epoch": 0.02132078845651247, + "grad_norm": 0.20523639023303986, + "learning_rate": 0.0001, + "loss": 1.9039, + "step": 1550 + }, + { + "epoch": 0.022008555826077386, + "grad_norm": 0.2327890396118164, + "learning_rate": 0.0001, + "loss": 1.9059, + "step": 1600 + }, + { + "epoch": 0.022696323195642305, + "grad_norm": 0.22426384687423706, + "learning_rate": 0.0001, + "loss": 1.9033, + "step": 1650 + }, + { + "epoch": 0.023384090565207225, + "grad_norm": 0.2116124927997589, + "learning_rate": 0.0001, + "loss": 1.902, + "step": 1700 + }, + { + "epoch": 0.024071857934772144, + "grad_norm": 0.21172966063022614, + "learning_rate": 0.0001, + "loss": 1.9007, + "step": 1750 + }, + { + "epoch": 0.02475962530433706, + "grad_norm": 0.19443170726299286, + "learning_rate": 0.0001, + "loss": 1.9003, + "step": 1800 + }, + { + "epoch": 0.02544739267390198, + "grad_norm": 0.21195723116397858, + "learning_rate": 0.0001, + "loss": 1.9015, + "step": 1850 + }, + { + "epoch": 0.026135160043466898, + "grad_norm": 0.22141411900520325, + "learning_rate": 0.0001, + "loss": 1.8957, + "step": 1900 + }, + { + "epoch": 0.026822927413031817, + "grad_norm": 0.22995401918888092, + "learning_rate": 0.0001, + "loss": 1.8979, + "step": 1950 + }, + { + "epoch": 0.027510694782596736, + "grad_norm": 0.2246379405260086, + "learning_rate": 0.0001, + "loss": 1.8966, + "step": 2000 + }, + { + "epoch": 0.028198462152161652, + "grad_norm": 0.22695621848106384, + "learning_rate": 0.0001, + "loss": 1.895, + "step": 2050 + }, + { + "epoch": 0.02888622952172657, + "grad_norm": 0.19988253712654114, + "learning_rate": 0.0001, + "loss": 1.8934, + "step": 2100 + }, + { + "epoch": 0.02957399689129149, + "grad_norm": 0.21754223108291626, + "learning_rate": 0.0001, + "loss": 1.8972, + "step": 2150 + }, + { + "epoch": 0.03026176426085641, + "grad_norm": 0.19053423404693604, + "learning_rate": 0.0001, + "loss": 1.8912, + "step": 2200 + }, + { + "epoch": 0.030949531630421325, + "grad_norm": 0.21589875221252441, + "learning_rate": 0.0001, + "loss": 1.8935, + "step": 2250 + }, + { + "epoch": 0.03163729899998625, + "grad_norm": 0.2087436020374298, + "learning_rate": 0.0001, + "loss": 1.8923, + "step": 2300 + }, + { + "epoch": 0.03232506636955116, + "grad_norm": 0.2261374592781067, + "learning_rate": 0.0001, + "loss": 1.8914, + "step": 2350 + }, + { + "epoch": 0.03301283373911608, + "grad_norm": 0.1949523240327835, + "learning_rate": 0.0001, + "loss": 1.8905, + "step": 2400 + }, + { + "epoch": 0.033700601108681, + "grad_norm": 0.21544858813285828, + "learning_rate": 0.0001, + "loss": 1.8909, + "step": 2450 + }, + { + "epoch": 0.03438836847824592, + "grad_norm": 0.20145681500434875, + "learning_rate": 0.0001, + "loss": 1.8876, + "step": 2500 + }, + { + "epoch": 0.03507613584781084, + "grad_norm": 0.21707232296466827, + "learning_rate": 0.0001, + "loss": 1.8915, + "step": 2550 + }, + { + "epoch": 0.035763903217375756, + "grad_norm": 0.1982990950345993, + "learning_rate": 0.0001, + "loss": 1.888, + "step": 2600 + }, + { + "epoch": 0.036451670586940675, + "grad_norm": 0.2223712056875229, + "learning_rate": 0.0001, + "loss": 1.8868, + "step": 2650 + }, + { + "epoch": 0.037139437956505594, + "grad_norm": 0.19649413228034973, + "learning_rate": 0.0001, + "loss": 1.8869, + "step": 2700 + }, + { + "epoch": 0.03782720532607051, + "grad_norm": 0.22767962515354156, + "learning_rate": 0.0001, + "loss": 1.8901, + "step": 2750 + }, + { + "epoch": 0.038514972695635426, + "grad_norm": 0.19138416647911072, + "learning_rate": 0.0001, + "loss": 1.8916, + "step": 2800 + }, + { + "epoch": 0.039202740065200345, + "grad_norm": 0.19380460679531097, + "learning_rate": 0.0001, + "loss": 1.8889, + "step": 2850 + }, + { + "epoch": 0.039890507434765264, + "grad_norm": 0.19751518964767456, + "learning_rate": 0.0001, + "loss": 1.8868, + "step": 2900 + }, + { + "epoch": 0.04057827480433018, + "grad_norm": 0.21071408689022064, + "learning_rate": 0.0001, + "loss": 1.8862, + "step": 2950 + }, + { + "epoch": 0.0412660421738951, + "grad_norm": 0.19260670244693756, + "learning_rate": 0.0001, + "loss": 1.8827, + "step": 3000 + }, + { + "epoch": 0.04195380954346002, + "grad_norm": 0.19185714423656464, + "learning_rate": 0.0001, + "loss": 1.8866, + "step": 3050 + }, + { + "epoch": 0.04264157691302494, + "grad_norm": 0.24877017736434937, + "learning_rate": 0.0001, + "loss": 1.8854, + "step": 3100 + }, + { + "epoch": 0.04332934428258986, + "grad_norm": 0.1947249323129654, + "learning_rate": 0.0001, + "loss": 1.8842, + "step": 3150 + }, + { + "epoch": 0.04401711165215477, + "grad_norm": 0.20210722088813782, + "learning_rate": 0.0001, + "loss": 1.8837, + "step": 3200 + }, + { + "epoch": 0.04470487902171969, + "grad_norm": 0.22242394089698792, + "learning_rate": 0.0001, + "loss": 1.8817, + "step": 3250 + }, + { + "epoch": 0.04539264639128461, + "grad_norm": 0.2049330472946167, + "learning_rate": 0.0001, + "loss": 1.8845, + "step": 3300 + }, + { + "epoch": 0.04608041376084953, + "grad_norm": 0.19368599355220795, + "learning_rate": 0.0001, + "loss": 1.884, + "step": 3350 + }, + { + "epoch": 0.04676818113041445, + "grad_norm": 0.1886671483516693, + "learning_rate": 0.0001, + "loss": 1.883, + "step": 3400 + }, + { + "epoch": 0.04745594849997937, + "grad_norm": 0.19359445571899414, + "learning_rate": 0.0001, + "loss": 1.8824, + "step": 3450 + }, + { + "epoch": 0.04814371586954429, + "grad_norm": 0.195325568318367, + "learning_rate": 0.0001, + "loss": 1.8806, + "step": 3500 + }, + { + "epoch": 0.04883148323910921, + "grad_norm": 0.21584388613700867, + "learning_rate": 0.0001, + "loss": 1.879, + "step": 3550 + }, + { + "epoch": 0.04951925060867412, + "grad_norm": 0.19085532426834106, + "learning_rate": 0.0001, + "loss": 1.8817, + "step": 3600 + }, + { + "epoch": 0.05020701797823904, + "grad_norm": 0.2133578211069107, + "learning_rate": 0.0001, + "loss": 1.8797, + "step": 3650 + }, + { + "epoch": 0.05089478534780396, + "grad_norm": 0.19587628543376923, + "learning_rate": 0.0001, + "loss": 1.8806, + "step": 3700 + }, + { + "epoch": 0.051582552717368876, + "grad_norm": 0.22608409821987152, + "learning_rate": 0.0001, + "loss": 1.8803, + "step": 3750 + }, + { + "epoch": 0.052270320086933796, + "grad_norm": 0.20075012743473053, + "learning_rate": 0.0001, + "loss": 1.8773, + "step": 3800 + }, + { + "epoch": 0.052958087456498715, + "grad_norm": 0.2007540464401245, + "learning_rate": 0.0001, + "loss": 1.8775, + "step": 3850 + }, + { + "epoch": 0.053645854826063634, + "grad_norm": 0.20465299487113953, + "learning_rate": 0.0001, + "loss": 1.88, + "step": 3900 + }, + { + "epoch": 0.05433362219562855, + "grad_norm": 0.19921573996543884, + "learning_rate": 0.0001, + "loss": 1.8749, + "step": 3950 + }, + { + "epoch": 0.05502138956519347, + "grad_norm": 0.19196507334709167, + "learning_rate": 0.0001, + "loss": 1.8808, + "step": 4000 + }, + { + "epoch": 0.055709156934758385, + "grad_norm": 0.20529140532016754, + "learning_rate": 0.0001, + "loss": 1.8787, + "step": 4050 + }, + { + "epoch": 0.056396924304323304, + "grad_norm": 0.23082584142684937, + "learning_rate": 0.0001, + "loss": 1.8752, + "step": 4100 + }, + { + "epoch": 0.05708469167388822, + "grad_norm": 0.18597312271595, + "learning_rate": 0.0001, + "loss": 1.8793, + "step": 4150 + }, + { + "epoch": 0.05777245904345314, + "grad_norm": 0.23071937263011932, + "learning_rate": 0.0001, + "loss": 1.8782, + "step": 4200 + }, + { + "epoch": 0.05846022641301806, + "grad_norm": 0.19141189754009247, + "learning_rate": 0.0001, + "loss": 1.875, + "step": 4250 + }, + { + "epoch": 0.05914799378258298, + "grad_norm": 0.23278222978115082, + "learning_rate": 0.0001, + "loss": 1.8805, + "step": 4300 + }, + { + "epoch": 0.0598357611521479, + "grad_norm": 0.21169067919254303, + "learning_rate": 0.0001, + "loss": 1.8753, + "step": 4350 + }, + { + "epoch": 0.06052352852171282, + "grad_norm": 0.2010953575372696, + "learning_rate": 0.0001, + "loss": 1.8758, + "step": 4400 + }, + { + "epoch": 0.06121129589127773, + "grad_norm": 0.19260814785957336, + "learning_rate": 0.0001, + "loss": 1.8731, + "step": 4450 + }, + { + "epoch": 0.06189906326084265, + "grad_norm": 0.19751103222370148, + "learning_rate": 0.0001, + "loss": 1.8719, + "step": 4500 + }, + { + "epoch": 0.06258683063040757, + "grad_norm": 0.21297581493854523, + "learning_rate": 0.0001, + "loss": 1.875, + "step": 4550 + }, + { + "epoch": 0.0632745979999725, + "grad_norm": 0.2128158062696457, + "learning_rate": 0.0001, + "loss": 1.8711, + "step": 4600 + }, + { + "epoch": 0.06396236536953741, + "grad_norm": 0.18719784915447235, + "learning_rate": 0.0001, + "loss": 1.8741, + "step": 4650 + }, + { + "epoch": 0.06465013273910232, + "grad_norm": 0.2352721244096756, + "learning_rate": 0.0001, + "loss": 1.8717, + "step": 4700 + }, + { + "epoch": 0.06533790010866725, + "grad_norm": 0.22228975594043732, + "learning_rate": 0.0001, + "loss": 1.8707, + "step": 4750 + }, + { + "epoch": 0.06602566747823216, + "grad_norm": 0.18716222047805786, + "learning_rate": 0.0001, + "loss": 1.8705, + "step": 4800 + }, + { + "epoch": 0.06671343484779708, + "grad_norm": 0.22167149186134338, + "learning_rate": 0.0001, + "loss": 1.8739, + "step": 4850 + }, + { + "epoch": 0.067401202217362, + "grad_norm": 0.24794642627239227, + "learning_rate": 0.0001, + "loss": 1.8747, + "step": 4900 + }, + { + "epoch": 0.06808896958692692, + "grad_norm": 0.18762528896331787, + "learning_rate": 0.0001, + "loss": 1.8702, + "step": 4950 + }, + { + "epoch": 0.06877673695649184, + "grad_norm": 0.19063113629817963, + "learning_rate": 0.0001, + "loss": 1.8733, + "step": 5000 + }, + { + "epoch": 0.06946450432605676, + "grad_norm": 0.1940603107213974, + "learning_rate": 0.0001, + "loss": 1.8685, + "step": 5050 + }, + { + "epoch": 0.07015227169562167, + "grad_norm": 0.19752484560012817, + "learning_rate": 0.0001, + "loss": 1.8762, + "step": 5100 + }, + { + "epoch": 0.07084003906518659, + "grad_norm": 0.23486199975013733, + "learning_rate": 0.0001, + "loss": 1.8708, + "step": 5150 + }, + { + "epoch": 0.07152780643475151, + "grad_norm": 0.20315973460674286, + "learning_rate": 0.0001, + "loss": 1.8676, + "step": 5200 + }, + { + "epoch": 0.07221557380431642, + "grad_norm": 0.1925646960735321, + "learning_rate": 0.0001, + "loss": 1.8634, + "step": 5250 + }, + { + "epoch": 0.07290334117388135, + "grad_norm": 0.20540663599967957, + "learning_rate": 0.0001, + "loss": 1.8706, + "step": 5300 + }, + { + "epoch": 0.07359110854344626, + "grad_norm": 0.23649099469184875, + "learning_rate": 0.0001, + "loss": 1.8685, + "step": 5350 + }, + { + "epoch": 0.07427887591301119, + "grad_norm": 0.23272614181041718, + "learning_rate": 0.0001, + "loss": 1.8724, + "step": 5400 + }, + { + "epoch": 0.0749666432825761, + "grad_norm": 0.1887608915567398, + "learning_rate": 0.0001, + "loss": 1.8707, + "step": 5450 + }, + { + "epoch": 0.07565441065214101, + "grad_norm": 0.18964676558971405, + "learning_rate": 0.0001, + "loss": 1.8642, + "step": 5500 + }, + { + "epoch": 0.07634217802170594, + "grad_norm": 0.20009934902191162, + "learning_rate": 0.0001, + "loss": 1.8657, + "step": 5550 + }, + { + "epoch": 0.07702994539127085, + "grad_norm": 0.1821998506784439, + "learning_rate": 0.0001, + "loss": 1.8673, + "step": 5600 + }, + { + "epoch": 0.07771771276083578, + "grad_norm": 0.18905235826969147, + "learning_rate": 0.0001, + "loss": 1.8687, + "step": 5650 + }, + { + "epoch": 0.07840548013040069, + "grad_norm": 0.19986678659915924, + "learning_rate": 0.0001, + "loss": 1.8627, + "step": 5700 + }, + { + "epoch": 0.07909324749996562, + "grad_norm": 0.1904374659061432, + "learning_rate": 0.0001, + "loss": 1.8633, + "step": 5750 + }, + { + "epoch": 0.07978101486953053, + "grad_norm": 0.19536761939525604, + "learning_rate": 0.0001, + "loss": 1.8685, + "step": 5800 + }, + { + "epoch": 0.08046878223909545, + "grad_norm": 0.18209826946258545, + "learning_rate": 0.0001, + "loss": 1.8599, + "step": 5850 + }, + { + "epoch": 0.08115654960866037, + "grad_norm": 0.21385939419269562, + "learning_rate": 0.0001, + "loss": 1.866, + "step": 5900 + }, + { + "epoch": 0.08184431697822528, + "grad_norm": 0.20338542759418488, + "learning_rate": 0.0001, + "loss": 1.8669, + "step": 5950 + }, + { + "epoch": 0.0825320843477902, + "grad_norm": 0.19536232948303223, + "learning_rate": 0.0001, + "loss": 1.8644, + "step": 6000 + }, + { + "epoch": 0.08321985171735512, + "grad_norm": 0.18480873107910156, + "learning_rate": 0.0001, + "loss": 1.8668, + "step": 6050 + }, + { + "epoch": 0.08390761908692004, + "grad_norm": 0.18024863302707672, + "learning_rate": 0.0001, + "loss": 1.8638, + "step": 6100 + }, + { + "epoch": 0.08459538645648496, + "grad_norm": 0.18774175643920898, + "learning_rate": 0.0001, + "loss": 1.8652, + "step": 6150 + }, + { + "epoch": 0.08528315382604988, + "grad_norm": 0.2518685460090637, + "learning_rate": 0.0001, + "loss": 1.8649, + "step": 6200 + }, + { + "epoch": 0.0859709211956148, + "grad_norm": 0.20646634697914124, + "learning_rate": 0.0001, + "loss": 1.8658, + "step": 6250 + }, + { + "epoch": 0.08665868856517972, + "grad_norm": 0.19222316145896912, + "learning_rate": 0.0001, + "loss": 1.8642, + "step": 6300 + }, + { + "epoch": 0.08734645593474463, + "grad_norm": 0.19531960785388947, + "learning_rate": 0.0001, + "loss": 1.8641, + "step": 6350 + }, + { + "epoch": 0.08803422330430954, + "grad_norm": 0.18218673765659332, + "learning_rate": 0.0001, + "loss": 1.8599, + "step": 6400 + }, + { + "epoch": 0.08872199067387447, + "grad_norm": 0.18686556816101074, + "learning_rate": 0.0001, + "loss": 1.8588, + "step": 6450 + }, + { + "epoch": 0.08940975804343938, + "grad_norm": 0.20718005299568176, + "learning_rate": 0.0001, + "loss": 1.8595, + "step": 6500 + }, + { + "epoch": 0.09009752541300431, + "grad_norm": 0.17680206894874573, + "learning_rate": 0.0001, + "loss": 1.8625, + "step": 6550 + }, + { + "epoch": 0.09078529278256922, + "grad_norm": 0.25429028272628784, + "learning_rate": 0.0001, + "loss": 1.8635, + "step": 6600 + }, + { + "epoch": 0.09147306015213415, + "grad_norm": 0.19778478145599365, + "learning_rate": 0.0001, + "loss": 1.8618, + "step": 6650 + }, + { + "epoch": 0.09216082752169906, + "grad_norm": 0.21198226511478424, + "learning_rate": 0.0001, + "loss": 1.8613, + "step": 6700 + }, + { + "epoch": 0.09284859489126399, + "grad_norm": 0.1819111704826355, + "learning_rate": 0.0001, + "loss": 1.8601, + "step": 6750 + }, + { + "epoch": 0.0935363622608289, + "grad_norm": 0.2141820788383484, + "learning_rate": 0.0001, + "loss": 1.8598, + "step": 6800 + }, + { + "epoch": 0.09422412963039381, + "grad_norm": 0.20356012880802155, + "learning_rate": 0.0001, + "loss": 1.8619, + "step": 6850 + }, + { + "epoch": 0.09491189699995874, + "grad_norm": 0.18998335301876068, + "learning_rate": 0.0001, + "loss": 1.8597, + "step": 6900 + }, + { + "epoch": 0.09559966436952365, + "grad_norm": 0.19086682796478271, + "learning_rate": 0.0001, + "loss": 1.8622, + "step": 6950 + }, + { + "epoch": 0.09628743173908857, + "grad_norm": 0.2049364447593689, + "learning_rate": 0.0001, + "loss": 1.8617, + "step": 7000 + }, + { + "epoch": 0.09697519910865349, + "grad_norm": 0.19833974540233612, + "learning_rate": 0.0001, + "loss": 1.8609, + "step": 7050 + }, + { + "epoch": 0.09766296647821841, + "grad_norm": 0.19551745057106018, + "learning_rate": 0.0001, + "loss": 1.8581, + "step": 7100 + }, + { + "epoch": 0.09835073384778333, + "grad_norm": 0.1846143752336502, + "learning_rate": 0.0001, + "loss": 1.8569, + "step": 7150 + }, + { + "epoch": 0.09903850121734824, + "grad_norm": 0.1906626969575882, + "learning_rate": 0.0001, + "loss": 1.8614, + "step": 7200 + }, + { + "epoch": 0.09972626858691316, + "grad_norm": 0.19115209579467773, + "learning_rate": 0.0001, + "loss": 1.8633, + "step": 7250 + }, + { + "epoch": 0.10041403595647808, + "grad_norm": 0.18704906105995178, + "learning_rate": 0.0001, + "loss": 1.8601, + "step": 7300 + }, + { + "epoch": 0.101101803326043, + "grad_norm": 0.18635210394859314, + "learning_rate": 0.0001, + "loss": 1.8605, + "step": 7350 + }, + { + "epoch": 0.10178957069560791, + "grad_norm": 0.1947161853313446, + "learning_rate": 0.0001, + "loss": 1.861, + "step": 7400 + }, + { + "epoch": 0.10247733806517284, + "grad_norm": 0.22087708115577698, + "learning_rate": 0.0001, + "loss": 1.8553, + "step": 7450 + }, + { + "epoch": 0.10316510543473775, + "grad_norm": 0.1805039346218109, + "learning_rate": 0.0001, + "loss": 1.8591, + "step": 7500 + }, + { + "epoch": 0.10385287280430268, + "grad_norm": 0.19084776937961578, + "learning_rate": 0.0001, + "loss": 1.8561, + "step": 7550 + }, + { + "epoch": 0.10454064017386759, + "grad_norm": 0.20166590809822083, + "learning_rate": 0.0001, + "loss": 1.8584, + "step": 7600 + }, + { + "epoch": 0.1052284075434325, + "grad_norm": 0.1892371028661728, + "learning_rate": 0.0001, + "loss": 1.8526, + "step": 7650 + }, + { + "epoch": 0.10591617491299743, + "grad_norm": 0.22085241973400116, + "learning_rate": 0.0001, + "loss": 1.8561, + "step": 7700 + }, + { + "epoch": 0.10660394228256234, + "grad_norm": 0.186112642288208, + "learning_rate": 0.0001, + "loss": 1.8597, + "step": 7750 + }, + { + "epoch": 0.10729170965212727, + "grad_norm": 0.1959947943687439, + "learning_rate": 0.0001, + "loss": 1.8558, + "step": 7800 + }, + { + "epoch": 0.10797947702169218, + "grad_norm": 0.21492016315460205, + "learning_rate": 0.0001, + "loss": 1.8608, + "step": 7850 + }, + { + "epoch": 0.1086672443912571, + "grad_norm": 0.18600517511367798, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 7900 + }, + { + "epoch": 0.10935501176082202, + "grad_norm": 0.18841132521629333, + "learning_rate": 0.0001, + "loss": 1.8542, + "step": 7950 + }, + { + "epoch": 0.11004277913038694, + "grad_norm": 0.20758236944675446, + "learning_rate": 0.0001, + "loss": 1.8565, + "step": 8000 + }, + { + "epoch": 0.11073054649995186, + "grad_norm": 0.20206254720687866, + "learning_rate": 0.0001, + "loss": 1.8553, + "step": 8050 + }, + { + "epoch": 0.11141831386951677, + "grad_norm": 0.19620998203754425, + "learning_rate": 0.0001, + "loss": 1.8542, + "step": 8100 + }, + { + "epoch": 0.1121060812390817, + "grad_norm": 0.19747626781463623, + "learning_rate": 0.0001, + "loss": 1.8545, + "step": 8150 + }, + { + "epoch": 0.11279384860864661, + "grad_norm": 0.21328890323638916, + "learning_rate": 0.0001, + "loss": 1.8552, + "step": 8200 + }, + { + "epoch": 0.11348161597821153, + "grad_norm": 0.18296054005622864, + "learning_rate": 0.0001, + "loss": 1.8579, + "step": 8250 + }, + { + "epoch": 0.11416938334777645, + "grad_norm": 0.21098335087299347, + "learning_rate": 0.0001, + "loss": 1.8526, + "step": 8300 + }, + { + "epoch": 0.11485715071734137, + "grad_norm": 0.18666841089725494, + "learning_rate": 0.0001, + "loss": 1.8484, + "step": 8350 + }, + { + "epoch": 0.11554491808690628, + "grad_norm": 0.18522906303405762, + "learning_rate": 0.0001, + "loss": 1.8538, + "step": 8400 + }, + { + "epoch": 0.1162326854564712, + "grad_norm": 0.1890312135219574, + "learning_rate": 0.0001, + "loss": 1.8519, + "step": 8450 + }, + { + "epoch": 0.11692045282603612, + "grad_norm": 0.197422057390213, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 8500 + }, + { + "epoch": 0.11760822019560103, + "grad_norm": 0.21355442702770233, + "learning_rate": 0.0001, + "loss": 1.8561, + "step": 8550 + }, + { + "epoch": 0.11829598756516596, + "grad_norm": 0.18543662130832672, + "learning_rate": 0.0001, + "loss": 1.8538, + "step": 8600 + }, + { + "epoch": 0.11898375493473087, + "grad_norm": 0.20849215984344482, + "learning_rate": 0.0001, + "loss": 1.8527, + "step": 8650 + }, + { + "epoch": 0.1196715223042958, + "grad_norm": 0.2109488546848297, + "learning_rate": 0.0001, + "loss": 1.8496, + "step": 8700 + }, + { + "epoch": 0.12035928967386071, + "grad_norm": 0.20195640623569489, + "learning_rate": 0.0001, + "loss": 1.8499, + "step": 8750 + }, + { + "epoch": 0.12104705704342564, + "grad_norm": 0.1749362200498581, + "learning_rate": 0.0001, + "loss": 1.8559, + "step": 8800 + }, + { + "epoch": 0.12173482441299055, + "grad_norm": 0.20881310105323792, + "learning_rate": 0.0001, + "loss": 1.8536, + "step": 8850 + }, + { + "epoch": 0.12242259178255546, + "grad_norm": 0.1801750510931015, + "learning_rate": 0.0001, + "loss": 1.8507, + "step": 8900 + }, + { + "epoch": 0.12311035915212039, + "grad_norm": 0.1898815929889679, + "learning_rate": 0.0001, + "loss": 1.8493, + "step": 8950 + }, + { + "epoch": 0.1237981265216853, + "grad_norm": 0.19754734635353088, + "learning_rate": 0.0001, + "loss": 1.853, + "step": 9000 + }, + { + "epoch": 0.12448589389125023, + "grad_norm": 0.1855219006538391, + "learning_rate": 0.0001, + "loss": 1.8529, + "step": 9050 + }, + { + "epoch": 0.12517366126081514, + "grad_norm": 0.19341996312141418, + "learning_rate": 0.0001, + "loss": 1.8513, + "step": 9100 + }, + { + "epoch": 0.12586142863038005, + "grad_norm": 0.19776052236557007, + "learning_rate": 0.0001, + "loss": 1.8507, + "step": 9150 + }, + { + "epoch": 0.126549195999945, + "grad_norm": 0.185306116938591, + "learning_rate": 0.0001, + "loss": 1.851, + "step": 9200 + }, + { + "epoch": 0.1272369633695099, + "grad_norm": 0.19926750659942627, + "learning_rate": 0.0001, + "loss": 1.8504, + "step": 9250 + }, + { + "epoch": 0.12792473073907482, + "grad_norm": 0.21605028212070465, + "learning_rate": 0.0001, + "loss": 1.8502, + "step": 9300 + }, + { + "epoch": 0.12861249810863973, + "grad_norm": 0.18174859881401062, + "learning_rate": 0.0001, + "loss": 1.8505, + "step": 9350 + }, + { + "epoch": 0.12930026547820464, + "grad_norm": 0.19654984772205353, + "learning_rate": 0.0001, + "loss": 1.8517, + "step": 9400 + }, + { + "epoch": 0.12998803284776958, + "grad_norm": 0.1764276772737503, + "learning_rate": 0.0001, + "loss": 1.8483, + "step": 9450 + }, + { + "epoch": 0.1306758002173345, + "grad_norm": 0.17811571061611176, + "learning_rate": 0.0001, + "loss": 1.8469, + "step": 9500 + }, + { + "epoch": 0.1313635675868994, + "grad_norm": 0.20159000158309937, + "learning_rate": 0.0001, + "loss": 1.8455, + "step": 9550 + }, + { + "epoch": 0.13205133495646432, + "grad_norm": 0.1840062290430069, + "learning_rate": 0.0001, + "loss": 1.8511, + "step": 9600 + }, + { + "epoch": 0.13273910232602926, + "grad_norm": 0.190440833568573, + "learning_rate": 0.0001, + "loss": 1.8474, + "step": 9650 + }, + { + "epoch": 0.13342686969559417, + "grad_norm": 0.20033535361289978, + "learning_rate": 0.0001, + "loss": 1.8479, + "step": 9700 + }, + { + "epoch": 0.13411463706515908, + "grad_norm": 0.1811174899339676, + "learning_rate": 0.0001, + "loss": 1.8504, + "step": 9750 + }, + { + "epoch": 0.134802404434724, + "grad_norm": 0.2073344737291336, + "learning_rate": 0.0001, + "loss": 1.8507, + "step": 9800 + }, + { + "epoch": 0.1354901718042889, + "grad_norm": 0.21762603521347046, + "learning_rate": 0.0001, + "loss": 1.8499, + "step": 9850 + }, + { + "epoch": 0.13617793917385385, + "grad_norm": 0.1864607185125351, + "learning_rate": 0.0001, + "loss": 1.8471, + "step": 9900 + }, + { + "epoch": 0.13686570654341876, + "grad_norm": 0.17837654054164886, + "learning_rate": 0.0001, + "loss": 1.8485, + "step": 9950 + }, + { + "epoch": 0.13755347391298367, + "grad_norm": 0.20498532056808472, + "learning_rate": 0.0001, + "loss": 1.8497, + "step": 10000 + }, + { + "epoch": 0.13824124128254858, + "grad_norm": 0.18355566263198853, + "learning_rate": 0.0001, + "loss": 1.8458, + "step": 10050 + }, + { + "epoch": 0.13892900865211352, + "grad_norm": 0.2033490389585495, + "learning_rate": 0.0001, + "loss": 1.8451, + "step": 10100 + }, + { + "epoch": 0.13961677602167843, + "grad_norm": 0.1855219006538391, + "learning_rate": 0.0001, + "loss": 1.8475, + "step": 10150 + }, + { + "epoch": 0.14030454339124335, + "grad_norm": 0.18876652419567108, + "learning_rate": 0.0001, + "loss": 1.8473, + "step": 10200 + }, + { + "epoch": 0.14099231076080826, + "grad_norm": 0.1731424629688263, + "learning_rate": 0.0001, + "loss": 1.8475, + "step": 10250 + }, + { + "epoch": 0.14168007813037317, + "grad_norm": 0.186906635761261, + "learning_rate": 0.0001, + "loss": 1.8498, + "step": 10300 + }, + { + "epoch": 0.1423678454999381, + "grad_norm": 0.18285425007343292, + "learning_rate": 0.0001, + "loss": 1.8451, + "step": 10350 + }, + { + "epoch": 0.14305561286950302, + "grad_norm": 0.19545456767082214, + "learning_rate": 0.0001, + "loss": 1.8487, + "step": 10400 + }, + { + "epoch": 0.14374338023906794, + "grad_norm": 0.16256272792816162, + "learning_rate": 0.0001, + "loss": 1.8461, + "step": 10450 + }, + { + "epoch": 0.14443114760863285, + "grad_norm": 0.19637931883335114, + "learning_rate": 0.0001, + "loss": 1.8462, + "step": 10500 + }, + { + "epoch": 0.14511891497819776, + "grad_norm": 0.20408660173416138, + "learning_rate": 0.0001, + "loss": 1.8465, + "step": 10550 + }, + { + "epoch": 0.1458066823477627, + "grad_norm": 0.2140285223722458, + "learning_rate": 0.0001, + "loss": 1.8421, + "step": 10600 + }, + { + "epoch": 0.1464944497173276, + "grad_norm": 0.18366774916648865, + "learning_rate": 0.0001, + "loss": 1.8454, + "step": 10650 + }, + { + "epoch": 0.14718221708689253, + "grad_norm": 0.19011645019054413, + "learning_rate": 0.0001, + "loss": 1.8427, + "step": 10700 + }, + { + "epoch": 0.14786998445645744, + "grad_norm": 0.1923753321170807, + "learning_rate": 0.0001, + "loss": 1.8442, + "step": 10750 + }, + { + "epoch": 0.14855775182602238, + "grad_norm": 0.19208142161369324, + "learning_rate": 0.0001, + "loss": 1.8413, + "step": 10800 + }, + { + "epoch": 0.1492455191955873, + "grad_norm": 0.19608841836452484, + "learning_rate": 0.0001, + "loss": 1.8468, + "step": 10850 + }, + { + "epoch": 0.1499332865651522, + "grad_norm": 0.19484341144561768, + "learning_rate": 0.0001, + "loss": 1.849, + "step": 10900 + }, + { + "epoch": 0.15062105393471711, + "grad_norm": 0.18584389984607697, + "learning_rate": 0.0001, + "loss": 1.8416, + "step": 10950 + }, + { + "epoch": 0.15130882130428203, + "grad_norm": 0.1894279420375824, + "learning_rate": 0.0001, + "loss": 1.8454, + "step": 11000 + }, + { + "epoch": 0.15199658867384697, + "grad_norm": 0.19622810184955597, + "learning_rate": 0.0001, + "loss": 1.8449, + "step": 11050 + }, + { + "epoch": 0.15268435604341188, + "grad_norm": 0.18603233993053436, + "learning_rate": 0.0001, + "loss": 1.848, + "step": 11100 + }, + { + "epoch": 0.1533721234129768, + "grad_norm": 0.18146397173404694, + "learning_rate": 0.0001, + "loss": 1.8413, + "step": 11150 + }, + { + "epoch": 0.1540598907825417, + "grad_norm": 0.20820939540863037, + "learning_rate": 0.0001, + "loss": 1.844, + "step": 11200 + }, + { + "epoch": 0.15474765815210664, + "grad_norm": 0.18021373450756073, + "learning_rate": 0.0001, + "loss": 1.8434, + "step": 11250 + }, + { + "epoch": 0.15543542552167156, + "grad_norm": 0.19339635968208313, + "learning_rate": 0.0001, + "loss": 1.8405, + "step": 11300 + }, + { + "epoch": 0.15612319289123647, + "grad_norm": 0.1994727998971939, + "learning_rate": 0.0001, + "loss": 1.8403, + "step": 11350 + }, + { + "epoch": 0.15681096026080138, + "grad_norm": 0.1830483376979828, + "learning_rate": 0.0001, + "loss": 1.8415, + "step": 11400 + }, + { + "epoch": 0.1574987276303663, + "grad_norm": 0.17064842581748962, + "learning_rate": 0.0001, + "loss": 1.8433, + "step": 11450 + }, + { + "epoch": 0.15818649499993123, + "grad_norm": 0.19161944091320038, + "learning_rate": 0.0001, + "loss": 1.8428, + "step": 11500 + }, + { + "epoch": 0.15887426236949614, + "grad_norm": 0.21216394007205963, + "learning_rate": 0.0001, + "loss": 1.8432, + "step": 11550 + }, + { + "epoch": 0.15956202973906106, + "grad_norm": 0.1909138560295105, + "learning_rate": 0.0001, + "loss": 1.8429, + "step": 11600 + }, + { + "epoch": 0.16024979710862597, + "grad_norm": 0.20326951146125793, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 11650 + }, + { + "epoch": 0.1609375644781909, + "grad_norm": 0.19515758752822876, + "learning_rate": 0.0001, + "loss": 1.8448, + "step": 11700 + }, + { + "epoch": 0.16162533184775582, + "grad_norm": 0.2075706273317337, + "learning_rate": 0.0001, + "loss": 1.8439, + "step": 11750 + }, + { + "epoch": 0.16231309921732073, + "grad_norm": 0.21147705614566803, + "learning_rate": 0.0001, + "loss": 1.8433, + "step": 11800 + }, + { + "epoch": 0.16300086658688565, + "grad_norm": 0.18318484723567963, + "learning_rate": 0.0001, + "loss": 1.8383, + "step": 11850 + }, + { + "epoch": 0.16368863395645056, + "grad_norm": 0.18728312849998474, + "learning_rate": 0.0001, + "loss": 1.8426, + "step": 11900 + }, + { + "epoch": 0.1643764013260155, + "grad_norm": 0.20905287563800812, + "learning_rate": 0.0001, + "loss": 1.8421, + "step": 11950 + }, + { + "epoch": 0.1650641686955804, + "grad_norm": 0.18393969535827637, + "learning_rate": 0.0001, + "loss": 1.8408, + "step": 12000 + }, + { + "epoch": 0.16575193606514532, + "grad_norm": 0.18366305530071259, + "learning_rate": 0.0001, + "loss": 1.8365, + "step": 12050 + }, + { + "epoch": 0.16643970343471023, + "grad_norm": 0.19170603156089783, + "learning_rate": 0.0001, + "loss": 1.8416, + "step": 12100 + }, + { + "epoch": 0.16712747080427517, + "grad_norm": 0.172319233417511, + "learning_rate": 0.0001, + "loss": 1.8411, + "step": 12150 + }, + { + "epoch": 0.1678152381738401, + "grad_norm": 0.2174234390258789, + "learning_rate": 0.0001, + "loss": 1.8416, + "step": 12200 + }, + { + "epoch": 0.168503005543405, + "grad_norm": 0.20210625231266022, + "learning_rate": 0.0001, + "loss": 1.8422, + "step": 12250 + }, + { + "epoch": 0.1691907729129699, + "grad_norm": 0.1902657449245453, + "learning_rate": 0.0001, + "loss": 1.8369, + "step": 12300 + }, + { + "epoch": 0.16987854028253482, + "grad_norm": 0.18901073932647705, + "learning_rate": 0.0001, + "loss": 1.8415, + "step": 12350 + }, + { + "epoch": 0.17056630765209976, + "grad_norm": 0.17624430358409882, + "learning_rate": 0.0001, + "loss": 1.8373, + "step": 12400 + }, + { + "epoch": 0.17125407502166468, + "grad_norm": 0.1844191551208496, + "learning_rate": 0.0001, + "loss": 1.8391, + "step": 12450 + }, + { + "epoch": 0.1719418423912296, + "grad_norm": 0.19392350316047668, + "learning_rate": 0.0001, + "loss": 1.8416, + "step": 12500 + }, + { + "epoch": 0.1726296097607945, + "grad_norm": 0.18644706904888153, + "learning_rate": 0.0001, + "loss": 1.8409, + "step": 12550 + }, + { + "epoch": 0.17331737713035944, + "grad_norm": 0.19530895352363586, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 12600 + }, + { + "epoch": 0.17400514449992435, + "grad_norm": 0.18004032969474792, + "learning_rate": 0.0001, + "loss": 1.8419, + "step": 12650 + }, + { + "epoch": 0.17469291186948926, + "grad_norm": 0.20025117695331573, + "learning_rate": 0.0001, + "loss": 1.8379, + "step": 12700 + }, + { + "epoch": 0.17538067923905418, + "grad_norm": 0.17622490227222443, + "learning_rate": 0.0001, + "loss": 1.8364, + "step": 12750 + }, + { + "epoch": 0.1760684466086191, + "grad_norm": 0.19657030701637268, + "learning_rate": 0.0001, + "loss": 1.8364, + "step": 12800 + }, + { + "epoch": 0.17675621397818403, + "grad_norm": 0.19141744077205658, + "learning_rate": 0.0001, + "loss": 1.8388, + "step": 12850 + }, + { + "epoch": 0.17744398134774894, + "grad_norm": 0.23409488797187805, + "learning_rate": 0.0001, + "loss": 1.8392, + "step": 12900 + }, + { + "epoch": 0.17813174871731385, + "grad_norm": 0.19104769825935364, + "learning_rate": 0.0001, + "loss": 1.8407, + "step": 12950 + }, + { + "epoch": 0.17881951608687877, + "grad_norm": 0.1978139728307724, + "learning_rate": 0.0001, + "loss": 1.836, + "step": 13000 + }, + { + "epoch": 0.1795072834564437, + "grad_norm": 0.1839970201253891, + "learning_rate": 0.0001, + "loss": 1.8406, + "step": 13050 + }, + { + "epoch": 0.18019505082600862, + "grad_norm": 0.1969710737466812, + "learning_rate": 0.0001, + "loss": 1.8382, + "step": 13100 + }, + { + "epoch": 0.18088281819557353, + "grad_norm": 0.21036314964294434, + "learning_rate": 0.0001, + "loss": 1.8372, + "step": 13150 + }, + { + "epoch": 0.18157058556513844, + "grad_norm": 0.18064115941524506, + "learning_rate": 0.0001, + "loss": 1.8387, + "step": 13200 + }, + { + "epoch": 0.18225835293470335, + "grad_norm": 0.20280593633651733, + "learning_rate": 0.0001, + "loss": 1.8345, + "step": 13250 + }, + { + "epoch": 0.1829461203042683, + "grad_norm": 0.21196794509887695, + "learning_rate": 0.0001, + "loss": 1.8403, + "step": 13300 + }, + { + "epoch": 0.1836338876738332, + "grad_norm": 0.18529263138771057, + "learning_rate": 0.0001, + "loss": 1.8395, + "step": 13350 + }, + { + "epoch": 0.18432165504339812, + "grad_norm": 0.20009498298168182, + "learning_rate": 0.0001, + "loss": 1.8418, + "step": 13400 + }, + { + "epoch": 0.18500942241296303, + "grad_norm": 0.1844586879014969, + "learning_rate": 0.0001, + "loss": 1.8388, + "step": 13450 + }, + { + "epoch": 0.18569718978252797, + "grad_norm": 0.17497003078460693, + "learning_rate": 0.0001, + "loss": 1.8374, + "step": 13500 + }, + { + "epoch": 0.18638495715209288, + "grad_norm": 0.21536414325237274, + "learning_rate": 0.0001, + "loss": 1.834, + "step": 13550 + }, + { + "epoch": 0.1870727245216578, + "grad_norm": 0.20212842524051666, + "learning_rate": 0.0001, + "loss": 1.8361, + "step": 13600 + }, + { + "epoch": 0.1877604918912227, + "grad_norm": 0.21032044291496277, + "learning_rate": 0.0001, + "loss": 1.8352, + "step": 13650 + }, + { + "epoch": 0.18844825926078762, + "grad_norm": 0.17547431588172913, + "learning_rate": 0.0001, + "loss": 1.839, + "step": 13700 + }, + { + "epoch": 0.18913602663035256, + "grad_norm": 0.17463110387325287, + "learning_rate": 0.0001, + "loss": 1.8345, + "step": 13750 + }, + { + "epoch": 0.18982379399991747, + "grad_norm": 0.19794687628746033, + "learning_rate": 0.0001, + "loss": 1.8367, + "step": 13800 + }, + { + "epoch": 0.19051156136948239, + "grad_norm": 0.17595866322517395, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 13850 + }, + { + "epoch": 0.1911993287390473, + "grad_norm": 0.19087472558021545, + "learning_rate": 0.0001, + "loss": 1.8377, + "step": 13900 + }, + { + "epoch": 0.1918870961086122, + "grad_norm": 0.1895439624786377, + "learning_rate": 0.0001, + "loss": 1.8392, + "step": 13950 + }, + { + "epoch": 0.19257486347817715, + "grad_norm": 0.19558320939540863, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 14000 + }, + { + "epoch": 0.19326263084774206, + "grad_norm": 0.18495230376720428, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 14050 + }, + { + "epoch": 0.19395039821730697, + "grad_norm": 0.19197221100330353, + "learning_rate": 0.0001, + "loss": 1.8379, + "step": 14100 + }, + { + "epoch": 0.1946381655868719, + "grad_norm": 0.17729446291923523, + "learning_rate": 0.0001, + "loss": 1.8336, + "step": 14150 + }, + { + "epoch": 0.19532593295643683, + "grad_norm": 0.20683547854423523, + "learning_rate": 0.0001, + "loss": 1.8344, + "step": 14200 + }, + { + "epoch": 0.19601370032600174, + "grad_norm": 0.16708314418792725, + "learning_rate": 0.0001, + "loss": 1.8375, + "step": 14250 + }, + { + "epoch": 0.19670146769556665, + "grad_norm": 0.2065526694059372, + "learning_rate": 0.0001, + "loss": 1.8397, + "step": 14300 + }, + { + "epoch": 0.19738923506513156, + "grad_norm": 0.2007008045911789, + "learning_rate": 0.0001, + "loss": 1.8351, + "step": 14350 + }, + { + "epoch": 0.19807700243469648, + "grad_norm": 0.1773243397474289, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 14400 + }, + { + "epoch": 0.19876476980426142, + "grad_norm": 0.1875116229057312, + "learning_rate": 0.0001, + "loss": 1.8379, + "step": 14450 + }, + { + "epoch": 0.19945253717382633, + "grad_norm": 0.19387130439281464, + "learning_rate": 0.0001, + "loss": 1.8343, + "step": 14500 + }, + { + "epoch": 0.20014030454339124, + "grad_norm": 0.17164736986160278, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 14550 + }, + { + "epoch": 0.20082807191295615, + "grad_norm": 0.19135966897010803, + "learning_rate": 0.0001, + "loss": 1.8321, + "step": 14600 + }, + { + "epoch": 0.2015158392825211, + "grad_norm": 0.21152153611183167, + "learning_rate": 0.0001, + "loss": 1.8332, + "step": 14650 + }, + { + "epoch": 0.202203606652086, + "grad_norm": 0.19576500356197357, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 14700 + }, + { + "epoch": 0.20289137402165092, + "grad_norm": 0.21700510382652283, + "learning_rate": 0.0001, + "loss": 1.8381, + "step": 14750 + }, + { + "epoch": 0.20357914139121583, + "grad_norm": 0.18183092772960663, + "learning_rate": 0.0001, + "loss": 1.833, + "step": 14800 + }, + { + "epoch": 0.20426690876078074, + "grad_norm": 0.1678183525800705, + "learning_rate": 0.0001, + "loss": 1.8365, + "step": 14850 + }, + { + "epoch": 0.20495467613034568, + "grad_norm": 0.1790694147348404, + "learning_rate": 0.0001, + "loss": 1.8323, + "step": 14900 + }, + { + "epoch": 0.2056424434999106, + "grad_norm": 0.17274673283100128, + "learning_rate": 0.0001, + "loss": 1.8357, + "step": 14950 + }, + { + "epoch": 0.2063302108694755, + "grad_norm": 0.1773209273815155, + "learning_rate": 0.0001, + "loss": 1.8338, + "step": 15000 + }, + { + "epoch": 0.20701797823904042, + "grad_norm": 0.29811668395996094, + "learning_rate": 0.0001, + "loss": 1.8322, + "step": 15050 + }, + { + "epoch": 0.20770574560860536, + "grad_norm": 0.18590272963047028, + "learning_rate": 0.0001, + "loss": 1.8307, + "step": 15100 + }, + { + "epoch": 0.20839351297817027, + "grad_norm": 0.19656258821487427, + "learning_rate": 0.0001, + "loss": 1.8364, + "step": 15150 + }, + { + "epoch": 0.20908128034773518, + "grad_norm": 0.1760113537311554, + "learning_rate": 0.0001, + "loss": 1.8363, + "step": 15200 + }, + { + "epoch": 0.2097690477173001, + "grad_norm": 0.17442069947719574, + "learning_rate": 0.0001, + "loss": 1.8346, + "step": 15250 + }, + { + "epoch": 0.210456815086865, + "grad_norm": 0.2154201865196228, + "learning_rate": 0.0001, + "loss": 1.8359, + "step": 15300 + }, + { + "epoch": 0.21114458245642995, + "grad_norm": 0.18702222406864166, + "learning_rate": 0.0001, + "loss": 1.8333, + "step": 15350 + }, + { + "epoch": 0.21183234982599486, + "grad_norm": 0.222214013338089, + "learning_rate": 0.0001, + "loss": 1.8386, + "step": 15400 + }, + { + "epoch": 0.21252011719555977, + "grad_norm": 0.18646612763404846, + "learning_rate": 0.0001, + "loss": 1.8336, + "step": 15450 + }, + { + "epoch": 0.21320788456512468, + "grad_norm": 0.19032032787799835, + "learning_rate": 0.0001, + "loss": 1.8359, + "step": 15500 + }, + { + "epoch": 0.21389565193468962, + "grad_norm": 0.1962030827999115, + "learning_rate": 0.0001, + "loss": 1.8314, + "step": 15550 + }, + { + "epoch": 0.21458341930425454, + "grad_norm": 0.18067054450511932, + "learning_rate": 0.0001, + "loss": 1.8298, + "step": 15600 + }, + { + "epoch": 0.21527118667381945, + "grad_norm": 0.1977655440568924, + "learning_rate": 0.0001, + "loss": 1.8335, + "step": 15650 + }, + { + "epoch": 0.21595895404338436, + "grad_norm": 0.17689162492752075, + "learning_rate": 0.0001, + "loss": 1.834, + "step": 15700 + }, + { + "epoch": 0.21664672141294927, + "grad_norm": 0.189301997423172, + "learning_rate": 0.0001, + "loss": 1.8302, + "step": 15750 + }, + { + "epoch": 0.2173344887825142, + "grad_norm": 0.21416552364826202, + "learning_rate": 0.0001, + "loss": 1.833, + "step": 15800 + }, + { + "epoch": 0.21802225615207912, + "grad_norm": 0.17280973494052887, + "learning_rate": 0.0001, + "loss": 1.8325, + "step": 15850 + }, + { + "epoch": 0.21871002352164404, + "grad_norm": 0.2203332632780075, + "learning_rate": 0.0001, + "loss": 1.8315, + "step": 15900 + }, + { + "epoch": 0.21939779089120895, + "grad_norm": 0.17942380905151367, + "learning_rate": 0.0001, + "loss": 1.8313, + "step": 15950 + }, + { + "epoch": 0.2200855582607739, + "grad_norm": 0.2053511142730713, + "learning_rate": 0.0001, + "loss": 1.8322, + "step": 16000 + }, + { + "epoch": 0.2207733256303388, + "grad_norm": 0.18660666048526764, + "learning_rate": 0.0001, + "loss": 1.8315, + "step": 16050 + }, + { + "epoch": 0.2214610929999037, + "grad_norm": 0.20179618895053864, + "learning_rate": 0.0001, + "loss": 1.8309, + "step": 16100 + }, + { + "epoch": 0.22214886036946863, + "grad_norm": 0.1849927455186844, + "learning_rate": 0.0001, + "loss": 1.8349, + "step": 16150 + }, + { + "epoch": 0.22283662773903354, + "grad_norm": 0.16893066465854645, + "learning_rate": 0.0001, + "loss": 1.8333, + "step": 16200 + }, + { + "epoch": 0.22352439510859848, + "grad_norm": 0.1815815567970276, + "learning_rate": 0.0001, + "loss": 1.8277, + "step": 16250 + }, + { + "epoch": 0.2242121624781634, + "grad_norm": 0.17478667199611664, + "learning_rate": 0.0001, + "loss": 1.8324, + "step": 16300 + }, + { + "epoch": 0.2248999298477283, + "grad_norm": 0.20333503186702728, + "learning_rate": 0.0001, + "loss": 1.8299, + "step": 16350 + }, + { + "epoch": 0.22558769721729321, + "grad_norm": 0.19628338515758514, + "learning_rate": 0.0001, + "loss": 1.8322, + "step": 16400 + }, + { + "epoch": 0.22627546458685815, + "grad_norm": 0.19011887907981873, + "learning_rate": 0.0001, + "loss": 1.8301, + "step": 16450 + }, + { + "epoch": 0.22696323195642307, + "grad_norm": 0.19007809460163116, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 16500 + }, + { + "epoch": 0.22765099932598798, + "grad_norm": 0.18108965456485748, + "learning_rate": 0.0001, + "loss": 1.8304, + "step": 16550 + }, + { + "epoch": 0.2283387666955529, + "grad_norm": 0.16927501559257507, + "learning_rate": 0.0001, + "loss": 1.832, + "step": 16600 + }, + { + "epoch": 0.2290265340651178, + "grad_norm": 0.18328557908535004, + "learning_rate": 0.0001, + "loss": 1.8315, + "step": 16650 + }, + { + "epoch": 0.22971430143468274, + "grad_norm": 0.21978403627872467, + "learning_rate": 0.0001, + "loss": 1.8314, + "step": 16700 + }, + { + "epoch": 0.23040206880424766, + "grad_norm": 0.1928972601890564, + "learning_rate": 0.0001, + "loss": 1.8281, + "step": 16750 + }, + { + "epoch": 0.23108983617381257, + "grad_norm": 0.19355738162994385, + "learning_rate": 0.0001, + "loss": 1.8289, + "step": 16800 + }, + { + "epoch": 0.23177760354337748, + "grad_norm": 0.18013496696949005, + "learning_rate": 0.0001, + "loss": 1.831, + "step": 16850 + }, + { + "epoch": 0.2324653709129424, + "grad_norm": 0.1848910003900528, + "learning_rate": 0.0001, + "loss": 1.826, + "step": 16900 + }, + { + "epoch": 0.23315313828250733, + "grad_norm": 0.20185594260692596, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 16950 + }, + { + "epoch": 0.23384090565207225, + "grad_norm": 0.1898491382598877, + "learning_rate": 0.0001, + "loss": 1.8292, + "step": 17000 + }, + { + "epoch": 0.23452867302163716, + "grad_norm": 0.17610591650009155, + "learning_rate": 0.0001, + "loss": 1.831, + "step": 17050 + }, + { + "epoch": 0.23521644039120207, + "grad_norm": 0.2032867968082428, + "learning_rate": 0.0001, + "loss": 1.8306, + "step": 17100 + }, + { + "epoch": 0.235904207760767, + "grad_norm": 0.1812831312417984, + "learning_rate": 0.0001, + "loss": 1.8331, + "step": 17150 + }, + { + "epoch": 0.23659197513033192, + "grad_norm": 0.17079557478427887, + "learning_rate": 0.0001, + "loss": 1.8266, + "step": 17200 + }, + { + "epoch": 0.23727974249989683, + "grad_norm": 0.17599579691886902, + "learning_rate": 0.0001, + "loss": 1.8327, + "step": 17250 + }, + { + "epoch": 0.23796750986946175, + "grad_norm": 0.16692423820495605, + "learning_rate": 0.0001, + "loss": 1.8294, + "step": 17300 + }, + { + "epoch": 0.23865527723902666, + "grad_norm": 0.17235307395458221, + "learning_rate": 0.0001, + "loss": 1.8324, + "step": 17350 + }, + { + "epoch": 0.2393430446085916, + "grad_norm": 0.18419289588928223, + "learning_rate": 0.0001, + "loss": 1.8234, + "step": 17400 + }, + { + "epoch": 0.2400308119781565, + "grad_norm": 0.16880065202713013, + "learning_rate": 0.0001, + "loss": 1.8315, + "step": 17450 + }, + { + "epoch": 0.24071857934772142, + "grad_norm": 0.18046660721302032, + "learning_rate": 0.0001, + "loss": 1.8288, + "step": 17500 + }, + { + "epoch": 0.24140634671728634, + "grad_norm": 0.19775420427322388, + "learning_rate": 0.0001, + "loss": 1.8304, + "step": 17550 + }, + { + "epoch": 0.24209411408685128, + "grad_norm": 0.18596383929252625, + "learning_rate": 0.0001, + "loss": 1.8269, + "step": 17600 + }, + { + "epoch": 0.2427818814564162, + "grad_norm": 0.18525435030460358, + "learning_rate": 0.0001, + "loss": 1.8293, + "step": 17650 + }, + { + "epoch": 0.2434696488259811, + "grad_norm": 0.2105979025363922, + "learning_rate": 0.0001, + "loss": 1.8252, + "step": 17700 + }, + { + "epoch": 0.244157416195546, + "grad_norm": 0.18099245429039001, + "learning_rate": 0.0001, + "loss": 1.8271, + "step": 17750 + }, + { + "epoch": 0.24484518356511092, + "grad_norm": 0.17330291867256165, + "learning_rate": 0.0001, + "loss": 1.8261, + "step": 17800 + }, + { + "epoch": 0.24553295093467586, + "grad_norm": 0.17979152500629425, + "learning_rate": 0.0001, + "loss": 1.8304, + "step": 17850 + }, + { + "epoch": 0.24622071830424078, + "grad_norm": 0.19253650307655334, + "learning_rate": 0.0001, + "loss": 1.83, + "step": 17900 + }, + { + "epoch": 0.2469084856738057, + "grad_norm": 0.20440231263637543, + "learning_rate": 0.0001, + "loss": 1.8251, + "step": 17950 + }, + { + "epoch": 0.2475962530433706, + "grad_norm": 0.18242883682250977, + "learning_rate": 0.0001, + "loss": 1.8286, + "step": 18000 + }, + { + "epoch": 0.24828402041293554, + "grad_norm": 0.1742672622203827, + "learning_rate": 0.0001, + "loss": 1.8271, + "step": 18050 + }, + { + "epoch": 0.24897178778250045, + "grad_norm": 0.19099250435829163, + "learning_rate": 0.0001, + "loss": 1.8284, + "step": 18100 + }, + { + "epoch": 0.24965955515206537, + "grad_norm": 0.19839410483837128, + "learning_rate": 0.0001, + "loss": 1.8254, + "step": 18150 + }, + { + "epoch": 0.2503473225216303, + "grad_norm": 0.18187545239925385, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 18200 + }, + { + "epoch": 0.2510350898911952, + "grad_norm": 0.16419640183448792, + "learning_rate": 0.0001, + "loss": 1.825, + "step": 18250 + }, + { + "epoch": 0.2517228572607601, + "grad_norm": 0.1788015216588974, + "learning_rate": 0.0001, + "loss": 1.8257, + "step": 18300 + }, + { + "epoch": 0.25241062463032504, + "grad_norm": 0.2013292908668518, + "learning_rate": 0.0001, + "loss": 1.8345, + "step": 18350 + }, + { + "epoch": 0.25309839199989, + "grad_norm": 0.18886993825435638, + "learning_rate": 0.0001, + "loss": 1.8269, + "step": 18400 + }, + { + "epoch": 0.25378615936945487, + "grad_norm": 0.18426848948001862, + "learning_rate": 0.0001, + "loss": 1.8291, + "step": 18450 + }, + { + "epoch": 0.2544739267390198, + "grad_norm": 0.1836244910955429, + "learning_rate": 0.0001, + "loss": 1.8228, + "step": 18500 + }, + { + "epoch": 0.2551616941085847, + "grad_norm": 0.18584777414798737, + "learning_rate": 0.0001, + "loss": 1.8283, + "step": 18550 + }, + { + "epoch": 0.25584946147814963, + "grad_norm": 0.16920630633831024, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 18600 + }, + { + "epoch": 0.25653722884771457, + "grad_norm": 0.20111984014511108, + "learning_rate": 0.0001, + "loss": 1.8285, + "step": 18650 + }, + { + "epoch": 0.25722499621727946, + "grad_norm": 0.18769313395023346, + "learning_rate": 0.0001, + "loss": 1.8295, + "step": 18700 + }, + { + "epoch": 0.2579127635868444, + "grad_norm": 0.18159103393554688, + "learning_rate": 0.0001, + "loss": 1.8236, + "step": 18750 + }, + { + "epoch": 0.2586005309564093, + "grad_norm": 0.1929440200328827, + "learning_rate": 0.0001, + "loss": 1.8279, + "step": 18800 + }, + { + "epoch": 0.2592882983259742, + "grad_norm": 0.16436657309532166, + "learning_rate": 0.0001, + "loss": 1.823, + "step": 18850 + }, + { + "epoch": 0.25997606569553916, + "grad_norm": 0.1638740748167038, + "learning_rate": 0.0001, + "loss": 1.8251, + "step": 18900 + }, + { + "epoch": 0.26066383306510404, + "grad_norm": 0.18252821266651154, + "learning_rate": 0.0001, + "loss": 1.8251, + "step": 18950 + }, + { + "epoch": 0.261351600434669, + "grad_norm": 0.18031029403209686, + "learning_rate": 0.0001, + "loss": 1.8243, + "step": 19000 + }, + { + "epoch": 0.26203936780423387, + "grad_norm": 0.1770683377981186, + "learning_rate": 0.0001, + "loss": 1.8274, + "step": 19050 + }, + { + "epoch": 0.2627271351737988, + "grad_norm": 0.20250555872917175, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 19100 + }, + { + "epoch": 0.26341490254336375, + "grad_norm": 0.16491496562957764, + "learning_rate": 0.0001, + "loss": 1.8251, + "step": 19150 + }, + { + "epoch": 0.26410266991292863, + "grad_norm": 0.19582998752593994, + "learning_rate": 0.0001, + "loss": 1.824, + "step": 19200 + }, + { + "epoch": 0.2647904372824936, + "grad_norm": 0.17773911356925964, + "learning_rate": 0.0001, + "loss": 1.8195, + "step": 19250 + }, + { + "epoch": 0.2654782046520585, + "grad_norm": 0.18118888139724731, + "learning_rate": 0.0001, + "loss": 1.8239, + "step": 19300 + }, + { + "epoch": 0.2661659720216234, + "grad_norm": 0.15766191482543945, + "learning_rate": 0.0001, + "loss": 1.8232, + "step": 19350 + }, + { + "epoch": 0.26685373939118834, + "grad_norm": 0.17026937007904053, + "learning_rate": 0.0001, + "loss": 1.8223, + "step": 19400 + }, + { + "epoch": 0.2675415067607532, + "grad_norm": 0.18863512575626373, + "learning_rate": 0.0001, + "loss": 1.8257, + "step": 19450 + }, + { + "epoch": 0.26822927413031816, + "grad_norm": 0.18321500718593597, + "learning_rate": 0.0001, + "loss": 1.8238, + "step": 19500 + }, + { + "epoch": 0.2689170414998831, + "grad_norm": 0.20935237407684326, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 19550 + }, + { + "epoch": 0.269604808869448, + "grad_norm": 0.19490981101989746, + "learning_rate": 0.0001, + "loss": 1.8194, + "step": 19600 + }, + { + "epoch": 0.2702925762390129, + "grad_norm": 0.19290666282176971, + "learning_rate": 0.0001, + "loss": 1.8258, + "step": 19650 + }, + { + "epoch": 0.2709803436085778, + "grad_norm": 0.1819174438714981, + "learning_rate": 0.0001, + "loss": 1.8224, + "step": 19700 + }, + { + "epoch": 0.27166811097814275, + "grad_norm": 0.18501299619674683, + "learning_rate": 0.0001, + "loss": 1.8297, + "step": 19750 + }, + { + "epoch": 0.2723558783477077, + "grad_norm": 0.19111846387386322, + "learning_rate": 0.0001, + "loss": 1.8226, + "step": 19800 + }, + { + "epoch": 0.2730436457172726, + "grad_norm": 0.18800359964370728, + "learning_rate": 0.0001, + "loss": 1.8215, + "step": 19850 + }, + { + "epoch": 0.2737314130868375, + "grad_norm": 0.18408334255218506, + "learning_rate": 0.0001, + "loss": 1.8239, + "step": 19900 + }, + { + "epoch": 0.2744191804564024, + "grad_norm": 0.19500131905078888, + "learning_rate": 0.0001, + "loss": 1.8232, + "step": 19950 + }, + { + "epoch": 0.27510694782596734, + "grad_norm": 0.18263010680675507, + "learning_rate": 0.0001, + "loss": 1.8246, + "step": 20000 + }, + { + "epoch": 0.2757947151955323, + "grad_norm": 0.1732577383518219, + "learning_rate": 0.0001, + "loss": 1.8241, + "step": 20050 + }, + { + "epoch": 0.27648248256509717, + "grad_norm": 0.1958979219198227, + "learning_rate": 0.0001, + "loss": 1.8215, + "step": 20100 + }, + { + "epoch": 0.2771702499346621, + "grad_norm": 0.1755562722682953, + "learning_rate": 0.0001, + "loss": 1.8275, + "step": 20150 + }, + { + "epoch": 0.27785801730422705, + "grad_norm": 0.17292717099189758, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 20200 + }, + { + "epoch": 0.27854578467379193, + "grad_norm": 0.16997367143630981, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 20250 + }, + { + "epoch": 0.27923355204335687, + "grad_norm": 0.1903601735830307, + "learning_rate": 0.0001, + "loss": 1.8243, + "step": 20300 + }, + { + "epoch": 0.27992131941292175, + "grad_norm": 0.17447033524513245, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 20350 + }, + { + "epoch": 0.2806090867824867, + "grad_norm": 0.18861395120620728, + "learning_rate": 0.0001, + "loss": 1.8222, + "step": 20400 + }, + { + "epoch": 0.28129685415205163, + "grad_norm": 0.17015644907951355, + "learning_rate": 0.0001, + "loss": 1.8207, + "step": 20450 + }, + { + "epoch": 0.2819846215216165, + "grad_norm": 0.19356681406497955, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 20500 + }, + { + "epoch": 0.28267238889118146, + "grad_norm": 0.1988779753446579, + "learning_rate": 0.0001, + "loss": 1.8199, + "step": 20550 + }, + { + "epoch": 0.28336015626074634, + "grad_norm": 0.1967942714691162, + "learning_rate": 0.0001, + "loss": 1.8217, + "step": 20600 + }, + { + "epoch": 0.2840479236303113, + "grad_norm": 0.18917816877365112, + "learning_rate": 0.0001, + "loss": 1.8229, + "step": 20650 + }, + { + "epoch": 0.2847356909998762, + "grad_norm": 0.16583094000816345, + "learning_rate": 0.0001, + "loss": 1.8219, + "step": 20700 + }, + { + "epoch": 0.2854234583694411, + "grad_norm": 0.19918115437030792, + "learning_rate": 0.0001, + "loss": 1.8246, + "step": 20750 + }, + { + "epoch": 0.28611122573900605, + "grad_norm": 0.1981818974018097, + "learning_rate": 0.0001, + "loss": 1.8211, + "step": 20800 + }, + { + "epoch": 0.28679899310857093, + "grad_norm": 0.1838293969631195, + "learning_rate": 0.0001, + "loss": 1.8224, + "step": 20850 + }, + { + "epoch": 0.28748676047813587, + "grad_norm": 0.20068101584911346, + "learning_rate": 0.0001, + "loss": 1.82, + "step": 20900 + }, + { + "epoch": 0.2881745278477008, + "grad_norm": 0.17375263571739197, + "learning_rate": 0.0001, + "loss": 1.8195, + "step": 20950 + }, + { + "epoch": 0.2888622952172657, + "grad_norm": 0.16706246137619019, + "learning_rate": 0.0001, + "loss": 1.826, + "step": 21000 + }, + { + "epoch": 0.28955006258683064, + "grad_norm": 0.20021022856235504, + "learning_rate": 0.0001, + "loss": 1.8207, + "step": 21050 + }, + { + "epoch": 0.2902378299563955, + "grad_norm": 0.20570990443229675, + "learning_rate": 0.0001, + "loss": 1.8221, + "step": 21100 + }, + { + "epoch": 0.29092559732596046, + "grad_norm": 0.2043515294790268, + "learning_rate": 0.0001, + "loss": 1.8239, + "step": 21150 + }, + { + "epoch": 0.2916133646955254, + "grad_norm": 0.17122073471546173, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 21200 + }, + { + "epoch": 0.2923011320650903, + "grad_norm": 0.19589883089065552, + "learning_rate": 0.0001, + "loss": 1.8206, + "step": 21250 + }, + { + "epoch": 0.2929888994346552, + "grad_norm": 0.19675767421722412, + "learning_rate": 0.0001, + "loss": 1.8244, + "step": 21300 + }, + { + "epoch": 0.29367666680422017, + "grad_norm": 0.1788429468870163, + "learning_rate": 0.0001, + "loss": 1.8225, + "step": 21350 + }, + { + "epoch": 0.29436443417378505, + "grad_norm": 0.17564085125923157, + "learning_rate": 0.0001, + "loss": 1.8242, + "step": 21400 + }, + { + "epoch": 0.29505220154335, + "grad_norm": 0.1807086318731308, + "learning_rate": 0.0001, + "loss": 1.8245, + "step": 21450 + }, + { + "epoch": 0.2957399689129149, + "grad_norm": 0.1772526502609253, + "learning_rate": 0.0001, + "loss": 1.8231, + "step": 21500 + }, + { + "epoch": 0.2964277362824798, + "grad_norm": 0.1903577297925949, + "learning_rate": 0.0001, + "loss": 1.8209, + "step": 21550 + }, + { + "epoch": 0.29711550365204475, + "grad_norm": 0.17995303869247437, + "learning_rate": 0.0001, + "loss": 1.817, + "step": 21600 + }, + { + "epoch": 0.29780327102160964, + "grad_norm": 0.1937420666217804, + "learning_rate": 0.0001, + "loss": 1.8241, + "step": 21650 + }, + { + "epoch": 0.2984910383911746, + "grad_norm": 0.1729700267314911, + "learning_rate": 0.0001, + "loss": 1.822, + "step": 21700 + }, + { + "epoch": 0.29917880576073946, + "grad_norm": 0.16370828449726105, + "learning_rate": 0.0001, + "loss": 1.8217, + "step": 21750 + }, + { + "epoch": 0.2998665731303044, + "grad_norm": 0.17373540997505188, + "learning_rate": 0.0001, + "loss": 1.8191, + "step": 21800 + }, + { + "epoch": 0.30055434049986934, + "grad_norm": 0.19695748388767242, + "learning_rate": 0.0001, + "loss": 1.8236, + "step": 21850 + }, + { + "epoch": 0.30124210786943423, + "grad_norm": 0.20299525558948517, + "learning_rate": 0.0001, + "loss": 1.8181, + "step": 21900 + }, + { + "epoch": 0.30192987523899917, + "grad_norm": 0.5943254828453064, + "learning_rate": 0.0001, + "loss": 1.8207, + "step": 21950 + }, + { + "epoch": 0.30261764260856405, + "grad_norm": 0.1915915608406067, + "learning_rate": 0.0001, + "loss": 1.8245, + "step": 22000 + }, + { + "epoch": 0.303305409978129, + "grad_norm": 0.16212280094623566, + "learning_rate": 0.0001, + "loss": 1.8226, + "step": 22050 + }, + { + "epoch": 0.30399317734769393, + "grad_norm": 0.16871103644371033, + "learning_rate": 0.0001, + "loss": 1.8193, + "step": 22100 + }, + { + "epoch": 0.3046809447172588, + "grad_norm": 0.1811041682958603, + "learning_rate": 0.0001, + "loss": 1.8187, + "step": 22150 + }, + { + "epoch": 0.30536871208682376, + "grad_norm": 0.1868380606174469, + "learning_rate": 0.0001, + "loss": 1.8219, + "step": 22200 + }, + { + "epoch": 0.3060564794563887, + "grad_norm": 0.18134795129299164, + "learning_rate": 0.0001, + "loss": 1.8207, + "step": 22250 + }, + { + "epoch": 0.3067442468259536, + "grad_norm": 0.17329555749893188, + "learning_rate": 0.0001, + "loss": 1.8193, + "step": 22300 + }, + { + "epoch": 0.3074320141955185, + "grad_norm": 0.18371562659740448, + "learning_rate": 0.0001, + "loss": 1.821, + "step": 22350 + }, + { + "epoch": 0.3081197815650834, + "grad_norm": 0.17543677985668182, + "learning_rate": 0.0001, + "loss": 1.8182, + "step": 22400 + }, + { + "epoch": 0.30880754893464835, + "grad_norm": 0.18362955749034882, + "learning_rate": 0.0001, + "loss": 1.8187, + "step": 22450 + }, + { + "epoch": 0.3094953163042133, + "grad_norm": 0.20341430604457855, + "learning_rate": 0.0001, + "loss": 1.8198, + "step": 22500 + }, + { + "epoch": 0.31018308367377817, + "grad_norm": 0.1833573579788208, + "learning_rate": 0.0001, + "loss": 1.8167, + "step": 22550 + }, + { + "epoch": 0.3108708510433431, + "grad_norm": 0.1798466444015503, + "learning_rate": 0.0001, + "loss": 1.8204, + "step": 22600 + }, + { + "epoch": 0.311558618412908, + "grad_norm": 0.18346908688545227, + "learning_rate": 0.0001, + "loss": 1.8197, + "step": 22650 + }, + { + "epoch": 0.31224638578247293, + "grad_norm": 0.1842503696680069, + "learning_rate": 0.0001, + "loss": 1.822, + "step": 22700 + }, + { + "epoch": 0.3129341531520379, + "grad_norm": 0.1917971521615982, + "learning_rate": 0.0001, + "loss": 1.8205, + "step": 22750 + }, + { + "epoch": 0.31362192052160276, + "grad_norm": 0.18140938878059387, + "learning_rate": 0.0001, + "loss": 1.8187, + "step": 22800 + }, + { + "epoch": 0.3143096878911677, + "grad_norm": 0.17349034547805786, + "learning_rate": 0.0001, + "loss": 1.8204, + "step": 22850 + }, + { + "epoch": 0.3149974552607326, + "grad_norm": 0.17727358639240265, + "learning_rate": 0.0001, + "loss": 1.8203, + "step": 22900 + }, + { + "epoch": 0.3156852226302975, + "grad_norm": 0.1764019876718521, + "learning_rate": 0.0001, + "loss": 1.8197, + "step": 22950 + }, + { + "epoch": 0.31637298999986246, + "grad_norm": 0.18336281180381775, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 23000 + }, + { + "epoch": 0.31706075736942735, + "grad_norm": 0.15488466620445251, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 23050 + }, + { + "epoch": 0.3177485247389923, + "grad_norm": 0.16988332569599152, + "learning_rate": 0.0001, + "loss": 1.8151, + "step": 23100 + }, + { + "epoch": 0.31843629210855723, + "grad_norm": 0.16344988346099854, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 23150 + }, + { + "epoch": 0.3191240594781221, + "grad_norm": 0.17984721064567566, + "learning_rate": 0.0001, + "loss": 1.8182, + "step": 23200 + }, + { + "epoch": 0.31981182684768705, + "grad_norm": 0.19572113454341888, + "learning_rate": 0.0001, + "loss": 1.8158, + "step": 23250 + }, + { + "epoch": 0.32049959421725194, + "grad_norm": 0.21890446543693542, + "learning_rate": 0.0001, + "loss": 1.8158, + "step": 23300 + }, + { + "epoch": 0.3211873615868169, + "grad_norm": 0.1672099530696869, + "learning_rate": 0.0001, + "loss": 1.8183, + "step": 23350 + }, + { + "epoch": 0.3218751289563818, + "grad_norm": 0.18066146969795227, + "learning_rate": 0.0001, + "loss": 1.8194, + "step": 23400 + }, + { + "epoch": 0.3225628963259467, + "grad_norm": 0.1749303936958313, + "learning_rate": 0.0001, + "loss": 1.8192, + "step": 23450 + }, + { + "epoch": 0.32325066369551164, + "grad_norm": 0.1646299809217453, + "learning_rate": 0.0001, + "loss": 1.819, + "step": 23500 + }, + { + "epoch": 0.3239384310650765, + "grad_norm": 0.204520583152771, + "learning_rate": 0.0001, + "loss": 1.8166, + "step": 23550 + }, + { + "epoch": 0.32462619843464147, + "grad_norm": 0.166048064827919, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 23600 + }, + { + "epoch": 0.3253139658042064, + "grad_norm": 0.17722272872924805, + "learning_rate": 0.0001, + "loss": 1.8158, + "step": 23650 + }, + { + "epoch": 0.3260017331737713, + "grad_norm": 0.1896638125181198, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 23700 + }, + { + "epoch": 0.32668950054333623, + "grad_norm": 0.16389790177345276, + "learning_rate": 0.0001, + "loss": 1.8163, + "step": 23750 + }, + { + "epoch": 0.3273772679129011, + "grad_norm": 0.17973138391971588, + "learning_rate": 0.0001, + "loss": 1.8201, + "step": 23800 + }, + { + "epoch": 0.32806503528246606, + "grad_norm": 0.20095448195934296, + "learning_rate": 0.0001, + "loss": 1.8174, + "step": 23850 + }, + { + "epoch": 0.328752802652031, + "grad_norm": 0.18039678037166595, + "learning_rate": 0.0001, + "loss": 1.8179, + "step": 23900 + }, + { + "epoch": 0.3294405700215959, + "grad_norm": 0.1760893315076828, + "learning_rate": 0.0001, + "loss": 1.816, + "step": 23950 + }, + { + "epoch": 0.3301283373911608, + "grad_norm": 0.171057790517807, + "learning_rate": 0.0001, + "loss": 1.816, + "step": 24000 + }, + { + "epoch": 0.33081610476072576, + "grad_norm": 0.17639483511447906, + "learning_rate": 0.0001, + "loss": 1.8157, + "step": 24050 + }, + { + "epoch": 0.33150387213029064, + "grad_norm": 0.16385740041732788, + "learning_rate": 0.0001, + "loss": 1.8195, + "step": 24100 + }, + { + "epoch": 0.3321916394998556, + "grad_norm": 0.18215522170066833, + "learning_rate": 0.0001, + "loss": 1.8157, + "step": 24150 + }, + { + "epoch": 0.33287940686942047, + "grad_norm": 0.17613132297992706, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 24200 + }, + { + "epoch": 0.3335671742389854, + "grad_norm": 0.16723348200321198, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 24250 + }, + { + "epoch": 0.33425494160855035, + "grad_norm": 0.16092203557491302, + "learning_rate": 0.0001, + "loss": 1.8173, + "step": 24300 + }, + { + "epoch": 0.33494270897811523, + "grad_norm": 0.17928454279899597, + "learning_rate": 0.0001, + "loss": 1.8188, + "step": 24350 + }, + { + "epoch": 0.3356304763476802, + "grad_norm": 0.18230123817920685, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 24400 + }, + { + "epoch": 0.33631824371724506, + "grad_norm": 0.1699696034193039, + "learning_rate": 0.0001, + "loss": 1.8194, + "step": 24450 + }, + { + "epoch": 0.33700601108681, + "grad_norm": 0.1800839602947235, + "learning_rate": 0.0001, + "loss": 1.8126, + "step": 24500 + }, + { + "epoch": 0.33769377845637494, + "grad_norm": 0.19913671910762787, + "learning_rate": 0.0001, + "loss": 1.8148, + "step": 24550 + }, + { + "epoch": 0.3383815458259398, + "grad_norm": 0.16596053540706635, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 24600 + }, + { + "epoch": 0.33906931319550476, + "grad_norm": 0.1894855797290802, + "learning_rate": 0.0001, + "loss": 1.8142, + "step": 24650 + }, + { + "epoch": 0.33975708056506965, + "grad_norm": 0.1800161600112915, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 24700 + }, + { + "epoch": 0.3404448479346346, + "grad_norm": 0.17433103919029236, + "learning_rate": 0.0001, + "loss": 1.815, + "step": 24750 + }, + { + "epoch": 0.3411326153041995, + "grad_norm": 0.18210847675800323, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 24800 + }, + { + "epoch": 0.3418203826737644, + "grad_norm": 0.17840790748596191, + "learning_rate": 0.0001, + "loss": 1.8159, + "step": 24850 + }, + { + "epoch": 0.34250815004332935, + "grad_norm": 0.18368154764175415, + "learning_rate": 0.0001, + "loss": 1.8171, + "step": 24900 + }, + { + "epoch": 0.34319591741289424, + "grad_norm": 0.17999804019927979, + "learning_rate": 0.0001, + "loss": 1.817, + "step": 24950 + }, + { + "epoch": 0.3438836847824592, + "grad_norm": 0.19299517571926117, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 25000 + }, + { + "epoch": 0.3445714521520241, + "grad_norm": 0.17866362631320953, + "learning_rate": 0.0001, + "loss": 1.8121, + "step": 25050 + }, + { + "epoch": 0.345259219521589, + "grad_norm": 0.16793055832386017, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 25100 + }, + { + "epoch": 0.34594698689115394, + "grad_norm": 0.18356679379940033, + "learning_rate": 0.0001, + "loss": 1.8158, + "step": 25150 + }, + { + "epoch": 0.3466347542607189, + "grad_norm": 0.18392959237098694, + "learning_rate": 0.0001, + "loss": 1.8135, + "step": 25200 + }, + { + "epoch": 0.34732252163028376, + "grad_norm": 0.18158595263957977, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 25250 + }, + { + "epoch": 0.3480102889998487, + "grad_norm": 0.1956174075603485, + "learning_rate": 0.0001, + "loss": 1.8137, + "step": 25300 + }, + { + "epoch": 0.3486980563694136, + "grad_norm": 0.17629751563072205, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 25350 + }, + { + "epoch": 0.34938582373897853, + "grad_norm": 0.1842150092124939, + "learning_rate": 0.0001, + "loss": 1.8112, + "step": 25400 + }, + { + "epoch": 0.35007359110854347, + "grad_norm": 0.18889479339122772, + "learning_rate": 0.0001, + "loss": 1.8152, + "step": 25450 + }, + { + "epoch": 0.35076135847810835, + "grad_norm": 0.16872894763946533, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 25500 + }, + { + "epoch": 0.3514491258476733, + "grad_norm": 0.16502858698368073, + "learning_rate": 0.0001, + "loss": 1.8127, + "step": 25550 + }, + { + "epoch": 0.3521368932172382, + "grad_norm": 0.1778111755847931, + "learning_rate": 0.0001, + "loss": 1.8202, + "step": 25600 + }, + { + "epoch": 0.3528246605868031, + "grad_norm": 0.16866064071655273, + "learning_rate": 0.0001, + "loss": 1.8155, + "step": 25650 + }, + { + "epoch": 0.35351242795636806, + "grad_norm": 0.1845904141664505, + "learning_rate": 0.0001, + "loss": 1.8171, + "step": 25700 + }, + { + "epoch": 0.35420019532593294, + "grad_norm": 0.19138947129249573, + "learning_rate": 0.0001, + "loss": 1.8164, + "step": 25750 + }, + { + "epoch": 0.3548879626954979, + "grad_norm": 0.18222880363464355, + "learning_rate": 0.0001, + "loss": 1.8131, + "step": 25800 + }, + { + "epoch": 0.35557573006506277, + "grad_norm": 0.17819440364837646, + "learning_rate": 0.0001, + "loss": 1.8147, + "step": 25850 + }, + { + "epoch": 0.3562634974346277, + "grad_norm": 0.20162558555603027, + "learning_rate": 0.0001, + "loss": 1.8188, + "step": 25900 + }, + { + "epoch": 0.35695126480419265, + "grad_norm": 0.17715832591056824, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 25950 + }, + { + "epoch": 0.35763903217375753, + "grad_norm": 0.16032275557518005, + "learning_rate": 0.0001, + "loss": 1.8135, + "step": 26000 + }, + { + "epoch": 0.35832679954332247, + "grad_norm": 0.17023804783821106, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 26050 + }, + { + "epoch": 0.3590145669128874, + "grad_norm": 0.19815494120121002, + "learning_rate": 0.0001, + "loss": 1.8123, + "step": 26100 + }, + { + "epoch": 0.3597023342824523, + "grad_norm": 0.19192709028720856, + "learning_rate": 0.0001, + "loss": 1.8164, + "step": 26150 + }, + { + "epoch": 0.36039010165201724, + "grad_norm": 0.18932852149009705, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 26200 + }, + { + "epoch": 0.3610778690215821, + "grad_norm": 0.16477489471435547, + "learning_rate": 0.0001, + "loss": 1.8147, + "step": 26250 + }, + { + "epoch": 0.36176563639114706, + "grad_norm": 0.19172504544258118, + "learning_rate": 0.0001, + "loss": 1.814, + "step": 26300 + }, + { + "epoch": 0.362453403760712, + "grad_norm": 0.19087177515029907, + "learning_rate": 0.0001, + "loss": 1.8123, + "step": 26350 + }, + { + "epoch": 0.3631411711302769, + "grad_norm": 0.1714990735054016, + "learning_rate": 0.0001, + "loss": 1.8133, + "step": 26400 + }, + { + "epoch": 0.3638289384998418, + "grad_norm": 0.16309858858585358, + "learning_rate": 0.0001, + "loss": 1.8168, + "step": 26450 + }, + { + "epoch": 0.3645167058694067, + "grad_norm": 0.1791163831949234, + "learning_rate": 0.0001, + "loss": 1.818, + "step": 26500 + }, + { + "epoch": 0.36520447323897165, + "grad_norm": 0.17130139470100403, + "learning_rate": 0.0001, + "loss": 1.8182, + "step": 26550 + }, + { + "epoch": 0.3658922406085366, + "grad_norm": 0.17432111501693726, + "learning_rate": 0.0001, + "loss": 1.8177, + "step": 26600 + }, + { + "epoch": 0.3665800079781015, + "grad_norm": 0.15398447215557098, + "learning_rate": 0.0001, + "loss": 1.8161, + "step": 26650 + }, + { + "epoch": 0.3672677753476664, + "grad_norm": 0.2831607162952423, + "learning_rate": 0.0001, + "loss": 1.815, + "step": 26700 + }, + { + "epoch": 0.3679555427172313, + "grad_norm": 0.17564986646175385, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 26750 + }, + { + "epoch": 0.36864331008679624, + "grad_norm": 0.18288859724998474, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 26800 + }, + { + "epoch": 0.3693310774563612, + "grad_norm": 0.1621311753988266, + "learning_rate": 0.0001, + "loss": 1.8069, + "step": 26850 + }, + { + "epoch": 0.37001884482592606, + "grad_norm": 0.16472625732421875, + "learning_rate": 0.0001, + "loss": 1.8136, + "step": 26900 + }, + { + "epoch": 0.370706612195491, + "grad_norm": 0.16450871527194977, + "learning_rate": 0.0001, + "loss": 1.8149, + "step": 26950 + }, + { + "epoch": 0.37139437956505594, + "grad_norm": 0.1769149899482727, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 27000 + }, + { + "epoch": 0.3720821469346208, + "grad_norm": 0.1917348951101303, + "learning_rate": 0.0001, + "loss": 1.8121, + "step": 27050 + }, + { + "epoch": 0.37276991430418577, + "grad_norm": 0.18277530372142792, + "learning_rate": 0.0001, + "loss": 1.812, + "step": 27100 + }, + { + "epoch": 0.37345768167375065, + "grad_norm": 0.1814720183610916, + "learning_rate": 0.0001, + "loss": 1.8092, + "step": 27150 + }, + { + "epoch": 0.3741454490433156, + "grad_norm": 0.17358410358428955, + "learning_rate": 0.0001, + "loss": 1.8118, + "step": 27200 + }, + { + "epoch": 0.37483321641288053, + "grad_norm": 0.18569444119930267, + "learning_rate": 0.0001, + "loss": 1.8115, + "step": 27250 + }, + { + "epoch": 0.3755209837824454, + "grad_norm": 0.15812502801418304, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 27300 + }, + { + "epoch": 0.37620875115201036, + "grad_norm": 0.19051866233348846, + "learning_rate": 0.0001, + "loss": 1.8162, + "step": 27350 + }, + { + "epoch": 0.37689651852157524, + "grad_norm": 0.1646508276462555, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 27400 + }, + { + "epoch": 0.3775842858911402, + "grad_norm": 0.16069738566875458, + "learning_rate": 0.0001, + "loss": 1.8088, + "step": 27450 + }, + { + "epoch": 0.3782720532607051, + "grad_norm": 0.18708954751491547, + "learning_rate": 0.0001, + "loss": 1.809, + "step": 27500 + }, + { + "epoch": 0.37895982063027, + "grad_norm": 0.18674279749393463, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 27550 + }, + { + "epoch": 0.37964758799983495, + "grad_norm": 0.17408175766468048, + "learning_rate": 0.0001, + "loss": 1.8126, + "step": 27600 + }, + { + "epoch": 0.38033535536939983, + "grad_norm": 0.15924981236457825, + "learning_rate": 0.0001, + "loss": 1.8122, + "step": 27650 + }, + { + "epoch": 0.38102312273896477, + "grad_norm": 0.17203688621520996, + "learning_rate": 0.0001, + "loss": 1.8118, + "step": 27700 + }, + { + "epoch": 0.3817108901085297, + "grad_norm": 0.18587364256381989, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 27750 + }, + { + "epoch": 0.3823986574780946, + "grad_norm": 0.18941548466682434, + "learning_rate": 0.0001, + "loss": 1.8099, + "step": 27800 + }, + { + "epoch": 0.38308642484765953, + "grad_norm": 0.14958040416240692, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 27850 + }, + { + "epoch": 0.3837741922172244, + "grad_norm": 0.17599830031394958, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 27900 + }, + { + "epoch": 0.38446195958678936, + "grad_norm": 0.17611196637153625, + "learning_rate": 0.0001, + "loss": 1.8114, + "step": 27950 + }, + { + "epoch": 0.3851497269563543, + "grad_norm": 0.1823156625032425, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 28000 + }, + { + "epoch": 0.3858374943259192, + "grad_norm": 0.17287470400333405, + "learning_rate": 0.0001, + "loss": 1.812, + "step": 28050 + }, + { + "epoch": 0.3865252616954841, + "grad_norm": 0.17163801193237305, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 28100 + }, + { + "epoch": 0.38721302906504906, + "grad_norm": 0.16863061487674713, + "learning_rate": 0.0001, + "loss": 1.8085, + "step": 28150 + }, + { + "epoch": 0.38790079643461395, + "grad_norm": 0.1910269409418106, + "learning_rate": 0.0001, + "loss": 1.8128, + "step": 28200 + }, + { + "epoch": 0.3885885638041789, + "grad_norm": 0.16055557131767273, + "learning_rate": 0.0001, + "loss": 1.8122, + "step": 28250 + }, + { + "epoch": 0.3892763311737438, + "grad_norm": 0.17268548905849457, + "learning_rate": 0.0001, + "loss": 1.8084, + "step": 28300 + }, + { + "epoch": 0.3899640985433087, + "grad_norm": 0.16962352395057678, + "learning_rate": 0.0001, + "loss": 1.8131, + "step": 28350 + }, + { + "epoch": 0.39065186591287365, + "grad_norm": 0.1744450330734253, + "learning_rate": 0.0001, + "loss": 1.811, + "step": 28400 + }, + { + "epoch": 0.39133963328243854, + "grad_norm": 0.17569154500961304, + "learning_rate": 0.0001, + "loss": 1.8165, + "step": 28450 + }, + { + "epoch": 0.3920274006520035, + "grad_norm": 0.17034880816936493, + "learning_rate": 0.0001, + "loss": 1.8125, + "step": 28500 + }, + { + "epoch": 0.39271516802156836, + "grad_norm": 0.16873665153980255, + "learning_rate": 0.0001, + "loss": 1.8124, + "step": 28550 + }, + { + "epoch": 0.3934029353911333, + "grad_norm": 0.1771818846464157, + "learning_rate": 0.0001, + "loss": 1.8132, + "step": 28600 + }, + { + "epoch": 0.39409070276069824, + "grad_norm": 0.17641928791999817, + "learning_rate": 0.0001, + "loss": 1.8131, + "step": 28650 + }, + { + "epoch": 0.3947784701302631, + "grad_norm": 0.16521941125392914, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 28700 + }, + { + "epoch": 0.39546623749982807, + "grad_norm": 0.17453192174434662, + "learning_rate": 0.0001, + "loss": 1.813, + "step": 28750 + }, + { + "epoch": 0.39615400486939295, + "grad_norm": 0.17454297840595245, + "learning_rate": 0.0001, + "loss": 1.8129, + "step": 28800 + }, + { + "epoch": 0.3968417722389579, + "grad_norm": 0.155872642993927, + "learning_rate": 0.0001, + "loss": 1.8116, + "step": 28850 + }, + { + "epoch": 0.39752953960852283, + "grad_norm": 0.17079751193523407, + "learning_rate": 0.0001, + "loss": 1.8097, + "step": 28900 + }, + { + "epoch": 0.3982173069780877, + "grad_norm": 0.1715528666973114, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 28950 + }, + { + "epoch": 0.39890507434765266, + "grad_norm": 0.17352135479450226, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 29000 + }, + { + "epoch": 0.3995928417172176, + "grad_norm": 0.17056448757648468, + "learning_rate": 0.0001, + "loss": 1.8093, + "step": 29050 + }, + { + "epoch": 0.4002806090867825, + "grad_norm": 0.16389931738376617, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 29100 + }, + { + "epoch": 0.4009683764563474, + "grad_norm": 0.17660637199878693, + "learning_rate": 0.0001, + "loss": 1.8101, + "step": 29150 + }, + { + "epoch": 0.4016561438259123, + "grad_norm": 0.1871548742055893, + "learning_rate": 0.0001, + "loss": 1.8124, + "step": 29200 + }, + { + "epoch": 0.40234391119547724, + "grad_norm": 0.17292185127735138, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 29250 + }, + { + "epoch": 0.4030316785650422, + "grad_norm": 0.16299203038215637, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 29300 + }, + { + "epoch": 0.40371944593460707, + "grad_norm": 0.20287854969501495, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 29350 + }, + { + "epoch": 0.404407213304172, + "grad_norm": 0.1632193922996521, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 29400 + }, + { + "epoch": 0.4050949806737369, + "grad_norm": 0.16991235315799713, + "learning_rate": 0.0001, + "loss": 1.8084, + "step": 29450 + }, + { + "epoch": 0.40578274804330183, + "grad_norm": 0.17448389530181885, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 29500 + }, + { + "epoch": 0.4064705154128668, + "grad_norm": 0.1706276535987854, + "learning_rate": 0.0001, + "loss": 1.8091, + "step": 29550 + }, + { + "epoch": 0.40715828278243166, + "grad_norm": 0.187569260597229, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 29600 + }, + { + "epoch": 0.4078460501519966, + "grad_norm": 0.18289169669151306, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 29650 + }, + { + "epoch": 0.4085338175215615, + "grad_norm": 0.17096656560897827, + "learning_rate": 0.0001, + "loss": 1.8117, + "step": 29700 + }, + { + "epoch": 0.4092215848911264, + "grad_norm": 0.18183813989162445, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 29750 + }, + { + "epoch": 0.40990935226069136, + "grad_norm": 0.18215380609035492, + "learning_rate": 0.0001, + "loss": 1.8113, + "step": 29800 + }, + { + "epoch": 0.41059711963025625, + "grad_norm": 0.19367296993732452, + "learning_rate": 0.0001, + "loss": 1.8111, + "step": 29850 + }, + { + "epoch": 0.4112848869998212, + "grad_norm": 0.18118008971214294, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 29900 + }, + { + "epoch": 0.4119726543693861, + "grad_norm": 0.16475409269332886, + "learning_rate": 0.0001, + "loss": 1.8095, + "step": 29950 + }, + { + "epoch": 0.412660421738951, + "grad_norm": 0.19968119263648987, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 30000 + }, + { + "epoch": 0.41334818910851595, + "grad_norm": 0.2024579495191574, + "learning_rate": 0.0001, + "loss": 1.8097, + "step": 30050 + }, + { + "epoch": 0.41403595647808084, + "grad_norm": 0.1678769886493683, + "learning_rate": 0.0001, + "loss": 1.8099, + "step": 30100 + }, + { + "epoch": 0.4147237238476458, + "grad_norm": 0.19947120547294617, + "learning_rate": 0.0001, + "loss": 1.8124, + "step": 30150 + }, + { + "epoch": 0.4154114912172107, + "grad_norm": 0.1908283233642578, + "learning_rate": 0.0001, + "loss": 1.8094, + "step": 30200 + }, + { + "epoch": 0.4160992585867756, + "grad_norm": 0.16802892088890076, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 30250 + }, + { + "epoch": 0.41678702595634054, + "grad_norm": 0.1601232886314392, + "learning_rate": 0.0001, + "loss": 1.8078, + "step": 30300 + }, + { + "epoch": 0.4174747933259054, + "grad_norm": 0.16903936862945557, + "learning_rate": 0.0001, + "loss": 1.811, + "step": 30350 + }, + { + "epoch": 0.41816256069547036, + "grad_norm": 0.17131748795509338, + "learning_rate": 0.0001, + "loss": 1.8057, + "step": 30400 + }, + { + "epoch": 0.4188503280650353, + "grad_norm": 0.17509245872497559, + "learning_rate": 0.0001, + "loss": 1.8109, + "step": 30450 + }, + { + "epoch": 0.4195380954346002, + "grad_norm": 0.17135483026504517, + "learning_rate": 0.0001, + "loss": 1.8086, + "step": 30500 + }, + { + "epoch": 0.42022586280416513, + "grad_norm": 0.1780470460653305, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 30550 + }, + { + "epoch": 0.42091363017373, + "grad_norm": 0.16642825305461884, + "learning_rate": 0.0001, + "loss": 1.8101, + "step": 30600 + }, + { + "epoch": 0.42160139754329495, + "grad_norm": 0.17237281799316406, + "learning_rate": 0.0001, + "loss": 1.8131, + "step": 30650 + }, + { + "epoch": 0.4222891649128599, + "grad_norm": 0.1773928999900818, + "learning_rate": 0.0001, + "loss": 1.807, + "step": 30700 + }, + { + "epoch": 0.4229769322824248, + "grad_norm": 0.15655359625816345, + "learning_rate": 0.0001, + "loss": 1.8102, + "step": 30750 + }, + { + "epoch": 0.4236646996519897, + "grad_norm": 0.18366913497447968, + "learning_rate": 0.0001, + "loss": 1.8045, + "step": 30800 + }, + { + "epoch": 0.4243524670215546, + "grad_norm": 0.15379434823989868, + "learning_rate": 0.0001, + "loss": 1.808, + "step": 30850 + }, + { + "epoch": 0.42504023439111954, + "grad_norm": 0.17815300822257996, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 30900 + }, + { + "epoch": 0.4257280017606845, + "grad_norm": 0.17477139830589294, + "learning_rate": 0.0001, + "loss": 1.8106, + "step": 30950 + }, + { + "epoch": 0.42641576913024937, + "grad_norm": 0.18266303837299347, + "learning_rate": 0.0001, + "loss": 1.8089, + "step": 31000 + }, + { + "epoch": 0.4271035364998143, + "grad_norm": 0.17377638816833496, + "learning_rate": 0.0001, + "loss": 1.808, + "step": 31050 + }, + { + "epoch": 0.42779130386937925, + "grad_norm": 0.16105225682258606, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 31100 + }, + { + "epoch": 0.42847907123894413, + "grad_norm": 0.16976149380207062, + "learning_rate": 0.0001, + "loss": 1.8108, + "step": 31150 + }, + { + "epoch": 0.42916683860850907, + "grad_norm": 0.1994379609823227, + "learning_rate": 0.0001, + "loss": 1.8103, + "step": 31200 + }, + { + "epoch": 0.42985460597807396, + "grad_norm": 0.1827680766582489, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 31250 + }, + { + "epoch": 0.4305423733476389, + "grad_norm": 0.17883870005607605, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 31300 + }, + { + "epoch": 0.43123014071720384, + "grad_norm": 0.1809430867433548, + "learning_rate": 0.0001, + "loss": 1.8105, + "step": 31350 + }, + { + "epoch": 0.4319179080867687, + "grad_norm": 0.15287983417510986, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 31400 + }, + { + "epoch": 0.43260567545633366, + "grad_norm": 0.1845768690109253, + "learning_rate": 0.0001, + "loss": 1.8044, + "step": 31450 + }, + { + "epoch": 0.43329344282589854, + "grad_norm": 0.15448009967803955, + "learning_rate": 0.0001, + "loss": 1.8074, + "step": 31500 + }, + { + "epoch": 0.4339812101954635, + "grad_norm": 0.16838380694389343, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 31550 + }, + { + "epoch": 0.4346689775650284, + "grad_norm": 0.16129769384860992, + "learning_rate": 0.0001, + "loss": 1.8103, + "step": 31600 + }, + { + "epoch": 0.4353567449345933, + "grad_norm": 0.16702227294445038, + "learning_rate": 0.0001, + "loss": 1.81, + "step": 31650 + }, + { + "epoch": 0.43604451230415825, + "grad_norm": 0.1646498441696167, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 31700 + }, + { + "epoch": 0.43673227967372313, + "grad_norm": 0.1929212510585785, + "learning_rate": 0.0001, + "loss": 1.8073, + "step": 31750 + }, + { + "epoch": 0.4374200470432881, + "grad_norm": 0.1728442758321762, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 31800 + }, + { + "epoch": 0.438107814412853, + "grad_norm": 0.15660201013088226, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 31850 + }, + { + "epoch": 0.4387955817824179, + "grad_norm": 0.1685377061367035, + "learning_rate": 0.0001, + "loss": 1.8079, + "step": 31900 + }, + { + "epoch": 0.43948334915198284, + "grad_norm": 0.18124371767044067, + "learning_rate": 0.0001, + "loss": 1.8042, + "step": 31950 + }, + { + "epoch": 0.4401711165215478, + "grad_norm": 0.18348287045955658, + "learning_rate": 0.0001, + "loss": 1.809, + "step": 32000 + }, + { + "epoch": 0.44085888389111266, + "grad_norm": 0.17936021089553833, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 32050 + }, + { + "epoch": 0.4415466512606776, + "grad_norm": 0.17418572306632996, + "learning_rate": 0.0001, + "loss": 1.8075, + "step": 32100 + }, + { + "epoch": 0.4422344186302425, + "grad_norm": 0.16956304013729095, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 32150 + }, + { + "epoch": 0.4429221859998074, + "grad_norm": 0.18142879009246826, + "learning_rate": 0.0001, + "loss": 1.8053, + "step": 32200 + }, + { + "epoch": 0.44360995336937237, + "grad_norm": 0.17536590993404388, + "learning_rate": 0.0001, + "loss": 1.8055, + "step": 32250 + }, + { + "epoch": 0.44429772073893725, + "grad_norm": 0.18276521563529968, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 32300 + }, + { + "epoch": 0.4449854881085022, + "grad_norm": 0.15810468792915344, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 32350 + }, + { + "epoch": 0.4456732554780671, + "grad_norm": 0.17224664986133575, + "learning_rate": 0.0001, + "loss": 1.8077, + "step": 32400 + }, + { + "epoch": 0.446361022847632, + "grad_norm": 0.17988136410713196, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 32450 + }, + { + "epoch": 0.44704879021719696, + "grad_norm": 0.16269569098949432, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 32500 + }, + { + "epoch": 0.44773655758676184, + "grad_norm": 0.1897774487733841, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 32550 + }, + { + "epoch": 0.4484243249563268, + "grad_norm": 0.17675265669822693, + "learning_rate": 0.0001, + "loss": 1.8053, + "step": 32600 + }, + { + "epoch": 0.44911209232589167, + "grad_norm": 0.1847987174987793, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 32650 + }, + { + "epoch": 0.4497998596954566, + "grad_norm": 0.16706308722496033, + "learning_rate": 0.0001, + "loss": 1.8072, + "step": 32700 + }, + { + "epoch": 0.45048762706502155, + "grad_norm": 0.19702313840389252, + "learning_rate": 0.0001, + "loss": 1.8092, + "step": 32750 + }, + { + "epoch": 0.45117539443458643, + "grad_norm": 0.17378373444080353, + "learning_rate": 0.0001, + "loss": 1.8069, + "step": 32800 + }, + { + "epoch": 0.45186316180415137, + "grad_norm": 0.15358635783195496, + "learning_rate": 0.0001, + "loss": 1.8042, + "step": 32850 + }, + { + "epoch": 0.4525509291737163, + "grad_norm": 0.16188420355319977, + "learning_rate": 0.0001, + "loss": 1.8046, + "step": 32900 + }, + { + "epoch": 0.4532386965432812, + "grad_norm": 0.15988096594810486, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 32950 + }, + { + "epoch": 0.45392646391284613, + "grad_norm": 0.17328138649463654, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 33000 + }, + { + "epoch": 0.454614231282411, + "grad_norm": 0.18192242085933685, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 33050 + }, + { + "epoch": 0.45530199865197596, + "grad_norm": 0.18269090354442596, + "learning_rate": 0.0001, + "loss": 1.8086, + "step": 33100 + }, + { + "epoch": 0.4559897660215409, + "grad_norm": 0.1573922038078308, + "learning_rate": 0.0001, + "loss": 1.8054, + "step": 33150 + }, + { + "epoch": 0.4566775333911058, + "grad_norm": 0.20478671789169312, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 33200 + }, + { + "epoch": 0.4573653007606707, + "grad_norm": 0.17149974405765533, + "learning_rate": 0.0001, + "loss": 1.8045, + "step": 33250 + }, + { + "epoch": 0.4580530681302356, + "grad_norm": 0.1575038731098175, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 33300 + }, + { + "epoch": 0.45874083549980055, + "grad_norm": 0.1684975028038025, + "learning_rate": 0.0001, + "loss": 1.8036, + "step": 33350 + }, + { + "epoch": 0.4594286028693655, + "grad_norm": 0.17977888882160187, + "learning_rate": 0.0001, + "loss": 1.8058, + "step": 33400 + }, + { + "epoch": 0.46011637023893037, + "grad_norm": 0.1595628559589386, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 33450 + }, + { + "epoch": 0.4608041376084953, + "grad_norm": 0.17325359582901, + "learning_rate": 0.0001, + "loss": 1.8036, + "step": 33500 + }, + { + "epoch": 0.4614919049780602, + "grad_norm": 0.1705903857946396, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 33550 + }, + { + "epoch": 0.46217967234762514, + "grad_norm": 0.1714329570531845, + "learning_rate": 0.0001, + "loss": 1.8042, + "step": 33600 + }, + { + "epoch": 0.4628674397171901, + "grad_norm": 0.17674137651920319, + "learning_rate": 0.0001, + "loss": 1.8067, + "step": 33650 + }, + { + "epoch": 0.46355520708675496, + "grad_norm": 0.1605982631444931, + "learning_rate": 0.0001, + "loss": 1.8059, + "step": 33700 + }, + { + "epoch": 0.4642429744563199, + "grad_norm": 0.17221522331237793, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 33750 + }, + { + "epoch": 0.4649307418258848, + "grad_norm": 0.17648015916347504, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 33800 + }, + { + "epoch": 0.4656185091954497, + "grad_norm": 0.16860069334506989, + "learning_rate": 0.0001, + "loss": 1.8064, + "step": 33850 + }, + { + "epoch": 0.46630627656501467, + "grad_norm": 0.19352567195892334, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 33900 + }, + { + "epoch": 0.46699404393457955, + "grad_norm": 0.1634499430656433, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 33950 + }, + { + "epoch": 0.4676818113041445, + "grad_norm": 0.1790640950202942, + "learning_rate": 0.0001, + "loss": 1.8075, + "step": 34000 + }, + { + "epoch": 0.46836957867370943, + "grad_norm": 0.16731584072113037, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 34050 + }, + { + "epoch": 0.4690573460432743, + "grad_norm": 0.17351976037025452, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 34100 + }, + { + "epoch": 0.46974511341283925, + "grad_norm": 0.18717612326145172, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 34150 + }, + { + "epoch": 0.47043288078240414, + "grad_norm": 0.18829597532749176, + "learning_rate": 0.0001, + "loss": 1.8018, + "step": 34200 + }, + { + "epoch": 0.4711206481519691, + "grad_norm": 0.16731028258800507, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 34250 + }, + { + "epoch": 0.471808415521534, + "grad_norm": 0.17419900000095367, + "learning_rate": 0.0001, + "loss": 1.8009, + "step": 34300 + }, + { + "epoch": 0.4724961828910989, + "grad_norm": 0.16232840716838837, + "learning_rate": 0.0001, + "loss": 1.8048, + "step": 34350 + }, + { + "epoch": 0.47318395026066384, + "grad_norm": 0.1557988077402115, + "learning_rate": 0.0001, + "loss": 1.8021, + "step": 34400 + }, + { + "epoch": 0.47387171763022873, + "grad_norm": 0.18441712856292725, + "learning_rate": 0.0001, + "loss": 1.8066, + "step": 34450 + }, + { + "epoch": 0.47455948499979367, + "grad_norm": 0.1681167036294937, + "learning_rate": 0.0001, + "loss": 1.8035, + "step": 34500 + }, + { + "epoch": 0.4752472523693586, + "grad_norm": 0.1694021373987198, + "learning_rate": 0.0001, + "loss": 1.8056, + "step": 34550 + }, + { + "epoch": 0.4759350197389235, + "grad_norm": 0.16909408569335938, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 34600 + }, + { + "epoch": 0.47662278710848843, + "grad_norm": 0.18573597073554993, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 34650 + }, + { + "epoch": 0.4773105544780533, + "grad_norm": 0.1591121107339859, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 34700 + }, + { + "epoch": 0.47799832184761826, + "grad_norm": 0.16243012249469757, + "learning_rate": 0.0001, + "loss": 1.809, + "step": 34750 + }, + { + "epoch": 0.4786860892171832, + "grad_norm": 0.1876152753829956, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 34800 + }, + { + "epoch": 0.4793738565867481, + "grad_norm": 0.160101518034935, + "learning_rate": 0.0001, + "loss": 1.8041, + "step": 34850 + }, + { + "epoch": 0.480061623956313, + "grad_norm": 0.17508384585380554, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 34900 + }, + { + "epoch": 0.48074939132587796, + "grad_norm": 0.16169220209121704, + "learning_rate": 0.0001, + "loss": 1.8045, + "step": 34950 + }, + { + "epoch": 0.48143715869544285, + "grad_norm": 0.17065638303756714, + "learning_rate": 0.0001, + "loss": 1.8046, + "step": 35000 + }, + { + "epoch": 0.4821249260650078, + "grad_norm": 0.16137543320655823, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 35050 + }, + { + "epoch": 0.48281269343457267, + "grad_norm": 0.1716589331626892, + "learning_rate": 0.0001, + "loss": 1.8065, + "step": 35100 + }, + { + "epoch": 0.4835004608041376, + "grad_norm": 0.16750770807266235, + "learning_rate": 0.0001, + "loss": 1.8069, + "step": 35150 + }, + { + "epoch": 0.48418822817370255, + "grad_norm": 0.1668424755334854, + "learning_rate": 0.0001, + "loss": 1.8045, + "step": 35200 + }, + { + "epoch": 0.48487599554326744, + "grad_norm": 0.1577017605304718, + "learning_rate": 0.0001, + "loss": 1.8015, + "step": 35250 + }, + { + "epoch": 0.4855637629128324, + "grad_norm": 0.16916392743587494, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 35300 + }, + { + "epoch": 0.48625153028239726, + "grad_norm": 0.16878165304660797, + "learning_rate": 0.0001, + "loss": 1.8049, + "step": 35350 + }, + { + "epoch": 0.4869392976519622, + "grad_norm": 0.1834115982055664, + "learning_rate": 0.0001, + "loss": 1.8024, + "step": 35400 + }, + { + "epoch": 0.48762706502152714, + "grad_norm": 0.16310469806194305, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 35450 + }, + { + "epoch": 0.488314832391092, + "grad_norm": 0.17430266737937927, + "learning_rate": 0.0001, + "loss": 1.8017, + "step": 35500 + }, + { + "epoch": 0.48900259976065696, + "grad_norm": 0.20293480157852173, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 35550 + }, + { + "epoch": 0.48969036713022185, + "grad_norm": 0.16140292584896088, + "learning_rate": 0.0001, + "loss": 1.8012, + "step": 35600 + }, + { + "epoch": 0.4903781344997868, + "grad_norm": 0.15472573041915894, + "learning_rate": 0.0001, + "loss": 1.804, + "step": 35650 + }, + { + "epoch": 0.49106590186935173, + "grad_norm": 0.19431902468204498, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 35700 + }, + { + "epoch": 0.4917536692389166, + "grad_norm": 0.1693229377269745, + "learning_rate": 0.0001, + "loss": 1.8004, + "step": 35750 + }, + { + "epoch": 0.49244143660848155, + "grad_norm": 0.19499187171459198, + "learning_rate": 0.0001, + "loss": 1.8035, + "step": 35800 + }, + { + "epoch": 0.4931292039780465, + "grad_norm": 0.16046124696731567, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 35850 + }, + { + "epoch": 0.4938169713476114, + "grad_norm": 0.17743340134620667, + "learning_rate": 0.0001, + "loss": 1.8038, + "step": 35900 + }, + { + "epoch": 0.4945047387171763, + "grad_norm": 0.20568375289440155, + "learning_rate": 0.0001, + "loss": 1.8039, + "step": 35950 + }, + { + "epoch": 0.4951925060867412, + "grad_norm": 0.1706654578447342, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 36000 + }, + { + "epoch": 0.49588027345630614, + "grad_norm": 0.17956335842609406, + "learning_rate": 0.0001, + "loss": 1.8038, + "step": 36050 + }, + { + "epoch": 0.4965680408258711, + "grad_norm": 0.1683945506811142, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 36100 + }, + { + "epoch": 0.49725580819543597, + "grad_norm": 0.16132575273513794, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 36150 + }, + { + "epoch": 0.4979435755650009, + "grad_norm": 0.15439482033252716, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 36200 + }, + { + "epoch": 0.4986313429345658, + "grad_norm": 0.17427167296409607, + "learning_rate": 0.0001, + "loss": 1.803, + "step": 36250 + }, + { + "epoch": 0.49931911030413073, + "grad_norm": 0.1826677918434143, + "learning_rate": 0.0001, + "loss": 1.8041, + "step": 36300 + }, + { + "epoch": 0.5000068776736957, + "grad_norm": 0.1664198338985443, + "learning_rate": 0.0001, + "loss": 1.8007, + "step": 36350 + }, + { + "epoch": 0.5006946450432606, + "grad_norm": 0.19743186235427856, + "learning_rate": 0.0001, + "loss": 1.8009, + "step": 36400 + }, + { + "epoch": 0.5013824124128254, + "grad_norm": 0.17416580021381378, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 36450 + }, + { + "epoch": 0.5020701797823904, + "grad_norm": 0.16447678208351135, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 36500 + }, + { + "epoch": 0.5027579471519553, + "grad_norm": 0.19978569447994232, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 36550 + }, + { + "epoch": 0.5034457145215202, + "grad_norm": 0.1768701672554016, + "learning_rate": 0.0001, + "loss": 1.8018, + "step": 36600 + }, + { + "epoch": 0.5041334818910852, + "grad_norm": 0.17458416521549225, + "learning_rate": 0.0001, + "loss": 1.8031, + "step": 36650 + }, + { + "epoch": 0.5048212492606501, + "grad_norm": 0.15409618616104126, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 36700 + }, + { + "epoch": 0.505509016630215, + "grad_norm": 0.20529916882514954, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 36750 + }, + { + "epoch": 0.50619678399978, + "grad_norm": 0.1579432338476181, + "learning_rate": 0.0001, + "loss": 1.806, + "step": 36800 + }, + { + "epoch": 0.5068845513693448, + "grad_norm": 0.16803112626075745, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 36850 + }, + { + "epoch": 0.5075723187389097, + "grad_norm": 0.19382716715335846, + "learning_rate": 0.0001, + "loss": 1.8029, + "step": 36900 + }, + { + "epoch": 0.5082600861084746, + "grad_norm": 0.17823243141174316, + "learning_rate": 0.0001, + "loss": 1.8035, + "step": 36950 + }, + { + "epoch": 0.5089478534780396, + "grad_norm": 0.1742970496416092, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 37000 + }, + { + "epoch": 0.5096356208476045, + "grad_norm": 0.17236186563968658, + "learning_rate": 0.0001, + "loss": 1.8062, + "step": 37050 + }, + { + "epoch": 0.5103233882171694, + "grad_norm": 0.1705719381570816, + "learning_rate": 0.0001, + "loss": 1.8052, + "step": 37100 + }, + { + "epoch": 0.5110111555867344, + "grad_norm": 0.19941222667694092, + "learning_rate": 0.0001, + "loss": 1.8011, + "step": 37150 + }, + { + "epoch": 0.5116989229562993, + "grad_norm": 0.16263477504253387, + "learning_rate": 0.0001, + "loss": 1.8026, + "step": 37200 + }, + { + "epoch": 0.5123866903258641, + "grad_norm": 0.15199637413024902, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 37250 + }, + { + "epoch": 0.5130744576954291, + "grad_norm": 0.16797873377799988, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 37300 + }, + { + "epoch": 0.513762225064994, + "grad_norm": 0.16336190700531006, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 37350 + }, + { + "epoch": 0.5144499924345589, + "grad_norm": 0.16497831046581268, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 37400 + }, + { + "epoch": 0.5151377598041239, + "grad_norm": 0.1712917536497116, + "learning_rate": 0.0001, + "loss": 1.804, + "step": 37450 + }, + { + "epoch": 0.5158255271736888, + "grad_norm": 0.16597513854503632, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 37500 + }, + { + "epoch": 0.5165132945432537, + "grad_norm": 0.15661810338497162, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 37550 + }, + { + "epoch": 0.5172010619128186, + "grad_norm": 0.17713536322116852, + "learning_rate": 0.0001, + "loss": 1.7973, + "step": 37600 + }, + { + "epoch": 0.5178888292823836, + "grad_norm": 0.15873874723911285, + "learning_rate": 0.0001, + "loss": 1.8003, + "step": 37650 + }, + { + "epoch": 0.5185765966519484, + "grad_norm": 0.1784040331840515, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 37700 + }, + { + "epoch": 0.5192643640215133, + "grad_norm": 0.16135090589523315, + "learning_rate": 0.0001, + "loss": 1.8082, + "step": 37750 + }, + { + "epoch": 0.5199521313910783, + "grad_norm": 0.15565833449363708, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 37800 + }, + { + "epoch": 0.5206398987606432, + "grad_norm": 0.1711311787366867, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 37850 + }, + { + "epoch": 0.5213276661302081, + "grad_norm": 0.17314565181732178, + "learning_rate": 0.0001, + "loss": 1.7997, + "step": 37900 + }, + { + "epoch": 0.5220154334997731, + "grad_norm": 0.1723901331424713, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 37950 + }, + { + "epoch": 0.522703200869338, + "grad_norm": 0.15868623554706573, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 38000 + }, + { + "epoch": 0.5233909682389029, + "grad_norm": 0.17163942754268646, + "learning_rate": 0.0001, + "loss": 1.7991, + "step": 38050 + }, + { + "epoch": 0.5240787356084677, + "grad_norm": 0.17622709274291992, + "learning_rate": 0.0001, + "loss": 1.8027, + "step": 38100 + }, + { + "epoch": 0.5247665029780327, + "grad_norm": 0.1616000235080719, + "learning_rate": 0.0001, + "loss": 1.7993, + "step": 38150 + }, + { + "epoch": 0.5254542703475976, + "grad_norm": 0.1638936698436737, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 38200 + }, + { + "epoch": 0.5261420377171625, + "grad_norm": 0.1706729531288147, + "learning_rate": 0.0001, + "loss": 1.7999, + "step": 38250 + }, + { + "epoch": 0.5268298050867275, + "grad_norm": 0.2048814296722412, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 38300 + }, + { + "epoch": 0.5275175724562924, + "grad_norm": 0.15826106071472168, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 38350 + }, + { + "epoch": 0.5282053398258573, + "grad_norm": 0.16068226099014282, + "learning_rate": 0.0001, + "loss": 1.8032, + "step": 38400 + }, + { + "epoch": 0.5288931071954223, + "grad_norm": 0.17855240404605865, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 38450 + }, + { + "epoch": 0.5295808745649871, + "grad_norm": 0.16978466510772705, + "learning_rate": 0.0001, + "loss": 1.8022, + "step": 38500 + }, + { + "epoch": 0.530268641934552, + "grad_norm": 0.1745109260082245, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 38550 + }, + { + "epoch": 0.530956409304117, + "grad_norm": 0.1952807605266571, + "learning_rate": 0.0001, + "loss": 1.7977, + "step": 38600 + }, + { + "epoch": 0.5316441766736819, + "grad_norm": 0.1846735179424286, + "learning_rate": 0.0001, + "loss": 1.8033, + "step": 38650 + }, + { + "epoch": 0.5323319440432468, + "grad_norm": 0.17474836111068726, + "learning_rate": 0.0001, + "loss": 1.8034, + "step": 38700 + }, + { + "epoch": 0.5330197114128117, + "grad_norm": 0.1729106903076172, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 38750 + }, + { + "epoch": 0.5337074787823767, + "grad_norm": 0.18584811687469482, + "learning_rate": 0.0001, + "loss": 1.805, + "step": 38800 + }, + { + "epoch": 0.5343952461519416, + "grad_norm": 0.15596157312393188, + "learning_rate": 0.0001, + "loss": 1.8014, + "step": 38850 + }, + { + "epoch": 0.5350830135215064, + "grad_norm": 0.15528340637683868, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 38900 + }, + { + "epoch": 0.5357707808910714, + "grad_norm": 0.1738685965538025, + "learning_rate": 0.0001, + "loss": 1.8003, + "step": 38950 + }, + { + "epoch": 0.5364585482606363, + "grad_norm": 0.1620347946882248, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 39000 + }, + { + "epoch": 0.5371463156302012, + "grad_norm": 0.1705981343984604, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 39050 + }, + { + "epoch": 0.5378340829997662, + "grad_norm": 0.16167068481445312, + "learning_rate": 0.0001, + "loss": 1.8037, + "step": 39100 + }, + { + "epoch": 0.5385218503693311, + "grad_norm": 0.15977101027965546, + "learning_rate": 0.0001, + "loss": 1.8043, + "step": 39150 + }, + { + "epoch": 0.539209617738896, + "grad_norm": 0.1699797809123993, + "learning_rate": 0.0001, + "loss": 1.8025, + "step": 39200 + }, + { + "epoch": 0.5398973851084609, + "grad_norm": 0.17108047008514404, + "learning_rate": 0.0001, + "loss": 1.7999, + "step": 39250 + }, + { + "epoch": 0.5405851524780259, + "grad_norm": 0.1756991147994995, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 39300 + }, + { + "epoch": 0.5412729198475907, + "grad_norm": 0.1716366708278656, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 39350 + }, + { + "epoch": 0.5419606872171556, + "grad_norm": 0.16876575350761414, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 39400 + }, + { + "epoch": 0.5426484545867206, + "grad_norm": 0.1650577336549759, + "learning_rate": 0.0001, + "loss": 1.8001, + "step": 39450 + }, + { + "epoch": 0.5433362219562855, + "grad_norm": 0.17242754995822906, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 39500 + }, + { + "epoch": 0.5440239893258504, + "grad_norm": 0.16941705346107483, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 39550 + }, + { + "epoch": 0.5447117566954154, + "grad_norm": 0.21036018431186676, + "learning_rate": 0.0001, + "loss": 1.802, + "step": 39600 + }, + { + "epoch": 0.5453995240649803, + "grad_norm": 0.16824571788311005, + "learning_rate": 0.0001, + "loss": 1.7992, + "step": 39650 + }, + { + "epoch": 0.5460872914345452, + "grad_norm": 0.162497416138649, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 39700 + }, + { + "epoch": 0.5467750588041101, + "grad_norm": 0.18297506868839264, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 39750 + }, + { + "epoch": 0.547462826173675, + "grad_norm": 0.15444135665893555, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 39800 + }, + { + "epoch": 0.5481505935432399, + "grad_norm": 0.17254306375980377, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 39850 + }, + { + "epoch": 0.5488383609128048, + "grad_norm": 0.18030798435211182, + "learning_rate": 0.0001, + "loss": 1.8008, + "step": 39900 + }, + { + "epoch": 0.5495261282823698, + "grad_norm": 0.18069452047348022, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 39950 + }, + { + "epoch": 0.5502138956519347, + "grad_norm": 0.16256502270698547, + "learning_rate": 0.0001, + "loss": 1.8019, + "step": 40000 + }, + { + "epoch": 0.5509016630214996, + "grad_norm": 0.16416381299495697, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 40050 + }, + { + "epoch": 0.5515894303910646, + "grad_norm": 0.1743890941143036, + "learning_rate": 0.0001, + "loss": 1.7966, + "step": 40100 + }, + { + "epoch": 0.5522771977606294, + "grad_norm": 0.1875494122505188, + "learning_rate": 0.0001, + "loss": 1.799, + "step": 40150 + }, + { + "epoch": 0.5529649651301943, + "grad_norm": 0.18323060870170593, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 40200 + }, + { + "epoch": 0.5536527324997593, + "grad_norm": 0.1552455574274063, + "learning_rate": 0.0001, + "loss": 1.799, + "step": 40250 + }, + { + "epoch": 0.5543404998693242, + "grad_norm": 0.1685846745967865, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 40300 + }, + { + "epoch": 0.5550282672388891, + "grad_norm": 0.16371703147888184, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 40350 + }, + { + "epoch": 0.5557160346084541, + "grad_norm": 0.17993508279323578, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 40400 + }, + { + "epoch": 0.556403801978019, + "grad_norm": 0.17061980068683624, + "learning_rate": 0.0001, + "loss": 1.7954, + "step": 40450 + }, + { + "epoch": 0.5570915693475839, + "grad_norm": 0.17588096857070923, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 40500 + }, + { + "epoch": 0.5577793367171487, + "grad_norm": 0.16484741866588593, + "learning_rate": 0.0001, + "loss": 1.7959, + "step": 40550 + }, + { + "epoch": 0.5584671040867137, + "grad_norm": 0.1812593787908554, + "learning_rate": 0.0001, + "loss": 1.801, + "step": 40600 + }, + { + "epoch": 0.5591548714562786, + "grad_norm": 0.17755167186260223, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 40650 + }, + { + "epoch": 0.5598426388258435, + "grad_norm": 0.16877087950706482, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 40700 + }, + { + "epoch": 0.5605304061954085, + "grad_norm": 0.15780018270015717, + "learning_rate": 0.0001, + "loss": 1.7967, + "step": 40750 + }, + { + "epoch": 0.5612181735649734, + "grad_norm": 0.15145239233970642, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 40800 + }, + { + "epoch": 0.5619059409345383, + "grad_norm": 0.18385986983776093, + "learning_rate": 0.0001, + "loss": 1.7965, + "step": 40850 + }, + { + "epoch": 0.5625937083041033, + "grad_norm": 0.15375161170959473, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 40900 + }, + { + "epoch": 0.5632814756736682, + "grad_norm": 0.15694858133792877, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 40950 + }, + { + "epoch": 0.563969243043233, + "grad_norm": 0.1538461446762085, + "learning_rate": 0.0001, + "loss": 1.7965, + "step": 41000 + }, + { + "epoch": 0.5646570104127979, + "grad_norm": 0.16211877763271332, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 41050 + }, + { + "epoch": 0.5653447777823629, + "grad_norm": 0.1737697869539261, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 41100 + }, + { + "epoch": 0.5660325451519278, + "grad_norm": 0.1610105037689209, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 41150 + }, + { + "epoch": 0.5667203125214927, + "grad_norm": 0.1762542873620987, + "learning_rate": 0.0001, + "loss": 1.7991, + "step": 41200 + }, + { + "epoch": 0.5674080798910577, + "grad_norm": 0.16195493936538696, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 41250 + }, + { + "epoch": 0.5680958472606226, + "grad_norm": 0.18047676980495453, + "learning_rate": 0.0001, + "loss": 1.7962, + "step": 41300 + }, + { + "epoch": 0.5687836146301875, + "grad_norm": 0.18760687112808228, + "learning_rate": 0.0001, + "loss": 1.8, + "step": 41350 + }, + { + "epoch": 0.5694713819997524, + "grad_norm": 0.17012238502502441, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 41400 + }, + { + "epoch": 0.5701591493693173, + "grad_norm": 0.1699533313512802, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 41450 + }, + { + "epoch": 0.5708469167388822, + "grad_norm": 0.16422894597053528, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 41500 + }, + { + "epoch": 0.5715346841084472, + "grad_norm": 0.17526569962501526, + "learning_rate": 0.0001, + "loss": 1.7967, + "step": 41550 + }, + { + "epoch": 0.5722224514780121, + "grad_norm": 0.158601313829422, + "learning_rate": 0.0001, + "loss": 1.8006, + "step": 41600 + }, + { + "epoch": 0.572910218847577, + "grad_norm": 0.1562766283750534, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 41650 + }, + { + "epoch": 0.5735979862171419, + "grad_norm": 0.15490677952766418, + "learning_rate": 0.0001, + "loss": 1.8017, + "step": 41700 + }, + { + "epoch": 0.5742857535867069, + "grad_norm": 0.17004509270191193, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 41750 + }, + { + "epoch": 0.5749735209562717, + "grad_norm": 0.17213889956474304, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 41800 + }, + { + "epoch": 0.5756612883258366, + "grad_norm": 0.17541930079460144, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 41850 + }, + { + "epoch": 0.5763490556954016, + "grad_norm": 0.18296034634113312, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 41900 + }, + { + "epoch": 0.5770368230649665, + "grad_norm": 0.1777525097131729, + "learning_rate": 0.0001, + "loss": 1.7959, + "step": 41950 + }, + { + "epoch": 0.5777245904345314, + "grad_norm": 0.17678572237491608, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 42000 + }, + { + "epoch": 0.5784123578040964, + "grad_norm": 0.1763673573732376, + "learning_rate": 0.0001, + "loss": 1.8004, + "step": 42050 + }, + { + "epoch": 0.5791001251736613, + "grad_norm": 0.18608896434307098, + "learning_rate": 0.0001, + "loss": 1.7997, + "step": 42100 + }, + { + "epoch": 0.5797878925432262, + "grad_norm": 0.1691625863313675, + "learning_rate": 0.0001, + "loss": 1.7988, + "step": 42150 + }, + { + "epoch": 0.580475659912791, + "grad_norm": 0.1609441488981247, + "learning_rate": 0.0001, + "loss": 1.7993, + "step": 42200 + }, + { + "epoch": 0.581163427282356, + "grad_norm": 0.15776963531970978, + "learning_rate": 0.0001, + "loss": 1.7994, + "step": 42250 + }, + { + "epoch": 0.5818511946519209, + "grad_norm": 0.20214344561100006, + "learning_rate": 0.0001, + "loss": 1.7998, + "step": 42300 + }, + { + "epoch": 0.5825389620214858, + "grad_norm": 0.18112723529338837, + "learning_rate": 0.0001, + "loss": 1.8, + "step": 42350 + }, + { + "epoch": 0.5832267293910508, + "grad_norm": 0.1543450802564621, + "learning_rate": 0.0001, + "loss": 1.7982, + "step": 42400 + }, + { + "epoch": 0.5839144967606157, + "grad_norm": 0.15315985679626465, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 42450 + }, + { + "epoch": 0.5846022641301806, + "grad_norm": 0.16166909039020538, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 42500 + }, + { + "epoch": 0.5852900314997456, + "grad_norm": 0.15933014452457428, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 42550 + }, + { + "epoch": 0.5859777988693105, + "grad_norm": 0.15434689819812775, + "learning_rate": 0.0001, + "loss": 1.797, + "step": 42600 + }, + { + "epoch": 0.5866655662388753, + "grad_norm": 0.1875755488872528, + "learning_rate": 0.0001, + "loss": 1.7964, + "step": 42650 + }, + { + "epoch": 0.5873533336084403, + "grad_norm": 0.15559327602386475, + "learning_rate": 0.0001, + "loss": 1.7997, + "step": 42700 + }, + { + "epoch": 0.5880411009780052, + "grad_norm": 0.16149398684501648, + "learning_rate": 0.0001, + "loss": 1.7956, + "step": 42750 + }, + { + "epoch": 0.5887288683475701, + "grad_norm": 0.1777992695569992, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 42800 + }, + { + "epoch": 0.589416635717135, + "grad_norm": 0.15934714674949646, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 42850 + }, + { + "epoch": 0.5901044030867, + "grad_norm": 0.16847145557403564, + "learning_rate": 0.0001, + "loss": 1.7997, + "step": 42900 + }, + { + "epoch": 0.5907921704562649, + "grad_norm": 0.17410792410373688, + "learning_rate": 0.0001, + "loss": 1.7999, + "step": 42950 + }, + { + "epoch": 0.5914799378258297, + "grad_norm": 0.18102861940860748, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 43000 + }, + { + "epoch": 0.5921677051953947, + "grad_norm": 0.1682325005531311, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 43050 + }, + { + "epoch": 0.5928554725649596, + "grad_norm": 0.17732855677604675, + "learning_rate": 0.0001, + "loss": 1.8004, + "step": 43100 + }, + { + "epoch": 0.5935432399345245, + "grad_norm": 0.16327179968357086, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 43150 + }, + { + "epoch": 0.5942310073040895, + "grad_norm": 0.1582539677619934, + "learning_rate": 0.0001, + "loss": 1.798, + "step": 43200 + }, + { + "epoch": 0.5949187746736544, + "grad_norm": 0.14965754747390747, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 43250 + }, + { + "epoch": 0.5956065420432193, + "grad_norm": 0.1617211103439331, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 43300 + }, + { + "epoch": 0.5962943094127843, + "grad_norm": 0.17458325624465942, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 43350 + }, + { + "epoch": 0.5969820767823492, + "grad_norm": 0.1668146252632141, + "learning_rate": 0.0001, + "loss": 1.7983, + "step": 43400 + }, + { + "epoch": 0.597669844151914, + "grad_norm": 0.15414200723171234, + "learning_rate": 0.0001, + "loss": 1.7989, + "step": 43450 + }, + { + "epoch": 0.5983576115214789, + "grad_norm": 0.15912353992462158, + "learning_rate": 0.0001, + "loss": 1.7964, + "step": 43500 + }, + { + "epoch": 0.5990453788910439, + "grad_norm": 0.15936636924743652, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 43550 + }, + { + "epoch": 0.5997331462606088, + "grad_norm": 0.17340709269046783, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 43600 + }, + { + "epoch": 0.6004209136301737, + "grad_norm": 0.18960115313529968, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 43650 + }, + { + "epoch": 0.6011086809997387, + "grad_norm": 0.17091485857963562, + "learning_rate": 0.0001, + "loss": 1.7998, + "step": 43700 + }, + { + "epoch": 0.6017964483693036, + "grad_norm": 0.17222945392131805, + "learning_rate": 0.0001, + "loss": 1.8016, + "step": 43750 + }, + { + "epoch": 0.6024842157388685, + "grad_norm": 0.1608862429857254, + "learning_rate": 0.0001, + "loss": 1.794, + "step": 43800 + }, + { + "epoch": 0.6031719831084335, + "grad_norm": 0.16626954078674316, + "learning_rate": 0.0001, + "loss": 1.7971, + "step": 43850 + }, + { + "epoch": 0.6038597504779983, + "grad_norm": 0.1769898533821106, + "learning_rate": 0.0001, + "loss": 1.7992, + "step": 43900 + }, + { + "epoch": 0.6045475178475632, + "grad_norm": 0.1665075570344925, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 43950 + }, + { + "epoch": 0.6052352852171281, + "grad_norm": 0.1957935094833374, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 44000 + }, + { + "epoch": 0.6059230525866931, + "grad_norm": 0.20066794753074646, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 44050 + }, + { + "epoch": 0.606610819956258, + "grad_norm": 0.16102181375026703, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 44100 + }, + { + "epoch": 0.6072985873258229, + "grad_norm": 0.16587640345096588, + "learning_rate": 0.0001, + "loss": 1.7964, + "step": 44150 + }, + { + "epoch": 0.6079863546953879, + "grad_norm": 0.17338010668754578, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 44200 + }, + { + "epoch": 0.6086741220649527, + "grad_norm": 0.1979152411222458, + "learning_rate": 0.0001, + "loss": 1.7964, + "step": 44250 + }, + { + "epoch": 0.6093618894345176, + "grad_norm": 0.16478174924850464, + "learning_rate": 0.0001, + "loss": 1.8013, + "step": 44300 + }, + { + "epoch": 0.6100496568040826, + "grad_norm": 0.16508819162845612, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 44350 + }, + { + "epoch": 0.6107374241736475, + "grad_norm": 0.15964439511299133, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 44400 + }, + { + "epoch": 0.6114251915432124, + "grad_norm": 0.18116386234760284, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 44450 + }, + { + "epoch": 0.6121129589127774, + "grad_norm": 0.1808495819568634, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 44500 + }, + { + "epoch": 0.6128007262823423, + "grad_norm": 0.1634376347064972, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 44550 + }, + { + "epoch": 0.6134884936519072, + "grad_norm": 0.15140944719314575, + "learning_rate": 0.0001, + "loss": 1.7995, + "step": 44600 + }, + { + "epoch": 0.614176261021472, + "grad_norm": 0.15988072752952576, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 44650 + }, + { + "epoch": 0.614864028391037, + "grad_norm": 0.16280120611190796, + "learning_rate": 0.0001, + "loss": 1.7986, + "step": 44700 + }, + { + "epoch": 0.6155517957606019, + "grad_norm": 0.16643498837947845, + "learning_rate": 0.0001, + "loss": 1.7975, + "step": 44750 + }, + { + "epoch": 0.6162395631301668, + "grad_norm": 0.151467427611351, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 44800 + }, + { + "epoch": 0.6169273304997318, + "grad_norm": 0.1621852070093155, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 44850 + }, + { + "epoch": 0.6176150978692967, + "grad_norm": 0.1828535795211792, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 44900 + }, + { + "epoch": 0.6183028652388616, + "grad_norm": 0.1630941480398178, + "learning_rate": 0.0001, + "loss": 1.7987, + "step": 44950 + }, + { + "epoch": 0.6189906326084266, + "grad_norm": 0.1701328009366989, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 45000 + }, + { + "epoch": 0.6196783999779915, + "grad_norm": 0.16631458699703217, + "learning_rate": 0.0001, + "loss": 1.7985, + "step": 45050 + }, + { + "epoch": 0.6203661673475563, + "grad_norm": 0.17133264243602753, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 45100 + }, + { + "epoch": 0.6210539347171212, + "grad_norm": 0.19388112425804138, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 45150 + }, + { + "epoch": 0.6217417020866862, + "grad_norm": 0.1769258826971054, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 45200 + }, + { + "epoch": 0.6224294694562511, + "grad_norm": 0.21986328065395355, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 45250 + }, + { + "epoch": 0.623117236825816, + "grad_norm": 0.1711747795343399, + "learning_rate": 0.0001, + "loss": 1.7923, + "step": 45300 + }, + { + "epoch": 0.623805004195381, + "grad_norm": 0.1730772852897644, + "learning_rate": 0.0001, + "loss": 1.7976, + "step": 45350 + }, + { + "epoch": 0.6244927715649459, + "grad_norm": 0.16657279431819916, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 45400 + }, + { + "epoch": 0.6251805389345108, + "grad_norm": 0.15675725042819977, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 45450 + }, + { + "epoch": 0.6258683063040757, + "grad_norm": 0.17763769626617432, + "learning_rate": 0.0001, + "loss": 1.7972, + "step": 45500 + }, + { + "epoch": 0.6265560736736406, + "grad_norm": 0.1630527824163437, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 45550 + }, + { + "epoch": 0.6272438410432055, + "grad_norm": 0.16628991067409515, + "learning_rate": 0.0001, + "loss": 1.7959, + "step": 45600 + }, + { + "epoch": 0.6279316084127705, + "grad_norm": 0.1589209884405136, + "learning_rate": 0.0001, + "loss": 1.7949, + "step": 45650 + }, + { + "epoch": 0.6286193757823354, + "grad_norm": 0.17715197801589966, + "learning_rate": 0.0001, + "loss": 1.7971, + "step": 45700 + }, + { + "epoch": 0.6293071431519003, + "grad_norm": 0.1824561059474945, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 45750 + }, + { + "epoch": 0.6299949105214652, + "grad_norm": 0.16866008937358856, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 45800 + }, + { + "epoch": 0.6306826778910302, + "grad_norm": 0.14337721467018127, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 45850 + }, + { + "epoch": 0.631370445260595, + "grad_norm": 0.15916399657726288, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 45900 + }, + { + "epoch": 0.6320582126301599, + "grad_norm": 0.1653524488210678, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 45950 + }, + { + "epoch": 0.6327459799997249, + "grad_norm": 0.1588210016489029, + "learning_rate": 0.0001, + "loss": 1.7963, + "step": 46000 + }, + { + "epoch": 0.6334337473692898, + "grad_norm": 0.16008345782756805, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 46050 + }, + { + "epoch": 0.6341215147388547, + "grad_norm": 0.16054043173789978, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 46100 + }, + { + "epoch": 0.6348092821084197, + "grad_norm": 0.19745290279388428, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 46150 + }, + { + "epoch": 0.6354970494779846, + "grad_norm": 0.18955908715724945, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 46200 + }, + { + "epoch": 0.6361848168475495, + "grad_norm": 0.16962236166000366, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 46250 + }, + { + "epoch": 0.6368725842171145, + "grad_norm": 0.17200341820716858, + "learning_rate": 0.0001, + "loss": 1.7935, + "step": 46300 + }, + { + "epoch": 0.6375603515866793, + "grad_norm": 0.17781908810138702, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 46350 + }, + { + "epoch": 0.6382481189562442, + "grad_norm": 0.17602622509002686, + "learning_rate": 0.0001, + "loss": 1.7945, + "step": 46400 + }, + { + "epoch": 0.6389358863258091, + "grad_norm": 0.1686919629573822, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 46450 + }, + { + "epoch": 0.6396236536953741, + "grad_norm": 0.15013763308525085, + "learning_rate": 0.0001, + "loss": 1.7969, + "step": 46500 + }, + { + "epoch": 0.640311421064939, + "grad_norm": 0.16534103453159332, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 46550 + }, + { + "epoch": 0.6409991884345039, + "grad_norm": 0.16527748107910156, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 46600 + }, + { + "epoch": 0.6416869558040689, + "grad_norm": 0.15024395287036896, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 46650 + }, + { + "epoch": 0.6423747231736338, + "grad_norm": 0.17082852125167847, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 46700 + }, + { + "epoch": 0.6430624905431986, + "grad_norm": 0.1649017482995987, + "learning_rate": 0.0001, + "loss": 1.7936, + "step": 46750 + }, + { + "epoch": 0.6437502579127636, + "grad_norm": 0.16045525670051575, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 46800 + }, + { + "epoch": 0.6444380252823285, + "grad_norm": 0.18290746212005615, + "learning_rate": 0.0001, + "loss": 1.7898, + "step": 46850 + }, + { + "epoch": 0.6451257926518934, + "grad_norm": 0.14731939136981964, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 46900 + }, + { + "epoch": 0.6458135600214583, + "grad_norm": 0.16072627902030945, + "learning_rate": 0.0001, + "loss": 1.7933, + "step": 46950 + }, + { + "epoch": 0.6465013273910233, + "grad_norm": 0.14942970871925354, + "learning_rate": 0.0001, + "loss": 1.7944, + "step": 47000 + }, + { + "epoch": 0.6471890947605882, + "grad_norm": 0.14922235906124115, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 47050 + }, + { + "epoch": 0.647876862130153, + "grad_norm": 0.17120474576950073, + "learning_rate": 0.0001, + "loss": 1.7955, + "step": 47100 + }, + { + "epoch": 0.648564629499718, + "grad_norm": 0.17423823475837708, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 47150 + }, + { + "epoch": 0.6492523968692829, + "grad_norm": 0.1567763239145279, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 47200 + }, + { + "epoch": 0.6499401642388478, + "grad_norm": 0.15817411243915558, + "learning_rate": 0.0001, + "loss": 1.7928, + "step": 47250 + }, + { + "epoch": 0.6506279316084128, + "grad_norm": 0.1748141348361969, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 47300 + }, + { + "epoch": 0.6513156989779777, + "grad_norm": 0.2045951634645462, + "learning_rate": 0.0001, + "loss": 1.7978, + "step": 47350 + }, + { + "epoch": 0.6520034663475426, + "grad_norm": 0.17650052905082703, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 47400 + }, + { + "epoch": 0.6526912337171076, + "grad_norm": 0.17905278503894806, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 47450 + }, + { + "epoch": 0.6533790010866725, + "grad_norm": 0.1599511355161667, + "learning_rate": 0.0001, + "loss": 1.7912, + "step": 47500 + }, + { + "epoch": 0.6540667684562373, + "grad_norm": 0.1584351658821106, + "learning_rate": 0.0001, + "loss": 1.7949, + "step": 47550 + }, + { + "epoch": 0.6547545358258022, + "grad_norm": 0.17251476645469666, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 47600 + }, + { + "epoch": 0.6554423031953672, + "grad_norm": 0.17718471586704254, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 47650 + }, + { + "epoch": 0.6561300705649321, + "grad_norm": 0.15196654200553894, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 47700 + }, + { + "epoch": 0.656817837934497, + "grad_norm": 0.17444145679473877, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 47750 + }, + { + "epoch": 0.657505605304062, + "grad_norm": 0.15149961411952972, + "learning_rate": 0.0001, + "loss": 1.7959, + "step": 47800 + }, + { + "epoch": 0.6581933726736269, + "grad_norm": 0.1591227501630783, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 47850 + }, + { + "epoch": 0.6588811400431918, + "grad_norm": 0.20135171711444855, + "learning_rate": 0.0001, + "loss": 1.7963, + "step": 47900 + }, + { + "epoch": 0.6595689074127568, + "grad_norm": 0.16523614525794983, + "learning_rate": 0.0001, + "loss": 1.7968, + "step": 47950 + }, + { + "epoch": 0.6602566747823216, + "grad_norm": 0.15842151641845703, + "learning_rate": 0.0001, + "loss": 1.7897, + "step": 48000 + }, + { + "epoch": 0.6609444421518865, + "grad_norm": 0.160832479596138, + "learning_rate": 0.0001, + "loss": 1.796, + "step": 48050 + }, + { + "epoch": 0.6616322095214515, + "grad_norm": 0.16063477098941803, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 48100 + }, + { + "epoch": 0.6623199768910164, + "grad_norm": 0.1595107465982437, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 48150 + }, + { + "epoch": 0.6630077442605813, + "grad_norm": 0.18313910067081451, + "learning_rate": 0.0001, + "loss": 1.7957, + "step": 48200 + }, + { + "epoch": 0.6636955116301462, + "grad_norm": 0.17561380565166473, + "learning_rate": 0.0001, + "loss": 1.7906, + "step": 48250 + }, + { + "epoch": 0.6643832789997112, + "grad_norm": 0.18327072262763977, + "learning_rate": 0.0001, + "loss": 1.7916, + "step": 48300 + }, + { + "epoch": 0.665071046369276, + "grad_norm": 0.16745221614837646, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 48350 + }, + { + "epoch": 0.6657588137388409, + "grad_norm": 0.16286319494247437, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 48400 + }, + { + "epoch": 0.6664465811084059, + "grad_norm": 0.15864308178424835, + "learning_rate": 0.0001, + "loss": 1.7953, + "step": 48450 + }, + { + "epoch": 0.6671343484779708, + "grad_norm": 0.16778843104839325, + "learning_rate": 0.0001, + "loss": 1.7945, + "step": 48500 + }, + { + "epoch": 0.6678221158475357, + "grad_norm": 0.1448727399110794, + "learning_rate": 0.0001, + "loss": 1.7942, + "step": 48550 + }, + { + "epoch": 0.6685098832171007, + "grad_norm": 0.16745643317699432, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 48600 + }, + { + "epoch": 0.6691976505866656, + "grad_norm": 0.1633836030960083, + "learning_rate": 0.0001, + "loss": 1.7938, + "step": 48650 + }, + { + "epoch": 0.6698854179562305, + "grad_norm": 0.15037505328655243, + "learning_rate": 0.0001, + "loss": 1.7963, + "step": 48700 + }, + { + "epoch": 0.6705731853257954, + "grad_norm": 0.1707869917154312, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 48750 + }, + { + "epoch": 0.6712609526953603, + "grad_norm": 0.17392534017562866, + "learning_rate": 0.0001, + "loss": 1.7926, + "step": 48800 + }, + { + "epoch": 0.6719487200649252, + "grad_norm": 0.1588422805070877, + "learning_rate": 0.0001, + "loss": 1.7958, + "step": 48850 + }, + { + "epoch": 0.6726364874344901, + "grad_norm": 0.1751549243927002, + "learning_rate": 0.0001, + "loss": 1.7931, + "step": 48900 + }, + { + "epoch": 0.6733242548040551, + "grad_norm": 0.1722249686717987, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 48950 + }, + { + "epoch": 0.67401202217362, + "grad_norm": 0.1673288643360138, + "learning_rate": 0.0001, + "loss": 1.793, + "step": 49000 + }, + { + "epoch": 0.6746997895431849, + "grad_norm": 0.1552770733833313, + "learning_rate": 0.0001, + "loss": 1.7916, + "step": 49050 + }, + { + "epoch": 0.6753875569127499, + "grad_norm": 0.15788178145885468, + "learning_rate": 0.0001, + "loss": 1.7981, + "step": 49100 + }, + { + "epoch": 0.6760753242823148, + "grad_norm": 0.17959725856781006, + "learning_rate": 0.0001, + "loss": 1.7949, + "step": 49150 + }, + { + "epoch": 0.6767630916518796, + "grad_norm": 0.1584416925907135, + "learning_rate": 0.0001, + "loss": 1.7946, + "step": 49200 + }, + { + "epoch": 0.6774508590214446, + "grad_norm": 0.1645151674747467, + "learning_rate": 0.0001, + "loss": 1.7916, + "step": 49250 + }, + { + "epoch": 0.6781386263910095, + "grad_norm": 0.1522347778081894, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 49300 + }, + { + "epoch": 0.6788263937605744, + "grad_norm": 0.16095298528671265, + "learning_rate": 0.0001, + "loss": 1.7927, + "step": 49350 + }, + { + "epoch": 0.6795141611301393, + "grad_norm": 0.15317974984645844, + "learning_rate": 0.0001, + "loss": 1.7947, + "step": 49400 + }, + { + "epoch": 0.6802019284997043, + "grad_norm": 0.16854670643806458, + "learning_rate": 0.0001, + "loss": 1.7929, + "step": 49450 + }, + { + "epoch": 0.6808896958692692, + "grad_norm": 0.1702488660812378, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 49500 + }, + { + "epoch": 0.6815774632388341, + "grad_norm": 0.16388344764709473, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 49550 + }, + { + "epoch": 0.682265230608399, + "grad_norm": 0.16601653397083282, + "learning_rate": 0.0001, + "loss": 1.7949, + "step": 49600 + }, + { + "epoch": 0.6829529979779639, + "grad_norm": 0.17910674214363098, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 49650 + }, + { + "epoch": 0.6836407653475288, + "grad_norm": 0.15689565241336823, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 49700 + }, + { + "epoch": 0.6843285327170938, + "grad_norm": 0.15473750233650208, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 49750 + }, + { + "epoch": 0.6850163000866587, + "grad_norm": 0.16794639825820923, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 49800 + }, + { + "epoch": 0.6857040674562236, + "grad_norm": 0.15183915197849274, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 49850 + }, + { + "epoch": 0.6863918348257885, + "grad_norm": 0.15028232336044312, + "learning_rate": 0.0001, + "loss": 1.7929, + "step": 49900 + }, + { + "epoch": 0.6870796021953535, + "grad_norm": 0.16230390965938568, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 49950 + }, + { + "epoch": 0.6877673695649184, + "grad_norm": 0.16958658397197723, + "learning_rate": 0.0001, + "loss": 1.7932, + "step": 50000 + }, + { + "epoch": 0.6884551369344832, + "grad_norm": 0.15662765502929688, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 50050 + }, + { + "epoch": 0.6891429043040482, + "grad_norm": 0.17507807910442352, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 50100 + }, + { + "epoch": 0.6898306716736131, + "grad_norm": 0.16449585556983948, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 50150 + }, + { + "epoch": 0.690518439043178, + "grad_norm": 0.17615753412246704, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 50200 + }, + { + "epoch": 0.691206206412743, + "grad_norm": 0.16010646522045135, + "learning_rate": 0.0001, + "loss": 1.7932, + "step": 50250 + }, + { + "epoch": 0.6918939737823079, + "grad_norm": 0.14614787697792053, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 50300 + }, + { + "epoch": 0.6925817411518728, + "grad_norm": 0.19960370659828186, + "learning_rate": 0.0001, + "loss": 1.7907, + "step": 50350 + }, + { + "epoch": 0.6932695085214378, + "grad_norm": 0.16230808198451996, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 50400 + }, + { + "epoch": 0.6939572758910026, + "grad_norm": 0.16344518959522247, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 50450 + }, + { + "epoch": 0.6946450432605675, + "grad_norm": 0.16584964096546173, + "learning_rate": 0.0001, + "loss": 1.7916, + "step": 50500 + }, + { + "epoch": 0.6953328106301324, + "grad_norm": 0.15551120042800903, + "learning_rate": 0.0001, + "loss": 1.7948, + "step": 50550 + }, + { + "epoch": 0.6960205779996974, + "grad_norm": 0.1697503924369812, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 50600 + }, + { + "epoch": 0.6967083453692623, + "grad_norm": 0.15577536821365356, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 50650 + }, + { + "epoch": 0.6973961127388272, + "grad_norm": 0.17658278346061707, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 50700 + }, + { + "epoch": 0.6980838801083922, + "grad_norm": 0.16718824207782745, + "learning_rate": 0.0001, + "loss": 1.7936, + "step": 50750 + }, + { + "epoch": 0.6987716474779571, + "grad_norm": 0.16996939480304718, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 50800 + }, + { + "epoch": 0.6994594148475219, + "grad_norm": 0.15299175679683685, + "learning_rate": 0.0001, + "loss": 1.7919, + "step": 50850 + }, + { + "epoch": 0.7001471822170869, + "grad_norm": 0.1672915816307068, + "learning_rate": 0.0001, + "loss": 1.795, + "step": 50900 + }, + { + "epoch": 0.7008349495866518, + "grad_norm": 0.17287658154964447, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 50950 + }, + { + "epoch": 0.7015227169562167, + "grad_norm": 0.16447900235652924, + "learning_rate": 0.0001, + "loss": 1.7915, + "step": 51000 + }, + { + "epoch": 0.7022104843257817, + "grad_norm": 0.16016733646392822, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 51050 + }, + { + "epoch": 0.7028982516953466, + "grad_norm": 0.15329506993293762, + "learning_rate": 0.0001, + "loss": 1.7915, + "step": 51100 + }, + { + "epoch": 0.7035860190649115, + "grad_norm": 0.1695086658000946, + "learning_rate": 0.0001, + "loss": 1.7925, + "step": 51150 + }, + { + "epoch": 0.7042737864344764, + "grad_norm": 0.15667758882045746, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 51200 + }, + { + "epoch": 0.7049615538040414, + "grad_norm": 0.1636906862258911, + "learning_rate": 0.0001, + "loss": 1.7911, + "step": 51250 + }, + { + "epoch": 0.7056493211736062, + "grad_norm": 0.16701051592826843, + "learning_rate": 0.0001, + "loss": 1.7929, + "step": 51300 + }, + { + "epoch": 0.7063370885431711, + "grad_norm": 0.17164082825183868, + "learning_rate": 0.0001, + "loss": 1.7922, + "step": 51350 + }, + { + "epoch": 0.7070248559127361, + "grad_norm": 0.18162649869918823, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 51400 + }, + { + "epoch": 0.707712623282301, + "grad_norm": 0.1521824300289154, + "learning_rate": 0.0001, + "loss": 1.7937, + "step": 51450 + }, + { + "epoch": 0.7084003906518659, + "grad_norm": 0.168669655919075, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 51500 + }, + { + "epoch": 0.7090881580214309, + "grad_norm": 0.17441484332084656, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 51550 + }, + { + "epoch": 0.7097759253909958, + "grad_norm": 0.1877586394548416, + "learning_rate": 0.0001, + "loss": 1.7927, + "step": 51600 + }, + { + "epoch": 0.7104636927605607, + "grad_norm": 0.16195935010910034, + "learning_rate": 0.0001, + "loss": 1.7962, + "step": 51650 + }, + { + "epoch": 0.7111514601301255, + "grad_norm": 0.16282670199871063, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 51700 + }, + { + "epoch": 0.7118392274996905, + "grad_norm": 0.15550565719604492, + "learning_rate": 0.0001, + "loss": 1.793, + "step": 51750 + }, + { + "epoch": 0.7125269948692554, + "grad_norm": 0.16963760554790497, + "learning_rate": 0.0001, + "loss": 1.7921, + "step": 51800 + }, + { + "epoch": 0.7132147622388203, + "grad_norm": 0.1632436364889145, + "learning_rate": 0.0001, + "loss": 1.7943, + "step": 51850 + }, + { + "epoch": 0.7139025296083853, + "grad_norm": 0.15533354878425598, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 51900 + }, + { + "epoch": 0.7145902969779502, + "grad_norm": 0.15280106663703918, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 51950 + }, + { + "epoch": 0.7152780643475151, + "grad_norm": 0.1561509668827057, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 52000 + }, + { + "epoch": 0.7159658317170801, + "grad_norm": 0.1560848206281662, + "learning_rate": 0.0001, + "loss": 1.7927, + "step": 52050 + }, + { + "epoch": 0.7166535990866449, + "grad_norm": 0.1706065684556961, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 52100 + }, + { + "epoch": 0.7173413664562098, + "grad_norm": 0.16388699412345886, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 52150 + }, + { + "epoch": 0.7180291338257748, + "grad_norm": 0.16502410173416138, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 52200 + }, + { + "epoch": 0.7187169011953397, + "grad_norm": 0.17022061347961426, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 52250 + }, + { + "epoch": 0.7194046685649046, + "grad_norm": 0.17903153598308563, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 52300 + }, + { + "epoch": 0.7200924359344695, + "grad_norm": 0.15719935297966003, + "learning_rate": 0.0001, + "loss": 1.7934, + "step": 52350 + }, + { + "epoch": 0.7207802033040345, + "grad_norm": 0.16321443021297455, + "learning_rate": 0.0001, + "loss": 1.7914, + "step": 52400 + }, + { + "epoch": 0.7214679706735994, + "grad_norm": 0.1724744439125061, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 52450 + }, + { + "epoch": 0.7221557380431642, + "grad_norm": 0.16059927642345428, + "learning_rate": 0.0001, + "loss": 1.7929, + "step": 52500 + }, + { + "epoch": 0.7228435054127292, + "grad_norm": 0.17748789489269257, + "learning_rate": 0.0001, + "loss": 1.7913, + "step": 52550 + }, + { + "epoch": 0.7235312727822941, + "grad_norm": 0.16190293431282043, + "learning_rate": 0.0001, + "loss": 1.7956, + "step": 52600 + }, + { + "epoch": 0.724219040151859, + "grad_norm": 0.1841738224029541, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 52650 + }, + { + "epoch": 0.724906807521424, + "grad_norm": 0.15971702337265015, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 52700 + }, + { + "epoch": 0.7255945748909889, + "grad_norm": 0.15894858539104462, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 52750 + }, + { + "epoch": 0.7262823422605538, + "grad_norm": 0.15041370689868927, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 52800 + }, + { + "epoch": 0.7269701096301187, + "grad_norm": 0.15757033228874207, + "learning_rate": 0.0001, + "loss": 1.7881, + "step": 52850 + }, + { + "epoch": 0.7276578769996837, + "grad_norm": 0.16385579109191895, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 52900 + }, + { + "epoch": 0.7283456443692485, + "grad_norm": 0.15629428625106812, + "learning_rate": 0.0001, + "loss": 1.7932, + "step": 52950 + }, + { + "epoch": 0.7290334117388134, + "grad_norm": 0.1573755145072937, + "learning_rate": 0.0001, + "loss": 1.7926, + "step": 53000 + }, + { + "epoch": 0.7297211791083784, + "grad_norm": 0.15800927579402924, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 53050 + }, + { + "epoch": 0.7304089464779433, + "grad_norm": 0.16997511684894562, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 53100 + }, + { + "epoch": 0.7310967138475082, + "grad_norm": 0.1457889825105667, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 53150 + }, + { + "epoch": 0.7317844812170732, + "grad_norm": 0.15250973403453827, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 53200 + }, + { + "epoch": 0.7324722485866381, + "grad_norm": 0.1561204344034195, + "learning_rate": 0.0001, + "loss": 1.7915, + "step": 53250 + }, + { + "epoch": 0.733160015956203, + "grad_norm": 0.17602892220020294, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 53300 + }, + { + "epoch": 0.7338477833257679, + "grad_norm": 0.15751750767230988, + "learning_rate": 0.0001, + "loss": 1.7924, + "step": 53350 + }, + { + "epoch": 0.7345355506953328, + "grad_norm": 0.1686706244945526, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 53400 + }, + { + "epoch": 0.7352233180648977, + "grad_norm": 0.15886232256889343, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 53450 + }, + { + "epoch": 0.7359110854344626, + "grad_norm": 0.1548243910074234, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 53500 + }, + { + "epoch": 0.7365988528040276, + "grad_norm": 0.16160327196121216, + "learning_rate": 0.0001, + "loss": 1.792, + "step": 53550 + }, + { + "epoch": 0.7372866201735925, + "grad_norm": 0.1588127613067627, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 53600 + }, + { + "epoch": 0.7379743875431574, + "grad_norm": 0.1562395691871643, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 53650 + }, + { + "epoch": 0.7386621549127224, + "grad_norm": 0.1463010013103485, + "learning_rate": 0.0001, + "loss": 1.7903, + "step": 53700 + }, + { + "epoch": 0.7393499222822872, + "grad_norm": 0.1688784807920456, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 53750 + }, + { + "epoch": 0.7400376896518521, + "grad_norm": 0.16111525893211365, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 53800 + }, + { + "epoch": 0.7407254570214171, + "grad_norm": 0.15798266232013702, + "learning_rate": 0.0001, + "loss": 1.7901, + "step": 53850 + }, + { + "epoch": 0.741413224390982, + "grad_norm": 0.1544068306684494, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 53900 + }, + { + "epoch": 0.7421009917605469, + "grad_norm": 0.16747315227985382, + "learning_rate": 0.0001, + "loss": 1.7923, + "step": 53950 + }, + { + "epoch": 0.7427887591301119, + "grad_norm": 0.20277969539165497, + "learning_rate": 0.0001, + "loss": 1.7932, + "step": 54000 + }, + { + "epoch": 0.7434765264996768, + "grad_norm": 0.1490595042705536, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 54050 + }, + { + "epoch": 0.7441642938692417, + "grad_norm": 0.15864817798137665, + "learning_rate": 0.0001, + "loss": 1.7838, + "step": 54100 + }, + { + "epoch": 0.7448520612388065, + "grad_norm": 0.17168639600276947, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 54150 + }, + { + "epoch": 0.7455398286083715, + "grad_norm": 0.1612584888935089, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 54200 + }, + { + "epoch": 0.7462275959779364, + "grad_norm": 0.16638678312301636, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 54250 + }, + { + "epoch": 0.7469153633475013, + "grad_norm": 0.16757947206497192, + "learning_rate": 0.0001, + "loss": 1.7899, + "step": 54300 + }, + { + "epoch": 0.7476031307170663, + "grad_norm": 0.17740657925605774, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 54350 + }, + { + "epoch": 0.7482908980866312, + "grad_norm": 0.15608841180801392, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 54400 + }, + { + "epoch": 0.7489786654561961, + "grad_norm": 0.1486404538154602, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 54450 + }, + { + "epoch": 0.7496664328257611, + "grad_norm": 0.17158234119415283, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 54500 + }, + { + "epoch": 0.750354200195326, + "grad_norm": 0.1535918265581131, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 54550 + }, + { + "epoch": 0.7510419675648908, + "grad_norm": 0.17464052140712738, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 54600 + }, + { + "epoch": 0.7517297349344557, + "grad_norm": 0.15320485830307007, + "learning_rate": 0.0001, + "loss": 1.7909, + "step": 54650 + }, + { + "epoch": 0.7524175023040207, + "grad_norm": 0.16376914083957672, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 54700 + }, + { + "epoch": 0.7531052696735856, + "grad_norm": 0.17047230899333954, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 54750 + }, + { + "epoch": 0.7537930370431505, + "grad_norm": 0.1580251306295395, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 54800 + }, + { + "epoch": 0.7544808044127155, + "grad_norm": 0.16085964441299438, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 54850 + }, + { + "epoch": 0.7551685717822804, + "grad_norm": 0.1530008316040039, + "learning_rate": 0.0001, + "loss": 1.7909, + "step": 54900 + }, + { + "epoch": 0.7558563391518452, + "grad_norm": 0.18514500558376312, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 54950 + }, + { + "epoch": 0.7565441065214102, + "grad_norm": 0.16724203526973724, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 55000 + }, + { + "epoch": 0.7572318738909751, + "grad_norm": 0.17008638381958008, + "learning_rate": 0.0001, + "loss": 1.7909, + "step": 55050 + }, + { + "epoch": 0.75791964126054, + "grad_norm": 0.15402346849441528, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 55100 + }, + { + "epoch": 0.758607408630105, + "grad_norm": 0.1750432401895523, + "learning_rate": 0.0001, + "loss": 1.7898, + "step": 55150 + }, + { + "epoch": 0.7592951759996699, + "grad_norm": 0.18680183589458466, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 55200 + }, + { + "epoch": 0.7599829433692348, + "grad_norm": 0.16581743955612183, + "learning_rate": 0.0001, + "loss": 1.7902, + "step": 55250 + }, + { + "epoch": 0.7606707107387997, + "grad_norm": 0.16159740090370178, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 55300 + }, + { + "epoch": 0.7613584781083647, + "grad_norm": 0.14381587505340576, + "learning_rate": 0.0001, + "loss": 1.7918, + "step": 55350 + }, + { + "epoch": 0.7620462454779295, + "grad_norm": 0.15160152316093445, + "learning_rate": 0.0001, + "loss": 1.789, + "step": 55400 + }, + { + "epoch": 0.7627340128474944, + "grad_norm": 0.16748382151126862, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 55450 + }, + { + "epoch": 0.7634217802170594, + "grad_norm": 0.15434932708740234, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 55500 + }, + { + "epoch": 0.7641095475866243, + "grad_norm": 0.16281753778457642, + "learning_rate": 0.0001, + "loss": 1.7909, + "step": 55550 + }, + { + "epoch": 0.7647973149561892, + "grad_norm": 0.1581009328365326, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 55600 + }, + { + "epoch": 0.7654850823257542, + "grad_norm": 0.16244924068450928, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 55650 + }, + { + "epoch": 0.7661728496953191, + "grad_norm": 0.1727581024169922, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 55700 + }, + { + "epoch": 0.766860617064884, + "grad_norm": 0.15804524719715118, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 55750 + }, + { + "epoch": 0.7675483844344488, + "grad_norm": 0.16742980480194092, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 55800 + }, + { + "epoch": 0.7682361518040138, + "grad_norm": 0.15518859028816223, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 55850 + }, + { + "epoch": 0.7689239191735787, + "grad_norm": 0.14549891650676727, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 55900 + }, + { + "epoch": 0.7696116865431436, + "grad_norm": 0.15677410364151, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 55950 + }, + { + "epoch": 0.7702994539127086, + "grad_norm": 0.1627907007932663, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 56000 + }, + { + "epoch": 0.7709872212822735, + "grad_norm": 0.17789112031459808, + "learning_rate": 0.0001, + "loss": 1.7917, + "step": 56050 + }, + { + "epoch": 0.7716749886518384, + "grad_norm": 0.17732852697372437, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 56100 + }, + { + "epoch": 0.7723627560214034, + "grad_norm": 0.16175003349781036, + "learning_rate": 0.0001, + "loss": 1.7847, + "step": 56150 + }, + { + "epoch": 0.7730505233909682, + "grad_norm": 0.16384829580783844, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 56200 + }, + { + "epoch": 0.7737382907605331, + "grad_norm": 0.18334250152111053, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 56250 + }, + { + "epoch": 0.7744260581300981, + "grad_norm": 0.16775920987129211, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 56300 + }, + { + "epoch": 0.775113825499663, + "grad_norm": 0.15945740044116974, + "learning_rate": 0.0001, + "loss": 1.7867, + "step": 56350 + }, + { + "epoch": 0.7758015928692279, + "grad_norm": 0.16826015710830688, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 56400 + }, + { + "epoch": 0.7764893602387928, + "grad_norm": 0.16733418405056, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 56450 + }, + { + "epoch": 0.7771771276083578, + "grad_norm": 0.17716175317764282, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 56500 + }, + { + "epoch": 0.7778648949779227, + "grad_norm": 0.15145139396190643, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 56550 + }, + { + "epoch": 0.7785526623474875, + "grad_norm": 0.1650010645389557, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 56600 + }, + { + "epoch": 0.7792404297170525, + "grad_norm": 0.15676827728748322, + "learning_rate": 0.0001, + "loss": 1.7863, + "step": 56650 + }, + { + "epoch": 0.7799281970866174, + "grad_norm": 0.15251976251602173, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 56700 + }, + { + "epoch": 0.7806159644561823, + "grad_norm": 0.16107071936130524, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 56750 + }, + { + "epoch": 0.7813037318257473, + "grad_norm": 0.16008871793746948, + "learning_rate": 0.0001, + "loss": 1.7879, + "step": 56800 + }, + { + "epoch": 0.7819914991953122, + "grad_norm": 0.1748703122138977, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 56850 + }, + { + "epoch": 0.7826792665648771, + "grad_norm": 0.1847066432237625, + "learning_rate": 0.0001, + "loss": 1.7878, + "step": 56900 + }, + { + "epoch": 0.7833670339344421, + "grad_norm": 0.14105017483234406, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 56950 + }, + { + "epoch": 0.784054801304007, + "grad_norm": 0.1463741511106491, + "learning_rate": 0.0001, + "loss": 1.784, + "step": 57000 + }, + { + "epoch": 0.7847425686735718, + "grad_norm": 0.15982814133167267, + "learning_rate": 0.0001, + "loss": 1.7904, + "step": 57050 + }, + { + "epoch": 0.7854303360431367, + "grad_norm": 0.15282031893730164, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 57100 + }, + { + "epoch": 0.7861181034127017, + "grad_norm": 0.16466231644153595, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 57150 + }, + { + "epoch": 0.7868058707822666, + "grad_norm": 0.16176077723503113, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 57200 + }, + { + "epoch": 0.7874936381518315, + "grad_norm": 0.16768991947174072, + "learning_rate": 0.0001, + "loss": 1.791, + "step": 57250 + }, + { + "epoch": 0.7881814055213965, + "grad_norm": 0.15378397703170776, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 57300 + }, + { + "epoch": 0.7888691728909614, + "grad_norm": 0.16845440864562988, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 57350 + }, + { + "epoch": 0.7895569402605263, + "grad_norm": 0.16859596967697144, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 57400 + }, + { + "epoch": 0.7902447076300912, + "grad_norm": 0.17096339166164398, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 57450 + }, + { + "epoch": 0.7909324749996561, + "grad_norm": 0.19546246528625488, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 57500 + }, + { + "epoch": 0.791620242369221, + "grad_norm": 0.15690521895885468, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 57550 + }, + { + "epoch": 0.7923080097387859, + "grad_norm": 0.15288680791854858, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 57600 + }, + { + "epoch": 0.7929957771083509, + "grad_norm": 0.15947267413139343, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 57650 + }, + { + "epoch": 0.7936835444779158, + "grad_norm": 0.1813030242919922, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 57700 + }, + { + "epoch": 0.7943713118474807, + "grad_norm": 0.16709686815738678, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 57750 + }, + { + "epoch": 0.7950590792170457, + "grad_norm": 0.19110731780529022, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 57800 + }, + { + "epoch": 0.7957468465866105, + "grad_norm": 0.15795393288135529, + "learning_rate": 0.0001, + "loss": 1.7908, + "step": 57850 + }, + { + "epoch": 0.7964346139561754, + "grad_norm": 0.14493565261363983, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 57900 + }, + { + "epoch": 0.7971223813257404, + "grad_norm": 0.14182139933109283, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 57950 + }, + { + "epoch": 0.7978101486953053, + "grad_norm": 0.14074084162712097, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 58000 + }, + { + "epoch": 0.7984979160648702, + "grad_norm": 0.1791408807039261, + "learning_rate": 0.0001, + "loss": 1.7889, + "step": 58050 + }, + { + "epoch": 0.7991856834344352, + "grad_norm": 0.17944924533367157, + "learning_rate": 0.0001, + "loss": 1.7884, + "step": 58100 + }, + { + "epoch": 0.7998734508040001, + "grad_norm": 0.19336557388305664, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 58150 + }, + { + "epoch": 0.800561218173565, + "grad_norm": 0.14197582006454468, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 58200 + }, + { + "epoch": 0.8012489855431298, + "grad_norm": 0.17862093448638916, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 58250 + }, + { + "epoch": 0.8019367529126948, + "grad_norm": 0.15174590051174164, + "learning_rate": 0.0001, + "loss": 1.7883, + "step": 58300 + }, + { + "epoch": 0.8026245202822597, + "grad_norm": 0.15902046859264374, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 58350 + }, + { + "epoch": 0.8033122876518246, + "grad_norm": 0.1593545824289322, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 58400 + }, + { + "epoch": 0.8040000550213896, + "grad_norm": 0.16780108213424683, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 58450 + }, + { + "epoch": 0.8046878223909545, + "grad_norm": 0.16704651713371277, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 58500 + }, + { + "epoch": 0.8053755897605194, + "grad_norm": 0.20908869802951813, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 58550 + }, + { + "epoch": 0.8060633571300844, + "grad_norm": 0.1484072208404541, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 58600 + }, + { + "epoch": 0.8067511244996493, + "grad_norm": 0.16092757880687714, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 58650 + }, + { + "epoch": 0.8074388918692141, + "grad_norm": 0.15798570215702057, + "learning_rate": 0.0001, + "loss": 1.7897, + "step": 58700 + }, + { + "epoch": 0.808126659238779, + "grad_norm": 0.15388993918895721, + "learning_rate": 0.0001, + "loss": 1.7874, + "step": 58750 + }, + { + "epoch": 0.808814426608344, + "grad_norm": 0.16136646270751953, + "learning_rate": 0.0001, + "loss": 1.7866, + "step": 58800 + }, + { + "epoch": 0.8095021939779089, + "grad_norm": 0.20280751585960388, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 58850 + }, + { + "epoch": 0.8101899613474738, + "grad_norm": 0.16941416263580322, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 58900 + }, + { + "epoch": 0.8108777287170388, + "grad_norm": 0.1597299724817276, + "learning_rate": 0.0001, + "loss": 1.7823, + "step": 58950 + }, + { + "epoch": 0.8115654960866037, + "grad_norm": 0.1581617146730423, + "learning_rate": 0.0001, + "loss": 1.7902, + "step": 59000 + }, + { + "epoch": 0.8122532634561686, + "grad_norm": 0.17084243893623352, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 59050 + }, + { + "epoch": 0.8129410308257335, + "grad_norm": 0.16124476492404938, + "learning_rate": 0.0001, + "loss": 1.7894, + "step": 59100 + }, + { + "epoch": 0.8136287981952984, + "grad_norm": 0.15042969584465027, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 59150 + }, + { + "epoch": 0.8143165655648633, + "grad_norm": 0.14492358267307281, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 59200 + }, + { + "epoch": 0.8150043329344283, + "grad_norm": 0.17020314931869507, + "learning_rate": 0.0001, + "loss": 1.7859, + "step": 59250 + }, + { + "epoch": 0.8156921003039932, + "grad_norm": 0.1630934178829193, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 59300 + }, + { + "epoch": 0.8163798676735581, + "grad_norm": 0.17032647132873535, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 59350 + }, + { + "epoch": 0.817067635043123, + "grad_norm": 0.15546603500843048, + "learning_rate": 0.0001, + "loss": 1.7866, + "step": 59400 + }, + { + "epoch": 0.817755402412688, + "grad_norm": 0.1688961237668991, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 59450 + }, + { + "epoch": 0.8184431697822528, + "grad_norm": 0.15222899615764618, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 59500 + }, + { + "epoch": 0.8191309371518177, + "grad_norm": 0.15309302508831024, + "learning_rate": 0.0001, + "loss": 1.7847, + "step": 59550 + }, + { + "epoch": 0.8198187045213827, + "grad_norm": 0.1601337045431137, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 59600 + }, + { + "epoch": 0.8205064718909476, + "grad_norm": 0.14973758161067963, + "learning_rate": 0.0001, + "loss": 1.7893, + "step": 59650 + }, + { + "epoch": 0.8211942392605125, + "grad_norm": 0.17928583920001984, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 59700 + }, + { + "epoch": 0.8218820066300775, + "grad_norm": 0.1628539264202118, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 59750 + }, + { + "epoch": 0.8225697739996424, + "grad_norm": 0.1617124229669571, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 59800 + }, + { + "epoch": 0.8232575413692073, + "grad_norm": 0.16710211336612701, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 59850 + }, + { + "epoch": 0.8239453087387723, + "grad_norm": 0.18266211450099945, + "learning_rate": 0.0001, + "loss": 1.7882, + "step": 59900 + }, + { + "epoch": 0.8246330761083371, + "grad_norm": 0.15460216999053955, + "learning_rate": 0.0001, + "loss": 1.7856, + "step": 59950 + }, + { + "epoch": 0.825320843477902, + "grad_norm": 0.19238495826721191, + "learning_rate": 0.0001, + "loss": 1.7867, + "step": 60000 + }, + { + "epoch": 0.8260086108474669, + "grad_norm": 0.17882536351680756, + "learning_rate": 0.0001, + "loss": 1.79, + "step": 60050 + }, + { + "epoch": 0.8266963782170319, + "grad_norm": 0.17022471129894257, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 60100 + }, + { + "epoch": 0.8273841455865968, + "grad_norm": 0.16253788769245148, + "learning_rate": 0.0001, + "loss": 1.7842, + "step": 60150 + }, + { + "epoch": 0.8280719129561617, + "grad_norm": 0.1684889793395996, + "learning_rate": 0.0001, + "loss": 1.7871, + "step": 60200 + }, + { + "epoch": 0.8287596803257267, + "grad_norm": 0.1623234748840332, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 60250 + }, + { + "epoch": 0.8294474476952916, + "grad_norm": 0.14207519590854645, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 60300 + }, + { + "epoch": 0.8301352150648564, + "grad_norm": 0.15550558269023895, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 60350 + }, + { + "epoch": 0.8308229824344214, + "grad_norm": 0.16578029096126556, + "learning_rate": 0.0001, + "loss": 1.7804, + "step": 60400 + }, + { + "epoch": 0.8315107498039863, + "grad_norm": 0.16406333446502686, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 60450 + }, + { + "epoch": 0.8321985171735512, + "grad_norm": 0.1568935364484787, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 60500 + }, + { + "epoch": 0.8328862845431161, + "grad_norm": 0.17918673157691956, + "learning_rate": 0.0001, + "loss": 1.7877, + "step": 60550 + }, + { + "epoch": 0.8335740519126811, + "grad_norm": 0.14733350276947021, + "learning_rate": 0.0001, + "loss": 1.7821, + "step": 60600 + }, + { + "epoch": 0.834261819282246, + "grad_norm": 0.14916177093982697, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 60650 + }, + { + "epoch": 0.8349495866518108, + "grad_norm": 0.15052981674671173, + "learning_rate": 0.0001, + "loss": 1.7892, + "step": 60700 + }, + { + "epoch": 0.8356373540213758, + "grad_norm": 0.1831791251897812, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 60750 + }, + { + "epoch": 0.8363251213909407, + "grad_norm": 0.16115884482860565, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 60800 + }, + { + "epoch": 0.8370128887605056, + "grad_norm": 0.15721943974494934, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 60850 + }, + { + "epoch": 0.8377006561300706, + "grad_norm": 0.1528850942850113, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 60900 + }, + { + "epoch": 0.8383884234996355, + "grad_norm": 0.16134890913963318, + "learning_rate": 0.0001, + "loss": 1.7875, + "step": 60950 + }, + { + "epoch": 0.8390761908692004, + "grad_norm": 0.16336651146411896, + "learning_rate": 0.0001, + "loss": 1.7848, + "step": 61000 + }, + { + "epoch": 0.8397639582387654, + "grad_norm": 0.16578875482082367, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 61050 + }, + { + "epoch": 0.8404517256083303, + "grad_norm": 0.16235701739788055, + "learning_rate": 0.0001, + "loss": 1.7869, + "step": 61100 + }, + { + "epoch": 0.8411394929778951, + "grad_norm": 0.16650299727916718, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 61150 + }, + { + "epoch": 0.84182726034746, + "grad_norm": 0.148828387260437, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 61200 + }, + { + "epoch": 0.842515027717025, + "grad_norm": 0.1572546660900116, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 61250 + }, + { + "epoch": 0.8432027950865899, + "grad_norm": 0.15572214126586914, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 61300 + }, + { + "epoch": 0.8438905624561548, + "grad_norm": 0.18148384988307953, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 61350 + }, + { + "epoch": 0.8445783298257198, + "grad_norm": 0.16225239634513855, + "learning_rate": 0.0001, + "loss": 1.787, + "step": 61400 + }, + { + "epoch": 0.8452660971952847, + "grad_norm": 0.1546306014060974, + "learning_rate": 0.0001, + "loss": 1.7886, + "step": 61450 + }, + { + "epoch": 0.8459538645648496, + "grad_norm": 0.1589781790971756, + "learning_rate": 0.0001, + "loss": 1.7876, + "step": 61500 + }, + { + "epoch": 0.8466416319344146, + "grad_norm": 0.16938839852809906, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 61550 + }, + { + "epoch": 0.8473293993039794, + "grad_norm": 0.17635032534599304, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 61600 + }, + { + "epoch": 0.8480171666735443, + "grad_norm": 0.16436606645584106, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 61650 + }, + { + "epoch": 0.8487049340431092, + "grad_norm": 0.15410180389881134, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 61700 + }, + { + "epoch": 0.8493927014126742, + "grad_norm": 0.15711359679698944, + "learning_rate": 0.0001, + "loss": 1.7855, + "step": 61750 + }, + { + "epoch": 0.8500804687822391, + "grad_norm": 0.14257673919200897, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 61800 + }, + { + "epoch": 0.850768236151804, + "grad_norm": 0.1770082414150238, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 61850 + }, + { + "epoch": 0.851456003521369, + "grad_norm": 0.14938481152057648, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 61900 + }, + { + "epoch": 0.8521437708909338, + "grad_norm": 0.16232655942440033, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 61950 + }, + { + "epoch": 0.8528315382604987, + "grad_norm": 0.14662796258926392, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 62000 + }, + { + "epoch": 0.8535193056300637, + "grad_norm": 0.15960827469825745, + "learning_rate": 0.0001, + "loss": 1.7868, + "step": 62050 + }, + { + "epoch": 0.8542070729996286, + "grad_norm": 0.1585722714662552, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 62100 + }, + { + "epoch": 0.8548948403691935, + "grad_norm": 0.15847063064575195, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 62150 + }, + { + "epoch": 0.8555826077387585, + "grad_norm": 0.1581469178199768, + "learning_rate": 0.0001, + "loss": 1.7872, + "step": 62200 + }, + { + "epoch": 0.8562703751083234, + "grad_norm": 0.18087923526763916, + "learning_rate": 0.0001, + "loss": 1.7837, + "step": 62250 + }, + { + "epoch": 0.8569581424778883, + "grad_norm": 0.15878331661224365, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 62300 + }, + { + "epoch": 0.8576459098474531, + "grad_norm": 0.1652536690235138, + "learning_rate": 0.0001, + "loss": 1.7864, + "step": 62350 + }, + { + "epoch": 0.8583336772170181, + "grad_norm": 0.16467753052711487, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 62400 + }, + { + "epoch": 0.859021444586583, + "grad_norm": 0.17342518270015717, + "learning_rate": 0.0001, + "loss": 1.7853, + "step": 62450 + }, + { + "epoch": 0.8597092119561479, + "grad_norm": 0.15487852692604065, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 62500 + }, + { + "epoch": 0.8603969793257129, + "grad_norm": 0.16185085475444794, + "learning_rate": 0.0001, + "loss": 1.7891, + "step": 62550 + }, + { + "epoch": 0.8610847466952778, + "grad_norm": 0.18629157543182373, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 62600 + }, + { + "epoch": 0.8617725140648427, + "grad_norm": 0.20009976625442505, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 62650 + }, + { + "epoch": 0.8624602814344077, + "grad_norm": 0.16432398557662964, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 62700 + }, + { + "epoch": 0.8631480488039726, + "grad_norm": 0.16151119768619537, + "learning_rate": 0.0001, + "loss": 1.7838, + "step": 62750 + }, + { + "epoch": 0.8638358161735374, + "grad_norm": 0.16223236918449402, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 62800 + }, + { + "epoch": 0.8645235835431024, + "grad_norm": 0.15118102729320526, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 62850 + }, + { + "epoch": 0.8652113509126673, + "grad_norm": 0.15173585712909698, + "learning_rate": 0.0001, + "loss": 1.7858, + "step": 62900 + }, + { + "epoch": 0.8658991182822322, + "grad_norm": 0.1547808051109314, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 62950 + }, + { + "epoch": 0.8665868856517971, + "grad_norm": 0.1542670577764511, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 63000 + }, + { + "epoch": 0.8672746530213621, + "grad_norm": 0.16760842502117157, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 63050 + }, + { + "epoch": 0.867962420390927, + "grad_norm": 0.17703787982463837, + "learning_rate": 0.0001, + "loss": 1.788, + "step": 63100 + }, + { + "epoch": 0.8686501877604919, + "grad_norm": 0.1573743224143982, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 63150 + }, + { + "epoch": 0.8693379551300568, + "grad_norm": 0.1451522409915924, + "learning_rate": 0.0001, + "loss": 1.7854, + "step": 63200 + }, + { + "epoch": 0.8700257224996217, + "grad_norm": 0.17078782618045807, + "learning_rate": 0.0001, + "loss": 1.784, + "step": 63250 + }, + { + "epoch": 0.8707134898691866, + "grad_norm": 0.15471959114074707, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 63300 + }, + { + "epoch": 0.8714012572387516, + "grad_norm": 0.16724149882793427, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 63350 + }, + { + "epoch": 0.8720890246083165, + "grad_norm": 0.15160906314849854, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 63400 + }, + { + "epoch": 0.8727767919778814, + "grad_norm": 0.156820610165596, + "learning_rate": 0.0001, + "loss": 1.7856, + "step": 63450 + }, + { + "epoch": 0.8734645593474463, + "grad_norm": 0.16410048305988312, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 63500 + }, + { + "epoch": 0.8741523267170113, + "grad_norm": 0.16022023558616638, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 63550 + }, + { + "epoch": 0.8748400940865761, + "grad_norm": 0.1775195300579071, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 63600 + }, + { + "epoch": 0.875527861456141, + "grad_norm": 0.17621392011642456, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 63650 + }, + { + "epoch": 0.876215628825706, + "grad_norm": 0.17508172988891602, + "learning_rate": 0.0001, + "loss": 1.785, + "step": 63700 + }, + { + "epoch": 0.8769033961952709, + "grad_norm": 0.167220801115036, + "learning_rate": 0.0001, + "loss": 1.7838, + "step": 63750 + }, + { + "epoch": 0.8775911635648358, + "grad_norm": 0.22981862723827362, + "learning_rate": 0.0001, + "loss": 1.7885, + "step": 63800 + }, + { + "epoch": 0.8782789309344008, + "grad_norm": 0.17177161574363708, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 63850 + }, + { + "epoch": 0.8789666983039657, + "grad_norm": 0.16599243879318237, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 63900 + }, + { + "epoch": 0.8796544656735306, + "grad_norm": 0.17125064134597778, + "learning_rate": 0.0001, + "loss": 1.7839, + "step": 63950 + }, + { + "epoch": 0.8803422330430956, + "grad_norm": 0.17469707131385803, + "learning_rate": 0.0001, + "loss": 1.7797, + "step": 64000 + }, + { + "epoch": 0.8810300004126604, + "grad_norm": 0.16639864444732666, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 64050 + }, + { + "epoch": 0.8817177677822253, + "grad_norm": 0.16656282544136047, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 64100 + }, + { + "epoch": 0.8824055351517902, + "grad_norm": 0.14526651799678802, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 64150 + }, + { + "epoch": 0.8830933025213552, + "grad_norm": 0.1783958077430725, + "learning_rate": 0.0001, + "loss": 1.7828, + "step": 64200 + }, + { + "epoch": 0.8837810698909201, + "grad_norm": 0.16352634131908417, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 64250 + }, + { + "epoch": 0.884468837260485, + "grad_norm": 0.16130295395851135, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 64300 + }, + { + "epoch": 0.88515660463005, + "grad_norm": 0.16286851465702057, + "learning_rate": 0.0001, + "loss": 1.7866, + "step": 64350 + }, + { + "epoch": 0.8858443719996149, + "grad_norm": 0.16668406128883362, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 64400 + }, + { + "epoch": 0.8865321393691797, + "grad_norm": 0.16575850546360016, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 64450 + }, + { + "epoch": 0.8872199067387447, + "grad_norm": 0.16535095870494843, + "learning_rate": 0.0001, + "loss": 1.7795, + "step": 64500 + }, + { + "epoch": 0.8879076741083096, + "grad_norm": 0.14137853682041168, + "learning_rate": 0.0001, + "loss": 1.7854, + "step": 64550 + }, + { + "epoch": 0.8885954414778745, + "grad_norm": 0.14880156517028809, + "learning_rate": 0.0001, + "loss": 1.7862, + "step": 64600 + }, + { + "epoch": 0.8892832088474394, + "grad_norm": 0.17448197305202484, + "learning_rate": 0.0001, + "loss": 1.7847, + "step": 64650 + }, + { + "epoch": 0.8899709762170044, + "grad_norm": 0.1944260448217392, + "learning_rate": 0.0001, + "loss": 1.786, + "step": 64700 + }, + { + "epoch": 0.8906587435865693, + "grad_norm": 0.1693488508462906, + "learning_rate": 0.0001, + "loss": 1.7857, + "step": 64750 + }, + { + "epoch": 0.8913465109561342, + "grad_norm": 0.16250942647457123, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 64800 + }, + { + "epoch": 0.8920342783256991, + "grad_norm": 0.1573057919740677, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 64850 + }, + { + "epoch": 0.892722045695264, + "grad_norm": 0.19034920632839203, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 64900 + }, + { + "epoch": 0.8934098130648289, + "grad_norm": 0.13963682949543, + "learning_rate": 0.0001, + "loss": 1.7887, + "step": 64950 + }, + { + "epoch": 0.8940975804343939, + "grad_norm": 0.25064077973365784, + "learning_rate": 0.0001, + "loss": 1.7873, + "step": 65000 + }, + { + "epoch": 0.8947853478039588, + "grad_norm": 0.17574715614318848, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 65050 + }, + { + "epoch": 0.8954731151735237, + "grad_norm": 0.156754732131958, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 65100 + }, + { + "epoch": 0.8961608825430887, + "grad_norm": 0.17132636904716492, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 65150 + }, + { + "epoch": 0.8968486499126536, + "grad_norm": 0.15248049795627594, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 65200 + }, + { + "epoch": 0.8975364172822184, + "grad_norm": 0.1603154093027115, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 65250 + }, + { + "epoch": 0.8982241846517833, + "grad_norm": 0.14862816035747528, + "learning_rate": 0.0001, + "loss": 1.7823, + "step": 65300 + }, + { + "epoch": 0.8989119520213483, + "grad_norm": 0.17050820589065552, + "learning_rate": 0.0001, + "loss": 1.7856, + "step": 65350 + }, + { + "epoch": 0.8995997193909132, + "grad_norm": 0.16287332773208618, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 65400 + }, + { + "epoch": 0.9002874867604781, + "grad_norm": 0.15486200153827667, + "learning_rate": 0.0001, + "loss": 1.7804, + "step": 65450 + }, + { + "epoch": 0.9009752541300431, + "grad_norm": 0.16483095288276672, + "learning_rate": 0.0001, + "loss": 1.7845, + "step": 65500 + }, + { + "epoch": 0.901663021499608, + "grad_norm": 0.15963926911354065, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 65550 + }, + { + "epoch": 0.9023507888691729, + "grad_norm": 0.14927932620048523, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 65600 + }, + { + "epoch": 0.9030385562387379, + "grad_norm": 0.15622937679290771, + "learning_rate": 0.0001, + "loss": 1.7841, + "step": 65650 + }, + { + "epoch": 0.9037263236083027, + "grad_norm": 0.14870509505271912, + "learning_rate": 0.0001, + "loss": 1.7865, + "step": 65700 + }, + { + "epoch": 0.9044140909778676, + "grad_norm": 0.16585543751716614, + "learning_rate": 0.0001, + "loss": 1.7803, + "step": 65750 + }, + { + "epoch": 0.9051018583474326, + "grad_norm": 0.16925722360610962, + "learning_rate": 0.0001, + "loss": 1.7905, + "step": 65800 + }, + { + "epoch": 0.9057896257169975, + "grad_norm": 0.16086918115615845, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 65850 + }, + { + "epoch": 0.9064773930865624, + "grad_norm": 0.17064189910888672, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 65900 + }, + { + "epoch": 0.9071651604561273, + "grad_norm": 0.1507936716079712, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 65950 + }, + { + "epoch": 0.9078529278256923, + "grad_norm": 0.16139142215251923, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 66000 + }, + { + "epoch": 0.9085406951952572, + "grad_norm": 0.14373824000358582, + "learning_rate": 0.0001, + "loss": 1.7834, + "step": 66050 + }, + { + "epoch": 0.909228462564822, + "grad_norm": 0.14268267154693604, + "learning_rate": 0.0001, + "loss": 1.7832, + "step": 66100 + }, + { + "epoch": 0.909916229934387, + "grad_norm": 0.14548690617084503, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 66150 + }, + { + "epoch": 0.9106039973039519, + "grad_norm": 0.1726326048374176, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 66200 + }, + { + "epoch": 0.9112917646735168, + "grad_norm": 0.1607373058795929, + "learning_rate": 0.0001, + "loss": 1.7809, + "step": 66250 + }, + { + "epoch": 0.9119795320430818, + "grad_norm": 0.14730975031852722, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 66300 + }, + { + "epoch": 0.9126672994126467, + "grad_norm": 0.1616540104150772, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 66350 + }, + { + "epoch": 0.9133550667822116, + "grad_norm": 0.16029463708400726, + "learning_rate": 0.0001, + "loss": 1.7828, + "step": 66400 + }, + { + "epoch": 0.9140428341517765, + "grad_norm": 0.15002845227718353, + "learning_rate": 0.0001, + "loss": 1.7812, + "step": 66450 + }, + { + "epoch": 0.9147306015213414, + "grad_norm": 0.14482907950878143, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 66500 + }, + { + "epoch": 0.9154183688909063, + "grad_norm": 0.17749476432800293, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 66550 + }, + { + "epoch": 0.9161061362604712, + "grad_norm": 0.15776415169239044, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 66600 + }, + { + "epoch": 0.9167939036300362, + "grad_norm": 0.149980366230011, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 66650 + }, + { + "epoch": 0.9174816709996011, + "grad_norm": 0.16899780929088593, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 66700 + }, + { + "epoch": 0.918169438369166, + "grad_norm": 0.17424631118774414, + "learning_rate": 0.0001, + "loss": 1.7781, + "step": 66750 + }, + { + "epoch": 0.918857205738731, + "grad_norm": 0.1580991894006729, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 66800 + }, + { + "epoch": 0.9195449731082959, + "grad_norm": 0.16126061975955963, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 66850 + }, + { + "epoch": 0.9202327404778607, + "grad_norm": 0.15646252036094666, + "learning_rate": 0.0001, + "loss": 1.7828, + "step": 66900 + }, + { + "epoch": 0.9209205078474257, + "grad_norm": 0.17129796743392944, + "learning_rate": 0.0001, + "loss": 1.7844, + "step": 66950 + }, + { + "epoch": 0.9216082752169906, + "grad_norm": 0.1756673902273178, + "learning_rate": 0.0001, + "loss": 1.7839, + "step": 67000 + }, + { + "epoch": 0.9222960425865555, + "grad_norm": 0.15259510278701782, + "learning_rate": 0.0001, + "loss": 1.7795, + "step": 67050 + }, + { + "epoch": 0.9229838099561204, + "grad_norm": 0.1639316827058792, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 67100 + }, + { + "epoch": 0.9236715773256854, + "grad_norm": 0.17190176248550415, + "learning_rate": 0.0001, + "loss": 1.78, + "step": 67150 + }, + { + "epoch": 0.9243593446952503, + "grad_norm": 0.16864174604415894, + "learning_rate": 0.0001, + "loss": 1.7852, + "step": 67200 + }, + { + "epoch": 0.9250471120648152, + "grad_norm": 0.15548075735569, + "learning_rate": 0.0001, + "loss": 1.7828, + "step": 67250 + }, + { + "epoch": 0.9257348794343802, + "grad_norm": 0.16301994025707245, + "learning_rate": 0.0001, + "loss": 1.7846, + "step": 67300 + }, + { + "epoch": 0.926422646803945, + "grad_norm": 0.1735038459300995, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 67350 + }, + { + "epoch": 0.9271104141735099, + "grad_norm": 0.1380920112133026, + "learning_rate": 0.0001, + "loss": 1.7806, + "step": 67400 + }, + { + "epoch": 0.9277981815430749, + "grad_norm": 0.15920446813106537, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 67450 + }, + { + "epoch": 0.9284859489126398, + "grad_norm": 0.17028312385082245, + "learning_rate": 0.0001, + "loss": 1.7888, + "step": 67500 + }, + { + "epoch": 0.9291737162822047, + "grad_norm": 0.1769266575574875, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 67550 + }, + { + "epoch": 0.9298614836517696, + "grad_norm": 0.1450556516647339, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 67600 + }, + { + "epoch": 0.9305492510213346, + "grad_norm": 0.16302357614040375, + "learning_rate": 0.0001, + "loss": 1.7813, + "step": 67650 + }, + { + "epoch": 0.9312370183908995, + "grad_norm": 0.1574389934539795, + "learning_rate": 0.0001, + "loss": 1.7776, + "step": 67700 + }, + { + "epoch": 0.9319247857604643, + "grad_norm": 0.14627063274383545, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 67750 + }, + { + "epoch": 0.9326125531300293, + "grad_norm": 0.18861928582191467, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 67800 + }, + { + "epoch": 0.9333003204995942, + "grad_norm": 0.1549026519060135, + "learning_rate": 0.0001, + "loss": 1.7787, + "step": 67850 + }, + { + "epoch": 0.9339880878691591, + "grad_norm": 0.1620372235774994, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 67900 + }, + { + "epoch": 0.9346758552387241, + "grad_norm": 0.15894797444343567, + "learning_rate": 0.0001, + "loss": 1.7818, + "step": 67950 + }, + { + "epoch": 0.935363622608289, + "grad_norm": 0.19588086009025574, + "learning_rate": 0.0001, + "loss": 1.7835, + "step": 68000 + }, + { + "epoch": 0.9360513899778539, + "grad_norm": 0.1861431747674942, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 68050 + }, + { + "epoch": 0.9367391573474189, + "grad_norm": 0.16720125079154968, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 68100 + }, + { + "epoch": 0.9374269247169837, + "grad_norm": 0.1603463739156723, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 68150 + }, + { + "epoch": 0.9381146920865486, + "grad_norm": 0.14092972874641418, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 68200 + }, + { + "epoch": 0.9388024594561135, + "grad_norm": 0.1622365266084671, + "learning_rate": 0.0001, + "loss": 1.7779, + "step": 68250 + }, + { + "epoch": 0.9394902268256785, + "grad_norm": 0.16566450893878937, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 68300 + }, + { + "epoch": 0.9401779941952434, + "grad_norm": 0.14181503653526306, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 68350 + }, + { + "epoch": 0.9408657615648083, + "grad_norm": 0.16675251722335815, + "learning_rate": 0.0001, + "loss": 1.7796, + "step": 68400 + }, + { + "epoch": 0.9415535289343733, + "grad_norm": 0.15481418371200562, + "learning_rate": 0.0001, + "loss": 1.7797, + "step": 68450 + }, + { + "epoch": 0.9422412963039382, + "grad_norm": 0.16480682790279388, + "learning_rate": 0.0001, + "loss": 1.7767, + "step": 68500 + }, + { + "epoch": 0.942929063673503, + "grad_norm": 0.13726095855236053, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 68550 + }, + { + "epoch": 0.943616831043068, + "grad_norm": 0.1498117446899414, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 68600 + }, + { + "epoch": 0.9443045984126329, + "grad_norm": 0.15102407336235046, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 68650 + }, + { + "epoch": 0.9449923657821978, + "grad_norm": 0.1596510410308838, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 68700 + }, + { + "epoch": 0.9456801331517628, + "grad_norm": 0.15061867237091064, + "learning_rate": 0.0001, + "loss": 1.7781, + "step": 68750 + }, + { + "epoch": 0.9463679005213277, + "grad_norm": 0.18302445113658905, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 68800 + }, + { + "epoch": 0.9470556678908926, + "grad_norm": 0.1563147008419037, + "learning_rate": 0.0001, + "loss": 1.7807, + "step": 68850 + }, + { + "epoch": 0.9477434352604575, + "grad_norm": 0.1559109389781952, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 68900 + }, + { + "epoch": 0.9484312026300225, + "grad_norm": 0.1892656683921814, + "learning_rate": 0.0001, + "loss": 1.7815, + "step": 68950 + }, + { + "epoch": 0.9491189699995873, + "grad_norm": 0.16753901541233063, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 69000 + }, + { + "epoch": 0.9498067373691522, + "grad_norm": 0.16571739315986633, + "learning_rate": 0.0001, + "loss": 1.781, + "step": 69050 + }, + { + "epoch": 0.9504945047387172, + "grad_norm": 0.15618735551834106, + "learning_rate": 0.0001, + "loss": 1.7801, + "step": 69100 + }, + { + "epoch": 0.9511822721082821, + "grad_norm": 0.15602505207061768, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 69150 + }, + { + "epoch": 0.951870039477847, + "grad_norm": 0.1441372036933899, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 69200 + }, + { + "epoch": 0.952557806847412, + "grad_norm": 0.16956308484077454, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 69250 + }, + { + "epoch": 0.9532455742169769, + "grad_norm": 0.1570560336112976, + "learning_rate": 0.0001, + "loss": 1.7829, + "step": 69300 + }, + { + "epoch": 0.9539333415865417, + "grad_norm": 0.13851186633110046, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 69350 + }, + { + "epoch": 0.9546211089561066, + "grad_norm": 0.18309037387371063, + "learning_rate": 0.0001, + "loss": 1.7772, + "step": 69400 + }, + { + "epoch": 0.9553088763256716, + "grad_norm": 1.6850249767303467, + "learning_rate": 0.0001, + "loss": 1.7781, + "step": 69450 + }, + { + "epoch": 0.9559966436952365, + "grad_norm": 0.1578509509563446, + "learning_rate": 0.0001, + "loss": 1.7843, + "step": 69500 + }, + { + "epoch": 0.9566844110648014, + "grad_norm": 0.15330944955348969, + "learning_rate": 0.0001, + "loss": 1.7785, + "step": 69550 + }, + { + "epoch": 0.9573721784343664, + "grad_norm": 0.15504170954227448, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 69600 + }, + { + "epoch": 0.9580599458039313, + "grad_norm": 0.17802022397518158, + "learning_rate": 0.0001, + "loss": 1.7794, + "step": 69650 + }, + { + "epoch": 0.9587477131734962, + "grad_norm": 0.18508057296276093, + "learning_rate": 0.0001, + "loss": 1.7827, + "step": 69700 + }, + { + "epoch": 0.9594354805430612, + "grad_norm": 0.19704073667526245, + "learning_rate": 0.0001, + "loss": 1.7809, + "step": 69750 + }, + { + "epoch": 0.960123247912626, + "grad_norm": 0.17070503532886505, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 69800 + }, + { + "epoch": 0.9608110152821909, + "grad_norm": 0.1832980215549469, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 69850 + }, + { + "epoch": 0.9614987826517559, + "grad_norm": 0.15290822088718414, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 69900 + }, + { + "epoch": 0.9621865500213208, + "grad_norm": 0.1691426783800125, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 69950 + }, + { + "epoch": 0.9628743173908857, + "grad_norm": 0.1656666249036789, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 70000 + }, + { + "epoch": 0.9635620847604506, + "grad_norm": 0.15653489530086517, + "learning_rate": 0.0001, + "loss": 1.7811, + "step": 70050 + }, + { + "epoch": 0.9642498521300156, + "grad_norm": 0.15945695340633392, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 70100 + }, + { + "epoch": 0.9649376194995805, + "grad_norm": 0.173899307847023, + "learning_rate": 0.0001, + "loss": 1.782, + "step": 70150 + }, + { + "epoch": 0.9656253868691453, + "grad_norm": 0.13982714712619781, + "learning_rate": 0.0001, + "loss": 1.7796, + "step": 70200 + }, + { + "epoch": 0.9663131542387103, + "grad_norm": 0.16570891439914703, + "learning_rate": 0.0001, + "loss": 1.7814, + "step": 70250 + }, + { + "epoch": 0.9670009216082752, + "grad_norm": 0.1680910885334015, + "learning_rate": 0.0001, + "loss": 1.7797, + "step": 70300 + }, + { + "epoch": 0.9676886889778401, + "grad_norm": 0.18602094054222107, + "learning_rate": 0.0001, + "loss": 1.7799, + "step": 70350 + }, + { + "epoch": 0.9683764563474051, + "grad_norm": 0.15171028673648834, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 70400 + }, + { + "epoch": 0.96906422371697, + "grad_norm": 0.17273007333278656, + "learning_rate": 0.0001, + "loss": 1.779, + "step": 70450 + }, + { + "epoch": 0.9697519910865349, + "grad_norm": 0.1841355711221695, + "learning_rate": 0.0001, + "loss": 1.7849, + "step": 70500 + }, + { + "epoch": 0.9704397584560998, + "grad_norm": 0.14629191160202026, + "learning_rate": 0.0001, + "loss": 1.7822, + "step": 70550 + }, + { + "epoch": 0.9711275258256648, + "grad_norm": 0.19547376036643982, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 70600 + }, + { + "epoch": 0.9718152931952296, + "grad_norm": 0.1695117950439453, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 70650 + }, + { + "epoch": 0.9725030605647945, + "grad_norm": 0.15734167397022247, + "learning_rate": 0.0001, + "loss": 1.7826, + "step": 70700 + }, + { + "epoch": 0.9731908279343595, + "grad_norm": 0.15534259378910065, + "learning_rate": 0.0001, + "loss": 1.7784, + "step": 70750 + }, + { + "epoch": 0.9738785953039244, + "grad_norm": 0.17524221539497375, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 70800 + }, + { + "epoch": 0.9745663626734893, + "grad_norm": 0.16551004350185394, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 70850 + }, + { + "epoch": 0.9752541300430543, + "grad_norm": 0.18955057859420776, + "learning_rate": 0.0001, + "loss": 1.7771, + "step": 70900 + }, + { + "epoch": 0.9759418974126192, + "grad_norm": 0.1564190834760666, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 70950 + }, + { + "epoch": 0.976629664782184, + "grad_norm": 0.18080365657806396, + "learning_rate": 0.0001, + "loss": 1.7809, + "step": 71000 + }, + { + "epoch": 0.977317432151749, + "grad_norm": 0.17052794992923737, + "learning_rate": 0.0001, + "loss": 1.7785, + "step": 71050 + }, + { + "epoch": 0.9780051995213139, + "grad_norm": 0.15679985284805298, + "learning_rate": 0.0001, + "loss": 1.777, + "step": 71100 + }, + { + "epoch": 0.9786929668908788, + "grad_norm": 0.14611759781837463, + "learning_rate": 0.0001, + "loss": 1.7831, + "step": 71150 + }, + { + "epoch": 0.9793807342604437, + "grad_norm": 0.17994888126850128, + "learning_rate": 0.0001, + "loss": 1.7811, + "step": 71200 + }, + { + "epoch": 0.9800685016300087, + "grad_norm": 0.1523408442735672, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 71250 + }, + { + "epoch": 0.9807562689995736, + "grad_norm": 0.14828313887119293, + "learning_rate": 0.0001, + "loss": 1.7766, + "step": 71300 + }, + { + "epoch": 0.9814440363691385, + "grad_norm": 0.1424998790025711, + "learning_rate": 0.0001, + "loss": 1.7788, + "step": 71350 + }, + { + "epoch": 0.9821318037387035, + "grad_norm": 0.14312104880809784, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 71400 + }, + { + "epoch": 0.9828195711082683, + "grad_norm": 0.14697466790676117, + "learning_rate": 0.0001, + "loss": 1.7808, + "step": 71450 + }, + { + "epoch": 0.9835073384778332, + "grad_norm": 0.16363121569156647, + "learning_rate": 0.0001, + "loss": 1.7783, + "step": 71500 + }, + { + "epoch": 0.9841951058473982, + "grad_norm": 0.1542508453130722, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 71550 + }, + { + "epoch": 0.9848828732169631, + "grad_norm": 0.1389523297548294, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 71600 + }, + { + "epoch": 0.985570640586528, + "grad_norm": 0.15856057405471802, + "learning_rate": 0.0001, + "loss": 1.7833, + "step": 71650 + }, + { + "epoch": 0.986258407956093, + "grad_norm": 0.15098857879638672, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 71700 + }, + { + "epoch": 0.9869461753256579, + "grad_norm": 0.14318101108074188, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 71750 + }, + { + "epoch": 0.9876339426952228, + "grad_norm": 0.16459529101848602, + "learning_rate": 0.0001, + "loss": 1.7774, + "step": 71800 + }, + { + "epoch": 0.9883217100647876, + "grad_norm": 0.14705689251422882, + "learning_rate": 0.0001, + "loss": 1.7813, + "step": 71850 + }, + { + "epoch": 0.9890094774343526, + "grad_norm": 0.2091091424226761, + "learning_rate": 0.0001, + "loss": 1.7819, + "step": 71900 + }, + { + "epoch": 0.9896972448039175, + "grad_norm": 0.1711418330669403, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 71950 + }, + { + "epoch": 0.9903850121734824, + "grad_norm": 0.15255683660507202, + "learning_rate": 0.0001, + "loss": 1.7851, + "step": 72000 + }, + { + "epoch": 0.9910727795430474, + "grad_norm": 0.17501915991306305, + "learning_rate": 0.0001, + "loss": 1.7824, + "step": 72050 + }, + { + "epoch": 0.9917605469126123, + "grad_norm": 0.1605847328901291, + "learning_rate": 0.0001, + "loss": 1.7802, + "step": 72100 + }, + { + "epoch": 0.9924483142821772, + "grad_norm": 0.14898759126663208, + "learning_rate": 0.0001, + "loss": 1.7836, + "step": 72150 + }, + { + "epoch": 0.9931360816517422, + "grad_norm": 0.15966999530792236, + "learning_rate": 0.0001, + "loss": 1.7773, + "step": 72200 + }, + { + "epoch": 0.993823849021307, + "grad_norm": 0.14977654814720154, + "learning_rate": 0.0001, + "loss": 1.7764, + "step": 72250 + }, + { + "epoch": 0.9945116163908719, + "grad_norm": 0.16077259182929993, + "learning_rate": 0.0001, + "loss": 1.7789, + "step": 72300 + }, + { + "epoch": 0.9951993837604368, + "grad_norm": 0.1603011190891266, + "learning_rate": 0.0001, + "loss": 1.7756, + "step": 72350 + }, + { + "epoch": 0.9958871511300018, + "grad_norm": 0.17926956713199615, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 72400 + }, + { + "epoch": 0.9965749184995667, + "grad_norm": 0.15523836016654968, + "learning_rate": 0.0001, + "loss": 1.7816, + "step": 72450 + }, + { + "epoch": 0.9972626858691316, + "grad_norm": 0.15533694624900818, + "learning_rate": 0.0001, + "loss": 1.7817, + "step": 72500 + }, + { + "epoch": 0.9979504532386966, + "grad_norm": 0.17167145013809204, + "learning_rate": 0.0001, + "loss": 1.7793, + "step": 72550 + }, + { + "epoch": 0.9986382206082615, + "grad_norm": 0.1536383181810379, + "learning_rate": 0.0001, + "loss": 1.7792, + "step": 72600 + }, + { + "epoch": 0.9993259879778263, + "grad_norm": 0.15611621737480164, + "learning_rate": 0.0001, + "loss": 1.7798, + "step": 72650 + } + ], + "logging_steps": 50, + "max_steps": 72699, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 1.7804002887190855e+21, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}