diff --git "a/trainer_state.json" "b/trainer_state.json" new file mode 100644--- /dev/null +++ "b/trainer_state.json" @@ -0,0 +1,10204 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 1.0, + "eval_steps": 500.0, + "global_step": 72699, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.0006877673695649183, + "grad_norm": 0.06897997111082077, + "learning_rate": 0.0001, + "loss": 1.8351, + "step": 50 + }, + { + "epoch": 0.0013755347391298366, + "grad_norm": 0.0760108083486557, + "learning_rate": 0.0001, + "loss": 1.8141, + "step": 100 + }, + { + "epoch": 0.002063302108694755, + "grad_norm": 0.07610712945461273, + "learning_rate": 0.0001, + "loss": 1.8047, + "step": 150 + }, + { + "epoch": 0.0027510694782596733, + "grad_norm": 0.07757844030857086, + "learning_rate": 0.0001, + "loss": 1.7991, + "step": 200 + }, + { + "epoch": 0.003438836847824592, + "grad_norm": 0.07695277035236359, + "learning_rate": 0.0001, + "loss": 1.7939, + "step": 250 + }, + { + "epoch": 0.00412660421738951, + "grad_norm": 0.07816466689109802, + "learning_rate": 0.0001, + "loss": 1.7895, + "step": 300 + }, + { + "epoch": 0.004814371586954428, + "grad_norm": 0.07848047465085983, + "learning_rate": 0.0001, + "loss": 1.7861, + "step": 350 + }, + { + "epoch": 0.0055021389565193465, + "grad_norm": 0.08218562602996826, + "learning_rate": 0.0001, + "loss": 1.7777, + "step": 400 + }, + { + "epoch": 0.006189906326084265, + "grad_norm": 0.07656747847795486, + "learning_rate": 0.0001, + "loss": 1.7805, + "step": 450 + }, + { + "epoch": 0.006877673695649184, + "grad_norm": 0.07792618125677109, + "learning_rate": 0.0001, + "loss": 1.7791, + "step": 500 + }, + { + "epoch": 0.007565441065214102, + "grad_norm": 0.10222098976373672, + "learning_rate": 0.0001, + "loss": 1.7782, + "step": 550 + }, + { + "epoch": 0.00825320843477902, + "grad_norm": 0.07399246096611023, + "learning_rate": 0.0001, + "loss": 1.7734, + "step": 600 + }, + { + "epoch": 0.008940975804343939, + "grad_norm": 0.08359324187040329, + "learning_rate": 0.0001, + "loss": 1.7699, + "step": 650 + }, + { + "epoch": 0.009628743173908856, + "grad_norm": 0.08059624582529068, + "learning_rate": 0.0001, + "loss": 1.7735, + "step": 700 + }, + { + "epoch": 0.010316510543473776, + "grad_norm": 0.07659206539392471, + "learning_rate": 0.0001, + "loss": 1.767, + "step": 750 + }, + { + "epoch": 0.011004277913038693, + "grad_norm": 0.08097860962152481, + "learning_rate": 0.0001, + "loss": 1.7703, + "step": 800 + }, + { + "epoch": 0.011692045282603612, + "grad_norm": 0.08088383823633194, + "learning_rate": 0.0001, + "loss": 1.7648, + "step": 850 + }, + { + "epoch": 0.01237981265216853, + "grad_norm": 0.07697251439094543, + "learning_rate": 0.0001, + "loss": 1.7671, + "step": 900 + }, + { + "epoch": 0.013067580021733449, + "grad_norm": 0.08154206722974777, + "learning_rate": 0.0001, + "loss": 1.7629, + "step": 950 + }, + { + "epoch": 0.013755347391298368, + "grad_norm": 0.08085618168115616, + "learning_rate": 0.0001, + "loss": 1.7632, + "step": 1000 + }, + { + "epoch": 0.014443114760863286, + "grad_norm": 0.08026960492134094, + "learning_rate": 0.0001, + "loss": 1.7594, + "step": 1050 + }, + { + "epoch": 0.015130882130428205, + "grad_norm": 0.07709988951683044, + "learning_rate": 0.0001, + "loss": 1.7566, + "step": 1100 + }, + { + "epoch": 0.015818649499993124, + "grad_norm": 0.08290690928697586, + "learning_rate": 0.0001, + "loss": 1.7607, + "step": 1150 + }, + { + "epoch": 0.01650641686955804, + "grad_norm": 0.07915584743022919, + "learning_rate": 0.0001, + "loss": 1.7527, + "step": 1200 + }, + { + "epoch": 0.01719418423912296, + "grad_norm": 0.08169861882925034, + "learning_rate": 0.0001, + "loss": 1.7535, + "step": 1250 + }, + { + "epoch": 0.017881951608687878, + "grad_norm": 0.07745598256587982, + "learning_rate": 0.0001, + "loss": 1.7542, + "step": 1300 + }, + { + "epoch": 0.018569718978252797, + "grad_norm": 0.07793346047401428, + "learning_rate": 0.0001, + "loss": 1.7536, + "step": 1350 + }, + { + "epoch": 0.019257486347817713, + "grad_norm": 0.0771438255906105, + "learning_rate": 0.0001, + "loss": 1.7486, + "step": 1400 + }, + { + "epoch": 0.019945253717382632, + "grad_norm": 0.08003713190555573, + "learning_rate": 0.0001, + "loss": 1.7501, + "step": 1450 + }, + { + "epoch": 0.02063302108694755, + "grad_norm": 0.08053753525018692, + "learning_rate": 0.0001, + "loss": 1.7493, + "step": 1500 + }, + { + "epoch": 0.02132078845651247, + "grad_norm": 0.0768120214343071, + "learning_rate": 0.0001, + "loss": 1.7482, + "step": 1550 + }, + { + "epoch": 0.022008555826077386, + "grad_norm": 0.07853832095861435, + "learning_rate": 0.0001, + "loss": 1.7493, + "step": 1600 + }, + { + "epoch": 0.022696323195642305, + "grad_norm": 0.083896704018116, + "learning_rate": 0.0001, + "loss": 1.7467, + "step": 1650 + }, + { + "epoch": 0.023384090565207225, + "grad_norm": 0.07456424832344055, + "learning_rate": 0.0001, + "loss": 1.7456, + "step": 1700 + }, + { + "epoch": 0.024071857934772144, + "grad_norm": 0.0817745104432106, + "learning_rate": 0.0001, + "loss": 1.7438, + "step": 1750 + }, + { + "epoch": 0.02475962530433706, + "grad_norm": 0.0803341269493103, + "learning_rate": 0.0001, + "loss": 1.7434, + "step": 1800 + }, + { + "epoch": 0.02544739267390198, + "grad_norm": 0.07569049298763275, + "learning_rate": 0.0001, + "loss": 1.7449, + "step": 1850 + }, + { + "epoch": 0.026135160043466898, + "grad_norm": 0.07928919047117233, + "learning_rate": 0.0001, + "loss": 1.7388, + "step": 1900 + }, + { + "epoch": 0.026822927413031817, + "grad_norm": 0.0802655890583992, + "learning_rate": 0.0001, + "loss": 1.7409, + "step": 1950 + }, + { + "epoch": 0.027510694782596736, + "grad_norm": 0.07988564670085907, + "learning_rate": 0.0001, + "loss": 1.7396, + "step": 2000 + }, + { + "epoch": 0.028198462152161652, + "grad_norm": 0.08523596078157425, + "learning_rate": 0.0001, + "loss": 1.7375, + "step": 2050 + }, + { + "epoch": 0.02888622952172657, + "grad_norm": 0.08964947611093521, + "learning_rate": 0.0001, + "loss": 1.736, + "step": 2100 + }, + { + "epoch": 0.02957399689129149, + "grad_norm": 0.08145201951265335, + "learning_rate": 0.0001, + "loss": 1.7401, + "step": 2150 + }, + { + "epoch": 0.03026176426085641, + "grad_norm": 0.08227697014808655, + "learning_rate": 0.0001, + "loss": 1.7332, + "step": 2200 + }, + { + "epoch": 0.030949531630421325, + "grad_norm": 0.07990765571594238, + "learning_rate": 0.0001, + "loss": 1.7359, + "step": 2250 + }, + { + "epoch": 0.03163729899998625, + "grad_norm": 0.0835530161857605, + "learning_rate": 0.0001, + "loss": 1.7353, + "step": 2300 + }, + { + "epoch": 0.03232506636955116, + "grad_norm": 0.09058014303445816, + "learning_rate": 0.0001, + "loss": 1.7337, + "step": 2350 + }, + { + "epoch": 0.03301283373911608, + "grad_norm": 0.07631311565637589, + "learning_rate": 0.0001, + "loss": 1.7319, + "step": 2400 + }, + { + "epoch": 0.033700601108681, + "grad_norm": 0.07975883036851883, + "learning_rate": 0.0001, + "loss": 1.7331, + "step": 2450 + }, + { + "epoch": 0.03438836847824592, + "grad_norm": 0.0856098085641861, + "learning_rate": 0.0001, + "loss": 1.729, + "step": 2500 + }, + { + "epoch": 0.03507613584781084, + "grad_norm": 0.07782306522130966, + "learning_rate": 0.0001, + "loss": 1.7333, + "step": 2550 + }, + { + "epoch": 0.035763903217375756, + "grad_norm": 0.08808156102895737, + "learning_rate": 0.0001, + "loss": 1.7295, + "step": 2600 + }, + { + "epoch": 0.036451670586940675, + "grad_norm": 0.08983422815799713, + "learning_rate": 0.0001, + "loss": 1.7281, + "step": 2650 + }, + { + "epoch": 0.037139437956505594, + "grad_norm": 0.0850716382265091, + "learning_rate": 0.0001, + "loss": 1.7288, + "step": 2700 + }, + { + "epoch": 0.03782720532607051, + "grad_norm": 0.07847283780574799, + "learning_rate": 0.0001, + "loss": 1.7311, + "step": 2750 + }, + { + "epoch": 0.038514972695635426, + "grad_norm": 0.08320654928684235, + "learning_rate": 0.0001, + "loss": 1.7326, + "step": 2800 + }, + { + "epoch": 0.039202740065200345, + "grad_norm": 0.09107951819896698, + "learning_rate": 0.0001, + "loss": 1.7302, + "step": 2850 + }, + { + "epoch": 0.039890507434765264, + "grad_norm": 0.08038274943828583, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 2900 + }, + { + "epoch": 0.04057827480433018, + "grad_norm": 0.08225669711828232, + "learning_rate": 0.0001, + "loss": 1.7278, + "step": 2950 + }, + { + "epoch": 0.0412660421738951, + "grad_norm": 0.0852913185954094, + "learning_rate": 0.0001, + "loss": 1.7233, + "step": 3000 + }, + { + "epoch": 0.04195380954346002, + "grad_norm": 0.08353286981582642, + "learning_rate": 0.0001, + "loss": 1.7274, + "step": 3050 + }, + { + "epoch": 0.04264157691302494, + "grad_norm": 0.08236906677484512, + "learning_rate": 0.0001, + "loss": 1.7259, + "step": 3100 + }, + { + "epoch": 0.04332934428258986, + "grad_norm": 0.08094590902328491, + "learning_rate": 0.0001, + "loss": 1.7252, + "step": 3150 + }, + { + "epoch": 0.04401711165215477, + "grad_norm": 0.08384421467781067, + "learning_rate": 0.0001, + "loss": 1.7248, + "step": 3200 + }, + { + "epoch": 0.04470487902171969, + "grad_norm": 0.0810803771018982, + "learning_rate": 0.0001, + "loss": 1.7221, + "step": 3250 + }, + { + "epoch": 0.04539264639128461, + "grad_norm": 0.08863117545843124, + "learning_rate": 0.0001, + "loss": 1.7249, + "step": 3300 + }, + { + "epoch": 0.04608041376084953, + "grad_norm": 0.08272850513458252, + "learning_rate": 0.0001, + "loss": 1.7245, + "step": 3350 + }, + { + "epoch": 0.04676818113041445, + "grad_norm": 0.08872344344854355, + "learning_rate": 0.0001, + "loss": 1.7238, + "step": 3400 + }, + { + "epoch": 0.04745594849997937, + "grad_norm": 0.08702059090137482, + "learning_rate": 0.0001, + "loss": 1.723, + "step": 3450 + }, + { + "epoch": 0.04814371586954429, + "grad_norm": 0.07964511960744858, + "learning_rate": 0.0001, + "loss": 1.7208, + "step": 3500 + }, + { + "epoch": 0.04883148323910921, + "grad_norm": 0.08207014203071594, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 3550 + }, + { + "epoch": 0.04951925060867412, + "grad_norm": 0.08424731343984604, + "learning_rate": 0.0001, + "loss": 1.722, + "step": 3600 + }, + { + "epoch": 0.05020701797823904, + "grad_norm": 0.07758868485689163, + "learning_rate": 0.0001, + "loss": 1.7202, + "step": 3650 + }, + { + "epoch": 0.05089478534780396, + "grad_norm": 0.08630118519067764, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 3700 + }, + { + "epoch": 0.051582552717368876, + "grad_norm": 0.08104898035526276, + "learning_rate": 0.0001, + "loss": 1.7204, + "step": 3750 + }, + { + "epoch": 0.052270320086933796, + "grad_norm": 0.08586841821670532, + "learning_rate": 0.0001, + "loss": 1.7174, + "step": 3800 + }, + { + "epoch": 0.052958087456498715, + "grad_norm": 0.07795108109712601, + "learning_rate": 0.0001, + "loss": 1.7181, + "step": 3850 + }, + { + "epoch": 0.053645854826063634, + "grad_norm": 0.08409079909324646, + "learning_rate": 0.0001, + "loss": 1.7197, + "step": 3900 + }, + { + "epoch": 0.05433362219562855, + "grad_norm": 0.08350292593240738, + "learning_rate": 0.0001, + "loss": 1.7148, + "step": 3950 + }, + { + "epoch": 0.05502138956519347, + "grad_norm": 0.08318202942609787, + "learning_rate": 0.0001, + "loss": 1.7206, + "step": 4000 + }, + { + "epoch": 0.055709156934758385, + "grad_norm": 0.08408112823963165, + "learning_rate": 0.0001, + "loss": 1.7185, + "step": 4050 + }, + { + "epoch": 0.056396924304323304, + "grad_norm": 0.08753649145364761, + "learning_rate": 0.0001, + "loss": 1.7146, + "step": 4100 + }, + { + "epoch": 0.05708469167388822, + "grad_norm": 0.08772628009319305, + "learning_rate": 0.0001, + "loss": 1.719, + "step": 4150 + }, + { + "epoch": 0.05777245904345314, + "grad_norm": 0.08609924465417862, + "learning_rate": 0.0001, + "loss": 1.7189, + "step": 4200 + }, + { + "epoch": 0.05846022641301806, + "grad_norm": 0.08166562020778656, + "learning_rate": 0.0001, + "loss": 1.7156, + "step": 4250 + }, + { + "epoch": 0.05914799378258298, + "grad_norm": 0.10656413435935974, + "learning_rate": 0.0001, + "loss": 1.7198, + "step": 4300 + }, + { + "epoch": 0.0598357611521479, + "grad_norm": 0.08603313565254211, + "learning_rate": 0.0001, + "loss": 1.7152, + "step": 4350 + }, + { + "epoch": 0.06052352852171282, + "grad_norm": 0.08598031848669052, + "learning_rate": 0.0001, + "loss": 1.7163, + "step": 4400 + }, + { + "epoch": 0.06121129589127773, + "grad_norm": 0.08404461294412613, + "learning_rate": 0.0001, + "loss": 1.7125, + "step": 4450 + }, + { + "epoch": 0.06189906326084265, + "grad_norm": 0.08633497357368469, + "learning_rate": 0.0001, + "loss": 1.7118, + "step": 4500 + }, + { + "epoch": 0.06258683063040757, + "grad_norm": 0.087265245616436, + "learning_rate": 0.0001, + "loss": 1.7145, + "step": 4550 + }, + { + "epoch": 0.0632745979999725, + "grad_norm": 0.08514571934938431, + "learning_rate": 0.0001, + "loss": 1.7106, + "step": 4600 + }, + { + "epoch": 0.06396236536953741, + "grad_norm": 0.08637581020593643, + "learning_rate": 0.0001, + "loss": 1.7136, + "step": 4650 + }, + { + "epoch": 0.06465013273910232, + "grad_norm": 0.08497826009988785, + "learning_rate": 0.0001, + "loss": 1.7114, + "step": 4700 + }, + { + "epoch": 0.06533790010866725, + "grad_norm": 0.09578850120306015, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 4750 + }, + { + "epoch": 0.06602566747823216, + "grad_norm": 0.08667325973510742, + "learning_rate": 0.0001, + "loss": 1.7102, + "step": 4800 + }, + { + "epoch": 0.06671343484779708, + "grad_norm": 0.08680035918951035, + "learning_rate": 0.0001, + "loss": 1.7135, + "step": 4850 + }, + { + "epoch": 0.067401202217362, + "grad_norm": 0.0779147818684578, + "learning_rate": 0.0001, + "loss": 1.7144, + "step": 4900 + }, + { + "epoch": 0.06808896958692692, + "grad_norm": 0.08890321105718613, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 4950 + }, + { + "epoch": 0.06877673695649184, + "grad_norm": 0.08258546888828278, + "learning_rate": 0.0001, + "loss": 1.7126, + "step": 5000 + }, + { + "epoch": 0.06946450432605676, + "grad_norm": 0.0860370546579361, + "learning_rate": 0.0001, + "loss": 1.7084, + "step": 5050 + }, + { + "epoch": 0.07015227169562167, + "grad_norm": 0.0840732529759407, + "learning_rate": 0.0001, + "loss": 1.7153, + "step": 5100 + }, + { + "epoch": 0.07084003906518659, + "grad_norm": 0.08410490304231644, + "learning_rate": 0.0001, + "loss": 1.7105, + "step": 5150 + }, + { + "epoch": 0.07152780643475151, + "grad_norm": 0.07879506051540375, + "learning_rate": 0.0001, + "loss": 1.7067, + "step": 5200 + }, + { + "epoch": 0.07221557380431642, + "grad_norm": 0.0969550833106041, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 5250 + }, + { + "epoch": 0.07290334117388135, + "grad_norm": 0.08318573236465454, + "learning_rate": 0.0001, + "loss": 1.7101, + "step": 5300 + }, + { + "epoch": 0.07359110854344626, + "grad_norm": 0.08452852070331573, + "learning_rate": 0.0001, + "loss": 1.7081, + "step": 5350 + }, + { + "epoch": 0.07427887591301119, + "grad_norm": 0.08123467862606049, + "learning_rate": 0.0001, + "loss": 1.712, + "step": 5400 + }, + { + "epoch": 0.0749666432825761, + "grad_norm": 0.08749833703041077, + "learning_rate": 0.0001, + "loss": 1.7096, + "step": 5450 + }, + { + "epoch": 0.07565441065214101, + "grad_norm": 0.09270595759153366, + "learning_rate": 0.0001, + "loss": 1.7038, + "step": 5500 + }, + { + "epoch": 0.07634217802170594, + "grad_norm": 0.0823141559958458, + "learning_rate": 0.0001, + "loss": 1.7047, + "step": 5550 + }, + { + "epoch": 0.07702994539127085, + "grad_norm": 0.09130799025297165, + "learning_rate": 0.0001, + "loss": 1.707, + "step": 5600 + }, + { + "epoch": 0.07771771276083578, + "grad_norm": 0.08671557903289795, + "learning_rate": 0.0001, + "loss": 1.7083, + "step": 5650 + }, + { + "epoch": 0.07840548013040069, + "grad_norm": 0.08462578803300858, + "learning_rate": 0.0001, + "loss": 1.7018, + "step": 5700 + }, + { + "epoch": 0.07909324749996562, + "grad_norm": 0.08817154169082642, + "learning_rate": 0.0001, + "loss": 1.7024, + "step": 5750 + }, + { + "epoch": 0.07978101486953053, + "grad_norm": 0.09289363026618958, + "learning_rate": 0.0001, + "loss": 1.7077, + "step": 5800 + }, + { + "epoch": 0.08046878223909545, + "grad_norm": 0.08403845876455307, + "learning_rate": 0.0001, + "loss": 1.6997, + "step": 5850 + }, + { + "epoch": 0.08115654960866037, + "grad_norm": 0.08645929396152496, + "learning_rate": 0.0001, + "loss": 1.7051, + "step": 5900 + }, + { + "epoch": 0.08184431697822528, + "grad_norm": 0.08411643654108047, + "learning_rate": 0.0001, + "loss": 1.7059, + "step": 5950 + }, + { + "epoch": 0.0825320843477902, + "grad_norm": 0.09166127443313599, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 6000 + }, + { + "epoch": 0.08321985171735512, + "grad_norm": 0.08519306033849716, + "learning_rate": 0.0001, + "loss": 1.7056, + "step": 6050 + }, + { + "epoch": 0.08390761908692004, + "grad_norm": 0.08552943915128708, + "learning_rate": 0.0001, + "loss": 1.7034, + "step": 6100 + }, + { + "epoch": 0.08459538645648496, + "grad_norm": 0.08541347831487656, + "learning_rate": 0.0001, + "loss": 1.7046, + "step": 6150 + }, + { + "epoch": 0.08528315382604988, + "grad_norm": 0.08669677376747131, + "learning_rate": 0.0001, + "loss": 1.7042, + "step": 6200 + }, + { + "epoch": 0.0859709211956148, + "grad_norm": 0.08166626840829849, + "learning_rate": 0.0001, + "loss": 1.705, + "step": 6250 + }, + { + "epoch": 0.08665868856517972, + "grad_norm": 0.08560743182897568, + "learning_rate": 0.0001, + "loss": 1.7033, + "step": 6300 + }, + { + "epoch": 0.08734645593474463, + "grad_norm": 0.08644382655620575, + "learning_rate": 0.0001, + "loss": 1.7036, + "step": 6350 + }, + { + "epoch": 0.08803422330430954, + "grad_norm": 0.0843438133597374, + "learning_rate": 0.0001, + "loss": 1.6998, + "step": 6400 + }, + { + "epoch": 0.08872199067387447, + "grad_norm": 0.08665913343429565, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 6450 + }, + { + "epoch": 0.08940975804343938, + "grad_norm": 0.07717980444431305, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 6500 + }, + { + "epoch": 0.09009752541300431, + "grad_norm": 0.08198987692594528, + "learning_rate": 0.0001, + "loss": 1.7019, + "step": 6550 + }, + { + "epoch": 0.09078529278256922, + "grad_norm": 0.08437033742666245, + "learning_rate": 0.0001, + "loss": 1.7032, + "step": 6600 + }, + { + "epoch": 0.09147306015213415, + "grad_norm": 0.08697813004255295, + "learning_rate": 0.0001, + "loss": 1.701, + "step": 6650 + }, + { + "epoch": 0.09216082752169906, + "grad_norm": 0.08434218913316727, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 6700 + }, + { + "epoch": 0.09284859489126399, + "grad_norm": 0.08920707553625107, + "learning_rate": 0.0001, + "loss": 1.6993, + "step": 6750 + }, + { + "epoch": 0.0935363622608289, + "grad_norm": 0.08233244717121124, + "learning_rate": 0.0001, + "loss": 1.6993, + "step": 6800 + }, + { + "epoch": 0.09422412963039381, + "grad_norm": 0.0831465944647789, + "learning_rate": 0.0001, + "loss": 1.7009, + "step": 6850 + }, + { + "epoch": 0.09491189699995874, + "grad_norm": 0.08565302193164825, + "learning_rate": 0.0001, + "loss": 1.699, + "step": 6900 + }, + { + "epoch": 0.09559966436952365, + "grad_norm": 0.08086910843849182, + "learning_rate": 0.0001, + "loss": 1.7014, + "step": 6950 + }, + { + "epoch": 0.09628743173908857, + "grad_norm": 0.08551020175218582, + "learning_rate": 0.0001, + "loss": 1.7005, + "step": 7000 + }, + { + "epoch": 0.09697519910865349, + "grad_norm": 0.08930126577615738, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 7050 + }, + { + "epoch": 0.09766296647821841, + "grad_norm": 0.0878065899014473, + "learning_rate": 0.0001, + "loss": 1.6976, + "step": 7100 + }, + { + "epoch": 0.09835073384778333, + "grad_norm": 0.09178236126899719, + "learning_rate": 0.0001, + "loss": 1.6957, + "step": 7150 + }, + { + "epoch": 0.09903850121734824, + "grad_norm": 0.08948659151792526, + "learning_rate": 0.0001, + "loss": 1.7008, + "step": 7200 + }, + { + "epoch": 0.09972626858691316, + "grad_norm": 0.0796937644481659, + "learning_rate": 0.0001, + "loss": 1.7023, + "step": 7250 + }, + { + "epoch": 0.10041403595647808, + "grad_norm": 0.08426263928413391, + "learning_rate": 0.0001, + "loss": 1.6992, + "step": 7300 + }, + { + "epoch": 0.101101803326043, + "grad_norm": 0.0868782177567482, + "learning_rate": 0.0001, + "loss": 1.6996, + "step": 7350 + }, + { + "epoch": 0.10178957069560791, + "grad_norm": 0.08032725006341934, + "learning_rate": 0.0001, + "loss": 1.7, + "step": 7400 + }, + { + "epoch": 0.10247733806517284, + "grad_norm": 0.08831656724214554, + "learning_rate": 0.0001, + "loss": 1.6943, + "step": 7450 + }, + { + "epoch": 0.10316510543473775, + "grad_norm": 0.0800151377916336, + "learning_rate": 0.0001, + "loss": 1.6986, + "step": 7500 + }, + { + "epoch": 0.10385287280430268, + "grad_norm": 0.09132002294063568, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 7550 + }, + { + "epoch": 0.10454064017386759, + "grad_norm": 0.0925612673163414, + "learning_rate": 0.0001, + "loss": 1.6975, + "step": 7600 + }, + { + "epoch": 0.1052284075434325, + "grad_norm": 0.08521568030118942, + "learning_rate": 0.0001, + "loss": 1.6918, + "step": 7650 + }, + { + "epoch": 0.10591617491299743, + "grad_norm": 0.08576962351799011, + "learning_rate": 0.0001, + "loss": 1.6951, + "step": 7700 + }, + { + "epoch": 0.10660394228256234, + "grad_norm": 0.08979818224906921, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 7750 + }, + { + "epoch": 0.10729170965212727, + "grad_norm": 0.08927462995052338, + "learning_rate": 0.0001, + "loss": 1.6946, + "step": 7800 + }, + { + "epoch": 0.10797947702169218, + "grad_norm": 0.0862877294421196, + "learning_rate": 0.0001, + "loss": 1.6994, + "step": 7850 + }, + { + "epoch": 0.1086672443912571, + "grad_norm": 0.0894964188337326, + "learning_rate": 0.0001, + "loss": 1.6953, + "step": 7900 + }, + { + "epoch": 0.10935501176082202, + "grad_norm": 0.09155282378196716, + "learning_rate": 0.0001, + "loss": 1.6929, + "step": 7950 + }, + { + "epoch": 0.11004277913038694, + "grad_norm": 0.08712241053581238, + "learning_rate": 0.0001, + "loss": 1.6958, + "step": 8000 + }, + { + "epoch": 0.11073054649995186, + "grad_norm": 0.08157862722873688, + "learning_rate": 0.0001, + "loss": 1.6942, + "step": 8050 + }, + { + "epoch": 0.11141831386951677, + "grad_norm": 0.08820342272520065, + "learning_rate": 0.0001, + "loss": 1.6928, + "step": 8100 + }, + { + "epoch": 0.1121060812390817, + "grad_norm": 0.08663163334131241, + "learning_rate": 0.0001, + "loss": 1.6936, + "step": 8150 + }, + { + "epoch": 0.11279384860864661, + "grad_norm": 0.08267082273960114, + "learning_rate": 0.0001, + "loss": 1.6939, + "step": 8200 + }, + { + "epoch": 0.11348161597821153, + "grad_norm": 0.0887032225728035, + "learning_rate": 0.0001, + "loss": 1.6965, + "step": 8250 + }, + { + "epoch": 0.11416938334777645, + "grad_norm": 0.0971740260720253, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 8300 + }, + { + "epoch": 0.11485715071734137, + "grad_norm": 0.08765862137079239, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 8350 + }, + { + "epoch": 0.11554491808690628, + "grad_norm": 0.08712899684906006, + "learning_rate": 0.0001, + "loss": 1.6927, + "step": 8400 + }, + { + "epoch": 0.1162326854564712, + "grad_norm": 0.08298210054636002, + "learning_rate": 0.0001, + "loss": 1.6909, + "step": 8450 + }, + { + "epoch": 0.11692045282603612, + "grad_norm": 0.08736353367567062, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 8500 + }, + { + "epoch": 0.11760822019560103, + "grad_norm": 0.0875655934214592, + "learning_rate": 0.0001, + "loss": 1.6945, + "step": 8550 + }, + { + "epoch": 0.11829598756516596, + "grad_norm": 0.08461877703666687, + "learning_rate": 0.0001, + "loss": 1.6938, + "step": 8600 + }, + { + "epoch": 0.11898375493473087, + "grad_norm": 0.08680044859647751, + "learning_rate": 0.0001, + "loss": 1.6914, + "step": 8650 + }, + { + "epoch": 0.1196715223042958, + "grad_norm": 0.0874258503317833, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 8700 + }, + { + "epoch": 0.12035928967386071, + "grad_norm": 0.08458847552537918, + "learning_rate": 0.0001, + "loss": 1.6889, + "step": 8750 + }, + { + "epoch": 0.12104705704342564, + "grad_norm": 0.08277745544910431, + "learning_rate": 0.0001, + "loss": 1.6955, + "step": 8800 + }, + { + "epoch": 0.12173482441299055, + "grad_norm": 0.08370788395404816, + "learning_rate": 0.0001, + "loss": 1.693, + "step": 8850 + }, + { + "epoch": 0.12242259178255546, + "grad_norm": 0.08819565922021866, + "learning_rate": 0.0001, + "loss": 1.6898, + "step": 8900 + }, + { + "epoch": 0.12311035915212039, + "grad_norm": 0.08511374890804291, + "learning_rate": 0.0001, + "loss": 1.6877, + "step": 8950 + }, + { + "epoch": 0.1237981265216853, + "grad_norm": 0.08699041604995728, + "learning_rate": 0.0001, + "loss": 1.6919, + "step": 9000 + }, + { + "epoch": 0.12448589389125023, + "grad_norm": 0.08358988165855408, + "learning_rate": 0.0001, + "loss": 1.692, + "step": 9050 + }, + { + "epoch": 0.12517366126081514, + "grad_norm": 0.08777803182601929, + "learning_rate": 0.0001, + "loss": 1.6906, + "step": 9100 + }, + { + "epoch": 0.12586142863038005, + "grad_norm": 0.08466688543558121, + "learning_rate": 0.0001, + "loss": 1.69, + "step": 9150 + }, + { + "epoch": 0.126549195999945, + "grad_norm": 0.09302639961242676, + "learning_rate": 0.0001, + "loss": 1.6905, + "step": 9200 + }, + { + "epoch": 0.1272369633695099, + "grad_norm": 0.085854172706604, + "learning_rate": 0.0001, + "loss": 1.6901, + "step": 9250 + }, + { + "epoch": 0.12792473073907482, + "grad_norm": 0.10086188465356827, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 9300 + }, + { + "epoch": 0.12861249810863973, + "grad_norm": 0.09083892405033112, + "learning_rate": 0.0001, + "loss": 1.6896, + "step": 9350 + }, + { + "epoch": 0.12930026547820464, + "grad_norm": 0.08710625022649765, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 9400 + }, + { + "epoch": 0.12998803284776958, + "grad_norm": 0.08261676877737045, + "learning_rate": 0.0001, + "loss": 1.6878, + "step": 9450 + }, + { + "epoch": 0.1306758002173345, + "grad_norm": 0.08663926273584366, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 9500 + }, + { + "epoch": 0.1313635675868994, + "grad_norm": 0.09142150729894638, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 9550 + }, + { + "epoch": 0.13205133495646432, + "grad_norm": 0.0847783163189888, + "learning_rate": 0.0001, + "loss": 1.6907, + "step": 9600 + }, + { + "epoch": 0.13273910232602926, + "grad_norm": 0.08931740373373032, + "learning_rate": 0.0001, + "loss": 1.6863, + "step": 9650 + }, + { + "epoch": 0.13342686969559417, + "grad_norm": 0.10057684034109116, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 9700 + }, + { + "epoch": 0.13411463706515908, + "grad_norm": 0.08137702941894531, + "learning_rate": 0.0001, + "loss": 1.6899, + "step": 9750 + }, + { + "epoch": 0.134802404434724, + "grad_norm": 0.08957816660404205, + "learning_rate": 0.0001, + "loss": 1.6893, + "step": 9800 + }, + { + "epoch": 0.1354901718042889, + "grad_norm": 0.09797471016645432, + "learning_rate": 0.0001, + "loss": 1.6894, + "step": 9850 + }, + { + "epoch": 0.13617793917385385, + "grad_norm": 0.08011554181575775, + "learning_rate": 0.0001, + "loss": 1.6862, + "step": 9900 + }, + { + "epoch": 0.13686570654341876, + "grad_norm": 0.0851946547627449, + "learning_rate": 0.0001, + "loss": 1.6874, + "step": 9950 + }, + { + "epoch": 0.13755347391298367, + "grad_norm": 0.0880025252699852, + "learning_rate": 0.0001, + "loss": 1.6891, + "step": 10000 + }, + { + "epoch": 0.13824124128254858, + "grad_norm": 0.09258699417114258, + "learning_rate": 0.0001, + "loss": 1.6841, + "step": 10050 + }, + { + "epoch": 0.13892900865211352, + "grad_norm": 0.08737599104642868, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 10100 + }, + { + "epoch": 0.13961677602167843, + "grad_norm": 0.08749546855688095, + "learning_rate": 0.0001, + "loss": 1.686, + "step": 10150 + }, + { + "epoch": 0.14030454339124335, + "grad_norm": 0.09744163602590561, + "learning_rate": 0.0001, + "loss": 1.6864, + "step": 10200 + }, + { + "epoch": 0.14099231076080826, + "grad_norm": 0.08548065274953842, + "learning_rate": 0.0001, + "loss": 1.6868, + "step": 10250 + }, + { + "epoch": 0.14168007813037317, + "grad_norm": 0.0831003487110138, + "learning_rate": 0.0001, + "loss": 1.6886, + "step": 10300 + }, + { + "epoch": 0.1423678454999381, + "grad_norm": 0.08544298261404037, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 10350 + }, + { + "epoch": 0.14305561286950302, + "grad_norm": 0.08695852756500244, + "learning_rate": 0.0001, + "loss": 1.6879, + "step": 10400 + }, + { + "epoch": 0.14374338023906794, + "grad_norm": 0.0854826420545578, + "learning_rate": 0.0001, + "loss": 1.685, + "step": 10450 + }, + { + "epoch": 0.14443114760863285, + "grad_norm": 0.08266725391149521, + "learning_rate": 0.0001, + "loss": 1.6854, + "step": 10500 + }, + { + "epoch": 0.14511891497819776, + "grad_norm": 0.08955837041139603, + "learning_rate": 0.0001, + "loss": 1.6856, + "step": 10550 + }, + { + "epoch": 0.1458066823477627, + "grad_norm": 0.11815349757671356, + "learning_rate": 0.0001, + "loss": 1.6823, + "step": 10600 + }, + { + "epoch": 0.1464944497173276, + "grad_norm": 0.08930766582489014, + "learning_rate": 0.0001, + "loss": 1.6848, + "step": 10650 + }, + { + "epoch": 0.14718221708689253, + "grad_norm": 0.08815452456474304, + "learning_rate": 0.0001, + "loss": 1.6819, + "step": 10700 + }, + { + "epoch": 0.14786998445645744, + "grad_norm": 0.09267221391201019, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 10750 + }, + { + "epoch": 0.14855775182602238, + "grad_norm": 0.08665202558040619, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 10800 + }, + { + "epoch": 0.1492455191955873, + "grad_norm": 0.083659827709198, + "learning_rate": 0.0001, + "loss": 1.6855, + "step": 10850 + }, + { + "epoch": 0.1499332865651522, + "grad_norm": 0.08788301050662994, + "learning_rate": 0.0001, + "loss": 1.6882, + "step": 10900 + }, + { + "epoch": 0.15062105393471711, + "grad_norm": 0.08252999931573868, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 10950 + }, + { + "epoch": 0.15130882130428203, + "grad_norm": 0.08625887334346771, + "learning_rate": 0.0001, + "loss": 1.6844, + "step": 11000 + }, + { + "epoch": 0.15199658867384697, + "grad_norm": 0.0943947359919548, + "learning_rate": 0.0001, + "loss": 1.684, + "step": 11050 + }, + { + "epoch": 0.15268435604341188, + "grad_norm": 0.08911854773759842, + "learning_rate": 0.0001, + "loss": 1.6873, + "step": 11100 + }, + { + "epoch": 0.1533721234129768, + "grad_norm": 0.08470957726240158, + "learning_rate": 0.0001, + "loss": 1.6807, + "step": 11150 + }, + { + "epoch": 0.1540598907825417, + "grad_norm": 0.08653486520051956, + "learning_rate": 0.0001, + "loss": 1.6822, + "step": 11200 + }, + { + "epoch": 0.15474765815210664, + "grad_norm": 0.08835257589817047, + "learning_rate": 0.0001, + "loss": 1.6827, + "step": 11250 + }, + { + "epoch": 0.15543542552167156, + "grad_norm": 0.08290986716747284, + "learning_rate": 0.0001, + "loss": 1.6795, + "step": 11300 + }, + { + "epoch": 0.15612319289123647, + "grad_norm": 0.36724504828453064, + "learning_rate": 0.0001, + "loss": 1.6793, + "step": 11350 + }, + { + "epoch": 0.15681096026080138, + "grad_norm": 0.0866614505648613, + "learning_rate": 0.0001, + "loss": 1.6815, + "step": 11400 + }, + { + "epoch": 0.1574987276303663, + "grad_norm": 0.09343897551298141, + "learning_rate": 0.0001, + "loss": 1.6824, + "step": 11450 + }, + { + "epoch": 0.15818649499993123, + "grad_norm": 0.09004870057106018, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 11500 + }, + { + "epoch": 0.15887426236949614, + "grad_norm": 0.08509238809347153, + "learning_rate": 0.0001, + "loss": 1.6816, + "step": 11550 + }, + { + "epoch": 0.15956202973906106, + "grad_norm": 0.09254981577396393, + "learning_rate": 0.0001, + "loss": 1.6821, + "step": 11600 + }, + { + "epoch": 0.16024979710862597, + "grad_norm": 0.08585833013057709, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 11650 + }, + { + "epoch": 0.1609375644781909, + "grad_norm": 0.08284648507833481, + "learning_rate": 0.0001, + "loss": 1.6843, + "step": 11700 + }, + { + "epoch": 0.16162533184775582, + "grad_norm": 0.08601938188076019, + "learning_rate": 0.0001, + "loss": 1.6829, + "step": 11750 + }, + { + "epoch": 0.16231309921732073, + "grad_norm": 0.08931539207696915, + "learning_rate": 0.0001, + "loss": 1.682, + "step": 11800 + }, + { + "epoch": 0.16300086658688565, + "grad_norm": 0.09171988070011139, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 11850 + }, + { + "epoch": 0.16368863395645056, + "grad_norm": 0.08799155801534653, + "learning_rate": 0.0001, + "loss": 1.6817, + "step": 11900 + }, + { + "epoch": 0.1643764013260155, + "grad_norm": 0.09240555018186569, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 11950 + }, + { + "epoch": 0.1650641686955804, + "grad_norm": 0.08808182179927826, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 12000 + }, + { + "epoch": 0.16575193606514532, + "grad_norm": 0.09885094314813614, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 12050 + }, + { + "epoch": 0.16643970343471023, + "grad_norm": 0.0844956710934639, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 12100 + }, + { + "epoch": 0.16712747080427517, + "grad_norm": 0.08291836082935333, + "learning_rate": 0.0001, + "loss": 1.6803, + "step": 12150 + }, + { + "epoch": 0.1678152381738401, + "grad_norm": 0.085113026201725, + "learning_rate": 0.0001, + "loss": 1.6812, + "step": 12200 + }, + { + "epoch": 0.168503005543405, + "grad_norm": 0.09225833415985107, + "learning_rate": 0.0001, + "loss": 1.6814, + "step": 12250 + }, + { + "epoch": 0.1691907729129699, + "grad_norm": 0.09046541899442673, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 12300 + }, + { + "epoch": 0.16987854028253482, + "grad_norm": 0.08825410902500153, + "learning_rate": 0.0001, + "loss": 1.6804, + "step": 12350 + }, + { + "epoch": 0.17056630765209976, + "grad_norm": 0.08579885214567184, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 12400 + }, + { + "epoch": 0.17125407502166468, + "grad_norm": 0.08292757719755173, + "learning_rate": 0.0001, + "loss": 1.6787, + "step": 12450 + }, + { + "epoch": 0.1719418423912296, + "grad_norm": 0.08542559295892715, + "learning_rate": 0.0001, + "loss": 1.6805, + "step": 12500 + }, + { + "epoch": 0.1726296097607945, + "grad_norm": 0.08370521664619446, + "learning_rate": 0.0001, + "loss": 1.6802, + "step": 12550 + }, + { + "epoch": 0.17331737713035944, + "grad_norm": 0.08622439205646515, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 12600 + }, + { + "epoch": 0.17400514449992435, + "grad_norm": 0.08716084808111191, + "learning_rate": 0.0001, + "loss": 1.6809, + "step": 12650 + }, + { + "epoch": 0.17469291186948926, + "grad_norm": 0.09401235729455948, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 12700 + }, + { + "epoch": 0.17538067923905418, + "grad_norm": 0.08970025926828384, + "learning_rate": 0.0001, + "loss": 1.6756, + "step": 12750 + }, + { + "epoch": 0.1760684466086191, + "grad_norm": 0.09375713020563126, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 12800 + }, + { + "epoch": 0.17675621397818403, + "grad_norm": 0.09062221646308899, + "learning_rate": 0.0001, + "loss": 1.6775, + "step": 12850 + }, + { + "epoch": 0.17744398134774894, + "grad_norm": 0.08808109164237976, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 12900 + }, + { + "epoch": 0.17813174871731385, + "grad_norm": 0.08363673090934753, + "learning_rate": 0.0001, + "loss": 1.6796, + "step": 12950 + }, + { + "epoch": 0.17881951608687877, + "grad_norm": 0.09371782094240189, + "learning_rate": 0.0001, + "loss": 1.6757, + "step": 13000 + }, + { + "epoch": 0.1795072834564437, + "grad_norm": 0.09487029165029526, + "learning_rate": 0.0001, + "loss": 1.6801, + "step": 13050 + }, + { + "epoch": 0.18019505082600862, + "grad_norm": 0.09358393400907516, + "learning_rate": 0.0001, + "loss": 1.6772, + "step": 13100 + }, + { + "epoch": 0.18088281819557353, + "grad_norm": 0.09072808176279068, + "learning_rate": 0.0001, + "loss": 1.6764, + "step": 13150 + }, + { + "epoch": 0.18157058556513844, + "grad_norm": 0.10067198425531387, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 13200 + }, + { + "epoch": 0.18225835293470335, + "grad_norm": 0.0920378789305687, + "learning_rate": 0.0001, + "loss": 1.6737, + "step": 13250 + }, + { + "epoch": 0.1829461203042683, + "grad_norm": 0.09141296148300171, + "learning_rate": 0.0001, + "loss": 1.68, + "step": 13300 + }, + { + "epoch": 0.1836338876738332, + "grad_norm": 0.09030226618051529, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 13350 + }, + { + "epoch": 0.18432165504339812, + "grad_norm": 0.09276604652404785, + "learning_rate": 0.0001, + "loss": 1.681, + "step": 13400 + }, + { + "epoch": 0.18500942241296303, + "grad_norm": 0.08355134725570679, + "learning_rate": 0.0001, + "loss": 1.6777, + "step": 13450 + }, + { + "epoch": 0.18569718978252797, + "grad_norm": 0.0854455828666687, + "learning_rate": 0.0001, + "loss": 1.6763, + "step": 13500 + }, + { + "epoch": 0.18638495715209288, + "grad_norm": 0.09083615243434906, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 13550 + }, + { + "epoch": 0.1870727245216578, + "grad_norm": 0.08724341541528702, + "learning_rate": 0.0001, + "loss": 1.6751, + "step": 13600 + }, + { + "epoch": 0.1877604918912227, + "grad_norm": 0.08758584409952164, + "learning_rate": 0.0001, + "loss": 1.6751, + "step": 13650 + }, + { + "epoch": 0.18844825926078762, + "grad_norm": 0.09144899994134903, + "learning_rate": 0.0001, + "loss": 1.6779, + "step": 13700 + }, + { + "epoch": 0.18913602663035256, + "grad_norm": 0.09547972679138184, + "learning_rate": 0.0001, + "loss": 1.6739, + "step": 13750 + }, + { + "epoch": 0.18982379399991747, + "grad_norm": 0.09593083709478378, + "learning_rate": 0.0001, + "loss": 1.6762, + "step": 13800 + }, + { + "epoch": 0.19051156136948239, + "grad_norm": 0.09824243932962418, + "learning_rate": 0.0001, + "loss": 1.6744, + "step": 13850 + }, + { + "epoch": 0.1911993287390473, + "grad_norm": 0.08824923634529114, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 13900 + }, + { + "epoch": 0.1918870961086122, + "grad_norm": 0.09215164929628372, + "learning_rate": 0.0001, + "loss": 1.6786, + "step": 13950 + }, + { + "epoch": 0.19257486347817715, + "grad_norm": 0.09035980701446533, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 14000 + }, + { + "epoch": 0.19326263084774206, + "grad_norm": 0.09424874931573868, + "learning_rate": 0.0001, + "loss": 1.6753, + "step": 14050 + }, + { + "epoch": 0.19395039821730697, + "grad_norm": 0.08970798552036285, + "learning_rate": 0.0001, + "loss": 1.6771, + "step": 14100 + }, + { + "epoch": 0.1946381655868719, + "grad_norm": 0.0905926451086998, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 14150 + }, + { + "epoch": 0.19532593295643683, + "grad_norm": 0.0877937376499176, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 14200 + }, + { + "epoch": 0.19601370032600174, + "grad_norm": 0.08889397233724594, + "learning_rate": 0.0001, + "loss": 1.6778, + "step": 14250 + }, + { + "epoch": 0.19670146769556665, + "grad_norm": 0.09094177931547165, + "learning_rate": 0.0001, + "loss": 1.6791, + "step": 14300 + }, + { + "epoch": 0.19738923506513156, + "grad_norm": 0.08906359225511551, + "learning_rate": 0.0001, + "loss": 1.6746, + "step": 14350 + }, + { + "epoch": 0.19807700243469648, + "grad_norm": 0.086732417345047, + "learning_rate": 0.0001, + "loss": 1.6729, + "step": 14400 + }, + { + "epoch": 0.19876476980426142, + "grad_norm": 0.08629673719406128, + "learning_rate": 0.0001, + "loss": 1.6768, + "step": 14450 + }, + { + "epoch": 0.19945253717382633, + "grad_norm": 0.08930768072605133, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 14500 + }, + { + "epoch": 0.20014030454339124, + "grad_norm": 0.09292076528072357, + "learning_rate": 0.0001, + "loss": 1.6733, + "step": 14550 + }, + { + "epoch": 0.20082807191295615, + "grad_norm": 0.09250384569168091, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 14600 + }, + { + "epoch": 0.2015158392825211, + "grad_norm": 0.0884978249669075, + "learning_rate": 0.0001, + "loss": 1.6727, + "step": 14650 + }, + { + "epoch": 0.202203606652086, + "grad_norm": 0.09365909546613693, + "learning_rate": 0.0001, + "loss": 1.6732, + "step": 14700 + }, + { + "epoch": 0.20289137402165092, + "grad_norm": 0.09228827059268951, + "learning_rate": 0.0001, + "loss": 1.6774, + "step": 14750 + }, + { + "epoch": 0.20357914139121583, + "grad_norm": 0.08782710880041122, + "learning_rate": 0.0001, + "loss": 1.6724, + "step": 14800 + }, + { + "epoch": 0.20426690876078074, + "grad_norm": 0.09346231073141098, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 14850 + }, + { + "epoch": 0.20495467613034568, + "grad_norm": 0.08543895184993744, + "learning_rate": 0.0001, + "loss": 1.6719, + "step": 14900 + }, + { + "epoch": 0.2056424434999106, + "grad_norm": 0.08667740225791931, + "learning_rate": 0.0001, + "loss": 1.675, + "step": 14950 + }, + { + "epoch": 0.2063302108694755, + "grad_norm": 0.09252138435840607, + "learning_rate": 0.0001, + "loss": 1.6731, + "step": 15000 + }, + { + "epoch": 0.20701797823904042, + "grad_norm": 0.09669343382120132, + "learning_rate": 0.0001, + "loss": 1.6721, + "step": 15050 + }, + { + "epoch": 0.20770574560860536, + "grad_norm": 0.08828496187925339, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 15100 + }, + { + "epoch": 0.20839351297817027, + "grad_norm": 0.08292458951473236, + "learning_rate": 0.0001, + "loss": 1.6754, + "step": 15150 + }, + { + "epoch": 0.20908128034773518, + "grad_norm": 0.085512176156044, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 15200 + }, + { + "epoch": 0.2097690477173001, + "grad_norm": 0.08498524874448776, + "learning_rate": 0.0001, + "loss": 1.674, + "step": 15250 + }, + { + "epoch": 0.210456815086865, + "grad_norm": 0.08492223173379898, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 15300 + }, + { + "epoch": 0.21114458245642995, + "grad_norm": 0.08872613310813904, + "learning_rate": 0.0001, + "loss": 1.6722, + "step": 15350 + }, + { + "epoch": 0.21183234982599486, + "grad_norm": 0.08605389297008514, + "learning_rate": 0.0001, + "loss": 1.6783, + "step": 15400 + }, + { + "epoch": 0.21252011719555977, + "grad_norm": 0.09041046351194382, + "learning_rate": 0.0001, + "loss": 1.6735, + "step": 15450 + }, + { + "epoch": 0.21320788456512468, + "grad_norm": 0.09070464968681335, + "learning_rate": 0.0001, + "loss": 1.6758, + "step": 15500 + }, + { + "epoch": 0.21389565193468962, + "grad_norm": 0.09620167315006256, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 15550 + }, + { + "epoch": 0.21458341930425454, + "grad_norm": 0.08734453469514847, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 15600 + }, + { + "epoch": 0.21527118667381945, + "grad_norm": 0.09259238839149475, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 15650 + }, + { + "epoch": 0.21595895404338436, + "grad_norm": 0.08994511514902115, + "learning_rate": 0.0001, + "loss": 1.6734, + "step": 15700 + }, + { + "epoch": 0.21664672141294927, + "grad_norm": 0.08940455317497253, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 15750 + }, + { + "epoch": 0.2173344887825142, + "grad_norm": 0.08544927090406418, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 15800 + }, + { + "epoch": 0.21802225615207912, + "grad_norm": 0.08161517977714539, + "learning_rate": 0.0001, + "loss": 1.672, + "step": 15850 + }, + { + "epoch": 0.21871002352164404, + "grad_norm": 0.09207966923713684, + "learning_rate": 0.0001, + "loss": 1.6709, + "step": 15900 + }, + { + "epoch": 0.21939779089120895, + "grad_norm": 0.08921949565410614, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 15950 + }, + { + "epoch": 0.2200855582607739, + "grad_norm": 0.08339584618806839, + "learning_rate": 0.0001, + "loss": 1.6713, + "step": 16000 + }, + { + "epoch": 0.2207733256303388, + "grad_norm": 0.08629053831100464, + "learning_rate": 0.0001, + "loss": 1.6707, + "step": 16050 + }, + { + "epoch": 0.2214610929999037, + "grad_norm": 0.08945165574550629, + "learning_rate": 0.0001, + "loss": 1.6706, + "step": 16100 + }, + { + "epoch": 0.22214886036946863, + "grad_norm": 0.09606142342090607, + "learning_rate": 0.0001, + "loss": 1.6741, + "step": 16150 + }, + { + "epoch": 0.22283662773903354, + "grad_norm": 0.08945801854133606, + "learning_rate": 0.0001, + "loss": 1.6725, + "step": 16200 + }, + { + "epoch": 0.22352439510859848, + "grad_norm": 0.09277788549661636, + "learning_rate": 0.0001, + "loss": 1.6673, + "step": 16250 + }, + { + "epoch": 0.2242121624781634, + "grad_norm": 0.08870001137256622, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 16300 + }, + { + "epoch": 0.2248999298477283, + "grad_norm": 0.09212413430213928, + "learning_rate": 0.0001, + "loss": 1.6696, + "step": 16350 + }, + { + "epoch": 0.22558769721729321, + "grad_norm": 0.09183049201965332, + "learning_rate": 0.0001, + "loss": 1.6723, + "step": 16400 + }, + { + "epoch": 0.22627546458685815, + "grad_norm": 0.08794934302568436, + "learning_rate": 0.0001, + "loss": 1.6694, + "step": 16450 + }, + { + "epoch": 0.22696323195642307, + "grad_norm": 0.08112750947475433, + "learning_rate": 0.0001, + "loss": 1.6703, + "step": 16500 + }, + { + "epoch": 0.22765099932598798, + "grad_norm": 0.09124380350112915, + "learning_rate": 0.0001, + "loss": 1.6697, + "step": 16550 + }, + { + "epoch": 0.2283387666955529, + "grad_norm": 0.09015939384698868, + "learning_rate": 0.0001, + "loss": 1.6716, + "step": 16600 + }, + { + "epoch": 0.2290265340651178, + "grad_norm": 0.08522862941026688, + "learning_rate": 0.0001, + "loss": 1.6708, + "step": 16650 + }, + { + "epoch": 0.22971430143468274, + "grad_norm": 0.0881582573056221, + "learning_rate": 0.0001, + "loss": 1.671, + "step": 16700 + }, + { + "epoch": 0.23040206880424766, + "grad_norm": 0.09910543262958527, + "learning_rate": 0.0001, + "loss": 1.6674, + "step": 16750 + }, + { + "epoch": 0.23108983617381257, + "grad_norm": 0.08917541056871414, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 16800 + }, + { + "epoch": 0.23177760354337748, + "grad_norm": 0.1020062193274498, + "learning_rate": 0.0001, + "loss": 1.6699, + "step": 16850 + }, + { + "epoch": 0.2324653709129424, + "grad_norm": 0.09061840176582336, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 16900 + }, + { + "epoch": 0.23315313828250733, + "grad_norm": 0.08623366057872772, + "learning_rate": 0.0001, + "loss": 1.6663, + "step": 16950 + }, + { + "epoch": 0.23384090565207225, + "grad_norm": 0.09871737658977509, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 17000 + }, + { + "epoch": 0.23452867302163716, + "grad_norm": 0.08913227170705795, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 17050 + }, + { + "epoch": 0.23521644039120207, + "grad_norm": 0.08803007751703262, + "learning_rate": 0.0001, + "loss": 1.67, + "step": 17100 + }, + { + "epoch": 0.235904207760767, + "grad_norm": 0.08955027163028717, + "learning_rate": 0.0001, + "loss": 1.6726, + "step": 17150 + }, + { + "epoch": 0.23659197513033192, + "grad_norm": 0.09248829632997513, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 17200 + }, + { + "epoch": 0.23727974249989683, + "grad_norm": 0.0897279903292656, + "learning_rate": 0.0001, + "loss": 1.6715, + "step": 17250 + }, + { + "epoch": 0.23796750986946175, + "grad_norm": 0.09461032599210739, + "learning_rate": 0.0001, + "loss": 1.6683, + "step": 17300 + }, + { + "epoch": 0.23865527723902666, + "grad_norm": 0.09740615636110306, + "learning_rate": 0.0001, + "loss": 1.6717, + "step": 17350 + }, + { + "epoch": 0.2393430446085916, + "grad_norm": 0.08833985030651093, + "learning_rate": 0.0001, + "loss": 1.6629, + "step": 17400 + }, + { + "epoch": 0.2400308119781565, + "grad_norm": 0.08585455268621445, + "learning_rate": 0.0001, + "loss": 1.6701, + "step": 17450 + }, + { + "epoch": 0.24071857934772142, + "grad_norm": 0.08493249863386154, + "learning_rate": 0.0001, + "loss": 1.6678, + "step": 17500 + }, + { + "epoch": 0.24140634671728634, + "grad_norm": 0.09439011663198471, + "learning_rate": 0.0001, + "loss": 1.6693, + "step": 17550 + }, + { + "epoch": 0.24209411408685128, + "grad_norm": 0.08851542323827744, + "learning_rate": 0.0001, + "loss": 1.6664, + "step": 17600 + }, + { + "epoch": 0.2427818814564162, + "grad_norm": 0.09400813281536102, + "learning_rate": 0.0001, + "loss": 1.6686, + "step": 17650 + }, + { + "epoch": 0.2434696488259811, + "grad_norm": 0.09821904450654984, + "learning_rate": 0.0001, + "loss": 1.665, + "step": 17700 + }, + { + "epoch": 0.244157416195546, + "grad_norm": 0.09214925765991211, + "learning_rate": 0.0001, + "loss": 1.6662, + "step": 17750 + }, + { + "epoch": 0.24484518356511092, + "grad_norm": 0.09265078604221344, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 17800 + }, + { + "epoch": 0.24553295093467586, + "grad_norm": 0.08704175055027008, + "learning_rate": 0.0001, + "loss": 1.6695, + "step": 17850 + }, + { + "epoch": 0.24622071830424078, + "grad_norm": 0.09113669395446777, + "learning_rate": 0.0001, + "loss": 1.6695, + "step": 17900 + }, + { + "epoch": 0.2469084856738057, + "grad_norm": 0.09568019956350327, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 17950 + }, + { + "epoch": 0.2475962530433706, + "grad_norm": 0.08732044696807861, + "learning_rate": 0.0001, + "loss": 1.6682, + "step": 18000 + }, + { + "epoch": 0.24828402041293554, + "grad_norm": 0.10421797633171082, + "learning_rate": 0.0001, + "loss": 1.6665, + "step": 18050 + }, + { + "epoch": 0.24897178778250045, + "grad_norm": 0.0909595936536789, + "learning_rate": 0.0001, + "loss": 1.668, + "step": 18100 + }, + { + "epoch": 0.24965955515206537, + "grad_norm": 0.088596411049366, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 18150 + }, + { + "epoch": 0.2503473225216303, + "grad_norm": 0.0889756828546524, + "learning_rate": 0.0001, + "loss": 1.6658, + "step": 18200 + }, + { + "epoch": 0.2510350898911952, + "grad_norm": 0.09015469253063202, + "learning_rate": 0.0001, + "loss": 1.6648, + "step": 18250 + }, + { + "epoch": 0.2517228572607601, + "grad_norm": 0.08240851014852524, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 18300 + }, + { + "epoch": 0.25241062463032504, + "grad_norm": 0.08527081459760666, + "learning_rate": 0.0001, + "loss": 1.6738, + "step": 18350 + }, + { + "epoch": 0.25309839199989, + "grad_norm": 0.0891634002327919, + "learning_rate": 0.0001, + "loss": 1.6659, + "step": 18400 + }, + { + "epoch": 0.25378615936945487, + "grad_norm": 0.08512425422668457, + "learning_rate": 0.0001, + "loss": 1.6685, + "step": 18450 + }, + { + "epoch": 0.2544739267390198, + "grad_norm": 0.08884911984205246, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 18500 + }, + { + "epoch": 0.2551616941085847, + "grad_norm": 0.09091176837682724, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 18550 + }, + { + "epoch": 0.25584946147814963, + "grad_norm": 0.09358973056077957, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 18600 + }, + { + "epoch": 0.25653722884771457, + "grad_norm": 0.08879607170820236, + "learning_rate": 0.0001, + "loss": 1.6675, + "step": 18650 + }, + { + "epoch": 0.25722499621727946, + "grad_norm": 0.08596090972423553, + "learning_rate": 0.0001, + "loss": 1.6684, + "step": 18700 + }, + { + "epoch": 0.2579127635868444, + "grad_norm": 0.08910541981458664, + "learning_rate": 0.0001, + "loss": 1.6631, + "step": 18750 + }, + { + "epoch": 0.2586005309564093, + "grad_norm": 0.09641165286302567, + "learning_rate": 0.0001, + "loss": 1.6677, + "step": 18800 + }, + { + "epoch": 0.2592882983259742, + "grad_norm": 0.08356452733278275, + "learning_rate": 0.0001, + "loss": 1.6622, + "step": 18850 + }, + { + "epoch": 0.25997606569553916, + "grad_norm": 0.09701082855463028, + "learning_rate": 0.0001, + "loss": 1.6651, + "step": 18900 + }, + { + "epoch": 0.26066383306510404, + "grad_norm": 0.09678094834089279, + "learning_rate": 0.0001, + "loss": 1.6647, + "step": 18950 + }, + { + "epoch": 0.261351600434669, + "grad_norm": 0.0965198501944542, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 19000 + }, + { + "epoch": 0.26203936780423387, + "grad_norm": 0.08802270889282227, + "learning_rate": 0.0001, + "loss": 1.6668, + "step": 19050 + }, + { + "epoch": 0.2627271351737988, + "grad_norm": 0.09215667843818665, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 19100 + }, + { + "epoch": 0.26341490254336375, + "grad_norm": 0.08846131712198257, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 19150 + }, + { + "epoch": 0.26410266991292863, + "grad_norm": 0.09379055351018906, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 19200 + }, + { + "epoch": 0.2647904372824936, + "grad_norm": 0.08257373422384262, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 19250 + }, + { + "epoch": 0.2654782046520585, + "grad_norm": 0.10151083022356033, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 19300 + }, + { + "epoch": 0.2661659720216234, + "grad_norm": 0.08694400638341904, + "learning_rate": 0.0001, + "loss": 1.6627, + "step": 19350 + }, + { + "epoch": 0.26685373939118834, + "grad_norm": 0.10145068168640137, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 19400 + }, + { + "epoch": 0.2675415067607532, + "grad_norm": 0.09250553697347641, + "learning_rate": 0.0001, + "loss": 1.6654, + "step": 19450 + }, + { + "epoch": 0.26822927413031816, + "grad_norm": 0.09129736572504044, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 19500 + }, + { + "epoch": 0.2689170414998831, + "grad_norm": 0.08812187612056732, + "learning_rate": 0.0001, + "loss": 1.6626, + "step": 19550 + }, + { + "epoch": 0.269604808869448, + "grad_norm": 0.08967280387878418, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 19600 + }, + { + "epoch": 0.2702925762390129, + "grad_norm": 0.12475176900625229, + "learning_rate": 0.0001, + "loss": 1.6655, + "step": 19650 + }, + { + "epoch": 0.2709803436085778, + "grad_norm": 0.09744782745838165, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 19700 + }, + { + "epoch": 0.27166811097814275, + "grad_norm": 0.09547289460897446, + "learning_rate": 0.0001, + "loss": 1.669, + "step": 19750 + }, + { + "epoch": 0.2723558783477077, + "grad_norm": 0.10474015772342682, + "learning_rate": 0.0001, + "loss": 1.6626, + "step": 19800 + }, + { + "epoch": 0.2730436457172726, + "grad_norm": 0.09450657665729523, + "learning_rate": 0.0001, + "loss": 1.661, + "step": 19850 + }, + { + "epoch": 0.2737314130868375, + "grad_norm": 0.09040903300046921, + "learning_rate": 0.0001, + "loss": 1.6633, + "step": 19900 + }, + { + "epoch": 0.2744191804564024, + "grad_norm": 0.09171809256076813, + "learning_rate": 0.0001, + "loss": 1.6632, + "step": 19950 + }, + { + "epoch": 0.27510694782596734, + "grad_norm": 0.08326128870248795, + "learning_rate": 0.0001, + "loss": 1.6641, + "step": 20000 + }, + { + "epoch": 0.2757947151955323, + "grad_norm": 0.08934567868709564, + "learning_rate": 0.0001, + "loss": 1.6639, + "step": 20050 + }, + { + "epoch": 0.27648248256509717, + "grad_norm": 0.08973874151706696, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 20100 + }, + { + "epoch": 0.2771702499346621, + "grad_norm": 0.08561202138662338, + "learning_rate": 0.0001, + "loss": 1.6669, + "step": 20150 + }, + { + "epoch": 0.27785801730422705, + "grad_norm": 0.09638834744691849, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 20200 + }, + { + "epoch": 0.27854578467379193, + "grad_norm": 0.08998638391494751, + "learning_rate": 0.0001, + "loss": 1.6614, + "step": 20250 + }, + { + "epoch": 0.27923355204335687, + "grad_norm": 0.0963120236992836, + "learning_rate": 0.0001, + "loss": 1.6643, + "step": 20300 + }, + { + "epoch": 0.27992131941292175, + "grad_norm": 0.09031634032726288, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 20350 + }, + { + "epoch": 0.2806090867824867, + "grad_norm": 0.08368144184350967, + "learning_rate": 0.0001, + "loss": 1.6619, + "step": 20400 + }, + { + "epoch": 0.28129685415205163, + "grad_norm": 0.08716925233602524, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 20450 + }, + { + "epoch": 0.2819846215216165, + "grad_norm": 0.08537859469652176, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 20500 + }, + { + "epoch": 0.28267238889118146, + "grad_norm": 0.08953417092561722, + "learning_rate": 0.0001, + "loss": 1.6596, + "step": 20550 + }, + { + "epoch": 0.28336015626074634, + "grad_norm": 0.09007777273654938, + "learning_rate": 0.0001, + "loss": 1.6609, + "step": 20600 + }, + { + "epoch": 0.2840479236303113, + "grad_norm": 0.09223859757184982, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 20650 + }, + { + "epoch": 0.2847356909998762, + "grad_norm": 0.08682399988174438, + "learning_rate": 0.0001, + "loss": 1.6611, + "step": 20700 + }, + { + "epoch": 0.2854234583694411, + "grad_norm": 0.0930657759308815, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 20750 + }, + { + "epoch": 0.28611122573900605, + "grad_norm": 0.08895847946405411, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 20800 + }, + { + "epoch": 0.28679899310857093, + "grad_norm": 0.09470313787460327, + "learning_rate": 0.0001, + "loss": 1.6613, + "step": 20850 + }, + { + "epoch": 0.28748676047813587, + "grad_norm": 0.08398160338401794, + "learning_rate": 0.0001, + "loss": 1.6591, + "step": 20900 + }, + { + "epoch": 0.2881745278477008, + "grad_norm": 0.09906968474388123, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 20950 + }, + { + "epoch": 0.2888622952172657, + "grad_norm": 0.08456803858280182, + "learning_rate": 0.0001, + "loss": 1.6653, + "step": 21000 + }, + { + "epoch": 0.28955006258683064, + "grad_norm": 0.09115090221166611, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 21050 + }, + { + "epoch": 0.2902378299563955, + "grad_norm": 0.09190023690462112, + "learning_rate": 0.0001, + "loss": 1.6618, + "step": 21100 + }, + { + "epoch": 0.29092559732596046, + "grad_norm": 0.0813484638929367, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 21150 + }, + { + "epoch": 0.2916133646955254, + "grad_norm": 0.09336463361978531, + "learning_rate": 0.0001, + "loss": 1.6597, + "step": 21200 + }, + { + "epoch": 0.2923011320650903, + "grad_norm": 0.08862286061048508, + "learning_rate": 0.0001, + "loss": 1.6606, + "step": 21250 + }, + { + "epoch": 0.2929888994346552, + "grad_norm": 0.09896854311227798, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 21300 + }, + { + "epoch": 0.29367666680422017, + "grad_norm": 0.08877858519554138, + "learning_rate": 0.0001, + "loss": 1.6625, + "step": 21350 + }, + { + "epoch": 0.29436443417378505, + "grad_norm": 0.0959668979048729, + "learning_rate": 0.0001, + "loss": 1.6637, + "step": 21400 + }, + { + "epoch": 0.29505220154335, + "grad_norm": 0.09921707957983017, + "learning_rate": 0.0001, + "loss": 1.6636, + "step": 21450 + }, + { + "epoch": 0.2957399689129149, + "grad_norm": 0.08495631068944931, + "learning_rate": 0.0001, + "loss": 1.6628, + "step": 21500 + }, + { + "epoch": 0.2964277362824798, + "grad_norm": 0.09217420965433121, + "learning_rate": 0.0001, + "loss": 1.6599, + "step": 21550 + }, + { + "epoch": 0.29711550365204475, + "grad_norm": 0.08740544319152832, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 21600 + }, + { + "epoch": 0.29780327102160964, + "grad_norm": 0.0943366289138794, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 21650 + }, + { + "epoch": 0.2984910383911746, + "grad_norm": 0.08775294572114944, + "learning_rate": 0.0001, + "loss": 1.6624, + "step": 21700 + }, + { + "epoch": 0.29917880576073946, + "grad_norm": 0.09602498263120651, + "learning_rate": 0.0001, + "loss": 1.6616, + "step": 21750 + }, + { + "epoch": 0.2998665731303044, + "grad_norm": 0.09389972686767578, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 21800 + }, + { + "epoch": 0.30055434049986934, + "grad_norm": 0.08765329420566559, + "learning_rate": 0.0001, + "loss": 1.6638, + "step": 21850 + }, + { + "epoch": 0.30124210786943423, + "grad_norm": 0.08868211507797241, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 21900 + }, + { + "epoch": 0.30192987523899917, + "grad_norm": 0.08885643631219864, + "learning_rate": 0.0001, + "loss": 1.6607, + "step": 21950 + }, + { + "epoch": 0.30261764260856405, + "grad_norm": 0.090006984770298, + "learning_rate": 0.0001, + "loss": 1.6646, + "step": 22000 + }, + { + "epoch": 0.303305409978129, + "grad_norm": 0.09837059676647186, + "learning_rate": 0.0001, + "loss": 1.662, + "step": 22050 + }, + { + "epoch": 0.30399317734769393, + "grad_norm": 0.09263578057289124, + "learning_rate": 0.0001, + "loss": 1.659, + "step": 22100 + }, + { + "epoch": 0.3046809447172588, + "grad_norm": 0.10446369647979736, + "learning_rate": 0.0001, + "loss": 1.6585, + "step": 22150 + }, + { + "epoch": 0.30536871208682376, + "grad_norm": 0.0927567332983017, + "learning_rate": 0.0001, + "loss": 1.6621, + "step": 22200 + }, + { + "epoch": 0.3060564794563887, + "grad_norm": 0.08797762542963028, + "learning_rate": 0.0001, + "loss": 1.6603, + "step": 22250 + }, + { + "epoch": 0.3067442468259536, + "grad_norm": 0.09671605378389359, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 22300 + }, + { + "epoch": 0.3074320141955185, + "grad_norm": 0.08927121758460999, + "learning_rate": 0.0001, + "loss": 1.6605, + "step": 22350 + }, + { + "epoch": 0.3081197815650834, + "grad_norm": 0.09074302762746811, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 22400 + }, + { + "epoch": 0.30880754893464835, + "grad_norm": 0.07775942236185074, + "learning_rate": 0.0001, + "loss": 1.6588, + "step": 22450 + }, + { + "epoch": 0.3094953163042133, + "grad_norm": 0.08967916667461395, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 22500 + }, + { + "epoch": 0.31018308367377817, + "grad_norm": 0.09383999556303024, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 22550 + }, + { + "epoch": 0.3108708510433431, + "grad_norm": 0.08825831860303879, + "learning_rate": 0.0001, + "loss": 1.6604, + "step": 22600 + }, + { + "epoch": 0.311558618412908, + "grad_norm": 0.08609315752983093, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 22650 + }, + { + "epoch": 0.31224638578247293, + "grad_norm": 0.09169257432222366, + "learning_rate": 0.0001, + "loss": 1.6617, + "step": 22700 + }, + { + "epoch": 0.3129341531520379, + "grad_norm": 0.08700941503047943, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 22750 + }, + { + "epoch": 0.31362192052160276, + "grad_norm": 0.09563438594341278, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 22800 + }, + { + "epoch": 0.3143096878911677, + "grad_norm": 0.08614029735326767, + "learning_rate": 0.0001, + "loss": 1.6602, + "step": 22850 + }, + { + "epoch": 0.3149974552607326, + "grad_norm": 0.09383559972047806, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 22900 + }, + { + "epoch": 0.3156852226302975, + "grad_norm": 0.08948863297700882, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 22950 + }, + { + "epoch": 0.31637298999986246, + "grad_norm": 0.09441053867340088, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 23000 + }, + { + "epoch": 0.31706075736942735, + "grad_norm": 0.10128844529390335, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 23050 + }, + { + "epoch": 0.3177485247389923, + "grad_norm": 0.09148503839969635, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 23100 + }, + { + "epoch": 0.31843629210855723, + "grad_norm": 0.09208353608846664, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 23150 + }, + { + "epoch": 0.3191240594781221, + "grad_norm": 0.09142562001943588, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 23200 + }, + { + "epoch": 0.31981182684768705, + "grad_norm": 0.0846257209777832, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 23250 + }, + { + "epoch": 0.32049959421725194, + "grad_norm": 0.09171921014785767, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 23300 + }, + { + "epoch": 0.3211873615868169, + "grad_norm": 0.0841546282172203, + "learning_rate": 0.0001, + "loss": 1.6578, + "step": 23350 + }, + { + "epoch": 0.3218751289563818, + "grad_norm": 0.09704132378101349, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 23400 + }, + { + "epoch": 0.3225628963259467, + "grad_norm": 0.08762117475271225, + "learning_rate": 0.0001, + "loss": 1.6589, + "step": 23450 + }, + { + "epoch": 0.32325066369551164, + "grad_norm": 0.0907997116446495, + "learning_rate": 0.0001, + "loss": 1.6586, + "step": 23500 + }, + { + "epoch": 0.3239384310650765, + "grad_norm": 0.09183737635612488, + "learning_rate": 0.0001, + "loss": 1.6564, + "step": 23550 + }, + { + "epoch": 0.32462619843464147, + "grad_norm": 0.08676784485578537, + "learning_rate": 0.0001, + "loss": 1.6568, + "step": 23600 + }, + { + "epoch": 0.3253139658042064, + "grad_norm": 0.09329142421483994, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 23650 + }, + { + "epoch": 0.3260017331737713, + "grad_norm": 0.09180528670549393, + "learning_rate": 0.0001, + "loss": 1.6562, + "step": 23700 + }, + { + "epoch": 0.32668950054333623, + "grad_norm": 0.09366358071565628, + "learning_rate": 0.0001, + "loss": 1.656, + "step": 23750 + }, + { + "epoch": 0.3273772679129011, + "grad_norm": 0.08830198645591736, + "learning_rate": 0.0001, + "loss": 1.6594, + "step": 23800 + }, + { + "epoch": 0.32806503528246606, + "grad_norm": 0.0980730950832367, + "learning_rate": 0.0001, + "loss": 1.6575, + "step": 23850 + }, + { + "epoch": 0.328752802652031, + "grad_norm": 0.08481217920780182, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 23900 + }, + { + "epoch": 0.3294405700215959, + "grad_norm": 0.10097907483577728, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 23950 + }, + { + "epoch": 0.3301283373911608, + "grad_norm": 0.09167522937059402, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 24000 + }, + { + "epoch": 0.33081610476072576, + "grad_norm": 0.0857388898730278, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 24050 + }, + { + "epoch": 0.33150387213029064, + "grad_norm": 0.08629937469959259, + "learning_rate": 0.0001, + "loss": 1.6598, + "step": 24100 + }, + { + "epoch": 0.3321916394998556, + "grad_norm": 0.09269680827856064, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 24150 + }, + { + "epoch": 0.33287940686942047, + "grad_norm": 0.08871278166770935, + "learning_rate": 0.0001, + "loss": 1.655, + "step": 24200 + }, + { + "epoch": 0.3335671742389854, + "grad_norm": 0.08813636004924774, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 24250 + }, + { + "epoch": 0.33425494160855035, + "grad_norm": 0.09300006926059723, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 24300 + }, + { + "epoch": 0.33494270897811523, + "grad_norm": 0.09674695879220963, + "learning_rate": 0.0001, + "loss": 1.658, + "step": 24350 + }, + { + "epoch": 0.3356304763476802, + "grad_norm": 0.08643494546413422, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 24400 + }, + { + "epoch": 0.33631824371724506, + "grad_norm": 0.09858324378728867, + "learning_rate": 0.0001, + "loss": 1.6592, + "step": 24450 + }, + { + "epoch": 0.33700601108681, + "grad_norm": 0.09983374178409576, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 24500 + }, + { + "epoch": 0.33769377845637494, + "grad_norm": 0.08795691281557083, + "learning_rate": 0.0001, + "loss": 1.6549, + "step": 24550 + }, + { + "epoch": 0.3383815458259398, + "grad_norm": 0.09592121094465256, + "learning_rate": 0.0001, + "loss": 1.6584, + "step": 24600 + }, + { + "epoch": 0.33906931319550476, + "grad_norm": 0.10493771731853485, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 24650 + }, + { + "epoch": 0.33975708056506965, + "grad_norm": 0.09947793930768967, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 24700 + }, + { + "epoch": 0.3404448479346346, + "grad_norm": 0.08845814317464828, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 24750 + }, + { + "epoch": 0.3411326153041995, + "grad_norm": 0.09179400652647018, + "learning_rate": 0.0001, + "loss": 1.6559, + "step": 24800 + }, + { + "epoch": 0.3418203826737644, + "grad_norm": 0.09495829790830612, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 24850 + }, + { + "epoch": 0.34250815004332935, + "grad_norm": 0.08588258177042007, + "learning_rate": 0.0001, + "loss": 1.6566, + "step": 24900 + }, + { + "epoch": 0.34319591741289424, + "grad_norm": 0.094237320125103, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 24950 + }, + { + "epoch": 0.3438836847824592, + "grad_norm": 0.09663517773151398, + "learning_rate": 0.0001, + "loss": 1.6563, + "step": 25000 + }, + { + "epoch": 0.3445714521520241, + "grad_norm": 0.08958858996629715, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 25050 + }, + { + "epoch": 0.345259219521589, + "grad_norm": 0.09064103662967682, + "learning_rate": 0.0001, + "loss": 1.6542, + "step": 25100 + }, + { + "epoch": 0.34594698689115394, + "grad_norm": 0.09530662000179291, + "learning_rate": 0.0001, + "loss": 1.6561, + "step": 25150 + }, + { + "epoch": 0.3466347542607189, + "grad_norm": 0.10598750412464142, + "learning_rate": 0.0001, + "loss": 1.6536, + "step": 25200 + }, + { + "epoch": 0.34732252163028376, + "grad_norm": 0.09103056788444519, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 25250 + }, + { + "epoch": 0.3480102889998487, + "grad_norm": 0.08924443274736404, + "learning_rate": 0.0001, + "loss": 1.654, + "step": 25300 + }, + { + "epoch": 0.3486980563694136, + "grad_norm": 0.0930083692073822, + "learning_rate": 0.0001, + "loss": 1.6557, + "step": 25350 + }, + { + "epoch": 0.34938582373897853, + "grad_norm": 0.09939617663621902, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 25400 + }, + { + "epoch": 0.35007359110854347, + "grad_norm": 0.1029135212302208, + "learning_rate": 0.0001, + "loss": 1.6546, + "step": 25450 + }, + { + "epoch": 0.35076135847810835, + "grad_norm": 0.0853937491774559, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 25500 + }, + { + "epoch": 0.3514491258476733, + "grad_norm": 0.09008786827325821, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 25550 + }, + { + "epoch": 0.3521368932172382, + "grad_norm": 0.10317879915237427, + "learning_rate": 0.0001, + "loss": 1.6595, + "step": 25600 + }, + { + "epoch": 0.3528246605868031, + "grad_norm": 0.08662793040275574, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 25650 + }, + { + "epoch": 0.35351242795636806, + "grad_norm": 0.09117420762777328, + "learning_rate": 0.0001, + "loss": 1.6571, + "step": 25700 + }, + { + "epoch": 0.35420019532593294, + "grad_norm": 0.08435691148042679, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 25750 + }, + { + "epoch": 0.3548879626954979, + "grad_norm": 0.0903763696551323, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 25800 + }, + { + "epoch": 0.35557573006506277, + "grad_norm": 0.0919974073767662, + "learning_rate": 0.0001, + "loss": 1.6541, + "step": 25850 + }, + { + "epoch": 0.3562634974346277, + "grad_norm": 0.08953440189361572, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 25900 + }, + { + "epoch": 0.35695126480419265, + "grad_norm": 0.08938378095626831, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 25950 + }, + { + "epoch": 0.35763903217375753, + "grad_norm": 0.09824621677398682, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 26000 + }, + { + "epoch": 0.35832679954332247, + "grad_norm": 0.09804233908653259, + "learning_rate": 0.0001, + "loss": 1.6567, + "step": 26050 + }, + { + "epoch": 0.3590145669128874, + "grad_norm": 0.1049811840057373, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 26100 + }, + { + "epoch": 0.3597023342824523, + "grad_norm": 0.09391681849956512, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 26150 + }, + { + "epoch": 0.36039010165201724, + "grad_norm": 0.10802886635065079, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 26200 + }, + { + "epoch": 0.3610778690215821, + "grad_norm": 0.08910466730594635, + "learning_rate": 0.0001, + "loss": 1.6551, + "step": 26250 + }, + { + "epoch": 0.36176563639114706, + "grad_norm": 0.08862115442752838, + "learning_rate": 0.0001, + "loss": 1.6538, + "step": 26300 + }, + { + "epoch": 0.362453403760712, + "grad_norm": 0.08569826930761337, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 26350 + }, + { + "epoch": 0.3631411711302769, + "grad_norm": 0.09084028750658035, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 26400 + }, + { + "epoch": 0.3638289384998418, + "grad_norm": 0.08381815999746323, + "learning_rate": 0.0001, + "loss": 1.6569, + "step": 26450 + }, + { + "epoch": 0.3645167058694067, + "grad_norm": 0.08396822959184647, + "learning_rate": 0.0001, + "loss": 1.6583, + "step": 26500 + }, + { + "epoch": 0.36520447323897165, + "grad_norm": 0.0848502367734909, + "learning_rate": 0.0001, + "loss": 1.6572, + "step": 26550 + }, + { + "epoch": 0.3658922406085366, + "grad_norm": 0.09545731544494629, + "learning_rate": 0.0001, + "loss": 1.6577, + "step": 26600 + }, + { + "epoch": 0.3665800079781015, + "grad_norm": 0.09016140550374985, + "learning_rate": 0.0001, + "loss": 1.6554, + "step": 26650 + }, + { + "epoch": 0.3672677753476664, + "grad_norm": 0.08480773121118546, + "learning_rate": 0.0001, + "loss": 1.6552, + "step": 26700 + }, + { + "epoch": 0.3679555427172313, + "grad_norm": 0.08351889252662659, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 26750 + }, + { + "epoch": 0.36864331008679624, + "grad_norm": 0.08628764003515244, + "learning_rate": 0.0001, + "loss": 1.6527, + "step": 26800 + }, + { + "epoch": 0.3693310774563612, + "grad_norm": 0.09489446878433228, + "learning_rate": 0.0001, + "loss": 1.6475, + "step": 26850 + }, + { + "epoch": 0.37001884482592606, + "grad_norm": 0.09590397030115128, + "learning_rate": 0.0001, + "loss": 1.6544, + "step": 26900 + }, + { + "epoch": 0.370706612195491, + "grad_norm": 0.08726419508457184, + "learning_rate": 0.0001, + "loss": 1.6553, + "step": 26950 + }, + { + "epoch": 0.37139437956505594, + "grad_norm": 0.09084060788154602, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 27000 + }, + { + "epoch": 0.3720821469346208, + "grad_norm": 0.08947817236185074, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 27050 + }, + { + "epoch": 0.37276991430418577, + "grad_norm": 0.0947887971997261, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 27100 + }, + { + "epoch": 0.37345768167375065, + "grad_norm": 0.09059783816337585, + "learning_rate": 0.0001, + "loss": 1.6491, + "step": 27150 + }, + { + "epoch": 0.3741454490433156, + "grad_norm": 0.09635411202907562, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 27200 + }, + { + "epoch": 0.37483321641288053, + "grad_norm": 0.09230557829141617, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 27250 + }, + { + "epoch": 0.3755209837824454, + "grad_norm": 0.09676863998174667, + "learning_rate": 0.0001, + "loss": 1.653, + "step": 27300 + }, + { + "epoch": 0.37620875115201036, + "grad_norm": 0.08972521126270294, + "learning_rate": 0.0001, + "loss": 1.6556, + "step": 27350 + }, + { + "epoch": 0.37689651852157524, + "grad_norm": 0.09976805746555328, + "learning_rate": 0.0001, + "loss": 1.6509, + "step": 27400 + }, + { + "epoch": 0.3775842858911402, + "grad_norm": 0.08917619287967682, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 27450 + }, + { + "epoch": 0.3782720532607051, + "grad_norm": 0.09673383086919785, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 27500 + }, + { + "epoch": 0.37895982063027, + "grad_norm": 0.0897962898015976, + "learning_rate": 0.0001, + "loss": 1.6533, + "step": 27550 + }, + { + "epoch": 0.37964758799983495, + "grad_norm": 0.10002953559160233, + "learning_rate": 0.0001, + "loss": 1.6526, + "step": 27600 + }, + { + "epoch": 0.38033535536939983, + "grad_norm": 0.10410989075899124, + "learning_rate": 0.0001, + "loss": 1.6523, + "step": 27650 + }, + { + "epoch": 0.38102312273896477, + "grad_norm": 0.09563115984201431, + "learning_rate": 0.0001, + "loss": 1.6519, + "step": 27700 + }, + { + "epoch": 0.3817108901085297, + "grad_norm": 0.08847146481275558, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 27750 + }, + { + "epoch": 0.3823986574780946, + "grad_norm": 0.088249571621418, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 27800 + }, + { + "epoch": 0.38308642484765953, + "grad_norm": 0.09289827942848206, + "learning_rate": 0.0001, + "loss": 1.6543, + "step": 27850 + }, + { + "epoch": 0.3837741922172244, + "grad_norm": 0.08918619155883789, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 27900 + }, + { + "epoch": 0.38446195958678936, + "grad_norm": 0.08992282301187515, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 27950 + }, + { + "epoch": 0.3851497269563543, + "grad_norm": 0.09168649464845657, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 28000 + }, + { + "epoch": 0.3858374943259192, + "grad_norm": 0.08888909220695496, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 28050 + }, + { + "epoch": 0.3865252616954841, + "grad_norm": 0.098940409719944, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 28100 + }, + { + "epoch": 0.38721302906504906, + "grad_norm": 0.08919871598482132, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 28150 + }, + { + "epoch": 0.38790079643461395, + "grad_norm": 0.09120476245880127, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 28200 + }, + { + "epoch": 0.3885885638041789, + "grad_norm": 0.09309836477041245, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 28250 + }, + { + "epoch": 0.3892763311737438, + "grad_norm": 0.0869157537817955, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 28300 + }, + { + "epoch": 0.3899640985433087, + "grad_norm": 0.09697956591844559, + "learning_rate": 0.0001, + "loss": 1.6534, + "step": 28350 + }, + { + "epoch": 0.39065186591287365, + "grad_norm": 0.09053334593772888, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 28400 + }, + { + "epoch": 0.39133963328243854, + "grad_norm": 0.09129533171653748, + "learning_rate": 0.0001, + "loss": 1.6558, + "step": 28450 + }, + { + "epoch": 0.3920274006520035, + "grad_norm": 0.09009382128715515, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 28500 + }, + { + "epoch": 0.39271516802156836, + "grad_norm": 0.10035433620214462, + "learning_rate": 0.0001, + "loss": 1.6521, + "step": 28550 + }, + { + "epoch": 0.3934029353911333, + "grad_norm": 0.09056541323661804, + "learning_rate": 0.0001, + "loss": 1.6532, + "step": 28600 + }, + { + "epoch": 0.39409070276069824, + "grad_norm": 0.0904608890414238, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 28650 + }, + { + "epoch": 0.3947784701302631, + "grad_norm": 0.11022058129310608, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 28700 + }, + { + "epoch": 0.39546623749982807, + "grad_norm": 0.0866251140832901, + "learning_rate": 0.0001, + "loss": 1.6525, + "step": 28750 + }, + { + "epoch": 0.39615400486939295, + "grad_norm": 0.09494468569755554, + "learning_rate": 0.0001, + "loss": 1.6529, + "step": 28800 + }, + { + "epoch": 0.3968417722389579, + "grad_norm": 0.09128791838884354, + "learning_rate": 0.0001, + "loss": 1.6518, + "step": 28850 + }, + { + "epoch": 0.39752953960852283, + "grad_norm": 0.09931056201457977, + "learning_rate": 0.0001, + "loss": 1.6494, + "step": 28900 + }, + { + "epoch": 0.3982173069780877, + "grad_norm": 0.0899323970079422, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 28950 + }, + { + "epoch": 0.39890507434765266, + "grad_norm": 0.09281651675701141, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 29000 + }, + { + "epoch": 0.3995928417172176, + "grad_norm": 0.09721958637237549, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 29050 + }, + { + "epoch": 0.4002806090867825, + "grad_norm": 0.09711379557847977, + "learning_rate": 0.0001, + "loss": 1.6473, + "step": 29100 + }, + { + "epoch": 0.4009683764563474, + "grad_norm": 0.09055937081575394, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 29150 + }, + { + "epoch": 0.4016561438259123, + "grad_norm": 0.09126099944114685, + "learning_rate": 0.0001, + "loss": 1.652, + "step": 29200 + }, + { + "epoch": 0.40234391119547724, + "grad_norm": 0.09342388063669205, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 29250 + }, + { + "epoch": 0.4030316785650422, + "grad_norm": 0.09094425290822983, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 29300 + }, + { + "epoch": 0.40371944593460707, + "grad_norm": 0.09739495813846588, + "learning_rate": 0.0001, + "loss": 1.6537, + "step": 29350 + }, + { + "epoch": 0.404407213304172, + "grad_norm": 0.08720117807388306, + "learning_rate": 0.0001, + "loss": 1.6501, + "step": 29400 + }, + { + "epoch": 0.4050949806737369, + "grad_norm": 0.10012072324752808, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 29450 + }, + { + "epoch": 0.40578274804330183, + "grad_norm": 0.09751909226179123, + "learning_rate": 0.0001, + "loss": 1.6503, + "step": 29500 + }, + { + "epoch": 0.4064705154128668, + "grad_norm": 0.0909474641084671, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 29550 + }, + { + "epoch": 0.40715828278243166, + "grad_norm": 0.09806191921234131, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 29600 + }, + { + "epoch": 0.4078460501519966, + "grad_norm": 0.08882980048656464, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 29650 + }, + { + "epoch": 0.4085338175215615, + "grad_norm": 0.08612953126430511, + "learning_rate": 0.0001, + "loss": 1.6517, + "step": 29700 + }, + { + "epoch": 0.4092215848911264, + "grad_norm": 0.09713184833526611, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 29750 + }, + { + "epoch": 0.40990935226069136, + "grad_norm": 0.08773653209209442, + "learning_rate": 0.0001, + "loss": 1.6513, + "step": 29800 + }, + { + "epoch": 0.41059711963025625, + "grad_norm": 0.09357753396034241, + "learning_rate": 0.0001, + "loss": 1.6512, + "step": 29850 + }, + { + "epoch": 0.4112848869998212, + "grad_norm": 0.10132279247045517, + "learning_rate": 0.0001, + "loss": 1.6499, + "step": 29900 + }, + { + "epoch": 0.4119726543693861, + "grad_norm": 0.09292474389076233, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 29950 + }, + { + "epoch": 0.412660421738951, + "grad_norm": 0.09160569310188293, + "learning_rate": 0.0001, + "loss": 1.6472, + "step": 30000 + }, + { + "epoch": 0.41334818910851595, + "grad_norm": 0.09838960319757462, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 30050 + }, + { + "epoch": 0.41403595647808084, + "grad_norm": 0.0884481742978096, + "learning_rate": 0.0001, + "loss": 1.6498, + "step": 30100 + }, + { + "epoch": 0.4147237238476458, + "grad_norm": 0.09937591850757599, + "learning_rate": 0.0001, + "loss": 1.6516, + "step": 30150 + }, + { + "epoch": 0.4154114912172107, + "grad_norm": 0.08566869050264359, + "learning_rate": 0.0001, + "loss": 1.6495, + "step": 30200 + }, + { + "epoch": 0.4160992585867756, + "grad_norm": 0.09897395223379135, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 30250 + }, + { + "epoch": 0.41678702595634054, + "grad_norm": 0.10065190494060516, + "learning_rate": 0.0001, + "loss": 1.6478, + "step": 30300 + }, + { + "epoch": 0.4174747933259054, + "grad_norm": 0.09039245545864105, + "learning_rate": 0.0001, + "loss": 1.6511, + "step": 30350 + }, + { + "epoch": 0.41816256069547036, + "grad_norm": 0.08918923139572144, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 30400 + }, + { + "epoch": 0.4188503280650353, + "grad_norm": 0.08621367067098618, + "learning_rate": 0.0001, + "loss": 1.6505, + "step": 30450 + }, + { + "epoch": 0.4195380954346002, + "grad_norm": 0.107161745429039, + "learning_rate": 0.0001, + "loss": 1.6486, + "step": 30500 + }, + { + "epoch": 0.42022586280416513, + "grad_norm": 0.1019233912229538, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 30550 + }, + { + "epoch": 0.42091363017373, + "grad_norm": 0.10087515413761139, + "learning_rate": 0.0001, + "loss": 1.65, + "step": 30600 + }, + { + "epoch": 0.42160139754329495, + "grad_norm": 0.08938942849636078, + "learning_rate": 0.0001, + "loss": 1.6531, + "step": 30650 + }, + { + "epoch": 0.4222891649128599, + "grad_norm": 0.10269896686077118, + "learning_rate": 0.0001, + "loss": 1.6471, + "step": 30700 + }, + { + "epoch": 0.4229769322824248, + "grad_norm": 0.09027698636054993, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 30750 + }, + { + "epoch": 0.4236646996519897, + "grad_norm": 0.09330408275127411, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 30800 + }, + { + "epoch": 0.4243524670215546, + "grad_norm": 0.08705689758062363, + "learning_rate": 0.0001, + "loss": 1.648, + "step": 30850 + }, + { + "epoch": 0.42504023439111954, + "grad_norm": 0.09150363504886627, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 30900 + }, + { + "epoch": 0.4257280017606845, + "grad_norm": 0.09293975681066513, + "learning_rate": 0.0001, + "loss": 1.6506, + "step": 30950 + }, + { + "epoch": 0.42641576913024937, + "grad_norm": 0.10130377858877182, + "learning_rate": 0.0001, + "loss": 1.6489, + "step": 31000 + }, + { + "epoch": 0.4271035364998143, + "grad_norm": 0.08871527761220932, + "learning_rate": 0.0001, + "loss": 1.6485, + "step": 31050 + }, + { + "epoch": 0.42779130386937925, + "grad_norm": 0.0994982048869133, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 31100 + }, + { + "epoch": 0.42847907123894413, + "grad_norm": 0.09573712199926376, + "learning_rate": 0.0001, + "loss": 1.6508, + "step": 31150 + }, + { + "epoch": 0.42916683860850907, + "grad_norm": 0.10358277708292007, + "learning_rate": 0.0001, + "loss": 1.6502, + "step": 31200 + }, + { + "epoch": 0.42985460597807396, + "grad_norm": 0.0881354883313179, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 31250 + }, + { + "epoch": 0.4305423733476389, + "grad_norm": 0.08965957164764404, + "learning_rate": 0.0001, + "loss": 1.6464, + "step": 31300 + }, + { + "epoch": 0.43123014071720384, + "grad_norm": 0.09935072064399719, + "learning_rate": 0.0001, + "loss": 1.6504, + "step": 31350 + }, + { + "epoch": 0.4319179080867687, + "grad_norm": 0.09450331330299377, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 31400 + }, + { + "epoch": 0.43260567545633366, + "grad_norm": 0.09426354616880417, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 31450 + }, + { + "epoch": 0.43329344282589854, + "grad_norm": 0.09853871166706085, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 31500 + }, + { + "epoch": 0.4339812101954635, + "grad_norm": 0.09059667587280273, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 31550 + }, + { + "epoch": 0.4346689775650284, + "grad_norm": 0.10137219727039337, + "learning_rate": 0.0001, + "loss": 1.6497, + "step": 31600 + }, + { + "epoch": 0.4353567449345933, + "grad_norm": 0.09198787808418274, + "learning_rate": 0.0001, + "loss": 1.6493, + "step": 31650 + }, + { + "epoch": 0.43604451230415825, + "grad_norm": 0.1016339510679245, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 31700 + }, + { + "epoch": 0.43673227967372313, + "grad_norm": 0.09335731714963913, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 31750 + }, + { + "epoch": 0.4374200470432881, + "grad_norm": 0.09104933589696884, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 31800 + }, + { + "epoch": 0.438107814412853, + "grad_norm": 0.08286421746015549, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 31850 + }, + { + "epoch": 0.4387955817824179, + "grad_norm": 0.0951671227812767, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 31900 + }, + { + "epoch": 0.43948334915198284, + "grad_norm": 0.08751095086336136, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 31950 + }, + { + "epoch": 0.4401711165215478, + "grad_norm": 0.09626565128564835, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 32000 + }, + { + "epoch": 0.44085888389111266, + "grad_norm": 0.09458938241004944, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 32050 + }, + { + "epoch": 0.4415466512606776, + "grad_norm": 0.1030418872833252, + "learning_rate": 0.0001, + "loss": 1.6477, + "step": 32100 + }, + { + "epoch": 0.4422344186302425, + "grad_norm": 0.09569265693426132, + "learning_rate": 0.0001, + "loss": 1.6476, + "step": 32150 + }, + { + "epoch": 0.4429221859998074, + "grad_norm": 0.10030734539031982, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 32200 + }, + { + "epoch": 0.44360995336937237, + "grad_norm": 0.11456091701984406, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 32250 + }, + { + "epoch": 0.44429772073893725, + "grad_norm": 0.08938100934028625, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 32300 + }, + { + "epoch": 0.4449854881085022, + "grad_norm": 0.10480993241071701, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 32350 + }, + { + "epoch": 0.4456732554780671, + "grad_norm": 0.10453122854232788, + "learning_rate": 0.0001, + "loss": 1.6483, + "step": 32400 + }, + { + "epoch": 0.446361022847632, + "grad_norm": 0.10204358398914337, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 32450 + }, + { + "epoch": 0.44704879021719696, + "grad_norm": 0.0900953933596611, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 32500 + }, + { + "epoch": 0.44773655758676184, + "grad_norm": 0.10755446553230286, + "learning_rate": 0.0001, + "loss": 1.6431, + "step": 32550 + }, + { + "epoch": 0.4484243249563268, + "grad_norm": 0.08640281856060028, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 32600 + }, + { + "epoch": 0.44911209232589167, + "grad_norm": 0.09299297630786896, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 32650 + }, + { + "epoch": 0.4497998596954566, + "grad_norm": 0.09522339701652527, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 32700 + }, + { + "epoch": 0.45048762706502155, + "grad_norm": 0.10342853516340256, + "learning_rate": 0.0001, + "loss": 1.6492, + "step": 32750 + }, + { + "epoch": 0.45117539443458643, + "grad_norm": 0.08631814271211624, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 32800 + }, + { + "epoch": 0.45186316180415137, + "grad_norm": 0.09134256094694138, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 32850 + }, + { + "epoch": 0.4525509291737163, + "grad_norm": 0.09882289171218872, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 32900 + }, + { + "epoch": 0.4532386965432812, + "grad_norm": 0.10555307567119598, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 32950 + }, + { + "epoch": 0.45392646391284613, + "grad_norm": 0.0928015485405922, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 33000 + }, + { + "epoch": 0.454614231282411, + "grad_norm": 0.09406717866659164, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 33050 + }, + { + "epoch": 0.45530199865197596, + "grad_norm": 0.09144891798496246, + "learning_rate": 0.0001, + "loss": 1.6487, + "step": 33100 + }, + { + "epoch": 0.4559897660215409, + "grad_norm": 0.09776079654693604, + "learning_rate": 0.0001, + "loss": 1.6453, + "step": 33150 + }, + { + "epoch": 0.4566775333911058, + "grad_norm": 0.09001190215349197, + "learning_rate": 0.0001, + "loss": 1.6447, + "step": 33200 + }, + { + "epoch": 0.4573653007606707, + "grad_norm": 0.09209096431732178, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 33250 + }, + { + "epoch": 0.4580530681302356, + "grad_norm": 0.09005513042211533, + "learning_rate": 0.0001, + "loss": 1.641, + "step": 33300 + }, + { + "epoch": 0.45874083549980055, + "grad_norm": 0.09380592405796051, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 33350 + }, + { + "epoch": 0.4594286028693655, + "grad_norm": 0.09379936754703522, + "learning_rate": 0.0001, + "loss": 1.6454, + "step": 33400 + }, + { + "epoch": 0.46011637023893037, + "grad_norm": 0.0955335795879364, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 33450 + }, + { + "epoch": 0.4608041376084953, + "grad_norm": 0.10363741219043732, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 33500 + }, + { + "epoch": 0.4614919049780602, + "grad_norm": 0.08596354722976685, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 33550 + }, + { + "epoch": 0.46217967234762514, + "grad_norm": 0.08921484649181366, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 33600 + }, + { + "epoch": 0.4628674397171901, + "grad_norm": 0.09811948239803314, + "learning_rate": 0.0001, + "loss": 1.6462, + "step": 33650 + }, + { + "epoch": 0.46355520708675496, + "grad_norm": 0.0884086936712265, + "learning_rate": 0.0001, + "loss": 1.6458, + "step": 33700 + }, + { + "epoch": 0.4642429744563199, + "grad_norm": 0.08605129271745682, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 33750 + }, + { + "epoch": 0.4649307418258848, + "grad_norm": 0.10650883615016937, + "learning_rate": 0.0001, + "loss": 1.6468, + "step": 33800 + }, + { + "epoch": 0.4656185091954497, + "grad_norm": 0.09201018512248993, + "learning_rate": 0.0001, + "loss": 1.6463, + "step": 33850 + }, + { + "epoch": 0.46630627656501467, + "grad_norm": 0.09525769203901291, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 33900 + }, + { + "epoch": 0.46699404393457955, + "grad_norm": 0.10839025676250458, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 33950 + }, + { + "epoch": 0.4676818113041445, + "grad_norm": 0.09996642172336578, + "learning_rate": 0.0001, + "loss": 1.6469, + "step": 34000 + }, + { + "epoch": 0.46836957867370943, + "grad_norm": 0.09651756286621094, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 34050 + }, + { + "epoch": 0.4690573460432743, + "grad_norm": 0.10378842055797577, + "learning_rate": 0.0001, + "loss": 1.646, + "step": 34100 + }, + { + "epoch": 0.46974511341283925, + "grad_norm": 0.10965878516435623, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 34150 + }, + { + "epoch": 0.47043288078240414, + "grad_norm": 0.08958712965250015, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 34200 + }, + { + "epoch": 0.4711206481519691, + "grad_norm": 0.09303830564022064, + "learning_rate": 0.0001, + "loss": 1.6457, + "step": 34250 + }, + { + "epoch": 0.471808415521534, + "grad_norm": 0.08511397242546082, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 34300 + }, + { + "epoch": 0.4724961828910989, + "grad_norm": 0.09241296350955963, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 34350 + }, + { + "epoch": 0.47318395026066384, + "grad_norm": 0.10253586620092392, + "learning_rate": 0.0001, + "loss": 1.6424, + "step": 34400 + }, + { + "epoch": 0.47387171763022873, + "grad_norm": 0.09580568969249725, + "learning_rate": 0.0001, + "loss": 1.647, + "step": 34450 + }, + { + "epoch": 0.47455948499979367, + "grad_norm": 0.10204749554395676, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 34500 + }, + { + "epoch": 0.4752472523693586, + "grad_norm": 0.08885429054498672, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 34550 + }, + { + "epoch": 0.4759350197389235, + "grad_norm": 0.09475323557853699, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 34600 + }, + { + "epoch": 0.47662278710848843, + "grad_norm": 0.09055089205503464, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 34650 + }, + { + "epoch": 0.4773105544780533, + "grad_norm": 0.09198318421840668, + "learning_rate": 0.0001, + "loss": 1.6425, + "step": 34700 + }, + { + "epoch": 0.47799832184761826, + "grad_norm": 0.10331781953573227, + "learning_rate": 0.0001, + "loss": 1.649, + "step": 34750 + }, + { + "epoch": 0.4786860892171832, + "grad_norm": 0.0971902385354042, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 34800 + }, + { + "epoch": 0.4793738565867481, + "grad_norm": 0.13026170432567596, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 34850 + }, + { + "epoch": 0.480061623956313, + "grad_norm": 0.0918232873082161, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 34900 + }, + { + "epoch": 0.48074939132587796, + "grad_norm": 0.0905313566327095, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 34950 + }, + { + "epoch": 0.48143715869544285, + "grad_norm": 0.08482769876718521, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 35000 + }, + { + "epoch": 0.4821249260650078, + "grad_norm": 0.09367966651916504, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 35050 + }, + { + "epoch": 0.48281269343457267, + "grad_norm": 0.08897031843662262, + "learning_rate": 0.0001, + "loss": 1.6465, + "step": 35100 + }, + { + "epoch": 0.4835004608041376, + "grad_norm": 0.09347031265497208, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 35150 + }, + { + "epoch": 0.48418822817370255, + "grad_norm": 0.09764907509088516, + "learning_rate": 0.0001, + "loss": 1.6449, + "step": 35200 + }, + { + "epoch": 0.48487599554326744, + "grad_norm": 0.09478213638067245, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 35250 + }, + { + "epoch": 0.4855637629128324, + "grad_norm": 0.09859916567802429, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 35300 + }, + { + "epoch": 0.48625153028239726, + "grad_norm": 0.10785259306430817, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 35350 + }, + { + "epoch": 0.4869392976519622, + "grad_norm": 0.09389789402484894, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 35400 + }, + { + "epoch": 0.48762706502152714, + "grad_norm": 0.0896189734339714, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 35450 + }, + { + "epoch": 0.488314832391092, + "grad_norm": 0.0927240401506424, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 35500 + }, + { + "epoch": 0.48900259976065696, + "grad_norm": 0.09178788214921951, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 35550 + }, + { + "epoch": 0.48969036713022185, + "grad_norm": 0.09502946585416794, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 35600 + }, + { + "epoch": 0.4903781344997868, + "grad_norm": 0.11376187205314636, + "learning_rate": 0.0001, + "loss": 1.6445, + "step": 35650 + }, + { + "epoch": 0.49106590186935173, + "grad_norm": 0.09764853864908218, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 35700 + }, + { + "epoch": 0.4917536692389166, + "grad_norm": 0.09321732074022293, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 35750 + }, + { + "epoch": 0.49244143660848155, + "grad_norm": 0.09407221525907516, + "learning_rate": 0.0001, + "loss": 1.6441, + "step": 35800 + }, + { + "epoch": 0.4931292039780465, + "grad_norm": 0.09491898119449615, + "learning_rate": 0.0001, + "loss": 1.6455, + "step": 35850 + }, + { + "epoch": 0.4938169713476114, + "grad_norm": 0.0940353125333786, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 35900 + }, + { + "epoch": 0.4945047387171763, + "grad_norm": 0.10276436060667038, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 35950 + }, + { + "epoch": 0.4951925060867412, + "grad_norm": 0.10237590223550797, + "learning_rate": 0.0001, + "loss": 1.6423, + "step": 36000 + }, + { + "epoch": 0.49588027345630614, + "grad_norm": 0.08295126259326935, + "learning_rate": 0.0001, + "loss": 1.644, + "step": 36050 + }, + { + "epoch": 0.4965680408258711, + "grad_norm": 0.09782548248767853, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 36100 + }, + { + "epoch": 0.49725580819543597, + "grad_norm": 0.1010642945766449, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 36150 + }, + { + "epoch": 0.4979435755650009, + "grad_norm": 0.10704517364501953, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 36200 + }, + { + "epoch": 0.4986313429345658, + "grad_norm": 0.09292057156562805, + "learning_rate": 0.0001, + "loss": 1.6435, + "step": 36250 + }, + { + "epoch": 0.49931911030413073, + "grad_norm": 0.09314380586147308, + "learning_rate": 0.0001, + "loss": 1.6443, + "step": 36300 + }, + { + "epoch": 0.5000068776736957, + "grad_norm": 0.10215429961681366, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 36350 + }, + { + "epoch": 0.5006946450432606, + "grad_norm": 0.10266946256160736, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 36400 + }, + { + "epoch": 0.5013824124128254, + "grad_norm": 0.10435584932565689, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 36450 + }, + { + "epoch": 0.5020701797823904, + "grad_norm": 0.09806975722312927, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 36500 + }, + { + "epoch": 0.5027579471519553, + "grad_norm": 0.09484187513589859, + "learning_rate": 0.0001, + "loss": 1.6428, + "step": 36550 + }, + { + "epoch": 0.5034457145215202, + "grad_norm": 0.1000257357954979, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 36600 + }, + { + "epoch": 0.5041334818910852, + "grad_norm": 0.11959541589021683, + "learning_rate": 0.0001, + "loss": 1.6437, + "step": 36650 + }, + { + "epoch": 0.5048212492606501, + "grad_norm": 0.08579277247190475, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 36700 + }, + { + "epoch": 0.505509016630215, + "grad_norm": 0.09670394659042358, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 36750 + }, + { + "epoch": 0.50619678399978, + "grad_norm": 0.10155735909938812, + "learning_rate": 0.0001, + "loss": 1.6466, + "step": 36800 + }, + { + "epoch": 0.5068845513693448, + "grad_norm": 0.09957952052354813, + "learning_rate": 0.0001, + "loss": 1.6432, + "step": 36850 + }, + { + "epoch": 0.5075723187389097, + "grad_norm": 0.09222400933504105, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 36900 + }, + { + "epoch": 0.5082600861084746, + "grad_norm": 0.09004013985395432, + "learning_rate": 0.0001, + "loss": 1.6439, + "step": 36950 + }, + { + "epoch": 0.5089478534780396, + "grad_norm": 0.1122378334403038, + "learning_rate": 0.0001, + "loss": 1.643, + "step": 37000 + }, + { + "epoch": 0.5096356208476045, + "grad_norm": 0.10449572652578354, + "learning_rate": 0.0001, + "loss": 1.6461, + "step": 37050 + }, + { + "epoch": 0.5103233882171694, + "grad_norm": 0.09255963563919067, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 37100 + }, + { + "epoch": 0.5110111555867344, + "grad_norm": 0.08528398722410202, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 37150 + }, + { + "epoch": 0.5116989229562993, + "grad_norm": 0.08895813673734665, + "learning_rate": 0.0001, + "loss": 1.6427, + "step": 37200 + }, + { + "epoch": 0.5123866903258641, + "grad_norm": 0.08886689692735672, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 37250 + }, + { + "epoch": 0.5130744576954291, + "grad_norm": 0.10976780205965042, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 37300 + }, + { + "epoch": 0.513762225064994, + "grad_norm": 0.09144289046525955, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 37350 + }, + { + "epoch": 0.5144499924345589, + "grad_norm": 0.0914168730378151, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 37400 + }, + { + "epoch": 0.5151377598041239, + "grad_norm": 0.09350179880857468, + "learning_rate": 0.0001, + "loss": 1.6442, + "step": 37450 + }, + { + "epoch": 0.5158255271736888, + "grad_norm": 0.09165963530540466, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 37500 + }, + { + "epoch": 0.5165132945432537, + "grad_norm": 0.09791237860918045, + "learning_rate": 0.0001, + "loss": 1.6452, + "step": 37550 + }, + { + "epoch": 0.5172010619128186, + "grad_norm": 0.1264970451593399, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 37600 + }, + { + "epoch": 0.5178888292823836, + "grad_norm": 0.09959422051906586, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 37650 + }, + { + "epoch": 0.5185765966519484, + "grad_norm": 0.09144961088895798, + "learning_rate": 0.0001, + "loss": 1.639, + "step": 37700 + }, + { + "epoch": 0.5192643640215133, + "grad_norm": 0.09086447954177856, + "learning_rate": 0.0001, + "loss": 1.6479, + "step": 37750 + }, + { + "epoch": 0.5199521313910783, + "grad_norm": 0.10586626827716827, + "learning_rate": 0.0001, + "loss": 1.6416, + "step": 37800 + }, + { + "epoch": 0.5206398987606432, + "grad_norm": 0.10102024674415588, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 37850 + }, + { + "epoch": 0.5213276661302081, + "grad_norm": 0.09939133375883102, + "learning_rate": 0.0001, + "loss": 1.6402, + "step": 37900 + }, + { + "epoch": 0.5220154334997731, + "grad_norm": 0.0867060124874115, + "learning_rate": 0.0001, + "loss": 1.6412, + "step": 37950 + }, + { + "epoch": 0.522703200869338, + "grad_norm": 0.09193069487810135, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 38000 + }, + { + "epoch": 0.5233909682389029, + "grad_norm": 0.08879737555980682, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 38050 + }, + { + "epoch": 0.5240787356084677, + "grad_norm": 0.09856677055358887, + "learning_rate": 0.0001, + "loss": 1.6421, + "step": 38100 + }, + { + "epoch": 0.5247665029780327, + "grad_norm": 0.10344164073467255, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 38150 + }, + { + "epoch": 0.5254542703475976, + "grad_norm": 0.09736845642328262, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 38200 + }, + { + "epoch": 0.5261420377171625, + "grad_norm": 0.09295806288719177, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 38250 + }, + { + "epoch": 0.5268298050867275, + "grad_norm": 0.10306044667959213, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 38300 + }, + { + "epoch": 0.5275175724562924, + "grad_norm": 0.10452929884195328, + "learning_rate": 0.0001, + "loss": 1.642, + "step": 38350 + }, + { + "epoch": 0.5282053398258573, + "grad_norm": 0.1042618453502655, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 38400 + }, + { + "epoch": 0.5288931071954223, + "grad_norm": 0.09721370786428452, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 38450 + }, + { + "epoch": 0.5295808745649871, + "grad_norm": 0.09331890940666199, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 38500 + }, + { + "epoch": 0.530268641934552, + "grad_norm": 0.08918920904397964, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 38550 + }, + { + "epoch": 0.530956409304117, + "grad_norm": 0.10316510498523712, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 38600 + }, + { + "epoch": 0.5316441766736819, + "grad_norm": 0.0914444625377655, + "learning_rate": 0.0001, + "loss": 1.6436, + "step": 38650 + }, + { + "epoch": 0.5323319440432468, + "grad_norm": 0.09143774956464767, + "learning_rate": 0.0001, + "loss": 1.6434, + "step": 38700 + }, + { + "epoch": 0.5330197114128117, + "grad_norm": 0.10366670787334442, + "learning_rate": 0.0001, + "loss": 1.6444, + "step": 38750 + }, + { + "epoch": 0.5337074787823767, + "grad_norm": 0.09326903522014618, + "learning_rate": 0.0001, + "loss": 1.645, + "step": 38800 + }, + { + "epoch": 0.5343952461519416, + "grad_norm": 0.09438502043485641, + "learning_rate": 0.0001, + "loss": 1.6418, + "step": 38850 + }, + { + "epoch": 0.5350830135215064, + "grad_norm": 0.09484802931547165, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 38900 + }, + { + "epoch": 0.5357707808910714, + "grad_norm": 0.09375619888305664, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 38950 + }, + { + "epoch": 0.5364585482606363, + "grad_norm": 0.08983484655618668, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 39000 + }, + { + "epoch": 0.5371463156302012, + "grad_norm": 0.08946409076452255, + "learning_rate": 0.0001, + "loss": 1.6414, + "step": 39050 + }, + { + "epoch": 0.5378340829997662, + "grad_norm": 0.09255479276180267, + "learning_rate": 0.0001, + "loss": 1.6446, + "step": 39100 + }, + { + "epoch": 0.5385218503693311, + "grad_norm": 0.11457061767578125, + "learning_rate": 0.0001, + "loss": 1.6438, + "step": 39150 + }, + { + "epoch": 0.539209617738896, + "grad_norm": 0.09728623181581497, + "learning_rate": 0.0001, + "loss": 1.6433, + "step": 39200 + }, + { + "epoch": 0.5398973851084609, + "grad_norm": 0.09877431392669678, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 39250 + }, + { + "epoch": 0.5405851524780259, + "grad_norm": 0.09415557980537415, + "learning_rate": 0.0001, + "loss": 1.6403, + "step": 39300 + }, + { + "epoch": 0.5412729198475907, + "grad_norm": 0.1116420328617096, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 39350 + }, + { + "epoch": 0.5419606872171556, + "grad_norm": 0.0912848562002182, + "learning_rate": 0.0001, + "loss": 1.6419, + "step": 39400 + }, + { + "epoch": 0.5426484545867206, + "grad_norm": 0.0994088277220726, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 39450 + }, + { + "epoch": 0.5433362219562855, + "grad_norm": 0.12156657129526138, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 39500 + }, + { + "epoch": 0.5440239893258504, + "grad_norm": 0.09030243009328842, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 39550 + }, + { + "epoch": 0.5447117566954154, + "grad_norm": 0.09504235535860062, + "learning_rate": 0.0001, + "loss": 1.6417, + "step": 39600 + }, + { + "epoch": 0.5453995240649803, + "grad_norm": 0.09386838972568512, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 39650 + }, + { + "epoch": 0.5460872914345452, + "grad_norm": 0.09527257829904556, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 39700 + }, + { + "epoch": 0.5467750588041101, + "grad_norm": 0.09409776329994202, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 39750 + }, + { + "epoch": 0.547462826173675, + "grad_norm": 0.08981654047966003, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 39800 + }, + { + "epoch": 0.5481505935432399, + "grad_norm": 0.09664871543645859, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 39850 + }, + { + "epoch": 0.5488383609128048, + "grad_norm": 0.09075263887643814, + "learning_rate": 0.0001, + "loss": 1.6407, + "step": 39900 + }, + { + "epoch": 0.5495261282823698, + "grad_norm": 0.1084190309047699, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 39950 + }, + { + "epoch": 0.5502138956519347, + "grad_norm": 0.09443186968564987, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 40000 + }, + { + "epoch": 0.5509016630214996, + "grad_norm": 0.1035468801856041, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 40050 + }, + { + "epoch": 0.5515894303910646, + "grad_norm": 0.0966610461473465, + "learning_rate": 0.0001, + "loss": 1.6374, + "step": 40100 + }, + { + "epoch": 0.5522771977606294, + "grad_norm": 0.10241789370775223, + "learning_rate": 0.0001, + "loss": 1.6387, + "step": 40150 + }, + { + "epoch": 0.5529649651301943, + "grad_norm": 0.10644425451755524, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 40200 + }, + { + "epoch": 0.5536527324997593, + "grad_norm": 0.08675768971443176, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 40250 + }, + { + "epoch": 0.5543404998693242, + "grad_norm": 0.12207689881324768, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 40300 + }, + { + "epoch": 0.5550282672388891, + "grad_norm": 0.0907023698091507, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 40350 + }, + { + "epoch": 0.5557160346084541, + "grad_norm": 0.09063316136598587, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 40400 + }, + { + "epoch": 0.556403801978019, + "grad_norm": 0.10163768380880356, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 40450 + }, + { + "epoch": 0.5570915693475839, + "grad_norm": 0.11267077177762985, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 40500 + }, + { + "epoch": 0.5577793367171487, + "grad_norm": 0.11460031569004059, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 40550 + }, + { + "epoch": 0.5584671040867137, + "grad_norm": 0.09331660717725754, + "learning_rate": 0.0001, + "loss": 1.6411, + "step": 40600 + }, + { + "epoch": 0.5591548714562786, + "grad_norm": 0.09418249875307083, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 40650 + }, + { + "epoch": 0.5598426388258435, + "grad_norm": 0.10210473090410233, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 40700 + }, + { + "epoch": 0.5605304061954085, + "grad_norm": 0.09525810927152634, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 40750 + }, + { + "epoch": 0.5612181735649734, + "grad_norm": 0.0911373421549797, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 40800 + }, + { + "epoch": 0.5619059409345383, + "grad_norm": 0.10007715970277786, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 40850 + }, + { + "epoch": 0.5625937083041033, + "grad_norm": 0.0913693979382515, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 40900 + }, + { + "epoch": 0.5632814756736682, + "grad_norm": 0.10602299869060516, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 40950 + }, + { + "epoch": 0.563969243043233, + "grad_norm": 0.10825448483228683, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 41000 + }, + { + "epoch": 0.5646570104127979, + "grad_norm": 0.09337472170591354, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 41050 + }, + { + "epoch": 0.5653447777823629, + "grad_norm": 0.0868719145655632, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 41100 + }, + { + "epoch": 0.5660325451519278, + "grad_norm": 0.09493357688188553, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 41150 + }, + { + "epoch": 0.5667203125214927, + "grad_norm": 0.09529832005500793, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 41200 + }, + { + "epoch": 0.5674080798910577, + "grad_norm": 0.0934169664978981, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 41250 + }, + { + "epoch": 0.5680958472606226, + "grad_norm": 0.09936791658401489, + "learning_rate": 0.0001, + "loss": 1.6365, + "step": 41300 + }, + { + "epoch": 0.5687836146301875, + "grad_norm": 0.09473796933889389, + "learning_rate": 0.0001, + "loss": 1.6394, + "step": 41350 + }, + { + "epoch": 0.5694713819997524, + "grad_norm": 0.09517840296030045, + "learning_rate": 0.0001, + "loss": 1.6379, + "step": 41400 + }, + { + "epoch": 0.5701591493693173, + "grad_norm": 0.09487368911504745, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 41450 + }, + { + "epoch": 0.5708469167388822, + "grad_norm": 0.10064958035945892, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 41500 + }, + { + "epoch": 0.5715346841084472, + "grad_norm": 0.10654688626527786, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 41550 + }, + { + "epoch": 0.5722224514780121, + "grad_norm": 0.10369981825351715, + "learning_rate": 0.0001, + "loss": 1.6408, + "step": 41600 + }, + { + "epoch": 0.572910218847577, + "grad_norm": 0.09047607332468033, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 41650 + }, + { + "epoch": 0.5735979862171419, + "grad_norm": 0.0958494246006012, + "learning_rate": 0.0001, + "loss": 1.6422, + "step": 41700 + }, + { + "epoch": 0.5742857535867069, + "grad_norm": 0.09315364807844162, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 41750 + }, + { + "epoch": 0.5749735209562717, + "grad_norm": 0.10469076037406921, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 41800 + }, + { + "epoch": 0.5756612883258366, + "grad_norm": 0.1030019223690033, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 41850 + }, + { + "epoch": 0.5763490556954016, + "grad_norm": 0.09740956872701645, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 41900 + }, + { + "epoch": 0.5770368230649665, + "grad_norm": 0.09355924278497696, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 41950 + }, + { + "epoch": 0.5777245904345314, + "grad_norm": 0.09326741844415665, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 42000 + }, + { + "epoch": 0.5784123578040964, + "grad_norm": 0.11062312871217728, + "learning_rate": 0.0001, + "loss": 1.6406, + "step": 42050 + }, + { + "epoch": 0.5791001251736613, + "grad_norm": 0.09104353934526443, + "learning_rate": 0.0001, + "loss": 1.64, + "step": 42100 + }, + { + "epoch": 0.5797878925432262, + "grad_norm": 0.1009470596909523, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 42150 + }, + { + "epoch": 0.580475659912791, + "grad_norm": 0.09980602562427521, + "learning_rate": 0.0001, + "loss": 1.6397, + "step": 42200 + }, + { + "epoch": 0.581163427282356, + "grad_norm": 0.08888303488492966, + "learning_rate": 0.0001, + "loss": 1.6393, + "step": 42250 + }, + { + "epoch": 0.5818511946519209, + "grad_norm": 0.09892967343330383, + "learning_rate": 0.0001, + "loss": 1.6405, + "step": 42300 + }, + { + "epoch": 0.5825389620214858, + "grad_norm": 0.09480975568294525, + "learning_rate": 0.0001, + "loss": 1.6401, + "step": 42350 + }, + { + "epoch": 0.5832267293910508, + "grad_norm": 0.08748164027929306, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 42400 + }, + { + "epoch": 0.5839144967606157, + "grad_norm": 0.0984395295381546, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 42450 + }, + { + "epoch": 0.5846022641301806, + "grad_norm": 0.09309285134077072, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 42500 + }, + { + "epoch": 0.5852900314997456, + "grad_norm": 0.09663578122854233, + "learning_rate": 0.0001, + "loss": 1.6366, + "step": 42550 + }, + { + "epoch": 0.5859777988693105, + "grad_norm": 0.1040002703666687, + "learning_rate": 0.0001, + "loss": 1.6373, + "step": 42600 + }, + { + "epoch": 0.5866655662388753, + "grad_norm": 0.0976615622639656, + "learning_rate": 0.0001, + "loss": 1.6372, + "step": 42650 + }, + { + "epoch": 0.5873533336084403, + "grad_norm": 0.0910048708319664, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 42700 + }, + { + "epoch": 0.5880411009780052, + "grad_norm": 0.10820795595645905, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 42750 + }, + { + "epoch": 0.5887288683475701, + "grad_norm": 0.08897601813077927, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 42800 + }, + { + "epoch": 0.589416635717135, + "grad_norm": 0.10053162276744843, + "learning_rate": 0.0001, + "loss": 1.6389, + "step": 42850 + }, + { + "epoch": 0.5901044030867, + "grad_norm": 0.10383192449808121, + "learning_rate": 0.0001, + "loss": 1.6396, + "step": 42900 + }, + { + "epoch": 0.5907921704562649, + "grad_norm": 0.11211832612752914, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 42950 + }, + { + "epoch": 0.5914799378258297, + "grad_norm": 0.10318583995103836, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 43000 + }, + { + "epoch": 0.5921677051953947, + "grad_norm": 0.09368006885051727, + "learning_rate": 0.0001, + "loss": 1.6385, + "step": 43050 + }, + { + "epoch": 0.5928554725649596, + "grad_norm": 0.10585025697946548, + "learning_rate": 0.0001, + "loss": 1.6404, + "step": 43100 + }, + { + "epoch": 0.5935432399345245, + "grad_norm": 0.11520183831453323, + "learning_rate": 0.0001, + "loss": 1.6371, + "step": 43150 + }, + { + "epoch": 0.5942310073040895, + "grad_norm": 0.10421840101480484, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 43200 + }, + { + "epoch": 0.5949187746736544, + "grad_norm": 0.11153177917003632, + "learning_rate": 0.0001, + "loss": 1.6395, + "step": 43250 + }, + { + "epoch": 0.5956065420432193, + "grad_norm": 0.09862000495195389, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 43300 + }, + { + "epoch": 0.5962943094127843, + "grad_norm": 0.11102604120969772, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 43350 + }, + { + "epoch": 0.5969820767823492, + "grad_norm": 0.11711497604846954, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 43400 + }, + { + "epoch": 0.597669844151914, + "grad_norm": 0.0938689261674881, + "learning_rate": 0.0001, + "loss": 1.6392, + "step": 43450 + }, + { + "epoch": 0.5983576115214789, + "grad_norm": 0.09853795170783997, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 43500 + }, + { + "epoch": 0.5990453788910439, + "grad_norm": 0.09836630523204803, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 43550 + }, + { + "epoch": 0.5997331462606088, + "grad_norm": 0.10666079819202423, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 43600 + }, + { + "epoch": 0.6004209136301737, + "grad_norm": 0.10402662307024002, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 43650 + }, + { + "epoch": 0.6011086809997387, + "grad_norm": 0.09178145229816437, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 43700 + }, + { + "epoch": 0.6017964483693036, + "grad_norm": 0.11560804396867752, + "learning_rate": 0.0001, + "loss": 1.6413, + "step": 43750 + }, + { + "epoch": 0.6024842157388685, + "grad_norm": 0.10294962674379349, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 43800 + }, + { + "epoch": 0.6031719831084335, + "grad_norm": 0.10806939750909805, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 43850 + }, + { + "epoch": 0.6038597504779983, + "grad_norm": 0.09059430658817291, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 43900 + }, + { + "epoch": 0.6045475178475632, + "grad_norm": 0.09560652822256088, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 43950 + }, + { + "epoch": 0.6052352852171281, + "grad_norm": 0.1222764179110527, + "learning_rate": 0.0001, + "loss": 1.6378, + "step": 44000 + }, + { + "epoch": 0.6059230525866931, + "grad_norm": 0.09961801767349243, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 44050 + }, + { + "epoch": 0.606610819956258, + "grad_norm": 0.09827202558517456, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 44100 + }, + { + "epoch": 0.6072985873258229, + "grad_norm": 0.09373018145561218, + "learning_rate": 0.0001, + "loss": 1.6367, + "step": 44150 + }, + { + "epoch": 0.6079863546953879, + "grad_norm": 0.1013326495885849, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 44200 + }, + { + "epoch": 0.6086741220649527, + "grad_norm": 0.09980370849370956, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 44250 + }, + { + "epoch": 0.6093618894345176, + "grad_norm": 0.0929165855050087, + "learning_rate": 0.0001, + "loss": 1.6415, + "step": 44300 + }, + { + "epoch": 0.6100496568040826, + "grad_norm": 0.11303722113370895, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 44350 + }, + { + "epoch": 0.6107374241736475, + "grad_norm": 0.09982936829328537, + "learning_rate": 0.0001, + "loss": 1.6383, + "step": 44400 + }, + { + "epoch": 0.6114251915432124, + "grad_norm": 0.10015662014484406, + "learning_rate": 0.0001, + "loss": 1.638, + "step": 44450 + }, + { + "epoch": 0.6121129589127774, + "grad_norm": 0.09394964575767517, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 44500 + }, + { + "epoch": 0.6128007262823423, + "grad_norm": 0.09288498759269714, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 44550 + }, + { + "epoch": 0.6134884936519072, + "grad_norm": 0.09083982557058334, + "learning_rate": 0.0001, + "loss": 1.6399, + "step": 44600 + }, + { + "epoch": 0.614176261021472, + "grad_norm": 0.11770026385784149, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 44650 + }, + { + "epoch": 0.614864028391037, + "grad_norm": 0.09519533067941666, + "learning_rate": 0.0001, + "loss": 1.6391, + "step": 44700 + }, + { + "epoch": 0.6155517957606019, + "grad_norm": 0.11158303171396255, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 44750 + }, + { + "epoch": 0.6162395631301668, + "grad_norm": 0.1033138781785965, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 44800 + }, + { + "epoch": 0.6169273304997318, + "grad_norm": 0.09701905399560928, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 44850 + }, + { + "epoch": 0.6176150978692967, + "grad_norm": 0.0948035940527916, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 44900 + }, + { + "epoch": 0.6183028652388616, + "grad_norm": 0.10962845385074615, + "learning_rate": 0.0001, + "loss": 1.6388, + "step": 44950 + }, + { + "epoch": 0.6189906326084266, + "grad_norm": 0.1121959462761879, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 45000 + }, + { + "epoch": 0.6196783999779915, + "grad_norm": 0.0968470573425293, + "learning_rate": 0.0001, + "loss": 1.6384, + "step": 45050 + }, + { + "epoch": 0.6203661673475563, + "grad_norm": 0.10296080261468887, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 45100 + }, + { + "epoch": 0.6210539347171212, + "grad_norm": 0.09767179936170578, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 45150 + }, + { + "epoch": 0.6217417020866862, + "grad_norm": 0.09881862252950668, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 45200 + }, + { + "epoch": 0.6224294694562511, + "grad_norm": 0.11464895308017731, + "learning_rate": 0.0001, + "loss": 1.6344, + "step": 45250 + }, + { + "epoch": 0.623117236825816, + "grad_norm": 0.09954984486103058, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 45300 + }, + { + "epoch": 0.623805004195381, + "grad_norm": 0.10357275605201721, + "learning_rate": 0.0001, + "loss": 1.6382, + "step": 45350 + }, + { + "epoch": 0.6244927715649459, + "grad_norm": 0.09138759225606918, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 45400 + }, + { + "epoch": 0.6251805389345108, + "grad_norm": 0.09229311347007751, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 45450 + }, + { + "epoch": 0.6258683063040757, + "grad_norm": 0.09420092403888702, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 45500 + }, + { + "epoch": 0.6265560736736406, + "grad_norm": 0.0874767154455185, + "learning_rate": 0.0001, + "loss": 1.635, + "step": 45550 + }, + { + "epoch": 0.6272438410432055, + "grad_norm": 0.09979282319545746, + "learning_rate": 0.0001, + "loss": 1.6356, + "step": 45600 + }, + { + "epoch": 0.6279316084127705, + "grad_norm": 0.105704665184021, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 45650 + }, + { + "epoch": 0.6286193757823354, + "grad_norm": 0.09676025807857513, + "learning_rate": 0.0001, + "loss": 1.6376, + "step": 45700 + }, + { + "epoch": 0.6293071431519003, + "grad_norm": 0.1145695224404335, + "learning_rate": 0.0001, + "loss": 1.6359, + "step": 45750 + }, + { + "epoch": 0.6299949105214652, + "grad_norm": 0.0872386172413826, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 45800 + }, + { + "epoch": 0.6306826778910302, + "grad_norm": 0.10765109956264496, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 45850 + }, + { + "epoch": 0.631370445260595, + "grad_norm": 0.09556550532579422, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 45900 + }, + { + "epoch": 0.6320582126301599, + "grad_norm": 0.09580112993717194, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 45950 + }, + { + "epoch": 0.6327459799997249, + "grad_norm": 0.09488419443368912, + "learning_rate": 0.0001, + "loss": 1.637, + "step": 46000 + }, + { + "epoch": 0.6334337473692898, + "grad_norm": 0.15395735204219818, + "learning_rate": 0.0001, + "loss": 1.6375, + "step": 46050 + }, + { + "epoch": 0.6341215147388547, + "grad_norm": 0.10091084241867065, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 46100 + }, + { + "epoch": 0.6348092821084197, + "grad_norm": 0.09612899273633957, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 46150 + }, + { + "epoch": 0.6354970494779846, + "grad_norm": 0.08732175827026367, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 46200 + }, + { + "epoch": 0.6361848168475495, + "grad_norm": 0.12121989578008652, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 46250 + }, + { + "epoch": 0.6368725842171145, + "grad_norm": 0.10441911965608597, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 46300 + }, + { + "epoch": 0.6375603515866793, + "grad_norm": 0.09910161048173904, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 46350 + }, + { + "epoch": 0.6382481189562442, + "grad_norm": 0.10073550045490265, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 46400 + }, + { + "epoch": 0.6389358863258091, + "grad_norm": 0.09164462983608246, + "learning_rate": 0.0001, + "loss": 1.6295, + "step": 46450 + }, + { + "epoch": 0.6396236536953741, + "grad_norm": 0.09485001116991043, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 46500 + }, + { + "epoch": 0.640311421064939, + "grad_norm": 0.09691156446933746, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 46550 + }, + { + "epoch": 0.6409991884345039, + "grad_norm": 0.0953722670674324, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 46600 + }, + { + "epoch": 0.6416869558040689, + "grad_norm": 0.09703515470027924, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 46650 + }, + { + "epoch": 0.6423747231736338, + "grad_norm": 0.09629255533218384, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 46700 + }, + { + "epoch": 0.6430624905431986, + "grad_norm": 0.09151779860258102, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 46750 + }, + { + "epoch": 0.6437502579127636, + "grad_norm": 0.0923495665192604, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 46800 + }, + { + "epoch": 0.6444380252823285, + "grad_norm": 0.09875286370515823, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 46850 + }, + { + "epoch": 0.6451257926518934, + "grad_norm": 0.09045711159706116, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 46900 + }, + { + "epoch": 0.6458135600214583, + "grad_norm": 0.10379324108362198, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 46950 + }, + { + "epoch": 0.6465013273910233, + "grad_norm": 0.11694531887769699, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 47000 + }, + { + "epoch": 0.6471890947605882, + "grad_norm": 0.11057207733392715, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 47050 + }, + { + "epoch": 0.647876862130153, + "grad_norm": 0.09550927579402924, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 47100 + }, + { + "epoch": 0.648564629499718, + "grad_norm": 0.09823106229305267, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 47150 + }, + { + "epoch": 0.6492523968692829, + "grad_norm": 0.10705850273370743, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 47200 + }, + { + "epoch": 0.6499401642388478, + "grad_norm": 0.10305814445018768, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 47250 + }, + { + "epoch": 0.6506279316084128, + "grad_norm": 0.09874992817640305, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 47300 + }, + { + "epoch": 0.6513156989779777, + "grad_norm": 0.09681331366300583, + "learning_rate": 0.0001, + "loss": 1.6377, + "step": 47350 + }, + { + "epoch": 0.6520034663475426, + "grad_norm": 0.10364115238189697, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 47400 + }, + { + "epoch": 0.6526912337171076, + "grad_norm": 0.11742060631513596, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 47450 + }, + { + "epoch": 0.6533790010866725, + "grad_norm": 0.08997539430856705, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 47500 + }, + { + "epoch": 0.6540667684562373, + "grad_norm": 0.09857484698295593, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 47550 + }, + { + "epoch": 0.6547545358258022, + "grad_norm": 0.09228866547346115, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 47600 + }, + { + "epoch": 0.6554423031953672, + "grad_norm": 0.09897273778915405, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 47650 + }, + { + "epoch": 0.6561300705649321, + "grad_norm": 0.09947068989276886, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 47700 + }, + { + "epoch": 0.656817837934497, + "grad_norm": 0.11004633456468582, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 47750 + }, + { + "epoch": 0.657505605304062, + "grad_norm": 0.09651768952608109, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 47800 + }, + { + "epoch": 0.6581933726736269, + "grad_norm": 0.0905209481716156, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 47850 + }, + { + "epoch": 0.6588811400431918, + "grad_norm": 0.11071925610303879, + "learning_rate": 0.0001, + "loss": 1.6368, + "step": 47900 + }, + { + "epoch": 0.6595689074127568, + "grad_norm": 0.08634346723556519, + "learning_rate": 0.0001, + "loss": 1.6369, + "step": 47950 + }, + { + "epoch": 0.6602566747823216, + "grad_norm": 0.15600930154323578, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 48000 + }, + { + "epoch": 0.6609444421518865, + "grad_norm": 0.11416825652122498, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 48050 + }, + { + "epoch": 0.6616322095214515, + "grad_norm": 0.08934716135263443, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 48100 + }, + { + "epoch": 0.6623199768910164, + "grad_norm": 0.10360625386238098, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 48150 + }, + { + "epoch": 0.6630077442605813, + "grad_norm": 0.08972904086112976, + "learning_rate": 0.0001, + "loss": 1.6364, + "step": 48200 + }, + { + "epoch": 0.6636955116301462, + "grad_norm": 0.10486090183258057, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 48250 + }, + { + "epoch": 0.6643832789997112, + "grad_norm": 0.09509388357400894, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 48300 + }, + { + "epoch": 0.665071046369276, + "grad_norm": 0.0978211984038353, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 48350 + }, + { + "epoch": 0.6657588137388409, + "grad_norm": 0.08715818077325821, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 48400 + }, + { + "epoch": 0.6664465811084059, + "grad_norm": 0.09371665865182877, + "learning_rate": 0.0001, + "loss": 1.6358, + "step": 48450 + }, + { + "epoch": 0.6671343484779708, + "grad_norm": 0.09343220293521881, + "learning_rate": 0.0001, + "loss": 1.6349, + "step": 48500 + }, + { + "epoch": 0.6678221158475357, + "grad_norm": 0.09670977294445038, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 48550 + }, + { + "epoch": 0.6685098832171007, + "grad_norm": 0.0940290242433548, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 48600 + }, + { + "epoch": 0.6691976505866656, + "grad_norm": 0.10598721355199814, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 48650 + }, + { + "epoch": 0.6698854179562305, + "grad_norm": 0.10703752934932709, + "learning_rate": 0.0001, + "loss": 1.6363, + "step": 48700 + }, + { + "epoch": 0.6705731853257954, + "grad_norm": 0.10036034137010574, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 48750 + }, + { + "epoch": 0.6712609526953603, + "grad_norm": 0.09091803431510925, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 48800 + }, + { + "epoch": 0.6719487200649252, + "grad_norm": 0.09322872757911682, + "learning_rate": 0.0001, + "loss": 1.6361, + "step": 48850 + }, + { + "epoch": 0.6726364874344901, + "grad_norm": 0.10486938804388046, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 48900 + }, + { + "epoch": 0.6733242548040551, + "grad_norm": 0.09341230988502502, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 48950 + }, + { + "epoch": 0.67401202217362, + "grad_norm": 0.09082542359828949, + "learning_rate": 0.0001, + "loss": 1.6335, + "step": 49000 + }, + { + "epoch": 0.6746997895431849, + "grad_norm": 0.10231276601552963, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 49050 + }, + { + "epoch": 0.6753875569127499, + "grad_norm": 0.08918558061122894, + "learning_rate": 0.0001, + "loss": 1.6386, + "step": 49100 + }, + { + "epoch": 0.6760753242823148, + "grad_norm": 0.09939314424991608, + "learning_rate": 0.0001, + "loss": 1.6343, + "step": 49150 + }, + { + "epoch": 0.6767630916518796, + "grad_norm": 0.09934555739164352, + "learning_rate": 0.0001, + "loss": 1.6348, + "step": 49200 + }, + { + "epoch": 0.6774508590214446, + "grad_norm": 0.09298571944236755, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 49250 + }, + { + "epoch": 0.6781386263910095, + "grad_norm": 0.09460543841123581, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 49300 + }, + { + "epoch": 0.6788263937605744, + "grad_norm": 0.08836425095796585, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 49350 + }, + { + "epoch": 0.6795141611301393, + "grad_norm": 0.11813092231750488, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 49400 + }, + { + "epoch": 0.6802019284997043, + "grad_norm": 0.09533282369375229, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 49450 + }, + { + "epoch": 0.6808896958692692, + "grad_norm": 0.09631752967834473, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 49500 + }, + { + "epoch": 0.6815774632388341, + "grad_norm": 0.10790685564279556, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 49550 + }, + { + "epoch": 0.682265230608399, + "grad_norm": 0.09565212577581406, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 49600 + }, + { + "epoch": 0.6829529979779639, + "grad_norm": 0.0955987349152565, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 49650 + }, + { + "epoch": 0.6836407653475288, + "grad_norm": 0.10549987852573395, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 49700 + }, + { + "epoch": 0.6843285327170938, + "grad_norm": 0.10708790272474289, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 49750 + }, + { + "epoch": 0.6850163000866587, + "grad_norm": 0.09704712778329849, + "learning_rate": 0.0001, + "loss": 1.6334, + "step": 49800 + }, + { + "epoch": 0.6857040674562236, + "grad_norm": 0.08443079143762589, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 49850 + }, + { + "epoch": 0.6863918348257885, + "grad_norm": 0.09780169278383255, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 49900 + }, + { + "epoch": 0.6870796021953535, + "grad_norm": 0.09641287475824356, + "learning_rate": 0.0001, + "loss": 1.6351, + "step": 49950 + }, + { + "epoch": 0.6877673695649184, + "grad_norm": 0.10506296902894974, + "learning_rate": 0.0001, + "loss": 1.6337, + "step": 50000 + }, + { + "epoch": 0.6884551369344832, + "grad_norm": 0.08869095146656036, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 50050 + }, + { + "epoch": 0.6891429043040482, + "grad_norm": 0.0935509204864502, + "learning_rate": 0.0001, + "loss": 1.6354, + "step": 50100 + }, + { + "epoch": 0.6898306716736131, + "grad_norm": 0.09292100369930267, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 50150 + }, + { + "epoch": 0.690518439043178, + "grad_norm": 0.09042533487081528, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 50200 + }, + { + "epoch": 0.691206206412743, + "grad_norm": 0.09161574393510818, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 50250 + }, + { + "epoch": 0.6918939737823079, + "grad_norm": 0.0865265503525734, + "learning_rate": 0.0001, + "loss": 1.6328, + "step": 50300 + }, + { + "epoch": 0.6925817411518728, + "grad_norm": 0.09176287055015564, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 50350 + }, + { + "epoch": 0.6932695085214378, + "grad_norm": 0.10909677296876907, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 50400 + }, + { + "epoch": 0.6939572758910026, + "grad_norm": 0.10558103770017624, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 50450 + }, + { + "epoch": 0.6946450432605675, + "grad_norm": 0.10107540339231491, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 50500 + }, + { + "epoch": 0.6953328106301324, + "grad_norm": 0.10059240460395813, + "learning_rate": 0.0001, + "loss": 1.6353, + "step": 50550 + }, + { + "epoch": 0.6960205779996974, + "grad_norm": 0.0953984409570694, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 50600 + }, + { + "epoch": 0.6967083453692623, + "grad_norm": 0.09249977767467499, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 50650 + }, + { + "epoch": 0.6973961127388272, + "grad_norm": 0.10159103572368622, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 50700 + }, + { + "epoch": 0.6980838801083922, + "grad_norm": 0.09958640486001968, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 50750 + }, + { + "epoch": 0.6987716474779571, + "grad_norm": 0.09354562312364578, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 50800 + }, + { + "epoch": 0.6994594148475219, + "grad_norm": 0.08785393089056015, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 50850 + }, + { + "epoch": 0.7001471822170869, + "grad_norm": 0.10028740018606186, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 50900 + }, + { + "epoch": 0.7008349495866518, + "grad_norm": 0.09850720316171646, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 50950 + }, + { + "epoch": 0.7015227169562167, + "grad_norm": 0.0887802317738533, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 51000 + }, + { + "epoch": 0.7022104843257817, + "grad_norm": 0.1057766005396843, + "learning_rate": 0.0001, + "loss": 1.6313, + "step": 51050 + }, + { + "epoch": 0.7028982516953466, + "grad_norm": 0.10256383568048477, + "learning_rate": 0.0001, + "loss": 1.6322, + "step": 51100 + }, + { + "epoch": 0.7035860190649115, + "grad_norm": 0.1051393449306488, + "learning_rate": 0.0001, + "loss": 1.6326, + "step": 51150 + }, + { + "epoch": 0.7042737864344764, + "grad_norm": 0.10550976544618607, + "learning_rate": 0.0001, + "loss": 1.6319, + "step": 51200 + }, + { + "epoch": 0.7049615538040414, + "grad_norm": 0.10621052980422974, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 51250 + }, + { + "epoch": 0.7056493211736062, + "grad_norm": 0.10684607177972794, + "learning_rate": 0.0001, + "loss": 1.6329, + "step": 51300 + }, + { + "epoch": 0.7063370885431711, + "grad_norm": 0.10426071286201477, + "learning_rate": 0.0001, + "loss": 1.6325, + "step": 51350 + }, + { + "epoch": 0.7070248559127361, + "grad_norm": 0.09892316907644272, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 51400 + }, + { + "epoch": 0.707712623282301, + "grad_norm": 0.11626065522432327, + "learning_rate": 0.0001, + "loss": 1.6345, + "step": 51450 + }, + { + "epoch": 0.7084003906518659, + "grad_norm": 0.10195190459489822, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 51500 + }, + { + "epoch": 0.7090881580214309, + "grad_norm": 0.10849519073963165, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 51550 + }, + { + "epoch": 0.7097759253909958, + "grad_norm": 0.10192925482988358, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 51600 + }, + { + "epoch": 0.7104636927605607, + "grad_norm": 0.09579839557409286, + "learning_rate": 0.0001, + "loss": 1.6357, + "step": 51650 + }, + { + "epoch": 0.7111514601301255, + "grad_norm": 0.09045730531215668, + "learning_rate": 0.0001, + "loss": 1.6338, + "step": 51700 + }, + { + "epoch": 0.7118392274996905, + "grad_norm": 0.09064441174268723, + "learning_rate": 0.0001, + "loss": 1.6333, + "step": 51750 + }, + { + "epoch": 0.7125269948692554, + "grad_norm": 0.09252890199422836, + "learning_rate": 0.0001, + "loss": 1.6323, + "step": 51800 + }, + { + "epoch": 0.7132147622388203, + "grad_norm": 0.09987664967775345, + "learning_rate": 0.0001, + "loss": 1.6347, + "step": 51850 + }, + { + "epoch": 0.7139025296083853, + "grad_norm": 0.09582144021987915, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 51900 + }, + { + "epoch": 0.7145902969779502, + "grad_norm": 0.09969571232795715, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 51950 + }, + { + "epoch": 0.7152780643475151, + "grad_norm": 0.104249969124794, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 52000 + }, + { + "epoch": 0.7159658317170801, + "grad_norm": 0.1016249805688858, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 52050 + }, + { + "epoch": 0.7166535990866449, + "grad_norm": 0.10166680812835693, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 52100 + }, + { + "epoch": 0.7173413664562098, + "grad_norm": 0.08657228201627731, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 52150 + }, + { + "epoch": 0.7180291338257748, + "grad_norm": 0.09621983766555786, + "learning_rate": 0.0001, + "loss": 1.6301, + "step": 52200 + }, + { + "epoch": 0.7187169011953397, + "grad_norm": 0.10212218761444092, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 52250 + }, + { + "epoch": 0.7194046685649046, + "grad_norm": 0.09836029261350632, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 52300 + }, + { + "epoch": 0.7200924359344695, + "grad_norm": 0.1043592095375061, + "learning_rate": 0.0001, + "loss": 1.6342, + "step": 52350 + }, + { + "epoch": 0.7207802033040345, + "grad_norm": 0.09003785252571106, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 52400 + }, + { + "epoch": 0.7214679706735994, + "grad_norm": 0.09913681447505951, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 52450 + }, + { + "epoch": 0.7221557380431642, + "grad_norm": 0.0937609001994133, + "learning_rate": 0.0001, + "loss": 1.6339, + "step": 52500 + }, + { + "epoch": 0.7228435054127292, + "grad_norm": 0.10508369654417038, + "learning_rate": 0.0001, + "loss": 1.6315, + "step": 52550 + }, + { + "epoch": 0.7235312727822941, + "grad_norm": 0.08586754649877548, + "learning_rate": 0.0001, + "loss": 1.6355, + "step": 52600 + }, + { + "epoch": 0.724219040151859, + "grad_norm": 0.10271697491407394, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 52650 + }, + { + "epoch": 0.724906807521424, + "grad_norm": 0.09757651388645172, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 52700 + }, + { + "epoch": 0.7255945748909889, + "grad_norm": 0.09661528468132019, + "learning_rate": 0.0001, + "loss": 1.6346, + "step": 52750 + }, + { + "epoch": 0.7262823422605538, + "grad_norm": 0.10581836849451065, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 52800 + }, + { + "epoch": 0.7269701096301187, + "grad_norm": 0.08999978750944138, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 52850 + }, + { + "epoch": 0.7276578769996837, + "grad_norm": 0.10840348899364471, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 52900 + }, + { + "epoch": 0.7283456443692485, + "grad_norm": 0.0947146937251091, + "learning_rate": 0.0001, + "loss": 1.6331, + "step": 52950 + }, + { + "epoch": 0.7290334117388134, + "grad_norm": 0.09926970303058624, + "learning_rate": 0.0001, + "loss": 1.633, + "step": 53000 + }, + { + "epoch": 0.7297211791083784, + "grad_norm": 0.10415767878293991, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 53050 + }, + { + "epoch": 0.7304089464779433, + "grad_norm": 0.11093425750732422, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 53100 + }, + { + "epoch": 0.7310967138475082, + "grad_norm": 0.11629913002252579, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 53150 + }, + { + "epoch": 0.7317844812170732, + "grad_norm": 0.09028749912977219, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 53200 + }, + { + "epoch": 0.7324722485866381, + "grad_norm": 0.09283788502216339, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 53250 + }, + { + "epoch": 0.733160015956203, + "grad_norm": 0.10100384056568146, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 53300 + }, + { + "epoch": 0.7338477833257679, + "grad_norm": 0.09549766778945923, + "learning_rate": 0.0001, + "loss": 1.6327, + "step": 53350 + }, + { + "epoch": 0.7345355506953328, + "grad_norm": 0.10129329562187195, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 53400 + }, + { + "epoch": 0.7352233180648977, + "grad_norm": 0.085689477622509, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 53450 + }, + { + "epoch": 0.7359110854344626, + "grad_norm": 0.10251662135124207, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 53500 + }, + { + "epoch": 0.7365988528040276, + "grad_norm": 0.10034999996423721, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 53550 + }, + { + "epoch": 0.7372866201735925, + "grad_norm": 0.10141023993492126, + "learning_rate": 0.0001, + "loss": 1.6321, + "step": 53600 + }, + { + "epoch": 0.7379743875431574, + "grad_norm": 0.11427409946918488, + "learning_rate": 0.0001, + "loss": 1.6284, + "step": 53650 + }, + { + "epoch": 0.7386621549127224, + "grad_norm": 0.09212636202573776, + "learning_rate": 0.0001, + "loss": 1.6307, + "step": 53700 + }, + { + "epoch": 0.7393499222822872, + "grad_norm": 0.11293049156665802, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 53750 + }, + { + "epoch": 0.7400376896518521, + "grad_norm": 0.1038009375333786, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 53800 + }, + { + "epoch": 0.7407254570214171, + "grad_norm": 0.09193024039268494, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 53850 + }, + { + "epoch": 0.741413224390982, + "grad_norm": 0.10515929013490677, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 53900 + }, + { + "epoch": 0.7421009917605469, + "grad_norm": 0.09728860855102539, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 53950 + }, + { + "epoch": 0.7427887591301119, + "grad_norm": 0.10483654588460922, + "learning_rate": 0.0001, + "loss": 1.6336, + "step": 54000 + }, + { + "epoch": 0.7434765264996768, + "grad_norm": 0.09928806871175766, + "learning_rate": 0.0001, + "loss": 1.6302, + "step": 54050 + }, + { + "epoch": 0.7441642938692417, + "grad_norm": 0.10114587098360062, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 54100 + }, + { + "epoch": 0.7448520612388065, + "grad_norm": 0.09662097692489624, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 54150 + }, + { + "epoch": 0.7455398286083715, + "grad_norm": 0.1011410728096962, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 54200 + }, + { + "epoch": 0.7462275959779364, + "grad_norm": 0.09594978392124176, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 54250 + }, + { + "epoch": 0.7469153633475013, + "grad_norm": 0.0986706018447876, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 54300 + }, + { + "epoch": 0.7476031307170663, + "grad_norm": 0.11080548167228699, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 54350 + }, + { + "epoch": 0.7482908980866312, + "grad_norm": 0.10811348259449005, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 54400 + }, + { + "epoch": 0.7489786654561961, + "grad_norm": 0.09786169975996017, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 54450 + }, + { + "epoch": 0.7496664328257611, + "grad_norm": 0.09347623586654663, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 54500 + }, + { + "epoch": 0.750354200195326, + "grad_norm": 0.10388332605361938, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 54550 + }, + { + "epoch": 0.7510419675648908, + "grad_norm": 0.09866851568222046, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 54600 + }, + { + "epoch": 0.7517297349344557, + "grad_norm": 0.10995014756917953, + "learning_rate": 0.0001, + "loss": 1.6312, + "step": 54650 + }, + { + "epoch": 0.7524175023040207, + "grad_norm": 0.09450124949216843, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 54700 + }, + { + "epoch": 0.7531052696735856, + "grad_norm": 0.10181824117898941, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 54750 + }, + { + "epoch": 0.7537930370431505, + "grad_norm": 0.11469998210668564, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 54800 + }, + { + "epoch": 0.7544808044127155, + "grad_norm": 0.11170945316553116, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 54850 + }, + { + "epoch": 0.7551685717822804, + "grad_norm": 0.09184923022985458, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 54900 + }, + { + "epoch": 0.7558563391518452, + "grad_norm": 0.10830254852771759, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 54950 + }, + { + "epoch": 0.7565441065214102, + "grad_norm": 0.09561311453580856, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 55000 + }, + { + "epoch": 0.7572318738909751, + "grad_norm": 0.0922449454665184, + "learning_rate": 0.0001, + "loss": 1.6314, + "step": 55050 + }, + { + "epoch": 0.75791964126054, + "grad_norm": 0.09750694036483765, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 55100 + }, + { + "epoch": 0.758607408630105, + "grad_norm": 0.10356149077415466, + "learning_rate": 0.0001, + "loss": 1.6298, + "step": 55150 + }, + { + "epoch": 0.7592951759996699, + "grad_norm": 0.09972983598709106, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 55200 + }, + { + "epoch": 0.7599829433692348, + "grad_norm": 0.09673431515693665, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 55250 + }, + { + "epoch": 0.7606707107387997, + "grad_norm": 0.11159659922122955, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 55300 + }, + { + "epoch": 0.7613584781083647, + "grad_norm": 0.10829446464776993, + "learning_rate": 0.0001, + "loss": 1.6324, + "step": 55350 + }, + { + "epoch": 0.7620462454779295, + "grad_norm": 0.09917978942394257, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 55400 + }, + { + "epoch": 0.7627340128474944, + "grad_norm": 0.09947081655263901, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 55450 + }, + { + "epoch": 0.7634217802170594, + "grad_norm": 0.09545807540416718, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 55500 + }, + { + "epoch": 0.7641095475866243, + "grad_norm": 0.10184399038553238, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 55550 + }, + { + "epoch": 0.7647973149561892, + "grad_norm": 0.0954732596874237, + "learning_rate": 0.0001, + "loss": 1.6285, + "step": 55600 + }, + { + "epoch": 0.7654850823257542, + "grad_norm": 0.1068364828824997, + "learning_rate": 0.0001, + "loss": 1.6289, + "step": 55650 + }, + { + "epoch": 0.7661728496953191, + "grad_norm": 0.09565538913011551, + "learning_rate": 0.0001, + "loss": 1.6293, + "step": 55700 + }, + { + "epoch": 0.766860617064884, + "grad_norm": 0.09793255478143692, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 55750 + }, + { + "epoch": 0.7675483844344488, + "grad_norm": 0.11079392582178116, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 55800 + }, + { + "epoch": 0.7682361518040138, + "grad_norm": 0.09202311187982559, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 55850 + }, + { + "epoch": 0.7689239191735787, + "grad_norm": 0.09076053649187088, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 55900 + }, + { + "epoch": 0.7696116865431436, + "grad_norm": 0.10245150327682495, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 55950 + }, + { + "epoch": 0.7702994539127086, + "grad_norm": 0.09228003025054932, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 56000 + }, + { + "epoch": 0.7709872212822735, + "grad_norm": 0.10882555693387985, + "learning_rate": 0.0001, + "loss": 1.632, + "step": 56050 + }, + { + "epoch": 0.7716749886518384, + "grad_norm": 0.11051888018846512, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 56100 + }, + { + "epoch": 0.7723627560214034, + "grad_norm": 0.11535434424877167, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 56150 + }, + { + "epoch": 0.7730505233909682, + "grad_norm": 0.09059146791696548, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 56200 + }, + { + "epoch": 0.7737382907605331, + "grad_norm": 0.09102441370487213, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 56250 + }, + { + "epoch": 0.7744260581300981, + "grad_norm": 0.09455610811710358, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 56300 + }, + { + "epoch": 0.775113825499663, + "grad_norm": 0.09675318747758865, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 56350 + }, + { + "epoch": 0.7758015928692279, + "grad_norm": 0.08795568346977234, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 56400 + }, + { + "epoch": 0.7764893602387928, + "grad_norm": 0.1048702672123909, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 56450 + }, + { + "epoch": 0.7771771276083578, + "grad_norm": 0.10998234897851944, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 56500 + }, + { + "epoch": 0.7778648949779227, + "grad_norm": 0.09200160205364227, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 56550 + }, + { + "epoch": 0.7785526623474875, + "grad_norm": 0.12960132956504822, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 56600 + }, + { + "epoch": 0.7792404297170525, + "grad_norm": 0.09751798212528229, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 56650 + }, + { + "epoch": 0.7799281970866174, + "grad_norm": 0.09842194616794586, + "learning_rate": 0.0001, + "loss": 1.6299, + "step": 56700 + }, + { + "epoch": 0.7806159644561823, + "grad_norm": 0.09356516599655151, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 56750 + }, + { + "epoch": 0.7813037318257473, + "grad_norm": 0.10366610437631607, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 56800 + }, + { + "epoch": 0.7819914991953122, + "grad_norm": 0.10093358904123306, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 56850 + }, + { + "epoch": 0.7826792665648771, + "grad_norm": 0.11231804639101028, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 56900 + }, + { + "epoch": 0.7833670339344421, + "grad_norm": 0.12199211865663528, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 56950 + }, + { + "epoch": 0.784054801304007, + "grad_norm": 0.10776622593402863, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 57000 + }, + { + "epoch": 0.7847425686735718, + "grad_norm": 0.10442288219928741, + "learning_rate": 0.0001, + "loss": 1.6309, + "step": 57050 + }, + { + "epoch": 0.7854303360431367, + "grad_norm": 0.11845128238201141, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 57100 + }, + { + "epoch": 0.7861181034127017, + "grad_norm": 0.09980395436286926, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 57150 + }, + { + "epoch": 0.7868058707822666, + "grad_norm": 0.105705127120018, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 57200 + }, + { + "epoch": 0.7874936381518315, + "grad_norm": 0.09033431857824326, + "learning_rate": 0.0001, + "loss": 1.6317, + "step": 57250 + }, + { + "epoch": 0.7881814055213965, + "grad_norm": 0.10274254530668259, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 57300 + }, + { + "epoch": 0.7888691728909614, + "grad_norm": 0.09502615034580231, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 57350 + }, + { + "epoch": 0.7895569402605263, + "grad_norm": 0.10381683707237244, + "learning_rate": 0.0001, + "loss": 1.6306, + "step": 57400 + }, + { + "epoch": 0.7902447076300912, + "grad_norm": 0.10381805896759033, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 57450 + }, + { + "epoch": 0.7909324749996561, + "grad_norm": 0.0951840877532959, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 57500 + }, + { + "epoch": 0.791620242369221, + "grad_norm": 0.117914579808712, + "learning_rate": 0.0001, + "loss": 1.6316, + "step": 57550 + }, + { + "epoch": 0.7923080097387859, + "grad_norm": 0.10382041335105896, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 57600 + }, + { + "epoch": 0.7929957771083509, + "grad_norm": 0.11510676145553589, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 57650 + }, + { + "epoch": 0.7936835444779158, + "grad_norm": 0.09333761036396027, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 57700 + }, + { + "epoch": 0.7943713118474807, + "grad_norm": 0.09227602928876877, + "learning_rate": 0.0001, + "loss": 1.631, + "step": 57750 + }, + { + "epoch": 0.7950590792170457, + "grad_norm": 0.10818013548851013, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 57800 + }, + { + "epoch": 0.7957468465866105, + "grad_norm": 0.10440421849489212, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 57850 + }, + { + "epoch": 0.7964346139561754, + "grad_norm": 0.10464609414339066, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 57900 + }, + { + "epoch": 0.7971223813257404, + "grad_norm": 0.11318383365869522, + "learning_rate": 0.0001, + "loss": 1.6283, + "step": 57950 + }, + { + "epoch": 0.7978101486953053, + "grad_norm": 0.1296527087688446, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 58000 + }, + { + "epoch": 0.7984979160648702, + "grad_norm": 0.09575483202934265, + "learning_rate": 0.0001, + "loss": 1.6288, + "step": 58050 + }, + { + "epoch": 0.7991856834344352, + "grad_norm": 0.11318851262331009, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 58100 + }, + { + "epoch": 0.7998734508040001, + "grad_norm": 0.10303325206041336, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 58150 + }, + { + "epoch": 0.800561218173565, + "grad_norm": 0.09992048889398575, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 58200 + }, + { + "epoch": 0.8012489855431298, + "grad_norm": 0.09654615819454193, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 58250 + }, + { + "epoch": 0.8019367529126948, + "grad_norm": 0.0984918400645256, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 58300 + }, + { + "epoch": 0.8026245202822597, + "grad_norm": 0.09900155663490295, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 58350 + }, + { + "epoch": 0.8033122876518246, + "grad_norm": 0.09438169002532959, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 58400 + }, + { + "epoch": 0.8040000550213896, + "grad_norm": 0.09372150897979736, + "learning_rate": 0.0001, + "loss": 1.6296, + "step": 58450 + }, + { + "epoch": 0.8046878223909545, + "grad_norm": 0.10187190771102905, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 58500 + }, + { + "epoch": 0.8053755897605194, + "grad_norm": 0.0914900004863739, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 58550 + }, + { + "epoch": 0.8060633571300844, + "grad_norm": 0.08726612478494644, + "learning_rate": 0.0001, + "loss": 1.6308, + "step": 58600 + }, + { + "epoch": 0.8067511244996493, + "grad_norm": 0.11641035974025726, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 58650 + }, + { + "epoch": 0.8074388918692141, + "grad_norm": 0.121851347386837, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 58700 + }, + { + "epoch": 0.808126659238779, + "grad_norm": 0.10153374820947647, + "learning_rate": 0.0001, + "loss": 1.6275, + "step": 58750 + }, + { + "epoch": 0.808814426608344, + "grad_norm": 0.10574651509523392, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 58800 + }, + { + "epoch": 0.8095021939779089, + "grad_norm": 0.11473171412944794, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 58850 + }, + { + "epoch": 0.8101899613474738, + "grad_norm": 0.09999868273735046, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 58900 + }, + { + "epoch": 0.8108777287170388, + "grad_norm": 0.10102663189172745, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 58950 + }, + { + "epoch": 0.8115654960866037, + "grad_norm": 0.09449347853660583, + "learning_rate": 0.0001, + "loss": 1.6305, + "step": 59000 + }, + { + "epoch": 0.8122532634561686, + "grad_norm": 0.1009218841791153, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 59050 + }, + { + "epoch": 0.8129410308257335, + "grad_norm": 0.08553290367126465, + "learning_rate": 0.0001, + "loss": 1.63, + "step": 59100 + }, + { + "epoch": 0.8136287981952984, + "grad_norm": 0.1168675571680069, + "learning_rate": 0.0001, + "loss": 1.6277, + "step": 59150 + }, + { + "epoch": 0.8143165655648633, + "grad_norm": 0.10498977452516556, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 59200 + }, + { + "epoch": 0.8150043329344283, + "grad_norm": 0.09838738292455673, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 59250 + }, + { + "epoch": 0.8156921003039932, + "grad_norm": 0.1220153272151947, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 59300 + }, + { + "epoch": 0.8163798676735581, + "grad_norm": 0.12157600373029709, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 59350 + }, + { + "epoch": 0.817067635043123, + "grad_norm": 0.0998833104968071, + "learning_rate": 0.0001, + "loss": 1.6266, + "step": 59400 + }, + { + "epoch": 0.817755402412688, + "grad_norm": 0.10543464869260788, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 59450 + }, + { + "epoch": 0.8184431697822528, + "grad_norm": 0.09231004863977432, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 59500 + }, + { + "epoch": 0.8191309371518177, + "grad_norm": 0.10896420478820801, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 59550 + }, + { + "epoch": 0.8198187045213827, + "grad_norm": 0.09489674121141434, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 59600 + }, + { + "epoch": 0.8205064718909476, + "grad_norm": 0.10061246901750565, + "learning_rate": 0.0001, + "loss": 1.6294, + "step": 59650 + }, + { + "epoch": 0.8211942392605125, + "grad_norm": 0.10421578586101532, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 59700 + }, + { + "epoch": 0.8218820066300775, + "grad_norm": 0.10863976180553436, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 59750 + }, + { + "epoch": 0.8225697739996424, + "grad_norm": 0.10106450319290161, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 59800 + }, + { + "epoch": 0.8232575413692073, + "grad_norm": 0.1225091740489006, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 59850 + }, + { + "epoch": 0.8239453087387723, + "grad_norm": 0.1194959282875061, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 59900 + }, + { + "epoch": 0.8246330761083371, + "grad_norm": 0.10718126595020294, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 59950 + }, + { + "epoch": 0.825320843477902, + "grad_norm": 0.09109228849411011, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 60000 + }, + { + "epoch": 0.8260086108474669, + "grad_norm": 0.12683629989624023, + "learning_rate": 0.0001, + "loss": 1.6303, + "step": 60050 + }, + { + "epoch": 0.8266963782170319, + "grad_norm": 0.09432388842105865, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 60100 + }, + { + "epoch": 0.8273841455865968, + "grad_norm": 0.10563753545284271, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 60150 + }, + { + "epoch": 0.8280719129561617, + "grad_norm": 0.09300840646028519, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 60200 + }, + { + "epoch": 0.8287596803257267, + "grad_norm": 0.09419063478708267, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 60250 + }, + { + "epoch": 0.8294474476952916, + "grad_norm": 0.11976797133684158, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 60300 + }, + { + "epoch": 0.8301352150648564, + "grad_norm": 0.1019161194562912, + "learning_rate": 0.0001, + "loss": 1.6281, + "step": 60350 + }, + { + "epoch": 0.8308229824344214, + "grad_norm": 0.10402313619852066, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 60400 + }, + { + "epoch": 0.8315107498039863, + "grad_norm": 0.0936688631772995, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 60450 + }, + { + "epoch": 0.8321985171735512, + "grad_norm": 0.09794539213180542, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 60500 + }, + { + "epoch": 0.8328862845431161, + "grad_norm": 0.09901450574398041, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 60550 + }, + { + "epoch": 0.8335740519126811, + "grad_norm": 0.09567803889513016, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 60600 + }, + { + "epoch": 0.834261819282246, + "grad_norm": 0.12296677380800247, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 60650 + }, + { + "epoch": 0.8349495866518108, + "grad_norm": 0.10701528191566467, + "learning_rate": 0.0001, + "loss": 1.6297, + "step": 60700 + }, + { + "epoch": 0.8356373540213758, + "grad_norm": 0.09757701307535172, + "learning_rate": 0.0001, + "loss": 1.6246, + "step": 60750 + }, + { + "epoch": 0.8363251213909407, + "grad_norm": 0.11205516010522842, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 60800 + }, + { + "epoch": 0.8370128887605056, + "grad_norm": 0.09203255921602249, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 60850 + }, + { + "epoch": 0.8377006561300706, + "grad_norm": 0.10238678753376007, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 60900 + }, + { + "epoch": 0.8383884234996355, + "grad_norm": 0.0966443195939064, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 60950 + }, + { + "epoch": 0.8390761908692004, + "grad_norm": 0.09180071949958801, + "learning_rate": 0.0001, + "loss": 1.6252, + "step": 61000 + }, + { + "epoch": 0.8397639582387654, + "grad_norm": 0.09823059290647507, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 61050 + }, + { + "epoch": 0.8404517256083303, + "grad_norm": 0.11375342309474945, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 61100 + }, + { + "epoch": 0.8411394929778951, + "grad_norm": 0.10059309750795364, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 61150 + }, + { + "epoch": 0.84182726034746, + "grad_norm": 0.10789557546377182, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 61200 + }, + { + "epoch": 0.842515027717025, + "grad_norm": 0.09390353411436081, + "learning_rate": 0.0001, + "loss": 1.6251, + "step": 61250 + }, + { + "epoch": 0.8432027950865899, + "grad_norm": 0.09548452496528625, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 61300 + }, + { + "epoch": 0.8438905624561548, + "grad_norm": 0.10145819932222366, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 61350 + }, + { + "epoch": 0.8445783298257198, + "grad_norm": 0.10150100290775299, + "learning_rate": 0.0001, + "loss": 1.6273, + "step": 61400 + }, + { + "epoch": 0.8452660971952847, + "grad_norm": 0.09277284890413284, + "learning_rate": 0.0001, + "loss": 1.6286, + "step": 61450 + }, + { + "epoch": 0.8459538645648496, + "grad_norm": 0.09942333400249481, + "learning_rate": 0.0001, + "loss": 1.628, + "step": 61500 + }, + { + "epoch": 0.8466416319344146, + "grad_norm": 0.0959252342581749, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 61550 + }, + { + "epoch": 0.8473293993039794, + "grad_norm": 0.12430740892887115, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 61600 + }, + { + "epoch": 0.8480171666735443, + "grad_norm": 0.09702388197183609, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 61650 + }, + { + "epoch": 0.8487049340431092, + "grad_norm": 0.09867244213819504, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 61700 + }, + { + "epoch": 0.8493927014126742, + "grad_norm": 0.10050085186958313, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 61750 + }, + { + "epoch": 0.8500804687822391, + "grad_norm": 0.09910161048173904, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 61800 + }, + { + "epoch": 0.850768236151804, + "grad_norm": 0.13913027942180634, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 61850 + }, + { + "epoch": 0.851456003521369, + "grad_norm": 0.11147304624319077, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 61900 + }, + { + "epoch": 0.8521437708909338, + "grad_norm": 0.09804028272628784, + "learning_rate": 0.0001, + "loss": 1.6278, + "step": 61950 + }, + { + "epoch": 0.8528315382604987, + "grad_norm": 0.09778320044279099, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 62000 + }, + { + "epoch": 0.8535193056300637, + "grad_norm": 0.098909392952919, + "learning_rate": 0.0001, + "loss": 1.6272, + "step": 62050 + }, + { + "epoch": 0.8542070729996286, + "grad_norm": 0.10008524358272552, + "learning_rate": 0.0001, + "loss": 1.6237, + "step": 62100 + }, + { + "epoch": 0.8548948403691935, + "grad_norm": 0.11636464297771454, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 62150 + }, + { + "epoch": 0.8555826077387585, + "grad_norm": 0.10067223757505417, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 62200 + }, + { + "epoch": 0.8562703751083234, + "grad_norm": 0.0951443538069725, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 62250 + }, + { + "epoch": 0.8569581424778883, + "grad_norm": 0.1106119304895401, + "learning_rate": 0.0001, + "loss": 1.6274, + "step": 62300 + }, + { + "epoch": 0.8576459098474531, + "grad_norm": 0.1256386637687683, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 62350 + }, + { + "epoch": 0.8583336772170181, + "grad_norm": 0.10102882981300354, + "learning_rate": 0.0001, + "loss": 1.6287, + "step": 62400 + }, + { + "epoch": 0.859021444586583, + "grad_norm": 0.0932788997888565, + "learning_rate": 0.0001, + "loss": 1.6264, + "step": 62450 + }, + { + "epoch": 0.8597092119561479, + "grad_norm": 0.10514089465141296, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 62500 + }, + { + "epoch": 0.8603969793257129, + "grad_norm": 0.11338023841381073, + "learning_rate": 0.0001, + "loss": 1.6291, + "step": 62550 + }, + { + "epoch": 0.8610847466952778, + "grad_norm": 0.11062069982290268, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 62600 + }, + { + "epoch": 0.8617725140648427, + "grad_norm": 0.11567613482475281, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 62650 + }, + { + "epoch": 0.8624602814344077, + "grad_norm": 0.10254676640033722, + "learning_rate": 0.0001, + "loss": 1.6267, + "step": 62700 + }, + { + "epoch": 0.8631480488039726, + "grad_norm": 0.10593708604574203, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 62750 + }, + { + "epoch": 0.8638358161735374, + "grad_norm": 0.10564611107110977, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 62800 + }, + { + "epoch": 0.8645235835431024, + "grad_norm": 0.10547158122062683, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 62850 + }, + { + "epoch": 0.8652113509126673, + "grad_norm": 0.09912673383951187, + "learning_rate": 0.0001, + "loss": 1.6269, + "step": 62900 + }, + { + "epoch": 0.8658991182822322, + "grad_norm": 0.09913560748100281, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 62950 + }, + { + "epoch": 0.8665868856517971, + "grad_norm": 0.10240307450294495, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 63000 + }, + { + "epoch": 0.8672746530213621, + "grad_norm": 0.11807160824537277, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 63050 + }, + { + "epoch": 0.867962420390927, + "grad_norm": 0.09863050282001495, + "learning_rate": 0.0001, + "loss": 1.6279, + "step": 63100 + }, + { + "epoch": 0.8686501877604919, + "grad_norm": 0.1057911291718483, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 63150 + }, + { + "epoch": 0.8693379551300568, + "grad_norm": 0.1161641925573349, + "learning_rate": 0.0001, + "loss": 1.626, + "step": 63200 + }, + { + "epoch": 0.8700257224996217, + "grad_norm": 0.10318218171596527, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 63250 + }, + { + "epoch": 0.8707134898691866, + "grad_norm": 0.09954388439655304, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 63300 + }, + { + "epoch": 0.8714012572387516, + "grad_norm": 0.09751243889331818, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 63350 + }, + { + "epoch": 0.8720890246083165, + "grad_norm": 0.09338164329528809, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 63400 + }, + { + "epoch": 0.8727767919778814, + "grad_norm": 0.09846915304660797, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 63450 + }, + { + "epoch": 0.8734645593474463, + "grad_norm": 0.10697762668132782, + "learning_rate": 0.0001, + "loss": 1.6249, + "step": 63500 + }, + { + "epoch": 0.8741523267170113, + "grad_norm": 0.10318434238433838, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 63550 + }, + { + "epoch": 0.8748400940865761, + "grad_norm": 0.10023129731416702, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 63600 + }, + { + "epoch": 0.875527861456141, + "grad_norm": 0.11498495936393738, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 63650 + }, + { + "epoch": 0.876215628825706, + "grad_norm": 0.09535204619169235, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 63700 + }, + { + "epoch": 0.8769033961952709, + "grad_norm": 0.10134591907262802, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 63750 + }, + { + "epoch": 0.8775911635648358, + "grad_norm": 0.11040335893630981, + "learning_rate": 0.0001, + "loss": 1.6282, + "step": 63800 + }, + { + "epoch": 0.8782789309344008, + "grad_norm": 0.09738776832818985, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 63850 + }, + { + "epoch": 0.8789666983039657, + "grad_norm": 0.09208226203918457, + "learning_rate": 0.0001, + "loss": 1.6227, + "step": 63900 + }, + { + "epoch": 0.8796544656735306, + "grad_norm": 0.1082238256931305, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 63950 + }, + { + "epoch": 0.8803422330430956, + "grad_norm": 0.10054194927215576, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 64000 + }, + { + "epoch": 0.8810300004126604, + "grad_norm": 0.10898640006780624, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 64050 + }, + { + "epoch": 0.8817177677822253, + "grad_norm": 0.10575822740793228, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 64100 + }, + { + "epoch": 0.8824055351517902, + "grad_norm": 0.0936664566397667, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 64150 + }, + { + "epoch": 0.8830933025213552, + "grad_norm": 0.08812412619590759, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 64200 + }, + { + "epoch": 0.8837810698909201, + "grad_norm": 0.10116361826658249, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 64250 + }, + { + "epoch": 0.884468837260485, + "grad_norm": 0.09832023084163666, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 64300 + }, + { + "epoch": 0.88515660463005, + "grad_norm": 0.10003996640443802, + "learning_rate": 0.0001, + "loss": 1.6268, + "step": 64350 + }, + { + "epoch": 0.8858443719996149, + "grad_norm": 0.10371046513319016, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 64400 + }, + { + "epoch": 0.8865321393691797, + "grad_norm": 0.11491281539201736, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 64450 + }, + { + "epoch": 0.8872199067387447, + "grad_norm": 0.10414805263280869, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 64500 + }, + { + "epoch": 0.8879076741083096, + "grad_norm": 0.09586089849472046, + "learning_rate": 0.0001, + "loss": 1.6254, + "step": 64550 + }, + { + "epoch": 0.8885954414778745, + "grad_norm": 0.09846927970647812, + "learning_rate": 0.0001, + "loss": 1.627, + "step": 64600 + }, + { + "epoch": 0.8892832088474394, + "grad_norm": 0.09198208153247833, + "learning_rate": 0.0001, + "loss": 1.6253, + "step": 64650 + }, + { + "epoch": 0.8899709762170044, + "grad_norm": 0.08919122070074081, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 64700 + }, + { + "epoch": 0.8906587435865693, + "grad_norm": 0.10351312905550003, + "learning_rate": 0.0001, + "loss": 1.6259, + "step": 64750 + }, + { + "epoch": 0.8913465109561342, + "grad_norm": 0.10056328028440475, + "learning_rate": 0.0001, + "loss": 1.6242, + "step": 64800 + }, + { + "epoch": 0.8920342783256991, + "grad_norm": 0.11205385625362396, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 64850 + }, + { + "epoch": 0.892722045695264, + "grad_norm": 0.11789285391569138, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 64900 + }, + { + "epoch": 0.8934098130648289, + "grad_norm": 0.09621193259954453, + "learning_rate": 0.0001, + "loss": 1.629, + "step": 64950 + }, + { + "epoch": 0.8940975804343939, + "grad_norm": 0.11940276622772217, + "learning_rate": 0.0001, + "loss": 1.6276, + "step": 65000 + }, + { + "epoch": 0.8947853478039588, + "grad_norm": 0.0920642614364624, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 65050 + }, + { + "epoch": 0.8954731151735237, + "grad_norm": 0.11501994729042053, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 65100 + }, + { + "epoch": 0.8961608825430887, + "grad_norm": 0.09871424734592438, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 65150 + }, + { + "epoch": 0.8968486499126536, + "grad_norm": 0.11804469674825668, + "learning_rate": 0.0001, + "loss": 1.622, + "step": 65200 + }, + { + "epoch": 0.8975364172822184, + "grad_norm": 0.1077907532453537, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 65250 + }, + { + "epoch": 0.8982241846517833, + "grad_norm": 0.1031891405582428, + "learning_rate": 0.0001, + "loss": 1.6224, + "step": 65300 + }, + { + "epoch": 0.8989119520213483, + "grad_norm": 0.09871700406074524, + "learning_rate": 0.0001, + "loss": 1.6263, + "step": 65350 + }, + { + "epoch": 0.8995997193909132, + "grad_norm": 0.11097069084644318, + "learning_rate": 0.0001, + "loss": 1.6231, + "step": 65400 + }, + { + "epoch": 0.9002874867604781, + "grad_norm": 0.10109886527061462, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 65450 + }, + { + "epoch": 0.9009752541300431, + "grad_norm": 0.09359996765851974, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 65500 + }, + { + "epoch": 0.901663021499608, + "grad_norm": 0.09548566490411758, + "learning_rate": 0.0001, + "loss": 1.6262, + "step": 65550 + }, + { + "epoch": 0.9023507888691729, + "grad_norm": 0.09799979627132416, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 65600 + }, + { + "epoch": 0.9030385562387379, + "grad_norm": 0.10698045790195465, + "learning_rate": 0.0001, + "loss": 1.6245, + "step": 65650 + }, + { + "epoch": 0.9037263236083027, + "grad_norm": 0.11245743930339813, + "learning_rate": 0.0001, + "loss": 1.6265, + "step": 65700 + }, + { + "epoch": 0.9044140909778676, + "grad_norm": 0.0961054340004921, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 65750 + }, + { + "epoch": 0.9051018583474326, + "grad_norm": 0.10733941197395325, + "learning_rate": 0.0001, + "loss": 1.6311, + "step": 65800 + }, + { + "epoch": 0.9057896257169975, + "grad_norm": 0.11429203301668167, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 65850 + }, + { + "epoch": 0.9064773930865624, + "grad_norm": 0.10942267626523972, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 65900 + }, + { + "epoch": 0.9071651604561273, + "grad_norm": 0.11220332235097885, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 65950 + }, + { + "epoch": 0.9078529278256923, + "grad_norm": 0.10204115509986877, + "learning_rate": 0.0001, + "loss": 1.6238, + "step": 66000 + }, + { + "epoch": 0.9085406951952572, + "grad_norm": 0.09927280992269516, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 66050 + }, + { + "epoch": 0.909228462564822, + "grad_norm": 0.10095991939306259, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 66100 + }, + { + "epoch": 0.909916229934387, + "grad_norm": 0.0866779088973999, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 66150 + }, + { + "epoch": 0.9106039973039519, + "grad_norm": 0.10905203968286514, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 66200 + }, + { + "epoch": 0.9112917646735168, + "grad_norm": 0.08728614449501038, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 66250 + }, + { + "epoch": 0.9119795320430818, + "grad_norm": 0.11857550591230392, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 66300 + }, + { + "epoch": 0.9126672994126467, + "grad_norm": 0.11594309657812119, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 66350 + }, + { + "epoch": 0.9133550667822116, + "grad_norm": 0.09083879739046097, + "learning_rate": 0.0001, + "loss": 1.6239, + "step": 66400 + }, + { + "epoch": 0.9140428341517765, + "grad_norm": 0.10801181942224503, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 66450 + }, + { + "epoch": 0.9147306015213414, + "grad_norm": 0.09474363178014755, + "learning_rate": 0.0001, + "loss": 1.6207, + "step": 66500 + }, + { + "epoch": 0.9154183688909063, + "grad_norm": 0.11899229139089584, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 66550 + }, + { + "epoch": 0.9161061362604712, + "grad_norm": 0.09917139261960983, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 66600 + }, + { + "epoch": 0.9167939036300362, + "grad_norm": 0.09271197021007538, + "learning_rate": 0.0001, + "loss": 1.616, + "step": 66650 + }, + { + "epoch": 0.9174816709996011, + "grad_norm": 0.10327056050300598, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 66700 + }, + { + "epoch": 0.918169438369166, + "grad_norm": 0.10798952728509903, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 66750 + }, + { + "epoch": 0.918857205738731, + "grad_norm": 0.09382636100053787, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 66800 + }, + { + "epoch": 0.9195449731082959, + "grad_norm": 0.09712707996368408, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 66850 + }, + { + "epoch": 0.9202327404778607, + "grad_norm": 0.09838785231113434, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 66900 + }, + { + "epoch": 0.9209205078474257, + "grad_norm": 0.09992516040802002, + "learning_rate": 0.0001, + "loss": 1.6248, + "step": 66950 + }, + { + "epoch": 0.9216082752169906, + "grad_norm": 0.10132128745317459, + "learning_rate": 0.0001, + "loss": 1.6241, + "step": 67000 + }, + { + "epoch": 0.9222960425865555, + "grad_norm": 0.11531495302915573, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 67050 + }, + { + "epoch": 0.9229838099561204, + "grad_norm": 0.1062740609049797, + "learning_rate": 0.0001, + "loss": 1.6244, + "step": 67100 + }, + { + "epoch": 0.9236715773256854, + "grad_norm": 0.10253230482339859, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 67150 + }, + { + "epoch": 0.9243593446952503, + "grad_norm": 0.11413050442934036, + "learning_rate": 0.0001, + "loss": 1.6257, + "step": 67200 + }, + { + "epoch": 0.9250471120648152, + "grad_norm": 0.09013490378856659, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 67250 + }, + { + "epoch": 0.9257348794343802, + "grad_norm": 0.11905354261398315, + "learning_rate": 0.0001, + "loss": 1.6247, + "step": 67300 + }, + { + "epoch": 0.926422646803945, + "grad_norm": 0.11970172822475433, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 67350 + }, + { + "epoch": 0.9271104141735099, + "grad_norm": 0.0958150252699852, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 67400 + }, + { + "epoch": 0.9277981815430749, + "grad_norm": 0.11688381433486938, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 67450 + }, + { + "epoch": 0.9284859489126398, + "grad_norm": 0.09815829992294312, + "learning_rate": 0.0001, + "loss": 1.6292, + "step": 67500 + }, + { + "epoch": 0.9291737162822047, + "grad_norm": 0.10201198607683182, + "learning_rate": 0.0001, + "loss": 1.6211, + "step": 67550 + }, + { + "epoch": 0.9298614836517696, + "grad_norm": 0.12186852842569351, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 67600 + }, + { + "epoch": 0.9305492510213346, + "grad_norm": 0.1051778495311737, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 67650 + }, + { + "epoch": 0.9312370183908995, + "grad_norm": 0.09837380796670914, + "learning_rate": 0.0001, + "loss": 1.6181, + "step": 67700 + }, + { + "epoch": 0.9319247857604643, + "grad_norm": 0.10912145674228668, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 67750 + }, + { + "epoch": 0.9326125531300293, + "grad_norm": 0.10964906960725784, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 67800 + }, + { + "epoch": 0.9333003204995942, + "grad_norm": 0.11275553703308105, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 67850 + }, + { + "epoch": 0.9339880878691591, + "grad_norm": 0.1131332740187645, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 67900 + }, + { + "epoch": 0.9346758552387241, + "grad_norm": 0.09700637310743332, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 67950 + }, + { + "epoch": 0.935363622608289, + "grad_norm": 0.10155003517866135, + "learning_rate": 0.0001, + "loss": 1.6232, + "step": 68000 + }, + { + "epoch": 0.9360513899778539, + "grad_norm": 0.09441974759101868, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 68050 + }, + { + "epoch": 0.9367391573474189, + "grad_norm": 0.10332616418600082, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 68100 + }, + { + "epoch": 0.9374269247169837, + "grad_norm": 0.10507716983556747, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 68150 + }, + { + "epoch": 0.9381146920865486, + "grad_norm": 0.12356195598840714, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 68200 + }, + { + "epoch": 0.9388024594561135, + "grad_norm": 0.09743137657642365, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 68250 + }, + { + "epoch": 0.9394902268256785, + "grad_norm": 0.09538557380437851, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 68300 + }, + { + "epoch": 0.9401779941952434, + "grad_norm": 0.10625521838665009, + "learning_rate": 0.0001, + "loss": 1.6175, + "step": 68350 + }, + { + "epoch": 0.9408657615648083, + "grad_norm": 0.09300932288169861, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 68400 + }, + { + "epoch": 0.9415535289343733, + "grad_norm": 0.10895366966724396, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 68450 + }, + { + "epoch": 0.9422412963039382, + "grad_norm": 0.09512270987033844, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 68500 + }, + { + "epoch": 0.942929063673503, + "grad_norm": 0.09526942670345306, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 68550 + }, + { + "epoch": 0.943616831043068, + "grad_norm": 0.10735663026571274, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 68600 + }, + { + "epoch": 0.9443045984126329, + "grad_norm": 0.10702252388000488, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 68650 + }, + { + "epoch": 0.9449923657821978, + "grad_norm": 0.12600167095661163, + "learning_rate": 0.0001, + "loss": 1.6183, + "step": 68700 + }, + { + "epoch": 0.9456801331517628, + "grad_norm": 0.10086681693792343, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 68750 + }, + { + "epoch": 0.9463679005213277, + "grad_norm": 0.10467182099819183, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 68800 + }, + { + "epoch": 0.9470556678908926, + "grad_norm": 0.10897420346736908, + "learning_rate": 0.0001, + "loss": 1.621, + "step": 68850 + }, + { + "epoch": 0.9477434352604575, + "grad_norm": 0.11620637029409409, + "learning_rate": 0.0001, + "loss": 1.6191, + "step": 68900 + }, + { + "epoch": 0.9484312026300225, + "grad_norm": 0.1035158559679985, + "learning_rate": 0.0001, + "loss": 1.6221, + "step": 68950 + }, + { + "epoch": 0.9491189699995873, + "grad_norm": 0.09212500602006912, + "learning_rate": 0.0001, + "loss": 1.62, + "step": 69000 + }, + { + "epoch": 0.9498067373691522, + "grad_norm": 0.10910465568304062, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 69050 + }, + { + "epoch": 0.9504945047387172, + "grad_norm": 0.09782394766807556, + "learning_rate": 0.0001, + "loss": 1.6205, + "step": 69100 + }, + { + "epoch": 0.9511822721082821, + "grad_norm": 0.11514250189065933, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 69150 + }, + { + "epoch": 0.951870039477847, + "grad_norm": 0.10694628953933716, + "learning_rate": 0.0001, + "loss": 1.6204, + "step": 69200 + }, + { + "epoch": 0.952557806847412, + "grad_norm": 0.11289383471012115, + "learning_rate": 0.0001, + "loss": 1.6208, + "step": 69250 + }, + { + "epoch": 0.9532455742169769, + "grad_norm": 0.11638884246349335, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 69300 + }, + { + "epoch": 0.9539333415865417, + "grad_norm": 0.09971887618303299, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 69350 + }, + { + "epoch": 0.9546211089561066, + "grad_norm": 0.09802468121051788, + "learning_rate": 0.0001, + "loss": 1.6177, + "step": 69400 + }, + { + "epoch": 0.9553088763256716, + "grad_norm": 0.10239792615175247, + "learning_rate": 0.0001, + "loss": 1.6176, + "step": 69450 + }, + { + "epoch": 0.9559966436952365, + "grad_norm": 0.10659025609493256, + "learning_rate": 0.0001, + "loss": 1.6243, + "step": 69500 + }, + { + "epoch": 0.9566844110648014, + "grad_norm": 0.09827960282564163, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 69550 + }, + { + "epoch": 0.9573721784343664, + "grad_norm": 0.11827285587787628, + "learning_rate": 0.0001, + "loss": 1.6255, + "step": 69600 + }, + { + "epoch": 0.9580599458039313, + "grad_norm": 0.10839015990495682, + "learning_rate": 0.0001, + "loss": 1.6197, + "step": 69650 + }, + { + "epoch": 0.9587477131734962, + "grad_norm": 0.10128850489854813, + "learning_rate": 0.0001, + "loss": 1.6233, + "step": 69700 + }, + { + "epoch": 0.9594354805430612, + "grad_norm": 0.0996512919664383, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 69750 + }, + { + "epoch": 0.960123247912626, + "grad_norm": 0.11462216824293137, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 69800 + }, + { + "epoch": 0.9608110152821909, + "grad_norm": 0.10932575911283493, + "learning_rate": 0.0001, + "loss": 1.6199, + "step": 69850 + }, + { + "epoch": 0.9614987826517559, + "grad_norm": 0.12074451148509979, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 69900 + }, + { + "epoch": 0.9621865500213208, + "grad_norm": 0.10791147500276566, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 69950 + }, + { + "epoch": 0.9628743173908857, + "grad_norm": 0.09193072468042374, + "learning_rate": 0.0001, + "loss": 1.6236, + "step": 70000 + }, + { + "epoch": 0.9635620847604506, + "grad_norm": 0.09834913909435272, + "learning_rate": 0.0001, + "loss": 1.6217, + "step": 70050 + }, + { + "epoch": 0.9642498521300156, + "grad_norm": 0.10776387155056, + "learning_rate": 0.0001, + "loss": 1.6194, + "step": 70100 + }, + { + "epoch": 0.9649376194995805, + "grad_norm": 0.10709933191537857, + "learning_rate": 0.0001, + "loss": 1.623, + "step": 70150 + }, + { + "epoch": 0.9656253868691453, + "grad_norm": 0.09259042143821716, + "learning_rate": 0.0001, + "loss": 1.6195, + "step": 70200 + }, + { + "epoch": 0.9663131542387103, + "grad_norm": 0.09519805014133453, + "learning_rate": 0.0001, + "loss": 1.6216, + "step": 70250 + }, + { + "epoch": 0.9670009216082752, + "grad_norm": 0.10261819511651993, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 70300 + }, + { + "epoch": 0.9676886889778401, + "grad_norm": 0.09229016304016113, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 70350 + }, + { + "epoch": 0.9683764563474051, + "grad_norm": 0.10125798732042313, + "learning_rate": 0.0001, + "loss": 1.6225, + "step": 70400 + }, + { + "epoch": 0.96906422371697, + "grad_norm": 0.0942578986287117, + "learning_rate": 0.0001, + "loss": 1.6192, + "step": 70450 + }, + { + "epoch": 0.9697519910865349, + "grad_norm": 0.09841769933700562, + "learning_rate": 0.0001, + "loss": 1.6258, + "step": 70500 + }, + { + "epoch": 0.9704397584560998, + "grad_norm": 0.10628259927034378, + "learning_rate": 0.0001, + "loss": 1.6226, + "step": 70550 + }, + { + "epoch": 0.9711275258256648, + "grad_norm": 0.10877252370119095, + "learning_rate": 0.0001, + "loss": 1.6203, + "step": 70600 + }, + { + "epoch": 0.9718152931952296, + "grad_norm": 0.1018475666642189, + "learning_rate": 0.0001, + "loss": 1.6212, + "step": 70650 + }, + { + "epoch": 0.9725030605647945, + "grad_norm": 0.09722470492124557, + "learning_rate": 0.0001, + "loss": 1.6228, + "step": 70700 + }, + { + "epoch": 0.9731908279343595, + "grad_norm": 0.10455095767974854, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 70750 + }, + { + "epoch": 0.9738785953039244, + "grad_norm": 0.0981382504105568, + "learning_rate": 0.0001, + "loss": 1.6206, + "step": 70800 + }, + { + "epoch": 0.9745663626734893, + "grad_norm": 0.09375258535146713, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 70850 + }, + { + "epoch": 0.9752541300430543, + "grad_norm": 0.11002162843942642, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 70900 + }, + { + "epoch": 0.9759418974126192, + "grad_norm": 0.09439252316951752, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 70950 + }, + { + "epoch": 0.976629664782184, + "grad_norm": 0.08569147437810898, + "learning_rate": 0.0001, + "loss": 1.6218, + "step": 71000 + }, + { + "epoch": 0.977317432151749, + "grad_norm": 0.10502757132053375, + "learning_rate": 0.0001, + "loss": 1.619, + "step": 71050 + }, + { + "epoch": 0.9780051995213139, + "grad_norm": 0.09755092114210129, + "learning_rate": 0.0001, + "loss": 1.6179, + "step": 71100 + }, + { + "epoch": 0.9786929668908788, + "grad_norm": 0.09338153153657913, + "learning_rate": 0.0001, + "loss": 1.6235, + "step": 71150 + }, + { + "epoch": 0.9793807342604437, + "grad_norm": 0.13502080738544464, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 71200 + }, + { + "epoch": 0.9800685016300087, + "grad_norm": 0.10013073682785034, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 71250 + }, + { + "epoch": 0.9807562689995736, + "grad_norm": 0.0925469771027565, + "learning_rate": 0.0001, + "loss": 1.6171, + "step": 71300 + }, + { + "epoch": 0.9814440363691385, + "grad_norm": 0.09714153409004211, + "learning_rate": 0.0001, + "loss": 1.6193, + "step": 71350 + }, + { + "epoch": 0.9821318037387035, + "grad_norm": 0.09583668410778046, + "learning_rate": 0.0001, + "loss": 1.6189, + "step": 71400 + }, + { + "epoch": 0.9828195711082683, + "grad_norm": 0.10106085985898972, + "learning_rate": 0.0001, + "loss": 1.6215, + "step": 71450 + }, + { + "epoch": 0.9835073384778332, + "grad_norm": 0.122016541659832, + "learning_rate": 0.0001, + "loss": 1.6184, + "step": 71500 + }, + { + "epoch": 0.9841951058473982, + "grad_norm": 0.11579859256744385, + "learning_rate": 0.0001, + "loss": 1.6214, + "step": 71550 + }, + { + "epoch": 0.9848828732169631, + "grad_norm": 0.1063709408044815, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 71600 + }, + { + "epoch": 0.985570640586528, + "grad_norm": 0.10856927186250687, + "learning_rate": 0.0001, + "loss": 1.6234, + "step": 71650 + }, + { + "epoch": 0.986258407956093, + "grad_norm": 0.10665950924158096, + "learning_rate": 0.0001, + "loss": 1.6162, + "step": 71700 + }, + { + "epoch": 0.9869461753256579, + "grad_norm": 0.10389940440654755, + "learning_rate": 0.0001, + "loss": 1.6186, + "step": 71750 + }, + { + "epoch": 0.9876339426952228, + "grad_norm": 0.09887974709272385, + "learning_rate": 0.0001, + "loss": 1.6178, + "step": 71800 + }, + { + "epoch": 0.9883217100647876, + "grad_norm": 0.0978117361664772, + "learning_rate": 0.0001, + "loss": 1.6213, + "step": 71850 + }, + { + "epoch": 0.9890094774343526, + "grad_norm": 0.09448680281639099, + "learning_rate": 0.0001, + "loss": 1.6229, + "step": 71900 + }, + { + "epoch": 0.9896972448039175, + "grad_norm": 0.10754861682653427, + "learning_rate": 0.0001, + "loss": 1.6185, + "step": 71950 + }, + { + "epoch": 0.9903850121734824, + "grad_norm": 0.11369004845619202, + "learning_rate": 0.0001, + "loss": 1.6256, + "step": 72000 + }, + { + "epoch": 0.9910727795430474, + "grad_norm": 0.10211578011512756, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 72050 + }, + { + "epoch": 0.9917605469126123, + "grad_norm": 0.09252225607633591, + "learning_rate": 0.0001, + "loss": 1.6202, + "step": 72100 + }, + { + "epoch": 0.9924483142821772, + "grad_norm": 0.11989615112543106, + "learning_rate": 0.0001, + "loss": 1.624, + "step": 72150 + }, + { + "epoch": 0.9931360816517422, + "grad_norm": 0.09846003353595734, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 72200 + }, + { + "epoch": 0.993823849021307, + "grad_norm": 0.11069589853286743, + "learning_rate": 0.0001, + "loss": 1.6173, + "step": 72250 + }, + { + "epoch": 0.9945116163908719, + "grad_norm": 0.10039959102869034, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 72300 + }, + { + "epoch": 0.9951993837604368, + "grad_norm": 0.1280982345342636, + "learning_rate": 0.0001, + "loss": 1.6163, + "step": 72350 + }, + { + "epoch": 0.9958871511300018, + "grad_norm": 0.11330771446228027, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 72400 + }, + { + "epoch": 0.9965749184995667, + "grad_norm": 0.12211450189352036, + "learning_rate": 0.0001, + "loss": 1.6222, + "step": 72450 + }, + { + "epoch": 0.9972626858691316, + "grad_norm": 0.12727168202400208, + "learning_rate": 0.0001, + "loss": 1.6219, + "step": 72500 + }, + { + "epoch": 0.9979504532386966, + "grad_norm": 0.10834234952926636, + "learning_rate": 0.0001, + "loss": 1.6196, + "step": 72550 + }, + { + "epoch": 0.9986382206082615, + "grad_norm": 0.12034707516431808, + "learning_rate": 0.0001, + "loss": 1.6198, + "step": 72600 + }, + { + "epoch": 0.9993259879778263, + "grad_norm": 0.10049953311681747, + "learning_rate": 0.0001, + "loss": 1.6209, + "step": 72650 + } + ], + "logging_steps": 50, + "max_steps": 72699, + "num_input_tokens_seen": 0, + "num_train_epochs": 1, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 5.156983764564035e+21, + "train_batch_size": 4, + "trial_name": null, + "trial_params": null +}