{ "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999499081649691, "eval_steps": 500, "global_step": 8982, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.0003339455668725998, "grad_norm": 12.92171003192369, "learning_rate": 1.1123470522803115e-08, "loss": 0.2988, "step": 1 }, { "epoch": 0.0006678911337451996, "grad_norm": 13.956338815299878, "learning_rate": 2.224694104560623e-08, "loss": 0.308, "step": 2 }, { "epoch": 0.0010018367006177993, "grad_norm": 12.86023641194538, "learning_rate": 3.337041156840935e-08, "loss": 0.2821, "step": 3 }, { "epoch": 0.0013357822674903992, "grad_norm": 13.913511855437857, "learning_rate": 4.449388209121246e-08, "loss": 0.2781, "step": 4 }, { "epoch": 0.0016697278343629988, "grad_norm": 13.699199844473183, "learning_rate": 5.561735261401558e-08, "loss": 0.3071, "step": 5 }, { "epoch": 0.0020036734012355987, "grad_norm": 14.10327817555655, "learning_rate": 6.67408231368187e-08, "loss": 0.3214, "step": 6 }, { "epoch": 0.0023376189681081983, "grad_norm": 12.536899972256993, "learning_rate": 7.78642936596218e-08, "loss": 0.2617, "step": 7 }, { "epoch": 0.0026715645349807983, "grad_norm": 12.888187017193689, "learning_rate": 8.898776418242492e-08, "loss": 0.2894, "step": 8 }, { "epoch": 0.003005510101853398, "grad_norm": 12.454584884182633, "learning_rate": 1.0011123470522804e-07, "loss": 0.2908, "step": 9 }, { "epoch": 0.0033394556687259976, "grad_norm": 12.00546599592856, "learning_rate": 1.1123470522803116e-07, "loss": 0.2737, "step": 10 }, { "epoch": 0.0036734012355985972, "grad_norm": 12.881443540473851, "learning_rate": 1.2235817575083427e-07, "loss": 0.278, "step": 11 }, { "epoch": 0.004007346802471197, "grad_norm": 13.509260575004946, "learning_rate": 1.334816462736374e-07, "loss": 0.3083, "step": 12 }, { "epoch": 0.004341292369343797, "grad_norm": 12.995692303974334, "learning_rate": 1.446051167964405e-07, "loss": 0.3051, "step": 13 }, { "epoch": 0.0046752379362163966, "grad_norm": 12.482273819850871, "learning_rate": 1.557285873192436e-07, "loss": 0.2637, "step": 14 }, { "epoch": 0.005009183503088996, "grad_norm": 12.386935079831533, "learning_rate": 1.6685205784204674e-07, "loss": 0.2804, "step": 15 }, { "epoch": 0.005343129069961597, "grad_norm": 12.819259388755173, "learning_rate": 1.7797552836484985e-07, "loss": 0.2941, "step": 16 }, { "epoch": 0.005677074636834196, "grad_norm": 11.912184553214447, "learning_rate": 1.8909899888765295e-07, "loss": 0.2928, "step": 17 }, { "epoch": 0.006011020203706796, "grad_norm": 12.941830493445417, "learning_rate": 2.0022246941045608e-07, "loss": 0.3081, "step": 18 }, { "epoch": 0.006344965770579396, "grad_norm": 11.406373410492941, "learning_rate": 2.113459399332592e-07, "loss": 0.2819, "step": 19 }, { "epoch": 0.006678911337451995, "grad_norm": 11.061731469121861, "learning_rate": 2.2246941045606232e-07, "loss": 0.228, "step": 20 }, { "epoch": 0.007012856904324595, "grad_norm": 7.802127624417816, "learning_rate": 2.3359288097886543e-07, "loss": 0.2066, "step": 21 }, { "epoch": 0.0073468024711971945, "grad_norm": 8.587928584552232, "learning_rate": 2.4471635150166853e-07, "loss": 0.2391, "step": 22 }, { "epoch": 0.007680748038069795, "grad_norm": 8.077867973647216, "learning_rate": 2.5583982202447166e-07, "loss": 0.2279, "step": 23 }, { "epoch": 0.008014693604942395, "grad_norm": 8.738395787944595, "learning_rate": 2.669632925472748e-07, "loss": 0.2266, "step": 24 }, { "epoch": 0.008348639171814994, "grad_norm": 9.149113683182511, "learning_rate": 2.780867630700779e-07, "loss": 0.2537, "step": 25 }, { "epoch": 0.008682584738687594, "grad_norm": 9.020287772281916, "learning_rate": 2.89210233592881e-07, "loss": 0.255, "step": 26 }, { "epoch": 0.009016530305560193, "grad_norm": 7.417370767206343, "learning_rate": 3.003337041156841e-07, "loss": 0.2038, "step": 27 }, { "epoch": 0.009350475872432793, "grad_norm": 5.316597345079005, "learning_rate": 3.114571746384872e-07, "loss": 0.1992, "step": 28 }, { "epoch": 0.009684421439305393, "grad_norm": 4.27986688396643, "learning_rate": 3.2258064516129035e-07, "loss": 0.1671, "step": 29 }, { "epoch": 0.010018367006177992, "grad_norm": 4.624509032720886, "learning_rate": 3.337041156840935e-07, "loss": 0.1496, "step": 30 }, { "epoch": 0.010352312573050592, "grad_norm": 3.7335787807472025, "learning_rate": 3.4482758620689656e-07, "loss": 0.1196, "step": 31 }, { "epoch": 0.010686258139923193, "grad_norm": 3.732271378852131, "learning_rate": 3.559510567296997e-07, "loss": 0.1486, "step": 32 }, { "epoch": 0.011020203706795793, "grad_norm": 3.8306272781057356, "learning_rate": 3.670745272525028e-07, "loss": 0.1327, "step": 33 }, { "epoch": 0.011354149273668393, "grad_norm": 3.4893370876395227, "learning_rate": 3.781979977753059e-07, "loss": 0.1839, "step": 34 }, { "epoch": 0.011688094840540992, "grad_norm": 3.8733495197080647, "learning_rate": 3.8932146829810904e-07, "loss": 0.1686, "step": 35 }, { "epoch": 0.012022040407413592, "grad_norm": 2.7736882693641483, "learning_rate": 4.0044493882091217e-07, "loss": 0.1284, "step": 36 }, { "epoch": 0.012355985974286192, "grad_norm": 3.1365209481113356, "learning_rate": 4.115684093437153e-07, "loss": 0.1736, "step": 37 }, { "epoch": 0.012689931541158791, "grad_norm": 2.5256497657985255, "learning_rate": 4.226918798665184e-07, "loss": 0.138, "step": 38 }, { "epoch": 0.01302387710803139, "grad_norm": 2.71639770957255, "learning_rate": 4.338153503893215e-07, "loss": 0.1431, "step": 39 }, { "epoch": 0.01335782267490399, "grad_norm": 2.00008767091192, "learning_rate": 4.4493882091212464e-07, "loss": 0.1214, "step": 40 }, { "epoch": 0.01369176824177659, "grad_norm": 2.2089314405521474, "learning_rate": 4.560622914349278e-07, "loss": 0.1226, "step": 41 }, { "epoch": 0.01402571380864919, "grad_norm": 1.911269906505354, "learning_rate": 4.6718576195773085e-07, "loss": 0.1284, "step": 42 }, { "epoch": 0.01435965937552179, "grad_norm": 1.8689118089163397, "learning_rate": 4.783092324805339e-07, "loss": 0.1367, "step": 43 }, { "epoch": 0.014693604942394389, "grad_norm": 1.9664760230436695, "learning_rate": 4.894327030033371e-07, "loss": 0.1411, "step": 44 }, { "epoch": 0.01502755050926699, "grad_norm": 1.7429607986196036, "learning_rate": 5.005561735261402e-07, "loss": 0.1191, "step": 45 }, { "epoch": 0.01536149607613959, "grad_norm": 1.4591054195429314, "learning_rate": 5.116796440489433e-07, "loss": 0.0894, "step": 46 }, { "epoch": 0.01569544164301219, "grad_norm": 1.5500253735232825, "learning_rate": 5.228031145717465e-07, "loss": 0.1123, "step": 47 }, { "epoch": 0.01602938720988479, "grad_norm": 1.430119262578493, "learning_rate": 5.339265850945496e-07, "loss": 0.0826, "step": 48 }, { "epoch": 0.01636333277675739, "grad_norm": 1.5162569575731026, "learning_rate": 5.450500556173527e-07, "loss": 0.1104, "step": 49 }, { "epoch": 0.01669727834362999, "grad_norm": 1.8050273480823371, "learning_rate": 5.561735261401558e-07, "loss": 0.1437, "step": 50 }, { "epoch": 0.017031223910502588, "grad_norm": 1.4138690044307227, "learning_rate": 5.672969966629589e-07, "loss": 0.1006, "step": 51 }, { "epoch": 0.017365169477375188, "grad_norm": 2.003451442850109, "learning_rate": 5.78420467185762e-07, "loss": 0.1202, "step": 52 }, { "epoch": 0.017699115044247787, "grad_norm": 1.9150778054568744, "learning_rate": 5.89543937708565e-07, "loss": 0.1368, "step": 53 }, { "epoch": 0.018033060611120387, "grad_norm": 1.420482579614073, "learning_rate": 6.006674082313682e-07, "loss": 0.1135, "step": 54 }, { "epoch": 0.018367006177992987, "grad_norm": 1.6619813862888555, "learning_rate": 6.117908787541713e-07, "loss": 0.1098, "step": 55 }, { "epoch": 0.018700951744865586, "grad_norm": 1.8039097675422715, "learning_rate": 6.229143492769744e-07, "loss": 0.1182, "step": 56 }, { "epoch": 0.019034897311738186, "grad_norm": 1.9251360126472772, "learning_rate": 6.340378197997777e-07, "loss": 0.1229, "step": 57 }, { "epoch": 0.019368842878610786, "grad_norm": 1.646127653594256, "learning_rate": 6.451612903225807e-07, "loss": 0.0908, "step": 58 }, { "epoch": 0.019702788445483385, "grad_norm": 1.8096753461486952, "learning_rate": 6.562847608453838e-07, "loss": 0.0902, "step": 59 }, { "epoch": 0.020036734012355985, "grad_norm": 1.3257550156504097, "learning_rate": 6.67408231368187e-07, "loss": 0.0927, "step": 60 }, { "epoch": 0.020370679579228584, "grad_norm": 1.4611566503899422, "learning_rate": 6.785317018909901e-07, "loss": 0.0949, "step": 61 }, { "epoch": 0.020704625146101184, "grad_norm": 1.4170363748007948, "learning_rate": 6.896551724137931e-07, "loss": 0.0756, "step": 62 }, { "epoch": 0.021038570712973784, "grad_norm": 1.5649016246749317, "learning_rate": 7.007786429365964e-07, "loss": 0.1167, "step": 63 }, { "epoch": 0.021372516279846387, "grad_norm": 1.7223755363490967, "learning_rate": 7.119021134593994e-07, "loss": 0.0914, "step": 64 }, { "epoch": 0.021706461846718986, "grad_norm": 2.264053959910125, "learning_rate": 7.230255839822026e-07, "loss": 0.1022, "step": 65 }, { "epoch": 0.022040407413591586, "grad_norm": 1.2864993126473974, "learning_rate": 7.341490545050057e-07, "loss": 0.1053, "step": 66 }, { "epoch": 0.022374352980464186, "grad_norm": 1.44023472678529, "learning_rate": 7.452725250278087e-07, "loss": 0.0957, "step": 67 }, { "epoch": 0.022708298547336785, "grad_norm": 1.6488839104515984, "learning_rate": 7.563959955506118e-07, "loss": 0.1114, "step": 68 }, { "epoch": 0.023042244114209385, "grad_norm": 1.4538025741923084, "learning_rate": 7.675194660734149e-07, "loss": 0.108, "step": 69 }, { "epoch": 0.023376189681081985, "grad_norm": 1.5002627715774677, "learning_rate": 7.786429365962181e-07, "loss": 0.1141, "step": 70 }, { "epoch": 0.023710135247954584, "grad_norm": 1.586495576761618, "learning_rate": 7.897664071190211e-07, "loss": 0.1098, "step": 71 }, { "epoch": 0.024044080814827184, "grad_norm": 1.1824262761185285, "learning_rate": 8.008898776418243e-07, "loss": 0.0772, "step": 72 }, { "epoch": 0.024378026381699783, "grad_norm": 1.618110947873907, "learning_rate": 8.120133481646274e-07, "loss": 0.1097, "step": 73 }, { "epoch": 0.024711971948572383, "grad_norm": 1.1671719943955008, "learning_rate": 8.231368186874306e-07, "loss": 0.0852, "step": 74 }, { "epoch": 0.025045917515444983, "grad_norm": 1.2830768944527116, "learning_rate": 8.342602892102336e-07, "loss": 0.1165, "step": 75 }, { "epoch": 0.025379863082317582, "grad_norm": 1.4018712313176183, "learning_rate": 8.453837597330368e-07, "loss": 0.0874, "step": 76 }, { "epoch": 0.025713808649190182, "grad_norm": 1.2432680393980957, "learning_rate": 8.565072302558399e-07, "loss": 0.1088, "step": 77 }, { "epoch": 0.02604775421606278, "grad_norm": 1.3164039971574175, "learning_rate": 8.67630700778643e-07, "loss": 0.0894, "step": 78 }, { "epoch": 0.02638169978293538, "grad_norm": 1.642213217588854, "learning_rate": 8.78754171301446e-07, "loss": 0.0894, "step": 79 }, { "epoch": 0.02671564534980798, "grad_norm": 1.0839090380138228, "learning_rate": 8.898776418242493e-07, "loss": 0.0806, "step": 80 }, { "epoch": 0.02704959091668058, "grad_norm": 1.8952932050331013, "learning_rate": 9.010011123470523e-07, "loss": 0.132, "step": 81 }, { "epoch": 0.02738353648355318, "grad_norm": 1.465175894344485, "learning_rate": 9.121245828698556e-07, "loss": 0.0628, "step": 82 }, { "epoch": 0.02771748205042578, "grad_norm": 1.3366732234313092, "learning_rate": 9.232480533926586e-07, "loss": 0.0863, "step": 83 }, { "epoch": 0.02805142761729838, "grad_norm": 1.643937154471812, "learning_rate": 9.343715239154617e-07, "loss": 0.096, "step": 84 }, { "epoch": 0.02838537318417098, "grad_norm": 1.2570354689622436, "learning_rate": 9.454949944382647e-07, "loss": 0.0948, "step": 85 }, { "epoch": 0.02871931875104358, "grad_norm": 1.5112058267872772, "learning_rate": 9.566184649610679e-07, "loss": 0.1044, "step": 86 }, { "epoch": 0.02905326431791618, "grad_norm": 1.0702555105247553, "learning_rate": 9.67741935483871e-07, "loss": 0.0762, "step": 87 }, { "epoch": 0.029387209884788778, "grad_norm": 1.6750200548468086, "learning_rate": 9.788654060066741e-07, "loss": 0.1282, "step": 88 }, { "epoch": 0.029721155451661378, "grad_norm": 2.2918645416277785, "learning_rate": 9.899888765294773e-07, "loss": 0.0915, "step": 89 }, { "epoch": 0.03005510101853398, "grad_norm": 2.242498528106184, "learning_rate": 1.0011123470522804e-06, "loss": 0.1077, "step": 90 }, { "epoch": 0.03038904658540658, "grad_norm": 1.6529779325988916, "learning_rate": 1.0122358175750835e-06, "loss": 0.0981, "step": 91 }, { "epoch": 0.03072299215227918, "grad_norm": 1.6048070404143326, "learning_rate": 1.0233592880978867e-06, "loss": 0.0823, "step": 92 }, { "epoch": 0.03105693771915178, "grad_norm": 1.6293330212334107, "learning_rate": 1.0344827586206898e-06, "loss": 0.1174, "step": 93 }, { "epoch": 0.03139088328602438, "grad_norm": 1.4328639917912847, "learning_rate": 1.045606229143493e-06, "loss": 0.1005, "step": 94 }, { "epoch": 0.03172482885289698, "grad_norm": 1.2582858682128226, "learning_rate": 1.056729699666296e-06, "loss": 0.1019, "step": 95 }, { "epoch": 0.03205877441976958, "grad_norm": 1.2746152228652043, "learning_rate": 1.0678531701890992e-06, "loss": 0.0946, "step": 96 }, { "epoch": 0.03239271998664218, "grad_norm": 1.39449403536346, "learning_rate": 1.0789766407119021e-06, "loss": 0.0739, "step": 97 }, { "epoch": 0.03272666555351478, "grad_norm": 1.3423087237113123, "learning_rate": 1.0901001112347055e-06, "loss": 0.0728, "step": 98 }, { "epoch": 0.03306061112038738, "grad_norm": 1.6223819048746644, "learning_rate": 1.1012235817575084e-06, "loss": 0.1153, "step": 99 }, { "epoch": 0.03339455668725998, "grad_norm": 1.3427962365052586, "learning_rate": 1.1123470522803115e-06, "loss": 0.0835, "step": 100 }, { "epoch": 0.03372850225413258, "grad_norm": 1.149441339699243, "learning_rate": 1.1234705228031146e-06, "loss": 0.0849, "step": 101 }, { "epoch": 0.034062447821005176, "grad_norm": 1.5347477678101182, "learning_rate": 1.1345939933259178e-06, "loss": 0.0949, "step": 102 }, { "epoch": 0.034396393387877776, "grad_norm": 1.5538378922417313, "learning_rate": 1.145717463848721e-06, "loss": 0.1102, "step": 103 }, { "epoch": 0.034730338954750375, "grad_norm": 1.6668690527736754, "learning_rate": 1.156840934371524e-06, "loss": 0.0914, "step": 104 }, { "epoch": 0.035064284521622975, "grad_norm": 1.2103230671109944, "learning_rate": 1.1679644048943272e-06, "loss": 0.0626, "step": 105 }, { "epoch": 0.035398230088495575, "grad_norm": 1.8764172669861614, "learning_rate": 1.17908787541713e-06, "loss": 0.0691, "step": 106 }, { "epoch": 0.035732175655368174, "grad_norm": 1.722910145928535, "learning_rate": 1.1902113459399334e-06, "loss": 0.093, "step": 107 }, { "epoch": 0.036066121222240774, "grad_norm": 1.0618234858765518, "learning_rate": 1.2013348164627363e-06, "loss": 0.0761, "step": 108 }, { "epoch": 0.036400066789113374, "grad_norm": 1.3121783082566816, "learning_rate": 1.2124582869855397e-06, "loss": 0.089, "step": 109 }, { "epoch": 0.03673401235598597, "grad_norm": 1.4499363672076466, "learning_rate": 1.2235817575083426e-06, "loss": 0.0804, "step": 110 }, { "epoch": 0.03706795792285857, "grad_norm": 1.2615508501901613, "learning_rate": 1.2347052280311457e-06, "loss": 0.0845, "step": 111 }, { "epoch": 0.03740190348973117, "grad_norm": 0.9940605122797614, "learning_rate": 1.2458286985539489e-06, "loss": 0.0668, "step": 112 }, { "epoch": 0.03773584905660377, "grad_norm": 1.1405073614214674, "learning_rate": 1.256952169076752e-06, "loss": 0.0681, "step": 113 }, { "epoch": 0.03806979462347637, "grad_norm": 1.4030861518429394, "learning_rate": 1.2680756395995554e-06, "loss": 0.0814, "step": 114 }, { "epoch": 0.03840374019034897, "grad_norm": 1.3578007200676907, "learning_rate": 1.2791991101223583e-06, "loss": 0.0895, "step": 115 }, { "epoch": 0.03873768575722157, "grad_norm": 1.0249529431802051, "learning_rate": 1.2903225806451614e-06, "loss": 0.0739, "step": 116 }, { "epoch": 0.03907163132409417, "grad_norm": 1.235415087018068, "learning_rate": 1.3014460511679643e-06, "loss": 0.0824, "step": 117 }, { "epoch": 0.03940557689096677, "grad_norm": 1.3479513963722216, "learning_rate": 1.3125695216907677e-06, "loss": 0.0801, "step": 118 }, { "epoch": 0.03973952245783937, "grad_norm": 1.2754322824182192, "learning_rate": 1.3236929922135708e-06, "loss": 0.1071, "step": 119 }, { "epoch": 0.04007346802471197, "grad_norm": 1.1196937681855994, "learning_rate": 1.334816462736374e-06, "loss": 0.0711, "step": 120 }, { "epoch": 0.04040741359158457, "grad_norm": 1.0251751276579624, "learning_rate": 1.3459399332591769e-06, "loss": 0.0657, "step": 121 }, { "epoch": 0.04074135915845717, "grad_norm": 1.8597822373350466, "learning_rate": 1.3570634037819802e-06, "loss": 0.0991, "step": 122 }, { "epoch": 0.04107530472532977, "grad_norm": 0.8913110658609731, "learning_rate": 1.3681868743047833e-06, "loss": 0.0582, "step": 123 }, { "epoch": 0.04140925029220237, "grad_norm": 1.7385584510532524, "learning_rate": 1.3793103448275862e-06, "loss": 0.114, "step": 124 }, { "epoch": 0.04174319585907497, "grad_norm": 1.1376218320024045, "learning_rate": 1.3904338153503894e-06, "loss": 0.0692, "step": 125 }, { "epoch": 0.04207714142594757, "grad_norm": 1.3062772064746195, "learning_rate": 1.4015572858731927e-06, "loss": 0.0954, "step": 126 }, { "epoch": 0.04241108699282017, "grad_norm": 1.366103399563069, "learning_rate": 1.4126807563959956e-06, "loss": 0.0928, "step": 127 }, { "epoch": 0.042745032559692774, "grad_norm": 1.2246926697415539, "learning_rate": 1.4238042269187988e-06, "loss": 0.0791, "step": 128 }, { "epoch": 0.04307897812656537, "grad_norm": 1.6570991234879342, "learning_rate": 1.434927697441602e-06, "loss": 0.0858, "step": 129 }, { "epoch": 0.04341292369343797, "grad_norm": 1.369880127314859, "learning_rate": 1.4460511679644053e-06, "loss": 0.0833, "step": 130 }, { "epoch": 0.04374686926031057, "grad_norm": 1.206854655306054, "learning_rate": 1.4571746384872082e-06, "loss": 0.0765, "step": 131 }, { "epoch": 0.04408081482718317, "grad_norm": 1.125499057097289, "learning_rate": 1.4682981090100113e-06, "loss": 0.0756, "step": 132 }, { "epoch": 0.04441476039405577, "grad_norm": 1.4059140667007435, "learning_rate": 1.4794215795328142e-06, "loss": 0.0848, "step": 133 }, { "epoch": 0.04474870596092837, "grad_norm": 1.06168619350604, "learning_rate": 1.4905450500556174e-06, "loss": 0.052, "step": 134 }, { "epoch": 0.04508265152780097, "grad_norm": 1.6567637931657537, "learning_rate": 1.5016685205784207e-06, "loss": 0.0992, "step": 135 }, { "epoch": 0.04541659709467357, "grad_norm": 1.920957032614187, "learning_rate": 1.5127919911012236e-06, "loss": 0.1022, "step": 136 }, { "epoch": 0.04575054266154617, "grad_norm": 1.074227017597323, "learning_rate": 1.5239154616240268e-06, "loss": 0.0768, "step": 137 }, { "epoch": 0.04608448822841877, "grad_norm": 1.016076882611769, "learning_rate": 1.5350389321468299e-06, "loss": 0.0695, "step": 138 }, { "epoch": 0.04641843379529137, "grad_norm": 0.9899774375747383, "learning_rate": 1.5461624026696332e-06, "loss": 0.0661, "step": 139 }, { "epoch": 0.04675237936216397, "grad_norm": 1.308714111392643, "learning_rate": 1.5572858731924361e-06, "loss": 0.0605, "step": 140 }, { "epoch": 0.04708632492903657, "grad_norm": 1.02745462441021, "learning_rate": 1.5684093437152393e-06, "loss": 0.0775, "step": 141 }, { "epoch": 0.04742027049590917, "grad_norm": 1.168735536247947, "learning_rate": 1.5795328142380422e-06, "loss": 0.0917, "step": 142 }, { "epoch": 0.04775421606278177, "grad_norm": 1.196785596172984, "learning_rate": 1.5906562847608455e-06, "loss": 0.0797, "step": 143 }, { "epoch": 0.04808816162965437, "grad_norm": 1.1451563981154869, "learning_rate": 1.6017797552836487e-06, "loss": 0.078, "step": 144 }, { "epoch": 0.04842210719652697, "grad_norm": 1.0392222279053414, "learning_rate": 1.6129032258064516e-06, "loss": 0.0793, "step": 145 }, { "epoch": 0.04875605276339957, "grad_norm": 1.0934735024383997, "learning_rate": 1.6240266963292547e-06, "loss": 0.0716, "step": 146 }, { "epoch": 0.049089998330272167, "grad_norm": 1.4519901055252726, "learning_rate": 1.635150166852058e-06, "loss": 0.0757, "step": 147 }, { "epoch": 0.049423943897144766, "grad_norm": 2.2852241619827813, "learning_rate": 1.6462736373748612e-06, "loss": 0.0862, "step": 148 }, { "epoch": 0.049757889464017366, "grad_norm": 1.6408340659482898, "learning_rate": 1.6573971078976641e-06, "loss": 0.0886, "step": 149 }, { "epoch": 0.050091835030889965, "grad_norm": 2.0454946446505016, "learning_rate": 1.6685205784204673e-06, "loss": 0.0772, "step": 150 }, { "epoch": 0.050425780597762565, "grad_norm": 1.2259922787210893, "learning_rate": 1.6796440489432706e-06, "loss": 0.0683, "step": 151 }, { "epoch": 0.050759726164635165, "grad_norm": 1.1562777397964157, "learning_rate": 1.6907675194660735e-06, "loss": 0.0757, "step": 152 }, { "epoch": 0.051093671731507764, "grad_norm": 1.136779453981954, "learning_rate": 1.7018909899888767e-06, "loss": 0.0913, "step": 153 }, { "epoch": 0.051427617298380364, "grad_norm": 1.0030489951613217, "learning_rate": 1.7130144605116798e-06, "loss": 0.0682, "step": 154 }, { "epoch": 0.051761562865252964, "grad_norm": 1.7385357922589744, "learning_rate": 1.724137931034483e-06, "loss": 0.074, "step": 155 }, { "epoch": 0.05209550843212556, "grad_norm": 1.3740643745362002, "learning_rate": 1.735261401557286e-06, "loss": 0.0753, "step": 156 }, { "epoch": 0.05242945399899816, "grad_norm": 1.1583165071055108, "learning_rate": 1.7463848720800892e-06, "loss": 0.0624, "step": 157 }, { "epoch": 0.05276339956587076, "grad_norm": 1.1621428052160172, "learning_rate": 1.757508342602892e-06, "loss": 0.0728, "step": 158 }, { "epoch": 0.05309734513274336, "grad_norm": 0.9935446809403798, "learning_rate": 1.7686318131256954e-06, "loss": 0.064, "step": 159 }, { "epoch": 0.05343129069961596, "grad_norm": 1.0753646446851233, "learning_rate": 1.7797552836484986e-06, "loss": 0.0735, "step": 160 }, { "epoch": 0.05376523626648856, "grad_norm": 1.1635933935840013, "learning_rate": 1.7908787541713015e-06, "loss": 0.0601, "step": 161 }, { "epoch": 0.05409918183336116, "grad_norm": 0.976271537576658, "learning_rate": 1.8020022246941046e-06, "loss": 0.0642, "step": 162 }, { "epoch": 0.05443312740023376, "grad_norm": 1.4114206211723894, "learning_rate": 1.813125695216908e-06, "loss": 0.0934, "step": 163 }, { "epoch": 0.05476707296710636, "grad_norm": 1.237154441559006, "learning_rate": 1.824249165739711e-06, "loss": 0.086, "step": 164 }, { "epoch": 0.05510101853397896, "grad_norm": 1.2253824367688997, "learning_rate": 1.835372636262514e-06, "loss": 0.0865, "step": 165 }, { "epoch": 0.05543496410085156, "grad_norm": 0.8378818554174366, "learning_rate": 1.8464961067853172e-06, "loss": 0.0659, "step": 166 }, { "epoch": 0.05576890966772416, "grad_norm": 0.9223176156027858, "learning_rate": 1.85761957730812e-06, "loss": 0.0741, "step": 167 }, { "epoch": 0.05610285523459676, "grad_norm": 1.3912311372924906, "learning_rate": 1.8687430478309234e-06, "loss": 0.0971, "step": 168 }, { "epoch": 0.05643680080146936, "grad_norm": 1.0181918901777176, "learning_rate": 1.8798665183537266e-06, "loss": 0.0748, "step": 169 }, { "epoch": 0.05677074636834196, "grad_norm": 0.8910063887713932, "learning_rate": 1.8909899888765295e-06, "loss": 0.0625, "step": 170 }, { "epoch": 0.05710469193521456, "grad_norm": 1.3057004674938029, "learning_rate": 1.9021134593993326e-06, "loss": 0.0941, "step": 171 }, { "epoch": 0.05743863750208716, "grad_norm": 1.193105077297629, "learning_rate": 1.9132369299221357e-06, "loss": 0.0643, "step": 172 }, { "epoch": 0.05777258306895976, "grad_norm": 1.3055004936633776, "learning_rate": 1.924360400444939e-06, "loss": 0.0895, "step": 173 }, { "epoch": 0.05810652863583236, "grad_norm": 1.1564338453638052, "learning_rate": 1.935483870967742e-06, "loss": 0.0582, "step": 174 }, { "epoch": 0.058440474202704956, "grad_norm": 1.5341843111867526, "learning_rate": 1.946607341490545e-06, "loss": 0.0936, "step": 175 }, { "epoch": 0.058774419769577556, "grad_norm": 1.4090086836755578, "learning_rate": 1.9577308120133483e-06, "loss": 0.0906, "step": 176 }, { "epoch": 0.059108365336450155, "grad_norm": 0.9351572301997615, "learning_rate": 1.9688542825361514e-06, "loss": 0.0809, "step": 177 }, { "epoch": 0.059442310903322755, "grad_norm": 1.2238183304370427, "learning_rate": 1.9799777530589545e-06, "loss": 0.0839, "step": 178 }, { "epoch": 0.059776256470195355, "grad_norm": 0.7778091378640176, "learning_rate": 1.9911012235817577e-06, "loss": 0.0467, "step": 179 }, { "epoch": 0.06011020203706796, "grad_norm": 1.760420759812158, "learning_rate": 2.002224694104561e-06, "loss": 0.1007, "step": 180 }, { "epoch": 0.06044414760394056, "grad_norm": 1.1970302826818442, "learning_rate": 2.013348164627364e-06, "loss": 0.0835, "step": 181 }, { "epoch": 0.06077809317081316, "grad_norm": 1.2558562706280731, "learning_rate": 2.024471635150167e-06, "loss": 0.0936, "step": 182 }, { "epoch": 0.06111203873768576, "grad_norm": 0.9804679330236008, "learning_rate": 2.03559510567297e-06, "loss": 0.0696, "step": 183 }, { "epoch": 0.06144598430455836, "grad_norm": 1.1097267450527353, "learning_rate": 2.0467185761957733e-06, "loss": 0.0801, "step": 184 }, { "epoch": 0.06177992987143096, "grad_norm": 1.438725958156707, "learning_rate": 2.0578420467185764e-06, "loss": 0.0766, "step": 185 }, { "epoch": 0.06211387543830356, "grad_norm": 1.450645634817868, "learning_rate": 2.0689655172413796e-06, "loss": 0.0605, "step": 186 }, { "epoch": 0.06244782100517616, "grad_norm": 0.9637236859717696, "learning_rate": 2.0800889877641823e-06, "loss": 0.071, "step": 187 }, { "epoch": 0.06278176657204876, "grad_norm": 1.4369154647412303, "learning_rate": 2.091212458286986e-06, "loss": 0.0729, "step": 188 }, { "epoch": 0.06311571213892135, "grad_norm": 1.489682534370634, "learning_rate": 2.102335928809789e-06, "loss": 0.0743, "step": 189 }, { "epoch": 0.06344965770579396, "grad_norm": 0.9133377888567851, "learning_rate": 2.113459399332592e-06, "loss": 0.0658, "step": 190 }, { "epoch": 0.06378360327266655, "grad_norm": 1.3622393297307778, "learning_rate": 2.124582869855395e-06, "loss": 0.0772, "step": 191 }, { "epoch": 0.06411754883953916, "grad_norm": 1.0931775160388473, "learning_rate": 2.1357063403781984e-06, "loss": 0.064, "step": 192 }, { "epoch": 0.06445149440641175, "grad_norm": 0.9403564090361918, "learning_rate": 2.1468298109010015e-06, "loss": 0.0643, "step": 193 }, { "epoch": 0.06478543997328436, "grad_norm": 0.8675001719292815, "learning_rate": 2.1579532814238042e-06, "loss": 0.0589, "step": 194 }, { "epoch": 0.06511938554015695, "grad_norm": 1.1933566142668406, "learning_rate": 2.1690767519466073e-06, "loss": 0.0862, "step": 195 }, { "epoch": 0.06545333110702956, "grad_norm": 1.2459886422557267, "learning_rate": 2.180200222469411e-06, "loss": 0.0663, "step": 196 }, { "epoch": 0.06578727667390215, "grad_norm": 1.3992714860593076, "learning_rate": 2.1913236929922136e-06, "loss": 0.0872, "step": 197 }, { "epoch": 0.06612122224077475, "grad_norm": 1.2606360762089206, "learning_rate": 2.2024471635150167e-06, "loss": 0.0713, "step": 198 }, { "epoch": 0.06645516780764735, "grad_norm": 1.2973243700693666, "learning_rate": 2.21357063403782e-06, "loss": 0.1072, "step": 199 }, { "epoch": 0.06678911337451995, "grad_norm": 1.4558397957910205, "learning_rate": 2.224694104560623e-06, "loss": 0.1002, "step": 200 }, { "epoch": 0.06712305894139255, "grad_norm": 0.9920794318487967, "learning_rate": 2.235817575083426e-06, "loss": 0.0677, "step": 201 }, { "epoch": 0.06745700450826515, "grad_norm": 1.087436627599926, "learning_rate": 2.2469410456062293e-06, "loss": 0.0872, "step": 202 }, { "epoch": 0.06779095007513775, "grad_norm": 1.02095781221236, "learning_rate": 2.2580645161290324e-06, "loss": 0.0721, "step": 203 }, { "epoch": 0.06812489564201035, "grad_norm": 0.9306631155090207, "learning_rate": 2.2691879866518355e-06, "loss": 0.0601, "step": 204 }, { "epoch": 0.06845884120888296, "grad_norm": 1.3190666786505112, "learning_rate": 2.2803114571746387e-06, "loss": 0.091, "step": 205 }, { "epoch": 0.06879278677575555, "grad_norm": 0.8639747841314965, "learning_rate": 2.291434927697442e-06, "loss": 0.076, "step": 206 }, { "epoch": 0.06912673234262816, "grad_norm": 1.238776383531512, "learning_rate": 2.302558398220245e-06, "loss": 0.091, "step": 207 }, { "epoch": 0.06946067790950075, "grad_norm": 1.0654244974201894, "learning_rate": 2.313681868743048e-06, "loss": 0.0812, "step": 208 }, { "epoch": 0.06979462347637336, "grad_norm": 1.0719673964669938, "learning_rate": 2.324805339265851e-06, "loss": 0.0608, "step": 209 }, { "epoch": 0.07012856904324595, "grad_norm": 1.007448591232436, "learning_rate": 2.3359288097886543e-06, "loss": 0.0573, "step": 210 }, { "epoch": 0.07046251461011856, "grad_norm": 0.9909285215050861, "learning_rate": 2.3470522803114575e-06, "loss": 0.0737, "step": 211 }, { "epoch": 0.07079646017699115, "grad_norm": 1.4902013241813645, "learning_rate": 2.35817575083426e-06, "loss": 0.0762, "step": 212 }, { "epoch": 0.07113040574386376, "grad_norm": 1.0594307960564555, "learning_rate": 2.3692992213570637e-06, "loss": 0.0729, "step": 213 }, { "epoch": 0.07146435131073635, "grad_norm": 1.6186928433649752, "learning_rate": 2.380422691879867e-06, "loss": 0.1039, "step": 214 }, { "epoch": 0.07179829687760896, "grad_norm": 1.0664749163779126, "learning_rate": 2.39154616240267e-06, "loss": 0.0708, "step": 215 }, { "epoch": 0.07213224244448155, "grad_norm": 0.8702436463796133, "learning_rate": 2.4026696329254727e-06, "loss": 0.0815, "step": 216 }, { "epoch": 0.07246618801135415, "grad_norm": 0.9979296999970432, "learning_rate": 2.4137931034482762e-06, "loss": 0.0753, "step": 217 }, { "epoch": 0.07280013357822675, "grad_norm": 1.2439645375449007, "learning_rate": 2.4249165739710794e-06, "loss": 0.0891, "step": 218 }, { "epoch": 0.07313407914509935, "grad_norm": 0.9537212933511459, "learning_rate": 2.436040044493882e-06, "loss": 0.0651, "step": 219 }, { "epoch": 0.07346802471197195, "grad_norm": 1.0016290487955293, "learning_rate": 2.4471635150166852e-06, "loss": 0.0629, "step": 220 }, { "epoch": 0.07380197027884455, "grad_norm": 1.0540971677217288, "learning_rate": 2.4582869855394888e-06, "loss": 0.0876, "step": 221 }, { "epoch": 0.07413591584571715, "grad_norm": 1.2439977170608356, "learning_rate": 2.4694104560622915e-06, "loss": 0.0624, "step": 222 }, { "epoch": 0.07446986141258975, "grad_norm": 1.2972720451499673, "learning_rate": 2.4805339265850946e-06, "loss": 0.0873, "step": 223 }, { "epoch": 0.07480380697946235, "grad_norm": 1.4086762013586394, "learning_rate": 2.4916573971078977e-06, "loss": 0.0708, "step": 224 }, { "epoch": 0.07513775254633495, "grad_norm": 1.0791618745968774, "learning_rate": 2.502780867630701e-06, "loss": 0.0705, "step": 225 }, { "epoch": 0.07547169811320754, "grad_norm": 1.0990305310646942, "learning_rate": 2.513904338153504e-06, "loss": 0.0739, "step": 226 }, { "epoch": 0.07580564368008015, "grad_norm": 1.013766222792433, "learning_rate": 2.5250278086763076e-06, "loss": 0.0687, "step": 227 }, { "epoch": 0.07613958924695274, "grad_norm": 1.1326007702379792, "learning_rate": 2.5361512791991107e-06, "loss": 0.0843, "step": 228 }, { "epoch": 0.07647353481382535, "grad_norm": 1.2189122363684406, "learning_rate": 2.5472747497219134e-06, "loss": 0.0777, "step": 229 }, { "epoch": 0.07680748038069794, "grad_norm": 1.4445267765189558, "learning_rate": 2.5583982202447165e-06, "loss": 0.0654, "step": 230 }, { "epoch": 0.07714142594757055, "grad_norm": 1.061842223719913, "learning_rate": 2.5695216907675197e-06, "loss": 0.0843, "step": 231 }, { "epoch": 0.07747537151444314, "grad_norm": 0.8838057693927293, "learning_rate": 2.580645161290323e-06, "loss": 0.0471, "step": 232 }, { "epoch": 0.07780931708131575, "grad_norm": 1.3696770724687637, "learning_rate": 2.591768631813126e-06, "loss": 0.0752, "step": 233 }, { "epoch": 0.07814326264818834, "grad_norm": 1.2158788748088327, "learning_rate": 2.6028921023359286e-06, "loss": 0.0944, "step": 234 }, { "epoch": 0.07847720821506095, "grad_norm": 1.295274490342438, "learning_rate": 2.6140155728587318e-06, "loss": 0.0857, "step": 235 }, { "epoch": 0.07881115378193354, "grad_norm": 1.1539292236059824, "learning_rate": 2.6251390433815353e-06, "loss": 0.0508, "step": 236 }, { "epoch": 0.07914509934880615, "grad_norm": 0.7116705685872293, "learning_rate": 2.6362625139043385e-06, "loss": 0.0517, "step": 237 }, { "epoch": 0.07947904491567874, "grad_norm": 1.21665495177681, "learning_rate": 2.6473859844271416e-06, "loss": 0.0843, "step": 238 }, { "epoch": 0.07981299048255135, "grad_norm": 0.8226166857909414, "learning_rate": 2.6585094549499447e-06, "loss": 0.0771, "step": 239 }, { "epoch": 0.08014693604942394, "grad_norm": 1.140511279181508, "learning_rate": 2.669632925472748e-06, "loss": 0.0806, "step": 240 }, { "epoch": 0.08048088161629655, "grad_norm": 2.6499827732546657, "learning_rate": 2.6807563959955506e-06, "loss": 0.072, "step": 241 }, { "epoch": 0.08081482718316914, "grad_norm": 1.2457689203698872, "learning_rate": 2.6918798665183537e-06, "loss": 0.0827, "step": 242 }, { "epoch": 0.08114877275004174, "grad_norm": 1.2331789620109057, "learning_rate": 2.703003337041157e-06, "loss": 0.0764, "step": 243 }, { "epoch": 0.08148271831691434, "grad_norm": 2.403989830723792, "learning_rate": 2.7141268075639604e-06, "loss": 0.0933, "step": 244 }, { "epoch": 0.08181666388378694, "grad_norm": 0.8317597079044875, "learning_rate": 2.7252502780867635e-06, "loss": 0.0544, "step": 245 }, { "epoch": 0.08215060945065954, "grad_norm": 1.1552367973010966, "learning_rate": 2.7363737486095667e-06, "loss": 0.0958, "step": 246 }, { "epoch": 0.08248455501753214, "grad_norm": 1.2187444616161907, "learning_rate": 2.7474972191323694e-06, "loss": 0.0896, "step": 247 }, { "epoch": 0.08281850058440474, "grad_norm": 1.6494729328754958, "learning_rate": 2.7586206896551725e-06, "loss": 0.0858, "step": 248 }, { "epoch": 0.08315244615127734, "grad_norm": 1.2042569328691684, "learning_rate": 2.7697441601779756e-06, "loss": 0.0522, "step": 249 }, { "epoch": 0.08348639171814994, "grad_norm": 1.0969576654880853, "learning_rate": 2.7808676307007788e-06, "loss": 0.0698, "step": 250 }, { "epoch": 0.08382033728502254, "grad_norm": 1.4190087693901525, "learning_rate": 2.791991101223582e-06, "loss": 0.0858, "step": 251 }, { "epoch": 0.08415428285189513, "grad_norm": 1.011867095931969, "learning_rate": 2.8031145717463854e-06, "loss": 0.0663, "step": 252 }, { "epoch": 0.08448822841876774, "grad_norm": 1.1357462237499456, "learning_rate": 2.8142380422691886e-06, "loss": 0.0758, "step": 253 }, { "epoch": 0.08482217398564033, "grad_norm": 1.0992432358389839, "learning_rate": 2.8253615127919913e-06, "loss": 0.0686, "step": 254 }, { "epoch": 0.08515611955251294, "grad_norm": 1.2210379843748043, "learning_rate": 2.8364849833147944e-06, "loss": 0.0751, "step": 255 }, { "epoch": 0.08549006511938555, "grad_norm": 1.333496324687641, "learning_rate": 2.8476084538375975e-06, "loss": 0.0856, "step": 256 }, { "epoch": 0.08582401068625814, "grad_norm": 0.8982589393714245, "learning_rate": 2.8587319243604007e-06, "loss": 0.0685, "step": 257 }, { "epoch": 0.08615795625313075, "grad_norm": 0.7664615334007062, "learning_rate": 2.869855394883204e-06, "loss": 0.0562, "step": 258 }, { "epoch": 0.08649190182000334, "grad_norm": 0.8853220569466045, "learning_rate": 2.8809788654060065e-06, "loss": 0.0478, "step": 259 }, { "epoch": 0.08682584738687595, "grad_norm": 1.3074993687517895, "learning_rate": 2.8921023359288105e-06, "loss": 0.0702, "step": 260 }, { "epoch": 0.08715979295374854, "grad_norm": 1.4187470300484282, "learning_rate": 2.903225806451613e-06, "loss": 0.0798, "step": 261 }, { "epoch": 0.08749373852062114, "grad_norm": 0.8946541168733985, "learning_rate": 2.9143492769744163e-06, "loss": 0.0546, "step": 262 }, { "epoch": 0.08782768408749374, "grad_norm": 0.9019084816464253, "learning_rate": 2.9254727474972195e-06, "loss": 0.0637, "step": 263 }, { "epoch": 0.08816162965436634, "grad_norm": 1.035031565835833, "learning_rate": 2.9365962180200226e-06, "loss": 0.071, "step": 264 }, { "epoch": 0.08849557522123894, "grad_norm": 1.062399337907389, "learning_rate": 2.9477196885428257e-06, "loss": 0.0606, "step": 265 }, { "epoch": 0.08882952078811154, "grad_norm": 1.3812400808776473, "learning_rate": 2.9588431590656284e-06, "loss": 0.099, "step": 266 }, { "epoch": 0.08916346635498414, "grad_norm": 2.1106807968897905, "learning_rate": 2.9699666295884316e-06, "loss": 0.0697, "step": 267 }, { "epoch": 0.08949741192185674, "grad_norm": 0.7470407396628205, "learning_rate": 2.9810901001112347e-06, "loss": 0.0525, "step": 268 }, { "epoch": 0.08983135748872934, "grad_norm": 1.2416470600288456, "learning_rate": 2.9922135706340383e-06, "loss": 0.0878, "step": 269 }, { "epoch": 0.09016530305560194, "grad_norm": 0.7739232097144934, "learning_rate": 3.0033370411568414e-06, "loss": 0.0486, "step": 270 }, { "epoch": 0.09049924862247453, "grad_norm": 0.9689287092068435, "learning_rate": 3.0144605116796445e-06, "loss": 0.0615, "step": 271 }, { "epoch": 0.09083319418934714, "grad_norm": 0.9706450420856501, "learning_rate": 3.0255839822024472e-06, "loss": 0.0663, "step": 272 }, { "epoch": 0.09116713975621973, "grad_norm": 0.8113655680721432, "learning_rate": 3.0367074527252504e-06, "loss": 0.0584, "step": 273 }, { "epoch": 0.09150108532309234, "grad_norm": 0.88652357445208, "learning_rate": 3.0478309232480535e-06, "loss": 0.0593, "step": 274 }, { "epoch": 0.09183503088996493, "grad_norm": 0.9619013850625336, "learning_rate": 3.0589543937708566e-06, "loss": 0.0567, "step": 275 }, { "epoch": 0.09216897645683754, "grad_norm": 1.0650640915491556, "learning_rate": 3.0700778642936598e-06, "loss": 0.0834, "step": 276 }, { "epoch": 0.09250292202371013, "grad_norm": 1.0371199805327422, "learning_rate": 3.0812013348164633e-06, "loss": 0.0644, "step": 277 }, { "epoch": 0.09283686759058274, "grad_norm": 1.5437364955186497, "learning_rate": 3.0923248053392665e-06, "loss": 0.0926, "step": 278 }, { "epoch": 0.09317081315745533, "grad_norm": 1.350929344249909, "learning_rate": 3.103448275862069e-06, "loss": 0.0806, "step": 279 }, { "epoch": 0.09350475872432794, "grad_norm": 0.984748131136075, "learning_rate": 3.1145717463848723e-06, "loss": 0.0644, "step": 280 }, { "epoch": 0.09383870429120053, "grad_norm": 0.8197737946172219, "learning_rate": 3.1256952169076754e-06, "loss": 0.0551, "step": 281 }, { "epoch": 0.09417264985807314, "grad_norm": 1.0028702309436293, "learning_rate": 3.1368186874304786e-06, "loss": 0.0855, "step": 282 }, { "epoch": 0.09450659542494573, "grad_norm": 1.1354845043569812, "learning_rate": 3.1479421579532817e-06, "loss": 0.0656, "step": 283 }, { "epoch": 0.09484054099181834, "grad_norm": 1.4093692815220873, "learning_rate": 3.1590656284760844e-06, "loss": 0.0926, "step": 284 }, { "epoch": 0.09517448655869093, "grad_norm": 0.8150099194875217, "learning_rate": 3.170189098998888e-06, "loss": 0.0713, "step": 285 }, { "epoch": 0.09550843212556354, "grad_norm": 0.9994394655029959, "learning_rate": 3.181312569521691e-06, "loss": 0.0471, "step": 286 }, { "epoch": 0.09584237769243613, "grad_norm": 0.9166625944563191, "learning_rate": 3.1924360400444942e-06, "loss": 0.0714, "step": 287 }, { "epoch": 0.09617632325930874, "grad_norm": 1.0476210871953306, "learning_rate": 3.2035595105672973e-06, "loss": 0.082, "step": 288 }, { "epoch": 0.09651026882618133, "grad_norm": 0.8552898997492805, "learning_rate": 3.2146829810901005e-06, "loss": 0.0578, "step": 289 }, { "epoch": 0.09684421439305393, "grad_norm": 0.9887021575952659, "learning_rate": 3.225806451612903e-06, "loss": 0.0721, "step": 290 }, { "epoch": 0.09717815995992653, "grad_norm": 1.3001300892308882, "learning_rate": 3.2369299221357063e-06, "loss": 0.0797, "step": 291 }, { "epoch": 0.09751210552679913, "grad_norm": 0.9616361386087116, "learning_rate": 3.2480533926585095e-06, "loss": 0.0751, "step": 292 }, { "epoch": 0.09784605109367173, "grad_norm": 0.769818976357463, "learning_rate": 3.259176863181313e-06, "loss": 0.0476, "step": 293 }, { "epoch": 0.09817999666054433, "grad_norm": 0.9039460600335342, "learning_rate": 3.270300333704116e-06, "loss": 0.0568, "step": 294 }, { "epoch": 0.09851394222741693, "grad_norm": 0.7357670662426185, "learning_rate": 3.2814238042269193e-06, "loss": 0.061, "step": 295 }, { "epoch": 0.09884788779428953, "grad_norm": 0.985486971157034, "learning_rate": 3.2925472747497224e-06, "loss": 0.0537, "step": 296 }, { "epoch": 0.09918183336116213, "grad_norm": 1.0657623613030358, "learning_rate": 3.303670745272525e-06, "loss": 0.0727, "step": 297 }, { "epoch": 0.09951577892803473, "grad_norm": 0.8178754986431233, "learning_rate": 3.3147942157953282e-06, "loss": 0.0575, "step": 298 }, { "epoch": 0.09984972449490732, "grad_norm": 1.0267008299629332, "learning_rate": 3.3259176863181314e-06, "loss": 0.0837, "step": 299 }, { "epoch": 0.10018367006177993, "grad_norm": 0.9577138509261269, "learning_rate": 3.3370411568409345e-06, "loss": 0.069, "step": 300 }, { "epoch": 0.10051761562865252, "grad_norm": 0.7687077726851129, "learning_rate": 3.3481646273637376e-06, "loss": 0.0734, "step": 301 }, { "epoch": 0.10085156119552513, "grad_norm": 0.9432917509341645, "learning_rate": 3.359288097886541e-06, "loss": 0.0901, "step": 302 }, { "epoch": 0.10118550676239772, "grad_norm": 0.723295485828715, "learning_rate": 3.3704115684093443e-06, "loss": 0.0479, "step": 303 }, { "epoch": 0.10151945232927033, "grad_norm": 0.8433208705630757, "learning_rate": 3.381535038932147e-06, "loss": 0.0631, "step": 304 }, { "epoch": 0.10185339789614292, "grad_norm": 0.8451391738861146, "learning_rate": 3.39265850945495e-06, "loss": 0.0674, "step": 305 }, { "epoch": 0.10218734346301553, "grad_norm": 0.9422394841862002, "learning_rate": 3.4037819799777533e-06, "loss": 0.0789, "step": 306 }, { "epoch": 0.10252128902988812, "grad_norm": 0.6754944960236535, "learning_rate": 3.4149054505005564e-06, "loss": 0.0546, "step": 307 }, { "epoch": 0.10285523459676073, "grad_norm": 0.854906938501467, "learning_rate": 3.4260289210233596e-06, "loss": 0.0691, "step": 308 }, { "epoch": 0.10318918016363333, "grad_norm": 0.8412785322678099, "learning_rate": 3.4371523915461623e-06, "loss": 0.05, "step": 309 }, { "epoch": 0.10352312573050593, "grad_norm": 0.975075371136509, "learning_rate": 3.448275862068966e-06, "loss": 0.0681, "step": 310 }, { "epoch": 0.10385707129737853, "grad_norm": 1.204824033576981, "learning_rate": 3.459399332591769e-06, "loss": 0.0833, "step": 311 }, { "epoch": 0.10419101686425113, "grad_norm": 1.6455562509076394, "learning_rate": 3.470522803114572e-06, "loss": 0.0872, "step": 312 }, { "epoch": 0.10452496243112373, "grad_norm": 1.389913324675098, "learning_rate": 3.4816462736373752e-06, "loss": 0.0728, "step": 313 }, { "epoch": 0.10485890799799633, "grad_norm": 1.0502187561605811, "learning_rate": 3.4927697441601784e-06, "loss": 0.067, "step": 314 }, { "epoch": 0.10519285356486893, "grad_norm": 0.9059963677233313, "learning_rate": 3.503893214682981e-06, "loss": 0.064, "step": 315 }, { "epoch": 0.10552679913174152, "grad_norm": 1.509854880843312, "learning_rate": 3.515016685205784e-06, "loss": 0.0755, "step": 316 }, { "epoch": 0.10586074469861413, "grad_norm": 1.090887522966728, "learning_rate": 3.5261401557285873e-06, "loss": 0.0723, "step": 317 }, { "epoch": 0.10619469026548672, "grad_norm": 0.8850762549294612, "learning_rate": 3.537263626251391e-06, "loss": 0.0788, "step": 318 }, { "epoch": 0.10652863583235933, "grad_norm": 0.757063526049472, "learning_rate": 3.548387096774194e-06, "loss": 0.054, "step": 319 }, { "epoch": 0.10686258139923192, "grad_norm": 0.8830245601771521, "learning_rate": 3.559510567296997e-06, "loss": 0.0651, "step": 320 }, { "epoch": 0.10719652696610453, "grad_norm": 0.7908966462909623, "learning_rate": 3.5706340378198003e-06, "loss": 0.0682, "step": 321 }, { "epoch": 0.10753047253297712, "grad_norm": 1.1397441595400608, "learning_rate": 3.581757508342603e-06, "loss": 0.0885, "step": 322 }, { "epoch": 0.10786441809984973, "grad_norm": 0.8429977094893334, "learning_rate": 3.592880978865406e-06, "loss": 0.0614, "step": 323 }, { "epoch": 0.10819836366672232, "grad_norm": 1.2403336317224285, "learning_rate": 3.6040044493882093e-06, "loss": 0.0795, "step": 324 }, { "epoch": 0.10853230923359493, "grad_norm": 0.9696392027482027, "learning_rate": 3.6151279199110124e-06, "loss": 0.0728, "step": 325 }, { "epoch": 0.10886625480046752, "grad_norm": 0.8848399109331946, "learning_rate": 3.626251390433816e-06, "loss": 0.0643, "step": 326 }, { "epoch": 0.10920020036734013, "grad_norm": 0.8716169658223119, "learning_rate": 3.637374860956619e-06, "loss": 0.0509, "step": 327 }, { "epoch": 0.10953414593421272, "grad_norm": 1.037505265979191, "learning_rate": 3.648498331479422e-06, "loss": 0.0753, "step": 328 }, { "epoch": 0.10986809150108533, "grad_norm": 1.1819898795024548, "learning_rate": 3.659621802002225e-06, "loss": 0.0833, "step": 329 }, { "epoch": 0.11020203706795792, "grad_norm": 1.1406256978667613, "learning_rate": 3.670745272525028e-06, "loss": 0.081, "step": 330 }, { "epoch": 0.11053598263483053, "grad_norm": 1.0443853645878745, "learning_rate": 3.681868743047831e-06, "loss": 0.0614, "step": 331 }, { "epoch": 0.11086992820170312, "grad_norm": 1.6306549726788961, "learning_rate": 3.6929922135706343e-06, "loss": 0.0434, "step": 332 }, { "epoch": 0.11120387376857573, "grad_norm": 0.9531009380601928, "learning_rate": 3.7041156840934374e-06, "loss": 0.084, "step": 333 }, { "epoch": 0.11153781933544832, "grad_norm": 1.0996412187005624, "learning_rate": 3.71523915461624e-06, "loss": 0.0864, "step": 334 }, { "epoch": 0.11187176490232092, "grad_norm": 1.3393366382316005, "learning_rate": 3.7263626251390437e-06, "loss": 0.0723, "step": 335 }, { "epoch": 0.11220571046919352, "grad_norm": 0.9568198725045595, "learning_rate": 3.737486095661847e-06, "loss": 0.0615, "step": 336 }, { "epoch": 0.11253965603606612, "grad_norm": 0.9319636035637239, "learning_rate": 3.74860956618465e-06, "loss": 0.0644, "step": 337 }, { "epoch": 0.11287360160293872, "grad_norm": 1.1870868966528694, "learning_rate": 3.759733036707453e-06, "loss": 0.0562, "step": 338 }, { "epoch": 0.11320754716981132, "grad_norm": 0.9526342694332731, "learning_rate": 3.7708565072302562e-06, "loss": 0.0584, "step": 339 }, { "epoch": 0.11354149273668392, "grad_norm": 1.2723184920889619, "learning_rate": 3.781979977753059e-06, "loss": 0.0736, "step": 340 }, { "epoch": 0.11387543830355652, "grad_norm": 0.9263403362171003, "learning_rate": 3.793103448275862e-06, "loss": 0.0586, "step": 341 }, { "epoch": 0.11420938387042912, "grad_norm": 0.8430467437161417, "learning_rate": 3.804226918798665e-06, "loss": 0.0556, "step": 342 }, { "epoch": 0.11454332943730172, "grad_norm": 1.154771466253, "learning_rate": 3.815350389321469e-06, "loss": 0.0571, "step": 343 }, { "epoch": 0.11487727500417431, "grad_norm": 1.1082196652446936, "learning_rate": 3.8264738598442715e-06, "loss": 0.0639, "step": 344 }, { "epoch": 0.11521122057104692, "grad_norm": 0.8868903401007896, "learning_rate": 3.837597330367075e-06, "loss": 0.0533, "step": 345 }, { "epoch": 0.11554516613791951, "grad_norm": 1.0803894175416395, "learning_rate": 3.848720800889878e-06, "loss": 0.0863, "step": 346 }, { "epoch": 0.11587911170479212, "grad_norm": 1.0324462777222236, "learning_rate": 3.859844271412681e-06, "loss": 0.0578, "step": 347 }, { "epoch": 0.11621305727166471, "grad_norm": 1.1025377393840483, "learning_rate": 3.870967741935484e-06, "loss": 0.0646, "step": 348 }, { "epoch": 0.11654700283853732, "grad_norm": 0.7616860665860318, "learning_rate": 3.8820912124582876e-06, "loss": 0.0631, "step": 349 }, { "epoch": 0.11688094840540991, "grad_norm": 1.3354653908723382, "learning_rate": 3.89321468298109e-06, "loss": 0.077, "step": 350 }, { "epoch": 0.11721489397228252, "grad_norm": 0.8317935535836048, "learning_rate": 3.904338153503894e-06, "loss": 0.0741, "step": 351 }, { "epoch": 0.11754883953915511, "grad_norm": 1.1109506420743003, "learning_rate": 3.9154616240266965e-06, "loss": 0.0763, "step": 352 }, { "epoch": 0.11788278510602772, "grad_norm": 0.9494638434263117, "learning_rate": 3.9265850945495e-06, "loss": 0.0669, "step": 353 }, { "epoch": 0.11821673067290031, "grad_norm": 0.7662930810479017, "learning_rate": 3.937708565072303e-06, "loss": 0.0597, "step": 354 }, { "epoch": 0.11855067623977292, "grad_norm": 1.3373783677115336, "learning_rate": 3.948832035595106e-06, "loss": 0.0885, "step": 355 }, { "epoch": 0.11888462180664551, "grad_norm": 1.0321161388076057, "learning_rate": 3.959955506117909e-06, "loss": 0.0572, "step": 356 }, { "epoch": 0.11921856737351812, "grad_norm": 0.9316855711072523, "learning_rate": 3.971078976640712e-06, "loss": 0.0697, "step": 357 }, { "epoch": 0.11955251294039071, "grad_norm": 1.1383940818027423, "learning_rate": 3.982202447163515e-06, "loss": 0.0543, "step": 358 }, { "epoch": 0.11988645850726332, "grad_norm": 1.0241814060412053, "learning_rate": 3.993325917686319e-06, "loss": 0.068, "step": 359 }, { "epoch": 0.12022040407413592, "grad_norm": 1.3927998360633838, "learning_rate": 4.004449388209122e-06, "loss": 0.0594, "step": 360 }, { "epoch": 0.12055434964100852, "grad_norm": 1.465185164593061, "learning_rate": 4.015572858731925e-06, "loss": 0.0707, "step": 361 }, { "epoch": 0.12088829520788112, "grad_norm": 1.0218384291719091, "learning_rate": 4.026696329254728e-06, "loss": 0.0862, "step": 362 }, { "epoch": 0.12122224077475371, "grad_norm": 0.8335335272923647, "learning_rate": 4.0378197997775306e-06, "loss": 0.0666, "step": 363 }, { "epoch": 0.12155618634162632, "grad_norm": 0.7971387678207199, "learning_rate": 4.048943270300334e-06, "loss": 0.0536, "step": 364 }, { "epoch": 0.12189013190849891, "grad_norm": 0.9018760524980276, "learning_rate": 4.060066740823137e-06, "loss": 0.0652, "step": 365 }, { "epoch": 0.12222407747537152, "grad_norm": 0.7100076734118838, "learning_rate": 4.07119021134594e-06, "loss": 0.0522, "step": 366 }, { "epoch": 0.12255802304224411, "grad_norm": 1.1553456634700505, "learning_rate": 4.082313681868743e-06, "loss": 0.0665, "step": 367 }, { "epoch": 0.12289196860911672, "grad_norm": 1.2667757129005257, "learning_rate": 4.093437152391547e-06, "loss": 0.0883, "step": 368 }, { "epoch": 0.12322591417598931, "grad_norm": 0.6961100960879517, "learning_rate": 4.104560622914349e-06, "loss": 0.0633, "step": 369 }, { "epoch": 0.12355985974286192, "grad_norm": 1.008433304396925, "learning_rate": 4.115684093437153e-06, "loss": 0.0758, "step": 370 }, { "epoch": 0.12389380530973451, "grad_norm": 1.0695684974598691, "learning_rate": 4.126807563959956e-06, "loss": 0.0663, "step": 371 }, { "epoch": 0.12422775087660712, "grad_norm": 0.806555344783645, "learning_rate": 4.137931034482759e-06, "loss": 0.0748, "step": 372 }, { "epoch": 0.12456169644347971, "grad_norm": 1.107697753039673, "learning_rate": 4.149054505005562e-06, "loss": 0.067, "step": 373 }, { "epoch": 0.12489564201035232, "grad_norm": 0.7010287093915, "learning_rate": 4.160177975528365e-06, "loss": 0.0691, "step": 374 }, { "epoch": 0.1252295875772249, "grad_norm": 0.9084861258601861, "learning_rate": 4.171301446051168e-06, "loss": 0.0665, "step": 375 }, { "epoch": 0.12556353314409752, "grad_norm": 1.0664391897856733, "learning_rate": 4.182424916573972e-06, "loss": 0.0897, "step": 376 }, { "epoch": 0.12589747871097012, "grad_norm": 0.7424223077507432, "learning_rate": 4.193548387096774e-06, "loss": 0.064, "step": 377 }, { "epoch": 0.1262314242778427, "grad_norm": 2.7620275717695226, "learning_rate": 4.204671857619578e-06, "loss": 0.088, "step": 378 }, { "epoch": 0.1265653698447153, "grad_norm": 0.7941430874418766, "learning_rate": 4.215795328142381e-06, "loss": 0.0628, "step": 379 }, { "epoch": 0.12689931541158792, "grad_norm": 0.9354132323804676, "learning_rate": 4.226918798665184e-06, "loss": 0.0679, "step": 380 }, { "epoch": 0.12723326097846052, "grad_norm": 0.7920974181780777, "learning_rate": 4.238042269187987e-06, "loss": 0.0597, "step": 381 }, { "epoch": 0.1275672065453331, "grad_norm": 0.867841544916767, "learning_rate": 4.24916573971079e-06, "loss": 0.0579, "step": 382 }, { "epoch": 0.1279011521122057, "grad_norm": 1.4020195657180856, "learning_rate": 4.260289210233593e-06, "loss": 0.0819, "step": 383 }, { "epoch": 0.1282350976790783, "grad_norm": 1.1938775225409488, "learning_rate": 4.271412680756397e-06, "loss": 0.0779, "step": 384 }, { "epoch": 0.12856904324595092, "grad_norm": 0.8022906514664288, "learning_rate": 4.2825361512791995e-06, "loss": 0.0609, "step": 385 }, { "epoch": 0.1289029888128235, "grad_norm": 0.6616577614601218, "learning_rate": 4.293659621802003e-06, "loss": 0.0606, "step": 386 }, { "epoch": 0.1292369343796961, "grad_norm": 1.3449449266845075, "learning_rate": 4.304783092324806e-06, "loss": 0.0876, "step": 387 }, { "epoch": 0.1295708799465687, "grad_norm": 1.0005342420994785, "learning_rate": 4.3159065628476084e-06, "loss": 0.0635, "step": 388 }, { "epoch": 0.12990482551344132, "grad_norm": 0.7802565312984167, "learning_rate": 4.327030033370412e-06, "loss": 0.0545, "step": 389 }, { "epoch": 0.1302387710803139, "grad_norm": 0.8621424847164936, "learning_rate": 4.338153503893215e-06, "loss": 0.0618, "step": 390 }, { "epoch": 0.1305727166471865, "grad_norm": 0.7700216914249677, "learning_rate": 4.349276974416018e-06, "loss": 0.0682, "step": 391 }, { "epoch": 0.1309066622140591, "grad_norm": 1.0332852000851969, "learning_rate": 4.360400444938822e-06, "loss": 0.0783, "step": 392 }, { "epoch": 0.13124060778093172, "grad_norm": 0.7084663536325163, "learning_rate": 4.3715239154616245e-06, "loss": 0.0513, "step": 393 }, { "epoch": 0.1315745533478043, "grad_norm": 0.7258019483196939, "learning_rate": 4.382647385984427e-06, "loss": 0.0618, "step": 394 }, { "epoch": 0.1319084989146769, "grad_norm": 0.7384245252121673, "learning_rate": 4.393770856507231e-06, "loss": 0.0617, "step": 395 }, { "epoch": 0.1322424444815495, "grad_norm": 0.879416102341382, "learning_rate": 4.4048943270300335e-06, "loss": 0.0893, "step": 396 }, { "epoch": 0.13257639004842212, "grad_norm": 1.111269865167675, "learning_rate": 4.416017797552837e-06, "loss": 0.0815, "step": 397 }, { "epoch": 0.1329103356152947, "grad_norm": 0.8541301967655361, "learning_rate": 4.42714126807564e-06, "loss": 0.0669, "step": 398 }, { "epoch": 0.1332442811821673, "grad_norm": 1.0914177666489773, "learning_rate": 4.4382647385984425e-06, "loss": 0.0745, "step": 399 }, { "epoch": 0.1335782267490399, "grad_norm": 0.8333840613959441, "learning_rate": 4.449388209121246e-06, "loss": 0.055, "step": 400 }, { "epoch": 0.13391217231591251, "grad_norm": 0.5813378320576991, "learning_rate": 4.4605116796440496e-06, "loss": 0.0464, "step": 401 }, { "epoch": 0.1342461178827851, "grad_norm": 1.0914380120324834, "learning_rate": 4.471635150166852e-06, "loss": 0.056, "step": 402 }, { "epoch": 0.1345800634496577, "grad_norm": 0.8968599625799999, "learning_rate": 4.482758620689656e-06, "loss": 0.0689, "step": 403 }, { "epoch": 0.1349140090165303, "grad_norm": 0.7137264684303274, "learning_rate": 4.4938820912124585e-06, "loss": 0.0723, "step": 404 }, { "epoch": 0.1352479545834029, "grad_norm": 1.7192219912416853, "learning_rate": 4.505005561735262e-06, "loss": 0.0794, "step": 405 }, { "epoch": 0.1355819001502755, "grad_norm": 0.6914730913580311, "learning_rate": 4.516129032258065e-06, "loss": 0.0507, "step": 406 }, { "epoch": 0.1359158457171481, "grad_norm": 0.8852780808500477, "learning_rate": 4.5272525027808675e-06, "loss": 0.0589, "step": 407 }, { "epoch": 0.1362497912840207, "grad_norm": 1.2348616096620406, "learning_rate": 4.538375973303671e-06, "loss": 0.0651, "step": 408 }, { "epoch": 0.1365837368508933, "grad_norm": 0.7435214705909083, "learning_rate": 4.549499443826475e-06, "loss": 0.0481, "step": 409 }, { "epoch": 0.13691768241776592, "grad_norm": 0.7747111792754291, "learning_rate": 4.560622914349277e-06, "loss": 0.0579, "step": 410 }, { "epoch": 0.1372516279846385, "grad_norm": 1.2774674221987332, "learning_rate": 4.571746384872081e-06, "loss": 0.1106, "step": 411 }, { "epoch": 0.1375855735515111, "grad_norm": 0.8184734262766915, "learning_rate": 4.582869855394884e-06, "loss": 0.0615, "step": 412 }, { "epoch": 0.1379195191183837, "grad_norm": 0.8893517844574772, "learning_rate": 4.593993325917686e-06, "loss": 0.0694, "step": 413 }, { "epoch": 0.13825346468525632, "grad_norm": 0.7942856390259999, "learning_rate": 4.60511679644049e-06, "loss": 0.0817, "step": 414 }, { "epoch": 0.1385874102521289, "grad_norm": 0.6972510816940831, "learning_rate": 4.6162402669632926e-06, "loss": 0.0657, "step": 415 }, { "epoch": 0.1389213558190015, "grad_norm": 0.6735407580491654, "learning_rate": 4.627363737486096e-06, "loss": 0.0422, "step": 416 }, { "epoch": 0.1392553013858741, "grad_norm": 0.9068832429274885, "learning_rate": 4.6384872080089e-06, "loss": 0.0738, "step": 417 }, { "epoch": 0.13958924695274672, "grad_norm": 0.5573937357195854, "learning_rate": 4.649610678531702e-06, "loss": 0.0486, "step": 418 }, { "epoch": 0.1399231925196193, "grad_norm": 1.1306131771178998, "learning_rate": 4.660734149054505e-06, "loss": 0.0721, "step": 419 }, { "epoch": 0.1402571380864919, "grad_norm": 0.9161627282442215, "learning_rate": 4.671857619577309e-06, "loss": 0.0636, "step": 420 }, { "epoch": 0.1405910836533645, "grad_norm": 0.8662276360669395, "learning_rate": 4.682981090100111e-06, "loss": 0.057, "step": 421 }, { "epoch": 0.1409250292202371, "grad_norm": 0.8059068213689825, "learning_rate": 4.694104560622915e-06, "loss": 0.0794, "step": 422 }, { "epoch": 0.1412589747871097, "grad_norm": 0.7513356926995356, "learning_rate": 4.705228031145718e-06, "loss": 0.0685, "step": 423 }, { "epoch": 0.1415929203539823, "grad_norm": 0.9234385601663734, "learning_rate": 4.71635150166852e-06, "loss": 0.0693, "step": 424 }, { "epoch": 0.1419268659208549, "grad_norm": 0.7933568219531439, "learning_rate": 4.727474972191325e-06, "loss": 0.0727, "step": 425 }, { "epoch": 0.1422608114877275, "grad_norm": 0.956125290520426, "learning_rate": 4.7385984427141274e-06, "loss": 0.089, "step": 426 }, { "epoch": 0.1425947570546001, "grad_norm": 0.9789206648859708, "learning_rate": 4.74972191323693e-06, "loss": 0.0641, "step": 427 }, { "epoch": 0.1429287026214727, "grad_norm": 0.644535093275283, "learning_rate": 4.760845383759734e-06, "loss": 0.0526, "step": 428 }, { "epoch": 0.1432626481883453, "grad_norm": 0.8809124618007454, "learning_rate": 4.771968854282536e-06, "loss": 0.0594, "step": 429 }, { "epoch": 0.1435965937552179, "grad_norm": 0.9101100326367371, "learning_rate": 4.78309232480534e-06, "loss": 0.0598, "step": 430 }, { "epoch": 0.1439305393220905, "grad_norm": 0.8039963024048183, "learning_rate": 4.794215795328143e-06, "loss": 0.0764, "step": 431 }, { "epoch": 0.1442644848889631, "grad_norm": 1.2707372886300605, "learning_rate": 4.805339265850945e-06, "loss": 0.0788, "step": 432 }, { "epoch": 0.1445984304558357, "grad_norm": 0.8505934103256769, "learning_rate": 4.816462736373749e-06, "loss": 0.0595, "step": 433 }, { "epoch": 0.1449323760227083, "grad_norm": 0.956829814390663, "learning_rate": 4.8275862068965525e-06, "loss": 0.0749, "step": 434 }, { "epoch": 0.1452663215895809, "grad_norm": 0.8762229891444342, "learning_rate": 4.838709677419355e-06, "loss": 0.0759, "step": 435 }, { "epoch": 0.1456002671564535, "grad_norm": 0.8346509440556535, "learning_rate": 4.849833147942159e-06, "loss": 0.0558, "step": 436 }, { "epoch": 0.1459342127233261, "grad_norm": 0.8166339845883285, "learning_rate": 4.8609566184649615e-06, "loss": 0.0648, "step": 437 }, { "epoch": 0.1462681582901987, "grad_norm": 1.2323034153108503, "learning_rate": 4.872080088987764e-06, "loss": 0.0754, "step": 438 }, { "epoch": 0.1466021038570713, "grad_norm": 0.9170291879229514, "learning_rate": 4.883203559510568e-06, "loss": 0.0802, "step": 439 }, { "epoch": 0.1469360494239439, "grad_norm": 0.7221436604229694, "learning_rate": 4.8943270300333704e-06, "loss": 0.0481, "step": 440 }, { "epoch": 0.1472699949908165, "grad_norm": 0.8990697598786687, "learning_rate": 4.905450500556174e-06, "loss": 0.0706, "step": 441 }, { "epoch": 0.1476039405576891, "grad_norm": 0.7615310608245218, "learning_rate": 4.9165739710789776e-06, "loss": 0.0738, "step": 442 }, { "epoch": 0.14793788612456168, "grad_norm": 0.7568416347024116, "learning_rate": 4.92769744160178e-06, "loss": 0.0723, "step": 443 }, { "epoch": 0.1482718316914343, "grad_norm": 1.994169955997745, "learning_rate": 4.938820912124583e-06, "loss": 0.0915, "step": 444 }, { "epoch": 0.1486057772583069, "grad_norm": 0.804709830172916, "learning_rate": 4.9499443826473865e-06, "loss": 0.0544, "step": 445 }, { "epoch": 0.1489397228251795, "grad_norm": 0.7623771915839768, "learning_rate": 4.961067853170189e-06, "loss": 0.0533, "step": 446 }, { "epoch": 0.14927366839205208, "grad_norm": 0.9978398241826115, "learning_rate": 4.972191323692993e-06, "loss": 0.0461, "step": 447 }, { "epoch": 0.1496076139589247, "grad_norm": 1.0896383334289894, "learning_rate": 4.9833147942157955e-06, "loss": 0.0681, "step": 448 }, { "epoch": 0.1499415595257973, "grad_norm": 0.9028419951187907, "learning_rate": 4.994438264738598e-06, "loss": 0.0606, "step": 449 }, { "epoch": 0.1502755050926699, "grad_norm": 1.083921419144493, "learning_rate": 5.005561735261402e-06, "loss": 0.0702, "step": 450 }, { "epoch": 0.15060945065954248, "grad_norm": 1.0713782553803741, "learning_rate": 5.016685205784205e-06, "loss": 0.0662, "step": 451 }, { "epoch": 0.1509433962264151, "grad_norm": 1.1834646725533113, "learning_rate": 5.027808676307008e-06, "loss": 0.0735, "step": 452 }, { "epoch": 0.1512773417932877, "grad_norm": 0.9703074891389526, "learning_rate": 5.038932146829812e-06, "loss": 0.0737, "step": 453 }, { "epoch": 0.1516112873601603, "grad_norm": 0.755237585571562, "learning_rate": 5.050055617352615e-06, "loss": 0.0668, "step": 454 }, { "epoch": 0.15194523292703288, "grad_norm": 0.9117012164808175, "learning_rate": 5.061179087875418e-06, "loss": 0.0586, "step": 455 }, { "epoch": 0.1522791784939055, "grad_norm": 0.6181916310720402, "learning_rate": 5.072302558398221e-06, "loss": 0.0495, "step": 456 }, { "epoch": 0.1526131240607781, "grad_norm": 0.8163763785306399, "learning_rate": 5.083426028921023e-06, "loss": 0.0634, "step": 457 }, { "epoch": 0.1529470696276507, "grad_norm": 0.8796412602404203, "learning_rate": 5.094549499443827e-06, "loss": 0.0618, "step": 458 }, { "epoch": 0.15328101519452328, "grad_norm": 0.7135447035944562, "learning_rate": 5.1056729699666295e-06, "loss": 0.0744, "step": 459 }, { "epoch": 0.15361496076139589, "grad_norm": 0.7553785666045255, "learning_rate": 5.116796440489433e-06, "loss": 0.0686, "step": 460 }, { "epoch": 0.1539489063282685, "grad_norm": 0.7545703857319316, "learning_rate": 5.127919911012236e-06, "loss": 0.061, "step": 461 }, { "epoch": 0.1542828518951411, "grad_norm": 0.7582065006521851, "learning_rate": 5.139043381535039e-06, "loss": 0.0658, "step": 462 }, { "epoch": 0.1546167974620137, "grad_norm": 0.7655579748134159, "learning_rate": 5.150166852057843e-06, "loss": 0.0447, "step": 463 }, { "epoch": 0.15495074302888628, "grad_norm": 0.8955937293257055, "learning_rate": 5.161290322580646e-06, "loss": 0.0583, "step": 464 }, { "epoch": 0.1552846885957589, "grad_norm": 0.7093979423603337, "learning_rate": 5.172413793103449e-06, "loss": 0.0517, "step": 465 }, { "epoch": 0.1556186341626315, "grad_norm": 1.2047517628712154, "learning_rate": 5.183537263626252e-06, "loss": 0.0726, "step": 466 }, { "epoch": 0.1559525797295041, "grad_norm": 0.6586898864529823, "learning_rate": 5.1946607341490554e-06, "loss": 0.0508, "step": 467 }, { "epoch": 0.15628652529637668, "grad_norm": 0.7296233118104705, "learning_rate": 5.205784204671857e-06, "loss": 0.0547, "step": 468 }, { "epoch": 0.1566204708632493, "grad_norm": 1.029385580406455, "learning_rate": 5.216907675194661e-06, "loss": 0.0617, "step": 469 }, { "epoch": 0.1569544164301219, "grad_norm": 0.8828964156642398, "learning_rate": 5.2280311457174636e-06, "loss": 0.0738, "step": 470 }, { "epoch": 0.1572883619969945, "grad_norm": 0.7357519449955818, "learning_rate": 5.239154616240267e-06, "loss": 0.0588, "step": 471 }, { "epoch": 0.15762230756386708, "grad_norm": 0.9117473447896604, "learning_rate": 5.250278086763071e-06, "loss": 0.0548, "step": 472 }, { "epoch": 0.1579562531307397, "grad_norm": 0.6906100135121893, "learning_rate": 5.261401557285873e-06, "loss": 0.0557, "step": 473 }, { "epoch": 0.1582901986976123, "grad_norm": 1.1950263378701476, "learning_rate": 5.272525027808677e-06, "loss": 0.058, "step": 474 }, { "epoch": 0.1586241442644849, "grad_norm": 0.6927807149939814, "learning_rate": 5.28364849833148e-06, "loss": 0.0557, "step": 475 }, { "epoch": 0.15895808983135748, "grad_norm": 0.65365492551376, "learning_rate": 5.294771968854283e-06, "loss": 0.0582, "step": 476 }, { "epoch": 0.1592920353982301, "grad_norm": 0.8799420679217809, "learning_rate": 5.305895439377086e-06, "loss": 0.0863, "step": 477 }, { "epoch": 0.1596259809651027, "grad_norm": 0.7103549816274838, "learning_rate": 5.3170189098998895e-06, "loss": 0.0487, "step": 478 }, { "epoch": 0.1599599265319753, "grad_norm": 0.7152667161275338, "learning_rate": 5.328142380422693e-06, "loss": 0.0441, "step": 479 }, { "epoch": 0.16029387209884788, "grad_norm": 0.9695447542502575, "learning_rate": 5.339265850945496e-06, "loss": 0.0638, "step": 480 }, { "epoch": 0.16062781766572048, "grad_norm": 0.7741189676524061, "learning_rate": 5.350389321468299e-06, "loss": 0.0509, "step": 481 }, { "epoch": 0.1609617632325931, "grad_norm": 0.8810457057758688, "learning_rate": 5.361512791991101e-06, "loss": 0.0638, "step": 482 }, { "epoch": 0.1612957087994657, "grad_norm": 0.6102246479100515, "learning_rate": 5.372636262513905e-06, "loss": 0.0581, "step": 483 }, { "epoch": 0.16162965436633828, "grad_norm": 1.3278482352026502, "learning_rate": 5.383759733036707e-06, "loss": 0.0763, "step": 484 }, { "epoch": 0.16196359993321088, "grad_norm": 0.7371166558096207, "learning_rate": 5.394883203559511e-06, "loss": 0.0647, "step": 485 }, { "epoch": 0.1622975455000835, "grad_norm": 1.0736671434503915, "learning_rate": 5.406006674082314e-06, "loss": 0.0678, "step": 486 }, { "epoch": 0.1626314910669561, "grad_norm": 0.6949676204696933, "learning_rate": 5.417130144605117e-06, "loss": 0.0523, "step": 487 }, { "epoch": 0.16296543663382868, "grad_norm": 0.7936075395734432, "learning_rate": 5.428253615127921e-06, "loss": 0.0597, "step": 488 }, { "epoch": 0.16329938220070128, "grad_norm": 0.6514863106711773, "learning_rate": 5.4393770856507235e-06, "loss": 0.0509, "step": 489 }, { "epoch": 0.1636333277675739, "grad_norm": 0.8324752891246695, "learning_rate": 5.450500556173527e-06, "loss": 0.0716, "step": 490 }, { "epoch": 0.1639672733344465, "grad_norm": 0.9053927241925424, "learning_rate": 5.46162402669633e-06, "loss": 0.0706, "step": 491 }, { "epoch": 0.16430121890131907, "grad_norm": 0.8721247709102367, "learning_rate": 5.472747497219133e-06, "loss": 0.0699, "step": 492 }, { "epoch": 0.16463516446819168, "grad_norm": 0.8430069088405966, "learning_rate": 5.483870967741935e-06, "loss": 0.0774, "step": 493 }, { "epoch": 0.1649691100350643, "grad_norm": 0.5996875363467666, "learning_rate": 5.494994438264739e-06, "loss": 0.0473, "step": 494 }, { "epoch": 0.1653030556019369, "grad_norm": 0.7525414129851855, "learning_rate": 5.506117908787543e-06, "loss": 0.0548, "step": 495 }, { "epoch": 0.16563700116880947, "grad_norm": 0.7456419929890177, "learning_rate": 5.517241379310345e-06, "loss": 0.0521, "step": 496 }, { "epoch": 0.16597094673568208, "grad_norm": 1.4821871731471925, "learning_rate": 5.5283648498331485e-06, "loss": 0.0633, "step": 497 }, { "epoch": 0.16630489230255469, "grad_norm": 1.0035158211938564, "learning_rate": 5.539488320355951e-06, "loss": 0.0501, "step": 498 }, { "epoch": 0.1666388378694273, "grad_norm": 1.0643854193297664, "learning_rate": 5.550611790878755e-06, "loss": 0.069, "step": 499 }, { "epoch": 0.16697278343629987, "grad_norm": 0.8695276836951805, "learning_rate": 5.5617352614015575e-06, "loss": 0.0678, "step": 500 }, { "epoch": 0.16730672900317248, "grad_norm": 0.8450565320978253, "learning_rate": 5.572858731924361e-06, "loss": 0.0571, "step": 501 }, { "epoch": 0.16764067457004508, "grad_norm": 1.1669524985507995, "learning_rate": 5.583982202447164e-06, "loss": 0.0834, "step": 502 }, { "epoch": 0.1679746201369177, "grad_norm": 0.970165804178769, "learning_rate": 5.595105672969967e-06, "loss": 0.0555, "step": 503 }, { "epoch": 0.16830856570379027, "grad_norm": 0.7723836408243235, "learning_rate": 5.606229143492771e-06, "loss": 0.0464, "step": 504 }, { "epoch": 0.16864251127066288, "grad_norm": 0.5381677090389018, "learning_rate": 5.617352614015574e-06, "loss": 0.0498, "step": 505 }, { "epoch": 0.16897645683753548, "grad_norm": 1.0636586844048173, "learning_rate": 5.628476084538377e-06, "loss": 0.068, "step": 506 }, { "epoch": 0.1693104024044081, "grad_norm": 0.8162554296195744, "learning_rate": 5.639599555061179e-06, "loss": 0.0645, "step": 507 }, { "epoch": 0.16964434797128067, "grad_norm": 0.6481582277347443, "learning_rate": 5.6507230255839826e-06, "loss": 0.0615, "step": 508 }, { "epoch": 0.16997829353815327, "grad_norm": 1.3923437281539484, "learning_rate": 5.661846496106785e-06, "loss": 0.0822, "step": 509 }, { "epoch": 0.17031223910502588, "grad_norm": 0.977000370333707, "learning_rate": 5.672969966629589e-06, "loss": 0.0731, "step": 510 }, { "epoch": 0.1706461846718985, "grad_norm": 0.8458998828404004, "learning_rate": 5.6840934371523915e-06, "loss": 0.0591, "step": 511 }, { "epoch": 0.1709801302387711, "grad_norm": 0.7348551384773913, "learning_rate": 5.695216907675195e-06, "loss": 0.0588, "step": 512 }, { "epoch": 0.17131407580564367, "grad_norm": 0.836801901679019, "learning_rate": 5.706340378197999e-06, "loss": 0.0621, "step": 513 }, { "epoch": 0.17164802137251628, "grad_norm": 0.8189625571489982, "learning_rate": 5.717463848720801e-06, "loss": 0.0446, "step": 514 }, { "epoch": 0.1719819669393889, "grad_norm": 1.432359875394896, "learning_rate": 5.728587319243605e-06, "loss": 0.0806, "step": 515 }, { "epoch": 0.1723159125062615, "grad_norm": 0.7614900359440986, "learning_rate": 5.739710789766408e-06, "loss": 0.0551, "step": 516 }, { "epoch": 0.17264985807313407, "grad_norm": 0.6671415448030087, "learning_rate": 5.750834260289211e-06, "loss": 0.0503, "step": 517 }, { "epoch": 0.17298380364000668, "grad_norm": 0.8520371085785486, "learning_rate": 5.761957730812013e-06, "loss": 0.0585, "step": 518 }, { "epoch": 0.17331774920687928, "grad_norm": 2.3571026809143714, "learning_rate": 5.773081201334817e-06, "loss": 0.0579, "step": 519 }, { "epoch": 0.1736516947737519, "grad_norm": 0.9830936139267258, "learning_rate": 5.784204671857621e-06, "loss": 0.0679, "step": 520 }, { "epoch": 0.17398564034062447, "grad_norm": 1.8476146611052202, "learning_rate": 5.795328142380423e-06, "loss": 0.0739, "step": 521 }, { "epoch": 0.17431958590749708, "grad_norm": 1.0888171261377078, "learning_rate": 5.806451612903226e-06, "loss": 0.0607, "step": 522 }, { "epoch": 0.17465353147436968, "grad_norm": 0.9907717227940405, "learning_rate": 5.817575083426029e-06, "loss": 0.057, "step": 523 }, { "epoch": 0.1749874770412423, "grad_norm": 1.0443916244462574, "learning_rate": 5.828698553948833e-06, "loss": 0.0737, "step": 524 }, { "epoch": 0.17532142260811487, "grad_norm": 0.7458714946625439, "learning_rate": 5.839822024471635e-06, "loss": 0.063, "step": 525 }, { "epoch": 0.17565536817498748, "grad_norm": 0.7211494174786142, "learning_rate": 5.850945494994439e-06, "loss": 0.0523, "step": 526 }, { "epoch": 0.17598931374186008, "grad_norm": 1.3712158931896836, "learning_rate": 5.862068965517242e-06, "loss": 0.0762, "step": 527 }, { "epoch": 0.1763232593087327, "grad_norm": 0.7938349835402784, "learning_rate": 5.873192436040045e-06, "loss": 0.0589, "step": 528 }, { "epoch": 0.17665720487560527, "grad_norm": 0.6992457731952962, "learning_rate": 5.884315906562849e-06, "loss": 0.0566, "step": 529 }, { "epoch": 0.17699115044247787, "grad_norm": 0.9355890517584756, "learning_rate": 5.8954393770856515e-06, "loss": 0.0655, "step": 530 }, { "epoch": 0.17732509600935048, "grad_norm": 0.9272921789800707, "learning_rate": 5.906562847608455e-06, "loss": 0.0793, "step": 531 }, { "epoch": 0.1776590415762231, "grad_norm": 0.8859708515971795, "learning_rate": 5.917686318131257e-06, "loss": 0.0552, "step": 532 }, { "epoch": 0.17799298714309567, "grad_norm": 0.8398942776604181, "learning_rate": 5.9288097886540604e-06, "loss": 0.0465, "step": 533 }, { "epoch": 0.17832693270996827, "grad_norm": 0.7985514645311459, "learning_rate": 5.939933259176863e-06, "loss": 0.0675, "step": 534 }, { "epoch": 0.17866087827684088, "grad_norm": 0.697071401921753, "learning_rate": 5.951056729699667e-06, "loss": 0.0429, "step": 535 }, { "epoch": 0.17899482384371349, "grad_norm": 0.7041137612115344, "learning_rate": 5.962180200222469e-06, "loss": 0.0431, "step": 536 }, { "epoch": 0.17932876941058606, "grad_norm": 0.6682339970873195, "learning_rate": 5.973303670745273e-06, "loss": 0.0592, "step": 537 }, { "epoch": 0.17966271497745867, "grad_norm": 1.17716914430658, "learning_rate": 5.9844271412680765e-06, "loss": 0.0787, "step": 538 }, { "epoch": 0.17999666054433128, "grad_norm": 2.1039130020292207, "learning_rate": 5.995550611790879e-06, "loss": 0.0586, "step": 539 }, { "epoch": 0.18033060611120388, "grad_norm": 1.6854132829750088, "learning_rate": 6.006674082313683e-06, "loss": 0.088, "step": 540 }, { "epoch": 0.18066455167807646, "grad_norm": 0.9436435794531032, "learning_rate": 6.0177975528364855e-06, "loss": 0.068, "step": 541 }, { "epoch": 0.18099849724494907, "grad_norm": 1.4543330724458545, "learning_rate": 6.028921023359289e-06, "loss": 0.0814, "step": 542 }, { "epoch": 0.18133244281182168, "grad_norm": 1.2058456355383562, "learning_rate": 6.040044493882091e-06, "loss": 0.0829, "step": 543 }, { "epoch": 0.18166638837869428, "grad_norm": 0.8900032807415988, "learning_rate": 6.0511679644048945e-06, "loss": 0.0658, "step": 544 }, { "epoch": 0.18200033394556686, "grad_norm": 1.025438934220678, "learning_rate": 6.062291434927698e-06, "loss": 0.0685, "step": 545 }, { "epoch": 0.18233427951243947, "grad_norm": 1.488594859100904, "learning_rate": 6.073414905450501e-06, "loss": 0.0802, "step": 546 }, { "epoch": 0.18266822507931207, "grad_norm": 1.077167826841067, "learning_rate": 6.084538375973304e-06, "loss": 0.0692, "step": 547 }, { "epoch": 0.18300217064618468, "grad_norm": 0.8359382817432315, "learning_rate": 6.095661846496107e-06, "loss": 0.0617, "step": 548 }, { "epoch": 0.18333611621305726, "grad_norm": 0.9359938494704068, "learning_rate": 6.1067853170189106e-06, "loss": 0.0598, "step": 549 }, { "epoch": 0.18367006177992987, "grad_norm": 0.898094211129922, "learning_rate": 6.117908787541713e-06, "loss": 0.0715, "step": 550 }, { "epoch": 0.18400400734680247, "grad_norm": 0.8085241518785767, "learning_rate": 6.129032258064517e-06, "loss": 0.0611, "step": 551 }, { "epoch": 0.18433795291367508, "grad_norm": 0.835628986807132, "learning_rate": 6.1401557285873195e-06, "loss": 0.0545, "step": 552 }, { "epoch": 0.18467189848054766, "grad_norm": 0.9870690066587878, "learning_rate": 6.151279199110123e-06, "loss": 0.0555, "step": 553 }, { "epoch": 0.18500584404742026, "grad_norm": 0.7044338384587818, "learning_rate": 6.162402669632927e-06, "loss": 0.0563, "step": 554 }, { "epoch": 0.18533978961429287, "grad_norm": 0.9421964678475573, "learning_rate": 6.173526140155729e-06, "loss": 0.0694, "step": 555 }, { "epoch": 0.18567373518116548, "grad_norm": 0.7531432153729787, "learning_rate": 6.184649610678533e-06, "loss": 0.0398, "step": 556 }, { "epoch": 0.18600768074803806, "grad_norm": 0.7178016597672993, "learning_rate": 6.195773081201335e-06, "loss": 0.0527, "step": 557 }, { "epoch": 0.18634162631491066, "grad_norm": 0.6870955058429726, "learning_rate": 6.206896551724138e-06, "loss": 0.0515, "step": 558 }, { "epoch": 0.18667557188178327, "grad_norm": 1.058584370525592, "learning_rate": 6.218020022246941e-06, "loss": 0.0716, "step": 559 }, { "epoch": 0.18700951744865588, "grad_norm": 0.793793339054818, "learning_rate": 6.229143492769745e-06, "loss": 0.059, "step": 560 }, { "epoch": 0.18734346301552846, "grad_norm": 0.7921846601688673, "learning_rate": 6.240266963292548e-06, "loss": 0.0571, "step": 561 }, { "epoch": 0.18767740858240106, "grad_norm": 1.0822954840038115, "learning_rate": 6.251390433815351e-06, "loss": 0.0808, "step": 562 }, { "epoch": 0.18801135414927367, "grad_norm": 1.1838974205578672, "learning_rate": 6.262513904338154e-06, "loss": 0.0802, "step": 563 }, { "epoch": 0.18834529971614627, "grad_norm": 0.6664360867051361, "learning_rate": 6.273637374860957e-06, "loss": 0.0423, "step": 564 }, { "epoch": 0.18867924528301888, "grad_norm": 1.12975769644805, "learning_rate": 6.284760845383761e-06, "loss": 0.059, "step": 565 }, { "epoch": 0.18901319084989146, "grad_norm": 0.9023368076115867, "learning_rate": 6.295884315906563e-06, "loss": 0.0663, "step": 566 }, { "epoch": 0.18934713641676407, "grad_norm": 0.7070819040726736, "learning_rate": 6.307007786429367e-06, "loss": 0.0599, "step": 567 }, { "epoch": 0.18968108198363667, "grad_norm": 0.7167383442202009, "learning_rate": 6.318131256952169e-06, "loss": 0.0547, "step": 568 }, { "epoch": 0.19001502755050928, "grad_norm": 0.8667596619045079, "learning_rate": 6.329254727474972e-06, "loss": 0.0577, "step": 569 }, { "epoch": 0.19034897311738186, "grad_norm": 0.6903866651198595, "learning_rate": 6.340378197997776e-06, "loss": 0.0541, "step": 570 }, { "epoch": 0.19068291868425447, "grad_norm": 0.8652401120314771, "learning_rate": 6.351501668520579e-06, "loss": 0.0675, "step": 571 }, { "epoch": 0.19101686425112707, "grad_norm": 0.9567786258900463, "learning_rate": 6.362625139043382e-06, "loss": 0.0635, "step": 572 }, { "epoch": 0.19135080981799968, "grad_norm": 1.4032940279907142, "learning_rate": 6.373748609566185e-06, "loss": 0.0721, "step": 573 }, { "epoch": 0.19168475538487226, "grad_norm": 1.1264541824058907, "learning_rate": 6.3848720800889884e-06, "loss": 0.0686, "step": 574 }, { "epoch": 0.19201870095174486, "grad_norm": 1.0736034358506426, "learning_rate": 6.395995550611791e-06, "loss": 0.0873, "step": 575 }, { "epoch": 0.19235264651861747, "grad_norm": 0.8630872636820003, "learning_rate": 6.407119021134595e-06, "loss": 0.0582, "step": 576 }, { "epoch": 0.19268659208549008, "grad_norm": 1.5039433166171976, "learning_rate": 6.418242491657397e-06, "loss": 0.0895, "step": 577 }, { "epoch": 0.19302053765236266, "grad_norm": 0.658053148089993, "learning_rate": 6.429365962180201e-06, "loss": 0.0482, "step": 578 }, { "epoch": 0.19335448321923526, "grad_norm": 0.6038070862576609, "learning_rate": 6.4404894327030045e-06, "loss": 0.0407, "step": 579 }, { "epoch": 0.19368842878610787, "grad_norm": 0.8352398964922494, "learning_rate": 6.451612903225806e-06, "loss": 0.0594, "step": 580 }, { "epoch": 0.19402237435298048, "grad_norm": 1.1267488159998478, "learning_rate": 6.462736373748611e-06, "loss": 0.074, "step": 581 }, { "epoch": 0.19435631991985305, "grad_norm": 0.721628757355493, "learning_rate": 6.473859844271413e-06, "loss": 0.0678, "step": 582 }, { "epoch": 0.19469026548672566, "grad_norm": 0.648380978198407, "learning_rate": 6.484983314794216e-06, "loss": 0.0756, "step": 583 }, { "epoch": 0.19502421105359827, "grad_norm": 0.6572366191878652, "learning_rate": 6.496106785317019e-06, "loss": 0.0564, "step": 584 }, { "epoch": 0.19535815662047087, "grad_norm": 0.6746106748324757, "learning_rate": 6.5072302558398225e-06, "loss": 0.0601, "step": 585 }, { "epoch": 0.19569210218734345, "grad_norm": 0.9506138166905724, "learning_rate": 6.518353726362626e-06, "loss": 0.0729, "step": 586 }, { "epoch": 0.19602604775421606, "grad_norm": 0.7944011705745151, "learning_rate": 6.529477196885429e-06, "loss": 0.0825, "step": 587 }, { "epoch": 0.19635999332108867, "grad_norm": 0.7265582927727529, "learning_rate": 6.540600667408232e-06, "loss": 0.0751, "step": 588 }, { "epoch": 0.19669393888796127, "grad_norm": 1.3335730160125474, "learning_rate": 6.551724137931035e-06, "loss": 0.0569, "step": 589 }, { "epoch": 0.19702788445483385, "grad_norm": 0.7315873291244676, "learning_rate": 6.5628476084538385e-06, "loss": 0.0462, "step": 590 }, { "epoch": 0.19736183002170646, "grad_norm": 0.7935447081743017, "learning_rate": 6.573971078976641e-06, "loss": 0.0528, "step": 591 }, { "epoch": 0.19769577558857906, "grad_norm": 0.6859243667795377, "learning_rate": 6.585094549499445e-06, "loss": 0.0526, "step": 592 }, { "epoch": 0.19802972115545167, "grad_norm": 0.7655199162301891, "learning_rate": 6.596218020022247e-06, "loss": 0.0584, "step": 593 }, { "epoch": 0.19836366672232425, "grad_norm": 1.0167741029460753, "learning_rate": 6.60734149054505e-06, "loss": 0.0645, "step": 594 }, { "epoch": 0.19869761228919686, "grad_norm": 0.969233187245511, "learning_rate": 6.618464961067854e-06, "loss": 0.0765, "step": 595 }, { "epoch": 0.19903155785606946, "grad_norm": 0.8910946717229351, "learning_rate": 6.6295884315906565e-06, "loss": 0.0531, "step": 596 }, { "epoch": 0.19936550342294207, "grad_norm": 0.7392381351339108, "learning_rate": 6.64071190211346e-06, "loss": 0.056, "step": 597 }, { "epoch": 0.19969944898981465, "grad_norm": 0.6524425989201603, "learning_rate": 6.651835372636263e-06, "loss": 0.059, "step": 598 }, { "epoch": 0.20003339455668726, "grad_norm": 0.8142443247115522, "learning_rate": 6.662958843159066e-06, "loss": 0.0686, "step": 599 }, { "epoch": 0.20036734012355986, "grad_norm": 0.6323681488217677, "learning_rate": 6.674082313681869e-06, "loss": 0.0488, "step": 600 }, { "epoch": 0.20070128569043247, "grad_norm": 0.8041917413196822, "learning_rate": 6.6852057842046726e-06, "loss": 0.0571, "step": 601 }, { "epoch": 0.20103523125730505, "grad_norm": 0.5893400857741306, "learning_rate": 6.696329254727475e-06, "loss": 0.0445, "step": 602 }, { "epoch": 0.20136917682417765, "grad_norm": 0.7214738998710117, "learning_rate": 6.707452725250279e-06, "loss": 0.0744, "step": 603 }, { "epoch": 0.20170312239105026, "grad_norm": 1.2233939320369276, "learning_rate": 6.718576195773082e-06, "loss": 0.0653, "step": 604 }, { "epoch": 0.20203706795792287, "grad_norm": 0.8197861276964413, "learning_rate": 6.729699666295884e-06, "loss": 0.0617, "step": 605 }, { "epoch": 0.20237101352479545, "grad_norm": 0.9144895067312468, "learning_rate": 6.740823136818689e-06, "loss": 0.0467, "step": 606 }, { "epoch": 0.20270495909166805, "grad_norm": 0.8306665062152114, "learning_rate": 6.7519466073414905e-06, "loss": 0.0522, "step": 607 }, { "epoch": 0.20303890465854066, "grad_norm": 0.6614365082598056, "learning_rate": 6.763070077864294e-06, "loss": 0.0511, "step": 608 }, { "epoch": 0.20337285022541327, "grad_norm": 0.713950052001751, "learning_rate": 6.774193548387097e-06, "loss": 0.0621, "step": 609 }, { "epoch": 0.20370679579228584, "grad_norm": 0.8905334489243183, "learning_rate": 6.7853170189099e-06, "loss": 0.0589, "step": 610 }, { "epoch": 0.20404074135915845, "grad_norm": 0.6543811592847132, "learning_rate": 6.796440489432704e-06, "loss": 0.0514, "step": 611 }, { "epoch": 0.20437468692603106, "grad_norm": 0.9007985574710429, "learning_rate": 6.807563959955507e-06, "loss": 0.0497, "step": 612 }, { "epoch": 0.20470863249290366, "grad_norm": 0.9073568133724813, "learning_rate": 6.81868743047831e-06, "loss": 0.0692, "step": 613 }, { "epoch": 0.20504257805977624, "grad_norm": 0.6067895191197412, "learning_rate": 6.829810901001113e-06, "loss": 0.0494, "step": 614 }, { "epoch": 0.20537652362664885, "grad_norm": 1.1233152951341259, "learning_rate": 6.840934371523916e-06, "loss": 0.0601, "step": 615 }, { "epoch": 0.20571046919352146, "grad_norm": 0.7913630031379069, "learning_rate": 6.852057842046719e-06, "loss": 0.0443, "step": 616 }, { "epoch": 0.20604441476039406, "grad_norm": 0.6710216282911835, "learning_rate": 6.863181312569523e-06, "loss": 0.0487, "step": 617 }, { "epoch": 0.20637836032726667, "grad_norm": 0.5492071484783791, "learning_rate": 6.8743047830923245e-06, "loss": 0.0515, "step": 618 }, { "epoch": 0.20671230589413925, "grad_norm": 0.725700461458707, "learning_rate": 6.885428253615128e-06, "loss": 0.0443, "step": 619 }, { "epoch": 0.20704625146101185, "grad_norm": 0.730493977320189, "learning_rate": 6.896551724137932e-06, "loss": 0.0514, "step": 620 }, { "epoch": 0.20738019702788446, "grad_norm": 0.629676391292487, "learning_rate": 6.907675194660734e-06, "loss": 0.0466, "step": 621 }, { "epoch": 0.20771414259475707, "grad_norm": 0.9439772579694213, "learning_rate": 6.918798665183538e-06, "loss": 0.0593, "step": 622 }, { "epoch": 0.20804808816162965, "grad_norm": 0.6727789860352383, "learning_rate": 6.929922135706341e-06, "loss": 0.0714, "step": 623 }, { "epoch": 0.20838203372850225, "grad_norm": 0.7216726522421172, "learning_rate": 6.941045606229144e-06, "loss": 0.0605, "step": 624 }, { "epoch": 0.20871597929537486, "grad_norm": 0.554147090395553, "learning_rate": 6.952169076751947e-06, "loss": 0.0414, "step": 625 }, { "epoch": 0.20904992486224747, "grad_norm": 0.8311935206386598, "learning_rate": 6.9632925472747504e-06, "loss": 0.0578, "step": 626 }, { "epoch": 0.20938387042912004, "grad_norm": 0.8000074593969697, "learning_rate": 6.974416017797554e-06, "loss": 0.0584, "step": 627 }, { "epoch": 0.20971781599599265, "grad_norm": 0.9722412112123996, "learning_rate": 6.985539488320357e-06, "loss": 0.0609, "step": 628 }, { "epoch": 0.21005176156286526, "grad_norm": 0.878780487308983, "learning_rate": 6.99666295884316e-06, "loss": 0.0672, "step": 629 }, { "epoch": 0.21038570712973786, "grad_norm": 0.6398612017056129, "learning_rate": 7.007786429365962e-06, "loss": 0.0595, "step": 630 }, { "epoch": 0.21071965269661044, "grad_norm": 1.3820823508166513, "learning_rate": 7.0189098998887665e-06, "loss": 0.0641, "step": 631 }, { "epoch": 0.21105359826348305, "grad_norm": 0.8389581412239546, "learning_rate": 7.030033370411568e-06, "loss": 0.0534, "step": 632 }, { "epoch": 0.21138754383035566, "grad_norm": 1.235867509288309, "learning_rate": 7.041156840934372e-06, "loss": 0.0649, "step": 633 }, { "epoch": 0.21172148939722826, "grad_norm": 0.6822686682534393, "learning_rate": 7.052280311457175e-06, "loss": 0.0496, "step": 634 }, { "epoch": 0.21205543496410084, "grad_norm": 0.677863381365653, "learning_rate": 7.063403781979978e-06, "loss": 0.0524, "step": 635 }, { "epoch": 0.21238938053097345, "grad_norm": 0.5616965372733239, "learning_rate": 7.074527252502782e-06, "loss": 0.045, "step": 636 }, { "epoch": 0.21272332609784605, "grad_norm": 0.9305795338913498, "learning_rate": 7.0856507230255845e-06, "loss": 0.0598, "step": 637 }, { "epoch": 0.21305727166471866, "grad_norm": 1.148754024371969, "learning_rate": 7.096774193548388e-06, "loss": 0.0851, "step": 638 }, { "epoch": 0.21339121723159124, "grad_norm": 0.6771383882990121, "learning_rate": 7.107897664071191e-06, "loss": 0.057, "step": 639 }, { "epoch": 0.21372516279846385, "grad_norm": 0.7326053693987247, "learning_rate": 7.119021134593994e-06, "loss": 0.0378, "step": 640 }, { "epoch": 0.21405910836533645, "grad_norm": 0.8642206474374593, "learning_rate": 7.130144605116797e-06, "loss": 0.0633, "step": 641 }, { "epoch": 0.21439305393220906, "grad_norm": 0.689691985990761, "learning_rate": 7.1412680756396006e-06, "loss": 0.0507, "step": 642 }, { "epoch": 0.21472699949908164, "grad_norm": 0.5505119046969866, "learning_rate": 7.152391546162402e-06, "loss": 0.0481, "step": 643 }, { "epoch": 0.21506094506595425, "grad_norm": 0.7259987823104186, "learning_rate": 7.163515016685206e-06, "loss": 0.054, "step": 644 }, { "epoch": 0.21539489063282685, "grad_norm": 0.9492663396246556, "learning_rate": 7.1746384872080095e-06, "loss": 0.0765, "step": 645 }, { "epoch": 0.21572883619969946, "grad_norm": 1.1774451315327281, "learning_rate": 7.185761957730812e-06, "loss": 0.0643, "step": 646 }, { "epoch": 0.21606278176657204, "grad_norm": 0.7103830226184474, "learning_rate": 7.196885428253616e-06, "loss": 0.0476, "step": 647 }, { "epoch": 0.21639672733344464, "grad_norm": 0.7935594037529661, "learning_rate": 7.2080088987764185e-06, "loss": 0.0523, "step": 648 }, { "epoch": 0.21673067290031725, "grad_norm": 0.6896064477390729, "learning_rate": 7.219132369299222e-06, "loss": 0.0655, "step": 649 }, { "epoch": 0.21706461846718986, "grad_norm": 0.8771285074850425, "learning_rate": 7.230255839822025e-06, "loss": 0.0722, "step": 650 }, { "epoch": 0.21739856403406244, "grad_norm": 0.6004195294857952, "learning_rate": 7.241379310344828e-06, "loss": 0.0442, "step": 651 }, { "epoch": 0.21773250960093504, "grad_norm": 1.491334243239559, "learning_rate": 7.252502780867632e-06, "loss": 0.0693, "step": 652 }, { "epoch": 0.21806645516780765, "grad_norm": 1.0264363212929928, "learning_rate": 7.263626251390435e-06, "loss": 0.0589, "step": 653 }, { "epoch": 0.21840040073468026, "grad_norm": 0.6937073911174437, "learning_rate": 7.274749721913238e-06, "loss": 0.0485, "step": 654 }, { "epoch": 0.21873434630155283, "grad_norm": 0.6813217875877775, "learning_rate": 7.28587319243604e-06, "loss": 0.0468, "step": 655 }, { "epoch": 0.21906829186842544, "grad_norm": 0.8468751024073949, "learning_rate": 7.296996662958844e-06, "loss": 0.0473, "step": 656 }, { "epoch": 0.21940223743529805, "grad_norm": 0.5826196253520607, "learning_rate": 7.308120133481646e-06, "loss": 0.0362, "step": 657 }, { "epoch": 0.21973618300217065, "grad_norm": 0.8823320801298193, "learning_rate": 7.31924360400445e-06, "loss": 0.0696, "step": 658 }, { "epoch": 0.22007012856904323, "grad_norm": 0.6752508644503113, "learning_rate": 7.3303670745272525e-06, "loss": 0.0597, "step": 659 }, { "epoch": 0.22040407413591584, "grad_norm": 0.630584714988691, "learning_rate": 7.341490545050056e-06, "loss": 0.0462, "step": 660 }, { "epoch": 0.22073801970278845, "grad_norm": 1.1380911578526518, "learning_rate": 7.35261401557286e-06, "loss": 0.0716, "step": 661 }, { "epoch": 0.22107196526966105, "grad_norm": 0.8225765908938263, "learning_rate": 7.363737486095662e-06, "loss": 0.0871, "step": 662 }, { "epoch": 0.22140591083653363, "grad_norm": 0.6508406370720058, "learning_rate": 7.374860956618466e-06, "loss": 0.0538, "step": 663 }, { "epoch": 0.22173985640340624, "grad_norm": 0.6498773981616411, "learning_rate": 7.385984427141269e-06, "loss": 0.0421, "step": 664 }, { "epoch": 0.22207380197027884, "grad_norm": 0.5632972140069444, "learning_rate": 7.397107897664072e-06, "loss": 0.0401, "step": 665 }, { "epoch": 0.22240774753715145, "grad_norm": 0.6955201001818666, "learning_rate": 7.408231368186875e-06, "loss": 0.0515, "step": 666 }, { "epoch": 0.22274169310402406, "grad_norm": 0.9245962605688041, "learning_rate": 7.4193548387096784e-06, "loss": 0.0658, "step": 667 }, { "epoch": 0.22307563867089664, "grad_norm": 0.8566192824542013, "learning_rate": 7.43047830923248e-06, "loss": 0.0644, "step": 668 }, { "epoch": 0.22340958423776924, "grad_norm": 0.7879617681877166, "learning_rate": 7.441601779755284e-06, "loss": 0.08, "step": 669 }, { "epoch": 0.22374352980464185, "grad_norm": 0.5720327250980868, "learning_rate": 7.452725250278087e-06, "loss": 0.0602, "step": 670 }, { "epoch": 0.22407747537151446, "grad_norm": 0.7243965181151018, "learning_rate": 7.46384872080089e-06, "loss": 0.0538, "step": 671 }, { "epoch": 0.22441142093838704, "grad_norm": 0.6765854869756044, "learning_rate": 7.474972191323694e-06, "loss": 0.0438, "step": 672 }, { "epoch": 0.22474536650525964, "grad_norm": 1.1551543065277514, "learning_rate": 7.486095661846496e-06, "loss": 0.0641, "step": 673 }, { "epoch": 0.22507931207213225, "grad_norm": 1.2471246659573005, "learning_rate": 7.4972191323693e-06, "loss": 0.0954, "step": 674 }, { "epoch": 0.22541325763900485, "grad_norm": 0.717581831189048, "learning_rate": 7.508342602892103e-06, "loss": 0.0717, "step": 675 }, { "epoch": 0.22574720320587743, "grad_norm": 1.4383898751001543, "learning_rate": 7.519466073414906e-06, "loss": 0.0681, "step": 676 }, { "epoch": 0.22608114877275004, "grad_norm": 0.6278799747313417, "learning_rate": 7.53058954393771e-06, "loss": 0.0368, "step": 677 }, { "epoch": 0.22641509433962265, "grad_norm": 1.078681505592307, "learning_rate": 7.5417130144605125e-06, "loss": 0.055, "step": 678 }, { "epoch": 0.22674903990649525, "grad_norm": 0.8391619044412681, "learning_rate": 7.552836484983316e-06, "loss": 0.057, "step": 679 }, { "epoch": 0.22708298547336783, "grad_norm": 1.0326612676142242, "learning_rate": 7.563959955506118e-06, "loss": 0.0546, "step": 680 }, { "epoch": 0.22741693104024044, "grad_norm": 0.6535947488360222, "learning_rate": 7.575083426028922e-06, "loss": 0.0574, "step": 681 }, { "epoch": 0.22775087660711305, "grad_norm": 0.814336343141877, "learning_rate": 7.586206896551724e-06, "loss": 0.0697, "step": 682 }, { "epoch": 0.22808482217398565, "grad_norm": 0.8931605432690218, "learning_rate": 7.597330367074528e-06, "loss": 0.0581, "step": 683 }, { "epoch": 0.22841876774085823, "grad_norm": 0.698447467488072, "learning_rate": 7.60845383759733e-06, "loss": 0.0501, "step": 684 }, { "epoch": 0.22875271330773084, "grad_norm": 0.8023761432510702, "learning_rate": 7.619577308120134e-06, "loss": 0.0847, "step": 685 }, { "epoch": 0.22908665887460344, "grad_norm": 0.6095282065363956, "learning_rate": 7.630700778642938e-06, "loss": 0.062, "step": 686 }, { "epoch": 0.22942060444147605, "grad_norm": 0.4774849614665295, "learning_rate": 7.64182424916574e-06, "loss": 0.0532, "step": 687 }, { "epoch": 0.22975455000834863, "grad_norm": 0.5898418038186112, "learning_rate": 7.652947719688543e-06, "loss": 0.0516, "step": 688 }, { "epoch": 0.23008849557522124, "grad_norm": 0.8783761532562184, "learning_rate": 7.664071190211346e-06, "loss": 0.0693, "step": 689 }, { "epoch": 0.23042244114209384, "grad_norm": 0.6022197037001502, "learning_rate": 7.67519466073415e-06, "loss": 0.0441, "step": 690 }, { "epoch": 0.23075638670896645, "grad_norm": 0.7320630483302178, "learning_rate": 7.686318131256953e-06, "loss": 0.0645, "step": 691 }, { "epoch": 0.23109033227583903, "grad_norm": 0.7168522057955218, "learning_rate": 7.697441601779755e-06, "loss": 0.0585, "step": 692 }, { "epoch": 0.23142427784271163, "grad_norm": 0.6468450826874373, "learning_rate": 7.70856507230256e-06, "loss": 0.0438, "step": 693 }, { "epoch": 0.23175822340958424, "grad_norm": 0.6036146013357945, "learning_rate": 7.719688542825363e-06, "loss": 0.0351, "step": 694 }, { "epoch": 0.23209216897645685, "grad_norm": 0.6184625300068487, "learning_rate": 7.730812013348165e-06, "loss": 0.054, "step": 695 }, { "epoch": 0.23242611454332943, "grad_norm": 0.7161536721534412, "learning_rate": 7.741935483870968e-06, "loss": 0.0498, "step": 696 }, { "epoch": 0.23276006011020203, "grad_norm": 0.92662755009442, "learning_rate": 7.753058954393772e-06, "loss": 0.0619, "step": 697 }, { "epoch": 0.23309400567707464, "grad_norm": 0.9741063837144837, "learning_rate": 7.764182424916575e-06, "loss": 0.0615, "step": 698 }, { "epoch": 0.23342795124394725, "grad_norm": 0.6571626264384028, "learning_rate": 7.775305895439378e-06, "loss": 0.0691, "step": 699 }, { "epoch": 0.23376189681081982, "grad_norm": 0.6623436201391426, "learning_rate": 7.78642936596218e-06, "loss": 0.0631, "step": 700 }, { "epoch": 0.23409584237769243, "grad_norm": 0.5671382014262107, "learning_rate": 7.797552836484983e-06, "loss": 0.0528, "step": 701 }, { "epoch": 0.23442978794456504, "grad_norm": 0.5186931729414294, "learning_rate": 7.808676307007788e-06, "loss": 0.0473, "step": 702 }, { "epoch": 0.23476373351143764, "grad_norm": 0.6306362158538241, "learning_rate": 7.81979977753059e-06, "loss": 0.0552, "step": 703 }, { "epoch": 0.23509767907831022, "grad_norm": 0.5466189227341881, "learning_rate": 7.830923248053393e-06, "loss": 0.0568, "step": 704 }, { "epoch": 0.23543162464518283, "grad_norm": 0.5613237493646677, "learning_rate": 7.842046718576196e-06, "loss": 0.0546, "step": 705 }, { "epoch": 0.23576557021205544, "grad_norm": 0.5138027671363771, "learning_rate": 7.853170189099e-06, "loss": 0.0455, "step": 706 }, { "epoch": 0.23609951577892804, "grad_norm": 1.0000382007451012, "learning_rate": 7.864293659621803e-06, "loss": 0.0815, "step": 707 }, { "epoch": 0.23643346134580062, "grad_norm": 0.5309889504993703, "learning_rate": 7.875417130144606e-06, "loss": 0.0415, "step": 708 }, { "epoch": 0.23676740691267323, "grad_norm": 0.601523770166247, "learning_rate": 7.886540600667408e-06, "loss": 0.0424, "step": 709 }, { "epoch": 0.23710135247954583, "grad_norm": 0.5614576890827592, "learning_rate": 7.897664071190213e-06, "loss": 0.0511, "step": 710 }, { "epoch": 0.23743529804641844, "grad_norm": 1.1787590770393492, "learning_rate": 7.908787541713015e-06, "loss": 0.0555, "step": 711 }, { "epoch": 0.23776924361329102, "grad_norm": 0.6857456493243878, "learning_rate": 7.919911012235818e-06, "loss": 0.0489, "step": 712 }, { "epoch": 0.23810318918016363, "grad_norm": 0.6570662490997047, "learning_rate": 7.93103448275862e-06, "loss": 0.0632, "step": 713 }, { "epoch": 0.23843713474703623, "grad_norm": 0.6206550500253379, "learning_rate": 7.942157953281424e-06, "loss": 0.0581, "step": 714 }, { "epoch": 0.23877108031390884, "grad_norm": 0.6819702882562815, "learning_rate": 7.953281423804228e-06, "loss": 0.0773, "step": 715 }, { "epoch": 0.23910502588078142, "grad_norm": 0.6584842233833688, "learning_rate": 7.96440489432703e-06, "loss": 0.0606, "step": 716 }, { "epoch": 0.23943897144765403, "grad_norm": 0.708678681452552, "learning_rate": 7.975528364849833e-06, "loss": 0.0527, "step": 717 }, { "epoch": 0.23977291701452663, "grad_norm": 0.6559013503580498, "learning_rate": 7.986651835372638e-06, "loss": 0.0468, "step": 718 }, { "epoch": 0.24010686258139924, "grad_norm": 0.6089295320922504, "learning_rate": 7.99777530589544e-06, "loss": 0.0469, "step": 719 }, { "epoch": 0.24044080814827185, "grad_norm": 0.5421853109439192, "learning_rate": 8.008898776418243e-06, "loss": 0.0425, "step": 720 }, { "epoch": 0.24077475371514442, "grad_norm": 0.6646654479239261, "learning_rate": 8.020022246941046e-06, "loss": 0.0526, "step": 721 }, { "epoch": 0.24110869928201703, "grad_norm": 0.5909129859046834, "learning_rate": 8.03114571746385e-06, "loss": 0.0578, "step": 722 }, { "epoch": 0.24144264484888964, "grad_norm": 0.5465817247840761, "learning_rate": 8.042269187986651e-06, "loss": 0.0381, "step": 723 }, { "epoch": 0.24177659041576224, "grad_norm": 0.5663522143293497, "learning_rate": 8.053392658509456e-06, "loss": 0.0456, "step": 724 }, { "epoch": 0.24211053598263482, "grad_norm": 0.7099755585096478, "learning_rate": 8.064516129032258e-06, "loss": 0.0553, "step": 725 }, { "epoch": 0.24244448154950743, "grad_norm": 0.4703416320234455, "learning_rate": 8.075639599555061e-06, "loss": 0.0314, "step": 726 }, { "epoch": 0.24277842711638004, "grad_norm": 0.748131315704341, "learning_rate": 8.086763070077866e-06, "loss": 0.0575, "step": 727 }, { "epoch": 0.24311237268325264, "grad_norm": 0.7959707312517568, "learning_rate": 8.097886540600668e-06, "loss": 0.0522, "step": 728 }, { "epoch": 0.24344631825012522, "grad_norm": 0.7143826412790298, "learning_rate": 8.109010011123471e-06, "loss": 0.0603, "step": 729 }, { "epoch": 0.24378026381699783, "grad_norm": 0.7546112375024333, "learning_rate": 8.120133481646274e-06, "loss": 0.0447, "step": 730 }, { "epoch": 0.24411420938387043, "grad_norm": 1.320002478448852, "learning_rate": 8.131256952169078e-06, "loss": 0.0613, "step": 731 }, { "epoch": 0.24444815495074304, "grad_norm": 0.9123876302024165, "learning_rate": 8.14238042269188e-06, "loss": 0.0555, "step": 732 }, { "epoch": 0.24478210051761562, "grad_norm": 1.0679687444316195, "learning_rate": 8.153503893214683e-06, "loss": 0.0661, "step": 733 }, { "epoch": 0.24511604608448823, "grad_norm": 0.5913377615797212, "learning_rate": 8.164627363737486e-06, "loss": 0.0536, "step": 734 }, { "epoch": 0.24544999165136083, "grad_norm": 0.6908337267132758, "learning_rate": 8.17575083426029e-06, "loss": 0.0545, "step": 735 }, { "epoch": 0.24578393721823344, "grad_norm": 0.7525711702575639, "learning_rate": 8.186874304783093e-06, "loss": 0.0422, "step": 736 }, { "epoch": 0.24611788278510602, "grad_norm": 0.6795534047118373, "learning_rate": 8.197997775305896e-06, "loss": 0.0435, "step": 737 }, { "epoch": 0.24645182835197862, "grad_norm": 0.7419691167366567, "learning_rate": 8.209121245828699e-06, "loss": 0.0458, "step": 738 }, { "epoch": 0.24678577391885123, "grad_norm": 0.9596461385373865, "learning_rate": 8.220244716351501e-06, "loss": 0.0529, "step": 739 }, { "epoch": 0.24711971948572384, "grad_norm": 1.0129102887265393, "learning_rate": 8.231368186874306e-06, "loss": 0.0739, "step": 740 }, { "epoch": 0.24745366505259642, "grad_norm": 1.227881171847166, "learning_rate": 8.242491657397109e-06, "loss": 0.0622, "step": 741 }, { "epoch": 0.24778761061946902, "grad_norm": 0.6383626322611397, "learning_rate": 8.253615127919911e-06, "loss": 0.0607, "step": 742 }, { "epoch": 0.24812155618634163, "grad_norm": 0.9642447588805736, "learning_rate": 8.264738598442716e-06, "loss": 0.0644, "step": 743 }, { "epoch": 0.24845550175321424, "grad_norm": 0.8593161289423901, "learning_rate": 8.275862068965518e-06, "loss": 0.0537, "step": 744 }, { "epoch": 0.24878944732008682, "grad_norm": 0.8432266000953739, "learning_rate": 8.286985539488321e-06, "loss": 0.0653, "step": 745 }, { "epoch": 0.24912339288695942, "grad_norm": 0.6532807195982379, "learning_rate": 8.298109010011124e-06, "loss": 0.0575, "step": 746 }, { "epoch": 0.24945733845383203, "grad_norm": 0.8080274258567304, "learning_rate": 8.309232480533928e-06, "loss": 0.0607, "step": 747 }, { "epoch": 0.24979128402070463, "grad_norm": 0.7451776946200317, "learning_rate": 8.32035595105673e-06, "loss": 0.0688, "step": 748 }, { "epoch": 0.2501252295875772, "grad_norm": 1.3723617173501166, "learning_rate": 8.331479421579534e-06, "loss": 0.0716, "step": 749 }, { "epoch": 0.2504591751544498, "grad_norm": 0.8665671289199542, "learning_rate": 8.342602892102336e-06, "loss": 0.045, "step": 750 }, { "epoch": 0.2507931207213224, "grad_norm": 1.0379503434218704, "learning_rate": 8.353726362625139e-06, "loss": 0.0583, "step": 751 }, { "epoch": 0.25112706628819503, "grad_norm": 1.1391729732452065, "learning_rate": 8.364849833147943e-06, "loss": 0.0662, "step": 752 }, { "epoch": 0.25146101185506764, "grad_norm": 0.8156416909847637, "learning_rate": 8.375973303670746e-06, "loss": 0.0554, "step": 753 }, { "epoch": 0.25179495742194025, "grad_norm": 1.2010427348455242, "learning_rate": 8.387096774193549e-06, "loss": 0.0748, "step": 754 }, { "epoch": 0.2521289029888128, "grad_norm": 0.9559734645728132, "learning_rate": 8.398220244716352e-06, "loss": 0.0617, "step": 755 }, { "epoch": 0.2524628485556854, "grad_norm": 1.0341990720233247, "learning_rate": 8.409343715239156e-06, "loss": 0.0584, "step": 756 }, { "epoch": 0.252796794122558, "grad_norm": 0.6904656633504147, "learning_rate": 8.420467185761959e-06, "loss": 0.0523, "step": 757 }, { "epoch": 0.2531307396894306, "grad_norm": 0.6536837531277213, "learning_rate": 8.431590656284761e-06, "loss": 0.043, "step": 758 }, { "epoch": 0.2534646852563032, "grad_norm": 1.0547443803828938, "learning_rate": 8.442714126807566e-06, "loss": 0.0428, "step": 759 }, { "epoch": 0.25379863082317583, "grad_norm": 0.8767180736789091, "learning_rate": 8.453837597330368e-06, "loss": 0.0492, "step": 760 }, { "epoch": 0.25413257639004844, "grad_norm": 0.7923571000035343, "learning_rate": 8.464961067853171e-06, "loss": 0.0644, "step": 761 }, { "epoch": 0.25446652195692104, "grad_norm": 1.170351442324846, "learning_rate": 8.476084538375974e-06, "loss": 0.0725, "step": 762 }, { "epoch": 0.2548004675237936, "grad_norm": 0.6763299749127198, "learning_rate": 8.487208008898777e-06, "loss": 0.0526, "step": 763 }, { "epoch": 0.2551344130906662, "grad_norm": 0.6234066579891239, "learning_rate": 8.49833147942158e-06, "loss": 0.0565, "step": 764 }, { "epoch": 0.2554683586575388, "grad_norm": 0.9791934529286336, "learning_rate": 8.509454949944384e-06, "loss": 0.0602, "step": 765 }, { "epoch": 0.2558023042244114, "grad_norm": 0.9128970785663827, "learning_rate": 8.520578420467186e-06, "loss": 0.0603, "step": 766 }, { "epoch": 0.256136249791284, "grad_norm": 0.7932800291438664, "learning_rate": 8.531701890989989e-06, "loss": 0.0515, "step": 767 }, { "epoch": 0.2564701953581566, "grad_norm": 0.5921505126585687, "learning_rate": 8.542825361512793e-06, "loss": 0.0444, "step": 768 }, { "epoch": 0.25680414092502923, "grad_norm": 0.6802827785195577, "learning_rate": 8.553948832035596e-06, "loss": 0.055, "step": 769 }, { "epoch": 0.25713808649190184, "grad_norm": 0.6020518899742204, "learning_rate": 8.565072302558399e-06, "loss": 0.0499, "step": 770 }, { "epoch": 0.25747203205877445, "grad_norm": 0.8934328993348921, "learning_rate": 8.576195773081202e-06, "loss": 0.0522, "step": 771 }, { "epoch": 0.257805977625647, "grad_norm": 0.8499029920395502, "learning_rate": 8.587319243604006e-06, "loss": 0.0676, "step": 772 }, { "epoch": 0.2581399231925196, "grad_norm": 0.6943521402387964, "learning_rate": 8.598442714126807e-06, "loss": 0.0407, "step": 773 }, { "epoch": 0.2584738687593922, "grad_norm": 0.6458829421725016, "learning_rate": 8.609566184649611e-06, "loss": 0.044, "step": 774 }, { "epoch": 0.2588078143262648, "grad_norm": 1.1875978947893053, "learning_rate": 8.620689655172414e-06, "loss": 0.0855, "step": 775 }, { "epoch": 0.2591417598931374, "grad_norm": 0.744997675331295, "learning_rate": 8.631813125695217e-06, "loss": 0.0633, "step": 776 }, { "epoch": 0.25947570546001003, "grad_norm": 0.7152958982591525, "learning_rate": 8.642936596218021e-06, "loss": 0.0547, "step": 777 }, { "epoch": 0.25980965102688264, "grad_norm": 0.6618361809301214, "learning_rate": 8.654060066740824e-06, "loss": 0.053, "step": 778 }, { "epoch": 0.26014359659375524, "grad_norm": 0.7450567717031901, "learning_rate": 8.665183537263627e-06, "loss": 0.0737, "step": 779 }, { "epoch": 0.2604775421606278, "grad_norm": 0.7678230051735238, "learning_rate": 8.67630700778643e-06, "loss": 0.0713, "step": 780 }, { "epoch": 0.2608114877275004, "grad_norm": 0.7930621080238116, "learning_rate": 8.687430478309234e-06, "loss": 0.0632, "step": 781 }, { "epoch": 0.261145433294373, "grad_norm": 0.779443098676805, "learning_rate": 8.698553948832036e-06, "loss": 0.0708, "step": 782 }, { "epoch": 0.2614793788612456, "grad_norm": 0.7710866735575862, "learning_rate": 8.70967741935484e-06, "loss": 0.0629, "step": 783 }, { "epoch": 0.2618133244281182, "grad_norm": 0.5774069846802348, "learning_rate": 8.720800889877644e-06, "loss": 0.0502, "step": 784 }, { "epoch": 0.26214726999499083, "grad_norm": 1.020267111033492, "learning_rate": 8.731924360400446e-06, "loss": 0.0881, "step": 785 }, { "epoch": 0.26248121556186343, "grad_norm": 0.6543360371229927, "learning_rate": 8.743047830923249e-06, "loss": 0.0588, "step": 786 }, { "epoch": 0.26281516112873604, "grad_norm": 1.2169325544853526, "learning_rate": 8.754171301446052e-06, "loss": 0.0568, "step": 787 }, { "epoch": 0.2631491066956086, "grad_norm": 0.7084276003060112, "learning_rate": 8.765294771968854e-06, "loss": 0.0481, "step": 788 }, { "epoch": 0.2634830522624812, "grad_norm": 0.885044828105131, "learning_rate": 8.776418242491657e-06, "loss": 0.0493, "step": 789 }, { "epoch": 0.2638169978293538, "grad_norm": 0.7881657725641322, "learning_rate": 8.787541713014462e-06, "loss": 0.0676, "step": 790 }, { "epoch": 0.2641509433962264, "grad_norm": 0.6772140173044136, "learning_rate": 8.798665183537264e-06, "loss": 0.0823, "step": 791 }, { "epoch": 0.264484888963099, "grad_norm": 0.6563446607895745, "learning_rate": 8.809788654060067e-06, "loss": 0.0508, "step": 792 }, { "epoch": 0.2648188345299716, "grad_norm": 0.7443612991045175, "learning_rate": 8.820912124582871e-06, "loss": 0.078, "step": 793 }, { "epoch": 0.26515278009684423, "grad_norm": 0.8183742978778376, "learning_rate": 8.832035595105674e-06, "loss": 0.0539, "step": 794 }, { "epoch": 0.26548672566371684, "grad_norm": 0.6131052919596894, "learning_rate": 8.843159065628477e-06, "loss": 0.0519, "step": 795 }, { "epoch": 0.2658206712305894, "grad_norm": 0.6144509102084704, "learning_rate": 8.85428253615128e-06, "loss": 0.0546, "step": 796 }, { "epoch": 0.266154616797462, "grad_norm": 0.49063270153285354, "learning_rate": 8.865406006674084e-06, "loss": 0.0326, "step": 797 }, { "epoch": 0.2664885623643346, "grad_norm": 1.0582370293005623, "learning_rate": 8.876529477196885e-06, "loss": 0.0428, "step": 798 }, { "epoch": 0.2668225079312072, "grad_norm": 0.9108709900656993, "learning_rate": 8.88765294771969e-06, "loss": 0.0628, "step": 799 }, { "epoch": 0.2671564534980798, "grad_norm": 0.6789734849125307, "learning_rate": 8.898776418242492e-06, "loss": 0.0458, "step": 800 }, { "epoch": 0.2674903990649524, "grad_norm": 0.728760335901269, "learning_rate": 8.909899888765295e-06, "loss": 0.065, "step": 801 }, { "epoch": 0.26782434463182503, "grad_norm": 0.6239280771541308, "learning_rate": 8.921023359288099e-06, "loss": 0.0684, "step": 802 }, { "epoch": 0.26815829019869764, "grad_norm": 0.7892196073901578, "learning_rate": 8.932146829810902e-06, "loss": 0.0546, "step": 803 }, { "epoch": 0.2684922357655702, "grad_norm": 0.7022611679612035, "learning_rate": 8.943270300333705e-06, "loss": 0.0579, "step": 804 }, { "epoch": 0.2688261813324428, "grad_norm": 0.533124691592435, "learning_rate": 8.954393770856507e-06, "loss": 0.046, "step": 805 }, { "epoch": 0.2691601268993154, "grad_norm": 0.6878117959147262, "learning_rate": 8.965517241379312e-06, "loss": 0.0716, "step": 806 }, { "epoch": 0.269494072466188, "grad_norm": 0.5774095121132162, "learning_rate": 8.976640711902114e-06, "loss": 0.0442, "step": 807 }, { "epoch": 0.2698280180330606, "grad_norm": 0.5836306912559, "learning_rate": 8.987764182424917e-06, "loss": 0.0502, "step": 808 }, { "epoch": 0.2701619635999332, "grad_norm": 0.6607292526260694, "learning_rate": 8.998887652947721e-06, "loss": 0.0601, "step": 809 }, { "epoch": 0.2704959091668058, "grad_norm": 0.6557024370885054, "learning_rate": 9.010011123470524e-06, "loss": 0.0625, "step": 810 }, { "epoch": 0.27082985473367843, "grad_norm": 0.7146971542868865, "learning_rate": 9.021134593993327e-06, "loss": 0.0534, "step": 811 }, { "epoch": 0.271163800300551, "grad_norm": 0.6438970374293348, "learning_rate": 9.03225806451613e-06, "loss": 0.0638, "step": 812 }, { "epoch": 0.2714977458674236, "grad_norm": 0.5829254200611607, "learning_rate": 9.043381535038932e-06, "loss": 0.0472, "step": 813 }, { "epoch": 0.2718316914342962, "grad_norm": 0.654413773007462, "learning_rate": 9.054505005561735e-06, "loss": 0.0601, "step": 814 }, { "epoch": 0.2721656370011688, "grad_norm": 0.5106614833580086, "learning_rate": 9.06562847608454e-06, "loss": 0.0465, "step": 815 }, { "epoch": 0.2724995825680414, "grad_norm": 0.597727488309152, "learning_rate": 9.076751946607342e-06, "loss": 0.0498, "step": 816 }, { "epoch": 0.272833528134914, "grad_norm": 0.7658624569422692, "learning_rate": 9.087875417130145e-06, "loss": 0.0818, "step": 817 }, { "epoch": 0.2731674737017866, "grad_norm": 0.6226947658116192, "learning_rate": 9.09899888765295e-06, "loss": 0.049, "step": 818 }, { "epoch": 0.27350141926865923, "grad_norm": 0.7874840061595467, "learning_rate": 9.110122358175752e-06, "loss": 0.0529, "step": 819 }, { "epoch": 0.27383536483553184, "grad_norm": 0.7057495392935673, "learning_rate": 9.121245828698555e-06, "loss": 0.0655, "step": 820 }, { "epoch": 0.2741693104024044, "grad_norm": 0.7556130731717619, "learning_rate": 9.132369299221357e-06, "loss": 0.0612, "step": 821 }, { "epoch": 0.274503255969277, "grad_norm": 0.709870989496386, "learning_rate": 9.143492769744162e-06, "loss": 0.0536, "step": 822 }, { "epoch": 0.2748372015361496, "grad_norm": 0.43499101929326484, "learning_rate": 9.154616240266963e-06, "loss": 0.0393, "step": 823 }, { "epoch": 0.2751711471030222, "grad_norm": 0.6961537521945201, "learning_rate": 9.165739710789767e-06, "loss": 0.0557, "step": 824 }, { "epoch": 0.2755050926698948, "grad_norm": 0.7553143434917614, "learning_rate": 9.176863181312572e-06, "loss": 0.0663, "step": 825 }, { "epoch": 0.2758390382367674, "grad_norm": 0.5720851308592011, "learning_rate": 9.187986651835373e-06, "loss": 0.0523, "step": 826 }, { "epoch": 0.27617298380364, "grad_norm": 0.5270648394655818, "learning_rate": 9.199110122358177e-06, "loss": 0.0455, "step": 827 }, { "epoch": 0.27650692937051263, "grad_norm": 0.5470786945731327, "learning_rate": 9.21023359288098e-06, "loss": 0.0383, "step": 828 }, { "epoch": 0.2768408749373852, "grad_norm": 0.5530166964220198, "learning_rate": 9.221357063403782e-06, "loss": 0.0539, "step": 829 }, { "epoch": 0.2771748205042578, "grad_norm": 0.7434155435704276, "learning_rate": 9.232480533926585e-06, "loss": 0.0494, "step": 830 }, { "epoch": 0.2775087660711304, "grad_norm": 0.5809199893108594, "learning_rate": 9.24360400444939e-06, "loss": 0.0527, "step": 831 }, { "epoch": 0.277842711638003, "grad_norm": 0.7209529033280945, "learning_rate": 9.254727474972192e-06, "loss": 0.0537, "step": 832 }, { "epoch": 0.2781766572048756, "grad_norm": 0.5102243838580376, "learning_rate": 9.265850945494995e-06, "loss": 0.0332, "step": 833 }, { "epoch": 0.2785106027717482, "grad_norm": 0.6551621469992068, "learning_rate": 9.2769744160178e-06, "loss": 0.0496, "step": 834 }, { "epoch": 0.2788445483386208, "grad_norm": 0.6155475477030943, "learning_rate": 9.288097886540602e-06, "loss": 0.0519, "step": 835 }, { "epoch": 0.27917849390549343, "grad_norm": 0.749997680109745, "learning_rate": 9.299221357063405e-06, "loss": 0.0609, "step": 836 }, { "epoch": 0.279512439472366, "grad_norm": 0.5426171336986118, "learning_rate": 9.310344827586207e-06, "loss": 0.058, "step": 837 }, { "epoch": 0.2798463850392386, "grad_norm": 0.5920624994442251, "learning_rate": 9.32146829810901e-06, "loss": 0.0444, "step": 838 }, { "epoch": 0.2801803306061112, "grad_norm": 0.538004372440245, "learning_rate": 9.332591768631813e-06, "loss": 0.0459, "step": 839 }, { "epoch": 0.2805142761729838, "grad_norm": 0.5517405979478516, "learning_rate": 9.343715239154617e-06, "loss": 0.0482, "step": 840 }, { "epoch": 0.2808482217398564, "grad_norm": 0.7691300801734562, "learning_rate": 9.35483870967742e-06, "loss": 0.0467, "step": 841 }, { "epoch": 0.281182167306729, "grad_norm": 0.5454206931132759, "learning_rate": 9.365962180200223e-06, "loss": 0.0462, "step": 842 }, { "epoch": 0.2815161128736016, "grad_norm": 0.6584147344642964, "learning_rate": 9.377085650723027e-06, "loss": 0.0532, "step": 843 }, { "epoch": 0.2818500584404742, "grad_norm": 0.7540716528017773, "learning_rate": 9.38820912124583e-06, "loss": 0.0533, "step": 844 }, { "epoch": 0.2821840040073468, "grad_norm": 0.6441723405492215, "learning_rate": 9.399332591768633e-06, "loss": 0.0572, "step": 845 }, { "epoch": 0.2825179495742194, "grad_norm": 0.6245762842213952, "learning_rate": 9.410456062291435e-06, "loss": 0.0441, "step": 846 }, { "epoch": 0.282851895141092, "grad_norm": 0.6946159964808465, "learning_rate": 9.42157953281424e-06, "loss": 0.0468, "step": 847 }, { "epoch": 0.2831858407079646, "grad_norm": 1.06780412126692, "learning_rate": 9.43270300333704e-06, "loss": 0.0714, "step": 848 }, { "epoch": 0.2835197862748372, "grad_norm": 0.5119991575843846, "learning_rate": 9.443826473859845e-06, "loss": 0.0441, "step": 849 }, { "epoch": 0.2838537318417098, "grad_norm": 0.5935557569737968, "learning_rate": 9.45494994438265e-06, "loss": 0.046, "step": 850 }, { "epoch": 0.2841876774085824, "grad_norm": 0.7699566957350499, "learning_rate": 9.46607341490545e-06, "loss": 0.0569, "step": 851 }, { "epoch": 0.284521622975455, "grad_norm": 0.5323645354386496, "learning_rate": 9.477196885428255e-06, "loss": 0.0562, "step": 852 }, { "epoch": 0.2848555685423276, "grad_norm": 0.6008477104164092, "learning_rate": 9.488320355951058e-06, "loss": 0.0502, "step": 853 }, { "epoch": 0.2851895141092002, "grad_norm": 0.5769074354525283, "learning_rate": 9.49944382647386e-06, "loss": 0.0443, "step": 854 }, { "epoch": 0.2855234596760728, "grad_norm": 0.6612659226019639, "learning_rate": 9.510567296996663e-06, "loss": 0.0401, "step": 855 }, { "epoch": 0.2858574052429454, "grad_norm": 0.5255707752772606, "learning_rate": 9.521690767519467e-06, "loss": 0.0422, "step": 856 }, { "epoch": 0.286191350809818, "grad_norm": 0.5689158347471137, "learning_rate": 9.53281423804227e-06, "loss": 0.0538, "step": 857 }, { "epoch": 0.2865252963766906, "grad_norm": 0.7181350237218456, "learning_rate": 9.543937708565073e-06, "loss": 0.0509, "step": 858 }, { "epoch": 0.2868592419435632, "grad_norm": 0.7983998789202897, "learning_rate": 9.555061179087877e-06, "loss": 0.0468, "step": 859 }, { "epoch": 0.2871931875104358, "grad_norm": 0.6218813290653613, "learning_rate": 9.56618464961068e-06, "loss": 0.0496, "step": 860 }, { "epoch": 0.28752713307730837, "grad_norm": 0.5965225546980025, "learning_rate": 9.577308120133483e-06, "loss": 0.0487, "step": 861 }, { "epoch": 0.287861078644181, "grad_norm": 0.4985642306082885, "learning_rate": 9.588431590656285e-06, "loss": 0.0593, "step": 862 }, { "epoch": 0.2881950242110536, "grad_norm": 1.2787183095841748, "learning_rate": 9.599555061179088e-06, "loss": 0.0612, "step": 863 }, { "epoch": 0.2885289697779262, "grad_norm": 1.2982367757349111, "learning_rate": 9.61067853170189e-06, "loss": 0.0607, "step": 864 }, { "epoch": 0.2888629153447988, "grad_norm": 0.678857658457712, "learning_rate": 9.621802002224695e-06, "loss": 0.0692, "step": 865 }, { "epoch": 0.2891968609116714, "grad_norm": 0.6716018674150093, "learning_rate": 9.632925472747498e-06, "loss": 0.0457, "step": 866 }, { "epoch": 0.289530806478544, "grad_norm": 0.3721700132825576, "learning_rate": 9.6440489432703e-06, "loss": 0.0319, "step": 867 }, { "epoch": 0.2898647520454166, "grad_norm": 1.1152820078290502, "learning_rate": 9.655172413793105e-06, "loss": 0.0831, "step": 868 }, { "epoch": 0.29019869761228917, "grad_norm": 0.7166835764536688, "learning_rate": 9.666295884315908e-06, "loss": 0.0592, "step": 869 }, { "epoch": 0.2905326431791618, "grad_norm": 0.5634273378329758, "learning_rate": 9.67741935483871e-06, "loss": 0.049, "step": 870 }, { "epoch": 0.2908665887460344, "grad_norm": 0.9374386237892269, "learning_rate": 9.688542825361513e-06, "loss": 0.0465, "step": 871 }, { "epoch": 0.291200534312907, "grad_norm": 0.8652631794306441, "learning_rate": 9.699666295884318e-06, "loss": 0.0565, "step": 872 }, { "epoch": 0.2915344798797796, "grad_norm": 0.5502052384336255, "learning_rate": 9.710789766407119e-06, "loss": 0.0436, "step": 873 }, { "epoch": 0.2918684254466522, "grad_norm": 0.8517087684298448, "learning_rate": 9.721913236929923e-06, "loss": 0.0528, "step": 874 }, { "epoch": 0.2922023710135248, "grad_norm": 0.6982728444124906, "learning_rate": 9.733036707452727e-06, "loss": 0.0568, "step": 875 }, { "epoch": 0.2925363165803974, "grad_norm": 0.5480759138372429, "learning_rate": 9.744160177975528e-06, "loss": 0.0506, "step": 876 }, { "epoch": 0.29287026214727, "grad_norm": 0.7227485062019516, "learning_rate": 9.755283648498333e-06, "loss": 0.0467, "step": 877 }, { "epoch": 0.2932042077141426, "grad_norm": 1.2801312311944717, "learning_rate": 9.766407119021135e-06, "loss": 0.0587, "step": 878 }, { "epoch": 0.2935381532810152, "grad_norm": 0.6922723872684086, "learning_rate": 9.777530589543938e-06, "loss": 0.0685, "step": 879 }, { "epoch": 0.2938720988478878, "grad_norm": 0.5224930160013006, "learning_rate": 9.788654060066741e-06, "loss": 0.0517, "step": 880 }, { "epoch": 0.2942060444147604, "grad_norm": 1.3256100303559288, "learning_rate": 9.799777530589545e-06, "loss": 0.0634, "step": 881 }, { "epoch": 0.294539989981633, "grad_norm": 0.7626222770236885, "learning_rate": 9.810901001112348e-06, "loss": 0.0699, "step": 882 }, { "epoch": 0.2948739355485056, "grad_norm": 0.6445500365350331, "learning_rate": 9.82202447163515e-06, "loss": 0.0567, "step": 883 }, { "epoch": 0.2952078811153782, "grad_norm": 0.7133948826260625, "learning_rate": 9.833147942157955e-06, "loss": 0.0656, "step": 884 }, { "epoch": 0.2955418266822508, "grad_norm": 0.7293006193055029, "learning_rate": 9.844271412680758e-06, "loss": 0.0682, "step": 885 }, { "epoch": 0.29587577224912337, "grad_norm": 0.723469040579579, "learning_rate": 9.85539488320356e-06, "loss": 0.0557, "step": 886 }, { "epoch": 0.296209717815996, "grad_norm": 0.9182207199919388, "learning_rate": 9.866518353726363e-06, "loss": 0.0757, "step": 887 }, { "epoch": 0.2965436633828686, "grad_norm": 0.8696259837392362, "learning_rate": 9.877641824249166e-06, "loss": 0.0544, "step": 888 }, { "epoch": 0.2968776089497412, "grad_norm": 0.5859047475602415, "learning_rate": 9.888765294771969e-06, "loss": 0.0426, "step": 889 }, { "epoch": 0.2972115545166138, "grad_norm": 0.8098551825765264, "learning_rate": 9.899888765294773e-06, "loss": 0.0545, "step": 890 }, { "epoch": 0.2975455000834864, "grad_norm": 0.7610901027395703, "learning_rate": 9.911012235817576e-06, "loss": 0.0508, "step": 891 }, { "epoch": 0.297879445650359, "grad_norm": 0.7568651557259278, "learning_rate": 9.922135706340378e-06, "loss": 0.0673, "step": 892 }, { "epoch": 0.2982133912172316, "grad_norm": 1.1494689009052674, "learning_rate": 9.933259176863183e-06, "loss": 0.0579, "step": 893 }, { "epoch": 0.29854733678410417, "grad_norm": 0.5819263992119622, "learning_rate": 9.944382647385986e-06, "loss": 0.0589, "step": 894 }, { "epoch": 0.2988812823509768, "grad_norm": 0.6959715412341217, "learning_rate": 9.955506117908788e-06, "loss": 0.0636, "step": 895 }, { "epoch": 0.2992152279178494, "grad_norm": 0.6561594765200578, "learning_rate": 9.966629588431591e-06, "loss": 0.062, "step": 896 }, { "epoch": 0.299549173484722, "grad_norm": 0.771047397124875, "learning_rate": 9.977753058954395e-06, "loss": 0.0512, "step": 897 }, { "epoch": 0.2998831190515946, "grad_norm": 0.7812633820758519, "learning_rate": 9.988876529477196e-06, "loss": 0.0589, "step": 898 }, { "epoch": 0.3002170646184672, "grad_norm": 0.6414790783874095, "learning_rate": 1e-05, "loss": 0.0529, "step": 899 }, { "epoch": 0.3005510101853398, "grad_norm": 0.9018861790671471, "learning_rate": 9.999999622345564e-06, "loss": 0.0629, "step": 900 }, { "epoch": 0.3008849557522124, "grad_norm": 1.1555100050578804, "learning_rate": 9.999998489382312e-06, "loss": 0.058, "step": 901 }, { "epoch": 0.30121890131908496, "grad_norm": 0.5315076215327028, "learning_rate": 9.999996601110414e-06, "loss": 0.0444, "step": 902 }, { "epoch": 0.30155284688595757, "grad_norm": 1.1152517950303242, "learning_rate": 9.999993957530157e-06, "loss": 0.0614, "step": 903 }, { "epoch": 0.3018867924528302, "grad_norm": 0.6956334431936639, "learning_rate": 9.999990558641939e-06, "loss": 0.0578, "step": 904 }, { "epoch": 0.3022207380197028, "grad_norm": 0.6092546044991591, "learning_rate": 9.999986404446276e-06, "loss": 0.0533, "step": 905 }, { "epoch": 0.3025546835865754, "grad_norm": 0.9717116959813895, "learning_rate": 9.999981494943791e-06, "loss": 0.0716, "step": 906 }, { "epoch": 0.302888629153448, "grad_norm": 0.9659324392329879, "learning_rate": 9.99997583013523e-06, "loss": 0.065, "step": 907 }, { "epoch": 0.3032225747203206, "grad_norm": 0.40111083516509344, "learning_rate": 9.999969410021447e-06, "loss": 0.0425, "step": 908 }, { "epoch": 0.3035565202871932, "grad_norm": 0.9630082239232272, "learning_rate": 9.999962234603412e-06, "loss": 0.0653, "step": 909 }, { "epoch": 0.30389046585406576, "grad_norm": 0.6032894259625411, "learning_rate": 9.99995430388221e-06, "loss": 0.0411, "step": 910 }, { "epoch": 0.30422441142093837, "grad_norm": 0.6243752152697278, "learning_rate": 9.999945617859034e-06, "loss": 0.0523, "step": 911 }, { "epoch": 0.304558356987811, "grad_norm": 0.6661838900721574, "learning_rate": 9.999936176535203e-06, "loss": 0.0559, "step": 912 }, { "epoch": 0.3048923025546836, "grad_norm": 0.7847123440405521, "learning_rate": 9.99992597991214e-06, "loss": 0.0547, "step": 913 }, { "epoch": 0.3052262481215562, "grad_norm": 0.6034643612676034, "learning_rate": 9.999915027991384e-06, "loss": 0.0458, "step": 914 }, { "epoch": 0.3055601936884288, "grad_norm": 0.6373222518679178, "learning_rate": 9.999903320774593e-06, "loss": 0.0436, "step": 915 }, { "epoch": 0.3058941392553014, "grad_norm": 0.7836447446906802, "learning_rate": 9.999890858263532e-06, "loss": 0.051, "step": 916 }, { "epoch": 0.306228084822174, "grad_norm": 0.6508583114723121, "learning_rate": 9.999877640460085e-06, "loss": 0.0473, "step": 917 }, { "epoch": 0.30656203038904656, "grad_norm": 0.5706335931118961, "learning_rate": 9.999863667366249e-06, "loss": 0.0612, "step": 918 }, { "epoch": 0.30689597595591916, "grad_norm": 0.6946210169872966, "learning_rate": 9.999848938984135e-06, "loss": 0.0472, "step": 919 }, { "epoch": 0.30722992152279177, "grad_norm": 0.5162847660972449, "learning_rate": 9.999833455315966e-06, "loss": 0.0567, "step": 920 }, { "epoch": 0.3075638670896644, "grad_norm": 0.6282724513719338, "learning_rate": 9.999817216364085e-06, "loss": 0.0586, "step": 921 }, { "epoch": 0.307897812656537, "grad_norm": 0.5970451629512815, "learning_rate": 9.99980022213094e-06, "loss": 0.0541, "step": 922 }, { "epoch": 0.3082317582234096, "grad_norm": 0.6049044469788369, "learning_rate": 9.999782472619102e-06, "loss": 0.0494, "step": 923 }, { "epoch": 0.3085657037902822, "grad_norm": 0.6449474401906675, "learning_rate": 9.99976396783125e-06, "loss": 0.0616, "step": 924 }, { "epoch": 0.3088996493571548, "grad_norm": 0.8027233078599899, "learning_rate": 9.999744707770182e-06, "loss": 0.0441, "step": 925 }, { "epoch": 0.3092335949240274, "grad_norm": 0.8880518704433723, "learning_rate": 9.999724692438805e-06, "loss": 0.0791, "step": 926 }, { "epoch": 0.30956754049089996, "grad_norm": 0.5138081603128517, "learning_rate": 9.999703921840143e-06, "loss": 0.0535, "step": 927 }, { "epoch": 0.30990148605777257, "grad_norm": 0.6603602172738352, "learning_rate": 9.999682395977334e-06, "loss": 0.0551, "step": 928 }, { "epoch": 0.3102354316246452, "grad_norm": 0.6890303648289062, "learning_rate": 9.999660114853631e-06, "loss": 0.0482, "step": 929 }, { "epoch": 0.3105693771915178, "grad_norm": 0.43150917486352325, "learning_rate": 9.999637078472398e-06, "loss": 0.039, "step": 930 }, { "epoch": 0.3109033227583904, "grad_norm": 0.45827832678573877, "learning_rate": 9.999613286837115e-06, "loss": 0.0352, "step": 931 }, { "epoch": 0.311237268325263, "grad_norm": 0.6168319762636703, "learning_rate": 9.999588739951376e-06, "loss": 0.0588, "step": 932 }, { "epoch": 0.3115712138921356, "grad_norm": 0.6007064538712662, "learning_rate": 9.99956343781889e-06, "loss": 0.067, "step": 933 }, { "epoch": 0.3119051594590082, "grad_norm": 0.6869659934736515, "learning_rate": 9.999537380443479e-06, "loss": 0.0548, "step": 934 }, { "epoch": 0.31223910502588076, "grad_norm": 0.6822343644574054, "learning_rate": 9.999510567829079e-06, "loss": 0.0456, "step": 935 }, { "epoch": 0.31257305059275337, "grad_norm": 0.6001577713535029, "learning_rate": 9.999482999979739e-06, "loss": 0.0591, "step": 936 }, { "epoch": 0.31290699615962597, "grad_norm": 0.8075508966129092, "learning_rate": 9.999454676899628e-06, "loss": 0.0573, "step": 937 }, { "epoch": 0.3132409417264986, "grad_norm": 0.9578024765896318, "learning_rate": 9.999425598593018e-06, "loss": 0.0689, "step": 938 }, { "epoch": 0.3135748872933712, "grad_norm": 0.6553702398957317, "learning_rate": 9.999395765064308e-06, "loss": 0.0749, "step": 939 }, { "epoch": 0.3139088328602438, "grad_norm": 0.5845552152591339, "learning_rate": 9.999365176318e-06, "loss": 0.0464, "step": 940 }, { "epoch": 0.3142427784271164, "grad_norm": 0.8267163038157918, "learning_rate": 9.999333832358716e-06, "loss": 0.062, "step": 941 }, { "epoch": 0.314576723993989, "grad_norm": 0.6776387963285405, "learning_rate": 9.999301733191193e-06, "loss": 0.0404, "step": 942 }, { "epoch": 0.31491066956086156, "grad_norm": 0.954616904867087, "learning_rate": 9.999268878820278e-06, "loss": 0.0676, "step": 943 }, { "epoch": 0.31524461512773416, "grad_norm": 0.8261235198634368, "learning_rate": 9.999235269250933e-06, "loss": 0.0496, "step": 944 }, { "epoch": 0.31557856069460677, "grad_norm": 0.4354925203700167, "learning_rate": 9.999200904488238e-06, "loss": 0.0453, "step": 945 }, { "epoch": 0.3159125062614794, "grad_norm": 0.6393409713170454, "learning_rate": 9.999165784537381e-06, "loss": 0.0562, "step": 946 }, { "epoch": 0.316246451828352, "grad_norm": 0.5051127870569156, "learning_rate": 9.999129909403671e-06, "loss": 0.0414, "step": 947 }, { "epoch": 0.3165803973952246, "grad_norm": 0.6830945915913949, "learning_rate": 9.999093279092524e-06, "loss": 0.0657, "step": 948 }, { "epoch": 0.3169143429620972, "grad_norm": 0.7493036064245449, "learning_rate": 9.999055893609475e-06, "loss": 0.0685, "step": 949 }, { "epoch": 0.3172482885289698, "grad_norm": 0.6573495934445489, "learning_rate": 9.999017752960172e-06, "loss": 0.0541, "step": 950 }, { "epoch": 0.31758223409584235, "grad_norm": 0.8220607798846838, "learning_rate": 9.998978857150375e-06, "loss": 0.0772, "step": 951 }, { "epoch": 0.31791617966271496, "grad_norm": 0.6051049278520222, "learning_rate": 9.99893920618596e-06, "loss": 0.0548, "step": 952 }, { "epoch": 0.31825012522958757, "grad_norm": 0.8396686530912078, "learning_rate": 9.998898800072919e-06, "loss": 0.0559, "step": 953 }, { "epoch": 0.3185840707964602, "grad_norm": 0.5138888258715585, "learning_rate": 9.998857638817354e-06, "loss": 0.0408, "step": 954 }, { "epoch": 0.3189180163633328, "grad_norm": 0.5930870657595226, "learning_rate": 9.99881572242548e-06, "loss": 0.0553, "step": 955 }, { "epoch": 0.3192519619302054, "grad_norm": 0.5739338240819899, "learning_rate": 9.998773050903637e-06, "loss": 0.0451, "step": 956 }, { "epoch": 0.319585907497078, "grad_norm": 0.6488602357523269, "learning_rate": 9.998729624258262e-06, "loss": 0.0547, "step": 957 }, { "epoch": 0.3199198530639506, "grad_norm": 0.4766136004536333, "learning_rate": 9.998685442495921e-06, "loss": 0.0545, "step": 958 }, { "epoch": 0.32025379863082315, "grad_norm": 0.6496173450622319, "learning_rate": 9.998640505623284e-06, "loss": 0.0527, "step": 959 }, { "epoch": 0.32058774419769576, "grad_norm": 0.7607961932674538, "learning_rate": 9.998594813647145e-06, "loss": 0.0495, "step": 960 }, { "epoch": 0.32092168976456836, "grad_norm": 0.551714885960616, "learning_rate": 9.998548366574401e-06, "loss": 0.0506, "step": 961 }, { "epoch": 0.32125563533144097, "grad_norm": 0.44257355961637446, "learning_rate": 9.99850116441207e-06, "loss": 0.0409, "step": 962 }, { "epoch": 0.3215895808983136, "grad_norm": 0.5235000652023187, "learning_rate": 9.998453207167282e-06, "loss": 0.0664, "step": 963 }, { "epoch": 0.3219235264651862, "grad_norm": 0.597540827350683, "learning_rate": 9.998404494847285e-06, "loss": 0.0553, "step": 964 }, { "epoch": 0.3222574720320588, "grad_norm": 0.49350745999906936, "learning_rate": 9.998355027459432e-06, "loss": 0.057, "step": 965 }, { "epoch": 0.3225914175989314, "grad_norm": 0.5319476769802207, "learning_rate": 9.998304805011199e-06, "loss": 0.0516, "step": 966 }, { "epoch": 0.32292536316580395, "grad_norm": 0.6810796573105793, "learning_rate": 9.998253827510173e-06, "loss": 0.0636, "step": 967 }, { "epoch": 0.32325930873267655, "grad_norm": 0.8578957548648788, "learning_rate": 9.998202094964053e-06, "loss": 0.0741, "step": 968 }, { "epoch": 0.32359325429954916, "grad_norm": 0.685083143230946, "learning_rate": 9.998149607380654e-06, "loss": 0.0414, "step": 969 }, { "epoch": 0.32392719986642177, "grad_norm": 0.5894900776993433, "learning_rate": 9.998096364767906e-06, "loss": 0.0545, "step": 970 }, { "epoch": 0.3242611454332944, "grad_norm": 0.768268478439697, "learning_rate": 9.998042367133854e-06, "loss": 0.0616, "step": 971 }, { "epoch": 0.324595091000167, "grad_norm": 0.5999522250409731, "learning_rate": 9.997987614486648e-06, "loss": 0.0404, "step": 972 }, { "epoch": 0.3249290365670396, "grad_norm": 0.5388683391884815, "learning_rate": 9.997932106834567e-06, "loss": 0.0387, "step": 973 }, { "epoch": 0.3252629821339122, "grad_norm": 0.7358432151948985, "learning_rate": 9.997875844185991e-06, "loss": 0.0679, "step": 974 }, { "epoch": 0.3255969277007848, "grad_norm": 0.7408974822801317, "learning_rate": 9.99781882654942e-06, "loss": 0.0512, "step": 975 }, { "epoch": 0.32593087326765735, "grad_norm": 0.46737257421637174, "learning_rate": 9.997761053933469e-06, "loss": 0.0501, "step": 976 }, { "epoch": 0.32626481883452996, "grad_norm": 0.49551151355877743, "learning_rate": 9.997702526346864e-06, "loss": 0.0511, "step": 977 }, { "epoch": 0.32659876440140256, "grad_norm": 0.7232364196431961, "learning_rate": 9.997643243798446e-06, "loss": 0.06, "step": 978 }, { "epoch": 0.32693270996827517, "grad_norm": 0.4266653292183982, "learning_rate": 9.99758320629717e-06, "loss": 0.0358, "step": 979 }, { "epoch": 0.3272666555351478, "grad_norm": 0.49875757240014273, "learning_rate": 9.997522413852108e-06, "loss": 0.0539, "step": 980 }, { "epoch": 0.3276006011020204, "grad_norm": 0.43873085951347823, "learning_rate": 9.997460866472439e-06, "loss": 0.0531, "step": 981 }, { "epoch": 0.327934546668893, "grad_norm": 0.8212265247081236, "learning_rate": 9.997398564167465e-06, "loss": 0.0638, "step": 982 }, { "epoch": 0.3282684922357656, "grad_norm": 0.5360694151874814, "learning_rate": 9.997335506946596e-06, "loss": 0.047, "step": 983 }, { "epoch": 0.32860243780263815, "grad_norm": 0.7864436246170377, "learning_rate": 9.997271694819354e-06, "loss": 0.055, "step": 984 }, { "epoch": 0.32893638336951075, "grad_norm": 0.6060787744687433, "learning_rate": 9.997207127795383e-06, "loss": 0.0418, "step": 985 }, { "epoch": 0.32927032893638336, "grad_norm": 0.5567811944339895, "learning_rate": 9.997141805884436e-06, "loss": 0.0408, "step": 986 }, { "epoch": 0.32960427450325597, "grad_norm": 0.6134680091972111, "learning_rate": 9.997075729096379e-06, "loss": 0.0545, "step": 987 }, { "epoch": 0.3299382200701286, "grad_norm": 0.7391293100903685, "learning_rate": 9.997008897441194e-06, "loss": 0.0441, "step": 988 }, { "epoch": 0.3302721656370012, "grad_norm": 0.4972724534693914, "learning_rate": 9.996941310928978e-06, "loss": 0.0533, "step": 989 }, { "epoch": 0.3306061112038738, "grad_norm": 0.6167427767477403, "learning_rate": 9.99687296956994e-06, "loss": 0.049, "step": 990 }, { "epoch": 0.3309400567707464, "grad_norm": 0.5005496813016653, "learning_rate": 9.996803873374402e-06, "loss": 0.0314, "step": 991 }, { "epoch": 0.33127400233761894, "grad_norm": 0.6552605175128834, "learning_rate": 9.996734022352805e-06, "loss": 0.0582, "step": 992 }, { "epoch": 0.33160794790449155, "grad_norm": 0.4989430154132851, "learning_rate": 9.9966634165157e-06, "loss": 0.046, "step": 993 }, { "epoch": 0.33194189347136416, "grad_norm": 0.5191323578296947, "learning_rate": 9.99659205587375e-06, "loss": 0.0502, "step": 994 }, { "epoch": 0.33227583903823676, "grad_norm": 0.8142715454027585, "learning_rate": 9.996519940437737e-06, "loss": 0.0715, "step": 995 }, { "epoch": 0.33260978460510937, "grad_norm": 0.4443342477966491, "learning_rate": 9.996447070218557e-06, "loss": 0.0564, "step": 996 }, { "epoch": 0.332943730171982, "grad_norm": 0.7482918825922684, "learning_rate": 9.996373445227215e-06, "loss": 0.0455, "step": 997 }, { "epoch": 0.3332776757388546, "grad_norm": 0.6779946308046043, "learning_rate": 9.996299065474832e-06, "loss": 0.039, "step": 998 }, { "epoch": 0.3336116213057272, "grad_norm": 0.7044409037727745, "learning_rate": 9.996223930972649e-06, "loss": 0.0564, "step": 999 }, { "epoch": 0.33394556687259974, "grad_norm": 0.5201950603047609, "learning_rate": 9.99614804173201e-06, "loss": 0.0379, "step": 1000 }, { "epoch": 0.33427951243947235, "grad_norm": 0.5919922786821062, "learning_rate": 9.996071397764381e-06, "loss": 0.0518, "step": 1001 }, { "epoch": 0.33461345800634495, "grad_norm": 0.4468692713433013, "learning_rate": 9.995993999081343e-06, "loss": 0.0415, "step": 1002 }, { "epoch": 0.33494740357321756, "grad_norm": 0.5228157333947675, "learning_rate": 9.995915845694584e-06, "loss": 0.0443, "step": 1003 }, { "epoch": 0.33528134914009017, "grad_norm": 0.802915398850966, "learning_rate": 9.995836937615913e-06, "loss": 0.0471, "step": 1004 }, { "epoch": 0.3356152947069628, "grad_norm": 0.4554512215067062, "learning_rate": 9.995757274857246e-06, "loss": 0.0472, "step": 1005 }, { "epoch": 0.3359492402738354, "grad_norm": 0.5103336708517712, "learning_rate": 9.995676857430621e-06, "loss": 0.0544, "step": 1006 }, { "epoch": 0.336283185840708, "grad_norm": 0.5132698819214155, "learning_rate": 9.995595685348186e-06, "loss": 0.0564, "step": 1007 }, { "epoch": 0.33661713140758054, "grad_norm": 0.5511313978650463, "learning_rate": 9.995513758622198e-06, "loss": 0.061, "step": 1008 }, { "epoch": 0.33695107697445315, "grad_norm": 0.5019800042411315, "learning_rate": 9.995431077265038e-06, "loss": 0.0457, "step": 1009 }, { "epoch": 0.33728502254132575, "grad_norm": 0.5196839976674814, "learning_rate": 9.995347641289194e-06, "loss": 0.0453, "step": 1010 }, { "epoch": 0.33761896810819836, "grad_norm": 0.6156570747013106, "learning_rate": 9.995263450707273e-06, "loss": 0.0609, "step": 1011 }, { "epoch": 0.33795291367507097, "grad_norm": 0.5044945359037992, "learning_rate": 9.995178505531989e-06, "loss": 0.0403, "step": 1012 }, { "epoch": 0.33828685924194357, "grad_norm": 0.6239801495147239, "learning_rate": 9.995092805776175e-06, "loss": 0.0599, "step": 1013 }, { "epoch": 0.3386208048088162, "grad_norm": 0.855667110065577, "learning_rate": 9.995006351452775e-06, "loss": 0.0362, "step": 1014 }, { "epoch": 0.3389547503756888, "grad_norm": 0.5684075673561093, "learning_rate": 9.994919142574854e-06, "loss": 0.0363, "step": 1015 }, { "epoch": 0.33928869594256134, "grad_norm": 0.6532875524634826, "learning_rate": 9.994831179155584e-06, "loss": 0.0529, "step": 1016 }, { "epoch": 0.33962264150943394, "grad_norm": 0.7820987397590781, "learning_rate": 9.994742461208251e-06, "loss": 0.0592, "step": 1017 }, { "epoch": 0.33995658707630655, "grad_norm": 0.576914782518453, "learning_rate": 9.994652988746258e-06, "loss": 0.0433, "step": 1018 }, { "epoch": 0.34029053264317916, "grad_norm": 0.9155248759228477, "learning_rate": 9.994562761783122e-06, "loss": 0.0585, "step": 1019 }, { "epoch": 0.34062447821005176, "grad_norm": 0.49836935453655445, "learning_rate": 9.99447178033247e-06, "loss": 0.0466, "step": 1020 }, { "epoch": 0.34095842377692437, "grad_norm": 0.6610249361759681, "learning_rate": 9.99438004440805e-06, "loss": 0.0486, "step": 1021 }, { "epoch": 0.341292369343797, "grad_norm": 0.45444211329207995, "learning_rate": 9.994287554023717e-06, "loss": 0.0392, "step": 1022 }, { "epoch": 0.3416263149106696, "grad_norm": 0.5625728324533656, "learning_rate": 9.994194309193442e-06, "loss": 0.0408, "step": 1023 }, { "epoch": 0.3419602604775422, "grad_norm": 0.6195011121917613, "learning_rate": 9.99410030993131e-06, "loss": 0.0454, "step": 1024 }, { "epoch": 0.34229420604441474, "grad_norm": 0.4493699382996825, "learning_rate": 9.994005556251525e-06, "loss": 0.043, "step": 1025 }, { "epoch": 0.34262815161128735, "grad_norm": 0.5894717029644626, "learning_rate": 9.993910048168399e-06, "loss": 0.0406, "step": 1026 }, { "epoch": 0.34296209717815995, "grad_norm": 0.6486042487221031, "learning_rate": 9.993813785696355e-06, "loss": 0.0433, "step": 1027 }, { "epoch": 0.34329604274503256, "grad_norm": 0.5200330454917651, "learning_rate": 9.993716768849942e-06, "loss": 0.0394, "step": 1028 }, { "epoch": 0.34362998831190517, "grad_norm": 0.7015607884906295, "learning_rate": 9.99361899764381e-06, "loss": 0.0558, "step": 1029 }, { "epoch": 0.3439639338787778, "grad_norm": 0.8055226873973813, "learning_rate": 9.993520472092732e-06, "loss": 0.0544, "step": 1030 }, { "epoch": 0.3442978794456504, "grad_norm": 0.41330444801652355, "learning_rate": 9.99342119221159e-06, "loss": 0.04, "step": 1031 }, { "epoch": 0.344631825012523, "grad_norm": 0.48400055037656814, "learning_rate": 9.993321158015379e-06, "loss": 0.0349, "step": 1032 }, { "epoch": 0.34496577057939554, "grad_norm": 0.7469177579288511, "learning_rate": 9.993220369519215e-06, "loss": 0.0596, "step": 1033 }, { "epoch": 0.34529971614626814, "grad_norm": 0.5104501629279141, "learning_rate": 9.99311882673832e-06, "loss": 0.0323, "step": 1034 }, { "epoch": 0.34563366171314075, "grad_norm": 0.6392111836152765, "learning_rate": 9.993016529688033e-06, "loss": 0.0515, "step": 1035 }, { "epoch": 0.34596760728001336, "grad_norm": 0.6188335187658996, "learning_rate": 9.99291347838381e-06, "loss": 0.0436, "step": 1036 }, { "epoch": 0.34630155284688596, "grad_norm": 0.5010576873349704, "learning_rate": 9.992809672841218e-06, "loss": 0.0466, "step": 1037 }, { "epoch": 0.34663549841375857, "grad_norm": 0.6905362980082702, "learning_rate": 9.992705113075933e-06, "loss": 0.0484, "step": 1038 }, { "epoch": 0.3469694439806312, "grad_norm": 0.4979053704058577, "learning_rate": 9.992599799103754e-06, "loss": 0.0408, "step": 1039 }, { "epoch": 0.3473033895475038, "grad_norm": 0.7295972332427876, "learning_rate": 9.99249373094059e-06, "loss": 0.0626, "step": 1040 }, { "epoch": 0.34763733511437633, "grad_norm": 0.8606004422561815, "learning_rate": 9.992386908602466e-06, "loss": 0.0692, "step": 1041 }, { "epoch": 0.34797128068124894, "grad_norm": 0.4083370175535445, "learning_rate": 9.992279332105512e-06, "loss": 0.04, "step": 1042 }, { "epoch": 0.34830522624812155, "grad_norm": 0.6516779145116438, "learning_rate": 9.992171001465985e-06, "loss": 0.0606, "step": 1043 }, { "epoch": 0.34863917181499415, "grad_norm": 0.46181250675828023, "learning_rate": 9.992061916700247e-06, "loss": 0.0358, "step": 1044 }, { "epoch": 0.34897311738186676, "grad_norm": 0.4205777188027031, "learning_rate": 9.991952077824776e-06, "loss": 0.0455, "step": 1045 }, { "epoch": 0.34930706294873937, "grad_norm": 0.6479523601109954, "learning_rate": 9.991841484856166e-06, "loss": 0.0644, "step": 1046 }, { "epoch": 0.349641008515612, "grad_norm": 0.4445216305048308, "learning_rate": 9.991730137811122e-06, "loss": 0.049, "step": 1047 }, { "epoch": 0.3499749540824846, "grad_norm": 0.5093022665200985, "learning_rate": 9.991618036706464e-06, "loss": 0.0506, "step": 1048 }, { "epoch": 0.35030889964935713, "grad_norm": 0.6502313742689316, "learning_rate": 9.99150518155913e-06, "loss": 0.0541, "step": 1049 }, { "epoch": 0.35064284521622974, "grad_norm": 0.5924500908160205, "learning_rate": 9.991391572386162e-06, "loss": 0.0647, "step": 1050 }, { "epoch": 0.35097679078310234, "grad_norm": 0.7866655871375102, "learning_rate": 9.991277209204728e-06, "loss": 0.061, "step": 1051 }, { "epoch": 0.35131073634997495, "grad_norm": 0.7000607088577165, "learning_rate": 9.991162092032101e-06, "loss": 0.0764, "step": 1052 }, { "epoch": 0.35164468191684756, "grad_norm": 0.5909027653068674, "learning_rate": 9.99104622088567e-06, "loss": 0.0566, "step": 1053 }, { "epoch": 0.35197862748372016, "grad_norm": 0.6401638042696182, "learning_rate": 9.990929595782938e-06, "loss": 0.0387, "step": 1054 }, { "epoch": 0.35231257305059277, "grad_norm": 0.5051120395345109, "learning_rate": 9.990812216741529e-06, "loss": 0.0415, "step": 1055 }, { "epoch": 0.3526465186174654, "grad_norm": 0.5952307644884801, "learning_rate": 9.990694083779166e-06, "loss": 0.0572, "step": 1056 }, { "epoch": 0.3529804641843379, "grad_norm": 0.6185415636964717, "learning_rate": 9.990575196913699e-06, "loss": 0.0438, "step": 1057 }, { "epoch": 0.35331440975121053, "grad_norm": 0.6702811452763382, "learning_rate": 9.990455556163086e-06, "loss": 0.0621, "step": 1058 }, { "epoch": 0.35364835531808314, "grad_norm": 0.8501224163188211, "learning_rate": 9.990335161545401e-06, "loss": 0.062, "step": 1059 }, { "epoch": 0.35398230088495575, "grad_norm": 0.5000086771582775, "learning_rate": 9.99021401307883e-06, "loss": 0.056, "step": 1060 }, { "epoch": 0.35431624645182835, "grad_norm": 0.6794738007002346, "learning_rate": 9.990092110781675e-06, "loss": 0.0613, "step": 1061 }, { "epoch": 0.35465019201870096, "grad_norm": 0.7239179023620307, "learning_rate": 9.98996945467235e-06, "loss": 0.0515, "step": 1062 }, { "epoch": 0.35498413758557357, "grad_norm": 0.4383656802090465, "learning_rate": 9.989846044769384e-06, "loss": 0.0332, "step": 1063 }, { "epoch": 0.3553180831524462, "grad_norm": 0.6911848617553831, "learning_rate": 9.98972188109142e-06, "loss": 0.0509, "step": 1064 }, { "epoch": 0.3556520287193187, "grad_norm": 0.8755822538317177, "learning_rate": 9.989596963657213e-06, "loss": 0.0655, "step": 1065 }, { "epoch": 0.35598597428619133, "grad_norm": 1.2761713511915753, "learning_rate": 9.989471292485636e-06, "loss": 0.0824, "step": 1066 }, { "epoch": 0.35631991985306394, "grad_norm": 0.7097954027148441, "learning_rate": 9.989344867595668e-06, "loss": 0.0664, "step": 1067 }, { "epoch": 0.35665386541993654, "grad_norm": 0.662383198070942, "learning_rate": 9.989217689006412e-06, "loss": 0.0513, "step": 1068 }, { "epoch": 0.35698781098680915, "grad_norm": 0.5819687943988014, "learning_rate": 9.989089756737077e-06, "loss": 0.0484, "step": 1069 }, { "epoch": 0.35732175655368176, "grad_norm": 0.6486771942566604, "learning_rate": 9.988961070806991e-06, "loss": 0.055, "step": 1070 }, { "epoch": 0.35765570212055436, "grad_norm": 0.5068948204875828, "learning_rate": 9.988831631235591e-06, "loss": 0.0504, "step": 1071 }, { "epoch": 0.35798964768742697, "grad_norm": 0.475378088086222, "learning_rate": 9.98870143804243e-06, "loss": 0.0357, "step": 1072 }, { "epoch": 0.3583235932542995, "grad_norm": 0.47100408678138567, "learning_rate": 9.988570491247179e-06, "loss": 0.0452, "step": 1073 }, { "epoch": 0.35865753882117213, "grad_norm": 0.6000529837893382, "learning_rate": 9.988438790869616e-06, "loss": 0.0541, "step": 1074 }, { "epoch": 0.35899148438804473, "grad_norm": 0.7406154549381468, "learning_rate": 9.988306336929637e-06, "loss": 0.0588, "step": 1075 }, { "epoch": 0.35932542995491734, "grad_norm": 0.5825635763306092, "learning_rate": 9.988173129447251e-06, "loss": 0.0608, "step": 1076 }, { "epoch": 0.35965937552178995, "grad_norm": 0.5387058170920624, "learning_rate": 9.98803916844258e-06, "loss": 0.0501, "step": 1077 }, { "epoch": 0.35999332108866255, "grad_norm": 0.5341997705770352, "learning_rate": 9.98790445393586e-06, "loss": 0.0426, "step": 1078 }, { "epoch": 0.36032726665553516, "grad_norm": 0.7479385849991379, "learning_rate": 9.98776898594744e-06, "loss": 0.0571, "step": 1079 }, { "epoch": 0.36066121222240777, "grad_norm": 0.5834622266603493, "learning_rate": 9.987632764497787e-06, "loss": 0.0437, "step": 1080 }, { "epoch": 0.3609951577892804, "grad_norm": 0.418269485846868, "learning_rate": 9.987495789607478e-06, "loss": 0.0478, "step": 1081 }, { "epoch": 0.3613291033561529, "grad_norm": 0.5652661852831252, "learning_rate": 9.987358061297203e-06, "loss": 0.0522, "step": 1082 }, { "epoch": 0.36166304892302553, "grad_norm": 0.5112807279357294, "learning_rate": 9.987219579587768e-06, "loss": 0.0421, "step": 1083 }, { "epoch": 0.36199699448989814, "grad_norm": 0.5259032382547962, "learning_rate": 9.987080344500094e-06, "loss": 0.0503, "step": 1084 }, { "epoch": 0.36233094005677075, "grad_norm": 0.622483979721095, "learning_rate": 9.986940356055212e-06, "loss": 0.0435, "step": 1085 }, { "epoch": 0.36266488562364335, "grad_norm": 0.4294577205276959, "learning_rate": 9.986799614274271e-06, "loss": 0.0406, "step": 1086 }, { "epoch": 0.36299883119051596, "grad_norm": 0.45824971878379894, "learning_rate": 9.986658119178532e-06, "loss": 0.0533, "step": 1087 }, { "epoch": 0.36333277675738856, "grad_norm": 0.5766999228997975, "learning_rate": 9.986515870789366e-06, "loss": 0.0467, "step": 1088 }, { "epoch": 0.36366672232426117, "grad_norm": 0.5595701577587491, "learning_rate": 9.986372869128264e-06, "loss": 0.0574, "step": 1089 }, { "epoch": 0.3640006678911337, "grad_norm": 0.7333099233179504, "learning_rate": 9.986229114216828e-06, "loss": 0.0469, "step": 1090 }, { "epoch": 0.36433461345800633, "grad_norm": 0.4619575799630175, "learning_rate": 9.986084606076772e-06, "loss": 0.0431, "step": 1091 }, { "epoch": 0.36466855902487894, "grad_norm": 0.44056819753468957, "learning_rate": 9.985939344729926e-06, "loss": 0.0322, "step": 1092 }, { "epoch": 0.36500250459175154, "grad_norm": 0.5098328152482641, "learning_rate": 9.985793330198237e-06, "loss": 0.0474, "step": 1093 }, { "epoch": 0.36533645015862415, "grad_norm": 0.40154545526503355, "learning_rate": 9.98564656250376e-06, "loss": 0.0398, "step": 1094 }, { "epoch": 0.36567039572549676, "grad_norm": 0.6710265566649831, "learning_rate": 9.985499041668664e-06, "loss": 0.0534, "step": 1095 }, { "epoch": 0.36600434129236936, "grad_norm": 0.47184607186017474, "learning_rate": 9.985350767715236e-06, "loss": 0.0474, "step": 1096 }, { "epoch": 0.36633828685924197, "grad_norm": 0.43565151040472383, "learning_rate": 9.985201740665873e-06, "loss": 0.0376, "step": 1097 }, { "epoch": 0.3666722324261145, "grad_norm": 0.5620042551551873, "learning_rate": 9.98505196054309e-06, "loss": 0.0439, "step": 1098 }, { "epoch": 0.3670061779929871, "grad_norm": 0.5732658369619484, "learning_rate": 9.98490142736951e-06, "loss": 0.0452, "step": 1099 }, { "epoch": 0.36734012355985973, "grad_norm": 0.5005726529403091, "learning_rate": 9.984750141167874e-06, "loss": 0.0491, "step": 1100 }, { "epoch": 0.36767406912673234, "grad_norm": 0.7443452256344757, "learning_rate": 9.984598101961036e-06, "loss": 0.0518, "step": 1101 }, { "epoch": 0.36800801469360495, "grad_norm": 0.6843009797649618, "learning_rate": 9.984445309771963e-06, "loss": 0.0688, "step": 1102 }, { "epoch": 0.36834196026047755, "grad_norm": 0.5428163389051452, "learning_rate": 9.984291764623735e-06, "loss": 0.0401, "step": 1103 }, { "epoch": 0.36867590582735016, "grad_norm": 0.8129104692667752, "learning_rate": 9.98413746653955e-06, "loss": 0.0471, "step": 1104 }, { "epoch": 0.36900985139422277, "grad_norm": 0.42308489801042903, "learning_rate": 9.983982415542713e-06, "loss": 0.035, "step": 1105 }, { "epoch": 0.3693437969610953, "grad_norm": 0.575726981231527, "learning_rate": 9.983826611656649e-06, "loss": 0.0455, "step": 1106 }, { "epoch": 0.3696777425279679, "grad_norm": 0.6214448886769854, "learning_rate": 9.983670054904891e-06, "loss": 0.0426, "step": 1107 }, { "epoch": 0.37001168809484053, "grad_norm": 0.4840384379327109, "learning_rate": 9.98351274531109e-06, "loss": 0.0398, "step": 1108 }, { "epoch": 0.37034563366171314, "grad_norm": 0.6726458580399798, "learning_rate": 9.983354682899012e-06, "loss": 0.0525, "step": 1109 }, { "epoch": 0.37067957922858574, "grad_norm": 0.7898451002671479, "learning_rate": 9.98319586769253e-06, "loss": 0.0463, "step": 1110 }, { "epoch": 0.37101352479545835, "grad_norm": 0.6329293017127283, "learning_rate": 9.983036299715637e-06, "loss": 0.0561, "step": 1111 }, { "epoch": 0.37134747036233096, "grad_norm": 0.7822396614156675, "learning_rate": 9.98287597899244e-06, "loss": 0.0523, "step": 1112 }, { "epoch": 0.37168141592920356, "grad_norm": 0.6611247395030577, "learning_rate": 9.982714905547152e-06, "loss": 0.0524, "step": 1113 }, { "epoch": 0.3720153614960761, "grad_norm": 0.6292445023529448, "learning_rate": 9.982553079404109e-06, "loss": 0.0599, "step": 1114 }, { "epoch": 0.3723493070629487, "grad_norm": 0.4281011826469759, "learning_rate": 9.982390500587755e-06, "loss": 0.0387, "step": 1115 }, { "epoch": 0.3726832526298213, "grad_norm": 0.5993569991835048, "learning_rate": 9.982227169122652e-06, "loss": 0.0403, "step": 1116 }, { "epoch": 0.37301719819669393, "grad_norm": 0.6425741270395904, "learning_rate": 9.98206308503347e-06, "loss": 0.0571, "step": 1117 }, { "epoch": 0.37335114376356654, "grad_norm": 0.35852961411871975, "learning_rate": 9.981898248344996e-06, "loss": 0.0376, "step": 1118 }, { "epoch": 0.37368508933043915, "grad_norm": 0.39932054074997514, "learning_rate": 9.981732659082136e-06, "loss": 0.0433, "step": 1119 }, { "epoch": 0.37401903489731175, "grad_norm": 0.4132193378666293, "learning_rate": 9.981566317269895e-06, "loss": 0.0343, "step": 1120 }, { "epoch": 0.37435298046418436, "grad_norm": 0.5548913168137909, "learning_rate": 9.981399222933408e-06, "loss": 0.0523, "step": 1121 }, { "epoch": 0.3746869260310569, "grad_norm": 0.7247522744765278, "learning_rate": 9.981231376097914e-06, "loss": 0.0572, "step": 1122 }, { "epoch": 0.3750208715979295, "grad_norm": 0.4952993307764918, "learning_rate": 9.981062776788769e-06, "loss": 0.0385, "step": 1123 }, { "epoch": 0.3753548171648021, "grad_norm": 0.45534515652271046, "learning_rate": 9.98089342503144e-06, "loss": 0.0506, "step": 1124 }, { "epoch": 0.37568876273167473, "grad_norm": 0.474082473254327, "learning_rate": 9.980723320851512e-06, "loss": 0.0494, "step": 1125 }, { "epoch": 0.37602270829854734, "grad_norm": 0.5496501514083226, "learning_rate": 9.98055246427468e-06, "loss": 0.055, "step": 1126 }, { "epoch": 0.37635665386541994, "grad_norm": 0.7029249775324979, "learning_rate": 9.980380855326754e-06, "loss": 0.0611, "step": 1127 }, { "epoch": 0.37669059943229255, "grad_norm": 0.3571680349771877, "learning_rate": 9.980208494033659e-06, "loss": 0.0456, "step": 1128 }, { "epoch": 0.37702454499916516, "grad_norm": 0.5787446290422406, "learning_rate": 9.98003538042143e-06, "loss": 0.0543, "step": 1129 }, { "epoch": 0.37735849056603776, "grad_norm": 0.5193793235190981, "learning_rate": 9.979861514516217e-06, "loss": 0.0485, "step": 1130 }, { "epoch": 0.3776924361329103, "grad_norm": 0.4450333193144988, "learning_rate": 9.979686896344289e-06, "loss": 0.047, "step": 1131 }, { "epoch": 0.3780263816997829, "grad_norm": 0.5238410201117426, "learning_rate": 9.97951152593202e-06, "loss": 0.0596, "step": 1132 }, { "epoch": 0.3783603272666555, "grad_norm": 0.7958266829905504, "learning_rate": 9.979335403305904e-06, "loss": 0.0491, "step": 1133 }, { "epoch": 0.37869427283352813, "grad_norm": 0.5391712301146426, "learning_rate": 9.979158528492546e-06, "loss": 0.0496, "step": 1134 }, { "epoch": 0.37902821840040074, "grad_norm": 0.7434018159065988, "learning_rate": 9.978980901518663e-06, "loss": 0.0519, "step": 1135 }, { "epoch": 0.37936216396727335, "grad_norm": 0.5140398521432605, "learning_rate": 9.978802522411091e-06, "loss": 0.0477, "step": 1136 }, { "epoch": 0.37969610953414595, "grad_norm": 0.525099189694247, "learning_rate": 9.978623391196774e-06, "loss": 0.0627, "step": 1137 }, { "epoch": 0.38003005510101856, "grad_norm": 0.8405858175836848, "learning_rate": 9.978443507902772e-06, "loss": 0.0523, "step": 1138 }, { "epoch": 0.3803640006678911, "grad_norm": 0.5362127551512809, "learning_rate": 9.978262872556257e-06, "loss": 0.0525, "step": 1139 }, { "epoch": 0.3806979462347637, "grad_norm": 0.8901488866233538, "learning_rate": 9.97808148518452e-06, "loss": 0.0598, "step": 1140 }, { "epoch": 0.3810318918016363, "grad_norm": 0.8614316646494008, "learning_rate": 9.977899345814959e-06, "loss": 0.0779, "step": 1141 }, { "epoch": 0.38136583736850893, "grad_norm": 0.5113726601773714, "learning_rate": 9.977716454475089e-06, "loss": 0.0508, "step": 1142 }, { "epoch": 0.38169978293538154, "grad_norm": 0.4148156818182061, "learning_rate": 9.977532811192539e-06, "loss": 0.0416, "step": 1143 }, { "epoch": 0.38203372850225414, "grad_norm": 0.5817665146573645, "learning_rate": 9.977348415995048e-06, "loss": 0.0421, "step": 1144 }, { "epoch": 0.38236767406912675, "grad_norm": 0.6024373699140408, "learning_rate": 9.977163268910472e-06, "loss": 0.0457, "step": 1145 }, { "epoch": 0.38270161963599936, "grad_norm": 0.4992753612965335, "learning_rate": 9.976977369966781e-06, "loss": 0.0411, "step": 1146 }, { "epoch": 0.3830355652028719, "grad_norm": 0.43207767333593483, "learning_rate": 9.976790719192055e-06, "loss": 0.042, "step": 1147 }, { "epoch": 0.3833695107697445, "grad_norm": 0.6297237611298425, "learning_rate": 9.976603316614492e-06, "loss": 0.0579, "step": 1148 }, { "epoch": 0.3837034563366171, "grad_norm": 0.4638601261210525, "learning_rate": 9.976415162262401e-06, "loss": 0.0303, "step": 1149 }, { "epoch": 0.38403740190348973, "grad_norm": 0.5053291928207179, "learning_rate": 9.976226256164204e-06, "loss": 0.0531, "step": 1150 }, { "epoch": 0.38437134747036233, "grad_norm": 0.49472959374363185, "learning_rate": 9.976036598348437e-06, "loss": 0.0415, "step": 1151 }, { "epoch": 0.38470529303723494, "grad_norm": 0.43376405843719706, "learning_rate": 9.975846188843754e-06, "loss": 0.0386, "step": 1152 }, { "epoch": 0.38503923860410755, "grad_norm": 0.49450793525193587, "learning_rate": 9.975655027678913e-06, "loss": 0.052, "step": 1153 }, { "epoch": 0.38537318417098015, "grad_norm": 0.4413392442318113, "learning_rate": 9.975463114882792e-06, "loss": 0.0403, "step": 1154 }, { "epoch": 0.3857071297378527, "grad_norm": 0.6403828404233969, "learning_rate": 9.975270450484385e-06, "loss": 0.0477, "step": 1155 }, { "epoch": 0.3860410753047253, "grad_norm": 0.545724273094895, "learning_rate": 9.975077034512795e-06, "loss": 0.0438, "step": 1156 }, { "epoch": 0.3863750208715979, "grad_norm": 0.4567797017718175, "learning_rate": 9.97488286699724e-06, "loss": 0.0503, "step": 1157 }, { "epoch": 0.3867089664384705, "grad_norm": 0.4477030075936158, "learning_rate": 9.974687947967047e-06, "loss": 0.0407, "step": 1158 }, { "epoch": 0.38704291200534313, "grad_norm": 0.5691343454538987, "learning_rate": 9.974492277451668e-06, "loss": 0.0468, "step": 1159 }, { "epoch": 0.38737685757221574, "grad_norm": 0.6618096964997607, "learning_rate": 9.974295855480658e-06, "loss": 0.0562, "step": 1160 }, { "epoch": 0.38771080313908834, "grad_norm": 0.4112412707585751, "learning_rate": 9.974098682083687e-06, "loss": 0.0313, "step": 1161 }, { "epoch": 0.38804474870596095, "grad_norm": 0.3845254377778595, "learning_rate": 9.973900757290541e-06, "loss": 0.0445, "step": 1162 }, { "epoch": 0.3883786942728335, "grad_norm": 0.49392505730444436, "learning_rate": 9.97370208113112e-06, "loss": 0.0509, "step": 1163 }, { "epoch": 0.3887126398397061, "grad_norm": 0.588413841355745, "learning_rate": 9.973502653635438e-06, "loss": 0.0579, "step": 1164 }, { "epoch": 0.3890465854065787, "grad_norm": 0.6271328850335199, "learning_rate": 9.97330247483362e-06, "loss": 0.0561, "step": 1165 }, { "epoch": 0.3893805309734513, "grad_norm": 0.3921815676693701, "learning_rate": 9.973101544755901e-06, "loss": 0.0418, "step": 1166 }, { "epoch": 0.38971447654032393, "grad_norm": 0.6668285107357198, "learning_rate": 9.97289986343264e-06, "loss": 0.0515, "step": 1167 }, { "epoch": 0.39004842210719654, "grad_norm": 0.5044214747666783, "learning_rate": 9.972697430894299e-06, "loss": 0.048, "step": 1168 }, { "epoch": 0.39038236767406914, "grad_norm": 0.5810068915729227, "learning_rate": 9.97249424717146e-06, "loss": 0.0437, "step": 1169 }, { "epoch": 0.39071631324094175, "grad_norm": 0.4942384218878549, "learning_rate": 9.972290312294816e-06, "loss": 0.056, "step": 1170 }, { "epoch": 0.3910502588078143, "grad_norm": 0.6300510182254271, "learning_rate": 9.972085626295173e-06, "loss": 0.052, "step": 1171 }, { "epoch": 0.3913842043746869, "grad_norm": 0.5135514249497448, "learning_rate": 9.971880189203452e-06, "loss": 0.0556, "step": 1172 }, { "epoch": 0.3917181499415595, "grad_norm": 0.5381850673904547, "learning_rate": 9.971674001050687e-06, "loss": 0.0545, "step": 1173 }, { "epoch": 0.3920520955084321, "grad_norm": 0.7459483271483872, "learning_rate": 9.971467061868022e-06, "loss": 0.0546, "step": 1174 }, { "epoch": 0.3923860410753047, "grad_norm": 0.5703554311385124, "learning_rate": 9.971259371686724e-06, "loss": 0.0378, "step": 1175 }, { "epoch": 0.39271998664217733, "grad_norm": 0.611429485228926, "learning_rate": 9.971050930538161e-06, "loss": 0.0647, "step": 1176 }, { "epoch": 0.39305393220904994, "grad_norm": 0.6200371316956779, "learning_rate": 9.970841738453823e-06, "loss": 0.0522, "step": 1177 }, { "epoch": 0.39338787777592255, "grad_norm": 0.5539338167498827, "learning_rate": 9.970631795465311e-06, "loss": 0.0466, "step": 1178 }, { "epoch": 0.39372182334279515, "grad_norm": 0.372460342051298, "learning_rate": 9.970421101604339e-06, "loss": 0.0396, "step": 1179 }, { "epoch": 0.3940557689096677, "grad_norm": 0.5819236200727373, "learning_rate": 9.970209656902734e-06, "loss": 0.053, "step": 1180 }, { "epoch": 0.3943897144765403, "grad_norm": 0.43112048749562476, "learning_rate": 9.969997461392439e-06, "loss": 0.0443, "step": 1181 }, { "epoch": 0.3947236600434129, "grad_norm": 0.6862293235082171, "learning_rate": 9.969784515105508e-06, "loss": 0.0693, "step": 1182 }, { "epoch": 0.3950576056102855, "grad_norm": 0.46118889763849785, "learning_rate": 9.969570818074109e-06, "loss": 0.0333, "step": 1183 }, { "epoch": 0.39539155117715813, "grad_norm": 0.7480464718455543, "learning_rate": 9.96935637033052e-06, "loss": 0.0418, "step": 1184 }, { "epoch": 0.39572549674403074, "grad_norm": 0.6006469165707279, "learning_rate": 9.969141171907142e-06, "loss": 0.0418, "step": 1185 }, { "epoch": 0.39605944231090334, "grad_norm": 0.5126146905146513, "learning_rate": 9.968925222836478e-06, "loss": 0.047, "step": 1186 }, { "epoch": 0.39639338787777595, "grad_norm": 0.39110614660927157, "learning_rate": 9.968708523151154e-06, "loss": 0.0468, "step": 1187 }, { "epoch": 0.3967273334446485, "grad_norm": 0.5815177714468796, "learning_rate": 9.968491072883902e-06, "loss": 0.0576, "step": 1188 }, { "epoch": 0.3970612790115211, "grad_norm": 0.6559422926494999, "learning_rate": 9.968272872067571e-06, "loss": 0.0634, "step": 1189 }, { "epoch": 0.3973952245783937, "grad_norm": 0.6095510416572145, "learning_rate": 9.968053920735124e-06, "loss": 0.0592, "step": 1190 }, { "epoch": 0.3977291701452663, "grad_norm": 0.5355585919017009, "learning_rate": 9.967834218919634e-06, "loss": 0.0459, "step": 1191 }, { "epoch": 0.3980631157121389, "grad_norm": 0.5607840872774007, "learning_rate": 9.967613766654293e-06, "loss": 0.0447, "step": 1192 }, { "epoch": 0.39839706127901153, "grad_norm": 0.48251025480811577, "learning_rate": 9.967392563972399e-06, "loss": 0.0415, "step": 1193 }, { "epoch": 0.39873100684588414, "grad_norm": 0.5809032518017457, "learning_rate": 9.96717061090737e-06, "loss": 0.0536, "step": 1194 }, { "epoch": 0.39906495241275675, "grad_norm": 0.5545765346935037, "learning_rate": 9.966947907492734e-06, "loss": 0.0505, "step": 1195 }, { "epoch": 0.3993988979796293, "grad_norm": 0.5487534111585209, "learning_rate": 9.966724453762131e-06, "loss": 0.0651, "step": 1196 }, { "epoch": 0.3997328435465019, "grad_norm": 0.7918118540428859, "learning_rate": 9.96650024974932e-06, "loss": 0.0901, "step": 1197 }, { "epoch": 0.4000667891133745, "grad_norm": 0.4377465320214387, "learning_rate": 9.966275295488165e-06, "loss": 0.0478, "step": 1198 }, { "epoch": 0.4004007346802471, "grad_norm": 0.4363946236918287, "learning_rate": 9.966049591012651e-06, "loss": 0.0412, "step": 1199 }, { "epoch": 0.4007346802471197, "grad_norm": 0.5645750603178316, "learning_rate": 9.965823136356877e-06, "loss": 0.058, "step": 1200 }, { "epoch": 0.40106862581399233, "grad_norm": 0.6490952716726128, "learning_rate": 9.965595931555043e-06, "loss": 0.052, "step": 1201 }, { "epoch": 0.40140257138086494, "grad_norm": 0.4281760314358244, "learning_rate": 9.965367976641478e-06, "loss": 0.0394, "step": 1202 }, { "epoch": 0.40173651694773754, "grad_norm": 0.8412813438780575, "learning_rate": 9.965139271650614e-06, "loss": 0.0763, "step": 1203 }, { "epoch": 0.4020704625146101, "grad_norm": 0.6074908680773805, "learning_rate": 9.964909816617002e-06, "loss": 0.0564, "step": 1204 }, { "epoch": 0.4024044080814827, "grad_norm": 0.4679413848655991, "learning_rate": 9.964679611575298e-06, "loss": 0.0369, "step": 1205 }, { "epoch": 0.4027383536483553, "grad_norm": 0.5847037533969296, "learning_rate": 9.964448656560286e-06, "loss": 0.039, "step": 1206 }, { "epoch": 0.4030722992152279, "grad_norm": 0.3845921861967511, "learning_rate": 9.964216951606848e-06, "loss": 0.0336, "step": 1207 }, { "epoch": 0.4034062447821005, "grad_norm": 0.41271257424344426, "learning_rate": 9.963984496749988e-06, "loss": 0.0414, "step": 1208 }, { "epoch": 0.4037401903489731, "grad_norm": 0.7027487676934003, "learning_rate": 9.96375129202482e-06, "loss": 0.0687, "step": 1209 }, { "epoch": 0.40407413591584573, "grad_norm": 0.41417343851954624, "learning_rate": 9.963517337466575e-06, "loss": 0.0357, "step": 1210 }, { "epoch": 0.40440808148271834, "grad_norm": 0.4817301873683958, "learning_rate": 9.963282633110591e-06, "loss": 0.0419, "step": 1211 }, { "epoch": 0.4047420270495909, "grad_norm": 0.3188002448155716, "learning_rate": 9.963047178992324e-06, "loss": 0.0276, "step": 1212 }, { "epoch": 0.4050759726164635, "grad_norm": 0.5874020573242373, "learning_rate": 9.962810975147344e-06, "loss": 0.0626, "step": 1213 }, { "epoch": 0.4054099181833361, "grad_norm": 0.6407406965589435, "learning_rate": 9.96257402161133e-06, "loss": 0.0419, "step": 1214 }, { "epoch": 0.4057438637502087, "grad_norm": 0.5759204262478326, "learning_rate": 9.962336318420078e-06, "loss": 0.0454, "step": 1215 }, { "epoch": 0.4060778093170813, "grad_norm": 0.5243793824473311, "learning_rate": 9.962097865609495e-06, "loss": 0.0387, "step": 1216 }, { "epoch": 0.4064117548839539, "grad_norm": 0.6572551064096243, "learning_rate": 9.961858663215604e-06, "loss": 0.0543, "step": 1217 }, { "epoch": 0.40674570045082653, "grad_norm": 0.6558388201939296, "learning_rate": 9.961618711274537e-06, "loss": 0.0583, "step": 1218 }, { "epoch": 0.40707964601769914, "grad_norm": 0.5551867364009926, "learning_rate": 9.961378009822542e-06, "loss": 0.0363, "step": 1219 }, { "epoch": 0.4074135915845717, "grad_norm": 0.44824260044365105, "learning_rate": 9.961136558895981e-06, "loss": 0.0405, "step": 1220 }, { "epoch": 0.4077475371514443, "grad_norm": 0.526321946092579, "learning_rate": 9.960894358531328e-06, "loss": 0.0413, "step": 1221 }, { "epoch": 0.4080814827183169, "grad_norm": 0.6622908968547422, "learning_rate": 9.960651408765168e-06, "loss": 0.0417, "step": 1222 }, { "epoch": 0.4084154282851895, "grad_norm": 0.5746512951521658, "learning_rate": 9.960407709634203e-06, "loss": 0.057, "step": 1223 }, { "epoch": 0.4087493738520621, "grad_norm": 0.9897729337912883, "learning_rate": 9.960163261175247e-06, "loss": 0.0434, "step": 1224 }, { "epoch": 0.4090833194189347, "grad_norm": 0.36426890117105776, "learning_rate": 9.959918063425228e-06, "loss": 0.0361, "step": 1225 }, { "epoch": 0.4094172649858073, "grad_norm": 0.5631602662914157, "learning_rate": 9.959672116421181e-06, "loss": 0.0445, "step": 1226 }, { "epoch": 0.40975121055267993, "grad_norm": 0.5870494363349898, "learning_rate": 9.959425420200267e-06, "loss": 0.045, "step": 1227 }, { "epoch": 0.4100851561195525, "grad_norm": 0.6056365184139995, "learning_rate": 9.959177974799742e-06, "loss": 0.0617, "step": 1228 }, { "epoch": 0.4104191016864251, "grad_norm": 0.47243494475642694, "learning_rate": 9.958929780256996e-06, "loss": 0.0515, "step": 1229 }, { "epoch": 0.4107530472532977, "grad_norm": 0.4965355083160451, "learning_rate": 9.958680836609516e-06, "loss": 0.0419, "step": 1230 }, { "epoch": 0.4110869928201703, "grad_norm": 0.6125293052777985, "learning_rate": 9.95843114389491e-06, "loss": 0.0533, "step": 1231 }, { "epoch": 0.4114209383870429, "grad_norm": 0.48382750319054785, "learning_rate": 9.958180702150895e-06, "loss": 0.0361, "step": 1232 }, { "epoch": 0.4117548839539155, "grad_norm": 0.6457316899991283, "learning_rate": 9.957929511415304e-06, "loss": 0.0574, "step": 1233 }, { "epoch": 0.4120888295207881, "grad_norm": 0.6018987285814226, "learning_rate": 9.957677571726084e-06, "loss": 0.0533, "step": 1234 }, { "epoch": 0.41242277508766073, "grad_norm": 0.5787195430357516, "learning_rate": 9.95742488312129e-06, "loss": 0.0706, "step": 1235 }, { "epoch": 0.41275672065453334, "grad_norm": 0.5810859648790506, "learning_rate": 9.957171445639096e-06, "loss": 0.0495, "step": 1236 }, { "epoch": 0.4130906662214059, "grad_norm": 0.5792339241008343, "learning_rate": 9.956917259317788e-06, "loss": 0.0414, "step": 1237 }, { "epoch": 0.4134246117882785, "grad_norm": 0.5015069588431272, "learning_rate": 9.95666232419576e-06, "loss": 0.0476, "step": 1238 }, { "epoch": 0.4137585573551511, "grad_norm": 0.5523189073691389, "learning_rate": 9.956406640311527e-06, "loss": 0.054, "step": 1239 }, { "epoch": 0.4140925029220237, "grad_norm": 0.5407896672488476, "learning_rate": 9.956150207703712e-06, "loss": 0.0373, "step": 1240 }, { "epoch": 0.4144264484888963, "grad_norm": 0.5743250272298346, "learning_rate": 9.955893026411048e-06, "loss": 0.0532, "step": 1241 }, { "epoch": 0.4147603940557689, "grad_norm": 0.46544361103029347, "learning_rate": 9.955635096472391e-06, "loss": 0.0333, "step": 1242 }, { "epoch": 0.41509433962264153, "grad_norm": 0.8758747467998904, "learning_rate": 9.9553764179267e-06, "loss": 0.0558, "step": 1243 }, { "epoch": 0.41542828518951413, "grad_norm": 0.5038013010376116, "learning_rate": 9.955116990813056e-06, "loss": 0.0591, "step": 1244 }, { "epoch": 0.4157622307563867, "grad_norm": 0.479430959952089, "learning_rate": 9.954856815170644e-06, "loss": 0.0454, "step": 1245 }, { "epoch": 0.4160961763232593, "grad_norm": 0.6542054204376075, "learning_rate": 9.95459589103877e-06, "loss": 0.047, "step": 1246 }, { "epoch": 0.4164301218901319, "grad_norm": 0.45841216149214287, "learning_rate": 9.954334218456846e-06, "loss": 0.0371, "step": 1247 }, { "epoch": 0.4167640674570045, "grad_norm": 0.8704556228734338, "learning_rate": 9.954071797464405e-06, "loss": 0.0624, "step": 1248 }, { "epoch": 0.4170980130238771, "grad_norm": 0.5585633517077369, "learning_rate": 9.953808628101086e-06, "loss": 0.0507, "step": 1249 }, { "epoch": 0.4174319585907497, "grad_norm": 0.5522660658752164, "learning_rate": 9.953544710406646e-06, "loss": 0.0482, "step": 1250 }, { "epoch": 0.4177659041576223, "grad_norm": 0.48948221308855294, "learning_rate": 9.95328004442095e-06, "loss": 0.0445, "step": 1251 }, { "epoch": 0.41809984972449493, "grad_norm": 0.4797459764133718, "learning_rate": 9.953014630183979e-06, "loss": 0.0377, "step": 1252 }, { "epoch": 0.4184337952913675, "grad_norm": 0.7395642030505643, "learning_rate": 9.95274846773583e-06, "loss": 0.0542, "step": 1253 }, { "epoch": 0.4187677408582401, "grad_norm": 0.337693760872802, "learning_rate": 9.952481557116708e-06, "loss": 0.0294, "step": 1254 }, { "epoch": 0.4191016864251127, "grad_norm": 0.5120454944498827, "learning_rate": 9.952213898366932e-06, "loss": 0.0487, "step": 1255 }, { "epoch": 0.4194356319919853, "grad_norm": 0.4787237800304835, "learning_rate": 9.951945491526938e-06, "loss": 0.039, "step": 1256 }, { "epoch": 0.4197695775588579, "grad_norm": 0.5560558795953889, "learning_rate": 9.951676336637267e-06, "loss": 0.0446, "step": 1257 }, { "epoch": 0.4201035231257305, "grad_norm": 0.47763341704211254, "learning_rate": 9.951406433738587e-06, "loss": 0.0452, "step": 1258 }, { "epoch": 0.4204374686926031, "grad_norm": 0.6009184004401019, "learning_rate": 9.95113578287166e-06, "loss": 0.0525, "step": 1259 }, { "epoch": 0.42077141425947573, "grad_norm": 0.9478592385559077, "learning_rate": 9.950864384077376e-06, "loss": 0.0528, "step": 1260 }, { "epoch": 0.4211053598263483, "grad_norm": 0.6769818272741464, "learning_rate": 9.950592237396732e-06, "loss": 0.0483, "step": 1261 }, { "epoch": 0.4214393053932209, "grad_norm": 0.3330372524755465, "learning_rate": 9.95031934287084e-06, "loss": 0.0251, "step": 1262 }, { "epoch": 0.4217732509600935, "grad_norm": 0.5699833015601791, "learning_rate": 9.950045700540923e-06, "loss": 0.0481, "step": 1263 }, { "epoch": 0.4221071965269661, "grad_norm": 0.43939228148604276, "learning_rate": 9.949771310448317e-06, "loss": 0.0441, "step": 1264 }, { "epoch": 0.4224411420938387, "grad_norm": 0.39998322656970237, "learning_rate": 9.949496172634474e-06, "loss": 0.0338, "step": 1265 }, { "epoch": 0.4227750876607113, "grad_norm": 0.5307284843623364, "learning_rate": 9.949220287140955e-06, "loss": 0.0526, "step": 1266 }, { "epoch": 0.4231090332275839, "grad_norm": 0.403318394857699, "learning_rate": 9.948943654009438e-06, "loss": 0.051, "step": 1267 }, { "epoch": 0.4234429787944565, "grad_norm": 0.44163504205551896, "learning_rate": 9.948666273281708e-06, "loss": 0.0457, "step": 1268 }, { "epoch": 0.4237769243613291, "grad_norm": 0.4168383676719667, "learning_rate": 9.94838814499967e-06, "loss": 0.0487, "step": 1269 }, { "epoch": 0.4241108699282017, "grad_norm": 0.37701006647801505, "learning_rate": 9.948109269205338e-06, "loss": 0.0347, "step": 1270 }, { "epoch": 0.4244448154950743, "grad_norm": 0.6020931247605068, "learning_rate": 9.947829645940836e-06, "loss": 0.0491, "step": 1271 }, { "epoch": 0.4247787610619469, "grad_norm": 0.41915818325977333, "learning_rate": 9.94754927524841e-06, "loss": 0.0422, "step": 1272 }, { "epoch": 0.4251127066288195, "grad_norm": 0.6051942529439326, "learning_rate": 9.947268157170409e-06, "loss": 0.0535, "step": 1273 }, { "epoch": 0.4254466521956921, "grad_norm": 0.38521647220535943, "learning_rate": 9.9469862917493e-06, "loss": 0.0378, "step": 1274 }, { "epoch": 0.4257805977625647, "grad_norm": 0.355228311505307, "learning_rate": 9.946703679027664e-06, "loss": 0.0356, "step": 1275 }, { "epoch": 0.4261145433294373, "grad_norm": 0.5270972733848863, "learning_rate": 9.946420319048192e-06, "loss": 0.0689, "step": 1276 }, { "epoch": 0.4264484888963099, "grad_norm": 0.5112082801340879, "learning_rate": 9.946136211853689e-06, "loss": 0.0505, "step": 1277 }, { "epoch": 0.4267824344631825, "grad_norm": 0.44718504260614705, "learning_rate": 9.94585135748707e-06, "loss": 0.0493, "step": 1278 }, { "epoch": 0.4271163800300551, "grad_norm": 0.5699231475397143, "learning_rate": 9.94556575599137e-06, "loss": 0.0357, "step": 1279 }, { "epoch": 0.4274503255969277, "grad_norm": 0.5365753163832601, "learning_rate": 9.94527940740973e-06, "loss": 0.0448, "step": 1280 }, { "epoch": 0.4277842711638003, "grad_norm": 0.4735308826285214, "learning_rate": 9.944992311785406e-06, "loss": 0.0546, "step": 1281 }, { "epoch": 0.4281182167306729, "grad_norm": 0.47380049162337246, "learning_rate": 9.94470446916177e-06, "loss": 0.0452, "step": 1282 }, { "epoch": 0.4284521622975455, "grad_norm": 0.4644022381468032, "learning_rate": 9.9444158795823e-06, "loss": 0.046, "step": 1283 }, { "epoch": 0.4287861078644181, "grad_norm": 0.44717115014964487, "learning_rate": 9.944126543090593e-06, "loss": 0.0416, "step": 1284 }, { "epoch": 0.4291200534312907, "grad_norm": 0.4414360842309514, "learning_rate": 9.943836459730356e-06, "loss": 0.0419, "step": 1285 }, { "epoch": 0.4294539989981633, "grad_norm": 1.0178687587182773, "learning_rate": 9.943545629545412e-06, "loss": 0.0663, "step": 1286 }, { "epoch": 0.4297879445650359, "grad_norm": 0.46936757985831473, "learning_rate": 9.94325405257969e-06, "loss": 0.0463, "step": 1287 }, { "epoch": 0.4301218901319085, "grad_norm": 0.4487040089691065, "learning_rate": 9.94296172887724e-06, "loss": 0.0447, "step": 1288 }, { "epoch": 0.4304558356987811, "grad_norm": 0.41537669627976054, "learning_rate": 9.942668658482219e-06, "loss": 0.0348, "step": 1289 }, { "epoch": 0.4307897812656537, "grad_norm": 0.5726595986047104, "learning_rate": 9.942374841438898e-06, "loss": 0.0617, "step": 1290 }, { "epoch": 0.4311237268325263, "grad_norm": 0.43582524228831776, "learning_rate": 9.942080277791663e-06, "loss": 0.0357, "step": 1291 }, { "epoch": 0.4314576723993989, "grad_norm": 0.5963138767833485, "learning_rate": 9.941784967585012e-06, "loss": 0.0531, "step": 1292 }, { "epoch": 0.4317916179662715, "grad_norm": 0.36943889395441276, "learning_rate": 9.941488910863553e-06, "loss": 0.0288, "step": 1293 }, { "epoch": 0.4321255635331441, "grad_norm": 0.37474135577933215, "learning_rate": 9.941192107672011e-06, "loss": 0.0347, "step": 1294 }, { "epoch": 0.4324595091000167, "grad_norm": 0.4891570752361696, "learning_rate": 9.940894558055218e-06, "loss": 0.0432, "step": 1295 }, { "epoch": 0.4327934546668893, "grad_norm": 0.5033309022266377, "learning_rate": 9.940596262058128e-06, "loss": 0.0494, "step": 1296 }, { "epoch": 0.4331274002337619, "grad_norm": 0.4535306288903673, "learning_rate": 9.940297219725797e-06, "loss": 0.0431, "step": 1297 }, { "epoch": 0.4334613458006345, "grad_norm": 0.42543847729786305, "learning_rate": 9.939997431103402e-06, "loss": 0.0466, "step": 1298 }, { "epoch": 0.4337952913675071, "grad_norm": 0.5496540648197035, "learning_rate": 9.939696896236229e-06, "loss": 0.0512, "step": 1299 }, { "epoch": 0.4341292369343797, "grad_norm": 0.5483031819247153, "learning_rate": 9.939395615169673e-06, "loss": 0.0424, "step": 1300 }, { "epoch": 0.4344631825012523, "grad_norm": 0.514101464539269, "learning_rate": 9.939093587949254e-06, "loss": 0.056, "step": 1301 }, { "epoch": 0.43479712806812487, "grad_norm": 0.5153592179996069, "learning_rate": 9.938790814620591e-06, "loss": 0.0452, "step": 1302 }, { "epoch": 0.4351310736349975, "grad_norm": 0.8560536548611567, "learning_rate": 9.938487295229423e-06, "loss": 0.0647, "step": 1303 }, { "epoch": 0.4354650192018701, "grad_norm": 0.45392421020879836, "learning_rate": 9.9381830298216e-06, "loss": 0.0449, "step": 1304 }, { "epoch": 0.4357989647687427, "grad_norm": 0.6660398362359009, "learning_rate": 9.937878018443085e-06, "loss": 0.053, "step": 1305 }, { "epoch": 0.4361329103356153, "grad_norm": 1.093738726263721, "learning_rate": 9.937572261139956e-06, "loss": 0.0404, "step": 1306 }, { "epoch": 0.4364668559024879, "grad_norm": 0.8851395672879241, "learning_rate": 9.937265757958397e-06, "loss": 0.0753, "step": 1307 }, { "epoch": 0.4368008014693605, "grad_norm": 0.4741734263702632, "learning_rate": 9.93695850894471e-06, "loss": 0.0386, "step": 1308 }, { "epoch": 0.4371347470362331, "grad_norm": 0.5789187133288732, "learning_rate": 9.93665051414531e-06, "loss": 0.0484, "step": 1309 }, { "epoch": 0.43746869260310567, "grad_norm": 0.6102715857348852, "learning_rate": 9.936341773606723e-06, "loss": 0.0443, "step": 1310 }, { "epoch": 0.4378026381699783, "grad_norm": 0.783967201141356, "learning_rate": 9.936032287375587e-06, "loss": 0.0554, "step": 1311 }, { "epoch": 0.4381365837368509, "grad_norm": 0.48743951221359044, "learning_rate": 9.935722055498655e-06, "loss": 0.0369, "step": 1312 }, { "epoch": 0.4384705293037235, "grad_norm": 0.6692855376112433, "learning_rate": 9.935411078022791e-06, "loss": 0.0537, "step": 1313 }, { "epoch": 0.4388044748705961, "grad_norm": 0.7812088505589483, "learning_rate": 9.93509935499497e-06, "loss": 0.0537, "step": 1314 }, { "epoch": 0.4391384204374687, "grad_norm": 0.975296219611782, "learning_rate": 9.934786886462282e-06, "loss": 0.0412, "step": 1315 }, { "epoch": 0.4394723660043413, "grad_norm": 0.509882945002344, "learning_rate": 9.934473672471931e-06, "loss": 0.0426, "step": 1316 }, { "epoch": 0.4398063115712139, "grad_norm": 0.4494078479356775, "learning_rate": 9.934159713071229e-06, "loss": 0.029, "step": 1317 }, { "epoch": 0.44014025713808647, "grad_norm": 0.49041183105866554, "learning_rate": 9.933845008307605e-06, "loss": 0.0462, "step": 1318 }, { "epoch": 0.4404742027049591, "grad_norm": 0.46408390070537386, "learning_rate": 9.933529558228599e-06, "loss": 0.0363, "step": 1319 }, { "epoch": 0.4408081482718317, "grad_norm": 0.5570170216019052, "learning_rate": 9.933213362881861e-06, "loss": 0.052, "step": 1320 }, { "epoch": 0.4411420938387043, "grad_norm": 0.4219448033529627, "learning_rate": 9.932896422315159e-06, "loss": 0.0382, "step": 1321 }, { "epoch": 0.4414760394055769, "grad_norm": 0.6070665233479298, "learning_rate": 9.93257873657637e-06, "loss": 0.0449, "step": 1322 }, { "epoch": 0.4418099849724495, "grad_norm": 0.5361328280997671, "learning_rate": 9.932260305713481e-06, "loss": 0.0529, "step": 1323 }, { "epoch": 0.4421439305393221, "grad_norm": 0.5227207215911478, "learning_rate": 9.9319411297746e-06, "loss": 0.0443, "step": 1324 }, { "epoch": 0.4424778761061947, "grad_norm": 0.6159240333179103, "learning_rate": 9.931621208807939e-06, "loss": 0.0454, "step": 1325 }, { "epoch": 0.44281182167306726, "grad_norm": 0.6801978834254696, "learning_rate": 9.931300542861826e-06, "loss": 0.0604, "step": 1326 }, { "epoch": 0.44314576723993987, "grad_norm": 0.6733576208335831, "learning_rate": 9.930979131984702e-06, "loss": 0.0428, "step": 1327 }, { "epoch": 0.4434797128068125, "grad_norm": 0.5131450335966199, "learning_rate": 9.93065697622512e-06, "loss": 0.0412, "step": 1328 }, { "epoch": 0.4438136583736851, "grad_norm": 0.45628227470387467, "learning_rate": 9.930334075631745e-06, "loss": 0.0448, "step": 1329 }, { "epoch": 0.4441476039405577, "grad_norm": 0.5023094091205198, "learning_rate": 9.930010430253356e-06, "loss": 0.0354, "step": 1330 }, { "epoch": 0.4444815495074303, "grad_norm": 0.557464084839111, "learning_rate": 9.92968604013884e-06, "loss": 0.0535, "step": 1331 }, { "epoch": 0.4448154950743029, "grad_norm": 0.7409360498813402, "learning_rate": 9.929360905337204e-06, "loss": 0.0448, "step": 1332 }, { "epoch": 0.4451494406411755, "grad_norm": 0.399591445429372, "learning_rate": 9.929035025897561e-06, "loss": 0.035, "step": 1333 }, { "epoch": 0.4454833862080481, "grad_norm": 0.6806929089635603, "learning_rate": 9.928708401869143e-06, "loss": 0.0647, "step": 1334 }, { "epoch": 0.44581733177492067, "grad_norm": 1.0748118239786535, "learning_rate": 9.928381033301284e-06, "loss": 0.0543, "step": 1335 }, { "epoch": 0.4461512773417933, "grad_norm": 0.8691112605217693, "learning_rate": 9.928052920243443e-06, "loss": 0.0631, "step": 1336 }, { "epoch": 0.4464852229086659, "grad_norm": 0.6264993005302014, "learning_rate": 9.927724062745179e-06, "loss": 0.0616, "step": 1337 }, { "epoch": 0.4468191684755385, "grad_norm": 0.5416140116082875, "learning_rate": 9.927394460856174e-06, "loss": 0.0505, "step": 1338 }, { "epoch": 0.4471531140424111, "grad_norm": 0.6290176108125836, "learning_rate": 9.92706411462622e-06, "loss": 0.0459, "step": 1339 }, { "epoch": 0.4474870596092837, "grad_norm": 0.5369142702628142, "learning_rate": 9.926733024105216e-06, "loss": 0.0419, "step": 1340 }, { "epoch": 0.4478210051761563, "grad_norm": 0.49018142983174273, "learning_rate": 9.926401189343177e-06, "loss": 0.0465, "step": 1341 }, { "epoch": 0.4481549507430289, "grad_norm": 0.401600930205156, "learning_rate": 9.926068610390231e-06, "loss": 0.0372, "step": 1342 }, { "epoch": 0.44848889630990146, "grad_norm": 0.4088672091096697, "learning_rate": 9.925735287296621e-06, "loss": 0.04, "step": 1343 }, { "epoch": 0.44882284187677407, "grad_norm": 0.5276126911354879, "learning_rate": 9.925401220112698e-06, "loss": 0.0373, "step": 1344 }, { "epoch": 0.4491567874436467, "grad_norm": 0.5425732637602215, "learning_rate": 9.925066408888924e-06, "loss": 0.0501, "step": 1345 }, { "epoch": 0.4494907330105193, "grad_norm": 0.4226487446607824, "learning_rate": 9.92473085367588e-06, "loss": 0.048, "step": 1346 }, { "epoch": 0.4498246785773919, "grad_norm": 0.39943297643787967, "learning_rate": 9.924394554524252e-06, "loss": 0.0417, "step": 1347 }, { "epoch": 0.4501586241442645, "grad_norm": 0.43913272334149567, "learning_rate": 9.924057511484844e-06, "loss": 0.0381, "step": 1348 }, { "epoch": 0.4504925697111371, "grad_norm": 0.5049662393105034, "learning_rate": 9.92371972460857e-06, "loss": 0.0484, "step": 1349 }, { "epoch": 0.4508265152780097, "grad_norm": 1.1050941555590548, "learning_rate": 9.923381193946457e-06, "loss": 0.0595, "step": 1350 }, { "epoch": 0.45116046084488226, "grad_norm": 0.6439100674202259, "learning_rate": 9.923041919549644e-06, "loss": 0.0457, "step": 1351 }, { "epoch": 0.45149440641175487, "grad_norm": 0.5177257397789261, "learning_rate": 9.92270190146938e-06, "loss": 0.0413, "step": 1352 }, { "epoch": 0.4518283519786275, "grad_norm": 0.6170011843335443, "learning_rate": 9.922361139757033e-06, "loss": 0.0446, "step": 1353 }, { "epoch": 0.4521622975455001, "grad_norm": 0.8446550194519187, "learning_rate": 9.922019634464077e-06, "loss": 0.0466, "step": 1354 }, { "epoch": 0.4524962431123727, "grad_norm": 0.5393970052349853, "learning_rate": 9.9216773856421e-06, "loss": 0.0406, "step": 1355 }, { "epoch": 0.4528301886792453, "grad_norm": 0.36930192855971267, "learning_rate": 9.921334393342803e-06, "loss": 0.033, "step": 1356 }, { "epoch": 0.4531641342461179, "grad_norm": 0.6997590413387955, "learning_rate": 9.920990657617998e-06, "loss": 0.0603, "step": 1357 }, { "epoch": 0.4534980798129905, "grad_norm": 0.41790934970038635, "learning_rate": 9.920646178519612e-06, "loss": 0.0333, "step": 1358 }, { "epoch": 0.45383202537986306, "grad_norm": 0.43059296536579117, "learning_rate": 9.920300956099682e-06, "loss": 0.0472, "step": 1359 }, { "epoch": 0.45416597094673566, "grad_norm": 0.34400095690252225, "learning_rate": 9.919954990410359e-06, "loss": 0.0426, "step": 1360 }, { "epoch": 0.45449991651360827, "grad_norm": 0.40050739871675406, "learning_rate": 9.919608281503903e-06, "loss": 0.0344, "step": 1361 }, { "epoch": 0.4548338620804809, "grad_norm": 0.44401533509970764, "learning_rate": 9.91926082943269e-06, "loss": 0.0501, "step": 1362 }, { "epoch": 0.4551678076473535, "grad_norm": 0.5826813299787759, "learning_rate": 9.918912634249206e-06, "loss": 0.0443, "step": 1363 }, { "epoch": 0.4555017532142261, "grad_norm": 0.4915573623962292, "learning_rate": 9.91856369600605e-06, "loss": 0.0365, "step": 1364 }, { "epoch": 0.4558356987810987, "grad_norm": 0.6009700282747781, "learning_rate": 9.918214014755935e-06, "loss": 0.0504, "step": 1365 }, { "epoch": 0.4561696443479713, "grad_norm": 0.5468155746254137, "learning_rate": 9.917863590551682e-06, "loss": 0.0361, "step": 1366 }, { "epoch": 0.45650358991484385, "grad_norm": 0.5516763083838638, "learning_rate": 9.917512423446226e-06, "loss": 0.0409, "step": 1367 }, { "epoch": 0.45683753548171646, "grad_norm": 0.4891058902403625, "learning_rate": 9.917160513492619e-06, "loss": 0.0416, "step": 1368 }, { "epoch": 0.45717148104858907, "grad_norm": 0.41191878216869804, "learning_rate": 9.916807860744017e-06, "loss": 0.0501, "step": 1369 }, { "epoch": 0.4575054266154617, "grad_norm": 0.5190297522711743, "learning_rate": 9.916454465253695e-06, "loss": 0.049, "step": 1370 }, { "epoch": 0.4578393721823343, "grad_norm": 0.36002436206656735, "learning_rate": 9.916100327075038e-06, "loss": 0.0469, "step": 1371 }, { "epoch": 0.4581733177492069, "grad_norm": 0.45972030730800345, "learning_rate": 9.91574544626154e-06, "loss": 0.037, "step": 1372 }, { "epoch": 0.4585072633160795, "grad_norm": 0.5769486944913123, "learning_rate": 9.915389822866811e-06, "loss": 0.0301, "step": 1373 }, { "epoch": 0.4588412088829521, "grad_norm": 0.36635983513397724, "learning_rate": 9.915033456944572e-06, "loss": 0.0401, "step": 1374 }, { "epoch": 0.45917515444982465, "grad_norm": 0.3934936851188898, "learning_rate": 9.914676348548658e-06, "loss": 0.0321, "step": 1375 }, { "epoch": 0.45950910001669726, "grad_norm": 0.4123363366511699, "learning_rate": 9.914318497733013e-06, "loss": 0.0342, "step": 1376 }, { "epoch": 0.45984304558356986, "grad_norm": 0.5779376661465719, "learning_rate": 9.913959904551695e-06, "loss": 0.0469, "step": 1377 }, { "epoch": 0.46017699115044247, "grad_norm": 0.4974245654516736, "learning_rate": 9.913600569058871e-06, "loss": 0.0495, "step": 1378 }, { "epoch": 0.4605109367173151, "grad_norm": 0.481123185282172, "learning_rate": 9.913240491308828e-06, "loss": 0.0335, "step": 1379 }, { "epoch": 0.4608448822841877, "grad_norm": 0.47405440925593645, "learning_rate": 9.912879671355956e-06, "loss": 0.0376, "step": 1380 }, { "epoch": 0.4611788278510603, "grad_norm": 0.3698815548610607, "learning_rate": 9.912518109254763e-06, "loss": 0.0249, "step": 1381 }, { "epoch": 0.4615127734179329, "grad_norm": 0.5249294778729459, "learning_rate": 9.912155805059866e-06, "loss": 0.0445, "step": 1382 }, { "epoch": 0.4618467189848055, "grad_norm": 0.6781920087763629, "learning_rate": 9.911792758825996e-06, "loss": 0.0489, "step": 1383 }, { "epoch": 0.46218066455167806, "grad_norm": 0.5889363661388466, "learning_rate": 9.911428970607995e-06, "loss": 0.0505, "step": 1384 }, { "epoch": 0.46251461011855066, "grad_norm": 0.7397372770295463, "learning_rate": 9.911064440460818e-06, "loss": 0.0443, "step": 1385 }, { "epoch": 0.46284855568542327, "grad_norm": 0.48219568773534327, "learning_rate": 9.91069916843953e-06, "loss": 0.0377, "step": 1386 }, { "epoch": 0.4631825012522959, "grad_norm": 0.5418126046127661, "learning_rate": 9.910333154599314e-06, "loss": 0.0462, "step": 1387 }, { "epoch": 0.4635164468191685, "grad_norm": 0.45297834065714765, "learning_rate": 9.909966398995456e-06, "loss": 0.0313, "step": 1388 }, { "epoch": 0.4638503923860411, "grad_norm": 0.6679683885195675, "learning_rate": 9.909598901683361e-06, "loss": 0.0543, "step": 1389 }, { "epoch": 0.4641843379529137, "grad_norm": 0.5437056275819844, "learning_rate": 9.909230662718543e-06, "loss": 0.057, "step": 1390 }, { "epoch": 0.4645182835197863, "grad_norm": 0.5834133195251893, "learning_rate": 9.908861682156628e-06, "loss": 0.0495, "step": 1391 }, { "epoch": 0.46485222908665885, "grad_norm": 0.6343676710620773, "learning_rate": 9.908491960053357e-06, "loss": 0.0592, "step": 1392 }, { "epoch": 0.46518617465353146, "grad_norm": 0.4219191146923237, "learning_rate": 9.90812149646458e-06, "loss": 0.0423, "step": 1393 }, { "epoch": 0.46552012022040407, "grad_norm": 0.4053675842863338, "learning_rate": 9.907750291446258e-06, "loss": 0.0329, "step": 1394 }, { "epoch": 0.46585406578727667, "grad_norm": 0.4003154328908937, "learning_rate": 9.907378345054471e-06, "loss": 0.0315, "step": 1395 }, { "epoch": 0.4661880113541493, "grad_norm": 0.43633656747345956, "learning_rate": 9.9070056573454e-06, "loss": 0.0336, "step": 1396 }, { "epoch": 0.4665219569210219, "grad_norm": 0.7441691070141674, "learning_rate": 9.906632228375346e-06, "loss": 0.057, "step": 1397 }, { "epoch": 0.4668559024878945, "grad_norm": 0.44281489718854117, "learning_rate": 9.906258058200722e-06, "loss": 0.03, "step": 1398 }, { "epoch": 0.4671898480547671, "grad_norm": 0.5879656363220077, "learning_rate": 9.905883146878049e-06, "loss": 0.0527, "step": 1399 }, { "epoch": 0.46752379362163965, "grad_norm": 0.6613327027917685, "learning_rate": 9.90550749446396e-06, "loss": 0.0793, "step": 1400 }, { "epoch": 0.46785773918851226, "grad_norm": 0.45880391936667725, "learning_rate": 9.905131101015204e-06, "loss": 0.0427, "step": 1401 }, { "epoch": 0.46819168475538486, "grad_norm": 0.6543858751031911, "learning_rate": 9.904753966588638e-06, "loss": 0.0517, "step": 1402 }, { "epoch": 0.46852563032225747, "grad_norm": 0.4481287511626373, "learning_rate": 9.904376091241236e-06, "loss": 0.0506, "step": 1403 }, { "epoch": 0.4688595758891301, "grad_norm": 0.33458825171140666, "learning_rate": 9.903997475030077e-06, "loss": 0.0299, "step": 1404 }, { "epoch": 0.4691935214560027, "grad_norm": 0.4958866313971678, "learning_rate": 9.903618118012358e-06, "loss": 0.0429, "step": 1405 }, { "epoch": 0.4695274670228753, "grad_norm": 0.3971419975851642, "learning_rate": 9.903238020245383e-06, "loss": 0.0371, "step": 1406 }, { "epoch": 0.4698614125897479, "grad_norm": 0.6540638399343981, "learning_rate": 9.902857181786571e-06, "loss": 0.0449, "step": 1407 }, { "epoch": 0.47019535815662045, "grad_norm": 0.4322169481213042, "learning_rate": 9.902475602693451e-06, "loss": 0.032, "step": 1408 }, { "epoch": 0.47052930372349305, "grad_norm": 0.3458875024135452, "learning_rate": 9.90209328302367e-06, "loss": 0.0323, "step": 1409 }, { "epoch": 0.47086324929036566, "grad_norm": 0.422986124801938, "learning_rate": 9.901710222834976e-06, "loss": 0.0398, "step": 1410 }, { "epoch": 0.47119719485723827, "grad_norm": 0.43510914715380106, "learning_rate": 9.901326422185238e-06, "loss": 0.0335, "step": 1411 }, { "epoch": 0.4715311404241109, "grad_norm": 0.5283980317667754, "learning_rate": 9.900941881132431e-06, "loss": 0.0448, "step": 1412 }, { "epoch": 0.4718650859909835, "grad_norm": 1.1088829014613777, "learning_rate": 9.900556599734647e-06, "loss": 0.0547, "step": 1413 }, { "epoch": 0.4721990315578561, "grad_norm": 0.5843202080905576, "learning_rate": 9.900170578050088e-06, "loss": 0.0753, "step": 1414 }, { "epoch": 0.4725329771247287, "grad_norm": 0.4905756109371659, "learning_rate": 9.899783816137065e-06, "loss": 0.0414, "step": 1415 }, { "epoch": 0.47286692269160124, "grad_norm": 0.7559325911936297, "learning_rate": 9.899396314054002e-06, "loss": 0.0446, "step": 1416 }, { "epoch": 0.47320086825847385, "grad_norm": 0.3388909843775115, "learning_rate": 9.89900807185944e-06, "loss": 0.028, "step": 1417 }, { "epoch": 0.47353481382534646, "grad_norm": 0.46595584543428215, "learning_rate": 9.89861908961202e-06, "loss": 0.0498, "step": 1418 }, { "epoch": 0.47386875939221906, "grad_norm": 0.9783336902808817, "learning_rate": 9.89822936737051e-06, "loss": 0.0614, "step": 1419 }, { "epoch": 0.47420270495909167, "grad_norm": 0.5073491753338615, "learning_rate": 9.897838905193781e-06, "loss": 0.0345, "step": 1420 }, { "epoch": 0.4745366505259643, "grad_norm": 0.7037993960648413, "learning_rate": 9.897447703140813e-06, "loss": 0.0512, "step": 1421 }, { "epoch": 0.4748705960928369, "grad_norm": 0.5437711089780456, "learning_rate": 9.897055761270705e-06, "loss": 0.0386, "step": 1422 }, { "epoch": 0.4752045416597095, "grad_norm": 0.5148402217137021, "learning_rate": 9.896663079642663e-06, "loss": 0.0529, "step": 1423 }, { "epoch": 0.47553848722658204, "grad_norm": 0.579826595234891, "learning_rate": 9.896269658316006e-06, "loss": 0.0498, "step": 1424 }, { "epoch": 0.47587243279345465, "grad_norm": 0.46060422264727224, "learning_rate": 9.895875497350165e-06, "loss": 0.0344, "step": 1425 }, { "epoch": 0.47620637836032725, "grad_norm": 0.5779317629395826, "learning_rate": 9.895480596804684e-06, "loss": 0.0341, "step": 1426 }, { "epoch": 0.47654032392719986, "grad_norm": 0.6251142827066727, "learning_rate": 9.895084956739215e-06, "loss": 0.044, "step": 1427 }, { "epoch": 0.47687426949407247, "grad_norm": 0.5070877530779723, "learning_rate": 9.894688577213527e-06, "loss": 0.044, "step": 1428 }, { "epoch": 0.4772082150609451, "grad_norm": 0.5413308077354688, "learning_rate": 9.894291458287496e-06, "loss": 0.0473, "step": 1429 }, { "epoch": 0.4775421606278177, "grad_norm": 0.536917214023776, "learning_rate": 9.893893600021112e-06, "loss": 0.0447, "step": 1430 }, { "epoch": 0.4778761061946903, "grad_norm": 0.5093039804216097, "learning_rate": 9.893495002474475e-06, "loss": 0.0417, "step": 1431 }, { "epoch": 0.47821005176156284, "grad_norm": 0.5138668195571756, "learning_rate": 9.893095665707801e-06, "loss": 0.0427, "step": 1432 }, { "epoch": 0.47854399732843544, "grad_norm": 0.4319009777536361, "learning_rate": 9.89269558978141e-06, "loss": 0.0356, "step": 1433 }, { "epoch": 0.47887794289530805, "grad_norm": 0.9396179730801536, "learning_rate": 9.892294774755741e-06, "loss": 0.0696, "step": 1434 }, { "epoch": 0.47921188846218066, "grad_norm": 0.8768865613461034, "learning_rate": 9.891893220691343e-06, "loss": 0.0496, "step": 1435 }, { "epoch": 0.47954583402905326, "grad_norm": 0.5639812000975869, "learning_rate": 9.891490927648872e-06, "loss": 0.0428, "step": 1436 }, { "epoch": 0.47987977959592587, "grad_norm": 0.706065626441353, "learning_rate": 9.891087895689102e-06, "loss": 0.0482, "step": 1437 }, { "epoch": 0.4802137251627985, "grad_norm": 0.7787287102104395, "learning_rate": 9.890684124872914e-06, "loss": 0.0482, "step": 1438 }, { "epoch": 0.4805476707296711, "grad_norm": 0.6289694427407534, "learning_rate": 9.890279615261302e-06, "loss": 0.0425, "step": 1439 }, { "epoch": 0.4808816162965437, "grad_norm": 0.7495730884069466, "learning_rate": 9.889874366915374e-06, "loss": 0.053, "step": 1440 }, { "epoch": 0.48121556186341624, "grad_norm": 0.6670551951706001, "learning_rate": 9.889468379896347e-06, "loss": 0.0537, "step": 1441 }, { "epoch": 0.48154950743028885, "grad_norm": 0.4662113102746823, "learning_rate": 9.88906165426555e-06, "loss": 0.0533, "step": 1442 }, { "epoch": 0.48188345299716145, "grad_norm": 0.3709485011214875, "learning_rate": 9.888654190084422e-06, "loss": 0.0342, "step": 1443 }, { "epoch": 0.48221739856403406, "grad_norm": 0.6270254124222672, "learning_rate": 9.888245987414517e-06, "loss": 0.0481, "step": 1444 }, { "epoch": 0.48255134413090667, "grad_norm": 0.5476949184551392, "learning_rate": 9.8878370463175e-06, "loss": 0.0523, "step": 1445 }, { "epoch": 0.4828852896977793, "grad_norm": 0.40312540701883587, "learning_rate": 9.887427366855142e-06, "loss": 0.032, "step": 1446 }, { "epoch": 0.4832192352646519, "grad_norm": 0.39758087626866595, "learning_rate": 9.887016949089334e-06, "loss": 0.0425, "step": 1447 }, { "epoch": 0.4835531808315245, "grad_norm": 0.33656048927842874, "learning_rate": 9.886605793082073e-06, "loss": 0.0405, "step": 1448 }, { "epoch": 0.48388712639839704, "grad_norm": 0.5003352475198972, "learning_rate": 9.886193898895468e-06, "loss": 0.0419, "step": 1449 }, { "epoch": 0.48422107196526964, "grad_norm": 1.1045291501772663, "learning_rate": 9.885781266591742e-06, "loss": 0.0525, "step": 1450 }, { "epoch": 0.48455501753214225, "grad_norm": 0.5375756063813563, "learning_rate": 9.885367896233229e-06, "loss": 0.0547, "step": 1451 }, { "epoch": 0.48488896309901486, "grad_norm": 0.47135224008762544, "learning_rate": 9.88495378788237e-06, "loss": 0.0464, "step": 1452 }, { "epoch": 0.48522290866588746, "grad_norm": 0.47155647032558995, "learning_rate": 9.884538941601725e-06, "loss": 0.0397, "step": 1453 }, { "epoch": 0.48555685423276007, "grad_norm": 0.5062134825081213, "learning_rate": 9.884123357453959e-06, "loss": 0.045, "step": 1454 }, { "epoch": 0.4858907997996327, "grad_norm": 0.4250903593753984, "learning_rate": 9.883707035501849e-06, "loss": 0.0382, "step": 1455 }, { "epoch": 0.4862247453665053, "grad_norm": 0.5303503912675623, "learning_rate": 9.883289975808288e-06, "loss": 0.0474, "step": 1456 }, { "epoch": 0.48655869093337784, "grad_norm": 0.5303299665051235, "learning_rate": 9.882872178436277e-06, "loss": 0.0421, "step": 1457 }, { "epoch": 0.48689263650025044, "grad_norm": 0.7095470003827418, "learning_rate": 9.882453643448933e-06, "loss": 0.0622, "step": 1458 }, { "epoch": 0.48722658206712305, "grad_norm": 0.6654969976908746, "learning_rate": 9.882034370909474e-06, "loss": 0.0604, "step": 1459 }, { "epoch": 0.48756052763399566, "grad_norm": 0.46621656384269283, "learning_rate": 9.88161436088124e-06, "loss": 0.0438, "step": 1460 }, { "epoch": 0.48789447320086826, "grad_norm": 0.4903801406467757, "learning_rate": 9.881193613427676e-06, "loss": 0.0431, "step": 1461 }, { "epoch": 0.48822841876774087, "grad_norm": 0.4594116756992676, "learning_rate": 9.880772128612345e-06, "loss": 0.0386, "step": 1462 }, { "epoch": 0.4885623643346135, "grad_norm": 0.5367656181476723, "learning_rate": 9.880349906498914e-06, "loss": 0.049, "step": 1463 }, { "epoch": 0.4888963099014861, "grad_norm": 0.7229467490956691, "learning_rate": 9.879926947151164e-06, "loss": 0.0521, "step": 1464 }, { "epoch": 0.48923025546835863, "grad_norm": 0.3537344850947976, "learning_rate": 9.879503250632991e-06, "loss": 0.0285, "step": 1465 }, { "epoch": 0.48956420103523124, "grad_norm": 0.6767204938026421, "learning_rate": 9.879078817008395e-06, "loss": 0.0474, "step": 1466 }, { "epoch": 0.48989814660210385, "grad_norm": 0.4013562222465895, "learning_rate": 9.878653646341498e-06, "loss": 0.0402, "step": 1467 }, { "epoch": 0.49023209216897645, "grad_norm": 0.5299310670346635, "learning_rate": 9.878227738696522e-06, "loss": 0.0388, "step": 1468 }, { "epoch": 0.49056603773584906, "grad_norm": 0.3553114238589389, "learning_rate": 9.877801094137807e-06, "loss": 0.0284, "step": 1469 }, { "epoch": 0.49089998330272167, "grad_norm": 0.5492910503107569, "learning_rate": 9.877373712729803e-06, "loss": 0.0543, "step": 1470 }, { "epoch": 0.49123392886959427, "grad_norm": 0.5645202038197307, "learning_rate": 9.876945594537069e-06, "loss": 0.0451, "step": 1471 }, { "epoch": 0.4915678744364669, "grad_norm": 0.495824832203648, "learning_rate": 9.876516739624279e-06, "loss": 0.0433, "step": 1472 }, { "epoch": 0.49190182000333943, "grad_norm": 0.7343312967983936, "learning_rate": 9.876087148056217e-06, "loss": 0.0451, "step": 1473 }, { "epoch": 0.49223576557021204, "grad_norm": 0.5363624356990242, "learning_rate": 9.875656819897776e-06, "loss": 0.0425, "step": 1474 }, { "epoch": 0.49256971113708464, "grad_norm": 0.509346221989081, "learning_rate": 9.875225755213966e-06, "loss": 0.0387, "step": 1475 }, { "epoch": 0.49290365670395725, "grad_norm": 0.5782935647682491, "learning_rate": 9.874793954069899e-06, "loss": 0.05, "step": 1476 }, { "epoch": 0.49323760227082986, "grad_norm": 0.36314841770955747, "learning_rate": 9.874361416530808e-06, "loss": 0.0354, "step": 1477 }, { "epoch": 0.49357154783770246, "grad_norm": 0.4924488268107228, "learning_rate": 9.873928142662031e-06, "loss": 0.0472, "step": 1478 }, { "epoch": 0.49390549340457507, "grad_norm": 0.4461045098551525, "learning_rate": 9.873494132529018e-06, "loss": 0.0256, "step": 1479 }, { "epoch": 0.4942394389714477, "grad_norm": 0.5530793139483993, "learning_rate": 9.873059386197335e-06, "loss": 0.0598, "step": 1480 }, { "epoch": 0.4945733845383202, "grad_norm": 0.5520821800980065, "learning_rate": 9.872623903732652e-06, "loss": 0.0459, "step": 1481 }, { "epoch": 0.49490733010519283, "grad_norm": 0.3485202877204217, "learning_rate": 9.872187685200756e-06, "loss": 0.0312, "step": 1482 }, { "epoch": 0.49524127567206544, "grad_norm": 0.6098012581793566, "learning_rate": 9.87175073066754e-06, "loss": 0.0541, "step": 1483 }, { "epoch": 0.49557522123893805, "grad_norm": 0.4293178205870001, "learning_rate": 9.871313040199015e-06, "loss": 0.0478, "step": 1484 }, { "epoch": 0.49590916680581065, "grad_norm": 0.3906292288259583, "learning_rate": 9.870874613861297e-06, "loss": 0.0382, "step": 1485 }, { "epoch": 0.49624311237268326, "grad_norm": 0.4709217542922022, "learning_rate": 9.870435451720614e-06, "loss": 0.043, "step": 1486 }, { "epoch": 0.49657705793955587, "grad_norm": 0.28737543979668984, "learning_rate": 9.869995553843313e-06, "loss": 0.0342, "step": 1487 }, { "epoch": 0.4969110035064285, "grad_norm": 0.43962448048905367, "learning_rate": 9.869554920295836e-06, "loss": 0.0423, "step": 1488 }, { "epoch": 0.4972449490733011, "grad_norm": 0.4467152430981485, "learning_rate": 9.869113551144754e-06, "loss": 0.0322, "step": 1489 }, { "epoch": 0.49757889464017363, "grad_norm": 0.5938591642903027, "learning_rate": 9.86867144645674e-06, "loss": 0.0609, "step": 1490 }, { "epoch": 0.49791284020704624, "grad_norm": 0.4091995526314113, "learning_rate": 9.868228606298574e-06, "loss": 0.0478, "step": 1491 }, { "epoch": 0.49824678577391884, "grad_norm": 0.47491406195892255, "learning_rate": 9.867785030737157e-06, "loss": 0.0492, "step": 1492 }, { "epoch": 0.49858073134079145, "grad_norm": 0.530286677828167, "learning_rate": 9.867340719839494e-06, "loss": 0.0529, "step": 1493 }, { "epoch": 0.49891467690766406, "grad_norm": 0.9102296526082475, "learning_rate": 9.866895673672704e-06, "loss": 0.0534, "step": 1494 }, { "epoch": 0.49924862247453666, "grad_norm": 0.4266869255406315, "learning_rate": 9.866449892304017e-06, "loss": 0.0412, "step": 1495 }, { "epoch": 0.49958256804140927, "grad_norm": 0.4543185437825696, "learning_rate": 9.866003375800773e-06, "loss": 0.0477, "step": 1496 }, { "epoch": 0.4999165136082819, "grad_norm": 0.36897957679861176, "learning_rate": 9.865556124230425e-06, "loss": 0.0423, "step": 1497 }, { "epoch": 0.5002504591751544, "grad_norm": 0.4345258721100873, "learning_rate": 9.865108137660533e-06, "loss": 0.0377, "step": 1498 }, { "epoch": 0.500584404742027, "grad_norm": 0.421072416575119, "learning_rate": 9.864659416158773e-06, "loss": 0.0397, "step": 1499 }, { "epoch": 0.5009183503088996, "grad_norm": 0.40080400903218444, "learning_rate": 9.864209959792927e-06, "loss": 0.0428, "step": 1500 }, { "epoch": 0.5012522958757722, "grad_norm": 0.7430605391213705, "learning_rate": 9.863759768630893e-06, "loss": 0.0556, "step": 1501 }, { "epoch": 0.5015862414426449, "grad_norm": 0.4353906314679589, "learning_rate": 9.863308842740678e-06, "loss": 0.0467, "step": 1502 }, { "epoch": 0.5019201870095175, "grad_norm": 0.42564048436155383, "learning_rate": 9.862857182190398e-06, "loss": 0.0445, "step": 1503 }, { "epoch": 0.5022541325763901, "grad_norm": 0.37197214507407245, "learning_rate": 9.862404787048283e-06, "loss": 0.036, "step": 1504 }, { "epoch": 0.5025880781432627, "grad_norm": 0.46236612219055323, "learning_rate": 9.861951657382671e-06, "loss": 0.0366, "step": 1505 }, { "epoch": 0.5029220237101353, "grad_norm": 0.3598684883757102, "learning_rate": 9.861497793262014e-06, "loss": 0.0437, "step": 1506 }, { "epoch": 0.5032559692770079, "grad_norm": 0.41849390499484806, "learning_rate": 9.861043194754874e-06, "loss": 0.047, "step": 1507 }, { "epoch": 0.5035899148438805, "grad_norm": 0.36415995653183014, "learning_rate": 9.860587861929922e-06, "loss": 0.0295, "step": 1508 }, { "epoch": 0.5039238604107531, "grad_norm": 0.3787109451952684, "learning_rate": 9.86013179485594e-06, "loss": 0.0431, "step": 1509 }, { "epoch": 0.5042578059776256, "grad_norm": 0.32757238901458857, "learning_rate": 9.859674993601826e-06, "loss": 0.0354, "step": 1510 }, { "epoch": 0.5045917515444982, "grad_norm": 0.58017880122697, "learning_rate": 9.859217458236583e-06, "loss": 0.0398, "step": 1511 }, { "epoch": 0.5049256971113708, "grad_norm": 0.40980697132132676, "learning_rate": 9.858759188829328e-06, "loss": 0.0459, "step": 1512 }, { "epoch": 0.5052596426782434, "grad_norm": 0.5442983985798375, "learning_rate": 9.858300185449287e-06, "loss": 0.0446, "step": 1513 }, { "epoch": 0.505593588245116, "grad_norm": 0.4057797385047999, "learning_rate": 9.857840448165798e-06, "loss": 0.0463, "step": 1514 }, { "epoch": 0.5059275338119886, "grad_norm": 0.3928975349261946, "learning_rate": 9.857379977048311e-06, "loss": 0.0323, "step": 1515 }, { "epoch": 0.5062614793788612, "grad_norm": 0.3541880239496388, "learning_rate": 9.856918772166385e-06, "loss": 0.0352, "step": 1516 }, { "epoch": 0.5065954249457338, "grad_norm": 0.5281051134338431, "learning_rate": 9.856456833589688e-06, "loss": 0.047, "step": 1517 }, { "epoch": 0.5069293705126064, "grad_norm": 0.5800360079286986, "learning_rate": 9.855994161388005e-06, "loss": 0.0441, "step": 1518 }, { "epoch": 0.507263316079479, "grad_norm": 0.4704690709667183, "learning_rate": 9.855530755631226e-06, "loss": 0.0584, "step": 1519 }, { "epoch": 0.5075972616463517, "grad_norm": 0.46215825272653954, "learning_rate": 9.855066616389356e-06, "loss": 0.0314, "step": 1520 }, { "epoch": 0.5079312072132243, "grad_norm": 0.50202463463797, "learning_rate": 9.854601743732504e-06, "loss": 0.0503, "step": 1521 }, { "epoch": 0.5082651527800969, "grad_norm": 0.44982854674438427, "learning_rate": 9.854136137730899e-06, "loss": 0.0446, "step": 1522 }, { "epoch": 0.5085990983469695, "grad_norm": 0.34760472847614, "learning_rate": 9.853669798454875e-06, "loss": 0.0297, "step": 1523 }, { "epoch": 0.5089330439138421, "grad_norm": 0.5921353813069149, "learning_rate": 9.853202725974878e-06, "loss": 0.0563, "step": 1524 }, { "epoch": 0.5092669894807147, "grad_norm": 0.5752415954719585, "learning_rate": 9.852734920361465e-06, "loss": 0.0521, "step": 1525 }, { "epoch": 0.5096009350475872, "grad_norm": 0.7122673071217719, "learning_rate": 9.8522663816853e-06, "loss": 0.0624, "step": 1526 }, { "epoch": 0.5099348806144598, "grad_norm": 0.3954636837544188, "learning_rate": 9.851797110017167e-06, "loss": 0.0264, "step": 1527 }, { "epoch": 0.5102688261813324, "grad_norm": 0.3835123204727526, "learning_rate": 9.851327105427952e-06, "loss": 0.0279, "step": 1528 }, { "epoch": 0.510602771748205, "grad_norm": 0.4386775905097899, "learning_rate": 9.850856367988657e-06, "loss": 0.0298, "step": 1529 }, { "epoch": 0.5109367173150776, "grad_norm": 1.2477612411573338, "learning_rate": 9.850384897770388e-06, "loss": 0.0544, "step": 1530 }, { "epoch": 0.5112706628819502, "grad_norm": 0.3895391857295608, "learning_rate": 9.84991269484437e-06, "loss": 0.0423, "step": 1531 }, { "epoch": 0.5116046084488228, "grad_norm": 0.4555479557728127, "learning_rate": 9.849439759281934e-06, "loss": 0.0433, "step": 1532 }, { "epoch": 0.5119385540156954, "grad_norm": 0.5499567731343226, "learning_rate": 9.848966091154522e-06, "loss": 0.0464, "step": 1533 }, { "epoch": 0.512272499582568, "grad_norm": 0.4446478633482936, "learning_rate": 9.848491690533686e-06, "loss": 0.0429, "step": 1534 }, { "epoch": 0.5126064451494406, "grad_norm": 0.45233848105286506, "learning_rate": 9.848016557491092e-06, "loss": 0.0484, "step": 1535 }, { "epoch": 0.5129403907163133, "grad_norm": 0.8704994263668998, "learning_rate": 9.847540692098513e-06, "loss": 0.0533, "step": 1536 }, { "epoch": 0.5132743362831859, "grad_norm": 0.45046021701537253, "learning_rate": 9.847064094427835e-06, "loss": 0.0407, "step": 1537 }, { "epoch": 0.5136082818500585, "grad_norm": 0.5979846320083059, "learning_rate": 9.846586764551054e-06, "loss": 0.0453, "step": 1538 }, { "epoch": 0.5139422274169311, "grad_norm": 0.8835141383688757, "learning_rate": 9.846108702540274e-06, "loss": 0.0864, "step": 1539 }, { "epoch": 0.5142761729838037, "grad_norm": 0.578209835359361, "learning_rate": 9.845629908467714e-06, "loss": 0.038, "step": 1540 }, { "epoch": 0.5146101185506763, "grad_norm": 0.6165144292679225, "learning_rate": 9.8451503824057e-06, "loss": 0.0402, "step": 1541 }, { "epoch": 0.5149440641175489, "grad_norm": 0.6417056591975662, "learning_rate": 9.844670124426672e-06, "loss": 0.0401, "step": 1542 }, { "epoch": 0.5152780096844214, "grad_norm": 0.5347630232075131, "learning_rate": 9.844189134603178e-06, "loss": 0.0447, "step": 1543 }, { "epoch": 0.515611955251294, "grad_norm": 0.3234157913440331, "learning_rate": 9.843707413007874e-06, "loss": 0.0332, "step": 1544 }, { "epoch": 0.5159459008181666, "grad_norm": 0.5786068038610814, "learning_rate": 9.843224959713535e-06, "loss": 0.0684, "step": 1545 }, { "epoch": 0.5162798463850392, "grad_norm": 0.5250243847114785, "learning_rate": 9.842741774793038e-06, "loss": 0.0569, "step": 1546 }, { "epoch": 0.5166137919519118, "grad_norm": 0.5759984993928302, "learning_rate": 9.842257858319375e-06, "loss": 0.0395, "step": 1547 }, { "epoch": 0.5169477375187844, "grad_norm": 0.8308086977999896, "learning_rate": 9.841773210365646e-06, "loss": 0.0617, "step": 1548 }, { "epoch": 0.517281683085657, "grad_norm": 0.4596758376186459, "learning_rate": 9.841287831005064e-06, "loss": 0.041, "step": 1549 }, { "epoch": 0.5176156286525296, "grad_norm": 0.4076509829581428, "learning_rate": 9.84080172031095e-06, "loss": 0.0375, "step": 1550 }, { "epoch": 0.5179495742194022, "grad_norm": 0.6507682472405012, "learning_rate": 9.840314878356739e-06, "loss": 0.0526, "step": 1551 }, { "epoch": 0.5182835197862748, "grad_norm": 0.5564719871264918, "learning_rate": 9.839827305215972e-06, "loss": 0.0591, "step": 1552 }, { "epoch": 0.5186174653531475, "grad_norm": 0.5070125812645897, "learning_rate": 9.839339000962305e-06, "loss": 0.044, "step": 1553 }, { "epoch": 0.5189514109200201, "grad_norm": 0.5469926168367498, "learning_rate": 9.838849965669499e-06, "loss": 0.0462, "step": 1554 }, { "epoch": 0.5192853564868927, "grad_norm": 0.4877482448023593, "learning_rate": 9.83836019941143e-06, "loss": 0.0426, "step": 1555 }, { "epoch": 0.5196193020537653, "grad_norm": 0.46400833958387616, "learning_rate": 9.837869702262082e-06, "loss": 0.055, "step": 1556 }, { "epoch": 0.5199532476206379, "grad_norm": 0.5469798799351714, "learning_rate": 9.837378474295553e-06, "loss": 0.0432, "step": 1557 }, { "epoch": 0.5202871931875105, "grad_norm": 0.39835442324282394, "learning_rate": 9.836886515586045e-06, "loss": 0.0365, "step": 1558 }, { "epoch": 0.520621138754383, "grad_norm": 0.4899421332569305, "learning_rate": 9.83639382620788e-06, "loss": 0.0292, "step": 1559 }, { "epoch": 0.5209550843212556, "grad_norm": 0.31630185886358, "learning_rate": 9.835900406235479e-06, "loss": 0.0339, "step": 1560 }, { "epoch": 0.5212890298881282, "grad_norm": 0.5410350548004083, "learning_rate": 9.835406255743381e-06, "loss": 0.0506, "step": 1561 }, { "epoch": 0.5216229754550008, "grad_norm": 0.3826455467346051, "learning_rate": 9.834911374806231e-06, "loss": 0.0331, "step": 1562 }, { "epoch": 0.5219569210218734, "grad_norm": 0.3690454201745996, "learning_rate": 9.83441576349879e-06, "loss": 0.036, "step": 1563 }, { "epoch": 0.522290866588746, "grad_norm": 0.5198004451505397, "learning_rate": 9.833919421895926e-06, "loss": 0.046, "step": 1564 }, { "epoch": 0.5226248121556186, "grad_norm": 0.40943166185929364, "learning_rate": 9.833422350072615e-06, "loss": 0.0358, "step": 1565 }, { "epoch": 0.5229587577224912, "grad_norm": 0.6365032126717701, "learning_rate": 9.832924548103945e-06, "loss": 0.0478, "step": 1566 }, { "epoch": 0.5232927032893638, "grad_norm": 0.3162022970947212, "learning_rate": 9.832426016065117e-06, "loss": 0.0346, "step": 1567 }, { "epoch": 0.5236266488562364, "grad_norm": 0.43102986601306426, "learning_rate": 9.83192675403144e-06, "loss": 0.0491, "step": 1568 }, { "epoch": 0.523960594423109, "grad_norm": 0.4866183135413825, "learning_rate": 9.831426762078331e-06, "loss": 0.0416, "step": 1569 }, { "epoch": 0.5242945399899817, "grad_norm": 0.6415618280163502, "learning_rate": 9.830926040281321e-06, "loss": 0.0504, "step": 1570 }, { "epoch": 0.5246284855568543, "grad_norm": 0.5820504422099528, "learning_rate": 9.830424588716053e-06, "loss": 0.0527, "step": 1571 }, { "epoch": 0.5249624311237269, "grad_norm": 0.5336982251530853, "learning_rate": 9.829922407458273e-06, "loss": 0.0551, "step": 1572 }, { "epoch": 0.5252963766905995, "grad_norm": 0.48056082471570233, "learning_rate": 9.829419496583843e-06, "loss": 0.0462, "step": 1573 }, { "epoch": 0.5256303222574721, "grad_norm": 0.501265768809885, "learning_rate": 9.828915856168734e-06, "loss": 0.0486, "step": 1574 }, { "epoch": 0.5259642678243446, "grad_norm": 0.6064150035667107, "learning_rate": 9.828411486289026e-06, "loss": 0.0389, "step": 1575 }, { "epoch": 0.5262982133912172, "grad_norm": 0.5054548918351807, "learning_rate": 9.82790638702091e-06, "loss": 0.0449, "step": 1576 }, { "epoch": 0.5266321589580898, "grad_norm": 0.478694127738411, "learning_rate": 9.827400558440687e-06, "loss": 0.0368, "step": 1577 }, { "epoch": 0.5269661045249624, "grad_norm": 0.45181073485918044, "learning_rate": 9.826894000624769e-06, "loss": 0.0314, "step": 1578 }, { "epoch": 0.527300050091835, "grad_norm": 0.7749356164087102, "learning_rate": 9.826386713649678e-06, "loss": 0.0636, "step": 1579 }, { "epoch": 0.5276339956587076, "grad_norm": 0.5825159462865975, "learning_rate": 9.825878697592046e-06, "loss": 0.0467, "step": 1580 }, { "epoch": 0.5279679412255802, "grad_norm": 0.5575072135049433, "learning_rate": 9.825369952528611e-06, "loss": 0.0473, "step": 1581 }, { "epoch": 0.5283018867924528, "grad_norm": 0.41941816464691917, "learning_rate": 9.824860478536231e-06, "loss": 0.0307, "step": 1582 }, { "epoch": 0.5286358323593254, "grad_norm": 0.5461230635241924, "learning_rate": 9.824350275691864e-06, "loss": 0.0514, "step": 1583 }, { "epoch": 0.528969777926198, "grad_norm": 0.6438870299046571, "learning_rate": 9.823839344072582e-06, "loss": 0.0443, "step": 1584 }, { "epoch": 0.5293037234930706, "grad_norm": 0.5777059381505149, "learning_rate": 9.823327683755566e-06, "loss": 0.0503, "step": 1585 }, { "epoch": 0.5296376690599433, "grad_norm": 0.6353044866085541, "learning_rate": 9.822815294818113e-06, "loss": 0.0489, "step": 1586 }, { "epoch": 0.5299716146268159, "grad_norm": 0.6029003429316526, "learning_rate": 9.822302177337624e-06, "loss": 0.0414, "step": 1587 }, { "epoch": 0.5303055601936885, "grad_norm": 0.33971430969523186, "learning_rate": 9.821788331391609e-06, "loss": 0.0334, "step": 1588 }, { "epoch": 0.5306395057605611, "grad_norm": 0.5268398704129876, "learning_rate": 9.821273757057692e-06, "loss": 0.0359, "step": 1589 }, { "epoch": 0.5309734513274337, "grad_norm": 0.8283223934346511, "learning_rate": 9.820758454413606e-06, "loss": 0.04, "step": 1590 }, { "epoch": 0.5313073968943063, "grad_norm": 0.5217381131428658, "learning_rate": 9.820242423537192e-06, "loss": 0.0408, "step": 1591 }, { "epoch": 0.5316413424611788, "grad_norm": 0.5762948306371981, "learning_rate": 9.819725664506404e-06, "loss": 0.0335, "step": 1592 }, { "epoch": 0.5319752880280514, "grad_norm": 0.7203062895879468, "learning_rate": 9.819208177399303e-06, "loss": 0.0432, "step": 1593 }, { "epoch": 0.532309233594924, "grad_norm": 0.7160632979278783, "learning_rate": 9.818689962294063e-06, "loss": 0.0469, "step": 1594 }, { "epoch": 0.5326431791617966, "grad_norm": 0.7917036861187641, "learning_rate": 9.818171019268965e-06, "loss": 0.0477, "step": 1595 }, { "epoch": 0.5329771247286692, "grad_norm": 0.5237629728026038, "learning_rate": 9.817651348402403e-06, "loss": 0.0521, "step": 1596 }, { "epoch": 0.5333110702955418, "grad_norm": 0.6723127715974083, "learning_rate": 9.81713094977288e-06, "loss": 0.039, "step": 1597 }, { "epoch": 0.5336450158624144, "grad_norm": 0.5315645446991091, "learning_rate": 9.816609823459007e-06, "loss": 0.0369, "step": 1598 }, { "epoch": 0.533978961429287, "grad_norm": 0.49683308900599976, "learning_rate": 9.816087969539506e-06, "loss": 0.0343, "step": 1599 }, { "epoch": 0.5343129069961596, "grad_norm": 0.4968494208487672, "learning_rate": 9.815565388093209e-06, "loss": 0.0436, "step": 1600 }, { "epoch": 0.5346468525630322, "grad_norm": 0.45501021484043996, "learning_rate": 9.81504207919906e-06, "loss": 0.04, "step": 1601 }, { "epoch": 0.5349807981299048, "grad_norm": 0.5184804765890906, "learning_rate": 9.814518042936107e-06, "loss": 0.0502, "step": 1602 }, { "epoch": 0.5353147436967775, "grad_norm": 0.5432491916613281, "learning_rate": 9.813993279383518e-06, "loss": 0.0451, "step": 1603 }, { "epoch": 0.5356486892636501, "grad_norm": 0.3618440392593531, "learning_rate": 9.813467788620559e-06, "loss": 0.0335, "step": 1604 }, { "epoch": 0.5359826348305227, "grad_norm": 0.40492935745007325, "learning_rate": 9.812941570726615e-06, "loss": 0.0322, "step": 1605 }, { "epoch": 0.5363165803973953, "grad_norm": 0.6040359865504972, "learning_rate": 9.812414625781175e-06, "loss": 0.0476, "step": 1606 }, { "epoch": 0.5366505259642679, "grad_norm": 0.49142896450495743, "learning_rate": 9.811886953863841e-06, "loss": 0.0343, "step": 1607 }, { "epoch": 0.5369844715311404, "grad_norm": 0.47805940680703596, "learning_rate": 9.811358555054326e-06, "loss": 0.0326, "step": 1608 }, { "epoch": 0.537318417098013, "grad_norm": 0.6548301712919927, "learning_rate": 9.810829429432449e-06, "loss": 0.034, "step": 1609 }, { "epoch": 0.5376523626648856, "grad_norm": 0.5009292103530449, "learning_rate": 9.81029957707814e-06, "loss": 0.044, "step": 1610 }, { "epoch": 0.5379863082317582, "grad_norm": 0.5624294342432482, "learning_rate": 9.809768998071442e-06, "loss": 0.0405, "step": 1611 }, { "epoch": 0.5383202537986308, "grad_norm": 0.4170362571368871, "learning_rate": 9.809237692492503e-06, "loss": 0.0367, "step": 1612 }, { "epoch": 0.5386541993655034, "grad_norm": 0.4596498288262667, "learning_rate": 9.808705660421582e-06, "loss": 0.0478, "step": 1613 }, { "epoch": 0.538988144932376, "grad_norm": 0.7178474019180208, "learning_rate": 9.808172901939053e-06, "loss": 0.0459, "step": 1614 }, { "epoch": 0.5393220904992486, "grad_norm": 0.3434626720597141, "learning_rate": 9.807639417125392e-06, "loss": 0.0312, "step": 1615 }, { "epoch": 0.5396560360661212, "grad_norm": 0.5221178590381972, "learning_rate": 9.807105206061186e-06, "loss": 0.0348, "step": 1616 }, { "epoch": 0.5399899816329938, "grad_norm": 0.4820129468059947, "learning_rate": 9.80657026882714e-06, "loss": 0.038, "step": 1617 }, { "epoch": 0.5403239271998664, "grad_norm": 0.5177152300225278, "learning_rate": 9.80603460550406e-06, "loss": 0.0576, "step": 1618 }, { "epoch": 0.540657872766739, "grad_norm": 0.48562119319527614, "learning_rate": 9.805498216172861e-06, "loss": 0.0389, "step": 1619 }, { "epoch": 0.5409918183336117, "grad_norm": 0.4593153545602104, "learning_rate": 9.804961100914575e-06, "loss": 0.0381, "step": 1620 }, { "epoch": 0.5413257639004843, "grad_norm": 0.3626106588247109, "learning_rate": 9.804423259810338e-06, "loss": 0.027, "step": 1621 }, { "epoch": 0.5416597094673569, "grad_norm": 0.43240020249317507, "learning_rate": 9.803884692941397e-06, "loss": 0.0468, "step": 1622 }, { "epoch": 0.5419936550342295, "grad_norm": 0.36564231841245876, "learning_rate": 9.803345400389111e-06, "loss": 0.0386, "step": 1623 }, { "epoch": 0.542327600601102, "grad_norm": 0.4450810931513539, "learning_rate": 9.802805382234941e-06, "loss": 0.0436, "step": 1624 }, { "epoch": 0.5426615461679746, "grad_norm": 0.4029971664558369, "learning_rate": 9.80226463856047e-06, "loss": 0.0408, "step": 1625 }, { "epoch": 0.5429954917348472, "grad_norm": 0.6669350636767151, "learning_rate": 9.801723169447378e-06, "loss": 0.0484, "step": 1626 }, { "epoch": 0.5433294373017198, "grad_norm": 0.45380330224505766, "learning_rate": 9.801180974977466e-06, "loss": 0.0361, "step": 1627 }, { "epoch": 0.5436633828685924, "grad_norm": 0.40762758173819824, "learning_rate": 9.800638055232635e-06, "loss": 0.038, "step": 1628 }, { "epoch": 0.543997328435465, "grad_norm": 0.5561520311338787, "learning_rate": 9.800094410294897e-06, "loss": 0.042, "step": 1629 }, { "epoch": 0.5443312740023376, "grad_norm": 0.3911726844205256, "learning_rate": 9.799550040246381e-06, "loss": 0.0313, "step": 1630 }, { "epoch": 0.5446652195692102, "grad_norm": 0.4119958716503093, "learning_rate": 9.799004945169319e-06, "loss": 0.0367, "step": 1631 }, { "epoch": 0.5449991651360828, "grad_norm": 0.48263621914834215, "learning_rate": 9.798459125146054e-06, "loss": 0.0483, "step": 1632 }, { "epoch": 0.5453331107029554, "grad_norm": 0.5034503809346389, "learning_rate": 9.797912580259037e-06, "loss": 0.0407, "step": 1633 }, { "epoch": 0.545667056269828, "grad_norm": 0.7922156973587396, "learning_rate": 9.797365310590832e-06, "loss": 0.0591, "step": 1634 }, { "epoch": 0.5460010018367006, "grad_norm": 0.46868734613073554, "learning_rate": 9.796817316224107e-06, "loss": 0.0359, "step": 1635 }, { "epoch": 0.5463349474035732, "grad_norm": 0.41482875148647297, "learning_rate": 9.79626859724165e-06, "loss": 0.0394, "step": 1636 }, { "epoch": 0.5466688929704459, "grad_norm": 0.5057102762417754, "learning_rate": 9.795719153726345e-06, "loss": 0.0325, "step": 1637 }, { "epoch": 0.5470028385373185, "grad_norm": 0.49177583996918156, "learning_rate": 9.795168985761192e-06, "loss": 0.0454, "step": 1638 }, { "epoch": 0.5473367841041911, "grad_norm": 0.38141980746291837, "learning_rate": 9.794618093429305e-06, "loss": 0.0427, "step": 1639 }, { "epoch": 0.5476707296710637, "grad_norm": 0.4058774865359395, "learning_rate": 9.794066476813901e-06, "loss": 0.0394, "step": 1640 }, { "epoch": 0.5480046752379362, "grad_norm": 0.4462450927151456, "learning_rate": 9.793514135998306e-06, "loss": 0.0301, "step": 1641 }, { "epoch": 0.5483386208048088, "grad_norm": 0.3699683133102845, "learning_rate": 9.792961071065958e-06, "loss": 0.0467, "step": 1642 }, { "epoch": 0.5486725663716814, "grad_norm": 0.4396087429356699, "learning_rate": 9.792407282100407e-06, "loss": 0.0277, "step": 1643 }, { "epoch": 0.549006511938554, "grad_norm": 0.40474370695884687, "learning_rate": 9.791852769185306e-06, "loss": 0.0537, "step": 1644 }, { "epoch": 0.5493404575054266, "grad_norm": 0.3409292888851801, "learning_rate": 9.791297532404422e-06, "loss": 0.0324, "step": 1645 }, { "epoch": 0.5496744030722992, "grad_norm": 0.7477123909924196, "learning_rate": 9.790741571841629e-06, "loss": 0.0538, "step": 1646 }, { "epoch": 0.5500083486391718, "grad_norm": 0.6511168542888314, "learning_rate": 9.790184887580914e-06, "loss": 0.0584, "step": 1647 }, { "epoch": 0.5503422942060444, "grad_norm": 0.45904632423721786, "learning_rate": 9.78962747970637e-06, "loss": 0.0425, "step": 1648 }, { "epoch": 0.550676239772917, "grad_norm": 0.44426679223407733, "learning_rate": 9.789069348302197e-06, "loss": 0.0379, "step": 1649 }, { "epoch": 0.5510101853397896, "grad_norm": 0.45455317565005104, "learning_rate": 9.78851049345271e-06, "loss": 0.0512, "step": 1650 }, { "epoch": 0.5513441309066622, "grad_norm": 0.4068642439545971, "learning_rate": 9.78795091524233e-06, "loss": 0.0378, "step": 1651 }, { "epoch": 0.5516780764735348, "grad_norm": 0.3312250333711377, "learning_rate": 9.78739061375559e-06, "loss": 0.0294, "step": 1652 }, { "epoch": 0.5520120220404074, "grad_norm": 0.7810027856995916, "learning_rate": 9.786829589077125e-06, "loss": 0.0573, "step": 1653 }, { "epoch": 0.55234596760728, "grad_norm": 0.5475404100744792, "learning_rate": 9.78626784129169e-06, "loss": 0.0436, "step": 1654 }, { "epoch": 0.5526799131741527, "grad_norm": 0.5330799826085645, "learning_rate": 9.78570537048414e-06, "loss": 0.0379, "step": 1655 }, { "epoch": 0.5530138587410253, "grad_norm": 0.49487937190466735, "learning_rate": 9.785142176739444e-06, "loss": 0.0461, "step": 1656 }, { "epoch": 0.5533478043078978, "grad_norm": 0.5147594641113324, "learning_rate": 9.784578260142679e-06, "loss": 0.0424, "step": 1657 }, { "epoch": 0.5536817498747704, "grad_norm": 0.43346677742855155, "learning_rate": 9.784013620779031e-06, "loss": 0.0383, "step": 1658 }, { "epoch": 0.554015695441643, "grad_norm": 0.4874022508489168, "learning_rate": 9.783448258733795e-06, "loss": 0.0508, "step": 1659 }, { "epoch": 0.5543496410085156, "grad_norm": 0.9737800245753055, "learning_rate": 9.782882174092377e-06, "loss": 0.0457, "step": 1660 }, { "epoch": 0.5546835865753882, "grad_norm": 0.44150611696364056, "learning_rate": 9.78231536694029e-06, "loss": 0.0448, "step": 1661 }, { "epoch": 0.5550175321422608, "grad_norm": 0.4720638581175937, "learning_rate": 9.781747837363158e-06, "loss": 0.0426, "step": 1662 }, { "epoch": 0.5553514777091334, "grad_norm": 0.39428004968531427, "learning_rate": 9.781179585446711e-06, "loss": 0.0461, "step": 1663 }, { "epoch": 0.555685423276006, "grad_norm": 0.5364669161422959, "learning_rate": 9.780610611276791e-06, "loss": 0.0358, "step": 1664 }, { "epoch": 0.5560193688428786, "grad_norm": 0.40805268475403095, "learning_rate": 9.780040914939349e-06, "loss": 0.0363, "step": 1665 }, { "epoch": 0.5563533144097512, "grad_norm": 1.0136455115245742, "learning_rate": 9.779470496520442e-06, "loss": 0.1141, "step": 1666 }, { "epoch": 0.5566872599766238, "grad_norm": 0.4639087743513537, "learning_rate": 9.77889935610624e-06, "loss": 0.0434, "step": 1667 }, { "epoch": 0.5570212055434964, "grad_norm": 0.3441933451370595, "learning_rate": 9.778327493783022e-06, "loss": 0.0423, "step": 1668 }, { "epoch": 0.557355151110369, "grad_norm": 0.41641457287248573, "learning_rate": 9.777754909637173e-06, "loss": 0.0279, "step": 1669 }, { "epoch": 0.5576890966772416, "grad_norm": 0.40059036909704987, "learning_rate": 9.777181603755188e-06, "loss": 0.0347, "step": 1670 }, { "epoch": 0.5580230422441143, "grad_norm": 0.564059108858758, "learning_rate": 9.776607576223673e-06, "loss": 0.0497, "step": 1671 }, { "epoch": 0.5583569878109869, "grad_norm": 0.47482441046313406, "learning_rate": 9.776032827129338e-06, "loss": 0.0368, "step": 1672 }, { "epoch": 0.5586909333778594, "grad_norm": 0.7269906153257919, "learning_rate": 9.775457356559013e-06, "loss": 0.0395, "step": 1673 }, { "epoch": 0.559024878944732, "grad_norm": 0.760415482871149, "learning_rate": 9.774881164599621e-06, "loss": 0.0587, "step": 1674 }, { "epoch": 0.5593588245116046, "grad_norm": 0.8227202729605906, "learning_rate": 9.77430425133821e-06, "loss": 0.0502, "step": 1675 }, { "epoch": 0.5596927700784772, "grad_norm": 0.5023770215405837, "learning_rate": 9.773726616861926e-06, "loss": 0.0423, "step": 1676 }, { "epoch": 0.5600267156453498, "grad_norm": 0.5027304555843701, "learning_rate": 9.773148261258025e-06, "loss": 0.053, "step": 1677 }, { "epoch": 0.5603606612122224, "grad_norm": 0.4456588855350891, "learning_rate": 9.772569184613879e-06, "loss": 0.0308, "step": 1678 }, { "epoch": 0.560694606779095, "grad_norm": 0.45427388831713583, "learning_rate": 9.771989387016962e-06, "loss": 0.0381, "step": 1679 }, { "epoch": 0.5610285523459676, "grad_norm": 0.6142935080859407, "learning_rate": 9.77140886855486e-06, "loss": 0.0489, "step": 1680 }, { "epoch": 0.5613624979128402, "grad_norm": 0.5833919680636962, "learning_rate": 9.770827629315266e-06, "loss": 0.0478, "step": 1681 }, { "epoch": 0.5616964434797128, "grad_norm": 0.5926392272950822, "learning_rate": 9.770245669385984e-06, "loss": 0.0404, "step": 1682 }, { "epoch": 0.5620303890465854, "grad_norm": 0.5270987927707339, "learning_rate": 9.76966298885493e-06, "loss": 0.0426, "step": 1683 }, { "epoch": 0.562364334613458, "grad_norm": 0.586726309389911, "learning_rate": 9.769079587810115e-06, "loss": 0.0335, "step": 1684 }, { "epoch": 0.5626982801803306, "grad_norm": 0.5887110017814984, "learning_rate": 9.768495466339675e-06, "loss": 0.0375, "step": 1685 }, { "epoch": 0.5630322257472032, "grad_norm": 0.42871318705843875, "learning_rate": 9.767910624531852e-06, "loss": 0.0365, "step": 1686 }, { "epoch": 0.5633661713140758, "grad_norm": 0.4295891737050711, "learning_rate": 9.767325062474984e-06, "loss": 0.0311, "step": 1687 }, { "epoch": 0.5637001168809485, "grad_norm": 0.49488756256363087, "learning_rate": 9.766738780257535e-06, "loss": 0.0366, "step": 1688 }, { "epoch": 0.564034062447821, "grad_norm": 0.397622714634608, "learning_rate": 9.766151777968063e-06, "loss": 0.0412, "step": 1689 }, { "epoch": 0.5643680080146936, "grad_norm": 0.6398746761967488, "learning_rate": 9.765564055695249e-06, "loss": 0.0453, "step": 1690 }, { "epoch": 0.5647019535815662, "grad_norm": 0.5777262669287194, "learning_rate": 9.76497561352787e-06, "loss": 0.0517, "step": 1691 }, { "epoch": 0.5650358991484388, "grad_norm": 0.4354174250708227, "learning_rate": 9.764386451554819e-06, "loss": 0.0406, "step": 1692 }, { "epoch": 0.5653698447153114, "grad_norm": 0.40765008160639593, "learning_rate": 9.763796569865095e-06, "loss": 0.0407, "step": 1693 }, { "epoch": 0.565703790282184, "grad_norm": 0.34479313887853946, "learning_rate": 9.763205968547808e-06, "loss": 0.0405, "step": 1694 }, { "epoch": 0.5660377358490566, "grad_norm": 0.44612130164603186, "learning_rate": 9.762614647692175e-06, "loss": 0.0353, "step": 1695 }, { "epoch": 0.5663716814159292, "grad_norm": 0.4081773718045894, "learning_rate": 9.762022607387522e-06, "loss": 0.0403, "step": 1696 }, { "epoch": 0.5667056269828018, "grad_norm": 0.3986242039861698, "learning_rate": 9.761429847723281e-06, "loss": 0.0453, "step": 1697 }, { "epoch": 0.5670395725496744, "grad_norm": 0.4647791276949541, "learning_rate": 9.760836368788999e-06, "loss": 0.0457, "step": 1698 }, { "epoch": 0.567373518116547, "grad_norm": 0.7058313139209909, "learning_rate": 9.760242170674325e-06, "loss": 0.0292, "step": 1699 }, { "epoch": 0.5677074636834196, "grad_norm": 0.49994785567155847, "learning_rate": 9.759647253469023e-06, "loss": 0.0751, "step": 1700 }, { "epoch": 0.5680414092502922, "grad_norm": 0.48979765791061497, "learning_rate": 9.75905161726296e-06, "loss": 0.0488, "step": 1701 }, { "epoch": 0.5683753548171648, "grad_norm": 0.4754036512113146, "learning_rate": 9.758455262146114e-06, "loss": 0.0421, "step": 1702 }, { "epoch": 0.5687093003840374, "grad_norm": 0.46348012900650337, "learning_rate": 9.757858188208571e-06, "loss": 0.0416, "step": 1703 }, { "epoch": 0.56904324595091, "grad_norm": 0.4077132589607528, "learning_rate": 9.757260395540527e-06, "loss": 0.0331, "step": 1704 }, { "epoch": 0.5693771915177827, "grad_norm": 0.48223135405172074, "learning_rate": 9.756661884232286e-06, "loss": 0.0506, "step": 1705 }, { "epoch": 0.5697111370846552, "grad_norm": 0.40773646511134254, "learning_rate": 9.756062654374259e-06, "loss": 0.0398, "step": 1706 }, { "epoch": 0.5700450826515278, "grad_norm": 0.4436906126403325, "learning_rate": 9.755462706056966e-06, "loss": 0.048, "step": 1707 }, { "epoch": 0.5703790282184004, "grad_norm": 0.4733346088039868, "learning_rate": 9.75486203937104e-06, "loss": 0.0388, "step": 1708 }, { "epoch": 0.570712973785273, "grad_norm": 0.48669551866401806, "learning_rate": 9.754260654407214e-06, "loss": 0.04, "step": 1709 }, { "epoch": 0.5710469193521456, "grad_norm": 0.44310970928304405, "learning_rate": 9.753658551256338e-06, "loss": 0.0361, "step": 1710 }, { "epoch": 0.5713808649190182, "grad_norm": 0.3834459594566251, "learning_rate": 9.753055730009364e-06, "loss": 0.0381, "step": 1711 }, { "epoch": 0.5717148104858908, "grad_norm": 0.608494935824481, "learning_rate": 9.752452190757358e-06, "loss": 0.0475, "step": 1712 }, { "epoch": 0.5720487560527634, "grad_norm": 0.3116160449574889, "learning_rate": 9.751847933591489e-06, "loss": 0.0298, "step": 1713 }, { "epoch": 0.572382701619636, "grad_norm": 0.36538216205006346, "learning_rate": 9.75124295860304e-06, "loss": 0.0408, "step": 1714 }, { "epoch": 0.5727166471865086, "grad_norm": 0.34464107849922015, "learning_rate": 9.750637265883395e-06, "loss": 0.0371, "step": 1715 }, { "epoch": 0.5730505927533812, "grad_norm": 0.4398000685746299, "learning_rate": 9.750030855524058e-06, "loss": 0.0439, "step": 1716 }, { "epoch": 0.5733845383202538, "grad_norm": 0.3898477877018435, "learning_rate": 9.749423727616628e-06, "loss": 0.0534, "step": 1717 }, { "epoch": 0.5737184838871264, "grad_norm": 0.39525352162345495, "learning_rate": 9.748815882252823e-06, "loss": 0.0331, "step": 1718 }, { "epoch": 0.574052429453999, "grad_norm": 0.40168206830824166, "learning_rate": 9.748207319524462e-06, "loss": 0.0518, "step": 1719 }, { "epoch": 0.5743863750208716, "grad_norm": 0.4026888446467387, "learning_rate": 9.747598039523476e-06, "loss": 0.034, "step": 1720 }, { "epoch": 0.5747203205877442, "grad_norm": 0.5043447327069664, "learning_rate": 9.746988042341907e-06, "loss": 0.0509, "step": 1721 }, { "epoch": 0.5750542661546167, "grad_norm": 0.5878621389397128, "learning_rate": 9.746377328071899e-06, "loss": 0.033, "step": 1722 }, { "epoch": 0.5753882117214894, "grad_norm": 0.4047824403195135, "learning_rate": 9.74576589680571e-06, "loss": 0.0354, "step": 1723 }, { "epoch": 0.575722157288362, "grad_norm": 0.6433613853468115, "learning_rate": 9.745153748635702e-06, "loss": 0.0443, "step": 1724 }, { "epoch": 0.5760561028552346, "grad_norm": 0.4954299759674441, "learning_rate": 9.744540883654348e-06, "loss": 0.0505, "step": 1725 }, { "epoch": 0.5763900484221072, "grad_norm": 0.5261383027946491, "learning_rate": 9.743927301954229e-06, "loss": 0.0603, "step": 1726 }, { "epoch": 0.5767239939889798, "grad_norm": 0.3101222368704965, "learning_rate": 9.743313003628033e-06, "loss": 0.0373, "step": 1727 }, { "epoch": 0.5770579395558524, "grad_norm": 1.1817673381964775, "learning_rate": 9.742697988768557e-06, "loss": 0.0566, "step": 1728 }, { "epoch": 0.577391885122725, "grad_norm": 0.4074827108070887, "learning_rate": 9.742082257468705e-06, "loss": 0.034, "step": 1729 }, { "epoch": 0.5777258306895976, "grad_norm": 0.46863638424338366, "learning_rate": 9.741465809821493e-06, "loss": 0.0446, "step": 1730 }, { "epoch": 0.5780597762564702, "grad_norm": 0.4836885565541822, "learning_rate": 9.74084864592004e-06, "loss": 0.0411, "step": 1731 }, { "epoch": 0.5783937218233428, "grad_norm": 0.553986398907438, "learning_rate": 9.74023076585758e-06, "loss": 0.0366, "step": 1732 }, { "epoch": 0.5787276673902154, "grad_norm": 0.44076096145232735, "learning_rate": 9.739612169727446e-06, "loss": 0.0271, "step": 1733 }, { "epoch": 0.579061612957088, "grad_norm": 0.33565597315818607, "learning_rate": 9.73899285762309e-06, "loss": 0.0367, "step": 1734 }, { "epoch": 0.5793955585239606, "grad_norm": 0.5922993089683395, "learning_rate": 9.738372829638058e-06, "loss": 0.045, "step": 1735 }, { "epoch": 0.5797295040908332, "grad_norm": 0.6039800326539692, "learning_rate": 9.73775208586602e-06, "loss": 0.0505, "step": 1736 }, { "epoch": 0.5800634496577058, "grad_norm": 0.4715635213171511, "learning_rate": 9.737130626400745e-06, "loss": 0.0461, "step": 1737 }, { "epoch": 0.5803973952245783, "grad_norm": 0.9311351699330999, "learning_rate": 9.736508451336111e-06, "loss": 0.0603, "step": 1738 }, { "epoch": 0.580731340791451, "grad_norm": 0.4618796011976892, "learning_rate": 9.735885560766104e-06, "loss": 0.0416, "step": 1739 }, { "epoch": 0.5810652863583236, "grad_norm": 0.5566929361313651, "learning_rate": 9.73526195478482e-06, "loss": 0.0798, "step": 1740 }, { "epoch": 0.5813992319251962, "grad_norm": 0.37517028857443224, "learning_rate": 9.73463763348646e-06, "loss": 0.0362, "step": 1741 }, { "epoch": 0.5817331774920688, "grad_norm": 0.5412175742504487, "learning_rate": 9.734012596965341e-06, "loss": 0.0391, "step": 1742 }, { "epoch": 0.5820671230589414, "grad_norm": 0.47452101080336717, "learning_rate": 9.733386845315875e-06, "loss": 0.0479, "step": 1743 }, { "epoch": 0.582401068625814, "grad_norm": 0.9914074972686332, "learning_rate": 9.732760378632592e-06, "loss": 0.0661, "step": 1744 }, { "epoch": 0.5827350141926866, "grad_norm": 0.45693844718749216, "learning_rate": 9.73213319701013e-06, "loss": 0.0359, "step": 1745 }, { "epoch": 0.5830689597595592, "grad_norm": 0.4007173084318067, "learning_rate": 9.731505300543228e-06, "loss": 0.0363, "step": 1746 }, { "epoch": 0.5834029053264318, "grad_norm": 0.5636375021117038, "learning_rate": 9.730876689326739e-06, "loss": 0.0442, "step": 1747 }, { "epoch": 0.5837368508933044, "grad_norm": 0.5413144380237097, "learning_rate": 9.730247363455621e-06, "loss": 0.0447, "step": 1748 }, { "epoch": 0.584070796460177, "grad_norm": 0.33551405977733567, "learning_rate": 9.729617323024943e-06, "loss": 0.0375, "step": 1749 }, { "epoch": 0.5844047420270496, "grad_norm": 0.5237682630936968, "learning_rate": 9.728986568129876e-06, "loss": 0.0399, "step": 1750 }, { "epoch": 0.5847386875939222, "grad_norm": 0.3544041932326313, "learning_rate": 9.72835509886571e-06, "loss": 0.0367, "step": 1751 }, { "epoch": 0.5850726331607948, "grad_norm": 0.44789001502132775, "learning_rate": 9.727722915327828e-06, "loss": 0.036, "step": 1752 }, { "epoch": 0.5854065787276674, "grad_norm": 0.4960974968783306, "learning_rate": 9.727090017611736e-06, "loss": 0.0311, "step": 1753 }, { "epoch": 0.58574052429454, "grad_norm": 0.41086256928254816, "learning_rate": 9.726456405813033e-06, "loss": 0.0422, "step": 1754 }, { "epoch": 0.5860744698614125, "grad_norm": 0.3315468335717578, "learning_rate": 9.725822080027442e-06, "loss": 0.0271, "step": 1755 }, { "epoch": 0.5864084154282851, "grad_norm": 0.3730186120425334, "learning_rate": 9.725187040350778e-06, "loss": 0.0377, "step": 1756 }, { "epoch": 0.5867423609951578, "grad_norm": 0.31288603321670977, "learning_rate": 9.724551286878976e-06, "loss": 0.0335, "step": 1757 }, { "epoch": 0.5870763065620304, "grad_norm": 0.4264371401607715, "learning_rate": 9.723914819708073e-06, "loss": 0.0368, "step": 1758 }, { "epoch": 0.587410252128903, "grad_norm": 0.3585948244649243, "learning_rate": 9.723277638934212e-06, "loss": 0.0339, "step": 1759 }, { "epoch": 0.5877441976957756, "grad_norm": 0.362457406151004, "learning_rate": 9.72263974465365e-06, "loss": 0.0312, "step": 1760 }, { "epoch": 0.5880781432626482, "grad_norm": 0.5232785559424709, "learning_rate": 9.722001136962746e-06, "loss": 0.0437, "step": 1761 }, { "epoch": 0.5884120888295208, "grad_norm": 0.4422200875621492, "learning_rate": 9.721361815957973e-06, "loss": 0.0481, "step": 1762 }, { "epoch": 0.5887460343963934, "grad_norm": 0.3447670320289213, "learning_rate": 9.720721781735905e-06, "loss": 0.0314, "step": 1763 }, { "epoch": 0.589079979963266, "grad_norm": 0.41271915267881604, "learning_rate": 9.720081034393226e-06, "loss": 0.0441, "step": 1764 }, { "epoch": 0.5894139255301386, "grad_norm": 0.36241051150896486, "learning_rate": 9.71943957402673e-06, "loss": 0.0382, "step": 1765 }, { "epoch": 0.5897478710970112, "grad_norm": 0.3805774855878043, "learning_rate": 9.718797400733314e-06, "loss": 0.0337, "step": 1766 }, { "epoch": 0.5900818166638838, "grad_norm": 0.37939735746645925, "learning_rate": 9.718154514609992e-06, "loss": 0.0419, "step": 1767 }, { "epoch": 0.5904157622307564, "grad_norm": 0.5746897366424192, "learning_rate": 9.717510915753876e-06, "loss": 0.0404, "step": 1768 }, { "epoch": 0.590749707797629, "grad_norm": 0.6070537951285289, "learning_rate": 9.716866604262189e-06, "loss": 0.0447, "step": 1769 }, { "epoch": 0.5910836533645016, "grad_norm": 0.3279231579739388, "learning_rate": 9.716221580232261e-06, "loss": 0.0336, "step": 1770 }, { "epoch": 0.5914175989313741, "grad_norm": 0.6739511464001847, "learning_rate": 9.715575843761534e-06, "loss": 0.0368, "step": 1771 }, { "epoch": 0.5917515444982467, "grad_norm": 0.4083766841062251, "learning_rate": 9.714929394947548e-06, "loss": 0.0408, "step": 1772 }, { "epoch": 0.5920854900651193, "grad_norm": 0.6761259201718556, "learning_rate": 9.714282233887962e-06, "loss": 0.0498, "step": 1773 }, { "epoch": 0.592419435631992, "grad_norm": 0.5520271218117264, "learning_rate": 9.713634360680537e-06, "loss": 0.0446, "step": 1774 }, { "epoch": 0.5927533811988646, "grad_norm": 0.48495273660881766, "learning_rate": 9.712985775423141e-06, "loss": 0.0475, "step": 1775 }, { "epoch": 0.5930873267657372, "grad_norm": 0.3852406336180792, "learning_rate": 9.712336478213747e-06, "loss": 0.0311, "step": 1776 }, { "epoch": 0.5934212723326098, "grad_norm": 0.3499732013840638, "learning_rate": 9.711686469150444e-06, "loss": 0.0438, "step": 1777 }, { "epoch": 0.5937552178994824, "grad_norm": 0.5017946237801338, "learning_rate": 9.711035748331421e-06, "loss": 0.0492, "step": 1778 }, { "epoch": 0.594089163466355, "grad_norm": 0.2771548188234721, "learning_rate": 9.710384315854977e-06, "loss": 0.022, "step": 1779 }, { "epoch": 0.5944231090332276, "grad_norm": 0.5861557106448326, "learning_rate": 9.70973217181952e-06, "loss": 0.0515, "step": 1780 }, { "epoch": 0.5947570546001002, "grad_norm": 0.8050176980062403, "learning_rate": 9.709079316323564e-06, "loss": 0.053, "step": 1781 }, { "epoch": 0.5950910001669728, "grad_norm": 0.4056848136430168, "learning_rate": 9.70842574946573e-06, "loss": 0.0362, "step": 1782 }, { "epoch": 0.5954249457338454, "grad_norm": 0.45483119774188496, "learning_rate": 9.707771471344744e-06, "loss": 0.0325, "step": 1783 }, { "epoch": 0.595758891300718, "grad_norm": 0.3715318388374774, "learning_rate": 9.707116482059447e-06, "loss": 0.0411, "step": 1784 }, { "epoch": 0.5960928368675906, "grad_norm": 0.41928845236130025, "learning_rate": 9.70646078170878e-06, "loss": 0.0356, "step": 1785 }, { "epoch": 0.5964267824344632, "grad_norm": 0.44261717206772755, "learning_rate": 9.705804370391794e-06, "loss": 0.0342, "step": 1786 }, { "epoch": 0.5967607280013357, "grad_norm": 0.515469603257908, "learning_rate": 9.705147248207652e-06, "loss": 0.0399, "step": 1787 }, { "epoch": 0.5970946735682083, "grad_norm": 0.4767092190313439, "learning_rate": 9.704489415255614e-06, "loss": 0.0389, "step": 1788 }, { "epoch": 0.5974286191350809, "grad_norm": 0.5800879449898292, "learning_rate": 9.703830871635057e-06, "loss": 0.0322, "step": 1789 }, { "epoch": 0.5977625647019535, "grad_norm": 0.3808283732863363, "learning_rate": 9.703171617445461e-06, "loss": 0.0291, "step": 1790 }, { "epoch": 0.5980965102688262, "grad_norm": 0.5152911556416488, "learning_rate": 9.702511652786414e-06, "loss": 0.0386, "step": 1791 }, { "epoch": 0.5984304558356988, "grad_norm": 0.4657296814566914, "learning_rate": 9.701850977757611e-06, "loss": 0.0399, "step": 1792 }, { "epoch": 0.5987644014025714, "grad_norm": 0.4495711775100688, "learning_rate": 9.701189592458858e-06, "loss": 0.0395, "step": 1793 }, { "epoch": 0.599098346969444, "grad_norm": 0.8654683483972361, "learning_rate": 9.70052749699006e-06, "loss": 0.0431, "step": 1794 }, { "epoch": 0.5994322925363166, "grad_norm": 0.4101268642173975, "learning_rate": 9.699864691451236e-06, "loss": 0.0328, "step": 1795 }, { "epoch": 0.5997662381031892, "grad_norm": 0.3536285767977456, "learning_rate": 9.699201175942514e-06, "loss": 0.0311, "step": 1796 }, { "epoch": 0.6001001836700618, "grad_norm": 0.3590359858839266, "learning_rate": 9.698536950564121e-06, "loss": 0.0326, "step": 1797 }, { "epoch": 0.6004341292369344, "grad_norm": 0.5365861834913674, "learning_rate": 9.6978720154164e-06, "loss": 0.0385, "step": 1798 }, { "epoch": 0.600768074803807, "grad_norm": 0.5436229825155504, "learning_rate": 9.697206370599793e-06, "loss": 0.0433, "step": 1799 }, { "epoch": 0.6011020203706796, "grad_norm": 0.5012686078311579, "learning_rate": 9.696540016214857e-06, "loss": 0.0428, "step": 1800 }, { "epoch": 0.6014359659375522, "grad_norm": 0.3583759614222113, "learning_rate": 9.695872952362253e-06, "loss": 0.038, "step": 1801 }, { "epoch": 0.6017699115044248, "grad_norm": 0.4743476178265279, "learning_rate": 9.695205179142746e-06, "loss": 0.045, "step": 1802 }, { "epoch": 0.6021038570712974, "grad_norm": 0.42979322267761627, "learning_rate": 9.694536696657213e-06, "loss": 0.039, "step": 1803 }, { "epoch": 0.6024378026381699, "grad_norm": 0.4923396538362408, "learning_rate": 9.693867505006634e-06, "loss": 0.0465, "step": 1804 }, { "epoch": 0.6027717482050425, "grad_norm": 0.33772456017783375, "learning_rate": 9.693197604292101e-06, "loss": 0.0353, "step": 1805 }, { "epoch": 0.6031056937719151, "grad_norm": 0.40009823146013773, "learning_rate": 9.69252699461481e-06, "loss": 0.0395, "step": 1806 }, { "epoch": 0.6034396393387877, "grad_norm": 0.399579996705222, "learning_rate": 9.691855676076064e-06, "loss": 0.0433, "step": 1807 }, { "epoch": 0.6037735849056604, "grad_norm": 0.3547509009946232, "learning_rate": 9.691183648777271e-06, "loss": 0.0436, "step": 1808 }, { "epoch": 0.604107530472533, "grad_norm": 0.38969256891694476, "learning_rate": 9.690510912819952e-06, "loss": 0.0411, "step": 1809 }, { "epoch": 0.6044414760394056, "grad_norm": 0.4858599541616899, "learning_rate": 9.689837468305732e-06, "loss": 0.0398, "step": 1810 }, { "epoch": 0.6047754216062782, "grad_norm": 0.5323739845380792, "learning_rate": 9.689163315336339e-06, "loss": 0.0534, "step": 1811 }, { "epoch": 0.6051093671731508, "grad_norm": 0.4411506877737185, "learning_rate": 9.688488454013616e-06, "loss": 0.0462, "step": 1812 }, { "epoch": 0.6054433127400234, "grad_norm": 0.4860508883152515, "learning_rate": 9.687812884439506e-06, "loss": 0.0409, "step": 1813 }, { "epoch": 0.605777258306896, "grad_norm": 0.42259518973967364, "learning_rate": 9.687136606716064e-06, "loss": 0.0327, "step": 1814 }, { "epoch": 0.6061112038737686, "grad_norm": 0.3614066633082068, "learning_rate": 9.686459620945445e-06, "loss": 0.0296, "step": 1815 }, { "epoch": 0.6064451494406412, "grad_norm": 0.47366380002731584, "learning_rate": 9.685781927229923e-06, "loss": 0.044, "step": 1816 }, { "epoch": 0.6067790950075138, "grad_norm": 0.5981411902514698, "learning_rate": 9.685103525671864e-06, "loss": 0.0753, "step": 1817 }, { "epoch": 0.6071130405743864, "grad_norm": 0.3675903668150694, "learning_rate": 9.684424416373754e-06, "loss": 0.0463, "step": 1818 }, { "epoch": 0.607446986141259, "grad_norm": 0.4676757309110554, "learning_rate": 9.683744599438178e-06, "loss": 0.048, "step": 1819 }, { "epoch": 0.6077809317081315, "grad_norm": 0.3693956960582205, "learning_rate": 9.683064074967832e-06, "loss": 0.0451, "step": 1820 }, { "epoch": 0.6081148772750041, "grad_norm": 0.3194528730340604, "learning_rate": 9.682382843065516e-06, "loss": 0.0321, "step": 1821 }, { "epoch": 0.6084488228418767, "grad_norm": 0.44144989718851135, "learning_rate": 9.681700903834137e-06, "loss": 0.0443, "step": 1822 }, { "epoch": 0.6087827684087493, "grad_norm": 0.5664392510781382, "learning_rate": 9.681018257376713e-06, "loss": 0.0507, "step": 1823 }, { "epoch": 0.609116713975622, "grad_norm": 0.4714622252126422, "learning_rate": 9.680334903796363e-06, "loss": 0.0488, "step": 1824 }, { "epoch": 0.6094506595424946, "grad_norm": 0.38670578461387006, "learning_rate": 9.679650843196318e-06, "loss": 0.0298, "step": 1825 }, { "epoch": 0.6097846051093672, "grad_norm": 0.44357125166347744, "learning_rate": 9.678966075679909e-06, "loss": 0.0454, "step": 1826 }, { "epoch": 0.6101185506762398, "grad_norm": 0.28844122123191956, "learning_rate": 9.678280601350584e-06, "loss": 0.0271, "step": 1827 }, { "epoch": 0.6104524962431124, "grad_norm": 0.3736854608954971, "learning_rate": 9.67759442031189e-06, "loss": 0.0338, "step": 1828 }, { "epoch": 0.610786441809985, "grad_norm": 0.3957082301752809, "learning_rate": 9.676907532667478e-06, "loss": 0.0358, "step": 1829 }, { "epoch": 0.6111203873768576, "grad_norm": 0.4430521380416934, "learning_rate": 9.676219938521116e-06, "loss": 0.0412, "step": 1830 }, { "epoch": 0.6114543329437302, "grad_norm": 0.4629812439050685, "learning_rate": 9.675531637976673e-06, "loss": 0.0264, "step": 1831 }, { "epoch": 0.6117882785106028, "grad_norm": 0.35940162870134673, "learning_rate": 9.674842631138121e-06, "loss": 0.0293, "step": 1832 }, { "epoch": 0.6121222240774754, "grad_norm": 0.4267551169880035, "learning_rate": 9.674152918109547e-06, "loss": 0.0393, "step": 1833 }, { "epoch": 0.612456169644348, "grad_norm": 0.5339190057612913, "learning_rate": 9.673462498995138e-06, "loss": 0.0439, "step": 1834 }, { "epoch": 0.6127901152112206, "grad_norm": 0.39801476705213834, "learning_rate": 9.672771373899192e-06, "loss": 0.0355, "step": 1835 }, { "epoch": 0.6131240607780931, "grad_norm": 0.4429084200500223, "learning_rate": 9.672079542926108e-06, "loss": 0.0376, "step": 1836 }, { "epoch": 0.6134580063449657, "grad_norm": 0.67222515154493, "learning_rate": 9.671387006180398e-06, "loss": 0.0499, "step": 1837 }, { "epoch": 0.6137919519118383, "grad_norm": 0.47779150626601224, "learning_rate": 9.670693763766674e-06, "loss": 0.0401, "step": 1838 }, { "epoch": 0.6141258974787109, "grad_norm": 0.37426042285762723, "learning_rate": 9.669999815789664e-06, "loss": 0.0328, "step": 1839 }, { "epoch": 0.6144598430455835, "grad_norm": 0.41636243829079883, "learning_rate": 9.669305162354194e-06, "loss": 0.0354, "step": 1840 }, { "epoch": 0.6147937886124561, "grad_norm": 0.6226963095081265, "learning_rate": 9.6686098035652e-06, "loss": 0.0488, "step": 1841 }, { "epoch": 0.6151277341793288, "grad_norm": 0.429397152236842, "learning_rate": 9.667913739527724e-06, "loss": 0.0426, "step": 1842 }, { "epoch": 0.6154616797462014, "grad_norm": 0.4372126677348734, "learning_rate": 9.667216970346916e-06, "loss": 0.0483, "step": 1843 }, { "epoch": 0.615795625313074, "grad_norm": 0.3967715291605297, "learning_rate": 9.666519496128027e-06, "loss": 0.0385, "step": 1844 }, { "epoch": 0.6161295708799466, "grad_norm": 0.426735892267052, "learning_rate": 9.665821316976423e-06, "loss": 0.0358, "step": 1845 }, { "epoch": 0.6164635164468192, "grad_norm": 0.36159372896020636, "learning_rate": 9.665122432997571e-06, "loss": 0.032, "step": 1846 }, { "epoch": 0.6167974620136918, "grad_norm": 0.2725741607332122, "learning_rate": 9.664422844297045e-06, "loss": 0.0309, "step": 1847 }, { "epoch": 0.6171314075805644, "grad_norm": 0.45038910214507577, "learning_rate": 9.663722550980528e-06, "loss": 0.0401, "step": 1848 }, { "epoch": 0.617465353147437, "grad_norm": 0.42086104921704576, "learning_rate": 9.663021553153805e-06, "loss": 0.0348, "step": 1849 }, { "epoch": 0.6177992987143096, "grad_norm": 0.4668187156562129, "learning_rate": 9.66231985092277e-06, "loss": 0.034, "step": 1850 }, { "epoch": 0.6181332442811822, "grad_norm": 0.3986616199994234, "learning_rate": 9.661617444393427e-06, "loss": 0.0487, "step": 1851 }, { "epoch": 0.6184671898480548, "grad_norm": 0.42521752469741725, "learning_rate": 9.660914333671878e-06, "loss": 0.032, "step": 1852 }, { "epoch": 0.6188011354149273, "grad_norm": 0.5122186042752632, "learning_rate": 9.66021051886434e-06, "loss": 0.0447, "step": 1853 }, { "epoch": 0.6191350809817999, "grad_norm": 0.376105460409327, "learning_rate": 9.65950600007713e-06, "loss": 0.0347, "step": 1854 }, { "epoch": 0.6194690265486725, "grad_norm": 0.3477680883817906, "learning_rate": 9.658800777416676e-06, "loss": 0.0304, "step": 1855 }, { "epoch": 0.6198029721155451, "grad_norm": 0.31072974884173854, "learning_rate": 9.658094850989508e-06, "loss": 0.0279, "step": 1856 }, { "epoch": 0.6201369176824177, "grad_norm": 0.3841256211026492, "learning_rate": 9.657388220902265e-06, "loss": 0.0346, "step": 1857 }, { "epoch": 0.6204708632492903, "grad_norm": 0.3313444417218464, "learning_rate": 9.656680887261693e-06, "loss": 0.026, "step": 1858 }, { "epoch": 0.620804808816163, "grad_norm": 0.43367576990253265, "learning_rate": 9.655972850174642e-06, "loss": 0.04, "step": 1859 }, { "epoch": 0.6211387543830356, "grad_norm": 0.34493385105284713, "learning_rate": 9.65526410974807e-06, "loss": 0.0275, "step": 1860 }, { "epoch": 0.6214726999499082, "grad_norm": 0.47405934018109014, "learning_rate": 9.65455466608904e-06, "loss": 0.0343, "step": 1861 }, { "epoch": 0.6218066455167808, "grad_norm": 0.4681075468104997, "learning_rate": 9.653844519304722e-06, "loss": 0.0416, "step": 1862 }, { "epoch": 0.6221405910836534, "grad_norm": 0.5265841271768287, "learning_rate": 9.653133669502393e-06, "loss": 0.0524, "step": 1863 }, { "epoch": 0.622474536650526, "grad_norm": 0.30744485928642573, "learning_rate": 9.652422116789432e-06, "loss": 0.0271, "step": 1864 }, { "epoch": 0.6228084822173986, "grad_norm": 0.6876129662779562, "learning_rate": 9.651709861273334e-06, "loss": 0.0364, "step": 1865 }, { "epoch": 0.6231424277842712, "grad_norm": 0.7125513818465257, "learning_rate": 9.650996903061685e-06, "loss": 0.0388, "step": 1866 }, { "epoch": 0.6234763733511438, "grad_norm": 0.39487180334252686, "learning_rate": 9.650283242262192e-06, "loss": 0.0421, "step": 1867 }, { "epoch": 0.6238103189180164, "grad_norm": 0.3642856274716189, "learning_rate": 9.64956887898266e-06, "loss": 0.0474, "step": 1868 }, { "epoch": 0.6241442644848889, "grad_norm": 2.8861305941138524, "learning_rate": 9.648853813331e-06, "loss": 0.0386, "step": 1869 }, { "epoch": 0.6244782100517615, "grad_norm": 0.5704609822104695, "learning_rate": 9.648138045415236e-06, "loss": 0.0568, "step": 1870 }, { "epoch": 0.6248121556186341, "grad_norm": 0.3623440610754578, "learning_rate": 9.647421575343488e-06, "loss": 0.0318, "step": 1871 }, { "epoch": 0.6251461011855067, "grad_norm": 0.8475867166920036, "learning_rate": 9.646704403223991e-06, "loss": 0.0313, "step": 1872 }, { "epoch": 0.6254800467523793, "grad_norm": 0.3528685542346121, "learning_rate": 9.64598652916508e-06, "loss": 0.0315, "step": 1873 }, { "epoch": 0.6258139923192519, "grad_norm": 0.30780792596204365, "learning_rate": 9.6452679532752e-06, "loss": 0.0336, "step": 1874 }, { "epoch": 0.6261479378861246, "grad_norm": 0.6832366896758314, "learning_rate": 9.644548675662897e-06, "loss": 0.0432, "step": 1875 }, { "epoch": 0.6264818834529972, "grad_norm": 0.36973569271153145, "learning_rate": 9.64382869643683e-06, "loss": 0.0334, "step": 1876 }, { "epoch": 0.6268158290198698, "grad_norm": 0.5904473422843772, "learning_rate": 9.64310801570576e-06, "loss": 0.0605, "step": 1877 }, { "epoch": 0.6271497745867424, "grad_norm": 0.49367437779993034, "learning_rate": 9.642386633578553e-06, "loss": 0.049, "step": 1878 }, { "epoch": 0.627483720153615, "grad_norm": 0.6309937635353293, "learning_rate": 9.641664550164182e-06, "loss": 0.0481, "step": 1879 }, { "epoch": 0.6278176657204876, "grad_norm": 0.32741957513574144, "learning_rate": 9.640941765571727e-06, "loss": 0.0337, "step": 1880 }, { "epoch": 0.6281516112873602, "grad_norm": 0.25633710985612546, "learning_rate": 9.640218279910374e-06, "loss": 0.0281, "step": 1881 }, { "epoch": 0.6284855568542328, "grad_norm": 0.38195096646212046, "learning_rate": 9.639494093289412e-06, "loss": 0.0358, "step": 1882 }, { "epoch": 0.6288195024211054, "grad_norm": 0.5386835031032556, "learning_rate": 9.638769205818239e-06, "loss": 0.0361, "step": 1883 }, { "epoch": 0.629153447987978, "grad_norm": 0.7679738802288255, "learning_rate": 9.638043617606358e-06, "loss": 0.0617, "step": 1884 }, { "epoch": 0.6294873935548505, "grad_norm": 0.5460854814891332, "learning_rate": 9.637317328763378e-06, "loss": 0.0433, "step": 1885 }, { "epoch": 0.6298213391217231, "grad_norm": 0.365060548357797, "learning_rate": 9.636590339399012e-06, "loss": 0.0362, "step": 1886 }, { "epoch": 0.6301552846885957, "grad_norm": 0.3705326290040247, "learning_rate": 9.63586264962308e-06, "loss": 0.0328, "step": 1887 }, { "epoch": 0.6304892302554683, "grad_norm": 0.5471497013132491, "learning_rate": 9.635134259545511e-06, "loss": 0.0375, "step": 1888 }, { "epoch": 0.6308231758223409, "grad_norm": 0.5263577057665104, "learning_rate": 9.634405169276335e-06, "loss": 0.0506, "step": 1889 }, { "epoch": 0.6311571213892135, "grad_norm": 0.4272692073087324, "learning_rate": 9.63367537892569e-06, "loss": 0.0427, "step": 1890 }, { "epoch": 0.6314910669560861, "grad_norm": 0.5705137389135463, "learning_rate": 9.63294488860382e-06, "loss": 0.0349, "step": 1891 }, { "epoch": 0.6318250125229588, "grad_norm": 0.3558713877577924, "learning_rate": 9.63221369842107e-06, "loss": 0.0278, "step": 1892 }, { "epoch": 0.6321589580898314, "grad_norm": 0.4721988518594218, "learning_rate": 9.631481808487902e-06, "loss": 0.0361, "step": 1893 }, { "epoch": 0.632492903656704, "grad_norm": 0.4480326110973692, "learning_rate": 9.63074921891487e-06, "loss": 0.0427, "step": 1894 }, { "epoch": 0.6328268492235766, "grad_norm": 0.37188152907558825, "learning_rate": 9.630015929812646e-06, "loss": 0.0252, "step": 1895 }, { "epoch": 0.6331607947904492, "grad_norm": 0.5563749487990718, "learning_rate": 9.629281941291998e-06, "loss": 0.0485, "step": 1896 }, { "epoch": 0.6334947403573218, "grad_norm": 0.36670401543741943, "learning_rate": 9.628547253463804e-06, "loss": 0.0301, "step": 1897 }, { "epoch": 0.6338286859241944, "grad_norm": 0.37224420585800755, "learning_rate": 9.627811866439048e-06, "loss": 0.0401, "step": 1898 }, { "epoch": 0.634162631491067, "grad_norm": 0.4893359595269946, "learning_rate": 9.627075780328818e-06, "loss": 0.0379, "step": 1899 }, { "epoch": 0.6344965770579396, "grad_norm": 0.43686684602558706, "learning_rate": 9.626338995244313e-06, "loss": 0.0418, "step": 1900 }, { "epoch": 0.6348305226248122, "grad_norm": 0.40625611969226033, "learning_rate": 9.625601511296826e-06, "loss": 0.0456, "step": 1901 }, { "epoch": 0.6351644681916847, "grad_norm": 0.5709668921816438, "learning_rate": 9.624863328597767e-06, "loss": 0.0438, "step": 1902 }, { "epoch": 0.6354984137585573, "grad_norm": 0.3871929125084485, "learning_rate": 9.624124447258647e-06, "loss": 0.0342, "step": 1903 }, { "epoch": 0.6358323593254299, "grad_norm": 0.4971974547489401, "learning_rate": 9.62338486739108e-06, "loss": 0.0574, "step": 1904 }, { "epoch": 0.6361663048923025, "grad_norm": 0.6026356798550235, "learning_rate": 9.62264458910679e-06, "loss": 0.0485, "step": 1905 }, { "epoch": 0.6365002504591751, "grad_norm": 0.5130463017123024, "learning_rate": 9.621903612517608e-06, "loss": 0.0375, "step": 1906 }, { "epoch": 0.6368341960260477, "grad_norm": 0.3757677910039498, "learning_rate": 9.621161937735463e-06, "loss": 0.0357, "step": 1907 }, { "epoch": 0.6371681415929203, "grad_norm": 0.3495116271000302, "learning_rate": 9.620419564872394e-06, "loss": 0.0441, "step": 1908 }, { "epoch": 0.637502087159793, "grad_norm": 0.570418547019151, "learning_rate": 9.619676494040547e-06, "loss": 0.0395, "step": 1909 }, { "epoch": 0.6378360327266656, "grad_norm": 0.5738128070519113, "learning_rate": 9.61893272535217e-06, "loss": 0.0378, "step": 1910 }, { "epoch": 0.6381699782935382, "grad_norm": 0.6166375661027198, "learning_rate": 9.618188258919618e-06, "loss": 0.0611, "step": 1911 }, { "epoch": 0.6385039238604108, "grad_norm": 0.4201294810770252, "learning_rate": 9.617443094855354e-06, "loss": 0.0369, "step": 1912 }, { "epoch": 0.6388378694272834, "grad_norm": 0.5173332455124691, "learning_rate": 9.61669723327194e-06, "loss": 0.0519, "step": 1913 }, { "epoch": 0.639171814994156, "grad_norm": 0.5160328548914549, "learning_rate": 9.615950674282049e-06, "loss": 0.0426, "step": 1914 }, { "epoch": 0.6395057605610286, "grad_norm": 0.40673952670075453, "learning_rate": 9.61520341799846e-06, "loss": 0.0304, "step": 1915 }, { "epoch": 0.6398397061279012, "grad_norm": 0.5900313878047575, "learning_rate": 9.614455464534049e-06, "loss": 0.0586, "step": 1916 }, { "epoch": 0.6401736516947738, "grad_norm": 0.38570360482895305, "learning_rate": 9.613706814001809e-06, "loss": 0.0343, "step": 1917 }, { "epoch": 0.6405075972616463, "grad_norm": 0.5842377236150416, "learning_rate": 9.612957466514829e-06, "loss": 0.037, "step": 1918 }, { "epoch": 0.6408415428285189, "grad_norm": 0.4148470046802926, "learning_rate": 9.61220742218631e-06, "loss": 0.0426, "step": 1919 }, { "epoch": 0.6411754883953915, "grad_norm": 0.43247246033021025, "learning_rate": 9.61145668112955e-06, "loss": 0.0362, "step": 1920 }, { "epoch": 0.6415094339622641, "grad_norm": 0.890996448157842, "learning_rate": 9.610705243457962e-06, "loss": 0.0537, "step": 1921 }, { "epoch": 0.6418433795291367, "grad_norm": 0.44996598549319694, "learning_rate": 9.609953109285057e-06, "loss": 0.0463, "step": 1922 }, { "epoch": 0.6421773250960093, "grad_norm": 0.3806053279338398, "learning_rate": 9.609200278724456e-06, "loss": 0.0335, "step": 1923 }, { "epoch": 0.6425112706628819, "grad_norm": 0.40527587523840664, "learning_rate": 9.60844675188988e-06, "loss": 0.045, "step": 1924 }, { "epoch": 0.6428452162297545, "grad_norm": 0.5400479824298494, "learning_rate": 9.60769252889516e-06, "loss": 0.0495, "step": 1925 }, { "epoch": 0.6431791617966272, "grad_norm": 0.41163884457514005, "learning_rate": 9.606937609854227e-06, "loss": 0.0395, "step": 1926 }, { "epoch": 0.6435131073634998, "grad_norm": 0.4872094368071328, "learning_rate": 9.606181994881124e-06, "loss": 0.0514, "step": 1927 }, { "epoch": 0.6438470529303724, "grad_norm": 0.48280423993985105, "learning_rate": 9.605425684089998e-06, "loss": 0.074, "step": 1928 }, { "epoch": 0.644180998497245, "grad_norm": 0.4409583152553842, "learning_rate": 9.604668677595093e-06, "loss": 0.0388, "step": 1929 }, { "epoch": 0.6445149440641176, "grad_norm": 0.3967310876918796, "learning_rate": 9.603910975510764e-06, "loss": 0.0368, "step": 1930 }, { "epoch": 0.6448488896309902, "grad_norm": 0.4668003203945541, "learning_rate": 9.603152577951476e-06, "loss": 0.0543, "step": 1931 }, { "epoch": 0.6451828351978628, "grad_norm": 0.32286948560483264, "learning_rate": 9.60239348503179e-06, "loss": 0.0244, "step": 1932 }, { "epoch": 0.6455167807647354, "grad_norm": 0.3690852719449052, "learning_rate": 9.601633696866376e-06, "loss": 0.0323, "step": 1933 }, { "epoch": 0.6458507263316079, "grad_norm": 0.5874667298049112, "learning_rate": 9.60087321357001e-06, "loss": 0.0403, "step": 1934 }, { "epoch": 0.6461846718984805, "grad_norm": 0.42739345880339813, "learning_rate": 9.600112035257571e-06, "loss": 0.0401, "step": 1935 }, { "epoch": 0.6465186174653531, "grad_norm": 0.3030639504292046, "learning_rate": 9.599350162044045e-06, "loss": 0.0365, "step": 1936 }, { "epoch": 0.6468525630322257, "grad_norm": 0.3580636364891596, "learning_rate": 9.598587594044522e-06, "loss": 0.0319, "step": 1937 }, { "epoch": 0.6471865085990983, "grad_norm": 0.36452760261258627, "learning_rate": 9.597824331374196e-06, "loss": 0.0311, "step": 1938 }, { "epoch": 0.6475204541659709, "grad_norm": 0.4047003448041299, "learning_rate": 9.597060374148365e-06, "loss": 0.0348, "step": 1939 }, { "epoch": 0.6478543997328435, "grad_norm": 0.29086789236359495, "learning_rate": 9.596295722482439e-06, "loss": 0.0264, "step": 1940 }, { "epoch": 0.6481883452997161, "grad_norm": 0.47002828184210704, "learning_rate": 9.595530376491924e-06, "loss": 0.0343, "step": 1941 }, { "epoch": 0.6485222908665887, "grad_norm": 0.3602197839006709, "learning_rate": 9.594764336292432e-06, "loss": 0.0427, "step": 1942 }, { "epoch": 0.6488562364334614, "grad_norm": 0.4153593770291862, "learning_rate": 9.593997601999689e-06, "loss": 0.0375, "step": 1943 }, { "epoch": 0.649190182000334, "grad_norm": 0.42311990704308083, "learning_rate": 9.593230173729514e-06, "loss": 0.0389, "step": 1944 }, { "epoch": 0.6495241275672066, "grad_norm": 0.3166040022386767, "learning_rate": 9.592462051597838e-06, "loss": 0.0342, "step": 1945 }, { "epoch": 0.6498580731340792, "grad_norm": 0.35294407201689076, "learning_rate": 9.591693235720695e-06, "loss": 0.0341, "step": 1946 }, { "epoch": 0.6501920187009518, "grad_norm": 0.37311930946901645, "learning_rate": 9.590923726214224e-06, "loss": 0.0374, "step": 1947 }, { "epoch": 0.6505259642678244, "grad_norm": 0.4463196511307658, "learning_rate": 9.590153523194665e-06, "loss": 0.0439, "step": 1948 }, { "epoch": 0.650859909834697, "grad_norm": 0.31649078827997107, "learning_rate": 9.589382626778371e-06, "loss": 0.0288, "step": 1949 }, { "epoch": 0.6511938554015696, "grad_norm": 0.3298840446407738, "learning_rate": 9.588611037081793e-06, "loss": 0.0278, "step": 1950 }, { "epoch": 0.6515278009684421, "grad_norm": 0.5228733689269723, "learning_rate": 9.587838754221488e-06, "loss": 0.0475, "step": 1951 }, { "epoch": 0.6518617465353147, "grad_norm": 0.6327059040983106, "learning_rate": 9.587065778314119e-06, "loss": 0.0521, "step": 1952 }, { "epoch": 0.6521956921021873, "grad_norm": 0.2451710269110937, "learning_rate": 9.586292109476454e-06, "loss": 0.0202, "step": 1953 }, { "epoch": 0.6525296376690599, "grad_norm": 0.8916205012323529, "learning_rate": 9.585517747825363e-06, "loss": 0.0633, "step": 1954 }, { "epoch": 0.6528635832359325, "grad_norm": 0.4709021087758125, "learning_rate": 9.584742693477825e-06, "loss": 0.0318, "step": 1955 }, { "epoch": 0.6531975288028051, "grad_norm": 0.5136557067976495, "learning_rate": 9.58396694655092e-06, "loss": 0.0595, "step": 1956 }, { "epoch": 0.6535314743696777, "grad_norm": 0.4756999308077858, "learning_rate": 9.583190507161832e-06, "loss": 0.0316, "step": 1957 }, { "epoch": 0.6538654199365503, "grad_norm": 0.7455511037822554, "learning_rate": 9.582413375427852e-06, "loss": 0.0368, "step": 1958 }, { "epoch": 0.654199365503423, "grad_norm": 0.4754787105088287, "learning_rate": 9.581635551466376e-06, "loss": 0.0462, "step": 1959 }, { "epoch": 0.6545333110702956, "grad_norm": 0.38003687303801914, "learning_rate": 9.580857035394904e-06, "loss": 0.0315, "step": 1960 }, { "epoch": 0.6548672566371682, "grad_norm": 0.4533530798330268, "learning_rate": 9.580077827331038e-06, "loss": 0.0401, "step": 1961 }, { "epoch": 0.6552012022040408, "grad_norm": 0.42545050057012723, "learning_rate": 9.579297927392488e-06, "loss": 0.0354, "step": 1962 }, { "epoch": 0.6555351477709134, "grad_norm": 0.49139603401747467, "learning_rate": 9.578517335697065e-06, "loss": 0.0453, "step": 1963 }, { "epoch": 0.655869093337786, "grad_norm": 0.46637109731071646, "learning_rate": 9.577736052362689e-06, "loss": 0.0442, "step": 1964 }, { "epoch": 0.6562030389046586, "grad_norm": 0.299441958264375, "learning_rate": 9.576954077507381e-06, "loss": 0.0278, "step": 1965 }, { "epoch": 0.6565369844715312, "grad_norm": 0.5657838188124532, "learning_rate": 9.576171411249269e-06, "loss": 0.0441, "step": 1966 }, { "epoch": 0.6568709300384037, "grad_norm": 0.40584154730467964, "learning_rate": 9.575388053706582e-06, "loss": 0.0381, "step": 1967 }, { "epoch": 0.6572048756052763, "grad_norm": 0.4436611950009847, "learning_rate": 9.574604004997654e-06, "loss": 0.039, "step": 1968 }, { "epoch": 0.6575388211721489, "grad_norm": 0.5730574727726172, "learning_rate": 9.57381926524093e-06, "loss": 0.0661, "step": 1969 }, { "epoch": 0.6578727667390215, "grad_norm": 0.31332708853227675, "learning_rate": 9.57303383455495e-06, "loss": 0.0273, "step": 1970 }, { "epoch": 0.6582067123058941, "grad_norm": 0.3980333285692315, "learning_rate": 9.572247713058362e-06, "loss": 0.0378, "step": 1971 }, { "epoch": 0.6585406578727667, "grad_norm": 0.3250619162709416, "learning_rate": 9.571460900869923e-06, "loss": 0.0364, "step": 1972 }, { "epoch": 0.6588746034396393, "grad_norm": 0.533505317222907, "learning_rate": 9.570673398108485e-06, "loss": 0.0459, "step": 1973 }, { "epoch": 0.6592085490065119, "grad_norm": 0.44470417319471883, "learning_rate": 9.569885204893015e-06, "loss": 0.0591, "step": 1974 }, { "epoch": 0.6595424945733845, "grad_norm": 0.3008623607862951, "learning_rate": 9.569096321342574e-06, "loss": 0.0302, "step": 1975 }, { "epoch": 0.6598764401402571, "grad_norm": 0.431796799584234, "learning_rate": 9.568306747576335e-06, "loss": 0.0369, "step": 1976 }, { "epoch": 0.6602103857071298, "grad_norm": 0.3580539264016544, "learning_rate": 9.567516483713572e-06, "loss": 0.0314, "step": 1977 }, { "epoch": 0.6605443312740024, "grad_norm": 0.4010486344232972, "learning_rate": 9.566725529873664e-06, "loss": 0.039, "step": 1978 }, { "epoch": 0.660878276840875, "grad_norm": 0.4370058000801178, "learning_rate": 9.565933886176093e-06, "loss": 0.0318, "step": 1979 }, { "epoch": 0.6612122224077476, "grad_norm": 0.43869235801488754, "learning_rate": 9.565141552740445e-06, "loss": 0.0378, "step": 1980 }, { "epoch": 0.6615461679746202, "grad_norm": 0.34181395068932374, "learning_rate": 9.564348529686413e-06, "loss": 0.0296, "step": 1981 }, { "epoch": 0.6618801135414928, "grad_norm": 0.39098423304700447, "learning_rate": 9.563554817133794e-06, "loss": 0.039, "step": 1982 }, { "epoch": 0.6622140591083653, "grad_norm": 0.32495353931107385, "learning_rate": 9.562760415202483e-06, "loss": 0.0292, "step": 1983 }, { "epoch": 0.6625480046752379, "grad_norm": 0.2823617434602211, "learning_rate": 9.56196532401249e-06, "loss": 0.0262, "step": 1984 }, { "epoch": 0.6628819502421105, "grad_norm": 0.5311234770914175, "learning_rate": 9.561169543683917e-06, "loss": 0.0389, "step": 1985 }, { "epoch": 0.6632158958089831, "grad_norm": 0.3750951453671438, "learning_rate": 9.560373074336977e-06, "loss": 0.0425, "step": 1986 }, { "epoch": 0.6635498413758557, "grad_norm": 0.44916711941040793, "learning_rate": 9.55957591609199e-06, "loss": 0.0381, "step": 1987 }, { "epoch": 0.6638837869427283, "grad_norm": 0.44845371539395534, "learning_rate": 9.558778069069373e-06, "loss": 0.0416, "step": 1988 }, { "epoch": 0.6642177325096009, "grad_norm": 0.4486434168528874, "learning_rate": 9.55797953338965e-06, "loss": 0.0313, "step": 1989 }, { "epoch": 0.6645516780764735, "grad_norm": 0.5988333882434549, "learning_rate": 9.55718030917345e-06, "loss": 0.0424, "step": 1990 }, { "epoch": 0.6648856236433461, "grad_norm": 0.4449319242672763, "learning_rate": 9.556380396541507e-06, "loss": 0.0374, "step": 1991 }, { "epoch": 0.6652195692102187, "grad_norm": 0.657520332652472, "learning_rate": 9.555579795614654e-06, "loss": 0.0414, "step": 1992 }, { "epoch": 0.6655535147770913, "grad_norm": 0.6832267111237972, "learning_rate": 9.554778506513834e-06, "loss": 0.0524, "step": 1993 }, { "epoch": 0.665887460343964, "grad_norm": 0.5302737077362314, "learning_rate": 9.553976529360087e-06, "loss": 0.0663, "step": 1994 }, { "epoch": 0.6662214059108366, "grad_norm": 0.37787301151681557, "learning_rate": 9.553173864274567e-06, "loss": 0.0293, "step": 1995 }, { "epoch": 0.6665553514777092, "grad_norm": 0.7179998805532395, "learning_rate": 9.552370511378522e-06, "loss": 0.0401, "step": 1996 }, { "epoch": 0.6668892970445818, "grad_norm": 0.6405562511755868, "learning_rate": 9.551566470793308e-06, "loss": 0.0435, "step": 1997 }, { "epoch": 0.6672232426114544, "grad_norm": 0.3965480294280699, "learning_rate": 9.550761742640387e-06, "loss": 0.0374, "step": 1998 }, { "epoch": 0.667557188178327, "grad_norm": 0.4954674875869502, "learning_rate": 9.549956327041318e-06, "loss": 0.0405, "step": 1999 }, { "epoch": 0.6678911337451995, "grad_norm": 0.38398997797274265, "learning_rate": 9.549150224117776e-06, "loss": 0.0279, "step": 2000 }, { "epoch": 0.6682250793120721, "grad_norm": 0.35076214100338154, "learning_rate": 9.548343433991524e-06, "loss": 0.0285, "step": 2001 }, { "epoch": 0.6685590248789447, "grad_norm": 0.5925647380158942, "learning_rate": 9.547535956784445e-06, "loss": 0.0609, "step": 2002 }, { "epoch": 0.6688929704458173, "grad_norm": 0.5143382525893382, "learning_rate": 9.546727792618512e-06, "loss": 0.0411, "step": 2003 }, { "epoch": 0.6692269160126899, "grad_norm": 0.43516019025805047, "learning_rate": 9.545918941615811e-06, "loss": 0.0477, "step": 2004 }, { "epoch": 0.6695608615795625, "grad_norm": 0.5467950491101975, "learning_rate": 9.545109403898527e-06, "loss": 0.0468, "step": 2005 }, { "epoch": 0.6698948071464351, "grad_norm": 0.3388940167646659, "learning_rate": 9.544299179588952e-06, "loss": 0.0323, "step": 2006 }, { "epoch": 0.6702287527133077, "grad_norm": 0.47902655682264245, "learning_rate": 9.543488268809478e-06, "loss": 0.0368, "step": 2007 }, { "epoch": 0.6705626982801803, "grad_norm": 0.4270736708549859, "learning_rate": 9.542676671682601e-06, "loss": 0.0426, "step": 2008 }, { "epoch": 0.6708966438470529, "grad_norm": 0.6044442787017646, "learning_rate": 9.541864388330926e-06, "loss": 0.0484, "step": 2009 }, { "epoch": 0.6712305894139255, "grad_norm": 0.4259121663848109, "learning_rate": 9.541051418877156e-06, "loss": 0.0377, "step": 2010 }, { "epoch": 0.6715645349807982, "grad_norm": 0.3568645051200129, "learning_rate": 9.5402377634441e-06, "loss": 0.0376, "step": 2011 }, { "epoch": 0.6718984805476708, "grad_norm": 0.35494036371640764, "learning_rate": 9.539423422154672e-06, "loss": 0.0225, "step": 2012 }, { "epoch": 0.6722324261145434, "grad_norm": 0.3154795081643971, "learning_rate": 9.538608395131884e-06, "loss": 0.0378, "step": 2013 }, { "epoch": 0.672566371681416, "grad_norm": 0.39098881138831215, "learning_rate": 9.537792682498859e-06, "loss": 0.0373, "step": 2014 }, { "epoch": 0.6729003172482886, "grad_norm": 0.4566591398312619, "learning_rate": 9.536976284378818e-06, "loss": 0.0484, "step": 2015 }, { "epoch": 0.6732342628151611, "grad_norm": 0.34076311960195427, "learning_rate": 9.536159200895088e-06, "loss": 0.0429, "step": 2016 }, { "epoch": 0.6735682083820337, "grad_norm": 0.42658190950359615, "learning_rate": 9.535341432171098e-06, "loss": 0.0375, "step": 2017 }, { "epoch": 0.6739021539489063, "grad_norm": 0.4276502107061724, "learning_rate": 9.534522978330384e-06, "loss": 0.035, "step": 2018 }, { "epoch": 0.6742360995157789, "grad_norm": 0.38321912147980164, "learning_rate": 9.533703839496581e-06, "loss": 0.0334, "step": 2019 }, { "epoch": 0.6745700450826515, "grad_norm": 0.36958472601309506, "learning_rate": 9.532884015793432e-06, "loss": 0.0288, "step": 2020 }, { "epoch": 0.6749039906495241, "grad_norm": 0.4357493497620933, "learning_rate": 9.532063507344777e-06, "loss": 0.0345, "step": 2021 }, { "epoch": 0.6752379362163967, "grad_norm": 0.41201588517815246, "learning_rate": 9.53124231427457e-06, "loss": 0.0474, "step": 2022 }, { "epoch": 0.6755718817832693, "grad_norm": 0.42070337228935134, "learning_rate": 9.530420436706853e-06, "loss": 0.0378, "step": 2023 }, { "epoch": 0.6759058273501419, "grad_norm": 0.47467486330484465, "learning_rate": 9.529597874765788e-06, "loss": 0.0398, "step": 2024 }, { "epoch": 0.6762397729170145, "grad_norm": 0.5304964057091106, "learning_rate": 9.528774628575628e-06, "loss": 0.0522, "step": 2025 }, { "epoch": 0.6765737184838871, "grad_norm": 0.41027979340530846, "learning_rate": 9.527950698260737e-06, "loss": 0.04, "step": 2026 }, { "epoch": 0.6769076640507597, "grad_norm": 0.798364895576848, "learning_rate": 9.527126083945578e-06, "loss": 0.0472, "step": 2027 }, { "epoch": 0.6772416096176324, "grad_norm": 0.5383452982596113, "learning_rate": 9.526300785754719e-06, "loss": 0.0412, "step": 2028 }, { "epoch": 0.677575555184505, "grad_norm": 0.6616398683443208, "learning_rate": 9.525474803812831e-06, "loss": 0.041, "step": 2029 }, { "epoch": 0.6779095007513776, "grad_norm": 0.4049436603518837, "learning_rate": 9.524648138244688e-06, "loss": 0.043, "step": 2030 }, { "epoch": 0.6782434463182502, "grad_norm": 0.4484259211351194, "learning_rate": 9.523820789175167e-06, "loss": 0.0305, "step": 2031 }, { "epoch": 0.6785773918851227, "grad_norm": 0.3512661433794848, "learning_rate": 9.52299275672925e-06, "loss": 0.0329, "step": 2032 }, { "epoch": 0.6789113374519953, "grad_norm": 0.6493754131574312, "learning_rate": 9.52216404103202e-06, "loss": 0.0336, "step": 2033 }, { "epoch": 0.6792452830188679, "grad_norm": 0.7535163293104985, "learning_rate": 9.521334642208666e-06, "loss": 0.0413, "step": 2034 }, { "epoch": 0.6795792285857405, "grad_norm": 0.49326517121601776, "learning_rate": 9.520504560384476e-06, "loss": 0.045, "step": 2035 }, { "epoch": 0.6799131741526131, "grad_norm": 0.41120052266043067, "learning_rate": 9.519673795684845e-06, "loss": 0.0266, "step": 2036 }, { "epoch": 0.6802471197194857, "grad_norm": 0.6140177963518832, "learning_rate": 9.518842348235271e-06, "loss": 0.0483, "step": 2037 }, { "epoch": 0.6805810652863583, "grad_norm": 0.4725025346002652, "learning_rate": 9.51801021816135e-06, "loss": 0.0407, "step": 2038 }, { "epoch": 0.6809150108532309, "grad_norm": 0.41261462649031916, "learning_rate": 9.51717740558879e-06, "loss": 0.0268, "step": 2039 }, { "epoch": 0.6812489564201035, "grad_norm": 0.3707450546827718, "learning_rate": 9.516343910643395e-06, "loss": 0.0458, "step": 2040 }, { "epoch": 0.6815829019869761, "grad_norm": 0.2771729301031353, "learning_rate": 9.515509733451074e-06, "loss": 0.0274, "step": 2041 }, { "epoch": 0.6819168475538487, "grad_norm": 0.32940546739682114, "learning_rate": 9.514674874137838e-06, "loss": 0.0337, "step": 2042 }, { "epoch": 0.6822507931207213, "grad_norm": 0.3734831314096347, "learning_rate": 9.513839332829806e-06, "loss": 0.0362, "step": 2043 }, { "epoch": 0.682584738687594, "grad_norm": 0.4834078783984118, "learning_rate": 9.513003109653192e-06, "loss": 0.0374, "step": 2044 }, { "epoch": 0.6829186842544666, "grad_norm": 0.3588811997999953, "learning_rate": 9.512166204734322e-06, "loss": 0.0307, "step": 2045 }, { "epoch": 0.6832526298213392, "grad_norm": 0.325964726603329, "learning_rate": 9.511328618199614e-06, "loss": 0.0324, "step": 2046 }, { "epoch": 0.6835865753882118, "grad_norm": 0.8229699925882626, "learning_rate": 9.510490350175602e-06, "loss": 0.0628, "step": 2047 }, { "epoch": 0.6839205209550844, "grad_norm": 0.30887054089282684, "learning_rate": 9.50965140078891e-06, "loss": 0.0419, "step": 2048 }, { "epoch": 0.6842544665219569, "grad_norm": 0.39605664376160044, "learning_rate": 9.508811770166277e-06, "loss": 0.0382, "step": 2049 }, { "epoch": 0.6845884120888295, "grad_norm": 0.38009363458721646, "learning_rate": 9.507971458434538e-06, "loss": 0.0317, "step": 2050 }, { "epoch": 0.6849223576557021, "grad_norm": 0.31482364804749247, "learning_rate": 9.507130465720628e-06, "loss": 0.0291, "step": 2051 }, { "epoch": 0.6852563032225747, "grad_norm": 0.2793468062451066, "learning_rate": 9.506288792151592e-06, "loss": 0.0268, "step": 2052 }, { "epoch": 0.6855902487894473, "grad_norm": 0.3838063300395601, "learning_rate": 9.505446437854574e-06, "loss": 0.0312, "step": 2053 }, { "epoch": 0.6859241943563199, "grad_norm": 0.5550332249747015, "learning_rate": 9.504603402956823e-06, "loss": 0.0464, "step": 2054 }, { "epoch": 0.6862581399231925, "grad_norm": 0.27606317902607436, "learning_rate": 9.503759687585686e-06, "loss": 0.0319, "step": 2055 }, { "epoch": 0.6865920854900651, "grad_norm": 0.437480194017149, "learning_rate": 9.50291529186862e-06, "loss": 0.0351, "step": 2056 }, { "epoch": 0.6869260310569377, "grad_norm": 0.3541870244307474, "learning_rate": 9.502070215933177e-06, "loss": 0.0328, "step": 2057 }, { "epoch": 0.6872599766238103, "grad_norm": 0.4000587433984851, "learning_rate": 9.501224459907019e-06, "loss": 0.0334, "step": 2058 }, { "epoch": 0.6875939221906829, "grad_norm": 0.32412375991716214, "learning_rate": 9.500378023917906e-06, "loss": 0.0288, "step": 2059 }, { "epoch": 0.6879278677575555, "grad_norm": 0.39624546639503, "learning_rate": 9.499530908093702e-06, "loss": 0.0437, "step": 2060 }, { "epoch": 0.6882618133244282, "grad_norm": 0.4105246708870515, "learning_rate": 9.498683112562374e-06, "loss": 0.036, "step": 2061 }, { "epoch": 0.6885957588913008, "grad_norm": 0.3299231249871794, "learning_rate": 9.497834637451992e-06, "loss": 0.0322, "step": 2062 }, { "epoch": 0.6889297044581734, "grad_norm": 0.366774085118686, "learning_rate": 9.496985482890728e-06, "loss": 0.0319, "step": 2063 }, { "epoch": 0.689263650025046, "grad_norm": 0.4672075079573483, "learning_rate": 9.496135649006857e-06, "loss": 0.0345, "step": 2064 }, { "epoch": 0.6895975955919185, "grad_norm": 0.3404779492995906, "learning_rate": 9.495285135928755e-06, "loss": 0.0333, "step": 2065 }, { "epoch": 0.6899315411587911, "grad_norm": 0.30411140383949004, "learning_rate": 9.494433943784901e-06, "loss": 0.0336, "step": 2066 }, { "epoch": 0.6902654867256637, "grad_norm": 0.3145670236628234, "learning_rate": 9.493582072703883e-06, "loss": 0.0348, "step": 2067 }, { "epoch": 0.6905994322925363, "grad_norm": 0.3767704383684599, "learning_rate": 9.49272952281438e-06, "loss": 0.0368, "step": 2068 }, { "epoch": 0.6909333778594089, "grad_norm": 0.36483330422600235, "learning_rate": 9.491876294245184e-06, "loss": 0.0315, "step": 2069 }, { "epoch": 0.6912673234262815, "grad_norm": 0.6576254105862623, "learning_rate": 9.491022387125183e-06, "loss": 0.0356, "step": 2070 }, { "epoch": 0.6916012689931541, "grad_norm": 0.3700579958974638, "learning_rate": 9.490167801583373e-06, "loss": 0.0324, "step": 2071 }, { "epoch": 0.6919352145600267, "grad_norm": 0.36934678168549157, "learning_rate": 9.489312537748843e-06, "loss": 0.0376, "step": 2072 }, { "epoch": 0.6922691601268993, "grad_norm": 0.37061885576754783, "learning_rate": 9.488456595750795e-06, "loss": 0.0364, "step": 2073 }, { "epoch": 0.6926031056937719, "grad_norm": 0.5357535933097675, "learning_rate": 9.487599975718529e-06, "loss": 0.0457, "step": 2074 }, { "epoch": 0.6929370512606445, "grad_norm": 0.331179966616601, "learning_rate": 9.486742677781446e-06, "loss": 0.0322, "step": 2075 }, { "epoch": 0.6932709968275171, "grad_norm": 0.4479123448178622, "learning_rate": 9.485884702069053e-06, "loss": 0.0529, "step": 2076 }, { "epoch": 0.6936049423943897, "grad_norm": 0.4043620854538527, "learning_rate": 9.485026048710957e-06, "loss": 0.035, "step": 2077 }, { "epoch": 0.6939388879612624, "grad_norm": 0.3374418056381048, "learning_rate": 9.484166717836865e-06, "loss": 0.0343, "step": 2078 }, { "epoch": 0.694272833528135, "grad_norm": 0.3844254319088217, "learning_rate": 9.48330670957659e-06, "loss": 0.0293, "step": 2079 }, { "epoch": 0.6946067790950076, "grad_norm": 0.3985114019469326, "learning_rate": 9.48244602406005e-06, "loss": 0.0467, "step": 2080 }, { "epoch": 0.6949407246618801, "grad_norm": 0.5016675984334057, "learning_rate": 9.481584661417258e-06, "loss": 0.0358, "step": 2081 }, { "epoch": 0.6952746702287527, "grad_norm": 0.35840475345410083, "learning_rate": 9.480722621778334e-06, "loss": 0.036, "step": 2082 }, { "epoch": 0.6956086157956253, "grad_norm": 0.28766872101826946, "learning_rate": 9.479859905273498e-06, "loss": 0.0273, "step": 2083 }, { "epoch": 0.6959425613624979, "grad_norm": 0.6414607344065364, "learning_rate": 9.478996512033074e-06, "loss": 0.0593, "step": 2084 }, { "epoch": 0.6962765069293705, "grad_norm": 0.4752069954799563, "learning_rate": 9.478132442187491e-06, "loss": 0.0337, "step": 2085 }, { "epoch": 0.6966104524962431, "grad_norm": 0.39948088189108805, "learning_rate": 9.477267695867275e-06, "loss": 0.0372, "step": 2086 }, { "epoch": 0.6969443980631157, "grad_norm": 0.4082284378453413, "learning_rate": 9.476402273203052e-06, "loss": 0.0389, "step": 2087 }, { "epoch": 0.6972783436299883, "grad_norm": 0.36586269541710464, "learning_rate": 9.47553617432556e-06, "loss": 0.0415, "step": 2088 }, { "epoch": 0.6976122891968609, "grad_norm": 0.45222951343907314, "learning_rate": 9.47466939936563e-06, "loss": 0.0407, "step": 2089 }, { "epoch": 0.6979462347637335, "grad_norm": 0.3799030291064665, "learning_rate": 9.473801948454199e-06, "loss": 0.0316, "step": 2090 }, { "epoch": 0.6982801803306061, "grad_norm": 0.3589556260093423, "learning_rate": 9.472933821722307e-06, "loss": 0.0404, "step": 2091 }, { "epoch": 0.6986141258974787, "grad_norm": 0.3058433466630493, "learning_rate": 9.472065019301095e-06, "loss": 0.0276, "step": 2092 }, { "epoch": 0.6989480714643513, "grad_norm": 0.4957747306104084, "learning_rate": 9.471195541321805e-06, "loss": 0.0365, "step": 2093 }, { "epoch": 0.699282017031224, "grad_norm": 0.3660253521415888, "learning_rate": 9.470325387915782e-06, "loss": 0.0388, "step": 2094 }, { "epoch": 0.6996159625980966, "grad_norm": 0.3969797950662937, "learning_rate": 9.469454559214473e-06, "loss": 0.0432, "step": 2095 }, { "epoch": 0.6999499081649692, "grad_norm": 0.553351992055136, "learning_rate": 9.468583055349425e-06, "loss": 0.0394, "step": 2096 }, { "epoch": 0.7002838537318417, "grad_norm": 0.3506807419904832, "learning_rate": 9.467710876452292e-06, "loss": 0.0418, "step": 2097 }, { "epoch": 0.7006177992987143, "grad_norm": 0.5097933681573146, "learning_rate": 9.466838022654826e-06, "loss": 0.0371, "step": 2098 }, { "epoch": 0.7009517448655869, "grad_norm": 0.4204143743306447, "learning_rate": 9.465964494088879e-06, "loss": 0.036, "step": 2099 }, { "epoch": 0.7012856904324595, "grad_norm": 0.3803661985442615, "learning_rate": 9.465090290886411e-06, "loss": 0.0406, "step": 2100 }, { "epoch": 0.7016196359993321, "grad_norm": 0.4085602290339402, "learning_rate": 9.464215413179483e-06, "loss": 0.0504, "step": 2101 }, { "epoch": 0.7019535815662047, "grad_norm": 0.48582041271567966, "learning_rate": 9.46333986110025e-06, "loss": 0.0376, "step": 2102 }, { "epoch": 0.7022875271330773, "grad_norm": 0.3169719380753962, "learning_rate": 9.462463634780977e-06, "loss": 0.0338, "step": 2103 }, { "epoch": 0.7026214726999499, "grad_norm": 0.3126122735122749, "learning_rate": 9.461586734354027e-06, "loss": 0.024, "step": 2104 }, { "epoch": 0.7029554182668225, "grad_norm": 0.66368486085725, "learning_rate": 9.460709159951867e-06, "loss": 0.0608, "step": 2105 }, { "epoch": 0.7032893638336951, "grad_norm": 0.33121278172693575, "learning_rate": 9.459830911707066e-06, "loss": 0.0282, "step": 2106 }, { "epoch": 0.7036233094005677, "grad_norm": 0.4406857934513667, "learning_rate": 9.458951989752295e-06, "loss": 0.0339, "step": 2107 }, { "epoch": 0.7039572549674403, "grad_norm": 0.5133036530910118, "learning_rate": 9.458072394220321e-06, "loss": 0.043, "step": 2108 }, { "epoch": 0.7042912005343129, "grad_norm": 0.30461348264628313, "learning_rate": 9.457192125244021e-06, "loss": 0.0245, "step": 2109 }, { "epoch": 0.7046251461011855, "grad_norm": 0.34330988923124833, "learning_rate": 9.456311182956368e-06, "loss": 0.0308, "step": 2110 }, { "epoch": 0.7049590916680581, "grad_norm": 0.32911770953151354, "learning_rate": 9.45542956749044e-06, "loss": 0.0328, "step": 2111 }, { "epoch": 0.7052930372349308, "grad_norm": 0.6083984422933292, "learning_rate": 9.454547278979415e-06, "loss": 0.0406, "step": 2112 }, { "epoch": 0.7056269828018034, "grad_norm": 0.3185936140057806, "learning_rate": 9.453664317556572e-06, "loss": 0.0274, "step": 2113 }, { "epoch": 0.7059609283686759, "grad_norm": 0.48570418213064875, "learning_rate": 9.452780683355295e-06, "loss": 0.0373, "step": 2114 }, { "epoch": 0.7062948739355485, "grad_norm": 0.3778477949862705, "learning_rate": 9.451896376509065e-06, "loss": 0.0336, "step": 2115 }, { "epoch": 0.7066288195024211, "grad_norm": 0.3185035797864648, "learning_rate": 9.451011397151469e-06, "loss": 0.0282, "step": 2116 }, { "epoch": 0.7069627650692937, "grad_norm": 0.32082110489545806, "learning_rate": 9.450125745416191e-06, "loss": 0.0303, "step": 2117 }, { "epoch": 0.7072967106361663, "grad_norm": 0.5487871290248476, "learning_rate": 9.44923942143702e-06, "loss": 0.0467, "step": 2118 }, { "epoch": 0.7076306562030389, "grad_norm": 0.7642656430859089, "learning_rate": 9.448352425347848e-06, "loss": 0.0599, "step": 2119 }, { "epoch": 0.7079646017699115, "grad_norm": 0.41308813145148743, "learning_rate": 9.447464757282665e-06, "loss": 0.0468, "step": 2120 }, { "epoch": 0.7082985473367841, "grad_norm": 0.5125691388816903, "learning_rate": 9.44657641737556e-06, "loss": 0.0381, "step": 2121 }, { "epoch": 0.7086324929036567, "grad_norm": 0.30328822423975166, "learning_rate": 9.445687405760735e-06, "loss": 0.0367, "step": 2122 }, { "epoch": 0.7089664384705293, "grad_norm": 0.3251459274141834, "learning_rate": 9.444797722572479e-06, "loss": 0.0361, "step": 2123 }, { "epoch": 0.7093003840374019, "grad_norm": 0.929693515245696, "learning_rate": 9.44390736794519e-06, "loss": 0.0552, "step": 2124 }, { "epoch": 0.7096343296042745, "grad_norm": 0.3665625751767317, "learning_rate": 9.443016342013369e-06, "loss": 0.0352, "step": 2125 }, { "epoch": 0.7099682751711471, "grad_norm": 0.30557329648239634, "learning_rate": 9.442124644911614e-06, "loss": 0.0371, "step": 2126 }, { "epoch": 0.7103022207380197, "grad_norm": 0.4266907565619148, "learning_rate": 9.441232276774629e-06, "loss": 0.0369, "step": 2127 }, { "epoch": 0.7106361663048923, "grad_norm": 0.49036847738058204, "learning_rate": 9.440339237737213e-06, "loss": 0.0355, "step": 2128 }, { "epoch": 0.710970111871765, "grad_norm": 0.5439634251546771, "learning_rate": 9.439445527934272e-06, "loss": 0.0478, "step": 2129 }, { "epoch": 0.7113040574386374, "grad_norm": 0.7030520850755128, "learning_rate": 9.438551147500812e-06, "loss": 0.0472, "step": 2130 }, { "epoch": 0.7116380030055101, "grad_norm": 0.4244879408655492, "learning_rate": 9.437656096571938e-06, "loss": 0.0321, "step": 2131 }, { "epoch": 0.7119719485723827, "grad_norm": 0.5624451104866817, "learning_rate": 9.436760375282858e-06, "loss": 0.0414, "step": 2132 }, { "epoch": 0.7123058941392553, "grad_norm": 0.5585892134208846, "learning_rate": 9.435863983768884e-06, "loss": 0.0424, "step": 2133 }, { "epoch": 0.7126398397061279, "grad_norm": 0.4815680736159215, "learning_rate": 9.434966922165424e-06, "loss": 0.0458, "step": 2134 }, { "epoch": 0.7129737852730005, "grad_norm": 0.6999091954272587, "learning_rate": 9.43406919060799e-06, "loss": 0.0393, "step": 2135 }, { "epoch": 0.7133077308398731, "grad_norm": 0.5557423602281404, "learning_rate": 9.433170789232196e-06, "loss": 0.0287, "step": 2136 }, { "epoch": 0.7136416764067457, "grad_norm": 0.6046902850475429, "learning_rate": 9.432271718173756e-06, "loss": 0.0417, "step": 2137 }, { "epoch": 0.7139756219736183, "grad_norm": 0.51087554433223, "learning_rate": 9.431371977568483e-06, "loss": 0.0415, "step": 2138 }, { "epoch": 0.7143095675404909, "grad_norm": 0.7207660363117191, "learning_rate": 9.430471567552295e-06, "loss": 0.0368, "step": 2139 }, { "epoch": 0.7146435131073635, "grad_norm": 0.6787427646753198, "learning_rate": 9.42957048826121e-06, "loss": 0.0369, "step": 2140 }, { "epoch": 0.7149774586742361, "grad_norm": 0.3798691880618357, "learning_rate": 9.428668739831349e-06, "loss": 0.0372, "step": 2141 }, { "epoch": 0.7153114042411087, "grad_norm": 0.5857347216142876, "learning_rate": 9.427766322398926e-06, "loss": 0.0473, "step": 2142 }, { "epoch": 0.7156453498079813, "grad_norm": 0.5065366747180108, "learning_rate": 9.426863236100266e-06, "loss": 0.0404, "step": 2143 }, { "epoch": 0.7159792953748539, "grad_norm": 0.6105120015697073, "learning_rate": 9.425959481071787e-06, "loss": 0.0491, "step": 2144 }, { "epoch": 0.7163132409417265, "grad_norm": 0.3026933591924967, "learning_rate": 9.425055057450017e-06, "loss": 0.0305, "step": 2145 }, { "epoch": 0.716647186508599, "grad_norm": 0.3150470920767179, "learning_rate": 9.424149965371576e-06, "loss": 0.029, "step": 2146 }, { "epoch": 0.7169811320754716, "grad_norm": 0.5381519528684029, "learning_rate": 9.423244204973191e-06, "loss": 0.0314, "step": 2147 }, { "epoch": 0.7173150776423443, "grad_norm": 0.30701392419370926, "learning_rate": 9.422337776391686e-06, "loss": 0.0238, "step": 2148 }, { "epoch": 0.7176490232092169, "grad_norm": 0.3641873297705958, "learning_rate": 9.421430679763989e-06, "loss": 0.0325, "step": 2149 }, { "epoch": 0.7179829687760895, "grad_norm": 0.36154786456566407, "learning_rate": 9.420522915227129e-06, "loss": 0.0279, "step": 2150 }, { "epoch": 0.7183169143429621, "grad_norm": 0.33684500270630197, "learning_rate": 9.419614482918229e-06, "loss": 0.039, "step": 2151 }, { "epoch": 0.7186508599098347, "grad_norm": 0.38092296437484474, "learning_rate": 9.418705382974524e-06, "loss": 0.0285, "step": 2152 }, { "epoch": 0.7189848054767073, "grad_norm": 0.3162897641730264, "learning_rate": 9.417795615533343e-06, "loss": 0.0355, "step": 2153 }, { "epoch": 0.7193187510435799, "grad_norm": 0.5527311501906367, "learning_rate": 9.416885180732115e-06, "loss": 0.0504, "step": 2154 }, { "epoch": 0.7196526966104525, "grad_norm": 0.31064578020814043, "learning_rate": 9.415974078708375e-06, "loss": 0.0297, "step": 2155 }, { "epoch": 0.7199866421773251, "grad_norm": 0.2783485728603494, "learning_rate": 9.415062309599751e-06, "loss": 0.0242, "step": 2156 }, { "epoch": 0.7203205877441977, "grad_norm": 0.45640344369628066, "learning_rate": 9.414149873543983e-06, "loss": 0.0422, "step": 2157 }, { "epoch": 0.7206545333110703, "grad_norm": 0.43007533085532074, "learning_rate": 9.4132367706789e-06, "loss": 0.0353, "step": 2158 }, { "epoch": 0.7209884788779429, "grad_norm": 0.28082360420233443, "learning_rate": 9.412323001142438e-06, "loss": 0.0282, "step": 2159 }, { "epoch": 0.7213224244448155, "grad_norm": 0.3199494105748125, "learning_rate": 9.411408565072635e-06, "loss": 0.0455, "step": 2160 }, { "epoch": 0.7216563700116881, "grad_norm": 0.3745445600000952, "learning_rate": 9.410493462607623e-06, "loss": 0.0315, "step": 2161 }, { "epoch": 0.7219903155785607, "grad_norm": 0.46795502563432667, "learning_rate": 9.409577693885642e-06, "loss": 0.0359, "step": 2162 }, { "epoch": 0.7223242611454332, "grad_norm": 0.6439904868634199, "learning_rate": 9.408661259045032e-06, "loss": 0.0371, "step": 2163 }, { "epoch": 0.7226582067123059, "grad_norm": 0.36532717951072996, "learning_rate": 9.407744158224227e-06, "loss": 0.0335, "step": 2164 }, { "epoch": 0.7229921522791785, "grad_norm": 0.4761693864609595, "learning_rate": 9.406826391561767e-06, "loss": 0.0436, "step": 2165 }, { "epoch": 0.7233260978460511, "grad_norm": 0.542276120583976, "learning_rate": 9.405907959196293e-06, "loss": 0.0504, "step": 2166 }, { "epoch": 0.7236600434129237, "grad_norm": 0.3101592116587309, "learning_rate": 9.404988861266543e-06, "loss": 0.0298, "step": 2167 }, { "epoch": 0.7239939889797963, "grad_norm": 0.37567398821930725, "learning_rate": 9.404069097911358e-06, "loss": 0.0373, "step": 2168 }, { "epoch": 0.7243279345466689, "grad_norm": 1.1232092577160215, "learning_rate": 9.40314866926968e-06, "loss": 0.0285, "step": 2169 }, { "epoch": 0.7246618801135415, "grad_norm": 0.3456497379006263, "learning_rate": 9.402227575480549e-06, "loss": 0.0302, "step": 2170 }, { "epoch": 0.7249958256804141, "grad_norm": 0.2956667125599662, "learning_rate": 9.401305816683111e-06, "loss": 0.0267, "step": 2171 }, { "epoch": 0.7253297712472867, "grad_norm": 0.4008379808926853, "learning_rate": 9.400383393016604e-06, "loss": 0.0502, "step": 2172 }, { "epoch": 0.7256637168141593, "grad_norm": 0.41478548582639985, "learning_rate": 9.39946030462037e-06, "loss": 0.0399, "step": 2173 }, { "epoch": 0.7259976623810319, "grad_norm": 0.5694715010360888, "learning_rate": 9.39853655163386e-06, "loss": 0.0288, "step": 2174 }, { "epoch": 0.7263316079479045, "grad_norm": 0.3686960281419378, "learning_rate": 9.39761213419661e-06, "loss": 0.032, "step": 2175 }, { "epoch": 0.7266655535147771, "grad_norm": 0.3830247290605787, "learning_rate": 9.396687052448267e-06, "loss": 0.0476, "step": 2176 }, { "epoch": 0.7269994990816497, "grad_norm": 0.35566223597964414, "learning_rate": 9.395761306528576e-06, "loss": 0.0342, "step": 2177 }, { "epoch": 0.7273334446485223, "grad_norm": 0.393115357857828, "learning_rate": 9.39483489657738e-06, "loss": 0.0276, "step": 2178 }, { "epoch": 0.7276673902153948, "grad_norm": 0.3353379272112651, "learning_rate": 9.393907822734627e-06, "loss": 0.0349, "step": 2179 }, { "epoch": 0.7280013357822674, "grad_norm": 0.44949813030160335, "learning_rate": 9.39298008514036e-06, "loss": 0.045, "step": 2180 }, { "epoch": 0.72833528134914, "grad_norm": 0.3584142367623185, "learning_rate": 9.392051683934726e-06, "loss": 0.0346, "step": 2181 }, { "epoch": 0.7286692269160127, "grad_norm": 0.34676570759036107, "learning_rate": 9.39112261925797e-06, "loss": 0.0408, "step": 2182 }, { "epoch": 0.7290031724828853, "grad_norm": 0.35127440088031114, "learning_rate": 9.390192891250439e-06, "loss": 0.0365, "step": 2183 }, { "epoch": 0.7293371180497579, "grad_norm": 1.2571936885864603, "learning_rate": 9.389262500052578e-06, "loss": 0.0476, "step": 2184 }, { "epoch": 0.7296710636166305, "grad_norm": 0.6176783849662094, "learning_rate": 9.388331445804935e-06, "loss": 0.0425, "step": 2185 }, { "epoch": 0.7300050091835031, "grad_norm": 0.35129875622236834, "learning_rate": 9.387399728648156e-06, "loss": 0.0395, "step": 2186 }, { "epoch": 0.7303389547503757, "grad_norm": 0.41741731330621706, "learning_rate": 9.386467348722989e-06, "loss": 0.0463, "step": 2187 }, { "epoch": 0.7306729003172483, "grad_norm": 0.5003210965167986, "learning_rate": 9.385534306170279e-06, "loss": 0.0364, "step": 2188 }, { "epoch": 0.7310068458841209, "grad_norm": 0.3415018618510439, "learning_rate": 9.384600601130973e-06, "loss": 0.0394, "step": 2189 }, { "epoch": 0.7313407914509935, "grad_norm": 0.37855829690889353, "learning_rate": 9.383666233746121e-06, "loss": 0.0373, "step": 2190 }, { "epoch": 0.7316747370178661, "grad_norm": 0.5621844322113564, "learning_rate": 9.382731204156869e-06, "loss": 0.0461, "step": 2191 }, { "epoch": 0.7320086825847387, "grad_norm": 0.4227093732171971, "learning_rate": 9.381795512504461e-06, "loss": 0.0359, "step": 2192 }, { "epoch": 0.7323426281516113, "grad_norm": 0.6463561503128954, "learning_rate": 9.380859158930249e-06, "loss": 0.0622, "step": 2193 }, { "epoch": 0.7326765737184839, "grad_norm": 0.3230426282789849, "learning_rate": 9.379922143575678e-06, "loss": 0.0248, "step": 2194 }, { "epoch": 0.7330105192853564, "grad_norm": 0.3773252718914423, "learning_rate": 9.378984466582294e-06, "loss": 0.0356, "step": 2195 }, { "epoch": 0.733344464852229, "grad_norm": 0.4584867118916967, "learning_rate": 9.378046128091748e-06, "loss": 0.0386, "step": 2196 }, { "epoch": 0.7336784104191016, "grad_norm": 0.39063708609130854, "learning_rate": 9.377107128245782e-06, "loss": 0.0268, "step": 2197 }, { "epoch": 0.7340123559859743, "grad_norm": 0.4621952850142301, "learning_rate": 9.376167467186246e-06, "loss": 0.034, "step": 2198 }, { "epoch": 0.7343463015528469, "grad_norm": 0.9020751036892598, "learning_rate": 9.375227145055085e-06, "loss": 0.032, "step": 2199 }, { "epoch": 0.7346802471197195, "grad_norm": 0.581350479413104, "learning_rate": 9.374286161994351e-06, "loss": 0.0433, "step": 2200 }, { "epoch": 0.7350141926865921, "grad_norm": 0.470124648155974, "learning_rate": 9.373344518146184e-06, "loss": 0.0504, "step": 2201 }, { "epoch": 0.7353481382534647, "grad_norm": 0.4280483085829255, "learning_rate": 9.372402213652833e-06, "loss": 0.044, "step": 2202 }, { "epoch": 0.7356820838203373, "grad_norm": 0.45019101568081704, "learning_rate": 9.371459248656645e-06, "loss": 0.0387, "step": 2203 }, { "epoch": 0.7360160293872099, "grad_norm": 0.45417671331877707, "learning_rate": 9.370515623300066e-06, "loss": 0.0418, "step": 2204 }, { "epoch": 0.7363499749540825, "grad_norm": 0.46822253382427065, "learning_rate": 9.369571337725638e-06, "loss": 0.0396, "step": 2205 }, { "epoch": 0.7366839205209551, "grad_norm": 0.4384426918864997, "learning_rate": 9.368626392076013e-06, "loss": 0.0348, "step": 2206 }, { "epoch": 0.7370178660878277, "grad_norm": 0.3886070779136618, "learning_rate": 9.367680786493929e-06, "loss": 0.0394, "step": 2207 }, { "epoch": 0.7373518116547003, "grad_norm": 0.37301161619664175, "learning_rate": 9.366734521122236e-06, "loss": 0.0389, "step": 2208 }, { "epoch": 0.7376857572215729, "grad_norm": 0.4353994364406186, "learning_rate": 9.365787596103877e-06, "loss": 0.0414, "step": 2209 }, { "epoch": 0.7380197027884455, "grad_norm": 0.5940195018896346, "learning_rate": 9.364840011581896e-06, "loss": 0.042, "step": 2210 }, { "epoch": 0.7383536483553181, "grad_norm": 0.3967047113384441, "learning_rate": 9.363891767699437e-06, "loss": 0.0358, "step": 2211 }, { "epoch": 0.7386875939221906, "grad_norm": 0.40143940408633927, "learning_rate": 9.362942864599746e-06, "loss": 0.0318, "step": 2212 }, { "epoch": 0.7390215394890632, "grad_norm": 0.49633045943653026, "learning_rate": 9.36199330242616e-06, "loss": 0.0474, "step": 2213 }, { "epoch": 0.7393554850559358, "grad_norm": 0.3330337287688398, "learning_rate": 9.361043081322125e-06, "loss": 0.0316, "step": 2214 }, { "epoch": 0.7396894306228085, "grad_norm": 0.4951336586004503, "learning_rate": 9.360092201431186e-06, "loss": 0.0392, "step": 2215 }, { "epoch": 0.7400233761896811, "grad_norm": 0.5995767210304951, "learning_rate": 9.359140662896978e-06, "loss": 0.0504, "step": 2216 }, { "epoch": 0.7403573217565537, "grad_norm": 0.33445634694416476, "learning_rate": 9.358188465863247e-06, "loss": 0.037, "step": 2217 }, { "epoch": 0.7406912673234263, "grad_norm": 0.3243516604870869, "learning_rate": 9.357235610473833e-06, "loss": 0.0374, "step": 2218 }, { "epoch": 0.7410252128902989, "grad_norm": 0.3249633073589906, "learning_rate": 9.356282096872673e-06, "loss": 0.0455, "step": 2219 }, { "epoch": 0.7413591584571715, "grad_norm": 0.3272398278474091, "learning_rate": 9.355327925203811e-06, "loss": 0.0299, "step": 2220 }, { "epoch": 0.7416931040240441, "grad_norm": 0.5104439431607845, "learning_rate": 9.354373095611383e-06, "loss": 0.0506, "step": 2221 }, { "epoch": 0.7420270495909167, "grad_norm": 0.303402283180065, "learning_rate": 9.353417608239627e-06, "loss": 0.0355, "step": 2222 }, { "epoch": 0.7423609951577893, "grad_norm": 0.48439694546886963, "learning_rate": 9.352461463232882e-06, "loss": 0.0417, "step": 2223 }, { "epoch": 0.7426949407246619, "grad_norm": 0.5067073653283253, "learning_rate": 9.351504660735583e-06, "loss": 0.0466, "step": 2224 }, { "epoch": 0.7430288862915345, "grad_norm": 0.26718770356295135, "learning_rate": 9.350547200892271e-06, "loss": 0.0295, "step": 2225 }, { "epoch": 0.7433628318584071, "grad_norm": 0.37623817934192, "learning_rate": 9.349589083847577e-06, "loss": 0.0325, "step": 2226 }, { "epoch": 0.7436967774252797, "grad_norm": 0.3326289340340217, "learning_rate": 9.348630309746236e-06, "loss": 0.0373, "step": 2227 }, { "epoch": 0.7440307229921522, "grad_norm": 0.31750724437460565, "learning_rate": 9.347670878733084e-06, "loss": 0.0297, "step": 2228 }, { "epoch": 0.7443646685590248, "grad_norm": 0.4042643363629417, "learning_rate": 9.346710790953053e-06, "loss": 0.0457, "step": 2229 }, { "epoch": 0.7446986141258974, "grad_norm": 0.3791139189174066, "learning_rate": 9.345750046551177e-06, "loss": 0.0338, "step": 2230 }, { "epoch": 0.74503255969277, "grad_norm": 0.3048806840867557, "learning_rate": 9.344788645672585e-06, "loss": 0.0295, "step": 2231 }, { "epoch": 0.7453665052596427, "grad_norm": 0.3093142222126558, "learning_rate": 9.343826588462513e-06, "loss": 0.0307, "step": 2232 }, { "epoch": 0.7457004508265153, "grad_norm": 0.423785351652411, "learning_rate": 9.342863875066284e-06, "loss": 0.0302, "step": 2233 }, { "epoch": 0.7460343963933879, "grad_norm": 0.4104524654735883, "learning_rate": 9.341900505629333e-06, "loss": 0.0267, "step": 2234 }, { "epoch": 0.7463683419602605, "grad_norm": 0.6179830673034177, "learning_rate": 9.340936480297187e-06, "loss": 0.0429, "step": 2235 }, { "epoch": 0.7467022875271331, "grad_norm": 0.2712801420997214, "learning_rate": 9.339971799215472e-06, "loss": 0.0297, "step": 2236 }, { "epoch": 0.7470362330940057, "grad_norm": 0.5720340115168108, "learning_rate": 9.339006462529916e-06, "loss": 0.0309, "step": 2237 }, { "epoch": 0.7473701786608783, "grad_norm": 0.29473138517563946, "learning_rate": 9.338040470386344e-06, "loss": 0.0299, "step": 2238 }, { "epoch": 0.7477041242277509, "grad_norm": 0.3660711868493045, "learning_rate": 9.337073822930681e-06, "loss": 0.0499, "step": 2239 }, { "epoch": 0.7480380697946235, "grad_norm": 0.39026290597585506, "learning_rate": 9.336106520308948e-06, "loss": 0.0324, "step": 2240 }, { "epoch": 0.7483720153614961, "grad_norm": 0.34956067278873115, "learning_rate": 9.335138562667267e-06, "loss": 0.0462, "step": 2241 }, { "epoch": 0.7487059609283687, "grad_norm": 0.328480681617309, "learning_rate": 9.334169950151866e-06, "loss": 0.0277, "step": 2242 }, { "epoch": 0.7490399064952413, "grad_norm": 0.5570308483129467, "learning_rate": 9.333200682909059e-06, "loss": 0.042, "step": 2243 }, { "epoch": 0.7493738520621138, "grad_norm": 0.5499427636240891, "learning_rate": 9.332230761085265e-06, "loss": 0.0396, "step": 2244 }, { "epoch": 0.7497077976289864, "grad_norm": 0.3647005932018772, "learning_rate": 9.331260184827006e-06, "loss": 0.0375, "step": 2245 }, { "epoch": 0.750041743195859, "grad_norm": 0.4355905627081893, "learning_rate": 9.330288954280898e-06, "loss": 0.0326, "step": 2246 }, { "epoch": 0.7503756887627316, "grad_norm": 0.6850520487771359, "learning_rate": 9.329317069593654e-06, "loss": 0.0505, "step": 2247 }, { "epoch": 0.7507096343296042, "grad_norm": 0.48892253010051345, "learning_rate": 9.328344530912093e-06, "loss": 0.0414, "step": 2248 }, { "epoch": 0.7510435798964769, "grad_norm": 0.514361695486632, "learning_rate": 9.327371338383124e-06, "loss": 0.036, "step": 2249 }, { "epoch": 0.7513775254633495, "grad_norm": 0.40093172239531716, "learning_rate": 9.326397492153762e-06, "loss": 0.0384, "step": 2250 }, { "epoch": 0.7517114710302221, "grad_norm": 0.9703862802428921, "learning_rate": 9.325422992371117e-06, "loss": 0.0544, "step": 2251 }, { "epoch": 0.7520454165970947, "grad_norm": 0.5118673491055077, "learning_rate": 9.324447839182397e-06, "loss": 0.0461, "step": 2252 }, { "epoch": 0.7523793621639673, "grad_norm": 0.3639689648823959, "learning_rate": 9.323472032734915e-06, "loss": 0.032, "step": 2253 }, { "epoch": 0.7527133077308399, "grad_norm": 0.6187138592600782, "learning_rate": 9.322495573176073e-06, "loss": 0.0458, "step": 2254 }, { "epoch": 0.7530472532977125, "grad_norm": 0.27008895818226897, "learning_rate": 9.321518460653381e-06, "loss": 0.0268, "step": 2255 }, { "epoch": 0.7533811988645851, "grad_norm": 0.37538592472089005, "learning_rate": 9.32054069531444e-06, "loss": 0.0279, "step": 2256 }, { "epoch": 0.7537151444314577, "grad_norm": 0.5327976462756395, "learning_rate": 9.319562277306955e-06, "loss": 0.0404, "step": 2257 }, { "epoch": 0.7540490899983303, "grad_norm": 0.33481660725693313, "learning_rate": 9.318583206778726e-06, "loss": 0.0327, "step": 2258 }, { "epoch": 0.7543830355652029, "grad_norm": 0.43566182686512606, "learning_rate": 9.317603483877654e-06, "loss": 0.037, "step": 2259 }, { "epoch": 0.7547169811320755, "grad_norm": 0.34502868792169167, "learning_rate": 9.316623108751739e-06, "loss": 0.0274, "step": 2260 }, { "epoch": 0.755050926698948, "grad_norm": 0.4098480126148483, "learning_rate": 9.315642081549074e-06, "loss": 0.0346, "step": 2261 }, { "epoch": 0.7553848722658206, "grad_norm": 0.3520297697972669, "learning_rate": 9.31466040241786e-06, "loss": 0.0339, "step": 2262 }, { "epoch": 0.7557188178326932, "grad_norm": 0.3353997409151091, "learning_rate": 9.313678071506388e-06, "loss": 0.0325, "step": 2263 }, { "epoch": 0.7560527633995658, "grad_norm": 0.28276655262328104, "learning_rate": 9.31269508896305e-06, "loss": 0.0395, "step": 2264 }, { "epoch": 0.7563867089664384, "grad_norm": 0.4461004263857972, "learning_rate": 9.31171145493634e-06, "loss": 0.04, "step": 2265 }, { "epoch": 0.756720654533311, "grad_norm": 0.4051370472975567, "learning_rate": 9.310727169574847e-06, "loss": 0.0371, "step": 2266 }, { "epoch": 0.7570546001001837, "grad_norm": 0.3119139688090534, "learning_rate": 9.309742233027258e-06, "loss": 0.031, "step": 2267 }, { "epoch": 0.7573885456670563, "grad_norm": 0.33750869813243045, "learning_rate": 9.308756645442356e-06, "loss": 0.0305, "step": 2268 }, { "epoch": 0.7577224912339289, "grad_norm": 0.35256712007998253, "learning_rate": 9.307770406969032e-06, "loss": 0.0353, "step": 2269 }, { "epoch": 0.7580564368008015, "grad_norm": 0.2884726594341784, "learning_rate": 9.306783517756264e-06, "loss": 0.03, "step": 2270 }, { "epoch": 0.7583903823676741, "grad_norm": 0.42069566480415976, "learning_rate": 9.305795977953134e-06, "loss": 0.0364, "step": 2271 }, { "epoch": 0.7587243279345467, "grad_norm": 0.4546454469635054, "learning_rate": 9.304807787708825e-06, "loss": 0.0374, "step": 2272 }, { "epoch": 0.7590582735014193, "grad_norm": 0.39993702427824834, "learning_rate": 9.303818947172611e-06, "loss": 0.0308, "step": 2273 }, { "epoch": 0.7593922190682919, "grad_norm": 0.4791795651688872, "learning_rate": 9.302829456493868e-06, "loss": 0.0431, "step": 2274 }, { "epoch": 0.7597261646351645, "grad_norm": 0.33578853656263385, "learning_rate": 9.301839315822072e-06, "loss": 0.0339, "step": 2275 }, { "epoch": 0.7600601102020371, "grad_norm": 0.3537306255409295, "learning_rate": 9.300848525306797e-06, "loss": 0.0365, "step": 2276 }, { "epoch": 0.7603940557689096, "grad_norm": 0.335284276454725, "learning_rate": 9.299857085097708e-06, "loss": 0.0341, "step": 2277 }, { "epoch": 0.7607280013357822, "grad_norm": 0.40738792079537517, "learning_rate": 9.298864995344579e-06, "loss": 0.0272, "step": 2278 }, { "epoch": 0.7610619469026548, "grad_norm": 0.6229553924584766, "learning_rate": 9.297872256197276e-06, "loss": 0.0412, "step": 2279 }, { "epoch": 0.7613958924695274, "grad_norm": 0.2538644625026248, "learning_rate": 9.296878867805762e-06, "loss": 0.022, "step": 2280 }, { "epoch": 0.7617298380364, "grad_norm": 0.28342092080256903, "learning_rate": 9.2958848303201e-06, "loss": 0.0293, "step": 2281 }, { "epoch": 0.7620637836032726, "grad_norm": 0.3192840612391781, "learning_rate": 9.294890143890451e-06, "loss": 0.0327, "step": 2282 }, { "epoch": 0.7623977291701453, "grad_norm": 0.31682716384579585, "learning_rate": 9.293894808667077e-06, "loss": 0.038, "step": 2283 }, { "epoch": 0.7627316747370179, "grad_norm": 0.37258062981176443, "learning_rate": 9.292898824800333e-06, "loss": 0.0375, "step": 2284 }, { "epoch": 0.7630656203038905, "grad_norm": 0.39974063226260376, "learning_rate": 9.291902192440673e-06, "loss": 0.0429, "step": 2285 }, { "epoch": 0.7633995658707631, "grad_norm": 0.3159295288529469, "learning_rate": 9.290904911738653e-06, "loss": 0.0335, "step": 2286 }, { "epoch": 0.7637335114376357, "grad_norm": 0.36322486258442793, "learning_rate": 9.289906982844923e-06, "loss": 0.0381, "step": 2287 }, { "epoch": 0.7640674570045083, "grad_norm": 0.6540773212903619, "learning_rate": 9.288908405910228e-06, "loss": 0.027, "step": 2288 }, { "epoch": 0.7644014025713809, "grad_norm": 0.3262228264152689, "learning_rate": 9.287909181085421e-06, "loss": 0.0296, "step": 2289 }, { "epoch": 0.7647353481382535, "grad_norm": 0.36663665870234724, "learning_rate": 9.286909308521443e-06, "loss": 0.0316, "step": 2290 }, { "epoch": 0.7650692937051261, "grad_norm": 0.3391954940491059, "learning_rate": 9.285908788369336e-06, "loss": 0.0348, "step": 2291 }, { "epoch": 0.7654032392719987, "grad_norm": 0.35131535631671784, "learning_rate": 9.284907620780244e-06, "loss": 0.0307, "step": 2292 }, { "epoch": 0.7657371848388712, "grad_norm": 0.35743271985538666, "learning_rate": 9.2839058059054e-06, "loss": 0.0263, "step": 2293 }, { "epoch": 0.7660711304057438, "grad_norm": 0.35251162520858464, "learning_rate": 9.282903343896144e-06, "loss": 0.0285, "step": 2294 }, { "epoch": 0.7664050759726164, "grad_norm": 0.40568589387088894, "learning_rate": 9.281900234903908e-06, "loss": 0.0329, "step": 2295 }, { "epoch": 0.766739021539489, "grad_norm": 0.40969132207513087, "learning_rate": 9.280896479080224e-06, "loss": 0.0417, "step": 2296 }, { "epoch": 0.7670729671063616, "grad_norm": 0.33811726746443876, "learning_rate": 9.27989207657672e-06, "loss": 0.0273, "step": 2297 }, { "epoch": 0.7674069126732342, "grad_norm": 0.36425845736357687, "learning_rate": 9.278887027545125e-06, "loss": 0.0395, "step": 2298 }, { "epoch": 0.7677408582401068, "grad_norm": 0.4273780853380815, "learning_rate": 9.277881332137261e-06, "loss": 0.0412, "step": 2299 }, { "epoch": 0.7680748038069795, "grad_norm": 0.39795117070638286, "learning_rate": 9.276874990505053e-06, "loss": 0.0417, "step": 2300 }, { "epoch": 0.7684087493738521, "grad_norm": 0.5686838243198611, "learning_rate": 9.27586800280052e-06, "loss": 0.0401, "step": 2301 }, { "epoch": 0.7687426949407247, "grad_norm": 0.3871820169781138, "learning_rate": 9.274860369175775e-06, "loss": 0.0417, "step": 2302 }, { "epoch": 0.7690766405075973, "grad_norm": 0.5327552237845823, "learning_rate": 9.27385208978304e-06, "loss": 0.0562, "step": 2303 }, { "epoch": 0.7694105860744699, "grad_norm": 0.326833888376255, "learning_rate": 9.272843164774622e-06, "loss": 0.0362, "step": 2304 }, { "epoch": 0.7697445316413425, "grad_norm": 0.4505078375757577, "learning_rate": 9.27183359430293e-06, "loss": 0.0369, "step": 2305 }, { "epoch": 0.7700784772082151, "grad_norm": 0.2827253170407005, "learning_rate": 9.270823378520478e-06, "loss": 0.0296, "step": 2306 }, { "epoch": 0.7704124227750877, "grad_norm": 0.4820110768733872, "learning_rate": 9.269812517579867e-06, "loss": 0.0386, "step": 2307 }, { "epoch": 0.7707463683419603, "grad_norm": 0.7226482197248502, "learning_rate": 9.268801011633799e-06, "loss": 0.0473, "step": 2308 }, { "epoch": 0.7710803139088329, "grad_norm": 0.2851388528969421, "learning_rate": 9.267788860835076e-06, "loss": 0.0269, "step": 2309 }, { "epoch": 0.7714142594757054, "grad_norm": 0.5878850312448372, "learning_rate": 9.266776065336593e-06, "loss": 0.0628, "step": 2310 }, { "epoch": 0.771748205042578, "grad_norm": 0.34757519733300535, "learning_rate": 9.265762625291346e-06, "loss": 0.0337, "step": 2311 }, { "epoch": 0.7720821506094506, "grad_norm": 0.3855968835433228, "learning_rate": 9.264748540852427e-06, "loss": 0.0383, "step": 2312 }, { "epoch": 0.7724160961763232, "grad_norm": 0.332258418348324, "learning_rate": 9.263733812173023e-06, "loss": 0.035, "step": 2313 }, { "epoch": 0.7727500417431958, "grad_norm": 0.6300936967816194, "learning_rate": 9.262718439406425e-06, "loss": 0.0473, "step": 2314 }, { "epoch": 0.7730839873100684, "grad_norm": 0.5642941486462103, "learning_rate": 9.261702422706014e-06, "loss": 0.048, "step": 2315 }, { "epoch": 0.773417932876941, "grad_norm": 0.3490947858434045, "learning_rate": 9.260685762225273e-06, "loss": 0.0349, "step": 2316 }, { "epoch": 0.7737518784438137, "grad_norm": 0.34370853187301154, "learning_rate": 9.25966845811778e-06, "loss": 0.0375, "step": 2317 }, { "epoch": 0.7740858240106863, "grad_norm": 0.2893861373537869, "learning_rate": 9.258650510537208e-06, "loss": 0.0218, "step": 2318 }, { "epoch": 0.7744197695775589, "grad_norm": 0.42479897197665645, "learning_rate": 9.257631919637333e-06, "loss": 0.0342, "step": 2319 }, { "epoch": 0.7747537151444315, "grad_norm": 0.3423376050436938, "learning_rate": 9.256612685572027e-06, "loss": 0.0361, "step": 2320 }, { "epoch": 0.7750876607113041, "grad_norm": 1.0006130724014104, "learning_rate": 9.255592808495254e-06, "loss": 0.0559, "step": 2321 }, { "epoch": 0.7754216062781767, "grad_norm": 0.3735277315857998, "learning_rate": 9.254572288561077e-06, "loss": 0.0361, "step": 2322 }, { "epoch": 0.7757555518450493, "grad_norm": 0.38814019180363407, "learning_rate": 9.253551125923662e-06, "loss": 0.0466, "step": 2323 }, { "epoch": 0.7760894974119219, "grad_norm": 0.564850405216005, "learning_rate": 9.252529320737265e-06, "loss": 0.0565, "step": 2324 }, { "epoch": 0.7764234429787945, "grad_norm": 0.38527470396801416, "learning_rate": 9.251506873156242e-06, "loss": 0.0415, "step": 2325 }, { "epoch": 0.776757388545667, "grad_norm": 0.4496699876510423, "learning_rate": 9.250483783335046e-06, "loss": 0.0356, "step": 2326 }, { "epoch": 0.7770913341125396, "grad_norm": 0.44017310855164865, "learning_rate": 9.249460051428226e-06, "loss": 0.0318, "step": 2327 }, { "epoch": 0.7774252796794122, "grad_norm": 0.3751258453069622, "learning_rate": 9.24843567759043e-06, "loss": 0.0293, "step": 2328 }, { "epoch": 0.7777592252462848, "grad_norm": 0.4126464214266128, "learning_rate": 9.247410661976402e-06, "loss": 0.0398, "step": 2329 }, { "epoch": 0.7780931708131574, "grad_norm": 0.4186674320930181, "learning_rate": 9.246385004740981e-06, "loss": 0.0591, "step": 2330 }, { "epoch": 0.77842711638003, "grad_norm": 0.4846763983678124, "learning_rate": 9.245358706039105e-06, "loss": 0.0463, "step": 2331 }, { "epoch": 0.7787610619469026, "grad_norm": 0.29009378694281474, "learning_rate": 9.244331766025812e-06, "loss": 0.0307, "step": 2332 }, { "epoch": 0.7790950075137753, "grad_norm": 0.41617628696252, "learning_rate": 9.243304184856226e-06, "loss": 0.032, "step": 2333 }, { "epoch": 0.7794289530806479, "grad_norm": 0.28182386606315113, "learning_rate": 9.242275962685584e-06, "loss": 0.0308, "step": 2334 }, { "epoch": 0.7797628986475205, "grad_norm": 0.44821281607522734, "learning_rate": 9.241247099669202e-06, "loss": 0.0456, "step": 2335 }, { "epoch": 0.7800968442143931, "grad_norm": 0.36446204885079103, "learning_rate": 9.24021759596251e-06, "loss": 0.033, "step": 2336 }, { "epoch": 0.7804307897812657, "grad_norm": 0.4045213815459264, "learning_rate": 9.239187451721021e-06, "loss": 0.0399, "step": 2337 }, { "epoch": 0.7807647353481383, "grad_norm": 0.4448765508265765, "learning_rate": 9.238156667100354e-06, "loss": 0.0487, "step": 2338 }, { "epoch": 0.7810986809150109, "grad_norm": 0.38492584393252205, "learning_rate": 9.237125242256219e-06, "loss": 0.0482, "step": 2339 }, { "epoch": 0.7814326264818835, "grad_norm": 0.3436442668647944, "learning_rate": 9.236093177344427e-06, "loss": 0.0349, "step": 2340 }, { "epoch": 0.7817665720487561, "grad_norm": 0.3751537242198368, "learning_rate": 9.23506047252088e-06, "loss": 0.0346, "step": 2341 }, { "epoch": 0.7821005176156286, "grad_norm": 0.424955879356634, "learning_rate": 9.234027127941585e-06, "loss": 0.034, "step": 2342 }, { "epoch": 0.7824344631825012, "grad_norm": 0.4998866236749951, "learning_rate": 9.232993143762637e-06, "loss": 0.0372, "step": 2343 }, { "epoch": 0.7827684087493738, "grad_norm": 0.25156417480387433, "learning_rate": 9.231958520140232e-06, "loss": 0.0228, "step": 2344 }, { "epoch": 0.7831023543162464, "grad_norm": 0.6408870209428651, "learning_rate": 9.230923257230663e-06, "loss": 0.0407, "step": 2345 }, { "epoch": 0.783436299883119, "grad_norm": 0.439321078845631, "learning_rate": 9.22988735519032e-06, "loss": 0.0283, "step": 2346 }, { "epoch": 0.7837702454499916, "grad_norm": 0.7523973235580048, "learning_rate": 9.228850814175684e-06, "loss": 0.0274, "step": 2347 }, { "epoch": 0.7841041910168642, "grad_norm": 0.3867047988392383, "learning_rate": 9.22781363434334e-06, "loss": 0.0307, "step": 2348 }, { "epoch": 0.7844381365837368, "grad_norm": 0.4819176914879532, "learning_rate": 9.226775815849969e-06, "loss": 0.0392, "step": 2349 }, { "epoch": 0.7847720821506095, "grad_norm": 0.365813211738815, "learning_rate": 9.225737358852339e-06, "loss": 0.039, "step": 2350 }, { "epoch": 0.7851060277174821, "grad_norm": 0.3297844802233969, "learning_rate": 9.224698263507326e-06, "loss": 0.0385, "step": 2351 }, { "epoch": 0.7854399732843547, "grad_norm": 0.4214744807341373, "learning_rate": 9.223658529971896e-06, "loss": 0.0444, "step": 2352 }, { "epoch": 0.7857739188512273, "grad_norm": 0.445684392370362, "learning_rate": 9.222618158403111e-06, "loss": 0.0391, "step": 2353 }, { "epoch": 0.7861078644180999, "grad_norm": 0.3742527468558874, "learning_rate": 9.221577148958137e-06, "loss": 0.0263, "step": 2354 }, { "epoch": 0.7864418099849725, "grad_norm": 0.41245806746412306, "learning_rate": 9.220535501794224e-06, "loss": 0.0296, "step": 2355 }, { "epoch": 0.7867757555518451, "grad_norm": 0.36738237117600886, "learning_rate": 9.21949321706873e-06, "loss": 0.0298, "step": 2356 }, { "epoch": 0.7871097011187177, "grad_norm": 1.739612723513887, "learning_rate": 9.218450294939103e-06, "loss": 0.0424, "step": 2357 }, { "epoch": 0.7874436466855903, "grad_norm": 0.502219044342745, "learning_rate": 9.217406735562887e-06, "loss": 0.0383, "step": 2358 }, { "epoch": 0.7877775922524628, "grad_norm": 0.36902696622805026, "learning_rate": 9.216362539097726e-06, "loss": 0.0283, "step": 2359 }, { "epoch": 0.7881115378193354, "grad_norm": 0.4595719327394596, "learning_rate": 9.215317705701356e-06, "loss": 0.0356, "step": 2360 }, { "epoch": 0.788445483386208, "grad_norm": 0.40020534193794044, "learning_rate": 9.214272235531615e-06, "loss": 0.0223, "step": 2361 }, { "epoch": 0.7887794289530806, "grad_norm": 0.4674393974345048, "learning_rate": 9.213226128746431e-06, "loss": 0.0423, "step": 2362 }, { "epoch": 0.7891133745199532, "grad_norm": 0.37668519786186994, "learning_rate": 9.21217938550383e-06, "loss": 0.037, "step": 2363 }, { "epoch": 0.7894473200868258, "grad_norm": 0.3140922341544152, "learning_rate": 9.211132005961936e-06, "loss": 0.0332, "step": 2364 }, { "epoch": 0.7897812656536984, "grad_norm": 0.3887117251094952, "learning_rate": 9.210083990278968e-06, "loss": 0.0383, "step": 2365 }, { "epoch": 0.790115211220571, "grad_norm": 0.2794176454357536, "learning_rate": 9.209035338613242e-06, "loss": 0.0253, "step": 2366 }, { "epoch": 0.7904491567874437, "grad_norm": 0.4790062539480891, "learning_rate": 9.207986051123167e-06, "loss": 0.0435, "step": 2367 }, { "epoch": 0.7907831023543163, "grad_norm": 0.3569405064812053, "learning_rate": 9.206936127967254e-06, "loss": 0.0348, "step": 2368 }, { "epoch": 0.7911170479211889, "grad_norm": 0.46043945348276366, "learning_rate": 9.205885569304103e-06, "loss": 0.0518, "step": 2369 }, { "epoch": 0.7914509934880615, "grad_norm": 0.48821940054543433, "learning_rate": 9.204834375292413e-06, "loss": 0.0454, "step": 2370 }, { "epoch": 0.7917849390549341, "grad_norm": 0.39694079969159907, "learning_rate": 9.20378254609098e-06, "loss": 0.0377, "step": 2371 }, { "epoch": 0.7921188846218067, "grad_norm": 0.3183483878023652, "learning_rate": 9.202730081858697e-06, "loss": 0.0288, "step": 2372 }, { "epoch": 0.7924528301886793, "grad_norm": 0.37945832633015725, "learning_rate": 9.201676982754549e-06, "loss": 0.0349, "step": 2373 }, { "epoch": 0.7927867757555519, "grad_norm": 0.3527971006203742, "learning_rate": 9.200623248937619e-06, "loss": 0.0336, "step": 2374 }, { "epoch": 0.7931207213224244, "grad_norm": 0.5719888456394068, "learning_rate": 9.199568880567085e-06, "loss": 0.056, "step": 2375 }, { "epoch": 0.793454666889297, "grad_norm": 0.2937981953922501, "learning_rate": 9.198513877802226e-06, "loss": 0.0311, "step": 2376 }, { "epoch": 0.7937886124561696, "grad_norm": 0.418195405726491, "learning_rate": 9.19745824080241e-06, "loss": 0.044, "step": 2377 }, { "epoch": 0.7941225580230422, "grad_norm": 0.31126928333848797, "learning_rate": 9.196401969727101e-06, "loss": 0.0342, "step": 2378 }, { "epoch": 0.7944565035899148, "grad_norm": 0.3489075878698303, "learning_rate": 9.195345064735865e-06, "loss": 0.033, "step": 2379 }, { "epoch": 0.7947904491567874, "grad_norm": 0.5326991082953855, "learning_rate": 9.194287525988358e-06, "loss": 0.0486, "step": 2380 }, { "epoch": 0.79512439472366, "grad_norm": 0.30114911980458225, "learning_rate": 9.193229353644336e-06, "loss": 0.0273, "step": 2381 }, { "epoch": 0.7954583402905326, "grad_norm": 0.30664946189607595, "learning_rate": 9.192170547863644e-06, "loss": 0.028, "step": 2382 }, { "epoch": 0.7957922858574052, "grad_norm": 0.5154007863444322, "learning_rate": 9.191111108806228e-06, "loss": 0.06, "step": 2383 }, { "epoch": 0.7961262314242779, "grad_norm": 0.3483317254335199, "learning_rate": 9.190051036632133e-06, "loss": 0.0267, "step": 2384 }, { "epoch": 0.7964601769911505, "grad_norm": 0.3830564906103203, "learning_rate": 9.188990331501493e-06, "loss": 0.0317, "step": 2385 }, { "epoch": 0.7967941225580231, "grad_norm": 0.3534612409362452, "learning_rate": 9.187928993574537e-06, "loss": 0.0399, "step": 2386 }, { "epoch": 0.7971280681248957, "grad_norm": 0.5207150849458622, "learning_rate": 9.186867023011598e-06, "loss": 0.0435, "step": 2387 }, { "epoch": 0.7974620136917683, "grad_norm": 0.5533648404301923, "learning_rate": 9.185804419973096e-06, "loss": 0.0298, "step": 2388 }, { "epoch": 0.7977959592586409, "grad_norm": 0.39095097237270326, "learning_rate": 9.18474118461955e-06, "loss": 0.0371, "step": 2389 }, { "epoch": 0.7981299048255135, "grad_norm": 0.3223845734875138, "learning_rate": 9.183677317111572e-06, "loss": 0.0256, "step": 2390 }, { "epoch": 0.798463850392386, "grad_norm": 0.3270148569203314, "learning_rate": 9.182612817609877e-06, "loss": 0.0279, "step": 2391 }, { "epoch": 0.7987977959592586, "grad_norm": 0.5074522008396516, "learning_rate": 9.181547686275266e-06, "loss": 0.0473, "step": 2392 }, { "epoch": 0.7991317415261312, "grad_norm": 0.4095450740977763, "learning_rate": 9.180481923268641e-06, "loss": 0.0359, "step": 2393 }, { "epoch": 0.7994656870930038, "grad_norm": 0.6349669291396607, "learning_rate": 9.179415528750998e-06, "loss": 0.0443, "step": 2394 }, { "epoch": 0.7997996326598764, "grad_norm": 0.3217982545761102, "learning_rate": 9.178348502883428e-06, "loss": 0.0284, "step": 2395 }, { "epoch": 0.800133578226749, "grad_norm": 0.4167659783631312, "learning_rate": 9.17728084582712e-06, "loss": 0.0476, "step": 2396 }, { "epoch": 0.8004675237936216, "grad_norm": 0.3966339225977142, "learning_rate": 9.176212557743352e-06, "loss": 0.0354, "step": 2397 }, { "epoch": 0.8008014693604942, "grad_norm": 0.5187414436076643, "learning_rate": 9.175143638793504e-06, "loss": 0.0425, "step": 2398 }, { "epoch": 0.8011354149273668, "grad_norm": 0.4727472650003963, "learning_rate": 9.174074089139048e-06, "loss": 0.0334, "step": 2399 }, { "epoch": 0.8014693604942394, "grad_norm": 0.46117154169807706, "learning_rate": 9.173003908941555e-06, "loss": 0.0405, "step": 2400 }, { "epoch": 0.801803306061112, "grad_norm": 0.37078738977373793, "learning_rate": 9.171933098362685e-06, "loss": 0.0461, "step": 2401 }, { "epoch": 0.8021372516279847, "grad_norm": 0.32751459125821236, "learning_rate": 9.170861657564197e-06, "loss": 0.0271, "step": 2402 }, { "epoch": 0.8024711971948573, "grad_norm": 0.4234680642811903, "learning_rate": 9.169789586707947e-06, "loss": 0.0368, "step": 2403 }, { "epoch": 0.8028051427617299, "grad_norm": 0.7077112915277073, "learning_rate": 9.16871688595588e-06, "loss": 0.0526, "step": 2404 }, { "epoch": 0.8031390883286025, "grad_norm": 0.3150630326209093, "learning_rate": 9.167643555470044e-06, "loss": 0.032, "step": 2405 }, { "epoch": 0.8034730338954751, "grad_norm": 0.48362754954045, "learning_rate": 9.166569595412576e-06, "loss": 0.0468, "step": 2406 }, { "epoch": 0.8038069794623477, "grad_norm": 0.7833067374264567, "learning_rate": 9.16549500594571e-06, "loss": 0.0309, "step": 2407 }, { "epoch": 0.8041409250292202, "grad_norm": 0.3873284202124137, "learning_rate": 9.164419787231778e-06, "loss": 0.0336, "step": 2408 }, { "epoch": 0.8044748705960928, "grad_norm": 0.5462434144047629, "learning_rate": 9.163343939433202e-06, "loss": 0.0316, "step": 2409 }, { "epoch": 0.8048088161629654, "grad_norm": 0.5206466027492244, "learning_rate": 9.162267462712502e-06, "loss": 0.04, "step": 2410 }, { "epoch": 0.805142761729838, "grad_norm": 0.3640320791945312, "learning_rate": 9.161190357232292e-06, "loss": 0.0416, "step": 2411 }, { "epoch": 0.8054767072967106, "grad_norm": 0.2826873572129083, "learning_rate": 9.160112623155282e-06, "loss": 0.0255, "step": 2412 }, { "epoch": 0.8058106528635832, "grad_norm": 0.24947060858739167, "learning_rate": 9.159034260644277e-06, "loss": 0.0259, "step": 2413 }, { "epoch": 0.8061445984304558, "grad_norm": 0.4415832391790346, "learning_rate": 9.157955269862176e-06, "loss": 0.0538, "step": 2414 }, { "epoch": 0.8064785439973284, "grad_norm": 0.5403604731819692, "learning_rate": 9.156875650971974e-06, "loss": 0.0479, "step": 2415 }, { "epoch": 0.806812489564201, "grad_norm": 0.37074362603031524, "learning_rate": 9.155795404136757e-06, "loss": 0.0313, "step": 2416 }, { "epoch": 0.8071464351310736, "grad_norm": 0.28043313341854315, "learning_rate": 9.154714529519715e-06, "loss": 0.0257, "step": 2417 }, { "epoch": 0.8074803806979463, "grad_norm": 0.3944787897956824, "learning_rate": 9.15363302728412e-06, "loss": 0.037, "step": 2418 }, { "epoch": 0.8078143262648189, "grad_norm": 0.54183445956239, "learning_rate": 9.15255089759335e-06, "loss": 0.0376, "step": 2419 }, { "epoch": 0.8081482718316915, "grad_norm": 0.3641107896799597, "learning_rate": 9.151468140610872e-06, "loss": 0.0345, "step": 2420 }, { "epoch": 0.8084822173985641, "grad_norm": 0.34098392754429363, "learning_rate": 9.150384756500249e-06, "loss": 0.0296, "step": 2421 }, { "epoch": 0.8088161629654367, "grad_norm": 0.4701897245201092, "learning_rate": 9.14930074542514e-06, "loss": 0.0375, "step": 2422 }, { "epoch": 0.8091501085323093, "grad_norm": 0.8031930296798723, "learning_rate": 9.148216107549297e-06, "loss": 0.0449, "step": 2423 }, { "epoch": 0.8094840540991818, "grad_norm": 0.36074531884402183, "learning_rate": 9.147130843036567e-06, "loss": 0.0343, "step": 2424 }, { "epoch": 0.8098179996660544, "grad_norm": 0.3821526286807176, "learning_rate": 9.146044952050891e-06, "loss": 0.033, "step": 2425 }, { "epoch": 0.810151945232927, "grad_norm": 0.41207661037352494, "learning_rate": 9.144958434756308e-06, "loss": 0.0406, "step": 2426 }, { "epoch": 0.8104858907997996, "grad_norm": 0.4425118776433806, "learning_rate": 9.14387129131695e-06, "loss": 0.0339, "step": 2427 }, { "epoch": 0.8108198363666722, "grad_norm": 0.4517910740924372, "learning_rate": 9.142783521897038e-06, "loss": 0.0349, "step": 2428 }, { "epoch": 0.8111537819335448, "grad_norm": 0.42162971152368633, "learning_rate": 9.141695126660896e-06, "loss": 0.0321, "step": 2429 }, { "epoch": 0.8114877275004174, "grad_norm": 0.411906356374825, "learning_rate": 9.14060610577294e-06, "loss": 0.0399, "step": 2430 }, { "epoch": 0.81182167306729, "grad_norm": 0.28577457074283213, "learning_rate": 9.139516459397675e-06, "loss": 0.028, "step": 2431 }, { "epoch": 0.8121556186341626, "grad_norm": 0.7449829445852623, "learning_rate": 9.13842618769971e-06, "loss": 0.0505, "step": 2432 }, { "epoch": 0.8124895642010352, "grad_norm": 0.45647043906420526, "learning_rate": 9.13733529084374e-06, "loss": 0.0599, "step": 2433 }, { "epoch": 0.8128235097679078, "grad_norm": 0.35935069149352, "learning_rate": 9.13624376899456e-06, "loss": 0.0375, "step": 2434 }, { "epoch": 0.8131574553347805, "grad_norm": 0.26724471005808453, "learning_rate": 9.135151622317054e-06, "loss": 0.0214, "step": 2435 }, { "epoch": 0.8134914009016531, "grad_norm": 0.4143829862143663, "learning_rate": 9.134058850976205e-06, "loss": 0.0394, "step": 2436 }, { "epoch": 0.8138253464685257, "grad_norm": 0.3387135148533028, "learning_rate": 9.132965455137092e-06, "loss": 0.0232, "step": 2437 }, { "epoch": 0.8141592920353983, "grad_norm": 0.387476338381007, "learning_rate": 9.13187143496488e-06, "loss": 0.0328, "step": 2438 }, { "epoch": 0.8144932376022709, "grad_norm": 0.32699976371462375, "learning_rate": 9.13077679062484e-06, "loss": 0.0313, "step": 2439 }, { "epoch": 0.8148271831691434, "grad_norm": 0.4811761527697918, "learning_rate": 9.129681522282326e-06, "loss": 0.052, "step": 2440 }, { "epoch": 0.815161128736016, "grad_norm": 0.40425537229020103, "learning_rate": 9.128585630102793e-06, "loss": 0.0343, "step": 2441 }, { "epoch": 0.8154950743028886, "grad_norm": 0.22313328264074472, "learning_rate": 9.127489114251787e-06, "loss": 0.0221, "step": 2442 }, { "epoch": 0.8158290198697612, "grad_norm": 0.4236672242450553, "learning_rate": 9.12639197489495e-06, "loss": 0.0409, "step": 2443 }, { "epoch": 0.8161629654366338, "grad_norm": 0.3226748396398696, "learning_rate": 9.125294212198022e-06, "loss": 0.0318, "step": 2444 }, { "epoch": 0.8164969110035064, "grad_norm": 0.39649953078996875, "learning_rate": 9.124195826326827e-06, "loss": 0.0394, "step": 2445 }, { "epoch": 0.816830856570379, "grad_norm": 0.44571838385313783, "learning_rate": 9.12309681744729e-06, "loss": 0.0359, "step": 2446 }, { "epoch": 0.8171648021372516, "grad_norm": 0.4612803299732282, "learning_rate": 9.121997185725433e-06, "loss": 0.0503, "step": 2447 }, { "epoch": 0.8174987477041242, "grad_norm": 0.28429125911881203, "learning_rate": 9.120896931327366e-06, "loss": 0.0328, "step": 2448 }, { "epoch": 0.8178326932709968, "grad_norm": 0.3489991448508281, "learning_rate": 9.119796054419295e-06, "loss": 0.0397, "step": 2449 }, { "epoch": 0.8181666388378694, "grad_norm": 0.46086943336663794, "learning_rate": 9.118694555167521e-06, "loss": 0.0435, "step": 2450 }, { "epoch": 0.818500584404742, "grad_norm": 0.37026378315513875, "learning_rate": 9.117592433738439e-06, "loss": 0.0396, "step": 2451 }, { "epoch": 0.8188345299716147, "grad_norm": 0.3769869415058017, "learning_rate": 9.116489690298536e-06, "loss": 0.0339, "step": 2452 }, { "epoch": 0.8191684755384873, "grad_norm": 0.3655957147179104, "learning_rate": 9.115386325014396e-06, "loss": 0.0279, "step": 2453 }, { "epoch": 0.8195024211053599, "grad_norm": 0.5595894810042473, "learning_rate": 9.114282338052695e-06, "loss": 0.0733, "step": 2454 }, { "epoch": 0.8198363666722325, "grad_norm": 0.3916737281680144, "learning_rate": 9.113177729580203e-06, "loss": 0.0303, "step": 2455 }, { "epoch": 0.820170312239105, "grad_norm": 0.264321562523143, "learning_rate": 9.112072499763783e-06, "loss": 0.0263, "step": 2456 }, { "epoch": 0.8205042578059776, "grad_norm": 0.43920788251930337, "learning_rate": 9.110966648770392e-06, "loss": 0.0367, "step": 2457 }, { "epoch": 0.8208382033728502, "grad_norm": 0.3810762130061277, "learning_rate": 9.109860176767085e-06, "loss": 0.0374, "step": 2458 }, { "epoch": 0.8211721489397228, "grad_norm": 0.37419432634766514, "learning_rate": 9.108753083921007e-06, "loss": 0.0269, "step": 2459 }, { "epoch": 0.8215060945065954, "grad_norm": 0.3937119028500666, "learning_rate": 9.107645370399395e-06, "loss": 0.035, "step": 2460 }, { "epoch": 0.821840040073468, "grad_norm": 0.3157086225779351, "learning_rate": 9.106537036369587e-06, "loss": 0.027, "step": 2461 }, { "epoch": 0.8221739856403406, "grad_norm": 0.36220649413296774, "learning_rate": 9.105428081999004e-06, "loss": 0.0377, "step": 2462 }, { "epoch": 0.8225079312072132, "grad_norm": 0.29694055654637075, "learning_rate": 9.10431850745517e-06, "loss": 0.0274, "step": 2463 }, { "epoch": 0.8228418767740858, "grad_norm": 0.5536074318217982, "learning_rate": 9.103208312905698e-06, "loss": 0.0339, "step": 2464 }, { "epoch": 0.8231758223409584, "grad_norm": 0.31589960925835464, "learning_rate": 9.102097498518299e-06, "loss": 0.0328, "step": 2465 }, { "epoch": 0.823509767907831, "grad_norm": 0.240873576082435, "learning_rate": 9.100986064460769e-06, "loss": 0.0253, "step": 2466 }, { "epoch": 0.8238437134747036, "grad_norm": 0.6390537219668236, "learning_rate": 9.099874010901009e-06, "loss": 0.0384, "step": 2467 }, { "epoch": 0.8241776590415762, "grad_norm": 0.4003867672451697, "learning_rate": 9.098761338007003e-06, "loss": 0.0421, "step": 2468 }, { "epoch": 0.8245116046084489, "grad_norm": 0.25997349608393655, "learning_rate": 9.097648045946837e-06, "loss": 0.0276, "step": 2469 }, { "epoch": 0.8248455501753215, "grad_norm": 0.37630087962921577, "learning_rate": 9.096534134888685e-06, "loss": 0.0397, "step": 2470 }, { "epoch": 0.8251794957421941, "grad_norm": 0.448552322720191, "learning_rate": 9.095419605000817e-06, "loss": 0.0542, "step": 2471 }, { "epoch": 0.8255134413090667, "grad_norm": 0.40038428905827006, "learning_rate": 9.094304456451596e-06, "loss": 0.0329, "step": 2472 }, { "epoch": 0.8258473868759392, "grad_norm": 0.35070676220854424, "learning_rate": 9.093188689409477e-06, "loss": 0.0305, "step": 2473 }, { "epoch": 0.8261813324428118, "grad_norm": 0.6567098569875411, "learning_rate": 9.09207230404301e-06, "loss": 0.0433, "step": 2474 }, { "epoch": 0.8265152780096844, "grad_norm": 0.4146248814385917, "learning_rate": 9.090955300520842e-06, "loss": 0.0359, "step": 2475 }, { "epoch": 0.826849223576557, "grad_norm": 0.53906952828287, "learning_rate": 9.089837679011704e-06, "loss": 0.0487, "step": 2476 }, { "epoch": 0.8271831691434296, "grad_norm": 0.3961223055800164, "learning_rate": 9.08871943968443e-06, "loss": 0.0465, "step": 2477 }, { "epoch": 0.8275171147103022, "grad_norm": 0.35789536394251836, "learning_rate": 9.08760058270794e-06, "loss": 0.0304, "step": 2478 }, { "epoch": 0.8278510602771748, "grad_norm": 0.5180262448696479, "learning_rate": 9.086481108251253e-06, "loss": 0.0475, "step": 2479 }, { "epoch": 0.8281850058440474, "grad_norm": 0.33508836274125614, "learning_rate": 9.085361016483477e-06, "loss": 0.0324, "step": 2480 }, { "epoch": 0.82851895141092, "grad_norm": 0.46774903934797496, "learning_rate": 9.084240307573816e-06, "loss": 0.0418, "step": 2481 }, { "epoch": 0.8288528969777926, "grad_norm": 0.4297754725269856, "learning_rate": 9.083118981691567e-06, "loss": 0.0387, "step": 2482 }, { "epoch": 0.8291868425446652, "grad_norm": 0.4137131594058549, "learning_rate": 9.081997039006117e-06, "loss": 0.023, "step": 2483 }, { "epoch": 0.8295207881115378, "grad_norm": 0.3159406308326085, "learning_rate": 9.080874479686952e-06, "loss": 0.0294, "step": 2484 }, { "epoch": 0.8298547336784105, "grad_norm": 0.4318463669325737, "learning_rate": 9.079751303903646e-06, "loss": 0.0388, "step": 2485 }, { "epoch": 0.8301886792452831, "grad_norm": 0.5132811378489065, "learning_rate": 9.078627511825866e-06, "loss": 0.0442, "step": 2486 }, { "epoch": 0.8305226248121557, "grad_norm": 0.369999066202171, "learning_rate": 9.077503103623379e-06, "loss": 0.0415, "step": 2487 }, { "epoch": 0.8308565703790283, "grad_norm": 0.35851599522502636, "learning_rate": 9.076378079466036e-06, "loss": 0.0292, "step": 2488 }, { "epoch": 0.8311905159459008, "grad_norm": 0.39661769308364786, "learning_rate": 9.075252439523785e-06, "loss": 0.0273, "step": 2489 }, { "epoch": 0.8315244615127734, "grad_norm": 0.6490263729376547, "learning_rate": 9.074126183966669e-06, "loss": 0.0459, "step": 2490 }, { "epoch": 0.831858407079646, "grad_norm": 0.4993562408027148, "learning_rate": 9.072999312964823e-06, "loss": 0.0322, "step": 2491 }, { "epoch": 0.8321923526465186, "grad_norm": 0.5590656530101995, "learning_rate": 9.071871826688472e-06, "loss": 0.037, "step": 2492 }, { "epoch": 0.8325262982133912, "grad_norm": 0.40610149648410226, "learning_rate": 9.070743725307937e-06, "loss": 0.0353, "step": 2493 }, { "epoch": 0.8328602437802638, "grad_norm": 0.47595095725779396, "learning_rate": 9.06961500899363e-06, "loss": 0.0434, "step": 2494 }, { "epoch": 0.8331941893471364, "grad_norm": 0.3155987530334695, "learning_rate": 9.068485677916059e-06, "loss": 0.0268, "step": 2495 }, { "epoch": 0.833528134914009, "grad_norm": 0.29476634712756444, "learning_rate": 9.06735573224582e-06, "loss": 0.0293, "step": 2496 }, { "epoch": 0.8338620804808816, "grad_norm": 0.45532023846265596, "learning_rate": 9.066225172153607e-06, "loss": 0.0383, "step": 2497 }, { "epoch": 0.8341960260477542, "grad_norm": 0.45987924413218256, "learning_rate": 9.065093997810204e-06, "loss": 0.0419, "step": 2498 }, { "epoch": 0.8345299716146268, "grad_norm": 0.3355220422575681, "learning_rate": 9.063962209386485e-06, "loss": 0.0332, "step": 2499 }, { "epoch": 0.8348639171814994, "grad_norm": 0.3454349722962344, "learning_rate": 9.062829807053426e-06, "loss": 0.0309, "step": 2500 }, { "epoch": 0.835197862748372, "grad_norm": 0.35926714475349847, "learning_rate": 9.061696790982086e-06, "loss": 0.0392, "step": 2501 }, { "epoch": 0.8355318083152447, "grad_norm": 0.4295126451366027, "learning_rate": 9.060563161343618e-06, "loss": 0.0349, "step": 2502 }, { "epoch": 0.8358657538821173, "grad_norm": 0.29260561635820886, "learning_rate": 9.059428918309276e-06, "loss": 0.0257, "step": 2503 }, { "epoch": 0.8361996994489899, "grad_norm": 0.33008995795141166, "learning_rate": 9.058294062050396e-06, "loss": 0.046, "step": 2504 }, { "epoch": 0.8365336450158624, "grad_norm": 0.2517921444583666, "learning_rate": 9.057158592738414e-06, "loss": 0.0208, "step": 2505 }, { "epoch": 0.836867590582735, "grad_norm": 0.31092700651231453, "learning_rate": 9.056022510544855e-06, "loss": 0.0264, "step": 2506 }, { "epoch": 0.8372015361496076, "grad_norm": 0.3313582044848553, "learning_rate": 9.054885815641336e-06, "loss": 0.0388, "step": 2507 }, { "epoch": 0.8375354817164802, "grad_norm": 0.39183559708166843, "learning_rate": 9.05374850819957e-06, "loss": 0.0335, "step": 2508 }, { "epoch": 0.8378694272833528, "grad_norm": 0.3097698801810843, "learning_rate": 9.052610588391363e-06, "loss": 0.0304, "step": 2509 }, { "epoch": 0.8382033728502254, "grad_norm": 0.46386041507843345, "learning_rate": 9.051472056388606e-06, "loss": 0.0411, "step": 2510 }, { "epoch": 0.838537318417098, "grad_norm": 0.3402621546983913, "learning_rate": 9.050332912363292e-06, "loss": 0.0464, "step": 2511 }, { "epoch": 0.8388712639839706, "grad_norm": 0.42905597080581825, "learning_rate": 9.049193156487501e-06, "loss": 0.0347, "step": 2512 }, { "epoch": 0.8392052095508432, "grad_norm": 0.36758534229327194, "learning_rate": 9.048052788933405e-06, "loss": 0.0373, "step": 2513 }, { "epoch": 0.8395391551177158, "grad_norm": 0.3109644251678908, "learning_rate": 9.046911809873271e-06, "loss": 0.0359, "step": 2514 }, { "epoch": 0.8398731006845884, "grad_norm": 0.270042973528763, "learning_rate": 9.045770219479457e-06, "loss": 0.0283, "step": 2515 }, { "epoch": 0.840207046251461, "grad_norm": 0.3615870964014073, "learning_rate": 9.044628017924415e-06, "loss": 0.0387, "step": 2516 }, { "epoch": 0.8405409918183336, "grad_norm": 0.4431650479128026, "learning_rate": 9.043485205380685e-06, "loss": 0.0424, "step": 2517 }, { "epoch": 0.8408749373852062, "grad_norm": 0.3106387026167272, "learning_rate": 9.042341782020906e-06, "loss": 0.0296, "step": 2518 }, { "epoch": 0.8412088829520789, "grad_norm": 0.4525540474250504, "learning_rate": 9.041197748017802e-06, "loss": 0.0285, "step": 2519 }, { "epoch": 0.8415428285189515, "grad_norm": 0.4281303859257426, "learning_rate": 9.040053103544196e-06, "loss": 0.0309, "step": 2520 }, { "epoch": 0.8418767740858241, "grad_norm": 0.34463536752834206, "learning_rate": 9.038907848772999e-06, "loss": 0.0291, "step": 2521 }, { "epoch": 0.8422107196526966, "grad_norm": 0.4978034332914274, "learning_rate": 9.037761983877214e-06, "loss": 0.0403, "step": 2522 }, { "epoch": 0.8425446652195692, "grad_norm": 0.2935442990809466, "learning_rate": 9.036615509029939e-06, "loss": 0.0277, "step": 2523 }, { "epoch": 0.8428786107864418, "grad_norm": 0.3939069379966287, "learning_rate": 9.035468424404362e-06, "loss": 0.0447, "step": 2524 }, { "epoch": 0.8432125563533144, "grad_norm": 0.2692561881254716, "learning_rate": 9.034320730173762e-06, "loss": 0.032, "step": 2525 }, { "epoch": 0.843546501920187, "grad_norm": 0.40534574419674485, "learning_rate": 9.033172426511515e-06, "loss": 0.0338, "step": 2526 }, { "epoch": 0.8438804474870596, "grad_norm": 0.38824482878056465, "learning_rate": 9.032023513591081e-06, "loss": 0.0341, "step": 2527 }, { "epoch": 0.8442143930539322, "grad_norm": 0.3807926866614681, "learning_rate": 9.030873991586021e-06, "loss": 0.0328, "step": 2528 }, { "epoch": 0.8445483386208048, "grad_norm": 0.3329664612627948, "learning_rate": 9.029723860669983e-06, "loss": 0.0333, "step": 2529 }, { "epoch": 0.8448822841876774, "grad_norm": 0.4923109783222729, "learning_rate": 9.028573121016707e-06, "loss": 0.0469, "step": 2530 }, { "epoch": 0.84521622975455, "grad_norm": 0.4685760628222625, "learning_rate": 9.027421772800027e-06, "loss": 0.0455, "step": 2531 }, { "epoch": 0.8455501753214226, "grad_norm": 0.3429466888956345, "learning_rate": 9.026269816193867e-06, "loss": 0.0326, "step": 2532 }, { "epoch": 0.8458841208882952, "grad_norm": 0.41156545406048173, "learning_rate": 9.025117251372242e-06, "loss": 0.0389, "step": 2533 }, { "epoch": 0.8462180664551678, "grad_norm": 0.38520743067716473, "learning_rate": 9.023964078509263e-06, "loss": 0.0468, "step": 2534 }, { "epoch": 0.8465520120220404, "grad_norm": 0.4426812965109251, "learning_rate": 9.022810297779129e-06, "loss": 0.0314, "step": 2535 }, { "epoch": 0.846885957588913, "grad_norm": 0.4054259111233482, "learning_rate": 9.021655909356133e-06, "loss": 0.0319, "step": 2536 }, { "epoch": 0.8472199031557857, "grad_norm": 0.275475936813345, "learning_rate": 9.020500913414658e-06, "loss": 0.0313, "step": 2537 }, { "epoch": 0.8475538487226582, "grad_norm": 0.5625464771479998, "learning_rate": 9.019345310129179e-06, "loss": 0.0343, "step": 2538 }, { "epoch": 0.8478877942895308, "grad_norm": 0.37284068266175235, "learning_rate": 9.018189099674266e-06, "loss": 0.0393, "step": 2539 }, { "epoch": 0.8482217398564034, "grad_norm": 0.39007837088611796, "learning_rate": 9.017032282224577e-06, "loss": 0.0318, "step": 2540 }, { "epoch": 0.848555685423276, "grad_norm": 0.2668760370278359, "learning_rate": 9.015874857954863e-06, "loss": 0.0243, "step": 2541 }, { "epoch": 0.8488896309901486, "grad_norm": 0.3726617870078607, "learning_rate": 9.014716827039965e-06, "loss": 0.0419, "step": 2542 }, { "epoch": 0.8492235765570212, "grad_norm": 0.3365880533500763, "learning_rate": 9.013558189654819e-06, "loss": 0.0324, "step": 2543 }, { "epoch": 0.8495575221238938, "grad_norm": 0.3724418543681984, "learning_rate": 9.01239894597445e-06, "loss": 0.0321, "step": 2544 }, { "epoch": 0.8498914676907664, "grad_norm": 0.36786990425937205, "learning_rate": 9.011239096173977e-06, "loss": 0.026, "step": 2545 }, { "epoch": 0.850225413257639, "grad_norm": 0.3369252086446057, "learning_rate": 9.010078640428606e-06, "loss": 0.0294, "step": 2546 }, { "epoch": 0.8505593588245116, "grad_norm": 0.41643199528600433, "learning_rate": 9.00891757891364e-06, "loss": 0.0359, "step": 2547 }, { "epoch": 0.8508933043913842, "grad_norm": 0.3551459626507266, "learning_rate": 9.007755911804471e-06, "loss": 0.0416, "step": 2548 }, { "epoch": 0.8512272499582568, "grad_norm": 0.5818091704775687, "learning_rate": 9.006593639276582e-06, "loss": 0.0388, "step": 2549 }, { "epoch": 0.8515611955251294, "grad_norm": 0.4401470784045499, "learning_rate": 9.005430761505548e-06, "loss": 0.0379, "step": 2550 }, { "epoch": 0.851895141092002, "grad_norm": 0.34058136620945995, "learning_rate": 9.004267278667032e-06, "loss": 0.037, "step": 2551 }, { "epoch": 0.8522290866588746, "grad_norm": 0.2688628258869588, "learning_rate": 9.003103190936797e-06, "loss": 0.0233, "step": 2552 }, { "epoch": 0.8525630322257473, "grad_norm": 0.31540272385338997, "learning_rate": 9.00193849849069e-06, "loss": 0.026, "step": 2553 }, { "epoch": 0.8528969777926197, "grad_norm": 0.3048526266868853, "learning_rate": 9.00077320150465e-06, "loss": 0.0309, "step": 2554 }, { "epoch": 0.8532309233594924, "grad_norm": 0.6693321639654497, "learning_rate": 8.999607300154712e-06, "loss": 0.0421, "step": 2555 }, { "epoch": 0.853564868926365, "grad_norm": 0.25290515928318363, "learning_rate": 8.998440794616998e-06, "loss": 0.0235, "step": 2556 }, { "epoch": 0.8538988144932376, "grad_norm": 0.3147693099430538, "learning_rate": 8.99727368506772e-06, "loss": 0.0276, "step": 2557 }, { "epoch": 0.8542327600601102, "grad_norm": 0.7088509591714547, "learning_rate": 8.996105971683187e-06, "loss": 0.0278, "step": 2558 }, { "epoch": 0.8545667056269828, "grad_norm": 0.6308194130049286, "learning_rate": 8.994937654639793e-06, "loss": 0.0442, "step": 2559 }, { "epoch": 0.8549006511938554, "grad_norm": 0.44898239754939884, "learning_rate": 8.993768734114029e-06, "loss": 0.0309, "step": 2560 }, { "epoch": 0.855234596760728, "grad_norm": 0.28749219626684475, "learning_rate": 8.992599210282471e-06, "loss": 0.0315, "step": 2561 }, { "epoch": 0.8555685423276006, "grad_norm": 0.3812508704110184, "learning_rate": 8.991429083321792e-06, "loss": 0.0312, "step": 2562 }, { "epoch": 0.8559024878944732, "grad_norm": 1.7031389079076569, "learning_rate": 8.990258353408754e-06, "loss": 0.0459, "step": 2563 }, { "epoch": 0.8562364334613458, "grad_norm": 0.31544007516151007, "learning_rate": 8.989087020720204e-06, "loss": 0.0371, "step": 2564 }, { "epoch": 0.8565703790282184, "grad_norm": 0.38767338172827265, "learning_rate": 8.987915085433092e-06, "loss": 0.0335, "step": 2565 }, { "epoch": 0.856904324595091, "grad_norm": 0.3000791654786373, "learning_rate": 8.98674254772445e-06, "loss": 0.0318, "step": 2566 }, { "epoch": 0.8572382701619636, "grad_norm": 0.31229740046048454, "learning_rate": 8.985569407771404e-06, "loss": 0.0303, "step": 2567 }, { "epoch": 0.8575722157288362, "grad_norm": 0.49339490655013896, "learning_rate": 8.984395665751169e-06, "loss": 0.0421, "step": 2568 }, { "epoch": 0.8579061612957088, "grad_norm": 0.36816145730214384, "learning_rate": 8.983221321841056e-06, "loss": 0.0328, "step": 2569 }, { "epoch": 0.8582401068625815, "grad_norm": 0.37529493325365043, "learning_rate": 8.98204637621846e-06, "loss": 0.0331, "step": 2570 }, { "epoch": 0.858574052429454, "grad_norm": 0.45748221301458636, "learning_rate": 8.980870829060872e-06, "loss": 0.0321, "step": 2571 }, { "epoch": 0.8589079979963266, "grad_norm": 0.5329144888857236, "learning_rate": 8.979694680545872e-06, "loss": 0.0339, "step": 2572 }, { "epoch": 0.8592419435631992, "grad_norm": 0.33675903890161457, "learning_rate": 8.978517930851132e-06, "loss": 0.0284, "step": 2573 }, { "epoch": 0.8595758891300718, "grad_norm": 0.44303915509008357, "learning_rate": 8.977340580154411e-06, "loss": 0.0367, "step": 2574 }, { "epoch": 0.8599098346969444, "grad_norm": 0.4514433634093268, "learning_rate": 8.976162628633565e-06, "loss": 0.0457, "step": 2575 }, { "epoch": 0.860243780263817, "grad_norm": 0.43410756328034344, "learning_rate": 8.974984076466537e-06, "loss": 0.035, "step": 2576 }, { "epoch": 0.8605777258306896, "grad_norm": 0.3872709268496956, "learning_rate": 8.97380492383136e-06, "loss": 0.0305, "step": 2577 }, { "epoch": 0.8609116713975622, "grad_norm": 0.47725018597115404, "learning_rate": 8.972625170906157e-06, "loss": 0.0483, "step": 2578 }, { "epoch": 0.8612456169644348, "grad_norm": 0.31772576137655567, "learning_rate": 8.971444817869148e-06, "loss": 0.0283, "step": 2579 }, { "epoch": 0.8615795625313074, "grad_norm": 0.37107286537717504, "learning_rate": 8.970263864898636e-06, "loss": 0.0413, "step": 2580 }, { "epoch": 0.86191350809818, "grad_norm": 0.5725590523682224, "learning_rate": 8.969082312173021e-06, "loss": 0.0423, "step": 2581 }, { "epoch": 0.8622474536650526, "grad_norm": 0.3941374912936759, "learning_rate": 8.967900159870787e-06, "loss": 0.0295, "step": 2582 }, { "epoch": 0.8625813992319252, "grad_norm": 0.36191755909944073, "learning_rate": 8.966717408170512e-06, "loss": 0.031, "step": 2583 }, { "epoch": 0.8629153447987978, "grad_norm": 0.3429675161089552, "learning_rate": 8.965534057250866e-06, "loss": 0.0433, "step": 2584 }, { "epoch": 0.8632492903656704, "grad_norm": 0.3028077737377305, "learning_rate": 8.964350107290609e-06, "loss": 0.0297, "step": 2585 }, { "epoch": 0.863583235932543, "grad_norm": 0.4830327842147132, "learning_rate": 8.96316555846859e-06, "loss": 0.0399, "step": 2586 }, { "epoch": 0.8639171814994155, "grad_norm": 0.26916796497004913, "learning_rate": 8.961980410963749e-06, "loss": 0.02, "step": 2587 }, { "epoch": 0.8642511270662881, "grad_norm": 0.4134466211792293, "learning_rate": 8.960794664955115e-06, "loss": 0.041, "step": 2588 }, { "epoch": 0.8645850726331608, "grad_norm": 0.32659389902593844, "learning_rate": 8.95960832062181e-06, "loss": 0.029, "step": 2589 }, { "epoch": 0.8649190182000334, "grad_norm": 0.7694702843000258, "learning_rate": 8.958421378143046e-06, "loss": 0.0386, "step": 2590 }, { "epoch": 0.865252963766906, "grad_norm": 0.5168924274103711, "learning_rate": 8.957233837698122e-06, "loss": 0.0411, "step": 2591 }, { "epoch": 0.8655869093337786, "grad_norm": 0.43121061815199463, "learning_rate": 8.956045699466433e-06, "loss": 0.0525, "step": 2592 }, { "epoch": 0.8659208549006512, "grad_norm": 0.3314527135026029, "learning_rate": 8.95485696362746e-06, "loss": 0.0358, "step": 2593 }, { "epoch": 0.8662548004675238, "grad_norm": 0.463869796250436, "learning_rate": 8.953667630360778e-06, "loss": 0.036, "step": 2594 }, { "epoch": 0.8665887460343964, "grad_norm": 0.5740069960790621, "learning_rate": 8.952477699846044e-06, "loss": 0.0433, "step": 2595 }, { "epoch": 0.866922691601269, "grad_norm": 0.6188270632194969, "learning_rate": 8.951287172263018e-06, "loss": 0.0291, "step": 2596 }, { "epoch": 0.8672566371681416, "grad_norm": 0.422883593465061, "learning_rate": 8.950096047791539e-06, "loss": 0.0431, "step": 2597 }, { "epoch": 0.8675905827350142, "grad_norm": 0.6846113727170556, "learning_rate": 8.94890432661154e-06, "loss": 0.0496, "step": 2598 }, { "epoch": 0.8679245283018868, "grad_norm": 0.3031987176226039, "learning_rate": 8.947712008903045e-06, "loss": 0.0307, "step": 2599 }, { "epoch": 0.8682584738687594, "grad_norm": 0.56108161337743, "learning_rate": 8.946519094846169e-06, "loss": 0.0335, "step": 2600 }, { "epoch": 0.868592419435632, "grad_norm": 0.37223617309309653, "learning_rate": 8.945325584621116e-06, "loss": 0.0297, "step": 2601 }, { "epoch": 0.8689263650025046, "grad_norm": 0.5549951575258364, "learning_rate": 8.944131478408177e-06, "loss": 0.0449, "step": 2602 }, { "epoch": 0.8692603105693771, "grad_norm": 0.45803541602321984, "learning_rate": 8.942936776387739e-06, "loss": 0.0352, "step": 2603 }, { "epoch": 0.8695942561362497, "grad_norm": 0.4264773699081465, "learning_rate": 8.941741478740272e-06, "loss": 0.0369, "step": 2604 }, { "epoch": 0.8699282017031224, "grad_norm": 0.3191449035292794, "learning_rate": 8.940545585646344e-06, "loss": 0.0297, "step": 2605 }, { "epoch": 0.870262147269995, "grad_norm": 0.3198982569663816, "learning_rate": 8.939349097286608e-06, "loss": 0.0265, "step": 2606 }, { "epoch": 0.8705960928368676, "grad_norm": 0.5553117913988942, "learning_rate": 8.938152013841803e-06, "loss": 0.0366, "step": 2607 }, { "epoch": 0.8709300384037402, "grad_norm": 0.44719504860903736, "learning_rate": 8.93695433549277e-06, "loss": 0.0376, "step": 2608 }, { "epoch": 0.8712639839706128, "grad_norm": 0.32679155125163895, "learning_rate": 8.935756062420426e-06, "loss": 0.0346, "step": 2609 }, { "epoch": 0.8715979295374854, "grad_norm": 0.4609210191387648, "learning_rate": 8.934557194805787e-06, "loss": 0.0325, "step": 2610 }, { "epoch": 0.871931875104358, "grad_norm": 0.4706308256660193, "learning_rate": 8.933357732829957e-06, "loss": 0.0397, "step": 2611 }, { "epoch": 0.8722658206712306, "grad_norm": 0.40300368032655215, "learning_rate": 8.932157676674126e-06, "loss": 0.0292, "step": 2612 }, { "epoch": 0.8725997662381032, "grad_norm": 0.28732447433061004, "learning_rate": 8.93095702651958e-06, "loss": 0.0305, "step": 2613 }, { "epoch": 0.8729337118049758, "grad_norm": 0.36369387012379667, "learning_rate": 8.92975578254769e-06, "loss": 0.0504, "step": 2614 }, { "epoch": 0.8732676573718484, "grad_norm": 0.4591704692481582, "learning_rate": 8.928553944939915e-06, "loss": 0.0402, "step": 2615 }, { "epoch": 0.873601602938721, "grad_norm": 0.33481246908498963, "learning_rate": 8.92735151387781e-06, "loss": 0.0287, "step": 2616 }, { "epoch": 0.8739355485055936, "grad_norm": 0.357360199668993, "learning_rate": 8.926148489543018e-06, "loss": 0.0362, "step": 2617 }, { "epoch": 0.8742694940724662, "grad_norm": 0.39042917680217704, "learning_rate": 8.924944872117264e-06, "loss": 0.0391, "step": 2618 }, { "epoch": 0.8746034396393388, "grad_norm": 0.28540601454276765, "learning_rate": 8.923740661782376e-06, "loss": 0.0287, "step": 2619 }, { "epoch": 0.8749373852062113, "grad_norm": 0.4140095320834736, "learning_rate": 8.92253585872026e-06, "loss": 0.0326, "step": 2620 }, { "epoch": 0.8752713307730839, "grad_norm": 0.4121447335969474, "learning_rate": 8.921330463112915e-06, "loss": 0.0255, "step": 2621 }, { "epoch": 0.8756052763399566, "grad_norm": 0.3554290077776676, "learning_rate": 8.92012447514243e-06, "loss": 0.031, "step": 2622 }, { "epoch": 0.8759392219068292, "grad_norm": 0.7307532156262747, "learning_rate": 8.918917894990989e-06, "loss": 0.0362, "step": 2623 }, { "epoch": 0.8762731674737018, "grad_norm": 0.2553936615311305, "learning_rate": 8.917710722840853e-06, "loss": 0.0178, "step": 2624 }, { "epoch": 0.8766071130405744, "grad_norm": 0.37407512446904145, "learning_rate": 8.916502958874385e-06, "loss": 0.0363, "step": 2625 }, { "epoch": 0.876941058607447, "grad_norm": 0.42475876395394646, "learning_rate": 8.915294603274027e-06, "loss": 0.0368, "step": 2626 }, { "epoch": 0.8772750041743196, "grad_norm": 0.3358353924527036, "learning_rate": 8.91408565622232e-06, "loss": 0.0446, "step": 2627 }, { "epoch": 0.8776089497411922, "grad_norm": 0.40566349975881943, "learning_rate": 8.912876117901887e-06, "loss": 0.0372, "step": 2628 }, { "epoch": 0.8779428953080648, "grad_norm": 0.3742251940601137, "learning_rate": 8.911665988495446e-06, "loss": 0.0417, "step": 2629 }, { "epoch": 0.8782768408749374, "grad_norm": 0.3712879563967958, "learning_rate": 8.910455268185795e-06, "loss": 0.0339, "step": 2630 }, { "epoch": 0.87861078644181, "grad_norm": 0.34386298764350637, "learning_rate": 8.909243957155835e-06, "loss": 0.0326, "step": 2631 }, { "epoch": 0.8789447320086826, "grad_norm": 0.3236861653841093, "learning_rate": 8.908032055588544e-06, "loss": 0.0474, "step": 2632 }, { "epoch": 0.8792786775755552, "grad_norm": 0.26739114830091093, "learning_rate": 8.906819563666997e-06, "loss": 0.034, "step": 2633 }, { "epoch": 0.8796126231424278, "grad_norm": 0.3358869564481119, "learning_rate": 8.905606481574351e-06, "loss": 0.0342, "step": 2634 }, { "epoch": 0.8799465687093004, "grad_norm": 0.2755597387425229, "learning_rate": 8.90439280949386e-06, "loss": 0.0267, "step": 2635 }, { "epoch": 0.8802805142761729, "grad_norm": 0.3766510920867843, "learning_rate": 8.903178547608863e-06, "loss": 0.0296, "step": 2636 }, { "epoch": 0.8806144598430455, "grad_norm": 0.34793544795872505, "learning_rate": 8.901963696102788e-06, "loss": 0.034, "step": 2637 }, { "epoch": 0.8809484054099181, "grad_norm": 0.275487457116531, "learning_rate": 8.900748255159152e-06, "loss": 0.0267, "step": 2638 }, { "epoch": 0.8812823509767908, "grad_norm": 0.24705015902803695, "learning_rate": 8.899532224961562e-06, "loss": 0.0227, "step": 2639 }, { "epoch": 0.8816162965436634, "grad_norm": 0.6536853667142282, "learning_rate": 8.898315605693715e-06, "loss": 0.0438, "step": 2640 }, { "epoch": 0.881950242110536, "grad_norm": 0.43430130069824485, "learning_rate": 8.897098397539394e-06, "loss": 0.0317, "step": 2641 }, { "epoch": 0.8822841876774086, "grad_norm": 0.43687515508839014, "learning_rate": 8.895880600682472e-06, "loss": 0.0384, "step": 2642 }, { "epoch": 0.8826181332442812, "grad_norm": 0.39590306979153195, "learning_rate": 8.894662215306913e-06, "loss": 0.033, "step": 2643 }, { "epoch": 0.8829520788111538, "grad_norm": 0.7290575707828736, "learning_rate": 8.89344324159677e-06, "loss": 0.0364, "step": 2644 }, { "epoch": 0.8832860243780264, "grad_norm": 0.3751669108911836, "learning_rate": 8.89222367973618e-06, "loss": 0.0311, "step": 2645 }, { "epoch": 0.883619969944899, "grad_norm": 0.34397576939891117, "learning_rate": 8.891003529909375e-06, "loss": 0.0371, "step": 2646 }, { "epoch": 0.8839539155117716, "grad_norm": 0.4109270316750449, "learning_rate": 8.889782792300672e-06, "loss": 0.0301, "step": 2647 }, { "epoch": 0.8842878610786442, "grad_norm": 0.30637162441557436, "learning_rate": 8.888561467094476e-06, "loss": 0.0275, "step": 2648 }, { "epoch": 0.8846218066455168, "grad_norm": 0.30622959236678143, "learning_rate": 8.887339554475284e-06, "loss": 0.0326, "step": 2649 }, { "epoch": 0.8849557522123894, "grad_norm": 0.3161478933617422, "learning_rate": 8.886117054627682e-06, "loss": 0.032, "step": 2650 }, { "epoch": 0.885289697779262, "grad_norm": 0.4208502399428019, "learning_rate": 8.88489396773634e-06, "loss": 0.0242, "step": 2651 }, { "epoch": 0.8856236433461345, "grad_norm": 0.3681407208121261, "learning_rate": 8.883670293986019e-06, "loss": 0.0298, "step": 2652 }, { "epoch": 0.8859575889130071, "grad_norm": 0.34878417508861853, "learning_rate": 8.882446033561576e-06, "loss": 0.033, "step": 2653 }, { "epoch": 0.8862915344798797, "grad_norm": 0.4684780724338432, "learning_rate": 8.881221186647941e-06, "loss": 0.045, "step": 2654 }, { "epoch": 0.8866254800467523, "grad_norm": 0.3957874804275106, "learning_rate": 8.879995753430148e-06, "loss": 0.0347, "step": 2655 }, { "epoch": 0.886959425613625, "grad_norm": 0.42665252936857895, "learning_rate": 8.878769734093312e-06, "loss": 0.032, "step": 2656 }, { "epoch": 0.8872933711804976, "grad_norm": 0.29682241926451486, "learning_rate": 8.877543128822634e-06, "loss": 0.0298, "step": 2657 }, { "epoch": 0.8876273167473702, "grad_norm": 0.3651957801300177, "learning_rate": 8.876315937803413e-06, "loss": 0.0323, "step": 2658 }, { "epoch": 0.8879612623142428, "grad_norm": 0.2769485137576066, "learning_rate": 8.875088161221025e-06, "loss": 0.0307, "step": 2659 }, { "epoch": 0.8882952078811154, "grad_norm": 0.4415688094211487, "learning_rate": 8.873859799260944e-06, "loss": 0.0353, "step": 2660 }, { "epoch": 0.888629153447988, "grad_norm": 0.33974912444710537, "learning_rate": 8.872630852108725e-06, "loss": 0.0358, "step": 2661 }, { "epoch": 0.8889630990148606, "grad_norm": 0.2861482838286048, "learning_rate": 8.87140131995002e-06, "loss": 0.0331, "step": 2662 }, { "epoch": 0.8892970445817332, "grad_norm": 0.33208570178611857, "learning_rate": 8.870171202970559e-06, "loss": 0.0248, "step": 2663 }, { "epoch": 0.8896309901486058, "grad_norm": 0.35668018176898314, "learning_rate": 8.868940501356169e-06, "loss": 0.0359, "step": 2664 }, { "epoch": 0.8899649357154784, "grad_norm": 0.2931305648052054, "learning_rate": 8.86770921529276e-06, "loss": 0.0255, "step": 2665 }, { "epoch": 0.890298881282351, "grad_norm": 0.8325037062693247, "learning_rate": 8.866477344966334e-06, "loss": 0.0302, "step": 2666 }, { "epoch": 0.8906328268492236, "grad_norm": 0.27860169435527826, "learning_rate": 8.865244890562978e-06, "loss": 0.0297, "step": 2667 }, { "epoch": 0.8909667724160962, "grad_norm": 0.49230324299758665, "learning_rate": 8.864011852268872e-06, "loss": 0.0431, "step": 2668 }, { "epoch": 0.8913007179829687, "grad_norm": 0.3231549530220241, "learning_rate": 8.862778230270276e-06, "loss": 0.028, "step": 2669 }, { "epoch": 0.8916346635498413, "grad_norm": 0.4893422631717919, "learning_rate": 8.861544024753545e-06, "loss": 0.0638, "step": 2670 }, { "epoch": 0.8919686091167139, "grad_norm": 0.4412187905975192, "learning_rate": 8.860309235905122e-06, "loss": 0.0423, "step": 2671 }, { "epoch": 0.8923025546835865, "grad_norm": 0.33046413914807304, "learning_rate": 8.859073863911536e-06, "loss": 0.0459, "step": 2672 }, { "epoch": 0.8926365002504592, "grad_norm": 0.3452233475138895, "learning_rate": 8.857837908959404e-06, "loss": 0.0382, "step": 2673 }, { "epoch": 0.8929704458173318, "grad_norm": 0.37413800958104054, "learning_rate": 8.856601371235429e-06, "loss": 0.0342, "step": 2674 }, { "epoch": 0.8933043913842044, "grad_norm": 0.4060950887816249, "learning_rate": 8.855364250926409e-06, "loss": 0.0424, "step": 2675 }, { "epoch": 0.893638336951077, "grad_norm": 0.35810437403999507, "learning_rate": 8.854126548219222e-06, "loss": 0.024, "step": 2676 }, { "epoch": 0.8939722825179496, "grad_norm": 0.5647203968417588, "learning_rate": 8.85288826330084e-06, "loss": 0.0397, "step": 2677 }, { "epoch": 0.8943062280848222, "grad_norm": 0.3599785221058561, "learning_rate": 8.85164939635832e-06, "loss": 0.0269, "step": 2678 }, { "epoch": 0.8946401736516948, "grad_norm": 0.3624847754468508, "learning_rate": 8.850409947578806e-06, "loss": 0.0363, "step": 2679 }, { "epoch": 0.8949741192185674, "grad_norm": 0.3693969366357945, "learning_rate": 8.849169917149532e-06, "loss": 0.0333, "step": 2680 }, { "epoch": 0.89530806478544, "grad_norm": 0.323426425859245, "learning_rate": 8.847929305257821e-06, "loss": 0.0341, "step": 2681 }, { "epoch": 0.8956420103523126, "grad_norm": 0.4534456123344399, "learning_rate": 8.846688112091078e-06, "loss": 0.0354, "step": 2682 }, { "epoch": 0.8959759559191852, "grad_norm": 0.43091063353507614, "learning_rate": 8.845446337836805e-06, "loss": 0.0306, "step": 2683 }, { "epoch": 0.8963099014860578, "grad_norm": 0.4524258538049501, "learning_rate": 8.844203982682583e-06, "loss": 0.0394, "step": 2684 }, { "epoch": 0.8966438470529303, "grad_norm": 0.436855081098346, "learning_rate": 8.842961046816085e-06, "loss": 0.0378, "step": 2685 }, { "epoch": 0.8969777926198029, "grad_norm": 0.31495273658629613, "learning_rate": 8.841717530425071e-06, "loss": 0.0311, "step": 2686 }, { "epoch": 0.8973117381866755, "grad_norm": 0.3807648276384131, "learning_rate": 8.84047343369739e-06, "loss": 0.032, "step": 2687 }, { "epoch": 0.8976456837535481, "grad_norm": 0.5188136392000254, "learning_rate": 8.839228756820977e-06, "loss": 0.0546, "step": 2688 }, { "epoch": 0.8979796293204207, "grad_norm": 0.8778263519222621, "learning_rate": 8.837983499983856e-06, "loss": 0.0482, "step": 2689 }, { "epoch": 0.8983135748872934, "grad_norm": 0.3485019158278814, "learning_rate": 8.836737663374135e-06, "loss": 0.0457, "step": 2690 }, { "epoch": 0.898647520454166, "grad_norm": 0.314559173548534, "learning_rate": 8.835491247180012e-06, "loss": 0.0283, "step": 2691 }, { "epoch": 0.8989814660210386, "grad_norm": 0.2484352935221926, "learning_rate": 8.834244251589778e-06, "loss": 0.021, "step": 2692 }, { "epoch": 0.8993154115879112, "grad_norm": 0.5245493546269812, "learning_rate": 8.832996676791802e-06, "loss": 0.0511, "step": 2693 }, { "epoch": 0.8996493571547838, "grad_norm": 0.3573273074502577, "learning_rate": 8.831748522974545e-06, "loss": 0.0271, "step": 2694 }, { "epoch": 0.8999833027216564, "grad_norm": 0.5556433337685233, "learning_rate": 8.830499790326556e-06, "loss": 0.0342, "step": 2695 }, { "epoch": 0.900317248288529, "grad_norm": 0.25571973762062844, "learning_rate": 8.829250479036473e-06, "loss": 0.032, "step": 2696 }, { "epoch": 0.9006511938554016, "grad_norm": 0.4030184084917132, "learning_rate": 8.828000589293016e-06, "loss": 0.0408, "step": 2697 }, { "epoch": 0.9009851394222742, "grad_norm": 0.3486286771438965, "learning_rate": 8.826750121284998e-06, "loss": 0.0311, "step": 2698 }, { "epoch": 0.9013190849891468, "grad_norm": 0.40150810429876416, "learning_rate": 8.825499075201314e-06, "loss": 0.0371, "step": 2699 }, { "epoch": 0.9016530305560194, "grad_norm": 0.3876708079046449, "learning_rate": 8.824247451230949e-06, "loss": 0.0381, "step": 2700 }, { "epoch": 0.9019869761228919, "grad_norm": 0.37203978664868487, "learning_rate": 8.82299524956298e-06, "loss": 0.0316, "step": 2701 }, { "epoch": 0.9023209216897645, "grad_norm": 0.2943335898487093, "learning_rate": 8.821742470386565e-06, "loss": 0.0196, "step": 2702 }, { "epoch": 0.9026548672566371, "grad_norm": 0.374572155840365, "learning_rate": 8.820489113890949e-06, "loss": 0.0316, "step": 2703 }, { "epoch": 0.9029888128235097, "grad_norm": 0.2859222579401617, "learning_rate": 8.819235180265468e-06, "loss": 0.0282, "step": 2704 }, { "epoch": 0.9033227583903823, "grad_norm": 0.2638982265530527, "learning_rate": 8.817980669699544e-06, "loss": 0.0223, "step": 2705 }, { "epoch": 0.903656703957255, "grad_norm": 0.3631531219422456, "learning_rate": 8.816725582382681e-06, "loss": 0.037, "step": 2706 }, { "epoch": 0.9039906495241276, "grad_norm": 0.28931984035548053, "learning_rate": 8.815469918504482e-06, "loss": 0.0289, "step": 2707 }, { "epoch": 0.9043245950910002, "grad_norm": 0.4812472575051047, "learning_rate": 8.814213678254624e-06, "loss": 0.047, "step": 2708 }, { "epoch": 0.9046585406578728, "grad_norm": 0.3354390917849965, "learning_rate": 8.81295686182288e-06, "loss": 0.0299, "step": 2709 }, { "epoch": 0.9049924862247454, "grad_norm": 0.3643640891783243, "learning_rate": 8.811699469399106e-06, "loss": 0.0318, "step": 2710 }, { "epoch": 0.905326431791618, "grad_norm": 0.5331678735002254, "learning_rate": 8.810441501173245e-06, "loss": 0.0381, "step": 2711 }, { "epoch": 0.9056603773584906, "grad_norm": 0.3199955531031879, "learning_rate": 8.809182957335329e-06, "loss": 0.0358, "step": 2712 }, { "epoch": 0.9059943229253632, "grad_norm": 0.3128292176535784, "learning_rate": 8.807923838075476e-06, "loss": 0.026, "step": 2713 }, { "epoch": 0.9063282684922358, "grad_norm": 0.4896501773103624, "learning_rate": 8.80666414358389e-06, "loss": 0.0442, "step": 2714 }, { "epoch": 0.9066622140591084, "grad_norm": 0.4854663678009563, "learning_rate": 8.805403874050864e-06, "loss": 0.0296, "step": 2715 }, { "epoch": 0.906996159625981, "grad_norm": 0.40495799536236976, "learning_rate": 8.804143029666775e-06, "loss": 0.0317, "step": 2716 }, { "epoch": 0.9073301051928536, "grad_norm": 0.3624131473644916, "learning_rate": 8.802881610622089e-06, "loss": 0.0337, "step": 2717 }, { "epoch": 0.9076640507597261, "grad_norm": 0.3117849134150128, "learning_rate": 8.801619617107359e-06, "loss": 0.0272, "step": 2718 }, { "epoch": 0.9079979963265987, "grad_norm": 0.45987734826481586, "learning_rate": 8.800357049313222e-06, "loss": 0.0431, "step": 2719 }, { "epoch": 0.9083319418934713, "grad_norm": 0.49114593678422225, "learning_rate": 8.799093907430406e-06, "loss": 0.0343, "step": 2720 }, { "epoch": 0.9086658874603439, "grad_norm": 0.2913713443180512, "learning_rate": 8.797830191649721e-06, "loss": 0.0316, "step": 2721 }, { "epoch": 0.9089998330272165, "grad_norm": 0.3934277531253081, "learning_rate": 8.796565902162069e-06, "loss": 0.039, "step": 2722 }, { "epoch": 0.9093337785940891, "grad_norm": 0.44386869889786956, "learning_rate": 8.795301039158433e-06, "loss": 0.0358, "step": 2723 }, { "epoch": 0.9096677241609618, "grad_norm": 0.45849877618788204, "learning_rate": 8.794035602829887e-06, "loss": 0.0458, "step": 2724 }, { "epoch": 0.9100016697278344, "grad_norm": 0.33113577952821344, "learning_rate": 8.792769593367591e-06, "loss": 0.0332, "step": 2725 }, { "epoch": 0.910335615294707, "grad_norm": 0.43368960494681885, "learning_rate": 8.79150301096279e-06, "loss": 0.0491, "step": 2726 }, { "epoch": 0.9106695608615796, "grad_norm": 0.32205062944743973, "learning_rate": 8.790235855806814e-06, "loss": 0.0318, "step": 2727 }, { "epoch": 0.9110035064284522, "grad_norm": 0.39229807110022535, "learning_rate": 8.788968128091084e-06, "loss": 0.024, "step": 2728 }, { "epoch": 0.9113374519953248, "grad_norm": 0.3983195921827163, "learning_rate": 8.787699828007104e-06, "loss": 0.0332, "step": 2729 }, { "epoch": 0.9116713975621974, "grad_norm": 0.3269993182645894, "learning_rate": 8.786430955746468e-06, "loss": 0.022, "step": 2730 }, { "epoch": 0.91200534312907, "grad_norm": 0.508155656233159, "learning_rate": 8.78516151150085e-06, "loss": 0.0315, "step": 2731 }, { "epoch": 0.9123392886959426, "grad_norm": 0.47003450333924607, "learning_rate": 8.783891495462018e-06, "loss": 0.0375, "step": 2732 }, { "epoch": 0.9126732342628152, "grad_norm": 0.3494164836588301, "learning_rate": 8.782620907821823e-06, "loss": 0.04, "step": 2733 }, { "epoch": 0.9130071798296877, "grad_norm": 0.3703653271950426, "learning_rate": 8.781349748772198e-06, "loss": 0.0297, "step": 2734 }, { "epoch": 0.9133411253965603, "grad_norm": 0.613808416623768, "learning_rate": 8.780078018505172e-06, "loss": 0.0356, "step": 2735 }, { "epoch": 0.9136750709634329, "grad_norm": 0.5787663724611068, "learning_rate": 8.778805717212853e-06, "loss": 0.0406, "step": 2736 }, { "epoch": 0.9140090165303055, "grad_norm": 0.3541068759399978, "learning_rate": 8.777532845087434e-06, "loss": 0.0327, "step": 2737 }, { "epoch": 0.9143429620971781, "grad_norm": 0.5111329190044734, "learning_rate": 8.776259402321201e-06, "loss": 0.0403, "step": 2738 }, { "epoch": 0.9146769076640507, "grad_norm": 0.5241498884651358, "learning_rate": 8.774985389106521e-06, "loss": 0.0367, "step": 2739 }, { "epoch": 0.9150108532309233, "grad_norm": 0.5279796385901651, "learning_rate": 8.77371080563585e-06, "loss": 0.0427, "step": 2740 }, { "epoch": 0.915344798797796, "grad_norm": 0.3122021748914317, "learning_rate": 8.772435652101726e-06, "loss": 0.0329, "step": 2741 }, { "epoch": 0.9156787443646686, "grad_norm": 0.4038853663767032, "learning_rate": 8.771159928696779e-06, "loss": 0.032, "step": 2742 }, { "epoch": 0.9160126899315412, "grad_norm": 0.40808780028981756, "learning_rate": 8.76988363561372e-06, "loss": 0.024, "step": 2743 }, { "epoch": 0.9163466354984138, "grad_norm": 0.585171766406125, "learning_rate": 8.76860677304535e-06, "loss": 0.0437, "step": 2744 }, { "epoch": 0.9166805810652864, "grad_norm": 0.42957640561970534, "learning_rate": 8.767329341184552e-06, "loss": 0.0338, "step": 2745 }, { "epoch": 0.917014526632159, "grad_norm": 0.3923721591081285, "learning_rate": 8.766051340224297e-06, "loss": 0.0463, "step": 2746 }, { "epoch": 0.9173484721990316, "grad_norm": 0.5290985574802106, "learning_rate": 8.764772770357646e-06, "loss": 0.0374, "step": 2747 }, { "epoch": 0.9176824177659042, "grad_norm": 0.3121582743944563, "learning_rate": 8.763493631777738e-06, "loss": 0.0243, "step": 2748 }, { "epoch": 0.9180163633327768, "grad_norm": 0.3333000144950329, "learning_rate": 8.762213924677802e-06, "loss": 0.0251, "step": 2749 }, { "epoch": 0.9183503088996493, "grad_norm": 0.5432337404166314, "learning_rate": 8.760933649251155e-06, "loss": 0.0443, "step": 2750 }, { "epoch": 0.9186842544665219, "grad_norm": 0.3316771561942305, "learning_rate": 8.759652805691197e-06, "loss": 0.036, "step": 2751 }, { "epoch": 0.9190182000333945, "grad_norm": 0.396723782048542, "learning_rate": 8.758371394191415e-06, "loss": 0.0338, "step": 2752 }, { "epoch": 0.9193521456002671, "grad_norm": 0.3211967394461863, "learning_rate": 8.75708941494538e-06, "loss": 0.0205, "step": 2753 }, { "epoch": 0.9196860911671397, "grad_norm": 0.42862694028879433, "learning_rate": 8.75580686814675e-06, "loss": 0.0396, "step": 2754 }, { "epoch": 0.9200200367340123, "grad_norm": 0.5817811564476707, "learning_rate": 8.75452375398927e-06, "loss": 0.0267, "step": 2755 }, { "epoch": 0.9203539823008849, "grad_norm": 0.294806437662154, "learning_rate": 8.753240072666769e-06, "loss": 0.0243, "step": 2756 }, { "epoch": 0.9206879278677575, "grad_norm": 0.5437423516003268, "learning_rate": 8.751955824373161e-06, "loss": 0.0413, "step": 2757 }, { "epoch": 0.9210218734346302, "grad_norm": 0.2887919920681828, "learning_rate": 8.750671009302448e-06, "loss": 0.0284, "step": 2758 }, { "epoch": 0.9213558190015028, "grad_norm": 0.3030039732206616, "learning_rate": 8.749385627648717e-06, "loss": 0.0266, "step": 2759 }, { "epoch": 0.9216897645683754, "grad_norm": 0.3945261183205245, "learning_rate": 8.748099679606139e-06, "loss": 0.0398, "step": 2760 }, { "epoch": 0.922023710135248, "grad_norm": 0.37834962580284026, "learning_rate": 8.746813165368973e-06, "loss": 0.0252, "step": 2761 }, { "epoch": 0.9223576557021206, "grad_norm": 0.45297643964361894, "learning_rate": 8.745526085131559e-06, "loss": 0.0271, "step": 2762 }, { "epoch": 0.9226916012689932, "grad_norm": 0.31967657504572755, "learning_rate": 8.744238439088328e-06, "loss": 0.0234, "step": 2763 }, { "epoch": 0.9230255468358658, "grad_norm": 0.36315923077791945, "learning_rate": 8.742950227433795e-06, "loss": 0.0321, "step": 2764 }, { "epoch": 0.9233594924027384, "grad_norm": 0.4225791521025416, "learning_rate": 8.741661450362559e-06, "loss": 0.0296, "step": 2765 }, { "epoch": 0.923693437969611, "grad_norm": 0.3165999016376335, "learning_rate": 8.740372108069304e-06, "loss": 0.0279, "step": 2766 }, { "epoch": 0.9240273835364835, "grad_norm": 0.36103149782930405, "learning_rate": 8.739082200748799e-06, "loss": 0.0353, "step": 2767 }, { "epoch": 0.9243613291033561, "grad_norm": 0.4443288162951213, "learning_rate": 8.737791728595903e-06, "loss": 0.0383, "step": 2768 }, { "epoch": 0.9246952746702287, "grad_norm": 0.2758006368236196, "learning_rate": 8.736500691805554e-06, "loss": 0.0322, "step": 2769 }, { "epoch": 0.9250292202371013, "grad_norm": 0.617235448945394, "learning_rate": 8.73520909057278e-06, "loss": 0.0445, "step": 2770 }, { "epoch": 0.9253631658039739, "grad_norm": 0.5465684287209733, "learning_rate": 8.733916925092691e-06, "loss": 0.0467, "step": 2771 }, { "epoch": 0.9256971113708465, "grad_norm": 0.30496841436394523, "learning_rate": 8.732624195560487e-06, "loss": 0.0294, "step": 2772 }, { "epoch": 0.9260310569377191, "grad_norm": 0.3323619952467661, "learning_rate": 8.731330902171447e-06, "loss": 0.0383, "step": 2773 }, { "epoch": 0.9263650025045918, "grad_norm": 0.4887364194723922, "learning_rate": 8.730037045120941e-06, "loss": 0.036, "step": 2774 }, { "epoch": 0.9266989480714644, "grad_norm": 0.4225404524617371, "learning_rate": 8.728742624604418e-06, "loss": 0.0453, "step": 2775 }, { "epoch": 0.927032893638337, "grad_norm": 0.6077111166457021, "learning_rate": 8.727447640817417e-06, "loss": 0.0415, "step": 2776 }, { "epoch": 0.9273668392052096, "grad_norm": 0.3448105629744357, "learning_rate": 8.726152093955561e-06, "loss": 0.0245, "step": 2777 }, { "epoch": 0.9277007847720822, "grad_norm": 0.42752963555637163, "learning_rate": 8.724855984214558e-06, "loss": 0.0355, "step": 2778 }, { "epoch": 0.9280347303389548, "grad_norm": 0.3972583917930645, "learning_rate": 8.723559311790197e-06, "loss": 0.0535, "step": 2779 }, { "epoch": 0.9283686759058274, "grad_norm": 0.29900926228908553, "learning_rate": 8.722262076878361e-06, "loss": 0.0276, "step": 2780 }, { "epoch": 0.9287026214727, "grad_norm": 0.33922319676365026, "learning_rate": 8.720964279675009e-06, "loss": 0.0421, "step": 2781 }, { "epoch": 0.9290365670395726, "grad_norm": 0.27641093661892724, "learning_rate": 8.71966592037619e-06, "loss": 0.0216, "step": 2782 }, { "epoch": 0.9293705126064451, "grad_norm": 0.3760096877264999, "learning_rate": 8.718366999178037e-06, "loss": 0.0303, "step": 2783 }, { "epoch": 0.9297044581733177, "grad_norm": 0.48778202627788486, "learning_rate": 8.717067516276764e-06, "loss": 0.0378, "step": 2784 }, { "epoch": 0.9300384037401903, "grad_norm": 0.4340137713205771, "learning_rate": 8.715767471868679e-06, "loss": 0.0342, "step": 2785 }, { "epoch": 0.9303723493070629, "grad_norm": 0.5365931673610034, "learning_rate": 8.714466866150162e-06, "loss": 0.0551, "step": 2786 }, { "epoch": 0.9307062948739355, "grad_norm": 0.38121178906320907, "learning_rate": 8.71316569931769e-06, "loss": 0.0413, "step": 2787 }, { "epoch": 0.9310402404408081, "grad_norm": 0.4073284225625517, "learning_rate": 8.71186397156782e-06, "loss": 0.0353, "step": 2788 }, { "epoch": 0.9313741860076807, "grad_norm": 0.5559132115434412, "learning_rate": 8.710561683097189e-06, "loss": 0.0473, "step": 2789 }, { "epoch": 0.9317081315745533, "grad_norm": 0.5318828253027332, "learning_rate": 8.709258834102525e-06, "loss": 0.0439, "step": 2790 }, { "epoch": 0.932042077141426, "grad_norm": 0.9639026006766994, "learning_rate": 8.70795542478064e-06, "loss": 0.0436, "step": 2791 }, { "epoch": 0.9323760227082986, "grad_norm": 0.4438159314820279, "learning_rate": 8.706651455328427e-06, "loss": 0.0402, "step": 2792 }, { "epoch": 0.9327099682751712, "grad_norm": 0.392408891585289, "learning_rate": 8.70534692594287e-06, "loss": 0.031, "step": 2793 }, { "epoch": 0.9330439138420438, "grad_norm": 0.5651403344284713, "learning_rate": 8.704041836821029e-06, "loss": 0.0464, "step": 2794 }, { "epoch": 0.9333778594089164, "grad_norm": 0.49450949919654424, "learning_rate": 8.702736188160055e-06, "loss": 0.0361, "step": 2795 }, { "epoch": 0.933711804975789, "grad_norm": 0.38494955406386794, "learning_rate": 8.70142998015718e-06, "loss": 0.032, "step": 2796 }, { "epoch": 0.9340457505426616, "grad_norm": 0.4040279285565328, "learning_rate": 8.700123213009726e-06, "loss": 0.0405, "step": 2797 }, { "epoch": 0.9343796961095342, "grad_norm": 0.45590293833918766, "learning_rate": 8.698815886915094e-06, "loss": 0.044, "step": 2798 }, { "epoch": 0.9347136416764067, "grad_norm": 0.5284038477980592, "learning_rate": 8.697508002070766e-06, "loss": 0.0345, "step": 2799 }, { "epoch": 0.9350475872432793, "grad_norm": 0.8424772908860036, "learning_rate": 8.696199558674321e-06, "loss": 0.0506, "step": 2800 }, { "epoch": 0.9353815328101519, "grad_norm": 0.3998538125832933, "learning_rate": 8.69489055692341e-06, "loss": 0.0257, "step": 2801 }, { "epoch": 0.9357154783770245, "grad_norm": 0.316886878656811, "learning_rate": 8.693580997015775e-06, "loss": 0.0332, "step": 2802 }, { "epoch": 0.9360494239438971, "grad_norm": 0.4750431820729923, "learning_rate": 8.692270879149241e-06, "loss": 0.0422, "step": 2803 }, { "epoch": 0.9363833695107697, "grad_norm": 0.3373304941500603, "learning_rate": 8.690960203521713e-06, "loss": 0.0332, "step": 2804 }, { "epoch": 0.9367173150776423, "grad_norm": 0.38393173851861656, "learning_rate": 8.689648970331188e-06, "loss": 0.0366, "step": 2805 }, { "epoch": 0.9370512606445149, "grad_norm": 0.7476014323783545, "learning_rate": 8.68833717977574e-06, "loss": 0.0296, "step": 2806 }, { "epoch": 0.9373852062113875, "grad_norm": 0.33936631924839616, "learning_rate": 8.687024832053534e-06, "loss": 0.031, "step": 2807 }, { "epoch": 0.9377191517782602, "grad_norm": 14.084074512678521, "learning_rate": 8.685711927362815e-06, "loss": 0.0994, "step": 2808 }, { "epoch": 0.9380530973451328, "grad_norm": 13.830511528793766, "learning_rate": 8.68439846590191e-06, "loss": 0.1105, "step": 2809 }, { "epoch": 0.9383870429120054, "grad_norm": 1.562466728274396, "learning_rate": 8.683084447869234e-06, "loss": 0.0396, "step": 2810 }, { "epoch": 0.938720988478878, "grad_norm": 0.53221906447519, "learning_rate": 8.681769873463286e-06, "loss": 0.0438, "step": 2811 }, { "epoch": 0.9390549340457506, "grad_norm": 0.38591734245315623, "learning_rate": 8.680454742882647e-06, "loss": 0.0473, "step": 2812 }, { "epoch": 0.9393888796126232, "grad_norm": 0.6621567165500295, "learning_rate": 8.679139056325983e-06, "loss": 0.0497, "step": 2813 }, { "epoch": 0.9397228251794958, "grad_norm": 0.5339382738206233, "learning_rate": 8.677822813992046e-06, "loss": 0.0471, "step": 2814 }, { "epoch": 0.9400567707463683, "grad_norm": 0.4556400842899425, "learning_rate": 8.676506016079664e-06, "loss": 0.0422, "step": 2815 }, { "epoch": 0.9403907163132409, "grad_norm": 0.2552204877044466, "learning_rate": 8.675188662787762e-06, "loss": 0.024, "step": 2816 }, { "epoch": 0.9407246618801135, "grad_norm": 0.48970926286691463, "learning_rate": 8.673870754315336e-06, "loss": 0.0254, "step": 2817 }, { "epoch": 0.9410586074469861, "grad_norm": 0.35539519724430313, "learning_rate": 8.672552290861478e-06, "loss": 0.0308, "step": 2818 }, { "epoch": 0.9413925530138587, "grad_norm": 0.42194617584509675, "learning_rate": 8.67123327262535e-06, "loss": 0.0317, "step": 2819 }, { "epoch": 0.9417264985807313, "grad_norm": 0.35950800586942655, "learning_rate": 8.669913699806209e-06, "loss": 0.0326, "step": 2820 }, { "epoch": 0.9420604441476039, "grad_norm": 0.5884959091131621, "learning_rate": 8.668593572603394e-06, "loss": 0.0596, "step": 2821 }, { "epoch": 0.9423943897144765, "grad_norm": 0.8979722718289816, "learning_rate": 8.667272891216323e-06, "loss": 0.0354, "step": 2822 }, { "epoch": 0.9427283352813491, "grad_norm": 0.5702463862400153, "learning_rate": 8.6659516558445e-06, "loss": 0.0379, "step": 2823 }, { "epoch": 0.9430622808482217, "grad_norm": 0.429804314201765, "learning_rate": 8.664629866687514e-06, "loss": 0.0364, "step": 2824 }, { "epoch": 0.9433962264150944, "grad_norm": 0.3164168163392966, "learning_rate": 8.663307523945038e-06, "loss": 0.0276, "step": 2825 }, { "epoch": 0.943730171981967, "grad_norm": 0.4377506840195495, "learning_rate": 8.661984627816827e-06, "loss": 0.0379, "step": 2826 }, { "epoch": 0.9440641175488396, "grad_norm": 0.3477662801592816, "learning_rate": 8.660661178502719e-06, "loss": 0.0248, "step": 2827 }, { "epoch": 0.9443980631157122, "grad_norm": 0.3631585132399093, "learning_rate": 8.659337176202636e-06, "loss": 0.0275, "step": 2828 }, { "epoch": 0.9447320086825848, "grad_norm": 0.3416351430051519, "learning_rate": 8.658012621116585e-06, "loss": 0.0321, "step": 2829 }, { "epoch": 0.9450659542494574, "grad_norm": 0.3337198340290349, "learning_rate": 8.656687513444656e-06, "loss": 0.0272, "step": 2830 }, { "epoch": 0.94539989981633, "grad_norm": 0.35863055812839195, "learning_rate": 8.655361853387024e-06, "loss": 0.0385, "step": 2831 }, { "epoch": 0.9457338453832025, "grad_norm": 0.5053231448166995, "learning_rate": 8.654035641143944e-06, "loss": 0.037, "step": 2832 }, { "epoch": 0.9460677909500751, "grad_norm": 0.5382339861527287, "learning_rate": 8.652708876915752e-06, "loss": 0.0367, "step": 2833 }, { "epoch": 0.9464017365169477, "grad_norm": 0.4457932521194742, "learning_rate": 8.651381560902876e-06, "loss": 0.0414, "step": 2834 }, { "epoch": 0.9467356820838203, "grad_norm": 0.39745194892382557, "learning_rate": 8.650053693305824e-06, "loss": 0.035, "step": 2835 }, { "epoch": 0.9470696276506929, "grad_norm": 0.5876270983655634, "learning_rate": 8.648725274325182e-06, "loss": 0.0425, "step": 2836 }, { "epoch": 0.9474035732175655, "grad_norm": 0.47412194218262077, "learning_rate": 8.647396304161625e-06, "loss": 0.0356, "step": 2837 }, { "epoch": 0.9477375187844381, "grad_norm": 0.33002487107674067, "learning_rate": 8.64606678301591e-06, "loss": 0.0265, "step": 2838 }, { "epoch": 0.9480714643513107, "grad_norm": 0.4280598865539549, "learning_rate": 8.644736711088874e-06, "loss": 0.0404, "step": 2839 }, { "epoch": 0.9484054099181833, "grad_norm": 0.29538546972578494, "learning_rate": 8.643406088581446e-06, "loss": 0.0354, "step": 2840 }, { "epoch": 0.948739355485056, "grad_norm": 0.33778362712743576, "learning_rate": 8.642074915694626e-06, "loss": 0.0328, "step": 2841 }, { "epoch": 0.9490733010519286, "grad_norm": 0.385851877180008, "learning_rate": 8.640743192629507e-06, "loss": 0.0322, "step": 2842 }, { "epoch": 0.9494072466188012, "grad_norm": 0.4905991095903688, "learning_rate": 8.63941091958726e-06, "loss": 0.0299, "step": 2843 }, { "epoch": 0.9497411921856738, "grad_norm": 0.62659788395832, "learning_rate": 8.638078096769141e-06, "loss": 0.0433, "step": 2844 }, { "epoch": 0.9500751377525464, "grad_norm": 0.3106239489437533, "learning_rate": 8.636744724376488e-06, "loss": 0.0211, "step": 2845 }, { "epoch": 0.950409083319419, "grad_norm": 0.5065096378817863, "learning_rate": 8.635410802610724e-06, "loss": 0.0392, "step": 2846 }, { "epoch": 0.9507430288862916, "grad_norm": 0.37322447884044396, "learning_rate": 8.634076331673354e-06, "loss": 0.0372, "step": 2847 }, { "epoch": 0.9510769744531641, "grad_norm": 0.37424053275601493, "learning_rate": 8.632741311765962e-06, "loss": 0.0269, "step": 2848 }, { "epoch": 0.9514109200200367, "grad_norm": 0.4088336584549044, "learning_rate": 8.631405743090223e-06, "loss": 0.0335, "step": 2849 }, { "epoch": 0.9517448655869093, "grad_norm": 0.42041913563134303, "learning_rate": 8.630069625847885e-06, "loss": 0.0343, "step": 2850 }, { "epoch": 0.9520788111537819, "grad_norm": 0.37625611217598554, "learning_rate": 8.628732960240788e-06, "loss": 0.036, "step": 2851 }, { "epoch": 0.9524127567206545, "grad_norm": 0.3041360538014126, "learning_rate": 8.627395746470852e-06, "loss": 0.0262, "step": 2852 }, { "epoch": 0.9527467022875271, "grad_norm": 0.4914311564689556, "learning_rate": 8.626057984740077e-06, "loss": 0.0398, "step": 2853 }, { "epoch": 0.9530806478543997, "grad_norm": 0.37470132846015813, "learning_rate": 8.624719675250547e-06, "loss": 0.0311, "step": 2854 }, { "epoch": 0.9534145934212723, "grad_norm": 0.4379281940128687, "learning_rate": 8.623380818204431e-06, "loss": 0.0326, "step": 2855 }, { "epoch": 0.9537485389881449, "grad_norm": 0.45115929352751377, "learning_rate": 8.622041413803979e-06, "loss": 0.0294, "step": 2856 }, { "epoch": 0.9540824845550175, "grad_norm": 0.4001383163578965, "learning_rate": 8.620701462251522e-06, "loss": 0.0303, "step": 2857 }, { "epoch": 0.9544164301218901, "grad_norm": 0.44020882077328205, "learning_rate": 8.619360963749478e-06, "loss": 0.0469, "step": 2858 }, { "epoch": 0.9547503756887628, "grad_norm": 0.3479400556473947, "learning_rate": 8.618019918500342e-06, "loss": 0.0323, "step": 2859 }, { "epoch": 0.9550843212556354, "grad_norm": 0.30866760767552254, "learning_rate": 8.616678326706698e-06, "loss": 0.0288, "step": 2860 }, { "epoch": 0.955418266822508, "grad_norm": 0.5698954494113244, "learning_rate": 8.615336188571208e-06, "loss": 0.042, "step": 2861 }, { "epoch": 0.9557522123893806, "grad_norm": 0.3137338000223819, "learning_rate": 8.613993504296617e-06, "loss": 0.0234, "step": 2862 }, { "epoch": 0.9560861579562532, "grad_norm": 0.5571666589008545, "learning_rate": 8.612650274085755e-06, "loss": 0.0322, "step": 2863 }, { "epoch": 0.9564201035231257, "grad_norm": 0.3795418938612114, "learning_rate": 8.61130649814153e-06, "loss": 0.0307, "step": 2864 }, { "epoch": 0.9567540490899983, "grad_norm": 0.2991500991470353, "learning_rate": 8.609962176666936e-06, "loss": 0.0281, "step": 2865 }, { "epoch": 0.9570879946568709, "grad_norm": 0.3933303996098282, "learning_rate": 8.608617309865051e-06, "loss": 0.0354, "step": 2866 }, { "epoch": 0.9574219402237435, "grad_norm": 0.4401767972110846, "learning_rate": 8.60727189793903e-06, "loss": 0.0328, "step": 2867 }, { "epoch": 0.9577558857906161, "grad_norm": 0.35415630693635686, "learning_rate": 8.605925941092114e-06, "loss": 0.0349, "step": 2868 }, { "epoch": 0.9580898313574887, "grad_norm": 0.38490670451193765, "learning_rate": 8.604579439527627e-06, "loss": 0.0432, "step": 2869 }, { "epoch": 0.9584237769243613, "grad_norm": 0.4523937847055889, "learning_rate": 8.603232393448974e-06, "loss": 0.0354, "step": 2870 }, { "epoch": 0.9587577224912339, "grad_norm": 0.5011806427265976, "learning_rate": 8.601884803059641e-06, "loss": 0.0376, "step": 2871 }, { "epoch": 0.9590916680581065, "grad_norm": 0.34696412310171365, "learning_rate": 8.600536668563197e-06, "loss": 0.0381, "step": 2872 }, { "epoch": 0.9594256136249791, "grad_norm": 0.4690614391895925, "learning_rate": 8.599187990163296e-06, "loss": 0.0333, "step": 2873 }, { "epoch": 0.9597595591918517, "grad_norm": 0.490118930140376, "learning_rate": 8.597838768063667e-06, "loss": 0.0408, "step": 2874 }, { "epoch": 0.9600935047587243, "grad_norm": 0.7046487184714277, "learning_rate": 8.596489002468132e-06, "loss": 0.0341, "step": 2875 }, { "epoch": 0.960427450325597, "grad_norm": 0.37477769288279206, "learning_rate": 8.595138693580583e-06, "loss": 0.0339, "step": 2876 }, { "epoch": 0.9607613958924696, "grad_norm": 0.4023564173346981, "learning_rate": 8.593787841605004e-06, "loss": 0.0329, "step": 2877 }, { "epoch": 0.9610953414593422, "grad_norm": 0.2927118313100555, "learning_rate": 8.592436446745457e-06, "loss": 0.0243, "step": 2878 }, { "epoch": 0.9614292870262148, "grad_norm": 0.28765916145995385, "learning_rate": 8.591084509206085e-06, "loss": 0.0294, "step": 2879 }, { "epoch": 0.9617632325930874, "grad_norm": 0.311112204922644, "learning_rate": 8.589732029191113e-06, "loss": 0.0373, "step": 2880 }, { "epoch": 0.9620971781599599, "grad_norm": 0.5020139795287193, "learning_rate": 8.588379006904852e-06, "loss": 0.0298, "step": 2881 }, { "epoch": 0.9624311237268325, "grad_norm": 0.39531595803524777, "learning_rate": 8.587025442551689e-06, "loss": 0.0327, "step": 2882 }, { "epoch": 0.9627650692937051, "grad_norm": 0.3380535054161513, "learning_rate": 8.585671336336096e-06, "loss": 0.0238, "step": 2883 }, { "epoch": 0.9630990148605777, "grad_norm": 0.328491408305338, "learning_rate": 8.58431668846263e-06, "loss": 0.0248, "step": 2884 }, { "epoch": 0.9634329604274503, "grad_norm": 0.8318934448018565, "learning_rate": 8.582961499135925e-06, "loss": 0.0355, "step": 2885 }, { "epoch": 0.9637669059943229, "grad_norm": 0.4144795128834873, "learning_rate": 8.581605768560694e-06, "loss": 0.0308, "step": 2886 }, { "epoch": 0.9641008515611955, "grad_norm": 0.35680027909560613, "learning_rate": 8.580249496941742e-06, "loss": 0.0377, "step": 2887 }, { "epoch": 0.9644347971280681, "grad_norm": 0.4077728117900122, "learning_rate": 8.578892684483947e-06, "loss": 0.0369, "step": 2888 }, { "epoch": 0.9647687426949407, "grad_norm": 0.25562548526002254, "learning_rate": 8.577535331392272e-06, "loss": 0.0247, "step": 2889 }, { "epoch": 0.9651026882618133, "grad_norm": 0.2902086333221242, "learning_rate": 8.57617743787176e-06, "loss": 0.0308, "step": 2890 }, { "epoch": 0.9654366338286859, "grad_norm": 0.33474046267090213, "learning_rate": 8.574819004127539e-06, "loss": 0.0335, "step": 2891 }, { "epoch": 0.9657705793955585, "grad_norm": 0.3320655781521263, "learning_rate": 8.573460030364816e-06, "loss": 0.0223, "step": 2892 }, { "epoch": 0.9661045249624312, "grad_norm": 0.3968589907940447, "learning_rate": 8.572100516788878e-06, "loss": 0.0383, "step": 2893 }, { "epoch": 0.9664384705293038, "grad_norm": 0.7192859827729127, "learning_rate": 8.570740463605096e-06, "loss": 0.0531, "step": 2894 }, { "epoch": 0.9667724160961764, "grad_norm": 0.4164267642835362, "learning_rate": 8.569379871018925e-06, "loss": 0.0369, "step": 2895 }, { "epoch": 0.967106361663049, "grad_norm": 0.30076241964324807, "learning_rate": 8.568018739235895e-06, "loss": 0.0231, "step": 2896 }, { "epoch": 0.9674403072299215, "grad_norm": 0.39351832420344696, "learning_rate": 8.566657068461624e-06, "loss": 0.0293, "step": 2897 }, { "epoch": 0.9677742527967941, "grad_norm": 0.3944978232327262, "learning_rate": 8.565294858901804e-06, "loss": 0.0342, "step": 2898 }, { "epoch": 0.9681081983636667, "grad_norm": 0.30473112605308045, "learning_rate": 8.563932110762218e-06, "loss": 0.0284, "step": 2899 }, { "epoch": 0.9684421439305393, "grad_norm": 0.30375003233444114, "learning_rate": 8.562568824248722e-06, "loss": 0.0307, "step": 2900 }, { "epoch": 0.9687760894974119, "grad_norm": 0.4177704988999483, "learning_rate": 8.561204999567258e-06, "loss": 0.0348, "step": 2901 }, { "epoch": 0.9691100350642845, "grad_norm": 0.8540798710917039, "learning_rate": 8.559840636923845e-06, "loss": 0.0428, "step": 2902 }, { "epoch": 0.9694439806311571, "grad_norm": 0.3181147315438223, "learning_rate": 8.55847573652459e-06, "loss": 0.0307, "step": 2903 }, { "epoch": 0.9697779261980297, "grad_norm": 0.3431224740505919, "learning_rate": 8.557110298575674e-06, "loss": 0.0332, "step": 2904 }, { "epoch": 0.9701118717649023, "grad_norm": 0.24414566406935073, "learning_rate": 8.555744323283364e-06, "loss": 0.0275, "step": 2905 }, { "epoch": 0.9704458173317749, "grad_norm": 0.30865069403048434, "learning_rate": 8.554377810854006e-06, "loss": 0.0321, "step": 2906 }, { "epoch": 0.9707797628986475, "grad_norm": 0.48010232683215975, "learning_rate": 8.553010761494029e-06, "loss": 0.0306, "step": 2907 }, { "epoch": 0.9711137084655201, "grad_norm": 0.6055659866288445, "learning_rate": 8.551643175409941e-06, "loss": 0.031, "step": 2908 }, { "epoch": 0.9714476540323927, "grad_norm": 0.377750454693661, "learning_rate": 8.550275052808332e-06, "loss": 0.0443, "step": 2909 }, { "epoch": 0.9717815995992654, "grad_norm": 0.2952143718133554, "learning_rate": 8.548906393895876e-06, "loss": 0.0272, "step": 2910 }, { "epoch": 0.972115545166138, "grad_norm": 0.2652892479972758, "learning_rate": 8.547537198879318e-06, "loss": 0.0261, "step": 2911 }, { "epoch": 0.9724494907330106, "grad_norm": 0.4987918289652696, "learning_rate": 8.546167467965496e-06, "loss": 0.0259, "step": 2912 }, { "epoch": 0.9727834362998831, "grad_norm": 0.32460862107890287, "learning_rate": 8.544797201361324e-06, "loss": 0.0303, "step": 2913 }, { "epoch": 0.9731173818667557, "grad_norm": 1.1289177431083712, "learning_rate": 8.543426399273796e-06, "loss": 0.0871, "step": 2914 }, { "epoch": 0.9734513274336283, "grad_norm": 0.5390204663725657, "learning_rate": 8.542055061909988e-06, "loss": 0.0384, "step": 2915 }, { "epoch": 0.9737852730005009, "grad_norm": 0.39398476412347677, "learning_rate": 8.540683189477057e-06, "loss": 0.0344, "step": 2916 }, { "epoch": 0.9741192185673735, "grad_norm": 0.2954084404298375, "learning_rate": 8.539310782182238e-06, "loss": 0.0265, "step": 2917 }, { "epoch": 0.9744531641342461, "grad_norm": 0.3686457963637047, "learning_rate": 8.537937840232853e-06, "loss": 0.044, "step": 2918 }, { "epoch": 0.9747871097011187, "grad_norm": 0.3781534452843894, "learning_rate": 8.5365643638363e-06, "loss": 0.0361, "step": 2919 }, { "epoch": 0.9751210552679913, "grad_norm": 0.399554948316432, "learning_rate": 8.535190353200056e-06, "loss": 0.0389, "step": 2920 }, { "epoch": 0.9754550008348639, "grad_norm": 0.3504184417402766, "learning_rate": 8.533815808531685e-06, "loss": 0.0309, "step": 2921 }, { "epoch": 0.9757889464017365, "grad_norm": 0.6489535836387182, "learning_rate": 8.532440730038826e-06, "loss": 0.0352, "step": 2922 }, { "epoch": 0.9761228919686091, "grad_norm": 0.26295253255849527, "learning_rate": 8.531065117929202e-06, "loss": 0.0258, "step": 2923 }, { "epoch": 0.9764568375354817, "grad_norm": 0.24537007744699338, "learning_rate": 8.529688972410616e-06, "loss": 0.0249, "step": 2924 }, { "epoch": 0.9767907831023543, "grad_norm": 0.34239462407723953, "learning_rate": 8.52831229369095e-06, "loss": 0.0301, "step": 2925 }, { "epoch": 0.977124728669227, "grad_norm": 0.35920630314760105, "learning_rate": 8.526935081978166e-06, "loss": 0.0403, "step": 2926 }, { "epoch": 0.9774586742360996, "grad_norm": 0.33516431454490325, "learning_rate": 8.52555733748031e-06, "loss": 0.0332, "step": 2927 }, { "epoch": 0.9777926198029722, "grad_norm": 0.9567710656426874, "learning_rate": 8.524179060405507e-06, "loss": 0.0478, "step": 2928 }, { "epoch": 0.9781265653698448, "grad_norm": 0.4502980530312899, "learning_rate": 8.52280025096196e-06, "loss": 0.0492, "step": 2929 }, { "epoch": 0.9784605109367173, "grad_norm": 0.3204750578200467, "learning_rate": 8.521420909357956e-06, "loss": 0.0338, "step": 2930 }, { "epoch": 0.9787944565035899, "grad_norm": 0.695268630715423, "learning_rate": 8.52004103580186e-06, "loss": 0.046, "step": 2931 }, { "epoch": 0.9791284020704625, "grad_norm": 0.4422763961698212, "learning_rate": 8.51866063050212e-06, "loss": 0.0317, "step": 2932 }, { "epoch": 0.9794623476373351, "grad_norm": 0.27671788577447276, "learning_rate": 8.51727969366726e-06, "loss": 0.023, "step": 2933 }, { "epoch": 0.9797962932042077, "grad_norm": 0.28110937671481945, "learning_rate": 8.515898225505885e-06, "loss": 0.0239, "step": 2934 }, { "epoch": 0.9801302387710803, "grad_norm": 0.30396446778503616, "learning_rate": 8.514516226226688e-06, "loss": 0.0341, "step": 2935 }, { "epoch": 0.9804641843379529, "grad_norm": 0.3796155250110715, "learning_rate": 8.513133696038432e-06, "loss": 0.0404, "step": 2936 }, { "epoch": 0.9807981299048255, "grad_norm": 0.2370893181557276, "learning_rate": 8.511750635149965e-06, "loss": 0.0227, "step": 2937 }, { "epoch": 0.9811320754716981, "grad_norm": 0.25849720854736874, "learning_rate": 8.510367043770213e-06, "loss": 0.0272, "step": 2938 }, { "epoch": 0.9814660210385707, "grad_norm": 0.3026169531535005, "learning_rate": 8.508982922108188e-06, "loss": 0.0319, "step": 2939 }, { "epoch": 0.9817999666054433, "grad_norm": 0.34747292936824925, "learning_rate": 8.507598270372977e-06, "loss": 0.0386, "step": 2940 }, { "epoch": 0.9821339121723159, "grad_norm": 0.25846543228895674, "learning_rate": 8.506213088773744e-06, "loss": 0.0264, "step": 2941 }, { "epoch": 0.9824678577391885, "grad_norm": 0.32624005236877335, "learning_rate": 8.504827377519743e-06, "loss": 0.029, "step": 2942 }, { "epoch": 0.9828018033060612, "grad_norm": 0.37189342796157415, "learning_rate": 8.503441136820296e-06, "loss": 0.0372, "step": 2943 }, { "epoch": 0.9831357488729338, "grad_norm": 0.2590228156928456, "learning_rate": 8.502054366884813e-06, "loss": 0.0258, "step": 2944 }, { "epoch": 0.9834696944398064, "grad_norm": 0.4958551793393793, "learning_rate": 8.500667067922784e-06, "loss": 0.0345, "step": 2945 }, { "epoch": 0.9838036400066789, "grad_norm": 0.46911834010090736, "learning_rate": 8.499279240143776e-06, "loss": 0.0377, "step": 2946 }, { "epoch": 0.9841375855735515, "grad_norm": 0.30981252637620316, "learning_rate": 8.497890883757434e-06, "loss": 0.0299, "step": 2947 }, { "epoch": 0.9844715311404241, "grad_norm": 0.3114864548517596, "learning_rate": 8.496501998973489e-06, "loss": 0.0301, "step": 2948 }, { "epoch": 0.9848054767072967, "grad_norm": 0.3763100598918492, "learning_rate": 8.495112586001747e-06, "loss": 0.0385, "step": 2949 }, { "epoch": 0.9851394222741693, "grad_norm": 0.49195061338951485, "learning_rate": 8.493722645052093e-06, "loss": 0.0498, "step": 2950 }, { "epoch": 0.9854733678410419, "grad_norm": 0.43350088323905367, "learning_rate": 8.4923321763345e-06, "loss": 0.0223, "step": 2951 }, { "epoch": 0.9858073134079145, "grad_norm": 0.41645937072227135, "learning_rate": 8.490941180059009e-06, "loss": 0.0343, "step": 2952 }, { "epoch": 0.9861412589747871, "grad_norm": 0.24031710993391084, "learning_rate": 8.489549656435748e-06, "loss": 0.0287, "step": 2953 }, { "epoch": 0.9864752045416597, "grad_norm": 0.2867054804285229, "learning_rate": 8.488157605674924e-06, "loss": 0.0261, "step": 2954 }, { "epoch": 0.9868091501085323, "grad_norm": 0.35742715911016765, "learning_rate": 8.486765027986821e-06, "loss": 0.0374, "step": 2955 }, { "epoch": 0.9871430956754049, "grad_norm": 0.34186016687613524, "learning_rate": 8.485371923581807e-06, "loss": 0.0296, "step": 2956 }, { "epoch": 0.9874770412422775, "grad_norm": 0.33322174738473864, "learning_rate": 8.483978292670324e-06, "loss": 0.0259, "step": 2957 }, { "epoch": 0.9878109868091501, "grad_norm": 0.30139118498858575, "learning_rate": 8.482584135462896e-06, "loss": 0.0301, "step": 2958 }, { "epoch": 0.9881449323760227, "grad_norm": 0.5939531620552276, "learning_rate": 8.48118945217013e-06, "loss": 0.0502, "step": 2959 }, { "epoch": 0.9884788779428954, "grad_norm": 0.3115499964256641, "learning_rate": 8.479794243002707e-06, "loss": 0.0307, "step": 2960 }, { "epoch": 0.988812823509768, "grad_norm": 0.2615585331165329, "learning_rate": 8.47839850817139e-06, "loss": 0.0232, "step": 2961 }, { "epoch": 0.9891467690766405, "grad_norm": 0.3671074028862246, "learning_rate": 8.477002247887024e-06, "loss": 0.0267, "step": 2962 }, { "epoch": 0.9894807146435131, "grad_norm": 0.2969673994841143, "learning_rate": 8.475605462360525e-06, "loss": 0.0364, "step": 2963 }, { "epoch": 0.9898146602103857, "grad_norm": 0.48234835840360146, "learning_rate": 8.474208151802898e-06, "loss": 0.0503, "step": 2964 }, { "epoch": 0.9901486057772583, "grad_norm": 0.44747298497568694, "learning_rate": 8.472810316425223e-06, "loss": 0.0372, "step": 2965 }, { "epoch": 0.9904825513441309, "grad_norm": 0.2952379007761491, "learning_rate": 8.471411956438657e-06, "loss": 0.0278, "step": 2966 }, { "epoch": 0.9908164969110035, "grad_norm": 0.5301810921767381, "learning_rate": 8.470013072054442e-06, "loss": 0.0333, "step": 2967 }, { "epoch": 0.9911504424778761, "grad_norm": 0.39644981737324414, "learning_rate": 8.468613663483894e-06, "loss": 0.03, "step": 2968 }, { "epoch": 0.9914843880447487, "grad_norm": 0.517759109550299, "learning_rate": 8.467213730938408e-06, "loss": 0.0352, "step": 2969 }, { "epoch": 0.9918183336116213, "grad_norm": 0.37509790116884606, "learning_rate": 8.465813274629466e-06, "loss": 0.0357, "step": 2970 }, { "epoch": 0.9921522791784939, "grad_norm": 0.3729478237418451, "learning_rate": 8.46441229476862e-06, "loss": 0.0414, "step": 2971 }, { "epoch": 0.9924862247453665, "grad_norm": 0.2887147074041474, "learning_rate": 8.463010791567503e-06, "loss": 0.0232, "step": 2972 }, { "epoch": 0.9928201703122391, "grad_norm": 0.30249947551885503, "learning_rate": 8.461608765237832e-06, "loss": 0.0374, "step": 2973 }, { "epoch": 0.9931541158791117, "grad_norm": 0.42989384332405156, "learning_rate": 8.460206215991398e-06, "loss": 0.0429, "step": 2974 }, { "epoch": 0.9934880614459843, "grad_norm": 0.43458607387665993, "learning_rate": 8.458803144040071e-06, "loss": 0.0447, "step": 2975 }, { "epoch": 0.993822007012857, "grad_norm": 0.32059077440084477, "learning_rate": 8.457399549595803e-06, "loss": 0.0285, "step": 2976 }, { "epoch": 0.9941559525797296, "grad_norm": 0.43557159634889453, "learning_rate": 8.455995432870626e-06, "loss": 0.045, "step": 2977 }, { "epoch": 0.9944898981466022, "grad_norm": 0.5529750352646045, "learning_rate": 8.454590794076642e-06, "loss": 0.0318, "step": 2978 }, { "epoch": 0.9948238437134747, "grad_norm": 0.6860455648765753, "learning_rate": 8.453185633426044e-06, "loss": 0.037, "step": 2979 }, { "epoch": 0.9951577892803473, "grad_norm": 0.2702880584625079, "learning_rate": 8.451779951131096e-06, "loss": 0.0279, "step": 2980 }, { "epoch": 0.9954917348472199, "grad_norm": 0.3468127833882437, "learning_rate": 8.450373747404143e-06, "loss": 0.0294, "step": 2981 }, { "epoch": 0.9958256804140925, "grad_norm": 0.3163357942388814, "learning_rate": 8.448967022457611e-06, "loss": 0.0308, "step": 2982 }, { "epoch": 0.9961596259809651, "grad_norm": 0.2687120444587892, "learning_rate": 8.447559776503998e-06, "loss": 0.0287, "step": 2983 }, { "epoch": 0.9964935715478377, "grad_norm": 0.23908782274380247, "learning_rate": 8.446152009755886e-06, "loss": 0.0262, "step": 2984 }, { "epoch": 0.9968275171147103, "grad_norm": 0.4168061054496135, "learning_rate": 8.444743722425937e-06, "loss": 0.0394, "step": 2985 }, { "epoch": 0.9971614626815829, "grad_norm": 0.33954139397298466, "learning_rate": 8.443334914726886e-06, "loss": 0.0307, "step": 2986 }, { "epoch": 0.9974954082484555, "grad_norm": 0.3913146771735054, "learning_rate": 8.441925586871556e-06, "loss": 0.0361, "step": 2987 }, { "epoch": 0.9978293538153281, "grad_norm": 0.2878594596242855, "learning_rate": 8.440515739072836e-06, "loss": 0.0278, "step": 2988 }, { "epoch": 0.9981632993822007, "grad_norm": 0.42990855177109644, "learning_rate": 8.439105371543703e-06, "loss": 0.0299, "step": 2989 }, { "epoch": 0.9984972449490733, "grad_norm": 0.2992324274086244, "learning_rate": 8.43769448449721e-06, "loss": 0.0259, "step": 2990 }, { "epoch": 0.9988311905159459, "grad_norm": 0.32890318973287186, "learning_rate": 8.436283078146488e-06, "loss": 0.0311, "step": 2991 }, { "epoch": 0.9991651360828185, "grad_norm": 0.5188806377460586, "learning_rate": 8.434871152704745e-06, "loss": 0.0443, "step": 2992 }, { "epoch": 0.9994990816496911, "grad_norm": 0.4772461349888854, "learning_rate": 8.433458708385272e-06, "loss": 0.0367, "step": 2993 }, { "epoch": 0.9998330272165638, "grad_norm": 0.3135369453907103, "learning_rate": 8.432045745401431e-06, "loss": 0.0308, "step": 2994 }, { "epoch": 0.9998330272165638, "eval_loss": 0.03422769904136658, "eval_runtime": 183.5729, "eval_samples_per_second": 109.891, "eval_steps_per_second": 1.721, "step": 2994 }, { "epoch": 1.0001669727834364, "grad_norm": 0.29771575206090833, "learning_rate": 8.430632263966672e-06, "loss": 0.0323, "step": 2995 }, { "epoch": 1.0005009183503089, "grad_norm": 0.3697550157731697, "learning_rate": 8.429218264294512e-06, "loss": 0.0268, "step": 2996 }, { "epoch": 1.0008348639171816, "grad_norm": 0.26297889924216067, "learning_rate": 8.427803746598557e-06, "loss": 0.0258, "step": 2997 }, { "epoch": 1.001168809484054, "grad_norm": 0.25011685660945276, "learning_rate": 8.426388711092486e-06, "loss": 0.0274, "step": 2998 }, { "epoch": 1.0015027550509268, "grad_norm": 0.27126523572208655, "learning_rate": 8.424973157990053e-06, "loss": 0.0318, "step": 2999 }, { "epoch": 1.0018367006177993, "grad_norm": 0.3722527919143557, "learning_rate": 8.4235570875051e-06, "loss": 0.0288, "step": 3000 }, { "epoch": 1.002170646184672, "grad_norm": 0.32588205635847833, "learning_rate": 8.422140499851536e-06, "loss": 0.0291, "step": 3001 }, { "epoch": 1.0025045917515445, "grad_norm": 0.29852829256692953, "learning_rate": 8.420723395243356e-06, "loss": 0.0354, "step": 3002 }, { "epoch": 1.002838537318417, "grad_norm": 0.2865517515058864, "learning_rate": 8.419305773894628e-06, "loss": 0.0266, "step": 3003 }, { "epoch": 1.0031724828852897, "grad_norm": 0.33608424316914015, "learning_rate": 8.417887636019504e-06, "loss": 0.0322, "step": 3004 }, { "epoch": 1.0035064284521622, "grad_norm": 0.2776393554879091, "learning_rate": 8.416468981832207e-06, "loss": 0.0283, "step": 3005 }, { "epoch": 1.003840374019035, "grad_norm": 0.36039590633364826, "learning_rate": 8.415049811547043e-06, "loss": 0.0381, "step": 3006 }, { "epoch": 1.0041743195859074, "grad_norm": 0.3311398415075013, "learning_rate": 8.413630125378393e-06, "loss": 0.0261, "step": 3007 }, { "epoch": 1.0045082651527801, "grad_norm": 0.26780439680340007, "learning_rate": 8.412209923540719e-06, "loss": 0.0283, "step": 3008 }, { "epoch": 1.0048422107196526, "grad_norm": 0.342411464839398, "learning_rate": 8.41078920624856e-06, "loss": 0.0305, "step": 3009 }, { "epoch": 1.0051761562865253, "grad_norm": 0.2657920894694099, "learning_rate": 8.409367973716527e-06, "loss": 0.0231, "step": 3010 }, { "epoch": 1.0055101018533978, "grad_norm": 0.30883734956705133, "learning_rate": 8.40794622615932e-06, "loss": 0.0259, "step": 3011 }, { "epoch": 1.0058440474202706, "grad_norm": 0.3813557990190081, "learning_rate": 8.406523963791709e-06, "loss": 0.0307, "step": 3012 }, { "epoch": 1.006177992987143, "grad_norm": 0.38089081135860814, "learning_rate": 8.405101186828542e-06, "loss": 0.0422, "step": 3013 }, { "epoch": 1.0065119385540158, "grad_norm": 0.3857972617574587, "learning_rate": 8.403677895484746e-06, "loss": 0.0281, "step": 3014 }, { "epoch": 1.0068458841208883, "grad_norm": 0.27516406978293884, "learning_rate": 8.402254089975328e-06, "loss": 0.0249, "step": 3015 }, { "epoch": 1.007179829687761, "grad_norm": 0.4172881502462482, "learning_rate": 8.400829770515369e-06, "loss": 0.0275, "step": 3016 }, { "epoch": 1.0075137752546335, "grad_norm": 0.2727571111267168, "learning_rate": 8.399404937320031e-06, "loss": 0.0212, "step": 3017 }, { "epoch": 1.0078477208215062, "grad_norm": 0.42937101245811243, "learning_rate": 8.397979590604548e-06, "loss": 0.0387, "step": 3018 }, { "epoch": 1.0081816663883787, "grad_norm": 0.322584473450276, "learning_rate": 8.39655373058424e-06, "loss": 0.0239, "step": 3019 }, { "epoch": 1.0085156119552512, "grad_norm": 0.40625205352551624, "learning_rate": 8.395127357474498e-06, "loss": 0.0399, "step": 3020 }, { "epoch": 1.008849557522124, "grad_norm": 0.23824946169720115, "learning_rate": 8.39370047149079e-06, "loss": 0.0184, "step": 3021 }, { "epoch": 1.0091835030889964, "grad_norm": 0.3081783469560006, "learning_rate": 8.39227307284867e-06, "loss": 0.0262, "step": 3022 }, { "epoch": 1.0095174486558691, "grad_norm": 0.32016315512595445, "learning_rate": 8.390845161763756e-06, "loss": 0.0293, "step": 3023 }, { "epoch": 1.0098513942227416, "grad_norm": 0.3542949947754674, "learning_rate": 8.389416738451755e-06, "loss": 0.0294, "step": 3024 }, { "epoch": 1.0101853397896143, "grad_norm": 0.4493078332033422, "learning_rate": 8.387987803128447e-06, "loss": 0.0312, "step": 3025 }, { "epoch": 1.0105192853564868, "grad_norm": 0.31875130673953356, "learning_rate": 8.386558356009691e-06, "loss": 0.0314, "step": 3026 }, { "epoch": 1.0108532309233595, "grad_norm": 0.33589429627907247, "learning_rate": 8.385128397311418e-06, "loss": 0.0338, "step": 3027 }, { "epoch": 1.011187176490232, "grad_norm": 0.2560370442628772, "learning_rate": 8.383697927249641e-06, "loss": 0.0225, "step": 3028 }, { "epoch": 1.0115211220571048, "grad_norm": 0.520422510941351, "learning_rate": 8.382266946040453e-06, "loss": 0.0323, "step": 3029 }, { "epoch": 1.0118550676239773, "grad_norm": 0.46973474511439056, "learning_rate": 8.380835453900017e-06, "loss": 0.0312, "step": 3030 }, { "epoch": 1.01218901319085, "grad_norm": 0.25205200991018967, "learning_rate": 8.379403451044576e-06, "loss": 0.0198, "step": 3031 }, { "epoch": 1.0125229587577225, "grad_norm": 0.315932615396693, "learning_rate": 8.377970937690455e-06, "loss": 0.033, "step": 3032 }, { "epoch": 1.0128569043245952, "grad_norm": 0.4030566295570241, "learning_rate": 8.376537914054048e-06, "loss": 0.0281, "step": 3033 }, { "epoch": 1.0131908498914677, "grad_norm": 0.3763206399060218, "learning_rate": 8.37510438035183e-06, "loss": 0.0259, "step": 3034 }, { "epoch": 1.0135247954583404, "grad_norm": 0.3039694525112911, "learning_rate": 8.373670336800358e-06, "loss": 0.0271, "step": 3035 }, { "epoch": 1.013858741025213, "grad_norm": 0.3738516402626686, "learning_rate": 8.372235783616258e-06, "loss": 0.0283, "step": 3036 }, { "epoch": 1.0141926865920854, "grad_norm": 0.3591033770343096, "learning_rate": 8.370800721016232e-06, "loss": 0.0372, "step": 3037 }, { "epoch": 1.014526632158958, "grad_norm": 0.39241184515001626, "learning_rate": 8.369365149217072e-06, "loss": 0.032, "step": 3038 }, { "epoch": 1.0148605777258306, "grad_norm": 0.5526497550193588, "learning_rate": 8.36792906843563e-06, "loss": 0.0289, "step": 3039 }, { "epoch": 1.0151945232927033, "grad_norm": 0.6452615909755268, "learning_rate": 8.366492478888849e-06, "loss": 0.0344, "step": 3040 }, { "epoch": 1.0155284688595758, "grad_norm": 0.30586192991505584, "learning_rate": 8.365055380793737e-06, "loss": 0.0249, "step": 3041 }, { "epoch": 1.0158624144264485, "grad_norm": 0.44331254419930605, "learning_rate": 8.363617774367389e-06, "loss": 0.027, "step": 3042 }, { "epoch": 1.016196359993321, "grad_norm": 0.27610835252193916, "learning_rate": 8.36217965982697e-06, "loss": 0.0213, "step": 3043 }, { "epoch": 1.0165303055601937, "grad_norm": 0.3366498975190066, "learning_rate": 8.360741037389727e-06, "loss": 0.0265, "step": 3044 }, { "epoch": 1.0168642511270662, "grad_norm": 0.370029731400079, "learning_rate": 8.359301907272976e-06, "loss": 0.0266, "step": 3045 }, { "epoch": 1.017198196693939, "grad_norm": 0.29690579079600554, "learning_rate": 8.35786226969412e-06, "loss": 0.0235, "step": 3046 }, { "epoch": 1.0175321422608115, "grad_norm": 0.4331409717406702, "learning_rate": 8.356422124870629e-06, "loss": 0.0202, "step": 3047 }, { "epoch": 1.0178660878276842, "grad_norm": 0.36375446537420747, "learning_rate": 8.354981473020056e-06, "loss": 0.0301, "step": 3048 }, { "epoch": 1.0182000333945567, "grad_norm": 0.32901432769444916, "learning_rate": 8.353540314360027e-06, "loss": 0.0343, "step": 3049 }, { "epoch": 1.0185339789614294, "grad_norm": 0.30963182120038696, "learning_rate": 8.352098649108246e-06, "loss": 0.0348, "step": 3050 }, { "epoch": 1.0188679245283019, "grad_norm": 0.2813886580565502, "learning_rate": 8.350656477482497e-06, "loss": 0.0205, "step": 3051 }, { "epoch": 1.0192018700951744, "grad_norm": 0.3631125626486861, "learning_rate": 8.349213799700635e-06, "loss": 0.0296, "step": 3052 }, { "epoch": 1.019535815662047, "grad_norm": 0.37858193548772595, "learning_rate": 8.34777061598059e-06, "loss": 0.0286, "step": 3053 }, { "epoch": 1.0198697612289196, "grad_norm": 0.32955853947458696, "learning_rate": 8.346326926540377e-06, "loss": 0.0321, "step": 3054 }, { "epoch": 1.0202037067957923, "grad_norm": 0.768760878982909, "learning_rate": 8.344882731598079e-06, "loss": 0.0223, "step": 3055 }, { "epoch": 1.0205376523626648, "grad_norm": 0.3193180970107813, "learning_rate": 8.343438031371858e-06, "loss": 0.0234, "step": 3056 }, { "epoch": 1.0208715979295375, "grad_norm": 0.4827705103087651, "learning_rate": 8.341992826079956e-06, "loss": 0.056, "step": 3057 }, { "epoch": 1.02120554349641, "grad_norm": 0.4434386097886292, "learning_rate": 8.340547115940688e-06, "loss": 0.0242, "step": 3058 }, { "epoch": 1.0215394890632827, "grad_norm": 0.5546308377722811, "learning_rate": 8.339100901172443e-06, "loss": 0.0362, "step": 3059 }, { "epoch": 1.0218734346301552, "grad_norm": 0.5023578367259403, "learning_rate": 8.337654181993691e-06, "loss": 0.0199, "step": 3060 }, { "epoch": 1.022207380197028, "grad_norm": 1.078937180219304, "learning_rate": 8.336206958622975e-06, "loss": 0.0385, "step": 3061 }, { "epoch": 1.0225413257639004, "grad_norm": 0.34334042347382643, "learning_rate": 8.334759231278915e-06, "loss": 0.0233, "step": 3062 }, { "epoch": 1.0228752713307732, "grad_norm": 0.3297745842454771, "learning_rate": 8.333311000180208e-06, "loss": 0.0247, "step": 3063 }, { "epoch": 1.0232092168976457, "grad_norm": 0.21351540675230476, "learning_rate": 8.331862265545627e-06, "loss": 0.0168, "step": 3064 }, { "epoch": 1.0235431624645184, "grad_norm": 0.454635462538323, "learning_rate": 8.330413027594019e-06, "loss": 0.046, "step": 3065 }, { "epoch": 1.0238771080313909, "grad_norm": 0.5560074504521261, "learning_rate": 8.328963286544309e-06, "loss": 0.04, "step": 3066 }, { "epoch": 1.0242110535982636, "grad_norm": 0.45077409854950257, "learning_rate": 8.327513042615496e-06, "loss": 0.0342, "step": 3067 }, { "epoch": 1.024544999165136, "grad_norm": 0.37806201311651555, "learning_rate": 8.326062296026657e-06, "loss": 0.0236, "step": 3068 }, { "epoch": 1.0248789447320086, "grad_norm": 0.3586911886970298, "learning_rate": 8.324611046996947e-06, "loss": 0.0292, "step": 3069 }, { "epoch": 1.0252128902988813, "grad_norm": 0.36652700517579456, "learning_rate": 8.32315929574559e-06, "loss": 0.0255, "step": 3070 }, { "epoch": 1.0255468358657538, "grad_norm": 0.24863258101599792, "learning_rate": 8.321707042491895e-06, "loss": 0.0213, "step": 3071 }, { "epoch": 1.0258807814326265, "grad_norm": 0.3639895846996747, "learning_rate": 8.320254287455238e-06, "loss": 0.0223, "step": 3072 }, { "epoch": 1.026214726999499, "grad_norm": 0.2728335926701653, "learning_rate": 8.318801030855078e-06, "loss": 0.0188, "step": 3073 }, { "epoch": 1.0265486725663717, "grad_norm": 0.573002744738013, "learning_rate": 8.317347272910944e-06, "loss": 0.0504, "step": 3074 }, { "epoch": 1.0268826181332442, "grad_norm": 0.39901750658595253, "learning_rate": 8.315893013842441e-06, "loss": 0.0373, "step": 3075 }, { "epoch": 1.027216563700117, "grad_norm": 0.41332567428108785, "learning_rate": 8.31443825386926e-06, "loss": 0.0369, "step": 3076 }, { "epoch": 1.0275505092669894, "grad_norm": 0.4564478963697029, "learning_rate": 8.312982993211151e-06, "loss": 0.0486, "step": 3077 }, { "epoch": 1.0278844548338621, "grad_norm": 0.26904193285387235, "learning_rate": 8.311527232087951e-06, "loss": 0.0269, "step": 3078 }, { "epoch": 1.0282184004007346, "grad_norm": 0.3523462902540687, "learning_rate": 8.310070970719573e-06, "loss": 0.0281, "step": 3079 }, { "epoch": 1.0285523459676074, "grad_norm": 0.30712694816648556, "learning_rate": 8.308614209325997e-06, "loss": 0.0311, "step": 3080 }, { "epoch": 1.0288862915344799, "grad_norm": 0.4275847869419905, "learning_rate": 8.30715694812729e-06, "loss": 0.0244, "step": 3081 }, { "epoch": 1.0292202371013526, "grad_norm": 0.44420722595328466, "learning_rate": 8.305699187343586e-06, "loss": 0.0352, "step": 3082 }, { "epoch": 1.029554182668225, "grad_norm": 0.3178046177388971, "learning_rate": 8.304240927195094e-06, "loss": 0.0344, "step": 3083 }, { "epoch": 1.0298881282350978, "grad_norm": 0.36535031040615956, "learning_rate": 8.302782167902103e-06, "loss": 0.0347, "step": 3084 }, { "epoch": 1.0302220738019703, "grad_norm": 0.2985913932941345, "learning_rate": 8.30132290968498e-06, "loss": 0.0199, "step": 3085 }, { "epoch": 1.0305560193688428, "grad_norm": 0.39983165675579185, "learning_rate": 8.299863152764158e-06, "loss": 0.0293, "step": 3086 }, { "epoch": 1.0308899649357155, "grad_norm": 0.4044736097086519, "learning_rate": 8.298402897360152e-06, "loss": 0.0285, "step": 3087 }, { "epoch": 1.031223910502588, "grad_norm": 0.30600013716904445, "learning_rate": 8.29694214369355e-06, "loss": 0.0312, "step": 3088 }, { "epoch": 1.0315578560694607, "grad_norm": 0.3184850789055068, "learning_rate": 8.295480891985019e-06, "loss": 0.0296, "step": 3089 }, { "epoch": 1.0318918016363332, "grad_norm": 0.46694113108521274, "learning_rate": 8.294019142455295e-06, "loss": 0.0386, "step": 3090 }, { "epoch": 1.032225747203206, "grad_norm": 0.3435767279673977, "learning_rate": 8.292556895325195e-06, "loss": 0.0226, "step": 3091 }, { "epoch": 1.0325596927700784, "grad_norm": 0.43518409317233214, "learning_rate": 8.291094150815607e-06, "loss": 0.0302, "step": 3092 }, { "epoch": 1.0328936383369511, "grad_norm": 0.2878261284693619, "learning_rate": 8.289630909147494e-06, "loss": 0.0217, "step": 3093 }, { "epoch": 1.0332275839038236, "grad_norm": 0.32810234847386943, "learning_rate": 8.2881671705419e-06, "loss": 0.019, "step": 3094 }, { "epoch": 1.0335615294706963, "grad_norm": 0.28794547523903125, "learning_rate": 8.286702935219936e-06, "loss": 0.026, "step": 3095 }, { "epoch": 1.0338954750375688, "grad_norm": 0.4588925092844552, "learning_rate": 8.285238203402796e-06, "loss": 0.0329, "step": 3096 }, { "epoch": 1.0342294206044416, "grad_norm": 0.33891803932934295, "learning_rate": 8.283772975311742e-06, "loss": 0.0301, "step": 3097 }, { "epoch": 1.034563366171314, "grad_norm": 0.3093952371234193, "learning_rate": 8.282307251168116e-06, "loss": 0.0234, "step": 3098 }, { "epoch": 1.0348973117381868, "grad_norm": 0.3361344519386881, "learning_rate": 8.28084103119333e-06, "loss": 0.0254, "step": 3099 }, { "epoch": 1.0352312573050593, "grad_norm": 0.4456375711383226, "learning_rate": 8.279374315608877e-06, "loss": 0.0361, "step": 3100 }, { "epoch": 1.0355652028719318, "grad_norm": 0.36902061126373575, "learning_rate": 8.27790710463632e-06, "loss": 0.0288, "step": 3101 }, { "epoch": 1.0358991484388045, "grad_norm": 1.2728677691161303, "learning_rate": 8.276439398497298e-06, "loss": 0.036, "step": 3102 }, { "epoch": 1.036233094005677, "grad_norm": 0.31932718122490666, "learning_rate": 8.274971197413527e-06, "loss": 0.0215, "step": 3103 }, { "epoch": 1.0365670395725497, "grad_norm": 0.30187842717202285, "learning_rate": 8.273502501606794e-06, "loss": 0.028, "step": 3104 }, { "epoch": 1.0369009851394222, "grad_norm": 0.41631944796866643, "learning_rate": 8.272033311298965e-06, "loss": 0.0366, "step": 3105 }, { "epoch": 1.037234930706295, "grad_norm": 0.37127879143926973, "learning_rate": 8.270563626711979e-06, "loss": 0.0318, "step": 3106 }, { "epoch": 1.0375688762731674, "grad_norm": 0.24229265529666802, "learning_rate": 8.269093448067845e-06, "loss": 0.019, "step": 3107 }, { "epoch": 1.0379028218400401, "grad_norm": 0.40860081811063315, "learning_rate": 8.267622775588653e-06, "loss": 0.0363, "step": 3108 }, { "epoch": 1.0382367674069126, "grad_norm": 0.24137702262320918, "learning_rate": 8.266151609496567e-06, "loss": 0.0173, "step": 3109 }, { "epoch": 1.0385707129737853, "grad_norm": 0.9606289959687807, "learning_rate": 8.26467995001382e-06, "loss": 0.0325, "step": 3110 }, { "epoch": 1.0389046585406578, "grad_norm": 0.32029074991943135, "learning_rate": 8.26320779736273e-06, "loss": 0.0304, "step": 3111 }, { "epoch": 1.0392386041075306, "grad_norm": 0.5093859227143862, "learning_rate": 8.261735151765678e-06, "loss": 0.0262, "step": 3112 }, { "epoch": 1.039572549674403, "grad_norm": 0.38519493130500526, "learning_rate": 8.260262013445126e-06, "loss": 0.0216, "step": 3113 }, { "epoch": 1.0399064952412758, "grad_norm": 0.2796498288038895, "learning_rate": 8.258788382623607e-06, "loss": 0.0261, "step": 3114 }, { "epoch": 1.0402404408081483, "grad_norm": 0.4478050010652396, "learning_rate": 8.257314259523732e-06, "loss": 0.028, "step": 3115 }, { "epoch": 1.040574386375021, "grad_norm": 0.40193088729007864, "learning_rate": 8.255839644368185e-06, "loss": 0.0288, "step": 3116 }, { "epoch": 1.0409083319418935, "grad_norm": 0.4025068152340384, "learning_rate": 8.254364537379725e-06, "loss": 0.0272, "step": 3117 }, { "epoch": 1.041242277508766, "grad_norm": 0.4665400693883106, "learning_rate": 8.25288893878118e-06, "loss": 0.0312, "step": 3118 }, { "epoch": 1.0415762230756387, "grad_norm": 0.3983536390475072, "learning_rate": 8.251412848795462e-06, "loss": 0.0253, "step": 3119 }, { "epoch": 1.0419101686425112, "grad_norm": 0.2601635485849514, "learning_rate": 8.249936267645546e-06, "loss": 0.0202, "step": 3120 }, { "epoch": 1.042244114209384, "grad_norm": 0.3267182537592143, "learning_rate": 8.248459195554492e-06, "loss": 0.026, "step": 3121 }, { "epoch": 1.0425780597762564, "grad_norm": 0.3610038201378598, "learning_rate": 8.246981632745428e-06, "loss": 0.0371, "step": 3122 }, { "epoch": 1.0429120053431291, "grad_norm": 0.3471189852385901, "learning_rate": 8.245503579441554e-06, "loss": 0.0283, "step": 3123 }, { "epoch": 1.0432459509100016, "grad_norm": 0.3313712073324069, "learning_rate": 8.244025035866151e-06, "loss": 0.0302, "step": 3124 }, { "epoch": 1.0435798964768743, "grad_norm": 0.29523361657471914, "learning_rate": 8.242546002242569e-06, "loss": 0.0195, "step": 3125 }, { "epoch": 1.0439138420437468, "grad_norm": 0.4708060332711091, "learning_rate": 8.241066478794233e-06, "loss": 0.0351, "step": 3126 }, { "epoch": 1.0442477876106195, "grad_norm": 0.33361012747520763, "learning_rate": 8.239586465744644e-06, "loss": 0.0341, "step": 3127 }, { "epoch": 1.044581733177492, "grad_norm": 0.44564837446813743, "learning_rate": 8.238105963317376e-06, "loss": 0.0407, "step": 3128 }, { "epoch": 1.0449156787443648, "grad_norm": 0.2862680825008876, "learning_rate": 8.236624971736071e-06, "loss": 0.026, "step": 3129 }, { "epoch": 1.0452496243112372, "grad_norm": 0.2474654693302596, "learning_rate": 8.235143491224458e-06, "loss": 0.0193, "step": 3130 }, { "epoch": 1.04558356987811, "grad_norm": 0.2949794781745856, "learning_rate": 8.233661522006324e-06, "loss": 0.0193, "step": 3131 }, { "epoch": 1.0459175154449825, "grad_norm": 0.31837585275654734, "learning_rate": 8.232179064305545e-06, "loss": 0.0306, "step": 3132 }, { "epoch": 1.0462514610118552, "grad_norm": 0.3394771192991278, "learning_rate": 8.230696118346059e-06, "loss": 0.0322, "step": 3133 }, { "epoch": 1.0465854065787277, "grad_norm": 0.36475144705170737, "learning_rate": 8.229212684351886e-06, "loss": 0.0418, "step": 3134 }, { "epoch": 1.0469193521456002, "grad_norm": 0.29704859389937754, "learning_rate": 8.227728762547112e-06, "loss": 0.0156, "step": 3135 }, { "epoch": 1.0472532977124729, "grad_norm": 0.2738307368121378, "learning_rate": 8.226244353155906e-06, "loss": 0.0227, "step": 3136 }, { "epoch": 1.0475872432793454, "grad_norm": 0.3158699956701117, "learning_rate": 8.2247594564025e-06, "loss": 0.0252, "step": 3137 }, { "epoch": 1.047921188846218, "grad_norm": 0.2778342337220743, "learning_rate": 8.22327407251121e-06, "loss": 0.0304, "step": 3138 }, { "epoch": 1.0482551344130906, "grad_norm": 0.30138033930047925, "learning_rate": 8.221788201706416e-06, "loss": 0.0296, "step": 3139 }, { "epoch": 1.0485890799799633, "grad_norm": 0.29959189001762615, "learning_rate": 8.22030184421258e-06, "loss": 0.0263, "step": 3140 }, { "epoch": 1.0489230255468358, "grad_norm": 0.33416108029859287, "learning_rate": 8.218815000254233e-06, "loss": 0.0343, "step": 3141 }, { "epoch": 1.0492569711137085, "grad_norm": 0.4379592928548615, "learning_rate": 8.21732767005598e-06, "loss": 0.0383, "step": 3142 }, { "epoch": 1.049590916680581, "grad_norm": 0.5810887056619938, "learning_rate": 8.215839853842498e-06, "loss": 0.0421, "step": 3143 }, { "epoch": 1.0499248622474537, "grad_norm": 0.38367710322524623, "learning_rate": 8.214351551838541e-06, "loss": 0.0374, "step": 3144 }, { "epoch": 1.0502588078143262, "grad_norm": 0.33266657106849523, "learning_rate": 8.212862764268936e-06, "loss": 0.0243, "step": 3145 }, { "epoch": 1.050592753381199, "grad_norm": 0.3200406377901553, "learning_rate": 8.21137349135858e-06, "loss": 0.0241, "step": 3146 }, { "epoch": 1.0509266989480714, "grad_norm": 0.3031139344803291, "learning_rate": 8.209883733332444e-06, "loss": 0.0274, "step": 3147 }, { "epoch": 1.0512606445149442, "grad_norm": 0.24042528181312012, "learning_rate": 8.208393490415576e-06, "loss": 0.0221, "step": 3148 }, { "epoch": 1.0515945900818167, "grad_norm": 0.5308126282602463, "learning_rate": 8.206902762833095e-06, "loss": 0.0266, "step": 3149 }, { "epoch": 1.0519285356486892, "grad_norm": 0.3003822510107234, "learning_rate": 8.205411550810189e-06, "loss": 0.0259, "step": 3150 }, { "epoch": 1.0522624812155619, "grad_norm": 0.28714582443865505, "learning_rate": 8.203919854572126e-06, "loss": 0.025, "step": 3151 }, { "epoch": 1.0525964267824344, "grad_norm": 0.3077690157961476, "learning_rate": 8.202427674344246e-06, "loss": 0.0243, "step": 3152 }, { "epoch": 1.052930372349307, "grad_norm": 0.3751881310366447, "learning_rate": 8.200935010351958e-06, "loss": 0.0236, "step": 3153 }, { "epoch": 1.0532643179161796, "grad_norm": 0.35076399474792186, "learning_rate": 8.199441862820746e-06, "loss": 0.0248, "step": 3154 }, { "epoch": 1.0535982634830523, "grad_norm": 0.26307182834482473, "learning_rate": 8.197948231976169e-06, "loss": 0.0203, "step": 3155 }, { "epoch": 1.0539322090499248, "grad_norm": 0.3044129331544508, "learning_rate": 8.196454118043856e-06, "loss": 0.0302, "step": 3156 }, { "epoch": 1.0542661546167975, "grad_norm": 0.3627983241007162, "learning_rate": 8.194959521249512e-06, "loss": 0.0307, "step": 3157 }, { "epoch": 1.05460010018367, "grad_norm": 0.44319883100320684, "learning_rate": 8.193464441818913e-06, "loss": 0.0465, "step": 3158 }, { "epoch": 1.0549340457505427, "grad_norm": 0.2887795746389143, "learning_rate": 8.191968879977907e-06, "loss": 0.0245, "step": 3159 }, { "epoch": 1.0552679913174152, "grad_norm": 0.3032079505776515, "learning_rate": 8.190472835952419e-06, "loss": 0.0265, "step": 3160 }, { "epoch": 1.055601936884288, "grad_norm": 0.2592623562248579, "learning_rate": 8.188976309968443e-06, "loss": 0.0202, "step": 3161 }, { "epoch": 1.0559358824511604, "grad_norm": 0.36726483272764104, "learning_rate": 8.187479302252045e-06, "loss": 0.0294, "step": 3162 }, { "epoch": 1.0562698280180332, "grad_norm": 0.24711785591608035, "learning_rate": 8.185981813029368e-06, "loss": 0.0202, "step": 3163 }, { "epoch": 1.0566037735849056, "grad_norm": 0.567690906291969, "learning_rate": 8.184483842526623e-06, "loss": 0.0227, "step": 3164 }, { "epoch": 1.0569377191517784, "grad_norm": 0.26315373047195273, "learning_rate": 8.1829853909701e-06, "loss": 0.0216, "step": 3165 }, { "epoch": 1.0572716647186509, "grad_norm": 0.3377394298084983, "learning_rate": 8.181486458586153e-06, "loss": 0.0407, "step": 3166 }, { "epoch": 1.0576056102855234, "grad_norm": 0.3578886023927057, "learning_rate": 8.179987045601217e-06, "loss": 0.0261, "step": 3167 }, { "epoch": 1.057939555852396, "grad_norm": 0.32132008387067623, "learning_rate": 8.178487152241795e-06, "loss": 0.0295, "step": 3168 }, { "epoch": 1.0582735014192686, "grad_norm": 0.2954476552620977, "learning_rate": 8.17698677873446e-06, "loss": 0.033, "step": 3169 }, { "epoch": 1.0586074469861413, "grad_norm": 0.3270198201043449, "learning_rate": 8.175485925305867e-06, "loss": 0.0204, "step": 3170 }, { "epoch": 1.0589413925530138, "grad_norm": 0.340683504806724, "learning_rate": 8.173984592182736e-06, "loss": 0.0339, "step": 3171 }, { "epoch": 1.0592753381198865, "grad_norm": 0.30570215261127376, "learning_rate": 8.172482779591858e-06, "loss": 0.0291, "step": 3172 }, { "epoch": 1.059609283686759, "grad_norm": 0.37838930187846, "learning_rate": 8.170980487760101e-06, "loss": 0.0438, "step": 3173 }, { "epoch": 1.0599432292536317, "grad_norm": 0.3383647238519604, "learning_rate": 8.169477716914405e-06, "loss": 0.0297, "step": 3174 }, { "epoch": 1.0602771748205042, "grad_norm": 0.25213861969089507, "learning_rate": 8.16797446728178e-06, "loss": 0.0277, "step": 3175 }, { "epoch": 1.060611120387377, "grad_norm": 0.39284531943287526, "learning_rate": 8.16647073908931e-06, "loss": 0.0501, "step": 3176 }, { "epoch": 1.0609450659542494, "grad_norm": 0.40504969758581055, "learning_rate": 8.164966532564152e-06, "loss": 0.0406, "step": 3177 }, { "epoch": 1.0612790115211221, "grad_norm": 0.4228096924800613, "learning_rate": 8.163461847933532e-06, "loss": 0.0233, "step": 3178 }, { "epoch": 1.0616129570879946, "grad_norm": 0.3212886971594315, "learning_rate": 8.161956685424752e-06, "loss": 0.0267, "step": 3179 }, { "epoch": 1.0619469026548674, "grad_norm": 0.3145132469817971, "learning_rate": 8.160451045265183e-06, "loss": 0.0223, "step": 3180 }, { "epoch": 1.0622808482217398, "grad_norm": 0.2453353231679829, "learning_rate": 8.158944927682269e-06, "loss": 0.0234, "step": 3181 }, { "epoch": 1.0626147937886126, "grad_norm": 0.28972504686288075, "learning_rate": 8.157438332903531e-06, "loss": 0.0286, "step": 3182 }, { "epoch": 1.062948739355485, "grad_norm": 0.24724894945946668, "learning_rate": 8.155931261156555e-06, "loss": 0.0248, "step": 3183 }, { "epoch": 1.0632826849223576, "grad_norm": 0.3756606442742857, "learning_rate": 8.154423712669003e-06, "loss": 0.0321, "step": 3184 }, { "epoch": 1.0636166304892303, "grad_norm": 0.22225664087496289, "learning_rate": 8.152915687668603e-06, "loss": 0.0201, "step": 3185 }, { "epoch": 1.0639505760561028, "grad_norm": 0.31357879955688045, "learning_rate": 8.151407186383166e-06, "loss": 0.0221, "step": 3186 }, { "epoch": 1.0642845216229755, "grad_norm": 0.325060312972888, "learning_rate": 8.149898209040568e-06, "loss": 0.019, "step": 3187 }, { "epoch": 1.064618467189848, "grad_norm": 0.336164714357586, "learning_rate": 8.148388755868757e-06, "loss": 0.0221, "step": 3188 }, { "epoch": 1.0649524127567207, "grad_norm": 0.2754589561530737, "learning_rate": 8.146878827095751e-06, "loss": 0.0269, "step": 3189 }, { "epoch": 1.0652863583235932, "grad_norm": 0.38697959026680784, "learning_rate": 8.145368422949647e-06, "loss": 0.0319, "step": 3190 }, { "epoch": 1.065620303890466, "grad_norm": 0.2894212531269538, "learning_rate": 8.143857543658606e-06, "loss": 0.0218, "step": 3191 }, { "epoch": 1.0659542494573384, "grad_norm": 0.2614254696383892, "learning_rate": 8.142346189450866e-06, "loss": 0.023, "step": 3192 }, { "epoch": 1.0662881950242111, "grad_norm": 0.2484086201153225, "learning_rate": 8.140834360554734e-06, "loss": 0.0231, "step": 3193 }, { "epoch": 1.0666221405910836, "grad_norm": 0.34486282980188804, "learning_rate": 8.13932205719859e-06, "loss": 0.0333, "step": 3194 }, { "epoch": 1.0669560861579563, "grad_norm": 0.3475212405148576, "learning_rate": 8.137809279610885e-06, "loss": 0.024, "step": 3195 }, { "epoch": 1.0672900317248288, "grad_norm": 0.35655065259988694, "learning_rate": 8.13629602802014e-06, "loss": 0.0244, "step": 3196 }, { "epoch": 1.0676239772917016, "grad_norm": 0.4429028783957121, "learning_rate": 8.134782302654953e-06, "loss": 0.0402, "step": 3197 }, { "epoch": 1.067957922858574, "grad_norm": 0.5389386221395466, "learning_rate": 8.133268103743989e-06, "loss": 0.0437, "step": 3198 }, { "epoch": 1.0682918684254465, "grad_norm": 0.33079685403936093, "learning_rate": 8.131753431515984e-06, "loss": 0.0271, "step": 3199 }, { "epoch": 1.0686258139923193, "grad_norm": 0.34631306750481533, "learning_rate": 8.130238286199747e-06, "loss": 0.0324, "step": 3200 }, { "epoch": 1.0689597595591918, "grad_norm": 0.31929619605337367, "learning_rate": 8.128722668024161e-06, "loss": 0.0215, "step": 3201 }, { "epoch": 1.0692937051260645, "grad_norm": 0.43532157020234624, "learning_rate": 8.127206577218177e-06, "loss": 0.0421, "step": 3202 }, { "epoch": 1.069627650692937, "grad_norm": 0.2807873183261818, "learning_rate": 8.125690014010814e-06, "loss": 0.0251, "step": 3203 }, { "epoch": 1.0699615962598097, "grad_norm": 0.405320920289809, "learning_rate": 8.124172978631173e-06, "loss": 0.0311, "step": 3204 }, { "epoch": 1.0702955418266822, "grad_norm": 0.3051621480936762, "learning_rate": 8.12265547130842e-06, "loss": 0.0269, "step": 3205 }, { "epoch": 1.070629487393555, "grad_norm": 0.37517077772627283, "learning_rate": 8.121137492271787e-06, "loss": 0.0361, "step": 3206 }, { "epoch": 1.0709634329604274, "grad_norm": 0.3906482364046291, "learning_rate": 8.119619041750586e-06, "loss": 0.0366, "step": 3207 }, { "epoch": 1.0712973785273001, "grad_norm": 0.7720388272918915, "learning_rate": 8.118100119974197e-06, "loss": 0.0259, "step": 3208 }, { "epoch": 1.0716313240941726, "grad_norm": 0.34682141237815584, "learning_rate": 8.116580727172071e-06, "loss": 0.0216, "step": 3209 }, { "epoch": 1.0719652696610453, "grad_norm": 0.31560666044668617, "learning_rate": 8.115060863573729e-06, "loss": 0.021, "step": 3210 }, { "epoch": 1.0722992152279178, "grad_norm": 0.4940458994908807, "learning_rate": 8.113540529408766e-06, "loss": 0.0371, "step": 3211 }, { "epoch": 1.0726331607947905, "grad_norm": 0.30957068605851945, "learning_rate": 8.112019724906844e-06, "loss": 0.0258, "step": 3212 }, { "epoch": 1.072967106361663, "grad_norm": 0.4848843155643987, "learning_rate": 8.1104984502977e-06, "loss": 0.0321, "step": 3213 }, { "epoch": 1.0733010519285355, "grad_norm": 1.0662051334114149, "learning_rate": 8.108976705811138e-06, "loss": 0.0445, "step": 3214 }, { "epoch": 1.0736349974954082, "grad_norm": 0.21981439832497088, "learning_rate": 8.107454491677041e-06, "loss": 0.0192, "step": 3215 }, { "epoch": 1.0739689430622807, "grad_norm": 0.3710162574426828, "learning_rate": 8.10593180812535e-06, "loss": 0.0265, "step": 3216 }, { "epoch": 1.0743028886291535, "grad_norm": 0.37385849697364243, "learning_rate": 8.104408655386092e-06, "loss": 0.0464, "step": 3217 }, { "epoch": 1.074636834196026, "grad_norm": 1.4727901927499285, "learning_rate": 8.102885033689352e-06, "loss": 0.0278, "step": 3218 }, { "epoch": 1.0749707797628987, "grad_norm": 0.3679842685510149, "learning_rate": 8.101360943265293e-06, "loss": 0.0333, "step": 3219 }, { "epoch": 1.0753047253297712, "grad_norm": 0.45617417497121054, "learning_rate": 8.099836384344146e-06, "loss": 0.0326, "step": 3220 }, { "epoch": 1.075638670896644, "grad_norm": 0.3780406313130693, "learning_rate": 8.098311357156213e-06, "loss": 0.0273, "step": 3221 }, { "epoch": 1.0759726164635164, "grad_norm": 0.38085812855747886, "learning_rate": 8.096785861931868e-06, "loss": 0.0254, "step": 3222 }, { "epoch": 1.076306562030389, "grad_norm": 0.3362186305688881, "learning_rate": 8.095259898901557e-06, "loss": 0.034, "step": 3223 }, { "epoch": 1.0766405075972616, "grad_norm": 0.30395187211020996, "learning_rate": 8.09373346829579e-06, "loss": 0.0263, "step": 3224 }, { "epoch": 1.0769744531641343, "grad_norm": 0.24365452420250513, "learning_rate": 8.092206570345158e-06, "loss": 0.0181, "step": 3225 }, { "epoch": 1.0773083987310068, "grad_norm": 0.21241122610162672, "learning_rate": 8.090679205280311e-06, "loss": 0.0213, "step": 3226 }, { "epoch": 1.0776423442978795, "grad_norm": 0.3092399150533459, "learning_rate": 8.08915137333198e-06, "loss": 0.0312, "step": 3227 }, { "epoch": 1.077976289864752, "grad_norm": 0.45577254513776233, "learning_rate": 8.08762307473096e-06, "loss": 0.0458, "step": 3228 }, { "epoch": 1.0783102354316247, "grad_norm": 0.41877391528488833, "learning_rate": 8.08609430970812e-06, "loss": 0.0356, "step": 3229 }, { "epoch": 1.0786441809984972, "grad_norm": 0.2907717077806986, "learning_rate": 8.084565078494396e-06, "loss": 0.0231, "step": 3230 }, { "epoch": 1.07897812656537, "grad_norm": 0.3393837138554928, "learning_rate": 8.083035381320798e-06, "loss": 0.0209, "step": 3231 }, { "epoch": 1.0793120721322425, "grad_norm": 0.3478604414360994, "learning_rate": 8.081505218418403e-06, "loss": 0.0265, "step": 3232 }, { "epoch": 1.079646017699115, "grad_norm": 0.2954603675989673, "learning_rate": 8.079974590018363e-06, "loss": 0.0257, "step": 3233 }, { "epoch": 1.0799799632659877, "grad_norm": 0.3202770249602899, "learning_rate": 8.078443496351893e-06, "loss": 0.0283, "step": 3234 }, { "epoch": 1.0803139088328602, "grad_norm": 0.3527776921990553, "learning_rate": 8.076911937650288e-06, "loss": 0.0275, "step": 3235 }, { "epoch": 1.0806478543997329, "grad_norm": 0.3004797505469835, "learning_rate": 8.075379914144902e-06, "loss": 0.0365, "step": 3236 }, { "epoch": 1.0809817999666054, "grad_norm": 0.26293437171819317, "learning_rate": 8.073847426067172e-06, "loss": 0.0198, "step": 3237 }, { "epoch": 1.081315745533478, "grad_norm": 0.30384901190258123, "learning_rate": 8.072314473648595e-06, "loss": 0.0363, "step": 3238 }, { "epoch": 1.0816496911003506, "grad_norm": 0.3687712801154847, "learning_rate": 8.07078105712074e-06, "loss": 0.0327, "step": 3239 }, { "epoch": 1.0819836366672233, "grad_norm": 0.3405941440703542, "learning_rate": 8.06924717671525e-06, "loss": 0.0366, "step": 3240 }, { "epoch": 1.0823175822340958, "grad_norm": 0.24414727479773302, "learning_rate": 8.067712832663831e-06, "loss": 0.0237, "step": 3241 }, { "epoch": 1.0826515278009685, "grad_norm": 0.24157122723692315, "learning_rate": 8.066178025198272e-06, "loss": 0.021, "step": 3242 }, { "epoch": 1.082985473367841, "grad_norm": 0.3460307656468123, "learning_rate": 8.064642754550418e-06, "loss": 0.0287, "step": 3243 }, { "epoch": 1.0833194189347137, "grad_norm": 0.33932834150817975, "learning_rate": 8.06310702095219e-06, "loss": 0.0257, "step": 3244 }, { "epoch": 1.0836533645015862, "grad_norm": 0.34780098201834897, "learning_rate": 8.06157082463558e-06, "loss": 0.0392, "step": 3245 }, { "epoch": 1.083987310068459, "grad_norm": 0.3875529529464877, "learning_rate": 8.060034165832648e-06, "loss": 0.0353, "step": 3246 }, { "epoch": 1.0843212556353314, "grad_norm": 0.3928569828597569, "learning_rate": 8.058497044775526e-06, "loss": 0.0269, "step": 3247 }, { "epoch": 1.084655201202204, "grad_norm": 0.5550811794335628, "learning_rate": 8.05695946169641e-06, "loss": 0.0352, "step": 3248 }, { "epoch": 1.0849891467690767, "grad_norm": 0.3021650481495041, "learning_rate": 8.055421416827575e-06, "loss": 0.0232, "step": 3249 }, { "epoch": 1.0853230923359491, "grad_norm": 0.396961012865088, "learning_rate": 8.053882910401359e-06, "loss": 0.0288, "step": 3250 }, { "epoch": 1.0856570379028219, "grad_norm": 0.4026659132646484, "learning_rate": 8.052343942650168e-06, "loss": 0.0423, "step": 3251 }, { "epoch": 1.0859909834696944, "grad_norm": 0.394582221220994, "learning_rate": 8.050804513806488e-06, "loss": 0.0343, "step": 3252 }, { "epoch": 1.086324929036567, "grad_norm": 0.36225579539614333, "learning_rate": 8.049264624102862e-06, "loss": 0.0309, "step": 3253 }, { "epoch": 1.0866588746034396, "grad_norm": 0.3280866037074307, "learning_rate": 8.047724273771909e-06, "loss": 0.027, "step": 3254 }, { "epoch": 1.0869928201703123, "grad_norm": 0.4655193854035031, "learning_rate": 8.046183463046322e-06, "loss": 0.0355, "step": 3255 }, { "epoch": 1.0873267657371848, "grad_norm": 0.30646720508211767, "learning_rate": 8.044642192158854e-06, "loss": 0.0323, "step": 3256 }, { "epoch": 1.0876607113040575, "grad_norm": 0.26320109588751534, "learning_rate": 8.043100461342332e-06, "loss": 0.0218, "step": 3257 }, { "epoch": 1.08799465687093, "grad_norm": 0.30419079651848474, "learning_rate": 8.041558270829655e-06, "loss": 0.0334, "step": 3258 }, { "epoch": 1.0883286024378027, "grad_norm": 0.37646678852625404, "learning_rate": 8.04001562085379e-06, "loss": 0.0248, "step": 3259 }, { "epoch": 1.0886625480046752, "grad_norm": 0.29796456459806475, "learning_rate": 8.038472511647768e-06, "loss": 0.0275, "step": 3260 }, { "epoch": 1.088996493571548, "grad_norm": 0.34171278276903755, "learning_rate": 8.036928943444698e-06, "loss": 0.0281, "step": 3261 }, { "epoch": 1.0893304391384204, "grad_norm": 0.29628993929471975, "learning_rate": 8.03538491647775e-06, "loss": 0.0202, "step": 3262 }, { "epoch": 1.089664384705293, "grad_norm": 0.3659770174939796, "learning_rate": 8.03384043098017e-06, "loss": 0.026, "step": 3263 }, { "epoch": 1.0899983302721656, "grad_norm": 0.5871669117756957, "learning_rate": 8.032295487185273e-06, "loss": 0.0488, "step": 3264 }, { "epoch": 1.0903322758390381, "grad_norm": 0.2862753129702655, "learning_rate": 8.030750085326438e-06, "loss": 0.0314, "step": 3265 }, { "epoch": 1.0906662214059109, "grad_norm": 0.341660846931696, "learning_rate": 8.029204225637114e-06, "loss": 0.0323, "step": 3266 }, { "epoch": 1.0910001669727833, "grad_norm": 0.26031377523360283, "learning_rate": 8.027657908350826e-06, "loss": 0.023, "step": 3267 }, { "epoch": 1.091334112539656, "grad_norm": 0.4909674930624971, "learning_rate": 8.026111133701162e-06, "loss": 0.0394, "step": 3268 }, { "epoch": 1.0916680581065286, "grad_norm": 0.4673577196433493, "learning_rate": 8.02456390192178e-06, "loss": 0.0422, "step": 3269 }, { "epoch": 1.0920020036734013, "grad_norm": 0.5810221713722641, "learning_rate": 8.023016213246406e-06, "loss": 0.0242, "step": 3270 }, { "epoch": 1.0923359492402738, "grad_norm": 0.37038397677449103, "learning_rate": 8.021468067908839e-06, "loss": 0.032, "step": 3271 }, { "epoch": 1.0926698948071465, "grad_norm": 0.4574071800556038, "learning_rate": 8.019919466142945e-06, "loss": 0.0341, "step": 3272 }, { "epoch": 1.093003840374019, "grad_norm": 0.43810029082879615, "learning_rate": 8.018370408182655e-06, "loss": 0.0348, "step": 3273 }, { "epoch": 1.0933377859408917, "grad_norm": 0.46502945180075905, "learning_rate": 8.016820894261975e-06, "loss": 0.0356, "step": 3274 }, { "epoch": 1.0936717315077642, "grad_norm": 0.4901221523760667, "learning_rate": 8.015270924614977e-06, "loss": 0.0378, "step": 3275 }, { "epoch": 1.094005677074637, "grad_norm": 0.43822557031995935, "learning_rate": 8.013720499475804e-06, "loss": 0.0426, "step": 3276 }, { "epoch": 1.0943396226415094, "grad_norm": 0.3551130156793218, "learning_rate": 8.012169619078662e-06, "loss": 0.0277, "step": 3277 }, { "epoch": 1.0946735682083821, "grad_norm": 0.34961132896253827, "learning_rate": 8.010618283657834e-06, "loss": 0.0337, "step": 3278 }, { "epoch": 1.0950075137752546, "grad_norm": 0.3009876818314765, "learning_rate": 8.009066493447664e-06, "loss": 0.0257, "step": 3279 }, { "epoch": 1.0953414593421273, "grad_norm": 0.35858495140204744, "learning_rate": 8.00751424868257e-06, "loss": 0.0303, "step": 3280 }, { "epoch": 1.0956754049089998, "grad_norm": 0.3387476640151862, "learning_rate": 8.005961549597037e-06, "loss": 0.0324, "step": 3281 }, { "epoch": 1.0960093504758723, "grad_norm": 0.4290414442587191, "learning_rate": 8.004408396425617e-06, "loss": 0.029, "step": 3282 }, { "epoch": 1.096343296042745, "grad_norm": 0.33094967279971516, "learning_rate": 8.002854789402931e-06, "loss": 0.0253, "step": 3283 }, { "epoch": 1.0966772416096175, "grad_norm": 0.2586770805586634, "learning_rate": 8.001300728763674e-06, "loss": 0.0262, "step": 3284 }, { "epoch": 1.0970111871764903, "grad_norm": 0.3288852460265123, "learning_rate": 7.999746214742603e-06, "loss": 0.0281, "step": 3285 }, { "epoch": 1.0973451327433628, "grad_norm": 0.2988088829590742, "learning_rate": 7.998191247574545e-06, "loss": 0.0218, "step": 3286 }, { "epoch": 1.0976790783102355, "grad_norm": 0.49335428203556075, "learning_rate": 7.996635827494397e-06, "loss": 0.0237, "step": 3287 }, { "epoch": 1.098013023877108, "grad_norm": 0.38631025593619045, "learning_rate": 7.995079954737122e-06, "loss": 0.0312, "step": 3288 }, { "epoch": 1.0983469694439807, "grad_norm": 0.3600579178108258, "learning_rate": 7.993523629537753e-06, "loss": 0.0338, "step": 3289 }, { "epoch": 1.0986809150108532, "grad_norm": 0.30779911947153704, "learning_rate": 7.991966852131394e-06, "loss": 0.0219, "step": 3290 }, { "epoch": 1.099014860577726, "grad_norm": 0.2905441947150159, "learning_rate": 7.990409622753212e-06, "loss": 0.0284, "step": 3291 }, { "epoch": 1.0993488061445984, "grad_norm": 0.306190108652269, "learning_rate": 7.988851941638445e-06, "loss": 0.0272, "step": 3292 }, { "epoch": 1.0996827517114711, "grad_norm": 0.3428238124277849, "learning_rate": 7.987293809022401e-06, "loss": 0.0368, "step": 3293 }, { "epoch": 1.1000166972783436, "grad_norm": 0.42435268643800517, "learning_rate": 7.985735225140452e-06, "loss": 0.0396, "step": 3294 }, { "epoch": 1.1003506428452163, "grad_norm": 0.5434448966943168, "learning_rate": 7.984176190228042e-06, "loss": 0.0338, "step": 3295 }, { "epoch": 1.1006845884120888, "grad_norm": 0.354328827694649, "learning_rate": 7.98261670452068e-06, "loss": 0.0386, "step": 3296 }, { "epoch": 1.1010185339789613, "grad_norm": 0.35602064466517996, "learning_rate": 7.981056768253945e-06, "loss": 0.032, "step": 3297 }, { "epoch": 1.101352479545834, "grad_norm": 0.2941162933646615, "learning_rate": 7.979496381663486e-06, "loss": 0.0236, "step": 3298 }, { "epoch": 1.1016864251127065, "grad_norm": 0.38612453583250106, "learning_rate": 7.977935544985016e-06, "loss": 0.0317, "step": 3299 }, { "epoch": 1.1020203706795793, "grad_norm": 0.3799863500385795, "learning_rate": 7.976374258454317e-06, "loss": 0.0289, "step": 3300 }, { "epoch": 1.1023543162464517, "grad_norm": 0.3268097291310616, "learning_rate": 7.97481252230724e-06, "loss": 0.0258, "step": 3301 }, { "epoch": 1.1026882618133245, "grad_norm": 0.39369341443797184, "learning_rate": 7.973250336779705e-06, "loss": 0.0322, "step": 3302 }, { "epoch": 1.103022207380197, "grad_norm": 0.3737112857749979, "learning_rate": 7.971687702107698e-06, "loss": 0.0279, "step": 3303 }, { "epoch": 1.1033561529470697, "grad_norm": 0.35479945345298036, "learning_rate": 7.970124618527274e-06, "loss": 0.0299, "step": 3304 }, { "epoch": 1.1036900985139422, "grad_norm": 0.31349347013890455, "learning_rate": 7.968561086274553e-06, "loss": 0.0238, "step": 3305 }, { "epoch": 1.104024044080815, "grad_norm": 0.2481600213298206, "learning_rate": 7.966997105585727e-06, "loss": 0.0263, "step": 3306 }, { "epoch": 1.1043579896476874, "grad_norm": 0.3634651565674134, "learning_rate": 7.965432676697052e-06, "loss": 0.0295, "step": 3307 }, { "epoch": 1.10469193521456, "grad_norm": 0.23875067279052692, "learning_rate": 7.963867799844855e-06, "loss": 0.0223, "step": 3308 }, { "epoch": 1.1050258807814326, "grad_norm": 0.25377720340243354, "learning_rate": 7.962302475265527e-06, "loss": 0.0218, "step": 3309 }, { "epoch": 1.1053598263483053, "grad_norm": 0.31721599881937135, "learning_rate": 7.960736703195533e-06, "loss": 0.0225, "step": 3310 }, { "epoch": 1.1056937719151778, "grad_norm": 0.31578452859925404, "learning_rate": 7.959170483871398e-06, "loss": 0.0256, "step": 3311 }, { "epoch": 1.1060277174820503, "grad_norm": 0.39843821196394125, "learning_rate": 7.957603817529715e-06, "loss": 0.0348, "step": 3312 }, { "epoch": 1.106361663048923, "grad_norm": 0.4255737697271387, "learning_rate": 7.956036704407153e-06, "loss": 0.0411, "step": 3313 }, { "epoch": 1.1066956086157955, "grad_norm": 0.2831333730132518, "learning_rate": 7.954469144740441e-06, "loss": 0.0224, "step": 3314 }, { "epoch": 1.1070295541826682, "grad_norm": 0.3944181528504548, "learning_rate": 7.952901138766376e-06, "loss": 0.0358, "step": 3315 }, { "epoch": 1.1073634997495407, "grad_norm": 0.3531085915481224, "learning_rate": 7.951332686721825e-06, "loss": 0.04, "step": 3316 }, { "epoch": 1.1076974453164135, "grad_norm": 0.2681560115879675, "learning_rate": 7.94976378884372e-06, "loss": 0.0222, "step": 3317 }, { "epoch": 1.108031390883286, "grad_norm": 0.3364026241493204, "learning_rate": 7.948194445369065e-06, "loss": 0.0278, "step": 3318 }, { "epoch": 1.1083653364501587, "grad_norm": 0.34866005885048446, "learning_rate": 7.946624656534922e-06, "loss": 0.0342, "step": 3319 }, { "epoch": 1.1086992820170312, "grad_norm": 0.36641364979886615, "learning_rate": 7.945054422578432e-06, "loss": 0.0324, "step": 3320 }, { "epoch": 1.1090332275839039, "grad_norm": 0.34940742331356994, "learning_rate": 7.943483743736793e-06, "loss": 0.033, "step": 3321 }, { "epoch": 1.1093671731507764, "grad_norm": 0.355563802184475, "learning_rate": 7.941912620247276e-06, "loss": 0.0276, "step": 3322 }, { "epoch": 1.109701118717649, "grad_norm": 0.45120068049438616, "learning_rate": 7.940341052347219e-06, "loss": 0.0352, "step": 3323 }, { "epoch": 1.1100350642845216, "grad_norm": 0.40408839744135716, "learning_rate": 7.938769040274022e-06, "loss": 0.0417, "step": 3324 }, { "epoch": 1.1103690098513943, "grad_norm": 0.4338686807243482, "learning_rate": 7.937196584265161e-06, "loss": 0.0313, "step": 3325 }, { "epoch": 1.1107029554182668, "grad_norm": 0.29181078715721387, "learning_rate": 7.93562368455817e-06, "loss": 0.0249, "step": 3326 }, { "epoch": 1.1110369009851395, "grad_norm": 0.4185676067944589, "learning_rate": 7.934050341390659e-06, "loss": 0.0234, "step": 3327 }, { "epoch": 1.111370846552012, "grad_norm": 0.2767702565319191, "learning_rate": 7.932476555000294e-06, "loss": 0.0243, "step": 3328 }, { "epoch": 1.1117047921188847, "grad_norm": 0.29143933795252175, "learning_rate": 7.930902325624816e-06, "loss": 0.0217, "step": 3329 }, { "epoch": 1.1120387376857572, "grad_norm": 0.27250096630310733, "learning_rate": 7.929327653502032e-06, "loss": 0.022, "step": 3330 }, { "epoch": 1.1123726832526297, "grad_norm": 0.3517864948807678, "learning_rate": 7.927752538869816e-06, "loss": 0.0294, "step": 3331 }, { "epoch": 1.1127066288195024, "grad_norm": 0.42682315501132234, "learning_rate": 7.926176981966102e-06, "loss": 0.0368, "step": 3332 }, { "epoch": 1.113040574386375, "grad_norm": 0.5442250868163462, "learning_rate": 7.924600983028903e-06, "loss": 0.0431, "step": 3333 }, { "epoch": 1.1133745199532477, "grad_norm": 0.332474222206286, "learning_rate": 7.92302454229629e-06, "loss": 0.0246, "step": 3334 }, { "epoch": 1.1137084655201201, "grad_norm": 0.3218443871111087, "learning_rate": 7.9214476600064e-06, "loss": 0.0318, "step": 3335 }, { "epoch": 1.1140424110869929, "grad_norm": 0.4195822755083483, "learning_rate": 7.919870336397444e-06, "loss": 0.0353, "step": 3336 }, { "epoch": 1.1143763566538654, "grad_norm": 0.37589029930284823, "learning_rate": 7.918292571707693e-06, "loss": 0.0255, "step": 3337 }, { "epoch": 1.114710302220738, "grad_norm": 0.3726668985785104, "learning_rate": 7.916714366175487e-06, "loss": 0.0303, "step": 3338 }, { "epoch": 1.1150442477876106, "grad_norm": 0.2869112064253432, "learning_rate": 7.915135720039233e-06, "loss": 0.0251, "step": 3339 }, { "epoch": 1.1153781933544833, "grad_norm": 0.3122401429651216, "learning_rate": 7.913556633537403e-06, "loss": 0.0242, "step": 3340 }, { "epoch": 1.1157121389213558, "grad_norm": 0.33236127861053366, "learning_rate": 7.91197710690854e-06, "loss": 0.0362, "step": 3341 }, { "epoch": 1.1160460844882285, "grad_norm": 0.3136339582547713, "learning_rate": 7.910397140391244e-06, "loss": 0.0287, "step": 3342 }, { "epoch": 1.116380030055101, "grad_norm": 0.32873912375824965, "learning_rate": 7.908816734224195e-06, "loss": 0.0329, "step": 3343 }, { "epoch": 1.1167139756219737, "grad_norm": 0.4083856391063721, "learning_rate": 7.907235888646126e-06, "loss": 0.0379, "step": 3344 }, { "epoch": 1.1170479211888462, "grad_norm": 0.43212270409282794, "learning_rate": 7.905654603895843e-06, "loss": 0.0375, "step": 3345 }, { "epoch": 1.1173818667557187, "grad_norm": 0.45992624147429745, "learning_rate": 7.90407288021222e-06, "loss": 0.0306, "step": 3346 }, { "epoch": 1.1177158123225914, "grad_norm": 0.3892224439462069, "learning_rate": 7.902490717834196e-06, "loss": 0.037, "step": 3347 }, { "epoch": 1.118049757889464, "grad_norm": 0.28294143055491255, "learning_rate": 7.90090811700077e-06, "loss": 0.0219, "step": 3348 }, { "epoch": 1.1183837034563366, "grad_norm": 0.32420382240075996, "learning_rate": 7.899325077951018e-06, "loss": 0.0278, "step": 3349 }, { "epoch": 1.1187176490232091, "grad_norm": 0.23351130904306372, "learning_rate": 7.897741600924073e-06, "loss": 0.0197, "step": 3350 }, { "epoch": 1.1190515945900819, "grad_norm": 0.27507459110299837, "learning_rate": 7.896157686159142e-06, "loss": 0.0227, "step": 3351 }, { "epoch": 1.1193855401569544, "grad_norm": 0.3928012030302448, "learning_rate": 7.89457333389549e-06, "loss": 0.0338, "step": 3352 }, { "epoch": 1.119719485723827, "grad_norm": 0.27000272290262073, "learning_rate": 7.892988544372454e-06, "loss": 0.0204, "step": 3353 }, { "epoch": 1.1200534312906996, "grad_norm": 0.3192793404473544, "learning_rate": 7.891403317829434e-06, "loss": 0.0291, "step": 3354 }, { "epoch": 1.1203873768575723, "grad_norm": 0.3425808568634987, "learning_rate": 7.889817654505897e-06, "loss": 0.0314, "step": 3355 }, { "epoch": 1.1207213224244448, "grad_norm": 0.28002198089551705, "learning_rate": 7.888231554641377e-06, "loss": 0.0267, "step": 3356 }, { "epoch": 1.1210552679913175, "grad_norm": 0.27841002206811855, "learning_rate": 7.886645018475474e-06, "loss": 0.0224, "step": 3357 }, { "epoch": 1.12138921355819, "grad_norm": 0.9556836898606712, "learning_rate": 7.885058046247852e-06, "loss": 0.0561, "step": 3358 }, { "epoch": 1.1217231591250627, "grad_norm": 0.2725277534227355, "learning_rate": 7.88347063819824e-06, "loss": 0.024, "step": 3359 }, { "epoch": 1.1220571046919352, "grad_norm": 0.3506925856290798, "learning_rate": 7.881882794566438e-06, "loss": 0.0308, "step": 3360 }, { "epoch": 1.1223910502588077, "grad_norm": 0.3017350492143483, "learning_rate": 7.880294515592304e-06, "loss": 0.0278, "step": 3361 }, { "epoch": 1.1227249958256804, "grad_norm": 0.3173029659624254, "learning_rate": 7.878705801515772e-06, "loss": 0.0287, "step": 3362 }, { "epoch": 1.123058941392553, "grad_norm": 0.33327651253665935, "learning_rate": 7.877116652576832e-06, "loss": 0.0282, "step": 3363 }, { "epoch": 1.1233928869594256, "grad_norm": 0.3330771960433005, "learning_rate": 7.875527069015545e-06, "loss": 0.0249, "step": 3364 }, { "epoch": 1.1237268325262981, "grad_norm": 0.31208907904599537, "learning_rate": 7.873937051072037e-06, "loss": 0.0238, "step": 3365 }, { "epoch": 1.1240607780931708, "grad_norm": 0.3232661666692226, "learning_rate": 7.872346598986496e-06, "loss": 0.0324, "step": 3366 }, { "epoch": 1.1243947236600433, "grad_norm": 0.5704434100953263, "learning_rate": 7.87075571299918e-06, "loss": 0.0459, "step": 3367 }, { "epoch": 1.124728669226916, "grad_norm": 0.25271598829371567, "learning_rate": 7.869164393350412e-06, "loss": 0.0191, "step": 3368 }, { "epoch": 1.1250626147937886, "grad_norm": 0.3784736957459662, "learning_rate": 7.86757264028058e-06, "loss": 0.0307, "step": 3369 }, { "epoch": 1.1253965603606613, "grad_norm": 0.27817786100942254, "learning_rate": 7.865980454030135e-06, "loss": 0.0329, "step": 3370 }, { "epoch": 1.1257305059275338, "grad_norm": 0.3253548797529124, "learning_rate": 7.864387834839598e-06, "loss": 0.0256, "step": 3371 }, { "epoch": 1.1260644514944065, "grad_norm": 0.37085916865559204, "learning_rate": 7.86279478294955e-06, "loss": 0.0291, "step": 3372 }, { "epoch": 1.126398397061279, "grad_norm": 0.43750061606167917, "learning_rate": 7.861201298600642e-06, "loss": 0.0438, "step": 3373 }, { "epoch": 1.1267323426281517, "grad_norm": 0.3429848284373353, "learning_rate": 7.85960738203359e-06, "loss": 0.0295, "step": 3374 }, { "epoch": 1.1270662881950242, "grad_norm": 0.519164072515975, "learning_rate": 7.858013033489171e-06, "loss": 0.039, "step": 3375 }, { "epoch": 1.1274002337618967, "grad_norm": 0.3046523315968954, "learning_rate": 7.856418253208232e-06, "loss": 0.0454, "step": 3376 }, { "epoch": 1.1277341793287694, "grad_norm": 0.33487579971327147, "learning_rate": 7.85482304143168e-06, "loss": 0.0272, "step": 3377 }, { "epoch": 1.1280681248956421, "grad_norm": 0.33452345204735684, "learning_rate": 7.853227398400495e-06, "loss": 0.0221, "step": 3378 }, { "epoch": 1.1284020704625146, "grad_norm": 0.2993478347966669, "learning_rate": 7.851631324355717e-06, "loss": 0.02, "step": 3379 }, { "epoch": 1.1287360160293871, "grad_norm": 0.3101706399966744, "learning_rate": 7.850034819538448e-06, "loss": 0.0319, "step": 3380 }, { "epoch": 1.1290699615962598, "grad_norm": 0.30946031624112036, "learning_rate": 7.848437884189864e-06, "loss": 0.0284, "step": 3381 }, { "epoch": 1.1294039071631323, "grad_norm": 0.35003207065103414, "learning_rate": 7.846840518551197e-06, "loss": 0.0366, "step": 3382 }, { "epoch": 1.129737852730005, "grad_norm": 0.29124555633635885, "learning_rate": 7.845242722863749e-06, "loss": 0.031, "step": 3383 }, { "epoch": 1.1300717982968775, "grad_norm": 0.330688106020703, "learning_rate": 7.843644497368886e-06, "loss": 0.0338, "step": 3384 }, { "epoch": 1.1304057438637503, "grad_norm": 0.5756691362504007, "learning_rate": 7.842045842308038e-06, "loss": 0.045, "step": 3385 }, { "epoch": 1.1307396894306228, "grad_norm": 0.3253252750964825, "learning_rate": 7.840446757922704e-06, "loss": 0.022, "step": 3386 }, { "epoch": 1.1310736349974955, "grad_norm": 0.339599458339216, "learning_rate": 7.838847244454441e-06, "loss": 0.0256, "step": 3387 }, { "epoch": 1.131407580564368, "grad_norm": 0.3720567727910156, "learning_rate": 7.837247302144874e-06, "loss": 0.0329, "step": 3388 }, { "epoch": 1.1317415261312407, "grad_norm": 0.36887312774570036, "learning_rate": 7.835646931235697e-06, "loss": 0.0321, "step": 3389 }, { "epoch": 1.1320754716981132, "grad_norm": 0.3199150608989669, "learning_rate": 7.83404613196866e-06, "loss": 0.0263, "step": 3390 }, { "epoch": 1.132409417264986, "grad_norm": 0.3576472141589384, "learning_rate": 7.832444904585587e-06, "loss": 0.0298, "step": 3391 }, { "epoch": 1.1327433628318584, "grad_norm": 0.33277393919868725, "learning_rate": 7.83084324932836e-06, "loss": 0.0412, "step": 3392 }, { "epoch": 1.133077308398731, "grad_norm": 0.3431785654729045, "learning_rate": 7.829241166438925e-06, "loss": 0.0312, "step": 3393 }, { "epoch": 1.1334112539656036, "grad_norm": 0.25972205424157063, "learning_rate": 7.827638656159302e-06, "loss": 0.0203, "step": 3394 }, { "epoch": 1.133745199532476, "grad_norm": 0.3890664642320907, "learning_rate": 7.826035718731564e-06, "loss": 0.0445, "step": 3395 }, { "epoch": 1.1340791450993488, "grad_norm": 0.24772805025026107, "learning_rate": 7.824432354397857e-06, "loss": 0.0266, "step": 3396 }, { "epoch": 1.1344130906662213, "grad_norm": 0.37753984036104876, "learning_rate": 7.822828563400384e-06, "loss": 0.0236, "step": 3397 }, { "epoch": 1.134747036233094, "grad_norm": 0.3605642048938382, "learning_rate": 7.82122434598142e-06, "loss": 0.0342, "step": 3398 }, { "epoch": 1.1350809817999665, "grad_norm": 0.3795237258187998, "learning_rate": 7.819619702383299e-06, "loss": 0.0304, "step": 3399 }, { "epoch": 1.1354149273668392, "grad_norm": 0.35098445157073305, "learning_rate": 7.818014632848422e-06, "loss": 0.0318, "step": 3400 }, { "epoch": 1.1357488729337117, "grad_norm": 0.24736562010249868, "learning_rate": 7.816409137619254e-06, "loss": 0.0278, "step": 3401 }, { "epoch": 1.1360828185005845, "grad_norm": 0.34561163672602185, "learning_rate": 7.814803216938324e-06, "loss": 0.0283, "step": 3402 }, { "epoch": 1.136416764067457, "grad_norm": 0.2811089597579645, "learning_rate": 7.813196871048226e-06, "loss": 0.0246, "step": 3403 }, { "epoch": 1.1367507096343297, "grad_norm": 0.29229247333057806, "learning_rate": 7.811590100191613e-06, "loss": 0.0281, "step": 3404 }, { "epoch": 1.1370846552012022, "grad_norm": 0.3233596938603605, "learning_rate": 7.809982904611213e-06, "loss": 0.0294, "step": 3405 }, { "epoch": 1.1374186007680749, "grad_norm": 0.2938492505209193, "learning_rate": 7.808375284549807e-06, "loss": 0.0314, "step": 3406 }, { "epoch": 1.1377525463349474, "grad_norm": 0.3974430989729181, "learning_rate": 7.806767240250248e-06, "loss": 0.0349, "step": 3407 }, { "epoch": 1.13808649190182, "grad_norm": 0.37393621261776927, "learning_rate": 7.805158771955448e-06, "loss": 0.0288, "step": 3408 }, { "epoch": 1.1384204374686926, "grad_norm": 0.31201214590230125, "learning_rate": 7.803549879908385e-06, "loss": 0.0269, "step": 3409 }, { "epoch": 1.138754383035565, "grad_norm": 0.23628934183711123, "learning_rate": 7.801940564352103e-06, "loss": 0.0238, "step": 3410 }, { "epoch": 1.1390883286024378, "grad_norm": 0.3178175273801832, "learning_rate": 7.800330825529707e-06, "loss": 0.0289, "step": 3411 }, { "epoch": 1.1394222741693105, "grad_norm": 0.28193244763416103, "learning_rate": 7.798720663684367e-06, "loss": 0.027, "step": 3412 }, { "epoch": 1.139756219736183, "grad_norm": 0.33395340241939425, "learning_rate": 7.797110079059315e-06, "loss": 0.0303, "step": 3413 }, { "epoch": 1.1400901653030555, "grad_norm": 0.3245023901004787, "learning_rate": 7.795499071897855e-06, "loss": 0.025, "step": 3414 }, { "epoch": 1.1404241108699282, "grad_norm": 0.9728514347695408, "learning_rate": 7.79388764244334e-06, "loss": 0.0245, "step": 3415 }, { "epoch": 1.1407580564368007, "grad_norm": 0.32287542439954203, "learning_rate": 7.792275790939202e-06, "loss": 0.024, "step": 3416 }, { "epoch": 1.1410920020036734, "grad_norm": 0.41902094576109844, "learning_rate": 7.790663517628927e-06, "loss": 0.0259, "step": 3417 }, { "epoch": 1.141425947570546, "grad_norm": 0.27249682731176167, "learning_rate": 7.789050822756068e-06, "loss": 0.0235, "step": 3418 }, { "epoch": 1.1417598931374187, "grad_norm": 0.25567186294511735, "learning_rate": 7.787437706564243e-06, "loss": 0.0243, "step": 3419 }, { "epoch": 1.1420938387042912, "grad_norm": 0.3892942977962352, "learning_rate": 7.78582416929713e-06, "loss": 0.0249, "step": 3420 }, { "epoch": 1.1424277842711639, "grad_norm": 0.6207323860141518, "learning_rate": 7.784210211198475e-06, "loss": 0.0306, "step": 3421 }, { "epoch": 1.1427617298380364, "grad_norm": 0.3786748110839557, "learning_rate": 7.782595832512086e-06, "loss": 0.0312, "step": 3422 }, { "epoch": 1.143095675404909, "grad_norm": 0.44842472367060976, "learning_rate": 7.780981033481832e-06, "loss": 0.0214, "step": 3423 }, { "epoch": 1.1434296209717816, "grad_norm": 0.4241362850976065, "learning_rate": 7.779365814351648e-06, "loss": 0.0382, "step": 3424 }, { "epoch": 1.143763566538654, "grad_norm": 0.4863833526126975, "learning_rate": 7.77775017536553e-06, "loss": 0.0452, "step": 3425 }, { "epoch": 1.1440975121055268, "grad_norm": 0.3739248050143184, "learning_rate": 7.776134116767544e-06, "loss": 0.0359, "step": 3426 }, { "epoch": 1.1444314576723995, "grad_norm": 0.4540275651438344, "learning_rate": 7.774517638801808e-06, "loss": 0.0278, "step": 3427 }, { "epoch": 1.144765403239272, "grad_norm": 0.3285748768169526, "learning_rate": 7.772900741712516e-06, "loss": 0.0293, "step": 3428 }, { "epoch": 1.1450993488061445, "grad_norm": 0.41110587198841414, "learning_rate": 7.771283425743916e-06, "loss": 0.0336, "step": 3429 }, { "epoch": 1.1454332943730172, "grad_norm": 0.2805074428621494, "learning_rate": 7.769665691140325e-06, "loss": 0.0222, "step": 3430 }, { "epoch": 1.1457672399398897, "grad_norm": 0.4080248079241784, "learning_rate": 7.76804753814612e-06, "loss": 0.04, "step": 3431 }, { "epoch": 1.1461011855067624, "grad_norm": 0.34620183085375505, "learning_rate": 7.76642896700574e-06, "loss": 0.0237, "step": 3432 }, { "epoch": 1.146435131073635, "grad_norm": 0.3722667167027366, "learning_rate": 7.764809977963692e-06, "loss": 0.0262, "step": 3433 }, { "epoch": 1.1467690766405076, "grad_norm": 0.30275538331877355, "learning_rate": 7.763190571264542e-06, "loss": 0.025, "step": 3434 }, { "epoch": 1.1471030222073801, "grad_norm": 0.39944709590348804, "learning_rate": 7.761570747152923e-06, "loss": 0.0236, "step": 3435 }, { "epoch": 1.1474369677742529, "grad_norm": 0.256332459908972, "learning_rate": 7.759950505873523e-06, "loss": 0.0233, "step": 3436 }, { "epoch": 1.1477709133411254, "grad_norm": 0.48252140488184914, "learning_rate": 7.758329847671103e-06, "loss": 0.0362, "step": 3437 }, { "epoch": 1.148104858907998, "grad_norm": 0.3171989479466795, "learning_rate": 7.75670877279048e-06, "loss": 0.0222, "step": 3438 }, { "epoch": 1.1484388044748706, "grad_norm": 0.2702070056143434, "learning_rate": 7.755087281476539e-06, "loss": 0.024, "step": 3439 }, { "epoch": 1.1487727500417433, "grad_norm": 0.24206119047571278, "learning_rate": 7.753465373974223e-06, "loss": 0.0225, "step": 3440 }, { "epoch": 1.1491066956086158, "grad_norm": 0.41400502979105464, "learning_rate": 7.751843050528543e-06, "loss": 0.0299, "step": 3441 }, { "epoch": 1.1494406411754885, "grad_norm": 0.3980616335286519, "learning_rate": 7.750220311384567e-06, "loss": 0.0392, "step": 3442 }, { "epoch": 1.149774586742361, "grad_norm": 0.26094169338484696, "learning_rate": 7.748597156787429e-06, "loss": 0.0231, "step": 3443 }, { "epoch": 1.1501085323092335, "grad_norm": 0.3244504831386949, "learning_rate": 7.746973586982328e-06, "loss": 0.029, "step": 3444 }, { "epoch": 1.1504424778761062, "grad_norm": 0.2547299448602225, "learning_rate": 7.745349602214522e-06, "loss": 0.0182, "step": 3445 }, { "epoch": 1.1507764234429787, "grad_norm": 0.32000861461781827, "learning_rate": 7.743725202729335e-06, "loss": 0.0291, "step": 3446 }, { "epoch": 1.1511103690098514, "grad_norm": 0.35683895137351757, "learning_rate": 7.742100388772148e-06, "loss": 0.0235, "step": 3447 }, { "epoch": 1.151444314576724, "grad_norm": 0.26881767875180934, "learning_rate": 7.74047516058841e-06, "loss": 0.0274, "step": 3448 }, { "epoch": 1.1517782601435966, "grad_norm": 0.32882443243371134, "learning_rate": 7.73884951842363e-06, "loss": 0.0347, "step": 3449 }, { "epoch": 1.1521122057104691, "grad_norm": 0.34210788270298503, "learning_rate": 7.737223462523383e-06, "loss": 0.0306, "step": 3450 }, { "epoch": 1.1524461512773418, "grad_norm": 0.32502201568148453, "learning_rate": 7.735596993133303e-06, "loss": 0.0322, "step": 3451 }, { "epoch": 1.1527800968442143, "grad_norm": 0.30890188288415493, "learning_rate": 7.733970110499086e-06, "loss": 0.0222, "step": 3452 }, { "epoch": 1.153114042411087, "grad_norm": 0.35388885412840615, "learning_rate": 7.732342814866489e-06, "loss": 0.0264, "step": 3453 }, { "epoch": 1.1534479879779596, "grad_norm": 0.33965422979713805, "learning_rate": 7.730715106481342e-06, "loss": 0.0338, "step": 3454 }, { "epoch": 1.1537819335448323, "grad_norm": 0.24510797290870673, "learning_rate": 7.729086985589523e-06, "loss": 0.021, "step": 3455 }, { "epoch": 1.1541158791117048, "grad_norm": 0.2540203541644058, "learning_rate": 7.72745845243698e-06, "loss": 0.0216, "step": 3456 }, { "epoch": 1.1544498246785775, "grad_norm": 0.43230712144777433, "learning_rate": 7.725829507269723e-06, "loss": 0.0245, "step": 3457 }, { "epoch": 1.15478377024545, "grad_norm": 0.45834909309480487, "learning_rate": 7.724200150333826e-06, "loss": 0.0318, "step": 3458 }, { "epoch": 1.1551177158123225, "grad_norm": 0.3044753513105623, "learning_rate": 7.722570381875418e-06, "loss": 0.0308, "step": 3459 }, { "epoch": 1.1554516613791952, "grad_norm": 0.357081990843535, "learning_rate": 7.720940202140698e-06, "loss": 0.0356, "step": 3460 }, { "epoch": 1.155785606946068, "grad_norm": 0.23549803019765664, "learning_rate": 7.71930961137592e-06, "loss": 0.0218, "step": 3461 }, { "epoch": 1.1561195525129404, "grad_norm": 0.3376045946718028, "learning_rate": 7.717678609827409e-06, "loss": 0.0281, "step": 3462 }, { "epoch": 1.156453498079813, "grad_norm": 0.4660996016445457, "learning_rate": 7.716047197741543e-06, "loss": 0.0367, "step": 3463 }, { "epoch": 1.1567874436466856, "grad_norm": 0.3289329792685201, "learning_rate": 7.714415375364768e-06, "loss": 0.0335, "step": 3464 }, { "epoch": 1.1571213892135581, "grad_norm": 0.313194622895452, "learning_rate": 7.712783142943588e-06, "loss": 0.0304, "step": 3465 }, { "epoch": 1.1574553347804308, "grad_norm": 0.31856894335048774, "learning_rate": 7.711150500724574e-06, "loss": 0.0314, "step": 3466 }, { "epoch": 1.1577892803473033, "grad_norm": 0.290210905224967, "learning_rate": 7.709517448954353e-06, "loss": 0.0212, "step": 3467 }, { "epoch": 1.158123225914176, "grad_norm": 0.3450403529308227, "learning_rate": 7.707883987879617e-06, "loss": 0.024, "step": 3468 }, { "epoch": 1.1584571714810485, "grad_norm": 0.2853233655464682, "learning_rate": 7.70625011774712e-06, "loss": 0.0204, "step": 3469 }, { "epoch": 1.1587911170479213, "grad_norm": 0.30596720190071924, "learning_rate": 7.70461583880368e-06, "loss": 0.0246, "step": 3470 }, { "epoch": 1.1591250626147938, "grad_norm": 0.45582478478363625, "learning_rate": 7.70298115129617e-06, "loss": 0.0315, "step": 3471 }, { "epoch": 1.1594590081816665, "grad_norm": 0.3343310583345285, "learning_rate": 7.701346055471533e-06, "loss": 0.0238, "step": 3472 }, { "epoch": 1.159792953748539, "grad_norm": 0.25549838242569206, "learning_rate": 7.699710551576763e-06, "loss": 0.017, "step": 3473 }, { "epoch": 1.1601268993154115, "grad_norm": 0.4784591140687851, "learning_rate": 7.69807463985893e-06, "loss": 0.0315, "step": 3474 }, { "epoch": 1.1604608448822842, "grad_norm": 0.253805038978094, "learning_rate": 7.696438320565152e-06, "loss": 0.0234, "step": 3475 }, { "epoch": 1.160794790449157, "grad_norm": 0.3239179659179292, "learning_rate": 7.694801593942615e-06, "loss": 0.0304, "step": 3476 }, { "epoch": 1.1611287360160294, "grad_norm": 0.3700450583545322, "learning_rate": 7.69316446023857e-06, "loss": 0.0347, "step": 3477 }, { "epoch": 1.161462681582902, "grad_norm": 0.4088219631304055, "learning_rate": 7.691526919700319e-06, "loss": 0.0237, "step": 3478 }, { "epoch": 1.1617966271497746, "grad_norm": 0.5225888845081335, "learning_rate": 7.689888972575237e-06, "loss": 0.0566, "step": 3479 }, { "epoch": 1.162130572716647, "grad_norm": 0.30268914331089336, "learning_rate": 7.688250619110752e-06, "loss": 0.0279, "step": 3480 }, { "epoch": 1.1624645182835198, "grad_norm": 0.33861445710195365, "learning_rate": 7.686611859554361e-06, "loss": 0.0362, "step": 3481 }, { "epoch": 1.1627984638503923, "grad_norm": 0.21615391449424057, "learning_rate": 7.684972694153612e-06, "loss": 0.0218, "step": 3482 }, { "epoch": 1.163132409417265, "grad_norm": 0.3213041342802179, "learning_rate": 7.683333123156122e-06, "loss": 0.0264, "step": 3483 }, { "epoch": 1.1634663549841375, "grad_norm": 0.2391705643336867, "learning_rate": 7.681693146809572e-06, "loss": 0.0216, "step": 3484 }, { "epoch": 1.1638003005510102, "grad_norm": 0.2327024190047155, "learning_rate": 7.680052765361693e-06, "loss": 0.0228, "step": 3485 }, { "epoch": 1.1641342461178827, "grad_norm": 0.466746356161113, "learning_rate": 7.678411979060289e-06, "loss": 0.0349, "step": 3486 }, { "epoch": 1.1644681916847555, "grad_norm": 0.35594369741459175, "learning_rate": 7.676770788153218e-06, "loss": 0.0327, "step": 3487 }, { "epoch": 1.164802137251628, "grad_norm": 0.4396354659182487, "learning_rate": 7.6751291928884e-06, "loss": 0.031, "step": 3488 }, { "epoch": 1.1651360828185007, "grad_norm": 0.2986162472432065, "learning_rate": 7.673487193513821e-06, "loss": 0.0244, "step": 3489 }, { "epoch": 1.1654700283853732, "grad_norm": 0.2855238553660112, "learning_rate": 7.671844790277522e-06, "loss": 0.0237, "step": 3490 }, { "epoch": 1.1658039739522459, "grad_norm": 0.3669523981505205, "learning_rate": 7.670201983427606e-06, "loss": 0.0318, "step": 3491 }, { "epoch": 1.1661379195191184, "grad_norm": 0.2802650217978861, "learning_rate": 7.66855877321224e-06, "loss": 0.0243, "step": 3492 }, { "epoch": 1.1664718650859909, "grad_norm": 0.5173781157672644, "learning_rate": 7.666915159879651e-06, "loss": 0.0296, "step": 3493 }, { "epoch": 1.1668058106528636, "grad_norm": 0.3631614765686001, "learning_rate": 7.665271143678125e-06, "loss": 0.0262, "step": 3494 }, { "epoch": 1.167139756219736, "grad_norm": 0.35272600101893303, "learning_rate": 7.66362672485601e-06, "loss": 0.0325, "step": 3495 }, { "epoch": 1.1674737017866088, "grad_norm": 0.2953204494126073, "learning_rate": 7.661981903661715e-06, "loss": 0.0225, "step": 3496 }, { "epoch": 1.1678076473534813, "grad_norm": 0.3676966213427377, "learning_rate": 7.66033668034371e-06, "loss": 0.0282, "step": 3497 }, { "epoch": 1.168141592920354, "grad_norm": 0.3675832189387499, "learning_rate": 7.658691055150524e-06, "loss": 0.0366, "step": 3498 }, { "epoch": 1.1684755384872265, "grad_norm": 0.32632280987510315, "learning_rate": 7.65704502833075e-06, "loss": 0.0269, "step": 3499 }, { "epoch": 1.1688094840540992, "grad_norm": 0.4123603700874419, "learning_rate": 7.655398600133037e-06, "loss": 0.0395, "step": 3500 }, { "epoch": 1.1691434296209717, "grad_norm": 0.34577078396490485, "learning_rate": 7.653751770806101e-06, "loss": 0.0321, "step": 3501 }, { "epoch": 1.1694773751878444, "grad_norm": 0.2937895703947491, "learning_rate": 7.652104540598712e-06, "loss": 0.0245, "step": 3502 }, { "epoch": 1.169811320754717, "grad_norm": 0.33622346719358, "learning_rate": 7.650456909759707e-06, "loss": 0.0198, "step": 3503 }, { "epoch": 1.1701452663215897, "grad_norm": 0.4467975494526781, "learning_rate": 7.648808878537976e-06, "loss": 0.0308, "step": 3504 }, { "epoch": 1.1704792118884622, "grad_norm": 0.33257231714219554, "learning_rate": 7.647160447182475e-06, "loss": 0.0355, "step": 3505 }, { "epoch": 1.1708131574553349, "grad_norm": 0.24774628720941608, "learning_rate": 7.645511615942218e-06, "loss": 0.0264, "step": 3506 }, { "epoch": 1.1711471030222074, "grad_norm": 0.2316830661797605, "learning_rate": 7.643862385066285e-06, "loss": 0.0232, "step": 3507 }, { "epoch": 1.1714810485890799, "grad_norm": 0.3364496062607349, "learning_rate": 7.642212754803804e-06, "loss": 0.0254, "step": 3508 }, { "epoch": 1.1718149941559526, "grad_norm": 0.3474891703591838, "learning_rate": 7.640562725403978e-06, "loss": 0.0347, "step": 3509 }, { "epoch": 1.1721489397228253, "grad_norm": 0.2908152664638895, "learning_rate": 7.638912297116061e-06, "loss": 0.0288, "step": 3510 }, { "epoch": 1.1724828852896978, "grad_norm": 0.25078594758561484, "learning_rate": 7.637261470189369e-06, "loss": 0.0203, "step": 3511 }, { "epoch": 1.1728168308565703, "grad_norm": 0.24150363668531105, "learning_rate": 7.635610244873277e-06, "loss": 0.0228, "step": 3512 }, { "epoch": 1.173150776423443, "grad_norm": 0.3553529908080216, "learning_rate": 7.633958621417226e-06, "loss": 0.0321, "step": 3513 }, { "epoch": 1.1734847219903155, "grad_norm": 0.2882204107122788, "learning_rate": 7.632306600070711e-06, "loss": 0.0276, "step": 3514 }, { "epoch": 1.1738186675571882, "grad_norm": 0.27175946355194863, "learning_rate": 7.63065418108329e-06, "loss": 0.0199, "step": 3515 }, { "epoch": 1.1741526131240607, "grad_norm": 0.29365182270148615, "learning_rate": 7.62900136470458e-06, "loss": 0.0247, "step": 3516 }, { "epoch": 1.1744865586909334, "grad_norm": 0.2794766476478305, "learning_rate": 7.627348151184257e-06, "loss": 0.0238, "step": 3517 }, { "epoch": 1.174820504257806, "grad_norm": 0.3659872340269811, "learning_rate": 7.625694540772062e-06, "loss": 0.0306, "step": 3518 }, { "epoch": 1.1751544498246786, "grad_norm": 0.35167227874267293, "learning_rate": 7.624040533717789e-06, "loss": 0.0285, "step": 3519 }, { "epoch": 1.1754883953915511, "grad_norm": 0.290174682025725, "learning_rate": 7.622386130271296e-06, "loss": 0.0251, "step": 3520 }, { "epoch": 1.1758223409584239, "grad_norm": 0.4219865137237148, "learning_rate": 7.620731330682501e-06, "loss": 0.0469, "step": 3521 }, { "epoch": 1.1761562865252964, "grad_norm": 0.5707703670899809, "learning_rate": 7.6190761352013795e-06, "loss": 0.0272, "step": 3522 }, { "epoch": 1.1764902320921689, "grad_norm": 0.327756162551458, "learning_rate": 7.61742054407797e-06, "loss": 0.0252, "step": 3523 }, { "epoch": 1.1768241776590416, "grad_norm": 0.3274956389443555, "learning_rate": 7.615764557562368e-06, "loss": 0.028, "step": 3524 }, { "epoch": 1.1771581232259143, "grad_norm": 0.4405481427142839, "learning_rate": 7.6141081759047305e-06, "loss": 0.0381, "step": 3525 }, { "epoch": 1.1774920687927868, "grad_norm": 0.3735302426547869, "learning_rate": 7.612451399355273e-06, "loss": 0.0304, "step": 3526 }, { "epoch": 1.1778260143596593, "grad_norm": 0.3824426145771473, "learning_rate": 7.610794228164271e-06, "loss": 0.0351, "step": 3527 }, { "epoch": 1.178159959926532, "grad_norm": 0.26595801059874885, "learning_rate": 7.60913666258206e-06, "loss": 0.0219, "step": 3528 }, { "epoch": 1.1784939054934045, "grad_norm": 0.24261111244989236, "learning_rate": 7.6074787028590325e-06, "loss": 0.0237, "step": 3529 }, { "epoch": 1.1788278510602772, "grad_norm": 0.27314138563421714, "learning_rate": 7.605820349245645e-06, "loss": 0.0266, "step": 3530 }, { "epoch": 1.1791617966271497, "grad_norm": 0.32357852051384983, "learning_rate": 7.6041616019924125e-06, "loss": 0.0225, "step": 3531 }, { "epoch": 1.1794957421940224, "grad_norm": 0.3636625856897844, "learning_rate": 7.602502461349907e-06, "loss": 0.0337, "step": 3532 }, { "epoch": 1.179829687760895, "grad_norm": 0.3435214508277399, "learning_rate": 7.600842927568761e-06, "loss": 0.0342, "step": 3533 }, { "epoch": 1.1801636333277676, "grad_norm": 0.3477842064588296, "learning_rate": 7.599183000899667e-06, "loss": 0.0244, "step": 3534 }, { "epoch": 1.1804975788946401, "grad_norm": 0.5292869411998767, "learning_rate": 7.597522681593375e-06, "loss": 0.0342, "step": 3535 }, { "epoch": 1.1808315244615128, "grad_norm": 0.3037457490616166, "learning_rate": 7.595861969900698e-06, "loss": 0.0284, "step": 3536 }, { "epoch": 1.1811654700283853, "grad_norm": 0.4447622076079352, "learning_rate": 7.5942008660725065e-06, "loss": 0.0306, "step": 3537 }, { "epoch": 1.181499415595258, "grad_norm": 0.265649322156267, "learning_rate": 7.5925393703597265e-06, "loss": 0.0236, "step": 3538 }, { "epoch": 1.1818333611621306, "grad_norm": 0.23806273170509104, "learning_rate": 7.59087748301335e-06, "loss": 0.0262, "step": 3539 }, { "epoch": 1.1821673067290033, "grad_norm": 0.2513056552938845, "learning_rate": 7.5892152042844224e-06, "loss": 0.0199, "step": 3540 }, { "epoch": 1.1825012522958758, "grad_norm": 0.3183336268239464, "learning_rate": 7.58755253442405e-06, "loss": 0.0263, "step": 3541 }, { "epoch": 1.1828351978627483, "grad_norm": 0.3071757110631517, "learning_rate": 7.585889473683401e-06, "loss": 0.0255, "step": 3542 }, { "epoch": 1.183169143429621, "grad_norm": 0.27160021026868736, "learning_rate": 7.5842260223137e-06, "loss": 0.0173, "step": 3543 }, { "epoch": 1.1835030889964935, "grad_norm": 0.24386094830503965, "learning_rate": 7.5825621805662285e-06, "loss": 0.0241, "step": 3544 }, { "epoch": 1.1838370345633662, "grad_norm": 0.47224786416194436, "learning_rate": 7.580897948692332e-06, "loss": 0.0303, "step": 3545 }, { "epoch": 1.1841709801302387, "grad_norm": 0.2639401565130608, "learning_rate": 7.579233326943412e-06, "loss": 0.0216, "step": 3546 }, { "epoch": 1.1845049256971114, "grad_norm": 0.29087331734446137, "learning_rate": 7.577568315570925e-06, "loss": 0.026, "step": 3547 }, { "epoch": 1.184838871263984, "grad_norm": 0.6776129769735288, "learning_rate": 7.5759029148263975e-06, "loss": 0.0461, "step": 3548 }, { "epoch": 1.1851728168308566, "grad_norm": 0.38653065090861605, "learning_rate": 7.574237124961403e-06, "loss": 0.0327, "step": 3549 }, { "epoch": 1.1855067623977291, "grad_norm": 0.264678448438959, "learning_rate": 7.572570946227582e-06, "loss": 0.0289, "step": 3550 }, { "epoch": 1.1858407079646018, "grad_norm": 0.530675620958939, "learning_rate": 7.570904378876627e-06, "loss": 0.0349, "step": 3551 }, { "epoch": 1.1861746535314743, "grad_norm": 0.3488264650847778, "learning_rate": 7.569237423160294e-06, "loss": 0.0252, "step": 3552 }, { "epoch": 1.186508599098347, "grad_norm": 0.4583056075333567, "learning_rate": 7.567570079330395e-06, "loss": 0.0457, "step": 3553 }, { "epoch": 1.1868425446652195, "grad_norm": 0.5023824187574232, "learning_rate": 7.565902347638806e-06, "loss": 0.0385, "step": 3554 }, { "epoch": 1.1871764902320923, "grad_norm": 0.3792435649151429, "learning_rate": 7.564234228337452e-06, "loss": 0.0284, "step": 3555 }, { "epoch": 1.1875104357989648, "grad_norm": 0.2633029612235313, "learning_rate": 7.5625657216783276e-06, "loss": 0.0257, "step": 3556 }, { "epoch": 1.1878443813658373, "grad_norm": 0.3584621687245536, "learning_rate": 7.560896827913478e-06, "loss": 0.0293, "step": 3557 }, { "epoch": 1.18817832693271, "grad_norm": 0.2925668913663093, "learning_rate": 7.559227547295007e-06, "loss": 0.0242, "step": 3558 }, { "epoch": 1.1885122724995827, "grad_norm": 0.23377559506332798, "learning_rate": 7.557557880075082e-06, "loss": 0.0159, "step": 3559 }, { "epoch": 1.1888462180664552, "grad_norm": 0.31876519927547686, "learning_rate": 7.555887826505926e-06, "loss": 0.0235, "step": 3560 }, { "epoch": 1.1891801636333277, "grad_norm": 0.4288110041407026, "learning_rate": 7.554217386839817e-06, "loss": 0.0367, "step": 3561 }, { "epoch": 1.1895141092002004, "grad_norm": 0.4036764539007261, "learning_rate": 7.552546561329097e-06, "loss": 0.0243, "step": 3562 }, { "epoch": 1.189848054767073, "grad_norm": 0.3111499091936664, "learning_rate": 7.550875350226166e-06, "loss": 0.0298, "step": 3563 }, { "epoch": 1.1901820003339456, "grad_norm": 0.2472740508116731, "learning_rate": 7.549203753783475e-06, "loss": 0.0213, "step": 3564 }, { "epoch": 1.190515945900818, "grad_norm": 0.46078982620520526, "learning_rate": 7.547531772253542e-06, "loss": 0.0373, "step": 3565 }, { "epoch": 1.1908498914676908, "grad_norm": 0.3765683569247024, "learning_rate": 7.54585940588894e-06, "loss": 0.0405, "step": 3566 }, { "epoch": 1.1911838370345633, "grad_norm": 0.2646796997015321, "learning_rate": 7.544186654942296e-06, "loss": 0.0259, "step": 3567 }, { "epoch": 1.191517782601436, "grad_norm": 0.2364250317630789, "learning_rate": 7.542513519666302e-06, "loss": 0.0176, "step": 3568 }, { "epoch": 1.1918517281683085, "grad_norm": 0.3678649976098643, "learning_rate": 7.540840000313705e-06, "loss": 0.035, "step": 3569 }, { "epoch": 1.1921856737351813, "grad_norm": 0.22728263615909344, "learning_rate": 7.539166097137306e-06, "loss": 0.0189, "step": 3570 }, { "epoch": 1.1925196193020537, "grad_norm": 0.37395129436814156, "learning_rate": 7.537491810389972e-06, "loss": 0.0359, "step": 3571 }, { "epoch": 1.1928535648689262, "grad_norm": 0.42274723036577955, "learning_rate": 7.535817140324622e-06, "loss": 0.0344, "step": 3572 }, { "epoch": 1.193187510435799, "grad_norm": 0.3830021184430302, "learning_rate": 7.534142087194234e-06, "loss": 0.0188, "step": 3573 }, { "epoch": 1.1935214560026717, "grad_norm": 0.281046038536792, "learning_rate": 7.532466651251846e-06, "loss": 0.0293, "step": 3574 }, { "epoch": 1.1938554015695442, "grad_norm": 0.3438638328473842, "learning_rate": 7.5307908327505506e-06, "loss": 0.0272, "step": 3575 }, { "epoch": 1.1941893471364167, "grad_norm": 0.2938006118359201, "learning_rate": 7.529114631943501e-06, "loss": 0.02, "step": 3576 }, { "epoch": 1.1945232927032894, "grad_norm": 0.3837708613941296, "learning_rate": 7.527438049083908e-06, "loss": 0.0281, "step": 3577 }, { "epoch": 1.1948572382701619, "grad_norm": 0.42985674766269283, "learning_rate": 7.5257610844250385e-06, "loss": 0.0451, "step": 3578 }, { "epoch": 1.1951911838370346, "grad_norm": 0.4765094879000336, "learning_rate": 7.524083738220214e-06, "loss": 0.0438, "step": 3579 }, { "epoch": 1.195525129403907, "grad_norm": 0.3391737775316126, "learning_rate": 7.522406010722824e-06, "loss": 0.0326, "step": 3580 }, { "epoch": 1.1958590749707798, "grad_norm": 0.4252817384598663, "learning_rate": 7.5207279021863045e-06, "loss": 0.0301, "step": 3581 }, { "epoch": 1.1961930205376523, "grad_norm": 0.3537507345301453, "learning_rate": 7.5190494128641545e-06, "loss": 0.0264, "step": 3582 }, { "epoch": 1.196526966104525, "grad_norm": 0.28006465042221873, "learning_rate": 7.5173705430099295e-06, "loss": 0.0217, "step": 3583 }, { "epoch": 1.1968609116713975, "grad_norm": 0.31595347079765157, "learning_rate": 7.515691292877243e-06, "loss": 0.0351, "step": 3584 }, { "epoch": 1.1971948572382702, "grad_norm": 0.3410057754683141, "learning_rate": 7.514011662719766e-06, "loss": 0.031, "step": 3585 }, { "epoch": 1.1975288028051427, "grad_norm": 0.4029779175853151, "learning_rate": 7.512331652791226e-06, "loss": 0.0477, "step": 3586 }, { "epoch": 1.1978627483720155, "grad_norm": 0.319223776904647, "learning_rate": 7.510651263345408e-06, "loss": 0.0328, "step": 3587 }, { "epoch": 1.198196693938888, "grad_norm": 0.2832307786697477, "learning_rate": 7.508970494636154e-06, "loss": 0.0241, "step": 3588 }, { "epoch": 1.1985306395057607, "grad_norm": 0.3140097991487349, "learning_rate": 7.507289346917366e-06, "loss": 0.0205, "step": 3589 }, { "epoch": 1.1988645850726332, "grad_norm": 0.2893843548475163, "learning_rate": 7.505607820442997e-06, "loss": 0.0237, "step": 3590 }, { "epoch": 1.1991985306395057, "grad_norm": 0.43556983943009747, "learning_rate": 7.503925915467066e-06, "loss": 0.033, "step": 3591 }, { "epoch": 1.1995324762063784, "grad_norm": 0.3413309881274883, "learning_rate": 7.502243632243645e-06, "loss": 0.0378, "step": 3592 }, { "epoch": 1.1998664217732509, "grad_norm": 0.2820857935235703, "learning_rate": 7.500560971026856e-06, "loss": 0.0314, "step": 3593 }, { "epoch": 1.2002003673401236, "grad_norm": 0.34793538336750207, "learning_rate": 7.498877932070892e-06, "loss": 0.0307, "step": 3594 }, { "epoch": 1.200534312906996, "grad_norm": 0.33430281958249236, "learning_rate": 7.497194515629992e-06, "loss": 0.0296, "step": 3595 }, { "epoch": 1.2008682584738688, "grad_norm": 0.33080673886716166, "learning_rate": 7.4955107219584575e-06, "loss": 0.0269, "step": 3596 }, { "epoch": 1.2012022040407413, "grad_norm": 0.3219894335146745, "learning_rate": 7.493826551310645e-06, "loss": 0.0255, "step": 3597 }, { "epoch": 1.201536149607614, "grad_norm": 0.3366922056344637, "learning_rate": 7.492142003940966e-06, "loss": 0.0302, "step": 3598 }, { "epoch": 1.2018700951744865, "grad_norm": 0.2769343236942904, "learning_rate": 7.490457080103895e-06, "loss": 0.0258, "step": 3599 }, { "epoch": 1.2022040407413592, "grad_norm": 0.2959540008543708, "learning_rate": 7.4887717800539584e-06, "loss": 0.0261, "step": 3600 }, { "epoch": 1.2025379863082317, "grad_norm": 0.28120309411333794, "learning_rate": 7.48708610404574e-06, "loss": 0.021, "step": 3601 }, { "epoch": 1.2028719318751044, "grad_norm": 0.3780547262738609, "learning_rate": 7.48540005233388e-06, "loss": 0.0297, "step": 3602 }, { "epoch": 1.203205877441977, "grad_norm": 0.37150191881556194, "learning_rate": 7.483713625173078e-06, "loss": 0.0218, "step": 3603 }, { "epoch": 1.2035398230088497, "grad_norm": 0.2728269051670745, "learning_rate": 7.482026822818088e-06, "loss": 0.0199, "step": 3604 }, { "epoch": 1.2038737685757221, "grad_norm": 0.4121363752550122, "learning_rate": 7.480339645523721e-06, "loss": 0.0345, "step": 3605 }, { "epoch": 1.2042077141425946, "grad_norm": 0.3126177722679711, "learning_rate": 7.478652093544846e-06, "loss": 0.0237, "step": 3606 }, { "epoch": 1.2045416597094674, "grad_norm": 0.28129377355025, "learning_rate": 7.476964167136388e-06, "loss": 0.0235, "step": 3607 }, { "epoch": 1.20487560527634, "grad_norm": 0.30910687217761773, "learning_rate": 7.475275866553326e-06, "loss": 0.0277, "step": 3608 }, { "epoch": 1.2052095508432126, "grad_norm": 0.5336132934124976, "learning_rate": 7.473587192050698e-06, "loss": 0.0323, "step": 3609 }, { "epoch": 1.205543496410085, "grad_norm": 0.26106460006122423, "learning_rate": 7.471898143883601e-06, "loss": 0.0197, "step": 3610 }, { "epoch": 1.2058774419769578, "grad_norm": 0.30317693876187, "learning_rate": 7.470208722307183e-06, "loss": 0.0253, "step": 3611 }, { "epoch": 1.2062113875438303, "grad_norm": 0.25193483447797743, "learning_rate": 7.468518927576653e-06, "loss": 0.0215, "step": 3612 }, { "epoch": 1.206545333110703, "grad_norm": 0.263215237487763, "learning_rate": 7.466828759947271e-06, "loss": 0.0227, "step": 3613 }, { "epoch": 1.2068792786775755, "grad_norm": 0.29627708566072053, "learning_rate": 7.465138219674359e-06, "loss": 0.0288, "step": 3614 }, { "epoch": 1.2072132242444482, "grad_norm": 0.3509762238645572, "learning_rate": 7.463447307013294e-06, "loss": 0.0331, "step": 3615 }, { "epoch": 1.2075471698113207, "grad_norm": 0.35838158455769176, "learning_rate": 7.461756022219507e-06, "loss": 0.0219, "step": 3616 }, { "epoch": 1.2078811153781934, "grad_norm": 0.5607378045821658, "learning_rate": 7.460064365548486e-06, "loss": 0.0337, "step": 3617 }, { "epoch": 1.208215060945066, "grad_norm": 0.2473096236923032, "learning_rate": 7.458372337255777e-06, "loss": 0.0245, "step": 3618 }, { "epoch": 1.2085490065119386, "grad_norm": 0.2791939675004245, "learning_rate": 7.45667993759698e-06, "loss": 0.0238, "step": 3619 }, { "epoch": 1.2088829520788111, "grad_norm": 0.32236033089419347, "learning_rate": 7.454987166827751e-06, "loss": 0.033, "step": 3620 }, { "epoch": 1.2092168976456836, "grad_norm": 0.34359333802445224, "learning_rate": 7.4532940252038055e-06, "loss": 0.0242, "step": 3621 }, { "epoch": 1.2095508432125563, "grad_norm": 0.28346174168334615, "learning_rate": 7.45160051298091e-06, "loss": 0.0219, "step": 3622 }, { "epoch": 1.209884788779429, "grad_norm": 0.3394500848050175, "learning_rate": 7.4499066304148904e-06, "loss": 0.0358, "step": 3623 }, { "epoch": 1.2102187343463016, "grad_norm": 0.28727684362348355, "learning_rate": 7.448212377761628e-06, "loss": 0.0236, "step": 3624 }, { "epoch": 1.210552679913174, "grad_norm": 0.23030080548127382, "learning_rate": 7.4465177552770585e-06, "loss": 0.0211, "step": 3625 }, { "epoch": 1.2108866254800468, "grad_norm": 0.3752054423570739, "learning_rate": 7.444822763217174e-06, "loss": 0.0312, "step": 3626 }, { "epoch": 1.2112205710469193, "grad_norm": 0.3880340810105748, "learning_rate": 7.443127401838026e-06, "loss": 0.0304, "step": 3627 }, { "epoch": 1.211554516613792, "grad_norm": 0.3593823234963432, "learning_rate": 7.441431671395717e-06, "loss": 0.0252, "step": 3628 }, { "epoch": 1.2118884621806645, "grad_norm": 0.28294692967676155, "learning_rate": 7.439735572146407e-06, "loss": 0.0224, "step": 3629 }, { "epoch": 1.2122224077475372, "grad_norm": 0.4649483517510499, "learning_rate": 7.438039104346312e-06, "loss": 0.0338, "step": 3630 }, { "epoch": 1.2125563533144097, "grad_norm": 0.28148285679186763, "learning_rate": 7.436342268251702e-06, "loss": 0.026, "step": 3631 }, { "epoch": 1.2128902988812824, "grad_norm": 0.3519283974511048, "learning_rate": 7.434645064118906e-06, "loss": 0.0327, "step": 3632 }, { "epoch": 1.213224244448155, "grad_norm": 0.2553342539182069, "learning_rate": 7.432947492204308e-06, "loss": 0.0185, "step": 3633 }, { "epoch": 1.2135581900150276, "grad_norm": 0.3141381461297296, "learning_rate": 7.431249552764342e-06, "loss": 0.0314, "step": 3634 }, { "epoch": 1.2138921355819001, "grad_norm": 0.3525165310268248, "learning_rate": 7.429551246055504e-06, "loss": 0.0335, "step": 3635 }, { "epoch": 1.2142260811487728, "grad_norm": 0.3989324630739906, "learning_rate": 7.427852572334344e-06, "loss": 0.0335, "step": 3636 }, { "epoch": 1.2145600267156453, "grad_norm": 0.3337331951530839, "learning_rate": 7.426153531857466e-06, "loss": 0.0324, "step": 3637 }, { "epoch": 1.214893972282518, "grad_norm": 0.20674712407248522, "learning_rate": 7.424454124881531e-06, "loss": 0.0189, "step": 3638 }, { "epoch": 1.2152279178493905, "grad_norm": 0.3204365065385207, "learning_rate": 7.422754351663252e-06, "loss": 0.032, "step": 3639 }, { "epoch": 1.215561863416263, "grad_norm": 0.21512092753837306, "learning_rate": 7.4210542124594e-06, "loss": 0.0219, "step": 3640 }, { "epoch": 1.2158958089831358, "grad_norm": 0.31341545847677554, "learning_rate": 7.419353707526804e-06, "loss": 0.0261, "step": 3641 }, { "epoch": 1.2162297545500083, "grad_norm": 0.2889569310745874, "learning_rate": 7.417652837122345e-06, "loss": 0.0245, "step": 3642 }, { "epoch": 1.216563700116881, "grad_norm": 0.2902261356520683, "learning_rate": 7.4159516015029545e-06, "loss": 0.0261, "step": 3643 }, { "epoch": 1.2168976456837535, "grad_norm": 0.31994923991612734, "learning_rate": 7.414250000925629e-06, "loss": 0.0297, "step": 3644 }, { "epoch": 1.2172315912506262, "grad_norm": 0.3074415733624934, "learning_rate": 7.412548035647416e-06, "loss": 0.0291, "step": 3645 }, { "epoch": 1.2175655368174987, "grad_norm": 0.2600369049394611, "learning_rate": 7.4108457059254135e-06, "loss": 0.0271, "step": 3646 }, { "epoch": 1.2178994823843714, "grad_norm": 0.3618982584590955, "learning_rate": 7.40914301201678e-06, "loss": 0.0349, "step": 3647 }, { "epoch": 1.218233427951244, "grad_norm": 0.328663757492403, "learning_rate": 7.407439954178729e-06, "loss": 0.0312, "step": 3648 }, { "epoch": 1.2185673735181166, "grad_norm": 0.24598683290237383, "learning_rate": 7.405736532668525e-06, "loss": 0.019, "step": 3649 }, { "epoch": 1.218901319084989, "grad_norm": 0.30976253960270267, "learning_rate": 7.4040327477434926e-06, "loss": 0.0224, "step": 3650 }, { "epoch": 1.2192352646518618, "grad_norm": 0.32080993247481554, "learning_rate": 7.402328599661006e-06, "loss": 0.0226, "step": 3651 }, { "epoch": 1.2195692102187343, "grad_norm": 0.37200366630708953, "learning_rate": 7.400624088678497e-06, "loss": 0.0308, "step": 3652 }, { "epoch": 1.219903155785607, "grad_norm": 0.27923380310336365, "learning_rate": 7.398919215053455e-06, "loss": 0.0207, "step": 3653 }, { "epoch": 1.2202371013524795, "grad_norm": 0.2985047236029938, "learning_rate": 7.397213979043418e-06, "loss": 0.028, "step": 3654 }, { "epoch": 1.220571046919352, "grad_norm": 0.34394570071340674, "learning_rate": 7.395508380905983e-06, "loss": 0.0231, "step": 3655 }, { "epoch": 1.2209049924862247, "grad_norm": 0.30248672754705974, "learning_rate": 7.393802420898801e-06, "loss": 0.0242, "step": 3656 }, { "epoch": 1.2212389380530975, "grad_norm": 0.2503042875788558, "learning_rate": 7.392096099279579e-06, "loss": 0.0202, "step": 3657 }, { "epoch": 1.22157288361997, "grad_norm": 0.3569871645662349, "learning_rate": 7.390389416306073e-06, "loss": 0.0257, "step": 3658 }, { "epoch": 1.2219068291868425, "grad_norm": 0.25009609421324414, "learning_rate": 7.3886823722361e-06, "loss": 0.0251, "step": 3659 }, { "epoch": 1.2222407747537152, "grad_norm": 0.24203243906148403, "learning_rate": 7.386974967327531e-06, "loss": 0.0222, "step": 3660 }, { "epoch": 1.2225747203205877, "grad_norm": 0.27631277036377877, "learning_rate": 7.385267201838284e-06, "loss": 0.0182, "step": 3661 }, { "epoch": 1.2229086658874604, "grad_norm": 0.46240374040892274, "learning_rate": 7.383559076026343e-06, "loss": 0.0385, "step": 3662 }, { "epoch": 1.2232426114543329, "grad_norm": 0.2415788281018439, "learning_rate": 7.381850590149737e-06, "loss": 0.02, "step": 3663 }, { "epoch": 1.2235765570212056, "grad_norm": 0.34618615305852357, "learning_rate": 7.380141744466555e-06, "loss": 0.0246, "step": 3664 }, { "epoch": 1.223910502588078, "grad_norm": 0.28776979877352327, "learning_rate": 7.378432539234936e-06, "loss": 0.0232, "step": 3665 }, { "epoch": 1.2242444481549508, "grad_norm": 0.32124672291576994, "learning_rate": 7.376722974713078e-06, "loss": 0.0191, "step": 3666 }, { "epoch": 1.2245783937218233, "grad_norm": 0.361088326471107, "learning_rate": 7.3750130511592275e-06, "loss": 0.0298, "step": 3667 }, { "epoch": 1.224912339288696, "grad_norm": 0.41441701315501017, "learning_rate": 7.373302768831694e-06, "loss": 0.0395, "step": 3668 }, { "epoch": 1.2252462848555685, "grad_norm": 0.2768492371418646, "learning_rate": 7.371592127988831e-06, "loss": 0.0302, "step": 3669 }, { "epoch": 1.225580230422441, "grad_norm": 0.2773369556998907, "learning_rate": 7.369881128889052e-06, "loss": 0.0171, "step": 3670 }, { "epoch": 1.2259141759893137, "grad_norm": 0.2766091810132415, "learning_rate": 7.368169771790825e-06, "loss": 0.0234, "step": 3671 }, { "epoch": 1.2262481215561865, "grad_norm": 0.3764666912304945, "learning_rate": 7.366458056952668e-06, "loss": 0.0275, "step": 3672 }, { "epoch": 1.226582067123059, "grad_norm": 0.25978182887351225, "learning_rate": 7.36474598463316e-06, "loss": 0.0206, "step": 3673 }, { "epoch": 1.2269160126899314, "grad_norm": 0.28853253691773284, "learning_rate": 7.363033555090925e-06, "loss": 0.022, "step": 3674 }, { "epoch": 1.2272499582568042, "grad_norm": 0.3224334967181729, "learning_rate": 7.361320768584648e-06, "loss": 0.0247, "step": 3675 }, { "epoch": 1.2275839038236767, "grad_norm": 0.38351597041853136, "learning_rate": 7.359607625373065e-06, "loss": 0.0293, "step": 3676 }, { "epoch": 1.2279178493905494, "grad_norm": 0.26102522211177076, "learning_rate": 7.357894125714967e-06, "loss": 0.0229, "step": 3677 }, { "epoch": 1.2282517949574219, "grad_norm": 0.2969561641140478, "learning_rate": 7.3561802698691976e-06, "loss": 0.0271, "step": 3678 }, { "epoch": 1.2285857405242946, "grad_norm": 0.30718238258239033, "learning_rate": 7.354466058094656e-06, "loss": 0.0296, "step": 3679 }, { "epoch": 1.228919686091167, "grad_norm": 0.41127526819441096, "learning_rate": 7.352751490650294e-06, "loss": 0.041, "step": 3680 }, { "epoch": 1.2292536316580398, "grad_norm": 0.34722166038411595, "learning_rate": 7.3510365677951155e-06, "loss": 0.0231, "step": 3681 }, { "epoch": 1.2295875772249123, "grad_norm": 0.2969655574350491, "learning_rate": 7.349321289788181e-06, "loss": 0.0229, "step": 3682 }, { "epoch": 1.229921522791785, "grad_norm": 0.36054722563729874, "learning_rate": 7.3476056568886036e-06, "loss": 0.0286, "step": 3683 }, { "epoch": 1.2302554683586575, "grad_norm": 0.38218968715418694, "learning_rate": 7.34588966935555e-06, "loss": 0.0317, "step": 3684 }, { "epoch": 1.2305894139255302, "grad_norm": 0.35561484948591066, "learning_rate": 7.344173327448238e-06, "loss": 0.0267, "step": 3685 }, { "epoch": 1.2309233594924027, "grad_norm": 0.3315702141284436, "learning_rate": 7.342456631425945e-06, "loss": 0.0385, "step": 3686 }, { "epoch": 1.2312573050592754, "grad_norm": 0.5917973351022848, "learning_rate": 7.340739581547996e-06, "loss": 0.0338, "step": 3687 }, { "epoch": 1.231591250626148, "grad_norm": 0.36875324150377503, "learning_rate": 7.339022178073772e-06, "loss": 0.0227, "step": 3688 }, { "epoch": 1.2319251961930204, "grad_norm": 0.36599153753174596, "learning_rate": 7.337304421262706e-06, "loss": 0.0271, "step": 3689 }, { "epoch": 1.2322591417598932, "grad_norm": 0.28380159214160994, "learning_rate": 7.335586311374287e-06, "loss": 0.0294, "step": 3690 }, { "epoch": 1.2325930873267656, "grad_norm": 0.3715206835350677, "learning_rate": 7.3338678486680545e-06, "loss": 0.0377, "step": 3691 }, { "epoch": 1.2329270328936384, "grad_norm": 0.3889552947049423, "learning_rate": 7.3321490334036035e-06, "loss": 0.0379, "step": 3692 }, { "epoch": 1.2332609784605109, "grad_norm": 0.43657615137252603, "learning_rate": 7.3304298658405815e-06, "loss": 0.0374, "step": 3693 }, { "epoch": 1.2335949240273836, "grad_norm": 0.4115973135062599, "learning_rate": 7.328710346238688e-06, "loss": 0.0285, "step": 3694 }, { "epoch": 1.233928869594256, "grad_norm": 0.3322707553589683, "learning_rate": 7.326990474857676e-06, "loss": 0.0161, "step": 3695 }, { "epoch": 1.2342628151611288, "grad_norm": 0.26815057066610914, "learning_rate": 7.3252702519573545e-06, "loss": 0.0207, "step": 3696 }, { "epoch": 1.2345967607280013, "grad_norm": 0.41046045045124907, "learning_rate": 7.323549677797582e-06, "loss": 0.0268, "step": 3697 }, { "epoch": 1.234930706294874, "grad_norm": 0.2957086659234872, "learning_rate": 7.3218287526382716e-06, "loss": 0.0279, "step": 3698 }, { "epoch": 1.2352646518617465, "grad_norm": 0.35141890241321566, "learning_rate": 7.320107476739389e-06, "loss": 0.0353, "step": 3699 }, { "epoch": 1.2355985974286192, "grad_norm": 0.3318220269094786, "learning_rate": 7.318385850360954e-06, "loss": 0.0291, "step": 3700 }, { "epoch": 1.2359325429954917, "grad_norm": 0.4101419387384786, "learning_rate": 7.316663873763039e-06, "loss": 0.0257, "step": 3701 }, { "epoch": 1.2362664885623644, "grad_norm": 0.3066255629032018, "learning_rate": 7.314941547205767e-06, "loss": 0.0258, "step": 3702 }, { "epoch": 1.236600434129237, "grad_norm": 0.27561284004815184, "learning_rate": 7.313218870949317e-06, "loss": 0.0236, "step": 3703 }, { "epoch": 1.2369343796961094, "grad_norm": 0.28450644260498353, "learning_rate": 7.31149584525392e-06, "loss": 0.0265, "step": 3704 }, { "epoch": 1.2372683252629821, "grad_norm": 0.2990032445567785, "learning_rate": 7.309772470379856e-06, "loss": 0.03, "step": 3705 }, { "epoch": 1.2376022708298549, "grad_norm": 0.3054920377427087, "learning_rate": 7.308048746587466e-06, "loss": 0.0226, "step": 3706 }, { "epoch": 1.2379362163967274, "grad_norm": 0.31201169241005794, "learning_rate": 7.3063246741371365e-06, "loss": 0.0216, "step": 3707 }, { "epoch": 1.2382701619635998, "grad_norm": 0.389669345428009, "learning_rate": 7.304600253289308e-06, "loss": 0.0323, "step": 3708 }, { "epoch": 1.2386041075304726, "grad_norm": 0.26737606664063485, "learning_rate": 7.302875484304476e-06, "loss": 0.0353, "step": 3709 }, { "epoch": 1.238938053097345, "grad_norm": 0.3235634216216689, "learning_rate": 7.301150367443186e-06, "loss": 0.0283, "step": 3710 }, { "epoch": 1.2392719986642178, "grad_norm": 0.33968239674236567, "learning_rate": 7.299424902966039e-06, "loss": 0.0314, "step": 3711 }, { "epoch": 1.2396059442310903, "grad_norm": 0.24288643222976627, "learning_rate": 7.297699091133685e-06, "loss": 0.0251, "step": 3712 }, { "epoch": 1.239939889797963, "grad_norm": 0.2603087747863267, "learning_rate": 7.295972932206827e-06, "loss": 0.0225, "step": 3713 }, { "epoch": 1.2402738353648355, "grad_norm": 0.3163294158642131, "learning_rate": 7.2942464264462255e-06, "loss": 0.0244, "step": 3714 }, { "epoch": 1.2406077809317082, "grad_norm": 0.299362756794637, "learning_rate": 7.292519574112688e-06, "loss": 0.0277, "step": 3715 }, { "epoch": 1.2409417264985807, "grad_norm": 0.3554647010376918, "learning_rate": 7.290792375467074e-06, "loss": 0.0294, "step": 3716 }, { "epoch": 1.2412756720654534, "grad_norm": 0.3630876067157386, "learning_rate": 7.2890648307702985e-06, "loss": 0.0349, "step": 3717 }, { "epoch": 1.241609617632326, "grad_norm": 0.2755333053761441, "learning_rate": 7.287336940283327e-06, "loss": 0.0256, "step": 3718 }, { "epoch": 1.2419435631991984, "grad_norm": 0.351516780352397, "learning_rate": 7.28560870426718e-06, "loss": 0.0319, "step": 3719 }, { "epoch": 1.2422775087660711, "grad_norm": 0.3682681590002886, "learning_rate": 7.2838801229829245e-06, "loss": 0.0291, "step": 3720 }, { "epoch": 1.2426114543329438, "grad_norm": 0.33590142023212194, "learning_rate": 7.2821511966916845e-06, "loss": 0.023, "step": 3721 }, { "epoch": 1.2429453998998163, "grad_norm": 0.26643045835727613, "learning_rate": 7.280421925654635e-06, "loss": 0.0214, "step": 3722 }, { "epoch": 1.2432793454666888, "grad_norm": 0.2776713771622553, "learning_rate": 7.278692310133003e-06, "loss": 0.0249, "step": 3723 }, { "epoch": 1.2436132910335616, "grad_norm": 0.3039900578868121, "learning_rate": 7.276962350388067e-06, "loss": 0.039, "step": 3724 }, { "epoch": 1.243947236600434, "grad_norm": 0.3580496006561448, "learning_rate": 7.275232046681157e-06, "loss": 0.0294, "step": 3725 }, { "epoch": 1.2442811821673068, "grad_norm": 0.3491025317337139, "learning_rate": 7.273501399273656e-06, "loss": 0.0342, "step": 3726 }, { "epoch": 1.2446151277341793, "grad_norm": 0.33509533968101746, "learning_rate": 7.271770408427e-06, "loss": 0.0341, "step": 3727 }, { "epoch": 1.244949073301052, "grad_norm": 0.46171379206470553, "learning_rate": 7.2700390744026735e-06, "loss": 0.0333, "step": 3728 }, { "epoch": 1.2452830188679245, "grad_norm": 0.36499803123861596, "learning_rate": 7.2683073974622165e-06, "loss": 0.021, "step": 3729 }, { "epoch": 1.2456169644347972, "grad_norm": 0.48052357895113695, "learning_rate": 7.26657537786722e-06, "loss": 0.0369, "step": 3730 }, { "epoch": 1.2459509100016697, "grad_norm": 0.32479066646778987, "learning_rate": 7.264843015879321e-06, "loss": 0.0188, "step": 3731 }, { "epoch": 1.2462848555685424, "grad_norm": 0.5374273644484915, "learning_rate": 7.263110311760221e-06, "loss": 0.0341, "step": 3732 }, { "epoch": 1.246618801135415, "grad_norm": 0.3937749073326078, "learning_rate": 7.2613772657716585e-06, "loss": 0.0267, "step": 3733 }, { "epoch": 1.2469527467022876, "grad_norm": 0.339198762168034, "learning_rate": 7.259643878175434e-06, "loss": 0.0432, "step": 3734 }, { "epoch": 1.2472866922691601, "grad_norm": 0.3832528094853473, "learning_rate": 7.2579101492333956e-06, "loss": 0.0332, "step": 3735 }, { "epoch": 1.2476206378360328, "grad_norm": 0.3728492101697254, "learning_rate": 7.256176079207442e-06, "loss": 0.034, "step": 3736 }, { "epoch": 1.2479545834029053, "grad_norm": 0.36633795428673205, "learning_rate": 7.254441668359527e-06, "loss": 0.0252, "step": 3737 }, { "epoch": 1.2482885289697778, "grad_norm": 0.48856680714255346, "learning_rate": 7.252706916951653e-06, "loss": 0.0347, "step": 3738 }, { "epoch": 1.2486224745366505, "grad_norm": 0.3314686964943417, "learning_rate": 7.250971825245874e-06, "loss": 0.0312, "step": 3739 }, { "epoch": 1.248956420103523, "grad_norm": 0.2533430483215571, "learning_rate": 7.249236393504296e-06, "loss": 0.0219, "step": 3740 }, { "epoch": 1.2492903656703958, "grad_norm": 0.31333906688251445, "learning_rate": 7.247500621989078e-06, "loss": 0.0306, "step": 3741 }, { "epoch": 1.2496243112372682, "grad_norm": 0.374680220817901, "learning_rate": 7.245764510962426e-06, "loss": 0.0425, "step": 3742 }, { "epoch": 1.249958256804141, "grad_norm": 0.3897165493108914, "learning_rate": 7.244028060686603e-06, "loss": 0.0266, "step": 3743 }, { "epoch": 1.2502922023710135, "grad_norm": 0.32147299764024756, "learning_rate": 7.242291271423919e-06, "loss": 0.0236, "step": 3744 }, { "epoch": 1.2506261479378862, "grad_norm": 0.34935109492629934, "learning_rate": 7.240554143436735e-06, "loss": 0.0279, "step": 3745 }, { "epoch": 1.2509600935047587, "grad_norm": 0.4583164819519443, "learning_rate": 7.238816676987467e-06, "loss": 0.0305, "step": 3746 }, { "epoch": 1.2512940390716314, "grad_norm": 0.44515078128846375, "learning_rate": 7.237078872338579e-06, "loss": 0.0332, "step": 3747 }, { "epoch": 1.2516279846385039, "grad_norm": 0.39088623320149796, "learning_rate": 7.235340729752584e-06, "loss": 0.0283, "step": 3748 }, { "epoch": 1.2519619302053766, "grad_norm": 0.2830828577664718, "learning_rate": 7.233602249492055e-06, "loss": 0.0244, "step": 3749 }, { "epoch": 1.252295875772249, "grad_norm": 0.29156033814021487, "learning_rate": 7.2318634318196045e-06, "loss": 0.0274, "step": 3750 }, { "epoch": 1.2526298213391218, "grad_norm": 0.4269714947221831, "learning_rate": 7.230124276997903e-06, "loss": 0.0371, "step": 3751 }, { "epoch": 1.2529637669059943, "grad_norm": 0.292773984024898, "learning_rate": 7.228384785289671e-06, "loss": 0.0283, "step": 3752 }, { "epoch": 1.2532977124728668, "grad_norm": 0.2606263972987227, "learning_rate": 7.2266449569576804e-06, "loss": 0.0235, "step": 3753 }, { "epoch": 1.2536316580397395, "grad_norm": 0.45745548811577635, "learning_rate": 7.224904792264748e-06, "loss": 0.0399, "step": 3754 }, { "epoch": 1.2539656036066122, "grad_norm": 0.3575217997424679, "learning_rate": 7.223164291473752e-06, "loss": 0.0217, "step": 3755 }, { "epoch": 1.2542995491734847, "grad_norm": 0.34302189426855073, "learning_rate": 7.221423454847611e-06, "loss": 0.0219, "step": 3756 }, { "epoch": 1.2546334947403572, "grad_norm": 0.3340657088961129, "learning_rate": 7.219682282649302e-06, "loss": 0.0209, "step": 3757 }, { "epoch": 1.25496744030723, "grad_norm": 0.3380616537037228, "learning_rate": 7.2179407751418485e-06, "loss": 0.0244, "step": 3758 }, { "epoch": 1.2553013858741024, "grad_norm": 0.2869316250472297, "learning_rate": 7.216198932588325e-06, "loss": 0.021, "step": 3759 }, { "epoch": 1.2556353314409752, "grad_norm": 0.31710265418670425, "learning_rate": 7.214456755251858e-06, "loss": 0.0246, "step": 3760 }, { "epoch": 1.2559692770078477, "grad_norm": 0.2905960517037898, "learning_rate": 7.212714243395623e-06, "loss": 0.0293, "step": 3761 }, { "epoch": 1.2563032225747204, "grad_norm": 0.3541273326846222, "learning_rate": 7.210971397282848e-06, "loss": 0.0265, "step": 3762 }, { "epoch": 1.2566371681415929, "grad_norm": 0.2694311729984884, "learning_rate": 7.20922821717681e-06, "loss": 0.0244, "step": 3763 }, { "epoch": 1.2569711137084656, "grad_norm": 0.2422903599196817, "learning_rate": 7.207484703340838e-06, "loss": 0.0204, "step": 3764 }, { "epoch": 1.257305059275338, "grad_norm": 0.2544145464785348, "learning_rate": 7.205740856038308e-06, "loss": 0.0234, "step": 3765 }, { "epoch": 1.2576390048422108, "grad_norm": 0.29299204974133036, "learning_rate": 7.2039966755326515e-06, "loss": 0.0253, "step": 3766 }, { "epoch": 1.2579729504090833, "grad_norm": 0.3177885966840588, "learning_rate": 7.2022521620873456e-06, "loss": 0.0256, "step": 3767 }, { "epoch": 1.2583068959759558, "grad_norm": 0.2542501316544053, "learning_rate": 7.2005073159659186e-06, "loss": 0.0215, "step": 3768 }, { "epoch": 1.2586408415428285, "grad_norm": 0.5497569773621858, "learning_rate": 7.198762137431952e-06, "loss": 0.0431, "step": 3769 }, { "epoch": 1.2589747871097012, "grad_norm": 0.2553974569267879, "learning_rate": 7.197016626749076e-06, "loss": 0.019, "step": 3770 }, { "epoch": 1.2593087326765737, "grad_norm": 0.19873663578782186, "learning_rate": 7.195270784180968e-06, "loss": 0.0156, "step": 3771 }, { "epoch": 1.2596426782434462, "grad_norm": 0.4408211680948502, "learning_rate": 7.193524609991359e-06, "loss": 0.0429, "step": 3772 }, { "epoch": 1.259976623810319, "grad_norm": 0.3525185783707462, "learning_rate": 7.191778104444031e-06, "loss": 0.0297, "step": 3773 }, { "epoch": 1.2603105693771914, "grad_norm": 0.3667028187971598, "learning_rate": 7.190031267802814e-06, "loss": 0.0344, "step": 3774 }, { "epoch": 1.2606445149440642, "grad_norm": 0.3342665533007622, "learning_rate": 7.188284100331585e-06, "loss": 0.0319, "step": 3775 }, { "epoch": 1.2609784605109366, "grad_norm": 0.27642443399097244, "learning_rate": 7.186536602294278e-06, "loss": 0.0287, "step": 3776 }, { "epoch": 1.2613124060778094, "grad_norm": 0.35601960051788384, "learning_rate": 7.184788773954871e-06, "loss": 0.0266, "step": 3777 }, { "epoch": 1.2616463516446819, "grad_norm": 0.17862828576846648, "learning_rate": 7.1830406155773946e-06, "loss": 0.0139, "step": 3778 }, { "epoch": 1.2619802972115546, "grad_norm": 0.35403715659010926, "learning_rate": 7.181292127425928e-06, "loss": 0.0265, "step": 3779 }, { "epoch": 1.262314242778427, "grad_norm": 0.3091743906409628, "learning_rate": 7.179543309764604e-06, "loss": 0.0275, "step": 3780 }, { "epoch": 1.2626481883452998, "grad_norm": 0.24350518113982092, "learning_rate": 7.177794162857598e-06, "loss": 0.0216, "step": 3781 }, { "epoch": 1.2629821339121723, "grad_norm": 0.3420262121066958, "learning_rate": 7.176044686969141e-06, "loss": 0.0346, "step": 3782 }, { "epoch": 1.2633160794790448, "grad_norm": 0.29497645371329917, "learning_rate": 7.174294882363513e-06, "loss": 0.0244, "step": 3783 }, { "epoch": 1.2636500250459175, "grad_norm": 0.27482921785317416, "learning_rate": 7.172544749305039e-06, "loss": 0.0258, "step": 3784 }, { "epoch": 1.2639839706127902, "grad_norm": 0.39636662140469486, "learning_rate": 7.170794288058103e-06, "loss": 0.0282, "step": 3785 }, { "epoch": 1.2643179161796627, "grad_norm": 0.2670025994032695, "learning_rate": 7.169043498887126e-06, "loss": 0.0224, "step": 3786 }, { "epoch": 1.2646518617465352, "grad_norm": 0.31426460403959644, "learning_rate": 7.1672923820565925e-06, "loss": 0.0214, "step": 3787 }, { "epoch": 1.264985807313408, "grad_norm": 0.35382114056259417, "learning_rate": 7.165540937831024e-06, "loss": 0.0209, "step": 3788 }, { "epoch": 1.2653197528802806, "grad_norm": 0.2190706486255055, "learning_rate": 7.163789166474998e-06, "loss": 0.0196, "step": 3789 }, { "epoch": 1.2656536984471531, "grad_norm": 0.3089953610537669, "learning_rate": 7.162037068253141e-06, "loss": 0.0181, "step": 3790 }, { "epoch": 1.2659876440140256, "grad_norm": 0.29351890815761744, "learning_rate": 7.160284643430129e-06, "loss": 0.0314, "step": 3791 }, { "epoch": 1.2663215895808984, "grad_norm": 0.5468606345028866, "learning_rate": 7.158531892270682e-06, "loss": 0.0397, "step": 3792 }, { "epoch": 1.2666555351477709, "grad_norm": 0.22776825216991417, "learning_rate": 7.156778815039579e-06, "loss": 0.0218, "step": 3793 }, { "epoch": 1.2669894807146436, "grad_norm": 0.3511226987717307, "learning_rate": 7.15502541200164e-06, "loss": 0.0332, "step": 3794 }, { "epoch": 1.267323426281516, "grad_norm": 0.2909516765573883, "learning_rate": 7.153271683421738e-06, "loss": 0.0229, "step": 3795 }, { "epoch": 1.2676573718483888, "grad_norm": 0.3054934354023608, "learning_rate": 7.151517629564795e-06, "loss": 0.0344, "step": 3796 }, { "epoch": 1.2679913174152613, "grad_norm": 0.25579604698663366, "learning_rate": 7.14976325069578e-06, "loss": 0.0239, "step": 3797 }, { "epoch": 1.268325262982134, "grad_norm": 0.2818385793611901, "learning_rate": 7.148008547079713e-06, "loss": 0.0229, "step": 3798 }, { "epoch": 1.2686592085490065, "grad_norm": 0.26293039949095737, "learning_rate": 7.1462535189816636e-06, "loss": 0.0227, "step": 3799 }, { "epoch": 1.2689931541158792, "grad_norm": 0.2627632990160037, "learning_rate": 7.14449816666675e-06, "loss": 0.0223, "step": 3800 }, { "epoch": 1.2693270996827517, "grad_norm": 0.2428387518039147, "learning_rate": 7.142742490400135e-06, "loss": 0.0191, "step": 3801 }, { "epoch": 1.2696610452496242, "grad_norm": 0.34986548303294884, "learning_rate": 7.140986490447039e-06, "loss": 0.0307, "step": 3802 }, { "epoch": 1.269994990816497, "grad_norm": 0.2687958689660581, "learning_rate": 7.139230167072724e-06, "loss": 0.0294, "step": 3803 }, { "epoch": 1.2703289363833696, "grad_norm": 0.279248991904363, "learning_rate": 7.137473520542503e-06, "loss": 0.0263, "step": 3804 }, { "epoch": 1.2706628819502421, "grad_norm": 0.3237279440794913, "learning_rate": 7.135716551121739e-06, "loss": 0.0355, "step": 3805 }, { "epoch": 1.2709968275171146, "grad_norm": 0.32990174059468896, "learning_rate": 7.133959259075844e-06, "loss": 0.0377, "step": 3806 }, { "epoch": 1.2713307730839873, "grad_norm": 0.27475097706870694, "learning_rate": 7.132201644670274e-06, "loss": 0.0196, "step": 3807 }, { "epoch": 1.2716647186508598, "grad_norm": 0.2970159606782559, "learning_rate": 7.13044370817054e-06, "loss": 0.0329, "step": 3808 }, { "epoch": 1.2719986642177326, "grad_norm": 0.3233728797140094, "learning_rate": 7.128685449842201e-06, "loss": 0.0235, "step": 3809 }, { "epoch": 1.272332609784605, "grad_norm": 0.29928552659389207, "learning_rate": 7.1269268699508574e-06, "loss": 0.0235, "step": 3810 }, { "epoch": 1.2726665553514778, "grad_norm": 0.269527821898528, "learning_rate": 7.1251679687621685e-06, "loss": 0.0193, "step": 3811 }, { "epoch": 1.2730005009183503, "grad_norm": 0.3104286192101173, "learning_rate": 7.123408746541835e-06, "loss": 0.0293, "step": 3812 }, { "epoch": 1.273334446485223, "grad_norm": 0.36794897135182747, "learning_rate": 7.1216492035556075e-06, "loss": 0.0273, "step": 3813 }, { "epoch": 1.2736683920520955, "grad_norm": 0.3134344825908921, "learning_rate": 7.119889340069286e-06, "loss": 0.0231, "step": 3814 }, { "epoch": 1.2740023376189682, "grad_norm": 0.2866279702689287, "learning_rate": 7.1181291563487175e-06, "loss": 0.0229, "step": 3815 }, { "epoch": 1.2743362831858407, "grad_norm": 0.2963489265558395, "learning_rate": 7.116368652659802e-06, "loss": 0.0266, "step": 3816 }, { "epoch": 1.2746702287527132, "grad_norm": 0.28477888768155046, "learning_rate": 7.114607829268481e-06, "loss": 0.0198, "step": 3817 }, { "epoch": 1.275004174319586, "grad_norm": 0.28887194985404846, "learning_rate": 7.1128466864407486e-06, "loss": 0.0304, "step": 3818 }, { "epoch": 1.2753381198864586, "grad_norm": 0.21621104290989435, "learning_rate": 7.111085224442647e-06, "loss": 0.017, "step": 3819 }, { "epoch": 1.2756720654533311, "grad_norm": 0.25506144939716724, "learning_rate": 7.109323443540263e-06, "loss": 0.0245, "step": 3820 }, { "epoch": 1.2760060110202036, "grad_norm": 0.3523064108286381, "learning_rate": 7.107561343999739e-06, "loss": 0.0225, "step": 3821 }, { "epoch": 1.2763399565870763, "grad_norm": 0.3004006393385708, "learning_rate": 7.105798926087257e-06, "loss": 0.0324, "step": 3822 }, { "epoch": 1.2766739021539488, "grad_norm": 0.2632550484129978, "learning_rate": 7.104036190069052e-06, "loss": 0.0213, "step": 3823 }, { "epoch": 1.2770078477208215, "grad_norm": 0.23054401128039914, "learning_rate": 7.102273136211407e-06, "loss": 0.0226, "step": 3824 }, { "epoch": 1.277341793287694, "grad_norm": 0.2628399430254801, "learning_rate": 7.10050976478065e-06, "loss": 0.0259, "step": 3825 }, { "epoch": 1.2776757388545668, "grad_norm": 0.3727161911287167, "learning_rate": 7.098746076043162e-06, "loss": 0.0285, "step": 3826 }, { "epoch": 1.2780096844214393, "grad_norm": 0.26894485749895153, "learning_rate": 7.096982070265366e-06, "loss": 0.0235, "step": 3827 }, { "epoch": 1.278343629988312, "grad_norm": 0.3045997903242515, "learning_rate": 7.0952177477137374e-06, "loss": 0.0251, "step": 3828 }, { "epoch": 1.2786775755551845, "grad_norm": 0.2708735775382479, "learning_rate": 7.093453108654798e-06, "loss": 0.029, "step": 3829 }, { "epoch": 1.2790115211220572, "grad_norm": 0.373792728388833, "learning_rate": 7.091688153355116e-06, "loss": 0.0243, "step": 3830 }, { "epoch": 1.2793454666889297, "grad_norm": 0.34068259251681265, "learning_rate": 7.08992288208131e-06, "loss": 0.0312, "step": 3831 }, { "epoch": 1.2796794122558022, "grad_norm": 0.30366043592667635, "learning_rate": 7.088157295100046e-06, "loss": 0.0226, "step": 3832 }, { "epoch": 1.280013357822675, "grad_norm": 0.2695816296727487, "learning_rate": 7.0863913926780335e-06, "loss": 0.0294, "step": 3833 }, { "epoch": 1.2803473033895476, "grad_norm": 0.2530947877307193, "learning_rate": 7.084625175082036e-06, "loss": 0.0204, "step": 3834 }, { "epoch": 1.28068124895642, "grad_norm": 0.3598741848865989, "learning_rate": 7.082858642578861e-06, "loss": 0.028, "step": 3835 }, { "epoch": 1.2810151945232926, "grad_norm": 0.3821041027680648, "learning_rate": 7.081091795435361e-06, "loss": 0.0312, "step": 3836 }, { "epoch": 1.2813491400901653, "grad_norm": 0.3892097845716495, "learning_rate": 7.079324633918443e-06, "loss": 0.0319, "step": 3837 }, { "epoch": 1.281683085657038, "grad_norm": 0.2419078489802551, "learning_rate": 7.077557158295053e-06, "loss": 0.0201, "step": 3838 }, { "epoch": 1.2820170312239105, "grad_norm": 0.3358163798342825, "learning_rate": 7.075789368832194e-06, "loss": 0.0292, "step": 3839 }, { "epoch": 1.282350976790783, "grad_norm": 0.44046064172342075, "learning_rate": 7.074021265796909e-06, "loss": 0.0272, "step": 3840 }, { "epoch": 1.2826849223576557, "grad_norm": 0.29494253778608953, "learning_rate": 7.072252849456291e-06, "loss": 0.0253, "step": 3841 }, { "epoch": 1.2830188679245282, "grad_norm": 0.31714296561014493, "learning_rate": 7.07048412007748e-06, "loss": 0.0245, "step": 3842 }, { "epoch": 1.283352813491401, "grad_norm": 0.4015211112379901, "learning_rate": 7.068715077927664e-06, "loss": 0.0233, "step": 3843 }, { "epoch": 1.2836867590582735, "grad_norm": 0.32931183887132787, "learning_rate": 7.066945723274077e-06, "loss": 0.035, "step": 3844 }, { "epoch": 1.2840207046251462, "grad_norm": 0.2152950381478552, "learning_rate": 7.065176056383999e-06, "loss": 0.0185, "step": 3845 }, { "epoch": 1.2843546501920187, "grad_norm": 0.3725494017257642, "learning_rate": 7.063406077524764e-06, "loss": 0.0264, "step": 3846 }, { "epoch": 1.2846885957588914, "grad_norm": 0.3170507120197753, "learning_rate": 7.061635786963743e-06, "loss": 0.0194, "step": 3847 }, { "epoch": 1.2850225413257639, "grad_norm": 0.335061845849936, "learning_rate": 7.059865184968362e-06, "loss": 0.0221, "step": 3848 }, { "epoch": 1.2853564868926366, "grad_norm": 0.3142541759000103, "learning_rate": 7.058094271806091e-06, "loss": 0.0356, "step": 3849 }, { "epoch": 1.285690432459509, "grad_norm": 0.2390745082714666, "learning_rate": 7.056323047744447e-06, "loss": 0.0177, "step": 3850 }, { "epoch": 1.2860243780263816, "grad_norm": 0.4288888245923048, "learning_rate": 7.054551513050993e-06, "loss": 0.0458, "step": 3851 }, { "epoch": 1.2863583235932543, "grad_norm": 0.4158711399622571, "learning_rate": 7.052779667993342e-06, "loss": 0.0388, "step": 3852 }, { "epoch": 1.286692269160127, "grad_norm": 0.3481540210344807, "learning_rate": 7.051007512839153e-06, "loss": 0.0276, "step": 3853 }, { "epoch": 1.2870262147269995, "grad_norm": 0.4434873906513856, "learning_rate": 7.0492350478561275e-06, "loss": 0.0296, "step": 3854 }, { "epoch": 1.287360160293872, "grad_norm": 0.40788009074663517, "learning_rate": 7.04746227331202e-06, "loss": 0.03, "step": 3855 }, { "epoch": 1.2876941058607447, "grad_norm": 0.32346129814996305, "learning_rate": 7.045689189474628e-06, "loss": 0.0199, "step": 3856 }, { "epoch": 1.2880280514276172, "grad_norm": 0.22659696481141453, "learning_rate": 7.0439157966117955e-06, "loss": 0.0196, "step": 3857 }, { "epoch": 1.28836199699449, "grad_norm": 0.3076910939405401, "learning_rate": 7.042142094991418e-06, "loss": 0.0192, "step": 3858 }, { "epoch": 1.2886959425613624, "grad_norm": 0.2412962650349266, "learning_rate": 7.04036808488143e-06, "loss": 0.0185, "step": 3859 }, { "epoch": 1.2890298881282352, "grad_norm": 0.3110724652350511, "learning_rate": 7.038593766549817e-06, "loss": 0.0257, "step": 3860 }, { "epoch": 1.2893638336951077, "grad_norm": 0.3382309693823327, "learning_rate": 7.0368191402646145e-06, "loss": 0.0315, "step": 3861 }, { "epoch": 1.2896977792619804, "grad_norm": 0.38629258045217485, "learning_rate": 7.035044206293898e-06, "loss": 0.0277, "step": 3862 }, { "epoch": 1.2900317248288529, "grad_norm": 0.3378226150888299, "learning_rate": 7.0332689649057905e-06, "loss": 0.0287, "step": 3863 }, { "epoch": 1.2903656703957256, "grad_norm": 0.31026600784411296, "learning_rate": 7.031493416368466e-06, "loss": 0.0256, "step": 3864 }, { "epoch": 1.290699615962598, "grad_norm": 0.42079825484514044, "learning_rate": 7.029717560950141e-06, "loss": 0.0389, "step": 3865 }, { "epoch": 1.2910335615294706, "grad_norm": 0.30751877180824905, "learning_rate": 7.027941398919078e-06, "loss": 0.0233, "step": 3866 }, { "epoch": 1.2913675070963433, "grad_norm": 0.3354377400321467, "learning_rate": 7.0261649305435895e-06, "loss": 0.0337, "step": 3867 }, { "epoch": 1.291701452663216, "grad_norm": 0.3288505264109346, "learning_rate": 7.02438815609203e-06, "loss": 0.0265, "step": 3868 }, { "epoch": 1.2920353982300885, "grad_norm": 0.4817040107976783, "learning_rate": 7.022611075832804e-06, "loss": 0.026, "step": 3869 }, { "epoch": 1.292369343796961, "grad_norm": 0.44315485160950524, "learning_rate": 7.02083369003436e-06, "loss": 0.0412, "step": 3870 }, { "epoch": 1.2927032893638337, "grad_norm": 0.24605793521653016, "learning_rate": 7.019055998965191e-06, "loss": 0.0198, "step": 3871 }, { "epoch": 1.2930372349307062, "grad_norm": 0.4426945239123617, "learning_rate": 7.017278002893841e-06, "loss": 0.0282, "step": 3872 }, { "epoch": 1.293371180497579, "grad_norm": 0.2725147428081745, "learning_rate": 7.015499702088896e-06, "loss": 0.0251, "step": 3873 }, { "epoch": 1.2937051260644514, "grad_norm": 0.2762178036510364, "learning_rate": 7.013721096818988e-06, "loss": 0.0214, "step": 3874 }, { "epoch": 1.2940390716313241, "grad_norm": 0.3484476255070034, "learning_rate": 7.011942187352798e-06, "loss": 0.0291, "step": 3875 }, { "epoch": 1.2943730171981966, "grad_norm": 0.33750939212648334, "learning_rate": 7.010162973959052e-06, "loss": 0.0283, "step": 3876 }, { "epoch": 1.2947069627650694, "grad_norm": 0.3273685064522598, "learning_rate": 7.008383456906518e-06, "loss": 0.0266, "step": 3877 }, { "epoch": 1.2950409083319419, "grad_norm": 0.32685118427331566, "learning_rate": 7.0066036364640165e-06, "loss": 0.025, "step": 3878 }, { "epoch": 1.2953748538988146, "grad_norm": 0.33583083029365574, "learning_rate": 7.004823512900408e-06, "loss": 0.0274, "step": 3879 }, { "epoch": 1.295708799465687, "grad_norm": 0.36445779700535097, "learning_rate": 7.003043086484602e-06, "loss": 0.0306, "step": 3880 }, { "epoch": 1.2960427450325596, "grad_norm": 0.2662035057674562, "learning_rate": 7.001262357485553e-06, "loss": 0.0227, "step": 3881 }, { "epoch": 1.2963766905994323, "grad_norm": 0.34726810137953923, "learning_rate": 6.99948132617226e-06, "loss": 0.0353, "step": 3882 }, { "epoch": 1.296710636166305, "grad_norm": 0.3300358273093104, "learning_rate": 6.99769999281377e-06, "loss": 0.0334, "step": 3883 }, { "epoch": 1.2970445817331775, "grad_norm": 0.2953831654724505, "learning_rate": 6.9959183576791745e-06, "loss": 0.0208, "step": 3884 }, { "epoch": 1.29737852730005, "grad_norm": 0.33514086178239627, "learning_rate": 6.9941364210376095e-06, "loss": 0.0212, "step": 3885 }, { "epoch": 1.2977124728669227, "grad_norm": 0.34895797860840005, "learning_rate": 6.992354183158258e-06, "loss": 0.0242, "step": 3886 }, { "epoch": 1.2980464184337954, "grad_norm": 0.3422201671917834, "learning_rate": 6.9905716443103475e-06, "loss": 0.0281, "step": 3887 }, { "epoch": 1.298380364000668, "grad_norm": 0.38144608640187583, "learning_rate": 6.9887888047631525e-06, "loss": 0.0351, "step": 3888 }, { "epoch": 1.2987143095675404, "grad_norm": 0.37988626182907137, "learning_rate": 6.987005664785991e-06, "loss": 0.0281, "step": 3889 }, { "epoch": 1.2990482551344131, "grad_norm": 0.37015257657339745, "learning_rate": 6.985222224648227e-06, "loss": 0.0284, "step": 3890 }, { "epoch": 1.2993822007012856, "grad_norm": 0.3035372110152349, "learning_rate": 6.983438484619272e-06, "loss": 0.0196, "step": 3891 }, { "epoch": 1.2997161462681583, "grad_norm": 0.25495387431951255, "learning_rate": 6.981654444968578e-06, "loss": 0.0248, "step": 3892 }, { "epoch": 1.3000500918350308, "grad_norm": 0.33620484416742547, "learning_rate": 6.979870105965648e-06, "loss": 0.0291, "step": 3893 }, { "epoch": 1.3003840374019036, "grad_norm": 0.39856282078247696, "learning_rate": 6.978085467880027e-06, "loss": 0.0294, "step": 3894 }, { "epoch": 1.300717982968776, "grad_norm": 0.40499896354271187, "learning_rate": 6.9763005309813025e-06, "loss": 0.0476, "step": 3895 }, { "epoch": 1.3010519285356488, "grad_norm": 0.32998626585092516, "learning_rate": 6.974515295539115e-06, "loss": 0.0262, "step": 3896 }, { "epoch": 1.3013858741025213, "grad_norm": 0.3444307969984011, "learning_rate": 6.9727297618231416e-06, "loss": 0.0296, "step": 3897 }, { "epoch": 1.301719819669394, "grad_norm": 0.3279836003629136, "learning_rate": 6.970943930103109e-06, "loss": 0.0244, "step": 3898 }, { "epoch": 1.3020537652362665, "grad_norm": 0.2944925667220935, "learning_rate": 6.96915780064879e-06, "loss": 0.0244, "step": 3899 }, { "epoch": 1.302387710803139, "grad_norm": 0.3948240601667736, "learning_rate": 6.96737137373e-06, "loss": 0.0317, "step": 3900 }, { "epoch": 1.3027216563700117, "grad_norm": 0.3032366799397449, "learning_rate": 6.965584649616597e-06, "loss": 0.0302, "step": 3901 }, { "epoch": 1.3030556019368844, "grad_norm": 0.38122992390738114, "learning_rate": 6.963797628578489e-06, "loss": 0.032, "step": 3902 }, { "epoch": 1.303389547503757, "grad_norm": 0.2678076758754484, "learning_rate": 6.962010310885627e-06, "loss": 0.0221, "step": 3903 }, { "epoch": 1.3037234930706294, "grad_norm": 0.3214451468010525, "learning_rate": 6.960222696808004e-06, "loss": 0.0356, "step": 3904 }, { "epoch": 1.3040574386375021, "grad_norm": 0.3518423311397084, "learning_rate": 6.958434786615663e-06, "loss": 0.0298, "step": 3905 }, { "epoch": 1.3043913842043746, "grad_norm": 0.21057233219584512, "learning_rate": 6.956646580578687e-06, "loss": 0.021, "step": 3906 }, { "epoch": 1.3047253297712473, "grad_norm": 0.5161034386935823, "learning_rate": 6.954858078967207e-06, "loss": 0.0426, "step": 3907 }, { "epoch": 1.3050592753381198, "grad_norm": 0.26331783533453623, "learning_rate": 6.953069282051397e-06, "loss": 0.0232, "step": 3908 }, { "epoch": 1.3053932209049925, "grad_norm": 0.23474646321070586, "learning_rate": 6.951280190101475e-06, "loss": 0.0212, "step": 3909 }, { "epoch": 1.305727166471865, "grad_norm": 0.42496950645456616, "learning_rate": 6.949490803387704e-06, "loss": 0.0337, "step": 3910 }, { "epoch": 1.3060611120387378, "grad_norm": 0.29339700000989377, "learning_rate": 6.9477011221803935e-06, "loss": 0.0309, "step": 3911 }, { "epoch": 1.3063950576056103, "grad_norm": 0.6979893125456561, "learning_rate": 6.945911146749894e-06, "loss": 0.0268, "step": 3912 }, { "epoch": 1.306729003172483, "grad_norm": 0.3445728422912598, "learning_rate": 6.944120877366605e-06, "loss": 0.0283, "step": 3913 }, { "epoch": 1.3070629487393555, "grad_norm": 0.5227605014625762, "learning_rate": 6.9423303143009644e-06, "loss": 0.0412, "step": 3914 }, { "epoch": 1.307396894306228, "grad_norm": 0.34993513734593257, "learning_rate": 6.940539457823459e-06, "loss": 0.0267, "step": 3915 }, { "epoch": 1.3077308398731007, "grad_norm": 0.3492814502001204, "learning_rate": 6.938748308204622e-06, "loss": 0.0319, "step": 3916 }, { "epoch": 1.3080647854399734, "grad_norm": 0.3836991897907742, "learning_rate": 6.936956865715024e-06, "loss": 0.0262, "step": 3917 }, { "epoch": 1.308398731006846, "grad_norm": 0.4155299096434416, "learning_rate": 6.9351651306252836e-06, "loss": 0.0336, "step": 3918 }, { "epoch": 1.3087326765737184, "grad_norm": 0.24702361474860343, "learning_rate": 6.933373103206064e-06, "loss": 0.0269, "step": 3919 }, { "epoch": 1.309066622140591, "grad_norm": 0.39994215569764924, "learning_rate": 6.931580783728075e-06, "loss": 0.0357, "step": 3920 }, { "epoch": 1.3094005677074636, "grad_norm": 0.27883276101757154, "learning_rate": 6.929788172462063e-06, "loss": 0.0281, "step": 3921 }, { "epoch": 1.3097345132743363, "grad_norm": 0.28887807961612816, "learning_rate": 6.927995269678826e-06, "loss": 0.0273, "step": 3922 }, { "epoch": 1.3100684588412088, "grad_norm": 0.4386138703907347, "learning_rate": 6.926202075649202e-06, "loss": 0.0467, "step": 3923 }, { "epoch": 1.3104024044080815, "grad_norm": 0.40678813429125205, "learning_rate": 6.924408590644073e-06, "loss": 0.0328, "step": 3924 }, { "epoch": 1.310736349974954, "grad_norm": 0.3913932790283656, "learning_rate": 6.922614814934367e-06, "loss": 0.0242, "step": 3925 }, { "epoch": 1.3110702955418267, "grad_norm": 0.38960844261127175, "learning_rate": 6.920820748791057e-06, "loss": 0.0325, "step": 3926 }, { "epoch": 1.3114042411086992, "grad_norm": 0.29252956824499116, "learning_rate": 6.919026392485154e-06, "loss": 0.0261, "step": 3927 }, { "epoch": 1.311738186675572, "grad_norm": 0.3039028024201547, "learning_rate": 6.91723174628772e-06, "loss": 0.0267, "step": 3928 }, { "epoch": 1.3120721322424445, "grad_norm": 0.38398838331243357, "learning_rate": 6.915436810469856e-06, "loss": 0.0253, "step": 3929 }, { "epoch": 1.312406077809317, "grad_norm": 0.24845265321342241, "learning_rate": 6.913641585302708e-06, "loss": 0.0181, "step": 3930 }, { "epoch": 1.3127400233761897, "grad_norm": 0.2506277610515143, "learning_rate": 6.9118460710574665e-06, "loss": 0.0245, "step": 3931 }, { "epoch": 1.3130739689430624, "grad_norm": 0.32113820929456094, "learning_rate": 6.910050268005364e-06, "loss": 0.0363, "step": 3932 }, { "epoch": 1.3134079145099349, "grad_norm": 0.3677484606300255, "learning_rate": 6.908254176417679e-06, "loss": 0.0328, "step": 3933 }, { "epoch": 1.3137418600768074, "grad_norm": 0.27919780414072687, "learning_rate": 6.906457796565732e-06, "loss": 0.0186, "step": 3934 }, { "epoch": 1.31407580564368, "grad_norm": 0.3400969435978823, "learning_rate": 6.904661128720887e-06, "loss": 0.027, "step": 3935 }, { "epoch": 1.3144097512105528, "grad_norm": 0.3043884951884974, "learning_rate": 6.902864173154551e-06, "loss": 0.0263, "step": 3936 }, { "epoch": 1.3147436967774253, "grad_norm": 0.293373219215153, "learning_rate": 6.9010669301381765e-06, "loss": 0.0289, "step": 3937 }, { "epoch": 1.3150776423442978, "grad_norm": 0.39515451698408716, "learning_rate": 6.899269399943258e-06, "loss": 0.0413, "step": 3938 }, { "epoch": 1.3154115879111705, "grad_norm": 0.3434005867420467, "learning_rate": 6.897471582841333e-06, "loss": 0.0264, "step": 3939 }, { "epoch": 1.315745533478043, "grad_norm": 0.36105915879525285, "learning_rate": 6.895673479103983e-06, "loss": 0.025, "step": 3940 }, { "epoch": 1.3160794790449157, "grad_norm": 0.39452187658778015, "learning_rate": 6.893875089002835e-06, "loss": 0.0372, "step": 3941 }, { "epoch": 1.3164134246117882, "grad_norm": 0.2761487674620673, "learning_rate": 6.892076412809553e-06, "loss": 0.0242, "step": 3942 }, { "epoch": 1.316747370178661, "grad_norm": 0.3253838307421517, "learning_rate": 6.890277450795851e-06, "loss": 0.0296, "step": 3943 }, { "epoch": 1.3170813157455334, "grad_norm": 0.30682564746593466, "learning_rate": 6.888478203233484e-06, "loss": 0.0207, "step": 3944 }, { "epoch": 1.3174152613124062, "grad_norm": 0.25463543711485626, "learning_rate": 6.886678670394247e-06, "loss": 0.0231, "step": 3945 }, { "epoch": 1.3177492068792787, "grad_norm": 0.28710601370474476, "learning_rate": 6.884878852549982e-06, "loss": 0.0228, "step": 3946 }, { "epoch": 1.3180831524461514, "grad_norm": 0.42826472453397146, "learning_rate": 6.883078749972573e-06, "loss": 0.0312, "step": 3947 }, { "epoch": 1.3184170980130239, "grad_norm": 0.3229766959914937, "learning_rate": 6.881278362933947e-06, "loss": 0.0294, "step": 3948 }, { "epoch": 1.3187510435798964, "grad_norm": 0.2917459954299232, "learning_rate": 6.879477691706071e-06, "loss": 0.0241, "step": 3949 }, { "epoch": 1.319084989146769, "grad_norm": 0.36512101323717394, "learning_rate": 6.877676736560961e-06, "loss": 0.0316, "step": 3950 }, { "epoch": 1.3194189347136418, "grad_norm": 0.45980782785937463, "learning_rate": 6.87587549777067e-06, "loss": 0.0462, "step": 3951 }, { "epoch": 1.3197528802805143, "grad_norm": 0.3148451226110855, "learning_rate": 6.874073975607298e-06, "loss": 0.0287, "step": 3952 }, { "epoch": 1.3200868258473868, "grad_norm": 0.3383131679801026, "learning_rate": 6.872272170342985e-06, "loss": 0.0273, "step": 3953 }, { "epoch": 1.3204207714142595, "grad_norm": 0.5932896335075106, "learning_rate": 6.870470082249917e-06, "loss": 0.039, "step": 3954 }, { "epoch": 1.320754716981132, "grad_norm": 0.2994717729086685, "learning_rate": 6.868667711600318e-06, "loss": 0.0227, "step": 3955 }, { "epoch": 1.3210886625480047, "grad_norm": 0.3657604982283384, "learning_rate": 6.866865058666459e-06, "loss": 0.0422, "step": 3956 }, { "epoch": 1.3214226081148772, "grad_norm": 0.293653773705607, "learning_rate": 6.86506212372065e-06, "loss": 0.03, "step": 3957 }, { "epoch": 1.32175655368175, "grad_norm": 0.3430768386734613, "learning_rate": 6.863258907035246e-06, "loss": 0.0233, "step": 3958 }, { "epoch": 1.3220904992486224, "grad_norm": 0.350082700433093, "learning_rate": 6.861455408882647e-06, "loss": 0.039, "step": 3959 }, { "epoch": 1.3224244448154951, "grad_norm": 0.6032409866829661, "learning_rate": 6.85965162953529e-06, "loss": 0.026, "step": 3960 }, { "epoch": 1.3227583903823676, "grad_norm": 0.2505725461489505, "learning_rate": 6.857847569265657e-06, "loss": 0.0198, "step": 3961 }, { "epoch": 1.3230923359492404, "grad_norm": 0.31158792659610374, "learning_rate": 6.8560432283462745e-06, "loss": 0.0252, "step": 3962 }, { "epoch": 1.3234262815161129, "grad_norm": 0.3607329340646706, "learning_rate": 6.854238607049707e-06, "loss": 0.0341, "step": 3963 }, { "epoch": 1.3237602270829854, "grad_norm": 0.27647703602120005, "learning_rate": 6.852433705648566e-06, "loss": 0.0256, "step": 3964 }, { "epoch": 1.324094172649858, "grad_norm": 0.4046546985600388, "learning_rate": 6.8506285244155e-06, "loss": 0.0257, "step": 3965 }, { "epoch": 1.3244281182167308, "grad_norm": 0.3514736909950181, "learning_rate": 6.848823063623207e-06, "loss": 0.0337, "step": 3966 }, { "epoch": 1.3247620637836033, "grad_norm": 0.35150070137713213, "learning_rate": 6.84701732354442e-06, "loss": 0.0356, "step": 3967 }, { "epoch": 1.3250960093504758, "grad_norm": 0.28099789366834593, "learning_rate": 6.845211304451919e-06, "loss": 0.0231, "step": 3968 }, { "epoch": 1.3254299549173485, "grad_norm": 0.3049624975688236, "learning_rate": 6.843405006618523e-06, "loss": 0.0225, "step": 3969 }, { "epoch": 1.325763900484221, "grad_norm": 0.23477450061474828, "learning_rate": 6.841598430317096e-06, "loss": 0.0226, "step": 3970 }, { "epoch": 1.3260978460510937, "grad_norm": 0.3179366600343478, "learning_rate": 6.839791575820541e-06, "loss": 0.0189, "step": 3971 }, { "epoch": 1.3264317916179662, "grad_norm": 0.3096965137248418, "learning_rate": 6.837984443401807e-06, "loss": 0.029, "step": 3972 }, { "epoch": 1.326765737184839, "grad_norm": 0.3348005675592078, "learning_rate": 6.836177033333882e-06, "loss": 0.0236, "step": 3973 }, { "epoch": 1.3270996827517114, "grad_norm": 0.2841523201390299, "learning_rate": 6.834369345889793e-06, "loss": 0.0257, "step": 3974 }, { "epoch": 1.3274336283185841, "grad_norm": 0.29332746682343497, "learning_rate": 6.832561381342617e-06, "loss": 0.0335, "step": 3975 }, { "epoch": 1.3277675738854566, "grad_norm": 0.246911495207825, "learning_rate": 6.830753139965467e-06, "loss": 0.0223, "step": 3976 }, { "epoch": 1.3281015194523293, "grad_norm": 0.30704740336809216, "learning_rate": 6.828944622031497e-06, "loss": 0.0296, "step": 3977 }, { "epoch": 1.3284354650192018, "grad_norm": 0.365974937372308, "learning_rate": 6.827135827813909e-06, "loss": 0.0288, "step": 3978 }, { "epoch": 1.3287694105860743, "grad_norm": 0.3236335047280812, "learning_rate": 6.825326757585939e-06, "loss": 0.0316, "step": 3979 }, { "epoch": 1.329103356152947, "grad_norm": 0.2800451560116284, "learning_rate": 6.823517411620871e-06, "loss": 0.0199, "step": 3980 }, { "epoch": 1.3294373017198198, "grad_norm": 0.2877918505125145, "learning_rate": 6.821707790192025e-06, "loss": 0.0255, "step": 3981 }, { "epoch": 1.3297712472866923, "grad_norm": 0.3273448367309883, "learning_rate": 6.819897893572769e-06, "loss": 0.025, "step": 3982 }, { "epoch": 1.3301051928535648, "grad_norm": 0.2400797163629544, "learning_rate": 6.818087722036507e-06, "loss": 0.0233, "step": 3983 }, { "epoch": 1.3304391384204375, "grad_norm": 0.3002257737199839, "learning_rate": 6.8162772758566875e-06, "loss": 0.0221, "step": 3984 }, { "epoch": 1.3307730839873102, "grad_norm": 0.42312864191195376, "learning_rate": 6.8144665553067975e-06, "loss": 0.0386, "step": 3985 }, { "epoch": 1.3311070295541827, "grad_norm": 0.234612610844101, "learning_rate": 6.812655560660373e-06, "loss": 0.0249, "step": 3986 }, { "epoch": 1.3314409751210552, "grad_norm": 0.21714699849900038, "learning_rate": 6.810844292190982e-06, "loss": 0.0214, "step": 3987 }, { "epoch": 1.331774920687928, "grad_norm": 0.36538631652745873, "learning_rate": 6.809032750172236e-06, "loss": 0.042, "step": 3988 }, { "epoch": 1.3321088662548004, "grad_norm": 0.32197788602142097, "learning_rate": 6.807220934877794e-06, "loss": 0.0272, "step": 3989 }, { "epoch": 1.3324428118216731, "grad_norm": 0.2526354456491122, "learning_rate": 6.80540884658135e-06, "loss": 0.0234, "step": 3990 }, { "epoch": 1.3327767573885456, "grad_norm": 0.5313469904936909, "learning_rate": 6.803596485556643e-06, "loss": 0.0362, "step": 3991 }, { "epoch": 1.3331107029554183, "grad_norm": 0.3438581739382402, "learning_rate": 6.8017838520774494e-06, "loss": 0.0213, "step": 3992 }, { "epoch": 1.3334446485222908, "grad_norm": 0.36752865327424455, "learning_rate": 6.79997094641759e-06, "loss": 0.0298, "step": 3993 }, { "epoch": 1.3337785940891635, "grad_norm": 0.5564283943026228, "learning_rate": 6.798157768850924e-06, "loss": 0.0283, "step": 3994 }, { "epoch": 1.334112539656036, "grad_norm": 0.28383456837215054, "learning_rate": 6.796344319651356e-06, "loss": 0.018, "step": 3995 }, { "epoch": 1.3344464852229088, "grad_norm": 1.0556049547755992, "learning_rate": 6.794530599092826e-06, "loss": 0.0376, "step": 3996 }, { "epoch": 1.3347804307897813, "grad_norm": 0.4417443049678084, "learning_rate": 6.792716607449319e-06, "loss": 0.0303, "step": 3997 }, { "epoch": 1.3351143763566538, "grad_norm": 0.6345478217588778, "learning_rate": 6.790902344994861e-06, "loss": 0.0257, "step": 3998 }, { "epoch": 1.3354483219235265, "grad_norm": 0.3070025343946019, "learning_rate": 6.789087812003516e-06, "loss": 0.025, "step": 3999 }, { "epoch": 1.3357822674903992, "grad_norm": 0.24306447100088768, "learning_rate": 6.787273008749391e-06, "loss": 0.0231, "step": 4000 }, { "epoch": 1.3361162130572717, "grad_norm": 0.4126685742015921, "learning_rate": 6.785457935506634e-06, "loss": 0.0345, "step": 4001 }, { "epoch": 1.3364501586241442, "grad_norm": 0.2969746284841204, "learning_rate": 6.783642592549433e-06, "loss": 0.0254, "step": 4002 }, { "epoch": 1.336784104191017, "grad_norm": 0.38972185979025803, "learning_rate": 6.781826980152015e-06, "loss": 0.0262, "step": 4003 }, { "epoch": 1.3371180497578894, "grad_norm": 0.2857683016397726, "learning_rate": 6.780011098588654e-06, "loss": 0.0289, "step": 4004 }, { "epoch": 1.337451995324762, "grad_norm": 0.23426567056860534, "learning_rate": 6.778194948133656e-06, "loss": 0.0179, "step": 4005 }, { "epoch": 1.3377859408916346, "grad_norm": 0.2651808306864631, "learning_rate": 6.776378529061374e-06, "loss": 0.0246, "step": 4006 }, { "epoch": 1.3381198864585073, "grad_norm": 0.2885585517738164, "learning_rate": 6.774561841646199e-06, "loss": 0.0216, "step": 4007 }, { "epoch": 1.3384538320253798, "grad_norm": 0.46368001528475267, "learning_rate": 6.772744886162563e-06, "loss": 0.0278, "step": 4008 }, { "epoch": 1.3387877775922525, "grad_norm": 0.33113431384047287, "learning_rate": 6.770927662884937e-06, "loss": 0.034, "step": 4009 }, { "epoch": 1.339121723159125, "grad_norm": 0.27994058941053224, "learning_rate": 6.769110172087838e-06, "loss": 0.0262, "step": 4010 }, { "epoch": 1.3394556687259978, "grad_norm": 0.24977503041778243, "learning_rate": 6.767292414045816e-06, "loss": 0.023, "step": 4011 }, { "epoch": 1.3397896142928702, "grad_norm": 0.9873981358898284, "learning_rate": 6.765474389033464e-06, "loss": 0.0331, "step": 4012 }, { "epoch": 1.3401235598597427, "grad_norm": 0.4491882980158148, "learning_rate": 6.7636560973254195e-06, "loss": 0.0275, "step": 4013 }, { "epoch": 1.3404575054266155, "grad_norm": 0.345020742446994, "learning_rate": 6.761837539196355e-06, "loss": 0.0252, "step": 4014 }, { "epoch": 1.3407914509934882, "grad_norm": 0.39817735840628704, "learning_rate": 6.760018714920985e-06, "loss": 0.0305, "step": 4015 }, { "epoch": 1.3411253965603607, "grad_norm": 0.3551586935083602, "learning_rate": 6.758199624774065e-06, "loss": 0.026, "step": 4016 }, { "epoch": 1.3414593421272332, "grad_norm": 0.32117913628735567, "learning_rate": 6.7563802690303895e-06, "loss": 0.0289, "step": 4017 }, { "epoch": 1.3417932876941059, "grad_norm": 0.3473114328174935, "learning_rate": 6.7545606479647915e-06, "loss": 0.0266, "step": 4018 }, { "epoch": 1.3421272332609784, "grad_norm": 0.2941592745160124, "learning_rate": 6.752740761852151e-06, "loss": 0.0273, "step": 4019 }, { "epoch": 1.342461178827851, "grad_norm": 0.2774360939047979, "learning_rate": 6.7509206109673794e-06, "loss": 0.0249, "step": 4020 }, { "epoch": 1.3427951243947236, "grad_norm": 0.32894801950482583, "learning_rate": 6.749100195585433e-06, "loss": 0.0259, "step": 4021 }, { "epoch": 1.3431290699615963, "grad_norm": 0.40103198680095004, "learning_rate": 6.747279515981307e-06, "loss": 0.049, "step": 4022 }, { "epoch": 1.3434630155284688, "grad_norm": 0.3833407081992172, "learning_rate": 6.745458572430038e-06, "loss": 0.0289, "step": 4023 }, { "epoch": 1.3437969610953415, "grad_norm": 0.26911043319382494, "learning_rate": 6.743637365206698e-06, "loss": 0.0226, "step": 4024 }, { "epoch": 1.344130906662214, "grad_norm": 0.3224970861528086, "learning_rate": 6.741815894586404e-06, "loss": 0.0332, "step": 4025 }, { "epoch": 1.3444648522290867, "grad_norm": 0.4219560992033902, "learning_rate": 6.7399941608443096e-06, "loss": 0.0199, "step": 4026 }, { "epoch": 1.3447987977959592, "grad_norm": 0.2658775478467294, "learning_rate": 6.7381721642556095e-06, "loss": 0.0217, "step": 4027 }, { "epoch": 1.3451327433628317, "grad_norm": 0.3586867143621259, "learning_rate": 6.736349905095538e-06, "loss": 0.0264, "step": 4028 }, { "epoch": 1.3454666889297044, "grad_norm": 0.2978065766752167, "learning_rate": 6.734527383639369e-06, "loss": 0.027, "step": 4029 }, { "epoch": 1.3458006344965772, "grad_norm": 0.25314422291377997, "learning_rate": 6.732704600162414e-06, "loss": 0.0269, "step": 4030 }, { "epoch": 1.3461345800634497, "grad_norm": 0.2973028453678317, "learning_rate": 6.730881554940029e-06, "loss": 0.0275, "step": 4031 }, { "epoch": 1.3464685256303222, "grad_norm": 0.32619727280955996, "learning_rate": 6.729058248247602e-06, "loss": 0.0346, "step": 4032 }, { "epoch": 1.3468024711971949, "grad_norm": 0.3434849845380133, "learning_rate": 6.727234680360569e-06, "loss": 0.0361, "step": 4033 }, { "epoch": 1.3471364167640676, "grad_norm": 0.22024543274433755, "learning_rate": 6.725410851554401e-06, "loss": 0.0227, "step": 4034 }, { "epoch": 1.34747036233094, "grad_norm": 0.4967844811902783, "learning_rate": 6.7235867621046055e-06, "loss": 0.0297, "step": 4035 }, { "epoch": 1.3478043078978126, "grad_norm": 0.310306987070182, "learning_rate": 6.721762412286738e-06, "loss": 0.0293, "step": 4036 }, { "epoch": 1.3481382534646853, "grad_norm": 0.3388201384862846, "learning_rate": 6.719937802376383e-06, "loss": 0.0256, "step": 4037 }, { "epoch": 1.3484721990315578, "grad_norm": 0.270161787617815, "learning_rate": 6.718112932649171e-06, "loss": 0.0246, "step": 4038 }, { "epoch": 1.3488061445984305, "grad_norm": 0.3748167636011021, "learning_rate": 6.716287803380771e-06, "loss": 0.0252, "step": 4039 }, { "epoch": 1.349140090165303, "grad_norm": 0.35972495101420165, "learning_rate": 6.714462414846891e-06, "loss": 0.0346, "step": 4040 }, { "epoch": 1.3494740357321757, "grad_norm": 0.42476931990784994, "learning_rate": 6.712636767323273e-06, "loss": 0.0249, "step": 4041 }, { "epoch": 1.3498079812990482, "grad_norm": 0.3103294509936811, "learning_rate": 6.710810861085708e-06, "loss": 0.0257, "step": 4042 }, { "epoch": 1.3501419268659207, "grad_norm": 0.3012245512907342, "learning_rate": 6.708984696410018e-06, "loss": 0.0285, "step": 4043 }, { "epoch": 1.3504758724327934, "grad_norm": 0.21337508707349964, "learning_rate": 6.707158273572066e-06, "loss": 0.0189, "step": 4044 }, { "epoch": 1.3508098179996662, "grad_norm": 0.346151871120594, "learning_rate": 6.7053315928477566e-06, "loss": 0.0276, "step": 4045 }, { "epoch": 1.3511437635665386, "grad_norm": 0.29626553431200836, "learning_rate": 6.703504654513031e-06, "loss": 0.0287, "step": 4046 }, { "epoch": 1.3514777091334111, "grad_norm": 0.2876935778489747, "learning_rate": 6.701677458843868e-06, "loss": 0.0219, "step": 4047 }, { "epoch": 1.3518116547002839, "grad_norm": 0.29341604070807464, "learning_rate": 6.6998500061162884e-06, "loss": 0.0245, "step": 4048 }, { "epoch": 1.3521456002671566, "grad_norm": 0.418776001046412, "learning_rate": 6.6980222966063516e-06, "loss": 0.0193, "step": 4049 }, { "epoch": 1.352479545834029, "grad_norm": 0.26535454991719315, "learning_rate": 6.6961943305901515e-06, "loss": 0.0259, "step": 4050 }, { "epoch": 1.3528134914009016, "grad_norm": 0.5962298072412725, "learning_rate": 6.694366108343827e-06, "loss": 0.049, "step": 4051 }, { "epoch": 1.3531474369677743, "grad_norm": 0.4392971826647376, "learning_rate": 6.692537630143551e-06, "loss": 0.0308, "step": 4052 }, { "epoch": 1.3534813825346468, "grad_norm": 0.32724956884853096, "learning_rate": 6.6907088962655375e-06, "loss": 0.0323, "step": 4053 }, { "epoch": 1.3538153281015195, "grad_norm": 0.3628099638430513, "learning_rate": 6.688879906986036e-06, "loss": 0.0284, "step": 4054 }, { "epoch": 1.354149273668392, "grad_norm": 0.46593214083179785, "learning_rate": 6.687050662581341e-06, "loss": 0.0303, "step": 4055 }, { "epoch": 1.3544832192352647, "grad_norm": 0.3224708133821626, "learning_rate": 6.685221163327778e-06, "loss": 0.0273, "step": 4056 }, { "epoch": 1.3548171648021372, "grad_norm": 0.34515194238754754, "learning_rate": 6.683391409501715e-06, "loss": 0.0356, "step": 4057 }, { "epoch": 1.35515111036901, "grad_norm": 0.46353458831364824, "learning_rate": 6.6815614013795595e-06, "loss": 0.0556, "step": 4058 }, { "epoch": 1.3554850559358824, "grad_norm": 0.40208487931458303, "learning_rate": 6.679731139237753e-06, "loss": 0.0275, "step": 4059 }, { "epoch": 1.3558190015027551, "grad_norm": 0.31487963630422133, "learning_rate": 6.67790062335278e-06, "loss": 0.0277, "step": 4060 }, { "epoch": 1.3561529470696276, "grad_norm": 0.2344332093466108, "learning_rate": 6.676069854001162e-06, "loss": 0.0213, "step": 4061 }, { "epoch": 1.3564868926365001, "grad_norm": 0.2652788846659976, "learning_rate": 6.674238831459456e-06, "loss": 0.033, "step": 4062 }, { "epoch": 1.3568208382033728, "grad_norm": 0.3535027844673089, "learning_rate": 6.672407556004262e-06, "loss": 0.0308, "step": 4063 }, { "epoch": 1.3571547837702456, "grad_norm": 0.3466652983065082, "learning_rate": 6.670576027912215e-06, "loss": 0.0246, "step": 4064 }, { "epoch": 1.357488729337118, "grad_norm": 0.2872669596381113, "learning_rate": 6.668744247459988e-06, "loss": 0.0282, "step": 4065 }, { "epoch": 1.3578226749039906, "grad_norm": 0.3195019261678072, "learning_rate": 6.666912214924295e-06, "loss": 0.0291, "step": 4066 }, { "epoch": 1.3581566204708633, "grad_norm": 0.25006459208573967, "learning_rate": 6.665079930581883e-06, "loss": 0.0243, "step": 4067 }, { "epoch": 1.3584905660377358, "grad_norm": 0.2879867660869676, "learning_rate": 6.663247394709542e-06, "loss": 0.0285, "step": 4068 }, { "epoch": 1.3588245116046085, "grad_norm": 0.3299351416990284, "learning_rate": 6.661414607584099e-06, "loss": 0.0406, "step": 4069 }, { "epoch": 1.359158457171481, "grad_norm": 0.24675535742236313, "learning_rate": 6.659581569482415e-06, "loss": 0.0211, "step": 4070 }, { "epoch": 1.3594924027383537, "grad_norm": 0.25695780423165604, "learning_rate": 6.657748280681395e-06, "loss": 0.0273, "step": 4071 }, { "epoch": 1.3598263483052262, "grad_norm": 0.3971129785947468, "learning_rate": 6.65591474145798e-06, "loss": 0.0311, "step": 4072 }, { "epoch": 1.360160293872099, "grad_norm": 0.3535264899370516, "learning_rate": 6.6540809520891425e-06, "loss": 0.029, "step": 4073 }, { "epoch": 1.3604942394389714, "grad_norm": 0.2722749459673896, "learning_rate": 6.652246912851903e-06, "loss": 0.0255, "step": 4074 }, { "epoch": 1.3608281850058441, "grad_norm": 0.3570074437582283, "learning_rate": 6.650412624023311e-06, "loss": 0.0283, "step": 4075 }, { "epoch": 1.3611621305727166, "grad_norm": 0.3915857756994722, "learning_rate": 6.648578085880461e-06, "loss": 0.0238, "step": 4076 }, { "epoch": 1.3614960761395891, "grad_norm": 0.3202078213972106, "learning_rate": 6.64674329870048e-06, "loss": 0.027, "step": 4077 }, { "epoch": 1.3618300217064618, "grad_norm": 0.2641663134144966, "learning_rate": 6.644908262760531e-06, "loss": 0.0221, "step": 4078 }, { "epoch": 1.3621639672733346, "grad_norm": 0.28433526909822354, "learning_rate": 6.643072978337823e-06, "loss": 0.0253, "step": 4079 }, { "epoch": 1.362497912840207, "grad_norm": 0.31299018742545603, "learning_rate": 6.641237445709595e-06, "loss": 0.0274, "step": 4080 }, { "epoch": 1.3628318584070795, "grad_norm": 0.24717740346921763, "learning_rate": 6.639401665153126e-06, "loss": 0.0229, "step": 4081 }, { "epoch": 1.3631658039739523, "grad_norm": 0.27096646609903063, "learning_rate": 6.637565636945731e-06, "loss": 0.0258, "step": 4082 }, { "epoch": 1.363499749540825, "grad_norm": 0.2690275667539032, "learning_rate": 6.635729361364765e-06, "loss": 0.0213, "step": 4083 }, { "epoch": 1.3638336951076975, "grad_norm": 0.26993706166426773, "learning_rate": 6.633892838687621e-06, "loss": 0.0186, "step": 4084 }, { "epoch": 1.36416764067457, "grad_norm": 0.5005558098460409, "learning_rate": 6.632056069191723e-06, "loss": 0.0313, "step": 4085 }, { "epoch": 1.3645015862414427, "grad_norm": 0.23785822741383483, "learning_rate": 6.6302190531545395e-06, "loss": 0.022, "step": 4086 }, { "epoch": 1.3648355318083152, "grad_norm": 0.3218355765369769, "learning_rate": 6.628381790853573e-06, "loss": 0.035, "step": 4087 }, { "epoch": 1.365169477375188, "grad_norm": 0.30629019108515765, "learning_rate": 6.626544282566363e-06, "loss": 0.0287, "step": 4088 }, { "epoch": 1.3655034229420604, "grad_norm": 0.3601402344314719, "learning_rate": 6.624706528570487e-06, "loss": 0.034, "step": 4089 }, { "epoch": 1.3658373685089331, "grad_norm": 0.4365428161116427, "learning_rate": 6.6228685291435605e-06, "loss": 0.0538, "step": 4090 }, { "epoch": 1.3661713140758056, "grad_norm": 0.31154510065927254, "learning_rate": 6.621030284563232e-06, "loss": 0.0225, "step": 4091 }, { "epoch": 1.366505259642678, "grad_norm": 0.2619073412486402, "learning_rate": 6.619191795107195e-06, "loss": 0.0241, "step": 4092 }, { "epoch": 1.3668392052095508, "grad_norm": 0.3073309860724148, "learning_rate": 6.617353061053171e-06, "loss": 0.0346, "step": 4093 }, { "epoch": 1.3671731507764235, "grad_norm": 0.3651138576018971, "learning_rate": 6.615514082678922e-06, "loss": 0.0275, "step": 4094 }, { "epoch": 1.367507096343296, "grad_norm": 0.37263957485508603, "learning_rate": 6.613674860262249e-06, "loss": 0.0338, "step": 4095 }, { "epoch": 1.3678410419101685, "grad_norm": 0.31759700401037727, "learning_rate": 6.61183539408099e-06, "loss": 0.0193, "step": 4096 }, { "epoch": 1.3681749874770412, "grad_norm": 0.2796330865033312, "learning_rate": 6.609995684413013e-06, "loss": 0.0237, "step": 4097 }, { "epoch": 1.368508933043914, "grad_norm": 0.3106557266787774, "learning_rate": 6.608155731536233e-06, "loss": 0.0282, "step": 4098 }, { "epoch": 1.3688428786107865, "grad_norm": 0.23696635035589267, "learning_rate": 6.606315535728594e-06, "loss": 0.0242, "step": 4099 }, { "epoch": 1.369176824177659, "grad_norm": 0.352664668684995, "learning_rate": 6.604475097268079e-06, "loss": 0.0261, "step": 4100 }, { "epoch": 1.3695107697445317, "grad_norm": 0.2374625906875964, "learning_rate": 6.602634416432708e-06, "loss": 0.0187, "step": 4101 }, { "epoch": 1.3698447153114042, "grad_norm": 0.47414479442954915, "learning_rate": 6.600793493500539e-06, "loss": 0.0406, "step": 4102 }, { "epoch": 1.3701786608782769, "grad_norm": 0.33701968193020054, "learning_rate": 6.5989523287496645e-06, "loss": 0.021, "step": 4103 }, { "epoch": 1.3705126064451494, "grad_norm": 0.39106877817440683, "learning_rate": 6.597110922458214e-06, "loss": 0.0207, "step": 4104 }, { "epoch": 1.370846552012022, "grad_norm": 0.2999427539554092, "learning_rate": 6.595269274904351e-06, "loss": 0.0227, "step": 4105 }, { "epoch": 1.3711804975788946, "grad_norm": 0.3149311012443616, "learning_rate": 6.593427386366282e-06, "loss": 0.0273, "step": 4106 }, { "epoch": 1.3715144431457673, "grad_norm": 0.35364225049435305, "learning_rate": 6.591585257122244e-06, "loss": 0.0385, "step": 4107 }, { "epoch": 1.3718483887126398, "grad_norm": 0.34072234254307066, "learning_rate": 6.589742887450512e-06, "loss": 0.0256, "step": 4108 }, { "epoch": 1.3721823342795125, "grad_norm": 0.25273579340021224, "learning_rate": 6.5879002776294e-06, "loss": 0.0222, "step": 4109 }, { "epoch": 1.372516279846385, "grad_norm": 0.2705604579708577, "learning_rate": 6.586057427937252e-06, "loss": 0.0295, "step": 4110 }, { "epoch": 1.3728502254132575, "grad_norm": 0.3050249184143848, "learning_rate": 6.584214338652455e-06, "loss": 0.0333, "step": 4111 }, { "epoch": 1.3731841709801302, "grad_norm": 0.32920241603327477, "learning_rate": 6.582371010053429e-06, "loss": 0.0345, "step": 4112 }, { "epoch": 1.373518116547003, "grad_norm": 0.33457957602169924, "learning_rate": 6.58052744241863e-06, "loss": 0.0242, "step": 4113 }, { "epoch": 1.3738520621138754, "grad_norm": 0.3227834142235845, "learning_rate": 6.578683636026551e-06, "loss": 0.0319, "step": 4114 }, { "epoch": 1.374186007680748, "grad_norm": 0.3635738039039029, "learning_rate": 6.576839591155719e-06, "loss": 0.03, "step": 4115 }, { "epoch": 1.3745199532476207, "grad_norm": 0.28247441512053173, "learning_rate": 6.574995308084702e-06, "loss": 0.0221, "step": 4116 }, { "epoch": 1.3748538988144932, "grad_norm": 0.24864008592072592, "learning_rate": 6.573150787092097e-06, "loss": 0.018, "step": 4117 }, { "epoch": 1.3751878443813659, "grad_norm": 0.3214042563466242, "learning_rate": 6.5713060284565435e-06, "loss": 0.0261, "step": 4118 }, { "epoch": 1.3755217899482384, "grad_norm": 0.28233940140350905, "learning_rate": 6.569461032456713e-06, "loss": 0.0264, "step": 4119 }, { "epoch": 1.375855735515111, "grad_norm": 0.28718907119051273, "learning_rate": 6.567615799371313e-06, "loss": 0.0233, "step": 4120 }, { "epoch": 1.3761896810819836, "grad_norm": 0.3066969589450228, "learning_rate": 6.565770329479089e-06, "loss": 0.0208, "step": 4121 }, { "epoch": 1.3765236266488563, "grad_norm": 0.2836497564182992, "learning_rate": 6.5639246230588205e-06, "loss": 0.029, "step": 4122 }, { "epoch": 1.3768575722157288, "grad_norm": 0.2818740754115631, "learning_rate": 6.562078680389323e-06, "loss": 0.0182, "step": 4123 }, { "epoch": 1.3771915177826015, "grad_norm": 0.5710079291815752, "learning_rate": 6.560232501749446e-06, "loss": 0.0284, "step": 4124 }, { "epoch": 1.377525463349474, "grad_norm": 0.2753884512123724, "learning_rate": 6.558386087418082e-06, "loss": 0.025, "step": 4125 }, { "epoch": 1.3778594089163465, "grad_norm": 0.31365721658402956, "learning_rate": 6.556539437674147e-06, "loss": 0.0279, "step": 4126 }, { "epoch": 1.3781933544832192, "grad_norm": 0.33371329476070377, "learning_rate": 6.554692552796604e-06, "loss": 0.0338, "step": 4127 }, { "epoch": 1.378527300050092, "grad_norm": 0.32997381420229777, "learning_rate": 6.552845433064445e-06, "loss": 0.0294, "step": 4128 }, { "epoch": 1.3788612456169644, "grad_norm": 0.28196085692407397, "learning_rate": 6.550998078756698e-06, "loss": 0.026, "step": 4129 }, { "epoch": 1.379195191183837, "grad_norm": 0.3099513281295468, "learning_rate": 6.549150490152429e-06, "loss": 0.0407, "step": 4130 }, { "epoch": 1.3795291367507097, "grad_norm": 0.39082881671939157, "learning_rate": 6.5473026675307394e-06, "loss": 0.0366, "step": 4131 }, { "epoch": 1.3798630823175824, "grad_norm": 0.35755906278361366, "learning_rate": 6.545454611170762e-06, "loss": 0.0288, "step": 4132 }, { "epoch": 1.3801970278844549, "grad_norm": 0.31410143934597123, "learning_rate": 6.543606321351668e-06, "loss": 0.0305, "step": 4133 }, { "epoch": 1.3805309734513274, "grad_norm": 0.3044608100050167, "learning_rate": 6.541757798352664e-06, "loss": 0.0279, "step": 4134 }, { "epoch": 1.3808649190182, "grad_norm": 0.3787462097567985, "learning_rate": 6.539909042452991e-06, "loss": 0.0314, "step": 4135 }, { "epoch": 1.3811988645850726, "grad_norm": 0.3948524816613283, "learning_rate": 6.538060053931925e-06, "loss": 0.0315, "step": 4136 }, { "epoch": 1.3815328101519453, "grad_norm": 0.2848048596769651, "learning_rate": 6.536210833068779e-06, "loss": 0.0286, "step": 4137 }, { "epoch": 1.3818667557188178, "grad_norm": 0.4213633188225437, "learning_rate": 6.534361380142896e-06, "loss": 0.0312, "step": 4138 }, { "epoch": 1.3822007012856905, "grad_norm": 0.8024795017243784, "learning_rate": 6.532511695433662e-06, "loss": 0.0266, "step": 4139 }, { "epoch": 1.382534646852563, "grad_norm": 0.2725350229821373, "learning_rate": 6.5306617792204915e-06, "loss": 0.0243, "step": 4140 }, { "epoch": 1.3828685924194355, "grad_norm": 0.24386319344336235, "learning_rate": 6.528811631782835e-06, "loss": 0.0194, "step": 4141 }, { "epoch": 1.3832025379863082, "grad_norm": 0.33063096325815156, "learning_rate": 6.526961253400181e-06, "loss": 0.0253, "step": 4142 }, { "epoch": 1.383536483553181, "grad_norm": 0.49910265592710185, "learning_rate": 6.525110644352052e-06, "loss": 0.0318, "step": 4143 }, { "epoch": 1.3838704291200534, "grad_norm": 0.2835546392846156, "learning_rate": 6.523259804918001e-06, "loss": 0.0225, "step": 4144 }, { "epoch": 1.384204374686926, "grad_norm": 0.29621563311109494, "learning_rate": 6.52140873537762e-06, "loss": 0.0233, "step": 4145 }, { "epoch": 1.3845383202537986, "grad_norm": 0.40928485304465473, "learning_rate": 6.519557436010535e-06, "loss": 0.0285, "step": 4146 }, { "epoch": 1.3848722658206714, "grad_norm": 0.4048074209668024, "learning_rate": 6.51770590709641e-06, "loss": 0.0316, "step": 4147 }, { "epoch": 1.3852062113875439, "grad_norm": 0.31960094426853036, "learning_rate": 6.515854148914935e-06, "loss": 0.0274, "step": 4148 }, { "epoch": 1.3855401569544163, "grad_norm": 0.38366615291065215, "learning_rate": 6.514002161745844e-06, "loss": 0.0357, "step": 4149 }, { "epoch": 1.385874102521289, "grad_norm": 0.46633156556378175, "learning_rate": 6.512149945868898e-06, "loss": 0.0389, "step": 4150 }, { "epoch": 1.3862080480881616, "grad_norm": 0.46532757925687274, "learning_rate": 6.510297501563899e-06, "loss": 0.0404, "step": 4151 }, { "epoch": 1.3865419936550343, "grad_norm": 0.4292523328715074, "learning_rate": 6.5084448291106785e-06, "loss": 0.0324, "step": 4152 }, { "epoch": 1.3868759392219068, "grad_norm": 0.24459452855524624, "learning_rate": 6.506591928789105e-06, "loss": 0.0241, "step": 4153 }, { "epoch": 1.3872098847887795, "grad_norm": 0.3004738237091303, "learning_rate": 6.504738800879081e-06, "loss": 0.0237, "step": 4154 }, { "epoch": 1.387543830355652, "grad_norm": 0.22475864428390777, "learning_rate": 6.502885445660544e-06, "loss": 0.0227, "step": 4155 }, { "epoch": 1.3878777759225247, "grad_norm": 0.2255535025880028, "learning_rate": 6.501031863413464e-06, "loss": 0.0166, "step": 4156 }, { "epoch": 1.3882117214893972, "grad_norm": 0.2839987973562058, "learning_rate": 6.499178054417847e-06, "loss": 0.0249, "step": 4157 }, { "epoch": 1.38854566705627, "grad_norm": 0.37338379091010143, "learning_rate": 6.497324018953732e-06, "loss": 0.0376, "step": 4158 }, { "epoch": 1.3888796126231424, "grad_norm": 0.3242868836132737, "learning_rate": 6.495469757301196e-06, "loss": 0.0252, "step": 4159 }, { "epoch": 1.389213558190015, "grad_norm": 0.3772569942795662, "learning_rate": 6.493615269740343e-06, "loss": 0.0459, "step": 4160 }, { "epoch": 1.3895475037568876, "grad_norm": 0.3238315246329841, "learning_rate": 6.491760556551315e-06, "loss": 0.0263, "step": 4161 }, { "epoch": 1.3898814493237603, "grad_norm": 0.2858225011114631, "learning_rate": 6.489905618014293e-06, "loss": 0.0281, "step": 4162 }, { "epoch": 1.3902153948906328, "grad_norm": 0.28175386943143504, "learning_rate": 6.488050454409483e-06, "loss": 0.021, "step": 4163 }, { "epoch": 1.3905493404575053, "grad_norm": 0.3189900098389574, "learning_rate": 6.486195066017129e-06, "loss": 0.0303, "step": 4164 }, { "epoch": 1.390883286024378, "grad_norm": 0.3336994920742012, "learning_rate": 6.484339453117514e-06, "loss": 0.0288, "step": 4165 }, { "epoch": 1.3912172315912505, "grad_norm": 0.2697524405797859, "learning_rate": 6.482483615990945e-06, "loss": 0.0205, "step": 4166 }, { "epoch": 1.3915511771581233, "grad_norm": 0.2844755122937119, "learning_rate": 6.480627554917771e-06, "loss": 0.0245, "step": 4167 }, { "epoch": 1.3918851227249958, "grad_norm": 0.2713321170353821, "learning_rate": 6.47877127017837e-06, "loss": 0.0185, "step": 4168 }, { "epoch": 1.3922190682918685, "grad_norm": 0.5237458575178805, "learning_rate": 6.476914762053158e-06, "loss": 0.0482, "step": 4169 }, { "epoch": 1.392553013858741, "grad_norm": 0.36295832297667063, "learning_rate": 6.47505803082258e-06, "loss": 0.0313, "step": 4170 }, { "epoch": 1.3928869594256137, "grad_norm": 0.3764443012879255, "learning_rate": 6.473201076767119e-06, "loss": 0.036, "step": 4171 }, { "epoch": 1.3932209049924862, "grad_norm": 0.2582316419690857, "learning_rate": 6.471343900167289e-06, "loss": 0.0255, "step": 4172 }, { "epoch": 1.393554850559359, "grad_norm": 0.33000479078040534, "learning_rate": 6.469486501303639e-06, "loss": 0.0392, "step": 4173 }, { "epoch": 1.3938887961262314, "grad_norm": 0.2500769154013191, "learning_rate": 6.467628880456749e-06, "loss": 0.0221, "step": 4174 }, { "epoch": 1.394222741693104, "grad_norm": 0.4008734050914926, "learning_rate": 6.465771037907236e-06, "loss": 0.0483, "step": 4175 }, { "epoch": 1.3945566872599766, "grad_norm": 0.20843075979871026, "learning_rate": 6.463912973935749e-06, "loss": 0.0192, "step": 4176 }, { "epoch": 1.3948906328268493, "grad_norm": 0.39933295292421844, "learning_rate": 6.462054688822971e-06, "loss": 0.0326, "step": 4177 }, { "epoch": 1.3952245783937218, "grad_norm": 0.2359623271327014, "learning_rate": 6.460196182849616e-06, "loss": 0.02, "step": 4178 }, { "epoch": 1.3955585239605943, "grad_norm": 0.2774424476758121, "learning_rate": 6.458337456296434e-06, "loss": 0.0246, "step": 4179 }, { "epoch": 1.395892469527467, "grad_norm": 0.44255909800554805, "learning_rate": 6.456478509444209e-06, "loss": 0.025, "step": 4180 }, { "epoch": 1.3962264150943398, "grad_norm": 0.31440313277685955, "learning_rate": 6.454619342573756e-06, "loss": 0.0226, "step": 4181 }, { "epoch": 1.3965603606612123, "grad_norm": 0.32671755747136283, "learning_rate": 6.452759955965922e-06, "loss": 0.0305, "step": 4182 }, { "epoch": 1.3968943062280847, "grad_norm": 0.39497156158711616, "learning_rate": 6.450900349901592e-06, "loss": 0.028, "step": 4183 }, { "epoch": 1.3972282517949575, "grad_norm": 0.3028024031748205, "learning_rate": 6.449040524661681e-06, "loss": 0.0249, "step": 4184 }, { "epoch": 1.39756219736183, "grad_norm": 0.34207219241021913, "learning_rate": 6.447180480527135e-06, "loss": 0.027, "step": 4185 }, { "epoch": 1.3978961429287027, "grad_norm": 0.36238534784500936, "learning_rate": 6.445320217778939e-06, "loss": 0.0329, "step": 4186 }, { "epoch": 1.3982300884955752, "grad_norm": 0.665449263626439, "learning_rate": 6.443459736698106e-06, "loss": 0.0303, "step": 4187 }, { "epoch": 1.398564034062448, "grad_norm": 0.3853842127778719, "learning_rate": 6.4415990375656826e-06, "loss": 0.0318, "step": 4188 }, { "epoch": 1.3988979796293204, "grad_norm": 0.935871160664241, "learning_rate": 6.4397381206627505e-06, "loss": 0.0402, "step": 4189 }, { "epoch": 1.3992319251961929, "grad_norm": 0.3994369014425904, "learning_rate": 6.437876986270424e-06, "loss": 0.0226, "step": 4190 }, { "epoch": 1.3995658707630656, "grad_norm": 0.3582334612795359, "learning_rate": 6.436015634669848e-06, "loss": 0.0331, "step": 4191 }, { "epoch": 1.3998998163299383, "grad_norm": 0.26236389446839914, "learning_rate": 6.434154066142201e-06, "loss": 0.0211, "step": 4192 }, { "epoch": 1.4002337618968108, "grad_norm": 0.24871708051095404, "learning_rate": 6.432292280968695e-06, "loss": 0.0181, "step": 4193 }, { "epoch": 1.4005677074636833, "grad_norm": 0.4326489182291974, "learning_rate": 6.430430279430577e-06, "loss": 0.0258, "step": 4194 }, { "epoch": 1.400901653030556, "grad_norm": 0.25749714153032927, "learning_rate": 6.428568061809122e-06, "loss": 0.0249, "step": 4195 }, { "epoch": 1.4012355985974287, "grad_norm": 0.35931741482762375, "learning_rate": 6.426705628385641e-06, "loss": 0.0252, "step": 4196 }, { "epoch": 1.4015695441643012, "grad_norm": 0.5307331310360581, "learning_rate": 6.4248429794414745e-06, "loss": 0.0306, "step": 4197 }, { "epoch": 1.4019034897311737, "grad_norm": 0.38274339803651247, "learning_rate": 6.422980115258e-06, "loss": 0.0339, "step": 4198 }, { "epoch": 1.4022374352980465, "grad_norm": 0.3555093435984295, "learning_rate": 6.421117036116624e-06, "loss": 0.0236, "step": 4199 }, { "epoch": 1.402571380864919, "grad_norm": 0.2907826146941461, "learning_rate": 6.4192537422987864e-06, "loss": 0.0295, "step": 4200 }, { "epoch": 1.4029053264317917, "grad_norm": 0.30963497215325875, "learning_rate": 6.417390234085961e-06, "loss": 0.0277, "step": 4201 }, { "epoch": 1.4032392719986642, "grad_norm": 0.35136844329402045, "learning_rate": 6.415526511759649e-06, "loss": 0.0259, "step": 4202 }, { "epoch": 1.4035732175655369, "grad_norm": 0.2751243696032686, "learning_rate": 6.413662575601391e-06, "loss": 0.0365, "step": 4203 }, { "epoch": 1.4039071631324094, "grad_norm": 0.22159857145657047, "learning_rate": 6.4117984258927565e-06, "loss": 0.0232, "step": 4204 }, { "epoch": 1.404241108699282, "grad_norm": 0.29844503255762855, "learning_rate": 6.409934062915345e-06, "loss": 0.026, "step": 4205 }, { "epoch": 1.4045750542661546, "grad_norm": 0.3961504186942763, "learning_rate": 6.408069486950793e-06, "loss": 0.0481, "step": 4206 }, { "epoch": 1.4049089998330273, "grad_norm": 0.2538361477160226, "learning_rate": 6.406204698280766e-06, "loss": 0.023, "step": 4207 }, { "epoch": 1.4052429453998998, "grad_norm": 0.22444211327860183, "learning_rate": 6.40433969718696e-06, "loss": 0.0156, "step": 4208 }, { "epoch": 1.4055768909667723, "grad_norm": 0.44034457427431095, "learning_rate": 6.402474483951109e-06, "loss": 0.0284, "step": 4209 }, { "epoch": 1.405910836533645, "grad_norm": 0.21447942513244683, "learning_rate": 6.400609058854973e-06, "loss": 0.0198, "step": 4210 }, { "epoch": 1.4062447821005177, "grad_norm": 0.3768648995338918, "learning_rate": 6.398743422180346e-06, "loss": 0.0368, "step": 4211 }, { "epoch": 1.4065787276673902, "grad_norm": 0.24537398824322698, "learning_rate": 6.396877574209057e-06, "loss": 0.023, "step": 4212 }, { "epoch": 1.4069126732342627, "grad_norm": 0.31548463622046174, "learning_rate": 6.395011515222962e-06, "loss": 0.0222, "step": 4213 }, { "epoch": 1.4072466188011354, "grad_norm": 0.2911911195637709, "learning_rate": 6.393145245503951e-06, "loss": 0.0278, "step": 4214 }, { "epoch": 1.407580564368008, "grad_norm": 0.3424927468106257, "learning_rate": 6.391278765333948e-06, "loss": 0.0308, "step": 4215 }, { "epoch": 1.4079145099348807, "grad_norm": 0.24610239576806028, "learning_rate": 6.389412074994906e-06, "loss": 0.0202, "step": 4216 }, { "epoch": 1.4082484555017531, "grad_norm": 0.28614382765446894, "learning_rate": 6.387545174768809e-06, "loss": 0.034, "step": 4217 }, { "epoch": 1.4085824010686259, "grad_norm": 0.34679664912713826, "learning_rate": 6.385678064937677e-06, "loss": 0.0354, "step": 4218 }, { "epoch": 1.4089163466354984, "grad_norm": 0.28291344104145383, "learning_rate": 6.383810745783556e-06, "loss": 0.0239, "step": 4219 }, { "epoch": 1.409250292202371, "grad_norm": 0.3148389335644667, "learning_rate": 6.38194321758853e-06, "loss": 0.028, "step": 4220 }, { "epoch": 1.4095842377692436, "grad_norm": 0.24227708426055747, "learning_rate": 6.3800754806347065e-06, "loss": 0.0248, "step": 4221 }, { "epoch": 1.4099181833361163, "grad_norm": 0.2842259981650139, "learning_rate": 6.378207535204234e-06, "loss": 0.0254, "step": 4222 }, { "epoch": 1.4102521289029888, "grad_norm": 0.2360963689254675, "learning_rate": 6.376339381579285e-06, "loss": 0.0175, "step": 4223 }, { "epoch": 1.4105860744698613, "grad_norm": 0.25162957127237595, "learning_rate": 6.374471020042067e-06, "loss": 0.0201, "step": 4224 }, { "epoch": 1.410920020036734, "grad_norm": 0.40135490058030626, "learning_rate": 6.372602450874816e-06, "loss": 0.0209, "step": 4225 }, { "epoch": 1.4112539656036067, "grad_norm": 0.3223280172139753, "learning_rate": 6.370733674359803e-06, "loss": 0.0439, "step": 4226 }, { "epoch": 1.4115879111704792, "grad_norm": 0.4121077145420317, "learning_rate": 6.36886469077933e-06, "loss": 0.0552, "step": 4227 }, { "epoch": 1.4119218567373517, "grad_norm": 0.4886731791786846, "learning_rate": 6.366995500415727e-06, "loss": 0.0337, "step": 4228 }, { "epoch": 1.4122558023042244, "grad_norm": 0.26735245095573773, "learning_rate": 6.365126103551358e-06, "loss": 0.0198, "step": 4229 }, { "epoch": 1.4125897478710971, "grad_norm": 0.379505760027139, "learning_rate": 6.363256500468617e-06, "loss": 0.0379, "step": 4230 }, { "epoch": 1.4129236934379696, "grad_norm": 0.28808782349374135, "learning_rate": 6.3613866914499285e-06, "loss": 0.0299, "step": 4231 }, { "epoch": 1.4132576390048421, "grad_norm": 0.30287090451349447, "learning_rate": 6.359516676777751e-06, "loss": 0.0271, "step": 4232 }, { "epoch": 1.4135915845717149, "grad_norm": 0.3022571428909766, "learning_rate": 6.357646456734574e-06, "loss": 0.0249, "step": 4233 }, { "epoch": 1.4139255301385873, "grad_norm": 0.2859251587416214, "learning_rate": 6.3557760316029115e-06, "loss": 0.0307, "step": 4234 }, { "epoch": 1.41425947570546, "grad_norm": 0.2836402492400007, "learning_rate": 6.353905401665317e-06, "loss": 0.0193, "step": 4235 }, { "epoch": 1.4145934212723326, "grad_norm": 0.270069829815595, "learning_rate": 6.35203456720437e-06, "loss": 0.0244, "step": 4236 }, { "epoch": 1.4149273668392053, "grad_norm": 0.4785601138722959, "learning_rate": 6.35016352850268e-06, "loss": 0.0312, "step": 4237 }, { "epoch": 1.4152613124060778, "grad_norm": 0.2924169655476126, "learning_rate": 6.3482922858428915e-06, "loss": 0.0326, "step": 4238 }, { "epoch": 1.4155952579729503, "grad_norm": 0.3005980690764834, "learning_rate": 6.34642083950768e-06, "loss": 0.0235, "step": 4239 }, { "epoch": 1.415929203539823, "grad_norm": 0.28610543510377096, "learning_rate": 6.344549189779745e-06, "loss": 0.024, "step": 4240 }, { "epoch": 1.4162631491066957, "grad_norm": 0.23991623737660478, "learning_rate": 6.342677336941825e-06, "loss": 0.0252, "step": 4241 }, { "epoch": 1.4165970946735682, "grad_norm": 0.31781159720858426, "learning_rate": 6.340805281276683e-06, "loss": 0.032, "step": 4242 }, { "epoch": 1.4169310402404407, "grad_norm": 0.3675808070280987, "learning_rate": 6.338933023067114e-06, "loss": 0.0366, "step": 4243 }, { "epoch": 1.4172649858073134, "grad_norm": 0.2489197449042772, "learning_rate": 6.337060562595949e-06, "loss": 0.0208, "step": 4244 }, { "epoch": 1.4175989313741861, "grad_norm": 0.3387588987548785, "learning_rate": 6.3351879001460425e-06, "loss": 0.0271, "step": 4245 }, { "epoch": 1.4179328769410586, "grad_norm": 0.39569938256230014, "learning_rate": 6.333315036000281e-06, "loss": 0.0406, "step": 4246 }, { "epoch": 1.4182668225079311, "grad_norm": 0.40449303682465976, "learning_rate": 6.331441970441585e-06, "loss": 0.0311, "step": 4247 }, { "epoch": 1.4186007680748038, "grad_norm": 0.26889267903113867, "learning_rate": 6.329568703752902e-06, "loss": 0.0223, "step": 4248 }, { "epoch": 1.4189347136416763, "grad_norm": 0.24388086962817454, "learning_rate": 6.32769523621721e-06, "loss": 0.0224, "step": 4249 }, { "epoch": 1.419268659208549, "grad_norm": 0.24511992387520704, "learning_rate": 6.3258215681175215e-06, "loss": 0.0195, "step": 4250 }, { "epoch": 1.4196026047754216, "grad_norm": 0.26285398620511435, "learning_rate": 6.323947699736873e-06, "loss": 0.0218, "step": 4251 }, { "epoch": 1.4199365503422943, "grad_norm": 0.312074709076044, "learning_rate": 6.3220736313583345e-06, "loss": 0.0246, "step": 4252 }, { "epoch": 1.4202704959091668, "grad_norm": 0.34952999657564465, "learning_rate": 6.320199363265008e-06, "loss": 0.0339, "step": 4253 }, { "epoch": 1.4206044414760395, "grad_norm": 0.28860880207619427, "learning_rate": 6.318324895740023e-06, "loss": 0.0233, "step": 4254 }, { "epoch": 1.420938387042912, "grad_norm": 0.29733049298495257, "learning_rate": 6.31645022906654e-06, "loss": 0.0323, "step": 4255 }, { "epoch": 1.4212723326097847, "grad_norm": 0.31805717441672404, "learning_rate": 6.314575363527748e-06, "loss": 0.0268, "step": 4256 }, { "epoch": 1.4216062781766572, "grad_norm": 0.286972328699409, "learning_rate": 6.312700299406871e-06, "loss": 0.0287, "step": 4257 }, { "epoch": 1.4219402237435297, "grad_norm": 0.2532610207953052, "learning_rate": 6.310825036987154e-06, "loss": 0.0223, "step": 4258 }, { "epoch": 1.4222741693104024, "grad_norm": 0.35252950002232053, "learning_rate": 6.308949576551884e-06, "loss": 0.0317, "step": 4259 }, { "epoch": 1.4226081148772751, "grad_norm": 0.3862716954803643, "learning_rate": 6.3070739183843655e-06, "loss": 0.0281, "step": 4260 }, { "epoch": 1.4229420604441476, "grad_norm": 0.3171157760416562, "learning_rate": 6.305198062767942e-06, "loss": 0.0315, "step": 4261 }, { "epoch": 1.4232760060110201, "grad_norm": 0.4086440283230938, "learning_rate": 6.303322009985984e-06, "loss": 0.0313, "step": 4262 }, { "epoch": 1.4236099515778928, "grad_norm": 0.29594655062853514, "learning_rate": 6.301445760321889e-06, "loss": 0.0216, "step": 4263 }, { "epoch": 1.4239438971447653, "grad_norm": 0.2193500553098334, "learning_rate": 6.299569314059088e-06, "loss": 0.0193, "step": 4264 }, { "epoch": 1.424277842711638, "grad_norm": 0.25979567694041017, "learning_rate": 6.297692671481042e-06, "loss": 0.0247, "step": 4265 }, { "epoch": 1.4246117882785105, "grad_norm": 0.5571964084684246, "learning_rate": 6.295815832871235e-06, "loss": 0.0257, "step": 4266 }, { "epoch": 1.4249457338453833, "grad_norm": 0.27824904953702756, "learning_rate": 6.2939387985131905e-06, "loss": 0.0261, "step": 4267 }, { "epoch": 1.4252796794122558, "grad_norm": 0.43944047402045616, "learning_rate": 6.292061568690455e-06, "loss": 0.0242, "step": 4268 }, { "epoch": 1.4256136249791285, "grad_norm": 0.34363125953944945, "learning_rate": 6.290184143686606e-06, "loss": 0.0319, "step": 4269 }, { "epoch": 1.425947570546001, "grad_norm": 0.367581617819465, "learning_rate": 6.288306523785252e-06, "loss": 0.0352, "step": 4270 }, { "epoch": 1.4262815161128737, "grad_norm": 0.2806152073022378, "learning_rate": 6.286428709270026e-06, "loss": 0.0247, "step": 4271 }, { "epoch": 1.4266154616797462, "grad_norm": 0.292103212824337, "learning_rate": 6.284550700424597e-06, "loss": 0.0223, "step": 4272 }, { "epoch": 1.4269494072466187, "grad_norm": 0.2594443901105272, "learning_rate": 6.282672497532659e-06, "loss": 0.0234, "step": 4273 }, { "epoch": 1.4272833528134914, "grad_norm": 0.21679403571505187, "learning_rate": 6.280794100877938e-06, "loss": 0.0201, "step": 4274 }, { "epoch": 1.427617298380364, "grad_norm": 0.3305930506341123, "learning_rate": 6.278915510744187e-06, "loss": 0.0199, "step": 4275 }, { "epoch": 1.4279512439472366, "grad_norm": 0.36608441028261, "learning_rate": 6.277036727415189e-06, "loss": 0.0294, "step": 4276 }, { "epoch": 1.428285189514109, "grad_norm": 0.3227092739262827, "learning_rate": 6.2751577511747575e-06, "loss": 0.0285, "step": 4277 }, { "epoch": 1.4286191350809818, "grad_norm": 0.2798813553302426, "learning_rate": 6.273278582306732e-06, "loss": 0.024, "step": 4278 }, { "epoch": 1.4289530806478545, "grad_norm": 0.35298879503799635, "learning_rate": 6.271399221094986e-06, "loss": 0.0222, "step": 4279 }, { "epoch": 1.429287026214727, "grad_norm": 0.31617957096070615, "learning_rate": 6.269519667823416e-06, "loss": 0.0328, "step": 4280 }, { "epoch": 1.4296209717815995, "grad_norm": 0.2902821305467271, "learning_rate": 6.267639922775952e-06, "loss": 0.0192, "step": 4281 }, { "epoch": 1.4299549173484722, "grad_norm": 0.25669861326313853, "learning_rate": 6.265759986236552e-06, "loss": 0.0195, "step": 4282 }, { "epoch": 1.4302888629153447, "grad_norm": 0.29127756161554613, "learning_rate": 6.263879858489204e-06, "loss": 0.024, "step": 4283 }, { "epoch": 1.4306228084822175, "grad_norm": 0.3462771143572902, "learning_rate": 6.261999539817919e-06, "loss": 0.0251, "step": 4284 }, { "epoch": 1.43095675404909, "grad_norm": 0.2794947820600223, "learning_rate": 6.260119030506746e-06, "loss": 0.0201, "step": 4285 }, { "epoch": 1.4312906996159627, "grad_norm": 0.27555656343229007, "learning_rate": 6.258238330839754e-06, "loss": 0.0206, "step": 4286 }, { "epoch": 1.4316246451828352, "grad_norm": 0.35812082031648257, "learning_rate": 6.2563574411010485e-06, "loss": 0.0307, "step": 4287 }, { "epoch": 1.4319585907497077, "grad_norm": 0.35617509376580264, "learning_rate": 6.254476361574757e-06, "loss": 0.0244, "step": 4288 }, { "epoch": 1.4322925363165804, "grad_norm": 0.3051016083373015, "learning_rate": 6.252595092545042e-06, "loss": 0.0213, "step": 4289 }, { "epoch": 1.432626481883453, "grad_norm": 0.2538639212635603, "learning_rate": 6.250713634296087e-06, "loss": 0.0202, "step": 4290 }, { "epoch": 1.4329604274503256, "grad_norm": 0.3331413922065278, "learning_rate": 6.248831987112113e-06, "loss": 0.0291, "step": 4291 }, { "epoch": 1.433294373017198, "grad_norm": 0.22297512890626758, "learning_rate": 6.246950151277362e-06, "loss": 0.0197, "step": 4292 }, { "epoch": 1.4336283185840708, "grad_norm": 0.31374971852841, "learning_rate": 6.245068127076109e-06, "loss": 0.0261, "step": 4293 }, { "epoch": 1.4339622641509435, "grad_norm": 0.2511216012849922, "learning_rate": 6.243185914792655e-06, "loss": 0.0203, "step": 4294 }, { "epoch": 1.434296209717816, "grad_norm": 0.6807560465015144, "learning_rate": 6.2413035147113295e-06, "loss": 0.0395, "step": 4295 }, { "epoch": 1.4346301552846885, "grad_norm": 0.4305937530593603, "learning_rate": 6.239420927116493e-06, "loss": 0.0335, "step": 4296 }, { "epoch": 1.4349641008515612, "grad_norm": 0.23599825462107743, "learning_rate": 6.2375381522925325e-06, "loss": 0.0244, "step": 4297 }, { "epoch": 1.4352980464184337, "grad_norm": 0.31721034326424485, "learning_rate": 6.235655190523862e-06, "loss": 0.0243, "step": 4298 }, { "epoch": 1.4356319919853064, "grad_norm": 0.2833244629213553, "learning_rate": 6.233772042094924e-06, "loss": 0.0254, "step": 4299 }, { "epoch": 1.435965937552179, "grad_norm": 0.30297019801640057, "learning_rate": 6.231888707290194e-06, "loss": 0.0254, "step": 4300 }, { "epoch": 1.4362998831190517, "grad_norm": 0.3761343437617929, "learning_rate": 6.230005186394169e-06, "loss": 0.0197, "step": 4301 }, { "epoch": 1.4366338286859242, "grad_norm": 0.20260845010605857, "learning_rate": 6.228121479691377e-06, "loss": 0.0176, "step": 4302 }, { "epoch": 1.4369677742527969, "grad_norm": 0.2953658817660671, "learning_rate": 6.226237587466375e-06, "loss": 0.0299, "step": 4303 }, { "epoch": 1.4373017198196694, "grad_norm": 0.30324716864151124, "learning_rate": 6.224353510003747e-06, "loss": 0.0331, "step": 4304 }, { "epoch": 1.437635665386542, "grad_norm": 0.6184108877720337, "learning_rate": 6.222469247588105e-06, "loss": 0.0343, "step": 4305 }, { "epoch": 1.4379696109534146, "grad_norm": 0.2735142999233015, "learning_rate": 6.220584800504091e-06, "loss": 0.0269, "step": 4306 }, { "epoch": 1.438303556520287, "grad_norm": 0.2607299495322127, "learning_rate": 6.218700169036368e-06, "loss": 0.0265, "step": 4307 }, { "epoch": 1.4386375020871598, "grad_norm": 0.2656994956847387, "learning_rate": 6.216815353469636e-06, "loss": 0.0244, "step": 4308 }, { "epoch": 1.4389714476540325, "grad_norm": 0.19914033911661247, "learning_rate": 6.214930354088618e-06, "loss": 0.019, "step": 4309 }, { "epoch": 1.439305393220905, "grad_norm": 0.29142320482694123, "learning_rate": 6.213045171178063e-06, "loss": 0.0238, "step": 4310 }, { "epoch": 1.4396393387877775, "grad_norm": 0.21785132569368448, "learning_rate": 6.2111598050227535e-06, "loss": 0.0184, "step": 4311 }, { "epoch": 1.4399732843546502, "grad_norm": 0.3170393956576559, "learning_rate": 6.209274255907494e-06, "loss": 0.0284, "step": 4312 }, { "epoch": 1.4403072299215227, "grad_norm": 0.2704015210477321, "learning_rate": 6.207388524117119e-06, "loss": 0.029, "step": 4313 }, { "epoch": 1.4406411754883954, "grad_norm": 0.33824713125453343, "learning_rate": 6.205502609936491e-06, "loss": 0.0247, "step": 4314 }, { "epoch": 1.440975121055268, "grad_norm": 0.26962689220866193, "learning_rate": 6.2036165136505e-06, "loss": 0.0345, "step": 4315 }, { "epoch": 1.4413090666221406, "grad_norm": 0.27196873050146303, "learning_rate": 6.201730235544062e-06, "loss": 0.0267, "step": 4316 }, { "epoch": 1.4416430121890131, "grad_norm": 0.2806915451900328, "learning_rate": 6.1998437759021235e-06, "loss": 0.0215, "step": 4317 }, { "epoch": 1.4419769577558859, "grad_norm": 0.4189514378039057, "learning_rate": 6.197957135009653e-06, "loss": 0.0321, "step": 4318 }, { "epoch": 1.4423109033227584, "grad_norm": 0.3651276750301456, "learning_rate": 6.196070313151652e-06, "loss": 0.035, "step": 4319 }, { "epoch": 1.442644848889631, "grad_norm": 0.3155492201587994, "learning_rate": 6.194183310613147e-06, "loss": 0.0323, "step": 4320 }, { "epoch": 1.4429787944565036, "grad_norm": 0.2013473872887474, "learning_rate": 6.1922961276791925e-06, "loss": 0.014, "step": 4321 }, { "epoch": 1.443312740023376, "grad_norm": 0.3723058797287197, "learning_rate": 6.190408764634869e-06, "loss": 0.0231, "step": 4322 }, { "epoch": 1.4436466855902488, "grad_norm": 0.39725513943297047, "learning_rate": 6.188521221765285e-06, "loss": 0.0317, "step": 4323 }, { "epoch": 1.4439806311571215, "grad_norm": 0.2618057425801992, "learning_rate": 6.186633499355576e-06, "loss": 0.0182, "step": 4324 }, { "epoch": 1.444314576723994, "grad_norm": 0.26419649231389597, "learning_rate": 6.184745597690903e-06, "loss": 0.0221, "step": 4325 }, { "epoch": 1.4446485222908665, "grad_norm": 0.2752264239920511, "learning_rate": 6.1828575170564595e-06, "loss": 0.0229, "step": 4326 }, { "epoch": 1.4449824678577392, "grad_norm": 0.366699170884167, "learning_rate": 6.18096925773746e-06, "loss": 0.0289, "step": 4327 }, { "epoch": 1.445316413424612, "grad_norm": 0.22060280111458522, "learning_rate": 6.179080820019147e-06, "loss": 0.0179, "step": 4328 }, { "epoch": 1.4456503589914844, "grad_norm": 0.30863112948765364, "learning_rate": 6.177192204186796e-06, "loss": 0.0268, "step": 4329 }, { "epoch": 1.445984304558357, "grad_norm": 0.2955937241533402, "learning_rate": 6.1753034105257e-06, "loss": 0.0265, "step": 4330 }, { "epoch": 1.4463182501252296, "grad_norm": 0.2782935487389836, "learning_rate": 6.173414439321185e-06, "loss": 0.0284, "step": 4331 }, { "epoch": 1.4466521956921021, "grad_norm": 0.3681146222875431, "learning_rate": 6.171525290858602e-06, "loss": 0.0282, "step": 4332 }, { "epoch": 1.4469861412589748, "grad_norm": 0.3455793107526349, "learning_rate": 6.169635965423331e-06, "loss": 0.0314, "step": 4333 }, { "epoch": 1.4473200868258473, "grad_norm": 0.29459532408232986, "learning_rate": 6.167746463300774e-06, "loss": 0.0252, "step": 4334 }, { "epoch": 1.44765403239272, "grad_norm": 0.3417564847463384, "learning_rate": 6.1658567847763655e-06, "loss": 0.026, "step": 4335 }, { "epoch": 1.4479879779595926, "grad_norm": 0.32711670984725155, "learning_rate": 6.163966930135561e-06, "loss": 0.0286, "step": 4336 }, { "epoch": 1.448321923526465, "grad_norm": 0.4142744064136188, "learning_rate": 6.162076899663846e-06, "loss": 0.0352, "step": 4337 }, { "epoch": 1.4486558690933378, "grad_norm": 0.3051924164507242, "learning_rate": 6.160186693646732e-06, "loss": 0.0257, "step": 4338 }, { "epoch": 1.4489898146602105, "grad_norm": 0.831081294436607, "learning_rate": 6.158296312369759e-06, "loss": 0.0355, "step": 4339 }, { "epoch": 1.449323760227083, "grad_norm": 0.35720312499269324, "learning_rate": 6.156405756118489e-06, "loss": 0.0349, "step": 4340 }, { "epoch": 1.4496577057939555, "grad_norm": 0.40600814896802045, "learning_rate": 6.154515025178511e-06, "loss": 0.0354, "step": 4341 }, { "epoch": 1.4499916513608282, "grad_norm": 0.3844705951190644, "learning_rate": 6.152624119835447e-06, "loss": 0.034, "step": 4342 }, { "epoch": 1.450325596927701, "grad_norm": 0.4232775372248357, "learning_rate": 6.150733040374937e-06, "loss": 0.0365, "step": 4343 }, { "epoch": 1.4506595424945734, "grad_norm": 0.25552025928843425, "learning_rate": 6.148841787082653e-06, "loss": 0.0231, "step": 4344 }, { "epoch": 1.450993488061446, "grad_norm": 0.3526139504264799, "learning_rate": 6.146950360244288e-06, "loss": 0.028, "step": 4345 }, { "epoch": 1.4513274336283186, "grad_norm": 0.6088722502298349, "learning_rate": 6.145058760145568e-06, "loss": 0.0499, "step": 4346 }, { "epoch": 1.4516613791951911, "grad_norm": 0.25052877688381175, "learning_rate": 6.14316698707224e-06, "loss": 0.0218, "step": 4347 }, { "epoch": 1.4519953247620638, "grad_norm": 0.3183343144553221, "learning_rate": 6.1412750413100754e-06, "loss": 0.0244, "step": 4348 }, { "epoch": 1.4523292703289363, "grad_norm": 0.2834692629233744, "learning_rate": 6.13938292314488e-06, "loss": 0.0274, "step": 4349 }, { "epoch": 1.452663215895809, "grad_norm": 0.49889728547952156, "learning_rate": 6.137490632862479e-06, "loss": 0.027, "step": 4350 }, { "epoch": 1.4529971614626815, "grad_norm": 0.32268338807206937, "learning_rate": 6.135598170748721e-06, "loss": 0.025, "step": 4351 }, { "epoch": 1.4533311070295543, "grad_norm": 0.32154313051083633, "learning_rate": 6.13370553708949e-06, "loss": 0.0262, "step": 4352 }, { "epoch": 1.4536650525964268, "grad_norm": 0.4029298036487729, "learning_rate": 6.13181273217069e-06, "loss": 0.0449, "step": 4353 }, { "epoch": 1.4539989981632995, "grad_norm": 0.3048666264152515, "learning_rate": 6.129919756278248e-06, "loss": 0.0209, "step": 4354 }, { "epoch": 1.454332943730172, "grad_norm": 0.3170989508504314, "learning_rate": 6.128026609698124e-06, "loss": 0.0252, "step": 4355 }, { "epoch": 1.4546668892970445, "grad_norm": 0.288892382898713, "learning_rate": 6.126133292716297e-06, "loss": 0.0312, "step": 4356 }, { "epoch": 1.4550008348639172, "grad_norm": 0.24778110269529585, "learning_rate": 6.124239805618778e-06, "loss": 0.0273, "step": 4357 }, { "epoch": 1.45533478043079, "grad_norm": 0.2340694718381469, "learning_rate": 6.122346148691598e-06, "loss": 0.013, "step": 4358 }, { "epoch": 1.4556687259976624, "grad_norm": 0.27797544140500174, "learning_rate": 6.120452322220818e-06, "loss": 0.0217, "step": 4359 }, { "epoch": 1.456002671564535, "grad_norm": 0.5159385172970974, "learning_rate": 6.11855832649252e-06, "loss": 0.0392, "step": 4360 }, { "epoch": 1.4563366171314076, "grad_norm": 0.26066668128539205, "learning_rate": 6.116664161792817e-06, "loss": 0.0206, "step": 4361 }, { "epoch": 1.45667056269828, "grad_norm": 0.2582490857089062, "learning_rate": 6.114769828407845e-06, "loss": 0.0186, "step": 4362 }, { "epoch": 1.4570045082651528, "grad_norm": 0.27525466498956835, "learning_rate": 6.112875326623763e-06, "loss": 0.0243, "step": 4363 }, { "epoch": 1.4573384538320253, "grad_norm": 0.25254017267673917, "learning_rate": 6.110980656726759e-06, "loss": 0.0209, "step": 4364 }, { "epoch": 1.457672399398898, "grad_norm": 0.2421009015651126, "learning_rate": 6.109085819003048e-06, "loss": 0.0234, "step": 4365 }, { "epoch": 1.4580063449657705, "grad_norm": 0.39741005395893025, "learning_rate": 6.107190813738864e-06, "loss": 0.0439, "step": 4366 }, { "epoch": 1.4583402905326432, "grad_norm": 0.265471907374353, "learning_rate": 6.10529564122047e-06, "loss": 0.0219, "step": 4367 }, { "epoch": 1.4586742360995157, "grad_norm": 0.35149394439823667, "learning_rate": 6.103400301734155e-06, "loss": 0.0262, "step": 4368 }, { "epoch": 1.4590081816663885, "grad_norm": 0.28547971804341876, "learning_rate": 6.101504795566232e-06, "loss": 0.0246, "step": 4369 }, { "epoch": 1.459342127233261, "grad_norm": 0.44836156586191384, "learning_rate": 6.099609123003041e-06, "loss": 0.0294, "step": 4370 }, { "epoch": 1.4596760728001335, "grad_norm": 0.2108431768776645, "learning_rate": 6.097713284330944e-06, "loss": 0.0146, "step": 4371 }, { "epoch": 1.4600100183670062, "grad_norm": 0.28812177099641406, "learning_rate": 6.095817279836329e-06, "loss": 0.0324, "step": 4372 }, { "epoch": 1.4603439639338789, "grad_norm": 0.26926791362816294, "learning_rate": 6.093921109805612e-06, "loss": 0.0192, "step": 4373 }, { "epoch": 1.4606779095007514, "grad_norm": 0.279596395422423, "learning_rate": 6.092024774525231e-06, "loss": 0.0245, "step": 4374 }, { "epoch": 1.4610118550676239, "grad_norm": 0.29637531877517337, "learning_rate": 6.090128274281649e-06, "loss": 0.0227, "step": 4375 }, { "epoch": 1.4613458006344966, "grad_norm": 0.27206098990143734, "learning_rate": 6.0882316093613555e-06, "loss": 0.027, "step": 4376 }, { "epoch": 1.4616797462013693, "grad_norm": 0.43365534951385276, "learning_rate": 6.086334780050865e-06, "loss": 0.0538, "step": 4377 }, { "epoch": 1.4620136917682418, "grad_norm": 0.26104853212611145, "learning_rate": 6.084437786636713e-06, "loss": 0.0206, "step": 4378 }, { "epoch": 1.4623476373351143, "grad_norm": 0.28464550950118167, "learning_rate": 6.082540629405467e-06, "loss": 0.0257, "step": 4379 }, { "epoch": 1.462681582901987, "grad_norm": 0.27122050896414035, "learning_rate": 6.08064330864371e-06, "loss": 0.0226, "step": 4380 }, { "epoch": 1.4630155284688595, "grad_norm": 0.4038213204342197, "learning_rate": 6.078745824638058e-06, "loss": 0.0395, "step": 4381 }, { "epoch": 1.4633494740357322, "grad_norm": 0.2634999359783141, "learning_rate": 6.076848177675148e-06, "loss": 0.0215, "step": 4382 }, { "epoch": 1.4636834196026047, "grad_norm": 0.42833389864849086, "learning_rate": 6.07495036804164e-06, "loss": 0.0404, "step": 4383 }, { "epoch": 1.4640173651694774, "grad_norm": 0.46512879630504517, "learning_rate": 6.073052396024222e-06, "loss": 0.0284, "step": 4384 }, { "epoch": 1.46435131073635, "grad_norm": 0.3623048973487272, "learning_rate": 6.071154261909605e-06, "loss": 0.0321, "step": 4385 }, { "epoch": 1.4646852563032224, "grad_norm": 0.32199116452410337, "learning_rate": 6.069255965984524e-06, "loss": 0.0277, "step": 4386 }, { "epoch": 1.4650192018700952, "grad_norm": 0.2653636189061635, "learning_rate": 6.067357508535741e-06, "loss": 0.0213, "step": 4387 }, { "epoch": 1.4653531474369679, "grad_norm": 0.31178905599193835, "learning_rate": 6.065458889850037e-06, "loss": 0.0379, "step": 4388 }, { "epoch": 1.4656870930038404, "grad_norm": 0.33518735458407195, "learning_rate": 6.063560110214224e-06, "loss": 0.0299, "step": 4389 }, { "epoch": 1.4660210385707129, "grad_norm": 0.26262022718549094, "learning_rate": 6.061661169915132e-06, "loss": 0.0235, "step": 4390 }, { "epoch": 1.4663549841375856, "grad_norm": 0.3173487761294358, "learning_rate": 6.05976206923962e-06, "loss": 0.0331, "step": 4391 }, { "epoch": 1.4666889297044583, "grad_norm": 0.25361114572533433, "learning_rate": 6.057862808474569e-06, "loss": 0.0256, "step": 4392 }, { "epoch": 1.4670228752713308, "grad_norm": 0.44289787751970144, "learning_rate": 6.055963387906884e-06, "loss": 0.0326, "step": 4393 }, { "epoch": 1.4673568208382033, "grad_norm": 0.23277029812391556, "learning_rate": 6.054063807823497e-06, "loss": 0.0204, "step": 4394 }, { "epoch": 1.467690766405076, "grad_norm": 0.30250490534314056, "learning_rate": 6.052164068511359e-06, "loss": 0.0272, "step": 4395 }, { "epoch": 1.4680247119719485, "grad_norm": 0.36103894659828506, "learning_rate": 6.05026417025745e-06, "loss": 0.0445, "step": 4396 }, { "epoch": 1.4683586575388212, "grad_norm": 0.3890491899135918, "learning_rate": 6.0483641133487736e-06, "loss": 0.0349, "step": 4397 }, { "epoch": 1.4686926031056937, "grad_norm": 0.41602537627815867, "learning_rate": 6.046463898072351e-06, "loss": 0.0252, "step": 4398 }, { "epoch": 1.4690265486725664, "grad_norm": 0.6512734708596248, "learning_rate": 6.044563524715237e-06, "loss": 0.0358, "step": 4399 }, { "epoch": 1.469360494239439, "grad_norm": 0.27057183116247335, "learning_rate": 6.042662993564503e-06, "loss": 0.0236, "step": 4400 }, { "epoch": 1.4696944398063116, "grad_norm": 0.2517911938451049, "learning_rate": 6.040762304907246e-06, "loss": 0.0214, "step": 4401 }, { "epoch": 1.4700283853731841, "grad_norm": 0.30410284603534155, "learning_rate": 6.038861459030588e-06, "loss": 0.0231, "step": 4402 }, { "epoch": 1.4703623309400569, "grad_norm": 0.25305959236280945, "learning_rate": 6.036960456221677e-06, "loss": 0.0228, "step": 4403 }, { "epoch": 1.4706962765069294, "grad_norm": 0.35791353127906433, "learning_rate": 6.035059296767676e-06, "loss": 0.0259, "step": 4404 }, { "epoch": 1.4710302220738019, "grad_norm": 0.36887185957430985, "learning_rate": 6.033157980955782e-06, "loss": 0.0261, "step": 4405 }, { "epoch": 1.4713641676406746, "grad_norm": 0.26456754460765436, "learning_rate": 6.0312565090732115e-06, "loss": 0.0219, "step": 4406 }, { "epoch": 1.4716981132075473, "grad_norm": 0.26907973736125956, "learning_rate": 6.0293548814072004e-06, "loss": 0.026, "step": 4407 }, { "epoch": 1.4720320587744198, "grad_norm": 0.28627901760653973, "learning_rate": 6.0274530982450155e-06, "loss": 0.0264, "step": 4408 }, { "epoch": 1.4723660043412923, "grad_norm": 0.37333216269323416, "learning_rate": 6.025551159873941e-06, "loss": 0.0307, "step": 4409 }, { "epoch": 1.472699949908165, "grad_norm": 0.3661677265346944, "learning_rate": 6.023649066581288e-06, "loss": 0.0324, "step": 4410 }, { "epoch": 1.4730338954750375, "grad_norm": 0.3341440260192572, "learning_rate": 6.021746818654393e-06, "loss": 0.0268, "step": 4411 }, { "epoch": 1.4733678410419102, "grad_norm": 0.31758251491581163, "learning_rate": 6.019844416380609e-06, "loss": 0.0352, "step": 4412 }, { "epoch": 1.4737017866087827, "grad_norm": 0.29246718704932134, "learning_rate": 6.017941860047318e-06, "loss": 0.0196, "step": 4413 }, { "epoch": 1.4740357321756554, "grad_norm": 0.38209107163761036, "learning_rate": 6.016039149941924e-06, "loss": 0.0241, "step": 4414 }, { "epoch": 1.474369677742528, "grad_norm": 0.28155226219443474, "learning_rate": 6.01413628635185e-06, "loss": 0.0267, "step": 4415 }, { "epoch": 1.4747036233094006, "grad_norm": 0.29575840316153873, "learning_rate": 6.012233269564551e-06, "loss": 0.0265, "step": 4416 }, { "epoch": 1.4750375688762731, "grad_norm": 0.2775488169569573, "learning_rate": 6.010330099867497e-06, "loss": 0.021, "step": 4417 }, { "epoch": 1.4753715144431458, "grad_norm": 0.3206485863371625, "learning_rate": 6.008426777548186e-06, "loss": 0.028, "step": 4418 }, { "epoch": 1.4757054600100183, "grad_norm": 0.31890744967601664, "learning_rate": 6.0065233028941365e-06, "loss": 0.0236, "step": 4419 }, { "epoch": 1.4760394055768908, "grad_norm": 0.23284350322239286, "learning_rate": 6.00461967619289e-06, "loss": 0.0249, "step": 4420 }, { "epoch": 1.4763733511437636, "grad_norm": 0.24372449701774504, "learning_rate": 6.002715897732013e-06, "loss": 0.0173, "step": 4421 }, { "epoch": 1.4767072967106363, "grad_norm": 0.3406960607347691, "learning_rate": 6.000811967799092e-06, "loss": 0.0267, "step": 4422 }, { "epoch": 1.4770412422775088, "grad_norm": 0.31046737270932906, "learning_rate": 5.99890788668174e-06, "loss": 0.0253, "step": 4423 }, { "epoch": 1.4773751878443813, "grad_norm": 0.263792987727194, "learning_rate": 5.997003654667589e-06, "loss": 0.0231, "step": 4424 }, { "epoch": 1.477709133411254, "grad_norm": 0.2736562632219921, "learning_rate": 5.995099272044298e-06, "loss": 0.0239, "step": 4425 }, { "epoch": 1.4780430789781267, "grad_norm": 0.39782948402809115, "learning_rate": 5.9931947390995435e-06, "loss": 0.0314, "step": 4426 }, { "epoch": 1.4783770245449992, "grad_norm": 0.30590420348196146, "learning_rate": 5.99129005612103e-06, "loss": 0.0329, "step": 4427 }, { "epoch": 1.4787109701118717, "grad_norm": 0.275694254754224, "learning_rate": 5.989385223396482e-06, "loss": 0.0231, "step": 4428 }, { "epoch": 1.4790449156787444, "grad_norm": 0.2446036022092954, "learning_rate": 5.987480241213646e-06, "loss": 0.0234, "step": 4429 }, { "epoch": 1.479378861245617, "grad_norm": 0.6236265871612486, "learning_rate": 5.985575109860292e-06, "loss": 0.042, "step": 4430 }, { "epoch": 1.4797128068124896, "grad_norm": 0.4350835063370474, "learning_rate": 5.983669829624214e-06, "loss": 0.0296, "step": 4431 }, { "epoch": 1.4800467523793621, "grad_norm": 0.26353919474889465, "learning_rate": 5.981764400793224e-06, "loss": 0.0235, "step": 4432 }, { "epoch": 1.4803806979462348, "grad_norm": 0.26317645607539303, "learning_rate": 5.9798588236551626e-06, "loss": 0.0203, "step": 4433 }, { "epoch": 1.4807146435131073, "grad_norm": 0.28637973774053704, "learning_rate": 5.977953098497889e-06, "loss": 0.0239, "step": 4434 }, { "epoch": 1.4810485890799798, "grad_norm": 0.27151618029180996, "learning_rate": 5.976047225609284e-06, "loss": 0.0242, "step": 4435 }, { "epoch": 1.4813825346468525, "grad_norm": 0.23468847255267425, "learning_rate": 5.974141205277253e-06, "loss": 0.0212, "step": 4436 }, { "epoch": 1.4817164802137253, "grad_norm": 0.32883916141243086, "learning_rate": 5.972235037789723e-06, "loss": 0.0271, "step": 4437 }, { "epoch": 1.4820504257805978, "grad_norm": 0.24546122673487103, "learning_rate": 5.970328723434642e-06, "loss": 0.022, "step": 4438 }, { "epoch": 1.4823843713474703, "grad_norm": 0.30865368447552105, "learning_rate": 5.968422262499983e-06, "loss": 0.0233, "step": 4439 }, { "epoch": 1.482718316914343, "grad_norm": 0.3202865420587498, "learning_rate": 5.966515655273739e-06, "loss": 0.0263, "step": 4440 }, { "epoch": 1.4830522624812157, "grad_norm": 0.2924941338804397, "learning_rate": 5.9646089020439245e-06, "loss": 0.0256, "step": 4441 }, { "epoch": 1.4833862080480882, "grad_norm": 0.239778534488851, "learning_rate": 5.962702003098576e-06, "loss": 0.0171, "step": 4442 }, { "epoch": 1.4837201536149607, "grad_norm": 0.27248970962594404, "learning_rate": 5.960794958725756e-06, "loss": 0.0267, "step": 4443 }, { "epoch": 1.4840540991818334, "grad_norm": 0.5668079145755521, "learning_rate": 5.958887769213544e-06, "loss": 0.0252, "step": 4444 }, { "epoch": 1.484388044748706, "grad_norm": 0.2893695129438283, "learning_rate": 5.956980434850044e-06, "loss": 0.022, "step": 4445 }, { "epoch": 1.4847219903155786, "grad_norm": 0.260698276431372, "learning_rate": 5.955072955923381e-06, "loss": 0.0266, "step": 4446 }, { "epoch": 1.485055935882451, "grad_norm": 0.25696206994345266, "learning_rate": 5.9531653327217035e-06, "loss": 0.0221, "step": 4447 }, { "epoch": 1.4853898814493238, "grad_norm": 0.25981626325338775, "learning_rate": 5.951257565533177e-06, "loss": 0.0241, "step": 4448 }, { "epoch": 1.4857238270161963, "grad_norm": 0.3447712293620019, "learning_rate": 5.949349654645997e-06, "loss": 0.0276, "step": 4449 }, { "epoch": 1.486057772583069, "grad_norm": 0.378301455185927, "learning_rate": 5.947441600348373e-06, "loss": 0.0472, "step": 4450 }, { "epoch": 1.4863917181499415, "grad_norm": 0.48273204515525314, "learning_rate": 5.945533402928537e-06, "loss": 0.0458, "step": 4451 }, { "epoch": 1.4867256637168142, "grad_norm": 0.3150713667270021, "learning_rate": 5.9436250626747505e-06, "loss": 0.0467, "step": 4452 }, { "epoch": 1.4870596092836867, "grad_norm": 0.29999024142010366, "learning_rate": 5.941716579875286e-06, "loss": 0.0278, "step": 4453 }, { "epoch": 1.4873935548505592, "grad_norm": 0.2847484281739037, "learning_rate": 5.939807954818443e-06, "loss": 0.0267, "step": 4454 }, { "epoch": 1.487727500417432, "grad_norm": 1.1781087472656608, "learning_rate": 5.937899187792544e-06, "loss": 0.0407, "step": 4455 }, { "epoch": 1.4880614459843047, "grad_norm": 0.23953326319386384, "learning_rate": 5.935990279085928e-06, "loss": 0.0212, "step": 4456 }, { "epoch": 1.4883953915511772, "grad_norm": 0.365271084860815, "learning_rate": 5.93408122898696e-06, "loss": 0.0284, "step": 4457 }, { "epoch": 1.4887293371180497, "grad_norm": 0.22194315158527386, "learning_rate": 5.9321720377840245e-06, "loss": 0.0185, "step": 4458 }, { "epoch": 1.4890632826849224, "grad_norm": 0.2578458646307319, "learning_rate": 5.930262705765526e-06, "loss": 0.0197, "step": 4459 }, { "epoch": 1.4893972282517949, "grad_norm": 0.3628195999931555, "learning_rate": 5.928353233219893e-06, "loss": 0.0211, "step": 4460 }, { "epoch": 1.4897311738186676, "grad_norm": 0.24962598655161936, "learning_rate": 5.926443620435572e-06, "loss": 0.0199, "step": 4461 }, { "epoch": 1.49006511938554, "grad_norm": 0.3309447927990789, "learning_rate": 5.924533867701034e-06, "loss": 0.022, "step": 4462 }, { "epoch": 1.4903990649524128, "grad_norm": 1.1256395660318999, "learning_rate": 5.922623975304771e-06, "loss": 0.0288, "step": 4463 }, { "epoch": 1.4907330105192853, "grad_norm": 0.24791379266010563, "learning_rate": 5.920713943535291e-06, "loss": 0.0231, "step": 4464 }, { "epoch": 1.491066956086158, "grad_norm": 0.3540328285682308, "learning_rate": 5.9188037726811285e-06, "loss": 0.0296, "step": 4465 }, { "epoch": 1.4914009016530305, "grad_norm": 0.40262122809029594, "learning_rate": 5.9168934630308385e-06, "loss": 0.0295, "step": 4466 }, { "epoch": 1.4917348472199032, "grad_norm": 0.2741715077506556, "learning_rate": 5.914983014872995e-06, "loss": 0.0271, "step": 4467 }, { "epoch": 1.4920687927867757, "grad_norm": 0.28712406032794907, "learning_rate": 5.9130724284961924e-06, "loss": 0.0242, "step": 4468 }, { "epoch": 1.4924027383536482, "grad_norm": 0.2788407148776453, "learning_rate": 5.91116170418905e-06, "loss": 0.0243, "step": 4469 }, { "epoch": 1.492736683920521, "grad_norm": 0.3824905570351801, "learning_rate": 5.909250842240203e-06, "loss": 0.0286, "step": 4470 }, { "epoch": 1.4930706294873937, "grad_norm": 0.4407368792854273, "learning_rate": 5.907339842938309e-06, "loss": 0.0171, "step": 4471 }, { "epoch": 1.4934045750542662, "grad_norm": 0.3349997431603896, "learning_rate": 5.90542870657205e-06, "loss": 0.0361, "step": 4472 }, { "epoch": 1.4937385206211387, "grad_norm": 0.31415251690410684, "learning_rate": 5.903517433430123e-06, "loss": 0.0209, "step": 4473 }, { "epoch": 1.4940724661880114, "grad_norm": 0.38454030743164036, "learning_rate": 5.901606023801248e-06, "loss": 0.027, "step": 4474 }, { "epoch": 1.494406411754884, "grad_norm": 0.3702022580921498, "learning_rate": 5.899694477974168e-06, "loss": 0.0314, "step": 4475 }, { "epoch": 1.4947403573217566, "grad_norm": 0.8169004780549912, "learning_rate": 5.897782796237645e-06, "loss": 0.0368, "step": 4476 }, { "epoch": 1.495074302888629, "grad_norm": 0.3345481747212127, "learning_rate": 5.895870978880457e-06, "loss": 0.0292, "step": 4477 }, { "epoch": 1.4954082484555018, "grad_norm": 0.24816626634703076, "learning_rate": 5.89395902619141e-06, "loss": 0.0204, "step": 4478 }, { "epoch": 1.4957421940223743, "grad_norm": 0.2574079017165514, "learning_rate": 5.892046938459327e-06, "loss": 0.021, "step": 4479 }, { "epoch": 1.496076139589247, "grad_norm": 0.36519695983560385, "learning_rate": 5.890134715973049e-06, "loss": 0.034, "step": 4480 }, { "epoch": 1.4964100851561195, "grad_norm": 0.354264536613751, "learning_rate": 5.888222359021443e-06, "loss": 0.0285, "step": 4481 }, { "epoch": 1.4967440307229922, "grad_norm": 0.4296920160950373, "learning_rate": 5.8863098678933896e-06, "loss": 0.0319, "step": 4482 }, { "epoch": 1.4970779762898647, "grad_norm": 0.5234360225964875, "learning_rate": 5.884397242877795e-06, "loss": 0.033, "step": 4483 }, { "epoch": 1.4974119218567372, "grad_norm": 0.44551044784097543, "learning_rate": 5.882484484263584e-06, "loss": 0.0294, "step": 4484 }, { "epoch": 1.49774586742361, "grad_norm": 0.23448900773796297, "learning_rate": 5.8805715923397e-06, "loss": 0.02, "step": 4485 }, { "epoch": 1.4980798129904827, "grad_norm": 0.30741229578954143, "learning_rate": 5.87865856739511e-06, "loss": 0.0288, "step": 4486 }, { "epoch": 1.4984137585573551, "grad_norm": 0.23931479236630407, "learning_rate": 5.876745409718796e-06, "loss": 0.0175, "step": 4487 }, { "epoch": 1.4987477041242276, "grad_norm": 0.19925632275590355, "learning_rate": 5.874832119599766e-06, "loss": 0.0131, "step": 4488 }, { "epoch": 1.4990816496911004, "grad_norm": 0.29390092263538264, "learning_rate": 5.872918697327042e-06, "loss": 0.0273, "step": 4489 }, { "epoch": 1.499415595257973, "grad_norm": 0.3195610194040288, "learning_rate": 5.871005143189671e-06, "loss": 0.0357, "step": 4490 }, { "epoch": 1.4997495408248456, "grad_norm": 0.3002328256551994, "learning_rate": 5.869091457476718e-06, "loss": 0.0273, "step": 4491 }, { "epoch": 1.500083486391718, "grad_norm": 0.2754902250643781, "learning_rate": 5.8671776404772655e-06, "loss": 0.0229, "step": 4492 }, { "epoch": 1.5004174319585908, "grad_norm": 0.30637840916966047, "learning_rate": 5.8652636924804206e-06, "loss": 0.0322, "step": 4493 }, { "epoch": 1.5007513775254635, "grad_norm": 0.29614098612626816, "learning_rate": 5.863349613775308e-06, "loss": 0.0274, "step": 4494 }, { "epoch": 1.501085323092336, "grad_norm": 0.2593370898495482, "learning_rate": 5.861435404651068e-06, "loss": 0.0216, "step": 4495 }, { "epoch": 1.5014192686592085, "grad_norm": 0.24726160704571823, "learning_rate": 5.859521065396869e-06, "loss": 0.0255, "step": 4496 }, { "epoch": 1.5017532142260812, "grad_norm": 0.36087923078253414, "learning_rate": 5.857606596301892e-06, "loss": 0.0219, "step": 4497 }, { "epoch": 1.5020871597929537, "grad_norm": 0.29399012235997096, "learning_rate": 5.85569199765534e-06, "loss": 0.0264, "step": 4498 }, { "epoch": 1.5024211053598262, "grad_norm": 0.283003119902052, "learning_rate": 5.853777269746438e-06, "loss": 0.0181, "step": 4499 }, { "epoch": 1.502755050926699, "grad_norm": 0.2295637318903182, "learning_rate": 5.851862412864426e-06, "loss": 0.0189, "step": 4500 }, { "epoch": 1.5030889964935716, "grad_norm": 0.22368830141025758, "learning_rate": 5.8499474272985654e-06, "loss": 0.0199, "step": 4501 }, { "epoch": 1.5034229420604441, "grad_norm": 0.3658507875277788, "learning_rate": 5.848032313338139e-06, "loss": 0.0283, "step": 4502 }, { "epoch": 1.5037568876273166, "grad_norm": 0.2717646343434779, "learning_rate": 5.846117071272444e-06, "loss": 0.0264, "step": 4503 }, { "epoch": 1.5040908331941893, "grad_norm": 0.2963475545938359, "learning_rate": 5.844201701390806e-06, "loss": 0.0327, "step": 4504 }, { "epoch": 1.504424778761062, "grad_norm": 0.42533245553802934, "learning_rate": 5.842286203982559e-06, "loss": 0.0225, "step": 4505 }, { "epoch": 1.5047587243279346, "grad_norm": 0.347760145868081, "learning_rate": 5.840370579337063e-06, "loss": 0.0319, "step": 4506 }, { "epoch": 1.505092669894807, "grad_norm": 0.35348437597389887, "learning_rate": 5.838454827743697e-06, "loss": 0.0281, "step": 4507 }, { "epoch": 1.5054266154616798, "grad_norm": 0.33239623968706344, "learning_rate": 5.8365389494918565e-06, "loss": 0.0234, "step": 4508 }, { "epoch": 1.5057605610285525, "grad_norm": 0.27395724996539655, "learning_rate": 5.834622944870959e-06, "loss": 0.0248, "step": 4509 }, { "epoch": 1.506094506595425, "grad_norm": 0.28594646026098314, "learning_rate": 5.832706814170437e-06, "loss": 0.0212, "step": 4510 }, { "epoch": 1.5064284521622975, "grad_norm": 0.5007176706738875, "learning_rate": 5.830790557679746e-06, "loss": 0.0316, "step": 4511 }, { "epoch": 1.5067623977291702, "grad_norm": 0.3828671663025841, "learning_rate": 5.8288741756883585e-06, "loss": 0.0361, "step": 4512 }, { "epoch": 1.5070963432960427, "grad_norm": 0.3299652216663339, "learning_rate": 5.826957668485768e-06, "loss": 0.0226, "step": 4513 }, { "epoch": 1.5074302888629152, "grad_norm": 0.2473818410219272, "learning_rate": 5.825041036361484e-06, "loss": 0.0184, "step": 4514 }, { "epoch": 1.507764234429788, "grad_norm": 0.3635681183868704, "learning_rate": 5.823124279605037e-06, "loss": 0.0417, "step": 4515 }, { "epoch": 1.5080981799966606, "grad_norm": 0.2588835718038487, "learning_rate": 5.821207398505976e-06, "loss": 0.018, "step": 4516 }, { "epoch": 1.5084321255635331, "grad_norm": 0.4298291272505506, "learning_rate": 5.819290393353867e-06, "loss": 0.0459, "step": 4517 }, { "epoch": 1.5087660711304056, "grad_norm": 0.42770580956101134, "learning_rate": 5.817373264438297e-06, "loss": 0.0314, "step": 4518 }, { "epoch": 1.5091000166972783, "grad_norm": 0.2719253002527101, "learning_rate": 5.815456012048873e-06, "loss": 0.0225, "step": 4519 }, { "epoch": 1.509433962264151, "grad_norm": 0.29592703075543736, "learning_rate": 5.8135386364752154e-06, "loss": 0.0237, "step": 4520 }, { "epoch": 1.5097679078310235, "grad_norm": 0.3969604385518994, "learning_rate": 5.8116211380069675e-06, "loss": 0.0282, "step": 4521 }, { "epoch": 1.510101853397896, "grad_norm": 0.3272548522243053, "learning_rate": 5.809703516933791e-06, "loss": 0.0204, "step": 4522 }, { "epoch": 1.5104357989647688, "grad_norm": 0.28527999186507796, "learning_rate": 5.807785773545364e-06, "loss": 0.0242, "step": 4523 }, { "epoch": 1.5107697445316415, "grad_norm": 0.3083549976096167, "learning_rate": 5.805867908131384e-06, "loss": 0.024, "step": 4524 }, { "epoch": 1.511103690098514, "grad_norm": 0.2796754045800017, "learning_rate": 5.803949920981568e-06, "loss": 0.0216, "step": 4525 }, { "epoch": 1.5114376356653865, "grad_norm": 0.3457570056577854, "learning_rate": 5.802031812385651e-06, "loss": 0.0265, "step": 4526 }, { "epoch": 1.5117715812322592, "grad_norm": 0.6303018675821321, "learning_rate": 5.800113582633384e-06, "loss": 0.0337, "step": 4527 }, { "epoch": 1.512105526799132, "grad_norm": 0.25098579911670743, "learning_rate": 5.7981952320145405e-06, "loss": 0.0247, "step": 4528 }, { "epoch": 1.5124394723660042, "grad_norm": 0.2910769566131532, "learning_rate": 5.796276760818908e-06, "loss": 0.0226, "step": 4529 }, { "epoch": 1.512773417932877, "grad_norm": 0.38280059234355135, "learning_rate": 5.794358169336295e-06, "loss": 0.0291, "step": 4530 }, { "epoch": 1.5131073634997496, "grad_norm": 0.3133082887840216, "learning_rate": 5.792439457856528e-06, "loss": 0.0318, "step": 4531 }, { "epoch": 1.513441309066622, "grad_norm": 0.2964147026248281, "learning_rate": 5.790520626669449e-06, "loss": 0.0269, "step": 4532 }, { "epoch": 1.5137752546334946, "grad_norm": 0.3291293657386248, "learning_rate": 5.788601676064922e-06, "loss": 0.0244, "step": 4533 }, { "epoch": 1.5141092002003673, "grad_norm": 0.2534649953871449, "learning_rate": 5.786682606332827e-06, "loss": 0.021, "step": 4534 }, { "epoch": 1.51444314576724, "grad_norm": 0.3956137234206681, "learning_rate": 5.78476341776306e-06, "loss": 0.0399, "step": 4535 }, { "epoch": 1.5147770913341125, "grad_norm": 0.3521007059702409, "learning_rate": 5.782844110645539e-06, "loss": 0.0225, "step": 4536 }, { "epoch": 1.515111036900985, "grad_norm": 0.3187280652722967, "learning_rate": 5.780924685270198e-06, "loss": 0.0263, "step": 4537 }, { "epoch": 1.5154449824678577, "grad_norm": 0.2713978210437977, "learning_rate": 5.779005141926988e-06, "loss": 0.0265, "step": 4538 }, { "epoch": 1.5157789280347305, "grad_norm": 0.24634928031527029, "learning_rate": 5.777085480905877e-06, "loss": 0.0216, "step": 4539 }, { "epoch": 1.516112873601603, "grad_norm": 0.27789043202449926, "learning_rate": 5.7751657024968565e-06, "loss": 0.0299, "step": 4540 }, { "epoch": 1.5164468191684755, "grad_norm": 0.4188611767754047, "learning_rate": 5.773245806989929e-06, "loss": 0.0379, "step": 4541 }, { "epoch": 1.5167807647353482, "grad_norm": 0.3115149618402707, "learning_rate": 5.771325794675117e-06, "loss": 0.0253, "step": 4542 }, { "epoch": 1.517114710302221, "grad_norm": 0.2560699542001376, "learning_rate": 5.769405665842461e-06, "loss": 0.0185, "step": 4543 }, { "epoch": 1.5174486558690934, "grad_norm": 0.3159015963424157, "learning_rate": 5.767485420782021e-06, "loss": 0.0265, "step": 4544 }, { "epoch": 1.5177826014359659, "grad_norm": 0.3235905206470134, "learning_rate": 5.7655650597838704e-06, "loss": 0.0262, "step": 4545 }, { "epoch": 1.5181165470028386, "grad_norm": 0.2930813024022368, "learning_rate": 5.7636445831381034e-06, "loss": 0.0215, "step": 4546 }, { "epoch": 1.518450492569711, "grad_norm": 0.2857122199031493, "learning_rate": 5.761723991134831e-06, "loss": 0.029, "step": 4547 }, { "epoch": 1.5187844381365836, "grad_norm": 0.29260302770460334, "learning_rate": 5.759803284064181e-06, "loss": 0.0244, "step": 4548 }, { "epoch": 1.5191183837034563, "grad_norm": 0.2531697490963169, "learning_rate": 5.757882462216299e-06, "loss": 0.0219, "step": 4549 }, { "epoch": 1.519452329270329, "grad_norm": 0.5624367886881112, "learning_rate": 5.755961525881345e-06, "loss": 0.0275, "step": 4550 }, { "epoch": 1.5197862748372015, "grad_norm": 0.4082335546430365, "learning_rate": 5.7540404753495034e-06, "loss": 0.0344, "step": 4551 }, { "epoch": 1.520120220404074, "grad_norm": 0.3735604674461184, "learning_rate": 5.75211931091097e-06, "loss": 0.0454, "step": 4552 }, { "epoch": 1.5204541659709467, "grad_norm": 0.2818705851229258, "learning_rate": 5.750198032855956e-06, "loss": 0.0192, "step": 4553 }, { "epoch": 1.5207881115378195, "grad_norm": 0.27003828603018404, "learning_rate": 5.748276641474698e-06, "loss": 0.0189, "step": 4554 }, { "epoch": 1.521122057104692, "grad_norm": 0.26984539217454406, "learning_rate": 5.746355137057442e-06, "loss": 0.0265, "step": 4555 }, { "epoch": 1.5214560026715644, "grad_norm": 0.2599567549036431, "learning_rate": 5.7444335198944555e-06, "loss": 0.0251, "step": 4556 }, { "epoch": 1.5217899482384372, "grad_norm": 0.3143515665008245, "learning_rate": 5.7425117902760195e-06, "loss": 0.0244, "step": 4557 }, { "epoch": 1.5221238938053099, "grad_norm": 0.3370359114708471, "learning_rate": 5.7405899484924346e-06, "loss": 0.0462, "step": 4558 }, { "epoch": 1.5224578393721824, "grad_norm": 0.2856420988296376, "learning_rate": 5.738667994834019e-06, "loss": 0.0242, "step": 4559 }, { "epoch": 1.5227917849390549, "grad_norm": 0.29237378296137156, "learning_rate": 5.736745929591103e-06, "loss": 0.0207, "step": 4560 }, { "epoch": 1.5231257305059276, "grad_norm": 0.26313380186967383, "learning_rate": 5.734823753054042e-06, "loss": 0.0285, "step": 4561 }, { "epoch": 1.5234596760728, "grad_norm": 0.25120794571849253, "learning_rate": 5.732901465513199e-06, "loss": 0.0211, "step": 4562 }, { "epoch": 1.5237936216396726, "grad_norm": 0.26062660804053955, "learning_rate": 5.73097906725896e-06, "loss": 0.0185, "step": 4563 }, { "epoch": 1.5241275672065453, "grad_norm": 0.24209339902864357, "learning_rate": 5.729056558581727e-06, "loss": 0.0217, "step": 4564 }, { "epoch": 1.524461512773418, "grad_norm": 0.6038532436961349, "learning_rate": 5.727133939771915e-06, "loss": 0.0327, "step": 4565 }, { "epoch": 1.5247954583402905, "grad_norm": 0.289937371343371, "learning_rate": 5.725211211119961e-06, "loss": 0.0267, "step": 4566 }, { "epoch": 1.525129403907163, "grad_norm": 0.247180956626713, "learning_rate": 5.723288372916315e-06, "loss": 0.0238, "step": 4567 }, { "epoch": 1.5254633494740357, "grad_norm": 0.23066461883261297, "learning_rate": 5.721365425451442e-06, "loss": 0.0173, "step": 4568 }, { "epoch": 1.5257972950409084, "grad_norm": 0.45808699359164345, "learning_rate": 5.719442369015828e-06, "loss": 0.0285, "step": 4569 }, { "epoch": 1.526131240607781, "grad_norm": 0.23071180498252666, "learning_rate": 5.717519203899975e-06, "loss": 0.0147, "step": 4570 }, { "epoch": 1.5264651861746534, "grad_norm": 0.3104740042009261, "learning_rate": 5.715595930394396e-06, "loss": 0.029, "step": 4571 }, { "epoch": 1.5267991317415261, "grad_norm": 0.3886417927297356, "learning_rate": 5.713672548789626e-06, "loss": 0.0333, "step": 4572 }, { "epoch": 1.5271330773083989, "grad_norm": 0.31689181981946263, "learning_rate": 5.711749059376215e-06, "loss": 0.0248, "step": 4573 }, { "epoch": 1.5274670228752714, "grad_norm": 0.268398606883179, "learning_rate": 5.7098254624447255e-06, "loss": 0.0257, "step": 4574 }, { "epoch": 1.5278009684421439, "grad_norm": 0.3220231489764643, "learning_rate": 5.707901758285745e-06, "loss": 0.025, "step": 4575 }, { "epoch": 1.5281349140090166, "grad_norm": 0.2688945215404341, "learning_rate": 5.705977947189868e-06, "loss": 0.0181, "step": 4576 }, { "epoch": 1.5284688595758893, "grad_norm": 0.3202538053443321, "learning_rate": 5.704054029447708e-06, "loss": 0.0306, "step": 4577 }, { "epoch": 1.5288028051427616, "grad_norm": 0.42388476698429495, "learning_rate": 5.702130005349899e-06, "loss": 0.0276, "step": 4578 }, { "epoch": 1.5291367507096343, "grad_norm": 0.38182722661325896, "learning_rate": 5.700205875187084e-06, "loss": 0.0366, "step": 4579 }, { "epoch": 1.529470696276507, "grad_norm": 0.31316755531102236, "learning_rate": 5.698281639249927e-06, "loss": 0.022, "step": 4580 }, { "epoch": 1.5298046418433795, "grad_norm": 0.2714309659875258, "learning_rate": 5.696357297829106e-06, "loss": 0.0231, "step": 4581 }, { "epoch": 1.530138587410252, "grad_norm": 0.2712535452604488, "learning_rate": 5.6944328512153165e-06, "loss": 0.0237, "step": 4582 }, { "epoch": 1.5304725329771247, "grad_norm": 0.3182382636290354, "learning_rate": 5.692508299699269e-06, "loss": 0.0275, "step": 4583 }, { "epoch": 1.5308064785439974, "grad_norm": 0.238015636588773, "learning_rate": 5.690583643571687e-06, "loss": 0.0189, "step": 4584 }, { "epoch": 1.53114042411087, "grad_norm": 0.4009935230094596, "learning_rate": 5.688658883123315e-06, "loss": 0.0403, "step": 4585 }, { "epoch": 1.5314743696777424, "grad_norm": 0.25016289320218604, "learning_rate": 5.68673401864491e-06, "loss": 0.0228, "step": 4586 }, { "epoch": 1.5318083152446151, "grad_norm": 0.267276585325518, "learning_rate": 5.684809050427247e-06, "loss": 0.018, "step": 4587 }, { "epoch": 1.5321422608114879, "grad_norm": 0.3936855696850991, "learning_rate": 5.682883978761111e-06, "loss": 0.0241, "step": 4588 }, { "epoch": 1.5324762063783604, "grad_norm": 0.2877023941912195, "learning_rate": 5.680958803937311e-06, "loss": 0.0182, "step": 4589 }, { "epoch": 1.5328101519452328, "grad_norm": 0.3574736219479748, "learning_rate": 5.6790335262466645e-06, "loss": 0.032, "step": 4590 }, { "epoch": 1.5331440975121056, "grad_norm": 0.3512393893416295, "learning_rate": 5.677108145980008e-06, "loss": 0.0337, "step": 4591 }, { "epoch": 1.5334780430789783, "grad_norm": 0.2925069173622745, "learning_rate": 5.675182663428196e-06, "loss": 0.0282, "step": 4592 }, { "epoch": 1.5338119886458508, "grad_norm": 0.20860657643540745, "learning_rate": 5.673257078882091e-06, "loss": 0.0174, "step": 4593 }, { "epoch": 1.5341459342127233, "grad_norm": 0.36380528226657366, "learning_rate": 5.671331392632577e-06, "loss": 0.0314, "step": 4594 }, { "epoch": 1.534479879779596, "grad_norm": 0.32532710429967887, "learning_rate": 5.6694056049705506e-06, "loss": 0.026, "step": 4595 }, { "epoch": 1.5348138253464685, "grad_norm": 0.2512125523806715, "learning_rate": 5.667479716186927e-06, "loss": 0.0239, "step": 4596 }, { "epoch": 1.535147770913341, "grad_norm": 0.313861757323899, "learning_rate": 5.665553726572631e-06, "loss": 0.0289, "step": 4597 }, { "epoch": 1.5354817164802137, "grad_norm": 0.2554555233620205, "learning_rate": 5.663627636418611e-06, "loss": 0.0189, "step": 4598 }, { "epoch": 1.5358156620470864, "grad_norm": 0.2700226452622836, "learning_rate": 5.661701446015821e-06, "loss": 0.0246, "step": 4599 }, { "epoch": 1.536149607613959, "grad_norm": 0.2871125077340285, "learning_rate": 5.659775155655235e-06, "loss": 0.0254, "step": 4600 }, { "epoch": 1.5364835531808314, "grad_norm": 0.4007450117048198, "learning_rate": 5.6578487656278446e-06, "loss": 0.0188, "step": 4601 }, { "epoch": 1.5368174987477041, "grad_norm": 0.24601929474702078, "learning_rate": 5.655922276224652e-06, "loss": 0.0226, "step": 4602 }, { "epoch": 1.5371514443145768, "grad_norm": 0.2495254899972368, "learning_rate": 5.653995687736676e-06, "loss": 0.0214, "step": 4603 }, { "epoch": 1.5374853898814493, "grad_norm": 0.3368790194729096, "learning_rate": 5.652069000454951e-06, "loss": 0.026, "step": 4604 }, { "epoch": 1.5378193354483218, "grad_norm": 0.2542367660282918, "learning_rate": 5.650142214670527e-06, "loss": 0.0185, "step": 4605 }, { "epoch": 1.5381532810151946, "grad_norm": 1.0028351241572269, "learning_rate": 5.648215330674464e-06, "loss": 0.0296, "step": 4606 }, { "epoch": 1.5384872265820673, "grad_norm": 0.3162108319139779, "learning_rate": 5.646288348757845e-06, "loss": 0.0309, "step": 4607 }, { "epoch": 1.5388211721489398, "grad_norm": 0.36952837259072446, "learning_rate": 5.64436126921176e-06, "loss": 0.0217, "step": 4608 }, { "epoch": 1.5391551177158123, "grad_norm": 0.26075476018207117, "learning_rate": 5.642434092327318e-06, "loss": 0.0178, "step": 4609 }, { "epoch": 1.539489063282685, "grad_norm": 0.2950114033119621, "learning_rate": 5.640506818395643e-06, "loss": 0.0254, "step": 4610 }, { "epoch": 1.5398230088495575, "grad_norm": 0.34970712078527644, "learning_rate": 5.638579447707871e-06, "loss": 0.0233, "step": 4611 }, { "epoch": 1.54015695441643, "grad_norm": 0.26247724622548424, "learning_rate": 5.636651980555153e-06, "loss": 0.0291, "step": 4612 }, { "epoch": 1.5404908999833027, "grad_norm": 0.4277826509695619, "learning_rate": 5.634724417228658e-06, "loss": 0.0328, "step": 4613 }, { "epoch": 1.5408248455501754, "grad_norm": 0.3751967744435991, "learning_rate": 5.632796758019566e-06, "loss": 0.0302, "step": 4614 }, { "epoch": 1.541158791117048, "grad_norm": 0.3050719666108031, "learning_rate": 5.630869003219072e-06, "loss": 0.0238, "step": 4615 }, { "epoch": 1.5414927366839204, "grad_norm": 0.40607807074425567, "learning_rate": 5.628941153118388e-06, "loss": 0.0322, "step": 4616 }, { "epoch": 1.5418266822507931, "grad_norm": 0.22955755926861565, "learning_rate": 5.627013208008737e-06, "loss": 0.0187, "step": 4617 }, { "epoch": 1.5421606278176658, "grad_norm": 0.28266256943565177, "learning_rate": 5.625085168181357e-06, "loss": 0.0328, "step": 4618 }, { "epoch": 1.5424945733845383, "grad_norm": 0.3007434350195192, "learning_rate": 5.623157033927503e-06, "loss": 0.0268, "step": 4619 }, { "epoch": 1.5428285189514108, "grad_norm": 0.27442694457382366, "learning_rate": 5.621228805538443e-06, "loss": 0.0196, "step": 4620 }, { "epoch": 1.5431624645182835, "grad_norm": 0.2891208608608508, "learning_rate": 5.619300483305454e-06, "loss": 0.0233, "step": 4621 }, { "epoch": 1.5434964100851563, "grad_norm": 0.2621804175678766, "learning_rate": 5.617372067519837e-06, "loss": 0.0218, "step": 4622 }, { "epoch": 1.5438303556520288, "grad_norm": 0.22817704008707426, "learning_rate": 5.6154435584729e-06, "loss": 0.0178, "step": 4623 }, { "epoch": 1.5441643012189012, "grad_norm": 0.2564681222775103, "learning_rate": 5.6135149564559665e-06, "loss": 0.0235, "step": 4624 }, { "epoch": 1.544498246785774, "grad_norm": 0.2912918765243214, "learning_rate": 5.611586261760375e-06, "loss": 0.0199, "step": 4625 }, { "epoch": 1.5448321923526467, "grad_norm": 0.3001962667437202, "learning_rate": 5.609657474677478e-06, "loss": 0.0265, "step": 4626 }, { "epoch": 1.545166137919519, "grad_norm": 0.37257182980937525, "learning_rate": 5.607728595498641e-06, "loss": 0.0268, "step": 4627 }, { "epoch": 1.5455000834863917, "grad_norm": 0.25465617494988596, "learning_rate": 5.6057996245152435e-06, "loss": 0.0227, "step": 4628 }, { "epoch": 1.5458340290532644, "grad_norm": 0.4444859778718718, "learning_rate": 5.603870562018679e-06, "loss": 0.0181, "step": 4629 }, { "epoch": 1.5461679746201369, "grad_norm": 0.272913967079136, "learning_rate": 5.601941408300358e-06, "loss": 0.0298, "step": 4630 }, { "epoch": 1.5465019201870094, "grad_norm": 0.2393553678940523, "learning_rate": 5.600012163651698e-06, "loss": 0.0179, "step": 4631 }, { "epoch": 1.546835865753882, "grad_norm": 0.28587806870771554, "learning_rate": 5.598082828364134e-06, "loss": 0.0281, "step": 4632 }, { "epoch": 1.5471698113207548, "grad_norm": 0.26092692494166153, "learning_rate": 5.596153402729118e-06, "loss": 0.026, "step": 4633 }, { "epoch": 1.5475037568876273, "grad_norm": 0.2019802370150038, "learning_rate": 5.594223887038113e-06, "loss": 0.0215, "step": 4634 }, { "epoch": 1.5478377024544998, "grad_norm": 0.40809753995562387, "learning_rate": 5.592294281582591e-06, "loss": 0.0192, "step": 4635 }, { "epoch": 1.5481716480213725, "grad_norm": 0.21067349549869563, "learning_rate": 5.590364586654043e-06, "loss": 0.0167, "step": 4636 }, { "epoch": 1.5485055935882452, "grad_norm": 0.4390158755584321, "learning_rate": 5.588434802543975e-06, "loss": 0.0263, "step": 4637 }, { "epoch": 1.5488395391551177, "grad_norm": 0.25371956661779926, "learning_rate": 5.5865049295439e-06, "loss": 0.0193, "step": 4638 }, { "epoch": 1.5491734847219902, "grad_norm": 0.34840836287415267, "learning_rate": 5.584574967945351e-06, "loss": 0.022, "step": 4639 }, { "epoch": 1.549507430288863, "grad_norm": 0.3323124921361567, "learning_rate": 5.582644918039869e-06, "loss": 0.0283, "step": 4640 }, { "epoch": 1.5498413758557357, "grad_norm": 0.2369539977035479, "learning_rate": 5.580714780119011e-06, "loss": 0.0232, "step": 4641 }, { "epoch": 1.5501753214226082, "grad_norm": 0.2358771219447518, "learning_rate": 5.578784554474348e-06, "loss": 0.018, "step": 4642 }, { "epoch": 1.5505092669894807, "grad_norm": 0.7038527335405669, "learning_rate": 5.5768542413974645e-06, "loss": 0.0339, "step": 4643 }, { "epoch": 1.5508432125563534, "grad_norm": 0.33279921269341783, "learning_rate": 5.574923841179953e-06, "loss": 0.0262, "step": 4644 }, { "epoch": 1.5511771581232259, "grad_norm": 0.26103378741182265, "learning_rate": 5.572993354113429e-06, "loss": 0.0139, "step": 4645 }, { "epoch": 1.5515111036900984, "grad_norm": 0.42961469455510687, "learning_rate": 5.5710627804895105e-06, "loss": 0.0382, "step": 4646 }, { "epoch": 1.551845049256971, "grad_norm": 0.3138532263978365, "learning_rate": 5.569132120599834e-06, "loss": 0.0213, "step": 4647 }, { "epoch": 1.5521789948238438, "grad_norm": 0.2921545662946304, "learning_rate": 5.567201374736051e-06, "loss": 0.0207, "step": 4648 }, { "epoch": 1.5525129403907163, "grad_norm": 0.3590977867771232, "learning_rate": 5.565270543189821e-06, "loss": 0.0278, "step": 4649 }, { "epoch": 1.5528468859575888, "grad_norm": 0.24965628468729104, "learning_rate": 5.563339626252819e-06, "loss": 0.0252, "step": 4650 }, { "epoch": 1.5531808315244615, "grad_norm": 0.2114257381053459, "learning_rate": 5.561408624216734e-06, "loss": 0.0166, "step": 4651 }, { "epoch": 1.5535147770913342, "grad_norm": 0.3148892639908185, "learning_rate": 5.559477537373267e-06, "loss": 0.033, "step": 4652 }, { "epoch": 1.5538487226582067, "grad_norm": 0.39515797368240496, "learning_rate": 5.557546366014129e-06, "loss": 0.0249, "step": 4653 }, { "epoch": 1.5541826682250792, "grad_norm": 0.221771025652442, "learning_rate": 5.555615110431049e-06, "loss": 0.0195, "step": 4654 }, { "epoch": 1.554516613791952, "grad_norm": 0.22627484311750407, "learning_rate": 5.553683770915763e-06, "loss": 0.0191, "step": 4655 }, { "epoch": 1.5548505593588247, "grad_norm": 0.2843287276844521, "learning_rate": 5.551752347760023e-06, "loss": 0.0193, "step": 4656 }, { "epoch": 1.5551845049256972, "grad_norm": 0.2767401280559718, "learning_rate": 5.549820841255597e-06, "loss": 0.0188, "step": 4657 }, { "epoch": 1.5555184504925696, "grad_norm": 0.2782344950701206, "learning_rate": 5.547889251694257e-06, "loss": 0.0202, "step": 4658 }, { "epoch": 1.5558523960594424, "grad_norm": 0.3133332604155637, "learning_rate": 5.545957579367795e-06, "loss": 0.0294, "step": 4659 }, { "epoch": 1.5561863416263149, "grad_norm": 0.4599047583382848, "learning_rate": 5.544025824568011e-06, "loss": 0.0308, "step": 4660 }, { "epoch": 1.5565202871931874, "grad_norm": 0.27903961028820656, "learning_rate": 5.542093987586722e-06, "loss": 0.0325, "step": 4661 }, { "epoch": 1.55685423276006, "grad_norm": 0.3064781549878294, "learning_rate": 5.540162068715752e-06, "loss": 0.0232, "step": 4662 }, { "epoch": 1.5571881783269328, "grad_norm": 0.40819545051225026, "learning_rate": 5.538230068246942e-06, "loss": 0.0298, "step": 4663 }, { "epoch": 1.5575221238938053, "grad_norm": 0.3165798642538438, "learning_rate": 5.536297986472142e-06, "loss": 0.0221, "step": 4664 }, { "epoch": 1.5578560694606778, "grad_norm": 0.2400764108757089, "learning_rate": 5.534365823683219e-06, "loss": 0.0215, "step": 4665 }, { "epoch": 1.5581900150275505, "grad_norm": 0.4807705650038244, "learning_rate": 5.532433580172044e-06, "loss": 0.0334, "step": 4666 }, { "epoch": 1.5585239605944232, "grad_norm": 0.26510184630934774, "learning_rate": 5.5305012562305075e-06, "loss": 0.0216, "step": 4667 }, { "epoch": 1.5588579061612957, "grad_norm": 0.33302149776035345, "learning_rate": 5.528568852150511e-06, "loss": 0.0282, "step": 4668 }, { "epoch": 1.5591918517281682, "grad_norm": 0.3316923001526142, "learning_rate": 5.526636368223965e-06, "loss": 0.0387, "step": 4669 }, { "epoch": 1.559525797295041, "grad_norm": 0.2530947372334988, "learning_rate": 5.524703804742793e-06, "loss": 0.0235, "step": 4670 }, { "epoch": 1.5598597428619136, "grad_norm": 0.23760674435311052, "learning_rate": 5.522771161998936e-06, "loss": 0.0183, "step": 4671 }, { "epoch": 1.5601936884287861, "grad_norm": 0.29753277466576084, "learning_rate": 5.52083844028434e-06, "loss": 0.0237, "step": 4672 }, { "epoch": 1.5605276339956586, "grad_norm": 0.2949765784112042, "learning_rate": 5.518905639890961e-06, "loss": 0.0253, "step": 4673 }, { "epoch": 1.5608615795625314, "grad_norm": 0.27461339126092194, "learning_rate": 5.516972761110778e-06, "loss": 0.0262, "step": 4674 }, { "epoch": 1.561195525129404, "grad_norm": 0.27062981712833906, "learning_rate": 5.515039804235772e-06, "loss": 0.0215, "step": 4675 }, { "epoch": 1.5615294706962763, "grad_norm": 0.23118327380251755, "learning_rate": 5.51310676955794e-06, "loss": 0.019, "step": 4676 }, { "epoch": 1.561863416263149, "grad_norm": 0.3252735016340356, "learning_rate": 5.511173657369287e-06, "loss": 0.0306, "step": 4677 }, { "epoch": 1.5621973618300218, "grad_norm": 0.29006622084789985, "learning_rate": 5.509240467961835e-06, "loss": 0.0223, "step": 4678 }, { "epoch": 1.5625313073968943, "grad_norm": 0.18297376051731895, "learning_rate": 5.507307201627614e-06, "loss": 0.0154, "step": 4679 }, { "epoch": 1.5628652529637668, "grad_norm": 0.25067390514121257, "learning_rate": 5.505373858658668e-06, "loss": 0.0225, "step": 4680 }, { "epoch": 1.5631991985306395, "grad_norm": 0.21910683859517935, "learning_rate": 5.503440439347048e-06, "loss": 0.0179, "step": 4681 }, { "epoch": 1.5635331440975122, "grad_norm": 0.30904611155154754, "learning_rate": 5.501506943984823e-06, "loss": 0.0167, "step": 4682 }, { "epoch": 1.5638670896643847, "grad_norm": 0.30637023547215625, "learning_rate": 5.4995733728640695e-06, "loss": 0.0163, "step": 4683 }, { "epoch": 1.5642010352312572, "grad_norm": 0.27583062890681237, "learning_rate": 5.497639726276876e-06, "loss": 0.023, "step": 4684 }, { "epoch": 1.56453498079813, "grad_norm": 0.3082953985784777, "learning_rate": 5.49570600451534e-06, "loss": 0.031, "step": 4685 }, { "epoch": 1.5648689263650026, "grad_norm": 0.27738685199879104, "learning_rate": 5.493772207871577e-06, "loss": 0.0205, "step": 4686 }, { "epoch": 1.5652028719318751, "grad_norm": 0.42280758757405773, "learning_rate": 5.491838336637708e-06, "loss": 0.035, "step": 4687 }, { "epoch": 1.5655368174987476, "grad_norm": 0.2799160244683536, "learning_rate": 5.4899043911058665e-06, "loss": 0.0168, "step": 4688 }, { "epoch": 1.5658707630656203, "grad_norm": 0.24153761994573122, "learning_rate": 5.487970371568199e-06, "loss": 0.0141, "step": 4689 }, { "epoch": 1.566204708632493, "grad_norm": 0.19062471668626987, "learning_rate": 5.486036278316861e-06, "loss": 0.0116, "step": 4690 }, { "epoch": 1.5665386541993656, "grad_norm": 0.39218107389774626, "learning_rate": 5.48410211164402e-06, "loss": 0.0194, "step": 4691 }, { "epoch": 1.566872599766238, "grad_norm": 0.33593644567960307, "learning_rate": 5.482167871841855e-06, "loss": 0.0308, "step": 4692 }, { "epoch": 1.5672065453331108, "grad_norm": 0.2418824940638314, "learning_rate": 5.480233559202556e-06, "loss": 0.0218, "step": 4693 }, { "epoch": 1.5675404908999833, "grad_norm": 0.3160367765089976, "learning_rate": 5.4782991740183225e-06, "loss": 0.034, "step": 4694 }, { "epoch": 1.5678744364668558, "grad_norm": 0.25420505987501624, "learning_rate": 5.476364716581367e-06, "loss": 0.0193, "step": 4695 }, { "epoch": 1.5682083820337285, "grad_norm": 0.24988980694723345, "learning_rate": 5.474430187183912e-06, "loss": 0.0215, "step": 4696 }, { "epoch": 1.5685423276006012, "grad_norm": 0.3130027566208691, "learning_rate": 5.472495586118192e-06, "loss": 0.0285, "step": 4697 }, { "epoch": 1.5688762731674737, "grad_norm": 0.3616846271941303, "learning_rate": 5.47056091367645e-06, "loss": 0.0254, "step": 4698 }, { "epoch": 1.5692102187343462, "grad_norm": 0.3860937318368782, "learning_rate": 5.468626170150942e-06, "loss": 0.0292, "step": 4699 }, { "epoch": 1.569544164301219, "grad_norm": 0.3273696176668461, "learning_rate": 5.466691355833932e-06, "loss": 0.0342, "step": 4700 }, { "epoch": 1.5698781098680916, "grad_norm": 0.2990149381635294, "learning_rate": 5.464756471017696e-06, "loss": 0.0265, "step": 4701 }, { "epoch": 1.5702120554349641, "grad_norm": 0.20533438782692248, "learning_rate": 5.462821515994525e-06, "loss": 0.0146, "step": 4702 }, { "epoch": 1.5705460010018366, "grad_norm": 0.2766583366827619, "learning_rate": 5.460886491056714e-06, "loss": 0.0231, "step": 4703 }, { "epoch": 1.5708799465687093, "grad_norm": 0.29272686931138153, "learning_rate": 5.458951396496572e-06, "loss": 0.0309, "step": 4704 }, { "epoch": 1.571213892135582, "grad_norm": 0.2622278349438255, "learning_rate": 5.457016232606417e-06, "loss": 0.0241, "step": 4705 }, { "epoch": 1.5715478377024545, "grad_norm": 0.31097621246261814, "learning_rate": 5.455080999678579e-06, "loss": 0.0317, "step": 4706 }, { "epoch": 1.571881783269327, "grad_norm": 0.3514563774469613, "learning_rate": 5.453145698005399e-06, "loss": 0.0283, "step": 4707 }, { "epoch": 1.5722157288361998, "grad_norm": 0.23391581133573316, "learning_rate": 5.451210327879223e-06, "loss": 0.0212, "step": 4708 }, { "epoch": 1.5725496744030723, "grad_norm": 0.31739255467856997, "learning_rate": 5.449274889592416e-06, "loss": 0.02, "step": 4709 }, { "epoch": 1.5728836199699447, "grad_norm": 0.3555040759906985, "learning_rate": 5.4473393834373466e-06, "loss": 0.0292, "step": 4710 }, { "epoch": 1.5732175655368175, "grad_norm": 0.2935354434700193, "learning_rate": 5.445403809706395e-06, "loss": 0.0271, "step": 4711 }, { "epoch": 1.5735515111036902, "grad_norm": 0.22900019021369852, "learning_rate": 5.443468168691954e-06, "loss": 0.0233, "step": 4712 }, { "epoch": 1.5738854566705627, "grad_norm": 0.2423720507858526, "learning_rate": 5.441532460686426e-06, "loss": 0.0231, "step": 4713 }, { "epoch": 1.5742194022374352, "grad_norm": 0.3058308198831014, "learning_rate": 5.4395966859822195e-06, "loss": 0.0283, "step": 4714 }, { "epoch": 1.574553347804308, "grad_norm": 0.23322716561321696, "learning_rate": 5.437660844871758e-06, "loss": 0.0137, "step": 4715 }, { "epoch": 1.5748872933711806, "grad_norm": 0.29576286678046054, "learning_rate": 5.435724937647473e-06, "loss": 0.0326, "step": 4716 }, { "epoch": 1.575221238938053, "grad_norm": 0.3089105400151078, "learning_rate": 5.433788964601804e-06, "loss": 0.0232, "step": 4717 }, { "epoch": 1.5755551845049256, "grad_norm": 0.2899807206089013, "learning_rate": 5.431852926027206e-06, "loss": 0.0251, "step": 4718 }, { "epoch": 1.5758891300717983, "grad_norm": 0.29668329095541673, "learning_rate": 5.429916822216138e-06, "loss": 0.0372, "step": 4719 }, { "epoch": 1.576223075638671, "grad_norm": 0.23919092453680474, "learning_rate": 5.42798065346107e-06, "loss": 0.015, "step": 4720 }, { "epoch": 1.5765570212055435, "grad_norm": 0.2786262183787538, "learning_rate": 5.426044420054488e-06, "loss": 0.0203, "step": 4721 }, { "epoch": 1.576890966772416, "grad_norm": 0.3806863772664518, "learning_rate": 5.424108122288878e-06, "loss": 0.034, "step": 4722 }, { "epoch": 1.5772249123392887, "grad_norm": 0.29144612056718594, "learning_rate": 5.4221717604567435e-06, "loss": 0.0266, "step": 4723 }, { "epoch": 1.5775588579061615, "grad_norm": 0.30137680589643145, "learning_rate": 5.420235334850593e-06, "loss": 0.0251, "step": 4724 }, { "epoch": 1.5778928034730337, "grad_norm": 0.29449817772581827, "learning_rate": 5.418298845762947e-06, "loss": 0.0253, "step": 4725 }, { "epoch": 1.5782267490399065, "grad_norm": 0.3016360861775003, "learning_rate": 5.416362293486336e-06, "loss": 0.0205, "step": 4726 }, { "epoch": 1.5785606946067792, "grad_norm": 0.32694696518846245, "learning_rate": 5.4144256783132975e-06, "loss": 0.0227, "step": 4727 }, { "epoch": 1.5788946401736517, "grad_norm": 0.3042867092000363, "learning_rate": 5.41248900053638e-06, "loss": 0.034, "step": 4728 }, { "epoch": 1.5792285857405242, "grad_norm": 0.25855069164140154, "learning_rate": 5.4105522604481435e-06, "loss": 0.0205, "step": 4729 }, { "epoch": 1.5795625313073969, "grad_norm": 0.2916706163829039, "learning_rate": 5.408615458341152e-06, "loss": 0.0259, "step": 4730 }, { "epoch": 1.5798964768742696, "grad_norm": 0.20060261462918372, "learning_rate": 5.4066785945079855e-06, "loss": 0.0178, "step": 4731 }, { "epoch": 1.580230422441142, "grad_norm": 0.27302644205906934, "learning_rate": 5.404741669241228e-06, "loss": 0.0208, "step": 4732 }, { "epoch": 1.5805643680080146, "grad_norm": 0.3215309320982359, "learning_rate": 5.402804682833477e-06, "loss": 0.0261, "step": 4733 }, { "epoch": 1.5808983135748873, "grad_norm": 0.42220882713573354, "learning_rate": 5.400867635577335e-06, "loss": 0.0339, "step": 4734 }, { "epoch": 1.58123225914176, "grad_norm": 0.36305931152161164, "learning_rate": 5.398930527765416e-06, "loss": 0.0336, "step": 4735 }, { "epoch": 1.5815662047086325, "grad_norm": 0.26248823150529205, "learning_rate": 5.396993359690345e-06, "loss": 0.025, "step": 4736 }, { "epoch": 1.581900150275505, "grad_norm": 0.4569090185707832, "learning_rate": 5.395056131644752e-06, "loss": 0.022, "step": 4737 }, { "epoch": 1.5822340958423777, "grad_norm": 0.2605050720874279, "learning_rate": 5.393118843921277e-06, "loss": 0.0248, "step": 4738 }, { "epoch": 1.5825680414092504, "grad_norm": 0.7888558177840966, "learning_rate": 5.391181496812573e-06, "loss": 0.0196, "step": 4739 }, { "epoch": 1.582901986976123, "grad_norm": 0.2629408746991032, "learning_rate": 5.389244090611298e-06, "loss": 0.0244, "step": 4740 }, { "epoch": 1.5832359325429954, "grad_norm": 0.41574229723430983, "learning_rate": 5.38730662561012e-06, "loss": 0.0294, "step": 4741 }, { "epoch": 1.5835698781098682, "grad_norm": 0.3006902731232163, "learning_rate": 5.385369102101716e-06, "loss": 0.0214, "step": 4742 }, { "epoch": 1.5839038236767407, "grad_norm": 0.5117946854034846, "learning_rate": 5.38343152037877e-06, "loss": 0.0298, "step": 4743 }, { "epoch": 1.5842377692436131, "grad_norm": 0.2865823988910225, "learning_rate": 5.38149388073398e-06, "loss": 0.0271, "step": 4744 }, { "epoch": 1.5845717148104859, "grad_norm": 0.2404491870137412, "learning_rate": 5.379556183460047e-06, "loss": 0.021, "step": 4745 }, { "epoch": 1.5849056603773586, "grad_norm": 0.3456730889133306, "learning_rate": 5.377618428849683e-06, "loss": 0.0297, "step": 4746 }, { "epoch": 1.585239605944231, "grad_norm": 0.4326588378740707, "learning_rate": 5.375680617195609e-06, "loss": 0.0401, "step": 4747 }, { "epoch": 1.5855735515111036, "grad_norm": 0.3092114747232085, "learning_rate": 5.373742748790555e-06, "loss": 0.0276, "step": 4748 }, { "epoch": 1.5859074970779763, "grad_norm": 0.32861060357738203, "learning_rate": 5.371804823927258e-06, "loss": 0.0231, "step": 4749 }, { "epoch": 1.586241442644849, "grad_norm": 0.3309238635446407, "learning_rate": 5.369866842898465e-06, "loss": 0.0302, "step": 4750 }, { "epoch": 1.5865753882117215, "grad_norm": 0.273042234665651, "learning_rate": 5.367928805996929e-06, "loss": 0.0179, "step": 4751 }, { "epoch": 1.586909333778594, "grad_norm": 0.22121126306814043, "learning_rate": 5.365990713515414e-06, "loss": 0.0155, "step": 4752 }, { "epoch": 1.5872432793454667, "grad_norm": 0.36004708447717365, "learning_rate": 5.364052565746693e-06, "loss": 0.0339, "step": 4753 }, { "epoch": 1.5875772249123394, "grad_norm": 0.35471488508422894, "learning_rate": 5.362114362983547e-06, "loss": 0.0285, "step": 4754 }, { "epoch": 1.587911170479212, "grad_norm": 0.3278108083323332, "learning_rate": 5.360176105518761e-06, "loss": 0.0252, "step": 4755 }, { "epoch": 1.5882451160460844, "grad_norm": 0.5442057996894242, "learning_rate": 5.358237793645133e-06, "loss": 0.0259, "step": 4756 }, { "epoch": 1.5885790616129571, "grad_norm": 0.3066559517227354, "learning_rate": 5.356299427655469e-06, "loss": 0.0248, "step": 4757 }, { "epoch": 1.5889130071798296, "grad_norm": 0.3509292080854506, "learning_rate": 5.354361007842581e-06, "loss": 0.0238, "step": 4758 }, { "epoch": 1.5892469527467021, "grad_norm": 0.3127844158130525, "learning_rate": 5.352422534499291e-06, "loss": 0.0209, "step": 4759 }, { "epoch": 1.5895808983135749, "grad_norm": 0.25436475877059445, "learning_rate": 5.350484007918428e-06, "loss": 0.0288, "step": 4760 }, { "epoch": 1.5899148438804476, "grad_norm": 0.26940685127458835, "learning_rate": 5.3485454283928265e-06, "loss": 0.0202, "step": 4761 }, { "epoch": 1.59024878944732, "grad_norm": 0.29915499194764766, "learning_rate": 5.346606796215335e-06, "loss": 0.0227, "step": 4762 }, { "epoch": 1.5905827350141926, "grad_norm": 0.3150840168448465, "learning_rate": 5.344668111678805e-06, "loss": 0.0289, "step": 4763 }, { "epoch": 1.5909166805810653, "grad_norm": 0.29708402447560933, "learning_rate": 5.3427293750761e-06, "loss": 0.0293, "step": 4764 }, { "epoch": 1.591250626147938, "grad_norm": 0.507848113811238, "learning_rate": 5.340790586700086e-06, "loss": 0.0444, "step": 4765 }, { "epoch": 1.5915845717148105, "grad_norm": 0.47958756147843334, "learning_rate": 5.338851746843643e-06, "loss": 0.0504, "step": 4766 }, { "epoch": 1.591918517281683, "grad_norm": 0.3857724170103133, "learning_rate": 5.336912855799652e-06, "loss": 0.0333, "step": 4767 }, { "epoch": 1.5922524628485557, "grad_norm": 0.3256115713330487, "learning_rate": 5.334973913861008e-06, "loss": 0.0298, "step": 4768 }, { "epoch": 1.5925864084154284, "grad_norm": 0.3210869938593134, "learning_rate": 5.33303492132061e-06, "loss": 0.0245, "step": 4769 }, { "epoch": 1.592920353982301, "grad_norm": 0.3027694155959764, "learning_rate": 5.3310958784713655e-06, "loss": 0.0224, "step": 4770 }, { "epoch": 1.5932542995491734, "grad_norm": 0.3374932600880021, "learning_rate": 5.329156785606191e-06, "loss": 0.0272, "step": 4771 }, { "epoch": 1.5935882451160461, "grad_norm": 0.19829963870156075, "learning_rate": 5.327217643018008e-06, "loss": 0.0188, "step": 4772 }, { "epoch": 1.5939221906829188, "grad_norm": 0.2875739802901327, "learning_rate": 5.325278450999747e-06, "loss": 0.0285, "step": 4773 }, { "epoch": 1.5942561362497911, "grad_norm": 0.28658185117328083, "learning_rate": 5.323339209844346e-06, "loss": 0.0225, "step": 4774 }, { "epoch": 1.5945900818166638, "grad_norm": 0.20728421149729695, "learning_rate": 5.32139991984475e-06, "loss": 0.0188, "step": 4775 }, { "epoch": 1.5949240273835366, "grad_norm": 0.23157554397758384, "learning_rate": 5.319460581293911e-06, "loss": 0.0245, "step": 4776 }, { "epoch": 1.595257972950409, "grad_norm": 0.4322591968780855, "learning_rate": 5.317521194484791e-06, "loss": 0.0361, "step": 4777 }, { "epoch": 1.5955919185172815, "grad_norm": 0.2646817157765945, "learning_rate": 5.315581759710356e-06, "loss": 0.0262, "step": 4778 }, { "epoch": 1.5959258640841543, "grad_norm": 0.3301913898104641, "learning_rate": 5.313642277263577e-06, "loss": 0.032, "step": 4779 }, { "epoch": 1.596259809651027, "grad_norm": 0.2703215674687887, "learning_rate": 5.311702747437443e-06, "loss": 0.0237, "step": 4780 }, { "epoch": 1.5965937552178995, "grad_norm": 0.2530070537761009, "learning_rate": 5.309763170524937e-06, "loss": 0.0201, "step": 4781 }, { "epoch": 1.596927700784772, "grad_norm": 0.3446708667156327, "learning_rate": 5.307823546819056e-06, "loss": 0.0251, "step": 4782 }, { "epoch": 1.5972616463516447, "grad_norm": 0.22083629292761292, "learning_rate": 5.305883876612805e-06, "loss": 0.0175, "step": 4783 }, { "epoch": 1.5975955919185174, "grad_norm": 0.3487952843616271, "learning_rate": 5.303944160199193e-06, "loss": 0.0277, "step": 4784 }, { "epoch": 1.59792953748539, "grad_norm": 0.3540184288587415, "learning_rate": 5.302004397871237e-06, "loss": 0.0268, "step": 4785 }, { "epoch": 1.5982634830522624, "grad_norm": 0.3465534816893311, "learning_rate": 5.3000645899219594e-06, "loss": 0.035, "step": 4786 }, { "epoch": 1.5985974286191351, "grad_norm": 0.3162003508577011, "learning_rate": 5.298124736644392e-06, "loss": 0.0262, "step": 4787 }, { "epoch": 1.5989313741860078, "grad_norm": 0.35194628643626913, "learning_rate": 5.296184838331575e-06, "loss": 0.0272, "step": 4788 }, { "epoch": 1.5992653197528803, "grad_norm": 0.32583954152290284, "learning_rate": 5.2942448952765495e-06, "loss": 0.0192, "step": 4789 }, { "epoch": 1.5995992653197528, "grad_norm": 0.2805320714680657, "learning_rate": 5.292304907772367e-06, "loss": 0.02, "step": 4790 }, { "epoch": 1.5999332108866255, "grad_norm": 0.40436549839817487, "learning_rate": 5.290364876112088e-06, "loss": 0.0296, "step": 4791 }, { "epoch": 1.600267156453498, "grad_norm": 0.24443108313728634, "learning_rate": 5.288424800588775e-06, "loss": 0.0195, "step": 4792 }, { "epoch": 1.6006011020203705, "grad_norm": 0.2727773278746564, "learning_rate": 5.2864846814955e-06, "loss": 0.0249, "step": 4793 }, { "epoch": 1.6009350475872433, "grad_norm": 0.23916156551333528, "learning_rate": 5.28454451912534e-06, "loss": 0.0212, "step": 4794 }, { "epoch": 1.601268993154116, "grad_norm": 0.3076333340386845, "learning_rate": 5.28260431377138e-06, "loss": 0.0223, "step": 4795 }, { "epoch": 1.6016029387209885, "grad_norm": 0.6434255776259884, "learning_rate": 5.280664065726712e-06, "loss": 0.0325, "step": 4796 }, { "epoch": 1.601936884287861, "grad_norm": 0.2842031209752502, "learning_rate": 5.278723775284432e-06, "loss": 0.0297, "step": 4797 }, { "epoch": 1.6022708298547337, "grad_norm": 0.25222956898096344, "learning_rate": 5.276783442737642e-06, "loss": 0.0255, "step": 4798 }, { "epoch": 1.6026047754216064, "grad_norm": 0.321502376766453, "learning_rate": 5.274843068379456e-06, "loss": 0.0307, "step": 4799 }, { "epoch": 1.602938720988479, "grad_norm": 0.29430310303389207, "learning_rate": 5.272902652502988e-06, "loss": 0.0237, "step": 4800 }, { "epoch": 1.6032726665553514, "grad_norm": 0.27058933831977283, "learning_rate": 5.27096219540136e-06, "loss": 0.0305, "step": 4801 }, { "epoch": 1.603606612122224, "grad_norm": 0.2555684774059336, "learning_rate": 5.269021697367702e-06, "loss": 0.0242, "step": 4802 }, { "epoch": 1.6039405576890968, "grad_norm": 0.3308202615222362, "learning_rate": 5.26708115869515e-06, "loss": 0.0239, "step": 4803 }, { "epoch": 1.6042745032559693, "grad_norm": 0.2510960013556946, "learning_rate": 5.265140579676844e-06, "loss": 0.0258, "step": 4804 }, { "epoch": 1.6046084488228418, "grad_norm": 0.21484759970123984, "learning_rate": 5.263199960605931e-06, "loss": 0.0194, "step": 4805 }, { "epoch": 1.6049423943897145, "grad_norm": 0.3116786428964571, "learning_rate": 5.261259301775564e-06, "loss": 0.034, "step": 4806 }, { "epoch": 1.605276339956587, "grad_norm": 0.2277523219865577, "learning_rate": 5.259318603478904e-06, "loss": 0.0169, "step": 4807 }, { "epoch": 1.6056102855234595, "grad_norm": 0.48014624121360283, "learning_rate": 5.2573778660091156e-06, "loss": 0.0371, "step": 4808 }, { "epoch": 1.6059442310903322, "grad_norm": 0.29796799196893226, "learning_rate": 5.255437089659371e-06, "loss": 0.0293, "step": 4809 }, { "epoch": 1.606278176657205, "grad_norm": 0.4735160658342941, "learning_rate": 5.253496274722846e-06, "loss": 0.0348, "step": 4810 }, { "epoch": 1.6066121222240775, "grad_norm": 0.21864767422674275, "learning_rate": 5.251555421492722e-06, "loss": 0.0221, "step": 4811 }, { "epoch": 1.60694606779095, "grad_norm": 0.22988882576601047, "learning_rate": 5.249614530262191e-06, "loss": 0.0168, "step": 4812 }, { "epoch": 1.6072800133578227, "grad_norm": 0.23866870118765524, "learning_rate": 5.2476736013244475e-06, "loss": 0.0188, "step": 4813 }, { "epoch": 1.6076139589246954, "grad_norm": 0.242067290022722, "learning_rate": 5.245732634972688e-06, "loss": 0.0207, "step": 4814 }, { "epoch": 1.6079479044915679, "grad_norm": 0.27981419296431, "learning_rate": 5.243791631500122e-06, "loss": 0.0256, "step": 4815 }, { "epoch": 1.6082818500584404, "grad_norm": 0.2874140162528723, "learning_rate": 5.24185059119996e-06, "loss": 0.0182, "step": 4816 }, { "epoch": 1.608615795625313, "grad_norm": 0.4656270336515884, "learning_rate": 5.239909514365415e-06, "loss": 0.0198, "step": 4817 }, { "epoch": 1.6089497411921858, "grad_norm": 0.27341091957596597, "learning_rate": 5.237968401289717e-06, "loss": 0.0251, "step": 4818 }, { "epoch": 1.6092836867590583, "grad_norm": 0.23066541951949823, "learning_rate": 5.236027252266088e-06, "loss": 0.0183, "step": 4819 }, { "epoch": 1.6096176323259308, "grad_norm": 0.29781999695816197, "learning_rate": 5.234086067587765e-06, "loss": 0.0354, "step": 4820 }, { "epoch": 1.6099515778928035, "grad_norm": 0.28720055288235125, "learning_rate": 5.232144847547983e-06, "loss": 0.0199, "step": 4821 }, { "epoch": 1.6102855234596762, "grad_norm": 0.4228859821305198, "learning_rate": 5.230203592439989e-06, "loss": 0.0314, "step": 4822 }, { "epoch": 1.6106194690265485, "grad_norm": 0.26080625923864204, "learning_rate": 5.228262302557034e-06, "loss": 0.0222, "step": 4823 }, { "epoch": 1.6109534145934212, "grad_norm": 0.34423070383950344, "learning_rate": 5.226320978192369e-06, "loss": 0.0325, "step": 4824 }, { "epoch": 1.611287360160294, "grad_norm": 0.2234218708852561, "learning_rate": 5.224379619639253e-06, "loss": 0.016, "step": 4825 }, { "epoch": 1.6116213057271664, "grad_norm": 0.4742693452016969, "learning_rate": 5.222438227190957e-06, "loss": 0.0224, "step": 4826 }, { "epoch": 1.611955251294039, "grad_norm": 0.2688246703311745, "learning_rate": 5.220496801140746e-06, "loss": 0.0215, "step": 4827 }, { "epoch": 1.6122891968609117, "grad_norm": 0.40459376139778924, "learning_rate": 5.218555341781897e-06, "loss": 0.0262, "step": 4828 }, { "epoch": 1.6126231424277844, "grad_norm": 0.3428041641425851, "learning_rate": 5.216613849407691e-06, "loss": 0.0312, "step": 4829 }, { "epoch": 1.6129570879946569, "grad_norm": 0.3605583467383309, "learning_rate": 5.214672324311412e-06, "loss": 0.0303, "step": 4830 }, { "epoch": 1.6132910335615294, "grad_norm": 0.31557417238903585, "learning_rate": 5.21273076678635e-06, "loss": 0.0233, "step": 4831 }, { "epoch": 1.613624979128402, "grad_norm": 0.2642680826640008, "learning_rate": 5.210789177125802e-06, "loss": 0.0184, "step": 4832 }, { "epoch": 1.6139589246952748, "grad_norm": 0.3859219655297724, "learning_rate": 5.208847555623066e-06, "loss": 0.0286, "step": 4833 }, { "epoch": 1.6142928702621473, "grad_norm": 0.25315683691876983, "learning_rate": 5.206905902571447e-06, "loss": 0.0182, "step": 4834 }, { "epoch": 1.6146268158290198, "grad_norm": 0.25054761531808073, "learning_rate": 5.204964218264258e-06, "loss": 0.0198, "step": 4835 }, { "epoch": 1.6149607613958925, "grad_norm": 0.26758594323195806, "learning_rate": 5.203022502994808e-06, "loss": 0.0188, "step": 4836 }, { "epoch": 1.6152947069627652, "grad_norm": 0.24297502186397885, "learning_rate": 5.201080757056418e-06, "loss": 0.0233, "step": 4837 }, { "epoch": 1.6156286525296377, "grad_norm": 0.3336641968278252, "learning_rate": 5.1991389807424145e-06, "loss": 0.0298, "step": 4838 }, { "epoch": 1.6159625980965102, "grad_norm": 0.28801638864308404, "learning_rate": 5.1971971743461215e-06, "loss": 0.0218, "step": 4839 }, { "epoch": 1.616296543663383, "grad_norm": 0.33992786590422136, "learning_rate": 5.195255338160873e-06, "loss": 0.0312, "step": 4840 }, { "epoch": 1.6166304892302554, "grad_norm": 0.3300505734606772, "learning_rate": 5.193313472480007e-06, "loss": 0.0351, "step": 4841 }, { "epoch": 1.616964434797128, "grad_norm": 0.2469267585111408, "learning_rate": 5.191371577596866e-06, "loss": 0.0212, "step": 4842 }, { "epoch": 1.6172983803640006, "grad_norm": 0.25766521677771825, "learning_rate": 5.189429653804794e-06, "loss": 0.0227, "step": 4843 }, { "epoch": 1.6176323259308734, "grad_norm": 0.2638100017500362, "learning_rate": 5.187487701397142e-06, "loss": 0.0232, "step": 4844 }, { "epoch": 1.6179662714977459, "grad_norm": 0.30013885783696354, "learning_rate": 5.185545720667266e-06, "loss": 0.0254, "step": 4845 }, { "epoch": 1.6183002170646184, "grad_norm": 0.2777395363844827, "learning_rate": 5.183603711908523e-06, "loss": 0.0343, "step": 4846 }, { "epoch": 1.618634162631491, "grad_norm": 0.25097533760935614, "learning_rate": 5.181661675414278e-06, "loss": 0.0218, "step": 4847 }, { "epoch": 1.6189681081983638, "grad_norm": 0.3333191072539681, "learning_rate": 5.179719611477898e-06, "loss": 0.0292, "step": 4848 }, { "epoch": 1.6193020537652363, "grad_norm": 0.2892431012234522, "learning_rate": 5.1777775203927535e-06, "loss": 0.0259, "step": 4849 }, { "epoch": 1.6196359993321088, "grad_norm": 0.2756012623572237, "learning_rate": 5.175835402452223e-06, "loss": 0.0247, "step": 4850 }, { "epoch": 1.6199699448989815, "grad_norm": 0.314328113990028, "learning_rate": 5.173893257949683e-06, "loss": 0.0237, "step": 4851 }, { "epoch": 1.6203038904658542, "grad_norm": 0.35503470207424787, "learning_rate": 5.17195108717852e-06, "loss": 0.0343, "step": 4852 }, { "epoch": 1.6206378360327267, "grad_norm": 0.33815210771986753, "learning_rate": 5.170008890432121e-06, "loss": 0.0242, "step": 4853 }, { "epoch": 1.6209717815995992, "grad_norm": 0.34596514073694246, "learning_rate": 5.168066668003876e-06, "loss": 0.0299, "step": 4854 }, { "epoch": 1.621305727166472, "grad_norm": 0.33130168382659214, "learning_rate": 5.166124420187182e-06, "loss": 0.0315, "step": 4855 }, { "epoch": 1.6216396727333444, "grad_norm": 0.5759503857143327, "learning_rate": 5.164182147275439e-06, "loss": 0.034, "step": 4856 }, { "epoch": 1.621973618300217, "grad_norm": 0.371117361447823, "learning_rate": 5.16223984956205e-06, "loss": 0.0289, "step": 4857 }, { "epoch": 1.6223075638670896, "grad_norm": 0.3252396054377313, "learning_rate": 5.1602975273404196e-06, "loss": 0.0333, "step": 4858 }, { "epoch": 1.6226415094339623, "grad_norm": 0.2675817867545674, "learning_rate": 5.158355180903961e-06, "loss": 0.0238, "step": 4859 }, { "epoch": 1.6229754550008348, "grad_norm": 0.22672990587307784, "learning_rate": 5.156412810546089e-06, "loss": 0.0151, "step": 4860 }, { "epoch": 1.6233094005677073, "grad_norm": 0.34311026777566395, "learning_rate": 5.154470416560219e-06, "loss": 0.0365, "step": 4861 }, { "epoch": 1.62364334613458, "grad_norm": 0.2885105262846136, "learning_rate": 5.152527999239774e-06, "loss": 0.0196, "step": 4862 }, { "epoch": 1.6239772917014528, "grad_norm": 0.33436878170656337, "learning_rate": 5.150585558878177e-06, "loss": 0.0288, "step": 4863 }, { "epoch": 1.6243112372683253, "grad_norm": 0.2304555832945819, "learning_rate": 5.148643095768861e-06, "loss": 0.0182, "step": 4864 }, { "epoch": 1.6246451828351978, "grad_norm": 0.22989453735018273, "learning_rate": 5.146700610205254e-06, "loss": 0.0193, "step": 4865 }, { "epoch": 1.6249791284020705, "grad_norm": 0.3030183725671978, "learning_rate": 5.144758102480792e-06, "loss": 0.0272, "step": 4866 }, { "epoch": 1.6253130739689432, "grad_norm": 0.2848636087310223, "learning_rate": 5.142815572888915e-06, "loss": 0.0179, "step": 4867 }, { "epoch": 1.6256470195358157, "grad_norm": 0.30807782885876955, "learning_rate": 5.140873021723065e-06, "loss": 0.0164, "step": 4868 }, { "epoch": 1.6259809651026882, "grad_norm": 0.2636911885214975, "learning_rate": 5.138930449276686e-06, "loss": 0.0213, "step": 4869 }, { "epoch": 1.626314910669561, "grad_norm": 0.3594121242116709, "learning_rate": 5.136987855843226e-06, "loss": 0.0335, "step": 4870 }, { "epoch": 1.6266488562364336, "grad_norm": 0.30992688499839566, "learning_rate": 5.135045241716138e-06, "loss": 0.0272, "step": 4871 }, { "epoch": 1.626982801803306, "grad_norm": 0.6901160642035612, "learning_rate": 5.133102607188875e-06, "loss": 0.0462, "step": 4872 }, { "epoch": 1.6273167473701786, "grad_norm": 0.23929753820696184, "learning_rate": 5.131159952554896e-06, "loss": 0.023, "step": 4873 }, { "epoch": 1.6276506929370513, "grad_norm": 0.2724424655945263, "learning_rate": 5.129217278107663e-06, "loss": 0.0282, "step": 4874 }, { "epoch": 1.6279846385039238, "grad_norm": 0.282057777681632, "learning_rate": 5.127274584140636e-06, "loss": 0.0213, "step": 4875 }, { "epoch": 1.6283185840707963, "grad_norm": 0.3379046637925344, "learning_rate": 5.125331870947287e-06, "loss": 0.0296, "step": 4876 }, { "epoch": 1.628652529637669, "grad_norm": 0.24309572848826755, "learning_rate": 5.123389138821084e-06, "loss": 0.0199, "step": 4877 }, { "epoch": 1.6289864752045418, "grad_norm": 0.36315804777175525, "learning_rate": 5.121446388055497e-06, "loss": 0.0367, "step": 4878 }, { "epoch": 1.6293204207714143, "grad_norm": 0.23689374667209845, "learning_rate": 5.119503618944004e-06, "loss": 0.0192, "step": 4879 }, { "epoch": 1.6296543663382868, "grad_norm": 0.22554833791099554, "learning_rate": 5.117560831780082e-06, "loss": 0.0198, "step": 4880 }, { "epoch": 1.6299883119051595, "grad_norm": 0.2855831070144288, "learning_rate": 5.115618026857211e-06, "loss": 0.0214, "step": 4881 }, { "epoch": 1.6303222574720322, "grad_norm": 0.28459196633380857, "learning_rate": 5.113675204468876e-06, "loss": 0.0268, "step": 4882 }, { "epoch": 1.6306562030389047, "grad_norm": 0.2986998450636787, "learning_rate": 5.111732364908564e-06, "loss": 0.027, "step": 4883 }, { "epoch": 1.6309901486057772, "grad_norm": 0.21241646747444373, "learning_rate": 5.109789508469761e-06, "loss": 0.019, "step": 4884 }, { "epoch": 1.63132409417265, "grad_norm": 0.24206957076505653, "learning_rate": 5.107846635445962e-06, "loss": 0.026, "step": 4885 }, { "epoch": 1.6316580397395226, "grad_norm": 0.37132477665509145, "learning_rate": 5.1059037461306586e-06, "loss": 0.0271, "step": 4886 }, { "epoch": 1.631991985306395, "grad_norm": 0.3072025606318378, "learning_rate": 5.103960840817346e-06, "loss": 0.0236, "step": 4887 }, { "epoch": 1.6323259308732676, "grad_norm": 0.2767586912028166, "learning_rate": 5.1020179197995245e-06, "loss": 0.03, "step": 4888 }, { "epoch": 1.6326598764401403, "grad_norm": 0.3670096785316321, "learning_rate": 5.1000749833706964e-06, "loss": 0.0238, "step": 4889 }, { "epoch": 1.6329938220070128, "grad_norm": 0.30533979712995624, "learning_rate": 5.098132031824362e-06, "loss": 0.0239, "step": 4890 }, { "epoch": 1.6333277675738853, "grad_norm": 0.2868800217312553, "learning_rate": 5.096189065454029e-06, "loss": 0.0241, "step": 4891 }, { "epoch": 1.633661713140758, "grad_norm": 0.298824688834227, "learning_rate": 5.094246084553206e-06, "loss": 0.0386, "step": 4892 }, { "epoch": 1.6339956587076307, "grad_norm": 0.2670158781325221, "learning_rate": 5.092303089415403e-06, "loss": 0.0191, "step": 4893 }, { "epoch": 1.6343296042745032, "grad_norm": 0.26080028444541575, "learning_rate": 5.09036008033413e-06, "loss": 0.0245, "step": 4894 }, { "epoch": 1.6346635498413757, "grad_norm": 0.35152356013618197, "learning_rate": 5.0884170576029034e-06, "loss": 0.0425, "step": 4895 }, { "epoch": 1.6349974954082485, "grad_norm": 0.37809725267474514, "learning_rate": 5.086474021515238e-06, "loss": 0.0287, "step": 4896 }, { "epoch": 1.6353314409751212, "grad_norm": 0.3471898600544406, "learning_rate": 5.084530972364656e-06, "loss": 0.0259, "step": 4897 }, { "epoch": 1.6356653865419937, "grad_norm": 0.3368310914230557, "learning_rate": 5.082587910444674e-06, "loss": 0.0273, "step": 4898 }, { "epoch": 1.6359993321088662, "grad_norm": 0.36674910929416554, "learning_rate": 5.080644836048815e-06, "loss": 0.0237, "step": 4899 }, { "epoch": 1.6363332776757389, "grad_norm": 0.234722678580623, "learning_rate": 5.0787017494706035e-06, "loss": 0.019, "step": 4900 }, { "epoch": 1.6366672232426116, "grad_norm": 0.32775241114624526, "learning_rate": 5.076758651003567e-06, "loss": 0.0229, "step": 4901 }, { "epoch": 1.637001168809484, "grad_norm": 0.27395380220635784, "learning_rate": 5.0748155409412325e-06, "loss": 0.0247, "step": 4902 }, { "epoch": 1.6373351143763566, "grad_norm": 0.26414085875841076, "learning_rate": 5.0728724195771295e-06, "loss": 0.0253, "step": 4903 }, { "epoch": 1.6376690599432293, "grad_norm": 0.32636371422360083, "learning_rate": 5.070929287204789e-06, "loss": 0.0207, "step": 4904 }, { "epoch": 1.6380030055101018, "grad_norm": 0.3133734134501154, "learning_rate": 5.068986144117746e-06, "loss": 0.0236, "step": 4905 }, { "epoch": 1.6383369510769743, "grad_norm": 0.38445653705556826, "learning_rate": 5.067042990609533e-06, "loss": 0.0308, "step": 4906 }, { "epoch": 1.638670896643847, "grad_norm": 0.34188288130237365, "learning_rate": 5.065099826973685e-06, "loss": 0.0313, "step": 4907 }, { "epoch": 1.6390048422107197, "grad_norm": 0.2945760238756098, "learning_rate": 5.0631566535037435e-06, "loss": 0.025, "step": 4908 }, { "epoch": 1.6393387877775922, "grad_norm": 0.250507187653982, "learning_rate": 5.061213470493246e-06, "loss": 0.0219, "step": 4909 }, { "epoch": 1.6396727333444647, "grad_norm": 0.35430706689850594, "learning_rate": 5.059270278235732e-06, "loss": 0.0365, "step": 4910 }, { "epoch": 1.6400066789113374, "grad_norm": 0.26003382451159485, "learning_rate": 5.057327077024745e-06, "loss": 0.023, "step": 4911 }, { "epoch": 1.6403406244782102, "grad_norm": 0.32358799810408434, "learning_rate": 5.055383867153829e-06, "loss": 0.0288, "step": 4912 }, { "epoch": 1.6406745700450827, "grad_norm": 0.24320296939869507, "learning_rate": 5.053440648916526e-06, "loss": 0.018, "step": 4913 }, { "epoch": 1.6410085156119552, "grad_norm": 0.3309120854791246, "learning_rate": 5.051497422606385e-06, "loss": 0.0219, "step": 4914 }, { "epoch": 1.6413424611788279, "grad_norm": 0.24222949820992562, "learning_rate": 5.049554188516952e-06, "loss": 0.022, "step": 4915 }, { "epoch": 1.6416764067457006, "grad_norm": 0.28324159577532293, "learning_rate": 5.047610946941775e-06, "loss": 0.0252, "step": 4916 }, { "epoch": 1.642010352312573, "grad_norm": 0.2467957660971369, "learning_rate": 5.045667698174403e-06, "loss": 0.0197, "step": 4917 }, { "epoch": 1.6423442978794456, "grad_norm": 0.270345628312582, "learning_rate": 5.043724442508388e-06, "loss": 0.0235, "step": 4918 }, { "epoch": 1.6426782434463183, "grad_norm": 0.3826462739790992, "learning_rate": 5.0417811802372815e-06, "loss": 0.0253, "step": 4919 }, { "epoch": 1.643012189013191, "grad_norm": 0.2954307686434209, "learning_rate": 5.039837911654637e-06, "loss": 0.023, "step": 4920 }, { "epoch": 1.6433461345800633, "grad_norm": 0.25784881747962524, "learning_rate": 5.037894637054005e-06, "loss": 0.0194, "step": 4921 }, { "epoch": 1.643680080146936, "grad_norm": 0.3703488806793411, "learning_rate": 5.035951356728942e-06, "loss": 0.0263, "step": 4922 }, { "epoch": 1.6440140257138087, "grad_norm": 0.23582795623156796, "learning_rate": 5.034008070973004e-06, "loss": 0.0207, "step": 4923 }, { "epoch": 1.6443479712806812, "grad_norm": 0.4003376452782364, "learning_rate": 5.032064780079746e-06, "loss": 0.0342, "step": 4924 }, { "epoch": 1.6446819168475537, "grad_norm": 0.2683086827761114, "learning_rate": 5.030121484342725e-06, "loss": 0.0159, "step": 4925 }, { "epoch": 1.6450158624144264, "grad_norm": 0.2923408313808179, "learning_rate": 5.0281781840555e-06, "loss": 0.0165, "step": 4926 }, { "epoch": 1.6453498079812992, "grad_norm": 0.2837884708755334, "learning_rate": 5.026234879511629e-06, "loss": 0.0255, "step": 4927 }, { "epoch": 1.6456837535481716, "grad_norm": 0.3337187235406368, "learning_rate": 5.024291571004668e-06, "loss": 0.0299, "step": 4928 }, { "epoch": 1.6460176991150441, "grad_norm": 0.2334689315079052, "learning_rate": 5.022348258828181e-06, "loss": 0.0188, "step": 4929 }, { "epoch": 1.6463516446819169, "grad_norm": 0.35295419225541663, "learning_rate": 5.020404943275727e-06, "loss": 0.0246, "step": 4930 }, { "epoch": 1.6466855902487896, "grad_norm": 0.46047678634995604, "learning_rate": 5.018461624640864e-06, "loss": 0.0273, "step": 4931 }, { "epoch": 1.647019535815662, "grad_norm": 0.42576537149936977, "learning_rate": 5.016518303217157e-06, "loss": 0.03, "step": 4932 }, { "epoch": 1.6473534813825346, "grad_norm": 0.31993065717037794, "learning_rate": 5.014574979298166e-06, "loss": 0.0266, "step": 4933 }, { "epoch": 1.6476874269494073, "grad_norm": 0.43189933456424034, "learning_rate": 5.012631653177451e-06, "loss": 0.0346, "step": 4934 }, { "epoch": 1.64802137251628, "grad_norm": 0.24598475765528363, "learning_rate": 5.010688325148577e-06, "loss": 0.019, "step": 4935 }, { "epoch": 1.6483553180831525, "grad_norm": 0.2454250345953795, "learning_rate": 5.008744995505107e-06, "loss": 0.0231, "step": 4936 }, { "epoch": 1.648689263650025, "grad_norm": 0.2958006817103818, "learning_rate": 5.0068016645406e-06, "loss": 0.0227, "step": 4937 }, { "epoch": 1.6490232092168977, "grad_norm": 0.328464060756527, "learning_rate": 5.0048583325486234e-06, "loss": 0.029, "step": 4938 }, { "epoch": 1.6493571547837702, "grad_norm": 0.2824788317369057, "learning_rate": 5.002914999822737e-06, "loss": 0.0262, "step": 4939 }, { "epoch": 1.6496911003506427, "grad_norm": 0.3191445345770001, "learning_rate": 5.000971666656508e-06, "loss": 0.0189, "step": 4940 }, { "epoch": 1.6500250459175154, "grad_norm": 0.3926108442312257, "learning_rate": 4.999028333343494e-06, "loss": 0.0322, "step": 4941 }, { "epoch": 1.6503589914843881, "grad_norm": 0.26382261560676995, "learning_rate": 4.9970850001772634e-06, "loss": 0.0255, "step": 4942 }, { "epoch": 1.6506929370512606, "grad_norm": 0.48378684788902976, "learning_rate": 4.995141667451378e-06, "loss": 0.038, "step": 4943 }, { "epoch": 1.6510268826181331, "grad_norm": 0.25734112215571786, "learning_rate": 4.993198335459401e-06, "loss": 0.0204, "step": 4944 }, { "epoch": 1.6513608281850058, "grad_norm": 0.427964087391538, "learning_rate": 4.991255004494896e-06, "loss": 0.03, "step": 4945 }, { "epoch": 1.6516947737518786, "grad_norm": 0.3083034757934975, "learning_rate": 4.989311674851424e-06, "loss": 0.025, "step": 4946 }, { "epoch": 1.652028719318751, "grad_norm": 0.2904322188828532, "learning_rate": 4.9873683468225495e-06, "loss": 0.0349, "step": 4947 }, { "epoch": 1.6523626648856236, "grad_norm": 0.25018309190884663, "learning_rate": 4.985425020701836e-06, "loss": 0.0232, "step": 4948 }, { "epoch": 1.6526966104524963, "grad_norm": 0.2443123792017699, "learning_rate": 4.983481696782844e-06, "loss": 0.0228, "step": 4949 }, { "epoch": 1.653030556019369, "grad_norm": 0.32784355546403116, "learning_rate": 4.9815383753591365e-06, "loss": 0.0224, "step": 4950 }, { "epoch": 1.6533645015862415, "grad_norm": 0.3556476588955834, "learning_rate": 4.9795950567242754e-06, "loss": 0.0338, "step": 4951 }, { "epoch": 1.653698447153114, "grad_norm": 0.21734563076902141, "learning_rate": 4.9776517411718214e-06, "loss": 0.0156, "step": 4952 }, { "epoch": 1.6540323927199867, "grad_norm": 0.5343447426928528, "learning_rate": 4.9757084289953325e-06, "loss": 0.033, "step": 4953 }, { "epoch": 1.6543663382868592, "grad_norm": 0.2637916442901931, "learning_rate": 4.973765120488373e-06, "loss": 0.0206, "step": 4954 }, { "epoch": 1.6547002838537317, "grad_norm": 0.4122999440215748, "learning_rate": 4.9718218159445015e-06, "loss": 0.0349, "step": 4955 }, { "epoch": 1.6550342294206044, "grad_norm": 0.631519622363533, "learning_rate": 4.969878515657276e-06, "loss": 0.0309, "step": 4956 }, { "epoch": 1.6553681749874771, "grad_norm": 0.2901519199422888, "learning_rate": 4.967935219920257e-06, "loss": 0.0217, "step": 4957 }, { "epoch": 1.6557021205543496, "grad_norm": 0.21665088728859036, "learning_rate": 4.9659919290269986e-06, "loss": 0.0194, "step": 4958 }, { "epoch": 1.6560360661212221, "grad_norm": 0.2702260715546599, "learning_rate": 4.964048643271058e-06, "loss": 0.0272, "step": 4959 }, { "epoch": 1.6563700116880948, "grad_norm": 0.34327017188350023, "learning_rate": 4.962105362945996e-06, "loss": 0.0318, "step": 4960 }, { "epoch": 1.6567039572549676, "grad_norm": 0.3175698342141884, "learning_rate": 4.960162088345365e-06, "loss": 0.0207, "step": 4961 }, { "epoch": 1.65703790282184, "grad_norm": 0.2674284079862701, "learning_rate": 4.958218819762719e-06, "loss": 0.0197, "step": 4962 }, { "epoch": 1.6573718483887125, "grad_norm": 0.21629682957993326, "learning_rate": 4.9562755574916125e-06, "loss": 0.0174, "step": 4963 }, { "epoch": 1.6577057939555853, "grad_norm": 0.28107690820445647, "learning_rate": 4.954332301825597e-06, "loss": 0.026, "step": 4964 }, { "epoch": 1.658039739522458, "grad_norm": 0.34105964033776986, "learning_rate": 4.952389053058226e-06, "loss": 0.0251, "step": 4965 }, { "epoch": 1.6583736850893305, "grad_norm": 0.2805619123027518, "learning_rate": 4.95044581148305e-06, "loss": 0.0165, "step": 4966 }, { "epoch": 1.658707630656203, "grad_norm": 0.3159403043348787, "learning_rate": 4.948502577393617e-06, "loss": 0.0295, "step": 4967 }, { "epoch": 1.6590415762230757, "grad_norm": 0.29156347754202194, "learning_rate": 4.946559351083475e-06, "loss": 0.0247, "step": 4968 }, { "epoch": 1.6593755217899484, "grad_norm": 0.3093528722391559, "learning_rate": 4.944616132846174e-06, "loss": 0.025, "step": 4969 }, { "epoch": 1.6597094673568207, "grad_norm": 0.29106247154535825, "learning_rate": 4.942672922975255e-06, "loss": 0.0303, "step": 4970 }, { "epoch": 1.6600434129236934, "grad_norm": 0.29749313478295963, "learning_rate": 4.940729721764268e-06, "loss": 0.0268, "step": 4971 }, { "epoch": 1.6603773584905661, "grad_norm": 0.29895015574619965, "learning_rate": 4.938786529506755e-06, "loss": 0.0286, "step": 4972 }, { "epoch": 1.6607113040574386, "grad_norm": 0.3307527429050344, "learning_rate": 4.936843346496257e-06, "loss": 0.0334, "step": 4973 }, { "epoch": 1.661045249624311, "grad_norm": 0.3007469570717518, "learning_rate": 4.934900173026316e-06, "loss": 0.024, "step": 4974 }, { "epoch": 1.6613791951911838, "grad_norm": 0.3809458735159086, "learning_rate": 4.93295700939047e-06, "loss": 0.0431, "step": 4975 }, { "epoch": 1.6617131407580565, "grad_norm": 0.37616593115326924, "learning_rate": 4.931013855882255e-06, "loss": 0.0276, "step": 4976 }, { "epoch": 1.662047086324929, "grad_norm": 0.33783893099792933, "learning_rate": 4.929070712795211e-06, "loss": 0.035, "step": 4977 }, { "epoch": 1.6623810318918015, "grad_norm": 0.27044433571857607, "learning_rate": 4.927127580422871e-06, "loss": 0.0205, "step": 4978 }, { "epoch": 1.6627149774586742, "grad_norm": 0.33689189069593234, "learning_rate": 4.925184459058769e-06, "loss": 0.0206, "step": 4979 }, { "epoch": 1.663048923025547, "grad_norm": 0.32713079428308717, "learning_rate": 4.9232413489964345e-06, "loss": 0.0361, "step": 4980 }, { "epoch": 1.6633828685924195, "grad_norm": 0.3093826978462791, "learning_rate": 4.921298250529398e-06, "loss": 0.0281, "step": 4981 }, { "epoch": 1.663716814159292, "grad_norm": 0.2130044144120047, "learning_rate": 4.919355163951186e-06, "loss": 0.02, "step": 4982 }, { "epoch": 1.6640507597261647, "grad_norm": 0.27556262868873405, "learning_rate": 4.917412089555328e-06, "loss": 0.0252, "step": 4983 }, { "epoch": 1.6643847052930374, "grad_norm": 0.2516454214029749, "learning_rate": 4.915469027635345e-06, "loss": 0.0227, "step": 4984 }, { "epoch": 1.6647186508599099, "grad_norm": 0.2651718544270958, "learning_rate": 4.9135259784847625e-06, "loss": 0.0143, "step": 4985 }, { "epoch": 1.6650525964267824, "grad_norm": 0.3111543496451446, "learning_rate": 4.911582942397098e-06, "loss": 0.0267, "step": 4986 }, { "epoch": 1.665386541993655, "grad_norm": 0.3924904379069039, "learning_rate": 4.909639919665872e-06, "loss": 0.029, "step": 4987 }, { "epoch": 1.6657204875605276, "grad_norm": 0.32837484331891814, "learning_rate": 4.907696910584599e-06, "loss": 0.03, "step": 4988 }, { "epoch": 1.6660544331274, "grad_norm": 0.22603938931000145, "learning_rate": 4.905753915446795e-06, "loss": 0.0176, "step": 4989 }, { "epoch": 1.6663883786942728, "grad_norm": 0.3157935549046105, "learning_rate": 4.903810934545972e-06, "loss": 0.0266, "step": 4990 }, { "epoch": 1.6667223242611455, "grad_norm": 0.2541625230640935, "learning_rate": 4.90186796817564e-06, "loss": 0.0247, "step": 4991 }, { "epoch": 1.667056269828018, "grad_norm": 0.19296385447867082, "learning_rate": 4.899925016629307e-06, "loss": 0.0165, "step": 4992 }, { "epoch": 1.6673902153948905, "grad_norm": 0.3805727338041553, "learning_rate": 4.897982080200477e-06, "loss": 0.0392, "step": 4993 }, { "epoch": 1.6677241609617632, "grad_norm": 0.26836873484687457, "learning_rate": 4.896039159182655e-06, "loss": 0.022, "step": 4994 }, { "epoch": 1.668058106528636, "grad_norm": 0.26205931655509546, "learning_rate": 4.894096253869343e-06, "loss": 0.0217, "step": 4995 }, { "epoch": 1.6683920520955084, "grad_norm": 0.34799301349475786, "learning_rate": 4.89215336455404e-06, "loss": 0.0252, "step": 4996 }, { "epoch": 1.668725997662381, "grad_norm": 0.25375672055132636, "learning_rate": 4.89021049153024e-06, "loss": 0.019, "step": 4997 }, { "epoch": 1.6690599432292537, "grad_norm": 0.1707851497636513, "learning_rate": 4.888267635091439e-06, "loss": 0.0118, "step": 4998 }, { "epoch": 1.6693938887961264, "grad_norm": 0.35173002736254716, "learning_rate": 4.886324795531126e-06, "loss": 0.0272, "step": 4999 }, { "epoch": 1.6697278343629989, "grad_norm": 0.23934163516206688, "learning_rate": 4.88438197314279e-06, "loss": 0.0176, "step": 5000 }, { "epoch": 1.6700617799298714, "grad_norm": 0.2814792448653315, "learning_rate": 4.88243916821992e-06, "loss": 0.0284, "step": 5001 }, { "epoch": 1.670395725496744, "grad_norm": 0.2720343620929229, "learning_rate": 4.880496381055998e-06, "loss": 0.0244, "step": 5002 }, { "epoch": 1.6707296710636166, "grad_norm": 0.32709680479017667, "learning_rate": 4.878553611944505e-06, "loss": 0.0237, "step": 5003 }, { "epoch": 1.671063616630489, "grad_norm": 0.2350543918677866, "learning_rate": 4.876610861178918e-06, "loss": 0.0207, "step": 5004 }, { "epoch": 1.6713975621973618, "grad_norm": 0.26545230223903515, "learning_rate": 4.874668129052712e-06, "loss": 0.0211, "step": 5005 }, { "epoch": 1.6717315077642345, "grad_norm": 0.282447405556621, "learning_rate": 4.872725415859363e-06, "loss": 0.0212, "step": 5006 }, { "epoch": 1.672065453331107, "grad_norm": 0.24449657408298028, "learning_rate": 4.8707827218923385e-06, "loss": 0.0297, "step": 5007 }, { "epoch": 1.6723993988979795, "grad_norm": 0.4229927007115372, "learning_rate": 4.868840047445106e-06, "loss": 0.0406, "step": 5008 }, { "epoch": 1.6727333444648522, "grad_norm": 0.255050706127349, "learning_rate": 4.866897392811127e-06, "loss": 0.0208, "step": 5009 }, { "epoch": 1.673067290031725, "grad_norm": 0.3434314227523628, "learning_rate": 4.864954758283865e-06, "loss": 0.0377, "step": 5010 }, { "epoch": 1.6734012355985974, "grad_norm": 0.24896547532933488, "learning_rate": 4.8630121441567755e-06, "loss": 0.0289, "step": 5011 }, { "epoch": 1.67373518116547, "grad_norm": 0.19163970837587832, "learning_rate": 4.861069550723316e-06, "loss": 0.0204, "step": 5012 }, { "epoch": 1.6740691267323426, "grad_norm": 0.293813592423613, "learning_rate": 4.859126978276937e-06, "loss": 0.0303, "step": 5013 }, { "epoch": 1.6744030722992154, "grad_norm": 0.30395564935825325, "learning_rate": 4.857184427111086e-06, "loss": 0.0302, "step": 5014 }, { "epoch": 1.6747370178660879, "grad_norm": 0.2145584342304078, "learning_rate": 4.855241897519209e-06, "loss": 0.0263, "step": 5015 }, { "epoch": 1.6750709634329604, "grad_norm": 0.3326339898574843, "learning_rate": 4.8532993897947464e-06, "loss": 0.0272, "step": 5016 }, { "epoch": 1.675404908999833, "grad_norm": 0.2685224873280316, "learning_rate": 4.851356904231139e-06, "loss": 0.0226, "step": 5017 }, { "epoch": 1.6757388545667058, "grad_norm": 0.2736298493314978, "learning_rate": 4.849414441121823e-06, "loss": 0.0287, "step": 5018 }, { "epoch": 1.676072800133578, "grad_norm": 0.29949992752936794, "learning_rate": 4.847472000760228e-06, "loss": 0.0203, "step": 5019 }, { "epoch": 1.6764067457004508, "grad_norm": 0.4163779178955796, "learning_rate": 4.845529583439783e-06, "loss": 0.0331, "step": 5020 }, { "epoch": 1.6767406912673235, "grad_norm": 0.28545506524486947, "learning_rate": 4.843587189453914e-06, "loss": 0.0274, "step": 5021 }, { "epoch": 1.677074636834196, "grad_norm": 0.3181720812252546, "learning_rate": 4.84164481909604e-06, "loss": 0.0261, "step": 5022 }, { "epoch": 1.6774085824010685, "grad_norm": 0.3070593709975989, "learning_rate": 4.839702472659581e-06, "loss": 0.0279, "step": 5023 }, { "epoch": 1.6777425279679412, "grad_norm": 0.2729106425334527, "learning_rate": 4.837760150437952e-06, "loss": 0.0206, "step": 5024 }, { "epoch": 1.678076473534814, "grad_norm": 0.29145481234092924, "learning_rate": 4.8358178527245625e-06, "loss": 0.0248, "step": 5025 }, { "epoch": 1.6784104191016864, "grad_norm": 0.3115074242308137, "learning_rate": 4.83387557981282e-06, "loss": 0.0226, "step": 5026 }, { "epoch": 1.678744364668559, "grad_norm": 0.2944538379749023, "learning_rate": 4.831933331996126e-06, "loss": 0.0192, "step": 5027 }, { "epoch": 1.6790783102354316, "grad_norm": 0.307979192560287, "learning_rate": 4.8299911095678816e-06, "loss": 0.0283, "step": 5028 }, { "epoch": 1.6794122558023044, "grad_norm": 0.2662280595537268, "learning_rate": 4.82804891282148e-06, "loss": 0.0203, "step": 5029 }, { "epoch": 1.6797462013691769, "grad_norm": 0.222976795987867, "learning_rate": 4.8261067420503175e-06, "loss": 0.0175, "step": 5030 }, { "epoch": 1.6800801469360493, "grad_norm": 0.23873787386123677, "learning_rate": 4.8241645975477785e-06, "loss": 0.0188, "step": 5031 }, { "epoch": 1.680414092502922, "grad_norm": 0.2925453023614851, "learning_rate": 4.822222479607247e-06, "loss": 0.0211, "step": 5032 }, { "epoch": 1.6807480380697948, "grad_norm": 0.35771726874298143, "learning_rate": 4.820280388522104e-06, "loss": 0.0308, "step": 5033 }, { "epoch": 1.6810819836366673, "grad_norm": 0.2684815292118438, "learning_rate": 4.818338324585725e-06, "loss": 0.0186, "step": 5034 }, { "epoch": 1.6814159292035398, "grad_norm": 0.2983882412900993, "learning_rate": 4.816396288091478e-06, "loss": 0.0254, "step": 5035 }, { "epoch": 1.6817498747704125, "grad_norm": 0.27390334861865234, "learning_rate": 4.814454279332737e-06, "loss": 0.0199, "step": 5036 }, { "epoch": 1.682083820337285, "grad_norm": 0.41113374231178185, "learning_rate": 4.81251229860286e-06, "loss": 0.0191, "step": 5037 }, { "epoch": 1.6824177659041575, "grad_norm": 0.6393521000517295, "learning_rate": 4.810570346195207e-06, "loss": 0.0328, "step": 5038 }, { "epoch": 1.6827517114710302, "grad_norm": 0.30404000290546124, "learning_rate": 4.808628422403135e-06, "loss": 0.022, "step": 5039 }, { "epoch": 1.683085657037903, "grad_norm": 0.2962533988056783, "learning_rate": 4.806686527519994e-06, "loss": 0.0233, "step": 5040 }, { "epoch": 1.6834196026047754, "grad_norm": 0.2950371437436768, "learning_rate": 4.804744661839128e-06, "loss": 0.0303, "step": 5041 }, { "epoch": 1.683753548171648, "grad_norm": 0.2075556348178394, "learning_rate": 4.80280282565388e-06, "loss": 0.0168, "step": 5042 }, { "epoch": 1.6840874937385206, "grad_norm": 0.2673260805204257, "learning_rate": 4.800861019257587e-06, "loss": 0.0225, "step": 5043 }, { "epoch": 1.6844214393053933, "grad_norm": 0.34873262895580137, "learning_rate": 4.798919242943583e-06, "loss": 0.029, "step": 5044 }, { "epoch": 1.6847553848722658, "grad_norm": 0.307026568834567, "learning_rate": 4.796977497005194e-06, "loss": 0.0253, "step": 5045 }, { "epoch": 1.6850893304391383, "grad_norm": 0.5801891199429996, "learning_rate": 4.795035781735743e-06, "loss": 0.0203, "step": 5046 }, { "epoch": 1.685423276006011, "grad_norm": 0.3324355995162621, "learning_rate": 4.793094097428552e-06, "loss": 0.0297, "step": 5047 }, { "epoch": 1.6857572215728838, "grad_norm": 0.23355545241862927, "learning_rate": 4.7911524443769346e-06, "loss": 0.0209, "step": 5048 }, { "epoch": 1.6860911671397563, "grad_norm": 0.30561981730612925, "learning_rate": 4.789210822874199e-06, "loss": 0.0178, "step": 5049 }, { "epoch": 1.6864251127066288, "grad_norm": 0.27650548852788287, "learning_rate": 4.787269233213651e-06, "loss": 0.0234, "step": 5050 }, { "epoch": 1.6867590582735015, "grad_norm": 0.21521902160155762, "learning_rate": 4.785327675688591e-06, "loss": 0.0214, "step": 5051 }, { "epoch": 1.687093003840374, "grad_norm": 0.24266712803809304, "learning_rate": 4.7833861505923096e-06, "loss": 0.021, "step": 5052 }, { "epoch": 1.6874269494072465, "grad_norm": 0.31718648799906535, "learning_rate": 4.781444658218103e-06, "loss": 0.0345, "step": 5053 }, { "epoch": 1.6877608949741192, "grad_norm": 0.5583495864488831, "learning_rate": 4.779503198859255e-06, "loss": 0.0264, "step": 5054 }, { "epoch": 1.688094840540992, "grad_norm": 0.3001913541199259, "learning_rate": 4.777561772809045e-06, "loss": 0.0225, "step": 5055 }, { "epoch": 1.6884287861078644, "grad_norm": 0.3979758757656625, "learning_rate": 4.775620380360747e-06, "loss": 0.0254, "step": 5056 }, { "epoch": 1.688762731674737, "grad_norm": 0.8123384527348672, "learning_rate": 4.773679021807634e-06, "loss": 0.033, "step": 5057 }, { "epoch": 1.6890966772416096, "grad_norm": 0.23010428257933566, "learning_rate": 4.771737697442968e-06, "loss": 0.0181, "step": 5058 }, { "epoch": 1.6894306228084823, "grad_norm": 0.28248420722514384, "learning_rate": 4.7697964075600114e-06, "loss": 0.0257, "step": 5059 }, { "epoch": 1.6897645683753548, "grad_norm": 0.33196036112840116, "learning_rate": 4.767855152452019e-06, "loss": 0.0237, "step": 5060 }, { "epoch": 1.6900985139422273, "grad_norm": 0.40334280586752824, "learning_rate": 4.765913932412237e-06, "loss": 0.0416, "step": 5061 }, { "epoch": 1.6904324595091, "grad_norm": 0.2651581033201635, "learning_rate": 4.763972747733913e-06, "loss": 0.027, "step": 5062 }, { "epoch": 1.6907664050759728, "grad_norm": 0.3741347101468278, "learning_rate": 4.762031598710285e-06, "loss": 0.0235, "step": 5063 }, { "epoch": 1.6911003506428453, "grad_norm": 0.3128418321858523, "learning_rate": 4.760090485634584e-06, "loss": 0.0294, "step": 5064 }, { "epoch": 1.6914342962097177, "grad_norm": 0.2593181856558939, "learning_rate": 4.758149408800042e-06, "loss": 0.0244, "step": 5065 }, { "epoch": 1.6917682417765905, "grad_norm": 0.25238998632163034, "learning_rate": 4.756208368499879e-06, "loss": 0.0227, "step": 5066 }, { "epoch": 1.692102187343463, "grad_norm": 0.24034790988450105, "learning_rate": 4.754267365027314e-06, "loss": 0.0195, "step": 5067 }, { "epoch": 1.6924361329103355, "grad_norm": 0.30594466697486583, "learning_rate": 4.752326398675555e-06, "loss": 0.023, "step": 5068 }, { "epoch": 1.6927700784772082, "grad_norm": 0.3484229596217318, "learning_rate": 4.750385469737811e-06, "loss": 0.0269, "step": 5069 }, { "epoch": 1.693104024044081, "grad_norm": 0.2868801325419484, "learning_rate": 4.748444578507278e-06, "loss": 0.0256, "step": 5070 }, { "epoch": 1.6934379696109534, "grad_norm": 0.35948168405650954, "learning_rate": 4.746503725277156e-06, "loss": 0.0276, "step": 5071 }, { "epoch": 1.6937719151778259, "grad_norm": 0.405572432672421, "learning_rate": 4.744562910340631e-06, "loss": 0.035, "step": 5072 }, { "epoch": 1.6941058607446986, "grad_norm": 0.2597793140715524, "learning_rate": 4.742622133990885e-06, "loss": 0.024, "step": 5073 }, { "epoch": 1.6944398063115713, "grad_norm": 0.34892402665041466, "learning_rate": 4.740681396521097e-06, "loss": 0.033, "step": 5074 }, { "epoch": 1.6947737518784438, "grad_norm": 0.4626790306972768, "learning_rate": 4.738740698224438e-06, "loss": 0.0336, "step": 5075 }, { "epoch": 1.6951076974453163, "grad_norm": 0.26948137870452266, "learning_rate": 4.73680003939407e-06, "loss": 0.0183, "step": 5076 }, { "epoch": 1.695441643012189, "grad_norm": 0.3210469611402509, "learning_rate": 4.734859420323158e-06, "loss": 0.0298, "step": 5077 }, { "epoch": 1.6957755885790617, "grad_norm": 0.36254289631206843, "learning_rate": 4.7329188413048515e-06, "loss": 0.0235, "step": 5078 }, { "epoch": 1.6961095341459342, "grad_norm": 0.2519877993211459, "learning_rate": 4.7309783026322995e-06, "loss": 0.0179, "step": 5079 }, { "epoch": 1.6964434797128067, "grad_norm": 0.25224254971516313, "learning_rate": 4.7290378045986425e-06, "loss": 0.0264, "step": 5080 }, { "epoch": 1.6967774252796795, "grad_norm": 0.33297507889500944, "learning_rate": 4.727097347497014e-06, "loss": 0.0309, "step": 5081 }, { "epoch": 1.6971113708465522, "grad_norm": 0.34603534888592014, "learning_rate": 4.7251569316205455e-06, "loss": 0.0267, "step": 5082 }, { "epoch": 1.6974453164134247, "grad_norm": 0.3009500608729052, "learning_rate": 4.723216557262359e-06, "loss": 0.0223, "step": 5083 }, { "epoch": 1.6977792619802972, "grad_norm": 0.40241970379084385, "learning_rate": 4.721276224715569e-06, "loss": 0.028, "step": 5084 }, { "epoch": 1.6981132075471699, "grad_norm": 0.2164773872430984, "learning_rate": 4.719335934273289e-06, "loss": 0.0147, "step": 5085 }, { "epoch": 1.6984471531140424, "grad_norm": 0.34221091426098466, "learning_rate": 4.717395686228621e-06, "loss": 0.0233, "step": 5086 }, { "epoch": 1.6987810986809149, "grad_norm": 0.24905328977217717, "learning_rate": 4.715455480874661e-06, "loss": 0.0239, "step": 5087 }, { "epoch": 1.6991150442477876, "grad_norm": 0.31499459708637956, "learning_rate": 4.713515318504501e-06, "loss": 0.0228, "step": 5088 }, { "epoch": 1.6994489898146603, "grad_norm": 0.28305044429283904, "learning_rate": 4.711575199411226e-06, "loss": 0.0242, "step": 5089 }, { "epoch": 1.6997829353815328, "grad_norm": 0.3738602532204356, "learning_rate": 4.7096351238879135e-06, "loss": 0.033, "step": 5090 }, { "epoch": 1.7001168809484053, "grad_norm": 0.640430751504491, "learning_rate": 4.707695092227634e-06, "loss": 0.0311, "step": 5091 }, { "epoch": 1.700450826515278, "grad_norm": 0.30386975850806625, "learning_rate": 4.705755104723453e-06, "loss": 0.0302, "step": 5092 }, { "epoch": 1.7007847720821507, "grad_norm": 0.7792120964224993, "learning_rate": 4.703815161668426e-06, "loss": 0.0291, "step": 5093 }, { "epoch": 1.7011187176490232, "grad_norm": 0.28215177134505537, "learning_rate": 4.701875263355608e-06, "loss": 0.0274, "step": 5094 }, { "epoch": 1.7014526632158957, "grad_norm": 0.37768556717253043, "learning_rate": 4.699935410078042e-06, "loss": 0.0167, "step": 5095 }, { "epoch": 1.7017866087827684, "grad_norm": 0.3064811018277969, "learning_rate": 4.697995602128766e-06, "loss": 0.0253, "step": 5096 }, { "epoch": 1.7021205543496412, "grad_norm": 0.28903167050073053, "learning_rate": 4.696055839800809e-06, "loss": 0.0282, "step": 5097 }, { "epoch": 1.7024544999165137, "grad_norm": 0.34452536171124826, "learning_rate": 4.694116123387197e-06, "loss": 0.0339, "step": 5098 }, { "epoch": 1.7027884454833861, "grad_norm": 0.31291977199400417, "learning_rate": 4.692176453180944e-06, "loss": 0.0232, "step": 5099 }, { "epoch": 1.7031223910502589, "grad_norm": 0.2163732660843581, "learning_rate": 4.6902368294750644e-06, "loss": 0.0163, "step": 5100 }, { "epoch": 1.7034563366171314, "grad_norm": 0.33786859647608486, "learning_rate": 4.688297252562559e-06, "loss": 0.0307, "step": 5101 }, { "epoch": 1.7037902821840039, "grad_norm": 0.27604472899960997, "learning_rate": 4.6863577227364235e-06, "loss": 0.028, "step": 5102 }, { "epoch": 1.7041242277508766, "grad_norm": 0.2840436666898987, "learning_rate": 4.684418240289648e-06, "loss": 0.0255, "step": 5103 }, { "epoch": 1.7044581733177493, "grad_norm": 0.30621435120215135, "learning_rate": 4.682478805515212e-06, "loss": 0.0321, "step": 5104 }, { "epoch": 1.7047921188846218, "grad_norm": 0.4193594079022245, "learning_rate": 4.680539418706091e-06, "loss": 0.0352, "step": 5105 }, { "epoch": 1.7051260644514943, "grad_norm": 0.342921331965035, "learning_rate": 4.678600080155252e-06, "loss": 0.0201, "step": 5106 }, { "epoch": 1.705460010018367, "grad_norm": 0.23207198540603993, "learning_rate": 4.676660790155656e-06, "loss": 0.0189, "step": 5107 }, { "epoch": 1.7057939555852397, "grad_norm": 0.26082751138107063, "learning_rate": 4.674721549000255e-06, "loss": 0.0215, "step": 5108 }, { "epoch": 1.7061279011521122, "grad_norm": 0.3967764398350967, "learning_rate": 4.6727823569819944e-06, "loss": 0.0211, "step": 5109 }, { "epoch": 1.7064618467189847, "grad_norm": 0.2165363032956635, "learning_rate": 4.670843214393811e-06, "loss": 0.0216, "step": 5110 }, { "epoch": 1.7067957922858574, "grad_norm": 0.2974595631503717, "learning_rate": 4.6689041215286344e-06, "loss": 0.0247, "step": 5111 }, { "epoch": 1.7071297378527301, "grad_norm": 0.31041041022123594, "learning_rate": 4.666965078679391e-06, "loss": 0.0233, "step": 5112 }, { "epoch": 1.7074636834196026, "grad_norm": 0.23339664932116805, "learning_rate": 4.665026086138993e-06, "loss": 0.0225, "step": 5113 }, { "epoch": 1.7077976289864751, "grad_norm": 0.2414185889361508, "learning_rate": 4.66308714420035e-06, "loss": 0.0191, "step": 5114 }, { "epoch": 1.7081315745533479, "grad_norm": 0.2714979313342858, "learning_rate": 4.6611482531563595e-06, "loss": 0.0157, "step": 5115 }, { "epoch": 1.7084655201202203, "grad_norm": 0.3195693561574328, "learning_rate": 4.659209413299916e-06, "loss": 0.0328, "step": 5116 }, { "epoch": 1.7087994656870928, "grad_norm": 0.31177788487252767, "learning_rate": 4.657270624923901e-06, "loss": 0.0197, "step": 5117 }, { "epoch": 1.7091334112539656, "grad_norm": 0.43420911966539794, "learning_rate": 4.6553318883211955e-06, "loss": 0.032, "step": 5118 }, { "epoch": 1.7094673568208383, "grad_norm": 0.24534622116361365, "learning_rate": 4.653393203784667e-06, "loss": 0.0231, "step": 5119 }, { "epoch": 1.7098013023877108, "grad_norm": 0.21379368492645143, "learning_rate": 4.651454571607176e-06, "loss": 0.0146, "step": 5120 }, { "epoch": 1.7101352479545833, "grad_norm": 0.37667456250455755, "learning_rate": 4.649515992081576e-06, "loss": 0.0216, "step": 5121 }, { "epoch": 1.710469193521456, "grad_norm": 0.26418397352790207, "learning_rate": 4.64757746550071e-06, "loss": 0.0243, "step": 5122 }, { "epoch": 1.7108031390883287, "grad_norm": 0.28943731105183423, "learning_rate": 4.645638992157419e-06, "loss": 0.0267, "step": 5123 }, { "epoch": 1.7111370846552012, "grad_norm": 0.2594891781153189, "learning_rate": 4.6437005723445316e-06, "loss": 0.02, "step": 5124 }, { "epoch": 1.7114710302220737, "grad_norm": 0.31454282807529704, "learning_rate": 4.6417622063548675e-06, "loss": 0.0281, "step": 5125 }, { "epoch": 1.7118049757889464, "grad_norm": 0.656527704033975, "learning_rate": 4.6398238944812414e-06, "loss": 0.0293, "step": 5126 }, { "epoch": 1.7121389213558191, "grad_norm": 0.2766119451674607, "learning_rate": 4.637885637016456e-06, "loss": 0.03, "step": 5127 }, { "epoch": 1.7124728669226916, "grad_norm": 0.22158608692699336, "learning_rate": 4.635947434253308e-06, "loss": 0.0186, "step": 5128 }, { "epoch": 1.7128068124895641, "grad_norm": 0.24689145353828215, "learning_rate": 4.634009286484586e-06, "loss": 0.0164, "step": 5129 }, { "epoch": 1.7131407580564368, "grad_norm": 0.25846956500234836, "learning_rate": 4.632071194003073e-06, "loss": 0.0219, "step": 5130 }, { "epoch": 1.7134747036233096, "grad_norm": 0.3741788662744667, "learning_rate": 4.630133157101537e-06, "loss": 0.0375, "step": 5131 }, { "epoch": 1.713808649190182, "grad_norm": 0.28587885711693023, "learning_rate": 4.6281951760727435e-06, "loss": 0.0261, "step": 5132 }, { "epoch": 1.7141425947570545, "grad_norm": 0.22445231096509569, "learning_rate": 4.626257251209446e-06, "loss": 0.0198, "step": 5133 }, { "epoch": 1.7144765403239273, "grad_norm": 0.5551044535713995, "learning_rate": 4.624319382804391e-06, "loss": 0.0383, "step": 5134 }, { "epoch": 1.7148104858907998, "grad_norm": 0.2488565314772827, "learning_rate": 4.622381571150317e-06, "loss": 0.0226, "step": 5135 }, { "epoch": 1.7151444314576723, "grad_norm": 0.41971143384081905, "learning_rate": 4.620443816539954e-06, "loss": 0.0194, "step": 5136 }, { "epoch": 1.715478377024545, "grad_norm": 0.23765851993618284, "learning_rate": 4.618506119266021e-06, "loss": 0.0195, "step": 5137 }, { "epoch": 1.7158123225914177, "grad_norm": 0.28219873903593257, "learning_rate": 4.6165684796212306e-06, "loss": 0.0218, "step": 5138 }, { "epoch": 1.7161462681582902, "grad_norm": 0.3627060960703866, "learning_rate": 4.6146308978982865e-06, "loss": 0.0201, "step": 5139 }, { "epoch": 1.7164802137251627, "grad_norm": 0.24050130170589512, "learning_rate": 4.612693374389881e-06, "loss": 0.0207, "step": 5140 }, { "epoch": 1.7168141592920354, "grad_norm": 0.291678696132503, "learning_rate": 4.610755909388703e-06, "loss": 0.0262, "step": 5141 }, { "epoch": 1.7171481048589081, "grad_norm": 0.33128741432440445, "learning_rate": 4.608818503187428e-06, "loss": 0.0231, "step": 5142 }, { "epoch": 1.7174820504257806, "grad_norm": 0.22460146277547102, "learning_rate": 4.606881156078725e-06, "loss": 0.0176, "step": 5143 }, { "epoch": 1.717815995992653, "grad_norm": 0.3173399095946577, "learning_rate": 4.604943868355251e-06, "loss": 0.0262, "step": 5144 }, { "epoch": 1.7181499415595258, "grad_norm": 0.32845666940588364, "learning_rate": 4.603006640309658e-06, "loss": 0.0269, "step": 5145 }, { "epoch": 1.7184838871263985, "grad_norm": 0.24539305320111826, "learning_rate": 4.601069472234584e-06, "loss": 0.0191, "step": 5146 }, { "epoch": 1.718817832693271, "grad_norm": 0.24112482557342005, "learning_rate": 4.599132364422666e-06, "loss": 0.023, "step": 5147 }, { "epoch": 1.7191517782601435, "grad_norm": 0.2584785514803024, "learning_rate": 4.597195317166525e-06, "loss": 0.024, "step": 5148 }, { "epoch": 1.7194857238270163, "grad_norm": 0.25819795597233125, "learning_rate": 4.595258330758773e-06, "loss": 0.021, "step": 5149 }, { "epoch": 1.7198196693938888, "grad_norm": 0.2731501980834195, "learning_rate": 4.593321405492017e-06, "loss": 0.0202, "step": 5150 }, { "epoch": 1.7201536149607612, "grad_norm": 0.2899731157210256, "learning_rate": 4.59138454165885e-06, "loss": 0.0251, "step": 5151 }, { "epoch": 1.720487560527634, "grad_norm": 0.27693644241374615, "learning_rate": 4.589447739551857e-06, "loss": 0.0251, "step": 5152 }, { "epoch": 1.7208215060945067, "grad_norm": 0.3480525121627245, "learning_rate": 4.58751099946362e-06, "loss": 0.0277, "step": 5153 }, { "epoch": 1.7211554516613792, "grad_norm": 0.27701070481131357, "learning_rate": 4.585574321686704e-06, "loss": 0.0257, "step": 5154 }, { "epoch": 1.7214893972282517, "grad_norm": 0.26125681321982575, "learning_rate": 4.583637706513665e-06, "loss": 0.0249, "step": 5155 }, { "epoch": 1.7218233427951244, "grad_norm": 0.2415452060756729, "learning_rate": 4.5817011542370535e-06, "loss": 0.0227, "step": 5156 }, { "epoch": 1.722157288361997, "grad_norm": 0.26543938489252966, "learning_rate": 4.579764665149409e-06, "loss": 0.0217, "step": 5157 }, { "epoch": 1.7224912339288696, "grad_norm": 0.28644415612875096, "learning_rate": 4.577828239543257e-06, "loss": 0.0198, "step": 5158 }, { "epoch": 1.722825179495742, "grad_norm": 0.24786963473825338, "learning_rate": 4.575891877711123e-06, "loss": 0.0254, "step": 5159 }, { "epoch": 1.7231591250626148, "grad_norm": 0.25167514847976147, "learning_rate": 4.573955579945514e-06, "loss": 0.0305, "step": 5160 }, { "epoch": 1.7234930706294875, "grad_norm": 0.2044952223511773, "learning_rate": 4.572019346538931e-06, "loss": 0.0142, "step": 5161 }, { "epoch": 1.72382701619636, "grad_norm": 0.3470529283014334, "learning_rate": 4.570083177783865e-06, "loss": 0.0294, "step": 5162 }, { "epoch": 1.7241609617632325, "grad_norm": 0.28886051693419007, "learning_rate": 4.568147073972795e-06, "loss": 0.0324, "step": 5163 }, { "epoch": 1.7244949073301052, "grad_norm": 0.2575548960939929, "learning_rate": 4.566211035398196e-06, "loss": 0.0272, "step": 5164 }, { "epoch": 1.7248288528969777, "grad_norm": 0.23550200627980475, "learning_rate": 4.564275062352529e-06, "loss": 0.0193, "step": 5165 }, { "epoch": 1.7251627984638502, "grad_norm": 0.3550995831730855, "learning_rate": 4.5623391551282435e-06, "loss": 0.0318, "step": 5166 }, { "epoch": 1.725496744030723, "grad_norm": 0.2433252411104981, "learning_rate": 4.560403314017782e-06, "loss": 0.0218, "step": 5167 }, { "epoch": 1.7258306895975957, "grad_norm": 0.24278489909282805, "learning_rate": 4.558467539313576e-06, "loss": 0.0204, "step": 5168 }, { "epoch": 1.7261646351644682, "grad_norm": 0.2910727371167379, "learning_rate": 4.556531831308045e-06, "loss": 0.0289, "step": 5169 }, { "epoch": 1.7264985807313407, "grad_norm": 0.30541878123697425, "learning_rate": 4.554596190293606e-06, "loss": 0.0231, "step": 5170 }, { "epoch": 1.7268325262982134, "grad_norm": 0.35690239159501624, "learning_rate": 4.552660616562655e-06, "loss": 0.0173, "step": 5171 }, { "epoch": 1.727166471865086, "grad_norm": 0.2581256706438739, "learning_rate": 4.550725110407586e-06, "loss": 0.0224, "step": 5172 }, { "epoch": 1.7275004174319586, "grad_norm": 0.22722182545082023, "learning_rate": 4.548789672120779e-06, "loss": 0.0158, "step": 5173 }, { "epoch": 1.727834362998831, "grad_norm": 0.2604577885495867, "learning_rate": 4.5468543019946045e-06, "loss": 0.0185, "step": 5174 }, { "epoch": 1.7281683085657038, "grad_norm": 0.3221706980927786, "learning_rate": 4.544919000321421e-06, "loss": 0.0216, "step": 5175 }, { "epoch": 1.7285022541325765, "grad_norm": 0.2266912598497077, "learning_rate": 4.542983767393584e-06, "loss": 0.0175, "step": 5176 }, { "epoch": 1.728836199699449, "grad_norm": 0.3941274815544478, "learning_rate": 4.541048603503429e-06, "loss": 0.0432, "step": 5177 }, { "epoch": 1.7291701452663215, "grad_norm": 0.27304423179494175, "learning_rate": 4.539113508943287e-06, "loss": 0.0263, "step": 5178 }, { "epoch": 1.7295040908331942, "grad_norm": 0.242960209646446, "learning_rate": 4.537178484005476e-06, "loss": 0.02, "step": 5179 }, { "epoch": 1.729838036400067, "grad_norm": 0.2741842848720956, "learning_rate": 4.535243528982305e-06, "loss": 0.0185, "step": 5180 }, { "epoch": 1.7301719819669394, "grad_norm": 0.2607738698037124, "learning_rate": 4.53330864416607e-06, "loss": 0.0173, "step": 5181 }, { "epoch": 1.730505927533812, "grad_norm": 0.255859782690453, "learning_rate": 4.531373829849061e-06, "loss": 0.0223, "step": 5182 }, { "epoch": 1.7308398731006847, "grad_norm": 0.28134369235214096, "learning_rate": 4.529439086323552e-06, "loss": 0.0239, "step": 5183 }, { "epoch": 1.7311738186675572, "grad_norm": 0.25464785465833706, "learning_rate": 4.52750441388181e-06, "loss": 0.0224, "step": 5184 }, { "epoch": 1.7315077642344296, "grad_norm": 0.22273440750512966, "learning_rate": 4.52556981281609e-06, "loss": 0.0166, "step": 5185 }, { "epoch": 1.7318417098013024, "grad_norm": 0.29297547989847234, "learning_rate": 4.523635283418635e-06, "loss": 0.0231, "step": 5186 }, { "epoch": 1.732175655368175, "grad_norm": 0.25082038119545436, "learning_rate": 4.521700825981678e-06, "loss": 0.0231, "step": 5187 }, { "epoch": 1.7325096009350476, "grad_norm": 0.25437856871721237, "learning_rate": 4.519766440797446e-06, "loss": 0.0244, "step": 5188 }, { "epoch": 1.73284354650192, "grad_norm": 0.27913666196932985, "learning_rate": 4.517832128158147e-06, "loss": 0.0307, "step": 5189 }, { "epoch": 1.7331774920687928, "grad_norm": 0.20122095846884455, "learning_rate": 4.515897888355982e-06, "loss": 0.0168, "step": 5190 }, { "epoch": 1.7335114376356655, "grad_norm": 0.27965290295736694, "learning_rate": 4.513963721683142e-06, "loss": 0.017, "step": 5191 }, { "epoch": 1.733845383202538, "grad_norm": 0.2580682457275597, "learning_rate": 4.5120296284318035e-06, "loss": 0.0161, "step": 5192 }, { "epoch": 1.7341793287694105, "grad_norm": 0.23191787950935147, "learning_rate": 4.510095608894134e-06, "loss": 0.0207, "step": 5193 }, { "epoch": 1.7345132743362832, "grad_norm": 0.29488894091518947, "learning_rate": 4.508161663362294e-06, "loss": 0.0239, "step": 5194 }, { "epoch": 1.734847219903156, "grad_norm": 0.2894669906650829, "learning_rate": 4.506227792128424e-06, "loss": 0.0261, "step": 5195 }, { "epoch": 1.7351811654700284, "grad_norm": 0.2769926890552985, "learning_rate": 4.504293995484662e-06, "loss": 0.0217, "step": 5196 }, { "epoch": 1.735515111036901, "grad_norm": 0.2817270523625594, "learning_rate": 4.502360273723127e-06, "loss": 0.0228, "step": 5197 }, { "epoch": 1.7358490566037736, "grad_norm": 0.3528872875931685, "learning_rate": 4.500426627135933e-06, "loss": 0.0217, "step": 5198 }, { "epoch": 1.7361830021706461, "grad_norm": 0.3912683429319728, "learning_rate": 4.4984930560151776e-06, "loss": 0.0207, "step": 5199 }, { "epoch": 1.7365169477375186, "grad_norm": 0.26965344769366073, "learning_rate": 4.496559560652952e-06, "loss": 0.0231, "step": 5200 }, { "epoch": 1.7368508933043914, "grad_norm": 0.3743078925794467, "learning_rate": 4.494626141341334e-06, "loss": 0.0229, "step": 5201 }, { "epoch": 1.737184838871264, "grad_norm": 0.2502915835025997, "learning_rate": 4.4926927983723876e-06, "loss": 0.0261, "step": 5202 }, { "epoch": 1.7375187844381366, "grad_norm": 0.34834333501190895, "learning_rate": 4.490759532038166e-06, "loss": 0.0271, "step": 5203 }, { "epoch": 1.737852730005009, "grad_norm": 0.33511590487420073, "learning_rate": 4.488826342630714e-06, "loss": 0.0263, "step": 5204 }, { "epoch": 1.7381866755718818, "grad_norm": 0.3954516710449802, "learning_rate": 4.486893230442062e-06, "loss": 0.0495, "step": 5205 }, { "epoch": 1.7385206211387545, "grad_norm": 0.2259170817390708, "learning_rate": 4.4849601957642295e-06, "loss": 0.0151, "step": 5206 }, { "epoch": 1.738854566705627, "grad_norm": 0.2672302838807095, "learning_rate": 4.483027238889223e-06, "loss": 0.0272, "step": 5207 }, { "epoch": 1.7391885122724995, "grad_norm": 0.26479393205927415, "learning_rate": 4.48109436010904e-06, "loss": 0.0205, "step": 5208 }, { "epoch": 1.7395224578393722, "grad_norm": 0.2575717548051547, "learning_rate": 4.4791615597156635e-06, "loss": 0.0182, "step": 5209 }, { "epoch": 1.739856403406245, "grad_norm": 0.2696827620570794, "learning_rate": 4.477228838001065e-06, "loss": 0.0225, "step": 5210 }, { "epoch": 1.7401903489731174, "grad_norm": 0.26423003954609514, "learning_rate": 4.475296195257206e-06, "loss": 0.021, "step": 5211 }, { "epoch": 1.74052429453999, "grad_norm": 0.27984330617580844, "learning_rate": 4.4733636317760365e-06, "loss": 0.0253, "step": 5212 }, { "epoch": 1.7408582401068626, "grad_norm": 0.25585669363698704, "learning_rate": 4.471431147849491e-06, "loss": 0.0173, "step": 5213 }, { "epoch": 1.7411921856737351, "grad_norm": 0.4212922976308605, "learning_rate": 4.469498743769493e-06, "loss": 0.0391, "step": 5214 }, { "epoch": 1.7415261312406076, "grad_norm": 0.31581981326018854, "learning_rate": 4.467566419827958e-06, "loss": 0.0299, "step": 5215 }, { "epoch": 1.7418600768074803, "grad_norm": 0.3834912936761464, "learning_rate": 4.465634176316782e-06, "loss": 0.0256, "step": 5216 }, { "epoch": 1.742194022374353, "grad_norm": 0.21644224182564878, "learning_rate": 4.463702013527857e-06, "loss": 0.0172, "step": 5217 }, { "epoch": 1.7425279679412256, "grad_norm": 0.330355490467127, "learning_rate": 4.4617699317530585e-06, "loss": 0.0303, "step": 5218 }, { "epoch": 1.742861913508098, "grad_norm": 0.2143760695592441, "learning_rate": 4.459837931284249e-06, "loss": 0.0164, "step": 5219 }, { "epoch": 1.7431958590749708, "grad_norm": 0.2588968504882097, "learning_rate": 4.45790601241328e-06, "loss": 0.0202, "step": 5220 }, { "epoch": 1.7435298046418435, "grad_norm": 0.3282147515400972, "learning_rate": 4.45597417543199e-06, "loss": 0.0241, "step": 5221 }, { "epoch": 1.743863750208716, "grad_norm": 0.3435669407216119, "learning_rate": 4.454042420632206e-06, "loss": 0.0279, "step": 5222 }, { "epoch": 1.7441976957755885, "grad_norm": 0.2101296550084173, "learning_rate": 4.452110748305744e-06, "loss": 0.017, "step": 5223 }, { "epoch": 1.7445316413424612, "grad_norm": 0.21290305536117435, "learning_rate": 4.450179158744405e-06, "loss": 0.0174, "step": 5224 }, { "epoch": 1.744865586909334, "grad_norm": 0.26165712554358617, "learning_rate": 4.448247652239978e-06, "loss": 0.0202, "step": 5225 }, { "epoch": 1.7451995324762064, "grad_norm": 0.23896565599813474, "learning_rate": 4.4463162290842395e-06, "loss": 0.0152, "step": 5226 }, { "epoch": 1.745533478043079, "grad_norm": 0.20755652136602437, "learning_rate": 4.444384889568954e-06, "loss": 0.0154, "step": 5227 }, { "epoch": 1.7458674236099516, "grad_norm": 0.3630635864615925, "learning_rate": 4.442453633985872e-06, "loss": 0.028, "step": 5228 }, { "epoch": 1.7462013691768243, "grad_norm": 0.328488739761976, "learning_rate": 4.4405224626267345e-06, "loss": 0.0236, "step": 5229 }, { "epoch": 1.7465353147436968, "grad_norm": 0.4058031221515435, "learning_rate": 4.438591375783267e-06, "loss": 0.0358, "step": 5230 }, { "epoch": 1.7468692603105693, "grad_norm": 0.35575753423783746, "learning_rate": 4.4366603737471825e-06, "loss": 0.0262, "step": 5231 }, { "epoch": 1.747203205877442, "grad_norm": 0.23757363888091992, "learning_rate": 4.434729456810182e-06, "loss": 0.0156, "step": 5232 }, { "epoch": 1.7475371514443145, "grad_norm": 0.3700115258942405, "learning_rate": 4.432798625263951e-06, "loss": 0.0298, "step": 5233 }, { "epoch": 1.747871097011187, "grad_norm": 0.2706429041718493, "learning_rate": 4.430867879400167e-06, "loss": 0.0145, "step": 5234 }, { "epoch": 1.7482050425780598, "grad_norm": 0.27304067087571326, "learning_rate": 4.428937219510491e-06, "loss": 0.0202, "step": 5235 }, { "epoch": 1.7485389881449325, "grad_norm": 0.38639372012668477, "learning_rate": 4.427006645886573e-06, "loss": 0.0234, "step": 5236 }, { "epoch": 1.748872933711805, "grad_norm": 0.2612221157674085, "learning_rate": 4.425076158820048e-06, "loss": 0.0277, "step": 5237 }, { "epoch": 1.7492068792786775, "grad_norm": 0.3044920989736937, "learning_rate": 4.423145758602538e-06, "loss": 0.0311, "step": 5238 }, { "epoch": 1.7495408248455502, "grad_norm": 0.3040687089932642, "learning_rate": 4.4212154455256535e-06, "loss": 0.0252, "step": 5239 }, { "epoch": 1.749874770412423, "grad_norm": 0.3793438133424752, "learning_rate": 4.41928521988099e-06, "loss": 0.0382, "step": 5240 }, { "epoch": 1.7502087159792954, "grad_norm": 0.2999439361917826, "learning_rate": 4.417355081960133e-06, "loss": 0.0217, "step": 5241 }, { "epoch": 1.7505426615461679, "grad_norm": 0.2658247453435652, "learning_rate": 4.415425032054651e-06, "loss": 0.0308, "step": 5242 }, { "epoch": 1.7508766071130406, "grad_norm": 0.23317641913026302, "learning_rate": 4.413495070456101e-06, "loss": 0.0183, "step": 5243 }, { "epoch": 1.7512105526799133, "grad_norm": 0.21068434522021748, "learning_rate": 4.411565197456027e-06, "loss": 0.0165, "step": 5244 }, { "epoch": 1.7515444982467858, "grad_norm": 0.27416963372766473, "learning_rate": 4.409635413345956e-06, "loss": 0.0251, "step": 5245 }, { "epoch": 1.7518784438136583, "grad_norm": 0.30041001314863824, "learning_rate": 4.40770571841741e-06, "loss": 0.0198, "step": 5246 }, { "epoch": 1.752212389380531, "grad_norm": 0.2574052028154111, "learning_rate": 4.405776112961889e-06, "loss": 0.0219, "step": 5247 }, { "epoch": 1.7525463349474035, "grad_norm": 0.2543463471795309, "learning_rate": 4.4038465972708824e-06, "loss": 0.0255, "step": 5248 }, { "epoch": 1.752880280514276, "grad_norm": 0.31099364739070234, "learning_rate": 4.4019171716358675e-06, "loss": 0.0342, "step": 5249 }, { "epoch": 1.7532142260811487, "grad_norm": 0.4155957865786336, "learning_rate": 4.399987836348305e-06, "loss": 0.0459, "step": 5250 }, { "epoch": 1.7535481716480215, "grad_norm": 0.3225731040108076, "learning_rate": 4.398058591699645e-06, "loss": 0.0297, "step": 5251 }, { "epoch": 1.753882117214894, "grad_norm": 0.30866483615255197, "learning_rate": 4.396129437981322e-06, "loss": 0.026, "step": 5252 }, { "epoch": 1.7542160627817664, "grad_norm": 0.2823405793941288, "learning_rate": 4.394200375484758e-06, "loss": 0.0203, "step": 5253 }, { "epoch": 1.7545500083486392, "grad_norm": 0.3377115568658968, "learning_rate": 4.392271404501361e-06, "loss": 0.0323, "step": 5254 }, { "epoch": 1.7548839539155119, "grad_norm": 0.30509659035590736, "learning_rate": 4.390342525322524e-06, "loss": 0.0275, "step": 5255 }, { "epoch": 1.7552178994823844, "grad_norm": 0.22784622096849774, "learning_rate": 4.3884137382396255e-06, "loss": 0.0144, "step": 5256 }, { "epoch": 1.7555518450492569, "grad_norm": 0.24378374758424304, "learning_rate": 4.3864850435440335e-06, "loss": 0.0225, "step": 5257 }, { "epoch": 1.7558857906161296, "grad_norm": 0.31573952872689903, "learning_rate": 4.3845564415271e-06, "loss": 0.0258, "step": 5258 }, { "epoch": 1.7562197361830023, "grad_norm": 0.36788575475331386, "learning_rate": 4.382627932480164e-06, "loss": 0.026, "step": 5259 }, { "epoch": 1.7565536817498748, "grad_norm": 0.2653527898917707, "learning_rate": 4.380699516694547e-06, "loss": 0.0213, "step": 5260 }, { "epoch": 1.7568876273167473, "grad_norm": 0.36528065971526696, "learning_rate": 4.37877119446156e-06, "loss": 0.0235, "step": 5261 }, { "epoch": 1.75722157288362, "grad_norm": 0.33795778085562383, "learning_rate": 4.3768429660725e-06, "loss": 0.0231, "step": 5262 }, { "epoch": 1.7575555184504925, "grad_norm": 0.2356782285703482, "learning_rate": 4.374914831818643e-06, "loss": 0.0161, "step": 5263 }, { "epoch": 1.757889464017365, "grad_norm": 0.33317869895051455, "learning_rate": 4.372986791991265e-06, "loss": 0.0184, "step": 5264 }, { "epoch": 1.7582234095842377, "grad_norm": 0.2287786204950603, "learning_rate": 4.371058846881614e-06, "loss": 0.0219, "step": 5265 }, { "epoch": 1.7585573551511104, "grad_norm": 0.26532748190311334, "learning_rate": 4.36913099678093e-06, "loss": 0.0271, "step": 5266 }, { "epoch": 1.758891300717983, "grad_norm": 0.32653349239746027, "learning_rate": 4.367203241980437e-06, "loss": 0.0226, "step": 5267 }, { "epoch": 1.7592252462848554, "grad_norm": 0.2428359751991708, "learning_rate": 4.3652755827713456e-06, "loss": 0.0206, "step": 5268 }, { "epoch": 1.7595591918517282, "grad_norm": 0.253140371750707, "learning_rate": 4.363348019444848e-06, "loss": 0.0207, "step": 5269 }, { "epoch": 1.7598931374186009, "grad_norm": 0.3183427687083458, "learning_rate": 4.361420552292132e-06, "loss": 0.0215, "step": 5270 }, { "epoch": 1.7602270829854734, "grad_norm": 0.20742994013478552, "learning_rate": 4.35949318160436e-06, "loss": 0.0237, "step": 5271 }, { "epoch": 1.7605610285523459, "grad_norm": 0.31861864003067447, "learning_rate": 4.357565907672684e-06, "loss": 0.0224, "step": 5272 }, { "epoch": 1.7608949741192186, "grad_norm": 0.2872871253468191, "learning_rate": 4.355638730788242e-06, "loss": 0.0245, "step": 5273 }, { "epoch": 1.7612289196860913, "grad_norm": 0.3656731111702706, "learning_rate": 4.353711651242157e-06, "loss": 0.0265, "step": 5274 }, { "epoch": 1.7615628652529638, "grad_norm": 0.22647542136807475, "learning_rate": 4.3517846693255365e-06, "loss": 0.0172, "step": 5275 }, { "epoch": 1.7618968108198363, "grad_norm": 0.3169411742043286, "learning_rate": 4.349857785329475e-06, "loss": 0.0324, "step": 5276 }, { "epoch": 1.762230756386709, "grad_norm": 0.20135404230110754, "learning_rate": 4.34793099954505e-06, "loss": 0.014, "step": 5277 }, { "epoch": 1.7625647019535817, "grad_norm": 0.32577566003284725, "learning_rate": 4.3460043122633256e-06, "loss": 0.0411, "step": 5278 }, { "epoch": 1.7628986475204542, "grad_norm": 0.22638661706663452, "learning_rate": 4.344077723775349e-06, "loss": 0.0163, "step": 5279 }, { "epoch": 1.7632325930873267, "grad_norm": 0.3359730383509722, "learning_rate": 4.342151234372155e-06, "loss": 0.0407, "step": 5280 }, { "epoch": 1.7635665386541994, "grad_norm": 0.26602280764370484, "learning_rate": 4.340224844344766e-06, "loss": 0.0176, "step": 5281 }, { "epoch": 1.763900484221072, "grad_norm": 0.2628029157397793, "learning_rate": 4.338298553984181e-06, "loss": 0.024, "step": 5282 }, { "epoch": 1.7642344297879444, "grad_norm": 0.3145709521741432, "learning_rate": 4.336372363581391e-06, "loss": 0.0215, "step": 5283 }, { "epoch": 1.7645683753548171, "grad_norm": 0.2618626178881137, "learning_rate": 4.33444627342737e-06, "loss": 0.023, "step": 5284 }, { "epoch": 1.7649023209216899, "grad_norm": 0.24790118139469644, "learning_rate": 4.332520283813075e-06, "loss": 0.0214, "step": 5285 }, { "epoch": 1.7652362664885624, "grad_norm": 0.3790678109613269, "learning_rate": 4.330594395029449e-06, "loss": 0.0621, "step": 5286 }, { "epoch": 1.7655702120554349, "grad_norm": 0.25914103207267886, "learning_rate": 4.328668607367424e-06, "loss": 0.0212, "step": 5287 }, { "epoch": 1.7659041576223076, "grad_norm": 0.29920527792637985, "learning_rate": 4.326742921117911e-06, "loss": 0.0264, "step": 5288 }, { "epoch": 1.7662381031891803, "grad_norm": 0.38668903811160105, "learning_rate": 4.324817336571806e-06, "loss": 0.0299, "step": 5289 }, { "epoch": 1.7665720487560528, "grad_norm": 0.31248843933088094, "learning_rate": 4.3228918540199926e-06, "loss": 0.0309, "step": 5290 }, { "epoch": 1.7669059943229253, "grad_norm": 0.24132598955109996, "learning_rate": 4.320966473753337e-06, "loss": 0.0246, "step": 5291 }, { "epoch": 1.767239939889798, "grad_norm": 0.26448154221641196, "learning_rate": 4.31904119606269e-06, "loss": 0.0214, "step": 5292 }, { "epoch": 1.7675738854566707, "grad_norm": 0.32016062345779206, "learning_rate": 4.31711602123889e-06, "loss": 0.0201, "step": 5293 }, { "epoch": 1.7679078310235432, "grad_norm": 0.20820968552951152, "learning_rate": 4.315190949572755e-06, "loss": 0.0128, "step": 5294 }, { "epoch": 1.7682417765904157, "grad_norm": 0.2877356275411102, "learning_rate": 4.313265981355091e-06, "loss": 0.0237, "step": 5295 }, { "epoch": 1.7685757221572884, "grad_norm": 0.3507237164174799, "learning_rate": 4.311341116876687e-06, "loss": 0.0199, "step": 5296 }, { "epoch": 1.768909667724161, "grad_norm": 0.2842876528233048, "learning_rate": 4.309416356428315e-06, "loss": 0.0198, "step": 5297 }, { "epoch": 1.7692436132910334, "grad_norm": 0.3450221107394591, "learning_rate": 4.307491700300733e-06, "loss": 0.0233, "step": 5298 }, { "epoch": 1.7695775588579061, "grad_norm": 0.25229333425881884, "learning_rate": 4.305567148784685e-06, "loss": 0.0187, "step": 5299 }, { "epoch": 1.7699115044247788, "grad_norm": 0.20358798555497293, "learning_rate": 4.3036427021708955e-06, "loss": 0.0149, "step": 5300 }, { "epoch": 1.7702454499916513, "grad_norm": 0.349335170862707, "learning_rate": 4.301718360750074e-06, "loss": 0.0235, "step": 5301 }, { "epoch": 1.7705793955585238, "grad_norm": 0.21444336137973968, "learning_rate": 4.299794124812918e-06, "loss": 0.0185, "step": 5302 }, { "epoch": 1.7709133411253966, "grad_norm": 0.34497724819497605, "learning_rate": 4.297869994650103e-06, "loss": 0.0337, "step": 5303 }, { "epoch": 1.7712472866922693, "grad_norm": 0.24126082129522472, "learning_rate": 4.295945970552293e-06, "loss": 0.0209, "step": 5304 }, { "epoch": 1.7715812322591418, "grad_norm": 0.2643741860051308, "learning_rate": 4.294022052810134e-06, "loss": 0.0262, "step": 5305 }, { "epoch": 1.7719151778260143, "grad_norm": 0.338841061616678, "learning_rate": 4.292098241714256e-06, "loss": 0.0221, "step": 5306 }, { "epoch": 1.772249123392887, "grad_norm": 0.22618246910030593, "learning_rate": 4.290174537555275e-06, "loss": 0.0189, "step": 5307 }, { "epoch": 1.7725830689597597, "grad_norm": 0.40917282584344794, "learning_rate": 4.2882509406237885e-06, "loss": 0.0192, "step": 5308 }, { "epoch": 1.7729170145266322, "grad_norm": 0.24258968238238737, "learning_rate": 4.286327451210377e-06, "loss": 0.0196, "step": 5309 }, { "epoch": 1.7732509600935047, "grad_norm": 0.2887790744063041, "learning_rate": 4.284404069605605e-06, "loss": 0.0322, "step": 5310 }, { "epoch": 1.7735849056603774, "grad_norm": 0.2617785767881327, "learning_rate": 4.282480796100027e-06, "loss": 0.0232, "step": 5311 }, { "epoch": 1.77391885122725, "grad_norm": 0.24059581061942995, "learning_rate": 4.280557630984173e-06, "loss": 0.0178, "step": 5312 }, { "epoch": 1.7742527967941224, "grad_norm": 0.3566445937685634, "learning_rate": 4.27863457454856e-06, "loss": 0.0309, "step": 5313 }, { "epoch": 1.7745867423609951, "grad_norm": 0.2085309533801013, "learning_rate": 4.276711627083688e-06, "loss": 0.0166, "step": 5314 }, { "epoch": 1.7749206879278678, "grad_norm": 0.29753796148611467, "learning_rate": 4.274788788880041e-06, "loss": 0.0274, "step": 5315 }, { "epoch": 1.7752546334947403, "grad_norm": 0.310507014057238, "learning_rate": 4.272866060228084e-06, "loss": 0.026, "step": 5316 }, { "epoch": 1.7755885790616128, "grad_norm": 0.2375018822584037, "learning_rate": 4.270943441418275e-06, "loss": 0.0236, "step": 5317 }, { "epoch": 1.7759225246284855, "grad_norm": 0.22875405020671685, "learning_rate": 4.2690209327410406e-06, "loss": 0.0149, "step": 5318 }, { "epoch": 1.7762564701953583, "grad_norm": 0.33326698236459007, "learning_rate": 4.267098534486803e-06, "loss": 0.0282, "step": 5319 }, { "epoch": 1.7765904157622308, "grad_norm": 0.30360024900284505, "learning_rate": 4.26517624694596e-06, "loss": 0.0206, "step": 5320 }, { "epoch": 1.7769243613291033, "grad_norm": 0.27982814877797346, "learning_rate": 4.2632540704088975e-06, "loss": 0.0234, "step": 5321 }, { "epoch": 1.777258306895976, "grad_norm": 0.35480889529813037, "learning_rate": 4.261332005165984e-06, "loss": 0.0188, "step": 5322 }, { "epoch": 1.7775922524628487, "grad_norm": 0.25259645616514015, "learning_rate": 4.259410051507567e-06, "loss": 0.0204, "step": 5323 }, { "epoch": 1.7779261980297212, "grad_norm": 0.3132454024785608, "learning_rate": 4.257488209723981e-06, "loss": 0.0273, "step": 5324 }, { "epoch": 1.7782601435965937, "grad_norm": 0.29434662822649665, "learning_rate": 4.255566480105546e-06, "loss": 0.0253, "step": 5325 }, { "epoch": 1.7785940891634664, "grad_norm": 0.29831340270595536, "learning_rate": 4.2536448629425585e-06, "loss": 0.0173, "step": 5326 }, { "epoch": 1.7789280347303391, "grad_norm": 0.2569724696689149, "learning_rate": 4.2517233585253024e-06, "loss": 0.0257, "step": 5327 }, { "epoch": 1.7792619802972116, "grad_norm": 0.376623295708525, "learning_rate": 4.2498019671440435e-06, "loss": 0.0334, "step": 5328 }, { "epoch": 1.779595925864084, "grad_norm": 0.25523237722503017, "learning_rate": 4.247880689089033e-06, "loss": 0.0242, "step": 5329 }, { "epoch": 1.7799298714309568, "grad_norm": 0.2569011744362559, "learning_rate": 4.245959524650498e-06, "loss": 0.0223, "step": 5330 }, { "epoch": 1.7802638169978293, "grad_norm": 0.26138192449838726, "learning_rate": 4.244038474118656e-06, "loss": 0.0188, "step": 5331 }, { "epoch": 1.7805977625647018, "grad_norm": 0.22985300788655164, "learning_rate": 4.242117537783704e-06, "loss": 0.0199, "step": 5332 }, { "epoch": 1.7809317081315745, "grad_norm": 0.29138987527189336, "learning_rate": 4.2401967159358195e-06, "loss": 0.0191, "step": 5333 }, { "epoch": 1.7812656536984472, "grad_norm": 0.3103929794813883, "learning_rate": 4.2382760088651696e-06, "loss": 0.0213, "step": 5334 }, { "epoch": 1.7815995992653197, "grad_norm": 0.28698349102347676, "learning_rate": 4.236355416861897e-06, "loss": 0.025, "step": 5335 }, { "epoch": 1.7819335448321922, "grad_norm": 0.21359581758066254, "learning_rate": 4.23443494021613e-06, "loss": 0.0207, "step": 5336 }, { "epoch": 1.782267490399065, "grad_norm": 0.2733168254407143, "learning_rate": 4.232514579217981e-06, "loss": 0.0205, "step": 5337 }, { "epoch": 1.7826014359659377, "grad_norm": 0.31333058790493223, "learning_rate": 4.23059433415754e-06, "loss": 0.029, "step": 5338 }, { "epoch": 1.7829353815328102, "grad_norm": 0.2553071507076, "learning_rate": 4.228674205324884e-06, "loss": 0.0208, "step": 5339 }, { "epoch": 1.7832693270996827, "grad_norm": 0.3666602706218142, "learning_rate": 4.226754193010072e-06, "loss": 0.02, "step": 5340 }, { "epoch": 1.7836032726665554, "grad_norm": 0.1696611479409936, "learning_rate": 4.224834297503145e-06, "loss": 0.0142, "step": 5341 }, { "epoch": 1.783937218233428, "grad_norm": 0.2690713183795865, "learning_rate": 4.222914519094124e-06, "loss": 0.0224, "step": 5342 }, { "epoch": 1.7842711638003006, "grad_norm": 0.29383558269603804, "learning_rate": 4.220994858073014e-06, "loss": 0.0273, "step": 5343 }, { "epoch": 1.784605109367173, "grad_norm": 0.22268131709141537, "learning_rate": 4.2190753147298044e-06, "loss": 0.0203, "step": 5344 }, { "epoch": 1.7849390549340458, "grad_norm": 0.24896416899627177, "learning_rate": 4.2171558893544626e-06, "loss": 0.0233, "step": 5345 }, { "epoch": 1.7852730005009183, "grad_norm": 0.2636696161135228, "learning_rate": 4.215236582236941e-06, "loss": 0.0254, "step": 5346 }, { "epoch": 1.7856069460677908, "grad_norm": 0.18638440619800603, "learning_rate": 4.213317393667175e-06, "loss": 0.0168, "step": 5347 }, { "epoch": 1.7859408916346635, "grad_norm": 0.25220214295454435, "learning_rate": 4.211398323935079e-06, "loss": 0.0227, "step": 5348 }, { "epoch": 1.7862748372015362, "grad_norm": 0.5021878889729603, "learning_rate": 4.209479373330552e-06, "loss": 0.0391, "step": 5349 }, { "epoch": 1.7866087827684087, "grad_norm": 0.2631743106456026, "learning_rate": 4.207560542143474e-06, "loss": 0.023, "step": 5350 }, { "epoch": 1.7869427283352812, "grad_norm": 0.227006636329623, "learning_rate": 4.205641830663706e-06, "loss": 0.0187, "step": 5351 }, { "epoch": 1.787276673902154, "grad_norm": 0.3444413732172693, "learning_rate": 4.2037232391810925e-06, "loss": 0.0234, "step": 5352 }, { "epoch": 1.7876106194690267, "grad_norm": 0.22090864487906997, "learning_rate": 4.20180476798546e-06, "loss": 0.0195, "step": 5353 }, { "epoch": 1.7879445650358992, "grad_norm": 0.3037373120044528, "learning_rate": 4.1998864173666174e-06, "loss": 0.0262, "step": 5354 }, { "epoch": 1.7882785106027717, "grad_norm": 0.29325394785793535, "learning_rate": 4.197968187614351e-06, "loss": 0.0189, "step": 5355 }, { "epoch": 1.7886124561696444, "grad_norm": 0.32265916252831683, "learning_rate": 4.196050079018433e-06, "loss": 0.0246, "step": 5356 }, { "epoch": 1.788946401736517, "grad_norm": 0.6868177119693587, "learning_rate": 4.194132091868616e-06, "loss": 0.0484, "step": 5357 }, { "epoch": 1.7892803473033896, "grad_norm": 0.26307189164847017, "learning_rate": 4.1922142264546365e-06, "loss": 0.0213, "step": 5358 }, { "epoch": 1.789614292870262, "grad_norm": 0.2590407420495831, "learning_rate": 4.1902964830662104e-06, "loss": 0.021, "step": 5359 }, { "epoch": 1.7899482384371348, "grad_norm": 0.29313996054388525, "learning_rate": 4.188378861993034e-06, "loss": 0.0188, "step": 5360 }, { "epoch": 1.7902821840040073, "grad_norm": 0.2642773265804162, "learning_rate": 4.186461363524786e-06, "loss": 0.0224, "step": 5361 }, { "epoch": 1.7906161295708798, "grad_norm": 0.2644774454859336, "learning_rate": 4.184543987951127e-06, "loss": 0.0281, "step": 5362 }, { "epoch": 1.7909500751377525, "grad_norm": 0.23094529361829833, "learning_rate": 4.182626735561703e-06, "loss": 0.0182, "step": 5363 }, { "epoch": 1.7912840207046252, "grad_norm": 0.3491309779710296, "learning_rate": 4.180709606646134e-06, "loss": 0.0325, "step": 5364 }, { "epoch": 1.7916179662714977, "grad_norm": 0.22344855625000304, "learning_rate": 4.178792601494026e-06, "loss": 0.0228, "step": 5365 }, { "epoch": 1.7919519118383702, "grad_norm": 0.35552602513091575, "learning_rate": 4.176875720394965e-06, "loss": 0.0286, "step": 5366 }, { "epoch": 1.792285857405243, "grad_norm": 0.37183661349160846, "learning_rate": 4.174958963638518e-06, "loss": 0.0419, "step": 5367 }, { "epoch": 1.7926198029721157, "grad_norm": 0.2390298845947433, "learning_rate": 4.173042331514234e-06, "loss": 0.018, "step": 5368 }, { "epoch": 1.7929537485389881, "grad_norm": 0.3338490058455825, "learning_rate": 4.171125824311642e-06, "loss": 0.0301, "step": 5369 }, { "epoch": 1.7932876941058606, "grad_norm": 0.24368378382488928, "learning_rate": 4.169209442320255e-06, "loss": 0.0231, "step": 5370 }, { "epoch": 1.7936216396727334, "grad_norm": 0.2817320940247766, "learning_rate": 4.167293185829565e-06, "loss": 0.023, "step": 5371 }, { "epoch": 1.793955585239606, "grad_norm": 0.18946796263856613, "learning_rate": 4.165377055129043e-06, "loss": 0.0169, "step": 5372 }, { "epoch": 1.7942895308064786, "grad_norm": 0.29722934666835743, "learning_rate": 4.163461050508144e-06, "loss": 0.028, "step": 5373 }, { "epoch": 1.794623476373351, "grad_norm": 0.21908600998447844, "learning_rate": 4.161545172256303e-06, "loss": 0.0189, "step": 5374 }, { "epoch": 1.7949574219402238, "grad_norm": 0.3225208887986041, "learning_rate": 4.1596294206629375e-06, "loss": 0.0309, "step": 5375 }, { "epoch": 1.7952913675070965, "grad_norm": 0.30967674890293173, "learning_rate": 4.157713796017442e-06, "loss": 0.0272, "step": 5376 }, { "epoch": 1.795625313073969, "grad_norm": 0.2835020834126306, "learning_rate": 4.155798298609196e-06, "loss": 0.0238, "step": 5377 }, { "epoch": 1.7959592586408415, "grad_norm": 0.38842210457929394, "learning_rate": 4.1538829287275565e-06, "loss": 0.0306, "step": 5378 }, { "epoch": 1.7962932042077142, "grad_norm": 0.4373247550605237, "learning_rate": 4.151967686661864e-06, "loss": 0.0304, "step": 5379 }, { "epoch": 1.7966271497745867, "grad_norm": 0.2178778814345169, "learning_rate": 4.150052572701435e-06, "loss": 0.0183, "step": 5380 }, { "epoch": 1.7969610953414592, "grad_norm": 0.23803039038479404, "learning_rate": 4.148137587135575e-06, "loss": 0.011, "step": 5381 }, { "epoch": 1.797295040908332, "grad_norm": 0.27432238572838175, "learning_rate": 4.146222730253563e-06, "loss": 0.0174, "step": 5382 }, { "epoch": 1.7976289864752046, "grad_norm": 0.2886665548459234, "learning_rate": 4.1443080023446605e-06, "loss": 0.025, "step": 5383 }, { "epoch": 1.7979629320420771, "grad_norm": 0.2903039365585172, "learning_rate": 4.1423934036981096e-06, "loss": 0.0255, "step": 5384 }, { "epoch": 1.7982968776089496, "grad_norm": 0.3539049120782072, "learning_rate": 4.140478934603133e-06, "loss": 0.036, "step": 5385 }, { "epoch": 1.7986308231758223, "grad_norm": 0.22683048673008707, "learning_rate": 4.138564595348932e-06, "loss": 0.0213, "step": 5386 }, { "epoch": 1.798964768742695, "grad_norm": 0.22565394299325583, "learning_rate": 4.136650386224694e-06, "loss": 0.0201, "step": 5387 }, { "epoch": 1.7992987143095676, "grad_norm": 0.30882097747445864, "learning_rate": 4.13473630751958e-06, "loss": 0.0287, "step": 5388 }, { "epoch": 1.79963265987644, "grad_norm": 0.26398179500046115, "learning_rate": 4.132822359522735e-06, "loss": 0.0174, "step": 5389 }, { "epoch": 1.7999666054433128, "grad_norm": 0.3235547733972141, "learning_rate": 4.130908542523285e-06, "loss": 0.0313, "step": 5390 }, { "epoch": 1.8003005510101855, "grad_norm": 0.3575902350850217, "learning_rate": 4.128994856810332e-06, "loss": 0.0169, "step": 5391 }, { "epoch": 1.800634496577058, "grad_norm": 0.27565167641382193, "learning_rate": 4.127081302672958e-06, "loss": 0.0218, "step": 5392 }, { "epoch": 1.8009684421439305, "grad_norm": 0.2936261877894819, "learning_rate": 4.125167880400235e-06, "loss": 0.0216, "step": 5393 }, { "epoch": 1.8013023877108032, "grad_norm": 0.3016931264197796, "learning_rate": 4.1232545902812046e-06, "loss": 0.0206, "step": 5394 }, { "epoch": 1.8016363332776757, "grad_norm": 0.30476717861849295, "learning_rate": 4.121341432604892e-06, "loss": 0.0265, "step": 5395 }, { "epoch": 1.8019702788445482, "grad_norm": 0.27747897043735814, "learning_rate": 4.1194284076603004e-06, "loss": 0.027, "step": 5396 }, { "epoch": 1.802304224411421, "grad_norm": 0.31358773125177547, "learning_rate": 4.117515515736418e-06, "loss": 0.0253, "step": 5397 }, { "epoch": 1.8026381699782936, "grad_norm": 0.23139827292246676, "learning_rate": 4.1156027571222054e-06, "loss": 0.0173, "step": 5398 }, { "epoch": 1.8029721155451661, "grad_norm": 0.31138422498829915, "learning_rate": 4.113690132106611e-06, "loss": 0.0286, "step": 5399 }, { "epoch": 1.8033060611120386, "grad_norm": 0.3279422610620659, "learning_rate": 4.111777640978559e-06, "loss": 0.025, "step": 5400 }, { "epoch": 1.8036400066789113, "grad_norm": 0.23702486611837187, "learning_rate": 4.109865284026953e-06, "loss": 0.0194, "step": 5401 }, { "epoch": 1.803973952245784, "grad_norm": 0.4036518116576094, "learning_rate": 4.107953061540676e-06, "loss": 0.0207, "step": 5402 }, { "epoch": 1.8043078978126565, "grad_norm": 0.26599846945307454, "learning_rate": 4.10604097380859e-06, "loss": 0.0193, "step": 5403 }, { "epoch": 1.804641843379529, "grad_norm": 0.2421582318039491, "learning_rate": 4.104129021119543e-06, "loss": 0.0201, "step": 5404 }, { "epoch": 1.8049757889464018, "grad_norm": 0.27743796176695185, "learning_rate": 4.102217203762357e-06, "loss": 0.029, "step": 5405 }, { "epoch": 1.8053097345132745, "grad_norm": 0.28819453175578164, "learning_rate": 4.1003055220258335e-06, "loss": 0.0218, "step": 5406 }, { "epoch": 1.805643680080147, "grad_norm": 0.257400719403574, "learning_rate": 4.0983939761987535e-06, "loss": 0.0246, "step": 5407 }, { "epoch": 1.8059776256470195, "grad_norm": 0.2718063590711352, "learning_rate": 4.09648256656988e-06, "loss": 0.0237, "step": 5408 }, { "epoch": 1.8063115712138922, "grad_norm": 0.31384646806016225, "learning_rate": 4.094571293427951e-06, "loss": 0.0288, "step": 5409 }, { "epoch": 1.8066455167807647, "grad_norm": 0.2976011549234811, "learning_rate": 4.092660157061691e-06, "loss": 0.0236, "step": 5410 }, { "epoch": 1.8069794623476372, "grad_norm": 0.3234524195314314, "learning_rate": 4.090749157759799e-06, "loss": 0.0326, "step": 5411 }, { "epoch": 1.80731340791451, "grad_norm": 0.3429949172983408, "learning_rate": 4.088838295810952e-06, "loss": 0.0371, "step": 5412 }, { "epoch": 1.8076473534813826, "grad_norm": 0.2363244998146425, "learning_rate": 4.086927571503808e-06, "loss": 0.0248, "step": 5413 }, { "epoch": 1.807981299048255, "grad_norm": 0.3265329850053635, "learning_rate": 4.0850169851270075e-06, "loss": 0.0336, "step": 5414 }, { "epoch": 1.8083152446151276, "grad_norm": 0.24605220793741153, "learning_rate": 4.0831065369691615e-06, "loss": 0.0206, "step": 5415 }, { "epoch": 1.8086491901820003, "grad_norm": 0.3377906714487289, "learning_rate": 4.0811962273188714e-06, "loss": 0.0232, "step": 5416 }, { "epoch": 1.808983135748873, "grad_norm": 0.28045208147729683, "learning_rate": 4.0792860564647105e-06, "loss": 0.0179, "step": 5417 }, { "epoch": 1.8093170813157455, "grad_norm": 0.23352590492682648, "learning_rate": 4.077376024695231e-06, "loss": 0.0167, "step": 5418 }, { "epoch": 1.809651026882618, "grad_norm": 0.34066937785685186, "learning_rate": 4.075466132298967e-06, "loss": 0.0299, "step": 5419 }, { "epoch": 1.8099849724494907, "grad_norm": 0.29209763808469125, "learning_rate": 4.073556379564429e-06, "loss": 0.0288, "step": 5420 }, { "epoch": 1.8103189180163635, "grad_norm": 0.28219662731217826, "learning_rate": 4.071646766780109e-06, "loss": 0.0195, "step": 5421 }, { "epoch": 1.810652863583236, "grad_norm": 0.24070868548418348, "learning_rate": 4.069737294234475e-06, "loss": 0.02, "step": 5422 }, { "epoch": 1.8109868091501085, "grad_norm": 0.352103957259622, "learning_rate": 4.067827962215977e-06, "loss": 0.0238, "step": 5423 }, { "epoch": 1.8113207547169812, "grad_norm": 0.3068937119539319, "learning_rate": 4.065918771013042e-06, "loss": 0.0255, "step": 5424 }, { "epoch": 1.811654700283854, "grad_norm": 0.2297808906169926, "learning_rate": 4.064009720914074e-06, "loss": 0.022, "step": 5425 }, { "epoch": 1.8119886458507264, "grad_norm": 0.28344916963934125, "learning_rate": 4.062100812207459e-06, "loss": 0.0212, "step": 5426 }, { "epoch": 1.8123225914175989, "grad_norm": 0.2414624350592072, "learning_rate": 4.060192045181558e-06, "loss": 0.0194, "step": 5427 }, { "epoch": 1.8126565369844716, "grad_norm": 0.2601365970991397, "learning_rate": 4.058283420124716e-06, "loss": 0.021, "step": 5428 }, { "epoch": 1.812990482551344, "grad_norm": 0.21451975788454472, "learning_rate": 4.056374937325251e-06, "loss": 0.0156, "step": 5429 }, { "epoch": 1.8133244281182166, "grad_norm": 0.34697941022019413, "learning_rate": 4.054466597071464e-06, "loss": 0.0366, "step": 5430 }, { "epoch": 1.8136583736850893, "grad_norm": 0.24356598349836267, "learning_rate": 4.05255839965163e-06, "loss": 0.017, "step": 5431 }, { "epoch": 1.813992319251962, "grad_norm": 0.3650783429151526, "learning_rate": 4.050650345354006e-06, "loss": 0.0227, "step": 5432 }, { "epoch": 1.8143262648188345, "grad_norm": 0.36655433912376695, "learning_rate": 4.048742434466823e-06, "loss": 0.0311, "step": 5433 }, { "epoch": 1.814660210385707, "grad_norm": 0.290337405991827, "learning_rate": 4.046834667278298e-06, "loss": 0.0206, "step": 5434 }, { "epoch": 1.8149941559525797, "grad_norm": 0.24916852160715636, "learning_rate": 4.04492704407662e-06, "loss": 0.0274, "step": 5435 }, { "epoch": 1.8153281015194525, "grad_norm": 0.29535786465917413, "learning_rate": 4.043019565149958e-06, "loss": 0.027, "step": 5436 }, { "epoch": 1.815662047086325, "grad_norm": 0.2021908403005868, "learning_rate": 4.041112230786458e-06, "loss": 0.0155, "step": 5437 }, { "epoch": 1.8159959926531974, "grad_norm": 0.3498627376980739, "learning_rate": 4.039205041274247e-06, "loss": 0.0167, "step": 5438 }, { "epoch": 1.8163299382200702, "grad_norm": 0.41619769673398277, "learning_rate": 4.0372979969014245e-06, "loss": 0.0343, "step": 5439 }, { "epoch": 1.8166638837869429, "grad_norm": 0.25849631373366844, "learning_rate": 4.035391097956077e-06, "loss": 0.0255, "step": 5440 }, { "epoch": 1.8169978293538154, "grad_norm": 0.32699202396126836, "learning_rate": 4.0334843447262625e-06, "loss": 0.0309, "step": 5441 }, { "epoch": 1.8173317749206879, "grad_norm": 0.335101820793403, "learning_rate": 4.0315777375000185e-06, "loss": 0.0372, "step": 5442 }, { "epoch": 1.8176657204875606, "grad_norm": 0.23616061677896705, "learning_rate": 4.029671276565359e-06, "loss": 0.0155, "step": 5443 }, { "epoch": 1.817999666054433, "grad_norm": 0.3101037907121459, "learning_rate": 4.027764962210278e-06, "loss": 0.0167, "step": 5444 }, { "epoch": 1.8183336116213056, "grad_norm": 0.22442417044732038, "learning_rate": 4.025858794722749e-06, "loss": 0.0204, "step": 5445 }, { "epoch": 1.8186675571881783, "grad_norm": 0.2515916617840221, "learning_rate": 4.0239527743907184e-06, "loss": 0.0233, "step": 5446 }, { "epoch": 1.819001502755051, "grad_norm": 0.26487508507309315, "learning_rate": 4.022046901502114e-06, "loss": 0.023, "step": 5447 }, { "epoch": 1.8193354483219235, "grad_norm": 0.23250313759564167, "learning_rate": 4.020141176344839e-06, "loss": 0.0206, "step": 5448 }, { "epoch": 1.819669393888796, "grad_norm": 0.3282770658819903, "learning_rate": 4.018235599206778e-06, "loss": 0.025, "step": 5449 }, { "epoch": 1.8200033394556687, "grad_norm": 0.3208453089063136, "learning_rate": 4.016330170375787e-06, "loss": 0.0274, "step": 5450 }, { "epoch": 1.8203372850225414, "grad_norm": 0.29679640130276935, "learning_rate": 4.014424890139709e-06, "loss": 0.0276, "step": 5451 }, { "epoch": 1.820671230589414, "grad_norm": 0.24060158633305925, "learning_rate": 4.012519758786355e-06, "loss": 0.0237, "step": 5452 }, { "epoch": 1.8210051761562864, "grad_norm": 0.25024447045270015, "learning_rate": 4.01061477660352e-06, "loss": 0.0204, "step": 5453 }, { "epoch": 1.8213391217231591, "grad_norm": 0.23602236265844892, "learning_rate": 4.008709943878971e-06, "loss": 0.0199, "step": 5454 }, { "epoch": 1.8216730672900319, "grad_norm": 0.2680017862209099, "learning_rate": 4.006805260900458e-06, "loss": 0.0211, "step": 5455 }, { "epoch": 1.8220070128569044, "grad_norm": 0.4033688267476277, "learning_rate": 4.004900727955703e-06, "loss": 0.0283, "step": 5456 }, { "epoch": 1.8223409584237769, "grad_norm": 0.4268011302014313, "learning_rate": 4.0029963453324115e-06, "loss": 0.024, "step": 5457 }, { "epoch": 1.8226749039906496, "grad_norm": 0.26721601677980833, "learning_rate": 4.001092113318261e-06, "loss": 0.0216, "step": 5458 }, { "epoch": 1.823008849557522, "grad_norm": 0.29060048266066063, "learning_rate": 3.99918803220091e-06, "loss": 0.0237, "step": 5459 }, { "epoch": 1.8233427951243946, "grad_norm": 0.259958566041274, "learning_rate": 3.99728410226799e-06, "loss": 0.0284, "step": 5460 }, { "epoch": 1.8236767406912673, "grad_norm": 0.19359468067247562, "learning_rate": 3.995380323807113e-06, "loss": 0.0143, "step": 5461 }, { "epoch": 1.82401068625814, "grad_norm": 0.21317160490398643, "learning_rate": 3.993476697105864e-06, "loss": 0.0147, "step": 5462 }, { "epoch": 1.8243446318250125, "grad_norm": 0.32302572897565923, "learning_rate": 3.991573222451815e-06, "loss": 0.0315, "step": 5463 }, { "epoch": 1.824678577391885, "grad_norm": 0.2740700371674136, "learning_rate": 3.989669900132504e-06, "loss": 0.0162, "step": 5464 }, { "epoch": 1.8250125229587577, "grad_norm": 0.31972446446373626, "learning_rate": 3.987766730435451e-06, "loss": 0.037, "step": 5465 }, { "epoch": 1.8253464685256304, "grad_norm": 0.23864627315021922, "learning_rate": 3.9858637136481515e-06, "loss": 0.0232, "step": 5466 }, { "epoch": 1.825680414092503, "grad_norm": 0.21408657342708176, "learning_rate": 3.98396085005808e-06, "loss": 0.023, "step": 5467 }, { "epoch": 1.8260143596593754, "grad_norm": 0.2684457102186131, "learning_rate": 3.982058139952684e-06, "loss": 0.0155, "step": 5468 }, { "epoch": 1.8263483052262481, "grad_norm": 0.23174021707235026, "learning_rate": 3.980155583619392e-06, "loss": 0.019, "step": 5469 }, { "epoch": 1.8266822507931209, "grad_norm": 0.19011721333269616, "learning_rate": 3.978253181345609e-06, "loss": 0.016, "step": 5470 }, { "epoch": 1.8270161963599933, "grad_norm": 0.2078323252649679, "learning_rate": 3.9763509334187125e-06, "loss": 0.016, "step": 5471 }, { "epoch": 1.8273501419268658, "grad_norm": 0.3644020722265962, "learning_rate": 3.974448840126061e-06, "loss": 0.0251, "step": 5472 }, { "epoch": 1.8276840874937386, "grad_norm": 0.23660112447892773, "learning_rate": 3.972546901754987e-06, "loss": 0.0181, "step": 5473 }, { "epoch": 1.8280180330606113, "grad_norm": 0.25494333574362743, "learning_rate": 3.9706451185928e-06, "loss": 0.0254, "step": 5474 }, { "epoch": 1.8283519786274836, "grad_norm": 0.27315501958549304, "learning_rate": 3.968743490926791e-06, "loss": 0.0181, "step": 5475 }, { "epoch": 1.8286859241943563, "grad_norm": 0.24251661486588935, "learning_rate": 3.966842019044219e-06, "loss": 0.0246, "step": 5476 }, { "epoch": 1.829019869761229, "grad_norm": 0.30620427884970364, "learning_rate": 3.964940703232326e-06, "loss": 0.0269, "step": 5477 }, { "epoch": 1.8293538153281015, "grad_norm": 0.2927813112869122, "learning_rate": 3.963039543778327e-06, "loss": 0.0227, "step": 5478 }, { "epoch": 1.829687760894974, "grad_norm": 0.23695367680264587, "learning_rate": 3.961138540969411e-06, "loss": 0.0231, "step": 5479 }, { "epoch": 1.8300217064618467, "grad_norm": 0.29434700237793276, "learning_rate": 3.9592376950927545e-06, "loss": 0.018, "step": 5480 }, { "epoch": 1.8303556520287194, "grad_norm": 0.3291314134586716, "learning_rate": 3.957337006435499e-06, "loss": 0.02, "step": 5481 }, { "epoch": 1.830689597595592, "grad_norm": 0.47854055491961983, "learning_rate": 3.955436475284764e-06, "loss": 0.0221, "step": 5482 }, { "epoch": 1.8310235431624644, "grad_norm": 0.29821311669194633, "learning_rate": 3.95353610192765e-06, "loss": 0.0216, "step": 5483 }, { "epoch": 1.8313574887293371, "grad_norm": 0.5937005251560681, "learning_rate": 3.95163588665123e-06, "loss": 0.0246, "step": 5484 }, { "epoch": 1.8316914342962098, "grad_norm": 0.26448450237898724, "learning_rate": 3.949735829742549e-06, "loss": 0.0193, "step": 5485 }, { "epoch": 1.8320253798630823, "grad_norm": 0.3381094734753388, "learning_rate": 3.947835931488642e-06, "loss": 0.031, "step": 5486 }, { "epoch": 1.8323593254299548, "grad_norm": 0.3108842067678793, "learning_rate": 3.9459361921765045e-06, "loss": 0.0237, "step": 5487 }, { "epoch": 1.8326932709968276, "grad_norm": 0.21893302179562465, "learning_rate": 3.944036612093117e-06, "loss": 0.0176, "step": 5488 }, { "epoch": 1.8330272165637003, "grad_norm": 0.40966298117956673, "learning_rate": 3.942137191525434e-06, "loss": 0.0259, "step": 5489 }, { "epoch": 1.8333611621305728, "grad_norm": 0.25483669604333686, "learning_rate": 3.9402379307603825e-06, "loss": 0.0156, "step": 5490 }, { "epoch": 1.8336951076974453, "grad_norm": 0.3885415497226158, "learning_rate": 3.93833883008487e-06, "loss": 0.0289, "step": 5491 }, { "epoch": 1.834029053264318, "grad_norm": 0.3160823222538175, "learning_rate": 3.936439889785778e-06, "loss": 0.0309, "step": 5492 }, { "epoch": 1.8343629988311905, "grad_norm": 0.39990525790638476, "learning_rate": 3.934541110149964e-06, "loss": 0.0201, "step": 5493 }, { "epoch": 1.834696944398063, "grad_norm": 0.2894636221218856, "learning_rate": 3.932642491464261e-06, "loss": 0.0216, "step": 5494 }, { "epoch": 1.8350308899649357, "grad_norm": 0.25044931248245683, "learning_rate": 3.930744034015477e-06, "loss": 0.0288, "step": 5495 }, { "epoch": 1.8353648355318084, "grad_norm": 0.2636980287182773, "learning_rate": 3.9288457380903954e-06, "loss": 0.0261, "step": 5496 }, { "epoch": 1.835698781098681, "grad_norm": 0.3629198209733116, "learning_rate": 3.926947603975778e-06, "loss": 0.0184, "step": 5497 }, { "epoch": 1.8360327266655534, "grad_norm": 0.2810675742420417, "learning_rate": 3.925049631958361e-06, "loss": 0.0263, "step": 5498 }, { "epoch": 1.8363666722324261, "grad_norm": 0.22680372732984186, "learning_rate": 3.923151822324854e-06, "loss": 0.0115, "step": 5499 }, { "epoch": 1.8367006177992988, "grad_norm": 0.21607623890424313, "learning_rate": 3.9212541753619435e-06, "loss": 0.0209, "step": 5500 }, { "epoch": 1.8370345633661713, "grad_norm": 0.28889443060475545, "learning_rate": 3.9193566913562915e-06, "loss": 0.023, "step": 5501 }, { "epoch": 1.8373685089330438, "grad_norm": 0.3346094052197944, "learning_rate": 3.917459370594537e-06, "loss": 0.0316, "step": 5502 }, { "epoch": 1.8377024544999165, "grad_norm": 0.21726344198060354, "learning_rate": 3.915562213363287e-06, "loss": 0.0178, "step": 5503 }, { "epoch": 1.8380364000667893, "grad_norm": 0.23047898029796604, "learning_rate": 3.9136652199491365e-06, "loss": 0.0271, "step": 5504 }, { "epoch": 1.8383703456336618, "grad_norm": 0.3490157278091612, "learning_rate": 3.911768390638645e-06, "loss": 0.0211, "step": 5505 }, { "epoch": 1.8387042912005342, "grad_norm": 0.3597586877823906, "learning_rate": 3.909871725718353e-06, "loss": 0.0306, "step": 5506 }, { "epoch": 1.839038236767407, "grad_norm": 0.26121684414929136, "learning_rate": 3.907975225474771e-06, "loss": 0.0194, "step": 5507 }, { "epoch": 1.8393721823342795, "grad_norm": 0.33833553870163363, "learning_rate": 3.906078890194391e-06, "loss": 0.035, "step": 5508 }, { "epoch": 1.839706127901152, "grad_norm": 0.25208364698224595, "learning_rate": 3.904182720163672e-06, "loss": 0.0195, "step": 5509 }, { "epoch": 1.8400400734680247, "grad_norm": 0.2557373009577202, "learning_rate": 3.902286715669058e-06, "loss": 0.0222, "step": 5510 }, { "epoch": 1.8403740190348974, "grad_norm": 0.3247971345380231, "learning_rate": 3.9003908769969615e-06, "loss": 0.0402, "step": 5511 }, { "epoch": 1.8407079646017699, "grad_norm": 0.2700198529014962, "learning_rate": 3.89849520443377e-06, "loss": 0.0209, "step": 5512 }, { "epoch": 1.8410419101686424, "grad_norm": 0.30992211623056726, "learning_rate": 3.896599698265847e-06, "loss": 0.0201, "step": 5513 }, { "epoch": 1.841375855735515, "grad_norm": 0.23716161756218984, "learning_rate": 3.894704358779533e-06, "loss": 0.0191, "step": 5514 }, { "epoch": 1.8417098013023878, "grad_norm": 0.24735710770163488, "learning_rate": 3.892809186261138e-06, "loss": 0.0244, "step": 5515 }, { "epoch": 1.8420437468692603, "grad_norm": 0.4304108744527725, "learning_rate": 3.890914180996954e-06, "loss": 0.0257, "step": 5516 }, { "epoch": 1.8423776924361328, "grad_norm": 0.30686700018936613, "learning_rate": 3.889019343273242e-06, "loss": 0.0154, "step": 5517 }, { "epoch": 1.8427116380030055, "grad_norm": 0.22599827439120004, "learning_rate": 3.887124673376239e-06, "loss": 0.0151, "step": 5518 }, { "epoch": 1.8430455835698782, "grad_norm": 0.31114427910408443, "learning_rate": 3.885230171592157e-06, "loss": 0.0282, "step": 5519 }, { "epoch": 1.8433795291367507, "grad_norm": 0.24629323970394204, "learning_rate": 3.883335838207183e-06, "loss": 0.0212, "step": 5520 }, { "epoch": 1.8437134747036232, "grad_norm": 0.3182009568722478, "learning_rate": 3.881441673507481e-06, "loss": 0.0244, "step": 5521 }, { "epoch": 1.844047420270496, "grad_norm": 0.2564380913013376, "learning_rate": 3.879547677779184e-06, "loss": 0.0194, "step": 5522 }, { "epoch": 1.8443813658373687, "grad_norm": 0.21173823822645305, "learning_rate": 3.8776538513084036e-06, "loss": 0.0205, "step": 5523 }, { "epoch": 1.844715311404241, "grad_norm": 0.24248205699015107, "learning_rate": 3.875760194381224e-06, "loss": 0.0168, "step": 5524 }, { "epoch": 1.8450492569711137, "grad_norm": 0.3347737084558029, "learning_rate": 3.873866707283704e-06, "loss": 0.0302, "step": 5525 }, { "epoch": 1.8453832025379864, "grad_norm": 0.30224256905798147, "learning_rate": 3.871973390301876e-06, "loss": 0.0214, "step": 5526 }, { "epoch": 1.8457171481048589, "grad_norm": 0.33206429677499233, "learning_rate": 3.8700802437217526e-06, "loss": 0.0214, "step": 5527 }, { "epoch": 1.8460510936717314, "grad_norm": 0.22123229129665162, "learning_rate": 3.8681872678293115e-06, "loss": 0.0182, "step": 5528 }, { "epoch": 1.846385039238604, "grad_norm": 0.3093987292653506, "learning_rate": 3.866294462910511e-06, "loss": 0.0162, "step": 5529 }, { "epoch": 1.8467189848054768, "grad_norm": 0.26523134207398047, "learning_rate": 3.86440182925128e-06, "loss": 0.0196, "step": 5530 }, { "epoch": 1.8470529303723493, "grad_norm": 0.2637429917377835, "learning_rate": 3.862509367137525e-06, "loss": 0.0235, "step": 5531 }, { "epoch": 1.8473868759392218, "grad_norm": 0.2110379872242139, "learning_rate": 3.86061707685512e-06, "loss": 0.0171, "step": 5532 }, { "epoch": 1.8477208215060945, "grad_norm": 0.2526531718422867, "learning_rate": 3.8587249586899245e-06, "loss": 0.0267, "step": 5533 }, { "epoch": 1.8480547670729672, "grad_norm": 0.21459919782795164, "learning_rate": 3.856833012927762e-06, "loss": 0.0208, "step": 5534 }, { "epoch": 1.8483887126398397, "grad_norm": 0.21105754001049573, "learning_rate": 3.854941239854433e-06, "loss": 0.0205, "step": 5535 }, { "epoch": 1.8487226582067122, "grad_norm": 0.28888844943590203, "learning_rate": 3.853049639755713e-06, "loss": 0.0289, "step": 5536 }, { "epoch": 1.849056603773585, "grad_norm": 0.3479629629831733, "learning_rate": 3.8511582129173495e-06, "loss": 0.0217, "step": 5537 }, { "epoch": 1.8493905493404577, "grad_norm": 0.3040582837759172, "learning_rate": 3.8492669596250636e-06, "loss": 0.0274, "step": 5538 }, { "epoch": 1.8497244949073302, "grad_norm": 0.5059198619227633, "learning_rate": 3.8473758801645535e-06, "loss": 0.0432, "step": 5539 }, { "epoch": 1.8500584404742026, "grad_norm": 0.31288464994197834, "learning_rate": 3.84548497482149e-06, "loss": 0.03, "step": 5540 }, { "epoch": 1.8503923860410754, "grad_norm": 0.20476246652095195, "learning_rate": 3.843594243881513e-06, "loss": 0.019, "step": 5541 }, { "epoch": 1.8507263316079479, "grad_norm": 0.29668482094120385, "learning_rate": 3.841703687630243e-06, "loss": 0.0215, "step": 5542 }, { "epoch": 1.8510602771748204, "grad_norm": 0.37539580908242665, "learning_rate": 3.8398133063532685e-06, "loss": 0.0462, "step": 5543 }, { "epoch": 1.851394222741693, "grad_norm": 0.2731567019708558, "learning_rate": 3.837923100336155e-06, "loss": 0.0303, "step": 5544 }, { "epoch": 1.8517281683085658, "grad_norm": 0.25254166715668286, "learning_rate": 3.836033069864441e-06, "loss": 0.0195, "step": 5545 }, { "epoch": 1.8520621138754383, "grad_norm": 0.7063946387194417, "learning_rate": 3.834143215223637e-06, "loss": 0.0333, "step": 5546 }, { "epoch": 1.8523960594423108, "grad_norm": 0.6160870778017123, "learning_rate": 3.832253536699227e-06, "loss": 0.0197, "step": 5547 }, { "epoch": 1.8527300050091835, "grad_norm": 0.30730347146628395, "learning_rate": 3.8303640345766714e-06, "loss": 0.029, "step": 5548 }, { "epoch": 1.8530639505760562, "grad_norm": 0.2326576001773235, "learning_rate": 3.8284747091414e-06, "loss": 0.0171, "step": 5549 }, { "epoch": 1.8533978961429287, "grad_norm": 0.1841606448958243, "learning_rate": 3.826585560678816e-06, "loss": 0.0152, "step": 5550 }, { "epoch": 1.8537318417098012, "grad_norm": 0.2471422845895843, "learning_rate": 3.824696589474301e-06, "loss": 0.0186, "step": 5551 }, { "epoch": 1.854065787276674, "grad_norm": 0.2660303246249173, "learning_rate": 3.8228077958132055e-06, "loss": 0.0257, "step": 5552 }, { "epoch": 1.8543997328435466, "grad_norm": 0.39585633761465444, "learning_rate": 3.8209191799808535e-06, "loss": 0.0223, "step": 5553 }, { "epoch": 1.8547336784104191, "grad_norm": 0.2511907798723612, "learning_rate": 3.819030742262542e-06, "loss": 0.0251, "step": 5554 }, { "epoch": 1.8550676239772916, "grad_norm": 0.3425668300199371, "learning_rate": 3.817142482943543e-06, "loss": 0.0224, "step": 5555 }, { "epoch": 1.8554015695441644, "grad_norm": 0.31983763319282277, "learning_rate": 3.815254402309097e-06, "loss": 0.0272, "step": 5556 }, { "epoch": 1.8557355151110368, "grad_norm": 0.33181148570323177, "learning_rate": 3.813366500644426e-06, "loss": 0.0307, "step": 5557 }, { "epoch": 1.8560694606779093, "grad_norm": 0.22575433477709975, "learning_rate": 3.8114787782347172e-06, "loss": 0.0185, "step": 5558 }, { "epoch": 1.856403406244782, "grad_norm": 0.28208061818423985, "learning_rate": 3.809591235365133e-06, "loss": 0.0185, "step": 5559 }, { "epoch": 1.8567373518116548, "grad_norm": 0.22743798029102708, "learning_rate": 3.807703872320809e-06, "loss": 0.0171, "step": 5560 }, { "epoch": 1.8570712973785273, "grad_norm": 0.21158273922660456, "learning_rate": 3.8058166893868543e-06, "loss": 0.017, "step": 5561 }, { "epoch": 1.8574052429453998, "grad_norm": 0.22146074037145108, "learning_rate": 3.8039296868483493e-06, "loss": 0.0168, "step": 5562 }, { "epoch": 1.8577391885122725, "grad_norm": 0.2663937377420855, "learning_rate": 3.802042864990349e-06, "loss": 0.0288, "step": 5563 }, { "epoch": 1.8580731340791452, "grad_norm": 0.2616890466945177, "learning_rate": 3.8001562240978785e-06, "loss": 0.025, "step": 5564 }, { "epoch": 1.8584070796460177, "grad_norm": 0.26220805872412206, "learning_rate": 3.7982697644559385e-06, "loss": 0.0279, "step": 5565 }, { "epoch": 1.8587410252128902, "grad_norm": 0.2610702092037741, "learning_rate": 3.7963834863495013e-06, "loss": 0.0228, "step": 5566 }, { "epoch": 1.859074970779763, "grad_norm": 0.19892127676747182, "learning_rate": 3.794497390063509e-06, "loss": 0.0138, "step": 5567 }, { "epoch": 1.8594089163466356, "grad_norm": 0.28811953069250973, "learning_rate": 3.792611475882881e-06, "loss": 0.0217, "step": 5568 }, { "epoch": 1.8597428619135081, "grad_norm": 0.25996900075359325, "learning_rate": 3.790725744092507e-06, "loss": 0.0238, "step": 5569 }, { "epoch": 1.8600768074803806, "grad_norm": 0.3049636787340111, "learning_rate": 3.788840194977248e-06, "loss": 0.0225, "step": 5570 }, { "epoch": 1.8604107530472533, "grad_norm": 0.27304032137042455, "learning_rate": 3.7869548288219383e-06, "loss": 0.0224, "step": 5571 }, { "epoch": 1.860744698614126, "grad_norm": 0.3510042654290772, "learning_rate": 3.7850696459113845e-06, "loss": 0.0335, "step": 5572 }, { "epoch": 1.8610786441809983, "grad_norm": 0.312305379265438, "learning_rate": 3.783184646530364e-06, "loss": 0.0304, "step": 5573 }, { "epoch": 1.861412589747871, "grad_norm": 0.2705352857466786, "learning_rate": 3.7812998309636323e-06, "loss": 0.0219, "step": 5574 }, { "epoch": 1.8617465353147438, "grad_norm": 0.25166842792429045, "learning_rate": 3.779415199495911e-06, "loss": 0.0198, "step": 5575 }, { "epoch": 1.8620804808816163, "grad_norm": 0.24989128103290548, "learning_rate": 3.777530752411896e-06, "loss": 0.0231, "step": 5576 }, { "epoch": 1.8624144264484888, "grad_norm": 0.3480029471981483, "learning_rate": 3.7756464899962546e-06, "loss": 0.0333, "step": 5577 }, { "epoch": 1.8627483720153615, "grad_norm": 0.24458747733387348, "learning_rate": 3.773762412533627e-06, "loss": 0.0189, "step": 5578 }, { "epoch": 1.8630823175822342, "grad_norm": 0.27455416860199744, "learning_rate": 3.771878520308624e-06, "loss": 0.0249, "step": 5579 }, { "epoch": 1.8634162631491067, "grad_norm": 0.2784356958064225, "learning_rate": 3.7699948136058327e-06, "loss": 0.0202, "step": 5580 }, { "epoch": 1.8637502087159792, "grad_norm": 0.22976269059104357, "learning_rate": 3.768111292709808e-06, "loss": 0.0157, "step": 5581 }, { "epoch": 1.864084154282852, "grad_norm": 0.41097509033896396, "learning_rate": 3.7662279579050777e-06, "loss": 0.0219, "step": 5582 }, { "epoch": 1.8644180998497246, "grad_norm": 0.23981699775525583, "learning_rate": 3.764344809476141e-06, "loss": 0.0187, "step": 5583 }, { "epoch": 1.8647520454165971, "grad_norm": 0.281541505320401, "learning_rate": 3.7624618477074705e-06, "loss": 0.0204, "step": 5584 }, { "epoch": 1.8650859909834696, "grad_norm": 0.23371525454578326, "learning_rate": 3.760579072883508e-06, "loss": 0.0192, "step": 5585 }, { "epoch": 1.8654199365503423, "grad_norm": 0.26629309955721303, "learning_rate": 3.758696485288672e-06, "loss": 0.025, "step": 5586 }, { "epoch": 1.865753882117215, "grad_norm": 0.34621385985352066, "learning_rate": 3.7568140852073464e-06, "loss": 0.0277, "step": 5587 }, { "epoch": 1.8660878276840875, "grad_norm": 0.28514982345251355, "learning_rate": 3.754931872923892e-06, "loss": 0.0226, "step": 5588 }, { "epoch": 1.86642177325096, "grad_norm": 0.24262838635866096, "learning_rate": 3.7530498487226384e-06, "loss": 0.0209, "step": 5589 }, { "epoch": 1.8667557188178328, "grad_norm": 0.32330920987705275, "learning_rate": 3.751168012887888e-06, "loss": 0.0241, "step": 5590 }, { "epoch": 1.8670896643847052, "grad_norm": 0.24429685845411966, "learning_rate": 3.7492863657039126e-06, "loss": 0.0208, "step": 5591 }, { "epoch": 1.8674236099515777, "grad_norm": 0.3181913740552329, "learning_rate": 3.7474049074549596e-06, "loss": 0.0338, "step": 5592 }, { "epoch": 1.8677575555184505, "grad_norm": 0.29926290786382986, "learning_rate": 3.7455236384252435e-06, "loss": 0.0295, "step": 5593 }, { "epoch": 1.8680915010853232, "grad_norm": 0.24550763167551037, "learning_rate": 3.743642558898953e-06, "loss": 0.0172, "step": 5594 }, { "epoch": 1.8684254466521957, "grad_norm": 0.34331971296626457, "learning_rate": 3.7417616691602477e-06, "loss": 0.0268, "step": 5595 }, { "epoch": 1.8687593922190682, "grad_norm": 0.2886453361558632, "learning_rate": 3.739880969493257e-06, "loss": 0.0217, "step": 5596 }, { "epoch": 1.869093337785941, "grad_norm": 0.2564313897081605, "learning_rate": 3.738000460182081e-06, "loss": 0.0256, "step": 5597 }, { "epoch": 1.8694272833528136, "grad_norm": 0.2634234063519612, "learning_rate": 3.736120141510798e-06, "loss": 0.0287, "step": 5598 }, { "epoch": 1.869761228919686, "grad_norm": 0.4393229550372479, "learning_rate": 3.734240013763448e-06, "loss": 0.0308, "step": 5599 }, { "epoch": 1.8700951744865586, "grad_norm": 0.2164664379744827, "learning_rate": 3.732360077224049e-06, "loss": 0.0146, "step": 5600 }, { "epoch": 1.8704291200534313, "grad_norm": 0.26072485374206744, "learning_rate": 3.730480332176586e-06, "loss": 0.02, "step": 5601 }, { "epoch": 1.870763065620304, "grad_norm": 0.3526006967683492, "learning_rate": 3.7286007789050147e-06, "loss": 0.0308, "step": 5602 }, { "epoch": 1.8710970111871765, "grad_norm": 0.3136971586674732, "learning_rate": 3.726721417693268e-06, "loss": 0.0279, "step": 5603 }, { "epoch": 1.871430956754049, "grad_norm": 0.257007205554229, "learning_rate": 3.7248422488252433e-06, "loss": 0.029, "step": 5604 }, { "epoch": 1.8717649023209217, "grad_norm": 0.5048281900375152, "learning_rate": 3.722963272584812e-06, "loss": 0.026, "step": 5605 }, { "epoch": 1.8720988478877942, "grad_norm": 0.26789406283606615, "learning_rate": 3.721084489255815e-06, "loss": 0.021, "step": 5606 }, { "epoch": 1.8724327934546667, "grad_norm": 0.2747009983922569, "learning_rate": 3.719205899122064e-06, "loss": 0.0161, "step": 5607 }, { "epoch": 1.8727667390215395, "grad_norm": 0.3052164921402659, "learning_rate": 3.7173275024673424e-06, "loss": 0.0282, "step": 5608 }, { "epoch": 1.8731006845884122, "grad_norm": 0.5504177680294845, "learning_rate": 3.7154492995754046e-06, "loss": 0.0328, "step": 5609 }, { "epoch": 1.8734346301552847, "grad_norm": 0.2852296707309642, "learning_rate": 3.7135712907299753e-06, "loss": 0.0222, "step": 5610 }, { "epoch": 1.8737685757221572, "grad_norm": 0.2528660856970436, "learning_rate": 3.7116934762147504e-06, "loss": 0.0165, "step": 5611 }, { "epoch": 1.8741025212890299, "grad_norm": 0.28073984732189905, "learning_rate": 3.709815856313395e-06, "loss": 0.0232, "step": 5612 }, { "epoch": 1.8744364668559026, "grad_norm": 0.3032357761909178, "learning_rate": 3.7079384313095464e-06, "loss": 0.0226, "step": 5613 }, { "epoch": 1.874770412422775, "grad_norm": 0.27339499967603625, "learning_rate": 3.70606120148681e-06, "loss": 0.0223, "step": 5614 }, { "epoch": 1.8751043579896476, "grad_norm": 0.8596306710365944, "learning_rate": 3.7041841671287654e-06, "loss": 0.0305, "step": 5615 }, { "epoch": 1.8754383035565203, "grad_norm": 0.33671063875926993, "learning_rate": 3.70230732851896e-06, "loss": 0.0281, "step": 5616 }, { "epoch": 1.875772249123393, "grad_norm": 0.3501949992836972, "learning_rate": 3.7004306859409134e-06, "loss": 0.0302, "step": 5617 }, { "epoch": 1.8761061946902655, "grad_norm": 0.3367802086748022, "learning_rate": 3.6985542396781127e-06, "loss": 0.0265, "step": 5618 }, { "epoch": 1.876440140257138, "grad_norm": 0.3153966030061333, "learning_rate": 3.6966779900140193e-06, "loss": 0.0216, "step": 5619 }, { "epoch": 1.8767740858240107, "grad_norm": 1.9249950041547073, "learning_rate": 3.694801937232058e-06, "loss": 0.0308, "step": 5620 }, { "epoch": 1.8771080313908834, "grad_norm": 0.3497465104917823, "learning_rate": 3.6929260816156353e-06, "loss": 0.037, "step": 5621 }, { "epoch": 1.8774419769577557, "grad_norm": 0.2232673582977845, "learning_rate": 3.691050423448118e-06, "loss": 0.0221, "step": 5622 }, { "epoch": 1.8777759225246284, "grad_norm": 0.32552142053258964, "learning_rate": 3.689174963012847e-06, "loss": 0.0326, "step": 5623 }, { "epoch": 1.8781098680915012, "grad_norm": 0.23739151785510731, "learning_rate": 3.6872997005931323e-06, "loss": 0.0162, "step": 5624 }, { "epoch": 1.8784438136583737, "grad_norm": 0.367580525575576, "learning_rate": 3.6854246364722534e-06, "loss": 0.0258, "step": 5625 }, { "epoch": 1.8787777592252461, "grad_norm": 0.2453190407078018, "learning_rate": 3.683549770933461e-06, "loss": 0.0221, "step": 5626 }, { "epoch": 1.8791117047921189, "grad_norm": 0.4175588122602081, "learning_rate": 3.6816751042599774e-06, "loss": 0.0339, "step": 5627 }, { "epoch": 1.8794456503589916, "grad_norm": 0.7360282038989799, "learning_rate": 3.6798006367349926e-06, "loss": 0.0312, "step": 5628 }, { "epoch": 1.879779595925864, "grad_norm": 0.2902616679070814, "learning_rate": 3.6779263686416668e-06, "loss": 0.0209, "step": 5629 }, { "epoch": 1.8801135414927366, "grad_norm": 0.27211614708357434, "learning_rate": 3.676052300263129e-06, "loss": 0.0168, "step": 5630 }, { "epoch": 1.8804474870596093, "grad_norm": 0.3081016855972875, "learning_rate": 3.6741784318824814e-06, "loss": 0.0249, "step": 5631 }, { "epoch": 1.880781432626482, "grad_norm": 0.21709985011962707, "learning_rate": 3.6723047637827897e-06, "loss": 0.0152, "step": 5632 }, { "epoch": 1.8811153781933545, "grad_norm": 0.28996588861927514, "learning_rate": 3.670431296247099e-06, "loss": 0.018, "step": 5633 }, { "epoch": 1.881449323760227, "grad_norm": 0.31626106909858126, "learning_rate": 3.6685580295584162e-06, "loss": 0.023, "step": 5634 }, { "epoch": 1.8817832693270997, "grad_norm": 0.7737782998676195, "learning_rate": 3.6666849639997205e-06, "loss": 0.02, "step": 5635 }, { "epoch": 1.8821172148939724, "grad_norm": 0.31216587439059384, "learning_rate": 3.6648120998539596e-06, "loss": 0.025, "step": 5636 }, { "epoch": 1.882451160460845, "grad_norm": 0.31648761971277567, "learning_rate": 3.662939437404053e-06, "loss": 0.0392, "step": 5637 }, { "epoch": 1.8827851060277174, "grad_norm": 0.36291856789574106, "learning_rate": 3.6610669769328853e-06, "loss": 0.0331, "step": 5638 }, { "epoch": 1.8831190515945901, "grad_norm": 0.28430850615956293, "learning_rate": 3.659194718723319e-06, "loss": 0.0221, "step": 5639 }, { "epoch": 1.8834529971614626, "grad_norm": 0.26557372516319366, "learning_rate": 3.657322663058177e-06, "loss": 0.0246, "step": 5640 }, { "epoch": 1.8837869427283351, "grad_norm": 0.3382648773244678, "learning_rate": 3.655450810220257e-06, "loss": 0.0254, "step": 5641 }, { "epoch": 1.8841208882952079, "grad_norm": 0.3856131556946965, "learning_rate": 3.6535791604923225e-06, "loss": 0.0258, "step": 5642 }, { "epoch": 1.8844548338620806, "grad_norm": 0.2625114895455324, "learning_rate": 3.6517077141571076e-06, "loss": 0.0184, "step": 5643 }, { "epoch": 1.884788779428953, "grad_norm": 0.6705068872465284, "learning_rate": 3.649836471497321e-06, "loss": 0.0396, "step": 5644 }, { "epoch": 1.8851227249958256, "grad_norm": 0.3247720638992046, "learning_rate": 3.6479654327956325e-06, "loss": 0.0246, "step": 5645 }, { "epoch": 1.8854566705626983, "grad_norm": 0.4229451507494173, "learning_rate": 3.646094598334685e-06, "loss": 0.0267, "step": 5646 }, { "epoch": 1.885790616129571, "grad_norm": 0.3830282898171951, "learning_rate": 3.64422396839709e-06, "loss": 0.0236, "step": 5647 }, { "epoch": 1.8861245616964435, "grad_norm": 0.24184828368269592, "learning_rate": 3.642353543265429e-06, "loss": 0.0214, "step": 5648 }, { "epoch": 1.886458507263316, "grad_norm": 0.26558508192597324, "learning_rate": 3.640483323222248e-06, "loss": 0.0239, "step": 5649 }, { "epoch": 1.8867924528301887, "grad_norm": 0.3366403275042395, "learning_rate": 3.638613308550072e-06, "loss": 0.0315, "step": 5650 }, { "epoch": 1.8871263983970614, "grad_norm": 0.26774828007915513, "learning_rate": 3.636743499531385e-06, "loss": 0.0204, "step": 5651 }, { "epoch": 1.887460343963934, "grad_norm": 0.243076448644584, "learning_rate": 3.634873896448644e-06, "loss": 0.0213, "step": 5652 }, { "epoch": 1.8877942895308064, "grad_norm": 0.20326265865240656, "learning_rate": 3.633004499584275e-06, "loss": 0.0179, "step": 5653 }, { "epoch": 1.8881282350976791, "grad_norm": 0.2484244726530193, "learning_rate": 3.6311353092206723e-06, "loss": 0.0201, "step": 5654 }, { "epoch": 1.8884621806645516, "grad_norm": 0.28341009012279234, "learning_rate": 3.6292663256401967e-06, "loss": 0.0276, "step": 5655 }, { "epoch": 1.8887961262314241, "grad_norm": 0.3091307445849336, "learning_rate": 3.6273975491251844e-06, "loss": 0.0272, "step": 5656 }, { "epoch": 1.8891300717982968, "grad_norm": 0.2813763442913171, "learning_rate": 3.625528979957935e-06, "loss": 0.0208, "step": 5657 }, { "epoch": 1.8894640173651696, "grad_norm": 0.30988343067989294, "learning_rate": 3.6236606184207164e-06, "loss": 0.0221, "step": 5658 }, { "epoch": 1.889797962932042, "grad_norm": 0.30786214327412476, "learning_rate": 3.621792464795767e-06, "loss": 0.0255, "step": 5659 }, { "epoch": 1.8901319084989145, "grad_norm": 0.24506763944937457, "learning_rate": 3.6199245193652944e-06, "loss": 0.0217, "step": 5660 }, { "epoch": 1.8904658540657873, "grad_norm": 0.24257643910939025, "learning_rate": 3.6180567824114715e-06, "loss": 0.0221, "step": 5661 }, { "epoch": 1.89079979963266, "grad_norm": 0.2831029912470948, "learning_rate": 3.6161892542164444e-06, "loss": 0.0202, "step": 5662 }, { "epoch": 1.8911337451995325, "grad_norm": 0.29041372715845953, "learning_rate": 3.614321935062325e-06, "loss": 0.0212, "step": 5663 }, { "epoch": 1.891467690766405, "grad_norm": 0.27341642196534827, "learning_rate": 3.6124548252311918e-06, "loss": 0.0289, "step": 5664 }, { "epoch": 1.8918016363332777, "grad_norm": 0.3695378161978298, "learning_rate": 3.610587925005097e-06, "loss": 0.0306, "step": 5665 }, { "epoch": 1.8921355819001504, "grad_norm": 0.39052305016986566, "learning_rate": 3.608721234666054e-06, "loss": 0.0321, "step": 5666 }, { "epoch": 1.892469527467023, "grad_norm": 0.25340199290291054, "learning_rate": 3.6068547544960493e-06, "loss": 0.0249, "step": 5667 }, { "epoch": 1.8928034730338954, "grad_norm": 0.20473054344839864, "learning_rate": 3.6049884847770396e-06, "loss": 0.0178, "step": 5668 }, { "epoch": 1.8931374186007681, "grad_norm": 0.2541131857332617, "learning_rate": 3.6031224257909448e-06, "loss": 0.0185, "step": 5669 }, { "epoch": 1.8934713641676408, "grad_norm": 0.32645449183874226, "learning_rate": 3.6012565778196552e-06, "loss": 0.0331, "step": 5670 }, { "epoch": 1.893805309734513, "grad_norm": 0.4710526248291429, "learning_rate": 3.5993909411450297e-06, "loss": 0.0314, "step": 5671 }, { "epoch": 1.8941392553013858, "grad_norm": 0.43063289232438423, "learning_rate": 3.597525516048894e-06, "loss": 0.0329, "step": 5672 }, { "epoch": 1.8944732008682585, "grad_norm": 0.3069695240366432, "learning_rate": 3.5956603028130397e-06, "loss": 0.0293, "step": 5673 }, { "epoch": 1.894807146435131, "grad_norm": 0.3191714135604289, "learning_rate": 3.5937953017192356e-06, "loss": 0.0323, "step": 5674 }, { "epoch": 1.8951410920020035, "grad_norm": 0.2955595424077923, "learning_rate": 3.591930513049208e-06, "loss": 0.0267, "step": 5675 }, { "epoch": 1.8954750375688763, "grad_norm": 0.2569862023773537, "learning_rate": 3.5900659370846556e-06, "loss": 0.0184, "step": 5676 }, { "epoch": 1.895808983135749, "grad_norm": 0.29791923924704183, "learning_rate": 3.5882015741072464e-06, "loss": 0.0295, "step": 5677 }, { "epoch": 1.8961429287026215, "grad_norm": 0.33406877157660403, "learning_rate": 3.586337424398609e-06, "loss": 0.0292, "step": 5678 }, { "epoch": 1.896476874269494, "grad_norm": 0.30303765639025965, "learning_rate": 3.584473488240352e-06, "loss": 0.0365, "step": 5679 }, { "epoch": 1.8968108198363667, "grad_norm": 0.31733216907511996, "learning_rate": 3.5826097659140413e-06, "loss": 0.0259, "step": 5680 }, { "epoch": 1.8971447654032394, "grad_norm": 0.22585600941395312, "learning_rate": 3.5807462577012152e-06, "loss": 0.0221, "step": 5681 }, { "epoch": 1.897478710970112, "grad_norm": 0.25590338746280156, "learning_rate": 3.5788829638833777e-06, "loss": 0.0253, "step": 5682 }, { "epoch": 1.8978126565369844, "grad_norm": 0.26186866055214336, "learning_rate": 3.5770198847420016e-06, "loss": 0.0226, "step": 5683 }, { "epoch": 1.898146602103857, "grad_norm": 0.2633618834673288, "learning_rate": 3.5751570205585264e-06, "loss": 0.0249, "step": 5684 }, { "epoch": 1.8984805476707298, "grad_norm": 0.28940639133878754, "learning_rate": 3.573294371614361e-06, "loss": 0.0211, "step": 5685 }, { "epoch": 1.8988144932376023, "grad_norm": 0.3674848153022131, "learning_rate": 3.571431938190879e-06, "loss": 0.0333, "step": 5686 }, { "epoch": 1.8991484388044748, "grad_norm": 0.3772093176187774, "learning_rate": 3.5695697205694246e-06, "loss": 0.0207, "step": 5687 }, { "epoch": 1.8994823843713475, "grad_norm": 0.279200525858833, "learning_rate": 3.567707719031306e-06, "loss": 0.0315, "step": 5688 }, { "epoch": 1.89981632993822, "grad_norm": 0.2888839645900449, "learning_rate": 3.5658459338578016e-06, "loss": 0.0288, "step": 5689 }, { "epoch": 1.9001502755050925, "grad_norm": 0.22384834559700187, "learning_rate": 3.563984365330153e-06, "loss": 0.0249, "step": 5690 }, { "epoch": 1.9004842210719652, "grad_norm": 0.21465859175580254, "learning_rate": 3.562123013729577e-06, "loss": 0.018, "step": 5691 }, { "epoch": 1.900818166638838, "grad_norm": 0.2767172704703618, "learning_rate": 3.56026187933725e-06, "loss": 0.0262, "step": 5692 }, { "epoch": 1.9011521122057105, "grad_norm": 0.23014830000881922, "learning_rate": 3.5584009624343187e-06, "loss": 0.0199, "step": 5693 }, { "epoch": 1.901486057772583, "grad_norm": 0.2751937117181656, "learning_rate": 3.5565402633018963e-06, "loss": 0.0153, "step": 5694 }, { "epoch": 1.9018200033394557, "grad_norm": 0.2824563250664518, "learning_rate": 3.554679782221063e-06, "loss": 0.0301, "step": 5695 }, { "epoch": 1.9021539489063284, "grad_norm": 0.24291693898765357, "learning_rate": 3.552819519472865e-06, "loss": 0.0194, "step": 5696 }, { "epoch": 1.9024878944732009, "grad_norm": 0.2851782921848308, "learning_rate": 3.5509594753383202e-06, "loss": 0.0234, "step": 5697 }, { "epoch": 1.9028218400400734, "grad_norm": 0.27319900921435764, "learning_rate": 3.5490996500984085e-06, "loss": 0.0306, "step": 5698 }, { "epoch": 1.903155785606946, "grad_norm": 0.31762881081794064, "learning_rate": 3.547240044034079e-06, "loss": 0.0212, "step": 5699 }, { "epoch": 1.9034897311738188, "grad_norm": 0.27509760813151607, "learning_rate": 3.545380657426247e-06, "loss": 0.024, "step": 5700 }, { "epoch": 1.9038236767406913, "grad_norm": 0.30464699764027325, "learning_rate": 3.5435214905557937e-06, "loss": 0.0229, "step": 5701 }, { "epoch": 1.9041576223075638, "grad_norm": 0.24258387734140954, "learning_rate": 3.5416625437035656e-06, "loss": 0.0237, "step": 5702 }, { "epoch": 1.9044915678744365, "grad_norm": 0.23685690862307018, "learning_rate": 3.539803817150385e-06, "loss": 0.0214, "step": 5703 }, { "epoch": 1.904825513441309, "grad_norm": 0.26934409608958326, "learning_rate": 3.5379453111770313e-06, "loss": 0.0258, "step": 5704 }, { "epoch": 1.9051594590081815, "grad_norm": 0.30369357691944876, "learning_rate": 3.536087026064252e-06, "loss": 0.0269, "step": 5705 }, { "epoch": 1.9054934045750542, "grad_norm": 0.25558186749101636, "learning_rate": 3.534228962092766e-06, "loss": 0.0225, "step": 5706 }, { "epoch": 1.905827350141927, "grad_norm": 0.23940305901461514, "learning_rate": 3.5323711195432533e-06, "loss": 0.0237, "step": 5707 }, { "epoch": 1.9061612957087994, "grad_norm": 0.20537004199358336, "learning_rate": 3.530513498696363e-06, "loss": 0.017, "step": 5708 }, { "epoch": 1.906495241275672, "grad_norm": 0.366429410200943, "learning_rate": 3.5286560998327125e-06, "loss": 0.0206, "step": 5709 }, { "epoch": 1.9068291868425447, "grad_norm": 0.23299796097563769, "learning_rate": 3.5267989232328827e-06, "loss": 0.0169, "step": 5710 }, { "epoch": 1.9071631324094174, "grad_norm": 0.31482949941738964, "learning_rate": 3.5249419691774212e-06, "loss": 0.0187, "step": 5711 }, { "epoch": 1.9074970779762899, "grad_norm": 0.2997997130992164, "learning_rate": 3.523085237946844e-06, "loss": 0.0223, "step": 5712 }, { "epoch": 1.9078310235431624, "grad_norm": 0.2357928221885971, "learning_rate": 3.5212287298216306e-06, "loss": 0.0158, "step": 5713 }, { "epoch": 1.908164969110035, "grad_norm": 0.4568262748198393, "learning_rate": 3.5193724450822296e-06, "loss": 0.0276, "step": 5714 }, { "epoch": 1.9084989146769078, "grad_norm": 0.34578103578375924, "learning_rate": 3.517516384009056e-06, "loss": 0.0279, "step": 5715 }, { "epoch": 1.9088328602437803, "grad_norm": 0.23662446686899274, "learning_rate": 3.515660546882488e-06, "loss": 0.0162, "step": 5716 }, { "epoch": 1.9091668058106528, "grad_norm": 0.31818349141824004, "learning_rate": 3.5138049339828718e-06, "loss": 0.0219, "step": 5717 }, { "epoch": 1.9095007513775255, "grad_norm": 0.4035219440624549, "learning_rate": 3.5119495455905194e-06, "loss": 0.0229, "step": 5718 }, { "epoch": 1.9098346969443982, "grad_norm": 0.3607144084836721, "learning_rate": 3.5100943819857082e-06, "loss": 0.0224, "step": 5719 }, { "epoch": 1.9101686425112705, "grad_norm": 0.21702268535076308, "learning_rate": 3.508239443448685e-06, "loss": 0.0182, "step": 5720 }, { "epoch": 1.9105025880781432, "grad_norm": 0.2472802879250286, "learning_rate": 3.5063847302596587e-06, "loss": 0.018, "step": 5721 }, { "epoch": 1.910836533645016, "grad_norm": 0.29950848375670075, "learning_rate": 3.504530242698806e-06, "loss": 0.0241, "step": 5722 }, { "epoch": 1.9111704792118884, "grad_norm": 0.27458335393601785, "learning_rate": 3.5026759810462687e-06, "loss": 0.0164, "step": 5723 }, { "epoch": 1.911504424778761, "grad_norm": 0.21515272952143766, "learning_rate": 3.5008219455821546e-06, "loss": 0.0189, "step": 5724 }, { "epoch": 1.9118383703456336, "grad_norm": 0.27069095778701835, "learning_rate": 3.4989681365865363e-06, "loss": 0.0182, "step": 5725 }, { "epoch": 1.9121723159125064, "grad_norm": 0.26467824055167855, "learning_rate": 3.497114554339457e-06, "loss": 0.019, "step": 5726 }, { "epoch": 1.9125062614793789, "grad_norm": 0.469183792243403, "learning_rate": 3.4952611991209197e-06, "loss": 0.0313, "step": 5727 }, { "epoch": 1.9128402070462514, "grad_norm": 0.252396180883767, "learning_rate": 3.4934080712108964e-06, "loss": 0.0225, "step": 5728 }, { "epoch": 1.913174152613124, "grad_norm": 0.2550532671470629, "learning_rate": 3.4915551708893236e-06, "loss": 0.0201, "step": 5729 }, { "epoch": 1.9135080981799968, "grad_norm": 0.25301491539265647, "learning_rate": 3.489702498436103e-06, "loss": 0.0246, "step": 5730 }, { "epoch": 1.9138420437468693, "grad_norm": 0.3181839797481373, "learning_rate": 3.487850054131103e-06, "loss": 0.0238, "step": 5731 }, { "epoch": 1.9141759893137418, "grad_norm": 0.28803121860494485, "learning_rate": 3.4859978382541575e-06, "loss": 0.0277, "step": 5732 }, { "epoch": 1.9145099348806145, "grad_norm": 0.26992756840531185, "learning_rate": 3.4841458510850656e-06, "loss": 0.0208, "step": 5733 }, { "epoch": 1.9148438804474872, "grad_norm": 0.25450634681763407, "learning_rate": 3.482294092903592e-06, "loss": 0.0176, "step": 5734 }, { "epoch": 1.9151778260143597, "grad_norm": 0.27097711118186313, "learning_rate": 3.480442563989466e-06, "loss": 0.02, "step": 5735 }, { "epoch": 1.9155117715812322, "grad_norm": 0.2543927295526409, "learning_rate": 3.4785912646223813e-06, "loss": 0.0248, "step": 5736 }, { "epoch": 1.915845717148105, "grad_norm": 0.3572264494090962, "learning_rate": 3.4767401950820003e-06, "loss": 0.0348, "step": 5737 }, { "epoch": 1.9161796627149774, "grad_norm": 0.2006125898602107, "learning_rate": 3.4748893556479497e-06, "loss": 0.0149, "step": 5738 }, { "epoch": 1.91651360828185, "grad_norm": 0.17789457397292407, "learning_rate": 3.4730387465998194e-06, "loss": 0.0138, "step": 5739 }, { "epoch": 1.9168475538487226, "grad_norm": 0.313329535925791, "learning_rate": 3.4711883682171666e-06, "loss": 0.0248, "step": 5740 }, { "epoch": 1.9171814994155953, "grad_norm": 0.25576284198479743, "learning_rate": 3.4693382207795114e-06, "loss": 0.0186, "step": 5741 }, { "epoch": 1.9175154449824678, "grad_norm": 0.259449240986749, "learning_rate": 3.4674883045663404e-06, "loss": 0.0135, "step": 5742 }, { "epoch": 1.9178493905493403, "grad_norm": 0.2631975844534271, "learning_rate": 3.465638619857104e-06, "loss": 0.0221, "step": 5743 }, { "epoch": 1.918183336116213, "grad_norm": 0.3371311670108778, "learning_rate": 3.463789166931223e-06, "loss": 0.0317, "step": 5744 }, { "epoch": 1.9185172816830858, "grad_norm": 0.23233644899160083, "learning_rate": 3.4619399460680757e-06, "loss": 0.0158, "step": 5745 }, { "epoch": 1.9188512272499583, "grad_norm": 0.4168115584854737, "learning_rate": 3.460090957547011e-06, "loss": 0.0185, "step": 5746 }, { "epoch": 1.9191851728168308, "grad_norm": 0.22650684368857574, "learning_rate": 3.4582422016473384e-06, "loss": 0.0171, "step": 5747 }, { "epoch": 1.9195191183837035, "grad_norm": 0.2607711815041672, "learning_rate": 3.4563936786483345e-06, "loss": 0.02, "step": 5748 }, { "epoch": 1.9198530639505762, "grad_norm": 0.2680289256407857, "learning_rate": 3.454545388829239e-06, "loss": 0.0245, "step": 5749 }, { "epoch": 1.9201870095174487, "grad_norm": 0.30776760651468293, "learning_rate": 3.4526973324692614e-06, "loss": 0.0272, "step": 5750 }, { "epoch": 1.9205209550843212, "grad_norm": 0.297492885355231, "learning_rate": 3.4508495098475712e-06, "loss": 0.0257, "step": 5751 }, { "epoch": 1.920854900651194, "grad_norm": 0.3382392401642789, "learning_rate": 3.4490019212433035e-06, "loss": 0.0411, "step": 5752 }, { "epoch": 1.9211888462180664, "grad_norm": 0.305923167577958, "learning_rate": 3.447154566935557e-06, "loss": 0.0247, "step": 5753 }, { "epoch": 1.921522791784939, "grad_norm": 0.2524122769731954, "learning_rate": 3.4453074472033975e-06, "loss": 0.0188, "step": 5754 }, { "epoch": 1.9218567373518116, "grad_norm": 0.28425994275426447, "learning_rate": 3.443460562325853e-06, "loss": 0.022, "step": 5755 }, { "epoch": 1.9221906829186843, "grad_norm": 0.3118905363265386, "learning_rate": 3.4416139125819204e-06, "loss": 0.0264, "step": 5756 }, { "epoch": 1.9225246284855568, "grad_norm": 0.3281207577346625, "learning_rate": 3.4397674982505546e-06, "loss": 0.0255, "step": 5757 }, { "epoch": 1.9228585740524293, "grad_norm": 0.28634068475023605, "learning_rate": 3.43792131961068e-06, "loss": 0.0238, "step": 5758 }, { "epoch": 1.923192519619302, "grad_norm": 0.21798902076613347, "learning_rate": 3.4360753769411816e-06, "loss": 0.0154, "step": 5759 }, { "epoch": 1.9235264651861748, "grad_norm": 0.32068310275870915, "learning_rate": 3.4342296705209112e-06, "loss": 0.0223, "step": 5760 }, { "epoch": 1.9238604107530473, "grad_norm": 0.22021435612465068, "learning_rate": 3.432384200628688e-06, "loss": 0.0211, "step": 5761 }, { "epoch": 1.9241943563199198, "grad_norm": 0.24396127581095575, "learning_rate": 3.4305389675432882e-06, "loss": 0.0197, "step": 5762 }, { "epoch": 1.9245283018867925, "grad_norm": 0.36661985082876813, "learning_rate": 3.4286939715434573e-06, "loss": 0.0256, "step": 5763 }, { "epoch": 1.9248622474536652, "grad_norm": 0.29503627923718295, "learning_rate": 3.4268492129079047e-06, "loss": 0.0218, "step": 5764 }, { "epoch": 1.9251961930205377, "grad_norm": 0.2542535312199132, "learning_rate": 3.4250046919153e-06, "loss": 0.0262, "step": 5765 }, { "epoch": 1.9255301385874102, "grad_norm": 0.2847098668501808, "learning_rate": 3.4231604088442806e-06, "loss": 0.0219, "step": 5766 }, { "epoch": 1.925864084154283, "grad_norm": 0.2625897038985264, "learning_rate": 3.4213163639734504e-06, "loss": 0.0169, "step": 5767 }, { "epoch": 1.9261980297211556, "grad_norm": 0.2753234418285163, "learning_rate": 3.4194725575813707e-06, "loss": 0.0212, "step": 5768 }, { "epoch": 1.9265319752880279, "grad_norm": 0.22249139602145235, "learning_rate": 3.417628989946572e-06, "loss": 0.0199, "step": 5769 }, { "epoch": 1.9268659208549006, "grad_norm": 0.2643160394653972, "learning_rate": 3.415785661347546e-06, "loss": 0.0197, "step": 5770 }, { "epoch": 1.9271998664217733, "grad_norm": 0.3020632813878166, "learning_rate": 3.4139425720627494e-06, "loss": 0.0259, "step": 5771 }, { "epoch": 1.9275338119886458, "grad_norm": 0.43705020368872793, "learning_rate": 3.412099722370601e-06, "loss": 0.0228, "step": 5772 }, { "epoch": 1.9278677575555183, "grad_norm": 0.31263637342794637, "learning_rate": 3.4102571125494877e-06, "loss": 0.0274, "step": 5773 }, { "epoch": 1.928201703122391, "grad_norm": 0.3005120875419277, "learning_rate": 3.408414742877757e-06, "loss": 0.0206, "step": 5774 }, { "epoch": 1.9285356486892637, "grad_norm": 0.314114858724423, "learning_rate": 3.406572613633719e-06, "loss": 0.0238, "step": 5775 }, { "epoch": 1.9288695942561362, "grad_norm": 0.20709489788582736, "learning_rate": 3.40473072509565e-06, "loss": 0.0179, "step": 5776 }, { "epoch": 1.9292035398230087, "grad_norm": 0.2079678458008911, "learning_rate": 3.4028890775417887e-06, "loss": 0.0151, "step": 5777 }, { "epoch": 1.9295374853898815, "grad_norm": 0.3596915574099437, "learning_rate": 3.4010476712503367e-06, "loss": 0.031, "step": 5778 }, { "epoch": 1.9298714309567542, "grad_norm": 0.2911308911076081, "learning_rate": 3.3992065064994615e-06, "loss": 0.0204, "step": 5779 }, { "epoch": 1.9302053765236267, "grad_norm": 0.2545928557163873, "learning_rate": 3.3973655835672923e-06, "loss": 0.0269, "step": 5780 }, { "epoch": 1.9305393220904992, "grad_norm": 0.231130465888615, "learning_rate": 3.3955249027319214e-06, "loss": 0.0202, "step": 5781 }, { "epoch": 1.9308732676573719, "grad_norm": 0.3375892586951769, "learning_rate": 3.3936844642714073e-06, "loss": 0.0279, "step": 5782 }, { "epoch": 1.9312072132242446, "grad_norm": 0.2477328990343217, "learning_rate": 3.3918442684637687e-06, "loss": 0.0273, "step": 5783 }, { "epoch": 1.931541158791117, "grad_norm": 0.29131288253390225, "learning_rate": 3.3900043155869865e-06, "loss": 0.0218, "step": 5784 }, { "epoch": 1.9318751043579896, "grad_norm": 0.240469610356494, "learning_rate": 3.388164605919012e-06, "loss": 0.0234, "step": 5785 }, { "epoch": 1.9322090499248623, "grad_norm": 0.3167886733801051, "learning_rate": 3.3863251397377516e-06, "loss": 0.0195, "step": 5786 }, { "epoch": 1.9325429954917348, "grad_norm": 0.2639229475840124, "learning_rate": 3.3844859173210797e-06, "loss": 0.0232, "step": 5787 }, { "epoch": 1.9328769410586073, "grad_norm": 0.3323833296643505, "learning_rate": 3.382646938946832e-06, "loss": 0.0219, "step": 5788 }, { "epoch": 1.93321088662548, "grad_norm": 0.2609261762353949, "learning_rate": 3.3808082048928083e-06, "loss": 0.0182, "step": 5789 }, { "epoch": 1.9335448321923527, "grad_norm": 0.26727327381174226, "learning_rate": 3.378969715436767e-06, "loss": 0.0201, "step": 5790 }, { "epoch": 1.9338787777592252, "grad_norm": 0.2311478036321507, "learning_rate": 3.3771314708564408e-06, "loss": 0.0234, "step": 5791 }, { "epoch": 1.9342127233260977, "grad_norm": 0.31724214783039545, "learning_rate": 3.3752934714295146e-06, "loss": 0.0282, "step": 5792 }, { "epoch": 1.9345466688929704, "grad_norm": 0.3639905507013953, "learning_rate": 3.373455717433639e-06, "loss": 0.0358, "step": 5793 }, { "epoch": 1.9348806144598432, "grad_norm": 0.29177698252553197, "learning_rate": 3.3716182091464295e-06, "loss": 0.0197, "step": 5794 }, { "epoch": 1.9352145600267157, "grad_norm": 0.34356793338454994, "learning_rate": 3.3697809468454634e-06, "loss": 0.0276, "step": 5795 }, { "epoch": 1.9355485055935882, "grad_norm": 0.3375503364780255, "learning_rate": 3.3679439308082777e-06, "loss": 0.024, "step": 5796 }, { "epoch": 1.9358824511604609, "grad_norm": 0.28808755513578127, "learning_rate": 3.366107161312381e-06, "loss": 0.0228, "step": 5797 }, { "epoch": 1.9362163967273336, "grad_norm": 0.3256739763787763, "learning_rate": 3.3642706386352355e-06, "loss": 0.0317, "step": 5798 }, { "epoch": 1.936550342294206, "grad_norm": 0.47020890589467057, "learning_rate": 3.3624343630542707e-06, "loss": 0.0198, "step": 5799 }, { "epoch": 1.9368842878610786, "grad_norm": 0.2845018000102603, "learning_rate": 3.3605983348468764e-06, "loss": 0.0241, "step": 5800 }, { "epoch": 1.9372182334279513, "grad_norm": 0.41753467962227386, "learning_rate": 3.3587625542904063e-06, "loss": 0.0335, "step": 5801 }, { "epoch": 1.9375521789948238, "grad_norm": 0.3515233072582555, "learning_rate": 3.356927021662178e-06, "loss": 0.0276, "step": 5802 }, { "epoch": 1.9378861245616963, "grad_norm": 0.2916770899627356, "learning_rate": 3.3550917372394696e-06, "loss": 0.023, "step": 5803 }, { "epoch": 1.938220070128569, "grad_norm": 0.2270008434678482, "learning_rate": 3.353256701299522e-06, "loss": 0.0232, "step": 5804 }, { "epoch": 1.9385540156954417, "grad_norm": 0.2586504692258007, "learning_rate": 3.3514219141195404e-06, "loss": 0.0184, "step": 5805 }, { "epoch": 1.9388879612623142, "grad_norm": 0.2470140225476682, "learning_rate": 3.3495873759766897e-06, "loss": 0.0204, "step": 5806 }, { "epoch": 1.9392219068291867, "grad_norm": 0.2258297573077572, "learning_rate": 3.347753087148098e-06, "loss": 0.0213, "step": 5807 }, { "epoch": 1.9395558523960594, "grad_norm": 0.33504239696759075, "learning_rate": 3.3459190479108583e-06, "loss": 0.0291, "step": 5808 }, { "epoch": 1.9398897979629321, "grad_norm": 0.31120048688790203, "learning_rate": 3.344085258542022e-06, "loss": 0.028, "step": 5809 }, { "epoch": 1.9402237435298046, "grad_norm": 0.26852814800786234, "learning_rate": 3.3422517193186056e-06, "loss": 0.0249, "step": 5810 }, { "epoch": 1.9405576890966771, "grad_norm": 0.2889792359846112, "learning_rate": 3.340418430517586e-06, "loss": 0.0248, "step": 5811 }, { "epoch": 1.9408916346635499, "grad_norm": 0.3341469130820314, "learning_rate": 3.338585392415904e-06, "loss": 0.0321, "step": 5812 }, { "epoch": 1.9412255802304226, "grad_norm": 0.430265299263613, "learning_rate": 3.3367526052904585e-06, "loss": 0.0187, "step": 5813 }, { "epoch": 1.941559525797295, "grad_norm": 0.46345645303214994, "learning_rate": 3.3349200694181182e-06, "loss": 0.0256, "step": 5814 }, { "epoch": 1.9418934713641676, "grad_norm": 0.26956771851846767, "learning_rate": 3.333087785075707e-06, "loss": 0.0209, "step": 5815 }, { "epoch": 1.9422274169310403, "grad_norm": 0.27106704261332304, "learning_rate": 3.3312557525400133e-06, "loss": 0.0178, "step": 5816 }, { "epoch": 1.942561362497913, "grad_norm": 0.35578560827960454, "learning_rate": 3.329423972087787e-06, "loss": 0.0286, "step": 5817 }, { "epoch": 1.9428953080647853, "grad_norm": 0.33650992551520637, "learning_rate": 3.3275924439957397e-06, "loss": 0.0328, "step": 5818 }, { "epoch": 1.943229253631658, "grad_norm": 0.26326183730826697, "learning_rate": 3.3257611685405444e-06, "loss": 0.0135, "step": 5819 }, { "epoch": 1.9435631991985307, "grad_norm": 0.38701056914123266, "learning_rate": 3.3239301459988395e-06, "loss": 0.0445, "step": 5820 }, { "epoch": 1.9438971447654032, "grad_norm": 0.354903143809386, "learning_rate": 3.322099376647221e-06, "loss": 0.0302, "step": 5821 }, { "epoch": 1.9442310903322757, "grad_norm": 0.24735400000002247, "learning_rate": 3.320268860762249e-06, "loss": 0.0158, "step": 5822 }, { "epoch": 1.9445650358991484, "grad_norm": 0.255502633960381, "learning_rate": 3.318438598620444e-06, "loss": 0.0186, "step": 5823 }, { "epoch": 1.9448989814660211, "grad_norm": 0.28664866311148385, "learning_rate": 3.316608590498287e-06, "loss": 0.0207, "step": 5824 }, { "epoch": 1.9452329270328936, "grad_norm": 0.3129524544312331, "learning_rate": 3.314778836672224e-06, "loss": 0.0166, "step": 5825 }, { "epoch": 1.9455668725997661, "grad_norm": 0.2518869915826915, "learning_rate": 3.312949337418661e-06, "loss": 0.0207, "step": 5826 }, { "epoch": 1.9459008181666388, "grad_norm": 0.3028117432261122, "learning_rate": 3.311120093013964e-06, "loss": 0.0335, "step": 5827 }, { "epoch": 1.9462347637335116, "grad_norm": 0.257864600229698, "learning_rate": 3.3092911037344642e-06, "loss": 0.0226, "step": 5828 }, { "epoch": 1.946568709300384, "grad_norm": 0.23405019122218812, "learning_rate": 3.30746236985645e-06, "loss": 0.0178, "step": 5829 }, { "epoch": 1.9469026548672566, "grad_norm": 0.31640288842976955, "learning_rate": 3.305633891656175e-06, "loss": 0.0256, "step": 5830 }, { "epoch": 1.9472366004341293, "grad_norm": 0.2811885023984529, "learning_rate": 3.3038056694098485e-06, "loss": 0.0243, "step": 5831 }, { "epoch": 1.947570546001002, "grad_norm": 0.2595925379709668, "learning_rate": 3.3019777033936497e-06, "loss": 0.0217, "step": 5832 }, { "epoch": 1.9479044915678745, "grad_norm": 0.2987458082606157, "learning_rate": 3.3001499938837124e-06, "loss": 0.0235, "step": 5833 }, { "epoch": 1.948238437134747, "grad_norm": 0.3306504487927109, "learning_rate": 3.2983225411561338e-06, "loss": 0.0244, "step": 5834 }, { "epoch": 1.9485723827016197, "grad_norm": 0.3396602763806835, "learning_rate": 3.296495345486971e-06, "loss": 0.0288, "step": 5835 }, { "epoch": 1.9489063282684922, "grad_norm": 0.2226756267363155, "learning_rate": 3.294668407152245e-06, "loss": 0.0195, "step": 5836 }, { "epoch": 1.9492402738353647, "grad_norm": 0.2801191040536907, "learning_rate": 3.2928417264279338e-06, "loss": 0.0215, "step": 5837 }, { "epoch": 1.9495742194022374, "grad_norm": 0.24784811196277576, "learning_rate": 3.2910153035899826e-06, "loss": 0.0206, "step": 5838 }, { "epoch": 1.9499081649691101, "grad_norm": 0.34129288191423274, "learning_rate": 3.2891891389142933e-06, "loss": 0.0306, "step": 5839 }, { "epoch": 1.9502421105359826, "grad_norm": 0.22300306004602136, "learning_rate": 3.2873632326767278e-06, "loss": 0.0215, "step": 5840 }, { "epoch": 1.9505760561028551, "grad_norm": 0.23560460974879197, "learning_rate": 3.2855375851531122e-06, "loss": 0.0178, "step": 5841 }, { "epoch": 1.9509100016697278, "grad_norm": 0.31498906746012384, "learning_rate": 3.283712196619229e-06, "loss": 0.0214, "step": 5842 }, { "epoch": 1.9512439472366006, "grad_norm": 0.3113196487782054, "learning_rate": 3.2818870673508297e-06, "loss": 0.0278, "step": 5843 }, { "epoch": 1.951577892803473, "grad_norm": 0.21125492883964472, "learning_rate": 3.2800621976236184e-06, "loss": 0.0158, "step": 5844 }, { "epoch": 1.9519118383703455, "grad_norm": 0.28307853134694105, "learning_rate": 3.2782375877132643e-06, "loss": 0.0205, "step": 5845 }, { "epoch": 1.9522457839372183, "grad_norm": 0.2999456377062616, "learning_rate": 3.276413237895395e-06, "loss": 0.0232, "step": 5846 }, { "epoch": 1.952579729504091, "grad_norm": 0.2567854381982598, "learning_rate": 3.2745891484456016e-06, "loss": 0.0245, "step": 5847 }, { "epoch": 1.9529136750709635, "grad_norm": 0.38292431688849965, "learning_rate": 3.2727653196394314e-06, "loss": 0.0247, "step": 5848 }, { "epoch": 1.953247620637836, "grad_norm": 0.2800679231787765, "learning_rate": 3.270941751752398e-06, "loss": 0.0264, "step": 5849 }, { "epoch": 1.9535815662047087, "grad_norm": 0.19956091896284442, "learning_rate": 3.269118445059973e-06, "loss": 0.0167, "step": 5850 }, { "epoch": 1.9539155117715812, "grad_norm": 0.26335270714390147, "learning_rate": 3.267295399837587e-06, "loss": 0.0254, "step": 5851 }, { "epoch": 1.9542494573384537, "grad_norm": 0.23201558038246078, "learning_rate": 3.2654726163606333e-06, "loss": 0.024, "step": 5852 }, { "epoch": 1.9545834029053264, "grad_norm": 0.33735614001278114, "learning_rate": 3.2636500949044637e-06, "loss": 0.0371, "step": 5853 }, { "epoch": 1.9549173484721991, "grad_norm": 0.29281664745168995, "learning_rate": 3.2618278357443913e-06, "loss": 0.0246, "step": 5854 }, { "epoch": 1.9552512940390716, "grad_norm": 0.23201336782090234, "learning_rate": 3.260005839155691e-06, "loss": 0.0198, "step": 5855 }, { "epoch": 1.955585239605944, "grad_norm": 0.2814654517399288, "learning_rate": 3.258184105413597e-06, "loss": 0.0276, "step": 5856 }, { "epoch": 1.9559191851728168, "grad_norm": 0.21019308925441066, "learning_rate": 3.256362634793303e-06, "loss": 0.0202, "step": 5857 }, { "epoch": 1.9562531307396895, "grad_norm": 0.4581378154718263, "learning_rate": 3.2545414275699638e-06, "loss": 0.0302, "step": 5858 }, { "epoch": 1.956587076306562, "grad_norm": 0.262252359623129, "learning_rate": 3.2527204840186944e-06, "loss": 0.0237, "step": 5859 }, { "epoch": 1.9569210218734345, "grad_norm": 0.4503845110632144, "learning_rate": 3.2508998044145674e-06, "loss": 0.0188, "step": 5860 }, { "epoch": 1.9572549674403072, "grad_norm": 0.20579539512288297, "learning_rate": 3.249079389032621e-06, "loss": 0.0136, "step": 5861 }, { "epoch": 1.95758891300718, "grad_norm": 0.3146515037895394, "learning_rate": 3.247259238147851e-06, "loss": 0.0233, "step": 5862 }, { "epoch": 1.9579228585740525, "grad_norm": 0.2913048497578694, "learning_rate": 3.245439352035209e-06, "loss": 0.0211, "step": 5863 }, { "epoch": 1.958256804140925, "grad_norm": 0.22715714845211402, "learning_rate": 3.243619730969614e-06, "loss": 0.0168, "step": 5864 }, { "epoch": 1.9585907497077977, "grad_norm": 0.21930421251497045, "learning_rate": 3.2418003752259374e-06, "loss": 0.0173, "step": 5865 }, { "epoch": 1.9589246952746704, "grad_norm": 0.21710261658100735, "learning_rate": 3.239981285079016e-06, "loss": 0.0186, "step": 5866 }, { "epoch": 1.9592586408415427, "grad_norm": 0.27064666090436496, "learning_rate": 3.238162460803646e-06, "loss": 0.028, "step": 5867 }, { "epoch": 1.9595925864084154, "grad_norm": 0.23906712649623107, "learning_rate": 3.2363439026745813e-06, "loss": 0.0184, "step": 5868 }, { "epoch": 1.959926531975288, "grad_norm": 0.2076308046728014, "learning_rate": 3.2345256109665366e-06, "loss": 0.0137, "step": 5869 }, { "epoch": 1.9602604775421606, "grad_norm": 0.2954053456605062, "learning_rate": 3.2327075859541867e-06, "loss": 0.026, "step": 5870 }, { "epoch": 1.960594423109033, "grad_norm": 0.25075346173026614, "learning_rate": 3.2308898279121646e-06, "loss": 0.0219, "step": 5871 }, { "epoch": 1.9609283686759058, "grad_norm": 0.28114262490316694, "learning_rate": 3.2290723371150627e-06, "loss": 0.0223, "step": 5872 }, { "epoch": 1.9612623142427785, "grad_norm": 0.2210698203821854, "learning_rate": 3.2272551138374387e-06, "loss": 0.018, "step": 5873 }, { "epoch": 1.961596259809651, "grad_norm": 0.24530615285130294, "learning_rate": 3.2254381583538025e-06, "loss": 0.0211, "step": 5874 }, { "epoch": 1.9619302053765235, "grad_norm": 0.31106545427697324, "learning_rate": 3.223621470938628e-06, "loss": 0.0238, "step": 5875 }, { "epoch": 1.9622641509433962, "grad_norm": 0.2660833208769417, "learning_rate": 3.2218050518663457e-06, "loss": 0.0213, "step": 5876 }, { "epoch": 1.962598096510269, "grad_norm": 0.21743227427686262, "learning_rate": 3.219988901411347e-06, "loss": 0.0223, "step": 5877 }, { "epoch": 1.9629320420771414, "grad_norm": 0.24486708130129758, "learning_rate": 3.218173019847985e-06, "loss": 0.0135, "step": 5878 }, { "epoch": 1.963265987644014, "grad_norm": 0.35907634828149954, "learning_rate": 3.2163574074505686e-06, "loss": 0.027, "step": 5879 }, { "epoch": 1.9635999332108867, "grad_norm": 0.32844558924637945, "learning_rate": 3.214542064493367e-06, "loss": 0.0219, "step": 5880 }, { "epoch": 1.9639338787777594, "grad_norm": 0.35326428299476076, "learning_rate": 3.2127269912506103e-06, "loss": 0.0248, "step": 5881 }, { "epoch": 1.9642678243446319, "grad_norm": 0.32373207491017764, "learning_rate": 3.210912187996486e-06, "loss": 0.0276, "step": 5882 }, { "epoch": 1.9646017699115044, "grad_norm": 0.26299833241497556, "learning_rate": 3.2090976550051393e-06, "loss": 0.0241, "step": 5883 }, { "epoch": 1.964935715478377, "grad_norm": 0.40795756898929836, "learning_rate": 3.207283392550681e-06, "loss": 0.0262, "step": 5884 }, { "epoch": 1.9652696610452496, "grad_norm": 0.2401971528025246, "learning_rate": 3.2054694009071753e-06, "loss": 0.0176, "step": 5885 }, { "epoch": 1.965603606612122, "grad_norm": 0.30160405688234604, "learning_rate": 3.2036556803486465e-06, "loss": 0.0315, "step": 5886 }, { "epoch": 1.9659375521789948, "grad_norm": 0.4733971362045815, "learning_rate": 3.2018422311490778e-06, "loss": 0.0231, "step": 5887 }, { "epoch": 1.9662714977458675, "grad_norm": 0.27799211423995956, "learning_rate": 3.200029053582413e-06, "loss": 0.03, "step": 5888 }, { "epoch": 1.96660544331274, "grad_norm": 0.26146796580960074, "learning_rate": 3.1982161479225514e-06, "loss": 0.0191, "step": 5889 }, { "epoch": 1.9669393888796125, "grad_norm": 0.32179812754743276, "learning_rate": 3.196403514443358e-06, "loss": 0.0277, "step": 5890 }, { "epoch": 1.9672733344464852, "grad_norm": 0.25901646040108445, "learning_rate": 3.19459115341865e-06, "loss": 0.0213, "step": 5891 }, { "epoch": 1.967607280013358, "grad_norm": 0.2580956396135416, "learning_rate": 3.1927790651222073e-06, "loss": 0.0273, "step": 5892 }, { "epoch": 1.9679412255802304, "grad_norm": 0.3359443142087912, "learning_rate": 3.1909672498277656e-06, "loss": 0.0222, "step": 5893 }, { "epoch": 1.968275171147103, "grad_norm": 0.3314281062252131, "learning_rate": 3.1891557078090218e-06, "loss": 0.0309, "step": 5894 }, { "epoch": 1.9686091167139756, "grad_norm": 0.34978716289236084, "learning_rate": 3.187344439339628e-06, "loss": 0.0361, "step": 5895 }, { "epoch": 1.9689430622808484, "grad_norm": 0.34410950313006444, "learning_rate": 3.1855334446932025e-06, "loss": 0.0307, "step": 5896 }, { "epoch": 1.9692770078477209, "grad_norm": 0.20713580653399105, "learning_rate": 3.1837227241433145e-06, "loss": 0.0186, "step": 5897 }, { "epoch": 1.9696109534145934, "grad_norm": 0.24913821861245006, "learning_rate": 3.181912277963495e-06, "loss": 0.027, "step": 5898 }, { "epoch": 1.969944898981466, "grad_norm": 0.2547974035629979, "learning_rate": 3.180102106427233e-06, "loss": 0.0261, "step": 5899 }, { "epoch": 1.9702788445483386, "grad_norm": 0.8450885560275477, "learning_rate": 3.178292209807976e-06, "loss": 0.0271, "step": 5900 }, { "epoch": 1.970612790115211, "grad_norm": 0.29206840523406574, "learning_rate": 3.1764825883791306e-06, "loss": 0.0229, "step": 5901 }, { "epoch": 1.9709467356820838, "grad_norm": 0.30484728775592423, "learning_rate": 3.174673242414062e-06, "loss": 0.0257, "step": 5902 }, { "epoch": 1.9712806812489565, "grad_norm": 0.37122635945952387, "learning_rate": 3.1728641721860925e-06, "loss": 0.0222, "step": 5903 }, { "epoch": 1.971614626815829, "grad_norm": 0.2567382911579384, "learning_rate": 3.1710553779685036e-06, "loss": 0.0223, "step": 5904 }, { "epoch": 1.9719485723827015, "grad_norm": 0.23499162344359104, "learning_rate": 3.169246860034535e-06, "loss": 0.0208, "step": 5905 }, { "epoch": 1.9722825179495742, "grad_norm": 0.2646541002066373, "learning_rate": 3.1674386186573853e-06, "loss": 0.0251, "step": 5906 }, { "epoch": 1.972616463516447, "grad_norm": 0.24678009565546116, "learning_rate": 3.1656306541102073e-06, "loss": 0.021, "step": 5907 }, { "epoch": 1.9729504090833194, "grad_norm": 0.1881145249223792, "learning_rate": 3.16382296666612e-06, "loss": 0.0158, "step": 5908 }, { "epoch": 1.973284354650192, "grad_norm": 0.2901700905718627, "learning_rate": 3.1620155565981942e-06, "loss": 0.0265, "step": 5909 }, { "epoch": 1.9736183002170646, "grad_norm": 0.34233287596476747, "learning_rate": 3.1602084241794595e-06, "loss": 0.0258, "step": 5910 }, { "epoch": 1.9739522457839374, "grad_norm": 0.21072531762872868, "learning_rate": 3.158401569682906e-06, "loss": 0.0218, "step": 5911 }, { "epoch": 1.9742861913508098, "grad_norm": 0.2662037549693113, "learning_rate": 3.156594993381479e-06, "loss": 0.0164, "step": 5912 }, { "epoch": 1.9746201369176823, "grad_norm": 0.19624794911578164, "learning_rate": 3.154788695548082e-06, "loss": 0.0195, "step": 5913 }, { "epoch": 1.974954082484555, "grad_norm": 0.22435623212512112, "learning_rate": 3.152982676455581e-06, "loss": 0.0166, "step": 5914 }, { "epoch": 1.9752880280514278, "grad_norm": 0.2546770021129933, "learning_rate": 3.151176936376794e-06, "loss": 0.0201, "step": 5915 }, { "epoch": 1.9756219736183, "grad_norm": 0.25422196414367904, "learning_rate": 3.1493714755845013e-06, "loss": 0.0172, "step": 5916 }, { "epoch": 1.9759559191851728, "grad_norm": 0.2636805130416644, "learning_rate": 3.1475662943514366e-06, "loss": 0.0182, "step": 5917 }, { "epoch": 1.9762898647520455, "grad_norm": 0.2726425398062032, "learning_rate": 3.145761392950293e-06, "loss": 0.0291, "step": 5918 }, { "epoch": 1.976623810318918, "grad_norm": 0.28387231861318524, "learning_rate": 3.1439567716537268e-06, "loss": 0.026, "step": 5919 }, { "epoch": 1.9769577558857905, "grad_norm": 0.3159811989141398, "learning_rate": 3.142152430734343e-06, "loss": 0.0288, "step": 5920 }, { "epoch": 1.9772917014526632, "grad_norm": 0.23312569497717564, "learning_rate": 3.140348370464711e-06, "loss": 0.0159, "step": 5921 }, { "epoch": 1.977625647019536, "grad_norm": 0.4675278517518357, "learning_rate": 3.138544591117354e-06, "loss": 0.0214, "step": 5922 }, { "epoch": 1.9779595925864084, "grad_norm": 0.2598155604686844, "learning_rate": 3.1367410929647544e-06, "loss": 0.0273, "step": 5923 }, { "epoch": 1.978293538153281, "grad_norm": 0.3597219327094149, "learning_rate": 3.1349378762793515e-06, "loss": 0.0318, "step": 5924 }, { "epoch": 1.9786274837201536, "grad_norm": 0.2603932482412402, "learning_rate": 3.133134941333543e-06, "loss": 0.0191, "step": 5925 }, { "epoch": 1.9789614292870263, "grad_norm": 0.3058024009180753, "learning_rate": 3.1313322883996833e-06, "loss": 0.0229, "step": 5926 }, { "epoch": 1.9792953748538988, "grad_norm": 0.2246348142136579, "learning_rate": 3.129529917750085e-06, "loss": 0.0122, "step": 5927 }, { "epoch": 1.9796293204207713, "grad_norm": 0.2996893775026067, "learning_rate": 3.1277278296570157e-06, "loss": 0.0307, "step": 5928 }, { "epoch": 1.979963265987644, "grad_norm": 0.5614506963389504, "learning_rate": 3.1259260243927035e-06, "loss": 0.0244, "step": 5929 }, { "epoch": 1.9802972115545168, "grad_norm": 0.3243657307907125, "learning_rate": 3.12412450222933e-06, "loss": 0.0229, "step": 5930 }, { "epoch": 1.9806311571213893, "grad_norm": 0.19348918220019287, "learning_rate": 3.12232326343904e-06, "loss": 0.0127, "step": 5931 }, { "epoch": 1.9809651026882618, "grad_norm": 0.28759933936010396, "learning_rate": 3.1205223082939302e-06, "loss": 0.0223, "step": 5932 }, { "epoch": 1.9812990482551345, "grad_norm": 0.23128262171772607, "learning_rate": 3.1187216370660558e-06, "loss": 0.0188, "step": 5933 }, { "epoch": 1.981632993822007, "grad_norm": 0.36061628504818183, "learning_rate": 3.1169212500274294e-06, "loss": 0.0272, "step": 5934 }, { "epoch": 1.9819669393888795, "grad_norm": 0.28025072567451725, "learning_rate": 3.11512114745002e-06, "loss": 0.0202, "step": 5935 }, { "epoch": 1.9823008849557522, "grad_norm": 0.2682571408976759, "learning_rate": 3.113321329605754e-06, "loss": 0.0183, "step": 5936 }, { "epoch": 1.982634830522625, "grad_norm": 0.2645470983431173, "learning_rate": 3.1115217967665174e-06, "loss": 0.0303, "step": 5937 }, { "epoch": 1.9829687760894974, "grad_norm": 0.2340744816177296, "learning_rate": 3.1097225492041494e-06, "loss": 0.0178, "step": 5938 }, { "epoch": 1.98330272165637, "grad_norm": 0.3225713067830606, "learning_rate": 3.107923587190448e-06, "loss": 0.0258, "step": 5939 }, { "epoch": 1.9836366672232426, "grad_norm": 0.27858292660121353, "learning_rate": 3.106124910997168e-06, "loss": 0.0268, "step": 5940 }, { "epoch": 1.9839706127901153, "grad_norm": 0.2682566434750068, "learning_rate": 3.1043265208960187e-06, "loss": 0.0215, "step": 5941 }, { "epoch": 1.9843045583569878, "grad_norm": 0.33843940950024376, "learning_rate": 3.102528417158668e-06, "loss": 0.0219, "step": 5942 }, { "epoch": 1.9846385039238603, "grad_norm": 0.2745822590878245, "learning_rate": 3.1007306000567434e-06, "loss": 0.0246, "step": 5943 }, { "epoch": 1.984972449490733, "grad_norm": 0.24053022466255353, "learning_rate": 3.0989330698618248e-06, "loss": 0.0181, "step": 5944 }, { "epoch": 1.9853063950576058, "grad_norm": 0.21327671700227108, "learning_rate": 3.097135826845451e-06, "loss": 0.0185, "step": 5945 }, { "epoch": 1.9856403406244783, "grad_norm": 0.2619959346125703, "learning_rate": 3.0953388712791155e-06, "loss": 0.0171, "step": 5946 }, { "epoch": 1.9859742861913507, "grad_norm": 0.25444385012693954, "learning_rate": 3.09354220343427e-06, "loss": 0.0206, "step": 5947 }, { "epoch": 1.9863082317582235, "grad_norm": 0.31650063936320205, "learning_rate": 3.0917458235823215e-06, "loss": 0.0247, "step": 5948 }, { "epoch": 1.986642177325096, "grad_norm": 0.3739815208954093, "learning_rate": 3.089949731994637e-06, "loss": 0.0289, "step": 5949 }, { "epoch": 1.9869761228919685, "grad_norm": 0.24452356544852305, "learning_rate": 3.088153928942535e-06, "loss": 0.0215, "step": 5950 }, { "epoch": 1.9873100684588412, "grad_norm": 0.24877693440300758, "learning_rate": 3.0863584146972935e-06, "loss": 0.017, "step": 5951 }, { "epoch": 1.987644014025714, "grad_norm": 0.23985073167068344, "learning_rate": 3.084563189530146e-06, "loss": 0.0226, "step": 5952 }, { "epoch": 1.9879779595925864, "grad_norm": 0.2734363984573269, "learning_rate": 3.0827682537122817e-06, "loss": 0.0241, "step": 5953 }, { "epoch": 1.9883119051594589, "grad_norm": 0.38896925458025244, "learning_rate": 3.0809736075148456e-06, "loss": 0.0246, "step": 5954 }, { "epoch": 1.9886458507263316, "grad_norm": 0.3042865422394154, "learning_rate": 3.0791792512089443e-06, "loss": 0.0253, "step": 5955 }, { "epoch": 1.9889797962932043, "grad_norm": 0.28970310667020616, "learning_rate": 3.0773851850656335e-06, "loss": 0.0239, "step": 5956 }, { "epoch": 1.9893137418600768, "grad_norm": 0.2598842642261865, "learning_rate": 3.075591409355929e-06, "loss": 0.0254, "step": 5957 }, { "epoch": 1.9896476874269493, "grad_norm": 0.2377822154904266, "learning_rate": 3.073797924350801e-06, "loss": 0.0202, "step": 5958 }, { "epoch": 1.989981632993822, "grad_norm": 0.2333692680246371, "learning_rate": 3.0720047303211746e-06, "loss": 0.0192, "step": 5959 }, { "epoch": 1.9903155785606947, "grad_norm": 0.2423296384346233, "learning_rate": 3.0702118275379376e-06, "loss": 0.0278, "step": 5960 }, { "epoch": 1.9906495241275672, "grad_norm": 0.25947751515757067, "learning_rate": 3.0684192162719263e-06, "loss": 0.0201, "step": 5961 }, { "epoch": 1.9909834696944397, "grad_norm": 0.24743408871594888, "learning_rate": 3.066626896793936e-06, "loss": 0.0209, "step": 5962 }, { "epoch": 1.9913174152613125, "grad_norm": 0.26507318718394685, "learning_rate": 3.0648348693747177e-06, "loss": 0.0236, "step": 5963 }, { "epoch": 1.9916513608281852, "grad_norm": 0.3655272594041909, "learning_rate": 3.063043134284979e-06, "loss": 0.03, "step": 5964 }, { "epoch": 1.9919853063950574, "grad_norm": 0.20299432033076675, "learning_rate": 3.0612516917953783e-06, "loss": 0.0167, "step": 5965 }, { "epoch": 1.9923192519619302, "grad_norm": 0.21254003248625267, "learning_rate": 3.0594605421765406e-06, "loss": 0.0156, "step": 5966 }, { "epoch": 1.9926531975288029, "grad_norm": 0.32424159826192184, "learning_rate": 3.057669685699037e-06, "loss": 0.0309, "step": 5967 }, { "epoch": 1.9929871430956754, "grad_norm": 0.3552126929745393, "learning_rate": 3.0558791226333974e-06, "loss": 0.0164, "step": 5968 }, { "epoch": 1.9933210886625479, "grad_norm": 0.2805002505757546, "learning_rate": 3.0540888532501075e-06, "loss": 0.0235, "step": 5969 }, { "epoch": 1.9936550342294206, "grad_norm": 0.2822730332647638, "learning_rate": 3.052298877819608e-06, "loss": 0.0198, "step": 5970 }, { "epoch": 1.9939889797962933, "grad_norm": 0.33483079517361786, "learning_rate": 3.050509196612297e-06, "loss": 0.0265, "step": 5971 }, { "epoch": 1.9943229253631658, "grad_norm": 0.21161861271095728, "learning_rate": 3.0487198098985265e-06, "loss": 0.0155, "step": 5972 }, { "epoch": 1.9946568709300383, "grad_norm": 0.32276428423138714, "learning_rate": 3.046930717948604e-06, "loss": 0.0209, "step": 5973 }, { "epoch": 1.994990816496911, "grad_norm": 0.30491785406405486, "learning_rate": 3.0451419210327935e-06, "loss": 0.0148, "step": 5974 }, { "epoch": 1.9953247620637837, "grad_norm": 0.2608247963775136, "learning_rate": 3.0433534194213143e-06, "loss": 0.017, "step": 5975 }, { "epoch": 1.9956587076306562, "grad_norm": 0.254211017514349, "learning_rate": 3.0415652133843375e-06, "loss": 0.0194, "step": 5976 }, { "epoch": 1.9959926531975287, "grad_norm": 0.28198311333264253, "learning_rate": 3.0397773031919966e-06, "loss": 0.0243, "step": 5977 }, { "epoch": 1.9963265987644014, "grad_norm": 0.3038115591835136, "learning_rate": 3.0379896891143746e-06, "loss": 0.0241, "step": 5978 }, { "epoch": 1.9966605443312742, "grad_norm": 0.2950551580736191, "learning_rate": 3.036202371421513e-06, "loss": 0.0199, "step": 5979 }, { "epoch": 1.9969944898981467, "grad_norm": 0.4063860114054834, "learning_rate": 3.034415350383405e-06, "loss": 0.031, "step": 5980 }, { "epoch": 1.9973284354650191, "grad_norm": 0.2924471722384363, "learning_rate": 3.0326286262700035e-06, "loss": 0.0194, "step": 5981 }, { "epoch": 1.9976623810318919, "grad_norm": 0.24206756223897735, "learning_rate": 3.030842199351212e-06, "loss": 0.0163, "step": 5982 }, { "epoch": 1.9979963265987644, "grad_norm": 0.29146831920685096, "learning_rate": 3.0290560698968907e-06, "loss": 0.0238, "step": 5983 }, { "epoch": 1.9983302721656369, "grad_norm": 0.29091063864844646, "learning_rate": 3.0272702381768593e-06, "loss": 0.0226, "step": 5984 }, { "epoch": 1.9986642177325096, "grad_norm": 0.24201912088373048, "learning_rate": 3.0254847044608872e-06, "loss": 0.0186, "step": 5985 }, { "epoch": 1.9989981632993823, "grad_norm": 0.2498637660186078, "learning_rate": 3.0236994690186983e-06, "loss": 0.0182, "step": 5986 }, { "epoch": 1.9993321088662548, "grad_norm": 0.31008017670084925, "learning_rate": 3.0219145321199763e-06, "loss": 0.0215, "step": 5987 }, { "epoch": 1.9996660544331273, "grad_norm": 0.2321229794882306, "learning_rate": 3.0201298940343543e-06, "loss": 0.0164, "step": 5988 }, { "epoch": 2.0, "grad_norm": 0.31303179367197403, "learning_rate": 3.018345555031422e-06, "loss": 0.0276, "step": 5989 }, { "epoch": 2.0, "eval_loss": 0.02529776282608509, "eval_runtime": 180.2975, "eval_samples_per_second": 111.887, "eval_steps_per_second": 1.753, "step": 5989 }, { "epoch": 2.0003339455668727, "grad_norm": 0.15175628656848728, "learning_rate": 3.0165615153807293e-06, "loss": 0.0101, "step": 5990 }, { "epoch": 2.000667891133745, "grad_norm": 0.20080372094640112, "learning_rate": 3.014777775351774e-06, "loss": 0.0125, "step": 5991 }, { "epoch": 2.0010018367006177, "grad_norm": 0.19208222718758047, "learning_rate": 3.012994335214011e-06, "loss": 0.0122, "step": 5992 }, { "epoch": 2.0013357822674904, "grad_norm": 0.1494873384407777, "learning_rate": 3.0112111952368496e-06, "loss": 0.0105, "step": 5993 }, { "epoch": 2.001669727834363, "grad_norm": 0.2370525768330149, "learning_rate": 3.009428355689654e-06, "loss": 0.0214, "step": 5994 }, { "epoch": 2.0020036734012354, "grad_norm": 0.21197485891283063, "learning_rate": 3.007645816841743e-06, "loss": 0.0177, "step": 5995 }, { "epoch": 2.002337618968108, "grad_norm": 0.2595571537221928, "learning_rate": 3.0058635789623926e-06, "loss": 0.0151, "step": 5996 }, { "epoch": 2.002671564534981, "grad_norm": 0.22107331988782644, "learning_rate": 3.0040816423208276e-06, "loss": 0.0142, "step": 5997 }, { "epoch": 2.0030055101018536, "grad_norm": 0.21302220094074045, "learning_rate": 3.002300007186232e-06, "loss": 0.0157, "step": 5998 }, { "epoch": 2.003339455668726, "grad_norm": 0.2418773728097957, "learning_rate": 3.0005186738277407e-06, "loss": 0.0108, "step": 5999 }, { "epoch": 2.0036734012355986, "grad_norm": 0.24464390588700685, "learning_rate": 2.9987376425144477e-06, "loss": 0.0175, "step": 6000 }, { "epoch": 2.0040073468024713, "grad_norm": 0.23346157094483946, "learning_rate": 2.9969569135153985e-06, "loss": 0.0125, "step": 6001 }, { "epoch": 2.004341292369344, "grad_norm": 0.32085743175992615, "learning_rate": 2.9951764870995925e-06, "loss": 0.0238, "step": 6002 }, { "epoch": 2.0046752379362163, "grad_norm": 0.30667266904713103, "learning_rate": 2.9933963635359847e-06, "loss": 0.0309, "step": 6003 }, { "epoch": 2.005009183503089, "grad_norm": 0.25174422534977653, "learning_rate": 2.991616543093483e-06, "loss": 0.0203, "step": 6004 }, { "epoch": 2.0053431290699617, "grad_norm": 0.25210599501180714, "learning_rate": 2.9898370260409502e-06, "loss": 0.0136, "step": 6005 }, { "epoch": 2.005677074636834, "grad_norm": 0.32066276200037597, "learning_rate": 2.9880578126472015e-06, "loss": 0.0235, "step": 6006 }, { "epoch": 2.0060110202037067, "grad_norm": 0.29859901902839975, "learning_rate": 2.9862789031810126e-06, "loss": 0.0174, "step": 6007 }, { "epoch": 2.0063449657705794, "grad_norm": 0.2444167474205298, "learning_rate": 2.984500297911106e-06, "loss": 0.0135, "step": 6008 }, { "epoch": 2.006678911337452, "grad_norm": 0.24874680321922657, "learning_rate": 2.9827219971061607e-06, "loss": 0.0139, "step": 6009 }, { "epoch": 2.0070128569043244, "grad_norm": 0.2718681703918907, "learning_rate": 2.98094400103481e-06, "loss": 0.0153, "step": 6010 }, { "epoch": 2.007346802471197, "grad_norm": 0.243284484635126, "learning_rate": 2.9791663099656424e-06, "loss": 0.0126, "step": 6011 }, { "epoch": 2.00768074803807, "grad_norm": 0.3588516268580067, "learning_rate": 2.977388924167196e-06, "loss": 0.0181, "step": 6012 }, { "epoch": 2.0080146936049426, "grad_norm": 0.2603451593112769, "learning_rate": 2.975611843907971e-06, "loss": 0.0165, "step": 6013 }, { "epoch": 2.008348639171815, "grad_norm": 0.2297039277694502, "learning_rate": 2.9738350694564117e-06, "loss": 0.0087, "step": 6014 }, { "epoch": 2.0086825847386875, "grad_norm": 0.19118966348710978, "learning_rate": 2.9720586010809234e-06, "loss": 0.0103, "step": 6015 }, { "epoch": 2.0090165303055603, "grad_norm": 0.17734445888402203, "learning_rate": 2.9702824390498615e-06, "loss": 0.0091, "step": 6016 }, { "epoch": 2.009350475872433, "grad_norm": 0.29652862266656277, "learning_rate": 2.9685065836315362e-06, "loss": 0.0171, "step": 6017 }, { "epoch": 2.0096844214393053, "grad_norm": 0.2501294834147169, "learning_rate": 2.9667310350942103e-06, "loss": 0.0093, "step": 6018 }, { "epoch": 2.010018367006178, "grad_norm": 0.33681145379379857, "learning_rate": 2.964955793706104e-06, "loss": 0.0147, "step": 6019 }, { "epoch": 2.0103523125730507, "grad_norm": 0.24074715765915838, "learning_rate": 2.963180859735387e-06, "loss": 0.0117, "step": 6020 }, { "epoch": 2.0106862581399234, "grad_norm": 0.35974894103064475, "learning_rate": 2.961406233450184e-06, "loss": 0.0207, "step": 6021 }, { "epoch": 2.0110202037067957, "grad_norm": 0.3357420455626794, "learning_rate": 2.9596319151185713e-06, "loss": 0.0188, "step": 6022 }, { "epoch": 2.0113541492736684, "grad_norm": 0.3218706265057225, "learning_rate": 2.9578579050085836e-06, "loss": 0.0215, "step": 6023 }, { "epoch": 2.011688094840541, "grad_norm": 0.2745907153839172, "learning_rate": 2.956084203388204e-06, "loss": 0.0196, "step": 6024 }, { "epoch": 2.0120220404074134, "grad_norm": 0.21460785258188947, "learning_rate": 2.9543108105253733e-06, "loss": 0.0119, "step": 6025 }, { "epoch": 2.012355985974286, "grad_norm": 0.2323097710648883, "learning_rate": 2.9525377266879813e-06, "loss": 0.0121, "step": 6026 }, { "epoch": 2.012689931541159, "grad_norm": 0.2676166773319939, "learning_rate": 2.950764952143874e-06, "loss": 0.0162, "step": 6027 }, { "epoch": 2.0130238771080315, "grad_norm": 0.27274738950486416, "learning_rate": 2.9489924871608495e-06, "loss": 0.0137, "step": 6028 }, { "epoch": 2.013357822674904, "grad_norm": 0.3127813298403477, "learning_rate": 2.9472203320066594e-06, "loss": 0.0208, "step": 6029 }, { "epoch": 2.0136917682417765, "grad_norm": 0.32911083350257675, "learning_rate": 2.9454484869490074e-06, "loss": 0.0151, "step": 6030 }, { "epoch": 2.0140257138086493, "grad_norm": 0.3737748175194281, "learning_rate": 2.943676952255554e-06, "loss": 0.0208, "step": 6031 }, { "epoch": 2.014359659375522, "grad_norm": 0.3241492409129578, "learning_rate": 2.9419057281939106e-06, "loss": 0.0166, "step": 6032 }, { "epoch": 2.0146936049423942, "grad_norm": 0.2091162730671664, "learning_rate": 2.94013481503164e-06, "loss": 0.0105, "step": 6033 }, { "epoch": 2.015027550509267, "grad_norm": 0.2896212018597147, "learning_rate": 2.9383642130362596e-06, "loss": 0.0155, "step": 6034 }, { "epoch": 2.0153614960761397, "grad_norm": 0.2938511006885393, "learning_rate": 2.9365939224752394e-06, "loss": 0.0142, "step": 6035 }, { "epoch": 2.0156954416430124, "grad_norm": 0.3404838851864081, "learning_rate": 2.934823943616001e-06, "loss": 0.0163, "step": 6036 }, { "epoch": 2.0160293872098847, "grad_norm": 0.2968319342281456, "learning_rate": 2.933054276725925e-06, "loss": 0.0182, "step": 6037 }, { "epoch": 2.0163633327767574, "grad_norm": 0.3218646982495804, "learning_rate": 2.9312849220723382e-06, "loss": 0.0193, "step": 6038 }, { "epoch": 2.01669727834363, "grad_norm": 0.27731965017708854, "learning_rate": 2.929515879922522e-06, "loss": 0.0149, "step": 6039 }, { "epoch": 2.0170312239105024, "grad_norm": 0.29319446372010927, "learning_rate": 2.9277471505437105e-06, "loss": 0.0147, "step": 6040 }, { "epoch": 2.017365169477375, "grad_norm": 0.32453300186935935, "learning_rate": 2.925978734203092e-06, "loss": 0.0171, "step": 6041 }, { "epoch": 2.017699115044248, "grad_norm": 0.42720111743976336, "learning_rate": 2.924210631167807e-06, "loss": 0.0148, "step": 6042 }, { "epoch": 2.0180330606111205, "grad_norm": 0.35981122091536205, "learning_rate": 2.922442841704948e-06, "loss": 0.0231, "step": 6043 }, { "epoch": 2.018367006177993, "grad_norm": 0.2634298939888762, "learning_rate": 2.920675366081559e-06, "loss": 0.0103, "step": 6044 }, { "epoch": 2.0187009517448655, "grad_norm": 0.40644227124389015, "learning_rate": 2.9189082045646404e-06, "loss": 0.0218, "step": 6045 }, { "epoch": 2.0190348973117382, "grad_norm": 0.22790158180052494, "learning_rate": 2.9171413574211426e-06, "loss": 0.0157, "step": 6046 }, { "epoch": 2.019368842878611, "grad_norm": 0.24907090607140453, "learning_rate": 2.9153748249179637e-06, "loss": 0.0139, "step": 6047 }, { "epoch": 2.0197027884454832, "grad_norm": 0.1856542662559802, "learning_rate": 2.9136086073219665e-06, "loss": 0.01, "step": 6048 }, { "epoch": 2.020036734012356, "grad_norm": 0.2218450590776673, "learning_rate": 2.9118427048999544e-06, "loss": 0.0122, "step": 6049 }, { "epoch": 2.0203706795792287, "grad_norm": 0.33827557376306444, "learning_rate": 2.9100771179186904e-06, "loss": 0.0177, "step": 6050 }, { "epoch": 2.0207046251461014, "grad_norm": 0.24309427342943882, "learning_rate": 2.9083118466448845e-06, "loss": 0.0137, "step": 6051 }, { "epoch": 2.0210385707129737, "grad_norm": 0.2970030949584663, "learning_rate": 2.9065468913452045e-06, "loss": 0.0157, "step": 6052 }, { "epoch": 2.0213725162798464, "grad_norm": 0.293592731613864, "learning_rate": 2.904782252286264e-06, "loss": 0.0164, "step": 6053 }, { "epoch": 2.021706461846719, "grad_norm": 0.2411401378571078, "learning_rate": 2.903017929734635e-06, "loss": 0.011, "step": 6054 }, { "epoch": 2.0220404074135914, "grad_norm": 0.34656115399225423, "learning_rate": 2.9012539239568405e-06, "loss": 0.0185, "step": 6055 }, { "epoch": 2.022374352980464, "grad_norm": 0.2545646721008136, "learning_rate": 2.899490235219351e-06, "loss": 0.0113, "step": 6056 }, { "epoch": 2.022708298547337, "grad_norm": 0.1859773868262544, "learning_rate": 2.897726863788595e-06, "loss": 0.0108, "step": 6057 }, { "epoch": 2.0230422441142095, "grad_norm": 0.23682701564000458, "learning_rate": 2.8959638099309504e-06, "loss": 0.0139, "step": 6058 }, { "epoch": 2.023376189681082, "grad_norm": 0.2757830867378058, "learning_rate": 2.8942010739127446e-06, "loss": 0.0158, "step": 6059 }, { "epoch": 2.0237101352479545, "grad_norm": 0.3019110217368406, "learning_rate": 2.8924386560002627e-06, "loss": 0.0148, "step": 6060 }, { "epoch": 2.0240440808148272, "grad_norm": 0.27416394262098626, "learning_rate": 2.8906765564597384e-06, "loss": 0.0103, "step": 6061 }, { "epoch": 2.0243780263817, "grad_norm": 0.2877330047605113, "learning_rate": 2.8889147755573556e-06, "loss": 0.0115, "step": 6062 }, { "epoch": 2.024711971948572, "grad_norm": 0.40638905717884094, "learning_rate": 2.8871533135592544e-06, "loss": 0.0279, "step": 6063 }, { "epoch": 2.025045917515445, "grad_norm": 0.22426651913039228, "learning_rate": 2.8853921707315215e-06, "loss": 0.0132, "step": 6064 }, { "epoch": 2.0253798630823177, "grad_norm": 0.307370164224585, "learning_rate": 2.8836313473402e-06, "loss": 0.0225, "step": 6065 }, { "epoch": 2.0257138086491904, "grad_norm": 0.22825809003354988, "learning_rate": 2.881870843651282e-06, "loss": 0.0103, "step": 6066 }, { "epoch": 2.0260477542160626, "grad_norm": 0.22814925295941907, "learning_rate": 2.8801106599307164e-06, "loss": 0.0129, "step": 6067 }, { "epoch": 2.0263816997829354, "grad_norm": 0.3003877625096707, "learning_rate": 2.8783507964443942e-06, "loss": 0.0147, "step": 6068 }, { "epoch": 2.026715645349808, "grad_norm": 0.31922608946006525, "learning_rate": 2.8765912534581674e-06, "loss": 0.0219, "step": 6069 }, { "epoch": 2.027049590916681, "grad_norm": 0.23767329459980238, "learning_rate": 2.874832031237833e-06, "loss": 0.0113, "step": 6070 }, { "epoch": 2.027383536483553, "grad_norm": 0.2996470845907607, "learning_rate": 2.873073130049142e-06, "loss": 0.0106, "step": 6071 }, { "epoch": 2.027717482050426, "grad_norm": 0.2946807365764898, "learning_rate": 2.8713145501578e-06, "loss": 0.0142, "step": 6072 }, { "epoch": 2.0280514276172985, "grad_norm": 0.4829768510277467, "learning_rate": 2.869556291829461e-06, "loss": 0.0135, "step": 6073 }, { "epoch": 2.028385373184171, "grad_norm": 0.20999719678359777, "learning_rate": 2.8677983553297266e-06, "loss": 0.0087, "step": 6074 }, { "epoch": 2.0287193187510435, "grad_norm": 0.290353405698059, "learning_rate": 2.8660407409241593e-06, "loss": 0.0113, "step": 6075 }, { "epoch": 2.029053264317916, "grad_norm": 0.4619981297584356, "learning_rate": 2.864283448878262e-06, "loss": 0.0212, "step": 6076 }, { "epoch": 2.029387209884789, "grad_norm": 0.2796731561001804, "learning_rate": 2.8625264794574975e-06, "loss": 0.0129, "step": 6077 }, { "epoch": 2.029721155451661, "grad_norm": 0.32994452425599774, "learning_rate": 2.860769832927276e-06, "loss": 0.0177, "step": 6078 }, { "epoch": 2.030055101018534, "grad_norm": 0.30679330425306117, "learning_rate": 2.8590135095529624e-06, "loss": 0.0272, "step": 6079 }, { "epoch": 2.0303890465854066, "grad_norm": 0.22030996569126765, "learning_rate": 2.8572575095998646e-06, "loss": 0.0108, "step": 6080 }, { "epoch": 2.0307229921522794, "grad_norm": 0.21127749541792287, "learning_rate": 2.855501833333253e-06, "loss": 0.0099, "step": 6081 }, { "epoch": 2.0310569377191516, "grad_norm": 0.24058478118174959, "learning_rate": 2.853746481018337e-06, "loss": 0.0105, "step": 6082 }, { "epoch": 2.0313908832860244, "grad_norm": 0.32205879831670353, "learning_rate": 2.8519914529202868e-06, "loss": 0.0211, "step": 6083 }, { "epoch": 2.031724828852897, "grad_norm": 0.2832939774790598, "learning_rate": 2.8502367493042217e-06, "loss": 0.0197, "step": 6084 }, { "epoch": 2.03205877441977, "grad_norm": 0.26575117464371134, "learning_rate": 2.848482370435206e-06, "loss": 0.0163, "step": 6085 }, { "epoch": 2.032392719986642, "grad_norm": 0.3258080129190906, "learning_rate": 2.8467283165782643e-06, "loss": 0.0215, "step": 6086 }, { "epoch": 2.0327266655535148, "grad_norm": 0.47511153250582916, "learning_rate": 2.8449745879983614e-06, "loss": 0.0181, "step": 6087 }, { "epoch": 2.0330606111203875, "grad_norm": 0.2476053327934346, "learning_rate": 2.8432211849604218e-06, "loss": 0.0202, "step": 6088 }, { "epoch": 2.0333945566872598, "grad_norm": 0.2783084602439737, "learning_rate": 2.841468107729318e-06, "loss": 0.0156, "step": 6089 }, { "epoch": 2.0337285022541325, "grad_norm": 0.3357539801022698, "learning_rate": 2.8397153565698744e-06, "loss": 0.0175, "step": 6090 }, { "epoch": 2.034062447821005, "grad_norm": 0.352764606623764, "learning_rate": 2.8379629317468604e-06, "loss": 0.0226, "step": 6091 }, { "epoch": 2.034396393387878, "grad_norm": 0.22486127145038035, "learning_rate": 2.8362108335250044e-06, "loss": 0.0094, "step": 6092 }, { "epoch": 2.03473033895475, "grad_norm": 0.33740378604902393, "learning_rate": 2.834459062168978e-06, "loss": 0.0234, "step": 6093 }, { "epoch": 2.035064284521623, "grad_norm": 0.2876460462444672, "learning_rate": 2.8327076179434088e-06, "loss": 0.0142, "step": 6094 }, { "epoch": 2.0353982300884956, "grad_norm": 0.24471925855800045, "learning_rate": 2.8309565011128732e-06, "loss": 0.0147, "step": 6095 }, { "epoch": 2.0357321756553683, "grad_norm": 0.297454363053485, "learning_rate": 2.8292057119418994e-06, "loss": 0.012, "step": 6096 }, { "epoch": 2.0360661212222406, "grad_norm": 0.2657319967797067, "learning_rate": 2.827455250694961e-06, "loss": 0.0143, "step": 6097 }, { "epoch": 2.0364000667891133, "grad_norm": 0.279516415732953, "learning_rate": 2.8257051176364903e-06, "loss": 0.0113, "step": 6098 }, { "epoch": 2.036734012355986, "grad_norm": 0.4482745624442629, "learning_rate": 2.8239553130308604e-06, "loss": 0.0326, "step": 6099 }, { "epoch": 2.0370679579228588, "grad_norm": 0.27664131408412507, "learning_rate": 2.8222058371424033e-06, "loss": 0.0113, "step": 6100 }, { "epoch": 2.037401903489731, "grad_norm": 0.35872564722376515, "learning_rate": 2.820456690235397e-06, "loss": 0.0214, "step": 6101 }, { "epoch": 2.0377358490566038, "grad_norm": 0.3491645170001635, "learning_rate": 2.8187078725740723e-06, "loss": 0.0155, "step": 6102 }, { "epoch": 2.0380697946234765, "grad_norm": 0.2861994696551622, "learning_rate": 2.8169593844226063e-06, "loss": 0.0176, "step": 6103 }, { "epoch": 2.0384037401903488, "grad_norm": 0.24515443776860424, "learning_rate": 2.815211226045131e-06, "loss": 0.0132, "step": 6104 }, { "epoch": 2.0387376857572215, "grad_norm": 0.2921236535017713, "learning_rate": 2.8134633977057236e-06, "loss": 0.0192, "step": 6105 }, { "epoch": 2.039071631324094, "grad_norm": 0.2680276689556573, "learning_rate": 2.811715899668415e-06, "loss": 0.0154, "step": 6106 }, { "epoch": 2.039405576890967, "grad_norm": 0.2536113930376566, "learning_rate": 2.8099687321971887e-06, "loss": 0.0116, "step": 6107 }, { "epoch": 2.039739522457839, "grad_norm": 0.23374677415265926, "learning_rate": 2.80822189555597e-06, "loss": 0.0106, "step": 6108 }, { "epoch": 2.040073468024712, "grad_norm": 0.2467280502889194, "learning_rate": 2.8064753900086427e-06, "loss": 0.0132, "step": 6109 }, { "epoch": 2.0404074135915846, "grad_norm": 0.3602627966972387, "learning_rate": 2.804729215819034e-06, "loss": 0.03, "step": 6110 }, { "epoch": 2.0407413591584573, "grad_norm": 0.27184514001034055, "learning_rate": 2.8029833732509282e-06, "loss": 0.0139, "step": 6111 }, { "epoch": 2.0410753047253296, "grad_norm": 0.2669730051001783, "learning_rate": 2.801237862568048e-06, "loss": 0.0134, "step": 6112 }, { "epoch": 2.0414092502922023, "grad_norm": 0.5003174822212786, "learning_rate": 2.799492684034083e-06, "loss": 0.0293, "step": 6113 }, { "epoch": 2.041743195859075, "grad_norm": 0.33582503522969365, "learning_rate": 2.797747837912656e-06, "loss": 0.0172, "step": 6114 }, { "epoch": 2.0420771414259478, "grad_norm": 0.2970612700629547, "learning_rate": 2.796003324467351e-06, "loss": 0.018, "step": 6115 }, { "epoch": 2.04241108699282, "grad_norm": 0.293969635600175, "learning_rate": 2.794259143961693e-06, "loss": 0.0139, "step": 6116 }, { "epoch": 2.0427450325596928, "grad_norm": 0.28897709967415713, "learning_rate": 2.7925152966591627e-06, "loss": 0.0181, "step": 6117 }, { "epoch": 2.0430789781265655, "grad_norm": 0.23328926665983554, "learning_rate": 2.7907717828231893e-06, "loss": 0.0107, "step": 6118 }, { "epoch": 2.043412923693438, "grad_norm": 0.3653229302778942, "learning_rate": 2.7890286027171532e-06, "loss": 0.0097, "step": 6119 }, { "epoch": 2.0437468692603105, "grad_norm": 0.2549791164964824, "learning_rate": 2.7872857566043775e-06, "loss": 0.0137, "step": 6120 }, { "epoch": 2.044080814827183, "grad_norm": 0.3701554878191902, "learning_rate": 2.7855432447481444e-06, "loss": 0.0197, "step": 6121 }, { "epoch": 2.044414760394056, "grad_norm": 0.4363622609730641, "learning_rate": 2.7838010674116767e-06, "loss": 0.0261, "step": 6122 }, { "epoch": 2.044748705960928, "grad_norm": 0.2544040344814649, "learning_rate": 2.7820592248581523e-06, "loss": 0.0092, "step": 6123 }, { "epoch": 2.045082651527801, "grad_norm": 0.3189030079520338, "learning_rate": 2.780317717350697e-06, "loss": 0.0184, "step": 6124 }, { "epoch": 2.0454165970946736, "grad_norm": 0.26266961940353706, "learning_rate": 2.7785765451523896e-06, "loss": 0.0141, "step": 6125 }, { "epoch": 2.0457505426615463, "grad_norm": 0.28467830927450516, "learning_rate": 2.7768357085262486e-06, "loss": 0.0133, "step": 6126 }, { "epoch": 2.0460844882284186, "grad_norm": 0.29069874970425696, "learning_rate": 2.7750952077352534e-06, "loss": 0.015, "step": 6127 }, { "epoch": 2.0464184337952913, "grad_norm": 0.27639028866434623, "learning_rate": 2.7733550430423216e-06, "loss": 0.0194, "step": 6128 }, { "epoch": 2.046752379362164, "grad_norm": 0.24647568749153773, "learning_rate": 2.7716152147103292e-06, "loss": 0.0133, "step": 6129 }, { "epoch": 2.0470863249290367, "grad_norm": 0.21392067621442906, "learning_rate": 2.7698757230020986e-06, "loss": 0.0107, "step": 6130 }, { "epoch": 2.047420270495909, "grad_norm": 0.28322222234106154, "learning_rate": 2.7681365681803967e-06, "loss": 0.0155, "step": 6131 }, { "epoch": 2.0477542160627817, "grad_norm": 0.29557830115031114, "learning_rate": 2.7663977505079483e-06, "loss": 0.0173, "step": 6132 }, { "epoch": 2.0480881616296545, "grad_norm": 0.292441068650384, "learning_rate": 2.764659270247417e-06, "loss": 0.0216, "step": 6133 }, { "epoch": 2.048422107196527, "grad_norm": 0.3001107000648257, "learning_rate": 2.7629211276614255e-06, "loss": 0.0152, "step": 6134 }, { "epoch": 2.0487560527633994, "grad_norm": 0.30063661297242134, "learning_rate": 2.761183323012534e-06, "loss": 0.0134, "step": 6135 }, { "epoch": 2.049089998330272, "grad_norm": 0.23132167995057779, "learning_rate": 2.7594458565632664e-06, "loss": 0.0125, "step": 6136 }, { "epoch": 2.049423943897145, "grad_norm": 0.30096447248142183, "learning_rate": 2.757708728576083e-06, "loss": 0.0144, "step": 6137 }, { "epoch": 2.049757889464017, "grad_norm": 0.237418866501332, "learning_rate": 2.7559719393133987e-06, "loss": 0.011, "step": 6138 }, { "epoch": 2.05009183503089, "grad_norm": 0.24139322555604806, "learning_rate": 2.754235489037575e-06, "loss": 0.0129, "step": 6139 }, { "epoch": 2.0504257805977626, "grad_norm": 0.23864369974629612, "learning_rate": 2.7524993780109254e-06, "loss": 0.0113, "step": 6140 }, { "epoch": 2.0507597261646353, "grad_norm": 0.31222063886359813, "learning_rate": 2.750763606495704e-06, "loss": 0.0136, "step": 6141 }, { "epoch": 2.0510936717315076, "grad_norm": 0.3838453344596363, "learning_rate": 2.7490281747541276e-06, "loss": 0.0293, "step": 6142 }, { "epoch": 2.0514276172983803, "grad_norm": 0.300159067963588, "learning_rate": 2.747293083048348e-06, "loss": 0.0176, "step": 6143 }, { "epoch": 2.051761562865253, "grad_norm": 0.33554352526393266, "learning_rate": 2.7455583316404744e-06, "loss": 0.017, "step": 6144 }, { "epoch": 2.0520955084321257, "grad_norm": 0.32068925701366413, "learning_rate": 2.743823920792559e-06, "loss": 0.0152, "step": 6145 }, { "epoch": 2.052429453998998, "grad_norm": 0.2834314103614763, "learning_rate": 2.742089850766607e-06, "loss": 0.0204, "step": 6146 }, { "epoch": 2.0527633995658707, "grad_norm": 0.3034670465553409, "learning_rate": 2.7403561218245654e-06, "loss": 0.0178, "step": 6147 }, { "epoch": 2.0530973451327434, "grad_norm": 0.3080500108252758, "learning_rate": 2.7386227342283423e-06, "loss": 0.0199, "step": 6148 }, { "epoch": 2.053431290699616, "grad_norm": 0.3562626710488752, "learning_rate": 2.73688968823978e-06, "loss": 0.0159, "step": 6149 }, { "epoch": 2.0537652362664884, "grad_norm": 0.20246524699762142, "learning_rate": 2.7351569841206792e-06, "loss": 0.0112, "step": 6150 }, { "epoch": 2.054099181833361, "grad_norm": 0.35429621418538865, "learning_rate": 2.733424622132782e-06, "loss": 0.0211, "step": 6151 }, { "epoch": 2.054433127400234, "grad_norm": 0.24907192378489898, "learning_rate": 2.7316926025377855e-06, "loss": 0.0147, "step": 6152 }, { "epoch": 2.054767072967106, "grad_norm": 0.2848960654848761, "learning_rate": 2.729960925597328e-06, "loss": 0.0178, "step": 6153 }, { "epoch": 2.055101018533979, "grad_norm": 0.22352736192834902, "learning_rate": 2.7282295915730016e-06, "loss": 0.0107, "step": 6154 }, { "epoch": 2.0554349641008516, "grad_norm": 0.29154379201592695, "learning_rate": 2.726498600726346e-06, "loss": 0.0144, "step": 6155 }, { "epoch": 2.0557689096677243, "grad_norm": 0.24102221843832142, "learning_rate": 2.7247679533188446e-06, "loss": 0.012, "step": 6156 }, { "epoch": 2.0561028552345966, "grad_norm": 0.2936137109883559, "learning_rate": 2.723037649611936e-06, "loss": 0.0148, "step": 6157 }, { "epoch": 2.0564368008014693, "grad_norm": 0.32490189824501264, "learning_rate": 2.721307689866997e-06, "loss": 0.0189, "step": 6158 }, { "epoch": 2.056770746368342, "grad_norm": 0.2786962057928275, "learning_rate": 2.719578074345366e-06, "loss": 0.0165, "step": 6159 }, { "epoch": 2.0571046919352147, "grad_norm": 0.29844488405350844, "learning_rate": 2.7178488033083163e-06, "loss": 0.0165, "step": 6160 }, { "epoch": 2.057438637502087, "grad_norm": 0.39936642719643084, "learning_rate": 2.7161198770170784e-06, "loss": 0.0193, "step": 6161 }, { "epoch": 2.0577725830689597, "grad_norm": 0.30338747450520953, "learning_rate": 2.714391295732822e-06, "loss": 0.0137, "step": 6162 }, { "epoch": 2.0581065286358324, "grad_norm": 0.5142800910557641, "learning_rate": 2.712663059716675e-06, "loss": 0.0138, "step": 6163 }, { "epoch": 2.058440474202705, "grad_norm": 0.27738714768212364, "learning_rate": 2.7109351692297015e-06, "loss": 0.0156, "step": 6164 }, { "epoch": 2.0587744197695774, "grad_norm": 0.2553756722956166, "learning_rate": 2.7092076245329273e-06, "loss": 0.0109, "step": 6165 }, { "epoch": 2.05910836533645, "grad_norm": 0.2558447520215222, "learning_rate": 2.7074804258873127e-06, "loss": 0.0113, "step": 6166 }, { "epoch": 2.059442310903323, "grad_norm": 0.3399173166704573, "learning_rate": 2.7057535735537754e-06, "loss": 0.0174, "step": 6167 }, { "epoch": 2.0597762564701956, "grad_norm": 0.31623765510457286, "learning_rate": 2.704027067793173e-06, "loss": 0.0161, "step": 6168 }, { "epoch": 2.060110202037068, "grad_norm": 0.2733253699048302, "learning_rate": 2.7023009088663176e-06, "loss": 0.0111, "step": 6169 }, { "epoch": 2.0604441476039406, "grad_norm": 0.3468734264286785, "learning_rate": 2.7005750970339607e-06, "loss": 0.0149, "step": 6170 }, { "epoch": 2.0607780931708133, "grad_norm": 0.34320281938735064, "learning_rate": 2.698849632556815e-06, "loss": 0.019, "step": 6171 }, { "epoch": 2.0611120387376856, "grad_norm": 0.2604572316624649, "learning_rate": 2.697124515695524e-06, "loss": 0.0173, "step": 6172 }, { "epoch": 2.0614459843045583, "grad_norm": 0.2774091219209041, "learning_rate": 2.695399746710693e-06, "loss": 0.022, "step": 6173 }, { "epoch": 2.061779929871431, "grad_norm": 0.310753394931392, "learning_rate": 2.6936753258628643e-06, "loss": 0.0208, "step": 6174 }, { "epoch": 2.0621138754383037, "grad_norm": 0.30587375121149146, "learning_rate": 2.691951253412536e-06, "loss": 0.015, "step": 6175 }, { "epoch": 2.062447821005176, "grad_norm": 0.20310024699670445, "learning_rate": 2.6902275296201445e-06, "loss": 0.0089, "step": 6176 }, { "epoch": 2.0627817665720487, "grad_norm": 0.23136331185968897, "learning_rate": 2.688504154746082e-06, "loss": 0.0102, "step": 6177 }, { "epoch": 2.0631157121389214, "grad_norm": 0.2088692663882883, "learning_rate": 2.686781129050685e-06, "loss": 0.0113, "step": 6178 }, { "epoch": 2.063449657705794, "grad_norm": 0.27160561119880056, "learning_rate": 2.685058452794235e-06, "loss": 0.0119, "step": 6179 }, { "epoch": 2.0637836032726664, "grad_norm": 0.28719766913607175, "learning_rate": 2.6833361262369644e-06, "loss": 0.0133, "step": 6180 }, { "epoch": 2.064117548839539, "grad_norm": 0.29712817431083016, "learning_rate": 2.681614149639048e-06, "loss": 0.0293, "step": 6181 }, { "epoch": 2.064451494406412, "grad_norm": 0.2586448746195902, "learning_rate": 2.679892523260612e-06, "loss": 0.0138, "step": 6182 }, { "epoch": 2.0647854399732846, "grad_norm": 0.278751060322034, "learning_rate": 2.6781712473617293e-06, "loss": 0.0161, "step": 6183 }, { "epoch": 2.065119385540157, "grad_norm": 0.3270286494815345, "learning_rate": 2.6764503222024202e-06, "loss": 0.0158, "step": 6184 }, { "epoch": 2.0654533311070296, "grad_norm": 0.28789205731344475, "learning_rate": 2.674729748042647e-06, "loss": 0.0135, "step": 6185 }, { "epoch": 2.0657872766739023, "grad_norm": 0.2645692272440605, "learning_rate": 2.673009525142326e-06, "loss": 0.0135, "step": 6186 }, { "epoch": 2.0661212222407745, "grad_norm": 0.2732618694162732, "learning_rate": 2.6712896537613143e-06, "loss": 0.0134, "step": 6187 }, { "epoch": 2.0664551678076473, "grad_norm": 0.21235744945635984, "learning_rate": 2.6695701341594193e-06, "loss": 0.0113, "step": 6188 }, { "epoch": 2.06678911337452, "grad_norm": 0.257867563067866, "learning_rate": 2.667850966596396e-06, "loss": 0.0113, "step": 6189 }, { "epoch": 2.0671230589413927, "grad_norm": 0.24490635522610477, "learning_rate": 2.6661321513319467e-06, "loss": 0.0116, "step": 6190 }, { "epoch": 2.067457004508265, "grad_norm": 0.2792789359976981, "learning_rate": 2.6644136886257138e-06, "loss": 0.0148, "step": 6191 }, { "epoch": 2.0677909500751377, "grad_norm": 0.28195978320173876, "learning_rate": 2.6626955787372962e-06, "loss": 0.0114, "step": 6192 }, { "epoch": 2.0681248956420104, "grad_norm": 0.3160367855950243, "learning_rate": 2.6609778219262296e-06, "loss": 0.0126, "step": 6193 }, { "epoch": 2.068458841208883, "grad_norm": 0.22697057263532353, "learning_rate": 2.659260418452005e-06, "loss": 0.011, "step": 6194 }, { "epoch": 2.0687927867757554, "grad_norm": 0.258081658283097, "learning_rate": 2.6575433685740547e-06, "loss": 0.0112, "step": 6195 }, { "epoch": 2.069126732342628, "grad_norm": 0.2660646367791906, "learning_rate": 2.655826672551762e-06, "loss": 0.0109, "step": 6196 }, { "epoch": 2.069460677909501, "grad_norm": 0.34897654305382064, "learning_rate": 2.6541103306444516e-06, "loss": 0.014, "step": 6197 }, { "epoch": 2.0697946234763736, "grad_norm": 0.29879647705197826, "learning_rate": 2.6523943431113985e-06, "loss": 0.0155, "step": 6198 }, { "epoch": 2.070128569043246, "grad_norm": 0.33255267726468113, "learning_rate": 2.6506787102118204e-06, "loss": 0.0136, "step": 6199 }, { "epoch": 2.0704625146101185, "grad_norm": 0.24807825968469902, "learning_rate": 2.6489634322048853e-06, "loss": 0.0098, "step": 6200 }, { "epoch": 2.0707964601769913, "grad_norm": 0.2806514649688107, "learning_rate": 2.647248509349708e-06, "loss": 0.011, "step": 6201 }, { "epoch": 2.0711304057438635, "grad_norm": 0.4061888496602113, "learning_rate": 2.645533941905345e-06, "loss": 0.0188, "step": 6202 }, { "epoch": 2.0714643513107363, "grad_norm": 0.27149238375049944, "learning_rate": 2.6438197301308045e-06, "loss": 0.0132, "step": 6203 }, { "epoch": 2.071798296877609, "grad_norm": 0.3471557875876584, "learning_rate": 2.6421058742850346e-06, "loss": 0.0172, "step": 6204 }, { "epoch": 2.0721322424444817, "grad_norm": 0.2785416139500432, "learning_rate": 2.6403923746269368e-06, "loss": 0.0138, "step": 6205 }, { "epoch": 2.072466188011354, "grad_norm": 0.31617376730516916, "learning_rate": 2.638679231415353e-06, "loss": 0.0103, "step": 6206 }, { "epoch": 2.0728001335782267, "grad_norm": 0.233890516100944, "learning_rate": 2.636966444909077e-06, "loss": 0.0109, "step": 6207 }, { "epoch": 2.0731340791450994, "grad_norm": 0.22880810709401206, "learning_rate": 2.635254015366842e-06, "loss": 0.0131, "step": 6208 }, { "epoch": 2.073468024711972, "grad_norm": 0.2688531281020386, "learning_rate": 2.633541943047334e-06, "loss": 0.0146, "step": 6209 }, { "epoch": 2.0738019702788444, "grad_norm": 0.2998383308147042, "learning_rate": 2.6318302282091772e-06, "loss": 0.0128, "step": 6210 }, { "epoch": 2.074135915845717, "grad_norm": 0.3200518557729673, "learning_rate": 2.6301188711109494e-06, "loss": 0.0155, "step": 6211 }, { "epoch": 2.07446986141259, "grad_norm": 0.2882625256509772, "learning_rate": 2.6284078720111693e-06, "loss": 0.0168, "step": 6212 }, { "epoch": 2.0748038069794625, "grad_norm": 0.3810363900456062, "learning_rate": 2.626697231168308e-06, "loss": 0.0183, "step": 6213 }, { "epoch": 2.075137752546335, "grad_norm": 0.2600797779262035, "learning_rate": 2.624986948840772e-06, "loss": 0.0129, "step": 6214 }, { "epoch": 2.0754716981132075, "grad_norm": 0.23901841608496618, "learning_rate": 2.6232770252869243e-06, "loss": 0.0108, "step": 6215 }, { "epoch": 2.0758056436800802, "grad_norm": 0.2895261670531142, "learning_rate": 2.6215674607650653e-06, "loss": 0.0161, "step": 6216 }, { "epoch": 2.076139589246953, "grad_norm": 0.24797933486687493, "learning_rate": 2.619858255533446e-06, "loss": 0.0134, "step": 6217 }, { "epoch": 2.0764735348138252, "grad_norm": 0.23863762950528655, "learning_rate": 2.6181494098502626e-06, "loss": 0.0132, "step": 6218 }, { "epoch": 2.076807480380698, "grad_norm": 0.23554487880100103, "learning_rate": 2.616440923973659e-06, "loss": 0.01, "step": 6219 }, { "epoch": 2.0771414259475707, "grad_norm": 0.2629196257938369, "learning_rate": 2.6147327981617167e-06, "loss": 0.0109, "step": 6220 }, { "epoch": 2.077475371514443, "grad_norm": 0.30347136443509615, "learning_rate": 2.613025032672472e-06, "loss": 0.0156, "step": 6221 }, { "epoch": 2.0778093170813157, "grad_norm": 0.2758700539876662, "learning_rate": 2.611317627763901e-06, "loss": 0.0165, "step": 6222 }, { "epoch": 2.0781432626481884, "grad_norm": 0.25343059122580197, "learning_rate": 2.609610583693928e-06, "loss": 0.0138, "step": 6223 }, { "epoch": 2.078477208215061, "grad_norm": 0.28907889847773965, "learning_rate": 2.6079039007204238e-06, "loss": 0.0137, "step": 6224 }, { "epoch": 2.0788111537819334, "grad_norm": 0.3238433865568032, "learning_rate": 2.6061975791011996e-06, "loss": 0.0173, "step": 6225 }, { "epoch": 2.079145099348806, "grad_norm": 0.2417912605480169, "learning_rate": 2.6044916190940194e-06, "loss": 0.0142, "step": 6226 }, { "epoch": 2.079479044915679, "grad_norm": 0.25117331307032686, "learning_rate": 2.6027860209565835e-06, "loss": 0.0148, "step": 6227 }, { "epoch": 2.0798129904825515, "grad_norm": 0.3077747308192545, "learning_rate": 2.6010807849465468e-06, "loss": 0.0172, "step": 6228 }, { "epoch": 2.080146936049424, "grad_norm": 0.29674813218974383, "learning_rate": 2.5993759113215032e-06, "loss": 0.0137, "step": 6229 }, { "epoch": 2.0804808816162965, "grad_norm": 0.20110443136718312, "learning_rate": 2.5976714003389963e-06, "loss": 0.0111, "step": 6230 }, { "epoch": 2.0808148271831692, "grad_norm": 0.298389559232403, "learning_rate": 2.5959672522565095e-06, "loss": 0.0173, "step": 6231 }, { "epoch": 2.081148772750042, "grad_norm": 0.5126092442675864, "learning_rate": 2.594263467331477e-06, "loss": 0.021, "step": 6232 }, { "epoch": 2.0814827183169142, "grad_norm": 0.32916226434549356, "learning_rate": 2.592560045821273e-06, "loss": 0.0196, "step": 6233 }, { "epoch": 2.081816663883787, "grad_norm": 0.2980573368158267, "learning_rate": 2.5908569879832223e-06, "loss": 0.0144, "step": 6234 }, { "epoch": 2.0821506094506597, "grad_norm": 0.28157395797472956, "learning_rate": 2.5891542940745873e-06, "loss": 0.0162, "step": 6235 }, { "epoch": 2.082484555017532, "grad_norm": 0.3448861362716233, "learning_rate": 2.5874519643525864e-06, "loss": 0.0178, "step": 6236 }, { "epoch": 2.0828185005844047, "grad_norm": 0.26130095063103065, "learning_rate": 2.5857499990743706e-06, "loss": 0.013, "step": 6237 }, { "epoch": 2.0831524461512774, "grad_norm": 0.320555994239304, "learning_rate": 2.584048398497047e-06, "loss": 0.0208, "step": 6238 }, { "epoch": 2.08348639171815, "grad_norm": 0.29398743811544803, "learning_rate": 2.5823471628776574e-06, "loss": 0.0117, "step": 6239 }, { "epoch": 2.0838203372850224, "grad_norm": 0.263872652087751, "learning_rate": 2.5806462924731955e-06, "loss": 0.0122, "step": 6240 }, { "epoch": 2.084154282851895, "grad_norm": 0.28444258494075586, "learning_rate": 2.5789457875405986e-06, "loss": 0.0201, "step": 6241 }, { "epoch": 2.084488228418768, "grad_norm": 0.29910679769368453, "learning_rate": 2.57724564833675e-06, "loss": 0.0196, "step": 6242 }, { "epoch": 2.0848221739856405, "grad_norm": 0.27112794160906495, "learning_rate": 2.5755458751184705e-06, "loss": 0.0188, "step": 6243 }, { "epoch": 2.085156119552513, "grad_norm": 0.3007399325807635, "learning_rate": 2.5738464681425356e-06, "loss": 0.0139, "step": 6244 }, { "epoch": 2.0854900651193855, "grad_norm": 0.3825229399760879, "learning_rate": 2.5721474276656566e-06, "loss": 0.0218, "step": 6245 }, { "epoch": 2.0858240106862582, "grad_norm": 0.30070767227458534, "learning_rate": 2.5704487539444956e-06, "loss": 0.0153, "step": 6246 }, { "epoch": 2.086157956253131, "grad_norm": 0.4106733638952333, "learning_rate": 2.5687504472356596e-06, "loss": 0.0176, "step": 6247 }, { "epoch": 2.086491901820003, "grad_norm": 0.2881483708610056, "learning_rate": 2.5670525077956944e-06, "loss": 0.0143, "step": 6248 }, { "epoch": 2.086825847386876, "grad_norm": 0.26086768861553566, "learning_rate": 2.5653549358810957e-06, "loss": 0.0114, "step": 6249 }, { "epoch": 2.0871597929537486, "grad_norm": 0.4159783982692846, "learning_rate": 2.563657731748299e-06, "loss": 0.0176, "step": 6250 }, { "epoch": 2.087493738520621, "grad_norm": 0.24033871419824848, "learning_rate": 2.5619608956536895e-06, "loss": 0.0156, "step": 6251 }, { "epoch": 2.0878276840874936, "grad_norm": 0.3776534290505859, "learning_rate": 2.5602644278535937e-06, "loss": 0.0273, "step": 6252 }, { "epoch": 2.0881616296543664, "grad_norm": 0.29359583761742586, "learning_rate": 2.558568328604285e-06, "loss": 0.0152, "step": 6253 }, { "epoch": 2.088495575221239, "grad_norm": 0.28565380949311164, "learning_rate": 2.5568725981619747e-06, "loss": 0.0193, "step": 6254 }, { "epoch": 2.0888295207881113, "grad_norm": 0.4426396794338646, "learning_rate": 2.5551772367828276e-06, "loss": 0.0424, "step": 6255 }, { "epoch": 2.089163466354984, "grad_norm": 0.21743482548014179, "learning_rate": 2.5534822447229436e-06, "loss": 0.0105, "step": 6256 }, { "epoch": 2.089497411921857, "grad_norm": 0.2923398831037656, "learning_rate": 2.551787622238376e-06, "loss": 0.0131, "step": 6257 }, { "epoch": 2.0898313574887295, "grad_norm": 0.3110990186071121, "learning_rate": 2.5500933695851104e-06, "loss": 0.0157, "step": 6258 }, { "epoch": 2.0901653030556018, "grad_norm": 0.32442775102682153, "learning_rate": 2.548399487019092e-06, "loss": 0.0157, "step": 6259 }, { "epoch": 2.0904992486224745, "grad_norm": 0.3482995484791825, "learning_rate": 2.5467059747961953e-06, "loss": 0.0134, "step": 6260 }, { "epoch": 2.090833194189347, "grad_norm": 0.3089071994523559, "learning_rate": 2.54501283317225e-06, "loss": 0.016, "step": 6261 }, { "epoch": 2.09116713975622, "grad_norm": 0.24004573560699569, "learning_rate": 2.5433200624030212e-06, "loss": 0.0107, "step": 6262 }, { "epoch": 2.091501085323092, "grad_norm": 0.3106056910743025, "learning_rate": 2.541627662744225e-06, "loss": 0.0133, "step": 6263 }, { "epoch": 2.091835030889965, "grad_norm": 0.28718205108796635, "learning_rate": 2.5399356344515138e-06, "loss": 0.0159, "step": 6264 }, { "epoch": 2.0921689764568376, "grad_norm": 0.2687950827191494, "learning_rate": 2.538243977780494e-06, "loss": 0.0116, "step": 6265 }, { "epoch": 2.0925029220237104, "grad_norm": 0.32488093758238024, "learning_rate": 2.5365526929867056e-06, "loss": 0.0161, "step": 6266 }, { "epoch": 2.0928368675905826, "grad_norm": 0.4031736692518948, "learning_rate": 2.534861780325642e-06, "loss": 0.0153, "step": 6267 }, { "epoch": 2.0931708131574553, "grad_norm": 0.2373644499881437, "learning_rate": 2.53317124005273e-06, "loss": 0.0099, "step": 6268 }, { "epoch": 2.093504758724328, "grad_norm": 0.2597092551565546, "learning_rate": 2.5314810724233502e-06, "loss": 0.0107, "step": 6269 }, { "epoch": 2.0938387042912003, "grad_norm": 0.36860569074985583, "learning_rate": 2.529791277692818e-06, "loss": 0.0175, "step": 6270 }, { "epoch": 2.094172649858073, "grad_norm": 0.26038967520554, "learning_rate": 2.5281018561163996e-06, "loss": 0.0134, "step": 6271 }, { "epoch": 2.0945065954249458, "grad_norm": 0.2659192944639581, "learning_rate": 2.5264128079493033e-06, "loss": 0.0144, "step": 6272 }, { "epoch": 2.0948405409918185, "grad_norm": 0.29276916811500125, "learning_rate": 2.524724133446676e-06, "loss": 0.0155, "step": 6273 }, { "epoch": 2.0951744865586908, "grad_norm": 0.2795195143239489, "learning_rate": 2.523035832863614e-06, "loss": 0.0118, "step": 6274 }, { "epoch": 2.0955084321255635, "grad_norm": 0.21419630636848647, "learning_rate": 2.521347906455154e-06, "loss": 0.0087, "step": 6275 }, { "epoch": 2.095842377692436, "grad_norm": 0.4287695267783562, "learning_rate": 2.5196603544762804e-06, "loss": 0.0228, "step": 6276 }, { "epoch": 2.096176323259309, "grad_norm": 0.24742579163324185, "learning_rate": 2.5179731771819133e-06, "loss": 0.0114, "step": 6277 }, { "epoch": 2.096510268826181, "grad_norm": 0.280917963843559, "learning_rate": 2.5162863748269247e-06, "loss": 0.0177, "step": 6278 }, { "epoch": 2.096844214393054, "grad_norm": 0.37891106724615875, "learning_rate": 2.514599947666122e-06, "loss": 0.0206, "step": 6279 }, { "epoch": 2.0971781599599266, "grad_norm": 0.35690210579383713, "learning_rate": 2.5129138959542633e-06, "loss": 0.0201, "step": 6280 }, { "epoch": 2.0975121055267993, "grad_norm": 0.27909643901718395, "learning_rate": 2.5112282199460415e-06, "loss": 0.0178, "step": 6281 }, { "epoch": 2.0978460510936716, "grad_norm": 0.4217844694011119, "learning_rate": 2.5095429198961056e-06, "loss": 0.0228, "step": 6282 }, { "epoch": 2.0981799966605443, "grad_norm": 0.687734506847529, "learning_rate": 2.507857996059034e-06, "loss": 0.0286, "step": 6283 }, { "epoch": 2.098513942227417, "grad_norm": 0.5163752703538171, "learning_rate": 2.5061734486893574e-06, "loss": 0.0168, "step": 6284 }, { "epoch": 2.0988478877942893, "grad_norm": 0.26744004222956186, "learning_rate": 2.504489278041544e-06, "loss": 0.0149, "step": 6285 }, { "epoch": 2.099181833361162, "grad_norm": 0.2885805123873711, "learning_rate": 2.5028054843700102e-06, "loss": 0.0154, "step": 6286 }, { "epoch": 2.0995157789280348, "grad_norm": 0.2313607874220595, "learning_rate": 2.501122067929108e-06, "loss": 0.0107, "step": 6287 }, { "epoch": 2.0998497244949075, "grad_norm": 0.20117095925556797, "learning_rate": 2.4994390289731446e-06, "loss": 0.011, "step": 6288 }, { "epoch": 2.1001836700617798, "grad_norm": 0.38636228293728364, "learning_rate": 2.497756367756357e-06, "loss": 0.0169, "step": 6289 }, { "epoch": 2.1005176156286525, "grad_norm": 0.26307109841840925, "learning_rate": 2.496074084532935e-06, "loss": 0.0151, "step": 6290 }, { "epoch": 2.100851561195525, "grad_norm": 0.27202503480199225, "learning_rate": 2.4943921795570033e-06, "loss": 0.0134, "step": 6291 }, { "epoch": 2.101185506762398, "grad_norm": 0.27228359902656274, "learning_rate": 2.4927106530826372e-06, "loss": 0.0179, "step": 6292 }, { "epoch": 2.10151945232927, "grad_norm": 0.2976352902965977, "learning_rate": 2.491029505363848e-06, "loss": 0.0139, "step": 6293 }, { "epoch": 2.101853397896143, "grad_norm": 0.2543321242263, "learning_rate": 2.489348736654593e-06, "loss": 0.0136, "step": 6294 }, { "epoch": 2.1021873434630156, "grad_norm": 0.2196231601619742, "learning_rate": 2.4876683472087767e-06, "loss": 0.0085, "step": 6295 }, { "epoch": 2.1025212890298883, "grad_norm": 0.23987061103317567, "learning_rate": 2.4859883372802357e-06, "loss": 0.0136, "step": 6296 }, { "epoch": 2.1028552345967606, "grad_norm": 0.19884545816103985, "learning_rate": 2.484308707122758e-06, "loss": 0.0101, "step": 6297 }, { "epoch": 2.1031891801636333, "grad_norm": 0.383114758245921, "learning_rate": 2.4826294569900725e-06, "loss": 0.0165, "step": 6298 }, { "epoch": 2.103523125730506, "grad_norm": 0.33389410315398704, "learning_rate": 2.4809505871358476e-06, "loss": 0.019, "step": 6299 }, { "epoch": 2.1038570712973783, "grad_norm": 0.29862187139083723, "learning_rate": 2.4792720978136967e-06, "loss": 0.0162, "step": 6300 }, { "epoch": 2.104191016864251, "grad_norm": 0.21961769644919973, "learning_rate": 2.4775939892771787e-06, "loss": 0.012, "step": 6301 }, { "epoch": 2.1045249624311237, "grad_norm": 0.30846576518311836, "learning_rate": 2.4759162617797873e-06, "loss": 0.02, "step": 6302 }, { "epoch": 2.1048589079979965, "grad_norm": 0.3291334958589151, "learning_rate": 2.4742389155749657e-06, "loss": 0.0187, "step": 6303 }, { "epoch": 2.1051928535648687, "grad_norm": 0.28348334818111015, "learning_rate": 2.472561950916094e-06, "loss": 0.0122, "step": 6304 }, { "epoch": 2.1055267991317415, "grad_norm": 0.2821980081552556, "learning_rate": 2.4708853680565e-06, "loss": 0.0151, "step": 6305 }, { "epoch": 2.105860744698614, "grad_norm": 0.22092780482715804, "learning_rate": 2.4692091672494494e-06, "loss": 0.0095, "step": 6306 }, { "epoch": 2.106194690265487, "grad_norm": 0.34163952014772775, "learning_rate": 2.4675333487481558e-06, "loss": 0.0148, "step": 6307 }, { "epoch": 2.106528635832359, "grad_norm": 0.23487466372973265, "learning_rate": 2.4658579128057665e-06, "loss": 0.0147, "step": 6308 }, { "epoch": 2.106862581399232, "grad_norm": 0.309473110287223, "learning_rate": 2.4641828596753803e-06, "loss": 0.0174, "step": 6309 }, { "epoch": 2.1071965269661046, "grad_norm": 0.23957152064386736, "learning_rate": 2.4625081896100294e-06, "loss": 0.0147, "step": 6310 }, { "epoch": 2.1075304725329773, "grad_norm": 0.2356402669385959, "learning_rate": 2.4608339028626943e-06, "loss": 0.0185, "step": 6311 }, { "epoch": 2.1078644180998496, "grad_norm": 0.2478075683890819, "learning_rate": 2.4591599996862957e-06, "loss": 0.0114, "step": 6312 }, { "epoch": 2.1081983636667223, "grad_norm": 0.3299389202861783, "learning_rate": 2.457486480333699e-06, "loss": 0.0208, "step": 6313 }, { "epoch": 2.108532309233595, "grad_norm": 0.32861519914876286, "learning_rate": 2.4558133450577044e-06, "loss": 0.0184, "step": 6314 }, { "epoch": 2.1088662548004677, "grad_norm": 0.3236467368213756, "learning_rate": 2.4541405941110626e-06, "loss": 0.0199, "step": 6315 }, { "epoch": 2.10920020036734, "grad_norm": 0.4048303066970319, "learning_rate": 2.452468227746459e-06, "loss": 0.0119, "step": 6316 }, { "epoch": 2.1095341459342127, "grad_norm": 0.34363621322258253, "learning_rate": 2.4507962462165254e-06, "loss": 0.0145, "step": 6317 }, { "epoch": 2.1098680915010855, "grad_norm": 0.2583666199765121, "learning_rate": 2.449124649773835e-06, "loss": 0.0129, "step": 6318 }, { "epoch": 2.1102020370679577, "grad_norm": 0.39906251268362747, "learning_rate": 2.4474534386709036e-06, "loss": 0.0192, "step": 6319 }, { "epoch": 2.1105359826348304, "grad_norm": 0.30957618565823813, "learning_rate": 2.4457826131601835e-06, "loss": 0.0147, "step": 6320 }, { "epoch": 2.110869928201703, "grad_norm": 0.37197451281109395, "learning_rate": 2.444112173494077e-06, "loss": 0.0206, "step": 6321 }, { "epoch": 2.111203873768576, "grad_norm": 0.2476699885184769, "learning_rate": 2.4424421199249194e-06, "loss": 0.0118, "step": 6322 }, { "epoch": 2.111537819335448, "grad_norm": 0.28164011124237065, "learning_rate": 2.440772452704993e-06, "loss": 0.0202, "step": 6323 }, { "epoch": 2.111871764902321, "grad_norm": 0.2985031021074347, "learning_rate": 2.4391031720865246e-06, "loss": 0.0191, "step": 6324 }, { "epoch": 2.1122057104691936, "grad_norm": 0.28205015046644444, "learning_rate": 2.4374342783216732e-06, "loss": 0.0122, "step": 6325 }, { "epoch": 2.1125396560360663, "grad_norm": 0.28397498027623147, "learning_rate": 2.435765771662549e-06, "loss": 0.0157, "step": 6326 }, { "epoch": 2.1128736016029386, "grad_norm": 0.2527821376524227, "learning_rate": 2.4340976523611957e-06, "loss": 0.0154, "step": 6327 }, { "epoch": 2.1132075471698113, "grad_norm": 0.2657558328790342, "learning_rate": 2.4324299206696057e-06, "loss": 0.0121, "step": 6328 }, { "epoch": 2.113541492736684, "grad_norm": 0.328007150050808, "learning_rate": 2.4307625768397077e-06, "loss": 0.0196, "step": 6329 }, { "epoch": 2.1138754383035567, "grad_norm": 0.2794516305190381, "learning_rate": 2.4290956211233757e-06, "loss": 0.0129, "step": 6330 }, { "epoch": 2.114209383870429, "grad_norm": 0.2403463497273294, "learning_rate": 2.42742905377242e-06, "loss": 0.0089, "step": 6331 }, { "epoch": 2.1145433294373017, "grad_norm": 0.32149971714143255, "learning_rate": 2.4257628750385987e-06, "loss": 0.0176, "step": 6332 }, { "epoch": 2.1148772750041744, "grad_norm": 0.3159822559410141, "learning_rate": 2.424097085173604e-06, "loss": 0.0148, "step": 6333 }, { "epoch": 2.1152112205710467, "grad_norm": 0.3292482716731008, "learning_rate": 2.4224316844290747e-06, "loss": 0.0167, "step": 6334 }, { "epoch": 2.1155451661379194, "grad_norm": 0.3620125389575101, "learning_rate": 2.4207666730565893e-06, "loss": 0.0178, "step": 6335 }, { "epoch": 2.115879111704792, "grad_norm": 0.30644156127925465, "learning_rate": 2.4191020513076697e-06, "loss": 0.0142, "step": 6336 }, { "epoch": 2.116213057271665, "grad_norm": 0.2866973652860801, "learning_rate": 2.4174378194337715e-06, "loss": 0.0131, "step": 6337 }, { "epoch": 2.116547002838537, "grad_norm": 0.3570314974441156, "learning_rate": 2.4157739776863023e-06, "loss": 0.0186, "step": 6338 }, { "epoch": 2.11688094840541, "grad_norm": 0.29606147566208474, "learning_rate": 2.4141105263166e-06, "loss": 0.0128, "step": 6339 }, { "epoch": 2.1172148939722826, "grad_norm": 0.33670647754402017, "learning_rate": 2.41244746557595e-06, "loss": 0.0202, "step": 6340 }, { "epoch": 2.1175488395391553, "grad_norm": 0.2600737581622472, "learning_rate": 2.4107847957155784e-06, "loss": 0.0139, "step": 6341 }, { "epoch": 2.1178827851060276, "grad_norm": 0.2520569871419981, "learning_rate": 2.409122516986652e-06, "loss": 0.0133, "step": 6342 }, { "epoch": 2.1182167306729003, "grad_norm": 0.2549206631474349, "learning_rate": 2.4074606296402735e-06, "loss": 0.0123, "step": 6343 }, { "epoch": 2.118550676239773, "grad_norm": 0.3133443756263147, "learning_rate": 2.405799133927496e-06, "loss": 0.0168, "step": 6344 }, { "epoch": 2.1188846218066457, "grad_norm": 0.36328037869246643, "learning_rate": 2.404138030099303e-06, "loss": 0.0163, "step": 6345 }, { "epoch": 2.119218567373518, "grad_norm": 0.24759314519351655, "learning_rate": 2.4024773184066253e-06, "loss": 0.0154, "step": 6346 }, { "epoch": 2.1195525129403907, "grad_norm": 0.27322584324323756, "learning_rate": 2.4008169991003356e-06, "loss": 0.0136, "step": 6347 }, { "epoch": 2.1198864585072634, "grad_norm": 0.23665414390601153, "learning_rate": 2.3991570724312405e-06, "loss": 0.0146, "step": 6348 }, { "epoch": 2.1202204040741357, "grad_norm": 0.27824772245001744, "learning_rate": 2.3974975386500958e-06, "loss": 0.0163, "step": 6349 }, { "epoch": 2.1205543496410084, "grad_norm": 0.2579284454433024, "learning_rate": 2.3958383980075896e-06, "loss": 0.0123, "step": 6350 }, { "epoch": 2.120888295207881, "grad_norm": 0.2927224426767084, "learning_rate": 2.394179650754358e-06, "loss": 0.0138, "step": 6351 }, { "epoch": 2.121222240774754, "grad_norm": 0.28411440366395213, "learning_rate": 2.3925212971409688e-06, "loss": 0.0166, "step": 6352 }, { "epoch": 2.121556186341626, "grad_norm": 0.20964275273940322, "learning_rate": 2.3908633374179436e-06, "loss": 0.0089, "step": 6353 }, { "epoch": 2.121890131908499, "grad_norm": 0.2808166456045462, "learning_rate": 2.3892057718357308e-06, "loss": 0.0146, "step": 6354 }, { "epoch": 2.1222240774753716, "grad_norm": 0.22847341818999706, "learning_rate": 2.3875486006447294e-06, "loss": 0.0117, "step": 6355 }, { "epoch": 2.1225580230422443, "grad_norm": 0.2654764079191746, "learning_rate": 2.3858918240952703e-06, "loss": 0.0098, "step": 6356 }, { "epoch": 2.1228919686091166, "grad_norm": 0.2741049101290846, "learning_rate": 2.384235442437632e-06, "loss": 0.0132, "step": 6357 }, { "epoch": 2.1232259141759893, "grad_norm": 0.2961895766669102, "learning_rate": 2.3825794559220296e-06, "loss": 0.0194, "step": 6358 }, { "epoch": 2.123559859742862, "grad_norm": 0.36911682734464085, "learning_rate": 2.380923864798621e-06, "loss": 0.0266, "step": 6359 }, { "epoch": 2.1238938053097347, "grad_norm": 0.20844802443615157, "learning_rate": 2.3792686693174993e-06, "loss": 0.0106, "step": 6360 }, { "epoch": 2.124227750876607, "grad_norm": 0.27269892682133406, "learning_rate": 2.3776138697287055e-06, "loss": 0.0145, "step": 6361 }, { "epoch": 2.1245616964434797, "grad_norm": 0.26181954181026257, "learning_rate": 2.3759594662822122e-06, "loss": 0.0129, "step": 6362 }, { "epoch": 2.1248956420103524, "grad_norm": 0.27356183681183144, "learning_rate": 2.3743054592279386e-06, "loss": 0.0119, "step": 6363 }, { "epoch": 2.125229587577225, "grad_norm": 0.2732613382505249, "learning_rate": 2.372651848815742e-06, "loss": 0.0144, "step": 6364 }, { "epoch": 2.1255635331440974, "grad_norm": 0.17167852700380207, "learning_rate": 2.370998635295421e-06, "loss": 0.0074, "step": 6365 }, { "epoch": 2.12589747871097, "grad_norm": 0.27158385445851063, "learning_rate": 2.3693458189167106e-06, "loss": 0.0161, "step": 6366 }, { "epoch": 2.126231424277843, "grad_norm": 0.32176653140968703, "learning_rate": 2.3676933999292905e-06, "loss": 0.0135, "step": 6367 }, { "epoch": 2.126565369844715, "grad_norm": 0.3311250748765162, "learning_rate": 2.366041378582775e-06, "loss": 0.018, "step": 6368 }, { "epoch": 2.126899315411588, "grad_norm": 0.24458509726088312, "learning_rate": 2.364389755126723e-06, "loss": 0.0101, "step": 6369 }, { "epoch": 2.1272332609784605, "grad_norm": 0.2083650726094157, "learning_rate": 2.3627385298106344e-06, "loss": 0.0094, "step": 6370 }, { "epoch": 2.1275672065453333, "grad_norm": 0.3471440551333678, "learning_rate": 2.361087702883941e-06, "loss": 0.0127, "step": 6371 }, { "epoch": 2.1279011521122055, "grad_norm": 0.26878538303276184, "learning_rate": 2.359437274596024e-06, "loss": 0.0117, "step": 6372 }, { "epoch": 2.1282350976790783, "grad_norm": 0.24044339642405563, "learning_rate": 2.357787245196197e-06, "loss": 0.0128, "step": 6373 }, { "epoch": 2.128569043245951, "grad_norm": 0.1791980061910617, "learning_rate": 2.3561376149337188e-06, "loss": 0.0052, "step": 6374 }, { "epoch": 2.1289029888128237, "grad_norm": 0.2726359222798816, "learning_rate": 2.3544883840577815e-06, "loss": 0.0139, "step": 6375 }, { "epoch": 2.129236934379696, "grad_norm": 0.31427581655705467, "learning_rate": 2.352839552817527e-06, "loss": 0.0174, "step": 6376 }, { "epoch": 2.1295708799465687, "grad_norm": 0.31105706909069314, "learning_rate": 2.3511911214620255e-06, "loss": 0.0116, "step": 6377 }, { "epoch": 2.1299048255134414, "grad_norm": 0.3165157283981129, "learning_rate": 2.3495430902402956e-06, "loss": 0.0225, "step": 6378 }, { "epoch": 2.1302387710803137, "grad_norm": 0.2708561870368086, "learning_rate": 2.3478954594012884e-06, "loss": 0.0099, "step": 6379 }, { "epoch": 2.1305727166471864, "grad_norm": 0.2567672621447335, "learning_rate": 2.346248229193901e-06, "loss": 0.0139, "step": 6380 }, { "epoch": 2.130906662214059, "grad_norm": 0.36063235869416643, "learning_rate": 2.344601399866962e-06, "loss": 0.0222, "step": 6381 }, { "epoch": 2.131240607780932, "grad_norm": 0.2289741706220592, "learning_rate": 2.342954971669252e-06, "loss": 0.0139, "step": 6382 }, { "epoch": 2.131574553347804, "grad_norm": 0.3242083045253181, "learning_rate": 2.341308944849477e-06, "loss": 0.0127, "step": 6383 }, { "epoch": 2.131908498914677, "grad_norm": 0.24755677452308103, "learning_rate": 2.3396633196562924e-06, "loss": 0.0111, "step": 6384 }, { "epoch": 2.1322424444815495, "grad_norm": 0.35538912735353595, "learning_rate": 2.3380180963382866e-06, "loss": 0.0194, "step": 6385 }, { "epoch": 2.1325763900484223, "grad_norm": 0.333269934452208, "learning_rate": 2.3363732751439926e-06, "loss": 0.0152, "step": 6386 }, { "epoch": 2.1329103356152945, "grad_norm": 0.38942370033643875, "learning_rate": 2.334728856321875e-06, "loss": 0.0165, "step": 6387 }, { "epoch": 2.1332442811821672, "grad_norm": 0.3414507275124361, "learning_rate": 2.33308484012035e-06, "loss": 0.0178, "step": 6388 }, { "epoch": 2.13357822674904, "grad_norm": 0.2320738052381853, "learning_rate": 2.33144122678776e-06, "loss": 0.0099, "step": 6389 }, { "epoch": 2.1339121723159127, "grad_norm": 0.2678591322809289, "learning_rate": 2.3297980165723953e-06, "loss": 0.0108, "step": 6390 }, { "epoch": 2.134246117882785, "grad_norm": 0.3449058952106492, "learning_rate": 2.3281552097224798e-06, "loss": 0.0173, "step": 6391 }, { "epoch": 2.1345800634496577, "grad_norm": 0.3138892815959436, "learning_rate": 2.326512806486181e-06, "loss": 0.0149, "step": 6392 }, { "epoch": 2.1349140090165304, "grad_norm": 0.21716274667209393, "learning_rate": 2.3248708071116005e-06, "loss": 0.0125, "step": 6393 }, { "epoch": 2.135247954583403, "grad_norm": 0.23095461058696024, "learning_rate": 2.323229211846783e-06, "loss": 0.0108, "step": 6394 }, { "epoch": 2.1355819001502754, "grad_norm": 0.27261593654845534, "learning_rate": 2.3215880209397133e-06, "loss": 0.0147, "step": 6395 }, { "epoch": 2.135915845717148, "grad_norm": 0.3127963098104018, "learning_rate": 2.319947234638308e-06, "loss": 0.0144, "step": 6396 }, { "epoch": 2.136249791284021, "grad_norm": 0.3241815094646612, "learning_rate": 2.3183068531904317e-06, "loss": 0.0116, "step": 6397 }, { "epoch": 2.136583736850893, "grad_norm": 0.23018954007064094, "learning_rate": 2.3166668768438772e-06, "loss": 0.0104, "step": 6398 }, { "epoch": 2.136917682417766, "grad_norm": 0.27751962651823514, "learning_rate": 2.31502730584639e-06, "loss": 0.0158, "step": 6399 }, { "epoch": 2.1372516279846385, "grad_norm": 0.2480879720731281, "learning_rate": 2.313388140445641e-06, "loss": 0.0123, "step": 6400 }, { "epoch": 2.1375855735515112, "grad_norm": 0.23526793343388888, "learning_rate": 2.311749380889249e-06, "loss": 0.0157, "step": 6401 }, { "epoch": 2.1379195191183835, "grad_norm": 0.44094618802044216, "learning_rate": 2.310111027424764e-06, "loss": 0.0255, "step": 6402 }, { "epoch": 2.1382534646852562, "grad_norm": 0.26614479700607774, "learning_rate": 2.308473080299683e-06, "loss": 0.0138, "step": 6403 }, { "epoch": 2.138587410252129, "grad_norm": 0.25853358473002913, "learning_rate": 2.3068355397614313e-06, "loss": 0.0166, "step": 6404 }, { "epoch": 2.1389213558190017, "grad_norm": 0.3646570931116901, "learning_rate": 2.3051984060573855e-06, "loss": 0.0156, "step": 6405 }, { "epoch": 2.139255301385874, "grad_norm": 0.2844331645715239, "learning_rate": 2.303561679434849e-06, "loss": 0.0152, "step": 6406 }, { "epoch": 2.1395892469527467, "grad_norm": 0.2704412909914307, "learning_rate": 2.3019253601410725e-06, "loss": 0.0125, "step": 6407 }, { "epoch": 2.1399231925196194, "grad_norm": 0.26389364475661037, "learning_rate": 2.300289448423237e-06, "loss": 0.0142, "step": 6408 }, { "epoch": 2.140257138086492, "grad_norm": 0.26890366044211367, "learning_rate": 2.2986539445284705e-06, "loss": 0.0127, "step": 6409 }, { "epoch": 2.1405910836533644, "grad_norm": 0.29937362645021015, "learning_rate": 2.2970188487038293e-06, "loss": 0.0138, "step": 6410 }, { "epoch": 2.140925029220237, "grad_norm": 0.46247979897770325, "learning_rate": 2.295384161196321e-06, "loss": 0.0169, "step": 6411 }, { "epoch": 2.14125897478711, "grad_norm": 0.21502341753824644, "learning_rate": 2.293749882252879e-06, "loss": 0.0125, "step": 6412 }, { "epoch": 2.1415929203539825, "grad_norm": 0.22272698105449634, "learning_rate": 2.2921160121203847e-06, "loss": 0.0118, "step": 6413 }, { "epoch": 2.141926865920855, "grad_norm": 0.2998770049748982, "learning_rate": 2.290482551045649e-06, "loss": 0.0177, "step": 6414 }, { "epoch": 2.1422608114877275, "grad_norm": 0.23166158608170034, "learning_rate": 2.2888494992754294e-06, "loss": 0.0105, "step": 6415 }, { "epoch": 2.1425947570546002, "grad_norm": 0.2593065690196099, "learning_rate": 2.2872168570564136e-06, "loss": 0.0142, "step": 6416 }, { "epoch": 2.1429287026214725, "grad_norm": 0.22886712443986532, "learning_rate": 2.2855846246352335e-06, "loss": 0.0101, "step": 6417 }, { "epoch": 2.143262648188345, "grad_norm": 0.27142361160843476, "learning_rate": 2.2839528022584596e-06, "loss": 0.0158, "step": 6418 }, { "epoch": 2.143596593755218, "grad_norm": 0.3377025436421419, "learning_rate": 2.2823213901725927e-06, "loss": 0.0219, "step": 6419 }, { "epoch": 2.1439305393220907, "grad_norm": 0.32254654670742733, "learning_rate": 2.2806903886240815e-06, "loss": 0.0219, "step": 6420 }, { "epoch": 2.144264484888963, "grad_norm": 0.2978003362475645, "learning_rate": 2.2790597978593044e-06, "loss": 0.0161, "step": 6421 }, { "epoch": 2.1445984304558356, "grad_norm": 0.4167027119901389, "learning_rate": 2.2774296181245825e-06, "loss": 0.027, "step": 6422 }, { "epoch": 2.1449323760227084, "grad_norm": 0.31401897722913386, "learning_rate": 2.275799849666174e-06, "loss": 0.0151, "step": 6423 }, { "epoch": 2.145266321589581, "grad_norm": 0.34065054419097557, "learning_rate": 2.274170492730277e-06, "loss": 0.0209, "step": 6424 }, { "epoch": 2.1456002671564534, "grad_norm": 0.45491689300781946, "learning_rate": 2.27254154756302e-06, "loss": 0.0291, "step": 6425 }, { "epoch": 2.145934212723326, "grad_norm": 0.3007007719494336, "learning_rate": 2.2709130144104795e-06, "loss": 0.0106, "step": 6426 }, { "epoch": 2.146268158290199, "grad_norm": 0.19476559482408035, "learning_rate": 2.26928489351866e-06, "loss": 0.008, "step": 6427 }, { "epoch": 2.146602103857071, "grad_norm": 0.3470246119824881, "learning_rate": 2.267657185133511e-06, "loss": 0.0156, "step": 6428 }, { "epoch": 2.146936049423944, "grad_norm": 0.22884004535190874, "learning_rate": 2.2660298895009157e-06, "loss": 0.0101, "step": 6429 }, { "epoch": 2.1472699949908165, "grad_norm": 0.3585614088561408, "learning_rate": 2.2644030068666993e-06, "loss": 0.0214, "step": 6430 }, { "epoch": 2.147603940557689, "grad_norm": 0.43823519765365065, "learning_rate": 2.2627765374766175e-06, "loss": 0.0155, "step": 6431 }, { "epoch": 2.1479378861245615, "grad_norm": 0.3609804258874382, "learning_rate": 2.2611504815763715e-06, "loss": 0.0129, "step": 6432 }, { "epoch": 2.148271831691434, "grad_norm": 0.30559656784704303, "learning_rate": 2.259524839411592e-06, "loss": 0.0134, "step": 6433 }, { "epoch": 2.148605777258307, "grad_norm": 0.26901653891642513, "learning_rate": 2.2578996112278535e-06, "loss": 0.0133, "step": 6434 }, { "epoch": 2.1489397228251796, "grad_norm": 0.35426557485104243, "learning_rate": 2.2562747972706663e-06, "loss": 0.0215, "step": 6435 }, { "epoch": 2.149273668392052, "grad_norm": 0.35077812604510644, "learning_rate": 2.254650397785479e-06, "loss": 0.0172, "step": 6436 }, { "epoch": 2.1496076139589246, "grad_norm": 0.3718090048434478, "learning_rate": 2.253026413017672e-06, "loss": 0.0187, "step": 6437 }, { "epoch": 2.1499415595257974, "grad_norm": 0.23086826190060963, "learning_rate": 2.2514028432125722e-06, "loss": 0.0094, "step": 6438 }, { "epoch": 2.15027550509267, "grad_norm": 0.3515387319496888, "learning_rate": 2.249779688615435e-06, "loss": 0.0219, "step": 6439 }, { "epoch": 2.1506094506595423, "grad_norm": 0.24022766804051457, "learning_rate": 2.248156949471459e-06, "loss": 0.0162, "step": 6440 }, { "epoch": 2.150943396226415, "grad_norm": 0.3259619450139486, "learning_rate": 2.2465346260257786e-06, "loss": 0.0322, "step": 6441 }, { "epoch": 2.151277341793288, "grad_norm": 0.27871676383828087, "learning_rate": 2.2449127185234626e-06, "loss": 0.0165, "step": 6442 }, { "epoch": 2.1516112873601605, "grad_norm": 0.47230038262915325, "learning_rate": 2.2432912272095227e-06, "loss": 0.0228, "step": 6443 }, { "epoch": 2.1519452329270328, "grad_norm": 0.2745384003892179, "learning_rate": 2.2416701523288997e-06, "loss": 0.0171, "step": 6444 }, { "epoch": 2.1522791784939055, "grad_norm": 0.19520006217044095, "learning_rate": 2.240049494126479e-06, "loss": 0.0123, "step": 6445 }, { "epoch": 2.152613124060778, "grad_norm": 0.2682638297511614, "learning_rate": 2.238429252847079e-06, "loss": 0.0193, "step": 6446 }, { "epoch": 2.1529470696276505, "grad_norm": 0.2798517279219314, "learning_rate": 2.2368094287354586e-06, "loss": 0.0128, "step": 6447 }, { "epoch": 2.153281015194523, "grad_norm": 0.29017966020638897, "learning_rate": 2.2351900220363083e-06, "loss": 0.0167, "step": 6448 }, { "epoch": 2.153614960761396, "grad_norm": 0.24358722672839853, "learning_rate": 2.2335710329942613e-06, "loss": 0.0135, "step": 6449 }, { "epoch": 2.1539489063282686, "grad_norm": 0.27452795615277803, "learning_rate": 2.2319524618538814e-06, "loss": 0.0115, "step": 6450 }, { "epoch": 2.154282851895141, "grad_norm": 0.32600071812298304, "learning_rate": 2.2303343088596753e-06, "loss": 0.0174, "step": 6451 }, { "epoch": 2.1546167974620136, "grad_norm": 0.2860901905940594, "learning_rate": 2.2287165742560828e-06, "loss": 0.0175, "step": 6452 }, { "epoch": 2.1549507430288863, "grad_norm": 0.27689664455515633, "learning_rate": 2.227099258287485e-06, "loss": 0.0187, "step": 6453 }, { "epoch": 2.155284688595759, "grad_norm": 0.36139584950542614, "learning_rate": 2.2254823611981926e-06, "loss": 0.0261, "step": 6454 }, { "epoch": 2.1556186341626313, "grad_norm": 0.27794608830824197, "learning_rate": 2.2238658832324593e-06, "loss": 0.0134, "step": 6455 }, { "epoch": 2.155952579729504, "grad_norm": 0.2172174704558338, "learning_rate": 2.222249824634471e-06, "loss": 0.0116, "step": 6456 }, { "epoch": 2.1562865252963768, "grad_norm": 0.39581746911098087, "learning_rate": 2.220634185648354e-06, "loss": 0.0237, "step": 6457 }, { "epoch": 2.1566204708632495, "grad_norm": 0.2841949761379651, "learning_rate": 2.2190189665181684e-06, "loss": 0.0154, "step": 6458 }, { "epoch": 2.1569544164301218, "grad_norm": 0.2331623300770666, "learning_rate": 2.2174041674879152e-06, "loss": 0.0119, "step": 6459 }, { "epoch": 2.1572883619969945, "grad_norm": 0.274387595273132, "learning_rate": 2.2157897888015247e-06, "loss": 0.01, "step": 6460 }, { "epoch": 2.157622307563867, "grad_norm": 0.28893125434374417, "learning_rate": 2.214175830702871e-06, "loss": 0.0119, "step": 6461 }, { "epoch": 2.15795625313074, "grad_norm": 0.22081395823937297, "learning_rate": 2.2125622934357588e-06, "loss": 0.0119, "step": 6462 }, { "epoch": 2.158290198697612, "grad_norm": 0.4761637320071046, "learning_rate": 2.210949177243933e-06, "loss": 0.0134, "step": 6463 }, { "epoch": 2.158624144264485, "grad_norm": 0.37701366853689444, "learning_rate": 2.209336482371076e-06, "loss": 0.0286, "step": 6464 }, { "epoch": 2.1589580898313576, "grad_norm": 0.30219015519774584, "learning_rate": 2.2077242090608e-06, "loss": 0.016, "step": 6465 }, { "epoch": 2.15929203539823, "grad_norm": 0.28032324471684783, "learning_rate": 2.206112357556662e-06, "loss": 0.0186, "step": 6466 }, { "epoch": 2.1596259809651026, "grad_norm": 0.2399235550633597, "learning_rate": 2.2045009281021486e-06, "loss": 0.0101, "step": 6467 }, { "epoch": 2.1599599265319753, "grad_norm": 0.30685223515071325, "learning_rate": 2.202889920940685e-06, "loss": 0.0178, "step": 6468 }, { "epoch": 2.160293872098848, "grad_norm": 0.28914116472358037, "learning_rate": 2.2012793363156337e-06, "loss": 0.0158, "step": 6469 }, { "epoch": 2.1606278176657203, "grad_norm": 0.2973262388915178, "learning_rate": 2.199669174470295e-06, "loss": 0.014, "step": 6470 }, { "epoch": 2.160961763232593, "grad_norm": 0.34694651618081374, "learning_rate": 2.1980594356478977e-06, "loss": 0.0199, "step": 6471 }, { "epoch": 2.1612957087994658, "grad_norm": 0.22795188552565387, "learning_rate": 2.196450120091617e-06, "loss": 0.0141, "step": 6472 }, { "epoch": 2.1616296543663385, "grad_norm": 0.2548834137226818, "learning_rate": 2.194841228044554e-06, "loss": 0.0113, "step": 6473 }, { "epoch": 2.1619635999332107, "grad_norm": 0.3079495826836035, "learning_rate": 2.1932327597497537e-06, "loss": 0.0158, "step": 6474 }, { "epoch": 2.1622975455000835, "grad_norm": 0.3183568664049095, "learning_rate": 2.1916247154501937e-06, "loss": 0.0148, "step": 6475 }, { "epoch": 2.162631491066956, "grad_norm": 0.29722517078008187, "learning_rate": 2.190017095388789e-06, "loss": 0.0209, "step": 6476 }, { "epoch": 2.1629654366338285, "grad_norm": 0.2374639534191777, "learning_rate": 2.1884098998083867e-06, "loss": 0.0115, "step": 6477 }, { "epoch": 2.163299382200701, "grad_norm": 0.3003642132969079, "learning_rate": 2.1868031289517773e-06, "loss": 0.015, "step": 6478 }, { "epoch": 2.163633327767574, "grad_norm": 0.2790728792422586, "learning_rate": 2.1851967830616773e-06, "loss": 0.0141, "step": 6479 }, { "epoch": 2.1639672733344466, "grad_norm": 0.37289641299565107, "learning_rate": 2.1835908623807462e-06, "loss": 0.0155, "step": 6480 }, { "epoch": 2.164301218901319, "grad_norm": 0.2828161579770664, "learning_rate": 2.1819853671515774e-06, "loss": 0.0144, "step": 6481 }, { "epoch": 2.1646351644681916, "grad_norm": 0.35307565715924766, "learning_rate": 2.180380297616702e-06, "loss": 0.0195, "step": 6482 }, { "epoch": 2.1649691100350643, "grad_norm": 0.3231217864449414, "learning_rate": 2.178775654018581e-06, "loss": 0.0229, "step": 6483 }, { "epoch": 2.165303055601937, "grad_norm": 0.32269783910999195, "learning_rate": 2.177171436599618e-06, "loss": 0.023, "step": 6484 }, { "epoch": 2.1656370011688093, "grad_norm": 0.33757397074450257, "learning_rate": 2.1755676456021454e-06, "loss": 0.0186, "step": 6485 }, { "epoch": 2.165970946735682, "grad_norm": 0.26188053160341745, "learning_rate": 2.173964281268436e-06, "loss": 0.0153, "step": 6486 }, { "epoch": 2.1663048923025547, "grad_norm": 0.30936393138757456, "learning_rate": 2.1723613438407e-06, "loss": 0.0132, "step": 6487 }, { "epoch": 2.1666388378694275, "grad_norm": 0.3458044555886651, "learning_rate": 2.170758833561075e-06, "loss": 0.016, "step": 6488 }, { "epoch": 2.1669727834362997, "grad_norm": 0.31332504262449845, "learning_rate": 2.1691567506716433e-06, "loss": 0.01, "step": 6489 }, { "epoch": 2.1673067290031724, "grad_norm": 0.32582133447973455, "learning_rate": 2.1675550954144147e-06, "loss": 0.0137, "step": 6490 }, { "epoch": 2.167640674570045, "grad_norm": 0.2617877479730621, "learning_rate": 2.1659538680313403e-06, "loss": 0.0161, "step": 6491 }, { "epoch": 2.167974620136918, "grad_norm": 0.3380088068416834, "learning_rate": 2.1643530687643036e-06, "loss": 0.0171, "step": 6492 }, { "epoch": 2.16830856570379, "grad_norm": 0.484747950505644, "learning_rate": 2.1627526978551265e-06, "loss": 0.0187, "step": 6493 }, { "epoch": 2.168642511270663, "grad_norm": 0.36607927637539583, "learning_rate": 2.1611527555455604e-06, "loss": 0.0152, "step": 6494 }, { "epoch": 2.1689764568375356, "grad_norm": 0.2269337363172091, "learning_rate": 2.159553242077298e-06, "loss": 0.0122, "step": 6495 }, { "epoch": 2.169310402404408, "grad_norm": 0.2705252246172487, "learning_rate": 2.1579541576919624e-06, "loss": 0.0147, "step": 6496 }, { "epoch": 2.1696443479712806, "grad_norm": 0.46140511901702713, "learning_rate": 2.1563555026311166e-06, "loss": 0.0217, "step": 6497 }, { "epoch": 2.1699782935381533, "grad_norm": 0.32301551635597753, "learning_rate": 2.154757277136251e-06, "loss": 0.0156, "step": 6498 }, { "epoch": 2.170312239105026, "grad_norm": 0.25392799693456997, "learning_rate": 2.153159481448805e-06, "loss": 0.012, "step": 6499 }, { "epoch": 2.1706461846718983, "grad_norm": 0.362154037471294, "learning_rate": 2.1515621158101372e-06, "loss": 0.0238, "step": 6500 }, { "epoch": 2.170980130238771, "grad_norm": 0.24543762967602123, "learning_rate": 2.1499651804615534e-06, "loss": 0.0126, "step": 6501 }, { "epoch": 2.1713140758056437, "grad_norm": 0.32792892953249453, "learning_rate": 2.148368675644285e-06, "loss": 0.0102, "step": 6502 }, { "epoch": 2.1716480213725164, "grad_norm": 0.31516955497983384, "learning_rate": 2.146772601599507e-06, "loss": 0.0119, "step": 6503 }, { "epoch": 2.1719819669393887, "grad_norm": 0.32905645746958895, "learning_rate": 2.1451769585683196e-06, "loss": 0.0162, "step": 6504 }, { "epoch": 2.1723159125062614, "grad_norm": 0.2558183977993946, "learning_rate": 2.14358174679177e-06, "loss": 0.0126, "step": 6505 }, { "epoch": 2.172649858073134, "grad_norm": 0.20034120251384852, "learning_rate": 2.1419869665108303e-06, "loss": 0.0092, "step": 6506 }, { "epoch": 2.172983803640007, "grad_norm": 0.35462762982860063, "learning_rate": 2.140392617966412e-06, "loss": 0.0192, "step": 6507 }, { "epoch": 2.173317749206879, "grad_norm": 0.2697304698584604, "learning_rate": 2.1387987013993583e-06, "loss": 0.0128, "step": 6508 }, { "epoch": 2.173651694773752, "grad_norm": 0.26940891178046966, "learning_rate": 2.137205217050452e-06, "loss": 0.0147, "step": 6509 }, { "epoch": 2.1739856403406246, "grad_norm": 0.2975749995789792, "learning_rate": 2.135612165160404e-06, "loss": 0.0177, "step": 6510 }, { "epoch": 2.1743195859074973, "grad_norm": 0.2968568004179634, "learning_rate": 2.1340195459698653e-06, "loss": 0.0153, "step": 6511 }, { "epoch": 2.1746535314743696, "grad_norm": 0.24343310815444663, "learning_rate": 2.1324273597194223e-06, "loss": 0.011, "step": 6512 }, { "epoch": 2.1749874770412423, "grad_norm": 0.20837938839405604, "learning_rate": 2.1308356066495893e-06, "loss": 0.0087, "step": 6513 }, { "epoch": 2.175321422608115, "grad_norm": 0.23374895493572262, "learning_rate": 2.1292442870008213e-06, "loss": 0.0112, "step": 6514 }, { "epoch": 2.1756553681749873, "grad_norm": 0.35733174135473794, "learning_rate": 2.1276534010135053e-06, "loss": 0.0148, "step": 6515 }, { "epoch": 2.17598931374186, "grad_norm": 0.2928966567009293, "learning_rate": 2.1260629489279662e-06, "loss": 0.0165, "step": 6516 }, { "epoch": 2.1763232593087327, "grad_norm": 0.23835990278481167, "learning_rate": 2.1244729309844564e-06, "loss": 0.0107, "step": 6517 }, { "epoch": 2.1766572048756054, "grad_norm": 0.2586294519234971, "learning_rate": 2.1228833474231703e-06, "loss": 0.0107, "step": 6518 }, { "epoch": 2.1769911504424777, "grad_norm": 0.30566469616604947, "learning_rate": 2.1212941984842295e-06, "loss": 0.0143, "step": 6519 }, { "epoch": 2.1773250960093504, "grad_norm": 0.29438725613258704, "learning_rate": 2.1197054844076975e-06, "loss": 0.0162, "step": 6520 }, { "epoch": 2.177659041576223, "grad_norm": 0.33845121195766575, "learning_rate": 2.118117205433563e-06, "loss": 0.0148, "step": 6521 }, { "epoch": 2.177992987143096, "grad_norm": 0.2795510097775822, "learning_rate": 2.1165293618017612e-06, "loss": 0.0167, "step": 6522 }, { "epoch": 2.178326932709968, "grad_norm": 0.29923949401454447, "learning_rate": 2.1149419537521495e-06, "loss": 0.0167, "step": 6523 }, { "epoch": 2.178660878276841, "grad_norm": 0.31872030152725334, "learning_rate": 2.1133549815245273e-06, "loss": 0.0201, "step": 6524 }, { "epoch": 2.1789948238437136, "grad_norm": 0.30638147299674867, "learning_rate": 2.1117684453586236e-06, "loss": 0.0158, "step": 6525 }, { "epoch": 2.179328769410586, "grad_norm": 0.2840187173218848, "learning_rate": 2.110182345494105e-06, "loss": 0.0135, "step": 6526 }, { "epoch": 2.1796627149774586, "grad_norm": 0.3239255247053515, "learning_rate": 2.1085966821705662e-06, "loss": 0.0154, "step": 6527 }, { "epoch": 2.1799966605443313, "grad_norm": 0.34214025711008544, "learning_rate": 2.1070114556275473e-06, "loss": 0.0144, "step": 6528 }, { "epoch": 2.180330606111204, "grad_norm": 0.3442808319286775, "learning_rate": 2.1054266661045105e-06, "loss": 0.024, "step": 6529 }, { "epoch": 2.1806645516780763, "grad_norm": 0.29995911473421333, "learning_rate": 2.103842313840859e-06, "loss": 0.0154, "step": 6530 }, { "epoch": 2.180998497244949, "grad_norm": 0.24219781517831027, "learning_rate": 2.1022583990759265e-06, "loss": 0.0106, "step": 6531 }, { "epoch": 2.1813324428118217, "grad_norm": 0.2571694548375631, "learning_rate": 2.1006749220489834e-06, "loss": 0.0114, "step": 6532 }, { "epoch": 2.1816663883786944, "grad_norm": 0.25602685008977827, "learning_rate": 2.0990918829992307e-06, "loss": 0.0096, "step": 6533 }, { "epoch": 2.1820003339455667, "grad_norm": 0.2898189520049827, "learning_rate": 2.097509282165806e-06, "loss": 0.0122, "step": 6534 }, { "epoch": 2.1823342795124394, "grad_norm": 0.32648803670202425, "learning_rate": 2.0959271197877816e-06, "loss": 0.0227, "step": 6535 }, { "epoch": 2.182668225079312, "grad_norm": 0.27872952226235354, "learning_rate": 2.0943453961041587e-06, "loss": 0.0144, "step": 6536 }, { "epoch": 2.183002170646185, "grad_norm": 0.2623052154519515, "learning_rate": 2.0927641113538764e-06, "loss": 0.0117, "step": 6537 }, { "epoch": 2.183336116213057, "grad_norm": 0.28053985522515906, "learning_rate": 2.0911832657758086e-06, "loss": 0.012, "step": 6538 }, { "epoch": 2.18367006177993, "grad_norm": 0.22013024035220435, "learning_rate": 2.089602859608757e-06, "loss": 0.0107, "step": 6539 }, { "epoch": 2.1840040073468026, "grad_norm": 0.298667729904666, "learning_rate": 2.088022893091462e-06, "loss": 0.0189, "step": 6540 }, { "epoch": 2.1843379529136753, "grad_norm": 0.3372124117393272, "learning_rate": 2.086443366462598e-06, "loss": 0.0216, "step": 6541 }, { "epoch": 2.1846718984805475, "grad_norm": 0.25179480117190695, "learning_rate": 2.084864279960768e-06, "loss": 0.0143, "step": 6542 }, { "epoch": 2.1850058440474203, "grad_norm": 0.3368441001252815, "learning_rate": 2.0832856338245157e-06, "loss": 0.0162, "step": 6543 }, { "epoch": 2.185339789614293, "grad_norm": 0.3397412128187092, "learning_rate": 2.0817074282923087e-06, "loss": 0.0125, "step": 6544 }, { "epoch": 2.1856737351811653, "grad_norm": 0.2201558676784805, "learning_rate": 2.080129663602557e-06, "loss": 0.0111, "step": 6545 }, { "epoch": 2.186007680748038, "grad_norm": 0.25782742203882697, "learning_rate": 2.0785523399935996e-06, "loss": 0.0115, "step": 6546 }, { "epoch": 2.1863416263149107, "grad_norm": 0.431806806501223, "learning_rate": 2.076975457703712e-06, "loss": 0.0194, "step": 6547 }, { "epoch": 2.1866755718817834, "grad_norm": 0.23690739666194224, "learning_rate": 2.0753990169710973e-06, "loss": 0.0098, "step": 6548 }, { "epoch": 2.1870095174486557, "grad_norm": 0.2609284901050491, "learning_rate": 2.0738230180338993e-06, "loss": 0.0115, "step": 6549 }, { "epoch": 2.1873434630155284, "grad_norm": 0.26290840908723817, "learning_rate": 2.0722474611301868e-06, "loss": 0.0166, "step": 6550 }, { "epoch": 2.187677408582401, "grad_norm": 0.2747028612559764, "learning_rate": 2.0706723464979687e-06, "loss": 0.0142, "step": 6551 }, { "epoch": 2.188011354149274, "grad_norm": 0.35122297292874205, "learning_rate": 2.0690976743751844e-06, "loss": 0.0216, "step": 6552 }, { "epoch": 2.188345299716146, "grad_norm": 0.2420889603896751, "learning_rate": 2.0675234449997085e-06, "loss": 0.0112, "step": 6553 }, { "epoch": 2.188679245283019, "grad_norm": 0.29582887432454824, "learning_rate": 2.065949658609343e-06, "loss": 0.0163, "step": 6554 }, { "epoch": 2.1890131908498915, "grad_norm": 0.26322567331836094, "learning_rate": 2.0643763154418304e-06, "loss": 0.016, "step": 6555 }, { "epoch": 2.1893471364167643, "grad_norm": 0.2516184473720276, "learning_rate": 2.06280341573484e-06, "loss": 0.0131, "step": 6556 }, { "epoch": 2.1896810819836365, "grad_norm": 0.24844362417839239, "learning_rate": 2.0612309597259776e-06, "loss": 0.0127, "step": 6557 }, { "epoch": 2.1900150275505093, "grad_norm": 0.2824712930720436, "learning_rate": 2.059658947652784e-06, "loss": 0.0148, "step": 6558 }, { "epoch": 2.190348973117382, "grad_norm": 0.2223810259384668, "learning_rate": 2.058087379752725e-06, "loss": 0.012, "step": 6559 }, { "epoch": 2.1906829186842547, "grad_norm": 0.33889376103291663, "learning_rate": 2.056516256263208e-06, "loss": 0.0141, "step": 6560 }, { "epoch": 2.191016864251127, "grad_norm": 0.4351553799654831, "learning_rate": 2.0549455774215705e-06, "loss": 0.0215, "step": 6561 }, { "epoch": 2.1913508098179997, "grad_norm": 0.23904238429895308, "learning_rate": 2.0533753434650784e-06, "loss": 0.0103, "step": 6562 }, { "epoch": 2.1916847553848724, "grad_norm": 0.4589855113278374, "learning_rate": 2.0518055546309362e-06, "loss": 0.0121, "step": 6563 }, { "epoch": 2.1920187009517447, "grad_norm": 0.27028252936995945, "learning_rate": 2.0502362111562806e-06, "loss": 0.0126, "step": 6564 }, { "epoch": 2.1923526465186174, "grad_norm": 0.21546077643021916, "learning_rate": 2.048667313278176e-06, "loss": 0.0103, "step": 6565 }, { "epoch": 2.19268659208549, "grad_norm": 0.34168154460500905, "learning_rate": 2.0470988612336264e-06, "loss": 0.0121, "step": 6566 }, { "epoch": 2.193020537652363, "grad_norm": 0.30497679243547027, "learning_rate": 2.045530855259561e-06, "loss": 0.0156, "step": 6567 }, { "epoch": 2.193354483219235, "grad_norm": 0.3072013185745378, "learning_rate": 2.043963295592848e-06, "loss": 0.0157, "step": 6568 }, { "epoch": 2.193688428786108, "grad_norm": 0.3228384471020377, "learning_rate": 2.042396182470285e-06, "loss": 0.016, "step": 6569 }, { "epoch": 2.1940223743529805, "grad_norm": 0.3016048980193982, "learning_rate": 2.040829516128605e-06, "loss": 0.0123, "step": 6570 }, { "epoch": 2.1943563199198532, "grad_norm": 0.32031088004186065, "learning_rate": 2.0392632968044686e-06, "loss": 0.0192, "step": 6571 }, { "epoch": 2.1946902654867255, "grad_norm": 0.2905920654524365, "learning_rate": 2.0376975247344736e-06, "loss": 0.0152, "step": 6572 }, { "epoch": 2.1950242110535982, "grad_norm": 0.29792214862022215, "learning_rate": 2.0361322001551466e-06, "loss": 0.0144, "step": 6573 }, { "epoch": 2.195358156620471, "grad_norm": 0.2731855199925054, "learning_rate": 2.034567323302949e-06, "loss": 0.0104, "step": 6574 }, { "epoch": 2.1956921021873432, "grad_norm": 0.30065602321357754, "learning_rate": 2.0330028944142736e-06, "loss": 0.0142, "step": 6575 }, { "epoch": 2.196026047754216, "grad_norm": 0.2649649674226487, "learning_rate": 2.031438913725448e-06, "loss": 0.0109, "step": 6576 }, { "epoch": 2.1963599933210887, "grad_norm": 0.4033152837191047, "learning_rate": 2.0298753814727267e-06, "loss": 0.0176, "step": 6577 }, { "epoch": 2.1966939388879614, "grad_norm": 0.29091787328055135, "learning_rate": 2.028312297892303e-06, "loss": 0.0178, "step": 6578 }, { "epoch": 2.1970278844548337, "grad_norm": 0.2882873443886919, "learning_rate": 2.0267496632202953e-06, "loss": 0.0148, "step": 6579 }, { "epoch": 2.1973618300217064, "grad_norm": 0.34924737508772247, "learning_rate": 2.0251874776927598e-06, "loss": 0.0138, "step": 6580 }, { "epoch": 2.197695775588579, "grad_norm": 0.2722375379686338, "learning_rate": 2.0236257415456833e-06, "loss": 0.012, "step": 6581 }, { "epoch": 2.198029721155452, "grad_norm": 0.3017348298669219, "learning_rate": 2.022064455014986e-06, "loss": 0.0115, "step": 6582 }, { "epoch": 2.198363666722324, "grad_norm": 0.291571694536511, "learning_rate": 2.0205036183365145e-06, "loss": 0.0135, "step": 6583 }, { "epoch": 2.198697612289197, "grad_norm": 0.22837631193895652, "learning_rate": 2.018943231746056e-06, "loss": 0.0143, "step": 6584 }, { "epoch": 2.1990315578560695, "grad_norm": 0.32345728882665414, "learning_rate": 2.0173832954793216e-06, "loss": 0.0229, "step": 6585 }, { "epoch": 2.1993655034229422, "grad_norm": 0.3151011464115596, "learning_rate": 2.0158238097719597e-06, "loss": 0.0119, "step": 6586 }, { "epoch": 2.1996994489898145, "grad_norm": 0.3343215908705643, "learning_rate": 2.0142647748595502e-06, "loss": 0.0197, "step": 6587 }, { "epoch": 2.2000333945566872, "grad_norm": 0.40873630806611855, "learning_rate": 2.0127061909776e-06, "loss": 0.0225, "step": 6588 }, { "epoch": 2.20036734012356, "grad_norm": 0.28452049085939546, "learning_rate": 2.0111480583615566e-06, "loss": 0.0138, "step": 6589 }, { "epoch": 2.2007012856904327, "grad_norm": 0.29368232293360924, "learning_rate": 2.00959037724679e-06, "loss": 0.0151, "step": 6590 }, { "epoch": 2.201035231257305, "grad_norm": 0.3738527865316701, "learning_rate": 2.0080331478686087e-06, "loss": 0.027, "step": 6591 }, { "epoch": 2.2013691768241777, "grad_norm": 0.2954931019724482, "learning_rate": 2.006476370462247e-06, "loss": 0.0182, "step": 6592 }, { "epoch": 2.2017031223910504, "grad_norm": 0.23965981424437982, "learning_rate": 2.0049200452628803e-06, "loss": 0.0113, "step": 6593 }, { "epoch": 2.2020370679579226, "grad_norm": 0.2654406426479545, "learning_rate": 2.0033641725056048e-06, "loss": 0.0106, "step": 6594 }, { "epoch": 2.2023710135247954, "grad_norm": 0.1984422467408501, "learning_rate": 2.001808752425457e-06, "loss": 0.0083, "step": 6595 }, { "epoch": 2.202704959091668, "grad_norm": 0.27510526024218646, "learning_rate": 2.000253785257398e-06, "loss": 0.0151, "step": 6596 }, { "epoch": 2.203038904658541, "grad_norm": 0.3277857679527759, "learning_rate": 1.998699271236326e-06, "loss": 0.0157, "step": 6597 }, { "epoch": 2.203372850225413, "grad_norm": 0.3006917973785943, "learning_rate": 1.997145210597068e-06, "loss": 0.0187, "step": 6598 }, { "epoch": 2.203706795792286, "grad_norm": 0.2466694275717017, "learning_rate": 1.9955916035743855e-06, "loss": 0.0112, "step": 6599 }, { "epoch": 2.2040407413591585, "grad_norm": 0.26123473352892995, "learning_rate": 1.9940384504029647e-06, "loss": 0.0094, "step": 6600 }, { "epoch": 2.2043746869260312, "grad_norm": 0.2895012517364698, "learning_rate": 1.9924857513174324e-06, "loss": 0.0129, "step": 6601 }, { "epoch": 2.2047086324929035, "grad_norm": 0.2729192583071324, "learning_rate": 1.990933506552337e-06, "loss": 0.0106, "step": 6602 }, { "epoch": 2.205042578059776, "grad_norm": 0.31083387330033807, "learning_rate": 1.989381716342167e-06, "loss": 0.0156, "step": 6603 }, { "epoch": 2.205376523626649, "grad_norm": 0.32074137316370627, "learning_rate": 1.9878303809213367e-06, "loss": 0.0133, "step": 6604 }, { "epoch": 2.2057104691935217, "grad_norm": 0.3382822914612812, "learning_rate": 1.986279500524197e-06, "loss": 0.0117, "step": 6605 }, { "epoch": 2.206044414760394, "grad_norm": 0.24538926003413847, "learning_rate": 1.984729075385022e-06, "loss": 0.0091, "step": 6606 }, { "epoch": 2.2063783603272666, "grad_norm": 0.2698418649853723, "learning_rate": 1.983179105738026e-06, "loss": 0.0116, "step": 6607 }, { "epoch": 2.2067123058941394, "grad_norm": 0.32079606395170346, "learning_rate": 1.9816295918173462e-06, "loss": 0.0135, "step": 6608 }, { "epoch": 2.207046251461012, "grad_norm": 0.27266592971802145, "learning_rate": 1.9800805338570562e-06, "loss": 0.0109, "step": 6609 }, { "epoch": 2.2073801970278843, "grad_norm": 0.32347673712171227, "learning_rate": 1.9785319320911623e-06, "loss": 0.0145, "step": 6610 }, { "epoch": 2.207714142594757, "grad_norm": 0.24981179450852878, "learning_rate": 1.9769837867535948e-06, "loss": 0.0132, "step": 6611 }, { "epoch": 2.20804808816163, "grad_norm": 0.2783778140995123, "learning_rate": 1.9754360980782227e-06, "loss": 0.0141, "step": 6612 }, { "epoch": 2.208382033728502, "grad_norm": 0.5386887104628023, "learning_rate": 1.973888866298839e-06, "loss": 0.0243, "step": 6613 }, { "epoch": 2.2087159792953748, "grad_norm": 0.2840583006409117, "learning_rate": 1.972342091649176e-06, "loss": 0.015, "step": 6614 }, { "epoch": 2.2090499248622475, "grad_norm": 0.25509634295763384, "learning_rate": 1.9707957743628854e-06, "loss": 0.0096, "step": 6615 }, { "epoch": 2.20938387042912, "grad_norm": 0.31283208613488706, "learning_rate": 1.9692499146735646e-06, "loss": 0.0185, "step": 6616 }, { "epoch": 2.2097178159959925, "grad_norm": 0.26442709656835756, "learning_rate": 1.967704512814728e-06, "loss": 0.0123, "step": 6617 }, { "epoch": 2.210051761562865, "grad_norm": 0.27959036621366046, "learning_rate": 1.966159569019831e-06, "loss": 0.0103, "step": 6618 }, { "epoch": 2.210385707129738, "grad_norm": 0.26272583128668314, "learning_rate": 1.9646150835222517e-06, "loss": 0.0132, "step": 6619 }, { "epoch": 2.2107196526966106, "grad_norm": 0.28851377822694657, "learning_rate": 1.9630710565553063e-06, "loss": 0.0108, "step": 6620 }, { "epoch": 2.211053598263483, "grad_norm": 0.23989503512968924, "learning_rate": 1.9615274883522327e-06, "loss": 0.0123, "step": 6621 }, { "epoch": 2.2113875438303556, "grad_norm": 0.301524978850988, "learning_rate": 1.9599843791462123e-06, "loss": 0.0181, "step": 6622 }, { "epoch": 2.2117214893972283, "grad_norm": 0.23221174730581715, "learning_rate": 1.958441729170345e-06, "loss": 0.0118, "step": 6623 }, { "epoch": 2.2120554349641006, "grad_norm": 0.2665690588320243, "learning_rate": 1.9568995386576695e-06, "loss": 0.0111, "step": 6624 }, { "epoch": 2.2123893805309733, "grad_norm": 0.22961451400894545, "learning_rate": 1.9553578078411476e-06, "loss": 0.0116, "step": 6625 }, { "epoch": 2.212723326097846, "grad_norm": 0.26487606187895424, "learning_rate": 1.953816536953681e-06, "loss": 0.0101, "step": 6626 }, { "epoch": 2.2130572716647188, "grad_norm": 0.27195401533111785, "learning_rate": 1.95227572622809e-06, "loss": 0.0132, "step": 6627 }, { "epoch": 2.213391217231591, "grad_norm": 0.29527337141955756, "learning_rate": 1.95073537589714e-06, "loss": 0.0173, "step": 6628 }, { "epoch": 2.2137251627984638, "grad_norm": 0.22944929591469504, "learning_rate": 1.949195486193514e-06, "loss": 0.0128, "step": 6629 }, { "epoch": 2.2140591083653365, "grad_norm": 0.23158442415002364, "learning_rate": 1.9476560573498332e-06, "loss": 0.0108, "step": 6630 }, { "epoch": 2.214393053932209, "grad_norm": 0.2807469430714143, "learning_rate": 1.946117089598644e-06, "loss": 0.0147, "step": 6631 }, { "epoch": 2.2147269994990815, "grad_norm": 0.2590706685467037, "learning_rate": 1.9445785831724274e-06, "loss": 0.012, "step": 6632 }, { "epoch": 2.215060945065954, "grad_norm": 0.2721975963281122, "learning_rate": 1.943040538303591e-06, "loss": 0.014, "step": 6633 }, { "epoch": 2.215394890632827, "grad_norm": 0.24478951889187736, "learning_rate": 1.9415029552244758e-06, "loss": 0.0131, "step": 6634 }, { "epoch": 2.2157288361996996, "grad_norm": 0.37398456573476724, "learning_rate": 1.939965834167354e-06, "loss": 0.0248, "step": 6635 }, { "epoch": 2.216062781766572, "grad_norm": 0.22501235029711275, "learning_rate": 1.9384291753644215e-06, "loss": 0.0118, "step": 6636 }, { "epoch": 2.2163967273334446, "grad_norm": 0.2222184565929166, "learning_rate": 1.9368929790478126e-06, "loss": 0.0107, "step": 6637 }, { "epoch": 2.2167306729003173, "grad_norm": 0.30438470736257894, "learning_rate": 1.935357245449583e-06, "loss": 0.0141, "step": 6638 }, { "epoch": 2.21706461846719, "grad_norm": 0.3370022723149651, "learning_rate": 1.9338219748017297e-06, "loss": 0.0143, "step": 6639 }, { "epoch": 2.2173985640340623, "grad_norm": 0.33336519710123375, "learning_rate": 1.932287167336168e-06, "loss": 0.0193, "step": 6640 }, { "epoch": 2.217732509600935, "grad_norm": 0.2916646274486344, "learning_rate": 1.9307528232847533e-06, "loss": 0.014, "step": 6641 }, { "epoch": 2.2180664551678078, "grad_norm": 0.25762380383595834, "learning_rate": 1.9292189428792617e-06, "loss": 0.0115, "step": 6642 }, { "epoch": 2.21840040073468, "grad_norm": 0.36295273735003497, "learning_rate": 1.927685526351408e-06, "loss": 0.0201, "step": 6643 }, { "epoch": 2.2187343463015528, "grad_norm": 0.26969220260430155, "learning_rate": 1.9261525739328273e-06, "loss": 0.0142, "step": 6644 }, { "epoch": 2.2190682918684255, "grad_norm": 0.23887853483812735, "learning_rate": 1.924620085855097e-06, "loss": 0.0103, "step": 6645 }, { "epoch": 2.219402237435298, "grad_norm": 0.3358340061922241, "learning_rate": 1.923088062349713e-06, "loss": 0.0172, "step": 6646 }, { "epoch": 2.2197361830021705, "grad_norm": 0.27072620057255836, "learning_rate": 1.9215565036481083e-06, "loss": 0.0116, "step": 6647 }, { "epoch": 2.220070128569043, "grad_norm": 0.2871520252799103, "learning_rate": 1.920025409981639e-06, "loss": 0.0157, "step": 6648 }, { "epoch": 2.220404074135916, "grad_norm": 0.2663963856217783, "learning_rate": 1.918494781581599e-06, "loss": 0.0118, "step": 6649 }, { "epoch": 2.2207380197027886, "grad_norm": 0.273154380274795, "learning_rate": 1.9169646186792025e-06, "loss": 0.0158, "step": 6650 }, { "epoch": 2.221071965269661, "grad_norm": 0.24243409303867358, "learning_rate": 1.9154349215056052e-06, "loss": 0.0123, "step": 6651 }, { "epoch": 2.2214059108365336, "grad_norm": 0.2917113561997659, "learning_rate": 1.9139056902918805e-06, "loss": 0.0164, "step": 6652 }, { "epoch": 2.2217398564034063, "grad_norm": 0.2460869043839189, "learning_rate": 1.912376925269041e-06, "loss": 0.0085, "step": 6653 }, { "epoch": 2.222073801970279, "grad_norm": 0.3132078323676648, "learning_rate": 1.910848626668021e-06, "loss": 0.0132, "step": 6654 }, { "epoch": 2.2224077475371513, "grad_norm": 0.2701488132093934, "learning_rate": 1.9093207947196908e-06, "loss": 0.0157, "step": 6655 }, { "epoch": 2.222741693104024, "grad_norm": 0.29168666693425044, "learning_rate": 1.9077934296548445e-06, "loss": 0.0153, "step": 6656 }, { "epoch": 2.2230756386708967, "grad_norm": 0.2863420988214763, "learning_rate": 1.9062665317042106e-06, "loss": 0.0165, "step": 6657 }, { "epoch": 2.2234095842377695, "grad_norm": 0.2226448764019503, "learning_rate": 1.9047401010984456e-06, "loss": 0.0119, "step": 6658 }, { "epoch": 2.2237435298046417, "grad_norm": 0.26354728659444615, "learning_rate": 1.9032141380681329e-06, "loss": 0.0094, "step": 6659 }, { "epoch": 2.2240774753715145, "grad_norm": 0.2978104319107039, "learning_rate": 1.9016886428437893e-06, "loss": 0.0163, "step": 6660 }, { "epoch": 2.224411420938387, "grad_norm": 0.3054822295300854, "learning_rate": 1.9001636156558562e-06, "loss": 0.0165, "step": 6661 }, { "epoch": 2.2247453665052594, "grad_norm": 0.3426405779533461, "learning_rate": 1.8986390567347085e-06, "loss": 0.0113, "step": 6662 }, { "epoch": 2.225079312072132, "grad_norm": 0.27080000362096207, "learning_rate": 1.8971149663106482e-06, "loss": 0.0149, "step": 6663 }, { "epoch": 2.225413257639005, "grad_norm": 0.255362712712875, "learning_rate": 1.8955913446139096e-06, "loss": 0.0179, "step": 6664 }, { "epoch": 2.2257472032058776, "grad_norm": 0.21085822866973594, "learning_rate": 1.8940681918746495e-06, "loss": 0.0096, "step": 6665 }, { "epoch": 2.22608114877275, "grad_norm": 0.26197638895665815, "learning_rate": 1.8925455083229622e-06, "loss": 0.0129, "step": 6666 }, { "epoch": 2.2264150943396226, "grad_norm": 0.296044755979088, "learning_rate": 1.891023294188863e-06, "loss": 0.015, "step": 6667 }, { "epoch": 2.2267490399064953, "grad_norm": 0.33847615330580905, "learning_rate": 1.8895015497023022e-06, "loss": 0.0174, "step": 6668 }, { "epoch": 2.227082985473368, "grad_norm": 0.27161238309725877, "learning_rate": 1.8879802750931574e-06, "loss": 0.0131, "step": 6669 }, { "epoch": 2.2274169310402403, "grad_norm": 0.2630033886363965, "learning_rate": 1.886459470591237e-06, "loss": 0.0184, "step": 6670 }, { "epoch": 2.227750876607113, "grad_norm": 0.25221611148972534, "learning_rate": 1.8849391364262721e-06, "loss": 0.0102, "step": 6671 }, { "epoch": 2.2280848221739857, "grad_norm": 0.2411037462392019, "learning_rate": 1.883419272827931e-06, "loss": 0.0119, "step": 6672 }, { "epoch": 2.228418767740858, "grad_norm": 0.30842097982208766, "learning_rate": 1.881899880025802e-06, "loss": 0.0118, "step": 6673 }, { "epoch": 2.2287527133077307, "grad_norm": 0.30269452723790186, "learning_rate": 1.8803809582494143e-06, "loss": 0.0132, "step": 6674 }, { "epoch": 2.2290866588746034, "grad_norm": 0.2315962193941701, "learning_rate": 1.878862507728213e-06, "loss": 0.014, "step": 6675 }, { "epoch": 2.229420604441476, "grad_norm": 0.2576225977756485, "learning_rate": 1.877344528691582e-06, "loss": 0.0119, "step": 6676 }, { "epoch": 2.2297545500083484, "grad_norm": 0.2687141455819776, "learning_rate": 1.8758270213688263e-06, "loss": 0.0123, "step": 6677 }, { "epoch": 2.230088495575221, "grad_norm": 0.3142648959932829, "learning_rate": 1.8743099859891866e-06, "loss": 0.0163, "step": 6678 }, { "epoch": 2.230422441142094, "grad_norm": 0.22243140409970208, "learning_rate": 1.8727934227818255e-06, "loss": 0.0087, "step": 6679 }, { "epoch": 2.2307563867089666, "grad_norm": 0.31782764706341865, "learning_rate": 1.8712773319758398e-06, "loss": 0.0175, "step": 6680 }, { "epoch": 2.231090332275839, "grad_norm": 0.23012015237340638, "learning_rate": 1.8697617138002545e-06, "loss": 0.0113, "step": 6681 }, { "epoch": 2.2314242778427116, "grad_norm": 0.2907323691970788, "learning_rate": 1.8682465684840178e-06, "loss": 0.0108, "step": 6682 }, { "epoch": 2.2317582234095843, "grad_norm": 0.23011189568327983, "learning_rate": 1.8667318962560137e-06, "loss": 0.0082, "step": 6683 }, { "epoch": 2.232092168976457, "grad_norm": 0.2516721806927547, "learning_rate": 1.865217697345048e-06, "loss": 0.0097, "step": 6684 }, { "epoch": 2.2324261145433293, "grad_norm": 0.2817613761200602, "learning_rate": 1.86370397197986e-06, "loss": 0.012, "step": 6685 }, { "epoch": 2.232760060110202, "grad_norm": 0.42020472149998295, "learning_rate": 1.8621907203891159e-06, "loss": 0.0237, "step": 6686 }, { "epoch": 2.2330940056770747, "grad_norm": 0.43109621633257, "learning_rate": 1.8606779428014116e-06, "loss": 0.0168, "step": 6687 }, { "epoch": 2.2334279512439474, "grad_norm": 0.3332873336373983, "learning_rate": 1.8591656394452667e-06, "loss": 0.0167, "step": 6688 }, { "epoch": 2.2337618968108197, "grad_norm": 0.29492481170953533, "learning_rate": 1.8576538105491359e-06, "loss": 0.0125, "step": 6689 }, { "epoch": 2.2340958423776924, "grad_norm": 0.4297722631827301, "learning_rate": 1.8561424563413949e-06, "loss": 0.0137, "step": 6690 }, { "epoch": 2.234429787944565, "grad_norm": 0.3126356239382773, "learning_rate": 1.8546315770503537e-06, "loss": 0.01, "step": 6691 }, { "epoch": 2.2347637335114374, "grad_norm": 0.2659002685378052, "learning_rate": 1.8531211729042486e-06, "loss": 0.0119, "step": 6692 }, { "epoch": 2.23509767907831, "grad_norm": 0.2515884976630654, "learning_rate": 1.8516112441312451e-06, "loss": 0.0102, "step": 6693 }, { "epoch": 2.235431624645183, "grad_norm": 0.6584408224358603, "learning_rate": 1.8501017909594327e-06, "loss": 0.0292, "step": 6694 }, { "epoch": 2.2357655702120556, "grad_norm": 0.32916335912485434, "learning_rate": 1.8485928136168353e-06, "loss": 0.0189, "step": 6695 }, { "epoch": 2.236099515778928, "grad_norm": 0.30130453888410424, "learning_rate": 1.8470843123313982e-06, "loss": 0.0139, "step": 6696 }, { "epoch": 2.2364334613458006, "grad_norm": 0.31575108799020773, "learning_rate": 1.8455762873309995e-06, "loss": 0.0208, "step": 6697 }, { "epoch": 2.2367674069126733, "grad_norm": 0.2634477834138086, "learning_rate": 1.844068738843446e-06, "loss": 0.0101, "step": 6698 }, { "epoch": 2.237101352479546, "grad_norm": 0.31670778701029184, "learning_rate": 1.8425616670964702e-06, "loss": 0.016, "step": 6699 }, { "epoch": 2.2374352980464183, "grad_norm": 0.2863805719542064, "learning_rate": 1.8410550723177306e-06, "loss": 0.0176, "step": 6700 }, { "epoch": 2.237769243613291, "grad_norm": 0.4318660466650565, "learning_rate": 1.8395489547348193e-06, "loss": 0.0321, "step": 6701 }, { "epoch": 2.2381031891801637, "grad_norm": 0.2541954139214152, "learning_rate": 1.8380433145752502e-06, "loss": 0.0181, "step": 6702 }, { "epoch": 2.2384371347470364, "grad_norm": 0.25345957940192115, "learning_rate": 1.8365381520664695e-06, "loss": 0.0143, "step": 6703 }, { "epoch": 2.2387710803139087, "grad_norm": 0.2487211406285926, "learning_rate": 1.8350334674358505e-06, "loss": 0.0132, "step": 6704 }, { "epoch": 2.2391050258807814, "grad_norm": 0.34324111636550203, "learning_rate": 1.8335292609106914e-06, "loss": 0.0147, "step": 6705 }, { "epoch": 2.239438971447654, "grad_norm": 0.22447618926400847, "learning_rate": 1.8320255327182224e-06, "loss": 0.0108, "step": 6706 }, { "epoch": 2.239772917014527, "grad_norm": 0.24868281068883916, "learning_rate": 1.8305222830855973e-06, "loss": 0.011, "step": 6707 }, { "epoch": 2.240106862581399, "grad_norm": 0.37030940073547103, "learning_rate": 1.8290195122399007e-06, "loss": 0.0122, "step": 6708 }, { "epoch": 2.240440808148272, "grad_norm": 0.249673743056727, "learning_rate": 1.8275172204081437e-06, "loss": 0.0102, "step": 6709 }, { "epoch": 2.2407747537151446, "grad_norm": 0.24818510455504927, "learning_rate": 1.826015407817267e-06, "loss": 0.0126, "step": 6710 }, { "epoch": 2.241108699282017, "grad_norm": 0.3418088337757231, "learning_rate": 1.8245140746941336e-06, "loss": 0.0177, "step": 6711 }, { "epoch": 2.2414426448488896, "grad_norm": 0.2745073300071627, "learning_rate": 1.823013221265541e-06, "loss": 0.013, "step": 6712 }, { "epoch": 2.2417765904157623, "grad_norm": 0.4142563888912224, "learning_rate": 1.8215128477582077e-06, "loss": 0.0187, "step": 6713 }, { "epoch": 2.242110535982635, "grad_norm": 0.3484803774881054, "learning_rate": 1.8200129543987843e-06, "loss": 0.0225, "step": 6714 }, { "epoch": 2.2424444815495073, "grad_norm": 0.306940495659452, "learning_rate": 1.818513541413847e-06, "loss": 0.0199, "step": 6715 }, { "epoch": 2.24277842711638, "grad_norm": 0.22152939963141818, "learning_rate": 1.8170146090299018e-06, "loss": 0.0096, "step": 6716 }, { "epoch": 2.2431123726832527, "grad_norm": 0.2774631148214958, "learning_rate": 1.8155161574733772e-06, "loss": 0.0123, "step": 6717 }, { "epoch": 2.2434463182501254, "grad_norm": 0.2479872398163057, "learning_rate": 1.8140181869706341e-06, "loss": 0.0115, "step": 6718 }, { "epoch": 2.2437802638169977, "grad_norm": 0.2731487174337819, "learning_rate": 1.812520697747956e-06, "loss": 0.016, "step": 6719 }, { "epoch": 2.2441142093838704, "grad_norm": 0.2709296983405702, "learning_rate": 1.8110236900315582e-06, "loss": 0.0121, "step": 6720 }, { "epoch": 2.244448154950743, "grad_norm": 0.3014712209019079, "learning_rate": 1.8095271640475802e-06, "loss": 0.0134, "step": 6721 }, { "epoch": 2.2447821005176154, "grad_norm": 0.31515946349656015, "learning_rate": 1.8080311200220935e-06, "loss": 0.0127, "step": 6722 }, { "epoch": 2.245116046084488, "grad_norm": 0.28613752523689995, "learning_rate": 1.8065355581810878e-06, "loss": 0.012, "step": 6723 }, { "epoch": 2.245449991651361, "grad_norm": 0.3752827969453257, "learning_rate": 1.80504047875049e-06, "loss": 0.0191, "step": 6724 }, { "epoch": 2.2457839372182336, "grad_norm": 0.26723048621555223, "learning_rate": 1.8035458819561453e-06, "loss": 0.0141, "step": 6725 }, { "epoch": 2.246117882785106, "grad_norm": 0.22271638835274343, "learning_rate": 1.8020517680238326e-06, "loss": 0.0113, "step": 6726 }, { "epoch": 2.2464518283519785, "grad_norm": 0.24665940678808335, "learning_rate": 1.8005581371792564e-06, "loss": 0.0115, "step": 6727 }, { "epoch": 2.2467857739188513, "grad_norm": 0.3005858141558532, "learning_rate": 1.799064989648044e-06, "loss": 0.0129, "step": 6728 }, { "epoch": 2.247119719485724, "grad_norm": 0.25348457130729, "learning_rate": 1.797572325655756e-06, "loss": 0.0088, "step": 6729 }, { "epoch": 2.2474536650525963, "grad_norm": 0.24754307031892908, "learning_rate": 1.7960801454278742e-06, "loss": 0.0113, "step": 6730 }, { "epoch": 2.247787610619469, "grad_norm": 0.2522656652125552, "learning_rate": 1.7945884491898119e-06, "loss": 0.01, "step": 6731 }, { "epoch": 2.2481215561863417, "grad_norm": 0.26088498011835287, "learning_rate": 1.7930972371669064e-06, "loss": 0.0146, "step": 6732 }, { "epoch": 2.2484555017532144, "grad_norm": 0.24391651493052038, "learning_rate": 1.791606509584425e-06, "loss": 0.0101, "step": 6733 }, { "epoch": 2.2487894473200867, "grad_norm": 0.22394944524557142, "learning_rate": 1.7901162666675564e-06, "loss": 0.0093, "step": 6734 }, { "epoch": 2.2491233928869594, "grad_norm": 0.370741820473369, "learning_rate": 1.7886265086414222e-06, "loss": 0.0241, "step": 6735 }, { "epoch": 2.249457338453832, "grad_norm": 0.3234539530147868, "learning_rate": 1.7871372357310651e-06, "loss": 0.0184, "step": 6736 }, { "epoch": 2.249791284020705, "grad_norm": 0.26048540767066103, "learning_rate": 1.7856484481614605e-06, "loss": 0.0137, "step": 6737 }, { "epoch": 2.250125229587577, "grad_norm": 0.22129009578567416, "learning_rate": 1.784160146157502e-06, "loss": 0.0139, "step": 6738 }, { "epoch": 2.25045917515445, "grad_norm": 0.37748350638730316, "learning_rate": 1.7826723299440224e-06, "loss": 0.0289, "step": 6739 }, { "epoch": 2.2507931207213225, "grad_norm": 0.24091456501707934, "learning_rate": 1.7811849997457681e-06, "loss": 0.0144, "step": 6740 }, { "epoch": 2.251127066288195, "grad_norm": 0.33289209133700864, "learning_rate": 1.779698155787422e-06, "loss": 0.0247, "step": 6741 }, { "epoch": 2.2514610118550675, "grad_norm": 0.29840920550364575, "learning_rate": 1.7782117982935854e-06, "loss": 0.02, "step": 6742 }, { "epoch": 2.2517949574219402, "grad_norm": 0.27802535917937715, "learning_rate": 1.7767259274887937e-06, "loss": 0.0163, "step": 6743 }, { "epoch": 2.252128902988813, "grad_norm": 0.3311624672647798, "learning_rate": 1.7752405435975002e-06, "loss": 0.0159, "step": 6744 }, { "epoch": 2.2524628485556852, "grad_norm": 0.2797337923935447, "learning_rate": 1.7737556468440964e-06, "loss": 0.0089, "step": 6745 }, { "epoch": 2.252796794122558, "grad_norm": 0.28803284395983925, "learning_rate": 1.7722712374528877e-06, "loss": 0.0125, "step": 6746 }, { "epoch": 2.2531307396894307, "grad_norm": 0.3025555554692831, "learning_rate": 1.7707873156481158e-06, "loss": 0.0162, "step": 6747 }, { "epoch": 2.2534646852563034, "grad_norm": 0.296463753648962, "learning_rate": 1.7693038816539416e-06, "loss": 0.0103, "step": 6748 }, { "epoch": 2.2537986308231757, "grad_norm": 0.30204472467144544, "learning_rate": 1.767820935694457e-06, "loss": 0.0125, "step": 6749 }, { "epoch": 2.2541325763900484, "grad_norm": 0.30579246176501707, "learning_rate": 1.7663384779936764e-06, "loss": 0.0158, "step": 6750 }, { "epoch": 2.254466521956921, "grad_norm": 0.31808116764866246, "learning_rate": 1.7648565087755442e-06, "loss": 0.0152, "step": 6751 }, { "epoch": 2.2548004675237934, "grad_norm": 0.31607876908662236, "learning_rate": 1.76337502826393e-06, "loss": 0.0122, "step": 6752 }, { "epoch": 2.255134413090666, "grad_norm": 0.3078147858767164, "learning_rate": 1.7618940366826266e-06, "loss": 0.0152, "step": 6753 }, { "epoch": 2.255468358657539, "grad_norm": 0.3184779928496126, "learning_rate": 1.7604135342553564e-06, "loss": 0.0199, "step": 6754 }, { "epoch": 2.2558023042244115, "grad_norm": 0.29048561038570914, "learning_rate": 1.7589335212057663e-06, "loss": 0.0159, "step": 6755 }, { "epoch": 2.2561362497912842, "grad_norm": 0.2856138908380283, "learning_rate": 1.7574539977574323e-06, "loss": 0.0151, "step": 6756 }, { "epoch": 2.2564701953581565, "grad_norm": 0.23988233482648116, "learning_rate": 1.7559749641338497e-06, "loss": 0.0138, "step": 6757 }, { "epoch": 2.2568041409250292, "grad_norm": 0.2280298279760251, "learning_rate": 1.7544964205584476e-06, "loss": 0.0087, "step": 6758 }, { "epoch": 2.257138086491902, "grad_norm": 0.24679574669725393, "learning_rate": 1.7530183672545743e-06, "loss": 0.0095, "step": 6759 }, { "epoch": 2.2574720320587742, "grad_norm": 0.29375683971947864, "learning_rate": 1.7515408044455102e-06, "loss": 0.015, "step": 6760 }, { "epoch": 2.257805977625647, "grad_norm": 0.28482641688588567, "learning_rate": 1.7500637323544534e-06, "loss": 0.0116, "step": 6761 }, { "epoch": 2.2581399231925197, "grad_norm": 0.29429403922629754, "learning_rate": 1.74858715120454e-06, "loss": 0.0139, "step": 6762 }, { "epoch": 2.2584738687593924, "grad_norm": 0.5477196044056735, "learning_rate": 1.7471110612188203e-06, "loss": 0.021, "step": 6763 }, { "epoch": 2.2588078143262647, "grad_norm": 0.3411849167684714, "learning_rate": 1.7456354626202775e-06, "loss": 0.0204, "step": 6764 }, { "epoch": 2.2591417598931374, "grad_norm": 0.22717898295287525, "learning_rate": 1.7441603556318155e-06, "loss": 0.0107, "step": 6765 }, { "epoch": 2.25947570546001, "grad_norm": 0.21815232273039045, "learning_rate": 1.74268574047627e-06, "loss": 0.0067, "step": 6766 }, { "epoch": 2.259809651026883, "grad_norm": 0.24089500821530055, "learning_rate": 1.7412116173763931e-06, "loss": 0.0111, "step": 6767 }, { "epoch": 2.260143596593755, "grad_norm": 0.27440544097516567, "learning_rate": 1.7397379865548758e-06, "loss": 0.0151, "step": 6768 }, { "epoch": 2.260477542160628, "grad_norm": 0.30917108392500187, "learning_rate": 1.7382648482343229e-06, "loss": 0.0193, "step": 6769 }, { "epoch": 2.2608114877275005, "grad_norm": 0.28570002063275857, "learning_rate": 1.7367922026372713e-06, "loss": 0.0172, "step": 6770 }, { "epoch": 2.261145433294373, "grad_norm": 0.31167117187193333, "learning_rate": 1.7353200499861794e-06, "loss": 0.0189, "step": 6771 }, { "epoch": 2.2614793788612455, "grad_norm": 0.22686891157216968, "learning_rate": 1.733848390503436e-06, "loss": 0.0101, "step": 6772 }, { "epoch": 2.261813324428118, "grad_norm": 0.279376728241647, "learning_rate": 1.732377224411349e-06, "loss": 0.0147, "step": 6773 }, { "epoch": 2.262147269994991, "grad_norm": 0.31891679122809297, "learning_rate": 1.7309065519321572e-06, "loss": 0.0169, "step": 6774 }, { "epoch": 2.2624812155618637, "grad_norm": 0.25239017270506375, "learning_rate": 1.729436373288025e-06, "loss": 0.0113, "step": 6775 }, { "epoch": 2.262815161128736, "grad_norm": 0.32504179795442667, "learning_rate": 1.7279666887010361e-06, "loss": 0.0132, "step": 6776 }, { "epoch": 2.2631491066956086, "grad_norm": 0.3534521670275748, "learning_rate": 1.726497498393206e-06, "loss": 0.0175, "step": 6777 }, { "epoch": 2.2634830522624814, "grad_norm": 0.3267800201637023, "learning_rate": 1.7250288025864747e-06, "loss": 0.0153, "step": 6778 }, { "epoch": 2.2638169978293536, "grad_norm": 0.345343770191701, "learning_rate": 1.7235606015027029e-06, "loss": 0.0131, "step": 6779 }, { "epoch": 2.2641509433962264, "grad_norm": 0.2891933591093856, "learning_rate": 1.7220928953636812e-06, "loss": 0.0109, "step": 6780 }, { "epoch": 2.264484888963099, "grad_norm": 0.3172121439753211, "learning_rate": 1.7206256843911252e-06, "loss": 0.0164, "step": 6781 }, { "epoch": 2.264818834529972, "grad_norm": 0.3066571591895138, "learning_rate": 1.7191589688066706e-06, "loss": 0.017, "step": 6782 }, { "epoch": 2.265152780096844, "grad_norm": 0.4642163671941375, "learning_rate": 1.7176927488318868e-06, "loss": 0.0235, "step": 6783 }, { "epoch": 2.265486725663717, "grad_norm": 0.28202389264257427, "learning_rate": 1.7162270246882595e-06, "loss": 0.0132, "step": 6784 }, { "epoch": 2.2658206712305895, "grad_norm": 0.279908610262276, "learning_rate": 1.7147617965972052e-06, "loss": 0.0129, "step": 6785 }, { "epoch": 2.266154616797462, "grad_norm": 0.22167584604368668, "learning_rate": 1.7132970647800639e-06, "loss": 0.009, "step": 6786 }, { "epoch": 2.2664885623643345, "grad_norm": 0.3382200241546776, "learning_rate": 1.7118328294581028e-06, "loss": 0.0201, "step": 6787 }, { "epoch": 2.266822507931207, "grad_norm": 0.2213767516570011, "learning_rate": 1.7103690908525072e-06, "loss": 0.007, "step": 6788 }, { "epoch": 2.26715645349808, "grad_norm": 0.36591740613831275, "learning_rate": 1.7089058491843967e-06, "loss": 0.0194, "step": 6789 }, { "epoch": 2.267490399064952, "grad_norm": 0.29368537764968344, "learning_rate": 1.7074431046748075e-06, "loss": 0.0154, "step": 6790 }, { "epoch": 2.267824344631825, "grad_norm": 0.4352427002983423, "learning_rate": 1.7059808575447057e-06, "loss": 0.0285, "step": 6791 }, { "epoch": 2.2681582901986976, "grad_norm": 0.2450375169758912, "learning_rate": 1.7045191080149815e-06, "loss": 0.0147, "step": 6792 }, { "epoch": 2.2684922357655704, "grad_norm": 0.23659982706957028, "learning_rate": 1.7030578563064504e-06, "loss": 0.0136, "step": 6793 }, { "epoch": 2.2688261813324426, "grad_norm": 0.2753970165028371, "learning_rate": 1.7015971026398487e-06, "loss": 0.0153, "step": 6794 }, { "epoch": 2.2691601268993153, "grad_norm": 0.24823421067993465, "learning_rate": 1.7001368472358442e-06, "loss": 0.0106, "step": 6795 }, { "epoch": 2.269494072466188, "grad_norm": 0.25569894707620683, "learning_rate": 1.6986770903150213e-06, "loss": 0.0098, "step": 6796 }, { "epoch": 2.269828018033061, "grad_norm": 0.26151944726558574, "learning_rate": 1.697217832097896e-06, "loss": 0.0098, "step": 6797 }, { "epoch": 2.270161963599933, "grad_norm": 0.3804194185347123, "learning_rate": 1.6957590728049078e-06, "loss": 0.0183, "step": 6798 }, { "epoch": 2.2704959091668058, "grad_norm": 0.3077685386335056, "learning_rate": 1.6943008126564164e-06, "loss": 0.015, "step": 6799 }, { "epoch": 2.2708298547336785, "grad_norm": 0.251061230961801, "learning_rate": 1.6928430518727102e-06, "loss": 0.0104, "step": 6800 }, { "epoch": 2.2711638003005508, "grad_norm": 0.2500679716397881, "learning_rate": 1.6913857906740033e-06, "loss": 0.0108, "step": 6801 }, { "epoch": 2.2714977458674235, "grad_norm": 0.2668967253339908, "learning_rate": 1.6899290292804288e-06, "loss": 0.0138, "step": 6802 }, { "epoch": 2.271831691434296, "grad_norm": 0.27352451073187406, "learning_rate": 1.6884727679120493e-06, "loss": 0.0123, "step": 6803 }, { "epoch": 2.272165637001169, "grad_norm": 0.2851665401691066, "learning_rate": 1.687017006788852e-06, "loss": 0.0133, "step": 6804 }, { "epoch": 2.2724995825680416, "grad_norm": 0.2827338506966288, "learning_rate": 1.6855617461307427e-06, "loss": 0.0179, "step": 6805 }, { "epoch": 2.272833528134914, "grad_norm": 0.25163410520849566, "learning_rate": 1.6841069861575598e-06, "loss": 0.0126, "step": 6806 }, { "epoch": 2.2731674737017866, "grad_norm": 0.2799593201961557, "learning_rate": 1.6826527270890587e-06, "loss": 0.0125, "step": 6807 }, { "epoch": 2.2735014192686593, "grad_norm": 0.36610892656306426, "learning_rate": 1.6811989691449232e-06, "loss": 0.0222, "step": 6808 }, { "epoch": 2.2738353648355316, "grad_norm": 0.29707283974348947, "learning_rate": 1.6797457125447614e-06, "loss": 0.0132, "step": 6809 }, { "epoch": 2.2741693104024043, "grad_norm": 0.2830973072756119, "learning_rate": 1.678292957508106e-06, "loss": 0.0176, "step": 6810 }, { "epoch": 2.274503255969277, "grad_norm": 0.2518703588316546, "learning_rate": 1.6768407042544093e-06, "loss": 0.013, "step": 6811 }, { "epoch": 2.2748372015361498, "grad_norm": 0.26358998186935967, "learning_rate": 1.6753889530030554e-06, "loss": 0.0108, "step": 6812 }, { "epoch": 2.275171147103022, "grad_norm": 0.29076873005784715, "learning_rate": 1.673937703973344e-06, "loss": 0.0136, "step": 6813 }, { "epoch": 2.2755050926698948, "grad_norm": 0.27140123012108447, "learning_rate": 1.6724869573845054e-06, "loss": 0.0095, "step": 6814 }, { "epoch": 2.2758390382367675, "grad_norm": 0.3248385125076713, "learning_rate": 1.6710367134556926e-06, "loss": 0.0152, "step": 6815 }, { "epoch": 2.27617298380364, "grad_norm": 0.42139279076769803, "learning_rate": 1.6695869724059827e-06, "loss": 0.0186, "step": 6816 }, { "epoch": 2.2765069293705125, "grad_norm": 0.30791526847134654, "learning_rate": 1.6681377344543737e-06, "loss": 0.0165, "step": 6817 }, { "epoch": 2.276840874937385, "grad_norm": 0.2404710251135645, "learning_rate": 1.6666889998197927e-06, "loss": 0.011, "step": 6818 }, { "epoch": 2.277174820504258, "grad_norm": 0.296729816970035, "learning_rate": 1.6652407687210853e-06, "loss": 0.0143, "step": 6819 }, { "epoch": 2.27750876607113, "grad_norm": 0.2629727000599189, "learning_rate": 1.6637930413770249e-06, "loss": 0.0102, "step": 6820 }, { "epoch": 2.277842711638003, "grad_norm": 0.30968336667798596, "learning_rate": 1.6623458180063084e-06, "loss": 0.0124, "step": 6821 }, { "epoch": 2.2781766572048756, "grad_norm": 0.23392177958027288, "learning_rate": 1.6608990988275575e-06, "loss": 0.0114, "step": 6822 }, { "epoch": 2.2785106027717483, "grad_norm": 0.26399538480166607, "learning_rate": 1.6594528840593128e-06, "loss": 0.0126, "step": 6823 }, { "epoch": 2.278844548338621, "grad_norm": 0.3126256564920955, "learning_rate": 1.6580071739200448e-06, "loss": 0.0198, "step": 6824 }, { "epoch": 2.2791784939054933, "grad_norm": 0.1980324301544163, "learning_rate": 1.6565619686281425e-06, "loss": 0.0086, "step": 6825 }, { "epoch": 2.279512439472366, "grad_norm": 0.3800839800161238, "learning_rate": 1.6551172684019224e-06, "loss": 0.0219, "step": 6826 }, { "epoch": 2.2798463850392388, "grad_norm": 0.34973357722789106, "learning_rate": 1.6536730734596257e-06, "loss": 0.0222, "step": 6827 }, { "epoch": 2.280180330606111, "grad_norm": 0.3159132113182457, "learning_rate": 1.652229384019411e-06, "loss": 0.0142, "step": 6828 }, { "epoch": 2.2805142761729837, "grad_norm": 0.28538657841798853, "learning_rate": 1.650786200299368e-06, "loss": 0.0143, "step": 6829 }, { "epoch": 2.2808482217398565, "grad_norm": 0.383727581012012, "learning_rate": 1.6493435225175042e-06, "loss": 0.0165, "step": 6830 }, { "epoch": 2.281182167306729, "grad_norm": 0.2018080194084335, "learning_rate": 1.6479013508917552e-06, "loss": 0.0079, "step": 6831 }, { "epoch": 2.2815161128736015, "grad_norm": 0.2674727772870895, "learning_rate": 1.6464596856399734e-06, "loss": 0.0176, "step": 6832 }, { "epoch": 2.281850058440474, "grad_norm": 0.23790251702327064, "learning_rate": 1.6450185269799462e-06, "loss": 0.0101, "step": 6833 }, { "epoch": 2.282184004007347, "grad_norm": 0.2611975481305445, "learning_rate": 1.6435778751293723e-06, "loss": 0.0138, "step": 6834 }, { "epoch": 2.2825179495742196, "grad_norm": 0.20370042032538926, "learning_rate": 1.6421377303058829e-06, "loss": 0.0103, "step": 6835 }, { "epoch": 2.282851895141092, "grad_norm": 0.32989165343685595, "learning_rate": 1.640698092727025e-06, "loss": 0.0115, "step": 6836 }, { "epoch": 2.2831858407079646, "grad_norm": 0.2379040986628446, "learning_rate": 1.639258962610275e-06, "loss": 0.0106, "step": 6837 }, { "epoch": 2.2835197862748373, "grad_norm": 0.2680402438125095, "learning_rate": 1.6378203401730303e-06, "loss": 0.015, "step": 6838 }, { "epoch": 2.2838537318417096, "grad_norm": 0.32978647443444303, "learning_rate": 1.6363822256326128e-06, "loss": 0.0162, "step": 6839 }, { "epoch": 2.2841876774085823, "grad_norm": 0.32164075396309677, "learning_rate": 1.6349446192062635e-06, "loss": 0.0152, "step": 6840 }, { "epoch": 2.284521622975455, "grad_norm": 0.3271033440192926, "learning_rate": 1.633507521111154e-06, "loss": 0.0129, "step": 6841 }, { "epoch": 2.2848555685423277, "grad_norm": 0.2508946834293806, "learning_rate": 1.6320709315643708e-06, "loss": 0.014, "step": 6842 }, { "epoch": 2.2851895141092, "grad_norm": 0.2986067958825717, "learning_rate": 1.6306348507829294e-06, "loss": 0.0171, "step": 6843 }, { "epoch": 2.2855234596760727, "grad_norm": 0.31711981611601764, "learning_rate": 1.6291992789837669e-06, "loss": 0.0142, "step": 6844 }, { "epoch": 2.2858574052429455, "grad_norm": 0.3179418395778233, "learning_rate": 1.6277642163837444e-06, "loss": 0.0162, "step": 6845 }, { "epoch": 2.286191350809818, "grad_norm": 0.27948032372466824, "learning_rate": 1.6263296631996422e-06, "loss": 0.0126, "step": 6846 }, { "epoch": 2.2865252963766904, "grad_norm": 0.2152095919458289, "learning_rate": 1.6248956196481701e-06, "loss": 0.0107, "step": 6847 }, { "epoch": 2.286859241943563, "grad_norm": 0.3129421993487875, "learning_rate": 1.6234620859459537e-06, "loss": 0.0187, "step": 6848 }, { "epoch": 2.287193187510436, "grad_norm": 0.29263364992986585, "learning_rate": 1.6220290623095463e-06, "loss": 0.0124, "step": 6849 }, { "epoch": 2.287527133077308, "grad_norm": 0.3180497394134938, "learning_rate": 1.6205965489554248e-06, "loss": 0.0168, "step": 6850 }, { "epoch": 2.287861078644181, "grad_norm": 0.32067412381710797, "learning_rate": 1.619164546099985e-06, "loss": 0.0253, "step": 6851 }, { "epoch": 2.2881950242110536, "grad_norm": 0.23473276148048866, "learning_rate": 1.6177330539595493e-06, "loss": 0.01, "step": 6852 }, { "epoch": 2.2885289697779263, "grad_norm": 0.2160113689800733, "learning_rate": 1.6163020727503592e-06, "loss": 0.0112, "step": 6853 }, { "epoch": 2.288862915344799, "grad_norm": 0.3801212896786191, "learning_rate": 1.6148716026885847e-06, "loss": 0.0218, "step": 6854 }, { "epoch": 2.2891968609116713, "grad_norm": 0.3801251650044196, "learning_rate": 1.61344164399031e-06, "loss": 0.0207, "step": 6855 }, { "epoch": 2.289530806478544, "grad_norm": 0.25983595677989696, "learning_rate": 1.6120121968715535e-06, "loss": 0.0102, "step": 6856 }, { "epoch": 2.2898647520454167, "grad_norm": 0.2972474075437074, "learning_rate": 1.6105832615482453e-06, "loss": 0.0161, "step": 6857 }, { "epoch": 2.290198697612289, "grad_norm": 0.2326329260414521, "learning_rate": 1.609154838236246e-06, "loss": 0.0117, "step": 6858 }, { "epoch": 2.2905326431791617, "grad_norm": 0.2507480605395413, "learning_rate": 1.6077269271513328e-06, "loss": 0.0125, "step": 6859 }, { "epoch": 2.2908665887460344, "grad_norm": 0.290488371092609, "learning_rate": 1.606299528509212e-06, "loss": 0.0175, "step": 6860 }, { "epoch": 2.291200534312907, "grad_norm": 0.2883163507713603, "learning_rate": 1.604872642525503e-06, "loss": 0.0111, "step": 6861 }, { "epoch": 2.2915344798797794, "grad_norm": 0.2844972972797543, "learning_rate": 1.6034462694157615e-06, "loss": 0.0174, "step": 6862 }, { "epoch": 2.291868425446652, "grad_norm": 0.33202652174069175, "learning_rate": 1.6020204093954523e-06, "loss": 0.0163, "step": 6863 }, { "epoch": 2.292202371013525, "grad_norm": 0.2275984281278442, "learning_rate": 1.6005950626799716e-06, "loss": 0.012, "step": 6864 }, { "epoch": 2.2925363165803976, "grad_norm": 0.3253942305308068, "learning_rate": 1.5991702294846318e-06, "loss": 0.0127, "step": 6865 }, { "epoch": 2.29287026214727, "grad_norm": 0.3488086489966652, "learning_rate": 1.597745910024674e-06, "loss": 0.0189, "step": 6866 }, { "epoch": 2.2932042077141426, "grad_norm": 0.317684008652152, "learning_rate": 1.5963221045152537e-06, "loss": 0.0149, "step": 6867 }, { "epoch": 2.2935381532810153, "grad_norm": 0.28305997131249266, "learning_rate": 1.5948988131714594e-06, "loss": 0.0133, "step": 6868 }, { "epoch": 2.2938720988478876, "grad_norm": 0.291190951166451, "learning_rate": 1.593476036208292e-06, "loss": 0.0108, "step": 6869 }, { "epoch": 2.2942060444147603, "grad_norm": 0.33410447493705175, "learning_rate": 1.5920537738406811e-06, "loss": 0.015, "step": 6870 }, { "epoch": 2.294539989981633, "grad_norm": 0.29687307023421855, "learning_rate": 1.5906320262834735e-06, "loss": 0.0082, "step": 6871 }, { "epoch": 2.2948739355485057, "grad_norm": 0.3159385432404704, "learning_rate": 1.5892107937514424e-06, "loss": 0.0194, "step": 6872 }, { "epoch": 2.2952078811153784, "grad_norm": 0.2783235833559468, "learning_rate": 1.587790076459283e-06, "loss": 0.0116, "step": 6873 }, { "epoch": 2.2955418266822507, "grad_norm": 0.2997547526346615, "learning_rate": 1.5863698746216082e-06, "loss": 0.0137, "step": 6874 }, { "epoch": 2.2958757722491234, "grad_norm": 0.31551977953032, "learning_rate": 1.58495018845296e-06, "loss": 0.014, "step": 6875 }, { "epoch": 2.296209717815996, "grad_norm": 0.2157668011951437, "learning_rate": 1.5835310181677954e-06, "loss": 0.0127, "step": 6876 }, { "epoch": 2.2965436633828684, "grad_norm": 0.29478318697081235, "learning_rate": 1.5821123639804992e-06, "loss": 0.0133, "step": 6877 }, { "epoch": 2.296877608949741, "grad_norm": 0.22043845791969274, "learning_rate": 1.5806942261053715e-06, "loss": 0.0098, "step": 6878 }, { "epoch": 2.297211554516614, "grad_norm": 0.27271994474637484, "learning_rate": 1.5792766047566455e-06, "loss": 0.0134, "step": 6879 }, { "epoch": 2.2975455000834866, "grad_norm": 0.24570288637658616, "learning_rate": 1.5778595001484648e-06, "loss": 0.0119, "step": 6880 }, { "epoch": 2.297879445650359, "grad_norm": 0.4306132286684389, "learning_rate": 1.5764429124949022e-06, "loss": 0.0267, "step": 6881 }, { "epoch": 2.2982133912172316, "grad_norm": 0.2673610459356278, "learning_rate": 1.5750268420099468e-06, "loss": 0.011, "step": 6882 }, { "epoch": 2.2985473367841043, "grad_norm": 0.29930055585529564, "learning_rate": 1.5736112889075167e-06, "loss": 0.0128, "step": 6883 }, { "epoch": 2.298881282350977, "grad_norm": 0.27314842131745765, "learning_rate": 1.5721962534014424e-06, "loss": 0.0148, "step": 6884 }, { "epoch": 2.2992152279178493, "grad_norm": 0.2539822566919215, "learning_rate": 1.5707817357054882e-06, "loss": 0.0112, "step": 6885 }, { "epoch": 2.299549173484722, "grad_norm": 0.3423133751705591, "learning_rate": 1.5693677360333293e-06, "loss": 0.0117, "step": 6886 }, { "epoch": 2.2998831190515947, "grad_norm": 0.2775115327127339, "learning_rate": 1.56795425459857e-06, "loss": 0.0119, "step": 6887 }, { "epoch": 2.300217064618467, "grad_norm": 0.3363374162676071, "learning_rate": 1.5665412916147298e-06, "loss": 0.018, "step": 6888 }, { "epoch": 2.3005510101853397, "grad_norm": 0.23109839671365773, "learning_rate": 1.5651288472952564e-06, "loss": 0.011, "step": 6889 }, { "epoch": 2.3008849557522124, "grad_norm": 0.39509648574155504, "learning_rate": 1.563716921853512e-06, "loss": 0.0196, "step": 6890 }, { "epoch": 2.301218901319085, "grad_norm": 0.30126459375211606, "learning_rate": 1.562305515502791e-06, "loss": 0.0143, "step": 6891 }, { "epoch": 2.3015528468859574, "grad_norm": 0.2832106918828107, "learning_rate": 1.5608946284562977e-06, "loss": 0.0141, "step": 6892 }, { "epoch": 2.30188679245283, "grad_norm": 0.33834053618681464, "learning_rate": 1.559484260927166e-06, "loss": 0.0184, "step": 6893 }, { "epoch": 2.302220738019703, "grad_norm": 0.32368834968773247, "learning_rate": 1.5580744131284464e-06, "loss": 0.0188, "step": 6894 }, { "epoch": 2.3025546835865756, "grad_norm": 0.2713427793874071, "learning_rate": 1.5566650852731151e-06, "loss": 0.0082, "step": 6895 }, { "epoch": 2.302888629153448, "grad_norm": 0.19785854455069538, "learning_rate": 1.5552562775740654e-06, "loss": 0.0062, "step": 6896 }, { "epoch": 2.3032225747203205, "grad_norm": 0.3499223117876873, "learning_rate": 1.5538479902441156e-06, "loss": 0.023, "step": 6897 }, { "epoch": 2.3035565202871933, "grad_norm": 0.35008911224844896, "learning_rate": 1.5524402234960056e-06, "loss": 0.0168, "step": 6898 }, { "epoch": 2.3038904658540655, "grad_norm": 0.3280570545798395, "learning_rate": 1.5510329775423916e-06, "loss": 0.0188, "step": 6899 }, { "epoch": 2.3042244114209383, "grad_norm": 0.309890201802014, "learning_rate": 1.5496262525958583e-06, "loss": 0.0129, "step": 6900 }, { "epoch": 2.304558356987811, "grad_norm": 0.22455359922836288, "learning_rate": 1.5482200488689054e-06, "loss": 0.0091, "step": 6901 }, { "epoch": 2.3048923025546837, "grad_norm": 0.359645016511295, "learning_rate": 1.5468143665739565e-06, "loss": 0.0131, "step": 6902 }, { "epoch": 2.3052262481215564, "grad_norm": 0.3260309762792919, "learning_rate": 1.5454092059233583e-06, "loss": 0.0147, "step": 6903 }, { "epoch": 2.3055601936884287, "grad_norm": 0.28768781892352635, "learning_rate": 1.5440045671293774e-06, "loss": 0.0109, "step": 6904 }, { "epoch": 2.3058941392553014, "grad_norm": 0.26557102939232013, "learning_rate": 1.542600450404198e-06, "loss": 0.0147, "step": 6905 }, { "epoch": 2.306228084822174, "grad_norm": 0.3395811271144134, "learning_rate": 1.5411968559599317e-06, "loss": 0.0187, "step": 6906 }, { "epoch": 2.3065620303890464, "grad_norm": 0.3522346805217385, "learning_rate": 1.5397937840086048e-06, "loss": 0.0198, "step": 6907 }, { "epoch": 2.306895975955919, "grad_norm": 0.22118044128481698, "learning_rate": 1.5383912347621693e-06, "loss": 0.0093, "step": 6908 }, { "epoch": 2.307229921522792, "grad_norm": 0.3118623337917481, "learning_rate": 1.5369892084324972e-06, "loss": 0.0112, "step": 6909 }, { "epoch": 2.3075638670896645, "grad_norm": 0.2884975298254704, "learning_rate": 1.5355877052313822e-06, "loss": 0.0161, "step": 6910 }, { "epoch": 2.307897812656537, "grad_norm": 0.28911069870718353, "learning_rate": 1.534186725370535e-06, "loss": 0.0129, "step": 6911 }, { "epoch": 2.3082317582234095, "grad_norm": 0.31201033265750977, "learning_rate": 1.532786269061593e-06, "loss": 0.0115, "step": 6912 }, { "epoch": 2.3085657037902823, "grad_norm": 0.3165311675238296, "learning_rate": 1.531386336516107e-06, "loss": 0.015, "step": 6913 }, { "epoch": 2.308899649357155, "grad_norm": 0.29782598920207437, "learning_rate": 1.52998692794556e-06, "loss": 0.0091, "step": 6914 }, { "epoch": 2.3092335949240272, "grad_norm": 0.2584111156420597, "learning_rate": 1.5285880435613438e-06, "loss": 0.0105, "step": 6915 }, { "epoch": 2.3095675404909, "grad_norm": 0.3932468032770264, "learning_rate": 1.5271896835747795e-06, "loss": 0.0256, "step": 6916 }, { "epoch": 2.3099014860577727, "grad_norm": 0.2470975333731172, "learning_rate": 1.5257918481971028e-06, "loss": 0.0131, "step": 6917 }, { "epoch": 2.310235431624645, "grad_norm": 0.24886696312141146, "learning_rate": 1.524394537639477e-06, "loss": 0.0097, "step": 6918 }, { "epoch": 2.3105693771915177, "grad_norm": 0.27187073985292953, "learning_rate": 1.5229977521129785e-06, "loss": 0.0129, "step": 6919 }, { "epoch": 2.3109033227583904, "grad_norm": 0.3119106316207161, "learning_rate": 1.5216014918286097e-06, "loss": 0.0142, "step": 6920 }, { "epoch": 2.311237268325263, "grad_norm": 0.3043107936059499, "learning_rate": 1.5202057569972945e-06, "loss": 0.015, "step": 6921 }, { "epoch": 2.311571213892136, "grad_norm": 0.24181495314185136, "learning_rate": 1.518810547829871e-06, "loss": 0.0079, "step": 6922 }, { "epoch": 2.311905159459008, "grad_norm": 0.379917378957993, "learning_rate": 1.517415864537105e-06, "loss": 0.0153, "step": 6923 }, { "epoch": 2.312239105025881, "grad_norm": 0.38406960085102787, "learning_rate": 1.516021707329678e-06, "loss": 0.02, "step": 6924 }, { "epoch": 2.3125730505927535, "grad_norm": 0.2261195871270627, "learning_rate": 1.5146280764181942e-06, "loss": 0.014, "step": 6925 }, { "epoch": 2.312906996159626, "grad_norm": 0.3532238162584895, "learning_rate": 1.5132349720131783e-06, "loss": 0.0216, "step": 6926 }, { "epoch": 2.3132409417264985, "grad_norm": 0.31298472699062174, "learning_rate": 1.511842394325077e-06, "loss": 0.0113, "step": 6927 }, { "epoch": 2.3135748872933712, "grad_norm": 0.3177922138619716, "learning_rate": 1.5104503435642526e-06, "loss": 0.0156, "step": 6928 }, { "epoch": 2.313908832860244, "grad_norm": 0.2768380147428541, "learning_rate": 1.5090588199409927e-06, "loss": 0.0137, "step": 6929 }, { "epoch": 2.3142427784271162, "grad_norm": 0.33564312248612466, "learning_rate": 1.5076678236655018e-06, "loss": 0.0191, "step": 6930 }, { "epoch": 2.314576723993989, "grad_norm": 0.3354268527242691, "learning_rate": 1.5062773549479064e-06, "loss": 0.0173, "step": 6931 }, { "epoch": 2.3149106695608617, "grad_norm": 0.25080284224765265, "learning_rate": 1.504887413998254e-06, "loss": 0.0124, "step": 6932 }, { "epoch": 2.3152446151277344, "grad_norm": 0.28694018057723303, "learning_rate": 1.5034980010265127e-06, "loss": 0.0124, "step": 6933 }, { "epoch": 2.3155785606946067, "grad_norm": 0.3303089047311808, "learning_rate": 1.5021091162425672e-06, "loss": 0.0141, "step": 6934 }, { "epoch": 2.3159125062614794, "grad_norm": 0.27309001326669086, "learning_rate": 1.5007207598562268e-06, "loss": 0.0175, "step": 6935 }, { "epoch": 2.316246451828352, "grad_norm": 0.3623573581505953, "learning_rate": 1.4993329320772177e-06, "loss": 0.0245, "step": 6936 }, { "epoch": 2.3165803973952244, "grad_norm": 0.27206488443658156, "learning_rate": 1.4979456331151875e-06, "loss": 0.0176, "step": 6937 }, { "epoch": 2.316914342962097, "grad_norm": 0.561266236606752, "learning_rate": 1.4965588631797052e-06, "loss": 0.0149, "step": 6938 }, { "epoch": 2.31724828852897, "grad_norm": 0.23942318099071092, "learning_rate": 1.4951726224802593e-06, "loss": 0.011, "step": 6939 }, { "epoch": 2.3175822340958425, "grad_norm": 0.251998175067427, "learning_rate": 1.493786911226256e-06, "loss": 0.0179, "step": 6940 }, { "epoch": 2.317916179662715, "grad_norm": 0.276215128862187, "learning_rate": 1.492401729627025e-06, "loss": 0.0116, "step": 6941 }, { "epoch": 2.3182501252295875, "grad_norm": 0.2651607229741079, "learning_rate": 1.491017077891812e-06, "loss": 0.0148, "step": 6942 }, { "epoch": 2.3185840707964602, "grad_norm": 0.25248514184534754, "learning_rate": 1.4896329562297863e-06, "loss": 0.009, "step": 6943 }, { "epoch": 2.318918016363333, "grad_norm": 0.2548273018450235, "learning_rate": 1.4882493648500373e-06, "loss": 0.0098, "step": 6944 }, { "epoch": 2.319251961930205, "grad_norm": 0.3088948076685334, "learning_rate": 1.48686630396157e-06, "loss": 0.0174, "step": 6945 }, { "epoch": 2.319585907497078, "grad_norm": 0.30576510424073655, "learning_rate": 1.4854837737733147e-06, "loss": 0.0206, "step": 6946 }, { "epoch": 2.3199198530639507, "grad_norm": 0.2979807996026274, "learning_rate": 1.484101774494116e-06, "loss": 0.0148, "step": 6947 }, { "epoch": 2.320253798630823, "grad_norm": 0.2565580544831514, "learning_rate": 1.4827203063327427e-06, "loss": 0.0136, "step": 6948 }, { "epoch": 2.3205877441976956, "grad_norm": 0.2731189156615786, "learning_rate": 1.4813393694978812e-06, "loss": 0.0136, "step": 6949 }, { "epoch": 2.3209216897645684, "grad_norm": 0.2849877677152315, "learning_rate": 1.479958964198141e-06, "loss": 0.0126, "step": 6950 }, { "epoch": 2.321255635331441, "grad_norm": 0.42411171708168294, "learning_rate": 1.4785790906420445e-06, "loss": 0.0188, "step": 6951 }, { "epoch": 2.321589580898314, "grad_norm": 0.2441349390152868, "learning_rate": 1.4771997490380414e-06, "loss": 0.0154, "step": 6952 }, { "epoch": 2.321923526465186, "grad_norm": 0.33877273899911525, "learning_rate": 1.4758209395944945e-06, "loss": 0.0202, "step": 6953 }, { "epoch": 2.322257472032059, "grad_norm": 0.32485886612406134, "learning_rate": 1.47444266251969e-06, "loss": 0.0084, "step": 6954 }, { "epoch": 2.3225914175989315, "grad_norm": 0.3386213394227128, "learning_rate": 1.4730649180218337e-06, "loss": 0.0166, "step": 6955 }, { "epoch": 2.322925363165804, "grad_norm": 0.22010222064052062, "learning_rate": 1.4716877063090517e-06, "loss": 0.0092, "step": 6956 }, { "epoch": 2.3232593087326765, "grad_norm": 0.2629499196798745, "learning_rate": 1.4703110275893846e-06, "loss": 0.0156, "step": 6957 }, { "epoch": 2.323593254299549, "grad_norm": 0.2552628821030842, "learning_rate": 1.4689348820707988e-06, "loss": 0.0125, "step": 6958 }, { "epoch": 2.323927199866422, "grad_norm": 0.265968468931718, "learning_rate": 1.4675592699611741e-06, "loss": 0.0094, "step": 6959 }, { "epoch": 2.324261145433294, "grad_norm": 0.35108506391580413, "learning_rate": 1.4661841914683156e-06, "loss": 0.0233, "step": 6960 }, { "epoch": 2.324595091000167, "grad_norm": 0.28573449329162687, "learning_rate": 1.464809646799944e-06, "loss": 0.0137, "step": 6961 }, { "epoch": 2.3249290365670396, "grad_norm": 0.24620526100717358, "learning_rate": 1.463435636163702e-06, "loss": 0.0189, "step": 6962 }, { "epoch": 2.3252629821339124, "grad_norm": 0.28282179128012264, "learning_rate": 1.4620621597671476e-06, "loss": 0.0167, "step": 6963 }, { "epoch": 2.3255969277007846, "grad_norm": 0.28613556450697475, "learning_rate": 1.4606892178177633e-06, "loss": 0.0156, "step": 6964 }, { "epoch": 2.3259308732676574, "grad_norm": 0.24834290335390774, "learning_rate": 1.459316810522945e-06, "loss": 0.0124, "step": 6965 }, { "epoch": 2.32626481883453, "grad_norm": 0.3049774665147699, "learning_rate": 1.457944938090013e-06, "loss": 0.0162, "step": 6966 }, { "epoch": 2.3265987644014023, "grad_norm": 0.29986365571686485, "learning_rate": 1.456573600726206e-06, "loss": 0.017, "step": 6967 }, { "epoch": 2.326932709968275, "grad_norm": 0.28644245612453273, "learning_rate": 1.4552027986386775e-06, "loss": 0.0125, "step": 6968 }, { "epoch": 2.3272666555351478, "grad_norm": 0.28840902391683015, "learning_rate": 1.453832532034506e-06, "loss": 0.024, "step": 6969 }, { "epoch": 2.3276006011020205, "grad_norm": 0.28433771644705647, "learning_rate": 1.4524628011206843e-06, "loss": 0.0156, "step": 6970 }, { "epoch": 2.327934546668893, "grad_norm": 0.38140193932310595, "learning_rate": 1.4510936061041269e-06, "loss": 0.0199, "step": 6971 }, { "epoch": 2.3282684922357655, "grad_norm": 0.19225305041517793, "learning_rate": 1.449724947191668e-06, "loss": 0.0061, "step": 6972 }, { "epoch": 2.328602437802638, "grad_norm": 0.28570383362481183, "learning_rate": 1.4483568245900597e-06, "loss": 0.0154, "step": 6973 }, { "epoch": 2.328936383369511, "grad_norm": 0.2903318790132737, "learning_rate": 1.4469892385059713e-06, "loss": 0.0136, "step": 6974 }, { "epoch": 2.329270328936383, "grad_norm": 0.2538408576499128, "learning_rate": 1.4456221891459953e-06, "loss": 0.0093, "step": 6975 }, { "epoch": 2.329604274503256, "grad_norm": 0.2874971317361385, "learning_rate": 1.4442556767166371e-06, "loss": 0.0131, "step": 6976 }, { "epoch": 2.3299382200701286, "grad_norm": 0.24425712032382957, "learning_rate": 1.4428897014243288e-06, "loss": 0.0109, "step": 6977 }, { "epoch": 2.3302721656370013, "grad_norm": 0.25296973405459755, "learning_rate": 1.4415242634754107e-06, "loss": 0.0123, "step": 6978 }, { "epoch": 2.3306061112038736, "grad_norm": 0.2320438547831141, "learning_rate": 1.4401593630761562e-06, "loss": 0.012, "step": 6979 }, { "epoch": 2.3309400567707463, "grad_norm": 0.3235855326462055, "learning_rate": 1.4387950004327434e-06, "loss": 0.013, "step": 6980 }, { "epoch": 2.331274002337619, "grad_norm": 0.24662402204525793, "learning_rate": 1.4374311757512798e-06, "loss": 0.0116, "step": 6981 }, { "epoch": 2.3316079479044918, "grad_norm": 0.33864863398546513, "learning_rate": 1.4360678892377833e-06, "loss": 0.0161, "step": 6982 }, { "epoch": 2.331941893471364, "grad_norm": 0.3952467987766312, "learning_rate": 1.434705141098197e-06, "loss": 0.0213, "step": 6983 }, { "epoch": 2.3322758390382368, "grad_norm": 0.2700400125085471, "learning_rate": 1.4333429315383768e-06, "loss": 0.0147, "step": 6984 }, { "epoch": 2.3326097846051095, "grad_norm": 0.30636721537276057, "learning_rate": 1.4319812607641055e-06, "loss": 0.0135, "step": 6985 }, { "epoch": 2.3329437301719818, "grad_norm": 0.29262149370274226, "learning_rate": 1.4306201289810756e-06, "loss": 0.016, "step": 6986 }, { "epoch": 2.3332776757388545, "grad_norm": 0.30306257392613584, "learning_rate": 1.4292595363949047e-06, "loss": 0.0155, "step": 6987 }, { "epoch": 2.333611621305727, "grad_norm": 0.2660978556325702, "learning_rate": 1.4278994832111232e-06, "loss": 0.0127, "step": 6988 }, { "epoch": 2.3339455668726, "grad_norm": 0.2369096043320216, "learning_rate": 1.4265399696351867e-06, "loss": 0.0094, "step": 6989 }, { "epoch": 2.334279512439472, "grad_norm": 0.29684526715594656, "learning_rate": 1.4251809958724623e-06, "loss": 0.0151, "step": 6990 }, { "epoch": 2.334613458006345, "grad_norm": 0.26183653131005824, "learning_rate": 1.4238225621282403e-06, "loss": 0.0129, "step": 6991 }, { "epoch": 2.3349474035732176, "grad_norm": 0.2715118542480326, "learning_rate": 1.4224646686077303e-06, "loss": 0.0119, "step": 6992 }, { "epoch": 2.3352813491400903, "grad_norm": 0.28918724818389474, "learning_rate": 1.4211073155160544e-06, "loss": 0.019, "step": 6993 }, { "epoch": 2.3356152947069626, "grad_norm": 0.28190471982389226, "learning_rate": 1.4197505030582588e-06, "loss": 0.0139, "step": 6994 }, { "epoch": 2.3359492402738353, "grad_norm": 0.23096099581309007, "learning_rate": 1.4183942314393056e-06, "loss": 0.0118, "step": 6995 }, { "epoch": 2.336283185840708, "grad_norm": 0.24940806167635468, "learning_rate": 1.4170385008640774e-06, "loss": 0.0108, "step": 6996 }, { "epoch": 2.3366171314075803, "grad_norm": 0.2900515861486615, "learning_rate": 1.4156833115373702e-06, "loss": 0.0096, "step": 6997 }, { "epoch": 2.336951076974453, "grad_norm": 0.29814869780191827, "learning_rate": 1.4143286636639043e-06, "loss": 0.0143, "step": 6998 }, { "epoch": 2.3372850225413258, "grad_norm": 0.1977826268024735, "learning_rate": 1.4129745574483123e-06, "loss": 0.0097, "step": 6999 }, { "epoch": 2.3376189681081985, "grad_norm": 0.31898545352338264, "learning_rate": 1.4116209930951508e-06, "loss": 0.0104, "step": 7000 }, { "epoch": 2.337952913675071, "grad_norm": 0.28063789586595905, "learning_rate": 1.4102679708088867e-06, "loss": 0.0127, "step": 7001 }, { "epoch": 2.3382868592419435, "grad_norm": 0.20999288065604801, "learning_rate": 1.4089154907939162e-06, "loss": 0.0108, "step": 7002 }, { "epoch": 2.338620804808816, "grad_norm": 0.41120695251909045, "learning_rate": 1.4075635532545435e-06, "loss": 0.0164, "step": 7003 }, { "epoch": 2.338954750375689, "grad_norm": 0.30117205030740735, "learning_rate": 1.4062121583949967e-06, "loss": 0.0146, "step": 7004 }, { "epoch": 2.339288695942561, "grad_norm": 0.3122728919338953, "learning_rate": 1.4048613064194178e-06, "loss": 0.0194, "step": 7005 }, { "epoch": 2.339622641509434, "grad_norm": 0.23533936551158222, "learning_rate": 1.4035109975318712e-06, "loss": 0.012, "step": 7006 }, { "epoch": 2.3399565870763066, "grad_norm": 0.29820578850033036, "learning_rate": 1.4021612319363326e-06, "loss": 0.0137, "step": 7007 }, { "epoch": 2.3402905326431793, "grad_norm": 0.2877623873664818, "learning_rate": 1.4008120098367062e-06, "loss": 0.0205, "step": 7008 }, { "epoch": 2.3406244782100516, "grad_norm": 0.2736968095086382, "learning_rate": 1.3994633314368034e-06, "loss": 0.013, "step": 7009 }, { "epoch": 2.3409584237769243, "grad_norm": 0.2743224683523854, "learning_rate": 1.3981151969403606e-06, "loss": 0.0099, "step": 7010 }, { "epoch": 2.341292369343797, "grad_norm": 0.29674448270865644, "learning_rate": 1.3967676065510266e-06, "loss": 0.0121, "step": 7011 }, { "epoch": 2.3416263149106697, "grad_norm": 0.24211897585212844, "learning_rate": 1.3954205604723742e-06, "loss": 0.0076, "step": 7012 }, { "epoch": 2.341960260477542, "grad_norm": 0.25201915901381633, "learning_rate": 1.3940740589078872e-06, "loss": 0.0101, "step": 7013 }, { "epoch": 2.3422942060444147, "grad_norm": 0.3697045818940729, "learning_rate": 1.3927281020609712e-06, "loss": 0.0332, "step": 7014 }, { "epoch": 2.3426281516112875, "grad_norm": 0.34153673187121747, "learning_rate": 1.391382690134952e-06, "loss": 0.0204, "step": 7015 }, { "epoch": 2.3429620971781597, "grad_norm": 0.28501533535633566, "learning_rate": 1.3900378233330658e-06, "loss": 0.0119, "step": 7016 }, { "epoch": 2.3432960427450324, "grad_norm": 0.346380977016863, "learning_rate": 1.3886935018584719e-06, "loss": 0.0205, "step": 7017 }, { "epoch": 2.343629988311905, "grad_norm": 0.295754362676055, "learning_rate": 1.3873497259142483e-06, "loss": 0.0129, "step": 7018 }, { "epoch": 2.343963933878778, "grad_norm": 0.35969859400328646, "learning_rate": 1.3860064957033847e-06, "loss": 0.0171, "step": 7019 }, { "epoch": 2.3442978794456506, "grad_norm": 0.27965472909276673, "learning_rate": 1.384663811428793e-06, "loss": 0.0132, "step": 7020 }, { "epoch": 2.344631825012523, "grad_norm": 0.40781867550159484, "learning_rate": 1.3833216732933035e-06, "loss": 0.0145, "step": 7021 }, { "epoch": 2.3449657705793956, "grad_norm": 0.2365689774849031, "learning_rate": 1.3819800814996587e-06, "loss": 0.0099, "step": 7022 }, { "epoch": 2.3452997161462683, "grad_norm": 0.3094593364384856, "learning_rate": 1.3806390362505251e-06, "loss": 0.0187, "step": 7023 }, { "epoch": 2.3456336617131406, "grad_norm": 0.24395158215992865, "learning_rate": 1.3792985377484796e-06, "loss": 0.0102, "step": 7024 }, { "epoch": 2.3459676072800133, "grad_norm": 0.2884488668419262, "learning_rate": 1.3779585861960226e-06, "loss": 0.0131, "step": 7025 }, { "epoch": 2.346301552846886, "grad_norm": 0.27578421765993216, "learning_rate": 1.3766191817955699e-06, "loss": 0.011, "step": 7026 }, { "epoch": 2.3466354984137587, "grad_norm": 0.3523686929773811, "learning_rate": 1.3752803247494545e-06, "loss": 0.0165, "step": 7027 }, { "epoch": 2.346969443980631, "grad_norm": 0.2952555179298213, "learning_rate": 1.3739420152599247e-06, "loss": 0.018, "step": 7028 }, { "epoch": 2.3473033895475037, "grad_norm": 0.20665887221493115, "learning_rate": 1.37260425352915e-06, "loss": 0.0082, "step": 7029 }, { "epoch": 2.3476373351143764, "grad_norm": 0.19721715321930855, "learning_rate": 1.3712670397592127e-06, "loss": 0.0085, "step": 7030 }, { "epoch": 2.347971280681249, "grad_norm": 0.25467914543662806, "learning_rate": 1.3699303741521158e-06, "loss": 0.0135, "step": 7031 }, { "epoch": 2.3483052262481214, "grad_norm": 0.38590963097806746, "learning_rate": 1.3685942569097793e-06, "loss": 0.0272, "step": 7032 }, { "epoch": 2.348639171814994, "grad_norm": 0.26568628385319204, "learning_rate": 1.3672586882340393e-06, "loss": 0.0121, "step": 7033 }, { "epoch": 2.348973117381867, "grad_norm": 0.3321610478768502, "learning_rate": 1.3659236683266475e-06, "loss": 0.0138, "step": 7034 }, { "epoch": 2.349307062948739, "grad_norm": 0.25198850580372606, "learning_rate": 1.3645891973892772e-06, "loss": 0.0139, "step": 7035 }, { "epoch": 2.349641008515612, "grad_norm": 0.27852847233661115, "learning_rate": 1.3632552756235124e-06, "loss": 0.0127, "step": 7036 }, { "epoch": 2.3499749540824846, "grad_norm": 0.286362480694555, "learning_rate": 1.3619219032308594e-06, "loss": 0.0137, "step": 7037 }, { "epoch": 2.3503088996493573, "grad_norm": 0.31169443623660936, "learning_rate": 1.3605890804127415e-06, "loss": 0.0195, "step": 7038 }, { "epoch": 2.3506428452162296, "grad_norm": 0.25787425292668803, "learning_rate": 1.3592568073704943e-06, "loss": 0.0147, "step": 7039 }, { "epoch": 2.3509767907831023, "grad_norm": 0.26326282020932457, "learning_rate": 1.3579250843053747e-06, "loss": 0.0125, "step": 7040 }, { "epoch": 2.351310736349975, "grad_norm": 0.2838348168143694, "learning_rate": 1.3565939114185568e-06, "loss": 0.0155, "step": 7041 }, { "epoch": 2.3516446819168477, "grad_norm": 0.3433587724559374, "learning_rate": 1.3552632889111266e-06, "loss": 0.0127, "step": 7042 }, { "epoch": 2.35197862748372, "grad_norm": 0.2779079478718898, "learning_rate": 1.3539332169840918e-06, "loss": 0.0143, "step": 7043 }, { "epoch": 2.3523125730505927, "grad_norm": 0.2840763118001524, "learning_rate": 1.3526036958383777e-06, "loss": 0.0109, "step": 7044 }, { "epoch": 2.3526465186174654, "grad_norm": 0.2691719799676859, "learning_rate": 1.35127472567482e-06, "loss": 0.0111, "step": 7045 }, { "epoch": 2.3529804641843377, "grad_norm": 0.24783968197198344, "learning_rate": 1.3499463066941787e-06, "loss": 0.0095, "step": 7046 }, { "epoch": 2.3533144097512104, "grad_norm": 0.2499373106705042, "learning_rate": 1.3486184390971246e-06, "loss": 0.012, "step": 7047 }, { "epoch": 2.353648355318083, "grad_norm": 0.3529376993851651, "learning_rate": 1.347291123084249e-06, "loss": 0.0257, "step": 7048 }, { "epoch": 2.353982300884956, "grad_norm": 0.29267545149309965, "learning_rate": 1.3459643588560583e-06, "loss": 0.0185, "step": 7049 }, { "epoch": 2.3543162464518286, "grad_norm": 0.23077744975338793, "learning_rate": 1.3446381466129777e-06, "loss": 0.0105, "step": 7050 }, { "epoch": 2.354650192018701, "grad_norm": 0.33530589063300237, "learning_rate": 1.3433124865553437e-06, "loss": 0.0167, "step": 7051 }, { "epoch": 2.3549841375855736, "grad_norm": 0.21482688137489903, "learning_rate": 1.3419873788834164e-06, "loss": 0.0077, "step": 7052 }, { "epoch": 2.3553180831524463, "grad_norm": 0.3833841379576733, "learning_rate": 1.3406628237973662e-06, "loss": 0.0211, "step": 7053 }, { "epoch": 2.3556520287193186, "grad_norm": 0.33450685389920704, "learning_rate": 1.339338821497283e-06, "loss": 0.0168, "step": 7054 }, { "epoch": 2.3559859742861913, "grad_norm": 0.2138274774323666, "learning_rate": 1.3380153721831745e-06, "loss": 0.0106, "step": 7055 }, { "epoch": 2.356319919853064, "grad_norm": 0.2927400855130381, "learning_rate": 1.3366924760549632e-06, "loss": 0.0166, "step": 7056 }, { "epoch": 2.3566538654199367, "grad_norm": 0.26819134391433797, "learning_rate": 1.3353701333124863e-06, "loss": 0.0117, "step": 7057 }, { "epoch": 2.356987810986809, "grad_norm": 0.2713273801765072, "learning_rate": 1.3340483441555024e-06, "loss": 0.0099, "step": 7058 }, { "epoch": 2.3573217565536817, "grad_norm": 0.2613137718575152, "learning_rate": 1.3327271087836792e-06, "loss": 0.0155, "step": 7059 }, { "epoch": 2.3576557021205544, "grad_norm": 0.23319159108616203, "learning_rate": 1.331406427396607e-06, "loss": 0.0116, "step": 7060 }, { "epoch": 2.357989647687427, "grad_norm": 0.36631150451240413, "learning_rate": 1.3300863001937902e-06, "loss": 0.0181, "step": 7061 }, { "epoch": 2.3583235932542994, "grad_norm": 0.28616188309510715, "learning_rate": 1.3287667273746513e-06, "loss": 0.0116, "step": 7062 }, { "epoch": 2.358657538821172, "grad_norm": 0.6289186504409826, "learning_rate": 1.3274477091385241e-06, "loss": 0.026, "step": 7063 }, { "epoch": 2.358991484388045, "grad_norm": 0.27430290347825487, "learning_rate": 1.3261292456846648e-06, "loss": 0.0197, "step": 7064 }, { "epoch": 2.359325429954917, "grad_norm": 0.36644620101865705, "learning_rate": 1.3248113372122395e-06, "loss": 0.0207, "step": 7065 }, { "epoch": 2.35965937552179, "grad_norm": 0.32346152710879994, "learning_rate": 1.3234939839203358e-06, "loss": 0.0125, "step": 7066 }, { "epoch": 2.3599933210886626, "grad_norm": 0.32124866089323395, "learning_rate": 1.3221771860079569e-06, "loss": 0.0152, "step": 7067 }, { "epoch": 2.3603272666555353, "grad_norm": 0.29419654271254136, "learning_rate": 1.3208609436740178e-06, "loss": 0.0127, "step": 7068 }, { "epoch": 2.360661212222408, "grad_norm": 0.2304331921607892, "learning_rate": 1.3195452571173551e-06, "loss": 0.0088, "step": 7069 }, { "epoch": 2.3609951577892803, "grad_norm": 0.2631261767561719, "learning_rate": 1.3182301265367154e-06, "loss": 0.0144, "step": 7070 }, { "epoch": 2.361329103356153, "grad_norm": 0.24281070467974317, "learning_rate": 1.3169155521307664e-06, "loss": 0.0082, "step": 7071 }, { "epoch": 2.3616630489230257, "grad_norm": 0.2448273905444959, "learning_rate": 1.3156015340980904e-06, "loss": 0.0092, "step": 7072 }, { "epoch": 2.361996994489898, "grad_norm": 0.23846182020447573, "learning_rate": 1.3142880726371865e-06, "loss": 0.0107, "step": 7073 }, { "epoch": 2.3623309400567707, "grad_norm": 0.24963247889376136, "learning_rate": 1.312975167946466e-06, "loss": 0.0106, "step": 7074 }, { "epoch": 2.3626648856236434, "grad_norm": 0.25951704733003367, "learning_rate": 1.3116628202242603e-06, "loss": 0.0119, "step": 7075 }, { "epoch": 2.362998831190516, "grad_norm": 0.27714867760392925, "learning_rate": 1.3103510296688137e-06, "loss": 0.0136, "step": 7076 }, { "epoch": 2.3633327767573884, "grad_norm": 0.20879695838851028, "learning_rate": 1.309039796478288e-06, "loss": 0.0093, "step": 7077 }, { "epoch": 2.363666722324261, "grad_norm": 0.33077264493637937, "learning_rate": 1.307729120850761e-06, "loss": 0.0157, "step": 7078 }, { "epoch": 2.364000667891134, "grad_norm": 0.25900661975678163, "learning_rate": 1.306419002984226e-06, "loss": 0.0114, "step": 7079 }, { "epoch": 2.3643346134580066, "grad_norm": 0.2781441894602771, "learning_rate": 1.3051094430765905e-06, "loss": 0.0122, "step": 7080 }, { "epoch": 2.364668559024879, "grad_norm": 0.3098809962725239, "learning_rate": 1.3038004413256805e-06, "loss": 0.0162, "step": 7081 }, { "epoch": 2.3650025045917515, "grad_norm": 0.2227075007275612, "learning_rate": 1.3024919979292338e-06, "loss": 0.0132, "step": 7082 }, { "epoch": 2.3653364501586243, "grad_norm": 0.28777777897750045, "learning_rate": 1.3011841130849079e-06, "loss": 0.0114, "step": 7083 }, { "epoch": 2.3656703957254965, "grad_norm": 0.25312442289604575, "learning_rate": 1.2998767869902733e-06, "loss": 0.0217, "step": 7084 }, { "epoch": 2.3660043412923693, "grad_norm": 0.27828982828361787, "learning_rate": 1.2985700198428197e-06, "loss": 0.0146, "step": 7085 }, { "epoch": 2.366338286859242, "grad_norm": 0.3197384774721434, "learning_rate": 1.2972638118399456e-06, "loss": 0.0135, "step": 7086 }, { "epoch": 2.3666722324261147, "grad_norm": 0.22957950968851462, "learning_rate": 1.2959581631789725e-06, "loss": 0.0099, "step": 7087 }, { "epoch": 2.367006177992987, "grad_norm": 0.2988210668936548, "learning_rate": 1.2946530740571316e-06, "loss": 0.0143, "step": 7088 }, { "epoch": 2.3673401235598597, "grad_norm": 0.29309095316351913, "learning_rate": 1.293348544671572e-06, "loss": 0.0158, "step": 7089 }, { "epoch": 2.3676740691267324, "grad_norm": 0.254190088622963, "learning_rate": 1.2920445752193617e-06, "loss": 0.0163, "step": 7090 }, { "epoch": 2.368008014693605, "grad_norm": 0.30099725210470746, "learning_rate": 1.2907411658974756e-06, "loss": 0.0153, "step": 7091 }, { "epoch": 2.3683419602604774, "grad_norm": 0.253633411322103, "learning_rate": 1.2894383169028134e-06, "loss": 0.011, "step": 7092 }, { "epoch": 2.36867590582735, "grad_norm": 0.257362772235316, "learning_rate": 1.2881360284321825e-06, "loss": 0.015, "step": 7093 }, { "epoch": 2.369009851394223, "grad_norm": 0.304343203003211, "learning_rate": 1.2868343006823113e-06, "loss": 0.0195, "step": 7094 }, { "epoch": 2.369343796961095, "grad_norm": 0.23013095858508806, "learning_rate": 1.2855331338498377e-06, "loss": 0.0114, "step": 7095 }, { "epoch": 2.369677742527968, "grad_norm": 0.3436576246126651, "learning_rate": 1.2842325281313233e-06, "loss": 0.0175, "step": 7096 }, { "epoch": 2.3700116880948405, "grad_norm": 0.3026982252998837, "learning_rate": 1.282932483723236e-06, "loss": 0.0122, "step": 7097 }, { "epoch": 2.3703456336617132, "grad_norm": 0.2957131535759339, "learning_rate": 1.2816330008219656e-06, "loss": 0.0135, "step": 7098 }, { "epoch": 2.370679579228586, "grad_norm": 0.2721010527776561, "learning_rate": 1.280334079623811e-06, "loss": 0.0142, "step": 7099 }, { "epoch": 2.3710135247954582, "grad_norm": 0.29651695111407894, "learning_rate": 1.2790357203249931e-06, "loss": 0.0195, "step": 7100 }, { "epoch": 2.371347470362331, "grad_norm": 0.31437865394416964, "learning_rate": 1.2777379231216391e-06, "loss": 0.0178, "step": 7101 }, { "epoch": 2.3716814159292037, "grad_norm": 0.371615913467936, "learning_rate": 1.2764406882098035e-06, "loss": 0.0245, "step": 7102 }, { "epoch": 2.372015361496076, "grad_norm": 0.2632986372930604, "learning_rate": 1.2751440157854439e-06, "loss": 0.011, "step": 7103 }, { "epoch": 2.3723493070629487, "grad_norm": 0.3588190483629132, "learning_rate": 1.2738479060444408e-06, "loss": 0.0273, "step": 7104 }, { "epoch": 2.3726832526298214, "grad_norm": 0.20756599791892164, "learning_rate": 1.2725523591825845e-06, "loss": 0.0096, "step": 7105 }, { "epoch": 2.373017198196694, "grad_norm": 0.2528450889854755, "learning_rate": 1.2712573753955842e-06, "loss": 0.0119, "step": 7106 }, { "epoch": 2.3733511437635664, "grad_norm": 0.274285984152569, "learning_rate": 1.2699629548790599e-06, "loss": 0.0146, "step": 7107 }, { "epoch": 2.373685089330439, "grad_norm": 0.32026522896008774, "learning_rate": 1.2686690978285533e-06, "loss": 0.0153, "step": 7108 }, { "epoch": 2.374019034897312, "grad_norm": 0.2295521880397819, "learning_rate": 1.267375804439513e-06, "loss": 0.0098, "step": 7109 }, { "epoch": 2.3743529804641845, "grad_norm": 0.31821061012740803, "learning_rate": 1.2660830749073093e-06, "loss": 0.0197, "step": 7110 }, { "epoch": 2.374686926031057, "grad_norm": 0.31868346130439756, "learning_rate": 1.2647909094272215e-06, "loss": 0.0138, "step": 7111 }, { "epoch": 2.3750208715979295, "grad_norm": 0.2875321158025383, "learning_rate": 1.2634993081944469e-06, "loss": 0.0129, "step": 7112 }, { "epoch": 2.3753548171648022, "grad_norm": 0.4186992288337688, "learning_rate": 1.2622082714040995e-06, "loss": 0.019, "step": 7113 }, { "epoch": 2.3756887627316745, "grad_norm": 0.27442706926786037, "learning_rate": 1.2609177992512022e-06, "loss": 0.013, "step": 7114 }, { "epoch": 2.3760227082985472, "grad_norm": 0.26981130860632113, "learning_rate": 1.2596278919306993e-06, "loss": 0.0101, "step": 7115 }, { "epoch": 2.37635665386542, "grad_norm": 0.25333406289441435, "learning_rate": 1.2583385496374428e-06, "loss": 0.0128, "step": 7116 }, { "epoch": 2.3766905994322927, "grad_norm": 0.32993803245798575, "learning_rate": 1.2570497725662067e-06, "loss": 0.0162, "step": 7117 }, { "epoch": 2.3770245449991654, "grad_norm": 0.30424642025928195, "learning_rate": 1.2557615609116713e-06, "loss": 0.0133, "step": 7118 }, { "epoch": 2.3773584905660377, "grad_norm": 0.21437599529078102, "learning_rate": 1.254473914868442e-06, "loss": 0.0091, "step": 7119 }, { "epoch": 2.3776924361329104, "grad_norm": 0.28701269204362556, "learning_rate": 1.2531868346310288e-06, "loss": 0.0118, "step": 7120 }, { "epoch": 2.378026381699783, "grad_norm": 0.262028252236464, "learning_rate": 1.2519003203938628e-06, "loss": 0.0101, "step": 7121 }, { "epoch": 2.3783603272666554, "grad_norm": 0.26227682214931713, "learning_rate": 1.2506143723512842e-06, "loss": 0.0107, "step": 7122 }, { "epoch": 2.378694272833528, "grad_norm": 0.41353513779341977, "learning_rate": 1.2493289906975543e-06, "loss": 0.0201, "step": 7123 }, { "epoch": 2.379028218400401, "grad_norm": 0.39013520606720037, "learning_rate": 1.2480441756268397e-06, "loss": 0.0216, "step": 7124 }, { "epoch": 2.3793621639672735, "grad_norm": 0.39793688390234655, "learning_rate": 1.2467599273332332e-06, "loss": 0.0206, "step": 7125 }, { "epoch": 2.379696109534146, "grad_norm": 0.3389970768895821, "learning_rate": 1.245476246010731e-06, "loss": 0.0182, "step": 7126 }, { "epoch": 2.3800300551010185, "grad_norm": 0.24997088072860787, "learning_rate": 1.244193131853252e-06, "loss": 0.0128, "step": 7127 }, { "epoch": 2.3803640006678912, "grad_norm": 0.35901501094601995, "learning_rate": 1.2429105850546213e-06, "loss": 0.0193, "step": 7128 }, { "epoch": 2.380697946234764, "grad_norm": 0.26315141431875416, "learning_rate": 1.241628605808587e-06, "loss": 0.0096, "step": 7129 }, { "epoch": 2.381031891801636, "grad_norm": 0.322861217747456, "learning_rate": 1.2403471943088018e-06, "loss": 0.0153, "step": 7130 }, { "epoch": 2.381365837368509, "grad_norm": 0.2886616062972126, "learning_rate": 1.239066350748845e-06, "loss": 0.0119, "step": 7131 }, { "epoch": 2.3816997829353816, "grad_norm": 0.29726272959524785, "learning_rate": 1.2377860753221976e-06, "loss": 0.0177, "step": 7132 }, { "epoch": 2.382033728502254, "grad_norm": 0.2537239930121517, "learning_rate": 1.236506368222264e-06, "loss": 0.0095, "step": 7133 }, { "epoch": 2.3823676740691266, "grad_norm": 0.28960540961632786, "learning_rate": 1.235227229642355e-06, "loss": 0.0114, "step": 7134 }, { "epoch": 2.3827016196359994, "grad_norm": 0.3111297053772571, "learning_rate": 1.2339486597757038e-06, "loss": 0.0162, "step": 7135 }, { "epoch": 2.383035565202872, "grad_norm": 0.2679547588899558, "learning_rate": 1.2326706588154496e-06, "loss": 0.0177, "step": 7136 }, { "epoch": 2.3833695107697443, "grad_norm": 0.2949021800531175, "learning_rate": 1.2313932269546518e-06, "loss": 0.0125, "step": 7137 }, { "epoch": 2.383703456336617, "grad_norm": 0.2565085317899601, "learning_rate": 1.2301163643862817e-06, "loss": 0.0114, "step": 7138 }, { "epoch": 2.38403740190349, "grad_norm": 0.2386767623144155, "learning_rate": 1.2288400713032227e-06, "loss": 0.0106, "step": 7139 }, { "epoch": 2.3843713474703625, "grad_norm": 0.37789984358150264, "learning_rate": 1.2275643478982762e-06, "loss": 0.0225, "step": 7140 }, { "epoch": 2.3847052930372348, "grad_norm": 0.27959514576471695, "learning_rate": 1.2262891943641526e-06, "loss": 0.0127, "step": 7141 }, { "epoch": 2.3850392386041075, "grad_norm": 0.280060467735272, "learning_rate": 1.2250146108934802e-06, "loss": 0.0112, "step": 7142 }, { "epoch": 2.38537318417098, "grad_norm": 0.279827979047377, "learning_rate": 1.2237405976787997e-06, "loss": 0.0143, "step": 7143 }, { "epoch": 2.3857071297378525, "grad_norm": 0.2539864619433211, "learning_rate": 1.2224671549125673e-06, "loss": 0.0135, "step": 7144 }, { "epoch": 2.386041075304725, "grad_norm": 0.3003236120274151, "learning_rate": 1.2211942827871486e-06, "loss": 0.0161, "step": 7145 }, { "epoch": 2.386375020871598, "grad_norm": 0.29455212569386563, "learning_rate": 1.2199219814948294e-06, "loss": 0.0133, "step": 7146 }, { "epoch": 2.3867089664384706, "grad_norm": 0.4499970929383756, "learning_rate": 1.218650251227802e-06, "loss": 0.0194, "step": 7147 }, { "epoch": 2.3870429120053434, "grad_norm": 0.3853206507772999, "learning_rate": 1.2173790921781786e-06, "loss": 0.0145, "step": 7148 }, { "epoch": 2.3873768575722156, "grad_norm": 0.4580032725791835, "learning_rate": 1.2161085045379818e-06, "loss": 0.0279, "step": 7149 }, { "epoch": 2.3877108031390883, "grad_norm": 0.30795167351760605, "learning_rate": 1.214838488499151e-06, "loss": 0.0131, "step": 7150 }, { "epoch": 2.388044748705961, "grad_norm": 0.2449759315544695, "learning_rate": 1.2135690442535335e-06, "loss": 0.0116, "step": 7151 }, { "epoch": 2.3883786942728333, "grad_norm": 0.23540419332513418, "learning_rate": 1.2123001719928972e-06, "loss": 0.0122, "step": 7152 }, { "epoch": 2.388712639839706, "grad_norm": 0.27832047670593196, "learning_rate": 1.211031871908916e-06, "loss": 0.0133, "step": 7153 }, { "epoch": 2.3890465854065788, "grad_norm": 0.35796883834542315, "learning_rate": 1.2097641441931868e-06, "loss": 0.0169, "step": 7154 }, { "epoch": 2.3893805309734515, "grad_norm": 0.33903461284328346, "learning_rate": 1.2084969890372111e-06, "loss": 0.0122, "step": 7155 }, { "epoch": 2.3897144765403238, "grad_norm": 0.3545470345634991, "learning_rate": 1.2072304066324103e-06, "loss": 0.0179, "step": 7156 }, { "epoch": 2.3900484221071965, "grad_norm": 0.278512013267484, "learning_rate": 1.205964397170113e-06, "loss": 0.0116, "step": 7157 }, { "epoch": 2.390382367674069, "grad_norm": 0.32364809262854305, "learning_rate": 1.2046989608415682e-06, "loss": 0.0136, "step": 7158 }, { "epoch": 2.390716313240942, "grad_norm": 0.27568292826293606, "learning_rate": 1.2034340978379328e-06, "loss": 0.0151, "step": 7159 }, { "epoch": 2.391050258807814, "grad_norm": 0.7998645300973338, "learning_rate": 1.2021698083502797e-06, "loss": 0.0176, "step": 7160 }, { "epoch": 2.391384204374687, "grad_norm": 0.24883334864576495, "learning_rate": 1.2009060925695965e-06, "loss": 0.0129, "step": 7161 }, { "epoch": 2.3917181499415596, "grad_norm": 0.24945895571795934, "learning_rate": 1.1996429506867797e-06, "loss": 0.0091, "step": 7162 }, { "epoch": 2.392052095508432, "grad_norm": 0.31494602785813586, "learning_rate": 1.1983803828926438e-06, "loss": 0.013, "step": 7163 }, { "epoch": 2.3923860410753046, "grad_norm": 0.31393119034954425, "learning_rate": 1.1971183893779125e-06, "loss": 0.013, "step": 7164 }, { "epoch": 2.3927199866421773, "grad_norm": 0.3238796788901826, "learning_rate": 1.1958569703332262e-06, "loss": 0.0147, "step": 7165 }, { "epoch": 2.39305393220905, "grad_norm": 0.342314876593608, "learning_rate": 1.1945961259491368e-06, "loss": 0.015, "step": 7166 }, { "epoch": 2.3933878777759228, "grad_norm": 0.2750549613113608, "learning_rate": 1.1933358564161108e-06, "loss": 0.013, "step": 7167 }, { "epoch": 2.393721823342795, "grad_norm": 0.2584896582368959, "learning_rate": 1.1920761619245246e-06, "loss": 0.0128, "step": 7168 }, { "epoch": 2.3940557689096678, "grad_norm": 0.3167388422063857, "learning_rate": 1.1908170426646726e-06, "loss": 0.01, "step": 7169 }, { "epoch": 2.3943897144765405, "grad_norm": 0.20762422386899154, "learning_rate": 1.189558498826756e-06, "loss": 0.0112, "step": 7170 }, { "epoch": 2.3947236600434127, "grad_norm": 0.2825120417049143, "learning_rate": 1.1883005306008955e-06, "loss": 0.0153, "step": 7171 }, { "epoch": 2.3950576056102855, "grad_norm": 0.26261833220252195, "learning_rate": 1.1870431381771203e-06, "loss": 0.0103, "step": 7172 }, { "epoch": 2.395391551177158, "grad_norm": 0.2590827538536333, "learning_rate": 1.185786321745377e-06, "loss": 0.0119, "step": 7173 }, { "epoch": 2.395725496744031, "grad_norm": 0.24902586742990407, "learning_rate": 1.1845300814955192e-06, "loss": 0.0126, "step": 7174 }, { "epoch": 2.396059442310903, "grad_norm": 0.2858493562375798, "learning_rate": 1.18327441761732e-06, "loss": 0.0127, "step": 7175 }, { "epoch": 2.396393387877776, "grad_norm": 0.3054522282716761, "learning_rate": 1.1820193303004584e-06, "loss": 0.0156, "step": 7176 }, { "epoch": 2.3967273334446486, "grad_norm": 0.16667413021003308, "learning_rate": 1.1807648197345327e-06, "loss": 0.0064, "step": 7177 }, { "epoch": 2.3970612790115213, "grad_norm": 0.2759422229871676, "learning_rate": 1.1795108861090515e-06, "loss": 0.0105, "step": 7178 }, { "epoch": 2.3973952245783936, "grad_norm": 0.293786695623443, "learning_rate": 1.1782575296134363e-06, "loss": 0.0135, "step": 7179 }, { "epoch": 2.3977291701452663, "grad_norm": 0.2837512554177609, "learning_rate": 1.1770047504370197e-06, "loss": 0.0151, "step": 7180 }, { "epoch": 2.398063115712139, "grad_norm": 0.27206695994681046, "learning_rate": 1.1757525487690513e-06, "loss": 0.0128, "step": 7181 }, { "epoch": 2.3983970612790113, "grad_norm": 0.2886242427903375, "learning_rate": 1.1745009247986882e-06, "loss": 0.0187, "step": 7182 }, { "epoch": 2.398731006845884, "grad_norm": 0.2743196315802026, "learning_rate": 1.1732498787150044e-06, "loss": 0.0088, "step": 7183 }, { "epoch": 2.3990649524127567, "grad_norm": 0.27742323748456466, "learning_rate": 1.171999410706986e-06, "loss": 0.0121, "step": 7184 }, { "epoch": 2.3993988979796295, "grad_norm": 0.28507840321145467, "learning_rate": 1.1707495209635283e-06, "loss": 0.0128, "step": 7185 }, { "epoch": 2.3997328435465017, "grad_norm": 0.2745559950112873, "learning_rate": 1.1695002096734454e-06, "loss": 0.0155, "step": 7186 }, { "epoch": 2.4000667891133745, "grad_norm": 0.31055652439516734, "learning_rate": 1.1682514770254567e-06, "loss": 0.0114, "step": 7187 }, { "epoch": 2.400400734680247, "grad_norm": 0.2619762054457191, "learning_rate": 1.1670033232081995e-06, "loss": 0.0119, "step": 7188 }, { "epoch": 2.40073468024712, "grad_norm": 0.2915813885638635, "learning_rate": 1.1657557484102228e-06, "loss": 0.0157, "step": 7189 }, { "epoch": 2.401068625813992, "grad_norm": 0.28864581385503507, "learning_rate": 1.1645087528199883e-06, "loss": 0.0148, "step": 7190 }, { "epoch": 2.401402571380865, "grad_norm": 0.24759176434505523, "learning_rate": 1.1632623366258666e-06, "loss": 0.0111, "step": 7191 }, { "epoch": 2.4017365169477376, "grad_norm": 0.2710021724502984, "learning_rate": 1.162016500016147e-06, "loss": 0.0088, "step": 7192 }, { "epoch": 2.40207046251461, "grad_norm": 0.2917673858939121, "learning_rate": 1.1607712431790242e-06, "loss": 0.0164, "step": 7193 }, { "epoch": 2.4024044080814826, "grad_norm": 0.24049259677765855, "learning_rate": 1.15952656630261e-06, "loss": 0.0097, "step": 7194 }, { "epoch": 2.4027383536483553, "grad_norm": 0.44935370061229574, "learning_rate": 1.158282469574929e-06, "loss": 0.0232, "step": 7195 }, { "epoch": 2.403072299215228, "grad_norm": 0.2765799696859926, "learning_rate": 1.1570389531839165e-06, "loss": 0.0145, "step": 7196 }, { "epoch": 2.4034062447821007, "grad_norm": 0.24358066177297905, "learning_rate": 1.1557960173174183e-06, "loss": 0.0108, "step": 7197 }, { "epoch": 2.403740190348973, "grad_norm": 0.25225171377279826, "learning_rate": 1.154553662163197e-06, "loss": 0.0133, "step": 7198 }, { "epoch": 2.4040741359158457, "grad_norm": 0.28316155252835085, "learning_rate": 1.1533118879089227e-06, "loss": 0.0109, "step": 7199 }, { "epoch": 2.4044080814827185, "grad_norm": 0.26854994796798737, "learning_rate": 1.1520706947421806e-06, "loss": 0.0103, "step": 7200 }, { "epoch": 2.4047420270495907, "grad_norm": 0.32211611766440146, "learning_rate": 1.1508300828504682e-06, "loss": 0.015, "step": 7201 }, { "epoch": 2.4050759726164634, "grad_norm": 0.2574191532961684, "learning_rate": 1.1495900524211955e-06, "loss": 0.0137, "step": 7202 }, { "epoch": 2.405409918183336, "grad_norm": 0.2597273172537471, "learning_rate": 1.1483506036416814e-06, "loss": 0.0142, "step": 7203 }, { "epoch": 2.405743863750209, "grad_norm": 0.253138913648762, "learning_rate": 1.1471117366991613e-06, "loss": 0.0121, "step": 7204 }, { "epoch": 2.406077809317081, "grad_norm": 0.23321722385078825, "learning_rate": 1.1458734517807785e-06, "loss": 0.0148, "step": 7205 }, { "epoch": 2.406411754883954, "grad_norm": 0.2894150642303496, "learning_rate": 1.1446357490735921e-06, "loss": 0.0115, "step": 7206 }, { "epoch": 2.4067457004508266, "grad_norm": 0.20410274855343485, "learning_rate": 1.143398628764572e-06, "loss": 0.0087, "step": 7207 }, { "epoch": 2.4070796460176993, "grad_norm": 0.25520301126643197, "learning_rate": 1.1421620910405977e-06, "loss": 0.0124, "step": 7208 }, { "epoch": 2.4074135915845716, "grad_norm": 0.2591264065436835, "learning_rate": 1.1409261360884661e-06, "loss": 0.0137, "step": 7209 }, { "epoch": 2.4077475371514443, "grad_norm": 0.2382586100039912, "learning_rate": 1.1396907640948785e-06, "loss": 0.0113, "step": 7210 }, { "epoch": 2.408081482718317, "grad_norm": 0.2955676439934013, "learning_rate": 1.1384559752464553e-06, "loss": 0.0134, "step": 7211 }, { "epoch": 2.4084154282851893, "grad_norm": 0.3206987871751663, "learning_rate": 1.137221769729725e-06, "loss": 0.0117, "step": 7212 }, { "epoch": 2.408749373852062, "grad_norm": 0.3463327320325806, "learning_rate": 1.1359881477311301e-06, "loss": 0.0165, "step": 7213 }, { "epoch": 2.4090833194189347, "grad_norm": 0.33288614094209124, "learning_rate": 1.1347551094370224e-06, "loss": 0.0216, "step": 7214 }, { "epoch": 2.4094172649858074, "grad_norm": 0.2988111602023324, "learning_rate": 1.1335226550336676e-06, "loss": 0.0161, "step": 7215 }, { "epoch": 2.40975121055268, "grad_norm": 0.22773046890329157, "learning_rate": 1.1322907847072411e-06, "loss": 0.0095, "step": 7216 }, { "epoch": 2.4100851561195524, "grad_norm": 0.27544140321724186, "learning_rate": 1.1310594986438339e-06, "loss": 0.0133, "step": 7217 }, { "epoch": 2.410419101686425, "grad_norm": 0.21034760716212833, "learning_rate": 1.129828797029442e-06, "loss": 0.0083, "step": 7218 }, { "epoch": 2.410753047253298, "grad_norm": 0.3253036600406103, "learning_rate": 1.128598680049982e-06, "loss": 0.0158, "step": 7219 }, { "epoch": 2.41108699282017, "grad_norm": 0.2774108699535163, "learning_rate": 1.1273691478912752e-06, "loss": 0.0111, "step": 7220 }, { "epoch": 2.411420938387043, "grad_norm": 0.31876378259886223, "learning_rate": 1.1261402007390587e-06, "loss": 0.0172, "step": 7221 }, { "epoch": 2.4117548839539156, "grad_norm": 0.3119847861004831, "learning_rate": 1.1249118387789764e-06, "loss": 0.0146, "step": 7222 }, { "epoch": 2.4120888295207883, "grad_norm": 0.2828378050063791, "learning_rate": 1.12368406219659e-06, "loss": 0.0184, "step": 7223 }, { "epoch": 2.4124227750876606, "grad_norm": 0.24241244445989907, "learning_rate": 1.1224568711773653e-06, "loss": 0.008, "step": 7224 }, { "epoch": 2.4127567206545333, "grad_norm": 0.25305626652921254, "learning_rate": 1.1212302659066898e-06, "loss": 0.013, "step": 7225 }, { "epoch": 2.413090666221406, "grad_norm": 0.29622762532899843, "learning_rate": 1.1200042465698518e-06, "loss": 0.0162, "step": 7226 }, { "epoch": 2.4134246117882787, "grad_norm": 0.24693155073398468, "learning_rate": 1.1187788133520594e-06, "loss": 0.0125, "step": 7227 }, { "epoch": 2.413758557355151, "grad_norm": 0.222319670796678, "learning_rate": 1.1175539664384261e-06, "loss": 0.0122, "step": 7228 }, { "epoch": 2.4140925029220237, "grad_norm": 0.2643043895840703, "learning_rate": 1.1163297060139815e-06, "loss": 0.0099, "step": 7229 }, { "epoch": 2.4144264484888964, "grad_norm": 0.2755059947823996, "learning_rate": 1.1151060322636625e-06, "loss": 0.0109, "step": 7230 }, { "epoch": 2.4147603940557687, "grad_norm": 0.3011623464267576, "learning_rate": 1.1138829453723204e-06, "loss": 0.0167, "step": 7231 }, { "epoch": 2.4150943396226414, "grad_norm": 0.24423068821650673, "learning_rate": 1.112660445524718e-06, "loss": 0.0114, "step": 7232 }, { "epoch": 2.415428285189514, "grad_norm": 0.2468340235552794, "learning_rate": 1.1114385329055262e-06, "loss": 0.0099, "step": 7233 }, { "epoch": 2.415762230756387, "grad_norm": 0.30319247623744344, "learning_rate": 1.1102172076993301e-06, "loss": 0.0091, "step": 7234 }, { "epoch": 2.416096176323259, "grad_norm": 0.322759779147581, "learning_rate": 1.1089964700906257e-06, "loss": 0.0122, "step": 7235 }, { "epoch": 2.416430121890132, "grad_norm": 0.3288493547130975, "learning_rate": 1.1077763202638208e-06, "loss": 0.0216, "step": 7236 }, { "epoch": 2.4167640674570046, "grad_norm": 0.26661830516489915, "learning_rate": 1.106556758403231e-06, "loss": 0.0124, "step": 7237 }, { "epoch": 2.4170980130238773, "grad_norm": 0.26447273292117623, "learning_rate": 1.105337784693088e-06, "loss": 0.0136, "step": 7238 }, { "epoch": 2.4174319585907496, "grad_norm": 0.3060522227695716, "learning_rate": 1.1041193993175293e-06, "loss": 0.0147, "step": 7239 }, { "epoch": 2.4177659041576223, "grad_norm": 0.38026942731737745, "learning_rate": 1.1029016024606093e-06, "loss": 0.0093, "step": 7240 }, { "epoch": 2.418099849724495, "grad_norm": 0.32183588906554944, "learning_rate": 1.101684394306286e-06, "loss": 0.0145, "step": 7241 }, { "epoch": 2.4184337952913673, "grad_norm": 0.2660050271437931, "learning_rate": 1.100467775038439e-06, "loss": 0.0132, "step": 7242 }, { "epoch": 2.41876774085824, "grad_norm": 0.423035256858228, "learning_rate": 1.099251744840849e-06, "loss": 0.0198, "step": 7243 }, { "epoch": 2.4191016864251127, "grad_norm": 0.3625454286746034, "learning_rate": 1.0980363038972141e-06, "loss": 0.0213, "step": 7244 }, { "epoch": 2.4194356319919854, "grad_norm": 0.2424465912500267, "learning_rate": 1.096821452391138e-06, "loss": 0.0105, "step": 7245 }, { "epoch": 2.419769577558858, "grad_norm": 0.2586066381095055, "learning_rate": 1.0956071905061415e-06, "loss": 0.0128, "step": 7246 }, { "epoch": 2.4201035231257304, "grad_norm": 0.3333268966603381, "learning_rate": 1.0943935184256487e-06, "loss": 0.0177, "step": 7247 }, { "epoch": 2.420437468692603, "grad_norm": 0.286084481345273, "learning_rate": 1.093180436333005e-06, "loss": 0.0149, "step": 7248 }, { "epoch": 2.420771414259476, "grad_norm": 0.2493653138982644, "learning_rate": 1.091967944411456e-06, "loss": 0.0107, "step": 7249 }, { "epoch": 2.421105359826348, "grad_norm": 0.25159429541466294, "learning_rate": 1.0907560428441666e-06, "loss": 0.0087, "step": 7250 }, { "epoch": 2.421439305393221, "grad_norm": 0.2513470760755896, "learning_rate": 1.0895447318142043e-06, "loss": 0.0121, "step": 7251 }, { "epoch": 2.4217732509600935, "grad_norm": 0.20574294406547308, "learning_rate": 1.0883340115045566e-06, "loss": 0.0081, "step": 7252 }, { "epoch": 2.4221071965269663, "grad_norm": 0.2306528422692096, "learning_rate": 1.0871238820981133e-06, "loss": 0.0096, "step": 7253 }, { "epoch": 2.4224411420938385, "grad_norm": 0.2616276871365394, "learning_rate": 1.0859143437776803e-06, "loss": 0.0138, "step": 7254 }, { "epoch": 2.4227750876607113, "grad_norm": 0.321239771478697, "learning_rate": 1.0847053967259736e-06, "loss": 0.0132, "step": 7255 }, { "epoch": 2.423109033227584, "grad_norm": 0.29953783460510724, "learning_rate": 1.0834970411256167e-06, "loss": 0.0155, "step": 7256 }, { "epoch": 2.4234429787944567, "grad_norm": 0.3217997164330562, "learning_rate": 1.082289277159147e-06, "loss": 0.0111, "step": 7257 }, { "epoch": 2.423776924361329, "grad_norm": 0.34331779083669667, "learning_rate": 1.0810821050090132e-06, "loss": 0.0223, "step": 7258 }, { "epoch": 2.4241108699282017, "grad_norm": 0.29478707084790945, "learning_rate": 1.0798755248575694e-06, "loss": 0.0141, "step": 7259 }, { "epoch": 2.4244448154950744, "grad_norm": 0.2786668752463381, "learning_rate": 1.078669536887086e-06, "loss": 0.0106, "step": 7260 }, { "epoch": 2.4247787610619467, "grad_norm": 0.21496456487304513, "learning_rate": 1.077464141279742e-06, "loss": 0.0097, "step": 7261 }, { "epoch": 2.4251127066288194, "grad_norm": 0.2752509532762129, "learning_rate": 1.0762593382176244e-06, "loss": 0.0104, "step": 7262 }, { "epoch": 2.425446652195692, "grad_norm": 0.23406747399877303, "learning_rate": 1.0750551278827365e-06, "loss": 0.0113, "step": 7263 }, { "epoch": 2.425780597762565, "grad_norm": 0.2843164037544598, "learning_rate": 1.073851510456984e-06, "loss": 0.013, "step": 7264 }, { "epoch": 2.4261145433294375, "grad_norm": 0.29019737142117985, "learning_rate": 1.0726484861221902e-06, "loss": 0.0146, "step": 7265 }, { "epoch": 2.42644848889631, "grad_norm": 0.32223930603474793, "learning_rate": 1.0714460550600859e-06, "loss": 0.0204, "step": 7266 }, { "epoch": 2.4267824344631825, "grad_norm": 0.313047643406518, "learning_rate": 1.0702442174523132e-06, "loss": 0.016, "step": 7267 }, { "epoch": 2.4271163800300553, "grad_norm": 0.2593153263087686, "learning_rate": 1.0690429734804214e-06, "loss": 0.0088, "step": 7268 }, { "epoch": 2.4274503255969275, "grad_norm": 0.2815875789425934, "learning_rate": 1.0678423233258755e-06, "loss": 0.0149, "step": 7269 }, { "epoch": 2.4277842711638002, "grad_norm": 0.3730049656620886, "learning_rate": 1.0666422671700438e-06, "loss": 0.0155, "step": 7270 }, { "epoch": 2.428118216730673, "grad_norm": 0.35707437192802916, "learning_rate": 1.065442805194214e-06, "loss": 0.0131, "step": 7271 }, { "epoch": 2.4284521622975457, "grad_norm": 0.2455000952029101, "learning_rate": 1.0642439375795748e-06, "loss": 0.0108, "step": 7272 }, { "epoch": 2.428786107864418, "grad_norm": 0.3720971236342139, "learning_rate": 1.0630456645072324e-06, "loss": 0.0213, "step": 7273 }, { "epoch": 2.4291200534312907, "grad_norm": 0.29797087934836636, "learning_rate": 1.0618479861581971e-06, "loss": 0.0137, "step": 7274 }, { "epoch": 2.4294539989981634, "grad_norm": 0.2400773867898801, "learning_rate": 1.060650902713395e-06, "loss": 0.0109, "step": 7275 }, { "epoch": 2.429787944565036, "grad_norm": 0.3130496837461216, "learning_rate": 1.0594544143536572e-06, "loss": 0.0158, "step": 7276 }, { "epoch": 2.4301218901319084, "grad_norm": 0.2865455182480135, "learning_rate": 1.0582585212597286e-06, "loss": 0.0127, "step": 7277 }, { "epoch": 2.430455835698781, "grad_norm": 0.305507595068663, "learning_rate": 1.0570632236122641e-06, "loss": 0.0096, "step": 7278 }, { "epoch": 2.430789781265654, "grad_norm": 0.22714468428546217, "learning_rate": 1.0558685215918246e-06, "loss": 0.0083, "step": 7279 }, { "epoch": 2.431123726832526, "grad_norm": 0.30195079622243304, "learning_rate": 1.0546744153788858e-06, "loss": 0.013, "step": 7280 }, { "epoch": 2.431457672399399, "grad_norm": 0.4330203059273548, "learning_rate": 1.0534809051538324e-06, "loss": 0.0173, "step": 7281 }, { "epoch": 2.4317916179662715, "grad_norm": 0.3901599505968961, "learning_rate": 1.0522879910969563e-06, "loss": 0.0167, "step": 7282 }, { "epoch": 2.4321255635331442, "grad_norm": 0.33959626776148355, "learning_rate": 1.0510956733884614e-06, "loss": 0.0118, "step": 7283 }, { "epoch": 2.4324595091000165, "grad_norm": 0.3049771255951287, "learning_rate": 1.0499039522084637e-06, "loss": 0.012, "step": 7284 }, { "epoch": 2.4327934546668892, "grad_norm": 0.2925027780827184, "learning_rate": 1.0487128277369829e-06, "loss": 0.015, "step": 7285 }, { "epoch": 2.433127400233762, "grad_norm": 0.3465757430233572, "learning_rate": 1.0475223001539564e-06, "loss": 0.0143, "step": 7286 }, { "epoch": 2.4334613458006347, "grad_norm": 0.2921377854090987, "learning_rate": 1.0463323696392236e-06, "loss": 0.0131, "step": 7287 }, { "epoch": 2.433795291367507, "grad_norm": 0.22218855506720242, "learning_rate": 1.0451430363725395e-06, "loss": 0.0098, "step": 7288 }, { "epoch": 2.4341292369343797, "grad_norm": 0.24695462801766666, "learning_rate": 1.043954300533566e-06, "loss": 0.0117, "step": 7289 }, { "epoch": 2.4344631825012524, "grad_norm": 0.28325740225039686, "learning_rate": 1.0427661623018786e-06, "loss": 0.0142, "step": 7290 }, { "epoch": 2.4347971280681246, "grad_norm": 0.28787787794787173, "learning_rate": 1.0415786218569557e-06, "loss": 0.015, "step": 7291 }, { "epoch": 2.4351310736349974, "grad_norm": 0.21698008235982977, "learning_rate": 1.0403916793781922e-06, "loss": 0.0126, "step": 7292 }, { "epoch": 2.43546501920187, "grad_norm": 0.20833791702536347, "learning_rate": 1.0392053350448867e-06, "loss": 0.0089, "step": 7293 }, { "epoch": 2.435798964768743, "grad_norm": 0.30648024865173745, "learning_rate": 1.0380195890362527e-06, "loss": 0.015, "step": 7294 }, { "epoch": 2.4361329103356155, "grad_norm": 0.25092182838695504, "learning_rate": 1.0368344415314101e-06, "loss": 0.0135, "step": 7295 }, { "epoch": 2.436466855902488, "grad_norm": 0.23198410448315185, "learning_rate": 1.0356498927093916e-06, "loss": 0.0091, "step": 7296 }, { "epoch": 2.4368008014693605, "grad_norm": 0.2375015576403759, "learning_rate": 1.0344659427491343e-06, "loss": 0.0114, "step": 7297 }, { "epoch": 2.4371347470362332, "grad_norm": 0.2945697420611448, "learning_rate": 1.0332825918294898e-06, "loss": 0.013, "step": 7298 }, { "epoch": 2.4374686926031055, "grad_norm": 0.2983015023268837, "learning_rate": 1.0320998401292154e-06, "loss": 0.0177, "step": 7299 }, { "epoch": 2.437802638169978, "grad_norm": 0.3106444295761094, "learning_rate": 1.0309176878269806e-06, "loss": 0.0163, "step": 7300 }, { "epoch": 2.438136583736851, "grad_norm": 0.27717876792538665, "learning_rate": 1.0297361351013646e-06, "loss": 0.0155, "step": 7301 }, { "epoch": 2.4384705293037237, "grad_norm": 0.39237590046406856, "learning_rate": 1.028555182130853e-06, "loss": 0.0179, "step": 7302 }, { "epoch": 2.438804474870596, "grad_norm": 0.25520872453543136, "learning_rate": 1.027374829093843e-06, "loss": 0.0153, "step": 7303 }, { "epoch": 2.4391384204374686, "grad_norm": 0.26162517217255865, "learning_rate": 1.0261950761686423e-06, "loss": 0.0107, "step": 7304 }, { "epoch": 2.4394723660043414, "grad_norm": 0.20978647733689199, "learning_rate": 1.0250159235334645e-06, "loss": 0.0087, "step": 7305 }, { "epoch": 2.439806311571214, "grad_norm": 0.3167807769076685, "learning_rate": 1.0238373713664351e-06, "loss": 0.0153, "step": 7306 }, { "epoch": 2.4401402571380864, "grad_norm": 0.27678055611181496, "learning_rate": 1.0226594198455903e-06, "loss": 0.0155, "step": 7307 }, { "epoch": 2.440474202704959, "grad_norm": 0.2565559204887818, "learning_rate": 1.0214820691488698e-06, "loss": 0.0088, "step": 7308 }, { "epoch": 2.440808148271832, "grad_norm": 0.28826978115615537, "learning_rate": 1.02030531945413e-06, "loss": 0.0167, "step": 7309 }, { "epoch": 2.441142093838704, "grad_norm": 0.284312603833394, "learning_rate": 1.0191291709391298e-06, "loss": 0.0125, "step": 7310 }, { "epoch": 2.441476039405577, "grad_norm": 0.3861989846609746, "learning_rate": 1.0179536237815413e-06, "loss": 0.0231, "step": 7311 }, { "epoch": 2.4418099849724495, "grad_norm": 0.24064228201789706, "learning_rate": 1.016778678158945e-06, "loss": 0.0112, "step": 7312 }, { "epoch": 2.442143930539322, "grad_norm": 0.3123762844764769, "learning_rate": 1.015604334248832e-06, "loss": 0.0163, "step": 7313 }, { "epoch": 2.442477876106195, "grad_norm": 0.26853057421196386, "learning_rate": 1.0144305922285975e-06, "loss": 0.0102, "step": 7314 }, { "epoch": 2.442811821673067, "grad_norm": 0.31314817805475603, "learning_rate": 1.0132574522755518e-06, "loss": 0.0206, "step": 7315 }, { "epoch": 2.44314576723994, "grad_norm": 0.3021525501381858, "learning_rate": 1.0120849145669093e-06, "loss": 0.0176, "step": 7316 }, { "epoch": 2.4434797128068126, "grad_norm": 0.2970065454054504, "learning_rate": 1.010912979279796e-06, "loss": 0.0143, "step": 7317 }, { "epoch": 2.443813658373685, "grad_norm": 0.24825508662583712, "learning_rate": 1.009741646591248e-06, "loss": 0.0126, "step": 7318 }, { "epoch": 2.4441476039405576, "grad_norm": 0.2597659499615799, "learning_rate": 1.0085709166782088e-06, "loss": 0.0144, "step": 7319 }, { "epoch": 2.4444815495074304, "grad_norm": 0.25450949330419564, "learning_rate": 1.0074007897175291e-06, "loss": 0.0111, "step": 7320 }, { "epoch": 2.444815495074303, "grad_norm": 0.3156661941355761, "learning_rate": 1.0062312658859723e-06, "loss": 0.0171, "step": 7321 }, { "epoch": 2.4451494406411753, "grad_norm": 0.32955147077355185, "learning_rate": 1.0050623453602075e-06, "loss": 0.0137, "step": 7322 }, { "epoch": 2.445483386208048, "grad_norm": 0.2703462933932539, "learning_rate": 1.0038940283168136e-06, "loss": 0.0145, "step": 7323 }, { "epoch": 2.4458173317749208, "grad_norm": 0.3294855990312569, "learning_rate": 1.0027263149322797e-06, "loss": 0.0192, "step": 7324 }, { "epoch": 2.4461512773417935, "grad_norm": 0.24937110062318102, "learning_rate": 1.001559205383003e-06, "loss": 0.018, "step": 7325 }, { "epoch": 2.4464852229086658, "grad_norm": 0.2548893452856415, "learning_rate": 1.000392699845288e-06, "loss": 0.0158, "step": 7326 }, { "epoch": 2.4468191684755385, "grad_norm": 0.32509317137948335, "learning_rate": 9.992267984953503e-07, "loss": 0.0163, "step": 7327 }, { "epoch": 2.447153114042411, "grad_norm": 0.3356595603778868, "learning_rate": 9.98061501509311e-07, "loss": 0.0165, "step": 7328 }, { "epoch": 2.4474870596092835, "grad_norm": 0.2976814000591848, "learning_rate": 9.968968090632032e-07, "loss": 0.0141, "step": 7329 }, { "epoch": 2.447821005176156, "grad_norm": 0.31139018416994313, "learning_rate": 9.957327213329687e-07, "loss": 0.0202, "step": 7330 }, { "epoch": 2.448154950743029, "grad_norm": 0.23872054040481558, "learning_rate": 9.945692384944544e-07, "loss": 0.0105, "step": 7331 }, { "epoch": 2.4484888963099016, "grad_norm": 0.3190787506388644, "learning_rate": 9.934063607234202e-07, "loss": 0.0157, "step": 7332 }, { "epoch": 2.448822841876774, "grad_norm": 0.28657011536320304, "learning_rate": 9.922440881955298e-07, "loss": 0.0118, "step": 7333 }, { "epoch": 2.4491567874436466, "grad_norm": 0.3540342936916669, "learning_rate": 9.910824210863611e-07, "loss": 0.0171, "step": 7334 }, { "epoch": 2.4494907330105193, "grad_norm": 0.33985510555077725, "learning_rate": 9.899213595713935e-07, "loss": 0.024, "step": 7335 }, { "epoch": 2.449824678577392, "grad_norm": 0.24922446167159795, "learning_rate": 9.887609038260243e-07, "loss": 0.0142, "step": 7336 }, { "epoch": 2.4501586241442643, "grad_norm": 0.4667165619725214, "learning_rate": 9.876010540255504e-07, "loss": 0.0163, "step": 7337 }, { "epoch": 2.450492569711137, "grad_norm": 0.25087403523273744, "learning_rate": 9.86441810345183e-07, "loss": 0.0123, "step": 7338 }, { "epoch": 2.4508265152780098, "grad_norm": 0.2623893719815298, "learning_rate": 9.852831729600365e-07, "loss": 0.0131, "step": 7339 }, { "epoch": 2.451160460844882, "grad_norm": 0.18227982549195418, "learning_rate": 9.841251420451398e-07, "loss": 0.008, "step": 7340 }, { "epoch": 2.4514944064117548, "grad_norm": 0.2928098522990452, "learning_rate": 9.829677177754231e-07, "loss": 0.0142, "step": 7341 }, { "epoch": 2.4518283519786275, "grad_norm": 0.28737546887449433, "learning_rate": 9.818109003257348e-07, "loss": 0.0124, "step": 7342 }, { "epoch": 2.4521622975455, "grad_norm": 0.283085700704366, "learning_rate": 9.806546898708213e-07, "loss": 0.0089, "step": 7343 }, { "epoch": 2.452496243112373, "grad_norm": 0.2503307668814493, "learning_rate": 9.794990865853444e-07, "loss": 0.0098, "step": 7344 }, { "epoch": 2.452830188679245, "grad_norm": 0.3123889008674629, "learning_rate": 9.783440906438686e-07, "loss": 0.016, "step": 7345 }, { "epoch": 2.453164134246118, "grad_norm": 0.2523087173336739, "learning_rate": 9.771897022208732e-07, "loss": 0.0085, "step": 7346 }, { "epoch": 2.4534980798129906, "grad_norm": 0.2572594641327064, "learning_rate": 9.760359214907372e-07, "loss": 0.0114, "step": 7347 }, { "epoch": 2.453832025379863, "grad_norm": 0.3432933434003597, "learning_rate": 9.74882748627759e-07, "loss": 0.0113, "step": 7348 }, { "epoch": 2.4541659709467356, "grad_norm": 0.24869987251894454, "learning_rate": 9.737301838061342e-07, "loss": 0.0074, "step": 7349 }, { "epoch": 2.4544999165136083, "grad_norm": 0.2459174687004886, "learning_rate": 9.725782271999744e-07, "loss": 0.0105, "step": 7350 }, { "epoch": 2.454833862080481, "grad_norm": 0.2563920283285319, "learning_rate": 9.714268789832937e-07, "loss": 0.0125, "step": 7351 }, { "epoch": 2.4551678076473533, "grad_norm": 0.32344159898648145, "learning_rate": 9.702761393300176e-07, "loss": 0.0084, "step": 7352 }, { "epoch": 2.455501753214226, "grad_norm": 0.263201655566848, "learning_rate": 9.691260084139802e-07, "loss": 0.01, "step": 7353 }, { "epoch": 2.4558356987810988, "grad_norm": 0.2712607887343785, "learning_rate": 9.679764864089203e-07, "loss": 0.0132, "step": 7354 }, { "epoch": 2.4561696443479715, "grad_norm": 0.32113476946235286, "learning_rate": 9.668275734884885e-07, "loss": 0.0158, "step": 7355 }, { "epoch": 2.4565035899148437, "grad_norm": 0.28333905746829324, "learning_rate": 9.656792698262402e-07, "loss": 0.0119, "step": 7356 }, { "epoch": 2.4568375354817165, "grad_norm": 0.2698329597727835, "learning_rate": 9.645315755956413e-07, "loss": 0.0122, "step": 7357 }, { "epoch": 2.457171481048589, "grad_norm": 0.23963820262015254, "learning_rate": 9.633844909700618e-07, "loss": 0.013, "step": 7358 }, { "epoch": 2.4575054266154615, "grad_norm": 0.3335867582433281, "learning_rate": 9.622380161227873e-07, "loss": 0.0171, "step": 7359 }, { "epoch": 2.457839372182334, "grad_norm": 0.24621102816372686, "learning_rate": 9.61092151227002e-07, "loss": 0.0115, "step": 7360 }, { "epoch": 2.458173317749207, "grad_norm": 0.25744811295419195, "learning_rate": 9.599468964558051e-07, "loss": 0.0081, "step": 7361 }, { "epoch": 2.4585072633160796, "grad_norm": 0.2814718685341393, "learning_rate": 9.588022519821983e-07, "loss": 0.0226, "step": 7362 }, { "epoch": 2.4588412088829523, "grad_norm": 0.25557267803743267, "learning_rate": 9.576582179790967e-07, "loss": 0.0108, "step": 7363 }, { "epoch": 2.4591751544498246, "grad_norm": 0.25217261839201205, "learning_rate": 9.565147946193149e-07, "loss": 0.009, "step": 7364 }, { "epoch": 2.4595091000166973, "grad_norm": 0.39641517511590446, "learning_rate": 9.553719820755869e-07, "loss": 0.0188, "step": 7365 }, { "epoch": 2.45984304558357, "grad_norm": 0.2666369562077089, "learning_rate": 9.542297805205436e-07, "loss": 0.0131, "step": 7366 }, { "epoch": 2.4601769911504423, "grad_norm": 0.29185457548413524, "learning_rate": 9.530881901267308e-07, "loss": 0.0118, "step": 7367 }, { "epoch": 2.460510936717315, "grad_norm": 0.34129391396897135, "learning_rate": 9.519472110665967e-07, "loss": 0.015, "step": 7368 }, { "epoch": 2.4608448822841877, "grad_norm": 0.3359051402514027, "learning_rate": 9.508068435125012e-07, "loss": 0.0184, "step": 7369 }, { "epoch": 2.4611788278510605, "grad_norm": 0.3149923402450988, "learning_rate": 9.496670876367076e-07, "loss": 0.0131, "step": 7370 }, { "epoch": 2.4615127734179327, "grad_norm": 0.2595265333132604, "learning_rate": 9.485279436113942e-07, "loss": 0.0115, "step": 7371 }, { "epoch": 2.4618467189848054, "grad_norm": 0.3149118619492064, "learning_rate": 9.473894116086379e-07, "loss": 0.0146, "step": 7372 }, { "epoch": 2.462180664551678, "grad_norm": 0.2807543285827548, "learning_rate": 9.462514918004301e-07, "loss": 0.0113, "step": 7373 }, { "epoch": 2.462514610118551, "grad_norm": 0.37467502315809514, "learning_rate": 9.451141843586647e-07, "loss": 0.0135, "step": 7374 }, { "epoch": 2.462848555685423, "grad_norm": 0.259262591473988, "learning_rate": 9.439774894551479e-07, "loss": 0.0105, "step": 7375 }, { "epoch": 2.463182501252296, "grad_norm": 0.34799217970770124, "learning_rate": 9.428414072615877e-07, "loss": 0.0202, "step": 7376 }, { "epoch": 2.4635164468191686, "grad_norm": 0.2427597504087343, "learning_rate": 9.417059379496047e-07, "loss": 0.0096, "step": 7377 }, { "epoch": 2.463850392386041, "grad_norm": 0.2867707268451149, "learning_rate": 9.40571081690726e-07, "loss": 0.0111, "step": 7378 }, { "epoch": 2.4641843379529136, "grad_norm": 0.2845436020853542, "learning_rate": 9.394368386563823e-07, "loss": 0.0095, "step": 7379 }, { "epoch": 2.4645182835197863, "grad_norm": 0.29399836991743034, "learning_rate": 9.383032090179173e-07, "loss": 0.0152, "step": 7380 }, { "epoch": 2.464852229086659, "grad_norm": 0.228948456972105, "learning_rate": 9.371701929465759e-07, "loss": 0.008, "step": 7381 }, { "epoch": 2.4651861746535313, "grad_norm": 0.36012924008642466, "learning_rate": 9.360377906135148e-07, "loss": 0.0247, "step": 7382 }, { "epoch": 2.465520120220404, "grad_norm": 0.28176533973421675, "learning_rate": 9.349060021897976e-07, "loss": 0.0118, "step": 7383 }, { "epoch": 2.4658540657872767, "grad_norm": 0.2968451728673293, "learning_rate": 9.337748278463948e-07, "loss": 0.0136, "step": 7384 }, { "epoch": 2.4661880113541494, "grad_norm": 0.41312460562774683, "learning_rate": 9.326442677541813e-07, "loss": 0.0188, "step": 7385 }, { "epoch": 2.4665219569210217, "grad_norm": 0.32533184691092876, "learning_rate": 9.31514322083944e-07, "loss": 0.0149, "step": 7386 }, { "epoch": 2.4668559024878944, "grad_norm": 0.2682778800681192, "learning_rate": 9.303849910063717e-07, "loss": 0.014, "step": 7387 }, { "epoch": 2.467189848054767, "grad_norm": 0.34155137603953306, "learning_rate": 9.292562746920647e-07, "loss": 0.025, "step": 7388 }, { "epoch": 2.4675237936216394, "grad_norm": 0.30184382378474317, "learning_rate": 9.281281733115288e-07, "loss": 0.0127, "step": 7389 }, { "epoch": 2.467857739188512, "grad_norm": 0.28929079886766496, "learning_rate": 9.270006870351789e-07, "loss": 0.0133, "step": 7390 }, { "epoch": 2.468191684755385, "grad_norm": 0.3159966548414994, "learning_rate": 9.258738160333314e-07, "loss": 0.0197, "step": 7391 }, { "epoch": 2.4685256303222576, "grad_norm": 0.3834637689221866, "learning_rate": 9.247475604762168e-07, "loss": 0.018, "step": 7392 }, { "epoch": 2.4688595758891303, "grad_norm": 0.2406444693430883, "learning_rate": 9.236219205339647e-07, "loss": 0.0098, "step": 7393 }, { "epoch": 2.4691935214560026, "grad_norm": 0.24324079827551298, "learning_rate": 9.224968963766223e-07, "loss": 0.0124, "step": 7394 }, { "epoch": 2.4695274670228753, "grad_norm": 0.2855170662342513, "learning_rate": 9.213724881741337e-07, "loss": 0.011, "step": 7395 }, { "epoch": 2.469861412589748, "grad_norm": 0.2398212449249697, "learning_rate": 9.202486960963559e-07, "loss": 0.0116, "step": 7396 }, { "epoch": 2.4701953581566203, "grad_norm": 0.2823663592353502, "learning_rate": 9.191255203130489e-07, "loss": 0.0149, "step": 7397 }, { "epoch": 2.470529303723493, "grad_norm": 0.2707020008992273, "learning_rate": 9.18002960993884e-07, "loss": 0.0138, "step": 7398 }, { "epoch": 2.4708632492903657, "grad_norm": 0.2717785560876566, "learning_rate": 9.168810183084348e-07, "loss": 0.0158, "step": 7399 }, { "epoch": 2.4711971948572384, "grad_norm": 0.2772250103816418, "learning_rate": 9.157596924261847e-07, "loss": 0.0113, "step": 7400 }, { "epoch": 2.4715311404241107, "grad_norm": 0.35576545877944604, "learning_rate": 9.146389835165248e-07, "loss": 0.0142, "step": 7401 }, { "epoch": 2.4718650859909834, "grad_norm": 0.2481302215494884, "learning_rate": 9.135188917487487e-07, "loss": 0.0088, "step": 7402 }, { "epoch": 2.472199031557856, "grad_norm": 0.23749219989866024, "learning_rate": 9.12399417292062e-07, "loss": 0.0082, "step": 7403 }, { "epoch": 2.472532977124729, "grad_norm": 0.2543245478587539, "learning_rate": 9.112805603155716e-07, "loss": 0.0121, "step": 7404 }, { "epoch": 2.472866922691601, "grad_norm": 0.5376307491249888, "learning_rate": 9.101623209882965e-07, "loss": 0.0158, "step": 7405 }, { "epoch": 2.473200868258474, "grad_norm": 0.26655967629657945, "learning_rate": 9.090446994791585e-07, "loss": 0.0147, "step": 7406 }, { "epoch": 2.4735348138253466, "grad_norm": 0.4096313618904711, "learning_rate": 9.079276959569899e-07, "loss": 0.0204, "step": 7407 }, { "epoch": 2.473868759392219, "grad_norm": 0.29010719357614445, "learning_rate": 9.068113105905235e-07, "loss": 0.0132, "step": 7408 }, { "epoch": 2.4742027049590916, "grad_norm": 0.2664435999033433, "learning_rate": 9.056955435484061e-07, "loss": 0.0112, "step": 7409 }, { "epoch": 2.4745366505259643, "grad_norm": 0.31839424122550386, "learning_rate": 9.045803949991843e-07, "loss": 0.0134, "step": 7410 }, { "epoch": 2.474870596092837, "grad_norm": 0.22594237538056391, "learning_rate": 9.034658651113154e-07, "loss": 0.0105, "step": 7411 }, { "epoch": 2.4752045416597097, "grad_norm": 0.32982060721670076, "learning_rate": 9.023519540531633e-07, "loss": 0.0206, "step": 7412 }, { "epoch": 2.475538487226582, "grad_norm": 0.22372970550593976, "learning_rate": 9.01238661992998e-07, "loss": 0.0111, "step": 7413 }, { "epoch": 2.4758724327934547, "grad_norm": 0.2198342491823257, "learning_rate": 9.001259890989927e-07, "loss": 0.01, "step": 7414 }, { "epoch": 2.4762063783603274, "grad_norm": 0.25854013340820664, "learning_rate": 8.990139355392324e-07, "loss": 0.0098, "step": 7415 }, { "epoch": 2.4765403239271997, "grad_norm": 0.2059290500377877, "learning_rate": 8.979025014817039e-07, "loss": 0.0071, "step": 7416 }, { "epoch": 2.4768742694940724, "grad_norm": 0.24791823447152975, "learning_rate": 8.967916870943028e-07, "loss": 0.0105, "step": 7417 }, { "epoch": 2.477208215060945, "grad_norm": 0.2267840483916544, "learning_rate": 8.956814925448309e-07, "loss": 0.012, "step": 7418 }, { "epoch": 2.477542160627818, "grad_norm": 0.27758594398830816, "learning_rate": 8.945719180009977e-07, "loss": 0.0131, "step": 7419 }, { "epoch": 2.47787610619469, "grad_norm": 0.2643542821536824, "learning_rate": 8.934629636304149e-07, "loss": 0.0117, "step": 7420 }, { "epoch": 2.478210051761563, "grad_norm": 0.5148591572402273, "learning_rate": 8.923546296006058e-07, "loss": 0.029, "step": 7421 }, { "epoch": 2.4785439973284356, "grad_norm": 0.2317178743468494, "learning_rate": 8.912469160789944e-07, "loss": 0.0094, "step": 7422 }, { "epoch": 2.4788779428953083, "grad_norm": 0.3059872414456733, "learning_rate": 8.901398232329156e-07, "loss": 0.0127, "step": 7423 }, { "epoch": 2.4792118884621805, "grad_norm": 0.32666304109913874, "learning_rate": 8.890333512296095e-07, "loss": 0.0137, "step": 7424 }, { "epoch": 2.4795458340290533, "grad_norm": 0.2492647478725229, "learning_rate": 8.879275002362197e-07, "loss": 0.0113, "step": 7425 }, { "epoch": 2.479879779595926, "grad_norm": 0.5699061430872919, "learning_rate": 8.868222704198004e-07, "loss": 0.0153, "step": 7426 }, { "epoch": 2.4802137251627983, "grad_norm": 0.32537831526187644, "learning_rate": 8.857176619473068e-07, "loss": 0.0133, "step": 7427 }, { "epoch": 2.480547670729671, "grad_norm": 0.2645468568701481, "learning_rate": 8.846136749856044e-07, "loss": 0.0106, "step": 7428 }, { "epoch": 2.4808816162965437, "grad_norm": 0.28970387862449526, "learning_rate": 8.835103097014636e-07, "loss": 0.0119, "step": 7429 }, { "epoch": 2.4812155618634164, "grad_norm": 0.3264385633712302, "learning_rate": 8.824075662615617e-07, "loss": 0.0164, "step": 7430 }, { "epoch": 2.4815495074302887, "grad_norm": 0.3392268866713018, "learning_rate": 8.813054448324792e-07, "loss": 0.0216, "step": 7431 }, { "epoch": 2.4818834529971614, "grad_norm": 0.24878933053929583, "learning_rate": 8.80203945580706e-07, "loss": 0.0111, "step": 7432 }, { "epoch": 2.482217398564034, "grad_norm": 0.3115663099998713, "learning_rate": 8.791030686726349e-07, "loss": 0.0133, "step": 7433 }, { "epoch": 2.482551344130907, "grad_norm": 0.2686470558722783, "learning_rate": 8.780028142745673e-07, "loss": 0.013, "step": 7434 }, { "epoch": 2.482885289697779, "grad_norm": 0.26978008421913086, "learning_rate": 8.769031825527097e-07, "loss": 0.0111, "step": 7435 }, { "epoch": 2.483219235264652, "grad_norm": 0.3637520293394349, "learning_rate": 8.758041736731753e-07, "loss": 0.0114, "step": 7436 }, { "epoch": 2.4835531808315245, "grad_norm": 0.27255003830385555, "learning_rate": 8.747057878019799e-07, "loss": 0.014, "step": 7437 }, { "epoch": 2.483887126398397, "grad_norm": 0.2747134069292236, "learning_rate": 8.736080251050505e-07, "loss": 0.0103, "step": 7438 }, { "epoch": 2.4842210719652695, "grad_norm": 0.2372651711736165, "learning_rate": 8.725108857482145e-07, "loss": 0.0101, "step": 7439 }, { "epoch": 2.4845550175321423, "grad_norm": 0.3305539702320851, "learning_rate": 8.714143698972083e-07, "loss": 0.0133, "step": 7440 }, { "epoch": 2.484888963099015, "grad_norm": 0.2739958689199965, "learning_rate": 8.703184777176743e-07, "loss": 0.0093, "step": 7441 }, { "epoch": 2.4852229086658877, "grad_norm": 0.23830424327958363, "learning_rate": 8.692232093751613e-07, "loss": 0.011, "step": 7442 }, { "epoch": 2.48555685423276, "grad_norm": 0.3024521958358322, "learning_rate": 8.68128565035119e-07, "loss": 0.0179, "step": 7443 }, { "epoch": 2.4858907997996327, "grad_norm": 0.26137383555521065, "learning_rate": 8.670345448629097e-07, "loss": 0.0092, "step": 7444 }, { "epoch": 2.4862247453665054, "grad_norm": 0.2872450507530698, "learning_rate": 8.659411490237951e-07, "loss": 0.0166, "step": 7445 }, { "epoch": 2.4865586909333777, "grad_norm": 0.37505759068410366, "learning_rate": 8.648483776829469e-07, "loss": 0.0167, "step": 7446 }, { "epoch": 2.4868926365002504, "grad_norm": 0.26292564056472145, "learning_rate": 8.637562310054425e-07, "loss": 0.0131, "step": 7447 }, { "epoch": 2.487226582067123, "grad_norm": 0.28773112340703616, "learning_rate": 8.626647091562612e-07, "loss": 0.0184, "step": 7448 }, { "epoch": 2.487560527633996, "grad_norm": 0.2730958661513948, "learning_rate": 8.61573812300292e-07, "loss": 0.0106, "step": 7449 }, { "epoch": 2.487894473200868, "grad_norm": 0.23593585732233188, "learning_rate": 8.604835406023254e-07, "loss": 0.0091, "step": 7450 }, { "epoch": 2.488228418767741, "grad_norm": 0.22218945927555991, "learning_rate": 8.593938942270613e-07, "loss": 0.0105, "step": 7451 }, { "epoch": 2.4885623643346135, "grad_norm": 0.2848038418982785, "learning_rate": 8.583048733391036e-07, "loss": 0.0127, "step": 7452 }, { "epoch": 2.4888963099014862, "grad_norm": 0.28377579944649034, "learning_rate": 8.57216478102963e-07, "loss": 0.0124, "step": 7453 }, { "epoch": 2.4892302554683585, "grad_norm": 0.24586441459462724, "learning_rate": 8.561287086830516e-07, "loss": 0.0083, "step": 7454 }, { "epoch": 2.4895642010352312, "grad_norm": 0.2367840812098278, "learning_rate": 8.550415652436927e-07, "loss": 0.0075, "step": 7455 }, { "epoch": 2.489898146602104, "grad_norm": 0.2748449301147865, "learning_rate": 8.539550479491093e-07, "loss": 0.0171, "step": 7456 }, { "epoch": 2.4902320921689762, "grad_norm": 0.2958625824272446, "learning_rate": 8.528691569634357e-07, "loss": 0.019, "step": 7457 }, { "epoch": 2.490566037735849, "grad_norm": 0.33993780967279436, "learning_rate": 8.517838924507039e-07, "loss": 0.0151, "step": 7458 }, { "epoch": 2.4908999833027217, "grad_norm": 0.318394649387788, "learning_rate": 8.50699254574861e-07, "loss": 0.0171, "step": 7459 }, { "epoch": 2.4912339288695944, "grad_norm": 0.2135113378957332, "learning_rate": 8.496152434997518e-07, "loss": 0.0096, "step": 7460 }, { "epoch": 2.491567874436467, "grad_norm": 0.26150100790717984, "learning_rate": 8.485318593891295e-07, "loss": 0.0112, "step": 7461 }, { "epoch": 2.4919018200033394, "grad_norm": 0.5438477535756069, "learning_rate": 8.474491024066512e-07, "loss": 0.0295, "step": 7462 }, { "epoch": 2.492235765570212, "grad_norm": 0.2324033045906197, "learning_rate": 8.463669727158819e-07, "loss": 0.0104, "step": 7463 }, { "epoch": 2.492569711137085, "grad_norm": 0.2868012703797143, "learning_rate": 8.45285470480286e-07, "loss": 0.0128, "step": 7464 }, { "epoch": 2.492903656703957, "grad_norm": 0.34009350810193995, "learning_rate": 8.442045958632428e-07, "loss": 0.0119, "step": 7465 }, { "epoch": 2.49323760227083, "grad_norm": 0.4513546666143164, "learning_rate": 8.431243490280267e-07, "loss": 0.0266, "step": 7466 }, { "epoch": 2.4935715478377025, "grad_norm": 0.19228656975813063, "learning_rate": 8.420447301378249e-07, "loss": 0.0058, "step": 7467 }, { "epoch": 2.4939054934045752, "grad_norm": 0.26433361673700384, "learning_rate": 8.409657393557236e-07, "loss": 0.0148, "step": 7468 }, { "epoch": 2.4942394389714475, "grad_norm": 0.36446607846846146, "learning_rate": 8.39887376844718e-07, "loss": 0.0166, "step": 7469 }, { "epoch": 2.4945733845383202, "grad_norm": 0.31405048361481996, "learning_rate": 8.388096427677095e-07, "loss": 0.0167, "step": 7470 }, { "epoch": 2.494907330105193, "grad_norm": 0.32071936783487687, "learning_rate": 8.377325372874995e-07, "loss": 0.0162, "step": 7471 }, { "epoch": 2.4952412756720657, "grad_norm": 0.26660919712543363, "learning_rate": 8.366560605668006e-07, "loss": 0.0092, "step": 7472 }, { "epoch": 2.495575221238938, "grad_norm": 0.2847134033925736, "learning_rate": 8.355802127682238e-07, "loss": 0.0115, "step": 7473 }, { "epoch": 2.4959091668058107, "grad_norm": 0.26420205422985205, "learning_rate": 8.345049940542904e-07, "loss": 0.0088, "step": 7474 }, { "epoch": 2.4962431123726834, "grad_norm": 0.26939620063837594, "learning_rate": 8.334304045874248e-07, "loss": 0.0112, "step": 7475 }, { "epoch": 2.4965770579395556, "grad_norm": 0.21972676943472785, "learning_rate": 8.323564445299575e-07, "loss": 0.0089, "step": 7476 }, { "epoch": 2.4969110035064284, "grad_norm": 0.21377191805667578, "learning_rate": 8.312831140441207e-07, "loss": 0.0075, "step": 7477 }, { "epoch": 2.497244949073301, "grad_norm": 0.21299621422449647, "learning_rate": 8.302104132920552e-07, "loss": 0.0079, "step": 7478 }, { "epoch": 2.497578894640174, "grad_norm": 0.23024827188708863, "learning_rate": 8.291383424358041e-07, "loss": 0.0108, "step": 7479 }, { "epoch": 2.497912840207046, "grad_norm": 0.33857903181597176, "learning_rate": 8.280669016373172e-07, "loss": 0.0172, "step": 7480 }, { "epoch": 2.498246785773919, "grad_norm": 0.26519779138516475, "learning_rate": 8.269960910584457e-07, "loss": 0.0134, "step": 7481 }, { "epoch": 2.4985807313407915, "grad_norm": 0.33444793311586507, "learning_rate": 8.259259108609524e-07, "loss": 0.0189, "step": 7482 }, { "epoch": 2.4989146769076642, "grad_norm": 0.25943343687083725, "learning_rate": 8.248563612064969e-07, "loss": 0.0103, "step": 7483 }, { "epoch": 2.4992486224745365, "grad_norm": 0.30237306680255943, "learning_rate": 8.237874422566505e-07, "loss": 0.0186, "step": 7484 }, { "epoch": 2.499582568041409, "grad_norm": 0.2649511850101908, "learning_rate": 8.227191541728829e-07, "loss": 0.0139, "step": 7485 }, { "epoch": 2.499916513608282, "grad_norm": 0.32639988819096716, "learning_rate": 8.21651497116574e-07, "loss": 0.0125, "step": 7486 }, { "epoch": 2.500250459175154, "grad_norm": 0.33547525147974094, "learning_rate": 8.205844712490024e-07, "loss": 0.0135, "step": 7487 }, { "epoch": 2.500584404742027, "grad_norm": 0.2617696819554997, "learning_rate": 8.195180767313604e-07, "loss": 0.0099, "step": 7488 }, { "epoch": 2.5009183503088996, "grad_norm": 0.3643538060755637, "learning_rate": 8.184523137247346e-07, "loss": 0.0177, "step": 7489 }, { "epoch": 2.5012522958757724, "grad_norm": 0.30941708656489425, "learning_rate": 8.173871823901247e-07, "loss": 0.012, "step": 7490 }, { "epoch": 2.501586241442645, "grad_norm": 0.32032174298152893, "learning_rate": 8.16322682888428e-07, "loss": 0.0187, "step": 7491 }, { "epoch": 2.5019201870095173, "grad_norm": 0.27503930542634536, "learning_rate": 8.15258815380453e-07, "loss": 0.0174, "step": 7492 }, { "epoch": 2.50225413257639, "grad_norm": 0.2863134875388935, "learning_rate": 8.141955800269058e-07, "loss": 0.0144, "step": 7493 }, { "epoch": 2.502588078143263, "grad_norm": 0.3006021417430872, "learning_rate": 8.131329769884027e-07, "loss": 0.0118, "step": 7494 }, { "epoch": 2.502922023710135, "grad_norm": 0.2758410721079843, "learning_rate": 8.120710064254634e-07, "loss": 0.0118, "step": 7495 }, { "epoch": 2.5032559692770078, "grad_norm": 0.3139761802139935, "learning_rate": 8.110096684985086e-07, "loss": 0.0154, "step": 7496 }, { "epoch": 2.5035899148438805, "grad_norm": 0.2696734721830606, "learning_rate": 8.099489633678676e-07, "loss": 0.0151, "step": 7497 }, { "epoch": 2.503923860410753, "grad_norm": 0.3142316456179232, "learning_rate": 8.088888911937726e-07, "loss": 0.0158, "step": 7498 }, { "epoch": 2.5042578059776255, "grad_norm": 0.4117660260331043, "learning_rate": 8.078294521363584e-07, "loss": 0.0259, "step": 7499 }, { "epoch": 2.504591751544498, "grad_norm": 0.3236151848025178, "learning_rate": 8.067706463556663e-07, "loss": 0.0125, "step": 7500 }, { "epoch": 2.504925697111371, "grad_norm": 0.26476105995267335, "learning_rate": 8.057124740116434e-07, "loss": 0.0174, "step": 7501 }, { "epoch": 2.5052596426782436, "grad_norm": 0.21317169985024756, "learning_rate": 8.046549352641359e-07, "loss": 0.0084, "step": 7502 }, { "epoch": 2.505593588245116, "grad_norm": 0.2718079570234273, "learning_rate": 8.035980302729008e-07, "loss": 0.0141, "step": 7503 }, { "epoch": 2.5059275338119886, "grad_norm": 0.26982581170003705, "learning_rate": 8.025417591975926e-07, "loss": 0.0096, "step": 7504 }, { "epoch": 2.5062614793788613, "grad_norm": 0.2680654971650118, "learning_rate": 8.014861221977749e-07, "loss": 0.0146, "step": 7505 }, { "epoch": 2.5065954249457336, "grad_norm": 0.3180238980190388, "learning_rate": 8.004311194329145e-07, "loss": 0.0152, "step": 7506 }, { "epoch": 2.5069293705126063, "grad_norm": 0.24209967918279646, "learning_rate": 7.993767510623834e-07, "loss": 0.0112, "step": 7507 }, { "epoch": 2.507263316079479, "grad_norm": 0.3272298526452807, "learning_rate": 7.983230172454531e-07, "loss": 0.018, "step": 7508 }, { "epoch": 2.5075972616463518, "grad_norm": 0.27835864475758876, "learning_rate": 7.972699181413058e-07, "loss": 0.012, "step": 7509 }, { "epoch": 2.5079312072132245, "grad_norm": 0.27716962050594207, "learning_rate": 7.962174539090201e-07, "loss": 0.0104, "step": 7510 }, { "epoch": 2.5082651527800968, "grad_norm": 0.26851036744495926, "learning_rate": 7.951656247075884e-07, "loss": 0.0174, "step": 7511 }, { "epoch": 2.5085990983469695, "grad_norm": 0.3936971307343381, "learning_rate": 7.941144306958986e-07, "loss": 0.0266, "step": 7512 }, { "epoch": 2.508933043913842, "grad_norm": 0.38525609535037336, "learning_rate": 7.930638720327477e-07, "loss": 0.0187, "step": 7513 }, { "epoch": 2.5092669894807145, "grad_norm": 0.30054222328034796, "learning_rate": 7.920139488768325e-07, "loss": 0.0158, "step": 7514 }, { "epoch": 2.509600935047587, "grad_norm": 0.36527045291501914, "learning_rate": 7.909646613867594e-07, "loss": 0.0135, "step": 7515 }, { "epoch": 2.50993488061446, "grad_norm": 0.28270674511044247, "learning_rate": 7.899160097210329e-07, "loss": 0.0127, "step": 7516 }, { "epoch": 2.510268826181332, "grad_norm": 0.26290850558982126, "learning_rate": 7.888679940380644e-07, "loss": 0.0108, "step": 7517 }, { "epoch": 2.510602771748205, "grad_norm": 0.30135195085473543, "learning_rate": 7.87820614496172e-07, "loss": 0.0286, "step": 7518 }, { "epoch": 2.5109367173150776, "grad_norm": 0.30532503578447345, "learning_rate": 7.867738712535711e-07, "loss": 0.0137, "step": 7519 }, { "epoch": 2.5112706628819503, "grad_norm": 0.2941924862667987, "learning_rate": 7.857277644683858e-07, "loss": 0.0193, "step": 7520 }, { "epoch": 2.511604608448823, "grad_norm": 0.26657489483577435, "learning_rate": 7.846822942986449e-07, "loss": 0.0104, "step": 7521 }, { "epoch": 2.5119385540156953, "grad_norm": 0.34821001515380123, "learning_rate": 7.836374609022756e-07, "loss": 0.0162, "step": 7522 }, { "epoch": 2.512272499582568, "grad_norm": 0.25992029250167054, "learning_rate": 7.825932644371137e-07, "loss": 0.0119, "step": 7523 }, { "epoch": 2.5126064451494408, "grad_norm": 0.26989945031975804, "learning_rate": 7.815497050608989e-07, "loss": 0.0107, "step": 7524 }, { "epoch": 2.512940390716313, "grad_norm": 0.284167749975026, "learning_rate": 7.805067829312707e-07, "loss": 0.0131, "step": 7525 }, { "epoch": 2.5132743362831858, "grad_norm": 0.26881029588972233, "learning_rate": 7.79464498205777e-07, "loss": 0.0137, "step": 7526 }, { "epoch": 2.5136082818500585, "grad_norm": 0.28799350090633236, "learning_rate": 7.78422851041865e-07, "loss": 0.0124, "step": 7527 }, { "epoch": 2.513942227416931, "grad_norm": 0.347891544805565, "learning_rate": 7.773818415968887e-07, "loss": 0.0134, "step": 7528 }, { "epoch": 2.514276172983804, "grad_norm": 0.2970741533603355, "learning_rate": 7.763414700281053e-07, "loss": 0.0119, "step": 7529 }, { "epoch": 2.514610118550676, "grad_norm": 0.3539225000276794, "learning_rate": 7.753017364926757e-07, "loss": 0.0208, "step": 7530 }, { "epoch": 2.514944064117549, "grad_norm": 0.33629761066323144, "learning_rate": 7.742626411476617e-07, "loss": 0.0134, "step": 7531 }, { "epoch": 2.5152780096844216, "grad_norm": 0.28155100990311854, "learning_rate": 7.732241841500332e-07, "loss": 0.0137, "step": 7532 }, { "epoch": 2.515611955251294, "grad_norm": 0.2659163733742089, "learning_rate": 7.721863656566597e-07, "loss": 0.0128, "step": 7533 }, { "epoch": 2.5159459008181666, "grad_norm": 0.24208404243411683, "learning_rate": 7.711491858243164e-07, "loss": 0.0138, "step": 7534 }, { "epoch": 2.5162798463850393, "grad_norm": 0.26105239540569763, "learning_rate": 7.701126448096813e-07, "loss": 0.0101, "step": 7535 }, { "epoch": 2.5166137919519116, "grad_norm": 0.2802534568300955, "learning_rate": 7.69076742769338e-07, "loss": 0.0126, "step": 7536 }, { "epoch": 2.5169477375187843, "grad_norm": 0.24357848550385142, "learning_rate": 7.68041479859769e-07, "loss": 0.0154, "step": 7537 }, { "epoch": 2.517281683085657, "grad_norm": 0.35228867344053577, "learning_rate": 7.670068562373656e-07, "loss": 0.0183, "step": 7538 }, { "epoch": 2.5176156286525297, "grad_norm": 0.332641741046175, "learning_rate": 7.65972872058417e-07, "loss": 0.0135, "step": 7539 }, { "epoch": 2.5179495742194025, "grad_norm": 0.20734832982401147, "learning_rate": 7.6493952747912e-07, "loss": 0.0084, "step": 7540 }, { "epoch": 2.5182835197862747, "grad_norm": 0.23680939621094196, "learning_rate": 7.639068226555751e-07, "loss": 0.0116, "step": 7541 }, { "epoch": 2.5186174653531475, "grad_norm": 0.24809204560322856, "learning_rate": 7.628747577437817e-07, "loss": 0.0106, "step": 7542 }, { "epoch": 2.51895141092002, "grad_norm": 0.24711898252742784, "learning_rate": 7.618433328996466e-07, "loss": 0.0106, "step": 7543 }, { "epoch": 2.5192853564868924, "grad_norm": 0.39365759248803117, "learning_rate": 7.608125482789802e-07, "loss": 0.0192, "step": 7544 }, { "epoch": 2.519619302053765, "grad_norm": 0.41108589361119663, "learning_rate": 7.597824040374918e-07, "loss": 0.02, "step": 7545 }, { "epoch": 2.519953247620638, "grad_norm": 0.19765272619035332, "learning_rate": 7.587529003307981e-07, "loss": 0.0081, "step": 7546 }, { "epoch": 2.5202871931875106, "grad_norm": 0.31210370871376003, "learning_rate": 7.57724037314419e-07, "loss": 0.0157, "step": 7547 }, { "epoch": 2.520621138754383, "grad_norm": 0.3066968391693989, "learning_rate": 7.566958151437743e-07, "loss": 0.0146, "step": 7548 }, { "epoch": 2.5209550843212556, "grad_norm": 0.299597242559998, "learning_rate": 7.556682339741911e-07, "loss": 0.0159, "step": 7549 }, { "epoch": 2.5212890298881283, "grad_norm": 0.21504881144512858, "learning_rate": 7.546412939608955e-07, "loss": 0.0096, "step": 7550 }, { "epoch": 2.521622975455001, "grad_norm": 0.46879823634392637, "learning_rate": 7.5361499525902e-07, "loss": 0.0323, "step": 7551 }, { "epoch": 2.5219569210218733, "grad_norm": 0.22235446846648538, "learning_rate": 7.525893380235988e-07, "loss": 0.0082, "step": 7552 }, { "epoch": 2.522290866588746, "grad_norm": 0.32767128932311534, "learning_rate": 7.515643224095709e-07, "loss": 0.0212, "step": 7553 }, { "epoch": 2.5226248121556187, "grad_norm": 0.3528000327967212, "learning_rate": 7.505399485717746e-07, "loss": 0.0254, "step": 7554 }, { "epoch": 2.522958757722491, "grad_norm": 0.2639725899059416, "learning_rate": 7.495162166649561e-07, "loss": 0.0125, "step": 7555 }, { "epoch": 2.5232927032893637, "grad_norm": 0.3484569359303812, "learning_rate": 7.484931268437595e-07, "loss": 0.0127, "step": 7556 }, { "epoch": 2.5236266488562364, "grad_norm": 0.21416052591129747, "learning_rate": 7.474706792627362e-07, "loss": 0.0139, "step": 7557 }, { "epoch": 2.523960594423109, "grad_norm": 0.2237026002440248, "learning_rate": 7.464488740763387e-07, "loss": 0.01, "step": 7558 }, { "epoch": 2.524294539989982, "grad_norm": 0.30163527901392606, "learning_rate": 7.454277114389241e-07, "loss": 0.0119, "step": 7559 }, { "epoch": 2.524628485556854, "grad_norm": 0.3235806606250762, "learning_rate": 7.444071915047479e-07, "loss": 0.0148, "step": 7560 }, { "epoch": 2.524962431123727, "grad_norm": 0.2672013720760013, "learning_rate": 7.433873144279751e-07, "loss": 0.0124, "step": 7561 }, { "epoch": 2.5252963766905996, "grad_norm": 0.4594222545439994, "learning_rate": 7.42368080362667e-07, "loss": 0.0211, "step": 7562 }, { "epoch": 2.525630322257472, "grad_norm": 0.2933518960107444, "learning_rate": 7.413494894627926e-07, "loss": 0.0135, "step": 7563 }, { "epoch": 2.5259642678243446, "grad_norm": 0.26549948183498945, "learning_rate": 7.403315418822215e-07, "loss": 0.0127, "step": 7564 }, { "epoch": 2.5262982133912173, "grad_norm": 0.29135788599720286, "learning_rate": 7.393142377747287e-07, "loss": 0.0143, "step": 7565 }, { "epoch": 2.5266321589580896, "grad_norm": 0.2077255763095782, "learning_rate": 7.382975772939866e-07, "loss": 0.0088, "step": 7566 }, { "epoch": 2.5269661045249623, "grad_norm": 0.29887337214834014, "learning_rate": 7.372815605935763e-07, "loss": 0.0193, "step": 7567 }, { "epoch": 2.527300050091835, "grad_norm": 0.29036737874299484, "learning_rate": 7.362661878269772e-07, "loss": 0.0162, "step": 7568 }, { "epoch": 2.5276339956587077, "grad_norm": 0.28973666902738576, "learning_rate": 7.352514591475746e-07, "loss": 0.0156, "step": 7569 }, { "epoch": 2.5279679412255804, "grad_norm": 0.5974911089777487, "learning_rate": 7.342373747086557e-07, "loss": 0.0222, "step": 7570 }, { "epoch": 2.5283018867924527, "grad_norm": 0.2353122906771022, "learning_rate": 7.332239346634079e-07, "loss": 0.0081, "step": 7571 }, { "epoch": 2.5286358323593254, "grad_norm": 0.27092979239073095, "learning_rate": 7.322111391649261e-07, "loss": 0.0155, "step": 7572 }, { "epoch": 2.528969777926198, "grad_norm": 0.262717083733501, "learning_rate": 7.311989883662018e-07, "loss": 0.0167, "step": 7573 }, { "epoch": 2.5293037234930704, "grad_norm": 0.5500810831817916, "learning_rate": 7.301874824201349e-07, "loss": 0.0178, "step": 7574 }, { "epoch": 2.529637669059943, "grad_norm": 0.31262105886576164, "learning_rate": 7.29176621479522e-07, "loss": 0.0135, "step": 7575 }, { "epoch": 2.529971614626816, "grad_norm": 0.3606297777899301, "learning_rate": 7.2816640569707e-07, "loss": 0.0207, "step": 7576 }, { "epoch": 2.5303055601936886, "grad_norm": 0.2674616062185029, "learning_rate": 7.271568352253804e-07, "loss": 0.0112, "step": 7577 }, { "epoch": 2.5306395057605613, "grad_norm": 0.3352255147195767, "learning_rate": 7.261479102169627e-07, "loss": 0.0131, "step": 7578 }, { "epoch": 2.5309734513274336, "grad_norm": 0.24186772682285884, "learning_rate": 7.251396308242259e-07, "loss": 0.0077, "step": 7579 }, { "epoch": 2.5313073968943063, "grad_norm": 0.2645461840386459, "learning_rate": 7.241319971994831e-07, "loss": 0.0123, "step": 7580 }, { "epoch": 2.531641342461179, "grad_norm": 0.1929282593891482, "learning_rate": 7.231250094949472e-07, "loss": 0.0076, "step": 7581 }, { "epoch": 2.5319752880280513, "grad_norm": 0.36436333781062513, "learning_rate": 7.221186678627389e-07, "loss": 0.0164, "step": 7582 }, { "epoch": 2.532309233594924, "grad_norm": 0.3421473823759679, "learning_rate": 7.211129724548754e-07, "loss": 0.0193, "step": 7583 }, { "epoch": 2.5326431791617967, "grad_norm": 0.2593095570302387, "learning_rate": 7.201079234232805e-07, "loss": 0.0116, "step": 7584 }, { "epoch": 2.532977124728669, "grad_norm": 0.28797119661021686, "learning_rate": 7.191035209197772e-07, "loss": 0.0164, "step": 7585 }, { "epoch": 2.5333110702955417, "grad_norm": 0.3690058847072772, "learning_rate": 7.180997650960936e-07, "loss": 0.0241, "step": 7586 }, { "epoch": 2.5336450158624144, "grad_norm": 0.3042760633825699, "learning_rate": 7.170966561038561e-07, "loss": 0.0159, "step": 7587 }, { "epoch": 2.533978961429287, "grad_norm": 0.3038722096925673, "learning_rate": 7.160941940946009e-07, "loss": 0.0136, "step": 7588 }, { "epoch": 2.53431290699616, "grad_norm": 0.36788491784806165, "learning_rate": 7.150923792197579e-07, "loss": 0.0184, "step": 7589 }, { "epoch": 2.534646852563032, "grad_norm": 0.26606150918988686, "learning_rate": 7.140912116306648e-07, "loss": 0.009, "step": 7590 }, { "epoch": 2.534980798129905, "grad_norm": 0.3430438676656165, "learning_rate": 7.130906914785585e-07, "loss": 0.0113, "step": 7591 }, { "epoch": 2.5353147436967776, "grad_norm": 0.27256466236182747, "learning_rate": 7.120908189145798e-07, "loss": 0.0107, "step": 7592 }, { "epoch": 2.53564868926365, "grad_norm": 0.28485656773511575, "learning_rate": 7.110915940897722e-07, "loss": 0.0095, "step": 7593 }, { "epoch": 2.5359826348305226, "grad_norm": 0.3147528853222281, "learning_rate": 7.100930171550785e-07, "loss": 0.0186, "step": 7594 }, { "epoch": 2.5363165803973953, "grad_norm": 0.30007872100734273, "learning_rate": 7.090950882613479e-07, "loss": 0.0131, "step": 7595 }, { "epoch": 2.536650525964268, "grad_norm": 0.3003483182019013, "learning_rate": 7.08097807559327e-07, "loss": 0.0141, "step": 7596 }, { "epoch": 2.5369844715311403, "grad_norm": 0.27311366686798244, "learning_rate": 7.071011751996687e-07, "loss": 0.0093, "step": 7597 }, { "epoch": 2.537318417098013, "grad_norm": 0.33483066735765116, "learning_rate": 7.061051913329231e-07, "loss": 0.0219, "step": 7598 }, { "epoch": 2.5376523626648857, "grad_norm": 0.27756377119210085, "learning_rate": 7.051098561095493e-07, "loss": 0.0129, "step": 7599 }, { "epoch": 2.5379863082317584, "grad_norm": 0.2838138355904635, "learning_rate": 7.041151696799014e-07, "loss": 0.013, "step": 7600 }, { "epoch": 2.5383202537986307, "grad_norm": 0.2870240267803289, "learning_rate": 7.031211321942405e-07, "loss": 0.0124, "step": 7601 }, { "epoch": 2.5386541993655034, "grad_norm": 0.22704462870237718, "learning_rate": 7.021277438027258e-07, "loss": 0.0104, "step": 7602 }, { "epoch": 2.538988144932376, "grad_norm": 0.43760637475478004, "learning_rate": 7.011350046554227e-07, "loss": 0.0151, "step": 7603 }, { "epoch": 2.5393220904992484, "grad_norm": 0.32335918948087855, "learning_rate": 7.001429149022915e-07, "loss": 0.0196, "step": 7604 }, { "epoch": 2.539656036066121, "grad_norm": 0.24849127560724107, "learning_rate": 6.991514746932048e-07, "loss": 0.0093, "step": 7605 }, { "epoch": 2.539989981632994, "grad_norm": 0.289605915782507, "learning_rate": 6.981606841779281e-07, "loss": 0.0124, "step": 7606 }, { "epoch": 2.5403239271998665, "grad_norm": 0.2405093080198529, "learning_rate": 6.971705435061333e-07, "loss": 0.0093, "step": 7607 }, { "epoch": 2.5406578727667393, "grad_norm": 0.365772313700415, "learning_rate": 6.96181052827391e-07, "loss": 0.0189, "step": 7608 }, { "epoch": 2.5409918183336115, "grad_norm": 0.29299179302693745, "learning_rate": 6.951922122911775e-07, "loss": 0.0129, "step": 7609 }, { "epoch": 2.5413257639004843, "grad_norm": 0.21477693270870274, "learning_rate": 6.942040220468654e-07, "loss": 0.0098, "step": 7610 }, { "epoch": 2.541659709467357, "grad_norm": 0.2627015365413541, "learning_rate": 6.932164822437371e-07, "loss": 0.0092, "step": 7611 }, { "epoch": 2.5419936550342292, "grad_norm": 0.3053830771654161, "learning_rate": 6.922295930309691e-07, "loss": 0.0128, "step": 7612 }, { "epoch": 2.542327600601102, "grad_norm": 0.31507398162041766, "learning_rate": 6.912433545576446e-07, "loss": 0.013, "step": 7613 }, { "epoch": 2.5426615461679747, "grad_norm": 0.35287209472836883, "learning_rate": 6.90257766972744e-07, "loss": 0.0209, "step": 7614 }, { "epoch": 2.542995491734847, "grad_norm": 0.23745912593933097, "learning_rate": 6.892728304251544e-07, "loss": 0.009, "step": 7615 }, { "epoch": 2.5433294373017197, "grad_norm": 0.2910545767108932, "learning_rate": 6.8828854506366e-07, "loss": 0.0133, "step": 7616 }, { "epoch": 2.5436633828685924, "grad_norm": 0.2950379758085675, "learning_rate": 6.873049110369495e-07, "loss": 0.0115, "step": 7617 }, { "epoch": 2.543997328435465, "grad_norm": 0.2464590801362038, "learning_rate": 6.863219284936135e-07, "loss": 0.0109, "step": 7618 }, { "epoch": 2.544331274002338, "grad_norm": 0.3096588368307441, "learning_rate": 6.853395975821414e-07, "loss": 0.0144, "step": 7619 }, { "epoch": 2.54466521956921, "grad_norm": 0.2714670629343454, "learning_rate": 6.843579184509275e-07, "loss": 0.0107, "step": 7620 }, { "epoch": 2.544999165136083, "grad_norm": 0.19873774869983954, "learning_rate": 6.833768912482636e-07, "loss": 0.0088, "step": 7621 }, { "epoch": 2.5453331107029555, "grad_norm": 0.2732883874401984, "learning_rate": 6.823965161223472e-07, "loss": 0.0156, "step": 7622 }, { "epoch": 2.545667056269828, "grad_norm": 0.2141874848917232, "learning_rate": 6.814167932212751e-07, "loss": 0.0074, "step": 7623 }, { "epoch": 2.5460010018367005, "grad_norm": 0.2861711486348828, "learning_rate": 6.804377226930469e-07, "loss": 0.0179, "step": 7624 }, { "epoch": 2.5463349474035732, "grad_norm": 0.24788504031554284, "learning_rate": 6.794593046855613e-07, "loss": 0.0093, "step": 7625 }, { "epoch": 2.546668892970446, "grad_norm": 0.20039467144689546, "learning_rate": 6.784815393466215e-07, "loss": 0.0092, "step": 7626 }, { "epoch": 2.5470028385373187, "grad_norm": 0.2830293777900935, "learning_rate": 6.775044268239278e-07, "loss": 0.0089, "step": 7627 }, { "epoch": 2.547336784104191, "grad_norm": 0.31463979243652335, "learning_rate": 6.765279672650865e-07, "loss": 0.0167, "step": 7628 }, { "epoch": 2.5476707296710637, "grad_norm": 0.2826136080760445, "learning_rate": 6.75552160817603e-07, "loss": 0.0106, "step": 7629 }, { "epoch": 2.5480046752379364, "grad_norm": 0.27206352461557476, "learning_rate": 6.745770076288854e-07, "loss": 0.0108, "step": 7630 }, { "epoch": 2.5483386208048087, "grad_norm": 0.29022417323012484, "learning_rate": 6.736025078462399e-07, "loss": 0.013, "step": 7631 }, { "epoch": 2.5486725663716814, "grad_norm": 0.22705775467584716, "learning_rate": 6.726286616168781e-07, "loss": 0.0095, "step": 7632 }, { "epoch": 2.549006511938554, "grad_norm": 0.2822732800886761, "learning_rate": 6.716554690879085e-07, "loss": 0.0148, "step": 7633 }, { "epoch": 2.5493404575054264, "grad_norm": 0.34597594823694056, "learning_rate": 6.706829304063467e-07, "loss": 0.0193, "step": 7634 }, { "epoch": 2.549674403072299, "grad_norm": 0.27436227871033536, "learning_rate": 6.697110457191031e-07, "loss": 0.0156, "step": 7635 }, { "epoch": 2.550008348639172, "grad_norm": 0.29438849725969307, "learning_rate": 6.687398151729951e-07, "loss": 0.018, "step": 7636 }, { "epoch": 2.5503422942060445, "grad_norm": 0.28789897424501965, "learning_rate": 6.677692389147355e-07, "loss": 0.0123, "step": 7637 }, { "epoch": 2.5506762397729172, "grad_norm": 0.2913385282077587, "learning_rate": 6.667993170909437e-07, "loss": 0.0119, "step": 7638 }, { "epoch": 2.5510101853397895, "grad_norm": 0.22954873855808372, "learning_rate": 6.658300498481363e-07, "loss": 0.0104, "step": 7639 }, { "epoch": 2.5513441309066622, "grad_norm": 0.4022487794590685, "learning_rate": 6.648614373327328e-07, "loss": 0.0304, "step": 7640 }, { "epoch": 2.551678076473535, "grad_norm": 0.31154895901564816, "learning_rate": 6.638934796910545e-07, "loss": 0.0108, "step": 7641 }, { "epoch": 2.5520120220404072, "grad_norm": 0.32510097482021605, "learning_rate": 6.629261770693213e-07, "loss": 0.0149, "step": 7642 }, { "epoch": 2.55234596760728, "grad_norm": 0.3928796312435496, "learning_rate": 6.619595296136577e-07, "loss": 0.0131, "step": 7643 }, { "epoch": 2.5526799131741527, "grad_norm": 0.26520098808510795, "learning_rate": 6.609935374700849e-07, "loss": 0.0104, "step": 7644 }, { "epoch": 2.5530138587410254, "grad_norm": 0.28194036719044246, "learning_rate": 6.600282007845277e-07, "loss": 0.0122, "step": 7645 }, { "epoch": 2.5533478043078977, "grad_norm": 0.23465194951929508, "learning_rate": 6.590635197028128e-07, "loss": 0.0084, "step": 7646 }, { "epoch": 2.5536817498747704, "grad_norm": 0.2654408510525354, "learning_rate": 6.580994943706675e-07, "loss": 0.0124, "step": 7647 }, { "epoch": 2.554015695441643, "grad_norm": 0.23877218559479607, "learning_rate": 6.571361249337161e-07, "loss": 0.0101, "step": 7648 }, { "epoch": 2.554349641008516, "grad_norm": 0.2791182314163454, "learning_rate": 6.561734115374901e-07, "loss": 0.0113, "step": 7649 }, { "epoch": 2.554683586575388, "grad_norm": 0.24895617590823962, "learning_rate": 6.552113543274158e-07, "loss": 0.0101, "step": 7650 }, { "epoch": 2.555017532142261, "grad_norm": 0.23747428376138735, "learning_rate": 6.54249953448825e-07, "loss": 0.0097, "step": 7651 }, { "epoch": 2.5553514777091335, "grad_norm": 0.28487369253520783, "learning_rate": 6.532892090469484e-07, "loss": 0.0176, "step": 7652 }, { "epoch": 2.555685423276006, "grad_norm": 0.3385339732444899, "learning_rate": 6.52329121266918e-07, "loss": 0.0194, "step": 7653 }, { "epoch": 2.5560193688428785, "grad_norm": 0.3390754635806754, "learning_rate": 6.513696902537653e-07, "loss": 0.018, "step": 7654 }, { "epoch": 2.556353314409751, "grad_norm": 0.36453070238608143, "learning_rate": 6.504109161524257e-07, "loss": 0.0135, "step": 7655 }, { "epoch": 2.556687259976624, "grad_norm": 0.2416387585270207, "learning_rate": 6.494527991077304e-07, "loss": 0.0102, "step": 7656 }, { "epoch": 2.5570212055434967, "grad_norm": 0.3112773119366655, "learning_rate": 6.484953392644161e-07, "loss": 0.0113, "step": 7657 }, { "epoch": 2.557355151110369, "grad_norm": 0.2958964808698193, "learning_rate": 6.475385367671183e-07, "loss": 0.0158, "step": 7658 }, { "epoch": 2.5576890966772416, "grad_norm": 0.3434098314001665, "learning_rate": 6.465823917603742e-07, "loss": 0.0199, "step": 7659 }, { "epoch": 2.5580230422441144, "grad_norm": 0.30371730468915425, "learning_rate": 6.456269043886182e-07, "loss": 0.0133, "step": 7660 }, { "epoch": 2.5583569878109866, "grad_norm": 0.2606506237596131, "learning_rate": 6.446720747961904e-07, "loss": 0.0104, "step": 7661 }, { "epoch": 2.5586909333778594, "grad_norm": 0.22068375526277711, "learning_rate": 6.437179031273272e-07, "loss": 0.0114, "step": 7662 }, { "epoch": 2.559024878944732, "grad_norm": 0.271297923643344, "learning_rate": 6.427643895261687e-07, "loss": 0.0147, "step": 7663 }, { "epoch": 2.5593588245116043, "grad_norm": 0.24599932657434925, "learning_rate": 6.418115341367543e-07, "loss": 0.0099, "step": 7664 }, { "epoch": 2.559692770078477, "grad_norm": 0.2343290015702136, "learning_rate": 6.408593371030231e-07, "loss": 0.0092, "step": 7665 }, { "epoch": 2.56002671564535, "grad_norm": 0.3017282925307235, "learning_rate": 6.399077985688168e-07, "loss": 0.0118, "step": 7666 }, { "epoch": 2.5603606612122225, "grad_norm": 0.24802952333676967, "learning_rate": 6.389569186778754e-07, "loss": 0.0118, "step": 7667 }, { "epoch": 2.560694606779095, "grad_norm": 0.19669040049922745, "learning_rate": 6.38006697573842e-07, "loss": 0.0088, "step": 7668 }, { "epoch": 2.5610285523459675, "grad_norm": 0.31336593097840215, "learning_rate": 6.370571354002553e-07, "loss": 0.0146, "step": 7669 }, { "epoch": 2.56136249791284, "grad_norm": 0.2375684635525042, "learning_rate": 6.361082323005624e-07, "loss": 0.0105, "step": 7670 }, { "epoch": 2.561696443479713, "grad_norm": 0.3266110919657544, "learning_rate": 6.351599884181037e-07, "loss": 0.0144, "step": 7671 }, { "epoch": 2.562030389046585, "grad_norm": 0.3165245167755812, "learning_rate": 6.342124038961234e-07, "loss": 0.0101, "step": 7672 }, { "epoch": 2.562364334613458, "grad_norm": 0.24043706621919486, "learning_rate": 6.332654788777642e-07, "loss": 0.0112, "step": 7673 }, { "epoch": 2.5626982801803306, "grad_norm": 0.3571244729415984, "learning_rate": 6.323192135060713e-07, "loss": 0.0168, "step": 7674 }, { "epoch": 2.5630322257472034, "grad_norm": 0.36735819824508675, "learning_rate": 6.31373607923989e-07, "loss": 0.0211, "step": 7675 }, { "epoch": 2.563366171314076, "grad_norm": 0.23117631065589533, "learning_rate": 6.304286622743627e-07, "loss": 0.0082, "step": 7676 }, { "epoch": 2.5637001168809483, "grad_norm": 0.25786461123132604, "learning_rate": 6.294843766999364e-07, "loss": 0.013, "step": 7677 }, { "epoch": 2.564034062447821, "grad_norm": 0.21760487266740203, "learning_rate": 6.285407513433572e-07, "loss": 0.0089, "step": 7678 }, { "epoch": 2.564368008014694, "grad_norm": 0.295119637137543, "learning_rate": 6.275977863471683e-07, "loss": 0.0118, "step": 7679 }, { "epoch": 2.564701953581566, "grad_norm": 0.4136616325040832, "learning_rate": 6.266554818538173e-07, "loss": 0.0167, "step": 7680 }, { "epoch": 2.5650358991484388, "grad_norm": 0.2318301327249102, "learning_rate": 6.257138380056505e-07, "loss": 0.0116, "step": 7681 }, { "epoch": 2.5653698447153115, "grad_norm": 0.2987969519182141, "learning_rate": 6.24772854944915e-07, "loss": 0.0144, "step": 7682 }, { "epoch": 2.5657037902821838, "grad_norm": 0.2911329749864612, "learning_rate": 6.238325328137552e-07, "loss": 0.0139, "step": 7683 }, { "epoch": 2.5660377358490565, "grad_norm": 0.26056387721801183, "learning_rate": 6.228928717542205e-07, "loss": 0.0131, "step": 7684 }, { "epoch": 2.566371681415929, "grad_norm": 0.23239858572829802, "learning_rate": 6.219538719082546e-07, "loss": 0.0116, "step": 7685 }, { "epoch": 2.566705626982802, "grad_norm": 0.24117066736489734, "learning_rate": 6.210155334177064e-07, "loss": 0.0088, "step": 7686 }, { "epoch": 2.5670395725496746, "grad_norm": 0.24887119804868804, "learning_rate": 6.200778564243237e-07, "loss": 0.011, "step": 7687 }, { "epoch": 2.567373518116547, "grad_norm": 0.30875433406908454, "learning_rate": 6.19140841069752e-07, "loss": 0.0144, "step": 7688 }, { "epoch": 2.5677074636834196, "grad_norm": 0.22436605657133116, "learning_rate": 6.1820448749554e-07, "loss": 0.0093, "step": 7689 }, { "epoch": 2.5680414092502923, "grad_norm": 0.3271448836658194, "learning_rate": 6.172687958431328e-07, "loss": 0.0134, "step": 7690 }, { "epoch": 2.5683753548171646, "grad_norm": 0.2576727611285128, "learning_rate": 6.163337662538793e-07, "loss": 0.0161, "step": 7691 }, { "epoch": 2.5687093003840373, "grad_norm": 0.29886937485751547, "learning_rate": 6.153993988690266e-07, "loss": 0.0201, "step": 7692 }, { "epoch": 2.56904324595091, "grad_norm": 0.3063418935809696, "learning_rate": 6.144656938297227e-07, "loss": 0.0167, "step": 7693 }, { "epoch": 2.5693771915177828, "grad_norm": 0.32364416951354497, "learning_rate": 6.135326512770124e-07, "loss": 0.0126, "step": 7694 }, { "epoch": 2.569711137084655, "grad_norm": 0.2810622315265582, "learning_rate": 6.126002713518453e-07, "loss": 0.0164, "step": 7695 }, { "epoch": 2.5700450826515278, "grad_norm": 0.22022296796365828, "learning_rate": 6.116685541950663e-07, "loss": 0.008, "step": 7696 }, { "epoch": 2.5703790282184005, "grad_norm": 0.39942266286389627, "learning_rate": 6.107374999474236e-07, "loss": 0.0249, "step": 7697 }, { "epoch": 2.570712973785273, "grad_norm": 0.21230024467934458, "learning_rate": 6.098071087495621e-07, "loss": 0.0106, "step": 7698 }, { "epoch": 2.5710469193521455, "grad_norm": 0.3109824929172941, "learning_rate": 6.088773807420312e-07, "loss": 0.0141, "step": 7699 }, { "epoch": 2.571380864919018, "grad_norm": 0.3816087620716031, "learning_rate": 6.07948316065275e-07, "loss": 0.0138, "step": 7700 }, { "epoch": 2.571714810485891, "grad_norm": 0.2952428503471391, "learning_rate": 6.070199148596411e-07, "loss": 0.0137, "step": 7701 }, { "epoch": 2.572048756052763, "grad_norm": 0.3007554892639507, "learning_rate": 6.060921772653738e-07, "loss": 0.0108, "step": 7702 }, { "epoch": 2.572382701619636, "grad_norm": 0.22970382919460358, "learning_rate": 6.051651034226208e-07, "loss": 0.013, "step": 7703 }, { "epoch": 2.5727166471865086, "grad_norm": 0.23843549246395956, "learning_rate": 6.042386934714245e-07, "loss": 0.012, "step": 7704 }, { "epoch": 2.5730505927533813, "grad_norm": 0.29578000624515915, "learning_rate": 6.03312947551734e-07, "loss": 0.0152, "step": 7705 }, { "epoch": 2.573384538320254, "grad_norm": 0.3037885960494729, "learning_rate": 6.02387865803391e-07, "loss": 0.0157, "step": 7706 }, { "epoch": 2.5737184838871263, "grad_norm": 0.3112733567743452, "learning_rate": 6.014634483661419e-07, "loss": 0.0124, "step": 7707 }, { "epoch": 2.574052429453999, "grad_norm": 0.2704221580142933, "learning_rate": 6.005396953796294e-07, "loss": 0.0144, "step": 7708 }, { "epoch": 2.5743863750208718, "grad_norm": 0.2189766287771203, "learning_rate": 5.996166069833976e-07, "loss": 0.0094, "step": 7709 }, { "epoch": 2.574720320587744, "grad_norm": 0.3106542306978342, "learning_rate": 5.986941833168913e-07, "loss": 0.0149, "step": 7710 }, { "epoch": 2.5750542661546167, "grad_norm": 0.23564296655504463, "learning_rate": 5.97772424519451e-07, "loss": 0.0154, "step": 7711 }, { "epoch": 2.5753882117214895, "grad_norm": 0.30775047343083617, "learning_rate": 5.96851330730322e-07, "loss": 0.0129, "step": 7712 }, { "epoch": 2.5757221572883617, "grad_norm": 0.23715489706835652, "learning_rate": 5.959309020886433e-07, "loss": 0.0087, "step": 7713 }, { "epoch": 2.5760561028552345, "grad_norm": 0.25233445364544266, "learning_rate": 5.950111387334584e-07, "loss": 0.0119, "step": 7714 }, { "epoch": 2.576390048422107, "grad_norm": 0.3186240092501902, "learning_rate": 5.940920408037081e-07, "loss": 0.013, "step": 7715 }, { "epoch": 2.57672399398898, "grad_norm": 0.2728406137670835, "learning_rate": 5.93173608438234e-07, "loss": 0.0125, "step": 7716 }, { "epoch": 2.5770579395558526, "grad_norm": 0.25408731449490213, "learning_rate": 5.92255841775774e-07, "loss": 0.0113, "step": 7717 }, { "epoch": 2.577391885122725, "grad_norm": 0.3087394440844738, "learning_rate": 5.913387409549693e-07, "loss": 0.014, "step": 7718 }, { "epoch": 2.5777258306895976, "grad_norm": 0.27728454768206234, "learning_rate": 5.904223061143577e-07, "loss": 0.016, "step": 7719 }, { "epoch": 2.5780597762564703, "grad_norm": 0.2976316052310394, "learning_rate": 5.895065373923781e-07, "loss": 0.0142, "step": 7720 }, { "epoch": 2.5783937218233426, "grad_norm": 0.28446300269637065, "learning_rate": 5.885914349273664e-07, "loss": 0.0142, "step": 7721 }, { "epoch": 2.5787276673902153, "grad_norm": 0.25315372764163047, "learning_rate": 5.876769988575631e-07, "loss": 0.0095, "step": 7722 }, { "epoch": 2.579061612957088, "grad_norm": 0.32400592565478337, "learning_rate": 5.867632293211011e-07, "loss": 0.0187, "step": 7723 }, { "epoch": 2.5793955585239607, "grad_norm": 0.30809121386222355, "learning_rate": 5.85850126456019e-07, "loss": 0.0136, "step": 7724 }, { "epoch": 2.5797295040908335, "grad_norm": 0.2820419405285735, "learning_rate": 5.84937690400249e-07, "loss": 0.0086, "step": 7725 }, { "epoch": 2.5800634496577057, "grad_norm": 0.30018832499031356, "learning_rate": 5.840259212916277e-07, "loss": 0.011, "step": 7726 }, { "epoch": 2.5803973952245784, "grad_norm": 0.29804342774190334, "learning_rate": 5.831148192678853e-07, "loss": 0.0131, "step": 7727 }, { "epoch": 2.580731340791451, "grad_norm": 0.21314453819518234, "learning_rate": 5.822043844666586e-07, "loss": 0.0079, "step": 7728 }, { "epoch": 2.5810652863583234, "grad_norm": 0.3536771226099465, "learning_rate": 5.812946170254763e-07, "loss": 0.0219, "step": 7729 }, { "epoch": 2.581399231925196, "grad_norm": 0.3179741117100696, "learning_rate": 5.803855170817718e-07, "loss": 0.0125, "step": 7730 }, { "epoch": 2.581733177492069, "grad_norm": 0.33287077989551483, "learning_rate": 5.794770847728736e-07, "loss": 0.015, "step": 7731 }, { "epoch": 2.582067123058941, "grad_norm": 0.24447350410250296, "learning_rate": 5.785693202360121e-07, "loss": 0.0085, "step": 7732 }, { "epoch": 2.582401068625814, "grad_norm": 0.24002018090808191, "learning_rate": 5.776622236083146e-07, "loss": 0.0078, "step": 7733 }, { "epoch": 2.5827350141926866, "grad_norm": 0.21165752468060922, "learning_rate": 5.767557950268099e-07, "loss": 0.0092, "step": 7734 }, { "epoch": 2.5830689597595593, "grad_norm": 0.3562783324191469, "learning_rate": 5.758500346284252e-07, "loss": 0.016, "step": 7735 }, { "epoch": 2.583402905326432, "grad_norm": 0.3204976801978708, "learning_rate": 5.749449425499843e-07, "loss": 0.0202, "step": 7736 }, { "epoch": 2.5837368508933043, "grad_norm": 0.263270083553635, "learning_rate": 5.740405189282134e-07, "loss": 0.0157, "step": 7737 }, { "epoch": 2.584070796460177, "grad_norm": 0.27260822303976673, "learning_rate": 5.73136763899737e-07, "loss": 0.01, "step": 7738 }, { "epoch": 2.5844047420270497, "grad_norm": 0.25270345589887516, "learning_rate": 5.722336776010756e-07, "loss": 0.0106, "step": 7739 }, { "epoch": 2.584738687593922, "grad_norm": 0.34004623097511627, "learning_rate": 5.713312601686533e-07, "loss": 0.0199, "step": 7740 }, { "epoch": 2.5850726331607947, "grad_norm": 0.32386090811729507, "learning_rate": 5.704295117387904e-07, "loss": 0.0165, "step": 7741 }, { "epoch": 2.5854065787276674, "grad_norm": 0.3082270707226719, "learning_rate": 5.695284324477052e-07, "loss": 0.0143, "step": 7742 }, { "epoch": 2.58574052429454, "grad_norm": 0.25192242369583967, "learning_rate": 5.686280224315189e-07, "loss": 0.0121, "step": 7743 }, { "epoch": 2.5860744698614124, "grad_norm": 0.2573272403479741, "learning_rate": 5.677282818262464e-07, "loss": 0.0112, "step": 7744 }, { "epoch": 2.586408415428285, "grad_norm": 0.29357437238210815, "learning_rate": 5.668292107678048e-07, "loss": 0.0132, "step": 7745 }, { "epoch": 2.586742360995158, "grad_norm": 0.3752654176831875, "learning_rate": 5.659308093920101e-07, "loss": 0.0177, "step": 7746 }, { "epoch": 2.5870763065620306, "grad_norm": 0.2861699049823195, "learning_rate": 5.650330778345776e-07, "loss": 0.013, "step": 7747 }, { "epoch": 2.587410252128903, "grad_norm": 0.23728377208801518, "learning_rate": 5.641360162311171e-07, "loss": 0.0091, "step": 7748 }, { "epoch": 2.5877441976957756, "grad_norm": 0.2538075352426027, "learning_rate": 5.632396247171429e-07, "loss": 0.0134, "step": 7749 }, { "epoch": 2.5880781432626483, "grad_norm": 0.2775516067571275, "learning_rate": 5.623439034280625e-07, "loss": 0.0127, "step": 7750 }, { "epoch": 2.5884120888295206, "grad_norm": 0.3580076647398268, "learning_rate": 5.614488524991896e-07, "loss": 0.0189, "step": 7751 }, { "epoch": 2.5887460343963933, "grad_norm": 0.2606431332039776, "learning_rate": 5.605544720657286e-07, "loss": 0.0103, "step": 7752 }, { "epoch": 2.589079979963266, "grad_norm": 0.5082055733961746, "learning_rate": 5.596607622627887e-07, "loss": 0.0165, "step": 7753 }, { "epoch": 2.5894139255301387, "grad_norm": 0.2675982553742719, "learning_rate": 5.587677232253725e-07, "loss": 0.0115, "step": 7754 }, { "epoch": 2.5897478710970114, "grad_norm": 0.23943591364195121, "learning_rate": 5.57875355088387e-07, "loss": 0.0107, "step": 7755 }, { "epoch": 2.5900818166638837, "grad_norm": 0.28025763503549567, "learning_rate": 5.569836579866316e-07, "loss": 0.0109, "step": 7756 }, { "epoch": 2.5904157622307564, "grad_norm": 0.2860556200199481, "learning_rate": 5.560926320548105e-07, "loss": 0.0149, "step": 7757 }, { "epoch": 2.590749707797629, "grad_norm": 0.28321692167299, "learning_rate": 5.552022774275228e-07, "loss": 0.0111, "step": 7758 }, { "epoch": 2.5910836533645014, "grad_norm": 0.2748970840787289, "learning_rate": 5.543125942392664e-07, "loss": 0.01, "step": 7759 }, { "epoch": 2.591417598931374, "grad_norm": 0.22994941830488969, "learning_rate": 5.534235826244389e-07, "loss": 0.0103, "step": 7760 }, { "epoch": 2.591751544498247, "grad_norm": 0.35041866367625174, "learning_rate": 5.525352427173369e-07, "loss": 0.0193, "step": 7761 }, { "epoch": 2.592085490065119, "grad_norm": 0.2863518183261444, "learning_rate": 5.516475746521527e-07, "loss": 0.0118, "step": 7762 }, { "epoch": 2.592419435631992, "grad_norm": 0.2705942370985263, "learning_rate": 5.507605785629794e-07, "loss": 0.0122, "step": 7763 }, { "epoch": 2.5927533811988646, "grad_norm": 0.29767577101175474, "learning_rate": 5.498742545838104e-07, "loss": 0.0156, "step": 7764 }, { "epoch": 2.5930873267657373, "grad_norm": 0.19476089992018275, "learning_rate": 5.48988602848533e-07, "loss": 0.008, "step": 7765 }, { "epoch": 2.59342127233261, "grad_norm": 0.23204861842288593, "learning_rate": 5.481036234909365e-07, "loss": 0.0091, "step": 7766 }, { "epoch": 2.5937552178994823, "grad_norm": 0.29781530126581196, "learning_rate": 5.472193166447065e-07, "loss": 0.0161, "step": 7767 }, { "epoch": 2.594089163466355, "grad_norm": 0.2546827814738975, "learning_rate": 5.463356824434285e-07, "loss": 0.0112, "step": 7768 }, { "epoch": 2.5944231090332277, "grad_norm": 0.19954995236601833, "learning_rate": 5.454527210205857e-07, "loss": 0.0071, "step": 7769 }, { "epoch": 2.5947570546001, "grad_norm": 0.28657743196987234, "learning_rate": 5.445704325095613e-07, "loss": 0.0167, "step": 7770 }, { "epoch": 2.5950910001669727, "grad_norm": 0.32990929435981003, "learning_rate": 5.436888170436327e-07, "loss": 0.019, "step": 7771 }, { "epoch": 2.5954249457338454, "grad_norm": 0.22321175087720183, "learning_rate": 5.428078747559806e-07, "loss": 0.0101, "step": 7772 }, { "epoch": 2.595758891300718, "grad_norm": 0.2939906935182841, "learning_rate": 5.419276057796802e-07, "loss": 0.012, "step": 7773 }, { "epoch": 2.596092836867591, "grad_norm": 0.3560238371429349, "learning_rate": 5.410480102477067e-07, "loss": 0.0172, "step": 7774 }, { "epoch": 2.596426782434463, "grad_norm": 0.30955404912738377, "learning_rate": 5.401690882929333e-07, "loss": 0.0207, "step": 7775 }, { "epoch": 2.596760728001336, "grad_norm": 0.2687572631681687, "learning_rate": 5.392908400481334e-07, "loss": 0.0121, "step": 7776 }, { "epoch": 2.5970946735682086, "grad_norm": 0.24242286721787118, "learning_rate": 5.384132656459745e-07, "loss": 0.0116, "step": 7777 }, { "epoch": 2.597428619135081, "grad_norm": 0.2903164453331221, "learning_rate": 5.375363652190257e-07, "loss": 0.0141, "step": 7778 }, { "epoch": 2.5977625647019535, "grad_norm": 0.2947254617095242, "learning_rate": 5.366601388997522e-07, "loss": 0.015, "step": 7779 }, { "epoch": 2.5980965102688263, "grad_norm": 0.23439835279182605, "learning_rate": 5.357845868205191e-07, "loss": 0.0079, "step": 7780 }, { "epoch": 2.5984304558356985, "grad_norm": 0.2871025774439091, "learning_rate": 5.34909709113589e-07, "loss": 0.0132, "step": 7781 }, { "epoch": 2.5987644014025713, "grad_norm": 0.32349870967462496, "learning_rate": 5.340355059111213e-07, "loss": 0.0138, "step": 7782 }, { "epoch": 2.599098346969444, "grad_norm": 0.2303197504499297, "learning_rate": 5.331619773451757e-07, "loss": 0.0104, "step": 7783 }, { "epoch": 2.5994322925363167, "grad_norm": 0.2917282176252594, "learning_rate": 5.32289123547709e-07, "loss": 0.0152, "step": 7784 }, { "epoch": 2.5997662381031894, "grad_norm": 0.29936471310078494, "learning_rate": 5.314169446505757e-07, "loss": 0.0138, "step": 7785 }, { "epoch": 2.6001001836700617, "grad_norm": 0.2808927535783025, "learning_rate": 5.305454407855282e-07, "loss": 0.0136, "step": 7786 }, { "epoch": 2.6004341292369344, "grad_norm": 0.30781220110910307, "learning_rate": 5.296746120842189e-07, "loss": 0.0166, "step": 7787 }, { "epoch": 2.600768074803807, "grad_norm": 0.33708264863124765, "learning_rate": 5.288044586781955e-07, "loss": 0.0198, "step": 7788 }, { "epoch": 2.6011020203706794, "grad_norm": 0.3525456228701188, "learning_rate": 5.279349806989054e-07, "loss": 0.0191, "step": 7789 }, { "epoch": 2.601435965937552, "grad_norm": 0.29643886549077286, "learning_rate": 5.270661782776931e-07, "loss": 0.0134, "step": 7790 }, { "epoch": 2.601769911504425, "grad_norm": 0.3522666220172226, "learning_rate": 5.26198051545801e-07, "loss": 0.0207, "step": 7791 }, { "epoch": 2.6021038570712975, "grad_norm": 0.2335916004126007, "learning_rate": 5.253306006343706e-07, "loss": 0.0104, "step": 7792 }, { "epoch": 2.60243780263817, "grad_norm": 0.22011570111638332, "learning_rate": 5.244638256744422e-07, "loss": 0.0113, "step": 7793 }, { "epoch": 2.6027717482050425, "grad_norm": 0.3209341737954237, "learning_rate": 5.235977267969489e-07, "loss": 0.0127, "step": 7794 }, { "epoch": 2.6031056937719153, "grad_norm": 0.28920014648819403, "learning_rate": 5.227323041327281e-07, "loss": 0.0133, "step": 7795 }, { "epoch": 2.603439639338788, "grad_norm": 0.2582740540276249, "learning_rate": 5.218675578125099e-07, "loss": 0.0102, "step": 7796 }, { "epoch": 2.6037735849056602, "grad_norm": 0.3073350534343159, "learning_rate": 5.210034879669257e-07, "loss": 0.0162, "step": 7797 }, { "epoch": 2.604107530472533, "grad_norm": 0.28066859886880224, "learning_rate": 5.201400947265029e-07, "loss": 0.0129, "step": 7798 }, { "epoch": 2.6044414760394057, "grad_norm": 0.3188945910965467, "learning_rate": 5.192773782216681e-07, "loss": 0.0155, "step": 7799 }, { "epoch": 2.604775421606278, "grad_norm": 0.30526122254461424, "learning_rate": 5.184153385827434e-07, "loss": 0.0155, "step": 7800 }, { "epoch": 2.6051093671731507, "grad_norm": 0.2701536008517957, "learning_rate": 5.175539759399518e-07, "loss": 0.0096, "step": 7801 }, { "epoch": 2.6054433127400234, "grad_norm": 0.22318834661855, "learning_rate": 5.166932904234101e-07, "loss": 0.0097, "step": 7802 }, { "epoch": 2.605777258306896, "grad_norm": 0.2435016455260839, "learning_rate": 5.158332821631362e-07, "loss": 0.0109, "step": 7803 }, { "epoch": 2.606111203873769, "grad_norm": 0.2918131522531103, "learning_rate": 5.149739512890445e-07, "loss": 0.0155, "step": 7804 }, { "epoch": 2.606445149440641, "grad_norm": 0.30717006423983095, "learning_rate": 5.141152979309477e-07, "loss": 0.016, "step": 7805 }, { "epoch": 2.606779095007514, "grad_norm": 0.347405470232238, "learning_rate": 5.132573222185539e-07, "loss": 0.017, "step": 7806 }, { "epoch": 2.6071130405743865, "grad_norm": 0.2976334048782599, "learning_rate": 5.124000242814725e-07, "loss": 0.0158, "step": 7807 }, { "epoch": 2.607446986141259, "grad_norm": 0.2869317304551244, "learning_rate": 5.115434042492057e-07, "loss": 0.0126, "step": 7808 }, { "epoch": 2.6077809317081315, "grad_norm": 0.27989027783297954, "learning_rate": 5.106874622511576e-07, "loss": 0.0124, "step": 7809 }, { "epoch": 2.6081148772750042, "grad_norm": 0.2866276623557583, "learning_rate": 5.098321984166293e-07, "loss": 0.0166, "step": 7810 }, { "epoch": 2.6084488228418765, "grad_norm": 0.36940599428484794, "learning_rate": 5.089776128748169e-07, "loss": 0.0127, "step": 7811 }, { "epoch": 2.6087827684087492, "grad_norm": 0.27347853902866026, "learning_rate": 5.081237057548166e-07, "loss": 0.0128, "step": 7812 }, { "epoch": 2.609116713975622, "grad_norm": 0.501024325812926, "learning_rate": 5.072704771856201e-07, "loss": 0.0192, "step": 7813 }, { "epoch": 2.6094506595424947, "grad_norm": 0.270871788245993, "learning_rate": 5.06417927296119e-07, "loss": 0.009, "step": 7814 }, { "epoch": 2.6097846051093674, "grad_norm": 0.26017247693699724, "learning_rate": 5.055660562150983e-07, "loss": 0.0114, "step": 7815 }, { "epoch": 2.6101185506762397, "grad_norm": 0.2854172663792184, "learning_rate": 5.047148640712468e-07, "loss": 0.0101, "step": 7816 }, { "epoch": 2.6104524962431124, "grad_norm": 0.30607888740182465, "learning_rate": 5.038643509931446e-07, "loss": 0.0186, "step": 7817 }, { "epoch": 2.610786441809985, "grad_norm": 0.3272155334397794, "learning_rate": 5.030145171092732e-07, "loss": 0.0143, "step": 7818 }, { "epoch": 2.6111203873768574, "grad_norm": 0.26239799598991864, "learning_rate": 5.021653625480089e-07, "loss": 0.0119, "step": 7819 }, { "epoch": 2.61145433294373, "grad_norm": 0.3149126051533802, "learning_rate": 5.013168874376273e-07, "loss": 0.0163, "step": 7820 }, { "epoch": 2.611788278510603, "grad_norm": 0.33734061301169993, "learning_rate": 5.004690919062983e-07, "loss": 0.0138, "step": 7821 }, { "epoch": 2.6121222240774755, "grad_norm": 0.3582999727333817, "learning_rate": 4.996219760820947e-07, "loss": 0.0145, "step": 7822 }, { "epoch": 2.6124561696443482, "grad_norm": 0.36645911870783865, "learning_rate": 4.987755400929817e-07, "loss": 0.0124, "step": 7823 }, { "epoch": 2.6127901152112205, "grad_norm": 0.263407979707053, "learning_rate": 4.97929784066824e-07, "loss": 0.0106, "step": 7824 }, { "epoch": 2.6131240607780932, "grad_norm": 0.28669460111882933, "learning_rate": 4.970847081313818e-07, "loss": 0.0126, "step": 7825 }, { "epoch": 2.613458006344966, "grad_norm": 0.34181469769634143, "learning_rate": 4.962403124143156e-07, "loss": 0.0134, "step": 7826 }, { "epoch": 2.613791951911838, "grad_norm": 0.2943655509060492, "learning_rate": 4.953965970431779e-07, "loss": 0.0156, "step": 7827 }, { "epoch": 2.614125897478711, "grad_norm": 0.2693809041343175, "learning_rate": 4.945535621454268e-07, "loss": 0.012, "step": 7828 }, { "epoch": 2.6144598430455837, "grad_norm": 0.30883783427694184, "learning_rate": 4.937112078484086e-07, "loss": 0.0147, "step": 7829 }, { "epoch": 2.614793788612456, "grad_norm": 0.30272494392255195, "learning_rate": 4.928695342793733e-07, "loss": 0.0163, "step": 7830 }, { "epoch": 2.6151277341793286, "grad_norm": 0.3222143371438454, "learning_rate": 4.92028541565464e-07, "loss": 0.0099, "step": 7831 }, { "epoch": 2.6154616797462014, "grad_norm": 0.35735085101889014, "learning_rate": 4.911882298337228e-07, "loss": 0.0175, "step": 7832 }, { "epoch": 2.615795625313074, "grad_norm": 0.19956015862823392, "learning_rate": 4.903485992110901e-07, "loss": 0.0076, "step": 7833 }, { "epoch": 2.616129570879947, "grad_norm": 0.30573601023846625, "learning_rate": 4.895096498243995e-07, "loss": 0.0118, "step": 7834 }, { "epoch": 2.616463516446819, "grad_norm": 0.3836827700213579, "learning_rate": 4.886713818003874e-07, "loss": 0.0147, "step": 7835 }, { "epoch": 2.616797462013692, "grad_norm": 0.2902416380310542, "learning_rate": 4.878337952656809e-07, "loss": 0.0152, "step": 7836 }, { "epoch": 2.6171314075805645, "grad_norm": 0.3225021336855386, "learning_rate": 4.869968903468092e-07, "loss": 0.0159, "step": 7837 }, { "epoch": 2.617465353147437, "grad_norm": 0.2990407742420705, "learning_rate": 4.861606671701946e-07, "loss": 0.0112, "step": 7838 }, { "epoch": 2.6177992987143095, "grad_norm": 0.32854258306647427, "learning_rate": 4.853251258621621e-07, "loss": 0.0196, "step": 7839 }, { "epoch": 2.618133244281182, "grad_norm": 0.3114884955059214, "learning_rate": 4.844902665489265e-07, "loss": 0.0143, "step": 7840 }, { "epoch": 2.618467189848055, "grad_norm": 0.24263950940313359, "learning_rate": 4.836560893566056e-07, "loss": 0.0109, "step": 7841 }, { "epoch": 2.618801135414927, "grad_norm": 0.2452556024663025, "learning_rate": 4.828225944112097e-07, "loss": 0.0153, "step": 7842 }, { "epoch": 2.6191350809818, "grad_norm": 0.3056648642732613, "learning_rate": 4.819897818386499e-07, "loss": 0.0121, "step": 7843 }, { "epoch": 2.6194690265486726, "grad_norm": 0.23198214125610783, "learning_rate": 4.811576517647299e-07, "loss": 0.0097, "step": 7844 }, { "epoch": 2.6198029721155454, "grad_norm": 0.34270264884464957, "learning_rate": 4.803262043151557e-07, "loss": 0.0205, "step": 7845 }, { "epoch": 2.6201369176824176, "grad_norm": 0.3403452881567089, "learning_rate": 4.794954396155249e-07, "loss": 0.0163, "step": 7846 }, { "epoch": 2.6204708632492903, "grad_norm": 0.28619503287707593, "learning_rate": 4.786653577913364e-07, "loss": 0.0144, "step": 7847 }, { "epoch": 2.620804808816163, "grad_norm": 0.29811911634736843, "learning_rate": 4.77835958967981e-07, "loss": 0.0143, "step": 7848 }, { "epoch": 2.6211387543830353, "grad_norm": 0.2733347677589082, "learning_rate": 4.770072432707523e-07, "loss": 0.0106, "step": 7849 }, { "epoch": 2.621472699949908, "grad_norm": 0.35678348971836715, "learning_rate": 4.761792108248342e-07, "loss": 0.0228, "step": 7850 }, { "epoch": 2.6218066455167808, "grad_norm": 0.30352719624801333, "learning_rate": 4.753518617553138e-07, "loss": 0.0163, "step": 7851 }, { "epoch": 2.6221405910836535, "grad_norm": 0.2541955296554193, "learning_rate": 4.745251961871705e-07, "loss": 0.0155, "step": 7852 }, { "epoch": 2.622474536650526, "grad_norm": 0.3092492674665828, "learning_rate": 4.736992142452823e-07, "loss": 0.0149, "step": 7853 }, { "epoch": 2.6228084822173985, "grad_norm": 0.3452254837202833, "learning_rate": 4.728739160544227e-07, "loss": 0.0132, "step": 7854 }, { "epoch": 2.623142427784271, "grad_norm": 0.29395539921865604, "learning_rate": 4.720493017392641e-07, "loss": 0.013, "step": 7855 }, { "epoch": 2.623476373351144, "grad_norm": 0.2625686656669474, "learning_rate": 4.712253714243725e-07, "loss": 0.0122, "step": 7856 }, { "epoch": 2.623810318918016, "grad_norm": 0.30476099803603063, "learning_rate": 4.7040212523421335e-07, "loss": 0.0155, "step": 7857 }, { "epoch": 2.624144264484889, "grad_norm": 0.28600785805079787, "learning_rate": 4.695795632931477e-07, "loss": 0.0156, "step": 7858 }, { "epoch": 2.6244782100517616, "grad_norm": 0.20763764250099112, "learning_rate": 4.687576857254328e-07, "loss": 0.0066, "step": 7859 }, { "epoch": 2.624812155618634, "grad_norm": 0.2392660061310057, "learning_rate": 4.679364926552238e-07, "loss": 0.0087, "step": 7860 }, { "epoch": 2.6251461011855066, "grad_norm": 0.2398332101110139, "learning_rate": 4.671159842065698e-07, "loss": 0.0123, "step": 7861 }, { "epoch": 2.6254800467523793, "grad_norm": 0.3110648976872475, "learning_rate": 4.662961605034194e-07, "loss": 0.0143, "step": 7862 }, { "epoch": 2.625813992319252, "grad_norm": 0.25693346836410147, "learning_rate": 4.654770216696169e-07, "loss": 0.0099, "step": 7863 }, { "epoch": 2.6261479378861248, "grad_norm": 0.28816989474842675, "learning_rate": 4.646585678289034e-07, "loss": 0.0179, "step": 7864 }, { "epoch": 2.626481883452997, "grad_norm": 0.3194372432316785, "learning_rate": 4.6384079910491376e-07, "loss": 0.0139, "step": 7865 }, { "epoch": 2.6268158290198698, "grad_norm": 0.2594581126553459, "learning_rate": 4.630237156211842e-07, "loss": 0.0107, "step": 7866 }, { "epoch": 2.6271497745867425, "grad_norm": 0.32859966823538417, "learning_rate": 4.6220731750114267e-07, "loss": 0.0208, "step": 7867 }, { "epoch": 2.6274837201536148, "grad_norm": 0.3396579390284738, "learning_rate": 4.6139160486811663e-07, "loss": 0.0179, "step": 7868 }, { "epoch": 2.6278176657204875, "grad_norm": 0.2345310176014767, "learning_rate": 4.605765778453292e-07, "loss": 0.0091, "step": 7869 }, { "epoch": 2.62815161128736, "grad_norm": 0.3285333915493347, "learning_rate": 4.597622365559007e-07, "loss": 0.0267, "step": 7870 }, { "epoch": 2.628485556854233, "grad_norm": 0.26953723072086627, "learning_rate": 4.5894858112284445e-07, "loss": 0.0111, "step": 7871 }, { "epoch": 2.6288195024211056, "grad_norm": 0.24074633675252588, "learning_rate": 4.581356116690755e-07, "loss": 0.0105, "step": 7872 }, { "epoch": 2.629153447987978, "grad_norm": 0.1822939536673304, "learning_rate": 4.573233283173989e-07, "loss": 0.0077, "step": 7873 }, { "epoch": 2.6294873935548506, "grad_norm": 0.2215272780562539, "learning_rate": 4.5651173119052427e-07, "loss": 0.0085, "step": 7874 }, { "epoch": 2.6298213391217233, "grad_norm": 0.3693724794689987, "learning_rate": 4.5570082041104915e-07, "loss": 0.0156, "step": 7875 }, { "epoch": 2.6301552846885956, "grad_norm": 0.27095227834966373, "learning_rate": 4.5489059610147323e-07, "loss": 0.0128, "step": 7876 }, { "epoch": 2.6304892302554683, "grad_norm": 0.2618553764053157, "learning_rate": 4.5408105838418924e-07, "loss": 0.01, "step": 7877 }, { "epoch": 2.630823175822341, "grad_norm": 0.3314126642674739, "learning_rate": 4.5327220738148823e-07, "loss": 0.0177, "step": 7878 }, { "epoch": 2.6311571213892133, "grad_norm": 0.2184385277216219, "learning_rate": 4.524640432155558e-07, "loss": 0.0124, "step": 7879 }, { "epoch": 2.631491066956086, "grad_norm": 0.23886265818361968, "learning_rate": 4.516565660084754e-07, "loss": 0.0106, "step": 7880 }, { "epoch": 2.6318250125229588, "grad_norm": 0.2093007838288809, "learning_rate": 4.5084977588222613e-07, "loss": 0.0102, "step": 7881 }, { "epoch": 2.6321589580898315, "grad_norm": 0.22978651514981277, "learning_rate": 4.500436729586821e-07, "loss": 0.011, "step": 7882 }, { "epoch": 2.632492903656704, "grad_norm": 0.25385386043168123, "learning_rate": 4.4923825735961604e-07, "loss": 0.011, "step": 7883 }, { "epoch": 2.6328268492235765, "grad_norm": 0.29391073574245624, "learning_rate": 4.484335292066938e-07, "loss": 0.0111, "step": 7884 }, { "epoch": 2.633160794790449, "grad_norm": 0.21919384575116094, "learning_rate": 4.476294886214799e-07, "loss": 0.0062, "step": 7885 }, { "epoch": 2.633494740357322, "grad_norm": 0.3645711293848547, "learning_rate": 4.468261357254339e-07, "loss": 0.0151, "step": 7886 }, { "epoch": 2.633828685924194, "grad_norm": 0.23348420561641633, "learning_rate": 4.46023470639913e-07, "loss": 0.0126, "step": 7887 }, { "epoch": 2.634162631491067, "grad_norm": 0.4223737283881882, "learning_rate": 4.452214934861676e-07, "loss": 0.0233, "step": 7888 }, { "epoch": 2.6344965770579396, "grad_norm": 0.4017994011427713, "learning_rate": 4.4442020438534737e-07, "loss": 0.0228, "step": 7889 }, { "epoch": 2.6348305226248123, "grad_norm": 0.25590162258996446, "learning_rate": 4.436196034584944e-07, "loss": 0.0136, "step": 7890 }, { "epoch": 2.6351644681916846, "grad_norm": 0.2628167263739924, "learning_rate": 4.4281969082654976e-07, "loss": 0.0124, "step": 7891 }, { "epoch": 2.6354984137585573, "grad_norm": 0.33306797091433593, "learning_rate": 4.4202046661035e-07, "loss": 0.0101, "step": 7892 }, { "epoch": 2.63583235932543, "grad_norm": 0.2899528935574976, "learning_rate": 4.4122193093062815e-07, "loss": 0.016, "step": 7893 }, { "epoch": 2.6361663048923027, "grad_norm": 0.2715359116789147, "learning_rate": 4.4042408390801097e-07, "loss": 0.0091, "step": 7894 }, { "epoch": 2.636500250459175, "grad_norm": 0.2292529748099387, "learning_rate": 4.3962692566302366e-07, "loss": 0.0082, "step": 7895 }, { "epoch": 2.6368341960260477, "grad_norm": 0.2674417840369013, "learning_rate": 4.38830456316085e-07, "loss": 0.013, "step": 7896 }, { "epoch": 2.6371681415929205, "grad_norm": 0.3124380951995918, "learning_rate": 4.38034675987512e-07, "loss": 0.0152, "step": 7897 }, { "epoch": 2.6375020871597927, "grad_norm": 0.34431549977777304, "learning_rate": 4.372395847975164e-07, "loss": 0.0179, "step": 7898 }, { "epoch": 2.6378360327266654, "grad_norm": 0.3441224854387786, "learning_rate": 4.364451828662075e-07, "loss": 0.0124, "step": 7899 }, { "epoch": 2.638169978293538, "grad_norm": 0.34566215561888247, "learning_rate": 4.356514703135867e-07, "loss": 0.0211, "step": 7900 }, { "epoch": 2.638503923860411, "grad_norm": 0.28016577725788827, "learning_rate": 4.348584472595557e-07, "loss": 0.0096, "step": 7901 }, { "epoch": 2.6388378694272836, "grad_norm": 0.26713189067875304, "learning_rate": 4.3406611382390826e-07, "loss": 0.0105, "step": 7902 }, { "epoch": 2.639171814994156, "grad_norm": 0.26716024400740856, "learning_rate": 4.3327447012633695e-07, "loss": 0.0118, "step": 7903 }, { "epoch": 2.6395057605610286, "grad_norm": 0.23008488659461046, "learning_rate": 4.324835162864283e-07, "loss": 0.0075, "step": 7904 }, { "epoch": 2.6398397061279013, "grad_norm": 0.25282644215243727, "learning_rate": 4.31693252423665e-07, "loss": 0.0109, "step": 7905 }, { "epoch": 2.6401736516947736, "grad_norm": 0.2528030423118093, "learning_rate": 4.3090367865742666e-07, "loss": 0.0114, "step": 7906 }, { "epoch": 2.6405075972616463, "grad_norm": 0.26423103992968655, "learning_rate": 4.3011479510698615e-07, "loss": 0.0112, "step": 7907 }, { "epoch": 2.640841542828519, "grad_norm": 0.2417972838722505, "learning_rate": 4.293266018915149e-07, "loss": 0.0125, "step": 7908 }, { "epoch": 2.6411754883953913, "grad_norm": 0.2200353282870818, "learning_rate": 4.2853909913007807e-07, "loss": 0.008, "step": 7909 }, { "epoch": 2.641509433962264, "grad_norm": 0.3074011894361888, "learning_rate": 4.277522869416384e-07, "loss": 0.0158, "step": 7910 }, { "epoch": 2.6418433795291367, "grad_norm": 0.28136241325893485, "learning_rate": 4.269661654450513e-07, "loss": 0.0107, "step": 7911 }, { "epoch": 2.6421773250960094, "grad_norm": 0.22021278378467113, "learning_rate": 4.261807347590713e-07, "loss": 0.0084, "step": 7912 }, { "epoch": 2.642511270662882, "grad_norm": 0.26388601100882164, "learning_rate": 4.253959950023456e-07, "loss": 0.0143, "step": 7913 }, { "epoch": 2.6428452162297544, "grad_norm": 0.2833154500686583, "learning_rate": 4.246119462934195e-07, "loss": 0.0131, "step": 7914 }, { "epoch": 2.643179161796627, "grad_norm": 0.2923873934305097, "learning_rate": 4.238285887507315e-07, "loss": 0.0151, "step": 7915 }, { "epoch": 2.6435131073635, "grad_norm": 0.24641108987918847, "learning_rate": 4.230459224926198e-07, "loss": 0.0087, "step": 7916 }, { "epoch": 2.643847052930372, "grad_norm": 0.3087129446537835, "learning_rate": 4.222639476373119e-07, "loss": 0.016, "step": 7917 }, { "epoch": 2.644180998497245, "grad_norm": 0.2963660809794913, "learning_rate": 4.2148266430293627e-07, "loss": 0.0115, "step": 7918 }, { "epoch": 2.6445149440641176, "grad_norm": 0.31529640063222963, "learning_rate": 4.207020726075145e-07, "loss": 0.0169, "step": 7919 }, { "epoch": 2.6448488896309903, "grad_norm": 0.30567255421916395, "learning_rate": 4.199221726689634e-07, "loss": 0.0171, "step": 7920 }, { "epoch": 2.645182835197863, "grad_norm": 0.2694320969090033, "learning_rate": 4.191429646050971e-07, "loss": 0.011, "step": 7921 }, { "epoch": 2.6455167807647353, "grad_norm": 0.24352837601722344, "learning_rate": 4.1836444853362465e-07, "loss": 0.01, "step": 7922 }, { "epoch": 2.645850726331608, "grad_norm": 0.29336149931524796, "learning_rate": 4.1758662457214884e-07, "loss": 0.0086, "step": 7923 }, { "epoch": 2.6461846718984807, "grad_norm": 0.2433622100799936, "learning_rate": 4.1680949283816996e-07, "loss": 0.0083, "step": 7924 }, { "epoch": 2.646518617465353, "grad_norm": 0.3011407689417221, "learning_rate": 4.160330534490814e-07, "loss": 0.0138, "step": 7925 }, { "epoch": 2.6468525630322257, "grad_norm": 0.3218541200246468, "learning_rate": 4.152573065221749e-07, "loss": 0.0115, "step": 7926 }, { "epoch": 2.6471865085990984, "grad_norm": 0.27047284576994124, "learning_rate": 4.1448225217463724e-07, "loss": 0.0128, "step": 7927 }, { "epoch": 2.6475204541659707, "grad_norm": 0.21217979110827842, "learning_rate": 4.1370789052354644e-07, "loss": 0.0095, "step": 7928 }, { "epoch": 2.6478543997328434, "grad_norm": 0.2692342587483447, "learning_rate": 4.129342216858817e-07, "loss": 0.0128, "step": 7929 }, { "epoch": 2.648188345299716, "grad_norm": 0.34308220776747567, "learning_rate": 4.1216124577851293e-07, "loss": 0.0165, "step": 7930 }, { "epoch": 2.648522290866589, "grad_norm": 0.37272679841455425, "learning_rate": 4.113889629182083e-07, "loss": 0.015, "step": 7931 }, { "epoch": 2.6488562364334616, "grad_norm": 0.3403401664272049, "learning_rate": 4.106173732216295e-07, "loss": 0.021, "step": 7932 }, { "epoch": 2.649190182000334, "grad_norm": 0.29545181861376985, "learning_rate": 4.0984647680533564e-07, "loss": 0.0136, "step": 7933 }, { "epoch": 2.6495241275672066, "grad_norm": 0.29643565717233555, "learning_rate": 4.090762737857784e-07, "loss": 0.0127, "step": 7934 }, { "epoch": 2.6498580731340793, "grad_norm": 0.4484958160602119, "learning_rate": 4.0830676427930646e-07, "loss": 0.014, "step": 7935 }, { "epoch": 2.6501920187009516, "grad_norm": 0.24193996415630625, "learning_rate": 4.0753794840216296e-07, "loss": 0.0123, "step": 7936 }, { "epoch": 2.6505259642678243, "grad_norm": 0.34522343178556875, "learning_rate": 4.067698262704878e-07, "loss": 0.0194, "step": 7937 }, { "epoch": 2.650859909834697, "grad_norm": 0.2816068497944424, "learning_rate": 4.0600239800031136e-07, "loss": 0.0163, "step": 7938 }, { "epoch": 2.6511938554015697, "grad_norm": 0.28276559725879397, "learning_rate": 4.0523566370756774e-07, "loss": 0.0105, "step": 7939 }, { "epoch": 2.651527800968442, "grad_norm": 0.3014795509320092, "learning_rate": 4.044696235080775e-07, "loss": 0.015, "step": 7940 }, { "epoch": 2.6518617465353147, "grad_norm": 0.2974375921754659, "learning_rate": 4.037042775175626e-07, "loss": 0.0143, "step": 7941 }, { "epoch": 2.6521956921021874, "grad_norm": 0.52780190094282, "learning_rate": 4.0293962585163493e-07, "loss": 0.0218, "step": 7942 }, { "epoch": 2.65252963766906, "grad_norm": 0.25706312676563936, "learning_rate": 4.02175668625806e-07, "loss": 0.0113, "step": 7943 }, { "epoch": 2.6528635832359324, "grad_norm": 0.2235437248344753, "learning_rate": 4.014124059554786e-07, "loss": 0.0126, "step": 7944 }, { "epoch": 2.653197528802805, "grad_norm": 0.2884744355343741, "learning_rate": 4.006498379559559e-07, "loss": 0.0126, "step": 7945 }, { "epoch": 2.653531474369678, "grad_norm": 0.2805849343560099, "learning_rate": 3.9988796474242977e-07, "loss": 0.0143, "step": 7946 }, { "epoch": 2.65386541993655, "grad_norm": 0.27667167315839414, "learning_rate": 3.9912678642999134e-07, "loss": 0.0159, "step": 7947 }, { "epoch": 2.654199365503423, "grad_norm": 0.2572554077178099, "learning_rate": 3.983663031336249e-07, "loss": 0.0106, "step": 7948 }, { "epoch": 2.6545333110702956, "grad_norm": 0.2528709531305157, "learning_rate": 3.976065149682112e-07, "loss": 0.0102, "step": 7949 }, { "epoch": 2.6548672566371683, "grad_norm": 0.23314522901235568, "learning_rate": 3.968474220485252e-07, "loss": 0.0129, "step": 7950 }, { "epoch": 2.655201202204041, "grad_norm": 0.4715283509730433, "learning_rate": 3.960890244892362e-07, "loss": 0.0247, "step": 7951 }, { "epoch": 2.6555351477709133, "grad_norm": 0.252285031994945, "learning_rate": 3.953313224049099e-07, "loss": 0.0123, "step": 7952 }, { "epoch": 2.655869093337786, "grad_norm": 0.3095174305167158, "learning_rate": 3.945743159100046e-07, "loss": 0.0087, "step": 7953 }, { "epoch": 2.6562030389046587, "grad_norm": 0.28143247101252206, "learning_rate": 3.938180051188756e-07, "loss": 0.0145, "step": 7954 }, { "epoch": 2.656536984471531, "grad_norm": 0.26371545852487455, "learning_rate": 3.930623901457736e-07, "loss": 0.0111, "step": 7955 }, { "epoch": 2.6568709300384037, "grad_norm": 0.32299396913893724, "learning_rate": 3.92307471104843e-07, "loss": 0.0142, "step": 7956 }, { "epoch": 2.6572048756052764, "grad_norm": 0.2540323397540815, "learning_rate": 3.915532481101225e-07, "loss": 0.0128, "step": 7957 }, { "epoch": 2.6575388211721487, "grad_norm": 0.3154603263706469, "learning_rate": 3.9079972127554657e-07, "loss": 0.0176, "step": 7958 }, { "epoch": 2.6578727667390214, "grad_norm": 0.29047185577927653, "learning_rate": 3.9004689071494406e-07, "loss": 0.0144, "step": 7959 }, { "epoch": 2.658206712305894, "grad_norm": 0.25129909334646433, "learning_rate": 3.8929475654203963e-07, "loss": 0.0111, "step": 7960 }, { "epoch": 2.658540657872767, "grad_norm": 0.3390049623505517, "learning_rate": 3.8854331887045016e-07, "loss": 0.0158, "step": 7961 }, { "epoch": 2.6588746034396396, "grad_norm": 0.2527729454463676, "learning_rate": 3.877925778136921e-07, "loss": 0.0143, "step": 7962 }, { "epoch": 2.659208549006512, "grad_norm": 0.3583623696241767, "learning_rate": 3.870425334851713e-07, "loss": 0.0205, "step": 7963 }, { "epoch": 2.6595424945733845, "grad_norm": 0.33013999254450654, "learning_rate": 3.8629318599819224e-07, "loss": 0.0208, "step": 7964 }, { "epoch": 2.6598764401402573, "grad_norm": 0.23859842709351678, "learning_rate": 3.855445354659515e-07, "loss": 0.0128, "step": 7965 }, { "epoch": 2.6602103857071295, "grad_norm": 0.2781061623677838, "learning_rate": 3.847965820015426e-07, "loss": 0.0138, "step": 7966 }, { "epoch": 2.6605443312740022, "grad_norm": 0.2925399044881, "learning_rate": 3.8404932571795115e-07, "loss": 0.0116, "step": 7967 }, { "epoch": 2.660878276840875, "grad_norm": 0.31565697847506213, "learning_rate": 3.833027667280614e-07, "loss": 0.0144, "step": 7968 }, { "epoch": 2.6612122224077477, "grad_norm": 0.31843999255394795, "learning_rate": 3.825569051446476e-07, "loss": 0.01, "step": 7969 }, { "epoch": 2.6615461679746204, "grad_norm": 0.2959583423166002, "learning_rate": 3.8181174108038286e-07, "loss": 0.0121, "step": 7970 }, { "epoch": 2.6618801135414927, "grad_norm": 0.31047100804786, "learning_rate": 3.810672746478317e-07, "loss": 0.0151, "step": 7971 }, { "epoch": 2.6622140591083654, "grad_norm": 0.3295440901634914, "learning_rate": 3.803235059594551e-07, "loss": 0.0189, "step": 7972 }, { "epoch": 2.662548004675238, "grad_norm": 0.3180453972385154, "learning_rate": 3.795804351276072e-07, "loss": 0.0173, "step": 7973 }, { "epoch": 2.6628819502421104, "grad_norm": 0.2767718062908812, "learning_rate": 3.788380622645382e-07, "loss": 0.0161, "step": 7974 }, { "epoch": 2.663215895808983, "grad_norm": 0.2735517432168516, "learning_rate": 3.780963874823934e-07, "loss": 0.0132, "step": 7975 }, { "epoch": 2.663549841375856, "grad_norm": 0.32422867861538396, "learning_rate": 3.773554108932093e-07, "loss": 0.0152, "step": 7976 }, { "epoch": 2.663883786942728, "grad_norm": 0.37788016034916155, "learning_rate": 3.7661513260892067e-07, "loss": 0.0158, "step": 7977 }, { "epoch": 2.664217732509601, "grad_norm": 0.3500274988992972, "learning_rate": 3.7587555274135544e-07, "loss": 0.0198, "step": 7978 }, { "epoch": 2.6645516780764735, "grad_norm": 0.37085429756111626, "learning_rate": 3.751366714022342e-07, "loss": 0.019, "step": 7979 }, { "epoch": 2.6648856236433462, "grad_norm": 0.25528020499862125, "learning_rate": 3.7439848870317487e-07, "loss": 0.0118, "step": 7980 }, { "epoch": 2.665219569210219, "grad_norm": 0.3816040838362776, "learning_rate": 3.7366100475568935e-07, "loss": 0.022, "step": 7981 }, { "epoch": 2.6655535147770912, "grad_norm": 0.7436971589685613, "learning_rate": 3.7292421967118185e-07, "loss": 0.0178, "step": 7982 }, { "epoch": 2.665887460343964, "grad_norm": 0.26221711860801566, "learning_rate": 3.72188133560954e-07, "loss": 0.013, "step": 7983 }, { "epoch": 2.6662214059108367, "grad_norm": 0.29950999992828703, "learning_rate": 3.7145274653619776e-07, "loss": 0.0136, "step": 7984 }, { "epoch": 2.666555351477709, "grad_norm": 0.21266047381305353, "learning_rate": 3.7071805870800395e-07, "loss": 0.0088, "step": 7985 }, { "epoch": 2.6668892970445817, "grad_norm": 0.30667339393285914, "learning_rate": 3.6998407018735525e-07, "loss": 0.0127, "step": 7986 }, { "epoch": 2.6672232426114544, "grad_norm": 0.33277628709798035, "learning_rate": 3.6925078108513033e-07, "loss": 0.0222, "step": 7987 }, { "epoch": 2.667557188178327, "grad_norm": 0.3070737249695833, "learning_rate": 3.6851819151209947e-07, "loss": 0.0096, "step": 7988 }, { "epoch": 2.6678911337451994, "grad_norm": 0.34693941534621314, "learning_rate": 3.677863015789307e-07, "loss": 0.0129, "step": 7989 }, { "epoch": 2.668225079312072, "grad_norm": 0.24484842749232208, "learning_rate": 3.6705511139618177e-07, "loss": 0.0096, "step": 7990 }, { "epoch": 2.668559024878945, "grad_norm": 0.3031856005710634, "learning_rate": 3.66324621074311e-07, "loss": 0.0182, "step": 7991 }, { "epoch": 2.6688929704458175, "grad_norm": 0.2435595058431979, "learning_rate": 3.6559483072366506e-07, "loss": 0.0104, "step": 7992 }, { "epoch": 2.66922691601269, "grad_norm": 0.26620775820370635, "learning_rate": 3.6486574045448973e-07, "loss": 0.0141, "step": 7993 }, { "epoch": 2.6695608615795625, "grad_norm": 0.24179455274565803, "learning_rate": 3.6413735037691966e-07, "loss": 0.0087, "step": 7994 }, { "epoch": 2.6698948071464352, "grad_norm": 0.30278456733418374, "learning_rate": 3.634096606009896e-07, "loss": 0.014, "step": 7995 }, { "epoch": 2.6702287527133075, "grad_norm": 0.2328723905753084, "learning_rate": 3.626826712366233e-07, "loss": 0.0116, "step": 7996 }, { "epoch": 2.6705626982801802, "grad_norm": 0.2423226347129979, "learning_rate": 3.6195638239364225e-07, "loss": 0.0106, "step": 7997 }, { "epoch": 2.670896643847053, "grad_norm": 0.2854525409037026, "learning_rate": 3.612307941817622e-07, "loss": 0.0146, "step": 7998 }, { "epoch": 2.6712305894139257, "grad_norm": 0.2186770992320193, "learning_rate": 3.605059067105887e-07, "loss": 0.0092, "step": 7999 }, { "epoch": 2.6715645349807984, "grad_norm": 0.3058158622552652, "learning_rate": 3.59781720089627e-07, "loss": 0.014, "step": 8000 }, { "epoch": 2.6718984805476707, "grad_norm": 0.34223709014788484, "learning_rate": 3.5905823442827393e-07, "loss": 0.0137, "step": 8001 }, { "epoch": 2.6722324261145434, "grad_norm": 0.30259500138328077, "learning_rate": 3.583354498358188e-07, "loss": 0.0157, "step": 8002 }, { "epoch": 2.672566371681416, "grad_norm": 0.22158030938206486, "learning_rate": 3.576133664214476e-07, "loss": 0.0097, "step": 8003 }, { "epoch": 2.6729003172482884, "grad_norm": 0.2649250336288251, "learning_rate": 3.568919842942409e-07, "loss": 0.012, "step": 8004 }, { "epoch": 2.673234262815161, "grad_norm": 0.2637569150240111, "learning_rate": 3.5617130356316977e-07, "loss": 0.0102, "step": 8005 }, { "epoch": 2.673568208382034, "grad_norm": 0.30541122925855263, "learning_rate": 3.554513243371038e-07, "loss": 0.016, "step": 8006 }, { "epoch": 2.673902153948906, "grad_norm": 0.27997727959432517, "learning_rate": 3.5473204672480224e-07, "loss": 0.0157, "step": 8007 }, { "epoch": 2.674236099515779, "grad_norm": 0.30358337684296777, "learning_rate": 3.5401347083492077e-07, "loss": 0.0124, "step": 8008 }, { "epoch": 2.6745700450826515, "grad_norm": 0.25635752574494675, "learning_rate": 3.532955967760093e-07, "loss": 0.0093, "step": 8009 }, { "epoch": 2.674903990649524, "grad_norm": 0.2681677235283582, "learning_rate": 3.5257842465651226e-07, "loss": 0.0118, "step": 8010 }, { "epoch": 2.675237936216397, "grad_norm": 0.28356099347718106, "learning_rate": 3.5186195458476515e-07, "loss": 0.0122, "step": 8011 }, { "epoch": 2.675571881783269, "grad_norm": 0.2990796627631461, "learning_rate": 3.5114618666900023e-07, "loss": 0.0121, "step": 8012 }, { "epoch": 2.675905827350142, "grad_norm": 0.29134438124978296, "learning_rate": 3.5043112101734166e-07, "loss": 0.0132, "step": 8013 }, { "epoch": 2.6762397729170146, "grad_norm": 0.22071778531624958, "learning_rate": 3.4971675773780913e-07, "loss": 0.0109, "step": 8014 }, { "epoch": 2.676573718483887, "grad_norm": 0.2844547768955682, "learning_rate": 3.490030969383157e-07, "loss": 0.0162, "step": 8015 }, { "epoch": 2.6769076640507596, "grad_norm": 0.31511100856874213, "learning_rate": 3.482901387266685e-07, "loss": 0.0139, "step": 8016 }, { "epoch": 2.6772416096176324, "grad_norm": 0.22112655020495478, "learning_rate": 3.475778832105681e-07, "loss": 0.0108, "step": 8017 }, { "epoch": 2.677575555184505, "grad_norm": 0.3212531174791497, "learning_rate": 3.468663304976089e-07, "loss": 0.0172, "step": 8018 }, { "epoch": 2.677909500751378, "grad_norm": 0.26223578531047076, "learning_rate": 3.4615548069527883e-07, "loss": 0.011, "step": 8019 }, { "epoch": 2.67824344631825, "grad_norm": 0.36040554371455963, "learning_rate": 3.4544533391096093e-07, "loss": 0.0136, "step": 8020 }, { "epoch": 2.678577391885123, "grad_norm": 0.4739960398196577, "learning_rate": 3.4473589025193155e-07, "loss": 0.0216, "step": 8021 }, { "epoch": 2.6789113374519955, "grad_norm": 0.3343868088008449, "learning_rate": 3.440271498253589e-07, "loss": 0.0134, "step": 8022 }, { "epoch": 2.6792452830188678, "grad_norm": 0.31885474021106164, "learning_rate": 3.433191127383079e-07, "loss": 0.0138, "step": 8023 }, { "epoch": 2.6795792285857405, "grad_norm": 0.21750168149864377, "learning_rate": 3.4261177909773624e-07, "loss": 0.0098, "step": 8024 }, { "epoch": 2.679913174152613, "grad_norm": 0.2439737161928737, "learning_rate": 3.419051490104935e-07, "loss": 0.0086, "step": 8025 }, { "epoch": 2.6802471197194855, "grad_norm": 0.2600754845704635, "learning_rate": 3.4119922258332496e-07, "loss": 0.0118, "step": 8026 }, { "epoch": 2.680581065286358, "grad_norm": 0.22698588542062947, "learning_rate": 3.4049399992287067e-07, "loss": 0.0072, "step": 8027 }, { "epoch": 2.680915010853231, "grad_norm": 0.2628552445714976, "learning_rate": 3.3978948113566056e-07, "loss": 0.011, "step": 8028 }, { "epoch": 2.6812489564201036, "grad_norm": 0.3248050397345352, "learning_rate": 3.390856663281228e-07, "loss": 0.0158, "step": 8029 }, { "epoch": 2.6815829019869764, "grad_norm": 0.2863939964227281, "learning_rate": 3.3838255560657453e-07, "loss": 0.012, "step": 8030 }, { "epoch": 2.6819168475538486, "grad_norm": 0.3037715014542918, "learning_rate": 3.3768014907722966e-07, "loss": 0.0145, "step": 8031 }, { "epoch": 2.6822507931207213, "grad_norm": 0.2524322097338231, "learning_rate": 3.369784468461956e-07, "loss": 0.0097, "step": 8032 }, { "epoch": 2.682584738687594, "grad_norm": 0.3118841681447238, "learning_rate": 3.3627744901947313e-07, "loss": 0.0142, "step": 8033 }, { "epoch": 2.6829186842544663, "grad_norm": 0.310488698328538, "learning_rate": 3.3557715570295523e-07, "loss": 0.0141, "step": 8034 }, { "epoch": 2.683252629821339, "grad_norm": 0.32455197075888503, "learning_rate": 3.3487756700243014e-07, "loss": 0.0156, "step": 8035 }, { "epoch": 2.6835865753882118, "grad_norm": 0.3049481263981558, "learning_rate": 3.341786830235777e-07, "loss": 0.0149, "step": 8036 }, { "epoch": 2.6839205209550845, "grad_norm": 0.27165623449050624, "learning_rate": 3.334805038719735e-07, "loss": 0.0097, "step": 8037 }, { "epoch": 2.6842544665219568, "grad_norm": 0.3288765366179696, "learning_rate": 3.3278302965308593e-07, "loss": 0.0115, "step": 8038 }, { "epoch": 2.6845884120888295, "grad_norm": 0.28492294659238515, "learning_rate": 3.3208626047227687e-07, "loss": 0.0186, "step": 8039 }, { "epoch": 2.684922357655702, "grad_norm": 0.2755663138797607, "learning_rate": 3.313901964348004e-07, "loss": 0.0127, "step": 8040 }, { "epoch": 2.685256303222575, "grad_norm": 0.32871736218643544, "learning_rate": 3.306948376458069e-07, "loss": 0.0188, "step": 8041 }, { "epoch": 2.685590248789447, "grad_norm": 0.3093111892613027, "learning_rate": 3.3000018421033675e-07, "loss": 0.0153, "step": 8042 }, { "epoch": 2.68592419435632, "grad_norm": 0.2796736650506178, "learning_rate": 3.29306236233326e-07, "loss": 0.0144, "step": 8043 }, { "epoch": 2.6862581399231926, "grad_norm": 0.31501408410236686, "learning_rate": 3.286129938196048e-07, "loss": 0.0093, "step": 8044 }, { "epoch": 2.686592085490065, "grad_norm": 0.28540574385417367, "learning_rate": 3.279204570738936e-07, "loss": 0.0154, "step": 8045 }, { "epoch": 2.6869260310569376, "grad_norm": 0.2695178465695714, "learning_rate": 3.272286261008095e-07, "loss": 0.0107, "step": 8046 }, { "epoch": 2.6872599766238103, "grad_norm": 0.2998989004523264, "learning_rate": 3.2653750100486213e-07, "loss": 0.0103, "step": 8047 }, { "epoch": 2.687593922190683, "grad_norm": 0.2582947189414363, "learning_rate": 3.25847081890453e-07, "loss": 0.0116, "step": 8048 }, { "epoch": 2.6879278677575558, "grad_norm": 0.24954918084620384, "learning_rate": 3.251573688618781e-07, "loss": 0.0131, "step": 8049 }, { "epoch": 2.688261813324428, "grad_norm": 0.2927417683379775, "learning_rate": 3.2446836202332854e-07, "loss": 0.0122, "step": 8050 }, { "epoch": 2.6885957588913008, "grad_norm": 0.2691385750291096, "learning_rate": 3.237800614788844e-07, "loss": 0.0134, "step": 8051 }, { "epoch": 2.6889297044581735, "grad_norm": 0.2973712814192687, "learning_rate": 3.230924673325231e-07, "loss": 0.0133, "step": 8052 }, { "epoch": 2.6892636500250457, "grad_norm": 0.2490960157513933, "learning_rate": 3.2240557968811315e-07, "loss": 0.0099, "step": 8053 }, { "epoch": 2.6895975955919185, "grad_norm": 0.3191588454744281, "learning_rate": 3.217193986494177e-07, "loss": 0.0234, "step": 8054 }, { "epoch": 2.689931541158791, "grad_norm": 0.25023599468160346, "learning_rate": 3.2103392432009105e-07, "loss": 0.0106, "step": 8055 }, { "epoch": 2.6902654867256635, "grad_norm": 0.28204142857925146, "learning_rate": 3.203491568036843e-07, "loss": 0.0116, "step": 8056 }, { "epoch": 2.690599432292536, "grad_norm": 0.2734569800461845, "learning_rate": 3.196650962036374e-07, "loss": 0.014, "step": 8057 }, { "epoch": 2.690933377859409, "grad_norm": 0.39120310053416835, "learning_rate": 3.189817426232883e-07, "loss": 0.0169, "step": 8058 }, { "epoch": 2.6912673234262816, "grad_norm": 0.3281417840344649, "learning_rate": 3.182990961658633e-07, "loss": 0.017, "step": 8059 }, { "epoch": 2.6916012689931543, "grad_norm": 0.3200298450914653, "learning_rate": 3.1761715693448546e-07, "loss": 0.0132, "step": 8060 }, { "epoch": 2.6919352145600266, "grad_norm": 0.27028344663247766, "learning_rate": 3.1693592503216795e-07, "loss": 0.0104, "step": 8061 }, { "epoch": 2.6922691601268993, "grad_norm": 0.2550863765463541, "learning_rate": 3.162554005618218e-07, "loss": 0.0125, "step": 8062 }, { "epoch": 2.692603105693772, "grad_norm": 0.24732052208510927, "learning_rate": 3.155755836262464e-07, "loss": 0.0132, "step": 8063 }, { "epoch": 2.6929370512606443, "grad_norm": 0.33277953986162934, "learning_rate": 3.148964743281363e-07, "loss": 0.0208, "step": 8064 }, { "epoch": 2.693270996827517, "grad_norm": 0.2689153505510985, "learning_rate": 3.1421807277007885e-07, "loss": 0.018, "step": 8065 }, { "epoch": 2.6936049423943897, "grad_norm": 0.24414343837281255, "learning_rate": 3.1354037905455547e-07, "loss": 0.0131, "step": 8066 }, { "epoch": 2.6939388879612625, "grad_norm": 0.28762204133327557, "learning_rate": 3.1286339328393755e-07, "loss": 0.0239, "step": 8067 }, { "epoch": 2.694272833528135, "grad_norm": 0.24201984860907172, "learning_rate": 3.1218711556049494e-07, "loss": 0.0082, "step": 8068 }, { "epoch": 2.6946067790950075, "grad_norm": 0.27568309089650767, "learning_rate": 3.115115459863849e-07, "loss": 0.0095, "step": 8069 }, { "epoch": 2.69494072466188, "grad_norm": 0.3442774595777744, "learning_rate": 3.108366846636618e-07, "loss": 0.0213, "step": 8070 }, { "epoch": 2.695274670228753, "grad_norm": 0.3006104524297572, "learning_rate": 3.101625316942697e-07, "loss": 0.0156, "step": 8071 }, { "epoch": 2.695608615795625, "grad_norm": 0.33543481218275956, "learning_rate": 3.094890871800488e-07, "loss": 0.0229, "step": 8072 }, { "epoch": 2.695942561362498, "grad_norm": 0.2712218291717025, "learning_rate": 3.0881635122273047e-07, "loss": 0.0164, "step": 8073 }, { "epoch": 2.6962765069293706, "grad_norm": 0.25249449195822743, "learning_rate": 3.0814432392393847e-07, "loss": 0.0116, "step": 8074 }, { "epoch": 2.696610452496243, "grad_norm": 0.2302066994843045, "learning_rate": 3.074730053851921e-07, "loss": 0.008, "step": 8075 }, { "epoch": 2.6969443980631156, "grad_norm": 0.23432412131902264, "learning_rate": 3.068023957078997e-07, "loss": 0.0106, "step": 8076 }, { "epoch": 2.6972783436299883, "grad_norm": 0.24369353617110326, "learning_rate": 3.061324949933675e-07, "loss": 0.0099, "step": 8077 }, { "epoch": 2.697612289196861, "grad_norm": 0.3668128503923368, "learning_rate": 3.054633033427884e-07, "loss": 0.0151, "step": 8078 }, { "epoch": 2.6979462347637337, "grad_norm": 0.3131536970729404, "learning_rate": 3.0479482085725545e-07, "loss": 0.0178, "step": 8079 }, { "epoch": 2.698280180330606, "grad_norm": 0.37327430626818686, "learning_rate": 3.0412704763774836e-07, "loss": 0.0212, "step": 8080 }, { "epoch": 2.6986141258974787, "grad_norm": 0.27930249318555983, "learning_rate": 3.034599837851432e-07, "loss": 0.0141, "step": 8081 }, { "epoch": 2.6989480714643515, "grad_norm": 0.2976113162848265, "learning_rate": 3.027936294002071e-07, "loss": 0.0136, "step": 8082 }, { "epoch": 2.6992820170312237, "grad_norm": 0.35091913577245337, "learning_rate": 3.021279845836017e-07, "loss": 0.0155, "step": 8083 }, { "epoch": 2.6996159625980964, "grad_norm": 0.2765008982831201, "learning_rate": 3.0146304943587833e-07, "loss": 0.0167, "step": 8084 }, { "epoch": 2.699949908164969, "grad_norm": 0.26195630891909977, "learning_rate": 3.007988240574866e-07, "loss": 0.014, "step": 8085 }, { "epoch": 2.7002838537318414, "grad_norm": 0.29485613821089995, "learning_rate": 3.0013530854876296e-07, "loss": 0.0167, "step": 8086 }, { "epoch": 2.700617799298714, "grad_norm": 0.3045324367854926, "learning_rate": 2.9947250300994046e-07, "loss": 0.0181, "step": 8087 }, { "epoch": 2.700951744865587, "grad_norm": 0.20798518186480916, "learning_rate": 2.98810407541143e-07, "loss": 0.0092, "step": 8088 }, { "epoch": 2.7012856904324596, "grad_norm": 0.2512070057497568, "learning_rate": 2.9814902224238886e-07, "loss": 0.0084, "step": 8089 }, { "epoch": 2.7016196359993323, "grad_norm": 0.30290084768448594, "learning_rate": 2.974883472135859e-07, "loss": 0.011, "step": 8090 }, { "epoch": 2.7019535815662046, "grad_norm": 0.2897283208639351, "learning_rate": 2.968283825545398e-07, "loss": 0.0141, "step": 8091 }, { "epoch": 2.7022875271330773, "grad_norm": 0.26225589520719345, "learning_rate": 2.961691283649437e-07, "loss": 0.011, "step": 8092 }, { "epoch": 2.70262147269995, "grad_norm": 0.29711547066990696, "learning_rate": 2.955105847443873e-07, "loss": 0.0136, "step": 8093 }, { "epoch": 2.7029554182668223, "grad_norm": 0.2745414877389005, "learning_rate": 2.9485275179235e-07, "loss": 0.0118, "step": 8094 }, { "epoch": 2.703289363833695, "grad_norm": 0.2873513473148231, "learning_rate": 2.9419562960820656e-07, "loss": 0.0217, "step": 8095 }, { "epoch": 2.7036233094005677, "grad_norm": 0.2553205630656262, "learning_rate": 2.9353921829122167e-07, "loss": 0.0154, "step": 8096 }, { "epoch": 2.7039572549674404, "grad_norm": 0.2285208586329596, "learning_rate": 2.928835179405548e-07, "loss": 0.008, "step": 8097 }, { "epoch": 2.704291200534313, "grad_norm": 0.25691285277924664, "learning_rate": 2.922285286552579e-07, "loss": 0.0105, "step": 8098 }, { "epoch": 2.7046251461011854, "grad_norm": 0.24433197907471793, "learning_rate": 2.915742505342728e-07, "loss": 0.0139, "step": 8099 }, { "epoch": 2.704959091668058, "grad_norm": 0.34876670402277193, "learning_rate": 2.9092068367643776e-07, "loss": 0.0189, "step": 8100 }, { "epoch": 2.705293037234931, "grad_norm": 0.31597055136604885, "learning_rate": 2.902678281804805e-07, "loss": 0.0087, "step": 8101 }, { "epoch": 2.705626982801803, "grad_norm": 0.20953360657229994, "learning_rate": 2.896156841450232e-07, "loss": 0.0093, "step": 8102 }, { "epoch": 2.705960928368676, "grad_norm": 0.2502107260103645, "learning_rate": 2.8896425166857976e-07, "loss": 0.0101, "step": 8103 }, { "epoch": 2.7062948739355486, "grad_norm": 0.3900922235048903, "learning_rate": 2.8831353084955717e-07, "loss": 0.0143, "step": 8104 }, { "epoch": 2.706628819502421, "grad_norm": 0.29621728324110896, "learning_rate": 2.8766352178625387e-07, "loss": 0.015, "step": 8105 }, { "epoch": 2.7069627650692936, "grad_norm": 0.2883050242164719, "learning_rate": 2.87014224576862e-07, "loss": 0.0169, "step": 8106 }, { "epoch": 2.7072967106361663, "grad_norm": 0.32317815155902635, "learning_rate": 2.863656393194636e-07, "loss": 0.0105, "step": 8107 }, { "epoch": 2.707630656203039, "grad_norm": 0.22040945097768017, "learning_rate": 2.8571776611203804e-07, "loss": 0.0096, "step": 8108 }, { "epoch": 2.7079646017699117, "grad_norm": 0.48663802471115114, "learning_rate": 2.850706050524521e-07, "loss": 0.0146, "step": 8109 }, { "epoch": 2.708298547336784, "grad_norm": 0.2628339874374742, "learning_rate": 2.844241562384686e-07, "loss": 0.0132, "step": 8110 }, { "epoch": 2.7086324929036567, "grad_norm": 0.2806529826266646, "learning_rate": 2.8377841976773955e-07, "loss": 0.0128, "step": 8111 }, { "epoch": 2.7089664384705294, "grad_norm": 0.2933929171767404, "learning_rate": 2.83133395737813e-07, "loss": 0.0123, "step": 8112 }, { "epoch": 2.7093003840374017, "grad_norm": 0.4455140012227884, "learning_rate": 2.824890842461242e-07, "loss": 0.0301, "step": 8113 }, { "epoch": 2.7096343296042744, "grad_norm": 0.2858646834365565, "learning_rate": 2.818454853900082e-07, "loss": 0.0112, "step": 8114 }, { "epoch": 2.709968275171147, "grad_norm": 0.2761482203608089, "learning_rate": 2.8120259926668505e-07, "loss": 0.0163, "step": 8115 }, { "epoch": 2.71030222073802, "grad_norm": 0.35001589040383363, "learning_rate": 2.8056042597327196e-07, "loss": 0.0193, "step": 8116 }, { "epoch": 2.7106361663048926, "grad_norm": 0.27105912949837924, "learning_rate": 2.799189656067758e-07, "loss": 0.0104, "step": 8117 }, { "epoch": 2.710970111871765, "grad_norm": 0.24695272701899895, "learning_rate": 2.792782182640974e-07, "loss": 0.0138, "step": 8118 }, { "epoch": 2.7113040574386376, "grad_norm": 0.2774423352284431, "learning_rate": 2.7863818404202823e-07, "loss": 0.0161, "step": 8119 }, { "epoch": 2.7116380030055103, "grad_norm": 0.3021631042135275, "learning_rate": 2.7799886303725376e-07, "loss": 0.0134, "step": 8120 }, { "epoch": 2.7119719485723826, "grad_norm": 0.3236457801146371, "learning_rate": 2.7736025534635115e-07, "loss": 0.0106, "step": 8121 }, { "epoch": 2.7123058941392553, "grad_norm": 0.23251390289212942, "learning_rate": 2.767223610657888e-07, "loss": 0.0103, "step": 8122 }, { "epoch": 2.712639839706128, "grad_norm": 0.32552163691099706, "learning_rate": 2.7608518029192897e-07, "loss": 0.0153, "step": 8123 }, { "epoch": 2.7129737852730003, "grad_norm": 0.346564248573981, "learning_rate": 2.7544871312102485e-07, "loss": 0.0157, "step": 8124 }, { "epoch": 2.713307730839873, "grad_norm": 0.2567624866081163, "learning_rate": 2.7481295964922216e-07, "loss": 0.0107, "step": 8125 }, { "epoch": 2.7136416764067457, "grad_norm": 0.253078640282921, "learning_rate": 2.7417791997255916e-07, "loss": 0.0104, "step": 8126 }, { "epoch": 2.7139756219736184, "grad_norm": 0.3255809496823744, "learning_rate": 2.735435941869663e-07, "loss": 0.0145, "step": 8127 }, { "epoch": 2.714309567540491, "grad_norm": 0.24491376851842311, "learning_rate": 2.7290998238826584e-07, "loss": 0.0095, "step": 8128 }, { "epoch": 2.7146435131073634, "grad_norm": 0.3531616503885645, "learning_rate": 2.7227708467217227e-07, "loss": 0.0183, "step": 8129 }, { "epoch": 2.714977458674236, "grad_norm": 0.2397817905060915, "learning_rate": 2.71644901134292e-07, "loss": 0.0102, "step": 8130 }, { "epoch": 2.715311404241109, "grad_norm": 0.4127895105540195, "learning_rate": 2.7101343187012354e-07, "loss": 0.0125, "step": 8131 }, { "epoch": 2.715645349807981, "grad_norm": 0.28047852679318064, "learning_rate": 2.7038267697505894e-07, "loss": 0.0145, "step": 8132 }, { "epoch": 2.715979295374854, "grad_norm": 0.24697138421435968, "learning_rate": 2.697526365443803e-07, "loss": 0.0128, "step": 8133 }, { "epoch": 2.7163132409417265, "grad_norm": 0.31241607663246995, "learning_rate": 2.691233106732627e-07, "loss": 0.015, "step": 8134 }, { "epoch": 2.716647186508599, "grad_norm": 0.2832482923302587, "learning_rate": 2.684946994567733e-07, "loss": 0.0136, "step": 8135 }, { "epoch": 2.7169811320754715, "grad_norm": 0.27857070735171374, "learning_rate": 2.678668029898712e-07, "loss": 0.0113, "step": 8136 }, { "epoch": 2.7173150776423443, "grad_norm": 0.2229044900955066, "learning_rate": 2.672396213674072e-07, "loss": 0.0119, "step": 8137 }, { "epoch": 2.717649023209217, "grad_norm": 0.20152071027655438, "learning_rate": 2.66613154684125e-07, "loss": 0.0091, "step": 8138 }, { "epoch": 2.7179829687760897, "grad_norm": 0.2938025169107522, "learning_rate": 2.659874030346604e-07, "loss": 0.017, "step": 8139 }, { "epoch": 2.718316914342962, "grad_norm": 0.35397618109201595, "learning_rate": 2.653623665135391e-07, "loss": 0.0153, "step": 8140 }, { "epoch": 2.7186508599098347, "grad_norm": 0.2778299889738381, "learning_rate": 2.6473804521518097e-07, "loss": 0.0122, "step": 8141 }, { "epoch": 2.7189848054767074, "grad_norm": 0.21914334159016047, "learning_rate": 2.641144392338968e-07, "loss": 0.0078, "step": 8142 }, { "epoch": 2.7193187510435797, "grad_norm": 0.25109535212329154, "learning_rate": 2.6349154866389e-07, "loss": 0.0114, "step": 8143 }, { "epoch": 2.7196526966104524, "grad_norm": 0.3195333006679426, "learning_rate": 2.6286937359925545e-07, "loss": 0.011, "step": 8144 }, { "epoch": 2.719986642177325, "grad_norm": 0.19721292850911923, "learning_rate": 2.622479141339801e-07, "loss": 0.0084, "step": 8145 }, { "epoch": 2.720320587744198, "grad_norm": 0.2693325560794091, "learning_rate": 2.6162717036194274e-07, "loss": 0.012, "step": 8146 }, { "epoch": 2.7206545333110705, "grad_norm": 0.2868955195589291, "learning_rate": 2.610071423769128e-07, "loss": 0.0119, "step": 8147 }, { "epoch": 2.720988478877943, "grad_norm": 0.3962024079719642, "learning_rate": 2.603878302725543e-07, "loss": 0.0236, "step": 8148 }, { "epoch": 2.7213224244448155, "grad_norm": 0.2405097653399732, "learning_rate": 2.5976923414242126e-07, "loss": 0.0105, "step": 8149 }, { "epoch": 2.7216563700116883, "grad_norm": 0.2654105029663251, "learning_rate": 2.5915135407996005e-07, "loss": 0.0113, "step": 8150 }, { "epoch": 2.7219903155785605, "grad_norm": 0.2233171349241094, "learning_rate": 2.585341901785082e-07, "loss": 0.0113, "step": 8151 }, { "epoch": 2.7223242611454332, "grad_norm": 0.3385804106149172, "learning_rate": 2.579177425312962e-07, "loss": 0.0137, "step": 8152 }, { "epoch": 2.722658206712306, "grad_norm": 0.25119578119040376, "learning_rate": 2.5730201123144503e-07, "loss": 0.0135, "step": 8153 }, { "epoch": 2.7229921522791782, "grad_norm": 0.33930815236777434, "learning_rate": 2.566869963719681e-07, "loss": 0.0135, "step": 8154 }, { "epoch": 2.723326097846051, "grad_norm": 0.3114077082281841, "learning_rate": 2.5607269804577174e-07, "loss": 0.0125, "step": 8155 }, { "epoch": 2.7236600434129237, "grad_norm": 0.2592412741302571, "learning_rate": 2.5545911634565266e-07, "loss": 0.0088, "step": 8156 }, { "epoch": 2.7239939889797964, "grad_norm": 0.282059793097031, "learning_rate": 2.5484625136429854e-07, "loss": 0.0118, "step": 8157 }, { "epoch": 2.724327934546669, "grad_norm": 0.27502578295242364, "learning_rate": 2.5423410319429075e-07, "loss": 0.0113, "step": 8158 }, { "epoch": 2.7246618801135414, "grad_norm": 0.28663454986505266, "learning_rate": 2.5362267192810095e-07, "loss": 0.013, "step": 8159 }, { "epoch": 2.724995825680414, "grad_norm": 0.3061127537372279, "learning_rate": 2.530119576580936e-07, "loss": 0.0128, "step": 8160 }, { "epoch": 2.725329771247287, "grad_norm": 0.2231746098020713, "learning_rate": 2.5240196047652377e-07, "loss": 0.0083, "step": 8161 }, { "epoch": 2.725663716814159, "grad_norm": 0.2902111367982422, "learning_rate": 2.5179268047553937e-07, "loss": 0.0147, "step": 8162 }, { "epoch": 2.725997662381032, "grad_norm": 0.29400139303921846, "learning_rate": 2.5118411774717857e-07, "loss": 0.0125, "step": 8163 }, { "epoch": 2.7263316079479045, "grad_norm": 0.31005771623133305, "learning_rate": 2.5057627238337324e-07, "loss": 0.0144, "step": 8164 }, { "epoch": 2.7266655535147772, "grad_norm": 0.2824070007399706, "learning_rate": 2.4996914447594334e-07, "loss": 0.0185, "step": 8165 }, { "epoch": 2.72699949908165, "grad_norm": 0.3182981688916357, "learning_rate": 2.493627341166044e-07, "loss": 0.0144, "step": 8166 }, { "epoch": 2.7273334446485222, "grad_norm": 0.2449660151149918, "learning_rate": 2.48757041396962e-07, "loss": 0.011, "step": 8167 }, { "epoch": 2.727667390215395, "grad_norm": 0.26265010743791456, "learning_rate": 2.481520664085113e-07, "loss": 0.0103, "step": 8168 }, { "epoch": 2.7280013357822677, "grad_norm": 0.29750084853772446, "learning_rate": 2.4754780924264366e-07, "loss": 0.015, "step": 8169 }, { "epoch": 2.72833528134914, "grad_norm": 0.24564357049346475, "learning_rate": 2.4694426999063657e-07, "loss": 0.0088, "step": 8170 }, { "epoch": 2.7286692269160127, "grad_norm": 0.28270023167926145, "learning_rate": 2.463414487436633e-07, "loss": 0.007, "step": 8171 }, { "epoch": 2.7290031724828854, "grad_norm": 0.2535544268708272, "learning_rate": 2.4573934559278646e-07, "loss": 0.0101, "step": 8172 }, { "epoch": 2.7293371180497576, "grad_norm": 0.20093188441982038, "learning_rate": 2.4513796062896166e-07, "loss": 0.0078, "step": 8173 }, { "epoch": 2.7296710636166304, "grad_norm": 0.3142522270328027, "learning_rate": 2.4453729394303404e-07, "loss": 0.0104, "step": 8174 }, { "epoch": 2.730005009183503, "grad_norm": 0.3166882283710865, "learning_rate": 2.439373456257427e-07, "loss": 0.0115, "step": 8175 }, { "epoch": 2.730338954750376, "grad_norm": 0.29365553858337456, "learning_rate": 2.433381157677156e-07, "loss": 0.0132, "step": 8176 }, { "epoch": 2.7306729003172485, "grad_norm": 0.24433303266143705, "learning_rate": 2.427396044594743e-07, "loss": 0.0104, "step": 8177 }, { "epoch": 2.731006845884121, "grad_norm": 0.6027711421808059, "learning_rate": 2.421418117914298e-07, "loss": 0.0144, "step": 8178 }, { "epoch": 2.7313407914509935, "grad_norm": 0.2897866679069052, "learning_rate": 2.415447378538871e-07, "loss": 0.0106, "step": 8179 }, { "epoch": 2.7316747370178662, "grad_norm": 0.40784173706742166, "learning_rate": 2.409483827370407e-07, "loss": 0.023, "step": 8180 }, { "epoch": 2.7320086825847385, "grad_norm": 0.3133242765929012, "learning_rate": 2.4035274653097797e-07, "loss": 0.0151, "step": 8181 }, { "epoch": 2.732342628151611, "grad_norm": 0.379545256249535, "learning_rate": 2.3975782932567473e-07, "loss": 0.0286, "step": 8182 }, { "epoch": 2.732676573718484, "grad_norm": 0.274653596333824, "learning_rate": 2.391636312110024e-07, "loss": 0.0098, "step": 8183 }, { "epoch": 2.733010519285356, "grad_norm": 0.24565840561989097, "learning_rate": 2.385701522767192e-07, "loss": 0.0114, "step": 8184 }, { "epoch": 2.733344464852229, "grad_norm": 0.32159704736477485, "learning_rate": 2.3797739261247955e-07, "loss": 0.0168, "step": 8185 }, { "epoch": 2.7336784104191016, "grad_norm": 0.3453941063010027, "learning_rate": 2.3738535230782568e-07, "loss": 0.0112, "step": 8186 }, { "epoch": 2.7340123559859744, "grad_norm": 0.25889253912849375, "learning_rate": 2.3679403145219214e-07, "loss": 0.0111, "step": 8187 }, { "epoch": 2.734346301552847, "grad_norm": 0.3782274519653321, "learning_rate": 2.362034301349053e-07, "loss": 0.0103, "step": 8188 }, { "epoch": 2.7346802471197194, "grad_norm": 0.328646201769715, "learning_rate": 2.3561354844518157e-07, "loss": 0.0194, "step": 8189 }, { "epoch": 2.735014192686592, "grad_norm": 0.29036599921858414, "learning_rate": 2.3502438647213132e-07, "loss": 0.0143, "step": 8190 }, { "epoch": 2.735348138253465, "grad_norm": 0.23070780893248077, "learning_rate": 2.3443594430475224e-07, "loss": 0.0079, "step": 8191 }, { "epoch": 2.735682083820337, "grad_norm": 0.24094828888252745, "learning_rate": 2.3384822203193714e-07, "loss": 0.0099, "step": 8192 }, { "epoch": 2.73601602938721, "grad_norm": 0.2792369213107012, "learning_rate": 2.332612197424672e-07, "loss": 0.0118, "step": 8193 }, { "epoch": 2.7363499749540825, "grad_norm": 0.3373230905145829, "learning_rate": 2.32674937525017e-07, "loss": 0.0164, "step": 8194 }, { "epoch": 2.736683920520955, "grad_norm": 0.28816021991983115, "learning_rate": 2.3208937546815026e-07, "loss": 0.0142, "step": 8195 }, { "epoch": 2.737017866087828, "grad_norm": 0.3125144546949365, "learning_rate": 2.3150453366032445e-07, "loss": 0.0138, "step": 8196 }, { "epoch": 2.7373518116547, "grad_norm": 0.3136773891008518, "learning_rate": 2.309204121898856e-07, "loss": 0.0127, "step": 8197 }, { "epoch": 2.737685757221573, "grad_norm": 0.28118525928695043, "learning_rate": 2.3033701114507313e-07, "loss": 0.0113, "step": 8198 }, { "epoch": 2.7380197027884456, "grad_norm": 0.36287240362798984, "learning_rate": 2.2975433061401541e-07, "loss": 0.0128, "step": 8199 }, { "epoch": 2.738353648355318, "grad_norm": 0.3175172402768893, "learning_rate": 2.2917237068473484e-07, "loss": 0.017, "step": 8200 }, { "epoch": 2.7386875939221906, "grad_norm": 0.3288060226239171, "learning_rate": 2.2859113144514055e-07, "loss": 0.012, "step": 8201 }, { "epoch": 2.7390215394890634, "grad_norm": 0.2694604123570534, "learning_rate": 2.2801061298303895e-07, "loss": 0.0121, "step": 8202 }, { "epoch": 2.7393554850559356, "grad_norm": 0.23336597870360848, "learning_rate": 2.2743081538612154e-07, "loss": 0.0108, "step": 8203 }, { "epoch": 2.7396894306228083, "grad_norm": 0.22539188471782481, "learning_rate": 2.268517387419761e-07, "loss": 0.0092, "step": 8204 }, { "epoch": 2.740023376189681, "grad_norm": 0.34108942387662355, "learning_rate": 2.2627338313807645e-07, "loss": 0.014, "step": 8205 }, { "epoch": 2.7403573217565538, "grad_norm": 0.2880143722050247, "learning_rate": 2.2569574866179166e-07, "loss": 0.0146, "step": 8206 }, { "epoch": 2.7406912673234265, "grad_norm": 0.2861275846293823, "learning_rate": 2.2511883540037805e-07, "loss": 0.0151, "step": 8207 }, { "epoch": 2.7410252128902988, "grad_norm": 0.21978870075720008, "learning_rate": 2.2454264344098865e-07, "loss": 0.0081, "step": 8208 }, { "epoch": 2.7413591584571715, "grad_norm": 0.28266386451179687, "learning_rate": 2.2396717287066106e-07, "loss": 0.0127, "step": 8209 }, { "epoch": 2.741693104024044, "grad_norm": 0.2519643336463548, "learning_rate": 2.233924237763291e-07, "loss": 0.0101, "step": 8210 }, { "epoch": 2.7420270495909165, "grad_norm": 0.27988364118866643, "learning_rate": 2.2281839624481328e-07, "loss": 0.0141, "step": 8211 }, { "epoch": 2.742360995157789, "grad_norm": 0.20551095327571595, "learning_rate": 2.222450903628287e-07, "loss": 0.009, "step": 8212 }, { "epoch": 2.742694940724662, "grad_norm": 0.2921509515606145, "learning_rate": 2.2167250621697944e-07, "loss": 0.0151, "step": 8213 }, { "epoch": 2.7430288862915346, "grad_norm": 0.2811275891975004, "learning_rate": 2.2110064389376017e-07, "loss": 0.0128, "step": 8214 }, { "epoch": 2.7433628318584073, "grad_norm": 0.3089409785322832, "learning_rate": 2.205295034795596e-07, "loss": 0.011, "step": 8215 }, { "epoch": 2.7436967774252796, "grad_norm": 0.23001989701544498, "learning_rate": 2.1995908506065366e-07, "loss": 0.0069, "step": 8216 }, { "epoch": 2.7440307229921523, "grad_norm": 0.273604365015591, "learning_rate": 2.1938938872321014e-07, "loss": 0.0105, "step": 8217 }, { "epoch": 2.744364668559025, "grad_norm": 0.2765553120627356, "learning_rate": 2.1882041455329073e-07, "loss": 0.0154, "step": 8218 }, { "epoch": 2.7446986141258973, "grad_norm": 0.35516951383829043, "learning_rate": 2.1825216263684336e-07, "loss": 0.0189, "step": 8219 }, { "epoch": 2.74503255969277, "grad_norm": 0.32217286641570436, "learning_rate": 2.176846330597099e-07, "loss": 0.0158, "step": 8220 }, { "epoch": 2.7453665052596428, "grad_norm": 0.24849044606753853, "learning_rate": 2.1711782590762344e-07, "loss": 0.0075, "step": 8221 }, { "epoch": 2.745700450826515, "grad_norm": 0.20754128360930993, "learning_rate": 2.165517412662055e-07, "loss": 0.0085, "step": 8222 }, { "epoch": 2.7460343963933878, "grad_norm": 0.3109369589059883, "learning_rate": 2.1598637922097098e-07, "loss": 0.0165, "step": 8223 }, { "epoch": 2.7463683419602605, "grad_norm": 0.2596669166669222, "learning_rate": 2.1542173985732274e-07, "loss": 0.0135, "step": 8224 }, { "epoch": 2.746702287527133, "grad_norm": 0.28923075375522145, "learning_rate": 2.148578232605575e-07, "loss": 0.0144, "step": 8225 }, { "epoch": 2.747036233094006, "grad_norm": 0.2157993011104292, "learning_rate": 2.14294629515861e-07, "loss": 0.0084, "step": 8226 }, { "epoch": 2.747370178660878, "grad_norm": 0.2542783390431188, "learning_rate": 2.137321587083119e-07, "loss": 0.0095, "step": 8227 }, { "epoch": 2.747704124227751, "grad_norm": 0.2739989612947763, "learning_rate": 2.1317041092287548e-07, "loss": 0.01, "step": 8228 }, { "epoch": 2.7480380697946236, "grad_norm": 0.33830015955136855, "learning_rate": 2.126093862444123e-07, "loss": 0.0214, "step": 8229 }, { "epoch": 2.748372015361496, "grad_norm": 0.273366251968806, "learning_rate": 2.1204908475767005e-07, "loss": 0.0134, "step": 8230 }, { "epoch": 2.7487059609283686, "grad_norm": 0.2641925398859872, "learning_rate": 2.114895065472905e-07, "loss": 0.0106, "step": 8231 }, { "epoch": 2.7490399064952413, "grad_norm": 0.5610458409108607, "learning_rate": 2.109306516978038e-07, "loss": 0.0159, "step": 8232 }, { "epoch": 2.7493738520621136, "grad_norm": 0.25527657542575755, "learning_rate": 2.1037252029363242e-07, "loss": 0.0114, "step": 8233 }, { "epoch": 2.7497077976289863, "grad_norm": 0.23889412036671726, "learning_rate": 2.098151124190867e-07, "loss": 0.0103, "step": 8234 }, { "epoch": 2.750041743195859, "grad_norm": 0.37063197825375, "learning_rate": 2.092584281583715e-07, "loss": 0.019, "step": 8235 }, { "epoch": 2.7503756887627318, "grad_norm": 0.31912294717825307, "learning_rate": 2.0870246759557956e-07, "loss": 0.0132, "step": 8236 }, { "epoch": 2.7507096343296045, "grad_norm": 0.2984038361409962, "learning_rate": 2.0814723081469535e-07, "loss": 0.0172, "step": 8237 }, { "epoch": 2.7510435798964767, "grad_norm": 0.31061195911274414, "learning_rate": 2.0759271789959513e-07, "loss": 0.0136, "step": 8238 }, { "epoch": 2.7513775254633495, "grad_norm": 0.310034734249356, "learning_rate": 2.0703892893404299e-07, "loss": 0.0175, "step": 8239 }, { "epoch": 2.751711471030222, "grad_norm": 0.3638576227274786, "learning_rate": 2.064858640016959e-07, "loss": 0.0114, "step": 8240 }, { "epoch": 2.7520454165970945, "grad_norm": 0.3131498484811339, "learning_rate": 2.0593352318610093e-07, "loss": 0.0115, "step": 8241 }, { "epoch": 2.752379362163967, "grad_norm": 0.23521264084117519, "learning_rate": 2.0538190657069523e-07, "loss": 0.0098, "step": 8242 }, { "epoch": 2.75271330773084, "grad_norm": 0.3490876568458798, "learning_rate": 2.048310142388077e-07, "loss": 0.0195, "step": 8243 }, { "epoch": 2.7530472532977126, "grad_norm": 0.22969879460114054, "learning_rate": 2.0428084627365729e-07, "loss": 0.0099, "step": 8244 }, { "epoch": 2.7533811988645853, "grad_norm": 0.25860538536446154, "learning_rate": 2.0373140275835203e-07, "loss": 0.0116, "step": 8245 }, { "epoch": 2.7537151444314576, "grad_norm": 0.23864379334135657, "learning_rate": 2.0318268377589323e-07, "loss": 0.0088, "step": 8246 }, { "epoch": 2.7540490899983303, "grad_norm": 0.31311993559159723, "learning_rate": 2.026346894091702e-07, "loss": 0.012, "step": 8247 }, { "epoch": 2.754383035565203, "grad_norm": 0.32681308038560614, "learning_rate": 2.0208741974096445e-07, "loss": 0.0104, "step": 8248 }, { "epoch": 2.7547169811320753, "grad_norm": 0.2744665529596475, "learning_rate": 2.0154087485394713e-07, "loss": 0.0112, "step": 8249 }, { "epoch": 2.755050926698948, "grad_norm": 0.2119489886910582, "learning_rate": 2.0099505483068216e-07, "loss": 0.0101, "step": 8250 }, { "epoch": 2.7553848722658207, "grad_norm": 0.29936244161533654, "learning_rate": 2.0044995975361914e-07, "loss": 0.0173, "step": 8251 }, { "epoch": 2.755718817832693, "grad_norm": 0.3434641407558833, "learning_rate": 1.9990558970510388e-07, "loss": 0.018, "step": 8252 }, { "epoch": 2.7560527633995657, "grad_norm": 0.2883160995611944, "learning_rate": 1.9936194476736782e-07, "loss": 0.0159, "step": 8253 }, { "epoch": 2.7563867089664384, "grad_norm": 0.2768632663561537, "learning_rate": 1.9881902502253525e-07, "loss": 0.0094, "step": 8254 }, { "epoch": 2.756720654533311, "grad_norm": 0.4145861037047996, "learning_rate": 1.9827683055262114e-07, "loss": 0.0217, "step": 8255 }, { "epoch": 2.757054600100184, "grad_norm": 0.22037173957492298, "learning_rate": 1.977353614395311e-07, "loss": 0.0076, "step": 8256 }, { "epoch": 2.757388545667056, "grad_norm": 0.23478814974502749, "learning_rate": 1.971946177650591e-07, "loss": 0.0079, "step": 8257 }, { "epoch": 2.757722491233929, "grad_norm": 0.3040970763032444, "learning_rate": 1.966545996108915e-07, "loss": 0.0106, "step": 8258 }, { "epoch": 2.7580564368008016, "grad_norm": 0.30878299409882476, "learning_rate": 1.961153070586036e-07, "loss": 0.0146, "step": 8259 }, { "epoch": 2.758390382367674, "grad_norm": 0.4017240722856722, "learning_rate": 1.9557674018966244e-07, "loss": 0.0187, "step": 8260 }, { "epoch": 2.7587243279345466, "grad_norm": 0.2859867777005715, "learning_rate": 1.9503889908542572e-07, "loss": 0.0124, "step": 8261 }, { "epoch": 2.7590582735014193, "grad_norm": 0.3568003795269254, "learning_rate": 1.9450178382713957e-07, "loss": 0.0175, "step": 8262 }, { "epoch": 2.759392219068292, "grad_norm": 0.28907602310094616, "learning_rate": 1.9396539449594131e-07, "loss": 0.0116, "step": 8263 }, { "epoch": 2.7597261646351647, "grad_norm": 0.2261150234283839, "learning_rate": 1.9342973117286056e-07, "loss": 0.0115, "step": 8264 }, { "epoch": 2.760060110202037, "grad_norm": 0.3483782701719299, "learning_rate": 1.9289479393881317e-07, "loss": 0.0137, "step": 8265 }, { "epoch": 2.7603940557689097, "grad_norm": 0.2950855371660985, "learning_rate": 1.9236058287460946e-07, "loss": 0.0128, "step": 8266 }, { "epoch": 2.7607280013357824, "grad_norm": 0.3326006143018145, "learning_rate": 1.9182709806094823e-07, "loss": 0.0205, "step": 8267 }, { "epoch": 2.7610619469026547, "grad_norm": 0.24293641324088883, "learning_rate": 1.9129433957841781e-07, "loss": 0.0128, "step": 8268 }, { "epoch": 2.7613958924695274, "grad_norm": 0.26490891658974086, "learning_rate": 1.907623075074988e-07, "loss": 0.0104, "step": 8269 }, { "epoch": 2.7617298380364, "grad_norm": 0.3232274630483029, "learning_rate": 1.9023100192855914e-07, "loss": 0.015, "step": 8270 }, { "epoch": 2.7620637836032724, "grad_norm": 0.28624041980764076, "learning_rate": 1.897004229218602e-07, "loss": 0.0114, "step": 8271 }, { "epoch": 2.762397729170145, "grad_norm": 0.3475303207976439, "learning_rate": 1.8917057056755172e-07, "loss": 0.0129, "step": 8272 }, { "epoch": 2.762731674737018, "grad_norm": 0.21519354891121584, "learning_rate": 1.8864144494567528e-07, "loss": 0.0087, "step": 8273 }, { "epoch": 2.7630656203038906, "grad_norm": 0.3212635662342531, "learning_rate": 1.881130461361591e-07, "loss": 0.0132, "step": 8274 }, { "epoch": 2.7633995658707633, "grad_norm": 0.20233210592967804, "learning_rate": 1.8758537421882662e-07, "loss": 0.0088, "step": 8275 }, { "epoch": 2.7637335114376356, "grad_norm": 0.29612235037874146, "learning_rate": 1.870584292733868e-07, "loss": 0.017, "step": 8276 }, { "epoch": 2.7640674570045083, "grad_norm": 0.36608550574456183, "learning_rate": 1.8653221137944155e-07, "loss": 0.0174, "step": 8277 }, { "epoch": 2.764401402571381, "grad_norm": 0.24274056332917915, "learning_rate": 1.8600672061648283e-07, "loss": 0.0091, "step": 8278 }, { "epoch": 2.7647353481382533, "grad_norm": 0.37905160620084416, "learning_rate": 1.8548195706389272e-07, "loss": 0.0274, "step": 8279 }, { "epoch": 2.765069293705126, "grad_norm": 0.2027616527742171, "learning_rate": 1.849579208009411e-07, "loss": 0.0079, "step": 8280 }, { "epoch": 2.7654032392719987, "grad_norm": 0.29833224591697277, "learning_rate": 1.844346119067919e-07, "loss": 0.0125, "step": 8281 }, { "epoch": 2.765737184838871, "grad_norm": 0.2802195270286133, "learning_rate": 1.8391203046049522e-07, "loss": 0.0135, "step": 8282 }, { "epoch": 2.7660711304057437, "grad_norm": 0.24459429299538646, "learning_rate": 1.8339017654099344e-07, "loss": 0.0099, "step": 8283 }, { "epoch": 2.7664050759726164, "grad_norm": 0.3075248922555339, "learning_rate": 1.828690502271202e-07, "loss": 0.0111, "step": 8284 }, { "epoch": 2.766739021539489, "grad_norm": 0.284368159402573, "learning_rate": 1.823486515975964e-07, "loss": 0.0195, "step": 8285 }, { "epoch": 2.767072967106362, "grad_norm": 0.2786709877146374, "learning_rate": 1.818289807310347e-07, "loss": 0.0125, "step": 8286 }, { "epoch": 2.767406912673234, "grad_norm": 0.33666991223234466, "learning_rate": 1.813100377059379e-07, "loss": 0.0138, "step": 8287 }, { "epoch": 2.767740858240107, "grad_norm": 0.2952515007051211, "learning_rate": 1.8079182260069773e-07, "loss": 0.0115, "step": 8288 }, { "epoch": 2.7680748038069796, "grad_norm": 0.19962157741548053, "learning_rate": 1.8027433549359764e-07, "loss": 0.0065, "step": 8289 }, { "epoch": 2.768408749373852, "grad_norm": 0.3311269764807552, "learning_rate": 1.7975757646280955e-07, "loss": 0.0137, "step": 8290 }, { "epoch": 2.7687426949407246, "grad_norm": 0.26512061831780587, "learning_rate": 1.792415455863955e-07, "loss": 0.014, "step": 8291 }, { "epoch": 2.7690766405075973, "grad_norm": 0.316513736875226, "learning_rate": 1.7872624294230924e-07, "loss": 0.0155, "step": 8292 }, { "epoch": 2.76941058607447, "grad_norm": 0.3049738888738994, "learning_rate": 1.7821166860839179e-07, "loss": 0.0166, "step": 8293 }, { "epoch": 2.7697445316413427, "grad_norm": 0.34716067903135284, "learning_rate": 1.7769782266237767e-07, "loss": 0.0089, "step": 8294 }, { "epoch": 2.770078477208215, "grad_norm": 0.2752261525444329, "learning_rate": 1.7718470518188645e-07, "loss": 0.0127, "step": 8295 }, { "epoch": 2.7704124227750877, "grad_norm": 0.2474051824336063, "learning_rate": 1.7667231624443393e-07, "loss": 0.0103, "step": 8296 }, { "epoch": 2.7707463683419604, "grad_norm": 0.2671973771982285, "learning_rate": 1.7616065592742038e-07, "loss": 0.0135, "step": 8297 }, { "epoch": 2.7710803139088327, "grad_norm": 0.2664023577613685, "learning_rate": 1.7564972430813899e-07, "loss": 0.0128, "step": 8298 }, { "epoch": 2.7714142594757054, "grad_norm": 0.2864257505428321, "learning_rate": 1.751395214637708e-07, "loss": 0.0169, "step": 8299 }, { "epoch": 2.771748205042578, "grad_norm": 0.23575076178032306, "learning_rate": 1.7463004747138967e-07, "loss": 0.0102, "step": 8300 }, { "epoch": 2.7720821506094504, "grad_norm": 0.2798314314257989, "learning_rate": 1.7412130240795578e-07, "loss": 0.0122, "step": 8301 }, { "epoch": 2.772416096176323, "grad_norm": 0.2858984309856811, "learning_rate": 1.736132863503226e-07, "loss": 0.0116, "step": 8302 }, { "epoch": 2.772750041743196, "grad_norm": 0.28838342882239176, "learning_rate": 1.7310599937523153e-07, "loss": 0.0119, "step": 8303 }, { "epoch": 2.7730839873100686, "grad_norm": 0.31156847304098106, "learning_rate": 1.7259944155931407e-07, "loss": 0.0133, "step": 8304 }, { "epoch": 2.7734179328769413, "grad_norm": 0.27535895094586527, "learning_rate": 1.720936129790912e-07, "loss": 0.0144, "step": 8305 }, { "epoch": 2.7737518784438135, "grad_norm": 0.25413450315704256, "learning_rate": 1.7158851371097518e-07, "loss": 0.0088, "step": 8306 }, { "epoch": 2.7740858240106863, "grad_norm": 0.2909353055050728, "learning_rate": 1.7108414383126658e-07, "loss": 0.0151, "step": 8307 }, { "epoch": 2.774419769577559, "grad_norm": 0.25360047465185825, "learning_rate": 1.7058050341615783e-07, "loss": 0.0088, "step": 8308 }, { "epoch": 2.7747537151444313, "grad_norm": 0.3036242447424197, "learning_rate": 1.7007759254172752e-07, "loss": 0.0177, "step": 8309 }, { "epoch": 2.775087660711304, "grad_norm": 0.26463018184094694, "learning_rate": 1.6957541128394817e-07, "loss": 0.0095, "step": 8310 }, { "epoch": 2.7754216062781767, "grad_norm": 0.3597683368456448, "learning_rate": 1.6907395971867858e-07, "loss": 0.0164, "step": 8311 }, { "epoch": 2.7757555518450494, "grad_norm": 0.28661055365270777, "learning_rate": 1.685732379216698e-07, "loss": 0.0145, "step": 8312 }, { "epoch": 2.776089497411922, "grad_norm": 0.24566557974280834, "learning_rate": 1.680732459685619e-07, "loss": 0.0092, "step": 8313 }, { "epoch": 2.7764234429787944, "grad_norm": 0.3819991270581732, "learning_rate": 1.6757398393488443e-07, "loss": 0.0295, "step": 8314 }, { "epoch": 2.776757388545667, "grad_norm": 0.25257389005476655, "learning_rate": 1.6707545189605657e-07, "loss": 0.0113, "step": 8315 }, { "epoch": 2.77709133411254, "grad_norm": 0.2692351510514966, "learning_rate": 1.6657764992738746e-07, "loss": 0.0116, "step": 8316 }, { "epoch": 2.777425279679412, "grad_norm": 0.2621873406998039, "learning_rate": 1.6608057810407586e-07, "loss": 0.0129, "step": 8317 }, { "epoch": 2.777759225246285, "grad_norm": 0.2199847937029809, "learning_rate": 1.6558423650121003e-07, "loss": 0.0069, "step": 8318 }, { "epoch": 2.7780931708131575, "grad_norm": 0.32149850230755, "learning_rate": 1.6508862519376945e-07, "loss": 0.0155, "step": 8319 }, { "epoch": 2.77842711638003, "grad_norm": 0.25761239616973197, "learning_rate": 1.6459374425662088e-07, "loss": 0.0105, "step": 8320 }, { "epoch": 2.7787610619469025, "grad_norm": 0.23574463132175358, "learning_rate": 1.6409959376452289e-07, "loss": 0.0095, "step": 8321 }, { "epoch": 2.7790950075137753, "grad_norm": 0.24001499632074175, "learning_rate": 1.6360617379212185e-07, "loss": 0.0111, "step": 8322 }, { "epoch": 2.779428953080648, "grad_norm": 0.3145800959517236, "learning_rate": 1.6311348441395535e-07, "loss": 0.0163, "step": 8323 }, { "epoch": 2.7797628986475207, "grad_norm": 0.2788190083968446, "learning_rate": 1.6262152570444777e-07, "loss": 0.0146, "step": 8324 }, { "epoch": 2.780096844214393, "grad_norm": 0.2697233591058573, "learning_rate": 1.6213029773791912e-07, "loss": 0.014, "step": 8325 }, { "epoch": 2.7804307897812657, "grad_norm": 0.35953557372609307, "learning_rate": 1.6163980058857164e-07, "loss": 0.0128, "step": 8326 }, { "epoch": 2.7807647353481384, "grad_norm": 0.4353401983976432, "learning_rate": 1.6115003433050336e-07, "loss": 0.0189, "step": 8327 }, { "epoch": 2.7810986809150107, "grad_norm": 0.23561925217782875, "learning_rate": 1.6066099903769726e-07, "loss": 0.0112, "step": 8328 }, { "epoch": 2.7814326264818834, "grad_norm": 0.20941101105095522, "learning_rate": 1.6017269478402875e-07, "loss": 0.0098, "step": 8329 }, { "epoch": 2.781766572048756, "grad_norm": 0.32137760512333763, "learning_rate": 1.59685121643261e-07, "loss": 0.0177, "step": 8330 }, { "epoch": 2.7821005176156284, "grad_norm": 0.263350231782085, "learning_rate": 1.5919827968904955e-07, "loss": 0.0082, "step": 8331 }, { "epoch": 2.782434463182501, "grad_norm": 0.27889321678011675, "learning_rate": 1.5871216899493612e-07, "loss": 0.0136, "step": 8332 }, { "epoch": 2.782768408749374, "grad_norm": 0.26586255032676426, "learning_rate": 1.5822678963435479e-07, "loss": 0.0201, "step": 8333 }, { "epoch": 2.7831023543162465, "grad_norm": 0.30107242446010524, "learning_rate": 1.5774214168062575e-07, "loss": 0.0167, "step": 8334 }, { "epoch": 2.7834362998831192, "grad_norm": 0.299668277910782, "learning_rate": 1.5725822520696267e-07, "loss": 0.0173, "step": 8335 }, { "epoch": 2.7837702454499915, "grad_norm": 0.3040714664780066, "learning_rate": 1.567750402864654e-07, "loss": 0.0106, "step": 8336 }, { "epoch": 2.7841041910168642, "grad_norm": 0.31414191753188925, "learning_rate": 1.5629258699212613e-07, "loss": 0.0155, "step": 8337 }, { "epoch": 2.784438136583737, "grad_norm": 0.2652252331317683, "learning_rate": 1.5581086539682433e-07, "loss": 0.0094, "step": 8338 }, { "epoch": 2.7847720821506092, "grad_norm": 0.2914617850460002, "learning_rate": 1.5532987557332902e-07, "loss": 0.0135, "step": 8339 }, { "epoch": 2.785106027717482, "grad_norm": 0.2576522483730419, "learning_rate": 1.5484961759430095e-07, "loss": 0.011, "step": 8340 }, { "epoch": 2.7854399732843547, "grad_norm": 0.35448494208890885, "learning_rate": 1.5437009153228766e-07, "loss": 0.0115, "step": 8341 }, { "epoch": 2.7857739188512274, "grad_norm": 0.24914656785217557, "learning_rate": 1.538912974597273e-07, "loss": 0.0101, "step": 8342 }, { "epoch": 2.7861078644181, "grad_norm": 0.2527460071849557, "learning_rate": 1.5341323544894758e-07, "loss": 0.0092, "step": 8343 }, { "epoch": 2.7864418099849724, "grad_norm": 0.28201016644945903, "learning_rate": 1.5293590557216577e-07, "loss": 0.0156, "step": 8344 }, { "epoch": 2.786775755551845, "grad_norm": 0.28782282799149767, "learning_rate": 1.5245930790148743e-07, "loss": 0.0106, "step": 8345 }, { "epoch": 2.787109701118718, "grad_norm": 0.23980872769715128, "learning_rate": 1.5198344250890894e-07, "loss": 0.0095, "step": 8346 }, { "epoch": 2.78744364668559, "grad_norm": 0.24121466281769527, "learning_rate": 1.515083094663139e-07, "loss": 0.01, "step": 8347 }, { "epoch": 2.787777592252463, "grad_norm": 0.2464676825272029, "learning_rate": 1.5103390884547931e-07, "loss": 0.0118, "step": 8348 }, { "epoch": 2.7881115378193355, "grad_norm": 0.344368993388975, "learning_rate": 1.5056024071806674e-07, "loss": 0.0178, "step": 8349 }, { "epoch": 2.788445483386208, "grad_norm": 0.42110579895533357, "learning_rate": 1.5008730515563064e-07, "loss": 0.0392, "step": 8350 }, { "epoch": 2.7887794289530805, "grad_norm": 0.25930592309704714, "learning_rate": 1.4961510222961216e-07, "loss": 0.0134, "step": 8351 }, { "epoch": 2.7891133745199532, "grad_norm": 0.20907074816868176, "learning_rate": 1.4914363201134486e-07, "loss": 0.0075, "step": 8352 }, { "epoch": 2.789447320086826, "grad_norm": 0.3429588995847818, "learning_rate": 1.4867289457204726e-07, "loss": 0.0255, "step": 8353 }, { "epoch": 2.7897812656536987, "grad_norm": 0.28556945714494547, "learning_rate": 1.4820288998283304e-07, "loss": 0.0138, "step": 8354 }, { "epoch": 2.790115211220571, "grad_norm": 0.22833053998156072, "learning_rate": 1.477336183146999e-07, "loss": 0.0077, "step": 8355 }, { "epoch": 2.7904491567874437, "grad_norm": 0.3207524916382697, "learning_rate": 1.4726507963853776e-07, "loss": 0.0093, "step": 8356 }, { "epoch": 2.7907831023543164, "grad_norm": 0.31784077700257857, "learning_rate": 1.4679727402512334e-07, "loss": 0.0168, "step": 8357 }, { "epoch": 2.7911170479211886, "grad_norm": 0.22468100786292214, "learning_rate": 1.4633020154512677e-07, "loss": 0.0097, "step": 8358 }, { "epoch": 2.7914509934880614, "grad_norm": 0.3174227513116866, "learning_rate": 1.458638622691022e-07, "loss": 0.016, "step": 8359 }, { "epoch": 2.791784939054934, "grad_norm": 0.2639845338570142, "learning_rate": 1.4539825626749715e-07, "loss": 0.0147, "step": 8360 }, { "epoch": 2.792118884621807, "grad_norm": 0.2627020187115576, "learning_rate": 1.4493338361064646e-07, "loss": 0.014, "step": 8361 }, { "epoch": 2.7924528301886795, "grad_norm": 0.2535252270232833, "learning_rate": 1.4446924436877507e-07, "loss": 0.0133, "step": 8362 }, { "epoch": 2.792786775755552, "grad_norm": 0.33633229865249903, "learning_rate": 1.4400583861199636e-07, "loss": 0.0091, "step": 8363 }, { "epoch": 2.7931207213224245, "grad_norm": 0.24451771841418674, "learning_rate": 1.4354316641031263e-07, "loss": 0.0132, "step": 8364 }, { "epoch": 2.7934546668892972, "grad_norm": 0.31350688222524326, "learning_rate": 1.4308122783361688e-07, "loss": 0.0128, "step": 8365 }, { "epoch": 2.7937886124561695, "grad_norm": 0.2668994633524597, "learning_rate": 1.4262002295168997e-07, "loss": 0.0121, "step": 8366 }, { "epoch": 2.794122558023042, "grad_norm": 0.2473686699944472, "learning_rate": 1.4215955183420282e-07, "loss": 0.012, "step": 8367 }, { "epoch": 2.794456503589915, "grad_norm": 0.23643030610916663, "learning_rate": 1.4169981455071368e-07, "loss": 0.0185, "step": 8368 }, { "epoch": 2.794790449156787, "grad_norm": 0.25203111907376435, "learning_rate": 1.4124081117067313e-07, "loss": 0.0101, "step": 8369 }, { "epoch": 2.79512439472366, "grad_norm": 0.3042123226518819, "learning_rate": 1.4078254176341788e-07, "loss": 0.0165, "step": 8370 }, { "epoch": 2.7954583402905326, "grad_norm": 0.2805553275281972, "learning_rate": 1.4032500639817426e-07, "loss": 0.0131, "step": 8371 }, { "epoch": 2.7957922858574054, "grad_norm": 0.3299491931551103, "learning_rate": 1.3986820514405973e-07, "loss": 0.0143, "step": 8372 }, { "epoch": 2.796126231424278, "grad_norm": 0.3957722296616314, "learning_rate": 1.394121380700797e-07, "loss": 0.0187, "step": 8373 }, { "epoch": 2.7964601769911503, "grad_norm": 0.3194524644252628, "learning_rate": 1.3895680524512734e-07, "loss": 0.0158, "step": 8374 }, { "epoch": 2.796794122558023, "grad_norm": 0.2865162913913697, "learning_rate": 1.3850220673798655e-07, "loss": 0.019, "step": 8375 }, { "epoch": 2.797128068124896, "grad_norm": 0.3621511730767648, "learning_rate": 1.3804834261732957e-07, "loss": 0.0217, "step": 8376 }, { "epoch": 2.797462013691768, "grad_norm": 0.26980065793884134, "learning_rate": 1.3759521295171773e-07, "loss": 0.014, "step": 8377 }, { "epoch": 2.7977959592586408, "grad_norm": 0.23282010559479888, "learning_rate": 1.3714281780960237e-07, "loss": 0.0132, "step": 8378 }, { "epoch": 2.7981299048255135, "grad_norm": 0.3347537165698293, "learning_rate": 1.366911572593227e-07, "loss": 0.0142, "step": 8379 }, { "epoch": 2.7984638503923858, "grad_norm": 0.31495637518135455, "learning_rate": 1.3624023136910691e-07, "loss": 0.0136, "step": 8380 }, { "epoch": 2.7987977959592585, "grad_norm": 0.25856814855169197, "learning_rate": 1.3579004020707387e-07, "loss": 0.0089, "step": 8381 }, { "epoch": 2.799131741526131, "grad_norm": 0.3036830138780181, "learning_rate": 1.3534058384122862e-07, "loss": 0.0133, "step": 8382 }, { "epoch": 2.799465687093004, "grad_norm": 0.3126728041819368, "learning_rate": 1.3489186233946793e-07, "loss": 0.0171, "step": 8383 }, { "epoch": 2.7997996326598766, "grad_norm": 0.34767110591674416, "learning_rate": 1.3444387576957706e-07, "loss": 0.0183, "step": 8384 }, { "epoch": 2.800133578226749, "grad_norm": 0.28919378287035513, "learning_rate": 1.33996624199228e-07, "loss": 0.0135, "step": 8385 }, { "epoch": 2.8004675237936216, "grad_norm": 0.33563019294314644, "learning_rate": 1.335501076959844e-07, "loss": 0.0155, "step": 8386 }, { "epoch": 2.8008014693604943, "grad_norm": 0.34985493173092885, "learning_rate": 1.331043263272974e-07, "loss": 0.0136, "step": 8387 }, { "epoch": 2.8011354149273666, "grad_norm": 0.3023121399247982, "learning_rate": 1.3265928016050756e-07, "loss": 0.0103, "step": 8388 }, { "epoch": 2.8014693604942393, "grad_norm": 0.29181389524536816, "learning_rate": 1.3221496926284493e-07, "loss": 0.0131, "step": 8389 }, { "epoch": 2.801803306061112, "grad_norm": 0.3205013258648555, "learning_rate": 1.3177139370142755e-07, "loss": 0.0156, "step": 8390 }, { "epoch": 2.8021372516279848, "grad_norm": 0.26342576122319006, "learning_rate": 1.3132855354326236e-07, "loss": 0.0122, "step": 8391 }, { "epoch": 2.8024711971948575, "grad_norm": 0.2924159114380533, "learning_rate": 1.3088644885524637e-07, "loss": 0.0141, "step": 8392 }, { "epoch": 2.8028051427617298, "grad_norm": 0.26825934096652987, "learning_rate": 1.3044507970416398e-07, "loss": 0.0123, "step": 8393 }, { "epoch": 2.8031390883286025, "grad_norm": 0.29890559772657394, "learning_rate": 1.3000444615668906e-07, "loss": 0.0196, "step": 8394 }, { "epoch": 2.803473033895475, "grad_norm": 0.26558573606851443, "learning_rate": 1.2956454827938557e-07, "loss": 0.0119, "step": 8395 }, { "epoch": 2.8038069794623475, "grad_norm": 0.2786752920512263, "learning_rate": 1.291253861387043e-07, "loss": 0.0098, "step": 8396 }, { "epoch": 2.80414092502922, "grad_norm": 0.23928573084074356, "learning_rate": 1.28686959800986e-07, "loss": 0.0088, "step": 8397 }, { "epoch": 2.804474870596093, "grad_norm": 0.3268059382040651, "learning_rate": 1.2824926933246106e-07, "loss": 0.0134, "step": 8398 }, { "epoch": 2.804808816162965, "grad_norm": 0.26133151883908656, "learning_rate": 1.2781231479924606e-07, "loss": 0.011, "step": 8399 }, { "epoch": 2.805142761729838, "grad_norm": 0.2895260831922916, "learning_rate": 1.2737609626734927e-07, "loss": 0.0119, "step": 8400 }, { "epoch": 2.8054767072967106, "grad_norm": 0.3425229516058676, "learning_rate": 1.269406138026663e-07, "loss": 0.0193, "step": 8401 }, { "epoch": 2.8058106528635833, "grad_norm": 0.283686057394506, "learning_rate": 1.2650586747098238e-07, "loss": 0.0138, "step": 8402 }, { "epoch": 2.806144598430456, "grad_norm": 0.2851033035169662, "learning_rate": 1.2607185733797044e-07, "loss": 0.018, "step": 8403 }, { "epoch": 2.8064785439973283, "grad_norm": 0.3138832867948211, "learning_rate": 1.2563858346919365e-07, "loss": 0.0137, "step": 8404 }, { "epoch": 2.806812489564201, "grad_norm": 0.23805023464287908, "learning_rate": 1.2520604593010189e-07, "loss": 0.0139, "step": 8405 }, { "epoch": 2.8071464351310738, "grad_norm": 0.28574374435213823, "learning_rate": 1.247742447860356e-07, "loss": 0.0157, "step": 8406 }, { "epoch": 2.807480380697946, "grad_norm": 0.28526161309306497, "learning_rate": 1.2434318010222434e-07, "loss": 0.0114, "step": 8407 }, { "epoch": 2.8078143262648187, "grad_norm": 0.28554133114268365, "learning_rate": 1.2391285194378433e-07, "loss": 0.0122, "step": 8408 }, { "epoch": 2.8081482718316915, "grad_norm": 0.2679164720587576, "learning_rate": 1.2348326037572244e-07, "loss": 0.0136, "step": 8409 }, { "epoch": 2.808482217398564, "grad_norm": 0.34096941221535726, "learning_rate": 1.2305440546293236e-07, "loss": 0.0177, "step": 8410 }, { "epoch": 2.808816162965437, "grad_norm": 0.23635400336796952, "learning_rate": 1.2262628727019942e-07, "loss": 0.011, "step": 8411 }, { "epoch": 2.809150108532309, "grad_norm": 0.23210019008562568, "learning_rate": 1.221989058621942e-07, "loss": 0.0087, "step": 8412 }, { "epoch": 2.809484054099182, "grad_norm": 0.278626224498374, "learning_rate": 1.2177226130347886e-07, "loss": 0.0125, "step": 8413 }, { "epoch": 2.8098179996660546, "grad_norm": 0.22672558954302258, "learning_rate": 1.21346353658503e-07, "loss": 0.015, "step": 8414 }, { "epoch": 2.810151945232927, "grad_norm": 0.24113637952206665, "learning_rate": 1.209211829916046e-07, "loss": 0.0118, "step": 8415 }, { "epoch": 2.8104858907997996, "grad_norm": 0.30406505669797074, "learning_rate": 1.204967493670106e-07, "loss": 0.0133, "step": 8416 }, { "epoch": 2.8108198363666723, "grad_norm": 0.280063746051557, "learning_rate": 1.2007305284883696e-07, "loss": 0.0112, "step": 8417 }, { "epoch": 2.8111537819335446, "grad_norm": 0.30297807034631186, "learning_rate": 1.1965009350108747e-07, "loss": 0.0271, "step": 8418 }, { "epoch": 2.8114877275004173, "grad_norm": 0.27213332098461923, "learning_rate": 1.1922787138765656e-07, "loss": 0.0112, "step": 8419 }, { "epoch": 2.81182167306729, "grad_norm": 0.26232310304595063, "learning_rate": 1.188063865723238e-07, "loss": 0.0114, "step": 8420 }, { "epoch": 2.8121556186341627, "grad_norm": 0.22527477009169755, "learning_rate": 1.1838563911876155e-07, "loss": 0.0113, "step": 8421 }, { "epoch": 2.8124895642010355, "grad_norm": 0.2809461366900473, "learning_rate": 1.1796562909052734e-07, "loss": 0.0126, "step": 8422 }, { "epoch": 2.8128235097679077, "grad_norm": 0.36457444507388703, "learning_rate": 1.1754635655106928e-07, "loss": 0.021, "step": 8423 }, { "epoch": 2.8131574553347805, "grad_norm": 0.2571348420697469, "learning_rate": 1.1712782156372226e-07, "loss": 0.0121, "step": 8424 }, { "epoch": 2.813491400901653, "grad_norm": 0.2642181664400732, "learning_rate": 1.167100241917124e-07, "loss": 0.0148, "step": 8425 }, { "epoch": 2.8138253464685254, "grad_norm": 0.29227821178334, "learning_rate": 1.1629296449815197e-07, "loss": 0.0128, "step": 8426 }, { "epoch": 2.814159292035398, "grad_norm": 0.22396646696066097, "learning_rate": 1.1587664254604336e-07, "loss": 0.0086, "step": 8427 }, { "epoch": 2.814493237602271, "grad_norm": 0.26553623466395604, "learning_rate": 1.1546105839827626e-07, "loss": 0.0114, "step": 8428 }, { "epoch": 2.814827183169143, "grad_norm": 0.2357038060759402, "learning_rate": 1.150462121176299e-07, "loss": 0.0104, "step": 8429 }, { "epoch": 2.815161128736016, "grad_norm": 0.34500413522697126, "learning_rate": 1.1463210376677192e-07, "loss": 0.0173, "step": 8430 }, { "epoch": 2.8154950743028886, "grad_norm": 0.27684281474371836, "learning_rate": 1.1421873340825729e-07, "loss": 0.012, "step": 8431 }, { "epoch": 2.8158290198697613, "grad_norm": 0.27103118963474787, "learning_rate": 1.1380610110453217e-07, "loss": 0.0138, "step": 8432 }, { "epoch": 2.816162965436634, "grad_norm": 0.24028946798563416, "learning_rate": 1.133942069179278e-07, "loss": 0.0105, "step": 8433 }, { "epoch": 2.8164969110035063, "grad_norm": 0.40271768500581506, "learning_rate": 1.1298305091066664e-07, "loss": 0.0216, "step": 8434 }, { "epoch": 2.816830856570379, "grad_norm": 0.37001445583746134, "learning_rate": 1.1257263314485844e-07, "loss": 0.0158, "step": 8435 }, { "epoch": 2.8171648021372517, "grad_norm": 0.28539301569455333, "learning_rate": 1.1216295368250196e-07, "loss": 0.0149, "step": 8436 }, { "epoch": 2.817498747704124, "grad_norm": 0.26953687510639085, "learning_rate": 1.1175401258548324e-07, "loss": 0.0124, "step": 8437 }, { "epoch": 2.8178326932709967, "grad_norm": 0.2986731476846757, "learning_rate": 1.1134580991557842e-07, "loss": 0.0162, "step": 8438 }, { "epoch": 2.8181666388378694, "grad_norm": 0.24487272593825893, "learning_rate": 1.1093834573445094e-07, "loss": 0.0125, "step": 8439 }, { "epoch": 2.818500584404742, "grad_norm": 0.33134490719741383, "learning_rate": 1.1053162010365326e-07, "loss": 0.0122, "step": 8440 }, { "epoch": 2.818834529971615, "grad_norm": 0.38176389561748614, "learning_rate": 1.1012563308462565e-07, "loss": 0.0239, "step": 8441 }, { "epoch": 2.819168475538487, "grad_norm": 0.24794065109067973, "learning_rate": 1.0972038473869795e-07, "loss": 0.0109, "step": 8442 }, { "epoch": 2.81950242110536, "grad_norm": 0.32262308348965707, "learning_rate": 1.093158751270873e-07, "loss": 0.0151, "step": 8443 }, { "epoch": 2.8198363666722326, "grad_norm": 0.29734626106521483, "learning_rate": 1.0891210431089983e-07, "loss": 0.0106, "step": 8444 }, { "epoch": 2.820170312239105, "grad_norm": 0.28189267524660805, "learning_rate": 1.0850907235112895e-07, "loss": 0.0128, "step": 8445 }, { "epoch": 2.8205042578059776, "grad_norm": 0.2869779936280662, "learning_rate": 1.0810677930865876e-07, "loss": 0.012, "step": 8446 }, { "epoch": 2.8208382033728503, "grad_norm": 0.2901808939607543, "learning_rate": 1.0770522524425898e-07, "loss": 0.0134, "step": 8447 }, { "epoch": 2.8211721489397226, "grad_norm": 0.32842743169031546, "learning_rate": 1.0730441021859106e-07, "loss": 0.0134, "step": 8448 }, { "epoch": 2.8215060945065953, "grad_norm": 0.338455094506942, "learning_rate": 1.0690433429220049e-07, "loss": 0.0144, "step": 8449 }, { "epoch": 2.821840040073468, "grad_norm": 0.27895776404779044, "learning_rate": 1.0650499752552557e-07, "loss": 0.0159, "step": 8450 }, { "epoch": 2.8221739856403407, "grad_norm": 0.2976618575069383, "learning_rate": 1.0610639997888917e-07, "loss": 0.0119, "step": 8451 }, { "epoch": 2.8225079312072134, "grad_norm": 0.25465651607196915, "learning_rate": 1.0570854171250478e-07, "loss": 0.0129, "step": 8452 }, { "epoch": 2.8228418767740857, "grad_norm": 0.3134176234571828, "learning_rate": 1.0531142278647378e-07, "loss": 0.0128, "step": 8453 }, { "epoch": 2.8231758223409584, "grad_norm": 0.27868733224420017, "learning_rate": 1.0491504326078483e-07, "loss": 0.0075, "step": 8454 }, { "epoch": 2.823509767907831, "grad_norm": 0.23699800131791296, "learning_rate": 1.0451940319531728e-07, "loss": 0.0084, "step": 8455 }, { "epoch": 2.8238437134747034, "grad_norm": 0.40073273739297754, "learning_rate": 1.0412450264983609e-07, "loss": 0.0185, "step": 8456 }, { "epoch": 2.824177659041576, "grad_norm": 0.2813315415535806, "learning_rate": 1.0373034168399521e-07, "loss": 0.0207, "step": 8457 }, { "epoch": 2.824511604608449, "grad_norm": 0.23131194956899465, "learning_rate": 1.0333692035733867e-07, "loss": 0.0108, "step": 8458 }, { "epoch": 2.8248455501753216, "grad_norm": 0.4652767329889953, "learning_rate": 1.0294423872929615e-07, "loss": 0.017, "step": 8459 }, { "epoch": 2.8251794957421943, "grad_norm": 0.36143315764560047, "learning_rate": 1.0255229685918744e-07, "loss": 0.0224, "step": 8460 }, { "epoch": 2.8255134413090666, "grad_norm": 0.2974311280656284, "learning_rate": 1.0216109480622017e-07, "loss": 0.0118, "step": 8461 }, { "epoch": 2.8258473868759393, "grad_norm": 1.1702102524851237, "learning_rate": 1.0177063262948927e-07, "loss": 0.014, "step": 8462 }, { "epoch": 2.826181332442812, "grad_norm": 0.3534851980015013, "learning_rate": 1.0138091038797982e-07, "loss": 0.0158, "step": 8463 }, { "epoch": 2.8265152780096843, "grad_norm": 0.2723782456039754, "learning_rate": 1.0099192814056247e-07, "loss": 0.0143, "step": 8464 }, { "epoch": 2.826849223576557, "grad_norm": 0.22216513094016702, "learning_rate": 1.0060368594599856e-07, "loss": 0.012, "step": 8465 }, { "epoch": 2.8271831691434297, "grad_norm": 0.35377554256356336, "learning_rate": 1.002161838629362e-07, "loss": 0.0087, "step": 8466 }, { "epoch": 2.827517114710302, "grad_norm": 0.20115279592806767, "learning_rate": 9.982942194991297e-08, "loss": 0.0066, "step": 8467 }, { "epoch": 2.8278510602771747, "grad_norm": 0.30575556991443903, "learning_rate": 9.94434002653527e-08, "loss": 0.0161, "step": 8468 }, { "epoch": 2.8281850058440474, "grad_norm": 0.25882505482968193, "learning_rate": 9.905811886756933e-08, "loss": 0.0129, "step": 8469 }, { "epoch": 2.82851895141092, "grad_norm": 0.2929484761962619, "learning_rate": 9.867357781476294e-08, "loss": 0.0139, "step": 8470 }, { "epoch": 2.828852896977793, "grad_norm": 0.4300323198699619, "learning_rate": 9.828977716502486e-08, "loss": 0.0144, "step": 8471 }, { "epoch": 2.829186842544665, "grad_norm": 0.2973213778977374, "learning_rate": 9.790671697633092e-08, "loss": 0.0151, "step": 8472 }, { "epoch": 2.829520788111538, "grad_norm": 0.3748368631779452, "learning_rate": 9.752439730654872e-08, "loss": 0.0224, "step": 8473 }, { "epoch": 2.8298547336784106, "grad_norm": 0.3208448130130757, "learning_rate": 9.714281821343041e-08, "loss": 0.0121, "step": 8474 }, { "epoch": 2.830188679245283, "grad_norm": 0.2544573923943401, "learning_rate": 9.676197975461876e-08, "loss": 0.012, "step": 8475 }, { "epoch": 2.8305226248121556, "grad_norm": 0.27098197281819014, "learning_rate": 9.638188198764387e-08, "loss": 0.0119, "step": 8476 }, { "epoch": 2.8308565703790283, "grad_norm": 0.2879514253745916, "learning_rate": 9.600252496992369e-08, "loss": 0.0107, "step": 8477 }, { "epoch": 2.8311905159459005, "grad_norm": 0.2322135051338501, "learning_rate": 9.562390875876515e-08, "loss": 0.0082, "step": 8478 }, { "epoch": 2.8315244615127733, "grad_norm": 0.3014665266822095, "learning_rate": 9.524603341136251e-08, "loss": 0.0145, "step": 8479 }, { "epoch": 2.831858407079646, "grad_norm": 0.24936357592251743, "learning_rate": 9.486889898479734e-08, "loss": 0.0098, "step": 8480 }, { "epoch": 2.8321923526465187, "grad_norm": 0.23670990454764407, "learning_rate": 9.449250553604184e-08, "loss": 0.0093, "step": 8481 }, { "epoch": 2.8325262982133914, "grad_norm": 0.2030634051800307, "learning_rate": 9.41168531219533e-08, "loss": 0.0089, "step": 8482 }, { "epoch": 2.8328602437802637, "grad_norm": 0.32479644427744586, "learning_rate": 9.374194179927909e-08, "loss": 0.0168, "step": 8483 }, { "epoch": 2.8331941893471364, "grad_norm": 0.28146344194371314, "learning_rate": 9.336777162465449e-08, "loss": 0.0136, "step": 8484 }, { "epoch": 2.833528134914009, "grad_norm": 0.28794033571057465, "learning_rate": 9.299434265460095e-08, "loss": 0.0138, "step": 8485 }, { "epoch": 2.8338620804808814, "grad_norm": 0.2845003959676957, "learning_rate": 9.262165494553055e-08, "loss": 0.0154, "step": 8486 }, { "epoch": 2.834196026047754, "grad_norm": 0.37816624497676077, "learning_rate": 9.22497085537416e-08, "loss": 0.0135, "step": 8487 }, { "epoch": 2.834529971614627, "grad_norm": 0.42961347025405255, "learning_rate": 9.187850353542082e-08, "loss": 0.0173, "step": 8488 }, { "epoch": 2.8348639171814995, "grad_norm": 0.2786534731328175, "learning_rate": 9.150803994664337e-08, "loss": 0.0154, "step": 8489 }, { "epoch": 2.8351978627483723, "grad_norm": 0.5368650444095259, "learning_rate": 9.113831784337279e-08, "loss": 0.0169, "step": 8490 }, { "epoch": 2.8355318083152445, "grad_norm": 0.330794427971968, "learning_rate": 9.076933728145832e-08, "loss": 0.0132, "step": 8491 }, { "epoch": 2.8358657538821173, "grad_norm": 0.3324737949827866, "learning_rate": 9.040109831664035e-08, "loss": 0.0182, "step": 8492 }, { "epoch": 2.83619969944899, "grad_norm": 0.2736450738801953, "learning_rate": 9.003360100454495e-08, "loss": 0.0118, "step": 8493 }, { "epoch": 2.8365336450158622, "grad_norm": 0.27565907317194843, "learning_rate": 8.966684540068659e-08, "loss": 0.0121, "step": 8494 }, { "epoch": 2.836867590582735, "grad_norm": 0.2851302314397976, "learning_rate": 8.930083156046931e-08, "loss": 0.011, "step": 8495 }, { "epoch": 2.8372015361496077, "grad_norm": 0.3410516278374826, "learning_rate": 8.893555953918276e-08, "loss": 0.0143, "step": 8496 }, { "epoch": 2.83753548171648, "grad_norm": 0.29575558981494526, "learning_rate": 8.857102939200557e-08, "loss": 0.0136, "step": 8497 }, { "epoch": 2.8378694272833527, "grad_norm": 0.29374282178627564, "learning_rate": 8.820724117400536e-08, "loss": 0.0116, "step": 8498 }, { "epoch": 2.8382033728502254, "grad_norm": 0.2226854391655593, "learning_rate": 8.784419494013541e-08, "loss": 0.0101, "step": 8499 }, { "epoch": 2.838537318417098, "grad_norm": 0.26833583075797063, "learning_rate": 8.74818907452385e-08, "loss": 0.0164, "step": 8500 }, { "epoch": 2.838871263983971, "grad_norm": 0.25534358032828847, "learning_rate": 8.712032864404529e-08, "loss": 0.0124, "step": 8501 }, { "epoch": 2.839205209550843, "grad_norm": 0.2242527309536938, "learning_rate": 8.675950869117323e-08, "loss": 0.0092, "step": 8502 }, { "epoch": 2.839539155117716, "grad_norm": 0.3060378584530344, "learning_rate": 8.639943094112868e-08, "loss": 0.0127, "step": 8503 }, { "epoch": 2.8398731006845885, "grad_norm": 0.32974861235148545, "learning_rate": 8.604009544830705e-08, "loss": 0.0181, "step": 8504 }, { "epoch": 2.840207046251461, "grad_norm": 0.27513536522164506, "learning_rate": 8.568150226698823e-08, "loss": 0.0121, "step": 8505 }, { "epoch": 2.8405409918183335, "grad_norm": 0.36152341510785624, "learning_rate": 8.532365145134226e-08, "loss": 0.0157, "step": 8506 }, { "epoch": 2.8408749373852062, "grad_norm": 0.31136696128853075, "learning_rate": 8.496654305542807e-08, "loss": 0.0162, "step": 8507 }, { "epoch": 2.841208882952079, "grad_norm": 0.3697520653983878, "learning_rate": 8.461017713318976e-08, "loss": 0.0189, "step": 8508 }, { "epoch": 2.8415428285189517, "grad_norm": 0.26632652178921934, "learning_rate": 8.425455373846147e-08, "loss": 0.0122, "step": 8509 }, { "epoch": 2.841876774085824, "grad_norm": 0.24430852577156545, "learning_rate": 8.38996729249636e-08, "loss": 0.0086, "step": 8510 }, { "epoch": 2.8422107196526967, "grad_norm": 0.34850018244760905, "learning_rate": 8.354553474630489e-08, "loss": 0.0175, "step": 8511 }, { "epoch": 2.8425446652195694, "grad_norm": 0.36303384284039225, "learning_rate": 8.319213925598258e-08, "loss": 0.0183, "step": 8512 }, { "epoch": 2.8428786107864417, "grad_norm": 0.2679445838842964, "learning_rate": 8.283948650738172e-08, "loss": 0.0119, "step": 8513 }, { "epoch": 2.8432125563533144, "grad_norm": 0.3110846121880377, "learning_rate": 8.248757655377415e-08, "loss": 0.0183, "step": 8514 }, { "epoch": 2.843546501920187, "grad_norm": 0.22924830934423535, "learning_rate": 8.213640944831957e-08, "loss": 0.013, "step": 8515 }, { "epoch": 2.8438804474870594, "grad_norm": 0.2661107097521215, "learning_rate": 8.178598524406667e-08, "loss": 0.0164, "step": 8516 }, { "epoch": 2.844214393053932, "grad_norm": 0.2956356286503423, "learning_rate": 8.143630399395031e-08, "loss": 0.0119, "step": 8517 }, { "epoch": 2.844548338620805, "grad_norm": 0.3453447186550457, "learning_rate": 8.108736575079434e-08, "loss": 0.0189, "step": 8518 }, { "epoch": 2.8448822841876775, "grad_norm": 0.24075542788156157, "learning_rate": 8.073917056731106e-08, "loss": 0.0116, "step": 8519 }, { "epoch": 2.8452162297545502, "grad_norm": 0.35297050546727476, "learning_rate": 8.039171849609728e-08, "loss": 0.0133, "step": 8520 }, { "epoch": 2.8455501753214225, "grad_norm": 0.2527799302308656, "learning_rate": 8.004500958964211e-08, "loss": 0.0104, "step": 8521 }, { "epoch": 2.8458841208882952, "grad_norm": 0.27290973030080484, "learning_rate": 7.969904390031812e-08, "loss": 0.0117, "step": 8522 }, { "epoch": 2.846218066455168, "grad_norm": 0.2921832885408687, "learning_rate": 7.935382148038794e-08, "loss": 0.0116, "step": 8523 }, { "epoch": 2.8465520120220402, "grad_norm": 0.2988831261199746, "learning_rate": 7.900934238200265e-08, "loss": 0.0138, "step": 8524 }, { "epoch": 2.846885957588913, "grad_norm": 0.328375647624051, "learning_rate": 7.866560665719836e-08, "loss": 0.0147, "step": 8525 }, { "epoch": 2.8472199031557857, "grad_norm": 0.3395522286479151, "learning_rate": 7.832261435790078e-08, "loss": 0.02, "step": 8526 }, { "epoch": 2.847553848722658, "grad_norm": 0.3205350868707443, "learning_rate": 7.798036553592403e-08, "loss": 0.0128, "step": 8527 }, { "epoch": 2.8478877942895306, "grad_norm": 0.21270356895204348, "learning_rate": 7.763886024296729e-08, "loss": 0.0073, "step": 8528 }, { "epoch": 2.8482217398564034, "grad_norm": 0.26342686256275705, "learning_rate": 7.729809853061987e-08, "loss": 0.0116, "step": 8529 }, { "epoch": 2.848555685423276, "grad_norm": 0.3028909044047864, "learning_rate": 7.69580804503578e-08, "loss": 0.0151, "step": 8530 }, { "epoch": 2.848889630990149, "grad_norm": 0.2404667209106315, "learning_rate": 7.661880605354444e-08, "loss": 0.0109, "step": 8531 }, { "epoch": 2.849223576557021, "grad_norm": 0.2594877666811875, "learning_rate": 7.628027539143156e-08, "loss": 0.0114, "step": 8532 }, { "epoch": 2.849557522123894, "grad_norm": 0.2030132717973318, "learning_rate": 7.594248851515717e-08, "loss": 0.009, "step": 8533 }, { "epoch": 2.8498914676907665, "grad_norm": 0.29151936401500517, "learning_rate": 7.560544547574988e-08, "loss": 0.0121, "step": 8534 }, { "epoch": 2.850225413257639, "grad_norm": 0.22534914580383095, "learning_rate": 7.526914632412175e-08, "loss": 0.0083, "step": 8535 }, { "epoch": 2.8505593588245115, "grad_norm": 0.33263694189915644, "learning_rate": 7.493359111107712e-08, "loss": 0.0139, "step": 8536 }, { "epoch": 2.850893304391384, "grad_norm": 0.30218668753966177, "learning_rate": 7.459877988730325e-08, "loss": 0.0175, "step": 8537 }, { "epoch": 2.851227249958257, "grad_norm": 0.2430993896365685, "learning_rate": 7.42647127033791e-08, "loss": 0.0105, "step": 8538 }, { "epoch": 2.8515611955251297, "grad_norm": 0.303225374287104, "learning_rate": 7.393138960976876e-08, "loss": 0.0145, "step": 8539 }, { "epoch": 2.851895141092002, "grad_norm": 0.2842340670103229, "learning_rate": 7.359881065682473e-08, "loss": 0.0151, "step": 8540 }, { "epoch": 2.8522290866588746, "grad_norm": 0.24462551342850572, "learning_rate": 7.32669758947857e-08, "loss": 0.0112, "step": 8541 }, { "epoch": 2.8525630322257474, "grad_norm": 0.24453162496305242, "learning_rate": 7.29358853737816e-08, "loss": 0.0105, "step": 8542 }, { "epoch": 2.8528969777926196, "grad_norm": 0.2637551487610621, "learning_rate": 7.260553914382573e-08, "loss": 0.0103, "step": 8543 }, { "epoch": 2.8532309233594924, "grad_norm": 0.2662783522173254, "learning_rate": 7.227593725482207e-08, "loss": 0.0133, "step": 8544 }, { "epoch": 2.853564868926365, "grad_norm": 0.28591637654753016, "learning_rate": 7.194707975655912e-08, "loss": 0.0167, "step": 8545 }, { "epoch": 2.8538988144932373, "grad_norm": 0.34518955222935715, "learning_rate": 7.161896669871605e-08, "loss": 0.0149, "step": 8546 }, { "epoch": 2.85423276006011, "grad_norm": 0.2941960925392155, "learning_rate": 7.129159813085817e-08, "loss": 0.0163, "step": 8547 }, { "epoch": 2.854566705626983, "grad_norm": 0.2712849523985334, "learning_rate": 7.096497410243819e-08, "loss": 0.0122, "step": 8548 }, { "epoch": 2.8549006511938555, "grad_norm": 0.3106976369290745, "learning_rate": 7.063909466279605e-08, "loss": 0.0139, "step": 8549 }, { "epoch": 2.855234596760728, "grad_norm": 0.363119854761904, "learning_rate": 7.031395986116019e-08, "loss": 0.0157, "step": 8550 }, { "epoch": 2.8555685423276005, "grad_norm": 0.25050871163578137, "learning_rate": 6.998956974664573e-08, "loss": 0.0109, "step": 8551 }, { "epoch": 2.855902487894473, "grad_norm": 0.3928211617260432, "learning_rate": 6.966592436825514e-08, "loss": 0.0276, "step": 8552 }, { "epoch": 2.856236433461346, "grad_norm": 0.2503705121237086, "learning_rate": 6.934302377488045e-08, "loss": 0.0123, "step": 8553 }, { "epoch": 2.856570379028218, "grad_norm": 0.259217195806298, "learning_rate": 6.902086801529817e-08, "loss": 0.0102, "step": 8554 }, { "epoch": 2.856904324595091, "grad_norm": 0.24940453514628133, "learning_rate": 6.869945713817438e-08, "loss": 0.0093, "step": 8555 }, { "epoch": 2.8572382701619636, "grad_norm": 0.2735992066774658, "learning_rate": 6.837879119206192e-08, "loss": 0.014, "step": 8556 }, { "epoch": 2.8575722157288364, "grad_norm": 0.6348134765812143, "learning_rate": 6.805887022540093e-08, "loss": 0.017, "step": 8557 }, { "epoch": 2.857906161295709, "grad_norm": 0.2972151048234801, "learning_rate": 6.773969428651883e-08, "loss": 0.0146, "step": 8558 }, { "epoch": 2.8582401068625813, "grad_norm": 0.3751021316558058, "learning_rate": 6.742126342363153e-08, "loss": 0.0196, "step": 8559 }, { "epoch": 2.858574052429454, "grad_norm": 0.35876437528401917, "learning_rate": 6.710357768484165e-08, "loss": 0.017, "step": 8560 }, { "epoch": 2.8589079979963268, "grad_norm": 0.22743288700869818, "learning_rate": 6.67866371181397e-08, "loss": 0.0089, "step": 8561 }, { "epoch": 2.859241943563199, "grad_norm": 0.32869503420014967, "learning_rate": 6.647044177140293e-08, "loss": 0.0127, "step": 8562 }, { "epoch": 2.8595758891300718, "grad_norm": 0.25432944067935565, "learning_rate": 6.615499169239647e-08, "loss": 0.0145, "step": 8563 }, { "epoch": 2.8599098346969445, "grad_norm": 0.2673055231272969, "learning_rate": 6.584028692877164e-08, "loss": 0.0104, "step": 8564 }, { "epoch": 2.8602437802638168, "grad_norm": 0.324891112488638, "learning_rate": 6.552632752807042e-08, "loss": 0.0181, "step": 8565 }, { "epoch": 2.8605777258306895, "grad_norm": 0.27747951249758, "learning_rate": 6.52131135377182e-08, "loss": 0.0157, "step": 8566 }, { "epoch": 2.860911671397562, "grad_norm": 0.2685322676877951, "learning_rate": 6.490064500503102e-08, "loss": 0.0116, "step": 8567 }, { "epoch": 2.861245616964435, "grad_norm": 0.28318602323609887, "learning_rate": 6.458892197721e-08, "loss": 0.0129, "step": 8568 }, { "epoch": 2.8615795625313076, "grad_norm": 0.2645660557938995, "learning_rate": 6.427794450134529e-08, "loss": 0.0103, "step": 8569 }, { "epoch": 2.86191350809818, "grad_norm": 0.2570216881372266, "learning_rate": 6.396771262441259e-08, "loss": 0.0119, "step": 8570 }, { "epoch": 2.8622474536650526, "grad_norm": 0.29667492982733074, "learning_rate": 6.365822639327724e-08, "loss": 0.0124, "step": 8571 }, { "epoch": 2.8625813992319253, "grad_norm": 0.3254460047612813, "learning_rate": 6.334948585469014e-08, "loss": 0.0147, "step": 8572 }, { "epoch": 2.8629153447987976, "grad_norm": 0.30839793575310176, "learning_rate": 6.304149105529067e-08, "loss": 0.0196, "step": 8573 }, { "epoch": 2.8632492903656703, "grad_norm": 0.3010324802122414, "learning_rate": 6.273424204160438e-08, "loss": 0.0165, "step": 8574 }, { "epoch": 2.863583235932543, "grad_norm": 0.25597586835558983, "learning_rate": 6.242773886004583e-08, "loss": 0.0111, "step": 8575 }, { "epoch": 2.8639171814994153, "grad_norm": 0.23180622247104044, "learning_rate": 6.212198155691518e-08, "loss": 0.009, "step": 8576 }, { "epoch": 2.864251127066288, "grad_norm": 0.26706278527959076, "learning_rate": 6.181697017840049e-08, "loss": 0.0128, "step": 8577 }, { "epoch": 2.8645850726331608, "grad_norm": 0.3062347972340337, "learning_rate": 6.151270477057825e-08, "loss": 0.0228, "step": 8578 }, { "epoch": 2.8649190182000335, "grad_norm": 0.28246594751599063, "learning_rate": 6.120918537941001e-08, "loss": 0.0177, "step": 8579 }, { "epoch": 2.865252963766906, "grad_norm": 0.2404341118431869, "learning_rate": 6.090641205074743e-08, "loss": 0.0086, "step": 8580 }, { "epoch": 2.8655869093337785, "grad_norm": 0.23522856635149827, "learning_rate": 6.060438483032671e-08, "loss": 0.0084, "step": 8581 }, { "epoch": 2.865920854900651, "grad_norm": 0.3327211212380855, "learning_rate": 6.030310376377302e-08, "loss": 0.0131, "step": 8582 }, { "epoch": 2.866254800467524, "grad_norm": 0.2355739520729309, "learning_rate": 6.000256889659883e-08, "loss": 0.0139, "step": 8583 }, { "epoch": 2.866588746034396, "grad_norm": 0.2901331532502492, "learning_rate": 5.97027802742034e-08, "loss": 0.0114, "step": 8584 }, { "epoch": 2.866922691601269, "grad_norm": 0.29450294292448287, "learning_rate": 5.940373794187326e-08, "loss": 0.0117, "step": 8585 }, { "epoch": 2.8672566371681416, "grad_norm": 0.19670420235039293, "learning_rate": 5.910544194478174e-08, "loss": 0.0072, "step": 8586 }, { "epoch": 2.8675905827350143, "grad_norm": 0.3484481470828203, "learning_rate": 5.880789232799e-08, "loss": 0.0285, "step": 8587 }, { "epoch": 2.867924528301887, "grad_norm": 0.2550703476448516, "learning_rate": 5.851108913644765e-08, "loss": 0.0105, "step": 8588 }, { "epoch": 2.8682584738687593, "grad_norm": 0.2961267925782829, "learning_rate": 5.821503241498882e-08, "loss": 0.0184, "step": 8589 }, { "epoch": 2.868592419435632, "grad_norm": 0.27187699589818554, "learning_rate": 5.791972220833719e-08, "loss": 0.0109, "step": 8590 }, { "epoch": 2.8689263650025048, "grad_norm": 0.25437284434188395, "learning_rate": 5.762515856110262e-08, "loss": 0.0099, "step": 8591 }, { "epoch": 2.869260310569377, "grad_norm": 0.2543393518130791, "learning_rate": 5.7331341517782855e-08, "loss": 0.0126, "step": 8592 }, { "epoch": 2.8695942561362497, "grad_norm": 0.1976849610871578, "learning_rate": 5.703827112276128e-08, "loss": 0.0082, "step": 8593 }, { "epoch": 2.8699282017031225, "grad_norm": 0.22656775373268642, "learning_rate": 5.674594742031081e-08, "loss": 0.0082, "step": 8594 }, { "epoch": 2.8702621472699947, "grad_norm": 0.3012257046414503, "learning_rate": 5.6454370454589456e-08, "loss": 0.0143, "step": 8595 }, { "epoch": 2.8705960928368675, "grad_norm": 0.3275795352504451, "learning_rate": 5.6163540269644215e-08, "loss": 0.0234, "step": 8596 }, { "epoch": 2.87093003840374, "grad_norm": 0.3114172374999129, "learning_rate": 5.5873456909407706e-08, "loss": 0.0106, "step": 8597 }, { "epoch": 2.871263983970613, "grad_norm": 0.283723519554095, "learning_rate": 5.5584120417701005e-08, "loss": 0.0127, "step": 8598 }, { "epoch": 2.8715979295374856, "grad_norm": 0.21519421110218015, "learning_rate": 5.529553083823136e-08, "loss": 0.0099, "step": 8599 }, { "epoch": 2.871931875104358, "grad_norm": 0.34906342767238513, "learning_rate": 5.50076882145939e-08, "loss": 0.0165, "step": 8600 }, { "epoch": 2.8722658206712306, "grad_norm": 0.27965922090622586, "learning_rate": 5.472059259027051e-08, "loss": 0.0168, "step": 8601 }, { "epoch": 2.8725997662381033, "grad_norm": 0.42169443466053885, "learning_rate": 5.44342440086304e-08, "loss": 0.0177, "step": 8602 }, { "epoch": 2.8729337118049756, "grad_norm": 0.23363688731913357, "learning_rate": 5.414864251293006e-08, "loss": 0.0085, "step": 8603 }, { "epoch": 2.8732676573718483, "grad_norm": 0.32918919326802065, "learning_rate": 5.386378814631277e-08, "loss": 0.0182, "step": 8604 }, { "epoch": 2.873601602938721, "grad_norm": 0.25972777777421585, "learning_rate": 5.3579680951808545e-08, "loss": 0.0097, "step": 8605 }, { "epoch": 2.8739355485055937, "grad_norm": 0.31170947328139753, "learning_rate": 5.329632097233639e-08, "loss": 0.0095, "step": 8606 }, { "epoch": 2.8742694940724665, "grad_norm": 0.30807716276869274, "learning_rate": 5.3013708250700405e-08, "loss": 0.0086, "step": 8607 }, { "epoch": 2.8746034396393387, "grad_norm": 0.26363665871249625, "learning_rate": 5.2731842829591984e-08, "loss": 0.0136, "step": 8608 }, { "epoch": 2.8749373852062114, "grad_norm": 0.36052019194724466, "learning_rate": 5.2450724751592076e-08, "loss": 0.0138, "step": 8609 }, { "epoch": 2.875271330773084, "grad_norm": 0.25209575972945103, "learning_rate": 5.217035405916449e-08, "loss": 0.0107, "step": 8610 }, { "epoch": 2.8756052763399564, "grad_norm": 0.26828565895104245, "learning_rate": 5.1890730794664227e-08, "loss": 0.0105, "step": 8611 }, { "epoch": 2.875939221906829, "grad_norm": 0.31240163681450744, "learning_rate": 5.161185500033139e-08, "loss": 0.0145, "step": 8612 }, { "epoch": 2.876273167473702, "grad_norm": 0.3429539275711395, "learning_rate": 5.1333726718293396e-08, "loss": 0.0157, "step": 8613 }, { "epoch": 2.876607113040574, "grad_norm": 0.2784040083795381, "learning_rate": 5.105634599056386e-08, "loss": 0.0113, "step": 8614 }, { "epoch": 2.876941058607447, "grad_norm": 0.3182685741683252, "learning_rate": 5.077971285904593e-08, "loss": 0.0187, "step": 8615 }, { "epoch": 2.8772750041743196, "grad_norm": 0.28762007501535114, "learning_rate": 5.050382736552728e-08, "loss": 0.0125, "step": 8616 }, { "epoch": 2.8776089497411923, "grad_norm": 0.2701771129361674, "learning_rate": 5.022868955168403e-08, "loss": 0.01, "step": 8617 }, { "epoch": 2.877942895308065, "grad_norm": 0.28235383957536087, "learning_rate": 4.995429945907848e-08, "loss": 0.0125, "step": 8618 }, { "epoch": 2.8782768408749373, "grad_norm": 0.3443954539083251, "learning_rate": 4.968065712916137e-08, "loss": 0.0206, "step": 8619 }, { "epoch": 2.87861078644181, "grad_norm": 0.26059304888406293, "learning_rate": 4.940776260326907e-08, "loss": 0.0167, "step": 8620 }, { "epoch": 2.8789447320086827, "grad_norm": 0.25791943630723885, "learning_rate": 4.913561592262528e-08, "loss": 0.0121, "step": 8621 }, { "epoch": 2.879278677575555, "grad_norm": 0.2655859327898246, "learning_rate": 4.886421712834155e-08, "loss": 0.0133, "step": 8622 }, { "epoch": 2.8796126231424277, "grad_norm": 0.30223419439100285, "learning_rate": 4.859356626141509e-08, "loss": 0.0157, "step": 8623 }, { "epoch": 2.8799465687093004, "grad_norm": 0.32348923902848525, "learning_rate": 4.8323663362732084e-08, "loss": 0.0198, "step": 8624 }, { "epoch": 2.8802805142761727, "grad_norm": 0.31712286439454235, "learning_rate": 4.8054508473063253e-08, "loss": 0.0154, "step": 8625 }, { "epoch": 2.8806144598430454, "grad_norm": 0.25780039181033787, "learning_rate": 4.778610163306885e-08, "loss": 0.0121, "step": 8626 }, { "epoch": 2.880948405409918, "grad_norm": 0.35193051371421863, "learning_rate": 4.751844288329366e-08, "loss": 0.0232, "step": 8627 }, { "epoch": 2.881282350976791, "grad_norm": 0.22856815421864873, "learning_rate": 4.72515322641709e-08, "loss": 0.0088, "step": 8628 }, { "epoch": 2.8816162965436636, "grad_norm": 0.3197896763682093, "learning_rate": 4.6985369816021644e-08, "loss": 0.0082, "step": 8629 }, { "epoch": 2.881950242110536, "grad_norm": 0.2553349556509692, "learning_rate": 4.6719955579052064e-08, "loss": 0.0177, "step": 8630 }, { "epoch": 2.8822841876774086, "grad_norm": 0.26694491821126515, "learning_rate": 4.6455289593355656e-08, "loss": 0.0123, "step": 8631 }, { "epoch": 2.8826181332442813, "grad_norm": 0.2932190552414894, "learning_rate": 4.619137189891432e-08, "loss": 0.0154, "step": 8632 }, { "epoch": 2.8829520788111536, "grad_norm": 0.32154559052800763, "learning_rate": 4.5928202535595044e-08, "loss": 0.0178, "step": 8633 }, { "epoch": 2.8832860243780263, "grad_norm": 0.2685217622127027, "learning_rate": 4.5665781543153266e-08, "loss": 0.0105, "step": 8634 }, { "epoch": 2.883619969944899, "grad_norm": 0.31063917030906485, "learning_rate": 4.54041089612306e-08, "loss": 0.0127, "step": 8635 }, { "epoch": 2.8839539155117717, "grad_norm": 0.28607046453238255, "learning_rate": 4.514318482935598e-08, "loss": 0.0173, "step": 8636 }, { "epoch": 2.8842878610786444, "grad_norm": 0.24263089680815572, "learning_rate": 4.488300918694455e-08, "loss": 0.0112, "step": 8637 }, { "epoch": 2.8846218066455167, "grad_norm": 0.2736154358704746, "learning_rate": 4.4623582073299864e-08, "loss": 0.0121, "step": 8638 }, { "epoch": 2.8849557522123894, "grad_norm": 0.28647529293389273, "learning_rate": 4.4364903527610026e-08, "loss": 0.014, "step": 8639 }, { "epoch": 2.885289697779262, "grad_norm": 0.2810451887262522, "learning_rate": 4.410697358895211e-08, "loss": 0.0113, "step": 8640 }, { "epoch": 2.8856236433461344, "grad_norm": 0.3365754474126144, "learning_rate": 4.384979229628994e-08, "loss": 0.0186, "step": 8641 }, { "epoch": 2.885957588913007, "grad_norm": 0.33665519179452863, "learning_rate": 4.359335968847356e-08, "loss": 0.0163, "step": 8642 }, { "epoch": 2.88629153447988, "grad_norm": 0.32270914265569817, "learning_rate": 4.333767580423976e-08, "loss": 0.0134, "step": 8643 }, { "epoch": 2.886625480046752, "grad_norm": 0.32316203055753245, "learning_rate": 4.3082740682213186e-08, "loss": 0.0164, "step": 8644 }, { "epoch": 2.886959425613625, "grad_norm": 0.33441153874827034, "learning_rate": 4.2828554360904165e-08, "loss": 0.0194, "step": 8645 }, { "epoch": 2.8872933711804976, "grad_norm": 0.27536690878555026, "learning_rate": 4.25751168787103e-08, "loss": 0.0141, "step": 8646 }, { "epoch": 2.8876273167473703, "grad_norm": 0.23970856884759603, "learning_rate": 4.2322428273917635e-08, "loss": 0.0113, "step": 8647 }, { "epoch": 2.887961262314243, "grad_norm": 0.2933379710425046, "learning_rate": 4.2070488584696754e-08, "loss": 0.0116, "step": 8648 }, { "epoch": 2.8882952078811153, "grad_norm": 0.27268144692593416, "learning_rate": 4.18192978491061e-08, "loss": 0.0126, "step": 8649 }, { "epoch": 2.888629153447988, "grad_norm": 0.23595906502391337, "learning_rate": 4.1568856105091424e-08, "loss": 0.009, "step": 8650 }, { "epoch": 2.8889630990148607, "grad_norm": 0.29042081610527437, "learning_rate": 4.1319163390484693e-08, "loss": 0.0168, "step": 8651 }, { "epoch": 2.889297044581733, "grad_norm": 0.3239046850135436, "learning_rate": 4.107021974300407e-08, "loss": 0.0151, "step": 8652 }, { "epoch": 2.8896309901486057, "grad_norm": 0.2848550150771415, "learning_rate": 4.082202520025724e-08, "loss": 0.0162, "step": 8653 }, { "epoch": 2.8899649357154784, "grad_norm": 0.2662346842513805, "learning_rate": 4.0574579799735335e-08, "loss": 0.0132, "step": 8654 }, { "epoch": 2.890298881282351, "grad_norm": 0.3175551371439921, "learning_rate": 4.0327883578819006e-08, "loss": 0.0151, "step": 8655 }, { "epoch": 2.890632826849224, "grad_norm": 0.29936501600193655, "learning_rate": 4.008193657477399e-08, "loss": 0.0139, "step": 8656 }, { "epoch": 2.890966772416096, "grad_norm": 0.2675577484150458, "learning_rate": 3.9836738824753364e-08, "loss": 0.0157, "step": 8657 }, { "epoch": 2.891300717982969, "grad_norm": 0.2185916716171451, "learning_rate": 3.959229036579748e-08, "loss": 0.0099, "step": 8658 }, { "epoch": 2.8916346635498416, "grad_norm": 0.25632943115240303, "learning_rate": 3.9348591234832926e-08, "loss": 0.0126, "step": 8659 }, { "epoch": 2.891968609116714, "grad_norm": 0.24724161710315734, "learning_rate": 3.9105641468673574e-08, "loss": 0.0106, "step": 8660 }, { "epoch": 2.8923025546835865, "grad_norm": 0.2237241762635861, "learning_rate": 3.886344110402007e-08, "loss": 0.0098, "step": 8661 }, { "epoch": 2.8926365002504593, "grad_norm": 0.280499808435951, "learning_rate": 3.862199017745871e-08, "loss": 0.0103, "step": 8662 }, { "epoch": 2.8929704458173315, "grad_norm": 0.21940658965183582, "learning_rate": 3.838128872546421e-08, "loss": 0.0108, "step": 8663 }, { "epoch": 2.8933043913842043, "grad_norm": 0.25735699024605174, "learning_rate": 3.814133678439691e-08, "loss": 0.0132, "step": 8664 }, { "epoch": 2.893638336951077, "grad_norm": 0.24372094354039864, "learning_rate": 3.790213439050561e-08, "loss": 0.0091, "step": 8665 }, { "epoch": 2.8939722825179497, "grad_norm": 0.24735252275639746, "learning_rate": 3.766368157992306e-08, "loss": 0.0123, "step": 8666 }, { "epoch": 2.8943062280848224, "grad_norm": 0.267038414140022, "learning_rate": 3.7425978388671014e-08, "loss": 0.0151, "step": 8667 }, { "epoch": 2.8946401736516947, "grad_norm": 0.2591770195906735, "learning_rate": 3.718902485265741e-08, "loss": 0.0111, "step": 8668 }, { "epoch": 2.8949741192185674, "grad_norm": 0.28739627134963536, "learning_rate": 3.6952821007676943e-08, "loss": 0.0122, "step": 8669 }, { "epoch": 2.89530806478544, "grad_norm": 0.2728938631570337, "learning_rate": 3.671736688941108e-08, "loss": 0.0126, "step": 8670 }, { "epoch": 2.8956420103523124, "grad_norm": 0.19581358338224103, "learning_rate": 3.6482662533426914e-08, "loss": 0.009, "step": 8671 }, { "epoch": 2.895975955919185, "grad_norm": 0.2844638849590318, "learning_rate": 3.6248707975181096e-08, "loss": 0.0106, "step": 8672 }, { "epoch": 2.896309901486058, "grad_norm": 0.2482466704684108, "learning_rate": 3.601550325001313e-08, "loss": 0.0092, "step": 8673 }, { "epoch": 2.89664384705293, "grad_norm": 0.282092274796016, "learning_rate": 3.578304839315316e-08, "loss": 0.0099, "step": 8674 }, { "epoch": 2.896977792619803, "grad_norm": 0.28047275914017394, "learning_rate": 3.5551343439715336e-08, "loss": 0.0133, "step": 8675 }, { "epoch": 2.8973117381866755, "grad_norm": 0.2855626412237467, "learning_rate": 3.5320388424701644e-08, "loss": 0.0133, "step": 8676 }, { "epoch": 2.8976456837535483, "grad_norm": 0.28058573181733476, "learning_rate": 3.50901833830003e-08, "loss": 0.0078, "step": 8677 }, { "epoch": 2.897979629320421, "grad_norm": 0.2116113856357964, "learning_rate": 3.4860728349386807e-08, "loss": 0.0097, "step": 8678 }, { "epoch": 2.8983135748872932, "grad_norm": 0.31618409308896017, "learning_rate": 3.4632023358522894e-08, "loss": 0.0156, "step": 8679 }, { "epoch": 2.898647520454166, "grad_norm": 0.28344553338861805, "learning_rate": 3.440406844495758e-08, "loss": 0.013, "step": 8680 }, { "epoch": 2.8989814660210387, "grad_norm": 0.22523993877103798, "learning_rate": 3.4176863643125e-08, "loss": 0.0083, "step": 8681 }, { "epoch": 2.899315411587911, "grad_norm": 0.27484649593540206, "learning_rate": 3.395040898734825e-08, "loss": 0.0148, "step": 8682 }, { "epoch": 2.8996493571547837, "grad_norm": 0.3562614267683864, "learning_rate": 3.372470451183496e-08, "loss": 0.0215, "step": 8683 }, { "epoch": 2.8999833027216564, "grad_norm": 0.24983673297695597, "learning_rate": 3.349975025068175e-08, "loss": 0.0119, "step": 8684 }, { "epoch": 2.900317248288529, "grad_norm": 0.3660830884604933, "learning_rate": 3.327554623786977e-08, "loss": 0.0143, "step": 8685 }, { "epoch": 2.900651193855402, "grad_norm": 0.2695513887765066, "learning_rate": 3.305209250726804e-08, "loss": 0.0153, "step": 8686 }, { "epoch": 2.900985139422274, "grad_norm": 0.2972901671866103, "learning_rate": 3.282938909263122e-08, "loss": 0.016, "step": 8687 }, { "epoch": 2.901319084989147, "grad_norm": 0.32989993264429485, "learning_rate": 3.2607436027601854e-08, "loss": 0.0187, "step": 8688 }, { "epoch": 2.9016530305560195, "grad_norm": 0.28538556338583776, "learning_rate": 3.238623334570812e-08, "loss": 0.0155, "step": 8689 }, { "epoch": 2.901986976122892, "grad_norm": 0.3164212136337197, "learning_rate": 3.2165781080366054e-08, "loss": 0.0158, "step": 8690 }, { "epoch": 2.9023209216897645, "grad_norm": 0.3062420702595496, "learning_rate": 3.194607926487681e-08, "loss": 0.0202, "step": 8691 }, { "epoch": 2.9026548672566372, "grad_norm": 0.2767076916236891, "learning_rate": 3.1727127932429936e-08, "loss": 0.0134, "step": 8692 }, { "epoch": 2.9029888128235095, "grad_norm": 0.2034915095180857, "learning_rate": 3.150892711609899e-08, "loss": 0.0064, "step": 8693 }, { "epoch": 2.9033227583903822, "grad_norm": 0.23242973300874203, "learning_rate": 3.129147684884704e-08, "loss": 0.0089, "step": 8694 }, { "epoch": 2.903656703957255, "grad_norm": 0.2663350165078304, "learning_rate": 3.107477716352225e-08, "loss": 0.0125, "step": 8695 }, { "epoch": 2.9039906495241277, "grad_norm": 0.33861888331435996, "learning_rate": 3.0858828092859564e-08, "loss": 0.0173, "step": 8696 }, { "epoch": 2.9043245950910004, "grad_norm": 0.3209430979212991, "learning_rate": 3.0643629669480644e-08, "loss": 0.0128, "step": 8697 }, { "epoch": 2.9046585406578727, "grad_norm": 0.306165968142353, "learning_rate": 3.042918192589395e-08, "loss": 0.0141, "step": 8698 }, { "epoch": 2.9049924862247454, "grad_norm": 0.26839596857883113, "learning_rate": 3.021548489449355e-08, "loss": 0.0134, "step": 8699 }, { "epoch": 2.905326431791618, "grad_norm": 0.2358241488476551, "learning_rate": 3.000253860756197e-08, "loss": 0.0103, "step": 8700 }, { "epoch": 2.9056603773584904, "grad_norm": 0.31773401568435783, "learning_rate": 2.979034309726625e-08, "loss": 0.0188, "step": 8701 }, { "epoch": 2.905994322925363, "grad_norm": 0.22052354319979584, "learning_rate": 2.9578898395661858e-08, "loss": 0.0085, "step": 8702 }, { "epoch": 2.906328268492236, "grad_norm": 0.24852538150490014, "learning_rate": 2.9368204534689916e-08, "loss": 0.019, "step": 8703 }, { "epoch": 2.9066622140591085, "grad_norm": 0.29408670813894333, "learning_rate": 2.915826154617718e-08, "loss": 0.0128, "step": 8704 }, { "epoch": 2.9069961596259812, "grad_norm": 0.24527733369242583, "learning_rate": 2.8949069461839952e-08, "loss": 0.0102, "step": 8705 }, { "epoch": 2.9073301051928535, "grad_norm": 0.2969798744619057, "learning_rate": 2.8740628313276842e-08, "loss": 0.0147, "step": 8706 }, { "epoch": 2.9076640507597262, "grad_norm": 0.24956870936206352, "learning_rate": 2.853293813197766e-08, "loss": 0.0131, "step": 8707 }, { "epoch": 2.907997996326599, "grad_norm": 0.33572656869942524, "learning_rate": 2.8325998949314536e-08, "loss": 0.0148, "step": 8708 }, { "epoch": 2.908331941893471, "grad_norm": 0.43030877178110216, "learning_rate": 2.811981079654913e-08, "loss": 0.0195, "step": 8709 }, { "epoch": 2.908665887460344, "grad_norm": 0.30389642896195596, "learning_rate": 2.7914373704827634e-08, "loss": 0.0113, "step": 8710 }, { "epoch": 2.9089998330272167, "grad_norm": 0.4144430579268167, "learning_rate": 2.7709687705185227e-08, "loss": 0.021, "step": 8711 }, { "epoch": 2.909333778594089, "grad_norm": 0.28878617439074666, "learning_rate": 2.7505752828541065e-08, "loss": 0.0112, "step": 8712 }, { "epoch": 2.9096677241609616, "grad_norm": 0.3441762591137517, "learning_rate": 2.730256910570217e-08, "loss": 0.0183, "step": 8713 }, { "epoch": 2.9100016697278344, "grad_norm": 0.4018551305354934, "learning_rate": 2.7100136567361767e-08, "loss": 0.0203, "step": 8714 }, { "epoch": 2.910335615294707, "grad_norm": 0.2742731362558585, "learning_rate": 2.689845524409984e-08, "loss": 0.0145, "step": 8715 }, { "epoch": 2.91066956086158, "grad_norm": 0.2640693293958381, "learning_rate": 2.6697525166382575e-08, "loss": 0.015, "step": 8716 }, { "epoch": 2.911003506428452, "grad_norm": 0.20604488465997525, "learning_rate": 2.649734636456236e-08, "loss": 0.0087, "step": 8717 }, { "epoch": 2.911337451995325, "grad_norm": 0.2602360261733091, "learning_rate": 2.629791886888e-08, "loss": 0.0134, "step": 8718 }, { "epoch": 2.9116713975621975, "grad_norm": 0.23131539403434442, "learning_rate": 2.6099242709459737e-08, "loss": 0.0127, "step": 8719 }, { "epoch": 2.91200534312907, "grad_norm": 0.33407922563788495, "learning_rate": 2.5901317916314783e-08, "loss": 0.0154, "step": 8720 }, { "epoch": 2.9123392886959425, "grad_norm": 0.31518587321229735, "learning_rate": 2.5704144519344e-08, "loss": 0.0178, "step": 8721 }, { "epoch": 2.912673234262815, "grad_norm": 0.2994628741166894, "learning_rate": 2.5507722548332446e-08, "loss": 0.0125, "step": 8722 }, { "epoch": 2.9130071798296875, "grad_norm": 0.36807659410784077, "learning_rate": 2.5312052032952505e-08, "loss": 0.0134, "step": 8723 }, { "epoch": 2.91334112539656, "grad_norm": 0.24185584342283248, "learning_rate": 2.5117133002762196e-08, "loss": 0.0104, "step": 8724 }, { "epoch": 2.913675070963433, "grad_norm": 0.25499478938167525, "learning_rate": 2.492296548720574e-08, "loss": 0.0113, "step": 8725 }, { "epoch": 2.9140090165303056, "grad_norm": 0.2535233257687075, "learning_rate": 2.4729549515615235e-08, "loss": 0.0128, "step": 8726 }, { "epoch": 2.9143429620971784, "grad_norm": 0.28416792676128777, "learning_rate": 2.453688511720842e-08, "loss": 0.0099, "step": 8727 }, { "epoch": 2.9146769076640506, "grad_norm": 0.3454984001255326, "learning_rate": 2.4344972321089234e-08, "loss": 0.016, "step": 8728 }, { "epoch": 2.9150108532309233, "grad_norm": 0.26467789417597154, "learning_rate": 2.415381115624782e-08, "loss": 0.0105, "step": 8729 }, { "epoch": 2.915344798797796, "grad_norm": 0.28623993926336866, "learning_rate": 2.3963401651562747e-08, "loss": 0.0131, "step": 8730 }, { "epoch": 2.9156787443646683, "grad_norm": 0.22363555013131667, "learning_rate": 2.3773743835796558e-08, "loss": 0.0109, "step": 8731 }, { "epoch": 2.916012689931541, "grad_norm": 0.26112782302038295, "learning_rate": 2.358483773759912e-08, "loss": 0.0101, "step": 8732 }, { "epoch": 2.9163466354984138, "grad_norm": 0.3250892536190527, "learning_rate": 2.33966833855076e-08, "loss": 0.0147, "step": 8733 }, { "epoch": 2.9166805810652865, "grad_norm": 0.30163415328018506, "learning_rate": 2.320928080794482e-08, "loss": 0.0125, "step": 8734 }, { "epoch": 2.917014526632159, "grad_norm": 0.3275314454711843, "learning_rate": 2.3022630033219807e-08, "loss": 0.0131, "step": 8735 }, { "epoch": 2.9173484721990315, "grad_norm": 0.37484526774275956, "learning_rate": 2.2836731089528886e-08, "loss": 0.0179, "step": 8736 }, { "epoch": 2.917682417765904, "grad_norm": 0.3665810232642229, "learning_rate": 2.2651584004953485e-08, "loss": 0.0206, "step": 8737 }, { "epoch": 2.918016363332777, "grad_norm": 0.34566664698549754, "learning_rate": 2.2467188807462902e-08, "loss": 0.0182, "step": 8738 }, { "epoch": 2.918350308899649, "grad_norm": 0.2779653906613591, "learning_rate": 2.2283545524912075e-08, "loss": 0.0125, "step": 8739 }, { "epoch": 2.918684254466522, "grad_norm": 0.4214386655749725, "learning_rate": 2.210065418504215e-08, "loss": 0.0313, "step": 8740 }, { "epoch": 2.9190182000333946, "grad_norm": 0.26226303616696295, "learning_rate": 2.1918514815481572e-08, "loss": 0.0103, "step": 8741 }, { "epoch": 2.919352145600267, "grad_norm": 0.23341419489962137, "learning_rate": 2.17371274437439e-08, "loss": 0.0153, "step": 8742 }, { "epoch": 2.9196860911671396, "grad_norm": 0.3488548374087801, "learning_rate": 2.155649209723054e-08, "loss": 0.0196, "step": 8743 }, { "epoch": 2.9200200367340123, "grad_norm": 0.2935287958576697, "learning_rate": 2.137660880322856e-08, "loss": 0.0099, "step": 8744 }, { "epoch": 2.920353982300885, "grad_norm": 0.25887511113137146, "learning_rate": 2.1197477588910666e-08, "loss": 0.0111, "step": 8745 }, { "epoch": 2.9206879278677578, "grad_norm": 0.32883151961491597, "learning_rate": 2.101909848133743e-08, "loss": 0.0164, "step": 8746 }, { "epoch": 2.92102187343463, "grad_norm": 0.20466615117086215, "learning_rate": 2.0841471507455635e-08, "loss": 0.0074, "step": 8747 }, { "epoch": 2.9213558190015028, "grad_norm": 0.3481873657204859, "learning_rate": 2.0664596694096596e-08, "loss": 0.016, "step": 8748 }, { "epoch": 2.9216897645683755, "grad_norm": 0.2948192719071884, "learning_rate": 2.0488474067980045e-08, "loss": 0.0212, "step": 8749 }, { "epoch": 2.9220237101352478, "grad_norm": 0.3409574930296834, "learning_rate": 2.0313103655711373e-08, "loss": 0.0124, "step": 8750 }, { "epoch": 2.9223576557021205, "grad_norm": 0.3038855744380676, "learning_rate": 2.0138485483782723e-08, "loss": 0.0106, "step": 8751 }, { "epoch": 2.922691601268993, "grad_norm": 0.3159034059422304, "learning_rate": 1.996461957857132e-08, "loss": 0.0138, "step": 8752 }, { "epoch": 2.923025546835866, "grad_norm": 0.34065442628863085, "learning_rate": 1.9791505966342273e-08, "loss": 0.0164, "step": 8753 }, { "epoch": 2.9233594924027386, "grad_norm": 0.28514155172896277, "learning_rate": 1.9619144673246325e-08, "loss": 0.0149, "step": 8754 }, { "epoch": 2.923693437969611, "grad_norm": 0.3061562221831954, "learning_rate": 1.9447535725320987e-08, "loss": 0.0148, "step": 8755 }, { "epoch": 2.9240273835364836, "grad_norm": 0.3602103456345199, "learning_rate": 1.9276679148488854e-08, "loss": 0.0149, "step": 8756 }, { "epoch": 2.9243613291033563, "grad_norm": 0.3478663526829974, "learning_rate": 1.9106574968560943e-08, "loss": 0.0145, "step": 8757 }, { "epoch": 2.9246952746702286, "grad_norm": 0.2915040935327554, "learning_rate": 1.8937223211232257e-08, "loss": 0.0156, "step": 8758 }, { "epoch": 2.9250292202371013, "grad_norm": 0.3098722563320424, "learning_rate": 1.876862390208678e-08, "loss": 0.0134, "step": 8759 }, { "epoch": 2.925363165803974, "grad_norm": 0.28094944512030523, "learning_rate": 1.8600777066593023e-08, "loss": 0.0184, "step": 8760 }, { "epoch": 2.9256971113708463, "grad_norm": 0.2797787627219745, "learning_rate": 1.8433682730105706e-08, "loss": 0.0146, "step": 8761 }, { "epoch": 2.926031056937719, "grad_norm": 0.2576842763659107, "learning_rate": 1.8267340917866306e-08, "loss": 0.0123, "step": 8762 }, { "epoch": 2.9263650025045918, "grad_norm": 0.31929204793777843, "learning_rate": 1.8101751655003053e-08, "loss": 0.0137, "step": 8763 }, { "epoch": 2.9266989480714645, "grad_norm": 0.2631502610137791, "learning_rate": 1.793691496653094e-08, "loss": 0.0155, "step": 8764 }, { "epoch": 2.927032893638337, "grad_norm": 0.2625030484537923, "learning_rate": 1.7772830877348933e-08, "loss": 0.0099, "step": 8765 }, { "epoch": 2.9273668392052095, "grad_norm": 0.2231807882324196, "learning_rate": 1.760949941224499e-08, "loss": 0.0094, "step": 8766 }, { "epoch": 2.927700784772082, "grad_norm": 0.31845048629295863, "learning_rate": 1.7446920595892147e-08, "loss": 0.016, "step": 8767 }, { "epoch": 2.928034730338955, "grad_norm": 0.2612939278961106, "learning_rate": 1.7285094452849095e-08, "loss": 0.0114, "step": 8768 }, { "epoch": 2.928368675905827, "grad_norm": 0.3775987112064319, "learning_rate": 1.7124021007562385e-08, "loss": 0.0098, "step": 8769 }, { "epoch": 2.9287026214727, "grad_norm": 0.31509669595507855, "learning_rate": 1.696370028436367e-08, "loss": 0.0143, "step": 8770 }, { "epoch": 2.9290365670395726, "grad_norm": 0.29480089335483095, "learning_rate": 1.6804132307471354e-08, "loss": 0.0099, "step": 8771 }, { "epoch": 2.929370512606445, "grad_norm": 0.2897833691487252, "learning_rate": 1.6645317100990044e-08, "loss": 0.0128, "step": 8772 }, { "epoch": 2.9297044581733176, "grad_norm": 0.36133706228333407, "learning_rate": 1.6487254688910546e-08, "loss": 0.0181, "step": 8773 }, { "epoch": 2.9300384037401903, "grad_norm": 0.2404834580281326, "learning_rate": 1.6329945095110435e-08, "loss": 0.0102, "step": 8774 }, { "epoch": 2.930372349307063, "grad_norm": 0.27565874276431335, "learning_rate": 1.6173388343352915e-08, "loss": 0.0248, "step": 8775 }, { "epoch": 2.9307062948739357, "grad_norm": 0.32454303115924815, "learning_rate": 1.601758445728796e-08, "loss": 0.0101, "step": 8776 }, { "epoch": 2.931040240440808, "grad_norm": 0.3657214541168895, "learning_rate": 1.586253346045119e-08, "loss": 0.0152, "step": 8777 }, { "epoch": 2.9313741860076807, "grad_norm": 0.25646527488828263, "learning_rate": 1.570823537626498e-08, "loss": 0.0147, "step": 8778 }, { "epoch": 2.9317081315745535, "grad_norm": 0.25589474577330984, "learning_rate": 1.5554690228037905e-08, "loss": 0.0096, "step": 8779 }, { "epoch": 2.9320420771414257, "grad_norm": 0.21896462623009297, "learning_rate": 1.5401898038964748e-08, "loss": 0.0093, "step": 8780 }, { "epoch": 2.9323760227082984, "grad_norm": 0.2189527789194369, "learning_rate": 1.5249858832126486e-08, "loss": 0.008, "step": 8781 }, { "epoch": 2.932709968275171, "grad_norm": 0.32614187488584695, "learning_rate": 1.5098572630491414e-08, "loss": 0.0156, "step": 8782 }, { "epoch": 2.933043913842044, "grad_norm": 0.2768713733398159, "learning_rate": 1.4948039456911256e-08, "loss": 0.0116, "step": 8783 }, { "epoch": 2.9333778594089166, "grad_norm": 0.33250918763287496, "learning_rate": 1.4798259334127263e-08, "loss": 0.0132, "step": 8784 }, { "epoch": 2.933711804975789, "grad_norm": 0.30370025576747817, "learning_rate": 1.4649232284765225e-08, "loss": 0.0128, "step": 8785 }, { "epoch": 2.9340457505426616, "grad_norm": 0.2236663279924463, "learning_rate": 1.4500958331337134e-08, "loss": 0.0083, "step": 8786 }, { "epoch": 2.9343796961095343, "grad_norm": 0.2681366571156902, "learning_rate": 1.435343749624174e-08, "loss": 0.012, "step": 8787 }, { "epoch": 2.9347136416764066, "grad_norm": 0.2907583918296267, "learning_rate": 1.420666980176344e-08, "loss": 0.0115, "step": 8788 }, { "epoch": 2.9350475872432793, "grad_norm": 0.47247866287833307, "learning_rate": 1.4060655270073387e-08, "loss": 0.0155, "step": 8789 }, { "epoch": 2.935381532810152, "grad_norm": 0.2662172562483003, "learning_rate": 1.3915393923228936e-08, "loss": 0.0123, "step": 8790 }, { "epoch": 2.9357154783770243, "grad_norm": 0.250694005734805, "learning_rate": 1.3770885783173649e-08, "loss": 0.009, "step": 8791 }, { "epoch": 2.936049423943897, "grad_norm": 0.2936251125656518, "learning_rate": 1.3627130871737282e-08, "loss": 0.012, "step": 8792 }, { "epoch": 2.9363833695107697, "grad_norm": 0.25562172619410156, "learning_rate": 1.3484129210635243e-08, "loss": 0.0117, "step": 8793 }, { "epoch": 2.9367173150776424, "grad_norm": 0.3303159384027027, "learning_rate": 1.3341880821469699e-08, "loss": 0.0179, "step": 8794 }, { "epoch": 2.937051260644515, "grad_norm": 0.2584599284615368, "learning_rate": 1.3200385725729014e-08, "loss": 0.0139, "step": 8795 }, { "epoch": 2.9373852062113874, "grad_norm": 0.35616793549975456, "learning_rate": 1.3059643944787759e-08, "loss": 0.0105, "step": 8796 }, { "epoch": 2.93771915177826, "grad_norm": 0.2935202521427242, "learning_rate": 1.2919655499906703e-08, "loss": 0.012, "step": 8797 }, { "epoch": 2.938053097345133, "grad_norm": 0.28669489947757315, "learning_rate": 1.2780420412232263e-08, "loss": 0.0101, "step": 8798 }, { "epoch": 2.938387042912005, "grad_norm": 0.5376356661696571, "learning_rate": 1.2641938702798174e-08, "loss": 0.0175, "step": 8799 }, { "epoch": 2.938720988478878, "grad_norm": 0.23862457416839503, "learning_rate": 1.2504210392523808e-08, "loss": 0.0089, "step": 8800 }, { "epoch": 2.9390549340457506, "grad_norm": 0.3331618786605174, "learning_rate": 1.2367235502214192e-08, "loss": 0.0147, "step": 8801 }, { "epoch": 2.9393888796126233, "grad_norm": 0.27056080806631566, "learning_rate": 1.2231014052560553e-08, "loss": 0.0117, "step": 8802 }, { "epoch": 2.939722825179496, "grad_norm": 0.312923999169443, "learning_rate": 1.2095546064141982e-08, "loss": 0.016, "step": 8803 }, { "epoch": 2.9400567707463683, "grad_norm": 0.37431215586823213, "learning_rate": 1.196083155742156e-08, "loss": 0.0146, "step": 8804 }, { "epoch": 2.940390716313241, "grad_norm": 0.2726460575793837, "learning_rate": 1.1826870552749669e-08, "loss": 0.0118, "step": 8805 }, { "epoch": 2.9407246618801137, "grad_norm": 0.2816191408205805, "learning_rate": 1.169366307036346e-08, "loss": 0.0135, "step": 8806 }, { "epoch": 2.941058607446986, "grad_norm": 0.25846786258633725, "learning_rate": 1.1561209130384055e-08, "loss": 0.0094, "step": 8807 }, { "epoch": 2.9413925530138587, "grad_norm": 0.22722585849230448, "learning_rate": 1.1429508752821561e-08, "loss": 0.0111, "step": 8808 }, { "epoch": 2.9417264985807314, "grad_norm": 0.2970481442227012, "learning_rate": 1.1298561957570065e-08, "loss": 0.0096, "step": 8809 }, { "epoch": 2.9420604441476037, "grad_norm": 0.38339991784619254, "learning_rate": 1.1168368764410408e-08, "loss": 0.0209, "step": 8810 }, { "epoch": 2.9423943897144764, "grad_norm": 0.25604073011029854, "learning_rate": 1.103892919301075e-08, "loss": 0.0115, "step": 8811 }, { "epoch": 2.942728335281349, "grad_norm": 0.2727229391793246, "learning_rate": 1.0910243262923781e-08, "loss": 0.0152, "step": 8812 }, { "epoch": 2.943062280848222, "grad_norm": 0.30034640582846706, "learning_rate": 1.0782310993589506e-08, "loss": 0.0147, "step": 8813 }, { "epoch": 2.9433962264150946, "grad_norm": 0.2684617169656456, "learning_rate": 1.0655132404333024e-08, "loss": 0.0116, "step": 8814 }, { "epoch": 2.943730171981967, "grad_norm": 0.33602067958668014, "learning_rate": 1.0528707514366743e-08, "loss": 0.0198, "step": 8815 }, { "epoch": 2.9440641175488396, "grad_norm": 0.2827167620515951, "learning_rate": 1.0403036342787609e-08, "loss": 0.0132, "step": 8816 }, { "epoch": 2.9443980631157123, "grad_norm": 0.180604425398328, "learning_rate": 1.0278118908580992e-08, "loss": 0.0057, "step": 8817 }, { "epoch": 2.9447320086825846, "grad_norm": 0.3180144117107635, "learning_rate": 1.0153955230616241e-08, "loss": 0.0146, "step": 8818 }, { "epoch": 2.9450659542494573, "grad_norm": 0.2736349316275919, "learning_rate": 1.0030545327650576e-08, "loss": 0.0138, "step": 8819 }, { "epoch": 2.94539989981633, "grad_norm": 0.2686584094563105, "learning_rate": 9.907889218325751e-09, "loss": 0.0172, "step": 8820 }, { "epoch": 2.9457338453832023, "grad_norm": 0.32138877629272083, "learning_rate": 9.78598692117083e-09, "loss": 0.0206, "step": 8821 }, { "epoch": 2.946067790950075, "grad_norm": 0.25011918184665605, "learning_rate": 9.664838454599978e-09, "loss": 0.0115, "step": 8822 }, { "epoch": 2.9464017365169477, "grad_norm": 0.26070132075448404, "learning_rate": 9.544443836914664e-09, "loss": 0.0139, "step": 8823 }, { "epoch": 2.9467356820838204, "grad_norm": 0.30632666567619926, "learning_rate": 9.42480308630256e-09, "loss": 0.0135, "step": 8824 }, { "epoch": 2.947069627650693, "grad_norm": 0.3313239726533775, "learning_rate": 9.30591622083532e-09, "loss": 0.0171, "step": 8825 }, { "epoch": 2.9474035732175654, "grad_norm": 0.2797843695228306, "learning_rate": 9.187783258473027e-09, "loss": 0.0086, "step": 8826 }, { "epoch": 2.947737518784438, "grad_norm": 0.5733809244732592, "learning_rate": 9.070404217061402e-09, "loss": 0.0195, "step": 8827 }, { "epoch": 2.948071464351311, "grad_norm": 0.26359051517832066, "learning_rate": 8.953779114331262e-09, "loss": 0.0088, "step": 8828 }, { "epoch": 2.948405409918183, "grad_norm": 0.4198663123835468, "learning_rate": 8.837907967900183e-09, "loss": 0.0216, "step": 8829 }, { "epoch": 2.948739355485056, "grad_norm": 0.342304415582743, "learning_rate": 8.722790795272495e-09, "loss": 0.0161, "step": 8830 }, { "epoch": 2.9490733010519286, "grad_norm": 0.3091524362828571, "learning_rate": 8.608427613837622e-09, "loss": 0.0211, "step": 8831 }, { "epoch": 2.9494072466188013, "grad_norm": 0.34980570394693283, "learning_rate": 8.494818440871189e-09, "loss": 0.0186, "step": 8832 }, { "epoch": 2.949741192185674, "grad_norm": 0.32398566768990994, "learning_rate": 8.381963293535577e-09, "loss": 0.0129, "step": 8833 }, { "epoch": 2.9500751377525463, "grad_norm": 0.21247373849122486, "learning_rate": 8.269862188879374e-09, "loss": 0.0132, "step": 8834 }, { "epoch": 2.950409083319419, "grad_norm": 0.3118763549997627, "learning_rate": 8.158515143835698e-09, "loss": 0.0124, "step": 8835 }, { "epoch": 2.9507430288862917, "grad_norm": 0.2520505010345685, "learning_rate": 8.047922175225542e-09, "loss": 0.0107, "step": 8836 }, { "epoch": 2.951076974453164, "grad_norm": 0.2962583648791619, "learning_rate": 7.938083299754984e-09, "loss": 0.012, "step": 8837 }, { "epoch": 2.9514109200200367, "grad_norm": 0.3061490935777087, "learning_rate": 7.828998534016308e-09, "loss": 0.0167, "step": 8838 }, { "epoch": 2.9517448655869094, "grad_norm": 0.34722479934123923, "learning_rate": 7.720667894488554e-09, "loss": 0.0184, "step": 8839 }, { "epoch": 2.9520788111537817, "grad_norm": 0.25616124307887617, "learning_rate": 7.613091397535855e-09, "loss": 0.0101, "step": 8840 }, { "epoch": 2.9524127567206544, "grad_norm": 0.2498636529316553, "learning_rate": 7.506269059409654e-09, "loss": 0.0119, "step": 8841 }, { "epoch": 2.952746702287527, "grad_norm": 0.24209862700190157, "learning_rate": 7.400200896245935e-09, "loss": 0.0115, "step": 8842 }, { "epoch": 2.9530806478544, "grad_norm": 0.3096880800803158, "learning_rate": 7.29488692406799e-09, "loss": 0.0143, "step": 8843 }, { "epoch": 2.9534145934212725, "grad_norm": 0.2995089814872804, "learning_rate": 7.190327158784205e-09, "loss": 0.0144, "step": 8844 }, { "epoch": 2.953748538988145, "grad_norm": 0.28098736177119993, "learning_rate": 7.0865216161902785e-09, "loss": 0.0145, "step": 8845 }, { "epoch": 2.9540824845550175, "grad_norm": 0.25440121926030973, "learning_rate": 6.983470311967e-09, "loss": 0.0108, "step": 8846 }, { "epoch": 2.9544164301218903, "grad_norm": 0.26911353456543136, "learning_rate": 6.881173261680807e-09, "loss": 0.0113, "step": 8847 }, { "epoch": 2.9547503756887625, "grad_norm": 0.2838865771790412, "learning_rate": 6.779630480786004e-09, "loss": 0.0136, "step": 8848 }, { "epoch": 2.9550843212556352, "grad_norm": 0.2503857183749329, "learning_rate": 6.678841984621432e-09, "loss": 0.0109, "step": 8849 }, { "epoch": 2.955418266822508, "grad_norm": 0.30434982450600045, "learning_rate": 6.578807788411579e-09, "loss": 0.0151, "step": 8850 }, { "epoch": 2.9557522123893807, "grad_norm": 0.2573602736956534, "learning_rate": 6.479527907268801e-09, "loss": 0.0127, "step": 8851 }, { "epoch": 2.9560861579562534, "grad_norm": 0.29336908656839916, "learning_rate": 6.381002356189991e-09, "loss": 0.0118, "step": 8852 }, { "epoch": 2.9564201035231257, "grad_norm": 0.31956946539516207, "learning_rate": 6.283231150058799e-09, "loss": 0.012, "step": 8853 }, { "epoch": 2.9567540490899984, "grad_norm": 0.26904002432170365, "learning_rate": 6.186214303645077e-09, "loss": 0.0096, "step": 8854 }, { "epoch": 2.957087994656871, "grad_norm": 0.2817797657289491, "learning_rate": 6.0899518316032135e-09, "loss": 0.0144, "step": 8855 }, { "epoch": 2.9574219402237434, "grad_norm": 0.2891315774175175, "learning_rate": 5.99444374847602e-09, "loss": 0.0121, "step": 8856 }, { "epoch": 2.957755885790616, "grad_norm": 0.23546706675110624, "learning_rate": 5.899690068690289e-09, "loss": 0.0118, "step": 8857 }, { "epoch": 2.958089831357489, "grad_norm": 0.2503079524048365, "learning_rate": 5.805690806560127e-09, "loss": 0.0109, "step": 8858 }, { "epoch": 2.958423776924361, "grad_norm": 0.28038767855348384, "learning_rate": 5.712445976285286e-09, "loss": 0.014, "step": 8859 }, { "epoch": 2.958757722491234, "grad_norm": 0.3581868050776321, "learning_rate": 5.619955591951165e-09, "loss": 0.0271, "step": 8860 }, { "epoch": 2.9590916680581065, "grad_norm": 0.28498615376492137, "learning_rate": 5.528219667529921e-09, "loss": 0.02, "step": 8861 }, { "epoch": 2.9594256136249792, "grad_norm": 0.21941935688277253, "learning_rate": 5.437238216878804e-09, "loss": 0.0085, "step": 8862 }, { "epoch": 2.959759559191852, "grad_norm": 0.30634642551595337, "learning_rate": 5.347011253741819e-09, "loss": 0.0099, "step": 8863 }, { "epoch": 2.9600935047587242, "grad_norm": 0.2477810188915607, "learning_rate": 5.257538791749173e-09, "loss": 0.0125, "step": 8864 }, { "epoch": 2.960427450325597, "grad_norm": 0.27414438700160665, "learning_rate": 5.168820844416167e-09, "loss": 0.0136, "step": 8865 }, { "epoch": 2.9607613958924697, "grad_norm": 0.22304751891524885, "learning_rate": 5.080857425145413e-09, "loss": 0.0096, "step": 8866 }, { "epoch": 2.961095341459342, "grad_norm": 0.25385290361558166, "learning_rate": 4.993648547224062e-09, "loss": 0.0133, "step": 8867 }, { "epoch": 2.9614292870262147, "grad_norm": 0.309799338589133, "learning_rate": 4.907194223826572e-09, "loss": 0.017, "step": 8868 }, { "epoch": 2.9617632325930874, "grad_norm": 0.309199565152647, "learning_rate": 4.8214944680125e-09, "loss": 0.0138, "step": 8869 }, { "epoch": 2.9620971781599597, "grad_norm": 0.31027870939859425, "learning_rate": 4.736549292728154e-09, "loss": 0.0145, "step": 8870 }, { "epoch": 2.9624311237268324, "grad_norm": 0.39834860507149, "learning_rate": 4.652358710805494e-09, "loss": 0.0272, "step": 8871 }, { "epoch": 2.962765069293705, "grad_norm": 0.3028396026405605, "learning_rate": 4.5689227349626775e-09, "loss": 0.013, "step": 8872 }, { "epoch": 2.963099014860578, "grad_norm": 0.24185272703681146, "learning_rate": 4.486241377802958e-09, "loss": 0.0113, "step": 8873 }, { "epoch": 2.9634329604274505, "grad_norm": 0.27679245352313514, "learning_rate": 4.404314651816344e-09, "loss": 0.0178, "step": 8874 }, { "epoch": 2.963766905994323, "grad_norm": 0.3055707292428465, "learning_rate": 4.323142569379602e-09, "loss": 0.016, "step": 8875 }, { "epoch": 2.9641008515611955, "grad_norm": 0.3105337545534266, "learning_rate": 4.242725142754589e-09, "loss": 0.0128, "step": 8876 }, { "epoch": 2.9644347971280682, "grad_norm": 0.2042614322041993, "learning_rate": 4.163062384088812e-09, "loss": 0.0067, "step": 8877 }, { "epoch": 2.9647687426949405, "grad_norm": 0.42435157442567994, "learning_rate": 4.0841543054165324e-09, "loss": 0.0211, "step": 8878 }, { "epoch": 2.9651026882618132, "grad_norm": 0.2605864036382383, "learning_rate": 4.006000918658215e-09, "loss": 0.0103, "step": 8879 }, { "epoch": 2.965436633828686, "grad_norm": 0.2542717122667056, "learning_rate": 3.928602235618861e-09, "loss": 0.012, "step": 8880 }, { "epoch": 2.9657705793955587, "grad_norm": 0.3105241541659346, "learning_rate": 3.851958267990785e-09, "loss": 0.0117, "step": 8881 }, { "epoch": 2.9661045249624314, "grad_norm": 0.2810087337567596, "learning_rate": 3.776069027352503e-09, "loss": 0.0121, "step": 8882 }, { "epoch": 2.9664384705293037, "grad_norm": 0.3579953796513702, "learning_rate": 3.700934525167621e-09, "loss": 0.0135, "step": 8883 }, { "epoch": 2.9667724160961764, "grad_norm": 0.1942049991957917, "learning_rate": 3.626554772786506e-09, "loss": 0.0072, "step": 8884 }, { "epoch": 2.967106361663049, "grad_norm": 0.2874251572794949, "learning_rate": 3.5529297814440587e-09, "loss": 0.0142, "step": 8885 }, { "epoch": 2.9674403072299214, "grad_norm": 0.3079602859417745, "learning_rate": 3.4800595622630497e-09, "loss": 0.0164, "step": 8886 }, { "epoch": 2.967774252796794, "grad_norm": 0.284304055545771, "learning_rate": 3.407944126251339e-09, "loss": 0.0116, "step": 8887 }, { "epoch": 2.968108198363667, "grad_norm": 0.3024477077304248, "learning_rate": 3.336583484301881e-09, "loss": 0.0124, "step": 8888 }, { "epoch": 2.968442143930539, "grad_norm": 0.3452235143357418, "learning_rate": 3.2659776471960505e-09, "loss": 0.0243, "step": 8889 }, { "epoch": 2.968776089497412, "grad_norm": 0.19849054713234496, "learning_rate": 3.19612662559865e-09, "loss": 0.0085, "step": 8890 }, { "epoch": 2.9691100350642845, "grad_norm": 0.3413891758042656, "learning_rate": 3.1270304300617947e-09, "loss": 0.0185, "step": 8891 }, { "epoch": 2.969443980631157, "grad_norm": 0.27201839017148005, "learning_rate": 3.0586890710232465e-09, "loss": 0.0154, "step": 8892 }, { "epoch": 2.96977792619803, "grad_norm": 0.20336426410852446, "learning_rate": 2.9911025588069685e-09, "loss": 0.009, "step": 8893 }, { "epoch": 2.970111871764902, "grad_norm": 0.2243150327657297, "learning_rate": 2.9242709036225723e-09, "loss": 0.0104, "step": 8894 }, { "epoch": 2.970445817331775, "grad_norm": 0.28980795488718264, "learning_rate": 2.858194115565871e-09, "loss": 0.0136, "step": 8895 }, { "epoch": 2.9707797628986476, "grad_norm": 0.3133702871500427, "learning_rate": 2.7928722046177692e-09, "loss": 0.0151, "step": 8896 }, { "epoch": 2.97111370846552, "grad_norm": 0.25170122840614795, "learning_rate": 2.7283051806470394e-09, "loss": 0.0108, "step": 8897 }, { "epoch": 2.9714476540323926, "grad_norm": 0.3569107012442742, "learning_rate": 2.664493053406436e-09, "loss": 0.0126, "step": 8898 }, { "epoch": 2.9717815995992654, "grad_norm": 0.3503953913843525, "learning_rate": 2.6014358325360256e-09, "loss": 0.0171, "step": 8899 }, { "epoch": 2.972115545166138, "grad_norm": 0.2662913521255247, "learning_rate": 2.5391335275609665e-09, "loss": 0.0107, "step": 8900 }, { "epoch": 2.972449490733011, "grad_norm": 0.31383737687940944, "learning_rate": 2.4775861478937293e-09, "loss": 0.0137, "step": 8901 }, { "epoch": 2.972783436299883, "grad_norm": 0.27899930926298144, "learning_rate": 2.416793702830211e-09, "loss": 0.0096, "step": 8902 }, { "epoch": 2.973117381866756, "grad_norm": 0.1619463579430933, "learning_rate": 2.3567562015547328e-09, "loss": 0.0066, "step": 8903 }, { "epoch": 2.9734513274336285, "grad_norm": 0.2975755401252966, "learning_rate": 2.297473653136706e-09, "loss": 0.0145, "step": 8904 }, { "epoch": 2.9737852730005008, "grad_norm": 0.3154087999668039, "learning_rate": 2.2389460665317443e-09, "loss": 0.0158, "step": 8905 }, { "epoch": 2.9741192185673735, "grad_norm": 0.3746830196540353, "learning_rate": 2.1811734505799985e-09, "loss": 0.0244, "step": 8906 }, { "epoch": 2.974453164134246, "grad_norm": 0.24551995169987959, "learning_rate": 2.1241558140100426e-09, "loss": 0.0138, "step": 8907 }, { "epoch": 2.9747871097011185, "grad_norm": 0.309728247324327, "learning_rate": 2.0678931654344314e-09, "loss": 0.0104, "step": 8908 }, { "epoch": 2.975121055267991, "grad_norm": 0.22154709674498288, "learning_rate": 2.012385513351922e-09, "loss": 0.0088, "step": 8909 }, { "epoch": 2.975455000834864, "grad_norm": 0.28139571562754934, "learning_rate": 1.9576328661480293e-09, "loss": 0.0161, "step": 8910 }, { "epoch": 2.9757889464017366, "grad_norm": 0.25823426789692366, "learning_rate": 1.9036352320939146e-09, "loss": 0.0114, "step": 8911 }, { "epoch": 2.9761228919686094, "grad_norm": 0.3092041079081828, "learning_rate": 1.850392619345831e-09, "loss": 0.0135, "step": 8912 }, { "epoch": 2.9764568375354816, "grad_norm": 0.3460372426064356, "learning_rate": 1.7979050359479e-09, "loss": 0.015, "step": 8913 }, { "epoch": 2.9767907831023543, "grad_norm": 0.2979925804685489, "learning_rate": 1.746172489828224e-09, "loss": 0.0171, "step": 8914 }, { "epoch": 2.977124728669227, "grad_norm": 0.23833186938846646, "learning_rate": 1.6951949888016627e-09, "loss": 0.0101, "step": 8915 }, { "epoch": 2.9774586742360993, "grad_norm": 0.27949876327286516, "learning_rate": 1.6449725405687234e-09, "loss": 0.0106, "step": 8916 }, { "epoch": 2.977792619802972, "grad_norm": 0.38883786417116634, "learning_rate": 1.59550515271667e-09, "loss": 0.0138, "step": 8917 }, { "epoch": 2.9781265653698448, "grad_norm": 0.2897917268514722, "learning_rate": 1.5467928327178582e-09, "loss": 0.0123, "step": 8918 }, { "epoch": 2.978460510936717, "grad_norm": 0.30086512667102744, "learning_rate": 1.498835587930847e-09, "loss": 0.0119, "step": 8919 }, { "epoch": 2.9787944565035898, "grad_norm": 0.23115848105974932, "learning_rate": 1.4516334256003962e-09, "loss": 0.0116, "step": 8920 }, { "epoch": 2.9791284020704625, "grad_norm": 0.2572533680568834, "learning_rate": 1.4051863528563581e-09, "loss": 0.0097, "step": 8921 }, { "epoch": 2.979462347637335, "grad_norm": 0.22789782990855917, "learning_rate": 1.3594943767158974e-09, "loss": 0.0117, "step": 8922 }, { "epoch": 2.979796293204208, "grad_norm": 0.276466857170987, "learning_rate": 1.3145575040801605e-09, "loss": 0.0136, "step": 8923 }, { "epoch": 2.98013023877108, "grad_norm": 0.2251519102107459, "learning_rate": 1.2703757417387164e-09, "loss": 0.0101, "step": 8924 }, { "epoch": 2.980464184337953, "grad_norm": 0.2564634115977526, "learning_rate": 1.2269490963651154e-09, "loss": 0.0115, "step": 8925 }, { "epoch": 2.9807981299048256, "grad_norm": 0.2942127534403624, "learning_rate": 1.1842775745196655e-09, "loss": 0.0142, "step": 8926 }, { "epoch": 2.981132075471698, "grad_norm": 0.20510329834816302, "learning_rate": 1.1423611826477665e-09, "loss": 0.01, "step": 8927 }, { "epoch": 2.9814660210385706, "grad_norm": 0.22350720799851165, "learning_rate": 1.1011999270821305e-09, "loss": 0.008, "step": 8928 }, { "epoch": 2.9817999666054433, "grad_norm": 0.2701959643035672, "learning_rate": 1.0607938140400064e-09, "loss": 0.0117, "step": 8929 }, { "epoch": 2.982133912172316, "grad_norm": 0.2876642994947557, "learning_rate": 1.0211428496259557e-09, "loss": 0.0146, "step": 8930 }, { "epoch": 2.9824678577391888, "grad_norm": 0.2424076255433641, "learning_rate": 9.822470398296312e-10, "loss": 0.0085, "step": 8931 }, { "epoch": 2.982801803306061, "grad_norm": 0.2765927673245172, "learning_rate": 9.441063905257785e-10, "loss": 0.0104, "step": 8932 }, { "epoch": 2.9831357488729338, "grad_norm": 0.23925778763867475, "learning_rate": 9.067209074770101e-10, "loss": 0.0099, "step": 8933 }, { "epoch": 2.9834696944398065, "grad_norm": 0.27178512887000095, "learning_rate": 8.700905963304751e-10, "loss": 0.018, "step": 8934 }, { "epoch": 2.9838036400066787, "grad_norm": 0.28788672421898087, "learning_rate": 8.342154626195254e-10, "loss": 0.0128, "step": 8935 }, { "epoch": 2.9841375855735515, "grad_norm": 0.31505382086792205, "learning_rate": 7.990955117631594e-10, "loss": 0.016, "step": 8936 }, { "epoch": 2.984471531140424, "grad_norm": 0.29247050361386245, "learning_rate": 7.647307490676881e-10, "loss": 0.0108, "step": 8937 }, { "epoch": 2.9848054767072965, "grad_norm": 0.2752972063113647, "learning_rate": 7.311211797234041e-10, "loss": 0.0098, "step": 8938 }, { "epoch": 2.985139422274169, "grad_norm": 0.35351372704834705, "learning_rate": 6.982668088079126e-10, "loss": 0.015, "step": 8939 }, { "epoch": 2.985473367841042, "grad_norm": 0.28399803612731106, "learning_rate": 6.661676412844653e-10, "loss": 0.0147, "step": 8940 }, { "epoch": 2.9858073134079146, "grad_norm": 0.28649577636249296, "learning_rate": 6.348236820008513e-10, "loss": 0.0091, "step": 8941 }, { "epoch": 2.9861412589747873, "grad_norm": 0.25405506085454094, "learning_rate": 6.042349356932819e-10, "loss": 0.0135, "step": 8942 }, { "epoch": 2.9864752045416596, "grad_norm": 0.27463384914468136, "learning_rate": 5.744014069819503e-10, "loss": 0.0101, "step": 8943 }, { "epoch": 2.9868091501085323, "grad_norm": 0.24347676417976405, "learning_rate": 5.453231003732518e-10, "loss": 0.0101, "step": 8944 }, { "epoch": 2.987143095675405, "grad_norm": 0.2737953735825022, "learning_rate": 5.170000202608938e-10, "loss": 0.0123, "step": 8945 }, { "epoch": 2.9874770412422773, "grad_norm": 0.2646011800101981, "learning_rate": 4.894321709220106e-10, "loss": 0.0124, "step": 8946 }, { "epoch": 2.98781098680915, "grad_norm": 0.2703402316729183, "learning_rate": 4.626195565221592e-10, "loss": 0.011, "step": 8947 }, { "epoch": 2.9881449323760227, "grad_norm": 0.38012935164309164, "learning_rate": 4.365621811108778e-10, "loss": 0.0187, "step": 8948 }, { "epoch": 2.9884788779428955, "grad_norm": 0.5136560493941013, "learning_rate": 4.112600486250173e-10, "loss": 0.0216, "step": 8949 }, { "epoch": 2.988812823509768, "grad_norm": 0.3051470139876893, "learning_rate": 3.867131628865206e-10, "loss": 0.0193, "step": 8950 }, { "epoch": 2.9891467690766405, "grad_norm": 0.28368761760330513, "learning_rate": 3.629215276035325e-10, "loss": 0.0177, "step": 8951 }, { "epoch": 2.989480714643513, "grad_norm": 0.20997699417754753, "learning_rate": 3.3988514637040003e-10, "loss": 0.0119, "step": 8952 }, { "epoch": 2.989814660210386, "grad_norm": 0.24432751054133622, "learning_rate": 3.176040226660071e-10, "loss": 0.01, "step": 8953 }, { "epoch": 2.990148605777258, "grad_norm": 0.2316491881808177, "learning_rate": 2.960781598576601e-10, "loss": 0.0097, "step": 8954 }, { "epoch": 2.990482551344131, "grad_norm": 0.2472025433832417, "learning_rate": 2.7530756119609204e-10, "loss": 0.0099, "step": 8955 }, { "epoch": 2.9908164969110036, "grad_norm": 0.31696959508144734, "learning_rate": 2.5529222981879323e-10, "loss": 0.0176, "step": 8956 }, { "epoch": 2.991150442477876, "grad_norm": 0.24118441142533606, "learning_rate": 2.360321687500111e-10, "loss": 0.0104, "step": 8957 }, { "epoch": 2.9914843880447486, "grad_norm": 0.2928267346519803, "learning_rate": 2.175273808985301e-10, "loss": 0.0134, "step": 8958 }, { "epoch": 2.9918183336116213, "grad_norm": 0.3227256092843894, "learning_rate": 1.9977786906044683e-10, "loss": 0.0152, "step": 8959 }, { "epoch": 2.992152279178494, "grad_norm": 0.22895195359200451, "learning_rate": 1.827836359163948e-10, "loss": 0.01, "step": 8960 }, { "epoch": 2.9924862247453667, "grad_norm": 0.25063268294709157, "learning_rate": 1.665446840343199e-10, "loss": 0.0118, "step": 8961 }, { "epoch": 2.992820170312239, "grad_norm": 0.2837991515215964, "learning_rate": 1.5106101586614963e-10, "loss": 0.0092, "step": 8962 }, { "epoch": 2.9931541158791117, "grad_norm": 0.3109087909957699, "learning_rate": 1.3633263375223414e-10, "loss": 0.0184, "step": 8963 }, { "epoch": 2.9934880614459844, "grad_norm": 0.28444023257986717, "learning_rate": 1.223595399163502e-10, "loss": 0.0114, "step": 8964 }, { "epoch": 2.9938220070128567, "grad_norm": 0.34104403056075155, "learning_rate": 1.091417364695868e-10, "loss": 0.0133, "step": 8965 }, { "epoch": 2.9941559525797294, "grad_norm": 0.27880065197401577, "learning_rate": 9.667922540868013e-11, "loss": 0.0136, "step": 8966 }, { "epoch": 2.994489898146602, "grad_norm": 0.3617347223905858, "learning_rate": 8.49720086165684e-11, "loss": 0.0141, "step": 8967 }, { "epoch": 2.9948238437134744, "grad_norm": 0.25073747583367095, "learning_rate": 7.40200878618369e-11, "loss": 0.0135, "step": 8968 }, { "epoch": 2.995157789280347, "grad_norm": 0.3533225001730111, "learning_rate": 6.382346479816282e-11, "loss": 0.0177, "step": 8969 }, { "epoch": 2.99549173484722, "grad_norm": 0.3506858654933053, "learning_rate": 5.438214096653571e-11, "loss": 0.0203, "step": 8970 }, { "epoch": 2.9958256804140926, "grad_norm": 0.2517764691384077, "learning_rate": 4.569611779248195e-11, "loss": 0.0088, "step": 8971 }, { "epoch": 2.9961596259809653, "grad_norm": 0.22076960965863357, "learning_rate": 3.776539658939538e-11, "loss": 0.0111, "step": 8972 }, { "epoch": 2.9964935715478376, "grad_norm": 0.35316797391407945, "learning_rate": 3.0589978553541286e-11, "loss": 0.0281, "step": 8973 }, { "epoch": 2.9968275171147103, "grad_norm": 0.2310932540955058, "learning_rate": 2.416986477071781e-11, "loss": 0.0081, "step": 8974 }, { "epoch": 2.997161462681583, "grad_norm": 0.436589341577375, "learning_rate": 1.850505620903942e-11, "loss": 0.0168, "step": 8975 }, { "epoch": 2.9974954082484553, "grad_norm": 0.2866360076380691, "learning_rate": 1.3595553725598287e-11, "loss": 0.0143, "step": 8976 }, { "epoch": 2.997829353815328, "grad_norm": 0.1983579307956965, "learning_rate": 9.441358061468286e-12, "loss": 0.0061, "step": 8977 }, { "epoch": 2.9981632993822007, "grad_norm": 0.24569002362550083, "learning_rate": 6.042469843925425e-12, "loss": 0.0122, "step": 8978 }, { "epoch": 2.9984972449490734, "grad_norm": 0.32003891184141653, "learning_rate": 3.398889586447851e-12, "loss": 0.0157, "step": 8979 }, { "epoch": 2.998831190515946, "grad_norm": 0.2915456339083111, "learning_rate": 1.5106176892709656e-12, "loss": 0.0155, "step": 8980 }, { "epoch": 2.9991651360828184, "grad_norm": 0.233203274595632, "learning_rate": 3.7765443661186283e-13, "loss": 0.0068, "step": 8981 }, { "epoch": 2.999499081649691, "grad_norm": 0.24824277888305468, "learning_rate": 0.0, "loss": 0.0116, "step": 8982 }, { "epoch": 2.999499081649691, "eval_loss": 0.025284353643655777, "eval_runtime": 179.6039, "eval_samples_per_second": 112.319, "eval_steps_per_second": 1.759, "step": 8982 }, { "epoch": 2.999499081649691, "step": 8982, "total_flos": 2.774679234360541e+18, "train_loss": 0.03039964318122616, "train_runtime": 42140.8615, "train_samples_per_second": 27.284, "train_steps_per_second": 0.213 } ], "logging_steps": 1, "max_steps": 8982, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.774679234360541e+18, "train_batch_size": 8, "trial_name": null, "trial_params": null }