{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.99878308487983, "eval_steps": 500, "global_step": 6572, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.000608457560085184, "grad_norm": 9.286478996276855, "learning_rate": 1.9011406844106465e-07, "loss": 0.8452, "step": 1 }, { "epoch": 0.001216915120170368, "grad_norm": 9.28686809539795, "learning_rate": 3.802281368821293e-07, "loss": 0.8357, "step": 2 }, { "epoch": 0.0018253726802555522, "grad_norm": 9.397164344787598, "learning_rate": 5.70342205323194e-07, "loss": 0.8038, "step": 3 }, { "epoch": 0.002433830240340736, "grad_norm": 8.887619972229004, "learning_rate": 7.604562737642586e-07, "loss": 0.7907, "step": 4 }, { "epoch": 0.0030422878004259203, "grad_norm": 10.438508033752441, "learning_rate": 9.505703422053232e-07, "loss": 0.8433, "step": 5 }, { "epoch": 0.0036507453605111044, "grad_norm": 9.955141067504883, "learning_rate": 1.140684410646388e-06, "loss": 0.8832, "step": 6 }, { "epoch": 0.0042592029205962886, "grad_norm": 7.124298095703125, "learning_rate": 1.3307984790874525e-06, "loss": 0.7259, "step": 7 }, { "epoch": 0.004867660480681472, "grad_norm": 6.805534839630127, "learning_rate": 1.5209125475285172e-06, "loss": 0.7877, "step": 8 }, { "epoch": 0.005476118040766657, "grad_norm": 5.027353286743164, "learning_rate": 1.711026615969582e-06, "loss": 0.6938, "step": 9 }, { "epoch": 0.0060845756008518406, "grad_norm": 4.865087509155273, "learning_rate": 1.9011406844106463e-06, "loss": 0.704, "step": 10 }, { "epoch": 0.006693033160937024, "grad_norm": 4.999395847320557, "learning_rate": 2.091254752851711e-06, "loss": 0.6225, "step": 11 }, { "epoch": 0.007301490721022209, "grad_norm": 4.837018013000488, "learning_rate": 2.281368821292776e-06, "loss": 0.6357, "step": 12 }, { "epoch": 0.007909948281107393, "grad_norm": 4.025296211242676, "learning_rate": 2.4714828897338406e-06, "loss": 0.6002, "step": 13 }, { "epoch": 0.008518405841192577, "grad_norm": 3.6638612747192383, "learning_rate": 2.661596958174905e-06, "loss": 0.5499, "step": 14 }, { "epoch": 0.009126863401277762, "grad_norm": 5.011624336242676, "learning_rate": 2.8517110266159697e-06, "loss": 0.6042, "step": 15 }, { "epoch": 0.009735320961362945, "grad_norm": 4.899325847625732, "learning_rate": 3.0418250950570345e-06, "loss": 0.6279, "step": 16 }, { "epoch": 0.010343778521448129, "grad_norm": 4.226297378540039, "learning_rate": 3.2319391634980988e-06, "loss": 0.6134, "step": 17 }, { "epoch": 0.010952236081533314, "grad_norm": 4.012551784515381, "learning_rate": 3.422053231939164e-06, "loss": 0.6285, "step": 18 }, { "epoch": 0.011560693641618497, "grad_norm": 3.9116461277008057, "learning_rate": 3.612167300380228e-06, "loss": 0.5381, "step": 19 }, { "epoch": 0.012169151201703681, "grad_norm": 2.996412515640259, "learning_rate": 3.8022813688212926e-06, "loss": 0.4892, "step": 20 }, { "epoch": 0.012777608761788866, "grad_norm": 2.7711269855499268, "learning_rate": 3.992395437262358e-06, "loss": 0.4954, "step": 21 }, { "epoch": 0.013386066321874049, "grad_norm": 3.1407854557037354, "learning_rate": 4.182509505703422e-06, "loss": 0.6063, "step": 22 }, { "epoch": 0.013994523881959233, "grad_norm": 2.8379178047180176, "learning_rate": 4.3726235741444865e-06, "loss": 0.5944, "step": 23 }, { "epoch": 0.014602981442044418, "grad_norm": 2.903754234313965, "learning_rate": 4.562737642585552e-06, "loss": 0.5006, "step": 24 }, { "epoch": 0.015211439002129602, "grad_norm": 3.051934003829956, "learning_rate": 4.752851711026616e-06, "loss": 0.5743, "step": 25 }, { "epoch": 0.015819896562214785, "grad_norm": 2.664247751235962, "learning_rate": 4.942965779467681e-06, "loss": 0.5162, "step": 26 }, { "epoch": 0.016428354122299968, "grad_norm": 3.0488321781158447, "learning_rate": 5.1330798479087455e-06, "loss": 0.5195, "step": 27 }, { "epoch": 0.017036811682385154, "grad_norm": 2.9393467903137207, "learning_rate": 5.32319391634981e-06, "loss": 0.5466, "step": 28 }, { "epoch": 0.017645269242470337, "grad_norm": 2.603747844696045, "learning_rate": 5.513307984790875e-06, "loss": 0.5294, "step": 29 }, { "epoch": 0.018253726802555523, "grad_norm": 2.919405698776245, "learning_rate": 5.703422053231939e-06, "loss": 0.5621, "step": 30 }, { "epoch": 0.018862184362640706, "grad_norm": 2.5989558696746826, "learning_rate": 5.893536121673004e-06, "loss": 0.5719, "step": 31 }, { "epoch": 0.01947064192272589, "grad_norm": 2.815772294998169, "learning_rate": 6.083650190114069e-06, "loss": 0.5748, "step": 32 }, { "epoch": 0.020079099482811075, "grad_norm": 2.713179111480713, "learning_rate": 6.273764258555133e-06, "loss": 0.4973, "step": 33 }, { "epoch": 0.020687557042896258, "grad_norm": 2.5474116802215576, "learning_rate": 6.4638783269961976e-06, "loss": 0.5012, "step": 34 }, { "epoch": 0.02129601460298144, "grad_norm": 2.737952947616577, "learning_rate": 6.653992395437263e-06, "loss": 0.4963, "step": 35 }, { "epoch": 0.021904472163066627, "grad_norm": 2.6271746158599854, "learning_rate": 6.844106463878328e-06, "loss": 0.5512, "step": 36 }, { "epoch": 0.02251292972315181, "grad_norm": 2.547055721282959, "learning_rate": 7.034220532319392e-06, "loss": 0.5489, "step": 37 }, { "epoch": 0.023121387283236993, "grad_norm": 2.6234169006347656, "learning_rate": 7.224334600760456e-06, "loss": 0.4627, "step": 38 }, { "epoch": 0.02372984484332218, "grad_norm": 2.4380102157592773, "learning_rate": 7.414448669201521e-06, "loss": 0.5985, "step": 39 }, { "epoch": 0.024338302403407362, "grad_norm": 2.633112668991089, "learning_rate": 7.604562737642585e-06, "loss": 0.4824, "step": 40 }, { "epoch": 0.024946759963492545, "grad_norm": 2.357767343521118, "learning_rate": 7.79467680608365e-06, "loss": 0.4725, "step": 41 }, { "epoch": 0.02555521752357773, "grad_norm": 2.5731077194213867, "learning_rate": 7.984790874524716e-06, "loss": 0.5152, "step": 42 }, { "epoch": 0.026163675083662914, "grad_norm": 2.6900103092193604, "learning_rate": 8.17490494296578e-06, "loss": 0.4789, "step": 43 }, { "epoch": 0.026772132643748097, "grad_norm": 2.833528757095337, "learning_rate": 8.365019011406844e-06, "loss": 0.534, "step": 44 }, { "epoch": 0.027380590203833283, "grad_norm": 2.824359893798828, "learning_rate": 8.55513307984791e-06, "loss": 0.4799, "step": 45 }, { "epoch": 0.027989047763918466, "grad_norm": 2.2743606567382812, "learning_rate": 8.745247148288973e-06, "loss": 0.4938, "step": 46 }, { "epoch": 0.02859750532400365, "grad_norm": 2.405674934387207, "learning_rate": 8.935361216730038e-06, "loss": 0.5277, "step": 47 }, { "epoch": 0.029205962884088835, "grad_norm": 2.473686695098877, "learning_rate": 9.125475285171103e-06, "loss": 0.4787, "step": 48 }, { "epoch": 0.029814420444174018, "grad_norm": 2.3129618167877197, "learning_rate": 9.315589353612169e-06, "loss": 0.4755, "step": 49 }, { "epoch": 0.030422878004259205, "grad_norm": 2.2328269481658936, "learning_rate": 9.505703422053232e-06, "loss": 0.5602, "step": 50 }, { "epoch": 0.031031335564344387, "grad_norm": 2.6735682487487793, "learning_rate": 9.695817490494297e-06, "loss": 0.5262, "step": 51 }, { "epoch": 0.03163979312442957, "grad_norm": 2.37196946144104, "learning_rate": 9.885931558935362e-06, "loss": 0.5204, "step": 52 }, { "epoch": 0.032248250684514757, "grad_norm": 2.7100112438201904, "learning_rate": 1.0076045627376426e-05, "loss": 0.5656, "step": 53 }, { "epoch": 0.032856708244599936, "grad_norm": 2.4364941120147705, "learning_rate": 1.0266159695817491e-05, "loss": 0.4961, "step": 54 }, { "epoch": 0.03346516580468512, "grad_norm": 2.3422157764434814, "learning_rate": 1.0456273764258556e-05, "loss": 0.492, "step": 55 }, { "epoch": 0.03407362336477031, "grad_norm": 2.0109498500823975, "learning_rate": 1.064638783269962e-05, "loss": 0.503, "step": 56 }, { "epoch": 0.03468208092485549, "grad_norm": 2.42881441116333, "learning_rate": 1.0836501901140685e-05, "loss": 0.5093, "step": 57 }, { "epoch": 0.035290538484940674, "grad_norm": 2.187835693359375, "learning_rate": 1.102661596958175e-05, "loss": 0.4663, "step": 58 }, { "epoch": 0.03589899604502586, "grad_norm": 2.3914639949798584, "learning_rate": 1.1216730038022814e-05, "loss": 0.4773, "step": 59 }, { "epoch": 0.03650745360511105, "grad_norm": 2.4356846809387207, "learning_rate": 1.1406844106463879e-05, "loss": 0.5378, "step": 60 }, { "epoch": 0.037115911165196226, "grad_norm": 2.398271083831787, "learning_rate": 1.1596958174904944e-05, "loss": 0.4864, "step": 61 }, { "epoch": 0.03772436872528141, "grad_norm": 2.2733852863311768, "learning_rate": 1.1787072243346007e-05, "loss": 0.4751, "step": 62 }, { "epoch": 0.0383328262853666, "grad_norm": 2.115436315536499, "learning_rate": 1.1977186311787073e-05, "loss": 0.4688, "step": 63 }, { "epoch": 0.03894128384545178, "grad_norm": 2.285353660583496, "learning_rate": 1.2167300380228138e-05, "loss": 0.5107, "step": 64 }, { "epoch": 0.039549741405536964, "grad_norm": 2.3437185287475586, "learning_rate": 1.2357414448669203e-05, "loss": 0.5132, "step": 65 }, { "epoch": 0.04015819896562215, "grad_norm": 2.5334279537200928, "learning_rate": 1.2547528517110266e-05, "loss": 0.4868, "step": 66 }, { "epoch": 0.04076665652570733, "grad_norm": 2.2732203006744385, "learning_rate": 1.2737642585551332e-05, "loss": 0.4784, "step": 67 }, { "epoch": 0.041375114085792516, "grad_norm": 2.3344926834106445, "learning_rate": 1.2927756653992395e-05, "loss": 0.5702, "step": 68 }, { "epoch": 0.0419835716458777, "grad_norm": 2.6183888912200928, "learning_rate": 1.3117870722433462e-05, "loss": 0.4828, "step": 69 }, { "epoch": 0.04259202920596288, "grad_norm": 3.995020627975464, "learning_rate": 1.3307984790874526e-05, "loss": 0.5182, "step": 70 }, { "epoch": 0.04320048676604807, "grad_norm": 2.3043413162231445, "learning_rate": 1.3498098859315589e-05, "loss": 0.5346, "step": 71 }, { "epoch": 0.043808944326133255, "grad_norm": 2.5298848152160645, "learning_rate": 1.3688212927756656e-05, "loss": 0.6079, "step": 72 }, { "epoch": 0.044417401886218434, "grad_norm": 2.1517183780670166, "learning_rate": 1.387832699619772e-05, "loss": 0.4962, "step": 73 }, { "epoch": 0.04502585944630362, "grad_norm": 2.3889400959014893, "learning_rate": 1.4068441064638785e-05, "loss": 0.5005, "step": 74 }, { "epoch": 0.04563431700638881, "grad_norm": 2.265465259552002, "learning_rate": 1.4258555133079848e-05, "loss": 0.5684, "step": 75 }, { "epoch": 0.046242774566473986, "grad_norm": 2.3578243255615234, "learning_rate": 1.4448669201520912e-05, "loss": 0.5401, "step": 76 }, { "epoch": 0.04685123212655917, "grad_norm": 2.2916693687438965, "learning_rate": 1.4638783269961978e-05, "loss": 0.482, "step": 77 }, { "epoch": 0.04745968968664436, "grad_norm": 2.678074359893799, "learning_rate": 1.4828897338403042e-05, "loss": 0.5157, "step": 78 }, { "epoch": 0.04806814724672954, "grad_norm": 2.3698818683624268, "learning_rate": 1.5019011406844107e-05, "loss": 0.4558, "step": 79 }, { "epoch": 0.048676604806814724, "grad_norm": 2.1744048595428467, "learning_rate": 1.520912547528517e-05, "loss": 0.4658, "step": 80 }, { "epoch": 0.04928506236689991, "grad_norm": 2.1933300495147705, "learning_rate": 1.5399239543726237e-05, "loss": 0.518, "step": 81 }, { "epoch": 0.04989351992698509, "grad_norm": 2.573246479034424, "learning_rate": 1.55893536121673e-05, "loss": 0.5668, "step": 82 }, { "epoch": 0.050501977487070276, "grad_norm": 2.249875068664551, "learning_rate": 1.5779467680608364e-05, "loss": 0.4757, "step": 83 }, { "epoch": 0.05111043504715546, "grad_norm": 2.1911234855651855, "learning_rate": 1.596958174904943e-05, "loss": 0.4753, "step": 84 }, { "epoch": 0.05171889260724064, "grad_norm": 3.867871046066284, "learning_rate": 1.6159695817490495e-05, "loss": 0.4797, "step": 85 }, { "epoch": 0.05232735016732583, "grad_norm": 2.745781421661377, "learning_rate": 1.634980988593156e-05, "loss": 0.4968, "step": 86 }, { "epoch": 0.052935807727411015, "grad_norm": 2.0692405700683594, "learning_rate": 1.6539923954372625e-05, "loss": 0.481, "step": 87 }, { "epoch": 0.053544265287496194, "grad_norm": 2.262392997741699, "learning_rate": 1.673003802281369e-05, "loss": 0.5532, "step": 88 }, { "epoch": 0.05415272284758138, "grad_norm": 2.1288435459136963, "learning_rate": 1.6920152091254756e-05, "loss": 0.4839, "step": 89 }, { "epoch": 0.05476118040766657, "grad_norm": 2.0853445529937744, "learning_rate": 1.711026615969582e-05, "loss": 0.5071, "step": 90 }, { "epoch": 0.055369637967751746, "grad_norm": 2.296994686126709, "learning_rate": 1.7300380228136882e-05, "loss": 0.5614, "step": 91 }, { "epoch": 0.05597809552783693, "grad_norm": 2.261611223220825, "learning_rate": 1.7490494296577946e-05, "loss": 0.536, "step": 92 }, { "epoch": 0.05658655308792212, "grad_norm": 2.2473561763763428, "learning_rate": 1.7680608365019013e-05, "loss": 0.4652, "step": 93 }, { "epoch": 0.0571950106480073, "grad_norm": 2.532961368560791, "learning_rate": 1.7870722433460076e-05, "loss": 0.5331, "step": 94 }, { "epoch": 0.057803468208092484, "grad_norm": 2.3481945991516113, "learning_rate": 1.806083650190114e-05, "loss": 0.52, "step": 95 }, { "epoch": 0.05841192576817767, "grad_norm": 2.024817943572998, "learning_rate": 1.8250950570342207e-05, "loss": 0.4745, "step": 96 }, { "epoch": 0.05902038332826286, "grad_norm": 2.2880470752716064, "learning_rate": 1.844106463878327e-05, "loss": 0.5019, "step": 97 }, { "epoch": 0.059628840888348036, "grad_norm": 1.9540607929229736, "learning_rate": 1.8631178707224337e-05, "loss": 0.5138, "step": 98 }, { "epoch": 0.06023729844843322, "grad_norm": 2.5749783515930176, "learning_rate": 1.88212927756654e-05, "loss": 0.5639, "step": 99 }, { "epoch": 0.06084575600851841, "grad_norm": 2.0296337604522705, "learning_rate": 1.9011406844106464e-05, "loss": 0.4929, "step": 100 }, { "epoch": 0.06145421356860359, "grad_norm": 2.068000316619873, "learning_rate": 1.920152091254753e-05, "loss": 0.4568, "step": 101 }, { "epoch": 0.062062671128688775, "grad_norm": 2.4328064918518066, "learning_rate": 1.9391634980988594e-05, "loss": 0.5764, "step": 102 }, { "epoch": 0.06267112868877396, "grad_norm": 1.961745262145996, "learning_rate": 1.958174904942966e-05, "loss": 0.4756, "step": 103 }, { "epoch": 0.06327958624885914, "grad_norm": 2.189243793487549, "learning_rate": 1.9771863117870725e-05, "loss": 0.5254, "step": 104 }, { "epoch": 0.06388804380894432, "grad_norm": 1.9438084363937378, "learning_rate": 1.9961977186311788e-05, "loss": 0.4921, "step": 105 }, { "epoch": 0.06449650136902951, "grad_norm": 1.9308326244354248, "learning_rate": 2.0152091254752852e-05, "loss": 0.4508, "step": 106 }, { "epoch": 0.06510495892911469, "grad_norm": 1.9742590188980103, "learning_rate": 2.0342205323193915e-05, "loss": 0.4899, "step": 107 }, { "epoch": 0.06571341648919987, "grad_norm": 2.1796038150787354, "learning_rate": 2.0532319391634982e-05, "loss": 0.5594, "step": 108 }, { "epoch": 0.06632187404928507, "grad_norm": 2.2020134925842285, "learning_rate": 2.0722433460076046e-05, "loss": 0.4946, "step": 109 }, { "epoch": 0.06693033160937024, "grad_norm": 1.9152429103851318, "learning_rate": 2.0912547528517112e-05, "loss": 0.485, "step": 110 }, { "epoch": 0.06753878916945542, "grad_norm": 2.1203103065490723, "learning_rate": 2.1102661596958176e-05, "loss": 0.5062, "step": 111 }, { "epoch": 0.06814724672954062, "grad_norm": 1.934693694114685, "learning_rate": 2.129277566539924e-05, "loss": 0.4841, "step": 112 }, { "epoch": 0.0687557042896258, "grad_norm": 2.1393649578094482, "learning_rate": 2.1482889733840306e-05, "loss": 0.585, "step": 113 }, { "epoch": 0.06936416184971098, "grad_norm": 2.0923867225646973, "learning_rate": 2.167300380228137e-05, "loss": 0.5467, "step": 114 }, { "epoch": 0.06997261940979617, "grad_norm": 1.947996735572815, "learning_rate": 2.1863117870722437e-05, "loss": 0.4804, "step": 115 }, { "epoch": 0.07058107696988135, "grad_norm": 2.1590495109558105, "learning_rate": 2.20532319391635e-05, "loss": 0.5335, "step": 116 }, { "epoch": 0.07118953452996654, "grad_norm": 2.0173380374908447, "learning_rate": 2.2243346007604564e-05, "loss": 0.5273, "step": 117 }, { "epoch": 0.07179799209005172, "grad_norm": 2.251988649368286, "learning_rate": 2.2433460076045627e-05, "loss": 0.5975, "step": 118 }, { "epoch": 0.0724064496501369, "grad_norm": 2.279703378677368, "learning_rate": 2.262357414448669e-05, "loss": 0.5642, "step": 119 }, { "epoch": 0.0730149072102221, "grad_norm": 2.0928151607513428, "learning_rate": 2.2813688212927758e-05, "loss": 0.5462, "step": 120 }, { "epoch": 0.07362336477030727, "grad_norm": 9.408754348754883, "learning_rate": 2.300380228136882e-05, "loss": 0.5737, "step": 121 }, { "epoch": 0.07423182233039245, "grad_norm": 2.0510871410369873, "learning_rate": 2.3193916349809888e-05, "loss": 0.4958, "step": 122 }, { "epoch": 0.07484027989047765, "grad_norm": 1.8674732446670532, "learning_rate": 2.338403041825095e-05, "loss": 0.4752, "step": 123 }, { "epoch": 0.07544873745056282, "grad_norm": 1.8963392972946167, "learning_rate": 2.3574144486692015e-05, "loss": 0.5461, "step": 124 }, { "epoch": 0.076057195010648, "grad_norm": 1.7347912788391113, "learning_rate": 2.3764258555133082e-05, "loss": 0.4789, "step": 125 }, { "epoch": 0.0766656525707332, "grad_norm": 2.5174317359924316, "learning_rate": 2.3954372623574145e-05, "loss": 0.5642, "step": 126 }, { "epoch": 0.07727411013081838, "grad_norm": 2.2092862129211426, "learning_rate": 2.4144486692015212e-05, "loss": 0.5812, "step": 127 }, { "epoch": 0.07788256769090356, "grad_norm": 2.6390881538391113, "learning_rate": 2.4334600760456276e-05, "loss": 0.5021, "step": 128 }, { "epoch": 0.07849102525098875, "grad_norm": 1.8941545486450195, "learning_rate": 2.452471482889734e-05, "loss": 0.4816, "step": 129 }, { "epoch": 0.07909948281107393, "grad_norm": 1.7453467845916748, "learning_rate": 2.4714828897338406e-05, "loss": 0.4823, "step": 130 }, { "epoch": 0.07970794037115911, "grad_norm": 1.9925953149795532, "learning_rate": 2.490494296577947e-05, "loss": 0.5208, "step": 131 }, { "epoch": 0.0803163979312443, "grad_norm": 2.222625494003296, "learning_rate": 2.5095057034220533e-05, "loss": 0.4962, "step": 132 }, { "epoch": 0.08092485549132948, "grad_norm": 1.99649178981781, "learning_rate": 2.5285171102661596e-05, "loss": 0.5195, "step": 133 }, { "epoch": 0.08153331305141466, "grad_norm": 4.0735273361206055, "learning_rate": 2.5475285171102663e-05, "loss": 0.5421, "step": 134 }, { "epoch": 0.08214177061149985, "grad_norm": 2.117365837097168, "learning_rate": 2.5665399239543723e-05, "loss": 0.5745, "step": 135 }, { "epoch": 0.08275022817158503, "grad_norm": 2.24652099609375, "learning_rate": 2.585551330798479e-05, "loss": 0.5328, "step": 136 }, { "epoch": 0.08335868573167021, "grad_norm": 2.288801670074463, "learning_rate": 2.6045627376425857e-05, "loss": 0.5145, "step": 137 }, { "epoch": 0.0839671432917554, "grad_norm": 1.9129204750061035, "learning_rate": 2.6235741444866924e-05, "loss": 0.5466, "step": 138 }, { "epoch": 0.08457560085184058, "grad_norm": 2.242582082748413, "learning_rate": 2.6425855513307984e-05, "loss": 0.5592, "step": 139 }, { "epoch": 0.08518405841192576, "grad_norm": 1.8476616144180298, "learning_rate": 2.661596958174905e-05, "loss": 0.5257, "step": 140 }, { "epoch": 0.08579251597201096, "grad_norm": 1.9070931673049927, "learning_rate": 2.6806083650190118e-05, "loss": 0.5186, "step": 141 }, { "epoch": 0.08640097353209614, "grad_norm": 1.9647867679595947, "learning_rate": 2.6996197718631178e-05, "loss": 0.595, "step": 142 }, { "epoch": 0.08700943109218132, "grad_norm": 2.1235222816467285, "learning_rate": 2.7186311787072245e-05, "loss": 0.4921, "step": 143 }, { "epoch": 0.08761788865226651, "grad_norm": 2.1548256874084473, "learning_rate": 2.7376425855513312e-05, "loss": 0.5166, "step": 144 }, { "epoch": 0.08822634621235169, "grad_norm": 3.3486247062683105, "learning_rate": 2.7566539923954375e-05, "loss": 0.5909, "step": 145 }, { "epoch": 0.08883480377243687, "grad_norm": 2.245957136154175, "learning_rate": 2.775665399239544e-05, "loss": 0.5726, "step": 146 }, { "epoch": 0.08944326133252206, "grad_norm": 1.7607914209365845, "learning_rate": 2.7946768060836502e-05, "loss": 0.4802, "step": 147 }, { "epoch": 0.09005171889260724, "grad_norm": 1.840922474861145, "learning_rate": 2.813688212927757e-05, "loss": 0.5054, "step": 148 }, { "epoch": 0.09066017645269242, "grad_norm": 1.794522762298584, "learning_rate": 2.832699619771863e-05, "loss": 0.5443, "step": 149 }, { "epoch": 0.09126863401277761, "grad_norm": 1.9434462785720825, "learning_rate": 2.8517110266159696e-05, "loss": 0.5159, "step": 150 }, { "epoch": 0.09187709157286279, "grad_norm": 2.252189874649048, "learning_rate": 2.8707224334600763e-05, "loss": 0.5802, "step": 151 }, { "epoch": 0.09248554913294797, "grad_norm": 1.9505751132965088, "learning_rate": 2.8897338403041823e-05, "loss": 0.5033, "step": 152 }, { "epoch": 0.09309400669303317, "grad_norm": 2.0647778511047363, "learning_rate": 2.908745247148289e-05, "loss": 0.5465, "step": 153 }, { "epoch": 0.09370246425311834, "grad_norm": 2.034170150756836, "learning_rate": 2.9277566539923957e-05, "loss": 0.5278, "step": 154 }, { "epoch": 0.09431092181320352, "grad_norm": 2.2246432304382324, "learning_rate": 2.9467680608365024e-05, "loss": 0.5276, "step": 155 }, { "epoch": 0.09491937937328872, "grad_norm": 1.8863154649734497, "learning_rate": 2.9657794676806084e-05, "loss": 0.4909, "step": 156 }, { "epoch": 0.0955278369333739, "grad_norm": 2.914320945739746, "learning_rate": 2.984790874524715e-05, "loss": 0.674, "step": 157 }, { "epoch": 0.09613629449345908, "grad_norm": 4.749186038970947, "learning_rate": 3.0038022813688214e-05, "loss": 0.5386, "step": 158 }, { "epoch": 0.09674475205354427, "grad_norm": 2.0162932872772217, "learning_rate": 3.0228136882129278e-05, "loss": 0.5441, "step": 159 }, { "epoch": 0.09735320961362945, "grad_norm": 1.8768587112426758, "learning_rate": 3.041825095057034e-05, "loss": 0.5373, "step": 160 }, { "epoch": 0.09796166717371463, "grad_norm": 1.9952895641326904, "learning_rate": 3.060836501901141e-05, "loss": 0.5037, "step": 161 }, { "epoch": 0.09857012473379982, "grad_norm": 1.961302638053894, "learning_rate": 3.0798479087452475e-05, "loss": 0.4827, "step": 162 }, { "epoch": 0.099178582293885, "grad_norm": 2.2090559005737305, "learning_rate": 3.098859315589354e-05, "loss": 0.5783, "step": 163 }, { "epoch": 0.09978703985397018, "grad_norm": 1.7766873836517334, "learning_rate": 3.11787072243346e-05, "loss": 0.4749, "step": 164 }, { "epoch": 0.10039549741405537, "grad_norm": 2.083379030227661, "learning_rate": 3.1368821292775665e-05, "loss": 0.5149, "step": 165 }, { "epoch": 0.10100395497414055, "grad_norm": 2.4703352451324463, "learning_rate": 3.155893536121673e-05, "loss": 0.5672, "step": 166 }, { "epoch": 0.10161241253422573, "grad_norm": 2.7795681953430176, "learning_rate": 3.174904942965779e-05, "loss": 0.583, "step": 167 }, { "epoch": 0.10222087009431093, "grad_norm": 1.8572745323181152, "learning_rate": 3.193916349809886e-05, "loss": 0.4859, "step": 168 }, { "epoch": 0.1028293276543961, "grad_norm": 2.1731905937194824, "learning_rate": 3.2129277566539926e-05, "loss": 0.4851, "step": 169 }, { "epoch": 0.10343778521448128, "grad_norm": 2.5642776489257812, "learning_rate": 3.231939163498099e-05, "loss": 0.5155, "step": 170 }, { "epoch": 0.10404624277456648, "grad_norm": 2.716291904449463, "learning_rate": 3.250950570342205e-05, "loss": 0.5387, "step": 171 }, { "epoch": 0.10465470033465166, "grad_norm": 2.027435541152954, "learning_rate": 3.269961977186312e-05, "loss": 0.5266, "step": 172 }, { "epoch": 0.10526315789473684, "grad_norm": 2.161743402481079, "learning_rate": 3.288973384030418e-05, "loss": 0.58, "step": 173 }, { "epoch": 0.10587161545482203, "grad_norm": 2.0740175247192383, "learning_rate": 3.307984790874525e-05, "loss": 0.5476, "step": 174 }, { "epoch": 0.10648007301490721, "grad_norm": 1.8054178953170776, "learning_rate": 3.3269961977186314e-05, "loss": 0.513, "step": 175 }, { "epoch": 0.10708853057499239, "grad_norm": 1.872541069984436, "learning_rate": 3.346007604562738e-05, "loss": 0.5266, "step": 176 }, { "epoch": 0.10769698813507758, "grad_norm": 2.594137191772461, "learning_rate": 3.365019011406844e-05, "loss": 0.56, "step": 177 }, { "epoch": 0.10830544569516276, "grad_norm": 1.95430588722229, "learning_rate": 3.384030418250951e-05, "loss": 0.5971, "step": 178 }, { "epoch": 0.10891390325524794, "grad_norm": 2.0000500679016113, "learning_rate": 3.4030418250950574e-05, "loss": 0.4828, "step": 179 }, { "epoch": 0.10952236081533313, "grad_norm": 1.7855273485183716, "learning_rate": 3.422053231939164e-05, "loss": 0.5368, "step": 180 }, { "epoch": 0.11013081837541831, "grad_norm": 2.0093138217926025, "learning_rate": 3.44106463878327e-05, "loss": 0.5519, "step": 181 }, { "epoch": 0.11073927593550349, "grad_norm": 1.8398433923721313, "learning_rate": 3.4600760456273765e-05, "loss": 0.5089, "step": 182 }, { "epoch": 0.11134773349558869, "grad_norm": 1.9832158088684082, "learning_rate": 3.479087452471483e-05, "loss": 0.578, "step": 183 }, { "epoch": 0.11195619105567386, "grad_norm": 1.8576923608779907, "learning_rate": 3.498098859315589e-05, "loss": 0.545, "step": 184 }, { "epoch": 0.11256464861575904, "grad_norm": 1.8127909898757935, "learning_rate": 3.517110266159696e-05, "loss": 0.5694, "step": 185 }, { "epoch": 0.11317310617584424, "grad_norm": 2.124783515930176, "learning_rate": 3.5361216730038026e-05, "loss": 0.6131, "step": 186 }, { "epoch": 0.11378156373592942, "grad_norm": 1.8797577619552612, "learning_rate": 3.555133079847909e-05, "loss": 0.5098, "step": 187 }, { "epoch": 0.1143900212960146, "grad_norm": 1.871032476425171, "learning_rate": 3.574144486692015e-05, "loss": 0.5785, "step": 188 }, { "epoch": 0.11499847885609979, "grad_norm": 1.6941313743591309, "learning_rate": 3.593155893536122e-05, "loss": 0.5115, "step": 189 }, { "epoch": 0.11560693641618497, "grad_norm": 2.301297903060913, "learning_rate": 3.612167300380228e-05, "loss": 0.587, "step": 190 }, { "epoch": 0.11621539397627015, "grad_norm": 1.9337679147720337, "learning_rate": 3.631178707224335e-05, "loss": 0.5393, "step": 191 }, { "epoch": 0.11682385153635534, "grad_norm": 2.24143648147583, "learning_rate": 3.6501901140684413e-05, "loss": 0.5152, "step": 192 }, { "epoch": 0.11743230909644052, "grad_norm": 2.0369713306427, "learning_rate": 3.669201520912548e-05, "loss": 0.5691, "step": 193 }, { "epoch": 0.11804076665652571, "grad_norm": 1.8279515504837036, "learning_rate": 3.688212927756654e-05, "loss": 0.4814, "step": 194 }, { "epoch": 0.1186492242166109, "grad_norm": 2.0841150283813477, "learning_rate": 3.7072243346007604e-05, "loss": 0.5266, "step": 195 }, { "epoch": 0.11925768177669607, "grad_norm": 1.830251693725586, "learning_rate": 3.7262357414448674e-05, "loss": 0.5533, "step": 196 }, { "epoch": 0.11986613933678127, "grad_norm": 1.8735334873199463, "learning_rate": 3.745247148288973e-05, "loss": 0.5265, "step": 197 }, { "epoch": 0.12047459689686645, "grad_norm": 2.144216299057007, "learning_rate": 3.76425855513308e-05, "loss": 0.5795, "step": 198 }, { "epoch": 0.12108305445695162, "grad_norm": 1.7879762649536133, "learning_rate": 3.7832699619771865e-05, "loss": 0.5834, "step": 199 }, { "epoch": 0.12169151201703682, "grad_norm": 2.255518674850464, "learning_rate": 3.802281368821293e-05, "loss": 0.5094, "step": 200 }, { "epoch": 0.122299969577122, "grad_norm": 1.92243230342865, "learning_rate": 3.821292775665399e-05, "loss": 0.5215, "step": 201 }, { "epoch": 0.12290842713720718, "grad_norm": 2.395334005355835, "learning_rate": 3.840304182509506e-05, "loss": 0.5715, "step": 202 }, { "epoch": 0.12351688469729237, "grad_norm": 2.1944241523742676, "learning_rate": 3.8593155893536125e-05, "loss": 0.5486, "step": 203 }, { "epoch": 0.12412534225737755, "grad_norm": 1.983839750289917, "learning_rate": 3.878326996197719e-05, "loss": 0.5876, "step": 204 }, { "epoch": 0.12473379981746273, "grad_norm": 2.557204246520996, "learning_rate": 3.897338403041825e-05, "loss": 0.5066, "step": 205 }, { "epoch": 0.12534225737754792, "grad_norm": 1.9440828561782837, "learning_rate": 3.916349809885932e-05, "loss": 0.567, "step": 206 }, { "epoch": 0.1259507149376331, "grad_norm": 1.8540195226669312, "learning_rate": 3.935361216730038e-05, "loss": 0.5652, "step": 207 }, { "epoch": 0.12655917249771828, "grad_norm": 1.9962955713272095, "learning_rate": 3.954372623574145e-05, "loss": 0.5995, "step": 208 }, { "epoch": 0.12716763005780346, "grad_norm": 2.1377549171447754, "learning_rate": 3.973384030418251e-05, "loss": 0.671, "step": 209 }, { "epoch": 0.12777608761788864, "grad_norm": 2.1312196254730225, "learning_rate": 3.9923954372623577e-05, "loss": 0.5925, "step": 210 }, { "epoch": 0.12838454517797385, "grad_norm": 1.659692645072937, "learning_rate": 4.011406844106464e-05, "loss": 0.5313, "step": 211 }, { "epoch": 0.12899300273805903, "grad_norm": 2.2088122367858887, "learning_rate": 4.0304182509505703e-05, "loss": 0.5368, "step": 212 }, { "epoch": 0.1296014602981442, "grad_norm": 1.8686944246292114, "learning_rate": 4.0494296577946774e-05, "loss": 0.5576, "step": 213 }, { "epoch": 0.13020991785822938, "grad_norm": 1.934857726097107, "learning_rate": 4.068441064638783e-05, "loss": 0.5519, "step": 214 }, { "epoch": 0.13081837541831456, "grad_norm": 1.8353568315505981, "learning_rate": 4.08745247148289e-05, "loss": 0.5312, "step": 215 }, { "epoch": 0.13142683297839974, "grad_norm": 1.9443809986114502, "learning_rate": 4.1064638783269964e-05, "loss": 0.5108, "step": 216 }, { "epoch": 0.13203529053848495, "grad_norm": 1.8545550107955933, "learning_rate": 4.125475285171103e-05, "loss": 0.559, "step": 217 }, { "epoch": 0.13264374809857013, "grad_norm": 1.9417598247528076, "learning_rate": 4.144486692015209e-05, "loss": 0.5631, "step": 218 }, { "epoch": 0.1332522056586553, "grad_norm": 1.7785390615463257, "learning_rate": 4.163498098859316e-05, "loss": 0.5519, "step": 219 }, { "epoch": 0.1338606632187405, "grad_norm": 1.9775593280792236, "learning_rate": 4.1825095057034225e-05, "loss": 0.6203, "step": 220 }, { "epoch": 0.13446912077882567, "grad_norm": 1.7184250354766846, "learning_rate": 4.201520912547529e-05, "loss": 0.5356, "step": 221 }, { "epoch": 0.13507757833891085, "grad_norm": 1.7691705226898193, "learning_rate": 4.220532319391635e-05, "loss": 0.5185, "step": 222 }, { "epoch": 0.13568603589899605, "grad_norm": 1.759832739830017, "learning_rate": 4.2395437262357415e-05, "loss": 0.5276, "step": 223 }, { "epoch": 0.13629449345908123, "grad_norm": 1.8581064939498901, "learning_rate": 4.258555133079848e-05, "loss": 0.5341, "step": 224 }, { "epoch": 0.1369029510191664, "grad_norm": 1.9540525674819946, "learning_rate": 4.277566539923954e-05, "loss": 0.5544, "step": 225 }, { "epoch": 0.1375114085792516, "grad_norm": 2.0600829124450684, "learning_rate": 4.296577946768061e-05, "loss": 0.5259, "step": 226 }, { "epoch": 0.13811986613933677, "grad_norm": 1.8912417888641357, "learning_rate": 4.3155893536121676e-05, "loss": 0.5448, "step": 227 }, { "epoch": 0.13872832369942195, "grad_norm": 1.7389650344848633, "learning_rate": 4.334600760456274e-05, "loss": 0.5143, "step": 228 }, { "epoch": 0.13933678125950716, "grad_norm": 1.844450831413269, "learning_rate": 4.35361216730038e-05, "loss": 0.542, "step": 229 }, { "epoch": 0.13994523881959234, "grad_norm": 1.8148777484893799, "learning_rate": 4.3726235741444873e-05, "loss": 0.5151, "step": 230 }, { "epoch": 0.14055369637967752, "grad_norm": 1.9488646984100342, "learning_rate": 4.391634980988593e-05, "loss": 0.6014, "step": 231 }, { "epoch": 0.1411621539397627, "grad_norm": 1.8368064165115356, "learning_rate": 4.4106463878327e-05, "loss": 0.5752, "step": 232 }, { "epoch": 0.14177061149984788, "grad_norm": 2.520059823989868, "learning_rate": 4.4296577946768064e-05, "loss": 0.5604, "step": 233 }, { "epoch": 0.14237906905993308, "grad_norm": 1.6602476835250854, "learning_rate": 4.448669201520913e-05, "loss": 0.5702, "step": 234 }, { "epoch": 0.14298752662001826, "grad_norm": 2.0854058265686035, "learning_rate": 4.467680608365019e-05, "loss": 0.6663, "step": 235 }, { "epoch": 0.14359598418010344, "grad_norm": 1.8286073207855225, "learning_rate": 4.4866920152091254e-05, "loss": 0.5896, "step": 236 }, { "epoch": 0.14420444174018862, "grad_norm": 1.8150510787963867, "learning_rate": 4.5057034220532325e-05, "loss": 0.5698, "step": 237 }, { "epoch": 0.1448128993002738, "grad_norm": 1.7325129508972168, "learning_rate": 4.524714828897338e-05, "loss": 0.5399, "step": 238 }, { "epoch": 0.14542135686035898, "grad_norm": 1.7553200721740723, "learning_rate": 4.543726235741445e-05, "loss": 0.5335, "step": 239 }, { "epoch": 0.1460298144204442, "grad_norm": 1.677822470664978, "learning_rate": 4.5627376425855515e-05, "loss": 0.5662, "step": 240 }, { "epoch": 0.14663827198052937, "grad_norm": 2.0692317485809326, "learning_rate": 4.581749049429658e-05, "loss": 0.7082, "step": 241 }, { "epoch": 0.14724672954061455, "grad_norm": 1.5937222242355347, "learning_rate": 4.600760456273764e-05, "loss": 0.5736, "step": 242 }, { "epoch": 0.14785518710069973, "grad_norm": 1.6691651344299316, "learning_rate": 4.619771863117871e-05, "loss": 0.5081, "step": 243 }, { "epoch": 0.1484636446607849, "grad_norm": 1.916918396949768, "learning_rate": 4.6387832699619776e-05, "loss": 0.5834, "step": 244 }, { "epoch": 0.14907210222087008, "grad_norm": 1.7552460432052612, "learning_rate": 4.657794676806084e-05, "loss": 0.5644, "step": 245 }, { "epoch": 0.1496805597809553, "grad_norm": 1.794980764389038, "learning_rate": 4.67680608365019e-05, "loss": 0.5694, "step": 246 }, { "epoch": 0.15028901734104047, "grad_norm": 1.563038945198059, "learning_rate": 4.695817490494297e-05, "loss": 0.5564, "step": 247 }, { "epoch": 0.15089747490112565, "grad_norm": 1.6664254665374756, "learning_rate": 4.714828897338403e-05, "loss": 0.5837, "step": 248 }, { "epoch": 0.15150593246121083, "grad_norm": 1.7116878032684326, "learning_rate": 4.73384030418251e-05, "loss": 0.5491, "step": 249 }, { "epoch": 0.152114390021296, "grad_norm": 1.987626075744629, "learning_rate": 4.7528517110266163e-05, "loss": 0.6203, "step": 250 }, { "epoch": 0.1527228475813812, "grad_norm": 1.6196045875549316, "learning_rate": 4.771863117870723e-05, "loss": 0.5393, "step": 251 }, { "epoch": 0.1533313051414664, "grad_norm": 1.8573944568634033, "learning_rate": 4.790874524714829e-05, "loss": 0.575, "step": 252 }, { "epoch": 0.15393976270155157, "grad_norm": 1.9413925409317017, "learning_rate": 4.8098859315589354e-05, "loss": 0.5713, "step": 253 }, { "epoch": 0.15454822026163675, "grad_norm": 1.883753776550293, "learning_rate": 4.8288973384030424e-05, "loss": 0.5714, "step": 254 }, { "epoch": 0.15515667782172193, "grad_norm": 1.7571375370025635, "learning_rate": 4.847908745247148e-05, "loss": 0.6265, "step": 255 }, { "epoch": 0.1557651353818071, "grad_norm": 1.780029058456421, "learning_rate": 4.866920152091255e-05, "loss": 0.5677, "step": 256 }, { "epoch": 0.1563735929418923, "grad_norm": 2.0636849403381348, "learning_rate": 4.8859315589353615e-05, "loss": 0.5038, "step": 257 }, { "epoch": 0.1569820505019775, "grad_norm": 1.6824311017990112, "learning_rate": 4.904942965779468e-05, "loss": 0.5905, "step": 258 }, { "epoch": 0.15759050806206268, "grad_norm": 2.048978328704834, "learning_rate": 4.923954372623574e-05, "loss": 0.6073, "step": 259 }, { "epoch": 0.15819896562214786, "grad_norm": 2.075801134109497, "learning_rate": 4.942965779467681e-05, "loss": 0.5321, "step": 260 }, { "epoch": 0.15880742318223304, "grad_norm": 1.8397325277328491, "learning_rate": 4.9619771863117875e-05, "loss": 0.6598, "step": 261 }, { "epoch": 0.15941588074231822, "grad_norm": 2.7808053493499756, "learning_rate": 4.980988593155894e-05, "loss": 0.5265, "step": 262 }, { "epoch": 0.1600243383024034, "grad_norm": 2.129298686981201, "learning_rate": 5e-05, "loss": 0.5235, "step": 263 }, { "epoch": 0.1606327958624886, "grad_norm": 6.710923671722412, "learning_rate": 4.999999690052103e-05, "loss": 0.562, "step": 264 }, { "epoch": 0.16124125342257378, "grad_norm": 4.849277973175049, "learning_rate": 4.9999987602084876e-05, "loss": 0.5677, "step": 265 }, { "epoch": 0.16184971098265896, "grad_norm": 1.6120680570602417, "learning_rate": 4.999997210469385e-05, "loss": 0.5322, "step": 266 }, { "epoch": 0.16245816854274414, "grad_norm": 27.193222045898438, "learning_rate": 4.9999950408351784e-05, "loss": 0.7448, "step": 267 }, { "epoch": 0.16306662610282932, "grad_norm": 2.693523645401001, "learning_rate": 4.999992251306407e-05, "loss": 0.6533, "step": 268 }, { "epoch": 0.1636750836629145, "grad_norm": 1.7629886865615845, "learning_rate": 4.999988841883763e-05, "loss": 0.5668, "step": 269 }, { "epoch": 0.1642835412229997, "grad_norm": 1.9537672996520996, "learning_rate": 4.999984812568089e-05, "loss": 0.5254, "step": 270 }, { "epoch": 0.1648919987830849, "grad_norm": 4.926598072052002, "learning_rate": 4.999980163360388e-05, "loss": 0.5795, "step": 271 }, { "epoch": 0.16550045634317007, "grad_norm": 2.0768604278564453, "learning_rate": 4.9999748942618094e-05, "loss": 0.6014, "step": 272 }, { "epoch": 0.16610891390325525, "grad_norm": 2.3579049110412598, "learning_rate": 4.999969005273661e-05, "loss": 0.6518, "step": 273 }, { "epoch": 0.16671737146334042, "grad_norm": 2.4885222911834717, "learning_rate": 4.9999624963974045e-05, "loss": 0.6194, "step": 274 }, { "epoch": 0.1673258290234256, "grad_norm": 2.2712252140045166, "learning_rate": 4.999955367634652e-05, "loss": 0.6094, "step": 275 }, { "epoch": 0.1679342865835108, "grad_norm": 1.9794126749038696, "learning_rate": 4.999947618987171e-05, "loss": 0.5214, "step": 276 }, { "epoch": 0.168542744143596, "grad_norm": 1.6654919385910034, "learning_rate": 4.999939250456884e-05, "loss": 0.5547, "step": 277 }, { "epoch": 0.16915120170368117, "grad_norm": 1.932141900062561, "learning_rate": 4.999930262045865e-05, "loss": 0.5427, "step": 278 }, { "epoch": 0.16975965926376635, "grad_norm": 1.752447247505188, "learning_rate": 4.999920653756344e-05, "loss": 0.5661, "step": 279 }, { "epoch": 0.17036811682385153, "grad_norm": 2.4955854415893555, "learning_rate": 4.9999104255907015e-05, "loss": 0.6635, "step": 280 }, { "epoch": 0.1709765743839367, "grad_norm": 1.774271011352539, "learning_rate": 4.999899577551476e-05, "loss": 0.6433, "step": 281 }, { "epoch": 0.17158503194402192, "grad_norm": 1.7379838228225708, "learning_rate": 4.9998881096413554e-05, "loss": 0.4936, "step": 282 }, { "epoch": 0.1721934895041071, "grad_norm": 1.5458259582519531, "learning_rate": 4.9998760218631845e-05, "loss": 0.5219, "step": 283 }, { "epoch": 0.17280194706419227, "grad_norm": 1.8059486150741577, "learning_rate": 4.99986331421996e-05, "loss": 0.5384, "step": 284 }, { "epoch": 0.17341040462427745, "grad_norm": 1.9652469158172607, "learning_rate": 4.999849986714833e-05, "loss": 0.6101, "step": 285 }, { "epoch": 0.17401886218436263, "grad_norm": 1.7066302299499512, "learning_rate": 4.999836039351108e-05, "loss": 0.5981, "step": 286 }, { "epoch": 0.1746273197444478, "grad_norm": 1.5256247520446777, "learning_rate": 4.999821472132244e-05, "loss": 0.5109, "step": 287 }, { "epoch": 0.17523577730453302, "grad_norm": 1.713955283164978, "learning_rate": 4.999806285061852e-05, "loss": 0.557, "step": 288 }, { "epoch": 0.1758442348646182, "grad_norm": 1.8485389947891235, "learning_rate": 4.999790478143699e-05, "loss": 0.575, "step": 289 }, { "epoch": 0.17645269242470338, "grad_norm": 1.7782764434814453, "learning_rate": 4.999774051381704e-05, "loss": 0.5693, "step": 290 }, { "epoch": 0.17706114998478856, "grad_norm": 2.34918212890625, "learning_rate": 4.999757004779939e-05, "loss": 0.6003, "step": 291 }, { "epoch": 0.17766960754487374, "grad_norm": 1.9476317167282104, "learning_rate": 4.999739338342633e-05, "loss": 0.6231, "step": 292 }, { "epoch": 0.17827806510495892, "grad_norm": 1.6623529195785522, "learning_rate": 4.999721052074164e-05, "loss": 0.5573, "step": 293 }, { "epoch": 0.17888652266504412, "grad_norm": 1.4444576501846313, "learning_rate": 4.999702145979069e-05, "loss": 0.5658, "step": 294 }, { "epoch": 0.1794949802251293, "grad_norm": 1.9570586681365967, "learning_rate": 4.9996826200620336e-05, "loss": 0.538, "step": 295 }, { "epoch": 0.18010343778521448, "grad_norm": 1.6489269733428955, "learning_rate": 4.999662474327901e-05, "loss": 0.6005, "step": 296 }, { "epoch": 0.18071189534529966, "grad_norm": 1.8315165042877197, "learning_rate": 4.999641708781665e-05, "loss": 0.5694, "step": 297 }, { "epoch": 0.18132035290538484, "grad_norm": 1.9754319190979004, "learning_rate": 4.9996203234284755e-05, "loss": 0.5658, "step": 298 }, { "epoch": 0.18192881046547002, "grad_norm": 1.5500773191452026, "learning_rate": 4.999598318273636e-05, "loss": 0.5115, "step": 299 }, { "epoch": 0.18253726802555523, "grad_norm": 2.730600595474243, "learning_rate": 4.999575693322601e-05, "loss": 0.5655, "step": 300 }, { "epoch": 0.1831457255856404, "grad_norm": 2.1181812286376953, "learning_rate": 4.9995524485809816e-05, "loss": 0.5391, "step": 301 }, { "epoch": 0.18375418314572559, "grad_norm": 1.870579481124878, "learning_rate": 4.9995285840545425e-05, "loss": 0.5846, "step": 302 }, { "epoch": 0.18436264070581077, "grad_norm": 1.8520623445510864, "learning_rate": 4.9995040997491993e-05, "loss": 0.5664, "step": 303 }, { "epoch": 0.18497109826589594, "grad_norm": 1.7064025402069092, "learning_rate": 4.999478995671024e-05, "loss": 0.5567, "step": 304 }, { "epoch": 0.18557955582598112, "grad_norm": 2.016923427581787, "learning_rate": 4.99945327182624e-05, "loss": 0.5576, "step": 305 }, { "epoch": 0.18618801338606633, "grad_norm": 1.6377376317977905, "learning_rate": 4.999426928221229e-05, "loss": 0.5462, "step": 306 }, { "epoch": 0.1867964709461515, "grad_norm": 1.6940040588378906, "learning_rate": 4.9993999648625197e-05, "loss": 0.5148, "step": 307 }, { "epoch": 0.1874049285062367, "grad_norm": 2.552048921585083, "learning_rate": 4.9993723817567996e-05, "loss": 0.6583, "step": 308 }, { "epoch": 0.18801338606632187, "grad_norm": 1.4757635593414307, "learning_rate": 4.9993441789109074e-05, "loss": 0.5491, "step": 309 }, { "epoch": 0.18862184362640705, "grad_norm": 1.6392966508865356, "learning_rate": 4.999315356331837e-05, "loss": 0.5338, "step": 310 }, { "epoch": 0.18923030118649226, "grad_norm": 1.7045416831970215, "learning_rate": 4.999285914026734e-05, "loss": 0.627, "step": 311 }, { "epoch": 0.18983875874657744, "grad_norm": 1.4857970476150513, "learning_rate": 4.999255852002901e-05, "loss": 0.6172, "step": 312 }, { "epoch": 0.19044721630666261, "grad_norm": 3.2289485931396484, "learning_rate": 4.9992251702677904e-05, "loss": 0.5705, "step": 313 }, { "epoch": 0.1910556738667478, "grad_norm": 1.866363763809204, "learning_rate": 4.9991938688290105e-05, "loss": 0.5102, "step": 314 }, { "epoch": 0.19166413142683297, "grad_norm": 2.0050652027130127, "learning_rate": 4.999161947694322e-05, "loss": 0.5714, "step": 315 }, { "epoch": 0.19227258898691815, "grad_norm": 1.7371540069580078, "learning_rate": 4.9991294068716416e-05, "loss": 0.5933, "step": 316 }, { "epoch": 0.19288104654700336, "grad_norm": 1.7616161108016968, "learning_rate": 4.9990962463690364e-05, "loss": 0.5356, "step": 317 }, { "epoch": 0.19348950410708854, "grad_norm": 1.7778440713882446, "learning_rate": 4.99906246619473e-05, "loss": 0.5461, "step": 318 }, { "epoch": 0.19409796166717372, "grad_norm": 1.7068995237350464, "learning_rate": 4.999028066357098e-05, "loss": 0.509, "step": 319 }, { "epoch": 0.1947064192272589, "grad_norm": 1.5592914819717407, "learning_rate": 4.9989930468646703e-05, "loss": 0.5702, "step": 320 }, { "epoch": 0.19531487678734408, "grad_norm": 1.4004979133605957, "learning_rate": 4.99895740772613e-05, "loss": 0.5483, "step": 321 }, { "epoch": 0.19592333434742926, "grad_norm": 1.4833515882492065, "learning_rate": 4.998921148950314e-05, "loss": 0.5739, "step": 322 }, { "epoch": 0.19653179190751446, "grad_norm": 1.715824007987976, "learning_rate": 4.998884270546214e-05, "loss": 0.5488, "step": 323 }, { "epoch": 0.19714024946759964, "grad_norm": 1.557228684425354, "learning_rate": 4.998846772522972e-05, "loss": 0.5267, "step": 324 }, { "epoch": 0.19774870702768482, "grad_norm": 1.7454534769058228, "learning_rate": 4.998808654889888e-05, "loss": 0.533, "step": 325 }, { "epoch": 0.19835716458777, "grad_norm": 1.6410638093948364, "learning_rate": 4.998769917656414e-05, "loss": 0.5841, "step": 326 }, { "epoch": 0.19896562214785518, "grad_norm": 1.7179322242736816, "learning_rate": 4.998730560832154e-05, "loss": 0.6051, "step": 327 }, { "epoch": 0.19957407970794036, "grad_norm": 1.5742508172988892, "learning_rate": 4.9986905844268667e-05, "loss": 0.501, "step": 328 }, { "epoch": 0.20018253726802557, "grad_norm": 1.944952130317688, "learning_rate": 4.998649988450465e-05, "loss": 0.543, "step": 329 }, { "epoch": 0.20079099482811075, "grad_norm": 2.1101162433624268, "learning_rate": 4.998608772913015e-05, "loss": 0.7782, "step": 330 }, { "epoch": 0.20139945238819593, "grad_norm": 1.778678297996521, "learning_rate": 4.998566937824737e-05, "loss": 0.5768, "step": 331 }, { "epoch": 0.2020079099482811, "grad_norm": 1.5352647304534912, "learning_rate": 4.9985244831960034e-05, "loss": 0.5784, "step": 332 }, { "epoch": 0.20261636750836629, "grad_norm": 1.6977012157440186, "learning_rate": 4.998481409037342e-05, "loss": 0.537, "step": 333 }, { "epoch": 0.20322482506845146, "grad_norm": 1.7605286836624146, "learning_rate": 4.9984377153594327e-05, "loss": 0.5381, "step": 334 }, { "epoch": 0.20383328262853667, "grad_norm": 2.0058248043060303, "learning_rate": 4.99839340217311e-05, "loss": 0.6097, "step": 335 }, { "epoch": 0.20444174018862185, "grad_norm": 2.2246720790863037, "learning_rate": 4.9983484694893615e-05, "loss": 0.5455, "step": 336 }, { "epoch": 0.20505019774870703, "grad_norm": 1.8565617799758911, "learning_rate": 4.99830291731933e-05, "loss": 0.6721, "step": 337 }, { "epoch": 0.2056586553087922, "grad_norm": 1.691021203994751, "learning_rate": 4.998256745674308e-05, "loss": 0.58, "step": 338 }, { "epoch": 0.2062671128688774, "grad_norm": 2.6199169158935547, "learning_rate": 4.998209954565746e-05, "loss": 0.6639, "step": 339 }, { "epoch": 0.20687557042896257, "grad_norm": 1.632177710533142, "learning_rate": 4.998162544005246e-05, "loss": 0.5949, "step": 340 }, { "epoch": 0.20748402798904778, "grad_norm": 2.9773964881896973, "learning_rate": 4.9981145140045634e-05, "loss": 0.547, "step": 341 }, { "epoch": 0.20809248554913296, "grad_norm": 2.0288548469543457, "learning_rate": 4.998065864575608e-05, "loss": 0.619, "step": 342 }, { "epoch": 0.20870094310921813, "grad_norm": 1.6673290729522705, "learning_rate": 4.998016595730442e-05, "loss": 0.592, "step": 343 }, { "epoch": 0.2093094006693033, "grad_norm": 1.5016260147094727, "learning_rate": 4.997966707481284e-05, "loss": 0.6014, "step": 344 }, { "epoch": 0.2099178582293885, "grad_norm": 2.3136346340179443, "learning_rate": 4.9979161998405024e-05, "loss": 0.5583, "step": 345 }, { "epoch": 0.21052631578947367, "grad_norm": 1.6642321348190308, "learning_rate": 4.997865072820621e-05, "loss": 0.5468, "step": 346 }, { "epoch": 0.21113477334955888, "grad_norm": 1.8811125755310059, "learning_rate": 4.9978133264343186e-05, "loss": 0.6072, "step": 347 }, { "epoch": 0.21174323090964406, "grad_norm": 1.5865939855575562, "learning_rate": 4.997760960694424e-05, "loss": 0.509, "step": 348 }, { "epoch": 0.21235168846972924, "grad_norm": 1.6846966743469238, "learning_rate": 4.9977079756139247e-05, "loss": 0.6028, "step": 349 }, { "epoch": 0.21296014602981442, "grad_norm": 2.0997796058654785, "learning_rate": 4.997654371205955e-05, "loss": 0.5462, "step": 350 }, { "epoch": 0.2135686035898996, "grad_norm": 2.4784786701202393, "learning_rate": 4.9976001474838105e-05, "loss": 0.5719, "step": 351 }, { "epoch": 0.21417706114998478, "grad_norm": 2.554870367050171, "learning_rate": 4.997545304460933e-05, "loss": 0.5194, "step": 352 }, { "epoch": 0.21478551871006998, "grad_norm": 1.4950354099273682, "learning_rate": 4.997489842150924e-05, "loss": 0.5605, "step": 353 }, { "epoch": 0.21539397627015516, "grad_norm": 1.4311949014663696, "learning_rate": 4.9974337605675335e-05, "loss": 0.5508, "step": 354 }, { "epoch": 0.21600243383024034, "grad_norm": 1.7525572776794434, "learning_rate": 4.9973770597246696e-05, "loss": 0.5666, "step": 355 }, { "epoch": 0.21661089139032552, "grad_norm": 2.4212021827697754, "learning_rate": 4.99731973963639e-05, "loss": 0.5436, "step": 356 }, { "epoch": 0.2172193489504107, "grad_norm": 6.587791919708252, "learning_rate": 4.997261800316909e-05, "loss": 0.6509, "step": 357 }, { "epoch": 0.21782780651049588, "grad_norm": 1.5659176111221313, "learning_rate": 4.997203241780592e-05, "loss": 0.5326, "step": 358 }, { "epoch": 0.2184362640705811, "grad_norm": 1.5538829565048218, "learning_rate": 4.99714406404196e-05, "loss": 0.6136, "step": 359 }, { "epoch": 0.21904472163066627, "grad_norm": 1.7956405878067017, "learning_rate": 4.997084267115686e-05, "loss": 0.604, "step": 360 }, { "epoch": 0.21965317919075145, "grad_norm": 1.8146167993545532, "learning_rate": 4.997023851016598e-05, "loss": 0.5673, "step": 361 }, { "epoch": 0.22026163675083663, "grad_norm": 1.5407328605651855, "learning_rate": 4.996962815759675e-05, "loss": 0.5848, "step": 362 }, { "epoch": 0.2208700943109218, "grad_norm": 1.560318946838379, "learning_rate": 4.9969011613600525e-05, "loss": 0.4974, "step": 363 }, { "epoch": 0.22147855187100698, "grad_norm": 1.5309618711471558, "learning_rate": 4.996838887833018e-05, "loss": 0.6155, "step": 364 }, { "epoch": 0.2220870094310922, "grad_norm": 1.6208105087280273, "learning_rate": 4.9967759951940127e-05, "loss": 0.5615, "step": 365 }, { "epoch": 0.22269546699117737, "grad_norm": 1.5246007442474365, "learning_rate": 4.996712483458632e-05, "loss": 0.5311, "step": 366 }, { "epoch": 0.22330392455126255, "grad_norm": 1.9340999126434326, "learning_rate": 4.9966483526426223e-05, "loss": 0.5678, "step": 367 }, { "epoch": 0.22391238211134773, "grad_norm": 1.4264390468597412, "learning_rate": 4.996583602761887e-05, "loss": 0.5468, "step": 368 }, { "epoch": 0.2245208396714329, "grad_norm": 1.5261754989624023, "learning_rate": 4.996518233832481e-05, "loss": 0.5593, "step": 369 }, { "epoch": 0.2251292972315181, "grad_norm": 1.5662906169891357, "learning_rate": 4.996452245870614e-05, "loss": 0.5155, "step": 370 }, { "epoch": 0.2257377547916033, "grad_norm": 1.6063148975372314, "learning_rate": 4.9963856388926464e-05, "loss": 0.5384, "step": 371 }, { "epoch": 0.22634621235168847, "grad_norm": 1.5906771421432495, "learning_rate": 4.996318412915095e-05, "loss": 0.5049, "step": 372 }, { "epoch": 0.22695466991177365, "grad_norm": 1.676358938217163, "learning_rate": 4.9962505679546285e-05, "loss": 0.5344, "step": 373 }, { "epoch": 0.22756312747185883, "grad_norm": 1.6183301210403442, "learning_rate": 4.9961821040280697e-05, "loss": 0.5611, "step": 374 }, { "epoch": 0.228171585031944, "grad_norm": 1.670375108718872, "learning_rate": 4.996113021152397e-05, "loss": 0.5533, "step": 375 }, { "epoch": 0.2287800425920292, "grad_norm": 1.6146153211593628, "learning_rate": 4.996043319344736e-05, "loss": 0.5368, "step": 376 }, { "epoch": 0.2293885001521144, "grad_norm": 1.5858700275421143, "learning_rate": 4.9959729986223725e-05, "loss": 0.5109, "step": 377 }, { "epoch": 0.22999695771219958, "grad_norm": 1.696811318397522, "learning_rate": 4.995902059002743e-05, "loss": 0.5879, "step": 378 }, { "epoch": 0.23060541527228476, "grad_norm": 1.5121724605560303, "learning_rate": 4.995830500503438e-05, "loss": 0.518, "step": 379 }, { "epoch": 0.23121387283236994, "grad_norm": 1.4365136623382568, "learning_rate": 4.995758323142199e-05, "loss": 0.5384, "step": 380 }, { "epoch": 0.23182233039245512, "grad_norm": 1.4836268424987793, "learning_rate": 4.995685526936924e-05, "loss": 0.5867, "step": 381 }, { "epoch": 0.2324307879525403, "grad_norm": 1.8685383796691895, "learning_rate": 4.9956121119056646e-05, "loss": 0.628, "step": 382 }, { "epoch": 0.2330392455126255, "grad_norm": 1.426859974861145, "learning_rate": 4.9955380780666233e-05, "loss": 0.4974, "step": 383 }, { "epoch": 0.23364770307271068, "grad_norm": 2.282662868499756, "learning_rate": 4.9954634254381576e-05, "loss": 0.5333, "step": 384 }, { "epoch": 0.23425616063279586, "grad_norm": 1.881532073020935, "learning_rate": 4.995388154038779e-05, "loss": 0.6067, "step": 385 }, { "epoch": 0.23486461819288104, "grad_norm": 1.4905718564987183, "learning_rate": 4.9953122638871505e-05, "loss": 0.5767, "step": 386 }, { "epoch": 0.23547307575296622, "grad_norm": 1.5915948152542114, "learning_rate": 4.99523575500209e-05, "loss": 0.5713, "step": 387 }, { "epoch": 0.23608153331305143, "grad_norm": 1.8757609128952026, "learning_rate": 4.9951586274025695e-05, "loss": 0.5355, "step": 388 }, { "epoch": 0.2366899908731366, "grad_norm": 1.7897573709487915, "learning_rate": 4.9950808811077135e-05, "loss": 0.5522, "step": 389 }, { "epoch": 0.2372984484332218, "grad_norm": 1.5204428434371948, "learning_rate": 4.995002516136797e-05, "loss": 0.5935, "step": 390 }, { "epoch": 0.23790690599330697, "grad_norm": 1.376928448677063, "learning_rate": 4.994923532509255e-05, "loss": 0.5006, "step": 391 }, { "epoch": 0.23851536355339215, "grad_norm": 1.6038637161254883, "learning_rate": 4.99484393024467e-05, "loss": 0.4999, "step": 392 }, { "epoch": 0.23912382111347733, "grad_norm": 1.59404456615448, "learning_rate": 4.99476370936278e-05, "loss": 0.5816, "step": 393 }, { "epoch": 0.23973227867356253, "grad_norm": 1.748207688331604, "learning_rate": 4.994682869883478e-05, "loss": 0.5596, "step": 394 }, { "epoch": 0.2403407362336477, "grad_norm": 1.8243966102600098, "learning_rate": 4.994601411826807e-05, "loss": 0.6351, "step": 395 }, { "epoch": 0.2409491937937329, "grad_norm": 1.70762300491333, "learning_rate": 4.994519335212966e-05, "loss": 0.644, "step": 396 }, { "epoch": 0.24155765135381807, "grad_norm": 1.5090755224227905, "learning_rate": 4.9944366400623066e-05, "loss": 0.5991, "step": 397 }, { "epoch": 0.24216610891390325, "grad_norm": 1.3629839420318604, "learning_rate": 4.994353326395334e-05, "loss": 0.4862, "step": 398 }, { "epoch": 0.24277456647398843, "grad_norm": 1.7295219898223877, "learning_rate": 4.9942693942327054e-05, "loss": 0.5848, "step": 399 }, { "epoch": 0.24338302403407364, "grad_norm": 1.5481480360031128, "learning_rate": 4.994184843595234e-05, "loss": 0.4668, "step": 400 }, { "epoch": 0.24399148159415882, "grad_norm": 1.3394376039505005, "learning_rate": 4.994099674503885e-05, "loss": 0.5137, "step": 401 }, { "epoch": 0.244599939154244, "grad_norm": 1.7005079984664917, "learning_rate": 4.994013886979775e-05, "loss": 0.6468, "step": 402 }, { "epoch": 0.24520839671432917, "grad_norm": 1.682056188583374, "learning_rate": 4.993927481044176e-05, "loss": 0.5919, "step": 403 }, { "epoch": 0.24581685427441435, "grad_norm": 1.6984502077102661, "learning_rate": 4.993840456718515e-05, "loss": 0.5864, "step": 404 }, { "epoch": 0.24642531183449953, "grad_norm": 1.8722606897354126, "learning_rate": 4.993752814024368e-05, "loss": 0.4948, "step": 405 }, { "epoch": 0.24703376939458474, "grad_norm": 1.4154351949691772, "learning_rate": 4.993664552983469e-05, "loss": 0.5495, "step": 406 }, { "epoch": 0.24764222695466992, "grad_norm": 1.7346272468566895, "learning_rate": 4.9935756736177006e-05, "loss": 0.6606, "step": 407 }, { "epoch": 0.2482506845147551, "grad_norm": 1.3692924976348877, "learning_rate": 4.993486175949104e-05, "loss": 0.4961, "step": 408 }, { "epoch": 0.24885914207484028, "grad_norm": 1.9384909868240356, "learning_rate": 4.993396059999868e-05, "loss": 0.6164, "step": 409 }, { "epoch": 0.24946759963492546, "grad_norm": 1.8157694339752197, "learning_rate": 4.99330532579234e-05, "loss": 0.5164, "step": 410 }, { "epoch": 0.25007605719501064, "grad_norm": 1.4756087064743042, "learning_rate": 4.993213973349017e-05, "loss": 0.5389, "step": 411 }, { "epoch": 0.25068451475509584, "grad_norm": 1.3147797584533691, "learning_rate": 4.9931220026925506e-05, "loss": 0.4488, "step": 412 }, { "epoch": 0.251292972315181, "grad_norm": 1.9356815814971924, "learning_rate": 4.993029413845746e-05, "loss": 0.5484, "step": 413 }, { "epoch": 0.2519014298752662, "grad_norm": 1.7992812395095825, "learning_rate": 4.992936206831561e-05, "loss": 0.5857, "step": 414 }, { "epoch": 0.2525098874353514, "grad_norm": 1.4807167053222656, "learning_rate": 4.9928423816731086e-05, "loss": 0.5957, "step": 415 }, { "epoch": 0.25311834499543656, "grad_norm": 1.877353549003601, "learning_rate": 4.9927479383936516e-05, "loss": 0.5062, "step": 416 }, { "epoch": 0.25372680255552177, "grad_norm": 1.6704474687576294, "learning_rate": 4.992652877016608e-05, "loss": 0.6027, "step": 417 }, { "epoch": 0.2543352601156069, "grad_norm": 1.7480648756027222, "learning_rate": 4.992557197565551e-05, "loss": 0.5166, "step": 418 }, { "epoch": 0.2549437176756921, "grad_norm": 1.8488813638687134, "learning_rate": 4.992460900064203e-05, "loss": 0.627, "step": 419 }, { "epoch": 0.2555521752357773, "grad_norm": 1.7881669998168945, "learning_rate": 4.992363984536443e-05, "loss": 0.5289, "step": 420 }, { "epoch": 0.2561606327958625, "grad_norm": 1.5146019458770752, "learning_rate": 4.9922664510063024e-05, "loss": 0.5334, "step": 421 }, { "epoch": 0.2567690903559477, "grad_norm": 3.4007725715637207, "learning_rate": 4.992168299497964e-05, "loss": 0.6357, "step": 422 }, { "epoch": 0.25737754791603284, "grad_norm": 1.778419017791748, "learning_rate": 4.9920695300357664e-05, "loss": 0.5248, "step": 423 }, { "epoch": 0.25798600547611805, "grad_norm": 1.3506314754486084, "learning_rate": 4.9919701426442e-05, "loss": 0.4993, "step": 424 }, { "epoch": 0.2585944630362032, "grad_norm": 1.7571451663970947, "learning_rate": 4.991870137347908e-05, "loss": 0.591, "step": 425 }, { "epoch": 0.2592029205962884, "grad_norm": 1.8246015310287476, "learning_rate": 4.9917695141716884e-05, "loss": 0.5652, "step": 426 }, { "epoch": 0.2598113781563736, "grad_norm": 1.585374355316162, "learning_rate": 4.991668273140492e-05, "loss": 0.494, "step": 427 }, { "epoch": 0.26041983571645877, "grad_norm": 1.5021830797195435, "learning_rate": 4.991566414279421e-05, "loss": 0.5471, "step": 428 }, { "epoch": 0.261028293276544, "grad_norm": 1.493720293045044, "learning_rate": 4.991463937613733e-05, "loss": 0.5409, "step": 429 }, { "epoch": 0.26163675083662913, "grad_norm": 1.550927758216858, "learning_rate": 4.991360843168838e-05, "loss": 0.5049, "step": 430 }, { "epoch": 0.26224520839671434, "grad_norm": 1.793480634689331, "learning_rate": 4.991257130970299e-05, "loss": 0.5746, "step": 431 }, { "epoch": 0.2628536659567995, "grad_norm": 1.3120859861373901, "learning_rate": 4.991152801043832e-05, "loss": 0.48, "step": 432 }, { "epoch": 0.2634621235168847, "grad_norm": 1.489817500114441, "learning_rate": 4.991047853415307e-05, "loss": 0.5376, "step": 433 }, { "epoch": 0.2640705810769699, "grad_norm": 1.668395757675171, "learning_rate": 4.990942288110746e-05, "loss": 0.5131, "step": 434 }, { "epoch": 0.26467903863705505, "grad_norm": 1.8639039993286133, "learning_rate": 4.9908361051563244e-05, "loss": 0.5146, "step": 435 }, { "epoch": 0.26528749619714026, "grad_norm": 1.944084644317627, "learning_rate": 4.990729304578373e-05, "loss": 0.5916, "step": 436 }, { "epoch": 0.2658959537572254, "grad_norm": 1.5414292812347412, "learning_rate": 4.990621886403373e-05, "loss": 0.5606, "step": 437 }, { "epoch": 0.2665044113173106, "grad_norm": 1.4970581531524658, "learning_rate": 4.990513850657958e-05, "loss": 0.5137, "step": 438 }, { "epoch": 0.2671128688773958, "grad_norm": 1.5545778274536133, "learning_rate": 4.990405197368919e-05, "loss": 0.5935, "step": 439 }, { "epoch": 0.267721326437481, "grad_norm": 1.836313009262085, "learning_rate": 4.9902959265631966e-05, "loss": 0.4994, "step": 440 }, { "epoch": 0.2683297839975662, "grad_norm": 2.459785223007202, "learning_rate": 4.990186038267884e-05, "loss": 0.5529, "step": 441 }, { "epoch": 0.26893824155765134, "grad_norm": 1.6246469020843506, "learning_rate": 4.990075532510231e-05, "loss": 0.5785, "step": 442 }, { "epoch": 0.26954669911773654, "grad_norm": 1.563841700553894, "learning_rate": 4.989964409317637e-05, "loss": 0.5102, "step": 443 }, { "epoch": 0.2701551566778217, "grad_norm": 1.729370355606079, "learning_rate": 4.9898526687176554e-05, "loss": 0.6466, "step": 444 }, { "epoch": 0.2707636142379069, "grad_norm": 1.5674041509628296, "learning_rate": 4.989740310737995e-05, "loss": 0.5578, "step": 445 }, { "epoch": 0.2713720717979921, "grad_norm": 2.1116135120391846, "learning_rate": 4.989627335406515e-05, "loss": 0.6759, "step": 446 }, { "epoch": 0.27198052935807726, "grad_norm": 1.3847877979278564, "learning_rate": 4.9895137427512284e-05, "loss": 0.4906, "step": 447 }, { "epoch": 0.27258898691816247, "grad_norm": 2.1487720012664795, "learning_rate": 4.989399532800302e-05, "loss": 0.4625, "step": 448 }, { "epoch": 0.2731974444782476, "grad_norm": 2.71392560005188, "learning_rate": 4.989284705582055e-05, "loss": 0.5155, "step": 449 }, { "epoch": 0.2738059020383328, "grad_norm": 1.6772319078445435, "learning_rate": 4.989169261124958e-05, "loss": 0.5715, "step": 450 }, { "epoch": 0.27441435959841803, "grad_norm": 1.6435317993164062, "learning_rate": 4.9890531994576394e-05, "loss": 0.5688, "step": 451 }, { "epoch": 0.2750228171585032, "grad_norm": 1.4484583139419556, "learning_rate": 4.9889365206088755e-05, "loss": 0.5269, "step": 452 }, { "epoch": 0.2756312747185884, "grad_norm": 1.4935004711151123, "learning_rate": 4.9888192246075986e-05, "loss": 0.5619, "step": 453 }, { "epoch": 0.27623973227867354, "grad_norm": 2.1516928672790527, "learning_rate": 4.988701311482893e-05, "loss": 0.6126, "step": 454 }, { "epoch": 0.27684818983875875, "grad_norm": 2.0537219047546387, "learning_rate": 4.988582781263997e-05, "loss": 0.5805, "step": 455 }, { "epoch": 0.2774566473988439, "grad_norm": 1.384886622428894, "learning_rate": 4.9884636339803e-05, "loss": 0.5707, "step": 456 }, { "epoch": 0.2780651049589291, "grad_norm": 1.4588273763656616, "learning_rate": 4.988343869661346e-05, "loss": 0.5806, "step": 457 }, { "epoch": 0.2786735625190143, "grad_norm": 1.3114267587661743, "learning_rate": 4.988223488336832e-05, "loss": 0.5397, "step": 458 }, { "epoch": 0.27928202007909947, "grad_norm": 1.5628232955932617, "learning_rate": 4.988102490036606e-05, "loss": 0.5486, "step": 459 }, { "epoch": 0.2798904776391847, "grad_norm": 1.5347833633422852, "learning_rate": 4.987980874790673e-05, "loss": 0.492, "step": 460 }, { "epoch": 0.2804989351992698, "grad_norm": 1.5386556386947632, "learning_rate": 4.9878586426291864e-05, "loss": 0.5133, "step": 461 }, { "epoch": 0.28110739275935503, "grad_norm": 1.603733777999878, "learning_rate": 4.987735793582456e-05, "loss": 0.5197, "step": 462 }, { "epoch": 0.28171585031944024, "grad_norm": 1.7382800579071045, "learning_rate": 4.987612327680943e-05, "loss": 0.6745, "step": 463 }, { "epoch": 0.2823243078795254, "grad_norm": 1.6408590078353882, "learning_rate": 4.987488244955261e-05, "loss": 0.5331, "step": 464 }, { "epoch": 0.2829327654396106, "grad_norm": 1.5231281518936157, "learning_rate": 4.987363545436178e-05, "loss": 0.5993, "step": 465 }, { "epoch": 0.28354122299969575, "grad_norm": 1.5222764015197754, "learning_rate": 4.9872382291546136e-05, "loss": 0.4993, "step": 466 }, { "epoch": 0.28414968055978096, "grad_norm": 1.3687183856964111, "learning_rate": 4.9871122961416417e-05, "loss": 0.5918, "step": 467 }, { "epoch": 0.28475813811986617, "grad_norm": 1.5194501876831055, "learning_rate": 4.9869857464284885e-05, "loss": 0.5499, "step": 468 }, { "epoch": 0.2853665956799513, "grad_norm": 1.4260061979293823, "learning_rate": 4.986858580046534e-05, "loss": 0.5771, "step": 469 }, { "epoch": 0.2859750532400365, "grad_norm": 1.58305823802948, "learning_rate": 4.986730797027307e-05, "loss": 0.5616, "step": 470 }, { "epoch": 0.2865835108001217, "grad_norm": 1.5807416439056396, "learning_rate": 4.9866023974024954e-05, "loss": 0.513, "step": 471 }, { "epoch": 0.2871919683602069, "grad_norm": 1.3939672708511353, "learning_rate": 4.986473381203937e-05, "loss": 0.587, "step": 472 }, { "epoch": 0.28780042592029204, "grad_norm": 1.4460794925689697, "learning_rate": 4.98634374846362e-05, "loss": 0.5267, "step": 473 }, { "epoch": 0.28840888348037724, "grad_norm": 1.4511432647705078, "learning_rate": 4.986213499213689e-05, "loss": 0.497, "step": 474 }, { "epoch": 0.28901734104046245, "grad_norm": 1.877480149269104, "learning_rate": 4.986082633486442e-05, "loss": 0.5801, "step": 475 }, { "epoch": 0.2896257986005476, "grad_norm": 1.3986396789550781, "learning_rate": 4.985951151314326e-05, "loss": 0.5483, "step": 476 }, { "epoch": 0.2902342561606328, "grad_norm": 2.344064474105835, "learning_rate": 4.985819052729944e-05, "loss": 0.4837, "step": 477 }, { "epoch": 0.29084271372071796, "grad_norm": 1.5389190912246704, "learning_rate": 4.9856863377660515e-05, "loss": 0.5699, "step": 478 }, { "epoch": 0.29145117128080317, "grad_norm": 1.4740633964538574, "learning_rate": 4.985553006455556e-05, "loss": 0.512, "step": 479 }, { "epoch": 0.2920596288408884, "grad_norm": 3.1011834144592285, "learning_rate": 4.985419058831517e-05, "loss": 0.5975, "step": 480 }, { "epoch": 0.2926680864009735, "grad_norm": 1.6393465995788574, "learning_rate": 4.9852844949271496e-05, "loss": 0.5588, "step": 481 }, { "epoch": 0.29327654396105873, "grad_norm": 1.4237196445465088, "learning_rate": 4.985149314775818e-05, "loss": 0.5831, "step": 482 }, { "epoch": 0.2938850015211439, "grad_norm": 1.3287136554718018, "learning_rate": 4.985013518411044e-05, "loss": 0.4769, "step": 483 }, { "epoch": 0.2944934590812291, "grad_norm": 1.4517908096313477, "learning_rate": 4.984877105866497e-05, "loss": 0.5007, "step": 484 }, { "epoch": 0.29510191664131424, "grad_norm": 1.5900462865829468, "learning_rate": 4.984740077176002e-05, "loss": 0.6005, "step": 485 }, { "epoch": 0.29571037420139945, "grad_norm": 1.5251234769821167, "learning_rate": 4.984602432373537e-05, "loss": 0.5368, "step": 486 }, { "epoch": 0.29631883176148466, "grad_norm": 1.551039457321167, "learning_rate": 4.984464171493233e-05, "loss": 0.6079, "step": 487 }, { "epoch": 0.2969272893215698, "grad_norm": 2.1228880882263184, "learning_rate": 4.984325294569372e-05, "loss": 0.6416, "step": 488 }, { "epoch": 0.297535746881655, "grad_norm": 1.4991825819015503, "learning_rate": 4.98418580163639e-05, "loss": 0.4824, "step": 489 }, { "epoch": 0.29814420444174017, "grad_norm": 1.746289610862732, "learning_rate": 4.9840456927288734e-05, "loss": 0.5908, "step": 490 }, { "epoch": 0.2987526620018254, "grad_norm": 1.5192077159881592, "learning_rate": 4.983904967881567e-05, "loss": 0.6048, "step": 491 }, { "epoch": 0.2993611195619106, "grad_norm": 1.415953278541565, "learning_rate": 4.983763627129362e-05, "loss": 0.5042, "step": 492 }, { "epoch": 0.29996957712199573, "grad_norm": 1.255936622619629, "learning_rate": 4.983621670507306e-05, "loss": 0.5223, "step": 493 }, { "epoch": 0.30057803468208094, "grad_norm": 1.3377435207366943, "learning_rate": 4.9834790980505985e-05, "loss": 0.553, "step": 494 }, { "epoch": 0.3011864922421661, "grad_norm": 1.7899580001831055, "learning_rate": 4.983335909794591e-05, "loss": 0.4885, "step": 495 }, { "epoch": 0.3017949498022513, "grad_norm": 1.4705740213394165, "learning_rate": 4.983192105774788e-05, "loss": 0.5556, "step": 496 }, { "epoch": 0.30240340736233645, "grad_norm": 1.9178767204284668, "learning_rate": 4.983047686026847e-05, "loss": 0.4836, "step": 497 }, { "epoch": 0.30301186492242166, "grad_norm": 1.54046630859375, "learning_rate": 4.9829026505865794e-05, "loss": 0.5707, "step": 498 }, { "epoch": 0.30362032248250687, "grad_norm": 1.558304786682129, "learning_rate": 4.982756999489947e-05, "loss": 0.5615, "step": 499 }, { "epoch": 0.304228780042592, "grad_norm": 1.782384991645813, "learning_rate": 4.982610732773064e-05, "loss": 0.4886, "step": 500 }, { "epoch": 0.3048372376026772, "grad_norm": 1.3223114013671875, "learning_rate": 4.9824638504722005e-05, "loss": 0.517, "step": 501 }, { "epoch": 0.3054456951627624, "grad_norm": 1.3240535259246826, "learning_rate": 4.982316352623776e-05, "loss": 0.5153, "step": 502 }, { "epoch": 0.3060541527228476, "grad_norm": 1.610282301902771, "learning_rate": 4.982168239264364e-05, "loss": 0.5231, "step": 503 }, { "epoch": 0.3066626102829328, "grad_norm": 1.6229352951049805, "learning_rate": 4.982019510430691e-05, "loss": 0.613, "step": 504 }, { "epoch": 0.30727106784301794, "grad_norm": 1.3226631879806519, "learning_rate": 4.981870166159635e-05, "loss": 0.5142, "step": 505 }, { "epoch": 0.30787952540310315, "grad_norm": 1.495801568031311, "learning_rate": 4.981720206488226e-05, "loss": 0.5711, "step": 506 }, { "epoch": 0.3084879829631883, "grad_norm": 1.7586286067962646, "learning_rate": 4.9815696314536504e-05, "loss": 0.5152, "step": 507 }, { "epoch": 0.3090964405232735, "grad_norm": 1.5324755907058716, "learning_rate": 4.981418441093243e-05, "loss": 0.5501, "step": 508 }, { "epoch": 0.30970489808335866, "grad_norm": 1.775768756866455, "learning_rate": 4.981266635444492e-05, "loss": 0.6562, "step": 509 }, { "epoch": 0.31031335564344387, "grad_norm": 1.3444887399673462, "learning_rate": 4.98111421454504e-05, "loss": 0.5188, "step": 510 }, { "epoch": 0.3109218132035291, "grad_norm": 1.3959381580352783, "learning_rate": 4.9809611784326815e-05, "loss": 0.5093, "step": 511 }, { "epoch": 0.3115302707636142, "grad_norm": 3.0742530822753906, "learning_rate": 4.9808075271453616e-05, "loss": 0.5801, "step": 512 }, { "epoch": 0.31213872832369943, "grad_norm": 1.4250247478485107, "learning_rate": 4.9806532607211797e-05, "loss": 0.5579, "step": 513 }, { "epoch": 0.3127471858837846, "grad_norm": 2.064180374145508, "learning_rate": 4.980498379198389e-05, "loss": 0.5433, "step": 514 }, { "epoch": 0.3133556434438698, "grad_norm": 1.5153679847717285, "learning_rate": 4.980342882615392e-05, "loss": 0.5676, "step": 515 }, { "epoch": 0.313964101003955, "grad_norm": 1.5368945598602295, "learning_rate": 4.9801867710107454e-05, "loss": 0.5181, "step": 516 }, { "epoch": 0.31457255856404015, "grad_norm": 1.2389271259307861, "learning_rate": 4.98003004442316e-05, "loss": 0.4591, "step": 517 }, { "epoch": 0.31518101612412536, "grad_norm": 1.1532279253005981, "learning_rate": 4.979872702891495e-05, "loss": 0.4422, "step": 518 }, { "epoch": 0.3157894736842105, "grad_norm": 1.6176403760910034, "learning_rate": 4.9797147464547664e-05, "loss": 0.6151, "step": 519 }, { "epoch": 0.3163979312442957, "grad_norm": 1.5165481567382812, "learning_rate": 4.9795561751521405e-05, "loss": 0.5284, "step": 520 }, { "epoch": 0.31700638880438087, "grad_norm": 1.6209813356399536, "learning_rate": 4.9793969890229364e-05, "loss": 0.5867, "step": 521 }, { "epoch": 0.3176148463644661, "grad_norm": 1.3948253393173218, "learning_rate": 4.9792371881066245e-05, "loss": 0.5734, "step": 522 }, { "epoch": 0.3182233039245513, "grad_norm": 1.2350413799285889, "learning_rate": 4.9790767724428304e-05, "loss": 0.5079, "step": 523 }, { "epoch": 0.31883176148463643, "grad_norm": 1.3993268013000488, "learning_rate": 4.978915742071329e-05, "loss": 0.5713, "step": 524 }, { "epoch": 0.31944021904472164, "grad_norm": 1.3843022584915161, "learning_rate": 4.978754097032051e-05, "loss": 0.5258, "step": 525 }, { "epoch": 0.3200486766048068, "grad_norm": 1.2739397287368774, "learning_rate": 4.978591837365076e-05, "loss": 0.5044, "step": 526 }, { "epoch": 0.320657134164892, "grad_norm": 2.009359359741211, "learning_rate": 4.978428963110638e-05, "loss": 0.495, "step": 527 }, { "epoch": 0.3212655917249772, "grad_norm": 1.5859858989715576, "learning_rate": 4.978265474309123e-05, "loss": 0.5793, "step": 528 }, { "epoch": 0.32187404928506236, "grad_norm": 1.5144940614700317, "learning_rate": 4.9781013710010696e-05, "loss": 0.5305, "step": 529 }, { "epoch": 0.32248250684514757, "grad_norm": 1.5362976789474487, "learning_rate": 4.977936653227169e-05, "loss": 0.6301, "step": 530 }, { "epoch": 0.3230909644052327, "grad_norm": 1.516918420791626, "learning_rate": 4.9777713210282636e-05, "loss": 0.581, "step": 531 }, { "epoch": 0.3236994219653179, "grad_norm": 1.3938144445419312, "learning_rate": 4.977605374445349e-05, "loss": 0.6385, "step": 532 }, { "epoch": 0.3243078795254031, "grad_norm": 1.1043601036071777, "learning_rate": 4.977438813519574e-05, "loss": 0.4772, "step": 533 }, { "epoch": 0.3249163370854883, "grad_norm": 1.2884318828582764, "learning_rate": 4.977271638292237e-05, "loss": 0.6186, "step": 534 }, { "epoch": 0.3255247946455735, "grad_norm": 1.278027057647705, "learning_rate": 4.9771038488047915e-05, "loss": 0.5577, "step": 535 }, { "epoch": 0.32613325220565864, "grad_norm": 1.1427042484283447, "learning_rate": 4.976935445098843e-05, "loss": 0.4842, "step": 536 }, { "epoch": 0.32674170976574385, "grad_norm": 1.1449129581451416, "learning_rate": 4.9767664272161474e-05, "loss": 0.4771, "step": 537 }, { "epoch": 0.327350167325829, "grad_norm": 1.4872303009033203, "learning_rate": 4.976596795198615e-05, "loss": 0.5961, "step": 538 }, { "epoch": 0.3279586248859142, "grad_norm": 1.3000589609146118, "learning_rate": 4.976426549088307e-05, "loss": 0.5076, "step": 539 }, { "epoch": 0.3285670824459994, "grad_norm": 1.8529880046844482, "learning_rate": 4.976255688927436e-05, "loss": 0.5203, "step": 540 }, { "epoch": 0.32917554000608457, "grad_norm": 2.4158706665039062, "learning_rate": 4.976084214758371e-05, "loss": 0.5025, "step": 541 }, { "epoch": 0.3297839975661698, "grad_norm": 1.3759514093399048, "learning_rate": 4.9759121266236286e-05, "loss": 0.5908, "step": 542 }, { "epoch": 0.3303924551262549, "grad_norm": 1.3580955266952515, "learning_rate": 4.97573942456588e-05, "loss": 0.4575, "step": 543 }, { "epoch": 0.33100091268634013, "grad_norm": 3.9890565872192383, "learning_rate": 4.975566108627948e-05, "loss": 0.4681, "step": 544 }, { "epoch": 0.33160937024642534, "grad_norm": 1.6312041282653809, "learning_rate": 4.975392178852808e-05, "loss": 0.5273, "step": 545 }, { "epoch": 0.3322178278065105, "grad_norm": 2.074263572692871, "learning_rate": 4.9752176352835866e-05, "loss": 0.5376, "step": 546 }, { "epoch": 0.3328262853665957, "grad_norm": 2.1829655170440674, "learning_rate": 4.975042477963564e-05, "loss": 0.5093, "step": 547 }, { "epoch": 0.33343474292668085, "grad_norm": 1.5592143535614014, "learning_rate": 4.9748667069361715e-05, "loss": 0.5319, "step": 548 }, { "epoch": 0.33404320048676606, "grad_norm": 1.8820182085037231, "learning_rate": 4.974690322244994e-05, "loss": 0.5467, "step": 549 }, { "epoch": 0.3346516580468512, "grad_norm": 1.3456025123596191, "learning_rate": 4.974513323933766e-05, "loss": 0.5365, "step": 550 }, { "epoch": 0.3352601156069364, "grad_norm": 1.2476035356521606, "learning_rate": 4.974335712046376e-05, "loss": 0.4927, "step": 551 }, { "epoch": 0.3358685731670216, "grad_norm": 1.5756597518920898, "learning_rate": 4.974157486626866e-05, "loss": 0.5539, "step": 552 }, { "epoch": 0.3364770307271068, "grad_norm": 1.4776535034179688, "learning_rate": 4.973978647719426e-05, "loss": 0.5584, "step": 553 }, { "epoch": 0.337085488287192, "grad_norm": 2.738008975982666, "learning_rate": 4.9737991953684024e-05, "loss": 0.5617, "step": 554 }, { "epoch": 0.33769394584727713, "grad_norm": 1.5317890644073486, "learning_rate": 4.973619129618292e-05, "loss": 0.5562, "step": 555 }, { "epoch": 0.33830240340736234, "grad_norm": 1.49234938621521, "learning_rate": 4.973438450513743e-05, "loss": 0.5217, "step": 556 }, { "epoch": 0.33891086096744755, "grad_norm": 1.301456093788147, "learning_rate": 4.973257158099556e-05, "loss": 0.4828, "step": 557 }, { "epoch": 0.3395193185275327, "grad_norm": 2.2302095890045166, "learning_rate": 4.9730752524206835e-05, "loss": 0.5493, "step": 558 }, { "epoch": 0.3401277760876179, "grad_norm": 1.5827584266662598, "learning_rate": 4.972892733522232e-05, "loss": 0.5375, "step": 559 }, { "epoch": 0.34073623364770306, "grad_norm": 1.3773258924484253, "learning_rate": 4.972709601449458e-05, "loss": 0.5809, "step": 560 }, { "epoch": 0.34134469120778826, "grad_norm": 1.2351981401443481, "learning_rate": 4.972525856247769e-05, "loss": 0.4553, "step": 561 }, { "epoch": 0.3419531487678734, "grad_norm": 1.5786877870559692, "learning_rate": 4.972341497962729e-05, "loss": 0.5861, "step": 562 }, { "epoch": 0.3425616063279586, "grad_norm": 1.3937610387802124, "learning_rate": 4.97215652664005e-05, "loss": 0.5209, "step": 563 }, { "epoch": 0.34317006388804383, "grad_norm": 1.723417043685913, "learning_rate": 4.971970942325597e-05, "loss": 0.6222, "step": 564 }, { "epoch": 0.343778521448129, "grad_norm": 1.3207844495773315, "learning_rate": 4.971784745065386e-05, "loss": 0.547, "step": 565 }, { "epoch": 0.3443869790082142, "grad_norm": 1.4648035764694214, "learning_rate": 4.971597934905587e-05, "loss": 0.4984, "step": 566 }, { "epoch": 0.34499543656829934, "grad_norm": 1.2435007095336914, "learning_rate": 4.971410511892523e-05, "loss": 0.5244, "step": 567 }, { "epoch": 0.34560389412838455, "grad_norm": 1.309339165687561, "learning_rate": 4.971222476072665e-05, "loss": 0.5165, "step": 568 }, { "epoch": 0.34621235168846975, "grad_norm": 1.554057240486145, "learning_rate": 4.9710338274926384e-05, "loss": 0.6061, "step": 569 }, { "epoch": 0.3468208092485549, "grad_norm": 1.3458071947097778, "learning_rate": 4.97084456619922e-05, "loss": 0.5489, "step": 570 }, { "epoch": 0.3474292668086401, "grad_norm": 1.2686628103256226, "learning_rate": 4.9706546922393396e-05, "loss": 0.4569, "step": 571 }, { "epoch": 0.34803772436872527, "grad_norm": 2.3073008060455322, "learning_rate": 4.970464205660077e-05, "loss": 0.4969, "step": 572 }, { "epoch": 0.3486461819288105, "grad_norm": 1.3341611623764038, "learning_rate": 4.970273106508666e-05, "loss": 0.501, "step": 573 }, { "epoch": 0.3492546394888956, "grad_norm": 1.5147910118103027, "learning_rate": 4.97008139483249e-05, "loss": 0.5889, "step": 574 }, { "epoch": 0.34986309704898083, "grad_norm": 2.7254374027252197, "learning_rate": 4.9698890706790866e-05, "loss": 0.598, "step": 575 }, { "epoch": 0.35047155460906604, "grad_norm": 1.4066073894500732, "learning_rate": 4.969696134096143e-05, "loss": 0.5344, "step": 576 }, { "epoch": 0.3510800121691512, "grad_norm": 1.7791637182235718, "learning_rate": 4.969502585131502e-05, "loss": 0.4642, "step": 577 }, { "epoch": 0.3516884697292364, "grad_norm": 1.9573460817337036, "learning_rate": 4.969308423833152e-05, "loss": 0.5322, "step": 578 }, { "epoch": 0.35229692728932155, "grad_norm": 1.4854366779327393, "learning_rate": 4.96911365024924e-05, "loss": 0.486, "step": 579 }, { "epoch": 0.35290538484940676, "grad_norm": 1.8296064138412476, "learning_rate": 4.968918264428059e-05, "loss": 0.5649, "step": 580 }, { "epoch": 0.35351384240949196, "grad_norm": 1.3343514204025269, "learning_rate": 4.9687222664180585e-05, "loss": 0.524, "step": 581 }, { "epoch": 0.3541222999695771, "grad_norm": 1.521429419517517, "learning_rate": 4.968525656267838e-05, "loss": 0.5018, "step": 582 }, { "epoch": 0.3547307575296623, "grad_norm": 2.845484972000122, "learning_rate": 4.968328434026148e-05, "loss": 0.6363, "step": 583 }, { "epoch": 0.3553392150897475, "grad_norm": 1.4245471954345703, "learning_rate": 4.9681305997418906e-05, "loss": 0.5181, "step": 584 }, { "epoch": 0.3559476726498327, "grad_norm": 1.27480149269104, "learning_rate": 4.9679321534641214e-05, "loss": 0.5343, "step": 585 }, { "epoch": 0.35655613020991783, "grad_norm": 1.2429792881011963, "learning_rate": 4.967733095242047e-05, "loss": 0.4989, "step": 586 }, { "epoch": 0.35716458777000304, "grad_norm": 1.9116991758346558, "learning_rate": 4.967533425125025e-05, "loss": 0.611, "step": 587 }, { "epoch": 0.35777304533008825, "grad_norm": 1.498067855834961, "learning_rate": 4.967333143162565e-05, "loss": 0.5896, "step": 588 }, { "epoch": 0.3583815028901734, "grad_norm": 1.4009770154953003, "learning_rate": 4.967132249404329e-05, "loss": 0.5002, "step": 589 }, { "epoch": 0.3589899604502586, "grad_norm": 1.346002459526062, "learning_rate": 4.966930743900131e-05, "loss": 0.5411, "step": 590 }, { "epoch": 0.35959841801034376, "grad_norm": 1.3442580699920654, "learning_rate": 4.9667286266999354e-05, "loss": 0.5158, "step": 591 }, { "epoch": 0.36020687557042896, "grad_norm": 1.6261683702468872, "learning_rate": 4.966525897853858e-05, "loss": 0.6546, "step": 592 }, { "epoch": 0.36081533313051417, "grad_norm": 1.5490068197250366, "learning_rate": 4.966322557412168e-05, "loss": 0.5536, "step": 593 }, { "epoch": 0.3614237906905993, "grad_norm": 1.3161338567733765, "learning_rate": 4.966118605425285e-05, "loss": 0.5315, "step": 594 }, { "epoch": 0.36203224825068453, "grad_norm": 1.6688951253890991, "learning_rate": 4.965914041943781e-05, "loss": 0.5392, "step": 595 }, { "epoch": 0.3626407058107697, "grad_norm": 1.894654393196106, "learning_rate": 4.9657088670183794e-05, "loss": 0.5337, "step": 596 }, { "epoch": 0.3632491633708549, "grad_norm": 1.5747034549713135, "learning_rate": 4.9655030806999534e-05, "loss": 0.5689, "step": 597 }, { "epoch": 0.36385762093094004, "grad_norm": 1.315900206565857, "learning_rate": 4.965296683039532e-05, "loss": 0.5897, "step": 598 }, { "epoch": 0.36446607849102525, "grad_norm": 1.3753433227539062, "learning_rate": 4.9650896740882905e-05, "loss": 0.5475, "step": 599 }, { "epoch": 0.36507453605111045, "grad_norm": 1.546858787536621, "learning_rate": 4.96488205389756e-05, "loss": 0.5734, "step": 600 }, { "epoch": 0.3656829936111956, "grad_norm": 1.2179769277572632, "learning_rate": 4.9646738225188226e-05, "loss": 0.4914, "step": 601 }, { "epoch": 0.3662914511712808, "grad_norm": 1.370540738105774, "learning_rate": 4.964464980003709e-05, "loss": 0.4964, "step": 602 }, { "epoch": 0.36689990873136596, "grad_norm": 1.3386831283569336, "learning_rate": 4.9642555264040046e-05, "loss": 0.5037, "step": 603 }, { "epoch": 0.36750836629145117, "grad_norm": 1.3772310018539429, "learning_rate": 4.964045461771645e-05, "loss": 0.4962, "step": 604 }, { "epoch": 0.3681168238515364, "grad_norm": 1.5445538759231567, "learning_rate": 4.963834786158717e-05, "loss": 0.4728, "step": 605 }, { "epoch": 0.36872528141162153, "grad_norm": 1.419505000114441, "learning_rate": 4.96362349961746e-05, "loss": 0.502, "step": 606 }, { "epoch": 0.36933373897170674, "grad_norm": 1.3720256090164185, "learning_rate": 4.963411602200264e-05, "loss": 0.5714, "step": 607 }, { "epoch": 0.3699421965317919, "grad_norm": 1.3194488286972046, "learning_rate": 4.963199093959671e-05, "loss": 0.5201, "step": 608 }, { "epoch": 0.3705506540918771, "grad_norm": 1.3677685260772705, "learning_rate": 4.962985974948373e-05, "loss": 0.5827, "step": 609 }, { "epoch": 0.37115911165196225, "grad_norm": 1.1780776977539062, "learning_rate": 4.9627722452192164e-05, "loss": 0.4458, "step": 610 }, { "epoch": 0.37176756921204746, "grad_norm": 1.2468417882919312, "learning_rate": 4.962557904825196e-05, "loss": 0.5893, "step": 611 }, { "epoch": 0.37237602677213266, "grad_norm": 1.4214433431625366, "learning_rate": 4.962342953819459e-05, "loss": 0.5349, "step": 612 }, { "epoch": 0.3729844843322178, "grad_norm": 1.3830993175506592, "learning_rate": 4.9621273922553055e-05, "loss": 0.4956, "step": 613 }, { "epoch": 0.373592941892303, "grad_norm": 1.322393774986267, "learning_rate": 4.961911220186186e-05, "loss": 0.528, "step": 614 }, { "epoch": 0.3742013994523882, "grad_norm": 1.3154321908950806, "learning_rate": 4.961694437665701e-05, "loss": 0.4628, "step": 615 }, { "epoch": 0.3748098570124734, "grad_norm": 1.4259490966796875, "learning_rate": 4.9614770447476037e-05, "loss": 0.5198, "step": 616 }, { "epoch": 0.3754183145725586, "grad_norm": 1.643979549407959, "learning_rate": 4.961259041485799e-05, "loss": 0.4984, "step": 617 }, { "epoch": 0.37602677213264374, "grad_norm": 1.36431884765625, "learning_rate": 4.9610404279343415e-05, "loss": 0.5148, "step": 618 }, { "epoch": 0.37663522969272895, "grad_norm": 1.2996978759765625, "learning_rate": 4.96082120414744e-05, "loss": 0.5277, "step": 619 }, { "epoch": 0.3772436872528141, "grad_norm": 1.7564353942871094, "learning_rate": 4.960601370179452e-05, "loss": 0.5884, "step": 620 }, { "epoch": 0.3778521448128993, "grad_norm": 1.3980865478515625, "learning_rate": 4.9603809260848864e-05, "loss": 0.4982, "step": 621 }, { "epoch": 0.3784606023729845, "grad_norm": 1.2997080087661743, "learning_rate": 4.960159871918405e-05, "loss": 0.5634, "step": 622 }, { "epoch": 0.37906905993306966, "grad_norm": 1.2841782569885254, "learning_rate": 4.9599382077348205e-05, "loss": 0.4826, "step": 623 }, { "epoch": 0.37967751749315487, "grad_norm": 1.347280502319336, "learning_rate": 4.959715933589095e-05, "loss": 0.5076, "step": 624 }, { "epoch": 0.38028597505324, "grad_norm": 1.6214102506637573, "learning_rate": 4.9594930495363445e-05, "loss": 0.5878, "step": 625 }, { "epoch": 0.38089443261332523, "grad_norm": 1.3995033502578735, "learning_rate": 4.959269555631835e-05, "loss": 0.476, "step": 626 }, { "epoch": 0.3815028901734104, "grad_norm": 1.3252878189086914, "learning_rate": 4.959045451930982e-05, "loss": 0.5469, "step": 627 }, { "epoch": 0.3821113477334956, "grad_norm": 1.2933142185211182, "learning_rate": 4.958820738489355e-05, "loss": 0.535, "step": 628 }, { "epoch": 0.3827198052935808, "grad_norm": 1.3921632766723633, "learning_rate": 4.958595415362675e-05, "loss": 0.5282, "step": 629 }, { "epoch": 0.38332826285366595, "grad_norm": 1.2744975090026855, "learning_rate": 4.95836948260681e-05, "loss": 0.5035, "step": 630 }, { "epoch": 0.38393672041375115, "grad_norm": 2.0861146450042725, "learning_rate": 4.9581429402777826e-05, "loss": 0.5104, "step": 631 }, { "epoch": 0.3845451779738363, "grad_norm": 1.7195478677749634, "learning_rate": 4.957915788431768e-05, "loss": 0.5618, "step": 632 }, { "epoch": 0.3851536355339215, "grad_norm": 1.2929902076721191, "learning_rate": 4.957688027125088e-05, "loss": 0.4858, "step": 633 }, { "epoch": 0.3857620930940067, "grad_norm": 1.324310064315796, "learning_rate": 4.957459656414219e-05, "loss": 0.576, "step": 634 }, { "epoch": 0.38637055065409187, "grad_norm": 1.3465945720672607, "learning_rate": 4.957230676355787e-05, "loss": 0.5138, "step": 635 }, { "epoch": 0.3869790082141771, "grad_norm": 1.2938052415847778, "learning_rate": 4.95700108700657e-05, "loss": 0.5101, "step": 636 }, { "epoch": 0.38758746577426223, "grad_norm": 1.6239326000213623, "learning_rate": 4.956770888423495e-05, "loss": 0.5932, "step": 637 }, { "epoch": 0.38819592333434744, "grad_norm": 1.3024382591247559, "learning_rate": 4.9565400806636447e-05, "loss": 0.5088, "step": 638 }, { "epoch": 0.3888043808944326, "grad_norm": 1.557921051979065, "learning_rate": 4.956308663784247e-05, "loss": 0.5358, "step": 639 }, { "epoch": 0.3894128384545178, "grad_norm": 1.2118357419967651, "learning_rate": 4.956076637842685e-05, "loss": 0.5089, "step": 640 }, { "epoch": 0.390021296014603, "grad_norm": 1.4801642894744873, "learning_rate": 4.9558440028964914e-05, "loss": 0.4758, "step": 641 }, { "epoch": 0.39062975357468815, "grad_norm": 1.3965163230895996, "learning_rate": 4.95561075900335e-05, "loss": 0.5498, "step": 642 }, { "epoch": 0.39123821113477336, "grad_norm": 1.5353859663009644, "learning_rate": 4.955376906221094e-05, "loss": 0.4723, "step": 643 }, { "epoch": 0.3918466686948585, "grad_norm": 1.248234748840332, "learning_rate": 4.955142444607711e-05, "loss": 0.5461, "step": 644 }, { "epoch": 0.3924551262549437, "grad_norm": 1.5625355243682861, "learning_rate": 4.9549073742213375e-05, "loss": 0.5561, "step": 645 }, { "epoch": 0.3930635838150289, "grad_norm": 1.327852487564087, "learning_rate": 4.9546716951202606e-05, "loss": 0.4967, "step": 646 }, { "epoch": 0.3936720413751141, "grad_norm": 1.3109062910079956, "learning_rate": 4.9544354073629186e-05, "loss": 0.5108, "step": 647 }, { "epoch": 0.3942804989351993, "grad_norm": 1.6410901546478271, "learning_rate": 4.954198511007902e-05, "loss": 0.5998, "step": 648 }, { "epoch": 0.39488895649528444, "grad_norm": 1.4157142639160156, "learning_rate": 4.9539610061139505e-05, "loss": 0.54, "step": 649 }, { "epoch": 0.39549741405536964, "grad_norm": 1.2379719018936157, "learning_rate": 4.953722892739956e-05, "loss": 0.4716, "step": 650 }, { "epoch": 0.3961058716154548, "grad_norm": 1.3392927646636963, "learning_rate": 4.95348417094496e-05, "loss": 0.4665, "step": 651 }, { "epoch": 0.39671432917554, "grad_norm": 1.3489748239517212, "learning_rate": 4.953244840788156e-05, "loss": 0.5533, "step": 652 }, { "epoch": 0.3973227867356252, "grad_norm": 1.4216614961624146, "learning_rate": 4.953004902328887e-05, "loss": 0.5107, "step": 653 }, { "epoch": 0.39793124429571036, "grad_norm": 1.2721261978149414, "learning_rate": 4.9527643556266493e-05, "loss": 0.5454, "step": 654 }, { "epoch": 0.39853970185579557, "grad_norm": 1.292842984199524, "learning_rate": 4.952523200741088e-05, "loss": 0.4989, "step": 655 }, { "epoch": 0.3991481594158807, "grad_norm": 1.2680416107177734, "learning_rate": 4.952281437731998e-05, "loss": 0.4913, "step": 656 }, { "epoch": 0.39975661697596593, "grad_norm": 1.3643546104431152, "learning_rate": 4.9520390666593286e-05, "loss": 0.5144, "step": 657 }, { "epoch": 0.40036507453605114, "grad_norm": 1.310077428817749, "learning_rate": 4.951796087583176e-05, "loss": 0.5468, "step": 658 }, { "epoch": 0.4009735320961363, "grad_norm": 1.3737618923187256, "learning_rate": 4.95155250056379e-05, "loss": 0.5958, "step": 659 }, { "epoch": 0.4015819896562215, "grad_norm": 1.2556304931640625, "learning_rate": 4.9513083056615695e-05, "loss": 0.5171, "step": 660 }, { "epoch": 0.40219044721630665, "grad_norm": 1.2657099962234497, "learning_rate": 4.9510635029370646e-05, "loss": 0.5154, "step": 661 }, { "epoch": 0.40279890477639185, "grad_norm": 1.3634426593780518, "learning_rate": 4.9508180924509763e-05, "loss": 0.4383, "step": 662 }, { "epoch": 0.403407362336477, "grad_norm": 1.582513689994812, "learning_rate": 4.950572074264156e-05, "loss": 0.4719, "step": 663 }, { "epoch": 0.4040158198965622, "grad_norm": 2.247115135192871, "learning_rate": 4.9503254484376074e-05, "loss": 0.5479, "step": 664 }, { "epoch": 0.4046242774566474, "grad_norm": 1.3867719173431396, "learning_rate": 4.950078215032481e-05, "loss": 0.5422, "step": 665 }, { "epoch": 0.40523273501673257, "grad_norm": 1.2318755388259888, "learning_rate": 4.949830374110081e-05, "loss": 0.5662, "step": 666 }, { "epoch": 0.4058411925768178, "grad_norm": 1.5015090703964233, "learning_rate": 4.9495819257318635e-05, "loss": 0.5895, "step": 667 }, { "epoch": 0.40644965013690293, "grad_norm": 1.2691212892532349, "learning_rate": 4.949332869959432e-05, "loss": 0.4953, "step": 668 }, { "epoch": 0.40705810769698814, "grad_norm": 1.7610334157943726, "learning_rate": 4.9490832068545414e-05, "loss": 0.5327, "step": 669 }, { "epoch": 0.40766656525707334, "grad_norm": 1.3113877773284912, "learning_rate": 4.9488329364790986e-05, "loss": 0.5369, "step": 670 }, { "epoch": 0.4082750228171585, "grad_norm": 1.3471949100494385, "learning_rate": 4.948582058895159e-05, "loss": 0.4791, "step": 671 }, { "epoch": 0.4088834803772437, "grad_norm": 1.1278780698776245, "learning_rate": 4.9483305741649324e-05, "loss": 0.4704, "step": 672 }, { "epoch": 0.40949193793732885, "grad_norm": 1.2446277141571045, "learning_rate": 4.948078482350774e-05, "loss": 0.4728, "step": 673 }, { "epoch": 0.41010039549741406, "grad_norm": 1.507495641708374, "learning_rate": 4.947825783515193e-05, "loss": 0.6533, "step": 674 }, { "epoch": 0.4107088530574992, "grad_norm": 1.4783514738082886, "learning_rate": 4.9475724777208474e-05, "loss": 0.5185, "step": 675 }, { "epoch": 0.4113173106175844, "grad_norm": 1.186607003211975, "learning_rate": 4.947318565030548e-05, "loss": 0.4684, "step": 676 }, { "epoch": 0.4119257681776696, "grad_norm": 1.4391772747039795, "learning_rate": 4.947064045507253e-05, "loss": 0.5561, "step": 677 }, { "epoch": 0.4125342257377548, "grad_norm": 1.3930890560150146, "learning_rate": 4.946808919214074e-05, "loss": 0.5353, "step": 678 }, { "epoch": 0.41314268329784, "grad_norm": 1.2394658327102661, "learning_rate": 4.946553186214271e-05, "loss": 0.4428, "step": 679 }, { "epoch": 0.41375114085792514, "grad_norm": 1.2956104278564453, "learning_rate": 4.9462968465712555e-05, "loss": 0.4658, "step": 680 }, { "epoch": 0.41435959841801034, "grad_norm": 2.5605669021606445, "learning_rate": 4.946039900348588e-05, "loss": 0.5784, "step": 681 }, { "epoch": 0.41496805597809555, "grad_norm": 1.3224326372146606, "learning_rate": 4.945782347609982e-05, "loss": 0.4152, "step": 682 }, { "epoch": 0.4155765135381807, "grad_norm": 1.4174445867538452, "learning_rate": 4.945524188419298e-05, "loss": 0.5388, "step": 683 }, { "epoch": 0.4161849710982659, "grad_norm": 1.2914178371429443, "learning_rate": 4.9452654228405506e-05, "loss": 0.4773, "step": 684 }, { "epoch": 0.41679342865835106, "grad_norm": 1.3811676502227783, "learning_rate": 4.945006050937902e-05, "loss": 0.551, "step": 685 }, { "epoch": 0.41740188621843627, "grad_norm": 1.2903940677642822, "learning_rate": 4.944746072775665e-05, "loss": 0.535, "step": 686 }, { "epoch": 0.4180103437785214, "grad_norm": 1.8577662706375122, "learning_rate": 4.9444854884183046e-05, "loss": 0.533, "step": 687 }, { "epoch": 0.4186188013386066, "grad_norm": 1.3438717126846313, "learning_rate": 4.944224297930434e-05, "loss": 0.5539, "step": 688 }, { "epoch": 0.41922725889869183, "grad_norm": 1.184409260749817, "learning_rate": 4.943962501376818e-05, "loss": 0.4907, "step": 689 }, { "epoch": 0.419835716458777, "grad_norm": 1.1499260663986206, "learning_rate": 4.9437000988223705e-05, "loss": 0.4658, "step": 690 }, { "epoch": 0.4204441740188622, "grad_norm": 1.16009521484375, "learning_rate": 4.9434370903321566e-05, "loss": 0.5088, "step": 691 }, { "epoch": 0.42105263157894735, "grad_norm": 1.2988529205322266, "learning_rate": 4.943173475971393e-05, "loss": 0.5152, "step": 692 }, { "epoch": 0.42166108913903255, "grad_norm": 1.246013879776001, "learning_rate": 4.942909255805443e-05, "loss": 0.4863, "step": 693 }, { "epoch": 0.42226954669911776, "grad_norm": 1.4207082986831665, "learning_rate": 4.942644429899824e-05, "loss": 0.5582, "step": 694 }, { "epoch": 0.4228780042592029, "grad_norm": 1.1736639738082886, "learning_rate": 4.9423789983201994e-05, "loss": 0.4872, "step": 695 }, { "epoch": 0.4234864618192881, "grad_norm": 1.3265244960784912, "learning_rate": 4.942112961132388e-05, "loss": 0.4979, "step": 696 }, { "epoch": 0.42409491937937327, "grad_norm": 1.4081310033798218, "learning_rate": 4.941846318402353e-05, "loss": 0.5683, "step": 697 }, { "epoch": 0.4247033769394585, "grad_norm": 1.3100318908691406, "learning_rate": 4.941579070196214e-05, "loss": 0.5316, "step": 698 }, { "epoch": 0.4253118344995437, "grad_norm": 1.369620442390442, "learning_rate": 4.9413112165802345e-05, "loss": 0.5076, "step": 699 }, { "epoch": 0.42592029205962884, "grad_norm": 1.268389105796814, "learning_rate": 4.9410427576208316e-05, "loss": 0.4792, "step": 700 }, { "epoch": 0.42652874961971404, "grad_norm": 1.2050520181655884, "learning_rate": 4.940773693384574e-05, "loss": 0.4954, "step": 701 }, { "epoch": 0.4271372071797992, "grad_norm": 1.5060069561004639, "learning_rate": 4.940504023938176e-05, "loss": 0.5535, "step": 702 }, { "epoch": 0.4277456647398844, "grad_norm": 1.380094289779663, "learning_rate": 4.940233749348505e-05, "loss": 0.48, "step": 703 }, { "epoch": 0.42835412229996955, "grad_norm": 1.2237451076507568, "learning_rate": 4.9399628696825786e-05, "loss": 0.4835, "step": 704 }, { "epoch": 0.42896257986005476, "grad_norm": 1.639406442642212, "learning_rate": 4.9396913850075636e-05, "loss": 0.5267, "step": 705 }, { "epoch": 0.42957103742013997, "grad_norm": 1.3455315828323364, "learning_rate": 4.9394192953907757e-05, "loss": 0.5586, "step": 706 }, { "epoch": 0.4301794949802251, "grad_norm": 1.4164340496063232, "learning_rate": 4.939146600899683e-05, "loss": 0.4524, "step": 707 }, { "epoch": 0.4307879525403103, "grad_norm": 1.3369954824447632, "learning_rate": 4.938873301601902e-05, "loss": 0.5494, "step": 708 }, { "epoch": 0.4313964101003955, "grad_norm": 1.3146734237670898, "learning_rate": 4.938599397565199e-05, "loss": 0.4943, "step": 709 }, { "epoch": 0.4320048676604807, "grad_norm": 1.4245113134384155, "learning_rate": 4.9383248888574916e-05, "loss": 0.5151, "step": 710 }, { "epoch": 0.4326133252205659, "grad_norm": 1.6800333261489868, "learning_rate": 4.938049775546846e-05, "loss": 0.5604, "step": 711 }, { "epoch": 0.43322178278065104, "grad_norm": 1.3402103185653687, "learning_rate": 4.9377740577014784e-05, "loss": 0.5283, "step": 712 }, { "epoch": 0.43383024034073625, "grad_norm": 1.3066250085830688, "learning_rate": 4.9374977353897566e-05, "loss": 0.4636, "step": 713 }, { "epoch": 0.4344386979008214, "grad_norm": 1.3179736137390137, "learning_rate": 4.937220808680196e-05, "loss": 0.471, "step": 714 }, { "epoch": 0.4350471554609066, "grad_norm": 1.3768038749694824, "learning_rate": 4.9369432776414634e-05, "loss": 0.538, "step": 715 }, { "epoch": 0.43565561302099176, "grad_norm": 1.1771548986434937, "learning_rate": 4.936665142342375e-05, "loss": 0.4856, "step": 716 }, { "epoch": 0.43626407058107697, "grad_norm": 2.29483962059021, "learning_rate": 4.936386402851896e-05, "loss": 0.4863, "step": 717 }, { "epoch": 0.4368725281411622, "grad_norm": 1.554796576499939, "learning_rate": 4.936107059239143e-05, "loss": 0.5158, "step": 718 }, { "epoch": 0.4374809857012473, "grad_norm": 1.3302370309829712, "learning_rate": 4.935827111573381e-05, "loss": 0.5295, "step": 719 }, { "epoch": 0.43808944326133253, "grad_norm": 1.2260332107543945, "learning_rate": 4.9355465599240265e-05, "loss": 0.5124, "step": 720 }, { "epoch": 0.4386979008214177, "grad_norm": 1.272337555885315, "learning_rate": 4.935265404360643e-05, "loss": 0.4962, "step": 721 }, { "epoch": 0.4393063583815029, "grad_norm": 1.1549131870269775, "learning_rate": 4.9349836449529463e-05, "loss": 0.5208, "step": 722 }, { "epoch": 0.4399148159415881, "grad_norm": 1.3710724115371704, "learning_rate": 4.9347012817708e-05, "loss": 0.5462, "step": 723 }, { "epoch": 0.44052327350167325, "grad_norm": 1.3693798780441284, "learning_rate": 4.93441831488422e-05, "loss": 0.5133, "step": 724 }, { "epoch": 0.44113173106175846, "grad_norm": 1.2647064924240112, "learning_rate": 4.934134744363369e-05, "loss": 0.5089, "step": 725 }, { "epoch": 0.4417401886218436, "grad_norm": 1.302121877670288, "learning_rate": 4.933850570278562e-05, "loss": 0.4566, "step": 726 }, { "epoch": 0.4423486461819288, "grad_norm": 1.633015751838684, "learning_rate": 4.933565792700261e-05, "loss": 0.5596, "step": 727 }, { "epoch": 0.44295710374201397, "grad_norm": 1.376413345336914, "learning_rate": 4.9332804116990795e-05, "loss": 0.5597, "step": 728 }, { "epoch": 0.4435655613020992, "grad_norm": 1.3377238512039185, "learning_rate": 4.9329944273457794e-05, "loss": 0.4782, "step": 729 }, { "epoch": 0.4441740188621844, "grad_norm": 1.3485767841339111, "learning_rate": 4.932707839711273e-05, "loss": 0.504, "step": 730 }, { "epoch": 0.44478247642226953, "grad_norm": 1.3925731182098389, "learning_rate": 4.9324206488666244e-05, "loss": 0.5548, "step": 731 }, { "epoch": 0.44539093398235474, "grad_norm": 1.1115320920944214, "learning_rate": 4.9321328548830426e-05, "loss": 0.5035, "step": 732 }, { "epoch": 0.4459993915424399, "grad_norm": 1.2374329566955566, "learning_rate": 4.9318444578318886e-05, "loss": 0.47, "step": 733 }, { "epoch": 0.4466078491025251, "grad_norm": 1.378677487373352, "learning_rate": 4.931555457784674e-05, "loss": 0.5206, "step": 734 }, { "epoch": 0.4472163066626103, "grad_norm": 1.3036342859268188, "learning_rate": 4.931265854813057e-05, "loss": 0.4869, "step": 735 }, { "epoch": 0.44782476422269546, "grad_norm": 1.1578119993209839, "learning_rate": 4.930975648988849e-05, "loss": 0.495, "step": 736 }, { "epoch": 0.44843322178278067, "grad_norm": 1.815414309501648, "learning_rate": 4.930684840384008e-05, "loss": 0.6301, "step": 737 }, { "epoch": 0.4490416793428658, "grad_norm": 1.4656637907028198, "learning_rate": 4.9303934290706424e-05, "loss": 0.4809, "step": 738 }, { "epoch": 0.449650136902951, "grad_norm": 1.5585298538208008, "learning_rate": 4.93010141512101e-05, "loss": 0.5067, "step": 739 }, { "epoch": 0.4502585944630362, "grad_norm": 1.466367483139038, "learning_rate": 4.929808798607518e-05, "loss": 0.5188, "step": 740 }, { "epoch": 0.4508670520231214, "grad_norm": 1.1880073547363281, "learning_rate": 4.9295155796027244e-05, "loss": 0.5133, "step": 741 }, { "epoch": 0.4514755095832066, "grad_norm": 1.425815224647522, "learning_rate": 4.929221758179333e-05, "loss": 0.5826, "step": 742 }, { "epoch": 0.45208396714329174, "grad_norm": 1.4366766214370728, "learning_rate": 4.9289273344102014e-05, "loss": 0.5136, "step": 743 }, { "epoch": 0.45269242470337695, "grad_norm": 1.2157377004623413, "learning_rate": 4.928632308368334e-05, "loss": 0.4507, "step": 744 }, { "epoch": 0.4533008822634621, "grad_norm": 1.1545383930206299, "learning_rate": 4.928336680126884e-05, "loss": 0.4523, "step": 745 }, { "epoch": 0.4539093398235473, "grad_norm": 1.2865034341812134, "learning_rate": 4.9280404497591545e-05, "loss": 0.4843, "step": 746 }, { "epoch": 0.4545177973836325, "grad_norm": 1.1949740648269653, "learning_rate": 4.9277436173386006e-05, "loss": 0.4669, "step": 747 }, { "epoch": 0.45512625494371767, "grad_norm": 1.2178860902786255, "learning_rate": 4.927446182938822e-05, "loss": 0.5262, "step": 748 }, { "epoch": 0.4557347125038029, "grad_norm": 1.3532716035842896, "learning_rate": 4.927148146633571e-05, "loss": 0.4994, "step": 749 }, { "epoch": 0.456343170063888, "grad_norm": 1.2420387268066406, "learning_rate": 4.9268495084967485e-05, "loss": 0.4912, "step": 750 }, { "epoch": 0.45695162762397323, "grad_norm": 1.242753267288208, "learning_rate": 4.926550268602404e-05, "loss": 0.5301, "step": 751 }, { "epoch": 0.4575600851840584, "grad_norm": 1.2774479389190674, "learning_rate": 4.926250427024736e-05, "loss": 0.4994, "step": 752 }, { "epoch": 0.4581685427441436, "grad_norm": 1.3734140396118164, "learning_rate": 4.925949983838094e-05, "loss": 0.5812, "step": 753 }, { "epoch": 0.4587770003042288, "grad_norm": 1.4266109466552734, "learning_rate": 4.925648939116974e-05, "loss": 0.5418, "step": 754 }, { "epoch": 0.45938545786431395, "grad_norm": 1.2357332706451416, "learning_rate": 4.9253472929360235e-05, "loss": 0.5441, "step": 755 }, { "epoch": 0.45999391542439916, "grad_norm": 1.4429142475128174, "learning_rate": 4.925045045370037e-05, "loss": 0.5116, "step": 756 }, { "epoch": 0.4606023729844843, "grad_norm": 1.2204004526138306, "learning_rate": 4.924742196493961e-05, "loss": 0.4969, "step": 757 }, { "epoch": 0.4612108305445695, "grad_norm": 1.2525185346603394, "learning_rate": 4.9244387463828876e-05, "loss": 0.4672, "step": 758 }, { "epoch": 0.4618192881046547, "grad_norm": 1.238554835319519, "learning_rate": 4.9241346951120616e-05, "loss": 0.5296, "step": 759 }, { "epoch": 0.4624277456647399, "grad_norm": 1.2626805305480957, "learning_rate": 4.923830042756874e-05, "loss": 0.5025, "step": 760 }, { "epoch": 0.4630362032248251, "grad_norm": 1.3802168369293213, "learning_rate": 4.923524789392866e-05, "loss": 0.5273, "step": 761 }, { "epoch": 0.46364466078491023, "grad_norm": 1.498776912689209, "learning_rate": 4.923218935095727e-05, "loss": 0.4597, "step": 762 }, { "epoch": 0.46425311834499544, "grad_norm": 1.449130892753601, "learning_rate": 4.922912479941297e-05, "loss": 0.533, "step": 763 }, { "epoch": 0.4648615759050806, "grad_norm": 2.3711140155792236, "learning_rate": 4.922605424005565e-05, "loss": 0.5551, "step": 764 }, { "epoch": 0.4654700334651658, "grad_norm": 1.1763001680374146, "learning_rate": 4.922297767364666e-05, "loss": 0.4951, "step": 765 }, { "epoch": 0.466078491025251, "grad_norm": 1.5831891298294067, "learning_rate": 4.921989510094888e-05, "loss": 0.6159, "step": 766 }, { "epoch": 0.46668694858533616, "grad_norm": 1.3769197463989258, "learning_rate": 4.921680652272665e-05, "loss": 0.5048, "step": 767 }, { "epoch": 0.46729540614542137, "grad_norm": 1.137963056564331, "learning_rate": 4.9213711939745795e-05, "loss": 0.4931, "step": 768 }, { "epoch": 0.4679038637055065, "grad_norm": 1.7989009618759155, "learning_rate": 4.921061135277366e-05, "loss": 0.5065, "step": 769 }, { "epoch": 0.4685123212655917, "grad_norm": 1.294255256652832, "learning_rate": 4.920750476257906e-05, "loss": 0.4863, "step": 770 }, { "epoch": 0.46912077882567693, "grad_norm": 1.5027679204940796, "learning_rate": 4.92043921699323e-05, "loss": 0.507, "step": 771 }, { "epoch": 0.4697292363857621, "grad_norm": 1.1163376569747925, "learning_rate": 4.920127357560517e-05, "loss": 0.4507, "step": 772 }, { "epoch": 0.4703376939458473, "grad_norm": 1.1562211513519287, "learning_rate": 4.919814898037095e-05, "loss": 0.4933, "step": 773 }, { "epoch": 0.47094615150593244, "grad_norm": 1.191632866859436, "learning_rate": 4.919501838500441e-05, "loss": 0.4549, "step": 774 }, { "epoch": 0.47155460906601765, "grad_norm": 1.2718918323516846, "learning_rate": 4.9191881790281815e-05, "loss": 0.5133, "step": 775 }, { "epoch": 0.47216306662610286, "grad_norm": 1.677453875541687, "learning_rate": 4.91887391969809e-05, "loss": 0.4517, "step": 776 }, { "epoch": 0.472771524186188, "grad_norm": 1.3520299196243286, "learning_rate": 4.91855906058809e-05, "loss": 0.4373, "step": 777 }, { "epoch": 0.4733799817462732, "grad_norm": 1.4283875226974487, "learning_rate": 4.9182436017762535e-05, "loss": 0.517, "step": 778 }, { "epoch": 0.47398843930635837, "grad_norm": 1.2896888256072998, "learning_rate": 4.917927543340801e-05, "loss": 0.5267, "step": 779 }, { "epoch": 0.4745968968664436, "grad_norm": 1.275476098060608, "learning_rate": 4.9176108853601024e-05, "loss": 0.4542, "step": 780 }, { "epoch": 0.4752053544265287, "grad_norm": 1.1607316732406616, "learning_rate": 4.917293627912675e-05, "loss": 0.4794, "step": 781 }, { "epoch": 0.47581381198661393, "grad_norm": 1.6451760530471802, "learning_rate": 4.916975771077185e-05, "loss": 0.5549, "step": 782 }, { "epoch": 0.47642226954669914, "grad_norm": 1.3104053735733032, "learning_rate": 4.9166573149324486e-05, "loss": 0.4835, "step": 783 }, { "epoch": 0.4770307271067843, "grad_norm": 1.5024620294570923, "learning_rate": 4.916338259557429e-05, "loss": 0.5095, "step": 784 }, { "epoch": 0.4776391846668695, "grad_norm": 1.4474741220474243, "learning_rate": 4.91601860503124e-05, "loss": 0.5574, "step": 785 }, { "epoch": 0.47824764222695465, "grad_norm": 1.329298734664917, "learning_rate": 4.915698351433141e-05, "loss": 0.5496, "step": 786 }, { "epoch": 0.47885609978703986, "grad_norm": 1.1795927286148071, "learning_rate": 4.915377498842542e-05, "loss": 0.4553, "step": 787 }, { "epoch": 0.47946455734712506, "grad_norm": 1.329775094985962, "learning_rate": 4.915056047339002e-05, "loss": 0.5445, "step": 788 }, { "epoch": 0.4800730149072102, "grad_norm": 1.1533706188201904, "learning_rate": 4.9147339970022256e-05, "loss": 0.507, "step": 789 }, { "epoch": 0.4806814724672954, "grad_norm": 1.1243101358413696, "learning_rate": 4.9144113479120695e-05, "loss": 0.5018, "step": 790 }, { "epoch": 0.4812899300273806, "grad_norm": 1.1606624126434326, "learning_rate": 4.9140881001485374e-05, "loss": 0.4971, "step": 791 }, { "epoch": 0.4818983875874658, "grad_norm": 1.342514991760254, "learning_rate": 4.91376425379178e-05, "loss": 0.541, "step": 792 }, { "epoch": 0.48250684514755093, "grad_norm": 1.347623586654663, "learning_rate": 4.913439808922098e-05, "loss": 0.509, "step": 793 }, { "epoch": 0.48311530270763614, "grad_norm": 1.340950608253479, "learning_rate": 4.91311476561994e-05, "loss": 0.4773, "step": 794 }, { "epoch": 0.48372376026772135, "grad_norm": 1.0880382061004639, "learning_rate": 4.912789123965905e-05, "loss": 0.4569, "step": 795 }, { "epoch": 0.4843322178278065, "grad_norm": 1.922781229019165, "learning_rate": 4.912462884040737e-05, "loss": 0.4752, "step": 796 }, { "epoch": 0.4849406753878917, "grad_norm": 1.0886273384094238, "learning_rate": 4.91213604592533e-05, "loss": 0.4649, "step": 797 }, { "epoch": 0.48554913294797686, "grad_norm": 1.411138892173767, "learning_rate": 4.911808609700726e-05, "loss": 0.5271, "step": 798 }, { "epoch": 0.48615759050806207, "grad_norm": 0.9868936538696289, "learning_rate": 4.911480575448116e-05, "loss": 0.435, "step": 799 }, { "epoch": 0.4867660480681473, "grad_norm": 1.06026291847229, "learning_rate": 4.911151943248839e-05, "loss": 0.4693, "step": 800 }, { "epoch": 0.4873745056282324, "grad_norm": 1.5773180723190308, "learning_rate": 4.910822713184382e-05, "loss": 0.5369, "step": 801 }, { "epoch": 0.48798296318831763, "grad_norm": 1.2286581993103027, "learning_rate": 4.910492885336381e-05, "loss": 0.4814, "step": 802 }, { "epoch": 0.4885914207484028, "grad_norm": 1.3670451641082764, "learning_rate": 4.910162459786617e-05, "loss": 0.5145, "step": 803 }, { "epoch": 0.489199878308488, "grad_norm": 1.6404004096984863, "learning_rate": 4.9098314366170245e-05, "loss": 0.5741, "step": 804 }, { "epoch": 0.48980833586857314, "grad_norm": 1.1956790685653687, "learning_rate": 4.909499815909682e-05, "loss": 0.4935, "step": 805 }, { "epoch": 0.49041679342865835, "grad_norm": 1.4000447988510132, "learning_rate": 4.909167597746819e-05, "loss": 0.4661, "step": 806 }, { "epoch": 0.49102525098874356, "grad_norm": 1.4137040376663208, "learning_rate": 4.908834782210809e-05, "loss": 0.5235, "step": 807 }, { "epoch": 0.4916337085488287, "grad_norm": 1.3332107067108154, "learning_rate": 4.90850136938418e-05, "loss": 0.5003, "step": 808 }, { "epoch": 0.4922421661089139, "grad_norm": 1.3718630075454712, "learning_rate": 4.908167359349601e-05, "loss": 0.4756, "step": 809 }, { "epoch": 0.49285062366899907, "grad_norm": 1.3051707744598389, "learning_rate": 4.907832752189896e-05, "loss": 0.5024, "step": 810 }, { "epoch": 0.4934590812290843, "grad_norm": 1.2241960763931274, "learning_rate": 4.907497547988031e-05, "loss": 0.4989, "step": 811 }, { "epoch": 0.4940675387891695, "grad_norm": 1.3939323425292969, "learning_rate": 4.9071617468271234e-05, "loss": 0.5231, "step": 812 }, { "epoch": 0.49467599634925463, "grad_norm": 1.4326186180114746, "learning_rate": 4.906825348790438e-05, "loss": 0.4494, "step": 813 }, { "epoch": 0.49528445390933984, "grad_norm": 1.1022182703018188, "learning_rate": 4.9064883539613884e-05, "loss": 0.5195, "step": 814 }, { "epoch": 0.495892911469425, "grad_norm": 1.2197952270507812, "learning_rate": 4.9061507624235334e-05, "loss": 0.4793, "step": 815 }, { "epoch": 0.4965013690295102, "grad_norm": 1.329502820968628, "learning_rate": 4.905812574260583e-05, "loss": 0.5823, "step": 816 }, { "epoch": 0.49710982658959535, "grad_norm": 1.4672378301620483, "learning_rate": 4.9054737895563935e-05, "loss": 0.5231, "step": 817 }, { "epoch": 0.49771828414968056, "grad_norm": 1.1329609155654907, "learning_rate": 4.905134408394969e-05, "loss": 0.501, "step": 818 }, { "epoch": 0.49832674170976576, "grad_norm": 1.1195259094238281, "learning_rate": 4.904794430860462e-05, "loss": 0.482, "step": 819 }, { "epoch": 0.4989351992698509, "grad_norm": 1.258161187171936, "learning_rate": 4.904453857037173e-05, "loss": 0.4575, "step": 820 }, { "epoch": 0.4995436568299361, "grad_norm": 1.2955543994903564, "learning_rate": 4.904112687009551e-05, "loss": 0.563, "step": 821 }, { "epoch": 0.5001521143900213, "grad_norm": 1.3455455303192139, "learning_rate": 4.90377092086219e-05, "loss": 0.5372, "step": 822 }, { "epoch": 0.5007605719501065, "grad_norm": 1.2363775968551636, "learning_rate": 4.903428558679835e-05, "loss": 0.4754, "step": 823 }, { "epoch": 0.5013690295101917, "grad_norm": 1.1832222938537598, "learning_rate": 4.9030856005473776e-05, "loss": 0.4748, "step": 824 }, { "epoch": 0.5019774870702769, "grad_norm": 1.2365323305130005, "learning_rate": 4.902742046549856e-05, "loss": 0.5218, "step": 825 }, { "epoch": 0.502585944630362, "grad_norm": 1.120848298072815, "learning_rate": 4.902397896772459e-05, "loss": 0.4514, "step": 826 }, { "epoch": 0.5031944021904472, "grad_norm": 1.7753840684890747, "learning_rate": 4.9020531513005194e-05, "loss": 0.5859, "step": 827 }, { "epoch": 0.5038028597505324, "grad_norm": 1.2528876066207886, "learning_rate": 4.901707810219522e-05, "loss": 0.4974, "step": 828 }, { "epoch": 0.5044113173106176, "grad_norm": 1.1275379657745361, "learning_rate": 4.901361873615095e-05, "loss": 0.4611, "step": 829 }, { "epoch": 0.5050197748707028, "grad_norm": 1.2789126634597778, "learning_rate": 4.901015341573017e-05, "loss": 0.4913, "step": 830 }, { "epoch": 0.5056282324307879, "grad_norm": 1.2548869848251343, "learning_rate": 4.900668214179214e-05, "loss": 0.5161, "step": 831 }, { "epoch": 0.5062366899908731, "grad_norm": 1.1596348285675049, "learning_rate": 4.900320491519759e-05, "loss": 0.5323, "step": 832 }, { "epoch": 0.5068451475509583, "grad_norm": 1.0443363189697266, "learning_rate": 4.8999721736808714e-05, "loss": 0.4074, "step": 833 }, { "epoch": 0.5074536051110435, "grad_norm": 1.1528189182281494, "learning_rate": 4.899623260748921e-05, "loss": 0.4656, "step": 834 }, { "epoch": 0.5080620626711286, "grad_norm": 1.4290034770965576, "learning_rate": 4.899273752810423e-05, "loss": 0.5818, "step": 835 }, { "epoch": 0.5086705202312138, "grad_norm": 1.298559546470642, "learning_rate": 4.898923649952041e-05, "loss": 0.5283, "step": 836 }, { "epoch": 0.509278977791299, "grad_norm": 1.4868043661117554, "learning_rate": 4.8985729522605864e-05, "loss": 0.4575, "step": 837 }, { "epoch": 0.5098874353513843, "grad_norm": 1.40908944606781, "learning_rate": 4.898221659823016e-05, "loss": 0.5026, "step": 838 }, { "epoch": 0.5104958929114695, "grad_norm": 1.5016429424285889, "learning_rate": 4.897869772726438e-05, "loss": 0.4975, "step": 839 }, { "epoch": 0.5111043504715546, "grad_norm": 1.2684623003005981, "learning_rate": 4.8975172910581033e-05, "loss": 0.5549, "step": 840 }, { "epoch": 0.5117128080316398, "grad_norm": 1.4201849699020386, "learning_rate": 4.897164214905414e-05, "loss": 0.5264, "step": 841 }, { "epoch": 0.512321265591725, "grad_norm": 1.300970435142517, "learning_rate": 4.8968105443559194e-05, "loss": 0.509, "step": 842 }, { "epoch": 0.5129297231518102, "grad_norm": 1.3174622058868408, "learning_rate": 4.896456279497312e-05, "loss": 0.4918, "step": 843 }, { "epoch": 0.5135381807118954, "grad_norm": 1.2107031345367432, "learning_rate": 4.8961014204174384e-05, "loss": 0.5297, "step": 844 }, { "epoch": 0.5141466382719805, "grad_norm": 1.4777075052261353, "learning_rate": 4.895745967204286e-05, "loss": 0.4719, "step": 845 }, { "epoch": 0.5147550958320657, "grad_norm": 1.252469539642334, "learning_rate": 4.895389919945993e-05, "loss": 0.4674, "step": 846 }, { "epoch": 0.5153635533921509, "grad_norm": 1.2273253202438354, "learning_rate": 4.895033278730845e-05, "loss": 0.5269, "step": 847 }, { "epoch": 0.5159720109522361, "grad_norm": 1.6084344387054443, "learning_rate": 4.894676043647274e-05, "loss": 0.4133, "step": 848 }, { "epoch": 0.5165804685123213, "grad_norm": 0.9914146065711975, "learning_rate": 4.894318214783859e-05, "loss": 0.4391, "step": 849 }, { "epoch": 0.5171889260724064, "grad_norm": 1.3645412921905518, "learning_rate": 4.893959792229327e-05, "loss": 0.5509, "step": 850 }, { "epoch": 0.5177973836324916, "grad_norm": 1.2271888256072998, "learning_rate": 4.8936007760725514e-05, "loss": 0.5111, "step": 851 }, { "epoch": 0.5184058411925768, "grad_norm": 1.262268304824829, "learning_rate": 4.893241166402553e-05, "loss": 0.4809, "step": 852 }, { "epoch": 0.519014298752662, "grad_norm": 1.3610913753509521, "learning_rate": 4.892880963308502e-05, "loss": 0.5159, "step": 853 }, { "epoch": 0.5196227563127472, "grad_norm": 1.3238227367401123, "learning_rate": 4.8925201668797117e-05, "loss": 0.5162, "step": 854 }, { "epoch": 0.5202312138728323, "grad_norm": 1.2725099325180054, "learning_rate": 4.8921587772056444e-05, "loss": 0.4616, "step": 855 }, { "epoch": 0.5208396714329175, "grad_norm": 1.4523481130599976, "learning_rate": 4.8917967943759114e-05, "loss": 0.6087, "step": 856 }, { "epoch": 0.5214481289930027, "grad_norm": 1.192795753479004, "learning_rate": 4.8914342184802675e-05, "loss": 0.4805, "step": 857 }, { "epoch": 0.522056586553088, "grad_norm": 1.2161285877227783, "learning_rate": 4.891071049608618e-05, "loss": 0.481, "step": 858 }, { "epoch": 0.5226650441131732, "grad_norm": 1.391660213470459, "learning_rate": 4.890707287851013e-05, "loss": 0.4796, "step": 859 }, { "epoch": 0.5232735016732583, "grad_norm": 1.189489722251892, "learning_rate": 4.8903429332976494e-05, "loss": 0.5197, "step": 860 }, { "epoch": 0.5238819592333435, "grad_norm": 1.2670400142669678, "learning_rate": 4.889977986038874e-05, "loss": 0.4648, "step": 861 }, { "epoch": 0.5244904167934287, "grad_norm": 1.570553183555603, "learning_rate": 4.889612446165176e-05, "loss": 0.5102, "step": 862 }, { "epoch": 0.5250988743535139, "grad_norm": 1.1414783000946045, "learning_rate": 4.8892463137671963e-05, "loss": 0.4489, "step": 863 }, { "epoch": 0.525707331913599, "grad_norm": 1.4709701538085938, "learning_rate": 4.888879588935719e-05, "loss": 0.5205, "step": 864 }, { "epoch": 0.5263157894736842, "grad_norm": 1.218973159790039, "learning_rate": 4.888512271761677e-05, "loss": 0.466, "step": 865 }, { "epoch": 0.5269242470337694, "grad_norm": 1.2933622598648071, "learning_rate": 4.88814436233615e-05, "loss": 0.5473, "step": 866 }, { "epoch": 0.5275327045938546, "grad_norm": 1.085365891456604, "learning_rate": 4.887775860750363e-05, "loss": 0.4572, "step": 867 }, { "epoch": 0.5281411621539398, "grad_norm": 1.5779204368591309, "learning_rate": 4.8874067670956905e-05, "loss": 0.4428, "step": 868 }, { "epoch": 0.5287496197140249, "grad_norm": 1.36058509349823, "learning_rate": 4.887037081463652e-05, "loss": 0.5667, "step": 869 }, { "epoch": 0.5293580772741101, "grad_norm": 1.1790504455566406, "learning_rate": 4.886666803945914e-05, "loss": 0.5067, "step": 870 }, { "epoch": 0.5299665348341953, "grad_norm": 1.2920281887054443, "learning_rate": 4.886295934634289e-05, "loss": 0.5173, "step": 871 }, { "epoch": 0.5305749923942805, "grad_norm": 1.085190773010254, "learning_rate": 4.8859244736207395e-05, "loss": 0.4904, "step": 872 }, { "epoch": 0.5311834499543657, "grad_norm": 1.1711935997009277, "learning_rate": 4.885552420997369e-05, "loss": 0.4416, "step": 873 }, { "epoch": 0.5317919075144508, "grad_norm": 1.4009342193603516, "learning_rate": 4.885179776856435e-05, "loss": 0.529, "step": 874 }, { "epoch": 0.532400365074536, "grad_norm": 1.261432409286499, "learning_rate": 4.8848065412903335e-05, "loss": 0.5338, "step": 875 }, { "epoch": 0.5330088226346212, "grad_norm": 1.0821967124938965, "learning_rate": 4.884432714391613e-05, "loss": 0.4251, "step": 876 }, { "epoch": 0.5336172801947064, "grad_norm": 1.4930713176727295, "learning_rate": 4.884058296252969e-05, "loss": 0.5063, "step": 877 }, { "epoch": 0.5342257377547917, "grad_norm": 1.2942625284194946, "learning_rate": 4.88368328696724e-05, "loss": 0.4825, "step": 878 }, { "epoch": 0.5348341953148767, "grad_norm": 1.1449693441390991, "learning_rate": 4.883307686627412e-05, "loss": 0.4833, "step": 879 }, { "epoch": 0.535442652874962, "grad_norm": 1.2848173379898071, "learning_rate": 4.882931495326619e-05, "loss": 0.4981, "step": 880 }, { "epoch": 0.5360511104350472, "grad_norm": 1.0941145420074463, "learning_rate": 4.882554713158141e-05, "loss": 0.4457, "step": 881 }, { "epoch": 0.5366595679951324, "grad_norm": 1.2851606607437134, "learning_rate": 4.8821773402154025e-05, "loss": 0.4238, "step": 882 }, { "epoch": 0.5372680255552176, "grad_norm": 1.5043134689331055, "learning_rate": 4.881799376591979e-05, "loss": 0.53, "step": 883 }, { "epoch": 0.5378764831153027, "grad_norm": 1.2732433080673218, "learning_rate": 4.8814208223815886e-05, "loss": 0.5126, "step": 884 }, { "epoch": 0.5384849406753879, "grad_norm": 1.2558897733688354, "learning_rate": 4.8810416776780956e-05, "loss": 0.4542, "step": 885 }, { "epoch": 0.5390933982354731, "grad_norm": 1.1462931632995605, "learning_rate": 4.880661942575514e-05, "loss": 0.4536, "step": 886 }, { "epoch": 0.5397018557955583, "grad_norm": 1.2264448404312134, "learning_rate": 4.880281617168001e-05, "loss": 0.4497, "step": 887 }, { "epoch": 0.5403103133556434, "grad_norm": 1.3015364408493042, "learning_rate": 4.879900701549863e-05, "loss": 0.4697, "step": 888 }, { "epoch": 0.5409187709157286, "grad_norm": 1.5858356952667236, "learning_rate": 4.879519195815549e-05, "loss": 0.4573, "step": 889 }, { "epoch": 0.5415272284758138, "grad_norm": 1.2215701341629028, "learning_rate": 4.8791371000596585e-05, "loss": 0.5027, "step": 890 }, { "epoch": 0.542135686035899, "grad_norm": 1.3186434507369995, "learning_rate": 4.8787544143769335e-05, "loss": 0.5387, "step": 891 }, { "epoch": 0.5427441435959842, "grad_norm": 1.487970232963562, "learning_rate": 4.878371138862267e-05, "loss": 0.5038, "step": 892 }, { "epoch": 0.5433526011560693, "grad_norm": 1.4170433282852173, "learning_rate": 4.8779872736106916e-05, "loss": 0.4948, "step": 893 }, { "epoch": 0.5439610587161545, "grad_norm": 1.167901635169983, "learning_rate": 4.877602818717393e-05, "loss": 0.503, "step": 894 }, { "epoch": 0.5445695162762397, "grad_norm": 1.3846609592437744, "learning_rate": 4.877217774277698e-05, "loss": 0.6564, "step": 895 }, { "epoch": 0.5451779738363249, "grad_norm": 1.4311436414718628, "learning_rate": 4.876832140387082e-05, "loss": 0.5094, "step": 896 }, { "epoch": 0.5457864313964101, "grad_norm": 1.1872178316116333, "learning_rate": 4.876445917141167e-05, "loss": 0.4632, "step": 897 }, { "epoch": 0.5463948889564952, "grad_norm": 1.4088075160980225, "learning_rate": 4.8760591046357196e-05, "loss": 0.4989, "step": 898 }, { "epoch": 0.5470033465165804, "grad_norm": 1.2164874076843262, "learning_rate": 4.875671702966653e-05, "loss": 0.4258, "step": 899 }, { "epoch": 0.5476118040766657, "grad_norm": 1.2215385437011719, "learning_rate": 4.875283712230027e-05, "loss": 0.4936, "step": 900 }, { "epoch": 0.5482202616367509, "grad_norm": 1.4215302467346191, "learning_rate": 4.874895132522047e-05, "loss": 0.425, "step": 901 }, { "epoch": 0.5488287191968361, "grad_norm": 1.227898359298706, "learning_rate": 4.874505963939066e-05, "loss": 0.4306, "step": 902 }, { "epoch": 0.5494371767569212, "grad_norm": 1.2220474481582642, "learning_rate": 4.874116206577578e-05, "loss": 0.5173, "step": 903 }, { "epoch": 0.5500456343170064, "grad_norm": 1.1359342336654663, "learning_rate": 4.8737258605342304e-05, "loss": 0.4496, "step": 904 }, { "epoch": 0.5506540918770916, "grad_norm": 1.4966158866882324, "learning_rate": 4.8733349259058105e-05, "loss": 0.4165, "step": 905 }, { "epoch": 0.5512625494371768, "grad_norm": 1.28777277469635, "learning_rate": 4.872943402789255e-05, "loss": 0.5104, "step": 906 }, { "epoch": 0.551871006997262, "grad_norm": 1.30458664894104, "learning_rate": 4.872551291281644e-05, "loss": 0.4773, "step": 907 }, { "epoch": 0.5524794645573471, "grad_norm": 1.116233229637146, "learning_rate": 4.872158591480206e-05, "loss": 0.4654, "step": 908 }, { "epoch": 0.5530879221174323, "grad_norm": 1.341030240058899, "learning_rate": 4.871765303482314e-05, "loss": 0.4994, "step": 909 }, { "epoch": 0.5536963796775175, "grad_norm": 1.6074213981628418, "learning_rate": 4.871371427385486e-05, "loss": 0.4937, "step": 910 }, { "epoch": 0.5543048372376027, "grad_norm": 1.4934123754501343, "learning_rate": 4.870976963287389e-05, "loss": 0.4713, "step": 911 }, { "epoch": 0.5549132947976878, "grad_norm": 1.2505860328674316, "learning_rate": 4.8705819112858306e-05, "loss": 0.5438, "step": 912 }, { "epoch": 0.555521752357773, "grad_norm": 1.4453731775283813, "learning_rate": 4.8701862714787704e-05, "loss": 0.5272, "step": 913 }, { "epoch": 0.5561302099178582, "grad_norm": 1.0299216508865356, "learning_rate": 4.8697900439643087e-05, "loss": 0.4406, "step": 914 }, { "epoch": 0.5567386674779434, "grad_norm": 1.381816029548645, "learning_rate": 4.869393228840693e-05, "loss": 0.4839, "step": 915 }, { "epoch": 0.5573471250380286, "grad_norm": 1.1093368530273438, "learning_rate": 4.8689958262063186e-05, "loss": 0.4567, "step": 916 }, { "epoch": 0.5579555825981137, "grad_norm": 1.302557349205017, "learning_rate": 4.8685978361597234e-05, "loss": 0.4838, "step": 917 }, { "epoch": 0.5585640401581989, "grad_norm": 1.2300167083740234, "learning_rate": 4.868199258799593e-05, "loss": 0.4985, "step": 918 }, { "epoch": 0.5591724977182841, "grad_norm": 1.171228051185608, "learning_rate": 4.867800094224758e-05, "loss": 0.4577, "step": 919 }, { "epoch": 0.5597809552783694, "grad_norm": 1.348331093788147, "learning_rate": 4.867400342534194e-05, "loss": 0.4892, "step": 920 }, { "epoch": 0.5603894128384546, "grad_norm": 1.2311930656433105, "learning_rate": 4.8670000038270236e-05, "loss": 0.4981, "step": 921 }, { "epoch": 0.5609978703985397, "grad_norm": 1.3021697998046875, "learning_rate": 4.866599078202514e-05, "loss": 0.5145, "step": 922 }, { "epoch": 0.5616063279586249, "grad_norm": 1.426529049873352, "learning_rate": 4.8661975657600765e-05, "loss": 0.502, "step": 923 }, { "epoch": 0.5622147855187101, "grad_norm": 1.096643090248108, "learning_rate": 4.865795466599272e-05, "loss": 0.4567, "step": 924 }, { "epoch": 0.5628232430787953, "grad_norm": 1.1198285818099976, "learning_rate": 4.8653927808198024e-05, "loss": 0.4742, "step": 925 }, { "epoch": 0.5634317006388805, "grad_norm": 1.3766636848449707, "learning_rate": 4.8649895085215177e-05, "loss": 0.5223, "step": 926 }, { "epoch": 0.5640401581989656, "grad_norm": 1.1397812366485596, "learning_rate": 4.8645856498044125e-05, "loss": 0.4926, "step": 927 }, { "epoch": 0.5646486157590508, "grad_norm": 1.2435989379882812, "learning_rate": 4.8641812047686266e-05, "loss": 0.44, "step": 928 }, { "epoch": 0.565257073319136, "grad_norm": 1.5014331340789795, "learning_rate": 4.863776173514446e-05, "loss": 0.4282, "step": 929 }, { "epoch": 0.5658655308792212, "grad_norm": 1.1607805490493774, "learning_rate": 4.8633705561423014e-05, "loss": 0.4856, "step": 930 }, { "epoch": 0.5664739884393064, "grad_norm": 1.08901846408844, "learning_rate": 4.862964352752769e-05, "loss": 0.5092, "step": 931 }, { "epoch": 0.5670824459993915, "grad_norm": 1.114229440689087, "learning_rate": 4.862557563446571e-05, "loss": 0.4652, "step": 932 }, { "epoch": 0.5676909035594767, "grad_norm": 1.2063711881637573, "learning_rate": 4.862150188324573e-05, "loss": 0.4485, "step": 933 }, { "epoch": 0.5682993611195619, "grad_norm": 1.2078471183776855, "learning_rate": 4.861742227487788e-05, "loss": 0.5443, "step": 934 }, { "epoch": 0.5689078186796471, "grad_norm": 1.0555641651153564, "learning_rate": 4.861333681037372e-05, "loss": 0.4346, "step": 935 }, { "epoch": 0.5695162762397323, "grad_norm": 0.9378579258918762, "learning_rate": 4.8609245490746283e-05, "loss": 0.4181, "step": 936 }, { "epoch": 0.5701247337998174, "grad_norm": 1.4141374826431274, "learning_rate": 4.8605148317010054e-05, "loss": 0.4459, "step": 937 }, { "epoch": 0.5707331913599026, "grad_norm": 1.106641173362732, "learning_rate": 4.8601045290180946e-05, "loss": 0.4694, "step": 938 }, { "epoch": 0.5713416489199878, "grad_norm": 1.272030234336853, "learning_rate": 4.8596936411276354e-05, "loss": 0.5127, "step": 939 }, { "epoch": 0.571950106480073, "grad_norm": 1.3205722570419312, "learning_rate": 4.8592821681315096e-05, "loss": 0.4937, "step": 940 }, { "epoch": 0.5725585640401581, "grad_norm": 1.1618589162826538, "learning_rate": 4.858870110131746e-05, "loss": 0.5017, "step": 941 }, { "epoch": 0.5731670216002434, "grad_norm": 1.2044800519943237, "learning_rate": 4.858457467230517e-05, "loss": 0.4831, "step": 942 }, { "epoch": 0.5737754791603286, "grad_norm": 1.5071766376495361, "learning_rate": 4.858044239530143e-05, "loss": 0.5215, "step": 943 }, { "epoch": 0.5743839367204138, "grad_norm": 1.1561444997787476, "learning_rate": 4.857630427133084e-05, "loss": 0.4897, "step": 944 }, { "epoch": 0.574992394280499, "grad_norm": 1.1617779731750488, "learning_rate": 4.857216030141951e-05, "loss": 0.4417, "step": 945 }, { "epoch": 0.5756008518405841, "grad_norm": 1.083703637123108, "learning_rate": 4.8568010486594964e-05, "loss": 0.4986, "step": 946 }, { "epoch": 0.5762093094006693, "grad_norm": 1.3023382425308228, "learning_rate": 4.8563854827886166e-05, "loss": 0.5696, "step": 947 }, { "epoch": 0.5768177669607545, "grad_norm": 1.446385383605957, "learning_rate": 4.855969332632357e-05, "loss": 0.49, "step": 948 }, { "epoch": 0.5774262245208397, "grad_norm": 1.1349619626998901, "learning_rate": 4.8555525982939046e-05, "loss": 0.5179, "step": 949 }, { "epoch": 0.5780346820809249, "grad_norm": 1.1926076412200928, "learning_rate": 4.855135279876592e-05, "loss": 0.4852, "step": 950 }, { "epoch": 0.57864313964101, "grad_norm": 1.0169259309768677, "learning_rate": 4.8547173774838975e-05, "loss": 0.4002, "step": 951 }, { "epoch": 0.5792515972010952, "grad_norm": 1.2183793783187866, "learning_rate": 4.854298891219441e-05, "loss": 0.5109, "step": 952 }, { "epoch": 0.5798600547611804, "grad_norm": 1.2902944087982178, "learning_rate": 4.853879821186993e-05, "loss": 0.5235, "step": 953 }, { "epoch": 0.5804685123212656, "grad_norm": 1.0406612157821655, "learning_rate": 4.8534601674904635e-05, "loss": 0.4147, "step": 954 }, { "epoch": 0.5810769698813508, "grad_norm": 1.2372276782989502, "learning_rate": 4.85303993023391e-05, "loss": 0.4658, "step": 955 }, { "epoch": 0.5816854274414359, "grad_norm": 1.1386033296585083, "learning_rate": 4.852619109521533e-05, "loss": 0.4555, "step": 956 }, { "epoch": 0.5822938850015211, "grad_norm": 1.4579136371612549, "learning_rate": 4.8521977054576783e-05, "loss": 0.4944, "step": 957 }, { "epoch": 0.5829023425616063, "grad_norm": 1.3394607305526733, "learning_rate": 4.851775718146838e-05, "loss": 0.5533, "step": 958 }, { "epoch": 0.5835108001216915, "grad_norm": 1.263627290725708, "learning_rate": 4.851353147693646e-05, "loss": 0.5234, "step": 959 }, { "epoch": 0.5841192576817767, "grad_norm": 1.30045747756958, "learning_rate": 4.850929994202882e-05, "loss": 0.5035, "step": 960 }, { "epoch": 0.5847277152418618, "grad_norm": 1.3387147188186646, "learning_rate": 4.8505062577794716e-05, "loss": 0.4725, "step": 961 }, { "epoch": 0.585336172801947, "grad_norm": 1.1547223329544067, "learning_rate": 4.8500819385284835e-05, "loss": 0.4985, "step": 962 }, { "epoch": 0.5859446303620323, "grad_norm": 1.1327009201049805, "learning_rate": 4.849657036555131e-05, "loss": 0.4579, "step": 963 }, { "epoch": 0.5865530879221175, "grad_norm": 0.9808380007743835, "learning_rate": 4.849231551964771e-05, "loss": 0.4366, "step": 964 }, { "epoch": 0.5871615454822026, "grad_norm": 1.2232143878936768, "learning_rate": 4.848805484862908e-05, "loss": 0.4611, "step": 965 }, { "epoch": 0.5877700030422878, "grad_norm": 1.2399910688400269, "learning_rate": 4.8483788353551876e-05, "loss": 0.5725, "step": 966 }, { "epoch": 0.588378460602373, "grad_norm": 1.1158905029296875, "learning_rate": 4.8479516035474003e-05, "loss": 0.4809, "step": 967 }, { "epoch": 0.5889869181624582, "grad_norm": 1.1464722156524658, "learning_rate": 4.8475237895454833e-05, "loss": 0.4469, "step": 968 }, { "epoch": 0.5895953757225434, "grad_norm": 1.3894585371017456, "learning_rate": 4.847095393455516e-05, "loss": 0.4433, "step": 969 }, { "epoch": 0.5902038332826285, "grad_norm": 1.2021290063858032, "learning_rate": 4.846666415383724e-05, "loss": 0.4745, "step": 970 }, { "epoch": 0.5908122908427137, "grad_norm": 1.128505825996399, "learning_rate": 4.846236855436473e-05, "loss": 0.3727, "step": 971 }, { "epoch": 0.5914207484027989, "grad_norm": 1.2462071180343628, "learning_rate": 4.845806713720279e-05, "loss": 0.5052, "step": 972 }, { "epoch": 0.5920292059628841, "grad_norm": 1.1397188901901245, "learning_rate": 4.845375990341798e-05, "loss": 0.447, "step": 973 }, { "epoch": 0.5926376635229693, "grad_norm": 1.1206566095352173, "learning_rate": 4.844944685407831e-05, "loss": 0.4872, "step": 974 }, { "epoch": 0.5932461210830544, "grad_norm": 1.167952299118042, "learning_rate": 4.8445127990253244e-05, "loss": 0.489, "step": 975 }, { "epoch": 0.5938545786431396, "grad_norm": 1.4353526830673218, "learning_rate": 4.844080331301368e-05, "loss": 0.5701, "step": 976 }, { "epoch": 0.5944630362032248, "grad_norm": 1.1396512985229492, "learning_rate": 4.843647282343195e-05, "loss": 0.4253, "step": 977 }, { "epoch": 0.59507149376331, "grad_norm": 1.1176103353500366, "learning_rate": 4.843213652258185e-05, "loss": 0.4273, "step": 978 }, { "epoch": 0.5956799513233952, "grad_norm": 0.9949272274971008, "learning_rate": 4.842779441153858e-05, "loss": 0.4386, "step": 979 }, { "epoch": 0.5962884088834803, "grad_norm": 1.0839688777923584, "learning_rate": 4.842344649137882e-05, "loss": 0.4394, "step": 980 }, { "epoch": 0.5968968664435655, "grad_norm": 1.2163331508636475, "learning_rate": 4.8419092763180673e-05, "loss": 0.4894, "step": 981 }, { "epoch": 0.5975053240036508, "grad_norm": 1.1652277708053589, "learning_rate": 4.841473322802367e-05, "loss": 0.4745, "step": 982 }, { "epoch": 0.598113781563736, "grad_norm": 1.3939238786697388, "learning_rate": 4.84103678869888e-05, "loss": 0.5641, "step": 983 }, { "epoch": 0.5987222391238212, "grad_norm": 1.0871104001998901, "learning_rate": 4.840599674115849e-05, "loss": 0.457, "step": 984 }, { "epoch": 0.5993306966839063, "grad_norm": 1.193232536315918, "learning_rate": 4.8401619791616595e-05, "loss": 0.5823, "step": 985 }, { "epoch": 0.5999391542439915, "grad_norm": 1.1120039224624634, "learning_rate": 4.839723703944842e-05, "loss": 0.464, "step": 986 }, { "epoch": 0.6005476118040767, "grad_norm": 1.0750576257705688, "learning_rate": 4.8392848485740706e-05, "loss": 0.5022, "step": 987 }, { "epoch": 0.6011560693641619, "grad_norm": 1.062077522277832, "learning_rate": 4.838845413158162e-05, "loss": 0.4077, "step": 988 }, { "epoch": 0.601764526924247, "grad_norm": 1.1485848426818848, "learning_rate": 4.83840539780608e-05, "loss": 0.4517, "step": 989 }, { "epoch": 0.6023729844843322, "grad_norm": 1.139553427696228, "learning_rate": 4.837964802626929e-05, "loss": 0.5155, "step": 990 }, { "epoch": 0.6029814420444174, "grad_norm": 1.2427302598953247, "learning_rate": 4.8375236277299575e-05, "loss": 0.4747, "step": 991 }, { "epoch": 0.6035898996045026, "grad_norm": 1.172774076461792, "learning_rate": 4.837081873224559e-05, "loss": 0.5702, "step": 992 }, { "epoch": 0.6041983571645878, "grad_norm": 1.059998869895935, "learning_rate": 4.83663953922027e-05, "loss": 0.4608, "step": 993 }, { "epoch": 0.6048068147246729, "grad_norm": 1.0970420837402344, "learning_rate": 4.836196625826772e-05, "loss": 0.4902, "step": 994 }, { "epoch": 0.6054152722847581, "grad_norm": 1.263503909111023, "learning_rate": 4.835753133153888e-05, "loss": 0.4825, "step": 995 }, { "epoch": 0.6060237298448433, "grad_norm": 1.0827324390411377, "learning_rate": 4.8353090613115856e-05, "loss": 0.4678, "step": 996 }, { "epoch": 0.6066321874049285, "grad_norm": 1.2301872968673706, "learning_rate": 4.8348644104099773e-05, "loss": 0.4315, "step": 997 }, { "epoch": 0.6072406449650137, "grad_norm": 1.1217997074127197, "learning_rate": 4.834419180559317e-05, "loss": 0.4979, "step": 998 }, { "epoch": 0.6078491025250988, "grad_norm": 0.9536124467849731, "learning_rate": 4.833973371870003e-05, "loss": 0.4011, "step": 999 }, { "epoch": 0.608457560085184, "grad_norm": 1.1605530977249146, "learning_rate": 4.833526984452578e-05, "loss": 0.5327, "step": 1000 }, { "epoch": 0.6090660176452692, "grad_norm": 1.1049379110336304, "learning_rate": 4.833080018417726e-05, "loss": 0.4454, "step": 1001 }, { "epoch": 0.6096744752053544, "grad_norm": 1.0455185174942017, "learning_rate": 4.8326324738762774e-05, "loss": 0.4685, "step": 1002 }, { "epoch": 0.6102829327654397, "grad_norm": 1.3146376609802246, "learning_rate": 4.832184350939205e-05, "loss": 0.465, "step": 1003 }, { "epoch": 0.6108913903255248, "grad_norm": 1.5373934507369995, "learning_rate": 4.831735649717623e-05, "loss": 0.5181, "step": 1004 }, { "epoch": 0.61149984788561, "grad_norm": 1.0241584777832031, "learning_rate": 4.831286370322792e-05, "loss": 0.4116, "step": 1005 }, { "epoch": 0.6121083054456952, "grad_norm": 1.1920700073242188, "learning_rate": 4.830836512866113e-05, "loss": 0.5441, "step": 1006 }, { "epoch": 0.6127167630057804, "grad_norm": 1.1300125122070312, "learning_rate": 4.8303860774591336e-05, "loss": 0.4439, "step": 1007 }, { "epoch": 0.6133252205658656, "grad_norm": 1.3509349822998047, "learning_rate": 4.8299350642135424e-05, "loss": 0.5142, "step": 1008 }, { "epoch": 0.6139336781259507, "grad_norm": 1.2040926218032837, "learning_rate": 4.8294834732411714e-05, "loss": 0.5396, "step": 1009 }, { "epoch": 0.6145421356860359, "grad_norm": 1.0937200784683228, "learning_rate": 4.829031304653997e-05, "loss": 0.4634, "step": 1010 }, { "epoch": 0.6151505932461211, "grad_norm": 1.4639860391616821, "learning_rate": 4.8285785585641375e-05, "loss": 0.4316, "step": 1011 }, { "epoch": 0.6157590508062063, "grad_norm": 1.1813201904296875, "learning_rate": 4.8281252350838557e-05, "loss": 0.4838, "step": 1012 }, { "epoch": 0.6163675083662915, "grad_norm": 1.0957978963851929, "learning_rate": 4.827671334325556e-05, "loss": 0.4231, "step": 1013 }, { "epoch": 0.6169759659263766, "grad_norm": 1.061135172843933, "learning_rate": 4.827216856401788e-05, "loss": 0.4508, "step": 1014 }, { "epoch": 0.6175844234864618, "grad_norm": 1.2017053365707397, "learning_rate": 4.826761801425243e-05, "loss": 0.4579, "step": 1015 }, { "epoch": 0.618192881046547, "grad_norm": 1.263437271118164, "learning_rate": 4.826306169508755e-05, "loss": 0.511, "step": 1016 }, { "epoch": 0.6188013386066322, "grad_norm": 1.6724522113800049, "learning_rate": 4.825849960765303e-05, "loss": 0.4669, "step": 1017 }, { "epoch": 0.6194097961667173, "grad_norm": 1.3626164197921753, "learning_rate": 4.825393175308006e-05, "loss": 0.5212, "step": 1018 }, { "epoch": 0.6200182537268025, "grad_norm": 1.1306309700012207, "learning_rate": 4.824935813250129e-05, "loss": 0.4786, "step": 1019 }, { "epoch": 0.6206267112868877, "grad_norm": 1.1889963150024414, "learning_rate": 4.824477874705079e-05, "loss": 0.4653, "step": 1020 }, { "epoch": 0.6212351688469729, "grad_norm": 1.1174501180648804, "learning_rate": 4.8240193597864044e-05, "loss": 0.4382, "step": 1021 }, { "epoch": 0.6218436264070581, "grad_norm": 1.1703723669052124, "learning_rate": 4.8235602686077986e-05, "loss": 0.4405, "step": 1022 }, { "epoch": 0.6224520839671432, "grad_norm": 1.100953221321106, "learning_rate": 4.823100601283097e-05, "loss": 0.417, "step": 1023 }, { "epoch": 0.6230605415272285, "grad_norm": 0.9949542880058289, "learning_rate": 4.822640357926278e-05, "loss": 0.4109, "step": 1024 }, { "epoch": 0.6236689990873137, "grad_norm": 1.0636481046676636, "learning_rate": 4.8221795386514625e-05, "loss": 0.4349, "step": 1025 }, { "epoch": 0.6242774566473989, "grad_norm": 1.30801522731781, "learning_rate": 4.821718143572914e-05, "loss": 0.4903, "step": 1026 }, { "epoch": 0.6248859142074841, "grad_norm": 1.0893471240997314, "learning_rate": 4.821256172805041e-05, "loss": 0.4154, "step": 1027 }, { "epoch": 0.6254943717675692, "grad_norm": 1.0990405082702637, "learning_rate": 4.820793626462391e-05, "loss": 0.3986, "step": 1028 }, { "epoch": 0.6261028293276544, "grad_norm": 1.1979132890701294, "learning_rate": 4.8203305046596584e-05, "loss": 0.5231, "step": 1029 }, { "epoch": 0.6267112868877396, "grad_norm": 1.0785170793533325, "learning_rate": 4.8198668075116754e-05, "loss": 0.495, "step": 1030 }, { "epoch": 0.6273197444478248, "grad_norm": 1.0589004755020142, "learning_rate": 4.819402535133422e-05, "loss": 0.4546, "step": 1031 }, { "epoch": 0.62792820200791, "grad_norm": 1.189632534980774, "learning_rate": 4.818937687640016e-05, "loss": 0.4908, "step": 1032 }, { "epoch": 0.6285366595679951, "grad_norm": 1.1523140668869019, "learning_rate": 4.818472265146722e-05, "loss": 0.4496, "step": 1033 }, { "epoch": 0.6291451171280803, "grad_norm": 1.1302261352539062, "learning_rate": 4.818006267768945e-05, "loss": 0.4783, "step": 1034 }, { "epoch": 0.6297535746881655, "grad_norm": 1.0538748502731323, "learning_rate": 4.817539695622234e-05, "loss": 0.4308, "step": 1035 }, { "epoch": 0.6303620322482507, "grad_norm": 1.1539735794067383, "learning_rate": 4.817072548822277e-05, "loss": 0.4797, "step": 1036 }, { "epoch": 0.6309704898083359, "grad_norm": 1.2327778339385986, "learning_rate": 4.816604827484908e-05, "loss": 0.4814, "step": 1037 }, { "epoch": 0.631578947368421, "grad_norm": 1.306960105895996, "learning_rate": 4.816136531726104e-05, "loss": 0.5386, "step": 1038 }, { "epoch": 0.6321874049285062, "grad_norm": 1.0629810094833374, "learning_rate": 4.815667661661981e-05, "loss": 0.4891, "step": 1039 }, { "epoch": 0.6327958624885914, "grad_norm": 1.113374948501587, "learning_rate": 4.8151982174088e-05, "loss": 0.4601, "step": 1040 }, { "epoch": 0.6334043200486766, "grad_norm": 1.1330082416534424, "learning_rate": 4.814728199082962e-05, "loss": 0.4374, "step": 1041 }, { "epoch": 0.6340127776087617, "grad_norm": 1.10567307472229, "learning_rate": 4.8142576068010135e-05, "loss": 0.4555, "step": 1042 }, { "epoch": 0.6346212351688469, "grad_norm": 1.0396449565887451, "learning_rate": 4.813786440679642e-05, "loss": 0.4372, "step": 1043 }, { "epoch": 0.6352296927289321, "grad_norm": 1.0404661893844604, "learning_rate": 4.813314700835677e-05, "loss": 0.4632, "step": 1044 }, { "epoch": 0.6358381502890174, "grad_norm": 1.0946401357650757, "learning_rate": 4.8128423873860894e-05, "loss": 0.4613, "step": 1045 }, { "epoch": 0.6364466078491026, "grad_norm": 1.223552942276001, "learning_rate": 4.812369500447994e-05, "loss": 0.475, "step": 1046 }, { "epoch": 0.6370550654091877, "grad_norm": 1.0299303531646729, "learning_rate": 4.8118960401386466e-05, "loss": 0.4937, "step": 1047 }, { "epoch": 0.6376635229692729, "grad_norm": 1.256249189376831, "learning_rate": 4.811422006575446e-05, "loss": 0.4967, "step": 1048 }, { "epoch": 0.6382719805293581, "grad_norm": 1.0132943391799927, "learning_rate": 4.810947399875933e-05, "loss": 0.4349, "step": 1049 }, { "epoch": 0.6388804380894433, "grad_norm": 1.097622036933899, "learning_rate": 4.810472220157789e-05, "loss": 0.4606, "step": 1050 }, { "epoch": 0.6394888956495285, "grad_norm": 1.0819436311721802, "learning_rate": 4.80999646753884e-05, "loss": 0.4715, "step": 1051 }, { "epoch": 0.6400973532096136, "grad_norm": 1.1682971715927124, "learning_rate": 4.8095201421370515e-05, "loss": 0.477, "step": 1052 }, { "epoch": 0.6407058107696988, "grad_norm": 0.9915129542350769, "learning_rate": 4.8090432440705344e-05, "loss": 0.4397, "step": 1053 }, { "epoch": 0.641314268329784, "grad_norm": 1.028359293937683, "learning_rate": 4.8085657734575387e-05, "loss": 0.4533, "step": 1054 }, { "epoch": 0.6419227258898692, "grad_norm": 1.2199668884277344, "learning_rate": 4.8080877304164564e-05, "loss": 0.4632, "step": 1055 }, { "epoch": 0.6425311834499544, "grad_norm": 1.3181172609329224, "learning_rate": 4.807609115065823e-05, "loss": 0.5162, "step": 1056 }, { "epoch": 0.6431396410100395, "grad_norm": 1.1091372966766357, "learning_rate": 4.8071299275243145e-05, "loss": 0.4368, "step": 1057 }, { "epoch": 0.6437480985701247, "grad_norm": 1.1425464153289795, "learning_rate": 4.80665016791075e-05, "loss": 0.4589, "step": 1058 }, { "epoch": 0.6443565561302099, "grad_norm": 1.0934760570526123, "learning_rate": 4.80616983634409e-05, "loss": 0.4216, "step": 1059 }, { "epoch": 0.6449650136902951, "grad_norm": 1.2468703985214233, "learning_rate": 4.805688932943436e-05, "loss": 0.4859, "step": 1060 }, { "epoch": 0.6455734712503803, "grad_norm": 1.0578359365463257, "learning_rate": 4.805207457828034e-05, "loss": 0.4568, "step": 1061 }, { "epoch": 0.6461819288104654, "grad_norm": 1.2633030414581299, "learning_rate": 4.8047254111172665e-05, "loss": 0.4284, "step": 1062 }, { "epoch": 0.6467903863705506, "grad_norm": 1.122072458267212, "learning_rate": 4.804242792930663e-05, "loss": 0.4531, "step": 1063 }, { "epoch": 0.6473988439306358, "grad_norm": 1.2295277118682861, "learning_rate": 4.803759603387894e-05, "loss": 0.5087, "step": 1064 }, { "epoch": 0.648007301490721, "grad_norm": 1.3162295818328857, "learning_rate": 4.803275842608767e-05, "loss": 0.5562, "step": 1065 }, { "epoch": 0.6486157590508062, "grad_norm": 1.1886372566223145, "learning_rate": 4.802791510713237e-05, "loss": 0.4722, "step": 1066 }, { "epoch": 0.6492242166108914, "grad_norm": 1.1018664836883545, "learning_rate": 4.802306607821398e-05, "loss": 0.4603, "step": 1067 }, { "epoch": 0.6498326741709766, "grad_norm": 1.2671383619308472, "learning_rate": 4.8018211340534835e-05, "loss": 0.506, "step": 1068 }, { "epoch": 0.6504411317310618, "grad_norm": 1.1520271301269531, "learning_rate": 4.8013350895298735e-05, "loss": 0.465, "step": 1069 }, { "epoch": 0.651049589291147, "grad_norm": 1.2675180435180664, "learning_rate": 4.8008484743710854e-05, "loss": 0.4874, "step": 1070 }, { "epoch": 0.6516580468512321, "grad_norm": 1.073280692100525, "learning_rate": 4.80036128869778e-05, "loss": 0.439, "step": 1071 }, { "epoch": 0.6522665044113173, "grad_norm": 1.365029215812683, "learning_rate": 4.7998735326307585e-05, "loss": 0.3928, "step": 1072 }, { "epoch": 0.6528749619714025, "grad_norm": 1.06315016746521, "learning_rate": 4.799385206290965e-05, "loss": 0.4725, "step": 1073 }, { "epoch": 0.6534834195314877, "grad_norm": 1.0730098485946655, "learning_rate": 4.798896309799483e-05, "loss": 0.447, "step": 1074 }, { "epoch": 0.6540918770915729, "grad_norm": 1.4145798683166504, "learning_rate": 4.798406843277538e-05, "loss": 0.5263, "step": 1075 }, { "epoch": 0.654700334651658, "grad_norm": 1.3397396802902222, "learning_rate": 4.7979168068465e-05, "loss": 0.4561, "step": 1076 }, { "epoch": 0.6553087922117432, "grad_norm": 1.0366169214248657, "learning_rate": 4.7974262006278745e-05, "loss": 0.3994, "step": 1077 }, { "epoch": 0.6559172497718284, "grad_norm": 1.1666359901428223, "learning_rate": 4.796935024743313e-05, "loss": 0.4478, "step": 1078 }, { "epoch": 0.6565257073319136, "grad_norm": 1.1900277137756348, "learning_rate": 4.7964432793146065e-05, "loss": 0.4719, "step": 1079 }, { "epoch": 0.6571341648919988, "grad_norm": 1.5251827239990234, "learning_rate": 4.795950964463687e-05, "loss": 0.475, "step": 1080 }, { "epoch": 0.6577426224520839, "grad_norm": 1.298077940940857, "learning_rate": 4.795458080312628e-05, "loss": 0.4526, "step": 1081 }, { "epoch": 0.6583510800121691, "grad_norm": 1.0009061098098755, "learning_rate": 4.794964626983646e-05, "loss": 0.3941, "step": 1082 }, { "epoch": 0.6589595375722543, "grad_norm": 1.320255994796753, "learning_rate": 4.794470604599093e-05, "loss": 0.54, "step": 1083 }, { "epoch": 0.6595679951323395, "grad_norm": 1.177507758140564, "learning_rate": 4.79397601328147e-05, "loss": 0.4851, "step": 1084 }, { "epoch": 0.6601764526924248, "grad_norm": 1.2261627912521362, "learning_rate": 4.793480853153412e-05, "loss": 0.4907, "step": 1085 }, { "epoch": 0.6607849102525098, "grad_norm": 1.0971055030822754, "learning_rate": 4.792985124337701e-05, "loss": 0.4562, "step": 1086 }, { "epoch": 0.6613933678125951, "grad_norm": 1.3122934103012085, "learning_rate": 4.7924888269572545e-05, "loss": 0.5057, "step": 1087 }, { "epoch": 0.6620018253726803, "grad_norm": 1.1012256145477295, "learning_rate": 4.791991961135135e-05, "loss": 0.4912, "step": 1088 }, { "epoch": 0.6626102829327655, "grad_norm": 1.0547481775283813, "learning_rate": 4.791494526994544e-05, "loss": 0.4898, "step": 1089 }, { "epoch": 0.6632187404928507, "grad_norm": 1.085845708847046, "learning_rate": 4.790996524658824e-05, "loss": 0.4328, "step": 1090 }, { "epoch": 0.6638271980529358, "grad_norm": 1.1783792972564697, "learning_rate": 4.790497954251459e-05, "loss": 0.4246, "step": 1091 }, { "epoch": 0.664435655613021, "grad_norm": 1.1917533874511719, "learning_rate": 4.789998815896075e-05, "loss": 0.494, "step": 1092 }, { "epoch": 0.6650441131731062, "grad_norm": 1.0818932056427002, "learning_rate": 4.7894991097164366e-05, "loss": 0.5001, "step": 1093 }, { "epoch": 0.6656525707331914, "grad_norm": 1.1833261251449585, "learning_rate": 4.788998835836449e-05, "loss": 0.5394, "step": 1094 }, { "epoch": 0.6662610282932765, "grad_norm": 1.4886353015899658, "learning_rate": 4.788497994380162e-05, "loss": 0.3904, "step": 1095 }, { "epoch": 0.6668694858533617, "grad_norm": 1.6799591779708862, "learning_rate": 4.78799658547176e-05, "loss": 0.478, "step": 1096 }, { "epoch": 0.6674779434134469, "grad_norm": 1.1299816370010376, "learning_rate": 4.787494609235575e-05, "loss": 0.5295, "step": 1097 }, { "epoch": 0.6680864009735321, "grad_norm": 1.024692177772522, "learning_rate": 4.786992065796072e-05, "loss": 0.4684, "step": 1098 }, { "epoch": 0.6686948585336173, "grad_norm": 1.1255176067352295, "learning_rate": 4.786488955277865e-05, "loss": 0.455, "step": 1099 }, { "epoch": 0.6693033160937024, "grad_norm": 1.1396479606628418, "learning_rate": 4.7859852778057016e-05, "loss": 0.5006, "step": 1100 }, { "epoch": 0.6699117736537876, "grad_norm": 1.0285197496414185, "learning_rate": 4.7854810335044745e-05, "loss": 0.4194, "step": 1101 }, { "epoch": 0.6705202312138728, "grad_norm": 0.9936107993125916, "learning_rate": 4.7849762224992144e-05, "loss": 0.4164, "step": 1102 }, { "epoch": 0.671128688773958, "grad_norm": 1.3650137186050415, "learning_rate": 4.784470844915093e-05, "loss": 0.4367, "step": 1103 }, { "epoch": 0.6717371463340432, "grad_norm": 1.2089672088623047, "learning_rate": 4.783964900877425e-05, "loss": 0.491, "step": 1104 }, { "epoch": 0.6723456038941283, "grad_norm": 1.2130366563796997, "learning_rate": 4.78345839051166e-05, "loss": 0.4589, "step": 1105 }, { "epoch": 0.6729540614542135, "grad_norm": 1.3927088975906372, "learning_rate": 4.782951313943395e-05, "loss": 0.5312, "step": 1106 }, { "epoch": 0.6735625190142988, "grad_norm": 1.0126924514770508, "learning_rate": 4.782443671298362e-05, "loss": 0.4162, "step": 1107 }, { "epoch": 0.674170976574384, "grad_norm": 1.0403350591659546, "learning_rate": 4.781935462702435e-05, "loss": 0.407, "step": 1108 }, { "epoch": 0.6747794341344692, "grad_norm": 1.1724456548690796, "learning_rate": 4.7814266882816296e-05, "loss": 0.4615, "step": 1109 }, { "epoch": 0.6753878916945543, "grad_norm": 1.178318977355957, "learning_rate": 4.780917348162099e-05, "loss": 0.5085, "step": 1110 }, { "epoch": 0.6759963492546395, "grad_norm": 1.088005542755127, "learning_rate": 4.7804074424701406e-05, "loss": 0.4744, "step": 1111 }, { "epoch": 0.6766048068147247, "grad_norm": 1.1180678606033325, "learning_rate": 4.7798969713321874e-05, "loss": 0.4814, "step": 1112 }, { "epoch": 0.6772132643748099, "grad_norm": 1.1669161319732666, "learning_rate": 4.779385934874817e-05, "loss": 0.4187, "step": 1113 }, { "epoch": 0.6778217219348951, "grad_norm": 1.0521961450576782, "learning_rate": 4.7788743332247437e-05, "loss": 0.459, "step": 1114 }, { "epoch": 0.6784301794949802, "grad_norm": 1.1008024215698242, "learning_rate": 4.778362166508824e-05, "loss": 0.5048, "step": 1115 }, { "epoch": 0.6790386370550654, "grad_norm": 1.0038844347000122, "learning_rate": 4.777849434854054e-05, "loss": 0.3917, "step": 1116 }, { "epoch": 0.6796470946151506, "grad_norm": 1.1488128900527954, "learning_rate": 4.7773361383875697e-05, "loss": 0.4351, "step": 1117 }, { "epoch": 0.6802555521752358, "grad_norm": 1.1399554014205933, "learning_rate": 4.7768222772366466e-05, "loss": 0.4365, "step": 1118 }, { "epoch": 0.6808640097353209, "grad_norm": 1.3681772947311401, "learning_rate": 4.776307851528702e-05, "loss": 0.5406, "step": 1119 }, { "epoch": 0.6814724672954061, "grad_norm": 1.2079730033874512, "learning_rate": 4.7757928613912914e-05, "loss": 0.4459, "step": 1120 }, { "epoch": 0.6820809248554913, "grad_norm": 1.1010768413543701, "learning_rate": 4.7752773069521104e-05, "loss": 0.42, "step": 1121 }, { "epoch": 0.6826893824155765, "grad_norm": 1.095807433128357, "learning_rate": 4.774761188338995e-05, "loss": 0.4267, "step": 1122 }, { "epoch": 0.6832978399756617, "grad_norm": 1.1266393661499023, "learning_rate": 4.774244505679923e-05, "loss": 0.4663, "step": 1123 }, { "epoch": 0.6839062975357468, "grad_norm": 1.1465984582901, "learning_rate": 4.773727259103008e-05, "loss": 0.4362, "step": 1124 }, { "epoch": 0.684514755095832, "grad_norm": 1.1885544061660767, "learning_rate": 4.7732094487365065e-05, "loss": 0.4638, "step": 1125 }, { "epoch": 0.6851232126559172, "grad_norm": 1.164554476737976, "learning_rate": 4.772691074708814e-05, "loss": 0.409, "step": 1126 }, { "epoch": 0.6857316702160025, "grad_norm": 1.0508933067321777, "learning_rate": 4.7721721371484654e-05, "loss": 0.4334, "step": 1127 }, { "epoch": 0.6863401277760877, "grad_norm": 1.0762606859207153, "learning_rate": 4.771652636184135e-05, "loss": 0.4735, "step": 1128 }, { "epoch": 0.6869485853361728, "grad_norm": 1.0883415937423706, "learning_rate": 4.771132571944639e-05, "loss": 0.4749, "step": 1129 }, { "epoch": 0.687557042896258, "grad_norm": 1.061272144317627, "learning_rate": 4.770611944558929e-05, "loss": 0.4797, "step": 1130 }, { "epoch": 0.6881655004563432, "grad_norm": 1.0837888717651367, "learning_rate": 4.770090754156102e-05, "loss": 0.516, "step": 1131 }, { "epoch": 0.6887739580164284, "grad_norm": 1.197726845741272, "learning_rate": 4.7695690008653896e-05, "loss": 0.4487, "step": 1132 }, { "epoch": 0.6893824155765136, "grad_norm": 0.9252657294273376, "learning_rate": 4.769046684816165e-05, "loss": 0.3675, "step": 1133 }, { "epoch": 0.6899908731365987, "grad_norm": 1.079782247543335, "learning_rate": 4.768523806137941e-05, "loss": 0.4489, "step": 1134 }, { "epoch": 0.6905993306966839, "grad_norm": 1.2456952333450317, "learning_rate": 4.76800036496037e-05, "loss": 0.5507, "step": 1135 }, { "epoch": 0.6912077882567691, "grad_norm": 1.1496522426605225, "learning_rate": 4.7674763614132434e-05, "loss": 0.496, "step": 1136 }, { "epoch": 0.6918162458168543, "grad_norm": 1.1047946214675903, "learning_rate": 4.766951795626493e-05, "loss": 0.4656, "step": 1137 }, { "epoch": 0.6924247033769395, "grad_norm": 1.0433160066604614, "learning_rate": 4.7664266677301874e-05, "loss": 0.4197, "step": 1138 }, { "epoch": 0.6930331609370246, "grad_norm": 1.2909495830535889, "learning_rate": 4.7659009778545384e-05, "loss": 0.4733, "step": 1139 }, { "epoch": 0.6936416184971098, "grad_norm": 1.110093355178833, "learning_rate": 4.765374726129893e-05, "loss": 0.4824, "step": 1140 }, { "epoch": 0.694250076057195, "grad_norm": 1.285891056060791, "learning_rate": 4.764847912686742e-05, "loss": 0.485, "step": 1141 }, { "epoch": 0.6948585336172802, "grad_norm": 1.0111443996429443, "learning_rate": 4.764320537655712e-05, "loss": 0.4081, "step": 1142 }, { "epoch": 0.6954669911773653, "grad_norm": 1.2149741649627686, "learning_rate": 4.76379260116757e-05, "loss": 0.5088, "step": 1143 }, { "epoch": 0.6960754487374505, "grad_norm": 1.2215123176574707, "learning_rate": 4.7632641033532226e-05, "loss": 0.4491, "step": 1144 }, { "epoch": 0.6966839062975357, "grad_norm": 1.1258493661880493, "learning_rate": 4.762735044343715e-05, "loss": 0.4473, "step": 1145 }, { "epoch": 0.697292363857621, "grad_norm": 1.2326325178146362, "learning_rate": 4.7622054242702316e-05, "loss": 0.4819, "step": 1146 }, { "epoch": 0.6979008214177062, "grad_norm": 1.06281316280365, "learning_rate": 4.761675243264097e-05, "loss": 0.4228, "step": 1147 }, { "epoch": 0.6985092789777912, "grad_norm": 1.1261435747146606, "learning_rate": 4.761144501456773e-05, "loss": 0.4255, "step": 1148 }, { "epoch": 0.6991177365378765, "grad_norm": 1.1780974864959717, "learning_rate": 4.760613198979862e-05, "loss": 0.4302, "step": 1149 }, { "epoch": 0.6997261940979617, "grad_norm": 1.119859218597412, "learning_rate": 4.760081335965104e-05, "loss": 0.4293, "step": 1150 }, { "epoch": 0.7003346516580469, "grad_norm": 1.2447001934051514, "learning_rate": 4.75954891254438e-05, "loss": 0.4851, "step": 1151 }, { "epoch": 0.7009431092181321, "grad_norm": 1.1865140199661255, "learning_rate": 4.759015928849709e-05, "loss": 0.4618, "step": 1152 }, { "epoch": 0.7015515667782172, "grad_norm": 1.0534874200820923, "learning_rate": 4.758482385013247e-05, "loss": 0.4203, "step": 1153 }, { "epoch": 0.7021600243383024, "grad_norm": 0.9607533812522888, "learning_rate": 4.757948281167292e-05, "loss": 0.368, "step": 1154 }, { "epoch": 0.7027684818983876, "grad_norm": 1.1073296070098877, "learning_rate": 4.75741361744428e-05, "loss": 0.4375, "step": 1155 }, { "epoch": 0.7033769394584728, "grad_norm": 1.2616397142410278, "learning_rate": 4.756878393976783e-05, "loss": 0.5214, "step": 1156 }, { "epoch": 0.703985397018558, "grad_norm": 1.152910590171814, "learning_rate": 4.756342610897517e-05, "loss": 0.5083, "step": 1157 }, { "epoch": 0.7045938545786431, "grad_norm": 1.2775417566299438, "learning_rate": 4.7558062683393314e-05, "loss": 0.5371, "step": 1158 }, { "epoch": 0.7052023121387283, "grad_norm": 1.0952919721603394, "learning_rate": 4.755269366435219e-05, "loss": 0.4826, "step": 1159 }, { "epoch": 0.7058107696988135, "grad_norm": 1.0538053512573242, "learning_rate": 4.754731905318307e-05, "loss": 0.4789, "step": 1160 }, { "epoch": 0.7064192272588987, "grad_norm": 1.293050765991211, "learning_rate": 4.754193885121865e-05, "loss": 0.5337, "step": 1161 }, { "epoch": 0.7070276848189839, "grad_norm": 1.0085028409957886, "learning_rate": 4.7536553059792984e-05, "loss": 0.4581, "step": 1162 }, { "epoch": 0.707636142379069, "grad_norm": 1.063040852546692, "learning_rate": 4.753116168024153e-05, "loss": 0.5102, "step": 1163 }, { "epoch": 0.7082445999391542, "grad_norm": 0.9961603283882141, "learning_rate": 4.752576471390112e-05, "loss": 0.4818, "step": 1164 }, { "epoch": 0.7088530574992394, "grad_norm": 1.0528628826141357, "learning_rate": 4.7520362162109986e-05, "loss": 0.4396, "step": 1165 }, { "epoch": 0.7094615150593246, "grad_norm": 1.2712112665176392, "learning_rate": 4.751495402620774e-05, "loss": 0.496, "step": 1166 }, { "epoch": 0.7100699726194099, "grad_norm": 1.0716404914855957, "learning_rate": 4.750954030753535e-05, "loss": 0.4065, "step": 1167 }, { "epoch": 0.710678430179495, "grad_norm": 1.2594822645187378, "learning_rate": 4.7504121007435224e-05, "loss": 0.5146, "step": 1168 }, { "epoch": 0.7112868877395802, "grad_norm": 1.2636467218399048, "learning_rate": 4.749869612725108e-05, "loss": 0.5225, "step": 1169 }, { "epoch": 0.7118953452996654, "grad_norm": 1.1434320211410522, "learning_rate": 4.749326566832811e-05, "loss": 0.4127, "step": 1170 }, { "epoch": 0.7125038028597506, "grad_norm": 1.0728880167007446, "learning_rate": 4.7487829632012816e-05, "loss": 0.4583, "step": 1171 }, { "epoch": 0.7131122604198357, "grad_norm": 1.2107247114181519, "learning_rate": 4.7482388019653114e-05, "loss": 0.4255, "step": 1172 }, { "epoch": 0.7137207179799209, "grad_norm": 1.1021230220794678, "learning_rate": 4.7476940832598295e-05, "loss": 0.4829, "step": 1173 }, { "epoch": 0.7143291755400061, "grad_norm": 1.2605928182601929, "learning_rate": 4.747148807219902e-05, "loss": 0.4636, "step": 1174 }, { "epoch": 0.7149376331000913, "grad_norm": 1.0456511974334717, "learning_rate": 4.746602973980738e-05, "loss": 0.4248, "step": 1175 }, { "epoch": 0.7155460906601765, "grad_norm": 1.1637083292007446, "learning_rate": 4.746056583677678e-05, "loss": 0.4544, "step": 1176 }, { "epoch": 0.7161545482202616, "grad_norm": 1.1425434350967407, "learning_rate": 4.745509636446207e-05, "loss": 0.4557, "step": 1177 }, { "epoch": 0.7167630057803468, "grad_norm": 1.1743497848510742, "learning_rate": 4.744962132421943e-05, "loss": 0.425, "step": 1178 }, { "epoch": 0.717371463340432, "grad_norm": 1.1483440399169922, "learning_rate": 4.744414071740644e-05, "loss": 0.4819, "step": 1179 }, { "epoch": 0.7179799209005172, "grad_norm": 1.0176396369934082, "learning_rate": 4.7438654545382076e-05, "loss": 0.4357, "step": 1180 }, { "epoch": 0.7185883784606024, "grad_norm": 1.065658450126648, "learning_rate": 4.743316280950667e-05, "loss": 0.4108, "step": 1181 }, { "epoch": 0.7191968360206875, "grad_norm": 0.9383304119110107, "learning_rate": 4.7427665511141955e-05, "loss": 0.3916, "step": 1182 }, { "epoch": 0.7198052935807727, "grad_norm": 1.0549168586730957, "learning_rate": 4.7422162651651026e-05, "loss": 0.4165, "step": 1183 }, { "epoch": 0.7204137511408579, "grad_norm": 1.241140365600586, "learning_rate": 4.741665423239835e-05, "loss": 0.4203, "step": 1184 }, { "epoch": 0.7210222087009431, "grad_norm": 1.204376220703125, "learning_rate": 4.741114025474981e-05, "loss": 0.4497, "step": 1185 }, { "epoch": 0.7216306662610283, "grad_norm": 1.0280909538269043, "learning_rate": 4.7405620720072616e-05, "loss": 0.4068, "step": 1186 }, { "epoch": 0.7222391238211134, "grad_norm": 1.3876770734786987, "learning_rate": 4.74000956297354e-05, "loss": 0.4713, "step": 1187 }, { "epoch": 0.7228475813811986, "grad_norm": 1.0588303804397583, "learning_rate": 4.739456498510815e-05, "loss": 0.4042, "step": 1188 }, { "epoch": 0.7234560389412839, "grad_norm": 1.0324196815490723, "learning_rate": 4.738902878756224e-05, "loss": 0.469, "step": 1189 }, { "epoch": 0.7240644965013691, "grad_norm": 0.9513891935348511, "learning_rate": 4.738348703847041e-05, "loss": 0.4032, "step": 1190 }, { "epoch": 0.7246729540614543, "grad_norm": 1.0217005014419556, "learning_rate": 4.737793973920678e-05, "loss": 0.4112, "step": 1191 }, { "epoch": 0.7252814116215394, "grad_norm": 1.1146938800811768, "learning_rate": 4.737238689114686e-05, "loss": 0.4397, "step": 1192 }, { "epoch": 0.7258898691816246, "grad_norm": 1.017473816871643, "learning_rate": 4.736682849566751e-05, "loss": 0.4661, "step": 1193 }, { "epoch": 0.7264983267417098, "grad_norm": 1.3287296295166016, "learning_rate": 4.736126455414699e-05, "loss": 0.4159, "step": 1194 }, { "epoch": 0.727106784301795, "grad_norm": 1.1023236513137817, "learning_rate": 4.7355695067964925e-05, "loss": 0.425, "step": 1195 }, { "epoch": 0.7277152418618801, "grad_norm": 1.1281861066818237, "learning_rate": 4.735012003850232e-05, "loss": 0.4161, "step": 1196 }, { "epoch": 0.7283236994219653, "grad_norm": 1.031778335571289, "learning_rate": 4.734453946714154e-05, "loss": 0.4355, "step": 1197 }, { "epoch": 0.7289321569820505, "grad_norm": 1.101798176765442, "learning_rate": 4.733895335526633e-05, "loss": 0.4299, "step": 1198 }, { "epoch": 0.7295406145421357, "grad_norm": 1.27028489112854, "learning_rate": 4.7333361704261834e-05, "loss": 0.4291, "step": 1199 }, { "epoch": 0.7301490721022209, "grad_norm": 1.2124416828155518, "learning_rate": 4.732776451551453e-05, "loss": 0.4873, "step": 1200 }, { "epoch": 0.730757529662306, "grad_norm": 1.1454896926879883, "learning_rate": 4.73221617904123e-05, "loss": 0.4799, "step": 1201 }, { "epoch": 0.7313659872223912, "grad_norm": 1.1555596590042114, "learning_rate": 4.731655353034437e-05, "loss": 0.465, "step": 1202 }, { "epoch": 0.7319744447824764, "grad_norm": 1.1674630641937256, "learning_rate": 4.7310939736701364e-05, "loss": 0.4619, "step": 1203 }, { "epoch": 0.7325829023425616, "grad_norm": 1.320896863937378, "learning_rate": 4.7305320410875277e-05, "loss": 0.4513, "step": 1204 }, { "epoch": 0.7331913599026468, "grad_norm": 3.161492347717285, "learning_rate": 4.7299695554259455e-05, "loss": 0.512, "step": 1205 }, { "epoch": 0.7337998174627319, "grad_norm": 1.1867998838424683, "learning_rate": 4.729406516824864e-05, "loss": 0.4125, "step": 1206 }, { "epoch": 0.7344082750228171, "grad_norm": 1.1488641500473022, "learning_rate": 4.7288429254238917e-05, "loss": 0.4309, "step": 1207 }, { "epoch": 0.7350167325829023, "grad_norm": 1.1211802959442139, "learning_rate": 4.728278781362777e-05, "loss": 0.4359, "step": 1208 }, { "epoch": 0.7356251901429876, "grad_norm": 0.9918986558914185, "learning_rate": 4.7277140847814025e-05, "loss": 0.4088, "step": 1209 }, { "epoch": 0.7362336477030728, "grad_norm": 1.1625863313674927, "learning_rate": 4.7271488358197924e-05, "loss": 0.424, "step": 1210 }, { "epoch": 0.7368421052631579, "grad_norm": 1.432852029800415, "learning_rate": 4.726583034618103e-05, "loss": 0.4789, "step": 1211 }, { "epoch": 0.7374505628232431, "grad_norm": 1.154141902923584, "learning_rate": 4.7260166813166285e-05, "loss": 0.4522, "step": 1212 }, { "epoch": 0.7380590203833283, "grad_norm": 1.013265609741211, "learning_rate": 4.7254497760558024e-05, "loss": 0.3628, "step": 1213 }, { "epoch": 0.7386674779434135, "grad_norm": 1.169954538345337, "learning_rate": 4.724882318976194e-05, "loss": 0.4394, "step": 1214 }, { "epoch": 0.7392759355034987, "grad_norm": 1.1213183403015137, "learning_rate": 4.724314310218507e-05, "loss": 0.4598, "step": 1215 }, { "epoch": 0.7398843930635838, "grad_norm": 1.1155545711517334, "learning_rate": 4.723745749923586e-05, "loss": 0.4315, "step": 1216 }, { "epoch": 0.740492850623669, "grad_norm": 1.197703242301941, "learning_rate": 4.723176638232408e-05, "loss": 0.4351, "step": 1217 }, { "epoch": 0.7411013081837542, "grad_norm": 1.4502240419387817, "learning_rate": 4.7226069752860915e-05, "loss": 0.47, "step": 1218 }, { "epoch": 0.7417097657438394, "grad_norm": 1.4130961894989014, "learning_rate": 4.722036761225888e-05, "loss": 0.5292, "step": 1219 }, { "epoch": 0.7423182233039245, "grad_norm": 1.2829434871673584, "learning_rate": 4.7214659961931864e-05, "loss": 0.4497, "step": 1220 }, { "epoch": 0.7429266808640097, "grad_norm": 1.1895910501480103, "learning_rate": 4.720894680329513e-05, "loss": 0.3774, "step": 1221 }, { "epoch": 0.7435351384240949, "grad_norm": 1.1315902471542358, "learning_rate": 4.720322813776531e-05, "loss": 0.4519, "step": 1222 }, { "epoch": 0.7441435959841801, "grad_norm": 1.225305199623108, "learning_rate": 4.7197503966760375e-05, "loss": 0.5347, "step": 1223 }, { "epoch": 0.7447520535442653, "grad_norm": 1.2546653747558594, "learning_rate": 4.7191774291699695e-05, "loss": 0.4597, "step": 1224 }, { "epoch": 0.7453605111043504, "grad_norm": 0.880062997341156, "learning_rate": 4.7186039114004e-05, "loss": 0.3922, "step": 1225 }, { "epoch": 0.7459689686644356, "grad_norm": 1.0395137071609497, "learning_rate": 4.718029843509536e-05, "loss": 0.4635, "step": 1226 }, { "epoch": 0.7465774262245208, "grad_norm": 1.2031601667404175, "learning_rate": 4.717455225639723e-05, "loss": 0.3788, "step": 1227 }, { "epoch": 0.747185883784606, "grad_norm": 0.9248659014701843, "learning_rate": 4.716880057933441e-05, "loss": 0.4171, "step": 1228 }, { "epoch": 0.7477943413446912, "grad_norm": 0.9494261145591736, "learning_rate": 4.71630434053331e-05, "loss": 0.3695, "step": 1229 }, { "epoch": 0.7484027989047763, "grad_norm": 1.4120838642120361, "learning_rate": 4.715728073582082e-05, "loss": 0.4758, "step": 1230 }, { "epoch": 0.7490112564648616, "grad_norm": 1.4075794219970703, "learning_rate": 4.715151257222649e-05, "loss": 0.4857, "step": 1231 }, { "epoch": 0.7496197140249468, "grad_norm": 1.2198188304901123, "learning_rate": 4.7145738915980354e-05, "loss": 0.4428, "step": 1232 }, { "epoch": 0.750228171585032, "grad_norm": 1.3420535326004028, "learning_rate": 4.7139959768514044e-05, "loss": 0.4027, "step": 1233 }, { "epoch": 0.7508366291451172, "grad_norm": 1.2325292825698853, "learning_rate": 4.713417513126055e-05, "loss": 0.4469, "step": 1234 }, { "epoch": 0.7514450867052023, "grad_norm": 1.3523552417755127, "learning_rate": 4.712838500565423e-05, "loss": 0.4269, "step": 1235 }, { "epoch": 0.7520535442652875, "grad_norm": 1.099088191986084, "learning_rate": 4.712258939313078e-05, "loss": 0.4336, "step": 1236 }, { "epoch": 0.7526620018253727, "grad_norm": 1.248347520828247, "learning_rate": 4.7116788295127275e-05, "loss": 0.3627, "step": 1237 }, { "epoch": 0.7532704593854579, "grad_norm": 1.3930177688598633, "learning_rate": 4.711098171308214e-05, "loss": 0.4765, "step": 1238 }, { "epoch": 0.7538789169455431, "grad_norm": 0.9983875155448914, "learning_rate": 4.7105169648435176e-05, "loss": 0.4179, "step": 1239 }, { "epoch": 0.7544873745056282, "grad_norm": 1.0693212747573853, "learning_rate": 4.7099352102627536e-05, "loss": 0.4764, "step": 1240 }, { "epoch": 0.7550958320657134, "grad_norm": 1.1990723609924316, "learning_rate": 4.7093529077101714e-05, "loss": 0.4924, "step": 1241 }, { "epoch": 0.7557042896257986, "grad_norm": 1.0372933149337769, "learning_rate": 4.7087700573301585e-05, "loss": 0.4129, "step": 1242 }, { "epoch": 0.7563127471858838, "grad_norm": 1.0057023763656616, "learning_rate": 4.7081866592672376e-05, "loss": 0.4319, "step": 1243 }, { "epoch": 0.756921204745969, "grad_norm": 1.0604325532913208, "learning_rate": 4.7076027136660663e-05, "loss": 0.4533, "step": 1244 }, { "epoch": 0.7575296623060541, "grad_norm": 1.1443201303482056, "learning_rate": 4.70701822067144e-05, "loss": 0.4355, "step": 1245 }, { "epoch": 0.7581381198661393, "grad_norm": 1.1489462852478027, "learning_rate": 4.706433180428288e-05, "loss": 0.4098, "step": 1246 }, { "epoch": 0.7587465774262245, "grad_norm": 1.0587549209594727, "learning_rate": 4.705847593081676e-05, "loss": 0.3956, "step": 1247 }, { "epoch": 0.7593550349863097, "grad_norm": 1.0858579874038696, "learning_rate": 4.705261458776805e-05, "loss": 0.4166, "step": 1248 }, { "epoch": 0.7599634925463948, "grad_norm": 1.7913252115249634, "learning_rate": 4.704674777659012e-05, "loss": 0.4482, "step": 1249 }, { "epoch": 0.76057195010648, "grad_norm": 1.3128273487091064, "learning_rate": 4.70408754987377e-05, "loss": 0.4596, "step": 1250 }, { "epoch": 0.7611804076665653, "grad_norm": 1.0580440759658813, "learning_rate": 4.703499775566686e-05, "loss": 0.4517, "step": 1251 }, { "epoch": 0.7617888652266505, "grad_norm": 1.076398491859436, "learning_rate": 4.702911454883504e-05, "loss": 0.3933, "step": 1252 }, { "epoch": 0.7623973227867357, "grad_norm": 1.267801284790039, "learning_rate": 4.702322587970104e-05, "loss": 0.4953, "step": 1253 }, { "epoch": 0.7630057803468208, "grad_norm": 0.9255053400993347, "learning_rate": 4.701733174972498e-05, "loss": 0.3944, "step": 1254 }, { "epoch": 0.763614237906906, "grad_norm": 0.9950581192970276, "learning_rate": 4.7011432160368385e-05, "loss": 0.4811, "step": 1255 }, { "epoch": 0.7642226954669912, "grad_norm": 1.2969601154327393, "learning_rate": 4.7005527113094094e-05, "loss": 0.4454, "step": 1256 }, { "epoch": 0.7648311530270764, "grad_norm": 1.045771837234497, "learning_rate": 4.699961660936631e-05, "loss": 0.4248, "step": 1257 }, { "epoch": 0.7654396105871616, "grad_norm": 1.1103192567825317, "learning_rate": 4.69937006506506e-05, "loss": 0.402, "step": 1258 }, { "epoch": 0.7660480681472467, "grad_norm": 1.0572874546051025, "learning_rate": 4.698777923841386e-05, "loss": 0.4319, "step": 1259 }, { "epoch": 0.7666565257073319, "grad_norm": 1.1772733926773071, "learning_rate": 4.6981852374124384e-05, "loss": 0.4311, "step": 1260 }, { "epoch": 0.7672649832674171, "grad_norm": 1.0701271295547485, "learning_rate": 4.697592005925176e-05, "loss": 0.4412, "step": 1261 }, { "epoch": 0.7678734408275023, "grad_norm": 1.0132157802581787, "learning_rate": 4.696998229526696e-05, "loss": 0.3985, "step": 1262 }, { "epoch": 0.7684818983875875, "grad_norm": 1.0832394361495972, "learning_rate": 4.696403908364231e-05, "loss": 0.4161, "step": 1263 }, { "epoch": 0.7690903559476726, "grad_norm": 1.0529204607009888, "learning_rate": 4.6958090425851465e-05, "loss": 0.5481, "step": 1264 }, { "epoch": 0.7696988135077578, "grad_norm": 1.1204801797866821, "learning_rate": 4.6952136323369463e-05, "loss": 0.4214, "step": 1265 }, { "epoch": 0.770307271067843, "grad_norm": 1.0171352624893188, "learning_rate": 4.6946176777672654e-05, "loss": 0.4146, "step": 1266 }, { "epoch": 0.7709157286279282, "grad_norm": 0.9407910704612732, "learning_rate": 4.694021179023877e-05, "loss": 0.4224, "step": 1267 }, { "epoch": 0.7715241861880134, "grad_norm": 1.1776046752929688, "learning_rate": 4.6934241362546874e-05, "loss": 0.5192, "step": 1268 }, { "epoch": 0.7721326437480985, "grad_norm": 1.0098623037338257, "learning_rate": 4.692826549607738e-05, "loss": 0.4352, "step": 1269 }, { "epoch": 0.7727411013081837, "grad_norm": 1.0229098796844482, "learning_rate": 4.6922284192312074e-05, "loss": 0.4095, "step": 1270 }, { "epoch": 0.773349558868269, "grad_norm": 1.0688673257827759, "learning_rate": 4.691629745273404e-05, "loss": 0.4093, "step": 1271 }, { "epoch": 0.7739580164283542, "grad_norm": 1.7956994771957397, "learning_rate": 4.691030527882776e-05, "loss": 0.4594, "step": 1272 }, { "epoch": 0.7745664739884393, "grad_norm": 1.1307945251464844, "learning_rate": 4.690430767207903e-05, "loss": 0.4544, "step": 1273 }, { "epoch": 0.7751749315485245, "grad_norm": 0.9435398578643799, "learning_rate": 4.689830463397502e-05, "loss": 0.3981, "step": 1274 }, { "epoch": 0.7757833891086097, "grad_norm": 1.1967155933380127, "learning_rate": 4.689229616600422e-05, "loss": 0.4067, "step": 1275 }, { "epoch": 0.7763918466686949, "grad_norm": 1.2077351808547974, "learning_rate": 4.68862822696565e-05, "loss": 0.4763, "step": 1276 }, { "epoch": 0.7770003042287801, "grad_norm": 1.1517752408981323, "learning_rate": 4.688026294642303e-05, "loss": 0.4822, "step": 1277 }, { "epoch": 0.7776087617888652, "grad_norm": 1.3446024656295776, "learning_rate": 4.687423819779637e-05, "loss": 0.3934, "step": 1278 }, { "epoch": 0.7782172193489504, "grad_norm": 0.9664615988731384, "learning_rate": 4.6868208025270396e-05, "loss": 0.3821, "step": 1279 }, { "epoch": 0.7788256769090356, "grad_norm": 1.1611120700836182, "learning_rate": 4.6862172430340344e-05, "loss": 0.4274, "step": 1280 }, { "epoch": 0.7794341344691208, "grad_norm": 1.4211469888687134, "learning_rate": 4.6856131414502795e-05, "loss": 0.5036, "step": 1281 }, { "epoch": 0.780042592029206, "grad_norm": 1.0818086862564087, "learning_rate": 4.685008497925566e-05, "loss": 0.4481, "step": 1282 }, { "epoch": 0.7806510495892911, "grad_norm": 1.3467516899108887, "learning_rate": 4.6844033126098206e-05, "loss": 0.4211, "step": 1283 }, { "epoch": 0.7812595071493763, "grad_norm": 1.0704116821289062, "learning_rate": 4.683797585653104e-05, "loss": 0.4344, "step": 1284 }, { "epoch": 0.7818679647094615, "grad_norm": 1.2070298194885254, "learning_rate": 4.683191317205612e-05, "loss": 0.4463, "step": 1285 }, { "epoch": 0.7824764222695467, "grad_norm": 1.0552793741226196, "learning_rate": 4.682584507417672e-05, "loss": 0.4259, "step": 1286 }, { "epoch": 0.7830848798296319, "grad_norm": 1.0915594100952148, "learning_rate": 4.6819771564397496e-05, "loss": 0.4085, "step": 1287 }, { "epoch": 0.783693337389717, "grad_norm": 1.2251918315887451, "learning_rate": 4.681369264422441e-05, "loss": 0.4623, "step": 1288 }, { "epoch": 0.7843017949498022, "grad_norm": 1.0790926218032837, "learning_rate": 4.68076083151648e-05, "loss": 0.454, "step": 1289 }, { "epoch": 0.7849102525098874, "grad_norm": 1.0661593675613403, "learning_rate": 4.680151857872731e-05, "loss": 0.406, "step": 1290 }, { "epoch": 0.7855187100699726, "grad_norm": 0.9886572957038879, "learning_rate": 4.6795423436421934e-05, "loss": 0.4058, "step": 1291 }, { "epoch": 0.7861271676300579, "grad_norm": 1.0567951202392578, "learning_rate": 4.678932288976004e-05, "loss": 0.4134, "step": 1292 }, { "epoch": 0.786735625190143, "grad_norm": 1.0652602910995483, "learning_rate": 4.678321694025428e-05, "loss": 0.4439, "step": 1293 }, { "epoch": 0.7873440827502282, "grad_norm": 1.048807144165039, "learning_rate": 4.6777105589418695e-05, "loss": 0.3748, "step": 1294 }, { "epoch": 0.7879525403103134, "grad_norm": 1.044026494026184, "learning_rate": 4.6770988838768634e-05, "loss": 0.3903, "step": 1295 }, { "epoch": 0.7885609978703986, "grad_norm": 0.9846064448356628, "learning_rate": 4.676486668982081e-05, "loss": 0.4294, "step": 1296 }, { "epoch": 0.7891694554304837, "grad_norm": 1.2005590200424194, "learning_rate": 4.675873914409324e-05, "loss": 0.4848, "step": 1297 }, { "epoch": 0.7897779129905689, "grad_norm": 1.192406177520752, "learning_rate": 4.6752606203105314e-05, "loss": 0.3836, "step": 1298 }, { "epoch": 0.7903863705506541, "grad_norm": 0.9216708540916443, "learning_rate": 4.6746467868377744e-05, "loss": 0.419, "step": 1299 }, { "epoch": 0.7909948281107393, "grad_norm": 1.1610586643218994, "learning_rate": 4.674032414143258e-05, "loss": 0.4027, "step": 1300 }, { "epoch": 0.7916032856708245, "grad_norm": 1.142449975013733, "learning_rate": 4.673417502379321e-05, "loss": 0.4675, "step": 1301 }, { "epoch": 0.7922117432309096, "grad_norm": 1.1200268268585205, "learning_rate": 4.672802051698436e-05, "loss": 0.4913, "step": 1302 }, { "epoch": 0.7928202007909948, "grad_norm": 1.5124874114990234, "learning_rate": 4.672186062253209e-05, "loss": 0.5245, "step": 1303 }, { "epoch": 0.79342865835108, "grad_norm": 1.1234627962112427, "learning_rate": 4.671569534196379e-05, "loss": 0.4452, "step": 1304 }, { "epoch": 0.7940371159111652, "grad_norm": 1.069880723953247, "learning_rate": 4.6709524676808215e-05, "loss": 0.4368, "step": 1305 }, { "epoch": 0.7946455734712504, "grad_norm": 1.0936565399169922, "learning_rate": 4.670334862859541e-05, "loss": 0.4527, "step": 1306 }, { "epoch": 0.7952540310313355, "grad_norm": 1.0277516841888428, "learning_rate": 4.669716719885679e-05, "loss": 0.4186, "step": 1307 }, { "epoch": 0.7958624885914207, "grad_norm": 0.9716640114784241, "learning_rate": 4.6690980389125075e-05, "loss": 0.3765, "step": 1308 }, { "epoch": 0.7964709461515059, "grad_norm": 1.0334478616714478, "learning_rate": 4.668478820093436e-05, "loss": 0.416, "step": 1309 }, { "epoch": 0.7970794037115911, "grad_norm": 1.0650469064712524, "learning_rate": 4.667859063582003e-05, "loss": 0.428, "step": 1310 }, { "epoch": 0.7976878612716763, "grad_norm": 1.0338752269744873, "learning_rate": 4.667238769531883e-05, "loss": 0.4597, "step": 1311 }, { "epoch": 0.7982963188317614, "grad_norm": 1.0277113914489746, "learning_rate": 4.666617938096884e-05, "loss": 0.4062, "step": 1312 }, { "epoch": 0.7989047763918466, "grad_norm": 1.0234774351119995, "learning_rate": 4.6659965694309446e-05, "loss": 0.3839, "step": 1313 }, { "epoch": 0.7995132339519319, "grad_norm": 1.0939502716064453, "learning_rate": 4.66537466368814e-05, "loss": 0.4286, "step": 1314 }, { "epoch": 0.8001216915120171, "grad_norm": 1.0294753313064575, "learning_rate": 4.664752221022676e-05, "loss": 0.3898, "step": 1315 }, { "epoch": 0.8007301490721023, "grad_norm": 1.0266523361206055, "learning_rate": 4.6641292415888916e-05, "loss": 0.3917, "step": 1316 }, { "epoch": 0.8013386066321874, "grad_norm": 1.0287213325500488, "learning_rate": 4.6635057255412606e-05, "loss": 0.4362, "step": 1317 }, { "epoch": 0.8019470641922726, "grad_norm": 1.0150967836380005, "learning_rate": 4.662881673034389e-05, "loss": 0.3872, "step": 1318 }, { "epoch": 0.8025555217523578, "grad_norm": 1.106278896331787, "learning_rate": 4.662257084223017e-05, "loss": 0.4082, "step": 1319 }, { "epoch": 0.803163979312443, "grad_norm": 0.9770355224609375, "learning_rate": 4.661631959262015e-05, "loss": 0.4392, "step": 1320 }, { "epoch": 0.8037724368725282, "grad_norm": 1.2682089805603027, "learning_rate": 4.661006298306388e-05, "loss": 0.4548, "step": 1321 }, { "epoch": 0.8043808944326133, "grad_norm": 1.000850796699524, "learning_rate": 4.660380101511275e-05, "loss": 0.374, "step": 1322 }, { "epoch": 0.8049893519926985, "grad_norm": 0.9677361249923706, "learning_rate": 4.659753369031945e-05, "loss": 0.3699, "step": 1323 }, { "epoch": 0.8055978095527837, "grad_norm": 1.096622109413147, "learning_rate": 4.659126101023802e-05, "loss": 0.4508, "step": 1324 }, { "epoch": 0.8062062671128689, "grad_norm": 0.9467645883560181, "learning_rate": 4.658498297642384e-05, "loss": 0.3777, "step": 1325 }, { "epoch": 0.806814724672954, "grad_norm": 0.9743712544441223, "learning_rate": 4.6578699590433585e-05, "loss": 0.3962, "step": 1326 }, { "epoch": 0.8074231822330392, "grad_norm": 1.0897773504257202, "learning_rate": 4.657241085382527e-05, "loss": 0.4304, "step": 1327 }, { "epoch": 0.8080316397931244, "grad_norm": 1.0162287950515747, "learning_rate": 4.6566116768158254e-05, "loss": 0.4047, "step": 1328 }, { "epoch": 0.8086400973532096, "grad_norm": 0.9446073770523071, "learning_rate": 4.65598173349932e-05, "loss": 0.3908, "step": 1329 }, { "epoch": 0.8092485549132948, "grad_norm": 1.0082128047943115, "learning_rate": 4.655351255589209e-05, "loss": 0.3977, "step": 1330 }, { "epoch": 0.8098570124733799, "grad_norm": 1.08930242061615, "learning_rate": 4.6547202432418274e-05, "loss": 0.4132, "step": 1331 }, { "epoch": 0.8104654700334651, "grad_norm": 1.1194206476211548, "learning_rate": 4.654088696613638e-05, "loss": 0.3912, "step": 1332 }, { "epoch": 0.8110739275935503, "grad_norm": 1.0496968030929565, "learning_rate": 4.6534566158612395e-05, "loss": 0.3896, "step": 1333 }, { "epoch": 0.8116823851536356, "grad_norm": 1.0310301780700684, "learning_rate": 4.65282400114136e-05, "loss": 0.4467, "step": 1334 }, { "epoch": 0.8122908427137208, "grad_norm": 1.234102487564087, "learning_rate": 4.6521908526108624e-05, "loss": 0.403, "step": 1335 }, { "epoch": 0.8128993002738059, "grad_norm": 1.0917807817459106, "learning_rate": 4.6515571704267414e-05, "loss": 0.4464, "step": 1336 }, { "epoch": 0.8135077578338911, "grad_norm": 1.1464637517929077, "learning_rate": 4.650922954746123e-05, "loss": 0.4617, "step": 1337 }, { "epoch": 0.8141162153939763, "grad_norm": 1.1391267776489258, "learning_rate": 4.6502882057262675e-05, "loss": 0.4209, "step": 1338 }, { "epoch": 0.8147246729540615, "grad_norm": 1.1518268585205078, "learning_rate": 4.6496529235245644e-05, "loss": 0.4591, "step": 1339 }, { "epoch": 0.8153331305141467, "grad_norm": 1.207227349281311, "learning_rate": 4.649017108298539e-05, "loss": 0.4252, "step": 1340 }, { "epoch": 0.8159415880742318, "grad_norm": 1.172620415687561, "learning_rate": 4.648380760205846e-05, "loss": 0.4601, "step": 1341 }, { "epoch": 0.816550045634317, "grad_norm": 1.1918513774871826, "learning_rate": 4.647743879404273e-05, "loss": 0.4373, "step": 1342 }, { "epoch": 0.8171585031944022, "grad_norm": 1.1502752304077148, "learning_rate": 4.647106466051741e-05, "loss": 0.4047, "step": 1343 }, { "epoch": 0.8177669607544874, "grad_norm": 1.0501556396484375, "learning_rate": 4.6464685203063005e-05, "loss": 0.422, "step": 1344 }, { "epoch": 0.8183754183145726, "grad_norm": 0.8644611835479736, "learning_rate": 4.645830042326137e-05, "loss": 0.3404, "step": 1345 }, { "epoch": 0.8189838758746577, "grad_norm": 0.9725202322006226, "learning_rate": 4.645191032269565e-05, "loss": 0.4371, "step": 1346 }, { "epoch": 0.8195923334347429, "grad_norm": 1.0984727144241333, "learning_rate": 4.644551490295033e-05, "loss": 0.396, "step": 1347 }, { "epoch": 0.8202007909948281, "grad_norm": 1.0615260601043701, "learning_rate": 4.643911416561121e-05, "loss": 0.4147, "step": 1348 }, { "epoch": 0.8208092485549133, "grad_norm": 1.1247777938842773, "learning_rate": 4.6432708112265397e-05, "loss": 0.3993, "step": 1349 }, { "epoch": 0.8214177061149984, "grad_norm": 1.0866373777389526, "learning_rate": 4.642629674450134e-05, "loss": 0.439, "step": 1350 }, { "epoch": 0.8220261636750836, "grad_norm": 1.415543556213379, "learning_rate": 4.641988006390877e-05, "loss": 0.4355, "step": 1351 }, { "epoch": 0.8226346212351688, "grad_norm": 3.2268388271331787, "learning_rate": 4.641345807207879e-05, "loss": 0.4287, "step": 1352 }, { "epoch": 0.823243078795254, "grad_norm": 1.167698860168457, "learning_rate": 4.640703077060374e-05, "loss": 0.4728, "step": 1353 }, { "epoch": 0.8238515363553393, "grad_norm": 1.2120864391326904, "learning_rate": 4.640059816107737e-05, "loss": 0.4648, "step": 1354 }, { "epoch": 0.8244599939154243, "grad_norm": 1.0012540817260742, "learning_rate": 4.639416024509466e-05, "loss": 0.4621, "step": 1355 }, { "epoch": 0.8250684514755096, "grad_norm": 2.5287537574768066, "learning_rate": 4.638771702425197e-05, "loss": 0.4767, "step": 1356 }, { "epoch": 0.8256769090355948, "grad_norm": 1.1467103958129883, "learning_rate": 4.638126850014694e-05, "loss": 0.4715, "step": 1357 }, { "epoch": 0.82628536659568, "grad_norm": 1.286732792854309, "learning_rate": 4.637481467437854e-05, "loss": 0.404, "step": 1358 }, { "epoch": 0.8268938241557652, "grad_norm": 1.0348389148712158, "learning_rate": 4.6368355548547046e-05, "loss": 0.4483, "step": 1359 }, { "epoch": 0.8275022817158503, "grad_norm": 1.1077744960784912, "learning_rate": 4.636189112425405e-05, "loss": 0.4157, "step": 1360 }, { "epoch": 0.8281107392759355, "grad_norm": 0.9787223935127258, "learning_rate": 4.635542140310246e-05, "loss": 0.3761, "step": 1361 }, { "epoch": 0.8287191968360207, "grad_norm": 1.1058927774429321, "learning_rate": 4.6348946386696506e-05, "loss": 0.3918, "step": 1362 }, { "epoch": 0.8293276543961059, "grad_norm": 1.0943537950515747, "learning_rate": 4.6342466076641715e-05, "loss": 0.4403, "step": 1363 }, { "epoch": 0.8299361119561911, "grad_norm": 1.1616437435150146, "learning_rate": 4.633598047454494e-05, "loss": 0.4496, "step": 1364 }, { "epoch": 0.8305445695162762, "grad_norm": 1.026498794555664, "learning_rate": 4.632948958201432e-05, "loss": 0.4414, "step": 1365 }, { "epoch": 0.8311530270763614, "grad_norm": 1.1366041898727417, "learning_rate": 4.6322993400659355e-05, "loss": 0.4226, "step": 1366 }, { "epoch": 0.8317614846364466, "grad_norm": 0.9318983554840088, "learning_rate": 4.631649193209081e-05, "loss": 0.3743, "step": 1367 }, { "epoch": 0.8323699421965318, "grad_norm": 1.0723328590393066, "learning_rate": 4.6309985177920776e-05, "loss": 0.3779, "step": 1368 }, { "epoch": 0.832978399756617, "grad_norm": 0.9842046499252319, "learning_rate": 4.630347313976266e-05, "loss": 0.3871, "step": 1369 }, { "epoch": 0.8335868573167021, "grad_norm": 1.232901930809021, "learning_rate": 4.629695581923118e-05, "loss": 0.3983, "step": 1370 }, { "epoch": 0.8341953148767873, "grad_norm": 1.4811533689498901, "learning_rate": 4.629043321794237e-05, "loss": 0.4282, "step": 1371 }, { "epoch": 0.8348037724368725, "grad_norm": 0.9871484637260437, "learning_rate": 4.628390533751353e-05, "loss": 0.3485, "step": 1372 }, { "epoch": 0.8354122299969577, "grad_norm": 1.0894109010696411, "learning_rate": 4.6277372179563336e-05, "loss": 0.4226, "step": 1373 }, { "epoch": 0.8360206875570428, "grad_norm": 0.9860946536064148, "learning_rate": 4.627083374571173e-05, "loss": 0.3852, "step": 1374 }, { "epoch": 0.836629145117128, "grad_norm": 0.9105382561683655, "learning_rate": 4.6264290037579955e-05, "loss": 0.3875, "step": 1375 }, { "epoch": 0.8372376026772133, "grad_norm": 0.9577265977859497, "learning_rate": 4.625774105679059e-05, "loss": 0.4193, "step": 1376 }, { "epoch": 0.8378460602372985, "grad_norm": 1.2966721057891846, "learning_rate": 4.625118680496752e-05, "loss": 0.4104, "step": 1377 }, { "epoch": 0.8384545177973837, "grad_norm": 1.9938510656356812, "learning_rate": 4.624462728373591e-05, "loss": 0.4041, "step": 1378 }, { "epoch": 0.8390629753574688, "grad_norm": 0.9969017505645752, "learning_rate": 4.6238062494722254e-05, "loss": 0.4448, "step": 1379 }, { "epoch": 0.839671432917554, "grad_norm": 1.0461878776550293, "learning_rate": 4.623149243955435e-05, "loss": 0.4411, "step": 1380 }, { "epoch": 0.8402798904776392, "grad_norm": 1.1554114818572998, "learning_rate": 4.6224917119861286e-05, "loss": 0.4828, "step": 1381 }, { "epoch": 0.8408883480377244, "grad_norm": 1.0953601598739624, "learning_rate": 4.6218336537273476e-05, "loss": 0.4883, "step": 1382 }, { "epoch": 0.8414968055978096, "grad_norm": 1.0780771970748901, "learning_rate": 4.621175069342263e-05, "loss": 0.4425, "step": 1383 }, { "epoch": 0.8421052631578947, "grad_norm": 0.8909410834312439, "learning_rate": 4.620515958994176e-05, "loss": 0.4176, "step": 1384 }, { "epoch": 0.8427137207179799, "grad_norm": 1.1288505792617798, "learning_rate": 4.619856322846518e-05, "loss": 0.4576, "step": 1385 }, { "epoch": 0.8433221782780651, "grad_norm": 1.2266353368759155, "learning_rate": 4.619196161062854e-05, "loss": 0.3678, "step": 1386 }, { "epoch": 0.8439306358381503, "grad_norm": 1.1094207763671875, "learning_rate": 4.6185354738068726e-05, "loss": 0.3636, "step": 1387 }, { "epoch": 0.8445390933982355, "grad_norm": 1.0035345554351807, "learning_rate": 4.617874261242399e-05, "loss": 0.3561, "step": 1388 }, { "epoch": 0.8451475509583206, "grad_norm": 1.1671876907348633, "learning_rate": 4.617212523533386e-05, "loss": 0.395, "step": 1389 }, { "epoch": 0.8457560085184058, "grad_norm": 1.2926161289215088, "learning_rate": 4.616550260843917e-05, "loss": 0.4506, "step": 1390 }, { "epoch": 0.846364466078491, "grad_norm": 1.220173954963684, "learning_rate": 4.6158874733382056e-05, "loss": 0.4585, "step": 1391 }, { "epoch": 0.8469729236385762, "grad_norm": 0.9969469308853149, "learning_rate": 4.6152241611805956e-05, "loss": 0.3849, "step": 1392 }, { "epoch": 0.8475813811986614, "grad_norm": 0.9798814058303833, "learning_rate": 4.61456032453556e-05, "loss": 0.4175, "step": 1393 }, { "epoch": 0.8481898387587465, "grad_norm": 1.095361351966858, "learning_rate": 4.613895963567704e-05, "loss": 0.4047, "step": 1394 }, { "epoch": 0.8487982963188317, "grad_norm": 0.9531521201133728, "learning_rate": 4.6132310784417595e-05, "loss": 0.348, "step": 1395 }, { "epoch": 0.849406753878917, "grad_norm": 1.015824556350708, "learning_rate": 4.612565669322592e-05, "loss": 0.4401, "step": 1396 }, { "epoch": 0.8500152114390022, "grad_norm": 1.1718368530273438, "learning_rate": 4.611899736375194e-05, "loss": 0.3683, "step": 1397 }, { "epoch": 0.8506236689990874, "grad_norm": 1.0078071355819702, "learning_rate": 4.61123327976469e-05, "loss": 0.3512, "step": 1398 }, { "epoch": 0.8512321265591725, "grad_norm": 1.269882321357727, "learning_rate": 4.610566299656332e-05, "loss": 0.425, "step": 1399 }, { "epoch": 0.8518405841192577, "grad_norm": 1.0912398099899292, "learning_rate": 4.609898796215506e-05, "loss": 0.421, "step": 1400 }, { "epoch": 0.8524490416793429, "grad_norm": 1.116542100906372, "learning_rate": 4.609230769607723e-05, "loss": 0.4179, "step": 1401 }, { "epoch": 0.8530574992394281, "grad_norm": 1.4198042154312134, "learning_rate": 4.6085622199986266e-05, "loss": 0.4061, "step": 1402 }, { "epoch": 0.8536659567995132, "grad_norm": 1.0006221532821655, "learning_rate": 4.607893147553989e-05, "loss": 0.3707, "step": 1403 }, { "epoch": 0.8542744143595984, "grad_norm": 1.0920796394348145, "learning_rate": 4.607223552439711e-05, "loss": 0.3692, "step": 1404 }, { "epoch": 0.8548828719196836, "grad_norm": 0.9862841367721558, "learning_rate": 4.606553434821826e-05, "loss": 0.3852, "step": 1405 }, { "epoch": 0.8554913294797688, "grad_norm": 1.6820799112319946, "learning_rate": 4.605882794866495e-05, "loss": 0.4196, "step": 1406 }, { "epoch": 0.856099787039854, "grad_norm": 1.1753647327423096, "learning_rate": 4.605211632740008e-05, "loss": 0.4056, "step": 1407 }, { "epoch": 0.8567082445999391, "grad_norm": 1.041169285774231, "learning_rate": 4.6045399486087856e-05, "loss": 0.4012, "step": 1408 }, { "epoch": 0.8573167021600243, "grad_norm": 1.1424866914749146, "learning_rate": 4.603867742639377e-05, "loss": 0.4843, "step": 1409 }, { "epoch": 0.8579251597201095, "grad_norm": 1.1785975694656372, "learning_rate": 4.6031950149984624e-05, "loss": 0.3528, "step": 1410 }, { "epoch": 0.8585336172801947, "grad_norm": 1.092089056968689, "learning_rate": 4.6025217658528497e-05, "loss": 0.4248, "step": 1411 }, { "epoch": 0.8591420748402799, "grad_norm": 1.6886132955551147, "learning_rate": 4.601847995369477e-05, "loss": 0.3746, "step": 1412 }, { "epoch": 0.859750532400365, "grad_norm": 0.9200694561004639, "learning_rate": 4.60117370371541e-05, "loss": 0.3175, "step": 1413 }, { "epoch": 0.8603589899604502, "grad_norm": 0.9181839823722839, "learning_rate": 4.600498891057845e-05, "loss": 0.3799, "step": 1414 }, { "epoch": 0.8609674475205354, "grad_norm": 0.9873498678207397, "learning_rate": 4.599823557564109e-05, "loss": 0.3687, "step": 1415 }, { "epoch": 0.8615759050806207, "grad_norm": 1.0958657264709473, "learning_rate": 4.5991477034016564e-05, "loss": 0.4309, "step": 1416 }, { "epoch": 0.8621843626407059, "grad_norm": 0.935565710067749, "learning_rate": 4.598471328738069e-05, "loss": 0.3494, "step": 1417 }, { "epoch": 0.862792820200791, "grad_norm": 1.0181663036346436, "learning_rate": 4.597794433741061e-05, "loss": 0.3567, "step": 1418 }, { "epoch": 0.8634012777608762, "grad_norm": 1.1385202407836914, "learning_rate": 4.597117018578473e-05, "loss": 0.4259, "step": 1419 }, { "epoch": 0.8640097353209614, "grad_norm": 1.041556715965271, "learning_rate": 4.596439083418278e-05, "loss": 0.3825, "step": 1420 }, { "epoch": 0.8646181928810466, "grad_norm": 0.9178049564361572, "learning_rate": 4.5957606284285736e-05, "loss": 0.3587, "step": 1421 }, { "epoch": 0.8652266504411318, "grad_norm": 1.0861730575561523, "learning_rate": 4.595081653777589e-05, "loss": 0.4345, "step": 1422 }, { "epoch": 0.8658351080012169, "grad_norm": 1.1010128259658813, "learning_rate": 4.594402159633681e-05, "loss": 0.4007, "step": 1423 }, { "epoch": 0.8664435655613021, "grad_norm": 1.000855803489685, "learning_rate": 4.593722146165337e-05, "loss": 0.4319, "step": 1424 }, { "epoch": 0.8670520231213873, "grad_norm": 1.2340881824493408, "learning_rate": 4.5930416135411715e-05, "loss": 0.4397, "step": 1425 }, { "epoch": 0.8676604806814725, "grad_norm": 1.198055386543274, "learning_rate": 4.592360561929928e-05, "loss": 0.4217, "step": 1426 }, { "epoch": 0.8682689382415576, "grad_norm": 1.011428952217102, "learning_rate": 4.591678991500479e-05, "loss": 0.3898, "step": 1427 }, { "epoch": 0.8688773958016428, "grad_norm": 1.2541236877441406, "learning_rate": 4.590996902421825e-05, "loss": 0.485, "step": 1428 }, { "epoch": 0.869485853361728, "grad_norm": 1.0267783403396606, "learning_rate": 4.590314294863097e-05, "loss": 0.4073, "step": 1429 }, { "epoch": 0.8700943109218132, "grad_norm": 1.1068730354309082, "learning_rate": 4.589631168993552e-05, "loss": 0.3804, "step": 1430 }, { "epoch": 0.8707027684818984, "grad_norm": 0.9509475827217102, "learning_rate": 4.5889475249825774e-05, "loss": 0.3519, "step": 1431 }, { "epoch": 0.8713112260419835, "grad_norm": 1.0428766012191772, "learning_rate": 4.5882633629996886e-05, "loss": 0.3894, "step": 1432 }, { "epoch": 0.8719196836020687, "grad_norm": 1.0796087980270386, "learning_rate": 4.5875786832145287e-05, "loss": 0.3919, "step": 1433 }, { "epoch": 0.8725281411621539, "grad_norm": 1.0344951152801514, "learning_rate": 4.5868934857968695e-05, "loss": 0.4074, "step": 1434 }, { "epoch": 0.8731365987222391, "grad_norm": 1.127869963645935, "learning_rate": 4.586207770916612e-05, "loss": 0.3942, "step": 1435 }, { "epoch": 0.8737450562823244, "grad_norm": 1.5959341526031494, "learning_rate": 4.585521538743785e-05, "loss": 0.3944, "step": 1436 }, { "epoch": 0.8743535138424094, "grad_norm": 0.9701662659645081, "learning_rate": 4.584834789448544e-05, "loss": 0.3523, "step": 1437 }, { "epoch": 0.8749619714024947, "grad_norm": 1.1680253744125366, "learning_rate": 4.5841475232011773e-05, "loss": 0.4448, "step": 1438 }, { "epoch": 0.8755704289625799, "grad_norm": 1.0042173862457275, "learning_rate": 4.5834597401720956e-05, "loss": 0.3796, "step": 1439 }, { "epoch": 0.8761788865226651, "grad_norm": 1.100633978843689, "learning_rate": 4.582771440531841e-05, "loss": 0.3943, "step": 1440 }, { "epoch": 0.8767873440827503, "grad_norm": 1.1571612358093262, "learning_rate": 4.582082624451084e-05, "loss": 0.3959, "step": 1441 }, { "epoch": 0.8773958016428354, "grad_norm": 1.4444702863693237, "learning_rate": 4.581393292100621e-05, "loss": 0.4962, "step": 1442 }, { "epoch": 0.8780042592029206, "grad_norm": 0.991072952747345, "learning_rate": 4.5807034436513784e-05, "loss": 0.384, "step": 1443 }, { "epoch": 0.8786127167630058, "grad_norm": 1.3088997602462769, "learning_rate": 4.5800130792744096e-05, "loss": 0.3756, "step": 1444 }, { "epoch": 0.879221174323091, "grad_norm": 1.1207791566848755, "learning_rate": 4.5793221991408966e-05, "loss": 0.4245, "step": 1445 }, { "epoch": 0.8798296318831762, "grad_norm": 0.9720820188522339, "learning_rate": 4.578630803422148e-05, "loss": 0.4084, "step": 1446 }, { "epoch": 0.8804380894432613, "grad_norm": 0.9813503623008728, "learning_rate": 4.577938892289603e-05, "loss": 0.3907, "step": 1447 }, { "epoch": 0.8810465470033465, "grad_norm": 0.9682145714759827, "learning_rate": 4.577246465914825e-05, "loss": 0.4057, "step": 1448 }, { "epoch": 0.8816550045634317, "grad_norm": 1.0260658264160156, "learning_rate": 4.576553524469507e-05, "loss": 0.351, "step": 1449 }, { "epoch": 0.8822634621235169, "grad_norm": 1.1361267566680908, "learning_rate": 4.575860068125471e-05, "loss": 0.4011, "step": 1450 }, { "epoch": 0.882871919683602, "grad_norm": 1.084456443786621, "learning_rate": 4.575166097054662e-05, "loss": 0.3984, "step": 1451 }, { "epoch": 0.8834803772436872, "grad_norm": 1.2074880599975586, "learning_rate": 4.57447161142916e-05, "loss": 0.3984, "step": 1452 }, { "epoch": 0.8840888348037724, "grad_norm": 1.044838786125183, "learning_rate": 4.5737766114211654e-05, "loss": 0.3779, "step": 1453 }, { "epoch": 0.8846972923638576, "grad_norm": 1.0638879537582397, "learning_rate": 4.5730810972030114e-05, "loss": 0.3623, "step": 1454 }, { "epoch": 0.8853057499239428, "grad_norm": 1.0590424537658691, "learning_rate": 4.572385068947155e-05, "loss": 0.3891, "step": 1455 }, { "epoch": 0.8859142074840279, "grad_norm": 1.0808945894241333, "learning_rate": 4.5716885268261834e-05, "loss": 0.374, "step": 1456 }, { "epoch": 0.8865226650441131, "grad_norm": 0.9356719851493835, "learning_rate": 4.570991471012809e-05, "loss": 0.3635, "step": 1457 }, { "epoch": 0.8871311226041984, "grad_norm": 1.1011765003204346, "learning_rate": 4.570293901679873e-05, "loss": 0.3804, "step": 1458 }, { "epoch": 0.8877395801642836, "grad_norm": 1.0141364336013794, "learning_rate": 4.569595819000344e-05, "loss": 0.4072, "step": 1459 }, { "epoch": 0.8883480377243688, "grad_norm": 1.7948359251022339, "learning_rate": 4.568897223147316e-05, "loss": 0.3803, "step": 1460 }, { "epoch": 0.8889564952844539, "grad_norm": 1.2225985527038574, "learning_rate": 4.5681981142940126e-05, "loss": 0.3975, "step": 1461 }, { "epoch": 0.8895649528445391, "grad_norm": 0.9955945611000061, "learning_rate": 4.5674984926137844e-05, "loss": 0.3601, "step": 1462 }, { "epoch": 0.8901734104046243, "grad_norm": 1.1640013456344604, "learning_rate": 4.5667983582801064e-05, "loss": 0.3927, "step": 1463 }, { "epoch": 0.8907818679647095, "grad_norm": 1.3846951723098755, "learning_rate": 4.566097711466585e-05, "loss": 0.3688, "step": 1464 }, { "epoch": 0.8913903255247947, "grad_norm": 1.1909358501434326, "learning_rate": 4.56539655234695e-05, "loss": 0.4304, "step": 1465 }, { "epoch": 0.8919987830848798, "grad_norm": 1.1379756927490234, "learning_rate": 4.56469488109506e-05, "loss": 0.4431, "step": 1466 }, { "epoch": 0.892607240644965, "grad_norm": 0.9373139142990112, "learning_rate": 4.5639926978849e-05, "loss": 0.4213, "step": 1467 }, { "epoch": 0.8932156982050502, "grad_norm": 1.0285266637802124, "learning_rate": 4.563290002890583e-05, "loss": 0.4201, "step": 1468 }, { "epoch": 0.8938241557651354, "grad_norm": 1.1803969144821167, "learning_rate": 4.5625867962863466e-05, "loss": 0.4099, "step": 1469 }, { "epoch": 0.8944326133252206, "grad_norm": 1.115708589553833, "learning_rate": 4.5618830782465584e-05, "loss": 0.4085, "step": 1470 }, { "epoch": 0.8950410708853057, "grad_norm": 1.1141424179077148, "learning_rate": 4.56117884894571e-05, "loss": 0.4274, "step": 1471 }, { "epoch": 0.8956495284453909, "grad_norm": 1.2159873247146606, "learning_rate": 4.5604741085584215e-05, "loss": 0.4164, "step": 1472 }, { "epoch": 0.8962579860054761, "grad_norm": 1.0636794567108154, "learning_rate": 4.559768857259438e-05, "loss": 0.4036, "step": 1473 }, { "epoch": 0.8968664435655613, "grad_norm": 1.2936556339263916, "learning_rate": 4.5590630952236336e-05, "loss": 0.4509, "step": 1474 }, { "epoch": 0.8974749011256465, "grad_norm": 1.1456127166748047, "learning_rate": 4.558356822626008e-05, "loss": 0.4016, "step": 1475 }, { "epoch": 0.8980833586857316, "grad_norm": 1.0449507236480713, "learning_rate": 4.557650039641687e-05, "loss": 0.4636, "step": 1476 }, { "epoch": 0.8986918162458168, "grad_norm": 1.048593282699585, "learning_rate": 4.5569427464459226e-05, "loss": 0.4334, "step": 1477 }, { "epoch": 0.899300273805902, "grad_norm": 0.8729017972946167, "learning_rate": 4.556234943214095e-05, "loss": 0.3823, "step": 1478 }, { "epoch": 0.8999087313659873, "grad_norm": 1.1806243658065796, "learning_rate": 4.55552663012171e-05, "loss": 0.4189, "step": 1479 }, { "epoch": 0.9005171889260724, "grad_norm": 1.0129870176315308, "learning_rate": 4.554817807344399e-05, "loss": 0.3643, "step": 1480 }, { "epoch": 0.9011256464861576, "grad_norm": 1.1537796258926392, "learning_rate": 4.5541084750579205e-05, "loss": 0.4925, "step": 1481 }, { "epoch": 0.9017341040462428, "grad_norm": 0.9418867230415344, "learning_rate": 4.55339863343816e-05, "loss": 0.417, "step": 1482 }, { "epoch": 0.902342561606328, "grad_norm": 1.1266993284225464, "learning_rate": 4.5526882826611285e-05, "loss": 0.4141, "step": 1483 }, { "epoch": 0.9029510191664132, "grad_norm": 1.0497965812683105, "learning_rate": 4.5519774229029625e-05, "loss": 0.3798, "step": 1484 }, { "epoch": 0.9035594767264983, "grad_norm": 1.159199595451355, "learning_rate": 4.551266054339927e-05, "loss": 0.3659, "step": 1485 }, { "epoch": 0.9041679342865835, "grad_norm": 0.9190648794174194, "learning_rate": 4.55055417714841e-05, "loss": 0.3857, "step": 1486 }, { "epoch": 0.9047763918466687, "grad_norm": 1.1719613075256348, "learning_rate": 4.549841791504929e-05, "loss": 0.3847, "step": 1487 }, { "epoch": 0.9053848494067539, "grad_norm": 0.9929993748664856, "learning_rate": 4.5491288975861254e-05, "loss": 0.4189, "step": 1488 }, { "epoch": 0.9059933069668391, "grad_norm": 1.0294368267059326, "learning_rate": 4.548415495568767e-05, "loss": 0.4341, "step": 1489 }, { "epoch": 0.9066017645269242, "grad_norm": 1.082219123840332, "learning_rate": 4.5477015856297475e-05, "loss": 0.418, "step": 1490 }, { "epoch": 0.9072102220870094, "grad_norm": 1.0178451538085938, "learning_rate": 4.546987167946088e-05, "loss": 0.4209, "step": 1491 }, { "epoch": 0.9078186796470946, "grad_norm": 0.9833378791809082, "learning_rate": 4.546272242694933e-05, "loss": 0.4089, "step": 1492 }, { "epoch": 0.9084271372071798, "grad_norm": 1.0812345743179321, "learning_rate": 4.5455568100535545e-05, "loss": 0.39, "step": 1493 }, { "epoch": 0.909035594767265, "grad_norm": 1.0747591257095337, "learning_rate": 4.544840870199351e-05, "loss": 0.4182, "step": 1494 }, { "epoch": 0.9096440523273501, "grad_norm": 0.9758956432342529, "learning_rate": 4.5441244233098434e-05, "loss": 0.32, "step": 1495 }, { "epoch": 0.9102525098874353, "grad_norm": 1.013469934463501, "learning_rate": 4.5434074695626826e-05, "loss": 0.4348, "step": 1496 }, { "epoch": 0.9108609674475205, "grad_norm": 0.9901841878890991, "learning_rate": 4.542690009135643e-05, "loss": 0.3635, "step": 1497 }, { "epoch": 0.9114694250076057, "grad_norm": 1.0742686986923218, "learning_rate": 4.541972042206625e-05, "loss": 0.4546, "step": 1498 }, { "epoch": 0.912077882567691, "grad_norm": 1.0467991828918457, "learning_rate": 4.541253568953654e-05, "loss": 0.4002, "step": 1499 }, { "epoch": 0.912686340127776, "grad_norm": 1.4334415197372437, "learning_rate": 4.540534589554881e-05, "loss": 0.3806, "step": 1500 }, { "epoch": 0.9132947976878613, "grad_norm": 1.107809066772461, "learning_rate": 4.539815104188584e-05, "loss": 0.4402, "step": 1501 }, { "epoch": 0.9139032552479465, "grad_norm": 1.0222909450531006, "learning_rate": 4.539095113033165e-05, "loss": 0.4404, "step": 1502 }, { "epoch": 0.9145117128080317, "grad_norm": 1.0362690687179565, "learning_rate": 4.538374616267151e-05, "loss": 0.385, "step": 1503 }, { "epoch": 0.9151201703681168, "grad_norm": 1.0879029035568237, "learning_rate": 4.537653614069196e-05, "loss": 0.4255, "step": 1504 }, { "epoch": 0.915728627928202, "grad_norm": 1.040208339691162, "learning_rate": 4.536932106618078e-05, "loss": 0.3637, "step": 1505 }, { "epoch": 0.9163370854882872, "grad_norm": 1.043503999710083, "learning_rate": 4.536210094092702e-05, "loss": 0.3691, "step": 1506 }, { "epoch": 0.9169455430483724, "grad_norm": 1.0326288938522339, "learning_rate": 4.535487576672095e-05, "loss": 0.3499, "step": 1507 }, { "epoch": 0.9175540006084576, "grad_norm": 1.1494892835617065, "learning_rate": 4.5347645545354136e-05, "loss": 0.3684, "step": 1508 }, { "epoch": 0.9181624581685427, "grad_norm": 1.2776802778244019, "learning_rate": 4.534041027861935e-05, "loss": 0.4245, "step": 1509 }, { "epoch": 0.9187709157286279, "grad_norm": 1.056604266166687, "learning_rate": 4.533316996831064e-05, "loss": 0.3585, "step": 1510 }, { "epoch": 0.9193793732887131, "grad_norm": 1.12501859664917, "learning_rate": 4.532592461622331e-05, "loss": 0.3689, "step": 1511 }, { "epoch": 0.9199878308487983, "grad_norm": 1.0672175884246826, "learning_rate": 4.531867422415391e-05, "loss": 0.393, "step": 1512 }, { "epoch": 0.9205962884088835, "grad_norm": 1.0935568809509277, "learning_rate": 4.531141879390022e-05, "loss": 0.3826, "step": 1513 }, { "epoch": 0.9212047459689686, "grad_norm": 1.1044657230377197, "learning_rate": 4.5304158327261294e-05, "loss": 0.4487, "step": 1514 }, { "epoch": 0.9218132035290538, "grad_norm": 1.1453882455825806, "learning_rate": 4.5296892826037414e-05, "loss": 0.3947, "step": 1515 }, { "epoch": 0.922421661089139, "grad_norm": 0.9416497349739075, "learning_rate": 4.5289622292030134e-05, "loss": 0.3573, "step": 1516 }, { "epoch": 0.9230301186492242, "grad_norm": 1.0041788816452026, "learning_rate": 4.528234672704224e-05, "loss": 0.3347, "step": 1517 }, { "epoch": 0.9236385762093094, "grad_norm": 0.9780565500259399, "learning_rate": 4.527506613287776e-05, "loss": 0.3597, "step": 1518 }, { "epoch": 0.9242470337693945, "grad_norm": 1.2946865558624268, "learning_rate": 4.526778051134199e-05, "loss": 0.4323, "step": 1519 }, { "epoch": 0.9248554913294798, "grad_norm": 1.030163049697876, "learning_rate": 4.526048986424146e-05, "loss": 0.3541, "step": 1520 }, { "epoch": 0.925463948889565, "grad_norm": 1.0969247817993164, "learning_rate": 4.525319419338394e-05, "loss": 0.415, "step": 1521 }, { "epoch": 0.9260724064496502, "grad_norm": 1.0787875652313232, "learning_rate": 4.5245893500578455e-05, "loss": 0.4565, "step": 1522 }, { "epoch": 0.9266808640097354, "grad_norm": 1.2321113348007202, "learning_rate": 4.523858778763528e-05, "loss": 0.4511, "step": 1523 }, { "epoch": 0.9272893215698205, "grad_norm": 1.0210340023040771, "learning_rate": 4.523127705636591e-05, "loss": 0.4325, "step": 1524 }, { "epoch": 0.9278977791299057, "grad_norm": 0.9291285872459412, "learning_rate": 4.522396130858311e-05, "loss": 0.3934, "step": 1525 }, { "epoch": 0.9285062366899909, "grad_norm": 1.1080670356750488, "learning_rate": 4.5216640546100884e-05, "loss": 0.381, "step": 1526 }, { "epoch": 0.9291146942500761, "grad_norm": 0.9290838837623596, "learning_rate": 4.5209314770734475e-05, "loss": 0.3477, "step": 1527 }, { "epoch": 0.9297231518101612, "grad_norm": 1.5479507446289062, "learning_rate": 4.520198398430037e-05, "loss": 0.3941, "step": 1528 }, { "epoch": 0.9303316093702464, "grad_norm": 1.0913853645324707, "learning_rate": 4.5194648188616294e-05, "loss": 0.3909, "step": 1529 }, { "epoch": 0.9309400669303316, "grad_norm": 1.1218714714050293, "learning_rate": 4.518730738550122e-05, "loss": 0.4052, "step": 1530 }, { "epoch": 0.9315485244904168, "grad_norm": 1.0080488920211792, "learning_rate": 4.517996157677537e-05, "loss": 0.3412, "step": 1531 }, { "epoch": 0.932156982050502, "grad_norm": 1.1207358837127686, "learning_rate": 4.517261076426018e-05, "loss": 0.3878, "step": 1532 }, { "epoch": 0.9327654396105871, "grad_norm": 1.1212834119796753, "learning_rate": 4.516525494977837e-05, "loss": 0.407, "step": 1533 }, { "epoch": 0.9333738971706723, "grad_norm": 1.1488605737686157, "learning_rate": 4.5157894135153845e-05, "loss": 0.4392, "step": 1534 }, { "epoch": 0.9339823547307575, "grad_norm": 1.036495327949524, "learning_rate": 4.515052832221181e-05, "loss": 0.3508, "step": 1535 }, { "epoch": 0.9345908122908427, "grad_norm": 1.168699860572815, "learning_rate": 4.514315751277867e-05, "loss": 0.4334, "step": 1536 }, { "epoch": 0.9351992698509279, "grad_norm": 1.0106819868087769, "learning_rate": 4.513578170868206e-05, "loss": 0.3479, "step": 1537 }, { "epoch": 0.935807727411013, "grad_norm": 0.9819125533103943, "learning_rate": 4.512840091175089e-05, "loss": 0.3646, "step": 1538 }, { "epoch": 0.9364161849710982, "grad_norm": 0.8794593811035156, "learning_rate": 4.51210151238153e-05, "loss": 0.3568, "step": 1539 }, { "epoch": 0.9370246425311834, "grad_norm": 0.9659105539321899, "learning_rate": 4.511362434670663e-05, "loss": 0.3865, "step": 1540 }, { "epoch": 0.9376331000912687, "grad_norm": 1.012877345085144, "learning_rate": 4.510622858225752e-05, "loss": 0.4071, "step": 1541 }, { "epoch": 0.9382415576513539, "grad_norm": 0.9462942481040955, "learning_rate": 4.509882783230177e-05, "loss": 0.345, "step": 1542 }, { "epoch": 0.938850015211439, "grad_norm": 1.1898541450500488, "learning_rate": 4.509142209867448e-05, "loss": 0.3663, "step": 1543 }, { "epoch": 0.9394584727715242, "grad_norm": 1.2533682584762573, "learning_rate": 4.508401138321196e-05, "loss": 0.3441, "step": 1544 }, { "epoch": 0.9400669303316094, "grad_norm": 1.0154099464416504, "learning_rate": 4.507659568775177e-05, "loss": 0.3578, "step": 1545 }, { "epoch": 0.9406753878916946, "grad_norm": 1.03611159324646, "learning_rate": 4.506917501413268e-05, "loss": 0.3639, "step": 1546 }, { "epoch": 0.9412838454517798, "grad_norm": 1.214494228363037, "learning_rate": 4.506174936419471e-05, "loss": 0.3985, "step": 1547 }, { "epoch": 0.9418923030118649, "grad_norm": 1.0812007188796997, "learning_rate": 4.505431873977911e-05, "loss": 0.3469, "step": 1548 }, { "epoch": 0.9425007605719501, "grad_norm": 1.1645228862762451, "learning_rate": 4.504688314272837e-05, "loss": 0.4381, "step": 1549 }, { "epoch": 0.9431092181320353, "grad_norm": 1.1173677444458008, "learning_rate": 4.5039442574886204e-05, "loss": 0.3876, "step": 1550 }, { "epoch": 0.9437176756921205, "grad_norm": 1.0114907026290894, "learning_rate": 4.503199703809757e-05, "loss": 0.4089, "step": 1551 }, { "epoch": 0.9443261332522057, "grad_norm": 0.9891692996025085, "learning_rate": 4.5024546534208645e-05, "loss": 0.3515, "step": 1552 }, { "epoch": 0.9449345908122908, "grad_norm": 1.005385398864746, "learning_rate": 4.5017091065066837e-05, "loss": 0.3328, "step": 1553 }, { "epoch": 0.945543048372376, "grad_norm": 1.1059056520462036, "learning_rate": 4.50096306325208e-05, "loss": 0.4516, "step": 1554 }, { "epoch": 0.9461515059324612, "grad_norm": 0.9695335626602173, "learning_rate": 4.500216523842041e-05, "loss": 0.3779, "step": 1555 }, { "epoch": 0.9467599634925464, "grad_norm": 1.2391901016235352, "learning_rate": 4.499469488461677e-05, "loss": 0.3915, "step": 1556 }, { "epoch": 0.9473684210526315, "grad_norm": 0.9961259961128235, "learning_rate": 4.4987219572962224e-05, "loss": 0.3433, "step": 1557 }, { "epoch": 0.9479768786127167, "grad_norm": 1.959693193435669, "learning_rate": 4.497973930531033e-05, "loss": 0.3842, "step": 1558 }, { "epoch": 0.9485853361728019, "grad_norm": 1.5181488990783691, "learning_rate": 4.497225408351589e-05, "loss": 0.4688, "step": 1559 }, { "epoch": 0.9491937937328871, "grad_norm": 1.1111069917678833, "learning_rate": 4.4964763909434914e-05, "loss": 0.3564, "step": 1560 }, { "epoch": 0.9498022512929724, "grad_norm": 1.0508601665496826, "learning_rate": 4.495726878492465e-05, "loss": 0.3788, "step": 1561 }, { "epoch": 0.9504107088530575, "grad_norm": 1.1294502019882202, "learning_rate": 4.494976871184361e-05, "loss": 0.3692, "step": 1562 }, { "epoch": 0.9510191664131427, "grad_norm": 1.0184043645858765, "learning_rate": 4.494226369205147e-05, "loss": 0.4318, "step": 1563 }, { "epoch": 0.9516276239732279, "grad_norm": 1.053466796875, "learning_rate": 4.493475372740916e-05, "loss": 0.4011, "step": 1564 }, { "epoch": 0.9522360815333131, "grad_norm": 1.0696157217025757, "learning_rate": 4.492723881977885e-05, "loss": 0.4152, "step": 1565 }, { "epoch": 0.9528445390933983, "grad_norm": 1.0308218002319336, "learning_rate": 4.4919718971023926e-05, "loss": 0.3829, "step": 1566 }, { "epoch": 0.9534529966534834, "grad_norm": 1.0396000146865845, "learning_rate": 4.4912194183008994e-05, "loss": 0.3648, "step": 1567 }, { "epoch": 0.9540614542135686, "grad_norm": 1.12269926071167, "learning_rate": 4.490466445759988e-05, "loss": 0.4426, "step": 1568 }, { "epoch": 0.9546699117736538, "grad_norm": 1.1220544576644897, "learning_rate": 4.489712979666365e-05, "loss": 0.3841, "step": 1569 }, { "epoch": 0.955278369333739, "grad_norm": 0.9514595866203308, "learning_rate": 4.4889590202068584e-05, "loss": 0.384, "step": 1570 }, { "epoch": 0.9558868268938242, "grad_norm": 0.9942927956581116, "learning_rate": 4.4882045675684184e-05, "loss": 0.3854, "step": 1571 }, { "epoch": 0.9564952844539093, "grad_norm": 1.0529226064682007, "learning_rate": 4.487449621938118e-05, "loss": 0.3861, "step": 1572 }, { "epoch": 0.9571037420139945, "grad_norm": 1.1254863739013672, "learning_rate": 4.486694183503153e-05, "loss": 0.4056, "step": 1573 }, { "epoch": 0.9577121995740797, "grad_norm": 0.9550037980079651, "learning_rate": 4.48593825245084e-05, "loss": 0.3541, "step": 1574 }, { "epoch": 0.9583206571341649, "grad_norm": 1.0226643085479736, "learning_rate": 4.4851818289686175e-05, "loss": 0.4194, "step": 1575 }, { "epoch": 0.9589291146942501, "grad_norm": 1.5648815631866455, "learning_rate": 4.484424913244049e-05, "loss": 0.3899, "step": 1576 }, { "epoch": 0.9595375722543352, "grad_norm": 1.0767285823822021, "learning_rate": 4.4836675054648156e-05, "loss": 0.386, "step": 1577 }, { "epoch": 0.9601460298144204, "grad_norm": 1.153304100036621, "learning_rate": 4.482909605818725e-05, "loss": 0.4087, "step": 1578 }, { "epoch": 0.9607544873745056, "grad_norm": 0.8764035105705261, "learning_rate": 4.482151214493704e-05, "loss": 0.3476, "step": 1579 }, { "epoch": 0.9613629449345908, "grad_norm": 2.1990628242492676, "learning_rate": 4.4813923316778014e-05, "loss": 0.43, "step": 1580 }, { "epoch": 0.9619714024946759, "grad_norm": 1.129573106765747, "learning_rate": 4.48063295755919e-05, "loss": 0.4173, "step": 1581 }, { "epoch": 0.9625798600547611, "grad_norm": 1.4812309741973877, "learning_rate": 4.4798730923261614e-05, "loss": 0.4175, "step": 1582 }, { "epoch": 0.9631883176148464, "grad_norm": 0.9128961563110352, "learning_rate": 4.4791127361671304e-05, "loss": 0.3943, "step": 1583 }, { "epoch": 0.9637967751749316, "grad_norm": 1.1068278551101685, "learning_rate": 4.478351889270635e-05, "loss": 0.4152, "step": 1584 }, { "epoch": 0.9644052327350168, "grad_norm": 0.88494873046875, "learning_rate": 4.477590551825333e-05, "loss": 0.3745, "step": 1585 }, { "epoch": 0.9650136902951019, "grad_norm": 1.0295628309249878, "learning_rate": 4.476828724020004e-05, "loss": 0.4369, "step": 1586 }, { "epoch": 0.9656221478551871, "grad_norm": 0.9472355246543884, "learning_rate": 4.47606640604355e-05, "loss": 0.3364, "step": 1587 }, { "epoch": 0.9662306054152723, "grad_norm": 0.9680747389793396, "learning_rate": 4.4753035980849935e-05, "loss": 0.3878, "step": 1588 }, { "epoch": 0.9668390629753575, "grad_norm": 0.9909029603004456, "learning_rate": 4.4745403003334784e-05, "loss": 0.3365, "step": 1589 }, { "epoch": 0.9674475205354427, "grad_norm": 1.3338682651519775, "learning_rate": 4.4737765129782735e-05, "loss": 0.3644, "step": 1590 }, { "epoch": 0.9680559780955278, "grad_norm": 0.9651134014129639, "learning_rate": 4.473012236208763e-05, "loss": 0.3977, "step": 1591 }, { "epoch": 0.968664435655613, "grad_norm": 1.1207659244537354, "learning_rate": 4.472247470214458e-05, "loss": 0.395, "step": 1592 }, { "epoch": 0.9692728932156982, "grad_norm": 1.0144009590148926, "learning_rate": 4.471482215184988e-05, "loss": 0.3607, "step": 1593 }, { "epoch": 0.9698813507757834, "grad_norm": 0.911418616771698, "learning_rate": 4.470716471310103e-05, "loss": 0.3452, "step": 1594 }, { "epoch": 0.9704898083358686, "grad_norm": 0.9752874374389648, "learning_rate": 4.469950238779677e-05, "loss": 0.3708, "step": 1595 }, { "epoch": 0.9710982658959537, "grad_norm": 0.8316762447357178, "learning_rate": 4.469183517783704e-05, "loss": 0.3127, "step": 1596 }, { "epoch": 0.9717067234560389, "grad_norm": 0.9781742095947266, "learning_rate": 4.4684163085122976e-05, "loss": 0.3197, "step": 1597 }, { "epoch": 0.9723151810161241, "grad_norm": 1.1461424827575684, "learning_rate": 4.4676486111556936e-05, "loss": 0.3754, "step": 1598 }, { "epoch": 0.9729236385762093, "grad_norm": 1.0407254695892334, "learning_rate": 4.466880425904251e-05, "loss": 0.4198, "step": 1599 }, { "epoch": 0.9735320961362945, "grad_norm": 0.9534010887145996, "learning_rate": 4.466111752948446e-05, "loss": 0.3313, "step": 1600 }, { "epoch": 0.9741405536963796, "grad_norm": 1.008195400238037, "learning_rate": 4.465342592478878e-05, "loss": 0.366, "step": 1601 }, { "epoch": 0.9747490112564648, "grad_norm": 0.928765058517456, "learning_rate": 4.464572944686266e-05, "loss": 0.3674, "step": 1602 }, { "epoch": 0.97535746881655, "grad_norm": 1.0775413513183594, "learning_rate": 4.4638028097614515e-05, "loss": 0.448, "step": 1603 }, { "epoch": 0.9759659263766353, "grad_norm": 0.9224221706390381, "learning_rate": 4.463032187895395e-05, "loss": 0.3466, "step": 1604 }, { "epoch": 0.9765743839367204, "grad_norm": 1.0811947584152222, "learning_rate": 4.46226107927918e-05, "loss": 0.4303, "step": 1605 }, { "epoch": 0.9771828414968056, "grad_norm": 1.023656964302063, "learning_rate": 4.4614894841040076e-05, "loss": 0.3192, "step": 1606 }, { "epoch": 0.9777912990568908, "grad_norm": 1.0087846517562866, "learning_rate": 4.460717402561203e-05, "loss": 0.3878, "step": 1607 }, { "epoch": 0.978399756616976, "grad_norm": 1.254363775253296, "learning_rate": 4.4599448348422087e-05, "loss": 0.4048, "step": 1608 }, { "epoch": 0.9790082141770612, "grad_norm": 1.086838722229004, "learning_rate": 4.45917178113859e-05, "loss": 0.3949, "step": 1609 }, { "epoch": 0.9796166717371463, "grad_norm": 1.0660868883132935, "learning_rate": 4.458398241642032e-05, "loss": 0.3275, "step": 1610 }, { "epoch": 0.9802251292972315, "grad_norm": 1.0936857461929321, "learning_rate": 4.4576242165443394e-05, "loss": 0.4235, "step": 1611 }, { "epoch": 0.9808335868573167, "grad_norm": 1.1144509315490723, "learning_rate": 4.456849706037439e-05, "loss": 0.4732, "step": 1612 }, { "epoch": 0.9814420444174019, "grad_norm": 1.0750304460525513, "learning_rate": 4.456074710313378e-05, "loss": 0.3902, "step": 1613 }, { "epoch": 0.9820505019774871, "grad_norm": 1.7120141983032227, "learning_rate": 4.455299229564321e-05, "loss": 0.5309, "step": 1614 }, { "epoch": 0.9826589595375722, "grad_norm": 1.0807467699050903, "learning_rate": 4.454523263982557e-05, "loss": 0.4076, "step": 1615 }, { "epoch": 0.9832674170976574, "grad_norm": 1.0277988910675049, "learning_rate": 4.453746813760492e-05, "loss": 0.3838, "step": 1616 }, { "epoch": 0.9838758746577426, "grad_norm": 1.0415362119674683, "learning_rate": 4.452969879090653e-05, "loss": 0.3689, "step": 1617 }, { "epoch": 0.9844843322178278, "grad_norm": 1.0815787315368652, "learning_rate": 4.452192460165687e-05, "loss": 0.337, "step": 1618 }, { "epoch": 0.985092789777913, "grad_norm": 1.2625021934509277, "learning_rate": 4.451414557178363e-05, "loss": 0.3757, "step": 1619 }, { "epoch": 0.9857012473379981, "grad_norm": 1.092826247215271, "learning_rate": 4.450636170321568e-05, "loss": 0.3752, "step": 1620 }, { "epoch": 0.9863097048980833, "grad_norm": 1.0274910926818848, "learning_rate": 4.449857299788309e-05, "loss": 0.3388, "step": 1621 }, { "epoch": 0.9869181624581685, "grad_norm": 0.9931478500366211, "learning_rate": 4.449077945771714e-05, "loss": 0.366, "step": 1622 }, { "epoch": 0.9875266200182538, "grad_norm": 1.035408616065979, "learning_rate": 4.44829810846503e-05, "loss": 0.3855, "step": 1623 }, { "epoch": 0.988135077578339, "grad_norm": 1.1031928062438965, "learning_rate": 4.447517788061624e-05, "loss": 0.4198, "step": 1624 }, { "epoch": 0.9887435351384241, "grad_norm": 1.122717261314392, "learning_rate": 4.446736984754982e-05, "loss": 0.4499, "step": 1625 }, { "epoch": 0.9893519926985093, "grad_norm": 0.972223699092865, "learning_rate": 4.445955698738714e-05, "loss": 0.4174, "step": 1626 }, { "epoch": 0.9899604502585945, "grad_norm": 0.8995710611343384, "learning_rate": 4.445173930206543e-05, "loss": 0.3257, "step": 1627 }, { "epoch": 0.9905689078186797, "grad_norm": 0.9216899871826172, "learning_rate": 4.444391679352315e-05, "loss": 0.3732, "step": 1628 }, { "epoch": 0.9911773653787649, "grad_norm": 1.138875961303711, "learning_rate": 4.4436089463699984e-05, "loss": 0.4355, "step": 1629 }, { "epoch": 0.99178582293885, "grad_norm": 0.9231753945350647, "learning_rate": 4.442825731453676e-05, "loss": 0.3415, "step": 1630 }, { "epoch": 0.9923942804989352, "grad_norm": 1.0919394493103027, "learning_rate": 4.4420420347975535e-05, "loss": 0.3939, "step": 1631 }, { "epoch": 0.9930027380590204, "grad_norm": 1.25301194190979, "learning_rate": 4.4412578565959554e-05, "loss": 0.3438, "step": 1632 }, { "epoch": 0.9936111956191056, "grad_norm": 1.0430591106414795, "learning_rate": 4.440473197043323e-05, "loss": 0.4462, "step": 1633 }, { "epoch": 0.9942196531791907, "grad_norm": 0.9053996205329895, "learning_rate": 4.439688056334221e-05, "loss": 0.3515, "step": 1634 }, { "epoch": 0.9948281107392759, "grad_norm": 1.0166853666305542, "learning_rate": 4.438902434663331e-05, "loss": 0.3846, "step": 1635 }, { "epoch": 0.9954365682993611, "grad_norm": 1.1809371709823608, "learning_rate": 4.438116332225456e-05, "loss": 0.4166, "step": 1636 }, { "epoch": 0.9960450258594463, "grad_norm": 0.8665146231651306, "learning_rate": 4.437329749215514e-05, "loss": 0.3229, "step": 1637 }, { "epoch": 0.9966534834195315, "grad_norm": 0.9558749198913574, "learning_rate": 4.4365426858285466e-05, "loss": 0.3278, "step": 1638 }, { "epoch": 0.9972619409796166, "grad_norm": 0.9763336777687073, "learning_rate": 4.435755142259712e-05, "loss": 0.3675, "step": 1639 }, { "epoch": 0.9978703985397018, "grad_norm": 1.078508734703064, "learning_rate": 4.434967118704289e-05, "loss": 0.3435, "step": 1640 }, { "epoch": 0.998478856099787, "grad_norm": 1.0996962785720825, "learning_rate": 4.434178615357673e-05, "loss": 0.4125, "step": 1641 }, { "epoch": 0.9990873136598722, "grad_norm": 1.0043129920959473, "learning_rate": 4.433389632415381e-05, "loss": 0.4054, "step": 1642 }, { "epoch": 0.9996957712199575, "grad_norm": 1.0766171216964722, "learning_rate": 4.432600170073048e-05, "loss": 0.3377, "step": 1643 }, { "epoch": 0.9996957712199575, "eval_loss": 1.0254058837890625, "eval_runtime": 108.4457, "eval_samples_per_second": 7.027, "eval_steps_per_second": 0.443, "step": 1643 }, { "epoch": 1.0003042287800425, "grad_norm": 0.9912607073783875, "learning_rate": 4.431810228526428e-05, "loss": 0.3113, "step": 1644 }, { "epoch": 1.0009126863401279, "grad_norm": 0.9680461287498474, "learning_rate": 4.431019807971393e-05, "loss": 0.3006, "step": 1645 }, { "epoch": 1.001521143900213, "grad_norm": 0.9305565357208252, "learning_rate": 4.430228908603934e-05, "loss": 0.2665, "step": 1646 }, { "epoch": 1.002129601460298, "grad_norm": 0.9174848198890686, "learning_rate": 4.4294375306201617e-05, "loss": 0.2765, "step": 1647 }, { "epoch": 1.0027380590203834, "grad_norm": 0.8088688254356384, "learning_rate": 4.4286456742163055e-05, "loss": 0.2378, "step": 1648 }, { "epoch": 1.0033465165804685, "grad_norm": 1.254451870918274, "learning_rate": 4.427853339588711e-05, "loss": 0.2796, "step": 1649 }, { "epoch": 1.0039549741405538, "grad_norm": 0.9241443872451782, "learning_rate": 4.427060526933846e-05, "loss": 0.2814, "step": 1650 }, { "epoch": 1.0045634317006389, "grad_norm": 1.0514885187149048, "learning_rate": 4.4262672364482935e-05, "loss": 0.2402, "step": 1651 }, { "epoch": 1.005171889260724, "grad_norm": 0.931946337223053, "learning_rate": 4.4254734683287575e-05, "loss": 0.2502, "step": 1652 }, { "epoch": 1.0057803468208093, "grad_norm": 2.174992561340332, "learning_rate": 4.424679222772059e-05, "loss": 0.2845, "step": 1653 }, { "epoch": 1.0063888043808944, "grad_norm": 0.9909196496009827, "learning_rate": 4.4238844999751376e-05, "loss": 0.2355, "step": 1654 }, { "epoch": 1.0069972619409797, "grad_norm": 0.9932165741920471, "learning_rate": 4.423089300135052e-05, "loss": 0.2375, "step": 1655 }, { "epoch": 1.0076057195010648, "grad_norm": 1.1847784519195557, "learning_rate": 4.422293623448978e-05, "loss": 0.2185, "step": 1656 }, { "epoch": 1.00821417706115, "grad_norm": 0.9104436039924622, "learning_rate": 4.421497470114211e-05, "loss": 0.2347, "step": 1657 }, { "epoch": 1.0088226346212352, "grad_norm": 1.0744693279266357, "learning_rate": 4.420700840328162e-05, "loss": 0.2824, "step": 1658 }, { "epoch": 1.0094310921813203, "grad_norm": 1.0241225957870483, "learning_rate": 4.4199037342883656e-05, "loss": 0.2483, "step": 1659 }, { "epoch": 1.0100395497414056, "grad_norm": 1.3999607563018799, "learning_rate": 4.419106152192467e-05, "loss": 0.2848, "step": 1660 }, { "epoch": 1.0106480073014907, "grad_norm": 1.127521276473999, "learning_rate": 4.418308094238235e-05, "loss": 0.2263, "step": 1661 }, { "epoch": 1.0112564648615758, "grad_norm": 1.0944982767105103, "learning_rate": 4.417509560623555e-05, "loss": 0.2932, "step": 1662 }, { "epoch": 1.0118649224216612, "grad_norm": 0.9151603579521179, "learning_rate": 4.4167105515464305e-05, "loss": 0.2578, "step": 1663 }, { "epoch": 1.0124733799817462, "grad_norm": 1.0785083770751953, "learning_rate": 4.415911067204981e-05, "loss": 0.2092, "step": 1664 }, { "epoch": 1.0130818375418316, "grad_norm": 1.1762471199035645, "learning_rate": 4.415111107797445e-05, "loss": 0.3176, "step": 1665 }, { "epoch": 1.0136902951019167, "grad_norm": 0.9114542007446289, "learning_rate": 4.414310673522181e-05, "loss": 0.1926, "step": 1666 }, { "epoch": 1.0142987526620018, "grad_norm": 1.0898659229278564, "learning_rate": 4.413509764577663e-05, "loss": 0.2444, "step": 1667 }, { "epoch": 1.014907210222087, "grad_norm": 1.3012317419052124, "learning_rate": 4.412708381162481e-05, "loss": 0.2119, "step": 1668 }, { "epoch": 1.0155156677821722, "grad_norm": 0.9848833680152893, "learning_rate": 4.411906523475347e-05, "loss": 0.2507, "step": 1669 }, { "epoch": 1.0161241253422575, "grad_norm": 1.0226378440856934, "learning_rate": 4.411104191715087e-05, "loss": 0.2876, "step": 1670 }, { "epoch": 1.0167325829023426, "grad_norm": 0.9476386308670044, "learning_rate": 4.410301386080646e-05, "loss": 0.209, "step": 1671 }, { "epoch": 1.0173410404624277, "grad_norm": 1.0117278099060059, "learning_rate": 4.4094981067710864e-05, "loss": 0.2523, "step": 1672 }, { "epoch": 1.017949498022513, "grad_norm": 1.3588906526565552, "learning_rate": 4.408694353985589e-05, "loss": 0.2786, "step": 1673 }, { "epoch": 1.018557955582598, "grad_norm": 1.4190876483917236, "learning_rate": 4.40789012792345e-05, "loss": 0.2771, "step": 1674 }, { "epoch": 1.0191664131426832, "grad_norm": 1.1058770418167114, "learning_rate": 4.4070854287840836e-05, "loss": 0.2673, "step": 1675 }, { "epoch": 1.0197748707027685, "grad_norm": 0.8771906495094299, "learning_rate": 4.406280256767022e-05, "loss": 0.2254, "step": 1676 }, { "epoch": 1.0203833282628536, "grad_norm": 0.8889464735984802, "learning_rate": 4.4054746120719145e-05, "loss": 0.2608, "step": 1677 }, { "epoch": 1.020991785822939, "grad_norm": 0.8301272392272949, "learning_rate": 4.404668494898527e-05, "loss": 0.2089, "step": 1678 }, { "epoch": 1.021600243383024, "grad_norm": 0.9632772207260132, "learning_rate": 4.403861905446744e-05, "loss": 0.2448, "step": 1679 }, { "epoch": 1.0222087009431091, "grad_norm": 0.8604543209075928, "learning_rate": 4.4030548439165654e-05, "loss": 0.254, "step": 1680 }, { "epoch": 1.0228171585031944, "grad_norm": 0.9880070090293884, "learning_rate": 4.402247310508108e-05, "loss": 0.2649, "step": 1681 }, { "epoch": 1.0234256160632795, "grad_norm": 1.0415153503417969, "learning_rate": 4.4014393054216076e-05, "loss": 0.243, "step": 1682 }, { "epoch": 1.0240340736233648, "grad_norm": 1.1902730464935303, "learning_rate": 4.400630828857415e-05, "loss": 0.2617, "step": 1683 }, { "epoch": 1.02464253118345, "grad_norm": 0.9809486269950867, "learning_rate": 4.3998218810159995e-05, "loss": 0.2415, "step": 1684 }, { "epoch": 1.025250988743535, "grad_norm": 1.0021169185638428, "learning_rate": 4.3990124620979454e-05, "loss": 0.2443, "step": 1685 }, { "epoch": 1.0258594463036204, "grad_norm": 0.8601137399673462, "learning_rate": 4.3982025723039564e-05, "loss": 0.2291, "step": 1686 }, { "epoch": 1.0264679038637055, "grad_norm": 1.0233848094940186, "learning_rate": 4.3973922118348497e-05, "loss": 0.2482, "step": 1687 }, { "epoch": 1.0270763614237908, "grad_norm": 1.0119305849075317, "learning_rate": 4.396581380891562e-05, "loss": 0.2517, "step": 1688 }, { "epoch": 1.0276848189838759, "grad_norm": 0.9090347290039062, "learning_rate": 4.3957700796751446e-05, "loss": 0.2357, "step": 1689 }, { "epoch": 1.028293276543961, "grad_norm": 0.8543658256530762, "learning_rate": 4.394958308386768e-05, "loss": 0.2061, "step": 1690 }, { "epoch": 1.0289017341040463, "grad_norm": 1.0883513689041138, "learning_rate": 4.394146067227716e-05, "loss": 0.2704, "step": 1691 }, { "epoch": 1.0295101916641314, "grad_norm": 0.9570848345756531, "learning_rate": 4.393333356399391e-05, "loss": 0.2509, "step": 1692 }, { "epoch": 1.0301186492242167, "grad_norm": 0.9206425547599792, "learning_rate": 4.3925201761033116e-05, "loss": 0.2453, "step": 1693 }, { "epoch": 1.0307271067843018, "grad_norm": 0.9678740501403809, "learning_rate": 4.391706526541114e-05, "loss": 0.2438, "step": 1694 }, { "epoch": 1.031335564344387, "grad_norm": 0.9064133763313293, "learning_rate": 4.390892407914547e-05, "loss": 0.2221, "step": 1695 }, { "epoch": 1.0319440219044722, "grad_norm": 1.0082942247390747, "learning_rate": 4.390077820425479e-05, "loss": 0.2859, "step": 1696 }, { "epoch": 1.0325524794645573, "grad_norm": 0.9536372423171997, "learning_rate": 4.3892627642758946e-05, "loss": 0.2616, "step": 1697 }, { "epoch": 1.0331609370246426, "grad_norm": 0.9097919464111328, "learning_rate": 4.388447239667892e-05, "loss": 0.2647, "step": 1698 }, { "epoch": 1.0337693945847277, "grad_norm": 0.9455602765083313, "learning_rate": 4.387631246803689e-05, "loss": 0.2271, "step": 1699 }, { "epoch": 1.0343778521448128, "grad_norm": 1.0754998922348022, "learning_rate": 4.386814785885617e-05, "loss": 0.2671, "step": 1700 }, { "epoch": 1.0349863097048981, "grad_norm": 0.8816566467285156, "learning_rate": 4.3859978571161245e-05, "loss": 0.2149, "step": 1701 }, { "epoch": 1.0355947672649832, "grad_norm": 0.8995217084884644, "learning_rate": 4.3851804606977756e-05, "loss": 0.2361, "step": 1702 }, { "epoch": 1.0362032248250685, "grad_norm": 1.0205188989639282, "learning_rate": 4.38436259683325e-05, "loss": 0.2671, "step": 1703 }, { "epoch": 1.0368116823851536, "grad_norm": 0.9358905553817749, "learning_rate": 4.383544265725346e-05, "loss": 0.2259, "step": 1704 }, { "epoch": 1.0374201399452387, "grad_norm": 1.6457610130310059, "learning_rate": 4.382725467576973e-05, "loss": 0.3296, "step": 1705 }, { "epoch": 1.038028597505324, "grad_norm": 0.9523285627365112, "learning_rate": 4.38190620259116e-05, "loss": 0.2466, "step": 1706 }, { "epoch": 1.0386370550654092, "grad_norm": 2.4622488021850586, "learning_rate": 4.381086470971051e-05, "loss": 0.2464, "step": 1707 }, { "epoch": 1.0392455126254945, "grad_norm": 0.9736045002937317, "learning_rate": 4.380266272919904e-05, "loss": 0.226, "step": 1708 }, { "epoch": 1.0398539701855796, "grad_norm": 0.9621555209159851, "learning_rate": 4.379445608641095e-05, "loss": 0.247, "step": 1709 }, { "epoch": 1.0404624277456647, "grad_norm": 0.9200358390808105, "learning_rate": 4.378624478338115e-05, "loss": 0.2385, "step": 1710 }, { "epoch": 1.04107088530575, "grad_norm": 0.921256422996521, "learning_rate": 4.377802882214568e-05, "loss": 0.2536, "step": 1711 }, { "epoch": 1.041679342865835, "grad_norm": 1.0362918376922607, "learning_rate": 4.3769808204741766e-05, "loss": 0.2505, "step": 1712 }, { "epoch": 1.0422878004259204, "grad_norm": 0.9670946002006531, "learning_rate": 4.37615829332078e-05, "loss": 0.2311, "step": 1713 }, { "epoch": 1.0428962579860055, "grad_norm": 0.9415709376335144, "learning_rate": 4.3753353009583275e-05, "loss": 0.2073, "step": 1714 }, { "epoch": 1.0435047155460906, "grad_norm": 0.9122322797775269, "learning_rate": 4.374511843590888e-05, "loss": 0.2956, "step": 1715 }, { "epoch": 1.044113173106176, "grad_norm": 0.8133691549301147, "learning_rate": 4.373687921422646e-05, "loss": 0.2302, "step": 1716 }, { "epoch": 1.044721630666261, "grad_norm": 0.9611904621124268, "learning_rate": 4.372863534657897e-05, "loss": 0.242, "step": 1717 }, { "epoch": 1.045330088226346, "grad_norm": 1.105997085571289, "learning_rate": 4.372038683501057e-05, "loss": 0.2738, "step": 1718 }, { "epoch": 1.0459385457864314, "grad_norm": 0.9072719216346741, "learning_rate": 4.3712133681566546e-05, "loss": 0.3021, "step": 1719 }, { "epoch": 1.0465470033465165, "grad_norm": 0.9673424959182739, "learning_rate": 4.370387588829332e-05, "loss": 0.2756, "step": 1720 }, { "epoch": 1.0471554609066018, "grad_norm": 0.8735299706459045, "learning_rate": 4.369561345723849e-05, "loss": 0.244, "step": 1721 }, { "epoch": 1.047763918466687, "grad_norm": 0.8752166628837585, "learning_rate": 4.36873463904508e-05, "loss": 0.2251, "step": 1722 }, { "epoch": 1.048372376026772, "grad_norm": 0.845640242099762, "learning_rate": 4.367907468998013e-05, "loss": 0.2487, "step": 1723 }, { "epoch": 1.0489808335868573, "grad_norm": 0.846122682094574, "learning_rate": 4.3670798357877515e-05, "loss": 0.2061, "step": 1724 }, { "epoch": 1.0495892911469424, "grad_norm": 0.9644899368286133, "learning_rate": 4.366251739619515e-05, "loss": 0.2853, "step": 1725 }, { "epoch": 1.0501977487070278, "grad_norm": 1.052946925163269, "learning_rate": 4.365423180698636e-05, "loss": 0.2502, "step": 1726 }, { "epoch": 1.0508062062671129, "grad_norm": 1.0697331428527832, "learning_rate": 4.364594159230563e-05, "loss": 0.2259, "step": 1727 }, { "epoch": 1.051414663827198, "grad_norm": 0.9684371948242188, "learning_rate": 4.363764675420858e-05, "loss": 0.2298, "step": 1728 }, { "epoch": 1.0520231213872833, "grad_norm": 0.9283509254455566, "learning_rate": 4.3629347294752e-05, "loss": 0.2221, "step": 1729 }, { "epoch": 1.0526315789473684, "grad_norm": 0.9486536383628845, "learning_rate": 4.3621043215993793e-05, "loss": 0.2321, "step": 1730 }, { "epoch": 1.0532400365074537, "grad_norm": 0.9327476024627686, "learning_rate": 4.3612734519993035e-05, "loss": 0.2353, "step": 1731 }, { "epoch": 1.0538484940675388, "grad_norm": 0.9721671342849731, "learning_rate": 4.360442120880994e-05, "loss": 0.2255, "step": 1732 }, { "epoch": 1.0544569516276239, "grad_norm": 1.0690611600875854, "learning_rate": 4.3596103284505854e-05, "loss": 0.2511, "step": 1733 }, { "epoch": 1.0550654091877092, "grad_norm": 0.9945971965789795, "learning_rate": 4.358778074914326e-05, "loss": 0.2729, "step": 1734 }, { "epoch": 1.0556738667477943, "grad_norm": 1.0413203239440918, "learning_rate": 4.357945360478584e-05, "loss": 0.2918, "step": 1735 }, { "epoch": 1.0562823243078796, "grad_norm": 0.8384091854095459, "learning_rate": 4.357112185349834e-05, "loss": 0.2324, "step": 1736 }, { "epoch": 1.0568907818679647, "grad_norm": 1.018020510673523, "learning_rate": 4.3562785497346706e-05, "loss": 0.262, "step": 1737 }, { "epoch": 1.0574992394280498, "grad_norm": 0.9685660004615784, "learning_rate": 4.355444453839801e-05, "loss": 0.2684, "step": 1738 }, { "epoch": 1.0581076969881351, "grad_norm": 1.006584882736206, "learning_rate": 4.354609897872044e-05, "loss": 0.2912, "step": 1739 }, { "epoch": 1.0587161545482202, "grad_norm": 0.9230665564537048, "learning_rate": 4.3537748820383386e-05, "loss": 0.2542, "step": 1740 }, { "epoch": 1.0593246121083055, "grad_norm": 0.9952296018600464, "learning_rate": 4.35293940654573e-05, "loss": 0.2588, "step": 1741 }, { "epoch": 1.0599330696683906, "grad_norm": 0.8689709305763245, "learning_rate": 4.352103471601383e-05, "loss": 0.2159, "step": 1742 }, { "epoch": 1.0605415272284757, "grad_norm": 0.9874710440635681, "learning_rate": 4.351267077412575e-05, "loss": 0.2766, "step": 1743 }, { "epoch": 1.061149984788561, "grad_norm": 0.9209951758384705, "learning_rate": 4.350430224186696e-05, "loss": 0.2918, "step": 1744 }, { "epoch": 1.0617584423486461, "grad_norm": 0.9789804220199585, "learning_rate": 4.349592912131252e-05, "loss": 0.26, "step": 1745 }, { "epoch": 1.0623668999087315, "grad_norm": 0.963188111782074, "learning_rate": 4.3487551414538595e-05, "loss": 0.2295, "step": 1746 }, { "epoch": 1.0629753574688166, "grad_norm": 0.8524497151374817, "learning_rate": 4.347916912362252e-05, "loss": 0.2094, "step": 1747 }, { "epoch": 1.0635838150289016, "grad_norm": 1.0623359680175781, "learning_rate": 4.347078225064276e-05, "loss": 0.263, "step": 1748 }, { "epoch": 1.064192272588987, "grad_norm": 1.013371467590332, "learning_rate": 4.34623907976789e-05, "loss": 0.2404, "step": 1749 }, { "epoch": 1.064800730149072, "grad_norm": 0.8995924592018127, "learning_rate": 4.345399476681167e-05, "loss": 0.2167, "step": 1750 }, { "epoch": 1.0654091877091574, "grad_norm": 1.011319875717163, "learning_rate": 4.344559416012293e-05, "loss": 0.2701, "step": 1751 }, { "epoch": 1.0660176452692425, "grad_norm": 0.9735434055328369, "learning_rate": 4.34371889796957e-05, "loss": 0.2159, "step": 1752 }, { "epoch": 1.0666261028293276, "grad_norm": 0.8756011128425598, "learning_rate": 4.34287792276141e-05, "loss": 0.2148, "step": 1753 }, { "epoch": 1.067234560389413, "grad_norm": 0.8667939305305481, "learning_rate": 4.34203649059634e-05, "loss": 0.1809, "step": 1754 }, { "epoch": 1.067843017949498, "grad_norm": 0.9129221439361572, "learning_rate": 4.341194601683e-05, "loss": 0.2344, "step": 1755 }, { "epoch": 1.0684514755095833, "grad_norm": 0.952587902545929, "learning_rate": 4.340352256230144e-05, "loss": 0.2158, "step": 1756 }, { "epoch": 1.0690599330696684, "grad_norm": 0.932098388671875, "learning_rate": 4.339509454446637e-05, "loss": 0.2488, "step": 1757 }, { "epoch": 1.0696683906297535, "grad_norm": 1.0526845455169678, "learning_rate": 4.338666196541461e-05, "loss": 0.276, "step": 1758 }, { "epoch": 1.0702768481898388, "grad_norm": 1.0784953832626343, "learning_rate": 4.337822482723708e-05, "loss": 0.2574, "step": 1759 }, { "epoch": 1.070885305749924, "grad_norm": 1.0775099992752075, "learning_rate": 4.336978313202583e-05, "loss": 0.2513, "step": 1760 }, { "epoch": 1.0714937633100092, "grad_norm": 0.990006685256958, "learning_rate": 4.336133688187405e-05, "loss": 0.237, "step": 1761 }, { "epoch": 1.0721022208700943, "grad_norm": 0.9512050151824951, "learning_rate": 4.3352886078876065e-05, "loss": 0.2594, "step": 1762 }, { "epoch": 1.0727106784301794, "grad_norm": 1.0135704278945923, "learning_rate": 4.3344430725127315e-05, "loss": 0.2292, "step": 1763 }, { "epoch": 1.0733191359902647, "grad_norm": 1.0516897439956665, "learning_rate": 4.333597082272438e-05, "loss": 0.2289, "step": 1764 }, { "epoch": 1.0739275935503498, "grad_norm": 0.9647794365882874, "learning_rate": 4.332750637376496e-05, "loss": 0.2488, "step": 1765 }, { "epoch": 1.0745360511104352, "grad_norm": 0.84974604845047, "learning_rate": 4.331903738034789e-05, "loss": 0.2196, "step": 1766 }, { "epoch": 1.0751445086705202, "grad_norm": 0.9360343813896179, "learning_rate": 4.331056384457313e-05, "loss": 0.2452, "step": 1767 }, { "epoch": 1.0757529662306053, "grad_norm": 0.920875608921051, "learning_rate": 4.330208576854176e-05, "loss": 0.2618, "step": 1768 }, { "epoch": 1.0763614237906907, "grad_norm": 0.9963610172271729, "learning_rate": 4.3293603154355976e-05, "loss": 0.2536, "step": 1769 }, { "epoch": 1.0769698813507758, "grad_norm": 0.914340615272522, "learning_rate": 4.328511600411913e-05, "loss": 0.2487, "step": 1770 }, { "epoch": 1.077578338910861, "grad_norm": 0.8099347949028015, "learning_rate": 4.327662431993568e-05, "loss": 0.2186, "step": 1771 }, { "epoch": 1.0781867964709462, "grad_norm": 0.8634665608406067, "learning_rate": 4.32681281039112e-05, "loss": 0.2381, "step": 1772 }, { "epoch": 1.0787952540310313, "grad_norm": 1.6494232416152954, "learning_rate": 4.325962735815241e-05, "loss": 0.3326, "step": 1773 }, { "epoch": 1.0794037115911166, "grad_norm": 0.9181280732154846, "learning_rate": 4.3251122084767124e-05, "loss": 0.2321, "step": 1774 }, { "epoch": 1.0800121691512017, "grad_norm": 1.1291999816894531, "learning_rate": 4.324261228586431e-05, "loss": 0.2762, "step": 1775 }, { "epoch": 1.080620626711287, "grad_norm": 0.8536227941513062, "learning_rate": 4.323409796355404e-05, "loss": 0.1981, "step": 1776 }, { "epoch": 1.081229084271372, "grad_norm": 0.9988884329795837, "learning_rate": 4.322557911994751e-05, "loss": 0.2943, "step": 1777 }, { "epoch": 1.0818375418314572, "grad_norm": 0.9455403089523315, "learning_rate": 4.321705575715703e-05, "loss": 0.2459, "step": 1778 }, { "epoch": 1.0824459993915425, "grad_norm": 0.9276781678199768, "learning_rate": 4.320852787729606e-05, "loss": 0.2779, "step": 1779 }, { "epoch": 1.0830544569516276, "grad_norm": 1.0638751983642578, "learning_rate": 4.319999548247914e-05, "loss": 0.2595, "step": 1780 }, { "epoch": 1.0836629145117127, "grad_norm": 0.9427651762962341, "learning_rate": 4.3191458574821955e-05, "loss": 0.2628, "step": 1781 }, { "epoch": 1.084271372071798, "grad_norm": 0.9555470943450928, "learning_rate": 4.3182917156441296e-05, "loss": 0.245, "step": 1782 }, { "epoch": 1.0848798296318831, "grad_norm": 0.9582526683807373, "learning_rate": 4.317437122945508e-05, "loss": 0.2842, "step": 1783 }, { "epoch": 1.0854882871919684, "grad_norm": 0.9005835652351379, "learning_rate": 4.316582079598235e-05, "loss": 0.2646, "step": 1784 }, { "epoch": 1.0860967447520535, "grad_norm": 0.7882102727890015, "learning_rate": 4.3157265858143247e-05, "loss": 0.2415, "step": 1785 }, { "epoch": 1.0867052023121386, "grad_norm": 0.8828130960464478, "learning_rate": 4.3148706418059046e-05, "loss": 0.2327, "step": 1786 }, { "epoch": 1.087313659872224, "grad_norm": 0.823904812335968, "learning_rate": 4.314014247785214e-05, "loss": 0.2377, "step": 1787 }, { "epoch": 1.087922117432309, "grad_norm": 0.9788227677345276, "learning_rate": 4.313157403964601e-05, "loss": 0.2552, "step": 1788 }, { "epoch": 1.0885305749923944, "grad_norm": 0.935871422290802, "learning_rate": 4.312300110556527e-05, "loss": 0.2474, "step": 1789 }, { "epoch": 1.0891390325524795, "grad_norm": 17.13036346435547, "learning_rate": 4.311442367773567e-05, "loss": 0.2759, "step": 1790 }, { "epoch": 1.0897474901125646, "grad_norm": 1.1542432308197021, "learning_rate": 4.3105841758284035e-05, "loss": 0.2905, "step": 1791 }, { "epoch": 1.0903559476726499, "grad_norm": 1.1316038370132446, "learning_rate": 4.3097255349338344e-05, "loss": 0.2549, "step": 1792 }, { "epoch": 1.090964405232735, "grad_norm": 1.0348681211471558, "learning_rate": 4.308866445302766e-05, "loss": 0.2649, "step": 1793 }, { "epoch": 1.0915728627928203, "grad_norm": 1.0835968255996704, "learning_rate": 4.308006907148215e-05, "loss": 0.2424, "step": 1794 }, { "epoch": 1.0921813203529054, "grad_norm": 0.9840952754020691, "learning_rate": 4.307146920683313e-05, "loss": 0.27, "step": 1795 }, { "epoch": 1.0927897779129905, "grad_norm": 0.8699171543121338, "learning_rate": 4.3062864861213e-05, "loss": 0.2061, "step": 1796 }, { "epoch": 1.0933982354730758, "grad_norm": 0.9339224100112915, "learning_rate": 4.305425603675529e-05, "loss": 0.2612, "step": 1797 }, { "epoch": 1.094006693033161, "grad_norm": 1.0255519151687622, "learning_rate": 4.304564273559462e-05, "loss": 0.2652, "step": 1798 }, { "epoch": 1.0946151505932462, "grad_norm": 1.1433568000793457, "learning_rate": 4.303702495986672e-05, "loss": 0.2684, "step": 1799 }, { "epoch": 1.0952236081533313, "grad_norm": 1.2076388597488403, "learning_rate": 4.302840271170846e-05, "loss": 0.2964, "step": 1800 }, { "epoch": 1.0958320657134164, "grad_norm": 0.9302932620048523, "learning_rate": 4.301977599325779e-05, "loss": 0.2291, "step": 1801 }, { "epoch": 1.0964405232735017, "grad_norm": 0.9104456901550293, "learning_rate": 4.301114480665377e-05, "loss": 0.235, "step": 1802 }, { "epoch": 1.0970489808335868, "grad_norm": 1.0696483850479126, "learning_rate": 4.3002509154036585e-05, "loss": 0.3616, "step": 1803 }, { "epoch": 1.0976574383936721, "grad_norm": 0.8358986377716064, "learning_rate": 4.299386903754751e-05, "loss": 0.197, "step": 1804 }, { "epoch": 1.0982658959537572, "grad_norm": 0.949712872505188, "learning_rate": 4.2985224459328934e-05, "loss": 0.2621, "step": 1805 }, { "epoch": 1.0988743535138423, "grad_norm": 1.1388565301895142, "learning_rate": 4.297657542152434e-05, "loss": 0.3019, "step": 1806 }, { "epoch": 1.0994828110739276, "grad_norm": 0.9658118486404419, "learning_rate": 4.296792192627836e-05, "loss": 0.2406, "step": 1807 }, { "epoch": 1.1000912686340127, "grad_norm": 0.9098191261291504, "learning_rate": 4.2959263975736676e-05, "loss": 0.2166, "step": 1808 }, { "epoch": 1.100699726194098, "grad_norm": 1.0185233354568481, "learning_rate": 4.29506015720461e-05, "loss": 0.2498, "step": 1809 }, { "epoch": 1.1013081837541832, "grad_norm": 1.152592658996582, "learning_rate": 4.294193471735456e-05, "loss": 0.2417, "step": 1810 }, { "epoch": 1.1019166413142683, "grad_norm": 0.8422519564628601, "learning_rate": 4.2933263413811065e-05, "loss": 0.2361, "step": 1811 }, { "epoch": 1.1025250988743536, "grad_norm": 0.9988545179367065, "learning_rate": 4.292458766356574e-05, "loss": 0.273, "step": 1812 }, { "epoch": 1.1031335564344387, "grad_norm": 1.068097472190857, "learning_rate": 4.29159074687698e-05, "loss": 0.2403, "step": 1813 }, { "epoch": 1.103742013994524, "grad_norm": 1.0569621324539185, "learning_rate": 4.290722283157559e-05, "loss": 0.2446, "step": 1814 }, { "epoch": 1.104350471554609, "grad_norm": 0.9275895357131958, "learning_rate": 4.289853375413652e-05, "loss": 0.2369, "step": 1815 }, { "epoch": 1.1049589291146942, "grad_norm": 1.1227593421936035, "learning_rate": 4.2889840238607135e-05, "loss": 0.262, "step": 1816 }, { "epoch": 1.1055673866747795, "grad_norm": 0.9638474583625793, "learning_rate": 4.2881142287143044e-05, "loss": 0.232, "step": 1817 }, { "epoch": 1.1061758442348646, "grad_norm": 0.9380021691322327, "learning_rate": 4.2872439901901e-05, "loss": 0.244, "step": 1818 }, { "epoch": 1.1067843017949497, "grad_norm": 0.9702509641647339, "learning_rate": 4.286373308503881e-05, "loss": 0.2451, "step": 1819 }, { "epoch": 1.107392759355035, "grad_norm": 0.9465189576148987, "learning_rate": 4.285502183871542e-05, "loss": 0.2479, "step": 1820 }, { "epoch": 1.10800121691512, "grad_norm": 0.9717510342597961, "learning_rate": 4.284630616509084e-05, "loss": 0.2178, "step": 1821 }, { "epoch": 1.1086096744752054, "grad_norm": 0.9717493653297424, "learning_rate": 4.283758606632621e-05, "loss": 0.2601, "step": 1822 }, { "epoch": 1.1092181320352905, "grad_norm": 1.4151792526245117, "learning_rate": 4.2828861544583746e-05, "loss": 0.29, "step": 1823 }, { "epoch": 1.1098265895953756, "grad_norm": 1.0413273572921753, "learning_rate": 4.282013260202675e-05, "loss": 0.262, "step": 1824 }, { "epoch": 1.110435047155461, "grad_norm": 1.031923532485962, "learning_rate": 4.281139924081966e-05, "loss": 0.2929, "step": 1825 }, { "epoch": 1.111043504715546, "grad_norm": 0.9092714786529541, "learning_rate": 4.280266146312797e-05, "loss": 0.2703, "step": 1826 }, { "epoch": 1.1116519622756313, "grad_norm": 1.0851198434829712, "learning_rate": 4.279391927111828e-05, "loss": 0.3064, "step": 1827 }, { "epoch": 1.1122604198357164, "grad_norm": 0.839637279510498, "learning_rate": 4.2785172666958305e-05, "loss": 0.2491, "step": 1828 }, { "epoch": 1.1128688773958015, "grad_norm": 0.8800391554832458, "learning_rate": 4.2776421652816834e-05, "loss": 0.2591, "step": 1829 }, { "epoch": 1.1134773349558869, "grad_norm": 0.9919121861457825, "learning_rate": 4.2767666230863743e-05, "loss": 0.2876, "step": 1830 }, { "epoch": 1.114085792515972, "grad_norm": 1.0813971757888794, "learning_rate": 4.2758906403270026e-05, "loss": 0.2622, "step": 1831 }, { "epoch": 1.1146942500760573, "grad_norm": 0.9650895595550537, "learning_rate": 4.275014217220775e-05, "loss": 0.2185, "step": 1832 }, { "epoch": 1.1153027076361424, "grad_norm": 0.9158915281295776, "learning_rate": 4.2741373539850076e-05, "loss": 0.2519, "step": 1833 }, { "epoch": 1.1159111651962275, "grad_norm": 1.001535415649414, "learning_rate": 4.273260050837126e-05, "loss": 0.268, "step": 1834 }, { "epoch": 1.1165196227563128, "grad_norm": 0.9505628347396851, "learning_rate": 4.272382307994665e-05, "loss": 0.2334, "step": 1835 }, { "epoch": 1.1171280803163979, "grad_norm": 1.0092861652374268, "learning_rate": 4.271504125675269e-05, "loss": 0.2992, "step": 1836 }, { "epoch": 1.1177365378764832, "grad_norm": 0.9263057708740234, "learning_rate": 4.270625504096688e-05, "loss": 0.2283, "step": 1837 }, { "epoch": 1.1183449954365683, "grad_norm": 1.5665264129638672, "learning_rate": 4.269746443476787e-05, "loss": 0.235, "step": 1838 }, { "epoch": 1.1189534529966534, "grad_norm": 1.0436450242996216, "learning_rate": 4.268866944033533e-05, "loss": 0.26, "step": 1839 }, { "epoch": 1.1195619105567387, "grad_norm": 0.8918569684028625, "learning_rate": 4.267987005985008e-05, "loss": 0.2506, "step": 1840 }, { "epoch": 1.1201703681168238, "grad_norm": 0.9875538349151611, "learning_rate": 4.267106629549398e-05, "loss": 0.2407, "step": 1841 }, { "epoch": 1.1207788256769091, "grad_norm": 1.0079050064086914, "learning_rate": 4.266225814945001e-05, "loss": 0.3127, "step": 1842 }, { "epoch": 1.1213872832369942, "grad_norm": 0.8947494029998779, "learning_rate": 4.265344562390222e-05, "loss": 0.2271, "step": 1843 }, { "epoch": 1.1219957407970793, "grad_norm": 0.9922159314155579, "learning_rate": 4.264462872103575e-05, "loss": 0.2515, "step": 1844 }, { "epoch": 1.1226041983571646, "grad_norm": 0.8498148918151855, "learning_rate": 4.263580744303681e-05, "loss": 0.2378, "step": 1845 }, { "epoch": 1.1232126559172497, "grad_norm": 1.0088249444961548, "learning_rate": 4.2626981792092735e-05, "loss": 0.2551, "step": 1846 }, { "epoch": 1.123821113477335, "grad_norm": 0.899652898311615, "learning_rate": 4.261815177039189e-05, "loss": 0.2272, "step": 1847 }, { "epoch": 1.1244295710374201, "grad_norm": 1.3068557977676392, "learning_rate": 4.260931738012378e-05, "loss": 0.2594, "step": 1848 }, { "epoch": 1.1250380285975052, "grad_norm": 1.037555456161499, "learning_rate": 4.260047862347894e-05, "loss": 0.2607, "step": 1849 }, { "epoch": 1.1256464861575906, "grad_norm": 1.0918004512786865, "learning_rate": 4.259163550264904e-05, "loss": 0.2699, "step": 1850 }, { "epoch": 1.1262549437176756, "grad_norm": 0.9314645528793335, "learning_rate": 4.258278801982678e-05, "loss": 0.2482, "step": 1851 }, { "epoch": 1.126863401277761, "grad_norm": 0.8757783770561218, "learning_rate": 4.257393617720599e-05, "loss": 0.2579, "step": 1852 }, { "epoch": 1.127471858837846, "grad_norm": 1.0207921266555786, "learning_rate": 4.256507997698152e-05, "loss": 0.2311, "step": 1853 }, { "epoch": 1.1280803163979312, "grad_norm": 0.9114212989807129, "learning_rate": 4.2556219421349394e-05, "loss": 0.2494, "step": 1854 }, { "epoch": 1.1286887739580165, "grad_norm": 1.038779616355896, "learning_rate": 4.254735451250661e-05, "loss": 0.2665, "step": 1855 }, { "epoch": 1.1292972315181016, "grad_norm": 0.8652352094650269, "learning_rate": 4.2538485252651326e-05, "loss": 0.2155, "step": 1856 }, { "epoch": 1.129905689078187, "grad_norm": 0.9267082810401917, "learning_rate": 4.2529611643982735e-05, "loss": 0.2499, "step": 1857 }, { "epoch": 1.130514146638272, "grad_norm": 1.116803526878357, "learning_rate": 4.252073368870113e-05, "loss": 0.2744, "step": 1858 }, { "epoch": 1.131122604198357, "grad_norm": 0.9374389052391052, "learning_rate": 4.251185138900787e-05, "loss": 0.2698, "step": 1859 }, { "epoch": 1.1317310617584424, "grad_norm": 0.8052027821540833, "learning_rate": 4.25029647471054e-05, "loss": 0.2098, "step": 1860 }, { "epoch": 1.1323395193185275, "grad_norm": 0.9955977201461792, "learning_rate": 4.249407376519722e-05, "loss": 0.2417, "step": 1861 }, { "epoch": 1.1329479768786128, "grad_norm": 0.8962797522544861, "learning_rate": 4.248517844548795e-05, "loss": 0.2428, "step": 1862 }, { "epoch": 1.133556434438698, "grad_norm": 1.001209020614624, "learning_rate": 4.247627879018323e-05, "loss": 0.2477, "step": 1863 }, { "epoch": 1.134164891998783, "grad_norm": 1.0366243124008179, "learning_rate": 4.246737480148983e-05, "loss": 0.2382, "step": 1864 }, { "epoch": 1.1347733495588683, "grad_norm": 0.9120834469795227, "learning_rate": 4.245846648161554e-05, "loss": 0.2555, "step": 1865 }, { "epoch": 1.1353818071189534, "grad_norm": 0.9888159036636353, "learning_rate": 4.2449553832769284e-05, "loss": 0.2803, "step": 1866 }, { "epoch": 1.1359902646790387, "grad_norm": 0.9909419417381287, "learning_rate": 4.2440636857161e-05, "loss": 0.27, "step": 1867 }, { "epoch": 1.1365987222391238, "grad_norm": 1.0697113275527954, "learning_rate": 4.243171555700174e-05, "loss": 0.2664, "step": 1868 }, { "epoch": 1.137207179799209, "grad_norm": 0.9328745603561401, "learning_rate": 4.242278993450361e-05, "loss": 0.2762, "step": 1869 }, { "epoch": 1.1378156373592943, "grad_norm": 0.8406645059585571, "learning_rate": 4.24138599918798e-05, "loss": 0.2151, "step": 1870 }, { "epoch": 1.1384240949193793, "grad_norm": 0.9748327136039734, "learning_rate": 4.240492573134455e-05, "loss": 0.2286, "step": 1871 }, { "epoch": 1.1390325524794647, "grad_norm": 0.8779383301734924, "learning_rate": 4.239598715511319e-05, "loss": 0.2185, "step": 1872 }, { "epoch": 1.1396410100395498, "grad_norm": 0.96523517370224, "learning_rate": 4.238704426540213e-05, "loss": 0.2499, "step": 1873 }, { "epoch": 1.1402494675996349, "grad_norm": 1.156562328338623, "learning_rate": 4.23780970644288e-05, "loss": 0.272, "step": 1874 }, { "epoch": 1.1408579251597202, "grad_norm": 1.0613001585006714, "learning_rate": 4.236914555441177e-05, "loss": 0.268, "step": 1875 }, { "epoch": 1.1414663827198053, "grad_norm": 0.803700864315033, "learning_rate": 4.236018973757061e-05, "loss": 0.2026, "step": 1876 }, { "epoch": 1.1420748402798906, "grad_norm": 1.021272897720337, "learning_rate": 4.235122961612602e-05, "loss": 0.2904, "step": 1877 }, { "epoch": 1.1426832978399757, "grad_norm": 0.7920538187026978, "learning_rate": 4.234226519229971e-05, "loss": 0.2118, "step": 1878 }, { "epoch": 1.1432917554000608, "grad_norm": 1.0034376382827759, "learning_rate": 4.233329646831449e-05, "loss": 0.2355, "step": 1879 }, { "epoch": 1.143900212960146, "grad_norm": 1.0037422180175781, "learning_rate": 4.2324323446394244e-05, "loss": 0.2941, "step": 1880 }, { "epoch": 1.1445086705202312, "grad_norm": 0.7395780086517334, "learning_rate": 4.2315346128763886e-05, "loss": 0.1974, "step": 1881 }, { "epoch": 1.1451171280803165, "grad_norm": 0.9243549108505249, "learning_rate": 4.2306364517649434e-05, "loss": 0.2657, "step": 1882 }, { "epoch": 1.1457255856404016, "grad_norm": 0.8601450324058533, "learning_rate": 4.2297378615277935e-05, "loss": 0.2241, "step": 1883 }, { "epoch": 1.1463340432004867, "grad_norm": 0.9598115682601929, "learning_rate": 4.228838842387755e-05, "loss": 0.2484, "step": 1884 }, { "epoch": 1.146942500760572, "grad_norm": 1.2198432683944702, "learning_rate": 4.2279393945677437e-05, "loss": 0.217, "step": 1885 }, { "epoch": 1.1475509583206571, "grad_norm": 0.9545729756355286, "learning_rate": 4.227039518290786e-05, "loss": 0.2229, "step": 1886 }, { "epoch": 1.1481594158807424, "grad_norm": 1.0584890842437744, "learning_rate": 4.226139213780016e-05, "loss": 0.2805, "step": 1887 }, { "epoch": 1.1487678734408275, "grad_norm": 0.9882286787033081, "learning_rate": 4.225238481258669e-05, "loss": 0.2621, "step": 1888 }, { "epoch": 1.1493763310009126, "grad_norm": 0.944648265838623, "learning_rate": 4.22433732095009e-05, "loss": 0.2358, "step": 1889 }, { "epoch": 1.149984788560998, "grad_norm": 0.7821002006530762, "learning_rate": 4.223435733077731e-05, "loss": 0.1971, "step": 1890 }, { "epoch": 1.150593246121083, "grad_norm": 1.0753979682922363, "learning_rate": 4.2225337178651444e-05, "loss": 0.2761, "step": 1891 }, { "epoch": 1.1512017036811681, "grad_norm": 0.9069803953170776, "learning_rate": 4.221631275535996e-05, "loss": 0.258, "step": 1892 }, { "epoch": 1.1518101612412535, "grad_norm": 1.0057759284973145, "learning_rate": 4.2207284063140514e-05, "loss": 0.2679, "step": 1893 }, { "epoch": 1.1524186188013386, "grad_norm": 1.0301549434661865, "learning_rate": 4.2198251104231854e-05, "loss": 0.2995, "step": 1894 }, { "epoch": 1.1530270763614239, "grad_norm": 0.9355301260948181, "learning_rate": 4.218921388087379e-05, "loss": 0.2179, "step": 1895 }, { "epoch": 1.153635533921509, "grad_norm": 1.027159571647644, "learning_rate": 4.2180172395307156e-05, "loss": 0.2577, "step": 1896 }, { "epoch": 1.154243991481594, "grad_norm": 1.046653389930725, "learning_rate": 4.217112664977387e-05, "loss": 0.2326, "step": 1897 }, { "epoch": 1.1548524490416794, "grad_norm": 0.9612645506858826, "learning_rate": 4.216207664651691e-05, "loss": 0.2137, "step": 1898 }, { "epoch": 1.1554609066017645, "grad_norm": 0.9418598413467407, "learning_rate": 4.21530223877803e-05, "loss": 0.2772, "step": 1899 }, { "epoch": 1.1560693641618498, "grad_norm": 1.0486358404159546, "learning_rate": 4.2143963875809096e-05, "loss": 0.2663, "step": 1900 }, { "epoch": 1.156677821721935, "grad_norm": 1.084235668182373, "learning_rate": 4.213490111284945e-05, "loss": 0.2499, "step": 1901 }, { "epoch": 1.15728627928202, "grad_norm": 0.9962263107299805, "learning_rate": 4.212583410114855e-05, "loss": 0.2512, "step": 1902 }, { "epoch": 1.1578947368421053, "grad_norm": 0.9712923765182495, "learning_rate": 4.2116762842954625e-05, "loss": 0.2396, "step": 1903 }, { "epoch": 1.1585031944021904, "grad_norm": 0.9264492988586426, "learning_rate": 4.210768734051699e-05, "loss": 0.2584, "step": 1904 }, { "epoch": 1.1591116519622757, "grad_norm": 1.201596975326538, "learning_rate": 4.209860759608597e-05, "loss": 0.287, "step": 1905 }, { "epoch": 1.1597201095223608, "grad_norm": 0.9659140110015869, "learning_rate": 4.2089523611912966e-05, "loss": 0.2453, "step": 1906 }, { "epoch": 1.160328567082446, "grad_norm": 0.8334513306617737, "learning_rate": 4.208043539025044e-05, "loss": 0.2573, "step": 1907 }, { "epoch": 1.1609370246425312, "grad_norm": 0.9269493222236633, "learning_rate": 4.2071342933351886e-05, "loss": 0.2988, "step": 1908 }, { "epoch": 1.1615454822026163, "grad_norm": 0.8946632146835327, "learning_rate": 4.2062246243471846e-05, "loss": 0.2378, "step": 1909 }, { "epoch": 1.1621539397627014, "grad_norm": 0.9380949139595032, "learning_rate": 4.2053145322865936e-05, "loss": 0.2316, "step": 1910 }, { "epoch": 1.1627623973227867, "grad_norm": 0.8564540147781372, "learning_rate": 4.204404017379079e-05, "loss": 0.2255, "step": 1911 }, { "epoch": 1.1633708548828718, "grad_norm": 1.0175832509994507, "learning_rate": 4.2034930798504114e-05, "loss": 0.2384, "step": 1912 }, { "epoch": 1.1639793124429572, "grad_norm": 0.9850523471832275, "learning_rate": 4.202581719926465e-05, "loss": 0.2682, "step": 1913 }, { "epoch": 1.1645877700030423, "grad_norm": 1.0594969987869263, "learning_rate": 4.201669937833219e-05, "loss": 0.2491, "step": 1914 }, { "epoch": 1.1651962275631274, "grad_norm": 0.9938765168190002, "learning_rate": 4.2007577337967574e-05, "loss": 0.2847, "step": 1915 }, { "epoch": 1.1658046851232127, "grad_norm": 0.8707656860351562, "learning_rate": 4.19984510804327e-05, "loss": 0.2301, "step": 1916 }, { "epoch": 1.1664131426832978, "grad_norm": 0.9624968767166138, "learning_rate": 4.1989320607990474e-05, "loss": 0.2459, "step": 1917 }, { "epoch": 1.167021600243383, "grad_norm": 1.0143096446990967, "learning_rate": 4.1980185922904894e-05, "loss": 0.2169, "step": 1918 }, { "epoch": 1.1676300578034682, "grad_norm": 0.8604801893234253, "learning_rate": 4.197104702744097e-05, "loss": 0.2162, "step": 1919 }, { "epoch": 1.1682385153635533, "grad_norm": 1.1591583490371704, "learning_rate": 4.1961903923864775e-05, "loss": 0.2425, "step": 1920 }, { "epoch": 1.1688469729236386, "grad_norm": 1.020859718322754, "learning_rate": 4.1952756614443415e-05, "loss": 0.2657, "step": 1921 }, { "epoch": 1.1694554304837237, "grad_norm": 0.9535967111587524, "learning_rate": 4.194360510144504e-05, "loss": 0.2221, "step": 1922 }, { "epoch": 1.170063888043809, "grad_norm": 1.1001362800598145, "learning_rate": 4.1934449387138845e-05, "loss": 0.3064, "step": 1923 }, { "epoch": 1.170672345603894, "grad_norm": 0.851999819278717, "learning_rate": 4.192528947379506e-05, "loss": 0.2202, "step": 1924 }, { "epoch": 1.1712808031639792, "grad_norm": 0.888411283493042, "learning_rate": 4.1916125363684965e-05, "loss": 0.2617, "step": 1925 }, { "epoch": 1.1718892607240645, "grad_norm": 0.9610258340835571, "learning_rate": 4.1906957059080886e-05, "loss": 0.2411, "step": 1926 }, { "epoch": 1.1724977182841496, "grad_norm": 0.9737393856048584, "learning_rate": 4.189778456225617e-05, "loss": 0.2612, "step": 1927 }, { "epoch": 1.173106175844235, "grad_norm": 0.9638609886169434, "learning_rate": 4.188860787548522e-05, "loss": 0.2328, "step": 1928 }, { "epoch": 1.17371463340432, "grad_norm": 1.1524028778076172, "learning_rate": 4.187942700104346e-05, "loss": 0.2513, "step": 1929 }, { "epoch": 1.1743230909644051, "grad_norm": 0.9276042580604553, "learning_rate": 4.1870241941207375e-05, "loss": 0.2542, "step": 1930 }, { "epoch": 1.1749315485244904, "grad_norm": 0.9600099325180054, "learning_rate": 4.186105269825449e-05, "loss": 0.2704, "step": 1931 }, { "epoch": 1.1755400060845755, "grad_norm": 0.940571129322052, "learning_rate": 4.1851859274463326e-05, "loss": 0.2159, "step": 1932 }, { "epoch": 1.1761484636446609, "grad_norm": 0.9095624685287476, "learning_rate": 4.184266167211348e-05, "loss": 0.2321, "step": 1933 }, { "epoch": 1.176756921204746, "grad_norm": 0.9772641658782959, "learning_rate": 4.183345989348558e-05, "loss": 0.2242, "step": 1934 }, { "epoch": 1.177365378764831, "grad_norm": 0.8371894359588623, "learning_rate": 4.182425394086128e-05, "loss": 0.1883, "step": 1935 }, { "epoch": 1.1779738363249164, "grad_norm": 0.9441771507263184, "learning_rate": 4.181504381652327e-05, "loss": 0.2521, "step": 1936 }, { "epoch": 1.1785822938850015, "grad_norm": 1.1066073179244995, "learning_rate": 4.180582952275528e-05, "loss": 0.275, "step": 1937 }, { "epoch": 1.1791907514450868, "grad_norm": 0.9698868989944458, "learning_rate": 4.179661106184207e-05, "loss": 0.2583, "step": 1938 }, { "epoch": 1.1797992090051719, "grad_norm": 0.9682748317718506, "learning_rate": 4.178738843606943e-05, "loss": 0.2726, "step": 1939 }, { "epoch": 1.180407666565257, "grad_norm": 0.9256848692893982, "learning_rate": 4.1778161647724203e-05, "loss": 0.225, "step": 1940 }, { "epoch": 1.1810161241253423, "grad_norm": 0.8582398891448975, "learning_rate": 4.176893069909422e-05, "loss": 0.2164, "step": 1941 }, { "epoch": 1.1816245816854274, "grad_norm": 0.9054145812988281, "learning_rate": 4.1759695592468395e-05, "loss": 0.2086, "step": 1942 }, { "epoch": 1.1822330392455127, "grad_norm": 0.9626482725143433, "learning_rate": 4.175045633013665e-05, "loss": 0.2939, "step": 1943 }, { "epoch": 1.1828414968055978, "grad_norm": 0.8653888702392578, "learning_rate": 4.1741212914389914e-05, "loss": 0.2459, "step": 1944 }, { "epoch": 1.183449954365683, "grad_norm": 0.94068843126297, "learning_rate": 4.173196534752019e-05, "loss": 0.2704, "step": 1945 }, { "epoch": 1.1840584119257682, "grad_norm": 0.8980154991149902, "learning_rate": 4.1722713631820485e-05, "loss": 0.2332, "step": 1946 }, { "epoch": 1.1846668694858533, "grad_norm": 0.9154375791549683, "learning_rate": 4.171345776958483e-05, "loss": 0.2293, "step": 1947 }, { "epoch": 1.1852753270459386, "grad_norm": 0.9358067512512207, "learning_rate": 4.17041977631083e-05, "loss": 0.2325, "step": 1948 }, { "epoch": 1.1858837846060237, "grad_norm": 0.8783141374588013, "learning_rate": 4.1694933614686995e-05, "loss": 0.2125, "step": 1949 }, { "epoch": 1.1864922421661088, "grad_norm": 1.0046310424804688, "learning_rate": 4.168566532661803e-05, "loss": 0.2796, "step": 1950 }, { "epoch": 1.1871006997261941, "grad_norm": 0.9390839338302612, "learning_rate": 4.167639290119956e-05, "loss": 0.2875, "step": 1951 }, { "epoch": 1.1877091572862792, "grad_norm": 0.9401413202285767, "learning_rate": 4.166711634073075e-05, "loss": 0.2614, "step": 1952 }, { "epoch": 1.1883176148463646, "grad_norm": 1.0055378675460815, "learning_rate": 4.1657835647511804e-05, "loss": 0.2489, "step": 1953 }, { "epoch": 1.1889260724064497, "grad_norm": 0.8895730376243591, "learning_rate": 4.164855082384396e-05, "loss": 0.2491, "step": 1954 }, { "epoch": 1.1895345299665347, "grad_norm": 0.8464339375495911, "learning_rate": 4.163926187202946e-05, "loss": 0.199, "step": 1955 }, { "epoch": 1.19014298752662, "grad_norm": 0.9686809778213501, "learning_rate": 4.162996879437156e-05, "loss": 0.2742, "step": 1956 }, { "epoch": 1.1907514450867052, "grad_norm": 1.0053505897521973, "learning_rate": 4.1620671593174585e-05, "loss": 0.2875, "step": 1957 }, { "epoch": 1.1913599026467905, "grad_norm": 1.0522572994232178, "learning_rate": 4.1611370270743826e-05, "loss": 0.2174, "step": 1958 }, { "epoch": 1.1919683602068756, "grad_norm": 0.8975366353988647, "learning_rate": 4.160206482938565e-05, "loss": 0.2535, "step": 1959 }, { "epoch": 1.1925768177669607, "grad_norm": 0.8640589714050293, "learning_rate": 4.159275527140739e-05, "loss": 0.2178, "step": 1960 }, { "epoch": 1.193185275327046, "grad_norm": 0.9383702874183655, "learning_rate": 4.1583441599117453e-05, "loss": 0.2912, "step": 1961 }, { "epoch": 1.193793732887131, "grad_norm": 0.8808874487876892, "learning_rate": 4.1574123814825226e-05, "loss": 0.2189, "step": 1962 }, { "epoch": 1.1944021904472164, "grad_norm": 0.9913263320922852, "learning_rate": 4.156480192084114e-05, "loss": 0.2931, "step": 1963 }, { "epoch": 1.1950106480073015, "grad_norm": 1.0202916860580444, "learning_rate": 4.155547591947663e-05, "loss": 0.2677, "step": 1964 }, { "epoch": 1.1956191055673866, "grad_norm": 1.405519962310791, "learning_rate": 4.154614581304416e-05, "loss": 0.3883, "step": 1965 }, { "epoch": 1.196227563127472, "grad_norm": 0.9594994783401489, "learning_rate": 4.15368116038572e-05, "loss": 0.2595, "step": 1966 }, { "epoch": 1.196836020687557, "grad_norm": 1.043186902999878, "learning_rate": 4.1527473294230255e-05, "loss": 0.2648, "step": 1967 }, { "epoch": 1.1974444782476423, "grad_norm": 0.9707902073860168, "learning_rate": 4.151813088647883e-05, "loss": 0.2546, "step": 1968 }, { "epoch": 1.1980529358077274, "grad_norm": 0.9176664352416992, "learning_rate": 4.1508784382919466e-05, "loss": 0.2161, "step": 1969 }, { "epoch": 1.1986613933678125, "grad_norm": 0.9123236536979675, "learning_rate": 4.149943378586968e-05, "loss": 0.242, "step": 1970 }, { "epoch": 1.1992698509278978, "grad_norm": 0.9186065793037415, "learning_rate": 4.149007909764805e-05, "loss": 0.2001, "step": 1971 }, { "epoch": 1.199878308487983, "grad_norm": 1.0215867757797241, "learning_rate": 4.148072032057415e-05, "loss": 0.2529, "step": 1972 }, { "epoch": 1.2004867660480683, "grad_norm": 0.9522402882575989, "learning_rate": 4.1471357456968556e-05, "loss": 0.2096, "step": 1973 }, { "epoch": 1.2010952236081534, "grad_norm": 0.9875971674919128, "learning_rate": 4.146199050915288e-05, "loss": 0.2659, "step": 1974 }, { "epoch": 1.2017036811682384, "grad_norm": 0.892595112323761, "learning_rate": 4.1452619479449714e-05, "loss": 0.2422, "step": 1975 }, { "epoch": 1.2023121387283238, "grad_norm": 0.9332039952278137, "learning_rate": 4.14432443701827e-05, "loss": 0.1921, "step": 1976 }, { "epoch": 1.2029205962884089, "grad_norm": 0.8022421598434448, "learning_rate": 4.143386518367648e-05, "loss": 0.1945, "step": 1977 }, { "epoch": 1.2035290538484942, "grad_norm": 0.9475464224815369, "learning_rate": 4.142448192225669e-05, "loss": 0.2119, "step": 1978 }, { "epoch": 1.2041375114085793, "grad_norm": 0.8126545548439026, "learning_rate": 4.1415094588249975e-05, "loss": 0.2195, "step": 1979 }, { "epoch": 1.2047459689686644, "grad_norm": 1.1211334466934204, "learning_rate": 4.140570318398403e-05, "loss": 0.2904, "step": 1980 }, { "epoch": 1.2053544265287497, "grad_norm": 1.054579734802246, "learning_rate": 4.1396307711787516e-05, "loss": 0.2495, "step": 1981 }, { "epoch": 1.2059628840888348, "grad_norm": 0.9004554152488708, "learning_rate": 4.1386908173990126e-05, "loss": 0.2597, "step": 1982 }, { "epoch": 1.20657134164892, "grad_norm": 0.9053953886032104, "learning_rate": 4.1377504572922534e-05, "loss": 0.2532, "step": 1983 }, { "epoch": 1.2071797992090052, "grad_norm": 0.8492591381072998, "learning_rate": 4.136809691091647e-05, "loss": 0.2266, "step": 1984 }, { "epoch": 1.2077882567690903, "grad_norm": 0.9747501611709595, "learning_rate": 4.135868519030463e-05, "loss": 0.2336, "step": 1985 }, { "epoch": 1.2083967143291756, "grad_norm": 0.907906711101532, "learning_rate": 4.134926941342071e-05, "loss": 0.2306, "step": 1986 }, { "epoch": 1.2090051718892607, "grad_norm": 1.0222368240356445, "learning_rate": 4.1339849582599454e-05, "loss": 0.2196, "step": 1987 }, { "epoch": 1.209613629449346, "grad_norm": 1.05088210105896, "learning_rate": 4.1330425700176586e-05, "loss": 0.2503, "step": 1988 }, { "epoch": 1.2102220870094311, "grad_norm": 1.1211512088775635, "learning_rate": 4.132099776848882e-05, "loss": 0.2371, "step": 1989 }, { "epoch": 1.2108305445695162, "grad_norm": 0.9060412645339966, "learning_rate": 4.1311565789873914e-05, "loss": 0.2377, "step": 1990 }, { "epoch": 1.2114390021296015, "grad_norm": 0.8177694082260132, "learning_rate": 4.1302129766670586e-05, "loss": 0.2452, "step": 1991 }, { "epoch": 1.2120474596896866, "grad_norm": 0.9358994960784912, "learning_rate": 4.129268970121858e-05, "loss": 0.2222, "step": 1992 }, { "epoch": 1.212655917249772, "grad_norm": 0.8588563203811646, "learning_rate": 4.128324559585863e-05, "loss": 0.219, "step": 1993 }, { "epoch": 1.213264374809857, "grad_norm": 0.9672694206237793, "learning_rate": 4.127379745293251e-05, "loss": 0.2587, "step": 1994 }, { "epoch": 1.2138728323699421, "grad_norm": 0.8904327154159546, "learning_rate": 4.1264345274782937e-05, "loss": 0.2371, "step": 1995 }, { "epoch": 1.2144812899300275, "grad_norm": 1.017808198928833, "learning_rate": 4.125488906375367e-05, "loss": 0.2382, "step": 1996 }, { "epoch": 1.2150897474901126, "grad_norm": 0.9038395881652832, "learning_rate": 4.1245428822189444e-05, "loss": 0.2444, "step": 1997 }, { "epoch": 1.2156982050501977, "grad_norm": 1.1696877479553223, "learning_rate": 4.123596455243603e-05, "loss": 0.2993, "step": 1998 }, { "epoch": 1.216306662610283, "grad_norm": 0.8989875316619873, "learning_rate": 4.122649625684014e-05, "loss": 0.1964, "step": 1999 }, { "epoch": 1.216915120170368, "grad_norm": 1.0550363063812256, "learning_rate": 4.1217023937749544e-05, "loss": 0.2987, "step": 2000 }, { "epoch": 1.2175235777304534, "grad_norm": 0.9538779258728027, "learning_rate": 4.120754759751296e-05, "loss": 0.2522, "step": 2001 }, { "epoch": 1.2181320352905385, "grad_norm": 1.279292106628418, "learning_rate": 4.1198067238480145e-05, "loss": 0.2853, "step": 2002 }, { "epoch": 1.2187404928506236, "grad_norm": 1.0189933776855469, "learning_rate": 4.118858286300182e-05, "loss": 0.2265, "step": 2003 }, { "epoch": 1.219348950410709, "grad_norm": 0.8648117184638977, "learning_rate": 4.117909447342972e-05, "loss": 0.2193, "step": 2004 }, { "epoch": 1.219957407970794, "grad_norm": 0.8088192939758301, "learning_rate": 4.1169602072116567e-05, "loss": 0.2087, "step": 2005 }, { "epoch": 1.2205658655308793, "grad_norm": 1.034005045890808, "learning_rate": 4.116010566141608e-05, "loss": 0.2648, "step": 2006 }, { "epoch": 1.2211743230909644, "grad_norm": 1.020052194595337, "learning_rate": 4.115060524368297e-05, "loss": 0.2715, "step": 2007 }, { "epoch": 1.2217827806510495, "grad_norm": 0.8594374656677246, "learning_rate": 4.114110082127296e-05, "loss": 0.2395, "step": 2008 }, { "epoch": 1.2223912382111348, "grad_norm": 0.9257277250289917, "learning_rate": 4.113159239654273e-05, "loss": 0.2356, "step": 2009 }, { "epoch": 1.22299969577122, "grad_norm": 0.9078371524810791, "learning_rate": 4.1122079971849994e-05, "loss": 0.2536, "step": 2010 }, { "epoch": 1.2236081533313052, "grad_norm": 0.9304254055023193, "learning_rate": 4.1112563549553427e-05, "loss": 0.2504, "step": 2011 }, { "epoch": 1.2242166108913903, "grad_norm": 0.9377618432044983, "learning_rate": 4.11030431320127e-05, "loss": 0.2245, "step": 2012 }, { "epoch": 1.2248250684514754, "grad_norm": 0.8383059501647949, "learning_rate": 4.1093518721588484e-05, "loss": 0.1871, "step": 2013 }, { "epoch": 1.2254335260115607, "grad_norm": 0.853079617023468, "learning_rate": 4.108399032064243e-05, "loss": 0.2169, "step": 2014 }, { "epoch": 1.2260419835716458, "grad_norm": 0.9059860110282898, "learning_rate": 4.10744579315372e-05, "loss": 0.2204, "step": 2015 }, { "epoch": 1.226650441131731, "grad_norm": 0.9250550866127014, "learning_rate": 4.1064921556636413e-05, "loss": 0.2556, "step": 2016 }, { "epoch": 1.2272588986918163, "grad_norm": 0.9633865356445312, "learning_rate": 4.1055381198304705e-05, "loss": 0.2426, "step": 2017 }, { "epoch": 1.2278673562519014, "grad_norm": 0.865479588508606, "learning_rate": 4.1045836858907676e-05, "loss": 0.2093, "step": 2018 }, { "epoch": 1.2284758138119867, "grad_norm": 0.8409626483917236, "learning_rate": 4.1036288540811935e-05, "loss": 0.1839, "step": 2019 }, { "epoch": 1.2290842713720718, "grad_norm": 0.8517163991928101, "learning_rate": 4.1026736246385055e-05, "loss": 0.1791, "step": 2020 }, { "epoch": 1.2296927289321569, "grad_norm": 1.0229874849319458, "learning_rate": 4.101717997799562e-05, "loss": 0.2466, "step": 2021 }, { "epoch": 1.2303011864922422, "grad_norm": 1.0059734582901, "learning_rate": 4.100761973801317e-05, "loss": 0.2218, "step": 2022 }, { "epoch": 1.2309096440523273, "grad_norm": 1.6977488994598389, "learning_rate": 4.0998055528808266e-05, "loss": 0.2822, "step": 2023 }, { "epoch": 1.2315181016124126, "grad_norm": 1.067182183265686, "learning_rate": 4.0988487352752414e-05, "loss": 0.2576, "step": 2024 }, { "epoch": 1.2321265591724977, "grad_norm": 0.9440335631370544, "learning_rate": 4.097891521221814e-05, "loss": 0.2397, "step": 2025 }, { "epoch": 1.2327350167325828, "grad_norm": 0.9076058268547058, "learning_rate": 4.096933910957892e-05, "loss": 0.2329, "step": 2026 }, { "epoch": 1.233343474292668, "grad_norm": 0.8461357951164246, "learning_rate": 4.0959759047209234e-05, "loss": 0.2133, "step": 2027 }, { "epoch": 1.2339519318527532, "grad_norm": 0.8329628109931946, "learning_rate": 4.095017502748455e-05, "loss": 0.2251, "step": 2028 }, { "epoch": 1.2345603894128385, "grad_norm": 0.8861196041107178, "learning_rate": 4.094058705278129e-05, "loss": 0.2232, "step": 2029 }, { "epoch": 1.2351688469729236, "grad_norm": 0.9130899906158447, "learning_rate": 4.093099512547687e-05, "loss": 0.2517, "step": 2030 }, { "epoch": 1.2357773045330087, "grad_norm": 0.8920985460281372, "learning_rate": 4.09213992479497e-05, "loss": 0.2403, "step": 2031 }, { "epoch": 1.236385762093094, "grad_norm": 0.9351683259010315, "learning_rate": 4.0911799422579155e-05, "loss": 0.2416, "step": 2032 }, { "epoch": 1.2369942196531791, "grad_norm": 0.8944339156150818, "learning_rate": 4.090219565174559e-05, "loss": 0.2441, "step": 2033 }, { "epoch": 1.2376026772132644, "grad_norm": 0.9203975200653076, "learning_rate": 4.089258793783034e-05, "loss": 0.2541, "step": 2034 }, { "epoch": 1.2382111347733495, "grad_norm": 0.880241334438324, "learning_rate": 4.0882976283215714e-05, "loss": 0.2492, "step": 2035 }, { "epoch": 1.2388195923334346, "grad_norm": 0.9325846433639526, "learning_rate": 4.087336069028501e-05, "loss": 0.2663, "step": 2036 }, { "epoch": 1.23942804989352, "grad_norm": 1.0154072046279907, "learning_rate": 4.086374116142249e-05, "loss": 0.2699, "step": 2037 }, { "epoch": 1.240036507453605, "grad_norm": 0.9841558933258057, "learning_rate": 4.0854117699013396e-05, "loss": 0.2496, "step": 2038 }, { "epoch": 1.2406449650136904, "grad_norm": 0.8984540700912476, "learning_rate": 4.0844490305443934e-05, "loss": 0.2008, "step": 2039 }, { "epoch": 1.2412534225737755, "grad_norm": 0.8929226994514465, "learning_rate": 4.083485898310131e-05, "loss": 0.2588, "step": 2040 }, { "epoch": 1.2418618801338606, "grad_norm": 0.7902721762657166, "learning_rate": 4.0825223734373696e-05, "loss": 0.1924, "step": 2041 }, { "epoch": 1.2424703376939459, "grad_norm": 0.8901758790016174, "learning_rate": 4.081558456165022e-05, "loss": 0.246, "step": 2042 }, { "epoch": 1.243078795254031, "grad_norm": 1.1308690309524536, "learning_rate": 4.080594146732099e-05, "loss": 0.286, "step": 2043 }, { "epoch": 1.2436872528141163, "grad_norm": 1.1202785968780518, "learning_rate": 4.079629445377712e-05, "loss": 0.3143, "step": 2044 }, { "epoch": 1.2442957103742014, "grad_norm": 0.8539735078811646, "learning_rate": 4.078664352341063e-05, "loss": 0.2171, "step": 2045 }, { "epoch": 1.2449041679342865, "grad_norm": 1.0043787956237793, "learning_rate": 4.077698867861457e-05, "loss": 0.2746, "step": 2046 }, { "epoch": 1.2455126254943718, "grad_norm": 1.0371819734573364, "learning_rate": 4.076732992178294e-05, "loss": 0.269, "step": 2047 }, { "epoch": 1.246121083054457, "grad_norm": 0.9970436096191406, "learning_rate": 4.075766725531069e-05, "loss": 0.2185, "step": 2048 }, { "epoch": 1.2467295406145422, "grad_norm": 1.058952808380127, "learning_rate": 4.074800068159379e-05, "loss": 0.2503, "step": 2049 }, { "epoch": 1.2473379981746273, "grad_norm": 0.8718734383583069, "learning_rate": 4.073833020302912e-05, "loss": 0.2422, "step": 2050 }, { "epoch": 1.2479464557347124, "grad_norm": 0.8423463106155396, "learning_rate": 4.0728655822014574e-05, "loss": 0.2396, "step": 2051 }, { "epoch": 1.2485549132947977, "grad_norm": 0.9200191497802734, "learning_rate": 4.071897754094898e-05, "loss": 0.201, "step": 2052 }, { "epoch": 1.2491633708548828, "grad_norm": 1.0252035856246948, "learning_rate": 4.0709295362232156e-05, "loss": 0.2548, "step": 2053 }, { "epoch": 1.2497718284149681, "grad_norm": 0.875744640827179, "learning_rate": 4.069960928826488e-05, "loss": 0.2319, "step": 2054 }, { "epoch": 1.2503802859750532, "grad_norm": 1.0111494064331055, "learning_rate": 4.0689919321448885e-05, "loss": 0.2188, "step": 2055 }, { "epoch": 1.2509887435351383, "grad_norm": 1.0929096937179565, "learning_rate": 4.0680225464186895e-05, "loss": 0.2029, "step": 2056 }, { "epoch": 1.2515972010952237, "grad_norm": 0.9133079051971436, "learning_rate": 4.067052771888257e-05, "loss": 0.2084, "step": 2057 }, { "epoch": 1.2522056586553088, "grad_norm": 0.8972569108009338, "learning_rate": 4.066082608794055e-05, "loss": 0.238, "step": 2058 }, { "epoch": 1.252814116215394, "grad_norm": 0.9103267788887024, "learning_rate": 4.0651120573766447e-05, "loss": 0.2144, "step": 2059 }, { "epoch": 1.2534225737754792, "grad_norm": 0.8961523771286011, "learning_rate": 4.0641411178766795e-05, "loss": 0.2639, "step": 2060 }, { "epoch": 1.2540310313355643, "grad_norm": 0.907996654510498, "learning_rate": 4.0631697905349144e-05, "loss": 0.2035, "step": 2061 }, { "epoch": 1.2546394888956496, "grad_norm": 1.2141492366790771, "learning_rate": 4.0621980755921974e-05, "loss": 0.2882, "step": 2062 }, { "epoch": 1.2552479464557347, "grad_norm": 0.889756441116333, "learning_rate": 4.061225973289473e-05, "loss": 0.2281, "step": 2063 }, { "epoch": 1.25585640401582, "grad_norm": 0.9822306632995605, "learning_rate": 4.060253483867783e-05, "loss": 0.2966, "step": 2064 }, { "epoch": 1.256464861575905, "grad_norm": 0.9463213682174683, "learning_rate": 4.059280607568263e-05, "loss": 0.2358, "step": 2065 }, { "epoch": 1.2570733191359902, "grad_norm": 0.9215368032455444, "learning_rate": 4.058307344632147e-05, "loss": 0.2188, "step": 2066 }, { "epoch": 1.2576817766960755, "grad_norm": 0.9397695064544678, "learning_rate": 4.057333695300762e-05, "loss": 0.2423, "step": 2067 }, { "epoch": 1.2582902342561606, "grad_norm": 0.9168745875358582, "learning_rate": 4.056359659815534e-05, "loss": 0.1766, "step": 2068 }, { "epoch": 1.258898691816246, "grad_norm": 0.8033076524734497, "learning_rate": 4.055385238417984e-05, "loss": 0.2049, "step": 2069 }, { "epoch": 1.259507149376331, "grad_norm": 0.8447003960609436, "learning_rate": 4.054410431349724e-05, "loss": 0.2532, "step": 2070 }, { "epoch": 1.260115606936416, "grad_norm": 0.9117369651794434, "learning_rate": 4.053435238852469e-05, "loss": 0.2495, "step": 2071 }, { "epoch": 1.2607240644965014, "grad_norm": 0.838207483291626, "learning_rate": 4.052459661168025e-05, "loss": 0.2442, "step": 2072 }, { "epoch": 1.2613325220565865, "grad_norm": 0.9510546922683716, "learning_rate": 4.051483698538295e-05, "loss": 0.2255, "step": 2073 }, { "epoch": 1.2619409796166718, "grad_norm": 1.0252444744110107, "learning_rate": 4.0505073512052774e-05, "loss": 0.2537, "step": 2074 }, { "epoch": 1.262549437176757, "grad_norm": 0.8936956524848938, "learning_rate": 4.049530619411065e-05, "loss": 0.1971, "step": 2075 }, { "epoch": 1.263157894736842, "grad_norm": 0.9542527198791504, "learning_rate": 4.0485535033978455e-05, "loss": 0.2105, "step": 2076 }, { "epoch": 1.2637663522969274, "grad_norm": 0.9937785863876343, "learning_rate": 4.047576003407905e-05, "loss": 0.2452, "step": 2077 }, { "epoch": 1.2643748098570124, "grad_norm": 0.9868922233581543, "learning_rate": 4.046598119683621e-05, "loss": 0.2215, "step": 2078 }, { "epoch": 1.2649832674170978, "grad_norm": 0.9198853373527527, "learning_rate": 4.045619852467469e-05, "loss": 0.2079, "step": 2079 }, { "epoch": 1.2655917249771829, "grad_norm": 0.9050553441047668, "learning_rate": 4.0446412020020185e-05, "loss": 0.2373, "step": 2080 }, { "epoch": 1.266200182537268, "grad_norm": 1.0860339403152466, "learning_rate": 4.0436621685299334e-05, "loss": 0.2303, "step": 2081 }, { "epoch": 1.2668086400973533, "grad_norm": 0.9267223477363586, "learning_rate": 4.0426827522939735e-05, "loss": 0.2241, "step": 2082 }, { "epoch": 1.2674170976574384, "grad_norm": 0.8770591616630554, "learning_rate": 4.041702953536994e-05, "loss": 0.2209, "step": 2083 }, { "epoch": 1.2680255552175237, "grad_norm": 0.7461957335472107, "learning_rate": 4.0407227725019426e-05, "loss": 0.2294, "step": 2084 }, { "epoch": 1.2686340127776088, "grad_norm": 1.0627920627593994, "learning_rate": 4.039742209431864e-05, "loss": 0.2419, "step": 2085 }, { "epoch": 1.2692424703376939, "grad_norm": 0.9372715353965759, "learning_rate": 4.0387612645698974e-05, "loss": 0.2496, "step": 2086 }, { "epoch": 1.2698509278977792, "grad_norm": 0.9207741022109985, "learning_rate": 4.037779938159276e-05, "loss": 0.2682, "step": 2087 }, { "epoch": 1.2704593854578643, "grad_norm": 0.9947006702423096, "learning_rate": 4.036798230443328e-05, "loss": 0.241, "step": 2088 }, { "epoch": 1.2710678430179496, "grad_norm": 0.9042183756828308, "learning_rate": 4.035816141665475e-05, "loss": 0.2357, "step": 2089 }, { "epoch": 1.2716763005780347, "grad_norm": 0.9522048830986023, "learning_rate": 4.0348336720692345e-05, "loss": 0.1831, "step": 2090 }, { "epoch": 1.2722847581381198, "grad_norm": 0.8158754706382751, "learning_rate": 4.0338508218982197e-05, "loss": 0.2126, "step": 2091 }, { "epoch": 1.2728932156982051, "grad_norm": 1.012651801109314, "learning_rate": 4.032867591396135e-05, "loss": 0.2472, "step": 2092 }, { "epoch": 1.2735016732582902, "grad_norm": 0.9281013011932373, "learning_rate": 4.0318839808067796e-05, "loss": 0.2052, "step": 2093 }, { "epoch": 1.2741101308183755, "grad_norm": 1.0591697692871094, "learning_rate": 4.0308999903740496e-05, "loss": 0.2503, "step": 2094 }, { "epoch": 1.2747185883784606, "grad_norm": 0.852708637714386, "learning_rate": 4.029915620341933e-05, "loss": 0.2026, "step": 2095 }, { "epoch": 1.2753270459385457, "grad_norm": 0.9270033836364746, "learning_rate": 4.028930870954512e-05, "loss": 0.223, "step": 2096 }, { "epoch": 1.275935503498631, "grad_norm": 0.8873688578605652, "learning_rate": 4.0279457424559654e-05, "loss": 0.2448, "step": 2097 }, { "epoch": 1.2765439610587161, "grad_norm": 0.8902617692947388, "learning_rate": 4.0269602350905615e-05, "loss": 0.2214, "step": 2098 }, { "epoch": 1.2771524186188015, "grad_norm": 0.8026160597801208, "learning_rate": 4.025974349102667e-05, "loss": 0.201, "step": 2099 }, { "epoch": 1.2777608761788866, "grad_norm": 0.8452394008636475, "learning_rate": 4.024988084736739e-05, "loss": 0.2145, "step": 2100 }, { "epoch": 1.2783693337389717, "grad_norm": 0.859439492225647, "learning_rate": 4.0240014422373304e-05, "loss": 0.2221, "step": 2101 }, { "epoch": 1.2789777912990568, "grad_norm": 0.929378867149353, "learning_rate": 4.023014421849088e-05, "loss": 0.2458, "step": 2102 }, { "epoch": 1.279586248859142, "grad_norm": 0.9007444381713867, "learning_rate": 4.0220270238167514e-05, "loss": 0.2225, "step": 2103 }, { "epoch": 1.2801947064192274, "grad_norm": 0.821709394454956, "learning_rate": 4.021039248385154e-05, "loss": 0.1803, "step": 2104 }, { "epoch": 1.2808031639793125, "grad_norm": 1.293845534324646, "learning_rate": 4.0200510957992234e-05, "loss": 0.397, "step": 2105 }, { "epoch": 1.2814116215393976, "grad_norm": 0.8068326711654663, "learning_rate": 4.0190625663039796e-05, "loss": 0.2132, "step": 2106 }, { "epoch": 1.2820200790994827, "grad_norm": 0.9025530219078064, "learning_rate": 4.0180736601445365e-05, "loss": 0.2425, "step": 2107 }, { "epoch": 1.282628536659568, "grad_norm": 0.9707067012786865, "learning_rate": 4.0170843775661025e-05, "loss": 0.2608, "step": 2108 }, { "epoch": 1.2832369942196533, "grad_norm": 0.9354795217514038, "learning_rate": 4.0160947188139786e-05, "loss": 0.2323, "step": 2109 }, { "epoch": 1.2838454517797384, "grad_norm": 1.0434750318527222, "learning_rate": 4.015104684133558e-05, "loss": 0.2411, "step": 2110 }, { "epoch": 1.2844539093398235, "grad_norm": 0.9146111011505127, "learning_rate": 4.014114273770328e-05, "loss": 0.247, "step": 2111 }, { "epoch": 1.2850623668999086, "grad_norm": 0.9847277402877808, "learning_rate": 4.01312348796987e-05, "loss": 0.2576, "step": 2112 }, { "epoch": 1.285670824459994, "grad_norm": 0.9638257026672363, "learning_rate": 4.0121323269778565e-05, "loss": 0.2398, "step": 2113 }, { "epoch": 1.286279282020079, "grad_norm": 0.8207821249961853, "learning_rate": 4.0111407910400555e-05, "loss": 0.2134, "step": 2114 }, { "epoch": 1.2868877395801643, "grad_norm": 0.9123344421386719, "learning_rate": 4.010148880402326e-05, "loss": 0.1845, "step": 2115 }, { "epoch": 1.2874961971402494, "grad_norm": 0.9210152626037598, "learning_rate": 4.009156595310619e-05, "loss": 0.2483, "step": 2116 }, { "epoch": 1.2881046547003345, "grad_norm": 0.9474868178367615, "learning_rate": 4.0081639360109816e-05, "loss": 0.2452, "step": 2117 }, { "epoch": 1.2887131122604198, "grad_norm": 0.931167721748352, "learning_rate": 4.007170902749552e-05, "loss": 0.25, "step": 2118 }, { "epoch": 1.289321569820505, "grad_norm": 0.9042659401893616, "learning_rate": 4.00617749577256e-05, "loss": 0.2474, "step": 2119 }, { "epoch": 1.2899300273805903, "grad_norm": 0.9150558114051819, "learning_rate": 4.0051837153263296e-05, "loss": 0.2376, "step": 2120 }, { "epoch": 1.2905384849406754, "grad_norm": 0.7283451557159424, "learning_rate": 4.0041895616572765e-05, "loss": 0.1869, "step": 2121 }, { "epoch": 1.2911469425007605, "grad_norm": 0.8587160110473633, "learning_rate": 4.0031950350119106e-05, "loss": 0.2099, "step": 2122 }, { "epoch": 1.2917554000608458, "grad_norm": 0.7844583988189697, "learning_rate": 4.002200135636832e-05, "loss": 0.1998, "step": 2123 }, { "epoch": 1.2923638576209309, "grad_norm": 0.8835726380348206, "learning_rate": 4.001204863778735e-05, "loss": 0.2201, "step": 2124 }, { "epoch": 1.2929723151810162, "grad_norm": 0.8643476366996765, "learning_rate": 4.0002092196844046e-05, "loss": 0.2101, "step": 2125 }, { "epoch": 1.2935807727411013, "grad_norm": 0.8908064961433411, "learning_rate": 3.999213203600719e-05, "loss": 0.2303, "step": 2126 }, { "epoch": 1.2941892303011864, "grad_norm": 1.0389541387557983, "learning_rate": 3.99821681577465e-05, "loss": 0.233, "step": 2127 }, { "epoch": 1.2947976878612717, "grad_norm": 1.1947957277297974, "learning_rate": 3.997220056453259e-05, "loss": 0.2872, "step": 2128 }, { "epoch": 1.2954061454213568, "grad_norm": 1.0398141145706177, "learning_rate": 3.9962229258837013e-05, "loss": 0.2141, "step": 2129 }, { "epoch": 1.296014602981442, "grad_norm": 0.8543111085891724, "learning_rate": 3.9952254243132243e-05, "loss": 0.2153, "step": 2130 }, { "epoch": 1.2966230605415272, "grad_norm": 1.0683271884918213, "learning_rate": 3.9942275519891656e-05, "loss": 0.2487, "step": 2131 }, { "epoch": 1.2972315181016123, "grad_norm": 0.8721834421157837, "learning_rate": 3.993229309158957e-05, "loss": 0.2295, "step": 2132 }, { "epoch": 1.2978399756616976, "grad_norm": 0.8389768004417419, "learning_rate": 3.9922306960701196e-05, "loss": 0.2181, "step": 2133 }, { "epoch": 1.2984484332217827, "grad_norm": 0.9460014700889587, "learning_rate": 3.991231712970269e-05, "loss": 0.2342, "step": 2134 }, { "epoch": 1.299056890781868, "grad_norm": 0.9399822950363159, "learning_rate": 3.990232360107111e-05, "loss": 0.2097, "step": 2135 }, { "epoch": 1.2996653483419531, "grad_norm": 0.8793272376060486, "learning_rate": 3.989232637728445e-05, "loss": 0.2349, "step": 2136 }, { "epoch": 1.3002738059020382, "grad_norm": 0.9901358485221863, "learning_rate": 3.988232546082158e-05, "loss": 0.2422, "step": 2137 }, { "epoch": 1.3008822634621235, "grad_norm": 0.8974143862724304, "learning_rate": 3.9872320854162324e-05, "loss": 0.2433, "step": 2138 }, { "epoch": 1.3014907210222086, "grad_norm": 0.9541013836860657, "learning_rate": 3.9862312559787404e-05, "loss": 0.2531, "step": 2139 }, { "epoch": 1.302099178582294, "grad_norm": 1.2113054990768433, "learning_rate": 3.985230058017846e-05, "loss": 0.2654, "step": 2140 }, { "epoch": 1.302707636142379, "grad_norm": 0.8780795931816101, "learning_rate": 3.984228491781805e-05, "loss": 0.2094, "step": 2141 }, { "epoch": 1.3033160937024642, "grad_norm": 0.9655569791793823, "learning_rate": 3.9832265575189635e-05, "loss": 0.2571, "step": 2142 }, { "epoch": 1.3039245512625495, "grad_norm": 0.9380378723144531, "learning_rate": 3.98222425547776e-05, "loss": 0.2191, "step": 2143 }, { "epoch": 1.3045330088226346, "grad_norm": 0.887397825717926, "learning_rate": 3.981221585906723e-05, "loss": 0.2585, "step": 2144 }, { "epoch": 1.3051414663827199, "grad_norm": 0.8565236926078796, "learning_rate": 3.980218549054473e-05, "loss": 0.1939, "step": 2145 }, { "epoch": 1.305749923942805, "grad_norm": 0.8174847364425659, "learning_rate": 3.979215145169721e-05, "loss": 0.2119, "step": 2146 }, { "epoch": 1.30635838150289, "grad_norm": 0.8997043371200562, "learning_rate": 3.97821137450127e-05, "loss": 0.2329, "step": 2147 }, { "epoch": 1.3069668390629754, "grad_norm": 0.9833238124847412, "learning_rate": 3.977207237298014e-05, "loss": 0.2813, "step": 2148 }, { "epoch": 1.3075752966230605, "grad_norm": 0.8356313705444336, "learning_rate": 3.9762027338089356e-05, "loss": 0.2254, "step": 2149 }, { "epoch": 1.3081837541831458, "grad_norm": 1.0737109184265137, "learning_rate": 3.97519786428311e-05, "loss": 0.2502, "step": 2150 }, { "epoch": 1.308792211743231, "grad_norm": 0.8580512404441833, "learning_rate": 3.9741926289697036e-05, "loss": 0.2491, "step": 2151 }, { "epoch": 1.309400669303316, "grad_norm": 0.9692168831825256, "learning_rate": 3.973187028117972e-05, "loss": 0.2251, "step": 2152 }, { "epoch": 1.3100091268634013, "grad_norm": 0.8899025917053223, "learning_rate": 3.9721810619772636e-05, "loss": 0.2117, "step": 2153 }, { "epoch": 1.3106175844234864, "grad_norm": 0.784062385559082, "learning_rate": 3.971174730797015e-05, "loss": 0.2105, "step": 2154 }, { "epoch": 1.3112260419835717, "grad_norm": 0.7936700582504272, "learning_rate": 3.970168034826755e-05, "loss": 0.1936, "step": 2155 }, { "epoch": 1.3118344995436568, "grad_norm": 0.844158947467804, "learning_rate": 3.9691609743161015e-05, "loss": 0.2318, "step": 2156 }, { "epoch": 1.312442957103742, "grad_norm": 0.9515892267227173, "learning_rate": 3.968153549514765e-05, "loss": 0.2188, "step": 2157 }, { "epoch": 1.3130514146638272, "grad_norm": 0.8544421792030334, "learning_rate": 3.967145760672543e-05, "loss": 0.225, "step": 2158 }, { "epoch": 1.3136598722239123, "grad_norm": 0.9000852704048157, "learning_rate": 3.9661376080393266e-05, "loss": 0.2036, "step": 2159 }, { "epoch": 1.3142683297839977, "grad_norm": 0.9248766899108887, "learning_rate": 3.965129091865094e-05, "loss": 0.2235, "step": 2160 }, { "epoch": 1.3148767873440828, "grad_norm": 1.002901315689087, "learning_rate": 3.9641202123999174e-05, "loss": 0.2248, "step": 2161 }, { "epoch": 1.3154852449041678, "grad_norm": 0.8213080763816833, "learning_rate": 3.963110969893955e-05, "loss": 0.2088, "step": 2162 }, { "epoch": 1.3160937024642532, "grad_norm": 1.0072423219680786, "learning_rate": 3.9621013645974574e-05, "loss": 0.2406, "step": 2163 }, { "epoch": 1.3167021600243383, "grad_norm": 1.0771087408065796, "learning_rate": 3.961091396760765e-05, "loss": 0.2093, "step": 2164 }, { "epoch": 1.3173106175844236, "grad_norm": 0.9404158592224121, "learning_rate": 3.960081066634308e-05, "loss": 0.2502, "step": 2165 }, { "epoch": 1.3179190751445087, "grad_norm": 0.910447359085083, "learning_rate": 3.959070374468605e-05, "loss": 0.2086, "step": 2166 }, { "epoch": 1.3185275327045938, "grad_norm": 0.7833483219146729, "learning_rate": 3.958059320514267e-05, "loss": 0.2102, "step": 2167 }, { "epoch": 1.319135990264679, "grad_norm": 1.0158355236053467, "learning_rate": 3.957047905021991e-05, "loss": 0.2798, "step": 2168 }, { "epoch": 1.3197444478247642, "grad_norm": 0.9133449196815491, "learning_rate": 3.956036128242568e-05, "loss": 0.2381, "step": 2169 }, { "epoch": 1.3203529053848495, "grad_norm": 0.9535892009735107, "learning_rate": 3.955023990426876e-05, "loss": 0.2382, "step": 2170 }, { "epoch": 1.3209613629449346, "grad_norm": 0.919695258140564, "learning_rate": 3.954011491825883e-05, "loss": 0.2257, "step": 2171 }, { "epoch": 1.3215698205050197, "grad_norm": 0.8631134033203125, "learning_rate": 3.952998632690646e-05, "loss": 0.1773, "step": 2172 }, { "epoch": 1.322178278065105, "grad_norm": 1.0971744060516357, "learning_rate": 3.951985413272312e-05, "loss": 0.251, "step": 2173 }, { "epoch": 1.3227867356251901, "grad_norm": 0.8867610692977905, "learning_rate": 3.950971833822117e-05, "loss": 0.1989, "step": 2174 }, { "epoch": 1.3233951931852754, "grad_norm": 0.8583669066429138, "learning_rate": 3.949957894591387e-05, "loss": 0.2238, "step": 2175 }, { "epoch": 1.3240036507453605, "grad_norm": 0.8695077300071716, "learning_rate": 3.9489435958315354e-05, "loss": 0.2755, "step": 2176 }, { "epoch": 1.3246121083054456, "grad_norm": 0.9294130206108093, "learning_rate": 3.947928937794069e-05, "loss": 0.2271, "step": 2177 }, { "epoch": 1.325220565865531, "grad_norm": 0.871168851852417, "learning_rate": 3.946913920730577e-05, "loss": 0.215, "step": 2178 }, { "epoch": 1.325829023425616, "grad_norm": 0.9917259812355042, "learning_rate": 3.945898544892744e-05, "loss": 0.2764, "step": 2179 }, { "epoch": 1.3264374809857014, "grad_norm": 0.689329206943512, "learning_rate": 3.944882810532339e-05, "loss": 0.1973, "step": 2180 }, { "epoch": 1.3270459385457865, "grad_norm": 0.8238869905471802, "learning_rate": 3.943866717901223e-05, "loss": 0.2145, "step": 2181 }, { "epoch": 1.3276543961058715, "grad_norm": 0.8981241583824158, "learning_rate": 3.9428502672513446e-05, "loss": 0.2292, "step": 2182 }, { "epoch": 1.3282628536659569, "grad_norm": 0.902793824672699, "learning_rate": 3.9418334588347406e-05, "loss": 0.2068, "step": 2183 }, { "epoch": 1.328871311226042, "grad_norm": 1.0001530647277832, "learning_rate": 3.9408162929035375e-05, "loss": 0.282, "step": 2184 }, { "epoch": 1.3294797687861273, "grad_norm": 0.8582153916358948, "learning_rate": 3.939798769709949e-05, "loss": 0.2009, "step": 2185 }, { "epoch": 1.3300882263462124, "grad_norm": 1.967231035232544, "learning_rate": 3.93878088950628e-05, "loss": 0.2382, "step": 2186 }, { "epoch": 1.3306966839062975, "grad_norm": 1.2576441764831543, "learning_rate": 3.937762652544923e-05, "loss": 0.3125, "step": 2187 }, { "epoch": 1.3313051414663828, "grad_norm": 1.0156564712524414, "learning_rate": 3.9367440590783554e-05, "loss": 0.2311, "step": 2188 }, { "epoch": 1.3319135990264679, "grad_norm": 0.9007154703140259, "learning_rate": 3.9357251093591485e-05, "loss": 0.2335, "step": 2189 }, { "epoch": 1.3325220565865532, "grad_norm": 0.9337432980537415, "learning_rate": 3.934705803639959e-05, "loss": 0.2229, "step": 2190 }, { "epoch": 1.3331305141466383, "grad_norm": 0.875648021697998, "learning_rate": 3.9336861421735305e-05, "loss": 0.2179, "step": 2191 }, { "epoch": 1.3337389717067234, "grad_norm": 0.7988814115524292, "learning_rate": 3.9326661252126984e-05, "loss": 0.2233, "step": 2192 }, { "epoch": 1.3343474292668087, "grad_norm": 0.8480890989303589, "learning_rate": 3.931645753010383e-05, "loss": 0.2495, "step": 2193 }, { "epoch": 1.3349558868268938, "grad_norm": 0.9835476279258728, "learning_rate": 3.9306250258195954e-05, "loss": 0.2171, "step": 2194 }, { "epoch": 1.3355643443869791, "grad_norm": 1.1172250509262085, "learning_rate": 3.929603943893432e-05, "loss": 0.2434, "step": 2195 }, { "epoch": 1.3361728019470642, "grad_norm": 0.8083821535110474, "learning_rate": 3.92858250748508e-05, "loss": 0.2134, "step": 2196 }, { "epoch": 1.3367812595071493, "grad_norm": 0.8030064702033997, "learning_rate": 3.9275607168478126e-05, "loss": 0.2281, "step": 2197 }, { "epoch": 1.3373897170672346, "grad_norm": 0.8596853017807007, "learning_rate": 3.926538572234991e-05, "loss": 0.1805, "step": 2198 }, { "epoch": 1.3379981746273197, "grad_norm": 1.0698977708816528, "learning_rate": 3.925516073900064e-05, "loss": 0.2718, "step": 2199 }, { "epoch": 1.338606632187405, "grad_norm": 1.0942227840423584, "learning_rate": 3.9244932220965704e-05, "loss": 0.2411, "step": 2200 }, { "epoch": 1.3392150897474902, "grad_norm": 0.9584268927574158, "learning_rate": 3.9234700170781316e-05, "loss": 0.277, "step": 2201 }, { "epoch": 1.3398235473075752, "grad_norm": 1.1233789920806885, "learning_rate": 3.922446459098463e-05, "loss": 0.2515, "step": 2202 }, { "epoch": 1.3404320048676603, "grad_norm": 1.0286003351211548, "learning_rate": 3.9214225484113634e-05, "loss": 0.199, "step": 2203 }, { "epoch": 1.3410404624277457, "grad_norm": 0.920686662197113, "learning_rate": 3.9203982852707184e-05, "loss": 0.2214, "step": 2204 }, { "epoch": 1.341648919987831, "grad_norm": 0.9834322333335876, "learning_rate": 3.9193736699305044e-05, "loss": 0.2745, "step": 2205 }, { "epoch": 1.342257377547916, "grad_norm": 0.8372621536254883, "learning_rate": 3.9183487026447824e-05, "loss": 0.2355, "step": 2206 }, { "epoch": 1.3428658351080012, "grad_norm": 0.7708995938301086, "learning_rate": 3.9173233836677024e-05, "loss": 0.1949, "step": 2207 }, { "epoch": 1.3434742926680863, "grad_norm": 0.9579456448554993, "learning_rate": 3.9162977132534996e-05, "loss": 0.2197, "step": 2208 }, { "epoch": 1.3440827502281716, "grad_norm": 0.8540910482406616, "learning_rate": 3.915271691656498e-05, "loss": 0.2385, "step": 2209 }, { "epoch": 1.344691207788257, "grad_norm": 0.8921311497688293, "learning_rate": 3.914245319131109e-05, "loss": 0.2211, "step": 2210 }, { "epoch": 1.345299665348342, "grad_norm": 0.8396940231323242, "learning_rate": 3.913218595931829e-05, "loss": 0.2176, "step": 2211 }, { "epoch": 1.345908122908427, "grad_norm": 0.8367871642112732, "learning_rate": 3.9121915223132436e-05, "loss": 0.2457, "step": 2212 }, { "epoch": 1.3465165804685122, "grad_norm": 0.9021058082580566, "learning_rate": 3.911164098530023e-05, "loss": 0.2251, "step": 2213 }, { "epoch": 1.3471250380285975, "grad_norm": 0.911104679107666, "learning_rate": 3.910136324836927e-05, "loss": 0.2683, "step": 2214 }, { "epoch": 1.3477334955886826, "grad_norm": 0.906808614730835, "learning_rate": 3.909108201488799e-05, "loss": 0.2472, "step": 2215 }, { "epoch": 1.348341953148768, "grad_norm": 0.8513998985290527, "learning_rate": 3.908079728740571e-05, "loss": 0.205, "step": 2216 }, { "epoch": 1.348950410708853, "grad_norm": 0.9133200645446777, "learning_rate": 3.9070509068472635e-05, "loss": 0.2453, "step": 2217 }, { "epoch": 1.3495588682689381, "grad_norm": 0.8982611298561096, "learning_rate": 3.90602173606398e-05, "loss": 0.1938, "step": 2218 }, { "epoch": 1.3501673258290234, "grad_norm": 1.0791575908660889, "learning_rate": 3.90499221664591e-05, "loss": 0.2885, "step": 2219 }, { "epoch": 1.3507757833891085, "grad_norm": 0.8639183044433594, "learning_rate": 3.9039623488483346e-05, "loss": 0.2382, "step": 2220 }, { "epoch": 1.3513842409491938, "grad_norm": 1.2875553369522095, "learning_rate": 3.902932132926616e-05, "loss": 0.2692, "step": 2221 }, { "epoch": 1.351992698509279, "grad_norm": 0.9130270481109619, "learning_rate": 3.901901569136206e-05, "loss": 0.2361, "step": 2222 }, { "epoch": 1.352601156069364, "grad_norm": 0.832845151424408, "learning_rate": 3.900870657732641e-05, "loss": 0.2147, "step": 2223 }, { "epoch": 1.3532096136294494, "grad_norm": 1.053515911102295, "learning_rate": 3.8998393989715434e-05, "loss": 0.2453, "step": 2224 }, { "epoch": 1.3538180711895345, "grad_norm": 0.9339423179626465, "learning_rate": 3.898807793108624e-05, "loss": 0.1898, "step": 2225 }, { "epoch": 1.3544265287496198, "grad_norm": 0.9041353464126587, "learning_rate": 3.8977758403996765e-05, "loss": 0.2065, "step": 2226 }, { "epoch": 1.3550349863097049, "grad_norm": 0.8681005239486694, "learning_rate": 3.896743541100583e-05, "loss": 0.2186, "step": 2227 }, { "epoch": 1.35564344386979, "grad_norm": 0.8641691207885742, "learning_rate": 3.89571089546731e-05, "loss": 0.2016, "step": 2228 }, { "epoch": 1.3562519014298753, "grad_norm": 0.8914021253585815, "learning_rate": 3.8946779037559115e-05, "loss": 0.2332, "step": 2229 }, { "epoch": 1.3568603589899604, "grad_norm": 0.9768247604370117, "learning_rate": 3.8936445662225264e-05, "loss": 0.1901, "step": 2230 }, { "epoch": 1.3574688165500457, "grad_norm": 0.982510507106781, "learning_rate": 3.892610883123378e-05, "loss": 0.2356, "step": 2231 }, { "epoch": 1.3580772741101308, "grad_norm": 0.9750663638114929, "learning_rate": 3.891576854714777e-05, "loss": 0.2241, "step": 2232 }, { "epoch": 1.358685731670216, "grad_norm": 0.8455461263656616, "learning_rate": 3.890542481253121e-05, "loss": 0.2119, "step": 2233 }, { "epoch": 1.3592941892303012, "grad_norm": 1.088861107826233, "learning_rate": 3.88950776299489e-05, "loss": 0.2778, "step": 2234 }, { "epoch": 1.3599026467903863, "grad_norm": 1.2681231498718262, "learning_rate": 3.888472700196651e-05, "loss": 0.2204, "step": 2235 }, { "epoch": 1.3605111043504716, "grad_norm": 0.9089130163192749, "learning_rate": 3.887437293115057e-05, "loss": 0.2306, "step": 2236 }, { "epoch": 1.3611195619105567, "grad_norm": 1.256752610206604, "learning_rate": 3.8864015420068454e-05, "loss": 0.2447, "step": 2237 }, { "epoch": 1.3617280194706418, "grad_norm": 0.9000508189201355, "learning_rate": 3.88536544712884e-05, "loss": 0.2213, "step": 2238 }, { "epoch": 1.3623364770307271, "grad_norm": 1.016454815864563, "learning_rate": 3.884329008737947e-05, "loss": 0.2309, "step": 2239 }, { "epoch": 1.3629449345908122, "grad_norm": 0.9450438618659973, "learning_rate": 3.883292227091163e-05, "loss": 0.2443, "step": 2240 }, { "epoch": 1.3635533921508975, "grad_norm": 1.0585530996322632, "learning_rate": 3.882255102445565e-05, "loss": 0.2413, "step": 2241 }, { "epoch": 1.3641618497109826, "grad_norm": 0.9803087115287781, "learning_rate": 3.8812176350583164e-05, "loss": 0.219, "step": 2242 }, { "epoch": 1.3647703072710677, "grad_norm": 0.93431156873703, "learning_rate": 3.880179825186667e-05, "loss": 0.2136, "step": 2243 }, { "epoch": 1.365378764831153, "grad_norm": 0.9018591642379761, "learning_rate": 3.879141673087949e-05, "loss": 0.2249, "step": 2244 }, { "epoch": 1.3659872223912382, "grad_norm": 1.3140918016433716, "learning_rate": 3.878103179019581e-05, "loss": 0.2312, "step": 2245 }, { "epoch": 1.3665956799513235, "grad_norm": 0.9108927249908447, "learning_rate": 3.877064343239068e-05, "loss": 0.2274, "step": 2246 }, { "epoch": 1.3672041375114086, "grad_norm": 1.067284107208252, "learning_rate": 3.8760251660039956e-05, "loss": 0.2968, "step": 2247 }, { "epoch": 1.3678125950714937, "grad_norm": 0.8846755623817444, "learning_rate": 3.874985647572039e-05, "loss": 0.2381, "step": 2248 }, { "epoch": 1.368421052631579, "grad_norm": 0.9099476933479309, "learning_rate": 3.8739457882009526e-05, "loss": 0.1729, "step": 2249 }, { "epoch": 1.369029510191664, "grad_norm": 0.866028904914856, "learning_rate": 3.87290558814858e-05, "loss": 0.2237, "step": 2250 }, { "epoch": 1.3696379677517494, "grad_norm": 0.8966884016990662, "learning_rate": 3.871865047672848e-05, "loss": 0.2003, "step": 2251 }, { "epoch": 1.3702464253118345, "grad_norm": 0.9942765831947327, "learning_rate": 3.8708241670317645e-05, "loss": 0.2374, "step": 2252 }, { "epoch": 1.3708548828719196, "grad_norm": 0.9010903239250183, "learning_rate": 3.869782946483428e-05, "loss": 0.1948, "step": 2253 }, { "epoch": 1.371463340432005, "grad_norm": 0.9701775908470154, "learning_rate": 3.868741386286016e-05, "loss": 0.2412, "step": 2254 }, { "epoch": 1.37207179799209, "grad_norm": 0.7762331366539001, "learning_rate": 3.867699486697791e-05, "loss": 0.1873, "step": 2255 }, { "epoch": 1.3726802555521753, "grad_norm": 0.9885874390602112, "learning_rate": 3.866657247977103e-05, "loss": 0.2072, "step": 2256 }, { "epoch": 1.3732887131122604, "grad_norm": 0.9853124618530273, "learning_rate": 3.865614670382382e-05, "loss": 0.2562, "step": 2257 }, { "epoch": 1.3738971706723455, "grad_norm": 0.9936604499816895, "learning_rate": 3.864571754172144e-05, "loss": 0.2386, "step": 2258 }, { "epoch": 1.3745056282324308, "grad_norm": 0.9598672389984131, "learning_rate": 3.8635284996049904e-05, "loss": 0.2379, "step": 2259 }, { "epoch": 1.375114085792516, "grad_norm": 0.9736387133598328, "learning_rate": 3.8624849069396024e-05, "loss": 0.2614, "step": 2260 }, { "epoch": 1.3757225433526012, "grad_norm": 0.8407406210899353, "learning_rate": 3.8614409764347494e-05, "loss": 0.2115, "step": 2261 }, { "epoch": 1.3763310009126863, "grad_norm": 0.8771750330924988, "learning_rate": 3.860396708349281e-05, "loss": 0.2091, "step": 2262 }, { "epoch": 1.3769394584727714, "grad_norm": 0.9052261114120483, "learning_rate": 3.859352102942134e-05, "loss": 0.2031, "step": 2263 }, { "epoch": 1.3775479160328568, "grad_norm": 0.8526843190193176, "learning_rate": 3.8583071604723256e-05, "loss": 0.1998, "step": 2264 }, { "epoch": 1.3781563735929419, "grad_norm": 1.0175981521606445, "learning_rate": 3.857261881198958e-05, "loss": 0.209, "step": 2265 }, { "epoch": 1.3787648311530272, "grad_norm": 1.1324195861816406, "learning_rate": 3.856216265381219e-05, "loss": 0.2031, "step": 2266 }, { "epoch": 1.3793732887131123, "grad_norm": 1.1673684120178223, "learning_rate": 3.8551703132783745e-05, "loss": 0.2526, "step": 2267 }, { "epoch": 1.3799817462731974, "grad_norm": 0.929223895072937, "learning_rate": 3.854124025149778e-05, "loss": 0.2443, "step": 2268 }, { "epoch": 1.3805902038332827, "grad_norm": 0.9426333904266357, "learning_rate": 3.8530774012548674e-05, "loss": 0.2002, "step": 2269 }, { "epoch": 1.3811986613933678, "grad_norm": 0.7749611139297485, "learning_rate": 3.85203044185316e-05, "loss": 0.2044, "step": 2270 }, { "epoch": 1.381807118953453, "grad_norm": 0.8301417231559753, "learning_rate": 3.8509831472042585e-05, "loss": 0.2198, "step": 2271 }, { "epoch": 1.3824155765135382, "grad_norm": 1.038061499595642, "learning_rate": 3.849935517567848e-05, "loss": 0.2578, "step": 2272 }, { "epoch": 1.3830240340736233, "grad_norm": 0.9880487322807312, "learning_rate": 3.8488875532036975e-05, "loss": 0.2688, "step": 2273 }, { "epoch": 1.3836324916337086, "grad_norm": 0.9047033190727234, "learning_rate": 3.847839254371658e-05, "loss": 0.2179, "step": 2274 }, { "epoch": 1.3842409491937937, "grad_norm": 0.9710778594017029, "learning_rate": 3.8467906213316636e-05, "loss": 0.2569, "step": 2275 }, { "epoch": 1.384849406753879, "grad_norm": 1.0498100519180298, "learning_rate": 3.845741654343733e-05, "loss": 0.2389, "step": 2276 }, { "epoch": 1.3854578643139641, "grad_norm": 0.9659872055053711, "learning_rate": 3.8446923536679644e-05, "loss": 0.2312, "step": 2277 }, { "epoch": 1.3860663218740492, "grad_norm": 0.956907331943512, "learning_rate": 3.843642719564542e-05, "loss": 0.2527, "step": 2278 }, { "epoch": 1.3866747794341345, "grad_norm": 0.9140104651451111, "learning_rate": 3.842592752293731e-05, "loss": 0.2255, "step": 2279 }, { "epoch": 1.3872832369942196, "grad_norm": 0.8140145540237427, "learning_rate": 3.8415424521158804e-05, "loss": 0.2145, "step": 2280 }, { "epoch": 1.387891694554305, "grad_norm": 0.9981208443641663, "learning_rate": 3.8404918192914184e-05, "loss": 0.2338, "step": 2281 }, { "epoch": 1.38850015211439, "grad_norm": 1.1599704027175903, "learning_rate": 3.839440854080861e-05, "loss": 0.3089, "step": 2282 }, { "epoch": 1.3891086096744751, "grad_norm": 0.9346492886543274, "learning_rate": 3.8383895567448015e-05, "loss": 0.1997, "step": 2283 }, { "epoch": 1.3897170672345605, "grad_norm": 0.9026308655738831, "learning_rate": 3.8373379275439194e-05, "loss": 0.198, "step": 2284 }, { "epoch": 1.3903255247946456, "grad_norm": 0.8270521759986877, "learning_rate": 3.836285966738974e-05, "loss": 0.2308, "step": 2285 }, { "epoch": 1.3909339823547309, "grad_norm": 0.8808834552764893, "learning_rate": 3.8352336745908076e-05, "loss": 0.2238, "step": 2286 }, { "epoch": 1.391542439914816, "grad_norm": 0.8379645347595215, "learning_rate": 3.834181051360346e-05, "loss": 0.1938, "step": 2287 }, { "epoch": 1.392150897474901, "grad_norm": 0.894270658493042, "learning_rate": 3.833128097308594e-05, "loss": 0.2172, "step": 2288 }, { "epoch": 1.3927593550349864, "grad_norm": 0.8662185668945312, "learning_rate": 3.832074812696642e-05, "loss": 0.2332, "step": 2289 }, { "epoch": 1.3933678125950715, "grad_norm": 1.1685889959335327, "learning_rate": 3.8310211977856605e-05, "loss": 0.254, "step": 2290 }, { "epoch": 1.3939762701551568, "grad_norm": 0.9299534559249878, "learning_rate": 3.8299672528369014e-05, "loss": 0.2024, "step": 2291 }, { "epoch": 1.394584727715242, "grad_norm": 0.7964719533920288, "learning_rate": 3.828912978111699e-05, "loss": 0.1907, "step": 2292 }, { "epoch": 1.395193185275327, "grad_norm": 0.9740342497825623, "learning_rate": 3.8278583738714696e-05, "loss": 0.2479, "step": 2293 }, { "epoch": 1.3958016428354123, "grad_norm": 1.016251564025879, "learning_rate": 3.826803440377712e-05, "loss": 0.2517, "step": 2294 }, { "epoch": 1.3964101003954974, "grad_norm": 1.0677728652954102, "learning_rate": 3.8257481778920045e-05, "loss": 0.23, "step": 2295 }, { "epoch": 1.3970185579555827, "grad_norm": 0.9394170045852661, "learning_rate": 3.824692586676009e-05, "loss": 0.2383, "step": 2296 }, { "epoch": 1.3976270155156678, "grad_norm": 0.9719864130020142, "learning_rate": 3.823636666991468e-05, "loss": 0.2135, "step": 2297 }, { "epoch": 1.398235473075753, "grad_norm": 0.9637436270713806, "learning_rate": 3.8225804191002054e-05, "loss": 0.2471, "step": 2298 }, { "epoch": 1.3988439306358382, "grad_norm": 1.0839718580245972, "learning_rate": 3.821523843264127e-05, "loss": 0.2424, "step": 2299 }, { "epoch": 1.3994523881959233, "grad_norm": 0.9015427231788635, "learning_rate": 3.82046693974522e-05, "loss": 0.2568, "step": 2300 }, { "epoch": 1.4000608457560086, "grad_norm": 0.9065006971359253, "learning_rate": 3.8194097088055505e-05, "loss": 0.1932, "step": 2301 }, { "epoch": 1.4006693033160937, "grad_norm": 0.876019299030304, "learning_rate": 3.818352150707269e-05, "loss": 0.2278, "step": 2302 }, { "epoch": 1.4012777608761788, "grad_norm": 0.8508959412574768, "learning_rate": 3.817294265712606e-05, "loss": 0.2087, "step": 2303 }, { "epoch": 1.4018862184362642, "grad_norm": 0.8045951128005981, "learning_rate": 3.8162360540838726e-05, "loss": 0.1957, "step": 2304 }, { "epoch": 1.4024946759963492, "grad_norm": 0.893156111240387, "learning_rate": 3.815177516083461e-05, "loss": 0.1938, "step": 2305 }, { "epoch": 1.4031031335564346, "grad_norm": 0.9133938550949097, "learning_rate": 3.8141186519738456e-05, "loss": 0.2445, "step": 2306 }, { "epoch": 1.4037115911165197, "grad_norm": 0.9459069967269897, "learning_rate": 3.8130594620175786e-05, "loss": 0.2532, "step": 2307 }, { "epoch": 1.4043200486766048, "grad_norm": 0.9361640810966492, "learning_rate": 3.811999946477296e-05, "loss": 0.2845, "step": 2308 }, { "epoch": 1.4049285062366899, "grad_norm": 0.7425569295883179, "learning_rate": 3.810940105615715e-05, "loss": 0.1709, "step": 2309 }, { "epoch": 1.4055369637967752, "grad_norm": 0.9451609253883362, "learning_rate": 3.8098799396956284e-05, "loss": 0.2082, "step": 2310 }, { "epoch": 1.4061454213568605, "grad_norm": 0.9120255708694458, "learning_rate": 3.808819448979917e-05, "loss": 0.2128, "step": 2311 }, { "epoch": 1.4067538789169456, "grad_norm": 0.9225917458534241, "learning_rate": 3.8077586337315365e-05, "loss": 0.2095, "step": 2312 }, { "epoch": 1.4073623364770307, "grad_norm": 0.9341639876365662, "learning_rate": 3.8066974942135234e-05, "loss": 0.2372, "step": 2313 }, { "epoch": 1.4079707940371158, "grad_norm": 0.9186847805976868, "learning_rate": 3.8056360306889985e-05, "loss": 0.2227, "step": 2314 }, { "epoch": 1.408579251597201, "grad_norm": 0.8264140486717224, "learning_rate": 3.8045742434211595e-05, "loss": 0.1934, "step": 2315 }, { "epoch": 1.4091877091572864, "grad_norm": 0.8795403242111206, "learning_rate": 3.803512132673286e-05, "loss": 0.2349, "step": 2316 }, { "epoch": 1.4097961667173715, "grad_norm": 0.9693627953529358, "learning_rate": 3.802449698708736e-05, "loss": 0.2146, "step": 2317 }, { "epoch": 1.4104046242774566, "grad_norm": 0.9687555432319641, "learning_rate": 3.8013869417909496e-05, "loss": 0.2455, "step": 2318 }, { "epoch": 1.4110130818375417, "grad_norm": 4.701800346374512, "learning_rate": 3.800323862183446e-05, "loss": 0.2436, "step": 2319 }, { "epoch": 1.411621539397627, "grad_norm": 0.9624002575874329, "learning_rate": 3.799260460149825e-05, "loss": 0.2143, "step": 2320 }, { "epoch": 1.4122299969577121, "grad_norm": 1.0840182304382324, "learning_rate": 3.7981967359537656e-05, "loss": 0.219, "step": 2321 }, { "epoch": 1.4128384545177974, "grad_norm": 0.9719953536987305, "learning_rate": 3.797132689859027e-05, "loss": 0.2347, "step": 2322 }, { "epoch": 1.4134469120778825, "grad_norm": 0.9404840469360352, "learning_rate": 3.796068322129449e-05, "loss": 0.2089, "step": 2323 }, { "epoch": 1.4140553696379676, "grad_norm": 0.7975339889526367, "learning_rate": 3.795003633028949e-05, "loss": 0.2036, "step": 2324 }, { "epoch": 1.414663827198053, "grad_norm": 0.9089730978012085, "learning_rate": 3.793938622821528e-05, "loss": 0.2158, "step": 2325 }, { "epoch": 1.415272284758138, "grad_norm": 1.023104190826416, "learning_rate": 3.792873291771261e-05, "loss": 0.2297, "step": 2326 }, { "epoch": 1.4158807423182234, "grad_norm": 0.9837586283683777, "learning_rate": 3.7918076401423076e-05, "loss": 0.2383, "step": 2327 }, { "epoch": 1.4164891998783085, "grad_norm": 0.9490445852279663, "learning_rate": 3.790741668198906e-05, "loss": 0.2691, "step": 2328 }, { "epoch": 1.4170976574383936, "grad_norm": 1.8430839776992798, "learning_rate": 3.7896753762053693e-05, "loss": 0.2033, "step": 2329 }, { "epoch": 1.4177061149984789, "grad_norm": 0.8883879780769348, "learning_rate": 3.788608764426097e-05, "loss": 0.2133, "step": 2330 }, { "epoch": 1.418314572558564, "grad_norm": 0.9225703477859497, "learning_rate": 3.787541833125563e-05, "loss": 0.1979, "step": 2331 }, { "epoch": 1.4189230301186493, "grad_norm": 0.8261762261390686, "learning_rate": 3.786474582568321e-05, "loss": 0.2257, "step": 2332 }, { "epoch": 1.4195314876787344, "grad_norm": 0.8989948630332947, "learning_rate": 3.785407013019006e-05, "loss": 0.2048, "step": 2333 }, { "epoch": 1.4201399452388195, "grad_norm": 0.8330014944076538, "learning_rate": 3.78433912474233e-05, "loss": 0.2363, "step": 2334 }, { "epoch": 1.4207484027989048, "grad_norm": 0.8741576075553894, "learning_rate": 3.783270918003085e-05, "loss": 0.2149, "step": 2335 }, { "epoch": 1.42135686035899, "grad_norm": 0.9070691466331482, "learning_rate": 3.782202393066141e-05, "loss": 0.2316, "step": 2336 }, { "epoch": 1.4219653179190752, "grad_norm": 0.9362895488739014, "learning_rate": 3.7811335501964495e-05, "loss": 0.2179, "step": 2337 }, { "epoch": 1.4225737754791603, "grad_norm": 0.8994218111038208, "learning_rate": 3.7800643896590375e-05, "loss": 0.1861, "step": 2338 }, { "epoch": 1.4231822330392454, "grad_norm": 0.9640007019042969, "learning_rate": 3.778994911719013e-05, "loss": 0.2578, "step": 2339 }, { "epoch": 1.4237906905993307, "grad_norm": 0.8841598629951477, "learning_rate": 3.7779251166415606e-05, "loss": 0.2208, "step": 2340 }, { "epoch": 1.4243991481594158, "grad_norm": 0.7951266765594482, "learning_rate": 3.776855004691946e-05, "loss": 0.1759, "step": 2341 }, { "epoch": 1.4250076057195011, "grad_norm": 0.9464953541755676, "learning_rate": 3.775784576135513e-05, "loss": 0.2117, "step": 2342 }, { "epoch": 1.4256160632795862, "grad_norm": 0.8788123726844788, "learning_rate": 3.774713831237682e-05, "loss": 0.2078, "step": 2343 }, { "epoch": 1.4262245208396713, "grad_norm": 0.9868049621582031, "learning_rate": 3.7736427702639526e-05, "loss": 0.2355, "step": 2344 }, { "epoch": 1.4268329783997566, "grad_norm": 1.0244841575622559, "learning_rate": 3.7725713934799045e-05, "loss": 0.2093, "step": 2345 }, { "epoch": 1.4274414359598417, "grad_norm": 0.9810612797737122, "learning_rate": 3.7714997011511956e-05, "loss": 0.1909, "step": 2346 }, { "epoch": 1.428049893519927, "grad_norm": 0.8638327717781067, "learning_rate": 3.770427693543558e-05, "loss": 0.209, "step": 2347 }, { "epoch": 1.4286583510800122, "grad_norm": 0.9180063009262085, "learning_rate": 3.769355370922807e-05, "loss": 0.1966, "step": 2348 }, { "epoch": 1.4292668086400973, "grad_norm": 2.471264600753784, "learning_rate": 3.768282733554833e-05, "loss": 0.189, "step": 2349 }, { "epoch": 1.4298752662001826, "grad_norm": 0.8565406799316406, "learning_rate": 3.7672097817056065e-05, "loss": 0.2314, "step": 2350 }, { "epoch": 1.4304837237602677, "grad_norm": 0.9157159924507141, "learning_rate": 3.766136515641174e-05, "loss": 0.2094, "step": 2351 }, { "epoch": 1.431092181320353, "grad_norm": 0.7455861568450928, "learning_rate": 3.76506293562766e-05, "loss": 0.1656, "step": 2352 }, { "epoch": 1.431700638880438, "grad_norm": 0.8483655452728271, "learning_rate": 3.7639890419312694e-05, "loss": 0.2195, "step": 2353 }, { "epoch": 1.4323090964405232, "grad_norm": 0.9521898627281189, "learning_rate": 3.7629148348182807e-05, "loss": 0.1907, "step": 2354 }, { "epoch": 1.4329175540006085, "grad_norm": 0.9046710133552551, "learning_rate": 3.761840314555055e-05, "loss": 0.1728, "step": 2355 }, { "epoch": 1.4335260115606936, "grad_norm": 0.9136427044868469, "learning_rate": 3.760765481408027e-05, "loss": 0.2079, "step": 2356 }, { "epoch": 1.434134469120779, "grad_norm": 0.7689423561096191, "learning_rate": 3.759690335643711e-05, "loss": 0.1898, "step": 2357 }, { "epoch": 1.434742926680864, "grad_norm": 0.868561327457428, "learning_rate": 3.758614877528698e-05, "loss": 0.1768, "step": 2358 }, { "epoch": 1.435351384240949, "grad_norm": 0.889870285987854, "learning_rate": 3.7575391073296575e-05, "loss": 0.2147, "step": 2359 }, { "epoch": 1.4359598418010344, "grad_norm": 1.013046145439148, "learning_rate": 3.756463025313335e-05, "loss": 0.2365, "step": 2360 }, { "epoch": 1.4365682993611195, "grad_norm": 1.0037925243377686, "learning_rate": 3.7553866317465536e-05, "loss": 0.2229, "step": 2361 }, { "epoch": 1.4371767569212048, "grad_norm": 0.910197913646698, "learning_rate": 3.754309926896215e-05, "loss": 0.2133, "step": 2362 }, { "epoch": 1.43778521448129, "grad_norm": 0.8792757987976074, "learning_rate": 3.7532329110292966e-05, "loss": 0.2481, "step": 2363 }, { "epoch": 1.438393672041375, "grad_norm": 0.8445896506309509, "learning_rate": 3.752155584412854e-05, "loss": 0.2402, "step": 2364 }, { "epoch": 1.4390021296014603, "grad_norm": 0.8201121687889099, "learning_rate": 3.751077947314019e-05, "loss": 0.1696, "step": 2365 }, { "epoch": 1.4396105871615454, "grad_norm": 0.9546027183532715, "learning_rate": 3.7500000000000003e-05, "loss": 0.2487, "step": 2366 }, { "epoch": 1.4402190447216308, "grad_norm": 0.9001442193984985, "learning_rate": 3.748921742738084e-05, "loss": 0.2163, "step": 2367 }, { "epoch": 1.4408275022817159, "grad_norm": 0.7833166122436523, "learning_rate": 3.7478431757956335e-05, "loss": 0.2015, "step": 2368 }, { "epoch": 1.441435959841801, "grad_norm": 0.7708922028541565, "learning_rate": 3.746764299440087e-05, "loss": 0.2106, "step": 2369 }, { "epoch": 1.4420444174018863, "grad_norm": 0.7704883217811584, "learning_rate": 3.745685113938963e-05, "loss": 0.2011, "step": 2370 }, { "epoch": 1.4426528749619714, "grad_norm": 0.7907694578170776, "learning_rate": 3.7446056195598536e-05, "loss": 0.2023, "step": 2371 }, { "epoch": 1.4432613325220567, "grad_norm": 1.1114675998687744, "learning_rate": 3.743525816570428e-05, "loss": 0.1887, "step": 2372 }, { "epoch": 1.4438697900821418, "grad_norm": 0.7749055624008179, "learning_rate": 3.742445705238432e-05, "loss": 0.1806, "step": 2373 }, { "epoch": 1.4444782476422269, "grad_norm": 0.8483683466911316, "learning_rate": 3.74136528583169e-05, "loss": 0.1959, "step": 2374 }, { "epoch": 1.4450867052023122, "grad_norm": 0.9588725566864014, "learning_rate": 3.740284558618099e-05, "loss": 0.2355, "step": 2375 }, { "epoch": 1.4456951627623973, "grad_norm": 0.9711139798164368, "learning_rate": 3.739203523865635e-05, "loss": 0.2419, "step": 2376 }, { "epoch": 1.4463036203224826, "grad_norm": 0.8849513530731201, "learning_rate": 3.738122181842349e-05, "loss": 0.2023, "step": 2377 }, { "epoch": 1.4469120778825677, "grad_norm": 0.8123235702514648, "learning_rate": 3.73704053281637e-05, "loss": 0.2084, "step": 2378 }, { "epoch": 1.4475205354426528, "grad_norm": 0.9523190259933472, "learning_rate": 3.7359585770559024e-05, "loss": 0.2318, "step": 2379 }, { "epoch": 1.4481289930027381, "grad_norm": 0.7958305478096008, "learning_rate": 3.7348763148292236e-05, "loss": 0.2133, "step": 2380 }, { "epoch": 1.4487374505628232, "grad_norm": 0.9427193403244019, "learning_rate": 3.733793746404692e-05, "loss": 0.1828, "step": 2381 }, { "epoch": 1.4493459081229085, "grad_norm": 0.9071337580680847, "learning_rate": 3.732710872050737e-05, "loss": 0.1915, "step": 2382 }, { "epoch": 1.4499543656829936, "grad_norm": 0.9505900740623474, "learning_rate": 3.731627692035869e-05, "loss": 0.2522, "step": 2383 }, { "epoch": 1.4505628232430787, "grad_norm": 0.9712157845497131, "learning_rate": 3.730544206628669e-05, "loss": 0.2114, "step": 2384 }, { "epoch": 1.451171280803164, "grad_norm": 0.8542957901954651, "learning_rate": 3.729460416097797e-05, "loss": 0.1831, "step": 2385 }, { "epoch": 1.4517797383632491, "grad_norm": 0.9015089869499207, "learning_rate": 3.7283763207119894e-05, "loss": 0.1942, "step": 2386 }, { "epoch": 1.4523881959233345, "grad_norm": 0.8781471848487854, "learning_rate": 3.7272919207400556e-05, "loss": 0.2114, "step": 2387 }, { "epoch": 1.4529966534834196, "grad_norm": 0.9101613163948059, "learning_rate": 3.72620721645088e-05, "loss": 0.1945, "step": 2388 }, { "epoch": 1.4536051110435046, "grad_norm": 0.9514543414115906, "learning_rate": 3.725122208113427e-05, "loss": 0.2435, "step": 2389 }, { "epoch": 1.45421356860359, "grad_norm": 0.9269886612892151, "learning_rate": 3.724036895996732e-05, "loss": 0.2401, "step": 2390 }, { "epoch": 1.454822026163675, "grad_norm": 0.9546543955802917, "learning_rate": 3.722951280369906e-05, "loss": 0.2285, "step": 2391 }, { "epoch": 1.4554304837237604, "grad_norm": 0.8362632393836975, "learning_rate": 3.7218653615021395e-05, "loss": 0.2115, "step": 2392 }, { "epoch": 1.4560389412838455, "grad_norm": 0.9519213438034058, "learning_rate": 3.720779139662691e-05, "loss": 0.2214, "step": 2393 }, { "epoch": 1.4566473988439306, "grad_norm": 0.7873554825782776, "learning_rate": 3.719692615120902e-05, "loss": 0.2028, "step": 2394 }, { "epoch": 1.457255856404016, "grad_norm": 0.8509390354156494, "learning_rate": 3.718605788146183e-05, "loss": 0.2084, "step": 2395 }, { "epoch": 1.457864313964101, "grad_norm": 0.9013881087303162, "learning_rate": 3.717518659008023e-05, "loss": 0.2139, "step": 2396 }, { "epoch": 1.4584727715241863, "grad_norm": 0.8233511447906494, "learning_rate": 3.7164312279759836e-05, "loss": 0.1946, "step": 2397 }, { "epoch": 1.4590812290842714, "grad_norm": 0.8857746124267578, "learning_rate": 3.7153434953197044e-05, "loss": 0.1972, "step": 2398 }, { "epoch": 1.4596896866443565, "grad_norm": 0.793010950088501, "learning_rate": 3.714255461308895e-05, "loss": 0.184, "step": 2399 }, { "epoch": 1.4602981442044418, "grad_norm": 0.9649643301963806, "learning_rate": 3.7131671262133444e-05, "loss": 0.2153, "step": 2400 }, { "epoch": 1.460906601764527, "grad_norm": 0.9244347810745239, "learning_rate": 3.712078490302913e-05, "loss": 0.1921, "step": 2401 }, { "epoch": 1.4615150593246122, "grad_norm": 1.1217639446258545, "learning_rate": 3.7109895538475394e-05, "loss": 0.2125, "step": 2402 }, { "epoch": 1.4621235168846973, "grad_norm": 0.8992077112197876, "learning_rate": 3.709900317117232e-05, "loss": 0.1979, "step": 2403 }, { "epoch": 1.4627319744447824, "grad_norm": 0.9109466075897217, "learning_rate": 3.708810780382077e-05, "loss": 0.1959, "step": 2404 }, { "epoch": 1.4633404320048677, "grad_norm": 0.8444501161575317, "learning_rate": 3.707720943912235e-05, "loss": 0.1709, "step": 2405 }, { "epoch": 1.4639488895649528, "grad_norm": 0.9722615480422974, "learning_rate": 3.706630807977938e-05, "loss": 0.2304, "step": 2406 }, { "epoch": 1.4645573471250382, "grad_norm": 0.9228814244270325, "learning_rate": 3.705540372849496e-05, "loss": 0.2209, "step": 2407 }, { "epoch": 1.4651658046851233, "grad_norm": 0.9532406330108643, "learning_rate": 3.7044496387972914e-05, "loss": 0.2162, "step": 2408 }, { "epoch": 1.4657742622452083, "grad_norm": 0.7384613156318665, "learning_rate": 3.7033586060917795e-05, "loss": 0.1676, "step": 2409 }, { "epoch": 1.4663827198052934, "grad_norm": 0.804396390914917, "learning_rate": 3.7022672750034926e-05, "loss": 0.2018, "step": 2410 }, { "epoch": 1.4669911773653788, "grad_norm": 0.8668960332870483, "learning_rate": 3.701175645803034e-05, "loss": 0.238, "step": 2411 }, { "epoch": 1.467599634925464, "grad_norm": 0.8890330195426941, "learning_rate": 3.7000837187610826e-05, "loss": 0.1914, "step": 2412 }, { "epoch": 1.4682080924855492, "grad_norm": 1.1121948957443237, "learning_rate": 3.698991494148391e-05, "loss": 0.2169, "step": 2413 }, { "epoch": 1.4688165500456343, "grad_norm": 1.0296958684921265, "learning_rate": 3.697898972235785e-05, "loss": 0.2164, "step": 2414 }, { "epoch": 1.4694250076057194, "grad_norm": 0.9091324210166931, "learning_rate": 3.6968061532941654e-05, "loss": 0.207, "step": 2415 }, { "epoch": 1.4700334651658047, "grad_norm": 0.8409296274185181, "learning_rate": 3.695713037594505e-05, "loss": 0.2317, "step": 2416 }, { "epoch": 1.47064192272589, "grad_norm": 0.8644183278083801, "learning_rate": 3.6946196254078515e-05, "loss": 0.2301, "step": 2417 }, { "epoch": 1.471250380285975, "grad_norm": 0.9687103033065796, "learning_rate": 3.693525917005324e-05, "loss": 0.2214, "step": 2418 }, { "epoch": 1.4718588378460602, "grad_norm": 0.8268604278564453, "learning_rate": 3.692431912658118e-05, "loss": 0.2614, "step": 2419 }, { "epoch": 1.4724672954061453, "grad_norm": 0.8227600455284119, "learning_rate": 3.691337612637501e-05, "loss": 0.1931, "step": 2420 }, { "epoch": 1.4730757529662306, "grad_norm": 0.8828481435775757, "learning_rate": 3.690243017214813e-05, "loss": 0.1842, "step": 2421 }, { "epoch": 1.4736842105263157, "grad_norm": 0.7842615246772766, "learning_rate": 3.689148126661469e-05, "loss": 0.2161, "step": 2422 }, { "epoch": 1.474292668086401, "grad_norm": 0.9619740843772888, "learning_rate": 3.688052941248956e-05, "loss": 0.2421, "step": 2423 }, { "epoch": 1.4749011256464861, "grad_norm": 0.8373282551765442, "learning_rate": 3.686957461248833e-05, "loss": 0.2045, "step": 2424 }, { "epoch": 1.4755095832065712, "grad_norm": 0.9472412467002869, "learning_rate": 3.685861686932735e-05, "loss": 0.2319, "step": 2425 }, { "epoch": 1.4761180407666565, "grad_norm": 0.9255704283714294, "learning_rate": 3.6847656185723686e-05, "loss": 0.2266, "step": 2426 }, { "epoch": 1.4767264983267416, "grad_norm": 0.851377010345459, "learning_rate": 3.683669256439511e-05, "loss": 0.2215, "step": 2427 }, { "epoch": 1.477334955886827, "grad_norm": 0.9094513654708862, "learning_rate": 3.6825726008060155e-05, "loss": 0.2713, "step": 2428 }, { "epoch": 1.477943413446912, "grad_norm": 0.9373210668563843, "learning_rate": 3.681475651943809e-05, "loss": 0.2021, "step": 2429 }, { "epoch": 1.4785518710069971, "grad_norm": 0.8862281441688538, "learning_rate": 3.680378410124885e-05, "loss": 0.2172, "step": 2430 }, { "epoch": 1.4791603285670825, "grad_norm": 0.9415962100028992, "learning_rate": 3.6792808756213166e-05, "loss": 0.2334, "step": 2431 }, { "epoch": 1.4797687861271676, "grad_norm": 0.8250126838684082, "learning_rate": 3.678183048705246e-05, "loss": 0.2086, "step": 2432 }, { "epoch": 1.4803772436872529, "grad_norm": 0.8701198697090149, "learning_rate": 3.6770849296488885e-05, "loss": 0.218, "step": 2433 }, { "epoch": 1.480985701247338, "grad_norm": 0.7581288814544678, "learning_rate": 3.675986518724532e-05, "loss": 0.1843, "step": 2434 }, { "epoch": 1.481594158807423, "grad_norm": 0.8139165043830872, "learning_rate": 3.674887816204536e-05, "loss": 0.1995, "step": 2435 }, { "epoch": 1.4822026163675084, "grad_norm": 0.9862119555473328, "learning_rate": 3.6737888223613323e-05, "loss": 0.173, "step": 2436 }, { "epoch": 1.4828110739275935, "grad_norm": 0.9770458936691284, "learning_rate": 3.6726895374674286e-05, "loss": 0.2326, "step": 2437 }, { "epoch": 1.4834195314876788, "grad_norm": 0.9238155484199524, "learning_rate": 3.671589961795399e-05, "loss": 0.1974, "step": 2438 }, { "epoch": 1.484027989047764, "grad_norm": 0.7940313816070557, "learning_rate": 3.6704900956178924e-05, "loss": 0.1888, "step": 2439 }, { "epoch": 1.484636446607849, "grad_norm": 0.9048517942428589, "learning_rate": 3.6693899392076306e-05, "loss": 0.2122, "step": 2440 }, { "epoch": 1.4852449041679343, "grad_norm": 0.8240786194801331, "learning_rate": 3.6682894928374074e-05, "loss": 0.169, "step": 2441 }, { "epoch": 1.4858533617280194, "grad_norm": 0.9464930295944214, "learning_rate": 3.6671887567800853e-05, "loss": 0.1947, "step": 2442 }, { "epoch": 1.4864618192881047, "grad_norm": 0.8999909162521362, "learning_rate": 3.666087731308604e-05, "loss": 0.2237, "step": 2443 }, { "epoch": 1.4870702768481898, "grad_norm": 0.8082036972045898, "learning_rate": 3.664986416695969e-05, "loss": 0.1476, "step": 2444 }, { "epoch": 1.487678734408275, "grad_norm": 0.8836786150932312, "learning_rate": 3.663884813215263e-05, "loss": 0.1995, "step": 2445 }, { "epoch": 1.4882871919683602, "grad_norm": 0.8656254410743713, "learning_rate": 3.6627829211396345e-05, "loss": 0.2678, "step": 2446 }, { "epoch": 1.4888956495284453, "grad_norm": 0.7731005549430847, "learning_rate": 3.66168074074231e-05, "loss": 0.1776, "step": 2447 }, { "epoch": 1.4895041070885306, "grad_norm": 0.8301362991333008, "learning_rate": 3.6605782722965834e-05, "loss": 0.1878, "step": 2448 }, { "epoch": 1.4901125646486157, "grad_norm": 0.8362447023391724, "learning_rate": 3.65947551607582e-05, "loss": 0.2027, "step": 2449 }, { "epoch": 1.4907210222087008, "grad_norm": 0.9739522933959961, "learning_rate": 3.6583724723534574e-05, "loss": 0.2361, "step": 2450 }, { "epoch": 1.4913294797687862, "grad_norm": 0.9524062275886536, "learning_rate": 3.6572691414030055e-05, "loss": 0.2252, "step": 2451 }, { "epoch": 1.4919379373288713, "grad_norm": 1.0326991081237793, "learning_rate": 3.656165523498044e-05, "loss": 0.2235, "step": 2452 }, { "epoch": 1.4925463948889566, "grad_norm": 0.9849575161933899, "learning_rate": 3.655061618912224e-05, "loss": 0.2279, "step": 2453 }, { "epoch": 1.4931548524490417, "grad_norm": 0.8630450367927551, "learning_rate": 3.653957427919268e-05, "loss": 0.1915, "step": 2454 }, { "epoch": 1.4937633100091268, "grad_norm": 0.845496654510498, "learning_rate": 3.652852950792969e-05, "loss": 0.1721, "step": 2455 }, { "epoch": 1.494371767569212, "grad_norm": 1.2583898305892944, "learning_rate": 3.651748187807191e-05, "loss": 0.2039, "step": 2456 }, { "epoch": 1.4949802251292972, "grad_norm": 0.7946431040763855, "learning_rate": 3.650643139235871e-05, "loss": 0.1687, "step": 2457 }, { "epoch": 1.4955886826893825, "grad_norm": 1.014686107635498, "learning_rate": 3.649537805353013e-05, "loss": 0.2319, "step": 2458 }, { "epoch": 1.4961971402494676, "grad_norm": 1.0813982486724854, "learning_rate": 3.648432186432694e-05, "loss": 0.2508, "step": 2459 }, { "epoch": 1.4968055978095527, "grad_norm": 0.8511247038841248, "learning_rate": 3.6473262827490614e-05, "loss": 0.1737, "step": 2460 }, { "epoch": 1.497414055369638, "grad_norm": 0.963362455368042, "learning_rate": 3.646220094576334e-05, "loss": 0.2431, "step": 2461 }, { "epoch": 1.498022512929723, "grad_norm": 0.8051499724388123, "learning_rate": 3.6451136221888005e-05, "loss": 0.1954, "step": 2462 }, { "epoch": 1.4986309704898084, "grad_norm": 0.8649519085884094, "learning_rate": 3.644006865860819e-05, "loss": 0.1716, "step": 2463 }, { "epoch": 1.4992394280498935, "grad_norm": 0.9137884378433228, "learning_rate": 3.642899825866819e-05, "loss": 0.1964, "step": 2464 }, { "epoch": 1.4998478856099786, "grad_norm": 0.7866896986961365, "learning_rate": 3.641792502481301e-05, "loss": 0.1809, "step": 2465 }, { "epoch": 1.500456343170064, "grad_norm": 0.9844456911087036, "learning_rate": 3.640684895978834e-05, "loss": 0.2182, "step": 2466 }, { "epoch": 1.501064800730149, "grad_norm": 0.9635726809501648, "learning_rate": 3.6395770066340596e-05, "loss": 0.2182, "step": 2467 }, { "epoch": 1.5016732582902343, "grad_norm": 0.9193302392959595, "learning_rate": 3.6384688347216875e-05, "loss": 0.2455, "step": 2468 }, { "epoch": 1.5022817158503194, "grad_norm": 1.0144778490066528, "learning_rate": 3.637360380516498e-05, "loss": 0.2717, "step": 2469 }, { "epoch": 1.5028901734104045, "grad_norm": 0.8575990796089172, "learning_rate": 3.6362516442933416e-05, "loss": 0.2333, "step": 2470 }, { "epoch": 1.5034986309704899, "grad_norm": 1.1676721572875977, "learning_rate": 3.635142626327139e-05, "loss": 0.2948, "step": 2471 }, { "epoch": 1.504107088530575, "grad_norm": 0.7690470814704895, "learning_rate": 3.634033326892879e-05, "loss": 0.2268, "step": 2472 }, { "epoch": 1.5047155460906603, "grad_norm": 0.8740969300270081, "learning_rate": 3.632923746265623e-05, "loss": 0.227, "step": 2473 }, { "epoch": 1.5053240036507454, "grad_norm": 0.8315359354019165, "learning_rate": 3.631813884720502e-05, "loss": 0.2036, "step": 2474 }, { "epoch": 1.5059324612108305, "grad_norm": 0.8344742059707642, "learning_rate": 3.630703742532713e-05, "loss": 0.2397, "step": 2475 }, { "epoch": 1.5065409187709158, "grad_norm": 1.0899635553359985, "learning_rate": 3.6295933199775265e-05, "loss": 0.1978, "step": 2476 }, { "epoch": 1.5071493763310009, "grad_norm": 0.9321961998939514, "learning_rate": 3.628482617330279e-05, "loss": 0.2296, "step": 2477 }, { "epoch": 1.5077578338910862, "grad_norm": 0.8498799204826355, "learning_rate": 3.627371634866381e-05, "loss": 0.1986, "step": 2478 }, { "epoch": 1.5083662914511713, "grad_norm": 0.8041394948959351, "learning_rate": 3.626260372861308e-05, "loss": 0.1844, "step": 2479 }, { "epoch": 1.5089747490112564, "grad_norm": 0.8486339449882507, "learning_rate": 3.625148831590608e-05, "loss": 0.2032, "step": 2480 }, { "epoch": 1.5095832065713415, "grad_norm": 0.817248523235321, "learning_rate": 3.624037011329896e-05, "loss": 0.1985, "step": 2481 }, { "epoch": 1.5101916641314268, "grad_norm": 0.8181208372116089, "learning_rate": 3.6229249123548574e-05, "loss": 0.1856, "step": 2482 }, { "epoch": 1.5108001216915121, "grad_norm": 0.7917095422744751, "learning_rate": 3.621812534941246e-05, "loss": 0.1694, "step": 2483 }, { "epoch": 1.5114085792515972, "grad_norm": 0.8646035194396973, "learning_rate": 3.620699879364886e-05, "loss": 0.2158, "step": 2484 }, { "epoch": 1.5120170368116823, "grad_norm": 0.9511632323265076, "learning_rate": 3.619586945901669e-05, "loss": 0.1938, "step": 2485 }, { "epoch": 1.5126254943717674, "grad_norm": 0.7660108208656311, "learning_rate": 3.618473734827556e-05, "loss": 0.1687, "step": 2486 }, { "epoch": 1.5132339519318527, "grad_norm": 0.772885799407959, "learning_rate": 3.6173602464185765e-05, "loss": 0.2055, "step": 2487 }, { "epoch": 1.513842409491938, "grad_norm": 0.8760683536529541, "learning_rate": 3.61624648095083e-05, "loss": 0.1855, "step": 2488 }, { "epoch": 1.5144508670520231, "grad_norm": 1.2097331285476685, "learning_rate": 3.615132438700484e-05, "loss": 0.2581, "step": 2489 }, { "epoch": 1.5150593246121082, "grad_norm": 0.7943812012672424, "learning_rate": 3.614018119943774e-05, "loss": 0.2076, "step": 2490 }, { "epoch": 1.5156677821721933, "grad_norm": 0.8257327079772949, "learning_rate": 3.6129035249570053e-05, "loss": 0.202, "step": 2491 }, { "epoch": 1.5162762397322787, "grad_norm": 0.768048107624054, "learning_rate": 3.6117886540165504e-05, "loss": 0.1859, "step": 2492 }, { "epoch": 1.516884697292364, "grad_norm": 0.8744530081748962, "learning_rate": 3.6106735073988504e-05, "loss": 0.1751, "step": 2493 }, { "epoch": 1.517493154852449, "grad_norm": 0.9588749408721924, "learning_rate": 3.6095580853804155e-05, "loss": 0.2156, "step": 2494 }, { "epoch": 1.5181016124125342, "grad_norm": 0.8880223035812378, "learning_rate": 3.608442388237825e-05, "loss": 0.2116, "step": 2495 }, { "epoch": 1.5187100699726193, "grad_norm": 0.9411167502403259, "learning_rate": 3.607326416247723e-05, "loss": 0.217, "step": 2496 }, { "epoch": 1.5193185275327046, "grad_norm": 0.9615315198898315, "learning_rate": 3.606210169686827e-05, "loss": 0.2129, "step": 2497 }, { "epoch": 1.51992698509279, "grad_norm": 0.9808461666107178, "learning_rate": 3.605093648831917e-05, "loss": 0.2242, "step": 2498 }, { "epoch": 1.520535442652875, "grad_norm": 1.2025115489959717, "learning_rate": 3.603976853959845e-05, "loss": 0.2028, "step": 2499 }, { "epoch": 1.52114390021296, "grad_norm": 0.8565399646759033, "learning_rate": 3.602859785347529e-05, "loss": 0.228, "step": 2500 }, { "epoch": 1.5217523577730452, "grad_norm": 0.769661545753479, "learning_rate": 3.601742443271956e-05, "loss": 0.1927, "step": 2501 }, { "epoch": 1.5223608153331305, "grad_norm": 0.8793697953224182, "learning_rate": 3.600624828010181e-05, "loss": 0.199, "step": 2502 }, { "epoch": 1.5229692728932158, "grad_norm": 0.9120194315910339, "learning_rate": 3.599506939839323e-05, "loss": 0.2185, "step": 2503 }, { "epoch": 1.523577730453301, "grad_norm": 0.9638735055923462, "learning_rate": 3.598388779036575e-05, "loss": 0.2264, "step": 2504 }, { "epoch": 1.524186188013386, "grad_norm": 0.9277864694595337, "learning_rate": 3.597270345879192e-05, "loss": 0.2542, "step": 2505 }, { "epoch": 1.524794645573471, "grad_norm": 0.8778446316719055, "learning_rate": 3.5961516406445e-05, "loss": 0.2076, "step": 2506 }, { "epoch": 1.5254031031335564, "grad_norm": 0.8511151671409607, "learning_rate": 3.595032663609891e-05, "loss": 0.1923, "step": 2507 }, { "epoch": 1.5260115606936417, "grad_norm": 0.7811472415924072, "learning_rate": 3.593913415052825e-05, "loss": 0.1589, "step": 2508 }, { "epoch": 1.5266200182537268, "grad_norm": 0.8555339574813843, "learning_rate": 3.5927938952508284e-05, "loss": 0.1826, "step": 2509 }, { "epoch": 1.527228475813812, "grad_norm": 0.8637688159942627, "learning_rate": 3.591674104481495e-05, "loss": 0.193, "step": 2510 }, { "epoch": 1.527836933373897, "grad_norm": 0.8080950379371643, "learning_rate": 3.590554043022488e-05, "loss": 0.1814, "step": 2511 }, { "epoch": 1.5284453909339824, "grad_norm": 0.9069305658340454, "learning_rate": 3.5894337111515344e-05, "loss": 0.1547, "step": 2512 }, { "epoch": 1.5290538484940677, "grad_norm": 0.9532658457756042, "learning_rate": 3.58831310914643e-05, "loss": 0.1953, "step": 2513 }, { "epoch": 1.5296623060541528, "grad_norm": 1.2767683267593384, "learning_rate": 3.5871922372850376e-05, "loss": 0.2019, "step": 2514 }, { "epoch": 1.5302707636142379, "grad_norm": 0.8507498502731323, "learning_rate": 3.586071095845287e-05, "loss": 0.2115, "step": 2515 }, { "epoch": 1.530879221174323, "grad_norm": 1.0665339231491089, "learning_rate": 3.5849496851051744e-05, "loss": 0.2373, "step": 2516 }, { "epoch": 1.5314876787344083, "grad_norm": 0.9984211325645447, "learning_rate": 3.583828005342763e-05, "loss": 0.245, "step": 2517 }, { "epoch": 1.5320961362944936, "grad_norm": 0.850425124168396, "learning_rate": 3.5827060568361817e-05, "loss": 0.1987, "step": 2518 }, { "epoch": 1.5327045938545787, "grad_norm": 1.2310839891433716, "learning_rate": 3.5815838398636284e-05, "loss": 0.2151, "step": 2519 }, { "epoch": 1.5333130514146638, "grad_norm": 0.8494765162467957, "learning_rate": 3.580461354703365e-05, "loss": 0.245, "step": 2520 }, { "epoch": 1.5339215089747489, "grad_norm": 0.929420530796051, "learning_rate": 3.579338601633722e-05, "loss": 0.24, "step": 2521 }, { "epoch": 1.5345299665348342, "grad_norm": 0.7413504719734192, "learning_rate": 3.578215580933095e-05, "loss": 0.1886, "step": 2522 }, { "epoch": 1.5351384240949195, "grad_norm": 0.9758164286613464, "learning_rate": 3.577092292879946e-05, "loss": 0.2134, "step": 2523 }, { "epoch": 1.5357468816550046, "grad_norm": 0.8202041983604431, "learning_rate": 3.5759687377528026e-05, "loss": 0.1697, "step": 2524 }, { "epoch": 1.5363553392150897, "grad_norm": 0.816893994808197, "learning_rate": 3.574844915830263e-05, "loss": 0.1809, "step": 2525 }, { "epoch": 1.5369637967751748, "grad_norm": 0.8673279285430908, "learning_rate": 3.573720827390984e-05, "loss": 0.2501, "step": 2526 }, { "epoch": 1.5375722543352601, "grad_norm": 0.8785646557807922, "learning_rate": 3.572596472713696e-05, "loss": 0.1991, "step": 2527 }, { "epoch": 1.5381807118953454, "grad_norm": 0.8835013508796692, "learning_rate": 3.5714718520771904e-05, "loss": 0.1982, "step": 2528 }, { "epoch": 1.5387891694554305, "grad_norm": 0.8082259297370911, "learning_rate": 3.570346965760326e-05, "loss": 0.174, "step": 2529 }, { "epoch": 1.5393976270155156, "grad_norm": 0.869325578212738, "learning_rate": 3.5692218140420295e-05, "loss": 0.1844, "step": 2530 }, { "epoch": 1.5400060845756007, "grad_norm": 0.9018534421920776, "learning_rate": 3.5680963972012894e-05, "loss": 0.1921, "step": 2531 }, { "epoch": 1.540614542135686, "grad_norm": 0.8284593820571899, "learning_rate": 3.566970715517164e-05, "loss": 0.1818, "step": 2532 }, { "epoch": 1.5412229996957714, "grad_norm": 0.7905526161193848, "learning_rate": 3.565844769268774e-05, "loss": 0.1917, "step": 2533 }, { "epoch": 1.5418314572558565, "grad_norm": 0.7893586158752441, "learning_rate": 3.564718558735308e-05, "loss": 0.177, "step": 2534 }, { "epoch": 1.5424399148159416, "grad_norm": 0.7778828144073486, "learning_rate": 3.56359208419602e-05, "loss": 0.2133, "step": 2535 }, { "epoch": 1.5430483723760267, "grad_norm": 0.8154110312461853, "learning_rate": 3.562465345930227e-05, "loss": 0.2001, "step": 2536 }, { "epoch": 1.543656829936112, "grad_norm": 0.8788914084434509, "learning_rate": 3.561338344217314e-05, "loss": 0.1756, "step": 2537 }, { "epoch": 1.5442652874961973, "grad_norm": 0.7563880085945129, "learning_rate": 3.560211079336731e-05, "loss": 0.1944, "step": 2538 }, { "epoch": 1.5448737450562824, "grad_norm": 0.7627280354499817, "learning_rate": 3.559083551567991e-05, "loss": 0.1919, "step": 2539 }, { "epoch": 1.5454822026163675, "grad_norm": 1.1467633247375488, "learning_rate": 3.5579557611906755e-05, "loss": 0.1835, "step": 2540 }, { "epoch": 1.5460906601764526, "grad_norm": 0.9292544722557068, "learning_rate": 3.5568277084844295e-05, "loss": 0.2053, "step": 2541 }, { "epoch": 1.546699117736538, "grad_norm": 0.7936065196990967, "learning_rate": 3.555699393728962e-05, "loss": 0.1753, "step": 2542 }, { "epoch": 1.5473075752966232, "grad_norm": 0.8932594060897827, "learning_rate": 3.554570817204048e-05, "loss": 0.1695, "step": 2543 }, { "epoch": 1.5479160328567083, "grad_norm": 1.080686330795288, "learning_rate": 3.553441979189529e-05, "loss": 0.259, "step": 2544 }, { "epoch": 1.5485244904167934, "grad_norm": 0.7938811182975769, "learning_rate": 3.5523128799653084e-05, "loss": 0.1673, "step": 2545 }, { "epoch": 1.5491329479768785, "grad_norm": 0.8280820846557617, "learning_rate": 3.551183519811356e-05, "loss": 0.1884, "step": 2546 }, { "epoch": 1.5497414055369638, "grad_norm": 0.8619610071182251, "learning_rate": 3.550053899007707e-05, "loss": 0.1851, "step": 2547 }, { "epoch": 1.5503498630970491, "grad_norm": 0.824386477470398, "learning_rate": 3.548924017834458e-05, "loss": 0.1824, "step": 2548 }, { "epoch": 1.5509583206571342, "grad_norm": 0.8854561448097229, "learning_rate": 3.547793876571775e-05, "loss": 0.1681, "step": 2549 }, { "epoch": 1.5515667782172193, "grad_norm": 0.7834053635597229, "learning_rate": 3.546663475499884e-05, "loss": 0.1574, "step": 2550 }, { "epoch": 1.5521752357773044, "grad_norm": 1.0175268650054932, "learning_rate": 3.5455328148990794e-05, "loss": 0.2224, "step": 2551 }, { "epoch": 1.5527836933373897, "grad_norm": 0.7926445007324219, "learning_rate": 3.544401895049716e-05, "loss": 0.1664, "step": 2552 }, { "epoch": 1.553392150897475, "grad_norm": 0.8977791666984558, "learning_rate": 3.543270716232215e-05, "loss": 0.2121, "step": 2553 }, { "epoch": 1.5540006084575602, "grad_norm": 0.7897874116897583, "learning_rate": 3.542139278727062e-05, "loss": 0.2033, "step": 2554 }, { "epoch": 1.5546090660176453, "grad_norm": 0.8928864598274231, "learning_rate": 3.541007582814807e-05, "loss": 0.1999, "step": 2555 }, { "epoch": 1.5552175235777304, "grad_norm": 0.8382681608200073, "learning_rate": 3.539875628776062e-05, "loss": 0.1996, "step": 2556 }, { "epoch": 1.5558259811378157, "grad_norm": 0.7771824598312378, "learning_rate": 3.5387434168915065e-05, "loss": 0.1788, "step": 2557 }, { "epoch": 1.5564344386979008, "grad_norm": 0.9810817241668701, "learning_rate": 3.53761094744188e-05, "loss": 0.1917, "step": 2558 }, { "epoch": 1.557042896257986, "grad_norm": 0.8481950163841248, "learning_rate": 3.5364782207079886e-05, "loss": 0.1748, "step": 2559 }, { "epoch": 1.5576513538180712, "grad_norm": 0.8863506317138672, "learning_rate": 3.5353452369707e-05, "loss": 0.1898, "step": 2560 }, { "epoch": 1.5582598113781563, "grad_norm": 0.9136548638343811, "learning_rate": 3.534211996510949e-05, "loss": 0.2652, "step": 2561 }, { "epoch": 1.5588682689382416, "grad_norm": 0.814624011516571, "learning_rate": 3.53307849960973e-05, "loss": 0.1837, "step": 2562 }, { "epoch": 1.5594767264983267, "grad_norm": 0.9348104000091553, "learning_rate": 3.531944746548105e-05, "loss": 0.1614, "step": 2563 }, { "epoch": 1.560085184058412, "grad_norm": 0.817783534526825, "learning_rate": 3.530810737607195e-05, "loss": 0.1853, "step": 2564 }, { "epoch": 1.560693641618497, "grad_norm": 0.8149775266647339, "learning_rate": 3.529676473068189e-05, "loss": 0.1745, "step": 2565 }, { "epoch": 1.5613020991785822, "grad_norm": 0.8247946500778198, "learning_rate": 3.5285419532123375e-05, "loss": 0.1779, "step": 2566 }, { "epoch": 1.5619105567386675, "grad_norm": 0.8637406826019287, "learning_rate": 3.5274071783209525e-05, "loss": 0.1981, "step": 2567 }, { "epoch": 1.5625190142987526, "grad_norm": 0.7747231721878052, "learning_rate": 3.5262721486754125e-05, "loss": 0.1827, "step": 2568 }, { "epoch": 1.563127471858838, "grad_norm": 0.9045551419258118, "learning_rate": 3.525136864557156e-05, "loss": 0.2214, "step": 2569 }, { "epoch": 1.563735929418923, "grad_norm": 0.9226981997489929, "learning_rate": 3.5240013262476866e-05, "loss": 0.2045, "step": 2570 }, { "epoch": 1.5643443869790081, "grad_norm": 0.793237566947937, "learning_rate": 3.522865534028572e-05, "loss": 0.1727, "step": 2571 }, { "epoch": 1.5649528445390934, "grad_norm": 0.9380844235420227, "learning_rate": 3.5217294881814386e-05, "loss": 0.2171, "step": 2572 }, { "epoch": 1.5655613020991785, "grad_norm": 0.964013397693634, "learning_rate": 3.520593188987982e-05, "loss": 0.2032, "step": 2573 }, { "epoch": 1.5661697596592639, "grad_norm": 0.9411283731460571, "learning_rate": 3.519456636729953e-05, "loss": 0.1901, "step": 2574 }, { "epoch": 1.566778217219349, "grad_norm": 0.8983286023139954, "learning_rate": 3.518319831689172e-05, "loss": 0.1837, "step": 2575 }, { "epoch": 1.567386674779434, "grad_norm": 0.8221237659454346, "learning_rate": 3.517182774147518e-05, "loss": 0.1748, "step": 2576 }, { "epoch": 1.5679951323395194, "grad_norm": 0.9628398418426514, "learning_rate": 3.516045464386935e-05, "loss": 0.1888, "step": 2577 }, { "epoch": 1.5686035898996045, "grad_norm": 0.8376243710517883, "learning_rate": 3.5149079026894266e-05, "loss": 0.2096, "step": 2578 }, { "epoch": 1.5692120474596898, "grad_norm": 0.8941128253936768, "learning_rate": 3.513770089337063e-05, "loss": 0.2027, "step": 2579 }, { "epoch": 1.5698205050197749, "grad_norm": 0.9198672771453857, "learning_rate": 3.512632024611972e-05, "loss": 0.218, "step": 2580 }, { "epoch": 1.57042896257986, "grad_norm": 0.9642953872680664, "learning_rate": 3.511493708796348e-05, "loss": 0.2402, "step": 2581 }, { "epoch": 1.571037420139945, "grad_norm": 0.8804176449775696, "learning_rate": 3.5103551421724457e-05, "loss": 0.2181, "step": 2582 }, { "epoch": 1.5716458777000304, "grad_norm": 0.7416863441467285, "learning_rate": 3.509216325022582e-05, "loss": 0.192, "step": 2583 }, { "epoch": 1.5722543352601157, "grad_norm": 0.8433103561401367, "learning_rate": 3.5080772576291356e-05, "loss": 0.2138, "step": 2584 }, { "epoch": 1.5728627928202008, "grad_norm": 0.8431623578071594, "learning_rate": 3.506937940274547e-05, "loss": 0.1981, "step": 2585 }, { "epoch": 1.573471250380286, "grad_norm": 0.8424465656280518, "learning_rate": 3.5057983732413224e-05, "loss": 0.1745, "step": 2586 }, { "epoch": 1.574079707940371, "grad_norm": 0.8414326310157776, "learning_rate": 3.504658556812024e-05, "loss": 0.1687, "step": 2587 }, { "epoch": 1.5746881655004563, "grad_norm": 0.8884822130203247, "learning_rate": 3.503518491269279e-05, "loss": 0.1771, "step": 2588 }, { "epoch": 1.5752966230605416, "grad_norm": 0.7992542386054993, "learning_rate": 3.502378176895778e-05, "loss": 0.1782, "step": 2589 }, { "epoch": 1.5759050806206267, "grad_norm": 0.9021846055984497, "learning_rate": 3.50123761397427e-05, "loss": 0.2075, "step": 2590 }, { "epoch": 1.5765135381807118, "grad_norm": 1.1060278415679932, "learning_rate": 3.500096802787567e-05, "loss": 0.2263, "step": 2591 }, { "epoch": 1.577121995740797, "grad_norm": 1.0591530799865723, "learning_rate": 3.4989557436185434e-05, "loss": 0.1848, "step": 2592 }, { "epoch": 1.5777304533008822, "grad_norm": 0.945749044418335, "learning_rate": 3.4978144367501335e-05, "loss": 0.2026, "step": 2593 }, { "epoch": 1.5783389108609676, "grad_norm": 0.8668262958526611, "learning_rate": 3.496672882465335e-05, "loss": 0.1701, "step": 2594 }, { "epoch": 1.5789473684210527, "grad_norm": 0.8319501280784607, "learning_rate": 3.495531081047204e-05, "loss": 0.1532, "step": 2595 }, { "epoch": 1.5795558259811378, "grad_norm": 0.9786222577095032, "learning_rate": 3.494389032778862e-05, "loss": 0.2242, "step": 2596 }, { "epoch": 1.5801642835412228, "grad_norm": 0.8528378009796143, "learning_rate": 3.493246737943487e-05, "loss": 0.2037, "step": 2597 }, { "epoch": 1.5807727411013082, "grad_norm": 1.3592191934585571, "learning_rate": 3.492104196824322e-05, "loss": 0.2073, "step": 2598 }, { "epoch": 1.5813811986613935, "grad_norm": 0.7618198394775391, "learning_rate": 3.4909614097046686e-05, "loss": 0.1418, "step": 2599 }, { "epoch": 1.5819896562214786, "grad_norm": 0.8264166712760925, "learning_rate": 3.489818376867891e-05, "loss": 0.1669, "step": 2600 }, { "epoch": 1.5825981137815637, "grad_norm": 0.8527823090553284, "learning_rate": 3.4886750985974136e-05, "loss": 0.1732, "step": 2601 }, { "epoch": 1.5832065713416488, "grad_norm": 0.8907186985015869, "learning_rate": 3.487531575176722e-05, "loss": 0.2095, "step": 2602 }, { "epoch": 1.583815028901734, "grad_norm": 0.8088558912277222, "learning_rate": 3.4863878068893625e-05, "loss": 0.1638, "step": 2603 }, { "epoch": 1.5844234864618194, "grad_norm": 1.0112056732177734, "learning_rate": 3.4852437940189414e-05, "loss": 0.1842, "step": 2604 }, { "epoch": 1.5850319440219045, "grad_norm": 0.6868709325790405, "learning_rate": 3.4840995368491255e-05, "loss": 0.1572, "step": 2605 }, { "epoch": 1.5856404015819896, "grad_norm": 0.8623439073562622, "learning_rate": 3.4829550356636445e-05, "loss": 0.1846, "step": 2606 }, { "epoch": 1.5862488591420747, "grad_norm": 0.8471298813819885, "learning_rate": 3.481810290746287e-05, "loss": 0.1654, "step": 2607 }, { "epoch": 1.58685731670216, "grad_norm": 0.8028208017349243, "learning_rate": 3.4806653023808996e-05, "loss": 0.1955, "step": 2608 }, { "epoch": 1.5874657742622453, "grad_norm": 0.8367049694061279, "learning_rate": 3.4795200708513945e-05, "loss": 0.1996, "step": 2609 }, { "epoch": 1.5880742318223304, "grad_norm": 0.8771877884864807, "learning_rate": 3.47837459644174e-05, "loss": 0.2201, "step": 2610 }, { "epoch": 1.5886826893824155, "grad_norm": 0.876565158367157, "learning_rate": 3.477228879435966e-05, "loss": 0.1817, "step": 2611 }, { "epoch": 1.5892911469425006, "grad_norm": 0.9059187173843384, "learning_rate": 3.4760829201181635e-05, "loss": 0.2115, "step": 2612 }, { "epoch": 1.589899604502586, "grad_norm": 0.9595020413398743, "learning_rate": 3.474936718772481e-05, "loss": 0.212, "step": 2613 }, { "epoch": 1.5905080620626713, "grad_norm": 0.8010035753250122, "learning_rate": 3.47379027568313e-05, "loss": 0.1616, "step": 2614 }, { "epoch": 1.5911165196227564, "grad_norm": 0.9189199805259705, "learning_rate": 3.4726435911343804e-05, "loss": 0.1859, "step": 2615 }, { "epoch": 1.5917249771828414, "grad_norm": 0.8791665434837341, "learning_rate": 3.471496665410561e-05, "loss": 0.2185, "step": 2616 }, { "epoch": 1.5923334347429265, "grad_norm": 0.9701746106147766, "learning_rate": 3.4703494987960637e-05, "loss": 0.2412, "step": 2617 }, { "epoch": 1.5929418923030119, "grad_norm": 0.7525075674057007, "learning_rate": 3.469202091575337e-05, "loss": 0.1916, "step": 2618 }, { "epoch": 1.5935503498630972, "grad_norm": 0.9566659331321716, "learning_rate": 3.468054444032889e-05, "loss": 0.1985, "step": 2619 }, { "epoch": 1.5941588074231823, "grad_norm": 0.7912515997886658, "learning_rate": 3.46690655645329e-05, "loss": 0.2016, "step": 2620 }, { "epoch": 1.5947672649832674, "grad_norm": 0.6893926858901978, "learning_rate": 3.465758429121168e-05, "loss": 0.1687, "step": 2621 }, { "epoch": 1.5953757225433525, "grad_norm": 0.8185518980026245, "learning_rate": 3.4646100623212094e-05, "loss": 0.1726, "step": 2622 }, { "epoch": 1.5959841801034378, "grad_norm": 1.019525170326233, "learning_rate": 3.4634614563381624e-05, "loss": 0.1899, "step": 2623 }, { "epoch": 1.596592637663523, "grad_norm": 0.8567332625389099, "learning_rate": 3.4623126114568336e-05, "loss": 0.1901, "step": 2624 }, { "epoch": 1.5972010952236082, "grad_norm": 0.7891209125518799, "learning_rate": 3.4611635279620885e-05, "loss": 0.1894, "step": 2625 }, { "epoch": 1.5978095527836933, "grad_norm": 0.782440721988678, "learning_rate": 3.460014206138851e-05, "loss": 0.1761, "step": 2626 }, { "epoch": 1.5984180103437784, "grad_norm": 1.1387567520141602, "learning_rate": 3.458864646272107e-05, "loss": 0.1906, "step": 2627 }, { "epoch": 1.5990264679038637, "grad_norm": 0.766904354095459, "learning_rate": 3.4577148486468975e-05, "loss": 0.1905, "step": 2628 }, { "epoch": 1.599634925463949, "grad_norm": 0.8891963362693787, "learning_rate": 3.456564813548325e-05, "loss": 0.1965, "step": 2629 }, { "epoch": 1.6002433830240341, "grad_norm": 0.8071811199188232, "learning_rate": 3.45541454126155e-05, "loss": 0.185, "step": 2630 }, { "epoch": 1.6008518405841192, "grad_norm": 0.9761980175971985, "learning_rate": 3.4542640320717926e-05, "loss": 0.2078, "step": 2631 }, { "epoch": 1.6014602981442043, "grad_norm": 1.070326805114746, "learning_rate": 3.453113286264332e-05, "loss": 0.2261, "step": 2632 }, { "epoch": 1.6020687557042896, "grad_norm": 0.7715917825698853, "learning_rate": 3.4519623041245026e-05, "loss": 0.1596, "step": 2633 }, { "epoch": 1.602677213264375, "grad_norm": 0.7550219893455505, "learning_rate": 3.450811085937702e-05, "loss": 0.1756, "step": 2634 }, { "epoch": 1.60328567082446, "grad_norm": 1.0285836458206177, "learning_rate": 3.4496596319893844e-05, "loss": 0.1657, "step": 2635 }, { "epoch": 1.6038941283845451, "grad_norm": 0.9173053503036499, "learning_rate": 3.448507942565061e-05, "loss": 0.1729, "step": 2636 }, { "epoch": 1.6045025859446302, "grad_norm": 3.4212629795074463, "learning_rate": 3.4473560179503045e-05, "loss": 0.2194, "step": 2637 }, { "epoch": 1.6051110435047156, "grad_norm": 0.9600544571876526, "learning_rate": 3.4462038584307424e-05, "loss": 0.16, "step": 2638 }, { "epoch": 1.6057195010648009, "grad_norm": 0.8442495465278625, "learning_rate": 3.4450514642920636e-05, "loss": 0.1872, "step": 2639 }, { "epoch": 1.606327958624886, "grad_norm": 0.9204790592193604, "learning_rate": 3.443898835820014e-05, "loss": 0.1905, "step": 2640 }, { "epoch": 1.606936416184971, "grad_norm": 0.9242935180664062, "learning_rate": 3.442745973300395e-05, "loss": 0.1767, "step": 2641 }, { "epoch": 1.6075448737450562, "grad_norm": 0.7664668560028076, "learning_rate": 3.441592877019072e-05, "loss": 0.1682, "step": 2642 }, { "epoch": 1.6081533313051415, "grad_norm": 0.7136707901954651, "learning_rate": 3.440439547261962e-05, "loss": 0.1695, "step": 2643 }, { "epoch": 1.6087617888652268, "grad_norm": 0.8739480972290039, "learning_rate": 3.4392859843150435e-05, "loss": 0.1854, "step": 2644 }, { "epoch": 1.609370246425312, "grad_norm": 0.9654523134231567, "learning_rate": 3.4381321884643534e-05, "loss": 0.1728, "step": 2645 }, { "epoch": 1.609978703985397, "grad_norm": 0.9120699167251587, "learning_rate": 3.4369781599959825e-05, "loss": 0.1915, "step": 2646 }, { "epoch": 1.610587161545482, "grad_norm": 0.8703420758247375, "learning_rate": 3.4358238991960837e-05, "loss": 0.1929, "step": 2647 }, { "epoch": 1.6111956191055674, "grad_norm": 1.062511920928955, "learning_rate": 3.434669406350866e-05, "loss": 0.2282, "step": 2648 }, { "epoch": 1.6118040766656527, "grad_norm": 0.9723806381225586, "learning_rate": 3.433514681746593e-05, "loss": 0.2546, "step": 2649 }, { "epoch": 1.6124125342257378, "grad_norm": 0.8156774044036865, "learning_rate": 3.4323597256695906e-05, "loss": 0.1962, "step": 2650 }, { "epoch": 1.613020991785823, "grad_norm": 0.6880114674568176, "learning_rate": 3.4312045384062386e-05, "loss": 0.1365, "step": 2651 }, { "epoch": 1.613629449345908, "grad_norm": 0.9442213773727417, "learning_rate": 3.430049120242975e-05, "loss": 0.2038, "step": 2652 }, { "epoch": 1.6142379069059933, "grad_norm": 0.8928872346878052, "learning_rate": 3.428893471466297e-05, "loss": 0.2134, "step": 2653 }, { "epoch": 1.6148463644660787, "grad_norm": 0.7231988310813904, "learning_rate": 3.427737592362755e-05, "loss": 0.1674, "step": 2654 }, { "epoch": 1.6154548220261638, "grad_norm": 0.8021813631057739, "learning_rate": 3.4265814832189614e-05, "loss": 0.1836, "step": 2655 }, { "epoch": 1.6160632795862488, "grad_norm": 0.8998550772666931, "learning_rate": 3.425425144321581e-05, "loss": 0.1854, "step": 2656 }, { "epoch": 1.616671737146334, "grad_norm": 0.8088403344154358, "learning_rate": 3.424268575957339e-05, "loss": 0.1573, "step": 2657 }, { "epoch": 1.6172801947064193, "grad_norm": 0.9545161724090576, "learning_rate": 3.423111778413015e-05, "loss": 0.229, "step": 2658 }, { "epoch": 1.6178886522665044, "grad_norm": 1.0301522016525269, "learning_rate": 3.4219547519754475e-05, "loss": 0.2269, "step": 2659 }, { "epoch": 1.6184971098265897, "grad_norm": 1.287305474281311, "learning_rate": 3.42079749693153e-05, "loss": 0.2085, "step": 2660 }, { "epoch": 1.6191055673866748, "grad_norm": 0.7046839594841003, "learning_rate": 3.419640013568215e-05, "loss": 0.1573, "step": 2661 }, { "epoch": 1.6197140249467599, "grad_norm": 0.8109933137893677, "learning_rate": 3.418482302172508e-05, "loss": 0.173, "step": 2662 }, { "epoch": 1.6203224825068452, "grad_norm": 0.792901873588562, "learning_rate": 3.4173243630314754e-05, "loss": 0.1872, "step": 2663 }, { "epoch": 1.6209309400669303, "grad_norm": 0.8485409021377563, "learning_rate": 3.4161661964322354e-05, "loss": 0.1863, "step": 2664 }, { "epoch": 1.6215393976270156, "grad_norm": 0.7961007356643677, "learning_rate": 3.415007802661967e-05, "loss": 0.1448, "step": 2665 }, { "epoch": 1.6221478551871007, "grad_norm": 0.7520683407783508, "learning_rate": 3.413849182007903e-05, "loss": 0.1682, "step": 2666 }, { "epoch": 1.6227563127471858, "grad_norm": 0.8546669483184814, "learning_rate": 3.412690334757334e-05, "loss": 0.1731, "step": 2667 }, { "epoch": 1.623364770307271, "grad_norm": 0.9142362475395203, "learning_rate": 3.411531261197603e-05, "loss": 0.2352, "step": 2668 }, { "epoch": 1.6239732278673562, "grad_norm": 0.7282983064651489, "learning_rate": 3.410371961616114e-05, "loss": 0.1176, "step": 2669 }, { "epoch": 1.6245816854274415, "grad_norm": 0.8929833769798279, "learning_rate": 3.409212436300326e-05, "loss": 0.1912, "step": 2670 }, { "epoch": 1.6251901429875266, "grad_norm": 0.7896724939346313, "learning_rate": 3.40805268553775e-05, "loss": 0.1885, "step": 2671 }, { "epoch": 1.6257986005476117, "grad_norm": 0.9245496392250061, "learning_rate": 3.4068927096159594e-05, "loss": 0.1935, "step": 2672 }, { "epoch": 1.626407058107697, "grad_norm": 0.8307329416275024, "learning_rate": 3.4057325088225764e-05, "loss": 0.1696, "step": 2673 }, { "epoch": 1.6270155156677821, "grad_norm": 0.9103578925132751, "learning_rate": 3.404572083445285e-05, "loss": 0.1489, "step": 2674 }, { "epoch": 1.6276239732278674, "grad_norm": 0.868897557258606, "learning_rate": 3.40341143377182e-05, "loss": 0.2073, "step": 2675 }, { "epoch": 1.6282324307879525, "grad_norm": 0.79030442237854, "learning_rate": 3.402250560089977e-05, "loss": 0.195, "step": 2676 }, { "epoch": 1.6288408883480376, "grad_norm": 0.7882320284843445, "learning_rate": 3.401089462687602e-05, "loss": 0.1669, "step": 2677 }, { "epoch": 1.629449345908123, "grad_norm": 0.920220673084259, "learning_rate": 3.399928141852599e-05, "loss": 0.2067, "step": 2678 }, { "epoch": 1.630057803468208, "grad_norm": 1.1648184061050415, "learning_rate": 3.398766597872928e-05, "loss": 0.1998, "step": 2679 }, { "epoch": 1.6306662610282934, "grad_norm": 1.0704290866851807, "learning_rate": 3.397604831036604e-05, "loss": 0.2699, "step": 2680 }, { "epoch": 1.6312747185883785, "grad_norm": 1.071129560470581, "learning_rate": 3.396442841631695e-05, "loss": 0.224, "step": 2681 }, { "epoch": 1.6318831761484636, "grad_norm": 0.7212443947792053, "learning_rate": 3.395280629946327e-05, "loss": 0.1551, "step": 2682 }, { "epoch": 1.6324916337085489, "grad_norm": 0.8159400224685669, "learning_rate": 3.39411819626868e-05, "loss": 0.1954, "step": 2683 }, { "epoch": 1.633100091268634, "grad_norm": 0.8605476021766663, "learning_rate": 3.3929555408869896e-05, "loss": 0.2296, "step": 2684 }, { "epoch": 1.6337085488287193, "grad_norm": 0.8847449421882629, "learning_rate": 3.3917926640895445e-05, "loss": 0.2023, "step": 2685 }, { "epoch": 1.6343170063888044, "grad_norm": 0.8677266240119934, "learning_rate": 3.390629566164691e-05, "loss": 0.2, "step": 2686 }, { "epoch": 1.6349254639488895, "grad_norm": 0.8311256170272827, "learning_rate": 3.389466247400828e-05, "loss": 0.2025, "step": 2687 }, { "epoch": 1.6355339215089746, "grad_norm": 3.200028896331787, "learning_rate": 3.38830270808641e-05, "loss": 0.2398, "step": 2688 }, { "epoch": 1.63614237906906, "grad_norm": 0.7760987877845764, "learning_rate": 3.3871389485099474e-05, "loss": 0.1651, "step": 2689 }, { "epoch": 1.6367508366291452, "grad_norm": 1.227342128753662, "learning_rate": 3.385974968960003e-05, "loss": 0.1905, "step": 2690 }, { "epoch": 1.6373592941892303, "grad_norm": 0.832114577293396, "learning_rate": 3.384810769725196e-05, "loss": 0.1901, "step": 2691 }, { "epoch": 1.6379677517493154, "grad_norm": 0.7426611185073853, "learning_rate": 3.383646351094198e-05, "loss": 0.1425, "step": 2692 }, { "epoch": 1.6385762093094005, "grad_norm": 0.7370221018791199, "learning_rate": 3.382481713355738e-05, "loss": 0.1638, "step": 2693 }, { "epoch": 1.6391846668694858, "grad_norm": 0.9661339521408081, "learning_rate": 3.381316856798596e-05, "loss": 0.1937, "step": 2694 }, { "epoch": 1.6397931244295711, "grad_norm": 0.8891693353652954, "learning_rate": 3.3801517817116094e-05, "loss": 0.1547, "step": 2695 }, { "epoch": 1.6404015819896562, "grad_norm": 1.042301058769226, "learning_rate": 3.378986488383667e-05, "loss": 0.2168, "step": 2696 }, { "epoch": 1.6410100395497413, "grad_norm": 0.7799018025398254, "learning_rate": 3.377820977103714e-05, "loss": 0.1444, "step": 2697 }, { "epoch": 1.6416184971098264, "grad_norm": 0.9158018231391907, "learning_rate": 3.376655248160747e-05, "loss": 0.2048, "step": 2698 }, { "epoch": 1.6422269546699118, "grad_norm": 0.9366563558578491, "learning_rate": 3.37548930184382e-05, "loss": 0.2095, "step": 2699 }, { "epoch": 1.642835412229997, "grad_norm": 0.7867175340652466, "learning_rate": 3.3743231384420384e-05, "loss": 0.1721, "step": 2700 }, { "epoch": 1.6434438697900822, "grad_norm": 0.9113629460334778, "learning_rate": 3.3731567582445615e-05, "loss": 0.1875, "step": 2701 }, { "epoch": 1.6440523273501673, "grad_norm": 0.9016094207763672, "learning_rate": 3.371990161540603e-05, "loss": 0.2127, "step": 2702 }, { "epoch": 1.6446607849102524, "grad_norm": 0.8337920904159546, "learning_rate": 3.3708233486194324e-05, "loss": 0.1788, "step": 2703 }, { "epoch": 1.6452692424703377, "grad_norm": 0.8652783036231995, "learning_rate": 3.369656319770369e-05, "loss": 0.1558, "step": 2704 }, { "epoch": 1.645877700030423, "grad_norm": 0.8067019581794739, "learning_rate": 3.368489075282786e-05, "loss": 0.174, "step": 2705 }, { "epoch": 1.646486157590508, "grad_norm": 0.7883349657058716, "learning_rate": 3.367321615446113e-05, "loss": 0.1485, "step": 2706 }, { "epoch": 1.6470946151505932, "grad_norm": 1.0703591108322144, "learning_rate": 3.366153940549832e-05, "loss": 0.1884, "step": 2707 }, { "epoch": 1.6477030727106783, "grad_norm": 0.6376674175262451, "learning_rate": 3.364986050883476e-05, "loss": 0.1306, "step": 2708 }, { "epoch": 1.6483115302707636, "grad_norm": 0.9308871030807495, "learning_rate": 3.363817946736634e-05, "loss": 0.1994, "step": 2709 }, { "epoch": 1.648919987830849, "grad_norm": 0.8675797581672668, "learning_rate": 3.3626496283989476e-05, "loss": 0.197, "step": 2710 }, { "epoch": 1.649528445390934, "grad_norm": 0.9659071564674377, "learning_rate": 3.361481096160109e-05, "loss": 0.2214, "step": 2711 }, { "epoch": 1.6501369029510191, "grad_norm": 0.8575699925422668, "learning_rate": 3.3603123503098675e-05, "loss": 0.1685, "step": 2712 }, { "epoch": 1.6507453605111042, "grad_norm": 0.8685420155525208, "learning_rate": 3.359143391138023e-05, "loss": 0.1846, "step": 2713 }, { "epoch": 1.6513538180711895, "grad_norm": 0.862989604473114, "learning_rate": 3.357974218934429e-05, "loss": 0.2076, "step": 2714 }, { "epoch": 1.6519622756312748, "grad_norm": 0.9207594990730286, "learning_rate": 3.356804833988989e-05, "loss": 0.1636, "step": 2715 }, { "epoch": 1.65257073319136, "grad_norm": 0.9252761602401733, "learning_rate": 3.3556352365916646e-05, "loss": 0.2106, "step": 2716 }, { "epoch": 1.653179190751445, "grad_norm": 1.0307517051696777, "learning_rate": 3.354465427032467e-05, "loss": 0.255, "step": 2717 }, { "epoch": 1.6537876483115301, "grad_norm": 0.7410596013069153, "learning_rate": 3.353295405601457e-05, "loss": 0.1325, "step": 2718 }, { "epoch": 1.6543961058716155, "grad_norm": 0.8445053696632385, "learning_rate": 3.352125172588756e-05, "loss": 0.1847, "step": 2719 }, { "epoch": 1.6550045634317008, "grad_norm": 0.7926309704780579, "learning_rate": 3.350954728284529e-05, "loss": 0.178, "step": 2720 }, { "epoch": 1.6556130209917859, "grad_norm": 0.8683445453643799, "learning_rate": 3.349784072979e-05, "loss": 0.1496, "step": 2721 }, { "epoch": 1.656221478551871, "grad_norm": 0.9647382497787476, "learning_rate": 3.348613206962441e-05, "loss": 0.1771, "step": 2722 }, { "epoch": 1.656829936111956, "grad_norm": 1.0857597589492798, "learning_rate": 3.3474421305251785e-05, "loss": 0.1893, "step": 2723 }, { "epoch": 1.6574383936720414, "grad_norm": 0.904272198677063, "learning_rate": 3.3462708439575916e-05, "loss": 0.1812, "step": 2724 }, { "epoch": 1.6580468512321267, "grad_norm": 0.9223242402076721, "learning_rate": 3.3450993475501084e-05, "loss": 0.2006, "step": 2725 }, { "epoch": 1.6586553087922118, "grad_norm": 0.8843355178833008, "learning_rate": 3.343927641593213e-05, "loss": 0.1732, "step": 2726 }, { "epoch": 1.6592637663522969, "grad_norm": 0.8139360547065735, "learning_rate": 3.3427557263774395e-05, "loss": 0.1904, "step": 2727 }, { "epoch": 1.659872223912382, "grad_norm": 0.8607851266860962, "learning_rate": 3.341583602193373e-05, "loss": 0.1773, "step": 2728 }, { "epoch": 1.6604806814724673, "grad_norm": 0.7943541407585144, "learning_rate": 3.3404112693316525e-05, "loss": 0.1659, "step": 2729 }, { "epoch": 1.6610891390325526, "grad_norm": 0.8024083971977234, "learning_rate": 3.339238728082968e-05, "loss": 0.1562, "step": 2730 }, { "epoch": 1.6616975965926377, "grad_norm": 0.7931082248687744, "learning_rate": 3.338065978738059e-05, "loss": 0.1367, "step": 2731 }, { "epoch": 1.6623060541527228, "grad_norm": 0.8233804106712341, "learning_rate": 3.33689302158772e-05, "loss": 0.1757, "step": 2732 }, { "epoch": 1.662914511712808, "grad_norm": 0.9862247705459595, "learning_rate": 3.3357198569227954e-05, "loss": 0.1799, "step": 2733 }, { "epoch": 1.6635229692728932, "grad_norm": 0.7572802901268005, "learning_rate": 3.33454648503418e-05, "loss": 0.1937, "step": 2734 }, { "epoch": 1.6641314268329785, "grad_norm": 0.764121949672699, "learning_rate": 3.333372906212823e-05, "loss": 0.1609, "step": 2735 }, { "epoch": 1.6647398843930636, "grad_norm": 0.9266926646232605, "learning_rate": 3.332199120749721e-05, "loss": 0.1643, "step": 2736 }, { "epoch": 1.6653483419531487, "grad_norm": 0.7189835906028748, "learning_rate": 3.331025128935926e-05, "loss": 0.1632, "step": 2737 }, { "epoch": 1.6659567995132338, "grad_norm": 0.9593905806541443, "learning_rate": 3.3298509310625363e-05, "loss": 0.1825, "step": 2738 }, { "epoch": 1.6665652570733192, "grad_norm": 0.8221538066864014, "learning_rate": 3.328676527420706e-05, "loss": 0.184, "step": 2739 }, { "epoch": 1.6671737146334045, "grad_norm": 0.8926746249198914, "learning_rate": 3.3275019183016384e-05, "loss": 0.1851, "step": 2740 }, { "epoch": 1.6677821721934896, "grad_norm": 0.8557993769645691, "learning_rate": 3.326327103996587e-05, "loss": 0.1588, "step": 2741 }, { "epoch": 1.6683906297535747, "grad_norm": 0.701745867729187, "learning_rate": 3.3251520847968566e-05, "loss": 0.1516, "step": 2742 }, { "epoch": 1.6689990873136598, "grad_norm": 0.9786536693572998, "learning_rate": 3.323976860993803e-05, "loss": 0.1958, "step": 2743 }, { "epoch": 1.669607544873745, "grad_norm": 0.7492821216583252, "learning_rate": 3.322801432878833e-05, "loss": 0.1302, "step": 2744 }, { "epoch": 1.6702160024338304, "grad_norm": 0.8823762536048889, "learning_rate": 3.321625800743404e-05, "loss": 0.1697, "step": 2745 }, { "epoch": 1.6708244599939155, "grad_norm": 1.0150517225265503, "learning_rate": 3.320449964879023e-05, "loss": 0.1984, "step": 2746 }, { "epoch": 1.6714329175540006, "grad_norm": 0.9592936635017395, "learning_rate": 3.31927392557725e-05, "loss": 0.2018, "step": 2747 }, { "epoch": 1.6720413751140857, "grad_norm": 0.9560689926147461, "learning_rate": 3.318097683129691e-05, "loss": 0.1886, "step": 2748 }, { "epoch": 1.672649832674171, "grad_norm": 1.1188546419143677, "learning_rate": 3.316921237828007e-05, "loss": 0.2193, "step": 2749 }, { "epoch": 1.6732582902342563, "grad_norm": 0.8860459923744202, "learning_rate": 3.315744589963907e-05, "loss": 0.1493, "step": 2750 }, { "epoch": 1.6738667477943414, "grad_norm": 0.9111357927322388, "learning_rate": 3.314567739829151e-05, "loss": 0.2217, "step": 2751 }, { "epoch": 1.6744752053544265, "grad_norm": 0.9053143262863159, "learning_rate": 3.313390687715548e-05, "loss": 0.2102, "step": 2752 }, { "epoch": 1.6750836629145116, "grad_norm": 0.8748982548713684, "learning_rate": 3.3122134339149585e-05, "loss": 0.1728, "step": 2753 }, { "epoch": 1.675692120474597, "grad_norm": 0.8217591643333435, "learning_rate": 3.311035978719292e-05, "loss": 0.1708, "step": 2754 }, { "epoch": 1.6763005780346822, "grad_norm": 0.9044924378395081, "learning_rate": 3.309858322420508e-05, "loss": 0.2065, "step": 2755 }, { "epoch": 1.6769090355947673, "grad_norm": 0.7668700814247131, "learning_rate": 3.308680465310617e-05, "loss": 0.2101, "step": 2756 }, { "epoch": 1.6775174931548524, "grad_norm": 0.9722397327423096, "learning_rate": 3.307502407681678e-05, "loss": 0.1881, "step": 2757 }, { "epoch": 1.6781259507149375, "grad_norm": 0.842012882232666, "learning_rate": 3.3063241498258e-05, "loss": 0.1471, "step": 2758 }, { "epoch": 1.6787344082750228, "grad_norm": 0.8087152242660522, "learning_rate": 3.305145692035143e-05, "loss": 0.1465, "step": 2759 }, { "epoch": 1.6793428658351082, "grad_norm": 0.8884347081184387, "learning_rate": 3.303967034601914e-05, "loss": 0.1811, "step": 2760 }, { "epoch": 1.6799513233951933, "grad_norm": 1.1114667654037476, "learning_rate": 3.3027881778183715e-05, "loss": 0.1998, "step": 2761 }, { "epoch": 1.6805597809552784, "grad_norm": 0.8144546151161194, "learning_rate": 3.301609121976822e-05, "loss": 0.1806, "step": 2762 }, { "epoch": 1.6811682385153635, "grad_norm": 0.858739972114563, "learning_rate": 3.300429867369623e-05, "loss": 0.2098, "step": 2763 }, { "epoch": 1.6817766960754488, "grad_norm": 0.9147648811340332, "learning_rate": 3.299250414289181e-05, "loss": 0.1956, "step": 2764 }, { "epoch": 1.6823851536355339, "grad_norm": 0.8137851357460022, "learning_rate": 3.298070763027951e-05, "loss": 0.2142, "step": 2765 }, { "epoch": 1.6829936111956192, "grad_norm": 0.8987473845481873, "learning_rate": 3.296890913878436e-05, "loss": 0.1742, "step": 2766 }, { "epoch": 1.6836020687557043, "grad_norm": 0.6859608888626099, "learning_rate": 3.295710867133191e-05, "loss": 0.1589, "step": 2767 }, { "epoch": 1.6842105263157894, "grad_norm": 0.7220624685287476, "learning_rate": 3.2945306230848185e-05, "loss": 0.138, "step": 2768 }, { "epoch": 1.6848189838758747, "grad_norm": 0.6847586631774902, "learning_rate": 3.293350182025968e-05, "loss": 0.1614, "step": 2769 }, { "epoch": 1.6854274414359598, "grad_norm": 0.846610426902771, "learning_rate": 3.292169544249341e-05, "loss": 0.1999, "step": 2770 }, { "epoch": 1.6860358989960451, "grad_norm": 0.7773505449295044, "learning_rate": 3.290988710047687e-05, "loss": 0.1724, "step": 2771 }, { "epoch": 1.6866443565561302, "grad_norm": 0.7428063750267029, "learning_rate": 3.289807679713803e-05, "loss": 0.1307, "step": 2772 }, { "epoch": 1.6872528141162153, "grad_norm": 0.8252652883529663, "learning_rate": 3.288626453540535e-05, "loss": 0.2018, "step": 2773 }, { "epoch": 1.6878612716763006, "grad_norm": 0.8059362173080444, "learning_rate": 3.287445031820777e-05, "loss": 0.1656, "step": 2774 }, { "epoch": 1.6884697292363857, "grad_norm": 0.9173924326896667, "learning_rate": 3.2862634148474744e-05, "loss": 0.1938, "step": 2775 }, { "epoch": 1.689078186796471, "grad_norm": 0.8214101791381836, "learning_rate": 3.285081602913618e-05, "loss": 0.1551, "step": 2776 }, { "epoch": 1.6896866443565561, "grad_norm": 0.9609821438789368, "learning_rate": 3.283899596312247e-05, "loss": 0.1968, "step": 2777 }, { "epoch": 1.6902951019166412, "grad_norm": 0.8379231095314026, "learning_rate": 3.2827173953364526e-05, "loss": 0.1466, "step": 2778 }, { "epoch": 1.6909035594767265, "grad_norm": 0.7881366014480591, "learning_rate": 3.281535000279368e-05, "loss": 0.132, "step": 2779 }, { "epoch": 1.6915120170368116, "grad_norm": 0.8496389389038086, "learning_rate": 3.2803524114341795e-05, "loss": 0.2176, "step": 2780 }, { "epoch": 1.692120474596897, "grad_norm": 0.7638592720031738, "learning_rate": 3.2791696290941196e-05, "loss": 0.1763, "step": 2781 }, { "epoch": 1.692728932156982, "grad_norm": 0.9417856335639954, "learning_rate": 3.27798665355247e-05, "loss": 0.1721, "step": 2782 }, { "epoch": 1.6933373897170672, "grad_norm": 0.788231611251831, "learning_rate": 3.276803485102557e-05, "loss": 0.1493, "step": 2783 }, { "epoch": 1.6939458472771525, "grad_norm": 0.8815106153488159, "learning_rate": 3.2756201240377596e-05, "loss": 0.1762, "step": 2784 }, { "epoch": 1.6945543048372376, "grad_norm": 0.8450623750686646, "learning_rate": 3.2744365706514995e-05, "loss": 0.2006, "step": 2785 }, { "epoch": 1.6951627623973229, "grad_norm": 1.0778264999389648, "learning_rate": 3.273252825237251e-05, "loss": 0.2408, "step": 2786 }, { "epoch": 1.695771219957408, "grad_norm": 0.7945128083229065, "learning_rate": 3.2720688880885324e-05, "loss": 0.1696, "step": 2787 }, { "epoch": 1.696379677517493, "grad_norm": 0.7426414489746094, "learning_rate": 3.270884759498911e-05, "loss": 0.157, "step": 2788 }, { "epoch": 1.6969881350775782, "grad_norm": 0.8385169506072998, "learning_rate": 3.269700439762001e-05, "loss": 0.1832, "step": 2789 }, { "epoch": 1.6975965926376635, "grad_norm": 1.0522770881652832, "learning_rate": 3.268515929171465e-05, "loss": 0.2037, "step": 2790 }, { "epoch": 1.6982050501977488, "grad_norm": 0.7473107576370239, "learning_rate": 3.2673312280210124e-05, "loss": 0.1878, "step": 2791 }, { "epoch": 1.698813507757834, "grad_norm": 0.7824332118034363, "learning_rate": 3.2661463366043985e-05, "loss": 0.1556, "step": 2792 }, { "epoch": 1.699421965317919, "grad_norm": 0.8999038338661194, "learning_rate": 3.2649612552154276e-05, "loss": 0.1974, "step": 2793 }, { "epoch": 1.700030422878004, "grad_norm": 0.7992719411849976, "learning_rate": 3.263775984147951e-05, "loss": 0.1679, "step": 2794 }, { "epoch": 1.7006388804380894, "grad_norm": 0.7823406457901001, "learning_rate": 3.2625905236958655e-05, "loss": 0.1676, "step": 2795 }, { "epoch": 1.7012473379981747, "grad_norm": 0.8602148294448853, "learning_rate": 3.2614048741531166e-05, "loss": 0.1666, "step": 2796 }, { "epoch": 1.7018557955582598, "grad_norm": 0.8012514114379883, "learning_rate": 3.2602190358136965e-05, "loss": 0.1704, "step": 2797 }, { "epoch": 1.702464253118345, "grad_norm": 0.9197515249252319, "learning_rate": 3.259033008971642e-05, "loss": 0.1809, "step": 2798 }, { "epoch": 1.70307271067843, "grad_norm": 1.046052098274231, "learning_rate": 3.25784679392104e-05, "loss": 0.2223, "step": 2799 }, { "epoch": 1.7036811682385153, "grad_norm": 0.7376840710639954, "learning_rate": 3.256660390956022e-05, "loss": 0.1596, "step": 2800 }, { "epoch": 1.7042896257986007, "grad_norm": 0.8339237570762634, "learning_rate": 3.255473800370765e-05, "loss": 0.1818, "step": 2801 }, { "epoch": 1.7048980833586858, "grad_norm": 0.7850296497344971, "learning_rate": 3.254287022459496e-05, "loss": 0.1752, "step": 2802 }, { "epoch": 1.7055065409187709, "grad_norm": 0.8697124719619751, "learning_rate": 3.253100057516486e-05, "loss": 0.1768, "step": 2803 }, { "epoch": 1.706114998478856, "grad_norm": 1.1126950979232788, "learning_rate": 3.251912905836052e-05, "loss": 0.2258, "step": 2804 }, { "epoch": 1.7067234560389413, "grad_norm": 0.7312096953392029, "learning_rate": 3.250725567712559e-05, "loss": 0.1779, "step": 2805 }, { "epoch": 1.7073319135990266, "grad_norm": 0.7978745102882385, "learning_rate": 3.2495380434404167e-05, "loss": 0.1554, "step": 2806 }, { "epoch": 1.7079403711591117, "grad_norm": 0.9084555506706238, "learning_rate": 3.248350333314082e-05, "loss": 0.1468, "step": 2807 }, { "epoch": 1.7085488287191968, "grad_norm": 0.7953585386276245, "learning_rate": 3.247162437628057e-05, "loss": 0.1308, "step": 2808 }, { "epoch": 1.7091572862792819, "grad_norm": 0.7833218574523926, "learning_rate": 3.2459743566768916e-05, "loss": 0.1697, "step": 2809 }, { "epoch": 1.7097657438393672, "grad_norm": 0.7225537896156311, "learning_rate": 3.2447860907551786e-05, "loss": 0.1569, "step": 2810 }, { "epoch": 1.7103742013994525, "grad_norm": 0.7659318447113037, "learning_rate": 3.24359764015756e-05, "loss": 0.1846, "step": 2811 }, { "epoch": 1.7109826589595376, "grad_norm": 0.8912885189056396, "learning_rate": 3.242409005178721e-05, "loss": 0.1805, "step": 2812 }, { "epoch": 1.7115911165196227, "grad_norm": 0.7760403156280518, "learning_rate": 3.241220186113394e-05, "loss": 0.1389, "step": 2813 }, { "epoch": 1.7121995740797078, "grad_norm": 1.0232988595962524, "learning_rate": 3.2400311832563563e-05, "loss": 0.2085, "step": 2814 }, { "epoch": 1.7128080316397931, "grad_norm": 0.8213723301887512, "learning_rate": 3.238841996902431e-05, "loss": 0.1491, "step": 2815 }, { "epoch": 1.7134164891998784, "grad_norm": 0.8206520676612854, "learning_rate": 3.237652627346487e-05, "loss": 0.1543, "step": 2816 }, { "epoch": 1.7140249467599635, "grad_norm": 0.885313093662262, "learning_rate": 3.2364630748834385e-05, "loss": 0.1985, "step": 2817 }, { "epoch": 1.7146334043200486, "grad_norm": 0.7833919525146484, "learning_rate": 3.235273339808245e-05, "loss": 0.1493, "step": 2818 }, { "epoch": 1.7152418618801337, "grad_norm": 0.8376486301422119, "learning_rate": 3.2340834224159104e-05, "loss": 0.164, "step": 2819 }, { "epoch": 1.715850319440219, "grad_norm": 0.8096402287483215, "learning_rate": 3.232893323001485e-05, "loss": 0.1883, "step": 2820 }, { "epoch": 1.7164587770003044, "grad_norm": 0.881629467010498, "learning_rate": 3.2317030418600645e-05, "loss": 0.1901, "step": 2821 }, { "epoch": 1.7170672345603895, "grad_norm": 0.910102128982544, "learning_rate": 3.2305125792867886e-05, "loss": 0.1814, "step": 2822 }, { "epoch": 1.7176756921204746, "grad_norm": 0.7410189509391785, "learning_rate": 3.229321935576842e-05, "loss": 0.1637, "step": 2823 }, { "epoch": 1.7182841496805596, "grad_norm": 0.8830031752586365, "learning_rate": 3.228131111025455e-05, "loss": 0.1492, "step": 2824 }, { "epoch": 1.718892607240645, "grad_norm": 0.9548397660255432, "learning_rate": 3.226940105927903e-05, "loss": 0.1643, "step": 2825 }, { "epoch": 1.7195010648007303, "grad_norm": 0.829814612865448, "learning_rate": 3.2257489205795034e-05, "loss": 0.1411, "step": 2826 }, { "epoch": 1.7201095223608154, "grad_norm": 0.9172588586807251, "learning_rate": 3.224557555275623e-05, "loss": 0.1536, "step": 2827 }, { "epoch": 1.7207179799209005, "grad_norm": 0.8592941164970398, "learning_rate": 3.223366010311671e-05, "loss": 0.1777, "step": 2828 }, { "epoch": 1.7213264374809856, "grad_norm": 0.8923897743225098, "learning_rate": 3.2221742859831e-05, "loss": 0.1696, "step": 2829 }, { "epoch": 1.721934895041071, "grad_norm": 1.0709412097930908, "learning_rate": 3.220982382585406e-05, "loss": 0.2554, "step": 2830 }, { "epoch": 1.7225433526011562, "grad_norm": 0.8038744330406189, "learning_rate": 3.219790300414134e-05, "loss": 0.1586, "step": 2831 }, { "epoch": 1.7231518101612413, "grad_norm": 0.8866575956344604, "learning_rate": 3.21859803976487e-05, "loss": 0.1691, "step": 2832 }, { "epoch": 1.7237602677213264, "grad_norm": 0.8635597825050354, "learning_rate": 3.217405600933245e-05, "loss": 0.1865, "step": 2833 }, { "epoch": 1.7243687252814115, "grad_norm": 0.8194993138313293, "learning_rate": 3.2162129842149336e-05, "loss": 0.1614, "step": 2834 }, { "epoch": 1.7249771828414968, "grad_norm": 0.8445285558700562, "learning_rate": 3.215020189905655e-05, "loss": 0.2077, "step": 2835 }, { "epoch": 1.7255856404015821, "grad_norm": 0.9541175365447998, "learning_rate": 3.213827218301173e-05, "loss": 0.2071, "step": 2836 }, { "epoch": 1.7261940979616672, "grad_norm": 0.7591641545295715, "learning_rate": 3.212634069697295e-05, "loss": 0.1616, "step": 2837 }, { "epoch": 1.7268025555217523, "grad_norm": 0.7483083605766296, "learning_rate": 3.211440744389871e-05, "loss": 0.1756, "step": 2838 }, { "epoch": 1.7274110130818374, "grad_norm": 0.7312026619911194, "learning_rate": 3.2102472426747975e-05, "loss": 0.1834, "step": 2839 }, { "epoch": 1.7280194706419227, "grad_norm": 0.8581993579864502, "learning_rate": 3.2090535648480126e-05, "loss": 0.1491, "step": 2840 }, { "epoch": 1.728627928202008, "grad_norm": 0.7880068421363831, "learning_rate": 3.207859711205498e-05, "loss": 0.1904, "step": 2841 }, { "epoch": 1.7292363857620932, "grad_norm": 0.9084392786026001, "learning_rate": 3.20666568204328e-05, "loss": 0.1815, "step": 2842 }, { "epoch": 1.7298448433221782, "grad_norm": 0.8204681873321533, "learning_rate": 3.205471477657428e-05, "loss": 0.1863, "step": 2843 }, { "epoch": 1.7304533008822633, "grad_norm": 0.728208065032959, "learning_rate": 3.204277098344055e-05, "loss": 0.1384, "step": 2844 }, { "epoch": 1.7310617584423487, "grad_norm": 0.9103240966796875, "learning_rate": 3.203082544399318e-05, "loss": 0.1875, "step": 2845 }, { "epoch": 1.731670216002434, "grad_norm": 0.8469733595848083, "learning_rate": 3.201887816119416e-05, "loss": 0.1546, "step": 2846 }, { "epoch": 1.732278673562519, "grad_norm": 1.0149333477020264, "learning_rate": 3.2006929138005905e-05, "loss": 0.1469, "step": 2847 }, { "epoch": 1.7328871311226042, "grad_norm": 0.8808649778366089, "learning_rate": 3.1994978377391295e-05, "loss": 0.1811, "step": 2848 }, { "epoch": 1.7334955886826893, "grad_norm": 0.77308589220047, "learning_rate": 3.198302588231361e-05, "loss": 0.1662, "step": 2849 }, { "epoch": 1.7341040462427746, "grad_norm": 0.7700105309486389, "learning_rate": 3.197107165573657e-05, "loss": 0.1371, "step": 2850 }, { "epoch": 1.73471250380286, "grad_norm": 0.8109223246574402, "learning_rate": 3.195911570062434e-05, "loss": 0.138, "step": 2851 }, { "epoch": 1.735320961362945, "grad_norm": 0.8431537747383118, "learning_rate": 3.1947158019941476e-05, "loss": 0.1787, "step": 2852 }, { "epoch": 1.73592941892303, "grad_norm": 0.8387619256973267, "learning_rate": 3.1935198616652996e-05, "loss": 0.1893, "step": 2853 }, { "epoch": 1.7365378764831152, "grad_norm": 0.8504119515419006, "learning_rate": 3.192323749372433e-05, "loss": 0.1783, "step": 2854 }, { "epoch": 1.7371463340432005, "grad_norm": 0.8203498125076294, "learning_rate": 3.1911274654121345e-05, "loss": 0.1651, "step": 2855 }, { "epoch": 1.7377547916032858, "grad_norm": 0.941662073135376, "learning_rate": 3.1899310100810326e-05, "loss": 0.193, "step": 2856 }, { "epoch": 1.738363249163371, "grad_norm": 0.8915706276893616, "learning_rate": 3.1887343836757977e-05, "loss": 0.2059, "step": 2857 }, { "epoch": 1.738971706723456, "grad_norm": 0.7377119064331055, "learning_rate": 3.1875375864931426e-05, "loss": 0.1607, "step": 2858 }, { "epoch": 1.7395801642835411, "grad_norm": 0.8279535174369812, "learning_rate": 3.186340618829825e-05, "loss": 0.1356, "step": 2859 }, { "epoch": 1.7401886218436264, "grad_norm": 0.9348523020744324, "learning_rate": 3.185143480982642e-05, "loss": 0.2069, "step": 2860 }, { "epoch": 1.7407970794037118, "grad_norm": 0.7679805159568787, "learning_rate": 3.183946173248433e-05, "loss": 0.1833, "step": 2861 }, { "epoch": 1.7414055369637969, "grad_norm": 0.8387174010276794, "learning_rate": 3.182748695924082e-05, "loss": 0.2116, "step": 2862 }, { "epoch": 1.742013994523882, "grad_norm": 0.8004626035690308, "learning_rate": 3.181551049306513e-05, "loss": 0.1897, "step": 2863 }, { "epoch": 1.742622452083967, "grad_norm": 0.8716647624969482, "learning_rate": 3.180353233692691e-05, "loss": 0.2086, "step": 2864 }, { "epoch": 1.7432309096440524, "grad_norm": 0.901866614818573, "learning_rate": 3.179155249379628e-05, "loss": 0.2046, "step": 2865 }, { "epoch": 1.7438393672041375, "grad_norm": 0.7688228487968445, "learning_rate": 3.17795709666437e-05, "loss": 0.1335, "step": 2866 }, { "epoch": 1.7444478247642228, "grad_norm": 0.7995738983154297, "learning_rate": 3.1767587758440106e-05, "loss": 0.1418, "step": 2867 }, { "epoch": 1.7450562823243079, "grad_norm": 0.8365558981895447, "learning_rate": 3.175560287215684e-05, "loss": 0.1897, "step": 2868 }, { "epoch": 1.745664739884393, "grad_norm": 0.8732959032058716, "learning_rate": 3.1743616310765644e-05, "loss": 0.1946, "step": 2869 }, { "epoch": 1.7462731974444783, "grad_norm": 0.972126841545105, "learning_rate": 3.1731628077238694e-05, "loss": 0.1725, "step": 2870 }, { "epoch": 1.7468816550045634, "grad_norm": 0.811004102230072, "learning_rate": 3.171963817454857e-05, "loss": 0.1305, "step": 2871 }, { "epoch": 1.7474901125646487, "grad_norm": 0.9573332667350769, "learning_rate": 3.170764660566826e-05, "loss": 0.1681, "step": 2872 }, { "epoch": 1.7480985701247338, "grad_norm": 0.7610151767730713, "learning_rate": 3.1695653373571196e-05, "loss": 0.154, "step": 2873 }, { "epoch": 1.748707027684819, "grad_norm": 1.0020116567611694, "learning_rate": 3.168365848123117e-05, "loss": 0.171, "step": 2874 }, { "epoch": 1.7493154852449042, "grad_norm": 0.8454048037528992, "learning_rate": 3.167166193162244e-05, "loss": 0.1776, "step": 2875 }, { "epoch": 1.7499239428049893, "grad_norm": 0.8977103233337402, "learning_rate": 3.1659663727719625e-05, "loss": 0.1786, "step": 2876 }, { "epoch": 1.7505324003650746, "grad_norm": 0.7743524312973022, "learning_rate": 3.1647663872497804e-05, "loss": 0.1425, "step": 2877 }, { "epoch": 1.7511408579251597, "grad_norm": 0.9040097594261169, "learning_rate": 3.1635662368932426e-05, "loss": 0.2031, "step": 2878 }, { "epoch": 1.7517493154852448, "grad_norm": 0.6998404264450073, "learning_rate": 3.1623659219999374e-05, "loss": 0.1344, "step": 2879 }, { "epoch": 1.7523577730453301, "grad_norm": 0.8501226902008057, "learning_rate": 3.161165442867492e-05, "loss": 0.1767, "step": 2880 }, { "epoch": 1.7529662306054152, "grad_norm": 0.9239878058433533, "learning_rate": 3.159964799793575e-05, "loss": 0.2296, "step": 2881 }, { "epoch": 1.7535746881655006, "grad_norm": 0.9915949106216431, "learning_rate": 3.158763993075897e-05, "loss": 0.2277, "step": 2882 }, { "epoch": 1.7541831457255856, "grad_norm": 0.746919572353363, "learning_rate": 3.157563023012208e-05, "loss": 0.1781, "step": 2883 }, { "epoch": 1.7547916032856707, "grad_norm": 0.6546300649642944, "learning_rate": 3.1563618899002965e-05, "loss": 0.1176, "step": 2884 }, { "epoch": 1.755400060845756, "grad_norm": 0.824098527431488, "learning_rate": 3.1551605940379954e-05, "loss": 0.1666, "step": 2885 }, { "epoch": 1.7560085184058412, "grad_norm": 0.7335473299026489, "learning_rate": 3.1539591357231755e-05, "loss": 0.1489, "step": 2886 }, { "epoch": 1.7566169759659265, "grad_norm": 0.8762836456298828, "learning_rate": 3.152757515253748e-05, "loss": 0.1802, "step": 2887 }, { "epoch": 1.7572254335260116, "grad_norm": 0.9030479192733765, "learning_rate": 3.1515557329276654e-05, "loss": 0.1571, "step": 2888 }, { "epoch": 1.7578338910860967, "grad_norm": 0.8152849674224854, "learning_rate": 3.150353789042919e-05, "loss": 0.1772, "step": 2889 }, { "epoch": 1.7584423486461818, "grad_norm": 0.8419698476791382, "learning_rate": 3.149151683897541e-05, "loss": 0.1603, "step": 2890 }, { "epoch": 1.759050806206267, "grad_norm": 1.0735416412353516, "learning_rate": 3.147949417789604e-05, "loss": 0.1659, "step": 2891 }, { "epoch": 1.7596592637663524, "grad_norm": 0.8551287651062012, "learning_rate": 3.1467469910172184e-05, "loss": 0.199, "step": 2892 }, { "epoch": 1.7602677213264375, "grad_norm": 1.121510624885559, "learning_rate": 3.145544403878538e-05, "loss": 0.1591, "step": 2893 }, { "epoch": 1.7608761788865226, "grad_norm": 0.9003729224205017, "learning_rate": 3.144341656671751e-05, "loss": 0.1713, "step": 2894 }, { "epoch": 1.7614846364466077, "grad_norm": 0.8984287977218628, "learning_rate": 3.143138749695091e-05, "loss": 0.2174, "step": 2895 }, { "epoch": 1.762093094006693, "grad_norm": 0.8700388669967651, "learning_rate": 3.14193568324683e-05, "loss": 0.2276, "step": 2896 }, { "epoch": 1.7627015515667783, "grad_norm": 0.7582150101661682, "learning_rate": 3.140732457625276e-05, "loss": 0.1604, "step": 2897 }, { "epoch": 1.7633100091268634, "grad_norm": 0.8540982007980347, "learning_rate": 3.13952907312878e-05, "loss": 0.1728, "step": 2898 }, { "epoch": 1.7639184666869485, "grad_norm": 0.8038197159767151, "learning_rate": 3.1383255300557293e-05, "loss": 0.171, "step": 2899 }, { "epoch": 1.7645269242470336, "grad_norm": 0.8544119000434875, "learning_rate": 3.137121828704555e-05, "loss": 0.1521, "step": 2900 }, { "epoch": 1.765135381807119, "grad_norm": 0.8337200284004211, "learning_rate": 3.135917969373724e-05, "loss": 0.1895, "step": 2901 }, { "epoch": 1.7657438393672042, "grad_norm": 0.8394232988357544, "learning_rate": 3.134713952361742e-05, "loss": 0.2031, "step": 2902 }, { "epoch": 1.7663522969272893, "grad_norm": 0.7780349850654602, "learning_rate": 3.1335097779671564e-05, "loss": 0.1559, "step": 2903 }, { "epoch": 1.7669607544873744, "grad_norm": 0.7052262425422668, "learning_rate": 3.132305446488552e-05, "loss": 0.1905, "step": 2904 }, { "epoch": 1.7675692120474595, "grad_norm": 0.7838650941848755, "learning_rate": 3.1311009582245525e-05, "loss": 0.1404, "step": 2905 }, { "epoch": 1.7681776696075449, "grad_norm": 0.9082717299461365, "learning_rate": 3.1298963134738214e-05, "loss": 0.2033, "step": 2906 }, { "epoch": 1.7687861271676302, "grad_norm": 0.882872462272644, "learning_rate": 3.128691512535059e-05, "loss": 0.186, "step": 2907 }, { "epoch": 1.7693945847277153, "grad_norm": 1.0027045011520386, "learning_rate": 3.127486555707007e-05, "loss": 0.1539, "step": 2908 }, { "epoch": 1.7700030422878004, "grad_norm": 0.9138793349266052, "learning_rate": 3.126281443288445e-05, "loss": 0.1769, "step": 2909 }, { "epoch": 1.7706114998478855, "grad_norm": 0.8712251782417297, "learning_rate": 3.125076175578189e-05, "loss": 0.1787, "step": 2910 }, { "epoch": 1.7712199574079708, "grad_norm": 1.6176162958145142, "learning_rate": 3.123870752875096e-05, "loss": 0.174, "step": 2911 }, { "epoch": 1.771828414968056, "grad_norm": 0.8051137924194336, "learning_rate": 3.12266517547806e-05, "loss": 0.177, "step": 2912 }, { "epoch": 1.7724368725281412, "grad_norm": 0.8419429063796997, "learning_rate": 3.121459443686015e-05, "loss": 0.1795, "step": 2913 }, { "epoch": 1.7730453300882263, "grad_norm": 0.8457537889480591, "learning_rate": 3.120253557797932e-05, "loss": 0.1616, "step": 2914 }, { "epoch": 1.7736537876483114, "grad_norm": 0.7322502732276917, "learning_rate": 3.1190475181128194e-05, "loss": 0.1396, "step": 2915 }, { "epoch": 1.7742622452083967, "grad_norm": 0.755520761013031, "learning_rate": 3.1178413249297255e-05, "loss": 0.1479, "step": 2916 }, { "epoch": 1.774870702768482, "grad_norm": 0.8889976739883423, "learning_rate": 3.116634978547737e-05, "loss": 0.2115, "step": 2917 }, { "epoch": 1.7754791603285671, "grad_norm": 0.8492110967636108, "learning_rate": 3.115428479265975e-05, "loss": 0.182, "step": 2918 }, { "epoch": 1.7760876178886522, "grad_norm": 0.7823325395584106, "learning_rate": 3.1142218273836025e-05, "loss": 0.1563, "step": 2919 }, { "epoch": 1.7766960754487373, "grad_norm": 0.774039626121521, "learning_rate": 3.11301502319982e-05, "loss": 0.1498, "step": 2920 }, { "epoch": 1.7773045330088226, "grad_norm": 0.8219572901725769, "learning_rate": 3.111808067013863e-05, "loss": 0.1536, "step": 2921 }, { "epoch": 1.777912990568908, "grad_norm": 0.7709328532218933, "learning_rate": 3.1106009591250066e-05, "loss": 0.135, "step": 2922 }, { "epoch": 1.778521448128993, "grad_norm": 0.7555780410766602, "learning_rate": 3.109393699832564e-05, "loss": 0.1303, "step": 2923 }, { "epoch": 1.7791299056890781, "grad_norm": 0.58537757396698, "learning_rate": 3.108186289435884e-05, "loss": 0.1188, "step": 2924 }, { "epoch": 1.7797383632491632, "grad_norm": 0.8996820449829102, "learning_rate": 3.106978728234354e-05, "loss": 0.1952, "step": 2925 }, { "epoch": 1.7803468208092486, "grad_norm": 1.680888056755066, "learning_rate": 3.1057710165274004e-05, "loss": 0.2346, "step": 2926 }, { "epoch": 1.7809552783693339, "grad_norm": 0.6698862314224243, "learning_rate": 3.1045631546144846e-05, "loss": 0.1375, "step": 2927 }, { "epoch": 1.781563735929419, "grad_norm": 0.7749894261360168, "learning_rate": 3.1033551427951064e-05, "loss": 0.1948, "step": 2928 }, { "epoch": 1.782172193489504, "grad_norm": 0.8020423054695129, "learning_rate": 3.102146981368801e-05, "loss": 0.1814, "step": 2929 }, { "epoch": 1.7827806510495892, "grad_norm": 0.8180800676345825, "learning_rate": 3.100938670635143e-05, "loss": 0.1584, "step": 2930 }, { "epoch": 1.7833891086096745, "grad_norm": 1.1050705909729004, "learning_rate": 3.099730210893743e-05, "loss": 0.1605, "step": 2931 }, { "epoch": 1.7839975661697598, "grad_norm": 0.8110726475715637, "learning_rate": 3.0985216024442484e-05, "loss": 0.1342, "step": 2932 }, { "epoch": 1.784606023729845, "grad_norm": 0.7474027872085571, "learning_rate": 3.097312845586345e-05, "loss": 0.1579, "step": 2933 }, { "epoch": 1.78521448128993, "grad_norm": 0.908333957195282, "learning_rate": 3.096103940619752e-05, "loss": 0.2047, "step": 2934 }, { "epoch": 1.785822938850015, "grad_norm": 1.506729006767273, "learning_rate": 3.0948948878442293e-05, "loss": 0.189, "step": 2935 }, { "epoch": 1.7864313964101004, "grad_norm": 0.7309430837631226, "learning_rate": 3.093685687559571e-05, "loss": 0.1383, "step": 2936 }, { "epoch": 1.7870398539701857, "grad_norm": 0.866033136844635, "learning_rate": 3.092476340065608e-05, "loss": 0.1515, "step": 2937 }, { "epoch": 1.7876483115302708, "grad_norm": 0.8974893689155579, "learning_rate": 3.091266845662208e-05, "loss": 0.1721, "step": 2938 }, { "epoch": 1.788256769090356, "grad_norm": 0.8083369135856628, "learning_rate": 3.090057204649276e-05, "loss": 0.1714, "step": 2939 }, { "epoch": 1.788865226650441, "grad_norm": 1.0619298219680786, "learning_rate": 3.088847417326752e-05, "loss": 0.172, "step": 2940 }, { "epoch": 1.7894736842105263, "grad_norm": 0.9113779067993164, "learning_rate": 3.087637483994612e-05, "loss": 0.1679, "step": 2941 }, { "epoch": 1.7900821417706116, "grad_norm": 0.8694355487823486, "learning_rate": 3.086427404952871e-05, "loss": 0.1498, "step": 2942 }, { "epoch": 1.7906905993306967, "grad_norm": 0.789814829826355, "learning_rate": 3.085217180501576e-05, "loss": 0.1598, "step": 2943 }, { "epoch": 1.7912990568907818, "grad_norm": 0.7697128653526306, "learning_rate": 3.084006810940814e-05, "loss": 0.1259, "step": 2944 }, { "epoch": 1.791907514450867, "grad_norm": 0.8780056834220886, "learning_rate": 3.082796296570706e-05, "loss": 0.1966, "step": 2945 }, { "epoch": 1.7925159720109523, "grad_norm": 0.790896475315094, "learning_rate": 3.081585637691407e-05, "loss": 0.1333, "step": 2946 }, { "epoch": 1.7931244295710376, "grad_norm": 0.8985186815261841, "learning_rate": 3.080374834603113e-05, "loss": 0.127, "step": 2947 }, { "epoch": 1.7937328871311227, "grad_norm": 0.9989727735519409, "learning_rate": 3.079163887606051e-05, "loss": 0.1706, "step": 2948 }, { "epoch": 1.7943413446912078, "grad_norm": 0.7312899827957153, "learning_rate": 3.077952797000485e-05, "loss": 0.1619, "step": 2949 }, { "epoch": 1.7949498022512929, "grad_norm": 0.9777790307998657, "learning_rate": 3.0767415630867165e-05, "loss": 0.178, "step": 2950 }, { "epoch": 1.7955582598113782, "grad_norm": 1.0288338661193848, "learning_rate": 3.0755301861650794e-05, "loss": 0.1798, "step": 2951 }, { "epoch": 1.7961667173714635, "grad_norm": 0.7798663973808289, "learning_rate": 3.074318666535946e-05, "loss": 0.1747, "step": 2952 }, { "epoch": 1.7967751749315486, "grad_norm": 0.8773430585861206, "learning_rate": 3.0731070044997215e-05, "loss": 0.1644, "step": 2953 }, { "epoch": 1.7973836324916337, "grad_norm": 0.828058123588562, "learning_rate": 3.071895200356848e-05, "loss": 0.163, "step": 2954 }, { "epoch": 1.7979920900517188, "grad_norm": 0.8165954947471619, "learning_rate": 3.070683254407803e-05, "loss": 0.1583, "step": 2955 }, { "epoch": 1.798600547611804, "grad_norm": 0.6299397349357605, "learning_rate": 3.069471166953098e-05, "loss": 0.1258, "step": 2956 }, { "epoch": 1.7992090051718894, "grad_norm": 0.7792373895645142, "learning_rate": 3.068258938293281e-05, "loss": 0.1431, "step": 2957 }, { "epoch": 1.7998174627319745, "grad_norm": 0.6819939613342285, "learning_rate": 3.0670465687289325e-05, "loss": 0.1071, "step": 2958 }, { "epoch": 1.8004259202920596, "grad_norm": 1.1301064491271973, "learning_rate": 3.065834058560671e-05, "loss": 0.1438, "step": 2959 }, { "epoch": 1.8010343778521447, "grad_norm": 1.0041139125823975, "learning_rate": 3.064621408089148e-05, "loss": 0.1515, "step": 2960 }, { "epoch": 1.80164283541223, "grad_norm": 0.7390562295913696, "learning_rate": 3.0634086176150504e-05, "loss": 0.1325, "step": 2961 }, { "epoch": 1.8022512929723153, "grad_norm": 0.8052049279212952, "learning_rate": 3.0621956874391e-05, "loss": 0.1349, "step": 2962 }, { "epoch": 1.8028597505324004, "grad_norm": 0.7416874170303345, "learning_rate": 3.060982617862053e-05, "loss": 0.1505, "step": 2963 }, { "epoch": 1.8034682080924855, "grad_norm": 1.035781979560852, "learning_rate": 3.0597694091846985e-05, "loss": 0.1632, "step": 2964 }, { "epoch": 1.8040766656525706, "grad_norm": 1.187511682510376, "learning_rate": 3.058556061707863e-05, "loss": 0.26, "step": 2965 }, { "epoch": 1.804685123212656, "grad_norm": 0.9462476372718811, "learning_rate": 3.057342575732406e-05, "loss": 0.1846, "step": 2966 }, { "epoch": 1.805293580772741, "grad_norm": 0.9190717339515686, "learning_rate": 3.0561289515592226e-05, "loss": 0.1966, "step": 2967 }, { "epoch": 1.8059020383328264, "grad_norm": 0.8364342451095581, "learning_rate": 3.054915189489239e-05, "loss": 0.1795, "step": 2968 }, { "epoch": 1.8065104958929115, "grad_norm": 0.7401247620582581, "learning_rate": 3.053701289823418e-05, "loss": 0.1576, "step": 2969 }, { "epoch": 1.8071189534529966, "grad_norm": 0.7658497095108032, "learning_rate": 3.052487252862758e-05, "loss": 0.1873, "step": 2970 }, { "epoch": 1.8077274110130819, "grad_norm": 0.765739381313324, "learning_rate": 3.0512730789082862e-05, "loss": 0.1662, "step": 2971 }, { "epoch": 1.808335868573167, "grad_norm": 0.7397434711456299, "learning_rate": 3.0500587682610694e-05, "loss": 0.154, "step": 2972 }, { "epoch": 1.8089443261332523, "grad_norm": 0.8113225102424622, "learning_rate": 3.0488443212222067e-05, "loss": 0.1421, "step": 2973 }, { "epoch": 1.8095527836933374, "grad_norm": 0.9399985074996948, "learning_rate": 3.047629738092828e-05, "loss": 0.1484, "step": 2974 }, { "epoch": 1.8101612412534225, "grad_norm": 0.8929994106292725, "learning_rate": 3.046415019174102e-05, "loss": 0.1615, "step": 2975 }, { "epoch": 1.8107696988135078, "grad_norm": 0.9099370837211609, "learning_rate": 3.0452001647672256e-05, "loss": 0.1902, "step": 2976 }, { "epoch": 1.811378156373593, "grad_norm": 0.7755023837089539, "learning_rate": 3.043985175173434e-05, "loss": 0.1494, "step": 2977 }, { "epoch": 1.8119866139336782, "grad_norm": 0.868893027305603, "learning_rate": 3.042770050693994e-05, "loss": 0.1345, "step": 2978 }, { "epoch": 1.8125950714937633, "grad_norm": 0.9862670302391052, "learning_rate": 3.0415547916302044e-05, "loss": 0.1903, "step": 2979 }, { "epoch": 1.8132035290538484, "grad_norm": 0.7265908718109131, "learning_rate": 3.0403393982834e-05, "loss": 0.1474, "step": 2980 }, { "epoch": 1.8138119866139337, "grad_norm": 0.7525795698165894, "learning_rate": 3.039123870954947e-05, "loss": 0.1376, "step": 2981 }, { "epoch": 1.8144204441740188, "grad_norm": 0.8074904680252075, "learning_rate": 3.0379082099462454e-05, "loss": 0.1699, "step": 2982 }, { "epoch": 1.8150289017341041, "grad_norm": 0.7635090351104736, "learning_rate": 3.0366924155587296e-05, "loss": 0.1865, "step": 2983 }, { "epoch": 1.8156373592941892, "grad_norm": 0.8797259330749512, "learning_rate": 3.0354764880938647e-05, "loss": 0.1832, "step": 2984 }, { "epoch": 1.8162458168542743, "grad_norm": 0.7532181739807129, "learning_rate": 3.0342604278531512e-05, "loss": 0.1289, "step": 2985 }, { "epoch": 1.8168542744143596, "grad_norm": 1.013737440109253, "learning_rate": 3.0330442351381198e-05, "loss": 0.1913, "step": 2986 }, { "epoch": 1.8174627319744447, "grad_norm": 0.8055927753448486, "learning_rate": 3.0318279102503367e-05, "loss": 0.1773, "step": 2987 }, { "epoch": 1.81807118953453, "grad_norm": 0.8605936765670776, "learning_rate": 3.0306114534913998e-05, "loss": 0.1499, "step": 2988 }, { "epoch": 1.8186796470946152, "grad_norm": 0.8703776001930237, "learning_rate": 3.0293948651629388e-05, "loss": 0.2272, "step": 2989 }, { "epoch": 1.8192881046547003, "grad_norm": 0.7393494844436646, "learning_rate": 3.0281781455666182e-05, "loss": 0.1363, "step": 2990 }, { "epoch": 1.8198965622147856, "grad_norm": 0.9417717456817627, "learning_rate": 3.026961295004133e-05, "loss": 0.1626, "step": 2991 }, { "epoch": 1.8205050197748707, "grad_norm": 0.72954922914505, "learning_rate": 3.025744313777211e-05, "loss": 0.1273, "step": 2992 }, { "epoch": 1.821113477334956, "grad_norm": 0.8940945863723755, "learning_rate": 3.0245272021876144e-05, "loss": 0.1814, "step": 2993 }, { "epoch": 1.821721934895041, "grad_norm": 0.8653423190116882, "learning_rate": 3.023309960537134e-05, "loss": 0.1743, "step": 2994 }, { "epoch": 1.8223303924551262, "grad_norm": 0.9504120945930481, "learning_rate": 3.0220925891275957e-05, "loss": 0.18, "step": 2995 }, { "epoch": 1.8229388500152113, "grad_norm": 0.9393471479415894, "learning_rate": 3.0208750882608583e-05, "loss": 0.1825, "step": 2996 }, { "epoch": 1.8235473075752966, "grad_norm": 0.9305369257926941, "learning_rate": 3.0196574582388095e-05, "loss": 0.1618, "step": 2997 }, { "epoch": 1.824155765135382, "grad_norm": 0.7652149796485901, "learning_rate": 3.0184396993633718e-05, "loss": 0.1558, "step": 2998 }, { "epoch": 1.824764222695467, "grad_norm": 0.9691605567932129, "learning_rate": 3.0172218119364975e-05, "loss": 0.1435, "step": 2999 }, { "epoch": 1.825372680255552, "grad_norm": 1.8901022672653198, "learning_rate": 3.0160037962601727e-05, "loss": 0.1259, "step": 3000 }, { "epoch": 1.8259811378156372, "grad_norm": 0.7600100636482239, "learning_rate": 3.014785652636416e-05, "loss": 0.1442, "step": 3001 }, { "epoch": 1.8265895953757225, "grad_norm": 0.904800295829773, "learning_rate": 3.0135673813672734e-05, "loss": 0.1831, "step": 3002 }, { "epoch": 1.8271980529358078, "grad_norm": 0.9065991640090942, "learning_rate": 3.0123489827548273e-05, "loss": 0.1632, "step": 3003 }, { "epoch": 1.827806510495893, "grad_norm": 0.7399230599403381, "learning_rate": 3.0111304571011888e-05, "loss": 0.1456, "step": 3004 }, { "epoch": 1.828414968055978, "grad_norm": 0.6830241680145264, "learning_rate": 3.0099118047085024e-05, "loss": 0.1322, "step": 3005 }, { "epoch": 1.8290234256160631, "grad_norm": 0.8357345461845398, "learning_rate": 3.0086930258789426e-05, "loss": 0.1839, "step": 3006 }, { "epoch": 1.8296318831761484, "grad_norm": 0.8393163681030273, "learning_rate": 3.0074741209147157e-05, "loss": 0.1388, "step": 3007 }, { "epoch": 1.8302403407362338, "grad_norm": 0.7641516923904419, "learning_rate": 3.006255090118059e-05, "loss": 0.134, "step": 3008 }, { "epoch": 1.8308487982963189, "grad_norm": 0.8829279541969299, "learning_rate": 3.005035933791242e-05, "loss": 0.165, "step": 3009 }, { "epoch": 1.831457255856404, "grad_norm": 0.7792665958404541, "learning_rate": 3.0038166522365642e-05, "loss": 0.1351, "step": 3010 }, { "epoch": 1.832065713416489, "grad_norm": 0.7733790278434753, "learning_rate": 3.0025972457563573e-05, "loss": 0.1098, "step": 3011 }, { "epoch": 1.8326741709765744, "grad_norm": 0.7731171250343323, "learning_rate": 3.001377714652982e-05, "loss": 0.1458, "step": 3012 }, { "epoch": 1.8332826285366597, "grad_norm": 0.7515898942947388, "learning_rate": 3.000158059228832e-05, "loss": 0.1682, "step": 3013 }, { "epoch": 1.8338910860967448, "grad_norm": 0.7409710884094238, "learning_rate": 2.9989382797863313e-05, "loss": 0.1413, "step": 3014 }, { "epoch": 1.8344995436568299, "grad_norm": 0.8755896687507629, "learning_rate": 2.997718376627934e-05, "loss": 0.1678, "step": 3015 }, { "epoch": 1.835108001216915, "grad_norm": 0.7442592978477478, "learning_rate": 2.996498350056125e-05, "loss": 0.1285, "step": 3016 }, { "epoch": 1.8357164587770003, "grad_norm": 0.7244062423706055, "learning_rate": 2.9952782003734202e-05, "loss": 0.1296, "step": 3017 }, { "epoch": 1.8363249163370856, "grad_norm": 0.7951245903968811, "learning_rate": 2.994057927882366e-05, "loss": 0.1638, "step": 3018 }, { "epoch": 1.8369333738971707, "grad_norm": 0.778948962688446, "learning_rate": 2.9928375328855396e-05, "loss": 0.1594, "step": 3019 }, { "epoch": 1.8375418314572558, "grad_norm": 1.0201570987701416, "learning_rate": 2.9916170156855467e-05, "loss": 0.1291, "step": 3020 }, { "epoch": 1.838150289017341, "grad_norm": 0.8786885738372803, "learning_rate": 2.9903963765850263e-05, "loss": 0.1835, "step": 3021 }, { "epoch": 1.8387587465774262, "grad_norm": 0.8748721480369568, "learning_rate": 2.989175615886644e-05, "loss": 0.1542, "step": 3022 }, { "epoch": 1.8393672041375115, "grad_norm": 0.7832545042037964, "learning_rate": 2.9879547338930997e-05, "loss": 0.1622, "step": 3023 }, { "epoch": 1.8399756616975966, "grad_norm": 0.902432382106781, "learning_rate": 2.98673373090712e-05, "loss": 0.1641, "step": 3024 }, { "epoch": 1.8405841192576817, "grad_norm": 0.9937904477119446, "learning_rate": 2.9855126072314638e-05, "loss": 0.1468, "step": 3025 }, { "epoch": 1.8411925768177668, "grad_norm": 0.8179795145988464, "learning_rate": 2.984291363168918e-05, "loss": 0.1675, "step": 3026 }, { "epoch": 1.8418010343778521, "grad_norm": 0.7362605929374695, "learning_rate": 2.9830699990222992e-05, "loss": 0.1494, "step": 3027 }, { "epoch": 1.8424094919379375, "grad_norm": 0.7124605178833008, "learning_rate": 2.981848515094457e-05, "loss": 0.117, "step": 3028 }, { "epoch": 1.8430179494980226, "grad_norm": 0.9013100266456604, "learning_rate": 2.9806269116882678e-05, "loss": 0.1282, "step": 3029 }, { "epoch": 1.8436264070581077, "grad_norm": 0.7831732630729675, "learning_rate": 2.979405189106637e-05, "loss": 0.1539, "step": 3030 }, { "epoch": 1.8442348646181927, "grad_norm": 0.8050282597541809, "learning_rate": 2.9781833476525022e-05, "loss": 0.1714, "step": 3031 }, { "epoch": 1.844843322178278, "grad_norm": 0.7580960988998413, "learning_rate": 2.97696138762883e-05, "loss": 0.133, "step": 3032 }, { "epoch": 1.8454517797383634, "grad_norm": 0.5984554290771484, "learning_rate": 2.9757393093386133e-05, "loss": 0.1178, "step": 3033 }, { "epoch": 1.8460602372984485, "grad_norm": 0.9737577438354492, "learning_rate": 2.974517113084878e-05, "loss": 0.2278, "step": 3034 }, { "epoch": 1.8466686948585336, "grad_norm": 0.7336621284484863, "learning_rate": 2.973294799170677e-05, "loss": 0.1532, "step": 3035 }, { "epoch": 1.8472771524186187, "grad_norm": 0.9183807373046875, "learning_rate": 2.9720723678990943e-05, "loss": 0.1445, "step": 3036 }, { "epoch": 1.847885609978704, "grad_norm": 0.8194828629493713, "learning_rate": 2.970849819573241e-05, "loss": 0.1511, "step": 3037 }, { "epoch": 1.8484940675387893, "grad_norm": 0.9400119185447693, "learning_rate": 2.9696271544962583e-05, "loss": 0.1763, "step": 3038 }, { "epoch": 1.8491025250988744, "grad_norm": 0.9202710390090942, "learning_rate": 2.968404372971316e-05, "loss": 0.1323, "step": 3039 }, { "epoch": 1.8497109826589595, "grad_norm": 1.0977171659469604, "learning_rate": 2.9671814753016147e-05, "loss": 0.1417, "step": 3040 }, { "epoch": 1.8503194402190446, "grad_norm": 0.8560418486595154, "learning_rate": 2.9659584617903795e-05, "loss": 0.16, "step": 3041 }, { "epoch": 1.85092789777913, "grad_norm": 0.9262466430664062, "learning_rate": 2.9647353327408678e-05, "loss": 0.1722, "step": 3042 }, { "epoch": 1.8515363553392152, "grad_norm": 0.9393658638000488, "learning_rate": 2.9635120884563654e-05, "loss": 0.1889, "step": 3043 }, { "epoch": 1.8521448128993003, "grad_norm": 0.6739933490753174, "learning_rate": 2.9622887292401847e-05, "loss": 0.1257, "step": 3044 }, { "epoch": 1.8527532704593854, "grad_norm": 0.8402737975120544, "learning_rate": 2.9610652553956688e-05, "loss": 0.1763, "step": 3045 }, { "epoch": 1.8533617280194705, "grad_norm": 0.7270654439926147, "learning_rate": 2.959841667226187e-05, "loss": 0.1321, "step": 3046 }, { "epoch": 1.8539701855795558, "grad_norm": 0.8025201559066772, "learning_rate": 2.9586179650351386e-05, "loss": 0.1524, "step": 3047 }, { "epoch": 1.8545786431396412, "grad_norm": 0.7017185091972351, "learning_rate": 2.9573941491259523e-05, "loss": 0.1274, "step": 3048 }, { "epoch": 1.8551871006997263, "grad_norm": 0.7935231328010559, "learning_rate": 2.9561702198020813e-05, "loss": 0.1738, "step": 3049 }, { "epoch": 1.8557955582598114, "grad_norm": 0.830694317817688, "learning_rate": 2.9549461773670094e-05, "loss": 0.1483, "step": 3050 }, { "epoch": 1.8564040158198964, "grad_norm": 0.8252891302108765, "learning_rate": 2.9537220221242496e-05, "loss": 0.2033, "step": 3051 }, { "epoch": 1.8570124733799818, "grad_norm": 1.0744190216064453, "learning_rate": 2.9524977543773397e-05, "loss": 0.1949, "step": 3052 }, { "epoch": 1.857620930940067, "grad_norm": 0.7989039421081543, "learning_rate": 2.9512733744298482e-05, "loss": 0.1604, "step": 3053 }, { "epoch": 1.8582293885001522, "grad_norm": 0.768959105014801, "learning_rate": 2.9500488825853702e-05, "loss": 0.1412, "step": 3054 }, { "epoch": 1.8588378460602373, "grad_norm": 0.8371010422706604, "learning_rate": 2.9488242791475272e-05, "loss": 0.1422, "step": 3055 }, { "epoch": 1.8594463036203224, "grad_norm": 0.8141250610351562, "learning_rate": 2.947599564419971e-05, "loss": 0.2141, "step": 3056 }, { "epoch": 1.8600547611804077, "grad_norm": 0.7183147072792053, "learning_rate": 2.9463747387063807e-05, "loss": 0.1782, "step": 3057 }, { "epoch": 1.860663218740493, "grad_norm": 0.7062293887138367, "learning_rate": 2.94514980231046e-05, "loss": 0.1377, "step": 3058 }, { "epoch": 1.861271676300578, "grad_norm": 0.7800901532173157, "learning_rate": 2.943924755535944e-05, "loss": 0.16, "step": 3059 }, { "epoch": 1.8618801338606632, "grad_norm": 0.8286892175674438, "learning_rate": 2.9426995986865918e-05, "loss": 0.1543, "step": 3060 }, { "epoch": 1.8624885914207483, "grad_norm": 0.7443828582763672, "learning_rate": 2.941474332066192e-05, "loss": 0.1543, "step": 3061 }, { "epoch": 1.8630970489808336, "grad_norm": 0.6537884473800659, "learning_rate": 2.9402489559785594e-05, "loss": 0.1595, "step": 3062 }, { "epoch": 1.863705506540919, "grad_norm": 0.7740469574928284, "learning_rate": 2.9390234707275355e-05, "loss": 0.1529, "step": 3063 }, { "epoch": 1.864313964101004, "grad_norm": 0.6976113319396973, "learning_rate": 2.9377978766169912e-05, "loss": 0.1339, "step": 3064 }, { "epoch": 1.8649224216610891, "grad_norm": 0.7111392021179199, "learning_rate": 2.9365721739508213e-05, "loss": 0.142, "step": 3065 }, { "epoch": 1.8655308792211742, "grad_norm": 0.671752393245697, "learning_rate": 2.9353463630329493e-05, "loss": 0.1487, "step": 3066 }, { "epoch": 1.8661393367812595, "grad_norm": 0.8591635823249817, "learning_rate": 2.9341204441673266e-05, "loss": 0.1523, "step": 3067 }, { "epoch": 1.8667477943413449, "grad_norm": 1.018031358718872, "learning_rate": 2.932894417657927e-05, "loss": 0.1422, "step": 3068 }, { "epoch": 1.86735625190143, "grad_norm": 1.00792396068573, "learning_rate": 2.9316682838087565e-05, "loss": 0.1496, "step": 3069 }, { "epoch": 1.867964709461515, "grad_norm": 0.8856237530708313, "learning_rate": 2.930442042923845e-05, "loss": 0.1572, "step": 3070 }, { "epoch": 1.8685731670216001, "grad_norm": 0.9289006590843201, "learning_rate": 2.929215695307248e-05, "loss": 0.1744, "step": 3071 }, { "epoch": 1.8691816245816855, "grad_norm": 0.7653868794441223, "learning_rate": 2.9279892412630493e-05, "loss": 0.17, "step": 3072 }, { "epoch": 1.8697900821417706, "grad_norm": 0.700298011302948, "learning_rate": 2.9267626810953584e-05, "loss": 0.1628, "step": 3073 }, { "epoch": 1.8703985397018559, "grad_norm": 0.7971607446670532, "learning_rate": 2.9255360151083107e-05, "loss": 0.1727, "step": 3074 }, { "epoch": 1.871006997261941, "grad_norm": 0.8248562812805176, "learning_rate": 2.924309243606069e-05, "loss": 0.1263, "step": 3075 }, { "epoch": 1.871615454822026, "grad_norm": 0.8164551854133606, "learning_rate": 2.9230823668928198e-05, "loss": 0.1476, "step": 3076 }, { "epoch": 1.8722239123821114, "grad_norm": 0.7059839963912964, "learning_rate": 2.9218553852727794e-05, "loss": 0.1399, "step": 3077 }, { "epoch": 1.8728323699421965, "grad_norm": 0.8123924732208252, "learning_rate": 2.920628299050187e-05, "loss": 0.1431, "step": 3078 }, { "epoch": 1.8734408275022818, "grad_norm": 0.6520960927009583, "learning_rate": 2.9194011085293093e-05, "loss": 0.1423, "step": 3079 }, { "epoch": 1.874049285062367, "grad_norm": 0.7921064496040344, "learning_rate": 2.9181738140144382e-05, "loss": 0.1541, "step": 3080 }, { "epoch": 1.874657742622452, "grad_norm": 0.8729209303855896, "learning_rate": 2.9169464158098914e-05, "loss": 0.1544, "step": 3081 }, { "epoch": 1.8752662001825373, "grad_norm": 0.745010256767273, "learning_rate": 2.9157189142200124e-05, "loss": 0.1548, "step": 3082 }, { "epoch": 1.8758746577426224, "grad_norm": 0.7852745056152344, "learning_rate": 2.914491309549171e-05, "loss": 0.1595, "step": 3083 }, { "epoch": 1.8764831153027077, "grad_norm": 0.7547739744186401, "learning_rate": 2.9132636021017616e-05, "loss": 0.14, "step": 3084 }, { "epoch": 1.8770915728627928, "grad_norm": 0.9375334978103638, "learning_rate": 2.912035792182205e-05, "loss": 0.1149, "step": 3085 }, { "epoch": 1.877700030422878, "grad_norm": 0.7835490703582764, "learning_rate": 2.9108078800949456e-05, "loss": 0.1577, "step": 3086 }, { "epoch": 1.8783084879829632, "grad_norm": 0.7294179797172546, "learning_rate": 2.9095798661444557e-05, "loss": 0.1358, "step": 3087 }, { "epoch": 1.8789169455430483, "grad_norm": 0.7559580206871033, "learning_rate": 2.9083517506352315e-05, "loss": 0.144, "step": 3088 }, { "epoch": 1.8795254031031337, "grad_norm": 0.7099955677986145, "learning_rate": 2.9071235338717935e-05, "loss": 0.1271, "step": 3089 }, { "epoch": 1.8801338606632187, "grad_norm": 0.8515664339065552, "learning_rate": 2.9058952161586896e-05, "loss": 0.1777, "step": 3090 }, { "epoch": 1.8807423182233038, "grad_norm": 0.8556992411613464, "learning_rate": 2.90466679780049e-05, "loss": 0.1626, "step": 3091 }, { "epoch": 1.8813507757833892, "grad_norm": 0.872840404510498, "learning_rate": 2.9034382791017918e-05, "loss": 0.1781, "step": 3092 }, { "epoch": 1.8819592333434743, "grad_norm": 0.6905577182769775, "learning_rate": 2.9022096603672166e-05, "loss": 0.171, "step": 3093 }, { "epoch": 1.8825676909035596, "grad_norm": 0.7203934788703918, "learning_rate": 2.9009809419014107e-05, "loss": 0.1366, "step": 3094 }, { "epoch": 1.8831761484636447, "grad_norm": 0.8403582572937012, "learning_rate": 2.8997521240090448e-05, "loss": 0.1349, "step": 3095 }, { "epoch": 1.8837846060237298, "grad_norm": 0.7579777836799622, "learning_rate": 2.898523206994815e-05, "loss": 0.1664, "step": 3096 }, { "epoch": 1.8843930635838149, "grad_norm": 0.8160215616226196, "learning_rate": 2.8972941911634406e-05, "loss": 0.1475, "step": 3097 }, { "epoch": 1.8850015211439002, "grad_norm": 0.7258248925209045, "learning_rate": 2.8960650768196672e-05, "loss": 0.1625, "step": 3098 }, { "epoch": 1.8856099787039855, "grad_norm": 0.9643956422805786, "learning_rate": 2.894835864268263e-05, "loss": 0.1672, "step": 3099 }, { "epoch": 1.8862184362640706, "grad_norm": 0.7425087690353394, "learning_rate": 2.8936065538140228e-05, "loss": 0.1373, "step": 3100 }, { "epoch": 1.8868268938241557, "grad_norm": 0.8233270645141602, "learning_rate": 2.8923771457617634e-05, "loss": 0.2127, "step": 3101 }, { "epoch": 1.8874353513842408, "grad_norm": 0.7719386219978333, "learning_rate": 2.891147640416327e-05, "loss": 0.1492, "step": 3102 }, { "epoch": 1.888043808944326, "grad_norm": 0.8687146902084351, "learning_rate": 2.889918038082579e-05, "loss": 0.1577, "step": 3103 }, { "epoch": 1.8886522665044114, "grad_norm": 0.8466896414756775, "learning_rate": 2.8886883390654106e-05, "loss": 0.1688, "step": 3104 }, { "epoch": 1.8892607240644965, "grad_norm": 0.7307385206222534, "learning_rate": 2.8874585436697355e-05, "loss": 0.0943, "step": 3105 }, { "epoch": 1.8898691816245816, "grad_norm": 0.9252493977546692, "learning_rate": 2.8862286522004916e-05, "loss": 0.1971, "step": 3106 }, { "epoch": 1.8904776391846667, "grad_norm": 0.7721157073974609, "learning_rate": 2.8849986649626405e-05, "loss": 0.1684, "step": 3107 }, { "epoch": 1.891086096744752, "grad_norm": 0.8666673302650452, "learning_rate": 2.8837685822611682e-05, "loss": 0.1636, "step": 3108 }, { "epoch": 1.8916945543048374, "grad_norm": 0.8242167234420776, "learning_rate": 2.882538404401084e-05, "loss": 0.1706, "step": 3109 }, { "epoch": 1.8923030118649224, "grad_norm": 0.8100599050521851, "learning_rate": 2.8813081316874197e-05, "loss": 0.1448, "step": 3110 }, { "epoch": 1.8929114694250075, "grad_norm": 0.7981263399124146, "learning_rate": 2.8800777644252336e-05, "loss": 0.1324, "step": 3111 }, { "epoch": 1.8935199269850926, "grad_norm": 0.7660496830940247, "learning_rate": 2.8788473029196033e-05, "loss": 0.1506, "step": 3112 }, { "epoch": 1.894128384545178, "grad_norm": 0.8053763508796692, "learning_rate": 2.877616747475634e-05, "loss": 0.1377, "step": 3113 }, { "epoch": 1.8947368421052633, "grad_norm": 0.7818877696990967, "learning_rate": 2.8763860983984502e-05, "loss": 0.1491, "step": 3114 }, { "epoch": 1.8953452996653484, "grad_norm": 0.8091548085212708, "learning_rate": 2.8751553559932033e-05, "loss": 0.1398, "step": 3115 }, { "epoch": 1.8959537572254335, "grad_norm": 0.8318586349487305, "learning_rate": 2.873924520565065e-05, "loss": 0.1602, "step": 3116 }, { "epoch": 1.8965622147855186, "grad_norm": 0.8310206532478333, "learning_rate": 2.872693592419231e-05, "loss": 0.1797, "step": 3117 }, { "epoch": 1.8971706723456039, "grad_norm": 0.802715003490448, "learning_rate": 2.8714625718609213e-05, "loss": 0.156, "step": 3118 }, { "epoch": 1.8977791299056892, "grad_norm": 0.7569795846939087, "learning_rate": 2.8702314591953776e-05, "loss": 0.1636, "step": 3119 }, { "epoch": 1.8983875874657743, "grad_norm": 0.6739256381988525, "learning_rate": 2.8690002547278633e-05, "loss": 0.1341, "step": 3120 }, { "epoch": 1.8989960450258594, "grad_norm": 0.8993747234344482, "learning_rate": 2.8677689587636668e-05, "loss": 0.1541, "step": 3121 }, { "epoch": 1.8996045025859445, "grad_norm": 0.7927727699279785, "learning_rate": 2.866537571608098e-05, "loss": 0.1603, "step": 3122 }, { "epoch": 1.9002129601460298, "grad_norm": 0.7829379439353943, "learning_rate": 2.8653060935664888e-05, "loss": 0.1674, "step": 3123 }, { "epoch": 1.9008214177061151, "grad_norm": 0.8112666606903076, "learning_rate": 2.8640745249441958e-05, "loss": 0.1731, "step": 3124 }, { "epoch": 1.9014298752662002, "grad_norm": 0.8178973197937012, "learning_rate": 2.8628428660465957e-05, "loss": 0.1408, "step": 3125 }, { "epoch": 1.9020383328262853, "grad_norm": 0.7954561114311218, "learning_rate": 2.8616111171790894e-05, "loss": 0.1557, "step": 3126 }, { "epoch": 1.9026467903863704, "grad_norm": 0.8346177935600281, "learning_rate": 2.860379278647098e-05, "loss": 0.1281, "step": 3127 }, { "epoch": 1.9032552479464557, "grad_norm": 0.7774602174758911, "learning_rate": 2.8591473507560667e-05, "loss": 0.1308, "step": 3128 }, { "epoch": 1.903863705506541, "grad_norm": 0.8558622598648071, "learning_rate": 2.8579153338114635e-05, "loss": 0.1745, "step": 3129 }, { "epoch": 1.9044721630666261, "grad_norm": 0.8205204606056213, "learning_rate": 2.856683228118775e-05, "loss": 0.1451, "step": 3130 }, { "epoch": 1.9050806206267112, "grad_norm": 0.8976646661758423, "learning_rate": 2.8554510339835144e-05, "loss": 0.1594, "step": 3131 }, { "epoch": 1.9056890781867963, "grad_norm": 0.6619974970817566, "learning_rate": 2.8542187517112124e-05, "loss": 0.1453, "step": 3132 }, { "epoch": 1.9062975357468817, "grad_norm": 0.9032138586044312, "learning_rate": 2.8529863816074244e-05, "loss": 0.1222, "step": 3133 }, { "epoch": 1.906905993306967, "grad_norm": 0.7461523413658142, "learning_rate": 2.851753923977728e-05, "loss": 0.129, "step": 3134 }, { "epoch": 1.907514450867052, "grad_norm": 0.8470805883407593, "learning_rate": 2.850521379127719e-05, "loss": 0.1702, "step": 3135 }, { "epoch": 1.9081229084271372, "grad_norm": 0.7833938002586365, "learning_rate": 2.8492887473630193e-05, "loss": 0.1323, "step": 3136 }, { "epoch": 1.9087313659872223, "grad_norm": 0.8622560501098633, "learning_rate": 2.848056028989269e-05, "loss": 0.1663, "step": 3137 }, { "epoch": 1.9093398235473076, "grad_norm": 0.7503676414489746, "learning_rate": 2.8468232243121313e-05, "loss": 0.1282, "step": 3138 }, { "epoch": 1.909948281107393, "grad_norm": 0.7636743187904358, "learning_rate": 2.8455903336372902e-05, "loss": 0.1245, "step": 3139 }, { "epoch": 1.910556738667478, "grad_norm": 0.8729671239852905, "learning_rate": 2.844357357270451e-05, "loss": 0.1747, "step": 3140 }, { "epoch": 1.911165196227563, "grad_norm": 0.6713892817497253, "learning_rate": 2.8431242955173408e-05, "loss": 0.1449, "step": 3141 }, { "epoch": 1.9117736537876482, "grad_norm": 0.7802202105522156, "learning_rate": 2.841891148683708e-05, "loss": 0.1552, "step": 3142 }, { "epoch": 1.9123821113477335, "grad_norm": 0.8736104369163513, "learning_rate": 2.8406579170753205e-05, "loss": 0.1342, "step": 3143 }, { "epoch": 1.9129905689078188, "grad_norm": 0.730114758014679, "learning_rate": 2.8394246009979697e-05, "loss": 0.1631, "step": 3144 }, { "epoch": 1.913599026467904, "grad_norm": 0.7180743217468262, "learning_rate": 2.8381912007574653e-05, "loss": 0.124, "step": 3145 }, { "epoch": 1.914207484027989, "grad_norm": 0.6744349002838135, "learning_rate": 2.836957716659639e-05, "loss": 0.1257, "step": 3146 }, { "epoch": 1.914815941588074, "grad_norm": 0.8679302930831909, "learning_rate": 2.8357241490103447e-05, "loss": 0.1492, "step": 3147 }, { "epoch": 1.9154243991481594, "grad_norm": 0.821484386920929, "learning_rate": 2.8344904981154548e-05, "loss": 0.1498, "step": 3148 }, { "epoch": 1.9160328567082447, "grad_norm": 0.8600984811782837, "learning_rate": 2.833256764280864e-05, "loss": 0.1466, "step": 3149 }, { "epoch": 1.9166413142683298, "grad_norm": 0.7836093902587891, "learning_rate": 2.832022947812486e-05, "loss": 0.1434, "step": 3150 }, { "epoch": 1.917249771828415, "grad_norm": 0.8246343731880188, "learning_rate": 2.8307890490162564e-05, "loss": 0.1209, "step": 3151 }, { "epoch": 1.9178582293885, "grad_norm": 0.8240684866905212, "learning_rate": 2.8295550681981314e-05, "loss": 0.1811, "step": 3152 }, { "epoch": 1.9184666869485854, "grad_norm": 0.7745963931083679, "learning_rate": 2.828321005664085e-05, "loss": 0.1375, "step": 3153 }, { "epoch": 1.9190751445086707, "grad_norm": 0.8456833362579346, "learning_rate": 2.827086861720115e-05, "loss": 0.1742, "step": 3154 }, { "epoch": 1.9196836020687558, "grad_norm": 0.8081454634666443, "learning_rate": 2.8258526366722364e-05, "loss": 0.1807, "step": 3155 }, { "epoch": 1.9202920596288409, "grad_norm": 0.7536942362785339, "learning_rate": 2.8246183308264862e-05, "loss": 0.144, "step": 3156 }, { "epoch": 1.920900517188926, "grad_norm": 0.7224938869476318, "learning_rate": 2.8233839444889216e-05, "loss": 0.1139, "step": 3157 }, { "epoch": 1.9215089747490113, "grad_norm": 0.6938396692276001, "learning_rate": 2.822149477965617e-05, "loss": 0.1324, "step": 3158 }, { "epoch": 1.9221174323090966, "grad_norm": 0.8000092506408691, "learning_rate": 2.8209149315626697e-05, "loss": 0.1448, "step": 3159 }, { "epoch": 1.9227258898691817, "grad_norm": 0.6927624344825745, "learning_rate": 2.8196803055861964e-05, "loss": 0.1666, "step": 3160 }, { "epoch": 1.9233343474292668, "grad_norm": 0.6752164363861084, "learning_rate": 2.818445600342332e-05, "loss": 0.1415, "step": 3161 }, { "epoch": 1.9239428049893519, "grad_norm": 0.8532926440238953, "learning_rate": 2.817210816137232e-05, "loss": 0.2002, "step": 3162 }, { "epoch": 1.9245512625494372, "grad_norm": 0.6797624230384827, "learning_rate": 2.815975953277072e-05, "loss": 0.1286, "step": 3163 }, { "epoch": 1.9251597201095225, "grad_norm": 0.7706435918807983, "learning_rate": 2.8147410120680455e-05, "loss": 0.1423, "step": 3164 }, { "epoch": 1.9257681776696076, "grad_norm": 0.7351078391075134, "learning_rate": 2.8135059928163683e-05, "loss": 0.1081, "step": 3165 }, { "epoch": 1.9263766352296927, "grad_norm": 0.7624043226242065, "learning_rate": 2.812270895828271e-05, "loss": 0.1184, "step": 3166 }, { "epoch": 1.9269850927897778, "grad_norm": 1.0484083890914917, "learning_rate": 2.811035721410008e-05, "loss": 0.169, "step": 3167 }, { "epoch": 1.9275935503498631, "grad_norm": 0.7948042750358582, "learning_rate": 2.8098004698678517e-05, "loss": 0.1207, "step": 3168 }, { "epoch": 1.9282020079099484, "grad_norm": 0.7036427855491638, "learning_rate": 2.8085651415080903e-05, "loss": 0.1205, "step": 3169 }, { "epoch": 1.9288104654700335, "grad_norm": 0.7674738168716431, "learning_rate": 2.8073297366370365e-05, "loss": 0.1141, "step": 3170 }, { "epoch": 1.9294189230301186, "grad_norm": 0.932759702205658, "learning_rate": 2.806094255561018e-05, "loss": 0.1654, "step": 3171 }, { "epoch": 1.9300273805902037, "grad_norm": 2.075066566467285, "learning_rate": 2.804858698586383e-05, "loss": 0.1699, "step": 3172 }, { "epoch": 1.930635838150289, "grad_norm": 0.8003236055374146, "learning_rate": 2.8036230660194972e-05, "loss": 0.1529, "step": 3173 }, { "epoch": 1.9312442957103741, "grad_norm": 0.8078186511993408, "learning_rate": 2.8023873581667476e-05, "loss": 0.1817, "step": 3174 }, { "epoch": 1.9318527532704595, "grad_norm": 0.8531659841537476, "learning_rate": 2.8011515753345363e-05, "loss": 0.1293, "step": 3175 }, { "epoch": 1.9324612108305446, "grad_norm": 0.642602264881134, "learning_rate": 2.7999157178292873e-05, "loss": 0.1312, "step": 3176 }, { "epoch": 1.9330696683906297, "grad_norm": 0.780268132686615, "learning_rate": 2.798679785957442e-05, "loss": 0.1632, "step": 3177 }, { "epoch": 1.933678125950715, "grad_norm": 0.8036219477653503, "learning_rate": 2.797443780025459e-05, "loss": 0.1647, "step": 3178 }, { "epoch": 1.9342865835108, "grad_norm": 0.6498638391494751, "learning_rate": 2.7962077003398162e-05, "loss": 0.1176, "step": 3179 }, { "epoch": 1.9348950410708854, "grad_norm": 0.6818913817405701, "learning_rate": 2.794971547207011e-05, "loss": 0.1363, "step": 3180 }, { "epoch": 1.9355034986309705, "grad_norm": 0.8026784062385559, "learning_rate": 2.793735320933556e-05, "loss": 0.1645, "step": 3181 }, { "epoch": 1.9361119561910556, "grad_norm": 0.7087329030036926, "learning_rate": 2.7924990218259862e-05, "loss": 0.1045, "step": 3182 }, { "epoch": 1.936720413751141, "grad_norm": 0.7351903319358826, "learning_rate": 2.79126265019085e-05, "loss": 0.1643, "step": 3183 }, { "epoch": 1.937328871311226, "grad_norm": 0.7057482004165649, "learning_rate": 2.7900262063347172e-05, "loss": 0.1522, "step": 3184 }, { "epoch": 1.9379373288713113, "grad_norm": 0.8340151309967041, "learning_rate": 2.7887896905641746e-05, "loss": 0.152, "step": 3185 }, { "epoch": 1.9385457864313964, "grad_norm": 0.8494538068771362, "learning_rate": 2.7875531031858253e-05, "loss": 0.1919, "step": 3186 }, { "epoch": 1.9391542439914815, "grad_norm": 0.6910502314567566, "learning_rate": 2.7863164445062928e-05, "loss": 0.1377, "step": 3187 }, { "epoch": 1.9397627015515668, "grad_norm": 0.7750650644302368, "learning_rate": 2.785079714832216e-05, "loss": 0.1633, "step": 3188 }, { "epoch": 1.940371159111652, "grad_norm": 0.7941158413887024, "learning_rate": 2.7838429144702528e-05, "loss": 0.1317, "step": 3189 }, { "epoch": 1.9409796166717372, "grad_norm": 0.7237272262573242, "learning_rate": 2.7826060437270786e-05, "loss": 0.1287, "step": 3190 }, { "epoch": 1.9415880742318223, "grad_norm": 0.6614869236946106, "learning_rate": 2.781369102909384e-05, "loss": 0.0973, "step": 3191 }, { "epoch": 1.9421965317919074, "grad_norm": 0.8167053461074829, "learning_rate": 2.78013209232388e-05, "loss": 0.1562, "step": 3192 }, { "epoch": 1.9428049893519928, "grad_norm": 0.6749504804611206, "learning_rate": 2.7788950122772944e-05, "loss": 0.1205, "step": 3193 }, { "epoch": 1.9434134469120778, "grad_norm": 0.729088544845581, "learning_rate": 2.77765786307637e-05, "loss": 0.1377, "step": 3194 }, { "epoch": 1.9440219044721632, "grad_norm": 0.7123382687568665, "learning_rate": 2.7764206450278697e-05, "loss": 0.1255, "step": 3195 }, { "epoch": 1.9446303620322483, "grad_norm": 0.7942476272583008, "learning_rate": 2.7751833584385707e-05, "loss": 0.1207, "step": 3196 }, { "epoch": 1.9452388195923334, "grad_norm": 0.6616600751876831, "learning_rate": 2.7739460036152686e-05, "loss": 0.1114, "step": 3197 }, { "epoch": 1.9458472771524185, "grad_norm": 0.7162274718284607, "learning_rate": 2.772708580864777e-05, "loss": 0.1137, "step": 3198 }, { "epoch": 1.9464557347125038, "grad_norm": 0.7372970581054688, "learning_rate": 2.7714710904939238e-05, "loss": 0.1576, "step": 3199 }, { "epoch": 1.947064192272589, "grad_norm": 0.8062869906425476, "learning_rate": 2.7702335328095562e-05, "loss": 0.1603, "step": 3200 }, { "epoch": 1.9476726498326742, "grad_norm": 0.7464812397956848, "learning_rate": 2.7689959081185355e-05, "loss": 0.1139, "step": 3201 }, { "epoch": 1.9482811073927593, "grad_norm": 0.7284194231033325, "learning_rate": 2.7677582167277428e-05, "loss": 0.1258, "step": 3202 }, { "epoch": 1.9488895649528444, "grad_norm": 0.8625699877738953, "learning_rate": 2.766520458944073e-05, "loss": 0.164, "step": 3203 }, { "epoch": 1.9494980225129297, "grad_norm": 0.8282449841499329, "learning_rate": 2.7652826350744375e-05, "loss": 0.136, "step": 3204 }, { "epoch": 1.950106480073015, "grad_norm": 0.8043208122253418, "learning_rate": 2.7640447454257667e-05, "loss": 0.1447, "step": 3205 }, { "epoch": 1.9507149376331, "grad_norm": 0.7022130489349365, "learning_rate": 2.7628067903050052e-05, "loss": 0.1197, "step": 3206 }, { "epoch": 1.9513233951931852, "grad_norm": 0.7067160606384277, "learning_rate": 2.761568770019114e-05, "loss": 0.1394, "step": 3207 }, { "epoch": 1.9519318527532703, "grad_norm": 0.779850423336029, "learning_rate": 2.7603306848750703e-05, "loss": 0.1669, "step": 3208 }, { "epoch": 1.9525403103133556, "grad_norm": 0.765367865562439, "learning_rate": 2.759092535179868e-05, "loss": 0.1147, "step": 3209 }, { "epoch": 1.953148767873441, "grad_norm": 0.6569228172302246, "learning_rate": 2.757854321240516e-05, "loss": 0.1039, "step": 3210 }, { "epoch": 1.953757225433526, "grad_norm": 0.7526434063911438, "learning_rate": 2.756616043364041e-05, "loss": 0.1418, "step": 3211 }, { "epoch": 1.9543656829936111, "grad_norm": 0.7364466786384583, "learning_rate": 2.7553777018574834e-05, "loss": 0.1265, "step": 3212 }, { "epoch": 1.9549741405536962, "grad_norm": 0.7600389122962952, "learning_rate": 2.7541392970279e-05, "loss": 0.1372, "step": 3213 }, { "epoch": 1.9555825981137815, "grad_norm": 0.7566266059875488, "learning_rate": 2.7529008291823642e-05, "loss": 0.1094, "step": 3214 }, { "epoch": 1.9561910556738669, "grad_norm": 0.8381999135017395, "learning_rate": 2.7516622986279638e-05, "loss": 0.1618, "step": 3215 }, { "epoch": 1.956799513233952, "grad_norm": 0.8576135039329529, "learning_rate": 2.7504237056718042e-05, "loss": 0.149, "step": 3216 }, { "epoch": 1.957407970794037, "grad_norm": 0.7162174582481384, "learning_rate": 2.7491850506210027e-05, "loss": 0.129, "step": 3217 }, { "epoch": 1.9580164283541222, "grad_norm": 0.6965352892875671, "learning_rate": 2.747946333782696e-05, "loss": 0.0866, "step": 3218 }, { "epoch": 1.9586248859142075, "grad_norm": 0.7965853810310364, "learning_rate": 2.7467075554640326e-05, "loss": 0.1564, "step": 3219 }, { "epoch": 1.9592333434742928, "grad_norm": 0.6897141933441162, "learning_rate": 2.745468715972179e-05, "loss": 0.1425, "step": 3220 }, { "epoch": 1.9598418010343779, "grad_norm": 0.6696418523788452, "learning_rate": 2.744229815614316e-05, "loss": 0.1209, "step": 3221 }, { "epoch": 1.960450258594463, "grad_norm": 0.7108597159385681, "learning_rate": 2.742990854697638e-05, "loss": 0.1068, "step": 3222 }, { "epoch": 1.961058716154548, "grad_norm": 0.716172456741333, "learning_rate": 2.741751833529357e-05, "loss": 0.1491, "step": 3223 }, { "epoch": 1.9616671737146334, "grad_norm": 0.8014208674430847, "learning_rate": 2.7405127524166973e-05, "loss": 0.1468, "step": 3224 }, { "epoch": 1.9622756312747187, "grad_norm": 0.7219249606132507, "learning_rate": 2.7392736116669005e-05, "loss": 0.123, "step": 3225 }, { "epoch": 1.9628840888348038, "grad_norm": 0.8846964240074158, "learning_rate": 2.738034411587222e-05, "loss": 0.1553, "step": 3226 }, { "epoch": 1.963492546394889, "grad_norm": 0.7260108590126038, "learning_rate": 2.73679515248493e-05, "loss": 0.1358, "step": 3227 }, { "epoch": 1.964101003954974, "grad_norm": 0.7410035133361816, "learning_rate": 2.735555834667311e-05, "loss": 0.1261, "step": 3228 }, { "epoch": 1.9647094615150593, "grad_norm": 0.8541549444198608, "learning_rate": 2.7343164584416637e-05, "loss": 0.1394, "step": 3229 }, { "epoch": 1.9653179190751446, "grad_norm": 0.7482866048812866, "learning_rate": 2.7330770241153008e-05, "loss": 0.1007, "step": 3230 }, { "epoch": 1.9659263766352297, "grad_norm": 0.9279438853263855, "learning_rate": 2.7318375319955512e-05, "loss": 0.1857, "step": 3231 }, { "epoch": 1.9665348341953148, "grad_norm": 0.815904438495636, "learning_rate": 2.7305979823897576e-05, "loss": 0.1255, "step": 3232 }, { "epoch": 1.9671432917554, "grad_norm": 0.8731521368026733, "learning_rate": 2.7293583756052755e-05, "loss": 0.1447, "step": 3233 }, { "epoch": 1.9677517493154852, "grad_norm": 0.8212949633598328, "learning_rate": 2.7281187119494773e-05, "loss": 0.1359, "step": 3234 }, { "epoch": 1.9683602068755706, "grad_norm": 0.7952809929847717, "learning_rate": 2.726878991729746e-05, "loss": 0.1302, "step": 3235 }, { "epoch": 1.9689686644356557, "grad_norm": 0.8877054452896118, "learning_rate": 2.7256392152534825e-05, "loss": 0.1527, "step": 3236 }, { "epoch": 1.9695771219957408, "grad_norm": 0.844585120677948, "learning_rate": 2.724399382828098e-05, "loss": 0.1602, "step": 3237 }, { "epoch": 1.9701855795558258, "grad_norm": 0.7585496306419373, "learning_rate": 2.72315949476102e-05, "loss": 0.148, "step": 3238 }, { "epoch": 1.9707940371159112, "grad_norm": 0.7337778210639954, "learning_rate": 2.7219195513596894e-05, "loss": 0.1213, "step": 3239 }, { "epoch": 1.9714024946759965, "grad_norm": 0.8415425419807434, "learning_rate": 2.72067955293156e-05, "loss": 0.1589, "step": 3240 }, { "epoch": 1.9720109522360816, "grad_norm": 0.7917572855949402, "learning_rate": 2.7194394997841e-05, "loss": 0.1552, "step": 3241 }, { "epoch": 1.9726194097961667, "grad_norm": 0.7449464201927185, "learning_rate": 2.7181993922247907e-05, "loss": 0.1232, "step": 3242 }, { "epoch": 1.9732278673562518, "grad_norm": 0.956392228603363, "learning_rate": 2.7169592305611262e-05, "loss": 0.1336, "step": 3243 }, { "epoch": 1.973836324916337, "grad_norm": 0.8234003782272339, "learning_rate": 2.715719015100617e-05, "loss": 0.1144, "step": 3244 }, { "epoch": 1.9744447824764224, "grad_norm": 0.7959507703781128, "learning_rate": 2.714478746150783e-05, "loss": 0.1309, "step": 3245 }, { "epoch": 1.9750532400365075, "grad_norm": 0.8472030162811279, "learning_rate": 2.71323842401916e-05, "loss": 0.1262, "step": 3246 }, { "epoch": 1.9756616975965926, "grad_norm": 0.9578633904457092, "learning_rate": 2.711998049013297e-05, "loss": 0.1577, "step": 3247 }, { "epoch": 1.9762701551566777, "grad_norm": 0.6737805604934692, "learning_rate": 2.710757621440753e-05, "loss": 0.1057, "step": 3248 }, { "epoch": 1.976878612716763, "grad_norm": 0.8866187334060669, "learning_rate": 2.709517141609105e-05, "loss": 0.1305, "step": 3249 }, { "epoch": 1.9774870702768483, "grad_norm": 0.7365625500679016, "learning_rate": 2.7082766098259377e-05, "loss": 0.1691, "step": 3250 }, { "epoch": 1.9780955278369334, "grad_norm": 0.7187755703926086, "learning_rate": 2.7070360263988526e-05, "loss": 0.1036, "step": 3251 }, { "epoch": 1.9787039853970185, "grad_norm": 0.7651616334915161, "learning_rate": 2.7057953916354638e-05, "loss": 0.1153, "step": 3252 }, { "epoch": 1.9793124429571036, "grad_norm": 0.7826346158981323, "learning_rate": 2.7045547058433953e-05, "loss": 0.1409, "step": 3253 }, { "epoch": 1.979920900517189, "grad_norm": 0.7577517032623291, "learning_rate": 2.7033139693302854e-05, "loss": 0.1137, "step": 3254 }, { "epoch": 1.9805293580772743, "grad_norm": 0.7163174152374268, "learning_rate": 2.7020731824037865e-05, "loss": 0.1258, "step": 3255 }, { "epoch": 1.9811378156373594, "grad_norm": 0.7855852246284485, "learning_rate": 2.7008323453715607e-05, "loss": 0.1282, "step": 3256 }, { "epoch": 1.9817462731974445, "grad_norm": 0.8346313834190369, "learning_rate": 2.6995914585412847e-05, "loss": 0.126, "step": 3257 }, { "epoch": 1.9823547307575295, "grad_norm": 0.7510027885437012, "learning_rate": 2.6983505222206456e-05, "loss": 0.1334, "step": 3258 }, { "epoch": 1.9829631883176149, "grad_norm": 0.8980780839920044, "learning_rate": 2.697109536717346e-05, "loss": 0.1224, "step": 3259 }, { "epoch": 1.9835716458777002, "grad_norm": 0.7538368105888367, "learning_rate": 2.6958685023390963e-05, "loss": 0.1239, "step": 3260 }, { "epoch": 1.9841801034377853, "grad_norm": 0.9021039605140686, "learning_rate": 2.6946274193936222e-05, "loss": 0.1808, "step": 3261 }, { "epoch": 1.9847885609978704, "grad_norm": 0.6933613419532776, "learning_rate": 2.6933862881886607e-05, "loss": 0.1236, "step": 3262 }, { "epoch": 1.9853970185579555, "grad_norm": 0.8434465527534485, "learning_rate": 2.6921451090319603e-05, "loss": 0.1461, "step": 3263 }, { "epoch": 1.9860054761180408, "grad_norm": 0.692387580871582, "learning_rate": 2.6909038822312826e-05, "loss": 0.1007, "step": 3264 }, { "epoch": 1.986613933678126, "grad_norm": 0.7423060536384583, "learning_rate": 2.6896626080943983e-05, "loss": 0.1531, "step": 3265 }, { "epoch": 1.9872223912382112, "grad_norm": 0.7204686403274536, "learning_rate": 2.6884212869290932e-05, "loss": 0.132, "step": 3266 }, { "epoch": 1.9878308487982963, "grad_norm": 0.7953908443450928, "learning_rate": 2.6871799190431627e-05, "loss": 0.1327, "step": 3267 }, { "epoch": 1.9884393063583814, "grad_norm": 0.7700750827789307, "learning_rate": 2.6859385047444146e-05, "loss": 0.1351, "step": 3268 }, { "epoch": 1.9890477639184667, "grad_norm": 0.8604716658592224, "learning_rate": 2.684697044340667e-05, "loss": 0.1616, "step": 3269 }, { "epoch": 1.989656221478552, "grad_norm": 0.7228001356124878, "learning_rate": 2.683455538139752e-05, "loss": 0.1162, "step": 3270 }, { "epoch": 1.9902646790386371, "grad_norm": 0.7633928060531616, "learning_rate": 2.6822139864495092e-05, "loss": 0.1104, "step": 3271 }, { "epoch": 1.9908731365987222, "grad_norm": 0.7890945076942444, "learning_rate": 2.680972389577794e-05, "loss": 0.1464, "step": 3272 }, { "epoch": 1.9914815941588073, "grad_norm": 0.6925262808799744, "learning_rate": 2.6797307478324683e-05, "loss": 0.1416, "step": 3273 }, { "epoch": 1.9920900517188926, "grad_norm": 0.7635548710823059, "learning_rate": 2.678489061521409e-05, "loss": 0.0874, "step": 3274 }, { "epoch": 1.9926985092789777, "grad_norm": 0.7441225051879883, "learning_rate": 2.6772473309525027e-05, "loss": 0.1141, "step": 3275 }, { "epoch": 1.993306966839063, "grad_norm": 0.7360615730285645, "learning_rate": 2.6760055564336462e-05, "loss": 0.1356, "step": 3276 }, { "epoch": 1.9939154243991482, "grad_norm": 0.7741050720214844, "learning_rate": 2.674763738272748e-05, "loss": 0.1568, "step": 3277 }, { "epoch": 1.9945238819592332, "grad_norm": 0.7115604281425476, "learning_rate": 2.673521876777727e-05, "loss": 0.1049, "step": 3278 }, { "epoch": 1.9951323395193186, "grad_norm": 0.750180721282959, "learning_rate": 2.6722799722565127e-05, "loss": 0.1387, "step": 3279 }, { "epoch": 1.9957407970794037, "grad_norm": 0.7199980020523071, "learning_rate": 2.6710380250170476e-05, "loss": 0.1238, "step": 3280 }, { "epoch": 1.996349254639489, "grad_norm": 0.7894657850265503, "learning_rate": 2.6697960353672808e-05, "loss": 0.1399, "step": 3281 }, { "epoch": 1.996957712199574, "grad_norm": 0.7097627520561218, "learning_rate": 2.6685540036151752e-05, "loss": 0.1499, "step": 3282 }, { "epoch": 1.9975661697596592, "grad_norm": 0.7155794501304626, "learning_rate": 2.6673119300687015e-05, "loss": 0.1142, "step": 3283 }, { "epoch": 1.9981746273197445, "grad_norm": 0.6458375453948975, "learning_rate": 2.6660698150358433e-05, "loss": 0.0996, "step": 3284 }, { "epoch": 1.9987830848798296, "grad_norm": 0.6683632135391235, "learning_rate": 2.6648276588245935e-05, "loss": 0.1444, "step": 3285 }, { "epoch": 1.999391542439915, "grad_norm": 0.6919201016426086, "learning_rate": 2.663585461742954e-05, "loss": 0.1567, "step": 3286 }, { "epoch": 2.0, "grad_norm": 0.7347707152366638, "learning_rate": 2.662343224098939e-05, "loss": 0.1315, "step": 3287 }, { "epoch": 2.0, "eval_loss": 1.1706266403198242, "eval_runtime": 105.2304, "eval_samples_per_second": 7.241, "eval_steps_per_second": 0.456, "step": 3287 }, { "epoch": 2.000608457560085, "grad_norm": 0.6429162621498108, "learning_rate": 2.6611009462005716e-05, "loss": 0.0639, "step": 3288 }, { "epoch": 2.00121691512017, "grad_norm": 0.9474073052406311, "learning_rate": 2.659858628355884e-05, "loss": 0.0587, "step": 3289 }, { "epoch": 2.0018253726802557, "grad_norm": 0.5293480753898621, "learning_rate": 2.6586162708729197e-05, "loss": 0.041, "step": 3290 }, { "epoch": 2.002433830240341, "grad_norm": 0.6578330993652344, "learning_rate": 2.657373874059732e-05, "loss": 0.0757, "step": 3291 }, { "epoch": 2.003042287800426, "grad_norm": 0.4521423876285553, "learning_rate": 2.6561314382243835e-05, "loss": 0.0431, "step": 3292 }, { "epoch": 2.003650745360511, "grad_norm": 0.5849077701568604, "learning_rate": 2.654888963674945e-05, "loss": 0.0604, "step": 3293 }, { "epoch": 2.004259202920596, "grad_norm": 0.6320520043373108, "learning_rate": 2.6536464507195015e-05, "loss": 0.0662, "step": 3294 }, { "epoch": 2.0048676604806817, "grad_norm": 0.6279209852218628, "learning_rate": 2.652403899666141e-05, "loss": 0.0622, "step": 3295 }, { "epoch": 2.0054761180407668, "grad_norm": 0.7935817837715149, "learning_rate": 2.651161310822966e-05, "loss": 0.0558, "step": 3296 }, { "epoch": 2.006084575600852, "grad_norm": 0.5588900446891785, "learning_rate": 2.649918684498087e-05, "loss": 0.0638, "step": 3297 }, { "epoch": 2.006693033160937, "grad_norm": 0.5976481437683105, "learning_rate": 2.6486760209996238e-05, "loss": 0.0582, "step": 3298 }, { "epoch": 2.007301490721022, "grad_norm": 0.7166075110435486, "learning_rate": 2.6474333206357038e-05, "loss": 0.0676, "step": 3299 }, { "epoch": 2.0079099482811076, "grad_norm": 0.71381014585495, "learning_rate": 2.6461905837144656e-05, "loss": 0.0694, "step": 3300 }, { "epoch": 2.0085184058411927, "grad_norm": 0.626546323299408, "learning_rate": 2.644947810544056e-05, "loss": 0.0456, "step": 3301 }, { "epoch": 2.0091268634012778, "grad_norm": 0.5405663251876831, "learning_rate": 2.6437050014326313e-05, "loss": 0.0539, "step": 3302 }, { "epoch": 2.009735320961363, "grad_norm": 0.56814044713974, "learning_rate": 2.642462156688356e-05, "loss": 0.0652, "step": 3303 }, { "epoch": 2.010343778521448, "grad_norm": 0.6406037211418152, "learning_rate": 2.6412192766194043e-05, "loss": 0.0693, "step": 3304 }, { "epoch": 2.0109522360815335, "grad_norm": 0.5662345886230469, "learning_rate": 2.6399763615339583e-05, "loss": 0.0526, "step": 3305 }, { "epoch": 2.0115606936416186, "grad_norm": 0.5635650157928467, "learning_rate": 2.638733411740209e-05, "loss": 0.0641, "step": 3306 }, { "epoch": 2.0121691512017037, "grad_norm": 0.6141881346702576, "learning_rate": 2.6374904275463563e-05, "loss": 0.0482, "step": 3307 }, { "epoch": 2.012777608761789, "grad_norm": 0.6459068655967712, "learning_rate": 2.6362474092606088e-05, "loss": 0.0558, "step": 3308 }, { "epoch": 2.013386066321874, "grad_norm": 0.6503152251243591, "learning_rate": 2.635004357191182e-05, "loss": 0.0591, "step": 3309 }, { "epoch": 2.0139945238819594, "grad_norm": 0.6401180624961853, "learning_rate": 2.633761271646302e-05, "loss": 0.052, "step": 3310 }, { "epoch": 2.0146029814420445, "grad_norm": 0.6508215069770813, "learning_rate": 2.632518152934203e-05, "loss": 0.0513, "step": 3311 }, { "epoch": 2.0152114390021296, "grad_norm": 0.5980373024940491, "learning_rate": 2.631275001363125e-05, "loss": 0.0738, "step": 3312 }, { "epoch": 2.0158198965622147, "grad_norm": 0.6737512946128845, "learning_rate": 2.630031817241319e-05, "loss": 0.0581, "step": 3313 }, { "epoch": 2.0164283541223, "grad_norm": 0.5935463905334473, "learning_rate": 2.6287886008770418e-05, "loss": 0.0489, "step": 3314 }, { "epoch": 2.0170368116823854, "grad_norm": 0.565272331237793, "learning_rate": 2.62754535257856e-05, "loss": 0.0601, "step": 3315 }, { "epoch": 2.0176452692424705, "grad_norm": 0.5857186317443848, "learning_rate": 2.626302072654147e-05, "loss": 0.0419, "step": 3316 }, { "epoch": 2.0182537268025555, "grad_norm": 0.6303470134735107, "learning_rate": 2.625058761412085e-05, "loss": 0.061, "step": 3317 }, { "epoch": 2.0188621843626406, "grad_norm": 0.48001688718795776, "learning_rate": 2.6238154191606625e-05, "loss": 0.0497, "step": 3318 }, { "epoch": 2.0194706419227257, "grad_norm": 0.6741344332695007, "learning_rate": 2.6225720462081765e-05, "loss": 0.0624, "step": 3319 }, { "epoch": 2.0200790994828113, "grad_norm": 0.5248076915740967, "learning_rate": 2.6213286428629324e-05, "loss": 0.0398, "step": 3320 }, { "epoch": 2.0206875570428964, "grad_norm": 0.5384348630905151, "learning_rate": 2.6200852094332423e-05, "loss": 0.0336, "step": 3321 }, { "epoch": 2.0212960146029815, "grad_norm": 0.527263343334198, "learning_rate": 2.618841746227425e-05, "loss": 0.0464, "step": 3322 }, { "epoch": 2.0219044721630666, "grad_norm": 0.6149359941482544, "learning_rate": 2.6175982535538098e-05, "loss": 0.0427, "step": 3323 }, { "epoch": 2.0225129297231517, "grad_norm": 0.691123366355896, "learning_rate": 2.6163547317207276e-05, "loss": 0.0704, "step": 3324 }, { "epoch": 2.023121387283237, "grad_norm": 0.6679997444152832, "learning_rate": 2.6151111810365224e-05, "loss": 0.0534, "step": 3325 }, { "epoch": 2.0237298448433223, "grad_norm": 0.5870081186294556, "learning_rate": 2.613867601809543e-05, "loss": 0.0486, "step": 3326 }, { "epoch": 2.0243383024034074, "grad_norm": 0.5979164838790894, "learning_rate": 2.612623994348144e-05, "loss": 0.0516, "step": 3327 }, { "epoch": 2.0249467599634925, "grad_norm": 0.6477155685424805, "learning_rate": 2.6113803589606894e-05, "loss": 0.0495, "step": 3328 }, { "epoch": 2.0255552175235776, "grad_norm": 0.6475479006767273, "learning_rate": 2.6101366959555478e-05, "loss": 0.0615, "step": 3329 }, { "epoch": 2.026163675083663, "grad_norm": 0.48027321696281433, "learning_rate": 2.608893005641096e-05, "loss": 0.0438, "step": 3330 }, { "epoch": 2.0267721326437482, "grad_norm": 0.7494072914123535, "learning_rate": 2.6076492883257187e-05, "loss": 0.0782, "step": 3331 }, { "epoch": 2.0273805902038333, "grad_norm": 0.49906063079833984, "learning_rate": 2.6064055443178036e-05, "loss": 0.0474, "step": 3332 }, { "epoch": 2.0279890477639184, "grad_norm": 0.6378363370895386, "learning_rate": 2.6051617739257494e-05, "loss": 0.0663, "step": 3333 }, { "epoch": 2.0285975053240035, "grad_norm": 0.6063147783279419, "learning_rate": 2.603917977457959e-05, "loss": 0.0589, "step": 3334 }, { "epoch": 2.029205962884089, "grad_norm": 0.575063169002533, "learning_rate": 2.6026741552228417e-05, "loss": 0.054, "step": 3335 }, { "epoch": 2.029814420444174, "grad_norm": 0.569412112236023, "learning_rate": 2.601430307528813e-05, "loss": 0.0419, "step": 3336 }, { "epoch": 2.0304228780042592, "grad_norm": 0.5765204429626465, "learning_rate": 2.6001864346842964e-05, "loss": 0.0443, "step": 3337 }, { "epoch": 2.0310313355643443, "grad_norm": 0.5352804064750671, "learning_rate": 2.5989425369977195e-05, "loss": 0.0451, "step": 3338 }, { "epoch": 2.0316397931244294, "grad_norm": 0.6071703433990479, "learning_rate": 2.5976986147775178e-05, "loss": 0.0492, "step": 3339 }, { "epoch": 2.032248250684515, "grad_norm": 1.0402305126190186, "learning_rate": 2.5964546683321317e-05, "loss": 0.0945, "step": 3340 }, { "epoch": 2.0328567082446, "grad_norm": 0.7192779183387756, "learning_rate": 2.595210697970009e-05, "loss": 0.041, "step": 3341 }, { "epoch": 2.033465165804685, "grad_norm": 0.623833417892456, "learning_rate": 2.5939667039996006e-05, "loss": 0.0514, "step": 3342 }, { "epoch": 2.0340736233647703, "grad_norm": 0.7127055525779724, "learning_rate": 2.592722686729367e-05, "loss": 0.0609, "step": 3343 }, { "epoch": 2.0346820809248554, "grad_norm": 0.7052009701728821, "learning_rate": 2.5914786464677728e-05, "loss": 0.047, "step": 3344 }, { "epoch": 2.0352905384849405, "grad_norm": 0.5680457949638367, "learning_rate": 2.590234583523286e-05, "loss": 0.0566, "step": 3345 }, { "epoch": 2.035898996045026, "grad_norm": 0.505352795124054, "learning_rate": 2.5889904982043845e-05, "loss": 0.0526, "step": 3346 }, { "epoch": 2.036507453605111, "grad_norm": 0.4761469066143036, "learning_rate": 2.5877463908195475e-05, "loss": 0.0408, "step": 3347 }, { "epoch": 2.037115911165196, "grad_norm": 0.5836338996887207, "learning_rate": 2.586502261677264e-05, "loss": 0.0519, "step": 3348 }, { "epoch": 2.0377243687252813, "grad_norm": 0.7985496520996094, "learning_rate": 2.5852581110860253e-05, "loss": 0.0479, "step": 3349 }, { "epoch": 2.0383328262853664, "grad_norm": 0.645341157913208, "learning_rate": 2.5840139393543285e-05, "loss": 0.0608, "step": 3350 }, { "epoch": 2.038941283845452, "grad_norm": 0.6739196181297302, "learning_rate": 2.582769746790677e-05, "loss": 0.0543, "step": 3351 }, { "epoch": 2.039549741405537, "grad_norm": 0.6631483435630798, "learning_rate": 2.5815255337035775e-05, "loss": 0.0529, "step": 3352 }, { "epoch": 2.040158198965622, "grad_norm": 0.596749484539032, "learning_rate": 2.5802813004015443e-05, "loss": 0.0584, "step": 3353 }, { "epoch": 2.040766656525707, "grad_norm": 0.5741117596626282, "learning_rate": 2.5790370471930953e-05, "loss": 0.0559, "step": 3354 }, { "epoch": 2.0413751140857923, "grad_norm": 0.5789355635643005, "learning_rate": 2.577792774386752e-05, "loss": 0.0371, "step": 3355 }, { "epoch": 2.041983571645878, "grad_norm": 0.5400775074958801, "learning_rate": 2.5765484822910434e-05, "loss": 0.0524, "step": 3356 }, { "epoch": 2.042592029205963, "grad_norm": 0.5711213946342468, "learning_rate": 2.5753041712145032e-05, "loss": 0.0506, "step": 3357 }, { "epoch": 2.043200486766048, "grad_norm": 0.6942930817604065, "learning_rate": 2.5740598414656658e-05, "loss": 0.0607, "step": 3358 }, { "epoch": 2.043808944326133, "grad_norm": 0.6652560234069824, "learning_rate": 2.5728154933530755e-05, "loss": 0.0663, "step": 3359 }, { "epoch": 2.0444174018862182, "grad_norm": 0.6147892475128174, "learning_rate": 2.5715711271852777e-05, "loss": 0.0445, "step": 3360 }, { "epoch": 2.0450258594463038, "grad_norm": 0.546258270740509, "learning_rate": 2.570326743270824e-05, "loss": 0.0399, "step": 3361 }, { "epoch": 2.045634317006389, "grad_norm": 0.6004377007484436, "learning_rate": 2.569082341918269e-05, "loss": 0.048, "step": 3362 }, { "epoch": 2.046242774566474, "grad_norm": 0.6591867208480835, "learning_rate": 2.5678379234361728e-05, "loss": 0.0637, "step": 3363 }, { "epoch": 2.046851232126559, "grad_norm": 0.7136169075965881, "learning_rate": 2.566593488133099e-05, "loss": 0.0634, "step": 3364 }, { "epoch": 2.047459689686644, "grad_norm": 0.5792864561080933, "learning_rate": 2.5653490363176157e-05, "loss": 0.0515, "step": 3365 }, { "epoch": 2.0480681472467297, "grad_norm": 0.6176591515541077, "learning_rate": 2.5641045682982957e-05, "loss": 0.0665, "step": 3366 }, { "epoch": 2.048676604806815, "grad_norm": 0.5917629599571228, "learning_rate": 2.5628600843837147e-05, "loss": 0.0364, "step": 3367 }, { "epoch": 2.0492850623669, "grad_norm": 0.6469512581825256, "learning_rate": 2.561615584882453e-05, "loss": 0.0625, "step": 3368 }, { "epoch": 2.049893519926985, "grad_norm": 0.7131155133247375, "learning_rate": 2.5603710701030946e-05, "loss": 0.0567, "step": 3369 }, { "epoch": 2.05050197748707, "grad_norm": 0.7157228589057922, "learning_rate": 2.559126540354227e-05, "loss": 0.0609, "step": 3370 }, { "epoch": 2.0511104350471556, "grad_norm": 0.5784830451011658, "learning_rate": 2.5578819959444413e-05, "loss": 0.0495, "step": 3371 }, { "epoch": 2.0517188926072407, "grad_norm": 0.6077147126197815, "learning_rate": 2.5566374371823342e-05, "loss": 0.0473, "step": 3372 }, { "epoch": 2.052327350167326, "grad_norm": 0.47868582606315613, "learning_rate": 2.555392864376503e-05, "loss": 0.0446, "step": 3373 }, { "epoch": 2.052935807727411, "grad_norm": 0.646273136138916, "learning_rate": 2.554148277835551e-05, "loss": 0.0495, "step": 3374 }, { "epoch": 2.053544265287496, "grad_norm": 0.5764486193656921, "learning_rate": 2.552903677868082e-05, "loss": 0.0493, "step": 3375 }, { "epoch": 2.0541527228475815, "grad_norm": 0.5889171957969666, "learning_rate": 2.551659064782706e-05, "loss": 0.0579, "step": 3376 }, { "epoch": 2.0547611804076666, "grad_norm": 0.6388028860092163, "learning_rate": 2.5504144388880364e-05, "loss": 0.0435, "step": 3377 }, { "epoch": 2.0553696379677517, "grad_norm": 0.6067814230918884, "learning_rate": 2.5491698004926862e-05, "loss": 0.0472, "step": 3378 }, { "epoch": 2.055978095527837, "grad_norm": 0.652549684047699, "learning_rate": 2.5479251499052752e-05, "loss": 0.0672, "step": 3379 }, { "epoch": 2.056586553087922, "grad_norm": 0.6091573238372803, "learning_rate": 2.5466804874344253e-05, "loss": 0.0586, "step": 3380 }, { "epoch": 2.0571950106480075, "grad_norm": 0.6867235898971558, "learning_rate": 2.5454358133887594e-05, "loss": 0.0604, "step": 3381 }, { "epoch": 2.0578034682080926, "grad_norm": 0.5185385942459106, "learning_rate": 2.5441911280769065e-05, "loss": 0.0516, "step": 3382 }, { "epoch": 2.0584119257681777, "grad_norm": 0.5316767692565918, "learning_rate": 2.5429464318074952e-05, "loss": 0.0468, "step": 3383 }, { "epoch": 2.0590203833282628, "grad_norm": 0.6571846008300781, "learning_rate": 2.541701724889159e-05, "loss": 0.0547, "step": 3384 }, { "epoch": 2.059628840888348, "grad_norm": 0.6461378335952759, "learning_rate": 2.5404570076305334e-05, "loss": 0.0463, "step": 3385 }, { "epoch": 2.0602372984484334, "grad_norm": 0.5801909565925598, "learning_rate": 2.5392122803402562e-05, "loss": 0.0558, "step": 3386 }, { "epoch": 2.0608457560085185, "grad_norm": 0.5066788196563721, "learning_rate": 2.5379675433269684e-05, "loss": 0.0393, "step": 3387 }, { "epoch": 2.0614542135686036, "grad_norm": 0.6354464292526245, "learning_rate": 2.5367227968993112e-05, "loss": 0.0723, "step": 3388 }, { "epoch": 2.0620626711286887, "grad_norm": 0.6328490972518921, "learning_rate": 2.535478041365932e-05, "loss": 0.0533, "step": 3389 }, { "epoch": 2.062671128688774, "grad_norm": 0.6055465936660767, "learning_rate": 2.5342332770354772e-05, "loss": 0.0624, "step": 3390 }, { "epoch": 2.0632795862488593, "grad_norm": 0.6035982370376587, "learning_rate": 2.5329885042165963e-05, "loss": 0.0491, "step": 3391 }, { "epoch": 2.0638880438089444, "grad_norm": 0.5577566027641296, "learning_rate": 2.531743723217942e-05, "loss": 0.0433, "step": 3392 }, { "epoch": 2.0644965013690295, "grad_norm": 0.6334397196769714, "learning_rate": 2.5304989343481668e-05, "loss": 0.0557, "step": 3393 }, { "epoch": 2.0651049589291146, "grad_norm": 0.6014379262924194, "learning_rate": 2.5292541379159273e-05, "loss": 0.0428, "step": 3394 }, { "epoch": 2.0657134164891997, "grad_norm": 0.624415934085846, "learning_rate": 2.5280093342298817e-05, "loss": 0.0525, "step": 3395 }, { "epoch": 2.0663218740492852, "grad_norm": 0.5978030562400818, "learning_rate": 2.5267645235986874e-05, "loss": 0.0581, "step": 3396 }, { "epoch": 2.0669303316093703, "grad_norm": 0.5354481935501099, "learning_rate": 2.5255197063310075e-05, "loss": 0.0519, "step": 3397 }, { "epoch": 2.0675387891694554, "grad_norm": 0.6657739877700806, "learning_rate": 2.5242748827355046e-05, "loss": 0.0474, "step": 3398 }, { "epoch": 2.0681472467295405, "grad_norm": 0.5657456517219543, "learning_rate": 2.5230300531208417e-05, "loss": 0.0368, "step": 3399 }, { "epoch": 2.0687557042896256, "grad_norm": 0.572913646697998, "learning_rate": 2.5217852177956862e-05, "loss": 0.0542, "step": 3400 }, { "epoch": 2.069364161849711, "grad_norm": 0.45607224106788635, "learning_rate": 2.5205403770687048e-05, "loss": 0.0383, "step": 3401 }, { "epoch": 2.0699726194097963, "grad_norm": 0.5699754953384399, "learning_rate": 2.5192955312485655e-05, "loss": 0.0553, "step": 3402 }, { "epoch": 2.0705810769698814, "grad_norm": 0.6078694462776184, "learning_rate": 2.51805068064394e-05, "loss": 0.0538, "step": 3403 }, { "epoch": 2.0711895345299665, "grad_norm": 0.6573747396469116, "learning_rate": 2.5168058255634963e-05, "loss": 0.0367, "step": 3404 }, { "epoch": 2.0717979920900516, "grad_norm": 0.5173442959785461, "learning_rate": 2.5155609663159098e-05, "loss": 0.0324, "step": 3405 }, { "epoch": 2.072406449650137, "grad_norm": 0.6545349359512329, "learning_rate": 2.514316103209851e-05, "loss": 0.0446, "step": 3406 }, { "epoch": 2.073014907210222, "grad_norm": 0.6538964509963989, "learning_rate": 2.513071236553996e-05, "loss": 0.0494, "step": 3407 }, { "epoch": 2.0736233647703073, "grad_norm": 0.636489987373352, "learning_rate": 2.5118263666570198e-05, "loss": 0.048, "step": 3408 }, { "epoch": 2.0742318223303924, "grad_norm": 0.6040591597557068, "learning_rate": 2.5105814938275968e-05, "loss": 0.065, "step": 3409 }, { "epoch": 2.0748402798904775, "grad_norm": 0.624994695186615, "learning_rate": 2.5093366183744045e-05, "loss": 0.0609, "step": 3410 }, { "epoch": 2.075448737450563, "grad_norm": 0.7067580223083496, "learning_rate": 2.50809174060612e-05, "loss": 0.0649, "step": 3411 }, { "epoch": 2.076057195010648, "grad_norm": 0.6198956370353699, "learning_rate": 2.5068468608314212e-05, "loss": 0.0583, "step": 3412 }, { "epoch": 2.076665652570733, "grad_norm": 0.6904608607292175, "learning_rate": 2.5056019793589858e-05, "loss": 0.0611, "step": 3413 }, { "epoch": 2.0772741101308183, "grad_norm": 0.6566850543022156, "learning_rate": 2.504357096497494e-05, "loss": 0.0545, "step": 3414 }, { "epoch": 2.0778825676909034, "grad_norm": 0.544262707233429, "learning_rate": 2.5031122125556234e-05, "loss": 0.0518, "step": 3415 }, { "epoch": 2.078491025250989, "grad_norm": 0.5134237408638, "learning_rate": 2.5018673278420534e-05, "loss": 0.0523, "step": 3416 }, { "epoch": 2.079099482811074, "grad_norm": 0.5637457370758057, "learning_rate": 2.500622442665465e-05, "loss": 0.0466, "step": 3417 }, { "epoch": 2.079707940371159, "grad_norm": 0.6894789338111877, "learning_rate": 2.4993775573345356e-05, "loss": 0.0601, "step": 3418 }, { "epoch": 2.0803163979312442, "grad_norm": 0.5236656069755554, "learning_rate": 2.4981326721579472e-05, "loss": 0.0439, "step": 3419 }, { "epoch": 2.0809248554913293, "grad_norm": 0.5601134300231934, "learning_rate": 2.496887787444377e-05, "loss": 0.0547, "step": 3420 }, { "epoch": 2.081533313051415, "grad_norm": 0.6053742170333862, "learning_rate": 2.4956429035025063e-05, "loss": 0.064, "step": 3421 }, { "epoch": 2.0821417706115, "grad_norm": 0.5905669331550598, "learning_rate": 2.4943980206410144e-05, "loss": 0.0522, "step": 3422 }, { "epoch": 2.082750228171585, "grad_norm": 0.5676148533821106, "learning_rate": 2.4931531391685794e-05, "loss": 0.054, "step": 3423 }, { "epoch": 2.08335868573167, "grad_norm": 0.6174594759941101, "learning_rate": 2.4919082593938802e-05, "loss": 0.0686, "step": 3424 }, { "epoch": 2.0839671432917553, "grad_norm": 0.5777302980422974, "learning_rate": 2.4906633816255964e-05, "loss": 0.0531, "step": 3425 }, { "epoch": 2.084575600851841, "grad_norm": 0.5795081257820129, "learning_rate": 2.489418506172404e-05, "loss": 0.0629, "step": 3426 }, { "epoch": 2.085184058411926, "grad_norm": 0.6431803107261658, "learning_rate": 2.4881736333429808e-05, "loss": 0.0645, "step": 3427 }, { "epoch": 2.085792515972011, "grad_norm": 0.5932779312133789, "learning_rate": 2.4869287634460045e-05, "loss": 0.0682, "step": 3428 }, { "epoch": 2.086400973532096, "grad_norm": 0.4519423246383667, "learning_rate": 2.4856838967901492e-05, "loss": 0.0399, "step": 3429 }, { "epoch": 2.087009431092181, "grad_norm": 0.5230922698974609, "learning_rate": 2.4844390336840908e-05, "loss": 0.0462, "step": 3430 }, { "epoch": 2.0876178886522667, "grad_norm": 0.5311405658721924, "learning_rate": 2.4831941744365043e-05, "loss": 0.0402, "step": 3431 }, { "epoch": 2.088226346212352, "grad_norm": 0.658665657043457, "learning_rate": 2.4819493193560618e-05, "loss": 0.0498, "step": 3432 }, { "epoch": 2.088834803772437, "grad_norm": 0.5413145422935486, "learning_rate": 2.4807044687514344e-05, "loss": 0.0499, "step": 3433 }, { "epoch": 2.089443261332522, "grad_norm": 0.6543242931365967, "learning_rate": 2.4794596229312958e-05, "loss": 0.0605, "step": 3434 }, { "epoch": 2.090051718892607, "grad_norm": 0.5483290553092957, "learning_rate": 2.4782147822043144e-05, "loss": 0.0289, "step": 3435 }, { "epoch": 2.090660176452692, "grad_norm": 0.5126484036445618, "learning_rate": 2.476969946879159e-05, "loss": 0.0396, "step": 3436 }, { "epoch": 2.0912686340127777, "grad_norm": 0.5537660121917725, "learning_rate": 2.475725117264496e-05, "loss": 0.0515, "step": 3437 }, { "epoch": 2.091877091572863, "grad_norm": 0.5816277861595154, "learning_rate": 2.474480293668993e-05, "loss": 0.041, "step": 3438 }, { "epoch": 2.092485549132948, "grad_norm": 0.6799830794334412, "learning_rate": 2.473235476401313e-05, "loss": 0.0589, "step": 3439 }, { "epoch": 2.093094006693033, "grad_norm": 0.6604799032211304, "learning_rate": 2.4719906657701192e-05, "loss": 0.0506, "step": 3440 }, { "epoch": 2.093702464253118, "grad_norm": 0.7943026423454285, "learning_rate": 2.4707458620840732e-05, "loss": 0.0622, "step": 3441 }, { "epoch": 2.0943109218132037, "grad_norm": 0.5456538200378418, "learning_rate": 2.469501065651834e-05, "loss": 0.0451, "step": 3442 }, { "epoch": 2.0949193793732888, "grad_norm": 0.6690013408660889, "learning_rate": 2.4682562767820587e-05, "loss": 0.0578, "step": 3443 }, { "epoch": 2.095527836933374, "grad_norm": 0.645366907119751, "learning_rate": 2.4670114957834043e-05, "loss": 0.062, "step": 3444 }, { "epoch": 2.096136294493459, "grad_norm": 0.5978085398674011, "learning_rate": 2.4657667229645237e-05, "loss": 0.0486, "step": 3445 }, { "epoch": 2.096744752053544, "grad_norm": 0.5921434760093689, "learning_rate": 2.4645219586340683e-05, "loss": 0.0548, "step": 3446 }, { "epoch": 2.0973532096136296, "grad_norm": 0.4595530033111572, "learning_rate": 2.463277203100689e-05, "loss": 0.0425, "step": 3447 }, { "epoch": 2.0979616671737147, "grad_norm": 0.5539052486419678, "learning_rate": 2.462032456673033e-05, "loss": 0.0537, "step": 3448 }, { "epoch": 2.0985701247338, "grad_norm": 0.5757131576538086, "learning_rate": 2.4607877196597437e-05, "loss": 0.0438, "step": 3449 }, { "epoch": 2.099178582293885, "grad_norm": 0.569847047328949, "learning_rate": 2.4595429923694668e-05, "loss": 0.0506, "step": 3450 }, { "epoch": 2.09978703985397, "grad_norm": 0.5236031413078308, "learning_rate": 2.4582982751108417e-05, "loss": 0.0345, "step": 3451 }, { "epoch": 2.1003954974140555, "grad_norm": 0.6032043695449829, "learning_rate": 2.4570535681925047e-05, "loss": 0.0528, "step": 3452 }, { "epoch": 2.1010039549741406, "grad_norm": 0.5206643342971802, "learning_rate": 2.455808871923094e-05, "loss": 0.046, "step": 3453 }, { "epoch": 2.1016124125342257, "grad_norm": 0.5682435035705566, "learning_rate": 2.4545641866112408e-05, "loss": 0.0408, "step": 3454 }, { "epoch": 2.102220870094311, "grad_norm": 0.6149486899375916, "learning_rate": 2.453319512565576e-05, "loss": 0.0507, "step": 3455 }, { "epoch": 2.102829327654396, "grad_norm": 0.5755969882011414, "learning_rate": 2.4520748500947247e-05, "loss": 0.0593, "step": 3456 }, { "epoch": 2.1034377852144814, "grad_norm": 0.6103445291519165, "learning_rate": 2.4508301995073144e-05, "loss": 0.0553, "step": 3457 }, { "epoch": 2.1040462427745665, "grad_norm": 0.5596705675125122, "learning_rate": 2.449585561111965e-05, "loss": 0.0489, "step": 3458 }, { "epoch": 2.1046547003346516, "grad_norm": 0.535492479801178, "learning_rate": 2.4483409352172936e-05, "loss": 0.0503, "step": 3459 }, { "epoch": 2.1052631578947367, "grad_norm": 0.6022449731826782, "learning_rate": 2.4470963221319188e-05, "loss": 0.0457, "step": 3460 }, { "epoch": 2.105871615454822, "grad_norm": 0.680794894695282, "learning_rate": 2.4458517221644507e-05, "loss": 0.0521, "step": 3461 }, { "epoch": 2.1064800730149074, "grad_norm": 0.6097959280014038, "learning_rate": 2.444607135623497e-05, "loss": 0.0467, "step": 3462 }, { "epoch": 2.1070885305749925, "grad_norm": 0.4896848499774933, "learning_rate": 2.4433625628176663e-05, "loss": 0.0311, "step": 3463 }, { "epoch": 2.1076969881350776, "grad_norm": 0.6469544172286987, "learning_rate": 2.442118004055559e-05, "loss": 0.0737, "step": 3464 }, { "epoch": 2.1083054456951626, "grad_norm": 0.5251388549804688, "learning_rate": 2.4408734596457744e-05, "loss": 0.0456, "step": 3465 }, { "epoch": 2.1089139032552477, "grad_norm": 0.5152940154075623, "learning_rate": 2.439628929896906e-05, "loss": 0.0327, "step": 3466 }, { "epoch": 2.1095223608153333, "grad_norm": 0.552280068397522, "learning_rate": 2.4383844151175478e-05, "loss": 0.0592, "step": 3467 }, { "epoch": 2.1101308183754184, "grad_norm": 0.6947097182273865, "learning_rate": 2.4371399156162862e-05, "loss": 0.0865, "step": 3468 }, { "epoch": 2.1107392759355035, "grad_norm": 0.5415664315223694, "learning_rate": 2.4358954317017045e-05, "loss": 0.0453, "step": 3469 }, { "epoch": 2.1113477334955886, "grad_norm": 0.5632659196853638, "learning_rate": 2.434650963682385e-05, "loss": 0.0497, "step": 3470 }, { "epoch": 2.1119561910556737, "grad_norm": 0.5756339430809021, "learning_rate": 2.433406511866902e-05, "loss": 0.0493, "step": 3471 }, { "epoch": 2.112564648615759, "grad_norm": 0.5048176646232605, "learning_rate": 2.4321620765638274e-05, "loss": 0.0509, "step": 3472 }, { "epoch": 2.1131731061758443, "grad_norm": 0.6544382572174072, "learning_rate": 2.4309176580817318e-05, "loss": 0.0654, "step": 3473 }, { "epoch": 2.1137815637359294, "grad_norm": 0.5566495656967163, "learning_rate": 2.429673256729177e-05, "loss": 0.0493, "step": 3474 }, { "epoch": 2.1143900212960145, "grad_norm": 0.53816819190979, "learning_rate": 2.428428872814722e-05, "loss": 0.0438, "step": 3475 }, { "epoch": 2.1149984788560996, "grad_norm": 0.5756782293319702, "learning_rate": 2.4271845066469247e-05, "loss": 0.0514, "step": 3476 }, { "epoch": 2.115606936416185, "grad_norm": 0.6478017568588257, "learning_rate": 2.4259401585343344e-05, "loss": 0.0584, "step": 3477 }, { "epoch": 2.1162153939762702, "grad_norm": 0.6319558024406433, "learning_rate": 2.424695828785498e-05, "loss": 0.0524, "step": 3478 }, { "epoch": 2.1168238515363553, "grad_norm": 0.5530511736869812, "learning_rate": 2.4234515177089562e-05, "loss": 0.0523, "step": 3479 }, { "epoch": 2.1174323090964404, "grad_norm": 0.5840123295783997, "learning_rate": 2.4222072256132483e-05, "loss": 0.0473, "step": 3480 }, { "epoch": 2.1180407666565255, "grad_norm": 0.5127155184745789, "learning_rate": 2.4209629528069063e-05, "loss": 0.0456, "step": 3481 }, { "epoch": 2.118649224216611, "grad_norm": 0.6958695650100708, "learning_rate": 2.419718699598456e-05, "loss": 0.0579, "step": 3482 }, { "epoch": 2.119257681776696, "grad_norm": 0.6823195815086365, "learning_rate": 2.418474466296423e-05, "loss": 0.0523, "step": 3483 }, { "epoch": 2.1198661393367813, "grad_norm": 0.5722386240959167, "learning_rate": 2.417230253209324e-05, "loss": 0.0497, "step": 3484 }, { "epoch": 2.1204745968968663, "grad_norm": 1.0104351043701172, "learning_rate": 2.4159860606456718e-05, "loss": 0.0758, "step": 3485 }, { "epoch": 2.1210830544569514, "grad_norm": 0.5905888080596924, "learning_rate": 2.414741888913975e-05, "loss": 0.0535, "step": 3486 }, { "epoch": 2.121691512017037, "grad_norm": 0.732254683971405, "learning_rate": 2.4134977383227364e-05, "loss": 0.0567, "step": 3487 }, { "epoch": 2.122299969577122, "grad_norm": 0.5152473449707031, "learning_rate": 2.412253609180453e-05, "loss": 0.0359, "step": 3488 }, { "epoch": 2.122908427137207, "grad_norm": 0.6493118405342102, "learning_rate": 2.4110095017956164e-05, "loss": 0.0565, "step": 3489 }, { "epoch": 2.1235168846972923, "grad_norm": 0.5857681632041931, "learning_rate": 2.4097654164767148e-05, "loss": 0.0332, "step": 3490 }, { "epoch": 2.1241253422573774, "grad_norm": 0.533673107624054, "learning_rate": 2.4085213535322288e-05, "loss": 0.0364, "step": 3491 }, { "epoch": 2.124733799817463, "grad_norm": 0.6203796863555908, "learning_rate": 2.4072773132706326e-05, "loss": 0.0544, "step": 3492 }, { "epoch": 2.125342257377548, "grad_norm": 0.5900121927261353, "learning_rate": 2.4060332960003996e-05, "loss": 0.0369, "step": 3493 }, { "epoch": 2.125950714937633, "grad_norm": 0.6247245669364929, "learning_rate": 2.4047893020299922e-05, "loss": 0.0499, "step": 3494 }, { "epoch": 2.126559172497718, "grad_norm": 0.5890246033668518, "learning_rate": 2.403545331667868e-05, "loss": 0.0455, "step": 3495 }, { "epoch": 2.1271676300578033, "grad_norm": 0.7032641768455505, "learning_rate": 2.4023013852224828e-05, "loss": 0.0676, "step": 3496 }, { "epoch": 2.127776087617889, "grad_norm": 0.5493380427360535, "learning_rate": 2.401057463002281e-05, "loss": 0.0512, "step": 3497 }, { "epoch": 2.128384545177974, "grad_norm": 0.590238094329834, "learning_rate": 2.3998135653157035e-05, "loss": 0.0602, "step": 3498 }, { "epoch": 2.128993002738059, "grad_norm": 0.6478054523468018, "learning_rate": 2.398569692471187e-05, "loss": 0.0459, "step": 3499 }, { "epoch": 2.129601460298144, "grad_norm": 0.6916689872741699, "learning_rate": 2.397325844777159e-05, "loss": 0.0472, "step": 3500 }, { "epoch": 2.130209917858229, "grad_norm": 0.5881983637809753, "learning_rate": 2.3960820225420418e-05, "loss": 0.0581, "step": 3501 }, { "epoch": 2.1308183754183148, "grad_norm": 0.5607914924621582, "learning_rate": 2.39483822607425e-05, "loss": 0.0485, "step": 3502 }, { "epoch": 2.1314268329784, "grad_norm": 0.601393461227417, "learning_rate": 2.3935944556821966e-05, "loss": 0.0679, "step": 3503 }, { "epoch": 2.132035290538485, "grad_norm": 0.5792237520217896, "learning_rate": 2.3923507116742826e-05, "loss": 0.0596, "step": 3504 }, { "epoch": 2.13264374809857, "grad_norm": 0.5309643745422363, "learning_rate": 2.391106994358904e-05, "loss": 0.0422, "step": 3505 }, { "epoch": 2.133252205658655, "grad_norm": 0.61043381690979, "learning_rate": 2.3898633040444528e-05, "loss": 0.0519, "step": 3506 }, { "epoch": 2.1338606632187407, "grad_norm": 0.5031585693359375, "learning_rate": 2.388619641039312e-05, "loss": 0.0334, "step": 3507 }, { "epoch": 2.134469120778826, "grad_norm": 0.5993291139602661, "learning_rate": 2.387376005651856e-05, "loss": 0.0476, "step": 3508 }, { "epoch": 2.135077578338911, "grad_norm": 0.49983397126197815, "learning_rate": 2.3861323981904575e-05, "loss": 0.0476, "step": 3509 }, { "epoch": 2.135686035898996, "grad_norm": 0.6096305847167969, "learning_rate": 2.3848888189634778e-05, "loss": 0.0449, "step": 3510 }, { "epoch": 2.136294493459081, "grad_norm": 0.6761244535446167, "learning_rate": 2.3836452682792734e-05, "loss": 0.0474, "step": 3511 }, { "epoch": 2.1369029510191666, "grad_norm": 0.6972514390945435, "learning_rate": 2.382401746446191e-05, "loss": 0.0652, "step": 3512 }, { "epoch": 2.1375114085792517, "grad_norm": 0.6121935844421387, "learning_rate": 2.3811582537725753e-05, "loss": 0.0501, "step": 3513 }, { "epoch": 2.138119866139337, "grad_norm": 0.6078847646713257, "learning_rate": 2.379914790566759e-05, "loss": 0.0568, "step": 3514 }, { "epoch": 2.138728323699422, "grad_norm": 0.6310839653015137, "learning_rate": 2.378671357137068e-05, "loss": 0.0451, "step": 3515 }, { "epoch": 2.139336781259507, "grad_norm": 0.48389577865600586, "learning_rate": 2.377427953791824e-05, "loss": 0.0449, "step": 3516 }, { "epoch": 2.1399452388195925, "grad_norm": 0.6960240006446838, "learning_rate": 2.3761845808393388e-05, "loss": 0.0582, "step": 3517 }, { "epoch": 2.1405536963796776, "grad_norm": 0.571576714515686, "learning_rate": 2.3749412385879154e-05, "loss": 0.0661, "step": 3518 }, { "epoch": 2.1411621539397627, "grad_norm": 0.6685907244682312, "learning_rate": 2.3736979273458535e-05, "loss": 0.0555, "step": 3519 }, { "epoch": 2.141770611499848, "grad_norm": 0.6551706194877625, "learning_rate": 2.3724546474214406e-05, "loss": 0.0392, "step": 3520 }, { "epoch": 2.142379069059933, "grad_norm": 0.609714686870575, "learning_rate": 2.371211399122958e-05, "loss": 0.0674, "step": 3521 }, { "epoch": 2.1429875266200185, "grad_norm": 0.5921576619148254, "learning_rate": 2.3699681827586813e-05, "loss": 0.0498, "step": 3522 }, { "epoch": 2.1435959841801036, "grad_norm": 0.5523459315299988, "learning_rate": 2.3687249986368752e-05, "loss": 0.0502, "step": 3523 }, { "epoch": 2.1442044417401886, "grad_norm": 0.6032785177230835, "learning_rate": 2.3674818470657975e-05, "loss": 0.0567, "step": 3524 }, { "epoch": 2.1448128993002737, "grad_norm": 0.5920435786247253, "learning_rate": 2.3662387283536978e-05, "loss": 0.0389, "step": 3525 }, { "epoch": 2.145421356860359, "grad_norm": 0.5291022062301636, "learning_rate": 2.3649956428088187e-05, "loss": 0.042, "step": 3526 }, { "epoch": 2.1460298144204444, "grad_norm": 0.6911409497261047, "learning_rate": 2.363752590739392e-05, "loss": 0.0591, "step": 3527 }, { "epoch": 2.1466382719805295, "grad_norm": 0.6609728336334229, "learning_rate": 2.362509572453644e-05, "loss": 0.0463, "step": 3528 }, { "epoch": 2.1472467295406146, "grad_norm": 0.5278157591819763, "learning_rate": 2.3612665882597915e-05, "loss": 0.0378, "step": 3529 }, { "epoch": 2.1478551871006997, "grad_norm": 0.7173718810081482, "learning_rate": 2.360023638466042e-05, "loss": 0.0511, "step": 3530 }, { "epoch": 2.1484636446607848, "grad_norm": 0.5184733271598816, "learning_rate": 2.3587807233805956e-05, "loss": 0.0448, "step": 3531 }, { "epoch": 2.1490721022208703, "grad_norm": 0.7443384528160095, "learning_rate": 2.3575378433116445e-05, "loss": 0.0522, "step": 3532 }, { "epoch": 2.1496805597809554, "grad_norm": 0.5677869915962219, "learning_rate": 2.356294998567369e-05, "loss": 0.0391, "step": 3533 }, { "epoch": 2.1502890173410405, "grad_norm": 0.7566191554069519, "learning_rate": 2.3550521894559446e-05, "loss": 0.0478, "step": 3534 }, { "epoch": 2.1508974749011256, "grad_norm": 0.5744057297706604, "learning_rate": 2.353809416285535e-05, "loss": 0.0428, "step": 3535 }, { "epoch": 2.1515059324612107, "grad_norm": 0.6628245115280151, "learning_rate": 2.3525666793642968e-05, "loss": 0.0547, "step": 3536 }, { "epoch": 2.1521143900212962, "grad_norm": 0.5155208706855774, "learning_rate": 2.351323979000377e-05, "loss": 0.047, "step": 3537 }, { "epoch": 2.1527228475813813, "grad_norm": 0.6521499752998352, "learning_rate": 2.350081315501913e-05, "loss": 0.0421, "step": 3538 }, { "epoch": 2.1533313051414664, "grad_norm": 0.6431947946548462, "learning_rate": 2.3488386891770346e-05, "loss": 0.0441, "step": 3539 }, { "epoch": 2.1539397627015515, "grad_norm": 0.6007315516471863, "learning_rate": 2.3475961003338594e-05, "loss": 0.052, "step": 3540 }, { "epoch": 2.1545482202616366, "grad_norm": 0.5834375023841858, "learning_rate": 2.3463535492804994e-05, "loss": 0.056, "step": 3541 }, { "epoch": 2.155156677821722, "grad_norm": 0.6027958989143372, "learning_rate": 2.345111036325055e-05, "loss": 0.0483, "step": 3542 }, { "epoch": 2.1557651353818073, "grad_norm": 0.7618597745895386, "learning_rate": 2.3438685617756174e-05, "loss": 0.0605, "step": 3543 }, { "epoch": 2.1563735929418923, "grad_norm": 0.6837668418884277, "learning_rate": 2.342626125940268e-05, "loss": 0.0551, "step": 3544 }, { "epoch": 2.1569820505019774, "grad_norm": 0.5220887064933777, "learning_rate": 2.341383729127081e-05, "loss": 0.0433, "step": 3545 }, { "epoch": 2.1575905080620625, "grad_norm": 0.7032208442687988, "learning_rate": 2.3401413716441166e-05, "loss": 0.066, "step": 3546 }, { "epoch": 2.158198965622148, "grad_norm": 0.5578327178955078, "learning_rate": 2.3388990537994296e-05, "loss": 0.0519, "step": 3547 }, { "epoch": 2.158807423182233, "grad_norm": 0.686423122882843, "learning_rate": 2.3376567759010614e-05, "loss": 0.0479, "step": 3548 }, { "epoch": 2.1594158807423183, "grad_norm": 0.7522620558738708, "learning_rate": 2.3364145382570462e-05, "loss": 0.0713, "step": 3549 }, { "epoch": 2.1600243383024034, "grad_norm": 0.5565488934516907, "learning_rate": 2.3351723411754074e-05, "loss": 0.0474, "step": 3550 }, { "epoch": 2.1606327958624885, "grad_norm": 0.5260975956916809, "learning_rate": 2.3339301849641573e-05, "loss": 0.0464, "step": 3551 }, { "epoch": 2.161241253422574, "grad_norm": 0.574340283870697, "learning_rate": 2.332688069931299e-05, "loss": 0.0425, "step": 3552 }, { "epoch": 2.161849710982659, "grad_norm": 0.5723605751991272, "learning_rate": 2.3314459963848264e-05, "loss": 0.0519, "step": 3553 }, { "epoch": 2.162458168542744, "grad_norm": 0.6394765377044678, "learning_rate": 2.33020396463272e-05, "loss": 0.0454, "step": 3554 }, { "epoch": 2.1630666261028293, "grad_norm": 0.6276997923851013, "learning_rate": 2.328961974982953e-05, "loss": 0.0485, "step": 3555 }, { "epoch": 2.1636750836629144, "grad_norm": 0.6199513077735901, "learning_rate": 2.3277200277434876e-05, "loss": 0.0526, "step": 3556 }, { "epoch": 2.164283541223, "grad_norm": 0.5897301435470581, "learning_rate": 2.3264781232222742e-05, "loss": 0.0539, "step": 3557 }, { "epoch": 2.164891998783085, "grad_norm": 0.5835275053977966, "learning_rate": 2.3252362617272527e-05, "loss": 0.0548, "step": 3558 }, { "epoch": 2.16550045634317, "grad_norm": 0.5806435346603394, "learning_rate": 2.3239944435663547e-05, "loss": 0.0499, "step": 3559 }, { "epoch": 2.166108913903255, "grad_norm": 0.6462282538414001, "learning_rate": 2.3227526690474982e-05, "loss": 0.0611, "step": 3560 }, { "epoch": 2.1667173714633403, "grad_norm": 0.58769690990448, "learning_rate": 2.321510938478591e-05, "loss": 0.0462, "step": 3561 }, { "epoch": 2.1673258290234254, "grad_norm": 0.5471524596214294, "learning_rate": 2.3202692521675323e-05, "loss": 0.0348, "step": 3562 }, { "epoch": 2.167934286583511, "grad_norm": 0.5656521320343018, "learning_rate": 2.3190276104222073e-05, "loss": 0.055, "step": 3563 }, { "epoch": 2.168542744143596, "grad_norm": 0.6103416085243225, "learning_rate": 2.3177860135504907e-05, "loss": 0.0562, "step": 3564 }, { "epoch": 2.169151201703681, "grad_norm": 0.6096418499946594, "learning_rate": 2.316544461860249e-05, "loss": 0.0513, "step": 3565 }, { "epoch": 2.1697596592637662, "grad_norm": 0.5852376818656921, "learning_rate": 2.3153029556593335e-05, "loss": 0.0505, "step": 3566 }, { "epoch": 2.1703681168238513, "grad_norm": 0.6946520805358887, "learning_rate": 2.3140614952555856e-05, "loss": 0.0659, "step": 3567 }, { "epoch": 2.170976574383937, "grad_norm": 0.546222448348999, "learning_rate": 2.3128200809568375e-05, "loss": 0.057, "step": 3568 }, { "epoch": 2.171585031944022, "grad_norm": 0.5653393864631653, "learning_rate": 2.3115787130709074e-05, "loss": 0.0415, "step": 3569 }, { "epoch": 2.172193489504107, "grad_norm": 0.5774003863334656, "learning_rate": 2.3103373919056026e-05, "loss": 0.0561, "step": 3570 }, { "epoch": 2.172801947064192, "grad_norm": 0.5911126136779785, "learning_rate": 2.309096117768718e-05, "loss": 0.0538, "step": 3571 }, { "epoch": 2.1734104046242773, "grad_norm": 0.549182653427124, "learning_rate": 2.3078548909680403e-05, "loss": 0.0474, "step": 3572 }, { "epoch": 2.174018862184363, "grad_norm": 0.5475966334342957, "learning_rate": 2.3066137118113405e-05, "loss": 0.0486, "step": 3573 }, { "epoch": 2.174627319744448, "grad_norm": 0.6825101375579834, "learning_rate": 2.305372580606378e-05, "loss": 0.0569, "step": 3574 }, { "epoch": 2.175235777304533, "grad_norm": 0.5372380614280701, "learning_rate": 2.3041314976609043e-05, "loss": 0.0525, "step": 3575 }, { "epoch": 2.175844234864618, "grad_norm": 0.5879507064819336, "learning_rate": 2.3028904632826555e-05, "loss": 0.0485, "step": 3576 }, { "epoch": 2.176452692424703, "grad_norm": 0.6694579124450684, "learning_rate": 2.301649477779354e-05, "loss": 0.0516, "step": 3577 }, { "epoch": 2.1770611499847887, "grad_norm": 0.48730021715164185, "learning_rate": 2.300408541458716e-05, "loss": 0.0444, "step": 3578 }, { "epoch": 2.177669607544874, "grad_norm": 0.6008870005607605, "learning_rate": 2.29916765462844e-05, "loss": 0.0413, "step": 3579 }, { "epoch": 2.178278065104959, "grad_norm": 0.557727038860321, "learning_rate": 2.2979268175962134e-05, "loss": 0.0375, "step": 3580 }, { "epoch": 2.178886522665044, "grad_norm": 0.5483887195587158, "learning_rate": 2.2966860306697148e-05, "loss": 0.0426, "step": 3581 }, { "epoch": 2.179494980225129, "grad_norm": 0.6324334144592285, "learning_rate": 2.2954452941566057e-05, "loss": 0.0419, "step": 3582 }, { "epoch": 2.1801034377852146, "grad_norm": 0.643545925617218, "learning_rate": 2.2942046083645375e-05, "loss": 0.0425, "step": 3583 }, { "epoch": 2.1807118953452997, "grad_norm": 0.5663473010063171, "learning_rate": 2.292963973601147e-05, "loss": 0.0356, "step": 3584 }, { "epoch": 2.181320352905385, "grad_norm": 0.6918662786483765, "learning_rate": 2.291723390174063e-05, "loss": 0.0698, "step": 3585 }, { "epoch": 2.18192881046547, "grad_norm": 0.8219648003578186, "learning_rate": 2.2904828583908967e-05, "loss": 0.0711, "step": 3586 }, { "epoch": 2.182537268025555, "grad_norm": 0.5029035806655884, "learning_rate": 2.289242378559247e-05, "loss": 0.0496, "step": 3587 }, { "epoch": 2.1831457255856406, "grad_norm": 0.5251539945602417, "learning_rate": 2.288001950986704e-05, "loss": 0.0411, "step": 3588 }, { "epoch": 2.1837541831457257, "grad_norm": 0.7362745404243469, "learning_rate": 2.2867615759808403e-05, "loss": 0.0523, "step": 3589 }, { "epoch": 2.1843626407058108, "grad_norm": 0.5987080335617065, "learning_rate": 2.2855212538492167e-05, "loss": 0.0515, "step": 3590 }, { "epoch": 2.184971098265896, "grad_norm": 0.5399122834205627, "learning_rate": 2.2842809848993834e-05, "loss": 0.0555, "step": 3591 }, { "epoch": 2.185579555825981, "grad_norm": 0.6066261529922485, "learning_rate": 2.2830407694388743e-05, "loss": 0.0404, "step": 3592 }, { "epoch": 2.1861880133860665, "grad_norm": 0.5191651582717896, "learning_rate": 2.281800607775211e-05, "loss": 0.0519, "step": 3593 }, { "epoch": 2.1867964709461516, "grad_norm": 0.6118345856666565, "learning_rate": 2.2805605002159007e-05, "loss": 0.05, "step": 3594 }, { "epoch": 2.1874049285062367, "grad_norm": 0.6577736139297485, "learning_rate": 2.2793204470684406e-05, "loss": 0.0558, "step": 3595 }, { "epoch": 2.188013386066322, "grad_norm": 0.5983335375785828, "learning_rate": 2.2780804486403115e-05, "loss": 0.064, "step": 3596 }, { "epoch": 2.188621843626407, "grad_norm": 0.6087679266929626, "learning_rate": 2.2768405052389802e-05, "loss": 0.0561, "step": 3597 }, { "epoch": 2.1892303011864924, "grad_norm": 0.5540795922279358, "learning_rate": 2.2756006171719027e-05, "loss": 0.0431, "step": 3598 }, { "epoch": 2.1898387587465775, "grad_norm": 0.5119609236717224, "learning_rate": 2.2743607847465188e-05, "loss": 0.0466, "step": 3599 }, { "epoch": 2.1904472163066626, "grad_norm": 0.5084932446479797, "learning_rate": 2.273121008270254e-05, "loss": 0.0535, "step": 3600 }, { "epoch": 2.1910556738667477, "grad_norm": 0.5782137513160706, "learning_rate": 2.2718812880505232e-05, "loss": 0.0635, "step": 3601 }, { "epoch": 2.191664131426833, "grad_norm": 0.6178430318832397, "learning_rate": 2.2706416243947248e-05, "loss": 0.0442, "step": 3602 }, { "epoch": 2.1922725889869183, "grad_norm": 0.5488747954368591, "learning_rate": 2.2694020176102427e-05, "loss": 0.0394, "step": 3603 }, { "epoch": 2.1928810465470034, "grad_norm": 0.6640757322311401, "learning_rate": 2.268162468004449e-05, "loss": 0.0524, "step": 3604 }, { "epoch": 2.1934895041070885, "grad_norm": 0.5369768738746643, "learning_rate": 2.2669229758846998e-05, "loss": 0.044, "step": 3605 }, { "epoch": 2.1940979616671736, "grad_norm": 0.554397702217102, "learning_rate": 2.265683541558338e-05, "loss": 0.0384, "step": 3606 }, { "epoch": 2.1947064192272587, "grad_norm": 0.6047532558441162, "learning_rate": 2.2644441653326892e-05, "loss": 0.0555, "step": 3607 }, { "epoch": 2.1953148767873443, "grad_norm": 0.5616478323936462, "learning_rate": 2.2632048475150705e-05, "loss": 0.0516, "step": 3608 }, { "epoch": 2.1959233343474294, "grad_norm": 0.5725663900375366, "learning_rate": 2.2619655884127793e-05, "loss": 0.0551, "step": 3609 }, { "epoch": 2.1965317919075145, "grad_norm": 0.6617404222488403, "learning_rate": 2.2607263883330994e-05, "loss": 0.0457, "step": 3610 }, { "epoch": 2.1971402494675996, "grad_norm": 1.2666044235229492, "learning_rate": 2.259487247583303e-05, "loss": 0.078, "step": 3611 }, { "epoch": 2.1977487070276847, "grad_norm": 0.7400673031806946, "learning_rate": 2.258248166470644e-05, "loss": 0.0534, "step": 3612 }, { "epoch": 2.19835716458777, "grad_norm": 0.6314421892166138, "learning_rate": 2.257009145302362e-05, "loss": 0.039, "step": 3613 }, { "epoch": 2.1989656221478553, "grad_norm": 0.5742847919464111, "learning_rate": 2.2557701843856847e-05, "loss": 0.0468, "step": 3614 }, { "epoch": 2.1995740797079404, "grad_norm": 0.66370689868927, "learning_rate": 2.2545312840278214e-05, "loss": 0.0853, "step": 3615 }, { "epoch": 2.2001825372680255, "grad_norm": 0.5838887095451355, "learning_rate": 2.2532924445359686e-05, "loss": 0.0373, "step": 3616 }, { "epoch": 2.2007909948281106, "grad_norm": 0.6079072952270508, "learning_rate": 2.252053666217305e-05, "loss": 0.0372, "step": 3617 }, { "epoch": 2.201399452388196, "grad_norm": 0.7246667742729187, "learning_rate": 2.250814949378998e-05, "loss": 0.057, "step": 3618 }, { "epoch": 2.202007909948281, "grad_norm": 0.5479733943939209, "learning_rate": 2.2495762943281974e-05, "loss": 0.0404, "step": 3619 }, { "epoch": 2.2026163675083663, "grad_norm": 0.5117501020431519, "learning_rate": 2.248337701372036e-05, "loss": 0.0419, "step": 3620 }, { "epoch": 2.2032248250684514, "grad_norm": 0.735734224319458, "learning_rate": 2.2470991708176364e-05, "loss": 0.0618, "step": 3621 }, { "epoch": 2.2038332826285365, "grad_norm": 0.4875301122665405, "learning_rate": 2.245860702972101e-05, "loss": 0.0497, "step": 3622 }, { "epoch": 2.204441740188622, "grad_norm": 0.6231479644775391, "learning_rate": 2.244622298142517e-05, "loss": 0.0457, "step": 3623 }, { "epoch": 2.205050197748707, "grad_norm": 0.5008909106254578, "learning_rate": 2.2433839566359593e-05, "loss": 0.0479, "step": 3624 }, { "epoch": 2.2056586553087922, "grad_norm": 0.49350735545158386, "learning_rate": 2.2421456787594845e-05, "loss": 0.0543, "step": 3625 }, { "epoch": 2.2062671128688773, "grad_norm": 0.6941964626312256, "learning_rate": 2.240907464820132e-05, "loss": 0.0747, "step": 3626 }, { "epoch": 2.2068755704289624, "grad_norm": 0.5441345572471619, "learning_rate": 2.2396693151249303e-05, "loss": 0.047, "step": 3627 }, { "epoch": 2.207484027989048, "grad_norm": 0.5408318042755127, "learning_rate": 2.2384312299808868e-05, "loss": 0.0556, "step": 3628 }, { "epoch": 2.208092485549133, "grad_norm": 0.6332383155822754, "learning_rate": 2.2371932096949957e-05, "loss": 0.0488, "step": 3629 }, { "epoch": 2.208700943109218, "grad_norm": 0.6663236021995544, "learning_rate": 2.2359552545742332e-05, "loss": 0.0381, "step": 3630 }, { "epoch": 2.2093094006693033, "grad_norm": 0.5764215588569641, "learning_rate": 2.2347173649255627e-05, "loss": 0.0576, "step": 3631 }, { "epoch": 2.2099178582293884, "grad_norm": 0.4953424632549286, "learning_rate": 2.2334795410559283e-05, "loss": 0.0395, "step": 3632 }, { "epoch": 2.2105263157894735, "grad_norm": 0.5949841737747192, "learning_rate": 2.2322417832722574e-05, "loss": 0.0563, "step": 3633 }, { "epoch": 2.211134773349559, "grad_norm": 0.5839807391166687, "learning_rate": 2.2310040918814647e-05, "loss": 0.0493, "step": 3634 }, { "epoch": 2.211743230909644, "grad_norm": 0.648003101348877, "learning_rate": 2.2297664671904447e-05, "loss": 0.0691, "step": 3635 }, { "epoch": 2.212351688469729, "grad_norm": 0.582639217376709, "learning_rate": 2.2285289095060764e-05, "loss": 0.0542, "step": 3636 }, { "epoch": 2.2129601460298143, "grad_norm": 0.6050490140914917, "learning_rate": 2.2272914191352234e-05, "loss": 0.0421, "step": 3637 }, { "epoch": 2.2135686035898994, "grad_norm": 0.8048490285873413, "learning_rate": 2.2260539963847317e-05, "loss": 0.0838, "step": 3638 }, { "epoch": 2.214177061149985, "grad_norm": 0.6212171316146851, "learning_rate": 2.2248166415614305e-05, "loss": 0.0601, "step": 3639 }, { "epoch": 2.21478551871007, "grad_norm": 0.5588234663009644, "learning_rate": 2.223579354972131e-05, "loss": 0.0384, "step": 3640 }, { "epoch": 2.215393976270155, "grad_norm": 0.5870838761329651, "learning_rate": 2.2223421369236304e-05, "loss": 0.0476, "step": 3641 }, { "epoch": 2.21600243383024, "grad_norm": 0.6339545845985413, "learning_rate": 2.2211049877227065e-05, "loss": 0.0791, "step": 3642 }, { "epoch": 2.2166108913903253, "grad_norm": 0.6535237431526184, "learning_rate": 2.2198679076761196e-05, "loss": 0.0458, "step": 3643 }, { "epoch": 2.217219348950411, "grad_norm": 0.4435776174068451, "learning_rate": 2.2186308970906166e-05, "loss": 0.0386, "step": 3644 }, { "epoch": 2.217827806510496, "grad_norm": 0.6395978331565857, "learning_rate": 2.217393956272923e-05, "loss": 0.0606, "step": 3645 }, { "epoch": 2.218436264070581, "grad_norm": 0.6212596297264099, "learning_rate": 2.2161570855297474e-05, "loss": 0.0597, "step": 3646 }, { "epoch": 2.219044721630666, "grad_norm": 0.49921971559524536, "learning_rate": 2.2149202851677842e-05, "loss": 0.041, "step": 3647 }, { "epoch": 2.2196531791907512, "grad_norm": 0.45042991638183594, "learning_rate": 2.213683555493708e-05, "loss": 0.0454, "step": 3648 }, { "epoch": 2.2202616367508368, "grad_norm": 0.5090251564979553, "learning_rate": 2.2124468968141746e-05, "loss": 0.0528, "step": 3649 }, { "epoch": 2.220870094310922, "grad_norm": 0.6496664881706238, "learning_rate": 2.211210309435826e-05, "loss": 0.0516, "step": 3650 }, { "epoch": 2.221478551871007, "grad_norm": 0.5032176375389099, "learning_rate": 2.2099737936652834e-05, "loss": 0.0365, "step": 3651 }, { "epoch": 2.222087009431092, "grad_norm": 0.5386142730712891, "learning_rate": 2.2087373498091505e-05, "loss": 0.0489, "step": 3652 }, { "epoch": 2.222695466991177, "grad_norm": 0.5866867303848267, "learning_rate": 2.2075009781740144e-05, "loss": 0.0507, "step": 3653 }, { "epoch": 2.2233039245512627, "grad_norm": 0.6343091726303101, "learning_rate": 2.2062646790664443e-05, "loss": 0.05, "step": 3654 }, { "epoch": 2.223912382111348, "grad_norm": 0.49765875935554504, "learning_rate": 2.2050284527929897e-05, "loss": 0.0349, "step": 3655 }, { "epoch": 2.224520839671433, "grad_norm": 0.6474698185920715, "learning_rate": 2.203792299660184e-05, "loss": 0.0544, "step": 3656 }, { "epoch": 2.225129297231518, "grad_norm": 0.5999079346656799, "learning_rate": 2.202556219974542e-05, "loss": 0.0387, "step": 3657 }, { "epoch": 2.225737754791603, "grad_norm": 0.5736773014068604, "learning_rate": 2.2013202140425584e-05, "loss": 0.0553, "step": 3658 }, { "epoch": 2.2263462123516886, "grad_norm": 0.6365640163421631, "learning_rate": 2.2000842821707122e-05, "loss": 0.046, "step": 3659 }, { "epoch": 2.2269546699117737, "grad_norm": 0.5798011422157288, "learning_rate": 2.198848424665464e-05, "loss": 0.0466, "step": 3660 }, { "epoch": 2.227563127471859, "grad_norm": 0.599982738494873, "learning_rate": 2.197612641833253e-05, "loss": 0.0608, "step": 3661 }, { "epoch": 2.228171585031944, "grad_norm": 0.5781305432319641, "learning_rate": 2.196376933980503e-05, "loss": 0.0446, "step": 3662 }, { "epoch": 2.228780042592029, "grad_norm": 0.6489582061767578, "learning_rate": 2.1951413014136177e-05, "loss": 0.0562, "step": 3663 }, { "epoch": 2.2293885001521145, "grad_norm": 0.5983100533485413, "learning_rate": 2.1939057444389822e-05, "loss": 0.046, "step": 3664 }, { "epoch": 2.2299969577121996, "grad_norm": 0.5980080366134644, "learning_rate": 2.192670263362964e-05, "loss": 0.0658, "step": 3665 }, { "epoch": 2.2306054152722847, "grad_norm": 0.5348019003868103, "learning_rate": 2.19143485849191e-05, "loss": 0.0498, "step": 3666 }, { "epoch": 2.23121387283237, "grad_norm": 0.5412232875823975, "learning_rate": 2.1901995301321493e-05, "loss": 0.0397, "step": 3667 }, { "epoch": 2.231822330392455, "grad_norm": 0.5473434925079346, "learning_rate": 2.1889642785899926e-05, "loss": 0.0388, "step": 3668 }, { "epoch": 2.2324307879525405, "grad_norm": 0.8068600296974182, "learning_rate": 2.1877291041717294e-05, "loss": 0.0524, "step": 3669 }, { "epoch": 2.2330392455126256, "grad_norm": 0.5853879451751709, "learning_rate": 2.1864940071836326e-05, "loss": 0.0582, "step": 3670 }, { "epoch": 2.2336477030727107, "grad_norm": 0.5249461531639099, "learning_rate": 2.1852589879319547e-05, "loss": 0.048, "step": 3671 }, { "epoch": 2.2342561606327958, "grad_norm": 0.5302576422691345, "learning_rate": 2.1840240467229283e-05, "loss": 0.0553, "step": 3672 }, { "epoch": 2.234864618192881, "grad_norm": 0.5220465660095215, "learning_rate": 2.1827891838627687e-05, "loss": 0.0431, "step": 3673 }, { "epoch": 2.2354730757529664, "grad_norm": 0.5852304697036743, "learning_rate": 2.1815543996576688e-05, "loss": 0.0496, "step": 3674 }, { "epoch": 2.2360815333130515, "grad_norm": 0.6456661820411682, "learning_rate": 2.1803196944138045e-05, "loss": 0.0519, "step": 3675 }, { "epoch": 2.2366899908731366, "grad_norm": 0.5966864824295044, "learning_rate": 2.1790850684373305e-05, "loss": 0.0358, "step": 3676 }, { "epoch": 2.2372984484332217, "grad_norm": 0.6074363589286804, "learning_rate": 2.1778505220343836e-05, "loss": 0.0507, "step": 3677 }, { "epoch": 2.2379069059933068, "grad_norm": 0.5367887616157532, "learning_rate": 2.17661605551108e-05, "loss": 0.0602, "step": 3678 }, { "epoch": 2.2385153635533923, "grad_norm": 0.5965633392333984, "learning_rate": 2.175381669173514e-05, "loss": 0.043, "step": 3679 }, { "epoch": 2.2391238211134774, "grad_norm": 0.5841217041015625, "learning_rate": 2.174147363327764e-05, "loss": 0.0536, "step": 3680 }, { "epoch": 2.2397322786735625, "grad_norm": 0.5816332101821899, "learning_rate": 2.1729131382798858e-05, "loss": 0.0578, "step": 3681 }, { "epoch": 2.2403407362336476, "grad_norm": 0.5089899301528931, "learning_rate": 2.1716789943359155e-05, "loss": 0.0424, "step": 3682 }, { "epoch": 2.2409491937937327, "grad_norm": 1.4226927757263184, "learning_rate": 2.1704449318018692e-05, "loss": 0.0814, "step": 3683 }, { "epoch": 2.2415576513538182, "grad_norm": 0.6067140102386475, "learning_rate": 2.1692109509837442e-05, "loss": 0.0553, "step": 3684 }, { "epoch": 2.2421661089139033, "grad_norm": 0.504880964756012, "learning_rate": 2.167977052187515e-05, "loss": 0.0329, "step": 3685 }, { "epoch": 2.2427745664739884, "grad_norm": 0.5495906472206116, "learning_rate": 2.1667432357191364e-05, "loss": 0.0526, "step": 3686 }, { "epoch": 2.2433830240340735, "grad_norm": 0.6342142820358276, "learning_rate": 2.1655095018845455e-05, "loss": 0.0491, "step": 3687 }, { "epoch": 2.2439914815941586, "grad_norm": 0.5818422436714172, "learning_rate": 2.1642758509896562e-05, "loss": 0.0383, "step": 3688 }, { "epoch": 2.244599939154244, "grad_norm": 0.6000524163246155, "learning_rate": 2.1630422833403613e-05, "loss": 0.0517, "step": 3689 }, { "epoch": 2.2452083967143293, "grad_norm": 0.5182064175605774, "learning_rate": 2.1618087992425356e-05, "loss": 0.0441, "step": 3690 }, { "epoch": 2.2458168542744144, "grad_norm": 0.6751769781112671, "learning_rate": 2.1605753990020315e-05, "loss": 0.0608, "step": 3691 }, { "epoch": 2.2464253118344994, "grad_norm": 0.6493861079216003, "learning_rate": 2.1593420829246794e-05, "loss": 0.0533, "step": 3692 }, { "epoch": 2.2470337693945845, "grad_norm": 0.6242604851722717, "learning_rate": 2.1581088513162923e-05, "loss": 0.0522, "step": 3693 }, { "epoch": 2.24764222695467, "grad_norm": 0.5824303030967712, "learning_rate": 2.1568757044826595e-05, "loss": 0.0507, "step": 3694 }, { "epoch": 2.248250684514755, "grad_norm": 0.6880918741226196, "learning_rate": 2.1556426427295488e-05, "loss": 0.0582, "step": 3695 }, { "epoch": 2.2488591420748403, "grad_norm": 0.6002342104911804, "learning_rate": 2.1544096663627104e-05, "loss": 0.053, "step": 3696 }, { "epoch": 2.2494675996349254, "grad_norm": 0.5104690790176392, "learning_rate": 2.1531767756878696e-05, "loss": 0.0394, "step": 3697 }, { "epoch": 2.2500760571950105, "grad_norm": 0.5821288228034973, "learning_rate": 2.151943971010732e-05, "loss": 0.0422, "step": 3698 }, { "epoch": 2.250684514755096, "grad_norm": 0.6568170785903931, "learning_rate": 2.150711252636981e-05, "loss": 0.0554, "step": 3699 }, { "epoch": 2.251292972315181, "grad_norm": 0.5166933536529541, "learning_rate": 2.1494786208722814e-05, "loss": 0.0454, "step": 3700 }, { "epoch": 2.251901429875266, "grad_norm": 0.6102219223976135, "learning_rate": 2.1482460760222733e-05, "loss": 0.0525, "step": 3701 }, { "epoch": 2.2525098874353513, "grad_norm": 0.5432948470115662, "learning_rate": 2.1470136183925755e-05, "loss": 0.0538, "step": 3702 }, { "epoch": 2.2531183449954364, "grad_norm": 0.5394953489303589, "learning_rate": 2.1457812482887882e-05, "loss": 0.0412, "step": 3703 }, { "epoch": 2.253726802555522, "grad_norm": 0.5707597136497498, "learning_rate": 2.1445489660164868e-05, "loss": 0.0498, "step": 3704 }, { "epoch": 2.254335260115607, "grad_norm": 0.5889186263084412, "learning_rate": 2.1433167718812247e-05, "loss": 0.0554, "step": 3705 }, { "epoch": 2.254943717675692, "grad_norm": 0.5888684391975403, "learning_rate": 2.142084666188537e-05, "loss": 0.0512, "step": 3706 }, { "epoch": 2.2555521752357772, "grad_norm": 0.4928998053073883, "learning_rate": 2.1408526492439336e-05, "loss": 0.0578, "step": 3707 }, { "epoch": 2.2561606327958623, "grad_norm": 0.5299960374832153, "learning_rate": 2.139620721352903e-05, "loss": 0.0405, "step": 3708 }, { "epoch": 2.256769090355948, "grad_norm": 0.5291889905929565, "learning_rate": 2.138388882820911e-05, "loss": 0.0555, "step": 3709 }, { "epoch": 2.257377547916033, "grad_norm": 0.6831831932067871, "learning_rate": 2.1371571339534046e-05, "loss": 0.0863, "step": 3710 }, { "epoch": 2.257986005476118, "grad_norm": 0.5145639181137085, "learning_rate": 2.135925475055805e-05, "loss": 0.0474, "step": 3711 }, { "epoch": 2.258594463036203, "grad_norm": 0.5370884537696838, "learning_rate": 2.134693906433511e-05, "loss": 0.0416, "step": 3712 }, { "epoch": 2.2592029205962882, "grad_norm": 0.5465013384819031, "learning_rate": 2.1334624283919026e-05, "loss": 0.0537, "step": 3713 }, { "epoch": 2.259811378156374, "grad_norm": 0.5709042549133301, "learning_rate": 2.132231041236334e-05, "loss": 0.0366, "step": 3714 }, { "epoch": 2.260419835716459, "grad_norm": 0.6409594416618347, "learning_rate": 2.1309997452721366e-05, "loss": 0.0586, "step": 3715 }, { "epoch": 2.261028293276544, "grad_norm": 0.634441077709198, "learning_rate": 2.129768540804623e-05, "loss": 0.0634, "step": 3716 }, { "epoch": 2.261636750836629, "grad_norm": 0.5557528734207153, "learning_rate": 2.128537428139079e-05, "loss": 0.0429, "step": 3717 }, { "epoch": 2.262245208396714, "grad_norm": 0.678367018699646, "learning_rate": 2.1273064075807686e-05, "loss": 0.0686, "step": 3718 }, { "epoch": 2.2628536659567997, "grad_norm": 0.5972436666488647, "learning_rate": 2.1260754794349354e-05, "loss": 0.0461, "step": 3719 }, { "epoch": 2.263462123516885, "grad_norm": 0.664558470249176, "learning_rate": 2.1248446440067976e-05, "loss": 0.0467, "step": 3720 }, { "epoch": 2.26407058107697, "grad_norm": 0.6116575598716736, "learning_rate": 2.1236139016015507e-05, "loss": 0.0517, "step": 3721 }, { "epoch": 2.264679038637055, "grad_norm": 0.5580343008041382, "learning_rate": 2.1223832525243663e-05, "loss": 0.0533, "step": 3722 }, { "epoch": 2.26528749619714, "grad_norm": 0.5420104265213013, "learning_rate": 2.1211526970803973e-05, "loss": 0.039, "step": 3723 }, { "epoch": 2.2658959537572256, "grad_norm": 1.199730634689331, "learning_rate": 2.1199222355747674e-05, "loss": 0.0618, "step": 3724 }, { "epoch": 2.2665044113173107, "grad_norm": 0.6017125844955444, "learning_rate": 2.1186918683125802e-05, "loss": 0.0559, "step": 3725 }, { "epoch": 2.267112868877396, "grad_norm": 0.5252535343170166, "learning_rate": 2.117461595598917e-05, "loss": 0.0335, "step": 3726 }, { "epoch": 2.267721326437481, "grad_norm": 0.6079027652740479, "learning_rate": 2.1162314177388327e-05, "loss": 0.0508, "step": 3727 }, { "epoch": 2.268329783997566, "grad_norm": 0.49775218963623047, "learning_rate": 2.1150013350373594e-05, "loss": 0.0402, "step": 3728 }, { "epoch": 2.2689382415576516, "grad_norm": 0.5613334774971008, "learning_rate": 2.113771347799509e-05, "loss": 0.0483, "step": 3729 }, { "epoch": 2.2695466991177367, "grad_norm": 0.5892758965492249, "learning_rate": 2.1125414563302654e-05, "loss": 0.0589, "step": 3730 }, { "epoch": 2.2701551566778218, "grad_norm": 0.6577876806259155, "learning_rate": 2.1113116609345893e-05, "loss": 0.0417, "step": 3731 }, { "epoch": 2.270763614237907, "grad_norm": 0.6426102519035339, "learning_rate": 2.110081961917421e-05, "loss": 0.0514, "step": 3732 }, { "epoch": 2.271372071797992, "grad_norm": 0.5062007308006287, "learning_rate": 2.108852359583674e-05, "loss": 0.0332, "step": 3733 }, { "epoch": 2.2719805293580775, "grad_norm": 0.5272140502929688, "learning_rate": 2.1076228542382376e-05, "loss": 0.0393, "step": 3734 }, { "epoch": 2.2725889869181626, "grad_norm": 0.6248688697814941, "learning_rate": 2.1063934461859774e-05, "loss": 0.0518, "step": 3735 }, { "epoch": 2.2731974444782477, "grad_norm": 0.6348571181297302, "learning_rate": 2.105164135731737e-05, "loss": 0.0552, "step": 3736 }, { "epoch": 2.2738059020383328, "grad_norm": 0.5358805656433105, "learning_rate": 2.1039349231803337e-05, "loss": 0.0462, "step": 3737 }, { "epoch": 2.274414359598418, "grad_norm": 0.5874552130699158, "learning_rate": 2.1027058088365593e-05, "loss": 0.0503, "step": 3738 }, { "epoch": 2.2750228171585034, "grad_norm": 0.5456854104995728, "learning_rate": 2.1014767930051856e-05, "loss": 0.0462, "step": 3739 }, { "epoch": 2.2756312747185885, "grad_norm": 0.5097061991691589, "learning_rate": 2.1002478759909558e-05, "loss": 0.0488, "step": 3740 }, { "epoch": 2.2762397322786736, "grad_norm": 0.5960326790809631, "learning_rate": 2.0990190580985892e-05, "loss": 0.057, "step": 3741 }, { "epoch": 2.2768481898387587, "grad_norm": 0.6707984805107117, "learning_rate": 2.0977903396327837e-05, "loss": 0.0909, "step": 3742 }, { "epoch": 2.277456647398844, "grad_norm": 0.5370257496833801, "learning_rate": 2.096561720898209e-05, "loss": 0.0396, "step": 3743 }, { "epoch": 2.2780651049589293, "grad_norm": 0.5569398403167725, "learning_rate": 2.0953332021995114e-05, "loss": 0.0492, "step": 3744 }, { "epoch": 2.2786735625190144, "grad_norm": 0.5827397108078003, "learning_rate": 2.094104783841311e-05, "loss": 0.0591, "step": 3745 }, { "epoch": 2.2792820200790995, "grad_norm": 0.5374563336372375, "learning_rate": 2.0928764661282068e-05, "loss": 0.0438, "step": 3746 }, { "epoch": 2.2798904776391846, "grad_norm": 0.41728445887565613, "learning_rate": 2.0916482493647694e-05, "loss": 0.0278, "step": 3747 }, { "epoch": 2.2804989351992697, "grad_norm": 0.5607496500015259, "learning_rate": 2.0904201338555442e-05, "loss": 0.0335, "step": 3748 }, { "epoch": 2.2811073927593553, "grad_norm": 0.6181521415710449, "learning_rate": 2.0891921199050547e-05, "loss": 0.0428, "step": 3749 }, { "epoch": 2.2817158503194404, "grad_norm": 0.6066822409629822, "learning_rate": 2.087964207817796e-05, "loss": 0.051, "step": 3750 }, { "epoch": 2.2823243078795254, "grad_norm": 0.5879988670349121, "learning_rate": 2.0867363978982383e-05, "loss": 0.0372, "step": 3751 }, { "epoch": 2.2829327654396105, "grad_norm": 0.6240618228912354, "learning_rate": 2.085508690450829e-05, "loss": 0.0432, "step": 3752 }, { "epoch": 2.2835412229996956, "grad_norm": 0.5587534308433533, "learning_rate": 2.084281085779988e-05, "loss": 0.0415, "step": 3753 }, { "epoch": 2.284149680559781, "grad_norm": 0.545318067073822, "learning_rate": 2.083053584190109e-05, "loss": 0.0417, "step": 3754 }, { "epoch": 2.2847581381198663, "grad_norm": 0.5254773497581482, "learning_rate": 2.0818261859855624e-05, "loss": 0.0284, "step": 3755 }, { "epoch": 2.2853665956799514, "grad_norm": 0.6514537930488586, "learning_rate": 2.0805988914706913e-05, "loss": 0.0714, "step": 3756 }, { "epoch": 2.2859750532400365, "grad_norm": 0.5673707127571106, "learning_rate": 2.079371700949814e-05, "loss": 0.0426, "step": 3757 }, { "epoch": 2.2865835108001216, "grad_norm": 0.8576644062995911, "learning_rate": 2.078144614727221e-05, "loss": 0.0707, "step": 3758 }, { "epoch": 2.287191968360207, "grad_norm": 0.6915388107299805, "learning_rate": 2.0769176331071805e-05, "loss": 0.052, "step": 3759 }, { "epoch": 2.287800425920292, "grad_norm": 0.536024808883667, "learning_rate": 2.0756907563939327e-05, "loss": 0.041, "step": 3760 }, { "epoch": 2.2884088834803773, "grad_norm": 0.5904122591018677, "learning_rate": 2.0744639848916895e-05, "loss": 0.047, "step": 3761 }, { "epoch": 2.2890173410404624, "grad_norm": 0.49669381976127625, "learning_rate": 2.073237318904642e-05, "loss": 0.0417, "step": 3762 }, { "epoch": 2.2896257986005475, "grad_norm": 0.540749192237854, "learning_rate": 2.0720107587369513e-05, "loss": 0.0603, "step": 3763 }, { "epoch": 2.290234256160633, "grad_norm": 0.5298412442207336, "learning_rate": 2.070784304692752e-05, "loss": 0.056, "step": 3764 }, { "epoch": 2.290842713720718, "grad_norm": 0.623453676700592, "learning_rate": 2.0695579570761552e-05, "loss": 0.055, "step": 3765 }, { "epoch": 2.2914511712808032, "grad_norm": 0.5402014255523682, "learning_rate": 2.0683317161912437e-05, "loss": 0.0448, "step": 3766 }, { "epoch": 2.2920596288408883, "grad_norm": 0.5555523633956909, "learning_rate": 2.067105582342073e-05, "loss": 0.0432, "step": 3767 }, { "epoch": 2.2926680864009734, "grad_norm": 0.4748152792453766, "learning_rate": 2.0658795558326743e-05, "loss": 0.0386, "step": 3768 }, { "epoch": 2.293276543961059, "grad_norm": 0.5108730792999268, "learning_rate": 2.064653636967051e-05, "loss": 0.0442, "step": 3769 }, { "epoch": 2.293885001521144, "grad_norm": 0.6018065810203552, "learning_rate": 2.063427826049179e-05, "loss": 0.0436, "step": 3770 }, { "epoch": 2.294493459081229, "grad_norm": 0.5590872168540955, "learning_rate": 2.062202123383009e-05, "loss": 0.0568, "step": 3771 }, { "epoch": 2.2951019166413142, "grad_norm": 0.6751280426979065, "learning_rate": 2.0609765292724647e-05, "loss": 0.0736, "step": 3772 }, { "epoch": 2.2957103742013993, "grad_norm": 0.5929213762283325, "learning_rate": 2.059751044021441e-05, "loss": 0.0584, "step": 3773 }, { "epoch": 2.296318831761485, "grad_norm": 1.4224424362182617, "learning_rate": 2.0585256679338083e-05, "loss": 0.0544, "step": 3774 }, { "epoch": 2.29692728932157, "grad_norm": 0.5579968690872192, "learning_rate": 2.0573004013134088e-05, "loss": 0.0644, "step": 3775 }, { "epoch": 2.297535746881655, "grad_norm": 0.6264181137084961, "learning_rate": 2.0560752444640567e-05, "loss": 0.0596, "step": 3776 }, { "epoch": 2.29814420444174, "grad_norm": 0.6934359669685364, "learning_rate": 2.0548501976895395e-05, "loss": 0.0666, "step": 3777 }, { "epoch": 2.2987526620018253, "grad_norm": 0.5914580821990967, "learning_rate": 2.0536252612936196e-05, "loss": 0.0455, "step": 3778 }, { "epoch": 2.299361119561911, "grad_norm": 0.4668748080730438, "learning_rate": 2.0524004355800292e-05, "loss": 0.0424, "step": 3779 }, { "epoch": 2.299969577121996, "grad_norm": 0.5458466410636902, "learning_rate": 2.0511757208524734e-05, "loss": 0.0447, "step": 3780 }, { "epoch": 2.300578034682081, "grad_norm": 0.6712079644203186, "learning_rate": 2.0499511174146307e-05, "loss": 0.0512, "step": 3781 }, { "epoch": 2.301186492242166, "grad_norm": 0.5523056387901306, "learning_rate": 2.0487266255701527e-05, "loss": 0.0471, "step": 3782 }, { "epoch": 2.301794949802251, "grad_norm": 0.612053632736206, "learning_rate": 2.0475022456226606e-05, "loss": 0.0547, "step": 3783 }, { "epoch": 2.3024034073623363, "grad_norm": 0.6411483883857727, "learning_rate": 2.0462779778757507e-05, "loss": 0.0698, "step": 3784 }, { "epoch": 2.303011864922422, "grad_norm": 0.5178706645965576, "learning_rate": 2.045053822632991e-05, "loss": 0.0485, "step": 3785 }, { "epoch": 2.303620322482507, "grad_norm": 0.5511070489883423, "learning_rate": 2.0438297801979196e-05, "loss": 0.0442, "step": 3786 }, { "epoch": 2.304228780042592, "grad_norm": 0.5382661819458008, "learning_rate": 2.0426058508740483e-05, "loss": 0.0564, "step": 3787 }, { "epoch": 2.304837237602677, "grad_norm": 0.5051350593566895, "learning_rate": 2.041382034964862e-05, "loss": 0.0333, "step": 3788 }, { "epoch": 2.305445695162762, "grad_norm": 0.6360859870910645, "learning_rate": 2.040158332773814e-05, "loss": 0.0551, "step": 3789 }, { "epoch": 2.3060541527228477, "grad_norm": 0.651378333568573, "learning_rate": 2.0389347446043325e-05, "loss": 0.0489, "step": 3790 }, { "epoch": 2.306662610282933, "grad_norm": 0.4725477397441864, "learning_rate": 2.0377112707598163e-05, "loss": 0.0325, "step": 3791 }, { "epoch": 2.307271067843018, "grad_norm": 0.6588869690895081, "learning_rate": 2.0364879115436352e-05, "loss": 0.0461, "step": 3792 }, { "epoch": 2.307879525403103, "grad_norm": 0.5729846954345703, "learning_rate": 2.0352646672591328e-05, "loss": 0.0658, "step": 3793 }, { "epoch": 2.308487982963188, "grad_norm": 0.6155021786689758, "learning_rate": 2.034041538209621e-05, "loss": 0.0583, "step": 3794 }, { "epoch": 2.3090964405232737, "grad_norm": 0.5684482455253601, "learning_rate": 2.0328185246983862e-05, "loss": 0.0488, "step": 3795 }, { "epoch": 2.3097048980833588, "grad_norm": 0.5797809362411499, "learning_rate": 2.031595627028684e-05, "loss": 0.0394, "step": 3796 }, { "epoch": 2.310313355643444, "grad_norm": 0.7263544201850891, "learning_rate": 2.0303728455037422e-05, "loss": 0.0685, "step": 3797 }, { "epoch": 2.310921813203529, "grad_norm": 0.5464826226234436, "learning_rate": 2.0291501804267592e-05, "loss": 0.0483, "step": 3798 }, { "epoch": 2.311530270763614, "grad_norm": 0.5334215760231018, "learning_rate": 2.0279276321009067e-05, "loss": 0.0416, "step": 3799 }, { "epoch": 2.3121387283236996, "grad_norm": 0.5960489511489868, "learning_rate": 2.026705200829323e-05, "loss": 0.0465, "step": 3800 }, { "epoch": 2.3127471858837847, "grad_norm": 0.5816206932067871, "learning_rate": 2.0254828869151225e-05, "loss": 0.0482, "step": 3801 }, { "epoch": 2.31335564344387, "grad_norm": 0.5829434990882874, "learning_rate": 2.0242606906613876e-05, "loss": 0.0693, "step": 3802 }, { "epoch": 2.313964101003955, "grad_norm": 0.5510879158973694, "learning_rate": 2.0230386123711714e-05, "loss": 0.0428, "step": 3803 }, { "epoch": 2.31457255856404, "grad_norm": 0.6710597276687622, "learning_rate": 2.0218166523474973e-05, "loss": 0.0483, "step": 3804 }, { "epoch": 2.3151810161241255, "grad_norm": 0.5956816673278809, "learning_rate": 2.020594810893363e-05, "loss": 0.0566, "step": 3805 }, { "epoch": 2.3157894736842106, "grad_norm": 0.6392737030982971, "learning_rate": 2.0193730883117335e-05, "loss": 0.0547, "step": 3806 }, { "epoch": 2.3163979312442957, "grad_norm": 0.548627495765686, "learning_rate": 2.018151484905543e-05, "loss": 0.0348, "step": 3807 }, { "epoch": 2.317006388804381, "grad_norm": 0.560031533241272, "learning_rate": 2.016930000977701e-05, "loss": 0.0522, "step": 3808 }, { "epoch": 2.317614846364466, "grad_norm": 0.6135550141334534, "learning_rate": 2.0157086368310836e-05, "loss": 0.0511, "step": 3809 }, { "epoch": 2.3182233039245514, "grad_norm": 0.6668503284454346, "learning_rate": 2.014487392768537e-05, "loss": 0.0485, "step": 3810 }, { "epoch": 2.3188317614846365, "grad_norm": 0.6775378584861755, "learning_rate": 2.01326626909288e-05, "loss": 0.0683, "step": 3811 }, { "epoch": 2.3194402190447216, "grad_norm": 0.48050111532211304, "learning_rate": 2.012045266106901e-05, "loss": 0.0377, "step": 3812 }, { "epoch": 2.3200486766048067, "grad_norm": 0.9371777772903442, "learning_rate": 2.010824384113357e-05, "loss": 0.0584, "step": 3813 }, { "epoch": 2.320657134164892, "grad_norm": 0.6224126815795898, "learning_rate": 2.0096036234149746e-05, "loss": 0.0479, "step": 3814 }, { "epoch": 2.3212655917249774, "grad_norm": 0.6318977475166321, "learning_rate": 2.0083829843144543e-05, "loss": 0.0657, "step": 3815 }, { "epoch": 2.3218740492850625, "grad_norm": 0.5769774913787842, "learning_rate": 2.0071624671144616e-05, "loss": 0.0409, "step": 3816 }, { "epoch": 2.3224825068451476, "grad_norm": 0.5656179189682007, "learning_rate": 2.005942072117634e-05, "loss": 0.0515, "step": 3817 }, { "epoch": 2.3230909644052327, "grad_norm": 0.5796024799346924, "learning_rate": 2.00472179962658e-05, "loss": 0.0501, "step": 3818 }, { "epoch": 2.3236994219653178, "grad_norm": 0.5330328345298767, "learning_rate": 2.0035016499438758e-05, "loss": 0.0495, "step": 3819 }, { "epoch": 2.324307879525403, "grad_norm": 0.6313185691833496, "learning_rate": 2.002281623372066e-05, "loss": 0.045, "step": 3820 }, { "epoch": 2.3249163370854884, "grad_norm": 0.5151857137680054, "learning_rate": 2.001061720213669e-05, "loss": 0.0319, "step": 3821 }, { "epoch": 2.3255247946455735, "grad_norm": 0.6799402236938477, "learning_rate": 1.9998419407711686e-05, "loss": 0.0514, "step": 3822 }, { "epoch": 2.3261332522056586, "grad_norm": 0.499461829662323, "learning_rate": 1.998622285347018e-05, "loss": 0.0368, "step": 3823 }, { "epoch": 2.3267417097657437, "grad_norm": 0.5110387206077576, "learning_rate": 1.9974027542436433e-05, "loss": 0.0464, "step": 3824 }, { "epoch": 2.327350167325829, "grad_norm": 0.7454649806022644, "learning_rate": 1.996183347763436e-05, "loss": 0.076, "step": 3825 }, { "epoch": 2.3279586248859143, "grad_norm": 0.5358094573020935, "learning_rate": 1.994964066208759e-05, "loss": 0.0465, "step": 3826 }, { "epoch": 2.3285670824459994, "grad_norm": 0.6061232686042786, "learning_rate": 1.993744909881941e-05, "loss": 0.0434, "step": 3827 }, { "epoch": 2.3291755400060845, "grad_norm": 0.6150352358818054, "learning_rate": 1.9925258790852852e-05, "loss": 0.0528, "step": 3828 }, { "epoch": 2.3297839975661696, "grad_norm": 0.5760040283203125, "learning_rate": 1.9913069741210583e-05, "loss": 0.0573, "step": 3829 }, { "epoch": 2.3303924551262547, "grad_norm": 0.6556773781776428, "learning_rate": 1.9900881952914975e-05, "loss": 0.0423, "step": 3830 }, { "epoch": 2.3310009126863402, "grad_norm": 0.5123857259750366, "learning_rate": 1.9888695428988115e-05, "loss": 0.0432, "step": 3831 }, { "epoch": 2.3316093702464253, "grad_norm": 0.651202917098999, "learning_rate": 1.9876510172451733e-05, "loss": 0.0423, "step": 3832 }, { "epoch": 2.3322178278065104, "grad_norm": 0.4787401258945465, "learning_rate": 1.9864326186327265e-05, "loss": 0.0382, "step": 3833 }, { "epoch": 2.3328262853665955, "grad_norm": 0.5490178465843201, "learning_rate": 1.9852143473635847e-05, "loss": 0.0627, "step": 3834 }, { "epoch": 2.3334347429266806, "grad_norm": 0.47695285081863403, "learning_rate": 1.9839962037398276e-05, "loss": 0.0515, "step": 3835 }, { "epoch": 2.334043200486766, "grad_norm": 0.5950469374656677, "learning_rate": 1.9827781880635034e-05, "loss": 0.0443, "step": 3836 }, { "epoch": 2.3346516580468513, "grad_norm": 0.5753946304321289, "learning_rate": 1.981560300636629e-05, "loss": 0.0452, "step": 3837 }, { "epoch": 2.3352601156069364, "grad_norm": 0.5735435485839844, "learning_rate": 1.9803425417611915e-05, "loss": 0.0579, "step": 3838 }, { "epoch": 2.3358685731670215, "grad_norm": 0.5695839524269104, "learning_rate": 1.979124911739143e-05, "loss": 0.0466, "step": 3839 }, { "epoch": 2.3364770307271066, "grad_norm": 0.4902591407299042, "learning_rate": 1.9779074108724042e-05, "loss": 0.0431, "step": 3840 }, { "epoch": 2.337085488287192, "grad_norm": 0.6744665503501892, "learning_rate": 1.9766900394628668e-05, "loss": 0.0583, "step": 3841 }, { "epoch": 2.337693945847277, "grad_norm": 0.5691081285476685, "learning_rate": 1.975472797812387e-05, "loss": 0.0315, "step": 3842 }, { "epoch": 2.3383024034073623, "grad_norm": 0.5368900299072266, "learning_rate": 1.9742556862227888e-05, "loss": 0.0418, "step": 3843 }, { "epoch": 2.3389108609674474, "grad_norm": 0.5534371137619019, "learning_rate": 1.9730387049958672e-05, "loss": 0.0415, "step": 3844 }, { "epoch": 2.3395193185275325, "grad_norm": 0.5242774486541748, "learning_rate": 1.971821854433382e-05, "loss": 0.0447, "step": 3845 }, { "epoch": 2.340127776087618, "grad_norm": 0.6565991044044495, "learning_rate": 1.9706051348370608e-05, "loss": 0.0448, "step": 3846 }, { "epoch": 2.340736233647703, "grad_norm": 0.4980752766132355, "learning_rate": 1.9693885465086008e-05, "loss": 0.044, "step": 3847 }, { "epoch": 2.341344691207788, "grad_norm": 0.5127741694450378, "learning_rate": 1.968172089749664e-05, "loss": 0.0374, "step": 3848 }, { "epoch": 2.3419531487678733, "grad_norm": 0.5670256614685059, "learning_rate": 1.9669557648618815e-05, "loss": 0.0474, "step": 3849 }, { "epoch": 2.3425616063279584, "grad_norm": 0.6457985043525696, "learning_rate": 1.9657395721468494e-05, "loss": 0.06, "step": 3850 }, { "epoch": 2.343170063888044, "grad_norm": 0.6051198244094849, "learning_rate": 1.964523511906136e-05, "loss": 0.0545, "step": 3851 }, { "epoch": 2.343778521448129, "grad_norm": 0.6616335511207581, "learning_rate": 1.9633075844412714e-05, "loss": 0.0371, "step": 3852 }, { "epoch": 2.344386979008214, "grad_norm": 0.5447221398353577, "learning_rate": 1.9620917900537546e-05, "loss": 0.0435, "step": 3853 }, { "epoch": 2.3449954365682992, "grad_norm": 0.518090546131134, "learning_rate": 1.960876129045054e-05, "loss": 0.0454, "step": 3854 }, { "epoch": 2.3456038941283843, "grad_norm": 0.5191083550453186, "learning_rate": 1.959660601716601e-05, "loss": 0.0364, "step": 3855 }, { "epoch": 2.34621235168847, "grad_norm": 0.6573032140731812, "learning_rate": 1.958445208369796e-05, "loss": 0.0589, "step": 3856 }, { "epoch": 2.346820809248555, "grad_norm": 0.5200658440589905, "learning_rate": 1.9572299493060067e-05, "loss": 0.0445, "step": 3857 }, { "epoch": 2.34742926680864, "grad_norm": 0.5530250072479248, "learning_rate": 1.9560148248265662e-05, "loss": 0.0455, "step": 3858 }, { "epoch": 2.348037724368725, "grad_norm": 0.7511923313140869, "learning_rate": 1.954799835232774e-05, "loss": 0.0459, "step": 3859 }, { "epoch": 2.3486461819288103, "grad_norm": 0.48297080397605896, "learning_rate": 1.9535849808258985e-05, "loss": 0.0386, "step": 3860 }, { "epoch": 2.349254639488896, "grad_norm": 0.4536675214767456, "learning_rate": 1.9523702619071722e-05, "loss": 0.0357, "step": 3861 }, { "epoch": 2.349863097048981, "grad_norm": 0.522323727607727, "learning_rate": 1.9511556787777945e-05, "loss": 0.0429, "step": 3862 }, { "epoch": 2.350471554609066, "grad_norm": 0.5316919088363647, "learning_rate": 1.9499412317389305e-05, "loss": 0.0322, "step": 3863 }, { "epoch": 2.351080012169151, "grad_norm": 0.5202551484107971, "learning_rate": 1.9487269210917144e-05, "loss": 0.0453, "step": 3864 }, { "epoch": 2.351688469729236, "grad_norm": 0.6849666833877563, "learning_rate": 1.9475127471372436e-05, "loss": 0.0488, "step": 3865 }, { "epoch": 2.3522969272893217, "grad_norm": 0.531730592250824, "learning_rate": 1.946298710176582e-05, "loss": 0.0435, "step": 3866 }, { "epoch": 2.352905384849407, "grad_norm": 0.6236624121665955, "learning_rate": 1.9450848105107615e-05, "loss": 0.0723, "step": 3867 }, { "epoch": 2.353513842409492, "grad_norm": 0.502086877822876, "learning_rate": 1.9438710484407786e-05, "loss": 0.0486, "step": 3868 }, { "epoch": 2.354122299969577, "grad_norm": 0.4249696731567383, "learning_rate": 1.9426574242675932e-05, "loss": 0.0282, "step": 3869 }, { "epoch": 2.354730757529662, "grad_norm": 0.6139441132545471, "learning_rate": 1.941443938292137e-05, "loss": 0.0492, "step": 3870 }, { "epoch": 2.3553392150897476, "grad_norm": 0.4936158061027527, "learning_rate": 1.9402305908153025e-05, "loss": 0.0392, "step": 3871 }, { "epoch": 2.3559476726498327, "grad_norm": 0.5353962182998657, "learning_rate": 1.9390173821379486e-05, "loss": 0.0347, "step": 3872 }, { "epoch": 2.356556130209918, "grad_norm": 0.46766939759254456, "learning_rate": 1.9378043125609003e-05, "loss": 0.041, "step": 3873 }, { "epoch": 2.357164587770003, "grad_norm": 0.5060580372810364, "learning_rate": 1.93659138238495e-05, "loss": 0.0414, "step": 3874 }, { "epoch": 2.357773045330088, "grad_norm": 0.5533692240715027, "learning_rate": 1.935378591910853e-05, "loss": 0.0365, "step": 3875 }, { "epoch": 2.3583815028901736, "grad_norm": 0.5114442110061646, "learning_rate": 1.934165941439329e-05, "loss": 0.0452, "step": 3876 }, { "epoch": 2.3589899604502587, "grad_norm": 0.563607394695282, "learning_rate": 1.932953431271068e-05, "loss": 0.06, "step": 3877 }, { "epoch": 2.3595984180103438, "grad_norm": 0.4950112998485565, "learning_rate": 1.93174106170672e-05, "loss": 0.047, "step": 3878 }, { "epoch": 2.360206875570429, "grad_norm": 0.6233085989952087, "learning_rate": 1.930528833046902e-05, "loss": 0.0455, "step": 3879 }, { "epoch": 2.360815333130514, "grad_norm": 0.5214982628822327, "learning_rate": 1.9293167455921972e-05, "loss": 0.0359, "step": 3880 }, { "epoch": 2.3614237906905995, "grad_norm": 0.5291669368743896, "learning_rate": 1.9281047996431527e-05, "loss": 0.041, "step": 3881 }, { "epoch": 2.3620322482506846, "grad_norm": 0.7979464530944824, "learning_rate": 1.9268929955002787e-05, "loss": 0.0629, "step": 3882 }, { "epoch": 2.3626407058107697, "grad_norm": 0.5536260008811951, "learning_rate": 1.9256813334640546e-05, "loss": 0.05, "step": 3883 }, { "epoch": 2.3632491633708548, "grad_norm": 0.5184295177459717, "learning_rate": 1.9244698138349212e-05, "loss": 0.0453, "step": 3884 }, { "epoch": 2.36385762093094, "grad_norm": 0.632335901260376, "learning_rate": 1.9232584369132848e-05, "loss": 0.0579, "step": 3885 }, { "epoch": 2.3644660784910254, "grad_norm": 0.611529529094696, "learning_rate": 1.922047202999515e-05, "loss": 0.0749, "step": 3886 }, { "epoch": 2.3650745360511105, "grad_norm": 0.6848330497741699, "learning_rate": 1.9208361123939498e-05, "loss": 0.0487, "step": 3887 }, { "epoch": 2.3656829936111956, "grad_norm": 0.5429683923721313, "learning_rate": 1.9196251653968877e-05, "loss": 0.0382, "step": 3888 }, { "epoch": 2.3662914511712807, "grad_norm": 0.5638923645019531, "learning_rate": 1.9184143623085926e-05, "loss": 0.0403, "step": 3889 }, { "epoch": 2.366899908731366, "grad_norm": 0.5319681167602539, "learning_rate": 1.917203703429295e-05, "loss": 0.0563, "step": 3890 }, { "epoch": 2.3675083662914513, "grad_norm": 0.5609192848205566, "learning_rate": 1.9159931890591865e-05, "loss": 0.0425, "step": 3891 }, { "epoch": 2.3681168238515364, "grad_norm": 0.5878105759620667, "learning_rate": 1.9147828194984236e-05, "loss": 0.0424, "step": 3892 }, { "epoch": 2.3687252814116215, "grad_norm": 0.5947871208190918, "learning_rate": 1.91357259504713e-05, "loss": 0.0574, "step": 3893 }, { "epoch": 2.3693337389717066, "grad_norm": 0.4823724031448364, "learning_rate": 1.9123625160053885e-05, "loss": 0.0452, "step": 3894 }, { "epoch": 2.3699421965317917, "grad_norm": 0.5193595886230469, "learning_rate": 1.911152582673249e-05, "loss": 0.0459, "step": 3895 }, { "epoch": 2.3705506540918773, "grad_norm": 0.6103157997131348, "learning_rate": 1.9099427953507245e-05, "loss": 0.0555, "step": 3896 }, { "epoch": 2.3711591116519624, "grad_norm": 0.5717165470123291, "learning_rate": 1.9087331543377925e-05, "loss": 0.0421, "step": 3897 }, { "epoch": 2.3717675692120475, "grad_norm": 0.5617209076881409, "learning_rate": 1.9075236599343927e-05, "loss": 0.0419, "step": 3898 }, { "epoch": 2.3723760267721326, "grad_norm": 0.503222644329071, "learning_rate": 1.9063143124404293e-05, "loss": 0.054, "step": 3899 }, { "epoch": 2.3729844843322176, "grad_norm": 0.6546376943588257, "learning_rate": 1.9051051121557712e-05, "loss": 0.0527, "step": 3900 }, { "epoch": 2.373592941892303, "grad_norm": 0.5000297427177429, "learning_rate": 1.903896059380248e-05, "loss": 0.0388, "step": 3901 }, { "epoch": 2.3742013994523883, "grad_norm": 0.6017688512802124, "learning_rate": 1.9026871544136554e-05, "loss": 0.0479, "step": 3902 }, { "epoch": 2.3748098570124734, "grad_norm": 0.5484517216682434, "learning_rate": 1.901478397555752e-05, "loss": 0.0428, "step": 3903 }, { "epoch": 2.3754183145725585, "grad_norm": 0.6508259177207947, "learning_rate": 1.9002697891062572e-05, "loss": 0.0521, "step": 3904 }, { "epoch": 2.3760267721326436, "grad_norm": 0.6614736914634705, "learning_rate": 1.8990613293648572e-05, "loss": 0.0626, "step": 3905 }, { "epoch": 2.376635229692729, "grad_norm": 0.5715472102165222, "learning_rate": 1.8978530186312e-05, "loss": 0.0365, "step": 3906 }, { "epoch": 2.377243687252814, "grad_norm": 0.4740070104598999, "learning_rate": 1.8966448572048946e-05, "loss": 0.0401, "step": 3907 }, { "epoch": 2.3778521448128993, "grad_norm": 0.4682742655277252, "learning_rate": 1.895436845385516e-05, "loss": 0.0391, "step": 3908 }, { "epoch": 2.3784606023729844, "grad_norm": 0.6115542054176331, "learning_rate": 1.8942289834725995e-05, "loss": 0.0662, "step": 3909 }, { "epoch": 2.3790690599330695, "grad_norm": 0.45415550470352173, "learning_rate": 1.8930212717656464e-05, "loss": 0.0377, "step": 3910 }, { "epoch": 2.379677517493155, "grad_norm": 0.6085162162780762, "learning_rate": 1.891813710564117e-05, "loss": 0.0598, "step": 3911 }, { "epoch": 2.38028597505324, "grad_norm": 0.5516512393951416, "learning_rate": 1.8906063001674368e-05, "loss": 0.0526, "step": 3912 }, { "epoch": 2.3808944326133252, "grad_norm": 0.5332078337669373, "learning_rate": 1.8893990408749943e-05, "loss": 0.0507, "step": 3913 }, { "epoch": 2.3815028901734103, "grad_norm": 0.5707116723060608, "learning_rate": 1.8881919329861377e-05, "loss": 0.0486, "step": 3914 }, { "epoch": 2.3821113477334954, "grad_norm": 0.5801469683647156, "learning_rate": 1.8869849768001803e-05, "loss": 0.05, "step": 3915 }, { "epoch": 2.382719805293581, "grad_norm": 0.4456758201122284, "learning_rate": 1.8857781726163977e-05, "loss": 0.0382, "step": 3916 }, { "epoch": 2.383328262853666, "grad_norm": 0.5199500322341919, "learning_rate": 1.8845715207340254e-05, "loss": 0.0443, "step": 3917 }, { "epoch": 2.383936720413751, "grad_norm": 0.6407058835029602, "learning_rate": 1.8833650214522643e-05, "loss": 0.065, "step": 3918 }, { "epoch": 2.3845451779738362, "grad_norm": 0.5200806856155396, "learning_rate": 1.882158675070275e-05, "loss": 0.0285, "step": 3919 }, { "epoch": 2.3851536355339213, "grad_norm": 0.5379014611244202, "learning_rate": 1.880952481887181e-05, "loss": 0.0615, "step": 3920 }, { "epoch": 2.385762093094007, "grad_norm": 0.46084079146385193, "learning_rate": 1.879746442202069e-05, "loss": 0.0379, "step": 3921 }, { "epoch": 2.386370550654092, "grad_norm": 0.5246143937110901, "learning_rate": 1.878540556313986e-05, "loss": 0.0558, "step": 3922 }, { "epoch": 2.386979008214177, "grad_norm": 0.5521069169044495, "learning_rate": 1.8773348245219403e-05, "loss": 0.0533, "step": 3923 }, { "epoch": 2.387587465774262, "grad_norm": 0.6149189472198486, "learning_rate": 1.876129247124905e-05, "loss": 0.0434, "step": 3924 }, { "epoch": 2.3881959233343473, "grad_norm": 0.6903012990951538, "learning_rate": 1.874923824421812e-05, "loss": 0.052, "step": 3925 }, { "epoch": 2.388804380894433, "grad_norm": 0.8194772005081177, "learning_rate": 1.8737185567115555e-05, "loss": 0.0505, "step": 3926 }, { "epoch": 2.389412838454518, "grad_norm": 0.4386597275733948, "learning_rate": 1.872513444292993e-05, "loss": 0.028, "step": 3927 }, { "epoch": 2.390021296014603, "grad_norm": 0.4583303928375244, "learning_rate": 1.871308487464941e-05, "loss": 0.0417, "step": 3928 }, { "epoch": 2.390629753574688, "grad_norm": 0.5294291973114014, "learning_rate": 1.8701036865261792e-05, "loss": 0.0372, "step": 3929 }, { "epoch": 2.391238211134773, "grad_norm": 0.5232396721839905, "learning_rate": 1.868899041775448e-05, "loss": 0.0386, "step": 3930 }, { "epoch": 2.3918466686948587, "grad_norm": 0.5549443960189819, "learning_rate": 1.867694553511449e-05, "loss": 0.053, "step": 3931 }, { "epoch": 2.392455126254944, "grad_norm": 0.6172237396240234, "learning_rate": 1.8664902220328435e-05, "loss": 0.0574, "step": 3932 }, { "epoch": 2.393063583815029, "grad_norm": 0.7529854774475098, "learning_rate": 1.8652860476382584e-05, "loss": 0.0613, "step": 3933 }, { "epoch": 2.393672041375114, "grad_norm": 0.5938291549682617, "learning_rate": 1.8640820306262774e-05, "loss": 0.0592, "step": 3934 }, { "epoch": 2.394280498935199, "grad_norm": 0.6293975114822388, "learning_rate": 1.862878171295445e-05, "loss": 0.0491, "step": 3935 }, { "epoch": 2.3948889564952847, "grad_norm": 0.42942139506340027, "learning_rate": 1.8616744699442706e-05, "loss": 0.0368, "step": 3936 }, { "epoch": 2.3954974140553698, "grad_norm": 0.5470993518829346, "learning_rate": 1.8604709268712213e-05, "loss": 0.0506, "step": 3937 }, { "epoch": 2.396105871615455, "grad_norm": 1.0791350603103638, "learning_rate": 1.859267542374724e-05, "loss": 0.0415, "step": 3938 }, { "epoch": 2.39671432917554, "grad_norm": 0.6190977096557617, "learning_rate": 1.8580643167531703e-05, "loss": 0.0691, "step": 3939 }, { "epoch": 2.397322786735625, "grad_norm": 0.5365692973136902, "learning_rate": 1.8568612503049086e-05, "loss": 0.0547, "step": 3940 }, { "epoch": 2.3979312442957106, "grad_norm": 0.5777300596237183, "learning_rate": 1.8556583433282497e-05, "loss": 0.0521, "step": 3941 }, { "epoch": 2.3985397018557957, "grad_norm": 0.612249493598938, "learning_rate": 1.8544555961214634e-05, "loss": 0.0507, "step": 3942 }, { "epoch": 2.3991481594158808, "grad_norm": 0.5814237594604492, "learning_rate": 1.853253008982782e-05, "loss": 0.0476, "step": 3943 }, { "epoch": 2.399756616975966, "grad_norm": 0.5139836072921753, "learning_rate": 1.8520505822103972e-05, "loss": 0.033, "step": 3944 }, { "epoch": 2.400365074536051, "grad_norm": 0.5720403790473938, "learning_rate": 1.8508483161024586e-05, "loss": 0.0567, "step": 3945 }, { "epoch": 2.4009735320961365, "grad_norm": 0.6230379343032837, "learning_rate": 1.849646210957081e-05, "loss": 0.0538, "step": 3946 }, { "epoch": 2.4015819896562216, "grad_norm": 0.4990738034248352, "learning_rate": 1.848444267072335e-05, "loss": 0.0483, "step": 3947 }, { "epoch": 2.4021904472163067, "grad_norm": 0.600247323513031, "learning_rate": 1.8472424847462517e-05, "loss": 0.0729, "step": 3948 }, { "epoch": 2.402798904776392, "grad_norm": 0.535139799118042, "learning_rate": 1.8460408642768244e-05, "loss": 0.0502, "step": 3949 }, { "epoch": 2.403407362336477, "grad_norm": 0.4909554421901703, "learning_rate": 1.844839405962005e-05, "loss": 0.0368, "step": 3950 }, { "epoch": 2.4040158198965624, "grad_norm": 0.5839510560035706, "learning_rate": 1.8436381100997034e-05, "loss": 0.0523, "step": 3951 }, { "epoch": 2.4046242774566475, "grad_norm": 0.7578384280204773, "learning_rate": 1.8424369769877927e-05, "loss": 0.0429, "step": 3952 }, { "epoch": 2.4052327350167326, "grad_norm": 0.5987670421600342, "learning_rate": 1.8412360069241034e-05, "loss": 0.0529, "step": 3953 }, { "epoch": 2.4058411925768177, "grad_norm": 0.5466412901878357, "learning_rate": 1.8400352002064253e-05, "loss": 0.0491, "step": 3954 }, { "epoch": 2.406449650136903, "grad_norm": 1.5153589248657227, "learning_rate": 1.8388345571325083e-05, "loss": 0.0481, "step": 3955 }, { "epoch": 2.4070581076969884, "grad_norm": 0.5942850708961487, "learning_rate": 1.8376340780000635e-05, "loss": 0.0641, "step": 3956 }, { "epoch": 2.4076665652570735, "grad_norm": 0.5600593090057373, "learning_rate": 1.836433763106758e-05, "loss": 0.0534, "step": 3957 }, { "epoch": 2.4082750228171586, "grad_norm": 0.5812585353851318, "learning_rate": 1.8352336127502198e-05, "loss": 0.0532, "step": 3958 }, { "epoch": 2.4088834803772436, "grad_norm": 0.4886872172355652, "learning_rate": 1.8340336272280377e-05, "loss": 0.0524, "step": 3959 }, { "epoch": 2.4094919379373287, "grad_norm": 0.6744169592857361, "learning_rate": 1.8328338068377578e-05, "loss": 0.0497, "step": 3960 }, { "epoch": 2.4101003954974143, "grad_norm": 0.6337938904762268, "learning_rate": 1.831634151876883e-05, "loss": 0.0431, "step": 3961 }, { "epoch": 2.4107088530574994, "grad_norm": 0.6581575870513916, "learning_rate": 1.8304346626428813e-05, "loss": 0.0637, "step": 3962 }, { "epoch": 2.4113173106175845, "grad_norm": 1.0695427656173706, "learning_rate": 1.829235339433174e-05, "loss": 0.0482, "step": 3963 }, { "epoch": 2.4119257681776696, "grad_norm": 0.6076899766921997, "learning_rate": 1.828036182545144e-05, "loss": 0.0375, "step": 3964 }, { "epoch": 2.4125342257377547, "grad_norm": 0.6036027669906616, "learning_rate": 1.8268371922761308e-05, "loss": 0.0442, "step": 3965 }, { "epoch": 2.41314268329784, "grad_norm": 0.5249053239822388, "learning_rate": 1.8256383689234362e-05, "loss": 0.0415, "step": 3966 }, { "epoch": 2.4137511408579253, "grad_norm": 0.5730521082878113, "learning_rate": 1.824439712784317e-05, "loss": 0.0438, "step": 3967 }, { "epoch": 2.4143595984180104, "grad_norm": 0.5469090342521667, "learning_rate": 1.8232412241559896e-05, "loss": 0.0265, "step": 3968 }, { "epoch": 2.4149680559780955, "grad_norm": 0.5718511939048767, "learning_rate": 1.8220429033356305e-05, "loss": 0.0395, "step": 3969 }, { "epoch": 2.4155765135381806, "grad_norm": 0.5090388655662537, "learning_rate": 1.8208447506203734e-05, "loss": 0.0359, "step": 3970 }, { "epoch": 2.416184971098266, "grad_norm": 0.4648432731628418, "learning_rate": 1.819646766307308e-05, "loss": 0.0351, "step": 3971 }, { "epoch": 2.4167934286583512, "grad_norm": 0.5751369595527649, "learning_rate": 1.8184489506934875e-05, "loss": 0.035, "step": 3972 }, { "epoch": 2.4174018862184363, "grad_norm": 0.607879638671875, "learning_rate": 1.8172513040759183e-05, "loss": 0.0605, "step": 3973 }, { "epoch": 2.4180103437785214, "grad_norm": 0.5648092031478882, "learning_rate": 1.8160538267515665e-05, "loss": 0.0417, "step": 3974 }, { "epoch": 2.4186188013386065, "grad_norm": 0.6005630493164062, "learning_rate": 1.8148565190173586e-05, "loss": 0.0441, "step": 3975 }, { "epoch": 2.419227258898692, "grad_norm": 0.524438202381134, "learning_rate": 1.813659381170176e-05, "loss": 0.0344, "step": 3976 }, { "epoch": 2.419835716458777, "grad_norm": 0.4712424874305725, "learning_rate": 1.812462413506858e-05, "loss": 0.0264, "step": 3977 }, { "epoch": 2.4204441740188622, "grad_norm": 0.561529278755188, "learning_rate": 1.811265616324203e-05, "loss": 0.0522, "step": 3978 }, { "epoch": 2.4210526315789473, "grad_norm": 0.6183802485466003, "learning_rate": 1.8100689899189683e-05, "loss": 0.0509, "step": 3979 }, { "epoch": 2.4216610891390324, "grad_norm": 0.5042179226875305, "learning_rate": 1.808872534587866e-05, "loss": 0.0501, "step": 3980 }, { "epoch": 2.422269546699118, "grad_norm": 0.6235750317573547, "learning_rate": 1.8076762506275667e-05, "loss": 0.0617, "step": 3981 }, { "epoch": 2.422878004259203, "grad_norm": 0.5784009099006653, "learning_rate": 1.8064801383347006e-05, "loss": 0.0542, "step": 3982 }, { "epoch": 2.423486461819288, "grad_norm": 0.6061785221099854, "learning_rate": 1.8052841980058533e-05, "loss": 0.0477, "step": 3983 }, { "epoch": 2.4240949193793733, "grad_norm": 0.5976025462150574, "learning_rate": 1.8040884299375665e-05, "loss": 0.0554, "step": 3984 }, { "epoch": 2.4247033769394584, "grad_norm": 0.5100542306900024, "learning_rate": 1.802892834426343e-05, "loss": 0.0437, "step": 3985 }, { "epoch": 2.425311834499544, "grad_norm": 0.6174180507659912, "learning_rate": 1.8016974117686398e-05, "loss": 0.0661, "step": 3986 }, { "epoch": 2.425920292059629, "grad_norm": 0.6207003593444824, "learning_rate": 1.8005021622608717e-05, "loss": 0.0463, "step": 3987 }, { "epoch": 2.426528749619714, "grad_norm": 0.5053902864456177, "learning_rate": 1.79930708619941e-05, "loss": 0.043, "step": 3988 }, { "epoch": 2.427137207179799, "grad_norm": 0.5556389689445496, "learning_rate": 1.7981121838805852e-05, "loss": 0.0522, "step": 3989 }, { "epoch": 2.4277456647398843, "grad_norm": 0.522250235080719, "learning_rate": 1.7969174556006832e-05, "loss": 0.0527, "step": 3990 }, { "epoch": 2.4283541222999694, "grad_norm": 0.5426846742630005, "learning_rate": 1.795722901655945e-05, "loss": 0.0481, "step": 3991 }, { "epoch": 2.428962579860055, "grad_norm": 0.5758845806121826, "learning_rate": 1.7945285223425724e-05, "loss": 0.0414, "step": 3992 }, { "epoch": 2.42957103742014, "grad_norm": 0.556006669998169, "learning_rate": 1.7933343179567208e-05, "loss": 0.0647, "step": 3993 }, { "epoch": 2.430179494980225, "grad_norm": 0.6288806796073914, "learning_rate": 1.792140288794502e-05, "loss": 0.0633, "step": 3994 }, { "epoch": 2.43078795254031, "grad_norm": 0.5981294512748718, "learning_rate": 1.790946435151988e-05, "loss": 0.0627, "step": 3995 }, { "epoch": 2.4313964101003953, "grad_norm": 0.5165243148803711, "learning_rate": 1.789752757325203e-05, "loss": 0.0411, "step": 3996 }, { "epoch": 2.432004867660481, "grad_norm": 0.464417964220047, "learning_rate": 1.7885592556101282e-05, "loss": 0.0387, "step": 3997 }, { "epoch": 2.432613325220566, "grad_norm": 0.4882068932056427, "learning_rate": 1.7873659303027052e-05, "loss": 0.0376, "step": 3998 }, { "epoch": 2.433221782780651, "grad_norm": 0.5661106109619141, "learning_rate": 1.7861727816988275e-05, "loss": 0.0457, "step": 3999 }, { "epoch": 2.433830240340736, "grad_norm": 0.5687078833580017, "learning_rate": 1.784979810094346e-05, "loss": 0.0454, "step": 4000 }, { "epoch": 2.4344386979008212, "grad_norm": 0.6012535691261292, "learning_rate": 1.783787015785067e-05, "loss": 0.0612, "step": 4001 }, { "epoch": 2.4350471554609068, "grad_norm": 0.5028715133666992, "learning_rate": 1.782594399066756e-05, "loss": 0.0442, "step": 4002 }, { "epoch": 2.435655613020992, "grad_norm": 0.5349405407905579, "learning_rate": 1.781401960235131e-05, "loss": 0.06, "step": 4003 }, { "epoch": 2.436264070581077, "grad_norm": 0.5278841257095337, "learning_rate": 1.7802096995858658e-05, "loss": 0.051, "step": 4004 }, { "epoch": 2.436872528141162, "grad_norm": 0.6222814917564392, "learning_rate": 1.7790176174145944e-05, "loss": 0.0548, "step": 4005 }, { "epoch": 2.437480985701247, "grad_norm": 0.47187790274620056, "learning_rate": 1.7778257140169016e-05, "loss": 0.0439, "step": 4006 }, { "epoch": 2.4380894432613327, "grad_norm": 0.48313429951667786, "learning_rate": 1.7766339896883292e-05, "loss": 0.0319, "step": 4007 }, { "epoch": 2.438697900821418, "grad_norm": 0.5558401346206665, "learning_rate": 1.7754424447243767e-05, "loss": 0.0484, "step": 4008 }, { "epoch": 2.439306358381503, "grad_norm": 0.45081377029418945, "learning_rate": 1.774251079420497e-05, "loss": 0.0323, "step": 4009 }, { "epoch": 2.439914815941588, "grad_norm": 0.4787575900554657, "learning_rate": 1.773059894072098e-05, "loss": 0.0441, "step": 4010 }, { "epoch": 2.440523273501673, "grad_norm": 0.5224798321723938, "learning_rate": 1.7718688889745455e-05, "loss": 0.0532, "step": 4011 }, { "epoch": 2.4411317310617586, "grad_norm": 0.40973082184791565, "learning_rate": 1.7706780644231592e-05, "loss": 0.037, "step": 4012 }, { "epoch": 2.4417401886218437, "grad_norm": 0.4948725998401642, "learning_rate": 1.7694874207132127e-05, "loss": 0.0422, "step": 4013 }, { "epoch": 2.442348646181929, "grad_norm": 0.5685492753982544, "learning_rate": 1.7682969581399358e-05, "loss": 0.0348, "step": 4014 }, { "epoch": 2.442957103742014, "grad_norm": 0.4990760385990143, "learning_rate": 1.7671066769985155e-05, "loss": 0.0444, "step": 4015 }, { "epoch": 2.443565561302099, "grad_norm": 0.5178315043449402, "learning_rate": 1.765916577584091e-05, "loss": 0.0404, "step": 4016 }, { "epoch": 2.4441740188621845, "grad_norm": 0.7539467811584473, "learning_rate": 1.7647266601917557e-05, "loss": 0.0713, "step": 4017 }, { "epoch": 2.4447824764222696, "grad_norm": 0.4690307676792145, "learning_rate": 1.7635369251165617e-05, "loss": 0.0527, "step": 4018 }, { "epoch": 2.4453909339823547, "grad_norm": 0.37490659952163696, "learning_rate": 1.7623473726535136e-05, "loss": 0.0346, "step": 4019 }, { "epoch": 2.44599939154244, "grad_norm": 0.5802814960479736, "learning_rate": 1.7611580030975688e-05, "loss": 0.0545, "step": 4020 }, { "epoch": 2.446607849102525, "grad_norm": 0.5390932559967041, "learning_rate": 1.7599688167436442e-05, "loss": 0.0484, "step": 4021 }, { "epoch": 2.4472163066626105, "grad_norm": 0.5329564809799194, "learning_rate": 1.7587798138866067e-05, "loss": 0.0373, "step": 4022 }, { "epoch": 2.4478247642226956, "grad_norm": 0.4844886064529419, "learning_rate": 1.7575909948212795e-05, "loss": 0.0382, "step": 4023 }, { "epoch": 2.4484332217827807, "grad_norm": 0.4697389602661133, "learning_rate": 1.7564023598424403e-05, "loss": 0.0416, "step": 4024 }, { "epoch": 2.4490416793428658, "grad_norm": 0.46831750869750977, "learning_rate": 1.7552139092448216e-05, "loss": 0.0292, "step": 4025 }, { "epoch": 2.449650136902951, "grad_norm": 0.4629685878753662, "learning_rate": 1.754025643323109e-05, "loss": 0.0332, "step": 4026 }, { "epoch": 2.450258594463036, "grad_norm": 0.6923487782478333, "learning_rate": 1.7528375623719427e-05, "loss": 0.0602, "step": 4027 }, { "epoch": 2.4508670520231215, "grad_norm": 0.5200254917144775, "learning_rate": 1.751649666685919e-05, "loss": 0.0419, "step": 4028 }, { "epoch": 2.4514755095832066, "grad_norm": 0.635124683380127, "learning_rate": 1.7504619565595836e-05, "loss": 0.0439, "step": 4029 }, { "epoch": 2.4520839671432917, "grad_norm": 0.47341227531433105, "learning_rate": 1.7492744322874414e-05, "loss": 0.0356, "step": 4030 }, { "epoch": 2.452692424703377, "grad_norm": 0.5724462270736694, "learning_rate": 1.7480870941639485e-05, "loss": 0.0399, "step": 4031 }, { "epoch": 2.453300882263462, "grad_norm": 0.5328457355499268, "learning_rate": 1.7468999424835144e-05, "loss": 0.0526, "step": 4032 }, { "epoch": 2.4539093398235474, "grad_norm": 0.5413589477539062, "learning_rate": 1.745712977540504e-05, "loss": 0.0466, "step": 4033 }, { "epoch": 2.4545177973836325, "grad_norm": 0.5428217053413391, "learning_rate": 1.744526199629235e-05, "loss": 0.0372, "step": 4034 }, { "epoch": 2.4551262549437176, "grad_norm": 0.5191711187362671, "learning_rate": 1.743339609043979e-05, "loss": 0.0486, "step": 4035 }, { "epoch": 2.4557347125038027, "grad_norm": 0.5395163297653198, "learning_rate": 1.742153206078961e-05, "loss": 0.0505, "step": 4036 }, { "epoch": 2.456343170063888, "grad_norm": 0.5724096894264221, "learning_rate": 1.7409669910283583e-05, "loss": 0.0428, "step": 4037 }, { "epoch": 2.4569516276239733, "grad_norm": 0.6358472108840942, "learning_rate": 1.739780964186304e-05, "loss": 0.0344, "step": 4038 }, { "epoch": 2.4575600851840584, "grad_norm": 0.6139354109764099, "learning_rate": 1.738595125846884e-05, "loss": 0.082, "step": 4039 }, { "epoch": 2.4581685427441435, "grad_norm": 0.5228933095932007, "learning_rate": 1.7374094763041347e-05, "loss": 0.0266, "step": 4040 }, { "epoch": 2.4587770003042286, "grad_norm": 0.5466329455375671, "learning_rate": 1.73622401585205e-05, "loss": 0.0438, "step": 4041 }, { "epoch": 2.4593854578643137, "grad_norm": 0.5522369742393494, "learning_rate": 1.735038744784573e-05, "loss": 0.0568, "step": 4042 }, { "epoch": 2.4599939154243993, "grad_norm": 0.7028734087944031, "learning_rate": 1.733853663395602e-05, "loss": 0.0511, "step": 4043 }, { "epoch": 2.4606023729844844, "grad_norm": 0.48779624700546265, "learning_rate": 1.732668771978988e-05, "loss": 0.0265, "step": 4044 }, { "epoch": 2.4612108305445695, "grad_norm": 0.5411667823791504, "learning_rate": 1.7314840708285354e-05, "loss": 0.0484, "step": 4045 }, { "epoch": 2.4618192881046546, "grad_norm": 0.5587241649627686, "learning_rate": 1.7302995602379996e-05, "loss": 0.0461, "step": 4046 }, { "epoch": 2.4624277456647397, "grad_norm": 0.49550849199295044, "learning_rate": 1.7291152405010898e-05, "loss": 0.051, "step": 4047 }, { "epoch": 2.463036203224825, "grad_norm": 0.5094623565673828, "learning_rate": 1.727931111911468e-05, "loss": 0.0483, "step": 4048 }, { "epoch": 2.4636446607849103, "grad_norm": 0.5672603249549866, "learning_rate": 1.72674717476275e-05, "loss": 0.0535, "step": 4049 }, { "epoch": 2.4642531183449954, "grad_norm": 0.4778454005718231, "learning_rate": 1.725563429348501e-05, "loss": 0.0443, "step": 4050 }, { "epoch": 2.4648615759050805, "grad_norm": 0.5189485549926758, "learning_rate": 1.7243798759622414e-05, "loss": 0.0483, "step": 4051 }, { "epoch": 2.4654700334651656, "grad_norm": 0.5488086938858032, "learning_rate": 1.7231965148974438e-05, "loss": 0.0477, "step": 4052 }, { "epoch": 2.466078491025251, "grad_norm": 0.5553414225578308, "learning_rate": 1.7220133464475312e-05, "loss": 0.0434, "step": 4053 }, { "epoch": 2.466686948585336, "grad_norm": 0.5483329892158508, "learning_rate": 1.7208303709058806e-05, "loss": 0.0504, "step": 4054 }, { "epoch": 2.4672954061454213, "grad_norm": 0.4734504818916321, "learning_rate": 1.7196475885658214e-05, "loss": 0.0363, "step": 4055 }, { "epoch": 2.4679038637055064, "grad_norm": 0.4984531104564667, "learning_rate": 1.7184649997206324e-05, "loss": 0.039, "step": 4056 }, { "epoch": 2.4685123212655915, "grad_norm": 0.5191166996955872, "learning_rate": 1.7172826046635483e-05, "loss": 0.044, "step": 4057 }, { "epoch": 2.469120778825677, "grad_norm": 0.5484380722045898, "learning_rate": 1.7161004036877526e-05, "loss": 0.0433, "step": 4058 }, { "epoch": 2.469729236385762, "grad_norm": 0.570693850517273, "learning_rate": 1.714918397086383e-05, "loss": 0.0468, "step": 4059 }, { "epoch": 2.4703376939458472, "grad_norm": 0.5350715517997742, "learning_rate": 1.7137365851525255e-05, "loss": 0.0406, "step": 4060 }, { "epoch": 2.4709461515059323, "grad_norm": 0.5469467639923096, "learning_rate": 1.7125549681792233e-05, "loss": 0.0499, "step": 4061 }, { "epoch": 2.4715546090660174, "grad_norm": 0.5633114576339722, "learning_rate": 1.7113735464594665e-05, "loss": 0.0371, "step": 4062 }, { "epoch": 2.472163066626103, "grad_norm": 0.6275270581245422, "learning_rate": 1.7101923202861974e-05, "loss": 0.0482, "step": 4063 }, { "epoch": 2.472771524186188, "grad_norm": 0.6021205186843872, "learning_rate": 1.7090112899523132e-05, "loss": 0.0524, "step": 4064 }, { "epoch": 2.473379981746273, "grad_norm": 0.6164666414260864, "learning_rate": 1.7078304557506593e-05, "loss": 0.0459, "step": 4065 }, { "epoch": 2.4739884393063583, "grad_norm": 0.5763289332389832, "learning_rate": 1.7066498179740318e-05, "loss": 0.046, "step": 4066 }, { "epoch": 2.4745968968664434, "grad_norm": 0.6294301152229309, "learning_rate": 1.705469376915182e-05, "loss": 0.0465, "step": 4067 }, { "epoch": 2.475205354426529, "grad_norm": 0.409706711769104, "learning_rate": 1.7042891328668094e-05, "loss": 0.0301, "step": 4068 }, { "epoch": 2.475813811986614, "grad_norm": 0.42503032088279724, "learning_rate": 1.7031090861215646e-05, "loss": 0.0307, "step": 4069 }, { "epoch": 2.476422269546699, "grad_norm": 0.5593361258506775, "learning_rate": 1.7019292369720493e-05, "loss": 0.0366, "step": 4070 }, { "epoch": 2.477030727106784, "grad_norm": 0.5310861468315125, "learning_rate": 1.700749585710819e-05, "loss": 0.0367, "step": 4071 }, { "epoch": 2.4776391846668693, "grad_norm": 0.5321909189224243, "learning_rate": 1.6995701326303776e-05, "loss": 0.031, "step": 4072 }, { "epoch": 2.478247642226955, "grad_norm": 0.5682234764099121, "learning_rate": 1.6983908780231782e-05, "loss": 0.0455, "step": 4073 }, { "epoch": 2.47885609978704, "grad_norm": 0.46487629413604736, "learning_rate": 1.6972118221816298e-05, "loss": 0.032, "step": 4074 }, { "epoch": 2.479464557347125, "grad_norm": 0.616710364818573, "learning_rate": 1.696032965398087e-05, "loss": 0.0516, "step": 4075 }, { "epoch": 2.48007301490721, "grad_norm": 0.5947301387786865, "learning_rate": 1.6948543079648575e-05, "loss": 0.0376, "step": 4076 }, { "epoch": 2.480681472467295, "grad_norm": 0.5795893669128418, "learning_rate": 1.6936758501742e-05, "loss": 0.0487, "step": 4077 }, { "epoch": 2.4812899300273807, "grad_norm": 0.5290836095809937, "learning_rate": 1.6924975923183228e-05, "loss": 0.0292, "step": 4078 }, { "epoch": 2.481898387587466, "grad_norm": 0.4861297309398651, "learning_rate": 1.6913195346893828e-05, "loss": 0.0336, "step": 4079 }, { "epoch": 2.482506845147551, "grad_norm": 0.5103831887245178, "learning_rate": 1.690141677579492e-05, "loss": 0.0422, "step": 4080 }, { "epoch": 2.483115302707636, "grad_norm": 0.42127537727355957, "learning_rate": 1.688964021280709e-05, "loss": 0.0317, "step": 4081 }, { "epoch": 2.483723760267721, "grad_norm": 0.7771899104118347, "learning_rate": 1.6877865660850427e-05, "loss": 0.0537, "step": 4082 }, { "epoch": 2.4843322178278067, "grad_norm": 0.5473512411117554, "learning_rate": 1.6866093122844523e-05, "loss": 0.0322, "step": 4083 }, { "epoch": 2.4849406753878918, "grad_norm": 0.5636205077171326, "learning_rate": 1.6854322601708495e-05, "loss": 0.0538, "step": 4084 }, { "epoch": 2.485549132947977, "grad_norm": 0.5534929037094116, "learning_rate": 1.6842554100360937e-05, "loss": 0.0604, "step": 4085 }, { "epoch": 2.486157590508062, "grad_norm": 0.5635712742805481, "learning_rate": 1.683078762171993e-05, "loss": 0.0418, "step": 4086 }, { "epoch": 2.486766048068147, "grad_norm": 0.5449815392494202, "learning_rate": 1.6819023168703094e-05, "loss": 0.0422, "step": 4087 }, { "epoch": 2.4873745056282326, "grad_norm": 0.5356243252754211, "learning_rate": 1.6807260744227513e-05, "loss": 0.052, "step": 4088 }, { "epoch": 2.4879829631883177, "grad_norm": 0.525938093662262, "learning_rate": 1.6795500351209766e-05, "loss": 0.0457, "step": 4089 }, { "epoch": 2.488591420748403, "grad_norm": 0.5305033922195435, "learning_rate": 1.6783741992565963e-05, "loss": 0.0469, "step": 4090 }, { "epoch": 2.489199878308488, "grad_norm": 0.47671419382095337, "learning_rate": 1.6771985671211673e-05, "loss": 0.0426, "step": 4091 }, { "epoch": 2.489808335868573, "grad_norm": 0.6762577891349792, "learning_rate": 1.676023139006198e-05, "loss": 0.0523, "step": 4092 }, { "epoch": 2.4904167934286585, "grad_norm": 0.5468366146087646, "learning_rate": 1.6748479152031436e-05, "loss": 0.0502, "step": 4093 }, { "epoch": 2.4910252509887436, "grad_norm": 0.5539106130599976, "learning_rate": 1.6736728960034137e-05, "loss": 0.0457, "step": 4094 }, { "epoch": 2.4916337085488287, "grad_norm": 0.5105188488960266, "learning_rate": 1.6724980816983625e-05, "loss": 0.0392, "step": 4095 }, { "epoch": 2.492242166108914, "grad_norm": 0.5564658045768738, "learning_rate": 1.6713234725792938e-05, "loss": 0.0382, "step": 4096 }, { "epoch": 2.492850623668999, "grad_norm": 0.6725486516952515, "learning_rate": 1.6701490689374642e-05, "loss": 0.0393, "step": 4097 }, { "epoch": 2.4934590812290844, "grad_norm": 0.46235400438308716, "learning_rate": 1.6689748710640756e-05, "loss": 0.0401, "step": 4098 }, { "epoch": 2.4940675387891695, "grad_norm": 0.5110651254653931, "learning_rate": 1.6678008792502793e-05, "loss": 0.035, "step": 4099 }, { "epoch": 2.4946759963492546, "grad_norm": 0.6853166818618774, "learning_rate": 1.6666270937871774e-05, "loss": 0.0396, "step": 4100 }, { "epoch": 2.4952844539093397, "grad_norm": 0.4850734770298004, "learning_rate": 1.6654535149658203e-05, "loss": 0.0599, "step": 4101 }, { "epoch": 2.495892911469425, "grad_norm": 0.5870274305343628, "learning_rate": 1.6642801430772048e-05, "loss": 0.0399, "step": 4102 }, { "epoch": 2.4965013690295104, "grad_norm": 0.561387300491333, "learning_rate": 1.6631069784122803e-05, "loss": 0.033, "step": 4103 }, { "epoch": 2.4971098265895955, "grad_norm": 0.5187516212463379, "learning_rate": 1.6619340212619417e-05, "loss": 0.0425, "step": 4104 }, { "epoch": 2.4977182841496806, "grad_norm": 0.4592403173446655, "learning_rate": 1.660761271917033e-05, "loss": 0.0334, "step": 4105 }, { "epoch": 2.4983267417097657, "grad_norm": 0.5504740476608276, "learning_rate": 1.6595887306683474e-05, "loss": 0.0474, "step": 4106 }, { "epoch": 2.4989351992698507, "grad_norm": 0.5082219243049622, "learning_rate": 1.658416397806627e-05, "loss": 0.0448, "step": 4107 }, { "epoch": 2.4995436568299363, "grad_norm": 0.6115595102310181, "learning_rate": 1.6572442736225614e-05, "loss": 0.0633, "step": 4108 }, { "epoch": 2.5001521143900214, "grad_norm": 0.4949551224708557, "learning_rate": 1.656072358406787e-05, "loss": 0.0471, "step": 4109 }, { "epoch": 2.5007605719501065, "grad_norm": 0.6002260446548462, "learning_rate": 1.654900652449892e-05, "loss": 0.0593, "step": 4110 }, { "epoch": 2.5013690295101916, "grad_norm": 0.5908356308937073, "learning_rate": 1.6537291560424097e-05, "loss": 0.0536, "step": 4111 }, { "epoch": 2.5019774870702767, "grad_norm": 0.6577576398849487, "learning_rate": 1.6525578694748217e-05, "loss": 0.0594, "step": 4112 }, { "epoch": 2.502585944630362, "grad_norm": 0.6429257392883301, "learning_rate": 1.6513867930375596e-05, "loss": 0.0613, "step": 4113 }, { "epoch": 2.5031944021904473, "grad_norm": 0.5084056854248047, "learning_rate": 1.650215927021001e-05, "loss": 0.0447, "step": 4114 }, { "epoch": 2.5038028597505324, "grad_norm": 0.6193863749504089, "learning_rate": 1.649045271715472e-05, "loss": 0.0486, "step": 4115 }, { "epoch": 2.5044113173106175, "grad_norm": 0.5566953420639038, "learning_rate": 1.6478748274112445e-05, "loss": 0.0441, "step": 4116 }, { "epoch": 2.5050197748707026, "grad_norm": 0.5336138606071472, "learning_rate": 1.646704594398543e-05, "loss": 0.0477, "step": 4117 }, { "epoch": 2.505628232430788, "grad_norm": 0.544262170791626, "learning_rate": 1.6455345729675348e-05, "loss": 0.0542, "step": 4118 }, { "epoch": 2.5062366899908732, "grad_norm": 0.46165311336517334, "learning_rate": 1.6443647634083353e-05, "loss": 0.0326, "step": 4119 }, { "epoch": 2.5068451475509583, "grad_norm": 0.4707409143447876, "learning_rate": 1.6431951660110113e-05, "loss": 0.0401, "step": 4120 }, { "epoch": 2.5074536051110434, "grad_norm": 0.5299643874168396, "learning_rate": 1.6420257810655727e-05, "loss": 0.0488, "step": 4121 }, { "epoch": 2.5080620626711285, "grad_norm": 0.9495867490768433, "learning_rate": 1.640856608861977e-05, "loss": 0.0375, "step": 4122 }, { "epoch": 2.508670520231214, "grad_norm": 0.5048344135284424, "learning_rate": 1.6396876496901324e-05, "loss": 0.0285, "step": 4123 }, { "epoch": 2.509278977791299, "grad_norm": 0.6059793829917908, "learning_rate": 1.6385189038398914e-05, "loss": 0.0442, "step": 4124 }, { "epoch": 2.5098874353513843, "grad_norm": 0.5671916604042053, "learning_rate": 1.637350371601053e-05, "loss": 0.0442, "step": 4125 }, { "epoch": 2.5104958929114694, "grad_norm": 0.49703407287597656, "learning_rate": 1.6361820532633665e-05, "loss": 0.0431, "step": 4126 }, { "epoch": 2.5111043504715544, "grad_norm": 0.5402058959007263, "learning_rate": 1.6350139491165246e-05, "loss": 0.0349, "step": 4127 }, { "epoch": 2.51171280803164, "grad_norm": 0.5467166900634766, "learning_rate": 1.6338460594501692e-05, "loss": 0.0577, "step": 4128 }, { "epoch": 2.512321265591725, "grad_norm": 0.5586373805999756, "learning_rate": 1.632678384553887e-05, "loss": 0.0426, "step": 4129 }, { "epoch": 2.51292972315181, "grad_norm": 0.613847017288208, "learning_rate": 1.6315109247172146e-05, "loss": 0.0535, "step": 4130 }, { "epoch": 2.5135381807118953, "grad_norm": 0.47474873065948486, "learning_rate": 1.6303436802296325e-05, "loss": 0.0426, "step": 4131 }, { "epoch": 2.5141466382719804, "grad_norm": 0.5153751373291016, "learning_rate": 1.629176651380568e-05, "loss": 0.0478, "step": 4132 }, { "epoch": 2.514755095832066, "grad_norm": 0.7585486769676208, "learning_rate": 1.6280098384593966e-05, "loss": 0.0642, "step": 4133 }, { "epoch": 2.515363553392151, "grad_norm": 0.5414904952049255, "learning_rate": 1.626843241755439e-05, "loss": 0.0553, "step": 4134 }, { "epoch": 2.515972010952236, "grad_norm": 0.8704773783683777, "learning_rate": 1.625676861557962e-05, "loss": 0.0435, "step": 4135 }, { "epoch": 2.516580468512321, "grad_norm": 0.551547110080719, "learning_rate": 1.6245106981561804e-05, "loss": 0.0475, "step": 4136 }, { "epoch": 2.5171889260724063, "grad_norm": 0.614238977432251, "learning_rate": 1.6233447518392537e-05, "loss": 0.0593, "step": 4137 }, { "epoch": 2.517797383632492, "grad_norm": 0.7122321724891663, "learning_rate": 1.622179022896287e-05, "loss": 0.0735, "step": 4138 }, { "epoch": 2.518405841192577, "grad_norm": 0.5139821767807007, "learning_rate": 1.6210135116163333e-05, "loss": 0.0405, "step": 4139 }, { "epoch": 2.519014298752662, "grad_norm": 0.6365357637405396, "learning_rate": 1.6198482182883912e-05, "loss": 0.0634, "step": 4140 }, { "epoch": 2.519622756312747, "grad_norm": 0.5459882616996765, "learning_rate": 1.618683143201404e-05, "loss": 0.0341, "step": 4141 }, { "epoch": 2.520231213872832, "grad_norm": 0.45921745896339417, "learning_rate": 1.6175182866442624e-05, "loss": 0.0335, "step": 4142 }, { "epoch": 2.5208396714329178, "grad_norm": 0.524821400642395, "learning_rate": 1.6163536489058023e-05, "loss": 0.0544, "step": 4143 }, { "epoch": 2.521448128993003, "grad_norm": 0.5286942720413208, "learning_rate": 1.6151892302748046e-05, "loss": 0.046, "step": 4144 }, { "epoch": 2.522056586553088, "grad_norm": 0.5625891089439392, "learning_rate": 1.614025031039997e-05, "loss": 0.0543, "step": 4145 }, { "epoch": 2.522665044113173, "grad_norm": 0.5208843350410461, "learning_rate": 1.612861051490053e-05, "loss": 0.0391, "step": 4146 }, { "epoch": 2.523273501673258, "grad_norm": 0.6042157411575317, "learning_rate": 1.6116972919135907e-05, "loss": 0.0726, "step": 4147 }, { "epoch": 2.5238819592333437, "grad_norm": 0.43940305709838867, "learning_rate": 1.6105337525991723e-05, "loss": 0.0372, "step": 4148 }, { "epoch": 2.524490416793429, "grad_norm": 0.4963323771953583, "learning_rate": 1.6093704338353097e-05, "loss": 0.0383, "step": 4149 }, { "epoch": 2.525098874353514, "grad_norm": 0.4309125244617462, "learning_rate": 1.6082073359104564e-05, "loss": 0.0448, "step": 4150 }, { "epoch": 2.525707331913599, "grad_norm": 0.4299990236759186, "learning_rate": 1.6070444591130113e-05, "loss": 0.0361, "step": 4151 }, { "epoch": 2.526315789473684, "grad_norm": 0.5838400721549988, "learning_rate": 1.60588180373132e-05, "loss": 0.0494, "step": 4152 }, { "epoch": 2.5269242470337696, "grad_norm": 0.4594270586967468, "learning_rate": 1.6047193700536734e-05, "loss": 0.0425, "step": 4153 }, { "epoch": 2.5275327045938547, "grad_norm": 0.6414621472358704, "learning_rate": 1.6035571583683055e-05, "loss": 0.0606, "step": 4154 }, { "epoch": 2.52814116215394, "grad_norm": 0.4599505364894867, "learning_rate": 1.6023951689633965e-05, "loss": 0.0304, "step": 4155 }, { "epoch": 2.528749619714025, "grad_norm": 0.4809854030609131, "learning_rate": 1.601233402127072e-05, "loss": 0.0463, "step": 4156 }, { "epoch": 2.52935807727411, "grad_norm": 0.5729118585586548, "learning_rate": 1.600071858147401e-05, "loss": 0.0412, "step": 4157 }, { "epoch": 2.5299665348341955, "grad_norm": 0.5415656566619873, "learning_rate": 1.5989105373123986e-05, "loss": 0.0426, "step": 4158 }, { "epoch": 2.5305749923942806, "grad_norm": 0.46047234535217285, "learning_rate": 1.5977494399100242e-05, "loss": 0.0414, "step": 4159 }, { "epoch": 2.5311834499543657, "grad_norm": 0.5674901604652405, "learning_rate": 1.59658856622818e-05, "loss": 0.061, "step": 4160 }, { "epoch": 2.531791907514451, "grad_norm": 0.539481520652771, "learning_rate": 1.5954279165547158e-05, "loss": 0.0422, "step": 4161 }, { "epoch": 2.532400365074536, "grad_norm": 0.4912717342376709, "learning_rate": 1.594267491177424e-05, "loss": 0.0409, "step": 4162 }, { "epoch": 2.5330088226346215, "grad_norm": 0.5007967948913574, "learning_rate": 1.5931072903840415e-05, "loss": 0.045, "step": 4163 }, { "epoch": 2.5336172801947066, "grad_norm": 0.5245926976203918, "learning_rate": 1.5919473144622502e-05, "loss": 0.0438, "step": 4164 }, { "epoch": 2.5342257377547917, "grad_norm": 0.5035905241966248, "learning_rate": 1.5907875636996748e-05, "loss": 0.0442, "step": 4165 }, { "epoch": 2.5348341953148767, "grad_norm": 0.573156476020813, "learning_rate": 1.5896280383838856e-05, "loss": 0.0642, "step": 4166 }, { "epoch": 2.535442652874962, "grad_norm": 0.5619865655899048, "learning_rate": 1.5884687388023975e-05, "loss": 0.0432, "step": 4167 }, { "epoch": 2.5360511104350474, "grad_norm": 0.4712476432323456, "learning_rate": 1.587309665242667e-05, "loss": 0.0344, "step": 4168 }, { "epoch": 2.5366595679951325, "grad_norm": 0.5480785965919495, "learning_rate": 1.586150817992097e-05, "loss": 0.0429, "step": 4169 }, { "epoch": 2.5372680255552176, "grad_norm": 0.602826714515686, "learning_rate": 1.5849921973380332e-05, "loss": 0.047, "step": 4170 }, { "epoch": 2.5378764831153027, "grad_norm": 0.5343915820121765, "learning_rate": 1.5838338035677645e-05, "loss": 0.0414, "step": 4171 }, { "epoch": 2.5384849406753878, "grad_norm": 0.5871137380599976, "learning_rate": 1.582675636968525e-05, "loss": 0.0423, "step": 4172 }, { "epoch": 2.5390933982354733, "grad_norm": 0.47411757707595825, "learning_rate": 1.5815176978274924e-05, "loss": 0.0368, "step": 4173 }, { "epoch": 2.5397018557955584, "grad_norm": 0.5202276110649109, "learning_rate": 1.580359986431786e-05, "loss": 0.0423, "step": 4174 }, { "epoch": 2.5403103133556435, "grad_norm": 0.5111712217330933, "learning_rate": 1.57920250306847e-05, "loss": 0.0434, "step": 4175 }, { "epoch": 2.5409187709157286, "grad_norm": 0.5714934468269348, "learning_rate": 1.578045248024553e-05, "loss": 0.0529, "step": 4176 }, { "epoch": 2.5415272284758137, "grad_norm": 0.5358994603157043, "learning_rate": 1.5768882215869858e-05, "loss": 0.043, "step": 4177 }, { "epoch": 2.5421356860358992, "grad_norm": 0.5511855483055115, "learning_rate": 1.5757314240426613e-05, "loss": 0.0419, "step": 4178 }, { "epoch": 2.5427441435959843, "grad_norm": 0.6131108999252319, "learning_rate": 1.5745748556784194e-05, "loss": 0.0716, "step": 4179 }, { "epoch": 2.5433526011560694, "grad_norm": 0.39662912487983704, "learning_rate": 1.5734185167810395e-05, "loss": 0.0278, "step": 4180 }, { "epoch": 2.5439610587161545, "grad_norm": 0.5008838772773743, "learning_rate": 1.572262407637245e-05, "loss": 0.0401, "step": 4181 }, { "epoch": 2.5445695162762396, "grad_norm": 0.6066216230392456, "learning_rate": 1.5711065285337035e-05, "loss": 0.0686, "step": 4182 }, { "epoch": 2.545177973836325, "grad_norm": 0.5468981862068176, "learning_rate": 1.5699508797570255e-05, "loss": 0.028, "step": 4183 }, { "epoch": 2.5457864313964103, "grad_norm": 0.5890219807624817, "learning_rate": 1.5687954615937623e-05, "loss": 0.0497, "step": 4184 }, { "epoch": 2.5463948889564954, "grad_norm": 0.489978164434433, "learning_rate": 1.56764027433041e-05, "loss": 0.0387, "step": 4185 }, { "epoch": 2.5470033465165804, "grad_norm": 0.5345786809921265, "learning_rate": 1.5664853182534077e-05, "loss": 0.0441, "step": 4186 }, { "epoch": 2.5476118040766655, "grad_norm": 0.5965332984924316, "learning_rate": 1.565330593649135e-05, "loss": 0.0456, "step": 4187 }, { "epoch": 2.548220261636751, "grad_norm": 0.5223818421363831, "learning_rate": 1.564176100803916e-05, "loss": 0.0447, "step": 4188 }, { "epoch": 2.548828719196836, "grad_norm": 0.6047852635383606, "learning_rate": 1.5630218400040174e-05, "loss": 0.049, "step": 4189 }, { "epoch": 2.5494371767569213, "grad_norm": 0.6201773285865784, "learning_rate": 1.561867811535648e-05, "loss": 0.0598, "step": 4190 }, { "epoch": 2.5500456343170064, "grad_norm": 0.4544050693511963, "learning_rate": 1.5607140156849564e-05, "loss": 0.0313, "step": 4191 }, { "epoch": 2.5506540918770915, "grad_norm": 0.5224777460098267, "learning_rate": 1.5595604527380387e-05, "loss": 0.0538, "step": 4192 }, { "epoch": 2.551262549437177, "grad_norm": 0.57136070728302, "learning_rate": 1.5584071229809294e-05, "loss": 0.0452, "step": 4193 }, { "epoch": 2.551871006997262, "grad_norm": 0.45948007702827454, "learning_rate": 1.5572540266996047e-05, "loss": 0.0289, "step": 4194 }, { "epoch": 2.552479464557347, "grad_norm": 0.5531154274940491, "learning_rate": 1.5561011641799872e-05, "loss": 0.0417, "step": 4195 }, { "epoch": 2.5530879221174323, "grad_norm": 0.6279348134994507, "learning_rate": 1.5549485357079373e-05, "loss": 0.0619, "step": 4196 }, { "epoch": 2.5536963796775174, "grad_norm": 0.5489982962608337, "learning_rate": 1.5537961415692585e-05, "loss": 0.0395, "step": 4197 }, { "epoch": 2.554304837237603, "grad_norm": 0.4770684540271759, "learning_rate": 1.5526439820496965e-05, "loss": 0.0422, "step": 4198 }, { "epoch": 2.5549132947976876, "grad_norm": 0.5624024868011475, "learning_rate": 1.5514920574349397e-05, "loss": 0.0552, "step": 4199 }, { "epoch": 2.555521752357773, "grad_norm": 0.5154407620429993, "learning_rate": 1.5503403680106168e-05, "loss": 0.0474, "step": 4200 }, { "epoch": 2.556130209917858, "grad_norm": 0.48511430621147156, "learning_rate": 1.549188914062298e-05, "loss": 0.0311, "step": 4201 }, { "epoch": 2.5567386674779433, "grad_norm": 0.4834255874156952, "learning_rate": 1.5480376958754976e-05, "loss": 0.0426, "step": 4202 }, { "epoch": 2.557347125038029, "grad_norm": 0.5208700299263, "learning_rate": 1.5468867137356696e-05, "loss": 0.0357, "step": 4203 }, { "epoch": 2.5579555825981135, "grad_norm": 0.4725053906440735, "learning_rate": 1.545735967928207e-05, "loss": 0.0489, "step": 4204 }, { "epoch": 2.558564040158199, "grad_norm": 0.4903971552848816, "learning_rate": 1.54458545873845e-05, "loss": 0.048, "step": 4205 }, { "epoch": 2.559172497718284, "grad_norm": 0.5080806612968445, "learning_rate": 1.543435186451676e-05, "loss": 0.0525, "step": 4206 }, { "epoch": 2.5597809552783692, "grad_norm": 0.608734130859375, "learning_rate": 1.5422851513531028e-05, "loss": 0.0495, "step": 4207 }, { "epoch": 2.560389412838455, "grad_norm": 0.4380703270435333, "learning_rate": 1.5411353537278935e-05, "loss": 0.0282, "step": 4208 }, { "epoch": 2.5609978703985394, "grad_norm": 0.4923262894153595, "learning_rate": 1.539985793861149e-05, "loss": 0.0473, "step": 4209 }, { "epoch": 2.561606327958625, "grad_norm": 0.45295250415802, "learning_rate": 1.5388364720379124e-05, "loss": 0.041, "step": 4210 }, { "epoch": 2.56221478551871, "grad_norm": 0.568891704082489, "learning_rate": 1.5376873885431663e-05, "loss": 0.0386, "step": 4211 }, { "epoch": 2.562823243078795, "grad_norm": 0.4208332598209381, "learning_rate": 1.536538543661838e-05, "loss": 0.0389, "step": 4212 }, { "epoch": 2.5634317006388807, "grad_norm": 0.4591726064682007, "learning_rate": 1.535389937678792e-05, "loss": 0.0361, "step": 4213 }, { "epoch": 2.5640401581989654, "grad_norm": 0.48485690355300903, "learning_rate": 1.5342415708788326e-05, "loss": 0.0322, "step": 4214 }, { "epoch": 2.564648615759051, "grad_norm": 0.546795129776001, "learning_rate": 1.5330934435467104e-05, "loss": 0.0465, "step": 4215 }, { "epoch": 2.565257073319136, "grad_norm": 0.5020440816879272, "learning_rate": 1.5319455559671116e-05, "loss": 0.0341, "step": 4216 }, { "epoch": 2.565865530879221, "grad_norm": 0.592922031879425, "learning_rate": 1.530797908424663e-05, "loss": 0.063, "step": 4217 }, { "epoch": 2.5664739884393066, "grad_norm": 0.5598524808883667, "learning_rate": 1.5296505012039362e-05, "loss": 0.0321, "step": 4218 }, { "epoch": 2.5670824459993913, "grad_norm": 0.5240581631660461, "learning_rate": 1.5285033345894392e-05, "loss": 0.0521, "step": 4219 }, { "epoch": 2.567690903559477, "grad_norm": 0.4785360097885132, "learning_rate": 1.5273564088656208e-05, "loss": 0.0447, "step": 4220 }, { "epoch": 2.568299361119562, "grad_norm": 0.502627432346344, "learning_rate": 1.5262097243168705e-05, "loss": 0.0438, "step": 4221 }, { "epoch": 2.568907818679647, "grad_norm": 0.5592759251594543, "learning_rate": 1.5250632812275194e-05, "loss": 0.0488, "step": 4222 }, { "epoch": 2.5695162762397326, "grad_norm": 0.528427004814148, "learning_rate": 1.5239170798818381e-05, "loss": 0.0451, "step": 4223 }, { "epoch": 2.570124733799817, "grad_norm": 0.5380241870880127, "learning_rate": 1.5227711205640341e-05, "loss": 0.0442, "step": 4224 }, { "epoch": 2.5707331913599027, "grad_norm": 0.4866541624069214, "learning_rate": 1.5216254035582605e-05, "loss": 0.0364, "step": 4225 }, { "epoch": 2.571341648919988, "grad_norm": 0.6870383620262146, "learning_rate": 1.5204799291486063e-05, "loss": 0.0662, "step": 4226 }, { "epoch": 2.571950106480073, "grad_norm": 0.5109632611274719, "learning_rate": 1.5193346976191003e-05, "loss": 0.0363, "step": 4227 }, { "epoch": 2.572558564040158, "grad_norm": 0.4471718370914459, "learning_rate": 1.518189709253714e-05, "loss": 0.028, "step": 4228 }, { "epoch": 2.573167021600243, "grad_norm": 0.5469757318496704, "learning_rate": 1.517044964336356e-05, "loss": 0.0474, "step": 4229 }, { "epoch": 2.5737754791603287, "grad_norm": 0.5387968420982361, "learning_rate": 1.5159004631508739e-05, "loss": 0.0518, "step": 4230 }, { "epoch": 2.5743839367204138, "grad_norm": 0.4985266327857971, "learning_rate": 1.514756205981059e-05, "loss": 0.0372, "step": 4231 }, { "epoch": 2.574992394280499, "grad_norm": 0.8584986925125122, "learning_rate": 1.5136121931106378e-05, "loss": 0.0439, "step": 4232 }, { "epoch": 2.575600851840584, "grad_norm": 0.4893166720867157, "learning_rate": 1.5124684248232784e-05, "loss": 0.0351, "step": 4233 }, { "epoch": 2.576209309400669, "grad_norm": 0.5629805326461792, "learning_rate": 1.511324901402586e-05, "loss": 0.046, "step": 4234 }, { "epoch": 2.5768177669607546, "grad_norm": 0.593343198299408, "learning_rate": 1.5101816231321092e-05, "loss": 0.0604, "step": 4235 }, { "epoch": 2.5774262245208397, "grad_norm": 0.45182177424430847, "learning_rate": 1.5090385902953325e-05, "loss": 0.0377, "step": 4236 }, { "epoch": 2.578034682080925, "grad_norm": 0.4370990991592407, "learning_rate": 1.5078958031756784e-05, "loss": 0.0393, "step": 4237 }, { "epoch": 2.57864313964101, "grad_norm": 0.5261332392692566, "learning_rate": 1.506753262056514e-05, "loss": 0.0346, "step": 4238 }, { "epoch": 2.579251597201095, "grad_norm": 0.5731834173202515, "learning_rate": 1.5056109672211394e-05, "loss": 0.0374, "step": 4239 }, { "epoch": 2.5798600547611805, "grad_norm": 0.4817114770412445, "learning_rate": 1.5044689189527957e-05, "loss": 0.0364, "step": 4240 }, { "epoch": 2.5804685123212656, "grad_norm": 0.6388393044471741, "learning_rate": 1.5033271175346658e-05, "loss": 0.0571, "step": 4241 }, { "epoch": 2.5810769698813507, "grad_norm": 0.4850732088088989, "learning_rate": 1.5021855632498671e-05, "loss": 0.0417, "step": 4242 }, { "epoch": 2.581685427441436, "grad_norm": 0.5559534430503845, "learning_rate": 1.5010442563814575e-05, "loss": 0.0567, "step": 4243 }, { "epoch": 2.582293885001521, "grad_norm": 0.4856293201446533, "learning_rate": 1.4999031972124333e-05, "loss": 0.0494, "step": 4244 }, { "epoch": 2.5829023425616064, "grad_norm": 0.8523834347724915, "learning_rate": 1.498762386025731e-05, "loss": 0.0328, "step": 4245 }, { "epoch": 2.5835108001216915, "grad_norm": 0.6239566802978516, "learning_rate": 1.4976218231042233e-05, "loss": 0.0505, "step": 4246 }, { "epoch": 2.5841192576817766, "grad_norm": 0.5066648125648499, "learning_rate": 1.496481508730721e-05, "loss": 0.0503, "step": 4247 }, { "epoch": 2.5847277152418617, "grad_norm": 0.6433850526809692, "learning_rate": 1.495341443187977e-05, "loss": 0.0472, "step": 4248 }, { "epoch": 2.585336172801947, "grad_norm": 0.5411747097969055, "learning_rate": 1.4942016267586789e-05, "loss": 0.0525, "step": 4249 }, { "epoch": 2.5859446303620324, "grad_norm": 0.5121768116950989, "learning_rate": 1.4930620597254524e-05, "loss": 0.0443, "step": 4250 }, { "epoch": 2.5865530879221175, "grad_norm": 0.40693843364715576, "learning_rate": 1.4919227423708653e-05, "loss": 0.0301, "step": 4251 }, { "epoch": 2.5871615454822026, "grad_norm": 0.49290603399276733, "learning_rate": 1.490783674977419e-05, "loss": 0.0389, "step": 4252 }, { "epoch": 2.5877700030422877, "grad_norm": 0.5338709354400635, "learning_rate": 1.489644857827554e-05, "loss": 0.0364, "step": 4253 }, { "epoch": 2.5883784606023728, "grad_norm": 0.5304934978485107, "learning_rate": 1.4885062912036517e-05, "loss": 0.0427, "step": 4254 }, { "epoch": 2.5889869181624583, "grad_norm": 0.47712579369544983, "learning_rate": 1.4873679753880284e-05, "loss": 0.0442, "step": 4255 }, { "epoch": 2.5895953757225434, "grad_norm": 0.5361647605895996, "learning_rate": 1.4862299106629385e-05, "loss": 0.0473, "step": 4256 }, { "epoch": 2.5902038332826285, "grad_norm": 0.49737563729286194, "learning_rate": 1.4850920973105736e-05, "loss": 0.0344, "step": 4257 }, { "epoch": 2.5908122908427136, "grad_norm": 0.5141322612762451, "learning_rate": 1.483954535613066e-05, "loss": 0.0588, "step": 4258 }, { "epoch": 2.5914207484027987, "grad_norm": 0.4289071559906006, "learning_rate": 1.4828172258524828e-05, "loss": 0.0318, "step": 4259 }, { "epoch": 2.592029205962884, "grad_norm": 0.4381569027900696, "learning_rate": 1.4816801683108283e-05, "loss": 0.0371, "step": 4260 }, { "epoch": 2.5926376635229693, "grad_norm": 0.6164323091506958, "learning_rate": 1.4805433632700475e-05, "loss": 0.0378, "step": 4261 }, { "epoch": 2.5932461210830544, "grad_norm": 0.4577103555202484, "learning_rate": 1.4794068110120196e-05, "loss": 0.0393, "step": 4262 }, { "epoch": 2.5938545786431395, "grad_norm": 0.42026767134666443, "learning_rate": 1.4782705118185608e-05, "loss": 0.0397, "step": 4263 }, { "epoch": 2.5944630362032246, "grad_norm": 0.4118916690349579, "learning_rate": 1.4771344659714287e-05, "loss": 0.0307, "step": 4264 }, { "epoch": 2.59507149376331, "grad_norm": 0.4349444508552551, "learning_rate": 1.4759986737523135e-05, "loss": 0.0329, "step": 4265 }, { "epoch": 2.5956799513233952, "grad_norm": 0.6133854389190674, "learning_rate": 1.4748631354428444e-05, "loss": 0.0446, "step": 4266 }, { "epoch": 2.5962884088834803, "grad_norm": 0.5277345776557922, "learning_rate": 1.473727851324588e-05, "loss": 0.0432, "step": 4267 }, { "epoch": 2.5968968664435654, "grad_norm": 0.6331156492233276, "learning_rate": 1.472592821679048e-05, "loss": 0.0598, "step": 4268 }, { "epoch": 2.5975053240036505, "grad_norm": 0.5062826871871948, "learning_rate": 1.4714580467876633e-05, "loss": 0.0414, "step": 4269 }, { "epoch": 2.598113781563736, "grad_norm": 0.43930584192276, "learning_rate": 1.4703235269318107e-05, "loss": 0.0425, "step": 4270 }, { "epoch": 2.598722239123821, "grad_norm": 0.36331331729888916, "learning_rate": 1.4691892623928052e-05, "loss": 0.0169, "step": 4271 }, { "epoch": 2.5993306966839063, "grad_norm": 0.49878576397895813, "learning_rate": 1.468055253451896e-05, "loss": 0.0363, "step": 4272 }, { "epoch": 2.5999391542439914, "grad_norm": 0.5917057991027832, "learning_rate": 1.4669215003902704e-05, "loss": 0.063, "step": 4273 }, { "epoch": 2.6005476118040765, "grad_norm": 0.6224356293678284, "learning_rate": 1.465788003489052e-05, "loss": 0.045, "step": 4274 }, { "epoch": 2.601156069364162, "grad_norm": 0.6835285425186157, "learning_rate": 1.4646547630293003e-05, "loss": 0.0451, "step": 4275 }, { "epoch": 2.601764526924247, "grad_norm": 0.5661000609397888, "learning_rate": 1.463521779292012e-05, "loss": 0.0451, "step": 4276 }, { "epoch": 2.602372984484332, "grad_norm": 0.4806481897830963, "learning_rate": 1.4623890525581204e-05, "loss": 0.0418, "step": 4277 }, { "epoch": 2.6029814420444173, "grad_norm": 0.5451545119285583, "learning_rate": 1.4612565831084937e-05, "loss": 0.0359, "step": 4278 }, { "epoch": 2.6035898996045024, "grad_norm": 0.6044538021087646, "learning_rate": 1.4601243712239376e-05, "loss": 0.0408, "step": 4279 }, { "epoch": 2.604198357164588, "grad_norm": 0.5169386863708496, "learning_rate": 1.4589924171851926e-05, "loss": 0.0313, "step": 4280 }, { "epoch": 2.604806814724673, "grad_norm": 0.5409014821052551, "learning_rate": 1.4578607212729384e-05, "loss": 0.0353, "step": 4281 }, { "epoch": 2.605415272284758, "grad_norm": 1.0071747303009033, "learning_rate": 1.4567292837677855e-05, "loss": 0.1323, "step": 4282 }, { "epoch": 2.606023729844843, "grad_norm": 0.5074636936187744, "learning_rate": 1.4555981049502849e-05, "loss": 0.0353, "step": 4283 }, { "epoch": 2.6066321874049283, "grad_norm": 0.4935401976108551, "learning_rate": 1.4544671851009212e-05, "loss": 0.0389, "step": 4284 }, { "epoch": 2.607240644965014, "grad_norm": 0.46502718329429626, "learning_rate": 1.4533365245001168e-05, "loss": 0.0413, "step": 4285 }, { "epoch": 2.607849102525099, "grad_norm": 0.49090376496315, "learning_rate": 1.452206123428225e-05, "loss": 0.0323, "step": 4286 }, { "epoch": 2.608457560085184, "grad_norm": 0.3921135365962982, "learning_rate": 1.4510759821655423e-05, "loss": 0.0423, "step": 4287 }, { "epoch": 2.609066017645269, "grad_norm": 0.6183372139930725, "learning_rate": 1.449946100992294e-05, "loss": 0.0389, "step": 4288 }, { "epoch": 2.6096744752053542, "grad_norm": 0.4686357080936432, "learning_rate": 1.4488164801886453e-05, "loss": 0.0434, "step": 4289 }, { "epoch": 2.6102829327654398, "grad_norm": 0.49797457456588745, "learning_rate": 1.447687120034692e-05, "loss": 0.0394, "step": 4290 }, { "epoch": 2.610891390325525, "grad_norm": 0.5921012163162231, "learning_rate": 1.4465580208104722e-05, "loss": 0.0566, "step": 4291 }, { "epoch": 2.61149984788561, "grad_norm": 0.5064262747764587, "learning_rate": 1.4454291827959526e-05, "loss": 0.0424, "step": 4292 }, { "epoch": 2.612108305445695, "grad_norm": 0.45030635595321655, "learning_rate": 1.4443006062710391e-05, "loss": 0.0361, "step": 4293 }, { "epoch": 2.61271676300578, "grad_norm": 0.8079550266265869, "learning_rate": 1.4431722915155716e-05, "loss": 0.045, "step": 4294 }, { "epoch": 2.6133252205658657, "grad_norm": 0.5597800612449646, "learning_rate": 1.4420442388093258e-05, "loss": 0.0415, "step": 4295 }, { "epoch": 2.613933678125951, "grad_norm": 0.5742923617362976, "learning_rate": 1.4409164484320092e-05, "loss": 0.059, "step": 4296 }, { "epoch": 2.614542135686036, "grad_norm": 0.4569978713989258, "learning_rate": 1.4397889206632706e-05, "loss": 0.0445, "step": 4297 }, { "epoch": 2.615150593246121, "grad_norm": 0.4496994614601135, "learning_rate": 1.4386616557826869e-05, "loss": 0.0386, "step": 4298 }, { "epoch": 2.615759050806206, "grad_norm": 0.531541645526886, "learning_rate": 1.4375346540697738e-05, "loss": 0.0294, "step": 4299 }, { "epoch": 2.6163675083662916, "grad_norm": 0.4035661816596985, "learning_rate": 1.4364079158039807e-05, "loss": 0.0337, "step": 4300 }, { "epoch": 2.6169759659263767, "grad_norm": 0.42909929156303406, "learning_rate": 1.4352814412646926e-05, "loss": 0.0295, "step": 4301 }, { "epoch": 2.617584423486462, "grad_norm": 0.6985564231872559, "learning_rate": 1.4341552307312266e-05, "loss": 0.0477, "step": 4302 }, { "epoch": 2.618192881046547, "grad_norm": 0.5794458985328674, "learning_rate": 1.4330292844828368e-05, "loss": 0.0554, "step": 4303 }, { "epoch": 2.618801338606632, "grad_norm": 0.6460904479026794, "learning_rate": 1.431903602798711e-05, "loss": 0.0393, "step": 4304 }, { "epoch": 2.6194097961667175, "grad_norm": 0.5019935965538025, "learning_rate": 1.4307781859579721e-05, "loss": 0.0309, "step": 4305 }, { "epoch": 2.6200182537268026, "grad_norm": 0.5265465974807739, "learning_rate": 1.4296530342396741e-05, "loss": 0.0407, "step": 4306 }, { "epoch": 2.6206267112868877, "grad_norm": 0.6341969966888428, "learning_rate": 1.4285281479228107e-05, "loss": 0.0558, "step": 4307 }, { "epoch": 2.621235168846973, "grad_norm": 0.5350863337516785, "learning_rate": 1.4274035272863051e-05, "loss": 0.0376, "step": 4308 }, { "epoch": 2.621843626407058, "grad_norm": 0.5693925023078918, "learning_rate": 1.4262791726090163e-05, "loss": 0.0418, "step": 4309 }, { "epoch": 2.6224520839671435, "grad_norm": 0.4471037983894348, "learning_rate": 1.425155084169738e-05, "loss": 0.0379, "step": 4310 }, { "epoch": 2.6230605415272286, "grad_norm": 0.428018718957901, "learning_rate": 1.424031262247198e-05, "loss": 0.0334, "step": 4311 }, { "epoch": 2.6236689990873137, "grad_norm": 0.5358444452285767, "learning_rate": 1.4229077071200542e-05, "loss": 0.0471, "step": 4312 }, { "epoch": 2.6242774566473988, "grad_norm": 0.515167772769928, "learning_rate": 1.4217844190669058e-05, "loss": 0.0456, "step": 4313 }, { "epoch": 2.624885914207484, "grad_norm": 0.5426524877548218, "learning_rate": 1.4206613983662781e-05, "loss": 0.0412, "step": 4314 }, { "epoch": 2.6254943717675694, "grad_norm": 0.6369853019714355, "learning_rate": 1.4195386452966359e-05, "loss": 0.0569, "step": 4315 }, { "epoch": 2.6261028293276545, "grad_norm": 0.5484540462493896, "learning_rate": 1.4184161601363716e-05, "loss": 0.0377, "step": 4316 }, { "epoch": 2.6267112868877396, "grad_norm": 0.4859999418258667, "learning_rate": 1.4172939431638188e-05, "loss": 0.0353, "step": 4317 }, { "epoch": 2.6273197444478247, "grad_norm": 0.5194180011749268, "learning_rate": 1.4161719946572377e-05, "loss": 0.0401, "step": 4318 }, { "epoch": 2.6279282020079098, "grad_norm": 0.6942944526672363, "learning_rate": 1.415050314894826e-05, "loss": 0.0614, "step": 4319 }, { "epoch": 2.6285366595679953, "grad_norm": 0.4476464092731476, "learning_rate": 1.4139289041547132e-05, "loss": 0.0307, "step": 4320 }, { "epoch": 2.6291451171280804, "grad_norm": 0.5009614825248718, "learning_rate": 1.4128077627149633e-05, "loss": 0.0404, "step": 4321 }, { "epoch": 2.6297535746881655, "grad_norm": 0.47980043292045593, "learning_rate": 1.4116868908535702e-05, "loss": 0.0377, "step": 4322 }, { "epoch": 2.6303620322482506, "grad_norm": 0.5018118619918823, "learning_rate": 1.4105662888484667e-05, "loss": 0.0424, "step": 4323 }, { "epoch": 2.6309704898083357, "grad_norm": 0.602103590965271, "learning_rate": 1.4094459569775128e-05, "loss": 0.0571, "step": 4324 }, { "epoch": 2.6315789473684212, "grad_norm": 0.5462225675582886, "learning_rate": 1.4083258955185053e-05, "loss": 0.059, "step": 4325 }, { "epoch": 2.6321874049285063, "grad_norm": 0.4753000736236572, "learning_rate": 1.4072061047491721e-05, "loss": 0.0439, "step": 4326 }, { "epoch": 2.6327958624885914, "grad_norm": 0.45958274602890015, "learning_rate": 1.4060865849471764e-05, "loss": 0.0353, "step": 4327 }, { "epoch": 2.6334043200486765, "grad_norm": 0.5752948522567749, "learning_rate": 1.4049673363901097e-05, "loss": 0.0462, "step": 4328 }, { "epoch": 2.6340127776087616, "grad_norm": 0.4886283874511719, "learning_rate": 1.4038483593555007e-05, "loss": 0.0483, "step": 4329 }, { "epoch": 2.634621235168847, "grad_norm": 0.4635407030582428, "learning_rate": 1.4027296541208084e-05, "loss": 0.038, "step": 4330 }, { "epoch": 2.6352296927289323, "grad_norm": 0.4888119399547577, "learning_rate": 1.4016112209634258e-05, "loss": 0.0394, "step": 4331 }, { "epoch": 2.6358381502890174, "grad_norm": 0.52603679895401, "learning_rate": 1.400493060160677e-05, "loss": 0.0452, "step": 4332 }, { "epoch": 2.6364466078491025, "grad_norm": 0.5692835450172424, "learning_rate": 1.3993751719898207e-05, "loss": 0.0364, "step": 4333 }, { "epoch": 2.6370550654091875, "grad_norm": 0.5845534205436707, "learning_rate": 1.3982575567280442e-05, "loss": 0.0503, "step": 4334 }, { "epoch": 2.637663522969273, "grad_norm": 0.4657052755355835, "learning_rate": 1.397140214652471e-05, "loss": 0.0328, "step": 4335 }, { "epoch": 2.638271980529358, "grad_norm": 0.4424096941947937, "learning_rate": 1.3960231460401552e-05, "loss": 0.0287, "step": 4336 }, { "epoch": 2.6388804380894433, "grad_norm": 0.4316795766353607, "learning_rate": 1.3949063511680837e-05, "loss": 0.0282, "step": 4337 }, { "epoch": 2.6394888956495284, "grad_norm": 0.5137184262275696, "learning_rate": 1.3937898303131742e-05, "loss": 0.0395, "step": 4338 }, { "epoch": 2.6400973532096135, "grad_norm": 0.5804075002670288, "learning_rate": 1.392673583752277e-05, "loss": 0.0472, "step": 4339 }, { "epoch": 2.640705810769699, "grad_norm": 0.43918368220329285, "learning_rate": 1.3915576117621758e-05, "loss": 0.0335, "step": 4340 }, { "epoch": 2.641314268329784, "grad_norm": 0.4672797918319702, "learning_rate": 1.3904419146195846e-05, "loss": 0.0429, "step": 4341 }, { "epoch": 2.641922725889869, "grad_norm": 0.515037477016449, "learning_rate": 1.3893264926011502e-05, "loss": 0.0467, "step": 4342 }, { "epoch": 2.6425311834499543, "grad_norm": 0.5160783529281616, "learning_rate": 1.3882113459834512e-05, "loss": 0.0379, "step": 4343 }, { "epoch": 2.6431396410100394, "grad_norm": 0.5049687623977661, "learning_rate": 1.3870964750429954e-05, "loss": 0.0434, "step": 4344 }, { "epoch": 2.643748098570125, "grad_norm": 0.5025326013565063, "learning_rate": 1.3859818800562263e-05, "loss": 0.051, "step": 4345 }, { "epoch": 2.64435655613021, "grad_norm": 0.5900479555130005, "learning_rate": 1.384867561299516e-05, "loss": 0.0468, "step": 4346 }, { "epoch": 2.644965013690295, "grad_norm": 0.48302316665649414, "learning_rate": 1.3837535190491696e-05, "loss": 0.041, "step": 4347 }, { "epoch": 2.6455734712503802, "grad_norm": 0.5139932632446289, "learning_rate": 1.3826397535814242e-05, "loss": 0.0532, "step": 4348 }, { "epoch": 2.6461819288104653, "grad_norm": 0.40311169624328613, "learning_rate": 1.3815262651724448e-05, "loss": 0.0332, "step": 4349 }, { "epoch": 2.646790386370551, "grad_norm": 0.512266218662262, "learning_rate": 1.3804130540983317e-05, "loss": 0.0447, "step": 4350 }, { "epoch": 2.647398843930636, "grad_norm": 0.48747456073760986, "learning_rate": 1.3793001206351142e-05, "loss": 0.0312, "step": 4351 }, { "epoch": 2.648007301490721, "grad_norm": 0.3904288709163666, "learning_rate": 1.3781874650587536e-05, "loss": 0.0294, "step": 4352 }, { "epoch": 2.648615759050806, "grad_norm": 0.6491031050682068, "learning_rate": 1.3770750876451427e-05, "loss": 0.046, "step": 4353 }, { "epoch": 2.6492242166108912, "grad_norm": 0.4241449534893036, "learning_rate": 1.3759629886701047e-05, "loss": 0.0422, "step": 4354 }, { "epoch": 2.649832674170977, "grad_norm": 0.48629888892173767, "learning_rate": 1.3748511684093926e-05, "loss": 0.0308, "step": 4355 }, { "epoch": 2.650441131731062, "grad_norm": 0.5005238056182861, "learning_rate": 1.3737396271386921e-05, "loss": 0.0566, "step": 4356 }, { "epoch": 2.651049589291147, "grad_norm": 0.6130560040473938, "learning_rate": 1.3726283651336194e-05, "loss": 0.064, "step": 4357 }, { "epoch": 2.651658046851232, "grad_norm": 0.578168511390686, "learning_rate": 1.3715173826697209e-05, "loss": 0.0479, "step": 4358 }, { "epoch": 2.652266504411317, "grad_norm": 0.5515162944793701, "learning_rate": 1.3704066800224741e-05, "loss": 0.0452, "step": 4359 }, { "epoch": 2.6528749619714027, "grad_norm": 0.5804712772369385, "learning_rate": 1.369296257467288e-05, "loss": 0.0467, "step": 4360 }, { "epoch": 2.653483419531488, "grad_norm": 0.45583152770996094, "learning_rate": 1.3681861152794984e-05, "loss": 0.0318, "step": 4361 }, { "epoch": 2.654091877091573, "grad_norm": 0.8064979910850525, "learning_rate": 1.3670762537343765e-05, "loss": 0.0576, "step": 4362 }, { "epoch": 2.654700334651658, "grad_norm": 0.48464739322662354, "learning_rate": 1.3659666731071207e-05, "loss": 0.0436, "step": 4363 }, { "epoch": 2.655308792211743, "grad_norm": 0.48261159658432007, "learning_rate": 1.3648573736728627e-05, "loss": 0.03, "step": 4364 }, { "epoch": 2.6559172497718286, "grad_norm": 0.5928403735160828, "learning_rate": 1.3637483557066583e-05, "loss": 0.0423, "step": 4365 }, { "epoch": 2.6565257073319137, "grad_norm": 0.521212637424469, "learning_rate": 1.3626396194835026e-05, "loss": 0.0518, "step": 4366 }, { "epoch": 2.657134164891999, "grad_norm": 0.47702714800834656, "learning_rate": 1.3615311652783127e-05, "loss": 0.0327, "step": 4367 }, { "epoch": 2.657742622452084, "grad_norm": 0.5482526421546936, "learning_rate": 1.3604229933659402e-05, "loss": 0.0494, "step": 4368 }, { "epoch": 2.658351080012169, "grad_norm": 0.45253828167915344, "learning_rate": 1.3593151040211654e-05, "loss": 0.0375, "step": 4369 }, { "epoch": 2.6589595375722546, "grad_norm": 0.5163220167160034, "learning_rate": 1.3582074975186998e-05, "loss": 0.034, "step": 4370 }, { "epoch": 2.6595679951323397, "grad_norm": 0.4799509346485138, "learning_rate": 1.3571001741331815e-05, "loss": 0.035, "step": 4371 }, { "epoch": 2.6601764526924248, "grad_norm": 3.0906639099121094, "learning_rate": 1.3559931341391815e-05, "loss": 0.0468, "step": 4372 }, { "epoch": 2.66078491025251, "grad_norm": 0.5727887153625488, "learning_rate": 1.3548863778111998e-05, "loss": 0.0482, "step": 4373 }, { "epoch": 2.661393367812595, "grad_norm": 0.5182603597640991, "learning_rate": 1.3537799054236666e-05, "loss": 0.0403, "step": 4374 }, { "epoch": 2.6620018253726805, "grad_norm": 0.4546807110309601, "learning_rate": 1.3526737172509383e-05, "loss": 0.0396, "step": 4375 }, { "epoch": 2.6626102829327656, "grad_norm": 0.5297713279724121, "learning_rate": 1.3515678135673072e-05, "loss": 0.0366, "step": 4376 }, { "epoch": 2.6632187404928507, "grad_norm": 0.4167693853378296, "learning_rate": 1.350462194646988e-05, "loss": 0.0333, "step": 4377 }, { "epoch": 2.6638271980529358, "grad_norm": 0.4427233040332794, "learning_rate": 1.3493568607641294e-05, "loss": 0.0403, "step": 4378 }, { "epoch": 2.664435655613021, "grad_norm": 0.4933980405330658, "learning_rate": 1.3482518121928083e-05, "loss": 0.0355, "step": 4379 }, { "epoch": 2.6650441131731064, "grad_norm": 0.6081718802452087, "learning_rate": 1.3471470492070315e-05, "loss": 0.0516, "step": 4380 }, { "epoch": 2.6656525707331915, "grad_norm": 0.5365004539489746, "learning_rate": 1.3460425720807316e-05, "loss": 0.0451, "step": 4381 }, { "epoch": 2.6662610282932766, "grad_norm": 0.4933543801307678, "learning_rate": 1.3449383810877761e-05, "loss": 0.0492, "step": 4382 }, { "epoch": 2.6668694858533617, "grad_norm": 0.48126405477523804, "learning_rate": 1.3438344765019558e-05, "loss": 0.0424, "step": 4383 }, { "epoch": 2.667477943413447, "grad_norm": 0.5277286171913147, "learning_rate": 1.342730858596995e-05, "loss": 0.0428, "step": 4384 }, { "epoch": 2.6680864009735323, "grad_norm": 0.45332902669906616, "learning_rate": 1.341627527646542e-05, "loss": 0.037, "step": 4385 }, { "epoch": 2.6686948585336174, "grad_norm": 0.4423314034938812, "learning_rate": 1.340524483924181e-05, "loss": 0.0283, "step": 4386 }, { "epoch": 2.6693033160937025, "grad_norm": 0.6126822829246521, "learning_rate": 1.3394217277034171e-05, "loss": 0.0286, "step": 4387 }, { "epoch": 2.6699117736537876, "grad_norm": 0.5217598080635071, "learning_rate": 1.3383192592576898e-05, "loss": 0.0424, "step": 4388 }, { "epoch": 2.6705202312138727, "grad_norm": 0.5640128254890442, "learning_rate": 1.3372170788603649e-05, "loss": 0.0493, "step": 4389 }, { "epoch": 2.6711286887739583, "grad_norm": 0.504953145980835, "learning_rate": 1.3361151867847382e-05, "loss": 0.0329, "step": 4390 }, { "epoch": 2.6717371463340434, "grad_norm": 0.4337262511253357, "learning_rate": 1.3350135833040305e-05, "loss": 0.0376, "step": 4391 }, { "epoch": 2.6723456038941285, "grad_norm": 0.5187790989875793, "learning_rate": 1.3339122686913968e-05, "loss": 0.0401, "step": 4392 }, { "epoch": 2.6729540614542135, "grad_norm": 0.48012837767601013, "learning_rate": 1.3328112432199144e-05, "loss": 0.039, "step": 4393 }, { "epoch": 2.6735625190142986, "grad_norm": 0.5445698499679565, "learning_rate": 1.331710507162594e-05, "loss": 0.062, "step": 4394 }, { "epoch": 2.674170976574384, "grad_norm": 0.45752736926078796, "learning_rate": 1.3306100607923687e-05, "loss": 0.04, "step": 4395 }, { "epoch": 2.6747794341344693, "grad_norm": 0.5074790120124817, "learning_rate": 1.3295099043821085e-05, "loss": 0.0531, "step": 4396 }, { "epoch": 2.6753878916945544, "grad_norm": 0.46912896633148193, "learning_rate": 1.3284100382046022e-05, "loss": 0.0366, "step": 4397 }, { "epoch": 2.6759963492546395, "grad_norm": 0.4760879576206207, "learning_rate": 1.3273104625325722e-05, "loss": 0.0374, "step": 4398 }, { "epoch": 2.6766048068147246, "grad_norm": 0.6422132253646851, "learning_rate": 1.326211177638667e-05, "loss": 0.0607, "step": 4399 }, { "epoch": 2.67721326437481, "grad_norm": 0.47899577021598816, "learning_rate": 1.3251121837954655e-05, "loss": 0.0411, "step": 4400 }, { "epoch": 2.677821721934895, "grad_norm": 0.6358346939086914, "learning_rate": 1.3240134812754681e-05, "loss": 0.0489, "step": 4401 }, { "epoch": 2.6784301794949803, "grad_norm": 0.5330213308334351, "learning_rate": 1.3229150703511122e-05, "loss": 0.0398, "step": 4402 }, { "epoch": 2.6790386370550654, "grad_norm": 0.4606907367706299, "learning_rate": 1.3218169512947542e-05, "loss": 0.0269, "step": 4403 }, { "epoch": 2.6796470946151505, "grad_norm": 0.5010808110237122, "learning_rate": 1.3207191243786834e-05, "loss": 0.0487, "step": 4404 }, { "epoch": 2.680255552175236, "grad_norm": 0.4977697432041168, "learning_rate": 1.319621589875115e-05, "loss": 0.0378, "step": 4405 }, { "epoch": 2.6808640097353207, "grad_norm": 0.41439104080200195, "learning_rate": 1.3185243480561926e-05, "loss": 0.0436, "step": 4406 }, { "epoch": 2.6814724672954062, "grad_norm": 0.35358157753944397, "learning_rate": 1.3174273991939845e-05, "loss": 0.0269, "step": 4407 }, { "epoch": 2.6820809248554913, "grad_norm": 0.4562048017978668, "learning_rate": 1.3163307435604893e-05, "loss": 0.0383, "step": 4408 }, { "epoch": 2.6826893824155764, "grad_norm": 0.5168052911758423, "learning_rate": 1.3152343814276318e-05, "loss": 0.046, "step": 4409 }, { "epoch": 2.683297839975662, "grad_norm": 0.5310634970664978, "learning_rate": 1.3141383130672658e-05, "loss": 0.0405, "step": 4410 }, { "epoch": 2.6839062975357466, "grad_norm": 0.5165497660636902, "learning_rate": 1.3130425387511667e-05, "loss": 0.0516, "step": 4411 }, { "epoch": 2.684514755095832, "grad_norm": 0.4916205406188965, "learning_rate": 1.3119470587510451e-05, "loss": 0.0372, "step": 4412 }, { "epoch": 2.6851232126559172, "grad_norm": 0.40354472398757935, "learning_rate": 1.3108518733385314e-05, "loss": 0.0213, "step": 4413 }, { "epoch": 2.6857316702160023, "grad_norm": 0.4055176079273224, "learning_rate": 1.309756982785187e-05, "loss": 0.0283, "step": 4414 }, { "epoch": 2.686340127776088, "grad_norm": 0.5336512923240662, "learning_rate": 1.3086623873624992e-05, "loss": 0.041, "step": 4415 }, { "epoch": 2.6869485853361725, "grad_norm": 0.4223940670490265, "learning_rate": 1.3075680873418828e-05, "loss": 0.0254, "step": 4416 }, { "epoch": 2.687557042896258, "grad_norm": 0.6376833319664001, "learning_rate": 1.306474082994677e-05, "loss": 0.0409, "step": 4417 }, { "epoch": 2.688165500456343, "grad_norm": 0.4131200611591339, "learning_rate": 1.3053803745921498e-05, "loss": 0.0309, "step": 4418 }, { "epoch": 2.6887739580164283, "grad_norm": 0.5285510420799255, "learning_rate": 1.3042869624054955e-05, "loss": 0.0427, "step": 4419 }, { "epoch": 2.689382415576514, "grad_norm": 0.4723327159881592, "learning_rate": 1.3031938467058358e-05, "loss": 0.0292, "step": 4420 }, { "epoch": 2.6899908731365985, "grad_norm": 0.5857917070388794, "learning_rate": 1.3021010277642145e-05, "loss": 0.0493, "step": 4421 }, { "epoch": 2.690599330696684, "grad_norm": 0.5390817523002625, "learning_rate": 1.3010085058516097e-05, "loss": 0.043, "step": 4422 }, { "epoch": 2.691207788256769, "grad_norm": 0.45317864418029785, "learning_rate": 1.2999162812389181e-05, "loss": 0.029, "step": 4423 }, { "epoch": 2.691816245816854, "grad_norm": 0.4660782217979431, "learning_rate": 1.2988243541969667e-05, "loss": 0.0258, "step": 4424 }, { "epoch": 2.6924247033769397, "grad_norm": 0.5454357266426086, "learning_rate": 1.297732724996508e-05, "loss": 0.0432, "step": 4425 }, { "epoch": 2.6930331609370244, "grad_norm": 0.4717608094215393, "learning_rate": 1.2966413939082214e-05, "loss": 0.0356, "step": 4426 }, { "epoch": 2.69364161849711, "grad_norm": 0.43902650475502014, "learning_rate": 1.2955503612027086e-05, "loss": 0.0321, "step": 4427 }, { "epoch": 2.694250076057195, "grad_norm": 0.5439276695251465, "learning_rate": 1.2944596271505046e-05, "loss": 0.0477, "step": 4428 }, { "epoch": 2.69485853361728, "grad_norm": 0.5469129085540771, "learning_rate": 1.2933691920220626e-05, "loss": 0.0429, "step": 4429 }, { "epoch": 2.695466991177365, "grad_norm": 0.6971737742424011, "learning_rate": 1.2922790560877667e-05, "loss": 0.067, "step": 4430 }, { "epoch": 2.6960754487374503, "grad_norm": 0.5088085532188416, "learning_rate": 1.2911892196179231e-05, "loss": 0.0499, "step": 4431 }, { "epoch": 2.696683906297536, "grad_norm": 0.502051591873169, "learning_rate": 1.2900996828827693e-05, "loss": 0.0361, "step": 4432 }, { "epoch": 2.697292363857621, "grad_norm": 0.4716339707374573, "learning_rate": 1.2890104461524619e-05, "loss": 0.032, "step": 4433 }, { "epoch": 2.697900821417706, "grad_norm": 0.4593643546104431, "learning_rate": 1.2879215096970871e-05, "loss": 0.0321, "step": 4434 }, { "epoch": 2.698509278977791, "grad_norm": 0.42600390315055847, "learning_rate": 1.2868328737866563e-05, "loss": 0.049, "step": 4435 }, { "epoch": 2.6991177365378762, "grad_norm": 0.5230850577354431, "learning_rate": 1.2857445386911064e-05, "loss": 0.0362, "step": 4436 }, { "epoch": 2.6997261940979618, "grad_norm": 0.5436407923698425, "learning_rate": 1.284656504680296e-05, "loss": 0.0493, "step": 4437 }, { "epoch": 2.700334651658047, "grad_norm": 0.5182242393493652, "learning_rate": 1.283568772024017e-05, "loss": 0.0354, "step": 4438 }, { "epoch": 2.700943109218132, "grad_norm": 0.4881172776222229, "learning_rate": 1.2824813409919777e-05, "loss": 0.0473, "step": 4439 }, { "epoch": 2.701551566778217, "grad_norm": 0.5372771620750427, "learning_rate": 1.2813942118538181e-05, "loss": 0.0472, "step": 4440 }, { "epoch": 2.702160024338302, "grad_norm": 0.31403908133506775, "learning_rate": 1.2803073848790983e-05, "loss": 0.0213, "step": 4441 }, { "epoch": 2.7027684818983877, "grad_norm": 0.5783568024635315, "learning_rate": 1.2792208603373096e-05, "loss": 0.046, "step": 4442 }, { "epoch": 2.703376939458473, "grad_norm": 0.48002150654792786, "learning_rate": 1.278134638497862e-05, "loss": 0.0315, "step": 4443 }, { "epoch": 2.703985397018558, "grad_norm": 0.4475983679294586, "learning_rate": 1.277048719630094e-05, "loss": 0.0338, "step": 4444 }, { "epoch": 2.704593854578643, "grad_norm": 0.4856289029121399, "learning_rate": 1.2759631040032688e-05, "loss": 0.0384, "step": 4445 }, { "epoch": 2.705202312138728, "grad_norm": 0.34708070755004883, "learning_rate": 1.274877791886574e-05, "loss": 0.0199, "step": 4446 }, { "epoch": 2.7058107696988136, "grad_norm": 0.4791733920574188, "learning_rate": 1.2737927835491196e-05, "loss": 0.0384, "step": 4447 }, { "epoch": 2.7064192272588987, "grad_norm": 0.4502567648887634, "learning_rate": 1.2727080792599455e-05, "loss": 0.0389, "step": 4448 }, { "epoch": 2.707027684818984, "grad_norm": 0.583976149559021, "learning_rate": 1.2716236792880112e-05, "loss": 0.0401, "step": 4449 }, { "epoch": 2.707636142379069, "grad_norm": 0.5464617013931274, "learning_rate": 1.270539583902203e-05, "loss": 0.0389, "step": 4450 }, { "epoch": 2.708244599939154, "grad_norm": 0.4903790354728699, "learning_rate": 1.2694557933713316e-05, "loss": 0.0391, "step": 4451 }, { "epoch": 2.7088530574992395, "grad_norm": 0.42102527618408203, "learning_rate": 1.2683723079641329e-05, "loss": 0.0348, "step": 4452 }, { "epoch": 2.7094615150593246, "grad_norm": 0.44486379623413086, "learning_rate": 1.2672891279492638e-05, "loss": 0.0285, "step": 4453 }, { "epoch": 2.7100699726194097, "grad_norm": 0.5711926221847534, "learning_rate": 1.2662062535953095e-05, "loss": 0.0396, "step": 4454 }, { "epoch": 2.710678430179495, "grad_norm": 0.5700972676277161, "learning_rate": 1.2651236851707768e-05, "loss": 0.0475, "step": 4455 }, { "epoch": 2.71128688773958, "grad_norm": 0.4372463822364807, "learning_rate": 1.2640414229440983e-05, "loss": 0.0364, "step": 4456 }, { "epoch": 2.7118953452996655, "grad_norm": 0.5272347927093506, "learning_rate": 1.2629594671836292e-05, "loss": 0.0545, "step": 4457 }, { "epoch": 2.7125038028597506, "grad_norm": 0.49635735154151917, "learning_rate": 1.2618778181576513e-05, "loss": 0.0401, "step": 4458 }, { "epoch": 2.7131122604198357, "grad_norm": 0.430007666349411, "learning_rate": 1.260796476134366e-05, "loss": 0.0336, "step": 4459 }, { "epoch": 2.7137207179799208, "grad_norm": 0.6025989055633545, "learning_rate": 1.2597154413819018e-05, "loss": 0.0432, "step": 4460 }, { "epoch": 2.714329175540006, "grad_norm": 0.5454067587852478, "learning_rate": 1.2586347141683108e-05, "loss": 0.0383, "step": 4461 }, { "epoch": 2.7149376331000914, "grad_norm": 0.5201361775398254, "learning_rate": 1.2575542947615675e-05, "loss": 0.0388, "step": 4462 }, { "epoch": 2.7155460906601765, "grad_norm": 0.5323150753974915, "learning_rate": 1.256474183429572e-05, "loss": 0.0497, "step": 4463 }, { "epoch": 2.7161545482202616, "grad_norm": 0.4000251889228821, "learning_rate": 1.2553943804401472e-05, "loss": 0.0306, "step": 4464 }, { "epoch": 2.7167630057803467, "grad_norm": 0.47391611337661743, "learning_rate": 1.254314886061037e-05, "loss": 0.0445, "step": 4465 }, { "epoch": 2.717371463340432, "grad_norm": 0.3792179524898529, "learning_rate": 1.2532357005599126e-05, "loss": 0.0361, "step": 4466 }, { "epoch": 2.7179799209005173, "grad_norm": 0.4722985029220581, "learning_rate": 1.2521568242043669e-05, "loss": 0.0487, "step": 4467 }, { "epoch": 2.7185883784606024, "grad_norm": 0.4448596239089966, "learning_rate": 1.251078257261916e-05, "loss": 0.0373, "step": 4468 }, { "epoch": 2.7191968360206875, "grad_norm": 0.482346773147583, "learning_rate": 1.2500000000000006e-05, "loss": 0.0407, "step": 4469 }, { "epoch": 2.7198052935807726, "grad_norm": 0.5089819431304932, "learning_rate": 1.2489220526859816e-05, "loss": 0.0476, "step": 4470 }, { "epoch": 2.7204137511408577, "grad_norm": 0.3887785077095032, "learning_rate": 1.247844415587146e-05, "loss": 0.0293, "step": 4471 }, { "epoch": 2.7210222087009432, "grad_norm": 0.5263569951057434, "learning_rate": 1.2467670889707032e-05, "loss": 0.0463, "step": 4472 }, { "epoch": 2.7216306662610283, "grad_norm": 0.7234339714050293, "learning_rate": 1.2456900731037849e-05, "loss": 0.0493, "step": 4473 }, { "epoch": 2.7222391238211134, "grad_norm": 0.565403938293457, "learning_rate": 1.2446133682534473e-05, "loss": 0.0406, "step": 4474 }, { "epoch": 2.7228475813811985, "grad_norm": 0.476296067237854, "learning_rate": 1.243536974686666e-05, "loss": 0.0374, "step": 4475 }, { "epoch": 2.7234560389412836, "grad_norm": 0.49702188372612, "learning_rate": 1.2424608926703433e-05, "loss": 0.0248, "step": 4476 }, { "epoch": 2.724064496501369, "grad_norm": 0.4639529585838318, "learning_rate": 1.2413851224713022e-05, "loss": 0.0491, "step": 4477 }, { "epoch": 2.7246729540614543, "grad_norm": 0.5371832251548767, "learning_rate": 1.2403096643562891e-05, "loss": 0.0396, "step": 4478 }, { "epoch": 2.7252814116215394, "grad_norm": 0.7577241659164429, "learning_rate": 1.2392345185919737e-05, "loss": 0.0627, "step": 4479 }, { "epoch": 2.7258898691816245, "grad_norm": 0.46722671389579773, "learning_rate": 1.2381596854449457e-05, "loss": 0.0446, "step": 4480 }, { "epoch": 2.7264983267417096, "grad_norm": 0.39545291662216187, "learning_rate": 1.2370851651817194e-05, "loss": 0.0299, "step": 4481 }, { "epoch": 2.727106784301795, "grad_norm": 0.4299526810646057, "learning_rate": 1.2360109580687313e-05, "loss": 0.0343, "step": 4482 }, { "epoch": 2.72771524186188, "grad_norm": 0.3463142514228821, "learning_rate": 1.2349370643723399e-05, "loss": 0.0296, "step": 4483 }, { "epoch": 2.7283236994219653, "grad_norm": 0.42831888794898987, "learning_rate": 1.2338634843588263e-05, "loss": 0.0388, "step": 4484 }, { "epoch": 2.7289321569820504, "grad_norm": 0.5531302690505981, "learning_rate": 1.232790218294394e-05, "loss": 0.0563, "step": 4485 }, { "epoch": 2.7295406145421355, "grad_norm": 0.537441611289978, "learning_rate": 1.231717266445167e-05, "loss": 0.0507, "step": 4486 }, { "epoch": 2.730149072102221, "grad_norm": 0.5240452885627747, "learning_rate": 1.2306446290771934e-05, "loss": 0.0478, "step": 4487 }, { "epoch": 2.730757529662306, "grad_norm": 0.5667134523391724, "learning_rate": 1.2295723064564422e-05, "loss": 0.0346, "step": 4488 }, { "epoch": 2.731365987222391, "grad_norm": 0.6184573769569397, "learning_rate": 1.228500298848806e-05, "loss": 0.0501, "step": 4489 }, { "epoch": 2.7319744447824763, "grad_norm": 0.47468021512031555, "learning_rate": 1.227428606520095e-05, "loss": 0.0299, "step": 4490 }, { "epoch": 2.7325829023425614, "grad_norm": 0.534231424331665, "learning_rate": 1.2263572297360478e-05, "loss": 0.0534, "step": 4491 }, { "epoch": 2.733191359902647, "grad_norm": 0.3944413959980011, "learning_rate": 1.225286168762319e-05, "loss": 0.0295, "step": 4492 }, { "epoch": 2.733799817462732, "grad_norm": 0.45191413164138794, "learning_rate": 1.2242154238644879e-05, "loss": 0.0385, "step": 4493 }, { "epoch": 2.734408275022817, "grad_norm": 0.5744283199310303, "learning_rate": 1.223144995308054e-05, "loss": 0.0453, "step": 4494 }, { "epoch": 2.7350167325829022, "grad_norm": 0.542323887348175, "learning_rate": 1.2220748833584403e-05, "loss": 0.0428, "step": 4495 }, { "epoch": 2.7356251901429873, "grad_norm": 0.4557442367076874, "learning_rate": 1.221005088280987e-05, "loss": 0.0253, "step": 4496 }, { "epoch": 2.736233647703073, "grad_norm": 0.5451676845550537, "learning_rate": 1.219935610340963e-05, "loss": 0.0469, "step": 4497 }, { "epoch": 2.736842105263158, "grad_norm": 0.568136990070343, "learning_rate": 1.2188664498035504e-05, "loss": 0.0496, "step": 4498 }, { "epoch": 2.737450562823243, "grad_norm": 0.451236754655838, "learning_rate": 1.2177976069338592e-05, "loss": 0.0302, "step": 4499 }, { "epoch": 2.738059020383328, "grad_norm": 0.5220772624015808, "learning_rate": 1.2167290819969149e-05, "loss": 0.0521, "step": 4500 }, { "epoch": 2.7386674779434133, "grad_norm": 0.508506715297699, "learning_rate": 1.2156608752576707e-05, "loss": 0.0526, "step": 4501 }, { "epoch": 2.739275935503499, "grad_norm": 0.49221986532211304, "learning_rate": 1.2145929869809944e-05, "loss": 0.0402, "step": 4502 }, { "epoch": 2.739884393063584, "grad_norm": 0.5821833610534668, "learning_rate": 1.213525417431679e-05, "loss": 0.0393, "step": 4503 }, { "epoch": 2.740492850623669, "grad_norm": 0.4816054701805115, "learning_rate": 1.2124581668744372e-05, "loss": 0.0305, "step": 4504 }, { "epoch": 2.741101308183754, "grad_norm": 0.5656616687774658, "learning_rate": 1.2113912355739037e-05, "loss": 0.0549, "step": 4505 }, { "epoch": 2.741709765743839, "grad_norm": 0.5027511715888977, "learning_rate": 1.21032462379463e-05, "loss": 0.042, "step": 4506 }, { "epoch": 2.7423182233039247, "grad_norm": 1.0891318321228027, "learning_rate": 1.209258331801095e-05, "loss": 0.1, "step": 4507 }, { "epoch": 2.74292668086401, "grad_norm": 0.5207118988037109, "learning_rate": 1.2081923598576921e-05, "loss": 0.0473, "step": 4508 }, { "epoch": 2.743535138424095, "grad_norm": 0.5301030278205872, "learning_rate": 1.2071267082287388e-05, "loss": 0.0409, "step": 4509 }, { "epoch": 2.74414359598418, "grad_norm": 0.577142596244812, "learning_rate": 1.2060613771784724e-05, "loss": 0.0391, "step": 4510 }, { "epoch": 2.744752053544265, "grad_norm": 0.4636692702770233, "learning_rate": 1.204996366971051e-05, "loss": 0.0381, "step": 4511 }, { "epoch": 2.7453605111043506, "grad_norm": 0.43843039870262146, "learning_rate": 1.2039316778705514e-05, "loss": 0.0349, "step": 4512 }, { "epoch": 2.7459689686644357, "grad_norm": 0.5346546769142151, "learning_rate": 1.2028673101409729e-05, "loss": 0.0466, "step": 4513 }, { "epoch": 2.746577426224521, "grad_norm": 0.5091422200202942, "learning_rate": 1.2018032640462345e-05, "loss": 0.0382, "step": 4514 }, { "epoch": 2.747185883784606, "grad_norm": 0.4113479256629944, "learning_rate": 1.2007395398501759e-05, "loss": 0.0386, "step": 4515 }, { "epoch": 2.747794341344691, "grad_norm": 0.44784247875213623, "learning_rate": 1.1996761378165535e-05, "loss": 0.0397, "step": 4516 }, { "epoch": 2.7484027989047766, "grad_norm": 0.44350409507751465, "learning_rate": 1.198613058209051e-05, "loss": 0.0327, "step": 4517 }, { "epoch": 2.7490112564648617, "grad_norm": 0.4770455062389374, "learning_rate": 1.1975503012912645e-05, "loss": 0.0396, "step": 4518 }, { "epoch": 2.7496197140249468, "grad_norm": 0.4313164949417114, "learning_rate": 1.1964878673267143e-05, "loss": 0.0279, "step": 4519 }, { "epoch": 2.750228171585032, "grad_norm": 0.5598992705345154, "learning_rate": 1.1954257565788402e-05, "loss": 0.0391, "step": 4520 }, { "epoch": 2.750836629145117, "grad_norm": 0.44698452949523926, "learning_rate": 1.194363969311002e-05, "loss": 0.0492, "step": 4521 }, { "epoch": 2.7514450867052025, "grad_norm": 0.4371298551559448, "learning_rate": 1.1933025057864769e-05, "loss": 0.0342, "step": 4522 }, { "epoch": 2.7520535442652876, "grad_norm": 0.4278777539730072, "learning_rate": 1.1922413662684644e-05, "loss": 0.0251, "step": 4523 }, { "epoch": 2.7526620018253727, "grad_norm": 0.6282891631126404, "learning_rate": 1.1911805510200833e-05, "loss": 0.0435, "step": 4524 }, { "epoch": 2.753270459385458, "grad_norm": 0.6077399253845215, "learning_rate": 1.1901200603043718e-05, "loss": 0.0501, "step": 4525 }, { "epoch": 2.753878916945543, "grad_norm": 0.4395311772823334, "learning_rate": 1.1890598943842854e-05, "loss": 0.0423, "step": 4526 }, { "epoch": 2.7544873745056284, "grad_norm": 0.5030261874198914, "learning_rate": 1.188000053522704e-05, "loss": 0.0456, "step": 4527 }, { "epoch": 2.7550958320657135, "grad_norm": 0.49082061648368835, "learning_rate": 1.186940537982422e-05, "loss": 0.0404, "step": 4528 }, { "epoch": 2.7557042896257986, "grad_norm": 0.476769357919693, "learning_rate": 1.1858813480261552e-05, "loss": 0.0286, "step": 4529 }, { "epoch": 2.7563127471858837, "grad_norm": 0.4839482307434082, "learning_rate": 1.1848224839165389e-05, "loss": 0.0287, "step": 4530 }, { "epoch": 2.756921204745969, "grad_norm": 0.5627169609069824, "learning_rate": 1.1837639459161284e-05, "loss": 0.0645, "step": 4531 }, { "epoch": 2.7575296623060543, "grad_norm": 0.5222127437591553, "learning_rate": 1.182705734287394e-05, "loss": 0.0388, "step": 4532 }, { "epoch": 2.7581381198661394, "grad_norm": 0.4021141827106476, "learning_rate": 1.1816478492927316e-05, "loss": 0.0307, "step": 4533 }, { "epoch": 2.7587465774262245, "grad_norm": 0.41538554430007935, "learning_rate": 1.1805902911944503e-05, "loss": 0.032, "step": 4534 }, { "epoch": 2.7593550349863096, "grad_norm": 0.517659604549408, "learning_rate": 1.179533060254782e-05, "loss": 0.0451, "step": 4535 }, { "epoch": 2.7599634925463947, "grad_norm": 0.5641751885414124, "learning_rate": 1.1784761567358729e-05, "loss": 0.0513, "step": 4536 }, { "epoch": 2.7605719501064803, "grad_norm": 0.5050565600395203, "learning_rate": 1.177419580899795e-05, "loss": 0.0318, "step": 4537 }, { "epoch": 2.7611804076665654, "grad_norm": 0.5235201716423035, "learning_rate": 1.1763633330085325e-05, "loss": 0.0241, "step": 4538 }, { "epoch": 2.7617888652266505, "grad_norm": 0.6112051010131836, "learning_rate": 1.1753074133239914e-05, "loss": 0.0513, "step": 4539 }, { "epoch": 2.7623973227867356, "grad_norm": 0.3812270164489746, "learning_rate": 1.1742518221079957e-05, "loss": 0.0267, "step": 4540 }, { "epoch": 2.7630057803468207, "grad_norm": 0.4638807773590088, "learning_rate": 1.1731965596222893e-05, "loss": 0.0362, "step": 4541 }, { "epoch": 2.763614237906906, "grad_norm": 0.47099676728248596, "learning_rate": 1.1721416261285303e-05, "loss": 0.0516, "step": 4542 }, { "epoch": 2.7642226954669913, "grad_norm": 0.6157107353210449, "learning_rate": 1.1710870218883022e-05, "loss": 0.0583, "step": 4543 }, { "epoch": 2.7648311530270764, "grad_norm": 0.4588291049003601, "learning_rate": 1.1700327471630995e-05, "loss": 0.03, "step": 4544 }, { "epoch": 2.7654396105871615, "grad_norm": 0.54620760679245, "learning_rate": 1.1689788022143411e-05, "loss": 0.0413, "step": 4545 }, { "epoch": 2.7660480681472466, "grad_norm": 0.5352728366851807, "learning_rate": 1.167925187303358e-05, "loss": 0.0469, "step": 4546 }, { "epoch": 2.766656525707332, "grad_norm": 0.560219407081604, "learning_rate": 1.1668719026914068e-05, "loss": 0.032, "step": 4547 }, { "epoch": 2.767264983267417, "grad_norm": 0.5410435199737549, "learning_rate": 1.165818948639655e-05, "loss": 0.0457, "step": 4548 }, { "epoch": 2.7678734408275023, "grad_norm": 0.46575361490249634, "learning_rate": 1.1647663254091928e-05, "loss": 0.0321, "step": 4549 }, { "epoch": 2.7684818983875874, "grad_norm": 0.571999728679657, "learning_rate": 1.1637140332610267e-05, "loss": 0.05, "step": 4550 }, { "epoch": 2.7690903559476725, "grad_norm": 0.4646073281764984, "learning_rate": 1.1626620724560819e-05, "loss": 0.04, "step": 4551 }, { "epoch": 2.769698813507758, "grad_norm": 0.44628095626831055, "learning_rate": 1.1616104432551982e-05, "loss": 0.0328, "step": 4552 }, { "epoch": 2.770307271067843, "grad_norm": 0.49966830015182495, "learning_rate": 1.16055914591914e-05, "loss": 0.0392, "step": 4553 }, { "epoch": 2.7709157286279282, "grad_norm": 0.4285971522331238, "learning_rate": 1.1595081807085816e-05, "loss": 0.0387, "step": 4554 }, { "epoch": 2.7715241861880133, "grad_norm": 0.46280503273010254, "learning_rate": 1.1584575478841204e-05, "loss": 0.0361, "step": 4555 }, { "epoch": 2.7721326437480984, "grad_norm": 0.48611634969711304, "learning_rate": 1.1574072477062686e-05, "loss": 0.0432, "step": 4556 }, { "epoch": 2.772741101308184, "grad_norm": 0.4622981548309326, "learning_rate": 1.1563572804354586e-05, "loss": 0.0429, "step": 4557 }, { "epoch": 2.773349558868269, "grad_norm": 0.500038206577301, "learning_rate": 1.1553076463320363e-05, "loss": 0.0648, "step": 4558 }, { "epoch": 2.773958016428354, "grad_norm": 0.5276514291763306, "learning_rate": 1.154258345656268e-05, "loss": 0.0452, "step": 4559 }, { "epoch": 2.7745664739884393, "grad_norm": 0.6001972556114197, "learning_rate": 1.1532093786683368e-05, "loss": 0.0578, "step": 4560 }, { "epoch": 2.7751749315485243, "grad_norm": 0.5367801785469055, "learning_rate": 1.1521607456283437e-05, "loss": 0.0388, "step": 4561 }, { "epoch": 2.77578338910861, "grad_norm": 0.45866864919662476, "learning_rate": 1.151112446796303e-05, "loss": 0.0373, "step": 4562 }, { "epoch": 2.776391846668695, "grad_norm": 0.549251139163971, "learning_rate": 1.1500644824321529e-05, "loss": 0.05, "step": 4563 }, { "epoch": 2.77700030422878, "grad_norm": 0.6965852379798889, "learning_rate": 1.1490168527957423e-05, "loss": 0.0401, "step": 4564 }, { "epoch": 2.777608761788865, "grad_norm": 0.4115385413169861, "learning_rate": 1.1479695581468405e-05, "loss": 0.0365, "step": 4565 }, { "epoch": 2.7782172193489503, "grad_norm": 0.4784400761127472, "learning_rate": 1.1469225987451327e-05, "loss": 0.0356, "step": 4566 }, { "epoch": 2.778825676909036, "grad_norm": 0.49439680576324463, "learning_rate": 1.1458759748502223e-05, "loss": 0.0499, "step": 4567 }, { "epoch": 2.779434134469121, "grad_norm": 0.39347532391548157, "learning_rate": 1.1448296867216268e-05, "loss": 0.0287, "step": 4568 }, { "epoch": 2.780042592029206, "grad_norm": 0.5476490259170532, "learning_rate": 1.1437837346187824e-05, "loss": 0.0491, "step": 4569 }, { "epoch": 2.780651049589291, "grad_norm": 0.5295094847679138, "learning_rate": 1.142738118801042e-05, "loss": 0.0493, "step": 4570 }, { "epoch": 2.781259507149376, "grad_norm": 0.5610222816467285, "learning_rate": 1.1416928395276757e-05, "loss": 0.0442, "step": 4571 }, { "epoch": 2.7818679647094617, "grad_norm": 0.443450003862381, "learning_rate": 1.1406478970578662e-05, "loss": 0.0362, "step": 4572 }, { "epoch": 2.782476422269547, "grad_norm": 0.5407084226608276, "learning_rate": 1.1396032916507196e-05, "loss": 0.046, "step": 4573 }, { "epoch": 2.783084879829632, "grad_norm": 0.5253666043281555, "learning_rate": 1.1385590235652515e-05, "loss": 0.0456, "step": 4574 }, { "epoch": 2.783693337389717, "grad_norm": 0.5159477591514587, "learning_rate": 1.1375150930603982e-05, "loss": 0.0368, "step": 4575 }, { "epoch": 2.784301794949802, "grad_norm": 0.49028316140174866, "learning_rate": 1.1364715003950102e-05, "loss": 0.0431, "step": 4576 }, { "epoch": 2.7849102525098877, "grad_norm": 0.4681493937969208, "learning_rate": 1.1354282458278567e-05, "loss": 0.0318, "step": 4577 }, { "epoch": 2.7855187100699728, "grad_norm": 0.44730204343795776, "learning_rate": 1.134385329617618e-05, "loss": 0.0361, "step": 4578 }, { "epoch": 2.786127167630058, "grad_norm": 0.47637778520584106, "learning_rate": 1.1333427520228979e-05, "loss": 0.037, "step": 4579 }, { "epoch": 2.786735625190143, "grad_norm": 0.42056208848953247, "learning_rate": 1.1323005133022095e-05, "loss": 0.0391, "step": 4580 }, { "epoch": 2.787344082750228, "grad_norm": 0.4972124695777893, "learning_rate": 1.131258613713985e-05, "loss": 0.0455, "step": 4581 }, { "epoch": 2.7879525403103136, "grad_norm": 0.5506184697151184, "learning_rate": 1.130217053516572e-05, "loss": 0.0345, "step": 4582 }, { "epoch": 2.7885609978703987, "grad_norm": 0.5120692253112793, "learning_rate": 1.1291758329682358e-05, "loss": 0.0371, "step": 4583 }, { "epoch": 2.789169455430484, "grad_norm": 0.4274442791938782, "learning_rate": 1.1281349523271534e-05, "loss": 0.0344, "step": 4584 }, { "epoch": 2.789777912990569, "grad_norm": 0.5016206502914429, "learning_rate": 1.1270944118514203e-05, "loss": 0.0366, "step": 4585 }, { "epoch": 2.790386370550654, "grad_norm": 0.5108194351196289, "learning_rate": 1.1260542117990478e-05, "loss": 0.0465, "step": 4586 }, { "epoch": 2.7909948281107395, "grad_norm": 0.5079597234725952, "learning_rate": 1.1250143524279618e-05, "loss": 0.0422, "step": 4587 }, { "epoch": 2.7916032856708246, "grad_norm": 0.42185690999031067, "learning_rate": 1.1239748339960043e-05, "loss": 0.0339, "step": 4588 }, { "epoch": 2.7922117432309097, "grad_norm": 0.5816838145256042, "learning_rate": 1.122935656760933e-05, "loss": 0.065, "step": 4589 }, { "epoch": 2.792820200790995, "grad_norm": 0.5395743250846863, "learning_rate": 1.1218968209804192e-05, "loss": 0.0482, "step": 4590 }, { "epoch": 2.79342865835108, "grad_norm": 0.5038939118385315, "learning_rate": 1.1208583269120517e-05, "loss": 0.0371, "step": 4591 }, { "epoch": 2.7940371159111654, "grad_norm": 0.43629151582717896, "learning_rate": 1.1198201748133338e-05, "loss": 0.0306, "step": 4592 }, { "epoch": 2.7946455734712505, "grad_norm": 0.3945944607257843, "learning_rate": 1.1187823649416836e-05, "loss": 0.0291, "step": 4593 }, { "epoch": 2.7952540310313356, "grad_norm": 0.3942541778087616, "learning_rate": 1.1177448975544361e-05, "loss": 0.0294, "step": 4594 }, { "epoch": 2.7958624885914207, "grad_norm": 0.4381770193576813, "learning_rate": 1.1167077729088374e-05, "loss": 0.0294, "step": 4595 }, { "epoch": 2.796470946151506, "grad_norm": 0.7765640616416931, "learning_rate": 1.1156709912620529e-05, "loss": 0.0846, "step": 4596 }, { "epoch": 2.7970794037115914, "grad_norm": 0.38119301199913025, "learning_rate": 1.1146345528711608e-05, "loss": 0.0215, "step": 4597 }, { "epoch": 2.7976878612716765, "grad_norm": 0.4524082541465759, "learning_rate": 1.1135984579931547e-05, "loss": 0.0351, "step": 4598 }, { "epoch": 2.7982963188317616, "grad_norm": 0.4416946768760681, "learning_rate": 1.1125627068849428e-05, "loss": 0.037, "step": 4599 }, { "epoch": 2.7989047763918466, "grad_norm": 0.5243279933929443, "learning_rate": 1.1115272998033496e-05, "loss": 0.037, "step": 4600 }, { "epoch": 2.7995132339519317, "grad_norm": 0.5025190711021423, "learning_rate": 1.1104922370051105e-05, "loss": 0.0333, "step": 4601 }, { "epoch": 2.8001216915120173, "grad_norm": 0.5427147746086121, "learning_rate": 1.1094575187468792e-05, "loss": 0.0279, "step": 4602 }, { "epoch": 2.8007301490721024, "grad_norm": 0.369625985622406, "learning_rate": 1.1084231452852226e-05, "loss": 0.029, "step": 4603 }, { "epoch": 2.8013386066321875, "grad_norm": 0.39575597643852234, "learning_rate": 1.1073891168766229e-05, "loss": 0.0311, "step": 4604 }, { "epoch": 2.8019470641922726, "grad_norm": 0.6142522692680359, "learning_rate": 1.1063554337774745e-05, "loss": 0.0341, "step": 4605 }, { "epoch": 2.8025555217523577, "grad_norm": 0.4445866048336029, "learning_rate": 1.1053220962440889e-05, "loss": 0.0396, "step": 4606 }, { "epoch": 2.803163979312443, "grad_norm": 0.5825973749160767, "learning_rate": 1.10428910453269e-05, "loss": 0.033, "step": 4607 }, { "epoch": 2.8037724368725283, "grad_norm": 0.4755668342113495, "learning_rate": 1.103256458899417e-05, "loss": 0.0303, "step": 4608 }, { "epoch": 2.8043808944326134, "grad_norm": 0.5290636420249939, "learning_rate": 1.1022241596003232e-05, "loss": 0.045, "step": 4609 }, { "epoch": 2.8049893519926985, "grad_norm": 0.44851988554000854, "learning_rate": 1.101192206891377e-05, "loss": 0.0318, "step": 4610 }, { "epoch": 2.8055978095527836, "grad_norm": 0.5043997764587402, "learning_rate": 1.1001606010284569e-05, "loss": 0.0462, "step": 4611 }, { "epoch": 2.806206267112869, "grad_norm": 0.41543683409690857, "learning_rate": 1.0991293422673596e-05, "loss": 0.0359, "step": 4612 }, { "epoch": 2.806814724672954, "grad_norm": 0.5744189620018005, "learning_rate": 1.0980984308637944e-05, "loss": 0.0454, "step": 4613 }, { "epoch": 2.8074231822330393, "grad_norm": 0.4493009150028229, "learning_rate": 1.0970678670733839e-05, "loss": 0.031, "step": 4614 }, { "epoch": 2.8080316397931244, "grad_norm": 0.466262549161911, "learning_rate": 1.0960376511516655e-05, "loss": 0.0362, "step": 4615 }, { "epoch": 2.8086400973532095, "grad_norm": 0.5800133943557739, "learning_rate": 1.0950077833540906e-05, "loss": 0.0544, "step": 4616 }, { "epoch": 2.809248554913295, "grad_norm": 0.32791540026664734, "learning_rate": 1.0939782639360214e-05, "loss": 0.0188, "step": 4617 }, { "epoch": 2.8098570124733797, "grad_norm": 0.45963072776794434, "learning_rate": 1.0929490931527369e-05, "loss": 0.0387, "step": 4618 }, { "epoch": 2.8104654700334653, "grad_norm": 0.46646299958229065, "learning_rate": 1.0919202712594284e-05, "loss": 0.0344, "step": 4619 }, { "epoch": 2.8110739275935503, "grad_norm": 0.39890867471694946, "learning_rate": 1.0908917985112021e-05, "loss": 0.0238, "step": 4620 }, { "epoch": 2.8116823851536354, "grad_norm": 0.439327210187912, "learning_rate": 1.0898636751630733e-05, "loss": 0.0415, "step": 4621 }, { "epoch": 2.812290842713721, "grad_norm": 0.3781556785106659, "learning_rate": 1.0888359014699776e-05, "loss": 0.0332, "step": 4622 }, { "epoch": 2.8128993002738056, "grad_norm": 0.42909082770347595, "learning_rate": 1.087808477686757e-05, "loss": 0.0419, "step": 4623 }, { "epoch": 2.813507757833891, "grad_norm": 0.4752819538116455, "learning_rate": 1.0867814040681711e-05, "loss": 0.0308, "step": 4624 }, { "epoch": 2.8141162153939763, "grad_norm": 0.5056931376457214, "learning_rate": 1.0857546808688912e-05, "loss": 0.0384, "step": 4625 }, { "epoch": 2.8147246729540614, "grad_norm": 0.9717746376991272, "learning_rate": 1.0847283083435026e-05, "loss": 0.0731, "step": 4626 }, { "epoch": 2.815333130514147, "grad_norm": 0.4587456285953522, "learning_rate": 1.083702286746501e-05, "loss": 0.0435, "step": 4627 }, { "epoch": 2.8159415880742316, "grad_norm": 0.5184073448181152, "learning_rate": 1.0826766163322982e-05, "loss": 0.0337, "step": 4628 }, { "epoch": 2.816550045634317, "grad_norm": 0.5532636046409607, "learning_rate": 1.0816512973552178e-05, "loss": 0.0415, "step": 4629 }, { "epoch": 2.817158503194402, "grad_norm": 0.5456773638725281, "learning_rate": 1.0806263300694966e-05, "loss": 0.0372, "step": 4630 }, { "epoch": 2.8177669607544873, "grad_norm": 0.5305153727531433, "learning_rate": 1.0796017147292817e-05, "loss": 0.0458, "step": 4631 }, { "epoch": 2.818375418314573, "grad_norm": 0.4857366681098938, "learning_rate": 1.0785774515886379e-05, "loss": 0.0372, "step": 4632 }, { "epoch": 2.8189838758746575, "grad_norm": 0.5140689611434937, "learning_rate": 1.0775535409015374e-05, "loss": 0.0521, "step": 4633 }, { "epoch": 2.819592333434743, "grad_norm": 0.5720868110656738, "learning_rate": 1.0765299829218683e-05, "loss": 0.0484, "step": 4634 }, { "epoch": 2.820200790994828, "grad_norm": 0.4893862009048462, "learning_rate": 1.0755067779034302e-05, "loss": 0.0446, "step": 4635 }, { "epoch": 2.820809248554913, "grad_norm": 0.4779762923717499, "learning_rate": 1.0744839260999368e-05, "loss": 0.0327, "step": 4636 }, { "epoch": 2.8214177061149983, "grad_norm": 0.4866357743740082, "learning_rate": 1.073461427765009e-05, "loss": 0.0258, "step": 4637 }, { "epoch": 2.8220261636750834, "grad_norm": 0.42479321360588074, "learning_rate": 1.0724392831521878e-05, "loss": 0.0353, "step": 4638 }, { "epoch": 2.822634621235169, "grad_norm": 0.4593687951564789, "learning_rate": 1.0714174925149201e-05, "loss": 0.0423, "step": 4639 }, { "epoch": 2.823243078795254, "grad_norm": 0.5180348753929138, "learning_rate": 1.0703960561065688e-05, "loss": 0.0448, "step": 4640 }, { "epoch": 2.823851536355339, "grad_norm": 0.5316763520240784, "learning_rate": 1.0693749741804048e-05, "loss": 0.038, "step": 4641 }, { "epoch": 2.8244599939154242, "grad_norm": 0.28423795104026794, "learning_rate": 1.068354246989618e-05, "loss": 0.0152, "step": 4642 }, { "epoch": 2.8250684514755093, "grad_norm": 0.3734150230884552, "learning_rate": 1.0673338747873027e-05, "loss": 0.0258, "step": 4643 }, { "epoch": 2.825676909035595, "grad_norm": 0.4261896312236786, "learning_rate": 1.06631385782647e-05, "loss": 0.0279, "step": 4644 }, { "epoch": 2.82628536659568, "grad_norm": 0.35567089915275574, "learning_rate": 1.0652941963600418e-05, "loss": 0.027, "step": 4645 }, { "epoch": 2.826893824155765, "grad_norm": 0.39518773555755615, "learning_rate": 1.0642748906408522e-05, "loss": 0.0355, "step": 4646 }, { "epoch": 2.82750228171585, "grad_norm": 0.5049468874931335, "learning_rate": 1.0632559409216442e-05, "loss": 0.0393, "step": 4647 }, { "epoch": 2.8281107392759353, "grad_norm": 0.4476812481880188, "learning_rate": 1.062237347455078e-05, "loss": 0.0258, "step": 4648 }, { "epoch": 2.828719196836021, "grad_norm": 0.41204380989074707, "learning_rate": 1.0612191104937198e-05, "loss": 0.0345, "step": 4649 }, { "epoch": 2.829327654396106, "grad_norm": 0.5308571457862854, "learning_rate": 1.0602012302900516e-05, "loss": 0.0422, "step": 4650 }, { "epoch": 2.829936111956191, "grad_norm": 0.43992185592651367, "learning_rate": 1.059183707096463e-05, "loss": 0.0342, "step": 4651 }, { "epoch": 2.830544569516276, "grad_norm": 0.482450395822525, "learning_rate": 1.0581665411652605e-05, "loss": 0.0282, "step": 4652 }, { "epoch": 2.831153027076361, "grad_norm": 0.41237378120422363, "learning_rate": 1.0571497327486563e-05, "loss": 0.033, "step": 4653 }, { "epoch": 2.8317614846364467, "grad_norm": 0.5276917219161987, "learning_rate": 1.0561332820987774e-05, "loss": 0.0362, "step": 4654 }, { "epoch": 2.832369942196532, "grad_norm": 0.4543791115283966, "learning_rate": 1.0551171894676615e-05, "loss": 0.0247, "step": 4655 }, { "epoch": 2.832978399756617, "grad_norm": 0.4912313222885132, "learning_rate": 1.0541014551072576e-05, "loss": 0.0258, "step": 4656 }, { "epoch": 2.833586857316702, "grad_norm": 0.46352067589759827, "learning_rate": 1.053086079269423e-05, "loss": 0.0371, "step": 4657 }, { "epoch": 2.834195314876787, "grad_norm": 0.5429676175117493, "learning_rate": 1.052071062205932e-05, "loss": 0.0499, "step": 4658 }, { "epoch": 2.8348037724368726, "grad_norm": 0.4813297688961029, "learning_rate": 1.0510564041684645e-05, "loss": 0.0556, "step": 4659 }, { "epoch": 2.8354122299969577, "grad_norm": 0.5711151957511902, "learning_rate": 1.0500421054086135e-05, "loss": 0.0396, "step": 4660 }, { "epoch": 2.836020687557043, "grad_norm": 0.48414355516433716, "learning_rate": 1.049028166177883e-05, "loss": 0.0347, "step": 4661 }, { "epoch": 2.836629145117128, "grad_norm": 0.5011805295944214, "learning_rate": 1.0480145867276892e-05, "loss": 0.0439, "step": 4662 }, { "epoch": 2.837237602677213, "grad_norm": 0.49640247225761414, "learning_rate": 1.0470013673093548e-05, "loss": 0.0395, "step": 4663 }, { "epoch": 2.8378460602372986, "grad_norm": 0.48717397451400757, "learning_rate": 1.0459885081741175e-05, "loss": 0.0382, "step": 4664 }, { "epoch": 2.8384545177973837, "grad_norm": 0.4565880596637726, "learning_rate": 1.044976009573124e-05, "loss": 0.0269, "step": 4665 }, { "epoch": 2.8390629753574688, "grad_norm": 0.6080870032310486, "learning_rate": 1.0439638717574327e-05, "loss": 0.0385, "step": 4666 }, { "epoch": 2.839671432917554, "grad_norm": 0.6103970408439636, "learning_rate": 1.0429520949780085e-05, "loss": 0.0523, "step": 4667 }, { "epoch": 2.840279890477639, "grad_norm": 0.45440056920051575, "learning_rate": 1.0419406794857343e-05, "loss": 0.0385, "step": 4668 }, { "epoch": 2.8408883480377245, "grad_norm": 0.3543758690357208, "learning_rate": 1.0409296255313955e-05, "loss": 0.0286, "step": 4669 }, { "epoch": 2.8414968055978096, "grad_norm": 0.5107575058937073, "learning_rate": 1.0399189333656925e-05, "loss": 0.0439, "step": 4670 }, { "epoch": 2.8421052631578947, "grad_norm": 0.45144620537757874, "learning_rate": 1.0389086032392348e-05, "loss": 0.0318, "step": 4671 }, { "epoch": 2.84271372071798, "grad_norm": 0.41293710470199585, "learning_rate": 1.0378986354025433e-05, "loss": 0.0224, "step": 4672 }, { "epoch": 2.843322178278065, "grad_norm": 0.4346982538700104, "learning_rate": 1.0368890301060457e-05, "loss": 0.0418, "step": 4673 }, { "epoch": 2.8439306358381504, "grad_norm": 0.47238776087760925, "learning_rate": 1.0358797876000834e-05, "loss": 0.0314, "step": 4674 }, { "epoch": 2.8445390933982355, "grad_norm": 0.4769528806209564, "learning_rate": 1.0348709081349062e-05, "loss": 0.0416, "step": 4675 }, { "epoch": 2.8451475509583206, "grad_norm": 0.4458237886428833, "learning_rate": 1.033862391960675e-05, "loss": 0.0395, "step": 4676 }, { "epoch": 2.8457560085184057, "grad_norm": 0.3304067850112915, "learning_rate": 1.0328542393274571e-05, "loss": 0.0189, "step": 4677 }, { "epoch": 2.846364466078491, "grad_norm": 0.3914128243923187, "learning_rate": 1.031846450485236e-05, "loss": 0.0229, "step": 4678 }, { "epoch": 2.8469729236385763, "grad_norm": 0.4684884250164032, "learning_rate": 1.0308390256838987e-05, "loss": 0.0423, "step": 4679 }, { "epoch": 2.8475813811986614, "grad_norm": 0.5104598999023438, "learning_rate": 1.0298319651732455e-05, "loss": 0.0385, "step": 4680 }, { "epoch": 2.8481898387587465, "grad_norm": 0.5490165948867798, "learning_rate": 1.0288252692029851e-05, "loss": 0.0506, "step": 4681 }, { "epoch": 2.8487982963188316, "grad_norm": 0.3741062581539154, "learning_rate": 1.0278189380227373e-05, "loss": 0.0251, "step": 4682 }, { "epoch": 2.8494067538789167, "grad_norm": 0.5128927230834961, "learning_rate": 1.0268129718820277e-05, "loss": 0.0526, "step": 4683 }, { "epoch": 2.8500152114390023, "grad_norm": 0.43258655071258545, "learning_rate": 1.0258073710302973e-05, "loss": 0.0312, "step": 4684 }, { "epoch": 2.8506236689990874, "grad_norm": 0.5292765498161316, "learning_rate": 1.0248021357168908e-05, "loss": 0.0539, "step": 4685 }, { "epoch": 2.8512321265591725, "grad_norm": 0.5508760809898376, "learning_rate": 1.023797266191066e-05, "loss": 0.0453, "step": 4686 }, { "epoch": 2.8518405841192576, "grad_norm": 0.45886969566345215, "learning_rate": 1.0227927627019862e-05, "loss": 0.0382, "step": 4687 }, { "epoch": 2.8524490416793427, "grad_norm": 0.587009847164154, "learning_rate": 1.0217886254987303e-05, "loss": 0.0589, "step": 4688 }, { "epoch": 2.853057499239428, "grad_norm": 0.5119481682777405, "learning_rate": 1.0207848548302793e-05, "loss": 0.0305, "step": 4689 }, { "epoch": 2.8536659567995133, "grad_norm": 0.4871687889099121, "learning_rate": 1.0197814509455275e-05, "loss": 0.0551, "step": 4690 }, { "epoch": 2.8542744143595984, "grad_norm": 0.5183733701705933, "learning_rate": 1.0187784140932774e-05, "loss": 0.0435, "step": 4691 }, { "epoch": 2.8548828719196835, "grad_norm": 0.46334466338157654, "learning_rate": 1.0177757445222411e-05, "loss": 0.0406, "step": 4692 }, { "epoch": 2.8554913294797686, "grad_norm": 0.42779427766799927, "learning_rate": 1.0167734424810363e-05, "loss": 0.024, "step": 4693 }, { "epoch": 2.856099787039854, "grad_norm": 0.5750960111618042, "learning_rate": 1.0157715082181956e-05, "loss": 0.0467, "step": 4694 }, { "epoch": 2.856708244599939, "grad_norm": 0.6426092982292175, "learning_rate": 1.0147699419821543e-05, "loss": 0.0576, "step": 4695 }, { "epoch": 2.8573167021600243, "grad_norm": 0.39237314462661743, "learning_rate": 1.0137687440212598e-05, "loss": 0.0315, "step": 4696 }, { "epoch": 2.8579251597201094, "grad_norm": 0.4717332124710083, "learning_rate": 1.012767914583768e-05, "loss": 0.029, "step": 4697 }, { "epoch": 2.8585336172801945, "grad_norm": 0.4430489242076874, "learning_rate": 1.0117674539178428e-05, "loss": 0.0343, "step": 4698 }, { "epoch": 2.85914207484028, "grad_norm": 0.42511722445487976, "learning_rate": 1.010767362271556e-05, "loss": 0.0306, "step": 4699 }, { "epoch": 2.859750532400365, "grad_norm": 0.5086604952812195, "learning_rate": 1.009767639892889e-05, "loss": 0.0396, "step": 4700 }, { "epoch": 2.8603589899604502, "grad_norm": 0.4445130527019501, "learning_rate": 1.0087682870297313e-05, "loss": 0.0335, "step": 4701 }, { "epoch": 2.8609674475205353, "grad_norm": 0.4963395595550537, "learning_rate": 1.0077693039298808e-05, "loss": 0.0442, "step": 4702 }, { "epoch": 2.8615759050806204, "grad_norm": 0.5074029564857483, "learning_rate": 1.0067706908410437e-05, "loss": 0.0469, "step": 4703 }, { "epoch": 2.862184362640706, "grad_norm": 0.4995264708995819, "learning_rate": 1.0057724480108352e-05, "loss": 0.0369, "step": 4704 }, { "epoch": 2.862792820200791, "grad_norm": 0.41249075531959534, "learning_rate": 1.0047745756867762e-05, "loss": 0.0245, "step": 4705 }, { "epoch": 2.863401277760876, "grad_norm": 0.4585972726345062, "learning_rate": 1.0037770741162986e-05, "loss": 0.0362, "step": 4706 }, { "epoch": 2.8640097353209613, "grad_norm": 0.4985556900501251, "learning_rate": 1.0027799435467409e-05, "loss": 0.0433, "step": 4707 }, { "epoch": 2.8646181928810464, "grad_norm": 0.46208369731903076, "learning_rate": 1.0017831842253509e-05, "loss": 0.0252, "step": 4708 }, { "epoch": 2.865226650441132, "grad_norm": 0.5662164688110352, "learning_rate": 1.0007867963992814e-05, "loss": 0.0602, "step": 4709 }, { "epoch": 2.865835108001217, "grad_norm": 0.5789265036582947, "learning_rate": 9.997907803155962e-06, "loss": 0.051, "step": 4710 }, { "epoch": 2.866443565561302, "grad_norm": 0.4753771126270294, "learning_rate": 9.987951362212658e-06, "loss": 0.0505, "step": 4711 }, { "epoch": 2.867052023121387, "grad_norm": 0.5585101842880249, "learning_rate": 9.977998643631684e-06, "loss": 0.0621, "step": 4712 }, { "epoch": 2.8676604806814723, "grad_norm": 0.396291583776474, "learning_rate": 9.968049649880895e-06, "loss": 0.031, "step": 4713 }, { "epoch": 2.868268938241558, "grad_norm": 0.5255879163742065, "learning_rate": 9.95810438342724e-06, "loss": 0.0365, "step": 4714 }, { "epoch": 2.868877395801643, "grad_norm": 0.5379154086112976, "learning_rate": 9.948162846736715e-06, "loss": 0.0466, "step": 4715 }, { "epoch": 2.869485853361728, "grad_norm": 0.5413723587989807, "learning_rate": 9.93822504227441e-06, "loss": 0.0422, "step": 4716 }, { "epoch": 2.870094310921813, "grad_norm": 0.6132776141166687, "learning_rate": 9.928290972504489e-06, "loss": 0.044, "step": 4717 }, { "epoch": 2.870702768481898, "grad_norm": 0.5400335192680359, "learning_rate": 9.918360639890187e-06, "loss": 0.0462, "step": 4718 }, { "epoch": 2.8713112260419837, "grad_norm": 0.37978994846343994, "learning_rate": 9.908434046893822e-06, "loss": 0.0303, "step": 4719 }, { "epoch": 2.871919683602069, "grad_norm": 0.49378716945648193, "learning_rate": 9.898511195976756e-06, "loss": 0.0331, "step": 4720 }, { "epoch": 2.872528141162154, "grad_norm": 0.44092321395874023, "learning_rate": 9.888592089599452e-06, "loss": 0.0358, "step": 4721 }, { "epoch": 2.873136598722239, "grad_norm": 0.4932500123977661, "learning_rate": 9.878676730221437e-06, "loss": 0.0459, "step": 4722 }, { "epoch": 2.873745056282324, "grad_norm": 0.36311423778533936, "learning_rate": 9.868765120301305e-06, "loss": 0.0274, "step": 4723 }, { "epoch": 2.8743535138424097, "grad_norm": 0.4552887976169586, "learning_rate": 9.858857262296719e-06, "loss": 0.0377, "step": 4724 }, { "epoch": 2.8749619714024948, "grad_norm": 0.4563083052635193, "learning_rate": 9.848953158664431e-06, "loss": 0.0407, "step": 4725 }, { "epoch": 2.87557042896258, "grad_norm": 0.5411676168441772, "learning_rate": 9.839052811860222e-06, "loss": 0.04, "step": 4726 }, { "epoch": 2.876178886522665, "grad_norm": 0.34208500385284424, "learning_rate": 9.829156224338975e-06, "loss": 0.0262, "step": 4727 }, { "epoch": 2.87678734408275, "grad_norm": 0.4771437644958496, "learning_rate": 9.819263398554634e-06, "loss": 0.0411, "step": 4728 }, { "epoch": 2.8773958016428356, "grad_norm": 0.43631598353385925, "learning_rate": 9.809374336960208e-06, "loss": 0.0348, "step": 4729 }, { "epoch": 2.8780042592029207, "grad_norm": 0.457810640335083, "learning_rate": 9.799489042007767e-06, "loss": 0.0462, "step": 4730 }, { "epoch": 2.878612716763006, "grad_norm": 0.5080879926681519, "learning_rate": 9.789607516148467e-06, "loss": 0.0379, "step": 4731 }, { "epoch": 2.879221174323091, "grad_norm": 0.5162711143493652, "learning_rate": 9.77972976183249e-06, "loss": 0.0444, "step": 4732 }, { "epoch": 2.879829631883176, "grad_norm": 0.4640320837497711, "learning_rate": 9.769855781509121e-06, "loss": 0.0301, "step": 4733 }, { "epoch": 2.8804380894432615, "grad_norm": 0.457856148481369, "learning_rate": 9.759985577626695e-06, "loss": 0.0409, "step": 4734 }, { "epoch": 2.8810465470033466, "grad_norm": 0.43068334460258484, "learning_rate": 9.75011915263262e-06, "loss": 0.0367, "step": 4735 }, { "epoch": 2.8816550045634317, "grad_norm": 0.5189123153686523, "learning_rate": 9.74025650897333e-06, "loss": 0.0427, "step": 4736 }, { "epoch": 2.882263462123517, "grad_norm": 0.4327934682369232, "learning_rate": 9.730397649094386e-06, "loss": 0.0368, "step": 4737 }, { "epoch": 2.882871919683602, "grad_norm": 0.3771323263645172, "learning_rate": 9.72054257544035e-06, "loss": 0.0311, "step": 4738 }, { "epoch": 2.8834803772436874, "grad_norm": 0.5253776907920837, "learning_rate": 9.710691290454874e-06, "loss": 0.0475, "step": 4739 }, { "epoch": 2.8840888348037725, "grad_norm": 0.47808972001075745, "learning_rate": 9.70084379658067e-06, "loss": 0.0314, "step": 4740 }, { "epoch": 2.8846972923638576, "grad_norm": 0.4568268656730652, "learning_rate": 9.691000096259512e-06, "loss": 0.0294, "step": 4741 }, { "epoch": 2.8853057499239427, "grad_norm": 0.417096346616745, "learning_rate": 9.68116019193221e-06, "loss": 0.0287, "step": 4742 }, { "epoch": 2.885914207484028, "grad_norm": 0.4551639258861542, "learning_rate": 9.671324086038658e-06, "loss": 0.0252, "step": 4743 }, { "epoch": 2.8865226650441134, "grad_norm": 0.4030291736125946, "learning_rate": 9.661491781017806e-06, "loss": 0.039, "step": 4744 }, { "epoch": 2.8871311226041985, "grad_norm": 0.49422821402549744, "learning_rate": 9.651663279307657e-06, "loss": 0.0379, "step": 4745 }, { "epoch": 2.8877395801642836, "grad_norm": 0.4428713321685791, "learning_rate": 9.64183858334525e-06, "loss": 0.0335, "step": 4746 }, { "epoch": 2.8883480377243687, "grad_norm": 0.4136061370372772, "learning_rate": 9.632017695566731e-06, "loss": 0.0354, "step": 4747 }, { "epoch": 2.8889564952844538, "grad_norm": 0.44465282559394836, "learning_rate": 9.622200618407246e-06, "loss": 0.0353, "step": 4748 }, { "epoch": 2.8895649528445393, "grad_norm": 0.42466622591018677, "learning_rate": 9.612387354301028e-06, "loss": 0.0322, "step": 4749 }, { "epoch": 2.8901734104046244, "grad_norm": 0.5076179504394531, "learning_rate": 9.602577905681358e-06, "loss": 0.0531, "step": 4750 }, { "epoch": 2.8907818679647095, "grad_norm": 0.42435580492019653, "learning_rate": 9.592772274980582e-06, "loss": 0.0338, "step": 4751 }, { "epoch": 2.8913903255247946, "grad_norm": 0.43116191029548645, "learning_rate": 9.58297046463006e-06, "loss": 0.0269, "step": 4752 }, { "epoch": 2.8919987830848797, "grad_norm": 0.4200409948825836, "learning_rate": 9.573172477060266e-06, "loss": 0.0255, "step": 4753 }, { "epoch": 2.892607240644965, "grad_norm": 0.44302016496658325, "learning_rate": 9.563378314700665e-06, "loss": 0.036, "step": 4754 }, { "epoch": 2.8932156982050503, "grad_norm": 0.4883388876914978, "learning_rate": 9.553587979979826e-06, "loss": 0.0334, "step": 4755 }, { "epoch": 2.8938241557651354, "grad_norm": 0.5191654562950134, "learning_rate": 9.543801475325306e-06, "loss": 0.0383, "step": 4756 }, { "epoch": 2.8944326133252205, "grad_norm": 0.47728797793388367, "learning_rate": 9.534018803163794e-06, "loss": 0.031, "step": 4757 }, { "epoch": 2.8950410708853056, "grad_norm": 0.42437100410461426, "learning_rate": 9.524239965920958e-06, "loss": 0.0288, "step": 4758 }, { "epoch": 2.895649528445391, "grad_norm": 0.46275144815444946, "learning_rate": 9.514464966021547e-06, "loss": 0.0391, "step": 4759 }, { "epoch": 2.8962579860054762, "grad_norm": 0.5176787376403809, "learning_rate": 9.504693805889355e-06, "loss": 0.0275, "step": 4760 }, { "epoch": 2.8968664435655613, "grad_norm": 0.39485782384872437, "learning_rate": 9.494926487947233e-06, "loss": 0.0326, "step": 4761 }, { "epoch": 2.8974749011256464, "grad_norm": 1.0066802501678467, "learning_rate": 9.485163014617043e-06, "loss": 0.0387, "step": 4762 }, { "epoch": 2.8980833586857315, "grad_norm": 0.38768336176872253, "learning_rate": 9.475403388319752e-06, "loss": 0.029, "step": 4763 }, { "epoch": 2.898691816245817, "grad_norm": 0.3965464234352112, "learning_rate": 9.465647611475312e-06, "loss": 0.0288, "step": 4764 }, { "epoch": 2.899300273805902, "grad_norm": 0.5666676163673401, "learning_rate": 9.455895686502762e-06, "loss": 0.0437, "step": 4765 }, { "epoch": 2.8999087313659873, "grad_norm": 0.49036771059036255, "learning_rate": 9.446147615820169e-06, "loss": 0.0332, "step": 4766 }, { "epoch": 2.9005171889260724, "grad_norm": 0.49707627296447754, "learning_rate": 9.436403401844663e-06, "loss": 0.0317, "step": 4767 }, { "epoch": 2.9011256464861574, "grad_norm": 0.5176650881767273, "learning_rate": 9.426663046992381e-06, "loss": 0.0347, "step": 4768 }, { "epoch": 2.901734104046243, "grad_norm": 0.536285936832428, "learning_rate": 9.416926553678535e-06, "loss": 0.0404, "step": 4769 }, { "epoch": 2.902342561606328, "grad_norm": 0.5301802158355713, "learning_rate": 9.40719392431737e-06, "loss": 0.0418, "step": 4770 }, { "epoch": 2.902951019166413, "grad_norm": 0.5603205561637878, "learning_rate": 9.397465161322177e-06, "loss": 0.0421, "step": 4771 }, { "epoch": 2.9035594767264983, "grad_norm": 0.4085746109485626, "learning_rate": 9.387740267105263e-06, "loss": 0.0362, "step": 4772 }, { "epoch": 2.9041679342865834, "grad_norm": 0.4638700485229492, "learning_rate": 9.378019244078028e-06, "loss": 0.0435, "step": 4773 }, { "epoch": 2.904776391846669, "grad_norm": 0.5153172612190247, "learning_rate": 9.368302094650858e-06, "loss": 0.0505, "step": 4774 }, { "epoch": 2.905384849406754, "grad_norm": 0.3367726802825928, "learning_rate": 9.358588821233207e-06, "loss": 0.0292, "step": 4775 }, { "epoch": 2.905993306966839, "grad_norm": 0.4512867033481598, "learning_rate": 9.348879426233561e-06, "loss": 0.0333, "step": 4776 }, { "epoch": 2.906601764526924, "grad_norm": 0.4421890676021576, "learning_rate": 9.339173912059455e-06, "loss": 0.0347, "step": 4777 }, { "epoch": 2.9072102220870093, "grad_norm": 0.379062294960022, "learning_rate": 9.329472281117434e-06, "loss": 0.0256, "step": 4778 }, { "epoch": 2.907818679647095, "grad_norm": 0.4036319851875305, "learning_rate": 9.319774535813109e-06, "loss": 0.0392, "step": 4779 }, { "epoch": 2.90842713720718, "grad_norm": 0.47172126173973083, "learning_rate": 9.310080678551114e-06, "loss": 0.0408, "step": 4780 }, { "epoch": 2.909035594767265, "grad_norm": 0.494945228099823, "learning_rate": 9.300390711735132e-06, "loss": 0.0257, "step": 4781 }, { "epoch": 2.90964405232735, "grad_norm": 0.4051055610179901, "learning_rate": 9.290704637767843e-06, "loss": 0.0298, "step": 4782 }, { "epoch": 2.9102525098874352, "grad_norm": 0.5722257494926453, "learning_rate": 9.281022459051029e-06, "loss": 0.0371, "step": 4783 }, { "epoch": 2.9108609674475208, "grad_norm": 0.4758877754211426, "learning_rate": 9.271344177985433e-06, "loss": 0.045, "step": 4784 }, { "epoch": 2.911469425007606, "grad_norm": 0.5374777913093567, "learning_rate": 9.261669796970881e-06, "loss": 0.0507, "step": 4785 }, { "epoch": 2.912077882567691, "grad_norm": 0.35278376936912537, "learning_rate": 9.251999318406213e-06, "loss": 0.0239, "step": 4786 }, { "epoch": 2.912686340127776, "grad_norm": 0.40819981694221497, "learning_rate": 9.242332744689314e-06, "loss": 0.026, "step": 4787 }, { "epoch": 2.913294797687861, "grad_norm": 0.5325673222541809, "learning_rate": 9.232670078217062e-06, "loss": 0.0409, "step": 4788 }, { "epoch": 2.9139032552479467, "grad_norm": 0.44101467728614807, "learning_rate": 9.223011321385435e-06, "loss": 0.0378, "step": 4789 }, { "epoch": 2.914511712808032, "grad_norm": 0.47359776496887207, "learning_rate": 9.213356476589375e-06, "loss": 0.0324, "step": 4790 }, { "epoch": 2.915120170368117, "grad_norm": 0.3451775908470154, "learning_rate": 9.2037055462229e-06, "loss": 0.0211, "step": 4791 }, { "epoch": 2.915728627928202, "grad_norm": 0.3723897635936737, "learning_rate": 9.194058532679006e-06, "loss": 0.0358, "step": 4792 }, { "epoch": 2.916337085488287, "grad_norm": 0.5562166571617126, "learning_rate": 9.184415438349788e-06, "loss": 0.0396, "step": 4793 }, { "epoch": 2.9169455430483726, "grad_norm": 0.41815292835235596, "learning_rate": 9.174776265626312e-06, "loss": 0.0194, "step": 4794 }, { "epoch": 2.9175540006084577, "grad_norm": 0.5843579173088074, "learning_rate": 9.16514101689869e-06, "loss": 0.0494, "step": 4795 }, { "epoch": 2.918162458168543, "grad_norm": 0.42992332577705383, "learning_rate": 9.15550969455607e-06, "loss": 0.033, "step": 4796 }, { "epoch": 2.918770915728628, "grad_norm": 0.4272845387458801, "learning_rate": 9.14588230098662e-06, "loss": 0.0286, "step": 4797 }, { "epoch": 2.919379373288713, "grad_norm": 0.4098367691040039, "learning_rate": 9.136258838577511e-06, "loss": 0.0226, "step": 4798 }, { "epoch": 2.9199878308487985, "grad_norm": 0.47262877225875854, "learning_rate": 9.126639309714997e-06, "loss": 0.0416, "step": 4799 }, { "epoch": 2.9205962884088836, "grad_norm": 0.5105498433113098, "learning_rate": 9.117023716784287e-06, "loss": 0.043, "step": 4800 }, { "epoch": 2.9212047459689687, "grad_norm": 0.4782066345214844, "learning_rate": 9.10741206216967e-06, "loss": 0.0292, "step": 4801 }, { "epoch": 2.921813203529054, "grad_norm": 0.5072314143180847, "learning_rate": 9.09780434825441e-06, "loss": 0.0364, "step": 4802 }, { "epoch": 2.922421661089139, "grad_norm": 0.46522530913352966, "learning_rate": 9.08820057742085e-06, "loss": 0.0315, "step": 4803 }, { "epoch": 2.9230301186492245, "grad_norm": 0.5069283246994019, "learning_rate": 9.078600752050304e-06, "loss": 0.0451, "step": 4804 }, { "epoch": 2.9236385762093096, "grad_norm": 0.3636358678340912, "learning_rate": 9.069004874523136e-06, "loss": 0.0215, "step": 4805 }, { "epoch": 2.9242470337693947, "grad_norm": 0.43121346831321716, "learning_rate": 9.059412947218718e-06, "loss": 0.0287, "step": 4806 }, { "epoch": 2.9248554913294798, "grad_norm": 0.35692986845970154, "learning_rate": 9.049824972515464e-06, "loss": 0.0224, "step": 4807 }, { "epoch": 2.925463948889565, "grad_norm": 0.4458438456058502, "learning_rate": 9.040240952790765e-06, "loss": 0.0279, "step": 4808 }, { "epoch": 2.9260724064496504, "grad_norm": 0.4637523591518402, "learning_rate": 9.030660890421089e-06, "loss": 0.0411, "step": 4809 }, { "epoch": 2.9266808640097355, "grad_norm": 0.4921475648880005, "learning_rate": 9.02108478778187e-06, "loss": 0.0346, "step": 4810 }, { "epoch": 2.9272893215698206, "grad_norm": 0.4898071587085724, "learning_rate": 9.011512647247588e-06, "loss": 0.0455, "step": 4811 }, { "epoch": 2.9278977791299057, "grad_norm": 0.3920941650867462, "learning_rate": 9.00194447119174e-06, "loss": 0.0236, "step": 4812 }, { "epoch": 2.9285062366899908, "grad_norm": 0.4192269444465637, "learning_rate": 8.992380261986837e-06, "loss": 0.0303, "step": 4813 }, { "epoch": 2.9291146942500763, "grad_norm": 0.44664058089256287, "learning_rate": 8.98282002200439e-06, "loss": 0.0363, "step": 4814 }, { "epoch": 2.929723151810161, "grad_norm": 0.5392205715179443, "learning_rate": 8.973263753614949e-06, "loss": 0.0398, "step": 4815 }, { "epoch": 2.9303316093702465, "grad_norm": 0.43230360746383667, "learning_rate": 8.96371145918807e-06, "loss": 0.0439, "step": 4816 }, { "epoch": 2.9309400669303316, "grad_norm": 0.3646244704723358, "learning_rate": 8.954163141092333e-06, "loss": 0.0249, "step": 4817 }, { "epoch": 2.9315485244904167, "grad_norm": 0.5149335861206055, "learning_rate": 8.944618801695295e-06, "loss": 0.0292, "step": 4818 }, { "epoch": 2.9321569820505022, "grad_norm": 0.3785509765148163, "learning_rate": 8.935078443363592e-06, "loss": 0.0376, "step": 4819 }, { "epoch": 2.932765439610587, "grad_norm": 0.5124600529670715, "learning_rate": 8.925542068462806e-06, "loss": 0.0407, "step": 4820 }, { "epoch": 2.9333738971706724, "grad_norm": 0.4565979838371277, "learning_rate": 8.91600967935757e-06, "loss": 0.0334, "step": 4821 }, { "epoch": 2.9339823547307575, "grad_norm": 0.5674571394920349, "learning_rate": 8.906481278411522e-06, "loss": 0.0486, "step": 4822 }, { "epoch": 2.9345908122908426, "grad_norm": 0.5254626870155334, "learning_rate": 8.896956867987313e-06, "loss": 0.0281, "step": 4823 }, { "epoch": 2.935199269850928, "grad_norm": 0.380880743265152, "learning_rate": 8.887436450446584e-06, "loss": 0.0277, "step": 4824 }, { "epoch": 2.935807727411013, "grad_norm": 0.41296547651290894, "learning_rate": 8.877920028150013e-06, "loss": 0.028, "step": 4825 }, { "epoch": 2.9364161849710984, "grad_norm": 0.5368117094039917, "learning_rate": 8.868407603457272e-06, "loss": 0.0386, "step": 4826 }, { "epoch": 2.9370246425311834, "grad_norm": 0.5776300430297852, "learning_rate": 8.858899178727045e-06, "loss": 0.0416, "step": 4827 }, { "epoch": 2.9376331000912685, "grad_norm": 0.38237324357032776, "learning_rate": 8.84939475631703e-06, "loss": 0.0324, "step": 4828 }, { "epoch": 2.938241557651354, "grad_norm": 0.42977848649024963, "learning_rate": 8.839894338583935e-06, "loss": 0.0297, "step": 4829 }, { "epoch": 2.9388500152114387, "grad_norm": 0.5375818610191345, "learning_rate": 8.830397927883446e-06, "loss": 0.0374, "step": 4830 }, { "epoch": 2.9394584727715243, "grad_norm": 0.6935356259346008, "learning_rate": 8.820905526570289e-06, "loss": 0.0601, "step": 4831 }, { "epoch": 2.9400669303316094, "grad_norm": 0.5644581317901611, "learning_rate": 8.811417136998184e-06, "loss": 0.0488, "step": 4832 }, { "epoch": 2.9406753878916945, "grad_norm": 0.4846365451812744, "learning_rate": 8.801932761519857e-06, "loss": 0.0472, "step": 4833 }, { "epoch": 2.94128384545178, "grad_norm": 0.4495841860771179, "learning_rate": 8.792452402487037e-06, "loss": 0.033, "step": 4834 }, { "epoch": 2.9418923030118647, "grad_norm": 0.42976799607276917, "learning_rate": 8.782976062250464e-06, "loss": 0.0334, "step": 4835 }, { "epoch": 2.94250076057195, "grad_norm": 0.5387213230133057, "learning_rate": 8.77350374315986e-06, "loss": 0.0398, "step": 4836 }, { "epoch": 2.9431092181320353, "grad_norm": 0.46354934573173523, "learning_rate": 8.764035447563976e-06, "loss": 0.0363, "step": 4837 }, { "epoch": 2.9437176756921204, "grad_norm": 0.471663236618042, "learning_rate": 8.75457117781055e-06, "loss": 0.0491, "step": 4838 }, { "epoch": 2.944326133252206, "grad_norm": 0.584274172782898, "learning_rate": 8.745110936246331e-06, "loss": 0.0437, "step": 4839 }, { "epoch": 2.9449345908122906, "grad_norm": 0.5374387502670288, "learning_rate": 8.73565472521707e-06, "loss": 0.0475, "step": 4840 }, { "epoch": 2.945543048372376, "grad_norm": 0.46591848134994507, "learning_rate": 8.726202547067496e-06, "loss": 0.0427, "step": 4841 }, { "epoch": 2.9461515059324612, "grad_norm": 0.3446124792098999, "learning_rate": 8.716754404141368e-06, "loss": 0.0233, "step": 4842 }, { "epoch": 2.9467599634925463, "grad_norm": 0.43566611409187317, "learning_rate": 8.707310298781427e-06, "loss": 0.0277, "step": 4843 }, { "epoch": 2.9473684210526314, "grad_norm": 0.38527342677116394, "learning_rate": 8.69787023332942e-06, "loss": 0.0224, "step": 4844 }, { "epoch": 2.9479768786127165, "grad_norm": 0.36706897616386414, "learning_rate": 8.688434210126099e-06, "loss": 0.0239, "step": 4845 }, { "epoch": 2.948585336172802, "grad_norm": 0.43865934014320374, "learning_rate": 8.679002231511182e-06, "loss": 0.0322, "step": 4846 }, { "epoch": 2.949193793732887, "grad_norm": 0.5478273034095764, "learning_rate": 8.669574299823422e-06, "loss": 0.0374, "step": 4847 }, { "epoch": 2.9498022512929722, "grad_norm": 0.4504847824573517, "learning_rate": 8.660150417400547e-06, "loss": 0.0363, "step": 4848 }, { "epoch": 2.9504107088530573, "grad_norm": 0.4088916778564453, "learning_rate": 8.650730586579292e-06, "loss": 0.036, "step": 4849 }, { "epoch": 2.9510191664131424, "grad_norm": 0.4998932480812073, "learning_rate": 8.641314809695389e-06, "loss": 0.0482, "step": 4850 }, { "epoch": 2.951627623973228, "grad_norm": 0.506369411945343, "learning_rate": 8.63190308908354e-06, "loss": 0.0313, "step": 4851 }, { "epoch": 2.952236081533313, "grad_norm": 0.4469977915287018, "learning_rate": 8.622495427077468e-06, "loss": 0.0239, "step": 4852 }, { "epoch": 2.952844539093398, "grad_norm": 0.4590071439743042, "learning_rate": 8.613091826009884e-06, "loss": 0.0423, "step": 4853 }, { "epoch": 2.9534529966534833, "grad_norm": 0.614018440246582, "learning_rate": 8.603692288212486e-06, "loss": 0.0576, "step": 4854 }, { "epoch": 2.9540614542135684, "grad_norm": 0.45783132314682007, "learning_rate": 8.59429681601597e-06, "loss": 0.0325, "step": 4855 }, { "epoch": 2.954669911773654, "grad_norm": 0.4062897562980652, "learning_rate": 8.58490541175003e-06, "loss": 0.0286, "step": 4856 }, { "epoch": 2.955278369333739, "grad_norm": 0.561907947063446, "learning_rate": 8.575518077743322e-06, "loss": 0.0356, "step": 4857 }, { "epoch": 2.955886826893824, "grad_norm": 0.42090898752212524, "learning_rate": 8.566134816323526e-06, "loss": 0.0326, "step": 4858 }, { "epoch": 2.956495284453909, "grad_norm": 0.45067688822746277, "learning_rate": 8.556755629817295e-06, "loss": 0.0378, "step": 4859 }, { "epoch": 2.9571037420139943, "grad_norm": 0.4070775806903839, "learning_rate": 8.54738052055029e-06, "loss": 0.0318, "step": 4860 }, { "epoch": 2.95771219957408, "grad_norm": 0.40776872634887695, "learning_rate": 8.538009490847124e-06, "loss": 0.0308, "step": 4861 }, { "epoch": 2.958320657134165, "grad_norm": 0.4585787355899811, "learning_rate": 8.528642543031448e-06, "loss": 0.0288, "step": 4862 }, { "epoch": 2.95892911469425, "grad_norm": 0.4308152496814728, "learning_rate": 8.519279679425851e-06, "loss": 0.0528, "step": 4863 }, { "epoch": 2.959537572254335, "grad_norm": 0.3569679856300354, "learning_rate": 8.509920902351948e-06, "loss": 0.0242, "step": 4864 }, { "epoch": 2.96014602981442, "grad_norm": 0.38262954354286194, "learning_rate": 8.500566214130317e-06, "loss": 0.0322, "step": 4865 }, { "epoch": 2.9607544873745058, "grad_norm": 0.4387529492378235, "learning_rate": 8.491215617080545e-06, "loss": 0.0275, "step": 4866 }, { "epoch": 2.961362944934591, "grad_norm": 0.41847920417785645, "learning_rate": 8.481869113521163e-06, "loss": 0.0294, "step": 4867 }, { "epoch": 2.961971402494676, "grad_norm": 0.41633474826812744, "learning_rate": 8.472526705769746e-06, "loss": 0.0315, "step": 4868 }, { "epoch": 2.962579860054761, "grad_norm": 0.5311760902404785, "learning_rate": 8.463188396142799e-06, "loss": 0.0422, "step": 4869 }, { "epoch": 2.963188317614846, "grad_norm": 0.4289592504501343, "learning_rate": 8.453854186955851e-06, "loss": 0.0289, "step": 4870 }, { "epoch": 2.9637967751749317, "grad_norm": 0.6336865425109863, "learning_rate": 8.444524080523373e-06, "loss": 0.0491, "step": 4871 }, { "epoch": 2.9644052327350168, "grad_norm": 0.3597281873226166, "learning_rate": 8.435198079158867e-06, "loss": 0.0251, "step": 4872 }, { "epoch": 2.965013690295102, "grad_norm": 0.5553586483001709, "learning_rate": 8.42587618517478e-06, "loss": 0.0345, "step": 4873 }, { "epoch": 2.965622147855187, "grad_norm": 0.5425621271133423, "learning_rate": 8.416558400882552e-06, "loss": 0.0432, "step": 4874 }, { "epoch": 2.966230605415272, "grad_norm": 0.3684383034706116, "learning_rate": 8.407244728592612e-06, "loss": 0.0214, "step": 4875 }, { "epoch": 2.9668390629753576, "grad_norm": 0.42099565267562866, "learning_rate": 8.397935170614366e-06, "loss": 0.0292, "step": 4876 }, { "epoch": 2.9674475205354427, "grad_norm": 0.48442554473876953, "learning_rate": 8.38862972925617e-06, "loss": 0.0449, "step": 4877 }, { "epoch": 2.968055978095528, "grad_norm": 0.491372287273407, "learning_rate": 8.379328406825426e-06, "loss": 0.0444, "step": 4878 }, { "epoch": 2.968664435655613, "grad_norm": 0.4108073115348816, "learning_rate": 8.370031205628442e-06, "loss": 0.0385, "step": 4879 }, { "epoch": 2.969272893215698, "grad_norm": 0.4929574429988861, "learning_rate": 8.36073812797055e-06, "loss": 0.0421, "step": 4880 }, { "epoch": 2.9698813507757835, "grad_norm": 0.4480864405632019, "learning_rate": 8.351449176156043e-06, "loss": 0.0321, "step": 4881 }, { "epoch": 2.9704898083358686, "grad_norm": 0.39888888597488403, "learning_rate": 8.342164352488202e-06, "loss": 0.0195, "step": 4882 }, { "epoch": 2.9710982658959537, "grad_norm": 0.475404292345047, "learning_rate": 8.332883659269261e-06, "loss": 0.031, "step": 4883 }, { "epoch": 2.971706723456039, "grad_norm": 0.509152352809906, "learning_rate": 8.323607098800454e-06, "loss": 0.0327, "step": 4884 }, { "epoch": 2.972315181016124, "grad_norm": 0.4782547652721405, "learning_rate": 8.314334673381976e-06, "loss": 0.0375, "step": 4885 }, { "epoch": 2.9729236385762094, "grad_norm": 0.3744230568408966, "learning_rate": 8.305066385313021e-06, "loss": 0.0262, "step": 4886 }, { "epoch": 2.9735320961362945, "grad_norm": 0.5393217206001282, "learning_rate": 8.2958022368917e-06, "loss": 0.0503, "step": 4887 }, { "epoch": 2.9741405536963796, "grad_norm": 0.44529324769973755, "learning_rate": 8.286542230415182e-06, "loss": 0.0344, "step": 4888 }, { "epoch": 2.9747490112564647, "grad_norm": 0.46004465222358704, "learning_rate": 8.277286368179526e-06, "loss": 0.0374, "step": 4889 }, { "epoch": 2.97535746881655, "grad_norm": 0.46514490246772766, "learning_rate": 8.268034652479817e-06, "loss": 0.0374, "step": 4890 }, { "epoch": 2.9759659263766354, "grad_norm": 0.4334418475627899, "learning_rate": 8.25878708561009e-06, "loss": 0.031, "step": 4891 }, { "epoch": 2.9765743839367205, "grad_norm": 0.3778439164161682, "learning_rate": 8.249543669863366e-06, "loss": 0.0354, "step": 4892 }, { "epoch": 2.9771828414968056, "grad_norm": 0.4430623948574066, "learning_rate": 8.240304407531602e-06, "loss": 0.0393, "step": 4893 }, { "epoch": 2.9777912990568907, "grad_norm": 0.3321574628353119, "learning_rate": 8.231069300905783e-06, "loss": 0.0243, "step": 4894 }, { "epoch": 2.9783997566169758, "grad_norm": 0.6121447086334229, "learning_rate": 8.221838352275807e-06, "loss": 0.0376, "step": 4895 }, { "epoch": 2.9790082141770613, "grad_norm": 0.48445889353752136, "learning_rate": 8.212611563930577e-06, "loss": 0.0425, "step": 4896 }, { "epoch": 2.9796166717371464, "grad_norm": 0.43182381987571716, "learning_rate": 8.20338893815793e-06, "loss": 0.0318, "step": 4897 }, { "epoch": 2.9802251292972315, "grad_norm": 0.4070221185684204, "learning_rate": 8.194170477244729e-06, "loss": 0.0291, "step": 4898 }, { "epoch": 2.9808335868573166, "grad_norm": 0.4428864121437073, "learning_rate": 8.184956183476735e-06, "loss": 0.0336, "step": 4899 }, { "epoch": 2.9814420444174017, "grad_norm": 0.4261319637298584, "learning_rate": 8.175746059138725e-06, "loss": 0.03, "step": 4900 }, { "epoch": 2.9820505019774872, "grad_norm": 0.45288318395614624, "learning_rate": 8.166540106514422e-06, "loss": 0.0374, "step": 4901 }, { "epoch": 2.9826589595375723, "grad_norm": 0.4366372227668762, "learning_rate": 8.15733832788653e-06, "loss": 0.037, "step": 4902 }, { "epoch": 2.9832674170976574, "grad_norm": 0.5225448608398438, "learning_rate": 8.148140725536676e-06, "loss": 0.0371, "step": 4903 }, { "epoch": 2.9838758746577425, "grad_norm": 0.47045180201530457, "learning_rate": 8.138947301745519e-06, "loss": 0.0357, "step": 4904 }, { "epoch": 2.9844843322178276, "grad_norm": 0.5777865648269653, "learning_rate": 8.129758058792623e-06, "loss": 0.0439, "step": 4905 }, { "epoch": 2.985092789777913, "grad_norm": 0.5593705773353577, "learning_rate": 8.120572998956546e-06, "loss": 0.0355, "step": 4906 }, { "epoch": 2.9857012473379982, "grad_norm": 0.41948768496513367, "learning_rate": 8.111392124514783e-06, "loss": 0.0446, "step": 4907 }, { "epoch": 2.9863097048980833, "grad_norm": 0.44301876425743103, "learning_rate": 8.102215437743835e-06, "loss": 0.0342, "step": 4908 }, { "epoch": 2.9869181624581684, "grad_norm": 0.4251943826675415, "learning_rate": 8.093042940919118e-06, "loss": 0.0345, "step": 4909 }, { "epoch": 2.9875266200182535, "grad_norm": 0.28501009941101074, "learning_rate": 8.083874636315034e-06, "loss": 0.0155, "step": 4910 }, { "epoch": 2.988135077578339, "grad_norm": 0.4958946108818054, "learning_rate": 8.07471052620494e-06, "loss": 0.0373, "step": 4911 }, { "epoch": 2.988743535138424, "grad_norm": 0.48140478134155273, "learning_rate": 8.065550612861164e-06, "loss": 0.0397, "step": 4912 }, { "epoch": 2.9893519926985093, "grad_norm": 0.556651771068573, "learning_rate": 8.05639489855496e-06, "loss": 0.0472, "step": 4913 }, { "epoch": 2.9899604502585944, "grad_norm": 0.4993399381637573, "learning_rate": 8.047243385556588e-06, "loss": 0.0437, "step": 4914 }, { "epoch": 2.9905689078186795, "grad_norm": 0.33873289823532104, "learning_rate": 8.038096076135227e-06, "loss": 0.0172, "step": 4915 }, { "epoch": 2.991177365378765, "grad_norm": 0.624120831489563, "learning_rate": 8.028952972559028e-06, "loss": 0.0467, "step": 4916 }, { "epoch": 2.99178582293885, "grad_norm": 0.4880430996417999, "learning_rate": 8.019814077095107e-06, "loss": 0.0364, "step": 4917 }, { "epoch": 2.992394280498935, "grad_norm": 0.40606868267059326, "learning_rate": 8.010679392009532e-06, "loss": 0.0334, "step": 4918 }, { "epoch": 2.9930027380590203, "grad_norm": 0.6444923281669617, "learning_rate": 8.001548919567312e-06, "loss": 0.0422, "step": 4919 }, { "epoch": 2.9936111956191054, "grad_norm": 0.45829200744628906, "learning_rate": 7.992422662032429e-06, "loss": 0.0284, "step": 4920 }, { "epoch": 2.994219653179191, "grad_norm": 0.38635167479515076, "learning_rate": 7.983300621667814e-06, "loss": 0.0256, "step": 4921 }, { "epoch": 2.994828110739276, "grad_norm": 0.47939708828926086, "learning_rate": 7.974182800735361e-06, "loss": 0.041, "step": 4922 }, { "epoch": 2.995436568299361, "grad_norm": 0.343191534280777, "learning_rate": 7.965069201495887e-06, "loss": 0.0212, "step": 4923 }, { "epoch": 2.996045025859446, "grad_norm": 0.40407371520996094, "learning_rate": 7.955959826209217e-06, "loss": 0.0233, "step": 4924 }, { "epoch": 2.9966534834195313, "grad_norm": 0.5077594518661499, "learning_rate": 7.946854677134072e-06, "loss": 0.0448, "step": 4925 }, { "epoch": 2.997261940979617, "grad_norm": 0.41432538628578186, "learning_rate": 7.937753756528155e-06, "loss": 0.0365, "step": 4926 }, { "epoch": 2.997870398539702, "grad_norm": 0.3649181127548218, "learning_rate": 7.928657066648118e-06, "loss": 0.0347, "step": 4927 }, { "epoch": 2.998478856099787, "grad_norm": 0.4499366283416748, "learning_rate": 7.919564609749568e-06, "loss": 0.0343, "step": 4928 }, { "epoch": 2.999087313659872, "grad_norm": 0.4373056888580322, "learning_rate": 7.910476388087038e-06, "loss": 0.035, "step": 4929 }, { "epoch": 2.9996957712199572, "grad_norm": 0.3516384959220886, "learning_rate": 7.90139240391404e-06, "loss": 0.0229, "step": 4930 }, { "epoch": 2.9996957712199572, "eval_loss": 1.429209589958191, "eval_runtime": 105.3655, "eval_samples_per_second": 7.232, "eval_steps_per_second": 0.456, "step": 4930 }, { "epoch": 3.0003042287800428, "grad_norm": 0.4027365744113922, "learning_rate": 7.892312659483017e-06, "loss": 0.0212, "step": 4931 }, { "epoch": 3.000912686340128, "grad_norm": 0.18064260482788086, "learning_rate": 7.88323715704538e-06, "loss": 0.0077, "step": 4932 }, { "epoch": 3.001521143900213, "grad_norm": 0.20326943695545197, "learning_rate": 7.874165898851454e-06, "loss": 0.0088, "step": 4933 }, { "epoch": 3.002129601460298, "grad_norm": 0.21207283437252045, "learning_rate": 7.865098887150557e-06, "loss": 0.0098, "step": 4934 }, { "epoch": 3.002738059020383, "grad_norm": 0.25013598799705505, "learning_rate": 7.856036124190913e-06, "loss": 0.0099, "step": 4935 }, { "epoch": 3.0033465165804687, "grad_norm": 0.16713546216487885, "learning_rate": 7.84697761221971e-06, "loss": 0.0071, "step": 4936 }, { "epoch": 3.003954974140554, "grad_norm": 0.20086368918418884, "learning_rate": 7.83792335348309e-06, "loss": 0.0092, "step": 4937 }, { "epoch": 3.004563431700639, "grad_norm": 0.19693416357040405, "learning_rate": 7.828873350226135e-06, "loss": 0.0084, "step": 4938 }, { "epoch": 3.005171889260724, "grad_norm": 0.22012776136398315, "learning_rate": 7.819827604692845e-06, "loss": 0.0081, "step": 4939 }, { "epoch": 3.005780346820809, "grad_norm": 0.15234141051769257, "learning_rate": 7.81078611912622e-06, "loss": 0.0062, "step": 4940 }, { "epoch": 3.0063888043808946, "grad_norm": 0.23004913330078125, "learning_rate": 7.801748895768148e-06, "loss": 0.0081, "step": 4941 }, { "epoch": 3.0069972619409797, "grad_norm": 0.32940471172332764, "learning_rate": 7.79271593685949e-06, "loss": 0.0068, "step": 4942 }, { "epoch": 3.007605719501065, "grad_norm": 0.19095928966999054, "learning_rate": 7.783687244640048e-06, "loss": 0.0077, "step": 4943 }, { "epoch": 3.00821417706115, "grad_norm": 0.25689762830734253, "learning_rate": 7.774662821348563e-06, "loss": 0.0107, "step": 4944 }, { "epoch": 3.008822634621235, "grad_norm": 0.22656317055225372, "learning_rate": 7.765642669222706e-06, "loss": 0.0056, "step": 4945 }, { "epoch": 3.0094310921813205, "grad_norm": 0.20489272475242615, "learning_rate": 7.7566267904991e-06, "loss": 0.0086, "step": 4946 }, { "epoch": 3.0100395497414056, "grad_norm": 0.24489440023899078, "learning_rate": 7.747615187413312e-06, "loss": 0.009, "step": 4947 }, { "epoch": 3.0106480073014907, "grad_norm": 0.32784104347229004, "learning_rate": 7.738607862199851e-06, "loss": 0.0107, "step": 4948 }, { "epoch": 3.011256464861576, "grad_norm": 0.25375160574913025, "learning_rate": 7.729604817092134e-06, "loss": 0.0066, "step": 4949 }, { "epoch": 3.011864922421661, "grad_norm": 0.33918920159339905, "learning_rate": 7.720606054322574e-06, "loss": 0.0123, "step": 4950 }, { "epoch": 3.0124733799817465, "grad_norm": 0.28510206937789917, "learning_rate": 7.71161157612246e-06, "loss": 0.0111, "step": 4951 }, { "epoch": 3.0130818375418316, "grad_norm": 0.18262362480163574, "learning_rate": 7.702621384722064e-06, "loss": 0.0048, "step": 4952 }, { "epoch": 3.0136902951019167, "grad_norm": 0.3342544138431549, "learning_rate": 7.693635482350572e-06, "loss": 0.013, "step": 4953 }, { "epoch": 3.0142987526620018, "grad_norm": 0.2429402768611908, "learning_rate": 7.684653871236125e-06, "loss": 0.0059, "step": 4954 }, { "epoch": 3.014907210222087, "grad_norm": 0.30693817138671875, "learning_rate": 7.675676553605768e-06, "loss": 0.007, "step": 4955 }, { "epoch": 3.0155156677821724, "grad_norm": 0.27962803840637207, "learning_rate": 7.666703531685516e-06, "loss": 0.0086, "step": 4956 }, { "epoch": 3.0161241253422575, "grad_norm": 0.5548810958862305, "learning_rate": 7.657734807700297e-06, "loss": 0.0059, "step": 4957 }, { "epoch": 3.0167325829023426, "grad_norm": 0.20625406503677368, "learning_rate": 7.648770383873988e-06, "loss": 0.0054, "step": 4958 }, { "epoch": 3.0173410404624277, "grad_norm": 0.19295968115329742, "learning_rate": 7.639810262429386e-06, "loss": 0.0042, "step": 4959 }, { "epoch": 3.0179494980225128, "grad_norm": 0.231733500957489, "learning_rate": 7.630854445588239e-06, "loss": 0.0065, "step": 4960 }, { "epoch": 3.0185579555825983, "grad_norm": 0.42650073766708374, "learning_rate": 7.621902935571201e-06, "loss": 0.0063, "step": 4961 }, { "epoch": 3.0191664131426834, "grad_norm": 0.21923278272151947, "learning_rate": 7.612955734597879e-06, "loss": 0.0045, "step": 4962 }, { "epoch": 3.0197748707027685, "grad_norm": 0.3130083978176117, "learning_rate": 7.6040128448868096e-06, "loss": 0.0065, "step": 4963 }, { "epoch": 3.0203833282628536, "grad_norm": 0.2901938557624817, "learning_rate": 7.5950742686554525e-06, "loss": 0.0072, "step": 4964 }, { "epoch": 3.0209917858229387, "grad_norm": 0.2919492721557617, "learning_rate": 7.58614000812021e-06, "loss": 0.0072, "step": 4965 }, { "epoch": 3.0216002433830242, "grad_norm": 0.2188805788755417, "learning_rate": 7.577210065496396e-06, "loss": 0.0032, "step": 4966 }, { "epoch": 3.0222087009431093, "grad_norm": 0.24292194843292236, "learning_rate": 7.568284442998263e-06, "loss": 0.0065, "step": 4967 }, { "epoch": 3.0228171585031944, "grad_norm": 0.26626601815223694, "learning_rate": 7.559363142839002e-06, "loss": 0.011, "step": 4968 }, { "epoch": 3.0234256160632795, "grad_norm": 0.2926938235759735, "learning_rate": 7.550446167230718e-06, "loss": 0.0076, "step": 4969 }, { "epoch": 3.0240340736233646, "grad_norm": 0.26158931851387024, "learning_rate": 7.5415335183844525e-06, "loss": 0.0093, "step": 4970 }, { "epoch": 3.02464253118345, "grad_norm": 0.2527156472206116, "learning_rate": 7.53262519851018e-06, "loss": 0.0066, "step": 4971 }, { "epoch": 3.0252509887435353, "grad_norm": 0.21547862887382507, "learning_rate": 7.52372120981677e-06, "loss": 0.0039, "step": 4972 }, { "epoch": 3.0258594463036204, "grad_norm": 0.1877271980047226, "learning_rate": 7.514821554512056e-06, "loss": 0.0046, "step": 4973 }, { "epoch": 3.0264679038637055, "grad_norm": 0.31582483649253845, "learning_rate": 7.505926234802777e-06, "loss": 0.0115, "step": 4974 }, { "epoch": 3.0270763614237906, "grad_norm": 0.2469736635684967, "learning_rate": 7.49703525289461e-06, "loss": 0.0073, "step": 4975 }, { "epoch": 3.027684818983876, "grad_norm": 0.2651013731956482, "learning_rate": 7.488148610992127e-06, "loss": 0.0059, "step": 4976 }, { "epoch": 3.028293276543961, "grad_norm": 0.3526837229728699, "learning_rate": 7.479266311298871e-06, "loss": 0.0103, "step": 4977 }, { "epoch": 3.0289017341040463, "grad_norm": 0.4271092712879181, "learning_rate": 7.470388356017266e-06, "loss": 0.0092, "step": 4978 }, { "epoch": 3.0295101916641314, "grad_norm": 0.1828882098197937, "learning_rate": 7.461514747348674e-06, "loss": 0.0029, "step": 4979 }, { "epoch": 3.0301186492242165, "grad_norm": 0.2836543917655945, "learning_rate": 7.452645487493387e-06, "loss": 0.008, "step": 4980 }, { "epoch": 3.030727106784302, "grad_norm": 0.5397449731826782, "learning_rate": 7.4437805786506164e-06, "loss": 0.0153, "step": 4981 }, { "epoch": 3.031335564344387, "grad_norm": 0.24427630007266998, "learning_rate": 7.434920023018476e-06, "loss": 0.004, "step": 4982 }, { "epoch": 3.031944021904472, "grad_norm": 0.320932000875473, "learning_rate": 7.426063822794022e-06, "loss": 0.0092, "step": 4983 }, { "epoch": 3.0325524794645573, "grad_norm": 0.280251145362854, "learning_rate": 7.417211980173222e-06, "loss": 0.0093, "step": 4984 }, { "epoch": 3.0331609370246424, "grad_norm": 0.19520291686058044, "learning_rate": 7.408364497350964e-06, "loss": 0.0055, "step": 4985 }, { "epoch": 3.0337693945847275, "grad_norm": 0.28560855984687805, "learning_rate": 7.3995213765210545e-06, "loss": 0.0072, "step": 4986 }, { "epoch": 3.034377852144813, "grad_norm": 0.226197749376297, "learning_rate": 7.390682619876227e-06, "loss": 0.0038, "step": 4987 }, { "epoch": 3.034986309704898, "grad_norm": 0.3306165039539337, "learning_rate": 7.38184822960811e-06, "loss": 0.0087, "step": 4988 }, { "epoch": 3.0355947672649832, "grad_norm": 0.20668867230415344, "learning_rate": 7.37301820790727e-06, "loss": 0.0043, "step": 4989 }, { "epoch": 3.0362032248250683, "grad_norm": 0.28162097930908203, "learning_rate": 7.364192556963187e-06, "loss": 0.0085, "step": 4990 }, { "epoch": 3.0368116823851534, "grad_norm": 0.2704124450683594, "learning_rate": 7.35537127896426e-06, "loss": 0.0047, "step": 4991 }, { "epoch": 3.037420139945239, "grad_norm": 0.18822601437568665, "learning_rate": 7.346554376097778e-06, "loss": 0.0044, "step": 4992 }, { "epoch": 3.038028597505324, "grad_norm": 0.28500285744667053, "learning_rate": 7.337741850549992e-06, "loss": 0.0073, "step": 4993 }, { "epoch": 3.038637055065409, "grad_norm": 0.394612193107605, "learning_rate": 7.328933704506022e-06, "loss": 0.0124, "step": 4994 }, { "epoch": 3.0392455126254942, "grad_norm": 0.2029077261686325, "learning_rate": 7.320129940149925e-06, "loss": 0.0046, "step": 4995 }, { "epoch": 3.0398539701855793, "grad_norm": 0.3149600625038147, "learning_rate": 7.311330559664673e-06, "loss": 0.0061, "step": 4996 }, { "epoch": 3.040462427745665, "grad_norm": 0.18582724034786224, "learning_rate": 7.3025355652321464e-06, "loss": 0.0046, "step": 4997 }, { "epoch": 3.04107088530575, "grad_norm": 0.2304268330335617, "learning_rate": 7.293744959033124e-06, "loss": 0.0065, "step": 4998 }, { "epoch": 3.041679342865835, "grad_norm": 0.3009764850139618, "learning_rate": 7.284958743247322e-06, "loss": 0.0064, "step": 4999 }, { "epoch": 3.04228780042592, "grad_norm": 0.2738197147846222, "learning_rate": 7.276176920053351e-06, "loss": 0.0107, "step": 5000 }, { "epoch": 3.0428962579860053, "grad_norm": 0.28810760378837585, "learning_rate": 7.267399491628748e-06, "loss": 0.0055, "step": 5001 }, { "epoch": 3.043504715546091, "grad_norm": 0.3494376838207245, "learning_rate": 7.258626460149922e-06, "loss": 0.0112, "step": 5002 }, { "epoch": 3.044113173106176, "grad_norm": 0.20463795959949493, "learning_rate": 7.249857827792253e-06, "loss": 0.0053, "step": 5003 }, { "epoch": 3.044721630666261, "grad_norm": 0.22482581436634064, "learning_rate": 7.241093596729976e-06, "loss": 0.0088, "step": 5004 }, { "epoch": 3.045330088226346, "grad_norm": 0.29432442784309387, "learning_rate": 7.232333769136254e-06, "loss": 0.0081, "step": 5005 }, { "epoch": 3.045938545786431, "grad_norm": 0.273941308259964, "learning_rate": 7.223578347183166e-06, "loss": 0.0092, "step": 5006 }, { "epoch": 3.0465470033465167, "grad_norm": 0.3165578842163086, "learning_rate": 7.2148273330416985e-06, "loss": 0.0134, "step": 5007 }, { "epoch": 3.047155460906602, "grad_norm": 0.293646901845932, "learning_rate": 7.206080728881715e-06, "loss": 0.009, "step": 5008 }, { "epoch": 3.047763918466687, "grad_norm": 0.24793358147144318, "learning_rate": 7.197338536872039e-06, "loss": 0.008, "step": 5009 }, { "epoch": 3.048372376026772, "grad_norm": 0.23065130412578583, "learning_rate": 7.188600759180347e-06, "loss": 0.0052, "step": 5010 }, { "epoch": 3.048980833586857, "grad_norm": 0.32581770420074463, "learning_rate": 7.1798673979732585e-06, "loss": 0.0089, "step": 5011 }, { "epoch": 3.0495892911469427, "grad_norm": 0.2617538571357727, "learning_rate": 7.17113845541626e-06, "loss": 0.0074, "step": 5012 }, { "epoch": 3.0501977487070278, "grad_norm": 0.2555639445781708, "learning_rate": 7.162413933673795e-06, "loss": 0.008, "step": 5013 }, { "epoch": 3.050806206267113, "grad_norm": 0.1875854879617691, "learning_rate": 7.153693834909161e-06, "loss": 0.005, "step": 5014 }, { "epoch": 3.051414663827198, "grad_norm": 0.19386368989944458, "learning_rate": 7.144978161284585e-06, "loss": 0.0039, "step": 5015 }, { "epoch": 3.052023121387283, "grad_norm": 0.23817914724349976, "learning_rate": 7.13626691496119e-06, "loss": 0.0068, "step": 5016 }, { "epoch": 3.0526315789473686, "grad_norm": 0.215720534324646, "learning_rate": 7.127560098099012e-06, "loss": 0.006, "step": 5017 }, { "epoch": 3.0532400365074537, "grad_norm": 0.35492417216300964, "learning_rate": 7.118857712856952e-06, "loss": 0.0065, "step": 5018 }, { "epoch": 3.0538484940675388, "grad_norm": 0.30515995621681213, "learning_rate": 7.110159761392876e-06, "loss": 0.0106, "step": 5019 }, { "epoch": 3.054456951627624, "grad_norm": 0.1642477661371231, "learning_rate": 7.101466245863483e-06, "loss": 0.0038, "step": 5020 }, { "epoch": 3.055065409187709, "grad_norm": 0.3018817901611328, "learning_rate": 7.092777168424422e-06, "loss": 0.0085, "step": 5021 }, { "epoch": 3.0556738667477945, "grad_norm": 0.4562574625015259, "learning_rate": 7.084092531230196e-06, "loss": 0.0108, "step": 5022 }, { "epoch": 3.0562823243078796, "grad_norm": 0.3128783702850342, "learning_rate": 7.07541233643427e-06, "loss": 0.007, "step": 5023 }, { "epoch": 3.0568907818679647, "grad_norm": 0.2551473379135132, "learning_rate": 7.066736586188941e-06, "loss": 0.0075, "step": 5024 }, { "epoch": 3.05749923942805, "grad_norm": 0.20833851397037506, "learning_rate": 7.058065282645443e-06, "loss": 0.0061, "step": 5025 }, { "epoch": 3.058107696988135, "grad_norm": 0.2601064443588257, "learning_rate": 7.049398427953899e-06, "loss": 0.005, "step": 5026 }, { "epoch": 3.0587161545482204, "grad_norm": 0.2717060148715973, "learning_rate": 7.040736024263334e-06, "loss": 0.007, "step": 5027 }, { "epoch": 3.0593246121083055, "grad_norm": 0.1747554987668991, "learning_rate": 7.03207807372164e-06, "loss": 0.0047, "step": 5028 }, { "epoch": 3.0599330696683906, "grad_norm": 0.3020252585411072, "learning_rate": 7.023424578475659e-06, "loss": 0.008, "step": 5029 }, { "epoch": 3.0605415272284757, "grad_norm": 0.20203709602355957, "learning_rate": 7.014775540671076e-06, "loss": 0.0062, "step": 5030 }, { "epoch": 3.061149984788561, "grad_norm": 0.30242085456848145, "learning_rate": 7.006130962452498e-06, "loss": 0.0091, "step": 5031 }, { "epoch": 3.0617584423486464, "grad_norm": 0.25199273228645325, "learning_rate": 6.997490845963417e-06, "loss": 0.0062, "step": 5032 }, { "epoch": 3.0623668999087315, "grad_norm": 0.2626577615737915, "learning_rate": 6.988855193346236e-06, "loss": 0.0067, "step": 5033 }, { "epoch": 3.0629753574688166, "grad_norm": 0.21209199726581573, "learning_rate": 6.980224006742214e-06, "loss": 0.0036, "step": 5034 }, { "epoch": 3.0635838150289016, "grad_norm": 0.14695894718170166, "learning_rate": 6.97159728829154e-06, "loss": 0.0028, "step": 5035 }, { "epoch": 3.0641922725889867, "grad_norm": 0.28200700879096985, "learning_rate": 6.9629750401332766e-06, "loss": 0.0076, "step": 5036 }, { "epoch": 3.0648007301490723, "grad_norm": 0.26668545603752136, "learning_rate": 6.954357264405392e-06, "loss": 0.0058, "step": 5037 }, { "epoch": 3.0654091877091574, "grad_norm": 0.2839964032173157, "learning_rate": 6.945743963244711e-06, "loss": 0.0102, "step": 5038 }, { "epoch": 3.0660176452692425, "grad_norm": 0.29744213819503784, "learning_rate": 6.937135138787001e-06, "loss": 0.0084, "step": 5039 }, { "epoch": 3.0666261028293276, "grad_norm": 0.3763432800769806, "learning_rate": 6.928530793166874e-06, "loss": 0.0099, "step": 5040 }, { "epoch": 3.0672345603894127, "grad_norm": 0.23103450238704681, "learning_rate": 6.919930928517854e-06, "loss": 0.0071, "step": 5041 }, { "epoch": 3.067843017949498, "grad_norm": 0.3754606246948242, "learning_rate": 6.91133554697235e-06, "loss": 0.0078, "step": 5042 }, { "epoch": 3.0684514755095833, "grad_norm": 0.31299889087677, "learning_rate": 6.902744650661663e-06, "loss": 0.0066, "step": 5043 }, { "epoch": 3.0690599330696684, "grad_norm": 0.26163333654403687, "learning_rate": 6.894158241715959e-06, "loss": 0.0059, "step": 5044 }, { "epoch": 3.0696683906297535, "grad_norm": 0.25679513812065125, "learning_rate": 6.885576322264336e-06, "loss": 0.0043, "step": 5045 }, { "epoch": 3.0702768481898386, "grad_norm": 0.38195693492889404, "learning_rate": 6.87699889443473e-06, "loss": 0.0088, "step": 5046 }, { "epoch": 3.070885305749924, "grad_norm": 0.3322720229625702, "learning_rate": 6.868425960354005e-06, "loss": 0.0103, "step": 5047 }, { "epoch": 3.0714937633100092, "grad_norm": 0.3915663957595825, "learning_rate": 6.859857522147864e-06, "loss": 0.0069, "step": 5048 }, { "epoch": 3.0721022208700943, "grad_norm": 0.26933425664901733, "learning_rate": 6.851293581940954e-06, "loss": 0.0077, "step": 5049 }, { "epoch": 3.0727106784301794, "grad_norm": 0.24823108315467834, "learning_rate": 6.842734141856755e-06, "loss": 0.0065, "step": 5050 }, { "epoch": 3.0733191359902645, "grad_norm": 0.31463244557380676, "learning_rate": 6.834179204017655e-06, "loss": 0.0049, "step": 5051 }, { "epoch": 3.07392759355035, "grad_norm": 0.23442064225673676, "learning_rate": 6.82562877054492e-06, "loss": 0.0064, "step": 5052 }, { "epoch": 3.074536051110435, "grad_norm": 0.36819660663604736, "learning_rate": 6.817082843558717e-06, "loss": 0.0045, "step": 5053 }, { "epoch": 3.0751445086705202, "grad_norm": 0.24585528671741486, "learning_rate": 6.80854142517805e-06, "loss": 0.0048, "step": 5054 }, { "epoch": 3.0757529662306053, "grad_norm": 0.18307526409626007, "learning_rate": 6.80000451752087e-06, "loss": 0.0048, "step": 5055 }, { "epoch": 3.0763614237906904, "grad_norm": 0.20536406338214874, "learning_rate": 6.791472122703946e-06, "loss": 0.0045, "step": 5056 }, { "epoch": 3.076969881350776, "grad_norm": 0.3933330774307251, "learning_rate": 6.782944242842976e-06, "loss": 0.0126, "step": 5057 }, { "epoch": 3.077578338910861, "grad_norm": 0.2311299741268158, "learning_rate": 6.774420880052496e-06, "loss": 0.0061, "step": 5058 }, { "epoch": 3.078186796470946, "grad_norm": 0.2359190732240677, "learning_rate": 6.7659020364459705e-06, "loss": 0.0067, "step": 5059 }, { "epoch": 3.0787952540310313, "grad_norm": 0.2867051959037781, "learning_rate": 6.757387714135696e-06, "loss": 0.0055, "step": 5060 }, { "epoch": 3.0794037115911164, "grad_norm": 0.2318498194217682, "learning_rate": 6.748877915232882e-06, "loss": 0.0062, "step": 5061 }, { "epoch": 3.080012169151202, "grad_norm": 0.15246573090553284, "learning_rate": 6.7403726418476005e-06, "loss": 0.003, "step": 5062 }, { "epoch": 3.080620626711287, "grad_norm": 0.30910444259643555, "learning_rate": 6.731871896088812e-06, "loss": 0.009, "step": 5063 }, { "epoch": 3.081229084271372, "grad_norm": 0.32887697219848633, "learning_rate": 6.723375680064325e-06, "loss": 0.007, "step": 5064 }, { "epoch": 3.081837541831457, "grad_norm": 0.38914939761161804, "learning_rate": 6.714883995880877e-06, "loss": 0.0124, "step": 5065 }, { "epoch": 3.0824459993915423, "grad_norm": 0.3048250079154968, "learning_rate": 6.706396845644031e-06, "loss": 0.0073, "step": 5066 }, { "epoch": 3.083054456951628, "grad_norm": 0.28059664368629456, "learning_rate": 6.697914231458249e-06, "loss": 0.0043, "step": 5067 }, { "epoch": 3.083662914511713, "grad_norm": 0.23873847723007202, "learning_rate": 6.689436155426873e-06, "loss": 0.0053, "step": 5068 }, { "epoch": 3.084271372071798, "grad_norm": 0.4285385012626648, "learning_rate": 6.680962619652115e-06, "loss": 0.0055, "step": 5069 }, { "epoch": 3.084879829631883, "grad_norm": 0.2138291299343109, "learning_rate": 6.672493626235044e-06, "loss": 0.0059, "step": 5070 }, { "epoch": 3.085488287191968, "grad_norm": 0.591597318649292, "learning_rate": 6.664029177275624e-06, "loss": 0.0073, "step": 5071 }, { "epoch": 3.0860967447520538, "grad_norm": 0.301170289516449, "learning_rate": 6.655569274872689e-06, "loss": 0.0088, "step": 5072 }, { "epoch": 3.086705202312139, "grad_norm": 0.309881329536438, "learning_rate": 6.647113921123941e-06, "loss": 0.0096, "step": 5073 }, { "epoch": 3.087313659872224, "grad_norm": 0.17016667127609253, "learning_rate": 6.638663118125951e-06, "loss": 0.0056, "step": 5074 }, { "epoch": 3.087922117432309, "grad_norm": 0.18052910268306732, "learning_rate": 6.6302168679741785e-06, "loss": 0.0032, "step": 5075 }, { "epoch": 3.088530574992394, "grad_norm": 0.22513878345489502, "learning_rate": 6.6217751727629285e-06, "loss": 0.0041, "step": 5076 }, { "epoch": 3.0891390325524792, "grad_norm": 0.2399200052022934, "learning_rate": 6.61333803458539e-06, "loss": 0.0082, "step": 5077 }, { "epoch": 3.0897474901125648, "grad_norm": 0.20230963826179504, "learning_rate": 6.604905455533625e-06, "loss": 0.0073, "step": 5078 }, { "epoch": 3.09035594767265, "grad_norm": 0.24099519848823547, "learning_rate": 6.596477437698565e-06, "loss": 0.0054, "step": 5079 }, { "epoch": 3.090964405232735, "grad_norm": 0.16781140863895416, "learning_rate": 6.588053983170006e-06, "loss": 0.0036, "step": 5080 }, { "epoch": 3.09157286279282, "grad_norm": 0.24010391533374786, "learning_rate": 6.579635094036607e-06, "loss": 0.0067, "step": 5081 }, { "epoch": 3.092181320352905, "grad_norm": 0.3611644208431244, "learning_rate": 6.571220772385905e-06, "loss": 0.009, "step": 5082 }, { "epoch": 3.0927897779129907, "grad_norm": 0.2840390205383301, "learning_rate": 6.562811020304305e-06, "loss": 0.008, "step": 5083 }, { "epoch": 3.093398235473076, "grad_norm": 0.29138410091400146, "learning_rate": 6.55440583987707e-06, "loss": 0.0054, "step": 5084 }, { "epoch": 3.094006693033161, "grad_norm": 0.23507152497768402, "learning_rate": 6.546005233188343e-06, "loss": 0.0051, "step": 5085 }, { "epoch": 3.094615150593246, "grad_norm": 0.2997680902481079, "learning_rate": 6.537609202321113e-06, "loss": 0.007, "step": 5086 }, { "epoch": 3.095223608153331, "grad_norm": 0.2853991389274597, "learning_rate": 6.529217749357247e-06, "loss": 0.0089, "step": 5087 }, { "epoch": 3.0958320657134166, "grad_norm": 0.18508751690387726, "learning_rate": 6.520830876377482e-06, "loss": 0.0029, "step": 5088 }, { "epoch": 3.0964405232735017, "grad_norm": 0.1906099170446396, "learning_rate": 6.5124485854614086e-06, "loss": 0.0041, "step": 5089 }, { "epoch": 3.097048980833587, "grad_norm": 0.3323763906955719, "learning_rate": 6.504070878687485e-06, "loss": 0.0109, "step": 5090 }, { "epoch": 3.097657438393672, "grad_norm": 0.2999829947948456, "learning_rate": 6.495697758133046e-06, "loss": 0.0077, "step": 5091 }, { "epoch": 3.098265895953757, "grad_norm": 0.3580150604248047, "learning_rate": 6.487329225874256e-06, "loss": 0.0135, "step": 5092 }, { "epoch": 3.0988743535138426, "grad_norm": 0.17444658279418945, "learning_rate": 6.478965283986174e-06, "loss": 0.0035, "step": 5093 }, { "epoch": 3.0994828110739276, "grad_norm": 0.20913676917552948, "learning_rate": 6.470605934542703e-06, "loss": 0.0045, "step": 5094 }, { "epoch": 3.1000912686340127, "grad_norm": 0.2408442199230194, "learning_rate": 6.462251179616621e-06, "loss": 0.0065, "step": 5095 }, { "epoch": 3.100699726194098, "grad_norm": 0.18831227719783783, "learning_rate": 6.453901021279559e-06, "loss": 0.0035, "step": 5096 }, { "epoch": 3.101308183754183, "grad_norm": 0.3297005295753479, "learning_rate": 6.445555461602001e-06, "loss": 0.0078, "step": 5097 }, { "epoch": 3.1019166413142685, "grad_norm": 0.21692611277103424, "learning_rate": 6.437214502653299e-06, "loss": 0.0051, "step": 5098 }, { "epoch": 3.1025250988743536, "grad_norm": 0.24288775026798248, "learning_rate": 6.428878146501666e-06, "loss": 0.0051, "step": 5099 }, { "epoch": 3.1031335564344387, "grad_norm": 0.17630305886268616, "learning_rate": 6.420546395214167e-06, "loss": 0.0034, "step": 5100 }, { "epoch": 3.1037420139945238, "grad_norm": 0.2515610158443451, "learning_rate": 6.412219250856735e-06, "loss": 0.006, "step": 5101 }, { "epoch": 3.104350471554609, "grad_norm": 0.3058002293109894, "learning_rate": 6.403896715494159e-06, "loss": 0.01, "step": 5102 }, { "epoch": 3.1049589291146944, "grad_norm": 0.29932624101638794, "learning_rate": 6.395578791190066e-06, "loss": 0.0046, "step": 5103 }, { "epoch": 3.1055673866747795, "grad_norm": 0.2611561119556427, "learning_rate": 6.3872654800069626e-06, "loss": 0.0097, "step": 5104 }, { "epoch": 3.1061758442348646, "grad_norm": 0.2513970136642456, "learning_rate": 6.378956784006204e-06, "loss": 0.006, "step": 5105 }, { "epoch": 3.1067843017949497, "grad_norm": 0.2295328974723816, "learning_rate": 6.370652705248007e-06, "loss": 0.0055, "step": 5106 }, { "epoch": 3.107392759355035, "grad_norm": 0.3369624614715576, "learning_rate": 6.362353245791411e-06, "loss": 0.013, "step": 5107 }, { "epoch": 3.1080012169151203, "grad_norm": 0.21619094908237457, "learning_rate": 6.354058407694374e-06, "loss": 0.0059, "step": 5108 }, { "epoch": 3.1086096744752054, "grad_norm": 0.2147800624370575, "learning_rate": 6.34576819301364e-06, "loss": 0.0071, "step": 5109 }, { "epoch": 3.1092181320352905, "grad_norm": 0.18836335837841034, "learning_rate": 6.337482603804851e-06, "loss": 0.0051, "step": 5110 }, { "epoch": 3.1098265895953756, "grad_norm": 0.15471284091472626, "learning_rate": 6.329201642122481e-06, "loss": 0.0046, "step": 5111 }, { "epoch": 3.1104350471554607, "grad_norm": 0.2526698708534241, "learning_rate": 6.320925310019876e-06, "loss": 0.0071, "step": 5112 }, { "epoch": 3.1110435047155462, "grad_norm": 0.2389310896396637, "learning_rate": 6.312653609549196e-06, "loss": 0.0065, "step": 5113 }, { "epoch": 3.1116519622756313, "grad_norm": 0.22375918924808502, "learning_rate": 6.304386542761509e-06, "loss": 0.0057, "step": 5114 }, { "epoch": 3.1122604198357164, "grad_norm": 0.29425686597824097, "learning_rate": 6.29612411170668e-06, "loss": 0.005, "step": 5115 }, { "epoch": 3.1128688773958015, "grad_norm": 0.37902581691741943, "learning_rate": 6.287866318433464e-06, "loss": 0.0065, "step": 5116 }, { "epoch": 3.1134773349558866, "grad_norm": 0.3976466655731201, "learning_rate": 6.279613164989426e-06, "loss": 0.0142, "step": 5117 }, { "epoch": 3.114085792515972, "grad_norm": 0.2854953706264496, "learning_rate": 6.27136465342103e-06, "loss": 0.0062, "step": 5118 }, { "epoch": 3.1146942500760573, "grad_norm": 0.6086744070053101, "learning_rate": 6.263120785773549e-06, "loss": 0.0085, "step": 5119 }, { "epoch": 3.1153027076361424, "grad_norm": 0.2866113483905792, "learning_rate": 6.254881564091119e-06, "loss": 0.0042, "step": 5120 }, { "epoch": 3.1159111651962275, "grad_norm": 0.20252281427383423, "learning_rate": 6.246646990416727e-06, "loss": 0.0056, "step": 5121 }, { "epoch": 3.1165196227563126, "grad_norm": 0.21693319082260132, "learning_rate": 6.238417066792212e-06, "loss": 0.0052, "step": 5122 }, { "epoch": 3.117128080316398, "grad_norm": 0.2629375159740448, "learning_rate": 6.230191795258228e-06, "loss": 0.0105, "step": 5123 }, { "epoch": 3.117736537876483, "grad_norm": 0.18073906004428864, "learning_rate": 6.2219711778543274e-06, "loss": 0.0055, "step": 5124 }, { "epoch": 3.1183449954365683, "grad_norm": 0.3107531666755676, "learning_rate": 6.213755216618861e-06, "loss": 0.0077, "step": 5125 }, { "epoch": 3.1189534529966534, "grad_norm": 0.3121819496154785, "learning_rate": 6.205543913589059e-06, "loss": 0.0042, "step": 5126 }, { "epoch": 3.1195619105567385, "grad_norm": 0.3623504638671875, "learning_rate": 6.19733727080096e-06, "loss": 0.0076, "step": 5127 }, { "epoch": 3.120170368116824, "grad_norm": 0.3374268412590027, "learning_rate": 6.189135290289499e-06, "loss": 0.0076, "step": 5128 }, { "epoch": 3.120778825676909, "grad_norm": 0.2626723051071167, "learning_rate": 6.180937974088405e-06, "loss": 0.0045, "step": 5129 }, { "epoch": 3.121387283236994, "grad_norm": 0.39012977480888367, "learning_rate": 6.172745324230275e-06, "loss": 0.0123, "step": 5130 }, { "epoch": 3.1219957407970793, "grad_norm": 0.1949702948331833, "learning_rate": 6.164557342746547e-06, "loss": 0.0064, "step": 5131 }, { "epoch": 3.1226041983571644, "grad_norm": 0.3511587679386139, "learning_rate": 6.156374031667503e-06, "loss": 0.0094, "step": 5132 }, { "epoch": 3.12321265591725, "grad_norm": 0.2506071627140045, "learning_rate": 6.1481953930222435e-06, "loss": 0.0068, "step": 5133 }, { "epoch": 3.123821113477335, "grad_norm": 0.2543926537036896, "learning_rate": 6.140021428838761e-06, "loss": 0.0035, "step": 5134 }, { "epoch": 3.12442957103742, "grad_norm": 0.2524166703224182, "learning_rate": 6.131852141143834e-06, "loss": 0.0059, "step": 5135 }, { "epoch": 3.1250380285975052, "grad_norm": 0.3138072192668915, "learning_rate": 6.123687531963113e-06, "loss": 0.0061, "step": 5136 }, { "epoch": 3.1256464861575903, "grad_norm": 0.32244062423706055, "learning_rate": 6.115527603321081e-06, "loss": 0.0081, "step": 5137 }, { "epoch": 3.126254943717676, "grad_norm": 0.21048331260681152, "learning_rate": 6.1073723572410645e-06, "loss": 0.0037, "step": 5138 }, { "epoch": 3.126863401277761, "grad_norm": 0.255831778049469, "learning_rate": 6.099221795745213e-06, "loss": 0.0074, "step": 5139 }, { "epoch": 3.127471858837846, "grad_norm": 0.3076886236667633, "learning_rate": 6.091075920854536e-06, "loss": 0.0058, "step": 5140 }, { "epoch": 3.128080316397931, "grad_norm": 0.21628208458423615, "learning_rate": 6.082934734588866e-06, "loss": 0.0051, "step": 5141 }, { "epoch": 3.1286887739580163, "grad_norm": 0.2794947326183319, "learning_rate": 6.074798238966886e-06, "loss": 0.0078, "step": 5142 }, { "epoch": 3.129297231518102, "grad_norm": 0.38834357261657715, "learning_rate": 6.066666436006088e-06, "loss": 0.0055, "step": 5143 }, { "epoch": 3.129905689078187, "grad_norm": 0.28329214453697205, "learning_rate": 6.058539327722848e-06, "loss": 0.0069, "step": 5144 }, { "epoch": 3.130514146638272, "grad_norm": 0.2482929527759552, "learning_rate": 6.050416916132329e-06, "loss": 0.0069, "step": 5145 }, { "epoch": 3.131122604198357, "grad_norm": 0.22554363310337067, "learning_rate": 6.042299203248555e-06, "loss": 0.0045, "step": 5146 }, { "epoch": 3.131731061758442, "grad_norm": 0.3571288287639618, "learning_rate": 6.034186191084384e-06, "loss": 0.0082, "step": 5147 }, { "epoch": 3.1323395193185277, "grad_norm": 0.24375028908252716, "learning_rate": 6.026077881651513e-06, "loss": 0.0037, "step": 5148 }, { "epoch": 3.132947976878613, "grad_norm": 0.33513811230659485, "learning_rate": 6.017974276960445e-06, "loss": 0.0047, "step": 5149 }, { "epoch": 3.133556434438698, "grad_norm": 0.32763129472732544, "learning_rate": 6.0098753790205494e-06, "loss": 0.0091, "step": 5150 }, { "epoch": 3.134164891998783, "grad_norm": 0.18572641909122467, "learning_rate": 6.001781189840011e-06, "loss": 0.0065, "step": 5151 }, { "epoch": 3.134773349558868, "grad_norm": 0.24770891666412354, "learning_rate": 5.9936917114258585e-06, "loss": 0.007, "step": 5152 }, { "epoch": 3.1353818071189536, "grad_norm": 0.38206419348716736, "learning_rate": 5.985606945783926e-06, "loss": 0.005, "step": 5153 }, { "epoch": 3.1359902646790387, "grad_norm": 0.24334634840488434, "learning_rate": 5.977526894918928e-06, "loss": 0.0062, "step": 5154 }, { "epoch": 3.136598722239124, "grad_norm": 0.23436123132705688, "learning_rate": 5.969451560834355e-06, "loss": 0.0062, "step": 5155 }, { "epoch": 3.137207179799209, "grad_norm": 0.20244354009628296, "learning_rate": 5.9613809455325635e-06, "loss": 0.0046, "step": 5156 }, { "epoch": 3.137815637359294, "grad_norm": 0.20892736315727234, "learning_rate": 5.95331505101473e-06, "loss": 0.0037, "step": 5157 }, { "epoch": 3.1384240949193796, "grad_norm": 0.32044070959091187, "learning_rate": 5.945253879280862e-06, "loss": 0.0097, "step": 5158 }, { "epoch": 3.1390325524794647, "grad_norm": 0.27256110310554504, "learning_rate": 5.93719743232978e-06, "loss": 0.0057, "step": 5159 }, { "epoch": 3.1396410100395498, "grad_norm": 0.22417287528514862, "learning_rate": 5.929145712159173e-06, "loss": 0.0039, "step": 5160 }, { "epoch": 3.140249467599635, "grad_norm": 0.28702032566070557, "learning_rate": 5.921098720765508e-06, "loss": 0.0038, "step": 5161 }, { "epoch": 3.14085792515972, "grad_norm": 0.3003734350204468, "learning_rate": 5.91305646014412e-06, "loss": 0.0079, "step": 5162 }, { "epoch": 3.1414663827198055, "grad_norm": 0.3371279835700989, "learning_rate": 5.905018932289133e-06, "loss": 0.0085, "step": 5163 }, { "epoch": 3.1420748402798906, "grad_norm": 0.24667906761169434, "learning_rate": 5.896986139193547e-06, "loss": 0.0089, "step": 5164 }, { "epoch": 3.1426832978399757, "grad_norm": 0.20292513072490692, "learning_rate": 5.888958082849139e-06, "loss": 0.0047, "step": 5165 }, { "epoch": 3.143291755400061, "grad_norm": 0.1873752474784851, "learning_rate": 5.880934765246537e-06, "loss": 0.0056, "step": 5166 }, { "epoch": 3.143900212960146, "grad_norm": 0.2573238015174866, "learning_rate": 5.872916188375194e-06, "loss": 0.0035, "step": 5167 }, { "epoch": 3.1445086705202314, "grad_norm": 0.22613008320331573, "learning_rate": 5.864902354223384e-06, "loss": 0.0063, "step": 5168 }, { "epoch": 3.1451171280803165, "grad_norm": 0.2717379927635193, "learning_rate": 5.856893264778188e-06, "loss": 0.0072, "step": 5169 }, { "epoch": 3.1457255856404016, "grad_norm": 0.18429331481456757, "learning_rate": 5.848888922025553e-06, "loss": 0.005, "step": 5170 }, { "epoch": 3.1463340432004867, "grad_norm": 0.2671307921409607, "learning_rate": 5.8408893279502e-06, "loss": 0.0073, "step": 5171 }, { "epoch": 3.146942500760572, "grad_norm": 0.263640820980072, "learning_rate": 5.832894484535709e-06, "loss": 0.0094, "step": 5172 }, { "epoch": 3.1475509583206573, "grad_norm": 0.17915846407413483, "learning_rate": 5.8249043937644465e-06, "loss": 0.0039, "step": 5173 }, { "epoch": 3.1481594158807424, "grad_norm": 0.30745455622673035, "learning_rate": 5.816919057617653e-06, "loss": 0.0067, "step": 5174 }, { "epoch": 3.1487678734408275, "grad_norm": 0.2966357171535492, "learning_rate": 5.808938478075335e-06, "loss": 0.0063, "step": 5175 }, { "epoch": 3.1493763310009126, "grad_norm": 0.3174609839916229, "learning_rate": 5.800962657116351e-06, "loss": 0.0088, "step": 5176 }, { "epoch": 3.1499847885609977, "grad_norm": 0.25037750601768494, "learning_rate": 5.792991596718375e-06, "loss": 0.0063, "step": 5177 }, { "epoch": 3.1505932461210833, "grad_norm": 0.30678263306617737, "learning_rate": 5.785025298857902e-06, "loss": 0.0043, "step": 5178 }, { "epoch": 3.1512017036811684, "grad_norm": 0.25345203280448914, "learning_rate": 5.777063765510219e-06, "loss": 0.0061, "step": 5179 }, { "epoch": 3.1518101612412535, "grad_norm": 0.5578262209892273, "learning_rate": 5.769106998649488e-06, "loss": 0.0109, "step": 5180 }, { "epoch": 3.1524186188013386, "grad_norm": 0.2420710027217865, "learning_rate": 5.761155000248627e-06, "loss": 0.007, "step": 5181 }, { "epoch": 3.1530270763614237, "grad_norm": 0.21901202201843262, "learning_rate": 5.753207772279415e-06, "loss": 0.0049, "step": 5182 }, { "epoch": 3.153635533921509, "grad_norm": 0.26763221621513367, "learning_rate": 5.745265316712428e-06, "loss": 0.0065, "step": 5183 }, { "epoch": 3.1542439914815943, "grad_norm": 0.23685556650161743, "learning_rate": 5.737327635517073e-06, "loss": 0.007, "step": 5184 }, { "epoch": 3.1548524490416794, "grad_norm": 0.318765789270401, "learning_rate": 5.729394730661547e-06, "loss": 0.0065, "step": 5185 }, { "epoch": 3.1554609066017645, "grad_norm": 0.289823055267334, "learning_rate": 5.721466604112893e-06, "loss": 0.0079, "step": 5186 }, { "epoch": 3.1560693641618496, "grad_norm": 0.23716124892234802, "learning_rate": 5.713543257836951e-06, "loss": 0.0055, "step": 5187 }, { "epoch": 3.156677821721935, "grad_norm": 0.25619250535964966, "learning_rate": 5.705624693798389e-06, "loss": 0.0063, "step": 5188 }, { "epoch": 3.15728627928202, "grad_norm": 0.21789373457431793, "learning_rate": 5.6977109139606605e-06, "loss": 0.0036, "step": 5189 }, { "epoch": 3.1578947368421053, "grad_norm": 0.2993996739387512, "learning_rate": 5.689801920286078e-06, "loss": 0.0066, "step": 5190 }, { "epoch": 3.1585031944021904, "grad_norm": 0.3557785749435425, "learning_rate": 5.681897714735726e-06, "loss": 0.0102, "step": 5191 }, { "epoch": 3.1591116519622755, "grad_norm": 0.21176862716674805, "learning_rate": 5.673998299269523e-06, "loss": 0.0035, "step": 5192 }, { "epoch": 3.159720109522361, "grad_norm": 0.23229624330997467, "learning_rate": 5.666103675846191e-06, "loss": 0.0055, "step": 5193 }, { "epoch": 3.160328567082446, "grad_norm": 0.1171480268239975, "learning_rate": 5.65821384642328e-06, "loss": 0.0017, "step": 5194 }, { "epoch": 3.1609370246425312, "grad_norm": 0.20375606417655945, "learning_rate": 5.650328812957115e-06, "loss": 0.0049, "step": 5195 }, { "epoch": 3.1615454822026163, "grad_norm": 0.3008398711681366, "learning_rate": 5.642448577402887e-06, "loss": 0.0067, "step": 5196 }, { "epoch": 3.1621539397627014, "grad_norm": 0.3283615708351135, "learning_rate": 5.6345731417145385e-06, "loss": 0.005, "step": 5197 }, { "epoch": 3.162762397322787, "grad_norm": 0.2205916792154312, "learning_rate": 5.6267025078448634e-06, "loss": 0.0053, "step": 5198 }, { "epoch": 3.163370854882872, "grad_norm": 0.23091734945774078, "learning_rate": 5.618836677745445e-06, "loss": 0.0067, "step": 5199 }, { "epoch": 3.163979312442957, "grad_norm": 0.3182157278060913, "learning_rate": 5.610975653366693e-06, "loss": 0.0087, "step": 5200 }, { "epoch": 3.1645877700030423, "grad_norm": 0.3846273422241211, "learning_rate": 5.603119436657794e-06, "loss": 0.0171, "step": 5201 }, { "epoch": 3.1651962275631274, "grad_norm": 0.26661422848701477, "learning_rate": 5.595268029566777e-06, "loss": 0.0062, "step": 5202 }, { "epoch": 3.165804685123213, "grad_norm": 0.4423938989639282, "learning_rate": 5.587421434040457e-06, "loss": 0.0079, "step": 5203 }, { "epoch": 3.166413142683298, "grad_norm": 0.3375749886035919, "learning_rate": 5.579579652024467e-06, "loss": 0.0085, "step": 5204 }, { "epoch": 3.167021600243383, "grad_norm": 0.2887289822101593, "learning_rate": 5.571742685463238e-06, "loss": 0.0104, "step": 5205 }, { "epoch": 3.167630057803468, "grad_norm": 0.23182101547718048, "learning_rate": 5.563910536300021e-06, "loss": 0.005, "step": 5206 }, { "epoch": 3.1682385153635533, "grad_norm": 0.252333402633667, "learning_rate": 5.55608320647685e-06, "loss": 0.0059, "step": 5207 }, { "epoch": 3.1688469729236384, "grad_norm": 0.22592368721961975, "learning_rate": 5.548260697934579e-06, "loss": 0.0045, "step": 5208 }, { "epoch": 3.169455430483724, "grad_norm": 0.2711806893348694, "learning_rate": 5.540443012612867e-06, "loss": 0.0061, "step": 5209 }, { "epoch": 3.170063888043809, "grad_norm": 0.23822571337223053, "learning_rate": 5.532630152450175e-06, "loss": 0.0053, "step": 5210 }, { "epoch": 3.170672345603894, "grad_norm": 0.3533017039299011, "learning_rate": 5.5248221193837715e-06, "loss": 0.0102, "step": 5211 }, { "epoch": 3.171280803163979, "grad_norm": 0.2815355360507965, "learning_rate": 5.5170189153497085e-06, "loss": 0.0073, "step": 5212 }, { "epoch": 3.1718892607240643, "grad_norm": 0.3320218622684479, "learning_rate": 5.509220542282864e-06, "loss": 0.0076, "step": 5213 }, { "epoch": 3.17249771828415, "grad_norm": 0.2814382314682007, "learning_rate": 5.501427002116913e-06, "loss": 0.0074, "step": 5214 }, { "epoch": 3.173106175844235, "grad_norm": 0.29092541337013245, "learning_rate": 5.4936382967843206e-06, "loss": 0.0057, "step": 5215 }, { "epoch": 3.17371463340432, "grad_norm": 0.1667691469192505, "learning_rate": 5.485854428216375e-06, "loss": 0.0039, "step": 5216 }, { "epoch": 3.174323090964405, "grad_norm": 0.2939755618572235, "learning_rate": 5.478075398343133e-06, "loss": 0.0082, "step": 5217 }, { "epoch": 3.17493154852449, "grad_norm": 0.21566243469715118, "learning_rate": 5.470301209093478e-06, "loss": 0.0053, "step": 5218 }, { "epoch": 3.1755400060845758, "grad_norm": 0.26064324378967285, "learning_rate": 5.462531862395087e-06, "loss": 0.0046, "step": 5219 }, { "epoch": 3.176148463644661, "grad_norm": 0.22674903273582458, "learning_rate": 5.454767360174431e-06, "loss": 0.0043, "step": 5220 }, { "epoch": 3.176756921204746, "grad_norm": 0.2666117548942566, "learning_rate": 5.447007704356791e-06, "loss": 0.0051, "step": 5221 }, { "epoch": 3.177365378764831, "grad_norm": 0.27889886498451233, "learning_rate": 5.439252896866226e-06, "loss": 0.0053, "step": 5222 }, { "epoch": 3.177973836324916, "grad_norm": 0.273966908454895, "learning_rate": 5.431502939625608e-06, "loss": 0.0053, "step": 5223 }, { "epoch": 3.1785822938850017, "grad_norm": 0.29631203413009644, "learning_rate": 5.423757834556606e-06, "loss": 0.0079, "step": 5224 }, { "epoch": 3.179190751445087, "grad_norm": 0.3811946213245392, "learning_rate": 5.416017583579686e-06, "loss": 0.0081, "step": 5225 }, { "epoch": 3.179799209005172, "grad_norm": 0.285285621881485, "learning_rate": 5.408282188614103e-06, "loss": 0.0083, "step": 5226 }, { "epoch": 3.180407666565257, "grad_norm": 0.2812410593032837, "learning_rate": 5.40055165157792e-06, "loss": 0.0051, "step": 5227 }, { "epoch": 3.181016124125342, "grad_norm": 0.3686572015285492, "learning_rate": 5.392825974387977e-06, "loss": 0.0078, "step": 5228 }, { "epoch": 3.1816245816854276, "grad_norm": 0.34885480999946594, "learning_rate": 5.385105158959924e-06, "loss": 0.0081, "step": 5229 }, { "epoch": 3.1822330392455127, "grad_norm": 0.2072272151708603, "learning_rate": 5.377389207208203e-06, "loss": 0.0027, "step": 5230 }, { "epoch": 3.182841496805598, "grad_norm": 0.22844795882701874, "learning_rate": 5.369678121046054e-06, "loss": 0.0068, "step": 5231 }, { "epoch": 3.183449954365683, "grad_norm": 0.3148641884326935, "learning_rate": 5.361971902385485e-06, "loss": 0.0111, "step": 5232 }, { "epoch": 3.184058411925768, "grad_norm": 0.2549585700035095, "learning_rate": 5.354270553137347e-06, "loss": 0.008, "step": 5233 }, { "epoch": 3.1846668694858535, "grad_norm": 0.26395851373672485, "learning_rate": 5.346574075211228e-06, "loss": 0.0077, "step": 5234 }, { "epoch": 3.1852753270459386, "grad_norm": 0.24896180629730225, "learning_rate": 5.338882470515544e-06, "loss": 0.0076, "step": 5235 }, { "epoch": 3.1858837846060237, "grad_norm": 0.240103617310524, "learning_rate": 5.331195740957493e-06, "loss": 0.0065, "step": 5236 }, { "epoch": 3.186492242166109, "grad_norm": 0.1803087294101715, "learning_rate": 5.3235138884430655e-06, "loss": 0.0052, "step": 5237 }, { "epoch": 3.187100699726194, "grad_norm": 0.25235310196876526, "learning_rate": 5.315836914877026e-06, "loss": 0.0046, "step": 5238 }, { "epoch": 3.1877091572862795, "grad_norm": 0.2217434048652649, "learning_rate": 5.30816482216297e-06, "loss": 0.0073, "step": 5239 }, { "epoch": 3.1883176148463646, "grad_norm": 0.1752319633960724, "learning_rate": 5.300497612203231e-06, "loss": 0.0029, "step": 5240 }, { "epoch": 3.1889260724064497, "grad_norm": 0.2559622824192047, "learning_rate": 5.292835286898973e-06, "loss": 0.006, "step": 5241 }, { "epoch": 3.1895345299665347, "grad_norm": 0.30886584520339966, "learning_rate": 5.285177848150127e-06, "loss": 0.0065, "step": 5242 }, { "epoch": 3.19014298752662, "grad_norm": 0.32938501238822937, "learning_rate": 5.277525297855427e-06, "loss": 0.0087, "step": 5243 }, { "epoch": 3.1907514450867054, "grad_norm": 0.30601125955581665, "learning_rate": 5.269877637912374e-06, "loss": 0.0065, "step": 5244 }, { "epoch": 3.1913599026467905, "grad_norm": 0.1772543340921402, "learning_rate": 5.262234870217272e-06, "loss": 0.0038, "step": 5245 }, { "epoch": 3.1919683602068756, "grad_norm": 0.31185612082481384, "learning_rate": 5.254596996665215e-06, "loss": 0.0116, "step": 5246 }, { "epoch": 3.1925768177669607, "grad_norm": 0.2804357409477234, "learning_rate": 5.246964019150077e-06, "loss": 0.0092, "step": 5247 }, { "epoch": 3.1931852753270458, "grad_norm": 0.31903064250946045, "learning_rate": 5.239335939564505e-06, "loss": 0.0092, "step": 5248 }, { "epoch": 3.1937937328871313, "grad_norm": 0.27632802724838257, "learning_rate": 5.231712759799967e-06, "loss": 0.0074, "step": 5249 }, { "epoch": 3.1944021904472164, "grad_norm": 0.15155087411403656, "learning_rate": 5.224094481746675e-06, "loss": 0.0031, "step": 5250 }, { "epoch": 3.1950106480073015, "grad_norm": 0.322252094745636, "learning_rate": 5.216481107293653e-06, "loss": 0.0045, "step": 5251 }, { "epoch": 3.1956191055673866, "grad_norm": 0.12668286263942719, "learning_rate": 5.2088726383286965e-06, "loss": 0.0028, "step": 5252 }, { "epoch": 3.1962275631274717, "grad_norm": 0.3014906048774719, "learning_rate": 5.2012690767383964e-06, "loss": 0.008, "step": 5253 }, { "epoch": 3.1968360206875572, "grad_norm": 0.21465936303138733, "learning_rate": 5.193670424408109e-06, "loss": 0.0053, "step": 5254 }, { "epoch": 3.1974444782476423, "grad_norm": 0.24435143172740936, "learning_rate": 5.186076683221988e-06, "loss": 0.0062, "step": 5255 }, { "epoch": 3.1980529358077274, "grad_norm": 0.28407108783721924, "learning_rate": 5.1784878550629635e-06, "loss": 0.0073, "step": 5256 }, { "epoch": 3.1986613933678125, "grad_norm": 0.3360306918621063, "learning_rate": 5.1709039418127575e-06, "loss": 0.0102, "step": 5257 }, { "epoch": 3.1992698509278976, "grad_norm": 0.15760943293571472, "learning_rate": 5.163324945351841e-06, "loss": 0.0029, "step": 5258 }, { "epoch": 3.199878308487983, "grad_norm": 0.31353893876075745, "learning_rate": 5.15575086755952e-06, "loss": 0.0061, "step": 5259 }, { "epoch": 3.2004867660480683, "grad_norm": 0.2500174641609192, "learning_rate": 5.148181710313827e-06, "loss": 0.0055, "step": 5260 }, { "epoch": 3.2010952236081534, "grad_norm": 0.1809082180261612, "learning_rate": 5.140617475491605e-06, "loss": 0.0049, "step": 5261 }, { "epoch": 3.2017036811682384, "grad_norm": 0.1834014654159546, "learning_rate": 5.1330581649684715e-06, "loss": 0.0044, "step": 5262 }, { "epoch": 3.2023121387283235, "grad_norm": 0.3861825466156006, "learning_rate": 5.125503780618824e-06, "loss": 0.01, "step": 5263 }, { "epoch": 3.202920596288409, "grad_norm": 0.33081695437431335, "learning_rate": 5.117954324315813e-06, "loss": 0.0065, "step": 5264 }, { "epoch": 3.203529053848494, "grad_norm": 0.1698094755411148, "learning_rate": 5.11040979793142e-06, "loss": 0.0031, "step": 5265 }, { "epoch": 3.2041375114085793, "grad_norm": 0.2472512573003769, "learning_rate": 5.102870203336352e-06, "loss": 0.0062, "step": 5266 }, { "epoch": 3.2047459689686644, "grad_norm": 0.19104048609733582, "learning_rate": 5.095335542400129e-06, "loss": 0.0045, "step": 5267 }, { "epoch": 3.2053544265287495, "grad_norm": 0.23127706348896027, "learning_rate": 5.087805816991006e-06, "loss": 0.0056, "step": 5268 }, { "epoch": 3.205962884088835, "grad_norm": 0.28482139110565186, "learning_rate": 5.080281028976078e-06, "loss": 0.0062, "step": 5269 }, { "epoch": 3.20657134164892, "grad_norm": 0.3393114507198334, "learning_rate": 5.07276118022115e-06, "loss": 0.0099, "step": 5270 }, { "epoch": 3.207179799209005, "grad_norm": 0.25069156289100647, "learning_rate": 5.06524627259084e-06, "loss": 0.0043, "step": 5271 }, { "epoch": 3.2077882567690903, "grad_norm": 0.28203946352005005, "learning_rate": 5.057736307948535e-06, "loss": 0.0068, "step": 5272 }, { "epoch": 3.2083967143291754, "grad_norm": 0.2758210301399231, "learning_rate": 5.050231288156398e-06, "loss": 0.0066, "step": 5273 }, { "epoch": 3.2090051718892605, "grad_norm": 0.2686399519443512, "learning_rate": 5.042731215075341e-06, "loss": 0.0049, "step": 5274 }, { "epoch": 3.209613629449346, "grad_norm": 0.25653237104415894, "learning_rate": 5.035236090565093e-06, "loss": 0.0062, "step": 5275 }, { "epoch": 3.210222087009431, "grad_norm": 0.21267692744731903, "learning_rate": 5.027745916484119e-06, "loss": 0.0056, "step": 5276 }, { "epoch": 3.210830544569516, "grad_norm": 0.3413097858428955, "learning_rate": 5.02026069468968e-06, "loss": 0.0055, "step": 5277 }, { "epoch": 3.2114390021296013, "grad_norm": 0.4562237560749054, "learning_rate": 5.012780427037775e-06, "loss": 0.0205, "step": 5278 }, { "epoch": 3.2120474596896864, "grad_norm": 0.28480085730552673, "learning_rate": 5.005305115383233e-06, "loss": 0.0078, "step": 5279 }, { "epoch": 3.212655917249772, "grad_norm": 0.24477575719356537, "learning_rate": 4.997834761579595e-06, "loss": 0.0043, "step": 5280 }, { "epoch": 3.213264374809857, "grad_norm": 0.27294448018074036, "learning_rate": 4.990369367479203e-06, "loss": 0.0061, "step": 5281 }, { "epoch": 3.213872832369942, "grad_norm": 0.21413196623325348, "learning_rate": 4.982908934933167e-06, "loss": 0.0047, "step": 5282 }, { "epoch": 3.2144812899300272, "grad_norm": 0.31827402114868164, "learning_rate": 4.975453465791366e-06, "loss": 0.0074, "step": 5283 }, { "epoch": 3.2150897474901123, "grad_norm": 0.27179375290870667, "learning_rate": 4.9680029619024295e-06, "loss": 0.0051, "step": 5284 }, { "epoch": 3.215698205050198, "grad_norm": 0.16175347566604614, "learning_rate": 4.9605574251137985e-06, "loss": 0.0041, "step": 5285 }, { "epoch": 3.216306662610283, "grad_norm": 0.3044746220111847, "learning_rate": 4.953116857271634e-06, "loss": 0.0106, "step": 5286 }, { "epoch": 3.216915120170368, "grad_norm": 0.263638436794281, "learning_rate": 4.945681260220891e-06, "loss": 0.0108, "step": 5287 }, { "epoch": 3.217523577730453, "grad_norm": 0.26071223616600037, "learning_rate": 4.9382506358052916e-06, "loss": 0.0084, "step": 5288 }, { "epoch": 3.2181320352905383, "grad_norm": 0.20165440440177917, "learning_rate": 4.930824985867328e-06, "loss": 0.0038, "step": 5289 }, { "epoch": 3.218740492850624, "grad_norm": 0.29097187519073486, "learning_rate": 4.923404312248234e-06, "loss": 0.0078, "step": 5290 }, { "epoch": 3.219348950410709, "grad_norm": 0.21615852415561676, "learning_rate": 4.915988616788039e-06, "loss": 0.0036, "step": 5291 }, { "epoch": 3.219957407970794, "grad_norm": 0.2644842565059662, "learning_rate": 4.9085779013255225e-06, "loss": 0.0066, "step": 5292 }, { "epoch": 3.220565865530879, "grad_norm": 0.330758273601532, "learning_rate": 4.901172167698242e-06, "loss": 0.0079, "step": 5293 }, { "epoch": 3.221174323090964, "grad_norm": 0.2616957128047943, "learning_rate": 4.89377141774249e-06, "loss": 0.0041, "step": 5294 }, { "epoch": 3.2217827806510497, "grad_norm": 0.38702672719955444, "learning_rate": 4.886375653293371e-06, "loss": 0.0084, "step": 5295 }, { "epoch": 3.222391238211135, "grad_norm": 0.3087538778781891, "learning_rate": 4.878984876184706e-06, "loss": 0.0064, "step": 5296 }, { "epoch": 3.22299969577122, "grad_norm": 0.2493748515844345, "learning_rate": 4.871599088249107e-06, "loss": 0.0052, "step": 5297 }, { "epoch": 3.223608153331305, "grad_norm": 0.3203872740268707, "learning_rate": 4.86421829131794e-06, "loss": 0.0077, "step": 5298 }, { "epoch": 3.22421661089139, "grad_norm": 0.21354947984218597, "learning_rate": 4.856842487221344e-06, "loss": 0.0052, "step": 5299 }, { "epoch": 3.2248250684514757, "grad_norm": 0.41666483879089355, "learning_rate": 4.849471677788195e-06, "loss": 0.0138, "step": 5300 }, { "epoch": 3.2254335260115607, "grad_norm": 0.1607670783996582, "learning_rate": 4.842105864846155e-06, "loss": 0.0021, "step": 5301 }, { "epoch": 3.226041983571646, "grad_norm": 0.25365200638771057, "learning_rate": 4.83474505022164e-06, "loss": 0.008, "step": 5302 }, { "epoch": 3.226650441131731, "grad_norm": 0.23420965671539307, "learning_rate": 4.82738923573983e-06, "loss": 0.0039, "step": 5303 }, { "epoch": 3.227258898691816, "grad_norm": 0.24002118408679962, "learning_rate": 4.820038423224638e-06, "loss": 0.0039, "step": 5304 }, { "epoch": 3.2278673562519016, "grad_norm": 0.17119701206684113, "learning_rate": 4.812692614498787e-06, "loss": 0.0022, "step": 5305 }, { "epoch": 3.2284758138119867, "grad_norm": 0.16908611357212067, "learning_rate": 4.805351811383716e-06, "loss": 0.0036, "step": 5306 }, { "epoch": 3.2290842713720718, "grad_norm": 0.24062961339950562, "learning_rate": 4.798016015699638e-06, "loss": 0.0045, "step": 5307 }, { "epoch": 3.229692728932157, "grad_norm": 0.26717016100883484, "learning_rate": 4.790685229265529e-06, "loss": 0.0064, "step": 5308 }, { "epoch": 3.230301186492242, "grad_norm": 0.30753692984580994, "learning_rate": 4.783359453899125e-06, "loss": 0.0083, "step": 5309 }, { "epoch": 3.2309096440523275, "grad_norm": 0.2237798273563385, "learning_rate": 4.776038691416892e-06, "loss": 0.0078, "step": 5310 }, { "epoch": 3.2315181016124126, "grad_norm": 0.24201908707618713, "learning_rate": 4.768722943634099e-06, "loss": 0.0034, "step": 5311 }, { "epoch": 3.2321265591724977, "grad_norm": 0.14591459929943085, "learning_rate": 4.7614122123647295e-06, "loss": 0.002, "step": 5312 }, { "epoch": 3.232735016732583, "grad_norm": 0.1915680468082428, "learning_rate": 4.754106499421545e-06, "loss": 0.0033, "step": 5313 }, { "epoch": 3.233343474292668, "grad_norm": 0.24156653881072998, "learning_rate": 4.746805806616059e-06, "loss": 0.0066, "step": 5314 }, { "epoch": 3.2339519318527534, "grad_norm": 0.2683356702327728, "learning_rate": 4.739510135758546e-06, "loss": 0.0054, "step": 5315 }, { "epoch": 3.2345603894128385, "grad_norm": 0.29896220564842224, "learning_rate": 4.7322194886580105e-06, "loss": 0.0082, "step": 5316 }, { "epoch": 3.2351688469729236, "grad_norm": 0.2597740590572357, "learning_rate": 4.724933867122242e-06, "loss": 0.0076, "step": 5317 }, { "epoch": 3.2357773045330087, "grad_norm": 0.17904557287693024, "learning_rate": 4.717653272957767e-06, "loss": 0.0035, "step": 5318 }, { "epoch": 3.236385762093094, "grad_norm": 0.26095980405807495, "learning_rate": 4.710377707969876e-06, "loss": 0.0066, "step": 5319 }, { "epoch": 3.2369942196531793, "grad_norm": 0.14280088245868683, "learning_rate": 4.703107173962587e-06, "loss": 0.0027, "step": 5320 }, { "epoch": 3.2376026772132644, "grad_norm": 0.2902020812034607, "learning_rate": 4.695841672738718e-06, "loss": 0.0059, "step": 5321 }, { "epoch": 3.2382111347733495, "grad_norm": 0.34895089268684387, "learning_rate": 4.688581206099787e-06, "loss": 0.009, "step": 5322 }, { "epoch": 3.2388195923334346, "grad_norm": 0.21605385839939117, "learning_rate": 4.681325775846096e-06, "loss": 0.0052, "step": 5323 }, { "epoch": 3.2394280498935197, "grad_norm": 0.21634212136268616, "learning_rate": 4.674075383776689e-06, "loss": 0.0058, "step": 5324 }, { "epoch": 3.2400365074536053, "grad_norm": 0.25203219056129456, "learning_rate": 4.666830031689365e-06, "loss": 0.0048, "step": 5325 }, { "epoch": 3.2406449650136904, "grad_norm": 0.26320701837539673, "learning_rate": 4.659589721380661e-06, "loss": 0.0049, "step": 5326 }, { "epoch": 3.2412534225737755, "grad_norm": 0.268195241689682, "learning_rate": 4.652354454645874e-06, "loss": 0.0058, "step": 5327 }, { "epoch": 3.2418618801338606, "grad_norm": 0.2631436288356781, "learning_rate": 4.64512423327905e-06, "loss": 0.0049, "step": 5328 }, { "epoch": 3.2424703376939457, "grad_norm": 0.3505512773990631, "learning_rate": 4.637899059072984e-06, "loss": 0.0039, "step": 5329 }, { "epoch": 3.243078795254031, "grad_norm": 0.2612294554710388, "learning_rate": 4.630678933819218e-06, "loss": 0.0065, "step": 5330 }, { "epoch": 3.2436872528141163, "grad_norm": 0.2038453072309494, "learning_rate": 4.623463859308047e-06, "loss": 0.0045, "step": 5331 }, { "epoch": 3.2442957103742014, "grad_norm": 0.26115256547927856, "learning_rate": 4.616253837328497e-06, "loss": 0.0074, "step": 5332 }, { "epoch": 3.2449041679342865, "grad_norm": 0.31222108006477356, "learning_rate": 4.6090488696683585e-06, "loss": 0.008, "step": 5333 }, { "epoch": 3.2455126254943716, "grad_norm": 0.2225833237171173, "learning_rate": 4.601848958114164e-06, "loss": 0.0064, "step": 5334 }, { "epoch": 3.246121083054457, "grad_norm": 0.2462644726037979, "learning_rate": 4.5946541044511905e-06, "loss": 0.0049, "step": 5335 }, { "epoch": 3.246729540614542, "grad_norm": 0.27759093046188354, "learning_rate": 4.5874643104634685e-06, "loss": 0.0099, "step": 5336 }, { "epoch": 3.2473379981746273, "grad_norm": 0.24177397787570953, "learning_rate": 4.580279577933755e-06, "loss": 0.0059, "step": 5337 }, { "epoch": 3.2479464557347124, "grad_norm": 0.2759784162044525, "learning_rate": 4.573099908643572e-06, "loss": 0.0062, "step": 5338 }, { "epoch": 3.2485549132947975, "grad_norm": 0.400174617767334, "learning_rate": 4.565925304373176e-06, "loss": 0.0064, "step": 5339 }, { "epoch": 3.249163370854883, "grad_norm": 0.3525528609752655, "learning_rate": 4.558755766901568e-06, "loss": 0.0097, "step": 5340 }, { "epoch": 3.249771828414968, "grad_norm": 0.41769731044769287, "learning_rate": 4.551591298006497e-06, "loss": 0.0096, "step": 5341 }, { "epoch": 3.2503802859750532, "grad_norm": 0.2166384756565094, "learning_rate": 4.54443189946446e-06, "loss": 0.0031, "step": 5342 }, { "epoch": 3.2509887435351383, "grad_norm": 0.2720005214214325, "learning_rate": 4.537277573050674e-06, "loss": 0.0067, "step": 5343 }, { "epoch": 3.2515972010952234, "grad_norm": 0.2838521897792816, "learning_rate": 4.530128320539126e-06, "loss": 0.01, "step": 5344 }, { "epoch": 3.252205658655309, "grad_norm": 0.22926031053066254, "learning_rate": 4.522984143702524e-06, "loss": 0.0049, "step": 5345 }, { "epoch": 3.252814116215394, "grad_norm": 0.19992421567440033, "learning_rate": 4.515845044312331e-06, "loss": 0.0041, "step": 5346 }, { "epoch": 3.253422573775479, "grad_norm": 0.28706929087638855, "learning_rate": 4.508711024138746e-06, "loss": 0.0085, "step": 5347 }, { "epoch": 3.2540310313355643, "grad_norm": 0.1090521439909935, "learning_rate": 4.501582084950715e-06, "loss": 0.0021, "step": 5348 }, { "epoch": 3.2546394888956494, "grad_norm": 0.18653200566768646, "learning_rate": 4.494458228515902e-06, "loss": 0.0045, "step": 5349 }, { "epoch": 3.255247946455735, "grad_norm": 0.2562299370765686, "learning_rate": 4.487339456600736e-06, "loss": 0.0074, "step": 5350 }, { "epoch": 3.25585640401582, "grad_norm": 0.21422119438648224, "learning_rate": 4.480225770970378e-06, "loss": 0.0036, "step": 5351 }, { "epoch": 3.256464861575905, "grad_norm": 0.16707159578800201, "learning_rate": 4.4731171733887245e-06, "loss": 0.0038, "step": 5352 }, { "epoch": 3.25707331913599, "grad_norm": 0.27651339769363403, "learning_rate": 4.466013665618407e-06, "loss": 0.0049, "step": 5353 }, { "epoch": 3.2576817766960753, "grad_norm": 0.18352994322776794, "learning_rate": 4.458915249420798e-06, "loss": 0.0034, "step": 5354 }, { "epoch": 3.258290234256161, "grad_norm": 0.24244019389152527, "learning_rate": 4.451821926556016e-06, "loss": 0.006, "step": 5355 }, { "epoch": 3.258898691816246, "grad_norm": 0.2059207558631897, "learning_rate": 4.444733698782902e-06, "loss": 0.0051, "step": 5356 }, { "epoch": 3.259507149376331, "grad_norm": 0.32662034034729004, "learning_rate": 4.437650567859047e-06, "loss": 0.0062, "step": 5357 }, { "epoch": 3.260115606936416, "grad_norm": 0.2572704553604126, "learning_rate": 4.430572535540778e-06, "loss": 0.0055, "step": 5358 }, { "epoch": 3.260724064496501, "grad_norm": 0.15180638432502747, "learning_rate": 4.423499603583137e-06, "loss": 0.0031, "step": 5359 }, { "epoch": 3.2613325220565867, "grad_norm": 0.20388486981391907, "learning_rate": 4.416431773739924e-06, "loss": 0.004, "step": 5360 }, { "epoch": 3.261940979616672, "grad_norm": 0.25693532824516296, "learning_rate": 4.409369047763664e-06, "loss": 0.0053, "step": 5361 }, { "epoch": 3.262549437176757, "grad_norm": 0.311781644821167, "learning_rate": 4.402311427405628e-06, "loss": 0.0063, "step": 5362 }, { "epoch": 3.263157894736842, "grad_norm": 0.23537062108516693, "learning_rate": 4.39525891441579e-06, "loss": 0.0053, "step": 5363 }, { "epoch": 3.263766352296927, "grad_norm": 0.3553119897842407, "learning_rate": 4.388211510542906e-06, "loss": 0.0094, "step": 5364 }, { "epoch": 3.2643748098570127, "grad_norm": 0.20130811631679535, "learning_rate": 4.38116921753442e-06, "loss": 0.0049, "step": 5365 }, { "epoch": 3.2649832674170978, "grad_norm": 0.24931548535823822, "learning_rate": 4.374132037136533e-06, "loss": 0.004, "step": 5366 }, { "epoch": 3.265591724977183, "grad_norm": 0.29390352964401245, "learning_rate": 4.367099971094174e-06, "loss": 0.008, "step": 5367 }, { "epoch": 3.266200182537268, "grad_norm": 0.3790845274925232, "learning_rate": 4.360073021151004e-06, "loss": 0.0104, "step": 5368 }, { "epoch": 3.266808640097353, "grad_norm": 0.2828552722930908, "learning_rate": 4.353051189049398e-06, "loss": 0.0077, "step": 5369 }, { "epoch": 3.2674170976574386, "grad_norm": 0.13647259771823883, "learning_rate": 4.346034476530503e-06, "loss": 0.0035, "step": 5370 }, { "epoch": 3.2680255552175237, "grad_norm": 0.28553807735443115, "learning_rate": 4.339022885334154e-06, "loss": 0.0087, "step": 5371 }, { "epoch": 3.268634012777609, "grad_norm": 0.37478142976760864, "learning_rate": 4.332016417198942e-06, "loss": 0.0084, "step": 5372 }, { "epoch": 3.269242470337694, "grad_norm": 0.2366034984588623, "learning_rate": 4.32501507386216e-06, "loss": 0.0061, "step": 5373 }, { "epoch": 3.269850927897779, "grad_norm": 0.25971001386642456, "learning_rate": 4.318018857059878e-06, "loss": 0.0082, "step": 5374 }, { "epoch": 3.2704593854578645, "grad_norm": 0.24216106534004211, "learning_rate": 4.311027768526846e-06, "loss": 0.0056, "step": 5375 }, { "epoch": 3.2710678430179496, "grad_norm": 0.32058781385421753, "learning_rate": 4.3040418099965674e-06, "loss": 0.0085, "step": 5376 }, { "epoch": 3.2716763005780347, "grad_norm": 0.25957608222961426, "learning_rate": 4.297060983201273e-06, "loss": 0.005, "step": 5377 }, { "epoch": 3.27228475813812, "grad_norm": 0.2595154047012329, "learning_rate": 4.290085289871917e-06, "loss": 0.005, "step": 5378 }, { "epoch": 3.272893215698205, "grad_norm": 0.25397470593452454, "learning_rate": 4.283114731738166e-06, "loss": 0.0034, "step": 5379 }, { "epoch": 3.2735016732582904, "grad_norm": 0.2799084782600403, "learning_rate": 4.276149310528452e-06, "loss": 0.0068, "step": 5380 }, { "epoch": 3.2741101308183755, "grad_norm": 0.27302873134613037, "learning_rate": 4.269189027969889e-06, "loss": 0.0046, "step": 5381 }, { "epoch": 3.2747185883784606, "grad_norm": 0.27299267053604126, "learning_rate": 4.26223388578835e-06, "loss": 0.0083, "step": 5382 }, { "epoch": 3.2753270459385457, "grad_norm": 0.3280734717845917, "learning_rate": 4.255283885708403e-06, "loss": 0.0097, "step": 5383 }, { "epoch": 3.275935503498631, "grad_norm": 0.2109125405550003, "learning_rate": 4.2483390294533825e-06, "loss": 0.005, "step": 5384 }, { "epoch": 3.2765439610587164, "grad_norm": 0.3015410304069519, "learning_rate": 4.2413993187453035e-06, "loss": 0.005, "step": 5385 }, { "epoch": 3.2771524186188015, "grad_norm": 0.239542618393898, "learning_rate": 4.234464755304934e-06, "loss": 0.0064, "step": 5386 }, { "epoch": 3.2777608761788866, "grad_norm": 0.3143784999847412, "learning_rate": 4.227535340851754e-06, "loss": 0.0056, "step": 5387 }, { "epoch": 3.2783693337389717, "grad_norm": 0.3585420250892639, "learning_rate": 4.220611077103978e-06, "loss": 0.0104, "step": 5388 }, { "epoch": 3.2789777912990568, "grad_norm": 0.3050104081630707, "learning_rate": 4.213691965778516e-06, "loss": 0.0094, "step": 5389 }, { "epoch": 3.2795862488591423, "grad_norm": 0.22966350615024567, "learning_rate": 4.206778008591039e-06, "loss": 0.0047, "step": 5390 }, { "epoch": 3.2801947064192274, "grad_norm": 0.2953983247280121, "learning_rate": 4.199869207255907e-06, "loss": 0.0051, "step": 5391 }, { "epoch": 3.2808031639793125, "grad_norm": 0.22054730355739594, "learning_rate": 4.192965563486221e-06, "loss": 0.0047, "step": 5392 }, { "epoch": 3.2814116215393976, "grad_norm": 0.21199162304401398, "learning_rate": 4.186067078993794e-06, "loss": 0.0051, "step": 5393 }, { "epoch": 3.2820200790994827, "grad_norm": 0.184243842959404, "learning_rate": 4.179173755489171e-06, "loss": 0.0028, "step": 5394 }, { "epoch": 3.282628536659568, "grad_norm": 0.15772341191768646, "learning_rate": 4.172285594681594e-06, "loss": 0.0034, "step": 5395 }, { "epoch": 3.2832369942196533, "grad_norm": 0.2756187617778778, "learning_rate": 4.165402598279048e-06, "loss": 0.005, "step": 5396 }, { "epoch": 3.2838454517797384, "grad_norm": 0.2757441997528076, "learning_rate": 4.1585247679882315e-06, "loss": 0.0049, "step": 5397 }, { "epoch": 3.2844539093398235, "grad_norm": 0.2898223400115967, "learning_rate": 4.151652105514559e-06, "loss": 0.0065, "step": 5398 }, { "epoch": 3.2850623668999086, "grad_norm": 1.003989815711975, "learning_rate": 4.144784612562152e-06, "loss": 0.0069, "step": 5399 }, { "epoch": 3.285670824459994, "grad_norm": 0.230075865983963, "learning_rate": 4.1379222908338844e-06, "loss": 0.0046, "step": 5400 }, { "epoch": 3.2862792820200792, "grad_norm": 0.24226723611354828, "learning_rate": 4.13106514203131e-06, "loss": 0.0061, "step": 5401 }, { "epoch": 3.2868877395801643, "grad_norm": 0.2802073657512665, "learning_rate": 4.124213167854721e-06, "loss": 0.0073, "step": 5402 }, { "epoch": 3.2874961971402494, "grad_norm": 0.2725411355495453, "learning_rate": 4.1173663700031174e-06, "loss": 0.0034, "step": 5403 }, { "epoch": 3.2881046547003345, "grad_norm": 0.20064030587673187, "learning_rate": 4.11052475017423e-06, "loss": 0.0034, "step": 5404 }, { "epoch": 3.28871311226042, "grad_norm": 0.2537406086921692, "learning_rate": 4.103688310064485e-06, "loss": 0.0069, "step": 5405 }, { "epoch": 3.289321569820505, "grad_norm": 0.22151683270931244, "learning_rate": 4.096857051369035e-06, "loss": 0.006, "step": 5406 }, { "epoch": 3.2899300273805903, "grad_norm": 0.14316684007644653, "learning_rate": 4.090030975781753e-06, "loss": 0.003, "step": 5407 }, { "epoch": 3.2905384849406754, "grad_norm": 0.3060914874076843, "learning_rate": 4.08321008499522e-06, "loss": 0.0093, "step": 5408 }, { "epoch": 3.2911469425007605, "grad_norm": 0.23609232902526855, "learning_rate": 4.076394380700724e-06, "loss": 0.0054, "step": 5409 }, { "epoch": 3.291755400060846, "grad_norm": 0.34055647253990173, "learning_rate": 4.069583864588291e-06, "loss": 0.007, "step": 5410 }, { "epoch": 3.292363857620931, "grad_norm": 0.20988178253173828, "learning_rate": 4.062778538346634e-06, "loss": 0.0051, "step": 5411 }, { "epoch": 3.292972315181016, "grad_norm": 0.23799017071723938, "learning_rate": 4.055978403663191e-06, "loss": 0.0048, "step": 5412 }, { "epoch": 3.2935807727411013, "grad_norm": 0.28818562626838684, "learning_rate": 4.049183462224115e-06, "loss": 0.0076, "step": 5413 }, { "epoch": 3.2941892303011864, "grad_norm": 0.2099044919013977, "learning_rate": 4.042393715714274e-06, "loss": 0.0033, "step": 5414 }, { "epoch": 3.294797687861272, "grad_norm": 0.27117833495140076, "learning_rate": 4.0356091658172225e-06, "loss": 0.0066, "step": 5415 }, { "epoch": 3.295406145421357, "grad_norm": 0.3128974437713623, "learning_rate": 4.028829814215271e-06, "loss": 0.0082, "step": 5416 }, { "epoch": 3.296014602981442, "grad_norm": 0.20908896625041962, "learning_rate": 4.022055662589397e-06, "loss": 0.0045, "step": 5417 }, { "epoch": 3.296623060541527, "grad_norm": 0.2693098783493042, "learning_rate": 4.01528671261932e-06, "loss": 0.0045, "step": 5418 }, { "epoch": 3.2972315181016123, "grad_norm": 0.25489383935928345, "learning_rate": 4.008522965983444e-06, "loss": 0.0049, "step": 5419 }, { "epoch": 3.297839975661698, "grad_norm": 0.27955323457717896, "learning_rate": 4.001764424358914e-06, "loss": 0.0091, "step": 5420 }, { "epoch": 3.298448433221783, "grad_norm": 0.1617559790611267, "learning_rate": 3.995011089421552e-06, "loss": 0.0024, "step": 5421 }, { "epoch": 3.299056890781868, "grad_norm": 0.1988738626241684, "learning_rate": 3.9882629628459095e-06, "loss": 0.0048, "step": 5422 }, { "epoch": 3.299665348341953, "grad_norm": 0.32020819187164307, "learning_rate": 3.98152004630524e-06, "loss": 0.0103, "step": 5423 }, { "epoch": 3.3002738059020382, "grad_norm": 0.2752763628959656, "learning_rate": 3.974782341471508e-06, "loss": 0.0072, "step": 5424 }, { "epoch": 3.3008822634621233, "grad_norm": 0.177403524518013, "learning_rate": 3.968049850015374e-06, "loss": 0.0036, "step": 5425 }, { "epoch": 3.301490721022209, "grad_norm": 0.2594372630119324, "learning_rate": 3.96132257360623e-06, "loss": 0.0054, "step": 5426 }, { "epoch": 3.302099178582294, "grad_norm": 0.24807630479335785, "learning_rate": 3.954600513912149e-06, "loss": 0.0048, "step": 5427 }, { "epoch": 3.302707636142379, "grad_norm": 0.24896809458732605, "learning_rate": 3.9478836725999306e-06, "loss": 0.0062, "step": 5428 }, { "epoch": 3.303316093702464, "grad_norm": 0.28816282749176025, "learning_rate": 3.941172051335054e-06, "loss": 0.0076, "step": 5429 }, { "epoch": 3.3039245512625492, "grad_norm": 0.27258211374282837, "learning_rate": 3.934465651781746e-06, "loss": 0.0078, "step": 5430 }, { "epoch": 3.304533008822635, "grad_norm": 0.2567967176437378, "learning_rate": 3.927764475602893e-06, "loss": 0.0085, "step": 5431 }, { "epoch": 3.30514146638272, "grad_norm": 0.2778639793395996, "learning_rate": 3.9210685244601195e-06, "loss": 0.0084, "step": 5432 }, { "epoch": 3.305749923942805, "grad_norm": 0.2634139060974121, "learning_rate": 3.914377800013738e-06, "loss": 0.0034, "step": 5433 }, { "epoch": 3.30635838150289, "grad_norm": 0.3199003338813782, "learning_rate": 3.907692303922775e-06, "loss": 0.0084, "step": 5434 }, { "epoch": 3.306966839062975, "grad_norm": 0.22238844633102417, "learning_rate": 3.9010120378449365e-06, "loss": 0.004, "step": 5435 }, { "epoch": 3.3075752966230607, "grad_norm": 0.23715293407440186, "learning_rate": 3.894337003436679e-06, "loss": 0.0041, "step": 5436 }, { "epoch": 3.308183754183146, "grad_norm": 0.2940605580806732, "learning_rate": 3.88766720235311e-06, "loss": 0.007, "step": 5437 }, { "epoch": 3.308792211743231, "grad_norm": 0.3147020637989044, "learning_rate": 3.8810026362480684e-06, "loss": 0.0059, "step": 5438 }, { "epoch": 3.309400669303316, "grad_norm": 0.19217897951602936, "learning_rate": 3.8743433067740895e-06, "loss": 0.0067, "step": 5439 }, { "epoch": 3.310009126863401, "grad_norm": 0.36218491196632385, "learning_rate": 3.867689215582415e-06, "loss": 0.0147, "step": 5440 }, { "epoch": 3.3106175844234866, "grad_norm": 0.23334696888923645, "learning_rate": 3.861040364322974e-06, "loss": 0.0067, "step": 5441 }, { "epoch": 3.3112260419835717, "grad_norm": 0.3586207926273346, "learning_rate": 3.854396754644405e-06, "loss": 0.0062, "step": 5442 }, { "epoch": 3.311834499543657, "grad_norm": 0.3108196556568146, "learning_rate": 3.84775838819405e-06, "loss": 0.0105, "step": 5443 }, { "epoch": 3.312442957103742, "grad_norm": 0.21571455895900726, "learning_rate": 3.841125266617946e-06, "loss": 0.0057, "step": 5444 }, { "epoch": 3.313051414663827, "grad_norm": 0.2929924726486206, "learning_rate": 3.834497391560829e-06, "loss": 0.0084, "step": 5445 }, { "epoch": 3.3136598722239126, "grad_norm": 0.1458842009305954, "learning_rate": 3.827874764666145e-06, "loss": 0.0021, "step": 5446 }, { "epoch": 3.3142683297839977, "grad_norm": 0.25384876132011414, "learning_rate": 3.821257387576014e-06, "loss": 0.0074, "step": 5447 }, { "epoch": 3.3148767873440828, "grad_norm": 0.29817652702331543, "learning_rate": 3.8146452619312768e-06, "loss": 0.0078, "step": 5448 }, { "epoch": 3.315485244904168, "grad_norm": 0.2839571535587311, "learning_rate": 3.8080383893714684e-06, "loss": 0.006, "step": 5449 }, { "epoch": 3.316093702464253, "grad_norm": 0.3559333384037018, "learning_rate": 3.801436771534814e-06, "loss": 0.0114, "step": 5450 }, { "epoch": 3.3167021600243385, "grad_norm": 0.46538999676704407, "learning_rate": 3.7948404100582453e-06, "loss": 0.0078, "step": 5451 }, { "epoch": 3.3173106175844236, "grad_norm": 0.41122740507125854, "learning_rate": 3.788249306577374e-06, "loss": 0.0127, "step": 5452 }, { "epoch": 3.3179190751445087, "grad_norm": 0.22749930620193481, "learning_rate": 3.7816634627265285e-06, "loss": 0.0075, "step": 5453 }, { "epoch": 3.3185275327045938, "grad_norm": 0.257730633020401, "learning_rate": 3.7750828801387167e-06, "loss": 0.0062, "step": 5454 }, { "epoch": 3.319135990264679, "grad_norm": 0.27139922976493835, "learning_rate": 3.7685075604456553e-06, "loss": 0.0042, "step": 5455 }, { "epoch": 3.3197444478247644, "grad_norm": 0.23728352785110474, "learning_rate": 3.7619375052777518e-06, "loss": 0.0036, "step": 5456 }, { "epoch": 3.3203529053848495, "grad_norm": 0.35973915457725525, "learning_rate": 3.755372716264094e-06, "loss": 0.0105, "step": 5457 }, { "epoch": 3.3209613629449346, "grad_norm": 0.15002618730068207, "learning_rate": 3.748813195032483e-06, "loss": 0.0033, "step": 5458 }, { "epoch": 3.3215698205050197, "grad_norm": 0.14451484382152557, "learning_rate": 3.7422589432094064e-06, "loss": 0.0016, "step": 5459 }, { "epoch": 3.322178278065105, "grad_norm": 0.19004569947719574, "learning_rate": 3.7357099624200475e-06, "loss": 0.0041, "step": 5460 }, { "epoch": 3.3227867356251903, "grad_norm": 0.27472901344299316, "learning_rate": 3.729166254288277e-06, "loss": 0.006, "step": 5461 }, { "epoch": 3.3233951931852754, "grad_norm": 0.19888129830360413, "learning_rate": 3.7226278204366695e-06, "loss": 0.0047, "step": 5462 }, { "epoch": 3.3240036507453605, "grad_norm": 0.267270565032959, "learning_rate": 3.7160946624864716e-06, "loss": 0.0077, "step": 5463 }, { "epoch": 3.3246121083054456, "grad_norm": 0.24350838363170624, "learning_rate": 3.7095667820576415e-06, "loss": 0.0058, "step": 5464 }, { "epoch": 3.3252205658655307, "grad_norm": 0.28811752796173096, "learning_rate": 3.7030441807688193e-06, "loss": 0.007, "step": 5465 }, { "epoch": 3.325829023425616, "grad_norm": 0.1940048485994339, "learning_rate": 3.69652686023734e-06, "loss": 0.0038, "step": 5466 }, { "epoch": 3.3264374809857014, "grad_norm": 0.2174157351255417, "learning_rate": 3.6900148220792323e-06, "loss": 0.0034, "step": 5467 }, { "epoch": 3.3270459385457865, "grad_norm": 0.18492846190929413, "learning_rate": 3.6835080679091998e-06, "loss": 0.0027, "step": 5468 }, { "epoch": 3.3276543961058715, "grad_norm": 0.30942097306251526, "learning_rate": 3.6770065993406517e-06, "loss": 0.0077, "step": 5469 }, { "epoch": 3.3282628536659566, "grad_norm": 0.3265366852283478, "learning_rate": 3.6705104179856785e-06, "loss": 0.0071, "step": 5470 }, { "epoch": 3.3288713112260417, "grad_norm": 0.2290954440832138, "learning_rate": 3.6640195254550676e-06, "loss": 0.0061, "step": 5471 }, { "epoch": 3.3294797687861273, "grad_norm": 0.21861323714256287, "learning_rate": 3.657533923358286e-06, "loss": 0.0048, "step": 5472 }, { "epoch": 3.3300882263462124, "grad_norm": 0.25013089179992676, "learning_rate": 3.6510536133034985e-06, "loss": 0.0059, "step": 5473 }, { "epoch": 3.3306966839062975, "grad_norm": 0.26969480514526367, "learning_rate": 3.644578596897541e-06, "loss": 0.0078, "step": 5474 }, { "epoch": 3.3313051414663826, "grad_norm": 0.23772546648979187, "learning_rate": 3.638108875745955e-06, "loss": 0.0055, "step": 5475 }, { "epoch": 3.3319135990264677, "grad_norm": 0.245829239487648, "learning_rate": 3.631644451452959e-06, "loss": 0.007, "step": 5476 }, { "epoch": 3.332522056586553, "grad_norm": 0.23014789819717407, "learning_rate": 3.625185325621469e-06, "loss": 0.0052, "step": 5477 }, { "epoch": 3.3331305141466383, "grad_norm": 0.2805229425430298, "learning_rate": 3.618731499853059e-06, "loss": 0.0076, "step": 5478 }, { "epoch": 3.3337389717067234, "grad_norm": 0.23805426061153412, "learning_rate": 3.6122829757480354e-06, "loss": 0.0048, "step": 5479 }, { "epoch": 3.3343474292668085, "grad_norm": 0.3036494553089142, "learning_rate": 3.605839754905341e-06, "loss": 0.0084, "step": 5480 }, { "epoch": 3.3349558868268936, "grad_norm": 0.2554338872432709, "learning_rate": 3.5994018389226365e-06, "loss": 0.0045, "step": 5481 }, { "epoch": 3.335564344386979, "grad_norm": 0.26599669456481934, "learning_rate": 3.5929692293962562e-06, "loss": 0.0069, "step": 5482 }, { "epoch": 3.3361728019470642, "grad_norm": 0.1806708723306656, "learning_rate": 3.586541927921222e-06, "loss": 0.003, "step": 5483 }, { "epoch": 3.3367812595071493, "grad_norm": 0.21414224803447723, "learning_rate": 3.5801199360912226e-06, "loss": 0.0046, "step": 5484 }, { "epoch": 3.3373897170672344, "grad_norm": 0.23333413898944855, "learning_rate": 3.573703255498667e-06, "loss": 0.0036, "step": 5485 }, { "epoch": 3.3379981746273195, "grad_norm": 0.19560351967811584, "learning_rate": 3.567291887734603e-06, "loss": 0.0052, "step": 5486 }, { "epoch": 3.338606632187405, "grad_norm": 0.48690447211265564, "learning_rate": 3.5608858343887997e-06, "loss": 0.0215, "step": 5487 }, { "epoch": 3.33921508974749, "grad_norm": 0.27765342593193054, "learning_rate": 3.55448509704967e-06, "loss": 0.0068, "step": 5488 }, { "epoch": 3.3398235473075752, "grad_norm": 0.2087118774652481, "learning_rate": 3.548089677304356e-06, "loss": 0.0043, "step": 5489 }, { "epoch": 3.3404320048676603, "grad_norm": 0.3580057919025421, "learning_rate": 3.5416995767386357e-06, "loss": 0.0066, "step": 5490 }, { "epoch": 3.3410404624277454, "grad_norm": 0.2551713287830353, "learning_rate": 3.5353147969369948e-06, "loss": 0.0061, "step": 5491 }, { "epoch": 3.341648919987831, "grad_norm": 0.26948314905166626, "learning_rate": 3.5289353394825947e-06, "loss": 0.0082, "step": 5492 }, { "epoch": 3.342257377547916, "grad_norm": 0.438873827457428, "learning_rate": 3.5225612059572727e-06, "loss": 0.0112, "step": 5493 }, { "epoch": 3.342865835108001, "grad_norm": 0.18292073905467987, "learning_rate": 3.5161923979415395e-06, "loss": 0.0039, "step": 5494 }, { "epoch": 3.3434742926680863, "grad_norm": 0.2990265488624573, "learning_rate": 3.509828917014615e-06, "loss": 0.0097, "step": 5495 }, { "epoch": 3.3440827502281714, "grad_norm": 0.23416587710380554, "learning_rate": 3.5034707647543576e-06, "loss": 0.0061, "step": 5496 }, { "epoch": 3.344691207788257, "grad_norm": 0.28342893719673157, "learning_rate": 3.4971179427373295e-06, "loss": 0.0072, "step": 5497 }, { "epoch": 3.345299665348342, "grad_norm": 0.22702716290950775, "learning_rate": 3.490770452538769e-06, "loss": 0.0053, "step": 5498 }, { "epoch": 3.345908122908427, "grad_norm": 0.24197997152805328, "learning_rate": 3.484428295732592e-06, "loss": 0.0071, "step": 5499 }, { "epoch": 3.346516580468512, "grad_norm": 0.16110782325267792, "learning_rate": 3.4780914738913816e-06, "loss": 0.0035, "step": 5500 }, { "epoch": 3.3471250380285973, "grad_norm": 0.3154269754886627, "learning_rate": 3.471759988586404e-06, "loss": 0.0057, "step": 5501 }, { "epoch": 3.347733495588683, "grad_norm": 0.24822941422462463, "learning_rate": 3.465433841387611e-06, "loss": 0.0036, "step": 5502 }, { "epoch": 3.348341953148768, "grad_norm": 0.26176613569259644, "learning_rate": 3.4591130338636257e-06, "loss": 0.0049, "step": 5503 }, { "epoch": 3.348950410708853, "grad_norm": 0.31120842695236206, "learning_rate": 3.4527975675817282e-06, "loss": 0.0095, "step": 5504 }, { "epoch": 3.349558868268938, "grad_norm": 0.20220082998275757, "learning_rate": 3.4464874441079126e-06, "loss": 0.005, "step": 5505 }, { "epoch": 3.350167325829023, "grad_norm": 0.29147812724113464, "learning_rate": 3.440182665006811e-06, "loss": 0.0047, "step": 5506 }, { "epoch": 3.3507757833891088, "grad_norm": 0.3253108561038971, "learning_rate": 3.43388323184175e-06, "loss": 0.01, "step": 5507 }, { "epoch": 3.351384240949194, "grad_norm": 0.3161683976650238, "learning_rate": 3.4275891461747283e-06, "loss": 0.007, "step": 5508 }, { "epoch": 3.351992698509279, "grad_norm": 0.3239922821521759, "learning_rate": 3.421300409566422e-06, "loss": 0.0054, "step": 5509 }, { "epoch": 3.352601156069364, "grad_norm": 0.30600473284721375, "learning_rate": 3.415017023576164e-06, "loss": 0.0044, "step": 5510 }, { "epoch": 3.353209613629449, "grad_norm": 0.2611445486545563, "learning_rate": 3.408738989761978e-06, "loss": 0.0062, "step": 5511 }, { "epoch": 3.3538180711895347, "grad_norm": 0.1755179911851883, "learning_rate": 3.4024663096805547e-06, "loss": 0.0036, "step": 5512 }, { "epoch": 3.3544265287496198, "grad_norm": 0.2571423649787903, "learning_rate": 3.396198984887261e-06, "loss": 0.0089, "step": 5513 }, { "epoch": 3.355034986309705, "grad_norm": 0.23700016736984253, "learning_rate": 3.389937016936118e-06, "loss": 0.0037, "step": 5514 }, { "epoch": 3.35564344386979, "grad_norm": 0.31217527389526367, "learning_rate": 3.383680407379855e-06, "loss": 0.0048, "step": 5515 }, { "epoch": 3.356251901429875, "grad_norm": 0.2647400200366974, "learning_rate": 3.377429157769832e-06, "loss": 0.0099, "step": 5516 }, { "epoch": 3.3568603589899606, "grad_norm": 0.199229896068573, "learning_rate": 3.3711832696561056e-06, "loss": 0.0034, "step": 5517 }, { "epoch": 3.3574688165500457, "grad_norm": 0.24624145030975342, "learning_rate": 3.3649427445873934e-06, "loss": 0.0058, "step": 5518 }, { "epoch": 3.358077274110131, "grad_norm": 0.21135321259498596, "learning_rate": 3.3587075841110927e-06, "loss": 0.0047, "step": 5519 }, { "epoch": 3.358685731670216, "grad_norm": 0.31276053190231323, "learning_rate": 3.3524777897732452e-06, "loss": 0.0068, "step": 5520 }, { "epoch": 3.359294189230301, "grad_norm": 0.20705646276474, "learning_rate": 3.3462533631186066e-06, "loss": 0.0041, "step": 5521 }, { "epoch": 3.3599026467903865, "grad_norm": 0.373995840549469, "learning_rate": 3.340034305690554e-06, "loss": 0.0137, "step": 5522 }, { "epoch": 3.3605111043504716, "grad_norm": 0.1533001810312271, "learning_rate": 3.3338206190311667e-06, "loss": 0.0022, "step": 5523 }, { "epoch": 3.3611195619105567, "grad_norm": 0.3650040328502655, "learning_rate": 3.327612304681166e-06, "loss": 0.0084, "step": 5524 }, { "epoch": 3.361728019470642, "grad_norm": 0.2500038146972656, "learning_rate": 3.321409364179975e-06, "loss": 0.0052, "step": 5525 }, { "epoch": 3.362336477030727, "grad_norm": 0.23246996104717255, "learning_rate": 3.3152117990656456e-06, "loss": 0.0054, "step": 5526 }, { "epoch": 3.3629449345908125, "grad_norm": 0.21729208528995514, "learning_rate": 3.309019610874925e-06, "loss": 0.0031, "step": 5527 }, { "epoch": 3.3635533921508975, "grad_norm": 0.29215723276138306, "learning_rate": 3.3028328011432157e-06, "loss": 0.0071, "step": 5528 }, { "epoch": 3.3641618497109826, "grad_norm": 0.21962468326091766, "learning_rate": 3.2966513714045967e-06, "loss": 0.0037, "step": 5529 }, { "epoch": 3.3647703072710677, "grad_norm": 0.24007774889469147, "learning_rate": 3.2904753231917857e-06, "loss": 0.0048, "step": 5530 }, { "epoch": 3.365378764831153, "grad_norm": 0.2580210864543915, "learning_rate": 3.284304658036208e-06, "loss": 0.0049, "step": 5531 }, { "epoch": 3.3659872223912384, "grad_norm": 0.1776011735200882, "learning_rate": 3.278139377467912e-06, "loss": 0.0027, "step": 5532 }, { "epoch": 3.3665956799513235, "grad_norm": 0.22608719766139984, "learning_rate": 3.271979483015647e-06, "loss": 0.0057, "step": 5533 }, { "epoch": 3.3672041375114086, "grad_norm": 0.29510682821273804, "learning_rate": 3.265824976206791e-06, "loss": 0.0056, "step": 5534 }, { "epoch": 3.3678125950714937, "grad_norm": 0.2816297113895416, "learning_rate": 3.2596758585674238e-06, "loss": 0.0058, "step": 5535 }, { "epoch": 3.3684210526315788, "grad_norm": 0.21519798040390015, "learning_rate": 3.253532131622261e-06, "loss": 0.0077, "step": 5536 }, { "epoch": 3.3690295101916643, "grad_norm": 0.1678488701581955, "learning_rate": 3.247393796894688e-06, "loss": 0.003, "step": 5537 }, { "epoch": 3.3696379677517494, "grad_norm": 0.26176977157592773, "learning_rate": 3.2412608559067638e-06, "loss": 0.0042, "step": 5538 }, { "epoch": 3.3702464253118345, "grad_norm": 0.2218218296766281, "learning_rate": 3.2351333101792005e-06, "loss": 0.0036, "step": 5539 }, { "epoch": 3.3708548828719196, "grad_norm": 0.2444075644016266, "learning_rate": 3.229011161231363e-06, "loss": 0.0051, "step": 5540 }, { "epoch": 3.3714633404320047, "grad_norm": 0.18922793865203857, "learning_rate": 3.2228944105813093e-06, "loss": 0.0036, "step": 5541 }, { "epoch": 3.3720717979920902, "grad_norm": 0.24314537644386292, "learning_rate": 3.2167830597457205e-06, "loss": 0.0046, "step": 5542 }, { "epoch": 3.3726802555521753, "grad_norm": 0.12830045819282532, "learning_rate": 3.2106771102399625e-06, "loss": 0.0019, "step": 5543 }, { "epoch": 3.3732887131122604, "grad_norm": 0.26987224817276, "learning_rate": 3.20457656357806e-06, "loss": 0.0062, "step": 5544 }, { "epoch": 3.3738971706723455, "grad_norm": 0.21220239996910095, "learning_rate": 3.198481421272698e-06, "loss": 0.0038, "step": 5545 }, { "epoch": 3.3745056282324306, "grad_norm": 0.2044731080532074, "learning_rate": 3.1923916848352025e-06, "loss": 0.0044, "step": 5546 }, { "epoch": 3.375114085792516, "grad_norm": 0.3579948842525482, "learning_rate": 3.186307355775586e-06, "loss": 0.0061, "step": 5547 }, { "epoch": 3.3757225433526012, "grad_norm": 0.6677601337432861, "learning_rate": 3.180228435602503e-06, "loss": 0.0061, "step": 5548 }, { "epoch": 3.3763310009126863, "grad_norm": 0.21300187706947327, "learning_rate": 3.1741549258232833e-06, "loss": 0.005, "step": 5549 }, { "epoch": 3.3769394584727714, "grad_norm": 0.24835187196731567, "learning_rate": 3.1680868279438835e-06, "loss": 0.0057, "step": 5550 }, { "epoch": 3.3775479160328565, "grad_norm": 0.4124807119369507, "learning_rate": 3.162024143468964e-06, "loss": 0.0088, "step": 5551 }, { "epoch": 3.378156373592942, "grad_norm": 0.2930738925933838, "learning_rate": 3.1559668739017974e-06, "loss": 0.0062, "step": 5552 }, { "epoch": 3.378764831153027, "grad_norm": 0.18829555809497833, "learning_rate": 3.149915020744343e-06, "loss": 0.0042, "step": 5553 }, { "epoch": 3.3793732887131123, "grad_norm": 0.24336494505405426, "learning_rate": 3.143868585497206e-06, "loss": 0.0039, "step": 5554 }, { "epoch": 3.3799817462731974, "grad_norm": 0.2603733241558075, "learning_rate": 3.1378275696596597e-06, "loss": 0.0061, "step": 5555 }, { "epoch": 3.3805902038332825, "grad_norm": 0.17050331830978394, "learning_rate": 3.131791974729609e-06, "loss": 0.0035, "step": 5556 }, { "epoch": 3.381198661393368, "grad_norm": 0.263378381729126, "learning_rate": 3.125761802203636e-06, "loss": 0.0063, "step": 5557 }, { "epoch": 3.381807118953453, "grad_norm": 0.3855268359184265, "learning_rate": 3.119737053576971e-06, "loss": 0.0063, "step": 5558 }, { "epoch": 3.382415576513538, "grad_norm": 0.17356686294078827, "learning_rate": 3.11371773034351e-06, "loss": 0.0025, "step": 5559 }, { "epoch": 3.3830240340736233, "grad_norm": 0.2114478200674057, "learning_rate": 3.107703833995776e-06, "loss": 0.0054, "step": 5560 }, { "epoch": 3.3836324916337084, "grad_norm": 0.31965020298957825, "learning_rate": 3.101695366024987e-06, "loss": 0.0067, "step": 5561 }, { "epoch": 3.384240949193794, "grad_norm": 0.2662268579006195, "learning_rate": 3.095692327920974e-06, "loss": 0.0069, "step": 5562 }, { "epoch": 3.384849406753879, "grad_norm": 0.204771026968956, "learning_rate": 3.0896947211722484e-06, "loss": 0.004, "step": 5563 }, { "epoch": 3.385457864313964, "grad_norm": 0.22940324246883392, "learning_rate": 3.083702547265965e-06, "loss": 0.0058, "step": 5564 }, { "epoch": 3.386066321874049, "grad_norm": 0.2345285266637802, "learning_rate": 3.077715807687939e-06, "loss": 0.0132, "step": 5565 }, { "epoch": 3.3866747794341343, "grad_norm": 1.2732864618301392, "learning_rate": 3.071734503922616e-06, "loss": 0.0262, "step": 5566 }, { "epoch": 3.38728323699422, "grad_norm": 0.24260850250720978, "learning_rate": 3.065758637453131e-06, "loss": 0.0033, "step": 5567 }, { "epoch": 3.387891694554305, "grad_norm": 0.28996798396110535, "learning_rate": 3.059788209761233e-06, "loss": 0.0037, "step": 5568 }, { "epoch": 3.38850015211439, "grad_norm": 0.2517380714416504, "learning_rate": 3.0538232223273482e-06, "loss": 0.0056, "step": 5569 }, { "epoch": 3.389108609674475, "grad_norm": 0.2352709025144577, "learning_rate": 3.047863676630541e-06, "loss": 0.0056, "step": 5570 }, { "epoch": 3.3897170672345602, "grad_norm": 0.24700261652469635, "learning_rate": 3.041909574148538e-06, "loss": 0.0038, "step": 5571 }, { "epoch": 3.3903255247946458, "grad_norm": 0.20761002600193024, "learning_rate": 3.035960916357697e-06, "loss": 0.0046, "step": 5572 }, { "epoch": 3.390933982354731, "grad_norm": 0.24791277945041656, "learning_rate": 3.030017704733043e-06, "loss": 0.0048, "step": 5573 }, { "epoch": 3.391542439914816, "grad_norm": 0.20745399594306946, "learning_rate": 3.0240799407482452e-06, "loss": 0.0044, "step": 5574 }, { "epoch": 3.392150897474901, "grad_norm": 0.14504899084568024, "learning_rate": 3.018147625875617e-06, "loss": 0.002, "step": 5575 }, { "epoch": 3.392759355034986, "grad_norm": 0.3044413626194, "learning_rate": 3.012220761586132e-06, "loss": 0.0064, "step": 5576 }, { "epoch": 3.3933678125950717, "grad_norm": 0.2477070391178131, "learning_rate": 3.006299349349406e-06, "loss": 0.0065, "step": 5577 }, { "epoch": 3.393976270155157, "grad_norm": 0.28918352723121643, "learning_rate": 3.000383390633696e-06, "loss": 0.0062, "step": 5578 }, { "epoch": 3.394584727715242, "grad_norm": 0.2834755480289459, "learning_rate": 2.9944728869059136e-06, "loss": 0.0063, "step": 5579 }, { "epoch": 3.395193185275327, "grad_norm": 0.2524814009666443, "learning_rate": 2.98856783963162e-06, "loss": 0.0038, "step": 5580 }, { "epoch": 3.395801642835412, "grad_norm": 0.2619141638278961, "learning_rate": 2.98266825027502e-06, "loss": 0.0057, "step": 5581 }, { "epoch": 3.3964101003954976, "grad_norm": 0.2823928892612457, "learning_rate": 2.9767741202989723e-06, "loss": 0.0061, "step": 5582 }, { "epoch": 3.3970185579555827, "grad_norm": 0.20528404414653778, "learning_rate": 2.970885451164965e-06, "loss": 0.0046, "step": 5583 }, { "epoch": 3.397627015515668, "grad_norm": 0.2534896433353424, "learning_rate": 2.9650022443331453e-06, "loss": 0.0045, "step": 5584 }, { "epoch": 3.398235473075753, "grad_norm": 0.26259493827819824, "learning_rate": 2.9591245012623058e-06, "loss": 0.0071, "step": 5585 }, { "epoch": 3.398843930635838, "grad_norm": 0.3380330204963684, "learning_rate": 2.9532522234098803e-06, "loss": 0.0089, "step": 5586 }, { "epoch": 3.3994523881959235, "grad_norm": 0.18706291913986206, "learning_rate": 2.947385412231951e-06, "loss": 0.0031, "step": 5587 }, { "epoch": 3.4000608457560086, "grad_norm": 0.2913658916950226, "learning_rate": 2.9415240691832463e-06, "loss": 0.0085, "step": 5588 }, { "epoch": 3.4006693033160937, "grad_norm": 0.3028022050857544, "learning_rate": 2.9356681957171227e-06, "loss": 0.0066, "step": 5589 }, { "epoch": 3.401277760876179, "grad_norm": 0.2161863148212433, "learning_rate": 2.9298177932856025e-06, "loss": 0.0045, "step": 5590 }, { "epoch": 3.401886218436264, "grad_norm": 0.25855207443237305, "learning_rate": 2.923972863339336e-06, "loss": 0.007, "step": 5591 }, { "epoch": 3.4024946759963495, "grad_norm": 0.2582375109195709, "learning_rate": 2.9181334073276334e-06, "loss": 0.0055, "step": 5592 }, { "epoch": 3.4031031335564346, "grad_norm": 0.24739745259284973, "learning_rate": 2.9122994266984226e-06, "loss": 0.0048, "step": 5593 }, { "epoch": 3.4037115911165197, "grad_norm": 0.2663595676422119, "learning_rate": 2.906470922898291e-06, "loss": 0.0044, "step": 5594 }, { "epoch": 3.4043200486766048, "grad_norm": 0.21414734423160553, "learning_rate": 2.900647897372469e-06, "loss": 0.0041, "step": 5595 }, { "epoch": 3.40492850623669, "grad_norm": 0.33169928193092346, "learning_rate": 2.894830351564823e-06, "loss": 0.0088, "step": 5596 }, { "epoch": 3.4055369637967754, "grad_norm": 0.3833865523338318, "learning_rate": 2.889018286917858e-06, "loss": 0.0072, "step": 5597 }, { "epoch": 3.4061454213568605, "grad_norm": 0.30657726526260376, "learning_rate": 2.8832117048727348e-06, "loss": 0.009, "step": 5598 }, { "epoch": 3.4067538789169456, "grad_norm": 0.1277196705341339, "learning_rate": 2.877410606869227e-06, "loss": 0.0017, "step": 5599 }, { "epoch": 3.4073623364770307, "grad_norm": 0.29806557297706604, "learning_rate": 2.8716149943457755e-06, "loss": 0.0072, "step": 5600 }, { "epoch": 3.407970794037116, "grad_norm": 0.24409450590610504, "learning_rate": 2.86582486873945e-06, "loss": 0.0052, "step": 5601 }, { "epoch": 3.4085792515972013, "grad_norm": 0.23363740742206573, "learning_rate": 2.8600402314859636e-06, "loss": 0.0072, "step": 5602 }, { "epoch": 3.4091877091572864, "grad_norm": 0.23178331553936005, "learning_rate": 2.8542610840196493e-06, "loss": 0.0055, "step": 5603 }, { "epoch": 3.4097961667173715, "grad_norm": 0.2069847732782364, "learning_rate": 2.848487427773519e-06, "loss": 0.0039, "step": 5604 }, { "epoch": 3.4104046242774566, "grad_norm": 0.17575199902057648, "learning_rate": 2.842719264179178e-06, "loss": 0.0028, "step": 5605 }, { "epoch": 3.4110130818375417, "grad_norm": 0.1965407133102417, "learning_rate": 2.836956594666901e-06, "loss": 0.0036, "step": 5606 }, { "epoch": 3.4116215393976272, "grad_norm": 0.25646457076072693, "learning_rate": 2.8311994206655867e-06, "loss": 0.0067, "step": 5607 }, { "epoch": 3.4122299969577123, "grad_norm": 0.2868242859840393, "learning_rate": 2.8254477436027797e-06, "loss": 0.0063, "step": 5608 }, { "epoch": 3.4128384545177974, "grad_norm": 0.277547150850296, "learning_rate": 2.8197015649046393e-06, "loss": 0.0067, "step": 5609 }, { "epoch": 3.4134469120778825, "grad_norm": 0.20469379425048828, "learning_rate": 2.813960885996003e-06, "loss": 0.0029, "step": 5610 }, { "epoch": 3.4140553696379676, "grad_norm": 0.39242902398109436, "learning_rate": 2.808225708300302e-06, "loss": 0.004, "step": 5611 }, { "epoch": 3.414663827198053, "grad_norm": 0.2715418338775635, "learning_rate": 2.8024960332396266e-06, "loss": 0.0084, "step": 5612 }, { "epoch": 3.4152722847581383, "grad_norm": 0.1898062378168106, "learning_rate": 2.7967718622346957e-06, "loss": 0.0042, "step": 5613 }, { "epoch": 3.4158807423182234, "grad_norm": 0.2265051305294037, "learning_rate": 2.7910531967048736e-06, "loss": 0.0029, "step": 5614 }, { "epoch": 3.4164891998783085, "grad_norm": 0.3068977892398834, "learning_rate": 2.7853400380681404e-06, "loss": 0.0095, "step": 5615 }, { "epoch": 3.4170976574383936, "grad_norm": 0.22495239973068237, "learning_rate": 2.779632387741121e-06, "loss": 0.0059, "step": 5616 }, { "epoch": 3.417706114998479, "grad_norm": 0.2805343568325043, "learning_rate": 2.7739302471390836e-06, "loss": 0.0073, "step": 5617 }, { "epoch": 3.418314572558564, "grad_norm": 0.2405698299407959, "learning_rate": 2.7682336176759195e-06, "loss": 0.0081, "step": 5618 }, { "epoch": 3.4189230301186493, "grad_norm": 0.3347167372703552, "learning_rate": 2.7625425007641425e-06, "loss": 0.0061, "step": 5619 }, { "epoch": 3.4195314876787344, "grad_norm": 0.25292569398880005, "learning_rate": 2.756856897814933e-06, "loss": 0.0041, "step": 5620 }, { "epoch": 3.4201399452388195, "grad_norm": 0.1868637651205063, "learning_rate": 2.7511768102380654e-06, "loss": 0.0028, "step": 5621 }, { "epoch": 3.420748402798905, "grad_norm": 0.175969198346138, "learning_rate": 2.7455022394419746e-06, "loss": 0.0043, "step": 5622 }, { "epoch": 3.42135686035899, "grad_norm": 0.31663379073143005, "learning_rate": 2.739833186833715e-06, "loss": 0.0037, "step": 5623 }, { "epoch": 3.421965317919075, "grad_norm": 0.3171955347061157, "learning_rate": 2.73416965381898e-06, "loss": 0.0114, "step": 5624 }, { "epoch": 3.4225737754791603, "grad_norm": 0.22550025582313538, "learning_rate": 2.728511641802076e-06, "loss": 0.0033, "step": 5625 }, { "epoch": 3.4231822330392454, "grad_norm": 0.16939371824264526, "learning_rate": 2.722859152185972e-06, "loss": 0.0042, "step": 5626 }, { "epoch": 3.423790690599331, "grad_norm": 0.3074353337287903, "learning_rate": 2.7172121863722366e-06, "loss": 0.0072, "step": 5627 }, { "epoch": 3.424399148159416, "grad_norm": 0.3095559775829315, "learning_rate": 2.711570745761094e-06, "loss": 0.0081, "step": 5628 }, { "epoch": 3.425007605719501, "grad_norm": 0.24687674641609192, "learning_rate": 2.7059348317513665e-06, "loss": 0.0073, "step": 5629 }, { "epoch": 3.4256160632795862, "grad_norm": 0.28120920062065125, "learning_rate": 2.700304445740551e-06, "loss": 0.0062, "step": 5630 }, { "epoch": 3.4262245208396713, "grad_norm": 0.18883898854255676, "learning_rate": 2.6946795891247266e-06, "loss": 0.0037, "step": 5631 }, { "epoch": 3.4268329783997564, "grad_norm": 0.21660275757312775, "learning_rate": 2.6890602632986373e-06, "loss": 0.0056, "step": 5632 }, { "epoch": 3.427441435959842, "grad_norm": 0.2389414757490158, "learning_rate": 2.6834464696556343e-06, "loss": 0.0043, "step": 5633 }, { "epoch": 3.428049893519927, "grad_norm": 0.23681369423866272, "learning_rate": 2.6778382095877104e-06, "loss": 0.0051, "step": 5634 }, { "epoch": 3.428658351080012, "grad_norm": 0.251058965921402, "learning_rate": 2.6722354844854693e-06, "loss": 0.0031, "step": 5635 }, { "epoch": 3.4292668086400973, "grad_norm": 0.27185845375061035, "learning_rate": 2.666638295738169e-06, "loss": 0.0058, "step": 5636 }, { "epoch": 3.4298752662001823, "grad_norm": 0.2628857493400574, "learning_rate": 2.6610466447336697e-06, "loss": 0.0045, "step": 5637 }, { "epoch": 3.430483723760268, "grad_norm": 0.17281438410282135, "learning_rate": 2.65546053285847e-06, "loss": 0.0036, "step": 5638 }, { "epoch": 3.431092181320353, "grad_norm": 0.16873584687709808, "learning_rate": 2.649879961497684e-06, "loss": 0.0028, "step": 5639 }, { "epoch": 3.431700638880438, "grad_norm": 0.23468784987926483, "learning_rate": 2.64430493203508e-06, "loss": 0.0062, "step": 5640 }, { "epoch": 3.432309096440523, "grad_norm": 0.20407533645629883, "learning_rate": 2.6387354458530134e-06, "loss": 0.0034, "step": 5641 }, { "epoch": 3.4329175540006083, "grad_norm": 0.17050844430923462, "learning_rate": 2.6331715043324935e-06, "loss": 0.0037, "step": 5642 }, { "epoch": 3.433526011560694, "grad_norm": 0.26065853238105774, "learning_rate": 2.627613108853147e-06, "loss": 0.004, "step": 5643 }, { "epoch": 3.434134469120779, "grad_norm": 0.31893685460090637, "learning_rate": 2.622060260793227e-06, "loss": 0.0089, "step": 5644 }, { "epoch": 3.434742926680864, "grad_norm": 0.16491414606571198, "learning_rate": 2.616512961529591e-06, "loss": 0.0041, "step": 5645 }, { "epoch": 3.435351384240949, "grad_norm": 0.30626043677330017, "learning_rate": 2.610971212437763e-06, "loss": 0.0042, "step": 5646 }, { "epoch": 3.435959841801034, "grad_norm": 0.3586437702178955, "learning_rate": 2.6054350148918493e-06, "loss": 0.0068, "step": 5647 }, { "epoch": 3.4365682993611197, "grad_norm": 0.2317257821559906, "learning_rate": 2.5999043702646e-06, "loss": 0.0054, "step": 5648 }, { "epoch": 3.437176756921205, "grad_norm": 0.2033744603395462, "learning_rate": 2.5943792799273838e-06, "loss": 0.0042, "step": 5649 }, { "epoch": 3.43778521448129, "grad_norm": 0.18324805796146393, "learning_rate": 2.5888597452501994e-06, "loss": 0.0049, "step": 5650 }, { "epoch": 3.438393672041375, "grad_norm": 0.2906474769115448, "learning_rate": 2.5833457676016526e-06, "loss": 0.0073, "step": 5651 }, { "epoch": 3.43900212960146, "grad_norm": 0.16437779366970062, "learning_rate": 2.5778373483489827e-06, "loss": 0.0042, "step": 5652 }, { "epoch": 3.4396105871615457, "grad_norm": 0.2783378064632416, "learning_rate": 2.572334488858047e-06, "loss": 0.009, "step": 5653 }, { "epoch": 3.4402190447216308, "grad_norm": 0.21060001850128174, "learning_rate": 2.566837190493332e-06, "loss": 0.0041, "step": 5654 }, { "epoch": 3.440827502281716, "grad_norm": 0.2840571403503418, "learning_rate": 2.561345454617925e-06, "loss": 0.0058, "step": 5655 }, { "epoch": 3.441435959841801, "grad_norm": 0.20874862372875214, "learning_rate": 2.5558592825935645e-06, "loss": 0.0034, "step": 5656 }, { "epoch": 3.442044417401886, "grad_norm": 0.237037792801857, "learning_rate": 2.5503786757805794e-06, "loss": 0.0045, "step": 5657 }, { "epoch": 3.4426528749619716, "grad_norm": 0.25760436058044434, "learning_rate": 2.5449036355379347e-06, "loss": 0.0053, "step": 5658 }, { "epoch": 3.4432613325220567, "grad_norm": 0.2317677140235901, "learning_rate": 2.539434163223217e-06, "loss": 0.0052, "step": 5659 }, { "epoch": 3.443869790082142, "grad_norm": 0.3090670704841614, "learning_rate": 2.533970260192628e-06, "loss": 0.0103, "step": 5660 }, { "epoch": 3.444478247642227, "grad_norm": 0.23244629800319672, "learning_rate": 2.528511927800978e-06, "loss": 0.0052, "step": 5661 }, { "epoch": 3.445086705202312, "grad_norm": 0.20796987414360046, "learning_rate": 2.5230591674017145e-06, "loss": 0.0032, "step": 5662 }, { "epoch": 3.4456951627623975, "grad_norm": 0.2811738848686218, "learning_rate": 2.51761198034689e-06, "loss": 0.007, "step": 5663 }, { "epoch": 3.4463036203224826, "grad_norm": 0.2699826955795288, "learning_rate": 2.5121703679871907e-06, "loss": 0.0047, "step": 5664 }, { "epoch": 3.4469120778825677, "grad_norm": 0.21873120963573456, "learning_rate": 2.5067343316718866e-06, "loss": 0.0041, "step": 5665 }, { "epoch": 3.447520535442653, "grad_norm": 0.21390405297279358, "learning_rate": 2.501303872748917e-06, "loss": 0.0045, "step": 5666 }, { "epoch": 3.448128993002738, "grad_norm": 0.18419447541236877, "learning_rate": 2.4958789925647873e-06, "loss": 0.0044, "step": 5667 }, { "epoch": 3.4487374505628234, "grad_norm": 0.19194208085536957, "learning_rate": 2.49045969246465e-06, "loss": 0.0049, "step": 5668 }, { "epoch": 3.4493459081229085, "grad_norm": 0.18368539214134216, "learning_rate": 2.485045973792266e-06, "loss": 0.0046, "step": 5669 }, { "epoch": 3.4499543656829936, "grad_norm": 0.33034926652908325, "learning_rate": 2.4796378378900142e-06, "loss": 0.0104, "step": 5670 }, { "epoch": 3.4505628232430787, "grad_norm": 0.23672623932361603, "learning_rate": 2.474235286098878e-06, "loss": 0.0038, "step": 5671 }, { "epoch": 3.451171280803164, "grad_norm": 0.23982059955596924, "learning_rate": 2.4688383197584762e-06, "loss": 0.0046, "step": 5672 }, { "epoch": 3.451779738363249, "grad_norm": 0.3084208369255066, "learning_rate": 2.4634469402070233e-06, "loss": 0.0091, "step": 5673 }, { "epoch": 3.4523881959233345, "grad_norm": 0.14161497354507446, "learning_rate": 2.458061148781363e-06, "loss": 0.0026, "step": 5674 }, { "epoch": 3.4529966534834196, "grad_norm": 0.3343401253223419, "learning_rate": 2.452680946816932e-06, "loss": 0.0078, "step": 5675 }, { "epoch": 3.4536051110435046, "grad_norm": 0.23120488226413727, "learning_rate": 2.4473063356478198e-06, "loss": 0.0054, "step": 5676 }, { "epoch": 3.4542135686035897, "grad_norm": 0.2343619167804718, "learning_rate": 2.4419373166066904e-06, "loss": 0.0056, "step": 5677 }, { "epoch": 3.454822026163675, "grad_norm": 0.37773486971855164, "learning_rate": 2.4365738910248375e-06, "loss": 0.0102, "step": 5678 }, { "epoch": 3.4554304837237604, "grad_norm": 0.19855354726314545, "learning_rate": 2.431216060232169e-06, "loss": 0.0056, "step": 5679 }, { "epoch": 3.4560389412838455, "grad_norm": 0.2364191859960556, "learning_rate": 2.425863825557212e-06, "loss": 0.0064, "step": 5680 }, { "epoch": 3.4566473988439306, "grad_norm": 0.4233652353286743, "learning_rate": 2.420517188327079e-06, "loss": 0.0106, "step": 5681 }, { "epoch": 3.4572558564040157, "grad_norm": 0.2906593084335327, "learning_rate": 2.4151761498675345e-06, "loss": 0.0053, "step": 5682 }, { "epoch": 3.4578643139641008, "grad_norm": 0.2784557044506073, "learning_rate": 2.409840711502917e-06, "loss": 0.0079, "step": 5683 }, { "epoch": 3.4584727715241863, "grad_norm": 0.2974449396133423, "learning_rate": 2.4045108745561985e-06, "loss": 0.0071, "step": 5684 }, { "epoch": 3.4590812290842714, "grad_norm": 0.3297038674354553, "learning_rate": 2.3991866403489577e-06, "loss": 0.0073, "step": 5685 }, { "epoch": 3.4596896866443565, "grad_norm": 0.2977392077445984, "learning_rate": 2.393868010201386e-06, "loss": 0.0072, "step": 5686 }, { "epoch": 3.4602981442044416, "grad_norm": 0.3237118721008301, "learning_rate": 2.388554985432273e-06, "loss": 0.005, "step": 5687 }, { "epoch": 3.4609066017645267, "grad_norm": 0.23049971461296082, "learning_rate": 2.3832475673590316e-06, "loss": 0.0063, "step": 5688 }, { "epoch": 3.4615150593246122, "grad_norm": 0.2660450339317322, "learning_rate": 2.377945757297681e-06, "loss": 0.0061, "step": 5689 }, { "epoch": 3.4621235168846973, "grad_norm": 0.2178175449371338, "learning_rate": 2.3726495565628506e-06, "loss": 0.0058, "step": 5690 }, { "epoch": 3.4627319744447824, "grad_norm": 0.47303879261016846, "learning_rate": 2.3673589664677727e-06, "loss": 0.0134, "step": 5691 }, { "epoch": 3.4633404320048675, "grad_norm": 0.2600957453250885, "learning_rate": 2.362073988324304e-06, "loss": 0.0049, "step": 5692 }, { "epoch": 3.4639488895649526, "grad_norm": 0.2990921139717102, "learning_rate": 2.3567946234428844e-06, "loss": 0.0068, "step": 5693 }, { "epoch": 3.464557347125038, "grad_norm": 0.19195446372032166, "learning_rate": 2.3515208731325815e-06, "loss": 0.004, "step": 5694 }, { "epoch": 3.4651658046851233, "grad_norm": 0.3154228627681732, "learning_rate": 2.346252738701071e-06, "loss": 0.005, "step": 5695 }, { "epoch": 3.4657742622452083, "grad_norm": 0.19232860207557678, "learning_rate": 2.340990221454628e-06, "loss": 0.0035, "step": 5696 }, { "epoch": 3.4663827198052934, "grad_norm": 0.28526800870895386, "learning_rate": 2.3357333226981333e-06, "loss": 0.0065, "step": 5697 }, { "epoch": 3.4669911773653785, "grad_norm": 0.20558756589889526, "learning_rate": 2.33048204373508e-06, "loss": 0.0026, "step": 5698 }, { "epoch": 3.467599634925464, "grad_norm": 0.21182219684123993, "learning_rate": 2.3252363858675684e-06, "loss": 0.0044, "step": 5699 }, { "epoch": 3.468208092485549, "grad_norm": 0.22109881043434143, "learning_rate": 2.3199963503963e-06, "loss": 0.0043, "step": 5700 }, { "epoch": 3.4688165500456343, "grad_norm": 0.26969975233078003, "learning_rate": 2.314761938620591e-06, "loss": 0.0073, "step": 5701 }, { "epoch": 3.4694250076057194, "grad_norm": 0.20506982505321503, "learning_rate": 2.3095331518383582e-06, "loss": 0.0061, "step": 5702 }, { "epoch": 3.4700334651658045, "grad_norm": 0.20657236874103546, "learning_rate": 2.3043099913461125e-06, "loss": 0.0033, "step": 5703 }, { "epoch": 3.47064192272589, "grad_norm": 0.22096757590770721, "learning_rate": 2.2990924584389868e-06, "loss": 0.0039, "step": 5704 }, { "epoch": 3.471250380285975, "grad_norm": 0.221493199467659, "learning_rate": 2.29388055441071e-06, "loss": 0.0034, "step": 5705 }, { "epoch": 3.47185883784606, "grad_norm": 0.2772558331489563, "learning_rate": 2.2886742805536183e-06, "loss": 0.0058, "step": 5706 }, { "epoch": 3.4724672954061453, "grad_norm": 0.26506444811820984, "learning_rate": 2.283473638158656e-06, "loss": 0.0064, "step": 5707 }, { "epoch": 3.4730757529662304, "grad_norm": 0.18250389397144318, "learning_rate": 2.278278628515354e-06, "loss": 0.0037, "step": 5708 }, { "epoch": 3.473684210526316, "grad_norm": 0.21926113963127136, "learning_rate": 2.2730892529118643e-06, "loss": 0.0046, "step": 5709 }, { "epoch": 3.474292668086401, "grad_norm": 0.2898429334163666, "learning_rate": 2.267905512634935e-06, "loss": 0.0055, "step": 5710 }, { "epoch": 3.474901125646486, "grad_norm": 0.21994440257549286, "learning_rate": 2.2627274089699195e-06, "loss": 0.005, "step": 5711 }, { "epoch": 3.475509583206571, "grad_norm": 0.3623565137386322, "learning_rate": 2.257554943200771e-06, "loss": 0.0111, "step": 5712 }, { "epoch": 3.4761180407666563, "grad_norm": 0.2396956980228424, "learning_rate": 2.252388116610046e-06, "loss": 0.0043, "step": 5713 }, { "epoch": 3.476726498326742, "grad_norm": 0.25912052392959595, "learning_rate": 2.247226930478899e-06, "loss": 0.0062, "step": 5714 }, { "epoch": 3.477334955886827, "grad_norm": 0.22874920070171356, "learning_rate": 2.2420713860870914e-06, "loss": 0.0055, "step": 5715 }, { "epoch": 3.477943413446912, "grad_norm": 0.3113996386528015, "learning_rate": 2.2369214847129812e-06, "loss": 0.0056, "step": 5716 }, { "epoch": 3.478551871006997, "grad_norm": 0.21554897725582123, "learning_rate": 2.231777227633533e-06, "loss": 0.0053, "step": 5717 }, { "epoch": 3.4791603285670822, "grad_norm": 0.29633817076683044, "learning_rate": 2.2266386161243045e-06, "loss": 0.0057, "step": 5718 }, { "epoch": 3.479768786127168, "grad_norm": 0.14219973981380463, "learning_rate": 2.221505651459463e-06, "loss": 0.0035, "step": 5719 }, { "epoch": 3.480377243687253, "grad_norm": 0.2941674590110779, "learning_rate": 2.2163783349117616e-06, "loss": 0.0053, "step": 5720 }, { "epoch": 3.480985701247338, "grad_norm": 0.30135074257850647, "learning_rate": 2.211256667752565e-06, "loss": 0.0099, "step": 5721 }, { "epoch": 3.481594158807423, "grad_norm": 0.3033702075481415, "learning_rate": 2.206140651251834e-06, "loss": 0.0064, "step": 5722 }, { "epoch": 3.482202616367508, "grad_norm": 0.26715901494026184, "learning_rate": 2.2010302866781296e-06, "loss": 0.0059, "step": 5723 }, { "epoch": 3.4828110739275937, "grad_norm": 0.3949749767780304, "learning_rate": 2.1959255752986017e-06, "loss": 0.0104, "step": 5724 }, { "epoch": 3.483419531487679, "grad_norm": 0.2797585129737854, "learning_rate": 2.1908265183790105e-06, "loss": 0.0073, "step": 5725 }, { "epoch": 3.484027989047764, "grad_norm": 0.23626194894313812, "learning_rate": 2.1857331171837107e-06, "loss": 0.009, "step": 5726 }, { "epoch": 3.484636446607849, "grad_norm": 0.23918910324573517, "learning_rate": 2.180645372975651e-06, "loss": 0.0059, "step": 5727 }, { "epoch": 3.485244904167934, "grad_norm": 0.16909535229206085, "learning_rate": 2.1755632870163828e-06, "loss": 0.0032, "step": 5728 }, { "epoch": 3.4858533617280196, "grad_norm": 0.2599836587905884, "learning_rate": 2.170486860566054e-06, "loss": 0.008, "step": 5729 }, { "epoch": 3.4864618192881047, "grad_norm": 0.3136194944381714, "learning_rate": 2.1654160948833963e-06, "loss": 0.0058, "step": 5730 }, { "epoch": 3.48707027684819, "grad_norm": 0.2240346372127533, "learning_rate": 2.160350991225757e-06, "loss": 0.0049, "step": 5731 }, { "epoch": 3.487678734408275, "grad_norm": 0.34254634380340576, "learning_rate": 2.1552915508490675e-06, "loss": 0.0119, "step": 5732 }, { "epoch": 3.48828719196836, "grad_norm": 0.22509586811065674, "learning_rate": 2.1502377750078627e-06, "loss": 0.0043, "step": 5733 }, { "epoch": 3.4888956495284456, "grad_norm": 0.19744735956192017, "learning_rate": 2.145189664955258e-06, "loss": 0.0055, "step": 5734 }, { "epoch": 3.4895041070885306, "grad_norm": 0.24436576664447784, "learning_rate": 2.1401472219429867e-06, "loss": 0.0053, "step": 5735 }, { "epoch": 3.4901125646486157, "grad_norm": 0.29822880029678345, "learning_rate": 2.1351104472213585e-06, "loss": 0.0054, "step": 5736 }, { "epoch": 3.490721022208701, "grad_norm": 0.33873575925827026, "learning_rate": 2.1300793420392815e-06, "loss": 0.0053, "step": 5737 }, { "epoch": 3.491329479768786, "grad_norm": 0.3289222717285156, "learning_rate": 2.1250539076442617e-06, "loss": 0.0073, "step": 5738 }, { "epoch": 3.4919379373288715, "grad_norm": 0.23619408905506134, "learning_rate": 2.120034145282404e-06, "loss": 0.0044, "step": 5739 }, { "epoch": 3.4925463948889566, "grad_norm": 0.2359532117843628, "learning_rate": 2.115020056198383e-06, "loss": 0.0052, "step": 5740 }, { "epoch": 3.4931548524490417, "grad_norm": 0.20343980193138123, "learning_rate": 2.1100116416355063e-06, "loss": 0.0051, "step": 5741 }, { "epoch": 3.4937633100091268, "grad_norm": 0.2578510642051697, "learning_rate": 2.1050089028356366e-06, "loss": 0.0055, "step": 5742 }, { "epoch": 3.494371767569212, "grad_norm": 0.2600227892398834, "learning_rate": 2.100011841039251e-06, "loss": 0.0072, "step": 5743 }, { "epoch": 3.4949802251292974, "grad_norm": 0.23431195318698883, "learning_rate": 2.0950204574854027e-06, "loss": 0.0045, "step": 5744 }, { "epoch": 3.4955886826893825, "grad_norm": 0.265465646982193, "learning_rate": 2.0900347534117627e-06, "loss": 0.007, "step": 5745 }, { "epoch": 3.4961971402494676, "grad_norm": 0.22313670814037323, "learning_rate": 2.0850547300545668e-06, "loss": 0.0041, "step": 5746 }, { "epoch": 3.4968055978095527, "grad_norm": 0.3893153965473175, "learning_rate": 2.080080388648653e-06, "loss": 0.0111, "step": 5747 }, { "epoch": 3.497414055369638, "grad_norm": 0.31730809807777405, "learning_rate": 2.0751117304274563e-06, "loss": 0.0099, "step": 5748 }, { "epoch": 3.4980225129297233, "grad_norm": 0.30250468850135803, "learning_rate": 2.070148756622997e-06, "loss": 0.0057, "step": 5749 }, { "epoch": 3.4986309704898084, "grad_norm": 0.30746808648109436, "learning_rate": 2.065191468465874e-06, "loss": 0.0084, "step": 5750 }, { "epoch": 3.4992394280498935, "grad_norm": 0.1614643782377243, "learning_rate": 2.060239867185304e-06, "loss": 0.0021, "step": 5751 }, { "epoch": 3.4998478856099786, "grad_norm": 0.2963274121284485, "learning_rate": 2.0552939540090687e-06, "loss": 0.006, "step": 5752 }, { "epoch": 3.5004563431700637, "grad_norm": 0.20009563863277435, "learning_rate": 2.050353730163554e-06, "loss": 0.0052, "step": 5753 }, { "epoch": 3.5010648007301493, "grad_norm": 0.20086035132408142, "learning_rate": 2.045419196873716e-06, "loss": 0.0028, "step": 5754 }, { "epoch": 3.5016732582902343, "grad_norm": 0.22441764175891876, "learning_rate": 2.0404903553631337e-06, "loss": 0.0052, "step": 5755 }, { "epoch": 3.5022817158503194, "grad_norm": 0.2553858160972595, "learning_rate": 2.0355672068539387e-06, "loss": 0.0045, "step": 5756 }, { "epoch": 3.5028901734104045, "grad_norm": 0.32011979818344116, "learning_rate": 2.030649752566871e-06, "loss": 0.0076, "step": 5757 }, { "epoch": 3.5034986309704896, "grad_norm": 0.21500691771507263, "learning_rate": 2.025737993721255e-06, "loss": 0.0053, "step": 5758 }, { "epoch": 3.504107088530575, "grad_norm": 0.32461220026016235, "learning_rate": 2.020831931535008e-06, "loss": 0.0096, "step": 5759 }, { "epoch": 3.5047155460906603, "grad_norm": 0.28721633553504944, "learning_rate": 2.015931567224613e-06, "loss": 0.0088, "step": 5760 }, { "epoch": 3.5053240036507454, "grad_norm": 0.17419946193695068, "learning_rate": 2.0110369020051755e-06, "loss": 0.0027, "step": 5761 }, { "epoch": 3.5059324612108305, "grad_norm": 0.33820995688438416, "learning_rate": 2.006147937090355e-06, "loss": 0.0107, "step": 5762 }, { "epoch": 3.5065409187709156, "grad_norm": 0.3115573227405548, "learning_rate": 2.001264673692413e-06, "loss": 0.0061, "step": 5763 }, { "epoch": 3.507149376331001, "grad_norm": 0.2463875263929367, "learning_rate": 1.9963871130221997e-06, "loss": 0.0061, "step": 5764 }, { "epoch": 3.507757833891086, "grad_norm": 0.19313430786132812, "learning_rate": 1.9915152562891476e-06, "loss": 0.0046, "step": 5765 }, { "epoch": 3.5083662914511713, "grad_norm": 0.27370426058769226, "learning_rate": 1.9866491047012687e-06, "loss": 0.0052, "step": 5766 }, { "epoch": 3.5089747490112564, "grad_norm": 0.22294797003269196, "learning_rate": 1.981788659465164e-06, "loss": 0.0045, "step": 5767 }, { "epoch": 3.5095832065713415, "grad_norm": 0.24368692934513092, "learning_rate": 1.976933921786028e-06, "loss": 0.0047, "step": 5768 }, { "epoch": 3.510191664131427, "grad_norm": 0.29862114787101746, "learning_rate": 1.972084892867637e-06, "loss": 0.0081, "step": 5769 }, { "epoch": 3.510800121691512, "grad_norm": 0.16069160401821136, "learning_rate": 1.967241573912329e-06, "loss": 0.0023, "step": 5770 }, { "epoch": 3.511408579251597, "grad_norm": 0.20247617363929749, "learning_rate": 1.962403966121071e-06, "loss": 0.0043, "step": 5771 }, { "epoch": 3.5120170368116823, "grad_norm": 0.2329401671886444, "learning_rate": 1.957572070693367e-06, "loss": 0.0039, "step": 5772 }, { "epoch": 3.5126254943717674, "grad_norm": 0.2965979278087616, "learning_rate": 1.952745888827337e-06, "loss": 0.0107, "step": 5773 }, { "epoch": 3.513233951931853, "grad_norm": 0.10749952495098114, "learning_rate": 1.947925421719668e-06, "loss": 0.0021, "step": 5774 }, { "epoch": 3.513842409491938, "grad_norm": 0.2879178822040558, "learning_rate": 1.9431106705656397e-06, "loss": 0.0084, "step": 5775 }, { "epoch": 3.514450867052023, "grad_norm": 0.34235483407974243, "learning_rate": 1.938301636559098e-06, "loss": 0.0071, "step": 5776 }, { "epoch": 3.5150593246121082, "grad_norm": 0.21877804398536682, "learning_rate": 1.9334983208925017e-06, "loss": 0.007, "step": 5777 }, { "epoch": 3.5156677821721933, "grad_norm": 0.36894896626472473, "learning_rate": 1.9287007247568573e-06, "loss": 0.007, "step": 5778 }, { "epoch": 3.516276239732279, "grad_norm": 0.3058571517467499, "learning_rate": 1.9239088493417796e-06, "loss": 0.0046, "step": 5779 }, { "epoch": 3.516884697292364, "grad_norm": 0.24490618705749512, "learning_rate": 1.9191226958354403e-06, "loss": 0.0057, "step": 5780 }, { "epoch": 3.517493154852449, "grad_norm": 0.304394006729126, "learning_rate": 1.9143422654246205e-06, "loss": 0.0098, "step": 5781 }, { "epoch": 3.518101612412534, "grad_norm": 0.28492802381515503, "learning_rate": 1.9095675592946587e-06, "loss": 0.0061, "step": 5782 }, { "epoch": 3.5187100699726193, "grad_norm": 0.2946596145629883, "learning_rate": 1.9047985786294853e-06, "loss": 0.0061, "step": 5783 }, { "epoch": 3.519318527532705, "grad_norm": 0.18398457765579224, "learning_rate": 1.900035324611607e-06, "loss": 0.0037, "step": 5784 }, { "epoch": 3.51992698509279, "grad_norm": 0.23536744713783264, "learning_rate": 1.895277798422121e-06, "loss": 0.004, "step": 5785 }, { "epoch": 3.520535442652875, "grad_norm": 0.24130214750766754, "learning_rate": 1.8905260012406778e-06, "loss": 0.0049, "step": 5786 }, { "epoch": 3.52114390021296, "grad_norm": 0.2898615300655365, "learning_rate": 1.8857799342455462e-06, "loss": 0.0057, "step": 5787 }, { "epoch": 3.521752357773045, "grad_norm": 0.25997620820999146, "learning_rate": 1.8810395986135377e-06, "loss": 0.0057, "step": 5788 }, { "epoch": 3.5223608153331307, "grad_norm": 0.2553774416446686, "learning_rate": 1.8763049955200674e-06, "loss": 0.0057, "step": 5789 }, { "epoch": 3.522969272893216, "grad_norm": 0.26607316732406616, "learning_rate": 1.8715761261391074e-06, "loss": 0.0045, "step": 5790 }, { "epoch": 3.523577730453301, "grad_norm": 0.24370121955871582, "learning_rate": 1.8668529916432365e-06, "loss": 0.0047, "step": 5791 }, { "epoch": 3.524186188013386, "grad_norm": 0.2677232325077057, "learning_rate": 1.8621355932035788e-06, "loss": 0.0061, "step": 5792 }, { "epoch": 3.524794645573471, "grad_norm": 0.29431116580963135, "learning_rate": 1.8574239319898657e-06, "loss": 0.0068, "step": 5793 }, { "epoch": 3.5254031031335566, "grad_norm": 0.2646505832672119, "learning_rate": 1.8527180091703843e-06, "loss": 0.005, "step": 5794 }, { "epoch": 3.5260115606936417, "grad_norm": 0.2067060023546219, "learning_rate": 1.848017825912013e-06, "loss": 0.0051, "step": 5795 }, { "epoch": 3.526620018253727, "grad_norm": 0.19585838913917542, "learning_rate": 1.843323383380194e-06, "loss": 0.0028, "step": 5796 }, { "epoch": 3.527228475813812, "grad_norm": 0.27853307127952576, "learning_rate": 1.8386346827389629e-06, "loss": 0.006, "step": 5797 }, { "epoch": 3.527836933373897, "grad_norm": 0.260459840297699, "learning_rate": 1.8339517251509146e-06, "loss": 0.0058, "step": 5798 }, { "epoch": 3.5284453909339826, "grad_norm": 0.21536771953105927, "learning_rate": 1.829274511777232e-06, "loss": 0.006, "step": 5799 }, { "epoch": 3.5290538484940677, "grad_norm": 0.3227105736732483, "learning_rate": 1.8246030437776645e-06, "loss": 0.0062, "step": 5800 }, { "epoch": 3.5296623060541528, "grad_norm": 0.21509003639221191, "learning_rate": 1.8199373223105498e-06, "loss": 0.0041, "step": 5801 }, { "epoch": 3.530270763614238, "grad_norm": 0.3301815092563629, "learning_rate": 1.8152773485327818e-06, "loss": 0.0088, "step": 5802 }, { "epoch": 3.530879221174323, "grad_norm": 0.13643626868724823, "learning_rate": 1.8106231235998444e-06, "loss": 0.0032, "step": 5803 }, { "epoch": 3.5314876787344085, "grad_norm": 0.3046422600746155, "learning_rate": 1.8059746486657896e-06, "loss": 0.0091, "step": 5804 }, { "epoch": 3.5320961362944936, "grad_norm": 0.2640552222728729, "learning_rate": 1.8013319248832538e-06, "loss": 0.0067, "step": 5805 }, { "epoch": 3.5327045938545787, "grad_norm": 0.32940080761909485, "learning_rate": 1.7966949534034243e-06, "loss": 0.0086, "step": 5806 }, { "epoch": 3.533313051414664, "grad_norm": 0.2538890242576599, "learning_rate": 1.7920637353760928e-06, "loss": 0.0057, "step": 5807 }, { "epoch": 3.533921508974749, "grad_norm": 0.6640288233757019, "learning_rate": 1.7874382719495958e-06, "loss": 0.0576, "step": 5808 }, { "epoch": 3.5345299665348344, "grad_norm": 0.3605937063694, "learning_rate": 1.7828185642708605e-06, "loss": 0.0078, "step": 5809 }, { "epoch": 3.5351384240949195, "grad_norm": 0.2806709408760071, "learning_rate": 1.7782046134853792e-06, "loss": 0.0088, "step": 5810 }, { "epoch": 3.5357468816550046, "grad_norm": 0.17481489479541779, "learning_rate": 1.773596420737228e-06, "loss": 0.0039, "step": 5811 }, { "epoch": 3.5363553392150897, "grad_norm": 0.15608619153499603, "learning_rate": 1.7689939871690375e-06, "loss": 0.0043, "step": 5812 }, { "epoch": 3.536963796775175, "grad_norm": 0.2716093361377716, "learning_rate": 1.7643973139220198e-06, "loss": 0.0101, "step": 5813 }, { "epoch": 3.5375722543352603, "grad_norm": 0.13356530666351318, "learning_rate": 1.7598064021359607e-06, "loss": 0.0015, "step": 5814 }, { "epoch": 3.5381807118953454, "grad_norm": 0.17770147323608398, "learning_rate": 1.7552212529492158e-06, "loss": 0.0029, "step": 5815 }, { "epoch": 3.5387891694554305, "grad_norm": 0.1851194053888321, "learning_rate": 1.750641867498709e-06, "loss": 0.0034, "step": 5816 }, { "epoch": 3.5393976270155156, "grad_norm": 0.3669975697994232, "learning_rate": 1.7460682469199435e-06, "loss": 0.0077, "step": 5817 }, { "epoch": 3.5400060845756007, "grad_norm": 0.24136383831501007, "learning_rate": 1.7415003923469787e-06, "loss": 0.0053, "step": 5818 }, { "epoch": 3.5406145421356863, "grad_norm": 0.2976721227169037, "learning_rate": 1.7369383049124498e-06, "loss": 0.0061, "step": 5819 }, { "epoch": 3.5412229996957714, "grad_norm": 0.20041820406913757, "learning_rate": 1.7323819857475721e-06, "loss": 0.0051, "step": 5820 }, { "epoch": 3.5418314572558565, "grad_norm": 0.2120663970708847, "learning_rate": 1.727831435982119e-06, "loss": 0.0047, "step": 5821 }, { "epoch": 3.5424399148159416, "grad_norm": 0.18316033482551575, "learning_rate": 1.7232866567444384e-06, "loss": 0.0035, "step": 5822 }, { "epoch": 3.5430483723760267, "grad_norm": 0.22955255210399628, "learning_rate": 1.7187476491614507e-06, "loss": 0.0058, "step": 5823 }, { "epoch": 3.543656829936112, "grad_norm": 0.22870229184627533, "learning_rate": 1.7142144143586308e-06, "loss": 0.0033, "step": 5824 }, { "epoch": 3.5442652874961973, "grad_norm": 0.24868161976337433, "learning_rate": 1.7096869534600352e-06, "loss": 0.0076, "step": 5825 }, { "epoch": 3.5448737450562824, "grad_norm": 0.32703396677970886, "learning_rate": 1.7051652675882878e-06, "loss": 0.0066, "step": 5826 }, { "epoch": 3.5454822026163675, "grad_norm": 0.28158140182495117, "learning_rate": 1.7006493578645838e-06, "loss": 0.0064, "step": 5827 }, { "epoch": 3.5460906601764526, "grad_norm": 0.2724444270133972, "learning_rate": 1.6961392254086689e-06, "loss": 0.0065, "step": 5828 }, { "epoch": 3.546699117736538, "grad_norm": 0.312369704246521, "learning_rate": 1.6916348713388708e-06, "loss": 0.0063, "step": 5829 }, { "epoch": 3.547307575296623, "grad_norm": 0.2701030373573303, "learning_rate": 1.6871362967720878e-06, "loss": 0.006, "step": 5830 }, { "epoch": 3.5479160328567083, "grad_norm": 0.16680403053760529, "learning_rate": 1.682643502823772e-06, "loss": 0.0028, "step": 5831 }, { "epoch": 3.5485244904167934, "grad_norm": 0.25249308347702026, "learning_rate": 1.6781564906079545e-06, "loss": 0.0045, "step": 5832 }, { "epoch": 3.5491329479768785, "grad_norm": 0.1920885592699051, "learning_rate": 1.6736752612372286e-06, "loss": 0.0041, "step": 5833 }, { "epoch": 3.549741405536964, "grad_norm": 0.28763115406036377, "learning_rate": 1.6691998158227446e-06, "loss": 0.007, "step": 5834 }, { "epoch": 3.550349863097049, "grad_norm": 0.479855477809906, "learning_rate": 1.6647301554742312e-06, "loss": 0.0151, "step": 5835 }, { "epoch": 3.5509583206571342, "grad_norm": 0.4631776213645935, "learning_rate": 1.6602662812999742e-06, "loss": 0.0091, "step": 5836 }, { "epoch": 3.5515667782172193, "grad_norm": 0.3424180746078491, "learning_rate": 1.6558081944068354e-06, "loss": 0.0108, "step": 5837 }, { "epoch": 3.5521752357773044, "grad_norm": 0.24957185983657837, "learning_rate": 1.6513558959002334e-06, "loss": 0.0073, "step": 5838 }, { "epoch": 3.55278369333739, "grad_norm": 0.23904968798160553, "learning_rate": 1.6469093868841434e-06, "loss": 0.0071, "step": 5839 }, { "epoch": 3.553392150897475, "grad_norm": 0.1692766696214676, "learning_rate": 1.6424686684611224e-06, "loss": 0.0027, "step": 5840 }, { "epoch": 3.55400060845756, "grad_norm": 0.28650224208831787, "learning_rate": 1.638033741732281e-06, "loss": 0.0097, "step": 5841 }, { "epoch": 3.5546090660176453, "grad_norm": 0.14089111983776093, "learning_rate": 1.6336046077972983e-06, "loss": 0.0019, "step": 5842 }, { "epoch": 3.5552175235777304, "grad_norm": 0.148437961935997, "learning_rate": 1.6291812677544121e-06, "loss": 0.0024, "step": 5843 }, { "epoch": 3.555825981137816, "grad_norm": 0.26019591093063354, "learning_rate": 1.6247637227004342e-06, "loss": 0.007, "step": 5844 }, { "epoch": 3.5564344386979005, "grad_norm": 0.2771880626678467, "learning_rate": 1.6203519737307187e-06, "loss": 0.0042, "step": 5845 }, { "epoch": 3.557042896257986, "grad_norm": 0.27632075548171997, "learning_rate": 1.615946021939202e-06, "loss": 0.0059, "step": 5846 }, { "epoch": 3.557651353818071, "grad_norm": 0.2640374004840851, "learning_rate": 1.6115458684183793e-06, "loss": 0.0031, "step": 5847 }, { "epoch": 3.5582598113781563, "grad_norm": 0.23111113905906677, "learning_rate": 1.607151514259303e-06, "loss": 0.0044, "step": 5848 }, { "epoch": 3.558868268938242, "grad_norm": 0.40425482392311096, "learning_rate": 1.602762960551582e-06, "loss": 0.0084, "step": 5849 }, { "epoch": 3.5594767264983265, "grad_norm": 0.17525078356266022, "learning_rate": 1.5983802083834126e-06, "loss": 0.0032, "step": 5850 }, { "epoch": 3.560085184058412, "grad_norm": 0.2451610118150711, "learning_rate": 1.5940032588415171e-06, "loss": 0.0077, "step": 5851 }, { "epoch": 3.560693641618497, "grad_norm": 0.2035626322031021, "learning_rate": 1.5896321130112023e-06, "loss": 0.0035, "step": 5852 }, { "epoch": 3.561302099178582, "grad_norm": 0.16917142271995544, "learning_rate": 1.5852667719763348e-06, "loss": 0.0026, "step": 5853 }, { "epoch": 3.5619105567386677, "grad_norm": 0.40496107935905457, "learning_rate": 1.5809072368193345e-06, "loss": 0.027, "step": 5854 }, { "epoch": 3.5625190142987524, "grad_norm": 0.2576707899570465, "learning_rate": 1.5765535086211786e-06, "loss": 0.0059, "step": 5855 }, { "epoch": 3.563127471858838, "grad_norm": 0.27489739656448364, "learning_rate": 1.57220558846142e-06, "loss": 0.0037, "step": 5856 }, { "epoch": 3.563735929418923, "grad_norm": 0.16968370974063873, "learning_rate": 1.567863477418155e-06, "loss": 0.0036, "step": 5857 }, { "epoch": 3.564344386979008, "grad_norm": 0.271075040102005, "learning_rate": 1.5635271765680525e-06, "loss": 0.0051, "step": 5858 }, { "epoch": 3.5649528445390937, "grad_norm": 0.27260100841522217, "learning_rate": 1.5591966869863196e-06, "loss": 0.0062, "step": 5859 }, { "epoch": 3.5655613020991783, "grad_norm": 0.15648119151592255, "learning_rate": 1.554872009746758e-06, "loss": 0.0032, "step": 5860 }, { "epoch": 3.566169759659264, "grad_norm": 0.23940731585025787, "learning_rate": 1.5505531459216904e-06, "loss": 0.004, "step": 5861 }, { "epoch": 3.566778217219349, "grad_norm": 0.3243446350097656, "learning_rate": 1.5462400965820218e-06, "loss": 0.0109, "step": 5862 }, { "epoch": 3.567386674779434, "grad_norm": 0.14726978540420532, "learning_rate": 1.5419328627972103e-06, "loss": 0.0021, "step": 5863 }, { "epoch": 3.5679951323395196, "grad_norm": 0.2737961411476135, "learning_rate": 1.5376314456352708e-06, "loss": 0.0065, "step": 5864 }, { "epoch": 3.5686035898996042, "grad_norm": 0.2394101470708847, "learning_rate": 1.5333358461627673e-06, "loss": 0.0036, "step": 5865 }, { "epoch": 3.56921204745969, "grad_norm": 0.20798607170581818, "learning_rate": 1.5290460654448418e-06, "loss": 0.0051, "step": 5866 }, { "epoch": 3.569820505019775, "grad_norm": 0.21632260084152222, "learning_rate": 1.5247621045451688e-06, "loss": 0.0044, "step": 5867 }, { "epoch": 3.57042896257986, "grad_norm": 0.28413519263267517, "learning_rate": 1.5204839645259983e-06, "loss": 0.005, "step": 5868 }, { "epoch": 3.571037420139945, "grad_norm": 0.28667712211608887, "learning_rate": 1.5162116464481318e-06, "loss": 0.0077, "step": 5869 }, { "epoch": 3.57164587770003, "grad_norm": 0.25631603598594666, "learning_rate": 1.5119451513709277e-06, "loss": 0.0039, "step": 5870 }, { "epoch": 3.5722543352601157, "grad_norm": 0.19717690348625183, "learning_rate": 1.5076844803522922e-06, "loss": 0.0024, "step": 5871 }, { "epoch": 3.572862792820201, "grad_norm": 0.28270480036735535, "learning_rate": 1.503429634448697e-06, "loss": 0.0071, "step": 5872 }, { "epoch": 3.573471250380286, "grad_norm": 0.23655082285404205, "learning_rate": 1.4991806147151677e-06, "loss": 0.0069, "step": 5873 }, { "epoch": 3.574079707940371, "grad_norm": 0.22452490031719208, "learning_rate": 1.4949374222052864e-06, "loss": 0.0042, "step": 5874 }, { "epoch": 3.574688165500456, "grad_norm": 0.21213477849960327, "learning_rate": 1.4907000579711782e-06, "loss": 0.0043, "step": 5875 }, { "epoch": 3.5752966230605416, "grad_norm": 0.27884024381637573, "learning_rate": 1.4864685230635473e-06, "loss": 0.0057, "step": 5876 }, { "epoch": 3.5759050806206267, "grad_norm": 0.23877516388893127, "learning_rate": 1.4822428185316261e-06, "loss": 0.0067, "step": 5877 }, { "epoch": 3.576513538180712, "grad_norm": 0.20616334676742554, "learning_rate": 1.4780229454232158e-06, "loss": 0.0053, "step": 5878 }, { "epoch": 3.577121995740797, "grad_norm": 0.18893709778785706, "learning_rate": 1.4738089047846736e-06, "loss": 0.0038, "step": 5879 }, { "epoch": 3.577730453300882, "grad_norm": 0.2203339785337448, "learning_rate": 1.4696006976609057e-06, "loss": 0.0054, "step": 5880 }, { "epoch": 3.5783389108609676, "grad_norm": 0.15522873401641846, "learning_rate": 1.4653983250953657e-06, "loss": 0.0029, "step": 5881 }, { "epoch": 3.5789473684210527, "grad_norm": 0.14982964098453522, "learning_rate": 1.4612017881300704e-06, "loss": 0.0032, "step": 5882 }, { "epoch": 3.5795558259811378, "grad_norm": 0.26596924662590027, "learning_rate": 1.4570110878055877e-06, "loss": 0.0044, "step": 5883 }, { "epoch": 3.580164283541223, "grad_norm": 0.2501273453235626, "learning_rate": 1.4528262251610358e-06, "loss": 0.007, "step": 5884 }, { "epoch": 3.580772741101308, "grad_norm": 0.19661864638328552, "learning_rate": 1.4486472012340824e-06, "loss": 0.0042, "step": 5885 }, { "epoch": 3.5813811986613935, "grad_norm": 0.22375038266181946, "learning_rate": 1.4444740170609567e-06, "loss": 0.0061, "step": 5886 }, { "epoch": 3.5819896562214786, "grad_norm": 0.28383755683898926, "learning_rate": 1.440306673676431e-06, "loss": 0.0056, "step": 5887 }, { "epoch": 3.5825981137815637, "grad_norm": 0.17903096973896027, "learning_rate": 1.436145172113834e-06, "loss": 0.0037, "step": 5888 }, { "epoch": 3.5832065713416488, "grad_norm": 0.24099615216255188, "learning_rate": 1.4319895134050437e-06, "loss": 0.0047, "step": 5889 }, { "epoch": 3.583815028901734, "grad_norm": 0.20468921959400177, "learning_rate": 1.4278396985804966e-06, "loss": 0.005, "step": 5890 }, { "epoch": 3.5844234864618194, "grad_norm": 0.26146969199180603, "learning_rate": 1.4236957286691581e-06, "loss": 0.0064, "step": 5891 }, { "epoch": 3.5850319440219045, "grad_norm": 0.328022837638855, "learning_rate": 1.4195576046985793e-06, "loss": 0.0064, "step": 5892 }, { "epoch": 3.5856404015819896, "grad_norm": 0.26720380783081055, "learning_rate": 1.4154253276948276e-06, "loss": 0.0044, "step": 5893 }, { "epoch": 3.5862488591420747, "grad_norm": 0.20053601264953613, "learning_rate": 1.4112988986825476e-06, "loss": 0.0033, "step": 5894 }, { "epoch": 3.58685731670216, "grad_norm": 0.251700758934021, "learning_rate": 1.407178318684907e-06, "loss": 0.0063, "step": 5895 }, { "epoch": 3.5874657742622453, "grad_norm": 0.4364267587661743, "learning_rate": 1.403063588723652e-06, "loss": 0.0114, "step": 5896 }, { "epoch": 3.5880742318223304, "grad_norm": 0.25533613562583923, "learning_rate": 1.3989547098190559e-06, "loss": 0.0049, "step": 5897 }, { "epoch": 3.5886826893824155, "grad_norm": 0.22121703624725342, "learning_rate": 1.3948516829899505e-06, "loss": 0.0056, "step": 5898 }, { "epoch": 3.5892911469425006, "grad_norm": 0.16689860820770264, "learning_rate": 1.3907545092537166e-06, "loss": 0.0029, "step": 5899 }, { "epoch": 3.5898996045025857, "grad_norm": 0.22792737185955048, "learning_rate": 1.386663189626286e-06, "loss": 0.0051, "step": 5900 }, { "epoch": 3.5905080620626713, "grad_norm": 0.275244802236557, "learning_rate": 1.3825777251221278e-06, "loss": 0.0072, "step": 5901 }, { "epoch": 3.5911165196227564, "grad_norm": 0.34632372856140137, "learning_rate": 1.378498116754276e-06, "loss": 0.0078, "step": 5902 }, { "epoch": 3.5917249771828414, "grad_norm": 0.20081204175949097, "learning_rate": 1.3744243655342937e-06, "loss": 0.006, "step": 5903 }, { "epoch": 3.5923334347429265, "grad_norm": 0.11412859708070755, "learning_rate": 1.3703564724723116e-06, "loss": 0.0016, "step": 5904 }, { "epoch": 3.5929418923030116, "grad_norm": 0.22430390119552612, "learning_rate": 1.3662944385769843e-06, "loss": 0.004, "step": 5905 }, { "epoch": 3.593550349863097, "grad_norm": 0.25405508279800415, "learning_rate": 1.362238264855542e-06, "loss": 0.007, "step": 5906 }, { "epoch": 3.5941588074231823, "grad_norm": 0.447488397359848, "learning_rate": 1.3581879523137386e-06, "loss": 0.0064, "step": 5907 }, { "epoch": 3.5947672649832674, "grad_norm": 0.2718401253223419, "learning_rate": 1.354143501955879e-06, "loss": 0.0067, "step": 5908 }, { "epoch": 3.5953757225433525, "grad_norm": 0.18165600299835205, "learning_rate": 1.3501049147848277e-06, "loss": 0.0047, "step": 5909 }, { "epoch": 3.5959841801034376, "grad_norm": 0.2794153392314911, "learning_rate": 1.346072191801981e-06, "loss": 0.0047, "step": 5910 }, { "epoch": 3.596592637663523, "grad_norm": 0.21536363661289215, "learning_rate": 1.3420453340072832e-06, "loss": 0.0067, "step": 5911 }, { "epoch": 3.597201095223608, "grad_norm": 0.2198522537946701, "learning_rate": 1.3380243423992328e-06, "loss": 0.0053, "step": 5912 }, { "epoch": 3.5978095527836933, "grad_norm": 0.17715135216712952, "learning_rate": 1.3340092179748658e-06, "loss": 0.004, "step": 5913 }, { "epoch": 3.5984180103437784, "grad_norm": 0.2602989375591278, "learning_rate": 1.3299999617297637e-06, "loss": 0.0056, "step": 5914 }, { "epoch": 3.5990264679038635, "grad_norm": 0.20000648498535156, "learning_rate": 1.3259965746580588e-06, "loss": 0.0018, "step": 5915 }, { "epoch": 3.599634925463949, "grad_norm": 0.25586432218551636, "learning_rate": 1.3219990577524239e-06, "loss": 0.008, "step": 5916 }, { "epoch": 3.600243383024034, "grad_norm": 0.2492225617170334, "learning_rate": 1.3180074120040741e-06, "loss": 0.0054, "step": 5917 }, { "epoch": 3.6008518405841192, "grad_norm": 0.21238954365253448, "learning_rate": 1.3140216384027682e-06, "loss": 0.0044, "step": 5918 }, { "epoch": 3.6014602981442043, "grad_norm": 0.13737817108631134, "learning_rate": 1.3100417379368179e-06, "loss": 0.0033, "step": 5919 }, { "epoch": 3.6020687557042894, "grad_norm": 0.34668809175491333, "learning_rate": 1.306067711593076e-06, "loss": 0.0071, "step": 5920 }, { "epoch": 3.602677213264375, "grad_norm": 0.2114226520061493, "learning_rate": 1.3020995603569203e-06, "loss": 0.0044, "step": 5921 }, { "epoch": 3.60328567082446, "grad_norm": 0.3277231454849243, "learning_rate": 1.298137285212303e-06, "loss": 0.008, "step": 5922 }, { "epoch": 3.603894128384545, "grad_norm": 0.2284201830625534, "learning_rate": 1.2941808871416938e-06, "loss": 0.0035, "step": 5923 }, { "epoch": 3.6045025859446302, "grad_norm": 0.2673366069793701, "learning_rate": 1.290230367126119e-06, "loss": 0.005, "step": 5924 }, { "epoch": 3.6051110435047153, "grad_norm": 0.21040508151054382, "learning_rate": 1.2862857261451395e-06, "loss": 0.0037, "step": 5925 }, { "epoch": 3.605719501064801, "grad_norm": 0.2350999116897583, "learning_rate": 1.282346965176867e-06, "loss": 0.0021, "step": 5926 }, { "epoch": 3.606327958624886, "grad_norm": 0.33141762018203735, "learning_rate": 1.2784140851979404e-06, "loss": 0.0068, "step": 5927 }, { "epoch": 3.606936416184971, "grad_norm": 0.2573280334472656, "learning_rate": 1.2744870871835623e-06, "loss": 0.0072, "step": 5928 }, { "epoch": 3.607544873745056, "grad_norm": 0.21989892423152924, "learning_rate": 1.270565972107457e-06, "loss": 0.0088, "step": 5929 }, { "epoch": 3.6081533313051413, "grad_norm": 0.2301366925239563, "learning_rate": 1.2666507409418993e-06, "loss": 0.0065, "step": 5930 }, { "epoch": 3.608761788865227, "grad_norm": 0.18148985505104065, "learning_rate": 1.2627413946576989e-06, "loss": 0.0043, "step": 5931 }, { "epoch": 3.609370246425312, "grad_norm": 0.24122391641139984, "learning_rate": 1.2588379342242218e-06, "loss": 0.0051, "step": 5932 }, { "epoch": 3.609978703985397, "grad_norm": 0.4258638322353363, "learning_rate": 1.2549403606093525e-06, "loss": 0.0056, "step": 5933 }, { "epoch": 3.610587161545482, "grad_norm": 0.2916209399700165, "learning_rate": 1.2510486747795286e-06, "loss": 0.006, "step": 5934 }, { "epoch": 3.611195619105567, "grad_norm": 0.2829422056674957, "learning_rate": 1.2471628776997312e-06, "loss": 0.0039, "step": 5935 }, { "epoch": 3.6118040766656527, "grad_norm": 0.2982374429702759, "learning_rate": 1.2432829703334759e-06, "loss": 0.006, "step": 5936 }, { "epoch": 3.612412534225738, "grad_norm": 0.19474712014198303, "learning_rate": 1.2394089536428067e-06, "loss": 0.006, "step": 5937 }, { "epoch": 3.613020991785823, "grad_norm": 0.2293567806482315, "learning_rate": 1.2355408285883357e-06, "loss": 0.0076, "step": 5938 }, { "epoch": 3.613629449345908, "grad_norm": 0.26767081022262573, "learning_rate": 1.2316785961291849e-06, "loss": 0.0035, "step": 5939 }, { "epoch": 3.614237906905993, "grad_norm": 0.2060486376285553, "learning_rate": 1.2278222572230268e-06, "loss": 0.0043, "step": 5940 }, { "epoch": 3.6148463644660787, "grad_norm": 0.21085241436958313, "learning_rate": 1.2239718128260774e-06, "loss": 0.0062, "step": 5941 }, { "epoch": 3.6154548220261638, "grad_norm": 0.24178573489189148, "learning_rate": 1.2201272638930894e-06, "loss": 0.0058, "step": 5942 }, { "epoch": 3.616063279586249, "grad_norm": 0.20811717212200165, "learning_rate": 1.216288611377342e-06, "loss": 0.0049, "step": 5943 }, { "epoch": 3.616671737146334, "grad_norm": 0.3802126348018646, "learning_rate": 1.2124558562306625e-06, "loss": 0.0064, "step": 5944 }, { "epoch": 3.617280194706419, "grad_norm": 0.2511650323867798, "learning_rate": 1.2086289994034217e-06, "loss": 0.0051, "step": 5945 }, { "epoch": 3.6178886522665046, "grad_norm": 0.27000531554222107, "learning_rate": 1.204808041844513e-06, "loss": 0.0072, "step": 5946 }, { "epoch": 3.6184971098265897, "grad_norm": 0.26983216404914856, "learning_rate": 1.2009929845013757e-06, "loss": 0.0072, "step": 5947 }, { "epoch": 3.6191055673866748, "grad_norm": 0.19793999195098877, "learning_rate": 1.197183828319992e-06, "loss": 0.0045, "step": 5948 }, { "epoch": 3.61971402494676, "grad_norm": 0.238411545753479, "learning_rate": 1.193380574244865e-06, "loss": 0.0068, "step": 5949 }, { "epoch": 3.620322482506845, "grad_norm": 0.1789894700050354, "learning_rate": 1.1895832232190485e-06, "loss": 0.0026, "step": 5950 }, { "epoch": 3.6209309400669305, "grad_norm": 0.27357521653175354, "learning_rate": 1.1857917761841225e-06, "loss": 0.0085, "step": 5951 }, { "epoch": 3.6215393976270156, "grad_norm": 0.31104499101638794, "learning_rate": 1.182006234080213e-06, "loss": 0.0063, "step": 5952 }, { "epoch": 3.6221478551871007, "grad_norm": 0.23851986229419708, "learning_rate": 1.1782265978459771e-06, "loss": 0.0035, "step": 5953 }, { "epoch": 3.622756312747186, "grad_norm": 0.48092883825302124, "learning_rate": 1.1744528684186018e-06, "loss": 0.0056, "step": 5954 }, { "epoch": 3.623364770307271, "grad_norm": 0.19673019647598267, "learning_rate": 1.170685046733816e-06, "loss": 0.0052, "step": 5955 }, { "epoch": 3.6239732278673564, "grad_norm": 0.24021019041538239, "learning_rate": 1.1669231337258862e-06, "loss": 0.0064, "step": 5956 }, { "epoch": 3.6245816854274415, "grad_norm": 0.2394445836544037, "learning_rate": 1.1631671303276054e-06, "loss": 0.0043, "step": 5957 }, { "epoch": 3.6251901429875266, "grad_norm": 0.24804948270320892, "learning_rate": 1.1594170374703088e-06, "loss": 0.0048, "step": 5958 }, { "epoch": 3.6257986005476117, "grad_norm": 0.3146321475505829, "learning_rate": 1.155672856083867e-06, "loss": 0.0046, "step": 5959 }, { "epoch": 3.626407058107697, "grad_norm": 0.2864423990249634, "learning_rate": 1.1519345870966703e-06, "loss": 0.0097, "step": 5960 }, { "epoch": 3.6270155156677824, "grad_norm": 0.26554256677627563, "learning_rate": 1.1482022314356606e-06, "loss": 0.007, "step": 5961 }, { "epoch": 3.6276239732278674, "grad_norm": 0.2531816363334656, "learning_rate": 1.144475790026306e-06, "loss": 0.0054, "step": 5962 }, { "epoch": 3.6282324307879525, "grad_norm": 0.2108616828918457, "learning_rate": 1.1407552637926117e-06, "loss": 0.0055, "step": 5963 }, { "epoch": 3.6288408883480376, "grad_norm": 0.19454440474510193, "learning_rate": 1.1370406536571066e-06, "loss": 0.0038, "step": 5964 }, { "epoch": 3.6294493459081227, "grad_norm": 0.2602161169052124, "learning_rate": 1.1333319605408622e-06, "loss": 0.0079, "step": 5965 }, { "epoch": 3.6300578034682083, "grad_norm": 0.22308175265789032, "learning_rate": 1.1296291853634816e-06, "loss": 0.0068, "step": 5966 }, { "epoch": 3.6306662610282934, "grad_norm": 0.19663815200328827, "learning_rate": 1.1259323290430944e-06, "loss": 0.0054, "step": 5967 }, { "epoch": 3.6312747185883785, "grad_norm": 0.23746468126773834, "learning_rate": 1.1222413924963705e-06, "loss": 0.0043, "step": 5968 }, { "epoch": 3.6318831761484636, "grad_norm": 0.19152231514453888, "learning_rate": 1.1185563766385077e-06, "loss": 0.0048, "step": 5969 }, { "epoch": 3.6324916337085487, "grad_norm": 0.21245577931404114, "learning_rate": 1.1148772823832365e-06, "loss": 0.0034, "step": 5970 }, { "epoch": 3.633100091268634, "grad_norm": 0.29071682691574097, "learning_rate": 1.1112041106428162e-06, "loss": 0.008, "step": 5971 }, { "epoch": 3.6337085488287193, "grad_norm": 0.23585431277751923, "learning_rate": 1.107536862328043e-06, "loss": 0.0071, "step": 5972 }, { "epoch": 3.6343170063888044, "grad_norm": 0.29736191034317017, "learning_rate": 1.1038755383482397e-06, "loss": 0.0092, "step": 5973 }, { "epoch": 3.6349254639488895, "grad_norm": 0.24729853868484497, "learning_rate": 1.100220139611266e-06, "loss": 0.0049, "step": 5974 }, { "epoch": 3.6355339215089746, "grad_norm": 0.15655024349689484, "learning_rate": 1.0965706670235081e-06, "loss": 0.0027, "step": 5975 }, { "epoch": 3.63614237906906, "grad_norm": 0.15523763000965118, "learning_rate": 1.0929271214898755e-06, "loss": 0.0044, "step": 5976 }, { "epoch": 3.6367508366291452, "grad_norm": 0.33551907539367676, "learning_rate": 1.0892895039138234e-06, "loss": 0.0069, "step": 5977 }, { "epoch": 3.6373592941892303, "grad_norm": 0.2048521190881729, "learning_rate": 1.0856578151973246e-06, "loss": 0.0038, "step": 5978 }, { "epoch": 3.6379677517493154, "grad_norm": 0.4035876393318176, "learning_rate": 1.0820320562408947e-06, "loss": 0.013, "step": 5979 }, { "epoch": 3.6385762093094005, "grad_norm": 0.3847244083881378, "learning_rate": 1.0784122279435565e-06, "loss": 0.0125, "step": 5980 }, { "epoch": 3.639184666869486, "grad_norm": 0.2535593807697296, "learning_rate": 1.074798331202892e-06, "loss": 0.0073, "step": 5981 }, { "epoch": 3.639793124429571, "grad_norm": 0.3089277148246765, "learning_rate": 1.0711903669149843e-06, "loss": 0.0079, "step": 5982 }, { "epoch": 3.6404015819896562, "grad_norm": 0.3164043724536896, "learning_rate": 1.067588335974465e-06, "loss": 0.0064, "step": 5983 }, { "epoch": 3.6410100395497413, "grad_norm": 0.30803948640823364, "learning_rate": 1.0639922392744889e-06, "loss": 0.0103, "step": 5984 }, { "epoch": 3.6416184971098264, "grad_norm": 0.32882362604141235, "learning_rate": 1.0604020777067347e-06, "loss": 0.0082, "step": 5985 }, { "epoch": 3.642226954669912, "grad_norm": 0.9143747091293335, "learning_rate": 1.0568178521614125e-06, "loss": 0.0232, "step": 5986 }, { "epoch": 3.642835412229997, "grad_norm": 0.43392911553382874, "learning_rate": 1.0532395635272613e-06, "loss": 0.0092, "step": 5987 }, { "epoch": 3.643443869790082, "grad_norm": 0.18668395280838013, "learning_rate": 1.0496672126915492e-06, "loss": 0.0029, "step": 5988 }, { "epoch": 3.6440523273501673, "grad_norm": 0.2678108513355255, "learning_rate": 1.046100800540073e-06, "loss": 0.0064, "step": 5989 }, { "epoch": 3.6446607849102524, "grad_norm": 0.30396801233291626, "learning_rate": 1.0425403279571422e-06, "loss": 0.0081, "step": 5990 }, { "epoch": 3.645269242470338, "grad_norm": 0.17665399610996246, "learning_rate": 1.0389857958256227e-06, "loss": 0.0039, "step": 5991 }, { "epoch": 3.645877700030423, "grad_norm": 0.24349285662174225, "learning_rate": 1.0354372050268762e-06, "loss": 0.003, "step": 5992 }, { "epoch": 3.646486157590508, "grad_norm": 0.22671709954738617, "learning_rate": 1.03189455644081e-06, "loss": 0.0066, "step": 5993 }, { "epoch": 3.647094615150593, "grad_norm": 0.2378721684217453, "learning_rate": 1.0283578509458548e-06, "loss": 0.0059, "step": 5994 }, { "epoch": 3.6477030727106783, "grad_norm": 0.3115555942058563, "learning_rate": 1.0248270894189698e-06, "loss": 0.0051, "step": 5995 }, { "epoch": 3.648311530270764, "grad_norm": 0.3361762464046478, "learning_rate": 1.0213022727356247e-06, "loss": 0.0077, "step": 5996 }, { "epoch": 3.648919987830849, "grad_norm": 0.27221810817718506, "learning_rate": 1.0177834017698423e-06, "loss": 0.0068, "step": 5997 }, { "epoch": 3.649528445390934, "grad_norm": 0.2501876652240753, "learning_rate": 1.0142704773941443e-06, "loss": 0.0063, "step": 5998 }, { "epoch": 3.650136902951019, "grad_norm": 0.23100760579109192, "learning_rate": 1.0107635004795946e-06, "loss": 0.0052, "step": 5999 }, { "epoch": 3.650745360511104, "grad_norm": 0.18547479808330536, "learning_rate": 1.007262471895773e-06, "loss": 0.004, "step": 6000 }, { "epoch": 3.6513538180711897, "grad_norm": 0.22204366326332092, "learning_rate": 1.003767392510796e-06, "loss": 0.0041, "step": 6001 }, { "epoch": 3.651962275631275, "grad_norm": 0.1914484053850174, "learning_rate": 1.00027826319129e-06, "loss": 0.0057, "step": 6002 }, { "epoch": 3.65257073319136, "grad_norm": 0.18928459286689758, "learning_rate": 9.967950848024183e-07, "loss": 0.0041, "step": 6003 }, { "epoch": 3.653179190751445, "grad_norm": 0.2991024851799011, "learning_rate": 9.933178582078624e-07, "loss": 0.005, "step": 6004 }, { "epoch": 3.65378764831153, "grad_norm": 0.12466610223054886, "learning_rate": 9.898465842698323e-07, "loss": 0.0017, "step": 6005 }, { "epoch": 3.6543961058716157, "grad_norm": 0.5989731550216675, "learning_rate": 9.863812638490511e-07, "loss": 0.016, "step": 6006 }, { "epoch": 3.6550045634317008, "grad_norm": 0.22727476060390472, "learning_rate": 9.829218978047839e-07, "loss": 0.0059, "step": 6007 }, { "epoch": 3.655613020991786, "grad_norm": 0.26156729459762573, "learning_rate": 9.794684869948056e-07, "loss": 0.0066, "step": 6008 }, { "epoch": 3.656221478551871, "grad_norm": 0.24984483420848846, "learning_rate": 9.760210322754175e-07, "loss": 0.0052, "step": 6009 }, { "epoch": 3.656829936111956, "grad_norm": 0.1444278508424759, "learning_rate": 9.725795345014387e-07, "loss": 0.0034, "step": 6010 }, { "epoch": 3.6574383936720416, "grad_norm": 0.2517237663269043, "learning_rate": 9.69143994526231e-07, "loss": 0.0065, "step": 6011 }, { "epoch": 3.6580468512321267, "grad_norm": 0.2072015106678009, "learning_rate": 9.657144132016517e-07, "loss": 0.0028, "step": 6012 }, { "epoch": 3.658655308792212, "grad_norm": 0.21760626137256622, "learning_rate": 9.62290791378101e-07, "loss": 0.0055, "step": 6013 }, { "epoch": 3.659263766352297, "grad_norm": 0.22900134325027466, "learning_rate": 9.588731299044945e-07, "loss": 0.0089, "step": 6014 }, { "epoch": 3.659872223912382, "grad_norm": 0.23601651191711426, "learning_rate": 9.554614296282682e-07, "loss": 0.008, "step": 6015 }, { "epoch": 3.6604806814724675, "grad_norm": 0.20876143872737885, "learning_rate": 9.52055691395376e-07, "loss": 0.0056, "step": 6016 }, { "epoch": 3.6610891390325526, "grad_norm": 0.20409031212329865, "learning_rate": 9.486559160503117e-07, "loss": 0.0051, "step": 6017 }, { "epoch": 3.6616975965926377, "grad_norm": 0.17437663674354553, "learning_rate": 9.452621044360676e-07, "loss": 0.0045, "step": 6018 }, { "epoch": 3.662306054152723, "grad_norm": 0.7772207260131836, "learning_rate": 9.418742573941707e-07, "loss": 0.042, "step": 6019 }, { "epoch": 3.662914511712808, "grad_norm": 0.21668609976768494, "learning_rate": 9.384923757646657e-07, "loss": 0.0037, "step": 6020 }, { "epoch": 3.6635229692728934, "grad_norm": 0.22653992474079132, "learning_rate": 9.351164603861234e-07, "loss": 0.0067, "step": 6021 }, { "epoch": 3.6641314268329785, "grad_norm": 0.19564014673233032, "learning_rate": 9.317465120956215e-07, "loss": 0.0054, "step": 6022 }, { "epoch": 3.6647398843930636, "grad_norm": 0.17543290555477142, "learning_rate": 9.283825317287692e-07, "loss": 0.0033, "step": 6023 }, { "epoch": 3.6653483419531487, "grad_norm": 0.16604088246822357, "learning_rate": 9.250245201196938e-07, "loss": 0.0054, "step": 6024 }, { "epoch": 3.665956799513234, "grad_norm": 0.25378942489624023, "learning_rate": 9.216724781010461e-07, "loss": 0.005, "step": 6025 }, { "epoch": 3.6665652570733194, "grad_norm": 0.2088557928800583, "learning_rate": 9.183264065039859e-07, "loss": 0.0033, "step": 6026 }, { "epoch": 3.6671737146334045, "grad_norm": 0.22516481578350067, "learning_rate": 9.149863061582053e-07, "loss": 0.0092, "step": 6027 }, { "epoch": 3.6677821721934896, "grad_norm": 0.31587862968444824, "learning_rate": 9.116521778919085e-07, "loss": 0.0089, "step": 6028 }, { "epoch": 3.6683906297535747, "grad_norm": 0.34137919545173645, "learning_rate": 9.083240225318202e-07, "loss": 0.0135, "step": 6029 }, { "epoch": 3.6689990873136598, "grad_norm": 0.21958407759666443, "learning_rate": 9.050018409031801e-07, "loss": 0.0044, "step": 6030 }, { "epoch": 3.6696075448737453, "grad_norm": 0.32528388500213623, "learning_rate": 9.016856338297602e-07, "loss": 0.0076, "step": 6031 }, { "epoch": 3.6702160024338304, "grad_norm": 0.21904724836349487, "learning_rate": 8.983754021338331e-07, "loss": 0.0048, "step": 6032 }, { "epoch": 3.6708244599939155, "grad_norm": 0.3020154535770416, "learning_rate": 8.950711466362005e-07, "loss": 0.006, "step": 6033 }, { "epoch": 3.6714329175540006, "grad_norm": 0.2894566059112549, "learning_rate": 8.917728681561793e-07, "loss": 0.0057, "step": 6034 }, { "epoch": 3.6720413751140857, "grad_norm": 0.2588036060333252, "learning_rate": 8.884805675116098e-07, "loss": 0.0056, "step": 6035 }, { "epoch": 3.672649832674171, "grad_norm": 0.2685299813747406, "learning_rate": 8.851942455188362e-07, "loss": 0.007, "step": 6036 }, { "epoch": 3.6732582902342563, "grad_norm": 0.19193175435066223, "learning_rate": 8.819139029927425e-07, "loss": 0.0032, "step": 6037 }, { "epoch": 3.6738667477943414, "grad_norm": 0.20610147714614868, "learning_rate": 8.786395407467062e-07, "loss": 0.0036, "step": 6038 }, { "epoch": 3.6744752053544265, "grad_norm": 0.17886589467525482, "learning_rate": 8.753711595926334e-07, "loss": 0.0046, "step": 6039 }, { "epoch": 3.6750836629145116, "grad_norm": 0.2853025794029236, "learning_rate": 8.721087603409506e-07, "loss": 0.0057, "step": 6040 }, { "epoch": 3.675692120474597, "grad_norm": 0.22553856670856476, "learning_rate": 8.688523438005996e-07, "loss": 0.0043, "step": 6041 }, { "epoch": 3.6763005780346822, "grad_norm": 0.20292532444000244, "learning_rate": 8.656019107790237e-07, "loss": 0.0061, "step": 6042 }, { "epoch": 3.6769090355947673, "grad_norm": 0.24954308569431305, "learning_rate": 8.623574620822083e-07, "loss": 0.0065, "step": 6043 }, { "epoch": 3.6775174931548524, "grad_norm": 0.36442530155181885, "learning_rate": 8.591189985146352e-07, "loss": 0.0074, "step": 6044 }, { "epoch": 3.6781259507149375, "grad_norm": 0.26411545276641846, "learning_rate": 8.558865208793093e-07, "loss": 0.0049, "step": 6045 }, { "epoch": 3.678734408275023, "grad_norm": 0.2690979838371277, "learning_rate": 8.526600299777448e-07, "loss": 0.0055, "step": 6046 }, { "epoch": 3.679342865835108, "grad_norm": 0.14458657801151276, "learning_rate": 8.49439526609988e-07, "loss": 0.0029, "step": 6047 }, { "epoch": 3.6799513233951933, "grad_norm": 0.2396114468574524, "learning_rate": 8.462250115745807e-07, "loss": 0.0076, "step": 6048 }, { "epoch": 3.6805597809552784, "grad_norm": 0.22277654707431793, "learning_rate": 8.430164856685935e-07, "loss": 0.0046, "step": 6049 }, { "epoch": 3.6811682385153635, "grad_norm": 0.22810159623622894, "learning_rate": 8.398139496876012e-07, "loss": 0.0072, "step": 6050 }, { "epoch": 3.681776696075449, "grad_norm": 0.21732278168201447, "learning_rate": 8.366174044257103e-07, "loss": 0.0048, "step": 6051 }, { "epoch": 3.6823851536355336, "grad_norm": 0.22662782669067383, "learning_rate": 8.334268506755144e-07, "loss": 0.0038, "step": 6052 }, { "epoch": 3.682993611195619, "grad_norm": 0.26345643401145935, "learning_rate": 8.302422892281558e-07, "loss": 0.0067, "step": 6053 }, { "epoch": 3.6836020687557043, "grad_norm": 0.24986715614795685, "learning_rate": 8.270637208732585e-07, "loss": 0.0056, "step": 6054 }, { "epoch": 3.6842105263157894, "grad_norm": 0.31368815898895264, "learning_rate": 8.238911463989835e-07, "loss": 0.0067, "step": 6055 }, { "epoch": 3.684818983875875, "grad_norm": 0.6407575607299805, "learning_rate": 8.207245665919932e-07, "loss": 0.0278, "step": 6056 }, { "epoch": 3.6854274414359596, "grad_norm": 0.259106308221817, "learning_rate": 8.175639822374709e-07, "loss": 0.0082, "step": 6057 }, { "epoch": 3.686035898996045, "grad_norm": 0.1873266100883484, "learning_rate": 8.144093941191061e-07, "loss": 0.0042, "step": 6058 }, { "epoch": 3.68664435655613, "grad_norm": 0.3827762007713318, "learning_rate": 8.112608030191037e-07, "loss": 0.0117, "step": 6059 }, { "epoch": 3.6872528141162153, "grad_norm": 0.3754812479019165, "learning_rate": 8.081182097181894e-07, "loss": 0.0145, "step": 6060 }, { "epoch": 3.687861271676301, "grad_norm": 0.26298126578330994, "learning_rate": 8.049816149955896e-07, "loss": 0.0056, "step": 6061 }, { "epoch": 3.6884697292363855, "grad_norm": 0.1487468034029007, "learning_rate": 8.018510196290519e-07, "loss": 0.0029, "step": 6062 }, { "epoch": 3.689078186796471, "grad_norm": 0.30363762378692627, "learning_rate": 7.987264243948356e-07, "loss": 0.0086, "step": 6063 }, { "epoch": 3.689686644356556, "grad_norm": 0.1642022281885147, "learning_rate": 7.956078300677045e-07, "loss": 0.0029, "step": 6064 }, { "epoch": 3.6902951019166412, "grad_norm": 0.2749357521533966, "learning_rate": 7.924952374209399e-07, "loss": 0.008, "step": 6065 }, { "epoch": 3.6909035594767268, "grad_norm": 0.14298266172409058, "learning_rate": 7.893886472263412e-07, "loss": 0.0026, "step": 6066 }, { "epoch": 3.6915120170368114, "grad_norm": 0.2198246717453003, "learning_rate": 7.862880602542116e-07, "loss": 0.005, "step": 6067 }, { "epoch": 3.692120474596897, "grad_norm": 0.21324285864830017, "learning_rate": 7.83193477273364e-07, "loss": 0.0043, "step": 6068 }, { "epoch": 3.692728932156982, "grad_norm": 0.3092917203903198, "learning_rate": 7.801048990511262e-07, "loss": 0.0179, "step": 6069 }, { "epoch": 3.693337389717067, "grad_norm": 0.24335362017154694, "learning_rate": 7.770223263533411e-07, "loss": 0.0066, "step": 6070 }, { "epoch": 3.6939458472771527, "grad_norm": 0.27647829055786133, "learning_rate": 7.739457599443528e-07, "loss": 0.0069, "step": 6071 }, { "epoch": 3.6945543048372373, "grad_norm": 0.27333924174308777, "learning_rate": 7.708752005870263e-07, "loss": 0.0076, "step": 6072 }, { "epoch": 3.695162762397323, "grad_norm": 0.21789668500423431, "learning_rate": 7.6781064904273e-07, "loss": 0.0057, "step": 6073 }, { "epoch": 3.695771219957408, "grad_norm": 0.25622355937957764, "learning_rate": 7.647521060713452e-07, "loss": 0.0054, "step": 6074 }, { "epoch": 3.696379677517493, "grad_norm": 0.28123655915260315, "learning_rate": 7.616995724312626e-07, "loss": 0.0085, "step": 6075 }, { "epoch": 3.696988135077578, "grad_norm": 0.2564314305782318, "learning_rate": 7.586530488793847e-07, "loss": 0.0069, "step": 6076 }, { "epoch": 3.6975965926376633, "grad_norm": 0.30161136388778687, "learning_rate": 7.556125361711214e-07, "loss": 0.0059, "step": 6077 }, { "epoch": 3.698205050197749, "grad_norm": 0.17149339616298676, "learning_rate": 7.525780350603917e-07, "loss": 0.0035, "step": 6078 }, { "epoch": 3.698813507757834, "grad_norm": 0.2607922852039337, "learning_rate": 7.495495462996327e-07, "loss": 0.0156, "step": 6079 }, { "epoch": 3.699421965317919, "grad_norm": 0.30373069643974304, "learning_rate": 7.465270706397714e-07, "loss": 0.0071, "step": 6080 }, { "epoch": 3.700030422878004, "grad_norm": 0.17537471652030945, "learning_rate": 7.43510608830264e-07, "loss": 0.0047, "step": 6081 }, { "epoch": 3.700638880438089, "grad_norm": 0.2823267877101898, "learning_rate": 7.405001616190649e-07, "loss": 0.0044, "step": 6082 }, { "epoch": 3.7012473379981747, "grad_norm": 0.3912118375301361, "learning_rate": 7.374957297526408e-07, "loss": 0.006, "step": 6083 }, { "epoch": 3.70185579555826, "grad_norm": 0.2731781303882599, "learning_rate": 7.344973139759654e-07, "loss": 0.0076, "step": 6084 }, { "epoch": 3.702464253118345, "grad_norm": 0.23557397723197937, "learning_rate": 7.315049150325187e-07, "loss": 0.0049, "step": 6085 }, { "epoch": 3.70307271067843, "grad_norm": 0.19650277495384216, "learning_rate": 7.285185336642908e-07, "loss": 0.0042, "step": 6086 }, { "epoch": 3.703681168238515, "grad_norm": 0.18600913882255554, "learning_rate": 7.255381706117837e-07, "loss": 0.0031, "step": 6087 }, { "epoch": 3.7042896257986007, "grad_norm": 0.21114131808280945, "learning_rate": 7.225638266140006e-07, "loss": 0.0051, "step": 6088 }, { "epoch": 3.7048980833586858, "grad_norm": 0.2112278938293457, "learning_rate": 7.195955024084544e-07, "loss": 0.0045, "step": 6089 }, { "epoch": 3.705506540918771, "grad_norm": 0.16948889195919037, "learning_rate": 7.166331987311675e-07, "loss": 0.0044, "step": 6090 }, { "epoch": 3.706114998478856, "grad_norm": 0.19342589378356934, "learning_rate": 7.136769163166662e-07, "loss": 0.0037, "step": 6091 }, { "epoch": 3.706723456038941, "grad_norm": 0.2661781311035156, "learning_rate": 7.107266558979864e-07, "loss": 0.0043, "step": 6092 }, { "epoch": 3.7073319135990266, "grad_norm": 0.23141947388648987, "learning_rate": 7.077824182066678e-07, "loss": 0.0044, "step": 6093 }, { "epoch": 3.7079403711591117, "grad_norm": 0.25309503078460693, "learning_rate": 7.048442039727627e-07, "loss": 0.004, "step": 6094 }, { "epoch": 3.7085488287191968, "grad_norm": 0.2142305076122284, "learning_rate": 7.019120139248187e-07, "loss": 0.005, "step": 6095 }, { "epoch": 3.709157286279282, "grad_norm": 0.2936217188835144, "learning_rate": 6.989858487899043e-07, "loss": 0.0058, "step": 6096 }, { "epoch": 3.709765743839367, "grad_norm": 0.22409573197364807, "learning_rate": 6.960657092935807e-07, "loss": 0.0069, "step": 6097 }, { "epoch": 3.7103742013994525, "grad_norm": 0.21282601356506348, "learning_rate": 6.931515961599244e-07, "loss": 0.0028, "step": 6098 }, { "epoch": 3.7109826589595376, "grad_norm": 0.22673866152763367, "learning_rate": 6.902435101115129e-07, "loss": 0.0061, "step": 6099 }, { "epoch": 3.7115911165196227, "grad_norm": 0.181802436709404, "learning_rate": 6.873414518694332e-07, "loss": 0.0035, "step": 6100 }, { "epoch": 3.712199574079708, "grad_norm": 0.24081139266490936, "learning_rate": 6.844454221532682e-07, "loss": 0.0051, "step": 6101 }, { "epoch": 3.712808031639793, "grad_norm": 0.19030770659446716, "learning_rate": 6.815554216811182e-07, "loss": 0.0031, "step": 6102 }, { "epoch": 3.7134164891998784, "grad_norm": 0.2784750163555145, "learning_rate": 6.786714511695796e-07, "loss": 0.0089, "step": 6103 }, { "epoch": 3.7140249467599635, "grad_norm": 0.26827266812324524, "learning_rate": 6.75793511333761e-07, "loss": 0.0047, "step": 6104 }, { "epoch": 3.7146334043200486, "grad_norm": 0.2427385449409485, "learning_rate": 6.729216028872637e-07, "loss": 0.0047, "step": 6105 }, { "epoch": 3.7152418618801337, "grad_norm": 0.19240151345729828, "learning_rate": 6.700557265422097e-07, "loss": 0.0039, "step": 6106 }, { "epoch": 3.715850319440219, "grad_norm": 0.27845215797424316, "learning_rate": 6.67195883009214e-07, "loss": 0.0066, "step": 6107 }, { "epoch": 3.7164587770003044, "grad_norm": 0.23421838879585266, "learning_rate": 6.643420729973954e-07, "loss": 0.0073, "step": 6108 }, { "epoch": 3.7170672345603895, "grad_norm": 0.23832927644252777, "learning_rate": 6.614942972143822e-07, "loss": 0.0045, "step": 6109 }, { "epoch": 3.7176756921204746, "grad_norm": 0.47682875394821167, "learning_rate": 6.586525563663099e-07, "loss": 0.0115, "step": 6110 }, { "epoch": 3.7182841496805596, "grad_norm": 0.2529081702232361, "learning_rate": 6.558168511577978e-07, "loss": 0.0061, "step": 6111 }, { "epoch": 3.7188926072406447, "grad_norm": 0.3189617693424225, "learning_rate": 6.529871822919975e-07, "loss": 0.0086, "step": 6112 }, { "epoch": 3.7195010648007303, "grad_norm": 0.21395058929920197, "learning_rate": 6.501635504705422e-07, "loss": 0.0042, "step": 6113 }, { "epoch": 3.7201095223608154, "grad_norm": 0.1643410176038742, "learning_rate": 6.473459563935747e-07, "loss": 0.0024, "step": 6114 }, { "epoch": 3.7207179799209005, "grad_norm": 0.15923701226711273, "learning_rate": 6.445344007597387e-07, "loss": 0.0026, "step": 6115 }, { "epoch": 3.7213264374809856, "grad_norm": 0.2772526741027832, "learning_rate": 6.417288842661878e-07, "loss": 0.0062, "step": 6116 }, { "epoch": 3.7219348950410707, "grad_norm": 0.21529936790466309, "learning_rate": 6.389294076085684e-07, "loss": 0.0061, "step": 6117 }, { "epoch": 3.722543352601156, "grad_norm": 0.23362010717391968, "learning_rate": 6.361359714810389e-07, "loss": 0.0059, "step": 6118 }, { "epoch": 3.7231518101612413, "grad_norm": 0.23200243711471558, "learning_rate": 6.333485765762509e-07, "loss": 0.0065, "step": 6119 }, { "epoch": 3.7237602677213264, "grad_norm": 0.22773078083992004, "learning_rate": 6.305672235853682e-07, "loss": 0.0042, "step": 6120 }, { "epoch": 3.7243687252814115, "grad_norm": 0.29405245184898376, "learning_rate": 6.277919131980392e-07, "loss": 0.0076, "step": 6121 }, { "epoch": 3.7249771828414966, "grad_norm": 0.16285215318202972, "learning_rate": 6.250226461024383e-07, "loss": 0.0032, "step": 6122 }, { "epoch": 3.725585640401582, "grad_norm": 0.2497759461402893, "learning_rate": 6.222594229852163e-07, "loss": 0.0056, "step": 6123 }, { "epoch": 3.7261940979616672, "grad_norm": 0.18337193131446838, "learning_rate": 6.195022445315474e-07, "loss": 0.0032, "step": 6124 }, { "epoch": 3.7268025555217523, "grad_norm": 0.2235439568758011, "learning_rate": 6.167511114250901e-07, "loss": 0.0057, "step": 6125 }, { "epoch": 3.7274110130818374, "grad_norm": 0.26169970631599426, "learning_rate": 6.140060243480156e-07, "loss": 0.0074, "step": 6126 }, { "epoch": 3.7280194706419225, "grad_norm": 0.41065341234207153, "learning_rate": 6.112669839809876e-07, "loss": 0.0077, "step": 6127 }, { "epoch": 3.728627928202008, "grad_norm": 0.22760555148124695, "learning_rate": 6.08533991003174e-07, "loss": 0.0051, "step": 6128 }, { "epoch": 3.729236385762093, "grad_norm": 0.3023330867290497, "learning_rate": 6.058070460922466e-07, "loss": 0.0043, "step": 6129 }, { "epoch": 3.7298448433221782, "grad_norm": 0.17906133830547333, "learning_rate": 6.030861499243701e-07, "loss": 0.0027, "step": 6130 }, { "epoch": 3.7304533008822633, "grad_norm": 0.18436305224895477, "learning_rate": 6.003713031742131e-07, "loss": 0.0043, "step": 6131 }, { "epoch": 3.7310617584423484, "grad_norm": 0.16411884129047394, "learning_rate": 5.97662506514951e-07, "loss": 0.003, "step": 6132 }, { "epoch": 3.731670216002434, "grad_norm": 0.28377214074134827, "learning_rate": 5.949597606182439e-07, "loss": 0.0052, "step": 6133 }, { "epoch": 3.732278673562519, "grad_norm": 0.24661727249622345, "learning_rate": 5.922630661542639e-07, "loss": 0.0072, "step": 6134 }, { "epoch": 3.732887131122604, "grad_norm": 0.23685060441493988, "learning_rate": 5.895724237916816e-07, "loss": 0.0057, "step": 6135 }, { "epoch": 3.7334955886826893, "grad_norm": 0.24538807570934296, "learning_rate": 5.868878341976608e-07, "loss": 0.006, "step": 6136 }, { "epoch": 3.7341040462427744, "grad_norm": 0.23321305215358734, "learning_rate": 5.842092980378688e-07, "loss": 0.0036, "step": 6137 }, { "epoch": 3.73471250380286, "grad_norm": 0.2515392005443573, "learning_rate": 5.815368159764689e-07, "loss": 0.0044, "step": 6138 }, { "epoch": 3.735320961362945, "grad_norm": 0.3265810012817383, "learning_rate": 5.788703886761254e-07, "loss": 0.0064, "step": 6139 }, { "epoch": 3.73592941892303, "grad_norm": 0.2389630675315857, "learning_rate": 5.762100167980067e-07, "loss": 0.0038, "step": 6140 }, { "epoch": 3.736537876483115, "grad_norm": 0.26586848497390747, "learning_rate": 5.735557010017656e-07, "loss": 0.0071, "step": 6141 }, { "epoch": 3.7371463340432003, "grad_norm": 0.1601967066526413, "learning_rate": 5.709074419455701e-07, "loss": 0.0019, "step": 6142 }, { "epoch": 3.737754791603286, "grad_norm": 0.29823827743530273, "learning_rate": 5.682652402860727e-07, "loss": 0.0091, "step": 6143 }, { "epoch": 3.738363249163371, "grad_norm": 0.24698449671268463, "learning_rate": 5.6562909667843e-07, "loss": 0.0061, "step": 6144 }, { "epoch": 3.738971706723456, "grad_norm": 0.23663544654846191, "learning_rate": 5.629990117762968e-07, "loss": 0.0065, "step": 6145 }, { "epoch": 3.739580164283541, "grad_norm": 0.300060898065567, "learning_rate": 5.603749862318292e-07, "loss": 0.0047, "step": 6146 }, { "epoch": 3.740188621843626, "grad_norm": 0.13921256363391876, "learning_rate": 5.577570206956623e-07, "loss": 0.0032, "step": 6147 }, { "epoch": 3.7407970794037118, "grad_norm": 0.15815310180187225, "learning_rate": 5.551451158169602e-07, "loss": 0.0035, "step": 6148 }, { "epoch": 3.741405536963797, "grad_norm": 0.21306031942367554, "learning_rate": 5.52539272243352e-07, "loss": 0.0043, "step": 6149 }, { "epoch": 3.742013994523882, "grad_norm": 0.2478276789188385, "learning_rate": 5.499394906209876e-07, "loss": 0.0076, "step": 6150 }, { "epoch": 3.742622452083967, "grad_norm": 0.18793390691280365, "learning_rate": 5.473457715944957e-07, "loss": 0.0035, "step": 6151 }, { "epoch": 3.743230909644052, "grad_norm": 0.2907355725765228, "learning_rate": 5.447581158070203e-07, "loss": 0.0041, "step": 6152 }, { "epoch": 3.7438393672041377, "grad_norm": 0.31171515583992004, "learning_rate": 5.42176523900187e-07, "loss": 0.0065, "step": 6153 }, { "epoch": 3.7444478247642228, "grad_norm": 0.27889809012413025, "learning_rate": 5.396009965141197e-07, "loss": 0.0062, "step": 6154 }, { "epoch": 3.745056282324308, "grad_norm": 0.2419702708721161, "learning_rate": 5.370315342874494e-07, "loss": 0.0047, "step": 6155 }, { "epoch": 3.745664739884393, "grad_norm": 0.22070972621440887, "learning_rate": 5.344681378572913e-07, "loss": 0.0048, "step": 6156 }, { "epoch": 3.746273197444478, "grad_norm": 0.2477099597454071, "learning_rate": 5.319108078592567e-07, "loss": 0.0039, "step": 6157 }, { "epoch": 3.7468816550045636, "grad_norm": 0.1542275846004486, "learning_rate": 5.293595449274685e-07, "loss": 0.0033, "step": 6158 }, { "epoch": 3.7474901125646487, "grad_norm": 0.17303107678890228, "learning_rate": 5.268143496945239e-07, "loss": 0.0031, "step": 6159 }, { "epoch": 3.748098570124734, "grad_norm": 0.26074641942977905, "learning_rate": 5.242752227915287e-07, "loss": 0.0071, "step": 6160 }, { "epoch": 3.748707027684819, "grad_norm": 0.20203956961631775, "learning_rate": 5.217421648480769e-07, "loss": 0.0051, "step": 6161 }, { "epoch": 3.749315485244904, "grad_norm": 0.18085290491580963, "learning_rate": 5.192151764922659e-07, "loss": 0.0027, "step": 6162 }, { "epoch": 3.7499239428049895, "grad_norm": 0.31731289625167847, "learning_rate": 5.166942583506806e-07, "loss": 0.0055, "step": 6163 }, { "epoch": 3.7505324003650746, "grad_norm": 0.1756841391324997, "learning_rate": 5.141794110484071e-07, "loss": 0.0023, "step": 6164 }, { "epoch": 3.7511408579251597, "grad_norm": 0.17156068980693817, "learning_rate": 5.116706352090189e-07, "loss": 0.003, "step": 6165 }, { "epoch": 3.751749315485245, "grad_norm": 0.2008720338344574, "learning_rate": 5.091679314545905e-07, "loss": 0.0039, "step": 6166 }, { "epoch": 3.75235777304533, "grad_norm": 0.1687059849500656, "learning_rate": 5.066713004056839e-07, "loss": 0.0052, "step": 6167 }, { "epoch": 3.7529662306054155, "grad_norm": 0.2529749572277069, "learning_rate": 5.041807426813649e-07, "loss": 0.0085, "step": 6168 }, { "epoch": 3.7535746881655006, "grad_norm": 0.2726878225803375, "learning_rate": 5.01696258899187e-07, "loss": 0.0089, "step": 6169 }, { "epoch": 3.7541831457255856, "grad_norm": 0.14040397107601166, "learning_rate": 4.992178496751931e-07, "loss": 0.0023, "step": 6170 }, { "epoch": 3.7547916032856707, "grad_norm": 0.2372034788131714, "learning_rate": 4.967455156239337e-07, "loss": 0.0057, "step": 6171 }, { "epoch": 3.755400060845756, "grad_norm": 0.22318537533283234, "learning_rate": 4.942792573584404e-07, "loss": 0.0033, "step": 6172 }, { "epoch": 3.7560085184058414, "grad_norm": 0.22246554493904114, "learning_rate": 4.918190754902408e-07, "loss": 0.0038, "step": 6173 }, { "epoch": 3.7566169759659265, "grad_norm": 0.18072238564491272, "learning_rate": 4.89364970629358e-07, "loss": 0.0038, "step": 6174 }, { "epoch": 3.7572254335260116, "grad_norm": 0.2487388253211975, "learning_rate": 4.86916943384308e-07, "loss": 0.0067, "step": 6175 }, { "epoch": 3.7578338910860967, "grad_norm": 0.1718432605266571, "learning_rate": 4.844749943621052e-07, "loss": 0.0029, "step": 6176 }, { "epoch": 3.7584423486461818, "grad_norm": 0.20204098522663116, "learning_rate": 4.820391241682404e-07, "loss": 0.0034, "step": 6177 }, { "epoch": 3.7590508062062673, "grad_norm": 0.20857053995132446, "learning_rate": 4.796093334067192e-07, "loss": 0.0042, "step": 6178 }, { "epoch": 3.7596592637663524, "grad_norm": 0.2744029760360718, "learning_rate": 4.771856226800209e-07, "loss": 0.0074, "step": 6179 }, { "epoch": 3.7602677213264375, "grad_norm": 0.33221253752708435, "learning_rate": 4.7476799258912574e-07, "loss": 0.0068, "step": 6180 }, { "epoch": 3.7608761788865226, "grad_norm": 0.3299937844276428, "learning_rate": 4.7235644373350707e-07, "loss": 0.0058, "step": 6181 }, { "epoch": 3.7614846364466077, "grad_norm": 0.33729061484336853, "learning_rate": 4.69950976711131e-07, "loss": 0.0079, "step": 6182 }, { "epoch": 3.7620930940066932, "grad_norm": 0.26489993929862976, "learning_rate": 4.675515921184481e-07, "loss": 0.0064, "step": 6183 }, { "epoch": 3.7627015515667783, "grad_norm": 0.14743492007255554, "learning_rate": 4.651582905504048e-07, "loss": 0.0022, "step": 6184 }, { "epoch": 3.7633100091268634, "grad_norm": 0.21888674795627594, "learning_rate": 4.627710726004458e-07, "loss": 0.0074, "step": 6185 }, { "epoch": 3.7639184666869485, "grad_norm": 0.21569287776947021, "learning_rate": 4.6038993886049484e-07, "loss": 0.005, "step": 6186 }, { "epoch": 3.7645269242470336, "grad_norm": 0.22505217790603638, "learning_rate": 4.5801488992098243e-07, "loss": 0.0038, "step": 6187 }, { "epoch": 3.765135381807119, "grad_norm": 0.196213498711586, "learning_rate": 4.5564592637081517e-07, "loss": 0.0047, "step": 6188 }, { "epoch": 3.7657438393672042, "grad_norm": 0.26138564944267273, "learning_rate": 4.5328304879739823e-07, "loss": 0.0047, "step": 6189 }, { "epoch": 3.7663522969272893, "grad_norm": 0.30381667613983154, "learning_rate": 4.509262577866269e-07, "loss": 0.0102, "step": 6190 }, { "epoch": 3.7669607544873744, "grad_norm": 0.17448806762695312, "learning_rate": 4.4857555392288917e-07, "loss": 0.0023, "step": 6191 }, { "epoch": 3.7675692120474595, "grad_norm": 0.2946946620941162, "learning_rate": 4.4623093778906053e-07, "loss": 0.0049, "step": 6192 }, { "epoch": 3.768177669607545, "grad_norm": 0.23686204850673676, "learning_rate": 4.438924099665065e-07, "loss": 0.0048, "step": 6193 }, { "epoch": 3.76878612716763, "grad_norm": 0.2854866087436676, "learning_rate": 4.4155997103508817e-07, "loss": 0.0049, "step": 6194 }, { "epoch": 3.7693945847277153, "grad_norm": 0.22638092935085297, "learning_rate": 4.392336215731513e-07, "loss": 0.0056, "step": 6195 }, { "epoch": 3.7700030422878004, "grad_norm": 0.1855926811695099, "learning_rate": 4.369133621575289e-07, "loss": 0.0034, "step": 6196 }, { "epoch": 3.7706114998478855, "grad_norm": 0.21199901401996613, "learning_rate": 4.3459919336355514e-07, "loss": 0.0035, "step": 6197 }, { "epoch": 3.771219957407971, "grad_norm": 0.1068754643201828, "learning_rate": 4.322911157650433e-07, "loss": 0.0015, "step": 6198 }, { "epoch": 3.771828414968056, "grad_norm": 0.24557361006736755, "learning_rate": 4.2998912993430785e-07, "loss": 0.0054, "step": 6199 }, { "epoch": 3.772436872528141, "grad_norm": 0.3791668117046356, "learning_rate": 4.2769323644213375e-07, "loss": 0.0127, "step": 6200 }, { "epoch": 3.7730453300882263, "grad_norm": 0.19211210310459137, "learning_rate": 4.2540343585781573e-07, "loss": 0.0034, "step": 6201 }, { "epoch": 3.7736537876483114, "grad_norm": 0.23425185680389404, "learning_rate": 4.2311972874912453e-07, "loss": 0.0062, "step": 6202 }, { "epoch": 3.774262245208397, "grad_norm": 0.2918088138103485, "learning_rate": 4.208421156823239e-07, "loss": 0.005, "step": 6203 }, { "epoch": 3.774870702768482, "grad_norm": 0.17967435717582703, "learning_rate": 4.1857059722217316e-07, "loss": 0.0033, "step": 6204 }, { "epoch": 3.775479160328567, "grad_norm": 0.2613130211830139, "learning_rate": 4.1630517393190794e-07, "loss": 0.0055, "step": 6205 }, { "epoch": 3.776087617888652, "grad_norm": 0.3404596149921417, "learning_rate": 4.1404584637325936e-07, "loss": 0.0088, "step": 6206 }, { "epoch": 3.7766960754487373, "grad_norm": 0.286954402923584, "learning_rate": 4.117926151064488e-07, "loss": 0.0082, "step": 6207 }, { "epoch": 3.777304533008823, "grad_norm": 0.19479084014892578, "learning_rate": 4.0954548069018217e-07, "loss": 0.0041, "step": 6208 }, { "epoch": 3.777912990568908, "grad_norm": 0.20791840553283691, "learning_rate": 4.073044436816581e-07, "loss": 0.0048, "step": 6209 }, { "epoch": 3.778521448128993, "grad_norm": 0.2315748929977417, "learning_rate": 4.0506950463655713e-07, "loss": 0.0048, "step": 6210 }, { "epoch": 3.779129905689078, "grad_norm": 0.2659534811973572, "learning_rate": 4.028406641090499e-07, "loss": 0.004, "step": 6211 }, { "epoch": 3.7797383632491632, "grad_norm": 0.2190115749835968, "learning_rate": 4.0061792265179696e-07, "loss": 0.0029, "step": 6212 }, { "epoch": 3.7803468208092488, "grad_norm": 0.20221631228923798, "learning_rate": 3.984012808159493e-07, "loss": 0.0021, "step": 6213 }, { "epoch": 3.780955278369334, "grad_norm": 0.2616676390171051, "learning_rate": 3.961907391511366e-07, "loss": 0.0075, "step": 6214 }, { "epoch": 3.781563735929419, "grad_norm": 0.27306365966796875, "learning_rate": 3.9398629820548703e-07, "loss": 0.0063, "step": 6215 }, { "epoch": 3.782172193489504, "grad_norm": 0.18841589987277985, "learning_rate": 3.9178795852560236e-07, "loss": 0.0023, "step": 6216 }, { "epoch": 3.782780651049589, "grad_norm": 0.27202504873275757, "learning_rate": 3.8959572065658535e-07, "loss": 0.0057, "step": 6217 }, { "epoch": 3.7833891086096747, "grad_norm": 0.24065257608890533, "learning_rate": 3.874095851420151e-07, "loss": 0.0048, "step": 6218 }, { "epoch": 3.78399756616976, "grad_norm": 0.31770059466362, "learning_rate": 3.852295525239663e-07, "loss": 0.0073, "step": 6219 }, { "epoch": 3.784606023729845, "grad_norm": 0.20511983335018158, "learning_rate": 3.830556233429927e-07, "loss": 0.0039, "step": 6220 }, { "epoch": 3.78521448128993, "grad_norm": 0.33700552582740784, "learning_rate": 3.808877981381437e-07, "loss": 0.0049, "step": 6221 }, { "epoch": 3.785822938850015, "grad_norm": 0.2216031849384308, "learning_rate": 3.78726077446942e-07, "loss": 0.0034, "step": 6222 }, { "epoch": 3.7864313964101006, "grad_norm": 0.1690044105052948, "learning_rate": 3.7657046180540887e-07, "loss": 0.0042, "step": 6223 }, { "epoch": 3.7870398539701857, "grad_norm": 0.3780827224254608, "learning_rate": 3.744209517480446e-07, "loss": 0.0068, "step": 6224 }, { "epoch": 3.787648311530271, "grad_norm": 0.2709617018699646, "learning_rate": 3.722775478078422e-07, "loss": 0.0064, "step": 6225 }, { "epoch": 3.788256769090356, "grad_norm": 0.24110420048236847, "learning_rate": 3.70140250516271e-07, "loss": 0.0058, "step": 6226 }, { "epoch": 3.788865226650441, "grad_norm": 0.31499508023262024, "learning_rate": 3.6800906040329595e-07, "loss": 0.0075, "step": 6227 }, { "epoch": 3.7894736842105265, "grad_norm": 0.2634466588497162, "learning_rate": 3.658839779973611e-07, "loss": 0.0054, "step": 6228 }, { "epoch": 3.7900821417706116, "grad_norm": 0.20416542887687683, "learning_rate": 3.637650038254004e-07, "loss": 0.0045, "step": 6229 }, { "epoch": 3.7906905993306967, "grad_norm": 0.3133479356765747, "learning_rate": 3.6165213841282966e-07, "loss": 0.0248, "step": 6230 }, { "epoch": 3.791299056890782, "grad_norm": 0.16795231401920319, "learning_rate": 3.5954538228355205e-07, "loss": 0.0035, "step": 6231 }, { "epoch": 3.791907514450867, "grad_norm": 0.26573193073272705, "learning_rate": 3.5744473595995533e-07, "loss": 0.0077, "step": 6232 }, { "epoch": 3.7925159720109525, "grad_norm": 0.3238775432109833, "learning_rate": 3.5535019996290885e-07, "loss": 0.0108, "step": 6233 }, { "epoch": 3.7931244295710376, "grad_norm": 0.1755254566669464, "learning_rate": 3.5326177481177505e-07, "loss": 0.0038, "step": 6234 }, { "epoch": 3.7937328871311227, "grad_norm": 0.2531315088272095, "learning_rate": 3.5117946102439513e-07, "loss": 0.0048, "step": 6235 }, { "epoch": 3.7943413446912078, "grad_norm": 0.28363898396492004, "learning_rate": 3.491032591170951e-07, "loss": 0.0049, "step": 6236 }, { "epoch": 3.794949802251293, "grad_norm": 0.24325834214687347, "learning_rate": 3.4703316960468524e-07, "loss": 0.0048, "step": 6237 }, { "epoch": 3.7955582598113784, "grad_norm": 0.22880126535892487, "learning_rate": 3.4496919300046617e-07, "loss": 0.0046, "step": 6238 }, { "epoch": 3.7961667173714635, "grad_norm": 0.30837124586105347, "learning_rate": 3.4291132981621174e-07, "loss": 0.002, "step": 6239 }, { "epoch": 3.7967751749315486, "grad_norm": 0.16037946939468384, "learning_rate": 3.408595805621889e-07, "loss": 0.0026, "step": 6240 }, { "epoch": 3.7973836324916337, "grad_norm": 0.1504233032464981, "learning_rate": 3.3881394574715174e-07, "loss": 0.0035, "step": 6241 }, { "epoch": 3.797992090051719, "grad_norm": 0.25224924087524414, "learning_rate": 3.367744258783223e-07, "loss": 0.006, "step": 6242 }, { "epoch": 3.7986005476118043, "grad_norm": 0.25352349877357483, "learning_rate": 3.347410214614211e-07, "loss": 0.0061, "step": 6243 }, { "epoch": 3.7992090051718894, "grad_norm": 0.27149248123168945, "learning_rate": 3.327137330006502e-07, "loss": 0.0158, "step": 6244 }, { "epoch": 3.7998174627319745, "grad_norm": 0.23779939115047455, "learning_rate": 3.3069256099869105e-07, "loss": 0.0074, "step": 6245 }, { "epoch": 3.8004259202920596, "grad_norm": 0.16755706071853638, "learning_rate": 3.2867750595670657e-07, "loss": 0.0032, "step": 6246 }, { "epoch": 3.8010343778521447, "grad_norm": 0.19651807844638824, "learning_rate": 3.266685683743498e-07, "loss": 0.0047, "step": 6247 }, { "epoch": 3.8016428354122302, "grad_norm": 0.277131587266922, "learning_rate": 3.2466574874975565e-07, "loss": 0.0041, "step": 6248 }, { "epoch": 3.8022512929723153, "grad_norm": 0.27917802333831787, "learning_rate": 3.226690475795324e-07, "loss": 0.0071, "step": 6249 }, { "epoch": 3.8028597505324004, "grad_norm": 0.2759116291999817, "learning_rate": 3.206784653587869e-07, "loss": 0.0063, "step": 6250 }, { "epoch": 3.8034682080924855, "grad_norm": 0.37202462553977966, "learning_rate": 3.186940025810992e-07, "loss": 0.0067, "step": 6251 }, { "epoch": 3.8040766656525706, "grad_norm": 0.1423821896314621, "learning_rate": 3.1671565973852567e-07, "loss": 0.0034, "step": 6252 }, { "epoch": 3.804685123212656, "grad_norm": 0.6763217449188232, "learning_rate": 3.14743437321624e-07, "loss": 0.0252, "step": 6253 }, { "epoch": 3.805293580772741, "grad_norm": 0.34884288907051086, "learning_rate": 3.1277733581941416e-07, "loss": 0.0078, "step": 6254 }, { "epoch": 3.8059020383328264, "grad_norm": 0.24357537925243378, "learning_rate": 3.1081735571941437e-07, "loss": 0.0061, "step": 6255 }, { "epoch": 3.8065104958929115, "grad_norm": 0.27209481596946716, "learning_rate": 3.088634975076082e-07, "loss": 0.0054, "step": 6256 }, { "epoch": 3.8071189534529966, "grad_norm": 0.24121397733688354, "learning_rate": 3.0691576166848314e-07, "loss": 0.0054, "step": 6257 }, { "epoch": 3.807727411013082, "grad_norm": 0.24442483484745026, "learning_rate": 3.0497414868498884e-07, "loss": 0.0058, "step": 6258 }, { "epoch": 3.8083358685731667, "grad_norm": 0.24496260285377502, "learning_rate": 3.030386590385653e-07, "loss": 0.0083, "step": 6259 }, { "epoch": 3.8089443261332523, "grad_norm": 0.14519323408603668, "learning_rate": 3.0110929320913694e-07, "loss": 0.0029, "step": 6260 }, { "epoch": 3.8095527836933374, "grad_norm": 0.32007092237472534, "learning_rate": 2.991860516751016e-07, "loss": 0.0097, "step": 6261 }, { "epoch": 3.8101612412534225, "grad_norm": 0.3110872209072113, "learning_rate": 2.972689349133417e-07, "loss": 0.0085, "step": 6262 }, { "epoch": 3.810769698813508, "grad_norm": 0.27134284377098083, "learning_rate": 2.9535794339922984e-07, "loss": 0.0026, "step": 6263 }, { "epoch": 3.8113781563735927, "grad_norm": 0.2500753700733185, "learning_rate": 2.9345307760660637e-07, "loss": 0.0059, "step": 6264 }, { "epoch": 3.811986613933678, "grad_norm": 0.3057078421115875, "learning_rate": 2.9155433800780176e-07, "loss": 0.0049, "step": 6265 }, { "epoch": 3.8125950714937633, "grad_norm": 0.2372872531414032, "learning_rate": 2.8966172507362e-07, "loss": 0.0049, "step": 6266 }, { "epoch": 3.8132035290538484, "grad_norm": 0.2749864161014557, "learning_rate": 2.8777523927335515e-07, "loss": 0.0056, "step": 6267 }, { "epoch": 3.813811986613934, "grad_norm": 0.26591286063194275, "learning_rate": 2.8589488107477194e-07, "loss": 0.0046, "step": 6268 }, { "epoch": 3.8144204441740186, "grad_norm": 0.21569810807704926, "learning_rate": 2.840206509441251e-07, "loss": 0.0049, "step": 6269 }, { "epoch": 3.815028901734104, "grad_norm": 0.19832342863082886, "learning_rate": 2.82152549346143e-07, "loss": 0.0056, "step": 6270 }, { "epoch": 3.8156373592941892, "grad_norm": 0.3168319761753082, "learning_rate": 2.8029057674404115e-07, "loss": 0.0104, "step": 6271 }, { "epoch": 3.8162458168542743, "grad_norm": 0.1959853619337082, "learning_rate": 2.7843473359950303e-07, "loss": 0.0045, "step": 6272 }, { "epoch": 3.81685427441436, "grad_norm": 0.3243808150291443, "learning_rate": 2.7658502037270774e-07, "loss": 0.0067, "step": 6273 }, { "epoch": 3.8174627319744445, "grad_norm": 0.2744379937648773, "learning_rate": 2.747414375223051e-07, "loss": 0.0057, "step": 6274 }, { "epoch": 3.81807118953453, "grad_norm": 0.26081326603889465, "learning_rate": 2.7290398550542664e-07, "loss": 0.0047, "step": 6275 }, { "epoch": 3.818679647094615, "grad_norm": 0.24908572435379028, "learning_rate": 2.710726647776829e-07, "loss": 0.0062, "step": 6276 }, { "epoch": 3.8192881046547003, "grad_norm": 0.23935918509960175, "learning_rate": 2.692474757931662e-07, "loss": 0.0075, "step": 6277 }, { "epoch": 3.819896562214786, "grad_norm": 0.1932537704706192, "learning_rate": 2.674284190044479e-07, "loss": 0.0034, "step": 6278 }, { "epoch": 3.8205050197748704, "grad_norm": 0.3207075893878937, "learning_rate": 2.6561549486257556e-07, "loss": 0.0078, "step": 6279 }, { "epoch": 3.821113477334956, "grad_norm": 0.2624523639678955, "learning_rate": 2.638087038170811e-07, "loss": 0.0082, "step": 6280 }, { "epoch": 3.821721934895041, "grad_norm": 0.16647395491600037, "learning_rate": 2.62008046315973e-07, "loss": 0.003, "step": 6281 }, { "epoch": 3.822330392455126, "grad_norm": 0.2986607253551483, "learning_rate": 2.602135228057384e-07, "loss": 0.0101, "step": 6282 }, { "epoch": 3.8229388500152113, "grad_norm": 0.35330304503440857, "learning_rate": 2.5842513373134645e-07, "loss": 0.0103, "step": 6283 }, { "epoch": 3.8235473075752964, "grad_norm": 0.2447318136692047, "learning_rate": 2.566428795362397e-07, "loss": 0.0059, "step": 6284 }, { "epoch": 3.824155765135382, "grad_norm": 0.16872382164001465, "learning_rate": 2.5486676066234504e-07, "loss": 0.0039, "step": 6285 }, { "epoch": 3.824764222695467, "grad_norm": 0.2803313434123993, "learning_rate": 2.5309677755006867e-07, "loss": 0.0081, "step": 6286 }, { "epoch": 3.825372680255552, "grad_norm": 0.36526891589164734, "learning_rate": 2.5133293063828724e-07, "loss": 0.0137, "step": 6287 }, { "epoch": 3.825981137815637, "grad_norm": 0.2731226086616516, "learning_rate": 2.4957522036436474e-07, "loss": 0.0092, "step": 6288 }, { "epoch": 3.8265895953757223, "grad_norm": 0.19423212110996246, "learning_rate": 2.4782364716413873e-07, "loss": 0.0049, "step": 6289 }, { "epoch": 3.827198052935808, "grad_norm": 0.16490328311920166, "learning_rate": 2.460782114719257e-07, "loss": 0.0032, "step": 6290 }, { "epoch": 3.827806510495893, "grad_norm": 0.30284324288368225, "learning_rate": 2.4433891372052376e-07, "loss": 0.0055, "step": 6291 }, { "epoch": 3.828414968055978, "grad_norm": 0.18749664723873138, "learning_rate": 2.426057543412019e-07, "loss": 0.0038, "step": 6292 }, { "epoch": 3.829023425616063, "grad_norm": 0.2562442719936371, "learning_rate": 2.408787337637164e-07, "loss": 0.0059, "step": 6293 }, { "epoch": 3.829631883176148, "grad_norm": 0.1709728091955185, "learning_rate": 2.3915785241629406e-07, "loss": 0.0035, "step": 6294 }, { "epoch": 3.8302403407362338, "grad_norm": 0.3805026113986969, "learning_rate": 2.3744311072563808e-07, "loss": 0.0078, "step": 6295 }, { "epoch": 3.830848798296319, "grad_norm": 0.20816834270954132, "learning_rate": 2.3573450911693883e-07, "loss": 0.0035, "step": 6296 }, { "epoch": 3.831457255856404, "grad_norm": 0.16338828206062317, "learning_rate": 2.3403204801385746e-07, "loss": 0.0023, "step": 6297 }, { "epoch": 3.832065713416489, "grad_norm": 0.21396136283874512, "learning_rate": 2.3233572783852854e-07, "loss": 0.0054, "step": 6298 }, { "epoch": 3.832674170976574, "grad_norm": 0.20282062888145447, "learning_rate": 2.3064554901157388e-07, "loss": 0.0051, "step": 6299 }, { "epoch": 3.8332826285366597, "grad_norm": 0.2224971354007721, "learning_rate": 2.2896151195208603e-07, "loss": 0.004, "step": 6300 }, { "epoch": 3.833891086096745, "grad_norm": 0.3189062476158142, "learning_rate": 2.272836170776338e-07, "loss": 0.0039, "step": 6301 }, { "epoch": 3.83449954365683, "grad_norm": 0.14660929143428802, "learning_rate": 2.2561186480426766e-07, "loss": 0.0035, "step": 6302 }, { "epoch": 3.835108001216915, "grad_norm": 0.30386772751808167, "learning_rate": 2.239462555465116e-07, "loss": 0.0065, "step": 6303 }, { "epoch": 3.835716458777, "grad_norm": 0.18635998666286469, "learning_rate": 2.222867897173686e-07, "loss": 0.0032, "step": 6304 }, { "epoch": 3.8363249163370856, "grad_norm": 0.12038879841566086, "learning_rate": 2.206334677283123e-07, "loss": 0.0022, "step": 6305 }, { "epoch": 3.8369333738971707, "grad_norm": 0.14553184807300568, "learning_rate": 2.189862899893036e-07, "loss": 0.0035, "step": 6306 }, { "epoch": 3.837541831457256, "grad_norm": 0.21486513316631317, "learning_rate": 2.173452569087714e-07, "loss": 0.0051, "step": 6307 }, { "epoch": 3.838150289017341, "grad_norm": 0.25689902901649475, "learning_rate": 2.157103688936235e-07, "loss": 0.0056, "step": 6308 }, { "epoch": 3.838758746577426, "grad_norm": 0.22431597113609314, "learning_rate": 2.14081626349244e-07, "loss": 0.0025, "step": 6309 }, { "epoch": 3.8393672041375115, "grad_norm": 0.23348523676395416, "learning_rate": 2.1245902967949315e-07, "loss": 0.0043, "step": 6310 }, { "epoch": 3.8399756616975966, "grad_norm": 0.21763131022453308, "learning_rate": 2.1084257928670748e-07, "loss": 0.0034, "step": 6311 }, { "epoch": 3.8405841192576817, "grad_norm": 0.2803143560886383, "learning_rate": 2.092322755716969e-07, "loss": 0.015, "step": 6312 }, { "epoch": 3.841192576817767, "grad_norm": 0.1908082216978073, "learning_rate": 2.0762811893375588e-07, "loss": 0.0037, "step": 6313 }, { "epoch": 3.841801034377852, "grad_norm": 0.24294763803482056, "learning_rate": 2.060301097706413e-07, "loss": 0.0044, "step": 6314 }, { "epoch": 3.8424094919379375, "grad_norm": 0.2627767026424408, "learning_rate": 2.0443824847859727e-07, "loss": 0.0064, "step": 6315 }, { "epoch": 3.8430179494980226, "grad_norm": 0.24739083647727966, "learning_rate": 2.0285253545233585e-07, "loss": 0.0071, "step": 6316 }, { "epoch": 3.8436264070581077, "grad_norm": 0.31518417596817017, "learning_rate": 2.0127297108505082e-07, "loss": 0.0102, "step": 6317 }, { "epoch": 3.8442348646181927, "grad_norm": 0.40057793259620667, "learning_rate": 1.996995557684067e-07, "loss": 0.006, "step": 6318 }, { "epoch": 3.844843322178278, "grad_norm": 0.1834147572517395, "learning_rate": 1.9813228989254695e-07, "loss": 0.0039, "step": 6319 }, { "epoch": 3.8454517797383634, "grad_norm": 0.2752745449542999, "learning_rate": 1.9657117384608569e-07, "loss": 0.0065, "step": 6320 }, { "epoch": 3.8460602372984485, "grad_norm": 0.3888533413410187, "learning_rate": 1.950162080161161e-07, "loss": 0.0083, "step": 6321 }, { "epoch": 3.8466686948585336, "grad_norm": 0.3024943172931671, "learning_rate": 1.9346739278820192e-07, "loss": 0.0117, "step": 6322 }, { "epoch": 3.8472771524186187, "grad_norm": 0.2485586553812027, "learning_rate": 1.9192472854638875e-07, "loss": 0.0073, "step": 6323 }, { "epoch": 3.8478856099787038, "grad_norm": 0.19068674743175507, "learning_rate": 1.9038821567319286e-07, "loss": 0.0028, "step": 6324 }, { "epoch": 3.8484940675387893, "grad_norm": 0.18191272020339966, "learning_rate": 1.8885785454960115e-07, "loss": 0.0039, "step": 6325 }, { "epoch": 3.8491025250988744, "grad_norm": 0.28522956371307373, "learning_rate": 1.8733364555508225e-07, "loss": 0.0054, "step": 6326 }, { "epoch": 3.8497109826589595, "grad_norm": 0.24018339812755585, "learning_rate": 1.8581558906757557e-07, "loss": 0.0064, "step": 6327 }, { "epoch": 3.8503194402190446, "grad_norm": 0.3272731304168701, "learning_rate": 1.8430368546349942e-07, "loss": 0.0079, "step": 6328 }, { "epoch": 3.8509278977791297, "grad_norm": 0.30089452862739563, "learning_rate": 1.8279793511774e-07, "loss": 0.0038, "step": 6329 }, { "epoch": 3.8515363553392152, "grad_norm": 0.2161916345357895, "learning_rate": 1.8129833840365985e-07, "loss": 0.0054, "step": 6330 }, { "epoch": 3.8521448128993003, "grad_norm": 0.2529414892196655, "learning_rate": 1.7980489569309755e-07, "loss": 0.005, "step": 6331 }, { "epoch": 3.8527532704593854, "grad_norm": 0.3203285336494446, "learning_rate": 1.7831760735636248e-07, "loss": 0.0057, "step": 6332 }, { "epoch": 3.8533617280194705, "grad_norm": 0.20392923057079315, "learning_rate": 1.76836473762243e-07, "loss": 0.004, "step": 6333 }, { "epoch": 3.8539701855795556, "grad_norm": 0.3112758994102478, "learning_rate": 1.7536149527800082e-07, "loss": 0.0069, "step": 6334 }, { "epoch": 3.854578643139641, "grad_norm": 0.2692670226097107, "learning_rate": 1.738926722693629e-07, "loss": 0.0065, "step": 6335 }, { "epoch": 3.8551871006997263, "grad_norm": 0.1982191503047943, "learning_rate": 1.7243000510053787e-07, "loss": 0.0045, "step": 6336 }, { "epoch": 3.8557955582598114, "grad_norm": 0.21474218368530273, "learning_rate": 1.7097349413420781e-07, "loss": 0.0056, "step": 6337 }, { "epoch": 3.8564040158198964, "grad_norm": 0.16877877712249756, "learning_rate": 1.6952313973152834e-07, "loss": 0.0047, "step": 6338 }, { "epoch": 3.8570124733799815, "grad_norm": 0.35990428924560547, "learning_rate": 1.6807894225212283e-07, "loss": 0.0087, "step": 6339 }, { "epoch": 3.857620930940067, "grad_norm": 0.3100360333919525, "learning_rate": 1.6664090205409656e-07, "loss": 0.0051, "step": 6340 }, { "epoch": 3.858229388500152, "grad_norm": 0.1680872142314911, "learning_rate": 1.652090194940198e-07, "loss": 0.0032, "step": 6341 }, { "epoch": 3.8588378460602373, "grad_norm": 0.22448216378688812, "learning_rate": 1.637832949269419e-07, "loss": 0.0045, "step": 6342 }, { "epoch": 3.8594463036203224, "grad_norm": 0.20747099816799164, "learning_rate": 1.6236372870638284e-07, "loss": 0.0045, "step": 6343 }, { "epoch": 3.8600547611804075, "grad_norm": 0.23858143389225006, "learning_rate": 1.609503211843333e-07, "loss": 0.0066, "step": 6344 }, { "epoch": 3.860663218740493, "grad_norm": 0.2025892287492752, "learning_rate": 1.595430727112629e-07, "loss": 0.0042, "step": 6345 }, { "epoch": 3.861271676300578, "grad_norm": 0.3395010530948639, "learning_rate": 1.581419836361092e-07, "loss": 0.0097, "step": 6346 }, { "epoch": 3.861880133860663, "grad_norm": 0.23754116892814636, "learning_rate": 1.5674705430628323e-07, "loss": 0.0032, "step": 6347 }, { "epoch": 3.8624885914207483, "grad_norm": 0.21957284212112427, "learning_rate": 1.5535828506766936e-07, "loss": 0.0063, "step": 6348 }, { "epoch": 3.8630970489808334, "grad_norm": 0.18481026589870453, "learning_rate": 1.539756762646255e-07, "loss": 0.0039, "step": 6349 }, { "epoch": 3.863705506540919, "grad_norm": 0.3242833912372589, "learning_rate": 1.5259922823998297e-07, "loss": 0.0067, "step": 6350 }, { "epoch": 3.864313964101004, "grad_norm": 0.24175108969211578, "learning_rate": 1.5122894133503817e-07, "loss": 0.0048, "step": 6351 }, { "epoch": 3.864922421661089, "grad_norm": 0.27556535601615906, "learning_rate": 1.4986481588956934e-07, "loss": 0.0043, "step": 6352 }, { "epoch": 3.865530879221174, "grad_norm": 0.27273163199424744, "learning_rate": 1.485068522418226e-07, "loss": 0.0042, "step": 6353 }, { "epoch": 3.8661393367812593, "grad_norm": 0.277964323759079, "learning_rate": 1.4715505072851188e-07, "loss": 0.015, "step": 6354 }, { "epoch": 3.866747794341345, "grad_norm": 0.3170606791973114, "learning_rate": 1.4580941168483298e-07, "loss": 0.0103, "step": 6355 }, { "epoch": 3.86735625190143, "grad_norm": 0.23001722991466522, "learning_rate": 1.4446993544444954e-07, "loss": 0.0063, "step": 6356 }, { "epoch": 3.867964709461515, "grad_norm": 0.240625262260437, "learning_rate": 1.4313662233948755e-07, "loss": 0.0072, "step": 6357 }, { "epoch": 3.8685731670216, "grad_norm": 0.22342710196971893, "learning_rate": 1.4180947270056032e-07, "loss": 0.0043, "step": 6358 }, { "epoch": 3.8691816245816852, "grad_norm": 0.2697957754135132, "learning_rate": 1.4048848685674354e-07, "loss": 0.0061, "step": 6359 }, { "epoch": 3.869790082141771, "grad_norm": 0.21805325150489807, "learning_rate": 1.3917366513558629e-07, "loss": 0.004, "step": 6360 }, { "epoch": 3.870398539701856, "grad_norm": 0.23698726296424866, "learning_rate": 1.378650078631083e-07, "loss": 0.0069, "step": 6361 }, { "epoch": 3.871006997261941, "grad_norm": 0.2618488669395447, "learning_rate": 1.365625153638056e-07, "loss": 0.0074, "step": 6362 }, { "epoch": 3.871615454822026, "grad_norm": 0.2968907654285431, "learning_rate": 1.352661879606393e-07, "loss": 0.0097, "step": 6363 }, { "epoch": 3.872223912382111, "grad_norm": 0.31761443614959717, "learning_rate": 1.339760259750439e-07, "loss": 0.0071, "step": 6364 }, { "epoch": 3.8728323699421967, "grad_norm": 0.18307559192180634, "learning_rate": 1.3269202972692741e-07, "loss": 0.0043, "step": 6365 }, { "epoch": 3.873440827502282, "grad_norm": 0.2085932344198227, "learning_rate": 1.314141995346685e-07, "loss": 0.0047, "step": 6366 }, { "epoch": 3.874049285062367, "grad_norm": 0.21864819526672363, "learning_rate": 1.301425357151137e-07, "loss": 0.0072, "step": 6367 }, { "epoch": 3.874657742622452, "grad_norm": 0.15939606726169586, "learning_rate": 1.2887703858358302e-07, "loss": 0.0028, "step": 6368 }, { "epoch": 3.875266200182537, "grad_norm": 0.21072454750537872, "learning_rate": 1.2761770845386712e-07, "loss": 0.0034, "step": 6369 }, { "epoch": 3.8758746577426226, "grad_norm": 0.3073256313800812, "learning_rate": 1.2636454563823009e-07, "loss": 0.0075, "step": 6370 }, { "epoch": 3.8764831153027077, "grad_norm": 0.2551986873149872, "learning_rate": 1.2511755044739836e-07, "loss": 0.0072, "step": 6371 }, { "epoch": 3.877091572862793, "grad_norm": 0.2689383327960968, "learning_rate": 1.238767231905774e-07, "loss": 0.0041, "step": 6372 }, { "epoch": 3.877700030422878, "grad_norm": 0.2862478792667389, "learning_rate": 1.2264206417544333e-07, "loss": 0.0051, "step": 6373 }, { "epoch": 3.878308487982963, "grad_norm": 0.34870097041130066, "learning_rate": 1.2141357370813732e-07, "loss": 0.0063, "step": 6374 }, { "epoch": 3.8789169455430486, "grad_norm": 0.27742713689804077, "learning_rate": 1.2019125209327409e-07, "loss": 0.0047, "step": 6375 }, { "epoch": 3.8795254031031337, "grad_norm": 0.19228945672512054, "learning_rate": 1.1897509963394171e-07, "loss": 0.0043, "step": 6376 }, { "epoch": 3.8801338606632187, "grad_norm": 0.2762080430984497, "learning_rate": 1.1776511663168788e-07, "loss": 0.0081, "step": 6377 }, { "epoch": 3.880742318223304, "grad_norm": 0.3993409276008606, "learning_rate": 1.165613033865448e-07, "loss": 0.0087, "step": 6378 }, { "epoch": 3.881350775783389, "grad_norm": 0.26734060049057007, "learning_rate": 1.1536366019700428e-07, "loss": 0.0054, "step": 6379 }, { "epoch": 3.8819592333434745, "grad_norm": 0.2520960867404938, "learning_rate": 1.1417218736003432e-07, "loss": 0.0048, "step": 6380 }, { "epoch": 3.8825676909035596, "grad_norm": 0.24928855895996094, "learning_rate": 1.1298688517107081e-07, "loss": 0.0046, "step": 6381 }, { "epoch": 3.8831761484636447, "grad_norm": 0.20136448740959167, "learning_rate": 1.1180775392401754e-07, "loss": 0.0037, "step": 6382 }, { "epoch": 3.8837846060237298, "grad_norm": 0.2751845121383667, "learning_rate": 1.1063479391124898e-07, "loss": 0.0036, "step": 6383 }, { "epoch": 3.884393063583815, "grad_norm": 0.25994813442230225, "learning_rate": 1.0946800542361025e-07, "loss": 0.0059, "step": 6384 }, { "epoch": 3.8850015211439004, "grad_norm": 0.2973096966743469, "learning_rate": 1.083073887504199e-07, "loss": 0.0103, "step": 6385 }, { "epoch": 3.8856099787039855, "grad_norm": 0.1909116506576538, "learning_rate": 1.0715294417946164e-07, "loss": 0.0021, "step": 6386 }, { "epoch": 3.8862184362640706, "grad_norm": 0.17645008862018585, "learning_rate": 1.0600467199698427e-07, "loss": 0.0044, "step": 6387 }, { "epoch": 3.8868268938241557, "grad_norm": 0.24159935116767883, "learning_rate": 1.0486257248771835e-07, "loss": 0.0088, "step": 6388 }, { "epoch": 3.887435351384241, "grad_norm": 0.30770307779312134, "learning_rate": 1.0372664593485403e-07, "loss": 0.0055, "step": 6389 }, { "epoch": 3.8880438089443263, "grad_norm": 0.7140611410140991, "learning_rate": 1.0259689262005212e-07, "loss": 0.0058, "step": 6390 }, { "epoch": 3.8886522665044114, "grad_norm": 0.249837726354599, "learning_rate": 1.0147331282344686e-07, "loss": 0.0047, "step": 6391 }, { "epoch": 3.8892607240644965, "grad_norm": 0.1296958178281784, "learning_rate": 1.003559068236376e-07, "loss": 0.0019, "step": 6392 }, { "epoch": 3.8898691816245816, "grad_norm": 0.31125199794769287, "learning_rate": 9.924467489769717e-08, "loss": 0.0084, "step": 6393 }, { "epoch": 3.8904776391846667, "grad_norm": 0.25288596749305725, "learning_rate": 9.813961732116073e-08, "loss": 0.0079, "step": 6394 }, { "epoch": 3.8910860967447523, "grad_norm": 0.21936556696891785, "learning_rate": 9.704073436803685e-08, "loss": 0.0032, "step": 6395 }, { "epoch": 3.8916945543048374, "grad_norm": 0.3556055724620819, "learning_rate": 9.594802631080756e-08, "loss": 0.0119, "step": 6396 }, { "epoch": 3.8923030118649224, "grad_norm": 0.2905397117137909, "learning_rate": 9.48614934204145e-08, "loss": 0.007, "step": 6397 }, { "epoch": 3.8929114694250075, "grad_norm": 0.27061349153518677, "learning_rate": 9.378113596627546e-08, "loss": 0.0089, "step": 6398 }, { "epoch": 3.8935199269850926, "grad_norm": 0.182336688041687, "learning_rate": 9.270695421626784e-08, "loss": 0.0036, "step": 6399 }, { "epoch": 3.894128384545178, "grad_norm": 0.07943737506866455, "learning_rate": 9.163894843675357e-08, "loss": 0.0007, "step": 6400 }, { "epoch": 3.8947368421052633, "grad_norm": 0.24037736654281616, "learning_rate": 9.057711889254584e-08, "loss": 0.0051, "step": 6401 }, { "epoch": 3.8953452996653484, "grad_norm": 0.28533557057380676, "learning_rate": 8.952146584693677e-08, "loss": 0.0073, "step": 6402 }, { "epoch": 3.8959537572254335, "grad_norm": 0.1930028647184372, "learning_rate": 8.847198956168368e-08, "loss": 0.0045, "step": 6403 }, { "epoch": 3.8965622147855186, "grad_norm": 0.24128195643424988, "learning_rate": 8.742869029701451e-08, "loss": 0.0045, "step": 6404 }, { "epoch": 3.897170672345604, "grad_norm": 0.2623744308948517, "learning_rate": 8.639156831162231e-08, "loss": 0.0079, "step": 6405 }, { "epoch": 3.897779129905689, "grad_norm": 0.31534165143966675, "learning_rate": 8.536062386267362e-08, "loss": 0.0061, "step": 6406 }, { "epoch": 3.8983875874657743, "grad_norm": 0.22992461919784546, "learning_rate": 8.433585720579173e-08, "loss": 0.0066, "step": 6407 }, { "epoch": 3.8989960450258594, "grad_norm": 0.17727993428707123, "learning_rate": 8.331726859508726e-08, "loss": 0.0031, "step": 6408 }, { "epoch": 3.8996045025859445, "grad_norm": 0.2012277990579605, "learning_rate": 8.230485828311651e-08, "loss": 0.0044, "step": 6409 }, { "epoch": 3.90021296014603, "grad_norm": 0.253017783164978, "learning_rate": 8.129862652092313e-08, "loss": 0.0045, "step": 6410 }, { "epoch": 3.900821417706115, "grad_norm": 0.19332891702651978, "learning_rate": 8.029857355800475e-08, "loss": 0.0044, "step": 6411 }, { "epoch": 3.9014298752662, "grad_norm": 0.27358150482177734, "learning_rate": 7.930469964234078e-08, "loss": 0.0062, "step": 6412 }, { "epoch": 3.9020383328262853, "grad_norm": 0.23901762068271637, "learning_rate": 7.83170050203591e-08, "loss": 0.0046, "step": 6413 }, { "epoch": 3.9026467903863704, "grad_norm": 0.23956961929798126, "learning_rate": 7.73354899369777e-08, "loss": 0.0065, "step": 6414 }, { "epoch": 3.903255247946456, "grad_norm": 0.25745102763175964, "learning_rate": 7.636015463556578e-08, "loss": 0.0074, "step": 6415 }, { "epoch": 3.903863705506541, "grad_norm": 0.23999325931072235, "learning_rate": 7.539099935796879e-08, "loss": 0.0059, "step": 6416 }, { "epoch": 3.904472163066626, "grad_norm": 0.18391796946525574, "learning_rate": 7.442802434449169e-08, "loss": 0.0043, "step": 6417 }, { "epoch": 3.9050806206267112, "grad_norm": 0.2279115468263626, "learning_rate": 7.347122983391851e-08, "loss": 0.0061, "step": 6418 }, { "epoch": 3.9056890781867963, "grad_norm": 0.3626938760280609, "learning_rate": 7.252061606349003e-08, "loss": 0.0105, "step": 6419 }, { "epoch": 3.906297535746882, "grad_norm": 0.26662662625312805, "learning_rate": 7.157618326892046e-08, "loss": 0.0039, "step": 6420 }, { "epoch": 3.906905993306967, "grad_norm": 0.23431222140789032, "learning_rate": 7.063793168438915e-08, "loss": 0.0097, "step": 6421 }, { "epoch": 3.907514450867052, "grad_norm": 0.23029586672782898, "learning_rate": 6.970586154254333e-08, "loss": 0.0044, "step": 6422 }, { "epoch": 3.908122908427137, "grad_norm": 0.30431026220321655, "learning_rate": 6.877997307449813e-08, "loss": 0.003, "step": 6423 }, { "epoch": 3.9087313659872223, "grad_norm": 0.31623929738998413, "learning_rate": 6.786026650983657e-08, "loss": 0.003, "step": 6424 }, { "epoch": 3.909339823547308, "grad_norm": 0.2245815098285675, "learning_rate": 6.6946742076604e-08, "loss": 0.0021, "step": 6425 }, { "epoch": 3.909948281107393, "grad_norm": 0.20678408443927765, "learning_rate": 6.603940000132203e-08, "loss": 0.0039, "step": 6426 }, { "epoch": 3.910556738667478, "grad_norm": 0.32013562321662903, "learning_rate": 6.513824050896622e-08, "loss": 0.0073, "step": 6427 }, { "epoch": 3.911165196227563, "grad_norm": 0.26314496994018555, "learning_rate": 6.424326382299394e-08, "loss": 0.0059, "step": 6428 }, { "epoch": 3.911773653787648, "grad_norm": 0.2609120011329651, "learning_rate": 6.33544701653166e-08, "loss": 0.0074, "step": 6429 }, { "epoch": 3.9123821113477337, "grad_norm": 0.22066128253936768, "learning_rate": 6.247185975631897e-08, "loss": 0.0059, "step": 6430 }, { "epoch": 3.912990568907819, "grad_norm": 0.24796698987483978, "learning_rate": 6.15954328148538e-08, "loss": 0.0056, "step": 6431 }, { "epoch": 3.913599026467904, "grad_norm": 0.32350313663482666, "learning_rate": 6.072518955823891e-08, "loss": 0.0097, "step": 6432 }, { "epoch": 3.914207484027989, "grad_norm": 0.24313661456108093, "learning_rate": 5.986113020225448e-08, "loss": 0.0052, "step": 6433 }, { "epoch": 3.914815941588074, "grad_norm": 0.15304477512836456, "learning_rate": 5.900325496115411e-08, "loss": 0.0037, "step": 6434 }, { "epoch": 3.9154243991481597, "grad_norm": 0.189406618475914, "learning_rate": 5.815156404765654e-08, "loss": 0.003, "step": 6435 }, { "epoch": 3.9160328567082447, "grad_norm": 0.23600012063980103, "learning_rate": 5.7306057672942833e-08, "loss": 0.0029, "step": 6436 }, { "epoch": 3.91664131426833, "grad_norm": 0.20945847034454346, "learning_rate": 5.6466736046661974e-08, "loss": 0.0038, "step": 6437 }, { "epoch": 3.917249771828415, "grad_norm": 0.2792292535305023, "learning_rate": 5.5633599376936353e-08, "loss": 0.0071, "step": 6438 }, { "epoch": 3.9178582293885, "grad_norm": 0.1970946043729782, "learning_rate": 5.48066478703424e-08, "loss": 0.0035, "step": 6439 }, { "epoch": 3.9184666869485856, "grad_norm": 0.23822718858718872, "learning_rate": 5.398588173193275e-08, "loss": 0.0058, "step": 6440 }, { "epoch": 3.9190751445086707, "grad_norm": 0.30124393105506897, "learning_rate": 5.317130116522517e-08, "loss": 0.0082, "step": 6441 }, { "epoch": 3.9196836020687558, "grad_norm": 0.22608311474323273, "learning_rate": 5.2362906372199764e-08, "loss": 0.0037, "step": 6442 }, { "epoch": 3.920292059628841, "grad_norm": 0.23782384395599365, "learning_rate": 5.156069755330451e-08, "loss": 0.0057, "step": 6443 }, { "epoch": 3.920900517188926, "grad_norm": 0.20344187319278717, "learning_rate": 5.0764674907452516e-08, "loss": 0.0058, "step": 6444 }, { "epoch": 3.9215089747490115, "grad_norm": 0.2951229214668274, "learning_rate": 4.997483863202757e-08, "loss": 0.0071, "step": 6445 }, { "epoch": 3.9221174323090966, "grad_norm": 0.18063601851463318, "learning_rate": 4.9191188922875775e-08, "loss": 0.0046, "step": 6446 }, { "epoch": 3.9227258898691817, "grad_norm": 0.2033570259809494, "learning_rate": 4.8413725974305604e-08, "loss": 0.0046, "step": 6447 }, { "epoch": 3.923334347429267, "grad_norm": 0.3676776587963104, "learning_rate": 4.764244997909895e-08, "loss": 0.0053, "step": 6448 }, { "epoch": 3.923942804989352, "grad_norm": 0.1831228882074356, "learning_rate": 4.6877361128497276e-08, "loss": 0.0039, "step": 6449 }, { "epoch": 3.9245512625494374, "grad_norm": 0.17482344806194305, "learning_rate": 4.61184596122155e-08, "loss": 0.0048, "step": 6450 }, { "epoch": 3.9251597201095225, "grad_norm": 0.25266796350479126, "learning_rate": 4.5365745618425325e-08, "loss": 0.007, "step": 6451 }, { "epoch": 3.9257681776696076, "grad_norm": 0.25007006525993347, "learning_rate": 4.4619219333769115e-08, "loss": 0.0068, "step": 6452 }, { "epoch": 3.9263766352296927, "grad_norm": 0.2621901035308838, "learning_rate": 4.3878880943357124e-08, "loss": 0.0078, "step": 6453 }, { "epoch": 3.926985092789778, "grad_norm": 0.31776395440101624, "learning_rate": 4.314473063075919e-08, "loss": 0.0099, "step": 6454 }, { "epoch": 3.9275935503498633, "grad_norm": 0.19145816564559937, "learning_rate": 4.24167685780158e-08, "loss": 0.0034, "step": 6455 }, { "epoch": 3.9282020079099484, "grad_norm": 0.24003374576568604, "learning_rate": 4.169499496562701e-08, "loss": 0.004, "step": 6456 }, { "epoch": 3.9288104654700335, "grad_norm": 0.19230146706104279, "learning_rate": 4.097940997256911e-08, "loss": 0.005, "step": 6457 }, { "epoch": 3.9294189230301186, "grad_norm": 0.22203390300273895, "learning_rate": 4.0270013776275153e-08, "loss": 0.005, "step": 6458 }, { "epoch": 3.9300273805902037, "grad_norm": 0.28375643491744995, "learning_rate": 3.9566806552643335e-08, "loss": 0.0063, "step": 6459 }, { "epoch": 3.9306358381502893, "grad_norm": 0.20378822088241577, "learning_rate": 3.8869788476039725e-08, "loss": 0.0035, "step": 6460 }, { "epoch": 3.931244295710374, "grad_norm": 0.611138641834259, "learning_rate": 3.817895971930108e-08, "loss": 0.0211, "step": 6461 }, { "epoch": 3.9318527532704595, "grad_norm": 0.20204846560955048, "learning_rate": 3.749432045371815e-08, "loss": 0.005, "step": 6462 }, { "epoch": 3.9324612108305446, "grad_norm": 0.2756820619106293, "learning_rate": 3.6815870849055134e-08, "loss": 0.0047, "step": 6463 }, { "epoch": 3.9330696683906297, "grad_norm": 0.26881980895996094, "learning_rate": 3.614361107354136e-08, "loss": 0.004, "step": 6464 }, { "epoch": 3.933678125950715, "grad_norm": 0.30591699481010437, "learning_rate": 3.547754129386571e-08, "loss": 0.0054, "step": 6465 }, { "epoch": 3.9342865835108, "grad_norm": 0.3070509433746338, "learning_rate": 3.481766167518774e-08, "loss": 0.0074, "step": 6466 }, { "epoch": 3.9348950410708854, "grad_norm": 0.21430547535419464, "learning_rate": 3.416397238112934e-08, "loss": 0.0054, "step": 6467 }, { "epoch": 3.9355034986309705, "grad_norm": 0.15337048470973969, "learning_rate": 3.351647357377752e-08, "loss": 0.0031, "step": 6468 }, { "epoch": 3.9361119561910556, "grad_norm": 0.17072342336177826, "learning_rate": 3.2875165413687184e-08, "loss": 0.0035, "step": 6469 }, { "epoch": 3.936720413751141, "grad_norm": 0.22262074053287506, "learning_rate": 3.2240048059872793e-08, "loss": 0.0051, "step": 6470 }, { "epoch": 3.9373288713112258, "grad_norm": 0.22603394091129303, "learning_rate": 3.161112166982227e-08, "loss": 0.0052, "step": 6471 }, { "epoch": 3.9379373288713113, "grad_norm": 0.20640049874782562, "learning_rate": 3.098838639947754e-08, "loss": 0.0047, "step": 6472 }, { "epoch": 3.9385457864313964, "grad_norm": 0.17780143022537231, "learning_rate": 3.037184240325397e-08, "loss": 0.0027, "step": 6473 }, { "epoch": 3.9391542439914815, "grad_norm": 0.1945071816444397, "learning_rate": 2.976148983402649e-08, "loss": 0.0056, "step": 6474 }, { "epoch": 3.939762701551567, "grad_norm": 0.2443944811820984, "learning_rate": 2.9157328843140708e-08, "loss": 0.005, "step": 6475 }, { "epoch": 3.9403711591116517, "grad_norm": 0.1970926970243454, "learning_rate": 2.855935958040179e-08, "loss": 0.0036, "step": 6476 }, { "epoch": 3.9409796166717372, "grad_norm": 0.2412329912185669, "learning_rate": 2.7967582194080022e-08, "loss": 0.0042, "step": 6477 }, { "epoch": 3.9415880742318223, "grad_norm": 0.15232165157794952, "learning_rate": 2.7381996830910805e-08, "loss": 0.0041, "step": 6478 }, { "epoch": 3.9421965317919074, "grad_norm": 0.2518268823623657, "learning_rate": 2.6802603636097435e-08, "loss": 0.0074, "step": 6479 }, { "epoch": 3.942804989351993, "grad_norm": 0.25513625144958496, "learning_rate": 2.6229402753305544e-08, "loss": 0.0088, "step": 6480 }, { "epoch": 3.9434134469120776, "grad_norm": 0.2241678684949875, "learning_rate": 2.5662394324663108e-08, "loss": 0.0053, "step": 6481 }, { "epoch": 3.944021904472163, "grad_norm": 0.1274229884147644, "learning_rate": 2.5101578490763223e-08, "loss": 0.0015, "step": 6482 }, { "epoch": 3.9446303620322483, "grad_norm": 0.20924502611160278, "learning_rate": 2.4546955390669645e-08, "loss": 0.003, "step": 6483 }, { "epoch": 3.9452388195923334, "grad_norm": 0.23410069942474365, "learning_rate": 2.3998525161900153e-08, "loss": 0.0072, "step": 6484 }, { "epoch": 3.9458472771524185, "grad_norm": 0.18146799504756927, "learning_rate": 2.345628794044874e-08, "loss": 0.0023, "step": 6485 }, { "epoch": 3.9464557347125035, "grad_norm": 0.3447299003601074, "learning_rate": 2.292024386076064e-08, "loss": 0.0028, "step": 6486 }, { "epoch": 3.947064192272589, "grad_norm": 0.2687872052192688, "learning_rate": 2.2390393055757297e-08, "loss": 0.0052, "step": 6487 }, { "epoch": 3.947672649832674, "grad_norm": 0.20712141692638397, "learning_rate": 2.1866735656819736e-08, "loss": 0.0062, "step": 6488 }, { "epoch": 3.9482811073927593, "grad_norm": 0.31326937675476074, "learning_rate": 2.1349271793791313e-08, "loss": 0.0079, "step": 6489 }, { "epoch": 3.9488895649528444, "grad_norm": 0.3461199104785919, "learning_rate": 2.0838001594980504e-08, "loss": 0.009, "step": 6490 }, { "epoch": 3.9494980225129295, "grad_norm": 0.18858326971530914, "learning_rate": 2.0332925187163676e-08, "loss": 0.0052, "step": 6491 }, { "epoch": 3.950106480073015, "grad_norm": 0.26478853821754456, "learning_rate": 1.983404269557676e-08, "loss": 0.0061, "step": 6492 }, { "epoch": 3.9507149376331, "grad_norm": 0.2934277057647705, "learning_rate": 1.9341354243923583e-08, "loss": 0.0034, "step": 6493 }, { "epoch": 3.951323395193185, "grad_norm": 0.18972255289554596, "learning_rate": 1.8854859954370306e-08, "loss": 0.0042, "step": 6494 }, { "epoch": 3.9519318527532703, "grad_norm": 0.3544655740261078, "learning_rate": 1.8374559947545446e-08, "loss": 0.0206, "step": 6495 }, { "epoch": 3.9525403103133554, "grad_norm": 0.1487325131893158, "learning_rate": 1.7900454342542616e-08, "loss": 0.0032, "step": 6496 }, { "epoch": 3.953148767873441, "grad_norm": 0.3513227701187134, "learning_rate": 1.743254325692334e-08, "loss": 0.0078, "step": 6497 }, { "epoch": 3.953757225433526, "grad_norm": 0.19835789501667023, "learning_rate": 1.6970826806708694e-08, "loss": 0.004, "step": 6498 }, { "epoch": 3.954365682993611, "grad_norm": 0.16036944091320038, "learning_rate": 1.651530510638488e-08, "loss": 0.0027, "step": 6499 }, { "epoch": 3.9549741405536962, "grad_norm": 0.2660086750984192, "learning_rate": 1.6065978268903215e-08, "loss": 0.0068, "step": 6500 }, { "epoch": 3.9555825981137813, "grad_norm": 0.18971866369247437, "learning_rate": 1.562284640567735e-08, "loss": 0.0053, "step": 6501 }, { "epoch": 3.956191055673867, "grad_norm": 0.22135283052921295, "learning_rate": 1.5185909626583284e-08, "loss": 0.0057, "step": 6502 }, { "epoch": 3.956799513233952, "grad_norm": 0.33586928248405457, "learning_rate": 1.4755168039967682e-08, "loss": 0.0154, "step": 6503 }, { "epoch": 3.957407970794037, "grad_norm": 0.21343040466308594, "learning_rate": 1.4330621752631224e-08, "loss": 0.0041, "step": 6504 }, { "epoch": 3.958016428354122, "grad_norm": 0.15883713960647583, "learning_rate": 1.3912270869848032e-08, "loss": 0.0037, "step": 6505 }, { "epoch": 3.9586248859142072, "grad_norm": 0.33996376395225525, "learning_rate": 1.3500115495351795e-08, "loss": 0.0054, "step": 6506 }, { "epoch": 3.959233343474293, "grad_norm": 0.22483274340629578, "learning_rate": 1.3094155731335767e-08, "loss": 0.0047, "step": 6507 }, { "epoch": 3.959841801034378, "grad_norm": 0.3512939214706421, "learning_rate": 1.2694391678463868e-08, "loss": 0.0082, "step": 6508 }, { "epoch": 3.960450258594463, "grad_norm": 0.3100382685661316, "learning_rate": 1.2300823435862363e-08, "loss": 0.0046, "step": 6509 }, { "epoch": 3.961058716154548, "grad_norm": 0.15239042043685913, "learning_rate": 1.191345110111708e-08, "loss": 0.0034, "step": 6510 }, { "epoch": 3.961667173714633, "grad_norm": 0.2930486798286438, "learning_rate": 1.1532274770281737e-08, "loss": 0.0059, "step": 6511 }, { "epoch": 3.9622756312747187, "grad_norm": 0.18088679015636444, "learning_rate": 1.1157294537869622e-08, "loss": 0.0063, "step": 6512 }, { "epoch": 3.962884088834804, "grad_norm": 0.2748885452747345, "learning_rate": 1.078851049686469e-08, "loss": 0.0051, "step": 6513 }, { "epoch": 3.963492546394889, "grad_norm": 0.25067272782325745, "learning_rate": 1.0425922738704908e-08, "loss": 0.0065, "step": 6514 }, { "epoch": 3.964101003954974, "grad_norm": 0.2923562228679657, "learning_rate": 1.0069531353301687e-08, "loss": 0.0112, "step": 6515 }, { "epoch": 3.964709461515059, "grad_norm": 0.23911620676517487, "learning_rate": 9.719336429023229e-09, "loss": 0.0069, "step": 6516 }, { "epoch": 3.9653179190751446, "grad_norm": 0.23181907832622528, "learning_rate": 9.375338052702853e-09, "loss": 0.0054, "step": 6517 }, { "epoch": 3.9659263766352297, "grad_norm": 0.25822049379348755, "learning_rate": 9.037536309636218e-09, "loss": 0.0059, "step": 6518 }, { "epoch": 3.966534834195315, "grad_norm": 0.29777318239212036, "learning_rate": 8.705931283586877e-09, "loss": 0.008, "step": 6519 }, { "epoch": 3.9671432917554, "grad_norm": 0.440320760011673, "learning_rate": 8.380523056777944e-09, "loss": 0.008, "step": 6520 }, { "epoch": 3.967751749315485, "grad_norm": 0.2474939078092575, "learning_rate": 8.061311709897656e-09, "loss": 0.0038, "step": 6521 }, { "epoch": 3.9683602068755706, "grad_norm": 0.4338011145591736, "learning_rate": 7.748297322096586e-09, "loss": 0.007, "step": 6522 }, { "epoch": 3.9689686644356557, "grad_norm": 0.20323798060417175, "learning_rate": 7.441479970990428e-09, "loss": 0.0051, "step": 6523 }, { "epoch": 3.9695771219957408, "grad_norm": 0.21181131899356842, "learning_rate": 7.140859732654437e-09, "loss": 0.0047, "step": 6524 }, { "epoch": 3.970185579555826, "grad_norm": 0.25651806592941284, "learning_rate": 6.846436681631763e-09, "loss": 0.006, "step": 6525 }, { "epoch": 3.970794037115911, "grad_norm": 0.20691846311092377, "learning_rate": 6.558210890927896e-09, "loss": 0.0041, "step": 6526 }, { "epoch": 3.9714024946759965, "grad_norm": 0.21363694965839386, "learning_rate": 6.2761824320106684e-09, "loss": 0.0032, "step": 6527 }, { "epoch": 3.9720109522360816, "grad_norm": 0.22716128826141357, "learning_rate": 6.000351374807478e-09, "loss": 0.005, "step": 6528 }, { "epoch": 3.9726194097961667, "grad_norm": 0.19671691954135895, "learning_rate": 5.730717787716389e-09, "loss": 0.0042, "step": 6529 }, { "epoch": 3.9732278673562518, "grad_norm": 0.27143892645835876, "learning_rate": 5.467281737597807e-09, "loss": 0.0053, "step": 6530 }, { "epoch": 3.973836324916337, "grad_norm": 0.20458009839057922, "learning_rate": 5.2100432897661535e-09, "loss": 0.0049, "step": 6531 }, { "epoch": 3.9744447824764224, "grad_norm": 0.2738465964794159, "learning_rate": 4.959002508012067e-09, "loss": 0.0075, "step": 6532 }, { "epoch": 3.9750532400365075, "grad_norm": 0.2310987263917923, "learning_rate": 4.714159454580203e-09, "loss": 0.0029, "step": 6533 }, { "epoch": 3.9756616975965926, "grad_norm": 0.33232930302619934, "learning_rate": 4.47551419018033e-09, "loss": 0.0072, "step": 6534 }, { "epoch": 3.9762701551566777, "grad_norm": 0.21468819677829742, "learning_rate": 4.243066773990112e-09, "loss": 0.005, "step": 6535 }, { "epoch": 3.976878612716763, "grad_norm": 0.21161320805549622, "learning_rate": 4.016817263644002e-09, "loss": 0.0057, "step": 6536 }, { "epoch": 3.9774870702768483, "grad_norm": 0.22753821313381195, "learning_rate": 3.796765715244344e-09, "loss": 0.0048, "step": 6537 }, { "epoch": 3.9780955278369334, "grad_norm": 0.229328915476799, "learning_rate": 3.5829121833530488e-09, "loss": 0.0063, "step": 6538 }, { "epoch": 3.9787039853970185, "grad_norm": 0.25419679284095764, "learning_rate": 3.3752567209971444e-09, "loss": 0.0048, "step": 6539 }, { "epoch": 3.9793124429571036, "grad_norm": 0.2747920751571655, "learning_rate": 3.173799379665998e-09, "loss": 0.0061, "step": 6540 }, { "epoch": 3.9799209005171887, "grad_norm": 0.24330097436904907, "learning_rate": 2.978540209314096e-09, "loss": 0.0035, "step": 6541 }, { "epoch": 3.9805293580772743, "grad_norm": 0.20687943696975708, "learning_rate": 2.789479258358263e-09, "loss": 0.0051, "step": 6542 }, { "epoch": 3.9811378156373594, "grad_norm": 0.22026467323303223, "learning_rate": 2.6066165736748914e-09, "loss": 0.0068, "step": 6543 }, { "epoch": 3.9817462731974445, "grad_norm": 0.22411870956420898, "learning_rate": 2.429952200611041e-09, "loss": 0.0042, "step": 6544 }, { "epoch": 3.9823547307575295, "grad_norm": 0.250068336725235, "learning_rate": 2.2594861829650094e-09, "loss": 0.0051, "step": 6545 }, { "epoch": 3.9829631883176146, "grad_norm": 0.2509680688381195, "learning_rate": 2.095218563011314e-09, "loss": 0.0048, "step": 6546 }, { "epoch": 3.9835716458777, "grad_norm": 0.20843267440795898, "learning_rate": 1.9371493814784867e-09, "loss": 0.0041, "step": 6547 }, { "epoch": 3.9841801034377853, "grad_norm": 0.12238210439682007, "learning_rate": 1.7852786775629516e-09, "loss": 0.0014, "step": 6548 }, { "epoch": 3.9847885609978704, "grad_norm": 0.22475853562355042, "learning_rate": 1.6396064889206974e-09, "loss": 0.0046, "step": 6549 }, { "epoch": 3.9853970185579555, "grad_norm": 0.1801108866930008, "learning_rate": 1.5001328516728309e-09, "loss": 0.0037, "step": 6550 }, { "epoch": 3.9860054761180406, "grad_norm": 0.2936258912086487, "learning_rate": 1.3668578004027988e-09, "loss": 0.0074, "step": 6551 }, { "epoch": 3.986613933678126, "grad_norm": 0.27133995294570923, "learning_rate": 1.2397813681591653e-09, "loss": 0.0061, "step": 6552 }, { "epoch": 3.987222391238211, "grad_norm": 0.29440489411354065, "learning_rate": 1.1189035864500597e-09, "loss": 0.006, "step": 6553 }, { "epoch": 3.9878308487982963, "grad_norm": 0.24190589785575867, "learning_rate": 1.0042244852487281e-09, "loss": 0.0042, "step": 6554 }, { "epoch": 3.9884393063583814, "grad_norm": 0.2079675942659378, "learning_rate": 8.957440929879823e-10, "loss": 0.0042, "step": 6555 }, { "epoch": 3.9890477639184665, "grad_norm": 0.2172488123178482, "learning_rate": 7.934624365685261e-10, "loss": 0.0041, "step": 6556 }, { "epoch": 3.989656221478552, "grad_norm": 0.23664279282093048, "learning_rate": 6.973795413534045e-10, "loss": 0.0086, "step": 6557 }, { "epoch": 3.990264679038637, "grad_norm": 0.12852723896503448, "learning_rate": 6.074954311652281e-10, "loss": 0.0013, "step": 6558 }, { "epoch": 3.9908731365987222, "grad_norm": 0.24331684410572052, "learning_rate": 5.238101282944996e-10, "loss": 0.0056, "step": 6559 }, { "epoch": 3.9914815941588073, "grad_norm": 0.49298709630966187, "learning_rate": 4.4632365348573625e-10, "loss": 0.0224, "step": 6560 }, { "epoch": 3.9920900517188924, "grad_norm": 0.31049343943595886, "learning_rate": 3.750360259596741e-10, "loss": 0.0094, "step": 6561 }, { "epoch": 3.992698509278978, "grad_norm": 0.2852802276611328, "learning_rate": 3.099472633855127e-10, "loss": 0.0075, "step": 6562 }, { "epoch": 3.993306966839063, "grad_norm": 0.20569032430648804, "learning_rate": 2.5105738190867033e-10, "loss": 0.0051, "step": 6563 }, { "epoch": 3.993915424399148, "grad_norm": 0.23459145426750183, "learning_rate": 1.9836639612580422e-10, "loss": 0.0032, "step": 6564 }, { "epoch": 3.9945238819592332, "grad_norm": 0.24636982381343842, "learning_rate": 1.518743191070149e-10, "loss": 0.0067, "step": 6565 }, { "epoch": 3.9951323395193183, "grad_norm": 0.21381396055221558, "learning_rate": 1.1158116237919292e-10, "loss": 0.0054, "step": 6566 }, { "epoch": 3.995740797079404, "grad_norm": 0.2777492105960846, "learning_rate": 7.748693593156997e-11, "loss": 0.0209, "step": 6567 }, { "epoch": 3.996349254639489, "grad_norm": 0.229709655046463, "learning_rate": 4.959164821849438e-11, "loss": 0.0044, "step": 6568 }, { "epoch": 3.996957712199574, "grad_norm": 0.21392680704593658, "learning_rate": 2.7895306156655587e-11, "loss": 0.0033, "step": 6569 }, { "epoch": 3.997566169759659, "grad_norm": 0.23358656466007233, "learning_rate": 1.2397915127859705e-11, "loss": 0.0094, "step": 6570 }, { "epoch": 3.9981746273197443, "grad_norm": 0.23164047300815582, "learning_rate": 3.099478973478398e-12, "loss": 0.0039, "step": 6571 }, { "epoch": 3.99878308487983, "grad_norm": 0.16882868111133575, "learning_rate": 0.0, "loss": 0.002, "step": 6572 }, { "epoch": 3.99878308487983, "eval_loss": 1.711881399154663, "eval_runtime": 104.2004, "eval_samples_per_second": 7.313, "eval_steps_per_second": 0.461, "step": 6572 }, { "epoch": 3.99878308487983, "step": 6572, "total_flos": 9.69717152698545e+18, "train_loss": 0.1857661268678992, "train_runtime": 120983.7455, "train_samples_per_second": 1.739, "train_steps_per_second": 0.054 } ], "logging_steps": 1.0, "max_steps": 6572, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 9.69717152698545e+18, "train_batch_size": 2, "trial_name": null, "trial_params": null }